{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.545670687739949, "eval_steps": 500, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 428.95984268188477, "epoch": 0.00015590591078284256, "grad_norm": 0.1858801865038576, "kl": 9.499490261077881e-06, "learning_rate": 9.999999400234011e-07, "loss": 0.0, "reward": 1.7031250894069672, "reward_std": 0.22567015141248703, "rewards/accuracy_reward": 0.7031250260770321, "rewards/format_reward": 1.0, "step": 2 }, { "completion_length": 435.79019927978516, "epoch": 0.0003118118215656851, "grad_norm": 0.17843536564632628, "kl": 1.749396324157715e-05, "learning_rate": 9.999997600936189e-07, "loss": 0.0, "reward": 1.7410715073347092, "reward_std": 0.23824752867221832, "rewards/accuracy_reward": 0.7433036044239998, "rewards/format_reward": 0.9977678656578064, "step": 4 }, { "completion_length": 415.2567138671875, "epoch": 0.00046771773234852767, "grad_norm": 0.11655027703181131, "kl": 2.107769250869751e-05, "learning_rate": 9.999994602106966e-07, "loss": 0.0, "reward": 1.8258929252624512, "reward_std": 0.15555683430284262, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 0.9977678656578064, "step": 6 }, { "completion_length": 426.12948989868164, "epoch": 0.0006236236431313702, "grad_norm": 0.17668382369199098, "kl": 2.025812864303589e-05, "learning_rate": 9.99999040374706e-07, "loss": 0.0, "reward": 1.6696429252624512, "reward_std": 0.1931308265775442, "rewards/accuracy_reward": 0.6696428880095482, "rewards/format_reward": 1.0, "step": 8 }, { "completion_length": 444.3951110839844, "epoch": 0.0007795295539142127, "grad_norm": 0.1974528427904806, "kl": 2.232193946838379e-05, "learning_rate": 9.99998500585748e-07, "loss": 0.0, "reward": 1.662946492433548, "reward_std": 0.2652875278145075, "rewards/accuracy_reward": 0.6651786044239998, "rewards/format_reward": 0.9977678656578064, "step": 10 }, { "completion_length": 430.71876525878906, "epoch": 0.0009354354646970553, "grad_norm": 0.18428692602987126, "kl": 2.537667751312256e-05, "learning_rate": 9.99997840843952e-07, "loss": 0.0, "reward": 1.7700893580913544, "reward_std": 0.1896844981238246, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 0.9977678656578064, "step": 12 }, { "completion_length": 430.6138572692871, "epoch": 0.0010913413754798978, "grad_norm": 0.19044589297078326, "kl": 2.5667250156402588e-05, "learning_rate": 9.999970611494763e-07, "loss": 0.0, "reward": 1.7879465222358704, "reward_std": 0.1985541246831417, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 0.9977678656578064, "step": 14 }, { "completion_length": 420.08707427978516, "epoch": 0.0012472472862627404, "grad_norm": 0.14576795620128435, "kl": 2.7045607566833496e-05, "learning_rate": 9.99996161502508e-07, "loss": 0.0, "reward": 1.7544643729925156, "reward_std": 0.1992281312122941, "rewards/accuracy_reward": 0.7544643133878708, "rewards/format_reward": 1.0, "step": 16 }, { "completion_length": 433.22993087768555, "epoch": 0.001403153197045583, "grad_norm": 0.18385508753772878, "kl": 3.1635165214538574e-05, "learning_rate": 9.999951419032628e-07, "loss": 0.0, "reward": 1.6674107760190964, "reward_std": 0.1980121172964573, "rewards/accuracy_reward": 0.6674107536673546, "rewards/format_reward": 1.0, "step": 18 }, { "completion_length": 422.46876525878906, "epoch": 0.0015590591078284255, "grad_norm": 0.18064515909570314, "kl": 3.032386302947998e-05, "learning_rate": 9.999940023519854e-07, "loss": 0.0, "reward": 1.7678571939468384, "reward_std": 0.22906015161424875, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 0.9977678656578064, "step": 20 }, { "completion_length": 419.51341247558594, "epoch": 0.001714965018611268, "grad_norm": 0.19254395348965062, "kl": 3.394484519958496e-05, "learning_rate": 9.999927428489492e-07, "loss": 0.0, "reward": 1.752232238650322, "reward_std": 0.21395687386393547, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 0.9955357313156128, "step": 22 }, { "completion_length": 433.52010345458984, "epoch": 0.0018708709293941107, "grad_norm": 0.17909262686189936, "kl": 3.0487775802612305e-05, "learning_rate": 9.999913633944563e-07, "loss": 0.0, "reward": 1.6875000596046448, "reward_std": 0.24122696556150913, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 1.0, "step": 24 }, { "completion_length": 433.89734268188477, "epoch": 0.0020267768401769533, "grad_norm": 0.17665079397272185, "kl": 4.063546657562256e-05, "learning_rate": 9.999898639888376e-07, "loss": 0.0, "reward": 1.7477679401636124, "reward_std": 0.24183679185807705, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 1.0, "step": 26 }, { "completion_length": 424.1384048461914, "epoch": 0.0021826827509597957, "grad_norm": 0.1838516242142134, "kl": 4.482269287109375e-05, "learning_rate": 9.999882446324531e-07, "loss": 0.0, "reward": 1.74553582072258, "reward_std": 0.2202696269378066, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 28 }, { "completion_length": 426.32368087768555, "epoch": 0.0023385886617426385, "grad_norm": 0.20249073293006303, "kl": 4.690885543823242e-05, "learning_rate": 9.999865053256908e-07, "loss": 0.0, "reward": 1.7700893580913544, "reward_std": 0.25511928647756577, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 0.9977678656578064, "step": 30 }, { "completion_length": 432.1451072692871, "epoch": 0.002494494572525481, "grad_norm": 0.18685687567450554, "kl": 5.176663398742676e-05, "learning_rate": 9.999846460689683e-07, "loss": 0.0, "reward": 1.7388393729925156, "reward_std": 0.2476068316027522, "rewards/accuracy_reward": 0.7388393133878708, "rewards/format_reward": 1.0, "step": 32 }, { "completion_length": 420.2656364440918, "epoch": 0.0026504004833083233, "grad_norm": 0.1548496576715309, "kl": 4.881620407104492e-05, "learning_rate": 9.999826668627318e-07, "loss": 0.0, "reward": 1.7879465222358704, "reward_std": 0.17330031469464302, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 34 }, { "completion_length": 422.05135345458984, "epoch": 0.002806306394091166, "grad_norm": 0.2086178802483801, "kl": 5.558133125305176e-05, "learning_rate": 9.999805677074558e-07, "loss": 0.0, "reward": 1.703125074505806, "reward_std": 0.27798085287213326, "rewards/accuracy_reward": 0.7031250298023224, "rewards/format_reward": 1.0, "step": 36 }, { "completion_length": 421.3035888671875, "epoch": 0.0029622123048740085, "grad_norm": 0.16269520241631338, "kl": 6.410479545593262e-05, "learning_rate": 9.99978348603644e-07, "loss": 0.0, "reward": 1.7410715073347092, "reward_std": 0.16139549855142832, "rewards/accuracy_reward": 0.7433035969734192, "rewards/format_reward": 0.9977678656578064, "step": 38 }, { "completion_length": 433.32144927978516, "epoch": 0.003118118215656851, "grad_norm": 0.1950602037880233, "kl": 6.80685043334961e-05, "learning_rate": 9.99976009551829e-07, "loss": 0.0, "reward": 1.665178656578064, "reward_std": 0.23213696386665106, "rewards/accuracy_reward": 0.6651785895228386, "rewards/format_reward": 1.0, "step": 40 }, { "completion_length": 427.9776954650879, "epoch": 0.0032740241264396937, "grad_norm": 0.1399396478048742, "kl": 6.836652755737305e-05, "learning_rate": 9.999735505525714e-07, "loss": 0.0, "reward": 1.814732238650322, "reward_std": 0.14127882663160563, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 42 }, { "completion_length": 413.32814025878906, "epoch": 0.003429930037222536, "grad_norm": 0.19981274710316668, "kl": 7.56382942199707e-05, "learning_rate": 9.999709716064616e-07, "loss": 0.0, "reward": 1.7500000894069672, "reward_std": 0.2419046014547348, "rewards/accuracy_reward": 0.750000037252903, "rewards/format_reward": 1.0, "step": 44 }, { "completion_length": 436.21207427978516, "epoch": 0.003585835948005379, "grad_norm": 0.1790564175683857, "kl": 8.535385131835938e-05, "learning_rate": 9.999682727141183e-07, "loss": 0.0, "reward": 1.7187500894069672, "reward_std": 0.21621346101164818, "rewards/accuracy_reward": 0.7209821790456772, "rewards/format_reward": 0.9977678656578064, "step": 46 }, { "completion_length": 422.63171768188477, "epoch": 0.0037417418587882213, "grad_norm": 0.18105216213771372, "kl": 8.785724639892578e-05, "learning_rate": 9.99965453876189e-07, "loss": 0.0, "reward": 1.7165179401636124, "reward_std": 0.2274782657623291, "rewards/accuracy_reward": 0.7187500447034836, "rewards/format_reward": 0.9977678656578064, "step": 48 }, { "completion_length": 425.1071586608887, "epoch": 0.0038976477695710637, "grad_norm": 0.20796568623888848, "kl": 9.107589721679688e-05, "learning_rate": 9.999625150933494e-07, "loss": 0.0, "reward": 1.671875074505806, "reward_std": 0.24010293371975422, "rewards/accuracy_reward": 0.6718750298023224, "rewards/format_reward": 1.0, "step": 50 }, { "completion_length": 428.56921768188477, "epoch": 0.0040535536803539066, "grad_norm": 0.17655027343346455, "kl": 9.554624557495117e-05, "learning_rate": 9.999594563663054e-07, "loss": 0.0, "reward": 1.7633929401636124, "reward_std": 0.2077960828319192, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 0.9977678656578064, "step": 52 }, { "completion_length": 433.8593940734863, "epoch": 0.004209459591136749, "grad_norm": 0.16589259873538825, "kl": 0.0001081228256225586, "learning_rate": 9.999562776957902e-07, "loss": 0.0, "reward": 1.7611607909202576, "reward_std": 0.17006732895970345, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 54 }, { "completion_length": 425.9799270629883, "epoch": 0.004365365501919591, "grad_norm": 0.16200968643201055, "kl": 0.00010782480239868164, "learning_rate": 9.999529790825666e-07, "loss": 0.0, "reward": 1.750000074505806, "reward_std": 0.15735767036676407, "rewards/accuracy_reward": 0.7500000409781933, "rewards/format_reward": 1.0, "step": 56 }, { "completion_length": 413.97993087768555, "epoch": 0.004521271412702434, "grad_norm": 0.18089039814795507, "kl": 0.00011324882507324219, "learning_rate": 9.999495605274258e-07, "loss": 0.0, "reward": 1.8125000596046448, "reward_std": 0.1651018839329481, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 58 }, { "completion_length": 434.0156440734863, "epoch": 0.004677177323485277, "grad_norm": 0.18578030461269673, "kl": 0.00011539459228515625, "learning_rate": 9.999460220311882e-07, "loss": 0.0, "reward": 1.727678656578064, "reward_std": 0.23010142520070076, "rewards/accuracy_reward": 0.7276785932481289, "rewards/format_reward": 1.0, "step": 60 }, { "completion_length": 428.9085006713867, "epoch": 0.004833083234268119, "grad_norm": 0.16759373108432105, "kl": 0.00012606382369995117, "learning_rate": 9.999423635947025e-07, "loss": 0.0, "reward": 1.767857238650322, "reward_std": 0.17931207362562418, "rewards/accuracy_reward": 0.7678571566939354, "rewards/format_reward": 1.0, "step": 62 }, { "completion_length": 416.20537185668945, "epoch": 0.004988989145050962, "grad_norm": 0.17881525788912997, "kl": 0.00012439489364624023, "learning_rate": 9.999385852188463e-07, "loss": 0.0, "reward": 1.7276786416769028, "reward_std": 0.18788002710789442, "rewards/accuracy_reward": 0.7276786118745804, "rewards/format_reward": 1.0, "step": 64 }, { "completion_length": 429.1451072692871, "epoch": 0.005144895055833805, "grad_norm": 0.1628043314635525, "kl": 0.00013047456741333008, "learning_rate": 9.999346869045264e-07, "loss": 0.0, "reward": 1.756696492433548, "reward_std": 0.19749288633465767, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 66 }, { "completion_length": 428.26341247558594, "epoch": 0.005300800966616647, "grad_norm": 0.17698458297024383, "kl": 0.0001296401023864746, "learning_rate": 9.999306686526777e-07, "loss": 0.0, "reward": 1.7299107760190964, "reward_std": 0.197273101657629, "rewards/accuracy_reward": 0.729910746216774, "rewards/format_reward": 1.0, "step": 68 }, { "completion_length": 422.2232360839844, "epoch": 0.005456706877399489, "grad_norm": 0.15797174621945284, "kl": 0.00014001131057739258, "learning_rate": 9.999265304642643e-07, "loss": 0.0, "reward": 1.7834822237491608, "reward_std": 0.21161612402647734, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 70 }, { "completion_length": 414.9620666503906, "epoch": 0.005612612788182332, "grad_norm": 0.16358536250859054, "kl": 0.00013959407806396484, "learning_rate": 9.999222723402792e-07, "loss": 0.0, "reward": 1.7589286416769028, "reward_std": 0.1931789219379425, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 0.9977678656578064, "step": 72 }, { "completion_length": 440.9107322692871, "epoch": 0.005768518698965174, "grad_norm": 0.1849206937723719, "kl": 0.00015020370483398438, "learning_rate": 9.999178942817435e-07, "loss": 0.0, "reward": 1.7946429252624512, "reward_std": 0.21226962096989155, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 0.9955357313156128, "step": 74 }, { "completion_length": 433.06028747558594, "epoch": 0.005924424609748017, "grad_norm": 0.165217687761387, "kl": 0.00015878677368164062, "learning_rate": 9.99913396289708e-07, "loss": 0.0, "reward": 1.7053572237491608, "reward_std": 0.2058058250695467, "rewards/accuracy_reward": 0.7075893133878708, "rewards/format_reward": 0.9977678656578064, "step": 76 }, { "completion_length": 421.37055587768555, "epoch": 0.00608033052053086, "grad_norm": 0.16869467742269192, "kl": 0.00016379356384277344, "learning_rate": 9.999087783652514e-07, "loss": 0.0, "reward": 1.7165179401636124, "reward_std": 0.17269413266330957, "rewards/accuracy_reward": 0.7165178805589676, "rewards/format_reward": 1.0, "step": 78 }, { "completion_length": 441.6875190734863, "epoch": 0.006236236431313702, "grad_norm": 0.18898005473746476, "kl": 0.00017499923706054688, "learning_rate": 9.999040405094818e-07, "loss": 0.0, "reward": 1.727678656578064, "reward_std": 0.2267872504889965, "rewards/accuracy_reward": 0.729910746216774, "rewards/format_reward": 0.9977678656578064, "step": 80 }, { "completion_length": 424.51341247558594, "epoch": 0.006392142342096545, "grad_norm": 0.1834404319539322, "kl": 0.00018739700317382812, "learning_rate": 9.99899182723536e-07, "loss": 0.0, "reward": 1.7566965073347092, "reward_std": 0.23311547748744488, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 82 }, { "completion_length": 414.7522506713867, "epoch": 0.0065480482528793875, "grad_norm": 0.2014507982946418, "kl": 0.00017690658569335938, "learning_rate": 9.998942050085787e-07, "loss": 0.0, "reward": 1.7991072237491608, "reward_std": 0.17690282315015793, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 84 }, { "completion_length": 430.0379638671875, "epoch": 0.006703954163662229, "grad_norm": 0.1765504423019189, "kl": 0.00018525123596191406, "learning_rate": 9.99889107365805e-07, "loss": 0.0, "reward": 1.7656250894069672, "reward_std": 0.21740894205868244, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 1.0, "step": 86 }, { "completion_length": 422.8214416503906, "epoch": 0.006859860074445072, "grad_norm": 0.1692745205117488, "kl": 0.00019550323486328125, "learning_rate": 9.998838897964374e-07, "loss": 0.0, "reward": 1.7767857909202576, "reward_std": 0.1678672805428505, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 0.9977678656578064, "step": 88 }, { "completion_length": 422.3727836608887, "epoch": 0.007015765985227915, "grad_norm": 0.1757374827584602, "kl": 0.0001970529556274414, "learning_rate": 9.998785523017276e-07, "loss": 0.0, "reward": 1.7968751043081284, "reward_std": 0.21728879399597645, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 0.9955357313156128, "step": 90 }, { "completion_length": 442.39734268188477, "epoch": 0.007171671896010758, "grad_norm": 0.18945345611433656, "kl": 0.00020051002502441406, "learning_rate": 9.998730948829562e-07, "loss": 0.0, "reward": 1.774553656578064, "reward_std": 0.17516756523400545, "rewards/accuracy_reward": 0.7745535969734192, "rewards/format_reward": 1.0, "step": 92 }, { "completion_length": 420.24555587768555, "epoch": 0.0073275778067936, "grad_norm": 0.1705408169633682, "kl": 0.00019633769989013672, "learning_rate": 9.998675175414323e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.12941287737339735, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 94 }, { "completion_length": 433.66519927978516, "epoch": 0.007483483717576443, "grad_norm": 0.15849077358346603, "kl": 0.00021791458129882812, "learning_rate": 9.998618202784944e-07, "loss": 0.0, "reward": 1.796875074505806, "reward_std": 0.1486549274995923, "rewards/accuracy_reward": 0.7968750521540642, "rewards/format_reward": 1.0, "step": 96 }, { "completion_length": 442.6942138671875, "epoch": 0.0076393896283592855, "grad_norm": 0.17669699527938007, "kl": 0.00024950504302978516, "learning_rate": 9.998560030955088e-07, "loss": 0.0, "reward": 1.6785714775323868, "reward_std": 0.22890678979456425, "rewards/accuracy_reward": 0.6785714626312256, "rewards/format_reward": 1.0, "step": 98 }, { "completion_length": 426.67635345458984, "epoch": 0.0077952955391421275, "grad_norm": 0.1522633896225516, "kl": 0.0002453327178955078, "learning_rate": 9.998500659938714e-07, "loss": 0.0, "reward": 1.7232143580913544, "reward_std": 0.20101643074303865, "rewards/accuracy_reward": 0.7254464700818062, "rewards/format_reward": 0.9977678656578064, "step": 100 }, { "completion_length": 429.7968978881836, "epoch": 0.00795120144992497, "grad_norm": 0.20373144891535636, "kl": 0.00026810169219970703, "learning_rate": 9.998440089750062e-07, "loss": 0.0, "reward": 1.734375074505806, "reward_std": 0.23513328842818737, "rewards/accuracy_reward": 0.734375037252903, "rewards/format_reward": 1.0, "step": 102 }, { "completion_length": 422.4464454650879, "epoch": 0.008107107360707813, "grad_norm": 0.15244229453991143, "kl": 0.0002636909484863281, "learning_rate": 9.998378320403666e-07, "loss": 0.0, "reward": 1.7455357760190964, "reward_std": 0.17299722600728273, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 104 }, { "completion_length": 433.8660888671875, "epoch": 0.008263013271490656, "grad_norm": 0.19627694357751202, "kl": 0.0002707242965698242, "learning_rate": 9.998315351914346e-07, "loss": 0.0, "reward": 1.7477679252624512, "reward_std": 0.23333526030182838, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.9977678656578064, "step": 106 }, { "completion_length": 427.77010345458984, "epoch": 0.008418919182273499, "grad_norm": 0.18270093574442692, "kl": 0.0003077983856201172, "learning_rate": 9.998251184297206e-07, "loss": 0.0, "reward": 1.7299107760190964, "reward_std": 0.2194472923874855, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 1.0, "step": 108 }, { "completion_length": 430.7924270629883, "epoch": 0.00857482509305634, "grad_norm": 0.17441960903845133, "kl": 0.00029838085174560547, "learning_rate": 9.99818581756764e-07, "loss": 0.0, "reward": 1.734375074505806, "reward_std": 0.19291468244045973, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 110 }, { "completion_length": 433.46430587768555, "epoch": 0.008730731003839183, "grad_norm": 0.15717353897731332, "kl": 0.00030541419982910156, "learning_rate": 9.998119251741331e-07, "loss": 0.0, "reward": 1.6941965073347092, "reward_std": 0.161870289593935, "rewards/accuracy_reward": 0.694196455180645, "rewards/format_reward": 1.0, "step": 112 }, { "completion_length": 422.5781440734863, "epoch": 0.008886636914622026, "grad_norm": 0.15300908827291404, "kl": 0.0003228187561035156, "learning_rate": 9.99805148683425e-07, "loss": 0.0, "reward": 1.8392858058214188, "reward_std": 0.1877958718687296, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 114 }, { "completion_length": 434.3660888671875, "epoch": 0.009042542825404868, "grad_norm": 0.1941035608173932, "kl": 0.0003714561462402344, "learning_rate": 9.997982522862651e-07, "loss": 0.0, "reward": 1.7098215073347092, "reward_std": 0.21725697629153728, "rewards/accuracy_reward": 0.7098214626312256, "rewards/format_reward": 1.0, "step": 116 }, { "completion_length": 439.18751525878906, "epoch": 0.009198448736187711, "grad_norm": 0.18894572649085092, "kl": 0.0003871917724609375, "learning_rate": 9.997912359843082e-07, "loss": 0.0, "reward": 1.7053572088479996, "reward_std": 0.2380809187889099, "rewards/accuracy_reward": 0.7053571715950966, "rewards/format_reward": 1.0, "step": 118 }, { "completion_length": 436.17859649658203, "epoch": 0.009354354646970554, "grad_norm": 0.16338076486832842, "kl": 0.00041294097900390625, "learning_rate": 9.997840997792372e-07, "loss": 0.0, "reward": 1.7098215371370316, "reward_std": 0.1858416749164462, "rewards/accuracy_reward": 0.7120536118745804, "rewards/format_reward": 0.9977678656578064, "step": 120 }, { "completion_length": 422.9241256713867, "epoch": 0.009510260557753395, "grad_norm": 0.16861637287360182, "kl": 0.0003712177276611328, "learning_rate": 9.997768436727645e-07, "loss": 0.0, "reward": 1.743303656578064, "reward_std": 0.1830651443451643, "rewards/accuracy_reward": 0.7433035969734192, "rewards/format_reward": 1.0, "step": 122 }, { "completion_length": 431.3169822692871, "epoch": 0.009666166468536238, "grad_norm": 0.16704314229057507, "kl": 0.0003948211669921875, "learning_rate": 9.997694676666308e-07, "loss": 0.0, "reward": 1.6986607760190964, "reward_std": 0.21733748726546764, "rewards/accuracy_reward": 0.6986607536673546, "rewards/format_reward": 1.0, "step": 124 }, { "completion_length": 434.5692138671875, "epoch": 0.00982207237931908, "grad_norm": 0.13330869132064416, "kl": 0.00040793418884277344, "learning_rate": 9.997619717626055e-07, "loss": 0.0, "reward": 1.772321492433548, "reward_std": 0.12798071652650833, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 126 }, { "completion_length": 419.8281440734863, "epoch": 0.009977978290101924, "grad_norm": 0.18619055829898581, "kl": 0.00042724609375, "learning_rate": 9.99754355962487e-07, "loss": 0.0, "reward": 1.7366072237491608, "reward_std": 0.18291317857801914, "rewards/accuracy_reward": 0.7388393133878708, "rewards/format_reward": 0.9977678656578064, "step": 128 }, { "completion_length": 429.99555587768555, "epoch": 0.010133884200884766, "grad_norm": 0.1752478676537289, "kl": 0.0004303455352783203, "learning_rate": 9.997466202681023e-07, "loss": 0.0, "reward": 1.7075893729925156, "reward_std": 0.20929381996393204, "rewards/accuracy_reward": 0.7098214589059353, "rewards/format_reward": 0.9977678656578064, "step": 130 }, { "completion_length": 423.7544822692871, "epoch": 0.01028979011166761, "grad_norm": 0.17779911994144149, "kl": 0.0004799365997314453, "learning_rate": 9.997387646813073e-07, "loss": 0.0, "reward": 1.7187500596046448, "reward_std": 0.22372520342469215, "rewards/accuracy_reward": 0.7187500447034836, "rewards/format_reward": 1.0, "step": 132 }, { "completion_length": 430.3214416503906, "epoch": 0.01044569602245045, "grad_norm": 0.18638374475164216, "kl": 0.0004608631134033203, "learning_rate": 9.997307892039866e-07, "loss": 0.0, "reward": 1.727678656578064, "reward_std": 0.19502085633575916, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 0.9977678656578064, "step": 134 }, { "completion_length": 435.2031440734863, "epoch": 0.010601601933233293, "grad_norm": 0.20892888165843196, "kl": 0.0005154609680175781, "learning_rate": 9.997226938380537e-07, "loss": 0.0, "reward": 1.649553656578064, "reward_std": 0.22559624910354614, "rewards/accuracy_reward": 0.6562500298023224, "rewards/format_reward": 0.9933035969734192, "step": 136 }, { "completion_length": 427.2991256713867, "epoch": 0.010757507844016136, "grad_norm": 0.19539629172481077, "kl": 0.0004811286926269531, "learning_rate": 9.997144785854505e-07, "loss": 0.0, "reward": 1.7924107909202576, "reward_std": 0.20283064153045416, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 0.9977678656578064, "step": 138 }, { "completion_length": 429.5826110839844, "epoch": 0.010913413754798979, "grad_norm": 0.15611732834140002, "kl": 0.0004658699035644531, "learning_rate": 9.997061434481483e-07, "loss": 0.0, "reward": 1.7455357760190964, "reward_std": 0.14224254991859198, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 0.9977678656578064, "step": 140 }, { "completion_length": 428.221004486084, "epoch": 0.011069319665581822, "grad_norm": 0.36321029828248474, "kl": 0.0005025863647460938, "learning_rate": 9.996976884281462e-07, "loss": 0.0, "reward": 1.6718750894069672, "reward_std": 0.15796246100217104, "rewards/accuracy_reward": 0.6718750223517418, "rewards/format_reward": 1.0, "step": 142 }, { "completion_length": 435.8973388671875, "epoch": 0.011225225576364664, "grad_norm": 0.23007290686657997, "kl": 0.0005650520324707031, "learning_rate": 9.99689113527473e-07, "loss": 0.0, "reward": 1.8080358058214188, "reward_std": 0.2195151075720787, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 144 }, { "completion_length": 436.2076110839844, "epoch": 0.011381131487147507, "grad_norm": 0.20523230726086733, "kl": 0.0006148815155029297, "learning_rate": 9.996804187481855e-07, "loss": 0.0, "reward": 1.6785715222358704, "reward_std": 0.21853800863027573, "rewards/accuracy_reward": 0.6785714700818062, "rewards/format_reward": 1.0, "step": 146 }, { "completion_length": 432.53796005249023, "epoch": 0.011537037397930348, "grad_norm": 0.1507392207621729, "kl": 0.0005869865417480469, "learning_rate": 9.9967160409237e-07, "loss": 0.0, "reward": 1.758928656578064, "reward_std": 0.16067705024033785, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 148 }, { "completion_length": 428.62725830078125, "epoch": 0.011692943308713191, "grad_norm": 0.15979075649924926, "kl": 0.0005362033843994141, "learning_rate": 9.996626695621412e-07, "loss": 0.0, "reward": 1.7745536416769028, "reward_std": 0.13054194021970034, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 150 }, { "completion_length": 415.7522430419922, "epoch": 0.011848849219496034, "grad_norm": 0.19560584361479483, "kl": 0.0006151199340820312, "learning_rate": 9.996536151596423e-07, "loss": 0.0, "reward": 1.8258929401636124, "reward_std": 0.19907477125525475, "rewards/accuracy_reward": 0.82589291036129, "rewards/format_reward": 1.0, "step": 152 }, { "completion_length": 432.214298248291, "epoch": 0.012004755130278877, "grad_norm": 0.1780087791154088, "kl": 0.0005424022674560547, "learning_rate": 9.996444408870457e-07, "loss": 0.0, "reward": 1.750000074505806, "reward_std": 0.18306654505431652, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 154 }, { "completion_length": 438.1562728881836, "epoch": 0.01216066104106172, "grad_norm": 0.1960295611615047, "kl": 0.0005505084991455078, "learning_rate": 9.996351467465523e-07, "loss": 0.0, "reward": 1.7991072088479996, "reward_std": 0.20508512575179338, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 0.9977678656578064, "step": 156 }, { "completion_length": 420.3571662902832, "epoch": 0.012316566951844563, "grad_norm": 0.15769415597523737, "kl": 0.0005583763122558594, "learning_rate": 9.996257327403918e-07, "loss": 0.0, "reward": 1.7879465073347092, "reward_std": 0.13001766428351402, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 158 }, { "completion_length": 416.6294822692871, "epoch": 0.012472472862627404, "grad_norm": 0.1678900405214465, "kl": 0.0005466938018798828, "learning_rate": 9.996161988708227e-07, "loss": 0.0, "reward": 1.8035715073347092, "reward_std": 0.17140196729451418, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 0.9977678656578064, "step": 160 }, { "completion_length": 429.1696586608887, "epoch": 0.012628378773410246, "grad_norm": 0.19951667414628155, "kl": 0.0006060600280761719, "learning_rate": 9.996065451401321e-07, "loss": 0.0, "reward": 1.7566965222358704, "reward_std": 0.2340925745666027, "rewards/accuracy_reward": 0.7566964626312256, "rewards/format_reward": 1.0, "step": 162 }, { "completion_length": 423.50672149658203, "epoch": 0.01278428468419309, "grad_norm": 0.1629188123415397, "kl": 0.0005753040313720703, "learning_rate": 9.995967715506364e-07, "loss": 0.0, "reward": 1.7366072088479996, "reward_std": 0.18141684029251337, "rewards/accuracy_reward": 0.736607164144516, "rewards/format_reward": 1.0, "step": 164 }, { "completion_length": 435.3214454650879, "epoch": 0.012940190594975932, "grad_norm": 0.135671568796454, "kl": 0.0005376338958740234, "learning_rate": 9.9958687810468e-07, "loss": 0.0, "reward": 1.7589286714792252, "reward_std": 0.16052368562668562, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 0.9977678656578064, "step": 166 }, { "completion_length": 411.0245780944824, "epoch": 0.013096096505758775, "grad_norm": 0.1798226458335594, "kl": 0.0006172657012939453, "learning_rate": 9.995768648046363e-07, "loss": 0.0, "reward": 1.6852679401636124, "reward_std": 0.2281515896320343, "rewards/accuracy_reward": 0.687500037252903, "rewards/format_reward": 0.9977678656578064, "step": 168 }, { "completion_length": 425.00224685668945, "epoch": 0.013252002416541618, "grad_norm": 0.14876000097081488, "kl": 0.0005640983581542969, "learning_rate": 9.995667316529079e-07, "loss": 0.0, "reward": 1.821428656578064, "reward_std": 0.1571401283144951, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 170 }, { "completion_length": 426.3259086608887, "epoch": 0.013407908327324459, "grad_norm": 0.14768988157957832, "kl": 0.0006422996520996094, "learning_rate": 9.995564786519257e-07, "loss": 0.0, "reward": 1.7165179550647736, "reward_std": 0.1409771330654621, "rewards/accuracy_reward": 0.7165178880095482, "rewards/format_reward": 1.0, "step": 172 }, { "completion_length": 415.49778747558594, "epoch": 0.013563814238107302, "grad_norm": 0.1753248914933796, "kl": 0.0005941390991210938, "learning_rate": 9.995461058041491e-07, "loss": 0.0, "reward": 1.7299107760190964, "reward_std": 0.2138720117509365, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 0.9977678656578064, "step": 174 }, { "completion_length": 441.15403747558594, "epoch": 0.013719720148890144, "grad_norm": 0.18811255160805143, "kl": 0.0006890296936035156, "learning_rate": 9.995356131120672e-07, "loss": 0.0, "reward": 1.7388393431901932, "reward_std": 0.17961517348885536, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 0.9910714328289032, "step": 176 }, { "completion_length": 417.721004486084, "epoch": 0.013875626059672987, "grad_norm": 0.1667188935142723, "kl": 0.00061798095703125, "learning_rate": 9.99525000578197e-07, "loss": 0.0, "reward": 1.783482238650322, "reward_std": 0.2000504694879055, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 178 }, { "completion_length": 421.92635345458984, "epoch": 0.01403153197045583, "grad_norm": 0.14933335996664499, "kl": 0.0007505416870117188, "learning_rate": 9.995142682050843e-07, "loss": 0.0, "reward": 1.743303656578064, "reward_std": 0.1730932155624032, "rewards/accuracy_reward": 0.7455357387661934, "rewards/format_reward": 0.9977678656578064, "step": 180 }, { "completion_length": 429.2768020629883, "epoch": 0.014187437881238673, "grad_norm": 0.16484060533923528, "kl": 0.0006341934204101562, "learning_rate": 9.995034159953043e-07, "loss": 0.0, "reward": 1.7723215222358704, "reward_std": 0.1578968781977892, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 182 }, { "completion_length": 437.7210006713867, "epoch": 0.014343343792021516, "grad_norm": 0.18161611275793282, "kl": 0.0007157325744628906, "learning_rate": 9.9949244395146e-07, "loss": 0.0, "reward": 1.7343750596046448, "reward_std": 0.1684868410229683, "rewards/accuracy_reward": 0.7366071864962578, "rewards/format_reward": 0.9977678656578064, "step": 184 }, { "completion_length": 433.064754486084, "epoch": 0.014499249702804357, "grad_norm": 0.16946511670505346, "kl": 0.0006685256958007812, "learning_rate": 9.99481352076184e-07, "loss": 0.0, "reward": 1.74553582072258, "reward_std": 0.20072447322309017, "rewards/accuracy_reward": 0.7455357536673546, "rewards/format_reward": 1.0, "step": 186 }, { "completion_length": 435.83484268188477, "epoch": 0.0146551556135872, "grad_norm": 0.1476624507871172, "kl": 0.0007433891296386719, "learning_rate": 9.994701403721374e-07, "loss": 0.0, "reward": 1.6584822237491608, "reward_std": 0.18516767118126154, "rewards/accuracy_reward": 0.658482164144516, "rewards/format_reward": 1.0, "step": 188 }, { "completion_length": 427.8080520629883, "epoch": 0.014811061524370043, "grad_norm": 0.1656912335564174, "kl": 0.000701904296875, "learning_rate": 9.994588088420098e-07, "loss": 0.0, "reward": 1.7500000894069672, "reward_std": 0.18111235089600086, "rewards/accuracy_reward": 0.7500000447034836, "rewards/format_reward": 1.0, "step": 190 }, { "completion_length": 433.32591247558594, "epoch": 0.014966967435152885, "grad_norm": 0.18304335970792954, "kl": 0.0006766319274902344, "learning_rate": 9.994473574885195e-07, "loss": 0.0, "reward": 1.7879465073347092, "reward_std": 0.152931435033679, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 192 }, { "completion_length": 421.2455520629883, "epoch": 0.015122873345935728, "grad_norm": 0.1448888685144108, "kl": 0.000629425048828125, "learning_rate": 9.99435786314414e-07, "loss": 0.0, "reward": 1.787946492433548, "reward_std": 0.15548902563750744, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 194 }, { "completion_length": 421.0893096923828, "epoch": 0.015278779256718571, "grad_norm": 0.1430701238517979, "kl": 0.0006852149963378906, "learning_rate": 9.994240953224694e-07, "loss": 0.0, "reward": 1.7968750894069672, "reward_std": 0.15723835583776236, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 0.9977678656578064, "step": 196 }, { "completion_length": 416.62278747558594, "epoch": 0.015434685167501412, "grad_norm": 0.1719402314509549, "kl": 0.0007276535034179688, "learning_rate": 9.994122845154903e-07, "loss": 0.0, "reward": 1.8459822088479996, "reward_std": 0.17562262620776892, "rewards/accuracy_reward": 0.8459821864962578, "rewards/format_reward": 1.0, "step": 198 }, { "completion_length": 439.02234649658203, "epoch": 0.015590591078284255, "grad_norm": 0.20839298933234013, "kl": 0.0008001327514648438, "learning_rate": 9.994003538963102e-07, "loss": 0.0, "reward": 1.7232143431901932, "reward_std": 0.2186891408637166, "rewards/accuracy_reward": 0.7232143133878708, "rewards/format_reward": 1.0, "step": 200 }, { "completion_length": 427.95091247558594, "epoch": 0.015746496989067098, "grad_norm": 0.151397147093766, "kl": 0.00081634521484375, "learning_rate": 9.993883034677912e-07, "loss": 0.0, "reward": 1.7723215073347092, "reward_std": 0.15390713792294264, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 202 }, { "completion_length": 433.6026954650879, "epoch": 0.01590240289984994, "grad_norm": 0.1583546086809797, "kl": 0.0007348060607910156, "learning_rate": 9.993761332328246e-07, "loss": 0.0, "reward": 1.7343750894069672, "reward_std": 0.15364989638328552, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 0.9977678656578064, "step": 204 }, { "completion_length": 443.76118087768555, "epoch": 0.016058308810632783, "grad_norm": 0.18414530864472592, "kl": 0.0007824897766113281, "learning_rate": 9.993638431943298e-07, "loss": 0.0, "reward": 1.750000074505806, "reward_std": 0.17422508262097836, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 0.9955357313156128, "step": 206 }, { "completion_length": 433.7276954650879, "epoch": 0.016214214721415626, "grad_norm": 0.15086124006937168, "kl": 0.0007681846618652344, "learning_rate": 9.993514333552554e-07, "loss": 0.0, "reward": 1.7098215073347092, "reward_std": 0.1522574294358492, "rewards/accuracy_reward": 0.7098214626312256, "rewards/format_reward": 1.0, "step": 208 }, { "completion_length": 426.83930587768555, "epoch": 0.01637012063219847, "grad_norm": 0.12711781025536528, "kl": 0.0007071495056152344, "learning_rate": 9.993389037185786e-07, "loss": 0.0, "reward": 1.8459822088479996, "reward_std": 0.10400933586061001, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 210 }, { "completion_length": 429.1919860839844, "epoch": 0.016526026542981312, "grad_norm": 0.10484022887402586, "kl": 0.0007071495056152344, "learning_rate": 9.993262542873054e-07, "loss": 0.0, "reward": 1.7700893729925156, "reward_std": 0.1124239107593894, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 212 }, { "completion_length": 429.33484268188477, "epoch": 0.016681932453764155, "grad_norm": 0.11760355144145325, "kl": 0.0007047653198242188, "learning_rate": 9.993134850644702e-07, "loss": 0.0, "reward": 1.7656250894069672, "reward_std": 0.13151400070637465, "rewards/accuracy_reward": 0.7656250447034836, "rewards/format_reward": 1.0, "step": 214 }, { "completion_length": 424.7544822692871, "epoch": 0.016837838364546998, "grad_norm": 0.1807644826217299, "kl": 0.0008635520935058594, "learning_rate": 9.993005960531367e-07, "loss": 0.0, "reward": 1.7142858058214188, "reward_std": 0.16683853790163994, "rewards/accuracy_reward": 0.714285746216774, "rewards/format_reward": 1.0, "step": 216 }, { "completion_length": 410.43528747558594, "epoch": 0.016993744275329837, "grad_norm": 0.13758965662925582, "kl": 0.00074005126953125, "learning_rate": 9.99287587256397e-07, "loss": 0.0, "reward": 1.781250074505806, "reward_std": 0.1572934938594699, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 0.9977678656578064, "step": 218 }, { "completion_length": 415.8638572692871, "epoch": 0.01714965018611268, "grad_norm": 0.19899168225751634, "kl": 0.0009160041809082031, "learning_rate": 9.99274458677372e-07, "loss": 0.0, "reward": 1.8013393729925156, "reward_std": 0.19706823863089085, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 0.9977678656578064, "step": 220 }, { "completion_length": 431.7232360839844, "epoch": 0.017305556096895523, "grad_norm": 0.14500391532353588, "kl": 0.0007853507995605469, "learning_rate": 9.992612103192114e-07, "loss": 0.0, "reward": 1.7812500894069672, "reward_std": 0.14316521026194096, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 0.9977678656578064, "step": 222 }, { "completion_length": 422.67412185668945, "epoch": 0.017461462007678365, "grad_norm": 0.10998664053426017, "kl": 0.0007295608520507812, "learning_rate": 9.992478421850932e-07, "loss": 0.0, "reward": 1.843750074505806, "reward_std": 0.10806465614587069, "rewards/accuracy_reward": 0.8437500447034836, "rewards/format_reward": 1.0, "step": 224 }, { "completion_length": 418.0647506713867, "epoch": 0.017617367918461208, "grad_norm": 0.1643712058861123, "kl": 0.00079345703125, "learning_rate": 9.99234354278225e-07, "loss": 0.0, "reward": 1.6986608058214188, "reward_std": 0.18757693096995354, "rewards/accuracy_reward": 0.6986607536673546, "rewards/format_reward": 1.0, "step": 226 }, { "completion_length": 424.01341247558594, "epoch": 0.01777327382924405, "grad_norm": 0.1600905457096453, "kl": 0.0008220672607421875, "learning_rate": 9.99220746601842e-07, "loss": 0.0, "reward": 1.7299107909202576, "reward_std": 0.19174816273152828, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 0.9977678656578064, "step": 228 }, { "completion_length": 435.9687690734863, "epoch": 0.017929179740026894, "grad_norm": 0.1781263948660099, "kl": 0.0007886886596679688, "learning_rate": 9.992070191592096e-07, "loss": 0.0, "reward": 1.8013393729925156, "reward_std": 0.2242430355399847, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 230 }, { "completion_length": 423.8817138671875, "epoch": 0.018085085650809737, "grad_norm": 0.14526225688252775, "kl": 0.0008168220520019531, "learning_rate": 9.991931719536204e-07, "loss": 0.0, "reward": 1.7700893729925156, "reward_std": 0.13219164591282606, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 232 }, { "completion_length": 428.0803756713867, "epoch": 0.01824099156159258, "grad_norm": 0.15304351314147913, "kl": 0.0008463859558105469, "learning_rate": 9.991792049883969e-07, "loss": 0.0, "reward": 1.741071492433548, "reward_std": 0.11468344647437334, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 1.0, "step": 234 }, { "completion_length": 419.3482246398926, "epoch": 0.018396897472375422, "grad_norm": 0.14949077250124768, "kl": 0.0008749961853027344, "learning_rate": 9.991651182668897e-07, "loss": 0.0, "reward": 1.781250074505806, "reward_std": 0.16584666725248098, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 0.9977678656578064, "step": 236 }, { "completion_length": 434.74332427978516, "epoch": 0.018552803383158265, "grad_norm": 0.1882988693443707, "kl": 0.0008168220520019531, "learning_rate": 9.991509117924781e-07, "loss": 0.0, "reward": 1.8348215222358704, "reward_std": 0.17118441872298717, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 0.9977678656578064, "step": 238 }, { "completion_length": 426.4352836608887, "epoch": 0.018708709293941108, "grad_norm": 0.1465526525659429, "kl": 0.0008692741394042969, "learning_rate": 9.991365855685706e-07, "loss": 0.0, "reward": 1.8169643580913544, "reward_std": 0.16180247627198696, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 0.9977678656578064, "step": 240 }, { "completion_length": 411.439754486084, "epoch": 0.018864615204723947, "grad_norm": 0.1332122693275033, "kl": 0.0007348060607910156, "learning_rate": 9.991221395986037e-07, "loss": 0.0, "reward": 1.8928572237491608, "reward_std": 0.11745857261121273, "rewards/accuracy_reward": 0.8928571715950966, "rewards/format_reward": 1.0, "step": 242 }, { "completion_length": 412.27680587768555, "epoch": 0.01902052111550679, "grad_norm": 0.14487149423934728, "kl": 0.0008382797241210938, "learning_rate": 9.99107573886044e-07, "loss": 0.0, "reward": 1.790178656578064, "reward_std": 0.13835173286497593, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 244 }, { "completion_length": 446.924129486084, "epoch": 0.019176427026289633, "grad_norm": 0.16294074516016627, "kl": 0.0009098052978515625, "learning_rate": 9.990928884343849e-07, "loss": 0.0, "reward": 1.6919643729925156, "reward_std": 0.17818161100149155, "rewards/accuracy_reward": 0.694196455180645, "rewards/format_reward": 0.9977678656578064, "step": 246 }, { "completion_length": 418.2946548461914, "epoch": 0.019332332937072476, "grad_norm": 0.1288444991109011, "kl": 0.0008106231689453125, "learning_rate": 9.990780832471501e-07, "loss": 0.0, "reward": 1.8415179252624512, "reward_std": 0.10235963016748428, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 248 }, { "completion_length": 419.0513572692871, "epoch": 0.01948823884785532, "grad_norm": 0.15885269089008033, "kl": 0.0008983612060546875, "learning_rate": 9.990631583278912e-07, "loss": 0.0, "reward": 1.8191965073347092, "reward_std": 0.14271379075944424, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 250 }, { "completion_length": 427.7455520629883, "epoch": 0.01964414475863816, "grad_norm": 0.1818290476531707, "kl": 0.0008602142333984375, "learning_rate": 9.99048113680189e-07, "loss": 0.0, "reward": 1.8102679550647736, "reward_std": 0.15939461439847946, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 252 }, { "completion_length": 427.1451110839844, "epoch": 0.019800050669421004, "grad_norm": 0.1454046833585364, "kl": 0.0009961128234863281, "learning_rate": 9.990329493076528e-07, "loss": 0.0, "reward": 1.8080357909202576, "reward_std": 0.11138403415679932, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 254 }, { "completion_length": 423.8772506713867, "epoch": 0.019955956580203847, "grad_norm": 0.1784191722667504, "kl": 0.0009379386901855469, "learning_rate": 9.990176652139204e-07, "loss": 0.0, "reward": 1.7968750894069672, "reward_std": 0.1724743489176035, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 256 }, { "completion_length": 421.8750190734863, "epoch": 0.02011186249098669, "grad_norm": 0.14680672013706283, "kl": 0.0008969306945800781, "learning_rate": 9.990022614026589e-07, "loss": 0.0, "reward": 1.8035715222358704, "reward_std": 0.11821812856942415, "rewards/accuracy_reward": 0.8035714775323868, "rewards/format_reward": 1.0, "step": 258 }, { "completion_length": 420.86609268188477, "epoch": 0.020267768401769533, "grad_norm": 0.1419399452957228, "kl": 0.0009961128234863281, "learning_rate": 9.989867378775633e-07, "loss": 0.0, "reward": 1.7790179401636124, "reward_std": 0.14969844464212656, "rewards/accuracy_reward": 0.7790179029107094, "rewards/format_reward": 1.0, "step": 260 }, { "completion_length": 428.1629638671875, "epoch": 0.020423674312552376, "grad_norm": 0.18363088402484073, "kl": 0.0008630752563476562, "learning_rate": 9.98971094642358e-07, "loss": 0.0, "reward": 1.79241082072258, "reward_std": 0.12084352876991034, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 0.9977678656578064, "step": 262 }, { "completion_length": 424.5491256713867, "epoch": 0.02057958022333522, "grad_norm": 0.16475670045435034, "kl": 0.0009374618530273438, "learning_rate": 9.989553317007965e-07, "loss": 0.0, "reward": 1.7700893580913544, "reward_std": 0.17315199319273233, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 264 }, { "completion_length": 432.36608123779297, "epoch": 0.02073548613411806, "grad_norm": 0.13920874668757055, "kl": 0.001018524169921875, "learning_rate": 9.989394490566594e-07, "loss": 0.0, "reward": 1.7723215073347092, "reward_std": 0.11949692294001579, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 266 }, { "completion_length": 424.3415336608887, "epoch": 0.0208913920449009, "grad_norm": 0.13756232370113966, "kl": 0.0009546279907226562, "learning_rate": 9.989234467137576e-07, "loss": 0.0, "reward": 1.7879465222358704, "reward_std": 0.1413762206211686, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 0.9977678656578064, "step": 268 }, { "completion_length": 420.4955596923828, "epoch": 0.021047297955683743, "grad_norm": 0.11952549435015918, "kl": 0.000946044921875, "learning_rate": 9.989073246759304e-07, "loss": 0.0, "reward": 1.7879465073347092, "reward_std": 0.17089105769991875, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 270 }, { "completion_length": 437.4754638671875, "epoch": 0.021203203866466586, "grad_norm": 0.15957059224339154, "kl": 0.0009732246398925781, "learning_rate": 9.98891082947045e-07, "loss": 0.0, "reward": 1.7254465073347092, "reward_std": 0.22446057945489883, "rewards/accuracy_reward": 0.725446455180645, "rewards/format_reward": 1.0, "step": 272 }, { "completion_length": 423.78349685668945, "epoch": 0.02135910977724943, "grad_norm": 0.16029761625916888, "kl": 0.0010986328125, "learning_rate": 9.988747215309984e-07, "loss": 0.0, "reward": 1.7165179252624512, "reward_std": 0.15240716002881527, "rewards/accuracy_reward": 0.7165178880095482, "rewards/format_reward": 1.0, "step": 274 }, { "completion_length": 417.4085006713867, "epoch": 0.021515015688032272, "grad_norm": 0.14241353420568645, "kl": 0.0009799003601074219, "learning_rate": 9.988582404317153e-07, "loss": 0.0, "reward": 1.8102679550647736, "reward_std": 0.12422484811395407, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 276 }, { "completion_length": 420.8750190734863, "epoch": 0.021670921598815115, "grad_norm": 0.16335759475411415, "kl": 0.0009617805480957031, "learning_rate": 9.9884163965315e-07, "loss": 0.0, "reward": 1.8102679401636124, "reward_std": 0.14136577863246202, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 278 }, { "completion_length": 434.4910888671875, "epoch": 0.021826827509597958, "grad_norm": 0.16507928027863591, "kl": 0.001071929931640625, "learning_rate": 9.988249191992852e-07, "loss": 0.0, "reward": 1.8080357760190964, "reward_std": 0.17247798666357994, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 280 }, { "completion_length": 426.6629638671875, "epoch": 0.0219827334203808, "grad_norm": 0.1417938894535579, "kl": 0.0010085105895996094, "learning_rate": 9.98808079074132e-07, "loss": 0.0, "reward": 1.821428656578064, "reward_std": 0.12114662397652864, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 282 }, { "completion_length": 428.1205520629883, "epoch": 0.022138639331163643, "grad_norm": 0.18840468644136554, "kl": 0.0010828971862792969, "learning_rate": 9.987911192817307e-07, "loss": 0.0, "reward": 1.852678656578064, "reward_std": 0.16863656975328922, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 284 }, { "completion_length": 413.5870704650879, "epoch": 0.022294545241946486, "grad_norm": 0.15805840564561696, "kl": 0.0010166168212890625, "learning_rate": 9.987740398261498e-07, "loss": 0.0, "reward": 1.7477679401636124, "reward_std": 0.17149725276976824, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 286 }, { "completion_length": 426.8348388671875, "epoch": 0.02245045115272933, "grad_norm": 0.1297379088690189, "kl": 0.0009903907775878906, "learning_rate": 9.987568407114867e-07, "loss": 0.0, "reward": 1.734375074505806, "reward_std": 0.10656691901385784, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 288 }, { "completion_length": 419.23662185668945, "epoch": 0.022606357063512172, "grad_norm": 0.1485034520197664, "kl": 0.0011587142944335938, "learning_rate": 9.98739521941868e-07, "loss": 0.0, "reward": 1.7343750596046448, "reward_std": 0.1446651853621006, "rewards/accuracy_reward": 0.7343750335276127, "rewards/format_reward": 1.0, "step": 290 }, { "completion_length": 428.1361770629883, "epoch": 0.022762262974295015, "grad_norm": 0.1442276598597914, "kl": 0.001140594482421875, "learning_rate": 9.987220835214483e-07, "loss": 0.0, "reward": 1.783482238650322, "reward_std": 0.13663421012461185, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 0.9977678656578064, "step": 292 }, { "completion_length": 433.9977836608887, "epoch": 0.022918168885077854, "grad_norm": 0.15248129720756784, "kl": 0.0011267662048339844, "learning_rate": 9.987045254544112e-07, "loss": 0.0, "reward": 1.7366072237491608, "reward_std": 0.17059076949954033, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 1.0, "step": 294 }, { "completion_length": 419.45314025878906, "epoch": 0.023074074795860697, "grad_norm": 0.15048530401749652, "kl": 0.0010042190551757812, "learning_rate": 9.98686847744969e-07, "loss": 0.0, "reward": 1.8058036416769028, "reward_std": 0.18163298442959785, "rewards/accuracy_reward": 0.805803619325161, "rewards/format_reward": 1.0, "step": 296 }, { "completion_length": 411.5870704650879, "epoch": 0.02322998070664354, "grad_norm": 0.091643270662043, "kl": 0.0009026527404785156, "learning_rate": 9.986690503973627e-07, "loss": 0.0, "reward": 1.8883929401636124, "reward_std": 0.0795136708766222, "rewards/accuracy_reward": 0.8883928954601288, "rewards/format_reward": 1.0, "step": 298 }, { "completion_length": 417.2388572692871, "epoch": 0.023385886617426382, "grad_norm": 0.15525443431038377, "kl": 0.0010890960693359375, "learning_rate": 9.98651133415862e-07, "loss": 0.0, "reward": 1.7611607909202576, "reward_std": 0.12843717727810144, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 1.0, "step": 300 }, { "completion_length": 418.53795623779297, "epoch": 0.023541792528209225, "grad_norm": 0.15737564278912355, "kl": 0.0010848045349121094, "learning_rate": 9.986330968047654e-07, "loss": 0.0, "reward": 1.7433036714792252, "reward_std": 0.16458124481141567, "rewards/accuracy_reward": 0.7433035895228386, "rewards/format_reward": 1.0, "step": 302 }, { "completion_length": 423.2835006713867, "epoch": 0.023697698438992068, "grad_norm": 0.14306103185024321, "kl": 0.000919342041015625, "learning_rate": 9.986149405683998e-07, "loss": 0.0, "reward": 1.7745536267757416, "reward_std": 0.11910687573254108, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 304 }, { "completion_length": 420.814754486084, "epoch": 0.02385360434977491, "grad_norm": 0.15655578139567058, "kl": 0.0009570121765136719, "learning_rate": 9.985966647111215e-07, "loss": 0.0, "reward": 1.8102679252624512, "reward_std": 0.13166736718267202, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 306 }, { "completion_length": 436.18528747558594, "epoch": 0.024009510260557754, "grad_norm": 0.13019428216472778, "kl": 0.0009813308715820312, "learning_rate": 9.985782692373142e-07, "loss": 0.0, "reward": 1.834821492433548, "reward_std": 0.0890587167814374, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 308 }, { "completion_length": 433.83707427978516, "epoch": 0.024165416171340597, "grad_norm": 0.09851821964330328, "kl": 0.0010385513305664062, "learning_rate": 9.985597541513914e-07, "loss": 0.0, "reward": 1.8035715222358704, "reward_std": 0.09769588056951761, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 310 }, { "completion_length": 424.32368087768555, "epoch": 0.02432132208212344, "grad_norm": 0.15378585385799545, "kl": 0.0011272430419921875, "learning_rate": 9.985411194577955e-07, "loss": 0.0, "reward": 1.8169643729925156, "reward_std": 0.1401829794049263, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 0.9977678656578064, "step": 312 }, { "completion_length": 422.5647506713867, "epoch": 0.024477227992906282, "grad_norm": 0.12029543765508587, "kl": 0.0010638236999511719, "learning_rate": 9.985223651609964e-07, "loss": 0.0, "reward": 1.7767857760190964, "reward_std": 0.14835184067487717, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 0.9977678656578064, "step": 314 }, { "completion_length": 413.3906478881836, "epoch": 0.024633133903689125, "grad_norm": 0.15221461407056153, "kl": 0.0010557174682617188, "learning_rate": 9.985034912654937e-07, "loss": 0.0, "reward": 1.7700893729925156, "reward_std": 0.15518733020871878, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 316 }, { "completion_length": 421.75671768188477, "epoch": 0.024789039814471968, "grad_norm": 0.1413379194966749, "kl": 0.0011153221130371094, "learning_rate": 9.984844977758153e-07, "loss": 0.0, "reward": 1.8169643580913544, "reward_std": 0.14143359754234552, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 318 }, { "completion_length": 421.5915336608887, "epoch": 0.024944945725254807, "grad_norm": 0.11921979382923294, "kl": 0.0009741783142089844, "learning_rate": 9.984653846965178e-07, "loss": 0.0, "reward": 1.8058036714792252, "reward_std": 0.14260851219296455, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 0.9977678656578064, "step": 320 }, { "completion_length": 422.3995704650879, "epoch": 0.02510085163603765, "grad_norm": 0.17375172863332639, "kl": 0.0010395050048828125, "learning_rate": 9.984461520321868e-07, "loss": 0.0, "reward": 1.8415179550647736, "reward_std": 0.15518732741475105, "rewards/accuracy_reward": 0.8415178917348385, "rewards/format_reward": 1.0, "step": 322 }, { "completion_length": 425.73440170288086, "epoch": 0.025256757546820493, "grad_norm": 0.14269728922837815, "kl": 0.0010428428649902344, "learning_rate": 9.984267997874359e-07, "loss": 0.0, "reward": 1.790178656578064, "reward_std": 0.14887387864291668, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 324 }, { "completion_length": 426.10716247558594, "epoch": 0.025412663457603336, "grad_norm": 0.13607069453902287, "kl": 0.0011076927185058594, "learning_rate": 9.984073279669083e-07, "loss": 0.0, "reward": 1.7723215073347092, "reward_std": 0.13038997445255518, "rewards/accuracy_reward": 0.772321455180645, "rewards/format_reward": 1.0, "step": 326 }, { "completion_length": 432.7611846923828, "epoch": 0.02556856936838618, "grad_norm": 0.11226275481210608, "kl": 0.0009984970092773438, "learning_rate": 9.983877365752752e-07, "loss": 0.0, "reward": 1.7834822237491608, "reward_std": 0.15206660702824593, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 0.9977678656578064, "step": 328 }, { "completion_length": 425.99555587768555, "epoch": 0.02572447527916902, "grad_norm": 0.13143567396354225, "kl": 0.0009946823120117188, "learning_rate": 9.983680256172365e-07, "loss": 0.0, "reward": 1.7924107909202576, "reward_std": 0.07710581179708242, "rewards/accuracy_reward": 0.7924107387661934, "rewards/format_reward": 1.0, "step": 330 }, { "completion_length": 429.5535888671875, "epoch": 0.025880381189951864, "grad_norm": 0.15897737967998524, "kl": 0.001216888427734375, "learning_rate": 9.983481950975215e-07, "loss": 0.0, "reward": 1.7232143580913544, "reward_std": 0.15405545849353075, "rewards/accuracy_reward": 0.7232143208384514, "rewards/format_reward": 1.0, "step": 332 }, { "completion_length": 437.3437690734863, "epoch": 0.026036287100734707, "grad_norm": 0.1326607403702998, "kl": 0.00110626220703125, "learning_rate": 9.98328245020887e-07, "loss": 0.0, "reward": 1.7433036416769028, "reward_std": 0.1406062226742506, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 0.9977678656578064, "step": 334 }, { "completion_length": 417.38171768188477, "epoch": 0.02619219301151755, "grad_norm": 0.1290344588318605, "kl": 0.0010461807250976562, "learning_rate": 9.983081753921197e-07, "loss": 0.0, "reward": 1.79464291036129, "reward_std": 0.10333532746881247, "rewards/accuracy_reward": 0.7968750260770321, "rewards/format_reward": 0.9977678656578064, "step": 336 }, { "completion_length": 430.9308280944824, "epoch": 0.026348098922300393, "grad_norm": 0.16423975619428405, "kl": 0.0011296272277832031, "learning_rate": 9.982879862160344e-07, "loss": 0.0, "reward": 1.8035715073347092, "reward_std": 0.1508252713829279, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 0.9977678656578064, "step": 338 }, { "completion_length": 427.98216247558594, "epoch": 0.026504004833083235, "grad_norm": 0.15318438145409302, "kl": 0.001190185546875, "learning_rate": 9.98267677497474e-07, "loss": 0.0, "reward": 1.727678656578064, "reward_std": 0.170744139701128, "rewards/accuracy_reward": 0.7276786044239998, "rewards/format_reward": 1.0, "step": 340 }, { "completion_length": 424.7277030944824, "epoch": 0.02665991074386608, "grad_norm": 1258640860744.7776, "kl": 5200936960.000877, "learning_rate": 9.982472492413114e-07, "loss": 208799344.0, "reward": 1.7767857909202576, "reward_std": 0.13249473739415407, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 342 }, { "completion_length": 422.3727836608887, "epoch": 0.026815816654648918, "grad_norm": 0.13587989552892932, "kl": 0.0011310577392578125, "learning_rate": 9.982267014524473e-07, "loss": 0.0, "reward": 1.7700893729925156, "reward_std": 0.07094432692974806, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 344 }, { "completion_length": 433.74555587768555, "epoch": 0.02697172256543176, "grad_norm": 0.1303294435038458, "kl": 0.0011668205261230469, "learning_rate": 9.982060341358113e-07, "loss": 0.0, "reward": 1.7232143580913544, "reward_std": 0.13008548226207495, "rewards/accuracy_reward": 0.723214328289032, "rewards/format_reward": 1.0, "step": 346 }, { "completion_length": 429.2031440734863, "epoch": 0.027127628476214603, "grad_norm": 0.18685827854655315, "kl": 0.0011110305786132812, "learning_rate": 9.981852472963611e-07, "loss": 0.0, "reward": 1.7924107909202576, "reward_std": 0.17135116644203663, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 348 }, { "completion_length": 436.6852836608887, "epoch": 0.027283534386997446, "grad_norm": 0.16659443361897433, "kl": 0.0013189315795898438, "learning_rate": 9.98164340939084e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.1448199525475502, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 0.9977678656578064, "step": 350 }, { "completion_length": 427.1250190734863, "epoch": 0.02743944029778029, "grad_norm": 0.13302407579298145, "kl": 0.0012998580932617188, "learning_rate": 9.981433150689958e-07, "loss": 0.0001, "reward": 1.703125074505806, "reward_std": 0.12998160626739264, "rewards/accuracy_reward": 0.7053571790456772, "rewards/format_reward": 0.9977678656578064, "step": 352 }, { "completion_length": 423.1294822692871, "epoch": 0.027595346208563132, "grad_norm": 0.13859069963169365, "kl": 0.0011591911315917969, "learning_rate": 9.981221696911404e-07, "loss": 0.0, "reward": 1.796875074505806, "reward_std": 0.11483317241072655, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 354 }, { "completion_length": 426.6160888671875, "epoch": 0.027751252119345975, "grad_norm": 0.16583548996617842, "kl": 0.0011949539184570312, "learning_rate": 9.981009048105905e-07, "loss": 0.0, "reward": 1.758928656578064, "reward_std": 0.09671877790242434, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 1.0, "step": 356 }, { "completion_length": 417.9308204650879, "epoch": 0.027907158030128817, "grad_norm": 0.17922325301020162, "kl": 0.0012278556823730469, "learning_rate": 9.980795204324483e-07, "loss": 0.0, "reward": 1.758928656578064, "reward_std": 0.15518592949956656, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 358 }, { "completion_length": 410.08707427978516, "epoch": 0.02806306394091166, "grad_norm": 0.12217400933773048, "kl": 0.001331329345703125, "learning_rate": 9.980580165618437e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.10919232107698917, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 360 }, { "completion_length": 439.1986846923828, "epoch": 0.028218969851694503, "grad_norm": 0.14413915018546655, "kl": 0.001285552978515625, "learning_rate": 9.980363932039357e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.12437821365892887, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 362 }, { "completion_length": 431.471004486084, "epoch": 0.028374875762477346, "grad_norm": 0.14682160150555135, "kl": 0.0013680458068847656, "learning_rate": 9.980146503639118e-07, "loss": 0.0001, "reward": 1.718750074505806, "reward_std": 0.1523548262193799, "rewards/accuracy_reward": 0.723214328289032, "rewards/format_reward": 0.9955357313156128, "step": 364 }, { "completion_length": 429.3660888671875, "epoch": 0.02853078167326019, "grad_norm": 0.1480449332076798, "kl": 0.001312255859375, "learning_rate": 9.97992788046988e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.17833638470619917, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 366 }, { "completion_length": 421.4486846923828, "epoch": 0.02868668758404303, "grad_norm": 0.1252511053179726, "kl": 0.0012674331665039062, "learning_rate": 9.979708062584098e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.10160007327795029, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 368 }, { "completion_length": 422.02457427978516, "epoch": 0.02884259349482587, "grad_norm": 0.13963252337289522, "kl": 0.0012063980102539062, "learning_rate": 9.979487050034504e-07, "loss": 0.0, "reward": 1.8169643878936768, "reward_std": 0.1006243759766221, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 370 }, { "completion_length": 424.6830520629883, "epoch": 0.028998499405608714, "grad_norm": 0.17099875781596052, "kl": 0.0013399124145507812, "learning_rate": 9.979264842874118e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.15549266710877419, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 372 }, { "completion_length": 431.07368087768555, "epoch": 0.029154405316391557, "grad_norm": 0.12096786166369057, "kl": 0.0016117095947265625, "learning_rate": 9.979041441156253e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.13285088166594505, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 0.9977678656578064, "step": 374 }, { "completion_length": 439.3326072692871, "epoch": 0.0293103112271744, "grad_norm": 0.19928877705889192, "kl": 0.001476287841796875, "learning_rate": 9.978816844934505e-07, "loss": 0.0001, "reward": 1.7299107909202576, "reward_std": 0.19532395154237747, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 1.0, "step": 376 }, { "completion_length": 421.77457427978516, "epoch": 0.029466217137957242, "grad_norm": 0.16778404861961607, "kl": 0.001293182373046875, "learning_rate": 9.978591054262753e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.11340101063251495, "rewards/accuracy_reward": 0.845982164144516, "rewards/format_reward": 1.0, "step": 378 }, { "completion_length": 425.2901916503906, "epoch": 0.029622123048740085, "grad_norm": 0.13902757096125776, "kl": 0.0013489723205566406, "learning_rate": 9.978364069195167e-07, "loss": 0.0001, "reward": 1.7254464775323868, "reward_std": 0.11791503801941872, "rewards/accuracy_reward": 0.725446455180645, "rewards/format_reward": 1.0, "step": 380 }, { "completion_length": 415.5647506713867, "epoch": 0.029778028959522928, "grad_norm": 0.11347634868447472, "kl": 0.001392364501953125, "learning_rate": 9.978135889786203e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.08702036831527948, "rewards/accuracy_reward": 0.7633929029107094, "rewards/format_reward": 1.0, "step": 382 }, { "completion_length": 430.9643096923828, "epoch": 0.02993393487030577, "grad_norm": 0.14095251861860916, "kl": 0.0013208389282226562, "learning_rate": 9.9779065160906e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.13984806835651398, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 384 }, { "completion_length": 429.91966247558594, "epoch": 0.030089840781088614, "grad_norm": 0.1602986674560949, "kl": 0.0013298988342285156, "learning_rate": 9.97767594816339e-07, "loss": 0.0001, "reward": 1.79241082072258, "reward_std": 0.15857088658958673, "rewards/accuracy_reward": 0.7924107499420643, "rewards/format_reward": 1.0, "step": 386 }, { "completion_length": 418.9062690734863, "epoch": 0.030245746691871456, "grad_norm": 0.09392412522042794, "kl": 0.0012750625610351562, "learning_rate": 9.977444186059885e-07, "loss": 0.0001, "reward": 1.8705358058214188, "reward_std": 0.09265981614589691, "rewards/accuracy_reward": 0.8705357536673546, "rewards/format_reward": 1.0, "step": 388 }, { "completion_length": 420.45984268188477, "epoch": 0.0304016526026543, "grad_norm": 0.10668937647760851, "kl": 0.0014400482177734375, "learning_rate": 9.977211229835687e-07, "loss": 0.0001, "reward": 1.7120536714792252, "reward_std": 0.13527126610279083, "rewards/accuracy_reward": 0.7120535969734192, "rewards/format_reward": 1.0, "step": 390 }, { "completion_length": 407.4196586608887, "epoch": 0.030557558513437142, "grad_norm": 0.10220820817242766, "kl": 0.0012683868408203125, "learning_rate": 9.976977079546682e-07, "loss": 0.0001, "reward": 1.8147322535514832, "reward_std": 0.09665096271783113, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 0.9977678656578064, "step": 392 }, { "completion_length": 426.32368087768555, "epoch": 0.030713464424219985, "grad_norm": 0.17741304413358733, "kl": 0.00141143798828125, "learning_rate": 9.976741735249048e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.17788355704396963, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 394 }, { "completion_length": 418.2522506713867, "epoch": 0.030869370335002824, "grad_norm": 0.12461231454823124, "kl": 0.001277923583984375, "learning_rate": 9.976505196999243e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.12160168495029211, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 396 }, { "completion_length": 428.19198989868164, "epoch": 0.031025276245785667, "grad_norm": 0.15756570042022477, "kl": 0.0012598037719726562, "learning_rate": 9.976267464854014e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.15488283056765795, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 398 }, { "completion_length": 419.9799346923828, "epoch": 0.03118118215656851, "grad_norm": 0.1164642778364414, "kl": 0.00124359130859375, "learning_rate": 9.976028538870396e-07, "loss": 0.0, "reward": 1.7991072088479996, "reward_std": 0.12340251542627811, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 400 }, { "completion_length": 418.5044822692871, "epoch": 0.03133708806735135, "grad_norm": 0.16528161701294478, "kl": 0.0012607574462890625, "learning_rate": 9.975788419105706e-07, "loss": 0.0001, "reward": 1.7544643431901932, "reward_std": 0.1432316256687045, "rewards/accuracy_reward": 0.754464328289032, "rewards/format_reward": 1.0, "step": 402 }, { "completion_length": 421.53349685668945, "epoch": 0.031492993978134196, "grad_norm": 0.144992462350967, "kl": 0.0013427734375, "learning_rate": 9.975547105617552e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.13054053485393524, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 404 }, { "completion_length": 423.1785888671875, "epoch": 0.03164889988891704, "grad_norm": 0.13465785198529193, "kl": 0.0012450218200683594, "learning_rate": 9.975304598463829e-07, "loss": 0.0, "reward": 1.8348215222358704, "reward_std": 0.11158680729568005, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 0.9977678656578064, "step": 406 }, { "completion_length": 408.8906440734863, "epoch": 0.03180480579969988, "grad_norm": 0.1306002425227407, "kl": 0.0012917518615722656, "learning_rate": 9.975060897702712e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.11498290114104748, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 408 }, { "completion_length": 422.9821586608887, "epoch": 0.031960711710482724, "grad_norm": 0.13874196519618254, "kl": 0.0012617111206054688, "learning_rate": 9.974816003392667e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.1250522220507264, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 410 }, { "completion_length": 417.15849685668945, "epoch": 0.03211661762126557, "grad_norm": 0.16288322466302393, "kl": 0.0011835098266601562, "learning_rate": 9.97456991559245e-07, "loss": 0.0, "reward": 1.7544643580913544, "reward_std": 0.14676631055772305, "rewards/accuracy_reward": 0.7544643133878708, "rewards/format_reward": 1.0, "step": 412 }, { "completion_length": 419.8727836608887, "epoch": 0.03227252353204841, "grad_norm": 0.11881184136912183, "kl": 0.0012164115905761719, "learning_rate": 9.974322634361093e-07, "loss": 0.0, "reward": 1.8415179401636124, "reward_std": 0.1158102685585618, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 0.9977678656578064, "step": 414 }, { "completion_length": 421.37501525878906, "epoch": 0.03242842944283125, "grad_norm": 0.15834602995868685, "kl": 0.0012998580932617188, "learning_rate": 9.974074159757924e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.1529300371184945, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 416 }, { "completion_length": 417.6272506713867, "epoch": 0.032584335353614095, "grad_norm": 0.15664118727045456, "kl": 0.0014200210571289062, "learning_rate": 9.973824491842552e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.1287388727068901, "rewards/accuracy_reward": 0.738839328289032, "rewards/format_reward": 1.0, "step": 418 }, { "completion_length": 432.22322845458984, "epoch": 0.03274024126439694, "grad_norm": 0.14347625794921423, "kl": 0.0013804435729980469, "learning_rate": 9.973573630674876e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.15271389111876488, "rewards/accuracy_reward": 0.7968750223517418, "rewards/format_reward": 1.0, "step": 420 }, { "completion_length": 423.0111846923828, "epoch": 0.03289614717517978, "grad_norm": 0.14352061899996696, "kl": 0.001346588134765625, "learning_rate": 9.97332157631508e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.11761194188147783, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 422 }, { "completion_length": 415.62501525878906, "epoch": 0.033052053085962624, "grad_norm": 0.1368930994774858, "kl": 0.0012011528015136719, "learning_rate": 9.973068328823629e-07, "loss": 0.0, "reward": 1.7165179401636124, "reward_std": 0.14692107867449522, "rewards/accuracy_reward": 0.7165179029107094, "rewards/format_reward": 1.0, "step": 424 }, { "completion_length": 434.9888610839844, "epoch": 0.03320795899674547, "grad_norm": 0.16913520500273102, "kl": 0.0015630722045898438, "learning_rate": 9.97281388826128e-07, "loss": 0.0001, "reward": 1.7120536714792252, "reward_std": 0.19554148986935616, "rewards/accuracy_reward": 0.7120536118745804, "rewards/format_reward": 1.0, "step": 426 }, { "completion_length": 421.79689025878906, "epoch": 0.03336386490752831, "grad_norm": 0.15231503529177454, "kl": 0.0012898445129394531, "learning_rate": 9.972558254689076e-07, "loss": 0.0001, "reward": 1.696428656578064, "reward_std": 0.12685529049485922, "rewards/accuracy_reward": 0.6964285969734192, "rewards/format_reward": 1.0, "step": 428 }, { "completion_length": 410.9308204650879, "epoch": 0.03351977081831115, "grad_norm": 0.1294099512507865, "kl": 0.0012574195861816406, "learning_rate": 9.972301428168348e-07, "loss": 0.0001, "reward": 1.7745536714792252, "reward_std": 0.1281326785683632, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 430 }, { "completion_length": 417.72993087768555, "epoch": 0.033675676729093995, "grad_norm": 0.13627078714629565, "kl": 0.0012159347534179688, "learning_rate": 9.972043408760703e-07, "loss": 0.0, "reward": 1.8303572237491608, "reward_std": 0.11504931468516588, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 432 }, { "completion_length": 417.01341247558594, "epoch": 0.03383158263987683, "grad_norm": 0.14611515781856474, "kl": 0.0012636184692382812, "learning_rate": 9.97178419652805e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.12304428406059742, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 0.9977678656578064, "step": 434 }, { "completion_length": 423.9085006713867, "epoch": 0.033987488550659674, "grad_norm": 0.14024180527995336, "kl": 0.0012798309326171875, "learning_rate": 9.971523791532572e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.11971446499228477, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 436 }, { "completion_length": 423.4218940734863, "epoch": 0.03414339446144252, "grad_norm": 0.15719127440358227, "kl": 0.001270294189453125, "learning_rate": 9.971262193836738e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.15713872388005257, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 438 }, { "completion_length": 402.71207427978516, "epoch": 0.03429930037222536, "grad_norm": 0.09305993105248994, "kl": 0.0028009414672851562, "learning_rate": 9.970999403503313e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.1221237238496542, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 440 }, { "completion_length": 424.5803680419922, "epoch": 0.0344552062830082, "grad_norm": 0.09516547179194673, "kl": 0.0011491775512695312, "learning_rate": 9.97073542059534e-07, "loss": 0.0, "reward": 1.8325893878936768, "reward_std": 0.10964877903461456, "rewards/accuracy_reward": 0.8348214775323868, "rewards/format_reward": 0.9977678656578064, "step": 442 }, { "completion_length": 418.5178756713867, "epoch": 0.034611112193791045, "grad_norm": 0.11442754476650838, "kl": 0.001232147216796875, "learning_rate": 9.97047024517615e-07, "loss": 0.0, "reward": 1.8236608058214188, "reward_std": 0.12474912870675325, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 444 }, { "completion_length": 433.8013610839844, "epoch": 0.03476701810457389, "grad_norm": 0.13413214821786207, "kl": 0.0013408660888671875, "learning_rate": 9.970203877309355e-07, "loss": 0.0001, "reward": 1.6741072237491608, "reward_std": 0.12633325159549713, "rewards/accuracy_reward": 0.6808036118745804, "rewards/format_reward": 0.9933035969734192, "step": 446 }, { "completion_length": 412.5558166503906, "epoch": 0.03492292401535673, "grad_norm": 0.1535991267173143, "kl": 0.0013189315795898438, "learning_rate": 9.969936317058868e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.13013497553765774, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 0.9977678656578064, "step": 448 }, { "completion_length": 417.142879486084, "epoch": 0.035078829926139574, "grad_norm": 0.1291916238515442, "kl": 0.0013213157653808594, "learning_rate": 9.969667564488873e-07, "loss": 0.0001, "reward": 1.709821492433548, "reward_std": 0.13850509747862816, "rewards/accuracy_reward": 0.7098214626312256, "rewards/format_reward": 1.0, "step": 450 }, { "completion_length": 425.6562690734863, "epoch": 0.035234735836922416, "grad_norm": 0.14575468947405876, "kl": 0.0013360977172851562, "learning_rate": 9.969397619663846e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.14286575186997652, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 452 }, { "completion_length": 422.955379486084, "epoch": 0.03539064174770526, "grad_norm": 0.1535657635369093, "kl": 0.0013175010681152344, "learning_rate": 9.969126482648548e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.1222756914794445, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 454 }, { "completion_length": 422.1562690734863, "epoch": 0.0355465476584881, "grad_norm": 0.15028363450853358, "kl": 0.0013170242309570312, "learning_rate": 9.968854153508028e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.12444603070616722, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 456 }, { "completion_length": 415.1138572692871, "epoch": 0.035702453569270945, "grad_norm": 0.14218290953064644, "kl": 0.0012464523315429688, "learning_rate": 9.968580632307618e-07, "loss": 0.0, "reward": 1.8125000596046448, "reward_std": 0.13414444588124752, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 458 }, { "completion_length": 411.86609268188477, "epoch": 0.03585835948005379, "grad_norm": 0.11835968027674192, "kl": 0.0013647079467773438, "learning_rate": 9.968305919112938e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.09138242714107037, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 1.0, "step": 460 }, { "completion_length": 406.72099685668945, "epoch": 0.03601426539083663, "grad_norm": 0.11275721705416167, "kl": 0.0013456344604492188, "learning_rate": 9.968030013989895e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.11626533046364784, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 462 }, { "completion_length": 433.00225830078125, "epoch": 0.03617017130161947, "grad_norm": 0.15289263743665948, "kl": 0.0014505386352539062, "learning_rate": 9.967752917004676e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.12279632966965437, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 464 }, { "completion_length": 424.6741256713867, "epoch": 0.036326077212402316, "grad_norm": 0.1411172076779338, "kl": 0.0012116432189941406, "learning_rate": 9.967474628223766e-07, "loss": 0.0, "reward": 1.8504465073347092, "reward_std": 0.12910978123545647, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 466 }, { "completion_length": 433.50671768188477, "epoch": 0.03648198312318516, "grad_norm": 0.14152590005555854, "kl": 0.0014514923095703125, "learning_rate": 9.96719514771392e-07, "loss": 0.0001, "reward": 1.8169643431901932, "reward_std": 0.12858410738408566, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 468 }, { "completion_length": 424.424129486084, "epoch": 0.036637889033968, "grad_norm": 0.13683717997443795, "kl": 0.0013628005981445312, "learning_rate": 9.966914475542193e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.09040532819926739, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 470 }, { "completion_length": 416.52234268188477, "epoch": 0.036793794944750845, "grad_norm": 0.14691723623092834, "kl": 0.0013799667358398438, "learning_rate": 9.966632611775916e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.155489020049572, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 472 }, { "completion_length": 428.30359268188477, "epoch": 0.03694970085553369, "grad_norm": 0.13679970366646918, "kl": 0.0016155242919921875, "learning_rate": 9.966349556482713e-07, "loss": 0.0001, "reward": 1.6741072088479996, "reward_std": 0.18245531618595123, "rewards/accuracy_reward": 0.6741071715950966, "rewards/format_reward": 1.0, "step": 474 }, { "completion_length": 414.1183280944824, "epoch": 0.03710560676631653, "grad_norm": 0.152529968789961, "kl": 0.0013327598571777344, "learning_rate": 9.966065309730489e-07, "loss": 0.0001, "reward": 1.7343750596046448, "reward_std": 0.10303223133087158, "rewards/accuracy_reward": 0.7343750223517418, "rewards/format_reward": 1.0, "step": 476 }, { "completion_length": 414.2968940734863, "epoch": 0.03726151267709937, "grad_norm": 0.1474286340758123, "kl": 0.0013675689697265625, "learning_rate": 9.96577987158744e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.16113211028277874, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 0.9977678656578064, "step": 478 }, { "completion_length": 436.5357360839844, "epoch": 0.037417418587882216, "grad_norm": 0.11507195698767975, "kl": 0.0014963150024414062, "learning_rate": 9.965493242122037e-07, "loss": 0.0001, "reward": 1.7366072088479996, "reward_std": 0.12136416882276535, "rewards/accuracy_reward": 0.7366071790456772, "rewards/format_reward": 1.0, "step": 480 }, { "completion_length": 413.57143783569336, "epoch": 0.03757332449866506, "grad_norm": 0.1245026769872319, "kl": 0.0013360977172851562, "learning_rate": 9.965205421403052e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.11483176704496145, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 482 }, { "completion_length": 431.12724685668945, "epoch": 0.037729230409447895, "grad_norm": 0.09492777083586461, "kl": 0.001255035400390625, "learning_rate": 9.96491640949953e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.07973121665418148, "rewards/accuracy_reward": 0.8437500223517418, "rewards/format_reward": 1.0, "step": 484 }, { "completion_length": 427.714298248291, "epoch": 0.03788513632023074, "grad_norm": 0.11430601087785747, "kl": 0.0014142990112304688, "learning_rate": 9.96462620648081e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.08244217094033957, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 486 }, { "completion_length": 421.6942138671875, "epoch": 0.03804104223101358, "grad_norm": 0.14520970984813317, "kl": 0.001323699951171875, "learning_rate": 9.96433481241651e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.13296164479106665, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 0.9977678656578064, "step": 488 }, { "completion_length": 409.830379486084, "epoch": 0.03819694814179642, "grad_norm": 0.15289547600850067, "kl": 0.0016651153564453125, "learning_rate": 9.964042227376543e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.10986632481217384, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 490 }, { "completion_length": 434.3393020629883, "epoch": 0.038352854052579266, "grad_norm": 0.11097894193981754, "kl": 0.0013904571533203125, "learning_rate": 9.963748451431096e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.11924827191978693, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 0.9977678656578064, "step": 492 }, { "completion_length": 428.6138572692871, "epoch": 0.03850875996336211, "grad_norm": 0.16591073415100596, "kl": 0.0013875961303710938, "learning_rate": 9.96345348465065e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.13766576070338488, "rewards/accuracy_reward": 0.7544643171131611, "rewards/format_reward": 0.9977678656578064, "step": 494 }, { "completion_length": 418.17635345458984, "epoch": 0.03866466587414495, "grad_norm": 0.10694816910463441, "kl": 0.0012798309326171875, "learning_rate": 9.963157327105971e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.1243731752038002, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 496 }, { "completion_length": 419.6071586608887, "epoch": 0.038820571784927795, "grad_norm": 0.09970353205524511, "kl": 0.0014505386352539062, "learning_rate": 9.962859978868107e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.06899292953312397, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 498 }, { "completion_length": 412.4419822692871, "epoch": 0.03897647769571064, "grad_norm": 0.12490541015030103, "kl": 0.0012125968933105469, "learning_rate": 9.962561440008397e-07, "loss": 0.0, "reward": 1.8147322237491608, "reward_std": 0.09136765450239182, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 0.9977678656578064, "step": 500 }, { "completion_length": 405.7857360839844, "epoch": 0.03913238360649348, "grad_norm": 0.13300055896200924, "kl": 0.001369476318359375, "learning_rate": 9.962261710598457e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.15713872574269772, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 502 }, { "completion_length": 417.2924270629883, "epoch": 0.03928828951727632, "grad_norm": 0.12462507099041066, "kl": 0.0013780593872070312, "learning_rate": 9.961960790710199e-07, "loss": 0.0001, "reward": 1.8593750894069672, "reward_std": 0.118066162802279, "rewards/accuracy_reward": 0.8593750223517418, "rewards/format_reward": 1.0, "step": 504 }, { "completion_length": 432.03796768188477, "epoch": 0.039444195428059166, "grad_norm": 0.1429092925075557, "kl": 0.0015044212341308594, "learning_rate": 9.961658680415812e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.1658600326627493, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 0.9977678656578064, "step": 506 }, { "completion_length": 426.90849685668945, "epoch": 0.03960010133884201, "grad_norm": 0.14812149967405697, "kl": 0.0014667510986328125, "learning_rate": 9.961355379787776e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.13932883087545633, "rewards/accuracy_reward": 0.7633929029107094, "rewards/format_reward": 1.0, "step": 508 }, { "completion_length": 428.64287185668945, "epoch": 0.03975600724962485, "grad_norm": 0.1434791735951183, "kl": 0.0014944076538085938, "learning_rate": 9.961050888898855e-07, "loss": 0.0001, "reward": 1.7366072237491608, "reward_std": 0.15712535567581654, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 0.9955357313156128, "step": 510 }, { "completion_length": 423.5268020629883, "epoch": 0.039911913160407694, "grad_norm": 0.09875584527141615, "kl": 0.0012884140014648438, "learning_rate": 9.960745207822097e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.07936170790344477, "rewards/accuracy_reward": 0.8638393357396126, "rewards/format_reward": 1.0, "step": 512 }, { "completion_length": 426.908504486084, "epoch": 0.04006781907119054, "grad_norm": 0.12518712527684556, "kl": 0.0013370513916015625, "learning_rate": 9.960438336630838e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.11883699893951416, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 0.9955357313156128, "step": 514 }, { "completion_length": 419.5468978881836, "epoch": 0.04022372498197338, "grad_norm": 0.12119989293203065, "kl": 0.001373291015625, "learning_rate": 9.960130275398698e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.1250522220507264, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 0.9977678656578064, "step": 516 }, { "completion_length": 428.68528747558594, "epoch": 0.04037963089275622, "grad_norm": 0.13986097018509117, "kl": 0.0013828277587890625, "learning_rate": 9.959821024199583e-07, "loss": 0.0001, "reward": 1.8459821939468384, "reward_std": 0.09343121852725744, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 0.9977678656578064, "step": 518 }, { "completion_length": 415.72769927978516, "epoch": 0.040535536803539066, "grad_norm": 0.13311396991872362, "kl": 0.0012826919555664062, "learning_rate": 9.959510583107683e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.15375377051532269, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 520 }, { "completion_length": 429.7522506713867, "epoch": 0.04069144271432191, "grad_norm": 0.14905675951842284, "kl": 0.00140380859375, "learning_rate": 9.959198952197478e-07, "loss": 0.0001, "reward": 1.7991072535514832, "reward_std": 0.14691967610269785, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 522 }, { "completion_length": 424.2812728881836, "epoch": 0.04084734862510475, "grad_norm": 0.16803655831869144, "kl": 0.0017719268798828125, "learning_rate": 9.958886131543728e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.16781283356249332, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 0.9977678656578064, "step": 524 }, { "completion_length": 427.080379486084, "epoch": 0.041003254535887594, "grad_norm": 0.1487410514498475, "kl": 0.0015468597412109375, "learning_rate": 9.95857212122148e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.16570666432380676, "rewards/accuracy_reward": 0.7343750223517418, "rewards/format_reward": 1.0, "step": 526 }, { "completion_length": 421.9754638671875, "epoch": 0.04115916044667044, "grad_norm": 0.10872558671810666, "kl": 0.0015344619750976562, "learning_rate": 9.958256921306066e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.11550717987120152, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 528 }, { "completion_length": 421.1875190734863, "epoch": 0.04131506635745328, "grad_norm": 0.15128957934574983, "kl": 0.0014858245849609375, "learning_rate": 9.95794053187311e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.12474772334098816, "rewards/accuracy_reward": 0.7276785895228386, "rewards/format_reward": 1.0, "step": 530 }, { "completion_length": 418.3951072692871, "epoch": 0.04147097226823612, "grad_norm": 0.13513744208415074, "kl": 0.001483917236328125, "learning_rate": 9.957622952998511e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.09686850849539042, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 532 }, { "completion_length": 421.8147506713867, "epoch": 0.041626878179018965, "grad_norm": 0.1466670172363767, "kl": 0.00145721435546875, "learning_rate": 9.957304184758462e-07, "loss": 0.0001, "reward": 1.7187500894069672, "reward_std": 0.1873557474464178, "rewards/accuracy_reward": 0.7187500298023224, "rewards/format_reward": 1.0, "step": 534 }, { "completion_length": 423.6049270629883, "epoch": 0.0417827840898018, "grad_norm": 0.1454370473725692, "kl": 0.0013074874877929688, "learning_rate": 9.95698422722943e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.10769738629460335, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 536 }, { "completion_length": 420.4196586608887, "epoch": 0.041938690000584644, "grad_norm": 0.157291144871801, "kl": 0.0014400482177734375, "learning_rate": 9.956663080488183e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.14819986931979656, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 538 }, { "completion_length": 415.34600830078125, "epoch": 0.04209459591136749, "grad_norm": 0.10793091643079387, "kl": 0.0012583732604980469, "learning_rate": 9.956340744611764e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.06493396684527397, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 540 }, { "completion_length": 411.7120666503906, "epoch": 0.04225050182215033, "grad_norm": 0.09689077231898427, "kl": 0.0013494491577148438, "learning_rate": 9.956017219677502e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.04599360655993223, "rewards/accuracy_reward": 0.8482143059372902, "rewards/format_reward": 1.0, "step": 542 }, { "completion_length": 417.0736770629883, "epoch": 0.04240640773293317, "grad_norm": 0.16143711841217942, "kl": 0.001514434814453125, "learning_rate": 9.955692505763012e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.14856937620788813, "rewards/accuracy_reward": 0.7142857387661934, "rewards/format_reward": 1.0, "step": 544 }, { "completion_length": 414.1428756713867, "epoch": 0.042562313643716015, "grad_norm": 0.10072019002535124, "kl": 0.0014309883117675781, "learning_rate": 9.955366602946195e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.09995036851614714, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 1.0, "step": 546 }, { "completion_length": 417.9888572692871, "epoch": 0.04271821955449886, "grad_norm": 0.15980777422503156, "kl": 0.0013427734375, "learning_rate": 9.95503951130524e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.12234126590192318, "rewards/accuracy_reward": 0.7500000335276127, "rewards/format_reward": 1.0, "step": 548 }, { "completion_length": 430.1451072692871, "epoch": 0.0428741254652817, "grad_norm": 0.14783190112449582, "kl": 0.0014333724975585938, "learning_rate": 9.954711230918618e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.14030313026160002, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 0.9977678656578064, "step": 550 }, { "completion_length": 417.0982322692871, "epoch": 0.043030031376064544, "grad_norm": 0.1619509223508833, "kl": 0.0015087127685546875, "learning_rate": 9.954381761865082e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.12001615762710571, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 552 }, { "completion_length": 404.20091247558594, "epoch": 0.04318593728684739, "grad_norm": 0.15873066808343517, "kl": 0.001461029052734375, "learning_rate": 9.954051104223678e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.09333522245287895, "rewards/accuracy_reward": 0.754464328289032, "rewards/format_reward": 1.0, "step": 554 }, { "completion_length": 415.8080520629883, "epoch": 0.04334184319763023, "grad_norm": 0.11830760758175442, "kl": 0.00139617919921875, "learning_rate": 9.95371925807373e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.09266121964901686, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 556 }, { "completion_length": 400.70091247558594, "epoch": 0.04349774910841307, "grad_norm": 0.13360388411755858, "kl": 0.0013723373413085938, "learning_rate": 9.95338622349485e-07, "loss": 0.0001, "reward": 1.8772322237491608, "reward_std": 0.08146646898239851, "rewards/accuracy_reward": 0.8772321864962578, "rewards/format_reward": 1.0, "step": 558 }, { "completion_length": 423.26564025878906, "epoch": 0.043653655019195915, "grad_norm": 0.14656135811015747, "kl": 0.0014004707336425781, "learning_rate": 9.953052000566939e-07, "loss": 0.0001, "reward": 1.8816965073347092, "reward_std": 0.09830066375434399, "rewards/accuracy_reward": 0.8816964700818062, "rewards/format_reward": 1.0, "step": 560 }, { "completion_length": 424.9040336608887, "epoch": 0.04380956092997876, "grad_norm": 0.12790504785353282, "kl": 0.0014934539794921875, "learning_rate": 9.952716589370174e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.13362016063183546, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 562 }, { "completion_length": 419.3571586608887, "epoch": 0.0439654668407616, "grad_norm": 0.13157365035419877, "kl": 0.0013041496276855469, "learning_rate": 9.952379989985026e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.09815234411507845, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 564 }, { "completion_length": 428.5602912902832, "epoch": 0.044121372751544444, "grad_norm": 0.14113537880076354, "kl": 0.0016508102416992188, "learning_rate": 9.952042202492243e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.1446651853621006, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 566 }, { "completion_length": 417.75447845458984, "epoch": 0.04427727866232729, "grad_norm": 0.11389525735961963, "kl": 0.0014243125915527344, "learning_rate": 9.951703226972866e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.09040532819926739, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 568 }, { "completion_length": 415.9397506713867, "epoch": 0.04443318457311013, "grad_norm": 0.15244992709982083, "kl": 0.0014524459838867188, "learning_rate": 9.951363063508218e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.08777992706745863, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 570 }, { "completion_length": 421.1250190734863, "epoch": 0.04458909048389297, "grad_norm": 0.14975853485912088, "kl": 0.0013818740844726562, "learning_rate": 9.951021712179904e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.11745716817677021, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 0.9977678656578064, "step": 572 }, { "completion_length": 417.97546768188477, "epoch": 0.044744996394675815, "grad_norm": 0.17500083576762007, "kl": 0.0015497207641601562, "learning_rate": 9.950679173069819e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.12212231941521168, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 574 }, { "completion_length": 420.06252670288086, "epoch": 0.04490090230545866, "grad_norm": 0.12918551161979147, "kl": 0.0013375282287597656, "learning_rate": 9.950335446260138e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.14609510730952024, "rewards/accuracy_reward": 0.7388393208384514, "rewards/format_reward": 1.0, "step": 576 }, { "completion_length": 420.3169822692871, "epoch": 0.0450568082162415, "grad_norm": 0.08275672980584155, "kl": 0.0013475418090820312, "learning_rate": 9.949990531833323e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06155041232705116, "rewards/accuracy_reward": 0.8549107611179352, "rewards/format_reward": 1.0, "step": 578 }, { "completion_length": 417.0736770629883, "epoch": 0.045212714127024344, "grad_norm": 0.11981886148339083, "kl": 0.0014820098876953125, "learning_rate": 9.949644429872124e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.08957795333117247, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 580 }, { "completion_length": 421.36162185668945, "epoch": 0.045368620037807186, "grad_norm": 0.17352716625012624, "kl": 0.0014905929565429688, "learning_rate": 9.94929714045957e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.1890746708959341, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 582 }, { "completion_length": 411.6808166503906, "epoch": 0.04552452594859003, "grad_norm": 0.10932003670862961, "kl": 0.0015177726745605469, "learning_rate": 9.94894866367898e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.08973132446408272, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 0.9977678656578064, "step": 584 }, { "completion_length": 426.5290298461914, "epoch": 0.045680431859372865, "grad_norm": 0.13656442563631924, "kl": 0.001651763916015625, "learning_rate": 9.948598999613952e-07, "loss": 0.0001, "reward": 1.7477679550647736, "reward_std": 0.13660170417279005, "rewards/accuracy_reward": 0.7500000447034836, "rewards/format_reward": 0.9977678656578064, "step": 586 }, { "completion_length": 429.3169822692871, "epoch": 0.04583633777015571, "grad_norm": 0.11437111424068483, "kl": 0.0016183853149414062, "learning_rate": 9.94824814834838e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.10092746932059526, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 588 }, { "completion_length": 433.8281440734863, "epoch": 0.04599224368093855, "grad_norm": 0.1375612242468201, "kl": 0.001583099365234375, "learning_rate": 9.947896109966429e-07, "loss": 0.0001, "reward": 1.81026791036129, "reward_std": 0.11565914005041122, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 0.9977678656578064, "step": 590 }, { "completion_length": 420.0647430419922, "epoch": 0.046148149591721394, "grad_norm": 0.115114125744193, "kl": 0.0014019012451171875, "learning_rate": 9.947542884552558e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.07289712596684694, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 592 }, { "completion_length": 434.51118087768555, "epoch": 0.046304055502504236, "grad_norm": 0.15214241425816424, "kl": 0.00154876708984375, "learning_rate": 9.947188472191506e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.14969984628260136, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 0.9977678656578064, "step": 594 }, { "completion_length": 425.9710006713867, "epoch": 0.04645996141328708, "grad_norm": 0.13138503274865748, "kl": 0.001522064208984375, "learning_rate": 9.946832872968302e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.15466304682195187, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 596 }, { "completion_length": 423.79466247558594, "epoch": 0.04661586732406992, "grad_norm": 0.13562243570651453, "kl": 0.0015153884887695312, "learning_rate": 9.946476086968255e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.13075948785990477, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 598 }, { "completion_length": 425.8102836608887, "epoch": 0.046771773234852765, "grad_norm": 0.1233374462574913, "kl": 0.0013909339904785156, "learning_rate": 9.946118114276959e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.07823488209396601, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 600 }, { "completion_length": 425.1540336608887, "epoch": 0.04692767914563561, "grad_norm": 0.14465314672159155, "kl": 0.0015163421630859375, "learning_rate": 9.945758954980294e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.13819836638867855, "rewards/accuracy_reward": 0.7321428954601288, "rewards/format_reward": 1.0, "step": 602 }, { "completion_length": 424.66966247558594, "epoch": 0.04708358505641845, "grad_norm": 0.08702725333330194, "kl": 0.00156402587890625, "learning_rate": 9.945398609164426e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.1019031684845686, "rewards/accuracy_reward": 0.7321428805589676, "rewards/format_reward": 1.0, "step": 604 }, { "completion_length": 427.2076110839844, "epoch": 0.04723949096720129, "grad_norm": 0.1312802056306068, "kl": 0.0014505386352539062, "learning_rate": 9.945037076915803e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.11415916495025158, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 606 }, { "completion_length": 422.82368087768555, "epoch": 0.047395396877984136, "grad_norm": 0.1827333556196822, "kl": 0.0017185211181640625, "learning_rate": 9.944674358321162e-07, "loss": 0.0001, "reward": 1.7366072237491608, "reward_std": 0.19050682336091995, "rewards/accuracy_reward": 0.7366071790456772, "rewards/format_reward": 1.0, "step": 608 }, { "completion_length": 423.7120704650879, "epoch": 0.04755130278876698, "grad_norm": 0.1434294226693114, "kl": 0.0017032623291015625, "learning_rate": 9.944310453467518e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.1117513058707118, "rewards/accuracy_reward": 0.7433036044239998, "rewards/format_reward": 1.0, "step": 610 }, { "completion_length": 420.1919860839844, "epoch": 0.04770720869954982, "grad_norm": 0.11889725886799532, "kl": 0.0014171600341796875, "learning_rate": 9.943945362442176e-07, "loss": 0.0001, "reward": 1.8683036714792252, "reward_std": 0.09040532540529966, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 612 }, { "completion_length": 430.9687690734863, "epoch": 0.047863114610332665, "grad_norm": 0.1346529946718946, "kl": 0.0015869140625, "learning_rate": 9.943579085332722e-07, "loss": 0.0001, "reward": 1.8035715371370316, "reward_std": 0.12828241009265184, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 614 }, { "completion_length": 429.3236846923828, "epoch": 0.04801902052111551, "grad_norm": 0.12153309370946885, "kl": 0.0015153884887695312, "learning_rate": 9.94321162222703e-07, "loss": 0.0001, "reward": 1.845982238650322, "reward_std": 0.0873234597966075, "rewards/accuracy_reward": 0.845982164144516, "rewards/format_reward": 1.0, "step": 616 }, { "completion_length": 423.5759162902832, "epoch": 0.04817492643189835, "grad_norm": 0.13132975165426028, "kl": 0.001560211181640625, "learning_rate": 9.942842973213256e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.1449682842940092, "rewards/accuracy_reward": 0.7812500447034836, "rewards/format_reward": 0.9955357313156128, "step": 618 }, { "completion_length": 403.2500190734863, "epoch": 0.04833083234268119, "grad_norm": 0.12727927097047448, "kl": 0.0014390945434570312, "learning_rate": 9.94247313837984e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.06643030606210232, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 620 }, { "completion_length": 420.88171005249023, "epoch": 0.048486738253464036, "grad_norm": 0.13364746760303114, "kl": 0.0014867782592773438, "learning_rate": 9.94210211781551e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.1306916680186987, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 622 }, { "completion_length": 419.0000190734863, "epoch": 0.04864264416424688, "grad_norm": 0.1565206006988708, "kl": 0.0015497207641601562, "learning_rate": 9.941729911609275e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.13106117770075798, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 624 }, { "completion_length": 430.1919822692871, "epoch": 0.04879855007502972, "grad_norm": 0.1691974561502761, "kl": 0.0014300346374511719, "learning_rate": 9.94135651985043e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.1160582285374403, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 0.9977678656578064, "step": 626 }, { "completion_length": 419.1674270629883, "epoch": 0.048954455985812564, "grad_norm": 0.13884981200889615, "kl": 0.0016565322875976562, "learning_rate": 9.940981942628554e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.10596072766929865, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 628 }, { "completion_length": 420.18751525878906, "epoch": 0.04911036189659541, "grad_norm": 0.1360271274282648, "kl": 0.0014085769653320312, "learning_rate": 9.940606180033509e-07, "loss": 0.0001, "reward": 1.861607238650322, "reward_std": 0.09363691601902246, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 630 }, { "completion_length": 418.80359268188477, "epoch": 0.04926626780737825, "grad_norm": 0.08675457942252837, "kl": 0.0013575553894042969, "learning_rate": 9.940229232155447e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.07499964907765388, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 632 }, { "completion_length": 416.64957427978516, "epoch": 0.04942217371816109, "grad_norm": 0.14583144864559644, "kl": 0.0015964508056640625, "learning_rate": 9.939851099084795e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.1297837859019637, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 634 }, { "completion_length": 425.29243087768555, "epoch": 0.049578079628943936, "grad_norm": 0.10688628978867518, "kl": 0.0016651153564453125, "learning_rate": 9.93947178091227e-07, "loss": 0.0001, "reward": 1.7120536416769028, "reward_std": 0.10723952203989029, "rewards/accuracy_reward": 0.7120535895228386, "rewards/format_reward": 1.0, "step": 636 }, { "completion_length": 413.89734268188477, "epoch": 0.04973398553972677, "grad_norm": 0.10256775328796804, "kl": 0.001529693603515625, "learning_rate": 9.93909127772888e-07, "loss": 0.0001, "reward": 1.8125001043081284, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 638 }, { "completion_length": 416.7343978881836, "epoch": 0.049889891450509614, "grad_norm": 0.15436750049066683, "kl": 0.0015897750854492188, "learning_rate": 9.938709589625903e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.14060762617737055, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 640 }, { "completion_length": 424.8348388671875, "epoch": 0.05004579736129246, "grad_norm": 0.1418958227870209, "kl": 0.0015878677368164062, "learning_rate": 9.938326716694908e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.09572831075638533, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 0.9977678656578064, "step": 642 }, { "completion_length": 421.02457427978516, "epoch": 0.0502017032720753, "grad_norm": 0.11265451615943482, "kl": 0.0015268325805664062, "learning_rate": 9.937942659027753e-07, "loss": 0.0001, "reward": 1.845982238650322, "reward_std": 0.11092757247388363, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 644 }, { "completion_length": 424.1428756713867, "epoch": 0.05035760918285814, "grad_norm": 0.15289694984215418, "kl": 0.0016374588012695312, "learning_rate": 9.937557416716573e-07, "loss": 0.0001, "reward": 1.7388393729925156, "reward_std": 0.11565914191305637, "rewards/accuracy_reward": 0.7388393059372902, "rewards/format_reward": 1.0, "step": 646 }, { "completion_length": 408.0982322692871, "epoch": 0.050513515093640986, "grad_norm": 0.13642798304135068, "kl": 0.0014781951904296875, "learning_rate": 9.937170989853794e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.07079096138477325, "rewards/accuracy_reward": 0.8459821939468384, "rewards/format_reward": 1.0, "step": 648 }, { "completion_length": 428.5268020629883, "epoch": 0.05066942100442383, "grad_norm": 0.13052219956526068, "kl": 0.0017490386962890625, "learning_rate": 9.936783378532116e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.13429416622966528, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 0.9977678656578064, "step": 650 }, { "completion_length": 413.73216247558594, "epoch": 0.05082532691520667, "grad_norm": 0.1400266135725097, "kl": 0.0015854835510253906, "learning_rate": 9.936394582844532e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.1142136100679636, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 0.9977678656578064, "step": 652 }, { "completion_length": 415.7589454650879, "epoch": 0.050981232825989514, "grad_norm": 0.17746998574036568, "kl": 0.001708984375, "learning_rate": 9.93600460288432e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.10851971805095673, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 1.0, "step": 654 }, { "completion_length": 404.2835006713867, "epoch": 0.05113713873677236, "grad_norm": 0.09306860562552624, "kl": 0.0013971328735351562, "learning_rate": 9.935613438745035e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.05764481611549854, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 656 }, { "completion_length": 411.7120666503906, "epoch": 0.0512930446475552, "grad_norm": 0.13963454038998074, "kl": 0.0016002655029296875, "learning_rate": 9.93522109052052e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.11550577450543642, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.9977678656578064, "step": 658 }, { "completion_length": 434.0424270629883, "epoch": 0.05144895055833804, "grad_norm": 0.14793377701102997, "kl": 0.0016956329345703125, "learning_rate": 9.934827558304903e-07, "loss": 0.0001, "reward": 1.7299108058214188, "reward_std": 0.09394001122564077, "rewards/accuracy_reward": 0.7299107499420643, "rewards/format_reward": 1.0, "step": 660 }, { "completion_length": 417.68528747558594, "epoch": 0.051604856469120886, "grad_norm": 0.1013127922155989, "kl": 0.0015163421630859375, "learning_rate": 9.934432842192595e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.11040470097213984, "rewards/accuracy_reward": 0.7633928805589676, "rewards/format_reward": 0.9955357313156128, "step": 662 }, { "completion_length": 418.4419860839844, "epoch": 0.05176076237990373, "grad_norm": 0.12428642116564076, "kl": 0.00147247314453125, "learning_rate": 9.934036942278291e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.12324914988130331, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 664 }, { "completion_length": 418.1585006713867, "epoch": 0.05191666829068657, "grad_norm": 0.09608518071806739, "kl": 0.001495361328125, "learning_rate": 9.933639858656969e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.10693782847374678, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 666 }, { "completion_length": 420.03349685668945, "epoch": 0.052072574201469414, "grad_norm": 0.1449688713418388, "kl": 0.0016946792602539062, "learning_rate": 9.933241591423892e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.11077561043202877, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 668 }, { "completion_length": 417.65627670288086, "epoch": 0.05222848011225226, "grad_norm": 0.09854045453234914, "kl": 0.00160980224609375, "learning_rate": 9.932842140674607e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.08176956418901682, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 670 }, { "completion_length": 416.4107322692871, "epoch": 0.0523843860230351, "grad_norm": 0.13016124735194373, "kl": 0.0015954971313476562, "learning_rate": 9.932441506504946e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.0898846872150898, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 0.9977678656578064, "step": 672 }, { "completion_length": 436.22993087768555, "epoch": 0.05254029193381794, "grad_norm": 0.1457113560507982, "kl": 0.001621246337890625, "learning_rate": 9.93203968901102e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.13188490830361843, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 1.0, "step": 674 }, { "completion_length": 401.18751525878906, "epoch": 0.052696197844600785, "grad_norm": 0.08202647629711743, "kl": 0.0014829635620117188, "learning_rate": 9.931636688289232e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.05343612376600504, "rewards/accuracy_reward": 0.8973214626312256, "rewards/format_reward": 1.0, "step": 676 }, { "completion_length": 437.4955520629883, "epoch": 0.05285210375538363, "grad_norm": 0.12583174546215847, "kl": 0.0017194747924804688, "learning_rate": 9.931232504436262e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.12847545091062784, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 0.9955357313156128, "step": 678 }, { "completion_length": 428.60269927978516, "epoch": 0.05300800966616647, "grad_norm": 0.13419526972124451, "kl": 0.0016527175903320312, "learning_rate": 9.930827137549077e-07, "loss": 0.0001, "reward": 1.785714402794838, "reward_std": 0.09671878069639206, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 680 }, { "completion_length": 423.03349685668945, "epoch": 0.053163915576949314, "grad_norm": 0.12844040139318205, "kl": 0.00168609619140625, "learning_rate": 9.930420587724927e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.07320022210478783, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 682 }, { "completion_length": 418.2276916503906, "epoch": 0.05331982148773216, "grad_norm": 0.12373629124741635, "kl": 0.0015583038330078125, "learning_rate": 9.930012855061345e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.128282411955297, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 0.9977678656578064, "step": 684 }, { "completion_length": 411.7076072692871, "epoch": 0.053475727398515, "grad_norm": 0.16418999807950302, "kl": 0.0014467239379882812, "learning_rate": 9.929603939656153e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.13038857374340296, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 1.0, "step": 686 }, { "completion_length": 417.4375190734863, "epoch": 0.053631633309297835, "grad_norm": 0.12212699070598523, "kl": 0.0013608932495117188, "learning_rate": 9.929193841607447e-07, "loss": 0.0001, "reward": 1.8772322237491608, "reward_std": 0.07192142587155104, "rewards/accuracy_reward": 0.8772321864962578, "rewards/format_reward": 1.0, "step": 688 }, { "completion_length": 427.955379486084, "epoch": 0.05378753922008068, "grad_norm": 0.15829165433042072, "kl": 0.0016002655029296875, "learning_rate": 9.928782561013614e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.1370051223784685, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 690 }, { "completion_length": 432.0669860839844, "epoch": 0.05394344513086352, "grad_norm": 0.1615618862638251, "kl": 0.0017185211181640625, "learning_rate": 9.928370097973322e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.10528672486543655, "rewards/accuracy_reward": 0.7276786044239998, "rewards/format_reward": 1.0, "step": 692 }, { "completion_length": 436.16296768188477, "epoch": 0.054099351041646364, "grad_norm": 0.12117736691027446, "kl": 0.0017881393432617188, "learning_rate": 9.927956452585526e-07, "loss": 0.0001, "reward": 1.82366082072258, "reward_std": 0.13474922440946102, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 694 }, { "completion_length": 423.8750190734863, "epoch": 0.05425525695242921, "grad_norm": 0.15719578611679388, "kl": 0.001697540283203125, "learning_rate": 9.927541624949461e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.10108083859086037, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 696 }, { "completion_length": 419.4152030944824, "epoch": 0.05441116286321205, "grad_norm": 0.09093397880145353, "kl": 0.0014562606811523438, "learning_rate": 9.927125615164643e-07, "loss": 0.0001, "reward": 1.8973215073347092, "reward_std": 0.08777992334216833, "rewards/accuracy_reward": 0.8973214700818062, "rewards/format_reward": 1.0, "step": 698 }, { "completion_length": 413.1250190734863, "epoch": 0.05456706877399489, "grad_norm": 0.0662439226540212, "kl": 0.0015773773193359375, "learning_rate": 9.926708423330882e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.0586205143481493, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 700 }, { "completion_length": 417.1317138671875, "epoch": 0.054722974684777735, "grad_norm": 0.1505890452506428, "kl": 0.001613616943359375, "learning_rate": 9.926290049548264e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.13874037470668554, "rewards/accuracy_reward": 0.7544643059372902, "rewards/format_reward": 1.0, "step": 702 }, { "completion_length": 421.9486770629883, "epoch": 0.05487888059556058, "grad_norm": 0.09184623457176744, "kl": 0.0016536712646484375, "learning_rate": 9.925870493917154e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.06560797337442636, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 1.0, "step": 704 }, { "completion_length": 428.17189025878906, "epoch": 0.05503478650634342, "grad_norm": 0.09975716189981397, "kl": 0.0017957687377929688, "learning_rate": 9.92544975653821e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.08357123378664255, "rewards/accuracy_reward": 0.7790179029107094, "rewards/format_reward": 1.0, "step": 706 }, { "completion_length": 419.6049270629883, "epoch": 0.055190692417126264, "grad_norm": 0.13269318072206904, "kl": 0.0016126632690429688, "learning_rate": 9.92502783751237e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.1269888449460268, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 0.9977678656578064, "step": 708 }, { "completion_length": 418.2812728881836, "epoch": 0.055346598327909106, "grad_norm": 0.1487743538040222, "kl": 0.0018520355224609375, "learning_rate": 9.924604736940854e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.11303010396659374, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 710 }, { "completion_length": 408.21430587768555, "epoch": 0.05550250423869195, "grad_norm": 0.13277273091148697, "kl": 0.0015516281127929688, "learning_rate": 9.924180454925166e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.07612871378660202, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 712 }, { "completion_length": 425.3035888671875, "epoch": 0.05565841014947479, "grad_norm": 0.14183662700533387, "kl": 0.0016355514526367188, "learning_rate": 9.923754991567095e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.13737603276968002, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 714 }, { "completion_length": 428.40180587768555, "epoch": 0.055814316060257635, "grad_norm": 0.13920755088202885, "kl": 0.0017366409301757812, "learning_rate": 9.923328346968713e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.13865622505545616, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 1.0, "step": 716 }, { "completion_length": 424.3995666503906, "epoch": 0.05597022197104048, "grad_norm": 0.10883047678998069, "kl": 0.0018520355224609375, "learning_rate": 9.922900521232373e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.09431092441082001, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 718 }, { "completion_length": 427.44868087768555, "epoch": 0.05612612788182332, "grad_norm": 0.13022641593660716, "kl": 0.001708984375, "learning_rate": 9.922471514460715e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.11072187591344118, "rewards/accuracy_reward": 0.7254464589059353, "rewards/format_reward": 0.9977678656578064, "step": 720 }, { "completion_length": 409.48216247558594, "epoch": 0.05628203379260616, "grad_norm": 0.12675814698531368, "kl": 0.0016002655029296875, "learning_rate": 9.922041326756659e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.11370634566992521, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 722 }, { "completion_length": 421.96207427978516, "epoch": 0.056437939703389006, "grad_norm": 0.11408478899137955, "kl": 0.0015621185302734375, "learning_rate": 9.92160995822341e-07, "loss": 0.0001, "reward": 1.8459822535514832, "reward_std": 0.09897327143698931, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 0.9977678656578064, "step": 724 }, { "completion_length": 424.2321586608887, "epoch": 0.05659384561417185, "grad_norm": 0.09999636554575139, "kl": 0.0016965866088867188, "learning_rate": 9.921177408964456e-07, "loss": 0.0001, "reward": 1.7187500596046448, "reward_std": 0.07582561951130629, "rewards/accuracy_reward": 0.718750037252903, "rewards/format_reward": 1.0, "step": 726 }, { "completion_length": 421.6317138671875, "epoch": 0.05674975152495469, "grad_norm": 0.09142047976825779, "kl": 0.0015878677368164062, "learning_rate": 9.92074367908357e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.10919371899217367, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 728 }, { "completion_length": 409.89287185668945, "epoch": 0.056905657435737535, "grad_norm": 0.1423520518240581, "kl": 0.0015869140625, "learning_rate": 9.920308768684802e-07, "loss": 0.0001, "reward": 1.7120536714792252, "reward_std": 0.12602791655808687, "rewards/accuracy_reward": 0.7120535969734192, "rewards/format_reward": 1.0, "step": 730 }, { "completion_length": 418.9285888671875, "epoch": 0.05706156334652038, "grad_norm": 0.1131509562863168, "kl": 0.0017194747924804688, "learning_rate": 9.919872677872496e-07, "loss": 0.0001, "reward": 1.7165179550647736, "reward_std": 0.10882505215704441, "rewards/accuracy_reward": 0.7165178880095482, "rewards/format_reward": 1.0, "step": 732 }, { "completion_length": 417.8348388671875, "epoch": 0.05721746925730322, "grad_norm": 0.13455853846307836, "kl": 0.0015954971313476562, "learning_rate": 9.91943540675127e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.11535240616649389, "rewards/accuracy_reward": 0.7656250521540642, "rewards/format_reward": 1.0, "step": 734 }, { "completion_length": 430.66743087768555, "epoch": 0.05737337516808606, "grad_norm": 0.0911173898270824, "kl": 0.0017080307006835938, "learning_rate": 9.918996955426026e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.12874027527868748, "rewards/accuracy_reward": 0.754464328289032, "rewards/format_reward": 1.0, "step": 736 }, { "completion_length": 429.0870704650879, "epoch": 0.057529281078868906, "grad_norm": 0.14213915646490802, "kl": 0.0019102096557617188, "learning_rate": 9.918557324001955e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.14963062852621078, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 1.0, "step": 738 }, { "completion_length": 429.1741256713867, "epoch": 0.05768518698965174, "grad_norm": 0.1738411339005668, "kl": 0.001972198486328125, "learning_rate": 9.918116512584524e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.12556032557040453, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 0.9977678656578064, "step": 740 }, { "completion_length": 415.8125190734863, "epoch": 0.057841092900434585, "grad_norm": 0.14095651898445424, "kl": 0.001678466796875, "learning_rate": 9.91767452127949e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.11114512104541063, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 742 }, { "completion_length": 425.7388572692871, "epoch": 0.05799699881121743, "grad_norm": 0.1475555519181595, "kl": 0.0017805099487304688, "learning_rate": 9.917231350192888e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.12670052330940962, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 744 }, { "completion_length": 425.6875228881836, "epoch": 0.05815290472200027, "grad_norm": 0.13522479780356608, "kl": 0.0017070770263671875, "learning_rate": 9.916786999431037e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.11858763732016087, "rewards/accuracy_reward": 0.8258929029107094, "rewards/format_reward": 0.9977678656578064, "step": 746 }, { "completion_length": 413.63841247558594, "epoch": 0.05830881063278311, "grad_norm": 0.1276373616184393, "kl": 0.001766204833984375, "learning_rate": 9.91634146910054e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.12490249425172806, "rewards/accuracy_reward": 0.7165178880095482, "rewards/format_reward": 1.0, "step": 748 }, { "completion_length": 421.486629486084, "epoch": 0.058464716543565956, "grad_norm": 0.1409660616208819, "kl": 0.0015716552734375, "learning_rate": 9.915894759308282e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.1553356535732746, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 1.0, "step": 750 }, { "completion_length": 423.5647506713867, "epoch": 0.0586206224543488, "grad_norm": 0.14140009876441126, "kl": 0.0016603469848632812, "learning_rate": 9.915446870161433e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.10693923011422157, "rewards/accuracy_reward": 0.7767857611179352, "rewards/format_reward": 1.0, "step": 752 }, { "completion_length": 419.71207427978516, "epoch": 0.05877652836513164, "grad_norm": 0.1083422463212501, "kl": 0.001678466796875, "learning_rate": 9.914997801767442e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.08582712337374687, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 754 }, { "completion_length": 418.25894927978516, "epoch": 0.058932434275914485, "grad_norm": 0.14607835021291932, "kl": 0.00164794921875, "learning_rate": 9.914547554234045e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.09311767760664225, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 0.9977678656578064, "step": 756 }, { "completion_length": 424.7544822692871, "epoch": 0.05908834018669733, "grad_norm": 0.12949938085240667, "kl": 0.001644134521484375, "learning_rate": 9.91409612766926e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.11340100970119238, "rewards/accuracy_reward": 0.863839328289032, "rewards/format_reward": 1.0, "step": 758 }, { "completion_length": 410.7857360839844, "epoch": 0.05924424609748017, "grad_norm": 0.12168863001687365, "kl": 0.0017719268798828125, "learning_rate": 9.913643522181387e-07, "loss": 0.0001, "reward": 1.756696492433548, "reward_std": 0.11709130182862282, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 760 }, { "completion_length": 414.4352836608887, "epoch": 0.05940015200826301, "grad_norm": 0.12324423895453263, "kl": 0.0017290115356445312, "learning_rate": 9.913189737879008e-07, "loss": 0.0001, "reward": 1.8325893878936768, "reward_std": 0.11272840853780508, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 762 }, { "completion_length": 417.12278747558594, "epoch": 0.059556057919045856, "grad_norm": 0.12073977465737341, "kl": 0.0017900466918945312, "learning_rate": 9.912734774870988e-07, "loss": 0.0001, "reward": 1.8526786714792252, "reward_std": 0.10971155762672424, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 764 }, { "completion_length": 412.2455520629883, "epoch": 0.0597119638298287, "grad_norm": 0.11039013692424902, "kl": 0.0016326904296875, "learning_rate": 9.912278633266478e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.09040672797709703, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 766 }, { "completion_length": 417.44421005249023, "epoch": 0.05986786974061154, "grad_norm": 0.10674944489468258, "kl": 0.0016374588012695312, "learning_rate": 9.911821313174906e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.07048926688730717, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 768 }, { "completion_length": 415.75001525878906, "epoch": 0.060023775651394384, "grad_norm": 0.14722040334937517, "kl": 0.0016889572143554688, "learning_rate": 9.911362814705987e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.10138253029435873, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 770 }, { "completion_length": 419.8169860839844, "epoch": 0.06017968156217723, "grad_norm": 0.12771851203208112, "kl": 0.001941680908203125, "learning_rate": 9.91090313796972e-07, "loss": 0.0001, "reward": 1.7299107760190964, "reward_std": 0.119564738124609, "rewards/accuracy_reward": 0.7321428954601288, "rewards/format_reward": 0.9977678656578064, "step": 772 }, { "completion_length": 433.2143096923828, "epoch": 0.06033558747296007, "grad_norm": 0.12828588502916424, "kl": 0.0016546249389648438, "learning_rate": 9.910442283076386e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.10025346744805574, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 774 }, { "completion_length": 423.8973388671875, "epoch": 0.06049149338374291, "grad_norm": 0.1319318810169274, "kl": 0.001598358154296875, "learning_rate": 9.90998025013654e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.10498503316193819, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 776 }, { "completion_length": 420.9285888671875, "epoch": 0.060647399294525756, "grad_norm": 0.12308011044628898, "kl": 0.0019073486328125, "learning_rate": 9.909517039261032e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.091381024569273, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 0.9977678656578064, "step": 778 }, { "completion_length": 412.18528747558594, "epoch": 0.0608033052053086, "grad_norm": 0.10107213341044373, "kl": 0.0014896392822265625, "learning_rate": 9.909052650560988e-07, "loss": 0.0001, "reward": 1.8593750894069672, "reward_std": 0.05538892187178135, "rewards/accuracy_reward": 0.8593750298023224, "rewards/format_reward": 1.0, "step": 780 }, { "completion_length": 418.9151954650879, "epoch": 0.06095921111609144, "grad_norm": 0.12462211276190541, "kl": 0.0017137527465820312, "learning_rate": 9.908587084147817e-07, "loss": 0.0001, "reward": 1.7745536267757416, "reward_std": 0.07875552028417587, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 0.9977678656578064, "step": 782 }, { "completion_length": 428.9375190734863, "epoch": 0.061115117026874284, "grad_norm": 0.13030695576617088, "kl": 0.0018291473388671875, "learning_rate": 9.908120340133214e-07, "loss": 0.0001, "reward": 1.7254465073347092, "reward_std": 0.1188921332359314, "rewards/accuracy_reward": 0.7276786044239998, "rewards/format_reward": 0.9977678656578064, "step": 784 }, { "completion_length": 411.5580520629883, "epoch": 0.06127102293765713, "grad_norm": 0.11704961653556849, "kl": 0.0017499923706054688, "learning_rate": 9.90765241862915e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.08567375876009464, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 786 }, { "completion_length": 420.5357360839844, "epoch": 0.06142692884843997, "grad_norm": 0.1440289243537967, "kl": 0.0018291473388671875, "learning_rate": 9.907183319747884e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.11956473346799612, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 788 }, { "completion_length": 449.0468978881836, "epoch": 0.061582834759222806, "grad_norm": 0.11203454813079652, "kl": 0.0019807815551757812, "learning_rate": 9.906713043601957e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.10106606595218182, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 0.9977678656578064, "step": 790 }, { "completion_length": 417.3638572692871, "epoch": 0.06173874067000565, "grad_norm": 0.09741623519137002, "kl": 0.001617431640625, "learning_rate": 9.906241590304192e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.09318045433610678, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 792 }, { "completion_length": 432.57591247558594, "epoch": 0.06189464658078849, "grad_norm": 0.1452132748373019, "kl": 0.0019073486328125, "learning_rate": 9.90576895996769e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.11340100970119238, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 794 }, { "completion_length": 422.0044822692871, "epoch": 0.062050552491571334, "grad_norm": 0.11479247056341227, "kl": 0.001697540283203125, "learning_rate": 9.90529515270584e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.10399456229060888, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 0.9977678656578064, "step": 796 }, { "completion_length": 412.8013610839844, "epoch": 0.06220645840235418, "grad_norm": 0.15419064071917352, "kl": 0.0017480850219726562, "learning_rate": 9.904820168632313e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.10626382380723953, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 0.9977678656578064, "step": 798 }, { "completion_length": 418.35493087768555, "epoch": 0.06236236431313702, "grad_norm": 0.08461682753565371, "kl": 0.0017404556274414062, "learning_rate": 9.904344007861058e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.10385596193373203, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 800 }, { "completion_length": 416.8415336608887, "epoch": 0.06251827022391987, "grad_norm": 0.10828468142693938, "kl": 0.0017404556274414062, "learning_rate": 9.90386667050631e-07, "loss": 0.0001, "reward": 1.8482143878936768, "reward_std": 0.06996862776577473, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 802 }, { "completion_length": 410.8683204650879, "epoch": 0.0626741761347027, "grad_norm": 0.13119377012482308, "kl": 0.0016260147094726562, "learning_rate": 9.903388156682585e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.10724092554301023, "rewards/accuracy_reward": 0.8437500447034836, "rewards/format_reward": 1.0, "step": 804 }, { "completion_length": 417.4709930419922, "epoch": 0.06283008204548556, "grad_norm": 0.12606880805677959, "kl": 0.0019397735595703125, "learning_rate": 9.902908466504685e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.11843707412481308, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 806 }, { "completion_length": 436.30359268188477, "epoch": 0.06298598795626839, "grad_norm": 0.1345411996545038, "kl": 0.0019960403442382812, "learning_rate": 9.902427600087684e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.1345316879451275, "rewards/accuracy_reward": 0.7522321864962578, "rewards/format_reward": 1.0, "step": 808 }, { "completion_length": 401.52233505249023, "epoch": 0.06314189386705124, "grad_norm": 0.10133198259454688, "kl": 0.00167083740234375, "learning_rate": 9.90194555754695e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.08244357071816921, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 810 }, { "completion_length": 418.7098388671875, "epoch": 0.06329779977783408, "grad_norm": 0.14854349076545276, "kl": 0.0020771026611328125, "learning_rate": 9.901462338998128e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.10949541162699461, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 812 }, { "completion_length": 427.4352798461914, "epoch": 0.06345370568861693, "grad_norm": 0.1085636414614204, "kl": 0.0019207000732421875, "learning_rate": 9.900977944557142e-07, "loss": 0.0001, "reward": 1.7142857760190964, "reward_std": 0.11940997187048197, "rewards/accuracy_reward": 0.7142857536673546, "rewards/format_reward": 1.0, "step": 814 }, { "completion_length": 426.42858505249023, "epoch": 0.06360961159939976, "grad_norm": 0.0998854037583272, "kl": 0.00206756591796875, "learning_rate": 9.900492374340206e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.06138591095805168, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 0.9977678656578064, "step": 816 }, { "completion_length": 421.31921768188477, "epoch": 0.0637655175101826, "grad_norm": 0.12222189515029847, "kl": 0.0016384124755859375, "learning_rate": 9.900005628463807e-07, "loss": 0.0001, "reward": 1.82589291036129, "reward_std": 0.07725777849555016, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 818 }, { "completion_length": 423.0357322692871, "epoch": 0.06392142342096545, "grad_norm": 0.1183801838185157, "kl": 0.0017557144165039062, "learning_rate": 9.89951770704472e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.08732346352189779, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 820 }, { "completion_length": 406.18528747558594, "epoch": 0.06407732933174828, "grad_norm": 0.12700542654179625, "kl": 0.001918792724609375, "learning_rate": 9.899028610200003e-07, "loss": 0.0001, "reward": 1.6607143729925156, "reward_std": 0.09851680789142847, "rewards/accuracy_reward": 0.6607143133878708, "rewards/format_reward": 1.0, "step": 822 }, { "completion_length": 431.3214454650879, "epoch": 0.06423323524253113, "grad_norm": 0.1266325378126998, "kl": 0.001941680908203125, "learning_rate": 9.898538338046988e-07, "loss": 0.0001, "reward": 1.6919643729925156, "reward_std": 0.08116337563842535, "rewards/accuracy_reward": 0.6919643059372902, "rewards/format_reward": 1.0, "step": 824 }, { "completion_length": 422.7634086608887, "epoch": 0.06438914115331397, "grad_norm": 0.10671968904210771, "kl": 0.0019159317016601562, "learning_rate": 9.898046890703298e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.1061104591935873, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 826 }, { "completion_length": 419.24555587768555, "epoch": 0.06454504706409682, "grad_norm": 0.10188603846421306, "kl": 0.0019397735595703125, "learning_rate": 9.897554268286836e-07, "loss": 0.0001, "reward": 1.7388393878936768, "reward_std": 0.10528812557458878, "rewards/accuracy_reward": 0.7388393133878708, "rewards/format_reward": 1.0, "step": 828 }, { "completion_length": 418.3973388671875, "epoch": 0.06470095297487966, "grad_norm": 0.11550538652058452, "kl": 0.0017728805541992188, "learning_rate": 9.89706047091578e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.14605624601244926, "rewards/accuracy_reward": 0.8303571715950966, "rewards/format_reward": 0.9977678656578064, "step": 830 }, { "completion_length": 415.0357360839844, "epoch": 0.0648568588856625, "grad_norm": 0.11198963157549986, "kl": 0.0020761489868164062, "learning_rate": 9.8965654987086e-07, "loss": 0.0001, "reward": 1.7455358058214188, "reward_std": 0.0747821070253849, "rewards/accuracy_reward": 0.7455357387661934, "rewards/format_reward": 1.0, "step": 832 }, { "completion_length": 408.09376525878906, "epoch": 0.06501276479644534, "grad_norm": 0.07997471358868016, "kl": 0.0015649795532226562, "learning_rate": 9.896069351784043e-07, "loss": 0.0001, "reward": 1.8794643580913544, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.8794643208384514, "rewards/format_reward": 1.0, "step": 834 }, { "completion_length": 420.52680587768555, "epoch": 0.06516867070722819, "grad_norm": 0.1314163442136313, "kl": 0.0021648406982421875, "learning_rate": 9.895572030261135e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.11422698199748993, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 836 }, { "completion_length": 414.89734268188477, "epoch": 0.06532457661801103, "grad_norm": 0.0970782079692493, "kl": 0.0017213821411132812, "learning_rate": 9.895073534259186e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.09122905787080526, "rewards/accuracy_reward": 0.810267873108387, "rewards/format_reward": 1.0, "step": 838 }, { "completion_length": 407.7076110839844, "epoch": 0.06548048252879388, "grad_norm": 0.14641134164061417, "kl": 0.0015683174133300781, "learning_rate": 9.894573863897792e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.07905721105635166, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 840 }, { "completion_length": 426.7276954650879, "epoch": 0.06563638843957671, "grad_norm": 0.10040865296227702, "kl": 0.0018873214721679688, "learning_rate": 9.894073019296825e-07, "loss": 0.0001, "reward": 1.7299107909202576, "reward_std": 0.06041994597762823, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 1.0, "step": 842 }, { "completion_length": 421.87724685668945, "epoch": 0.06579229435035956, "grad_norm": 0.09379477196841839, "kl": 0.0017557144165039062, "learning_rate": 9.89357100057644e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.08198570925742388, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 844 }, { "completion_length": 417.86832427978516, "epoch": 0.0659482002611424, "grad_norm": 0.11522575407172835, "kl": 0.001739501953125, "learning_rate": 9.893067807857076e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.07777981925755739, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 846 }, { "completion_length": 420.9732322692871, "epoch": 0.06610410617192525, "grad_norm": 0.13359596026315326, "kl": 0.0020380020141601562, "learning_rate": 9.892563441259454e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.11415916122496128, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 848 }, { "completion_length": 419.2210006713867, "epoch": 0.06626001208270808, "grad_norm": 0.15179223921024473, "kl": 0.002166748046875, "learning_rate": 9.89205790090457e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.1130301021039486, "rewards/accuracy_reward": 0.7366071790456772, "rewards/format_reward": 0.9977678656578064, "step": 850 }, { "completion_length": 422.354923248291, "epoch": 0.06641591799349093, "grad_norm": 0.12510529596401027, "kl": 0.0017671585083007812, "learning_rate": 9.89155118691371e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.09220475610345602, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 852 }, { "completion_length": 413.05805587768555, "epoch": 0.06657182390427377, "grad_norm": 0.09906878393782281, "kl": 0.0017728805541992188, "learning_rate": 9.891043299408435e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.09281318448483944, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 854 }, { "completion_length": 417.5960006713867, "epoch": 0.06672772981505662, "grad_norm": 0.12320829097335113, "kl": 0.0016317367553710938, "learning_rate": 9.890534238510592e-07, "loss": 0.0001, "reward": 1.8660715222358704, "reward_std": 0.08695618901401758, "rewards/accuracy_reward": 0.866071455180645, "rewards/format_reward": 1.0, "step": 856 }, { "completion_length": 414.4732322692871, "epoch": 0.06688363572583945, "grad_norm": 0.0948386390376352, "kl": 0.0017766952514648438, "learning_rate": 9.89002400434231e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.0529810655862093, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 858 }, { "completion_length": 422.6942138671875, "epoch": 0.0670395416366223, "grad_norm": 0.09759264752594449, "kl": 0.0017452239990234375, "learning_rate": 9.889512597025995e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.10092606954276562, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 860 }, { "completion_length": 421.86609268188477, "epoch": 0.06719544754740514, "grad_norm": 0.11650596401592868, "kl": 0.0016202926635742188, "learning_rate": 9.889000016684338e-07, "loss": 0.0001, "reward": 1.9017857760190964, "reward_std": 0.09491570945829153, "rewards/accuracy_reward": 0.9017857387661934, "rewards/format_reward": 1.0, "step": 862 }, { "completion_length": 428.00894927978516, "epoch": 0.06735135345818799, "grad_norm": 0.14754742569234888, "kl": 0.0018701553344726562, "learning_rate": 9.88848626344031e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.09379168413579464, "rewards/accuracy_reward": 0.8370536267757416, "rewards/format_reward": 1.0, "step": 864 }, { "completion_length": 413.92635345458984, "epoch": 0.06750725936897083, "grad_norm": 0.11405604974819812, "kl": 0.0015468597412109375, "learning_rate": 9.887971337417163e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.08634776342660189, "rewards/accuracy_reward": 0.7857143059372902, "rewards/format_reward": 1.0, "step": 866 }, { "completion_length": 416.97769927978516, "epoch": 0.06766316527975366, "grad_norm": 0.10692869728625168, "kl": 0.001689910888671875, "learning_rate": 9.88745523873843e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.07146496511995792, "rewards/accuracy_reward": 0.8169643357396126, "rewards/format_reward": 1.0, "step": 868 }, { "completion_length": 426.0736770629883, "epoch": 0.06781907119053651, "grad_norm": 0.1295756881118104, "kl": 0.0018796920776367188, "learning_rate": 9.88693796752793e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.1219703583046794, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 0.9977678656578064, "step": 870 }, { "completion_length": 416.3169860839844, "epoch": 0.06797497710131935, "grad_norm": 0.14244734903662815, "kl": 0.0017976760864257812, "learning_rate": 9.886419523909759e-07, "loss": 0.0001, "reward": 1.8750000894069672, "reward_std": 0.07094572857022285, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 872 }, { "completion_length": 421.7366256713867, "epoch": 0.0681308830121022, "grad_norm": 0.11319821277226562, "kl": 0.0017805099487304688, "learning_rate": 9.88589990800829e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.13248969800770283, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 874 }, { "completion_length": 420.5625190734863, "epoch": 0.06828678892288503, "grad_norm": 0.13028220675223467, "kl": 0.0015630722045898438, "learning_rate": 9.885379119948187e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.06658366974443197, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 876 }, { "completion_length": 424.6250190734863, "epoch": 0.06844269483366788, "grad_norm": 0.0998795090620851, "kl": 0.0020351409912109375, "learning_rate": 9.88485715985439e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.07267957739531994, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 878 }, { "completion_length": 422.07368087768555, "epoch": 0.06859860074445072, "grad_norm": 0.11908554139300612, "kl": 0.0017251968383789062, "learning_rate": 9.88433402785212e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.05929452087730169, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 880 }, { "completion_length": 413.6986846923828, "epoch": 0.06875450665523357, "grad_norm": 0.1256557118779537, "kl": 0.0017032623291015625, "learning_rate": 9.883809724066878e-07, "loss": 0.0001, "reward": 1.8683036714792252, "reward_std": 0.08454833179712296, "rewards/accuracy_reward": 0.8683036118745804, "rewards/format_reward": 1.0, "step": 882 }, { "completion_length": 425.98439025878906, "epoch": 0.0689104125660164, "grad_norm": 0.11881091059721152, "kl": 0.001674652099609375, "learning_rate": 9.88328424862445e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.08927485905587673, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 884 }, { "completion_length": 433.1294860839844, "epoch": 0.06906631847679925, "grad_norm": 0.16007500055079352, "kl": 0.0018177032470703125, "learning_rate": 9.8827576016509e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.13030302338302135, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 886 }, { "completion_length": 438.85940170288086, "epoch": 0.06922222438758209, "grad_norm": 0.1111635680691402, "kl": 0.0020017623901367188, "learning_rate": 9.882229783272574e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.11257727723568678, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 0.9977678656578064, "step": 888 }, { "completion_length": 429.99332427978516, "epoch": 0.06937813029836494, "grad_norm": 0.09302395785348101, "kl": 0.0020227432250976562, "learning_rate": 9.8817007936161e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.07853797543793917, "rewards/accuracy_reward": 0.8325892984867096, "rewards/format_reward": 1.0, "step": 890 }, { "completion_length": 428.0781440734863, "epoch": 0.06953403620914778, "grad_norm": 0.13390579116438828, "kl": 0.0020780563354492188, "learning_rate": 9.881170632808385e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.14106268342584372, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 892 }, { "completion_length": 417.1071586608887, "epoch": 0.06968994211993063, "grad_norm": 0.13887036740976566, "kl": 0.0017518997192382812, "learning_rate": 9.880639300976616e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.1249984847381711, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 0.9977678656578064, "step": 894 }, { "completion_length": 407.64957427978516, "epoch": 0.06984584803071346, "grad_norm": 0.12629588623119137, "kl": 0.001766204833984375, "learning_rate": 9.880106798248267e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.10580736305564642, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 896 }, { "completion_length": 416.9330520629883, "epoch": 0.07000175394149631, "grad_norm": 0.0989055107247431, "kl": 0.0016870498657226562, "learning_rate": 9.879573124751087e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.06515011098235846, "rewards/accuracy_reward": 0.790178619325161, "rewards/format_reward": 1.0, "step": 898 }, { "completion_length": 426.0692138671875, "epoch": 0.07015765985227915, "grad_norm": 0.11223845197311431, "kl": 0.0022192001342773438, "learning_rate": 9.879038280613106e-07, "loss": 0.0001, "reward": 1.8191965371370316, "reward_std": 0.08957935404032469, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 900 }, { "completion_length": 415.7187690734863, "epoch": 0.070313565763062, "grad_norm": 0.15169244341838214, "kl": 0.0019741058349609375, "learning_rate": 9.878502265962637e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.14271238818764687, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 902 }, { "completion_length": 427.3214530944824, "epoch": 0.07046947167384483, "grad_norm": 0.08488245666168202, "kl": 0.0019483566284179688, "learning_rate": 9.877965080928277e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.05718835536390543, "rewards/accuracy_reward": 0.7299107387661934, "rewards/format_reward": 0.9977678656578064, "step": 904 }, { "completion_length": 411.5245704650879, "epoch": 0.07062537758462768, "grad_norm": 0.15347406326211127, "kl": 0.001758575439453125, "learning_rate": 9.877426725638898e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.12730671279132366, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 906 }, { "completion_length": 421.00671005249023, "epoch": 0.07078128349541052, "grad_norm": 0.0985665727833651, "kl": 0.0016460418701171875, "learning_rate": 9.876887200223653e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.09039055276662111, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 0.9955357313156128, "step": 908 }, { "completion_length": 433.02680587768555, "epoch": 0.07093718940619337, "grad_norm": 0.07641318139328084, "kl": 0.0020532608032226562, "learning_rate": 9.87634650481198e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.06560797151178122, "rewards/accuracy_reward": 0.8392857313156128, "rewards/format_reward": 1.0, "step": 910 }, { "completion_length": 415.8460006713867, "epoch": 0.0710930953169762, "grad_norm": 0.1280849473799582, "kl": 0.00154876708984375, "learning_rate": 9.875804639533591e-07, "loss": 0.0001, "reward": 1.82366082072258, "reward_std": 0.0618521049618721, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 912 }, { "completion_length": 428.3147506713867, "epoch": 0.07124900122775905, "grad_norm": 0.1324919982887254, "kl": 0.0019855499267578125, "learning_rate": 9.87526160451849e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.11257728189229965, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 914 }, { "completion_length": 421.30135345458984, "epoch": 0.07140490713854189, "grad_norm": 0.11139835716930357, "kl": 0.0020008087158203125, "learning_rate": 9.874717399896948e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.0883005615323782, "rewards/accuracy_reward": 0.7165178954601288, "rewards/format_reward": 1.0, "step": 916 }, { "completion_length": 416.5067138671875, "epoch": 0.07156081304932473, "grad_norm": 0.11597273795888399, "kl": 0.0016231536865234375, "learning_rate": 9.874172025799527e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.11144821438938379, "rewards/accuracy_reward": 0.8437500447034836, "rewards/format_reward": 1.0, "step": 918 }, { "completion_length": 428.5602912902832, "epoch": 0.07171671896010758, "grad_norm": 0.15055100510817768, "kl": 0.0020465850830078125, "learning_rate": 9.873625482357063e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.14722276851534843, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 920 }, { "completion_length": 400.13841247558594, "epoch": 0.07187262487089041, "grad_norm": 0.13115002510513818, "kl": 0.0017299652099609375, "learning_rate": 9.87307776970068e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.09686710685491562, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 1.0, "step": 922 }, { "completion_length": 405.95984268188477, "epoch": 0.07202853078167326, "grad_norm": 0.09840573407088009, "kl": 0.00167083740234375, "learning_rate": 9.872528887961774e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.06883956491947174, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 924 }, { "completion_length": 415.9308166503906, "epoch": 0.0721844366924561, "grad_norm": 0.10225901760886896, "kl": 0.0016727447509765625, "learning_rate": 9.871978837272027e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.06704013235867023, "rewards/accuracy_reward": 0.8392857313156128, "rewards/format_reward": 1.0, "step": 926 }, { "completion_length": 442.0111885070801, "epoch": 0.07234034260323895, "grad_norm": 0.1542417322351337, "kl": 0.0021114349365234375, "learning_rate": 9.871427617763394e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.12279632687568665, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 0.9977678656578064, "step": 928 }, { "completion_length": 409.5446586608887, "epoch": 0.07249624851402178, "grad_norm": 0.12990518462171424, "kl": 0.0015811920166015625, "learning_rate": 9.870875229568125e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.08567376248538494, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 1.0, "step": 930 }, { "completion_length": 432.3169860839844, "epoch": 0.07265215442480463, "grad_norm": 0.08929124758464052, "kl": 0.0019464492797851562, "learning_rate": 9.870321672818737e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.07355140335857868, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 0.9977678656578064, "step": 932 }, { "completion_length": 426.8326110839844, "epoch": 0.07280806033558747, "grad_norm": 0.10103630541469641, "kl": 0.0018167495727539062, "learning_rate": 9.86976694764803e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.08116337843239307, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 934 }, { "completion_length": 410.6518096923828, "epoch": 0.07296396624637032, "grad_norm": 0.10138227557048177, "kl": 0.001804351806640625, "learning_rate": 9.869211054189089e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.1167873702943325, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 936 }, { "completion_length": 412.2768020629883, "epoch": 0.07311987215715315, "grad_norm": 0.10543964774676053, "kl": 0.0015687942504882812, "learning_rate": 9.868653992575276e-07, "loss": 0.0001, "reward": 1.709821492433548, "reward_std": 0.09656541515141726, "rewards/accuracy_reward": 0.7098214700818062, "rewards/format_reward": 1.0, "step": 938 }, { "completion_length": 428.3348388671875, "epoch": 0.073275778067936, "grad_norm": 0.16090442937211508, "kl": 0.001964569091796875, "learning_rate": 9.868095762940232e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.13496817089617252, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 940 }, { "completion_length": 420.25671768188477, "epoch": 0.07343168397871884, "grad_norm": 0.13923599001152517, "kl": 0.00201416015625, "learning_rate": 9.86753636541788e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.12663270719349384, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 942 }, { "completion_length": 429.04913330078125, "epoch": 0.07358758988950169, "grad_norm": 0.125199028040221, "kl": 0.0017681121826171875, "learning_rate": 9.866975800142428e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.07274375669658184, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 944 }, { "completion_length": 419.3683166503906, "epoch": 0.07374349580028453, "grad_norm": 0.10232066024504176, "kl": 0.001865386962890625, "learning_rate": 9.866414067248352e-07, "loss": 0.0001, "reward": 1.7209822237491608, "reward_std": 0.12249323260039091, "rewards/accuracy_reward": 0.7209821715950966, "rewards/format_reward": 1.0, "step": 946 }, { "completion_length": 423.151798248291, "epoch": 0.07389940171106738, "grad_norm": 0.14026494939842504, "kl": 0.0019931793212890625, "learning_rate": 9.86585116687042e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.12175141088664532, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 948 }, { "completion_length": 419.7210006713867, "epoch": 0.07405530762185021, "grad_norm": 0.1314437940884521, "kl": 0.0020914077758789062, "learning_rate": 9.865287099143673e-07, "loss": 0.0001, "reward": 1.7566965222358704, "reward_std": 0.12212232034653425, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 950 }, { "completion_length": 414.46430587768555, "epoch": 0.07421121353263306, "grad_norm": 0.12073420778018579, "kl": 0.0018568038940429688, "learning_rate": 9.864721864203436e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.09671878069639206, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 952 }, { "completion_length": 424.64511489868164, "epoch": 0.0743671194434159, "grad_norm": 0.08000421522688435, "kl": 0.001895904541015625, "learning_rate": 9.864155462185312e-07, "loss": 0.0001, "reward": 1.7924107611179352, "reward_std": 0.06658367346972227, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 954 }, { "completion_length": 414.64957427978516, "epoch": 0.07452302535419875, "grad_norm": 0.1084578616974938, "kl": 0.002079010009765625, "learning_rate": 9.863587893225183e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.07905720919370651, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 956 }, { "completion_length": 421.9330520629883, "epoch": 0.07467893126498158, "grad_norm": 0.0480077859183558, "kl": 0.0018215179443359375, "learning_rate": 9.863019157459214e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 958 }, { "completion_length": 408.19868087768555, "epoch": 0.07483483717576443, "grad_norm": 0.1550078472051444, "kl": 0.0018453598022460938, "learning_rate": 9.86244925502385e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.10107943695038557, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 960 }, { "completion_length": 423.4866256713867, "epoch": 0.07499074308654727, "grad_norm": 0.05066497468591427, "kl": 0.0018863677978515625, "learning_rate": 9.861878186055812e-07, "loss": 0.0001, "reward": 1.8616071939468384, "reward_std": 0.07792814821004868, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 962 }, { "completion_length": 425.05805587768555, "epoch": 0.07514664899733012, "grad_norm": 0.15827717896548524, "kl": 0.0020503997802734375, "learning_rate": 9.8613059506921e-07, "loss": 0.0001, "reward": 1.6808036416769028, "reward_std": 0.13767772540450096, "rewards/accuracy_reward": 0.6808036044239998, "rewards/format_reward": 1.0, "step": 964 }, { "completion_length": 428.7745704650879, "epoch": 0.07530255490811295, "grad_norm": 0.1301615261108776, "kl": 0.0020799636840820312, "learning_rate": 9.860732549070005e-07, "loss": 0.0001, "reward": 1.8727679550647736, "reward_std": 0.09235672000795603, "rewards/accuracy_reward": 0.8750000447034836, "rewards/format_reward": 0.9977678656578064, "step": 966 }, { "completion_length": 418.5714454650879, "epoch": 0.07545846081889579, "grad_norm": 0.1106316976542351, "kl": 0.0016231536865234375, "learning_rate": 9.860157981327083e-07, "loss": 0.0001, "reward": 1.8035715371370316, "reward_std": 0.09528662264347076, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 968 }, { "completion_length": 422.8169822692871, "epoch": 0.07561436672967864, "grad_norm": 0.13354885550011825, "kl": 0.0019769668579101562, "learning_rate": 9.859582247601179e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.10092747025191784, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 970 }, { "completion_length": 411.9419822692871, "epoch": 0.07577027264046148, "grad_norm": 0.11533318200631738, "kl": 0.001766204833984375, "learning_rate": 9.859005348030414e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.08296140655875206, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 972 }, { "completion_length": 420.45314025878906, "epoch": 0.07592617855124432, "grad_norm": 0.11804462928723494, "kl": 0.0020875930786132812, "learning_rate": 9.85842728275319e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.08079246431589127, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 974 }, { "completion_length": 417.14734268188477, "epoch": 0.07608208446202716, "grad_norm": 0.11834598862154269, "kl": 0.0019969940185546875, "learning_rate": 9.857848051908191e-07, "loss": 0.0001, "reward": 1.801339328289032, "reward_std": 0.12203817255795002, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 976 }, { "completion_length": 408.28349685668945, "epoch": 0.07623799037281001, "grad_norm": 0.1106318741340753, "kl": 0.0017681121826171875, "learning_rate": 9.857267655634376e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.09619590360671282, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 978 }, { "completion_length": 428.2120704650879, "epoch": 0.07639389628359285, "grad_norm": 0.12978981702712386, "kl": 0.00197601318359375, "learning_rate": 9.856686094070985e-07, "loss": 0.0001, "reward": 1.8013393878936768, "reward_std": 0.09101151768118143, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 980 }, { "completion_length": 420.33037185668945, "epoch": 0.0765498021943757, "grad_norm": 0.09740878309900881, "kl": 0.0018758773803710938, "learning_rate": 9.856103367357541e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.06560797430574894, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 982 }, { "completion_length": 423.10493087768555, "epoch": 0.07670570810515853, "grad_norm": 0.10185824807351601, "kl": 0.0019865036010742188, "learning_rate": 9.855519475633843e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.10589291620999575, "rewards/accuracy_reward": 0.7500000447034836, "rewards/format_reward": 1.0, "step": 984 }, { "completion_length": 412.77233505249023, "epoch": 0.07686161401594138, "grad_norm": 0.10698129803503138, "kl": 0.002040863037109375, "learning_rate": 9.85493441903997e-07, "loss": 0.0001, "reward": 1.9308036416769028, "reward_std": 0.07808011118322611, "rewards/accuracy_reward": 0.9308036044239998, "rewards/format_reward": 1.0, "step": 986 }, { "completion_length": 415.60939025878906, "epoch": 0.07701751992672422, "grad_norm": 0.09115647310284883, "kl": 0.0019168853759765625, "learning_rate": 9.854348197716279e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.054932462982833385, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 988 }, { "completion_length": 420.24778747558594, "epoch": 0.07717342583750707, "grad_norm": 0.08342398384313332, "kl": 0.0018978118896484375, "learning_rate": 9.853760811803408e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.08161619957536459, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 990 }, { "completion_length": 423.2589454650879, "epoch": 0.0773293317482899, "grad_norm": 0.08565499831386429, "kl": 0.0018711090087890625, "learning_rate": 9.85317226144228e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.09491711296141148, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 992 }, { "completion_length": 428.7343940734863, "epoch": 0.07748523765907275, "grad_norm": 0.14412798715430644, "kl": 0.00234222412109375, "learning_rate": 9.852582546774086e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.11016942001879215, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 994 }, { "completion_length": 432.68082427978516, "epoch": 0.07764114356985559, "grad_norm": 0.06638701081564342, "kl": 0.0019893646240234375, "learning_rate": 9.851991667940305e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.09368640929460526, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 0.9977678656578064, "step": 996 }, { "completion_length": 419.51787185668945, "epoch": 0.07779704948063844, "grad_norm": 0.14111434080378532, "kl": 0.002079010009765625, "learning_rate": 9.851399625082693e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.11693793255835772, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 0.9977678656578064, "step": 998 }, { "completion_length": 428.53572845458984, "epoch": 0.07795295539142127, "grad_norm": 0.13340440373429352, "kl": 0.0022039413452148438, "learning_rate": 9.850806418343286e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.12843717820942402, "rewards/accuracy_reward": 0.7455357387661934, "rewards/format_reward": 0.9977678656578064, "step": 1000 }, { "completion_length": 407.10716247558594, "epoch": 0.07810886130220412, "grad_norm": 0.13603383825201434, "kl": 0.001922607421875, "learning_rate": 9.850212047864393e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.1394785586744547, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 1002 }, { "completion_length": 431.13841247558594, "epoch": 0.07826476721298696, "grad_norm": 0.11739456710645457, "kl": 0.0019903182983398438, "learning_rate": 9.849616513788613e-07, "loss": 0.0001, "reward": 1.7678572535514832, "reward_std": 0.12038706708699465, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 1004 }, { "completion_length": 416.611629486084, "epoch": 0.07842067312376981, "grad_norm": 0.10162486610183266, "kl": 0.0017671585083007812, "learning_rate": 9.849019816258815e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.08146506920456886, "rewards/accuracy_reward": 0.7991071939468384, "rewards/format_reward": 1.0, "step": 1006 }, { "completion_length": 414.99778747558594, "epoch": 0.07857657903455265, "grad_norm": 0.113935296362901, "kl": 0.002231597900390625, "learning_rate": 9.848421955418152e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.09461401496082544, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 1008 }, { "completion_length": 426.52680587768555, "epoch": 0.0787324849453355, "grad_norm": 0.12280514732142804, "kl": 0.0020809173583984375, "learning_rate": 9.847822931410055e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.08274526428431273, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 1010 }, { "completion_length": 420.79689025878906, "epoch": 0.07888839085611833, "grad_norm": 0.1407391397226261, "kl": 0.0018796920776367188, "learning_rate": 9.84722274437823e-07, "loss": 0.0001, "reward": 1.7566965073347092, "reward_std": 0.1093323165550828, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 0.9977678656578064, "step": 1012 }, { "completion_length": 421.2589530944824, "epoch": 0.07904429676690118, "grad_norm": 0.12303640376714188, "kl": 0.002079010009765625, "learning_rate": 9.846621394466672e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.12084353249520063, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 1014 }, { "completion_length": 434.2009162902832, "epoch": 0.07920020267768402, "grad_norm": 0.13749169792868243, "kl": 0.0021238327026367188, "learning_rate": 9.846018881819644e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.09897467214614153, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 0.9977678656578064, "step": 1016 }, { "completion_length": 438.4375190734863, "epoch": 0.07935610858846687, "grad_norm": 0.1199865653198068, "kl": 0.0021514892578125, "learning_rate": 9.845415206581693e-07, "loss": 0.0001, "reward": 1.832589402794838, "reward_std": 0.1195499636232853, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 0.9977678656578064, "step": 1018 }, { "completion_length": 408.76118087768555, "epoch": 0.0795120144992497, "grad_norm": 0.07793491953011777, "kl": 0.0017452239990234375, "learning_rate": 9.844810368897648e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.06380490213632584, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 1020 }, { "completion_length": 403.35939025878906, "epoch": 0.07966792041003254, "grad_norm": 0.10076448974737216, "kl": 0.002105712890625, "learning_rate": 9.844204368912608e-07, "loss": 0.0001, "reward": 1.7566965222358704, "reward_std": 0.0659110676497221, "rewards/accuracy_reward": 0.756696455180645, "rewards/format_reward": 1.0, "step": 1022 }, { "completion_length": 419.3259086608887, "epoch": 0.07982382632081539, "grad_norm": 0.14238235934244656, "kl": 0.0021276473999023438, "learning_rate": 9.843597206771961e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.13208776991814375, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 0.9977678656578064, "step": 1024 }, { "completion_length": 408.70537185668945, "epoch": 0.07997973223159822, "grad_norm": 0.10389180560185234, "kl": 0.001628875732421875, "learning_rate": 9.842988882621368e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.09589140769094229, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 1026 }, { "completion_length": 415.8973388671875, "epoch": 0.08013563814238107, "grad_norm": 0.11483025318379367, "kl": 0.0019884109497070312, "learning_rate": 9.842379396606767e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.10461412090808153, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 1028 }, { "completion_length": 418.9799270629883, "epoch": 0.08029154405316391, "grad_norm": 0.11289009985445186, "kl": 0.0019483566284179688, "learning_rate": 9.841768748874381e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.08537066262215376, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 1030 }, { "completion_length": 425.42189025878906, "epoch": 0.08044744996394676, "grad_norm": 0.06213229982211131, "kl": 0.001827239990234375, "learning_rate": 9.841156939570707e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.07042145263403654, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 1032 }, { "completion_length": 418.2076072692871, "epoch": 0.0806033558747296, "grad_norm": 0.13534777392269956, "kl": 0.0020952224731445312, "learning_rate": 9.840543968842522e-07, "loss": 0.0001, "reward": 1.6808036416769028, "reward_std": 0.15390853863209486, "rewards/accuracy_reward": 0.6808035969734192, "rewards/format_reward": 1.0, "step": 1034 }, { "completion_length": 428.6718940734863, "epoch": 0.08075926178551245, "grad_norm": 0.11848092943419107, "kl": 0.0020418167114257812, "learning_rate": 9.839929836836882e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.08845393173396587, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 1036 }, { "completion_length": 426.9888610839844, "epoch": 0.08091516769629528, "grad_norm": 0.11057544923256384, "kl": 0.0021333694458007812, "learning_rate": 9.83931454370112e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.07192142680287361, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 1038 }, { "completion_length": 436.37948989868164, "epoch": 0.08107107360707813, "grad_norm": 0.12945213457565957, "kl": 0.0019922256469726562, "learning_rate": 9.83869808958285e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.11467840243130922, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 1040 }, { "completion_length": 400.2835006713867, "epoch": 0.08122697951786097, "grad_norm": 0.11010682756435439, "kl": 0.0018367767333984375, "learning_rate": 9.838080474629963e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.09138102736324072, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 1042 }, { "completion_length": 411.870548248291, "epoch": 0.08138288542864382, "grad_norm": 0.10632333102946655, "kl": 0.0016260147094726562, "learning_rate": 9.837461698990629e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.062223016284406185, "rewards/accuracy_reward": 0.8683036118745804, "rewards/format_reward": 1.0, "step": 1044 }, { "completion_length": 410.90403747558594, "epoch": 0.08153879133942665, "grad_norm": 0.08015217527226494, "kl": 0.0020198822021484375, "learning_rate": 9.836841762813295e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.0804893709719181, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 0.9977678656578064, "step": 1046 }, { "completion_length": 425.1160888671875, "epoch": 0.0816946972502095, "grad_norm": 0.12801537223993142, "kl": 0.0019063949584960938, "learning_rate": 9.836220666246688e-07, "loss": 0.0001, "reward": 1.8928571939468384, "reward_std": 0.08702037110924721, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 1048 }, { "completion_length": 412.25224685668945, "epoch": 0.08185060316099234, "grad_norm": 0.1245691182420629, "kl": 0.00194549560546875, "learning_rate": 9.835598409439814e-07, "loss": 0.0001, "reward": 1.8772322237491608, "reward_std": 0.07109545823186636, "rewards/accuracy_reward": 0.8772321864962578, "rewards/format_reward": 1.0, "step": 1050 }, { "completion_length": 412.25894927978516, "epoch": 0.08200650907177519, "grad_norm": 0.10823056622894774, "kl": 0.0021266937255859375, "learning_rate": 9.834974992541956e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.7522321864962578, "rewards/format_reward": 1.0, "step": 1052 }, { "completion_length": 418.4509086608887, "epoch": 0.08216241498255802, "grad_norm": 0.11897778364022872, "kl": 0.0019922256469726562, "learning_rate": 9.834350415702676e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.08973272237926722, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 1.0, "step": 1054 }, { "completion_length": 412.20314025878906, "epoch": 0.08231832089334087, "grad_norm": 0.09290745404726482, "kl": 0.0017976760864257812, "learning_rate": 9.833724679071813e-07, "loss": 0.0001, "reward": 1.9107143431901932, "reward_std": 0.054932461120188236, "rewards/accuracy_reward": 0.9107143059372902, "rewards/format_reward": 1.0, "step": 1056 }, { "completion_length": 432.3616256713867, "epoch": 0.08247422680412371, "grad_norm": 0.1356609139682244, "kl": 0.0020160675048828125, "learning_rate": 9.833097782799487e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.10220626462250948, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 0.9977678656578064, "step": 1058 }, { "completion_length": 425.60716247558594, "epoch": 0.08263013271490656, "grad_norm": 0.13714287955906374, "kl": 0.0021066665649414062, "learning_rate": 9.832469727036092e-07, "loss": 0.0001, "reward": 1.7901786714792252, "reward_std": 0.056364620104432106, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 0.9977678656578064, "step": 1060 }, { "completion_length": 423.1808204650879, "epoch": 0.0827860386256894, "grad_norm": 0.10676602321944621, "kl": 0.00202178955078125, "learning_rate": 9.831840511932305e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.06282920483499765, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 1062 }, { "completion_length": 411.3192138671875, "epoch": 0.08294194453647225, "grad_norm": 0.12492486871199324, "kl": 0.0023508071899414062, "learning_rate": 9.831210137639077e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.11535240802913904, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 1064 }, { "completion_length": 436.892879486084, "epoch": 0.08309785044725508, "grad_norm": 0.09948878421576278, "kl": 0.0020427703857421875, "learning_rate": 9.830578604307639e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.07808151189237833, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 1066 }, { "completion_length": 416.20091247558594, "epoch": 0.08325375635803793, "grad_norm": 0.08394619894658004, "kl": 0.0018548965454101562, "learning_rate": 9.8299459120895e-07, "loss": 0.0001, "reward": 1.7812500596046448, "reward_std": 0.05718835536390543, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 1.0, "step": 1068 }, { "completion_length": 415.2924270629883, "epoch": 0.08340966226882077, "grad_norm": 0.10939226377998208, "kl": 0.0018911361694335938, "learning_rate": 9.829312061136448e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.10010010190308094, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 1070 }, { "completion_length": 429.75447845458984, "epoch": 0.0835655681796036, "grad_norm": 0.12839368207077065, "kl": 0.002044677734375, "learning_rate": 9.828677051600547e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.09558971598744392, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 1072 }, { "completion_length": 419.94197845458984, "epoch": 0.08372147409038645, "grad_norm": 0.11542805724065601, "kl": 0.0019350051879882812, "learning_rate": 9.82804088363414e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.09033750835806131, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 1074 }, { "completion_length": 422.9174346923828, "epoch": 0.08387738000116929, "grad_norm": 0.13754824038606764, "kl": 0.0027055740356445312, "learning_rate": 9.827403557389849e-07, "loss": 0.0001, "reward": 1.868303656578064, "reward_std": 0.07530498038977385, "rewards/accuracy_reward": 0.868303619325161, "rewards/format_reward": 1.0, "step": 1076 }, { "completion_length": 422.12724685668945, "epoch": 0.08403328591195214, "grad_norm": 0.11598529279468806, "kl": 0.0019989013671875, "learning_rate": 9.82676507302057e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.12865472119301558, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 1078 }, { "completion_length": 415.74778747558594, "epoch": 0.08418919182273497, "grad_norm": 0.1570478570970407, "kl": 0.0021238327026367188, "learning_rate": 9.82612543067948e-07, "loss": 0.0001, "reward": 1.7299107909202576, "reward_std": 0.12700221501290798, "rewards/accuracy_reward": 0.7299107387661934, "rewards/format_reward": 1.0, "step": 1080 }, { "completion_length": 419.40180587768555, "epoch": 0.08434509773351782, "grad_norm": 0.11055238370677478, "kl": 0.0019350051879882812, "learning_rate": 9.825484630520034e-07, "loss": 0.0001, "reward": 1.9196429252624512, "reward_std": 0.06688676495105028, "rewards/accuracy_reward": 0.9196428954601288, "rewards/format_reward": 1.0, "step": 1082 }, { "completion_length": 422.1696586608887, "epoch": 0.08450100364430066, "grad_norm": 0.08655425993956564, "kl": 0.0021409988403320312, "learning_rate": 9.824842672695965e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.06591106858104467, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 1084 }, { "completion_length": 418.5960006713867, "epoch": 0.08465690955508351, "grad_norm": 0.15186742977224926, "kl": 0.0020046234130859375, "learning_rate": 9.824199557361281e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.11498513631522655, "rewards/accuracy_reward": 0.8258929029107094, "rewards/format_reward": 1.0, "step": 1086 }, { "completion_length": 425.377254486084, "epoch": 0.08481281546586635, "grad_norm": 0.09886937046413236, "kl": 0.0019779205322265625, "learning_rate": 9.823555284670272e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.06981526222079992, "rewards/accuracy_reward": 0.892857164144516, "rewards/format_reward": 1.0, "step": 1088 }, { "completion_length": 414.4196662902832, "epoch": 0.0849687213766492, "grad_norm": 0.07245375291486404, "kl": 0.0019474029541015625, "learning_rate": 9.822909854777502e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.857142873108387, "rewards/format_reward": 1.0, "step": 1090 }, { "completion_length": 414.0089454650879, "epoch": 0.08512462728743203, "grad_norm": 0.10992861001748706, "kl": 0.0020360946655273438, "learning_rate": 9.82226326783781e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.07515301555395126, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 1092 }, { "completion_length": 417.31921768188477, "epoch": 0.08528053319821488, "grad_norm": 0.0984165943526772, "kl": 0.0020532608032226562, "learning_rate": 9.821615524006324e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.0843949681147933, "rewards/accuracy_reward": 0.7165178954601288, "rewards/format_reward": 1.0, "step": 1094 }, { "completion_length": 420.54019927978516, "epoch": 0.08543643910899772, "grad_norm": 0.07237733304931561, "kl": 0.002056121826171875, "learning_rate": 9.820966623438435e-07, "loss": 0.0001, "reward": 1.7544643878936768, "reward_std": 0.0793603053316474, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 1.0, "step": 1096 }, { "completion_length": 424.6026954650879, "epoch": 0.08559234501978057, "grad_norm": 0.1705998165182482, "kl": 0.002063751220703125, "learning_rate": 9.820316566289822e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.13601324893534184, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 0.9977678656578064, "step": 1098 }, { "completion_length": 408.9888572692871, "epoch": 0.0857482509305634, "grad_norm": 0.08927236659712404, "kl": 0.0018177032470703125, "learning_rate": 9.819665352716438e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.0738728241994977, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 1100 }, { "completion_length": 427.02457427978516, "epoch": 0.08590415684134625, "grad_norm": 0.12162101616597618, "kl": 0.0022487640380859375, "learning_rate": 9.81901298287451e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.13173518422991037, "rewards/accuracy_reward": 0.7321428917348385, "rewards/format_reward": 1.0, "step": 1102 }, { "completion_length": 429.5022506713867, "epoch": 0.08606006275212909, "grad_norm": 0.13235303064998932, "kl": 0.0020952224731445312, "learning_rate": 9.818359456920548e-07, "loss": 0.0001, "reward": 1.7656251043081284, "reward_std": 0.09558971598744392, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 1104 }, { "completion_length": 416.11609268188477, "epoch": 0.08621596866291194, "grad_norm": 0.1146150279603238, "kl": 0.00200653076171875, "learning_rate": 9.817704775011337e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.07192142587155104, "rewards/accuracy_reward": 0.7700893059372902, "rewards/format_reward": 1.0, "step": 1106 }, { "completion_length": 410.9776954650879, "epoch": 0.08637187457369477, "grad_norm": 0.049384807891505074, "kl": 0.0019512176513671875, "learning_rate": 9.81704893730394e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.047946405597031116, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 1108 }, { "completion_length": 430.189754486084, "epoch": 0.08652778048447762, "grad_norm": 0.09085957232629938, "kl": 0.0018491744995117188, "learning_rate": 9.816391943955696e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.07612871285527945, "rewards/accuracy_reward": 0.866071455180645, "rewards/format_reward": 0.9977678656578064, "step": 1110 }, { "completion_length": 416.283504486084, "epoch": 0.08668368639526046, "grad_norm": 0.03919242873099074, "kl": 0.0018825531005859375, "learning_rate": 9.81573379512422e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.061247317120432854, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 1112 }, { "completion_length": 417.877254486084, "epoch": 0.08683959230604331, "grad_norm": 0.10399237657377733, "kl": 0.0020723342895507812, "learning_rate": 9.81507449096741e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.09513325244188309, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 1114 }, { "completion_length": 418.65626525878906, "epoch": 0.08699549821682614, "grad_norm": 0.10991433376254696, "kl": 0.0021657943725585938, "learning_rate": 9.814414031643434e-07, "loss": 0.0001, "reward": 1.8013393878936768, "reward_std": 0.10250795818865299, "rewards/accuracy_reward": 0.8013393357396126, "rewards/format_reward": 1.0, "step": 1116 }, { "completion_length": 426.75001525878906, "epoch": 0.087151404127609, "grad_norm": 0.10480614269585332, "kl": 0.0021371841430664062, "learning_rate": 9.813752417310743e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.06996862776577473, "rewards/accuracy_reward": 0.714285746216774, "rewards/format_reward": 1.0, "step": 1118 }, { "completion_length": 417.5669860839844, "epoch": 0.08730731003839183, "grad_norm": 0.13145915586492896, "kl": 0.0023145675659179688, "learning_rate": 9.81308964812806e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.12572482135146856, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 0.9977678656578064, "step": 1120 }, { "completion_length": 429.495548248291, "epoch": 0.08746321594917467, "grad_norm": 0.10011941458665305, "kl": 0.001918792724609375, "learning_rate": 9.81242572425439e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.06575629860162735, "rewards/accuracy_reward": 0.8839285969734192, "rewards/format_reward": 1.0, "step": 1122 }, { "completion_length": 418.5759086608887, "epoch": 0.08761912185995752, "grad_norm": 0.1164503855704079, "kl": 0.0023412704467773438, "learning_rate": 9.81176064584901e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.06996862683445215, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 1124 }, { "completion_length": 416.86608505249023, "epoch": 0.08777502777074035, "grad_norm": 0.13687659946447792, "kl": 0.0019769668579101562, "learning_rate": 9.811094413071478e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.09229030553251505, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 1126 }, { "completion_length": 419.44422149658203, "epoch": 0.0879309336815232, "grad_norm": 0.1057507779058835, "kl": 0.0018787384033203125, "learning_rate": 9.810427026081628e-07, "loss": 0.0001, "reward": 1.883928656578064, "reward_std": 0.08537066448479891, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 1128 }, { "completion_length": 414.3683166503906, "epoch": 0.08808683959230604, "grad_norm": 0.12369214925745567, "kl": 0.0020189285278320312, "learning_rate": 9.80975848503957e-07, "loss": 0.0001, "reward": 1.8705358058214188, "reward_std": 0.0771044148132205, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 1.0, "step": 1130 }, { "completion_length": 420.7277030944824, "epoch": 0.08824274550308889, "grad_norm": 0.11781228254274871, "kl": 0.0020771026611328125, "learning_rate": 9.809088790105688e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.07710441201925278, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 1132 }, { "completion_length": 416.4643020629883, "epoch": 0.08839865141387172, "grad_norm": 0.12172343292769106, "kl": 0.002223968505859375, "learning_rate": 9.808417941440653e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.11032278463244438, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 0.9977678656578064, "step": 1134 }, { "completion_length": 424.03796768188477, "epoch": 0.08855455732465457, "grad_norm": 0.1270163355804076, "kl": 0.0020322799682617188, "learning_rate": 9.8077459392054e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.12760840356349945, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 1136 }, { "completion_length": 415.3928756713867, "epoch": 0.08871046323543741, "grad_norm": 0.14516226569268573, "kl": 0.0019311904907226562, "learning_rate": 9.80707278356115e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.11745717190206051, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 1138 }, { "completion_length": 421.8192138671875, "epoch": 0.08886636914622026, "grad_norm": 0.13712990667982813, "kl": 0.002101898193359375, "learning_rate": 9.806398474669394e-07, "loss": 0.0001, "reward": 1.8526786267757416, "reward_std": 0.05569201707839966, "rewards/accuracy_reward": 0.8526786155998707, "rewards/format_reward": 1.0, "step": 1140 }, { "completion_length": 416.32591247558594, "epoch": 0.0890222750570031, "grad_norm": 0.07249509217413441, "kl": 0.0020017623901367188, "learning_rate": 9.805723012691909e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.04569191299378872, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 1142 }, { "completion_length": 406.02233505249023, "epoch": 0.08917818096778594, "grad_norm": 0.09152636412796153, "kl": 0.0021190643310546875, "learning_rate": 9.805046397790736e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.05425985809415579, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 1144 }, { "completion_length": 427.47769927978516, "epoch": 0.08933408687856878, "grad_norm": 0.09808801570613801, "kl": 0.002227783203125, "learning_rate": 9.804368630128202e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.08686700277030468, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 1146 }, { "completion_length": 413.6138572692871, "epoch": 0.08948999278935163, "grad_norm": 0.10670063104504166, "kl": 0.002086639404296875, "learning_rate": 9.803689709866907e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.06027021910995245, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 1148 }, { "completion_length": 433.16073989868164, "epoch": 0.08964589870013447, "grad_norm": 0.1078293903545004, "kl": 0.0023126602172851562, "learning_rate": 9.80300963716973e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.0963492738083005, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 0.9977678656578064, "step": 1150 }, { "completion_length": 417.38171005249023, "epoch": 0.08980180461091732, "grad_norm": 0.1149584774178752, "kl": 0.0026845932006835938, "learning_rate": 9.802328412199823e-07, "loss": 0.0001, "reward": 1.8816965222358704, "reward_std": 0.0853720661252737, "rewards/accuracy_reward": 0.8816964775323868, "rewards/format_reward": 1.0, "step": 1152 }, { "completion_length": 417.4442138671875, "epoch": 0.08995771052170015, "grad_norm": 0.13678004375274, "kl": 0.0020666122436523438, "learning_rate": 9.801646035120617e-07, "loss": 0.0001, "reward": 1.7991071939468384, "reward_std": 0.13947715610265732, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 1154 }, { "completion_length": 424.08484268188477, "epoch": 0.090113616432483, "grad_norm": 0.07002739535373827, "kl": 0.001979827880859375, "learning_rate": 9.80096250609582e-07, "loss": 0.0001, "reward": 1.8482143878936768, "reward_std": 0.05553865246474743, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 1156 }, { "completion_length": 424.86608505249023, "epoch": 0.09026952234326584, "grad_norm": 0.1391334233159408, "kl": 0.0019960403442382812, "learning_rate": 9.80027782528941e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.06560657173395157, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 1158 }, { "completion_length": 413.13171768188477, "epoch": 0.09042542825404869, "grad_norm": 0.11145788284711243, "kl": 0.0022993087768554688, "learning_rate": 9.799591992865655e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.10897477623075247, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 1160 }, { "completion_length": 422.0089454650879, "epoch": 0.09058133416483152, "grad_norm": 0.10410356850465652, "kl": 0.0021648406982421875, "learning_rate": 9.798905008989084e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.0780815128237009, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 1162 }, { "completion_length": 421.27233505249023, "epoch": 0.09073724007561437, "grad_norm": 0.10408939924571944, "kl": 0.0020208358764648438, "learning_rate": 9.79821687382451e-07, "loss": 0.0001, "reward": 1.7946429550647736, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 1164 }, { "completion_length": 419.5826110839844, "epoch": 0.09089314598639721, "grad_norm": 0.09578836598121006, "kl": 0.0020236968994140625, "learning_rate": 9.79752758753702e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.054932461120188236, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 1166 }, { "completion_length": 412.3750228881836, "epoch": 0.09104905189718006, "grad_norm": 0.11137595620053359, "kl": 0.0020647048950195312, "learning_rate": 9.79683715029198e-07, "loss": 0.0001, "reward": 1.7946429550647736, "reward_std": 0.060270216315984726, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 1168 }, { "completion_length": 418.5022506713867, "epoch": 0.0912049578079629, "grad_norm": 0.1359943879729965, "kl": 0.0020618438720703125, "learning_rate": 9.796145562255032e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.0834178663790226, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 1170 }, { "completion_length": 405.9821586608887, "epoch": 0.09136086371874573, "grad_norm": 0.10662805924554439, "kl": 0.0022001266479492188, "learning_rate": 9.79545282359209e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.07756087463349104, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 1172 }, { "completion_length": 412.5446586608887, "epoch": 0.09151676962952858, "grad_norm": 0.12303797457453161, "kl": 0.002124786376953125, "learning_rate": 9.794758934469344e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.09732356760650873, "rewards/accuracy_reward": 0.7388393320143223, "rewards/format_reward": 1.0, "step": 1174 }, { "completion_length": 422.07368087768555, "epoch": 0.09167267554031142, "grad_norm": 0.08398442734551326, "kl": 0.0022830963134765625, "learning_rate": 9.794063895053268e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.056821079924702644, "rewards/accuracy_reward": 0.7477678805589676, "rewards/format_reward": 1.0, "step": 1176 }, { "completion_length": 427.54912185668945, "epoch": 0.09182858145109427, "grad_norm": 0.14281907263089677, "kl": 0.00205230712890625, "learning_rate": 9.793367705510602e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.1180683970451355, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 1.0, "step": 1178 }, { "completion_length": 424.36609268188477, "epoch": 0.0919844873618771, "grad_norm": 0.09955473404756984, "kl": 0.0026788711547851562, "learning_rate": 9.792670366008368e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 1180 }, { "completion_length": 436.439754486084, "epoch": 0.09214039327265995, "grad_norm": 0.1279959702944325, "kl": 0.002750396728515625, "learning_rate": 9.791971876713862e-07, "loss": 0.0001, "reward": 1.7455358058214188, "reward_std": 0.0929105756804347, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 0.9977678656578064, "step": 1182 }, { "completion_length": 419.3169822692871, "epoch": 0.09229629918344279, "grad_norm": 0.12791144058048967, "kl": 0.0020017623901367188, "learning_rate": 9.791272237794656e-07, "loss": 0.0001, "reward": 1.8102678954601288, "reward_std": 0.11513486225157976, "rewards/accuracy_reward": 0.8102678768336773, "rewards/format_reward": 1.0, "step": 1184 }, { "completion_length": 418.2812690734863, "epoch": 0.09245220509422564, "grad_norm": 0.08610559325555364, "kl": 0.001949310302734375, "learning_rate": 9.790571449418598e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.10160007420927286, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 1186 }, { "completion_length": 415.2701072692871, "epoch": 0.09260811100500847, "grad_norm": 0.11726677495081354, "kl": 0.00199127197265625, "learning_rate": 9.789869511753814e-07, "loss": 0.0001, "reward": 1.7075893729925156, "reward_std": 0.08100860938429832, "rewards/accuracy_reward": 0.7075893133878708, "rewards/format_reward": 1.0, "step": 1188 }, { "completion_length": 412.3750228881836, "epoch": 0.09276401691579132, "grad_norm": 0.09647573214939995, "kl": 0.0018548965454101562, "learning_rate": 9.789166424968698e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.0641744127497077, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 1.0, "step": 1190 }, { "completion_length": 424.81028747558594, "epoch": 0.09291992282657416, "grad_norm": 0.12872203738416751, "kl": 0.00215911865234375, "learning_rate": 9.78846218923193e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.08732346352189779, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 1192 }, { "completion_length": 431.5870666503906, "epoch": 0.09307582873735701, "grad_norm": 0.06410248707283554, "kl": 0.0021696090698242188, "learning_rate": 9.787756804712456e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.06883816421031952, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 1194 }, { "completion_length": 428.32143783569336, "epoch": 0.09323173464813984, "grad_norm": 0.13821361834749218, "kl": 0.0024585723876953125, "learning_rate": 9.787050271579509e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.12843577936291695, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 1196 }, { "completion_length": 420.6183204650879, "epoch": 0.0933876405589227, "grad_norm": 0.10389222506171672, "kl": 0.002185821533203125, "learning_rate": 9.786342590002583e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.09604477044194937, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 0.9977678656578064, "step": 1198 }, { "completion_length": 423.0067138671875, "epoch": 0.09354354646970553, "grad_norm": 0.059666528879625175, "kl": 0.002048492431640625, "learning_rate": 9.78563376015146e-07, "loss": 0.0001, "reward": 1.8861608058214188, "reward_std": 0.06756077334284782, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 1.0, "step": 1200 }, { "completion_length": 407.8214454650879, "epoch": 0.09369945238048838, "grad_norm": 0.08469609785188678, "kl": 0.0019664764404296875, "learning_rate": 9.784923782196193e-07, "loss": 0.0001, "reward": 1.9129464626312256, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.9129464626312256, "rewards/format_reward": 1.0, "step": 1202 }, { "completion_length": 410.8415336608887, "epoch": 0.09385535829127122, "grad_norm": 0.14110197544308983, "kl": 0.0022459030151367188, "learning_rate": 9.784212656307107e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.09769588150084019, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 1204 }, { "completion_length": 422.3973388671875, "epoch": 0.09401126420205407, "grad_norm": 0.2067056983541634, "kl": 0.0022001266479492188, "learning_rate": 9.78350038265481e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.08922252804040909, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 0.9977678656578064, "step": 1206 }, { "completion_length": 418.53126525878906, "epoch": 0.0941671701128369, "grad_norm": 0.1001755474164835, "kl": 0.002162933349609375, "learning_rate": 9.782786961410177e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 1208 }, { "completion_length": 409.4576110839844, "epoch": 0.09432307602361975, "grad_norm": 0.13979574403468573, "kl": 0.0022878646850585938, "learning_rate": 9.782072392744365e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.12437681667506695, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 1210 }, { "completion_length": 419.64957427978516, "epoch": 0.09447898193440259, "grad_norm": 0.16867858691079607, "kl": 0.002532958984375, "learning_rate": 9.781356676828802e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.14255761913955212, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 1212 }, { "completion_length": 412.9330520629883, "epoch": 0.09463488784518544, "grad_norm": 0.1201933698143431, "kl": 0.0019941329956054688, "learning_rate": 9.780639813835195e-07, "loss": 0.0001, "reward": 1.9062500447034836, "reward_std": 0.05621125362813473, "rewards/accuracy_reward": 0.9062500223517418, "rewards/format_reward": 1.0, "step": 1214 }, { "completion_length": 410.0625190734863, "epoch": 0.09479079375596827, "grad_norm": 0.06833508275421298, "kl": 0.0021142959594726562, "learning_rate": 9.77992180393552e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.031413902528584, "rewards/accuracy_reward": 0.9040179029107094, "rewards/format_reward": 1.0, "step": 1216 }, { "completion_length": 426.1294822692871, "epoch": 0.09494669966675112, "grad_norm": 0.1474312837938719, "kl": 0.0022068023681640625, "learning_rate": 9.779202647302036e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.07612871285527945, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 1.0, "step": 1218 }, { "completion_length": 403.7098388671875, "epoch": 0.09510260557753396, "grad_norm": 0.06866887630719196, "kl": 0.0018901824951171875, "learning_rate": 9.778482344107274e-07, "loss": 0.0001, "reward": 1.8772321939468384, "reward_std": 0.07483514863997698, "rewards/accuracy_reward": 0.881696455180645, "rewards/format_reward": 0.9955357313156128, "step": 1220 }, { "completion_length": 429.8326072692871, "epoch": 0.09525851148831681, "grad_norm": 0.13336086107098988, "kl": 0.0019578933715820312, "learning_rate": 9.777760894524034e-07, "loss": 0.0001, "reward": 1.8727679550647736, "reward_std": 0.05621265713125467, "rewards/accuracy_reward": 0.872767873108387, "rewards/format_reward": 1.0, "step": 1222 }, { "completion_length": 419.6406478881836, "epoch": 0.09541441739909964, "grad_norm": 0.14632342431130563, "kl": 0.0021514892578125, "learning_rate": 9.7770382987254e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.11190467327833176, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 1224 }, { "completion_length": 427.83260345458984, "epoch": 0.09557032330988248, "grad_norm": 0.12067474827002501, "kl": 0.0020742416381835938, "learning_rate": 9.776314556884729e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.10641719121485949, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 1226 }, { "completion_length": 414.8169822692871, "epoch": 0.09572622922066533, "grad_norm": 0.09980044668974244, "kl": 0.0023136138916015625, "learning_rate": 9.775589669175647e-07, "loss": 0.0001, "reward": 1.7299107909202576, "reward_std": 0.08936181291937828, "rewards/accuracy_reward": 0.7299107313156128, "rewards/format_reward": 1.0, "step": 1228 }, { "completion_length": 420.4464454650879, "epoch": 0.09588213513144817, "grad_norm": 0.08641407001178153, "kl": 0.0019521713256835938, "learning_rate": 9.774863635772063e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.05005116667598486, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 1230 }, { "completion_length": 427.43082427978516, "epoch": 0.09603804104223101, "grad_norm": 0.09270099857277665, "kl": 0.0020017623901367188, "learning_rate": 9.774136456848156e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 1232 }, { "completion_length": 415.41072845458984, "epoch": 0.09619394695301385, "grad_norm": 0.06724526290681415, "kl": 0.0018825531005859375, "learning_rate": 9.77340813257838e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.06981526128947735, "rewards/accuracy_reward": 0.8169643357396126, "rewards/format_reward": 1.0, "step": 1234 }, { "completion_length": 432.4107360839844, "epoch": 0.0963498528637967, "grad_norm": 0.15120967650248618, "kl": 0.0020809173583984375, "learning_rate": 9.772678663137464e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.09326740819960833, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 1236 }, { "completion_length": 419.0290336608887, "epoch": 0.09650575877457954, "grad_norm": 0.09912365249031956, "kl": 0.0021238327026367188, "learning_rate": 9.771948048700415e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.07875551842153072, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 1238 }, { "completion_length": 413.76787185668945, "epoch": 0.09666166468536239, "grad_norm": 0.11489367448294072, "kl": 0.0020732879638671875, "learning_rate": 9.77121628944251e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.0994953103363514, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 1240 }, { "completion_length": 415.54689025878906, "epoch": 0.09681757059614522, "grad_norm": 0.10510781461141024, "kl": 0.00205230712890625, "learning_rate": 9.770483385539303e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.07094432599842548, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 1242 }, { "completion_length": 414.3080520629883, "epoch": 0.09697347650692807, "grad_norm": 0.0995783907579276, "kl": 0.0021944046020507812, "learning_rate": 9.769749337166622e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.08213907480239868, "rewards/accuracy_reward": 0.8772321790456772, "rewards/format_reward": 1.0, "step": 1244 }, { "completion_length": 418.2165336608887, "epoch": 0.09712938241771091, "grad_norm": 0.09113617981656355, "kl": 0.0019426345825195312, "learning_rate": 9.769014144500572e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.0618521049618721, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 1246 }, { "completion_length": 440.65849685668945, "epoch": 0.09728528832849376, "grad_norm": 0.0886263415583988, "kl": 0.0020380020141601562, "learning_rate": 9.768277807717528e-07, "loss": 0.0001, "reward": 1.8973215073347092, "reward_std": 0.07643180899322033, "rewards/accuracy_reward": 0.8973214626312256, "rewards/format_reward": 1.0, "step": 1248 }, { "completion_length": 418.21653747558594, "epoch": 0.0974411942392766, "grad_norm": 0.09613665791977645, "kl": 0.0023241043090820312, "learning_rate": 9.767540326994142e-07, "loss": 0.0001, "reward": 1.863839328289032, "reward_std": 0.06883956305682659, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 1250 }, { "completion_length": 427.5937690734863, "epoch": 0.09759710015005944, "grad_norm": 0.08517624171902441, "kl": 0.0019512176513671875, "learning_rate": 9.766801702507344e-07, "loss": 0.0001, "reward": 1.9040179252624512, "reward_std": 0.04132985696196556, "rewards/accuracy_reward": 0.9040178954601288, "rewards/format_reward": 1.0, "step": 1252 }, { "completion_length": 419.1227798461914, "epoch": 0.09775300606084228, "grad_norm": 0.11319137773481461, "kl": 0.00235748291015625, "learning_rate": 9.766061934434328e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.09747469797730446, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 0.9977678656578064, "step": 1254 }, { "completion_length": 414.3571586608887, "epoch": 0.09790891197162513, "grad_norm": 0.08522133629893981, "kl": 0.0021753311157226562, "learning_rate": 9.765321022952576e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.0660630315542221, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 1256 }, { "completion_length": 418.1741256713867, "epoch": 0.09806481788240796, "grad_norm": 0.08459965295127733, "kl": 0.0020084381103515625, "learning_rate": 9.764578968239833e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.060270218178629875, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 1258 }, { "completion_length": 420.5870704650879, "epoch": 0.09822072379319081, "grad_norm": 0.1298809725931641, "kl": 0.0020580291748046875, "learning_rate": 9.763835770474126e-07, "loss": 0.0001, "reward": 1.8325893878936768, "reward_std": 0.06365517433732748, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 1260 }, { "completion_length": 414.6294822692871, "epoch": 0.09837662970397365, "grad_norm": 0.11793073975707685, "kl": 0.0020017623901367188, "learning_rate": 9.763091429833748e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.049378564581274986, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 1262 }, { "completion_length": 418.6294860839844, "epoch": 0.0985325356147565, "grad_norm": 0.12454882968778778, "kl": 0.0024080276489257812, "learning_rate": 9.762345946497276e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.11129988729953766, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 1.0, "step": 1264 }, { "completion_length": 416.1651954650879, "epoch": 0.09868844152553934, "grad_norm": 0.1210683481758797, "kl": 0.0027685165405273438, "learning_rate": 9.761599320643554e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.07951367180794477, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 1.0, "step": 1266 }, { "completion_length": 421.0335006713867, "epoch": 0.09884434743632219, "grad_norm": 0.1439178136629116, "kl": 0.0021848678588867188, "learning_rate": 9.760851552451702e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.09784420486539602, "rewards/accuracy_reward": 0.7232143208384514, "rewards/format_reward": 1.0, "step": 1268 }, { "completion_length": 422.10716247558594, "epoch": 0.09900025334710502, "grad_norm": 0.12412738416250299, "kl": 0.0020341873168945312, "learning_rate": 9.760102642101116e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.09394001215696335, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 1270 }, { "completion_length": 425.4040336608887, "epoch": 0.09915615925788787, "grad_norm": 0.1256572687980098, "kl": 0.0022735595703125, "learning_rate": 9.759352589771461e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.09537217207252979, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 1272 }, { "completion_length": 409.7143020629883, "epoch": 0.09931206516867071, "grad_norm": 0.08229282935907632, "kl": 0.0020799636840820312, "learning_rate": 9.758601395642686e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 1274 }, { "completion_length": 420.0580520629883, "epoch": 0.09946797107945354, "grad_norm": 0.09615781835280036, "kl": 0.0023660659790039062, "learning_rate": 9.757849059894999e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.10821662098169327, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 1276 }, { "completion_length": 432.7009086608887, "epoch": 0.09962387699023639, "grad_norm": 0.13370951028360573, "kl": 0.0021562576293945312, "learning_rate": 9.757095582708896e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.09943113196641207, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 1278 }, { "completion_length": 421.7031440734863, "epoch": 0.09977978290101923, "grad_norm": 0.11653297042491605, "kl": 0.0022487640380859375, "learning_rate": 9.756340964265137e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.07838460709899664, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 1280 }, { "completion_length": 444.2589530944824, "epoch": 0.09993568881180208, "grad_norm": 0.17056759216402123, "kl": 0.0025453567504882812, "learning_rate": 9.755585204744764e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.13572632800787687, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 0.9977678656578064, "step": 1282 }, { "completion_length": 427.2701072692871, "epoch": 0.10009159472258491, "grad_norm": 0.113029884020633, "kl": 0.00213623046875, "learning_rate": 9.754828304329088e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.061549010686576366, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 0.9977678656578064, "step": 1284 }, { "completion_length": 430.49778747558594, "epoch": 0.10024750063336776, "grad_norm": 0.1549539160672164, "kl": 0.00231170654296875, "learning_rate": 9.75407026319969e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.10745706781744957, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 1286 }, { "completion_length": 409.6808204650879, "epoch": 0.1004034065441506, "grad_norm": 0.10690622920998157, "kl": 0.0020122528076171875, "learning_rate": 9.753311081538432e-07, "loss": 0.0001, "reward": 1.8504465222358704, "reward_std": 0.06883956305682659, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 1288 }, { "completion_length": 423.4486770629883, "epoch": 0.10055931245493345, "grad_norm": 0.15307349047410582, "kl": 0.005591392517089844, "learning_rate": 9.752550759527448e-07, "loss": 0.0002, "reward": 1.812500074505806, "reward_std": 0.06560797337442636, "rewards/accuracy_reward": 0.8125000521540642, "rewards/format_reward": 1.0, "step": 1290 }, { "completion_length": 433.0669860839844, "epoch": 0.10071521836571629, "grad_norm": 0.0778780089043515, "kl": 0.0022602081298828125, "learning_rate": 9.751789297349139e-07, "loss": 0.0001, "reward": 1.8102678954601288, "reward_std": 0.07710581459105015, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 1292 }, { "completion_length": 419.34599685668945, "epoch": 0.10087112427649914, "grad_norm": 0.12378358849828397, "kl": 0.0020246505737304688, "learning_rate": 9.75102669518619e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.060573313385248184, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 1294 }, { "completion_length": 423.52233505249023, "epoch": 0.10102703018728197, "grad_norm": 0.14023334180855737, "kl": 0.0020809173583984375, "learning_rate": 9.75026295322155e-07, "loss": 0.0001, "reward": 1.7812501043081284, "reward_std": 0.09070842061191797, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 1296 }, { "completion_length": 416.28796005249023, "epoch": 0.10118293609806482, "grad_norm": 0.10949214557410805, "kl": 0.0021886825561523438, "learning_rate": 9.74949807163845e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.06124731805175543, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 1298 }, { "completion_length": 425.8884086608887, "epoch": 0.10133884200884766, "grad_norm": 0.06944175775235284, "kl": 0.0020570755004882812, "learning_rate": 9.748732050620387e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.04111231118440628, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 1300 }, { "completion_length": 436.57591247558594, "epoch": 0.1014947479196305, "grad_norm": 0.1577730104842626, "kl": 0.002506256103515625, "learning_rate": 9.747964890351136e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.10160147584974766, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 1302 }, { "completion_length": 413.37724685668945, "epoch": 0.10165065383041334, "grad_norm": 0.11005542166081951, "kl": 0.002132415771484375, "learning_rate": 9.747196591014741e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.08372096251696348, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 1304 }, { "completion_length": 432.8348388671875, "epoch": 0.10180655974119619, "grad_norm": 0.13573373555056986, "kl": 0.002201080322265625, "learning_rate": 9.746427152795524e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.13038717675954103, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 1306 }, { "completion_length": 418.8058166503906, "epoch": 0.10196246565197903, "grad_norm": 0.12013514666604987, "kl": 0.0020170211791992188, "learning_rate": 9.745656575878078e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.08875562064349651, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 0.9977678656578064, "step": 1308 }, { "completion_length": 404.21207427978516, "epoch": 0.10211837156276188, "grad_norm": 0.00396326092770772, "kl": 0.0019216537475585938, "learning_rate": 9.744884860447268e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.03352006711065769, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 1310 }, { "completion_length": 416.9263572692871, "epoch": 0.10227427747354471, "grad_norm": 0.1032157685042744, "kl": 0.0021619796752929688, "learning_rate": 9.744112006688237e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.07951367553323507, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 1312 }, { "completion_length": 426.7009162902832, "epoch": 0.10243018338432756, "grad_norm": 0.10634777834502004, "kl": 0.002227783203125, "learning_rate": 9.743338014786393e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.052307059057056904, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 1314 }, { "completion_length": 418.8906478881836, "epoch": 0.1025860892951104, "grad_norm": 0.07693879353473133, "kl": 0.0020427703857421875, "learning_rate": 9.742562884927423e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.050202298909425735, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 1316 }, { "completion_length": 418.8660888671875, "epoch": 0.10274199520589325, "grad_norm": 0.1262472261639172, "kl": 0.002132415771484375, "learning_rate": 9.741786617297287e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.11693429667502642, "rewards/accuracy_reward": 0.7410714700818062, "rewards/format_reward": 1.0, "step": 1318 }, { "completion_length": 426.47769927978516, "epoch": 0.10289790111667609, "grad_norm": 0.13372997939548087, "kl": 0.0021905899047851562, "learning_rate": 9.741009212082216e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.10107943695038557, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 0.9977678656578064, "step": 1320 }, { "completion_length": 418.26341247558594, "epoch": 0.10305380702745894, "grad_norm": 0.11682609702261908, "kl": 0.0021562576293945312, "learning_rate": 9.740230669468716e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.05133136082440615, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 1322 }, { "completion_length": 427.3817138671875, "epoch": 0.10320971293824177, "grad_norm": 0.09165327829073551, "kl": 0.0020799636840820312, "learning_rate": 9.739450989643561e-07, "loss": 0.0001, "reward": 1.8437501043081284, "reward_std": 0.06319871358573437, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 1324 }, { "completion_length": 422.8058204650879, "epoch": 0.10336561884902461, "grad_norm": 0.12438340540766264, "kl": 0.0023870468139648438, "learning_rate": 9.738670172793803e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.07808151468634605, "rewards/accuracy_reward": 0.8482143357396126, "rewards/format_reward": 1.0, "step": 1326 }, { "completion_length": 422.9486770629883, "epoch": 0.10352152475980746, "grad_norm": 0.06473754619630717, "kl": 0.00193023681640625, "learning_rate": 9.737888219106766e-07, "loss": 0.0001, "reward": 1.8549107611179352, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 1328 }, { "completion_length": 424.76341247558594, "epoch": 0.10367743067059029, "grad_norm": 0.11331778032179231, "kl": 0.002429962158203125, "learning_rate": 9.73710512877004e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.08244217000901699, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 1330 }, { "completion_length": 420.2991256713867, "epoch": 0.10383333658137314, "grad_norm": 0.06212829048501028, "kl": 0.0021505355834960938, "learning_rate": 9.736320901971501e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.04824949987232685, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 1332 }, { "completion_length": 416.3906478881836, "epoch": 0.10398924249215598, "grad_norm": 0.08681290808944228, "kl": 0.002040863037109375, "learning_rate": 9.735535538899289e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.08296280819922686, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 1334 }, { "completion_length": 413.29019927978516, "epoch": 0.10414514840293883, "grad_norm": 0.13121635599461434, "kl": 0.0022449493408203125, "learning_rate": 9.734749039741813e-07, "loss": 0.0001, "reward": 1.705357238650322, "reward_std": 0.1173052079975605, "rewards/accuracy_reward": 0.7053571715950966, "rewards/format_reward": 1.0, "step": 1336 }, { "completion_length": 415.1808204650879, "epoch": 0.10430105431372166, "grad_norm": 0.13233741435166588, "kl": 0.0022335052490234375, "learning_rate": 9.733961404687762e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.11648287158459425, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 1338 }, { "completion_length": 410.9375190734863, "epoch": 0.10445696022450451, "grad_norm": 0.12255408873620653, "kl": 0.0020847320556640625, "learning_rate": 9.733172633926094e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.10400933586061001, "rewards/accuracy_reward": 0.734375037252903, "rewards/format_reward": 1.0, "step": 1340 }, { "completion_length": 424.5870704650879, "epoch": 0.10461286613528735, "grad_norm": 0.1328708624041645, "kl": 0.0024509429931640625, "learning_rate": 9.732382727646043e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.12407512031495571, "rewards/accuracy_reward": 0.7433036118745804, "rewards/format_reward": 0.9977678656578064, "step": 1342 }, { "completion_length": 419.7299270629883, "epoch": 0.1047687720460702, "grad_norm": 0.10420557766304052, "kl": 0.0021028518676757812, "learning_rate": 9.731591686037106e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.04907547030597925, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 1344 }, { "completion_length": 425.6919860839844, "epoch": 0.10492467795685304, "grad_norm": 0.1424186558238102, "kl": 0.0022726058959960938, "learning_rate": 9.730799509289067e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.12505081575363874, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 0.9977678656578064, "step": 1346 }, { "completion_length": 428.8192138671875, "epoch": 0.10508058386763588, "grad_norm": 0.1246762933730393, "kl": 0.0023183822631835938, "learning_rate": 9.730006197591968e-07, "loss": 0.0001, "reward": 1.7098215073347092, "reward_std": 0.09815093595534563, "rewards/accuracy_reward": 0.7098214477300644, "rewards/format_reward": 1.0, "step": 1348 }, { "completion_length": 414.5245780944824, "epoch": 0.10523648977841872, "grad_norm": 0.12785157061836852, "kl": 0.0021266937255859375, "learning_rate": 9.729211751136133e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.09830066747963428, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 1350 }, { "completion_length": 418.8370704650879, "epoch": 0.10539239568920157, "grad_norm": 0.11256330571686295, "kl": 0.0020999908447265625, "learning_rate": 9.728416170112153e-07, "loss": 0.0001, "reward": 1.7410715222358704, "reward_std": 0.0811633775010705, "rewards/accuracy_reward": 0.741071455180645, "rewards/format_reward": 1.0, "step": 1352 }, { "completion_length": 424.89064025878906, "epoch": 0.1055483015999844, "grad_norm": 0.08525776853787972, "kl": 0.0022678375244140625, "learning_rate": 9.727619454710894e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.0866494569927454, "rewards/accuracy_reward": 0.8705357611179352, "rewards/format_reward": 1.0, "step": 1354 }, { "completion_length": 411.63171768188477, "epoch": 0.10570420751076726, "grad_norm": 0.06893908352734236, "kl": 0.0018634796142578125, "learning_rate": 9.726821605123492e-07, "loss": 0.0001, "reward": 1.9017857909202576, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.9017857611179352, "rewards/format_reward": 1.0, "step": 1356 }, { "completion_length": 413.4308280944824, "epoch": 0.10586011342155009, "grad_norm": 0.12624930160180728, "kl": 0.0021123886108398438, "learning_rate": 9.726022621541356e-07, "loss": 0.0001, "reward": 1.832589328289032, "reward_std": 0.07027172483503819, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 1358 }, { "completion_length": 414.111629486084, "epoch": 0.10601601933233294, "grad_norm": 0.10647426371559134, "kl": 0.0022230148315429688, "learning_rate": 9.725222504156169e-07, "loss": 0.0001, "reward": 1.8794643729925156, "reward_std": 0.12141226045787334, "rewards/accuracy_reward": 0.8816964626312256, "rewards/format_reward": 0.9977678656578064, "step": 1360 }, { "completion_length": 424.31921768188477, "epoch": 0.10617192524311578, "grad_norm": 0.11669127206593455, "kl": 0.0022373199462890625, "learning_rate": 9.724421253159886e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.1004068311303854, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 1362 }, { "completion_length": 419.4419822692871, "epoch": 0.10632783115389863, "grad_norm": 0.03845947417366219, "kl": 0.0020308494567871094, "learning_rate": 9.723618868744728e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 1364 }, { "completion_length": 428.94868087768555, "epoch": 0.10648373706468146, "grad_norm": 0.12001424254507698, "kl": 0.0024585723876953125, "learning_rate": 9.722815351103193e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.08634776435792446, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 1366 }, { "completion_length": 429.9062690734863, "epoch": 0.10663964297546431, "grad_norm": 0.10978198672796602, "kl": 0.0023097991943359375, "learning_rate": 9.72201070042805e-07, "loss": 0.0001, "reward": 1.7455357760190964, "reward_std": 0.10250935889780521, "rewards/accuracy_reward": 0.7455357536673546, "rewards/format_reward": 1.0, "step": 1368 }, { "completion_length": 412.81697845458984, "epoch": 0.10679554888624715, "grad_norm": 0.1170470197963156, "kl": 0.0019330978393554688, "learning_rate": 9.721204916912345e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.09244367573410273, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 1370 }, { "completion_length": 414.97993087768555, "epoch": 0.10695145479703, "grad_norm": 0.07433520955253757, "kl": 0.0019979476928710938, "learning_rate": 9.720398000749384e-07, "loss": 0.0001, "reward": 1.7455358058214188, "reward_std": 0.06154900882393122, "rewards/accuracy_reward": 0.7455357573926449, "rewards/format_reward": 1.0, "step": 1372 }, { "completion_length": 415.3393020629883, "epoch": 0.10710736070781283, "grad_norm": 0.12435484873579118, "kl": 0.0024309158325195312, "learning_rate": 9.719589952132753e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.09040392655879259, "rewards/accuracy_reward": 0.8125000223517418, "rewards/format_reward": 1.0, "step": 1374 }, { "completion_length": 417.18528747558594, "epoch": 0.10726326661859567, "grad_norm": 0.10143394917776098, "kl": 0.002437591552734375, "learning_rate": 9.71878077125631e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.058317420072853565, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 1376 }, { "completion_length": 415.92858505249023, "epoch": 0.10741917252937852, "grad_norm": 0.12829482419353222, "kl": 0.0022983551025390625, "learning_rate": 9.71797045831418e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.1342941727489233, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 1378 }, { "completion_length": 424.5736770629883, "epoch": 0.10757507844016136, "grad_norm": 0.10645620207311454, "kl": 0.002223968505859375, "learning_rate": 9.717159013500765e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.08942962903529406, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 1380 }, { "completion_length": 416.7187614440918, "epoch": 0.1077309843509442, "grad_norm": 0.09733230620547285, "kl": 0.0020017623901367188, "learning_rate": 9.716346437010734e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.07274375949054956, "rewards/accuracy_reward": 0.8035714775323868, "rewards/format_reward": 1.0, "step": 1382 }, { "completion_length": 413.80359268188477, "epoch": 0.10788689026172704, "grad_norm": 0.1005643557067187, "kl": 0.002109527587890625, "learning_rate": 9.715532729039028e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.054413226433098316, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 1384 }, { "completion_length": 425.39287185668945, "epoch": 0.10804279617250989, "grad_norm": 0.0896454905140804, "kl": 0.0022916793823242188, "learning_rate": 9.714717889780866e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.07417591754347086, "rewards/accuracy_reward": 0.8459821864962578, "rewards/format_reward": 0.9977678656578064, "step": 1386 }, { "completion_length": 414.5558166503906, "epoch": 0.10819870208329273, "grad_norm": 0.12733045516016137, "kl": 0.00222015380859375, "learning_rate": 9.713901919431727e-07, "loss": 0.0001, "reward": 1.7500001043081284, "reward_std": 0.07823487929999828, "rewards/accuracy_reward": 0.7500000186264515, "rewards/format_reward": 1.0, "step": 1388 }, { "completion_length": 418.5134162902832, "epoch": 0.10835460799407558, "grad_norm": 0.04623601936618194, "kl": 0.0020647048950195312, "learning_rate": 9.71308481818737e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.024124749936163425, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 1390 }, { "completion_length": 417.5692138671875, "epoch": 0.10851051390485841, "grad_norm": 0.121870496091172, "kl": 0.0020723342895507812, "learning_rate": 9.712266586243826e-07, "loss": 0.0001, "reward": 1.845982238650322, "reward_std": 0.08845392893999815, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 0.9977678656578064, "step": 1392 }, { "completion_length": 425.4285888671875, "epoch": 0.10866641981564126, "grad_norm": 0.10403815679930113, "kl": 0.00220489501953125, "learning_rate": 9.71144722379739e-07, "loss": 0.0001, "reward": 1.883928656578064, "reward_std": 0.10498503223061562, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 0.9977678656578064, "step": 1394 }, { "completion_length": 411.8861770629883, "epoch": 0.1088223257264241, "grad_norm": 0.12585817355430715, "kl": 0.0020771026611328125, "learning_rate": 9.71062673104463e-07, "loss": 0.0001, "reward": 1.87053582072258, "reward_std": 0.09656541422009468, "rewards/accuracy_reward": 0.8705357536673546, "rewards/format_reward": 1.0, "step": 1396 }, { "completion_length": 410.1428756713867, "epoch": 0.10897823163720695, "grad_norm": 0.11648795305279037, "kl": 0.0020580291748046875, "learning_rate": 9.709805108182395e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.1021384485065937, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 1398 }, { "completion_length": 417.4442138671875, "epoch": 0.10913413754798978, "grad_norm": 0.10858485983500032, "kl": 0.002338409423828125, "learning_rate": 9.70898235540779e-07, "loss": 0.0001, "reward": 1.712053656578064, "reward_std": 0.08033600449562073, "rewards/accuracy_reward": 0.7120535969734192, "rewards/format_reward": 1.0, "step": 1400 }, { "completion_length": 426.99332427978516, "epoch": 0.10929004345877263, "grad_norm": 0.14972245706829454, "kl": 0.0022640228271484375, "learning_rate": 9.708158472918205e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.10303363855928183, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 1402 }, { "completion_length": 415.5803756713867, "epoch": 0.10944594936955547, "grad_norm": 0.1262512586999603, "kl": 0.0029201507568359375, "learning_rate": 9.70733346091129e-07, "loss": 0.0001, "reward": 1.8281251043081284, "reward_std": 0.11257728096097708, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 1404 }, { "completion_length": 417.6138572692871, "epoch": 0.10960185528033832, "grad_norm": 0.0886332729977003, "kl": 0.0021505355834960938, "learning_rate": 9.706507319584974e-07, "loss": 0.0001, "reward": 1.832589328289032, "reward_std": 0.0803360054269433, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 1406 }, { "completion_length": 418.90403747558594, "epoch": 0.10975776119112116, "grad_norm": 0.13192043128162126, "kl": 0.0023679733276367188, "learning_rate": 9.70568004913745e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.1438378170132637, "rewards/accuracy_reward": 0.7142857387661934, "rewards/format_reward": 1.0, "step": 1408 }, { "completion_length": 407.52903747558594, "epoch": 0.109913667101904, "grad_norm": 0.09107843588314542, "kl": 0.00201416015625, "learning_rate": 9.70485164976719e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 1410 }, { "completion_length": 419.1607360839844, "epoch": 0.11006957301268684, "grad_norm": 0.11612584238530678, "kl": 0.00252532958984375, "learning_rate": 9.704022121672929e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.05441322457045317, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 1412 }, { "completion_length": 435.2634162902832, "epoch": 0.11022547892346969, "grad_norm": 0.1553024404257858, "kl": 0.002719879150390625, "learning_rate": 9.703191465053674e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.14954508189111948, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 1414 }, { "completion_length": 423.8125228881836, "epoch": 0.11038138483425253, "grad_norm": 0.11339694009502108, "kl": 0.00241851806640625, "learning_rate": 9.702359680108711e-07, "loss": 0.0001, "reward": 1.7299107760190964, "reward_std": 0.10949541628360748, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 1.0, "step": 1416 }, { "completion_length": 413.45314025878906, "epoch": 0.11053729074503538, "grad_norm": 0.08569064480921702, "kl": 0.002498626708984375, "learning_rate": 9.701526767037585e-07, "loss": 0.0001, "reward": 1.6718751043081284, "reward_std": 0.084394964389503, "rewards/accuracy_reward": 0.6741071715950966, "rewards/format_reward": 0.9977678656578064, "step": 1418 }, { "completion_length": 411.0111770629883, "epoch": 0.11069319665581821, "grad_norm": 0.08465614089389821, "kl": 0.0020456314086914062, "learning_rate": 9.70069272604012e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.053284160792827606, "rewards/accuracy_reward": 0.863839328289032, "rewards/format_reward": 1.0, "step": 1420 }, { "completion_length": 400.1406478881836, "epoch": 0.11084910256660106, "grad_norm": 0.14077499879897692, "kl": 0.0020341873168945312, "learning_rate": 9.699857557316407e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.06883956212550402, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 1422 }, { "completion_length": 413.1919822692871, "epoch": 0.1110050084773839, "grad_norm": 0.10797046758082884, "kl": 0.0022439956665039062, "learning_rate": 9.69902126106681e-07, "loss": 0.0001, "reward": 1.752232238650322, "reward_std": 0.0636551734060049, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 1.0, "step": 1424 }, { "completion_length": 440.47099685668945, "epoch": 0.11116091438816675, "grad_norm": 0.10860118756957896, "kl": 0.0025043487548828125, "learning_rate": 9.698183837491959e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.0771058164536953, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 1426 }, { "completion_length": 422.69644927978516, "epoch": 0.11131682029894958, "grad_norm": 0.13633908018394972, "kl": 0.0024690628051757812, "learning_rate": 9.69734528679276e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.10400933399796486, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 1428 }, { "completion_length": 414.82814025878906, "epoch": 0.11147272620973242, "grad_norm": 0.11388796560998751, "kl": 0.0024023056030273438, "learning_rate": 9.696505609170381e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.08702037110924721, "rewards/accuracy_reward": 0.8303571715950966, "rewards/format_reward": 1.0, "step": 1430 }, { "completion_length": 419.95537185668945, "epoch": 0.11162863212051527, "grad_norm": 0.11077705890795288, "kl": 0.00228118896484375, "learning_rate": 9.695664804826275e-07, "loss": 0.0001, "reward": 1.8950893580913544, "reward_std": 0.07027172110974789, "rewards/accuracy_reward": 0.8950893357396126, "rewards/format_reward": 1.0, "step": 1432 }, { "completion_length": 423.0781440734863, "epoch": 0.1117845380312981, "grad_norm": 0.12394384092251191, "kl": 0.002178192138671875, "learning_rate": 9.69482287396215e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.08439356740564108, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 1434 }, { "completion_length": 421.3326072692871, "epoch": 0.11194044394208096, "grad_norm": 0.08953559944000972, "kl": 0.002544403076171875, "learning_rate": 9.693979816779992e-07, "loss": 0.0001, "reward": 1.9084822088479996, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.9084821715950966, "rewards/format_reward": 1.0, "step": 1436 }, { "completion_length": 411.6651954650879, "epoch": 0.11209634985286379, "grad_norm": 0.0944021908102636, "kl": 0.0021343231201171875, "learning_rate": 9.693135633482057e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.0834178663790226, "rewards/accuracy_reward": 0.8861607387661934, "rewards/format_reward": 1.0, "step": 1438 }, { "completion_length": 417.5558280944824, "epoch": 0.11225225576364664, "grad_norm": 0.07468745060290617, "kl": 0.0022735595703125, "learning_rate": 9.692290324270868e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.05553865060210228, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 0.9977678656578064, "step": 1440 }, { "completion_length": 413.3415336608887, "epoch": 0.11240816167442948, "grad_norm": 0.10682824640419358, "kl": 0.0022325515747070312, "learning_rate": 9.691443889349222e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.0803374070674181, "rewards/accuracy_reward": 0.8883928880095482, "rewards/format_reward": 1.0, "step": 1442 }, { "completion_length": 420.9375190734863, "epoch": 0.11256406758521233, "grad_norm": 0.07927503776297123, "kl": 0.0026063919067382812, "learning_rate": 9.690596328920183e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.0746965566650033, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 1444 }, { "completion_length": 430.6183280944824, "epoch": 0.11271997349599516, "grad_norm": 0.0861266364003358, "kl": 0.0027456283569335938, "learning_rate": 9.689747643187083e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.11047251615673304, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 1446 }, { "completion_length": 413.1562690734863, "epoch": 0.11287587940677801, "grad_norm": 0.11461311764253916, "kl": 0.0021982192993164062, "learning_rate": 9.688897832353534e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.06319871451705694, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 1448 }, { "completion_length": 418.6138572692871, "epoch": 0.11303178531756085, "grad_norm": 0.07915317029770085, "kl": 0.002468109130859375, "learning_rate": 9.688046896623407e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.0810100082308054, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 0.9955357164144516, "step": 1450 }, { "completion_length": 431.78796768188477, "epoch": 0.1131876912283437, "grad_norm": 0.13622440878507247, "kl": 0.0026350021362304688, "learning_rate": 9.687194836200849e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.08911176305264235, "rewards/accuracy_reward": 0.7477679029107094, "rewards/format_reward": 0.9955357313156128, "step": 1452 }, { "completion_length": 415.89733505249023, "epoch": 0.11334359713912653, "grad_norm": 0.11334312568764947, "kl": 0.0023450851440429688, "learning_rate": 9.686341651290271e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.0746965566650033, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 1454 }, { "completion_length": 405.65403747558594, "epoch": 0.11349950304990938, "grad_norm": 0.13852007376017, "kl": 0.0025453567504882812, "learning_rate": 9.68548734209636e-07, "loss": 0.0001, "reward": 1.7388393729925156, "reward_std": 0.10611185897141695, "rewards/accuracy_reward": 0.7388393208384514, "rewards/format_reward": 1.0, "step": 1456 }, { "completion_length": 417.7611770629883, "epoch": 0.11365540896069222, "grad_norm": 0.1273827735871494, "kl": 0.0025539398193359375, "learning_rate": 9.68463190882407e-07, "loss": 0.0001, "reward": 1.7656250596046448, "reward_std": 0.09723941888660192, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 1458 }, { "completion_length": 419.18974685668945, "epoch": 0.11381131487147507, "grad_norm": 0.0885903862377782, "kl": 0.0024890899658203125, "learning_rate": 9.683775351678627e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.06365517526865005, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 1460 }, { "completion_length": 430.846004486084, "epoch": 0.1139672207822579, "grad_norm": 0.11381275586443923, "kl": 0.0024538040161132812, "learning_rate": 9.68291767086552e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.09430952090770006, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 1462 }, { "completion_length": 417.78796768188477, "epoch": 0.11412312669304076, "grad_norm": 0.1182233633441393, "kl": 0.0023374557495117188, "learning_rate": 9.682058866590516e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.10611045733094215, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 0.9977678656578064, "step": 1464 }, { "completion_length": 404.56921768188477, "epoch": 0.11427903260382359, "grad_norm": 0.11285222063159919, "kl": 0.0021963119506835938, "learning_rate": 9.681198939059646e-07, "loss": 0.0001, "reward": 1.8526786714792252, "reward_std": 0.06252610962837934, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 1466 }, { "completion_length": 417.4821586608887, "epoch": 0.11443493851460644, "grad_norm": 0.0989250301419496, "kl": 0.0023479461669921875, "learning_rate": 9.680337888479211e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 1468 }, { "completion_length": 422.3973388671875, "epoch": 0.11459084442538928, "grad_norm": 0.13839845529199366, "kl": 0.0022830963134765625, "learning_rate": 9.679475715055785e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.10611045826226473, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 1470 }, { "completion_length": 422.8727836608887, "epoch": 0.11474675033617213, "grad_norm": 0.1258718099091561, "kl": 0.0026397705078125, "learning_rate": 9.678612418996208e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.07222311943769455, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 1472 }, { "completion_length": 404.3571586608887, "epoch": 0.11490265624695496, "grad_norm": 0.10050737693484603, "kl": 0.002193450927734375, "learning_rate": 9.67774800050759e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.033063605427742004, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 1474 }, { "completion_length": 402.314754486084, "epoch": 0.11505856215773781, "grad_norm": 0.11363324752508662, "kl": 0.0023527145385742188, "learning_rate": 9.676882459797311e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.04764331039041281, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 1476 }, { "completion_length": 419.5044822692871, "epoch": 0.11521446806852065, "grad_norm": 0.11106259376614401, "kl": 0.002239227294921875, "learning_rate": 9.67601579707302e-07, "loss": 0.0001, "reward": 1.7589286863803864, "reward_std": 0.10887585394084454, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 0.9977678656578064, "step": 1478 }, { "completion_length": 409.439754486084, "epoch": 0.11537037397930348, "grad_norm": 0.1075111173256123, "kl": 0.002216339111328125, "learning_rate": 9.67514801254263e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.07432564627379179, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 1480 }, { "completion_length": 414.33260345458984, "epoch": 0.11552627989008633, "grad_norm": 0.12394426152621993, "kl": 0.0027008056640625, "learning_rate": 9.674279106414336e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.08845393173396587, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 1482 }, { "completion_length": 413.4464454650879, "epoch": 0.11568218580086917, "grad_norm": 0.07540262853548407, "kl": 0.0021829605102539062, "learning_rate": 9.67340907889659e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.08229020424187183, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 1484 }, { "completion_length": 415.1428756713867, "epoch": 0.11583809171165202, "grad_norm": 0.07248756610077982, "kl": 0.0022306442260742188, "learning_rate": 9.672537930198117e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.0359921008348465, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 1486 }, { "completion_length": 424.45314025878906, "epoch": 0.11599399762243486, "grad_norm": 0.1099174013786018, "kl": 0.0023365020751953125, "learning_rate": 9.671665660527912e-07, "loss": 0.0001, "reward": 1.7790179699659348, "reward_std": 0.08747682813555002, "rewards/accuracy_reward": 0.7790178805589676, "rewards/format_reward": 1.0, "step": 1488 }, { "completion_length": 398.89956283569336, "epoch": 0.1161499035332177, "grad_norm": 0.08521299035756225, "kl": 0.0021495819091796875, "learning_rate": 9.670792270095236e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.058317420072853565, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 1490 }, { "completion_length": 418.99332427978516, "epoch": 0.11630580944400054, "grad_norm": 0.14264602646712707, "kl": 0.00244140625, "learning_rate": 9.669917759109625e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.06996862590312958, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 1492 }, { "completion_length": 446.3750228881836, "epoch": 0.11646171535478339, "grad_norm": 0.07825421547517485, "kl": 0.0024309158325195312, "learning_rate": 9.669042127780877e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.07139714993536472, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 1494 }, { "completion_length": 403.09822845458984, "epoch": 0.11661762126556623, "grad_norm": 0.08994211340197757, "kl": 0.0020084381103515625, "learning_rate": 9.668165376319062e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.043282654136419296, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 1496 }, { "completion_length": 407.02457427978516, "epoch": 0.11677352717634908, "grad_norm": 0.0803380062691819, "kl": 0.0020875930786132812, "learning_rate": 9.667287504934518e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.0698152594268322, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 1498 }, { "completion_length": 408.3259086608887, "epoch": 0.11692943308713191, "grad_norm": 0.1091871553433136, "kl": 0.002246856689453125, "learning_rate": 9.666408513837854e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.0907084196805954, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 1500 }, { "completion_length": 418.4955596923828, "epoch": 0.11708533899791476, "grad_norm": 0.09421583456977875, "kl": 0.0023145675659179688, "learning_rate": 9.665528403239941e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.058620513416826725, "rewards/accuracy_reward": 0.8013393357396126, "rewards/format_reward": 0.9977678656578064, "step": 1502 }, { "completion_length": 418.2879638671875, "epoch": 0.1172412449086976, "grad_norm": 0.08653453782100357, "kl": 0.0022268295288085938, "learning_rate": 9.66464717335193e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.0660630315542221, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 1504 }, { "completion_length": 419.761173248291, "epoch": 0.11739715081948045, "grad_norm": 0.05899906269233587, "kl": 0.002254486083984375, "learning_rate": 9.663764824385227e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 1506 }, { "completion_length": 417.8370704650879, "epoch": 0.11755305673026328, "grad_norm": 0.09245109841488089, "kl": 0.0024309158325195312, "learning_rate": 9.662881356551516e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.08311477210372686, "rewards/accuracy_reward": 0.8325893059372902, "rewards/format_reward": 0.9977678656578064, "step": 1508 }, { "completion_length": 419.377254486084, "epoch": 0.11770896264104613, "grad_norm": 0.09558748598332363, "kl": 0.0024547576904296875, "learning_rate": 9.661996770062748e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.06590966414660215, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 1510 }, { "completion_length": 414.72099685668945, "epoch": 0.11786486855182897, "grad_norm": 0.10050564326141385, "kl": 0.0022115707397460938, "learning_rate": 9.661111065131137e-07, "loss": 0.0001, "reward": 1.87276791036129, "reward_std": 0.04404080659151077, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 1512 }, { "completion_length": 431.1495704650879, "epoch": 0.11802077446261182, "grad_norm": 0.14983332307232056, "kl": 0.0024671554565429688, "learning_rate": 9.660224241969176e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.09799757227301598, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 1514 }, { "completion_length": 420.3326072692871, "epoch": 0.11817668037339465, "grad_norm": 0.09687557013017438, "kl": 0.0022935867309570312, "learning_rate": 9.659336300789612e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.07447901461273432, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 1516 }, { "completion_length": 419.4754638671875, "epoch": 0.1183325862841775, "grad_norm": 0.11840611921270468, "kl": 0.002201080322265625, "learning_rate": 9.65844724180547e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.07244066335260868, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 1518 }, { "completion_length": 412.04019927978516, "epoch": 0.11848849219496034, "grad_norm": 0.09373163495574977, "kl": 0.0032320022583007812, "learning_rate": 9.657557065230045e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.07808151375502348, "rewards/accuracy_reward": 0.7321429029107094, "rewards/format_reward": 1.0, "step": 1520 }, { "completion_length": 417.5134162902832, "epoch": 0.11864439810574319, "grad_norm": 0.12928073994480557, "kl": 0.0023260116577148438, "learning_rate": 9.65666577127689e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.07680131774395704, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 1522 }, { "completion_length": 430.6518096923828, "epoch": 0.11880030401652603, "grad_norm": 0.15308800106692122, "kl": 0.0025300979614257812, "learning_rate": 9.655773360159838e-07, "loss": 0.0001, "reward": 1.8303572535514832, "reward_std": 0.11498290114104748, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 1524 }, { "completion_length": 419.58707427978516, "epoch": 0.11895620992730888, "grad_norm": 0.08658842505872497, "kl": 0.00228118896484375, "learning_rate": 9.654879832092979e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.09363691788166761, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 1526 }, { "completion_length": 417.2901954650879, "epoch": 0.11911211583809171, "grad_norm": 0.11727978173937738, "kl": 0.0023794174194335938, "learning_rate": 9.653985187290682e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.0652370611205697, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 1528 }, { "completion_length": 421.8482322692871, "epoch": 0.11926802174887455, "grad_norm": 0.12562629590312252, "kl": 0.002300262451171875, "learning_rate": 9.65308942596757e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.07643181178718805, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 1530 }, { "completion_length": 426.7477912902832, "epoch": 0.1194239276596574, "grad_norm": 0.11164218410441715, "kl": 0.0024766921997070312, "learning_rate": 9.652192548338549e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.10528672207146883, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 1532 }, { "completion_length": 406.5580520629883, "epoch": 0.11957983357044023, "grad_norm": 0.12751857975612277, "kl": 0.0021848678588867188, "learning_rate": 9.651294554618783e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.06996862683445215, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 1534 }, { "completion_length": 428.4062728881836, "epoch": 0.11973573948122308, "grad_norm": 0.0870205020089922, "kl": 0.002414703369140625, "learning_rate": 9.650395445023705e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.0728971241042018, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 1536 }, { "completion_length": 427.7388610839844, "epoch": 0.11989164539200592, "grad_norm": 0.10705500609505612, "kl": 0.0024366378784179688, "learning_rate": 9.64949521976902e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.10235963016748428, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 1538 }, { "completion_length": 428.82591247558594, "epoch": 0.12004755130278877, "grad_norm": 0.15740504563499358, "kl": 0.0022401809692382812, "learning_rate": 9.648593879070694e-07, "loss": 0.0001, "reward": 1.9084822237491608, "reward_std": 0.08417742047458887, "rewards/accuracy_reward": 0.9084821715950966, "rewards/format_reward": 1.0, "step": 1540 }, { "completion_length": 425.17635345458984, "epoch": 0.1202034572135716, "grad_norm": 0.0921686651479002, "kl": 0.00262451171875, "learning_rate": 9.647691423144967e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.0946126151829958, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 0.9977678656578064, "step": 1542 }, { "completion_length": 438.439754486084, "epoch": 0.12035936312435445, "grad_norm": 0.12899588900460549, "kl": 0.002349853515625, "learning_rate": 9.646787852208347e-07, "loss": 0.0001, "reward": 1.877232238650322, "reward_std": 0.07838460523635149, "rewards/accuracy_reward": 0.8772321939468384, "rewards/format_reward": 1.0, "step": 1544 }, { "completion_length": 426.9687690734863, "epoch": 0.12051526903513729, "grad_norm": 0.11122088937441633, "kl": 0.003108978271484375, "learning_rate": 9.6458831664776e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.1229446530342102, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 1546 }, { "completion_length": 432.34599685668945, "epoch": 0.12067117494592014, "grad_norm": 0.11843991545836506, "kl": 0.0025272369384765625, "learning_rate": 9.644977366169772e-07, "loss": 0.0001, "reward": 1.7455358058214188, "reward_std": 0.10333532933145761, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 0.993303582072258, "step": 1548 }, { "completion_length": 426.12278747558594, "epoch": 0.12082708085670298, "grad_norm": 0.1654399230130512, "kl": 0.00257110595703125, "learning_rate": 9.644070451502165e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.12444603443145752, "rewards/accuracy_reward": 0.7455357611179352, "rewards/format_reward": 1.0, "step": 1550 }, { "completion_length": 423.35939025878906, "epoch": 0.12098298676748583, "grad_norm": 0.1621280570114556, "kl": 0.0026960372924804688, "learning_rate": 9.643162422692356e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.13121090456843376, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 1552 }, { "completion_length": 433.1629638671875, "epoch": 0.12113889267826866, "grad_norm": 0.0950774644745986, "kl": 0.0034313201904296875, "learning_rate": 9.642253279958188e-07, "loss": 0.0001, "reward": 1.875000074505806, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.8750000223517418, "rewards/format_reward": 1.0, "step": 1554 }, { "completion_length": 433.22769927978516, "epoch": 0.12129479858905151, "grad_norm": 0.11453498466117139, "kl": 0.002613067626953125, "learning_rate": 9.641343023517767e-07, "loss": 0.0001, "reward": 1.7366072237491608, "reward_std": 0.08116337656974792, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 1.0, "step": 1556 }, { "completion_length": 425.19868087768555, "epoch": 0.12145070449983435, "grad_norm": 0.1312559711930456, "kl": 0.002506256103515625, "learning_rate": 9.640431653589474e-07, "loss": 0.0001, "reward": 1.8348214775323868, "reward_std": 0.07515301834791899, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 1.0, "step": 1558 }, { "completion_length": 413.69868087768555, "epoch": 0.1216066104106172, "grad_norm": 0.08240748355788613, "kl": 0.0022077560424804688, "learning_rate": 9.639519170391947e-07, "loss": 0.0001, "reward": 1.8370536267757416, "reward_std": 0.04449363239109516, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 1.0, "step": 1560 }, { "completion_length": 425.79243087768555, "epoch": 0.12176251632140003, "grad_norm": 0.12986346274524388, "kl": 0.0025730133056640625, "learning_rate": 9.6386055741441e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.12340111564844847, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 1562 }, { "completion_length": 432.8169860839844, "epoch": 0.12191842223218288, "grad_norm": 0.11164635218501892, "kl": 0.00261688232421875, "learning_rate": 9.637690865065112e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.07981676701456308, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 0.9977678656578064, "step": 1564 }, { "completion_length": 423.39510345458984, "epoch": 0.12207432814296572, "grad_norm": 0.10347037220765207, "kl": 0.0024404525756835938, "learning_rate": 9.636775043374424e-07, "loss": 0.0001, "reward": 1.8526786267757416, "reward_std": 0.07808151189237833, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 1566 }, { "completion_length": 410.6317138671875, "epoch": 0.12223023405374857, "grad_norm": 0.1224415107491782, "kl": 0.002407073974609375, "learning_rate": 9.63585810929175e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.09040532540529966, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 1568 }, { "completion_length": 405.85493087768555, "epoch": 0.1223861399645314, "grad_norm": 0.16172636460437248, "kl": 0.002986907958984375, "learning_rate": 9.634940063037067e-07, "loss": 0.0001, "reward": 1.8504464775323868, "reward_std": 0.10055655986070633, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 1570 }, { "completion_length": 408.5692138671875, "epoch": 0.12254204587531425, "grad_norm": 0.2737064030864787, "kl": 0.0024700164794921875, "learning_rate": 9.63402090483062e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.08439496718347073, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 1572 }, { "completion_length": 415.62278747558594, "epoch": 0.12269795178609709, "grad_norm": 0.128902847629927, "kl": 0.0027742385864257812, "learning_rate": 9.633100634892921e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.0843949681147933, "rewards/accuracy_reward": 0.7745535969734192, "rewards/format_reward": 1.0, "step": 1574 }, { "completion_length": 416.05135345458984, "epoch": 0.12285385769687994, "grad_norm": 0.12071653058568178, "kl": 0.0022096633911132812, "learning_rate": 9.63217925344475e-07, "loss": 0.0001, "reward": 1.8013393431901932, "reward_std": 0.06643030606210232, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 1576 }, { "completion_length": 409.3393020629883, "epoch": 0.12300976360766278, "grad_norm": 0.07970897432290218, "kl": 0.0022935867309570312, "learning_rate": 9.63125676070715e-07, "loss": 0.0001, "reward": 1.8839286118745804, "reward_std": 0.04599360842257738, "rewards/accuracy_reward": 0.8839285895228386, "rewards/format_reward": 1.0, "step": 1578 }, { "completion_length": 410.0826072692871, "epoch": 0.12316566951844561, "grad_norm": 0.10961464320609303, "kl": 0.0023555755615234375, "learning_rate": 9.630333156901438e-07, "loss": 0.0001, "reward": 1.81026791036129, "reward_std": 0.07371945679187775, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 1580 }, { "completion_length": 410.1808204650879, "epoch": 0.12332157542922846, "grad_norm": 0.11866342787796343, "kl": 0.0023794174194335938, "learning_rate": 9.629408442249184e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.09265981893986464, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 1.0, "step": 1582 }, { "completion_length": 411.6741256713867, "epoch": 0.1234774813400113, "grad_norm": 0.12539834302905895, "kl": 0.0024633407592773438, "learning_rate": 9.628482616972241e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.11084202397614717, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 0.9977678656578064, "step": 1584 }, { "completion_length": 417.52457427978516, "epoch": 0.12363338725079415, "grad_norm": 0.09980506966062642, "kl": 0.0023822784423828125, "learning_rate": 9.627555681292715e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.07124742120504379, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 1586 }, { "completion_length": 420.31474685668945, "epoch": 0.12378929316157698, "grad_norm": 0.06543769236666243, "kl": 0.0021800994873046875, "learning_rate": 9.626627635432988e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 1.0, "step": 1588 }, { "completion_length": 435.63618087768555, "epoch": 0.12394519907235983, "grad_norm": 0.11628206787409409, "kl": 0.0027370452880859375, "learning_rate": 9.625698479615703e-07, "loss": 0.0001, "reward": 1.7098215073347092, "reward_std": 0.11385243106633425, "rewards/accuracy_reward": 0.7098214626312256, "rewards/format_reward": 1.0, "step": 1590 }, { "completion_length": 427.2433204650879, "epoch": 0.12410110498314267, "grad_norm": 0.09002781722448894, "kl": 0.0025491714477539062, "learning_rate": 9.624768214063767e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.0665836725383997, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 1592 }, { "completion_length": 425.4531478881836, "epoch": 0.12425701089392552, "grad_norm": 0.14176866272296107, "kl": 0.0023508071899414062, "learning_rate": 9.62383683900036e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.1245943596586585, "rewards/accuracy_reward": 0.8169643357396126, "rewards/format_reward": 1.0, "step": 1594 }, { "completion_length": 405.5825996398926, "epoch": 0.12441291680470835, "grad_norm": 0.07084774765352649, "kl": 0.00244140625, "learning_rate": 9.622904354648924e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.038704453967511654, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 1596 }, { "completion_length": 422.4910888671875, "epoch": 0.1245688227154912, "grad_norm": 0.08665889206069959, "kl": 0.0024261474609375, "learning_rate": 9.621970761233166e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.08942962624132633, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 1598 }, { "completion_length": 414.82591247558594, "epoch": 0.12472472862627404, "grad_norm": 0.10039231383863943, "kl": 0.0022497177124023438, "learning_rate": 9.621036058977066e-07, "loss": 0.0001, "reward": 1.9129464626312256, "reward_std": 0.05102826841175556, "rewards/accuracy_reward": 0.9129464626312256, "rewards/format_reward": 1.0, "step": 1600 }, { "completion_length": 419.6183204650879, "epoch": 0.12488063453705689, "grad_norm": 0.10566705626320581, "kl": 0.0023984909057617188, "learning_rate": 9.62010024810486e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.0787555193528533, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 1602 }, { "completion_length": 419.2812614440918, "epoch": 0.12503654044783974, "grad_norm": 0.09052987710657269, "kl": 0.0024518966674804688, "learning_rate": 9.619163328841057e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 1604 }, { "completion_length": 405.69421768188477, "epoch": 0.12519244635862256, "grad_norm": 0.12690203867902672, "kl": 0.002407073974609375, "learning_rate": 9.61822530141043e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.06560797244310379, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 1606 }, { "completion_length": 418.0357322692871, "epoch": 0.1253483522694054, "grad_norm": 0.11376176408828176, "kl": 0.0025529861450195312, "learning_rate": 9.61728616603802e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.08829916082322598, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 1608 }, { "completion_length": 412.66296005249023, "epoch": 0.12550425818018826, "grad_norm": 0.11224932273059224, "kl": 0.0021677017211914062, "learning_rate": 9.616345922949129e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 1610 }, { "completion_length": 416.39064025878906, "epoch": 0.1256601640909711, "grad_norm": 0.10255779311186529, "kl": 0.0024156570434570312, "learning_rate": 9.615404572369326e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.08537066448479891, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 1612 }, { "completion_length": 413.5178756713867, "epoch": 0.12581607000175393, "grad_norm": 0.11786009795490214, "kl": 0.002368927001953125, "learning_rate": 9.614462114524448e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8638393431901932, "rewards/format_reward": 1.0, "step": 1614 }, { "completion_length": 439.1227836608887, "epoch": 0.12597197591253678, "grad_norm": 0.12092349802476467, "kl": 0.0026559829711914062, "learning_rate": 9.6135185496406e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.0764318099245429, "rewards/accuracy_reward": 0.7544643133878708, "rewards/format_reward": 1.0, "step": 1616 }, { "completion_length": 423.252254486084, "epoch": 0.12612788182331963, "grad_norm": 0.13125889362967672, "kl": 0.0026369094848632812, "learning_rate": 9.612573877944145e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.08018767926841974, "rewards/accuracy_reward": 0.8058035895228386, "rewards/format_reward": 1.0, "step": 1618 }, { "completion_length": 419.3259086608887, "epoch": 0.12628378773410248, "grad_norm": 0.12026360500413677, "kl": 0.0023193359375, "learning_rate": 9.611628099661718e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.09116488322615623, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 0.9977678656578064, "step": 1620 }, { "completion_length": 424.51564025878906, "epoch": 0.1264396936448853, "grad_norm": 0.11828677478788967, "kl": 0.0025005340576171875, "learning_rate": 9.610681215020215e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.062005470506846905, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 1622 }, { "completion_length": 419.1473388671875, "epoch": 0.12659559955566815, "grad_norm": 0.09512520317089825, "kl": 0.00246429443359375, "learning_rate": 9.609733224246803e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 1624 }, { "completion_length": 413.1718940734863, "epoch": 0.126751505466451, "grad_norm": 0.12072984377156178, "kl": 0.0024747848510742188, "learning_rate": 9.608784127568907e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.07936170976608992, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 1626 }, { "completion_length": 426.0334930419922, "epoch": 0.12690741137723385, "grad_norm": 0.0911703589324413, "kl": 0.002246856689453125, "learning_rate": 9.607833925214226e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.05816405080258846, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 1628 }, { "completion_length": 423.56028747558594, "epoch": 0.12706331728801668, "grad_norm": 0.13193602708202595, "kl": 0.0027704238891601562, "learning_rate": 9.606882617410717e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.10724092554301023, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 1630 }, { "completion_length": 414.4419860839844, "epoch": 0.12721922319879952, "grad_norm": 0.09971772567322126, "kl": 0.0022897720336914062, "learning_rate": 9.605930204386606e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8861607313156128, "rewards/format_reward": 1.0, "step": 1632 }, { "completion_length": 418.78349685668945, "epoch": 0.12737512910958237, "grad_norm": 0.14773871839029706, "kl": 0.00232696533203125, "learning_rate": 9.60497668637038e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.0955897169187665, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 1634 }, { "completion_length": 411.5602836608887, "epoch": 0.1275310350203652, "grad_norm": 0.11356150105255453, "kl": 0.002418041229248047, "learning_rate": 9.604022063590798e-07, "loss": 0.0001, "reward": 1.9151786267757416, "reward_std": 0.07658517733216286, "rewards/accuracy_reward": 0.9151786044239998, "rewards/format_reward": 1.0, "step": 1636 }, { "completion_length": 416.70314025878906, "epoch": 0.12768694093114805, "grad_norm": 0.1349608497922669, "kl": 0.0023756027221679688, "learning_rate": 9.603066336276878e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.08552403189241886, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 1638 }, { "completion_length": 422.1919822692871, "epoch": 0.1278428468419309, "grad_norm": 0.10897857352158781, "kl": 0.00234222412109375, "learning_rate": 9.602109504657905e-07, "loss": 0.0001, "reward": 1.680803656578064, "reward_std": 0.08296280819922686, "rewards/accuracy_reward": 0.6808035969734192, "rewards/format_reward": 1.0, "step": 1640 }, { "completion_length": 422.97546768188477, "epoch": 0.12799875275271375, "grad_norm": 0.13931733264390125, "kl": 0.0026502609252929688, "learning_rate": 9.60115156896343e-07, "loss": 0.0001, "reward": 1.705357238650322, "reward_std": 0.097020473331213, "rewards/accuracy_reward": 0.7053571790456772, "rewards/format_reward": 1.0, "step": 1642 }, { "completion_length": 429.86832427978516, "epoch": 0.12815465866349657, "grad_norm": 0.09863095387613237, "kl": 0.002536773681640625, "learning_rate": 9.600192529423266e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.06319871358573437, "rewards/accuracy_reward": 0.897321455180645, "rewards/format_reward": 1.0, "step": 1644 }, { "completion_length": 429.8638572692871, "epoch": 0.12831056457427942, "grad_norm": 0.13298626981124417, "kl": 0.0025129318237304688, "learning_rate": 9.599232386267495e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.08003431279212236, "rewards/accuracy_reward": 0.8772321864962578, "rewards/format_reward": 1.0, "step": 1646 }, { "completion_length": 408.1696586608887, "epoch": 0.12846647048506227, "grad_norm": 0.12159392419597538, "kl": 0.002490997314453125, "learning_rate": 9.598271139726461e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.08973272517323494, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 1648 }, { "completion_length": 414.19644927978516, "epoch": 0.12862237639584512, "grad_norm": 0.09569597752875167, "kl": 0.0021677017211914062, "learning_rate": 9.59730879003077e-07, "loss": 0.0001, "reward": 1.899553656578064, "reward_std": 0.08567376062273979, "rewards/accuracy_reward": 0.8995536044239998, "rewards/format_reward": 1.0, "step": 1650 }, { "completion_length": 427.2120704650879, "epoch": 0.12877828230662794, "grad_norm": 0.09148832123148001, "kl": 0.0024652481079101562, "learning_rate": 9.596345337411302e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.06333870906382799, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 0.9977678656578064, "step": 1652 }, { "completion_length": 410.8013610839844, "epoch": 0.1289341882174108, "grad_norm": 0.06692352755463951, "kl": 0.0021886825561523438, "learning_rate": 9.595380782099193e-07, "loss": 0.0001, "reward": 1.8660715222358704, "reward_std": 0.06710430886596441, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 1654 }, { "completion_length": 422.7745780944824, "epoch": 0.12909009412819364, "grad_norm": 0.12111592787918682, "kl": 0.0022373199462890625, "learning_rate": 9.59441512432584e-07, "loss": 0.0001, "reward": 1.848214328289032, "reward_std": 0.10242240503430367, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 1656 }, { "completion_length": 435.1473388671875, "epoch": 0.1292460000389765, "grad_norm": 0.11399167758461927, "kl": 0.00258636474609375, "learning_rate": 9.59344836432292e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.08732346352189779, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 1.0, "step": 1658 }, { "completion_length": 409.91296768188477, "epoch": 0.1294019059497593, "grad_norm": 0.12523393950021605, "kl": 0.00209808349609375, "learning_rate": 9.592480502322358e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.10010009817779064, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 1660 }, { "completion_length": 403.89510345458984, "epoch": 0.12955781186054216, "grad_norm": 0.1284382583639891, "kl": 0.0023651123046875, "learning_rate": 9.591511538556354e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.08537206798791885, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 1662 }, { "completion_length": 412.7678756713867, "epoch": 0.129713717771325, "grad_norm": 0.14248362473762977, "kl": 0.0023260116577148438, "learning_rate": 9.590541473257367e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.10385596752166748, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 1664 }, { "completion_length": 421.0089454650879, "epoch": 0.12986962368210786, "grad_norm": 0.0917975947774854, "kl": 0.0029144287109375, "learning_rate": 9.589570306658122e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.10250935796648264, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 1666 }, { "completion_length": 409.0915336608887, "epoch": 0.13002552959289068, "grad_norm": 0.0992387671562558, "kl": 0.0020475387573242188, "learning_rate": 9.588598038991609e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.07334994990378618, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 1668 }, { "completion_length": 412.3035888671875, "epoch": 0.13018143550367353, "grad_norm": 0.14479192164490307, "kl": 0.0024328231811523438, "learning_rate": 9.58762467049108e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.09281458705663681, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 0.9977678656578064, "step": 1670 }, { "completion_length": 433.08930587768555, "epoch": 0.13033734141445638, "grad_norm": 0.14020849495245283, "kl": 0.0023355484008789062, "learning_rate": 9.586650201390053e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.0922903073951602, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 1672 }, { "completion_length": 417.93974685668945, "epoch": 0.13049324732523923, "grad_norm": 0.14986303395678147, "kl": 0.00254058837890625, "learning_rate": 9.585674631922308e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.12038706988096237, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 1674 }, { "completion_length": 412.1138610839844, "epoch": 0.13064915323602205, "grad_norm": 0.10193610899863971, "kl": 0.002239227294921875, "learning_rate": 9.58469796232189e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 1676 }, { "completion_length": 426.7366256713867, "epoch": 0.1308050591468049, "grad_norm": 0.12526886805573342, "kl": 0.00281524658203125, "learning_rate": 9.583720192823113e-07, "loss": 0.0001, "reward": 1.7968750447034836, "reward_std": 0.07484992314130068, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 1678 }, { "completion_length": 409.8995704650879, "epoch": 0.13096096505758775, "grad_norm": 0.06989990247648767, "kl": 0.0022764205932617188, "learning_rate": 9.582741323660546e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.8816964477300644, "rewards/format_reward": 1.0, "step": 1680 }, { "completion_length": 424.9732322692871, "epoch": 0.13111687096837057, "grad_norm": 0.08726571197460653, "kl": 0.0025396347045898438, "learning_rate": 9.581761355069028e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.05343612376600504, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 1682 }, { "completion_length": 409.7544860839844, "epoch": 0.13127277687915342, "grad_norm": 0.10324457315269292, "kl": 0.00240325927734375, "learning_rate": 9.580780287283656e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.058467148803174496, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 1684 }, { "completion_length": 419.69643783569336, "epoch": 0.13142868278993627, "grad_norm": 0.08673238542922716, "kl": 0.0027751922607421875, "learning_rate": 9.5797981205398e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.08146647084504366, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 1686 }, { "completion_length": 423.7343940734863, "epoch": 0.13158458870071912, "grad_norm": 0.0890068880121317, "kl": 0.0023183822631835938, "learning_rate": 9.578814855073084e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.08296280726790428, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 0.9977678656578064, "step": 1688 }, { "completion_length": 421.1741256713867, "epoch": 0.13174049461150195, "grad_norm": 0.09390857074601622, "kl": 0.0025758743286132812, "learning_rate": 9.5778304911194e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.05133136175572872, "rewards/accuracy_reward": 0.8392857611179352, "rewards/format_reward": 1.0, "step": 1690 }, { "completion_length": 420.9977836608887, "epoch": 0.1318964005222848, "grad_norm": 0.10597623121879886, "kl": 0.0024137496948242188, "learning_rate": 9.576845028914904e-07, "loss": 0.0001, "reward": 1.9129464775323868, "reward_std": 0.06206964887678623, "rewards/accuracy_reward": 0.912946455180645, "rewards/format_reward": 1.0, "step": 1692 }, { "completion_length": 426.3013610839844, "epoch": 0.13205230643306765, "grad_norm": 0.10771575975493913, "kl": 0.0027418136596679688, "learning_rate": 9.575858468696016e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.07371945586055517, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 1694 }, { "completion_length": 417.0602836608887, "epoch": 0.1322082123438505, "grad_norm": 0.08593086831494863, "kl": 0.0026197433471679688, "learning_rate": 9.574870810699416e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.07838460803031921, "rewards/accuracy_reward": 0.8459821864962578, "rewards/format_reward": 1.0, "step": 1696 }, { "completion_length": 417.52680587768555, "epoch": 0.13236411825463332, "grad_norm": 0.10184167590358789, "kl": 0.002277374267578125, "learning_rate": 9.57388205516205e-07, "loss": 0.0001, "reward": 1.8995536416769028, "reward_std": 0.06057331059128046, "rewards/accuracy_reward": 0.8995536267757416, "rewards/format_reward": 1.0, "step": 1698 }, { "completion_length": 427.02680587768555, "epoch": 0.13252002416541617, "grad_norm": 0.11291193687455077, "kl": 0.0024023056030273438, "learning_rate": 9.572892202321129e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.7165178805589676, "rewards/format_reward": 1.0, "step": 1700 }, { "completion_length": 422.7544822692871, "epoch": 0.13267593007619902, "grad_norm": 0.15062184203550302, "kl": 0.0025634765625, "learning_rate": 9.571901252414122e-07, "loss": 0.0001, "reward": 1.7812500447034836, "reward_std": 0.13578910194337368, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 1702 }, { "completion_length": 414.9442138671875, "epoch": 0.13283183598698187, "grad_norm": 0.07656371713038894, "kl": 0.0022935867309570312, "learning_rate": 9.570909205678765e-07, "loss": 0.0001, "reward": 1.8928572237491608, "reward_std": 0.058620513416826725, "rewards/accuracy_reward": 0.8928571566939354, "rewards/format_reward": 1.0, "step": 1704 }, { "completion_length": 413.5826110839844, "epoch": 0.1329877418977647, "grad_norm": 0.05105453091608395, "kl": 0.0025014877319335938, "learning_rate": 9.569916062353057e-07, "loss": 0.0001, "reward": 1.9196429252624512, "reward_std": 0.03208790626376867, "rewards/accuracy_reward": 0.9196428805589676, "rewards/format_reward": 1.0, "step": 1706 }, { "completion_length": 425.41296005249023, "epoch": 0.13314364780854754, "grad_norm": 0.12506750004819556, "kl": 0.0026950836181640625, "learning_rate": 9.56892182267526e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.05425985902547836, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 1708 }, { "completion_length": 434.408504486084, "epoch": 0.1332995537193304, "grad_norm": 0.10696093888400712, "kl": 0.0028076171875, "learning_rate": 9.567926486883895e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.0675607705488801, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 1710 }, { "completion_length": 420.6852798461914, "epoch": 0.13345545963011324, "grad_norm": 0.13394608346507664, "kl": 0.0026092529296875, "learning_rate": 9.566930055217755e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.08650113362818956, "rewards/accuracy_reward": 0.8169643357396126, "rewards/format_reward": 1.0, "step": 1712 }, { "completion_length": 419.68305587768555, "epoch": 0.13361136554089606, "grad_norm": 0.08262255186114091, "kl": 0.0024766921997070312, "learning_rate": 9.565932527915885e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.05929452087730169, "rewards/accuracy_reward": 0.8816964477300644, "rewards/format_reward": 1.0, "step": 1714 }, { "completion_length": 430.1406440734863, "epoch": 0.1337672714516789, "grad_norm": 0.10859519773027121, "kl": 0.002582550048828125, "learning_rate": 9.564933905217603e-07, "loss": 0.0001, "reward": 1.883928656578064, "reward_std": 0.09589280933141708, "rewards/accuracy_reward": 0.8839286118745804, "rewards/format_reward": 1.0, "step": 1716 }, { "completion_length": 424.64957427978516, "epoch": 0.13392317736246176, "grad_norm": 0.11152403887833012, "kl": 0.0025310516357421875, "learning_rate": 9.563934187362481e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.050051167607307434, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 1718 }, { "completion_length": 410.64287185668945, "epoch": 0.1340790832732446, "grad_norm": 0.0843018914675871, "kl": 0.0024194717407226562, "learning_rate": 9.562933374590359e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 1720 }, { "completion_length": 420.8348388671875, "epoch": 0.13423498918402743, "grad_norm": 0.0915262841340607, "kl": 0.0023517608642578125, "learning_rate": 9.56193146714134e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 1722 }, { "completion_length": 412.36609268188477, "epoch": 0.13439089509481028, "grad_norm": 0.07156384939469618, "kl": 0.0023555755615234375, "learning_rate": 9.560928465255784e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.06011684890836477, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 1724 }, { "completion_length": 434.19644927978516, "epoch": 0.13454680100559313, "grad_norm": 0.09535289469526928, "kl": 0.0023889541625976562, "learning_rate": 9.559924369174323e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.07207115553319454, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 1.0, "step": 1726 }, { "completion_length": 431.2343940734863, "epoch": 0.13470270691637598, "grad_norm": 0.09077509794648125, "kl": 0.002727508544921875, "learning_rate": 9.55891917913784e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.04892210382968187, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 1728 }, { "completion_length": 412.1785888671875, "epoch": 0.1348586128271588, "grad_norm": 0.09060570549584729, "kl": 0.002349853515625, "learning_rate": 9.557912895387492e-07, "loss": 0.0001, "reward": 1.7477679550647736, "reward_std": 0.055235556326806545, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 1730 }, { "completion_length": 412.6294860839844, "epoch": 0.13501451873794165, "grad_norm": 0.09102558038212702, "kl": 0.002696990966796875, "learning_rate": 9.556905518164692e-07, "loss": 0.0001, "reward": 1.720982238650322, "reward_std": 0.062005472369492054, "rewards/accuracy_reward": 0.7209821753203869, "rewards/format_reward": 1.0, "step": 1732 }, { "completion_length": 437.986629486084, "epoch": 0.1351704246487245, "grad_norm": 0.12526279153103878, "kl": 0.0033044815063476562, "learning_rate": 9.555897047711114e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.10889063030481339, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 1734 }, { "completion_length": 420.3281440734863, "epoch": 0.13532633055950732, "grad_norm": 0.10962123341446035, "kl": 0.0026454925537109375, "learning_rate": 9.554887484268697e-07, "loss": 0.0001, "reward": 1.8147321939468384, "reward_std": 0.08552039135247469, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 1736 }, { "completion_length": 399.6406440734863, "epoch": 0.13548223647029017, "grad_norm": 0.1027695209694539, "kl": 0.002490997314453125, "learning_rate": 9.553876828079642e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.0889731664210558, "rewards/accuracy_reward": 0.7433035969734192, "rewards/format_reward": 1.0, "step": 1738 }, { "completion_length": 431.7544822692871, "epoch": 0.13563814238107302, "grad_norm": 0.12756665958798286, "kl": 0.0027112960815429688, "learning_rate": 9.55286507938641e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.08552403096109629, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 0.9977678656578064, "step": 1740 }, { "completion_length": 412.1942138671875, "epoch": 0.13579404829185587, "grad_norm": 0.07830013800167257, "kl": 0.0025510787963867188, "learning_rate": 9.55185223843173e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.06463227514177561, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 1742 }, { "completion_length": 423.252254486084, "epoch": 0.1359499542026387, "grad_norm": 0.0568686704183447, "kl": 0.0024785995483398438, "learning_rate": 9.55083830545859e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.06124731618911028, "rewards/accuracy_reward": 0.7232143171131611, "rewards/format_reward": 1.0, "step": 1744 }, { "completion_length": 414.91966247558594, "epoch": 0.13610586011342155, "grad_norm": 0.050341507429358716, "kl": 0.00240325927734375, "learning_rate": 9.54982328071023e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.03968015220016241, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 1746 }, { "completion_length": 418.8192138671875, "epoch": 0.1362617660242044, "grad_norm": 0.07055103070473408, "kl": 0.0028057098388671875, "learning_rate": 9.54880716443017e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.05425985902547836, "rewards/accuracy_reward": 0.8214286267757416, "rewards/format_reward": 1.0, "step": 1748 }, { "completion_length": 424.59599685668945, "epoch": 0.13641767193498724, "grad_norm": 0.12357909145088622, "kl": 0.0026092529296875, "learning_rate": 9.54778995686218e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.0683189257979393, "rewards/accuracy_reward": 0.8705357536673546, "rewards/format_reward": 1.0, "step": 1750 }, { "completion_length": 430.82591247558594, "epoch": 0.13657357784577007, "grad_norm": 0.13711702154427832, "kl": 0.0026798248291015625, "learning_rate": 9.546771658250294e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.10553244594484568, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 0.9977678656578064, "step": 1752 }, { "completion_length": 415.2835006713867, "epoch": 0.13672948375655292, "grad_norm": 0.08125406685811072, "kl": 0.0022296905517578125, "learning_rate": 9.545752268838808e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.845982164144516, "rewards/format_reward": 0.9977678656578064, "step": 1754 }, { "completion_length": 420.9085006713867, "epoch": 0.13688538966733577, "grad_norm": 0.0862916931186988, "kl": 0.00240325927734375, "learning_rate": 9.544731788872283e-07, "loss": 0.0001, "reward": 1.689732238650322, "reward_std": 0.0850675730034709, "rewards/accuracy_reward": 0.6897321827709675, "rewards/format_reward": 1.0, "step": 1756 }, { "completion_length": 426.6919822692871, "epoch": 0.13704129557811862, "grad_norm": 0.07783840068873879, "kl": 0.0027141571044921875, "learning_rate": 9.543710218595535e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 1758 }, { "completion_length": 413.2254638671875, "epoch": 0.13719720148890144, "grad_norm": 0.12654417760864006, "kl": 0.0024099349975585938, "learning_rate": 9.542687558253645e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.08259330037981272, "rewards/accuracy_reward": 0.7500000447034836, "rewards/format_reward": 1.0, "step": 1760 }, { "completion_length": 419.08707427978516, "epoch": 0.1373531073996843, "grad_norm": 0.13251215117485626, "kl": 0.0023221969604492188, "learning_rate": 9.54166380809196e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.08454469498246908, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 1762 }, { "completion_length": 416.1339454650879, "epoch": 0.13750901331046714, "grad_norm": 0.1204272266903747, "kl": 0.002384185791015625, "learning_rate": 9.54063896835608e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.8526786267757416, "rewards/format_reward": 1.0, "step": 1764 }, { "completion_length": 431.00894927978516, "epoch": 0.13766491922125, "grad_norm": 0.11921927846427029, "kl": 0.002613067626953125, "learning_rate": 9.539613039291873e-07, "loss": 0.0001, "reward": 1.703125074505806, "reward_std": 0.09431092254817486, "rewards/accuracy_reward": 0.7053571715950966, "rewards/format_reward": 0.9977678656578064, "step": 1766 }, { "completion_length": 426.6607322692871, "epoch": 0.1378208251320328, "grad_norm": 0.07960155683349812, "kl": 0.0025606155395507812, "learning_rate": 9.538586021145467e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 1768 }, { "completion_length": 423.82368087768555, "epoch": 0.13797673104281566, "grad_norm": 0.10016737768366006, "kl": 0.0024003982543945312, "learning_rate": 9.537557914163247e-07, "loss": 0.0001, "reward": 1.9062500596046448, "reward_std": 0.06673339940607548, "rewards/accuracy_reward": 0.906250037252903, "rewards/format_reward": 1.0, "step": 1770 }, { "completion_length": 428.83260345458984, "epoch": 0.1381326369535985, "grad_norm": 0.13081798461797498, "kl": 0.0028934478759765625, "learning_rate": 9.536528718591862e-07, "loss": 0.0001, "reward": 1.7700893878936768, "reward_std": 0.11129848193377256, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 1772 }, { "completion_length": 421.5134048461914, "epoch": 0.13828854286438136, "grad_norm": 0.11616645750769804, "kl": 0.0024156570434570312, "learning_rate": 9.535498434678227e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.07109405566006899, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 1774 }, { "completion_length": 435.9241256713867, "epoch": 0.13844444877516418, "grad_norm": 0.12926805643027756, "kl": 0.0026445388793945312, "learning_rate": 9.534467062669509e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.07484852056950331, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 1776 }, { "completion_length": 426.0044822692871, "epoch": 0.13860035468594703, "grad_norm": 0.12447024118033474, "kl": 0.00229644775390625, "learning_rate": 9.533434602813144e-07, "loss": 0.0001, "reward": 1.8147321790456772, "reward_std": 0.09845179598778486, "rewards/accuracy_reward": 0.814732164144516, "rewards/format_reward": 1.0, "step": 1778 }, { "completion_length": 428.2723388671875, "epoch": 0.13875626059672988, "grad_norm": 0.05782325587572216, "kl": 0.0024566650390625, "learning_rate": 9.532401055356824e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.05200536735355854, "rewards/accuracy_reward": 0.7700893059372902, "rewards/format_reward": 1.0, "step": 1780 }, { "completion_length": 428.47769927978516, "epoch": 0.13891216650751273, "grad_norm": 0.0840166793413052, "kl": 0.002471923828125, "learning_rate": 9.531366420548504e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.05493386276066303, "rewards/accuracy_reward": 0.8638393059372902, "rewards/format_reward": 1.0, "step": 1782 }, { "completion_length": 415.5446586608887, "epoch": 0.13906807241829555, "grad_norm": 0.11480966641665565, "kl": 0.0025835037231445312, "learning_rate": 9.530330698636402e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.08213907666504383, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 1784 }, { "completion_length": 411.50671005249023, "epoch": 0.1392239783290784, "grad_norm": 0.10809173465312533, "kl": 0.0023250579833984375, "learning_rate": 9.52929388986899e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.06883956119418144, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 1786 }, { "completion_length": 418.0290336608887, "epoch": 0.13937988423986125, "grad_norm": 0.062286337550038574, "kl": 0.0024166107177734375, "learning_rate": 9.528255994495007e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.07274375762790442, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 1788 }, { "completion_length": 417.8951072692871, "epoch": 0.13953579015064407, "grad_norm": 0.13433283790379807, "kl": 0.002483367919921875, "learning_rate": 9.527217012763451e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06951216701418161, "rewards/accuracy_reward": 0.8549107387661934, "rewards/format_reward": 1.0, "step": 1790 }, { "completion_length": 431.96430587768555, "epoch": 0.13969169606142692, "grad_norm": 0.0906389298791982, "kl": 0.002838134765625, "learning_rate": 9.52617694492358e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.0586205143481493, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 1.0, "step": 1792 }, { "completion_length": 417.73439025878906, "epoch": 0.13984760197220977, "grad_norm": 0.13377955596420807, "kl": 0.00269317626953125, "learning_rate": 9.525135791224916e-07, "loss": 0.0001, "reward": 1.877232238650322, "reward_std": 0.09507047943770885, "rewards/accuracy_reward": 0.8772321790456772, "rewards/format_reward": 1.0, "step": 1794 }, { "completion_length": 418.7812728881836, "epoch": 0.14000350788299262, "grad_norm": 0.11234319887016309, "kl": 0.0028076171875, "learning_rate": 9.524093551917233e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.10483026225119829, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 1796 }, { "completion_length": 428.7076072692871, "epoch": 0.14015941379377544, "grad_norm": 0.08722109345389187, "kl": 0.002658843994140625, "learning_rate": 9.523050227250573e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 1798 }, { "completion_length": 435.9174346923828, "epoch": 0.1403153197045583, "grad_norm": 0.10408353102972638, "kl": 0.00254058837890625, "learning_rate": 9.522005817475238e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.0675593689084053, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 1800 }, { "completion_length": 421.1585006713867, "epoch": 0.14047122561534114, "grad_norm": 0.14900593625245945, "kl": 0.0027828216552734375, "learning_rate": 9.520960322841787e-07, "loss": 0.0001, "reward": 1.7165179401636124, "reward_std": 0.08830056339502335, "rewards/accuracy_reward": 0.7165178954601288, "rewards/format_reward": 1.0, "step": 1802 }, { "completion_length": 414.9620704650879, "epoch": 0.140627131526124, "grad_norm": 0.11887635444662553, "kl": 0.002471923828125, "learning_rate": 9.51991374360104e-07, "loss": 0.0001, "reward": 1.8459821939468384, "reward_std": 0.08792965579777956, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 1804 }, { "completion_length": 415.1741256713867, "epoch": 0.14078303743690682, "grad_norm": 0.07357614785552381, "kl": 0.0024061203002929688, "learning_rate": 9.518866080004081e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.05298106465488672, "rewards/accuracy_reward": 0.7232143133878708, "rewards/format_reward": 1.0, "step": 1806 }, { "completion_length": 407.0089454650879, "epoch": 0.14093894334768967, "grad_norm": 0.09876503461916508, "kl": 0.0022096633911132812, "learning_rate": 9.517817332302249e-07, "loss": 0.0001, "reward": 1.8504464775323868, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 1808 }, { "completion_length": 414.861629486084, "epoch": 0.14109484925847252, "grad_norm": 0.10900303887312367, "kl": 0.0026092529296875, "learning_rate": 9.516767500747145e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.07499965094029903, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 1810 }, { "completion_length": 440.3951072692871, "epoch": 0.14125075516925537, "grad_norm": 0.10084891693849778, "kl": 0.0026454925537109375, "learning_rate": 9.515716585590632e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.06560797523707151, "rewards/accuracy_reward": 0.7767857313156128, "rewards/format_reward": 1.0, "step": 1812 }, { "completion_length": 427.3839454650879, "epoch": 0.1414066610800382, "grad_norm": 0.04628860407560374, "kl": 0.002948760986328125, "learning_rate": 9.514664587084828e-07, "loss": 0.0001, "reward": 1.7611607760190964, "reward_std": 0.04404081217944622, "rewards/accuracy_reward": 0.7611607611179352, "rewards/format_reward": 1.0, "step": 1814 }, { "completion_length": 431.95984268188477, "epoch": 0.14156256699082104, "grad_norm": 0.12201837573911492, "kl": 0.0025806427001953125, "learning_rate": 9.513611505482119e-07, "loss": 0.0001, "reward": 1.8705358058214188, "reward_std": 0.08101001102477312, "rewards/accuracy_reward": 0.8705357611179352, "rewards/format_reward": 1.0, "step": 1816 }, { "completion_length": 426.58707427978516, "epoch": 0.1417184729016039, "grad_norm": 0.11392718046871178, "kl": 0.0027790069580078125, "learning_rate": 9.512557341035142e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.06493396870791912, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 1818 }, { "completion_length": 409.9062728881836, "epoch": 0.14187437881238674, "grad_norm": 0.11535289753835874, "kl": 0.0024871826171875, "learning_rate": 9.511502093996799e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.07289712596684694, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 1820 }, { "completion_length": 414.0312690734863, "epoch": 0.14203028472316956, "grad_norm": 0.09196734891143858, "kl": 0.0026683807373046875, "learning_rate": 9.510445764620251e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.044194173999130726, "rewards/accuracy_reward": 0.8638393357396126, "rewards/format_reward": 1.0, "step": 1822 }, { "completion_length": 404.6585006713867, "epoch": 0.1421861906339524, "grad_norm": 0.07904503149230035, "kl": 0.0020122528076171875, "learning_rate": 9.50938835315892e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.07244066335260868, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 1824 }, { "completion_length": 425.89287185668945, "epoch": 0.14234209654473526, "grad_norm": 0.1097034349631843, "kl": 0.0029468536376953125, "learning_rate": 9.508329859866482e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.07048786524683237, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 1826 }, { "completion_length": 418.4576110839844, "epoch": 0.1424980024555181, "grad_norm": 0.1278607942978834, "kl": 0.0024480819702148438, "learning_rate": 9.507270284996878e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.0874768290668726, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 1.0, "step": 1828 }, { "completion_length": 414.44421768188477, "epoch": 0.14265390836630093, "grad_norm": 0.12079142589663733, "kl": 0.0027112960815429688, "learning_rate": 9.506209628804306e-07, "loss": 0.0001, "reward": 1.7455357760190964, "reward_std": 0.08469806052744389, "rewards/accuracy_reward": 0.7455357573926449, "rewards/format_reward": 1.0, "step": 1830 }, { "completion_length": 427.752254486084, "epoch": 0.14280981427708378, "grad_norm": 0.13809678700429423, "kl": 0.0028781890869140625, "learning_rate": 9.505147891543226e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.1093323165550828, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 0.9977678656578064, "step": 1832 }, { "completion_length": 420.4620666503906, "epoch": 0.14296572018786663, "grad_norm": 0.11363814301394629, "kl": 0.0023221969604492188, "learning_rate": 9.504085073468354e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.05831882171332836, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 1834 }, { "completion_length": 430.0402030944824, "epoch": 0.14312162609864945, "grad_norm": 0.1365975058041751, "kl": 0.0029354095458984375, "learning_rate": 9.503021174834667e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.06267807353287935, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 1836 }, { "completion_length": 426.1138572692871, "epoch": 0.1432775320094323, "grad_norm": 0.11639298258084077, "kl": 0.0027599334716796875, "learning_rate": 9.501956195897402e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.11550717707723379, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 1.0, "step": 1838 }, { "completion_length": 427.6138572692871, "epoch": 0.14343343792021515, "grad_norm": 0.11764310703444192, "kl": 0.0025529861450195312, "learning_rate": 9.500890136912054e-07, "loss": 0.0001, "reward": 1.877232238650322, "reward_std": 0.09178151283413172, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 0.9977678656578064, "step": 1840 }, { "completion_length": 413.8348388671875, "epoch": 0.143589343830998, "grad_norm": 0.12173554671189299, "kl": 0.0024709701538085938, "learning_rate": 9.499822998134377e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.07838460896164179, "rewards/accuracy_reward": 0.8861607611179352, "rewards/format_reward": 1.0, "step": 1842 }, { "completion_length": 421.91966247558594, "epoch": 0.14374524974178082, "grad_norm": 0.09770498062369756, "kl": 0.00243377685546875, "learning_rate": 9.498754779820384e-07, "loss": 0.0001, "reward": 1.7633929550647736, "reward_std": 0.06883816421031952, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 1844 }, { "completion_length": 412.1875228881836, "epoch": 0.14390115565256367, "grad_norm": 0.0724923238017135, "kl": 0.0024805068969726562, "learning_rate": 9.497685482226349e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.03336669970303774, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 1846 }, { "completion_length": 425.8437690734863, "epoch": 0.14405706156334652, "grad_norm": 0.11647830285935219, "kl": 0.002933502197265625, "learning_rate": 9.496615105608802e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.0677105002105236, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 1848 }, { "completion_length": 423.8526954650879, "epoch": 0.14421296747412937, "grad_norm": 0.07412461375880015, "kl": 0.00261688232421875, "learning_rate": 9.495543650224533e-07, "loss": 0.0001, "reward": 1.8013393431901932, "reward_std": 0.06621276028454304, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 1850 }, { "completion_length": 429.3817138671875, "epoch": 0.1443688733849122, "grad_norm": 0.09968001165927581, "kl": 0.0030364990234375, "learning_rate": 9.494471116330592e-07, "loss": 0.0001, "reward": 1.8303572535514832, "reward_std": 0.07973121758550406, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 1852 }, { "completion_length": 422.02234268188477, "epoch": 0.14452477929569504, "grad_norm": 0.13114847669862711, "kl": 0.0024509429931640625, "learning_rate": 9.493397504184286e-07, "loss": 0.0001, "reward": 1.7656250596046448, "reward_std": 0.07011835556477308, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 1.0, "step": 1854 }, { "completion_length": 432.02010345458984, "epoch": 0.1446806852064779, "grad_norm": 0.08504654938735325, "kl": 0.002986907958984375, "learning_rate": 9.492322814043181e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.06786386482417583, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 1856 }, { "completion_length": 410.5268020629883, "epoch": 0.14483659111726074, "grad_norm": 0.11853135978733005, "kl": 0.0024967193603515625, "learning_rate": 9.491247046165103e-07, "loss": 0.0001, "reward": 1.8660715222358704, "reward_std": 0.07951367273926735, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 1858 }, { "completion_length": 411.9509086608887, "epoch": 0.14499249702804357, "grad_norm": 0.12115776608647899, "kl": 0.0022268295288085938, "learning_rate": 9.490170200808136e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.058620515279471874, "rewards/accuracy_reward": 0.8705357536673546, "rewards/format_reward": 1.0, "step": 1860 }, { "completion_length": 434.4955520629883, "epoch": 0.14514840293882642, "grad_norm": 0.07599978882746548, "kl": 0.0029268264770507812, "learning_rate": 9.489092278230621e-07, "loss": 0.0001, "reward": 1.8281251043081284, "reward_std": 0.08732346259057522, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 0.9955357313156128, "step": 1862 }, { "completion_length": 423.60493087768555, "epoch": 0.14530430884960926, "grad_norm": 0.10695509071203912, "kl": 0.0026340484619140625, "learning_rate": 9.488013278691158e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.06688676495105028, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 1864 }, { "completion_length": 413.50001525878906, "epoch": 0.14546021476039211, "grad_norm": 0.07640974639618531, "kl": 0.0025386810302734375, "learning_rate": 9.486933202448607e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.06801583059132099, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 1866 }, { "completion_length": 417.3750228881836, "epoch": 0.14561612067117494, "grad_norm": 0.09889783186955166, "kl": 0.0024995803833007812, "learning_rate": 9.485852049762086e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.07628208305686712, "rewards/accuracy_reward": 0.8727678805589676, "rewards/format_reward": 1.0, "step": 1868 }, { "completion_length": 428.62501525878906, "epoch": 0.1457720265819578, "grad_norm": 0.10544359645120983, "kl": 0.0026292800903320312, "learning_rate": 9.484769820890968e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.046296700835227966, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 1.0, "step": 1870 }, { "completion_length": 424.56921005249023, "epoch": 0.14592793249274064, "grad_norm": 0.11062289792072076, "kl": 0.002841949462890625, "learning_rate": 9.483686516094889e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.059749577194452286, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 0.9977678656578064, "step": 1872 }, { "completion_length": 416.4821662902832, "epoch": 0.14608383840352349, "grad_norm": 0.08959041216852641, "kl": 0.002346038818359375, "learning_rate": 9.48260213563374e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.059444245882332325, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 1874 }, { "completion_length": 433.87947845458984, "epoch": 0.1462397443143063, "grad_norm": 0.11384897368021245, "kl": 0.0027065277099609375, "learning_rate": 9.481516679767669e-07, "loss": 0.0001, "reward": 1.752232238650322, "reward_std": 0.08582712430506945, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 0.9955357313156128, "step": 1876 }, { "completion_length": 415.6919822692871, "epoch": 0.14639565022508916, "grad_norm": 0.08023639584006488, "kl": 0.002368927001953125, "learning_rate": 9.480430148757085e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06185210216790438, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 1878 }, { "completion_length": 409.15849685668945, "epoch": 0.146551556135872, "grad_norm": 0.0844763327619293, "kl": 0.0022974014282226562, "learning_rate": 9.479342542862656e-07, "loss": 0.0001, "reward": 1.875000074505806, "reward_std": 0.05020089354366064, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 1880 }, { "completion_length": 410.6718940734863, "epoch": 0.14670746204665486, "grad_norm": 0.11615496707519885, "kl": 0.002338409423828125, "learning_rate": 9.478253862345304e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.054932462982833385, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 1882 }, { "completion_length": 416.9665336608887, "epoch": 0.14686336795743768, "grad_norm": 0.10850437776155175, "kl": 0.0023756027221679688, "learning_rate": 9.477164107466209e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.05734171997755766, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 0.9977678656578064, "step": 1884 }, { "completion_length": 421.33483505249023, "epoch": 0.14701927386822053, "grad_norm": 0.09942645210688915, "kl": 0.00237274169921875, "learning_rate": 9.476073278486811e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.07417731918394566, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 0.9977678656578064, "step": 1886 }, { "completion_length": 426.9977912902832, "epoch": 0.14717517977900338, "grad_norm": 0.1472215577054317, "kl": 0.003055572509765625, "learning_rate": 9.474981375668809e-07, "loss": 0.0001, "reward": 1.7053571939468384, "reward_std": 0.08537066634744406, "rewards/accuracy_reward": 0.705357164144516, "rewards/format_reward": 1.0, "step": 1888 }, { "completion_length": 420.14510345458984, "epoch": 0.1473310856897862, "grad_norm": 0.088621167406875, "kl": 0.0026693344116210938, "learning_rate": 9.473888399274155e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.08664945978671312, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 1890 }, { "completion_length": 425.63171005249023, "epoch": 0.14748699160056905, "grad_norm": 0.0863084117458823, "kl": 0.0026235580444335938, "learning_rate": 9.472794349565061e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.06981526222079992, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 1892 }, { "completion_length": 413.026798248291, "epoch": 0.1476428975113519, "grad_norm": 0.10701340582238997, "kl": 0.0024871826171875, "learning_rate": 9.471699226803996e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.06612720992416143, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 1.0, "step": 1894 }, { "completion_length": 420.37947845458984, "epoch": 0.14779880342213475, "grad_norm": 0.10176589188181358, "kl": 0.0022945404052734375, "learning_rate": 9.47060303125369e-07, "loss": 0.0001, "reward": 1.8816965222358704, "reward_std": 0.06621275935322046, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 0.9977678656578064, "step": 1896 }, { "completion_length": 429.3125190734863, "epoch": 0.14795470933291757, "grad_norm": 0.0983832351978756, "kl": 0.003055572509765625, "learning_rate": 9.469505763177125e-07, "loss": 0.0001, "reward": 1.7388393878936768, "reward_std": 0.08033600449562073, "rewards/accuracy_reward": 0.7388393208384514, "rewards/format_reward": 1.0, "step": 1898 }, { "completion_length": 416.267879486084, "epoch": 0.14811061524370042, "grad_norm": 0.004080065735805403, "kl": 0.0024919509887695312, "learning_rate": 9.468407422837543e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 1.0, "step": 1900 }, { "completion_length": 418.38394927978516, "epoch": 0.14826652115448327, "grad_norm": 0.09954114417309422, "kl": 0.0023593902587890625, "learning_rate": 9.467308010498443e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.060270218178629875, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 1902 }, { "completion_length": 423.111629486084, "epoch": 0.14842242706526612, "grad_norm": 0.11017480697242374, "kl": 0.0028629302978515625, "learning_rate": 9.466207526423581e-07, "loss": 0.0001, "reward": 1.7633929252624512, "reward_std": 0.0890587205067277, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 0.9977678656578064, "step": 1904 }, { "completion_length": 429.29466247558594, "epoch": 0.14857833297604894, "grad_norm": 0.1199079158102475, "kl": 0.002513885498046875, "learning_rate": 9.46510597087697e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.052003965713083744, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 1906 }, { "completion_length": 426.4643020629883, "epoch": 0.1487342388868318, "grad_norm": 0.09620267350320129, "kl": 0.00269317626953125, "learning_rate": 9.46400334412288e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.06463087350130081, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 0.9977678656578064, "step": 1908 }, { "completion_length": 436.6227798461914, "epoch": 0.14889014479761464, "grad_norm": 0.11980833151797901, "kl": 0.0031642913818359375, "learning_rate": 9.462899646425839e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.08634776622056961, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 1910 }, { "completion_length": 426.93082427978516, "epoch": 0.1490460507083975, "grad_norm": 0.09493161246957552, "kl": 0.0026960372924804688, "learning_rate": 9.461794878050631e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.07740751095116138, "rewards/accuracy_reward": 0.859375037252903, "rewards/format_reward": 1.0, "step": 1912 }, { "completion_length": 431.1607360839844, "epoch": 0.14920195661918031, "grad_norm": 0.12277675435024403, "kl": 0.0025854110717773438, "learning_rate": 9.460689039262298e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.10964878089725971, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 1914 }, { "completion_length": 428.61608505249023, "epoch": 0.14935786252996316, "grad_norm": 0.1442333958167205, "kl": 0.0027837753295898438, "learning_rate": 9.459582130326134e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.08341926988214254, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 1916 }, { "completion_length": 413.9754638671875, "epoch": 0.14951376844074601, "grad_norm": 0.11248822370461009, "kl": 0.0024747848510742188, "learning_rate": 9.458474151507697e-07, "loss": 0.0001, "reward": 1.7388393729925156, "reward_std": 0.08101141080260277, "rewards/accuracy_reward": 0.738839328289032, "rewards/format_reward": 1.0, "step": 1918 }, { "completion_length": 412.8616256713867, "epoch": 0.14966967435152886, "grad_norm": 0.10573599327373483, "kl": 0.002483367919921875, "learning_rate": 9.457365103072797e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.10964514315128326, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 1920 }, { "completion_length": 419.9464454650879, "epoch": 0.1498255802623117, "grad_norm": 0.08384477417514684, "kl": 0.0027904510498046875, "learning_rate": 9.456254985287503e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.07146496511995792, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 1922 }, { "completion_length": 426.4576072692871, "epoch": 0.14998148617309454, "grad_norm": 0.09578859788160553, "kl": 0.0023775100708007812, "learning_rate": 9.455143798418138e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.08244216721504927, "rewards/accuracy_reward": 0.8035714775323868, "rewards/format_reward": 1.0, "step": 1924 }, { "completion_length": 428.6004638671875, "epoch": 0.15013739208387739, "grad_norm": 0.10690834826631745, "kl": 0.0025663375854492188, "learning_rate": 9.454031542731283e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.0819857083261013, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 1926 }, { "completion_length": 416.7678756713867, "epoch": 0.15029329799466024, "grad_norm": 0.09855710044048349, "kl": 0.002593994140625, "learning_rate": 9.452918218493776e-07, "loss": 0.0001, "reward": 1.8147321939468384, "reward_std": 0.09296291042119265, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 0.9977678656578064, "step": 1928 }, { "completion_length": 413.54019927978516, "epoch": 0.15044920390544306, "grad_norm": 0.07801415070902125, "kl": 0.0025787353515625, "learning_rate": 9.45180382597271e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 1930 }, { "completion_length": 426.9910888671875, "epoch": 0.1506051098162259, "grad_norm": 0.08328758307526886, "kl": 0.0024824142456054688, "learning_rate": 9.450688365435436e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.05816405173391104, "rewards/accuracy_reward": 0.734375037252903, "rewards/format_reward": 1.0, "step": 1932 }, { "completion_length": 407.026798248291, "epoch": 0.15076101572700876, "grad_norm": 0.05905198975168956, "kl": 0.0026197433471679688, "learning_rate": 9.449571837149557e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 1934 }, { "completion_length": 415.6205596923828, "epoch": 0.15091692163779158, "grad_norm": 0.06341397510830357, "kl": 0.0025815963745117188, "learning_rate": 9.44845424138294e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.05252320226281881, "rewards/accuracy_reward": 0.7812500447034836, "rewards/format_reward": 1.0, "step": 1936 }, { "completion_length": 420.2143096923828, "epoch": 0.15107282754857443, "grad_norm": 0.0784527225897726, "kl": 0.0026712417602539062, "learning_rate": 9.447335578403699e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.07191638182848692, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 1938 }, { "completion_length": 427.2991256713867, "epoch": 0.15122873345935728, "grad_norm": 0.07555171385605434, "kl": 0.0026149749755859375, "learning_rate": 9.44621584848021e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.07905721385031939, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 1940 }, { "completion_length": 431.96876525878906, "epoch": 0.15138463937014013, "grad_norm": 0.1562715279009717, "kl": 0.0029058456420898438, "learning_rate": 9.445095051881104e-07, "loss": 0.0001, "reward": 1.736607238650322, "reward_std": 0.10897477436810732, "rewards/accuracy_reward": 0.7366071753203869, "rewards/format_reward": 1.0, "step": 1942 }, { "completion_length": 416.88394927978516, "epoch": 0.15154054528092295, "grad_norm": 0.07326227166799636, "kl": 0.0022249221801757812, "learning_rate": 9.443973188875268e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.05800095293670893, "rewards/accuracy_reward": 0.861607164144516, "rewards/format_reward": 0.9977678656578064, "step": 1944 }, { "completion_length": 433.81252670288086, "epoch": 0.1516964511917058, "grad_norm": 0.08619195678321279, "kl": 0.0025796890258789062, "learning_rate": 9.44285025973184e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.07664935383945704, "rewards/accuracy_reward": 0.7544643133878708, "rewards/format_reward": 1.0, "step": 1946 }, { "completion_length": 411.08484268188477, "epoch": 0.15185235710248865, "grad_norm": 0.13302980039408377, "kl": 0.00275421142578125, "learning_rate": 9.441726264720224e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.07905721012502909, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 1948 }, { "completion_length": 414.26341247558594, "epoch": 0.1520082630132715, "grad_norm": 0.09900953552701644, "kl": 0.0026683807373046875, "learning_rate": 9.440601204110068e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.0691426582634449, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 1950 }, { "completion_length": 433.986629486084, "epoch": 0.15216416892405432, "grad_norm": 0.08928957880030594, "kl": 0.0026721954345703125, "learning_rate": 9.439475078171286e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.09589280840009451, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 1952 }, { "completion_length": 428.8169822692871, "epoch": 0.15232007483483717, "grad_norm": 0.09420768859873371, "kl": 0.002941131591796875, "learning_rate": 9.438347887174038e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.04937856271862984, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 1954 }, { "completion_length": 417.80805587768555, "epoch": 0.15247598074562002, "grad_norm": 0.059170061986699336, "kl": 0.0023365020751953125, "learning_rate": 9.437219631388749e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 1956 }, { "completion_length": 421.0468978881836, "epoch": 0.15263188665640287, "grad_norm": 0.10718029935479413, "kl": 0.0027399063110351562, "learning_rate": 9.436090311086091e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.09461542032659054, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 1958 }, { "completion_length": 426.22099685668945, "epoch": 0.1527877925671857, "grad_norm": 0.09067648813947113, "kl": 0.0025634765625, "learning_rate": 9.434959926536998e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.054932461120188236, "rewards/accuracy_reward": 0.8348214775323868, "rewards/format_reward": 1.0, "step": 1960 }, { "completion_length": 424.9643020629883, "epoch": 0.15294369847796854, "grad_norm": 0.09828646726388528, "kl": 0.0026941299438476562, "learning_rate": 9.433828478012655e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.04712267126888037, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 1962 }, { "completion_length": 416.5714416503906, "epoch": 0.1530996043887514, "grad_norm": 0.12456233803622835, "kl": 0.0022954940795898438, "learning_rate": 9.432695965784503e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.05246042739599943, "rewards/accuracy_reward": 0.8683035969734192, "rewards/format_reward": 1.0, "step": 1964 }, { "completion_length": 410.66966247558594, "epoch": 0.15325551029953424, "grad_norm": 0.06875984010161594, "kl": 0.0024051666259765625, "learning_rate": 9.431562390124242e-07, "loss": 0.0001, "reward": 1.848214328289032, "reward_std": 0.04593987111002207, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 0.9977678656578064, "step": 1966 }, { "completion_length": 417.5826072692871, "epoch": 0.15341141621031706, "grad_norm": 0.07440381309948782, "kl": 0.0023183822631835938, "learning_rate": 9.43042775130382e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.04569051321595907, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 1968 }, { "completion_length": 419.627254486084, "epoch": 0.15356732212109991, "grad_norm": 0.12990861467440695, "kl": 0.0029201507568359375, "learning_rate": 9.429292049595445e-07, "loss": 0.0001, "reward": 1.883928656578064, "reward_std": 0.09198721405118704, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 0.9977678656578064, "step": 1970 }, { "completion_length": 417.8192138671875, "epoch": 0.15372322803188276, "grad_norm": 0.1097392533355895, "kl": 0.0025453567504882812, "learning_rate": 9.428155285271583e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.0940919779241085, "rewards/accuracy_reward": 0.7276786118745804, "rewards/format_reward": 1.0, "step": 1972 }, { "completion_length": 395.40179443359375, "epoch": 0.1538791339426656, "grad_norm": 0.0896633430398015, "kl": 0.0022459030151367188, "learning_rate": 9.427017458604947e-07, "loss": 0.0001, "reward": 1.8816965222358704, "reward_std": 0.061852104030549526, "rewards/accuracy_reward": 0.8816964849829674, "rewards/format_reward": 1.0, "step": 1974 }, { "completion_length": 421.49332427978516, "epoch": 0.15403503985344844, "grad_norm": 0.10026251798905936, "kl": 0.0025548934936523438, "learning_rate": 9.42587856986851e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.0529796639457345, "rewards/accuracy_reward": 0.7455357536673546, "rewards/format_reward": 0.9977678656578064, "step": 1976 }, { "completion_length": 421.1651954650879, "epoch": 0.15419094576423129, "grad_norm": 0.12058637272832026, "kl": 0.0023937225341796875, "learning_rate": 9.424738619335499e-07, "loss": 0.0001, "reward": 1.6852679252624512, "reward_std": 0.06365517526865005, "rewards/accuracy_reward": 0.6852678954601288, "rewards/format_reward": 1.0, "step": 1978 }, { "completion_length": 426.64733505249023, "epoch": 0.15434685167501414, "grad_norm": 0.0746866659142969, "kl": 0.0025634765625, "learning_rate": 9.423597607279395e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.05493386276066303, "rewards/accuracy_reward": 0.7834821864962578, "rewards/format_reward": 1.0, "step": 1980 }, { "completion_length": 423.46430587768555, "epoch": 0.15450275758579698, "grad_norm": 0.08383051571202396, "kl": 0.0024576187133789062, "learning_rate": 9.422455533973933e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.9040178805589676, "rewards/format_reward": 1.0, "step": 1982 }, { "completion_length": 412.2276916503906, "epoch": 0.1546586634965798, "grad_norm": 0.06165794141717538, "kl": 0.0025482177734375, "learning_rate": 9.421312399693107e-07, "loss": 0.0001, "reward": 1.7299107760190964, "reward_std": 0.05816405266523361, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 1.0, "step": 1984 }, { "completion_length": 417.38841247558594, "epoch": 0.15481456940736266, "grad_norm": 0.06108910484951321, "kl": 0.0021190643310546875, "learning_rate": 9.420168204711159e-07, "loss": 0.0001, "reward": 1.8928572237491608, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.8928571864962578, "rewards/format_reward": 1.0, "step": 1986 }, { "completion_length": 424.58484649658203, "epoch": 0.1549704753181455, "grad_norm": 0.13037240828733632, "kl": 0.0027713775634765625, "learning_rate": 9.419022949302591e-07, "loss": 0.0001, "reward": 1.7924107760190964, "reward_std": 0.06057331059128046, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 1988 }, { "completion_length": 416.9843940734863, "epoch": 0.15512638122892833, "grad_norm": 0.07562559320421426, "kl": 0.0022172927856445312, "learning_rate": 9.417876633742154e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.026750151067972183, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 1990 }, { "completion_length": 426.40180587768555, "epoch": 0.15528228713971118, "grad_norm": 0.11484485174254085, "kl": 0.0025472640991210938, "learning_rate": 9.416729258304861e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.05816405080258846, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 1992 }, { "completion_length": 427.8593940734863, "epoch": 0.15543819305049403, "grad_norm": 0.12372015340880245, "kl": 0.002758026123046875, "learning_rate": 9.41558082326597e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.11016801930963993, "rewards/accuracy_reward": 0.7656250447034836, "rewards/format_reward": 1.0, "step": 1994 }, { "completion_length": 429.8281440734863, "epoch": 0.15559409896127688, "grad_norm": 0.11659764383793472, "kl": 0.0024805068969726562, "learning_rate": 9.414431328901e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.08469665795564651, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 1996 }, { "completion_length": 423.4018020629883, "epoch": 0.1557500048720597, "grad_norm": 0.0926825355843547, "kl": 0.0024318695068359375, "learning_rate": 9.413280775485724e-07, "loss": 0.0001, "reward": 1.8906250596046448, "reward_std": 0.047946405597031116, "rewards/accuracy_reward": 0.890625037252903, "rewards/format_reward": 1.0, "step": 1998 }, { "completion_length": 421.7924270629883, "epoch": 0.15590591078284255, "grad_norm": 0.13154331388632826, "kl": 0.003173828125, "learning_rate": 9.412129163296165e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.11453007534146309, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 2000 }, { "completion_length": 426.8393020629883, "epoch": 0.1560618166936254, "grad_norm": 0.09916124597627997, "kl": 0.0028972625732421875, "learning_rate": 9.4109764926086e-07, "loss": 0.0001, "reward": 1.8660715371370316, "reward_std": 0.08520756475627422, "rewards/accuracy_reward": 0.8683036118745804, "rewards/format_reward": 0.9977678656578064, "step": 2002 }, { "completion_length": 427.5000228881836, "epoch": 0.15621772260440825, "grad_norm": 0.10015135860938704, "kl": 0.0024328231811523438, "learning_rate": 9.409822763699566e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.04712266940623522, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 2004 }, { "completion_length": 434.76341247558594, "epoch": 0.15637362851519107, "grad_norm": 0.13251739345418728, "kl": 0.0030651092529296875, "learning_rate": 9.408667976845848e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.11730380356311798, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 2006 }, { "completion_length": 428.745548248291, "epoch": 0.15652953442597392, "grad_norm": 0.10302996097005644, "kl": 0.0025835037231445312, "learning_rate": 9.407512132324487e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.0823382930830121, "rewards/accuracy_reward": 0.8750000447034836, "rewards/format_reward": 0.9977678656578064, "step": 2008 }, { "completion_length": 440.0736770629883, "epoch": 0.15668544033675677, "grad_norm": 0.08169162320553378, "kl": 0.0027256011962890625, "learning_rate": 9.406355230412779e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.052153694443404675, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 2010 }, { "completion_length": 422.9620780944824, "epoch": 0.15684134624753962, "grad_norm": 0.1264472204620069, "kl": 0.0024871826171875, "learning_rate": 9.40519727138827e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.10385596752166748, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 0.9977678656578064, "step": 2012 }, { "completion_length": 423.01341247558594, "epoch": 0.15699725215832244, "grad_norm": 0.09913132382480133, "kl": 0.0026912689208984375, "learning_rate": 9.404038255528762e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.7500000260770321, "rewards/format_reward": 1.0, "step": 2014 }, { "completion_length": 421.1763610839844, "epoch": 0.1571531580691053, "grad_norm": 0.10247185329966256, "kl": 0.0026597976684570312, "learning_rate": 9.40287818311231e-07, "loss": 0.0001, "reward": 1.8325893878936768, "reward_std": 0.08146646898239851, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 2016 }, { "completion_length": 413.1495704650879, "epoch": 0.15730906397988814, "grad_norm": 0.13740695713225268, "kl": 0.0026884078979492188, "learning_rate": 9.401717054417227e-07, "loss": 0.0001, "reward": 1.8660715222358704, "reward_std": 0.11940996628254652, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 2018 }, { "completion_length": 411.10493087768555, "epoch": 0.157464969890671, "grad_norm": 0.07332593124153784, "kl": 0.002651214599609375, "learning_rate": 9.400554869722068e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 2020 }, { "completion_length": 424.4576110839844, "epoch": 0.1576208758014538, "grad_norm": 0.09552571123798034, "kl": 0.002521514892578125, "learning_rate": 9.399391629305655e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.05831882171332836, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 2022 }, { "completion_length": 424.12947845458984, "epoch": 0.15777678171223666, "grad_norm": 0.11586684314609669, "kl": 0.0026407241821289062, "learning_rate": 9.398227333447053e-07, "loss": 0.0001, "reward": 1.7790179550647736, "reward_std": 0.07756087370216846, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 2024 }, { "completion_length": 424.4531478881836, "epoch": 0.1579326876230195, "grad_norm": 0.11668031562429085, "kl": 0.0028018951416015625, "learning_rate": 9.397061982425585e-07, "loss": 0.0001, "reward": 1.8125001043081284, "reward_std": 0.08552403375506401, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 2026 }, { "completion_length": 429.97099685668945, "epoch": 0.15808859353380236, "grad_norm": 0.11420985672881934, "kl": 0.0025587081909179688, "learning_rate": 9.395895576520827e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.06643030419945717, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 0.9977678656578064, "step": 2028 }, { "completion_length": 421.7165336608887, "epoch": 0.15824449944458518, "grad_norm": 0.12004117310096368, "kl": 0.0026979446411132812, "learning_rate": 9.394728116012606e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.07484992034733295, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 2030 }, { "completion_length": 425.5669860839844, "epoch": 0.15840040535536803, "grad_norm": 0.12270209450042437, "kl": 0.002685546875, "learning_rate": 9.393559601181005e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.10122916381806135, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 0.9977678656578064, "step": 2032 }, { "completion_length": 410.1607322692871, "epoch": 0.15855631126615088, "grad_norm": 0.10679227726061878, "kl": 0.0025796890258789062, "learning_rate": 9.392390032306356e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 1.0, "step": 2034 }, { "completion_length": 424.17859268188477, "epoch": 0.15871221717693373, "grad_norm": 0.11370473192781336, "kl": 0.002773284912109375, "learning_rate": 9.391219409669249e-07, "loss": 0.0001, "reward": 1.8169643431901932, "reward_std": 0.07658517640084028, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 2036 }, { "completion_length": 435.72545623779297, "epoch": 0.15886812308771656, "grad_norm": 0.10007546902913615, "kl": 0.0026702880859375, "learning_rate": 9.39004773355052e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.05035426188260317, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 2038 }, { "completion_length": 424.8683204650879, "epoch": 0.1590240289984994, "grad_norm": 0.10654775639553514, "kl": 0.0029726028442382812, "learning_rate": 9.388875004231265e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.07484852150082588, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 2040 }, { "completion_length": 404.4620704650879, "epoch": 0.15917993490928226, "grad_norm": 0.1324383349058899, "kl": 0.004563331604003906, "learning_rate": 9.387701221992826e-07, "loss": 0.0002, "reward": 1.7924108058214188, "reward_std": 0.08942822925746441, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 2042 }, { "completion_length": 433.1741256713867, "epoch": 0.15933584082006508, "grad_norm": 0.1253684398865278, "kl": 0.0027980804443359375, "learning_rate": 9.386526387116804e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.11011428572237492, "rewards/accuracy_reward": 0.7678571753203869, "rewards/format_reward": 0.9977678656578064, "step": 2044 }, { "completion_length": 441.2455596923828, "epoch": 0.15949174673084793, "grad_norm": 0.11521787278186912, "kl": 0.002994537353515625, "learning_rate": 9.385350499885048e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.08680282346904278, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 1.0, "step": 2046 }, { "completion_length": 437.4174270629883, "epoch": 0.15964765264163078, "grad_norm": 0.11890058516493691, "kl": 0.0029697418212890625, "learning_rate": 9.38417356057966e-07, "loss": 0.0001, "reward": 1.7254465222358704, "reward_std": 0.07981676608324051, "rewards/accuracy_reward": 0.725446455180645, "rewards/format_reward": 1.0, "step": 2048 }, { "completion_length": 434.8035888671875, "epoch": 0.15980355855241363, "grad_norm": 0.10758307016950691, "kl": 0.0027618408203125, "learning_rate": 9.382995569482996e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.06319871265441179, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2050 }, { "completion_length": 428.57144927978516, "epoch": 0.15995946446319645, "grad_norm": 0.13173897794900719, "kl": 0.0026378631591796875, "learning_rate": 9.381816526877666e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.09799757227301598, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 2052 }, { "completion_length": 419.4285888671875, "epoch": 0.1601153703739793, "grad_norm": 0.11235444276088856, "kl": 0.00260162353515625, "learning_rate": 9.380636433046526e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.09033751115202904, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 1.0, "step": 2054 }, { "completion_length": 432.83707427978516, "epoch": 0.16027127628476215, "grad_norm": 0.13157473716825763, "kl": 0.0029430389404296875, "learning_rate": 9.37945528827269e-07, "loss": 0.0001, "reward": 1.705357238650322, "reward_std": 0.12151753529906273, "rewards/accuracy_reward": 0.7075893133878708, "rewards/format_reward": 0.9977678656578064, "step": 2056 }, { "completion_length": 425.16296768188477, "epoch": 0.160427182195545, "grad_norm": 0.056674530356194144, "kl": 0.0026721954345703125, "learning_rate": 9.378273092839521e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.04712267220020294, "rewards/accuracy_reward": 0.8772321790456772, "rewards/format_reward": 1.0, "step": 2058 }, { "completion_length": 419.73662185668945, "epoch": 0.16058308810632782, "grad_norm": 0.10415650371117288, "kl": 0.0031528472900390625, "learning_rate": 9.377089847030638e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.07237648405134678, "rewards/accuracy_reward": 0.8549107387661934, "rewards/format_reward": 1.0, "step": 2060 }, { "completion_length": 436.33707427978516, "epoch": 0.16073899401711067, "grad_norm": 0.09830422868775678, "kl": 0.0032711029052734375, "learning_rate": 9.375905551129907e-07, "loss": 0.0001, "reward": 1.6875000894069672, "reward_std": 0.0780815128237009, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 1.0, "step": 2062 }, { "completion_length": 432.267879486084, "epoch": 0.16089489992789352, "grad_norm": 0.1491390467948576, "kl": 0.0029449462890625, "learning_rate": 9.374720205421448e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.11693793442100286, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 2064 }, { "completion_length": 423.3437690734863, "epoch": 0.16105080583867637, "grad_norm": 0.11900654853150201, "kl": 0.00286102294921875, "learning_rate": 9.373533810189634e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.06395826954394579, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 0.9977678656578064, "step": 2066 }, { "completion_length": 420.69644927978516, "epoch": 0.1612067117494592, "grad_norm": 0.09257496360219929, "kl": 0.0026454925537109375, "learning_rate": 9.372346365719088e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.0892762616276741, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 2068 }, { "completion_length": 429.4910888671875, "epoch": 0.16136261766024204, "grad_norm": 0.13105378830143893, "kl": 0.0026750564575195312, "learning_rate": 9.371157872294686e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.09912804141640663, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 2070 }, { "completion_length": 415.9085006713867, "epoch": 0.1615185235710249, "grad_norm": 0.08590057511285863, "kl": 0.0025424957275390625, "learning_rate": 9.369968330201554e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.06155041232705116, "rewards/accuracy_reward": 0.7968750223517418, "rewards/format_reward": 1.0, "step": 2072 }, { "completion_length": 418.346004486084, "epoch": 0.16167442948180774, "grad_norm": 0.09321202840799621, "kl": 0.00231170654296875, "learning_rate": 9.368777739725074e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.7857143059372902, "rewards/format_reward": 1.0, "step": 2074 }, { "completion_length": 423.9576110839844, "epoch": 0.16183033539259056, "grad_norm": 0.13305760668109243, "kl": 0.0028486251831054688, "learning_rate": 9.367586101150873e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.09301240369677544, "rewards/accuracy_reward": 0.7700893059372902, "rewards/format_reward": 0.9977678656578064, "step": 2076 }, { "completion_length": 421.2500190734863, "epoch": 0.1619862413033734, "grad_norm": 0.14819936796819386, "kl": 0.0026826858520507812, "learning_rate": 9.366393414764834e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.09296431206166744, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 2078 }, { "completion_length": 414.4062690734863, "epoch": 0.16214214721415626, "grad_norm": 0.096175227568584, "kl": 0.0025949478149414062, "learning_rate": 9.36519968085309e-07, "loss": 0.0001, "reward": 1.7745536267757416, "reward_std": 0.07499824743717909, "rewards/accuracy_reward": 0.7745535969734192, "rewards/format_reward": 1.0, "step": 2080 }, { "completion_length": 415.79019927978516, "epoch": 0.1622980531249391, "grad_norm": 0.08706524975188874, "kl": 0.0025148391723632812, "learning_rate": 9.364004899702025e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.07042145077139139, "rewards/accuracy_reward": 0.790178619325161, "rewards/format_reward": 1.0, "step": 2082 }, { "completion_length": 416.3013572692871, "epoch": 0.16245395903572193, "grad_norm": 0.07344377801415095, "kl": 0.0025758743286132812, "learning_rate": 9.362809071598274e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 2084 }, { "completion_length": 416.5625190734863, "epoch": 0.16260986494650478, "grad_norm": 0.10271916368828563, "kl": 0.002490997314453125, "learning_rate": 9.361612196828726e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.05298106465488672, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 2086 }, { "completion_length": 436.7634162902832, "epoch": 0.16276577085728763, "grad_norm": 0.12857632578290343, "kl": 0.0031890869140625, "learning_rate": 9.360414275680518e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.07612871658056974, "rewards/accuracy_reward": 0.7611607611179352, "rewards/format_reward": 1.0, "step": 2088 }, { "completion_length": 414.90403747558594, "epoch": 0.16292167676807046, "grad_norm": 0.10591601093714802, "kl": 0.0026988983154296875, "learning_rate": 9.359215308441039e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.08116337563842535, "rewards/accuracy_reward": 0.7232143133878708, "rewards/format_reward": 1.0, "step": 2090 }, { "completion_length": 421.85939025878906, "epoch": 0.1630775826788533, "grad_norm": 0.13303352617330733, "kl": 0.002765655517578125, "learning_rate": 9.358015295397928e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.08973272517323494, "rewards/accuracy_reward": 0.7834821864962578, "rewards/format_reward": 1.0, "step": 2092 }, { "completion_length": 415.76118087768555, "epoch": 0.16323348858963616, "grad_norm": 0.11703047021155064, "kl": 0.00249481201171875, "learning_rate": 9.356814236839075e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.0787541177123785, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 2094 }, { "completion_length": 424.3125228881836, "epoch": 0.163389394500419, "grad_norm": 0.07629502544537048, "kl": 0.00266265869140625, "learning_rate": 9.355612133052624e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.05786095838993788, "rewards/accuracy_reward": 0.7053571790456772, "rewards/format_reward": 1.0, "step": 2096 }, { "completion_length": 414.95984268188477, "epoch": 0.16354530041120183, "grad_norm": 0.11757635908555758, "kl": 0.0025615692138671875, "learning_rate": 9.354408984326967e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.06447890773415565, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 2098 }, { "completion_length": 422.9107360839844, "epoch": 0.16370120632198468, "grad_norm": 0.08175601505226011, "kl": 0.002532958984375, "learning_rate": 9.353204790950745e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.08168401569128036, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 1.0, "step": 2100 }, { "completion_length": 426.82144927978516, "epoch": 0.16385711223276753, "grad_norm": 0.12300763838462789, "kl": 0.002933502197265625, "learning_rate": 9.351999553212853e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.07387282233685255, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 2102 }, { "completion_length": 423.46207427978516, "epoch": 0.16401301814355038, "grad_norm": 0.09614891548344884, "kl": 0.0027675628662109375, "learning_rate": 9.350793271402437e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 0.9977678656578064, "step": 2104 }, { "completion_length": 421.6919860839844, "epoch": 0.1641689240543332, "grad_norm": 0.004023506212243608, "kl": 0.002658843994140625, "learning_rate": 9.349585945808889e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 2106 }, { "completion_length": 432.50001525878906, "epoch": 0.16432482996511605, "grad_norm": 0.07640749297127723, "kl": 0.0029850006103515625, "learning_rate": 9.348377576721855e-07, "loss": 0.0001, "reward": 1.8169643878936768, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 2108 }, { "completion_length": 429.06028747558594, "epoch": 0.1644807358758989, "grad_norm": 0.10594230024801869, "kl": 0.0027942657470703125, "learning_rate": 9.347168164431232e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.07740750908851624, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 2110 }, { "completion_length": 417.2076072692871, "epoch": 0.16463664178668175, "grad_norm": 0.13688887636723782, "kl": 0.002758026123046875, "learning_rate": 9.345957709227163e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.10641719028353691, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 2112 }, { "completion_length": 427.1651916503906, "epoch": 0.16479254769746457, "grad_norm": 0.10624527921873297, "kl": 0.0027828216552734375, "learning_rate": 9.344746211400047e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.0748499259352684, "rewards/accuracy_reward": 0.7343750335276127, "rewards/format_reward": 1.0, "step": 2114 }, { "completion_length": 405.04912185668945, "epoch": 0.16494845360824742, "grad_norm": 0.09992230656244792, "kl": 0.0025424957275390625, "learning_rate": 9.343533671240527e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.06463087350130081, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 2116 }, { "completion_length": 417.07368087768555, "epoch": 0.16510435951903027, "grad_norm": 0.0776488605645175, "kl": 0.0028247833251953125, "learning_rate": 9.3423200890395e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.048620409332215786, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 2118 }, { "completion_length": 428.68305587768555, "epoch": 0.16526026542981312, "grad_norm": 0.13199717495314067, "kl": 0.002895355224609375, "learning_rate": 9.341105465088115e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.0787541177123785, "rewards/accuracy_reward": 0.7589285932481289, "rewards/format_reward": 1.0, "step": 2120 }, { "completion_length": 424.52234649658203, "epoch": 0.16541617134059594, "grad_norm": 0.09734117788014415, "kl": 0.0027065277099609375, "learning_rate": 9.339889799677763e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.8571428805589676, "rewards/format_reward": 1.0, "step": 2122 }, { "completion_length": 426.35046768188477, "epoch": 0.1655720772513788, "grad_norm": 0.14681176102261753, "kl": 0.0030994415283203125, "learning_rate": 9.338673093100096e-07, "loss": 0.0001, "reward": 1.756696492433548, "reward_std": 0.13444249890744686, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 0.9977678656578064, "step": 2124 }, { "completion_length": 403.9464454650879, "epoch": 0.16572798316216164, "grad_norm": 0.10252598605314692, "kl": 0.002655029296875, "learning_rate": 9.337455345647003e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.06493396684527397, "rewards/accuracy_reward": 0.7790179029107094, "rewards/format_reward": 1.0, "step": 2126 }, { "completion_length": 412.5803756713867, "epoch": 0.1658838890729445, "grad_norm": 0.05727043138273751, "kl": 0.0027408599853515625, "learning_rate": 9.336236557610635e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 2128 }, { "completion_length": 428.3727912902832, "epoch": 0.1660397949837273, "grad_norm": 0.09382302430364163, "kl": 0.0029687881469726562, "learning_rate": 9.335016729283383e-07, "loss": 0.0001, "reward": 1.7700893878936768, "reward_std": 0.048099770210683346, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 1.0, "step": 2130 }, { "completion_length": 416.0848388671875, "epoch": 0.16619570089451016, "grad_norm": 0.11892508771201778, "kl": 0.0026445388793945312, "learning_rate": 9.333795860957896e-07, "loss": 0.0001, "reward": 1.8995536416769028, "reward_std": 0.05688526015728712, "rewards/accuracy_reward": 0.8995536118745804, "rewards/format_reward": 1.0, "step": 2132 }, { "completion_length": 432.846004486084, "epoch": 0.166351606805293, "grad_norm": 0.1256254758919404, "kl": 0.003040313720703125, "learning_rate": 9.332573952927063e-07, "loss": 0.0001, "reward": 1.8437501043081284, "reward_std": 0.08845252729952335, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 2134 }, { "completion_length": 421.7567138671875, "epoch": 0.16650751271607586, "grad_norm": 0.10065295760551148, "kl": 0.0027217864990234375, "learning_rate": 9.331351005484032e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.08018627669662237, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 1.0, "step": 2136 }, { "completion_length": 418.81252670288086, "epoch": 0.16666341862685868, "grad_norm": 0.06982504182138242, "kl": 0.002590179443359375, "learning_rate": 9.330127018922193e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 2138 }, { "completion_length": 419.0201110839844, "epoch": 0.16681932453764153, "grad_norm": 0.07195924728911518, "kl": 0.0024814605712890625, "learning_rate": 9.32890199353519e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.04111231118440628, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 2140 }, { "completion_length": 410.65403747558594, "epoch": 0.16697523044842438, "grad_norm": 0.09859841539929218, "kl": 0.002838134765625, "learning_rate": 9.327675929616913e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.05493246391415596, "rewards/accuracy_reward": 0.852678619325161, "rewards/format_reward": 1.0, "step": 2142 }, { "completion_length": 432.9977912902832, "epoch": 0.1671311363592072, "grad_norm": 0.11475314456904953, "kl": 0.00290679931640625, "learning_rate": 9.326448827461503e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2144 }, { "completion_length": 425.93305587768555, "epoch": 0.16728704226999005, "grad_norm": 0.08917526571657555, "kl": 0.0027894973754882812, "learning_rate": 9.32522068736335e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.055236958898603916, "rewards/accuracy_reward": 0.8392857611179352, "rewards/format_reward": 1.0, "step": 2146 }, { "completion_length": 415.43305587768555, "epoch": 0.1674429481807729, "grad_norm": 0.05158642141112666, "kl": 0.0026426315307617188, "learning_rate": 9.323991509617093e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.04599360469728708, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 2148 }, { "completion_length": 419.8370704650879, "epoch": 0.16759885409155575, "grad_norm": 0.14666945111519666, "kl": 0.0029926300048828125, "learning_rate": 9.322761294517618e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.08732346631586552, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 2150 }, { "completion_length": 423.1027030944824, "epoch": 0.16775476000233858, "grad_norm": 0.08028138410369225, "kl": 0.002841949462890625, "learning_rate": 9.321530042360064e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.05604815576225519, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 0.9977678656578064, "step": 2152 }, { "completion_length": 439.8995704650879, "epoch": 0.16791066591312143, "grad_norm": 0.10532617965604346, "kl": 0.00305938720703125, "learning_rate": 9.320297753439814e-07, "loss": 0.0001, "reward": 1.8593750894069672, "reward_std": 0.07853797450661659, "rewards/accuracy_reward": 0.8593750149011612, "rewards/format_reward": 1.0, "step": 2154 }, { "completion_length": 421.2053756713867, "epoch": 0.16806657182390428, "grad_norm": 0.12073348392287167, "kl": 0.0029087066650390625, "learning_rate": 9.319064428052503e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.07192142866551876, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 2156 }, { "completion_length": 421.59599685668945, "epoch": 0.16822247773468713, "grad_norm": 0.1409279993196755, "kl": 0.0028123855590820312, "learning_rate": 9.317830066494013e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.08485142607241869, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 2158 }, { "completion_length": 442.1875190734863, "epoch": 0.16837838364546995, "grad_norm": 0.13532349364949234, "kl": 0.0028667449951171875, "learning_rate": 9.316594669060477e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.1010757964104414, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 2160 }, { "completion_length": 439.0937690734863, "epoch": 0.1685342895562528, "grad_norm": 0.11136816489870495, "kl": 0.0027828216552734375, "learning_rate": 9.315358236048273e-07, "loss": 0.0001, "reward": 1.76116082072258, "reward_std": 0.067136125639081, "rewards/accuracy_reward": 0.7633928805589676, "rewards/format_reward": 0.9977678656578064, "step": 2162 }, { "completion_length": 423.5044860839844, "epoch": 0.16869019546703565, "grad_norm": 0.10602241066416897, "kl": 0.00270843505859375, "learning_rate": 9.314120767754029e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.06282780412584543, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 2164 }, { "completion_length": 432.8460006713867, "epoch": 0.1688461013778185, "grad_norm": 0.13565884065276113, "kl": 0.00299835205078125, "learning_rate": 9.312882264474622e-07, "loss": 0.0001, "reward": 1.7254465073347092, "reward_std": 0.09995036851614714, "rewards/accuracy_reward": 0.7276785857975483, "rewards/format_reward": 0.9977678656578064, "step": 2166 }, { "completion_length": 410.8861770629883, "epoch": 0.16900200728860132, "grad_norm": 0.08135225010673201, "kl": 0.0029973983764648438, "learning_rate": 9.311642726507179e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.04065585229545832, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 2168 }, { "completion_length": 434.2611770629883, "epoch": 0.16915791319938417, "grad_norm": 0.10093726068921059, "kl": 0.0027103424072265625, "learning_rate": 9.310402154149071e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.08994886465370655, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 2170 }, { "completion_length": 427.2901916503906, "epoch": 0.16931381911016702, "grad_norm": 0.11500900889306954, "kl": 0.0025386810302734375, "learning_rate": 9.309160547697919e-07, "loss": 0.0001, "reward": 1.8660714775323868, "reward_std": 0.05020089540630579, "rewards/accuracy_reward": 0.868303582072258, "rewards/format_reward": 0.9977678656578064, "step": 2172 }, { "completion_length": 414.5178756713867, "epoch": 0.16946972502094987, "grad_norm": 0.07602789150702721, "kl": 0.0025243759155273438, "learning_rate": 9.30791790745159e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.06643030513077974, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 2174 }, { "completion_length": 416.8750228881836, "epoch": 0.1696256309317327, "grad_norm": 0.05138802138068495, "kl": 0.0026445388793945312, "learning_rate": 9.306674233708207e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.03937705885618925, "rewards/accuracy_reward": 0.8839285969734192, "rewards/format_reward": 1.0, "step": 2176 }, { "completion_length": 442.9218940734863, "epoch": 0.16978153684251554, "grad_norm": 0.0774496947291255, "kl": 0.0032405853271484375, "learning_rate": 9.305429526766132e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.07710441388189793, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 2178 }, { "completion_length": 430.02903747558594, "epoch": 0.1699374427532984, "grad_norm": 0.06984103109822332, "kl": 0.00286102294921875, "learning_rate": 9.304183786923979e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.07320021744817495, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 0.9977678656578064, "step": 2180 }, { "completion_length": 419.6495666503906, "epoch": 0.17009334866408124, "grad_norm": 0.1167349370605733, "kl": 0.002620697021484375, "learning_rate": 9.302937014480608e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.0635032095015049, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 2182 }, { "completion_length": 410.0067138671875, "epoch": 0.17024925457486406, "grad_norm": 0.09445674693300266, "kl": 0.003692626953125, "learning_rate": 9.301689209735127e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.0771044148132205, "rewards/accuracy_reward": 0.7767857685685158, "rewards/format_reward": 1.0, "step": 2184 }, { "completion_length": 416.5379638671875, "epoch": 0.1704051604856469, "grad_norm": 0.05250013771796101, "kl": 0.002529144287109375, "learning_rate": 9.300440372986894e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.04861900769174099, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 2186 }, { "completion_length": 421.8683166503906, "epoch": 0.17056106639642976, "grad_norm": 0.14902811680101047, "kl": 0.0028896331787109375, "learning_rate": 9.299190504535513e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.1101694218814373, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 2188 }, { "completion_length": 417.0580520629883, "epoch": 0.1707169723072126, "grad_norm": 0.04634756771321838, "kl": 0.00270843505859375, "learning_rate": 9.297939604680835e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.03870445676147938, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 1.0, "step": 2190 }, { "completion_length": 417.0714454650879, "epoch": 0.17087287821799543, "grad_norm": 0.08370668764779685, "kl": 0.0025262832641601562, "learning_rate": 9.296687673722958e-07, "loss": 0.0001, "reward": 1.7388393729925156, "reward_std": 0.08927766233682632, "rewards/accuracy_reward": 0.7388393059372902, "rewards/format_reward": 1.0, "step": 2192 }, { "completion_length": 417.5714416503906, "epoch": 0.17102878412877828, "grad_norm": 0.10462173767517285, "kl": 0.0029449462890625, "learning_rate": 9.295434711962229e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.06170237809419632, "rewards/accuracy_reward": 0.727678619325161, "rewards/format_reward": 1.0, "step": 2194 }, { "completion_length": 424.7522506713867, "epoch": 0.17118469003956113, "grad_norm": 0.09257780936180539, "kl": 0.0025720596313476562, "learning_rate": 9.294180719699243e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.06365517247468233, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 0.9977678656578064, "step": 2196 }, { "completion_length": 421.87279510498047, "epoch": 0.17134059595034395, "grad_norm": 0.07261495155649737, "kl": 0.0028409957885742188, "learning_rate": 9.292925697234837e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.04614697303622961, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 2198 }, { "completion_length": 427.8348388671875, "epoch": 0.1714965018611268, "grad_norm": 0.08458163375457417, "kl": 0.0029582977294921875, "learning_rate": 9.291669644870103e-07, "loss": 0.0001, "reward": 1.8950893580913544, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8950893208384514, "rewards/format_reward": 1.0, "step": 2200 }, { "completion_length": 434.9910888671875, "epoch": 0.17165240777190965, "grad_norm": 0.12207480002044316, "kl": 0.0032405853271484375, "learning_rate": 9.290412562906373e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.06981526408344507, "rewards/accuracy_reward": 0.8482143357396126, "rewards/format_reward": 1.0, "step": 2202 }, { "completion_length": 415.9509048461914, "epoch": 0.1718083136826925, "grad_norm": 0.10766323973698858, "kl": 0.00283050537109375, "learning_rate": 9.289154451645233e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 2204 }, { "completion_length": 422.23216247558594, "epoch": 0.17196421959347533, "grad_norm": 0.10685676378310936, "kl": 0.0028057098388671875, "learning_rate": 9.287895311388507e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.06417441088706255, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 2206 }, { "completion_length": 424.4776916503906, "epoch": 0.17212012550425818, "grad_norm": 0.11962733092402814, "kl": 0.0030574798583984375, "learning_rate": 9.286635142438273e-07, "loss": 0.0001, "reward": 1.80803582072258, "reward_std": 0.09769588150084019, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 2208 }, { "completion_length": 423.19421768188477, "epoch": 0.17227603141504103, "grad_norm": 0.10045876910553597, "kl": 0.0027141571044921875, "learning_rate": 9.285373945096852e-07, "loss": 0.0001, "reward": 1.7656250596046448, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.7656250447034836, "rewards/format_reward": 1.0, "step": 2210 }, { "completion_length": 422.7210006713867, "epoch": 0.17243193732582388, "grad_norm": 0.11394012987634601, "kl": 0.0027008056640625, "learning_rate": 9.284111719666816e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.07823488209396601, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 1.0, "step": 2212 }, { "completion_length": 411.5826072692871, "epoch": 0.1725878432366067, "grad_norm": 0.10453491495604783, "kl": 0.0025787353515625, "learning_rate": 9.282848466450981e-07, "loss": 0.0001, "reward": 1.7388393878936768, "reward_std": 0.06365517526865005, "rewards/accuracy_reward": 0.7388393133878708, "rewards/format_reward": 1.0, "step": 2214 }, { "completion_length": 423.24778747558594, "epoch": 0.17274374914738955, "grad_norm": 0.10920723237004752, "kl": 0.00258636474609375, "learning_rate": 9.281584185752407e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.06124591547995806, "rewards/accuracy_reward": 0.8593750298023224, "rewards/format_reward": 1.0, "step": 2216 }, { "completion_length": 441.7656440734863, "epoch": 0.1728996550581724, "grad_norm": 0.1194007703758728, "kl": 0.0028362274169921875, "learning_rate": 9.280318877874405e-07, "loss": 0.0001, "reward": 1.7946429550647736, "reward_std": 0.08567516319453716, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 2218 }, { "completion_length": 429.6763572692871, "epoch": 0.17305556096895525, "grad_norm": 0.06420287100015964, "kl": 0.0028352737426757812, "learning_rate": 9.27905254312053e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 0.9977678656578064, "step": 2220 }, { "completion_length": 407.4241256713867, "epoch": 0.17321146687973807, "grad_norm": 0.09843449334382842, "kl": 0.0026493072509765625, "learning_rate": 9.277785181794583e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.8526785969734192, "rewards/format_reward": 1.0, "step": 2222 }, { "completion_length": 427.111629486084, "epoch": 0.17336737279052092, "grad_norm": 0.1005291431010626, "kl": 0.0027446746826171875, "learning_rate": 9.276516794200613e-07, "loss": 0.0001, "reward": 1.830357238650322, "reward_std": 0.05636462289839983, "rewards/accuracy_reward": 0.8325893357396126, "rewards/format_reward": 0.9977678656578064, "step": 2224 }, { "completion_length": 424.4352836608887, "epoch": 0.17352327870130377, "grad_norm": 0.13670920102202008, "kl": 0.0030107498168945312, "learning_rate": 9.275247380642915e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.09476598352193832, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 2226 }, { "completion_length": 446.9442138671875, "epoch": 0.17367918461208662, "grad_norm": 0.09333137060835092, "kl": 0.003337860107421875, "learning_rate": 9.273976941426026e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.0853720661252737, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 2228 }, { "completion_length": 417.0558204650879, "epoch": 0.17383509052286944, "grad_norm": 0.08203764529514497, "kl": 0.0026798248291015625, "learning_rate": 9.272705476854737e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.0771044148132205, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2230 }, { "completion_length": 421.392879486084, "epoch": 0.1739909964336523, "grad_norm": 0.11665567931130469, "kl": 0.0023717880249023438, "learning_rate": 9.271432987234077e-07, "loss": 0.0001, "reward": 1.8928572237491608, "reward_std": 0.05553865246474743, "rewards/accuracy_reward": 0.8928571939468384, "rewards/format_reward": 1.0, "step": 2232 }, { "completion_length": 425.87278747558594, "epoch": 0.17414690234443514, "grad_norm": 0.15050147841548983, "kl": 0.0028781890869140625, "learning_rate": 9.270159472869326e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.10025346651673317, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 2234 }, { "completion_length": 406.0468864440918, "epoch": 0.174302808255218, "grad_norm": 0.03470323635967946, "kl": 0.0024814605712890625, "learning_rate": 9.26888493406601e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 2236 }, { "completion_length": 407.45091247558594, "epoch": 0.1744587141660008, "grad_norm": 0.06811621526930593, "kl": 0.0025835037231445312, "learning_rate": 9.267609371129895e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 2238 }, { "completion_length": 414.0669822692871, "epoch": 0.17461462007678366, "grad_norm": 0.11831864469503615, "kl": 0.0029239654541015625, "learning_rate": 9.266332784366999e-07, "loss": 0.0001, "reward": 1.8593750894069672, "reward_std": 0.07484992314130068, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 1.0, "step": 2240 }, { "completion_length": 423.59153747558594, "epoch": 0.1747705259875665, "grad_norm": 0.09876736242756792, "kl": 0.002620697021484375, "learning_rate": 9.265055174083582e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.06545600760728121, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 2242 }, { "completion_length": 434.9040336608887, "epoch": 0.17492643189834933, "grad_norm": 0.08117110733793072, "kl": 0.0025339126586914062, "learning_rate": 9.263776540586155e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.050573207437992096, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 2244 }, { "completion_length": 425.4241256713867, "epoch": 0.17508233780913218, "grad_norm": 0.06025727204544234, "kl": 0.002803802490234375, "learning_rate": 9.262496884181464e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.05553864873945713, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 2246 }, { "completion_length": 439.5067138671875, "epoch": 0.17523824371991503, "grad_norm": 0.10338437114345647, "kl": 0.0028753280639648438, "learning_rate": 9.261216205176511e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.07289712596684694, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 2248 }, { "completion_length": 417.2232322692871, "epoch": 0.17539414963069788, "grad_norm": 0.04675507032017657, "kl": 0.0025224685668945312, "learning_rate": 9.259934503878539e-07, "loss": 0.0001, "reward": 1.8794643580913544, "reward_std": 0.025100448168814182, "rewards/accuracy_reward": 0.8794643208384514, "rewards/format_reward": 1.0, "step": 2250 }, { "completion_length": 439.46207427978516, "epoch": 0.1755500555414807, "grad_norm": 0.10557022142962995, "kl": 0.0032587051391601562, "learning_rate": 9.258651780595037e-07, "loss": 0.0001, "reward": 1.7075893580913544, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.7075893059372902, "rewards/format_reward": 1.0, "step": 2252 }, { "completion_length": 421.75894927978516, "epoch": 0.17570596145226355, "grad_norm": 0.07363279627803798, "kl": 0.00260162353515625, "learning_rate": 9.257368035633734e-07, "loss": 0.0001, "reward": 1.832589328289032, "reward_std": 0.06883956305682659, "rewards/accuracy_reward": 0.8348214477300644, "rewards/format_reward": 0.9977678656578064, "step": 2254 }, { "completion_length": 439.2477836608887, "epoch": 0.1758618673630464, "grad_norm": 0.06741977496871411, "kl": 0.0029888153076171875, "learning_rate": 9.256083269302612e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.07499965280294418, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 2256 }, { "completion_length": 430.3169746398926, "epoch": 0.17601777327382925, "grad_norm": 0.09822559267994527, "kl": 0.0029382705688476562, "learning_rate": 9.254797481909897e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.7924107387661934, "rewards/format_reward": 1.0, "step": 2258 }, { "completion_length": 427.908504486084, "epoch": 0.17617367918461208, "grad_norm": 0.14927651527010716, "kl": 0.0029964447021484375, "learning_rate": 9.253510673764054e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.10235822666436434, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 2260 }, { "completion_length": 417.846004486084, "epoch": 0.17632958509539493, "grad_norm": 0.12384850435806741, "kl": 0.0028553009033203125, "learning_rate": 9.252222845173795e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.08244216814637184, "rewards/accuracy_reward": 0.725446455180645, "rewards/format_reward": 0.9977678656578064, "step": 2262 }, { "completion_length": 426.7031440734863, "epoch": 0.17648549100617777, "grad_norm": 0.04654743621087414, "kl": 0.0025997161865234375, "learning_rate": 9.250933996448083e-07, "loss": 0.0001, "reward": 1.9062500447034836, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.9062500298023224, "rewards/format_reward": 1.0, "step": 2264 }, { "completion_length": 426.6160888671875, "epoch": 0.17664139691696062, "grad_norm": 0.1270001624205532, "kl": 0.0026493072509765625, "learning_rate": 9.249644127896116e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 2266 }, { "completion_length": 416.5870704650879, "epoch": 0.17679730282774345, "grad_norm": 0.07949016177347885, "kl": 0.0031824111938476562, "learning_rate": 9.248353239827348e-07, "loss": 0.0001, "reward": 1.7075893580913544, "reward_std": 0.05636602267622948, "rewards/accuracy_reward": 0.7075893059372902, "rewards/format_reward": 1.0, "step": 2268 }, { "completion_length": 437.9018020629883, "epoch": 0.1769532087385263, "grad_norm": 0.10147189736115496, "kl": 0.0029392242431640625, "learning_rate": 9.247061332551467e-07, "loss": 0.0001, "reward": 1.7745536714792252, "reward_std": 0.0641744127497077, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 2270 }, { "completion_length": 431.42189025878906, "epoch": 0.17710911464930915, "grad_norm": 0.14124084314782526, "kl": 0.0027704238891601562, "learning_rate": 9.24576840637841e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.09018414467573166, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 1.0, "step": 2272 }, { "completion_length": 445.0156478881836, "epoch": 0.177265020560092, "grad_norm": 0.10365139353597778, "kl": 0.0030498504638671875, "learning_rate": 9.244474461618356e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.07079096138477325, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 2274 }, { "completion_length": 416.6451072692871, "epoch": 0.17742092647087482, "grad_norm": 0.04393697214116329, "kl": 0.002498626708984375, "learning_rate": 9.243179498581738e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.037424261681735516, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 2276 }, { "completion_length": 415.72993087768555, "epoch": 0.17757683238165767, "grad_norm": 0.08225848851053794, "kl": 0.002613067626953125, "learning_rate": 9.24188351757922e-07, "loss": 0.0001, "reward": 1.8861607611179352, "reward_std": 0.033063605427742004, "rewards/accuracy_reward": 0.8861607387661934, "rewards/format_reward": 1.0, "step": 2278 }, { "completion_length": 418.86609268188477, "epoch": 0.17773273829244052, "grad_norm": 0.07762909700468658, "kl": 0.002658843994140625, "learning_rate": 9.240586518921717e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.03803044930100441, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 2280 }, { "completion_length": 418.96653747558594, "epoch": 0.17788864420322337, "grad_norm": 0.07138215119164924, "kl": 0.0025539398193359375, "learning_rate": 9.239288502920389e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 2282 }, { "completion_length": 420.433048248291, "epoch": 0.1780445501140062, "grad_norm": 0.10614464075717238, "kl": 0.00289154052734375, "learning_rate": 9.237989469886636e-07, "loss": 0.0001, "reward": 1.7589286267757416, "reward_std": 0.10062437690794468, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 2284 }, { "completion_length": 421.8817138671875, "epoch": 0.17820045602478904, "grad_norm": 0.10895530107185292, "kl": 0.002655029296875, "learning_rate": 9.236689420132105e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.05764481518417597, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 2286 }, { "completion_length": 421.0357360839844, "epoch": 0.1783563619355719, "grad_norm": 0.25688462908706505, "kl": 0.003398895263671875, "learning_rate": 9.235388353968687e-07, "loss": 0.0001, "reward": 1.7455357760190964, "reward_std": 0.11663483921438456, "rewards/accuracy_reward": 0.7477679029107094, "rewards/format_reward": 0.9977678656578064, "step": 2288 }, { "completion_length": 418.01787185668945, "epoch": 0.17851226784635474, "grad_norm": 0.0911838615899681, "kl": 0.0024242401123046875, "learning_rate": 9.234086271708517e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.0478926682844758, "rewards/accuracy_reward": 0.8437500521540642, "rewards/format_reward": 0.9977678656578064, "step": 2290 }, { "completion_length": 411.4040336608887, "epoch": 0.17866817375713756, "grad_norm": 0.10642571583066951, "kl": 0.00247955322265625, "learning_rate": 9.23278317366397e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.07710441388189793, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 2292 }, { "completion_length": 423.6071548461914, "epoch": 0.1788240796679204, "grad_norm": 0.0741248972792698, "kl": 0.0028133392333984375, "learning_rate": 9.231479060147671e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 2294 }, { "completion_length": 425.03126525878906, "epoch": 0.17897998557870326, "grad_norm": 0.09452241173349628, "kl": 0.0025081634521484375, "learning_rate": 9.230173931472483e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 2296 }, { "completion_length": 419.5201110839844, "epoch": 0.17913589148948608, "grad_norm": 0.06621129412057487, "kl": 0.0028066635131835938, "learning_rate": 9.228867787951513e-07, "loss": 0.0001, "reward": 1.8973214775323868, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.897321455180645, "rewards/format_reward": 1.0, "step": 2298 }, { "completion_length": 419.6964416503906, "epoch": 0.17929179740026893, "grad_norm": 0.09639110266833224, "kl": 0.003017425537109375, "learning_rate": 9.227560629898118e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.07823487743735313, "rewards/accuracy_reward": 0.8437500223517418, "rewards/format_reward": 1.0, "step": 2300 }, { "completion_length": 414.09823989868164, "epoch": 0.17944770331105178, "grad_norm": 0.061890587341475164, "kl": 0.0028476715087890625, "learning_rate": 9.226252457625889e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 2302 }, { "completion_length": 426.17412185668945, "epoch": 0.17960360922183463, "grad_norm": 0.12350519355161607, "kl": 0.0029315948486328125, "learning_rate": 9.224943271448669e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.0907084196805954, "rewards/accuracy_reward": 0.7455357611179352, "rewards/format_reward": 1.0, "step": 2304 }, { "completion_length": 417.2433204650879, "epoch": 0.17975951513261745, "grad_norm": 0.1119075909163072, "kl": 0.00249481201171875, "learning_rate": 9.223633071680537e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.03141390159726143, "rewards/accuracy_reward": 0.8281250223517418, "rewards/format_reward": 1.0, "step": 2306 }, { "completion_length": 417.5245704650879, "epoch": 0.1799154210434003, "grad_norm": 0.10723132985169112, "kl": 0.0028142929077148438, "learning_rate": 9.222321858635818e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.09101151395589113, "rewards/accuracy_reward": 0.7433036044239998, "rewards/format_reward": 1.0, "step": 2308 }, { "completion_length": 416.4486770629883, "epoch": 0.18007132695418315, "grad_norm": 0.057372319489231195, "kl": 0.0026149749755859375, "learning_rate": 9.221009632629082e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.08439496625214815, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 0.9977678656578064, "step": 2310 }, { "completion_length": 424.3080520629883, "epoch": 0.180227232864966, "grad_norm": 0.10088327804422338, "kl": 0.0026226043701171875, "learning_rate": 9.219696393975142e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.08469806425273418, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 2312 }, { "completion_length": 418.6093940734863, "epoch": 0.18038313877574882, "grad_norm": 0.0950361391962726, "kl": 0.00296783447265625, "learning_rate": 9.218382142989048e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.06688676495105028, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 1.0, "step": 2314 }, { "completion_length": 431.94421768188477, "epoch": 0.18053904468653167, "grad_norm": 0.10710223478143639, "kl": 0.0034084320068359375, "learning_rate": 9.2170668799861e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.10400933679193258, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 2316 }, { "completion_length": 425.8169822692871, "epoch": 0.18069495059731452, "grad_norm": 0.12984840739276146, "kl": 0.0029239654541015625, "learning_rate": 9.215750605281839e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.07499965000897646, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 2318 }, { "completion_length": 425.04466247558594, "epoch": 0.18085085650809737, "grad_norm": 0.09117306890612092, "kl": 0.0028524398803710938, "learning_rate": 9.214433319192045e-07, "loss": 0.0001, "reward": 1.8772322237491608, "reward_std": 0.055388920940458775, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 2320 }, { "completion_length": 435.7076072692871, "epoch": 0.1810067624188802, "grad_norm": 0.05221596034242439, "kl": 0.00322723388671875, "learning_rate": 9.213115022032745e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.04922519624233246, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 2322 }, { "completion_length": 425.4218940734863, "epoch": 0.18116266832966305, "grad_norm": 0.12914279823178795, "kl": 0.0029230117797851562, "learning_rate": 9.211795714120207e-07, "loss": 0.0001, "reward": 1.7008929550647736, "reward_std": 0.053282758221030235, "rewards/accuracy_reward": 0.7053571864962578, "rewards/format_reward": 0.9955357313156128, "step": 2324 }, { "completion_length": 432.6964530944824, "epoch": 0.1813185742404459, "grad_norm": 0.1163619720077535, "kl": 0.0028743743896484375, "learning_rate": 9.210475395770942e-07, "loss": 0.0001, "reward": 1.725446492433548, "reward_std": 0.08101141266524792, "rewards/accuracy_reward": 0.7254464700818062, "rewards/format_reward": 1.0, "step": 2326 }, { "completion_length": 415.66072845458984, "epoch": 0.18147448015122875, "grad_norm": 0.10256487612778294, "kl": 0.0028705596923828125, "learning_rate": 9.209154067301701e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 2328 }, { "completion_length": 421.3839416503906, "epoch": 0.18163038606201157, "grad_norm": 0.115152723403054, "kl": 0.0026445388793945312, "learning_rate": 9.207831729029478e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.06072667893022299, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 2330 }, { "completion_length": 423.4419860839844, "epoch": 0.18178629197279442, "grad_norm": 0.11459042959790296, "kl": 0.0025920867919921875, "learning_rate": 9.206508381271514e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.04651284217834473, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 2332 }, { "completion_length": 433.3058204650879, "epoch": 0.18194219788357727, "grad_norm": 0.09618897789237792, "kl": 0.00328826904296875, "learning_rate": 9.205184024345287e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.07515161670744419, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 2334 }, { "completion_length": 416.25894927978516, "epoch": 0.18209810379436012, "grad_norm": 0.10654416989539782, "kl": 0.0024938583374023438, "learning_rate": 9.203858658568519e-07, "loss": 0.0001, "reward": 1.8928572237491608, "reward_std": 0.07567225210368633, "rewards/accuracy_reward": 0.892857164144516, "rewards/format_reward": 1.0, "step": 2336 }, { "completion_length": 435.1607360839844, "epoch": 0.18225400970514294, "grad_norm": 0.10600876298438425, "kl": 0.0031108856201171875, "learning_rate": 9.202532284259173e-07, "loss": 0.0001, "reward": 1.8950893729925156, "reward_std": 0.05636602360755205, "rewards/accuracy_reward": 0.8950893208384514, "rewards/format_reward": 1.0, "step": 2338 }, { "completion_length": 420.66519927978516, "epoch": 0.1824099156159258, "grad_norm": 0.1359251396229114, "kl": 0.0028171539306640625, "learning_rate": 9.201204901735455e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.0868028262630105, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 0.9977678656578064, "step": 2340 }, { "completion_length": 417.32144927978516, "epoch": 0.18256582152670864, "grad_norm": 0.10630971992671748, "kl": 0.0027093887329101562, "learning_rate": 9.199876511315814e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.04712267033755779, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 2342 }, { "completion_length": 431.7388610839844, "epoch": 0.18272172743749146, "grad_norm": 0.04168497246955904, "kl": 0.0025053024291992188, "learning_rate": 9.198547113318937e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 2344 }, { "completion_length": 428.1294822692871, "epoch": 0.1828776333482743, "grad_norm": 0.06792176866705163, "kl": 0.0028438568115234375, "learning_rate": 9.197216708063755e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.0529810655862093, "rewards/accuracy_reward": 0.767857164144516, "rewards/format_reward": 1.0, "step": 2346 }, { "completion_length": 419.49555587768555, "epoch": 0.18303353925905716, "grad_norm": 0.09291552005041498, "kl": 0.003208160400390625, "learning_rate": 9.195885295869443e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.07515301927924156, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 2348 }, { "completion_length": 418.2968940734863, "epoch": 0.18318944516984, "grad_norm": 0.08829166129302796, "kl": 0.0026912689208984375, "learning_rate": 9.194552877055413e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 2350 }, { "completion_length": 424.15403747558594, "epoch": 0.18334535108062283, "grad_norm": 0.13706161349966295, "kl": 0.0031299591064453125, "learning_rate": 9.193219451941323e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.0913810282945633, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 2352 }, { "completion_length": 423.62724685668945, "epoch": 0.18350125699140568, "grad_norm": 0.11194125223944575, "kl": 0.002689361572265625, "learning_rate": 9.191885020847069e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.05590816028416157, "rewards/accuracy_reward": 0.8816964849829674, "rewards/format_reward": 1.0, "step": 2354 }, { "completion_length": 435.7544822692871, "epoch": 0.18365716290218853, "grad_norm": 0.08631269165920442, "kl": 0.0030841827392578125, "learning_rate": 9.190549584092789e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.08409187197685242, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 1.0, "step": 2356 }, { "completion_length": 415.28572845458984, "epoch": 0.18381306881297138, "grad_norm": 0.04610605419699256, "kl": 0.0026569366455078125, "learning_rate": 9.189213141998864e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.03788072057068348, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 2358 }, { "completion_length": 409.94868087768555, "epoch": 0.1839689747237542, "grad_norm": 0.11427252134240935, "kl": 0.0026407241821289062, "learning_rate": 9.187875694885915e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.06831892486661673, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 2360 }, { "completion_length": 429.8973388671875, "epoch": 0.18412488063453705, "grad_norm": 0.06799739958060179, "kl": 0.0027141571044921875, "learning_rate": 9.186537243074803e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.037424259819090366, "rewards/accuracy_reward": 0.859375037252903, "rewards/format_reward": 1.0, "step": 2362 }, { "completion_length": 406.6116256713867, "epoch": 0.1842807865453199, "grad_norm": 0.13646184985941856, "kl": 0.0030832290649414062, "learning_rate": 9.185197786886631e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.0811633775010705, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 1.0, "step": 2364 }, { "completion_length": 437.0893020629883, "epoch": 0.18443669245610275, "grad_norm": 0.0911204997201399, "kl": 0.00267791748046875, "learning_rate": 9.183857326642746e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.056514350697398186, "rewards/accuracy_reward": 0.8504464775323868, "rewards/format_reward": 1.0, "step": 2366 }, { "completion_length": 420.5870704650879, "epoch": 0.18459259836688557, "grad_norm": 0.09128103485566351, "kl": 0.0031795501708984375, "learning_rate": 9.182515862664728e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.05569201800972223, "rewards/accuracy_reward": 0.8705357760190964, "rewards/format_reward": 1.0, "step": 2368 }, { "completion_length": 409.3616256713867, "epoch": 0.18474850427766842, "grad_norm": 0.11549645098959159, "kl": 0.0026836395263671875, "learning_rate": 9.181173395274409e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.05929311644285917, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 2370 }, { "completion_length": 416.3571586608887, "epoch": 0.18490441018845127, "grad_norm": 0.07193589036728958, "kl": 0.0024700164794921875, "learning_rate": 9.179829924793849e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.05718835536390543, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 2372 }, { "completion_length": 424.9486770629883, "epoch": 0.18506031609923412, "grad_norm": 0.11468968678567652, "kl": 0.00295257568359375, "learning_rate": 9.17848545154536e-07, "loss": 0.0001, "reward": 1.8794643729925156, "reward_std": 0.05636462103575468, "rewards/accuracy_reward": 0.8794643208384514, "rewards/format_reward": 1.0, "step": 2374 }, { "completion_length": 427.64733505249023, "epoch": 0.18521622201001695, "grad_norm": 0.11270089104085251, "kl": 0.0030841827392578125, "learning_rate": 9.177139975851488e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.09198721218854189, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 2376 }, { "completion_length": 426.02903747558594, "epoch": 0.1853721279207998, "grad_norm": 0.09357648823105598, "kl": 0.0030393600463867188, "learning_rate": 9.175793498035021e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.07207115553319454, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 2378 }, { "completion_length": 405.564754486084, "epoch": 0.18552803383158264, "grad_norm": 0.10942692883836343, "kl": 0.0027599334716796875, "learning_rate": 9.174446018418988e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.07192142400890589, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 2380 }, { "completion_length": 411.9866256713867, "epoch": 0.1856839397423655, "grad_norm": 0.08282810948283621, "kl": 0.0030803680419921875, "learning_rate": 9.173097537326657e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.08065989799797535, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 0.9955357164144516, "step": 2382 }, { "completion_length": 426.14511489868164, "epoch": 0.18583984565314832, "grad_norm": 0.09443385338213196, "kl": 0.0025634765625, "learning_rate": 9.171748055081539e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.07048926688730717, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 2384 }, { "completion_length": 422.71876525878906, "epoch": 0.18599575156393117, "grad_norm": 0.07357257268340718, "kl": 0.0026702880859375, "learning_rate": 9.170397572007382e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.03675165772438049, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 2386 }, { "completion_length": 425.9799346923828, "epoch": 0.18615165747471402, "grad_norm": 0.08704583497252118, "kl": 0.0031280517578125, "learning_rate": 9.169046088428176e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8504464849829674, "rewards/format_reward": 1.0, "step": 2388 }, { "completion_length": 432.24778747558594, "epoch": 0.18630756338549687, "grad_norm": 0.116630622851796, "kl": 0.002765655517578125, "learning_rate": 9.167693604668152e-07, "loss": 0.0001, "reward": 1.8660715222358704, "reward_std": 0.06981526035815477, "rewards/accuracy_reward": 0.866071455180645, "rewards/format_reward": 1.0, "step": 2390 }, { "completion_length": 421.0669860839844, "epoch": 0.1864634692962797, "grad_norm": 0.09306881151151904, "kl": 0.0026493072509765625, "learning_rate": 9.166340121051776e-07, "loss": 0.0001, "reward": 1.8147321939468384, "reward_std": 0.06155041232705116, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 2392 }, { "completion_length": 410.4085006713867, "epoch": 0.18661937520706254, "grad_norm": 0.11215790504458914, "kl": 0.0029296875, "learning_rate": 9.164985637903762e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 2394 }, { "completion_length": 426.8928756713867, "epoch": 0.1867752811178454, "grad_norm": 0.05311169233009262, "kl": 0.0025997161865234375, "learning_rate": 9.163630155549056e-07, "loss": 0.0001, "reward": 1.8928571939468384, "reward_std": 0.019090089946985245, "rewards/accuracy_reward": 0.8928571715950966, "rewards/format_reward": 1.0, "step": 2396 }, { "completion_length": 417.0268020629883, "epoch": 0.1869311870286282, "grad_norm": 0.11779043434367789, "kl": 0.0025587081909179688, "learning_rate": 9.162273674312846e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.07244066335260868, "rewards/accuracy_reward": 0.8861607387661934, "rewards/format_reward": 1.0, "step": 2398 }, { "completion_length": 427.37278747558594, "epoch": 0.18708709293941106, "grad_norm": 0.13239626437590482, "kl": 0.00311279296875, "learning_rate": 9.160916194520563e-07, "loss": 0.0001, "reward": 1.7566965222358704, "reward_std": 0.09604477509856224, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 2400 }, { "completion_length": 413.7522506713867, "epoch": 0.1872429988501939, "grad_norm": 0.13579340539875015, "kl": 0.0028562545776367188, "learning_rate": 9.159557716497874e-07, "loss": 0.0001, "reward": 1.830357238650322, "reward_std": 0.0777106024324894, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 0.9977678656578064, "step": 2402 }, { "completion_length": 421.8303756713867, "epoch": 0.18739890476097676, "grad_norm": 0.10868396646766632, "kl": 0.0026226043701171875, "learning_rate": 9.158198240570687e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.07094432786107063, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 2404 }, { "completion_length": 420.8504638671875, "epoch": 0.18755481067175958, "grad_norm": 0.13109784164604807, "kl": 0.002597808837890625, "learning_rate": 9.156837767065148e-07, "loss": 0.0001, "reward": 1.845982238650322, "reward_std": 0.08213907573372126, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 2406 }, { "completion_length": 424.8281440734863, "epoch": 0.18771071658254243, "grad_norm": 0.07986447334771432, "kl": 0.0024471282958984375, "learning_rate": 9.155476296307645e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.058620515279471874, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 2408 }, { "completion_length": 413.1250190734863, "epoch": 0.18786662249332528, "grad_norm": 0.1567240742266435, "kl": 0.0026903152465820312, "learning_rate": 9.154113828624802e-07, "loss": 0.0001, "reward": 1.8504465371370316, "reward_std": 0.09363831765949726, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 2410 }, { "completion_length": 434.07368087768555, "epoch": 0.18802252840410813, "grad_norm": 0.1560722793156822, "kl": 0.0039806365966796875, "learning_rate": 9.152750364343485e-07, "loss": 0.0002, "reward": 1.6808036416769028, "reward_std": 0.13910764921456575, "rewards/accuracy_reward": 0.6830357387661934, "rewards/format_reward": 0.9977678656578064, "step": 2412 }, { "completion_length": 411.1540336608887, "epoch": 0.18817843431489095, "grad_norm": 0.1061094919890388, "kl": 0.0029354095458984375, "learning_rate": 9.151385903790796e-07, "loss": 0.0001, "reward": 1.7566965371370316, "reward_std": 0.07027172297239304, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 2414 }, { "completion_length": 411.61832427978516, "epoch": 0.1883343402256738, "grad_norm": 0.06808660654070392, "kl": 0.0025119781494140625, "learning_rate": 9.150020447294081e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.039983248338103294, "rewards/accuracy_reward": 0.7276786118745804, "rewards/format_reward": 1.0, "step": 2416 }, { "completion_length": 436.3750190734863, "epoch": 0.18849024613645665, "grad_norm": 0.10990416552778205, "kl": 0.00335693359375, "learning_rate": 9.148653995180916e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.08387432899326086, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 2418 }, { "completion_length": 406.2031440734863, "epoch": 0.1886461520472395, "grad_norm": 0.003469177598785577, "kl": 0.0027446746826171875, "learning_rate": 9.147286547779127e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.009241949766874313, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 2420 }, { "completion_length": 427.3593940734863, "epoch": 0.18880205795802232, "grad_norm": 0.09193151811685232, "kl": 0.002918243408203125, "learning_rate": 9.145918105416772e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.03788072057068348, "rewards/accuracy_reward": 0.7678571939468384, "rewards/format_reward": 1.0, "step": 2422 }, { "completion_length": 409.64957427978516, "epoch": 0.18895796386880517, "grad_norm": 0.08631406057081276, "kl": 0.0028848648071289062, "learning_rate": 9.144548668422148e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.04764331039041281, "rewards/accuracy_reward": 0.7767857611179352, "rewards/format_reward": 1.0, "step": 2424 }, { "completion_length": 419.3370704650879, "epoch": 0.18911386977958802, "grad_norm": 0.12124116917600587, "kl": 0.0032558441162109375, "learning_rate": 9.143178237123792e-07, "loss": 0.0001, "reward": 1.7433036416769028, "reward_std": 0.11565690487623215, "rewards/accuracy_reward": 0.7433036044239998, "rewards/format_reward": 1.0, "step": 2426 }, { "completion_length": 426.8817138671875, "epoch": 0.18926977569037087, "grad_norm": 0.1061316254056861, "kl": 0.0034046173095703125, "learning_rate": 9.141806811850481e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.09378664195537567, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 2428 }, { "completion_length": 426.44197845458984, "epoch": 0.1894256816011537, "grad_norm": 0.044168338868103105, "kl": 0.0026836395263671875, "learning_rate": 9.140434392931226e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.03675165772438049, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 2430 }, { "completion_length": 439.32144927978516, "epoch": 0.18958158751193654, "grad_norm": 0.12513811537481628, "kl": 0.0030879974365234375, "learning_rate": 9.139060980695281e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.0659096660092473, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 2432 }, { "completion_length": 431.4308204650879, "epoch": 0.1897374934227194, "grad_norm": 0.10166313750313562, "kl": 0.0027561187744140625, "learning_rate": 9.137686575472136e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.05395676288753748, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 1.0, "step": 2434 }, { "completion_length": 435.75224685668945, "epoch": 0.18989339933350224, "grad_norm": 0.10221053698211159, "kl": 0.002857208251953125, "learning_rate": 9.136311177591518e-07, "loss": 0.0001, "reward": 1.9107143729925156, "reward_std": 0.04742576461285353, "rewards/accuracy_reward": 0.9107143431901932, "rewards/format_reward": 1.0, "step": 2436 }, { "completion_length": 419.8370704650879, "epoch": 0.19004930524428507, "grad_norm": 0.09990160350324122, "kl": 0.003021240234375, "learning_rate": 9.134934787383397e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.06755937170237303, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 2438 }, { "completion_length": 438.2031440734863, "epoch": 0.19020521115506792, "grad_norm": 0.11873348004812787, "kl": 0.00321197509765625, "learning_rate": 9.133557405177974e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.10559122171252966, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 2440 }, { "completion_length": 428.78573989868164, "epoch": 0.19036111706585077, "grad_norm": 0.11389985056218817, "kl": 0.0030975341796875, "learning_rate": 9.132179031305695e-07, "loss": 0.0001, "reward": 1.899553656578064, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.8995536118745804, "rewards/format_reward": 1.0, "step": 2442 }, { "completion_length": 433.05582427978516, "epoch": 0.19051702297663362, "grad_norm": 0.11006142264540653, "kl": 0.0030059814453125, "learning_rate": 9.13079966609724e-07, "loss": 0.0001, "reward": 1.8593750894069672, "reward_std": 0.08439496625214815, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 1.0, "step": 2444 }, { "completion_length": 429.1651954650879, "epoch": 0.19067292888741644, "grad_norm": 0.07142143170572719, "kl": 0.0027980804443359375, "learning_rate": 9.129419309883525e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.031413900665938854, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 2446 }, { "completion_length": 425.7187728881836, "epoch": 0.1908288347981993, "grad_norm": 0.0734789556108298, "kl": 0.0029850006103515625, "learning_rate": 9.12803796299571e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.06867506448179483, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 0.9977678656578064, "step": 2448 }, { "completion_length": 423.2232322692871, "epoch": 0.19098474070898214, "grad_norm": 0.10338756630550229, "kl": 0.0026769638061523438, "learning_rate": 9.126655625765186e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.0748499222099781, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 2450 }, { "completion_length": 438.7388572692871, "epoch": 0.19114064661976496, "grad_norm": 0.12475395617179531, "kl": 0.0030651092529296875, "learning_rate": 9.125272298523585e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.10355287324637175, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 2452 }, { "completion_length": 422.21207427978516, "epoch": 0.1912965525305478, "grad_norm": 0.10509535531174052, "kl": 0.0028209686279296875, "learning_rate": 9.123887981602778e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.05981375556439161, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 1.0, "step": 2454 }, { "completion_length": 424.7187690734863, "epoch": 0.19145245844133066, "grad_norm": 0.10398135575688985, "kl": 0.0026521682739257812, "learning_rate": 9.122502675334871e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 2456 }, { "completion_length": 429.5937690734863, "epoch": 0.1916083643521135, "grad_norm": 0.08502241138331496, "kl": 0.0029087066650390625, "learning_rate": 9.121116380052206e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.06688676495105028, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 2458 }, { "completion_length": 418.41743087768555, "epoch": 0.19176427026289633, "grad_norm": 0.08298366330239215, "kl": 0.0029277801513671875, "learning_rate": 9.119729096087365e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.054932462982833385, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 2460 }, { "completion_length": 427.88618087768555, "epoch": 0.19192017617367918, "grad_norm": 0.13495940897473635, "kl": 0.003131866455078125, "learning_rate": 9.118340823773167e-07, "loss": 0.0001, "reward": 1.7455357760190964, "reward_std": 0.11693429760634899, "rewards/accuracy_reward": 0.7455357536673546, "rewards/format_reward": 1.0, "step": 2462 }, { "completion_length": 436.46207427978516, "epoch": 0.19207608208446203, "grad_norm": 0.10830064666942334, "kl": 0.0030460357666015625, "learning_rate": 9.116951563442668e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.07124742120504379, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 2464 }, { "completion_length": 415.68528747558594, "epoch": 0.19223198799524488, "grad_norm": 0.07825583607207015, "kl": 0.0026531219482421875, "learning_rate": 9.115561315429157e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.05989930871874094, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 2466 }, { "completion_length": 415.4821586608887, "epoch": 0.1923878939060277, "grad_norm": 0.09327068934931591, "kl": 0.0027914047241210938, "learning_rate": 9.114170080066167e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 2468 }, { "completion_length": 428.64957427978516, "epoch": 0.19254379981681055, "grad_norm": 0.12777366084227876, "kl": 0.0048847198486328125, "learning_rate": 9.112777857687462e-07, "loss": 0.0002, "reward": 1.7924107909202576, "reward_std": 0.09213694278150797, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 2470 }, { "completion_length": 425.16296768188477, "epoch": 0.1926997057275934, "grad_norm": 0.11956123339321441, "kl": 0.0028667449951171875, "learning_rate": 9.111384648627048e-07, "loss": 0.0001, "reward": 1.712053656578064, "reward_std": 0.06463227327913046, "rewards/accuracy_reward": 0.7142857387661934, "rewards/format_reward": 0.9977678656578064, "step": 2472 }, { "completion_length": 413.71653747558594, "epoch": 0.19285561163837625, "grad_norm": 0.11305335874098615, "kl": 0.0028858184814453125, "learning_rate": 9.109990453219161e-07, "loss": 0.0001, "reward": 1.7544643878936768, "reward_std": 0.0882991598919034, "rewards/accuracy_reward": 0.7544643171131611, "rewards/format_reward": 1.0, "step": 2474 }, { "completion_length": 437.0201072692871, "epoch": 0.19301151754915907, "grad_norm": 0.08758357273207827, "kl": 0.002811431884765625, "learning_rate": 9.108595271798279e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.0505718057975173, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2476 }, { "completion_length": 428.93305587768555, "epoch": 0.19316742345994192, "grad_norm": 0.07325632197223063, "kl": 0.0027637481689453125, "learning_rate": 9.107199104699117e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 0.9977678656578064, "step": 2478 }, { "completion_length": 430.4620704650879, "epoch": 0.19332332937072477, "grad_norm": 0.10745608038836714, "kl": 0.003360748291015625, "learning_rate": 9.105801952256621e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.07432704698294401, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 2480 }, { "completion_length": 414.5602836608887, "epoch": 0.19347923528150762, "grad_norm": 0.003460538272561276, "kl": 0.0025606155395507812, "learning_rate": 9.104403814805979e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.03644856344908476, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 2482 }, { "completion_length": 435.3393020629883, "epoch": 0.19363514119229044, "grad_norm": 0.1064166098974175, "kl": 0.00315093994140625, "learning_rate": 9.103004692682612e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.07387282513082027, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 2484 }, { "completion_length": 427.29243087768555, "epoch": 0.1937910471030733, "grad_norm": 0.06843052244938491, "kl": 0.002902984619140625, "learning_rate": 9.101604586222179e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.06545320432633162, "rewards/accuracy_reward": 0.7165178805589676, "rewards/format_reward": 1.0, "step": 2486 }, { "completion_length": 419.89734268188477, "epoch": 0.19394695301385614, "grad_norm": 0.08754993187632056, "kl": 0.00283050537109375, "learning_rate": 9.100203495760575e-07, "loss": 0.0001, "reward": 1.8883929401636124, "reward_std": 0.03501640260219574, "rewards/accuracy_reward": 0.8883929029107094, "rewards/format_reward": 1.0, "step": 2488 }, { "completion_length": 425.7388610839844, "epoch": 0.194102858924639, "grad_norm": 0.08150930372965191, "kl": 0.0024585723876953125, "learning_rate": 9.098801421633929e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.048923504538834095, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 2490 }, { "completion_length": 441.1384162902832, "epoch": 0.19425876483542182, "grad_norm": 0.08889978398523771, "kl": 0.00289154052734375, "learning_rate": 9.097398364178608e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.05816405266523361, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 2492 }, { "completion_length": 427.6585006713867, "epoch": 0.19441467074620467, "grad_norm": 0.06889551778212398, "kl": 0.002727508544921875, "learning_rate": 9.095994323731216e-07, "loss": 0.0001, "reward": 1.77901791036129, "reward_std": 0.046296700835227966, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 2494 }, { "completion_length": 419.2656478881836, "epoch": 0.19457057665698752, "grad_norm": 0.09785292780545654, "kl": 0.0027379989624023438, "learning_rate": 9.09458930062859e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.07192002423107624, "rewards/accuracy_reward": 0.8258929029107094, "rewards/format_reward": 1.0, "step": 2496 }, { "completion_length": 421.6919860839844, "epoch": 0.19472648256777034, "grad_norm": 0.10671863704598702, "kl": 0.0029048919677734375, "learning_rate": 9.093183295207804e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.028182310983538628, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 2498 }, { "completion_length": 424.3638572692871, "epoch": 0.1948823884785532, "grad_norm": 0.10354227427824783, "kl": 0.0030040740966796875, "learning_rate": 9.091776307806169e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.10287886951118708, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 2500 }, { "completion_length": 422.3147506713867, "epoch": 0.19503829438933604, "grad_norm": 0.10641003980706401, "kl": 0.0026302337646484375, "learning_rate": 9.090368338761227e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.05200396478176117, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 2502 }, { "completion_length": 437.7455596923828, "epoch": 0.1951942003001189, "grad_norm": 0.15144476236308554, "kl": 0.00313568115234375, "learning_rate": 9.088959388410762e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.10498502943664789, "rewards/accuracy_reward": 0.84151791036129, "rewards/format_reward": 0.9977678656578064, "step": 2504 }, { "completion_length": 436.38171005249023, "epoch": 0.1953501062109017, "grad_norm": 0.10608980414260219, "kl": 0.003353118896484375, "learning_rate": 9.087549457092789e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.06951216701418161, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 2506 }, { "completion_length": 419.8526954650879, "epoch": 0.19550601212168456, "grad_norm": 0.060818797387574966, "kl": 0.0028514862060546875, "learning_rate": 9.086138545145559e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 2508 }, { "completion_length": 433.71876525878906, "epoch": 0.1956619180324674, "grad_norm": 0.09636750397160314, "kl": 0.0030202865600585938, "learning_rate": 9.084726652907561e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.7968750223517418, "rewards/format_reward": 1.0, "step": 2510 }, { "completion_length": 422.2857360839844, "epoch": 0.19581782394325026, "grad_norm": 0.0738361755019683, "kl": 0.002716064453125, "learning_rate": 9.083313780717514e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 2512 }, { "completion_length": 422.17412185668945, "epoch": 0.19597372985403308, "grad_norm": 0.13837990121304472, "kl": 0.003162384033203125, "learning_rate": 9.081899928914379e-07, "loss": 0.0001, "reward": 1.801339328289032, "reward_std": 0.08198570925742388, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 2514 }, { "completion_length": 422.7455520629883, "epoch": 0.19612963576481593, "grad_norm": 0.11684068505896407, "kl": 0.0029382705688476562, "learning_rate": 9.080485097837343e-07, "loss": 0.0001, "reward": 1.8125001043081284, "reward_std": 0.08680282533168793, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 0.9977678656578064, "step": 2516 }, { "completion_length": 406.7701072692871, "epoch": 0.19628554167559878, "grad_norm": 0.0659385095575584, "kl": 0.002655029296875, "learning_rate": 9.079069287825836e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 2518 }, { "completion_length": 418.7567138671875, "epoch": 0.19644144758638163, "grad_norm": 0.11874894193421931, "kl": 0.0030498504638671875, "learning_rate": 9.077652499219521e-07, "loss": 0.0001, "reward": 1.76116082072258, "reward_std": 0.09138242714107037, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 2520 }, { "completion_length": 434.299129486084, "epoch": 0.19659735349716445, "grad_norm": 0.1335236453398885, "kl": 0.0028390884399414062, "learning_rate": 9.076234732358292e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.08311617467552423, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 2522 }, { "completion_length": 419.62278747558594, "epoch": 0.1967532594079473, "grad_norm": 0.1288618124394075, "kl": 0.0031871795654296875, "learning_rate": 9.074815987582282e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.08890898991376162, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 2524 }, { "completion_length": 421.48662185668945, "epoch": 0.19690916531873015, "grad_norm": 0.12955369650264173, "kl": 0.00307464599609375, "learning_rate": 9.073396265231856e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.05328415986150503, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 2526 }, { "completion_length": 431.3035888671875, "epoch": 0.197065071229513, "grad_norm": 0.07818566125938083, "kl": 0.0029125213623046875, "learning_rate": 9.071975565647617e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.042445551604032516, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 0.9977678656578064, "step": 2528 }, { "completion_length": 437.06028747558594, "epoch": 0.19722097714029582, "grad_norm": 0.10584016677599846, "kl": 0.0030689239501953125, "learning_rate": 9.070553889170394e-07, "loss": 0.0001, "reward": 1.7031250894069672, "reward_std": 0.08407850004732609, "rewards/accuracy_reward": 0.7053571715950966, "rewards/format_reward": 0.9977678656578064, "step": 2530 }, { "completion_length": 441.12501525878906, "epoch": 0.19737688305107867, "grad_norm": 0.003973593784887753, "kl": 0.00310516357421875, "learning_rate": 9.069131236141263e-07, "loss": 0.0001, "reward": 1.8883928954601288, "reward_std": 0.039983248338103294, "rewards/accuracy_reward": 0.8883928805589676, "rewards/format_reward": 1.0, "step": 2532 }, { "completion_length": 432.17635345458984, "epoch": 0.19753278896186152, "grad_norm": 0.10642392663212256, "kl": 0.0031032562255859375, "learning_rate": 9.067707606901522e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.056364621967077255, "rewards/accuracy_reward": 0.767857164144516, "rewards/format_reward": 1.0, "step": 2534 }, { "completion_length": 423.933048248291, "epoch": 0.19768869487264437, "grad_norm": 0.0817810999370281, "kl": 0.00276947021484375, "learning_rate": 9.066283001792714e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8102679084986448, "rewards/format_reward": 1.0, "step": 2536 }, { "completion_length": 417.8683166503906, "epoch": 0.1978446007834272, "grad_norm": 0.11003614096076589, "kl": 0.002788543701171875, "learning_rate": 9.064857421156608e-07, "loss": 0.0001, "reward": 1.8169643431901932, "reward_std": 0.02915941085666418, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 2538 }, { "completion_length": 415.59823989868164, "epoch": 0.19800050669421004, "grad_norm": 0.09472306500027844, "kl": 0.00260162353515625, "learning_rate": 9.06343086533521e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.8571429029107094, "rewards/format_reward": 1.0, "step": 2540 }, { "completion_length": 427.44421768188477, "epoch": 0.1981564126049929, "grad_norm": 0.08546395690541667, "kl": 0.0031871795654296875, "learning_rate": 9.06200333467076e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.06222301535308361, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 2542 }, { "completion_length": 418.9509086608887, "epoch": 0.19831231851577574, "grad_norm": 0.10736303830372602, "kl": 0.0028171539306640625, "learning_rate": 9.060574829505731e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.07222311850637197, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 2544 }, { "completion_length": 426.5424270629883, "epoch": 0.19846822442655856, "grad_norm": 0.10132138540183126, "kl": 0.0029144287109375, "learning_rate": 9.059145350182832e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 2546 }, { "completion_length": 427.6942138671875, "epoch": 0.19862413033734141, "grad_norm": 0.07955219891139237, "kl": 0.0025482177734375, "learning_rate": 9.057714897045004e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 2548 }, { "completion_length": 434.2500190734863, "epoch": 0.19878003624812426, "grad_norm": 0.08879885981179306, "kl": 0.003208160400390625, "learning_rate": 9.05628347043542e-07, "loss": 0.0001, "reward": 1.7165179401636124, "reward_std": 0.0674074050039053, "rewards/accuracy_reward": 0.7165178880095482, "rewards/format_reward": 1.0, "step": 2550 }, { "completion_length": 420.12724685668945, "epoch": 0.1989359421589071, "grad_norm": 0.07881154642455064, "kl": 0.0029544830322265625, "learning_rate": 9.054851070697492e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.06590966507792473, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 2552 }, { "completion_length": 416.4910888671875, "epoch": 0.19909184806968994, "grad_norm": 0.1470623904589266, "kl": 0.0029630661010742188, "learning_rate": 9.053417698174859e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.07823123969137669, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 0.9977678656578064, "step": 2554 }, { "completion_length": 437.0446586608887, "epoch": 0.19924775398047279, "grad_norm": 0.10487206883020843, "kl": 0.0030221939086914062, "learning_rate": 9.051983353211397e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.05538892187178135, "rewards/accuracy_reward": 0.7343750260770321, "rewards/format_reward": 1.0, "step": 2556 }, { "completion_length": 431.79243087768555, "epoch": 0.19940365989125564, "grad_norm": 0.11487936481904845, "kl": 0.0031070709228515625, "learning_rate": 9.050548036151213e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.07079096138477325, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 2558 }, { "completion_length": 420.4419822692871, "epoch": 0.19955956580203846, "grad_norm": 0.003655155359991898, "kl": 0.0026769638061523438, "learning_rate": 9.049111747338651e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.034495764411985874, "rewards/accuracy_reward": 0.8325893059372902, "rewards/format_reward": 1.0, "step": 2560 }, { "completion_length": 409.12055587768555, "epoch": 0.1997154717128213, "grad_norm": 0.13791681584350216, "kl": 0.0028409957885742188, "learning_rate": 9.047674487118286e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.07823488116264343, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 2562 }, { "completion_length": 447.64957427978516, "epoch": 0.19987137762360416, "grad_norm": 0.13217445806194913, "kl": 0.003185272216796875, "learning_rate": 9.046236255834923e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.07905721291899681, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 2564 }, { "completion_length": 428.1294822692871, "epoch": 0.200027283534387, "grad_norm": 0.10412765258156133, "kl": 0.0032253265380859375, "learning_rate": 9.044797053833606e-07, "loss": 0.0001, "reward": 1.845982238650322, "reward_std": 0.07192142587155104, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 2566 }, { "completion_length": 404.02903747558594, "epoch": 0.20018318944516983, "grad_norm": 0.03550435177505858, "kl": 0.0024318695068359375, "learning_rate": 9.043356881459609e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 2568 }, { "completion_length": 423.52457427978516, "epoch": 0.20033909535595268, "grad_norm": 0.06697046820593386, "kl": 0.00286865234375, "learning_rate": 9.041915739058436e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.037880719639360905, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 1.0, "step": 2570 }, { "completion_length": 424.8236770629883, "epoch": 0.20049500126673553, "grad_norm": 0.10998903090022427, "kl": 0.0030307769775390625, "learning_rate": 9.040473626975827e-07, "loss": 0.0001, "reward": 1.6875001043081284, "reward_std": 0.07274375576525927, "rewards/accuracy_reward": 0.6875000260770321, "rewards/format_reward": 1.0, "step": 2572 }, { "completion_length": 406.7500114440918, "epoch": 0.20065090717751838, "grad_norm": 0.11045463211802004, "kl": 0.0024271011352539062, "learning_rate": 9.039030545557756e-07, "loss": 0.0001, "reward": 1.756696492433548, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.756696455180645, "rewards/format_reward": 1.0, "step": 2574 }, { "completion_length": 421.5268020629883, "epoch": 0.2008068130883012, "grad_norm": 0.06423149406682131, "kl": 0.0025072097778320312, "learning_rate": 9.037586495150425e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 2576 }, { "completion_length": 417.0089454650879, "epoch": 0.20096271899908405, "grad_norm": 0.10663350168728343, "kl": 0.0028324127197265625, "learning_rate": 9.03614147610027e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.0625261114910245, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 2578 }, { "completion_length": 430.5223388671875, "epoch": 0.2011186249098669, "grad_norm": 0.07857787061483143, "kl": 0.00318145751953125, "learning_rate": 9.034695488753964e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.07439346052706242, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 2580 }, { "completion_length": 422.4910888671875, "epoch": 0.20127453082064975, "grad_norm": 0.09709958271887088, "kl": 0.0028781890869140625, "learning_rate": 9.033248533458406e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 2582 }, { "completion_length": 420.08707427978516, "epoch": 0.20143043673143257, "grad_norm": 0.1073132109252743, "kl": 0.0028257369995117188, "learning_rate": 9.031800610560731e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.07011835742741823, "rewards/accuracy_reward": 0.7968750223517418, "rewards/format_reward": 1.0, "step": 2584 }, { "completion_length": 420.80582427978516, "epoch": 0.20158634264221542, "grad_norm": 0.11533071834501292, "kl": 0.0034027099609375, "learning_rate": 9.030351720408305e-07, "loss": 0.0001, "reward": 1.6941965222358704, "reward_std": 0.10108083579689264, "rewards/accuracy_reward": 0.6941964514553547, "rewards/format_reward": 1.0, "step": 2586 }, { "completion_length": 414.3348388671875, "epoch": 0.20174224855299827, "grad_norm": 0.07745153722176515, "kl": 0.0026197433471679688, "learning_rate": 9.028901863348723e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.038704453967511654, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 2588 }, { "completion_length": 415.6406440734863, "epoch": 0.20189815446378112, "grad_norm": 0.13880284315905925, "kl": 0.002666473388671875, "learning_rate": 9.027451039729821e-07, "loss": 0.0001, "reward": 1.8750001043081284, "reward_std": 0.09949390776455402, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 2590 }, { "completion_length": 420.5357360839844, "epoch": 0.20205406037456394, "grad_norm": 0.0737367896374511, "kl": 0.0028285980224609375, "learning_rate": 9.025999249899656e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.056885259225964546, "rewards/accuracy_reward": 0.8281250521540642, "rewards/format_reward": 1.0, "step": 2592 }, { "completion_length": 424.80359649658203, "epoch": 0.2022099662853468, "grad_norm": 0.10839664763308003, "kl": 0.0030078887939453125, "learning_rate": 9.024546494206522e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.08214047644287348, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 2594 }, { "completion_length": 417.10493087768555, "epoch": 0.20236587219612964, "grad_norm": 0.10722148571641366, "kl": 0.0026788711547851562, "learning_rate": 9.023092772998946e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.061549010686576366, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 2596 }, { "completion_length": 427.6071548461914, "epoch": 0.20252177810691246, "grad_norm": 0.05939520993398178, "kl": 0.0033321380615234375, "learning_rate": 9.021638086625685e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.05441322457045317, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 2598 }, { "completion_length": 416.57368087768555, "epoch": 0.20267768401769531, "grad_norm": 0.06262978330335964, "kl": 0.0027303695678710938, "learning_rate": 9.020182435435726e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.03788072057068348, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 2600 }, { "completion_length": 446.3013610839844, "epoch": 0.20283358992847816, "grad_norm": 0.08263522301249583, "kl": 0.0030059814453125, "learning_rate": 9.018725819778291e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 2602 }, { "completion_length": 436.63394927978516, "epoch": 0.202989495839261, "grad_norm": 0.13351851048120333, "kl": 0.0034160614013671875, "learning_rate": 9.017268240002831e-07, "loss": 0.0001, "reward": 1.7589286714792252, "reward_std": 0.08372096065431833, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 0.9977678656578064, "step": 2604 }, { "completion_length": 421.6272430419922, "epoch": 0.20314540175004384, "grad_norm": 0.11792253213740668, "kl": 0.00305938720703125, "learning_rate": 9.015809696459027e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.07048926781862974, "rewards/accuracy_reward": 0.805803619325161, "rewards/format_reward": 1.0, "step": 2606 }, { "completion_length": 433.908504486084, "epoch": 0.20330130766082669, "grad_norm": 0.06906759268094148, "kl": 0.0029392242431640625, "learning_rate": 9.014350189496793e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.04553714580833912, "rewards/accuracy_reward": 0.8549107611179352, "rewards/format_reward": 1.0, "step": 2608 }, { "completion_length": 425.4843940734863, "epoch": 0.20345721357160954, "grad_norm": 0.07575084476085193, "kl": 0.0030059814453125, "learning_rate": 9.012889719466277e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.06380490213632584, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 2610 }, { "completion_length": 424.7924270629883, "epoch": 0.20361311948239239, "grad_norm": 0.07436044699242716, "kl": 0.0031824111938476562, "learning_rate": 9.011428286717853e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.03141390159726143, "rewards/accuracy_reward": 0.8772321790456772, "rewards/format_reward": 1.0, "step": 2612 }, { "completion_length": 412.2745704650879, "epoch": 0.2037690253931752, "grad_norm": 0.10575192342469429, "kl": 0.0026369094848632812, "learning_rate": 9.009965891602127e-07, "loss": 0.0001, "reward": 1.879464328289032, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.8794643133878708, "rewards/format_reward": 1.0, "step": 2614 }, { "completion_length": 430.8817138671875, "epoch": 0.20392493130395806, "grad_norm": 0.04838182711052596, "kl": 0.0031871795654296875, "learning_rate": 9.00850253446994e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 2616 }, { "completion_length": 434.9174346923828, "epoch": 0.2040808372147409, "grad_norm": 0.12599831729607025, "kl": 0.0032100677490234375, "learning_rate": 9.007038215672358e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.1022062636911869, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 2618 }, { "completion_length": 424.58037185668945, "epoch": 0.20423674312552376, "grad_norm": 0.08671506280618561, "kl": 0.0028228759765625, "learning_rate": 9.005572935560681e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.057491449639201164, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 1.0, "step": 2620 }, { "completion_length": 419.4330596923828, "epoch": 0.20439264903630658, "grad_norm": 0.0864052147331614, "kl": 0.0032405853271484375, "learning_rate": 9.00410669448644e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 2622 }, { "completion_length": 415.19644927978516, "epoch": 0.20454855494708943, "grad_norm": 0.06594571561305124, "kl": 0.0026569366455078125, "learning_rate": 9.002639492801395e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.037424259819090366, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 2624 }, { "completion_length": 427.5915336608887, "epoch": 0.20470446085787228, "grad_norm": 0.08021859681454696, "kl": 0.0030498504638671875, "learning_rate": 9.001171330857538e-07, "loss": 0.0001, "reward": 1.7589286714792252, "reward_std": 0.07387422490864992, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 1.0, "step": 2626 }, { "completion_length": 423.9352798461914, "epoch": 0.20486036676865513, "grad_norm": 0.004047166307397902, "kl": 0.00283050537109375, "learning_rate": 8.999702209007087e-07, "loss": 0.0001, "reward": 1.8705357611179352, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 2628 }, { "completion_length": 414.02234268188477, "epoch": 0.20501627267943795, "grad_norm": 0.09195736484608365, "kl": 0.002536773681640625, "learning_rate": 8.998232127602498e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.06463087350130081, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 2630 }, { "completion_length": 421.995548248291, "epoch": 0.2051721785902208, "grad_norm": 0.003755719289543718, "kl": 0.0029964447021484375, "learning_rate": 8.996761086996452e-07, "loss": 0.0001, "reward": 1.7366072237491608, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.7366071753203869, "rewards/format_reward": 1.0, "step": 2632 }, { "completion_length": 418.94644927978516, "epoch": 0.20532808450100365, "grad_norm": 0.12105491807946377, "kl": 0.0025634765625, "learning_rate": 8.995289087541859e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.09190306346863508, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 2634 }, { "completion_length": 417.5156478881836, "epoch": 0.2054839904117865, "grad_norm": 0.08279841987719681, "kl": 0.002857208251953125, "learning_rate": 8.993816129591864e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.08567375876009464, "rewards/accuracy_reward": 0.7343750447034836, "rewards/format_reward": 1.0, "step": 2636 }, { "completion_length": 426.0446586608887, "epoch": 0.20563989632256932, "grad_norm": 0.13033031722780516, "kl": 0.0027875900268554688, "learning_rate": 8.992342213499836e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.05688526015728712, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 2638 }, { "completion_length": 413.5201110839844, "epoch": 0.20579580223335217, "grad_norm": 0.09078686025915558, "kl": 0.002880096435546875, "learning_rate": 8.990867339619379e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.059597612358629704, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 1.0, "step": 2640 }, { "completion_length": 416.924129486084, "epoch": 0.20595170814413502, "grad_norm": 0.07532343631762134, "kl": 0.0026922225952148438, "learning_rate": 8.989391508304324e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 1.0, "step": 2642 }, { "completion_length": 430.9486770629883, "epoch": 0.20610761405491787, "grad_norm": 0.04445751587086728, "kl": 0.002574920654296875, "learning_rate": 8.987914719908732e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 2644 }, { "completion_length": 418.6741256713867, "epoch": 0.2062635199657007, "grad_norm": 0.08419428377652254, "kl": 0.0026369094848632812, "learning_rate": 8.986436974786894e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.9062500149011612, "rewards/format_reward": 0.9977678656578064, "step": 2646 }, { "completion_length": 426.4843940734863, "epoch": 0.20641942587648354, "grad_norm": 0.05507252543463654, "kl": 0.0030670166015625, "learning_rate": 8.98495827329333e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7946428917348385, "rewards/format_reward": 1.0, "step": 2648 }, { "completion_length": 408.4241256713867, "epoch": 0.2065753317872664, "grad_norm": 0.09903424465018322, "kl": 0.0026922225952148438, "learning_rate": 8.983478615782794e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.051850597374141216, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 2650 }, { "completion_length": 417.5201072692871, "epoch": 0.20673123769804921, "grad_norm": 0.06323837613161847, "kl": 0.002826690673828125, "learning_rate": 8.98199800261026e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.7901785895228386, "rewards/format_reward": 1.0, "step": 2652 }, { "completion_length": 425.2544822692871, "epoch": 0.20688714360883206, "grad_norm": 0.0870535700706251, "kl": 0.0033969879150390625, "learning_rate": 8.980516434130938e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 2654 }, { "completion_length": 423.8303756713867, "epoch": 0.2070430495196149, "grad_norm": 0.0996437547955829, "kl": 0.0029926300048828125, "learning_rate": 8.979033910700267e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.06395826861262321, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 2656 }, { "completion_length": 424.0580520629883, "epoch": 0.20719895543039776, "grad_norm": 0.08305395813361804, "kl": 0.0030260086059570312, "learning_rate": 8.977550432673912e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 2658 }, { "completion_length": 426.0357322692871, "epoch": 0.20735486134118059, "grad_norm": 0.08265357661616561, "kl": 0.0029935836791992188, "learning_rate": 8.976066000407773e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.05246042553335428, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 2660 }, { "completion_length": 419.4977912902832, "epoch": 0.20751076725196343, "grad_norm": 0.10892382967897953, "kl": 0.00296783447265625, "learning_rate": 8.97458061425797e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 2662 }, { "completion_length": 412.92859268188477, "epoch": 0.20766667316274628, "grad_norm": 0.04708515966780016, "kl": 0.003063201904296875, "learning_rate": 8.973094274580859e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.03141390159726143, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 0.9977678656578064, "step": 2664 }, { "completion_length": 429.71653747558594, "epoch": 0.20782257907352913, "grad_norm": 0.06823545568182025, "kl": 0.00290679931640625, "learning_rate": 8.971606981733022e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.7321428917348385, "rewards/format_reward": 1.0, "step": 2666 }, { "completion_length": 443.00224685668945, "epoch": 0.20797848498431196, "grad_norm": 0.11617953557553275, "kl": 0.003787994384765625, "learning_rate": 8.970118736071271e-07, "loss": 0.0002, "reward": 1.6785715073347092, "reward_std": 0.08634776342660189, "rewards/accuracy_reward": 0.6785714626312256, "rewards/format_reward": 1.0, "step": 2668 }, { "completion_length": 431.0759086608887, "epoch": 0.2081343908950948, "grad_norm": 0.061881416252226235, "kl": 0.0031566619873046875, "learning_rate": 8.968629537952643e-07, "loss": 0.0001, "reward": 1.8214286863803864, "reward_std": 0.060876404866576195, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 2670 }, { "completion_length": 413.93750762939453, "epoch": 0.20829029680587766, "grad_norm": 0.1397635956099022, "kl": 0.00321197509765625, "learning_rate": 8.967139387734408e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.08064273651689291, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 0.9977678656578064, "step": 2672 }, { "completion_length": 429.3281478881836, "epoch": 0.2084462027166605, "grad_norm": 0.12344455380056822, "kl": 0.003509521484375, "learning_rate": 8.965648285774064e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.08146506827324629, "rewards/accuracy_reward": 0.7991071753203869, "rewards/format_reward": 1.0, "step": 2674 }, { "completion_length": 431.65403747558594, "epoch": 0.20860210862744333, "grad_norm": 0.11714099158822515, "kl": 0.0030651092529296875, "learning_rate": 8.964156232429333e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.08409187290817499, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 2676 }, { "completion_length": 414.04912185668945, "epoch": 0.20875801453822618, "grad_norm": 0.06218495751529936, "kl": 0.002590179443359375, "learning_rate": 8.96266322805817e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.046296702697873116, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 2678 }, { "completion_length": 426.564754486084, "epoch": 0.20891392044900903, "grad_norm": 0.09003454113482211, "kl": 0.0027322769165039062, "learning_rate": 8.961169273018755e-07, "loss": 0.0001, "reward": 1.82589291036129, "reward_std": 0.07207115553319454, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 2680 }, { "completion_length": 418.80582427978516, "epoch": 0.20906982635979188, "grad_norm": 0.0970801666096452, "kl": 0.0031595230102539062, "learning_rate": 8.959674367669498e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.06590966507792473, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 2682 }, { "completion_length": 416.6785888671875, "epoch": 0.2092257322705747, "grad_norm": 0.0813088057538329, "kl": 0.0025043487548828125, "learning_rate": 8.958178512369037e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.042762016877532005, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 2684 }, { "completion_length": 415.4553756713867, "epoch": 0.20938163818135755, "grad_norm": 0.0958899004583714, "kl": 0.00252532958984375, "learning_rate": 8.956681707476237e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.07146496511995792, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2686 }, { "completion_length": 419.36609268188477, "epoch": 0.2095375440921404, "grad_norm": 0.09222538892282742, "kl": 0.0027179718017578125, "learning_rate": 8.95518395335019e-07, "loss": 0.0001, "reward": 1.7209822237491608, "reward_std": 0.05981375649571419, "rewards/accuracy_reward": 0.7209821790456772, "rewards/format_reward": 1.0, "step": 2688 }, { "completion_length": 435.9107322692871, "epoch": 0.20969345000292325, "grad_norm": 0.12239649092995196, "kl": 0.0035266876220703125, "learning_rate": 8.953685250350218e-07, "loss": 0.0001, "reward": 1.8013393878936768, "reward_std": 0.0665836725383997, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 2690 }, { "completion_length": 429.8750190734863, "epoch": 0.20984935591370607, "grad_norm": 0.07495182464421984, "kl": 0.0029993057250976562, "learning_rate": 8.952185598835869e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.06591106671839952, "rewards/accuracy_reward": 0.8504464775323868, "rewards/format_reward": 1.0, "step": 2692 }, { "completion_length": 426.9375228881836, "epoch": 0.21000526182448892, "grad_norm": 0.12089654153766562, "kl": 0.00308990478515625, "learning_rate": 8.950684999166919e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.0787555193528533, "rewards/accuracy_reward": 0.7343750223517418, "rewards/format_reward": 1.0, "step": 2694 }, { "completion_length": 418.8125228881836, "epoch": 0.21016116773527177, "grad_norm": 0.11218525307872726, "kl": 0.0028371810913085938, "learning_rate": 8.94918345170337e-07, "loss": 0.0001, "reward": 1.7633929252624512, "reward_std": 0.08176956791430712, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 0.9977678656578064, "step": 2696 }, { "completion_length": 419.6942138671875, "epoch": 0.21031707364605462, "grad_norm": 0.12938376045555222, "kl": 0.00274658203125, "learning_rate": 8.947680956805456e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.06545460689812899, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2698 }, { "completion_length": 415.1651954650879, "epoch": 0.21047297955683744, "grad_norm": 0.07723957701745844, "kl": 0.0022678375244140625, "learning_rate": 8.946177514833634e-07, "loss": 0.0001, "reward": 1.8816965073347092, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8816964626312256, "rewards/format_reward": 1.0, "step": 2700 }, { "completion_length": 406.39287185668945, "epoch": 0.2106288854676203, "grad_norm": 0.07573614005775893, "kl": 0.0029277801513671875, "learning_rate": 8.944673126148588e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.03531949780881405, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 2702 }, { "completion_length": 427.2790412902832, "epoch": 0.21078479137840314, "grad_norm": 0.1077439527036484, "kl": 0.0028629302978515625, "learning_rate": 8.94316779111123e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.05666771437972784, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 1.0, "step": 2704 }, { "completion_length": 410.72769927978516, "epoch": 0.21094069728918596, "grad_norm": 0.10896365210660088, "kl": 0.0023756027221679688, "learning_rate": 8.941661510082702e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.07936030626296997, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 0.9977678656578064, "step": 2706 }, { "completion_length": 420.20537185668945, "epoch": 0.2110966031999688, "grad_norm": 0.0959976301465248, "kl": 0.0032205581665039062, "learning_rate": 8.940154283424369e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.05681744311004877, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 2708 }, { "completion_length": 422.5446662902832, "epoch": 0.21125250911075166, "grad_norm": 0.09950477326799112, "kl": 0.0030794143676757812, "learning_rate": 8.938646111497825e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.054932461120188236, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 2710 }, { "completion_length": 414.86832427978516, "epoch": 0.2114084150215345, "grad_norm": 0.08594809097982418, "kl": 0.00244903564453125, "learning_rate": 8.937136994664889e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 2712 }, { "completion_length": 426.9910888671875, "epoch": 0.21156432093231733, "grad_norm": 0.11027951568518113, "kl": 0.0034046173095703125, "learning_rate": 8.935626933287608e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.09754251129925251, "rewards/accuracy_reward": 0.8013393059372902, "rewards/format_reward": 0.9977678656578064, "step": 2714 }, { "completion_length": 422.8370780944824, "epoch": 0.21172022684310018, "grad_norm": 0.10547202884539757, "kl": 0.00305938720703125, "learning_rate": 8.934115927728257e-07, "loss": 0.0001, "reward": 1.7834821939468384, "reward_std": 0.04861900769174099, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 2716 }, { "completion_length": 411.3325996398926, "epoch": 0.21187613275388303, "grad_norm": 0.08619720408300327, "kl": 0.0025882720947265625, "learning_rate": 8.932603978349333e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8973214626312256, "rewards/format_reward": 1.0, "step": 2718 }, { "completion_length": 409.77233505249023, "epoch": 0.21203203866466588, "grad_norm": 0.08654543723265287, "kl": 0.0031108856201171875, "learning_rate": 8.931091085513565e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.03111080639064312, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 2720 }, { "completion_length": 425.5692138671875, "epoch": 0.2121879445754487, "grad_norm": 0.06999217217019353, "kl": 0.002773284912109375, "learning_rate": 8.929577249583904e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2722 }, { "completion_length": 410.3192138671875, "epoch": 0.21234385048623156, "grad_norm": 0.08469251256747432, "kl": 0.0025787353515625, "learning_rate": 8.92806247092353e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.039833519607782364, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 0.9977678656578064, "step": 2724 }, { "completion_length": 411.4107322692871, "epoch": 0.2124997563970144, "grad_norm": 0.08533398196190324, "kl": 0.0030651092529296875, "learning_rate": 8.926546749895847e-07, "loss": 0.0001, "reward": 1.8191964775323868, "reward_std": 0.04569051414728165, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 2726 }, { "completion_length": 426.70091247558594, "epoch": 0.21265566230779726, "grad_norm": 0.11049153578122052, "kl": 0.003162384033203125, "learning_rate": 8.925030086864489e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.09634927101433277, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 0.9977678656578064, "step": 2728 }, { "completion_length": 419.56028747558594, "epoch": 0.21281156821858008, "grad_norm": 0.0804882689516162, "kl": 0.0028095245361328125, "learning_rate": 8.923512482193308e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.06463087443262339, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 2730 }, { "completion_length": 419.8750190734863, "epoch": 0.21296747412936293, "grad_norm": 0.07965757960191663, "kl": 0.0028448104858398438, "learning_rate": 8.921993936246393e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.049378564581274986, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 1.0, "step": 2732 }, { "completion_length": 417.59153747558594, "epoch": 0.21312338004014578, "grad_norm": 0.07341224385489524, "kl": 0.0030317306518554688, "learning_rate": 8.920474449388047e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.06688676681369543, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 2734 }, { "completion_length": 408.8460006713867, "epoch": 0.21327928595092863, "grad_norm": 0.08852156422931794, "kl": 0.003326416015625, "learning_rate": 8.918954021982809e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.061395641416311264, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 1.0, "step": 2736 }, { "completion_length": 427.8616256713867, "epoch": 0.21343519186171145, "grad_norm": 0.05231545203350108, "kl": 0.0029315948486328125, "learning_rate": 8.917432654395438e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.779017873108387, "rewards/format_reward": 0.9977678656578064, "step": 2738 }, { "completion_length": 418.21652603149414, "epoch": 0.2135910977724943, "grad_norm": 0.10939601319389015, "kl": 0.003143310546875, "learning_rate": 8.915910346990919e-07, "loss": 0.0001, "reward": 1.8035715371370316, "reward_std": 0.0877799242734909, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 2740 }, { "completion_length": 407.8951072692871, "epoch": 0.21374700368327715, "grad_norm": 0.10052844323637171, "kl": 0.00292205810546875, "learning_rate": 8.914387100134464e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.06395826861262321, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 2742 }, { "completion_length": 418.0580596923828, "epoch": 0.21390290959406, "grad_norm": 0.1328307772668082, "kl": 0.0031147003173828125, "learning_rate": 8.91286291419151e-07, "loss": 0.0001, "reward": 1.8750000894069672, "reward_std": 0.07289712596684694, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 2744 }, { "completion_length": 410.2053756713867, "epoch": 0.21405881550484282, "grad_norm": 0.06601257340091428, "kl": 0.0028400421142578125, "learning_rate": 8.911337789527719e-07, "loss": 0.0001, "reward": 1.861607238650322, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 1.0, "step": 2746 }, { "completion_length": 415.55359268188477, "epoch": 0.21421472141562567, "grad_norm": 0.10171245728578454, "kl": 0.003406524658203125, "learning_rate": 8.909811726508978e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.0728971241042018, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 2748 }, { "completion_length": 412.29912185668945, "epoch": 0.21437062732640852, "grad_norm": 0.13122240589758205, "kl": 0.0029296875, "learning_rate": 8.908284725501396e-07, "loss": 0.0001, "reward": 1.7075893580913544, "reward_std": 0.08146647084504366, "rewards/accuracy_reward": 0.7075893171131611, "rewards/format_reward": 1.0, "step": 2750 }, { "completion_length": 422.7254638671875, "epoch": 0.21452653323719134, "grad_norm": 0.11684673532048141, "kl": 0.0033054351806640625, "learning_rate": 8.906756786871316e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.08213907480239868, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 2752 }, { "completion_length": 419.3460006713867, "epoch": 0.2146824391479742, "grad_norm": 0.0793642572895931, "kl": 0.0030364990234375, "learning_rate": 8.905227910985297e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.07530638482421637, "rewards/accuracy_reward": 0.767857164144516, "rewards/format_reward": 1.0, "step": 2754 }, { "completion_length": 433.7544822692871, "epoch": 0.21483834505875704, "grad_norm": 0.11250543509725305, "kl": 0.0031452178955078125, "learning_rate": 8.903698098210127e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 2756 }, { "completion_length": 402.9464416503906, "epoch": 0.2149942509695399, "grad_norm": 0.08433253519091703, "kl": 0.0028009414672851562, "learning_rate": 8.902167348912815e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.030438202433288097, "rewards/accuracy_reward": 0.897321455180645, "rewards/format_reward": 1.0, "step": 2758 }, { "completion_length": 434.1049270629883, "epoch": 0.2151501568803227, "grad_norm": 0.1376649011293965, "kl": 0.0030946731567382812, "learning_rate": 8.900635663460603e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.08680422510951757, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 0.9955357313156128, "step": 2760 }, { "completion_length": 416.9419860839844, "epoch": 0.21530606279110556, "grad_norm": 0.07602462180495521, "kl": 0.0027627944946289062, "learning_rate": 8.899103042220947e-07, "loss": 0.0001, "reward": 1.8683036267757416, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.8683035969734192, "rewards/format_reward": 1.0, "step": 2762 }, { "completion_length": 410.7098388671875, "epoch": 0.2154619687018884, "grad_norm": 0.07085726963444959, "kl": 0.00264739990234375, "learning_rate": 8.897569485561537e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 2764 }, { "completion_length": 411.5335006713867, "epoch": 0.21561787461267126, "grad_norm": 0.08957837382081645, "kl": 0.00243377685546875, "learning_rate": 8.896034993850278e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 2766 }, { "completion_length": 428.8839454650879, "epoch": 0.21577378052345408, "grad_norm": 0.09441450320207831, "kl": 0.0025491714477539062, "learning_rate": 8.894499567455309e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 0.9977678656578064, "step": 2768 }, { "completion_length": 423.96653747558594, "epoch": 0.21592968643423693, "grad_norm": 0.07399003009223334, "kl": 0.0029659271240234375, "learning_rate": 8.892963206744988e-07, "loss": 0.0001, "reward": 1.8125001043081284, "reward_std": 0.045691913925111294, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 2770 }, { "completion_length": 422.38171768188477, "epoch": 0.21608559234501978, "grad_norm": 0.06991764608105715, "kl": 0.0031261444091796875, "learning_rate": 8.891425912087895e-07, "loss": 0.0001, "reward": 1.8102679699659348, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 2772 }, { "completion_length": 428.39734268188477, "epoch": 0.21624149825580263, "grad_norm": 0.1330603645802954, "kl": 0.0031633377075195312, "learning_rate": 8.889887683852839e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.06560797058045864, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 1.0, "step": 2774 }, { "completion_length": 414.3125228881836, "epoch": 0.21639740416658546, "grad_norm": 0.06670683514028172, "kl": 0.0024785995483398438, "learning_rate": 8.88834852240885e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.020893159322440624, "rewards/accuracy_reward": 0.8482143059372902, "rewards/format_reward": 1.0, "step": 2776 }, { "completion_length": 418.8147506713867, "epoch": 0.2165533100773683, "grad_norm": 0.12376957653547356, "kl": 0.0031099319458007812, "learning_rate": 8.886808428125183e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.08341926895081997, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 2778 }, { "completion_length": 412.9665412902832, "epoch": 0.21670921598815115, "grad_norm": 0.09109537299293831, "kl": 0.0030164718627929688, "learning_rate": 8.885267401371317e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.05261015333235264, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 0.9977678656578064, "step": 2780 }, { "completion_length": 408.80135345458984, "epoch": 0.216865121898934, "grad_norm": 0.06637101059955772, "kl": 0.0027074813842773438, "learning_rate": 8.883725442516952e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.02186885755509138, "rewards/accuracy_reward": 0.881696455180645, "rewards/format_reward": 1.0, "step": 2782 }, { "completion_length": 429.2723388671875, "epoch": 0.21702102780971683, "grad_norm": 0.126326609833216, "kl": 0.0029077529907226562, "learning_rate": 8.882182551932016e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.10429765656590462, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 0.9977678656578064, "step": 2784 }, { "completion_length": 419.14511489868164, "epoch": 0.21717693372049968, "grad_norm": 0.07363249530075952, "kl": 0.002887725830078125, "learning_rate": 8.880638729986658e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.034342397935688496, "rewards/accuracy_reward": 0.868303619325161, "rewards/format_reward": 1.0, "step": 2786 }, { "completion_length": 416.9062690734863, "epoch": 0.21733283963128253, "grad_norm": 0.07903058414310393, "kl": 0.0028619766235351562, "learning_rate": 8.87909397705125e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8816964402794838, "rewards/format_reward": 1.0, "step": 2788 }, { "completion_length": 416.2567138671875, "epoch": 0.21748874554206538, "grad_norm": 0.06863268415243146, "kl": 0.003101348876953125, "learning_rate": 8.877548293496388e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.03171699587255716, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 2790 }, { "completion_length": 427.4218940734863, "epoch": 0.2176446514528482, "grad_norm": 0.10166404390640223, "kl": 0.0029430389404296875, "learning_rate": 8.876001679692893e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.05892360769212246, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 2792 }, { "completion_length": 414.27457427978516, "epoch": 0.21780055736363105, "grad_norm": 0.07711174024831789, "kl": 0.002685546875, "learning_rate": 8.874454136011805e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 2794 }, { "completion_length": 435.2701072692871, "epoch": 0.2179564632744139, "grad_norm": 0.12430443132035464, "kl": 0.0033855438232421875, "learning_rate": 8.872905662824393e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.0947659807279706, "rewards/accuracy_reward": 0.8058035895228386, "rewards/format_reward": 1.0, "step": 2796 }, { "completion_length": 417.4665336608887, "epoch": 0.21811236918519675, "grad_norm": 0.09294545996730012, "kl": 0.00313568115234375, "learning_rate": 8.871356260502142e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.061247317120432854, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 1.0, "step": 2798 }, { "completion_length": 418.1183166503906, "epoch": 0.21826827509597957, "grad_norm": 0.08526274506121198, "kl": 0.0027256011962890625, "learning_rate": 8.869805929416766e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.04373911488801241, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 1.0, "step": 2800 }, { "completion_length": 423.62055587768555, "epoch": 0.21842418100676242, "grad_norm": 0.10503472239740945, "kl": 0.0031528472900390625, "learning_rate": 8.868254669940198e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.05395676475018263, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 2802 }, { "completion_length": 424.00894927978516, "epoch": 0.21858008691754527, "grad_norm": 0.12948722986385333, "kl": 0.0036411285400390625, "learning_rate": 8.866702482444596e-07, "loss": 0.0001, "reward": 1.7098215222358704, "reward_std": 0.0877799242734909, "rewards/accuracy_reward": 0.7098214477300644, "rewards/format_reward": 1.0, "step": 2804 }, { "completion_length": 414.4620666503906, "epoch": 0.2187359928283281, "grad_norm": 0.09485384005854823, "kl": 0.00284576416015625, "learning_rate": 8.86514936730234e-07, "loss": 0.0001, "reward": 1.7745536714792252, "reward_std": 0.07079096231609583, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 2806 }, { "completion_length": 431.4062690734863, "epoch": 0.21889189873911094, "grad_norm": 0.10646587817209614, "kl": 0.0029077529907226562, "learning_rate": 8.863595324886031e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.06914266105741262, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 1.0, "step": 2808 }, { "completion_length": 422.2410888671875, "epoch": 0.2190478046498938, "grad_norm": 0.08820097182862041, "kl": 0.0031566619873046875, "learning_rate": 8.862040355568492e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.03968015220016241, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 2810 }, { "completion_length": 414.3526954650879, "epoch": 0.21920371056067664, "grad_norm": 0.09261207107920554, "kl": 0.00290679931640625, "learning_rate": 8.860484459722775e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.0625261114910245, "rewards/accuracy_reward": 0.8348214775323868, "rewards/format_reward": 1.0, "step": 2812 }, { "completion_length": 415.24332427978516, "epoch": 0.21935961647145946, "grad_norm": 0.10391367983496869, "kl": 0.0031299591064453125, "learning_rate": 8.858927637722147e-07, "loss": 0.0001, "reward": 1.8013393431901932, "reward_std": 0.05816405266523361, "rewards/accuracy_reward": 0.8013393096625805, "rewards/format_reward": 1.0, "step": 2814 }, { "completion_length": 411.0960006713867, "epoch": 0.2195155223822423, "grad_norm": 0.12246904799137769, "kl": 0.0032958984375, "learning_rate": 8.857369889940099e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.06658507231622934, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 2816 }, { "completion_length": 421.5178756713867, "epoch": 0.21967142829302516, "grad_norm": 0.09999014930976016, "kl": 0.0029735565185546875, "learning_rate": 8.855811216750344e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.06740600056946278, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 1.0, "step": 2818 }, { "completion_length": 426.4531478881836, "epoch": 0.219827334203808, "grad_norm": 0.1134858378299412, "kl": 0.003337860107421875, "learning_rate": 8.854251618526818e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.09198721498250961, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 2820 }, { "completion_length": 422.0156440734863, "epoch": 0.21998324011459083, "grad_norm": 0.04822174011251026, "kl": 0.0026950836181640625, "learning_rate": 8.852691095643681e-07, "loss": 0.0001, "reward": 1.88839291036129, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.8883929029107094, "rewards/format_reward": 1.0, "step": 2822 }, { "completion_length": 427.7299270629883, "epoch": 0.22013914602537368, "grad_norm": 0.12212578827341368, "kl": 0.0033273696899414062, "learning_rate": 8.851129648475308e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.08129833079874516, "rewards/accuracy_reward": 0.7500000447034836, "rewards/format_reward": 0.9977678656578064, "step": 2824 }, { "completion_length": 413.1428756713867, "epoch": 0.22029505193615653, "grad_norm": 0.08691623669466887, "kl": 0.0026645660400390625, "learning_rate": 8.849567277396306e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.05666771624237299, "rewards/accuracy_reward": 0.872767873108387, "rewards/format_reward": 1.0, "step": 2826 }, { "completion_length": 427.0089454650879, "epoch": 0.22045095784693938, "grad_norm": 0.07721702825528917, "kl": 0.0029544830322265625, "learning_rate": 8.848003982781494e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 2828 }, { "completion_length": 428.7098388671875, "epoch": 0.2206068637577222, "grad_norm": 0.06785410239397857, "kl": 0.0033283233642578125, "learning_rate": 8.846439765005915e-07, "loss": 0.0001, "reward": 1.7901786863803864, "reward_std": 0.07289712596684694, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 2830 }, { "completion_length": 410.14510345458984, "epoch": 0.22076276966850505, "grad_norm": 0.0033186459176792584, "kl": 0.0024709701538085938, "learning_rate": 8.844874624444838e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.8839285895228386, "rewards/format_reward": 1.0, "step": 2832 }, { "completion_length": 419.6696586608887, "epoch": 0.2209186755792879, "grad_norm": 0.07116459012889065, "kl": 0.0029697418212890625, "learning_rate": 8.843308561473748e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.05590816028416157, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 1.0, "step": 2834 }, { "completion_length": 409.39734268188477, "epoch": 0.22107458149007075, "grad_norm": 0.10903626112149271, "kl": 0.0026035308837890625, "learning_rate": 8.841741576468355e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.05959761422127485, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 2836 }, { "completion_length": 427.8169860839844, "epoch": 0.22123048740085358, "grad_norm": 0.09375404304077084, "kl": 0.0033721923828125, "learning_rate": 8.840173669804589e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.07469655759632587, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 2838 }, { "completion_length": 426.04912185668945, "epoch": 0.22138639331163643, "grad_norm": 0.1120225661488724, "kl": 0.0029048919677734375, "learning_rate": 8.8386048418586e-07, "loss": 0.0001, "reward": 1.8370536267757416, "reward_std": 0.07500105164945126, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 1.0, "step": 2840 }, { "completion_length": 420.9375114440918, "epoch": 0.22154229922241928, "grad_norm": 0.08042587045685609, "kl": 0.00299835205078125, "learning_rate": 8.837035093006761e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.06478060036897659, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 2842 }, { "completion_length": 422.6607322692871, "epoch": 0.22169820513320213, "grad_norm": 0.1093993185675091, "kl": 0.0029554367065429688, "learning_rate": 8.835464423625662e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.10190316941589117, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 2844 }, { "completion_length": 418.4843940734863, "epoch": 0.22185411104398495, "grad_norm": 0.1321444110904186, "kl": 0.0025625228881835938, "learning_rate": 8.83389283409212e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.053282756358385086, "rewards/accuracy_reward": 0.7678571566939354, "rewards/format_reward": 1.0, "step": 2846 }, { "completion_length": 424.3705520629883, "epoch": 0.2220100169547678, "grad_norm": 0.09962342586933705, "kl": 0.0027437210083007812, "learning_rate": 8.832320324783166e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 2848 }, { "completion_length": 422.4018020629883, "epoch": 0.22216592286555065, "grad_norm": 0.10834806976503632, "kl": 0.003536224365234375, "learning_rate": 8.830746896076058e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.08957935497164726, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 2850 }, { "completion_length": 424.5044822692871, "epoch": 0.2223218287763335, "grad_norm": 0.12204995922442272, "kl": 0.0034198760986328125, "learning_rate": 8.82917254834827e-07, "loss": 0.0001, "reward": 1.705357238650322, "reward_std": 0.07146496418863535, "rewards/accuracy_reward": 0.7053571678698063, "rewards/format_reward": 1.0, "step": 2852 }, { "completion_length": 419.8549346923828, "epoch": 0.22247773468711632, "grad_norm": 0.10136100319484269, "kl": 0.0031518936157226562, "learning_rate": 8.827597281977498e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.059141152538359165, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 2854 }, { "completion_length": 426.73440170288086, "epoch": 0.22263364059789917, "grad_norm": 0.10900154033810446, "kl": 0.002902984619140625, "learning_rate": 8.826021097341661e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.8727679029107094, "rewards/format_reward": 1.0, "step": 2856 }, { "completion_length": 421.35716247558594, "epoch": 0.22278954650868202, "grad_norm": 0.0814833330206926, "kl": 0.0034122467041015625, "learning_rate": 8.824443994818891e-07, "loss": 0.0001, "reward": 1.7589286714792252, "reward_std": 0.06560797244310379, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 1.0, "step": 2858 }, { "completion_length": 419.88841247558594, "epoch": 0.22294545241946484, "grad_norm": 0.07314692198810126, "kl": 0.0029439926147460938, "learning_rate": 8.822865974787549e-07, "loss": 0.0001, "reward": 1.87276791036129, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.872767873108387, "rewards/format_reward": 1.0, "step": 2860 }, { "completion_length": 418.46876525878906, "epoch": 0.2231013583302477, "grad_norm": 0.13031247556403056, "kl": 0.0029897689819335938, "learning_rate": 8.821287037626211e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.10626382287591696, "rewards/accuracy_reward": 0.7723214477300644, "rewards/format_reward": 1.0, "step": 2862 }, { "completion_length": 416.1763572692871, "epoch": 0.22325726424103054, "grad_norm": 0.09234106863123916, "kl": 0.0030727386474609375, "learning_rate": 8.819707183713674e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06996499001979828, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 2864 }, { "completion_length": 435.8393020629883, "epoch": 0.2234131701518134, "grad_norm": 0.06948166647023964, "kl": 0.0030841827392578125, "learning_rate": 8.818126413428954e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.07079236209392548, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 2866 }, { "completion_length": 417.35939025878906, "epoch": 0.2235690760625962, "grad_norm": 0.10915160981366619, "kl": 0.0029392242431640625, "learning_rate": 8.81654472715129e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.07042145263403654, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 2868 }, { "completion_length": 427.3928680419922, "epoch": 0.22372498197337906, "grad_norm": 0.07468959869557008, "kl": 0.0028753280639648438, "learning_rate": 8.814962125260138e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.04892210382968187, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 2870 }, { "completion_length": 423.61832427978516, "epoch": 0.2238808878841619, "grad_norm": 0.12072517491114163, "kl": 0.0030422210693359375, "learning_rate": 8.813378608135174e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.05298106465488672, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 1.0, "step": 2872 }, { "completion_length": 412.65180587768555, "epoch": 0.22403679379494476, "grad_norm": 0.12196195061431862, "kl": 0.0029125213623046875, "learning_rate": 8.811794176156293e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.10656691808253527, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 1.0, "step": 2874 }, { "completion_length": 425.2567138671875, "epoch": 0.22419269970572758, "grad_norm": 0.10855786778442322, "kl": 0.0032672882080078125, "learning_rate": 8.810208829703611e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.7165178954601288, "rewards/format_reward": 1.0, "step": 2876 }, { "completion_length": 418.31698989868164, "epoch": 0.22434860561651043, "grad_norm": 0.09701879295380753, "kl": 0.003162384033203125, "learning_rate": 8.808622569157463e-07, "loss": 0.0001, "reward": 1.8080357611179352, "reward_std": 0.05590956099331379, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 2878 }, { "completion_length": 417.3772506713867, "epoch": 0.22450451152729328, "grad_norm": 0.08018527908346197, "kl": 0.0032253265380859375, "learning_rate": 8.807035394898403e-07, "loss": 0.0001, "reward": 1.7633929252624512, "reward_std": 0.05493246205151081, "rewards/accuracy_reward": 0.7633928991854191, "rewards/format_reward": 1.0, "step": 2880 }, { "completion_length": 420.9352912902832, "epoch": 0.22466041743807613, "grad_norm": 0.07958970754602471, "kl": 0.0029144287109375, "learning_rate": 8.805447307307203e-07, "loss": 0.0001, "reward": 1.7700893878936768, "reward_std": 0.07011835556477308, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 2882 }, { "completion_length": 426.69868087768555, "epoch": 0.22481632334885895, "grad_norm": 0.004232624632076724, "kl": 0.0032291412353515625, "learning_rate": 8.80385830676486e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.03434379957616329, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 2884 }, { "completion_length": 421.354923248291, "epoch": 0.2249722292596418, "grad_norm": 0.06645204716699, "kl": 0.0029659271240234375, "learning_rate": 8.802268393652579e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.752232164144516, "rewards/format_reward": 1.0, "step": 2886 }, { "completion_length": 417.2053756713867, "epoch": 0.22512813517042465, "grad_norm": 0.10418577142972157, "kl": 0.0028896331787109375, "learning_rate": 8.800677568351794e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 2888 }, { "completion_length": 416.42189025878906, "epoch": 0.2252840410812075, "grad_norm": 0.1317061260721837, "kl": 0.0032958984375, "learning_rate": 8.799085831244154e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.058317420072853565, "rewards/accuracy_reward": 0.8459821864962578, "rewards/format_reward": 1.0, "step": 2890 }, { "completion_length": 403.9576072692871, "epoch": 0.22543994699199033, "grad_norm": 0.10601985881912396, "kl": 0.0025157928466796875, "learning_rate": 8.797493182711526e-07, "loss": 0.0001, "reward": 1.9017857611179352, "reward_std": 0.04373771231621504, "rewards/accuracy_reward": 0.9017857313156128, "rewards/format_reward": 1.0, "step": 2892 }, { "completion_length": 429.4620704650879, "epoch": 0.22559585290277318, "grad_norm": 0.09629718134933053, "kl": 0.00327301025390625, "learning_rate": 8.795899623135996e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.05816405173391104, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 2894 }, { "completion_length": 417.33930587768555, "epoch": 0.22575175881355602, "grad_norm": 0.07127455881975554, "kl": 0.0027332305908203125, "learning_rate": 8.794305152899872e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.861607164144516, "rewards/format_reward": 1.0, "step": 2896 }, { "completion_length": 411.3460006713867, "epoch": 0.22590766472433887, "grad_norm": 0.09403731779998638, "kl": 0.00310516357421875, "learning_rate": 8.792709772385673e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.05441322363913059, "rewards/accuracy_reward": 0.8593750223517418, "rewards/format_reward": 0.9977678656578064, "step": 2898 }, { "completion_length": 432.9754638671875, "epoch": 0.2260635706351217, "grad_norm": 0.08969635795693703, "kl": 0.0028810501098632812, "learning_rate": 8.791113481976146e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.04959610756486654, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 2900 }, { "completion_length": 423.3348388671875, "epoch": 0.22621947654590455, "grad_norm": 0.09637490086348928, "kl": 0.0029706954956054688, "learning_rate": 8.789516282054248e-07, "loss": 0.0001, "reward": 1.7343751043081284, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.7343750223517418, "rewards/format_reward": 1.0, "step": 2902 }, { "completion_length": 426.2701110839844, "epoch": 0.2263753824566874, "grad_norm": 0.06431154797811575, "kl": 0.002559661865234375, "learning_rate": 8.78791817300316e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.03336669970303774, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 2904 }, { "completion_length": 420.8861770629883, "epoch": 0.22653128836747022, "grad_norm": 0.11824541461002765, "kl": 0.003139495849609375, "learning_rate": 8.786319155206274e-07, "loss": 0.0001, "reward": 1.7276786416769028, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.7276786081492901, "rewards/format_reward": 1.0, "step": 2906 }, { "completion_length": 407.04019927978516, "epoch": 0.22668719427825307, "grad_norm": 0.12944460516278286, "kl": 0.0029230117797851562, "learning_rate": 8.78471922904721e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.06951216794550419, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 2908 }, { "completion_length": 407.2701072692871, "epoch": 0.22684310018903592, "grad_norm": 0.09732115744309426, "kl": 0.0023784637451171875, "learning_rate": 8.783118394909798e-07, "loss": 0.0001, "reward": 1.8504465371370316, "reward_std": 0.043739115819334984, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 2910 }, { "completion_length": 417.9330520629883, "epoch": 0.22699900609981877, "grad_norm": 0.09672722341139227, "kl": 0.003627777099609375, "learning_rate": 8.781516653178087e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.07710441201925278, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 2912 }, { "completion_length": 423.7879638671875, "epoch": 0.2271549120106016, "grad_norm": 0.11074136694227192, "kl": 0.0028095245361328125, "learning_rate": 8.779914004236349e-07, "loss": 0.0001, "reward": 1.7477679550647736, "reward_std": 0.04937856364995241, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 2914 }, { "completion_length": 424.9486770629883, "epoch": 0.22731081792138444, "grad_norm": 0.07459499392235826, "kl": 0.0029621124267578125, "learning_rate": 8.778310448469064e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.052003963850438595, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 2916 }, { "completion_length": 424.42635345458984, "epoch": 0.2274667238321673, "grad_norm": 0.08354348186305877, "kl": 0.0027141571044921875, "learning_rate": 8.776705986260942e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.772321455180645, "rewards/format_reward": 1.0, "step": 2918 }, { "completion_length": 431.0870704650879, "epoch": 0.22762262974295014, "grad_norm": 0.12826792374174958, "kl": 0.003147125244140625, "learning_rate": 8.775100617996898e-07, "loss": 0.0001, "reward": 1.8794643431901932, "reward_std": 0.06786386575549841, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 0.9955357164144516, "step": 2920 }, { "completion_length": 424.49778747558594, "epoch": 0.22777853565373296, "grad_norm": 0.06695778566323377, "kl": 0.003253936767578125, "learning_rate": 8.773494344062073e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 2922 }, { "completion_length": 414.64734268188477, "epoch": 0.2279344415645158, "grad_norm": 0.11697142515000428, "kl": 0.00260162353515625, "learning_rate": 8.77188716484182e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.06252610962837934, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 2924 }, { "completion_length": 419.1897506713867, "epoch": 0.22809034747529866, "grad_norm": 0.09589431242122892, "kl": 0.0031375885009765625, "learning_rate": 8.770279080721716e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.04861900769174099, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 2926 }, { "completion_length": 419.41743087768555, "epoch": 0.2282462533860815, "grad_norm": 0.09453758880932919, "kl": 0.0030355453491210938, "learning_rate": 8.768670092087547e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.8727678880095482, "rewards/format_reward": 1.0, "step": 2928 }, { "completion_length": 431.8794822692871, "epoch": 0.22840215929686433, "grad_norm": 0.11321963314607414, "kl": 0.0028562545776367188, "learning_rate": 8.767060199325322e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.07011835649609566, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 2930 }, { "completion_length": 422.4129638671875, "epoch": 0.22855806520764718, "grad_norm": 0.09750155309724805, "kl": 0.0031795501708984375, "learning_rate": 8.765449402821261e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.04373771417886019, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 2932 }, { "completion_length": 416.4799270629883, "epoch": 0.22871397111843003, "grad_norm": 0.1192319931464532, "kl": 0.0028781890869140625, "learning_rate": 8.763837702961808e-07, "loss": 0.0001, "reward": 1.7901786714792252, "reward_std": 0.05569201707839966, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 2934 }, { "completion_length": 426.8794822692871, "epoch": 0.22886987702921288, "grad_norm": 0.1042823515799591, "kl": 0.0032806396484375, "learning_rate": 8.76222510013362e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.056885261088609695, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 2936 }, { "completion_length": 428.71430587768555, "epoch": 0.2290257829399957, "grad_norm": 0.1041230236485338, "kl": 0.0030384063720703125, "learning_rate": 8.760611594723567e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.036448560655117035, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 2938 }, { "completion_length": 419.43528747558594, "epoch": 0.22918168885077855, "grad_norm": 0.1359412452819368, "kl": 0.0029754638671875, "learning_rate": 8.758997187118743e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.061549012549221516, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 2940 }, { "completion_length": 423.30359268188477, "epoch": 0.2293375947615614, "grad_norm": 0.09620755391412612, "kl": 0.0028553009033203125, "learning_rate": 8.757381877706454e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.07545611169189215, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 2942 }, { "completion_length": 429.9419860839844, "epoch": 0.22949350067234425, "grad_norm": 0.1043178628626832, "kl": 0.0033397674560546875, "learning_rate": 8.755765666874221e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.06365517526865005, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 0.9977678656578064, "step": 2944 }, { "completion_length": 417.43528747558594, "epoch": 0.22964940658312707, "grad_norm": 0.14226021626917318, "kl": 0.003543853759765625, "learning_rate": 8.754148555009786e-07, "loss": 0.0001, "reward": 1.7946429550647736, "reward_std": 0.06996862683445215, "rewards/accuracy_reward": 0.7946428991854191, "rewards/format_reward": 1.0, "step": 2946 }, { "completion_length": 413.6897506713867, "epoch": 0.22980531249390992, "grad_norm": 0.09959834637741369, "kl": 0.0031261444091796875, "learning_rate": 8.752530542501103e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06643030419945717, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 2948 }, { "completion_length": 441.50671768188477, "epoch": 0.22996121840469277, "grad_norm": 0.04331938608788595, "kl": 0.0033617019653320312, "learning_rate": 8.750911629736344e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.7834821864962578, "rewards/format_reward": 1.0, "step": 2950 }, { "completion_length": 423.88618087768555, "epoch": 0.23011712431547562, "grad_norm": 0.10343308858144407, "kl": 0.0027008056640625, "learning_rate": 8.749291817103897e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.0732002193108201, "rewards/accuracy_reward": 0.8861607387661934, "rewards/format_reward": 1.0, "step": 2952 }, { "completion_length": 428.2634086608887, "epoch": 0.23027303022625845, "grad_norm": 0.08433367619753238, "kl": 0.0033397674560546875, "learning_rate": 8.747671104992363e-07, "loss": 0.0001, "reward": 1.752232238650322, "reward_std": 0.06493396684527397, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 1.0, "step": 2954 }, { "completion_length": 431.6473388671875, "epoch": 0.2304289361370413, "grad_norm": 0.10068491963012208, "kl": 0.003246307373046875, "learning_rate": 8.746049493790565e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.053739218041300774, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 2956 }, { "completion_length": 417.6919860839844, "epoch": 0.23058484204782415, "grad_norm": 0.0483376474827851, "kl": 0.0029382705688476562, "learning_rate": 8.744426983887534e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 2958 }, { "completion_length": 416.8080520629883, "epoch": 0.23074074795860697, "grad_norm": 0.10599137051634122, "kl": 0.00279998779296875, "learning_rate": 8.742803575672524e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 2960 }, { "completion_length": 430.1406478881836, "epoch": 0.23089665386938982, "grad_norm": 0.09837945287555958, "kl": 0.002948760986328125, "learning_rate": 8.741179269535e-07, "loss": 0.0001, "reward": 1.8504465222358704, "reward_std": 0.057644814252853394, "rewards/accuracy_reward": 0.8504464477300644, "rewards/format_reward": 1.0, "step": 2962 }, { "completion_length": 431.8660888671875, "epoch": 0.23105255978017267, "grad_norm": 0.10371224735026709, "kl": 0.0032024383544921875, "learning_rate": 8.73955406586464e-07, "loss": 0.0001, "reward": 1.7187500596046448, "reward_std": 0.10385456401854753, "rewards/accuracy_reward": 0.7187500298023224, "rewards/format_reward": 1.0, "step": 2964 }, { "completion_length": 428.8705520629883, "epoch": 0.23120846569095552, "grad_norm": 0.10816174408846099, "kl": 0.0030727386474609375, "learning_rate": 8.737927965051346e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8013393059372902, "rewards/format_reward": 1.0, "step": 2966 }, { "completion_length": 428.2143020629883, "epoch": 0.23136437160173834, "grad_norm": 0.12270304199687505, "kl": 0.00315093994140625, "learning_rate": 8.736300967485227e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.07079236302524805, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 2968 }, { "completion_length": 427.57815170288086, "epoch": 0.2315202775125212, "grad_norm": 0.0035065675359853537, "kl": 0.003025054931640625, "learning_rate": 8.734673073556611e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.04764331132173538, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 2970 }, { "completion_length": 427.0937690734863, "epoch": 0.23167618342330404, "grad_norm": 0.11910988832943473, "kl": 0.00347137451171875, "learning_rate": 8.733044283656039e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.09634786657989025, "rewards/accuracy_reward": 0.7276786044239998, "rewards/format_reward": 1.0, "step": 2972 }, { "completion_length": 438.674129486084, "epoch": 0.2318320893340869, "grad_norm": 0.10247204488346158, "kl": 0.003177642822265625, "learning_rate": 8.731414598174269e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.06816555839031935, "rewards/accuracy_reward": 0.7745535969734192, "rewards/format_reward": 0.9977678656578064, "step": 2974 }, { "completion_length": 408.3147506713867, "epoch": 0.2319879952448697, "grad_norm": 0.08884346114416926, "kl": 0.0029058456420898438, "learning_rate": 8.729784017502275e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 2976 }, { "completion_length": 424.06921768188477, "epoch": 0.23214390115565256, "grad_norm": 0.08061203749187168, "kl": 0.0027208328247070312, "learning_rate": 8.72815254203124e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.05425985995680094, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 2978 }, { "completion_length": 423.4643020629883, "epoch": 0.2322998070664354, "grad_norm": 0.08093511352688085, "kl": 0.0032558441162109375, "learning_rate": 8.726520172152567e-07, "loss": 0.0001, "reward": 1.7299107760190964, "reward_std": 0.06027161795645952, "rewards/accuracy_reward": 0.729910746216774, "rewards/format_reward": 1.0, "step": 2980 }, { "completion_length": 423.7611770629883, "epoch": 0.23245571297721826, "grad_norm": 0.04101168459033698, "kl": 0.0031185150146484375, "learning_rate": 8.724886908257873e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.05005116853863001, "rewards/accuracy_reward": 0.779017873108387, "rewards/format_reward": 1.0, "step": 2982 }, { "completion_length": 433.85046768188477, "epoch": 0.23261161888800108, "grad_norm": 0.08066362301340747, "kl": 0.00318145751953125, "learning_rate": 8.723252750738987e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.07222452107816935, "rewards/accuracy_reward": 0.7968750521540642, "rewards/format_reward": 0.9977678656578064, "step": 2984 }, { "completion_length": 423.4598388671875, "epoch": 0.23276752479878393, "grad_norm": 0.07742559236180752, "kl": 0.0027942657470703125, "learning_rate": 8.721617699987953e-07, "loss": 0.0001, "reward": 1.8370536267757416, "reward_std": 0.03239100240170956, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 2986 }, { "completion_length": 435.54689025878906, "epoch": 0.23292343070956678, "grad_norm": 0.1117151517696513, "kl": 0.0031299591064453125, "learning_rate": 8.719981756397033e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.07612871564924717, "rewards/accuracy_reward": 0.7968750521540642, "rewards/format_reward": 1.0, "step": 2988 }, { "completion_length": 430.3125190734863, "epoch": 0.23307933662034963, "grad_norm": 0.09863974399157098, "kl": 0.0031909942626953125, "learning_rate": 8.7183449203587e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.052003965713083744, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 2990 }, { "completion_length": 432.5111770629883, "epoch": 0.23323524253113245, "grad_norm": 0.06912602946090654, "kl": 0.00313568115234375, "learning_rate": 8.716707192265638e-07, "loss": 0.0001, "reward": 1.7991072535514832, "reward_std": 0.07710441388189793, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 2992 }, { "completion_length": 433.5022506713867, "epoch": 0.2333911484419153, "grad_norm": 0.130354378724398, "kl": 0.0030460357666015625, "learning_rate": 8.715068572510752e-07, "loss": 0.0001, "reward": 1.8571429550647736, "reward_std": 0.07951367180794477, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 2994 }, { "completion_length": 421.2946586608887, "epoch": 0.23354705435269815, "grad_norm": 0.10986128664485813, "kl": 0.0028896331787109375, "learning_rate": 8.713429061487155e-07, "loss": 0.0001, "reward": 1.8616071939468384, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.861607164144516, "rewards/format_reward": 1.0, "step": 2996 }, { "completion_length": 425.0290298461914, "epoch": 0.233702960263481, "grad_norm": 0.11931628313622838, "kl": 0.0029811859130859375, "learning_rate": 8.711788659588178e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.10144530516117811, "rewards/accuracy_reward": 0.8258929029107094, "rewards/format_reward": 1.0, "step": 2998 }, { "completion_length": 417.70983505249023, "epoch": 0.23385886617426382, "grad_norm": 0.09449886875532183, "kl": 0.0025463104248046875, "learning_rate": 8.710147367207363e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.05590956285595894, "rewards/accuracy_reward": 0.8214286267757416, "rewards/format_reward": 1.0, "step": 3000 }, { "completion_length": 423.5491256713867, "epoch": 0.23401477208504667, "grad_norm": 0.13874310101415305, "kl": 0.0029449462890625, "learning_rate": 8.708505184738467e-07, "loss": 0.0001, "reward": 1.9107143580913544, "reward_std": 0.07973121386021376, "rewards/accuracy_reward": 0.9107143133878708, "rewards/format_reward": 1.0, "step": 3002 }, { "completion_length": 413.62501525878906, "epoch": 0.23417067799582952, "grad_norm": 0.12348363328327802, "kl": 0.0028066635131835938, "learning_rate": 8.706862112575459e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.07304685469716787, "rewards/accuracy_reward": 0.8593750298023224, "rewards/format_reward": 1.0, "step": 3004 }, { "completion_length": 433.75671768188477, "epoch": 0.23432658390661235, "grad_norm": 0.08844287675388562, "kl": 0.0034885406494140625, "learning_rate": 8.705218151112525e-07, "loss": 0.0001, "reward": 1.7075893878936768, "reward_std": 0.06461750064045191, "rewards/accuracy_reward": 0.7098214477300644, "rewards/format_reward": 0.9977678656578064, "step": 3006 }, { "completion_length": 423.1227836608887, "epoch": 0.2344824898173952, "grad_norm": 0.11313772140351691, "kl": 0.00347137451171875, "learning_rate": 8.703573300744058e-07, "loss": 0.0001, "reward": 1.7544643431901932, "reward_std": 0.057341719046235085, "rewards/accuracy_reward": 0.7566964477300644, "rewards/format_reward": 0.9977678656578064, "step": 3008 }, { "completion_length": 426.9419822692871, "epoch": 0.23463839572817805, "grad_norm": 0.06801986610154738, "kl": 0.002933502197265625, "learning_rate": 8.701927561864671e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.05734172184020281, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 3010 }, { "completion_length": 431.05805587768555, "epoch": 0.2347943016389609, "grad_norm": 0.08293739880349012, "kl": 0.002918243408203125, "learning_rate": 8.700280934869188e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.057188354432582855, "rewards/accuracy_reward": 0.8660714849829674, "rewards/format_reward": 1.0, "step": 3012 }, { "completion_length": 445.4486846923828, "epoch": 0.23495020754974372, "grad_norm": 0.11910904260432269, "kl": 0.004375457763671875, "learning_rate": 8.698633420152641e-07, "loss": 0.0002, "reward": 1.7410715073347092, "reward_std": 0.07350331731140614, "rewards/accuracy_reward": 0.7410714700818062, "rewards/format_reward": 1.0, "step": 3014 }, { "completion_length": 424.3236770629883, "epoch": 0.23510611346052657, "grad_norm": 0.06650126170136729, "kl": 0.003162384033203125, "learning_rate": 8.696985018110284e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 3016 }, { "completion_length": 420.56697845458984, "epoch": 0.23526201937130942, "grad_norm": 0.09354185682683748, "kl": 0.00373077392578125, "learning_rate": 8.695335729137579e-07, "loss": 0.0001, "reward": 1.7812501043081284, "reward_std": 0.05005256924778223, "rewards/accuracy_reward": 0.7812500447034836, "rewards/format_reward": 1.0, "step": 3018 }, { "completion_length": 420.80358123779297, "epoch": 0.23541792528209227, "grad_norm": 0.12169969888287718, "kl": 0.0026960372924804688, "learning_rate": 8.693685553630196e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.068536470644176, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 3020 }, { "completion_length": 427.9665412902832, "epoch": 0.2355738311928751, "grad_norm": 0.0931164574164092, "kl": 0.0028972625732421875, "learning_rate": 8.692034491984028e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.05651434976607561, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 0.9977678656578064, "step": 3022 }, { "completion_length": 431.57591247558594, "epoch": 0.23572973710365794, "grad_norm": 0.1439344097844079, "kl": 0.0034809112548828125, "learning_rate": 8.690382544595174e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.07417592126876116, "rewards/accuracy_reward": 0.7544643133878708, "rewards/format_reward": 1.0, "step": 3024 }, { "completion_length": 417.0156440734863, "epoch": 0.2358856430144408, "grad_norm": 0.10614364053542454, "kl": 0.0031299591064453125, "learning_rate": 8.688729711859944e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.060573311522603035, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 3026 }, { "completion_length": 426.3616256713867, "epoch": 0.23604154892522364, "grad_norm": 0.042512975733855846, "kl": 0.0031480789184570312, "learning_rate": 8.687075994174866e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8861607387661934, "rewards/format_reward": 1.0, "step": 3028 }, { "completion_length": 436.98885345458984, "epoch": 0.23619745483600646, "grad_norm": 0.1053946251907506, "kl": 0.0031833648681640625, "learning_rate": 8.685421391936678e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.0939400102943182, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 3030 }, { "completion_length": 426.4933204650879, "epoch": 0.2363533607467893, "grad_norm": 0.08097908456125993, "kl": 0.00279998779296875, "learning_rate": 8.683765905542326e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.04937856364995241, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 3032 }, { "completion_length": 431.0089454650879, "epoch": 0.23650926665757216, "grad_norm": 0.1152251259434727, "kl": 0.0031452178955078125, "learning_rate": 8.682109535388976e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.07320022117346525, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 3034 }, { "completion_length": 425.3259162902832, "epoch": 0.236665172568355, "grad_norm": 0.11768582119797212, "kl": 0.0029096603393554688, "learning_rate": 8.680452281874e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.07710441667586565, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 3036 }, { "completion_length": 430.61608505249023, "epoch": 0.23682107847913783, "grad_norm": 0.1464385935820751, "kl": 0.0030002593994140625, "learning_rate": 8.678794145394981e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.11227418296039104, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 3038 }, { "completion_length": 417.93305587768555, "epoch": 0.23697698438992068, "grad_norm": 0.12471656678899792, "kl": 0.0029430389404296875, "learning_rate": 8.677135126349722e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.07079096138477325, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 3040 }, { "completion_length": 428.3281440734863, "epoch": 0.23713289030070353, "grad_norm": 0.08525809765793052, "kl": 0.0029697418212890625, "learning_rate": 8.675475225136229e-07, "loss": 0.0001, "reward": 1.7008929550647736, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.7008928805589676, "rewards/format_reward": 1.0, "step": 3042 }, { "completion_length": 420.4866256713867, "epoch": 0.23728879621148638, "grad_norm": 0.12144131716524606, "kl": 0.0033903121948242188, "learning_rate": 8.67381444215272e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.094093375839293, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 3044 }, { "completion_length": 424.85269927978516, "epoch": 0.2374447021222692, "grad_norm": 0.10281047138222546, "kl": 0.002933502197265625, "learning_rate": 8.672152777797635e-07, "loss": 0.0001, "reward": 1.7433036416769028, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.7433035895228386, "rewards/format_reward": 1.0, "step": 3046 }, { "completion_length": 422.142879486084, "epoch": 0.23760060803305205, "grad_norm": 0.13893770433773756, "kl": 0.0029144287109375, "learning_rate": 8.670490232469611e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.08213907573372126, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 3048 }, { "completion_length": 427.2924270629883, "epoch": 0.2377565139438349, "grad_norm": 0.11843937681123727, "kl": 0.0032444000244140625, "learning_rate": 8.668826806567507e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.09214058332145214, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 3050 }, { "completion_length": 418.5335006713867, "epoch": 0.23791241985461775, "grad_norm": 0.07281328941079308, "kl": 0.0033512115478515625, "learning_rate": 8.667162500490388e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.07192142773419619, "rewards/accuracy_reward": 0.8861607313156128, "rewards/format_reward": 1.0, "step": 3052 }, { "completion_length": 412.0468864440918, "epoch": 0.23806832576540057, "grad_norm": 0.08851892234242367, "kl": 0.0029296875, "learning_rate": 8.665497314637534e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.0529796639457345, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 3054 }, { "completion_length": 426.0915412902832, "epoch": 0.23822423167618342, "grad_norm": 0.1336115502438405, "kl": 0.003307342529296875, "learning_rate": 8.663831249408429e-07, "loss": 0.0001, "reward": 1.6897322088479996, "reward_std": 0.10754401795566082, "rewards/accuracy_reward": 0.6919643208384514, "rewards/format_reward": 0.9977678656578064, "step": 3056 }, { "completion_length": 401.62055587768555, "epoch": 0.23838013758696627, "grad_norm": 0.049331759461658645, "kl": 0.0026388168334960938, "learning_rate": 8.662164305202777e-07, "loss": 0.0001, "reward": 1.8816965073347092, "reward_std": 0.04501790925860405, "rewards/accuracy_reward": 0.8816964700818062, "rewards/format_reward": 1.0, "step": 3058 }, { "completion_length": 428.8460006713867, "epoch": 0.2385360434977491, "grad_norm": 0.07368501029820146, "kl": 0.0028066635131835938, "learning_rate": 8.660496482420487e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 3060 }, { "completion_length": 437.1585006713867, "epoch": 0.23869194940853194, "grad_norm": 0.12633211459236432, "kl": 0.0028409957885742188, "learning_rate": 8.65882778146168e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.06951216794550419, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 3062 }, { "completion_length": 429.4486770629883, "epoch": 0.2388478553193148, "grad_norm": 0.07141117010816045, "kl": 0.002826690673828125, "learning_rate": 8.657158202726688e-07, "loss": 0.0001, "reward": 1.875000074505806, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 1.0, "step": 3064 }, { "completion_length": 424.8727836608887, "epoch": 0.23900376123009764, "grad_norm": 0.04800781201560112, "kl": 0.002712249755859375, "learning_rate": 8.655487746616057e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.039377059787511826, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 3066 }, { "completion_length": 427.16966247558594, "epoch": 0.23915966714088047, "grad_norm": 0.10794956533717764, "kl": 0.0032558441162109375, "learning_rate": 8.653816413530535e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.028182310983538628, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 3068 }, { "completion_length": 423.3526954650879, "epoch": 0.23931557305166332, "grad_norm": 0.137693404771774, "kl": 0.0030422210693359375, "learning_rate": 8.652144203871088e-07, "loss": 0.0001, "reward": 1.7812500596046448, "reward_std": 0.07515301555395126, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 0.9977678656578064, "step": 3070 }, { "completion_length": 411.85939025878906, "epoch": 0.23947147896244617, "grad_norm": 0.14792601441497039, "kl": 0.0026769638061523438, "learning_rate": 8.650471118038889e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.07695244625210762, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 3072 }, { "completion_length": 427.22993087768555, "epoch": 0.23962738487322902, "grad_norm": 0.11855518435682792, "kl": 0.0031175613403320312, "learning_rate": 8.648797156435323e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.07402255106717348, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 3074 }, { "completion_length": 434.47769927978516, "epoch": 0.23978329078401184, "grad_norm": 0.11678090387211257, "kl": 0.0034637451171875, "learning_rate": 8.647122319461983e-07, "loss": 0.0001, "reward": 1.7232143580913544, "reward_std": 0.08942962810397148, "rewards/accuracy_reward": 0.7232143171131611, "rewards/format_reward": 1.0, "step": 3076 }, { "completion_length": 416.18305587768555, "epoch": 0.2399391966947947, "grad_norm": 0.057026357320725196, "kl": 0.00295257568359375, "learning_rate": 8.645446607520676e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.8638393357396126, "rewards/format_reward": 1.0, "step": 3078 }, { "completion_length": 418.6897506713867, "epoch": 0.24009510260557754, "grad_norm": 0.07508390131647961, "kl": 0.003376007080078125, "learning_rate": 8.643770021013412e-07, "loss": 0.0001, "reward": 1.76339291036129, "reward_std": 0.06319871358573437, "rewards/accuracy_reward": 0.7633928805589676, "rewards/format_reward": 1.0, "step": 3080 }, { "completion_length": 437.4575996398926, "epoch": 0.2402510085163604, "grad_norm": 0.13008883361193335, "kl": 0.003314971923828125, "learning_rate": 8.642092560342416e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.09214058239012957, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 3082 }, { "completion_length": 411.714298248291, "epoch": 0.2404069144271432, "grad_norm": 0.08754401371417082, "kl": 0.00273895263671875, "learning_rate": 8.640414225910123e-07, "loss": 0.0001, "reward": 1.7433036267757416, "reward_std": 0.06365517433732748, "rewards/accuracy_reward": 0.7433036155998707, "rewards/format_reward": 1.0, "step": 3084 }, { "completion_length": 423.70091247558594, "epoch": 0.24056282033792606, "grad_norm": 0.10722083069613127, "kl": 0.0036182403564453125, "learning_rate": 8.638735018119175e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.06914266012609005, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 3086 }, { "completion_length": 433.96430587768555, "epoch": 0.2407187262487089, "grad_norm": 0.10249768497615033, "kl": 0.0031795501708984375, "learning_rate": 8.637054937372426e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.06853506714105606, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 3088 }, { "completion_length": 426.0312728881836, "epoch": 0.24087463215949176, "grad_norm": 0.06155139855072981, "kl": 0.002887725830078125, "learning_rate": 8.635373984072936e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.0618521049618721, "rewards/accuracy_reward": 0.7879464477300644, "rewards/format_reward": 1.0, "step": 3090 }, { "completion_length": 422.6428756713867, "epoch": 0.24103053807027458, "grad_norm": 0.09437051227187399, "kl": 0.00324249267578125, "learning_rate": 8.633692158623978e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 3092 }, { "completion_length": 429.5067138671875, "epoch": 0.24118644398105743, "grad_norm": 0.10518984514506453, "kl": 0.0031719207763671875, "learning_rate": 8.63200946142903e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.060573311522603035, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 1.0, "step": 3094 }, { "completion_length": 432.73216247558594, "epoch": 0.24134234989184028, "grad_norm": 0.13062249220005867, "kl": 0.00372314453125, "learning_rate": 8.630325892891786e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.10724092461168766, "rewards/accuracy_reward": 0.7834821864962578, "rewards/format_reward": 0.9977678656578064, "step": 3096 }, { "completion_length": 438.1071662902832, "epoch": 0.24149825580262313, "grad_norm": 0.11832733019519939, "kl": 0.0032520294189453125, "learning_rate": 8.628641453416143e-07, "loss": 0.0001, "reward": 1.7343751043081284, "reward_std": 0.11535240337252617, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 3098 }, { "completion_length": 413.9241256713867, "epoch": 0.24165416171340595, "grad_norm": 0.06787260759150572, "kl": 0.0021638870239257812, "learning_rate": 8.626956143406207e-07, "loss": 0.0001, "reward": 1.9084821790456772, "reward_std": 0.02772584930062294, "rewards/accuracy_reward": 0.908482164144516, "rewards/format_reward": 1.0, "step": 3100 }, { "completion_length": 420.26341247558594, "epoch": 0.2418100676241888, "grad_norm": 0.10340288202582204, "kl": 0.003345489501953125, "learning_rate": 8.6252699632663e-07, "loss": 0.0001, "reward": 1.7187501043081284, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.7187500298023224, "rewards/format_reward": 1.0, "step": 3102 }, { "completion_length": 442.6116256713867, "epoch": 0.24196597353497165, "grad_norm": 0.11363965444652693, "kl": 0.0036525726318359375, "learning_rate": 8.62358291340094e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.060573309659957886, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 3104 }, { "completion_length": 430.1071586608887, "epoch": 0.2421218794457545, "grad_norm": 0.0648250689700181, "kl": 0.0030002593994140625, "learning_rate": 8.621894994214867e-07, "loss": 0.0001, "reward": 1.906250074505806, "reward_std": 0.05883805826306343, "rewards/accuracy_reward": 0.9062500298023224, "rewards/format_reward": 1.0, "step": 3106 }, { "completion_length": 409.2745780944824, "epoch": 0.24227778535653732, "grad_norm": 0.10257003220917561, "kl": 0.00310516357421875, "learning_rate": 8.62020620611302e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.039833519607782364, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 3108 }, { "completion_length": 430.29019927978516, "epoch": 0.24243369126732017, "grad_norm": 0.1077812238071037, "kl": 0.0034770965576171875, "learning_rate": 8.618516549500552e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.08182260859757662, "rewards/accuracy_reward": 0.85714291036129, "rewards/format_reward": 0.9977678656578064, "step": 3110 }, { "completion_length": 425.908504486084, "epoch": 0.24258959717810302, "grad_norm": 0.11553616288626908, "kl": 0.0036716461181640625, "learning_rate": 8.616826024782822e-07, "loss": 0.0001, "reward": 1.7299107909202576, "reward_std": 0.08311617188155651, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 1.0, "step": 3112 }, { "completion_length": 422.8437728881836, "epoch": 0.24274550308888584, "grad_norm": 0.09401943273453744, "kl": 0.0035381317138671875, "learning_rate": 8.615134632365397e-07, "loss": 0.0001, "reward": 1.8816965073347092, "reward_std": 0.07222311943769455, "rewards/accuracy_reward": 0.8816964775323868, "rewards/format_reward": 1.0, "step": 3114 }, { "completion_length": 426.44644927978516, "epoch": 0.2429014089996687, "grad_norm": 0.0900044925999168, "kl": 0.0029087066650390625, "learning_rate": 8.613442372654054e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.06011684890836477, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 3116 }, { "completion_length": 415.9263572692871, "epoch": 0.24305731491045154, "grad_norm": 0.06704383641286592, "kl": 0.0030393600463867188, "learning_rate": 8.611749246054776e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.05929452087730169, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 3118 }, { "completion_length": 423.5848388671875, "epoch": 0.2432132208212344, "grad_norm": 0.08323668109998232, "kl": 0.00286102294921875, "learning_rate": 8.610055252973756e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.0681655565276742, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 3120 }, { "completion_length": 407.42189025878906, "epoch": 0.24336912673201722, "grad_norm": 0.09493420597484183, "kl": 0.0025768280029296875, "learning_rate": 8.608360393817392e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 3122 }, { "completion_length": 423.35046768188477, "epoch": 0.24352503264280007, "grad_norm": 0.09860572497453977, "kl": 0.002838134765625, "learning_rate": 8.606664668992292e-07, "loss": 0.0001, "reward": 1.8504464775323868, "reward_std": 0.04404080845415592, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 3124 }, { "completion_length": 439.5044860839844, "epoch": 0.24368093855358292, "grad_norm": 0.09323694964981426, "kl": 0.0034942626953125, "learning_rate": 8.604968078905273e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.03239100147038698, "rewards/accuracy_reward": 0.7834821827709675, "rewards/format_reward": 1.0, "step": 3126 }, { "completion_length": 424.1160888671875, "epoch": 0.24383684446436577, "grad_norm": 0.15659661117912949, "kl": 0.00366973876953125, "learning_rate": 8.603270623963357e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.053282758221030235, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 3128 }, { "completion_length": 410.1495704650879, "epoch": 0.2439927503751486, "grad_norm": 0.09141664109693948, "kl": 0.0029211044311523438, "learning_rate": 8.601572304573773e-07, "loss": 0.0001, "reward": 1.8147321939468384, "reward_std": 0.0450179073959589, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 3130 }, { "completion_length": 413.3460006713867, "epoch": 0.24414865628593144, "grad_norm": 0.04278100051510422, "kl": 0.0031566619873046875, "learning_rate": 8.599873121143959e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 1.0, "step": 3132 }, { "completion_length": 412.2812690734863, "epoch": 0.2443045621967143, "grad_norm": 0.06925858899605226, "kl": 0.0027103424072265625, "learning_rate": 8.598173074081563e-07, "loss": 0.0001, "reward": 1.8727679550647736, "reward_std": 0.031112208031117916, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 3134 }, { "completion_length": 408.0625190734863, "epoch": 0.24446046810749714, "grad_norm": 0.08803782623220129, "kl": 0.0030460357666015625, "learning_rate": 8.596472163794433e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.05133136175572872, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 3136 }, { "completion_length": 426.0625190734863, "epoch": 0.24461637401827996, "grad_norm": 0.10620057990181554, "kl": 0.0033626556396484375, "learning_rate": 8.59477039069063e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.048099770210683346, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 3138 }, { "completion_length": 423.4375190734863, "epoch": 0.2447722799290628, "grad_norm": 0.11012305676430381, "kl": 0.0028514862060546875, "learning_rate": 8.59306775517842e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.05831742100417614, "rewards/accuracy_reward": 0.7790178805589676, "rewards/format_reward": 1.0, "step": 3140 }, { "completion_length": 420.3973388671875, "epoch": 0.24492818583984566, "grad_norm": 0.06855666125085606, "kl": 0.0028963088989257812, "learning_rate": 8.591364257666277e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.036598291248083115, "rewards/accuracy_reward": 0.8816964700818062, "rewards/format_reward": 1.0, "step": 3142 }, { "completion_length": 426.9687690734863, "epoch": 0.2450840917506285, "grad_norm": 0.08634567403865687, "kl": 0.0030879974365234375, "learning_rate": 8.58965989856288e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.05877024121582508, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 3144 }, { "completion_length": 433.08930587768555, "epoch": 0.24523999766141133, "grad_norm": 0.1165846571387268, "kl": 0.003192901611328125, "learning_rate": 8.587954678277116e-07, "loss": 0.0001, "reward": 1.7790179550647736, "reward_std": 0.052005368284881115, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 3146 }, { "completion_length": 426.0245704650879, "epoch": 0.24539590357219418, "grad_norm": 0.08240117519134492, "kl": 0.0034942626953125, "learning_rate": 8.586248597218079e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.056667715311050415, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 3148 }, { "completion_length": 424.82368087768555, "epoch": 0.24555180948297703, "grad_norm": 0.13015044898067274, "kl": 0.00310516357421875, "learning_rate": 8.584541655795068e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.05538892466574907, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 3150 }, { "completion_length": 427.1227798461914, "epoch": 0.24570771539375988, "grad_norm": 0.09274822336237747, "kl": 0.0032482147216796875, "learning_rate": 8.582833854417587e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 3152 }, { "completion_length": 432.4910888671875, "epoch": 0.2458636213045427, "grad_norm": 0.06156847264064568, "kl": 0.0029277801513671875, "learning_rate": 8.581125193495353e-07, "loss": 0.0001, "reward": 1.73214291036129, "reward_std": 0.05425985809415579, "rewards/accuracy_reward": 0.7343750223517418, "rewards/format_reward": 0.9977678656578064, "step": 3154 }, { "completion_length": 427.464298248291, "epoch": 0.24601952721532555, "grad_norm": 0.13406885398507845, "kl": 0.0028905868530273438, "learning_rate": 8.579415673438281e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.07597535010427237, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 3156 }, { "completion_length": 437.3013610839844, "epoch": 0.2461754331261084, "grad_norm": 0.09843460756471571, "kl": 0.0032176971435546875, "learning_rate": 8.577705294656498e-07, "loss": 0.0001, "reward": 1.7901786714792252, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 3158 }, { "completion_length": 418.8638610839844, "epoch": 0.24633133903689122, "grad_norm": 0.0035073861564459317, "kl": 0.0029888153076171875, "learning_rate": 8.575994057560332e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.025100446306169033, "rewards/accuracy_reward": 0.8392857238650322, "rewards/format_reward": 1.0, "step": 3160 }, { "completion_length": 427.0960006713867, "epoch": 0.24648724494767407, "grad_norm": 0.11697487659731089, "kl": 0.0030193328857421875, "learning_rate": 8.574281962560324e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.0947659807279706, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 0.9955357313156128, "step": 3162 }, { "completion_length": 409.2656440734863, "epoch": 0.24664315085845692, "grad_norm": 0.09004180822974926, "kl": 0.00262451171875, "learning_rate": 8.572569010067213e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 0.9977678656578064, "step": 3164 }, { "completion_length": 425.2299346923828, "epoch": 0.24679905676923977, "grad_norm": 0.09075440680174102, "kl": 0.00273895263671875, "learning_rate": 8.570855200491948e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 3166 }, { "completion_length": 423.2901954650879, "epoch": 0.2469549626800226, "grad_norm": 0.10148092774760249, "kl": 0.0031404495239257812, "learning_rate": 8.569140534245684e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.04373771324753761, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 0.9977678656578064, "step": 3168 }, { "completion_length": 429.3884086608887, "epoch": 0.24711086859080544, "grad_norm": 0.07326515679214302, "kl": 0.0032100677490234375, "learning_rate": 8.567425011739781e-07, "loss": 0.0001, "reward": 1.8191964775323868, "reward_std": 0.05395676475018263, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 0.9977678656578064, "step": 3170 }, { "completion_length": 427.7232360839844, "epoch": 0.2472667745015883, "grad_norm": 0.07822019603905213, "kl": 0.0029010772705078125, "learning_rate": 8.565708633385801e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 3172 }, { "completion_length": 426.75671768188477, "epoch": 0.24742268041237114, "grad_norm": 0.11029552989724982, "kl": 0.0030002593994140625, "learning_rate": 8.563991399595516e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 3174 }, { "completion_length": 433.39733505249023, "epoch": 0.24757858632315397, "grad_norm": 0.06755965208897659, "kl": 0.0037364959716796875, "learning_rate": 8.562273310780901e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.05493386369198561, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 3176 }, { "completion_length": 436.3638610839844, "epoch": 0.24773449223393681, "grad_norm": 0.08183323403353406, "kl": 0.00394439697265625, "learning_rate": 8.560554367354136e-07, "loss": 0.0002, "reward": 1.8058036416769028, "reward_std": 0.06124591641128063, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 3178 }, { "completion_length": 416.40180587768555, "epoch": 0.24789039814471966, "grad_norm": 0.0626707073551606, "kl": 0.0027551651000976562, "learning_rate": 8.558834569727609e-07, "loss": 0.0001, "reward": 1.9129464775323868, "reward_std": 0.05523555725812912, "rewards/accuracy_reward": 0.9129464477300644, "rewards/format_reward": 1.0, "step": 3180 }, { "completion_length": 447.9442138671875, "epoch": 0.24804630405550251, "grad_norm": 0.13305129467658902, "kl": 0.00348663330078125, "learning_rate": 8.557113918313906e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.0890587167814374, "rewards/accuracy_reward": 0.714285746216774, "rewards/format_reward": 1.0, "step": 3182 }, { "completion_length": 427.15626525878906, "epoch": 0.24820220996628534, "grad_norm": 0.09192522364680426, "kl": 0.003173828125, "learning_rate": 8.555392413525826e-07, "loss": 0.0001, "reward": 1.8370536714792252, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 3184 }, { "completion_length": 411.16519927978516, "epoch": 0.2483581158770682, "grad_norm": 0.12230708985233314, "kl": 0.0032329559326171875, "learning_rate": 8.553670055776367e-07, "loss": 0.0001, "reward": 1.709821492433548, "reward_std": 0.0923581263050437, "rewards/accuracy_reward": 0.709821455180645, "rewards/format_reward": 1.0, "step": 3186 }, { "completion_length": 436.830379486084, "epoch": 0.24851402178785104, "grad_norm": 0.11273837179419186, "kl": 0.0037212371826171875, "learning_rate": 8.551946845478736e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.08402405679225922, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 3188 }, { "completion_length": 415.50671005249023, "epoch": 0.24866992769863389, "grad_norm": 0.07713257842825604, "kl": 0.0032787322998046875, "learning_rate": 8.550222783046339e-07, "loss": 0.0001, "reward": 1.7477679550647736, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.7477678842842579, "rewards/format_reward": 1.0, "step": 3190 }, { "completion_length": 422.45537185668945, "epoch": 0.2488258336094167, "grad_norm": 0.09810583346693678, "kl": 0.0028505325317382812, "learning_rate": 8.548497868892793e-07, "loss": 0.0001, "reward": 1.8705357611179352, "reward_std": 0.05734172184020281, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 1.0, "step": 3192 }, { "completion_length": 412.8995704650879, "epoch": 0.24898173952019956, "grad_norm": 0.11190662050528838, "kl": 0.003284454345703125, "learning_rate": 8.546772103431911e-07, "loss": 0.0001, "reward": 1.689732238650322, "reward_std": 0.09626231901347637, "rewards/accuracy_reward": 0.6897321715950966, "rewards/format_reward": 1.0, "step": 3194 }, { "completion_length": 425.4509086608887, "epoch": 0.2491376454309824, "grad_norm": 0.11583595421944248, "kl": 0.002933502197265625, "learning_rate": 8.545045487077722e-07, "loss": 0.0001, "reward": 1.8995536267757416, "reward_std": 0.03968015406280756, "rewards/accuracy_reward": 0.8995536044239998, "rewards/format_reward": 1.0, "step": 3196 }, { "completion_length": 424.39957427978516, "epoch": 0.24929355134176526, "grad_norm": 0.09413260952066009, "kl": 0.0027971267700195312, "learning_rate": 8.543318020244448e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 3198 }, { "completion_length": 416.870548248291, "epoch": 0.24944945725254808, "grad_norm": 0.09316847177883883, "kl": 0.0035152435302734375, "learning_rate": 8.541589703346517e-07, "loss": 0.0001, "reward": 1.6584822088479996, "reward_std": 0.04569051321595907, "rewards/accuracy_reward": 0.6584821753203869, "rewards/format_reward": 1.0, "step": 3200 }, { "completion_length": 430.04019927978516, "epoch": 0.24960536316333093, "grad_norm": 0.08522421005468414, "kl": 0.0031480789184570312, "learning_rate": 8.539860536798569e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.045993607491254807, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 3202 }, { "completion_length": 426.4732322692871, "epoch": 0.24976126907411378, "grad_norm": 0.10139431099618487, "kl": 0.003070831298828125, "learning_rate": 8.538130521015439e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.059294517152011395, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 3204 }, { "completion_length": 425.12278747558594, "epoch": 0.24991717498489663, "grad_norm": 0.04643783512008501, "kl": 0.0029926300048828125, "learning_rate": 8.536399656412168e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.036295196041464806, "rewards/accuracy_reward": 0.7723214775323868, "rewards/format_reward": 1.0, "step": 3206 }, { "completion_length": 417.92858123779297, "epoch": 0.2500730808956795, "grad_norm": 0.09145927213605273, "kl": 0.0030002593994140625, "learning_rate": 8.534667943404004e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.07530274521559477, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 3208 }, { "completion_length": 412.41072845458984, "epoch": 0.2502289868064623, "grad_norm": 0.09807048849338634, "kl": 0.00505828857421875, "learning_rate": 8.532935382406394e-07, "loss": 0.0002, "reward": 1.8035715222358704, "reward_std": 0.0322265001013875, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 0.9977678656578064, "step": 3210 }, { "completion_length": 418.604923248291, "epoch": 0.2503848927172451, "grad_norm": 0.08266908294916399, "kl": 0.003231048583984375, "learning_rate": 8.531201973834992e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.06319871451705694, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 3212 }, { "completion_length": 419.9308204650879, "epoch": 0.25054079862802797, "grad_norm": 0.10015578381764724, "kl": 0.002655029296875, "learning_rate": 8.529467718105652e-07, "loss": 0.0001, "reward": 1.7566965073347092, "reward_std": 0.05200536735355854, "rewards/accuracy_reward": 0.756696455180645, "rewards/format_reward": 1.0, "step": 3214 }, { "completion_length": 413.35269927978516, "epoch": 0.2506967045388108, "grad_norm": 0.0734172178278501, "kl": 0.0026092529296875, "learning_rate": 8.527732615634435e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.027053246274590492, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 3216 }, { "completion_length": 407.5245666503906, "epoch": 0.25085261044959367, "grad_norm": 0.14069383341843242, "kl": 0.00267791748046875, "learning_rate": 8.5259966668376e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 3218 }, { "completion_length": 430.5223388671875, "epoch": 0.2510085163603765, "grad_norm": 0.06241109307729229, "kl": 0.0034389495849609375, "learning_rate": 8.524259872131617e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 3220 }, { "completion_length": 410.1294822692871, "epoch": 0.25116442227115937, "grad_norm": 0.09201209310145783, "kl": 0.0030364990234375, "learning_rate": 8.522522231933151e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.05621125642210245, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 3222 }, { "completion_length": 415.75894927978516, "epoch": 0.2513203281819422, "grad_norm": 0.12487126323085254, "kl": 0.0031147003173828125, "learning_rate": 8.520783746659073e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.7656250447034836, "rewards/format_reward": 1.0, "step": 3224 }, { "completion_length": 414.1250190734863, "epoch": 0.251476234092725, "grad_norm": 0.12193874844673266, "kl": 0.0029811859130859375, "learning_rate": 8.519044416726458e-07, "loss": 0.0001, "reward": 1.8705358058214188, "reward_std": 0.06853646785020828, "rewards/accuracy_reward": 0.8705357536673546, "rewards/format_reward": 1.0, "step": 3226 }, { "completion_length": 423.7254638671875, "epoch": 0.25163214000350786, "grad_norm": 0.10698710895607924, "kl": 0.0029621124267578125, "learning_rate": 8.517304242552581e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.05636461917310953, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 0.9977678656578064, "step": 3228 }, { "completion_length": 440.6562728881836, "epoch": 0.2517880459142907, "grad_norm": 0.16317794811115252, "kl": 0.00356292724609375, "learning_rate": 8.515563224554922e-07, "loss": 0.0001, "reward": 1.7544643878936768, "reward_std": 0.07432928308844566, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 0.9955357164144516, "step": 3230 }, { "completion_length": 418.2701110839844, "epoch": 0.25194395182507356, "grad_norm": 0.07469587023966212, "kl": 0.003253936767578125, "learning_rate": 8.513821363151162e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.06786386482417583, "rewards/accuracy_reward": 0.852678619325161, "rewards/format_reward": 1.0, "step": 3232 }, { "completion_length": 410.76118087768555, "epoch": 0.2520998577358564, "grad_norm": 0.07093335932069422, "kl": 0.0027837753295898438, "learning_rate": 8.512078658759184e-07, "loss": 0.0001, "reward": 1.8705357611179352, "reward_std": 0.0489221028983593, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 1.0, "step": 3234 }, { "completion_length": 431.54243087768555, "epoch": 0.25225576364663926, "grad_norm": 0.07885310724481613, "kl": 0.003326416015625, "learning_rate": 8.510335111797075e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.06529010646045208, "rewards/accuracy_reward": 0.8750000521540642, "rewards/format_reward": 0.9977678656578064, "step": 3236 }, { "completion_length": 423.3817138671875, "epoch": 0.2524116695574221, "grad_norm": 0.12110284133291743, "kl": 0.003292083740234375, "learning_rate": 8.508590722683123e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.08131310623139143, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 3238 }, { "completion_length": 423.3906440734863, "epoch": 0.25256757546820496, "grad_norm": 0.10193426747488503, "kl": 0.003009796142578125, "learning_rate": 8.506845491835817e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.06688676495105028, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 3240 }, { "completion_length": 418.93305587768555, "epoch": 0.25272348137898776, "grad_norm": 0.11340272261458441, "kl": 0.002765655517578125, "learning_rate": 8.505099419673849e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8125000521540642, "rewards/format_reward": 1.0, "step": 3242 }, { "completion_length": 437.54689025878906, "epoch": 0.2528793872897706, "grad_norm": 0.08718638476407459, "kl": 0.0032138824462890625, "learning_rate": 8.503352506616115e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.04614697303622961, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 3244 }, { "completion_length": 423.2857322692871, "epoch": 0.25303529320055346, "grad_norm": 0.14074691011054566, "kl": 0.0031070709228515625, "learning_rate": 8.50160475308171e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.08409187477082014, "rewards/accuracy_reward": 0.8883928954601288, "rewards/format_reward": 1.0, "step": 3246 }, { "completion_length": 416.12278747558594, "epoch": 0.2531911991113363, "grad_norm": 0.11622652951251977, "kl": 0.002765655517578125, "learning_rate": 8.49985615948993e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.06365517526865005, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 3248 }, { "completion_length": 420.26564025878906, "epoch": 0.25334710502211916, "grad_norm": 0.10723420942588142, "kl": 0.003314971923828125, "learning_rate": 8.498106726260272e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.054932461120188236, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 3250 }, { "completion_length": 430.7634086608887, "epoch": 0.253503010932902, "grad_norm": 0.1230294172056074, "kl": 0.0029802322387695312, "learning_rate": 8.496356453812442e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.07680131867527962, "rewards/accuracy_reward": 0.8147321939468384, "rewards/format_reward": 1.0, "step": 3252 }, { "completion_length": 428.85269927978516, "epoch": 0.25365891684368486, "grad_norm": 0.046406348637036555, "kl": 0.003204345703125, "learning_rate": 8.494605342566337e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.7745535969734192, "rewards/format_reward": 1.0, "step": 3254 }, { "completion_length": 419.77903747558594, "epoch": 0.2538148227544677, "grad_norm": 0.09261215938190721, "kl": 0.0031986236572265625, "learning_rate": 8.49285339294206e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 3256 }, { "completion_length": 433.47546768188477, "epoch": 0.2539707286652505, "grad_norm": 0.09217894122764032, "kl": 0.003170013427734375, "learning_rate": 8.491100605359916e-07, "loss": 0.0001, "reward": 1.7142857909202576, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.716517873108387, "rewards/format_reward": 0.9977678656578064, "step": 3258 }, { "completion_length": 433.3839454650879, "epoch": 0.25412663457603335, "grad_norm": 0.08616148361890934, "kl": 0.0032911300659179688, "learning_rate": 8.48934698024041e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.05507245659828186, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 0.9977678656578064, "step": 3260 }, { "completion_length": 434.8727836608887, "epoch": 0.2542825404868162, "grad_norm": 0.10300476237168525, "kl": 0.0034351348876953125, "learning_rate": 8.487592518004248e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.08927626255899668, "rewards/accuracy_reward": 0.8080357313156128, "rewards/format_reward": 1.0, "step": 3262 }, { "completion_length": 426.3326072692871, "epoch": 0.25443844639759905, "grad_norm": 0.15032341597701193, "kl": 0.003459930419921875, "learning_rate": 8.485837219072336e-07, "loss": 0.0001, "reward": 1.8459821939468384, "reward_std": 0.034342397935688496, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 3264 }, { "completion_length": 427.10046768188477, "epoch": 0.2545943523083819, "grad_norm": 0.06455469824770173, "kl": 0.0028533935546875, "learning_rate": 8.484081083865782e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8437500447034836, "rewards/format_reward": 1.0, "step": 3266 }, { "completion_length": 421.8303756713867, "epoch": 0.25475025821916475, "grad_norm": 0.06822591332913365, "kl": 0.0029296875, "learning_rate": 8.482324112805894e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 1.0, "step": 3268 }, { "completion_length": 421.3326072692871, "epoch": 0.2549061641299476, "grad_norm": 0.0892929622674881, "kl": 0.0030574798583984375, "learning_rate": 8.480566306314178e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.028182310983538628, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 3270 }, { "completion_length": 420.8861770629883, "epoch": 0.2550620700407304, "grad_norm": 0.10867789075585356, "kl": 0.0025272369384765625, "learning_rate": 8.478807664812349e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.058978053741157055, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 0.9977678656578064, "step": 3272 }, { "completion_length": 420.924129486084, "epoch": 0.25521797595151324, "grad_norm": 0.12997355219061824, "kl": 0.003543853759765625, "learning_rate": 8.477048188722311e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.07207115553319454, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 3274 }, { "completion_length": 421.4553756713867, "epoch": 0.2553738818622961, "grad_norm": 0.08873040678783653, "kl": 0.0029926300048828125, "learning_rate": 8.475287878466176e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.7991071939468384, "rewards/format_reward": 1.0, "step": 3276 }, { "completion_length": 430.47546768188477, "epoch": 0.25552978777307894, "grad_norm": 0.09617816065055115, "kl": 0.00295257568359375, "learning_rate": 8.473526734466252e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.06786246318370104, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 3278 }, { "completion_length": 421.0669822692871, "epoch": 0.2556856936838618, "grad_norm": 0.09136150354823959, "kl": 0.0030002593994140625, "learning_rate": 8.471764757145051e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.06612861063331366, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 3280 }, { "completion_length": 423.3482322692871, "epoch": 0.25584159959464464, "grad_norm": 0.12611977776813343, "kl": 0.00275421142578125, "learning_rate": 8.470001946925281e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.05831882078200579, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 3282 }, { "completion_length": 419.2009086608887, "epoch": 0.2559975055054275, "grad_norm": 0.07763473133235602, "kl": 0.0030231475830078125, "learning_rate": 8.468238304229851e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 3284 }, { "completion_length": 434.4888572692871, "epoch": 0.25615341141621034, "grad_norm": 0.0876486677584828, "kl": 0.0033597946166992188, "learning_rate": 8.466473829481873e-07, "loss": 0.0001, "reward": 1.9151786416769028, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.9151786118745804, "rewards/format_reward": 1.0, "step": 3286 }, { "completion_length": 421.80805587768555, "epoch": 0.25630931732699314, "grad_norm": 0.075963923434534, "kl": 0.0028657913208007812, "learning_rate": 8.464708523104652e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 3288 }, { "completion_length": 430.1808204650879, "epoch": 0.256465223237776, "grad_norm": 0.11431165903612357, "kl": 0.00283050537109375, "learning_rate": 8.462942385521699e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.07371945679187775, "rewards/accuracy_reward": 0.8593750298023224, "rewards/format_reward": 1.0, "step": 3290 }, { "completion_length": 432.971004486084, "epoch": 0.25662112914855884, "grad_norm": 0.09498790659727069, "kl": 0.0033321380615234375, "learning_rate": 8.461175417156719e-07, "loss": 0.0001, "reward": 1.80803582072258, "reward_std": 0.07776504755020142, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.9955357313156128, "step": 3292 }, { "completion_length": 437.2968940734863, "epoch": 0.2567770350593417, "grad_norm": 0.13034349795957953, "kl": 0.0032596588134765625, "learning_rate": 8.459407618433623e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.08049077074974775, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 3294 }, { "completion_length": 419.8482322692871, "epoch": 0.25693294097012453, "grad_norm": 0.06572159652936536, "kl": 0.0027742385864257812, "learning_rate": 8.457638989776514e-07, "loss": 0.0001, "reward": 1.8772321790456772, "reward_std": 0.014579705893993378, "rewards/accuracy_reward": 0.8772321790456772, "rewards/format_reward": 1.0, "step": 3296 }, { "completion_length": 415.5848388671875, "epoch": 0.2570888468809074, "grad_norm": 0.07518972751355012, "kl": 0.002941131591796875, "learning_rate": 8.455869531609698e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.07567225396633148, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 3298 }, { "completion_length": 435.0223388671875, "epoch": 0.25724475279169023, "grad_norm": 0.11568878652282398, "kl": 0.0034122467041015625, "learning_rate": 8.45409924435768e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.08168401569128036, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 3300 }, { "completion_length": 422.43082427978516, "epoch": 0.2574006587024731, "grad_norm": 0.3325654181786618, "kl": 0.004151344299316406, "learning_rate": 8.452328128445164e-07, "loss": 0.0002, "reward": 1.8705357760190964, "reward_std": 0.05035426188260317, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 0.9977678656578064, "step": 3302 }, { "completion_length": 429.4888610839844, "epoch": 0.2575565646132559, "grad_norm": 0.07291749975096151, "kl": 0.0035648345947265625, "learning_rate": 8.45055618429705e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.05282769910991192, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 3304 }, { "completion_length": 438.04243087768555, "epoch": 0.25771247052403873, "grad_norm": 0.10345107676056527, "kl": 0.003215789794921875, "learning_rate": 8.448783412338439e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.04629670176655054, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 3306 }, { "completion_length": 433.7210006713867, "epoch": 0.2578683764348216, "grad_norm": 0.1219107053206917, "kl": 0.0035419464111328125, "learning_rate": 8.447009812994632e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.0674074050039053, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 3308 }, { "completion_length": 423.6741256713867, "epoch": 0.2580242823456044, "grad_norm": 0.06850956072691952, "kl": 0.0032587051391601562, "learning_rate": 8.445235386691126e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 3310 }, { "completion_length": 416.9843978881836, "epoch": 0.2581801882563873, "grad_norm": 0.06750322600519502, "kl": 0.0026597976684570312, "learning_rate": 8.443460133853616e-07, "loss": 0.0001, "reward": 1.8794643431901932, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.8794643208384514, "rewards/format_reward": 1.0, "step": 3312 }, { "completion_length": 420.8750114440918, "epoch": 0.2583360941671701, "grad_norm": 0.12061502325840846, "kl": 0.0033283233642578125, "learning_rate": 8.441684054907999e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.06801583059132099, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 3314 }, { "completion_length": 424.8794822692871, "epoch": 0.258492000077953, "grad_norm": 0.10018692224276442, "kl": 0.003170013427734375, "learning_rate": 8.439907150280366e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.079667036421597, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 3316 }, { "completion_length": 419.4085006713867, "epoch": 0.25864790598873577, "grad_norm": 0.0850542136859374, "kl": 0.0032062530517578125, "learning_rate": 8.438129420397006e-07, "loss": 0.0001, "reward": 1.687500074505806, "reward_std": 0.054932462982833385, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 1.0, "step": 3318 }, { "completion_length": 423.7455520629883, "epoch": 0.2588038118995186, "grad_norm": 0.09950959939466526, "kl": 0.00330352783203125, "learning_rate": 8.43635086568441e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.0771058164536953, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 3320 }, { "completion_length": 409.33929443359375, "epoch": 0.25895971781030147, "grad_norm": 0.08723231551710998, "kl": 0.0030698776245117188, "learning_rate": 8.434571486569266e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.03788072057068348, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 3322 }, { "completion_length": 421.19197845458984, "epoch": 0.2591156237210843, "grad_norm": 0.11211687583580852, "kl": 0.0028810501098632812, "learning_rate": 8.432791283478458e-07, "loss": 0.0001, "reward": 1.906250074505806, "reward_std": 0.04614697303622961, "rewards/accuracy_reward": 0.906250037252903, "rewards/format_reward": 1.0, "step": 3324 }, { "completion_length": 420.2567138671875, "epoch": 0.25927152963186717, "grad_norm": 0.11124378190149375, "kl": 0.002971649169921875, "learning_rate": 8.431010256839064e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.035472865216434, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 3326 }, { "completion_length": 427.6651954650879, "epoch": 0.25942743554265, "grad_norm": 0.06702036897825367, "kl": 0.002777099609375, "learning_rate": 8.429228407078368e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 3328 }, { "completion_length": 441.1205520629883, "epoch": 0.25958334145343287, "grad_norm": 0.09896090525404111, "kl": 0.003253936767578125, "learning_rate": 8.427445734623846e-07, "loss": 0.0001, "reward": 1.8861607611179352, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.8861607387661934, "rewards/format_reward": 1.0, "step": 3330 }, { "completion_length": 425.4018020629883, "epoch": 0.2597392473642157, "grad_norm": 0.12947251195290985, "kl": 0.0030431747436523438, "learning_rate": 8.425662239903172e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.07469655480235815, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 3332 }, { "completion_length": 426.33484268188477, "epoch": 0.2598951532749985, "grad_norm": 0.10150884282423557, "kl": 0.0032062530517578125, "learning_rate": 8.423877923344218e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.06591106671839952, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 3334 }, { "completion_length": 425.4241256713867, "epoch": 0.26005105918578136, "grad_norm": 0.09487131256483504, "kl": 0.00316619873046875, "learning_rate": 8.422092785375055e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.06252611055970192, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 0.9977678656578064, "step": 3336 }, { "completion_length": 419.2812690734863, "epoch": 0.2602069650965642, "grad_norm": 0.08167880356507576, "kl": 0.0028018951416015625, "learning_rate": 8.420306826423944e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.9040178805589676, "rewards/format_reward": 1.0, "step": 3338 }, { "completion_length": 419.3794822692871, "epoch": 0.26036287100734706, "grad_norm": 0.08365211190390975, "kl": 0.0033168792724609375, "learning_rate": 8.418520046919353e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.0562126561999321, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 3340 }, { "completion_length": 442.82144927978516, "epoch": 0.2605187769181299, "grad_norm": 0.06587323576311796, "kl": 0.0033416748046875, "learning_rate": 8.416732447289939e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.03675165772438049, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 3342 }, { "completion_length": 421.6585006713867, "epoch": 0.26067468282891276, "grad_norm": 0.04874695331742617, "kl": 0.0025787353515625, "learning_rate": 8.414944027964561e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8303571715950966, "rewards/format_reward": 1.0, "step": 3344 }, { "completion_length": 408.4442138671875, "epoch": 0.2608305887396956, "grad_norm": 0.10867657303099997, "kl": 0.0030765533447265625, "learning_rate": 8.413154789372268e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.058773879893124104, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 3346 }, { "completion_length": 427.72546768188477, "epoch": 0.26098649465047846, "grad_norm": 0.1416882191262479, "kl": 0.0028133392333984375, "learning_rate": 8.411364731942315e-07, "loss": 0.0001, "reward": 1.8102678954601288, "reward_std": 0.0771058164536953, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 3348 }, { "completion_length": 424.5156440734863, "epoch": 0.26114240056126126, "grad_norm": 0.10173893936390299, "kl": 0.0026454925537109375, "learning_rate": 8.409573856104145e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.06170237623155117, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 3350 }, { "completion_length": 415.6897506713867, "epoch": 0.2612983064720441, "grad_norm": 0.09568597484356174, "kl": 0.002918243408203125, "learning_rate": 8.4077821622874e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.04727239813655615, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 0.9977678656578064, "step": 3352 }, { "completion_length": 422.0067138671875, "epoch": 0.26145421238282696, "grad_norm": 0.1061501945393072, "kl": 0.00311279296875, "learning_rate": 8.405989650921922e-07, "loss": 0.0001, "reward": 1.8013393878936768, "reward_std": 0.06478060130029917, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 0.9977678656578064, "step": 3354 }, { "completion_length": 429.4977836608887, "epoch": 0.2616101182936098, "grad_norm": 0.13065476117178818, "kl": 0.003452301025390625, "learning_rate": 8.404196322437745e-07, "loss": 0.0001, "reward": 1.8125001192092896, "reward_std": 0.06688676495105028, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 3356 }, { "completion_length": 424.63171768188477, "epoch": 0.26176602420439266, "grad_norm": 0.13106520726515924, "kl": 0.002887725830078125, "learning_rate": 8.402402177265096e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.0993405394256115, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 3358 }, { "completion_length": 414.0669822692871, "epoch": 0.2619219301151755, "grad_norm": 0.12312678748708099, "kl": 0.0030765533447265625, "learning_rate": 8.400607215834406e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.06417581345885992, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 3360 }, { "completion_length": 429.82144927978516, "epoch": 0.26207783602595836, "grad_norm": 0.09069912007281443, "kl": 0.003101348876953125, "learning_rate": 8.398811438576296e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 3362 }, { "completion_length": 435.93305587768555, "epoch": 0.26223374193674115, "grad_norm": 0.11988534420828402, "kl": 0.003192901611328125, "learning_rate": 8.397014845921587e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 3364 }, { "completion_length": 434.4018096923828, "epoch": 0.262389647847524, "grad_norm": 0.0848578473915258, "kl": 0.00275421142578125, "learning_rate": 8.39521743830129e-07, "loss": 0.0001, "reward": 1.845982238650322, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 3366 }, { "completion_length": 423.0134162902832, "epoch": 0.26254555375830685, "grad_norm": 0.08253746191031353, "kl": 0.0033817291259765625, "learning_rate": 8.393419216146615e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.7142857424914837, "rewards/format_reward": 1.0, "step": 3368 }, { "completion_length": 446.5982360839844, "epoch": 0.2627014596690897, "grad_norm": 0.11395092766948, "kl": 0.0033111572265625, "learning_rate": 8.39162017988897e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.07176806032657623, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 3370 }, { "completion_length": 424.7544822692871, "epoch": 0.26285736557987255, "grad_norm": 0.08005521285889325, "kl": 0.00318145751953125, "learning_rate": 8.389820329959951e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.053282758221030235, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 3372 }, { "completion_length": 435.4620704650879, "epoch": 0.2630132714906554, "grad_norm": 0.10870521773161598, "kl": 0.0030965805053710938, "learning_rate": 8.388019666791356e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.08792824950069189, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 0.9977678656578064, "step": 3374 }, { "completion_length": 426.7232360839844, "epoch": 0.26316917740143825, "grad_norm": 0.09376976400859492, "kl": 0.0027904510498046875, "learning_rate": 8.386218190815176e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.06560797244310379, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 0.9977678656578064, "step": 3376 }, { "completion_length": 421.32591247558594, "epoch": 0.2633250833122211, "grad_norm": 0.12145427978211765, "kl": 0.0030841827392578125, "learning_rate": 8.384415902463595e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.06395826768130064, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 3378 }, { "completion_length": 419.02457427978516, "epoch": 0.2634809892230039, "grad_norm": 0.11210540727418132, "kl": 0.0033626556396484375, "learning_rate": 8.382612802168993e-07, "loss": 0.0001, "reward": 1.7611607760190964, "reward_std": 0.07371945679187775, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 3380 }, { "completion_length": 419.8526954650879, "epoch": 0.26363689513378674, "grad_norm": 0.10628688138340509, "kl": 0.0033130645751953125, "learning_rate": 8.380808890363949e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.056514348834753036, "rewards/accuracy_reward": 0.81026791036129, "rewards/format_reward": 1.0, "step": 3382 }, { "completion_length": 417.7701072692871, "epoch": 0.2637928010445696, "grad_norm": 0.039702390611419384, "kl": 0.00263214111328125, "learning_rate": 8.37900416748123e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.03111220896244049, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 3384 }, { "completion_length": 430.22546005249023, "epoch": 0.26394870695535244, "grad_norm": 0.10654454912119846, "kl": 0.0035190582275390625, "learning_rate": 8.3771986339538e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.0797312157228589, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 3386 }, { "completion_length": 420.56474685668945, "epoch": 0.2641046128661353, "grad_norm": 0.1389354477509305, "kl": 0.003650665283203125, "learning_rate": 8.37539229021482e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.09122905880212784, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 1.0, "step": 3388 }, { "completion_length": 429.79243087768555, "epoch": 0.26426051877691814, "grad_norm": 0.07331494363238689, "kl": 0.0028238296508789062, "learning_rate": 8.373585136697642e-07, "loss": 0.0001, "reward": 1.7857143431901932, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7857143059372902, "rewards/format_reward": 1.0, "step": 3390 }, { "completion_length": 419.88394927978516, "epoch": 0.264416424687701, "grad_norm": 0.12578807918367746, "kl": 0.0028543472290039062, "learning_rate": 8.371777173835815e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06575770024210215, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 3392 }, { "completion_length": 418.97993087768555, "epoch": 0.26457233059848384, "grad_norm": 0.1252662130265821, "kl": 0.0030374526977539062, "learning_rate": 8.369968402063079e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.08372236229479313, "rewards/accuracy_reward": 0.7656250335276127, "rewards/format_reward": 1.0, "step": 3394 }, { "completion_length": 423.3214454650879, "epoch": 0.26472823650926663, "grad_norm": 0.06391737708540861, "kl": 0.0030727386474609375, "learning_rate": 8.368158821813371e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.04035275708884001, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 3396 }, { "completion_length": 425.9620704650879, "epoch": 0.2648841424200495, "grad_norm": 0.09336767148857208, "kl": 0.003070831298828125, "learning_rate": 8.366348433520822e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.0725806588307023, "rewards/accuracy_reward": 0.7433036155998707, "rewards/format_reward": 0.9977678656578064, "step": 3398 }, { "completion_length": 425.4754638671875, "epoch": 0.26504004833083233, "grad_norm": 0.10927694158423405, "kl": 0.0029315948486328125, "learning_rate": 8.364537237619754e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.05929311830550432, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 1.0, "step": 3400 }, { "completion_length": 435.8750190734863, "epoch": 0.2651959542416152, "grad_norm": 0.13869057568523893, "kl": 0.00327301025390625, "learning_rate": 8.362725234544685e-07, "loss": 0.0001, "reward": 1.6763393729925156, "reward_std": 0.07868629973381758, "rewards/accuracy_reward": 0.6785714626312256, "rewards/format_reward": 0.9977678656578064, "step": 3402 }, { "completion_length": 425.6652030944824, "epoch": 0.26535186015239803, "grad_norm": 0.08855981627573174, "kl": 0.0030269622802734375, "learning_rate": 8.360912424730327e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.05636462103575468, "rewards/accuracy_reward": 0.750000037252903, "rewards/format_reward": 1.0, "step": 3404 }, { "completion_length": 429.346004486084, "epoch": 0.2655077660631809, "grad_norm": 0.08860279172221937, "kl": 0.003040313720703125, "learning_rate": 8.359098808611583e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.05892360955476761, "rewards/accuracy_reward": 0.7700893096625805, "rewards/format_reward": 1.0, "step": 3406 }, { "completion_length": 442.17189025878906, "epoch": 0.26566367197396373, "grad_norm": 0.10889655764538311, "kl": 0.0033054351806640625, "learning_rate": 8.357284386623552e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.08650113269686699, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 1.0, "step": 3408 }, { "completion_length": 430.95984268188477, "epoch": 0.2658195778847466, "grad_norm": 0.10035503236491775, "kl": 0.003070831298828125, "learning_rate": 8.355469159201526e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.05298106465488672, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 3410 }, { "completion_length": 428.1830520629883, "epoch": 0.2659754837955294, "grad_norm": 0.10566044730766634, "kl": 0.0033054351806640625, "learning_rate": 8.353653126780988e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.07192142494022846, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 0.9977678656578064, "step": 3412 }, { "completion_length": 425.5513610839844, "epoch": 0.2661313897063122, "grad_norm": 0.11065962568226483, "kl": 0.0026493072509765625, "learning_rate": 8.351836289797619e-07, "loss": 0.0001, "reward": 1.7700893878936768, "reward_std": 0.10399456229060888, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 0.9955357313156128, "step": 3414 }, { "completion_length": 412.5870666503906, "epoch": 0.2662872956170951, "grad_norm": 0.10192535776925783, "kl": 0.0028409957885742188, "learning_rate": 8.350018648687286e-07, "loss": 0.0001, "reward": 1.8526786714792252, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 3416 }, { "completion_length": 422.7410888671875, "epoch": 0.2664432015278779, "grad_norm": 0.06835334034265642, "kl": 0.00299835205078125, "learning_rate": 8.348200203886055e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.06087640579789877, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 3418 }, { "completion_length": 426.20760345458984, "epoch": 0.2665991074386608, "grad_norm": 0.07775269934540946, "kl": 0.0029087066650390625, "learning_rate": 8.346380955830181e-07, "loss": 0.0001, "reward": 1.8973214775323868, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.897321455180645, "rewards/format_reward": 1.0, "step": 3420 }, { "completion_length": 415.1808166503906, "epoch": 0.2667550133494436, "grad_norm": 0.0822194938819765, "kl": 0.003276824951171875, "learning_rate": 8.344560904956116e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.8125000521540642, "rewards/format_reward": 1.0, "step": 3422 }, { "completion_length": 423.83707427978516, "epoch": 0.2669109192602265, "grad_norm": 0.06304519028146473, "kl": 0.00296783447265625, "learning_rate": 8.342740051700498e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.06591106858104467, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 3424 }, { "completion_length": 420.4107322692871, "epoch": 0.26706682517100927, "grad_norm": 0.09999449196903686, "kl": 0.0029745101928710938, "learning_rate": 8.340918396500165e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.08070691395550966, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 3426 }, { "completion_length": 414.7343940734863, "epoch": 0.2672227310817921, "grad_norm": 0.10380241499758176, "kl": 0.0029010772705078125, "learning_rate": 8.339095939792143e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.08747683186084032, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 3428 }, { "completion_length": 418.9821662902832, "epoch": 0.26737863699257497, "grad_norm": 0.11508354731003305, "kl": 0.0028667449951171875, "learning_rate": 8.337272682013647e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.06914125755429268, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 3430 }, { "completion_length": 417.75224685668945, "epoch": 0.2675345429033578, "grad_norm": 0.06580135177765604, "kl": 0.0025529861450195312, "learning_rate": 8.335448623602094e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.022845957428216934, "rewards/accuracy_reward": 0.872767873108387, "rewards/format_reward": 1.0, "step": 3432 }, { "completion_length": 427.02903747558594, "epoch": 0.26769044881414067, "grad_norm": 0.09656806002964967, "kl": 0.0029621124267578125, "learning_rate": 8.333623764995084e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.053282758221030235, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 3434 }, { "completion_length": 418.4218864440918, "epoch": 0.2678463547249235, "grad_norm": 0.10337084961540155, "kl": 0.002872467041015625, "learning_rate": 8.331798106630412e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.0489221028983593, "rewards/accuracy_reward": 0.750000037252903, "rewards/format_reward": 1.0, "step": 3436 }, { "completion_length": 415.1339454650879, "epoch": 0.26800226063570637, "grad_norm": 0.09210761192959879, "kl": 0.00258636474609375, "learning_rate": 8.329971648946066e-07, "loss": 0.0001, "reward": 1.8794643729925156, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 1.0, "step": 3438 }, { "completion_length": 419.91519927978516, "epoch": 0.2681581665464892, "grad_norm": 0.08850303557798468, "kl": 0.00336456298828125, "learning_rate": 8.328144392380227e-07, "loss": 0.0001, "reward": 1.7142857760190964, "reward_std": 0.07755723688751459, "rewards/accuracy_reward": 0.7142857536673546, "rewards/format_reward": 1.0, "step": 3440 }, { "completion_length": 422.38171768188477, "epoch": 0.268314072457272, "grad_norm": 0.08030316000116149, "kl": 0.0032901763916015625, "learning_rate": 8.32631633737126e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 3442 }, { "completion_length": 426.9352798461914, "epoch": 0.26846997836805486, "grad_norm": 0.047672571529122784, "kl": 0.0027933120727539062, "learning_rate": 8.324487484357734e-07, "loss": 0.0001, "reward": 1.9017857909202576, "reward_std": 0.04035415779799223, "rewards/accuracy_reward": 0.901785746216774, "rewards/format_reward": 1.0, "step": 3444 }, { "completion_length": 425.50671768188477, "epoch": 0.2686258842788377, "grad_norm": 0.08123151541663931, "kl": 0.0030422210693359375, "learning_rate": 8.322657833778397e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 3446 }, { "completion_length": 422.1919822692871, "epoch": 0.26878179018962056, "grad_norm": 0.0036031467181667142, "kl": 0.003414154052734375, "learning_rate": 8.320827386072197e-07, "loss": 0.0001, "reward": 1.7790179550647736, "reward_std": 0.025774452835321426, "rewards/accuracy_reward": 0.7790178805589676, "rewards/format_reward": 1.0, "step": 3448 }, { "completion_length": 409.9196662902832, "epoch": 0.2689376961004034, "grad_norm": 0.041609818576136146, "kl": 0.0026617050170898438, "learning_rate": 8.318996141678268e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.03111220896244049, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 3450 }, { "completion_length": 422.11608505249023, "epoch": 0.26909360201118626, "grad_norm": 0.054918587352048, "kl": 0.0029048919677734375, "learning_rate": 8.317164101035939e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 3452 }, { "completion_length": 426.54019927978516, "epoch": 0.2692495079219691, "grad_norm": 0.17992957079475821, "kl": 0.003612518310546875, "learning_rate": 8.315331264584727e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.06981526035815477, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 3454 }, { "completion_length": 415.6250190734863, "epoch": 0.26940541383275196, "grad_norm": 0.0699374067242632, "kl": 0.0029144287109375, "learning_rate": 8.313497632764342e-07, "loss": 0.0001, "reward": 1.8950893580913544, "reward_std": 0.01750820130109787, "rewards/accuracy_reward": 0.8950893208384514, "rewards/format_reward": 1.0, "step": 3456 }, { "completion_length": 422.38394927978516, "epoch": 0.26956131974353476, "grad_norm": 0.08697812353756641, "kl": 0.0035676956176757812, "learning_rate": 8.311663206014683e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.088971764780581, "rewards/accuracy_reward": 0.7500000223517418, "rewards/format_reward": 1.0, "step": 3458 }, { "completion_length": 419.2857322692871, "epoch": 0.2697172256543176, "grad_norm": 0.06470008830575727, "kl": 0.0029554367065429688, "learning_rate": 8.309827984775842e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.03870445489883423, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 3460 }, { "completion_length": 422.0982246398926, "epoch": 0.26987313156510045, "grad_norm": 0.0697963504461345, "kl": 0.0029888153076171875, "learning_rate": 8.3079919694881e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.05087490100413561, "rewards/accuracy_reward": 0.7790178805589676, "rewards/format_reward": 1.0, "step": 3462 }, { "completion_length": 424.0714416503906, "epoch": 0.2700290374758833, "grad_norm": 0.14264445148987656, "kl": 0.002994537353515625, "learning_rate": 8.306155160591928e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.08650113176554441, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 3464 }, { "completion_length": 430.4821586608887, "epoch": 0.27018494338666615, "grad_norm": 0.10139692116695871, "kl": 0.003322601318359375, "learning_rate": 8.304317558527989e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.047946405597031116, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 0.9977678656578064, "step": 3466 }, { "completion_length": 415.2901954650879, "epoch": 0.270340849297449, "grad_norm": 0.08590317560160611, "kl": 0.0025835037231445312, "learning_rate": 8.302479163737137e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 3468 }, { "completion_length": 415.72993087768555, "epoch": 0.27049675520823185, "grad_norm": 0.10193030282779704, "kl": 0.0028018951416015625, "learning_rate": 8.300639976660409e-07, "loss": 0.0001, "reward": 1.8504465222358704, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 3470 }, { "completion_length": 438.5848388671875, "epoch": 0.27065266111901465, "grad_norm": 0.09181247957793681, "kl": 0.003261566162109375, "learning_rate": 8.298799997739045e-07, "loss": 0.0001, "reward": 1.7745536118745804, "reward_std": 0.04666761215776205, "rewards/accuracy_reward": 0.7745535932481289, "rewards/format_reward": 1.0, "step": 3472 }, { "completion_length": 429.30359268188477, "epoch": 0.2708085670297975, "grad_norm": 0.08547138939778536, "kl": 0.0031642913818359375, "learning_rate": 8.296959227414463e-07, "loss": 0.0001, "reward": 1.76339291036129, "reward_std": 0.05523695982992649, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 3474 }, { "completion_length": 434.06921768188477, "epoch": 0.27096447294058035, "grad_norm": 0.11167923651864207, "kl": 0.0031070709228515625, "learning_rate": 8.295117666128278e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 3476 }, { "completion_length": 424.1540336608887, "epoch": 0.2711203788513632, "grad_norm": 0.1306189704323935, "kl": 0.0030460357666015625, "learning_rate": 8.29327531432229e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.09671877697110176, "rewards/accuracy_reward": 0.8325893059372902, "rewards/format_reward": 0.9977678656578064, "step": 3478 }, { "completion_length": 419.11385345458984, "epoch": 0.27127628476214605, "grad_norm": 0.08684170053189089, "kl": 0.003391265869140625, "learning_rate": 8.291432172438492e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.04501790925860405, "rewards/accuracy_reward": 0.7165178954601288, "rewards/format_reward": 1.0, "step": 3480 }, { "completion_length": 430.1852836608887, "epoch": 0.2714321906729289, "grad_norm": 0.11038527287057669, "kl": 0.0034465789794921875, "learning_rate": 8.289588240919067e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.09748877864331007, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 0.9955357313156128, "step": 3482 }, { "completion_length": 416.01341247558594, "epoch": 0.27158809658371175, "grad_norm": 0.0857334983475624, "kl": 0.0025854110717773438, "learning_rate": 8.287743520206384e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.0377273540943861, "rewards/accuracy_reward": 0.8392857685685158, "rewards/format_reward": 1.0, "step": 3484 }, { "completion_length": 411.1183166503906, "epoch": 0.2717440024944946, "grad_norm": 0.07168900576162278, "kl": 0.0028467178344726562, "learning_rate": 8.285898010743004e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.030438204295933247, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 3486 }, { "completion_length": 416.53126525878906, "epoch": 0.2718999084052774, "grad_norm": 0.10087417847252299, "kl": 0.0031833648681640625, "learning_rate": 8.284051712971677e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.05298106465488672, "rewards/accuracy_reward": 0.7321428954601288, "rewards/format_reward": 1.0, "step": 3488 }, { "completion_length": 429.59599685668945, "epoch": 0.27205581431606024, "grad_norm": 0.09538786657091619, "kl": 0.00411224365234375, "learning_rate": 8.282204627335341e-07, "loss": 0.0002, "reward": 1.7209822088479996, "reward_std": 0.09558971505612135, "rewards/accuracy_reward": 0.7209821790456772, "rewards/format_reward": 1.0, "step": 3490 }, { "completion_length": 425.62278747558594, "epoch": 0.2722117202268431, "grad_norm": 0.12118200663724805, "kl": 0.0031375885009765625, "learning_rate": 8.280356754277123e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.056158920750021935, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.9977678656578064, "step": 3492 }, { "completion_length": 427.43305587768555, "epoch": 0.27236762613762594, "grad_norm": 0.08164875958125932, "kl": 0.002918243408203125, "learning_rate": 8.278508094240341e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.04861900769174099, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 0.9977678656578064, "step": 3494 }, { "completion_length": 423.58484649658203, "epoch": 0.2725235320484088, "grad_norm": 0.04633385377876623, "kl": 0.002849578857421875, "learning_rate": 8.276658647668501e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.01585849840193987, "rewards/accuracy_reward": 0.8816964626312256, "rewards/format_reward": 1.0, "step": 3496 }, { "completion_length": 430.6339454650879, "epoch": 0.27267943795919164, "grad_norm": 0.11039541953148593, "kl": 0.00311279296875, "learning_rate": 8.274808415005294e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.08650113269686699, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 3498 }, { "completion_length": 431.00224685668945, "epoch": 0.2728353438699745, "grad_norm": 0.04940134352266028, "kl": 0.0033168792724609375, "learning_rate": 8.272957396694606e-07, "loss": 0.0001, "reward": 1.8683036267757416, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8683035969734192, "rewards/format_reward": 1.0, "step": 3500 }, { "completion_length": 412.4977912902832, "epoch": 0.27299124978075734, "grad_norm": 0.005443883610335557, "kl": 0.0026502609252929688, "learning_rate": 8.271105593180507e-07, "loss": 0.0001, "reward": 1.9218750447034836, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.9218750223517418, "rewards/format_reward": 1.0, "step": 3502 }, { "completion_length": 424.2879638671875, "epoch": 0.27314715569154013, "grad_norm": 0.1252946228931576, "kl": 0.0031986236572265625, "learning_rate": 8.269253004907254e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.0876265587285161, "rewards/accuracy_reward": 0.7544643096625805, "rewards/format_reward": 1.0, "step": 3504 }, { "completion_length": 423.7745666503906, "epoch": 0.273303061602323, "grad_norm": 0.11755965536256971, "kl": 0.0032806396484375, "learning_rate": 8.267399632319299e-07, "loss": 0.0001, "reward": 1.723214328289032, "reward_std": 0.0668867640197277, "rewards/accuracy_reward": 0.7232143133878708, "rewards/format_reward": 1.0, "step": 3506 }, { "completion_length": 421.1875190734863, "epoch": 0.27345896751310583, "grad_norm": 0.0671809752388653, "kl": 0.0030879974365234375, "learning_rate": 8.265545475861275e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.04065725300461054, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 1.0, "step": 3508 }, { "completion_length": 437.6651916503906, "epoch": 0.2736148734238887, "grad_norm": 0.07836653909954605, "kl": 0.0034465789794921875, "learning_rate": 8.263690535978005e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 3510 }, { "completion_length": 430.2076072692871, "epoch": 0.27377077933467153, "grad_norm": 0.04574460385323812, "kl": 0.0029506683349609375, "learning_rate": 8.261834813114506e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.041786315850913525, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 3512 }, { "completion_length": 427.6227836608887, "epoch": 0.2739266852454544, "grad_norm": 0.10862245124005743, "kl": 0.0030498504638671875, "learning_rate": 8.259978307715972e-07, "loss": 0.0001, "reward": 1.7276786416769028, "reward_std": 0.0601168517023325, "rewards/accuracy_reward": 0.7276786118745804, "rewards/format_reward": 1.0, "step": 3514 }, { "completion_length": 424.07368087768555, "epoch": 0.27408259115623723, "grad_norm": 0.08845574382651367, "kl": 0.0025348663330078125, "learning_rate": 8.258121020227795e-07, "loss": 0.0001, "reward": 1.8861607611179352, "reward_std": 0.03937845956534147, "rewards/accuracy_reward": 0.8861607387661934, "rewards/format_reward": 1.0, "step": 3516 }, { "completion_length": 417.2477912902832, "epoch": 0.27423849706702, "grad_norm": 0.10084726059227217, "kl": 0.0029315948486328125, "learning_rate": 8.256262951095546e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.056212655268609524, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 3518 }, { "completion_length": 433.7210006713867, "epoch": 0.2743944029778029, "grad_norm": 0.048310423524765035, "kl": 0.003360748291015625, "learning_rate": 8.254404100764992e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 3520 }, { "completion_length": 412.2544860839844, "epoch": 0.2745503088885857, "grad_norm": 0.10015979557924275, "kl": 0.0028228759765625, "learning_rate": 8.25254446968208e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.055538651533424854, "rewards/accuracy_reward": 0.8571429029107094, "rewards/format_reward": 1.0, "step": 3522 }, { "completion_length": 423.02903747558594, "epoch": 0.2747062147993686, "grad_norm": 0.08050089196652381, "kl": 0.0029478073120117188, "learning_rate": 8.250684058292949e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.7924107387661934, "rewards/format_reward": 1.0, "step": 3524 }, { "completion_length": 432.7611770629883, "epoch": 0.2748621207101514, "grad_norm": 0.10946336911978251, "kl": 0.0031414031982421875, "learning_rate": 8.248822867043922e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 3526 }, { "completion_length": 434.06475830078125, "epoch": 0.2750180266209343, "grad_norm": 0.08872876705284052, "kl": 0.003276824951171875, "learning_rate": 8.246960896381511e-07, "loss": 0.0001, "reward": 1.8928572237491608, "reward_std": 0.05590956099331379, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 3528 }, { "completion_length": 429.5982322692871, "epoch": 0.2751739325317171, "grad_norm": 0.05857746007250609, "kl": 0.0029497146606445312, "learning_rate": 8.245098146752419e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.046667611226439476, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 1.0, "step": 3530 }, { "completion_length": 416.2477798461914, "epoch": 0.2753298384425, "grad_norm": 0.08964681008994956, "kl": 0.0027914047241210938, "learning_rate": 8.243234618603523e-07, "loss": 0.0001, "reward": 1.8928571939468384, "reward_std": 0.03757398948073387, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 3532 }, { "completion_length": 427.1919822692871, "epoch": 0.27548574435328277, "grad_norm": 0.057676433300938336, "kl": 0.002986907958984375, "learning_rate": 8.241370312381904e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 3534 }, { "completion_length": 423.7656478881836, "epoch": 0.2756416502640656, "grad_norm": 0.09293935096151776, "kl": 0.003032684326171875, "learning_rate": 8.239505228534816e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.0562126561999321, "rewards/accuracy_reward": 0.814732164144516, "rewards/format_reward": 1.0, "step": 3536 }, { "completion_length": 426.63394927978516, "epoch": 0.27579755617484847, "grad_norm": 0.09832513681532076, "kl": 0.00323486328125, "learning_rate": 8.237639367509704e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.055909561924636364, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 3538 }, { "completion_length": 424.5401954650879, "epoch": 0.2759534620856313, "grad_norm": 0.08725638073335105, "kl": 0.0028324127197265625, "learning_rate": 8.235772729754202e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.04764331039041281, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 3540 }, { "completion_length": 426.9085006713867, "epoch": 0.27610936799641417, "grad_norm": 0.12462270196999206, "kl": 0.003143310546875, "learning_rate": 8.23390531571613e-07, "loss": 0.0001, "reward": 1.7366072088479996, "reward_std": 0.08762655593454838, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 1.0, "step": 3542 }, { "completion_length": 430.32814025878906, "epoch": 0.276265273907197, "grad_norm": 0.12393462440505937, "kl": 0.002918243408203125, "learning_rate": 8.23203712584349e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.06395826861262321, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 3544 }, { "completion_length": 412.0692138671875, "epoch": 0.27642117981797987, "grad_norm": 0.10675273349438281, "kl": 0.0025959014892578125, "learning_rate": 8.230168160584472e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.07304685190320015, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 3546 }, { "completion_length": 422.49109268188477, "epoch": 0.2765770857287627, "grad_norm": 0.1254478398580817, "kl": 0.0029449462890625, "learning_rate": 8.228298420387456e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.07484852056950331, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 1.0, "step": 3548 }, { "completion_length": 421.033504486084, "epoch": 0.2767329916395455, "grad_norm": 0.08192915086417708, "kl": 0.0030012130737304688, "learning_rate": 8.226427905701002e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.05154890567064285, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 3550 }, { "completion_length": 419.3281364440918, "epoch": 0.27688889755032836, "grad_norm": 0.12654476441677073, "kl": 0.002872467041015625, "learning_rate": 8.224556616973859e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.0753013426437974, "rewards/accuracy_reward": 0.8303571939468384, "rewards/format_reward": 1.0, "step": 3552 }, { "completion_length": 433.8393020629883, "epoch": 0.2770448034611112, "grad_norm": 0.12058773149225098, "kl": 0.0031032562255859375, "learning_rate": 8.222684554654959e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 3554 }, { "completion_length": 440.7232322692871, "epoch": 0.27720070937189406, "grad_norm": 0.0842588119964498, "kl": 0.0031585693359375, "learning_rate": 8.220811719193427e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.07925283443182707, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 0.9955357313156128, "step": 3556 }, { "completion_length": 421.7053756713867, "epoch": 0.2773566152826769, "grad_norm": 0.0972109677501443, "kl": 0.002590179443359375, "learning_rate": 8.218938111038564e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.03547286428511143, "rewards/accuracy_reward": 0.8549107611179352, "rewards/format_reward": 1.0, "step": 3558 }, { "completion_length": 431.8080520629883, "epoch": 0.27751252119345976, "grad_norm": 0.0724075209539742, "kl": 0.003421783447265625, "learning_rate": 8.217063730639862e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.03111080639064312, "rewards/accuracy_reward": 0.790178619325161, "rewards/format_reward": 1.0, "step": 3560 }, { "completion_length": 420.12947845458984, "epoch": 0.2776684271042426, "grad_norm": 0.13183596954502091, "kl": 0.003002166748046875, "learning_rate": 8.215188578446997e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.05929451994597912, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 1.0, "step": 3562 }, { "completion_length": 420.8214530944824, "epoch": 0.27782433301502546, "grad_norm": 0.08054346059963204, "kl": 0.002716064453125, "learning_rate": 8.213312654909827e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.062155199237167835, "rewards/accuracy_reward": 0.7321428805589676, "rewards/format_reward": 1.0, "step": 3564 }, { "completion_length": 419.2991256713867, "epoch": 0.27798023892580825, "grad_norm": 0.07297913597180242, "kl": 0.00279998779296875, "learning_rate": 8.211435960478403e-07, "loss": 0.0001, "reward": 1.85714291036129, "reward_std": 0.04471481591463089, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 0.9977678656578064, "step": 3566 }, { "completion_length": 428.56922149658203, "epoch": 0.2781361448365911, "grad_norm": 0.11168428526488669, "kl": 0.0032291412353515625, "learning_rate": 8.209558495602952e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.07634485699236393, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 3568 }, { "completion_length": 430.8549346923828, "epoch": 0.27829205074737395, "grad_norm": 0.09831154259065074, "kl": 0.0029001235961914062, "learning_rate": 8.207680260733895e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8125000521540642, "rewards/format_reward": 1.0, "step": 3570 }, { "completion_length": 413.0223388671875, "epoch": 0.2784479566581568, "grad_norm": 0.07590813644496819, "kl": 0.0025663375854492188, "learning_rate": 8.205801256321825e-07, "loss": 0.0001, "reward": 1.8169643431901932, "reward_std": 0.05200396478176117, "rewards/accuracy_reward": 0.8169643357396126, "rewards/format_reward": 1.0, "step": 3572 }, { "completion_length": 420.8281440734863, "epoch": 0.27860386256893965, "grad_norm": 0.04862817806792351, "kl": 0.0027818679809570312, "learning_rate": 8.203921482817533e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.024124750867486, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 3574 }, { "completion_length": 430.4107322692871, "epoch": 0.2787597684797225, "grad_norm": 0.11516250648744251, "kl": 0.002849578857421875, "learning_rate": 8.202040940671989e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.05395676288753748, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 1.0, "step": 3576 }, { "completion_length": 423.36832427978516, "epoch": 0.27891567439050535, "grad_norm": 0.1109677118992716, "kl": 0.00325775146484375, "learning_rate": 8.200159630336341e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.06463087256997824, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 1.0, "step": 3578 }, { "completion_length": 414.4799346923828, "epoch": 0.27907158030128815, "grad_norm": 0.14714744433326168, "kl": 0.0030155181884765625, "learning_rate": 8.198277552261933e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.0972380181774497, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 3580 }, { "completion_length": 420.0848388671875, "epoch": 0.279227486212071, "grad_norm": 0.09782813035788639, "kl": 0.0029649734497070312, "learning_rate": 8.196394706900286e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.054932462982833385, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 1.0, "step": 3582 }, { "completion_length": 415.6272506713867, "epoch": 0.27938339212285385, "grad_norm": 0.06558920915939452, "kl": 0.0029926300048828125, "learning_rate": 8.194511094703108e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.048619008623063564, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 3584 }, { "completion_length": 426.23439025878906, "epoch": 0.2795392980336367, "grad_norm": 0.10058995099183224, "kl": 0.00299072265625, "learning_rate": 8.192626716122286e-07, "loss": 0.0001, "reward": 1.7388393878936768, "reward_std": 0.056212655268609524, "rewards/accuracy_reward": 0.7388393133878708, "rewards/format_reward": 1.0, "step": 3586 }, { "completion_length": 421.5312690734863, "epoch": 0.27969520394441955, "grad_norm": 0.12254856304752024, "kl": 0.0032939910888671875, "learning_rate": 8.190741571609896e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.10138393100351095, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 1.0, "step": 3588 }, { "completion_length": 420.57591247558594, "epoch": 0.2798511098552024, "grad_norm": 0.04722319945688439, "kl": 0.0029888153076171875, "learning_rate": 8.188855661618199e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 3590 }, { "completion_length": 422.63171768188477, "epoch": 0.28000701576598525, "grad_norm": 0.12433677983404734, "kl": 0.00325775146484375, "learning_rate": 8.186968986599634e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.06463087256997824, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 3592 }, { "completion_length": 430.6294822692871, "epoch": 0.2801629216767681, "grad_norm": 0.05764512259995956, "kl": 0.0028934478759765625, "learning_rate": 8.185081547006827e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.061549010686576366, "rewards/accuracy_reward": 0.895089328289032, "rewards/format_reward": 0.9977678656578064, "step": 3594 }, { "completion_length": 420.71430587768555, "epoch": 0.2803188275875509, "grad_norm": 0.12211514648610189, "kl": 0.0030603408813476562, "learning_rate": 8.183193343292587e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.06447890680283308, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 3596 }, { "completion_length": 419.6919860839844, "epoch": 0.28047473349833374, "grad_norm": 0.14402266547985915, "kl": 0.0033779144287109375, "learning_rate": 8.181304375909906e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.05553865246474743, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 1.0, "step": 3598 }, { "completion_length": 430.7835006713867, "epoch": 0.2806306394091166, "grad_norm": 0.09510279774826702, "kl": 0.002925872802734375, "learning_rate": 8.17941464531196e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 3600 }, { "completion_length": 423.8817138671875, "epoch": 0.28078654531989944, "grad_norm": 0.0746177436167661, "kl": 0.0034122467041015625, "learning_rate": 8.177524151952105e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.05523555725812912, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 3602 }, { "completion_length": 422.6696586608887, "epoch": 0.2809424512306823, "grad_norm": 0.1018440225392621, "kl": 0.0030765533447265625, "learning_rate": 8.175632896283886e-07, "loss": 0.0001, "reward": 1.7566965073347092, "reward_std": 0.07484992407262325, "rewards/accuracy_reward": 0.7566964775323868, "rewards/format_reward": 1.0, "step": 3604 }, { "completion_length": 429.60939025878906, "epoch": 0.28109835714146514, "grad_norm": 0.1251700657613075, "kl": 0.003231048583984375, "learning_rate": 8.173740878761026e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.1107644746080041, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 0.9977678656578064, "step": 3606 }, { "completion_length": 424.97546005249023, "epoch": 0.281254263052248, "grad_norm": 0.19110807250314835, "kl": 0.0033893585205078125, "learning_rate": 8.17184809983743e-07, "loss": 0.0001, "reward": 1.7745536267757416, "reward_std": 0.08973272144794464, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 3608 }, { "completion_length": 411.2611770629883, "epoch": 0.28141016896303084, "grad_norm": 0.08224260417746906, "kl": 0.0029897689819335938, "learning_rate": 8.169954559967191e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 3610 }, { "completion_length": 422.20983505249023, "epoch": 0.28156607487381363, "grad_norm": 0.07808731905283142, "kl": 0.0027370452880859375, "learning_rate": 8.168060259604581e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 3612 }, { "completion_length": 416.01564025878906, "epoch": 0.2817219807845965, "grad_norm": 0.005960103210404508, "kl": 0.0029859542846679688, "learning_rate": 8.166165199204052e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.028485404327511787, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 3614 }, { "completion_length": 427.03796768188477, "epoch": 0.28187788669537933, "grad_norm": 0.11565397486064453, "kl": 0.0033054351806640625, "learning_rate": 8.164269379220243e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.061247317120432854, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 3616 }, { "completion_length": 419.9464454650879, "epoch": 0.2820337926061622, "grad_norm": 0.12251098548610243, "kl": 0.003101348876953125, "learning_rate": 8.162372800107975e-07, "loss": 0.0001, "reward": 1.7946429550647736, "reward_std": 0.09799757041037083, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 3618 }, { "completion_length": 412.1964454650879, "epoch": 0.28218969851694503, "grad_norm": 0.12262249639147586, "kl": 0.002964019775390625, "learning_rate": 8.160475462322245e-07, "loss": 0.0001, "reward": 1.7611607760190964, "reward_std": 0.04666761215776205, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 3620 }, { "completion_length": 418.5468940734863, "epoch": 0.2823456044277279, "grad_norm": 0.0890835528882367, "kl": 0.0031986236572265625, "learning_rate": 8.15857736631824e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.06508733332157135, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 3622 }, { "completion_length": 417.3214416503906, "epoch": 0.28250151033851073, "grad_norm": 0.0038211796618710715, "kl": 0.0028209686279296875, "learning_rate": 8.156678512551325e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.057188354432582855, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 3624 }, { "completion_length": 414.1852836608887, "epoch": 0.2826574162492935, "grad_norm": 0.09643022616454883, "kl": 0.0027256011962890625, "learning_rate": 8.154778901477047e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.04959610756486654, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 3626 }, { "completion_length": 426.9218940734863, "epoch": 0.2828133221600764, "grad_norm": 0.10284439658579368, "kl": 0.0034923553466796875, "learning_rate": 8.152878533551133e-07, "loss": 0.0001, "reward": 1.7589286714792252, "reward_std": 0.07088695280253887, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 0.9955357313156128, "step": 3628 }, { "completion_length": 417.5468940734863, "epoch": 0.2829692280708592, "grad_norm": 0.08967551910092977, "kl": 0.0027027130126953125, "learning_rate": 8.150977409229494e-07, "loss": 0.0001, "reward": 1.8526786714792252, "reward_std": 0.061702377162873745, "rewards/accuracy_reward": 0.8549107387661934, "rewards/format_reward": 0.9977678656578064, "step": 3630 }, { "completion_length": 407.62724685668945, "epoch": 0.2831251339816421, "grad_norm": 0.12321260485468735, "kl": 0.003276824951171875, "learning_rate": 8.149075528968225e-07, "loss": 0.0001, "reward": 1.7633929252624512, "reward_std": 0.06560797244310379, "rewards/accuracy_reward": 0.7633928805589676, "rewards/format_reward": 1.0, "step": 3632 }, { "completion_length": 422.99555587768555, "epoch": 0.2832810398924249, "grad_norm": 0.06942520367600173, "kl": 0.0025119781494140625, "learning_rate": 8.147172893223595e-07, "loss": 0.0001, "reward": 1.8660714775323868, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 1.0, "step": 3634 }, { "completion_length": 417.40403747558594, "epoch": 0.2834369458032078, "grad_norm": 0.07752460194977856, "kl": 0.0030536651611328125, "learning_rate": 8.145269502452061e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.04035275708884001, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 3636 }, { "completion_length": 424.9732360839844, "epoch": 0.2835928517139906, "grad_norm": 0.08359395591834472, "kl": 0.0032939910888671875, "learning_rate": 8.143365357110259e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.056667715311050415, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 3638 }, { "completion_length": 424.0982322692871, "epoch": 0.2837487576247735, "grad_norm": 0.08209790766900063, "kl": 0.003063201904296875, "learning_rate": 8.141460457655005e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 3640 }, { "completion_length": 416.06474685668945, "epoch": 0.28390466353555627, "grad_norm": 0.11880597263183845, "kl": 0.00345611572265625, "learning_rate": 8.139554804543294e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.07350331265479326, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 3642 }, { "completion_length": 416.5044746398926, "epoch": 0.2840605694463391, "grad_norm": 0.11217643517071844, "kl": 0.0034389495849609375, "learning_rate": 8.137648398232309e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.0973249701783061, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 3644 }, { "completion_length": 410.5736846923828, "epoch": 0.28421647535712197, "grad_norm": 0.09101651198715362, "kl": 0.0029392242431640625, "learning_rate": 8.135741239179407e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.05929451808333397, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 3646 }, { "completion_length": 410.7388572692871, "epoch": 0.2843723812679048, "grad_norm": 0.11674107642047218, "kl": 0.0028133392333984375, "learning_rate": 8.133833327842125e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.05981375649571419, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 3648 }, { "completion_length": 432.33484649658203, "epoch": 0.28452828717868767, "grad_norm": 0.07504582741900681, "kl": 0.003337860107421875, "learning_rate": 8.131924664678187e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 3650 }, { "completion_length": 429.7634162902832, "epoch": 0.2846841930894705, "grad_norm": 0.06728855070773891, "kl": 0.0032196044921875, "learning_rate": 8.130015250145492e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.05328276101499796, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 3652 }, { "completion_length": 415.2210006713867, "epoch": 0.28484009900025337, "grad_norm": 0.0040446294289632045, "kl": 0.0028963088989257812, "learning_rate": 8.12810508470212e-07, "loss": 0.0001, "reward": 1.7388393431901932, "reward_std": 0.02186885755509138, "rewards/accuracy_reward": 0.738839328289032, "rewards/format_reward": 1.0, "step": 3654 }, { "completion_length": 423.19421768188477, "epoch": 0.2849960049110362, "grad_norm": 0.04693754148268511, "kl": 0.0027513504028320312, "learning_rate": 8.126194168806332e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8973214477300644, "rewards/format_reward": 1.0, "step": 3656 }, { "completion_length": 426.14734268188477, "epoch": 0.285151910821819, "grad_norm": 0.003424987043420432, "kl": 0.002994537353515625, "learning_rate": 8.124282502916572e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.04794640652835369, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 3658 }, { "completion_length": 423.58707427978516, "epoch": 0.28530781673260186, "grad_norm": 0.07253681704919579, "kl": 0.002838134765625, "learning_rate": 8.122370087491457e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8883928805589676, "rewards/format_reward": 1.0, "step": 3660 }, { "completion_length": 421.58930587768555, "epoch": 0.2854637226433847, "grad_norm": 0.0033388418993763073, "kl": 0.0026378631591796875, "learning_rate": 8.12045692298979e-07, "loss": 0.0001, "reward": 1.8549107611179352, "reward_std": 0.04537404701113701, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 0.9977678656578064, "step": 3662 }, { "completion_length": 425.5781478881836, "epoch": 0.28561962855416756, "grad_norm": 0.08775527834430819, "kl": 0.003177642822265625, "learning_rate": 8.118543009870548e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.057491449639201164, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 3664 }, { "completion_length": 426.18305587768555, "epoch": 0.2857755344649504, "grad_norm": 0.0853942754858082, "kl": 0.0031642913818359375, "learning_rate": 8.116628348592898e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.05493246205151081, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 3666 }, { "completion_length": 419.486629486084, "epoch": 0.28593144037573326, "grad_norm": 0.0606821634521368, "kl": 0.0033130645751953125, "learning_rate": 8.114712939616173e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.7544643059372902, "rewards/format_reward": 1.0, "step": 3668 }, { "completion_length": 430.439754486084, "epoch": 0.2860873462865161, "grad_norm": 0.08124650666618259, "kl": 0.0030574798583984375, "learning_rate": 8.112796783399891e-07, "loss": 0.0001, "reward": 1.7812501043081284, "reward_std": 0.06853646971285343, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 3670 }, { "completion_length": 415.77680587768555, "epoch": 0.2862432521972989, "grad_norm": 0.0676074190993869, "kl": 0.002685546875, "learning_rate": 8.110879880403757e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.06575770024210215, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 3672 }, { "completion_length": 409.6339416503906, "epoch": 0.28639915810808175, "grad_norm": 0.07407431105026609, "kl": 0.002887725830078125, "learning_rate": 8.108962231087642e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.06252470798790455, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 3674 }, { "completion_length": 417.97993087768555, "epoch": 0.2865550640188646, "grad_norm": 0.09658537982985635, "kl": 0.0027856826782226562, "learning_rate": 8.107043835911606e-07, "loss": 0.0001, "reward": 1.7165179252624512, "reward_std": 0.06786246132105589, "rewards/accuracy_reward": 0.7165178954601288, "rewards/format_reward": 1.0, "step": 3676 }, { "completion_length": 433.60269927978516, "epoch": 0.28671096992964745, "grad_norm": 0.08562301618478556, "kl": 0.002933502197265625, "learning_rate": 8.105124695335883e-07, "loss": 0.0001, "reward": 1.6986607909202576, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.6986607536673546, "rewards/format_reward": 1.0, "step": 3678 }, { "completion_length": 429.63618087768555, "epoch": 0.2868668758404303, "grad_norm": 0.08816833389870536, "kl": 0.0030317306518554688, "learning_rate": 8.103204809820886e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.043739115819334984, "rewards/accuracy_reward": 0.783482164144516, "rewards/format_reward": 1.0, "step": 3680 }, { "completion_length": 414.12278747558594, "epoch": 0.28702278175121315, "grad_norm": 0.07708933082001082, "kl": 0.0030651092529296875, "learning_rate": 8.10128417982721e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.734375037252903, "rewards/format_reward": 1.0, "step": 3682 }, { "completion_length": 431.6674346923828, "epoch": 0.287178687661996, "grad_norm": 0.10409512982927985, "kl": 0.003810882568359375, "learning_rate": 8.099362805815624e-07, "loss": 0.0002, "reward": 1.6741072237491608, "reward_std": 0.07124742120504379, "rewards/accuracy_reward": 0.6741071678698063, "rewards/format_reward": 1.0, "step": 3684 }, { "completion_length": 408.7745666503906, "epoch": 0.28733459357277885, "grad_norm": 0.08334971237865639, "kl": 0.0027074813842773438, "learning_rate": 8.09744068824708e-07, "loss": 0.0001, "reward": 1.8348215371370316, "reward_std": 0.07207115273922682, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 3686 }, { "completion_length": 424.7477836608887, "epoch": 0.28749049948356165, "grad_norm": 0.08704904185582141, "kl": 0.003322601318359375, "learning_rate": 8.095517827582705e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.049378564581274986, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 3688 }, { "completion_length": 432.91519927978516, "epoch": 0.2876464053943445, "grad_norm": 0.07999685240829711, "kl": 0.0032806396484375, "learning_rate": 8.093594224283806e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.07664795126765966, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 3690 }, { "completion_length": 427.6518020629883, "epoch": 0.28780231130512735, "grad_norm": 0.09106459902118928, "kl": 0.0031948089599609375, "learning_rate": 8.091669878811869e-07, "loss": 0.0001, "reward": 1.752232238650322, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 3692 }, { "completion_length": 427.6875190734863, "epoch": 0.2879582172159102, "grad_norm": 0.004217297906744009, "kl": 0.0034503936767578125, "learning_rate": 8.089744791628554e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.04419053718447685, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 3694 }, { "completion_length": 429.3861770629883, "epoch": 0.28811412312669304, "grad_norm": 0.0038479057037580375, "kl": 0.0029964447021484375, "learning_rate": 8.087818963195705e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.02720661275088787, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 3696 }, { "completion_length": 408.0647506713867, "epoch": 0.2882700290374759, "grad_norm": 0.09807390151954194, "kl": 0.0026750564575195312, "learning_rate": 8.085892393975337e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.07079236209392548, "rewards/accuracy_reward": 0.830357164144516, "rewards/format_reward": 1.0, "step": 3698 }, { "completion_length": 425.00672149658203, "epoch": 0.28842593494825874, "grad_norm": 0.08279681147222209, "kl": 0.0030956268310546875, "learning_rate": 8.083965084429649e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 0.9977678656578064, "step": 3700 }, { "completion_length": 413.7567138671875, "epoch": 0.2885818408590416, "grad_norm": 0.06166874936679049, "kl": 0.00286865234375, "learning_rate": 8.082037035021015e-07, "loss": 0.0001, "reward": 1.895089328289032, "reward_std": 0.024124749936163425, "rewards/accuracy_reward": 0.895089328289032, "rewards/format_reward": 1.0, "step": 3702 }, { "completion_length": 420.8080596923828, "epoch": 0.2887377467698244, "grad_norm": 0.09269335812218867, "kl": 0.0032291412353515625, "learning_rate": 8.080108246211984e-07, "loss": 0.0001, "reward": 1.7254465073347092, "reward_std": 0.052153694443404675, "rewards/accuracy_reward": 0.725446455180645, "rewards/format_reward": 1.0, "step": 3704 }, { "completion_length": 439.26118087768555, "epoch": 0.28889365268060724, "grad_norm": 0.10911802242751474, "kl": 0.0030651092529296875, "learning_rate": 8.078178718465286e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.7924107313156128, "rewards/format_reward": 1.0, "step": 3706 }, { "completion_length": 427.0870704650879, "epoch": 0.2890495585913901, "grad_norm": 0.07855401093409882, "kl": 0.002994537353515625, "learning_rate": 8.076248452243828e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.054932462982833385, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 3708 }, { "completion_length": 437.5647506713867, "epoch": 0.28920546450217294, "grad_norm": 0.06388622343567003, "kl": 0.0031909942626953125, "learning_rate": 8.074317448010693e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.05666771624237299, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 3710 }, { "completion_length": 412.97546768188477, "epoch": 0.2893613704129558, "grad_norm": 0.0722338231811358, "kl": 0.0025997161865234375, "learning_rate": 8.072385706229139e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 3712 }, { "completion_length": 433.6718940734863, "epoch": 0.28951727632373864, "grad_norm": 0.06663415484388968, "kl": 0.0029163360595703125, "learning_rate": 8.070453227362606e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 0.9977678656578064, "step": 3714 }, { "completion_length": 426.29243087768555, "epoch": 0.2896731822345215, "grad_norm": 0.08074129669619166, "kl": 0.0030584335327148438, "learning_rate": 8.068520011874706e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 3716 }, { "completion_length": 417.51118087768555, "epoch": 0.2898290881453043, "grad_norm": 0.07669201904821064, "kl": 0.0027179718017578125, "learning_rate": 8.066586060229233e-07, "loss": 0.0001, "reward": 1.861607238650322, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 3718 }, { "completion_length": 422.1919822692871, "epoch": 0.28998499405608713, "grad_norm": 0.09564230909759765, "kl": 0.0032672882080078125, "learning_rate": 8.06465137289015e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.055235556326806545, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 3720 }, { "completion_length": 426.35716247558594, "epoch": 0.29014089996687, "grad_norm": 0.04467232376396484, "kl": 0.004032135009765625, "learning_rate": 8.062715950321604e-07, "loss": 0.0002, "reward": 1.8638393729925156, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.863839328289032, "rewards/format_reward": 1.0, "step": 3722 }, { "completion_length": 423.78126525878906, "epoch": 0.29029680587765283, "grad_norm": 0.10363318240866934, "kl": 0.0031070709228515625, "learning_rate": 8.060779792987912e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.08875562343746424, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 3724 }, { "completion_length": 427.4955596923828, "epoch": 0.2904527117884357, "grad_norm": 0.10356217401961113, "kl": 0.0034389495849609375, "learning_rate": 8.058842901353575e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.08680282346904278, "rewards/accuracy_reward": 0.8705357611179352, "rewards/format_reward": 0.9955357313156128, "step": 3726 }, { "completion_length": 424.33930587768555, "epoch": 0.29060861769921853, "grad_norm": 0.1084449285685884, "kl": 0.0032444000244140625, "learning_rate": 8.056905275883263e-07, "loss": 0.0001, "reward": 1.7790179550647736, "reward_std": 0.05764481518417597, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 0.9977678656578064, "step": 3728 }, { "completion_length": 422.69421768188477, "epoch": 0.2907645236100014, "grad_norm": 0.0803026781128282, "kl": 0.002956390380859375, "learning_rate": 8.054966917041826e-07, "loss": 0.0001, "reward": 1.8526786267757416, "reward_std": 0.05523695982992649, "rewards/accuracy_reward": 0.8526785895228386, "rewards/format_reward": 1.0, "step": 3730 }, { "completion_length": 435.6250228881836, "epoch": 0.29092042952078423, "grad_norm": 0.09982207182779988, "kl": 0.003170013427734375, "learning_rate": 8.053027825294285e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.0669348556548357, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 0.9977678656578064, "step": 3732 }, { "completion_length": 419.1272506713867, "epoch": 0.291076335431567, "grad_norm": 0.04782193578046638, "kl": 0.0029573440551757812, "learning_rate": 8.051088001105846e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 3734 }, { "completion_length": 431.1741256713867, "epoch": 0.2912322413423499, "grad_norm": 0.11246977003796078, "kl": 0.0031604766845703125, "learning_rate": 8.049147444941882e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.07484992314130068, "rewards/accuracy_reward": 0.805803619325161, "rewards/format_reward": 1.0, "step": 3736 }, { "completion_length": 411.79689025878906, "epoch": 0.2913881472531327, "grad_norm": 0.1039642839256755, "kl": 0.0029344558715820312, "learning_rate": 8.047206157267945e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.055236958898603916, "rewards/accuracy_reward": 0.8526785969734192, "rewards/format_reward": 1.0, "step": 3738 }, { "completion_length": 432.7879638671875, "epoch": 0.2915440531639156, "grad_norm": 0.10110050991458913, "kl": 0.00295257568359375, "learning_rate": 8.045264138549762e-07, "loss": 0.0001, "reward": 1.9174107760190964, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.9174107685685158, "rewards/format_reward": 1.0, "step": 3740 }, { "completion_length": 421.3817138671875, "epoch": 0.2916999590746984, "grad_norm": 0.08847398955169286, "kl": 0.0031528472900390625, "learning_rate": 8.043321389253236e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.05734171997755766, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 3742 }, { "completion_length": 431.97322845458984, "epoch": 0.2918558649854813, "grad_norm": 0.0818977055969822, "kl": 0.003284454345703125, "learning_rate": 8.041377909844449e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 1.0, "step": 3744 }, { "completion_length": 430.6250228881836, "epoch": 0.2920117708962641, "grad_norm": 0.04685353973746242, "kl": 0.002834320068359375, "learning_rate": 8.039433700789648e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 3746 }, { "completion_length": 420.8460006713867, "epoch": 0.29216767680704697, "grad_norm": 0.08684489702662185, "kl": 0.002899169921875, "learning_rate": 8.037488762555263e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.05328415986150503, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 3748 }, { "completion_length": 427.38618087768555, "epoch": 0.29232358271782977, "grad_norm": 0.10519970964631321, "kl": 0.0030765533447265625, "learning_rate": 8.0355430956079e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.06380490399897099, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 3750 }, { "completion_length": 408.08260345458984, "epoch": 0.2924794886286126, "grad_norm": 0.05149400027811908, "kl": 0.0023860931396484375, "learning_rate": 8.033596700414334e-07, "loss": 0.0001, "reward": 1.8504464775323868, "reward_std": 0.056514350697398186, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 3752 }, { "completion_length": 438.07591247558594, "epoch": 0.29263539453939547, "grad_norm": 0.0407431424951292, "kl": 0.003330230712890625, "learning_rate": 8.031649577441517e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.783482164144516, "rewards/format_reward": 1.0, "step": 3754 }, { "completion_length": 422.0647506713867, "epoch": 0.2927913004501783, "grad_norm": 0.046010638197281704, "kl": 0.0027227401733398438, "learning_rate": 8.029701727156579e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.04306510929018259, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 3756 }, { "completion_length": 420.60716247558594, "epoch": 0.29294720636096117, "grad_norm": 0.15803880285642632, "kl": 0.0036640167236328125, "learning_rate": 8.027753150026821e-07, "loss": 0.0001, "reward": 1.709821492433548, "reward_std": 0.10266272630542517, "rewards/accuracy_reward": 0.7120536006987095, "rewards/format_reward": 0.9977678656578064, "step": 3758 }, { "completion_length": 408.06697845458984, "epoch": 0.293103112271744, "grad_norm": 0.06907735332627556, "kl": 0.0030307769775390625, "learning_rate": 8.025803846519718e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.061397045850753784, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 3760 }, { "completion_length": 416.1562728881836, "epoch": 0.29325901818252686, "grad_norm": 0.11355287512031975, "kl": 0.00286865234375, "learning_rate": 8.023853817102921e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.08003291115164757, "rewards/accuracy_reward": 0.7968750335276127, "rewards/format_reward": 0.9977678656578064, "step": 3762 }, { "completion_length": 423.78796768188477, "epoch": 0.2934149240933097, "grad_norm": 0.1039560196129944, "kl": 0.003025054931640625, "learning_rate": 8.021903062244254e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.05734172184020281, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 3764 }, { "completion_length": 415.5245666503906, "epoch": 0.2935708300040925, "grad_norm": 0.07467722941869022, "kl": 0.002933502197265625, "learning_rate": 8.019951582411717e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.05621125642210245, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 3766 }, { "completion_length": 440.8482322692871, "epoch": 0.29372673591487536, "grad_norm": 0.049816020924886045, "kl": 0.0032329559326171875, "learning_rate": 8.017999378073481e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.048759003169834614, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 0.9977678656578064, "step": 3768 }, { "completion_length": 424.7991256713867, "epoch": 0.2938826418256582, "grad_norm": 0.10045751793207339, "kl": 0.0035762786865234375, "learning_rate": 8.016046449697892e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.04065725393593311, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 3770 }, { "completion_length": 421.49332427978516, "epoch": 0.29403854773644106, "grad_norm": 0.11414782098778144, "kl": 0.0036029815673828125, "learning_rate": 8.014092797753472e-07, "loss": 0.0001, "reward": 1.9062500596046448, "reward_std": 0.05137945245951414, "rewards/accuracy_reward": 0.9084821864962578, "rewards/format_reward": 0.9977678656578064, "step": 3772 }, { "completion_length": 427.7343940734863, "epoch": 0.2941944536472239, "grad_norm": 0.07038555218133778, "kl": 0.0030574798583984375, "learning_rate": 8.012138422708913e-07, "loss": 0.0001, "reward": 1.8169643878936768, "reward_std": 0.04358434583991766, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 3774 }, { "completion_length": 419.4308204650879, "epoch": 0.29435035955800676, "grad_norm": 0.08592249891629322, "kl": 0.0032596588134765625, "learning_rate": 8.010183325033083e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.04696930479258299, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 1.0, "step": 3776 }, { "completion_length": 436.15626525878906, "epoch": 0.2945062654687896, "grad_norm": 0.05636402068269715, "kl": 0.0029401779174804688, "learning_rate": 8.008227505195022e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 3778 }, { "completion_length": 437.5513572692871, "epoch": 0.2946621713795724, "grad_norm": 0.11157088834694867, "kl": 0.003429412841796875, "learning_rate": 8.006270963663943e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.07124741934239864, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 3780 }, { "completion_length": 421.79243087768555, "epoch": 0.29481807729035525, "grad_norm": 0.09117011669289941, "kl": 0.00301361083984375, "learning_rate": 8.004313700909233e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 3782 }, { "completion_length": 419.56251525878906, "epoch": 0.2949739832011381, "grad_norm": 0.09164911974947007, "kl": 0.003192901611328125, "learning_rate": 8.002355717400453e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 3784 }, { "completion_length": 438.7567138671875, "epoch": 0.29512988911192095, "grad_norm": 0.08925971983947852, "kl": 0.003238677978515625, "learning_rate": 8.000397013607336e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.039833519607782364, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 3786 }, { "completion_length": 438.1339530944824, "epoch": 0.2952857950227038, "grad_norm": 0.07716199016231803, "kl": 0.003330230712890625, "learning_rate": 7.998437589999785e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.7812500521540642, "rewards/format_reward": 1.0, "step": 3788 }, { "completion_length": 414.30135345458984, "epoch": 0.29544170093348665, "grad_norm": 0.07175493651071727, "kl": 0.0026416778564453125, "learning_rate": 7.996477447047882e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 3790 }, { "completion_length": 421.5692138671875, "epoch": 0.2955976068442695, "grad_norm": 0.08559989113480657, "kl": 0.002910614013671875, "learning_rate": 7.994516585221875e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 1.0, "step": 3792 }, { "completion_length": 444.6830520629883, "epoch": 0.29575351275505235, "grad_norm": 0.1588354974869482, "kl": 0.003871917724609375, "learning_rate": 7.992555004992185e-07, "loss": 0.0002, "reward": 1.765625074505806, "reward_std": 0.1021193116903305, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 0.9977678656578064, "step": 3794 }, { "completion_length": 420.85716247558594, "epoch": 0.29590941866583514, "grad_norm": 0.1401435751128343, "kl": 0.0031147003173828125, "learning_rate": 7.990592706829416e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.06365517526865005, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 3796 }, { "completion_length": 438.57591247558594, "epoch": 0.296065324576618, "grad_norm": 0.0944038800201022, "kl": 0.0032482147216796875, "learning_rate": 7.988629691204328e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.06755936797708273, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 3798 }, { "completion_length": 415.8593864440918, "epoch": 0.29622123048740084, "grad_norm": 0.06291961964690974, "kl": 0.002582550048828125, "learning_rate": 7.986665958587862e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 1.0, "step": 3800 }, { "completion_length": 421.517879486084, "epoch": 0.2963771363981837, "grad_norm": 0.09505697961464524, "kl": 0.0034999847412109375, "learning_rate": 7.984701509451133e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.05929451808333397, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 3802 }, { "completion_length": 402.6093940734863, "epoch": 0.29653304230896654, "grad_norm": 0.08484197005842685, "kl": 0.00260162353515625, "learning_rate": 7.982736344265424e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8437500447034836, "rewards/format_reward": 1.0, "step": 3804 }, { "completion_length": 423.971004486084, "epoch": 0.2966889482197494, "grad_norm": 0.1110157309345708, "kl": 0.0034656524658203125, "learning_rate": 7.980770463502189e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 3806 }, { "completion_length": 424.2544822692871, "epoch": 0.29684485413053224, "grad_norm": 0.08210207062667373, "kl": 0.00341796875, "learning_rate": 7.978803867633057e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.04989780019968748, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 3808 }, { "completion_length": 410.35716247558594, "epoch": 0.2970007600413151, "grad_norm": 0.09412391681322202, "kl": 0.0029077529907226562, "learning_rate": 7.976836557129827e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.05133136082440615, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 3810 }, { "completion_length": 423.2656440734863, "epoch": 0.2971566659520979, "grad_norm": 0.07082488890961906, "kl": 0.0030918121337890625, "learning_rate": 7.974868532464468e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.02772584930062294, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 3812 }, { "completion_length": 425.6160888671875, "epoch": 0.29731257186288074, "grad_norm": 0.08868809056244296, "kl": 0.0033283233642578125, "learning_rate": 7.972899794109123e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.049734702333807945, "rewards/accuracy_reward": 0.8660714775323868, "rewards/format_reward": 0.9977678656578064, "step": 3814 }, { "completion_length": 429.04019927978516, "epoch": 0.2974684777736636, "grad_norm": 0.10302948135448387, "kl": 0.0036029815673828125, "learning_rate": 7.970930342536104e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.05035426188260317, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 3816 }, { "completion_length": 434.5134048461914, "epoch": 0.29762438368444644, "grad_norm": 0.11121310870120212, "kl": 0.003131866455078125, "learning_rate": 7.968960178217896e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.05493386462330818, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 3818 }, { "completion_length": 424.85493087768555, "epoch": 0.2977802895952293, "grad_norm": 0.06956183657521105, "kl": 0.003055572509765625, "learning_rate": 7.966989301627154e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8727679029107094, "rewards/format_reward": 1.0, "step": 3820 }, { "completion_length": 421.6339454650879, "epoch": 0.29793619550601214, "grad_norm": 0.09703211822683935, "kl": 0.003360748291015625, "learning_rate": 7.965017713236703e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8593750298023224, "rewards/format_reward": 1.0, "step": 3822 }, { "completion_length": 428.19868087768555, "epoch": 0.298092101416795, "grad_norm": 0.06673228861211614, "kl": 0.00342559814453125, "learning_rate": 7.96304541351954e-07, "loss": 0.0001, "reward": 1.6808036416769028, "reward_std": 0.050051167607307434, "rewards/accuracy_reward": 0.6808035969734192, "rewards/format_reward": 1.0, "step": 3824 }, { "completion_length": 426.1026916503906, "epoch": 0.2982480073275778, "grad_norm": 0.07766849676567407, "kl": 0.00348663330078125, "learning_rate": 7.961072402948832e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 3826 }, { "completion_length": 420.65403747558594, "epoch": 0.29840391323836063, "grad_norm": 0.0754821226643352, "kl": 0.00301361083984375, "learning_rate": 7.959098681997918e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.04035415779799223, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 3828 }, { "completion_length": 433.40849685668945, "epoch": 0.2985598191491435, "grad_norm": 0.06719629641818485, "kl": 0.00305938720703125, "learning_rate": 7.957124251140306e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.03840136155486107, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 3830 }, { "completion_length": 416.3951072692871, "epoch": 0.29871572505992633, "grad_norm": 0.09551975219687442, "kl": 0.00311279296875, "learning_rate": 7.955149110849673e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.04434390366077423, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 3832 }, { "completion_length": 417.636173248291, "epoch": 0.2988716309707092, "grad_norm": 0.10419230478360879, "kl": 0.0032806396484375, "learning_rate": 7.953173261599869e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.0843949681147933, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 3834 }, { "completion_length": 432.7723388671875, "epoch": 0.29902753688149203, "grad_norm": 0.10202135692039674, "kl": 0.0034923553466796875, "learning_rate": 7.951196703864913e-07, "loss": 0.0001, "reward": 1.7566964775323868, "reward_std": 0.0714663676917553, "rewards/accuracy_reward": 0.7566964626312256, "rewards/format_reward": 1.0, "step": 3836 }, { "completion_length": 414.5335006713867, "epoch": 0.2991834427922749, "grad_norm": 0.14600742668220754, "kl": 0.002750396728515625, "learning_rate": 7.949219438118994e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.061702377162873745, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 3838 }, { "completion_length": 423.06028747558594, "epoch": 0.29933934870305773, "grad_norm": 0.0691113695447265, "kl": 0.00286102294921875, "learning_rate": 7.947241464836469e-07, "loss": 0.0001, "reward": 1.8236607611179352, "reward_std": 0.06478060036897659, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 3840 }, { "completion_length": 428.13618087768555, "epoch": 0.2994952546138405, "grad_norm": 0.08966199721486746, "kl": 0.003124237060546875, "learning_rate": 7.94526278449187e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.06515011005103588, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 3842 }, { "completion_length": 415.8951072692871, "epoch": 0.2996511605246234, "grad_norm": 0.08943241898998529, "kl": 0.003093719482421875, "learning_rate": 7.943283397559891e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.036751655861735344, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 3844 }, { "completion_length": 441.6093940734863, "epoch": 0.2998070664354062, "grad_norm": 0.15063617821185196, "kl": 0.003742218017578125, "learning_rate": 7.941303304515402e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.09995037131011486, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 3846 }, { "completion_length": 409.36609268188477, "epoch": 0.29996297234618907, "grad_norm": 0.08616988565167423, "kl": 0.002960205078125, "learning_rate": 7.939322505833439e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.03675165772438049, "rewards/accuracy_reward": 0.7879464402794838, "rewards/format_reward": 1.0, "step": 3848 }, { "completion_length": 417.7678756713867, "epoch": 0.3001188782569719, "grad_norm": 0.1150373194169386, "kl": 0.0028848648071289062, "learning_rate": 7.93734100198921e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.05200536735355854, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 3850 }, { "completion_length": 404.9977836608887, "epoch": 0.30027478416775477, "grad_norm": 0.0035915809490539557, "kl": 0.002620697021484375, "learning_rate": 7.935358793458087e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.783482164144516, "rewards/format_reward": 1.0, "step": 3852 }, { "completion_length": 415.63171005249023, "epoch": 0.3004306900785376, "grad_norm": 0.09315870420574905, "kl": 0.0029430389404296875, "learning_rate": 7.93337588071562e-07, "loss": 0.0001, "reward": 1.875000074505806, "reward_std": 0.045993607491254807, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 3854 }, { "completion_length": 418.9375190734863, "epoch": 0.30058659598932047, "grad_norm": 0.09659286709455192, "kl": 0.0029697418212890625, "learning_rate": 7.931392264237515e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.05441322363913059, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 0.9977678656578064, "step": 3856 }, { "completion_length": 427.2678756713867, "epoch": 0.30074250190010327, "grad_norm": 0.0531947093952768, "kl": 0.0027599334716796875, "learning_rate": 7.929407944499661e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 3858 }, { "completion_length": 437.40180587768555, "epoch": 0.3008984078108861, "grad_norm": 0.0938316847688083, "kl": 0.0031795501708984375, "learning_rate": 7.927422921978106e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.0472723999992013, "rewards/accuracy_reward": 0.790178619325161, "rewards/format_reward": 1.0, "step": 3860 }, { "completion_length": 425.3415336608887, "epoch": 0.30105431372166896, "grad_norm": 0.09286327759750884, "kl": 0.003398895263671875, "learning_rate": 7.925437197149069e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.0529810655862093, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 3862 }, { "completion_length": 417.0446662902832, "epoch": 0.3012102196324518, "grad_norm": 0.1121152403717273, "kl": 0.002765655517578125, "learning_rate": 7.92345077048894e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.04306511115282774, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 3864 }, { "completion_length": 413.8616256713867, "epoch": 0.30136612554323466, "grad_norm": 0.10191472328800114, "kl": 0.003101348876953125, "learning_rate": 7.921463642474273e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.04373771324753761, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 3866 }, { "completion_length": 428.1093940734863, "epoch": 0.3015220314540175, "grad_norm": 0.047173385144384655, "kl": 0.0030879974365234375, "learning_rate": 7.919475813581795e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.7343750447034836, "rewards/format_reward": 1.0, "step": 3868 }, { "completion_length": 415.1495704650879, "epoch": 0.30167793736480036, "grad_norm": 0.1212052393693973, "kl": 0.0033435821533203125, "learning_rate": 7.917487284288399e-07, "loss": 0.0001, "reward": 1.6183036416769028, "reward_std": 0.09266121964901686, "rewards/accuracy_reward": 0.6205357387661934, "rewards/format_reward": 0.9977678656578064, "step": 3870 }, { "completion_length": 439.3169860839844, "epoch": 0.30183384327558316, "grad_norm": 0.10272654358662212, "kl": 0.0038890838623046875, "learning_rate": 7.915498055071144e-07, "loss": 0.0002, "reward": 1.7857143729925156, "reward_std": 0.04373771417886019, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 3872 }, { "completion_length": 420.3839454650879, "epoch": 0.301989749186366, "grad_norm": 0.11460348278579402, "kl": 0.00330352783203125, "learning_rate": 7.91350812640726e-07, "loss": 0.0001, "reward": 1.7299108058214188, "reward_std": 0.07905721105635166, "rewards/accuracy_reward": 0.729910746216774, "rewards/format_reward": 1.0, "step": 3874 }, { "completion_length": 431.6674270629883, "epoch": 0.30214565509714886, "grad_norm": 0.045241458187555504, "kl": 0.0029458999633789062, "learning_rate": 7.911517498774145e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.03870445489883423, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 3876 }, { "completion_length": 424.9330520629883, "epoch": 0.3023015610079317, "grad_norm": 0.10777073152569883, "kl": 0.0034637451171875, "learning_rate": 7.909526172649359e-07, "loss": 0.0001, "reward": 1.7053572088479996, "reward_std": 0.05959761422127485, "rewards/accuracy_reward": 0.705357164144516, "rewards/format_reward": 1.0, "step": 3878 }, { "completion_length": 419.70984268188477, "epoch": 0.30245746691871456, "grad_norm": 0.10921054010211595, "kl": 0.0031194686889648438, "learning_rate": 7.907534148510638e-07, "loss": 0.0001, "reward": 1.756696492433548, "reward_std": 0.05959621164947748, "rewards/accuracy_reward": 0.7566964626312256, "rewards/format_reward": 1.0, "step": 3880 }, { "completion_length": 413.98662185668945, "epoch": 0.3026133728294974, "grad_norm": 0.09210616366138183, "kl": 0.0029296875, "learning_rate": 7.90554142683588e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 3882 }, { "completion_length": 422.6451072692871, "epoch": 0.30276927874028026, "grad_norm": 0.07503320490143528, "kl": 0.003063201904296875, "learning_rate": 7.903548008103152e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 0.9977678656578064, "step": 3884 }, { "completion_length": 421.52234268188477, "epoch": 0.3029251846510631, "grad_norm": 0.05183669473545627, "kl": 0.0031538009643554688, "learning_rate": 7.901553892790686e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.03937845956534147, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 3886 }, { "completion_length": 424.3013572692871, "epoch": 0.3030810905618459, "grad_norm": 0.10568604195752675, "kl": 0.0031890869140625, "learning_rate": 7.899559081376885e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.055909561924636364, "rewards/accuracy_reward": 0.8080357238650322, "rewards/format_reward": 1.0, "step": 3888 }, { "completion_length": 425.46653747558594, "epoch": 0.30323699647262875, "grad_norm": 0.07556994151249095, "kl": 0.0027065277099609375, "learning_rate": 7.897563574340316e-07, "loss": 0.0001, "reward": 1.9129464775323868, "reward_std": 0.055235558189451694, "rewards/accuracy_reward": 0.9129464626312256, "rewards/format_reward": 1.0, "step": 3890 }, { "completion_length": 425.65180587768555, "epoch": 0.3033929023834116, "grad_norm": 0.09732918546546969, "kl": 0.004483222961425781, "learning_rate": 7.895567372159715e-07, "loss": 0.0002, "reward": 1.8705357909202576, "reward_std": 0.0728971241042018, "rewards/accuracy_reward": 0.8727678880095482, "rewards/format_reward": 0.9977678656578064, "step": 3892 }, { "completion_length": 410.7321548461914, "epoch": 0.30354880829419445, "grad_norm": 0.10720115005357761, "kl": 0.0029163360595703125, "learning_rate": 7.893570475313984e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.05569201707839966, "rewards/accuracy_reward": 0.8392857611179352, "rewards/format_reward": 1.0, "step": 3894 }, { "completion_length": 424.63171005249023, "epoch": 0.3037047142049773, "grad_norm": 0.11127420951292094, "kl": 0.0033054351806640625, "learning_rate": 7.89157288428219e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.04404080845415592, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 3896 }, { "completion_length": 433.02233505249023, "epoch": 0.30386062011576015, "grad_norm": 0.1342603798942627, "kl": 0.002765655517578125, "learning_rate": 7.889574599543567e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.0908617889508605, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 1.0, "step": 3898 }, { "completion_length": 415.1272506713867, "epoch": 0.304016526026543, "grad_norm": 0.06475324399547362, "kl": 0.0027751922607421875, "learning_rate": 7.887575621577518e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.06756076868623495, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 3900 }, { "completion_length": 421.5022506713867, "epoch": 0.30417243193732585, "grad_norm": 0.07816616134170738, "kl": 0.0026912689208984375, "learning_rate": 7.88557595086361e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.04035415779799223, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 3902 }, { "completion_length": 431.7745704650879, "epoch": 0.30432833784810864, "grad_norm": 0.049504442019935105, "kl": 0.002780914306640625, "learning_rate": 7.883575587881578e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 0.9977678656578064, "step": 3904 }, { "completion_length": 426.2902030944824, "epoch": 0.3044842437588915, "grad_norm": 0.07780326936949382, "kl": 0.003124237060546875, "learning_rate": 7.881574533111319e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.055788010358810425, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 0.9955357164144516, "step": 3906 }, { "completion_length": 406.4308204650879, "epoch": 0.30464014966967434, "grad_norm": 0.10592830344008325, "kl": 0.0026998519897460938, "learning_rate": 7.879572787032903e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.07222452107816935, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 3908 }, { "completion_length": 422.2031440734863, "epoch": 0.3047960555804572, "grad_norm": 0.08581673266244412, "kl": 0.0029172897338867188, "learning_rate": 7.877570350126559e-07, "loss": 0.0001, "reward": 1.8437501043081284, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 3910 }, { "completion_length": 416.9910888671875, "epoch": 0.30495196149124004, "grad_norm": 0.0880034666244813, "kl": 0.0028324127197265625, "learning_rate": 7.875567222872684e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.06493396870791912, "rewards/accuracy_reward": 0.8883929029107094, "rewards/format_reward": 0.9977678656578064, "step": 3912 }, { "completion_length": 422.68528747558594, "epoch": 0.3051078674020229, "grad_norm": 0.07126467478338833, "kl": 0.00292205810546875, "learning_rate": 7.873563405751841e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.0505718057975173, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 3914 }, { "completion_length": 436.4241256713867, "epoch": 0.30526377331280574, "grad_norm": 0.12795259656355973, "kl": 0.0034656524658203125, "learning_rate": 7.871558899244761e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.07806674018502235, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 0.9977678656578064, "step": 3916 }, { "completion_length": 423.6026954650879, "epoch": 0.3054196792235886, "grad_norm": 0.14460411394649958, "kl": 0.003139495849609375, "learning_rate": 7.869553703832334e-07, "loss": 0.0001, "reward": 1.7410715222358704, "reward_std": 0.11648427322506905, "rewards/accuracy_reward": 0.741071455180645, "rewards/format_reward": 1.0, "step": 3918 }, { "completion_length": 418.8415412902832, "epoch": 0.3055755851343714, "grad_norm": 0.09265560699237754, "kl": 0.003208160400390625, "learning_rate": 7.867547819995623e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.05005116853863001, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 3920 }, { "completion_length": 429.70314025878906, "epoch": 0.30573149104515424, "grad_norm": 0.08761092713534664, "kl": 0.0031442642211914062, "learning_rate": 7.865541248215852e-07, "loss": 0.0001, "reward": 1.7008929401636124, "reward_std": 0.07222452200949192, "rewards/accuracy_reward": 0.7008928880095482, "rewards/format_reward": 1.0, "step": 3922 }, { "completion_length": 437.6093978881836, "epoch": 0.3058873969559371, "grad_norm": 0.06382987172528166, "kl": 0.00299072265625, "learning_rate": 7.863533988974407e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 3924 }, { "completion_length": 405.7567138671875, "epoch": 0.30604330286671994, "grad_norm": 0.04782715909866379, "kl": 0.0025758743286132812, "learning_rate": 7.861526042752845e-07, "loss": 0.0001, "reward": 1.8794643580913544, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8794643133878708, "rewards/format_reward": 1.0, "step": 3926 }, { "completion_length": 426.7299270629883, "epoch": 0.3061992087775028, "grad_norm": 0.06767295104340647, "kl": 0.003246307373046875, "learning_rate": 7.859517410032885e-07, "loss": 0.0001, "reward": 1.6517858058214188, "reward_std": 0.05786095932126045, "rewards/accuracy_reward": 0.6517857387661934, "rewards/format_reward": 1.0, "step": 3928 }, { "completion_length": 414.97769927978516, "epoch": 0.30635511468828563, "grad_norm": 0.10274327281194377, "kl": 0.00308990478515625, "learning_rate": 7.857508091296412e-07, "loss": 0.0001, "reward": 1.8839285969734192, "reward_std": 0.05862051621079445, "rewards/accuracy_reward": 0.8839285895228386, "rewards/format_reward": 1.0, "step": 3930 }, { "completion_length": 420.8906478881836, "epoch": 0.3065110205990685, "grad_norm": 0.09173540867434185, "kl": 0.00330352783203125, "learning_rate": 7.855498087025471e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.04163295216858387, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 3932 }, { "completion_length": 421.12724685668945, "epoch": 0.3066669265098513, "grad_norm": 0.08350464500278403, "kl": 0.0026311874389648438, "learning_rate": 7.853487397702277e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.030438202433288097, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 3934 }, { "completion_length": 414.47099685668945, "epoch": 0.30682283242063413, "grad_norm": 0.1311922761322795, "kl": 0.00330352783203125, "learning_rate": 7.851476023809208e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.06267947610467672, "rewards/accuracy_reward": 0.772321455180645, "rewards/format_reward": 1.0, "step": 3936 }, { "completion_length": 409.0982322692871, "epoch": 0.306978738331417, "grad_norm": 0.0878676353011755, "kl": 0.002872467041015625, "learning_rate": 7.849463965828804e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.8549107611179352, "rewards/format_reward": 1.0, "step": 3938 }, { "completion_length": 419.50448989868164, "epoch": 0.30713464424219983, "grad_norm": 0.04406959634834889, "kl": 0.003177642822265625, "learning_rate": 7.84745122424377e-07, "loss": 0.0001, "reward": 1.7745536267757416, "reward_std": 0.05230706185102463, "rewards/accuracy_reward": 0.7745536006987095, "rewards/format_reward": 1.0, "step": 3940 }, { "completion_length": 411.5468940734863, "epoch": 0.3072905501529827, "grad_norm": 0.08491097267033883, "kl": 0.0029964447021484375, "learning_rate": 7.845437799536981e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.07582562230527401, "rewards/accuracy_reward": 0.825892873108387, "rewards/format_reward": 1.0, "step": 3942 }, { "completion_length": 422.5982360839844, "epoch": 0.3074464560637655, "grad_norm": 0.05086495721796334, "kl": 0.0029249191284179688, "learning_rate": 7.843423692191462e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 3944 }, { "completion_length": 425.6718940734863, "epoch": 0.3076023619745484, "grad_norm": 0.12170991138095491, "kl": 0.00337982177734375, "learning_rate": 7.841408902690415e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.10205653496086597, "rewards/accuracy_reward": 0.8125000223517418, "rewards/format_reward": 1.0, "step": 3946 }, { "completion_length": 420.44644927978516, "epoch": 0.3077582678853312, "grad_norm": 0.10371080864484174, "kl": 0.003368377685546875, "learning_rate": 7.8393934315172e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.08941485825926065, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 0.9977678656578064, "step": 3948 }, { "completion_length": 412.1585006713867, "epoch": 0.307914173796114, "grad_norm": 0.12598129125202912, "kl": 0.0028486251831054688, "learning_rate": 7.837377279155344e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.0707923611626029, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 3950 }, { "completion_length": 418.74109268188477, "epoch": 0.30807007970689687, "grad_norm": 0.10908581539419232, "kl": 0.00318145751953125, "learning_rate": 7.83536044608853e-07, "loss": 0.0001, "reward": 1.756696492433548, "reward_std": 0.05102826841175556, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 0.9977678656578064, "step": 3952 }, { "completion_length": 425.4687690734863, "epoch": 0.3082259856176797, "grad_norm": 0.11626689550930346, "kl": 0.0029735565185546875, "learning_rate": 7.833342932800612e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.06027021538466215, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 1.0, "step": 3954 }, { "completion_length": 419.1227836608887, "epoch": 0.30838189152846257, "grad_norm": 0.1153978500609392, "kl": 0.00289154052734375, "learning_rate": 7.831324739775604e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.08079246617853642, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 3956 }, { "completion_length": 426.3281440734863, "epoch": 0.3085377974392454, "grad_norm": 0.13013545188522357, "kl": 0.0029554367065429688, "learning_rate": 7.829305867497686e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.053282760083675385, "rewards/accuracy_reward": 0.8035714775323868, "rewards/format_reward": 1.0, "step": 3958 }, { "completion_length": 424.63171768188477, "epoch": 0.30869370335002827, "grad_norm": 0.10032187171469992, "kl": 0.0028467178344726562, "learning_rate": 7.827286316451193e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.045993607491254807, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 3960 }, { "completion_length": 403.43082427978516, "epoch": 0.3088496092608111, "grad_norm": 0.08277006637303663, "kl": 0.0025386810302734375, "learning_rate": 7.825266087120632e-07, "loss": 0.0001, "reward": 1.9263393431901932, "reward_std": 0.04937856364995241, "rewards/accuracy_reward": 0.9263393208384514, "rewards/format_reward": 1.0, "step": 3962 }, { "completion_length": 422.74555587768555, "epoch": 0.30900551517159397, "grad_norm": 0.06265215792309042, "kl": 0.0029964447021484375, "learning_rate": 7.823245179990669e-07, "loss": 0.0001, "reward": 1.88839291036129, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8883928880095482, "rewards/format_reward": 1.0, "step": 3964 }, { "completion_length": 410.1986770629883, "epoch": 0.30916142108237676, "grad_norm": 0.10820445657025195, "kl": 0.0028047561645507812, "learning_rate": 7.821223595546129e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.055909561924636364, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 3966 }, { "completion_length": 412.46430587768555, "epoch": 0.3093173269931596, "grad_norm": 0.12053509155807304, "kl": 0.00270843505859375, "learning_rate": 7.819201334272007e-07, "loss": 0.0001, "reward": 1.8794643580913544, "reward_std": 0.05651575140655041, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 1.0, "step": 3968 }, { "completion_length": 405.78572845458984, "epoch": 0.30947323290394246, "grad_norm": 0.08925067217934458, "kl": 0.0026712417602539062, "learning_rate": 7.817178396653455e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.045993607491254807, "rewards/accuracy_reward": 0.8883928880095482, "rewards/format_reward": 1.0, "step": 3970 }, { "completion_length": 421.10716247558594, "epoch": 0.3096291388147253, "grad_norm": 0.04906222826517398, "kl": 0.0029087066650390625, "learning_rate": 7.815154783175788e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.8526785969734192, "rewards/format_reward": 0.9977678656578064, "step": 3972 }, { "completion_length": 413.10046768188477, "epoch": 0.30978504472550816, "grad_norm": 0.10635075878377266, "kl": 0.0028409957885742188, "learning_rate": 7.813130494324485e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.041632951237261295, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 3974 }, { "completion_length": 411.4085006713867, "epoch": 0.309940950636291, "grad_norm": 0.09798695755681276, "kl": 0.0028553009033203125, "learning_rate": 7.811105530585186e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 3976 }, { "completion_length": 412.1317138671875, "epoch": 0.31009685654707386, "grad_norm": 0.07325862452055967, "kl": 0.0026826858520507812, "learning_rate": 7.809079892443692e-07, "loss": 0.0001, "reward": 1.8125000447034836, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.8125000223517418, "rewards/format_reward": 1.0, "step": 3978 }, { "completion_length": 430.4062690734863, "epoch": 0.31025276245785666, "grad_norm": 0.11389285810309736, "kl": 0.003292083740234375, "learning_rate": 7.807053580385965e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.06350180879235268, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 3980 }, { "completion_length": 431.78796768188477, "epoch": 0.3104086683686395, "grad_norm": 0.11552261277791982, "kl": 0.00554656982421875, "learning_rate": 7.805026594898132e-07, "loss": 0.0002, "reward": 1.7455357909202576, "reward_std": 0.08341926708817482, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 3982 }, { "completion_length": 434.1250228881836, "epoch": 0.31056457427942236, "grad_norm": 0.10395107070497331, "kl": 0.0033130645751953125, "learning_rate": 7.80299893646648e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.07936030719429255, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 3984 }, { "completion_length": 418.4040336608887, "epoch": 0.3107204801902052, "grad_norm": 0.004600150619780665, "kl": 0.0032196044921875, "learning_rate": 7.800970605577459e-07, "loss": 0.0001, "reward": 1.8549107611179352, "reward_std": 0.02186885755509138, "rewards/accuracy_reward": 0.8549107387661934, "rewards/format_reward": 1.0, "step": 3986 }, { "completion_length": 414.3415336608887, "epoch": 0.31087638610098806, "grad_norm": 0.006589227860200763, "kl": 0.0026006698608398438, "learning_rate": 7.798941602717673e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 3988 }, { "completion_length": 418.1607360839844, "epoch": 0.3110322920117709, "grad_norm": 0.04281934573884621, "kl": 0.0031337738037109375, "learning_rate": 7.796911928373897e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.05200396664440632, "rewards/accuracy_reward": 0.754464328289032, "rewards/format_reward": 1.0, "step": 3990 }, { "completion_length": 416.9665412902832, "epoch": 0.31118819792255376, "grad_norm": 0.097814992797558, "kl": 0.003345489501953125, "learning_rate": 7.794881583033063e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.09439647290855646, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 3992 }, { "completion_length": 427.3660888671875, "epoch": 0.3113441038333366, "grad_norm": 0.09102116862884443, "kl": 0.003021240234375, "learning_rate": 7.792850567182263e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.06072667706757784, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 3994 }, { "completion_length": 420.9062690734863, "epoch": 0.3115000097441194, "grad_norm": 0.09590797307111362, "kl": 0.003147125244140625, "learning_rate": 7.79081888130875e-07, "loss": 0.0001, "reward": 1.8013393431901932, "reward_std": 0.09138242620974779, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 3996 }, { "completion_length": 420.94421768188477, "epoch": 0.31165591565490225, "grad_norm": 0.0502756478645046, "kl": 0.00240325927734375, "learning_rate": 7.788786525899937e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8459821939468384, "rewards/format_reward": 1.0, "step": 3998 }, { "completion_length": 412.1384086608887, "epoch": 0.3118118215656851, "grad_norm": 0.06758578004835296, "kl": 0.0038013458251953125, "learning_rate": 7.786753501443403e-07, "loss": 0.0002, "reward": 1.866071492433548, "reward_std": 0.052003965713083744, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 1.0, "step": 4000 }, { "completion_length": 425.6451072692871, "epoch": 0.31196772747646795, "grad_norm": 0.0635175464329421, "kl": 0.00287628173828125, "learning_rate": 7.784719808426881e-07, "loss": 0.0001, "reward": 1.7991072535514832, "reward_std": 0.05651575140655041, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 4002 }, { "completion_length": 412.3303756713867, "epoch": 0.3121236333872508, "grad_norm": 0.10114838143781216, "kl": 0.0027523040771484375, "learning_rate": 7.782685447338268e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.058838057331740856, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 4004 }, { "completion_length": 428.3236846923828, "epoch": 0.31227953929803365, "grad_norm": 0.09358110810829902, "kl": 0.0031137466430664062, "learning_rate": 7.780650418665622e-07, "loss": 0.0001, "reward": 1.8950893431901932, "reward_std": 0.0539567656815052, "rewards/accuracy_reward": 0.8950893059372902, "rewards/format_reward": 1.0, "step": 4006 }, { "completion_length": 405.9285888671875, "epoch": 0.3124354452088165, "grad_norm": 0.003809394624927426, "kl": 0.0025129318237304688, "learning_rate": 7.778614722897153e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.04132985696196556, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 4008 }, { "completion_length": 433.7254638671875, "epoch": 0.31259135111959935, "grad_norm": 0.08240678122637352, "kl": 0.003154754638671875, "learning_rate": 7.776578360521244e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.03111080639064312, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 4010 }, { "completion_length": 422.4843940734863, "epoch": 0.31274725703038214, "grad_norm": 0.06539903657409013, "kl": 0.002864837646484375, "learning_rate": 7.774541332026428e-07, "loss": 0.0001, "reward": 1.7812500596046448, "reward_std": 0.039983248338103294, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 4012 }, { "completion_length": 421.4129638671875, "epoch": 0.312903162941165, "grad_norm": 0.12466892936041456, "kl": 0.00287628173828125, "learning_rate": 7.772503637901403e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.08387432899326086, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 4014 }, { "completion_length": 437.5312728881836, "epoch": 0.31305906885194784, "grad_norm": 0.06057970389420586, "kl": 0.0033626556396484375, "learning_rate": 7.770465278635024e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.055235556326806545, "rewards/accuracy_reward": 0.7790178805589676, "rewards/format_reward": 1.0, "step": 4016 }, { "completion_length": 429.4531440734863, "epoch": 0.3132149747627307, "grad_norm": 0.07460374049188347, "kl": 0.0029811859130859375, "learning_rate": 7.768426254716304e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.020893159322440624, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 4018 }, { "completion_length": 415.2991256713867, "epoch": 0.31337088067351354, "grad_norm": 0.0034746093841893327, "kl": 0.0028018951416015625, "learning_rate": 7.766386566634424e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.9040179029107094, "rewards/format_reward": 1.0, "step": 4020 }, { "completion_length": 405.0714454650879, "epoch": 0.3135267865842964, "grad_norm": 0.0835312017619313, "kl": 0.00260162353515625, "learning_rate": 7.764346214878711e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.03742426075041294, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 4022 }, { "completion_length": 428.9107322692871, "epoch": 0.31368269249507924, "grad_norm": 0.11106114880747969, "kl": 0.002994537353515625, "learning_rate": 7.762305199938664e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.04306511115282774, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 0.9977678656578064, "step": 4024 }, { "completion_length": 430.3861770629883, "epoch": 0.31383859840586203, "grad_norm": 0.15902679089721602, "kl": 0.003238677978515625, "learning_rate": 7.760263522303933e-07, "loss": 0.0001, "reward": 1.7500001043081284, "reward_std": 0.11160158272832632, "rewards/accuracy_reward": 0.750000037252903, "rewards/format_reward": 1.0, "step": 4026 }, { "completion_length": 420.752254486084, "epoch": 0.3139945043166449, "grad_norm": 0.1027972494810613, "kl": 0.0028514862060546875, "learning_rate": 7.75822118246433e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 4028 }, { "completion_length": 422.1562690734863, "epoch": 0.31415041022742773, "grad_norm": 0.05567532831641929, "kl": 0.003070831298828125, "learning_rate": 7.756178180909824e-07, "loss": 0.0001, "reward": 1.79241082072258, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 4030 }, { "completion_length": 437.6004638671875, "epoch": 0.3143063161382106, "grad_norm": 0.05112514965031053, "kl": 0.0033740997314453125, "learning_rate": 7.754134518130549e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.7700893059372902, "rewards/format_reward": 1.0, "step": 4032 }, { "completion_length": 429.97322845458984, "epoch": 0.31446222204899343, "grad_norm": 0.07462257025929389, "kl": 0.0028476715087890625, "learning_rate": 7.752090194616786e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.0664303032681346, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 4034 }, { "completion_length": 410.9665336608887, "epoch": 0.3146181279597763, "grad_norm": 0.11179096217617436, "kl": 0.00295257568359375, "learning_rate": 7.750045210858985e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.05102826841175556, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 4036 }, { "completion_length": 427.13616943359375, "epoch": 0.31477403387055913, "grad_norm": 0.09398000772679851, "kl": 0.0030508041381835938, "learning_rate": 7.747999567347752e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.056667715311050415, "rewards/accuracy_reward": 0.8727679029107094, "rewards/format_reward": 1.0, "step": 4038 }, { "completion_length": 429.96430587768555, "epoch": 0.314929939781342, "grad_norm": 0.11671086780618903, "kl": 0.003734588623046875, "learning_rate": 7.745953264573846e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.05831881985068321, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 0.9977678656578064, "step": 4040 }, { "completion_length": 419.3593940734863, "epoch": 0.3150858456921248, "grad_norm": 0.07041442902599299, "kl": 0.00328826904296875, "learning_rate": 7.743906303028192e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.03531949780881405, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 4042 }, { "completion_length": 432.2388610839844, "epoch": 0.3152417516029076, "grad_norm": 0.12642616888539757, "kl": 0.0034027099609375, "learning_rate": 7.741858683201866e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.07079095765948296, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 0.9977678656578064, "step": 4044 }, { "completion_length": 413.3080520629883, "epoch": 0.3153976575136905, "grad_norm": 0.10481775831563822, "kl": 0.0028314590454101562, "learning_rate": 7.739810405586108e-07, "loss": 0.0001, "reward": 1.770089402794838, "reward_std": 0.048099772073328495, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 4046 }, { "completion_length": 418.47099685668945, "epoch": 0.3155535634244733, "grad_norm": 0.08589858454359642, "kl": 0.0028924942016601562, "learning_rate": 7.73776147067231e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.07048926688730717, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 4048 }, { "completion_length": 412.7946586608887, "epoch": 0.3157094693352562, "grad_norm": 0.12483415165140432, "kl": 0.0029544830322265625, "learning_rate": 7.735711878952028e-07, "loss": 0.0001, "reward": 1.718750074505806, "reward_std": 0.0923581225797534, "rewards/accuracy_reward": 0.718750037252903, "rewards/format_reward": 1.0, "step": 4050 }, { "completion_length": 427.6183204650879, "epoch": 0.315865375246039, "grad_norm": 0.08889914053522138, "kl": 0.003238677978515625, "learning_rate": 7.73366163091697e-07, "loss": 0.0001, "reward": 1.861607238650322, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 4052 }, { "completion_length": 435.1607322692871, "epoch": 0.3160212811568219, "grad_norm": 0.12306314123833718, "kl": 0.00363922119140625, "learning_rate": 7.731610727059002e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.09522244241088629, "rewards/accuracy_reward": 0.756696455180645, "rewards/format_reward": 0.9977678656578064, "step": 4054 }, { "completion_length": 420.31474685668945, "epoch": 0.3161771870676047, "grad_norm": 0.0961062963542472, "kl": 0.0033168792724609375, "learning_rate": 7.729559167870152e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.05753953941166401, "rewards/accuracy_reward": 0.772321455180645, "rewards/format_reward": 0.9977678656578064, "step": 4056 }, { "completion_length": 432.80135345458984, "epoch": 0.3163330929783875, "grad_norm": 0.08065688196674965, "kl": 0.0030059814453125, "learning_rate": 7.727506953842601e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 4058 }, { "completion_length": 421.38394927978516, "epoch": 0.31648899888917037, "grad_norm": 0.060269980446805094, "kl": 0.0027866363525390625, "learning_rate": 7.725454085468688e-07, "loss": 0.0001, "reward": 1.7299107909202576, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.7299107387661934, "rewards/format_reward": 1.0, "step": 4060 }, { "completion_length": 421.46653747558594, "epoch": 0.3166449047999532, "grad_norm": 0.05136760315935147, "kl": 0.0030155181884765625, "learning_rate": 7.72340056324091e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 4062 }, { "completion_length": 429.8348388671875, "epoch": 0.31680081071073607, "grad_norm": 0.0039241585442192015, "kl": 0.002941131591796875, "learning_rate": 7.721346387651919e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.040657252073287964, "rewards/accuracy_reward": 0.7343750223517418, "rewards/format_reward": 1.0, "step": 4064 }, { "completion_length": 426.9710006713867, "epoch": 0.3169567166215189, "grad_norm": 0.0775821189624066, "kl": 0.00315093994140625, "learning_rate": 7.719291559194525e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.048922101967036724, "rewards/accuracy_reward": 0.8258929029107094, "rewards/format_reward": 1.0, "step": 4066 }, { "completion_length": 415.5067138671875, "epoch": 0.31711262253230177, "grad_norm": 0.08523543824354068, "kl": 0.003021240234375, "learning_rate": 7.717236078361698e-07, "loss": 0.0001, "reward": 1.7656250596046448, "reward_std": 0.0539567656815052, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 1.0, "step": 4068 }, { "completion_length": 421.69644927978516, "epoch": 0.3172685284430846, "grad_norm": 0.10177472537385891, "kl": 0.003040313720703125, "learning_rate": 7.715179945646554e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.034495764411985874, "rewards/accuracy_reward": 0.7611607611179352, "rewards/format_reward": 1.0, "step": 4070 }, { "completion_length": 434.61609268188477, "epoch": 0.31742443435386747, "grad_norm": 0.11424192814249678, "kl": 0.0030241012573242188, "learning_rate": 7.713123161542378e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.06786386482417583, "rewards/accuracy_reward": 0.8928571566939354, "rewards/format_reward": 0.9955357164144516, "step": 4072 }, { "completion_length": 424.36163330078125, "epoch": 0.31758034026465026, "grad_norm": 0.12622402869991936, "kl": 0.003528594970703125, "learning_rate": 7.711065726542604e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.07124741934239864, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 4074 }, { "completion_length": 426.767879486084, "epoch": 0.3177362461754331, "grad_norm": 0.09455957909204377, "kl": 0.0028467178344726562, "learning_rate": 7.709007641140823e-07, "loss": 0.0001, "reward": 1.7879464775323868, "reward_std": 0.061852105893194675, "rewards/accuracy_reward": 0.7879464477300644, "rewards/format_reward": 1.0, "step": 4076 }, { "completion_length": 423.314754486084, "epoch": 0.31789215208621596, "grad_norm": 0.058912886664706884, "kl": 0.002986907958984375, "learning_rate": 7.706948905830786e-07, "loss": 0.0001, "reward": 1.7209822237491608, "reward_std": 0.03531949780881405, "rewards/accuracy_reward": 0.7209821790456772, "rewards/format_reward": 1.0, "step": 4078 }, { "completion_length": 426.799129486084, "epoch": 0.3180480579969988, "grad_norm": 0.07326897573643533, "kl": 0.0032444000244140625, "learning_rate": 7.704889521106393e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.06222301535308361, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 1.0, "step": 4080 }, { "completion_length": 427.39734268188477, "epoch": 0.31820396390778166, "grad_norm": 0.048668444018630595, "kl": 0.003078460693359375, "learning_rate": 7.702829487461704e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.030438202433288097, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 4082 }, { "completion_length": 403.03349685668945, "epoch": 0.3183598698185645, "grad_norm": 0.08017777357989463, "kl": 0.0026683807373046875, "learning_rate": 7.700768805390936e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 4084 }, { "completion_length": 425.9754638671875, "epoch": 0.31851577572934736, "grad_norm": 0.12115468506311657, "kl": 0.0034580230712890625, "learning_rate": 7.698707475388459e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.06447750516235828, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 1.0, "step": 4086 }, { "completion_length": 417.1026954650879, "epoch": 0.31867168164013016, "grad_norm": 0.06767876025440052, "kl": 0.003215789794921875, "learning_rate": 7.696645497948799e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 4088 }, { "completion_length": 422.20538330078125, "epoch": 0.318827587550913, "grad_norm": 0.056285142217445766, "kl": 0.00310516357421875, "learning_rate": 7.69458287356664e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.7812500447034836, "rewards/format_reward": 1.0, "step": 4090 }, { "completion_length": 419.91743087768555, "epoch": 0.31898349346169586, "grad_norm": 0.10468453759960525, "kl": 0.002902984619140625, "learning_rate": 7.692519602736813e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.08634636178612709, "rewards/accuracy_reward": 0.8281250260770321, "rewards/format_reward": 1.0, "step": 4092 }, { "completion_length": 431.3817138671875, "epoch": 0.3191393993724787, "grad_norm": 0.10557076933681678, "kl": 0.0033054351806640625, "learning_rate": 7.690455685954316e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 0.9977678656578064, "step": 4094 }, { "completion_length": 426.8303756713867, "epoch": 0.31929530528326155, "grad_norm": 0.07641968295458526, "kl": 0.0028009414672851562, "learning_rate": 7.688391123714295e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.050051167607307434, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 1.0, "step": 4096 }, { "completion_length": 419.6071662902832, "epoch": 0.3194512111940444, "grad_norm": 0.09600947386977997, "kl": 0.002716064453125, "learning_rate": 7.686325916512046e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.8169643357396126, "rewards/format_reward": 1.0, "step": 4098 }, { "completion_length": 426.0960006713867, "epoch": 0.31960711710482725, "grad_norm": 0.08718127005015178, "kl": 0.0034236907958984375, "learning_rate": 7.684260064843032e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.07062786165624857, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 0.9977678656578064, "step": 4100 }, { "completion_length": 429.98662185668945, "epoch": 0.3197630230156101, "grad_norm": 0.076761216767753, "kl": 0.003070831298828125, "learning_rate": 7.682193569202862e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.772321455180645, "rewards/format_reward": 1.0, "step": 4102 }, { "completion_length": 427.57814025878906, "epoch": 0.3199189289263929, "grad_norm": 0.13158190218941773, "kl": 0.00302886962890625, "learning_rate": 7.6801264300873e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.0723728472366929, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 0.9977678656578064, "step": 4104 }, { "completion_length": 429.48439025878906, "epoch": 0.32007483483717575, "grad_norm": 0.0696130742590454, "kl": 0.0035858154296875, "learning_rate": 7.678058647992267e-07, "loss": 0.0001, "reward": 1.703125074505806, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.7031250447034836, "rewards/format_reward": 1.0, "step": 4106 }, { "completion_length": 432.2879638671875, "epoch": 0.3202307407479586, "grad_norm": 0.11254962783218526, "kl": 0.0033054351806640625, "learning_rate": 7.675990223413836e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.06417581252753735, "rewards/accuracy_reward": 0.8035714477300644, "rewards/format_reward": 1.0, "step": 4108 }, { "completion_length": 430.63618087768555, "epoch": 0.32038664665874145, "grad_norm": 0.13536472576632003, "kl": 0.0030279159545898438, "learning_rate": 7.673921156848237e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.07484992407262325, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 4110 }, { "completion_length": 441.7053756713867, "epoch": 0.3205425525695243, "grad_norm": 0.1155761279899606, "kl": 0.00341796875, "learning_rate": 7.671851448791853e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 4112 }, { "completion_length": 422.189754486084, "epoch": 0.32069845848030715, "grad_norm": 0.09459033631462324, "kl": 0.00318145751953125, "learning_rate": 7.669781099741217e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.06756077148020267, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 4114 }, { "completion_length": 415.55358123779297, "epoch": 0.32085436439109, "grad_norm": 0.08473170959162353, "kl": 0.00296783447265625, "learning_rate": 7.667710110193022e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.0505718057975173, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 4116 }, { "completion_length": 419.9263572692871, "epoch": 0.32101027030187285, "grad_norm": 0.0698569702422274, "kl": 0.00335693359375, "learning_rate": 7.665638480644111e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.04569191299378872, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 4118 }, { "completion_length": 420.377254486084, "epoch": 0.32116617621265564, "grad_norm": 0.0655925442414531, "kl": 0.0027942657470703125, "learning_rate": 7.663566211591479e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.037727353163063526, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 0.9977678656578064, "step": 4120 }, { "completion_length": 417.5000190734863, "epoch": 0.3213220821234385, "grad_norm": 0.09974048020117472, "kl": 0.002960205078125, "learning_rate": 7.661493303532278e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.06710430979728699, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 4122 }, { "completion_length": 406.32814025878906, "epoch": 0.32147798803422134, "grad_norm": 0.12057722737687103, "kl": 0.0029354095458984375, "learning_rate": 7.659419756963814e-07, "loss": 0.0001, "reward": 1.8593750447034836, "reward_std": 0.056366024538874626, "rewards/accuracy_reward": 0.8593750223517418, "rewards/format_reward": 1.0, "step": 4124 }, { "completion_length": 433.61162185668945, "epoch": 0.3216338939450042, "grad_norm": 0.004002033301218277, "kl": 0.0037441253662109375, "learning_rate": 7.657345572383541e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.04666761215776205, "rewards/accuracy_reward": 0.7924107387661934, "rewards/format_reward": 1.0, "step": 4126 }, { "completion_length": 417.9062690734863, "epoch": 0.32178979985578704, "grad_norm": 0.08304828152357276, "kl": 0.0029621124267578125, "learning_rate": 7.655270750289071e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.024797352962195873, "rewards/accuracy_reward": 0.7656250260770321, "rewards/format_reward": 1.0, "step": 4128 }, { "completion_length": 413.017879486084, "epoch": 0.3219457057665699, "grad_norm": 0.05754631949357605, "kl": 0.00284576416015625, "learning_rate": 7.653195291178166e-07, "loss": 0.0001, "reward": 1.8616071939468384, "reward_std": 0.01781129650771618, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 4130 }, { "completion_length": 451.9598388671875, "epoch": 0.32210161167735274, "grad_norm": 0.1335986014538015, "kl": 0.0040264129638671875, "learning_rate": 7.651119195548743e-07, "loss": 0.0002, "reward": 1.7522322535514832, "reward_std": 0.09168411791324615, "rewards/accuracy_reward": 0.7522321864962578, "rewards/format_reward": 1.0, "step": 4132 }, { "completion_length": 423.19868087768555, "epoch": 0.32225751758813553, "grad_norm": 0.055790652136008645, "kl": 0.00299072265625, "learning_rate": 7.64904246389887e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.03239100147038698, "rewards/accuracy_reward": 0.8549107313156128, "rewards/format_reward": 1.0, "step": 4134 }, { "completion_length": 425.4263572692871, "epoch": 0.3224134234989184, "grad_norm": 0.07669668320003464, "kl": 0.00307464599609375, "learning_rate": 7.646965096726766e-07, "loss": 0.0001, "reward": 1.7745536267757416, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 4136 }, { "completion_length": 429.6942138671875, "epoch": 0.32256932940970123, "grad_norm": 0.10165261051661605, "kl": 0.0031566619873046875, "learning_rate": 7.64488709453081e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.049947294406592846, "rewards/accuracy_reward": 0.7656250447034836, "rewards/format_reward": 0.9977678656578064, "step": 4138 }, { "completion_length": 422.61832427978516, "epoch": 0.3227252353204841, "grad_norm": 0.09784588255007981, "kl": 0.002956390380859375, "learning_rate": 7.642808457809522e-07, "loss": 0.0001, "reward": 1.868303656578064, "reward_std": 0.056667715311050415, "rewards/accuracy_reward": 0.8683036118745804, "rewards/format_reward": 1.0, "step": 4140 }, { "completion_length": 430.16743087768555, "epoch": 0.32288114123126693, "grad_norm": 0.12171263891473125, "kl": 0.003307342529296875, "learning_rate": 7.640729187061585e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.05395676475018263, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 4142 }, { "completion_length": 428.8839454650879, "epoch": 0.3230370471420498, "grad_norm": 0.13779987657225506, "kl": 0.0034618377685546875, "learning_rate": 7.638649282785827e-07, "loss": 0.0001, "reward": 1.6964286416769028, "reward_std": 0.07951367367058992, "rewards/accuracy_reward": 0.6964285895228386, "rewards/format_reward": 1.0, "step": 4144 }, { "completion_length": 423.29019927978516, "epoch": 0.32319295305283263, "grad_norm": 0.08233273416566306, "kl": 0.002960205078125, "learning_rate": 7.636568745481231e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.03239100147038698, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 4146 }, { "completion_length": 419.4620704650879, "epoch": 0.3233488589636155, "grad_norm": 0.11507839271764689, "kl": 0.0031538009643554688, "learning_rate": 7.634487575646931e-07, "loss": 0.0001, "reward": 1.7209822088479996, "reward_std": 0.10010373964905739, "rewards/accuracy_reward": 0.720982164144516, "rewards/format_reward": 1.0, "step": 4148 }, { "completion_length": 431.2254638671875, "epoch": 0.3235047648743983, "grad_norm": 0.09582017156710453, "kl": 0.0033483505249023438, "learning_rate": 7.632405773782214e-07, "loss": 0.0001, "reward": 1.76116082072258, "reward_std": 0.044194173999130726, "rewards/accuracy_reward": 0.7611607499420643, "rewards/format_reward": 1.0, "step": 4150 }, { "completion_length": 422.7656478881836, "epoch": 0.3236606707851811, "grad_norm": 0.09143612874722294, "kl": 0.0032358169555664062, "learning_rate": 7.630323340386515e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 4152 }, { "completion_length": 401.59376525878906, "epoch": 0.323816576695964, "grad_norm": 0.0033305587074829055, "kl": 0.0020818710327148438, "learning_rate": 7.628240275959424e-07, "loss": 0.0001, "reward": 1.8794643431901932, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 1.0, "step": 4154 }, { "completion_length": 424.7901954650879, "epoch": 0.3239724826067468, "grad_norm": 0.07511297169675894, "kl": 0.0032405853271484375, "learning_rate": 7.626156581000682e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 4156 }, { "completion_length": 431.96653747558594, "epoch": 0.3241283885175297, "grad_norm": 0.10671982610216225, "kl": 0.0035762786865234375, "learning_rate": 7.624072256010183e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 4158 }, { "completion_length": 431.88618087768555, "epoch": 0.3242842944283125, "grad_norm": 0.07262665809482795, "kl": 0.0030727386474609375, "learning_rate": 7.621987301487966e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.024124749936163425, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 4160 }, { "completion_length": 431.4241256713867, "epoch": 0.3244402003390954, "grad_norm": 0.0904106716928908, "kl": 0.00305938720703125, "learning_rate": 7.619901717934228e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.060876406729221344, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 4162 }, { "completion_length": 417.1138572692871, "epoch": 0.3245961062498782, "grad_norm": 0.1387485600176119, "kl": 0.0028333663940429688, "learning_rate": 7.617815505849312e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.07762505300343037, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 4164 }, { "completion_length": 426.04019927978516, "epoch": 0.324752012160661, "grad_norm": 0.06430619479692735, "kl": 0.0032958984375, "learning_rate": 7.615728665733715e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 4166 }, { "completion_length": 431.22099685668945, "epoch": 0.32490791807144387, "grad_norm": 0.08863517565323546, "kl": 0.0030364990234375, "learning_rate": 7.613641198088081e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.04035415779799223, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 1.0, "step": 4168 }, { "completion_length": 423.6339454650879, "epoch": 0.3250638239822267, "grad_norm": 0.11329989483333842, "kl": 0.0028209686279296875, "learning_rate": 7.611553103413208e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.05395676288753748, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 4170 }, { "completion_length": 418.9620666503906, "epoch": 0.32521972989300957, "grad_norm": 0.09585347976381638, "kl": 0.002933502197265625, "learning_rate": 7.609464382210045e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 0.9977678656578064, "step": 4172 }, { "completion_length": 406.8727836608887, "epoch": 0.3253756358037924, "grad_norm": 0.07831137621175624, "kl": 0.0032320022583007812, "learning_rate": 7.607375034979687e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.05636602267622948, "rewards/accuracy_reward": 0.7790179029107094, "rewards/format_reward": 1.0, "step": 4174 }, { "completion_length": 408.02233505249023, "epoch": 0.32553154171457527, "grad_norm": 0.11564318324718988, "kl": 0.0028371810913085938, "learning_rate": 7.605285062223383e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.05523695703595877, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 4176 }, { "completion_length": 412.05582427978516, "epoch": 0.3256874476253581, "grad_norm": 0.05958082031345274, "kl": 0.0025606155395507812, "learning_rate": 7.603194464442533e-07, "loss": 0.0001, "reward": 1.8973214775323868, "reward_std": 0.008266251534223557, "rewards/accuracy_reward": 0.897321455180645, "rewards/format_reward": 1.0, "step": 4178 }, { "completion_length": 437.18974685668945, "epoch": 0.3258433535361409, "grad_norm": 0.09577838871002609, "kl": 0.00289154052734375, "learning_rate": 7.60110324213868e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.06637797132134438, "rewards/accuracy_reward": 0.7879464477300644, "rewards/format_reward": 0.9977678656578064, "step": 4180 }, { "completion_length": 419.32368087768555, "epoch": 0.32599925944692376, "grad_norm": 0.13790146666616254, "kl": 0.0032444000244140625, "learning_rate": 7.599011395813524e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.09363691881299019, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 4182 }, { "completion_length": 436.31028747558594, "epoch": 0.3261551653577066, "grad_norm": 0.11637119489083181, "kl": 0.0032787322998046875, "learning_rate": 7.596918925968913e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.07274375669658184, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 1.0, "step": 4184 }, { "completion_length": 420.95983505249023, "epoch": 0.32631107126848946, "grad_norm": 0.08438746286131267, "kl": 0.0026454925537109375, "learning_rate": 7.594825833106844e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.04373771511018276, "rewards/accuracy_reward": 0.8839285969734192, "rewards/format_reward": 1.0, "step": 4186 }, { "completion_length": 428.08484268188477, "epoch": 0.3264669771792723, "grad_norm": 0.06679066259918173, "kl": 0.0030965805053710938, "learning_rate": 7.592732117729461e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.7790179029107094, "rewards/format_reward": 0.9977678656578064, "step": 4188 }, { "completion_length": 422.4843940734863, "epoch": 0.32662288309005516, "grad_norm": 0.04633375321767518, "kl": 0.0031986236572265625, "learning_rate": 7.590637780339063e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 1.0, "step": 4190 }, { "completion_length": 435.85493087768555, "epoch": 0.326778789000838, "grad_norm": 0.12749901050331544, "kl": 0.0032863616943359375, "learning_rate": 7.588542821438093e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.06463087163865566, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 0.9977678656578064, "step": 4192 }, { "completion_length": 410.34153747558594, "epoch": 0.32693469491162086, "grad_norm": 0.07852979402838871, "kl": 0.002811431884765625, "learning_rate": 7.586447241529145e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.03547286428511143, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 4194 }, { "completion_length": 427.05582427978516, "epoch": 0.32709060082240365, "grad_norm": 0.044756096697775326, "kl": 0.0033206939697265625, "learning_rate": 7.584351041114961e-07, "loss": 0.0001, "reward": 1.6026786416769028, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.6026786006987095, "rewards/format_reward": 1.0, "step": 4196 }, { "completion_length": 423.8125228881836, "epoch": 0.3272465067331865, "grad_norm": 0.09367441128326047, "kl": 0.0031890869140625, "learning_rate": 7.582254220698437e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 1.0, "step": 4198 }, { "completion_length": 437.7477912902832, "epoch": 0.32740241264396935, "grad_norm": 0.11585545926444424, "kl": 0.0035724639892578125, "learning_rate": 7.58015678078261e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.08799606841057539, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 4200 }, { "completion_length": 419.6093940734863, "epoch": 0.3275583185547522, "grad_norm": 0.03851065995400745, "kl": 0.00287628173828125, "learning_rate": 7.578058721870668e-07, "loss": 0.0001, "reward": 1.879464328289032, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8794643133878708, "rewards/format_reward": 1.0, "step": 4202 }, { "completion_length": 441.54019927978516, "epoch": 0.32771422446553505, "grad_norm": 0.09626264506181044, "kl": 0.00357818603515625, "learning_rate": 7.575960044465952e-07, "loss": 0.0001, "reward": 1.82589291036129, "reward_std": 0.0683831013739109, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 4204 }, { "completion_length": 428.33930587768555, "epoch": 0.3278701303763179, "grad_norm": 0.08648974960648563, "kl": 0.00322723388671875, "learning_rate": 7.573860749071948e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.03644856344908476, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 4206 }, { "completion_length": 430.4643096923828, "epoch": 0.32802603628710075, "grad_norm": 0.07990642132776357, "kl": 0.0046062469482421875, "learning_rate": 7.571760836192289e-07, "loss": 0.0002, "reward": 1.8035715222358704, "reward_std": 0.06575629953294992, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 4208 }, { "completion_length": 420.3214530944824, "epoch": 0.3281819421978836, "grad_norm": 0.07207265557129162, "kl": 0.003509521484375, "learning_rate": 7.569660306330758e-07, "loss": 0.0001, "reward": 1.8995536267757416, "reward_std": 0.04501790925860405, "rewards/accuracy_reward": 0.8995536044239998, "rewards/format_reward": 1.0, "step": 4210 }, { "completion_length": 427.18751525878906, "epoch": 0.3283378481086664, "grad_norm": 0.08184779399441983, "kl": 0.0037021636962890625, "learning_rate": 7.567559159991286e-07, "loss": 0.0001, "reward": 1.718750074505806, "reward_std": 0.05846714973449707, "rewards/accuracy_reward": 0.718750037252903, "rewards/format_reward": 1.0, "step": 4212 }, { "completion_length": 427.91296005249023, "epoch": 0.32849375401944925, "grad_norm": 0.10605721742381112, "kl": 0.00318145751953125, "learning_rate": 7.56545739767795e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.05102826841175556, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 4214 }, { "completion_length": 420.49332427978516, "epoch": 0.3286496599302321, "grad_norm": 0.1324347017121903, "kl": 0.003284454345703125, "learning_rate": 7.563355019894975e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.08372096065431833, "rewards/accuracy_reward": 0.7299107499420643, "rewards/format_reward": 0.9977678656578064, "step": 4216 }, { "completion_length": 429.05359268188477, "epoch": 0.32880556584101495, "grad_norm": 0.08673688770615266, "kl": 0.0034313201904296875, "learning_rate": 7.561252027146741e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.04764331039041281, "rewards/accuracy_reward": 0.8705357536673546, "rewards/format_reward": 1.0, "step": 4218 }, { "completion_length": 436.59376525878906, "epoch": 0.3289614717517978, "grad_norm": 0.043891282156724144, "kl": 0.0029544830322265625, "learning_rate": 7.559148419937764e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8571428805589676, "rewards/format_reward": 1.0, "step": 4220 }, { "completion_length": 416.2477912902832, "epoch": 0.32911737766258065, "grad_norm": 0.11161770315979865, "kl": 0.0030422210693359375, "learning_rate": 7.557044198772712e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.055909561924636364, "rewards/accuracy_reward": 0.7232143096625805, "rewards/format_reward": 1.0, "step": 4222 }, { "completion_length": 430.808048248291, "epoch": 0.3292732835733635, "grad_norm": 0.06236286465032003, "kl": 0.0033969879150390625, "learning_rate": 7.554939364156407e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 4224 }, { "completion_length": 424.5513572692871, "epoch": 0.32942918948414635, "grad_norm": 0.07814691283280788, "kl": 0.0028438568115234375, "learning_rate": 7.552833916593807e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 4226 }, { "completion_length": 419.43528747558594, "epoch": 0.32958509539492914, "grad_norm": 0.1095970167308775, "kl": 0.004058837890625, "learning_rate": 7.550727856590025e-07, "loss": 0.0002, "reward": 1.7633929401636124, "reward_std": 0.06508369371294975, "rewards/accuracy_reward": 0.7633929029107094, "rewards/format_reward": 1.0, "step": 4228 }, { "completion_length": 426.47323989868164, "epoch": 0.329741001305712, "grad_norm": 0.10127736340345567, "kl": 0.0033159255981445312, "learning_rate": 7.548621184650315e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 4230 }, { "completion_length": 424.66966247558594, "epoch": 0.32989690721649484, "grad_norm": 0.06014094777945988, "kl": 0.00302886962890625, "learning_rate": 7.546513901280085e-07, "loss": 0.0001, "reward": 1.8928571939468384, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8928571864962578, "rewards/format_reward": 1.0, "step": 4232 }, { "completion_length": 433.9732322692871, "epoch": 0.3300528131272777, "grad_norm": 0.06217094815653492, "kl": 0.0030498504638671875, "learning_rate": 7.544406006984884e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.05831742100417614, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 4234 }, { "completion_length": 435.60493087768555, "epoch": 0.33020871903806054, "grad_norm": 0.044314305097168154, "kl": 0.003353118896484375, "learning_rate": 7.542297502270408e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 4236 }, { "completion_length": 427.72769927978516, "epoch": 0.3303646249488434, "grad_norm": 0.09676983113769347, "kl": 0.0032901763916015625, "learning_rate": 7.540188387642503e-07, "loss": 0.0001, "reward": 1.6941965222358704, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.694196455180645, "rewards/format_reward": 1.0, "step": 4238 }, { "completion_length": 422.86609268188477, "epoch": 0.33052053085962624, "grad_norm": 0.07747269315968007, "kl": 0.0028839111328125, "learning_rate": 7.538078663607159e-07, "loss": 0.0001, "reward": 1.8482143878936768, "reward_std": 0.057188354432582855, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 4240 }, { "completion_length": 433.22546768188477, "epoch": 0.33067643677040903, "grad_norm": 0.09319319874169273, "kl": 0.0033693313598632812, "learning_rate": 7.535968330670513e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.08166428655385971, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 0.9977678656578064, "step": 4242 }, { "completion_length": 435.8013610839844, "epoch": 0.3308323426811919, "grad_norm": 0.08490413439095008, "kl": 0.0034198760986328125, "learning_rate": 7.533857389338842e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.058620515279471874, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 0.9977678656578064, "step": 4244 }, { "completion_length": 431.1540336608887, "epoch": 0.33098824859197473, "grad_norm": 0.11538353973752115, "kl": 0.003162384033203125, "learning_rate": 7.53174584011858e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.07387282326817513, "rewards/accuracy_reward": 0.7522321864962578, "rewards/format_reward": 1.0, "step": 4246 }, { "completion_length": 417.8125190734863, "epoch": 0.3311441545027576, "grad_norm": 0.08592616367750218, "kl": 0.00293731689453125, "learning_rate": 7.5296336835163e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.06380490306764841, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 4248 }, { "completion_length": 418.88171005249023, "epoch": 0.33130006041354043, "grad_norm": 0.14252290204103968, "kl": 0.003040313720703125, "learning_rate": 7.527520920038721e-07, "loss": 0.0001, "reward": 1.9017857760190964, "reward_std": 0.06282780319452286, "rewards/accuracy_reward": 0.9017857685685158, "rewards/format_reward": 1.0, "step": 4250 }, { "completion_length": 417.47546768188477, "epoch": 0.3314559663243233, "grad_norm": 0.10578107792594418, "kl": 0.0029449462890625, "learning_rate": 7.525407550192709e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 4252 }, { "completion_length": 443.69868087768555, "epoch": 0.33161187223510613, "grad_norm": 0.1245750550301893, "kl": 0.0031147003173828125, "learning_rate": 7.523293574485274e-07, "loss": 0.0001, "reward": 1.7299107760190964, "reward_std": 0.13054194208234549, "rewards/accuracy_reward": 0.7321428805589676, "rewards/format_reward": 0.9977678656578064, "step": 4254 }, { "completion_length": 416.1205596923828, "epoch": 0.331767778145889, "grad_norm": 0.11800720930289525, "kl": 0.00318145751953125, "learning_rate": 7.521178993423574e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.058620515279471874, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 4256 }, { "completion_length": 421.21430587768555, "epoch": 0.3319236840566718, "grad_norm": 0.08313401049918154, "kl": 0.00298309326171875, "learning_rate": 7.519063807514907e-07, "loss": 0.0001, "reward": 1.8683036714792252, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.868303619325161, "rewards/format_reward": 1.0, "step": 4258 }, { "completion_length": 415.9308204650879, "epoch": 0.3320795899674546, "grad_norm": 0.09260662995409304, "kl": 0.0028781890869140625, "learning_rate": 7.516948017266725e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 4260 }, { "completion_length": 422.5803756713867, "epoch": 0.3322354958782375, "grad_norm": 0.10322250546375775, "kl": 0.0034465789794921875, "learning_rate": 7.514831623186615e-07, "loss": 0.0001, "reward": 1.7433036416769028, "reward_std": 0.08860225696116686, "rewards/accuracy_reward": 0.743303619325161, "rewards/format_reward": 1.0, "step": 4262 }, { "completion_length": 417.93528747558594, "epoch": 0.3323914017890203, "grad_norm": 0.08428851260800908, "kl": 0.0029582977294921875, "learning_rate": 7.512714625782315e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.05651434697210789, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 4264 }, { "completion_length": 428.9018020629883, "epoch": 0.3325473076998032, "grad_norm": 0.08184997099327013, "kl": 0.0028629302978515625, "learning_rate": 7.510597025561708e-07, "loss": 0.0001, "reward": 1.705357238650322, "reward_std": 0.040354158729314804, "rewards/accuracy_reward": 0.7053571715950966, "rewards/format_reward": 1.0, "step": 4266 }, { "completion_length": 423.08484268188477, "epoch": 0.332703213610586, "grad_norm": 0.08051115803178725, "kl": 0.0029745101928710938, "learning_rate": 7.508478823032818e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.030438202433288097, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4268 }, { "completion_length": 434.6540412902832, "epoch": 0.3328591195213689, "grad_norm": 0.07977301592444191, "kl": 0.0029392242431640625, "learning_rate": 7.506360018703814e-07, "loss": 0.0001, "reward": 1.8593750447034836, "reward_std": 0.03547286428511143, "rewards/accuracy_reward": 0.859375037252903, "rewards/format_reward": 1.0, "step": 4270 }, { "completion_length": 436.56028747558594, "epoch": 0.3330150254321517, "grad_norm": 0.10115391692623658, "kl": 0.003246307373046875, "learning_rate": 7.504240613083014e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.04065584950149059, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 4272 }, { "completion_length": 432.25002670288086, "epoch": 0.3331709313429345, "grad_norm": 0.09826610116238885, "kl": 0.0033416748046875, "learning_rate": 7.502120606678874e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 4274 }, { "completion_length": 430.77233505249023, "epoch": 0.33332683725371737, "grad_norm": 0.21462456075926667, "kl": 0.007716178894042969, "learning_rate": 7.5e-07, "loss": 0.0003, "reward": 1.774553656578064, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.7745535969734192, "rewards/format_reward": 1.0, "step": 4276 }, { "completion_length": 429.2924270629883, "epoch": 0.3334827431645002, "grad_norm": 0.08005392639226189, "kl": 0.0033826828002929688, "learning_rate": 7.497878793555136e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.05298106465488672, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 1.0, "step": 4278 }, { "completion_length": 422.02457427978516, "epoch": 0.33363864907528307, "grad_norm": 0.10138144182522457, "kl": 0.00296783447265625, "learning_rate": 7.495756987853174e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.03803044930100441, "rewards/accuracy_reward": 0.7834821864962578, "rewards/format_reward": 1.0, "step": 4280 }, { "completion_length": 440.6294822692871, "epoch": 0.3337945549860659, "grad_norm": 0.07019825281098603, "kl": 0.0034160614013671875, "learning_rate": 7.493634583403153e-07, "loss": 0.0001, "reward": 1.8549107611179352, "reward_std": 0.05538892187178135, "rewards/accuracy_reward": 0.8549107387661934, "rewards/format_reward": 1.0, "step": 4282 }, { "completion_length": 426.8035888671875, "epoch": 0.33395046089684877, "grad_norm": 0.13209873814162945, "kl": 0.0031290054321289062, "learning_rate": 7.491511580714244e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.06124731618911028, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 0.9977678656578064, "step": 4284 }, { "completion_length": 417.4218940734863, "epoch": 0.3341063668076316, "grad_norm": 0.11122674736396401, "kl": 0.00299835205078125, "learning_rate": 7.489387980295773e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.05200396664440632, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 4286 }, { "completion_length": 412.14733505249023, "epoch": 0.3342622727184144, "grad_norm": 0.07342852065408643, "kl": 0.00251007080078125, "learning_rate": 7.487263782657205e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.03742566145956516, "rewards/accuracy_reward": 0.8348214775323868, "rewards/format_reward": 1.0, "step": 4288 }, { "completion_length": 433.2031440734863, "epoch": 0.33441817862919726, "grad_norm": 0.08505218030074316, "kl": 0.00319671630859375, "learning_rate": 7.485138988308148e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.043739115819334984, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 4290 }, { "completion_length": 416.83483505249023, "epoch": 0.3345740845399801, "grad_norm": 0.08180770677524941, "kl": 0.0033931732177734375, "learning_rate": 7.483013597758356e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.06320011615753174, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 4292 }, { "completion_length": 435.2388572692871, "epoch": 0.33472999045076296, "grad_norm": 0.10501670002731688, "kl": 0.003711700439453125, "learning_rate": 7.480887611517721e-07, "loss": 0.0001, "reward": 1.8191964775323868, "reward_std": 0.09394001122564077, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 4294 }, { "completion_length": 424.8906440734863, "epoch": 0.3348858963615458, "grad_norm": 0.11976776500281318, "kl": 0.0029010772705078125, "learning_rate": 7.478761030096281e-07, "loss": 0.0001, "reward": 1.7142858058214188, "reward_std": 0.08409187290817499, "rewards/accuracy_reward": 0.7142857536673546, "rewards/format_reward": 1.0, "step": 4296 }, { "completion_length": 421.0982322692871, "epoch": 0.33504180227232866, "grad_norm": 0.1057502367314225, "kl": 0.002941131591796875, "learning_rate": 7.476633854004217e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.05688666179776192, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 4298 }, { "completion_length": 415.98216247558594, "epoch": 0.3351977081831115, "grad_norm": 0.10670199657337213, "kl": 0.00362396240234375, "learning_rate": 7.474506083751853e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.034798857755959034, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 4300 }, { "completion_length": 421.13394927978516, "epoch": 0.33535361409389436, "grad_norm": 0.08862076002357179, "kl": 0.0030345916748046875, "learning_rate": 7.472377719849655e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.03239100053906441, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 4302 }, { "completion_length": 419.9754638671875, "epoch": 0.33550952000467715, "grad_norm": 0.07334538314440507, "kl": 0.00287628173828125, "learning_rate": 7.47024876280823e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 1.0, "step": 4304 }, { "completion_length": 418.29913330078125, "epoch": 0.33566542591546, "grad_norm": 0.05731327989432479, "kl": 0.002994537353515625, "learning_rate": 7.468119213138327e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.01781129650771618, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 4306 }, { "completion_length": 429.9263610839844, "epoch": 0.33582133182624285, "grad_norm": 0.1014568168444635, "kl": 0.0034027099609375, "learning_rate": 7.465989071350841e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.06252610962837934, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 4308 }, { "completion_length": 430.0245819091797, "epoch": 0.3359772377370257, "grad_norm": 0.1129436566522056, "kl": 0.003116607666015625, "learning_rate": 7.463858337956806e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.048099772073328495, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 4310 }, { "completion_length": 426.91296768188477, "epoch": 0.33613314364780855, "grad_norm": 0.11333665813544114, "kl": 0.0032062530517578125, "learning_rate": 7.461727013467398e-07, "loss": 0.0001, "reward": 1.7656251043081284, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 4312 }, { "completion_length": 433.127254486084, "epoch": 0.3362890495585914, "grad_norm": 0.10571182533152068, "kl": 0.00296783447265625, "learning_rate": 7.459595098393936e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.0819871099665761, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 4314 }, { "completion_length": 427.09153747558594, "epoch": 0.33644495546937425, "grad_norm": 0.10015130086250415, "kl": 0.0031986236572265625, "learning_rate": 7.457462593247881e-07, "loss": 0.0001, "reward": 1.868303656578064, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 4316 }, { "completion_length": 412.70091247558594, "epoch": 0.3366008613801571, "grad_norm": 0.11516869306628029, "kl": 0.002948760986328125, "learning_rate": 7.455329498540831e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.050052570179104805, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 4318 }, { "completion_length": 430.6138610839844, "epoch": 0.3367567672909399, "grad_norm": 0.11356737888041622, "kl": 0.00307464599609375, "learning_rate": 7.453195814784532e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8058036230504513, "rewards/format_reward": 1.0, "step": 4320 }, { "completion_length": 423.0870704650879, "epoch": 0.33691267320172275, "grad_norm": 0.0683920525068013, "kl": 0.0032300949096679688, "learning_rate": 7.451061542490868e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.06710430979728699, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 4322 }, { "completion_length": 419.502254486084, "epoch": 0.3370685791125056, "grad_norm": 0.11553001505012908, "kl": 0.00325775146484375, "learning_rate": 7.448926682171866e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.06222301535308361, "rewards/accuracy_reward": 0.7834821864962578, "rewards/format_reward": 1.0, "step": 4324 }, { "completion_length": 429.79019927978516, "epoch": 0.33722448502328844, "grad_norm": 0.10664817933079153, "kl": 0.0033054351806640625, "learning_rate": 7.446791234339689e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.0657577021047473, "rewards/accuracy_reward": 0.752232164144516, "rewards/format_reward": 1.0, "step": 4326 }, { "completion_length": 415.98662185668945, "epoch": 0.3373803909340713, "grad_norm": 0.1245841461799454, "kl": 0.0030002593994140625, "learning_rate": 7.444655199506647e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.05734172184020281, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 4328 }, { "completion_length": 407.1718940734863, "epoch": 0.33753629684485414, "grad_norm": 0.07963903006035973, "kl": 0.0030498504638671875, "learning_rate": 7.442518578185188e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 4330 }, { "completion_length": 422.87278747558594, "epoch": 0.337692202755637, "grad_norm": 0.05321892601448255, "kl": 0.0032787322998046875, "learning_rate": 7.440381370887903e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.024797352962195873, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 4332 }, { "completion_length": 414.15403747558594, "epoch": 0.3378481086664198, "grad_norm": 0.04736984161648482, "kl": 0.0029506683349609375, "learning_rate": 7.438243578127518e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.038030450232326984, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 1.0, "step": 4334 }, { "completion_length": 417.4888610839844, "epoch": 0.33800401457720264, "grad_norm": 0.10509821416874181, "kl": 0.0031185150146484375, "learning_rate": 7.436105200416905e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.0632842630147934, "rewards/accuracy_reward": 0.8861607611179352, "rewards/format_reward": 1.0, "step": 4336 }, { "completion_length": 411.5245704650879, "epoch": 0.3381599204879855, "grad_norm": 0.1130952310247168, "kl": 0.0027713775634765625, "learning_rate": 7.433966238269076e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.04712267033755779, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 4338 }, { "completion_length": 420.76118087768555, "epoch": 0.33831582639876834, "grad_norm": 0.1063816538832513, "kl": 0.0035505294799804688, "learning_rate": 7.431826692197179e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.05764481611549854, "rewards/accuracy_reward": 0.7968750521540642, "rewards/format_reward": 1.0, "step": 4340 }, { "completion_length": 416.8705520629883, "epoch": 0.3384717323095512, "grad_norm": 0.07985346425119065, "kl": 0.0029754638671875, "learning_rate": 7.429686562714506e-07, "loss": 0.0001, "reward": 1.8750000447034836, "reward_std": 0.04373771417886019, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 1.0, "step": 4342 }, { "completion_length": 422.8236770629883, "epoch": 0.33862763822033404, "grad_norm": 0.06311677260858582, "kl": 0.0028743743896484375, "learning_rate": 7.427545850334489e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.04696930479258299, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 4344 }, { "completion_length": 417.26341247558594, "epoch": 0.3387835441311169, "grad_norm": 0.06841992437373144, "kl": 0.0034198760986328125, "learning_rate": 7.425404555570695e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.05523695796728134, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 1.0, "step": 4346 }, { "completion_length": 422.1718940734863, "epoch": 0.33893945004189974, "grad_norm": 0.12446762991419395, "kl": 0.0035762786865234375, "learning_rate": 7.423262678936838e-07, "loss": 0.0001, "reward": 1.7008929252624512, "reward_std": 0.09574168175458908, "rewards/accuracy_reward": 0.7008929029107094, "rewards/format_reward": 1.0, "step": 4348 }, { "completion_length": 424.8906440734863, "epoch": 0.33909535595268253, "grad_norm": 0.07900945735687157, "kl": 0.003269195556640625, "learning_rate": 7.421120220946767e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8883928954601288, "rewards/format_reward": 0.9977678656578064, "step": 4350 }, { "completion_length": 423.3482322692871, "epoch": 0.3392512618634654, "grad_norm": 0.1318376700048209, "kl": 0.0031719207763671875, "learning_rate": 7.41897718211447e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.05441322270780802, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 0.9977678656578064, "step": 4352 }, { "completion_length": 416.2790336608887, "epoch": 0.33940716777424823, "grad_norm": 0.13733178306141275, "kl": 0.0031518936157226562, "learning_rate": 7.416833562954078e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.09341937303543091, "rewards/accuracy_reward": 0.85714291036129, "rewards/format_reward": 1.0, "step": 4354 }, { "completion_length": 420.2366256713867, "epoch": 0.3395630736850311, "grad_norm": 0.08658098383482121, "kl": 0.003261566162109375, "learning_rate": 7.414689363979857e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.05734171997755766, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 4356 }, { "completion_length": 425.0134086608887, "epoch": 0.33971897959581393, "grad_norm": 0.06261008883771152, "kl": 0.0030727386474609375, "learning_rate": 7.412544585706214e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.7700893245637417, "rewards/format_reward": 1.0, "step": 4358 }, { "completion_length": 415.471004486084, "epoch": 0.3398748855065968, "grad_norm": 0.11320320375405131, "kl": 0.0030584335327148438, "learning_rate": 7.410399228647697e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.06591106671839952, "rewards/accuracy_reward": 0.8058035895228386, "rewards/format_reward": 1.0, "step": 4360 }, { "completion_length": 430.252254486084, "epoch": 0.34003079141737963, "grad_norm": 0.09105449792438086, "kl": 0.0035924911499023438, "learning_rate": 7.408253293318987e-07, "loss": 0.0001, "reward": 1.9196429401636124, "reward_std": 0.06395827047526836, "rewards/accuracy_reward": 0.9196429029107094, "rewards/format_reward": 1.0, "step": 4362 }, { "completion_length": 432.3370780944824, "epoch": 0.3401866973281625, "grad_norm": 0.07109526047452495, "kl": 0.003231048583984375, "learning_rate": 7.406106780234912e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.037424261681735516, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 4364 }, { "completion_length": 431.1116256713867, "epoch": 0.3403426032389453, "grad_norm": 0.0706731248638164, "kl": 0.0032501220703125, "learning_rate": 7.403959689910433e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.0675593689084053, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 0.9977678656578064, "step": 4366 }, { "completion_length": 422.5000190734863, "epoch": 0.3404985091497281, "grad_norm": 0.12919455754225712, "kl": 0.00318145751953125, "learning_rate": 7.401812022860649e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.08732346259057522, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 0.9977678656578064, "step": 4368 }, { "completion_length": 425.7455520629883, "epoch": 0.340654415060511, "grad_norm": 0.12331809618331495, "kl": 0.002895355224609375, "learning_rate": 7.3996637796008e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.05621125362813473, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 4370 }, { "completion_length": 425.47769927978516, "epoch": 0.3408103209712938, "grad_norm": 0.07360851459356638, "kl": 0.0031299591064453125, "learning_rate": 7.397514960646264e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.049378564581274986, "rewards/accuracy_reward": 0.8593750223517418, "rewards/format_reward": 1.0, "step": 4372 }, { "completion_length": 411.0178680419922, "epoch": 0.3409662268820767, "grad_norm": 0.09495183776855319, "kl": 0.0025539398193359375, "learning_rate": 7.395365566512556e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.82589291036129, "rewards/format_reward": 1.0, "step": 4374 }, { "completion_length": 415.75894927978516, "epoch": 0.3411221327928595, "grad_norm": 0.1164623145702048, "kl": 0.0029811859130859375, "learning_rate": 7.393215597715329e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.04456144571304321, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 4376 }, { "completion_length": 415.0156440734863, "epoch": 0.3412780387036424, "grad_norm": 0.08930669985076055, "kl": 0.0027713775634765625, "learning_rate": 7.391065054770374e-07, "loss": 0.0001, "reward": 1.8080357611179352, "reward_std": 0.03403930272907019, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 4378 }, { "completion_length": 422.80359649658203, "epoch": 0.3414339446144252, "grad_norm": 0.09207901999815117, "kl": 0.003021240234375, "learning_rate": 7.388913938193621e-07, "loss": 0.0001, "reward": 1.9040179401636124, "reward_std": 0.06883956491947174, "rewards/accuracy_reward": 0.9040178880095482, "rewards/format_reward": 1.0, "step": 4380 }, { "completion_length": 410.667423248291, "epoch": 0.341589850525208, "grad_norm": 0.1623892859073915, "kl": 0.0029850006103515625, "learning_rate": 7.386762248501136e-07, "loss": 0.0001, "reward": 1.87276791036129, "reward_std": 0.06966329738497734, "rewards/accuracy_reward": 0.8727678880095482, "rewards/format_reward": 1.0, "step": 4382 }, { "completion_length": 446.89288330078125, "epoch": 0.34174575643599087, "grad_norm": 0.13764089443875485, "kl": 0.003574371337890625, "learning_rate": 7.384609986209124e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.09168552048504353, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 0.9977678656578064, "step": 4384 }, { "completion_length": 429.22993087768555, "epoch": 0.3419016623467737, "grad_norm": 0.05975682295831155, "kl": 0.0031604766845703125, "learning_rate": 7.382457151833925e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.04255491402000189, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 0.9977678656578064, "step": 4386 }, { "completion_length": 405.68751525878906, "epoch": 0.34205756825755657, "grad_norm": 0.11263334781149537, "kl": 0.0029897689819335938, "learning_rate": 7.380303745892018e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.061702377162873745, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 0.9977678656578064, "step": 4388 }, { "completion_length": 429.58707427978516, "epoch": 0.3422134741683394, "grad_norm": 0.0032889297390092786, "kl": 0.0031223297119140625, "learning_rate": 7.37814976890002e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.03403930272907019, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 4390 }, { "completion_length": 423.5290336608887, "epoch": 0.34236938007912227, "grad_norm": 0.091989207252836, "kl": 0.0033512115478515625, "learning_rate": 7.375995221374684e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.0651515107601881, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 4392 }, { "completion_length": 423.2232322692871, "epoch": 0.3425252859899051, "grad_norm": 0.10467661254296036, "kl": 0.0037555694580078125, "learning_rate": 7.373840103832898e-07, "loss": 0.0002, "reward": 1.6651786416769028, "reward_std": 0.06560797244310379, "rewards/accuracy_reward": 0.6651785932481289, "rewards/format_reward": 1.0, "step": 4394 }, { "completion_length": 413.54466247558594, "epoch": 0.3426811919006879, "grad_norm": 0.11569037722047396, "kl": 0.0035076141357421875, "learning_rate": 7.371684416791689e-07, "loss": 0.0001, "reward": 1.712053656578064, "reward_std": 0.08428969047963619, "rewards/accuracy_reward": 0.7142857536673546, "rewards/format_reward": 0.9977678656578064, "step": 4396 }, { "completion_length": 418.46207427978516, "epoch": 0.34283709781147076, "grad_norm": 0.07806459919961384, "kl": 0.0032215118408203125, "learning_rate": 7.369528160768221e-07, "loss": 0.0001, "reward": 1.7812500447034836, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 4398 }, { "completion_length": 417.39734649658203, "epoch": 0.3429930037222536, "grad_norm": 0.1255455285023544, "kl": 0.0032100677490234375, "learning_rate": 7.367371336279794e-07, "loss": 0.0001, "reward": 1.8571429550647736, "reward_std": 0.08101000916212797, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 4400 }, { "completion_length": 427.22769927978516, "epoch": 0.34314890963303646, "grad_norm": 0.09700829137125996, "kl": 0.003429412841796875, "learning_rate": 7.365213943843841e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.05764481518417597, "rewards/accuracy_reward": 0.738839328289032, "rewards/format_reward": 1.0, "step": 4402 }, { "completion_length": 409.12501525878906, "epoch": 0.3433048155438193, "grad_norm": 0.0799831430405706, "kl": 0.003040313720703125, "learning_rate": 7.363055983977937e-07, "loss": 0.0001, "reward": 1.77678582072258, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 4404 }, { "completion_length": 417.29466247558594, "epoch": 0.34346072145460216, "grad_norm": 0.09805613737466815, "kl": 0.0032138824462890625, "learning_rate": 7.360897457199791e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.036751655861735344, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 4406 }, { "completion_length": 420.49778747558594, "epoch": 0.343616627365385, "grad_norm": 0.08731078041478778, "kl": 0.0029392242431640625, "learning_rate": 7.358738364027246e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.049378564581274986, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 4408 }, { "completion_length": 424.53797149658203, "epoch": 0.34377253327616786, "grad_norm": 0.09169604911726931, "kl": 0.0027256011962890625, "learning_rate": 7.356578704978281e-07, "loss": 0.0001, "reward": 1.7924107760190964, "reward_std": 0.05343248788267374, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 4410 }, { "completion_length": 412.97993087768555, "epoch": 0.34392843918695065, "grad_norm": 0.11676232524036655, "kl": 0.003345489501953125, "learning_rate": 7.354418480571014e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.07838460803031921, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 0.9977678656578064, "step": 4412 }, { "completion_length": 412.89064025878906, "epoch": 0.3440843450977335, "grad_norm": 0.13641294039519888, "kl": 0.0031032562255859375, "learning_rate": 7.352257691323697e-07, "loss": 0.0001, "reward": 1.8236607611179352, "reward_std": 0.06883956491947174, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 4414 }, { "completion_length": 419.0558204650879, "epoch": 0.34424025100851635, "grad_norm": 0.10333105112445165, "kl": 0.0033512115478515625, "learning_rate": 7.350096337754715e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.8973214626312256, "rewards/format_reward": 1.0, "step": 4416 }, { "completion_length": 427.2388572692871, "epoch": 0.3443961569192992, "grad_norm": 0.11027762712646542, "kl": 0.0031280517578125, "learning_rate": 7.347934420382593e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.06971138902008533, "rewards/accuracy_reward": 0.7767857611179352, "rewards/format_reward": 0.9977678656578064, "step": 4418 }, { "completion_length": 429.49555587768555, "epoch": 0.34455206283008205, "grad_norm": 0.049481833071626645, "kl": 0.0030345916748046875, "learning_rate": 7.345771939725987e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8482143357396126, "rewards/format_reward": 1.0, "step": 4420 }, { "completion_length": 413.2366256713867, "epoch": 0.3447079687408649, "grad_norm": 0.07887494977568289, "kl": 0.0031375885009765625, "learning_rate": 7.34360889630369e-07, "loss": 0.0001, "reward": 1.7656250596046448, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 4422 }, { "completion_length": 419.68305587768555, "epoch": 0.34486387465164775, "grad_norm": 0.11801698082673497, "kl": 0.003520965576171875, "learning_rate": 7.341445290634629e-07, "loss": 0.0001, "reward": 1.7455358058214188, "reward_std": 0.06170237809419632, "rewards/accuracy_reward": 0.7455357536673546, "rewards/format_reward": 1.0, "step": 4424 }, { "completion_length": 426.13618087768555, "epoch": 0.3450197805624306, "grad_norm": 0.05651462962569927, "kl": 0.0033397674560546875, "learning_rate": 7.33928112323787e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.027206613682210445, "rewards/accuracy_reward": 0.814732201397419, "rewards/format_reward": 1.0, "step": 4426 }, { "completion_length": 427.908504486084, "epoch": 0.3451756864732134, "grad_norm": 0.10824096370721464, "kl": 0.0032501220703125, "learning_rate": 7.33711639463261e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.08582712709903717, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 0.9977678656578064, "step": 4428 }, { "completion_length": 420.58484268188477, "epoch": 0.34533159238399624, "grad_norm": 0.09607155088950503, "kl": 0.00345611572265625, "learning_rate": 7.334951105338178e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.04080921784043312, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 4430 }, { "completion_length": 424.9576072692871, "epoch": 0.3454874982947791, "grad_norm": 0.10534921195244393, "kl": 0.0030231475830078125, "learning_rate": 7.332785255874043e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.04567714221775532, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 0.9977678656578064, "step": 4432 }, { "completion_length": 427.6518020629883, "epoch": 0.34564340420556194, "grad_norm": 0.11714506950005109, "kl": 0.0032863616943359375, "learning_rate": 7.330618846759807e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.05734172277152538, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 4434 }, { "completion_length": 415.75224685668945, "epoch": 0.3457993101163448, "grad_norm": 0.09814580012845874, "kl": 0.0037145614624023438, "learning_rate": 7.328451878515205e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.058317420072853565, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 4436 }, { "completion_length": 414.9352836608887, "epoch": 0.34595521602712764, "grad_norm": 0.0029390117507223884, "kl": 0.0027914047241210938, "learning_rate": 7.326284351660105e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.7879464477300644, "rewards/format_reward": 1.0, "step": 4438 }, { "completion_length": 419.8750190734863, "epoch": 0.3461111219379105, "grad_norm": 0.09014247780631583, "kl": 0.0029020309448242188, "learning_rate": 7.324116266714512e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.0674074050039053, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 4440 }, { "completion_length": 420.6361770629883, "epoch": 0.3462670278486933, "grad_norm": 0.08262383353610829, "kl": 0.00330352783203125, "learning_rate": 7.321947624198562e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.03742566239088774, "rewards/accuracy_reward": 0.79464291036129, "rewards/format_reward": 1.0, "step": 4442 }, { "completion_length": 434.06474685668945, "epoch": 0.34642293375947614, "grad_norm": 0.11447952632563022, "kl": 0.003833770751953125, "learning_rate": 7.319778424632527e-07, "loss": 0.0002, "reward": 1.7924108058214188, "reward_std": 0.060726677998900414, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 0.9977678656578064, "step": 4444 }, { "completion_length": 431.75224685668945, "epoch": 0.346578839670259, "grad_norm": 0.07995417986442786, "kl": 0.00359344482421875, "learning_rate": 7.317608668536813e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.810267873108387, "rewards/format_reward": 1.0, "step": 4446 }, { "completion_length": 429.95538330078125, "epoch": 0.34673474558104184, "grad_norm": 0.15034360265882707, "kl": 0.0032701492309570312, "learning_rate": 7.315438356431957e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.10626382380723953, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 4448 }, { "completion_length": 413.3437690734863, "epoch": 0.3468906514918247, "grad_norm": 0.09258463147920433, "kl": 0.0030384063720703125, "learning_rate": 7.31326748883863e-07, "loss": 0.0001, "reward": 1.6785715073347092, "reward_std": 0.037727355025708675, "rewards/accuracy_reward": 0.6785714663565159, "rewards/format_reward": 1.0, "step": 4450 }, { "completion_length": 418.3861846923828, "epoch": 0.34704655740260754, "grad_norm": 0.12141021105559924, "kl": 0.0033550262451171875, "learning_rate": 7.31109606627764e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.0675607705488801, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 4452 }, { "completion_length": 417.9509086608887, "epoch": 0.3472024633133904, "grad_norm": 0.08872292041548564, "kl": 0.002902984619140625, "learning_rate": 7.308924089269922e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.03336669970303774, "rewards/accuracy_reward": 0.852678619325161, "rewards/format_reward": 1.0, "step": 4454 }, { "completion_length": 415.10045623779297, "epoch": 0.34735836922417324, "grad_norm": 0.0036229692150656773, "kl": 0.0031003952026367188, "learning_rate": 7.306751558336549e-07, "loss": 0.0001, "reward": 1.8950893431901932, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8950893059372902, "rewards/format_reward": 1.0, "step": 4456 }, { "completion_length": 423.2343940734863, "epoch": 0.34751427513495603, "grad_norm": 0.09053757399707259, "kl": 0.0027637481689453125, "learning_rate": 7.304578473998725e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.04178631864488125, "rewards/accuracy_reward": 0.8750000223517418, "rewards/format_reward": 1.0, "step": 4458 }, { "completion_length": 424.7544860839844, "epoch": 0.3476701810457389, "grad_norm": 0.13479841759182445, "kl": 0.0033550262451171875, "learning_rate": 7.302404836777786e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.08146506734192371, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 4460 }, { "completion_length": 430.56921005249023, "epoch": 0.34782608695652173, "grad_norm": 0.10015030500073153, "kl": 0.00319671630859375, "learning_rate": 7.300230647195201e-07, "loss": 0.0001, "reward": 1.7566965073347092, "reward_std": 0.06786246225237846, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 4462 }, { "completion_length": 416.7232322692871, "epoch": 0.3479819928673046, "grad_norm": 0.0940993626635451, "kl": 0.0026454925537109375, "learning_rate": 7.298055905772573e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.03208790626376867, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 4464 }, { "completion_length": 412.1026954650879, "epoch": 0.34813789877808743, "grad_norm": 0.07757544751248666, "kl": 0.002765655517578125, "learning_rate": 7.295880613031638e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8616071939468384, "rewards/format_reward": 1.0, "step": 4466 }, { "completion_length": 439.93305587768555, "epoch": 0.3482938046888703, "grad_norm": 0.08340407871786329, "kl": 0.0034008026123046875, "learning_rate": 7.293704769494259e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.06981526222079992, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 0.9977678656578064, "step": 4468 }, { "completion_length": 418.99555587768555, "epoch": 0.34844971059965313, "grad_norm": 0.050664461822623706, "kl": 0.0030193328857421875, "learning_rate": 7.291528375682438e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.028485405258834362, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 4470 }, { "completion_length": 430.8169822692871, "epoch": 0.348605616510436, "grad_norm": 0.12184164223672961, "kl": 0.0030612945556640625, "learning_rate": 7.289351432118303e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 4472 }, { "completion_length": 427.6049270629883, "epoch": 0.3487615224212188, "grad_norm": 0.1092584589846277, "kl": 0.0036230087280273438, "learning_rate": 7.287173939324119e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.08875561971217394, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 4474 }, { "completion_length": 422.0000228881836, "epoch": 0.3489174283320016, "grad_norm": 0.13330169187938132, "kl": 0.00347900390625, "learning_rate": 7.284995897822278e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.08875562436878681, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 0.9977678656578064, "step": 4476 }, { "completion_length": 425.95984268188477, "epoch": 0.34907333424278447, "grad_norm": 0.11536142481232799, "kl": 0.0038890838623046875, "learning_rate": 7.28281730813531e-07, "loss": 0.0002, "reward": 1.8058036416769028, "reward_std": 0.06252470891922712, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 4478 }, { "completion_length": 423.9263610839844, "epoch": 0.3492292401535673, "grad_norm": 0.1335058732325305, "kl": 0.0033512115478515625, "learning_rate": 7.280638170785868e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.08808301948010921, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 4480 }, { "completion_length": 408.9196586608887, "epoch": 0.34938514606435017, "grad_norm": 0.04057840969727584, "kl": 0.0027494430541992188, "learning_rate": 7.278458486296745e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.01585849840193987, "rewards/accuracy_reward": 0.863839328289032, "rewards/format_reward": 1.0, "step": 4482 }, { "completion_length": 421.9687690734863, "epoch": 0.349541051975133, "grad_norm": 0.08079313486753337, "kl": 0.0034160614013671875, "learning_rate": 7.276278255190858e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.04260864946991205, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 1.0, "step": 4484 }, { "completion_length": 418.4419860839844, "epoch": 0.34969695788591587, "grad_norm": 0.10282414648639918, "kl": 0.003082275390625, "learning_rate": 7.274097477991259e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 1.0, "step": 4486 }, { "completion_length": 427.5379638671875, "epoch": 0.34985286379669867, "grad_norm": 0.08215955983758241, "kl": 0.0037670135498046875, "learning_rate": 7.271916155221134e-07, "loss": 0.0002, "reward": 1.866071492433548, "reward_std": 0.062827805057168, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 0.9977678656578064, "step": 4488 }, { "completion_length": 421.5000190734863, "epoch": 0.3500087697074815, "grad_norm": 0.10511944695868987, "kl": 0.002834320068359375, "learning_rate": 7.269734287403793e-07, "loss": 0.0001, "reward": 1.868303656578064, "reward_std": 0.0761287147179246, "rewards/accuracy_reward": 0.8683036118745804, "rewards/format_reward": 1.0, "step": 4490 }, { "completion_length": 427.0357360839844, "epoch": 0.35016467561826436, "grad_norm": 0.06344173041572551, "kl": 0.0029048919677734375, "learning_rate": 7.267551875062678e-07, "loss": 0.0001, "reward": 1.899553656578064, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8995536044239998, "rewards/format_reward": 1.0, "step": 4492 }, { "completion_length": 432.40849685668945, "epoch": 0.3503205815290472, "grad_norm": 0.10830554722759016, "kl": 0.0031042098999023438, "learning_rate": 7.265368918721368e-07, "loss": 0.0001, "reward": 1.8950893431901932, "reward_std": 0.06333870999515057, "rewards/accuracy_reward": 0.8973214477300644, "rewards/format_reward": 0.9977678656578064, "step": 4494 }, { "completion_length": 411.2053680419922, "epoch": 0.35047648743983006, "grad_norm": 0.1024022015677041, "kl": 0.002780914306640625, "learning_rate": 7.263185418903568e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.08829916082322598, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 4496 }, { "completion_length": 421.14733123779297, "epoch": 0.3506323933506129, "grad_norm": 0.08566547543504958, "kl": 0.0034618377685546875, "learning_rate": 7.26100137613311e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.05981375649571419, "rewards/accuracy_reward": 0.7745535895228386, "rewards/format_reward": 1.0, "step": 4498 }, { "completion_length": 433.3281440734863, "epoch": 0.35078829926139576, "grad_norm": 0.1316751864929623, "kl": 0.003437042236328125, "learning_rate": 7.258816790933962e-07, "loss": 0.0001, "reward": 1.8303572535514832, "reward_std": 0.0989746730774641, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4500 }, { "completion_length": 425.90180587768555, "epoch": 0.3509442051721786, "grad_norm": 0.12885840317138272, "kl": 0.0027103424072265625, "learning_rate": 7.25663166383022e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.07515161670744419, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 4502 }, { "completion_length": 421.0000190734863, "epoch": 0.3511001110829614, "grad_norm": 0.09243256809857378, "kl": 0.0032596588134765625, "learning_rate": 7.254445995346111e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.06463227421045303, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 4504 }, { "completion_length": 423.0335006713867, "epoch": 0.35125601699374426, "grad_norm": 0.0885345407626391, "kl": 0.0036773681640625, "learning_rate": 7.252259786005987e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 4506 }, { "completion_length": 419.5290336608887, "epoch": 0.3514119229045271, "grad_norm": 0.08358941872622003, "kl": 0.0035190582275390625, "learning_rate": 7.250073036334338e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.06395827047526836, "rewards/accuracy_reward": 0.85714291036129, "rewards/format_reward": 1.0, "step": 4508 }, { "completion_length": 415.4174270629883, "epoch": 0.35156782881530996, "grad_norm": 0.003259071779630737, "kl": 0.002521514892578125, "learning_rate": 7.247885746855778e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.018940359354019165, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 4510 }, { "completion_length": 420.69868087768555, "epoch": 0.3517237347260928, "grad_norm": 0.10748590129121446, "kl": 0.002918243408203125, "learning_rate": 7.24569791809505e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.05005116853863001, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 4512 }, { "completion_length": 433.4062728881836, "epoch": 0.35187964063687566, "grad_norm": 0.13694023914054163, "kl": 0.00351715087890625, "learning_rate": 7.243509550577029e-07, "loss": 0.0001, "reward": 1.7142857760190964, "reward_std": 0.07936030626296997, "rewards/accuracy_reward": 0.714285746216774, "rewards/format_reward": 1.0, "step": 4514 }, { "completion_length": 414.4397506713867, "epoch": 0.3520355465476585, "grad_norm": 0.09943502254188469, "kl": 0.0033855438232421875, "learning_rate": 7.241320644826719e-07, "loss": 0.0001, "reward": 1.7455357760190964, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.7455357387661934, "rewards/format_reward": 1.0, "step": 4516 }, { "completion_length": 420.18529510498047, "epoch": 0.35219145245844136, "grad_norm": 0.10168215875235642, "kl": 0.003131866455078125, "learning_rate": 7.239131201369253e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.03742566145956516, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 4518 }, { "completion_length": 417.27234268188477, "epoch": 0.35234735836922415, "grad_norm": 0.06996327944794684, "kl": 0.0036678314208984375, "learning_rate": 7.236941220729891e-07, "loss": 0.0001, "reward": 1.7098215073347092, "reward_std": 0.05786095838993788, "rewards/accuracy_reward": 0.7098214626312256, "rewards/format_reward": 1.0, "step": 4520 }, { "completion_length": 434.8192138671875, "epoch": 0.352503264280007, "grad_norm": 0.09223773706611516, "kl": 0.0035877227783203125, "learning_rate": 7.234750703434024e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.03501640260219574, "rewards/accuracy_reward": 0.8839286118745804, "rewards/format_reward": 1.0, "step": 4522 }, { "completion_length": 431.5669822692871, "epoch": 0.35265917019078985, "grad_norm": 0.11735112113817896, "kl": 0.0032138824462890625, "learning_rate": 7.232559650007172e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.08003431279212236, "rewards/accuracy_reward": 0.7343750260770321, "rewards/format_reward": 1.0, "step": 4524 }, { "completion_length": 419.8415336608887, "epoch": 0.3528150761015727, "grad_norm": 0.04480532079559176, "kl": 0.003063201904296875, "learning_rate": 7.230368060974979e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 4526 }, { "completion_length": 428.9107322692871, "epoch": 0.35297098201235555, "grad_norm": 0.10609535555171028, "kl": 0.0034122467041015625, "learning_rate": 7.228175936863227e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.04614697117358446, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 4528 }, { "completion_length": 416.9196662902832, "epoch": 0.3531268879231384, "grad_norm": 0.06395869069716564, "kl": 0.003692626953125, "learning_rate": 7.225983278197816e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.0586205143481493, "rewards/accuracy_reward": 0.8705357685685158, "rewards/format_reward": 1.0, "step": 4530 }, { "completion_length": 425.1451072692871, "epoch": 0.35328279383392125, "grad_norm": 0.10114432226009001, "kl": 0.0034999847412109375, "learning_rate": 7.223790085504782e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.0529810655862093, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 4532 }, { "completion_length": 447.1406478881836, "epoch": 0.35343869974470404, "grad_norm": 0.10203447912078809, "kl": 0.003643035888671875, "learning_rate": 7.221596359310283e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.048619008623063564, "rewards/accuracy_reward": 0.7700893357396126, "rewards/format_reward": 1.0, "step": 4534 }, { "completion_length": 422.04466247558594, "epoch": 0.3535946056554869, "grad_norm": 0.08539837586157151, "kl": 0.0035991668701171875, "learning_rate": 7.21940210014061e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.07597534544765949, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 4536 }, { "completion_length": 436.43305587768555, "epoch": 0.35375051156626974, "grad_norm": 0.048013894931094245, "kl": 0.0034465789794921875, "learning_rate": 7.21720730852218e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 4538 }, { "completion_length": 418.86162185668945, "epoch": 0.3539064174770526, "grad_norm": 0.08501480237437471, "kl": 0.0030269622802734375, "learning_rate": 7.215011984981535e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.07582702301442623, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 4540 }, { "completion_length": 430.74109268188477, "epoch": 0.35406232338783544, "grad_norm": 0.08129448840898136, "kl": 0.00322723388671875, "learning_rate": 7.212816130045349e-07, "loss": 0.0001, "reward": 1.8102678954601288, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 4542 }, { "completion_length": 412.5401916503906, "epoch": 0.3542182292986183, "grad_norm": 0.10162002153589884, "kl": 0.0027704238891601562, "learning_rate": 7.210619744240423e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8504464775323868, "rewards/format_reward": 1.0, "step": 4544 }, { "completion_length": 444.9821586608887, "epoch": 0.35437413520940114, "grad_norm": 0.09926756938406754, "kl": 0.0034732818603515625, "learning_rate": 7.208422828093683e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 4546 }, { "completion_length": 425.6540298461914, "epoch": 0.354530041120184, "grad_norm": 0.11973799638239259, "kl": 0.003299713134765625, "learning_rate": 7.206225382132179e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.08860365580767393, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 0.9977678656578064, "step": 4548 }, { "completion_length": 418.8259086608887, "epoch": 0.3546859470309668, "grad_norm": 0.0848236945471731, "kl": 0.0028581619262695312, "learning_rate": 7.2040274068831e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.043585749343037605, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 4550 }, { "completion_length": 417.94197845458984, "epoch": 0.35484185294174964, "grad_norm": 0.09144960554520186, "kl": 0.0031795501708984375, "learning_rate": 7.20182890287375e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.08213907387107611, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 4552 }, { "completion_length": 411.5290336608887, "epoch": 0.3549977588525325, "grad_norm": 0.07071971718868657, "kl": 0.0030269622802734375, "learning_rate": 7.199629870631562e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 4554 }, { "completion_length": 433.32368087768555, "epoch": 0.35515366476331534, "grad_norm": 0.0035103563121288, "kl": 0.00299835205078125, "learning_rate": 7.197430310684103e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 4556 }, { "completion_length": 432.3259086608887, "epoch": 0.3553095706740982, "grad_norm": 0.0831265907460605, "kl": 0.0026769638061523438, "learning_rate": 7.195230223559058e-07, "loss": 0.0001, "reward": 1.9017857611179352, "reward_std": 0.03501640260219574, "rewards/accuracy_reward": 0.9017857387661934, "rewards/format_reward": 1.0, "step": 4558 }, { "completion_length": 423.47322845458984, "epoch": 0.35546547658488103, "grad_norm": 0.08618407960858142, "kl": 0.0029325485229492188, "learning_rate": 7.193029609784243e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.051331362687051296, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 4560 }, { "completion_length": 432.5826072692871, "epoch": 0.3556213824956639, "grad_norm": 0.09844245890159276, "kl": 0.0035533905029296875, "learning_rate": 7.190828469887599e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.03111080639064312, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 4562 }, { "completion_length": 407.1049270629883, "epoch": 0.35577728840644673, "grad_norm": 0.05037840245001824, "kl": 0.0028629302978515625, "learning_rate": 7.188626804397195e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.04260864946991205, "rewards/accuracy_reward": 0.8816964477300644, "rewards/format_reward": 1.0, "step": 4564 }, { "completion_length": 418.8772506713867, "epoch": 0.35593319431722953, "grad_norm": 0.07074402617958687, "kl": 0.0032024383544921875, "learning_rate": 7.186424613841222e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.03366979584097862, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 4566 }, { "completion_length": 405.02903747558594, "epoch": 0.3560891002280124, "grad_norm": 0.06751404280081735, "kl": 0.004871368408203125, "learning_rate": 7.184221898748002e-07, "loss": 0.0002, "reward": 1.8616072237491608, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 4568 }, { "completion_length": 437.5602798461914, "epoch": 0.35624500613879523, "grad_norm": 0.10407811290843594, "kl": 0.0034313201904296875, "learning_rate": 7.182018659645979e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.04666761215776205, "rewards/accuracy_reward": 0.7433036118745804, "rewards/format_reward": 1.0, "step": 4570 }, { "completion_length": 420.2053756713867, "epoch": 0.3564009120495781, "grad_norm": 0.06897613738478754, "kl": 0.0031223297119140625, "learning_rate": 7.179814897063725e-07, "loss": 0.0001, "reward": 1.830357238650322, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4572 }, { "completion_length": 419.66743087768555, "epoch": 0.35655681796036093, "grad_norm": 0.10354458373665394, "kl": 0.002986907958984375, "learning_rate": 7.177610611529938e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.041632951237261295, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 4574 }, { "completion_length": 437.8236770629883, "epoch": 0.3567127238711438, "grad_norm": 0.12811238519749663, "kl": 0.0034580230712890625, "learning_rate": 7.175405803573436e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.08897316735237837, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 4576 }, { "completion_length": 429.42412185668945, "epoch": 0.3568686297819266, "grad_norm": 0.12147040158301775, "kl": 0.00333404541015625, "learning_rate": 7.173200473723169e-07, "loss": 0.0001, "reward": 1.674107238650322, "reward_std": 0.06981526128947735, "rewards/accuracy_reward": 0.674107164144516, "rewards/format_reward": 1.0, "step": 4578 }, { "completion_length": 417.1027030944824, "epoch": 0.3570245356927095, "grad_norm": 0.1108038658129401, "kl": 0.00310516357421875, "learning_rate": 7.170994622508211e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.051124260760843754, "rewards/accuracy_reward": 0.841517873108387, "rewards/format_reward": 0.9977678656578064, "step": 4580 }, { "completion_length": 432.93082427978516, "epoch": 0.35718044160349227, "grad_norm": 0.08516851201667264, "kl": 0.003879547119140625, "learning_rate": 7.168788250457758e-07, "loss": 0.0002, "reward": 1.7723215073347092, "reward_std": 0.044714814983308315, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 4582 }, { "completion_length": 428.31474685668945, "epoch": 0.3573363475142751, "grad_norm": 0.11662927298283267, "kl": 0.0031032562255859375, "learning_rate": 7.166581358101135e-07, "loss": 0.0001, "reward": 1.7232143729925156, "reward_std": 0.07417591568082571, "rewards/accuracy_reward": 0.7254464626312256, "rewards/format_reward": 0.9977678656578064, "step": 4584 }, { "completion_length": 419.09153747558594, "epoch": 0.35749225342505797, "grad_norm": 0.12077607799830138, "kl": 0.0031061172485351562, "learning_rate": 7.164373945967787e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 4586 }, { "completion_length": 406.40849685668945, "epoch": 0.3576481593358408, "grad_norm": 0.09073820494168809, "kl": 0.00247955322265625, "learning_rate": 7.162166014587286e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.04065585043281317, "rewards/accuracy_reward": 0.8660714775323868, "rewards/format_reward": 1.0, "step": 4588 }, { "completion_length": 425.9710006713867, "epoch": 0.35780406524662367, "grad_norm": 0.09529606764022995, "kl": 0.003284454345703125, "learning_rate": 7.159957564489331e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 4590 }, { "completion_length": 429.7991256713867, "epoch": 0.3579599711574065, "grad_norm": 0.1263625098287727, "kl": 0.0033855438232421875, "learning_rate": 7.157748596203743e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.07612871378660202, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 4592 }, { "completion_length": 436.70984649658203, "epoch": 0.35811587706818937, "grad_norm": 0.04678650107460608, "kl": 0.00286865234375, "learning_rate": 7.155539110260468e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.040354158729314804, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 4594 }, { "completion_length": 430.174129486084, "epoch": 0.35827178297897216, "grad_norm": 0.08540139800341796, "kl": 0.0034475326538085938, "learning_rate": 7.153329107189573e-07, "loss": 0.0001, "reward": 1.85714291036129, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 4596 }, { "completion_length": 418.8169860839844, "epoch": 0.358427688889755, "grad_norm": 0.11544069927298166, "kl": 0.0035200119018554688, "learning_rate": 7.151118587521254e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.06170237623155117, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4598 }, { "completion_length": 422.4218940734863, "epoch": 0.35858359480053786, "grad_norm": 0.12064890337833792, "kl": 0.0034322738647460938, "learning_rate": 7.14890755178583e-07, "loss": 0.0001, "reward": 1.79241082072258, "reward_std": 0.07628208119422197, "rewards/accuracy_reward": 0.7924107387661934, "rewards/format_reward": 1.0, "step": 4600 }, { "completion_length": 415.6406440734863, "epoch": 0.3587395007113207, "grad_norm": 0.06635300227957146, "kl": 0.0033206939697265625, "learning_rate": 7.14669600051374e-07, "loss": 0.0001, "reward": 1.7745536267757416, "reward_std": 0.048923504538834095, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 4602 }, { "completion_length": 420.8638572692871, "epoch": 0.35889540662210356, "grad_norm": 0.08349979657286544, "kl": 0.003398895263671875, "learning_rate": 7.144483934235551e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 4604 }, { "completion_length": 418.721004486084, "epoch": 0.3590513125328864, "grad_norm": 0.07464603053517989, "kl": 0.003032684326171875, "learning_rate": 7.142271353481951e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.048249500803649426, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 4606 }, { "completion_length": 430.51564025878906, "epoch": 0.35920721844366926, "grad_norm": 0.11596411230501524, "kl": 0.003204345703125, "learning_rate": 7.140058258783753e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.0626794770359993, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 4608 }, { "completion_length": 422.74332427978516, "epoch": 0.3593631243544521, "grad_norm": 0.04254959063570949, "kl": 0.0029449462890625, "learning_rate": 7.13784465067189e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 4610 }, { "completion_length": 419.3906440734863, "epoch": 0.3595190302652349, "grad_norm": 0.06542191745433128, "kl": 0.0032711029052734375, "learning_rate": 7.135630529677425e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.03772735595703125, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 4612 }, { "completion_length": 417.5870704650879, "epoch": 0.35967493617601776, "grad_norm": 0.1376568197667046, "kl": 0.0033206939697265625, "learning_rate": 7.13341589633154e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.07973121851682663, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 4614 }, { "completion_length": 426.5111770629883, "epoch": 0.3598308420868006, "grad_norm": 0.04719171749303199, "kl": 0.00325775146484375, "learning_rate": 7.131200751165534e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 4616 }, { "completion_length": 419.1875305175781, "epoch": 0.35998674799758346, "grad_norm": 0.0034056265242446353, "kl": 0.002811431884765625, "learning_rate": 7.128985094710838e-07, "loss": 0.0001, "reward": 1.8281250447034836, "reward_std": 0.02720661275088787, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 4618 }, { "completion_length": 427.74109268188477, "epoch": 0.3601426539083663, "grad_norm": 0.095868770721587, "kl": 0.004096031188964844, "learning_rate": 7.126768927499003e-07, "loss": 0.0002, "reward": 1.8392857909202576, "reward_std": 0.06124731805175543, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 4620 }, { "completion_length": 429.5178756713867, "epoch": 0.36029855981914916, "grad_norm": 0.10285692165380661, "kl": 0.0032444000244140625, "learning_rate": 7.124552250061701e-07, "loss": 0.0001, "reward": 1.7075893878936768, "reward_std": 0.06801582872867584, "rewards/accuracy_reward": 0.7075893133878708, "rewards/format_reward": 1.0, "step": 4622 }, { "completion_length": 419.6250190734863, "epoch": 0.360454465729932, "grad_norm": 0.12392393588387697, "kl": 0.003082275390625, "learning_rate": 7.122335062930726e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.0664303032681346, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 4624 }, { "completion_length": 409.8727912902832, "epoch": 0.36061037164071486, "grad_norm": 0.06898584162745275, "kl": 0.00312042236328125, "learning_rate": 7.120117366637996e-07, "loss": 0.0001, "reward": 1.8125000447034836, "reward_std": 0.02915941085666418, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 1.0, "step": 4626 }, { "completion_length": 421.58930587768555, "epoch": 0.36076627755149765, "grad_norm": 0.05919966416875032, "kl": 0.0032558441162109375, "learning_rate": 7.117899161715552e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.8571428805589676, "rewards/format_reward": 1.0, "step": 4628 }, { "completion_length": 414.3906440734863, "epoch": 0.3609221834622805, "grad_norm": 0.08453946040155859, "kl": 0.0037288665771484375, "learning_rate": 7.115680448695552e-07, "loss": 0.0001, "reward": 1.6651786863803864, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.6651786006987095, "rewards/format_reward": 1.0, "step": 4630 }, { "completion_length": 427.439754486084, "epoch": 0.36107808937306335, "grad_norm": 0.05336563160936852, "kl": 0.0031604766845703125, "learning_rate": 7.113461228110283e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.7477679029107094, "rewards/format_reward": 1.0, "step": 4632 }, { "completion_length": 427.2611770629883, "epoch": 0.3612339952838462, "grad_norm": 0.11567931021037683, "kl": 0.0038013458251953125, "learning_rate": 7.111241500492149e-07, "loss": 0.0002, "reward": 1.8169643729925156, "reward_std": 0.06559320073574781, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 0.9977678656578064, "step": 4634 }, { "completion_length": 434.78796005249023, "epoch": 0.36138990119462905, "grad_norm": 0.13012756605805323, "kl": 0.0031681060791015625, "learning_rate": 7.109021266373679e-07, "loss": 0.0001, "reward": 1.8526786714792252, "reward_std": 0.05877387896180153, "rewards/accuracy_reward": 0.852678619325161, "rewards/format_reward": 1.0, "step": 4636 }, { "completion_length": 419.2053756713867, "epoch": 0.3615458071054119, "grad_norm": 0.08430185359191307, "kl": 0.0030698776245117188, "learning_rate": 7.106800526287515e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.03171699680387974, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 4638 }, { "completion_length": 429.2143020629883, "epoch": 0.36170171301619475, "grad_norm": 0.08602003350908456, "kl": 0.003528594970703125, "learning_rate": 7.104579280766434e-07, "loss": 0.0001, "reward": 1.7566965222358704, "reward_std": 0.05102826841175556, "rewards/accuracy_reward": 0.756696455180645, "rewards/format_reward": 1.0, "step": 4640 }, { "completion_length": 404.96653747558594, "epoch": 0.36185761892697754, "grad_norm": 0.07401739934588905, "kl": 0.0026674270629882812, "learning_rate": 7.102357530343322e-07, "loss": 0.0001, "reward": 1.88839291036129, "reward_std": 0.04049275256693363, "rewards/accuracy_reward": 0.8906250298023224, "rewards/format_reward": 0.9977678656578064, "step": 4642 }, { "completion_length": 415.53126525878906, "epoch": 0.3620135248377604, "grad_norm": 0.1270126996274286, "kl": 0.003704071044921875, "learning_rate": 7.100135275551195e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.77901791036129, "rewards/format_reward": 1.0, "step": 4644 }, { "completion_length": 427.3415336608887, "epoch": 0.36216943074854324, "grad_norm": 0.05100259768845319, "kl": 0.002780914306640625, "learning_rate": 7.097912516923182e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.03434379957616329, "rewards/accuracy_reward": 0.8437500447034836, "rewards/format_reward": 1.0, "step": 4646 }, { "completion_length": 416.0357322692871, "epoch": 0.3623253366593261, "grad_norm": 0.07528652195705378, "kl": 0.002918243408203125, "learning_rate": 7.09568925499254e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.03742426075041294, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 4648 }, { "completion_length": 420.6696586608887, "epoch": 0.36248124257010894, "grad_norm": 0.13321111433339602, "kl": 0.003772735595703125, "learning_rate": 7.093465490292643e-07, "loss": 0.0002, "reward": 1.7633929401636124, "reward_std": 0.10934568755328655, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 4650 }, { "completion_length": 422.29243087768555, "epoch": 0.3626371484808918, "grad_norm": 0.08484535595081816, "kl": 0.0030918121337890625, "learning_rate": 7.091241223356986e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.05343612376600504, "rewards/accuracy_reward": 0.8928571864962578, "rewards/format_reward": 1.0, "step": 4652 }, { "completion_length": 421.07591247558594, "epoch": 0.36279305439167464, "grad_norm": 0.066728205568317, "kl": 0.0028228759765625, "learning_rate": 7.089016454719186e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.8839285895228386, "rewards/format_reward": 1.0, "step": 4654 }, { "completion_length": 428.6585006713867, "epoch": 0.3629489603024575, "grad_norm": 0.12014341187115857, "kl": 0.00351715087890625, "learning_rate": 7.086791184912979e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.0667333984747529, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 4656 }, { "completion_length": 412.0826072692871, "epoch": 0.3631048662132403, "grad_norm": 0.09990659853152871, "kl": 0.00383758544921875, "learning_rate": 7.084565414472219e-07, "loss": 0.0002, "reward": 1.7366072237491608, "reward_std": 0.07109405472874641, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 1.0, "step": 4658 }, { "completion_length": 416.080379486084, "epoch": 0.36326077212402313, "grad_norm": 0.08935393765079737, "kl": 0.002960205078125, "learning_rate": 7.082339143930886e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 4660 }, { "completion_length": 416.32814025878906, "epoch": 0.363416678034806, "grad_norm": 0.10265185694132063, "kl": 0.0030155181884765625, "learning_rate": 7.080112373823074e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.06057331059128046, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 4662 }, { "completion_length": 411.8504638671875, "epoch": 0.36357258394558883, "grad_norm": 0.0783462839373351, "kl": 0.0034742355346679688, "learning_rate": 7.077885104682999e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.06011825241148472, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 4664 }, { "completion_length": 420.3058166503906, "epoch": 0.3637284898563717, "grad_norm": 0.07090831864116817, "kl": 0.0034275054931640625, "learning_rate": 7.075657337045001e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 4666 }, { "completion_length": 415.43528747558594, "epoch": 0.36388439576715453, "grad_norm": 0.13970178287544174, "kl": 0.0030841827392578125, "learning_rate": 7.073429071443533e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.0744790118187666, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 4668 }, { "completion_length": 412.0044822692871, "epoch": 0.3640403016779374, "grad_norm": 0.06941190364553376, "kl": 0.00302886962890625, "learning_rate": 7.07120030841317e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.05456295423209667, "rewards/accuracy_reward": 0.8549107387661934, "rewards/format_reward": 1.0, "step": 4670 }, { "completion_length": 415.39064025878906, "epoch": 0.36419620758872023, "grad_norm": 0.07445149517840423, "kl": 0.00307464599609375, "learning_rate": 7.068971048488605e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.06124731805175543, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 4672 }, { "completion_length": 428.1897506713867, "epoch": 0.364352113499503, "grad_norm": 0.06634606115086102, "kl": 0.0033311843872070312, "learning_rate": 7.066741292204654e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.046146972104907036, "rewards/accuracy_reward": 0.7901785895228386, "rewards/format_reward": 1.0, "step": 4674 }, { "completion_length": 427.45984268188477, "epoch": 0.3645080194102859, "grad_norm": 0.11341299780405113, "kl": 0.0034427642822265625, "learning_rate": 7.064511040096251e-07, "loss": 0.0001, "reward": 1.765625074505806, "reward_std": 0.03141390159726143, "rewards/accuracy_reward": 0.7656250335276127, "rewards/format_reward": 1.0, "step": 4676 }, { "completion_length": 422.31252670288086, "epoch": 0.3646639253210687, "grad_norm": 0.14642842932551892, "kl": 0.0054340362548828125, "learning_rate": 7.062280292698446e-07, "loss": 0.0002, "reward": 1.734375074505806, "reward_std": 0.06252470798790455, "rewards/accuracy_reward": 0.7343750447034836, "rewards/format_reward": 1.0, "step": 4678 }, { "completion_length": 416.27010345458984, "epoch": 0.3648198312318516, "grad_norm": 0.09523997319409537, "kl": 0.0028676986694335938, "learning_rate": 7.060049050546408e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 4680 }, { "completion_length": 411.3348388671875, "epoch": 0.3649757371426344, "grad_norm": 0.10343094946482653, "kl": 0.0031452178955078125, "learning_rate": 7.05781731417543e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.03352006431668997, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 4682 }, { "completion_length": 401.95983505249023, "epoch": 0.3651316430534173, "grad_norm": 0.08043506479047867, "kl": 0.00267791748046875, "learning_rate": 7.055585084120916e-07, "loss": 0.0001, "reward": 1.801339328289032, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.8013393059372902, "rewards/format_reward": 1.0, "step": 4684 }, { "completion_length": 437.8370704650879, "epoch": 0.3652875489642001, "grad_norm": 0.037611353398741075, "kl": 0.0031175613403320312, "learning_rate": 7.053352360918394e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 4686 }, { "completion_length": 408.79243087768555, "epoch": 0.3654434548749829, "grad_norm": 0.09973367998059104, "kl": 0.002849578857421875, "learning_rate": 7.051119145103511e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 4688 }, { "completion_length": 428.04689025878906, "epoch": 0.36559936078576577, "grad_norm": 0.10793823448249408, "kl": 0.0034465789794921875, "learning_rate": 7.048885437212025e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 4690 }, { "completion_length": 429.42190170288086, "epoch": 0.3657552666965486, "grad_norm": 0.12378913676349038, "kl": 0.0033473968505859375, "learning_rate": 7.046651237779818e-07, "loss": 0.0001, "reward": 1.7834822684526443, "reward_std": 0.07545611169189215, "rewards/accuracy_reward": 0.7857143059372902, "rewards/format_reward": 0.9977678656578064, "step": 4692 }, { "completion_length": 411.2567138671875, "epoch": 0.36591117260733147, "grad_norm": 0.0704670005600711, "kl": 0.0031690597534179688, "learning_rate": 7.044416547342891e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.05892360769212246, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 4694 }, { "completion_length": 411.5535888671875, "epoch": 0.3660670785181143, "grad_norm": 0.08909752347562237, "kl": 0.0029315948486328125, "learning_rate": 7.04218136643736e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.03547286428511143, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 4696 }, { "completion_length": 421.41519927978516, "epoch": 0.36622298442889717, "grad_norm": 0.08438910957432476, "kl": 0.00296783447265625, "learning_rate": 7.039945695599461e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 4698 }, { "completion_length": 425.63171768188477, "epoch": 0.36637889033968, "grad_norm": 0.0742578744282545, "kl": 0.002964019775390625, "learning_rate": 7.037709535365539e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8303571715950966, "rewards/format_reward": 1.0, "step": 4700 }, { "completion_length": 425.59153747558594, "epoch": 0.36653479625046287, "grad_norm": 0.08976785782316994, "kl": 0.003093719482421875, "learning_rate": 7.035472886272071e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.046969303861260414, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 4702 }, { "completion_length": 419.7254638671875, "epoch": 0.36669070216124566, "grad_norm": 0.12171786548387306, "kl": 0.00325775146484375, "learning_rate": 7.033235748855637e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.10724092368036509, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 4704 }, { "completion_length": 418.8013610839844, "epoch": 0.3668466080720285, "grad_norm": 0.084051770422147, "kl": 0.0028333663940429688, "learning_rate": 7.030998123652944e-07, "loss": 0.0001, "reward": 1.7611608356237411, "reward_std": 0.03968015406280756, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 4706 }, { "completion_length": 417.54466247558594, "epoch": 0.36700251398281136, "grad_norm": 0.08245566214446522, "kl": 0.003047943115234375, "learning_rate": 7.028760011200814e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 4708 }, { "completion_length": 436.27903747558594, "epoch": 0.3671584198935942, "grad_norm": 0.09594926437597331, "kl": 0.003444671630859375, "learning_rate": 7.026521412036181e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.06478060036897659, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 0.9955357313156128, "step": 4710 }, { "completion_length": 416.4419822692871, "epoch": 0.36731432580437706, "grad_norm": 0.05107492268278717, "kl": 0.0028123855590820312, "learning_rate": 7.024282326696101e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.8638393059372902, "rewards/format_reward": 1.0, "step": 4712 }, { "completion_length": 412.3192138671875, "epoch": 0.3674702317151599, "grad_norm": 0.10705597786633354, "kl": 0.00292205810546875, "learning_rate": 7.022042755717745e-07, "loss": 0.0001, "reward": 1.7611607760190964, "reward_std": 0.0440408093854785, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 4714 }, { "completion_length": 418.502254486084, "epoch": 0.36762613762594276, "grad_norm": 0.04401142528036456, "kl": 0.0026111602783203125, "learning_rate": 7.0198026996384e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.04666761215776205, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 4716 }, { "completion_length": 425.1674270629883, "epoch": 0.3677820435367256, "grad_norm": 0.0578375285896037, "kl": 0.0032482147216796875, "learning_rate": 7.017562158995472e-07, "loss": 0.0001, "reward": 1.7566965073347092, "reward_std": 0.04501790925860405, "rewards/accuracy_reward": 0.7566964477300644, "rewards/format_reward": 1.0, "step": 4718 }, { "completion_length": 432.01118087768555, "epoch": 0.3679379494475084, "grad_norm": 0.094904407117723, "kl": 0.0036754608154296875, "learning_rate": 7.015321134326475e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.08357123285531998, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 0.9977678656578064, "step": 4720 }, { "completion_length": 427.2232322692871, "epoch": 0.36809385535829126, "grad_norm": 0.10057336827046842, "kl": 0.003814697265625, "learning_rate": 7.01307962616905e-07, "loss": 0.0002, "reward": 1.8147322088479996, "reward_std": 0.07484992314130068, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 4722 }, { "completion_length": 424.136173248291, "epoch": 0.3682497612690741, "grad_norm": 0.10917143229043177, "kl": 0.0032033920288085938, "learning_rate": 7.01083763506095e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.06267807260155678, "rewards/accuracy_reward": 0.8772321753203869, "rewards/format_reward": 1.0, "step": 4724 }, { "completion_length": 421.096004486084, "epoch": 0.36840566717985695, "grad_norm": 0.06493703440707892, "kl": 0.0030231475830078125, "learning_rate": 7.008595161540039e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.8348214849829674, "rewards/format_reward": 0.9955357313156128, "step": 4726 }, { "completion_length": 413.7276954650879, "epoch": 0.3685615730906398, "grad_norm": 0.08495545253375236, "kl": 0.0030689239501953125, "learning_rate": 7.006352206144306e-07, "loss": 0.0001, "reward": 1.863839328289032, "reward_std": 0.061092549934983253, "rewards/accuracy_reward": 0.8638393059372902, "rewards/format_reward": 1.0, "step": 4728 }, { "completion_length": 414.6116256713867, "epoch": 0.36871747900142265, "grad_norm": 0.08520252043809569, "kl": 0.00321197509765625, "learning_rate": 7.004108769411845e-07, "loss": 0.0001, "reward": 1.7924107760190964, "reward_std": 0.06057331059128046, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 4730 }, { "completion_length": 413.2343940734863, "epoch": 0.3688733849122055, "grad_norm": 0.11940512267137135, "kl": 0.0029535293579101562, "learning_rate": 7.001864851880875e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.05328275915235281, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 4732 }, { "completion_length": 417.6183166503906, "epoch": 0.36902929082298835, "grad_norm": 0.09310833275485549, "kl": 0.003391265869140625, "learning_rate": 6.999620454089721e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.06898929364979267, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 4734 }, { "completion_length": 415.7098388671875, "epoch": 0.36918519673377115, "grad_norm": 0.00335880521347992, "kl": 0.003025054931640625, "learning_rate": 6.997375576576834e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 4736 }, { "completion_length": 433.3393096923828, "epoch": 0.369341102644554, "grad_norm": 0.06481092565159806, "kl": 0.00316619873046875, "learning_rate": 6.995130219880771e-07, "loss": 0.0001, "reward": 1.8794643431901932, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 1.0, "step": 4738 }, { "completion_length": 417.1852836608887, "epoch": 0.36949700855533685, "grad_norm": 0.1090453572684758, "kl": 0.0032100677490234375, "learning_rate": 6.99288438454021e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.043739115819334984, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 4740 }, { "completion_length": 432.7053756713867, "epoch": 0.3696529144661197, "grad_norm": 0.1104500511384634, "kl": 0.0035266876220703125, "learning_rate": 6.99063807109394e-07, "loss": 0.0001, "reward": 1.6852679401636124, "reward_std": 0.061852105893194675, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.9977678656578064, "step": 4742 }, { "completion_length": 433.5089454650879, "epoch": 0.36980882037690255, "grad_norm": 0.08950562219975686, "kl": 0.0040073394775390625, "learning_rate": 6.988391280080865e-07, "loss": 0.0002, "reward": 1.8035715222358704, "reward_std": 0.058620513416826725, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 4744 }, { "completion_length": 409.3102836608887, "epoch": 0.3699647262876854, "grad_norm": 0.06693441978160208, "kl": 0.0029287338256835938, "learning_rate": 6.986144012040004e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.028485405258834362, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 4746 }, { "completion_length": 432.50894927978516, "epoch": 0.37012063219846825, "grad_norm": 0.11728150542515406, "kl": 0.0032596588134765625, "learning_rate": 6.983896267510494e-07, "loss": 0.0001, "reward": 1.85491082072258, "reward_std": 0.058923606760799885, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 0.9977678656578064, "step": 4748 }, { "completion_length": 406.29019927978516, "epoch": 0.37027653810925104, "grad_norm": 0.10722153523763331, "kl": 0.0026559829711914062, "learning_rate": 6.981648047031581e-07, "loss": 0.0001, "reward": 1.7656250596046448, "reward_std": 0.043739115819334984, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 4750 }, { "completion_length": 424.6406478881836, "epoch": 0.3704324440200339, "grad_norm": 0.09577767105476015, "kl": 0.0031108856201171875, "learning_rate": 6.979399351142629e-07, "loss": 0.0001, "reward": 1.8660714775323868, "reward_std": 0.04593987204134464, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 0.9955357313156128, "step": 4752 }, { "completion_length": 416.2544822692871, "epoch": 0.37058834993081674, "grad_norm": 0.1249371139093845, "kl": 0.0034694671630859375, "learning_rate": 6.977150180383112e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.8482143357396126, "rewards/format_reward": 1.0, "step": 4754 }, { "completion_length": 436.2143096923828, "epoch": 0.3707442558415996, "grad_norm": 0.10142267589473788, "kl": 0.003490447998046875, "learning_rate": 6.974900535292623e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.05651434790343046, "rewards/accuracy_reward": 0.7924107387661934, "rewards/format_reward": 1.0, "step": 4756 }, { "completion_length": 420.2187728881836, "epoch": 0.37090016175238244, "grad_norm": 0.06664455768662234, "kl": 0.0034074783325195312, "learning_rate": 6.972650416410864e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 4758 }, { "completion_length": 427.7500190734863, "epoch": 0.3710560676631653, "grad_norm": 0.0728269767923743, "kl": 0.0032062530517578125, "learning_rate": 6.970399824277658e-07, "loss": 0.0001, "reward": 1.741071492433548, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 1.0, "step": 4760 }, { "completion_length": 413.9107322692871, "epoch": 0.37121197357394814, "grad_norm": 0.11061987521933662, "kl": 0.0032100677490234375, "learning_rate": 6.968148759432928e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.05681744497269392, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 4762 }, { "completion_length": 432.6004638671875, "epoch": 0.371367879484731, "grad_norm": 0.10578044844181583, "kl": 0.003154754638671875, "learning_rate": 6.965897222416726e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.034798857755959034, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 4764 }, { "completion_length": 416.9107360839844, "epoch": 0.3715237853955138, "grad_norm": 0.10676885486363782, "kl": 0.0035533905029296875, "learning_rate": 6.963645213769207e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.04373771324753761, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 4766 }, { "completion_length": 414.7009048461914, "epoch": 0.37167969130629663, "grad_norm": 0.11788538289316573, "kl": 0.0032978057861328125, "learning_rate": 6.961392734030642e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.053436124697327614, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 4768 }, { "completion_length": 414.0424270629883, "epoch": 0.3718355972170795, "grad_norm": 0.06840876196661685, "kl": 0.0036182403564453125, "learning_rate": 6.959139783741418e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 4770 }, { "completion_length": 404.7968940734863, "epoch": 0.37199150312786233, "grad_norm": 0.04499364684739301, "kl": 0.0028934478759765625, "learning_rate": 6.95688636344203e-07, "loss": 0.0001, "reward": 1.8593750447034836, "reward_std": 0.0412245811894536, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 0.9977678656578064, "step": 4772 }, { "completion_length": 421.39287185668945, "epoch": 0.3721474090386452, "grad_norm": 0.13077281164855764, "kl": 0.0030336380004882812, "learning_rate": 6.954632473673088e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06277547217905521, "rewards/accuracy_reward": 0.85714291036129, "rewards/format_reward": 0.9977678656578064, "step": 4774 }, { "completion_length": 425.37278747558594, "epoch": 0.37230331494942803, "grad_norm": 0.05852732989598291, "kl": 0.003437042236328125, "learning_rate": 6.952378114975313e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 4776 }, { "completion_length": 422.63171768188477, "epoch": 0.3724592208602109, "grad_norm": 0.1368185592507345, "kl": 0.0036029815673828125, "learning_rate": 6.950123287889544e-07, "loss": 0.0001, "reward": 1.7165179401636124, "reward_std": 0.06365517433732748, "rewards/accuracy_reward": 0.7165178880095482, "rewards/format_reward": 1.0, "step": 4778 }, { "completion_length": 430.76118087768555, "epoch": 0.37261512677099373, "grad_norm": 0.10005827537586655, "kl": 0.0035190582275390625, "learning_rate": 6.947867992956726e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.0626794770359993, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 1.0, "step": 4780 }, { "completion_length": 410.31474685668945, "epoch": 0.3727710326817765, "grad_norm": 0.12640852786587894, "kl": 0.0032682418823242188, "learning_rate": 6.945612230717919e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.08274526428431273, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 4782 }, { "completion_length": 409.3303756713867, "epoch": 0.3729269385925594, "grad_norm": 0.12398095218436495, "kl": 0.0030622482299804688, "learning_rate": 6.943356001714296e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.06395826861262321, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 4784 }, { "completion_length": 418.721004486084, "epoch": 0.3730828445033422, "grad_norm": 0.06495602168748646, "kl": 0.0034275054931640625, "learning_rate": 6.941099306487138e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.05425985902547836, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 4786 }, { "completion_length": 435.32144927978516, "epoch": 0.3732387504141251, "grad_norm": 0.116323668358221, "kl": 0.0040225982666015625, "learning_rate": 6.938842145577844e-07, "loss": 0.0002, "reward": 1.725446492433548, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.7254464700818062, "rewards/format_reward": 1.0, "step": 4788 }, { "completion_length": 425.6339454650879, "epoch": 0.3733946563249079, "grad_norm": 0.1419813321161478, "kl": 0.0035228729248046875, "learning_rate": 6.936584519527919e-07, "loss": 0.0001, "reward": 1.7566965073347092, "reward_std": 0.07680131681263447, "rewards/accuracy_reward": 0.756696455180645, "rewards/format_reward": 1.0, "step": 4790 }, { "completion_length": 412.85269927978516, "epoch": 0.3735505622356908, "grad_norm": 0.11143068446838668, "kl": 0.002994537353515625, "learning_rate": 6.934326428878981e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.06124591547995806, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 4792 }, { "completion_length": 425.0134162902832, "epoch": 0.3737064681464736, "grad_norm": 0.12220510506234247, "kl": 0.0032396316528320312, "learning_rate": 6.932067874172762e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.05102826841175556, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 4794 }, { "completion_length": 412.70537185668945, "epoch": 0.3738623740572564, "grad_norm": 0.09916953201715849, "kl": 0.0029344558715820312, "learning_rate": 6.929808855951105e-07, "loss": 0.0001, "reward": 1.9151786416769028, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.9151785969734192, "rewards/format_reward": 1.0, "step": 4796 }, { "completion_length": 425.72546005249023, "epoch": 0.37401827996803927, "grad_norm": 0.07053149233686919, "kl": 0.0033044815063476562, "learning_rate": 6.92754937475596e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.03742566145956516, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 4798 }, { "completion_length": 414.39287185668945, "epoch": 0.3741741858788221, "grad_norm": 0.09942582833525815, "kl": 0.0029468536376953125, "learning_rate": 6.925289431129395e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.03937705885618925, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 4800 }, { "completion_length": 416.0960006713867, "epoch": 0.37433009178960497, "grad_norm": 0.09358383200492897, "kl": 0.0030059814453125, "learning_rate": 6.92302902561358e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.0641744127497077, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 4802 }, { "completion_length": 412.6875190734863, "epoch": 0.3744859977003878, "grad_norm": 0.0786164940581654, "kl": 0.0031871795654296875, "learning_rate": 6.920768158750805e-07, "loss": 0.0001, "reward": 1.8147322535514832, "reward_std": 0.04404080845415592, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 4804 }, { "completion_length": 431.2254638671875, "epoch": 0.37464190361117067, "grad_norm": 0.0033693095902980314, "kl": 0.0032100677490234375, "learning_rate": 6.918506831083463e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.025100448168814182, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 4806 }, { "completion_length": 410.73216247558594, "epoch": 0.3747978095219535, "grad_norm": 0.08723348022683064, "kl": 0.002899169921875, "learning_rate": 6.916245043154064e-07, "loss": 0.0001, "reward": 1.7946429550647736, "reward_std": 0.03742566145956516, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 4808 }, { "completion_length": 432.98886489868164, "epoch": 0.37495371543273637, "grad_norm": 0.1347263840401886, "kl": 0.003952980041503906, "learning_rate": 6.913982795505224e-07, "loss": 0.0002, "reward": 1.7075893729925156, "reward_std": 0.06756077148020267, "rewards/accuracy_reward": 0.7075893059372902, "rewards/format_reward": 1.0, "step": 4810 }, { "completion_length": 423.4799270629883, "epoch": 0.37510962134351916, "grad_norm": 0.08613096381043155, "kl": 0.0032520294189453125, "learning_rate": 6.911720088679669e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.03547286428511143, "rewards/accuracy_reward": 0.7790178805589676, "rewards/format_reward": 1.0, "step": 4812 }, { "completion_length": 406.6964454650879, "epoch": 0.375265527254302, "grad_norm": 0.059086683163027794, "kl": 0.0030660629272460938, "learning_rate": 6.90945692322024e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.03141390159726143, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 4814 }, { "completion_length": 428.6942138671875, "epoch": 0.37542143316508486, "grad_norm": 0.04539189700512858, "kl": 0.003047943115234375, "learning_rate": 6.907193299669882e-07, "loss": 0.0001, "reward": 1.8348214775323868, "reward_std": 0.05200396664440632, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 1.0, "step": 4816 }, { "completion_length": 408.0022506713867, "epoch": 0.3755773390758677, "grad_norm": 0.14074156704452911, "kl": 0.0031557083129882812, "learning_rate": 6.904929218571655e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.06658367346972227, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 4818 }, { "completion_length": 416.2276954650879, "epoch": 0.37573324498665056, "grad_norm": 0.04603109256141342, "kl": 0.002811431884765625, "learning_rate": 6.902664680468726e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.008266251534223557, "rewards/accuracy_reward": 0.8883928805589676, "rewards/format_reward": 1.0, "step": 4820 }, { "completion_length": 403.9464416503906, "epoch": 0.3758891508974334, "grad_norm": 0.09569468224669544, "kl": 0.0031909942626953125, "learning_rate": 6.900399685904372e-07, "loss": 0.0001, "reward": 1.7566964626312256, "reward_std": 0.034495764411985874, "rewards/accuracy_reward": 0.7566964477300644, "rewards/format_reward": 1.0, "step": 4822 }, { "completion_length": 427.2433166503906, "epoch": 0.37604505680821626, "grad_norm": 0.08808580493660363, "kl": 0.0031070709228515625, "learning_rate": 6.898134235421979e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.0506573561578989, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 4824 }, { "completion_length": 414.8593940734863, "epoch": 0.3762009627189991, "grad_norm": 0.07153055490747788, "kl": 0.0030679702758789062, "learning_rate": 6.895868329565044e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.05425985809415579, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 4826 }, { "completion_length": 421.7522506713867, "epoch": 0.3763568686297819, "grad_norm": 0.09254130092252642, "kl": 0.00273895263671875, "learning_rate": 6.89360196887717e-07, "loss": 0.0001, "reward": 1.727678656578064, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.7276786118745804, "rewards/format_reward": 1.0, "step": 4828 }, { "completion_length": 425.3794860839844, "epoch": 0.37651277454056475, "grad_norm": 0.0804700496242964, "kl": 0.00333404541015625, "learning_rate": 6.891335153902073e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 4830 }, { "completion_length": 426.002254486084, "epoch": 0.3766686804513476, "grad_norm": 0.11054854833470984, "kl": 0.0032100677490234375, "learning_rate": 6.889067885183582e-07, "loss": 0.0001, "reward": 1.7924107760190964, "reward_std": 0.058317420072853565, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 4832 }, { "completion_length": 419.1160888671875, "epoch": 0.37682458636213045, "grad_norm": 0.003602305480754229, "kl": 0.0028209686279296875, "learning_rate": 6.886800163265619e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.008266251534223557, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 4834 }, { "completion_length": 418.29243087768555, "epoch": 0.3769804922729133, "grad_norm": 0.13445545574099316, "kl": 0.0030727386474609375, "learning_rate": 6.884531988692231e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.06463087070733309, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 4836 }, { "completion_length": 422.31921005249023, "epoch": 0.37713639818369615, "grad_norm": 0.11206236317094556, "kl": 0.003383636474609375, "learning_rate": 6.882263362007566e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.0874768290668726, "rewards/accuracy_reward": 0.734375037252903, "rewards/format_reward": 1.0, "step": 4838 }, { "completion_length": 429.76118087768555, "epoch": 0.377292304094479, "grad_norm": 0.10743880535266112, "kl": 0.00341033935546875, "learning_rate": 6.879994283755884e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 4840 }, { "completion_length": 431.8928756713867, "epoch": 0.3774482100052618, "grad_norm": 0.0980681973149055, "kl": 0.00348663330078125, "learning_rate": 6.87772475448155e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.05621125642210245, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 4842 }, { "completion_length": 434.8415336608887, "epoch": 0.37760411591604465, "grad_norm": 0.08551669850864968, "kl": 0.00324249267578125, "learning_rate": 6.875454774729038e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4844 }, { "completion_length": 420.5625190734863, "epoch": 0.3777600218268275, "grad_norm": 0.09576625094939659, "kl": 0.003391265869140625, "learning_rate": 6.873184345042932e-07, "loss": 0.0001, "reward": 1.7767857760190964, "reward_std": 0.05200396664440632, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 4846 }, { "completion_length": 430.5535888671875, "epoch": 0.37791592773761035, "grad_norm": 0.07916841312176448, "kl": 0.003131866455078125, "learning_rate": 6.870913465967921e-07, "loss": 0.0001, "reward": 1.8147321939468384, "reward_std": 0.033063605427742004, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 4848 }, { "completion_length": 424.4263610839844, "epoch": 0.3780718336483932, "grad_norm": 0.08519949728199201, "kl": 0.0035037994384765625, "learning_rate": 6.868642138048805e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 4850 }, { "completion_length": 419.9955520629883, "epoch": 0.37822773955917605, "grad_norm": 0.11890200136311886, "kl": 0.0030574798583984375, "learning_rate": 6.866370361830489e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.04907547030597925, "rewards/accuracy_reward": 0.794642873108387, "rewards/format_reward": 1.0, "step": 4852 }, { "completion_length": 418.323673248291, "epoch": 0.3783836454699589, "grad_norm": 0.08952835760779572, "kl": 0.0027589797973632812, "learning_rate": 6.864098137857986e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8013393357396126, "rewards/format_reward": 1.0, "step": 4854 }, { "completion_length": 424.77234268188477, "epoch": 0.37853955138074175, "grad_norm": 0.09493419697437082, "kl": 0.0028324127197265625, "learning_rate": 6.86182546667642e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.024797353893518448, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 4856 }, { "completion_length": 432.1004638671875, "epoch": 0.37869545729152454, "grad_norm": 0.07589351515067633, "kl": 0.0032787322998046875, "learning_rate": 6.859552348831015e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.04080921784043312, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 4858 }, { "completion_length": 437.2187728881836, "epoch": 0.3788513632023074, "grad_norm": 0.06766036911817651, "kl": 0.0031261444091796875, "learning_rate": 6.85727878486711e-07, "loss": 0.0001, "reward": 1.8013393431901932, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 4860 }, { "completion_length": 427.8058204650879, "epoch": 0.37900726911309024, "grad_norm": 0.0690703071620793, "kl": 0.0028600692749023438, "learning_rate": 6.855004775330146e-07, "loss": 0.0001, "reward": 1.8593750894069672, "reward_std": 0.031112208031117916, "rewards/accuracy_reward": 0.8593750149011612, "rewards/format_reward": 1.0, "step": 4862 }, { "completion_length": 430.90180587768555, "epoch": 0.3791631750238731, "grad_norm": 0.08524738828122087, "kl": 0.002773284912109375, "learning_rate": 6.852730320765675e-07, "loss": 0.0001, "reward": 1.801339328289032, "reward_std": 0.04959610849618912, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 4864 }, { "completion_length": 419.0580520629883, "epoch": 0.37931908093465594, "grad_norm": 0.09422423037133691, "kl": 0.0027618408203125, "learning_rate": 6.850455421719349e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.05200396478176117, "rewards/accuracy_reward": 0.8616071939468384, "rewards/format_reward": 1.0, "step": 4866 }, { "completion_length": 437.11832427978516, "epoch": 0.3794749868454388, "grad_norm": 0.09642788666462704, "kl": 0.0034389495849609375, "learning_rate": 6.848180078736934e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.055692016147077084, "rewards/accuracy_reward": 0.8571429029107094, "rewards/format_reward": 1.0, "step": 4868 }, { "completion_length": 436.9218940734863, "epoch": 0.37963089275622164, "grad_norm": 0.1037485230804219, "kl": 0.0032405853271484375, "learning_rate": 6.845904292364298e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.05246042553335428, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 4870 }, { "completion_length": 433.6116256713867, "epoch": 0.3797867986670045, "grad_norm": 0.08088199459279449, "kl": 0.0030498504638671875, "learning_rate": 6.843628063147415e-07, "loss": 0.0001, "reward": 1.8593750894069672, "reward_std": 0.05328416172415018, "rewards/accuracy_reward": 0.8593750223517418, "rewards/format_reward": 1.0, "step": 4872 }, { "completion_length": 436.9888572692871, "epoch": 0.3799427045777873, "grad_norm": 0.0833204543515102, "kl": 0.0032291412353515625, "learning_rate": 6.841351391632372e-07, "loss": 0.0001, "reward": 1.875000074505806, "reward_std": 0.03742566145956516, "rewards/accuracy_reward": 0.8750000447034836, "rewards/format_reward": 1.0, "step": 4874 }, { "completion_length": 426.32591247558594, "epoch": 0.38009861048857013, "grad_norm": 0.08584940905842313, "kl": 0.0027408599853515625, "learning_rate": 6.839074278365352e-07, "loss": 0.0001, "reward": 1.8794643729925156, "reward_std": 0.03742566239088774, "rewards/accuracy_reward": 0.8794643059372902, "rewards/format_reward": 1.0, "step": 4876 }, { "completion_length": 418.9285888671875, "epoch": 0.380254516399353, "grad_norm": 0.11983159900247342, "kl": 0.002948760986328125, "learning_rate": 6.836796723892651e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.07109405566006899, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 4878 }, { "completion_length": 432.92412185668945, "epoch": 0.38041042231013583, "grad_norm": 0.07648456947564367, "kl": 0.003261566162109375, "learning_rate": 6.834518728760669e-07, "loss": 0.0001, "reward": 1.7209822237491608, "reward_std": 0.036751655861735344, "rewards/accuracy_reward": 0.7209821790456772, "rewards/format_reward": 1.0, "step": 4880 }, { "completion_length": 424.31697845458984, "epoch": 0.3805663282209187, "grad_norm": 0.17824132107614463, "kl": 0.0039424896240234375, "learning_rate": 6.832240293515912e-07, "loss": 0.0002, "reward": 1.8526786267757416, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 0.9977678656578064, "step": 4882 }, { "completion_length": 431.68528747558594, "epoch": 0.38072223413170153, "grad_norm": 0.04579470150223698, "kl": 0.0033693313598632812, "learning_rate": 6.82996141870499e-07, "loss": 0.0001, "reward": 1.848214328289032, "reward_std": 0.03336670063436031, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 4884 }, { "completion_length": 418.5067138671875, "epoch": 0.3808781400424844, "grad_norm": 0.08926295618254709, "kl": 0.0028705596923828125, "learning_rate": 6.827682104874621e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 4886 }, { "completion_length": 425.25001525878906, "epoch": 0.38103404595326723, "grad_norm": 0.10876295595900183, "kl": 0.0033397674560546875, "learning_rate": 6.825402352571627e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 4888 }, { "completion_length": 422.58707427978516, "epoch": 0.38118995186405, "grad_norm": 0.06986227305841662, "kl": 0.0027065277099609375, "learning_rate": 6.823122162342932e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 1.0, "step": 4890 }, { "completion_length": 423.7946586608887, "epoch": 0.3813458577748329, "grad_norm": 0.10305328252134537, "kl": 0.0029144287109375, "learning_rate": 6.820841534735572e-07, "loss": 0.0001, "reward": 1.8950893729925156, "reward_std": 0.06786246318370104, "rewards/accuracy_reward": 0.8950893208384514, "rewards/format_reward": 1.0, "step": 4892 }, { "completion_length": 425.24778747558594, "epoch": 0.3815017636856157, "grad_norm": 0.05519942910351611, "kl": 0.0029115676879882812, "learning_rate": 6.818560470296684e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8750000447034836, "rewards/format_reward": 0.9977678656578064, "step": 4894 }, { "completion_length": 427.533504486084, "epoch": 0.3816576695963986, "grad_norm": 0.07867641244011864, "kl": 0.003070831298828125, "learning_rate": 6.816278969573508e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.05343612376600504, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 4896 }, { "completion_length": 434.33930587768555, "epoch": 0.3818135755071814, "grad_norm": 0.07554707252332082, "kl": 0.003444671630859375, "learning_rate": 6.81399703311339e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.03336669970303774, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 4898 }, { "completion_length": 427.2589454650879, "epoch": 0.3819694814179643, "grad_norm": 0.10408451725783724, "kl": 0.003086090087890625, "learning_rate": 6.811714661463783e-07, "loss": 0.0001, "reward": 1.7477679252624512, "reward_std": 0.08048937190324068, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.9977678656578064, "step": 4900 }, { "completion_length": 415.7098388671875, "epoch": 0.3821253873287471, "grad_norm": 0.04430108582297722, "kl": 0.0030651092529296875, "learning_rate": 6.809431855172243e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.022171951830387115, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 4902 }, { "completion_length": 428.91519927978516, "epoch": 0.3822812932395299, "grad_norm": 0.11412989809859515, "kl": 0.0034198760986328125, "learning_rate": 6.807148614786429e-07, "loss": 0.0001, "reward": 1.85491082072258, "reward_std": 0.0827452652156353, "rewards/accuracy_reward": 0.8549107611179352, "rewards/format_reward": 1.0, "step": 4904 }, { "completion_length": 423.1406440734863, "epoch": 0.38243719915031277, "grad_norm": 0.06918283597793438, "kl": 0.0030651092529296875, "learning_rate": 6.804864940854104e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.04907547030597925, "rewards/accuracy_reward": 0.7901785895228386, "rewards/format_reward": 1.0, "step": 4906 }, { "completion_length": 425.2723388671875, "epoch": 0.3825931050610956, "grad_norm": 0.05329165903369297, "kl": 0.002635955810546875, "learning_rate": 6.802580833923138e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.033366698771715164, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 4908 }, { "completion_length": 443.68528747558594, "epoch": 0.38274901097187847, "grad_norm": 0.08333388892973609, "kl": 0.0035037994384765625, "learning_rate": 6.800296294541499e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.04306511115282774, "rewards/accuracy_reward": 0.7566964626312256, "rewards/format_reward": 0.9977678656578064, "step": 4910 }, { "completion_length": 432.4107360839844, "epoch": 0.3829049168826613, "grad_norm": 0.09541808652203396, "kl": 0.003223419189453125, "learning_rate": 6.798011323257267e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.034342397935688496, "rewards/accuracy_reward": 0.7433035895228386, "rewards/format_reward": 1.0, "step": 4912 }, { "completion_length": 421.127254486084, "epoch": 0.38306082279344417, "grad_norm": 0.09671262948443791, "kl": 0.003040313720703125, "learning_rate": 6.79572592061862e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 4914 }, { "completion_length": 436.55582427978516, "epoch": 0.383216728704227, "grad_norm": 0.10868973421737933, "kl": 0.0032367706298828125, "learning_rate": 6.79344008717384e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.05328415986150503, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 4916 }, { "completion_length": 433.5402030944824, "epoch": 0.38337263461500987, "grad_norm": 0.04746398415243777, "kl": 0.0029840469360351562, "learning_rate": 6.791153823471313e-07, "loss": 0.0001, "reward": 1.8013393431901932, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 4918 }, { "completion_length": 412.4732322692871, "epoch": 0.38352854052579266, "grad_norm": 0.08238362408205802, "kl": 0.002796173095703125, "learning_rate": 6.788867130059528e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.028485405258834362, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 4920 }, { "completion_length": 428.06474685668945, "epoch": 0.3836844464365755, "grad_norm": 0.10468288139581149, "kl": 0.003170013427734375, "learning_rate": 6.78658000748708e-07, "loss": 0.0001, "reward": 1.875000074505806, "reward_std": 0.06704013235867023, "rewards/accuracy_reward": 0.875000037252903, "rewards/format_reward": 1.0, "step": 4922 }, { "completion_length": 418.84822845458984, "epoch": 0.38384035234735836, "grad_norm": 0.044811297311928774, "kl": 0.0032138824462890625, "learning_rate": 6.78429245630266e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.025774452835321426, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 4924 }, { "completion_length": 428.44197845458984, "epoch": 0.3839962582581412, "grad_norm": 0.0838559126642808, "kl": 0.0027818679809570312, "learning_rate": 6.78200447705507e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 4926 }, { "completion_length": 429.3995704650879, "epoch": 0.38415216416892406, "grad_norm": 0.0950881634301255, "kl": 0.0034503936767578125, "learning_rate": 6.779716070293209e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.039680153131484985, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 4928 }, { "completion_length": 414.6897506713867, "epoch": 0.3843080700797069, "grad_norm": 0.08252157052577978, "kl": 0.0029144287109375, "learning_rate": 6.77742723656608e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.05590956099331379, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 1.0, "step": 4930 }, { "completion_length": 415.85939025878906, "epoch": 0.38446397599048976, "grad_norm": 0.11239699621823937, "kl": 0.0030193328857421875, "learning_rate": 6.77513797642279e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 4932 }, { "completion_length": 445.1651954650879, "epoch": 0.3846198819012726, "grad_norm": 0.1046763114347062, "kl": 0.0031795501708984375, "learning_rate": 6.772848290412545e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.04742576461285353, "rewards/accuracy_reward": 0.7857143431901932, "rewards/format_reward": 1.0, "step": 4934 }, { "completion_length": 426.8325996398926, "epoch": 0.3847757878120554, "grad_norm": 0.06632122987069415, "kl": 0.0032138824462890625, "learning_rate": 6.77055817908466e-07, "loss": 0.0001, "reward": 1.7477679550647736, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 1.0, "step": 4936 }, { "completion_length": 447.2701072692871, "epoch": 0.38493169372283825, "grad_norm": 0.10124397060486537, "kl": 0.00366973876953125, "learning_rate": 6.768267642988543e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.05959621071815491, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 4938 }, { "completion_length": 436.2901954650879, "epoch": 0.3850875996336211, "grad_norm": 0.10184221234624488, "kl": 0.0035724639892578125, "learning_rate": 6.76597668267371e-07, "loss": 0.0001, "reward": 1.830357238650322, "reward_std": 0.08439637068659067, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4940 }, { "completion_length": 427.4062690734863, "epoch": 0.38524350554440395, "grad_norm": 0.08984854769602157, "kl": 0.0031890869140625, "learning_rate": 6.763685298689776e-07, "loss": 0.0001, "reward": 1.7991072535514832, "reward_std": 0.05441322363913059, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 4942 }, { "completion_length": 425.83484268188477, "epoch": 0.3853994114551868, "grad_norm": 0.09355912002025168, "kl": 0.00284576416015625, "learning_rate": 6.76139349158646e-07, "loss": 0.0001, "reward": 1.752232238650322, "reward_std": 0.05230706185102463, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 4944 }, { "completion_length": 419.1049270629883, "epoch": 0.38555531736596965, "grad_norm": 0.05445929636032575, "kl": 0.0028352737426757812, "learning_rate": 6.75910126191358e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 4946 }, { "completion_length": 421.049129486084, "epoch": 0.3857112232767525, "grad_norm": 0.10338393103015846, "kl": 0.0027408599853515625, "learning_rate": 6.756808610221057e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.05944424867630005, "rewards/accuracy_reward": 0.7566964626312256, "rewards/format_reward": 0.9977678656578064, "step": 4948 }, { "completion_length": 422.955379486084, "epoch": 0.3858671291875353, "grad_norm": 0.08255586175533615, "kl": 0.00278472900390625, "learning_rate": 6.754515537058915e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.028182310983538628, "rewards/accuracy_reward": 0.8928571715950966, "rewards/format_reward": 1.0, "step": 4950 }, { "completion_length": 432.283504486084, "epoch": 0.38602303509831815, "grad_norm": 0.08509147484651626, "kl": 0.0031566619873046875, "learning_rate": 6.752222042977273e-07, "loss": 0.0001, "reward": 1.8816965073347092, "reward_std": 0.05929451994597912, "rewards/accuracy_reward": 0.8816964700818062, "rewards/format_reward": 1.0, "step": 4952 }, { "completion_length": 434.27680587768555, "epoch": 0.386178941009101, "grad_norm": 0.12137129916789843, "kl": 0.0052280426025390625, "learning_rate": 6.749928128526355e-07, "loss": 0.0002, "reward": 1.7232143580913544, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.7232143133878708, "rewards/format_reward": 1.0, "step": 4954 }, { "completion_length": 447.2500190734863, "epoch": 0.38633484691988385, "grad_norm": 0.12229138439397179, "kl": 0.0036067962646484375, "learning_rate": 6.747633794256491e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.07499965187162161, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4956 }, { "completion_length": 420.8259086608887, "epoch": 0.3864907528306667, "grad_norm": 0.07561526246439414, "kl": 0.0030574798583984375, "learning_rate": 6.745339040718101e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.7723214738070965, "rewards/format_reward": 1.0, "step": 4958 }, { "completion_length": 417.05359268188477, "epoch": 0.38664665874144954, "grad_norm": 0.08958516962894662, "kl": 0.0025873184204101562, "learning_rate": 6.743043868461713e-07, "loss": 0.0001, "reward": 1.8013393878936768, "reward_std": 0.040657252073287964, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 4960 }, { "completion_length": 419.361629486084, "epoch": 0.3868025646522324, "grad_norm": 0.09499261701564748, "kl": 0.0032243728637695312, "learning_rate": 6.740748278037953e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.05035426281392574, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 1.0, "step": 4962 }, { "completion_length": 417.2857322692871, "epoch": 0.38695847056301524, "grad_norm": 0.11000020208915286, "kl": 0.0037631988525390625, "learning_rate": 6.738452269997548e-07, "loss": 0.0002, "reward": 1.8370536714792252, "reward_std": 0.07320021837949753, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 1.0, "step": 4964 }, { "completion_length": 418.3169822692871, "epoch": 0.38711437647379804, "grad_norm": 0.08448655470589277, "kl": 0.004054069519042969, "learning_rate": 6.736155844891327e-07, "loss": 0.0002, "reward": 1.7611607909202576, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 4966 }, { "completion_length": 426.63394927978516, "epoch": 0.3872702823845809, "grad_norm": 0.07530449510846034, "kl": 0.0028667449951171875, "learning_rate": 6.733859003270215e-07, "loss": 0.0001, "reward": 1.90178582072258, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.901785746216774, "rewards/format_reward": 1.0, "step": 4968 }, { "completion_length": 427.68305587768555, "epoch": 0.38742618829536374, "grad_norm": 0.0034486949324420263, "kl": 0.003116607666015625, "learning_rate": 6.731561745685239e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.009545044973492622, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 4970 }, { "completion_length": 419.00671005249023, "epoch": 0.3875820942061466, "grad_norm": 0.08727202344987989, "kl": 0.0031032562255859375, "learning_rate": 6.729264072687526e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 4972 }, { "completion_length": 425.0491256713867, "epoch": 0.38773800011692944, "grad_norm": 0.10773735640732042, "kl": 0.0028638839721679688, "learning_rate": 6.726965984828304e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 1.0, "step": 4974 }, { "completion_length": 423.8638572692871, "epoch": 0.3878939060277123, "grad_norm": 0.06602562574200956, "kl": 0.0029821395874023438, "learning_rate": 6.724667482658896e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.04111231118440628, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 4976 }, { "completion_length": 414.64064025878906, "epoch": 0.38804981193849514, "grad_norm": 0.1058835152200872, "kl": 0.0030364990234375, "learning_rate": 6.72236856673073e-07, "loss": 0.0001, "reward": 1.8013393878936768, "reward_std": 0.07100850343704224, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 4978 }, { "completion_length": 429.0379638671875, "epoch": 0.388205717849278, "grad_norm": 0.05306451481651051, "kl": 0.003421783447265625, "learning_rate": 6.720069237595329e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.8147321753203869, "rewards/format_reward": 1.0, "step": 4980 }, { "completion_length": 433.4576072692871, "epoch": 0.3883616237600608, "grad_norm": 0.07694307122952576, "kl": 0.0036258697509765625, "learning_rate": 6.717769495804315e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.03239100147038698, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 1.0, "step": 4982 }, { "completion_length": 428.9040336608887, "epoch": 0.38851752967084363, "grad_norm": 0.051083239716840145, "kl": 0.0030975341796875, "learning_rate": 6.715469341909417e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 4984 }, { "completion_length": 425.3884162902832, "epoch": 0.3886734355816265, "grad_norm": 0.07234913923122363, "kl": 0.0028591156005859375, "learning_rate": 6.71316877646245e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4986 }, { "completion_length": 416.3750190734863, "epoch": 0.38882934149240933, "grad_norm": 0.09422123675339798, "kl": 0.0028219223022460938, "learning_rate": 6.71086780001534e-07, "loss": 0.0001, "reward": 1.910714328289032, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.9107143133878708, "rewards/format_reward": 1.0, "step": 4988 }, { "completion_length": 419.9776954650879, "epoch": 0.3889852474031922, "grad_norm": 0.07163562086560851, "kl": 0.0030145645141601562, "learning_rate": 6.7085664131201e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.03742566239088774, "rewards/accuracy_reward": 0.8035714402794838, "rewards/format_reward": 1.0, "step": 4990 }, { "completion_length": 414.7901954650879, "epoch": 0.38914115331397503, "grad_norm": 0.10577598532973327, "kl": 0.0030164718627929688, "learning_rate": 6.706264616328852e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.04373771417886019, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 4992 }, { "completion_length": 416.3415412902832, "epoch": 0.3892970592247579, "grad_norm": 0.11750280886874104, "kl": 0.0031585693359375, "learning_rate": 6.70396241019381e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 4994 }, { "completion_length": 426.2455520629883, "epoch": 0.3894529651355407, "grad_norm": 0.06242895813713725, "kl": 0.0029277801513671875, "learning_rate": 6.701659795267287e-07, "loss": 0.0001, "reward": 1.7477679550647736, "reward_std": 0.06447890866547823, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 1.0, "step": 4996 }, { "completion_length": 428.1160888671875, "epoch": 0.3896088710463235, "grad_norm": 0.050014212457431664, "kl": 0.0032024383544921875, "learning_rate": 6.699356772101699e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.05358585249632597, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 4998 }, { "completion_length": 412.06474685668945, "epoch": 0.3897647769571064, "grad_norm": 0.06840151951310267, "kl": 0.0028657913208007812, "learning_rate": 6.697053341249552e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.036598289385437965, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 1.0, "step": 5000 }, { "completion_length": 414.70760345458984, "epoch": 0.3899206828678892, "grad_norm": 0.06856247553281523, "kl": 0.0030422210693359375, "learning_rate": 6.694749503263455e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.058317420072853565, "rewards/accuracy_reward": 0.7656250149011612, "rewards/format_reward": 1.0, "step": 5002 }, { "completion_length": 427.2500190734863, "epoch": 0.3900765887786721, "grad_norm": 0.06923399012009152, "kl": 0.0029754638671875, "learning_rate": 6.692445258696114e-07, "loss": 0.0001, "reward": 1.895089328289032, "reward_std": 0.03366979490965605, "rewards/accuracy_reward": 0.8950893133878708, "rewards/format_reward": 1.0, "step": 5004 }, { "completion_length": 420.1339416503906, "epoch": 0.3902324946894549, "grad_norm": 0.10555958072031242, "kl": 0.0030813217163085938, "learning_rate": 6.69014060810033e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.062223014421761036, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 5006 }, { "completion_length": 437.42636489868164, "epoch": 0.3903884006002378, "grad_norm": 0.08314218788085086, "kl": 0.003448486328125, "learning_rate": 6.687835552029006e-07, "loss": 0.0001, "reward": 1.8258929550647736, "reward_std": 0.0474257655441761, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 5008 }, { "completion_length": 435.7366256713867, "epoch": 0.3905443065110206, "grad_norm": 0.09460195964487593, "kl": 0.0032520294189453125, "learning_rate": 6.685530091035138e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.060270216315984726, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 5010 }, { "completion_length": 427.94421005249023, "epoch": 0.3907002124218034, "grad_norm": 0.09018020645510323, "kl": 0.003185272216796875, "learning_rate": 6.683224225671823e-07, "loss": 0.0001, "reward": 1.9397321939468384, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.9397321790456772, "rewards/format_reward": 1.0, "step": 5012 }, { "completion_length": 403.46430587768555, "epoch": 0.39085611833258627, "grad_norm": 0.08536038776352851, "kl": 0.0029745101928710938, "learning_rate": 6.680917956492251e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 5014 }, { "completion_length": 428.18974685668945, "epoch": 0.3910120242433691, "grad_norm": 0.1134614359768105, "kl": 0.0032520294189453125, "learning_rate": 6.67861128404971e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.06853646878153086, "rewards/accuracy_reward": 0.7544643133878708, "rewards/format_reward": 1.0, "step": 5016 }, { "completion_length": 435.50671768188477, "epoch": 0.39116793015415197, "grad_norm": 0.09778928152328813, "kl": 0.0030956268310546875, "learning_rate": 6.676304208897588e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.06914125476032495, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 5018 }, { "completion_length": 438.9285888671875, "epoch": 0.3913238360649348, "grad_norm": 0.09351386941565323, "kl": 0.0032520294189453125, "learning_rate": 6.673996731589366e-07, "loss": 0.0001, "reward": 1.8169643878936768, "reward_std": 0.07613011728972197, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 0.9955357164144516, "step": 5020 }, { "completion_length": 425.03796005249023, "epoch": 0.39147974197571767, "grad_norm": 0.10752987561449452, "kl": 0.002780914306640625, "learning_rate": 6.671688852678619e-07, "loss": 0.0001, "reward": 1.8794643431901932, "reward_std": 0.0612325444817543, "rewards/accuracy_reward": 0.8816964477300644, "rewards/format_reward": 0.9977678656578064, "step": 5022 }, { "completion_length": 428.7611770629883, "epoch": 0.3916356478865005, "grad_norm": 0.20024907144888673, "kl": 0.0027484893798828125, "learning_rate": 6.669380572719028e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 5024 }, { "completion_length": 418.19420623779297, "epoch": 0.39179155379728337, "grad_norm": 0.1190107131028575, "kl": 0.0030517578125, "learning_rate": 6.66707189226436e-07, "loss": 0.0001, "reward": 1.9084821939468384, "reward_std": 0.0644789095968008, "rewards/accuracy_reward": 0.908482164144516, "rewards/format_reward": 1.0, "step": 5026 }, { "completion_length": 418.0178756713867, "epoch": 0.39194745970806616, "grad_norm": 0.06770407074643785, "kl": 0.0028896331787109375, "learning_rate": 6.664762811868484e-07, "loss": 0.0001, "reward": 1.9017857760190964, "reward_std": 0.022171951830387115, "rewards/accuracy_reward": 0.901785746216774, "rewards/format_reward": 1.0, "step": 5028 }, { "completion_length": 430.78349685668945, "epoch": 0.392103365618849, "grad_norm": 0.10642666378114303, "kl": 0.0032405853271484375, "learning_rate": 6.662453332085362e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.05959761328995228, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 0.9977678656578064, "step": 5030 }, { "completion_length": 424.41296768188477, "epoch": 0.39225927152963186, "grad_norm": 0.09304999116935612, "kl": 0.0032958984375, "learning_rate": 6.660143453469054e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.7968750521540642, "rewards/format_reward": 0.9977678656578064, "step": 5032 }, { "completion_length": 418.2567138671875, "epoch": 0.3924151774404147, "grad_norm": 0.09332488015145579, "kl": 0.0032672882080078125, "learning_rate": 6.657833176573715e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.0440408093854785, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 5034 }, { "completion_length": 426.73439025878906, "epoch": 0.39257108335119756, "grad_norm": 0.08862802816182763, "kl": 0.0027751922607421875, "learning_rate": 6.655522501953593e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.042762016877532005, "rewards/accuracy_reward": 0.8861607685685158, "rewards/format_reward": 1.0, "step": 5036 }, { "completion_length": 424.401798248291, "epoch": 0.3927269892619804, "grad_norm": 0.10084251984395813, "kl": 0.003002166748046875, "learning_rate": 6.653211430163035e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 5038 }, { "completion_length": 424.40626525878906, "epoch": 0.39288289517276326, "grad_norm": 0.0979907199012394, "kl": 0.0028409957885742188, "learning_rate": 6.650899961756483e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.05734172184020281, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 0.9977678656578064, "step": 5040 }, { "completion_length": 427.0178756713867, "epoch": 0.3930388010835461, "grad_norm": 0.09628360266287682, "kl": 0.0029954910278320312, "learning_rate": 6.648588097288472e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.05425985902547836, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 5042 }, { "completion_length": 419.7946662902832, "epoch": 0.3931947069943289, "grad_norm": 0.07636587968502252, "kl": 0.0034160614013671875, "learning_rate": 6.646275837313631e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.062155199237167835, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 0.9977678656578064, "step": 5044 }, { "completion_length": 413.14064025878906, "epoch": 0.39335061290511175, "grad_norm": 0.08043545942254957, "kl": 0.0028247833251953125, "learning_rate": 6.64396318238669e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.03644856344908476, "rewards/accuracy_reward": 0.8392857611179352, "rewards/format_reward": 1.0, "step": 5046 }, { "completion_length": 413.66296768188477, "epoch": 0.3935065188158946, "grad_norm": 0.07924873571864186, "kl": 0.0029630661010742188, "learning_rate": 6.641650133062467e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.07274375576525927, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 5048 }, { "completion_length": 420.80582427978516, "epoch": 0.39366242472667745, "grad_norm": 0.11832753857323615, "kl": 0.0028562545776367188, "learning_rate": 6.639336689895879e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.05584174674004316, "rewards/accuracy_reward": 0.805803619325161, "rewards/format_reward": 1.0, "step": 5050 }, { "completion_length": 414.2634086608887, "epoch": 0.3938183306374603, "grad_norm": 0.07388711259140172, "kl": 0.0027065277099609375, "learning_rate": 6.637022853441935e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.04095894657075405, "rewards/accuracy_reward": 0.805803619325161, "rewards/format_reward": 1.0, "step": 5052 }, { "completion_length": 423.7544822692871, "epoch": 0.39397423654824315, "grad_norm": 0.10666400871779917, "kl": 0.0031032562255859375, "learning_rate": 6.634708624255739e-07, "loss": 0.0001, "reward": 1.8348214775323868, "reward_std": 0.0795136708766222, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 0.9977678656578064, "step": 5054 }, { "completion_length": 428.8058204650879, "epoch": 0.394130142459026, "grad_norm": 0.09442812288235335, "kl": 0.00313568115234375, "learning_rate": 6.632394002892488e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 0.9977678656578064, "step": 5056 }, { "completion_length": 416.96430587768555, "epoch": 0.3942860483698088, "grad_norm": 0.003418392701968743, "kl": 0.0028133392333984375, "learning_rate": 6.630078989907474e-07, "loss": 0.0001, "reward": 1.7991071939468384, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7991071566939354, "rewards/format_reward": 1.0, "step": 5058 }, { "completion_length": 426.1250190734863, "epoch": 0.39444195428059164, "grad_norm": 0.09971616109476716, "kl": 0.003406524658203125, "learning_rate": 6.62776358585609e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 5060 }, { "completion_length": 417.2433166503906, "epoch": 0.3945978601913745, "grad_norm": 0.08816877279035228, "kl": 0.002704620361328125, "learning_rate": 6.62544779129381e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.028182310983538628, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 5062 }, { "completion_length": 420.83930587768555, "epoch": 0.39475376610215734, "grad_norm": 0.09789740567814025, "kl": 0.0028924942016601562, "learning_rate": 6.623131606776208e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.04764331039041281, "rewards/accuracy_reward": 0.7723214514553547, "rewards/format_reward": 1.0, "step": 5064 }, { "completion_length": 422.8393020629883, "epoch": 0.3949096720129402, "grad_norm": 0.1078923112461242, "kl": 0.0032291412353515625, "learning_rate": 6.620815032858953e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.07439346145838499, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 5066 }, { "completion_length": 421.84153747558594, "epoch": 0.39506557792372304, "grad_norm": 0.11268090088429693, "kl": 0.002777099609375, "learning_rate": 6.618498070097807e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 5068 }, { "completion_length": 427.4196662902832, "epoch": 0.3952214838345059, "grad_norm": 0.07068377996536238, "kl": 0.0029611587524414062, "learning_rate": 6.616180719048623e-07, "loss": 0.0001, "reward": 1.7544643580913544, "reward_std": 0.0625261114910245, "rewards/accuracy_reward": 0.7544642984867096, "rewards/format_reward": 1.0, "step": 5070 }, { "completion_length": 413.6183204650879, "epoch": 0.39537738974528874, "grad_norm": 0.08941006027839529, "kl": 0.0030584335327148438, "learning_rate": 6.613862980267349e-07, "loss": 0.0001, "reward": 1.80803582072258, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 5072 }, { "completion_length": 425.97993087768555, "epoch": 0.39553329565607154, "grad_norm": 0.06600258371068748, "kl": 0.0030879974365234375, "learning_rate": 6.611544854310023e-07, "loss": 0.0001, "reward": 1.8950893431901932, "reward_std": 0.06025684718042612, "rewards/accuracy_reward": 0.8995536044239998, "rewards/format_reward": 0.9955357313156128, "step": 5074 }, { "completion_length": 428.5022506713867, "epoch": 0.3956892015668544, "grad_norm": 0.09701804457329852, "kl": 0.00337982177734375, "learning_rate": 6.609226341732781e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.0674060033634305, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 5076 }, { "completion_length": 411.61609268188477, "epoch": 0.39584510747763724, "grad_norm": 0.13156063899953552, "kl": 0.003055572509765625, "learning_rate": 6.606907443091847e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.08341786824166775, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 5078 }, { "completion_length": 428.1384048461914, "epoch": 0.3960010133884201, "grad_norm": 0.10954906193079204, "kl": 0.0031375885009765625, "learning_rate": 6.604588158943541e-07, "loss": 0.0001, "reward": 1.79241082072258, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 0.9977678656578064, "step": 5080 }, { "completion_length": 425.05582427978516, "epoch": 0.39615691929920294, "grad_norm": 0.051512172000572294, "kl": 0.0028514862060546875, "learning_rate": 6.602268489844273e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 5082 }, { "completion_length": 419.0245704650879, "epoch": 0.3963128252099858, "grad_norm": 0.05031190676206724, "kl": 0.002979278564453125, "learning_rate": 6.59994843635055e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.027053244411945343, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 1.0, "step": 5084 }, { "completion_length": 404.34376525878906, "epoch": 0.39646873112076864, "grad_norm": 0.11743657702643627, "kl": 0.002902984619140625, "learning_rate": 6.597627999018961e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.05523555539548397, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 5086 }, { "completion_length": 419.7522506713867, "epoch": 0.3966246370315515, "grad_norm": 0.09406152927538808, "kl": 0.0032033920288085938, "learning_rate": 6.5953071784062e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.8281250223517418, "rewards/format_reward": 1.0, "step": 5088 }, { "completion_length": 438.76118087768555, "epoch": 0.3967805429423343, "grad_norm": 0.12133806086260444, "kl": 0.0034637451171875, "learning_rate": 6.592985975069043e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.04178631864488125, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 5090 }, { "completion_length": 431.1518020629883, "epoch": 0.39693644885311713, "grad_norm": 0.11510647196007498, "kl": 0.0034732818603515625, "learning_rate": 6.590664389564364e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.772321455180645, "rewards/format_reward": 1.0, "step": 5092 }, { "completion_length": 420.9821586608887, "epoch": 0.3970923547639, "grad_norm": 0.09992057933972565, "kl": 0.002719879150390625, "learning_rate": 6.588342422449124e-07, "loss": 0.0001, "reward": 1.8504465222358704, "reward_std": 0.0562126561999321, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 5094 }, { "completion_length": 433.25447845458984, "epoch": 0.39724826067468283, "grad_norm": 0.07944891359773783, "kl": 0.0030918121337890625, "learning_rate": 6.586020074280379e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.045993607491254807, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 5096 }, { "completion_length": 440.2120704650879, "epoch": 0.3974041665854657, "grad_norm": 0.09877200747387548, "kl": 0.003627777099609375, "learning_rate": 6.583697345615276e-07, "loss": 0.0001, "reward": 1.6607143580913544, "reward_std": 0.04485340788960457, "rewards/accuracy_reward": 0.662946455180645, "rewards/format_reward": 0.9977678656578064, "step": 5098 }, { "completion_length": 434.9308204650879, "epoch": 0.39756007249624853, "grad_norm": 0.10057761225896512, "kl": 0.0029544830322265625, "learning_rate": 6.58137423701105e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.051177993416786194, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 1.0, "step": 5100 }, { "completion_length": 406.0178756713867, "epoch": 0.3977159784070314, "grad_norm": 0.08149933997400735, "kl": 0.0028629302978515625, "learning_rate": 6.579050749025031e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8013393357396126, "rewards/format_reward": 1.0, "step": 5102 }, { "completion_length": 425.2835006713867, "epoch": 0.3978718843178142, "grad_norm": 0.0738559507019963, "kl": 0.003173828125, "learning_rate": 6.57672688221464e-07, "loss": 0.0001, "reward": 1.712053656578064, "reward_std": 0.05666771437972784, "rewards/accuracy_reward": 0.7120535969734192, "rewards/format_reward": 1.0, "step": 5104 }, { "completion_length": 420.18305587768555, "epoch": 0.398027790228597, "grad_norm": 0.11727589768190429, "kl": 0.003414154052734375, "learning_rate": 6.574402637137384e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.05441322550177574, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 1.0, "step": 5106 }, { "completion_length": 411.1406478881836, "epoch": 0.3981836961393799, "grad_norm": 0.07651623507897416, "kl": 0.0030689239501953125, "learning_rate": 6.572078014350868e-07, "loss": 0.0001, "reward": 1.87276791036129, "reward_std": 0.06447890773415565, "rewards/accuracy_reward": 0.8727678805589676, "rewards/format_reward": 1.0, "step": 5108 }, { "completion_length": 429.7723388671875, "epoch": 0.3983396020501627, "grad_norm": 0.06048851260790068, "kl": 0.0034923553466796875, "learning_rate": 6.569753014412781e-07, "loss": 0.0001, "reward": 1.7633929550647736, "reward_std": 0.04373771324753761, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 5110 }, { "completion_length": 407.0937728881836, "epoch": 0.39849550796094557, "grad_norm": 0.07133689941907684, "kl": 0.0026006698608398438, "learning_rate": 6.567427637880908e-07, "loss": 0.0001, "reward": 1.8504464775323868, "reward_std": 0.024797353893518448, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 5112 }, { "completion_length": 418.85716247558594, "epoch": 0.3986514138717284, "grad_norm": 0.09421878590059175, "kl": 0.0031719207763671875, "learning_rate": 6.56510188531312e-07, "loss": 0.0001, "reward": 1.7209822088479996, "reward_std": 0.041936046443879604, "rewards/accuracy_reward": 0.7209821790456772, "rewards/format_reward": 1.0, "step": 5114 }, { "completion_length": 418.16296768188477, "epoch": 0.39880731978251127, "grad_norm": 0.06753474866936954, "kl": 0.0027456283569335938, "learning_rate": 6.56277575726738e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.05846854951232672, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 5116 }, { "completion_length": 423.8817138671875, "epoch": 0.3989632256932941, "grad_norm": 0.003525962656994566, "kl": 0.0031137466430664062, "learning_rate": 6.560449254301741e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 5118 }, { "completion_length": 441.54019927978516, "epoch": 0.3991191316040769, "grad_norm": 0.1095926898166876, "kl": 0.003177642822265625, "learning_rate": 6.558122376974347e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.06951216794550419, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 5120 }, { "completion_length": 417.479923248291, "epoch": 0.39927503751485977, "grad_norm": 0.0537888320629914, "kl": 0.002986907958984375, "learning_rate": 6.55579512584343e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.03772735595703125, "rewards/accuracy_reward": 0.8526785969734192, "rewards/format_reward": 1.0, "step": 5122 }, { "completion_length": 431.04689025878906, "epoch": 0.3994309434256426, "grad_norm": 0.08863970269483304, "kl": 0.0033168792724609375, "learning_rate": 6.553467501467312e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 5124 }, { "completion_length": 429.70314025878906, "epoch": 0.39958684933642546, "grad_norm": 0.09174233722110445, "kl": 0.003269195556640625, "learning_rate": 6.551139504404405e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.04276201780885458, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 5126 }, { "completion_length": 417.10493087768555, "epoch": 0.3997427552472083, "grad_norm": 0.1127044599059519, "kl": 0.0029554367065429688, "learning_rate": 6.548811135213209e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.07725918292999268, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 5128 }, { "completion_length": 414.0111770629883, "epoch": 0.39989866115799116, "grad_norm": 0.08639806450807719, "kl": 0.0030612945556640625, "learning_rate": 6.546482394452319e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.046296700835227966, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 5130 }, { "completion_length": 416.12055587768555, "epoch": 0.400054567068774, "grad_norm": 0.09536192471083917, "kl": 0.00289154052734375, "learning_rate": 6.544153282680411e-07, "loss": 0.0001, "reward": 1.845982238650322, "reward_std": 0.04260864853858948, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 5132 }, { "completion_length": 423.0178756713867, "epoch": 0.40021047297955686, "grad_norm": 0.17043308893532044, "kl": 0.0027437210083007812, "learning_rate": 6.541823800456255e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.05929451994597912, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 5134 }, { "completion_length": 407.9799270629883, "epoch": 0.40036637889033966, "grad_norm": 0.00336601123678602, "kl": 0.0027837753295898438, "learning_rate": 6.53949394833871e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.009241949766874313, "rewards/accuracy_reward": 0.743303619325161, "rewards/format_reward": 1.0, "step": 5136 }, { "completion_length": 428.06474685668945, "epoch": 0.4005222848011225, "grad_norm": 0.0033387130054177254, "kl": 0.0029773712158203125, "learning_rate": 6.53716372688672e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 5138 }, { "completion_length": 419.7477836608887, "epoch": 0.40067819071190536, "grad_norm": 0.08564688271125252, "kl": 0.003559112548828125, "learning_rate": 6.534833136659321e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.06350321136415005, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 5140 }, { "completion_length": 429.6026916503906, "epoch": 0.4008340966226882, "grad_norm": 0.08509200740885459, "kl": 0.0036983489990234375, "learning_rate": 6.532502178215635e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.048922101967036724, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 5142 }, { "completion_length": 426.66519927978516, "epoch": 0.40099000253347106, "grad_norm": 0.09160145911290002, "kl": 0.0033416748046875, "learning_rate": 6.530170852114879e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 5144 }, { "completion_length": 420.0535888671875, "epoch": 0.4011459084442539, "grad_norm": 0.06939756481345959, "kl": 0.0029306411743164062, "learning_rate": 6.527839158916349e-07, "loss": 0.0001, "reward": 1.7433036416769028, "reward_std": 0.052307059057056904, "rewards/accuracy_reward": 0.7433035895228386, "rewards/format_reward": 1.0, "step": 5146 }, { "completion_length": 423.99108505249023, "epoch": 0.40130181435503676, "grad_norm": 0.09479643994017349, "kl": 0.003299713134765625, "learning_rate": 6.52550709917943e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 5148 }, { "completion_length": 432.7901916503906, "epoch": 0.40145772026581955, "grad_norm": 0.004401536869431581, "kl": 0.002979278564453125, "learning_rate": 6.523174673463605e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 5150 }, { "completion_length": 425.7276954650879, "epoch": 0.4016136261766024, "grad_norm": 0.06623669721658257, "kl": 0.0030517578125, "learning_rate": 6.520841882328434e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 5152 }, { "completion_length": 423.2857360839844, "epoch": 0.40176953208738525, "grad_norm": 0.09685207897884487, "kl": 0.0032682418823242188, "learning_rate": 6.51850872633357e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.04373911488801241, "rewards/accuracy_reward": 0.8727678805589676, "rewards/format_reward": 1.0, "step": 5154 }, { "completion_length": 426.3928756713867, "epoch": 0.4019254379981681, "grad_norm": 0.07421812533209021, "kl": 0.0028095245361328125, "learning_rate": 6.516175206038748e-07, "loss": 0.0001, "reward": 1.9107143431901932, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.910714328289032, "rewards/format_reward": 1.0, "step": 5156 }, { "completion_length": 422.455379486084, "epoch": 0.40208134390895095, "grad_norm": 0.106996687284609, "kl": 0.0031280517578125, "learning_rate": 6.513841322003799e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.055538653396070004, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 5158 }, { "completion_length": 428.8504638671875, "epoch": 0.4022372498197338, "grad_norm": 0.11718428459406087, "kl": 0.0029430389404296875, "learning_rate": 6.511507074788635e-07, "loss": 0.0001, "reward": 1.8772321790456772, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.877232164144516, "rewards/format_reward": 1.0, "step": 5160 }, { "completion_length": 423.04466247558594, "epoch": 0.40239315573051665, "grad_norm": 0.10339758201976808, "kl": 0.003223419189453125, "learning_rate": 6.509172464953258e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 5162 }, { "completion_length": 415.16966247558594, "epoch": 0.4025490616412995, "grad_norm": 0.08178116547962838, "kl": 0.0029754638671875, "learning_rate": 6.506837493057755e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.05687048565596342, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 0.9977678656578064, "step": 5164 }, { "completion_length": 408.52680587768555, "epoch": 0.4027049675520823, "grad_norm": 0.07493482997014313, "kl": 0.00281524658203125, "learning_rate": 6.5045021596623e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 5166 }, { "completion_length": 429.9710006713867, "epoch": 0.40286087346286514, "grad_norm": 0.07070482542376556, "kl": 0.0028934478759765625, "learning_rate": 6.502166465327155e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.05688526015728712, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 5168 }, { "completion_length": 423.8214530944824, "epoch": 0.403016779373648, "grad_norm": 0.0847742828690651, "kl": 0.0034847259521484375, "learning_rate": 6.499830410612667e-07, "loss": 0.0001, "reward": 1.8370536267757416, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 1.0, "step": 5170 }, { "completion_length": 426.9107322692871, "epoch": 0.40317268528443084, "grad_norm": 0.10878865684127008, "kl": 0.0029954910278320312, "learning_rate": 6.497493996079273e-07, "loss": 0.0001, "reward": 1.7991071939468384, "reward_std": 0.06463087350130081, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 5172 }, { "completion_length": 431.72546768188477, "epoch": 0.4033285911952137, "grad_norm": 0.05790302727831783, "kl": 0.00307464599609375, "learning_rate": 6.49515722228749e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.039833519607782364, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 5174 }, { "completion_length": 410.60716247558594, "epoch": 0.40348449710599654, "grad_norm": 0.08700282007213575, "kl": 0.0026721954345703125, "learning_rate": 6.49282008979793e-07, "loss": 0.0001, "reward": 1.8816964775323868, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8816964477300644, "rewards/format_reward": 1.0, "step": 5176 }, { "completion_length": 423.72993087768555, "epoch": 0.4036404030167794, "grad_norm": 0.10786463172593054, "kl": 0.0032320022583007812, "learning_rate": 6.490482599171282e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.05764481611549854, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 1.0, "step": 5178 }, { "completion_length": 428.16743087768555, "epoch": 0.40379630892756224, "grad_norm": 0.08277411873644805, "kl": 0.00302886962890625, "learning_rate": 6.488144750968327e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.03404070436954498, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 5180 }, { "completion_length": 419.64510345458984, "epoch": 0.40395221483834504, "grad_norm": 0.07001062704205235, "kl": 0.003192901611328125, "learning_rate": 6.485806545749927e-07, "loss": 0.0001, "reward": 1.7410715073347092, "reward_std": 0.04907547030597925, "rewards/accuracy_reward": 0.7410714700818062, "rewards/format_reward": 1.0, "step": 5182 }, { "completion_length": 422.6674270629883, "epoch": 0.4041081207491279, "grad_norm": 0.08061582041444675, "kl": 0.0028896331787109375, "learning_rate": 6.483467984077037e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.05230705998837948, "rewards/accuracy_reward": 0.8727678880095482, "rewards/format_reward": 1.0, "step": 5184 }, { "completion_length": 442.267879486084, "epoch": 0.40426402665991074, "grad_norm": 0.13325469012140456, "kl": 0.003498077392578125, "learning_rate": 6.481129066510689e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.10543645266443491, "rewards/accuracy_reward": 0.7700893133878708, "rewards/format_reward": 1.0, "step": 5186 }, { "completion_length": 424.89734268188477, "epoch": 0.4044199325706936, "grad_norm": 0.100958595780483, "kl": 0.00293731689453125, "learning_rate": 6.478789793612005e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.04501790925860405, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 0.9977678656578064, "step": 5188 }, { "completion_length": 418.8906440734863, "epoch": 0.40457583848147644, "grad_norm": 0.06364576840019008, "kl": 0.0030269622802734375, "learning_rate": 6.476450165942191e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.04260864946991205, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 5190 }, { "completion_length": 407.2210006713867, "epoch": 0.4047317443922593, "grad_norm": 0.10715811699799169, "kl": 0.0026426315307617188, "learning_rate": 6.474110184062541e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.04035275708884001, "rewards/accuracy_reward": 0.8236607387661934, "rewards/format_reward": 1.0, "step": 5192 }, { "completion_length": 414.80135345458984, "epoch": 0.40488765030304213, "grad_norm": 0.0863092391560828, "kl": 0.00286102294921875, "learning_rate": 6.47176984853443e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.06252610962837934, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 5194 }, { "completion_length": 422.783504486084, "epoch": 0.40504355621382493, "grad_norm": 0.10038355065088758, "kl": 0.002841949462890625, "learning_rate": 6.469429159919322e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.07530274521559477, "rewards/accuracy_reward": 0.8013393431901932, "rewards/format_reward": 1.0, "step": 5196 }, { "completion_length": 412.5669860839844, "epoch": 0.4051994621246078, "grad_norm": 0.05212331029120165, "kl": 0.002948760986328125, "learning_rate": 6.467088118778758e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.04764331039041281, "rewards/accuracy_reward": 0.866071455180645, "rewards/format_reward": 1.0, "step": 5198 }, { "completion_length": 432.4710006713867, "epoch": 0.40535536803539063, "grad_norm": 0.08488860594809866, "kl": 0.00315093994140625, "learning_rate": 6.464746725674372e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.04922519717365503, "rewards/accuracy_reward": 0.7857143357396126, "rewards/format_reward": 0.9977678656578064, "step": 5200 }, { "completion_length": 424.2678756713867, "epoch": 0.4055112739461735, "grad_norm": 0.0915164778709211, "kl": 0.0032558441162109375, "learning_rate": 6.462404981167881e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.042762016877532005, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 5202 }, { "completion_length": 418.4107322692871, "epoch": 0.40566717985695633, "grad_norm": 0.06454411557775691, "kl": 0.0031299591064453125, "learning_rate": 6.46006288582108e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.04937856271862984, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 1.0, "step": 5204 }, { "completion_length": 425.5022506713867, "epoch": 0.4058230857677392, "grad_norm": 0.07342903177874134, "kl": 0.003589630126953125, "learning_rate": 6.457720440195858e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.04373771324753761, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 5206 }, { "completion_length": 418.68974685668945, "epoch": 0.405978991678522, "grad_norm": 0.0661751193222636, "kl": 0.0029993057250976562, "learning_rate": 6.455377644854176e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.025100448168814182, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 5208 }, { "completion_length": 422.1093940734863, "epoch": 0.4061348975893049, "grad_norm": 0.12115407903149895, "kl": 0.0029582977294921875, "learning_rate": 6.45303450035809e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.05425985902547836, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 5210 }, { "completion_length": 436.658504486084, "epoch": 0.40629080350008767, "grad_norm": 0.07741779362578781, "kl": 0.0037708282470703125, "learning_rate": 6.450691007269736e-07, "loss": 0.0002, "reward": 1.8102679252624512, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 5212 }, { "completion_length": 416.0937690734863, "epoch": 0.4064467094108705, "grad_norm": 0.08611151992650817, "kl": 0.0026006698608398438, "learning_rate": 6.448347166151331e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.03141390159726143, "rewards/accuracy_reward": 0.8861607611179352, "rewards/format_reward": 1.0, "step": 5214 }, { "completion_length": 413.5937690734863, "epoch": 0.40660261532165337, "grad_norm": 0.06981506909122248, "kl": 0.0033473968505859375, "learning_rate": 6.446002977565178e-07, "loss": 0.0001, "reward": 1.8816964626312256, "reward_std": 0.0539567656815052, "rewards/accuracy_reward": 0.8816964477300644, "rewards/format_reward": 1.0, "step": 5216 }, { "completion_length": 426.32814025878906, "epoch": 0.4067585212324362, "grad_norm": 0.07266674550894325, "kl": 0.00301361083984375, "learning_rate": 6.443658442073663e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 5218 }, { "completion_length": 418.4531440734863, "epoch": 0.40691442714321907, "grad_norm": 0.11477321664125129, "kl": 0.002819061279296875, "learning_rate": 6.441313560239255e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.060270216315984726, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 5220 }, { "completion_length": 420.0870666503906, "epoch": 0.4070703330540019, "grad_norm": 0.09062222882131792, "kl": 0.00266265869140625, "learning_rate": 6.438968332624506e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.028485405258834362, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 5222 }, { "completion_length": 412.7366256713867, "epoch": 0.40722623896478477, "grad_norm": 0.07447709870867913, "kl": 0.003490447998046875, "learning_rate": 6.436622759792051e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.02382165566086769, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 5224 }, { "completion_length": 412.8281440734863, "epoch": 0.4073821448755676, "grad_norm": 0.10145514774541776, "kl": 0.0028705596923828125, "learning_rate": 6.434276842304609e-07, "loss": 0.0001, "reward": 1.7812501043081284, "reward_std": 0.052773963660001755, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 0.9977678656578064, "step": 5226 }, { "completion_length": 427.6093978881836, "epoch": 0.4075380507863504, "grad_norm": 0.09038974635827898, "kl": 0.0030345916748046875, "learning_rate": 6.431930580724979e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.060726676136255264, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 5228 }, { "completion_length": 430.17635345458984, "epoch": 0.40769395669713326, "grad_norm": 0.0972675772307071, "kl": 0.0028734207153320312, "learning_rate": 6.429583975616044e-07, "loss": 0.0001, "reward": 1.832589328289032, "reward_std": 0.05831741727888584, "rewards/accuracy_reward": 0.8325893059372902, "rewards/format_reward": 1.0, "step": 5230 }, { "completion_length": 421.05582427978516, "epoch": 0.4078498626079161, "grad_norm": 0.07886429262588279, "kl": 0.0030155181884765625, "learning_rate": 6.427237027540771e-07, "loss": 0.0001, "reward": 1.8616071939468384, "reward_std": 0.05636462103575468, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 1.0, "step": 5232 }, { "completion_length": 419.57814025878906, "epoch": 0.40800576851869896, "grad_norm": 0.08401867467221036, "kl": 0.002948760986328125, "learning_rate": 6.424889737062209e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.07094432692974806, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 5234 }, { "completion_length": 414.6183204650879, "epoch": 0.4081616744294818, "grad_norm": 0.08314138214839903, "kl": 0.0028896331787109375, "learning_rate": 6.422542104743485e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.042762016877532005, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 5236 }, { "completion_length": 428.7634086608887, "epoch": 0.40831758034026466, "grad_norm": 0.06985320309811426, "kl": 0.0036773681640625, "learning_rate": 6.420194131147813e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.06756077148020267, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 5238 }, { "completion_length": 411.58707427978516, "epoch": 0.4084734862510475, "grad_norm": 0.11512217472358759, "kl": 0.0037221908569335938, "learning_rate": 6.417845816838486e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.06319871544837952, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 5240 }, { "completion_length": 432.1651954650879, "epoch": 0.40862939216183036, "grad_norm": 0.06470976322305637, "kl": 0.003082275390625, "learning_rate": 6.415497162378879e-07, "loss": 0.0001, "reward": 1.9330357760190964, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.9330357387661934, "rewards/format_reward": 1.0, "step": 5242 }, { "completion_length": 426.558048248291, "epoch": 0.40878529807261316, "grad_norm": 0.10379366618263267, "kl": 0.003265380859375, "learning_rate": 6.413148168332449e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.7433036081492901, "rewards/format_reward": 1.0, "step": 5244 }, { "completion_length": 413.2366256713867, "epoch": 0.408941203983396, "grad_norm": 0.11462168915615235, "kl": 0.0031557083129882812, "learning_rate": 6.410798835262738e-07, "loss": 0.0001, "reward": 1.7433036416769028, "reward_std": 0.052979664877057076, "rewards/accuracy_reward": 0.7433036118745804, "rewards/format_reward": 1.0, "step": 5246 }, { "completion_length": 415.0089530944824, "epoch": 0.40909710989417886, "grad_norm": 0.046300094755954646, "kl": 0.0031232833862304688, "learning_rate": 6.408449163733365e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.05246042460203171, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 5248 }, { "completion_length": 417.89733505249023, "epoch": 0.4092530158049617, "grad_norm": 0.09489017197063071, "kl": 0.0029754638671875, "learning_rate": 6.406099154308028e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.055538651533424854, "rewards/accuracy_reward": 0.8437500223517418, "rewards/format_reward": 1.0, "step": 5250 }, { "completion_length": 408.3169860839844, "epoch": 0.40940892171574456, "grad_norm": 0.06335086531162663, "kl": 0.003070831298828125, "learning_rate": 6.403748807550511e-07, "loss": 0.0001, "reward": 1.8616071939468384, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 5252 }, { "completion_length": 421.63618087768555, "epoch": 0.4095648276265274, "grad_norm": 0.06291859120724039, "kl": 0.002956390380859375, "learning_rate": 6.401398124024677e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.03547286428511143, "rewards/accuracy_reward": 0.8683036118745804, "rewards/format_reward": 1.0, "step": 5254 }, { "completion_length": 415.8058204650879, "epoch": 0.40972073353731026, "grad_norm": 0.07624378236185923, "kl": 0.00301361083984375, "learning_rate": 6.399047104294471e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 5256 }, { "completion_length": 417.58484268188477, "epoch": 0.40987663944809305, "grad_norm": 0.06547499446888914, "kl": 0.0028429031372070312, "learning_rate": 6.396695748923918e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.861607164144516, "rewards/format_reward": 1.0, "step": 5258 }, { "completion_length": 402.33707427978516, "epoch": 0.4100325453588759, "grad_norm": 0.09431957497730993, "kl": 0.0025796890258789062, "learning_rate": 6.394344058477122e-07, "loss": 0.0001, "reward": 1.8950893878936768, "reward_std": 0.048099770210683346, "rewards/accuracy_reward": 0.8950893059372902, "rewards/format_reward": 1.0, "step": 5260 }, { "completion_length": 427.5669860839844, "epoch": 0.41018845126965875, "grad_norm": 0.046695196735413384, "kl": 0.002994537353515625, "learning_rate": 6.39199203351827e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.03840136155486107, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 5262 }, { "completion_length": 417.854923248291, "epoch": 0.4103443571804416, "grad_norm": 0.0878508348378597, "kl": 0.0033588409423828125, "learning_rate": 6.389639674611625e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.8281250223517418, "rewards/format_reward": 1.0, "step": 5264 }, { "completion_length": 417.42412185668945, "epoch": 0.41050026309122445, "grad_norm": 0.08276955439329663, "kl": 0.003307342529296875, "learning_rate": 6.387286982321536e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.04599360562860966, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 1.0, "step": 5266 }, { "completion_length": 412.8928756713867, "epoch": 0.4106561690020073, "grad_norm": 0.10379426680925914, "kl": 0.0029754638671875, "learning_rate": 6.384933957212428e-07, "loss": 0.0001, "reward": 1.8169643431901932, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 5268 }, { "completion_length": 420.6183204650879, "epoch": 0.41081207491279015, "grad_norm": 0.09636932514961731, "kl": 0.0031795501708984375, "learning_rate": 6.382580599848806e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 5270 }, { "completion_length": 421.42859268188477, "epoch": 0.410967980823573, "grad_norm": 0.09138101463413895, "kl": 0.0032520294189453125, "learning_rate": 6.380226910795254e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.05831881985068321, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 5272 }, { "completion_length": 434.64734268188477, "epoch": 0.4111238867343558, "grad_norm": 0.12708100120693233, "kl": 0.00307464599609375, "learning_rate": 6.37787289061644e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.081682613119483, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 5274 }, { "completion_length": 413.1852912902832, "epoch": 0.41127979264513864, "grad_norm": 0.13805111383722032, "kl": 0.003368377685546875, "learning_rate": 6.375518539877109e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.05734171997755766, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 5276 }, { "completion_length": 419.2611770629883, "epoch": 0.4114356985559215, "grad_norm": 0.12597945774660044, "kl": 0.0042934417724609375, "learning_rate": 6.373163859142082e-07, "loss": 0.0002, "reward": 1.7745536714792252, "reward_std": 0.07823347672820091, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 5278 }, { "completion_length": 423.4419822692871, "epoch": 0.41159160446670434, "grad_norm": 0.10905828534841118, "kl": 0.0030059814453125, "learning_rate": 6.370808848976263e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.06853646971285343, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 5280 }, { "completion_length": 420.02903747558594, "epoch": 0.4117475103774872, "grad_norm": 0.08793233842091813, "kl": 0.0028743743896484375, "learning_rate": 6.368453509944635e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.048620409332215786, "rewards/accuracy_reward": 0.7633928805589676, "rewards/format_reward": 1.0, "step": 5282 }, { "completion_length": 424.0491256713867, "epoch": 0.41190341628827004, "grad_norm": 0.09439148970011632, "kl": 0.0026979446411132812, "learning_rate": 6.366097842612259e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.06590966694056988, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 0.9955357164144516, "step": 5284 }, { "completion_length": 401.5625190734863, "epoch": 0.4120593221990529, "grad_norm": 0.058535246289018364, "kl": 0.0026025772094726562, "learning_rate": 6.363741847544272e-07, "loss": 0.0001, "reward": 1.8571429550647736, "reward_std": 0.060052672401070595, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 5286 }, { "completion_length": 420.0178756713867, "epoch": 0.41221522810983574, "grad_norm": 0.12067465446452097, "kl": 0.0030975341796875, "learning_rate": 6.361385525305896e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.03644856344908476, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 5288 }, { "completion_length": 429.7254638671875, "epoch": 0.41237113402061853, "grad_norm": 0.07266019555044052, "kl": 0.00334930419921875, "learning_rate": 6.359028876462424e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.06380490399897099, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 5290 }, { "completion_length": 415.4352836608887, "epoch": 0.4125270399314014, "grad_norm": 0.1184165463352231, "kl": 0.0027132034301757812, "learning_rate": 6.356671901579234e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.058620511554181576, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 1.0, "step": 5292 }, { "completion_length": 408.3951072692871, "epoch": 0.41268294584218423, "grad_norm": 0.09673112018088725, "kl": 0.002872467041015625, "learning_rate": 6.354314601221777e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.05523555539548397, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 5294 }, { "completion_length": 401.6116256713867, "epoch": 0.4128388517529671, "grad_norm": 0.04586185037688329, "kl": 0.0029449462890625, "learning_rate": 6.351956975955586e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.02720661275088787, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 1.0, "step": 5296 }, { "completion_length": 416.9553756713867, "epoch": 0.41299475766374993, "grad_norm": 0.12602876136406527, "kl": 0.003093719482421875, "learning_rate": 6.349599026346272e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.07350331265479326, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 0.9977678656578064, "step": 5298 }, { "completion_length": 416.99555587768555, "epoch": 0.4131506635745328, "grad_norm": 0.10028580817876687, "kl": 0.00284576416015625, "learning_rate": 6.34724075295952e-07, "loss": 0.0001, "reward": 1.8370536118745804, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8370535895228386, "rewards/format_reward": 1.0, "step": 5300 }, { "completion_length": 434.73662185668945, "epoch": 0.41330656948531563, "grad_norm": 0.11403601845035603, "kl": 0.00334930419921875, "learning_rate": 6.344882156361094e-07, "loss": 0.0001, "reward": 1.7433036416769028, "reward_std": 0.05764481518417597, "rewards/accuracy_reward": 0.7433036044239998, "rewards/format_reward": 1.0, "step": 5302 }, { "completion_length": 420.0826072692871, "epoch": 0.41346247539609843, "grad_norm": 0.23889223105811602, "kl": 0.00417327880859375, "learning_rate": 6.342523237116838e-07, "loss": 0.0002, "reward": 1.7767858058214188, "reward_std": 0.07387422397732735, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 5304 }, { "completion_length": 411.23662185668945, "epoch": 0.4136183813068813, "grad_norm": 0.09025342161815325, "kl": 0.0031843185424804688, "learning_rate": 6.340163995792671e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.05328275915235281, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 0.9977678656578064, "step": 5306 }, { "completion_length": 422.1451072692871, "epoch": 0.4137742872176641, "grad_norm": 0.07164525320311955, "kl": 0.00295257568359375, "learning_rate": 6.337804432954591e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.053284160792827606, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 5308 }, { "completion_length": 426.9062690734863, "epoch": 0.413930193128447, "grad_norm": 0.1050986668859045, "kl": 0.0032901763916015625, "learning_rate": 6.335444549168673e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.08146647177636623, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 0.9977678656578064, "step": 5310 }, { "completion_length": 410.2768020629883, "epoch": 0.4140860990392298, "grad_norm": 0.0832541079690569, "kl": 0.00272369384765625, "learning_rate": 6.333084345001065e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 0.9977678656578064, "step": 5312 }, { "completion_length": 417.42189025878906, "epoch": 0.4142420049500127, "grad_norm": 0.04931072240291526, "kl": 0.0028839111328125, "learning_rate": 6.330723821017998e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 5314 }, { "completion_length": 421.4442138671875, "epoch": 0.4143979108607955, "grad_norm": 0.11577236560694945, "kl": 0.0030364990234375, "learning_rate": 6.328362977785775e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.050354259088635445, "rewards/accuracy_reward": 0.7991071753203869, "rewards/format_reward": 1.0, "step": 5316 }, { "completion_length": 426.5245704650879, "epoch": 0.4145538167715784, "grad_norm": 0.0495238248851008, "kl": 0.0033283233642578125, "learning_rate": 6.326001815870778e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.022845957428216934, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 5318 }, { "completion_length": 417.9218940734863, "epoch": 0.41470972268236117, "grad_norm": 0.04095625642316834, "kl": 0.002986907958984375, "learning_rate": 6.323640335839467e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 5320 }, { "completion_length": 424.1004638671875, "epoch": 0.414865628593144, "grad_norm": 0.003126955897176922, "kl": 0.0026273727416992188, "learning_rate": 6.321278538258372e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.038030450232326984, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 0.9977678656578064, "step": 5322 }, { "completion_length": 412.5580520629883, "epoch": 0.41502153450392687, "grad_norm": 0.09919389753196428, "kl": 0.0031480789184570312, "learning_rate": 6.318916423694108e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.04696930479258299, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 5324 }, { "completion_length": 420.7388496398926, "epoch": 0.4151774404147097, "grad_norm": 0.0858504890934363, "kl": 0.0030574798583984375, "learning_rate": 6.316553992713357e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.039680153131484985, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 5326 }, { "completion_length": 424.1674346923828, "epoch": 0.41533334632549257, "grad_norm": 0.0791779096983694, "kl": 0.00318145751953125, "learning_rate": 6.314191245882882e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.06124731805175543, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 5328 }, { "completion_length": 410.73662185668945, "epoch": 0.4154892522362754, "grad_norm": 0.07258432697463933, "kl": 0.0026693344116210938, "learning_rate": 6.311828183769526e-07, "loss": 0.0001, "reward": 1.8191964775323868, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8214285932481289, "rewards/format_reward": 0.9977678656578064, "step": 5330 }, { "completion_length": 406.1897506713867, "epoch": 0.41564515814705827, "grad_norm": 0.09388490762410662, "kl": 0.00281524658203125, "learning_rate": 6.309464806940196e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.05636602267622948, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 5332 }, { "completion_length": 416.6919822692871, "epoch": 0.4158010640578411, "grad_norm": 0.12727186324169706, "kl": 0.0028867721557617188, "learning_rate": 6.307101115961886e-07, "loss": 0.0001, "reward": 1.7968750596046448, "reward_std": 0.06740740407258272, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 5334 }, { "completion_length": 424.17635345458984, "epoch": 0.4159569699686239, "grad_norm": 0.05045329981797498, "kl": 0.0030002593994140625, "learning_rate": 6.304737111401658e-07, "loss": 0.0001, "reward": 1.912946492433548, "reward_std": 0.039680153131484985, "rewards/accuracy_reward": 0.9129464775323868, "rewards/format_reward": 1.0, "step": 5336 }, { "completion_length": 420.517879486084, "epoch": 0.41611287587940676, "grad_norm": 0.06752755531540336, "kl": 0.0030813217163085938, "learning_rate": 6.302372793826653e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 5338 }, { "completion_length": 426.1897506713867, "epoch": 0.4162687817901896, "grad_norm": 0.1110129548104985, "kl": 0.002964019775390625, "learning_rate": 6.300008163804084e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.04809976927936077, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 5340 }, { "completion_length": 423.6808204650879, "epoch": 0.41642468770097246, "grad_norm": 0.0612044237488784, "kl": 0.0033893585205078125, "learning_rate": 6.297643221901244e-07, "loss": 0.0001, "reward": 1.7767857611179352, "reward_std": 0.04065585043281317, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 5342 }, { "completion_length": 418.6227912902832, "epoch": 0.4165805936117553, "grad_norm": 0.09186570242202381, "kl": 0.00342559814453125, "learning_rate": 6.295277968685496e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.044714814983308315, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 5344 }, { "completion_length": 414.7276954650879, "epoch": 0.41673649952253816, "grad_norm": 0.09970745707315289, "kl": 0.0029516220092773438, "learning_rate": 6.29291240472428e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.05425985902547836, "rewards/accuracy_reward": 0.7321428954601288, "rewards/format_reward": 1.0, "step": 5346 }, { "completion_length": 428.3660888671875, "epoch": 0.416892405433321, "grad_norm": 0.06582916834258272, "kl": 0.0032787322998046875, "learning_rate": 6.290546530585108e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.041681041941046715, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 0.9977678656578064, "step": 5348 }, { "completion_length": 424.814754486084, "epoch": 0.4170483113441038, "grad_norm": 0.07291175430917589, "kl": 0.0033092498779296875, "learning_rate": 6.288180346835571e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 5350 }, { "completion_length": 422.4799270629883, "epoch": 0.41720421725488666, "grad_norm": 0.08165366440489598, "kl": 0.0028057098388671875, "learning_rate": 6.28581385404333e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.7857143357396126, "rewards/format_reward": 1.0, "step": 5352 }, { "completion_length": 428.6339454650879, "epoch": 0.4173601231656695, "grad_norm": 0.09053721134063489, "kl": 0.00301361083984375, "learning_rate": 6.283447052776122e-07, "loss": 0.0001, "reward": 1.9017857611179352, "reward_std": 0.02382165566086769, "rewards/accuracy_reward": 0.901785746216774, "rewards/format_reward": 1.0, "step": 5354 }, { "completion_length": 423.3951110839844, "epoch": 0.41751602907645236, "grad_norm": 0.13289417710365342, "kl": 0.0035419464111328125, "learning_rate": 6.281079943601757e-07, "loss": 0.0001, "reward": 1.8214286267757416, "reward_std": 0.07808151375502348, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 5356 }, { "completion_length": 411.3035888671875, "epoch": 0.4176719349872352, "grad_norm": 0.10389959154732986, "kl": 0.0029840469360351562, "learning_rate": 6.278712527088122e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.06267947610467672, "rewards/accuracy_reward": 0.7723214477300644, "rewards/format_reward": 1.0, "step": 5358 }, { "completion_length": 433.38841247558594, "epoch": 0.41782784089801805, "grad_norm": 0.13179623195780096, "kl": 0.00353240966796875, "learning_rate": 6.276344803803173e-07, "loss": 0.0001, "reward": 1.87276791036129, "reward_std": 0.0506573561578989, "rewards/accuracy_reward": 0.8727678805589676, "rewards/format_reward": 1.0, "step": 5360 }, { "completion_length": 422.0401916503906, "epoch": 0.4179837468088009, "grad_norm": 0.05054923055507408, "kl": 0.0029888153076171875, "learning_rate": 6.273976774314943e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.05425985995680094, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 5362 }, { "completion_length": 408.18974685668945, "epoch": 0.41813965271958375, "grad_norm": 0.09313865375102989, "kl": 0.0029754638671875, "learning_rate": 6.271608439191537e-07, "loss": 0.0001, "reward": 1.8013393878936768, "reward_std": 0.07680412102490664, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 5364 }, { "completion_length": 434.33484268188477, "epoch": 0.41829555863036655, "grad_norm": 0.08854413990976624, "kl": 0.0035915374755859375, "learning_rate": 6.269239799001134e-07, "loss": 0.0001, "reward": 1.7053572237491608, "reward_std": 0.05298106372356415, "rewards/accuracy_reward": 0.7053571715950966, "rewards/format_reward": 1.0, "step": 5366 }, { "completion_length": 425.0044822692871, "epoch": 0.4184514645411494, "grad_norm": 0.094975330581955, "kl": 0.0027360916137695312, "learning_rate": 6.266870854311987e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.04764331132173538, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 5368 }, { "completion_length": 414.7723388671875, "epoch": 0.41860737045193225, "grad_norm": 0.003157150837281177, "kl": 0.0026330947875976562, "learning_rate": 6.264501605692417e-07, "loss": 0.0001, "reward": 1.9129464775323868, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.912946455180645, "rewards/format_reward": 1.0, "step": 5370 }, { "completion_length": 416.00447845458984, "epoch": 0.4187632763627151, "grad_norm": 0.14024514209248978, "kl": 0.0026922225952148438, "learning_rate": 6.262132053710828e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.06831528805196285, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 5372 }, { "completion_length": 422.7835006713867, "epoch": 0.41891918227349795, "grad_norm": 0.08275756238779836, "kl": 0.00286865234375, "learning_rate": 6.259762198935687e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.051331364549696445, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 5374 }, { "completion_length": 419.6808204650879, "epoch": 0.4190750881842808, "grad_norm": 0.07328469412975555, "kl": 0.0031375885009765625, "learning_rate": 6.257392041935535e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.8727678880095482, "rewards/format_reward": 1.0, "step": 5376 }, { "completion_length": 418.06251525878906, "epoch": 0.41923099409506365, "grad_norm": 0.09658610996924899, "kl": 0.0030298233032226562, "learning_rate": 6.255021583278993e-07, "loss": 0.0001, "reward": 1.8906250894069672, "reward_std": 0.04712267126888037, "rewards/accuracy_reward": 0.8906250521540642, "rewards/format_reward": 1.0, "step": 5378 }, { "completion_length": 418.47993087768555, "epoch": 0.4193869000058465, "grad_norm": 0.04514135995333049, "kl": 0.002490997314453125, "learning_rate": 6.252650823534746e-07, "loss": 0.0001, "reward": 1.8883928954601288, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.888392873108387, "rewards/format_reward": 1.0, "step": 5380 }, { "completion_length": 432.6205520629883, "epoch": 0.4195428059166293, "grad_norm": 0.05972219540703138, "kl": 0.003574371337890625, "learning_rate": 6.250279763271557e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.7500000447034836, "rewards/format_reward": 1.0, "step": 5382 }, { "completion_length": 413.52010345458984, "epoch": 0.41969871182741214, "grad_norm": 0.06756924777252576, "kl": 0.0029840469360351562, "learning_rate": 6.247908403058255e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 5384 }, { "completion_length": 424.5759086608887, "epoch": 0.419854617738195, "grad_norm": 0.10926760020495514, "kl": 0.0033588409423828125, "learning_rate": 6.245536743463746e-07, "loss": 0.0001, "reward": 1.72991082072258, "reward_std": 0.08018768019974232, "rewards/accuracy_reward": 0.7299107424914837, "rewards/format_reward": 1.0, "step": 5386 }, { "completion_length": 419.22769927978516, "epoch": 0.42001052364897784, "grad_norm": 0.07977104369047187, "kl": 0.0027866363525390625, "learning_rate": 6.243164785057007e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.02720661275088787, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 5388 }, { "completion_length": 425.39957427978516, "epoch": 0.4201664295597607, "grad_norm": 0.06798669796032422, "kl": 0.00299072265625, "learning_rate": 6.240792528407085e-07, "loss": 0.0001, "reward": 1.7656250596046448, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.7656250447034836, "rewards/format_reward": 1.0, "step": 5390 }, { "completion_length": 429.02680587768555, "epoch": 0.42032233547054354, "grad_norm": 0.0624017013000988, "kl": 0.002994537353515625, "learning_rate": 6.238419974083103e-07, "loss": 0.0001, "reward": 1.7098215371370316, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.7098214626312256, "rewards/format_reward": 1.0, "step": 5392 }, { "completion_length": 423.502254486084, "epoch": 0.4204782413813264, "grad_norm": 9.39996208307494, "kl": 0.03324699401855469, "learning_rate": 6.236047122654245e-07, "loss": 0.0013, "reward": 1.834821492433548, "reward_std": 0.04080921597778797, "rewards/accuracy_reward": 0.8370536118745804, "rewards/format_reward": 0.9977678656578064, "step": 5394 }, { "completion_length": 428.0067138671875, "epoch": 0.42063414729210924, "grad_norm": 0.09485016128885541, "kl": 0.0029468536376953125, "learning_rate": 6.233673974689779e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.04629670176655054, "rewards/accuracy_reward": 0.8236607313156128, "rewards/format_reward": 1.0, "step": 5396 }, { "completion_length": 416.5513572692871, "epoch": 0.42079005320289203, "grad_norm": 0.05897932512641789, "kl": 0.002811431884765625, "learning_rate": 6.231300530759037e-07, "loss": 0.0001, "reward": 1.7410715222358704, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.7410714775323868, "rewards/format_reward": 1.0, "step": 5398 }, { "completion_length": 419.0535888671875, "epoch": 0.4209459591136749, "grad_norm": 0.0876977842464778, "kl": 0.0031032562255859375, "learning_rate": 6.228926791431422e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.03983352147042751, "rewards/accuracy_reward": 0.8191964477300644, "rewards/format_reward": 1.0, "step": 5400 }, { "completion_length": 421.948673248291, "epoch": 0.42110186502445773, "grad_norm": 0.13148502635753115, "kl": 0.0029344558715820312, "learning_rate": 6.226552757276411e-07, "loss": 0.0001, "reward": 1.8683036267757416, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 5402 }, { "completion_length": 439.6629638671875, "epoch": 0.4212577709352406, "grad_norm": 0.11999242499659193, "kl": 0.0031986236572265625, "learning_rate": 6.224178428863549e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.06560657173395157, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 0.9977678656578064, "step": 5404 }, { "completion_length": 422.3437690734863, "epoch": 0.42141367684602343, "grad_norm": 0.09794663204933993, "kl": 0.0032672882080078125, "learning_rate": 6.221803806762453e-07, "loss": 0.0001, "reward": 1.82589291036129, "reward_std": 0.06282780319452286, "rewards/accuracy_reward": 0.825892873108387, "rewards/format_reward": 1.0, "step": 5406 }, { "completion_length": 416.02903747558594, "epoch": 0.4215695827568063, "grad_norm": 0.0801230382573707, "kl": 0.002857208251953125, "learning_rate": 6.21942889154281e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 5408 }, { "completion_length": 426.1250228881836, "epoch": 0.42172548866758913, "grad_norm": 0.08064659273826147, "kl": 0.0027990341186523438, "learning_rate": 6.217053683774376e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.03501640260219574, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 5410 }, { "completion_length": 422.3236770629883, "epoch": 0.4218813945783719, "grad_norm": 0.08760606379592105, "kl": 0.0025787353515625, "learning_rate": 6.21467818402698e-07, "loss": 0.0001, "reward": 1.7321429401636124, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.7321428954601288, "rewards/format_reward": 1.0, "step": 5412 }, { "completion_length": 430.2857322692871, "epoch": 0.4220373004891548, "grad_norm": 0.08298869881262902, "kl": 0.0032825469970703125, "learning_rate": 6.212302392870521e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8772321790456772, "rewards/format_reward": 1.0, "step": 5414 }, { "completion_length": 423.346004486084, "epoch": 0.4221932063999376, "grad_norm": 0.12293741149265786, "kl": 0.0030260086059570312, "learning_rate": 6.209926310874963e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.05441322550177574, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 0.9977678656578064, "step": 5416 }, { "completion_length": 424.4241256713867, "epoch": 0.4223491123107205, "grad_norm": 0.07165892111375571, "kl": 0.0029163360595703125, "learning_rate": 6.207549938610345e-07, "loss": 0.0001, "reward": 1.7343750596046448, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 5418 }, { "completion_length": 428.52457427978516, "epoch": 0.4225050182215033, "grad_norm": 0.11721296560899809, "kl": 0.002986907958984375, "learning_rate": 6.205173276646774e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.03937846049666405, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 5420 }, { "completion_length": 430.5290336608887, "epoch": 0.4226609241322862, "grad_norm": 0.07330224244184264, "kl": 0.0031108856201171875, "learning_rate": 6.202796325554428e-07, "loss": 0.0001, "reward": 1.7924107760190964, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 5422 }, { "completion_length": 429.61832427978516, "epoch": 0.422816830043069, "grad_norm": 0.10387694474281425, "kl": 0.0039463043212890625, "learning_rate": 6.200419085903549e-07, "loss": 0.0002, "reward": 1.7678572237491608, "reward_std": 0.05831881985068321, "rewards/accuracy_reward": 0.767857164144516, "rewards/format_reward": 1.0, "step": 5424 }, { "completion_length": 439.7790412902832, "epoch": 0.4229727359538519, "grad_norm": 0.0970091051184646, "kl": 0.0036029815673828125, "learning_rate": 6.198041558264456e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.07597534824162722, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 5426 }, { "completion_length": 424.77903747558594, "epoch": 0.42312864186463467, "grad_norm": 0.12589436165310727, "kl": 0.003086090087890625, "learning_rate": 6.19566374320753e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 5428 }, { "completion_length": 413.01118087768555, "epoch": 0.4232845477754175, "grad_norm": 0.04962514538334336, "kl": 0.002941131591796875, "learning_rate": 6.193285641303226e-07, "loss": 0.0001, "reward": 1.9308036267757416, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.9308035969734192, "rewards/format_reward": 1.0, "step": 5430 }, { "completion_length": 426.0089454650879, "epoch": 0.42344045368620037, "grad_norm": 0.14283575717722052, "kl": 0.0035762786865234375, "learning_rate": 6.190907253122064e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.07921057753264904, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 0.9977678656578064, "step": 5432 }, { "completion_length": 420.7366256713867, "epoch": 0.4235963595969832, "grad_norm": 0.08187264328294099, "kl": 0.0033168792724609375, "learning_rate": 6.188528579234638e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 5434 }, { "completion_length": 424.6964454650879, "epoch": 0.42375226550776607, "grad_norm": 0.1288895720126785, "kl": 0.00308990478515625, "learning_rate": 6.186149620211602e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.06365517433732748, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 5436 }, { "completion_length": 418.96430587768555, "epoch": 0.4239081714185489, "grad_norm": 0.08492928729727056, "kl": 0.00292205810546875, "learning_rate": 6.183770376623685e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.04260864853858948, "rewards/accuracy_reward": 0.8727678805589676, "rewards/format_reward": 1.0, "step": 5438 }, { "completion_length": 425.4955520629883, "epoch": 0.42406407732933177, "grad_norm": 0.11003766978031675, "kl": 0.003246307373046875, "learning_rate": 6.181390849041688e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 5440 }, { "completion_length": 428.6361846923828, "epoch": 0.4242199832401146, "grad_norm": 0.08218036518950503, "kl": 0.0027446746826171875, "learning_rate": 6.179011038036469e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.037727355025708675, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 1.0, "step": 5442 }, { "completion_length": 416.5669860839844, "epoch": 0.4243758891508974, "grad_norm": 0.09813191896848467, "kl": 0.0028533935546875, "learning_rate": 6.176630944178961e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8571429029107094, "rewards/format_reward": 1.0, "step": 5444 }, { "completion_length": 421.79019927978516, "epoch": 0.42453179506168026, "grad_norm": 0.1233135194252227, "kl": 0.0027370452880859375, "learning_rate": 6.174250568040165e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.06981526222079992, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 5446 }, { "completion_length": 430.596004486084, "epoch": 0.4246877009724631, "grad_norm": 0.12879407013478159, "kl": 0.0031719207763671875, "learning_rate": 6.171869910191149e-07, "loss": 0.0001, "reward": 1.7388393729925156, "reward_std": 0.06621275749057531, "rewards/accuracy_reward": 0.7388393133878708, "rewards/format_reward": 1.0, "step": 5448 }, { "completion_length": 432.82591247558594, "epoch": 0.42484360688324596, "grad_norm": 0.07777483308705518, "kl": 0.0034427642822265625, "learning_rate": 6.169488971203046e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 5450 }, { "completion_length": 432.02680587768555, "epoch": 0.4249995127940288, "grad_norm": 0.14132304602934914, "kl": 0.003391265869140625, "learning_rate": 6.167107751647058e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.06447890866547823, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 5452 }, { "completion_length": 422.2634086608887, "epoch": 0.42515541870481166, "grad_norm": 0.08671910077043207, "kl": 0.0031633377075195312, "learning_rate": 6.164726252094459e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.04178631864488125, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 5454 }, { "completion_length": 415.0379638671875, "epoch": 0.4253113246155945, "grad_norm": 0.0031799177862303924, "kl": 0.002918243408203125, "learning_rate": 6.162344473116582e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.022845957428216934, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 1.0, "step": 5456 }, { "completion_length": 424.6518020629883, "epoch": 0.4254672305263773, "grad_norm": 0.10903882547475298, "kl": 0.0030975341796875, "learning_rate": 6.159962415284832e-07, "loss": 0.0001, "reward": 1.8928572237491608, "reward_std": 0.05734171997755766, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 5458 }, { "completion_length": 423.0000114440918, "epoch": 0.42562313643716015, "grad_norm": 0.08707674493311096, "kl": 0.0031213760375976562, "learning_rate": 6.15758007917068e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.052003965713083744, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 1.0, "step": 5460 }, { "completion_length": 428.15180587768555, "epoch": 0.425779042347943, "grad_norm": 0.11340444625974033, "kl": 0.0029726028442382812, "learning_rate": 6.155197465345664e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.06560797337442636, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 5462 }, { "completion_length": 423.1495704650879, "epoch": 0.42593494825872585, "grad_norm": 0.06578742427423874, "kl": 0.00295257568359375, "learning_rate": 6.152814574381388e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8973214626312256, "rewards/format_reward": 1.0, "step": 5464 }, { "completion_length": 439.9710006713867, "epoch": 0.4260908541695087, "grad_norm": 0.10036891458458153, "kl": 0.0030202865600585938, "learning_rate": 6.150431406849523e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.0450179111212492, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 5466 }, { "completion_length": 427.6406478881836, "epoch": 0.42624676008029155, "grad_norm": 0.08284543235727461, "kl": 0.0027790069580078125, "learning_rate": 6.148047963321807e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.026077548041939735, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 5468 }, { "completion_length": 420.7165298461914, "epoch": 0.4264026659910744, "grad_norm": 0.06848442299444146, "kl": 0.0031585693359375, "learning_rate": 6.145664244370042e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.05493246205151081, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 5470 }, { "completion_length": 423.42635345458984, "epoch": 0.42655857190185725, "grad_norm": 0.08499067552768692, "kl": 0.0029821395874023438, "learning_rate": 6.143280250566095e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.9040178880095482, "rewards/format_reward": 1.0, "step": 5472 }, { "completion_length": 420.15403747558594, "epoch": 0.42671447781264005, "grad_norm": 0.06356380918450531, "kl": 0.003162384033203125, "learning_rate": 6.140895982481908e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 1.0, "step": 5474 }, { "completion_length": 428.0111770629883, "epoch": 0.4268703837234229, "grad_norm": 0.09434196154506642, "kl": 0.0026769638061523438, "learning_rate": 6.138511440689477e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.06899152789264917, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 1.0, "step": 5476 }, { "completion_length": 418.57591247558594, "epoch": 0.42702628963420575, "grad_norm": 0.0030102387551190725, "kl": 0.0027675628662109375, "learning_rate": 6.13612662576087e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.015555404126644135, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 1.0, "step": 5478 }, { "completion_length": 417.2366256713867, "epoch": 0.4271821955449886, "grad_norm": 0.14339598540296236, "kl": 0.0031909942626953125, "learning_rate": 6.133741538268218e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.06545460596680641, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 1.0, "step": 5480 }, { "completion_length": 428.9576110839844, "epoch": 0.42733810145577145, "grad_norm": 0.1476280923711628, "kl": 0.0033855438232421875, "learning_rate": 6.131356178783722e-07, "loss": 0.0001, "reward": 1.752232238650322, "reward_std": 0.07447901368141174, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 5482 }, { "completion_length": 417.9776954650879, "epoch": 0.4274940073665543, "grad_norm": 0.09668260679403196, "kl": 0.0029087066650390625, "learning_rate": 6.128970547879643e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.04629670176655054, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 5484 }, { "completion_length": 422.4955520629883, "epoch": 0.42764991327733715, "grad_norm": 0.08894636040783273, "kl": 0.0030498504638671875, "learning_rate": 6.126584646128308e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.06769936624914408, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 0.9977678656578064, "step": 5486 }, { "completion_length": 444.53796768188477, "epoch": 0.42780581918812, "grad_norm": 0.05136672988144956, "kl": 0.0030231475830078125, "learning_rate": 6.124198474102113e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.028332039713859558, "rewards/accuracy_reward": 0.7968750521540642, "rewards/format_reward": 1.0, "step": 5488 }, { "completion_length": 424.3951072692871, "epoch": 0.4279617250989028, "grad_norm": 0.06761334295909836, "kl": 0.0028352737426757812, "learning_rate": 6.121812032373515e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.0353194959461689, "rewards/accuracy_reward": 0.8638393357396126, "rewards/format_reward": 1.0, "step": 5490 }, { "completion_length": 415.32368087768555, "epoch": 0.42811763100968564, "grad_norm": 0.10792203294260838, "kl": 0.003299713134765625, "learning_rate": 6.119425321515034e-07, "loss": 0.0001, "reward": 1.6875000596046448, "reward_std": 0.07222452107816935, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 1.0, "step": 5492 }, { "completion_length": 430.2455520629883, "epoch": 0.4282735369204685, "grad_norm": 0.052120418991160045, "kl": 0.0038852691650390625, "learning_rate": 6.117038342099259e-07, "loss": 0.0002, "reward": 1.7723215073347092, "reward_std": 0.03788072057068348, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 5494 }, { "completion_length": 424.3884086608887, "epoch": 0.42842944283125134, "grad_norm": 0.08861928354874966, "kl": 0.0029239654541015625, "learning_rate": 6.114651094698844e-07, "loss": 0.0001, "reward": 1.68303582072258, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.6830357313156128, "rewards/format_reward": 1.0, "step": 5496 }, { "completion_length": 433.0893020629883, "epoch": 0.4285853487420342, "grad_norm": 0.1205559285970374, "kl": 0.00353240966796875, "learning_rate": 6.112263579886502e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.08747823256999254, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 5498 }, { "completion_length": 425.6339454650879, "epoch": 0.42874125465281704, "grad_norm": 0.08539308855042824, "kl": 0.0029354095458984375, "learning_rate": 6.109875798235012e-07, "loss": 0.0001, "reward": 1.7343750894069672, "reward_std": 0.041112312115728855, "rewards/accuracy_reward": 0.7343750260770321, "rewards/format_reward": 1.0, "step": 5500 }, { "completion_length": 423.776798248291, "epoch": 0.4288971605635999, "grad_norm": 0.06048775482006725, "kl": 0.0033168792724609375, "learning_rate": 6.107487750317222e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8080357313156128, "rewards/format_reward": 1.0, "step": 5502 }, { "completion_length": 430.69421005249023, "epoch": 0.4290530664743827, "grad_norm": 0.09097262464637632, "kl": 0.003330230712890625, "learning_rate": 6.105099436706036e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 5504 }, { "completion_length": 431.4888572692871, "epoch": 0.42920897238516553, "grad_norm": 0.04574967040150049, "kl": 0.0029506683349609375, "learning_rate": 6.102710857974429e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.03449576534330845, "rewards/accuracy_reward": 0.84151791036129, "rewards/format_reward": 1.0, "step": 5506 }, { "completion_length": 418.91296768188477, "epoch": 0.4293648782959484, "grad_norm": 0.04990367239013651, "kl": 0.0028333663940429688, "learning_rate": 6.100322014695435e-07, "loss": 0.0001, "reward": 1.910714328289032, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.9107143059372902, "rewards/format_reward": 1.0, "step": 5508 }, { "completion_length": 424.20984649658203, "epoch": 0.42952078420673123, "grad_norm": 0.003889584834914946, "kl": 0.0030956268310546875, "learning_rate": 6.097932907442153e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 5510 }, { "completion_length": 430.4107360839844, "epoch": 0.4296766901175141, "grad_norm": 0.0034556606777392104, "kl": 0.0029926300048828125, "learning_rate": 6.095543536787744e-07, "loss": 0.0001, "reward": 1.7321429252624512, "reward_std": 0.0, "rewards/accuracy_reward": 0.73214291036129, "rewards/format_reward": 1.0, "step": 5512 }, { "completion_length": 425.486629486084, "epoch": 0.42983259602829693, "grad_norm": 0.1306736001509429, "kl": 0.0033588409423828125, "learning_rate": 6.093153903305432e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.07838320545852184, "rewards/accuracy_reward": 0.758928619325161, "rewards/format_reward": 1.0, "step": 5514 }, { "completion_length": 420.5468864440918, "epoch": 0.4299885019390798, "grad_norm": 0.04248709248007156, "kl": 0.0025720596313476562, "learning_rate": 6.09076400756851e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.025774452835321426, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 5516 }, { "completion_length": 441.8102836608887, "epoch": 0.43014440784986263, "grad_norm": 0.11605312795145378, "kl": 0.0038356781005859375, "learning_rate": 6.088373850150328e-07, "loss": 0.0002, "reward": 1.8102679252624512, "reward_std": 0.07673490606248379, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 5518 }, { "completion_length": 439.1718978881836, "epoch": 0.4303003137606454, "grad_norm": 0.09495204533828537, "kl": 0.003314971923828125, "learning_rate": 6.085983431624295e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.089731321670115, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 5520 }, { "completion_length": 439.752254486084, "epoch": 0.4304562196714283, "grad_norm": 0.10778177998245271, "kl": 0.003337860107421875, "learning_rate": 6.083592752563894e-07, "loss": 0.0001, "reward": 1.7187500894069672, "reward_std": 0.05328275728970766, "rewards/accuracy_reward": 0.7209821715950966, "rewards/format_reward": 0.9977678656578064, "step": 5522 }, { "completion_length": 427.1718978881836, "epoch": 0.4306121255822111, "grad_norm": 0.08295511290773454, "kl": 0.0025777816772460938, "learning_rate": 6.081201813542661e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 5524 }, { "completion_length": 422.04912185668945, "epoch": 0.430768031492994, "grad_norm": 0.08750402659784579, "kl": 0.0028295516967773438, "learning_rate": 6.078810615134198e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.06447750702500343, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 5526 }, { "completion_length": 421.8013572692871, "epoch": 0.4309239374037768, "grad_norm": 0.05089910089220325, "kl": 0.0029458999633789062, "learning_rate": 6.076419157912169e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 5528 }, { "completion_length": 416.0558204650879, "epoch": 0.4310798433145597, "grad_norm": 0.10042591170200994, "kl": 0.0024385452270507812, "learning_rate": 6.0740274424503e-07, "loss": 0.0001, "reward": 1.9062500447034836, "reward_std": 0.0667333984747529, "rewards/accuracy_reward": 0.9062500298023224, "rewards/format_reward": 1.0, "step": 5530 }, { "completion_length": 425.69644927978516, "epoch": 0.4312357492253425, "grad_norm": 0.09896971316258758, "kl": 0.0028791427612304688, "learning_rate": 6.071635469322379e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.05441322457045317, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 0.9977678656578064, "step": 5532 }, { "completion_length": 425.1495704650879, "epoch": 0.4313916551361254, "grad_norm": 0.05202595195513438, "kl": 0.0033893585205078125, "learning_rate": 6.069243239102255e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 5534 }, { "completion_length": 413.78796005249023, "epoch": 0.43154756104690817, "grad_norm": 0.04444912275488047, "kl": 0.0028352737426757812, "learning_rate": 6.066850752363839e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 5536 }, { "completion_length": 417.1875190734863, "epoch": 0.431703466957691, "grad_norm": 0.0036302161583065826, "kl": 0.0028753280639648438, "learning_rate": 6.064458009681107e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 0.9977678656578064, "step": 5538 }, { "completion_length": 422.3013572692871, "epoch": 0.43185937286847387, "grad_norm": 0.07788770095270019, "kl": 0.0029764175415039062, "learning_rate": 6.062065011628089e-07, "loss": 0.0001, "reward": 1.8995536267757416, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.899553582072258, "rewards/format_reward": 1.0, "step": 5540 }, { "completion_length": 417.8616256713867, "epoch": 0.4320152787792567, "grad_norm": 0.09729574726037547, "kl": 0.0024890899658203125, "learning_rate": 6.059671758778882e-07, "loss": 0.0001, "reward": 1.9062500596046448, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.906250037252903, "rewards/format_reward": 1.0, "step": 5542 }, { "completion_length": 421.3013572692871, "epoch": 0.43217118469003957, "grad_norm": 0.06624647906389802, "kl": 0.0029430389404296875, "learning_rate": 6.057278251707644e-07, "loss": 0.0001, "reward": 1.8303572237491608, "reward_std": 0.040809216909110546, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 1.0, "step": 5544 }, { "completion_length": 421.830379486084, "epoch": 0.4323270906008224, "grad_norm": 0.09115409506078344, "kl": 0.00318145751953125, "learning_rate": 6.054884490988591e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.07771060150116682, "rewards/accuracy_reward": 0.7500000223517418, "rewards/format_reward": 1.0, "step": 5546 }, { "completion_length": 418.3526954650879, "epoch": 0.43248299651160527, "grad_norm": 0.08700661115338137, "kl": 0.0028562545776367188, "learning_rate": 6.052490477196003e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 0.9977678656578064, "step": 5548 }, { "completion_length": 405.479923248291, "epoch": 0.4326389024223881, "grad_norm": 0.06364358848083274, "kl": 0.0028076171875, "learning_rate": 6.050096210904217e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 5550 }, { "completion_length": 429.3102836608887, "epoch": 0.4327948083331709, "grad_norm": 0.06123750676098589, "kl": 0.0031538009643554688, "learning_rate": 6.047701692687636e-07, "loss": 0.0001, "reward": 1.7812500596046448, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.7834821566939354, "rewards/format_reward": 0.9977678656578064, "step": 5552 }, { "completion_length": 422.37055587768555, "epoch": 0.43295071424395376, "grad_norm": 0.07882066114501238, "kl": 0.0027799606323242188, "learning_rate": 6.045306923120718e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.04080921784043312, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 1.0, "step": 5554 }, { "completion_length": 427.82368087768555, "epoch": 0.4331066201547366, "grad_norm": 0.10700795274322249, "kl": 0.0029544830322265625, "learning_rate": 6.042911902777982e-07, "loss": 0.0001, "reward": 1.8772321939468384, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.8794643133878708, "rewards/format_reward": 0.9977678656578064, "step": 5556 }, { "completion_length": 421.77457427978516, "epoch": 0.43326252606551946, "grad_norm": 0.04707301288012183, "kl": 0.0028371810913085938, "learning_rate": 6.040516632234014e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.039680151268839836, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 5558 }, { "completion_length": 434.4821586608887, "epoch": 0.4334184319763023, "grad_norm": 0.08424737967011192, "kl": 0.0036334991455078125, "learning_rate": 6.038121112063449e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.053284160792827606, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 0.9977678656578064, "step": 5560 }, { "completion_length": 413.7812614440918, "epoch": 0.43357433788708516, "grad_norm": 0.0625076825708717, "kl": 0.0025396347045898438, "learning_rate": 6.035725342840989e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 5562 }, { "completion_length": 436.4263572692871, "epoch": 0.433730243797868, "grad_norm": 0.04063747235144631, "kl": 0.003093719482421875, "learning_rate": 6.033329325141397e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.02217195089906454, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 1.0, "step": 5564 }, { "completion_length": 408.6205520629883, "epoch": 0.4338861497086508, "grad_norm": 0.06621402394740404, "kl": 0.00260162353515625, "learning_rate": 6.030933059539492e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.04599360562860966, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 5566 }, { "completion_length": 423.32591247558594, "epoch": 0.43404205561943365, "grad_norm": 0.09022599967997634, "kl": 0.0027675628662109375, "learning_rate": 6.028536546610152e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.04696930479258299, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 5568 }, { "completion_length": 444.4732360839844, "epoch": 0.4341979615302165, "grad_norm": 0.10350193992741276, "kl": 0.003345489501953125, "learning_rate": 6.026139786928317e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.05200396478176117, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 5570 }, { "completion_length": 431.23439025878906, "epoch": 0.43435386744099935, "grad_norm": 0.09214184115360113, "kl": 0.0030088424682617188, "learning_rate": 6.023742781068984e-07, "loss": 0.0001, "reward": 1.7120536416769028, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.7120536118745804, "rewards/format_reward": 1.0, "step": 5572 }, { "completion_length": 418.2790412902832, "epoch": 0.4345097733517822, "grad_norm": 0.1255949750268571, "kl": 0.0032939910888671875, "learning_rate": 6.021345529607212e-07, "loss": 0.0001, "reward": 1.8750000596046448, "reward_std": 0.08259553462266922, "rewards/accuracy_reward": 0.8750000596046448, "rewards/format_reward": 1.0, "step": 5574 }, { "completion_length": 417.89510345458984, "epoch": 0.43466567926256505, "grad_norm": 0.04914155637885083, "kl": 0.0027952194213867188, "learning_rate": 6.018948033118111e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 5576 }, { "completion_length": 436.17412185668945, "epoch": 0.4348215851733479, "grad_norm": 0.0446331855803227, "kl": 0.00324249267578125, "learning_rate": 6.016550292176865e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 5578 }, { "completion_length": 431.49109268188477, "epoch": 0.43497749108413075, "grad_norm": 0.09923506411596356, "kl": 0.0034008026123046875, "learning_rate": 6.014152307358702e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.05636602267622948, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 5580 }, { "completion_length": 419.04689025878906, "epoch": 0.43513339699491355, "grad_norm": 0.0028023247932757764, "kl": 0.0024566650390625, "learning_rate": 6.011754079238914e-07, "loss": 0.0001, "reward": 1.88839291036129, "reward_std": 0.008266251534223557, "rewards/accuracy_reward": 0.888392873108387, "rewards/format_reward": 1.0, "step": 5582 }, { "completion_length": 424.8326110839844, "epoch": 0.4352893029056964, "grad_norm": 0.047093120928877, "kl": 0.0031948089599609375, "learning_rate": 6.009355608392851e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.028485405258834362, "rewards/accuracy_reward": 0.8593750298023224, "rewards/format_reward": 1.0, "step": 5584 }, { "completion_length": 424.7053756713867, "epoch": 0.43544520881647925, "grad_norm": 0.05598259918036274, "kl": 0.004439353942871094, "learning_rate": 6.006956895395922e-07, "loss": 0.0002, "reward": 1.88839291036129, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8883928880095482, "rewards/format_reward": 1.0, "step": 5586 }, { "completion_length": 421.0022430419922, "epoch": 0.4356011147272621, "grad_norm": 0.08587347247684254, "kl": 0.0027093887329101562, "learning_rate": 6.004557940823596e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.028332039713859558, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 5588 }, { "completion_length": 433.48886489868164, "epoch": 0.43575702063804495, "grad_norm": 0.08020667931806381, "kl": 0.0034942626953125, "learning_rate": 6.002158745251393e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.07582562137395144, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 0.9977678656578064, "step": 5590 }, { "completion_length": 438.81921768188477, "epoch": 0.4359129265488278, "grad_norm": 0.08825048465855123, "kl": 0.003047943115234375, "learning_rate": 5.999759309254899e-07, "loss": 0.0001, "reward": 1.899553656578064, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.899553619325161, "rewards/format_reward": 1.0, "step": 5592 }, { "completion_length": 427.6651916503906, "epoch": 0.43606883245961064, "grad_norm": 0.1023223315605598, "kl": 0.0031757354736328125, "learning_rate": 5.997359633409753e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.04599360655993223, "rewards/accuracy_reward": 0.767857164144516, "rewards/format_reward": 1.0, "step": 5594 }, { "completion_length": 424.6183280944824, "epoch": 0.4362247383703935, "grad_norm": 0.03873601706622566, "kl": 0.0025157928466796875, "learning_rate": 5.994959718291652e-07, "loss": 0.0001, "reward": 1.9107143133878708, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.9107143133878708, "rewards/format_reward": 1.0, "step": 5596 }, { "completion_length": 427.5357322692871, "epoch": 0.4363806442811763, "grad_norm": 0.09424247001679406, "kl": 0.002643585205078125, "learning_rate": 5.992559564476351e-07, "loss": 0.0001, "reward": 1.7566965073347092, "reward_std": 0.0514810923486948, "rewards/accuracy_reward": 0.7566964626312256, "rewards/format_reward": 1.0, "step": 5598 }, { "completion_length": 431.1651954650879, "epoch": 0.43653655019195914, "grad_norm": 0.08463261761616676, "kl": 0.002986907958984375, "learning_rate": 5.990159172539665e-07, "loss": 0.0001, "reward": 1.79241082072258, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 5600 }, { "completion_length": 416.38618087768555, "epoch": 0.436692456102742, "grad_norm": 0.08158821700160208, "kl": 0.002712249755859375, "learning_rate": 5.987758543057459e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 5602 }, { "completion_length": 423.9576110839844, "epoch": 0.43684836201352484, "grad_norm": 0.058011569772206485, "kl": 0.0028123855590820312, "learning_rate": 5.985357676605662e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.8437500521540642, "rewards/format_reward": 1.0, "step": 5604 }, { "completion_length": 427.6384086608887, "epoch": 0.4370042679243077, "grad_norm": 0.08158737300564245, "kl": 0.002758502960205078, "learning_rate": 5.982956573760255e-07, "loss": 0.0001, "reward": 1.8526786267757416, "reward_std": 0.043065108358860016, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 0.9977678656578064, "step": 5606 }, { "completion_length": 413.48216247558594, "epoch": 0.43716017383509054, "grad_norm": 0.06979649138554193, "kl": 0.0035076141357421875, "learning_rate": 5.980555235097281e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.8526785969734192, "rewards/format_reward": 1.0, "step": 5608 }, { "completion_length": 435.85493087768555, "epoch": 0.4373160797458734, "grad_norm": 0.0943097299748236, "kl": 0.0036144256591796875, "learning_rate": 5.978153661192835e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.09363691788166761, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 5610 }, { "completion_length": 418.49555587768555, "epoch": 0.4374719856566562, "grad_norm": 0.04691147440689919, "kl": 0.0025920867919921875, "learning_rate": 5.97575185262307e-07, "loss": 0.0001, "reward": 1.852678656578064, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 0.9977678656578064, "step": 5612 }, { "completion_length": 411.65626525878906, "epoch": 0.43762789156743903, "grad_norm": 0.10553906899735667, "kl": 0.00231170654296875, "learning_rate": 5.973349809964195e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 5614 }, { "completion_length": 426.33707427978516, "epoch": 0.4377837974782219, "grad_norm": 0.09170224228129113, "kl": 0.0026998519897460938, "learning_rate": 5.970947533792477e-07, "loss": 0.0001, "reward": 1.8058036863803864, "reward_std": 0.03840135969221592, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 5616 }, { "completion_length": 439.5178756713867, "epoch": 0.43793970338900473, "grad_norm": 0.07118714716489712, "kl": 0.0032520294189453125, "learning_rate": 5.968545024684233e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 5618 }, { "completion_length": 438.3125228881836, "epoch": 0.4380956092997876, "grad_norm": 0.09209466709617267, "kl": 0.003200531005859375, "learning_rate": 5.966142283215845e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.048249500803649426, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 5620 }, { "completion_length": 426.60269927978516, "epoch": 0.43825151521057043, "grad_norm": 0.11212578426399503, "kl": 0.0029134750366210938, "learning_rate": 5.963739309963746e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.05764481611549854, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 5622 }, { "completion_length": 418.1607322692871, "epoch": 0.4384074211213533, "grad_norm": 0.10555913032772256, "kl": 0.003170013427734375, "learning_rate": 5.96133610550442e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.07545471005141735, "rewards/accuracy_reward": 0.8214286118745804, "rewards/format_reward": 1.0, "step": 5624 }, { "completion_length": 433.2254638671875, "epoch": 0.43856332703213613, "grad_norm": 0.08120544217459819, "kl": 0.002834320068359375, "learning_rate": 5.958932670414416e-07, "loss": 0.0001, "reward": 1.8883928954601288, "reward_std": 0.02382165566086769, "rewards/accuracy_reward": 0.8883928805589676, "rewards/format_reward": 1.0, "step": 5626 }, { "completion_length": 420.95314025878906, "epoch": 0.4387192329429189, "grad_norm": 0.07099354978863501, "kl": 0.0032520294189453125, "learning_rate": 5.956529005270331e-07, "loss": 0.0001, "reward": 1.7075893878936768, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.7075893208384514, "rewards/format_reward": 1.0, "step": 5628 }, { "completion_length": 417.57814025878906, "epoch": 0.4388751388537018, "grad_norm": 0.05241713574336963, "kl": 0.003047943115234375, "learning_rate": 5.954125110648821e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.8816964700818062, "rewards/format_reward": 1.0, "step": 5630 }, { "completion_length": 446.252254486084, "epoch": 0.4390310447644846, "grad_norm": 0.09247069573925322, "kl": 0.003322601318359375, "learning_rate": 5.951720987126594e-07, "loss": 0.0001, "reward": 1.6227679550647736, "reward_std": 0.06756077148020267, "rewards/accuracy_reward": 0.6227678805589676, "rewards/format_reward": 1.0, "step": 5632 }, { "completion_length": 427.56921768188477, "epoch": 0.4391869506752675, "grad_norm": 0.06788751431106893, "kl": 0.00308990478515625, "learning_rate": 5.949316635280416e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 5634 }, { "completion_length": 409.3973388671875, "epoch": 0.4393428565860503, "grad_norm": 0.09413650684073668, "kl": 0.0026416778564453125, "learning_rate": 5.946912055687106e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.04614697303622961, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 5636 }, { "completion_length": 425.55582427978516, "epoch": 0.4394987624968332, "grad_norm": 0.003199231208511438, "kl": 0.002941131591796875, "learning_rate": 5.944507248923537e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.008266251534223557, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 5638 }, { "completion_length": 423.5357322692871, "epoch": 0.439654668407616, "grad_norm": 0.06270141090313128, "kl": 0.0029211044311523438, "learning_rate": 5.942102215566639e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.045691913925111294, "rewards/accuracy_reward": 0.7544643208384514, "rewards/format_reward": 1.0, "step": 5640 }, { "completion_length": 417.28572845458984, "epoch": 0.4398105743183989, "grad_norm": 0.03972083947507656, "kl": 0.0023708343505859375, "learning_rate": 5.939696956193396e-07, "loss": 0.0001, "reward": 1.8437500894069672, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 5642 }, { "completion_length": 413.2009162902832, "epoch": 0.43996648022918167, "grad_norm": 0.0031094450300444124, "kl": 0.003345489501953125, "learning_rate": 5.937291471380841e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.03111220896244049, "rewards/accuracy_reward": 0.8459821864962578, "rewards/format_reward": 1.0, "step": 5644 }, { "completion_length": 435.75671768188477, "epoch": 0.4401223861399645, "grad_norm": 0.10820333079017042, "kl": 0.0034389495849609375, "learning_rate": 5.934885761706069e-07, "loss": 0.0001, "reward": 1.7656251043081284, "reward_std": 0.0562126561999321, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward": 1.0, "step": 5646 }, { "completion_length": 417.9419822692871, "epoch": 0.44027829205074737, "grad_norm": 0.06545778769451015, "kl": 0.0023021697998046875, "learning_rate": 5.932479827746224e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.0489221028983593, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 5648 }, { "completion_length": 430.3504638671875, "epoch": 0.4404341979615302, "grad_norm": 0.10897276369413751, "kl": 0.004329681396484375, "learning_rate": 5.930073670078503e-07, "loss": 0.0002, "reward": 1.8080358058214188, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 5650 }, { "completion_length": 422.71653747558594, "epoch": 0.44059010387231307, "grad_norm": 0.06382282040079304, "kl": 0.0027256011962890625, "learning_rate": 5.927667289280163e-07, "loss": 0.0001, "reward": 1.8325893878936768, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 5652 }, { "completion_length": 410.37055587768555, "epoch": 0.4407460097830959, "grad_norm": 0.08849000996279174, "kl": 0.0026998519897460938, "learning_rate": 5.925260685928506e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.8571428805589676, "rewards/format_reward": 1.0, "step": 5654 }, { "completion_length": 416.8928756713867, "epoch": 0.44090191569387877, "grad_norm": 0.10011724606443767, "kl": 0.0029697418212890625, "learning_rate": 5.922853860600892e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.042762016877532005, "rewards/accuracy_reward": 0.845982164144516, "rewards/format_reward": 1.0, "step": 5656 }, { "completion_length": 417.6964454650879, "epoch": 0.44105782160466156, "grad_norm": 0.06752698888433374, "kl": 0.0029048919677734375, "learning_rate": 5.920446813874735e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.024124749936163425, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 5658 }, { "completion_length": 413.7701072692871, "epoch": 0.4412137275154444, "grad_norm": 0.10151962707917259, "kl": 0.0045185089111328125, "learning_rate": 5.918039546327503e-07, "loss": 0.0002, "reward": 1.74553582072258, "reward_std": 0.061247317120432854, "rewards/accuracy_reward": 0.7455357387661934, "rewards/format_reward": 1.0, "step": 5660 }, { "completion_length": 412.1093978881836, "epoch": 0.44136963342622726, "grad_norm": 0.092670112538505, "kl": 0.002811431884765625, "learning_rate": 5.915632058536713e-07, "loss": 0.0001, "reward": 1.816964328289032, "reward_std": 0.03403930272907019, "rewards/accuracy_reward": 0.8169643059372902, "rewards/format_reward": 1.0, "step": 5662 }, { "completion_length": 421.38841247558594, "epoch": 0.4415255393370101, "grad_norm": 0.09436124480197303, "kl": 0.0029039382934570312, "learning_rate": 5.913224351079934e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.05298106465488672, "rewards/accuracy_reward": 0.866071455180645, "rewards/format_reward": 1.0, "step": 5664 }, { "completion_length": 425.3393020629883, "epoch": 0.44168144524779296, "grad_norm": 0.11539602749291665, "kl": 0.00360870361328125, "learning_rate": 5.910816424534793e-07, "loss": 0.0001, "reward": 1.8415179550647736, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 5666 }, { "completion_length": 436.3303756713867, "epoch": 0.4418373511585758, "grad_norm": 0.3005580702185093, "kl": 0.0030307769775390625, "learning_rate": 5.908408279478966e-07, "loss": 0.0001, "reward": 1.8794643580913544, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 1.0, "step": 5668 }, { "completion_length": 438.6651954650879, "epoch": 0.44199325706935866, "grad_norm": 0.1010671797857151, "kl": 0.003398895263671875, "learning_rate": 5.905999916490184e-07, "loss": 0.0001, "reward": 1.7388393878936768, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.7388393133878708, "rewards/format_reward": 1.0, "step": 5670 }, { "completion_length": 425.5424346923828, "epoch": 0.4421491629801415, "grad_norm": 0.11656975866950751, "kl": 0.003154754638671875, "learning_rate": 5.903591336146229e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.04959610849618912, "rewards/accuracy_reward": 0.8058035895228386, "rewards/format_reward": 1.0, "step": 5672 }, { "completion_length": 407.79689025878906, "epoch": 0.4423050688909243, "grad_norm": 0.10589005821537005, "kl": 0.0029048919677734375, "learning_rate": 5.901182539024932e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.046296700835227966, "rewards/accuracy_reward": 0.8459821939468384, "rewards/format_reward": 1.0, "step": 5674 }, { "completion_length": 417.81921005249023, "epoch": 0.44246097480170715, "grad_norm": 0.06875408484184735, "kl": 0.0025539398193359375, "learning_rate": 5.898773525704181e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.03727089241147041, "rewards/accuracy_reward": 0.84151791036129, "rewards/format_reward": 1.0, "step": 5676 }, { "completion_length": 439.5513572692871, "epoch": 0.44261688071249, "grad_norm": 0.07850148083835166, "kl": 0.0034780502319335938, "learning_rate": 5.896364296761913e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 5678 }, { "completion_length": 431.4732322692871, "epoch": 0.44277278662327285, "grad_norm": 0.051193353615563826, "kl": 0.003139495849609375, "learning_rate": 5.893954852776117e-07, "loss": 0.0001, "reward": 1.908482164144516, "reward_std": 0.04035275708884001, "rewards/accuracy_reward": 0.908482164144516, "rewards/format_reward": 1.0, "step": 5680 }, { "completion_length": 425.20983505249023, "epoch": 0.4429286925340557, "grad_norm": 0.07489846777922186, "kl": 0.002716064453125, "learning_rate": 5.891545194324835e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 5682 }, { "completion_length": 416.12278747558594, "epoch": 0.44308459844483855, "grad_norm": 0.10211341986990664, "kl": 0.0024690628051757812, "learning_rate": 5.88913532198616e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.06951356958597898, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 5684 }, { "completion_length": 419.73216247558594, "epoch": 0.4432405043556214, "grad_norm": 0.0854902967987061, "kl": 0.0029964447021484375, "learning_rate": 5.886725236338235e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 5686 }, { "completion_length": 421.37278747558594, "epoch": 0.44339641026640425, "grad_norm": 0.11547425774353837, "kl": 0.0026912689208984375, "learning_rate": 5.884314937959254e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.06621276028454304, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 5688 }, { "completion_length": 421.2656478881836, "epoch": 0.44355231617718704, "grad_norm": 0.08679293381892667, "kl": 0.00304412841796875, "learning_rate": 5.881904427427463e-07, "loss": 0.0001, "reward": 1.8459822535514832, "reward_std": 0.03853995352983475, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 0.9977678656578064, "step": 5690 }, { "completion_length": 423.93974685668945, "epoch": 0.4437082220879699, "grad_norm": 0.12696334002187987, "kl": 0.0038127899169921875, "learning_rate": 5.879493705321161e-07, "loss": 0.0002, "reward": 1.7767857909202576, "reward_std": 0.08552403282374144, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 5692 }, { "completion_length": 429.7835006713867, "epoch": 0.44386412799875274, "grad_norm": 0.11493182532864624, "kl": 0.0032482147216796875, "learning_rate": 5.877082772218694e-07, "loss": 0.0001, "reward": 1.7745536714792252, "reward_std": 0.056885259225964546, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 5694 }, { "completion_length": 418.0134086608887, "epoch": 0.4440200339095356, "grad_norm": 0.060866239724278424, "kl": 0.0028438568115234375, "learning_rate": 5.874671628698461e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8392857611179352, "rewards/format_reward": 1.0, "step": 5696 }, { "completion_length": 414.5089416503906, "epoch": 0.44417593982031844, "grad_norm": 0.07367469951365356, "kl": 0.002712249755859375, "learning_rate": 5.87226027533891e-07, "loss": 0.0001, "reward": 1.9062500447034836, "reward_std": 0.03464549221098423, "rewards/accuracy_reward": 0.906250037252903, "rewards/format_reward": 1.0, "step": 5698 }, { "completion_length": 420.27903747558594, "epoch": 0.4443318457311013, "grad_norm": 0.0797964001999325, "kl": 0.0026330947875976562, "learning_rate": 5.869848712718539e-07, "loss": 0.0001, "reward": 1.8236608058214188, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 5700 }, { "completion_length": 416.7143096923828, "epoch": 0.44448775164188414, "grad_norm": 0.06540434443505934, "kl": 0.0027093887329101562, "learning_rate": 5.867436941415901e-07, "loss": 0.0001, "reward": 1.906250074505806, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.906250037252903, "rewards/format_reward": 1.0, "step": 5702 }, { "completion_length": 413.19868087768555, "epoch": 0.444643657552667, "grad_norm": 0.09732099658018363, "kl": 0.0026035308837890625, "learning_rate": 5.865024962009594e-07, "loss": 0.0001, "reward": 1.8169643431901932, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 5704 }, { "completion_length": 416.78796768188477, "epoch": 0.4447995634634498, "grad_norm": 0.09522587994744364, "kl": 0.002597808837890625, "learning_rate": 5.862612775078265e-07, "loss": 0.0001, "reward": 1.82589291036129, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 5706 }, { "completion_length": 432.6183280944824, "epoch": 0.44495546937423264, "grad_norm": 0.11070784051992877, "kl": 0.0032367706298828125, "learning_rate": 5.860200381200612e-07, "loss": 0.0001, "reward": 1.8526786714792252, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.8526785895228386, "rewards/format_reward": 1.0, "step": 5708 }, { "completion_length": 420.69421768188477, "epoch": 0.4451113752850155, "grad_norm": 0.09572389308718889, "kl": 0.0030870437622070312, "learning_rate": 5.85778778095539e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.8080357536673546, "rewards/format_reward": 1.0, "step": 5710 }, { "completion_length": 434.47993087768555, "epoch": 0.44526728119579834, "grad_norm": 0.12436052001421645, "kl": 0.003162384033203125, "learning_rate": 5.855374974921392e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.08244216814637184, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 5712 }, { "completion_length": 413.0513572692871, "epoch": 0.4454231871065812, "grad_norm": 0.042937037272890756, "kl": 0.002597808837890625, "learning_rate": 5.852961963677468e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.03352006711065769, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 5714 }, { "completion_length": 411.4464454650879, "epoch": 0.44557909301736404, "grad_norm": 0.08792032663596029, "kl": 0.0032072067260742188, "learning_rate": 5.850548747802512e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.03156726621091366, "rewards/accuracy_reward": 0.7968750484287739, "rewards/format_reward": 1.0, "step": 5716 }, { "completion_length": 430.4843940734863, "epoch": 0.4457349989281469, "grad_norm": 0.08303108135113293, "kl": 0.0027761459350585938, "learning_rate": 5.848135327875473e-07, "loss": 0.0001, "reward": 1.85714291036129, "reward_std": 0.04892210382968187, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 5718 }, { "completion_length": 407.8192138671875, "epoch": 0.4458909048389297, "grad_norm": 0.09464530086653061, "kl": 0.00246429443359375, "learning_rate": 5.845721704475343e-07, "loss": 0.0001, "reward": 1.875000074505806, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.8750000596046448, "rewards/format_reward": 1.0, "step": 5720 }, { "completion_length": 423.70983505249023, "epoch": 0.44604681074971253, "grad_norm": 0.10092178214181308, "kl": 0.00323486328125, "learning_rate": 5.84330787818117e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 5722 }, { "completion_length": 426.674129486084, "epoch": 0.4462027166604954, "grad_norm": 0.12076163941593535, "kl": 0.0030956268310546875, "learning_rate": 5.840893849572043e-07, "loss": 0.0001, "reward": 1.7254465222358704, "reward_std": 0.06365517433732748, "rewards/accuracy_reward": 0.725446455180645, "rewards/format_reward": 1.0, "step": 5724 }, { "completion_length": 437.4419860839844, "epoch": 0.44635862257127823, "grad_norm": 0.13478525509357617, "kl": 0.0033178329467773438, "learning_rate": 5.838479619227103e-07, "loss": 0.0001, "reward": 1.906250074505806, "reward_std": 0.07658517546951771, "rewards/accuracy_reward": 0.9062500447034836, "rewards/format_reward": 1.0, "step": 5726 }, { "completion_length": 426.86162185668945, "epoch": 0.4465145284820611, "grad_norm": 0.11502620034381983, "kl": 0.003719329833984375, "learning_rate": 5.836065187725538e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.0635018078610301, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 5728 }, { "completion_length": 420.5625228881836, "epoch": 0.44667043439284393, "grad_norm": 0.04927362592567855, "kl": 0.00304412841796875, "learning_rate": 5.833650555646591e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 5730 }, { "completion_length": 440.3326072692871, "epoch": 0.4468263403036268, "grad_norm": 0.06503469315678186, "kl": 0.0034465789794921875, "learning_rate": 5.831235723569542e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 5732 }, { "completion_length": 416.8750228881836, "epoch": 0.44698224621440963, "grad_norm": 0.12534081874122172, "kl": 0.00296783447265625, "learning_rate": 5.828820692073729e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.07921057939529419, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 0.9977678656578064, "step": 5734 }, { "completion_length": 417.55135345458984, "epoch": 0.4471381521251924, "grad_norm": 0.07345704512279673, "kl": 0.0028023719787597656, "learning_rate": 5.826405461738528e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 5736 }, { "completion_length": 423.88171005249023, "epoch": 0.4472940580359753, "grad_norm": 0.04404971529804876, "kl": 0.002777099609375, "learning_rate": 5.823990033143374e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8191964849829674, "rewards/format_reward": 1.0, "step": 5738 }, { "completion_length": 431.9285888671875, "epoch": 0.4474499639467581, "grad_norm": 0.09973870631391156, "kl": 0.0030298233032226562, "learning_rate": 5.821574406867741e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.03352006431668997, "rewards/accuracy_reward": 0.7812500149011612, "rewards/format_reward": 1.0, "step": 5740 }, { "completion_length": 415.27680587768555, "epoch": 0.44760586985754097, "grad_norm": 0.08472637312431387, "kl": 0.0025730133056640625, "learning_rate": 5.819158583491151e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.05230706185102463, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 5742 }, { "completion_length": 416.7232360839844, "epoch": 0.4477617757683238, "grad_norm": 0.0912019089099166, "kl": 0.0027599334716796875, "learning_rate": 5.816742563593181e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 5744 }, { "completion_length": 419.7634086608887, "epoch": 0.44791768167910667, "grad_norm": 0.05795032233340338, "kl": 0.003173828125, "learning_rate": 5.814326347753447e-07, "loss": 0.0001, "reward": 1.8080357611179352, "reward_std": 0.03870445489883423, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 5746 }, { "completion_length": 425.32591247558594, "epoch": 0.4480735875898895, "grad_norm": 0.06500436874557694, "kl": 0.0033903121948242188, "learning_rate": 5.811909936551612e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 5748 }, { "completion_length": 427.0178756713867, "epoch": 0.44822949350067237, "grad_norm": 0.10226598303026868, "kl": 0.003082275390625, "learning_rate": 5.809493330567392e-07, "loss": 0.0001, "reward": 1.7343751043081284, "reward_std": 0.039833519607782364, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 5750 }, { "completion_length": 425.0201072692871, "epoch": 0.44838539941145517, "grad_norm": 0.08748594196743903, "kl": 0.002834320068359375, "learning_rate": 5.807076530380545e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.039680153131484985, "rewards/accuracy_reward": 0.8549107611179352, "rewards/format_reward": 1.0, "step": 5752 }, { "completion_length": 422.63394927978516, "epoch": 0.448541305322238, "grad_norm": 0.09943440795308417, "kl": 0.003139495849609375, "learning_rate": 5.804659536570877e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.07109405286610126, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 5754 }, { "completion_length": 413.1718864440918, "epoch": 0.44869721123302087, "grad_norm": 0.0925202442988256, "kl": 0.0031642913818359375, "learning_rate": 5.802242349718239e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.059749577194452286, "rewards/accuracy_reward": 0.761160746216774, "rewards/format_reward": 1.0, "step": 5756 }, { "completion_length": 414.55805587768555, "epoch": 0.4488531171438037, "grad_norm": 0.06557885985502238, "kl": 0.0025777816772460938, "learning_rate": 5.79982497040253e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.9040178880095482, "rewards/format_reward": 1.0, "step": 5758 }, { "completion_length": 409.0022506713867, "epoch": 0.44900902305458656, "grad_norm": 0.08798007467512271, "kl": 0.002655029296875, "learning_rate": 5.797407399203696e-07, "loss": 0.0001, "reward": 1.7968751043081284, "reward_std": 0.050202298909425735, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 5760 }, { "completion_length": 445.88841247558594, "epoch": 0.4491649289653694, "grad_norm": 0.12192177884296855, "kl": 0.0036678314208984375, "learning_rate": 5.794989636701728e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.0877799242734909, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 1.0, "step": 5762 }, { "completion_length": 423.10269927978516, "epoch": 0.44932083487615226, "grad_norm": 0.04038411766141065, "kl": 0.0029392242431640625, "learning_rate": 5.792571683476661e-07, "loss": 0.0001, "reward": 1.8861607611179352, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8883928805589676, "rewards/format_reward": 0.9977678656578064, "step": 5764 }, { "completion_length": 427.38618087768555, "epoch": 0.44947674078693506, "grad_norm": 0.1278056446057301, "kl": 0.003753662109375, "learning_rate": 5.79015354010858e-07, "loss": 0.0002, "reward": 1.743303656578064, "reward_std": 0.059294517152011395, "rewards/accuracy_reward": 0.7433035969734192, "rewards/format_reward": 1.0, "step": 5766 }, { "completion_length": 432.0803756713867, "epoch": 0.4496326466977179, "grad_norm": 0.12429154483289015, "kl": 0.0031681060791015625, "learning_rate": 5.787735207177609e-07, "loss": 0.0001, "reward": 1.750000074505806, "reward_std": 0.08161843568086624, "rewards/accuracy_reward": 0.752232164144516, "rewards/format_reward": 0.9977678656578064, "step": 5768 }, { "completion_length": 428.4509162902832, "epoch": 0.44978855260850076, "grad_norm": 0.05070014637793948, "kl": 0.00278472900390625, "learning_rate": 5.785316685263926e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.01585849840193987, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 5770 }, { "completion_length": 426.64064025878906, "epoch": 0.4499444585192836, "grad_norm": 0.10876760289384764, "kl": 0.0026760101318359375, "learning_rate": 5.782897974947746e-07, "loss": 0.0001, "reward": 1.7745536714792252, "reward_std": 0.043585749343037605, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 5772 }, { "completion_length": 431.80805587768555, "epoch": 0.45010036443006646, "grad_norm": 0.10118061767455142, "kl": 0.003162384033203125, "learning_rate": 5.780479076809335e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.05831741914153099, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 5774 }, { "completion_length": 434.7500190734863, "epoch": 0.4502562703408493, "grad_norm": 0.14356568117557658, "kl": 0.003597259521484375, "learning_rate": 5.778059991429004e-07, "loss": 0.0001, "reward": 1.7410715222358704, "reward_std": 0.06755937077105045, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 1.0, "step": 5776 }, { "completion_length": 428.3035888671875, "epoch": 0.45041217625163216, "grad_norm": 0.04196325233366369, "kl": 0.0030517578125, "learning_rate": 5.775640719387104e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 5778 }, { "completion_length": 444.2835006713867, "epoch": 0.450568082162415, "grad_norm": 0.09915665783059927, "kl": 0.0034542083740234375, "learning_rate": 5.773221261264034e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 5780 }, { "completion_length": 428.0201072692871, "epoch": 0.4507239880731978, "grad_norm": 0.08300216840463719, "kl": 0.0031652450561523438, "learning_rate": 5.77080161764024e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.04599360655993223, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 5782 }, { "completion_length": 430.1919860839844, "epoch": 0.45087989398398065, "grad_norm": 0.10566303895258641, "kl": 0.003147125244140625, "learning_rate": 5.768381789096206e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.05734172184020281, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 0.9977678656578064, "step": 5784 }, { "completion_length": 421.9107360839844, "epoch": 0.4510357998947635, "grad_norm": 0.10705445489824868, "kl": 0.0030107498168945312, "learning_rate": 5.765961776212468e-07, "loss": 0.0001, "reward": 1.736607238650322, "reward_std": 0.056364620104432106, "rewards/accuracy_reward": 0.7366071790456772, "rewards/format_reward": 1.0, "step": 5786 }, { "completion_length": 427.81921768188477, "epoch": 0.45119170580554635, "grad_norm": 0.10581114626612988, "kl": 0.0029249191284179688, "learning_rate": 5.763541579569602e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.05621265713125467, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 5788 }, { "completion_length": 404.87947845458984, "epoch": 0.4513476117163292, "grad_norm": 0.07540328428556836, "kl": 0.0028200149536132812, "learning_rate": 5.761121199748227e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 5790 }, { "completion_length": 418.1964454650879, "epoch": 0.45150351762711205, "grad_norm": 0.10871937430946275, "kl": 0.0029420852661132812, "learning_rate": 5.758700637329008e-07, "loss": 0.0001, "reward": 1.8816965073347092, "reward_std": 0.052005368284881115, "rewards/accuracy_reward": 0.881696455180645, "rewards/format_reward": 1.0, "step": 5792 }, { "completion_length": 412.5401954650879, "epoch": 0.4516594235378949, "grad_norm": 0.11244998035205148, "kl": 0.0024728775024414062, "learning_rate": 5.756279892892655e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 1.0, "step": 5794 }, { "completion_length": 432.64733505249023, "epoch": 0.45181532944867775, "grad_norm": 0.0501342527653602, "kl": 0.0030031204223632812, "learning_rate": 5.753858967019918e-07, "loss": 0.0001, "reward": 1.7611607760190964, "reward_std": 0.03366979397833347, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 1.0, "step": 5796 }, { "completion_length": 421.0424346923828, "epoch": 0.45197123535946054, "grad_norm": 0.11408125540255923, "kl": 0.0033626556396484375, "learning_rate": 5.751437860291594e-07, "loss": 0.0001, "reward": 1.736607238650322, "reward_std": 0.07109405286610126, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 1.0, "step": 5798 }, { "completion_length": 426.721004486084, "epoch": 0.4521271412702434, "grad_norm": 0.08320958836654539, "kl": 0.0030956268310546875, "learning_rate": 5.749016573288523e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.7500000223517418, "rewards/format_reward": 1.0, "step": 5800 }, { "completion_length": 420.37278747558594, "epoch": 0.45228304718102624, "grad_norm": 0.09167698207245142, "kl": 0.0027017593383789062, "learning_rate": 5.746595106591583e-07, "loss": 0.0001, "reward": 1.9218750447034836, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.9218750223517418, "rewards/format_reward": 1.0, "step": 5802 }, { "completion_length": 426.57144927978516, "epoch": 0.4524389530918091, "grad_norm": 0.10230658910227941, "kl": 0.0032148361206054688, "learning_rate": 5.744173460781702e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.06883956491947174, "rewards/accuracy_reward": 0.783482164144516, "rewards/format_reward": 1.0, "step": 5804 }, { "completion_length": 426.2500114440918, "epoch": 0.45259485900259194, "grad_norm": 0.08582836439645611, "kl": 0.0031948089599609375, "learning_rate": 5.741751636439851e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.04712267033755779, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 5806 }, { "completion_length": 436.07592391967773, "epoch": 0.4527507649133748, "grad_norm": 0.12647615829183953, "kl": 0.0033092498779296875, "learning_rate": 5.739329634147039e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.06883956212550402, "rewards/accuracy_reward": 0.7343750223517418, "rewards/format_reward": 1.0, "step": 5808 }, { "completion_length": 418.6294822692871, "epoch": 0.45290667082415764, "grad_norm": 0.11836465453746792, "kl": 0.0030851364135742188, "learning_rate": 5.736907454484316e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.03983352053910494, "rewards/accuracy_reward": 0.7388393208384514, "rewards/format_reward": 1.0, "step": 5810 }, { "completion_length": 418.8571586608887, "epoch": 0.45306257673494044, "grad_norm": 0.0915586398494089, "kl": 0.0035810470581054688, "learning_rate": 5.734485098032785e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.0785365728661418, "rewards/accuracy_reward": 0.8571428805589676, "rewards/format_reward": 1.0, "step": 5812 }, { "completion_length": 429.63841247558594, "epoch": 0.4532184826457233, "grad_norm": 0.11504976831274016, "kl": 0.0030088424682617188, "learning_rate": 5.732062565373581e-07, "loss": 0.0001, "reward": 1.6830358058214188, "reward_std": 0.05553865246474743, "rewards/accuracy_reward": 0.6830357424914837, "rewards/format_reward": 1.0, "step": 5814 }, { "completion_length": 414.69197845458984, "epoch": 0.45337438855650614, "grad_norm": 0.0717405412307599, "kl": 0.002864837646484375, "learning_rate": 5.729639857087885e-07, "loss": 0.0001, "reward": 1.9129464775323868, "reward_std": 0.024124749936163425, "rewards/accuracy_reward": 0.9151786044239998, "rewards/format_reward": 0.9977678656578064, "step": 5816 }, { "completion_length": 420.7143020629883, "epoch": 0.453530294467289, "grad_norm": 0.07102732292974774, "kl": 0.0028934478759765625, "learning_rate": 5.727216973756921e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 5818 }, { "completion_length": 420.7321662902832, "epoch": 0.45368620037807184, "grad_norm": 0.10957791584816526, "kl": 0.003021240234375, "learning_rate": 5.724793915961955e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.07417591940611601, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 0.9977678656578064, "step": 5820 }, { "completion_length": 434.31921768188477, "epoch": 0.4538421062888547, "grad_norm": 0.06505187644126822, "kl": 0.0036792755126953125, "learning_rate": 5.722370684284293e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 5822 }, { "completion_length": 418.10493087768555, "epoch": 0.45399801219963754, "grad_norm": 0.09266941274655936, "kl": 0.003070831298828125, "learning_rate": 5.719947279305282e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.03803044930100441, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 5824 }, { "completion_length": 430.00001525878906, "epoch": 0.4541539181104204, "grad_norm": 0.08574660878676689, "kl": 0.0030612945556640625, "learning_rate": 5.717523701606316e-07, "loss": 0.0001, "reward": 1.8883929401636124, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.88839291036129, "rewards/format_reward": 1.0, "step": 5826 }, { "completion_length": 421.7634048461914, "epoch": 0.4543098240212032, "grad_norm": 0.06873803479420593, "kl": 0.0027017593383789062, "learning_rate": 5.715099951768827e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.056667713448405266, "rewards/accuracy_reward": 0.814732164144516, "rewards/format_reward": 1.0, "step": 5828 }, { "completion_length": 425.46430587768555, "epoch": 0.45446572993198603, "grad_norm": 0.09131613350480783, "kl": 0.003143310546875, "learning_rate": 5.712676030374284e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 5830 }, { "completion_length": 428.0692138671875, "epoch": 0.4546216358427689, "grad_norm": 0.11863322106567895, "kl": 0.0032138824462890625, "learning_rate": 5.710251938004202e-07, "loss": 0.0001, "reward": 1.7611607909202576, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 1.0, "step": 5832 }, { "completion_length": 426.1294822692871, "epoch": 0.45477754175355173, "grad_norm": 0.08799720766726576, "kl": 0.0028047561645507812, "learning_rate": 5.70782767524014e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.0562126561999321, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 5834 }, { "completion_length": 427.7656440734863, "epoch": 0.4549334476643346, "grad_norm": 0.12343256363054707, "kl": 0.0031795501708984375, "learning_rate": 5.705403242663691e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.0764318099245429, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 5836 }, { "completion_length": 419.3794860839844, "epoch": 0.45508935357511743, "grad_norm": 0.08816974754399748, "kl": 0.0030241012573242188, "learning_rate": 5.702978640856494e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.7522321715950966, "rewards/format_reward": 1.0, "step": 5838 }, { "completion_length": 421.5335006713867, "epoch": 0.4552452594859003, "grad_norm": 0.0030538263218488435, "kl": 0.0023965835571289062, "learning_rate": 5.700553870400224e-07, "loss": 0.0001, "reward": 1.834821492433548, "reward_std": 0.025100449100136757, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 5840 }, { "completion_length": 411.25001525878906, "epoch": 0.4554011653966831, "grad_norm": 0.049277335456028, "kl": 0.0028018951416015625, "learning_rate": 5.6981289318766e-07, "loss": 0.0001, "reward": 1.910714328289032, "reward_std": 0.026077548041939735, "rewards/accuracy_reward": 0.9107143059372902, "rewards/format_reward": 1.0, "step": 5842 }, { "completion_length": 430.03350830078125, "epoch": 0.4555570713074659, "grad_norm": 0.11408117915424516, "kl": 0.0034971237182617188, "learning_rate": 5.695703825867382e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.0740225501358509, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 5844 }, { "completion_length": 418.4375190734863, "epoch": 0.45571297721824877, "grad_norm": 0.06774332062665994, "kl": 0.0027265548706054688, "learning_rate": 5.693278552954363e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.031112208031117916, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 1.0, "step": 5846 }, { "completion_length": 424.6808204650879, "epoch": 0.4558688831290316, "grad_norm": 0.06979814576454482, "kl": 0.0031118392944335938, "learning_rate": 5.690853113719389e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.037424259819090366, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 1.0, "step": 5848 }, { "completion_length": 429.1160888671875, "epoch": 0.45602478903981447, "grad_norm": 0.04702334098070883, "kl": 0.0030975341796875, "learning_rate": 5.688427508744334e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.04178631864488125, "rewards/accuracy_reward": 0.7544643059372902, "rewards/format_reward": 1.0, "step": 5850 }, { "completion_length": 426.52680587768555, "epoch": 0.4561806949505973, "grad_norm": 0.09500964678235423, "kl": 0.002735137939453125, "learning_rate": 5.686001738611117e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.06057331245392561, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 1.0, "step": 5852 }, { "completion_length": 425.9308166503906, "epoch": 0.45633660086138017, "grad_norm": 0.07445930845610407, "kl": 0.0029401779174804688, "learning_rate": 5.683575803901694e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.04696930479258299, "rewards/accuracy_reward": 0.8504464849829674, "rewards/format_reward": 1.0, "step": 5854 }, { "completion_length": 423.1875190734863, "epoch": 0.456492506772163, "grad_norm": 0.06656345362906735, "kl": 0.0028696060180664062, "learning_rate": 5.681149705198066e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.04049275163561106, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 0.9977678656578064, "step": 5856 }, { "completion_length": 415.83484268188477, "epoch": 0.4566484126829458, "grad_norm": 0.07851363866544794, "kl": 0.003681182861328125, "learning_rate": 5.678723443082268e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.04373911488801241, "rewards/accuracy_reward": 0.863839328289032, "rewards/format_reward": 1.0, "step": 5858 }, { "completion_length": 426.1227836608887, "epoch": 0.45680431859372866, "grad_norm": 0.08104631632223215, "kl": 0.0032434463500976562, "learning_rate": 5.676297018136373e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.0625261114910245, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 5860 }, { "completion_length": 426.8192138671875, "epoch": 0.4569602245045115, "grad_norm": 0.11615598210671564, "kl": 0.0030517578125, "learning_rate": 5.673870430942499e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.04907546751201153, "rewards/accuracy_reward": 0.8973214402794838, "rewards/format_reward": 1.0, "step": 5862 }, { "completion_length": 430.5156440734863, "epoch": 0.45711613041529436, "grad_norm": 0.1200532433451676, "kl": 0.00312042236328125, "learning_rate": 5.671443682082799e-07, "loss": 0.0001, "reward": 1.6651786416769028, "reward_std": 0.08372096344828606, "rewards/accuracy_reward": 0.6651786081492901, "rewards/format_reward": 1.0, "step": 5864 }, { "completion_length": 430.55805587768555, "epoch": 0.4572720363260772, "grad_norm": 0.07951751529129676, "kl": 0.003139495849609375, "learning_rate": 5.669016772139465e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.8504464700818062, "rewards/format_reward": 1.0, "step": 5866 }, { "completion_length": 429.5491256713867, "epoch": 0.45742794223686006, "grad_norm": 0.05605089458544741, "kl": 0.0033903121948242188, "learning_rate": 5.666589701694729e-07, "loss": 0.0001, "reward": 1.7209822088479996, "reward_std": 0.039680153131484985, "rewards/accuracy_reward": 0.7209821604192257, "rewards/format_reward": 1.0, "step": 5868 }, { "completion_length": 414.5848388671875, "epoch": 0.4575838481476429, "grad_norm": 0.06696841813801829, "kl": 0.0025434494018554688, "learning_rate": 5.664162471330862e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.020893159322440624, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 5870 }, { "completion_length": 428.60716247558594, "epoch": 0.45773975405842576, "grad_norm": 0.048976583192461044, "kl": 0.0026597976684570312, "learning_rate": 5.66173508163017e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.01781129650771618, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 5872 }, { "completion_length": 424.1361846923828, "epoch": 0.45789565996920856, "grad_norm": 0.08715036236690861, "kl": 0.0047016143798828125, "learning_rate": 5.659307533174997e-07, "loss": 0.0002, "reward": 1.8638393580913544, "reward_std": 0.05929451808333397, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 0.9977678656578064, "step": 5874 }, { "completion_length": 405.189754486084, "epoch": 0.4580515658799914, "grad_norm": 0.08305964480989249, "kl": 0.0027751922607421875, "learning_rate": 5.656879826547734e-07, "loss": 0.0001, "reward": 1.8861608058214188, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 5876 }, { "completion_length": 423.6919822692871, "epoch": 0.45820747179077426, "grad_norm": 0.09999558055392938, "kl": 0.003204345703125, "learning_rate": 5.654451962330797e-07, "loss": 0.0001, "reward": 1.767857238650322, "reward_std": 0.04599360562860966, "rewards/accuracy_reward": 0.7678571790456772, "rewards/format_reward": 1.0, "step": 5878 }, { "completion_length": 421.95984268188477, "epoch": 0.4583633777015571, "grad_norm": 0.10114162760484328, "kl": 0.00315093994140625, "learning_rate": 5.65202394110665e-07, "loss": 0.0001, "reward": 1.8593751043081284, "reward_std": 0.04501790925860405, "rewards/accuracy_reward": 0.859375037252903, "rewards/format_reward": 1.0, "step": 5880 }, { "completion_length": 415.47769927978516, "epoch": 0.45851928361233996, "grad_norm": 0.08522271639400873, "kl": 0.002979278564453125, "learning_rate": 5.64959576345779e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.03772735595703125, "rewards/accuracy_reward": 0.8348214477300644, "rewards/format_reward": 1.0, "step": 5882 }, { "completion_length": 433.4732360839844, "epoch": 0.4586751895231228, "grad_norm": 0.12366289080613674, "kl": 0.003192901611328125, "learning_rate": 5.64716742996675e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.0698152594268322, "rewards/accuracy_reward": 0.8638392984867096, "rewards/format_reward": 0.9977678656578064, "step": 5884 }, { "completion_length": 420.2634086608887, "epoch": 0.45883109543390566, "grad_norm": 0.08866306304014844, "kl": 0.0029306411743164062, "learning_rate": 5.644738941216105e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.05035426281392574, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 5886 }, { "completion_length": 414.5111770629883, "epoch": 0.4589870013446885, "grad_norm": 0.09840276628214818, "kl": 0.0026111602783203125, "learning_rate": 5.642310297788466e-07, "loss": 0.0001, "reward": 1.9017857909202576, "reward_std": 0.06688676495105028, "rewards/accuracy_reward": 0.9017857536673546, "rewards/format_reward": 1.0, "step": 5888 }, { "completion_length": 430.892879486084, "epoch": 0.4591429072554713, "grad_norm": 0.1003762375173898, "kl": 0.003185272216796875, "learning_rate": 5.639881500266478e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 5890 }, { "completion_length": 418.93974685668945, "epoch": 0.45929881316625415, "grad_norm": 0.11006191524328982, "kl": 0.003032684326171875, "learning_rate": 5.637452549232827e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.052005368284881115, "rewards/accuracy_reward": 0.8727678880095482, "rewards/format_reward": 1.0, "step": 5892 }, { "completion_length": 408.89734268188477, "epoch": 0.459454719077037, "grad_norm": 0.08245699334942458, "kl": 0.002674102783203125, "learning_rate": 5.635023445270231e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.03788072057068348, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 1.0, "step": 5894 }, { "completion_length": 426.72769927978516, "epoch": 0.45961062498781985, "grad_norm": 0.09887399999746299, "kl": 0.00316619873046875, "learning_rate": 5.63259418896145e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.0691426582634449, "rewards/accuracy_reward": 0.7767857499420643, "rewards/format_reward": 1.0, "step": 5896 }, { "completion_length": 423.0893020629883, "epoch": 0.4597665308986027, "grad_norm": 0.06642016105180171, "kl": 0.00270843505859375, "learning_rate": 5.630164780889277e-07, "loss": 0.0001, "reward": 1.8683036416769028, "reward_std": 0.04794640466570854, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 5898 }, { "completion_length": 419.3169822692871, "epoch": 0.45992243680938555, "grad_norm": 0.06548438881310722, "kl": 0.003116607666015625, "learning_rate": 5.627735221636544e-07, "loss": 0.0001, "reward": 1.87276791036129, "reward_std": 0.06658367160707712, "rewards/accuracy_reward": 0.8727678805589676, "rewards/format_reward": 1.0, "step": 5900 }, { "completion_length": 426.3750190734863, "epoch": 0.4600783427201684, "grad_norm": 0.06638344700501803, "kl": 0.0032453536987304688, "learning_rate": 5.625305511786116e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.04035415779799223, "rewards/accuracy_reward": 0.790178619325161, "rewards/format_reward": 1.0, "step": 5902 }, { "completion_length": 428.2232322692871, "epoch": 0.46023424863095125, "grad_norm": 0.07199803522749652, "kl": 0.0029430389404296875, "learning_rate": 5.622875651920896e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.046666210517287254, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 5904 }, { "completion_length": 427.5134086608887, "epoch": 0.46039015454173404, "grad_norm": 0.07934180567031145, "kl": 0.0029468536376953125, "learning_rate": 5.620445642623825e-07, "loss": 0.0001, "reward": 1.7388393431901932, "reward_std": 0.05929451994597912, "rewards/accuracy_reward": 0.7388393059372902, "rewards/format_reward": 1.0, "step": 5906 }, { "completion_length": 422.06922149658203, "epoch": 0.4605460604525169, "grad_norm": 0.04908456498336949, "kl": 0.002941131591796875, "learning_rate": 5.618015484477874e-07, "loss": 0.0001, "reward": 1.805803656578064, "reward_std": 0.04569051414728165, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 5908 }, { "completion_length": 442.5223388671875, "epoch": 0.46070196636329974, "grad_norm": 0.08212729138211808, "kl": 0.0028934478759765625, "learning_rate": 5.615585178066057e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.03675165772438049, "rewards/accuracy_reward": 0.8459821939468384, "rewards/format_reward": 1.0, "step": 5910 }, { "completion_length": 423.41743087768555, "epoch": 0.4608578722740826, "grad_norm": 0.0872222278934473, "kl": 0.0030345916748046875, "learning_rate": 5.61315472397142e-07, "loss": 0.0001, "reward": 1.8616071939468384, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 1.0, "step": 5912 }, { "completion_length": 431.5937690734863, "epoch": 0.46101377818486544, "grad_norm": 0.09307375936044814, "kl": 0.0033473968505859375, "learning_rate": 5.610724122777042e-07, "loss": 0.0001, "reward": 1.8058036118745804, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8058035895228386, "rewards/format_reward": 1.0, "step": 5914 }, { "completion_length": 419.6808204650879, "epoch": 0.4611696840956483, "grad_norm": 0.08661667951373939, "kl": 0.003322601318359375, "learning_rate": 5.608293375066041e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.06899152882397175, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 5916 }, { "completion_length": 428.6160888671875, "epoch": 0.46132559000643114, "grad_norm": 0.08591167529999126, "kl": 0.003047943115234375, "learning_rate": 5.60586248142157e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.05523555539548397, "rewards/accuracy_reward": 0.7433036006987095, "rewards/format_reward": 1.0, "step": 5918 }, { "completion_length": 423.80135345458984, "epoch": 0.46148149591721394, "grad_norm": 0.04333658954901421, "kl": 0.0030736923217773438, "learning_rate": 5.603431442426814e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.04666620958596468, "rewards/accuracy_reward": 0.8482143357396126, "rewards/format_reward": 1.0, "step": 5920 }, { "completion_length": 433.939754486084, "epoch": 0.4616374018279968, "grad_norm": 0.09798775129388229, "kl": 0.0032625198364257812, "learning_rate": 5.601000258664996e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 5922 }, { "completion_length": 427.0156440734863, "epoch": 0.46179330773877963, "grad_norm": 0.1060760496044766, "kl": 0.0031490325927734375, "learning_rate": 5.598568930719373e-07, "loss": 0.0001, "reward": 1.7566965222358704, "reward_std": 0.0636551734060049, "rewards/accuracy_reward": 0.7589285969734192, "rewards/format_reward": 0.9977678656578064, "step": 5924 }, { "completion_length": 428.55582427978516, "epoch": 0.4619492136495625, "grad_norm": 0.09948174311204519, "kl": 0.0030536651611328125, "learning_rate": 5.596137459173236e-07, "loss": 0.0001, "reward": 1.8772322535514832, "reward_std": 0.05230705812573433, "rewards/accuracy_reward": 0.877232164144516, "rewards/format_reward": 1.0, "step": 5926 }, { "completion_length": 420.4352836608887, "epoch": 0.46210511956034533, "grad_norm": 0.08469346469649534, "kl": 0.0030765533447265625, "learning_rate": 5.593705844609907e-07, "loss": 0.0001, "reward": 1.8058036863803864, "reward_std": 0.052307059057056904, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 5928 }, { "completion_length": 418.6897506713867, "epoch": 0.4622610254711282, "grad_norm": 0.0890648495589157, "kl": 0.0027256011962890625, "learning_rate": 5.591274087612752e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 1.0, "step": 5930 }, { "completion_length": 420.3638610839844, "epoch": 0.46241693138191103, "grad_norm": 0.10965814124041609, "kl": 0.0030670166015625, "learning_rate": 5.588842188765161e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 5932 }, { "completion_length": 431.52234268188477, "epoch": 0.4625728372926939, "grad_norm": 0.06921518811028689, "kl": 0.0030612945556640625, "learning_rate": 5.586410148650563e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.024124750867486, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 1.0, "step": 5934 }, { "completion_length": 429.7522506713867, "epoch": 0.4627287432034767, "grad_norm": 0.09177368923969591, "kl": 0.0032501220703125, "learning_rate": 5.583977967852419e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.814732164144516, "rewards/format_reward": 1.0, "step": 5936 }, { "completion_length": 413.8460006713867, "epoch": 0.4628846491142595, "grad_norm": 0.06520341841585132, "kl": 0.002532958984375, "learning_rate": 5.581545646954227e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 5938 }, { "completion_length": 415.5401954650879, "epoch": 0.4630405550250424, "grad_norm": 0.0833249119824517, "kl": 0.00323486328125, "learning_rate": 5.579113186539514e-07, "loss": 0.0001, "reward": 1.678571492433548, "reward_std": 0.045993607491254807, "rewards/accuracy_reward": 0.6785714700818062, "rewards/format_reward": 1.0, "step": 5940 }, { "completion_length": 411.94644927978516, "epoch": 0.4631964609358252, "grad_norm": 0.05580241687086036, "kl": 0.0028591156005859375, "learning_rate": 5.576680587191846e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.04230555426329374, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 5942 }, { "completion_length": 416.8147506713867, "epoch": 0.4633523668466081, "grad_norm": 0.06302688421607995, "kl": 0.00229644775390625, "learning_rate": 5.574247849494815e-07, "loss": 0.0001, "reward": 1.90401791036129, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.9040178880095482, "rewards/format_reward": 1.0, "step": 5944 }, { "completion_length": 415.2366256713867, "epoch": 0.4635082727573909, "grad_norm": 0.08268337025130719, "kl": 0.0027265548706054688, "learning_rate": 5.571814974032053e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.8214285895228386, "rewards/format_reward": 0.9977678656578064, "step": 5946 }, { "completion_length": 406.24332427978516, "epoch": 0.4636641786681738, "grad_norm": 0.07174920175044319, "kl": 0.0023851394653320312, "learning_rate": 5.569381961387221e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.03208790719509125, "rewards/accuracy_reward": 0.754464328289032, "rewards/format_reward": 1.0, "step": 5948 }, { "completion_length": 428.1607360839844, "epoch": 0.4638200845789566, "grad_norm": 0.11627406825337562, "kl": 0.003658294677734375, "learning_rate": 5.566948812144016e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.08582712430506945, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 5950 }, { "completion_length": 427.229923248291, "epoch": 0.4639759904897394, "grad_norm": 0.10372386540489525, "kl": 0.003078460693359375, "learning_rate": 5.564515526886164e-07, "loss": 0.0001, "reward": 1.7098215073347092, "reward_std": 0.04599360655993223, "rewards/accuracy_reward": 0.709821455180645, "rewards/format_reward": 1.0, "step": 5952 }, { "completion_length": 434.5647506713867, "epoch": 0.46413189640052227, "grad_norm": 0.06740430823490107, "kl": 0.004385948181152344, "learning_rate": 5.562082106197429e-07, "loss": 0.0002, "reward": 1.8236607909202576, "reward_std": 0.041112312115728855, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 0.9977678656578064, "step": 5954 }, { "completion_length": 436.70091247558594, "epoch": 0.4642878023113051, "grad_norm": 0.10797025559461074, "kl": 0.0032978057861328125, "learning_rate": 5.5596485506616e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.05117799621075392, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 5956 }, { "completion_length": 431.0067138671875, "epoch": 0.46444370822208797, "grad_norm": 0.09231539809280817, "kl": 0.0045642852783203125, "learning_rate": 5.557214860862504e-07, "loss": 0.0002, "reward": 1.7455357909202576, "reward_std": 0.04373771511018276, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 5958 }, { "completion_length": 420.2165298461914, "epoch": 0.4645996141328708, "grad_norm": 0.12488691451826157, "kl": 0.0029687881469726562, "learning_rate": 5.554781037384001e-07, "loss": 0.0001, "reward": 1.8281250447034836, "reward_std": 0.04959610849618912, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 5960 }, { "completion_length": 421.9397506713867, "epoch": 0.46475552004365367, "grad_norm": 0.1113391985875679, "kl": 0.003383636474609375, "learning_rate": 5.552347080809979e-07, "loss": 0.0001, "reward": 1.8504465222358704, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 5962 }, { "completion_length": 432.1919822692871, "epoch": 0.4649114259544365, "grad_norm": 0.10268512674677854, "kl": 0.002971649169921875, "learning_rate": 5.54991299172436e-07, "loss": 0.0001, "reward": 1.8950893431901932, "reward_std": 0.061852107755839825, "rewards/accuracy_reward": 0.8950893133878708, "rewards/format_reward": 1.0, "step": 5964 }, { "completion_length": 436.58484268188477, "epoch": 0.4650673318652193, "grad_norm": 0.10070587813595056, "kl": 0.003101348876953125, "learning_rate": 5.547478770711096e-07, "loss": 0.0001, "reward": 1.7187500894069672, "reward_std": 0.06688676588237286, "rewards/accuracy_reward": 0.7187500298023224, "rewards/format_reward": 1.0, "step": 5966 }, { "completion_length": 427.0201072692871, "epoch": 0.46522323777600216, "grad_norm": 0.055179479378441906, "kl": 0.0027217864990234375, "learning_rate": 5.545044418354173e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.036751655861735344, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 5968 }, { "completion_length": 425.5826110839844, "epoch": 0.465379143686785, "grad_norm": 0.1333063865910371, "kl": 0.0031414031982421875, "learning_rate": 5.542609935237609e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.05734172277152538, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 5970 }, { "completion_length": 424.22099685668945, "epoch": 0.46553504959756786, "grad_norm": 0.08604833119455593, "kl": 0.00311279296875, "learning_rate": 5.540175321945451e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.03352006711065769, "rewards/accuracy_reward": 0.8080357611179352, "rewards/format_reward": 1.0, "step": 5972 }, { "completion_length": 412.89064025878906, "epoch": 0.4656909555083507, "grad_norm": 0.09237119037761411, "kl": 0.002407073974609375, "learning_rate": 5.537740579061782e-07, "loss": 0.0001, "reward": 1.8727679401636124, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.87276791036129, "rewards/format_reward": 1.0, "step": 5974 }, { "completion_length": 415.73662185668945, "epoch": 0.46584686141913356, "grad_norm": 0.08131089102335504, "kl": 0.003040313720703125, "learning_rate": 5.535305707170705e-07, "loss": 0.0001, "reward": 1.8281250596046448, "reward_std": 0.05230706091970205, "rewards/accuracy_reward": 0.8303571864962578, "rewards/format_reward": 0.9977678656578064, "step": 5976 }, { "completion_length": 428.4375190734863, "epoch": 0.4660027673299164, "grad_norm": 0.09163623677600292, "kl": 0.0030879974365234375, "learning_rate": 5.532870706856366e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.06545460689812899, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 5978 }, { "completion_length": 415.38172149658203, "epoch": 0.46615867324069926, "grad_norm": 0.1131919204834752, "kl": 0.002780914306640625, "learning_rate": 5.530435578702938e-07, "loss": 0.0001, "reward": 1.8482143878936768, "reward_std": 0.06222161278128624, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 5980 }, { "completion_length": 445.1428756713867, "epoch": 0.46631457915148206, "grad_norm": 0.1049436058398845, "kl": 0.003173828125, "learning_rate": 5.528000323294622e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.07094432599842548, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 0.9977678656578064, "step": 5982 }, { "completion_length": 425.1651954650879, "epoch": 0.4664704850622649, "grad_norm": 0.09607776637338954, "kl": 0.0027475357055664062, "learning_rate": 5.525564941215652e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.05681744311004877, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 5984 }, { "completion_length": 413.79019927978516, "epoch": 0.46662639097304776, "grad_norm": 0.0033349785553885964, "kl": 0.0027942657470703125, "learning_rate": 5.52312943305029e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 5986 }, { "completion_length": 425.91073989868164, "epoch": 0.4667822968838306, "grad_norm": 0.11279390551480566, "kl": 0.00270843505859375, "learning_rate": 5.520693799382831e-07, "loss": 0.0001, "reward": 1.8995536267757416, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.8995535969734192, "rewards/format_reward": 1.0, "step": 5988 }, { "completion_length": 417.03349685668945, "epoch": 0.46693820279461346, "grad_norm": 0.0031322228328713813, "kl": 0.0029191970825195312, "learning_rate": 5.5182580407976e-07, "loss": 0.0001, "reward": 1.933035746216774, "reward_std": 0.04727240093052387, "rewards/accuracy_reward": 0.9330357387661934, "rewards/format_reward": 1.0, "step": 5990 }, { "completion_length": 419.01341247558594, "epoch": 0.4670941087053963, "grad_norm": 0.11899621168767974, "kl": 0.003215789794921875, "learning_rate": 5.515822157878951e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.07582562137395144, "rewards/accuracy_reward": 0.7723214514553547, "rewards/format_reward": 1.0, "step": 5992 }, { "completion_length": 429.4754638671875, "epoch": 0.46725001461617915, "grad_norm": 0.08190724003937278, "kl": 0.0030975341796875, "learning_rate": 5.513386151211268e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 5994 }, { "completion_length": 420.3794822692871, "epoch": 0.467405920526962, "grad_norm": 0.08727173937142753, "kl": 0.002414703369140625, "learning_rate": 5.510950021378964e-07, "loss": 0.0001, "reward": 1.8660714775323868, "reward_std": 0.057134619913995266, "rewards/accuracy_reward": 0.8705357536673546, "rewards/format_reward": 0.9955357164144516, "step": 5996 }, { "completion_length": 416.3437690734863, "epoch": 0.4675618264377448, "grad_norm": 0.1030521781180366, "kl": 0.0027608871459960938, "learning_rate": 5.508513768966481e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.04794640373438597, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 5998 }, { "completion_length": 422.7790336608887, "epoch": 0.46771773234852765, "grad_norm": 0.06376862539810425, "kl": 0.0024738311767578125, "learning_rate": 5.506077394558293e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 6000 }, { "completion_length": 440.88841247558594, "epoch": 0.4678736382593105, "grad_norm": 0.08638091885043966, "kl": 0.0034646987915039062, "learning_rate": 5.503640898738902e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.04178631864488125, "rewards/accuracy_reward": 0.8616071715950966, "rewards/format_reward": 1.0, "step": 6002 }, { "completion_length": 433.8102912902832, "epoch": 0.46802954417009335, "grad_norm": 0.08258030150976388, "kl": 0.00305938720703125, "learning_rate": 5.501204282092839e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.052003965713083744, "rewards/accuracy_reward": 0.8392857685685158, "rewards/format_reward": 1.0, "step": 6004 }, { "completion_length": 428.49555587768555, "epoch": 0.4681854500808762, "grad_norm": 0.06011856488855205, "kl": 0.0028057098388671875, "learning_rate": 5.498767545204661e-07, "loss": 0.0001, "reward": 1.8504464775323868, "reward_std": 0.034495764411985874, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 6006 }, { "completion_length": 416.15180587768555, "epoch": 0.46834135599165905, "grad_norm": 0.036601745794389344, "kl": 0.0031337738037109375, "learning_rate": 5.496330688658962e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.859375037252903, "rewards/format_reward": 1.0, "step": 6008 }, { "completion_length": 429.0156478881836, "epoch": 0.4684972619024419, "grad_norm": 0.003354878358947265, "kl": 0.003009796142578125, "learning_rate": 5.493893713040353e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.034495764411985874, "rewards/accuracy_reward": 0.7790179029107094, "rewards/format_reward": 1.0, "step": 6010 }, { "completion_length": 425.2700996398926, "epoch": 0.4686531678132247, "grad_norm": 0.08494094041037724, "kl": 0.003170013427734375, "learning_rate": 5.491456618933485e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 6012 }, { "completion_length": 417.4620704650879, "epoch": 0.46880907372400754, "grad_norm": 0.002978165192054141, "kl": 0.0027418136596679688, "learning_rate": 5.489019406923032e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.05590956099331379, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 6014 }, { "completion_length": 415.7366256713867, "epoch": 0.4689649796347904, "grad_norm": 0.053434529850969324, "kl": 0.0028781890869140625, "learning_rate": 5.486582077593697e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8526785895228386, "rewards/format_reward": 1.0, "step": 6016 }, { "completion_length": 439.6763572692871, "epoch": 0.46912088554557324, "grad_norm": 0.08690001264213472, "kl": 0.00330352783203125, "learning_rate": 5.484144631530208e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.0529810655862093, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 6018 }, { "completion_length": 428.5067138671875, "epoch": 0.4692767914563561, "grad_norm": 0.08861256356126451, "kl": 0.002895355224609375, "learning_rate": 5.481707069317325e-07, "loss": 0.0001, "reward": 1.7924108058214188, "reward_std": 0.05959621164947748, "rewards/accuracy_reward": 0.7924107387661934, "rewards/format_reward": 1.0, "step": 6020 }, { "completion_length": 432.14288330078125, "epoch": 0.46943269736713894, "grad_norm": 0.10524903116743653, "kl": 0.0031909942626953125, "learning_rate": 5.479269391539837e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 6022 }, { "completion_length": 417.00672149658203, "epoch": 0.4695886032779218, "grad_norm": 0.05255633430037134, "kl": 0.0025415420532226562, "learning_rate": 5.476831598782555e-07, "loss": 0.0001, "reward": 1.9218750596046448, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.9218750298023224, "rewards/format_reward": 1.0, "step": 6024 }, { "completion_length": 415.3504638671875, "epoch": 0.46974450918870464, "grad_norm": 0.09523145303060798, "kl": 0.0026187896728515625, "learning_rate": 5.474393691630325e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.03998324926942587, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 6026 }, { "completion_length": 425.3259086608887, "epoch": 0.46990041509948743, "grad_norm": 0.09777072398648365, "kl": 0.0030069351196289062, "learning_rate": 5.471955670668013e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.05261015519499779, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 6028 }, { "completion_length": 413.30582427978516, "epoch": 0.4700563210102703, "grad_norm": 0.052751756314903774, "kl": 0.0031023025512695312, "learning_rate": 5.469517536480518e-07, "loss": 0.0001, "reward": 1.8794643580913544, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.8794643133878708, "rewards/format_reward": 1.0, "step": 6030 }, { "completion_length": 435.49109268188477, "epoch": 0.47021222692105313, "grad_norm": 0.09678560906485671, "kl": 0.0030269622802734375, "learning_rate": 5.467079289652762e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.058467146940529346, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 6032 }, { "completion_length": 406.1361770629883, "epoch": 0.470368132831836, "grad_norm": 0.04291079608022508, "kl": 0.0026979446411132812, "learning_rate": 5.464640930769697e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.872767873108387, "rewards/format_reward": 1.0, "step": 6034 }, { "completion_length": 404.61608505249023, "epoch": 0.47052403874261883, "grad_norm": 0.10108775225278642, "kl": 0.0026683807373046875, "learning_rate": 5.462202460416304e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.03968015406280756, "rewards/accuracy_reward": 0.8281250223517418, "rewards/format_reward": 1.0, "step": 6036 }, { "completion_length": 423.1294860839844, "epoch": 0.4706799446534017, "grad_norm": 0.09565434180073484, "kl": 0.0028963088989257812, "learning_rate": 5.459763879177582e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 6038 }, { "completion_length": 419.7901954650879, "epoch": 0.47083585056418453, "grad_norm": 0.12152353169561331, "kl": 0.0031604766845703125, "learning_rate": 5.457325187638566e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 6040 }, { "completion_length": 437.21653747558594, "epoch": 0.4709917564749674, "grad_norm": 0.08775477691067604, "kl": 0.003387451171875, "learning_rate": 5.454886386384313e-07, "loss": 0.0001, "reward": 1.8035715073347092, "reward_std": 0.05035426188260317, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 0.9977678656578064, "step": 6042 }, { "completion_length": 425.6986846923828, "epoch": 0.4711476623857502, "grad_norm": 0.0704731450564959, "kl": 0.0029554367065429688, "learning_rate": 5.452447475999906e-07, "loss": 0.0001, "reward": 1.7924107909202576, "reward_std": 0.048619008623063564, "rewards/accuracy_reward": 0.7924107611179352, "rewards/format_reward": 1.0, "step": 6044 }, { "completion_length": 426.8549270629883, "epoch": 0.471303568296533, "grad_norm": 0.10629520833942456, "kl": 0.003047943115234375, "learning_rate": 5.450008457070457e-07, "loss": 0.0001, "reward": 1.7790179401636124, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 6046 }, { "completion_length": 426.28796768188477, "epoch": 0.4714594742073159, "grad_norm": 0.092881454834492, "kl": 0.002933502197265625, "learning_rate": 5.4475693301811e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.04404080752283335, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 0.9977678656578064, "step": 6048 }, { "completion_length": 428.02457427978516, "epoch": 0.4716153801180987, "grad_norm": 0.1429456491274047, "kl": 0.0030689239501953125, "learning_rate": 5.445130095916998e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.06319871358573437, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 6050 }, { "completion_length": 422.4419860839844, "epoch": 0.4717712860288816, "grad_norm": 0.09147741904661609, "kl": 0.0027008056640625, "learning_rate": 5.442690754863341e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.022171951830387115, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 1.0, "step": 6052 }, { "completion_length": 410.15403747558594, "epoch": 0.4719271919396644, "grad_norm": 0.10109569276915402, "kl": 0.0027647018432617188, "learning_rate": 5.440251307605339e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.046296700835227966, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 6054 }, { "completion_length": 426.7544860839844, "epoch": 0.4720830978504473, "grad_norm": 0.09410757974982928, "kl": 0.0032253265380859375, "learning_rate": 5.437811754728233e-07, "loss": 0.0001, "reward": 1.906250074505806, "reward_std": 0.061702375300228596, "rewards/accuracy_reward": 0.906250037252903, "rewards/format_reward": 1.0, "step": 6056 }, { "completion_length": 416.5178756713867, "epoch": 0.4722390037612301, "grad_norm": 0.04534522803595235, "kl": 0.0028133392333984375, "learning_rate": 5.435372096817289e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8325893208384514, "rewards/format_reward": 1.0, "step": 6058 }, { "completion_length": 421.73439025878906, "epoch": 0.4723949096720129, "grad_norm": 0.10272464908761829, "kl": 0.0029506683349609375, "learning_rate": 5.432932334457792e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.0641758143901825, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 6060 }, { "completion_length": 416.27903747558594, "epoch": 0.47255081558279577, "grad_norm": 0.07793298313168227, "kl": 0.0030879974365234375, "learning_rate": 5.43049246823506e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.04111231118440628, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 6062 }, { "completion_length": 432.02234268188477, "epoch": 0.4727067214935786, "grad_norm": 0.09937728797749386, "kl": 0.0032825469970703125, "learning_rate": 5.428052498734432e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.0758270239457488, "rewards/accuracy_reward": 0.8102678805589676, "rewards/format_reward": 1.0, "step": 6064 }, { "completion_length": 422.0602836608887, "epoch": 0.47286262740436147, "grad_norm": 0.10916141713723333, "kl": 0.0033626556396484375, "learning_rate": 5.425612426541272e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.06493396684527397, "rewards/accuracy_reward": 0.7343750447034836, "rewards/format_reward": 1.0, "step": 6066 }, { "completion_length": 412.35939025878906, "epoch": 0.4730185333151443, "grad_norm": 0.07831815555151274, "kl": 0.0025854110717773438, "learning_rate": 5.42317225224097e-07, "loss": 0.0001, "reward": 1.9062500447034836, "reward_std": 0.01781129650771618, "rewards/accuracy_reward": 0.9062500223517418, "rewards/format_reward": 1.0, "step": 6068 }, { "completion_length": 431.58483505249023, "epoch": 0.47317443922592717, "grad_norm": 0.07622338778643047, "kl": 0.0036029815673828125, "learning_rate": 5.420731976418937e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.03788072057068348, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 6070 }, { "completion_length": 421.7410888671875, "epoch": 0.47333034513671, "grad_norm": 0.0887891655315908, "kl": 0.0027751922607421875, "learning_rate": 5.418291599660611e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.05343612376600504, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 6072 }, { "completion_length": 424.11609268188477, "epoch": 0.4734862510474928, "grad_norm": 0.06846290847107785, "kl": 0.0031442642211914062, "learning_rate": 5.415851122551457e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.04922519903630018, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 6074 }, { "completion_length": 422.35716247558594, "epoch": 0.47364215695827566, "grad_norm": 0.10207668581201639, "kl": 0.0031890869140625, "learning_rate": 5.413410545676958e-07, "loss": 0.0001, "reward": 1.7500000894069672, "reward_std": 0.06688676681369543, "rewards/accuracy_reward": 0.7522321753203869, "rewards/format_reward": 0.9977678656578064, "step": 6076 }, { "completion_length": 437.9531440734863, "epoch": 0.4737980628690585, "grad_norm": 0.06881409076712505, "kl": 0.0033664703369140625, "learning_rate": 5.410969869622626e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.04132985696196556, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 6078 }, { "completion_length": 431.70314025878906, "epoch": 0.47395396877984136, "grad_norm": 0.07715460086971355, "kl": 0.0032329559326171875, "learning_rate": 5.408529094973994e-07, "loss": 0.0001, "reward": 1.7566965371370316, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.758928619325161, "rewards/format_reward": 0.9977678656578064, "step": 6080 }, { "completion_length": 423.77234268188477, "epoch": 0.4741098746906242, "grad_norm": 0.07243146023887236, "kl": 0.00315093994140625, "learning_rate": 5.406088222316618e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.05764481518417597, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 6082 }, { "completion_length": 423.752254486084, "epoch": 0.47426578060140706, "grad_norm": 0.05200161275978517, "kl": 0.0028362274169921875, "learning_rate": 5.403647252236082e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.022171951830387115, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 6084 }, { "completion_length": 421.5714454650879, "epoch": 0.4744216865121899, "grad_norm": 0.04939689370709391, "kl": 0.0026807785034179688, "learning_rate": 5.401206185317988e-07, "loss": 0.0001, "reward": 1.7790179550647736, "reward_std": 0.024124749936163425, "rewards/accuracy_reward": 0.7790178954601288, "rewards/format_reward": 1.0, "step": 6086 }, { "completion_length": 433.455379486084, "epoch": 0.47457759242297276, "grad_norm": 0.04906981959211465, "kl": 0.0028972625732421875, "learning_rate": 5.398765022147965e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.03111220896244049, "rewards/accuracy_reward": 0.8415178954601288, "rewards/format_reward": 1.0, "step": 6088 }, { "completion_length": 415.3973388671875, "epoch": 0.47473349833375555, "grad_norm": 0.08017583990789867, "kl": 0.0026292800903320312, "learning_rate": 5.396323763311662e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.06463087350130081, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 6090 }, { "completion_length": 424.7812690734863, "epoch": 0.4748894042445384, "grad_norm": 0.053802452623542336, "kl": 0.0024023056030273438, "learning_rate": 5.393882409394755e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.06673339754343033, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 6092 }, { "completion_length": 433.88171005249023, "epoch": 0.47504531015532125, "grad_norm": 0.06392647989463275, "kl": 0.00325775146484375, "learning_rate": 5.391440960982935e-07, "loss": 0.0001, "reward": 1.7611607760190964, "reward_std": 0.04373911768198013, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 0.9977678656578064, "step": 6094 }, { "completion_length": 419.9018096923828, "epoch": 0.4752012160661041, "grad_norm": 0.09681531406787354, "kl": 0.0027914047241210938, "learning_rate": 5.388999418661929e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.05425985809415579, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6096 }, { "completion_length": 427.4576072692871, "epoch": 0.47535712197688695, "grad_norm": 0.08750969553998604, "kl": 0.0031452178955078125, "learning_rate": 5.386557783017475e-07, "loss": 0.0001, "reward": 1.6495536416769028, "reward_std": 0.05636602360755205, "rewards/accuracy_reward": 0.6495535969734192, "rewards/format_reward": 1.0, "step": 6098 }, { "completion_length": 432.32814025878906, "epoch": 0.4755130278876698, "grad_norm": 0.13743868263816825, "kl": 0.0030126571655273438, "learning_rate": 5.384116054635333e-07, "loss": 0.0001, "reward": 1.8125000596046448, "reward_std": 0.09393861051648855, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 1.0, "step": 6100 }, { "completion_length": 423.5893020629883, "epoch": 0.47566893379845265, "grad_norm": 0.07942279001618367, "kl": 0.0025014877319335938, "learning_rate": 5.381674234101295e-07, "loss": 0.0001, "reward": 1.8616071939468384, "reward_std": 0.04989920277148485, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 6102 }, { "completion_length": 426.0044822692871, "epoch": 0.4758248397092355, "grad_norm": 0.08700144815427648, "kl": 0.0031490325927734375, "learning_rate": 5.379232322001168e-07, "loss": 0.0001, "reward": 1.7812500894069672, "reward_std": 0.046970706433057785, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 1.0, "step": 6104 }, { "completion_length": 423.43305587768555, "epoch": 0.4759807456200183, "grad_norm": 0.05532511290108783, "kl": 0.0029010772705078125, "learning_rate": 5.376790318920781e-07, "loss": 0.0001, "reward": 1.787946492433548, "reward_std": 0.048099772073328495, "rewards/accuracy_reward": 0.7879464626312256, "rewards/format_reward": 1.0, "step": 6106 }, { "completion_length": 412.48885345458984, "epoch": 0.47613665153080115, "grad_norm": 0.08420905187453852, "kl": 0.002574920654296875, "learning_rate": 5.374348225445987e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.04712267126888037, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 1.0, "step": 6108 }, { "completion_length": 424.0625228881836, "epoch": 0.476292557441584, "grad_norm": 0.09820809997774711, "kl": 0.002765655517578125, "learning_rate": 5.37190604216266e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.06417581252753735, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 6110 }, { "completion_length": 421.2812690734863, "epoch": 0.47644846335236685, "grad_norm": 0.09768214681527025, "kl": 0.00293731689453125, "learning_rate": 5.369463769656693e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.0748499222099781, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 6112 }, { "completion_length": 412.69421005249023, "epoch": 0.4766043692631497, "grad_norm": 0.0465346132677167, "kl": 0.002605438232421875, "learning_rate": 5.367021408514006e-07, "loss": 0.0001, "reward": 1.8526786267757416, "reward_std": 0.031110807321965694, "rewards/accuracy_reward": 0.8526786044239998, "rewards/format_reward": 1.0, "step": 6114 }, { "completion_length": 416.93082427978516, "epoch": 0.47676027517393255, "grad_norm": 0.11195016156425698, "kl": 0.0029964447021484375, "learning_rate": 5.364578959320535e-07, "loss": 0.0001, "reward": 1.7678572088479996, "reward_std": 0.053436122834682465, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 6116 }, { "completion_length": 437.48216247558594, "epoch": 0.4769161810847154, "grad_norm": 0.09578919073222793, "kl": 0.0032176971435546875, "learning_rate": 5.362136422662241e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.0657577021047473, "rewards/accuracy_reward": 0.8370535895228386, "rewards/format_reward": 1.0, "step": 6118 }, { "completion_length": 437.6183280944824, "epoch": 0.4770720869954982, "grad_norm": 0.08768031414229505, "kl": 0.00365447998046875, "learning_rate": 5.359693799125104e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.04764330945909023, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 6120 }, { "completion_length": 432.0134162902832, "epoch": 0.47722799290628104, "grad_norm": 0.10618032746598646, "kl": 0.0032501220703125, "learning_rate": 5.357251089295123e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.05846714600920677, "rewards/accuracy_reward": 0.8705357387661934, "rewards/format_reward": 1.0, "step": 6122 }, { "completion_length": 427.5089530944824, "epoch": 0.4773838988170639, "grad_norm": 0.09080672434096974, "kl": 0.00274658203125, "learning_rate": 5.354808293758321e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.06523706298321486, "rewards/accuracy_reward": 0.8839286044239998, "rewards/format_reward": 1.0, "step": 6124 }, { "completion_length": 443.4352912902832, "epoch": 0.47753980472784674, "grad_norm": 0.0028601443870143167, "kl": 0.002628326416015625, "learning_rate": 5.352365413100742e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.8370535895228386, "rewards/format_reward": 1.0, "step": 6126 }, { "completion_length": 418.127254486084, "epoch": 0.4776957106386296, "grad_norm": 0.0050695142851841695, "kl": 0.002918243408203125, "learning_rate": 5.349922447908444e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 6128 }, { "completion_length": 430.05359268188477, "epoch": 0.47785161654941244, "grad_norm": 0.1280982882424784, "kl": 0.0030536651611328125, "learning_rate": 5.347479398767514e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.04373771324753761, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6130 }, { "completion_length": 419.8705520629883, "epoch": 0.4780075224601953, "grad_norm": 0.04969110237760752, "kl": 0.0025262832641601562, "learning_rate": 5.345036266264055e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 6132 }, { "completion_length": 425.96430587768555, "epoch": 0.47816342837097814, "grad_norm": 0.11461785535733733, "kl": 0.0032491683959960938, "learning_rate": 5.342593050984188e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.7544643320143223, "rewards/format_reward": 1.0, "step": 6134 }, { "completion_length": 425.589298248291, "epoch": 0.47831933428176093, "grad_norm": 0.08783018928257476, "kl": 0.00286102294921875, "learning_rate": 5.340149753514056e-07, "loss": 0.0001, "reward": 1.8906250894069672, "reward_std": 0.055388920940458775, "rewards/accuracy_reward": 0.892857164144516, "rewards/format_reward": 0.9977678656578064, "step": 6136 }, { "completion_length": 437.60939025878906, "epoch": 0.4784752401925438, "grad_norm": 0.0584966914885149, "kl": 0.003307342529296875, "learning_rate": 5.337706374439825e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 0.9977678656578064, "step": 6138 }, { "completion_length": 424.049129486084, "epoch": 0.47863114610332663, "grad_norm": 0.10206362225460339, "kl": 0.0031986236572265625, "learning_rate": 5.335262914347673e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.8348214775323868, "rewards/format_reward": 1.0, "step": 6140 }, { "completion_length": 427.51341247558594, "epoch": 0.4787870520141095, "grad_norm": 0.11466798117294723, "kl": 0.0028676986694335938, "learning_rate": 5.332819373823805e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.06545460689812899, "rewards/accuracy_reward": 0.859375037252903, "rewards/format_reward": 0.9977678656578064, "step": 6142 }, { "completion_length": 414.7276954650879, "epoch": 0.47894295792489233, "grad_norm": 0.06589594843698521, "kl": 0.0026311874389648438, "learning_rate": 5.33037575345444e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8147321864962578, "rewards/format_reward": 1.0, "step": 6144 }, { "completion_length": 422.4352912902832, "epoch": 0.4790988638356752, "grad_norm": 0.0865644402628632, "kl": 0.0024938583374023438, "learning_rate": 5.327932053825818e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.03742426075041294, "rewards/accuracy_reward": 0.8638393059372902, "rewards/format_reward": 1.0, "step": 6146 }, { "completion_length": 436.61832427978516, "epoch": 0.47925476974645803, "grad_norm": 0.06881969155597847, "kl": 0.003063201904296875, "learning_rate": 5.325488275524201e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.0529810655862093, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 6148 }, { "completion_length": 426.6830520629883, "epoch": 0.4794106756572409, "grad_norm": 0.08426039719816153, "kl": 0.0028553009033203125, "learning_rate": 5.323044419135864e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 6150 }, { "completion_length": 411.1294822692871, "epoch": 0.4795665815680237, "grad_norm": 0.08557736312273115, "kl": 0.002758026123046875, "learning_rate": 5.320600485247106e-07, "loss": 0.0001, "reward": 1.9040179401636124, "reward_std": 0.058317420072853565, "rewards/accuracy_reward": 0.9040178954601288, "rewards/format_reward": 1.0, "step": 6152 }, { "completion_length": 423.2366256713867, "epoch": 0.4797224874788065, "grad_norm": 0.09046165693814802, "kl": 0.00263214111328125, "learning_rate": 5.31815647444424e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6154 }, { "completion_length": 417.7544822692871, "epoch": 0.4798783933895894, "grad_norm": 0.08213564870033045, "kl": 0.0025730133056640625, "learning_rate": 5.315712387313601e-07, "loss": 0.0001, "reward": 1.8370536267757416, "reward_std": 0.05493386276066303, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 1.0, "step": 6156 }, { "completion_length": 429.40403747558594, "epoch": 0.4800342993003722, "grad_norm": 0.0632611124918686, "kl": 1081919441731584.0, "learning_rate": 5.313268224441542e-07, "loss": 43210466394112.0, "reward": 1.8169643878936768, "reward_std": 0.053436122834682465, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 0.9977678656578064, "step": 6158 }, { "completion_length": 423.38171768188477, "epoch": 0.4801902052111551, "grad_norm": 0.1026136984965272, "kl": 0.005161285400390625, "learning_rate": 5.310823986414432e-07, "loss": 0.0002, "reward": 1.8794643580913544, "reward_std": 0.06170237623155117, "rewards/accuracy_reward": 0.8794643506407738, "rewards/format_reward": 1.0, "step": 6160 }, { "completion_length": 446.0000228881836, "epoch": 0.4803461111219379, "grad_norm": 0.06649961067195864, "kl": 0.003177642822265625, "learning_rate": 5.30837967381866e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8370536044239998, "rewards/format_reward": 1.0, "step": 6162 }, { "completion_length": 427.8460006713867, "epoch": 0.4805020170327208, "grad_norm": 0.1393783327558245, "kl": 0.0030078887939453125, "learning_rate": 5.305935287240631e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.08296420983970165, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 6164 }, { "completion_length": 432.45983505249023, "epoch": 0.48065792294350357, "grad_norm": 0.08965332786380929, "kl": 0.003078460693359375, "learning_rate": 5.303490827266772e-07, "loss": 0.0001, "reward": 1.7008929550647736, "reward_std": 0.05929311737418175, "rewards/accuracy_reward": 0.7008928805589676, "rewards/format_reward": 1.0, "step": 6166 }, { "completion_length": 429.2120704650879, "epoch": 0.4808138288542864, "grad_norm": 0.12166105289702943, "kl": 0.0028896331787109375, "learning_rate": 5.301046294483521e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.07176806125789881, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 0.9977678656578064, "step": 6168 }, { "completion_length": 435.89064025878906, "epoch": 0.48096973476506927, "grad_norm": 0.1145376756932647, "kl": 0.0033969879150390625, "learning_rate": 5.29860168947734e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.045065999031066895, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 0.9977678656578064, "step": 6170 }, { "completion_length": 414.0669822692871, "epoch": 0.4811256406758521, "grad_norm": 0.05622670453657312, "kl": 0.0027027130126953125, "learning_rate": 5.296157012834703e-07, "loss": 0.0001, "reward": 1.8839286416769028, "reward_std": 0.03937705885618925, "rewards/accuracy_reward": 0.883928619325161, "rewards/format_reward": 1.0, "step": 6172 }, { "completion_length": 422.59599685668945, "epoch": 0.48128154658663497, "grad_norm": 0.07920915652517303, "kl": 0.0033931732177734375, "learning_rate": 5.293712265142105e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.03366979490965605, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 6174 }, { "completion_length": 425.4285888671875, "epoch": 0.4814374524974178, "grad_norm": 0.09080325791230971, "kl": 0.0034189224243164062, "learning_rate": 5.291267446986054e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.03352006711065769, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 6176 }, { "completion_length": 430.1361846923828, "epoch": 0.48159335840820067, "grad_norm": 0.09408884608850727, "kl": 0.003082275390625, "learning_rate": 5.288822558953082e-07, "loss": 0.0001, "reward": 1.7633929550647736, "reward_std": 0.062155201099812984, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 6178 }, { "completion_length": 429.8460006713867, "epoch": 0.4817492643189835, "grad_norm": 0.06757247699506953, "kl": 0.002666473388671875, "learning_rate": 5.286377601629731e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 6180 }, { "completion_length": 421.30358505249023, "epoch": 0.4819051702297663, "grad_norm": 0.07513418765015797, "kl": 0.0029344558715820312, "learning_rate": 5.283932575602562e-07, "loss": 0.0001, "reward": 1.8593750447034836, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.8593750223517418, "rewards/format_reward": 1.0, "step": 6182 }, { "completion_length": 431.870548248291, "epoch": 0.48206107614054916, "grad_norm": 0.11178936132529527, "kl": 0.0031909942626953125, "learning_rate": 5.281487481458151e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 6184 }, { "completion_length": 413.7790336608887, "epoch": 0.482216982051332, "grad_norm": 0.055290119000390825, "kl": 0.00226593017578125, "learning_rate": 5.279042319783093e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.030135109089314938, "rewards/accuracy_reward": 0.814732164144516, "rewards/format_reward": 1.0, "step": 6186 }, { "completion_length": 433.6250190734863, "epoch": 0.48237288796211486, "grad_norm": 0.10449414384545125, "kl": 0.0032482147216796875, "learning_rate": 5.276597091164e-07, "loss": 0.0001, "reward": 1.7589286416769028, "reward_std": 0.06981526128947735, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward": 0.9977678656578064, "step": 6188 }, { "completion_length": 423.2210006713867, "epoch": 0.4825287938728977, "grad_norm": 0.08732994044040482, "kl": 0.002910614013671875, "learning_rate": 5.274151796187493e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.04862041026353836, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 1.0, "step": 6190 }, { "completion_length": 407.62278747558594, "epoch": 0.48268469978368056, "grad_norm": 0.05066659550264544, "kl": 0.0022611618041992188, "learning_rate": 5.27170643544022e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8638392984867096, "rewards/format_reward": 1.0, "step": 6192 }, { "completion_length": 431.89734268188477, "epoch": 0.4828406056944634, "grad_norm": 0.07453234757933838, "kl": 0.003284454345703125, "learning_rate": 5.269261009508833e-07, "loss": 0.0001, "reward": 1.8526786416769028, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8526786118745804, "rewards/format_reward": 1.0, "step": 6194 }, { "completion_length": 423.6919860839844, "epoch": 0.48299651160524626, "grad_norm": 0.0840352608760704, "kl": 0.002780914306640625, "learning_rate": 5.266815518980009e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.054932462982833385, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 6196 }, { "completion_length": 428.77457427978516, "epoch": 0.48315241751602905, "grad_norm": 0.0389811040755607, "kl": 0.0032863616943359375, "learning_rate": 5.264369964440434e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.04569051321595907, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 6198 }, { "completion_length": 428.20760345458984, "epoch": 0.4833083234268119, "grad_norm": 0.1246812958830152, "kl": 0.0033206939697265625, "learning_rate": 5.261924346476814e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.07048926781862974, "rewards/accuracy_reward": 0.823660746216774, "rewards/format_reward": 1.0, "step": 6200 }, { "completion_length": 424.4643020629883, "epoch": 0.48346422933759475, "grad_norm": 0.07292067307878185, "kl": 0.00299072265625, "learning_rate": 5.25947866567587e-07, "loss": 0.0001, "reward": 1.8593750596046448, "reward_std": 0.05395676475018263, "rewards/accuracy_reward": 0.859375037252903, "rewards/format_reward": 1.0, "step": 6202 }, { "completion_length": 419.45760345458984, "epoch": 0.4836201352483776, "grad_norm": 0.12303797082487947, "kl": 0.002986907958984375, "learning_rate": 5.257032922624333e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.08634776528924704, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 6204 }, { "completion_length": 422.69644927978516, "epoch": 0.48377604115916045, "grad_norm": 0.0778142452331877, "kl": 0.00362396240234375, "learning_rate": 5.254587117908956e-07, "loss": 0.0001, "reward": 1.8973214775323868, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8973214626312256, "rewards/format_reward": 1.0, "step": 6206 }, { "completion_length": 429.8325996398926, "epoch": 0.4839319470699433, "grad_norm": 0.07278948775215027, "kl": 0.00336456298828125, "learning_rate": 5.252141252116499e-07, "loss": 0.0001, "reward": 1.93526791036129, "reward_std": 0.022845957428216934, "rewards/accuracy_reward": 0.9352678880095482, "rewards/format_reward": 1.0, "step": 6208 }, { "completion_length": 439.3527030944824, "epoch": 0.48408785298072615, "grad_norm": 0.11394073594873257, "kl": 0.0033407211303710938, "learning_rate": 5.249695325833744e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.03742566239088774, "rewards/accuracy_reward": 0.8683035969734192, "rewards/format_reward": 0.9977678656578064, "step": 6210 }, { "completion_length": 435.26341247558594, "epoch": 0.484243758891509, "grad_norm": 0.0799390262738771, "kl": 0.003955841064453125, "learning_rate": 5.247249339647483e-07, "loss": 0.0002, "reward": 1.837053656578064, "reward_std": 0.06658367067575455, "rewards/accuracy_reward": 0.8392857611179352, "rewards/format_reward": 0.9977678656578064, "step": 6212 }, { "completion_length": 418.22769927978516, "epoch": 0.4843996648022918, "grad_norm": 0.07588046697183999, "kl": 0.0022077560424804688, "learning_rate": 5.244803294144525e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.8861607611179352, "rewards/format_reward": 1.0, "step": 6214 }, { "completion_length": 415.6071586608887, "epoch": 0.48455557071307465, "grad_norm": 0.11752799658652947, "kl": 0.0029249191284179688, "learning_rate": 5.242357189911691e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.0916707506403327, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 0.9977678656578064, "step": 6216 }, { "completion_length": 426.69644927978516, "epoch": 0.4847114766238575, "grad_norm": 0.04306188683890003, "kl": 0.0028076171875, "learning_rate": 5.239911027535816e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 6218 }, { "completion_length": 423.5491256713867, "epoch": 0.48486738253464035, "grad_norm": 0.08604213717342278, "kl": 0.0030469894409179688, "learning_rate": 5.237464807603752e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.059294519014656544, "rewards/accuracy_reward": 0.7388393059372902, "rewards/format_reward": 1.0, "step": 6220 }, { "completion_length": 424.0848388671875, "epoch": 0.4850232884454232, "grad_norm": 0.08379053171327934, "kl": 0.0030527114868164062, "learning_rate": 5.235018530702362e-07, "loss": 0.0001, "reward": 1.758928656578064, "reward_std": 0.04764331039041281, "rewards/accuracy_reward": 0.7589286118745804, "rewards/format_reward": 1.0, "step": 6222 }, { "completion_length": 425.48439025878906, "epoch": 0.48517919435620604, "grad_norm": 0.06986564695034482, "kl": 0.002899169921875, "learning_rate": 5.232572197418525e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.039983248338103294, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 6224 }, { "completion_length": 423.51341247558594, "epoch": 0.4853351002669889, "grad_norm": 0.08144488749240124, "kl": 0.003086090087890625, "learning_rate": 5.230125808339127e-07, "loss": 0.0001, "reward": 1.8325893878936768, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 6226 }, { "completion_length": 426.28796005249023, "epoch": 0.4854910061777717, "grad_norm": 0.11572798750928068, "kl": 0.0029878616333007812, "learning_rate": 5.227679364051078e-07, "loss": 0.0001, "reward": 1.7946429252624512, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 6228 }, { "completion_length": 435.1071586608887, "epoch": 0.48564691208855454, "grad_norm": 0.1474614852574851, "kl": 0.0031118392944335938, "learning_rate": 5.225232865141294e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.05929311644285917, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 6230 }, { "completion_length": 424.2187728881836, "epoch": 0.4858028179993374, "grad_norm": 0.07980350048893035, "kl": 0.0032176971435546875, "learning_rate": 5.222786312196704e-07, "loss": 0.0001, "reward": 1.837053656578064, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 1.0, "step": 6232 }, { "completion_length": 428.10046768188477, "epoch": 0.48595872391012024, "grad_norm": 0.04952780509096699, "kl": 0.0030002593994140625, "learning_rate": 5.220339705804253e-07, "loss": 0.0001, "reward": 1.9308036118745804, "reward_std": 0.01750820130109787, "rewards/accuracy_reward": 0.9308035969734192, "rewards/format_reward": 1.0, "step": 6234 }, { "completion_length": 423.7366256713867, "epoch": 0.4861146298209031, "grad_norm": 0.08394137197069547, "kl": 0.0029354095458984375, "learning_rate": 5.217893046550898e-07, "loss": 0.0001, "reward": 1.9196429252624512, "reward_std": 0.046146972104907036, "rewards/accuracy_reward": 0.9196428954601288, "rewards/format_reward": 1.0, "step": 6236 }, { "completion_length": 426.2366256713867, "epoch": 0.48627053573168594, "grad_norm": 0.10010490674093031, "kl": 0.0027332305908203125, "learning_rate": 5.215446335023605e-07, "loss": 0.0001, "reward": 1.9062500447034836, "reward_std": 0.0387044558301568, "rewards/accuracy_reward": 0.9062500298023224, "rewards/format_reward": 1.0, "step": 6238 }, { "completion_length": 406.50447845458984, "epoch": 0.4864264416424688, "grad_norm": 0.07109491136244807, "kl": 0.0023565292358398438, "learning_rate": 5.212999571809362e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8504464477300644, "rewards/format_reward": 1.0, "step": 6240 }, { "completion_length": 419.1049346923828, "epoch": 0.48658234755325164, "grad_norm": 0.09470907661033302, "kl": 0.003040313720703125, "learning_rate": 5.210552757495158e-07, "loss": 0.0001, "reward": 1.6741072535514832, "reward_std": 0.06170237809419632, "rewards/accuracy_reward": 0.674107164144516, "rewards/format_reward": 1.0, "step": 6242 }, { "completion_length": 429.90849685668945, "epoch": 0.48673825346403443, "grad_norm": 0.07166444762415781, "kl": 0.0031948089599609375, "learning_rate": 5.208105892668002e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.05816405266523361, "rewards/accuracy_reward": 0.796875037252903, "rewards/format_reward": 1.0, "step": 6244 }, { "completion_length": 426.63394927978516, "epoch": 0.4868941593748173, "grad_norm": 0.10129708771609046, "kl": 0.003070831298828125, "learning_rate": 5.20565897791491e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.8169643431901932, "rewards/format_reward": 1.0, "step": 6246 }, { "completion_length": 427.4352836608887, "epoch": 0.48705006528560013, "grad_norm": 0.043035939238292534, "kl": 0.0031871795654296875, "learning_rate": 5.203212013822914e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 6248 }, { "completion_length": 420.5000228881836, "epoch": 0.487205971196383, "grad_norm": 0.08774298747017815, "kl": 0.003204345703125, "learning_rate": 5.200765000979057e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.04404080752283335, "rewards/accuracy_reward": 0.7790179029107094, "rewards/format_reward": 1.0, "step": 6250 }, { "completion_length": 425.1964454650879, "epoch": 0.48736187710716583, "grad_norm": 0.0032798050781216496, "kl": 0.002933502197265625, "learning_rate": 5.19831793997039e-07, "loss": 0.0001, "reward": 1.7410715222358704, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7410714626312256, "rewards/format_reward": 1.0, "step": 6252 }, { "completion_length": 416.4486770629883, "epoch": 0.4875177830179487, "grad_norm": 0.09310614371817347, "kl": 0.0032634735107421875, "learning_rate": 5.195870831383985e-07, "loss": 0.0001, "reward": 1.8772321939468384, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 6254 }, { "completion_length": 435.49109268188477, "epoch": 0.48767368892873153, "grad_norm": 0.08569346485542438, "kl": 0.0028810501098632812, "learning_rate": 5.193423675806913e-07, "loss": 0.0001, "reward": 1.7477679550647736, "reward_std": 0.04501790925860405, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 6256 }, { "completion_length": 421.2522506713867, "epoch": 0.4878295948395144, "grad_norm": 0.07706879613586026, "kl": 0.0025663375854492188, "learning_rate": 5.190976473826262e-07, "loss": 0.0001, "reward": 1.8705357760190964, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.8705357611179352, "rewards/format_reward": 1.0, "step": 6258 }, { "completion_length": 420.6205520629883, "epoch": 0.4879855007502972, "grad_norm": 0.05738790940870286, "kl": 0.0030364990234375, "learning_rate": 5.188529226029135e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.038030450232326984, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 6260 }, { "completion_length": 427.03349685668945, "epoch": 0.48814140666108, "grad_norm": 0.049590762304846894, "kl": 0.002933502197265625, "learning_rate": 5.186081933002641e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 6262 }, { "completion_length": 408.471004486084, "epoch": 0.4882973125718629, "grad_norm": 0.11136180070409202, "kl": 0.0026340484619140625, "learning_rate": 5.183634595333902e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.8571428805589676, "rewards/format_reward": 1.0, "step": 6264 }, { "completion_length": 406.6785888671875, "epoch": 0.4884532184826457, "grad_norm": 0.09101059213589856, "kl": 0.002574920654296875, "learning_rate": 5.18118721361005e-07, "loss": 0.0001, "reward": 1.8683036118745804, "reward_std": 0.04260864853858948, "rewards/accuracy_reward": 0.8683035969734192, "rewards/format_reward": 1.0, "step": 6266 }, { "completion_length": 421.3013572692871, "epoch": 0.4886091243934286, "grad_norm": 0.07581962968434361, "kl": 0.00292205810546875, "learning_rate": 5.178739788418225e-07, "loss": 0.0001, "reward": 1.799107238650322, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 6268 }, { "completion_length": 423.5178756713867, "epoch": 0.4887650303042114, "grad_norm": 0.06256562339138895, "kl": 0.0027256011962890625, "learning_rate": 5.176292320345582e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 6270 }, { "completion_length": 425.0468978881836, "epoch": 0.4889209362149943, "grad_norm": 0.10098910719113215, "kl": 0.0031490325927734375, "learning_rate": 5.173844809979284e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.0699686286970973, "rewards/accuracy_reward": 0.7723214589059353, "rewards/format_reward": 1.0, "step": 6272 }, { "completion_length": 416.6160888671875, "epoch": 0.48907684212577707, "grad_norm": 0.05615556621061187, "kl": 0.002460479736328125, "learning_rate": 5.171397257906505e-07, "loss": 0.0001, "reward": 1.8370536267757416, "reward_std": 0.05087490193545818, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 1.0, "step": 6274 }, { "completion_length": 416.71207427978516, "epoch": 0.4892327480365599, "grad_norm": 0.12111969522906424, "kl": 0.00388336181640625, "learning_rate": 5.168949664714426e-07, "loss": 0.0002, "reward": 1.8348214775323868, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 6276 }, { "completion_length": 438.1696662902832, "epoch": 0.48938865394734277, "grad_norm": 0.11836840187776781, "kl": 0.003368377685546875, "learning_rate": 5.166502030990242e-07, "loss": 0.0001, "reward": 1.7678572237491608, "reward_std": 0.0825955355539918, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 0.9977678656578064, "step": 6278 }, { "completion_length": 430.7388572692871, "epoch": 0.4895445598581256, "grad_norm": 0.054089047259923737, "kl": 0.0028476715087890625, "learning_rate": 5.164054357321157e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.01750820130109787, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 6280 }, { "completion_length": 409.1495666503906, "epoch": 0.48970046576890847, "grad_norm": 0.10611537536059588, "kl": 0.004214286804199219, "learning_rate": 5.161606644294381e-07, "loss": 0.0002, "reward": 1.7834822237491608, "reward_std": 0.05538892187178135, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 6282 }, { "completion_length": 408.1942138671875, "epoch": 0.4898563716796913, "grad_norm": 0.07393665737141249, "kl": 0.0026311874389648438, "learning_rate": 5.159158892497139e-07, "loss": 0.0001, "reward": 1.8794643431901932, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8794643208384514, "rewards/format_reward": 1.0, "step": 6284 }, { "completion_length": 418.3616256713867, "epoch": 0.49001227759047417, "grad_norm": 0.05515983751660208, "kl": 0.0026006698608398438, "learning_rate": 5.156711102516663e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.02720661275088787, "rewards/accuracy_reward": 0.8325893059372902, "rewards/format_reward": 1.0, "step": 6286 }, { "completion_length": 416.81474685668945, "epoch": 0.490168183501257, "grad_norm": 0.13195576844094775, "kl": 0.0032205581665039062, "learning_rate": 5.154263274940187e-07, "loss": 0.0001, "reward": 1.7611607760190964, "reward_std": 0.07515161670744419, "rewards/accuracy_reward": 0.7611607611179352, "rewards/format_reward": 1.0, "step": 6288 }, { "completion_length": 429.392879486084, "epoch": 0.4903240894120398, "grad_norm": 0.0032964879898795185, "kl": 0.002651214599609375, "learning_rate": 5.151815410354967e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.022845957428216934, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 6290 }, { "completion_length": 426.7232322692871, "epoch": 0.49047999532282266, "grad_norm": 0.07990622110897867, "kl": 0.0028486251831054688, "learning_rate": 5.14936750934826e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.03870445489883423, "rewards/accuracy_reward": 0.8392857536673546, "rewards/format_reward": 1.0, "step": 6292 }, { "completion_length": 424.4933204650879, "epoch": 0.4906359012336055, "grad_norm": 0.08459383626170781, "kl": 0.0027980804443359375, "learning_rate": 5.14691957250733e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.024124750867486, "rewards/accuracy_reward": 0.8593750223517418, "rewards/format_reward": 1.0, "step": 6294 }, { "completion_length": 436.5468940734863, "epoch": 0.49079180714438836, "grad_norm": 0.1127044540000716, "kl": 0.003475189208984375, "learning_rate": 5.144471600419456e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.0490754684433341, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 6296 }, { "completion_length": 429.77234268188477, "epoch": 0.4909477130551712, "grad_norm": 0.12293618313472394, "kl": 0.0029439926147460938, "learning_rate": 5.14202359367192e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.07289852853864431, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 6298 }, { "completion_length": 432.964298248291, "epoch": 0.49110361896595406, "grad_norm": 0.050863524004744726, "kl": 0.0030994415283203125, "learning_rate": 5.139575552852018e-07, "loss": 0.0001, "reward": 1.9040178805589676, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.9040178768336773, "rewards/format_reward": 1.0, "step": 6300 }, { "completion_length": 413.25894927978516, "epoch": 0.4912595248767369, "grad_norm": 0.07034120177778254, "kl": 0.0028047561645507812, "learning_rate": 5.137127478547045e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.04230555426329374, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 6302 }, { "completion_length": 412.9665336608887, "epoch": 0.49141543078751976, "grad_norm": 0.1054773481499564, "kl": 0.00283050537109375, "learning_rate": 5.134679371344315e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.8616071864962578, "rewards/format_reward": 1.0, "step": 6304 }, { "completion_length": 435.46653747558594, "epoch": 0.49157133669830255, "grad_norm": 0.06518424931025332, "kl": 0.0032243728637695312, "learning_rate": 5.132231231831144e-07, "loss": 0.0001, "reward": 1.8839286267757416, "reward_std": 0.01555540319532156, "rewards/accuracy_reward": 0.8839285969734192, "rewards/format_reward": 1.0, "step": 6306 }, { "completion_length": 426.2611846923828, "epoch": 0.4917272426090854, "grad_norm": 0.07130614582220096, "kl": 0.0031061172485351562, "learning_rate": 5.129783060594851e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8169643059372902, "rewards/format_reward": 1.0, "step": 6308 }, { "completion_length": 434.6763610839844, "epoch": 0.49188314851986825, "grad_norm": 0.1063227105832042, "kl": 0.0031280517578125, "learning_rate": 5.127334858222774e-07, "loss": 0.0001, "reward": 1.7031251043081284, "reward_std": 0.06057331059128046, "rewards/accuracy_reward": 0.7031250260770321, "rewards/format_reward": 1.0, "step": 6310 }, { "completion_length": 428.111629486084, "epoch": 0.4920390544306511, "grad_norm": 0.08260802002868424, "kl": 0.0029010772705078125, "learning_rate": 5.124886625302251e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.8258928805589676, "rewards/format_reward": 1.0, "step": 6312 }, { "completion_length": 417.3058204650879, "epoch": 0.49219496034143395, "grad_norm": 0.1236113628047371, "kl": 0.0029201507568359375, "learning_rate": 5.122438362420625e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.8437500298023224, "rewards/format_reward": 1.0, "step": 6314 }, { "completion_length": 418.3236846923828, "epoch": 0.4923508662522168, "grad_norm": 0.07102271057521999, "kl": 0.002864837646484375, "learning_rate": 5.119990070165255e-07, "loss": 0.0001, "reward": 1.7812500596046448, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 6316 }, { "completion_length": 428.4419860839844, "epoch": 0.49250677216299965, "grad_norm": 0.08141894920827011, "kl": 0.0028781890869140625, "learning_rate": 5.1175417491235e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.06395826768130064, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 6318 }, { "completion_length": 412.0848388671875, "epoch": 0.49266267807378245, "grad_norm": 0.09023502795530829, "kl": 0.0027313232421875, "learning_rate": 5.115093399882727e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.06658367160707712, "rewards/accuracy_reward": 0.841517873108387, "rewards/format_reward": 1.0, "step": 6320 }, { "completion_length": 415.799129486084, "epoch": 0.4928185839845653, "grad_norm": 0.100368462396504, "kl": 0.0030345916748046875, "learning_rate": 5.11264502303031e-07, "loss": 0.0001, "reward": 1.71428582072258, "reward_std": 0.055909561924636364, "rewards/accuracy_reward": 0.714285746216774, "rewards/format_reward": 1.0, "step": 6322 }, { "completion_length": 419.05135345458984, "epoch": 0.49297448989534814, "grad_norm": 0.0775994613539162, "kl": 0.0028247833251953125, "learning_rate": 5.110196619153633e-07, "loss": 0.0001, "reward": 1.8169643878936768, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 6324 }, { "completion_length": 424.9576072692871, "epoch": 0.493130395806131, "grad_norm": 0.07298244914112449, "kl": 0.0031070709228515625, "learning_rate": 5.107748188840082e-07, "loss": 0.0001, "reward": 1.8035714775323868, "reward_std": 0.06853647157549858, "rewards/accuracy_reward": 0.8035714477300644, "rewards/format_reward": 1.0, "step": 6326 }, { "completion_length": 434.2098388671875, "epoch": 0.49328630171691384, "grad_norm": 0.1232556535743958, "kl": 0.002796173095703125, "learning_rate": 5.105299732677052e-07, "loss": 0.0001, "reward": 1.8392857909202576, "reward_std": 0.055909561924636364, "rewards/accuracy_reward": 0.8392857387661934, "rewards/format_reward": 1.0, "step": 6328 }, { "completion_length": 415.6473388671875, "epoch": 0.4934422076276967, "grad_norm": 0.06441247748146359, "kl": 0.0023403167724609375, "learning_rate": 5.102851251251943e-07, "loss": 0.0001, "reward": 1.8214286714792252, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8214285969734192, "rewards/format_reward": 1.0, "step": 6330 }, { "completion_length": 411.56028747558594, "epoch": 0.49359811353847954, "grad_norm": 0.05177765211635205, "kl": 0.0031719207763671875, "learning_rate": 5.10040274515216e-07, "loss": 0.0001, "reward": 1.738839328289032, "reward_std": 0.06493396870791912, "rewards/accuracy_reward": 0.7388393208384514, "rewards/format_reward": 1.0, "step": 6332 }, { "completion_length": 421.3750190734863, "epoch": 0.4937540194492624, "grad_norm": 0.08707955410223941, "kl": 0.0034847259521484375, "learning_rate": 5.097954214965116e-07, "loss": 0.0001, "reward": 1.752232238650322, "reward_std": 0.04959610756486654, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 6334 }, { "completion_length": 422.70314025878906, "epoch": 0.4939099253600452, "grad_norm": 0.06874878503400181, "kl": 0.0030193328857421875, "learning_rate": 5.095505661278231e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.048249500803649426, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 6336 }, { "completion_length": 426.43528747558594, "epoch": 0.49406583127082804, "grad_norm": 0.11270827995786371, "kl": 0.0031528472900390625, "learning_rate": 5.093057084678925e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.048099772073328495, "rewards/accuracy_reward": 0.8459821715950966, "rewards/format_reward": 1.0, "step": 6338 }, { "completion_length": 422.8482360839844, "epoch": 0.4942217371816109, "grad_norm": 0.07508315284423506, "kl": 0.0031871795654296875, "learning_rate": 5.090608485754632e-07, "loss": 0.0001, "reward": 1.9017857909202576, "reward_std": 0.04065585136413574, "rewards/accuracy_reward": 0.9017857536673546, "rewards/format_reward": 1.0, "step": 6340 }, { "completion_length": 418.10046005249023, "epoch": 0.49437764309239374, "grad_norm": 0.11864818815119288, "kl": 0.0029296875, "learning_rate": 5.08815986509278e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.0728823496028781, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 0.9977678656578064, "step": 6342 }, { "completion_length": 421.4375228881836, "epoch": 0.4945335490031766, "grad_norm": 0.1332163348152982, "kl": 0.0031871795654296875, "learning_rate": 5.085711223280815e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.055388920940458775, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 6344 }, { "completion_length": 426.09822845458984, "epoch": 0.49468945491395944, "grad_norm": 0.06279506236924144, "kl": 0.00308990478515625, "learning_rate": 5.083262560906179e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 6346 }, { "completion_length": 430.1763610839844, "epoch": 0.4948453608247423, "grad_norm": 0.0028873428825977924, "kl": 0.002765655517578125, "learning_rate": 5.080813878556323e-07, "loss": 0.0001, "reward": 1.814732238650322, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 6348 }, { "completion_length": 419.5312728881836, "epoch": 0.49500126673552514, "grad_norm": 0.08617677381587856, "kl": 0.002819061279296875, "learning_rate": 5.078365176818699e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 0.9977678656578064, "step": 6350 }, { "completion_length": 418.4710006713867, "epoch": 0.49515717264630793, "grad_norm": 0.07176854553636605, "kl": 0.002838134765625, "learning_rate": 5.07591645628077e-07, "loss": 0.0001, "reward": 1.7522322088479996, "reward_std": 0.056667717173695564, "rewards/accuracy_reward": 0.7522321939468384, "rewards/format_reward": 1.0, "step": 6352 }, { "completion_length": 421.0692138671875, "epoch": 0.4953130785570908, "grad_norm": 0.0861225018663771, "kl": 0.0028133392333984375, "learning_rate": 5.073467717529995e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 6354 }, { "completion_length": 416.0156440734863, "epoch": 0.49546898446787363, "grad_norm": 0.08429043221976498, "kl": 0.002758026123046875, "learning_rate": 5.071018961153846e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.046146972104907036, "rewards/accuracy_reward": 0.8727679029107094, "rewards/format_reward": 0.9977678656578064, "step": 6356 }, { "completion_length": 408.2076072692871, "epoch": 0.4956248903786565, "grad_norm": 0.0785117945662403, "kl": 0.0031528472900390625, "learning_rate": 5.068570187739795e-07, "loss": 0.0001, "reward": 1.743303656578064, "reward_std": 0.03968015220016241, "rewards/accuracy_reward": 0.7433036044239998, "rewards/format_reward": 1.0, "step": 6358 }, { "completion_length": 428.21876525878906, "epoch": 0.49578079628943933, "grad_norm": 0.06319811563155767, "kl": 0.0031557083129882812, "learning_rate": 5.066121397875317e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.048922101967036724, "rewards/accuracy_reward": 0.7991071715950966, "rewards/format_reward": 1.0, "step": 6360 }, { "completion_length": 416.57144927978516, "epoch": 0.4959367022002222, "grad_norm": 0.06829129439600345, "kl": 0.0026988983154296875, "learning_rate": 5.063672592147893e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.04697070736438036, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 6362 }, { "completion_length": 425.6071548461914, "epoch": 0.49609260811100503, "grad_norm": 0.10028181195321993, "kl": 0.0034809112548828125, "learning_rate": 5.061223771145004e-07, "loss": 0.0001, "reward": 1.7656250894069672, "reward_std": 0.06124591641128063, "rewards/accuracy_reward": 0.7656250223517418, "rewards/format_reward": 1.0, "step": 6364 }, { "completion_length": 423.6763610839844, "epoch": 0.4962485140217879, "grad_norm": 0.11019226515296873, "kl": 0.0029621124267578125, "learning_rate": 5.058774935454145e-07, "loss": 0.0001, "reward": 1.6852679252624512, "reward_std": 0.05929452180862427, "rewards/accuracy_reward": 0.6852678954601288, "rewards/format_reward": 1.0, "step": 6366 }, { "completion_length": 417.32144927978516, "epoch": 0.4964044199325707, "grad_norm": 0.12700060901481758, "kl": 0.0028076171875, "learning_rate": 5.0563260856628e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.05734171997755766, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 1.0, "step": 6368 }, { "completion_length": 424.3013572692871, "epoch": 0.4965603258433535, "grad_norm": 0.10430710823200094, "kl": 0.0035915374755859375, "learning_rate": 5.053877222358468e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.05200396664440632, "rewards/accuracy_reward": 0.8660714775323868, "rewards/format_reward": 1.0, "step": 6370 }, { "completion_length": 409.38171768188477, "epoch": 0.4967162317541364, "grad_norm": 0.08603573619555412, "kl": 0.0025234222412109375, "learning_rate": 5.051428346128645e-07, "loss": 0.0001, "reward": 1.8861607909202576, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8861607611179352, "rewards/format_reward": 1.0, "step": 6372 }, { "completion_length": 421.16295623779297, "epoch": 0.4968721376649192, "grad_norm": 0.07085800473310114, "kl": 0.0029048919677734375, "learning_rate": 5.048979457560835e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.05927974823862314, "rewards/accuracy_reward": 0.792410746216774, "rewards/format_reward": 0.9955357313156128, "step": 6374 }, { "completion_length": 417.3437690734863, "epoch": 0.49702804357570207, "grad_norm": 0.07043769033670592, "kl": 0.0020761489868164062, "learning_rate": 5.046530557242538e-07, "loss": 0.0001, "reward": 1.8571429401636124, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8571428954601288, "rewards/format_reward": 1.0, "step": 6376 }, { "completion_length": 403.9843940734863, "epoch": 0.4971839494864849, "grad_norm": 0.08815659559959396, "kl": 0.0023975372314453125, "learning_rate": 5.044081645761264e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.055692016147077084, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 0.9977678656578064, "step": 6378 }, { "completion_length": 427.02234268188477, "epoch": 0.49733985539726777, "grad_norm": 0.10415450224087114, "kl": 0.003337860107421875, "learning_rate": 5.04163272370452e-07, "loss": 0.0001, "reward": 1.8191965073347092, "reward_std": 0.0649339659139514, "rewards/accuracy_reward": 0.819196455180645, "rewards/format_reward": 1.0, "step": 6380 }, { "completion_length": 421.0044860839844, "epoch": 0.49749576130805057, "grad_norm": 0.09809840265811297, "kl": 0.0029144287109375, "learning_rate": 5.039183791659821e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.060270216315984726, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 6382 }, { "completion_length": 420.22546768188477, "epoch": 0.4976516672188334, "grad_norm": 0.06941605380124724, "kl": 0.0031795501708984375, "learning_rate": 5.036734850214676e-07, "loss": 0.0001, "reward": 1.776785746216774, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 1.0, "step": 6384 }, { "completion_length": 428.6852836608887, "epoch": 0.49780757312961627, "grad_norm": 0.08502911835888845, "kl": 0.00293731689453125, "learning_rate": 5.034285899956608e-07, "loss": 0.0001, "reward": 1.8616072237491608, "reward_std": 0.04471481405198574, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 0.9977678656578064, "step": 6386 }, { "completion_length": 434.22546768188477, "epoch": 0.4979634790403991, "grad_norm": 0.08451989506894396, "kl": 0.003253936767578125, "learning_rate": 5.031836941473135e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.06124731805175543, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 6388 }, { "completion_length": 423.47322845458984, "epoch": 0.49811938495118196, "grad_norm": 0.07174079759547114, "kl": 0.0027008056640625, "learning_rate": 5.029387975351773e-07, "loss": 0.0001, "reward": 1.8370536714792252, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.837053619325161, "rewards/format_reward": 1.0, "step": 6390 }, { "completion_length": 416.8236846923828, "epoch": 0.4982752908619648, "grad_norm": 0.047644702289644646, "kl": 0.002445220947265625, "learning_rate": 5.02693900218005e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 6392 }, { "completion_length": 420.88171768188477, "epoch": 0.49843119677274766, "grad_norm": 0.040886317741737316, "kl": 0.002758026123046875, "learning_rate": 5.024490022545488e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.044714814983308315, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 6394 }, { "completion_length": 420.8370704650879, "epoch": 0.4985871026835305, "grad_norm": 0.0476591772421427, "kl": 0.0027589797973632812, "learning_rate": 5.022041037035611e-07, "loss": 0.0001, "reward": 1.7522322237491608, "reward_std": 0.031413902528584, "rewards/accuracy_reward": 0.7522321790456772, "rewards/format_reward": 1.0, "step": 6396 }, { "completion_length": 409.08930587768555, "epoch": 0.4987430085943133, "grad_norm": 0.1061609367087498, "kl": 0.0027322769165039062, "learning_rate": 5.01959204623795e-07, "loss": 0.0001, "reward": 1.8928572088479996, "reward_std": 0.048922101967036724, "rewards/accuracy_reward": 0.892857164144516, "rewards/format_reward": 1.0, "step": 6398 }, { "completion_length": 430.8839530944824, "epoch": 0.49889891450509616, "grad_norm": 0.0677521447538325, "kl": 0.003147125244140625, "learning_rate": 5.017143050740033e-07, "loss": 0.0001, "reward": 1.821428656578064, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.821428619325161, "rewards/format_reward": 1.0, "step": 6400 }, { "completion_length": 420.12278747558594, "epoch": 0.499054820415879, "grad_norm": 0.10906933587824494, "kl": 0.0030002593994140625, "learning_rate": 5.014694051129386e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.790178619325161, "rewards/format_reward": 1.0, "step": 6402 }, { "completion_length": 410.9464416503906, "epoch": 0.49921072632666186, "grad_norm": 0.0031980236824320804, "kl": 0.0023059844970703125, "learning_rate": 5.012245047993542e-07, "loss": 0.0001, "reward": 1.910714328289032, "reward_std": 0.0, "rewards/accuracy_reward": 0.910714328289032, "rewards/format_reward": 1.0, "step": 6404 }, { "completion_length": 425.3794860839844, "epoch": 0.4993666322374447, "grad_norm": 0.09969597060642824, "kl": 0.0032558441162109375, "learning_rate": 5.009796041920034e-07, "loss": 0.0001, "reward": 1.8058036267757416, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 6406 }, { "completion_length": 414.2276954650879, "epoch": 0.49952253814822756, "grad_norm": 0.07134070043507582, "kl": 0.0028867721557617188, "learning_rate": 5.007347033496393e-07, "loss": 0.0001, "reward": 1.8794643729925156, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 1.0, "step": 6408 }, { "completion_length": 427.7299346923828, "epoch": 0.4996784440590104, "grad_norm": 0.09527418576092983, "kl": 0.0033931732177734375, "learning_rate": 5.004898023310153e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.039833519607782364, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward": 1.0, "step": 6410 }, { "completion_length": 421.1384162902832, "epoch": 0.49983434996979326, "grad_norm": 0.05147643631499491, "kl": 0.0022430419921875, "learning_rate": 5.002449011948843e-07, "loss": 0.0001, "reward": 1.9107143431901932, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.9107143208384514, "rewards/format_reward": 1.0, "step": 6412 }, { "completion_length": 422.89957427978516, "epoch": 0.49999025588057605, "grad_norm": 0.07565864684379713, "kl": 0.002986907958984375, "learning_rate": 5e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6414 }, { "completion_length": 413.66296768188477, "epoch": 0.500146161791359, "grad_norm": 0.0037842909587219044, "kl": 0.0026464462280273438, "learning_rate": 4.997550988051156e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6416 }, { "completion_length": 414.00671005249023, "epoch": 0.5003020677021418, "grad_norm": 0.07038528573870724, "kl": 0.0026226043701171875, "learning_rate": 4.995101976689849e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 6418 }, { "completion_length": 431.3638572692871, "epoch": 0.5004579736129245, "grad_norm": 0.08676948192743322, "kl": 0.0030765533447265625, "learning_rate": 4.992652966503607e-07, "loss": 0.0001, "reward": 1.8950893580913544, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.8950893208384514, "rewards/format_reward": 1.0, "step": 6420 }, { "completion_length": 425.62055587768555, "epoch": 0.5006138795237074, "grad_norm": 0.04830704791705166, "kl": 0.00302886962890625, "learning_rate": 4.990203958079965e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.036751655861735344, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 6422 }, { "completion_length": 431.50671768188477, "epoch": 0.5007697854344902, "grad_norm": 0.08684437399020602, "kl": 0.0034999847412109375, "learning_rate": 4.987754952006458e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.04959610849618912, "rewards/accuracy_reward": 0.7343750335276127, "rewards/format_reward": 1.0, "step": 6424 }, { "completion_length": 419.75894927978516, "epoch": 0.5009256913452731, "grad_norm": 0.06494340551020197, "kl": 0.00267791748046875, "learning_rate": 4.985305948870614e-07, "loss": 0.0001, "reward": 1.9017857760190964, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.9017857536673546, "rewards/format_reward": 1.0, "step": 6426 }, { "completion_length": 433.94198989868164, "epoch": 0.5010815972560559, "grad_norm": 0.11873953313669176, "kl": 0.00341796875, "learning_rate": 4.982856949259968e-07, "loss": 0.0001, "reward": 1.8236607909202576, "reward_std": 0.10708615556359291, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 6428 }, { "completion_length": 433.03572845458984, "epoch": 0.5012375031668388, "grad_norm": 0.09083176457727266, "kl": 0.0033283233642578125, "learning_rate": 4.980407953762049e-07, "loss": 0.0001, "reward": 1.7991072535514832, "reward_std": 0.055909561924636364, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6430 }, { "completion_length": 419.2656440734863, "epoch": 0.5013934090776216, "grad_norm": 0.07203140965962605, "kl": 0.0027341842651367188, "learning_rate": 4.977958962964388e-07, "loss": 0.0001, "reward": 1.8683036267757416, "reward_std": 0.040958947502076626, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 6432 }, { "completion_length": 421.7835006713867, "epoch": 0.5015493149884045, "grad_norm": 0.05616409653954851, "kl": 0.0029315948486328125, "learning_rate": 4.975509977454514e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.034645493142306805, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 6434 }, { "completion_length": 415.0201072692871, "epoch": 0.5017052208991873, "grad_norm": 0.09879394283491706, "kl": 0.0028104782104492188, "learning_rate": 4.97306099781995e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.738839328289032, "rewards/format_reward": 1.0, "step": 6436 }, { "completion_length": 414.5960006713867, "epoch": 0.5018611268099702, "grad_norm": 0.10088931941380838, "kl": 0.0032558441162109375, "learning_rate": 4.970612024648225e-07, "loss": 0.0001, "reward": 1.8392858058214188, "reward_std": 0.05133136175572872, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 6438 }, { "completion_length": 427.00671768188477, "epoch": 0.502017032720753, "grad_norm": 0.08654773454916309, "kl": 0.0033740997314453125, "learning_rate": 4.968163058526866e-07, "loss": 0.0001, "reward": 1.7678572535514832, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.7678571715950966, "rewards/format_reward": 1.0, "step": 6440 }, { "completion_length": 425.6875190734863, "epoch": 0.5021729386315359, "grad_norm": 0.0725757879780398, "kl": 0.00305938720703125, "learning_rate": 4.965714100043391e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.05395676288753748, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 6442 }, { "completion_length": 421.3013572692871, "epoch": 0.5023288445423187, "grad_norm": 0.0531711908396796, "kl": 0.0027074813842773438, "learning_rate": 4.963265149785323e-07, "loss": 0.0001, "reward": 1.926339328289032, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.926339328289032, "rewards/format_reward": 1.0, "step": 6444 }, { "completion_length": 435.9843978881836, "epoch": 0.5024847504531016, "grad_norm": 0.06779076797406017, "kl": 0.0029277801513671875, "learning_rate": 4.960816208340182e-07, "loss": 0.0001, "reward": 1.7879465222358704, "reward_std": 0.041329856030642986, "rewards/accuracy_reward": 0.7879464700818062, "rewards/format_reward": 1.0, "step": 6446 }, { "completion_length": 421.8259086608887, "epoch": 0.5026406563638844, "grad_norm": 0.09459583531411292, "kl": 0.0027742385864257812, "learning_rate": 4.95836727629548e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.05230706185102463, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 6448 }, { "completion_length": 438.7120704650879, "epoch": 0.5027965622746673, "grad_norm": 0.06193858517711551, "kl": 0.0029888153076171875, "learning_rate": 4.955918354238737e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.034495764411985874, "rewards/accuracy_reward": 0.8236607536673546, "rewards/format_reward": 1.0, "step": 6450 }, { "completion_length": 416.30135345458984, "epoch": 0.50295246818545, "grad_norm": 0.04635133640987195, "kl": 0.00281524658203125, "learning_rate": 4.953469442757463e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.039377059787511826, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 6452 }, { "completion_length": 426.06251525878906, "epoch": 0.5031083740962329, "grad_norm": 0.07062375438650284, "kl": 0.002925872802734375, "learning_rate": 4.951020542439166e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.033366698771715164, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 6454 }, { "completion_length": 435.77233505249023, "epoch": 0.5032642800070157, "grad_norm": 0.1105821181971874, "kl": 0.0030918121337890625, "learning_rate": 4.948571653871353e-07, "loss": 0.0001, "reward": 1.899553656578064, "reward_std": 0.06703649181872606, "rewards/accuracy_reward": 0.899553619325161, "rewards/format_reward": 1.0, "step": 6456 }, { "completion_length": 428.25894927978516, "epoch": 0.5034201859177986, "grad_norm": 0.0030431188281947515, "kl": 0.0030803680419921875, "learning_rate": 4.946122777641532e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.03983352053910494, "rewards/accuracy_reward": 0.8459821790456772, "rewards/format_reward": 1.0, "step": 6458 }, { "completion_length": 425.6808204650879, "epoch": 0.5035760918285814, "grad_norm": 0.046848657698437476, "kl": 0.00270843505859375, "learning_rate": 4.9436739143372e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 6460 }, { "completion_length": 426.9285888671875, "epoch": 0.5037319977393643, "grad_norm": 0.0789417008173098, "kl": 0.0027256011962890625, "learning_rate": 4.941225064545856e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.066886767745018, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 0.9977678656578064, "step": 6462 }, { "completion_length": 427.31251525878906, "epoch": 0.5038879036501471, "grad_norm": 0.1341443947349821, "kl": 0.0040950775146484375, "learning_rate": 4.938776228854995e-07, "loss": 0.0002, "reward": 1.7455357909202576, "reward_std": 0.07124742306768894, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 6464 }, { "completion_length": 423.71876525878906, "epoch": 0.50404380956093, "grad_norm": 0.14435470505864184, "kl": 0.0031871795654296875, "learning_rate": 4.936327407852108e-07, "loss": 0.0001, "reward": 1.80803582072258, "reward_std": 0.06688676681369543, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 6466 }, { "completion_length": 431.6942138671875, "epoch": 0.5041997154717128, "grad_norm": 0.05348462639885028, "kl": 0.0028066635131835938, "learning_rate": 4.933878602124684e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8883928954601288, "rewards/format_reward": 1.0, "step": 6468 }, { "completion_length": 433.86162185668945, "epoch": 0.5043556213824957, "grad_norm": 0.003249815970068927, "kl": 0.0030336380004882812, "learning_rate": 4.931429812260205e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.8593750298023224, "rewards/format_reward": 1.0, "step": 6470 }, { "completion_length": 416.5022506713867, "epoch": 0.5045115272932785, "grad_norm": 0.003113613210119674, "kl": 0.0025424957275390625, "learning_rate": 4.928981038846152e-07, "loss": 0.0001, "reward": 1.8325893580913544, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.8325893357396126, "rewards/format_reward": 1.0, "step": 6472 }, { "completion_length": 420.4419860839844, "epoch": 0.5046674332040614, "grad_norm": 0.09326866016856127, "kl": 0.0032596588134765625, "learning_rate": 4.926532282470005e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.05185199901461601, "rewards/accuracy_reward": 0.801339328289032, "rewards/format_reward": 1.0, "step": 6474 }, { "completion_length": 418.81921768188477, "epoch": 0.5048233391148442, "grad_norm": 0.0030959707925205643, "kl": 0.00237274169921875, "learning_rate": 4.924083543719231e-07, "loss": 0.0001, "reward": 1.910714328289032, "reward_std": 0.0, "rewards/accuracy_reward": 0.9107143133878708, "rewards/format_reward": 1.0, "step": 6476 }, { "completion_length": 426.6361770629883, "epoch": 0.5049792450256271, "grad_norm": 0.08818902498236546, "kl": 0.00286102294921875, "learning_rate": 4.921634823181299e-07, "loss": 0.0001, "reward": 1.8415179401636124, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8415179029107094, "rewards/format_reward": 1.0, "step": 6478 }, { "completion_length": 432.3415336608887, "epoch": 0.5051351509364099, "grad_norm": 0.08800046418532405, "kl": 0.0033550262451171875, "learning_rate": 4.919186121443677e-07, "loss": 0.0001, "reward": 1.7053572088479996, "reward_std": 0.07680272217839956, "rewards/accuracy_reward": 0.7075893133878708, "rewards/format_reward": 0.9977678656578064, "step": 6480 }, { "completion_length": 425.86609268188477, "epoch": 0.5052910568471927, "grad_norm": 0.0028781077039575567, "kl": 0.002773284912109375, "learning_rate": 4.916737439093821e-07, "loss": 0.0001, "reward": 1.736607238650322, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.7366071715950966, "rewards/format_reward": 1.0, "step": 6482 }, { "completion_length": 423.33707427978516, "epoch": 0.5054469627579755, "grad_norm": 0.06847090024993693, "kl": 0.0027923583984375, "learning_rate": 4.914288776719184e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 6484 }, { "completion_length": 411.0201072692871, "epoch": 0.5056028686687584, "grad_norm": 0.06767410329966239, "kl": 0.0025262832641601562, "learning_rate": 4.91184013490722e-07, "loss": 0.0001, "reward": 1.8772321790456772, "reward_std": 0.035472865216434, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 6486 }, { "completion_length": 427.16519927978516, "epoch": 0.5057587745795412, "grad_norm": 0.002789532280608272, "kl": 0.0026063919067382812, "learning_rate": 4.909391514245369e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.009241949766874313, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 6488 }, { "completion_length": 425.00447845458984, "epoch": 0.5059146804903241, "grad_norm": 0.07633154781526824, "kl": 0.0026807785034179688, "learning_rate": 4.906942915321074e-07, "loss": 0.0001, "reward": 1.859375074505806, "reward_std": 0.04696930665522814, "rewards/accuracy_reward": 0.8593750447034836, "rewards/format_reward": 1.0, "step": 6490 }, { "completion_length": 415.5625114440918, "epoch": 0.5060705864011069, "grad_norm": 0.09798071632988233, "kl": 0.0026073455810546875, "learning_rate": 4.90449433872177e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.05425985809415579, "rewards/accuracy_reward": 0.7991071864962578, "rewards/format_reward": 1.0, "step": 6492 }, { "completion_length": 419.5960006713867, "epoch": 0.5062264923118898, "grad_norm": 0.08234843696304776, "kl": 0.002956390380859375, "learning_rate": 4.902045785034883e-07, "loss": 0.0001, "reward": 1.85714291036129, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 6494 }, { "completion_length": 432.9933204650879, "epoch": 0.5063823982226726, "grad_norm": 0.0731576829736423, "kl": 0.0027074813842773438, "learning_rate": 4.899597254847841e-07, "loss": 0.0001, "reward": 1.8392857760190964, "reward_std": 0.03352006524801254, "rewards/accuracy_reward": 0.8392857611179352, "rewards/format_reward": 1.0, "step": 6496 }, { "completion_length": 424.0826072692871, "epoch": 0.5065383041334555, "grad_norm": 0.12279240131051944, "kl": 0.0028247833251953125, "learning_rate": 4.897148748748058e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.05666771437972784, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 0.9977678656578064, "step": 6498 }, { "completion_length": 422.71653747558594, "epoch": 0.5066942100442383, "grad_norm": 0.04667536784895682, "kl": 0.0033283233642578125, "learning_rate": 4.894700267322948e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.7968750447034836, "rewards/format_reward": 1.0, "step": 6500 }, { "completion_length": 411.564754486084, "epoch": 0.5068501159550212, "grad_norm": 0.12491979377281798, "kl": 0.0024881362915039062, "learning_rate": 4.892251811159917e-07, "loss": 0.0001, "reward": 1.8147321790456772, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8147321566939354, "rewards/format_reward": 1.0, "step": 6502 }, { "completion_length": 409.83707427978516, "epoch": 0.507006021865804, "grad_norm": 0.04943880827596102, "kl": 0.00252532958984375, "learning_rate": 4.889803380846366e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.01750820130109787, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 6504 }, { "completion_length": 424.8326110839844, "epoch": 0.5071619277765869, "grad_norm": 0.09945286935620087, "kl": 0.002536773681640625, "learning_rate": 4.887354976969689e-07, "loss": 0.0001, "reward": 1.8616071790456772, "reward_std": 0.06222161278128624, "rewards/accuracy_reward": 0.8638392984867096, "rewards/format_reward": 0.9977678656578064, "step": 6506 }, { "completion_length": 431.2009162902832, "epoch": 0.5073178336873697, "grad_norm": 0.06246522396271782, "kl": 0.0027370452880859375, "learning_rate": 4.884906600117274e-07, "loss": 0.0001, "reward": 1.8660715222358704, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 6508 }, { "completion_length": 418.99332427978516, "epoch": 0.5074737395981526, "grad_norm": 0.04316403105405738, "kl": 0.00281524658203125, "learning_rate": 4.882458250876501e-07, "loss": 0.0001, "reward": 1.7700893580913544, "reward_std": 0.03742426075041294, "rewards/accuracy_reward": 0.7700893357396126, "rewards/format_reward": 1.0, "step": 6510 }, { "completion_length": 421.62278747558594, "epoch": 0.5076296455089354, "grad_norm": 0.0746208546981174, "kl": 0.003017425537109375, "learning_rate": 4.880009929834744e-07, "loss": 0.0001, "reward": 1.7187500596046448, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.7187500223517418, "rewards/format_reward": 1.0, "step": 6512 }, { "completion_length": 428.19197845458984, "epoch": 0.5077855514197182, "grad_norm": 0.11594954745485835, "kl": 0.0031642913818359375, "learning_rate": 4.877561637579375e-07, "loss": 0.0001, "reward": 1.8549107760190964, "reward_std": 0.05005116853863001, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 6514 }, { "completion_length": 425.69644927978516, "epoch": 0.507941457330501, "grad_norm": 0.11972840503559057, "kl": 0.0034074783325195312, "learning_rate": 4.87511337469775e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.06658367067575455, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 6516 }, { "completion_length": 427.93528747558594, "epoch": 0.5080973632412839, "grad_norm": 0.08753851823205626, "kl": 0.0027780532836914062, "learning_rate": 4.872665141777225e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.04193604737520218, "rewards/accuracy_reward": 0.8325893059372902, "rewards/format_reward": 1.0, "step": 6518 }, { "completion_length": 420.16743087768555, "epoch": 0.5082532691520667, "grad_norm": 0.07522176495198026, "kl": 0.0033168792724609375, "learning_rate": 4.870216939405149e-07, "loss": 0.0001, "reward": 1.8169643878936768, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.816964328289032, "rewards/format_reward": 1.0, "step": 6520 }, { "completion_length": 431.2857360839844, "epoch": 0.5084091750628495, "grad_norm": 0.11199225661367282, "kl": 0.0032253265380859375, "learning_rate": 4.867768768168857e-07, "loss": 0.0001, "reward": 1.8437501043081284, "reward_std": 0.05959761328995228, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 6522 }, { "completion_length": 423.24108505249023, "epoch": 0.5085650809736324, "grad_norm": 0.0790280890859844, "kl": 0.0028514862060546875, "learning_rate": 4.865320628655685e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.06914125569164753, "rewards/accuracy_reward": 0.7477678954601288, "rewards/format_reward": 1.0, "step": 6524 }, { "completion_length": 413.839298248291, "epoch": 0.5087209868844152, "grad_norm": 0.09061446460024873, "kl": 0.0028181076049804688, "learning_rate": 4.862872521452955e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.05718835536390543, "rewards/accuracy_reward": 0.776785746216774, "rewards/format_reward": 1.0, "step": 6526 }, { "completion_length": 427.24555587768555, "epoch": 0.5088768927951981, "grad_norm": 0.09833948995475868, "kl": 0.0025806427001953125, "learning_rate": 4.860424447147983e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.04373911488801241, "rewards/accuracy_reward": 0.7745536118745804, "rewards/format_reward": 1.0, "step": 6528 }, { "completion_length": 401.1093940734863, "epoch": 0.509032798705981, "grad_norm": 0.12630708377925792, "kl": 0.0025129318237304688, "learning_rate": 4.85797640632808e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.057862360030412674, "rewards/accuracy_reward": 0.8415178805589676, "rewards/format_reward": 1.0, "step": 6530 }, { "completion_length": 415.9308204650879, "epoch": 0.5091887046167638, "grad_norm": 0.10149555165588736, "kl": 0.0029850006103515625, "learning_rate": 4.855528399580545e-07, "loss": 0.0001, "reward": 1.7388393580913544, "reward_std": 0.03547286335378885, "rewards/accuracy_reward": 0.7388393171131611, "rewards/format_reward": 1.0, "step": 6532 }, { "completion_length": 427.20537185668945, "epoch": 0.5093446105275466, "grad_norm": 0.04330052679475009, "kl": 0.0028295516967773438, "learning_rate": 4.85308042749267e-07, "loss": 0.0001, "reward": 1.7500000596046448, "reward_std": 0.04035415779799223, "rewards/accuracy_reward": 0.7500000447034836, "rewards/format_reward": 1.0, "step": 6534 }, { "completion_length": 428.9040298461914, "epoch": 0.5095005164383295, "grad_norm": 0.10153104506624043, "kl": 0.0033206939697265625, "learning_rate": 4.850632490651742e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 6536 }, { "completion_length": 435.6562690734863, "epoch": 0.5096564223491123, "grad_norm": 0.08936159370068328, "kl": 0.0028076171875, "learning_rate": 4.848184589645033e-07, "loss": 0.0001, "reward": 1.781250074505806, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.7812500298023224, "rewards/format_reward": 1.0, "step": 6538 }, { "completion_length": 428.3482360839844, "epoch": 0.5098123282598952, "grad_norm": 0.11162990362570568, "kl": 0.0029458999633789062, "learning_rate": 4.845736725059811e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.060270216315984726, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 6540 }, { "completion_length": 428.5826110839844, "epoch": 0.509968234170678, "grad_norm": 0.08514708581540047, "kl": 0.0031194686889648438, "learning_rate": 4.843288897483339e-07, "loss": 0.0001, "reward": 1.9107143431901932, "reward_std": 0.05200396664440632, "rewards/accuracy_reward": 0.910714328289032, "rewards/format_reward": 1.0, "step": 6542 }, { "completion_length": 414.4330596923828, "epoch": 0.5101241400814608, "grad_norm": 0.1062890115711368, "kl": 0.0033397674560546875, "learning_rate": 4.84084110750286e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.05914115160703659, "rewards/accuracy_reward": 0.747767873108387, "rewards/format_reward": 1.0, "step": 6544 }, { "completion_length": 415.11832427978516, "epoch": 0.5102800459922436, "grad_norm": 0.06072627093730272, "kl": 0.0034732818603515625, "learning_rate": 4.838393355705617e-07, "loss": 0.0001, "reward": 1.8794643729925156, "reward_std": 0.05133136175572872, "rewards/accuracy_reward": 0.879464328289032, "rewards/format_reward": 1.0, "step": 6546 }, { "completion_length": 427.8995780944824, "epoch": 0.5104359519030265, "grad_norm": 0.08252241125977063, "kl": 0.002872467041015625, "learning_rate": 4.835945642678843e-07, "loss": 0.0001, "reward": 1.8526786267757416, "reward_std": 0.03870445489883423, "rewards/accuracy_reward": 0.8526785969734192, "rewards/format_reward": 1.0, "step": 6548 }, { "completion_length": 427.91743087768555, "epoch": 0.5105918578138093, "grad_norm": 0.062155958732974734, "kl": 0.003246307373046875, "learning_rate": 4.833497969009757e-07, "loss": 0.0001, "reward": 1.7678571939468384, "reward_std": 0.04358434770256281, "rewards/accuracy_reward": 0.7678571864962578, "rewards/format_reward": 1.0, "step": 6550 }, { "completion_length": 428.9732322692871, "epoch": 0.5107477637245922, "grad_norm": 0.06578695424574307, "kl": 0.0031595230102539062, "learning_rate": 4.831050335285575e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.05954387877136469, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 0.9977678656578064, "step": 6552 }, { "completion_length": 420.9754638671875, "epoch": 0.510903669635375, "grad_norm": 0.07735535147794985, "kl": 0.0030164718627929688, "learning_rate": 4.828602742093497e-07, "loss": 0.0001, "reward": 1.8571429252624512, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8571428880095482, "rewards/format_reward": 1.0, "step": 6554 }, { "completion_length": 422.080379486084, "epoch": 0.5110595755461579, "grad_norm": 0.08720933226046544, "kl": 0.002872467041015625, "learning_rate": 4.826155190020715e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.05553865246474743, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 1.0, "step": 6556 }, { "completion_length": 426.6919860839844, "epoch": 0.5112154814569407, "grad_norm": 0.06911790031692246, "kl": 0.0030794143676757812, "learning_rate": 4.823707679654419e-07, "loss": 0.0001, "reward": 1.848214328289032, "reward_std": 0.043065110221505165, "rewards/accuracy_reward": 0.850446455180645, "rewards/format_reward": 0.9977678656578064, "step": 6558 }, { "completion_length": 422.96207427978516, "epoch": 0.5113713873677236, "grad_norm": 0.06266677004454797, "kl": 0.0026569366455078125, "learning_rate": 4.821260211581775e-07, "loss": 0.0001, "reward": 1.895089328289032, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.8950893208384514, "rewards/format_reward": 1.0, "step": 6560 }, { "completion_length": 413.3147506713867, "epoch": 0.5115272932785064, "grad_norm": 0.15034227285104157, "kl": 0.0025196075439453125, "learning_rate": 4.818812786389949e-07, "loss": 0.0001, "reward": 1.8325893431901932, "reward_std": 0.0740755945444107, "rewards/accuracy_reward": 0.8348214775323868, "rewards/format_reward": 0.9977678656578064, "step": 6562 }, { "completion_length": 425.32591247558594, "epoch": 0.5116831991892893, "grad_norm": 0.10754177298998463, "kl": 0.0029726028442382812, "learning_rate": 4.816365404666098e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 6564 }, { "completion_length": 428.205379486084, "epoch": 0.5118391051000721, "grad_norm": 0.09062902136022263, "kl": 0.0035610198974609375, "learning_rate": 4.813918066997358e-07, "loss": 0.0001, "reward": 1.7366072088479996, "reward_std": 0.05133136175572872, "rewards/accuracy_reward": 0.7366071790456772, "rewards/format_reward": 1.0, "step": 6566 }, { "completion_length": 418.7455520629883, "epoch": 0.511995011010855, "grad_norm": 0.0979269357053345, "kl": 0.002597808837890625, "learning_rate": 4.811470773970863e-07, "loss": 0.0001, "reward": 1.8928571939468384, "reward_std": 0.048922101967036724, "rewards/accuracy_reward": 0.8928571790456772, "rewards/format_reward": 1.0, "step": 6568 }, { "completion_length": 424.1317138671875, "epoch": 0.5121509169216378, "grad_norm": 0.08241007825367584, "kl": 0.0029659271240234375, "learning_rate": 4.809023526173738e-07, "loss": 0.0001, "reward": 1.9397321939468384, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.9397321790456772, "rewards/format_reward": 1.0, "step": 6570 }, { "completion_length": 419.3727836608887, "epoch": 0.5123068228324207, "grad_norm": 0.08992118073035239, "kl": 0.003002166748046875, "learning_rate": 4.806576324193088e-07, "loss": 0.0001, "reward": 1.8303572088479996, "reward_std": 0.03352006617933512, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 6572 }, { "completion_length": 420.21207427978516, "epoch": 0.5124627287432034, "grad_norm": 0.04505350880554062, "kl": 0.0025339126586914062, "learning_rate": 4.804129168616015e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.028485405258834362, "rewards/accuracy_reward": 0.832589328289032, "rewards/format_reward": 1.0, "step": 6574 }, { "completion_length": 420.9888610839844, "epoch": 0.5126186346539863, "grad_norm": 0.09458634181569121, "kl": 0.0027618408203125, "learning_rate": 4.80168206002961e-07, "loss": 0.0001, "reward": 1.8549108058214188, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.8549107313156128, "rewards/format_reward": 1.0, "step": 6576 }, { "completion_length": 427.6808204650879, "epoch": 0.5127745405647691, "grad_norm": 0.09881448239766427, "kl": 0.0029239654541015625, "learning_rate": 4.799234999020944e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.06756076868623495, "rewards/accuracy_reward": 0.8549107536673546, "rewards/format_reward": 1.0, "step": 6578 }, { "completion_length": 410.7522506713867, "epoch": 0.512930446475552, "grad_norm": 0.04455414605931201, "kl": 0.0025625228881835938, "learning_rate": 4.796787986177085e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.0063134534284472466, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 6580 }, { "completion_length": 432.4576072692871, "epoch": 0.5130863523863348, "grad_norm": 0.05377817673893338, "kl": 0.00385284423828125, "learning_rate": 4.794341022085091e-07, "loss": 0.0002, "reward": 1.7656250894069672, "reward_std": 0.04276201594620943, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 6582 }, { "completion_length": 427.4196662902832, "epoch": 0.5132422582971177, "grad_norm": 0.002663092212252722, "kl": 0.0024356842041015625, "learning_rate": 4.791894107331998e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.870535746216774, "rewards/format_reward": 1.0, "step": 6584 }, { "completion_length": 445.49778747558594, "epoch": 0.5133981642079005, "grad_norm": 0.06591629802251785, "kl": 0.003261566162109375, "learning_rate": 4.789447242504842e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.060573313385248184, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 6586 }, { "completion_length": 428.7701072692871, "epoch": 0.5135540701186834, "grad_norm": 0.059446750749077, "kl": 0.0030384063720703125, "learning_rate": 4.787000428190638e-07, "loss": 0.0001, "reward": 1.7633929699659348, "reward_std": 0.05441322550177574, "rewards/accuracy_reward": 0.7633928805589676, "rewards/format_reward": 1.0, "step": 6588 }, { "completion_length": 427.1250228881836, "epoch": 0.5137099760294662, "grad_norm": 0.07832309713117701, "kl": 0.0030059814453125, "learning_rate": 4.784553664976393e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.8013393357396126, "rewards/format_reward": 1.0, "step": 6590 }, { "completion_length": 428.1808204650879, "epoch": 0.5138658819402491, "grad_norm": 0.045325315605520604, "kl": 0.0030498504638671875, "learning_rate": 4.782106953449103e-07, "loss": 0.0001, "reward": 1.81026791036129, "reward_std": 0.03968015406280756, "rewards/accuracy_reward": 0.8102678954601288, "rewards/format_reward": 1.0, "step": 6592 }, { "completion_length": 430.9018020629883, "epoch": 0.5140217878510319, "grad_norm": 0.041609124479748696, "kl": 0.00323486328125, "learning_rate": 4.779660294195747e-07, "loss": 0.0001, "reward": 1.7433036416769028, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.7433036044239998, "rewards/format_reward": 1.0, "step": 6594 }, { "completion_length": 419.27234268188477, "epoch": 0.5141776937618148, "grad_norm": 0.047924986909192134, "kl": 0.0024042129516601562, "learning_rate": 4.777213687803296e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.024797352962195873, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 6596 }, { "completion_length": 425.6227836608887, "epoch": 0.5143335996725976, "grad_norm": 0.04898091587631506, "kl": 0.0028352737426757812, "learning_rate": 4.774767134858708e-07, "loss": 0.0001, "reward": 1.8147322535514832, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.814732164144516, "rewards/format_reward": 1.0, "step": 6598 }, { "completion_length": 423.4866256713867, "epoch": 0.5144895055833805, "grad_norm": 0.046011882211225666, "kl": 0.0024251937866210938, "learning_rate": 4.772320635948922e-07, "loss": 0.0001, "reward": 1.734375074505806, "reward_std": 0.02540354337543249, "rewards/accuracy_reward": 0.7343750447034836, "rewards/format_reward": 1.0, "step": 6600 }, { "completion_length": 431.41296768188477, "epoch": 0.5146454114941633, "grad_norm": 0.12319523369324051, "kl": 0.0030193328857421875, "learning_rate": 4.769874191660871e-07, "loss": 0.0001, "reward": 1.8772322088479996, "reward_std": 0.05102826748043299, "rewards/accuracy_reward": 0.8772321715950966, "rewards/format_reward": 1.0, "step": 6602 }, { "completion_length": 430.15850830078125, "epoch": 0.5148013174049462, "grad_norm": 0.05638052785974846, "kl": 0.0028896331787109375, "learning_rate": 4.767427802581477e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.037727355025708675, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 6604 }, { "completion_length": 427.9821586608887, "epoch": 0.5149572233157289, "grad_norm": 0.0756998561748031, "kl": 0.0028858184814453125, "learning_rate": 4.7649814692976383e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.036751655861735344, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 6606 }, { "completion_length": 401.4665336608887, "epoch": 0.5151131292265118, "grad_norm": 0.09093843130792259, "kl": 0.005335807800292969, "learning_rate": 4.762535192396247e-07, "loss": 0.0002, "reward": 1.8169643878936768, "reward_std": 0.05035426188260317, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 6608 }, { "completion_length": 418.54466247558594, "epoch": 0.5152690351372946, "grad_norm": 0.11892511571715615, "kl": 0.0031108856201171875, "learning_rate": 4.7600889724641846e-07, "loss": 0.0001, "reward": 1.7901786714792252, "reward_std": 0.07966703921556473, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 6610 }, { "completion_length": 436.85716247558594, "epoch": 0.5154249410480775, "grad_norm": 0.0855123719160727, "kl": 0.003265380859375, "learning_rate": 4.7576428100883094e-07, "loss": 0.0001, "reward": 1.8013393580913544, "reward_std": 0.03156726714223623, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 6612 }, { "completion_length": 417.63841247558594, "epoch": 0.5155808469588603, "grad_norm": 0.06688332292434813, "kl": 0.0026674270629882812, "learning_rate": 4.755196705855476e-07, "loss": 0.0001, "reward": 1.7767857909202576, "reward_std": 0.03336670063436031, "rewards/accuracy_reward": 0.7767857611179352, "rewards/format_reward": 1.0, "step": 6614 }, { "completion_length": 418.76341247558594, "epoch": 0.5157367528696432, "grad_norm": 0.09928870502026413, "kl": 0.0025377273559570312, "learning_rate": 4.7527506603525166e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 6616 }, { "completion_length": 410.6651954650879, "epoch": 0.515892658780426, "grad_norm": 0.10783038492540677, "kl": 0.0024137496948242188, "learning_rate": 4.7503046741662563e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.04712266940623522, "rewards/accuracy_reward": 0.8638393133878708, "rewards/format_reward": 1.0, "step": 6618 }, { "completion_length": 426.3660888671875, "epoch": 0.5160485646912089, "grad_norm": 0.10597619883865739, "kl": 0.0027561187744140625, "learning_rate": 4.747858747883502e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.7857142984867096, "rewards/format_reward": 1.0, "step": 6620 }, { "completion_length": 426.5335006713867, "epoch": 0.5162044706019917, "grad_norm": 0.08766494913938484, "kl": 0.003093719482421875, "learning_rate": 4.745412882091045e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.05102826654911041, "rewards/accuracy_reward": 0.8191964775323868, "rewards/format_reward": 1.0, "step": 6622 }, { "completion_length": 422.8236770629883, "epoch": 0.5163603765127746, "grad_norm": 0.04758880302794355, "kl": 0.0020856857299804688, "learning_rate": 4.7429670773756656e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.01585849840193987, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 6624 }, { "completion_length": 410.4352836608887, "epoch": 0.5165162824235574, "grad_norm": 0.09470095552988401, "kl": 0.0022954940795898438, "learning_rate": 4.74052133432413e-07, "loss": 0.0001, "reward": 1.7633929550647736, "reward_std": 0.06319871358573437, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 6626 }, { "completion_length": 408.8370704650879, "epoch": 0.5166721883343403, "grad_norm": 0.07257424900636991, "kl": 0.0026464462280273438, "learning_rate": 4.738075653523185e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8214286155998707, "rewards/format_reward": 1.0, "step": 6628 }, { "completion_length": 413.6004638671875, "epoch": 0.5168280942451231, "grad_norm": 0.09943337642302932, "kl": 0.002655029296875, "learning_rate": 4.7356300355595653e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.05200396664440632, "rewards/accuracy_reward": 0.834821455180645, "rewards/format_reward": 1.0, "step": 6630 }, { "completion_length": 435.7321662902832, "epoch": 0.516984000155906, "grad_norm": 0.06701775259476951, "kl": 0.0033817291259765625, "learning_rate": 4.7331844810199927e-07, "loss": 0.0001, "reward": 1.7321429550647736, "reward_std": 0.041786317713558674, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 1.0, "step": 6632 }, { "completion_length": 426.4419822692871, "epoch": 0.5171399060666888, "grad_norm": 0.06152000873508923, "kl": 0.0028018951416015625, "learning_rate": 4.730738990491167e-07, "loss": 0.0001, "reward": 1.8281250894069672, "reward_std": 0.034342396073043346, "rewards/accuracy_reward": 0.8281250298023224, "rewards/format_reward": 1.0, "step": 6634 }, { "completion_length": 419.2366256713867, "epoch": 0.5172958119774715, "grad_norm": 0.05705658521310779, "kl": 0.0026760101318359375, "learning_rate": 4.728293564559781e-07, "loss": 0.0001, "reward": 1.8258929252624512, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.82589291036129, "rewards/format_reward": 1.0, "step": 6636 }, { "completion_length": 430.57368087768555, "epoch": 0.5174517178882544, "grad_norm": 0.18933461861509135, "kl": 0.002849578857421875, "learning_rate": 4.7258482038125056e-07, "loss": 0.0001, "reward": 1.8191965222358704, "reward_std": 0.05929451994597912, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 6638 }, { "completion_length": 439.7656440734863, "epoch": 0.5176076237990372, "grad_norm": 0.09181182808050742, "kl": 0.0028257369995117188, "learning_rate": 4.7234029088360005e-07, "loss": 0.0001, "reward": 1.772321492433548, "reward_std": 0.05425985902547836, "rewards/accuracy_reward": 0.7723214663565159, "rewards/format_reward": 1.0, "step": 6640 }, { "completion_length": 419.8415336608887, "epoch": 0.5177635297098201, "grad_norm": 0.07248050566619481, "kl": 0.0027742385864257812, "learning_rate": 4.720957680216905e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.027206611819565296, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 6642 }, { "completion_length": 422.9419822692871, "epoch": 0.5179194356206029, "grad_norm": 0.04436022176150525, "kl": 0.0026006698608398438, "learning_rate": 4.718512518541849e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.0417863167822361, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6644 }, { "completion_length": 434.4754638671875, "epoch": 0.5180753415313858, "grad_norm": 0.08665310570953945, "kl": 0.0029125213623046875, "learning_rate": 4.7160674243974373e-07, "loss": 0.0001, "reward": 1.7745536416769028, "reward_std": 0.042762015014886856, "rewards/accuracy_reward": 0.774553582072258, "rewards/format_reward": 1.0, "step": 6646 }, { "completion_length": 427.5580520629883, "epoch": 0.5182312474421686, "grad_norm": 0.10090707111262688, "kl": 0.0028171539306640625, "learning_rate": 4.713622398370269e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.05974734202027321, "rewards/accuracy_reward": 0.8370535895228386, "rewards/format_reward": 1.0, "step": 6648 }, { "completion_length": 426.19421005249023, "epoch": 0.5183871533529515, "grad_norm": 0.10468389500389813, "kl": 0.0030527114868164062, "learning_rate": 4.711177441046918e-07, "loss": 0.0001, "reward": 1.709821492433548, "reward_std": 0.045691913925111294, "rewards/accuracy_reward": 0.7098214700818062, "rewards/format_reward": 1.0, "step": 6650 }, { "completion_length": 415.7433204650879, "epoch": 0.5185430592637343, "grad_norm": 0.08837776892323271, "kl": 0.0028028488159179688, "learning_rate": 4.708732553013944e-07, "loss": 0.0001, "reward": 1.783482238650322, "reward_std": 0.03675165679305792, "rewards/accuracy_reward": 0.7834821939468384, "rewards/format_reward": 1.0, "step": 6652 }, { "completion_length": 415.44421768188477, "epoch": 0.5186989651745172, "grad_norm": 0.08227973335403956, "kl": 0.0028476715087890625, "learning_rate": 4.7062877348578967e-07, "loss": 0.0001, "reward": 1.9084822237491608, "reward_std": 0.07190665416419506, "rewards/accuracy_reward": 0.9107143208384514, "rewards/format_reward": 0.9977678656578064, "step": 6654 }, { "completion_length": 432.4107322692871, "epoch": 0.5188548710853, "grad_norm": 0.07000815060760043, "kl": 0.0029916763305664062, "learning_rate": 4.703842987165297e-07, "loss": 0.0001, "reward": 1.81026791036129, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 6656 }, { "completion_length": 413.7433204650879, "epoch": 0.5190107769960829, "grad_norm": 0.09606815995948682, "kl": 0.0026760101318359375, "learning_rate": 4.701398310522661e-07, "loss": 0.0001, "reward": 1.7857143878936768, "reward_std": 0.06170237623155117, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 6658 }, { "completion_length": 430.61832427978516, "epoch": 0.5191666829068657, "grad_norm": 0.00334794872061827, "kl": 0.0027103424072265625, "learning_rate": 4.6989537055164797e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.02479875460267067, "rewards/accuracy_reward": 0.8348214849829674, "rewards/format_reward": 1.0, "step": 6660 }, { "completion_length": 418.7656478881836, "epoch": 0.5193225888176486, "grad_norm": 0.10051068562606336, "kl": 0.002964019775390625, "learning_rate": 4.6965091727332283e-07, "loss": 0.0001, "reward": 1.7700893729925156, "reward_std": 0.05749144684523344, "rewards/accuracy_reward": 0.770089328289032, "rewards/format_reward": 1.0, "step": 6662 }, { "completion_length": 430.2120780944824, "epoch": 0.5194784947284314, "grad_norm": 0.09199553170965642, "kl": 0.0028390884399414062, "learning_rate": 4.6940647127593673e-07, "loss": 0.0001, "reward": 1.774553656578064, "reward_std": 0.04666761215776205, "rewards/accuracy_reward": 0.7745536044239998, "rewards/format_reward": 1.0, "step": 6664 }, { "completion_length": 420.93528747558594, "epoch": 0.5196344006392143, "grad_norm": 0.07342742290919384, "kl": 0.0023908615112304688, "learning_rate": 4.69162032618134e-07, "loss": 0.0001, "reward": 1.8013393431901932, "reward_std": 0.04388744384050369, "rewards/accuracy_reward": 0.8013392984867096, "rewards/format_reward": 1.0, "step": 6666 }, { "completion_length": 434.54019927978516, "epoch": 0.519790306549997, "grad_norm": 0.1158297339692602, "kl": 0.0029239654541015625, "learning_rate": 4.6891760135855685e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.057341720908880234, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 0.9977678656578064, "step": 6668 }, { "completion_length": 427.4241256713867, "epoch": 0.5199462124607799, "grad_norm": 0.06032463705239786, "kl": 0.0027141571044921875, "learning_rate": 4.6867317755584574e-07, "loss": 0.0001, "reward": 1.812500074505806, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 6670 }, { "completion_length": 423.88171768188477, "epoch": 0.5201021183715627, "grad_norm": 0.0038161310055042216, "kl": 0.0027322769165039062, "learning_rate": 4.684287612686399e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 6672 }, { "completion_length": 420.26118087768555, "epoch": 0.5202580242823456, "grad_norm": 0.10016066350923823, "kl": 0.002948760986328125, "learning_rate": 4.681843525555759e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.06786386761814356, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 6674 }, { "completion_length": 434.6361770629883, "epoch": 0.5204139301931284, "grad_norm": 0.08264161809919268, "kl": 0.0030384063720703125, "learning_rate": 4.679399514752895e-07, "loss": 0.0001, "reward": 1.689732238650322, "reward_std": 0.04404080752283335, "rewards/accuracy_reward": 0.689732164144516, "rewards/format_reward": 1.0, "step": 6676 }, { "completion_length": 424.2187690734863, "epoch": 0.5205698361039113, "grad_norm": 0.0876464030410035, "kl": 0.003269195556640625, "learning_rate": 4.676955580864135e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.0563660217449069, "rewards/accuracy_reward": 0.8482143208384514, "rewards/format_reward": 0.9977678656578064, "step": 6678 }, { "completion_length": 412.42412185668945, "epoch": 0.5207257420146941, "grad_norm": 0.0971709708620804, "kl": 0.0024118423461914062, "learning_rate": 4.6745117244757987e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.0456905122846365, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 6680 }, { "completion_length": 414.7031478881836, "epoch": 0.520881647925477, "grad_norm": 0.09514028970596966, "kl": 0.0025177001953125, "learning_rate": 4.672067946174182e-07, "loss": 0.0001, "reward": 1.8861607611179352, "reward_std": 0.034495762549340725, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 6682 }, { "completion_length": 434.370548248291, "epoch": 0.5210375538362598, "grad_norm": 0.04745209642152673, "kl": 0.0028896331787109375, "learning_rate": 4.66962424654556e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7946428880095482, "rewards/format_reward": 1.0, "step": 6684 }, { "completion_length": 426.0268020629883, "epoch": 0.5211934597470427, "grad_norm": 0.10650908050706542, "kl": 0.00305938720703125, "learning_rate": 4.667180626176194e-07, "loss": 0.0001, "reward": 1.7656251043081284, "reward_std": 0.057644814252853394, "rewards/accuracy_reward": 0.7656250298023224, "rewards/format_reward": 1.0, "step": 6686 }, { "completion_length": 408.41072845458984, "epoch": 0.5213493656578255, "grad_norm": 0.11883694824991455, "kl": 0.0028429031372070312, "learning_rate": 4.6647370856523273e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.06688676588237286, "rewards/accuracy_reward": 0.8883928880095482, "rewards/format_reward": 1.0, "step": 6688 }, { "completion_length": 417.15180587768555, "epoch": 0.5215052715686084, "grad_norm": 0.05449645817127757, "kl": 0.00304412841796875, "learning_rate": 4.6622936255601765e-07, "loss": 0.0001, "reward": 1.8504465073347092, "reward_std": 0.030135108157992363, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 1.0, "step": 6690 }, { "completion_length": 413.9754638671875, "epoch": 0.5216611774793912, "grad_norm": 0.09687296215165467, "kl": 0.002635955810546875, "learning_rate": 4.6598502464859436e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.034798857755959034, "rewards/accuracy_reward": 0.848214328289032, "rewards/format_reward": 1.0, "step": 6692 }, { "completion_length": 427.6942138671875, "epoch": 0.5218170833901741, "grad_norm": 0.10805300825093866, "kl": 0.0027399063110351562, "learning_rate": 4.6574069490158136e-07, "loss": 0.0001, "reward": 1.8861607611179352, "reward_std": 0.024797353893518448, "rewards/accuracy_reward": 0.8861607536673546, "rewards/format_reward": 1.0, "step": 6694 }, { "completion_length": 416.9687690734863, "epoch": 0.5219729893009569, "grad_norm": 0.09571721549077591, "kl": 0.002948760986328125, "learning_rate": 4.654963733735946e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.04569191299378872, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 6696 }, { "completion_length": 412.2410888671875, "epoch": 0.5221288952117397, "grad_norm": 0.09866647713101682, "kl": 0.00290679931640625, "learning_rate": 4.6525206012324856e-07, "loss": 0.0001, "reward": 1.7633929252624512, "reward_std": 0.06154900789260864, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 6698 }, { "completion_length": 434.0669860839844, "epoch": 0.5222848011225225, "grad_norm": 0.07130758693357486, "kl": 0.0031719207763671875, "learning_rate": 4.650077552091556e-07, "loss": 0.0001, "reward": 1.8102679550647736, "reward_std": 0.031112208031117916, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 1.0, "step": 6700 }, { "completion_length": 424.5982322692871, "epoch": 0.5224407070333054, "grad_norm": 0.07720467996624958, "kl": 0.0030851364135742188, "learning_rate": 4.6476345868992595e-07, "loss": 0.0001, "reward": 1.8370536267757416, "reward_std": 0.02720661275088787, "rewards/accuracy_reward": 0.8370535895228386, "rewards/format_reward": 1.0, "step": 6702 }, { "completion_length": 425.60493087768555, "epoch": 0.5225966129440882, "grad_norm": 0.09428461011060742, "kl": 0.0028219223022460938, "learning_rate": 4.6451917062416777e-07, "loss": 0.0001, "reward": 1.7745536267757416, "reward_std": 0.04937856271862984, "rewards/accuracy_reward": 0.7745535932481289, "rewards/format_reward": 1.0, "step": 6704 }, { "completion_length": 423.4486770629883, "epoch": 0.5227525188548711, "grad_norm": 0.06639267899924879, "kl": 0.002971649169921875, "learning_rate": 4.642748910704877e-07, "loss": 0.0001, "reward": 1.8482143580913544, "reward_std": 0.04471481218934059, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 6706 }, { "completion_length": 442.59376525878906, "epoch": 0.5229084247656539, "grad_norm": 0.0880817025337797, "kl": 0.0031909942626953125, "learning_rate": 4.6403062008748954e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.05959761422127485, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 6708 }, { "completion_length": 426.9241256713867, "epoch": 0.5230643306764368, "grad_norm": 0.10447664208189907, "kl": 0.0024852752685546875, "learning_rate": 4.6378635773377587e-07, "loss": 0.0001, "reward": 1.850446492433548, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8504464477300644, "rewards/format_reward": 1.0, "step": 6710 }, { "completion_length": 440.3794822692871, "epoch": 0.5232202365872196, "grad_norm": 0.11161716026872372, "kl": 0.003185272216796875, "learning_rate": 4.635421040679465e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.050052570179104805, "rewards/accuracy_reward": 0.7633928954601288, "rewards/format_reward": 1.0, "step": 6712 }, { "completion_length": 423.05358505249023, "epoch": 0.5233761424980025, "grad_norm": 0.0933117257950903, "kl": 0.0026264190673828125, "learning_rate": 4.6329785914859937e-07, "loss": 0.0001, "reward": 1.7544643431901932, "reward_std": 0.07545470725744963, "rewards/accuracy_reward": 0.7589286044239998, "rewards/format_reward": 0.9955357313156128, "step": 6714 }, { "completion_length": 404.2678756713867, "epoch": 0.5235320484087853, "grad_norm": 0.06506647195341556, "kl": 0.0021181106567382812, "learning_rate": 4.630536230343308e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.03336669970303774, "rewards/accuracy_reward": 0.8616071790456772, "rewards/format_reward": 1.0, "step": 6716 }, { "completion_length": 428.34153747558594, "epoch": 0.5236879543195682, "grad_norm": 0.07590443939508182, "kl": 0.0027589797973632812, "learning_rate": 4.6280939578373415e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.04163295216858387, "rewards/accuracy_reward": 0.839285746216774, "rewards/format_reward": 1.0, "step": 6718 }, { "completion_length": 440.5044822692871, "epoch": 0.523843860230351, "grad_norm": 0.0710479288767104, "kl": 0.00273895263671875, "learning_rate": 4.6256517745540135e-07, "loss": 0.0001, "reward": 1.897321492433548, "reward_std": 0.023821654729545116, "rewards/accuracy_reward": 0.8973214626312256, "rewards/format_reward": 1.0, "step": 6720 }, { "completion_length": 429.6049270629883, "epoch": 0.5239997661411339, "grad_norm": 0.10371612767365149, "kl": 0.0029468536376953125, "learning_rate": 4.6232096810792205e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.03434239700436592, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 6722 }, { "completion_length": 453.59822845458984, "epoch": 0.5241556720519167, "grad_norm": 0.09649675127995934, "kl": 0.0028591156005859375, "learning_rate": 4.6207676779988327e-07, "loss": 0.0001, "reward": 1.8549107909202576, "reward_std": 0.05102826841175556, "rewards/accuracy_reward": 0.854910746216774, "rewards/format_reward": 1.0, "step": 6724 }, { "completion_length": 422.87501525878906, "epoch": 0.5243115779626996, "grad_norm": 0.10901880219631421, "kl": 0.003631591796875, "learning_rate": 4.6183257658987033e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.08311617374420166, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 6726 }, { "completion_length": 427.83707427978516, "epoch": 0.5244674838734823, "grad_norm": 0.11667394494919951, "kl": 0.0029125213623046875, "learning_rate": 4.615883945364667e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.8325893133878708, "rewards/format_reward": 1.0, "step": 6728 }, { "completion_length": 432.8995704650879, "epoch": 0.5246233897842651, "grad_norm": 0.0904114770326806, "kl": 0.00292205810546875, "learning_rate": 4.613442216982526e-07, "loss": 0.0001, "reward": 1.868303656578064, "reward_std": 0.04809977114200592, "rewards/accuracy_reward": 0.8683036044239998, "rewards/format_reward": 1.0, "step": 6730 }, { "completion_length": 437.45984268188477, "epoch": 0.524779295695048, "grad_norm": 0.0921267838390858, "kl": 0.0030164718627929688, "learning_rate": 4.6110005813380703e-07, "loss": 0.0001, "reward": 1.906250074505806, "reward_std": 0.04907546937465668, "rewards/accuracy_reward": 0.9062500521540642, "rewards/format_reward": 1.0, "step": 6732 }, { "completion_length": 433.5692138671875, "epoch": 0.5249352016058308, "grad_norm": 0.05612234907408063, "kl": 0.002590179443359375, "learning_rate": 4.608559039017064e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 1.0, "step": 6734 }, { "completion_length": 422.2745704650879, "epoch": 0.5250911075166137, "grad_norm": 0.15405252387366694, "kl": 0.003215789794921875, "learning_rate": 4.6061175906052465e-07, "loss": 0.0001, "reward": 1.7589286714792252, "reward_std": 0.0894296308979392, "rewards/accuracy_reward": 0.7589285895228386, "rewards/format_reward": 1.0, "step": 6736 }, { "completion_length": 419.6585006713867, "epoch": 0.5252470134273965, "grad_norm": 0.11736024563921323, "kl": 0.002986907958984375, "learning_rate": 4.6036762366883394e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.04937856551259756, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 6738 }, { "completion_length": 431.9977836608887, "epoch": 0.5254029193381794, "grad_norm": 0.050196815402158305, "kl": 0.0025081634521484375, "learning_rate": 4.6012349778520357e-07, "loss": 0.0001, "reward": 1.7098215222358704, "reward_std": 0.02915941085666418, "rewards/accuracy_reward": 0.7098214589059353, "rewards/format_reward": 1.0, "step": 6740 }, { "completion_length": 415.60493087768555, "epoch": 0.5255588252489622, "grad_norm": 0.041899783239092624, "kl": 0.0024003982543945312, "learning_rate": 4.5987938146820116e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.03208790626376867, "rewards/accuracy_reward": 0.8348214626312256, "rewards/format_reward": 1.0, "step": 6742 }, { "completion_length": 443.721004486084, "epoch": 0.5257147311597451, "grad_norm": 0.08702382396690606, "kl": 0.003116607666015625, "learning_rate": 4.596352747763919e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.785714328289032, "rewards/format_reward": 1.0, "step": 6744 }, { "completion_length": 412.6852912902832, "epoch": 0.525870637070528, "grad_norm": 0.0833593719574151, "kl": 0.0030574798583984375, "learning_rate": 4.5939117776833815e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.03742426075041294, "rewards/accuracy_reward": 0.7834821790456772, "rewards/format_reward": 1.0, "step": 6746 }, { "completion_length": 428.5513610839844, "epoch": 0.5260265429813108, "grad_norm": 0.10119766459138588, "kl": 0.0027523040771484375, "learning_rate": 4.591470905026005e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 6748 }, { "completion_length": 426.0468978881836, "epoch": 0.5261824488920936, "grad_norm": 0.07987896216357254, "kl": 0.0032863616943359375, "learning_rate": 4.589030130377374e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.039833519607782364, "rewards/accuracy_reward": 0.828125037252903, "rewards/format_reward": 1.0, "step": 6750 }, { "completion_length": 422.1183280944824, "epoch": 0.5263383548028765, "grad_norm": 0.06052968308519536, "kl": 0.0024156570434570312, "learning_rate": 4.586589454323042e-07, "loss": 0.0001, "reward": 1.8638393431901932, "reward_std": 0.01585849840193987, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 6752 }, { "completion_length": 416.38618087768555, "epoch": 0.5264942607136593, "grad_norm": 0.06950452800843489, "kl": 0.0025911331176757812, "learning_rate": 4.584148877448543e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.02382165566086769, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 6754 }, { "completion_length": 426.65626525878906, "epoch": 0.5266501666244422, "grad_norm": 0.09434530795879015, "kl": 0.0027866363525390625, "learning_rate": 4.581708400339389e-07, "loss": 0.0001, "reward": 1.8638393729925156, "reward_std": 0.03983351867645979, "rewards/accuracy_reward": 0.8638393208384514, "rewards/format_reward": 1.0, "step": 6756 }, { "completion_length": 412.63394927978516, "epoch": 0.526806072535225, "grad_norm": 0.10183651999262981, "kl": 0.0031366348266601562, "learning_rate": 4.5792680235810633e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.05395676288753748, "rewards/accuracy_reward": 0.8147321678698063, "rewards/format_reward": 1.0, "step": 6758 }, { "completion_length": 422.3995704650879, "epoch": 0.5269619784460078, "grad_norm": 0.0033217618447897368, "kl": 0.0028820037841796875, "learning_rate": 4.5768277477590305e-07, "loss": 0.0001, "reward": 1.879464328289032, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8794643059372902, "rewards/format_reward": 1.0, "step": 6760 }, { "completion_length": 422.4821662902832, "epoch": 0.5271178843567906, "grad_norm": 0.1408299555641056, "kl": 0.0027246475219726562, "learning_rate": 4.5743875734587276e-07, "loss": 0.0001, "reward": 1.8861607760190964, "reward_std": 0.0665836725383997, "rewards/accuracy_reward": 0.886160746216774, "rewards/format_reward": 1.0, "step": 6762 }, { "completion_length": 424.4419822692871, "epoch": 0.5272737902675735, "grad_norm": 0.06468250834548257, "kl": 0.0020294189453125, "learning_rate": 4.5719475012655673e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.8883928954601288, "rewards/format_reward": 1.0, "step": 6764 }, { "completion_length": 425.6406478881836, "epoch": 0.5274296961783563, "grad_norm": 0.003872218204130191, "kl": 0.00284576416015625, "learning_rate": 4.569507531764941e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 6766 }, { "completion_length": 413.5401916503906, "epoch": 0.5275856020891392, "grad_norm": 0.1168344642564688, "kl": 0.0027418136596679688, "learning_rate": 4.567067665542208e-07, "loss": 0.0001, "reward": 1.7254465073347092, "reward_std": 0.04712267126888037, "rewards/accuracy_reward": 0.7254464700818062, "rewards/format_reward": 1.0, "step": 6768 }, { "completion_length": 415.0335006713867, "epoch": 0.527741507999922, "grad_norm": 0.10697567856707757, "kl": 0.0025758743286132812, "learning_rate": 4.564627903182711e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.06380490306764841, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6770 }, { "completion_length": 425.78796768188477, "epoch": 0.5278974139107049, "grad_norm": 0.06420024748860818, "kl": 0.0028285980224609375, "learning_rate": 4.5621882452717664e-07, "loss": 0.0001, "reward": 1.9062500447034836, "reward_std": 0.03742566145956516, "rewards/accuracy_reward": 0.9062500298023224, "rewards/format_reward": 1.0, "step": 6772 }, { "completion_length": 431.69644927978516, "epoch": 0.5280533198214877, "grad_norm": 0.08526340424159179, "kl": 0.003208160400390625, "learning_rate": 4.559748692394661e-07, "loss": 0.0001, "reward": 1.73214291036129, "reward_std": 0.037727355025708675, "rewards/accuracy_reward": 0.7321428954601288, "rewards/format_reward": 1.0, "step": 6774 }, { "completion_length": 422.60269927978516, "epoch": 0.5282092257322706, "grad_norm": 0.07156783248854524, "kl": 0.0026607513427734375, "learning_rate": 4.557309245136659e-07, "loss": 0.0001, "reward": 1.8928571939468384, "reward_std": 0.016532503068447113, "rewards/accuracy_reward": 0.8928571715950966, "rewards/format_reward": 1.0, "step": 6776 }, { "completion_length": 420.29019927978516, "epoch": 0.5283651316430534, "grad_norm": 0.11411363094093421, "kl": 0.0031490325927734375, "learning_rate": 4.554869904083002e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.06658367160707712, "rewards/accuracy_reward": 0.8058035969734192, "rewards/format_reward": 1.0, "step": 6778 }, { "completion_length": 416.09376525878906, "epoch": 0.5285210375538363, "grad_norm": 0.06951008113496719, "kl": 0.003009796142578125, "learning_rate": 4.5524306698188996e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.03675165772438049, "rewards/accuracy_reward": 0.8058036118745804, "rewards/format_reward": 1.0, "step": 6780 }, { "completion_length": 417.52903747558594, "epoch": 0.5286769434646191, "grad_norm": 0.05313552253733282, "kl": 0.0025663375854492188, "learning_rate": 4.549991542929543e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.0344957634806633, "rewards/accuracy_reward": 0.8013393133878708, "rewards/format_reward": 1.0, "step": 6782 }, { "completion_length": 414.5669860839844, "epoch": 0.528832849375402, "grad_norm": 0.12252824726042506, "kl": 0.0026493072509765625, "learning_rate": 4.547552524000095e-07, "loss": 0.0001, "reward": 1.8325893729925156, "reward_std": 0.06072667706757784, "rewards/accuracy_reward": 0.8325893357396126, "rewards/format_reward": 1.0, "step": 6784 }, { "completion_length": 418.4710006713867, "epoch": 0.5289887552861848, "grad_norm": 0.0808922635582071, "kl": 0.0029239654541015625, "learning_rate": 4.5451136136156874e-07, "loss": 0.0001, "reward": 1.8727679252624512, "reward_std": 0.06515151262283325, "rewards/accuracy_reward": 0.8727678954601288, "rewards/format_reward": 1.0, "step": 6786 }, { "completion_length": 422.5669860839844, "epoch": 0.5291446611969677, "grad_norm": 0.08446899295717579, "kl": 0.0027866363525390625, "learning_rate": 4.542674812361433e-07, "loss": 0.0001, "reward": 1.8616072088479996, "reward_std": 0.041632951237261295, "rewards/accuracy_reward": 0.8616071939468384, "rewards/format_reward": 1.0, "step": 6788 }, { "completion_length": 421.46207427978516, "epoch": 0.5293005671077504, "grad_norm": 0.07278459504823981, "kl": 0.0031890869140625, "learning_rate": 4.5402361208224183e-07, "loss": 0.0001, "reward": 1.8125000894069672, "reward_std": 0.029159409925341606, "rewards/accuracy_reward": 0.812500037252903, "rewards/format_reward": 1.0, "step": 6790 }, { "completion_length": 419.20314025878906, "epoch": 0.5294564730185333, "grad_norm": 0.0559394293744735, "kl": 0.0029201507568359375, "learning_rate": 4.537797539583696e-07, "loss": 0.0001, "reward": 1.7857143729925156, "reward_std": 0.03479885868728161, "rewards/accuracy_reward": 0.7857143133878708, "rewards/format_reward": 1.0, "step": 6792 }, { "completion_length": 442.35493087768555, "epoch": 0.5296123789293161, "grad_norm": 0.10132155908007147, "kl": 0.0030536651611328125, "learning_rate": 4.535359069230301e-07, "loss": 0.0001, "reward": 1.819196492433548, "reward_std": 0.046667611226439476, "rewards/accuracy_reward": 0.8191964700818062, "rewards/format_reward": 1.0, "step": 6794 }, { "completion_length": 421.5000228881836, "epoch": 0.529768284840099, "grad_norm": 0.00299639662403894, "kl": 0.002613067626953125, "learning_rate": 4.532920710347239e-07, "loss": 0.0001, "reward": 1.8526786118745804, "reward_std": 0.01781129650771618, "rewards/accuracy_reward": 0.8526785969734192, "rewards/format_reward": 1.0, "step": 6796 }, { "completion_length": 436.9710006713867, "epoch": 0.5299241907508818, "grad_norm": 0.09822561156336142, "kl": 0.003131866455078125, "learning_rate": 4.5304824635194826e-07, "loss": 0.0001, "reward": 1.8102679252624512, "reward_std": 0.06447890773415565, "rewards/accuracy_reward": 0.81026791036129, "rewards/format_reward": 1.0, "step": 6798 }, { "completion_length": 431.8906440734863, "epoch": 0.5300800966616647, "grad_norm": 0.047197719202401345, "kl": 0.0031566619873046875, "learning_rate": 4.528044329331988e-07, "loss": 0.0001, "reward": 1.7254465073347092, "reward_std": 0.014579704962670803, "rewards/accuracy_reward": 0.725446455180645, "rewards/format_reward": 1.0, "step": 6800 }, { "completion_length": 416.8526954650879, "epoch": 0.5302360025724475, "grad_norm": 0.07103164411586722, "kl": 0.002910614013671875, "learning_rate": 4.5256063083696753e-07, "loss": 0.0001, "reward": 1.7611608058214188, "reward_std": 0.038401360623538494, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 0.9977678656578064, "step": 6802 }, { "completion_length": 412.7210006713867, "epoch": 0.5303919084832304, "grad_norm": 0.11031051197739691, "kl": 0.0028028488159179688, "learning_rate": 4.523168401217444e-07, "loss": 0.0001, "reward": 1.8415179252624512, "reward_std": 0.027206613682210445, "rewards/accuracy_reward": 0.841517873108387, "rewards/format_reward": 1.0, "step": 6804 }, { "completion_length": 416.1071586608887, "epoch": 0.5305478143940132, "grad_norm": 0.08543297294340056, "kl": 0.0025177001953125, "learning_rate": 4.520730608460165e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.06124591734260321, "rewards/accuracy_reward": 0.805803619325161, "rewards/format_reward": 1.0, "step": 6806 }, { "completion_length": 420.5312690734863, "epoch": 0.5307037203047961, "grad_norm": 0.0690470847304262, "kl": 0.0027637481689453125, "learning_rate": 4.518292930682675e-07, "loss": 0.0001, "reward": 1.8883929252624512, "reward_std": 0.053958166390657425, "rewards/accuracy_reward": 0.8883928954601288, "rewards/format_reward": 1.0, "step": 6808 }, { "completion_length": 418.4888572692871, "epoch": 0.5308596262155789, "grad_norm": 0.07986944832948943, "kl": 0.002574920654296875, "learning_rate": 4.5158553684697915e-07, "loss": 0.0001, "reward": 1.7232143878936768, "reward_std": 0.02089315839111805, "rewards/accuracy_reward": 0.7232143059372902, "rewards/format_reward": 1.0, "step": 6810 }, { "completion_length": 428.8125228881836, "epoch": 0.5310155321263618, "grad_norm": 0.11482432552127456, "kl": 0.0028858184814453125, "learning_rate": 4.513417922406304e-07, "loss": 0.0001, "reward": 1.828125074505806, "reward_std": 0.07447900995612144, "rewards/accuracy_reward": 0.8281250447034836, "rewards/format_reward": 1.0, "step": 6812 }, { "completion_length": 431.6942138671875, "epoch": 0.5311714380371446, "grad_norm": 0.0931799730507805, "kl": 0.002490997314453125, "learning_rate": 4.5109805930769674e-07, "loss": 0.0001, "reward": 1.7812501192092896, "reward_std": 0.06267947517335415, "rewards/accuracy_reward": 0.7834821864962578, "rewards/format_reward": 0.9977678656578064, "step": 6814 }, { "completion_length": 431.9620666503906, "epoch": 0.5313273439479275, "grad_norm": 0.1100008090030137, "kl": 0.003383636474609375, "learning_rate": 4.508543381066513e-07, "loss": 0.0001, "reward": 1.8169643878936768, "reward_std": 0.06395827047526836, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 6816 }, { "completion_length": 420.33037185668945, "epoch": 0.5314832498587103, "grad_norm": 0.05668484140624276, "kl": 0.002437591552734375, "learning_rate": 4.506106286959647e-07, "loss": 0.0001, "reward": 1.84151791036129, "reward_std": 0.041936046443879604, "rewards/accuracy_reward": 0.8415178880095482, "rewards/format_reward": 1.0, "step": 6818 }, { "completion_length": 423.3326110839844, "epoch": 0.5316391557694932, "grad_norm": 0.13105099403632214, "kl": 0.002758026123046875, "learning_rate": 4.503669311341039e-07, "loss": 0.0001, "reward": 1.8147322088479996, "reward_std": 0.06658366974443197, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 0.9977678656578064, "step": 6820 }, { "completion_length": 422.636173248291, "epoch": 0.5317950616802759, "grad_norm": 0.09739604072417173, "kl": 0.0028209686279296875, "learning_rate": 4.5012324547953386e-07, "loss": 0.0001, "reward": 1.7879465073347092, "reward_std": 0.056366024538874626, "rewards/accuracy_reward": 0.787946455180645, "rewards/format_reward": 1.0, "step": 6822 }, { "completion_length": 431.174129486084, "epoch": 0.5319509675910588, "grad_norm": 0.08922082009387908, "kl": 0.0030536651611328125, "learning_rate": 4.4987957179071625e-07, "loss": 0.0001, "reward": 1.8683036714792252, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.8683035969734192, "rewards/format_reward": 1.0, "step": 6824 }, { "completion_length": 419.24108505249023, "epoch": 0.5321068735018416, "grad_norm": 0.06606866375944612, "kl": 0.00243377685546875, "learning_rate": 4.4963591012610985e-07, "loss": 0.0001, "reward": 1.7723215073347092, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.7723214626312256, "rewards/format_reward": 1.0, "step": 6826 }, { "completion_length": 429.0424346923828, "epoch": 0.5322627794126245, "grad_norm": 0.1247752856050583, "kl": 0.0029582977294921875, "learning_rate": 4.4939226054417083e-07, "loss": 0.0001, "reward": 1.80803582072258, "reward_std": 0.05553865060210228, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 0.9977678656578064, "step": 6828 }, { "completion_length": 433.37948989868164, "epoch": 0.5324186853234073, "grad_norm": 0.09384729964629537, "kl": 0.002613067626953125, "learning_rate": 4.4914862310335195e-07, "loss": 0.0001, "reward": 1.881696492433548, "reward_std": 0.0594428451731801, "rewards/accuracy_reward": 0.8816964700818062, "rewards/format_reward": 1.0, "step": 6830 }, { "completion_length": 430.5669822692871, "epoch": 0.5325745912341902, "grad_norm": 0.08954982280900961, "kl": 0.0025854110717773438, "learning_rate": 4.489049978621036e-07, "loss": 0.0001, "reward": 1.8125001043081284, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.8125000447034836, "rewards/format_reward": 1.0, "step": 6832 }, { "completion_length": 440.26341247558594, "epoch": 0.532730497144973, "grad_norm": 0.05399824313650106, "kl": 0.0030126571655273438, "learning_rate": 4.4866138487887327e-07, "loss": 0.0001, "reward": 1.8906250596046448, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.890625037252903, "rewards/format_reward": 1.0, "step": 6834 }, { "completion_length": 429.6473388671875, "epoch": 0.5328864030557559, "grad_norm": 0.09828925753576805, "kl": 0.002758026123046875, "learning_rate": 4.4841778421210493e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.0625261114910245, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 6836 }, { "completion_length": 429.70983505249023, "epoch": 0.5330423089665387, "grad_norm": 0.09321907790238033, "kl": 0.003078460693359375, "learning_rate": 4.4817419592023993e-07, "loss": 0.0001, "reward": 1.756696492433548, "reward_std": 0.07905721385031939, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 1.0, "step": 6838 }, { "completion_length": 431.4665412902832, "epoch": 0.5331982148773216, "grad_norm": 0.08580003476634801, "kl": 0.002803802490234375, "learning_rate": 4.47930620061717e-07, "loss": 0.0001, "reward": 1.8392857611179352, "reward_std": 0.039377057924866676, "rewards/accuracy_reward": 0.8392857313156128, "rewards/format_reward": 1.0, "step": 6840 }, { "completion_length": 433.1071548461914, "epoch": 0.5333541207881044, "grad_norm": 0.10768747075361776, "kl": 0.003330230712890625, "learning_rate": 4.4768705669497107e-07, "loss": 0.0001, "reward": 1.7633929550647736, "reward_std": 0.053436124697327614, "rewards/accuracy_reward": 0.7633928805589676, "rewards/format_reward": 1.0, "step": 6842 }, { "completion_length": 411.1607322692871, "epoch": 0.5335100266988873, "grad_norm": 0.0810979184708247, "kl": 0.0026302337646484375, "learning_rate": 4.4744350587843494e-07, "loss": 0.0001, "reward": 1.6986607909202576, "reward_std": 0.04569051135331392, "rewards/accuracy_reward": 0.698660746216774, "rewards/format_reward": 1.0, "step": 6844 }, { "completion_length": 422.7656440734863, "epoch": 0.5336659326096701, "grad_norm": 0.06391083863808944, "kl": 0.0023851394653320312, "learning_rate": 4.47199967670538e-07, "loss": 0.0001, "reward": 1.8080357909202576, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.8102679029107094, "rewards/format_reward": 0.9977678656578064, "step": 6846 }, { "completion_length": 428.27457427978516, "epoch": 0.533821838520453, "grad_norm": 0.11431532934007944, "kl": 0.0030841827392578125, "learning_rate": 4.4695644212970625e-07, "loss": 0.0001, "reward": 1.7209822237491608, "reward_std": 0.05246042646467686, "rewards/accuracy_reward": 0.7209821715950966, "rewards/format_reward": 1.0, "step": 6848 }, { "completion_length": 422.20537185668945, "epoch": 0.5339777444312358, "grad_norm": 0.08097300552101339, "kl": 0.0030107498168945312, "learning_rate": 4.467129293143632e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.05220318678766489, "rewards/accuracy_reward": 0.7946429029107094, "rewards/format_reward": 0.9955357164144516, "step": 6850 }, { "completion_length": 428.3460006713867, "epoch": 0.5341336503420185, "grad_norm": 0.07829766185413924, "kl": 0.0027256011962890625, "learning_rate": 4.4646942928292954e-07, "loss": 0.0001, "reward": 1.7946429401636124, "reward_std": 0.049075471237301826, "rewards/accuracy_reward": 0.7946428954601288, "rewards/format_reward": 1.0, "step": 6852 }, { "completion_length": 415.3147430419922, "epoch": 0.5342895562528014, "grad_norm": 0.07009781686905706, "kl": 0.002521514892578125, "learning_rate": 4.4622594209382186e-07, "loss": 0.0001, "reward": 1.8236607760190964, "reward_std": 0.05087490100413561, "rewards/accuracy_reward": 0.8236607611179352, "rewards/format_reward": 1.0, "step": 6854 }, { "completion_length": 425.7522506713867, "epoch": 0.5344454621635842, "grad_norm": 0.111307064354964, "kl": 0.0031414031982421875, "learning_rate": 4.459824678054547e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.04666761215776205, "rewards/accuracy_reward": 0.7790178880095482, "rewards/format_reward": 1.0, "step": 6856 }, { "completion_length": 417.38618087768555, "epoch": 0.5346013680743671, "grad_norm": 0.08629438080621596, "kl": 0.0028600692749023438, "learning_rate": 4.457390064762391e-07, "loss": 0.0001, "reward": 1.8214286416769028, "reward_std": 0.050200898200273514, "rewards/accuracy_reward": 0.8214286044239998, "rewards/format_reward": 1.0, "step": 6858 }, { "completion_length": 425.10047149658203, "epoch": 0.5347572739851499, "grad_norm": 0.0751997805306693, "kl": 0.0026149749755859375, "learning_rate": 4.454955581645826e-07, "loss": 0.0001, "reward": 1.8482143729925156, "reward_std": 0.045586638152599335, "rewards/accuracy_reward": 0.8504464626312256, "rewards/format_reward": 0.9977678656578064, "step": 6860 }, { "completion_length": 421.0357322692871, "epoch": 0.5349131798959328, "grad_norm": 0.04585975917584866, "kl": 0.0024995803833007812, "learning_rate": 4.452521229288906e-07, "loss": 0.0001, "reward": 1.8705357909202576, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8705357313156128, "rewards/format_reward": 1.0, "step": 6862 }, { "completion_length": 430.54019927978516, "epoch": 0.5350690858067156, "grad_norm": 0.10216287896407217, "kl": 0.0030651092529296875, "learning_rate": 4.4500870082756414e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.06463087350130081, "rewards/accuracy_reward": 0.7455357536673546, "rewards/format_reward": 1.0, "step": 6864 }, { "completion_length": 417.3437690734863, "epoch": 0.5352249917174985, "grad_norm": 0.10488266258483246, "kl": 0.0025892257690429688, "learning_rate": 4.4476529191900216e-07, "loss": 0.0001, "reward": 1.8437500596046448, "reward_std": 0.04178631864488125, "rewards/accuracy_reward": 0.8437500223517418, "rewards/format_reward": 1.0, "step": 6866 }, { "completion_length": 411.4107360839844, "epoch": 0.5353808976282813, "grad_norm": 0.044093397439987476, "kl": 0.0020132064819335938, "learning_rate": 4.4452189626160005e-07, "loss": 0.0001, "reward": 1.8816964775323868, "reward_std": 0.025774452835321426, "rewards/accuracy_reward": 0.881696455180645, "rewards/format_reward": 1.0, "step": 6868 }, { "completion_length": 419.0402030944824, "epoch": 0.5355368035390642, "grad_norm": 0.09776562509926154, "kl": 0.0030736923217773438, "learning_rate": 4.4427851391374956e-07, "loss": 0.0001, "reward": 1.8058036714792252, "reward_std": 0.05523555725812912, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 1.0, "step": 6870 }, { "completion_length": 420.4509086608887, "epoch": 0.535692709449847, "grad_norm": 0.07337505639953971, "kl": 0.0027971267700195312, "learning_rate": 4.4403514493383997e-07, "loss": 0.0001, "reward": 1.843750074505806, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.8437500223517418, "rewards/format_reward": 1.0, "step": 6872 }, { "completion_length": 420.46207427978516, "epoch": 0.5358486153606299, "grad_norm": 0.11344326366128409, "kl": 0.0027532577514648438, "learning_rate": 4.4379178938025724e-07, "loss": 0.0001, "reward": 1.7388393729925156, "reward_std": 0.04373911675065756, "rewards/accuracy_reward": 0.738839328289032, "rewards/format_reward": 1.0, "step": 6874 }, { "completion_length": 434.07368087768555, "epoch": 0.5360045212714127, "grad_norm": 0.25706517383521377, "kl": 0.013519287109375, "learning_rate": 4.435484473113836e-07, "loss": 0.0005, "reward": 1.8013393729925156, "reward_std": 0.045017908327281475, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 6876 }, { "completion_length": 431.12724685668945, "epoch": 0.5361604271821956, "grad_norm": 0.1091377883456251, "kl": 0.002780914306640625, "learning_rate": 4.433051187855984e-07, "loss": 0.0001, "reward": 1.7857143580913544, "reward_std": 0.08101000916212797, "rewards/accuracy_reward": 0.7857143208384514, "rewards/format_reward": 1.0, "step": 6878 }, { "completion_length": 427.76787185668945, "epoch": 0.5363163330929784, "grad_norm": 0.0776328831547509, "kl": 0.0028858184814453125, "learning_rate": 4.4306180386127795e-07, "loss": 0.0001, "reward": 1.924107238650322, "reward_std": 0.044714813120663166, "rewards/accuracy_reward": 0.9241071790456772, "rewards/format_reward": 1.0, "step": 6880 }, { "completion_length": 429.18305587768555, "epoch": 0.5364722390037612, "grad_norm": 0.04272681241389847, "kl": 0.002635955810546875, "learning_rate": 4.4281850259679475e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.030438203364610672, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6882 }, { "completion_length": 430.41743087768555, "epoch": 0.536628144914544, "grad_norm": 0.1324117887143675, "kl": 0.0027914047241210938, "learning_rate": 4.425752150505184e-07, "loss": 0.0001, "reward": 1.7901786714792252, "reward_std": 0.0602702172473073, "rewards/accuracy_reward": 0.7901786044239998, "rewards/format_reward": 1.0, "step": 6884 }, { "completion_length": 414.7388610839844, "epoch": 0.5367840508253269, "grad_norm": 0.07535893540381716, "kl": 0.0028057098388671875, "learning_rate": 4.4233194128081553e-07, "loss": 0.0001, "reward": 1.7700893431901932, "reward_std": 0.0426086476072669, "rewards/accuracy_reward": 0.7700893096625805, "rewards/format_reward": 1.0, "step": 6886 }, { "completion_length": 416.72322845458984, "epoch": 0.5369399567361097, "grad_norm": 0.08541996160249331, "kl": 0.0025606155395507812, "learning_rate": 4.420886813460485e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.030438202433288097, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 6888 }, { "completion_length": 435.17859649658203, "epoch": 0.5370958626468926, "grad_norm": 0.040482428864699854, "kl": 0.0030040740966796875, "learning_rate": 4.4184543530457744e-07, "loss": 0.0001, "reward": 1.8058036416769028, "reward_std": 0.04794640280306339, "rewards/accuracy_reward": 0.8058035895228386, "rewards/format_reward": 1.0, "step": 6890 }, { "completion_length": 423.3817138671875, "epoch": 0.5372517685576754, "grad_norm": 0.10193244093621952, "kl": 0.0030727386474609375, "learning_rate": 4.416022032147581e-07, "loss": 0.0001, "reward": 1.74553582072258, "reward_std": 0.05298106651753187, "rewards/accuracy_reward": 0.7455357387661934, "rewards/format_reward": 1.0, "step": 6892 }, { "completion_length": 421.69644927978516, "epoch": 0.5374076744684583, "grad_norm": 0.054012555563745346, "kl": 0.0025053024291992188, "learning_rate": 4.413589851349437e-07, "loss": 0.0001, "reward": 1.796875074505806, "reward_std": 0.021868856623768806, "rewards/accuracy_reward": 0.7968750223517418, "rewards/format_reward": 1.0, "step": 6894 }, { "completion_length": 417.8772506713867, "epoch": 0.5375635803792411, "grad_norm": 0.0766977072082602, "kl": 0.0026073455810546875, "learning_rate": 4.4111578112348403e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.04404080845415592, "rewards/accuracy_reward": 0.783482164144516, "rewards/format_reward": 1.0, "step": 6896 }, { "completion_length": 417.4196586608887, "epoch": 0.537719486290024, "grad_norm": 0.08986169561195842, "kl": 0.003025054931640625, "learning_rate": 4.408725912387249e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.060270216315984726, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 6898 }, { "completion_length": 413.5759162902832, "epoch": 0.5378753922008068, "grad_norm": 0.04894528186792258, "kl": 0.0024967193603515625, "learning_rate": 4.406294155390092e-07, "loss": 0.0001, "reward": 1.8370536416769028, "reward_std": 0.02284595649689436, "rewards/accuracy_reward": 0.8370535969734192, "rewards/format_reward": 1.0, "step": 6900 }, { "completion_length": 413.6451072692871, "epoch": 0.5380312981115897, "grad_norm": 0.10786745926576857, "kl": 0.0024700164794921875, "learning_rate": 4.403862540826766e-07, "loss": 0.0001, "reward": 1.8459822088479996, "reward_std": 0.028485404327511787, "rewards/accuracy_reward": 0.8459821864962578, "rewards/format_reward": 1.0, "step": 6902 }, { "completion_length": 421.0893020629883, "epoch": 0.5381872040223725, "grad_norm": 0.09069151321341841, "kl": 0.00272369384765625, "learning_rate": 4.4014310692806267e-07, "loss": 0.0001, "reward": 1.8035715222358704, "reward_std": 0.03644856158643961, "rewards/accuracy_reward": 0.8035714626312256, "rewards/format_reward": 1.0, "step": 6904 }, { "completion_length": 430.0870704650879, "epoch": 0.5383431099331554, "grad_norm": 0.1373859476032075, "kl": 0.0028829574584960938, "learning_rate": 4.398999741335003e-07, "loss": 0.0001, "reward": 1.7834822237491608, "reward_std": 0.07237648498266935, "rewards/accuracy_reward": 0.7834821715950966, "rewards/format_reward": 1.0, "step": 6906 }, { "completion_length": 413.2076072692871, "epoch": 0.5384990158439382, "grad_norm": 0.0772838253190408, "kl": 0.0024557113647460938, "learning_rate": 4.396568557573186e-07, "loss": 0.0001, "reward": 1.8080358058214188, "reward_std": 0.028182310052216053, "rewards/accuracy_reward": 0.808035746216774, "rewards/format_reward": 1.0, "step": 6908 }, { "completion_length": 427.1986770629883, "epoch": 0.5386549217547211, "grad_norm": 0.05481871088077167, "kl": 0.0028390884399414062, "learning_rate": 4.394137518578431e-07, "loss": 0.0001, "reward": 1.7991072237491608, "reward_std": 0.04065584950149059, "rewards/accuracy_reward": 0.7991071790456772, "rewards/format_reward": 1.0, "step": 6910 }, { "completion_length": 418.71653747558594, "epoch": 0.5388108276655039, "grad_norm": 0.07203513639571665, "kl": 0.0024690628051757812, "learning_rate": 4.391706624933958e-07, "loss": 0.0001, "reward": 1.7455357909202576, "reward_std": 0.045993607491254807, "rewards/accuracy_reward": 0.745535746216774, "rewards/format_reward": 1.0, "step": 6912 }, { "completion_length": 419.3794822692871, "epoch": 0.5389667335762867, "grad_norm": 0.10746200801298575, "kl": 0.0031280517578125, "learning_rate": 4.3892758772229587e-07, "loss": 0.0001, "reward": 1.7834822088479996, "reward_std": 0.06124591641128063, "rewards/accuracy_reward": 0.7834821939468384, "rewards/format_reward": 1.0, "step": 6914 }, { "completion_length": 411.46876525878906, "epoch": 0.5391226394870695, "grad_norm": 0.09523690354166962, "kl": 0.0025281906127929688, "learning_rate": 4.3868452760285804e-07, "loss": 0.0001, "reward": 1.8794643431901932, "reward_std": 0.05493246205151081, "rewards/accuracy_reward": 0.8794643133878708, "rewards/format_reward": 1.0, "step": 6916 }, { "completion_length": 415.5580520629883, "epoch": 0.5392785453978524, "grad_norm": 0.08824278849845452, "kl": 0.002460479736328125, "learning_rate": 4.3844148219339433e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.038704453967511654, "rewards/accuracy_reward": 0.8035714700818062, "rewards/format_reward": 1.0, "step": 6918 }, { "completion_length": 423.6986770629883, "epoch": 0.5394344513086352, "grad_norm": 0.08492983149751851, "kl": 0.003253936767578125, "learning_rate": 4.381984515522127e-07, "loss": 0.0001, "reward": 1.790178656578064, "reward_std": 0.04742576461285353, "rewards/accuracy_reward": 0.7901785969734192, "rewards/format_reward": 1.0, "step": 6920 }, { "completion_length": 420.57591247558594, "epoch": 0.5395903572194181, "grad_norm": 0.11281433086291241, "kl": 0.002899169921875, "learning_rate": 4.379554357376176e-07, "loss": 0.0001, "reward": 1.79464291036129, "reward_std": 0.05425985995680094, "rewards/accuracy_reward": 0.7946428805589676, "rewards/format_reward": 1.0, "step": 6922 }, { "completion_length": 426.79466247558594, "epoch": 0.5397462631302009, "grad_norm": 0.07463239947957588, "kl": 0.0032510757446289062, "learning_rate": 4.377124348079105e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.044194173999130726, "rewards/accuracy_reward": 0.8013393245637417, "rewards/format_reward": 1.0, "step": 6924 }, { "completion_length": 406.54689025878906, "epoch": 0.5399021690409838, "grad_norm": 0.08995518737369303, "kl": 0.0025768280029296875, "learning_rate": 4.3746944882138846e-07, "loss": 0.0001, "reward": 1.8102679401636124, "reward_std": 0.06447890773415565, "rewards/accuracy_reward": 0.810267873108387, "rewards/format_reward": 1.0, "step": 6926 }, { "completion_length": 428.2634048461914, "epoch": 0.5400580749517666, "grad_norm": 0.1199434478210028, "kl": 0.00293731689453125, "learning_rate": 4.372264778363457e-07, "loss": 0.0001, "reward": 1.803571492433548, "reward_std": 0.06395827047526836, "rewards/accuracy_reward": 0.803571455180645, "rewards/format_reward": 1.0, "step": 6928 }, { "completion_length": 419.7500190734863, "epoch": 0.5402139808625495, "grad_norm": 0.0616041371672195, "kl": 0.0027647018432617188, "learning_rate": 4.369835219110724e-07, "loss": 0.0001, "reward": 1.8482143431901932, "reward_std": 0.04373771417886019, "rewards/accuracy_reward": 0.8482143133878708, "rewards/format_reward": 1.0, "step": 6930 }, { "completion_length": 416.99332427978516, "epoch": 0.5403698867733323, "grad_norm": 0.048517118112772414, "kl": 0.0024738311767578125, "learning_rate": 4.3674058110385505e-07, "loss": 0.0001, "reward": 1.866071492433548, "reward_std": 0.028182310983538628, "rewards/accuracy_reward": 0.8660714700818062, "rewards/format_reward": 1.0, "step": 6932 }, { "completion_length": 421.1897506713867, "epoch": 0.5405257926841152, "grad_norm": 0.13177791814929968, "kl": 0.0026006698608398438, "learning_rate": 4.364976554729768e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.05682107899338007, "rewards/accuracy_reward": 0.8147321715950966, "rewards/format_reward": 1.0, "step": 6934 }, { "completion_length": 419.97993087768555, "epoch": 0.540681698594898, "grad_norm": 0.05072421485814673, "kl": 0.0024061203002929688, "learning_rate": 4.362547450767174e-07, "loss": 0.0001, "reward": 1.9062500596046448, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.9062500596046448, "rewards/format_reward": 1.0, "step": 6936 }, { "completion_length": 419.2768020629883, "epoch": 0.5408376045056809, "grad_norm": 0.08581034113808157, "kl": 0.0029735565185546875, "learning_rate": 4.3601184997335217e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 6938 }, { "completion_length": 419.6406440734863, "epoch": 0.5409935104164637, "grad_norm": 0.10343119200335055, "kl": 0.0027313232421875, "learning_rate": 4.3576897022115333e-07, "loss": 0.0001, "reward": 1.7790179252624512, "reward_std": 0.053956763818860054, "rewards/accuracy_reward": 0.7790178805589676, "rewards/format_reward": 1.0, "step": 6940 }, { "completion_length": 420.7812614440918, "epoch": 0.5411494163272466, "grad_norm": 0.06706408510769159, "kl": 0.0024585723876953125, "learning_rate": 4.3552610587838946e-07, "loss": 0.0001, "reward": 1.8638393580913544, "reward_std": 0.03306360449641943, "rewards/accuracy_reward": 0.8638393059372902, "rewards/format_reward": 1.0, "step": 6942 }, { "completion_length": 417.70537185668945, "epoch": 0.5413053222380293, "grad_norm": 0.06472462422406275, "kl": 0.0025577545166015625, "learning_rate": 4.3528325700332495e-07, "loss": 0.0001, "reward": 1.816964402794838, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 6944 }, { "completion_length": 422.0044822692871, "epoch": 0.5414612281488121, "grad_norm": 0.10599420752538263, "kl": 0.002979278564453125, "learning_rate": 4.35040423654221e-07, "loss": 0.0001, "reward": 1.8459822237491608, "reward_std": 0.047946405597031116, "rewards/accuracy_reward": 0.8459821939468384, "rewards/format_reward": 1.0, "step": 6946 }, { "completion_length": 421.4151954650879, "epoch": 0.541617134059595, "grad_norm": 0.09076414313796732, "kl": 0.0027074813842773438, "learning_rate": 4.34797605889335e-07, "loss": 0.0001, "reward": 1.7901786267757416, "reward_std": 0.06072304118424654, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 6948 }, { "completion_length": 422.74555587768555, "epoch": 0.5417730399703778, "grad_norm": 0.0025059627713930758, "kl": 0.0024194717407226562, "learning_rate": 4.3455480376692035e-07, "loss": 0.0001, "reward": 1.8348215222358704, "reward_std": 0.025253813713788986, "rewards/accuracy_reward": 0.8348214775323868, "rewards/format_reward": 1.0, "step": 6950 }, { "completion_length": 424.0535888671875, "epoch": 0.5419289458811607, "grad_norm": 0.11035054310147371, "kl": 0.0029659271240234375, "learning_rate": 4.3431201734522685e-07, "loss": 0.0001, "reward": 1.7477679401636124, "reward_std": 0.055388922803103924, "rewards/accuracy_reward": 0.7477678880095482, "rewards/format_reward": 1.0, "step": 6952 }, { "completion_length": 419.64064025878906, "epoch": 0.5420848517919435, "grad_norm": 0.04359811493024913, "kl": 0.0031185150146484375, "learning_rate": 4.340692466825002e-07, "loss": 0.0001, "reward": 1.7723215222358704, "reward_std": 0.05343612376600504, "rewards/accuracy_reward": 0.7723214700818062, "rewards/format_reward": 1.0, "step": 6954 }, { "completion_length": 436.3236846923828, "epoch": 0.5422407577027264, "grad_norm": 0.08297920130613477, "kl": 0.0030727386474609375, "learning_rate": 4.3382649183698305e-07, "loss": 0.0001, "reward": 1.7901786416769028, "reward_std": 0.05862051248550415, "rewards/accuracy_reward": 0.7924107536673546, "rewards/format_reward": 0.9977678656578064, "step": 6956 }, { "completion_length": 426.7433204650879, "epoch": 0.5423966636135092, "grad_norm": 0.09582059722082327, "kl": 0.0028514862060546875, "learning_rate": 4.335837528669139e-07, "loss": 0.0001, "reward": 1.8303571939468384, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.8303571790456772, "rewards/format_reward": 1.0, "step": 6958 }, { "completion_length": 443.1629638671875, "epoch": 0.5425525695242921, "grad_norm": 0.11815575018505656, "kl": 0.006153106689453125, "learning_rate": 4.333410298305271e-07, "loss": 0.0002, "reward": 1.8102679401636124, "reward_std": 0.04111231118440628, "rewards/accuracy_reward": 0.8102678880095482, "rewards/format_reward": 1.0, "step": 6960 }, { "completion_length": 415.7410888671875, "epoch": 0.5427084754350749, "grad_norm": 0.0914254046230792, "kl": 0.0027627944946289062, "learning_rate": 4.330983227860534e-07, "loss": 0.0001, "reward": 1.8303572684526443, "reward_std": 0.04614697303622961, "rewards/accuracy_reward": 0.8303571715950966, "rewards/format_reward": 1.0, "step": 6962 }, { "completion_length": 425.36163330078125, "epoch": 0.5428643813458578, "grad_norm": 0.08419017814309021, "kl": 0.003147125244140625, "learning_rate": 4.3285563179172017e-07, "loss": 0.0001, "reward": 1.76116082072258, "reward_std": 0.07320021837949753, "rewards/accuracy_reward": 0.7611607387661934, "rewards/format_reward": 1.0, "step": 6964 }, { "completion_length": 422.5781440734863, "epoch": 0.5430202872566406, "grad_norm": 0.05001402156467398, "kl": 0.0023326873779296875, "learning_rate": 4.3261295690575005e-07, "loss": 0.0001, "reward": 1.8013393729925156, "reward_std": 0.01894036028534174, "rewards/accuracy_reward": 0.8013393208384514, "rewards/format_reward": 1.0, "step": 6966 }, { "completion_length": 418.2299270629883, "epoch": 0.5431761931674235, "grad_norm": 0.09908870713899143, "kl": 0.003170013427734375, "learning_rate": 4.3237029818636273e-07, "loss": 0.0001, "reward": 1.7901786714792252, "reward_std": 0.036448562517762184, "rewards/accuracy_reward": 0.7901786118745804, "rewards/format_reward": 1.0, "step": 6968 }, { "completion_length": 428.8169822692871, "epoch": 0.5433320990782063, "grad_norm": 0.0975371521890931, "kl": 0.003040313720703125, "learning_rate": 4.3212765569177335e-07, "loss": 0.0001, "reward": 1.8169643729925156, "reward_std": 0.062155201099812984, "rewards/accuracy_reward": 0.8169643208384514, "rewards/format_reward": 1.0, "step": 6970 }, { "completion_length": 426.68528747558594, "epoch": 0.5434880049889892, "grad_norm": 0.002983279478071925, "kl": 0.00279998779296875, "learning_rate": 4.318850294801934e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.012626906856894493, "rewards/accuracy_reward": 0.8258928954601288, "rewards/format_reward": 1.0, "step": 6972 }, { "completion_length": 431.26564025878906, "epoch": 0.543643910899772, "grad_norm": 0.05268762334541054, "kl": 0.0026760101318359375, "learning_rate": 4.316424196098304e-07, "loss": 0.0001, "reward": 1.8437500447034836, "reward_std": 0.03208790626376867, "rewards/accuracy_reward": 0.843750037252903, "rewards/format_reward": 1.0, "step": 6974 }, { "completion_length": 419.6585006713867, "epoch": 0.5437998168105548, "grad_norm": 0.0627087581858751, "kl": 0.0024633407592773438, "learning_rate": 4.313998261388884e-07, "loss": 0.0001, "reward": 1.8258929401636124, "reward_std": 0.06124731805175543, "rewards/accuracy_reward": 0.8258928880095482, "rewards/format_reward": 1.0, "step": 6976 }, { "completion_length": 410.57814025878906, "epoch": 0.5439557227213376, "grad_norm": 0.10113756973722228, "kl": 0.0025501251220703125, "learning_rate": 4.311572491255665e-07, "loss": 0.0001, "reward": 1.8660715073347092, "reward_std": 0.05441322457045317, "rewards/accuracy_reward": 0.8660714626312256, "rewards/format_reward": 1.0, "step": 6978 }, { "completion_length": 406.2299270629883, "epoch": 0.5441116286321205, "grad_norm": 0.07290222731923274, "kl": 0.0024080276489257812, "learning_rate": 4.309146886280611e-07, "loss": 0.0001, "reward": 1.8147322237491608, "reward_std": 0.03968015220016241, "rewards/accuracy_reward": 0.8147321790456772, "rewards/format_reward": 1.0, "step": 6980 }, { "completion_length": 441.91743087768555, "epoch": 0.5442675345429033, "grad_norm": 0.09614182923374247, "kl": 0.002689361572265625, "learning_rate": 4.306721447045636e-07, "loss": 0.0001, "reward": 1.8348215073347092, "reward_std": 0.058838057331740856, "rewards/accuracy_reward": 0.8348214700818062, "rewards/format_reward": 1.0, "step": 6982 }, { "completion_length": 414.2388610839844, "epoch": 0.5444234404536862, "grad_norm": 0.052926881868510765, "kl": 0.0026264190673828125, "learning_rate": 4.304296174132619e-07, "loss": 0.0001, "reward": 1.7366072237491608, "reward_std": 0.03111080639064312, "rewards/accuracy_reward": 0.7366071864962578, "rewards/format_reward": 1.0, "step": 6984 }, { "completion_length": 414.41743087768555, "epoch": 0.544579346364469, "grad_norm": 0.10841011051546345, "kl": 0.0028133392333984375, "learning_rate": 4.301871068123401e-07, "loss": 0.0001, "reward": 1.7767858058214188, "reward_std": 0.0529810655862093, "rewards/accuracy_reward": 0.7767857387661934, "rewards/format_reward": 1.0, "step": 6986 }, { "completion_length": 421.3616256713867, "epoch": 0.5447352522752519, "grad_norm": 0.06846012338999144, "kl": 0.0025653839111328125, "learning_rate": 4.2994461295997764e-07, "loss": 0.0001, "reward": 1.7991072088479996, "reward_std": 0.041632951237261295, "rewards/accuracy_reward": 0.799107164144516, "rewards/format_reward": 1.0, "step": 6988 }, { "completion_length": 420.55805587768555, "epoch": 0.5448911581860347, "grad_norm": 0.2705462330904095, "kl": 0.0031404495239257812, "learning_rate": 4.297021359143507e-07, "loss": 0.0001, "reward": 1.7544643729925156, "reward_std": 0.08469806239008904, "rewards/accuracy_reward": 0.7566964700818062, "rewards/format_reward": 0.9977678656578064, "step": 6990 }, { "completion_length": 415.28572845458984, "epoch": 0.5450470640968176, "grad_norm": 0.08994406769136543, "kl": 0.0026302337646484375, "learning_rate": 4.2945967573363103e-07, "loss": 0.0001, "reward": 1.7633929401636124, "reward_std": 0.02915941085666418, "rewards/accuracy_reward": 0.7633928880095482, "rewards/format_reward": 1.0, "step": 6992 }, { "completion_length": 419.36609268188477, "epoch": 0.5452029700076004, "grad_norm": 0.08183749779741711, "kl": 0.0028667449951171875, "learning_rate": 4.2921723247598603e-07, "loss": 0.0001, "reward": 1.8080357760190964, "reward_std": 0.04065585229545832, "rewards/accuracy_reward": 0.8080357387661934, "rewards/format_reward": 1.0, "step": 6994 }, { "completion_length": 426.64957427978516, "epoch": 0.5453588759183833, "grad_norm": 0.10961149496268685, "kl": 0.0028018951416015625, "learning_rate": 4.289748061995797e-07, "loss": 0.0001, "reward": 1.941964328289032, "reward_std": 0.05425985809415579, "rewards/accuracy_reward": 0.9419643133878708, "rewards/format_reward": 1.0, "step": 6996 }, { "completion_length": 436.4419860839844, "epoch": 0.5455147818291661, "grad_norm": 0.11117281874757906, "kl": 0.002986907958984375, "learning_rate": 4.2873239696257175e-07, "loss": 0.0001, "reward": 1.8169643580913544, "reward_std": 0.05636462103575468, "rewards/accuracy_reward": 0.8169643133878708, "rewards/format_reward": 1.0, "step": 6998 }, { "completion_length": 435.8750228881836, "epoch": 0.545670687739949, "grad_norm": 0.08972913619844713, "kl": 0.0031528472900390625, "learning_rate": 4.2849000482311735e-07, "loss": 0.0001, "reward": 1.7968750894069672, "reward_std": 0.07853797357529402, "rewards/accuracy_reward": 0.7968750298023224, "rewards/format_reward": 1.0, "step": 7000 } ], "logging_steps": 2, "max_steps": 12828, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }