{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03448228300299306, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 330.09375, "epoch": 1.3792913201197225e-05, "grad_norm": 2.238956052403626, "kl": 0.0, "learning_rate": 9.999999995305906e-07, "loss": 0.0, "reward": 1.9106249809265137, "reward_std": 0.18016189336776733, "rewards/accuracy_reward": 0.7918750047683716, "rewards/format_reward": 1.0, "step": 1 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 376.03125, "epoch": 2.758582640239445e-05, "grad_norm": 2.425725981909806, "kl": 0.00069427490234375, "learning_rate": 9.999999981223625e-07, "loss": 0.0, "reward": 1.8987188339233398, "reward_std": 0.1683354675769806, "rewards/accuracy_reward": 0.7549687027931213, "rewards/format_reward": 1.0, "step": 2 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 343.71875, "epoch": 4.137873960359168e-05, "grad_norm": 2.6361672047505698, "kl": 0.000736236572265625, "learning_rate": 9.999999957753156e-07, "loss": 0.0, "reward": 1.870500087738037, "reward_std": 0.13440653681755066, "rewards/accuracy_reward": 0.7267500162124634, "rewards/format_reward": 1.0, "step": 3 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 346.5, "epoch": 5.51716528047889e-05, "grad_norm": 1.920993511756084, "kl": 0.000751495361328125, "learning_rate": 9.9999999248945e-07, "loss": 0.0, "reward": 2.0257186889648438, "reward_std": 0.10654528439044952, "rewards/accuracy_reward": 0.8944687247276306, "rewards/format_reward": 1.0, "step": 4 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 386.375, "epoch": 6.896456600598612e-05, "grad_norm": 6.553090219822929, "kl": 0.0008087158203125, "learning_rate": 9.999999882647657e-07, "loss": 0.0, "reward": 1.8300312757492065, "reward_std": 0.2219369113445282, "rewards/accuracy_reward": 0.7112812399864197, "rewards/format_reward": 0.96875, "step": 5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 384.4375, "epoch": 8.275747920718336e-05, "grad_norm": 2.8837391275739344, "kl": 0.00110626220703125, "learning_rate": 9.999999831012624e-07, "loss": 0.0, "reward": 1.9354686737060547, "reward_std": 0.14949820935726166, "rewards/accuracy_reward": 0.7729687690734863, "rewards/format_reward": 1.0, "step": 6 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 384.84375, "epoch": 9.655039240838057e-05, "grad_norm": 4.733787626862378, "kl": 0.001739501953125, "learning_rate": 9.999999769989408e-07, "loss": 0.0001, "reward": 1.861781358718872, "reward_std": 0.14443737268447876, "rewards/accuracy_reward": 0.6992812752723694, "rewards/format_reward": 1.0, "step": 7 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.59375, "epoch": 0.0001103433056095778, "grad_norm": 1.9859377598029717, "kl": 0.001922607421875, "learning_rate": 9.999999699578e-07, "loss": 0.0001, "reward": 2.033656358718872, "reward_std": 0.09668964147567749, "rewards/accuracy_reward": 0.8774062991142273, "rewards/format_reward": 1.0, "step": 8 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.21875, "epoch": 0.00012413621881077503, "grad_norm": 2.2445824718340517, "kl": 0.001953125, "learning_rate": 9.99999961977841e-07, "loss": 0.0001, "reward": 2.036250114440918, "reward_std": 0.10071505606174469, "rewards/accuracy_reward": 0.8487499356269836, "rewards/format_reward": 1.0, "step": 9 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 393.15625, "epoch": 0.00013792913201197225, "grad_norm": 2.0067474304549084, "kl": 0.0026397705078125, "learning_rate": 9.99999953059063e-07, "loss": 0.0001, "reward": 1.9812188148498535, "reward_std": 0.16516642272472382, "rewards/accuracy_reward": 0.8124687671661377, "rewards/format_reward": 1.0, "step": 10 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 0.00015172204521316947, "grad_norm": 1.8764629345450639, "kl": 0.0032501220703125, "learning_rate": 9.999999432014664e-07, "loss": 0.0001, "reward": 1.925624966621399, "reward_std": 0.20176257193088531, "rewards/accuracy_reward": 0.7693750262260437, "rewards/format_reward": 1.0, "step": 11 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 434.90625, "epoch": 0.0001655149584143667, "grad_norm": 1.9816398853663497, "kl": 0.0036163330078125, "learning_rate": 9.999999324050513e-07, "loss": 0.0001, "reward": 1.9718124866485596, "reward_std": 0.15928256511688232, "rewards/accuracy_reward": 0.8155624270439148, "rewards/format_reward": 1.0, "step": 12 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 448.9375, "epoch": 0.00017930787161556393, "grad_norm": 2.235913774661294, "kl": 0.00537109375, "learning_rate": 9.999999206698174e-07, "loss": 0.0002, "reward": 1.8944687843322754, "reward_std": 0.13046111166477203, "rewards/accuracy_reward": 0.7444687485694885, "rewards/format_reward": 1.0, "step": 13 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 449.0625, "epoch": 0.00019310078481676114, "grad_norm": 2.1547272625309453, "kl": 0.00537109375, "learning_rate": 9.99999907995765e-07, "loss": 0.0002, "reward": 1.9081875085830688, "reward_std": 0.17851057648658752, "rewards/accuracy_reward": 0.7456874847412109, "rewards/format_reward": 1.0, "step": 14 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 454.03125, "epoch": 0.00020689369801795836, "grad_norm": 1.8688326807239954, "kl": 0.00531005859375, "learning_rate": 9.99999894382894e-07, "loss": 0.0002, "reward": 1.847000002861023, "reward_std": 0.29689547419548035, "rewards/accuracy_reward": 0.7095000147819519, "rewards/format_reward": 0.96875, "step": 15 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 448.90625, "epoch": 0.0002206866112191556, "grad_norm": 1.877697097527582, "kl": 0.006103515625, "learning_rate": 9.999998798312043e-07, "loss": 0.0002, "reward": 1.8849375247955322, "reward_std": 0.18553191423416138, "rewards/accuracy_reward": 0.7224375605583191, "rewards/format_reward": 1.0, "step": 16 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 446.53125, "epoch": 0.00023447952442035282, "grad_norm": 2.948823192525603, "kl": 0.006011962890625, "learning_rate": 9.999998643406962e-07, "loss": 0.0002, "reward": 1.9022188186645508, "reward_std": 0.1289326548576355, "rewards/accuracy_reward": 0.7147186994552612, "rewards/format_reward": 1.0, "step": 17 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 449.9375, "epoch": 0.00024827243762155007, "grad_norm": 2.3773367391075633, "kl": 0.006011962890625, "learning_rate": 9.999998479113697e-07, "loss": 0.0002, "reward": 1.9257500171661377, "reward_std": 0.17228159308433533, "rewards/accuracy_reward": 0.750749945640564, "rewards/format_reward": 1.0, "step": 18 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 449.6875, "epoch": 0.0002620653508227473, "grad_norm": 1.9233344476825995, "kl": 0.005584716796875, "learning_rate": 9.999998305432246e-07, "loss": 0.0002, "reward": 1.9933750629425049, "reward_std": 0.17885765433311462, "rewards/accuracy_reward": 0.8246250152587891, "rewards/format_reward": 1.0, "step": 19 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 438.75, "epoch": 0.0002758582640239445, "grad_norm": 2.0098045856353592, "kl": 0.00653076171875, "learning_rate": 9.999998122362611e-07, "loss": 0.0003, "reward": 1.9439375400543213, "reward_std": 0.1554916799068451, "rewards/accuracy_reward": 0.7814374566078186, "rewards/format_reward": 1.0, "step": 20 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 424.75, "epoch": 0.0002896511772251417, "grad_norm": 2.0840967435424096, "kl": 0.007080078125, "learning_rate": 9.999997929904793e-07, "loss": 0.0003, "reward": 1.913562536239624, "reward_std": 0.21141326427459717, "rewards/accuracy_reward": 0.7760624885559082, "rewards/format_reward": 0.96875, "step": 21 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.46875, "epoch": 0.00030344409042633893, "grad_norm": 2.441428436063154, "kl": 0.00701904296875, "learning_rate": 9.99999772805879e-07, "loss": 0.0003, "reward": 2.0033750534057617, "reward_std": 0.09448859095573425, "rewards/accuracy_reward": 0.8408750295639038, "rewards/format_reward": 1.0, "step": 22 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 439.75, "epoch": 0.00031723700362753615, "grad_norm": 3.8166061083190286, "kl": 0.00726318359375, "learning_rate": 9.999997516824604e-07, "loss": 0.0003, "reward": 1.8932501077651978, "reward_std": 0.18665450811386108, "rewards/accuracy_reward": 0.7120000123977661, "rewards/format_reward": 1.0, "step": 23 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.625, "epoch": 0.0003310299168287334, "grad_norm": 2.1091114967614555, "kl": 0.006134033203125, "learning_rate": 9.999997296202233e-07, "loss": 0.0002, "reward": 1.9282188415527344, "reward_std": 0.20147812366485596, "rewards/accuracy_reward": 0.7782187461853027, "rewards/format_reward": 1.0, "step": 24 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.9375, "epoch": 0.00034482283002993064, "grad_norm": 1.8649375597170077, "kl": 0.005859375, "learning_rate": 9.999997066191683e-07, "loss": 0.0002, "reward": 2.001406192779541, "reward_std": 0.1283920705318451, "rewards/accuracy_reward": 0.8264062404632568, "rewards/format_reward": 1.0, "step": 25 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.00035861574323112785, "grad_norm": 2.0908467800291706, "kl": 0.00714111328125, "learning_rate": 9.999996826792949e-07, "loss": 0.0003, "reward": 1.9485938549041748, "reward_std": 0.15018907189369202, "rewards/accuracy_reward": 0.7735937833786011, "rewards/format_reward": 1.0, "step": 26 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 437.75, "epoch": 0.00037240865643232507, "grad_norm": 2.1459856824688344, "kl": 0.00726318359375, "learning_rate": 9.999996578006036e-07, "loss": 0.0003, "reward": 1.926218867301941, "reward_std": 0.13377031683921814, "rewards/accuracy_reward": 0.7824687957763672, "rewards/format_reward": 1.0, "step": 27 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.46875, "epoch": 0.0003862015696335223, "grad_norm": 1.660251827892846, "kl": 0.00738525390625, "learning_rate": 9.99999631983094e-07, "loss": 0.0003, "reward": 2.0258748531341553, "reward_std": 0.09291009604930878, "rewards/accuracy_reward": 0.8383749723434448, "rewards/format_reward": 1.0, "step": 28 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 428.3125, "epoch": 0.0003999944828347195, "grad_norm": 2.4442043129587425, "kl": 0.00701904296875, "learning_rate": 9.999996052267663e-07, "loss": 0.0003, "reward": 1.8992187976837158, "reward_std": 0.15343788266181946, "rewards/accuracy_reward": 0.7242187261581421, "rewards/format_reward": 1.0, "step": 29 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 408.46875, "epoch": 0.0004137873960359167, "grad_norm": 1.941546303448589, "kl": 0.006744384765625, "learning_rate": 9.999995775316205e-07, "loss": 0.0003, "reward": 1.9279687404632568, "reward_std": 0.23424255847930908, "rewards/accuracy_reward": 0.8092187643051147, "rewards/format_reward": 0.96875, "step": 30 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.15625, "epoch": 0.000427580309237114, "grad_norm": 1.939380030838396, "kl": 0.007568359375, "learning_rate": 9.99999548897657e-07, "loss": 0.0003, "reward": 1.9720938205718994, "reward_std": 0.1112823337316513, "rewards/accuracy_reward": 0.7908437252044678, "rewards/format_reward": 1.0, "step": 31 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.6875, "epoch": 0.0004413732224383112, "grad_norm": 1.8102874359675372, "kl": 0.006561279296875, "learning_rate": 9.999995193248753e-07, "loss": 0.0003, "reward": 1.8945938348770142, "reward_std": 0.20895081758499146, "rewards/accuracy_reward": 0.7320938110351562, "rewards/format_reward": 1.0, "step": 32 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.28125, "epoch": 0.0004551661356395084, "grad_norm": 2.440861370440923, "kl": 0.0078125, "learning_rate": 9.99999488813276e-07, "loss": 0.0003, "reward": 1.9929375648498535, "reward_std": 0.14452257752418518, "rewards/accuracy_reward": 0.7991874814033508, "rewards/format_reward": 1.0, "step": 33 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 431.96875, "epoch": 0.00046895904884070564, "grad_norm": 2.634597469540262, "kl": 0.00665283203125, "learning_rate": 9.999994573628587e-07, "loss": 0.0003, "reward": 2.0103750228881836, "reward_std": 0.1038256287574768, "rewards/accuracy_reward": 0.8353750109672546, "rewards/format_reward": 1.0, "step": 34 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.625, "epoch": 0.00048275196204190286, "grad_norm": 1.9498064618486233, "kl": 0.00811767578125, "learning_rate": 9.99999424973624e-07, "loss": 0.0003, "reward": 1.9167187213897705, "reward_std": 0.13575075566768646, "rewards/accuracy_reward": 0.7292187809944153, "rewards/format_reward": 1.0, "step": 35 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.59375, "epoch": 0.0004965448752431001, "grad_norm": 1.97400223525883, "kl": 0.00732421875, "learning_rate": 9.999993916455714e-07, "loss": 0.0003, "reward": 1.9462811946868896, "reward_std": 0.21417558193206787, "rewards/accuracy_reward": 0.7712812423706055, "rewards/format_reward": 1.0, "step": 36 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 412.4375, "epoch": 0.0005103377884442973, "grad_norm": 1.96676548570803, "kl": 0.007598876953125, "learning_rate": 9.99999357378701e-07, "loss": 0.0003, "reward": 1.9428437948226929, "reward_std": 0.1451302170753479, "rewards/accuracy_reward": 0.7553436756134033, "rewards/format_reward": 1.0, "step": 37 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.0005241307016454946, "grad_norm": 1.8500098748913225, "kl": 0.0087890625, "learning_rate": 9.999993221730135e-07, "loss": 0.0004, "reward": 1.9378750324249268, "reward_std": 0.099956214427948, "rewards/accuracy_reward": 0.750374972820282, "rewards/format_reward": 1.0, "step": 38 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 418.1875, "epoch": 0.0005379236148466917, "grad_norm": 2.228539734236883, "kl": 0.007568359375, "learning_rate": 9.99999286028508e-07, "loss": 0.0003, "reward": 1.815812587738037, "reward_std": 0.11345121264457703, "rewards/accuracy_reward": 0.6470625400543213, "rewards/format_reward": 1.0, "step": 39 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 389.71875, "epoch": 0.000551716528047889, "grad_norm": 2.050035893914353, "kl": 0.007354736328125, "learning_rate": 9.999992489451854e-07, "loss": 0.0003, "reward": 1.8904376029968262, "reward_std": 0.15531973540782928, "rewards/accuracy_reward": 0.7216874957084656, "rewards/format_reward": 1.0, "step": 40 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.0, "epoch": 0.0005655094412490863, "grad_norm": 2.2963652984261387, "kl": 0.00872802734375, "learning_rate": 9.999992109230455e-07, "loss": 0.0003, "reward": 1.9235312938690186, "reward_std": 0.10360515117645264, "rewards/accuracy_reward": 0.7297812104225159, "rewards/format_reward": 1.0, "step": 41 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.5, "epoch": 0.0005793023544502834, "grad_norm": 1.9915290036304918, "kl": 0.00872802734375, "learning_rate": 9.999991719620882e-07, "loss": 0.0003, "reward": 2.003124952316284, "reward_std": 0.14130809903144836, "rewards/accuracy_reward": 0.8218749761581421, "rewards/format_reward": 1.0, "step": 42 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 380.0, "epoch": 0.0005930952676514807, "grad_norm": 2.0683840656172783, "kl": 0.0103759765625, "learning_rate": 9.999991320623139e-07, "loss": 0.0004, "reward": 2.047468900680542, "reward_std": 0.10697433352470398, "rewards/accuracy_reward": 0.8662187457084656, "rewards/format_reward": 1.0, "step": 43 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.1875, "epoch": 0.0006068881808526779, "grad_norm": 2.794135333481504, "kl": 0.00933837890625, "learning_rate": 9.999990912237223e-07, "loss": 0.0004, "reward": 1.9695312976837158, "reward_std": 0.1284584403038025, "rewards/accuracy_reward": 0.76953125, "rewards/format_reward": 1.0, "step": 44 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.0, "epoch": 0.0006206810940538751, "grad_norm": 3.3140997714530607, "kl": 0.00872802734375, "learning_rate": 9.999990494463137e-07, "loss": 0.0003, "reward": 2.109499931335449, "reward_std": 0.08382292836904526, "rewards/accuracy_reward": 0.934499979019165, "rewards/format_reward": 1.0, "step": 45 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.03125, "epoch": 0.0006344740072550723, "grad_norm": 1.9271837639283826, "kl": 0.00933837890625, "learning_rate": 9.999990067300881e-07, "loss": 0.0004, "reward": 1.7727187871932983, "reward_std": 0.15261155366897583, "rewards/accuracy_reward": 0.5914687514305115, "rewards/format_reward": 1.0, "step": 46 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.6875, "epoch": 0.0006482669204562696, "grad_norm": 1.9746881047259788, "kl": 0.00860595703125, "learning_rate": 9.999989630750455e-07, "loss": 0.0003, "reward": 1.9672812223434448, "reward_std": 0.1265501081943512, "rewards/accuracy_reward": 0.7860312461853027, "rewards/format_reward": 1.0, "step": 47 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.125, "epoch": 0.0006620598336574668, "grad_norm": 2.16103713753327, "kl": 0.0093994140625, "learning_rate": 9.999989184811863e-07, "loss": 0.0004, "reward": 1.9534375667572021, "reward_std": 0.14682571589946747, "rewards/accuracy_reward": 0.7721874713897705, "rewards/format_reward": 1.0, "step": 48 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.28125, "epoch": 0.000675852746858664, "grad_norm": 4.506059479243003, "kl": 0.009765625, "learning_rate": 9.999988729485103e-07, "loss": 0.0004, "reward": 2.007718801498413, "reward_std": 0.10468215495347977, "rewards/accuracy_reward": 0.8202187418937683, "rewards/format_reward": 1.0, "step": 49 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.1875, "epoch": 0.0006896456600598613, "grad_norm": 2.7545928479935124, "kl": 0.00830078125, "learning_rate": 9.999988264770175e-07, "loss": 0.0003, "reward": 1.9384374618530273, "reward_std": 0.15367849171161652, "rewards/accuracy_reward": 0.7696875333786011, "rewards/format_reward": 1.0, "step": 50 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.0625, "epoch": 0.0007034385732610584, "grad_norm": 2.06060525063386, "kl": 0.0115966796875, "learning_rate": 9.999987790667086e-07, "loss": 0.0005, "reward": 1.9795312881469727, "reward_std": 0.1289536952972412, "rewards/accuracy_reward": 0.8295312523841858, "rewards/format_reward": 1.0, "step": 51 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 391.46875, "epoch": 0.0007172314864622557, "grad_norm": 2.201080580950615, "kl": 0.01141357421875, "learning_rate": 9.999987307175828e-07, "loss": 0.0005, "reward": 2.025031328201294, "reward_std": 0.12147818505764008, "rewards/accuracy_reward": 0.8375312685966492, "rewards/format_reward": 1.0, "step": 52 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.0007310243996634529, "grad_norm": 2.789599554476296, "kl": 0.01031494140625, "learning_rate": 9.999986814296409e-07, "loss": 0.0004, "reward": 2.0395936965942383, "reward_std": 0.12432658672332764, "rewards/accuracy_reward": 0.8645938038825989, "rewards/format_reward": 1.0, "step": 53 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.0007448173128646501, "grad_norm": 2.0780164567674855, "kl": 0.0103759765625, "learning_rate": 9.999986312028826e-07, "loss": 0.0004, "reward": 1.9655625820159912, "reward_std": 0.09623858332633972, "rewards/accuracy_reward": 0.7718124985694885, "rewards/format_reward": 1.0, "step": 54 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.0625, "epoch": 0.0007586102260658474, "grad_norm": 2.034057056271373, "kl": 0.0113525390625, "learning_rate": 9.99998580037308e-07, "loss": 0.0005, "reward": 2.0223751068115234, "reward_std": 0.10908350348472595, "rewards/accuracy_reward": 0.8348749876022339, "rewards/format_reward": 1.0, "step": 55 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.09375, "epoch": 0.0007724031392670446, "grad_norm": 3.5188726986869936, "kl": 0.01123046875, "learning_rate": 9.999985279329174e-07, "loss": 0.0004, "reward": 1.953125, "reward_std": 0.1149853765964508, "rewards/accuracy_reward": 0.7593749761581421, "rewards/format_reward": 1.0, "step": 56 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.28125, "epoch": 0.0007861960524682418, "grad_norm": 2.234293267617364, "kl": 0.0118408203125, "learning_rate": 9.999984748897108e-07, "loss": 0.0005, "reward": 1.9143438339233398, "reward_std": 0.12217641621828079, "rewards/accuracy_reward": 0.7205937504768372, "rewards/format_reward": 1.0, "step": 57 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.65625, "epoch": 0.000799988965669439, "grad_norm": 1.9985754265007123, "kl": 0.0120849609375, "learning_rate": 9.999984209076883e-07, "loss": 0.0005, "reward": 2.0999374389648438, "reward_std": 0.06573973596096039, "rewards/accuracy_reward": 0.9061875343322754, "rewards/format_reward": 1.0, "step": 58 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.125, "epoch": 0.0008137818788706363, "grad_norm": 2.51497917133664, "kl": 0.0128173828125, "learning_rate": 9.999983659868502e-07, "loss": 0.0005, "reward": 1.9672188758850098, "reward_std": 0.1114722415804863, "rewards/accuracy_reward": 0.7734687328338623, "rewards/format_reward": 1.0, "step": 59 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.3125, "epoch": 0.0008275747920718334, "grad_norm": 2.382568986637134, "kl": 0.01434326171875, "learning_rate": 9.999983101271964e-07, "loss": 0.0006, "reward": 1.9099063873291016, "reward_std": 0.11956294625997543, "rewards/accuracy_reward": 0.7161562442779541, "rewards/format_reward": 1.0, "step": 60 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 419.34375, "epoch": 0.0008413677052730307, "grad_norm": 2.3339976498187918, "kl": 0.01361083984375, "learning_rate": 9.999982533287268e-07, "loss": 0.0005, "reward": 1.914156436920166, "reward_std": 0.11630599945783615, "rewards/accuracy_reward": 0.7204062342643738, "rewards/format_reward": 1.0, "step": 61 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 414.1875, "epoch": 0.000855160618474228, "grad_norm": 2.0397915840923666, "kl": 0.0125732421875, "learning_rate": 9.999981955914419e-07, "loss": 0.0005, "reward": 2.0025625228881836, "reward_std": 0.09353870898485184, "rewards/accuracy_reward": 0.8025624752044678, "rewards/format_reward": 1.0, "step": 62 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.6875, "epoch": 0.0008689535316754251, "grad_norm": 2.7440557394184113, "kl": 0.0137939453125, "learning_rate": 9.999981369153415e-07, "loss": 0.0006, "reward": 1.9893436431884766, "reward_std": 0.10104025900363922, "rewards/accuracy_reward": 0.8018437623977661, "rewards/format_reward": 1.0, "step": 63 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.34375, "epoch": 0.0008827464448766224, "grad_norm": 3.187129970062904, "kl": 0.0137939453125, "learning_rate": 9.99998077300426e-07, "loss": 0.0006, "reward": 2.0980000495910645, "reward_std": 0.08940361440181732, "rewards/accuracy_reward": 0.9104999899864197, "rewards/format_reward": 1.0, "step": 64 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.90625, "epoch": 0.0008965393580778196, "grad_norm": 2.2765766881694027, "kl": 0.01507568359375, "learning_rate": 9.99998016746695e-07, "loss": 0.0006, "reward": 2.0280938148498535, "reward_std": 0.12347737699747086, "rewards/accuracy_reward": 0.8593437671661377, "rewards/format_reward": 1.0, "step": 65 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.03125, "epoch": 0.0009103322712790169, "grad_norm": 2.059881050081888, "kl": 0.01513671875, "learning_rate": 9.999979552541493e-07, "loss": 0.0006, "reward": 1.9948437213897705, "reward_std": 0.09006933122873306, "rewards/accuracy_reward": 0.8010937571525574, "rewards/format_reward": 1.0, "step": 66 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.78125, "epoch": 0.000924125184480214, "grad_norm": 1.9240285675429754, "kl": 0.0142822265625, "learning_rate": 9.999978928227889e-07, "loss": 0.0006, "reward": 2.0140626430511475, "reward_std": 0.08456151187419891, "rewards/accuracy_reward": 0.8265625238418579, "rewards/format_reward": 1.0, "step": 67 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.0009379180976814113, "grad_norm": 2.063638176454996, "kl": 0.015625, "learning_rate": 9.999978294526131e-07, "loss": 0.0006, "reward": 1.9393125772476196, "reward_std": 0.11400812119245529, "rewards/accuracy_reward": 0.770562469959259, "rewards/format_reward": 1.0, "step": 68 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.21875, "epoch": 0.0009517110108826086, "grad_norm": 2.364884988934533, "kl": 0.0167236328125, "learning_rate": 9.999977651436231e-07, "loss": 0.0007, "reward": 2.0544376373291016, "reward_std": 0.09280771017074585, "rewards/accuracy_reward": 0.866937518119812, "rewards/format_reward": 1.0, "step": 69 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.59375, "epoch": 0.0009655039240838057, "grad_norm": 1.9801870748243584, "kl": 0.017333984375, "learning_rate": 9.999976998958184e-07, "loss": 0.0007, "reward": 1.9036874771118164, "reward_std": 0.07547667622566223, "rewards/accuracy_reward": 0.7224375009536743, "rewards/format_reward": 1.0, "step": 70 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.84375, "epoch": 0.000979296837285003, "grad_norm": 2.2231607732222174, "kl": 0.0177001953125, "learning_rate": 9.999976337091992e-07, "loss": 0.0007, "reward": 1.9744688272476196, "reward_std": 0.11905109882354736, "rewards/accuracy_reward": 0.793218731880188, "rewards/format_reward": 1.0, "step": 71 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.03125, "epoch": 0.0009930897504862003, "grad_norm": 1.9324225599040201, "kl": 0.01806640625, "learning_rate": 9.999975665837658e-07, "loss": 0.0007, "reward": 1.916468858718872, "reward_std": 0.13691526651382446, "rewards/accuracy_reward": 0.7352187037467957, "rewards/format_reward": 1.0, "step": 72 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.65625, "epoch": 0.0010068826636873973, "grad_norm": 2.1967236353102644, "kl": 0.0189208984375, "learning_rate": 9.999974985195181e-07, "loss": 0.0008, "reward": 1.9445313215255737, "reward_std": 0.09076535701751709, "rewards/accuracy_reward": 0.7445312738418579, "rewards/format_reward": 1.0, "step": 73 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 386.53125, "epoch": 0.0010206755768885946, "grad_norm": 1.975824500724877, "kl": 0.0196533203125, "learning_rate": 9.999974295164566e-07, "loss": 0.0008, "reward": 1.9469687938690186, "reward_std": 0.12988199293613434, "rewards/accuracy_reward": 0.7844687700271606, "rewards/format_reward": 1.0, "step": 74 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.09375, "epoch": 0.0010344684900897919, "grad_norm": 2.0425062104358624, "kl": 0.0177001953125, "learning_rate": 9.999973595745808e-07, "loss": 0.0007, "reward": 2.0570626258850098, "reward_std": 0.10216653347015381, "rewards/accuracy_reward": 0.8633125424385071, "rewards/format_reward": 1.0, "step": 75 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.21875, "epoch": 0.0010482614032909891, "grad_norm": 2.48173438610056, "kl": 0.0191650390625, "learning_rate": 9.999972886938914e-07, "loss": 0.0008, "reward": 2.0001564025878906, "reward_std": 0.11471109092235565, "rewards/accuracy_reward": 0.8064062595367432, "rewards/format_reward": 1.0, "step": 76 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.0625, "epoch": 0.0010620543164921864, "grad_norm": 2.2211721391522454, "kl": 0.018798828125, "learning_rate": 9.999972168743885e-07, "loss": 0.0008, "reward": 1.893031358718872, "reward_std": 0.07950831949710846, "rewards/accuracy_reward": 0.6992812156677246, "rewards/format_reward": 1.0, "step": 77 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.59375, "epoch": 0.0010758472296933835, "grad_norm": 6.5477114597357335, "kl": 0.0205078125, "learning_rate": 9.999971441160716e-07, "loss": 0.0008, "reward": 1.9737812280654907, "reward_std": 0.07410921156406403, "rewards/accuracy_reward": 0.7800312638282776, "rewards/format_reward": 1.0, "step": 78 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.46875, "epoch": 0.0010896401428945807, "grad_norm": 2.0330019258334375, "kl": 0.0194091796875, "learning_rate": 9.999970704189418e-07, "loss": 0.0008, "reward": 2.061781167984009, "reward_std": 0.10087407380342484, "rewards/accuracy_reward": 0.8805313110351562, "rewards/format_reward": 1.0, "step": 79 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.3125, "epoch": 0.001103433056095778, "grad_norm": 2.607390388875823, "kl": 0.0216064453125, "learning_rate": 9.999969957829985e-07, "loss": 0.0009, "reward": 1.98046875, "reward_std": 0.09627526253461838, "rewards/accuracy_reward": 0.7992187738418579, "rewards/format_reward": 1.0, "step": 80 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 373.4375, "epoch": 0.0011172259692969753, "grad_norm": 1.876950641564629, "kl": 0.02001953125, "learning_rate": 9.999969202082422e-07, "loss": 0.0008, "reward": 1.9832813739776611, "reward_std": 0.1238895058631897, "rewards/accuracy_reward": 0.8020312190055847, "rewards/format_reward": 1.0, "step": 81 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 408.09375, "epoch": 0.0011310188824981725, "grad_norm": 2.0206454330604857, "kl": 0.020751953125, "learning_rate": 9.99996843694673e-07, "loss": 0.0008, "reward": 1.8744688034057617, "reward_std": 0.12994661927223206, "rewards/accuracy_reward": 0.6807186603546143, "rewards/format_reward": 1.0, "step": 82 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 418.0625, "epoch": 0.0011448117956993696, "grad_norm": 3.293451538465561, "kl": 0.0186767578125, "learning_rate": 9.999967662422907e-07, "loss": 0.0007, "reward": 1.883750081062317, "reward_std": 0.1281936764717102, "rewards/accuracy_reward": 0.7024999856948853, "rewards/format_reward": 1.0, "step": 83 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 434.375, "epoch": 0.0011586047089005669, "grad_norm": 2.8365982056535293, "kl": 0.020263671875, "learning_rate": 9.999966878510959e-07, "loss": 0.0008, "reward": 1.9679062366485596, "reward_std": 0.10362934321165085, "rewards/accuracy_reward": 0.7804062366485596, "rewards/format_reward": 1.0, "step": 84 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 438.25, "epoch": 0.0011723976221017641, "grad_norm": 1.903995909966491, "kl": 0.02099609375, "learning_rate": 9.999966085210885e-07, "loss": 0.0008, "reward": 1.9658124446868896, "reward_std": 0.10568737983703613, "rewards/accuracy_reward": 0.7908124923706055, "rewards/format_reward": 1.0, "step": 85 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.5625, "epoch": 0.0011861905353029614, "grad_norm": 1.914680102222218, "kl": 0.019287109375, "learning_rate": 9.999965282522687e-07, "loss": 0.0008, "reward": 1.9978125095367432, "reward_std": 0.10486941784620285, "rewards/accuracy_reward": 0.8103125095367432, "rewards/format_reward": 1.0, "step": 86 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 437.90625, "epoch": 0.0011999834485041585, "grad_norm": 2.2162677073510375, "kl": 0.0224609375, "learning_rate": 9.999964470446366e-07, "loss": 0.0009, "reward": 2.030156135559082, "reward_std": 0.1169753447175026, "rewards/accuracy_reward": 0.8426562547683716, "rewards/format_reward": 1.0, "step": 87 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.78125, "epoch": 0.0012137763617053557, "grad_norm": 1.7977164346728522, "kl": 0.0211181640625, "learning_rate": 9.999963648981926e-07, "loss": 0.0008, "reward": 2.1032187938690186, "reward_std": 0.08337854593992233, "rewards/accuracy_reward": 0.9157187342643738, "rewards/format_reward": 1.0, "step": 88 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 432.84375, "epoch": 0.001227569274906553, "grad_norm": 1.9561962964755193, "kl": 0.022216796875, "learning_rate": 9.999962818129368e-07, "loss": 0.0009, "reward": 1.8780937194824219, "reward_std": 0.10173892974853516, "rewards/accuracy_reward": 0.7030937075614929, "rewards/format_reward": 1.0, "step": 89 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.4375, "epoch": 0.0012413621881077503, "grad_norm": 2.1335825875575956, "kl": 0.020263671875, "learning_rate": 9.99996197788869e-07, "loss": 0.0008, "reward": 2.015125036239624, "reward_std": 0.11473339051008224, "rewards/accuracy_reward": 0.8338750004768372, "rewards/format_reward": 1.0, "step": 90 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.125, "epoch": 0.0012551551013089475, "grad_norm": 2.0916217890362807, "kl": 0.02099609375, "learning_rate": 9.999961128259895e-07, "loss": 0.0008, "reward": 2.062406063079834, "reward_std": 0.10356153547763824, "rewards/accuracy_reward": 0.8811562061309814, "rewards/format_reward": 1.0, "step": 91 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.84375, "epoch": 0.0012689480145101446, "grad_norm": 1.87796792518994, "kl": 0.0198974609375, "learning_rate": 9.999960269242986e-07, "loss": 0.0008, "reward": 2.0895938873291016, "reward_std": 0.08552618324756622, "rewards/accuracy_reward": 0.889593780040741, "rewards/format_reward": 1.0, "step": 92 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.125, "epoch": 0.0012827409277113419, "grad_norm": 2.092998703269986, "kl": 0.0216064453125, "learning_rate": 9.999959400837966e-07, "loss": 0.0009, "reward": 2.058500051498413, "reward_std": 0.0773710161447525, "rewards/accuracy_reward": 0.8772499561309814, "rewards/format_reward": 1.0, "step": 93 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.9375, "epoch": 0.0012965338409125391, "grad_norm": 2.1335825161203554, "kl": 0.02001953125, "learning_rate": 9.999958523044833e-07, "loss": 0.0008, "reward": 2.0511250495910645, "reward_std": 0.08750750869512558, "rewards/accuracy_reward": 0.8636249303817749, "rewards/format_reward": 1.0, "step": 94 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.25, "epoch": 0.0013103267541137364, "grad_norm": 3.2133359318335737, "kl": 0.0250244140625, "learning_rate": 9.999957635863591e-07, "loss": 0.001, "reward": 1.8879687786102295, "reward_std": 0.10368669033050537, "rewards/accuracy_reward": 0.7067187428474426, "rewards/format_reward": 1.0, "step": 95 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.0625, "epoch": 0.0013241196673149337, "grad_norm": 1.9467134814363738, "kl": 0.0194091796875, "learning_rate": 9.99995673929424e-07, "loss": 0.0008, "reward": 1.98046875, "reward_std": 0.12112919986248016, "rewards/accuracy_reward": 0.7992187142372131, "rewards/format_reward": 1.0, "step": 96 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.125, "epoch": 0.0013379125805161307, "grad_norm": 1.97315248003581, "kl": 0.0198974609375, "learning_rate": 9.999955833336784e-07, "loss": 0.0008, "reward": 2.066281318664551, "reward_std": 0.09585392475128174, "rewards/accuracy_reward": 0.8850312829017639, "rewards/format_reward": 1.0, "step": 97 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.21875, "epoch": 0.001351705493717328, "grad_norm": 1.90936454331302, "kl": 0.019287109375, "learning_rate": 9.999954917991222e-07, "loss": 0.0008, "reward": 2.0439374446868896, "reward_std": 0.08787743002176285, "rewards/accuracy_reward": 0.8626875281333923, "rewards/format_reward": 1.0, "step": 98 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.875, "epoch": 0.0013654984069185253, "grad_norm": 2.3092667844613226, "kl": 0.023193359375, "learning_rate": 9.999953993257558e-07, "loss": 0.0009, "reward": 2.052999973297119, "reward_std": 0.09241437166929245, "rewards/accuracy_reward": 0.8530000448226929, "rewards/format_reward": 1.0, "step": 99 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.0625, "epoch": 0.0013792913201197226, "grad_norm": 2.245471536346899, "kl": 0.022705078125, "learning_rate": 9.999953059135794e-07, "loss": 0.0009, "reward": 2.028250217437744, "reward_std": 0.10045875608921051, "rewards/accuracy_reward": 0.8470000624656677, "rewards/format_reward": 1.0, "step": 100 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.1875, "epoch": 0.0013930842333209196, "grad_norm": 3.7452978064396247, "kl": 0.018310546875, "learning_rate": 9.99995211562593e-07, "loss": 0.0007, "reward": 2.0342187881469727, "reward_std": 0.07907479256391525, "rewards/accuracy_reward": 0.8404687643051147, "rewards/format_reward": 1.0, "step": 101 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.25, "epoch": 0.0014068771465221169, "grad_norm": 3.3520192881557844, "kl": 0.0189208984375, "learning_rate": 9.999951162727967e-07, "loss": 0.0008, "reward": 1.8736562728881836, "reward_std": 0.10261452943086624, "rewards/accuracy_reward": 0.6861562728881836, "rewards/format_reward": 1.0, "step": 102 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 432.0625, "epoch": 0.0014206700597233141, "grad_norm": 2.6809604380851657, "kl": 0.019287109375, "learning_rate": 9.999950200441909e-07, "loss": 0.0008, "reward": 1.965343713760376, "reward_std": 0.09344343841075897, "rewards/accuracy_reward": 0.7715936899185181, "rewards/format_reward": 1.0, "step": 103 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.59375, "epoch": 0.0014344629729245114, "grad_norm": 2.0950174630499228, "kl": 0.0205078125, "learning_rate": 9.999949228767756e-07, "loss": 0.0008, "reward": 2.0600624084472656, "reward_std": 0.07353753596544266, "rewards/accuracy_reward": 0.8850624561309814, "rewards/format_reward": 1.0, "step": 104 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.15625, "epoch": 0.0014482558861257087, "grad_norm": 1.9462977496641147, "kl": 0.0198974609375, "learning_rate": 9.999948247705514e-07, "loss": 0.0008, "reward": 2.112968683242798, "reward_std": 0.06496657431125641, "rewards/accuracy_reward": 0.9192187786102295, "rewards/format_reward": 1.0, "step": 105 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 421.34375, "epoch": 0.0014620487993269057, "grad_norm": 2.2271402027862055, "kl": 0.0191650390625, "learning_rate": 9.99994725725518e-07, "loss": 0.0008, "reward": 2.018531322479248, "reward_std": 0.10074889659881592, "rewards/accuracy_reward": 0.8310312032699585, "rewards/format_reward": 1.0, "step": 106 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 445.71875, "epoch": 0.001475841712528103, "grad_norm": 1.845138686309047, "kl": 0.017333984375, "learning_rate": 9.999946257416756e-07, "loss": 0.0007, "reward": 1.9974374771118164, "reward_std": 0.09323931485414505, "rewards/accuracy_reward": 0.8099374771118164, "rewards/format_reward": 1.0, "step": 107 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.96875, "epoch": 0.0014896346257293003, "grad_norm": 2.2346418957871577, "kl": 0.019287109375, "learning_rate": 9.999945248190245e-07, "loss": 0.0008, "reward": 2.0425000190734863, "reward_std": 0.07148788869380951, "rewards/accuracy_reward": 0.8487499952316284, "rewards/format_reward": 1.0, "step": 108 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 434.1875, "epoch": 0.0015034275389304976, "grad_norm": 2.7485539956635097, "kl": 0.01904296875, "learning_rate": 9.99994422957565e-07, "loss": 0.0008, "reward": 1.946812629699707, "reward_std": 0.11658565700054169, "rewards/accuracy_reward": 0.7780625224113464, "rewards/format_reward": 1.0, "step": 109 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.4375, "epoch": 0.0015172204521316948, "grad_norm": 1.6256790468322975, "kl": 0.0203857421875, "learning_rate": 9.999943201572973e-07, "loss": 0.0008, "reward": 1.9387812614440918, "reward_std": 0.07584629952907562, "rewards/accuracy_reward": 0.751281201839447, "rewards/format_reward": 1.0, "step": 110 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 426.9375, "epoch": 0.0015310133653328919, "grad_norm": 2.267961508359044, "kl": 0.021484375, "learning_rate": 9.999942164182216e-07, "loss": 0.0009, "reward": 1.9941250085830688, "reward_std": 0.09937582910060883, "rewards/accuracy_reward": 0.8128750324249268, "rewards/format_reward": 1.0, "step": 111 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 412.9375, "epoch": 0.0015448062785340892, "grad_norm": 2.0581790967612847, "kl": 0.0196533203125, "learning_rate": 9.999941117403378e-07, "loss": 0.0008, "reward": 2.0332813262939453, "reward_std": 0.17447586357593536, "rewards/accuracy_reward": 0.8520312309265137, "rewards/format_reward": 1.0, "step": 112 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 445.21875, "epoch": 0.0015585991917352864, "grad_norm": 5.766110130139589, "kl": 0.018310546875, "learning_rate": 9.999940061236465e-07, "loss": 0.0007, "reward": 2.015812397003174, "reward_std": 0.10099396109580994, "rewards/accuracy_reward": 0.8408124446868896, "rewards/format_reward": 1.0, "step": 113 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.78125, "epoch": 0.0015723921049364837, "grad_norm": 2.028823496278505, "kl": 0.0198974609375, "learning_rate": 9.999938995681475e-07, "loss": 0.0008, "reward": 2.068406343460083, "reward_std": 0.07293745130300522, "rewards/accuracy_reward": 0.8809062242507935, "rewards/format_reward": 1.0, "step": 114 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.25, "epoch": 0.001586185018137681, "grad_norm": 2.262979988843303, "kl": 0.023193359375, "learning_rate": 9.999937920738415e-07, "loss": 0.0009, "reward": 2.0093436241149902, "reward_std": 0.08357476443052292, "rewards/accuracy_reward": 0.8280937671661377, "rewards/format_reward": 1.0, "step": 115 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.1875, "epoch": 0.001599977931338878, "grad_norm": 2.2396106340034927, "kl": 0.021484375, "learning_rate": 9.999936836407281e-07, "loss": 0.0009, "reward": 1.9998438358306885, "reward_std": 0.06379185616970062, "rewards/accuracy_reward": 0.8060937523841858, "rewards/format_reward": 1.0, "step": 116 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.3125, "epoch": 0.0016137708445400753, "grad_norm": 2.0190076027293506, "kl": 0.02197265625, "learning_rate": 9.99993574268808e-07, "loss": 0.0009, "reward": 2.029125213623047, "reward_std": 0.11474525928497314, "rewards/accuracy_reward": 0.8478749990463257, "rewards/format_reward": 1.0, "step": 117 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.875, "epoch": 0.0016275637577412726, "grad_norm": 2.984145460599882, "kl": 0.02294921875, "learning_rate": 9.99993463958081e-07, "loss": 0.0009, "reward": 1.9973125457763672, "reward_std": 0.11626149713993073, "rewards/accuracy_reward": 0.8098124861717224, "rewards/format_reward": 1.0, "step": 118 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.125, "epoch": 0.0016413566709424698, "grad_norm": 2.227864110534936, "kl": 0.025634765625, "learning_rate": 9.999933527085475e-07, "loss": 0.001, "reward": 1.988968849182129, "reward_std": 0.0794340968132019, "rewards/accuracy_reward": 0.8139687776565552, "rewards/format_reward": 1.0, "step": 119 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 428.9375, "epoch": 0.0016551495841436669, "grad_norm": 2.1287258166421252, "kl": 0.01904296875, "learning_rate": 9.99993240520208e-07, "loss": 0.0008, "reward": 2.0016250610351562, "reward_std": 0.10000091791152954, "rewards/accuracy_reward": 0.8266250491142273, "rewards/format_reward": 1.0, "step": 120 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 428.8125, "epoch": 0.0016689424973448642, "grad_norm": 2.2238376452996795, "kl": 0.0233154296875, "learning_rate": 9.999931273930624e-07, "loss": 0.0009, "reward": 2.0781874656677246, "reward_std": 0.066957026720047, "rewards/accuracy_reward": 0.8969375491142273, "rewards/format_reward": 1.0, "step": 121 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 415.75, "epoch": 0.0016827354105460614, "grad_norm": 5.494504448483442, "kl": 0.0244140625, "learning_rate": 9.99993013327111e-07, "loss": 0.001, "reward": 2.0068438053131104, "reward_std": 0.13612675666809082, "rewards/accuracy_reward": 0.8318437933921814, "rewards/format_reward": 1.0, "step": 122 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.125, "epoch": 0.0016965283237472587, "grad_norm": 1.9577923727826128, "kl": 0.0233154296875, "learning_rate": 9.999928983223537e-07, "loss": 0.0009, "reward": 2.02371883392334, "reward_std": 0.08236443251371384, "rewards/accuracy_reward": 0.8299687504768372, "rewards/format_reward": 1.0, "step": 123 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.3125, "epoch": 0.001710321236948456, "grad_norm": 1.9724912522052493, "kl": 0.0242919921875, "learning_rate": 9.99992782378791e-07, "loss": 0.001, "reward": 1.937093734741211, "reward_std": 0.07229560613632202, "rewards/accuracy_reward": 0.7558437585830688, "rewards/format_reward": 1.0, "step": 124 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.53125, "epoch": 0.001724114150149653, "grad_norm": 2.0354778997109513, "kl": 0.020263671875, "learning_rate": 9.999926654964233e-07, "loss": 0.0008, "reward": 2.110375165939331, "reward_std": 0.0669713169336319, "rewards/accuracy_reward": 0.9228750467300415, "rewards/format_reward": 1.0, "step": 125 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.0, "epoch": 0.0017379070633508503, "grad_norm": 1.902770269444391, "kl": 0.02490234375, "learning_rate": 9.999925476752503e-07, "loss": 0.001, "reward": 2.0635311603546143, "reward_std": 0.05498639866709709, "rewards/accuracy_reward": 0.8760312795639038, "rewards/format_reward": 1.0, "step": 126 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.75, "epoch": 0.0017516999765520476, "grad_norm": 1.8958305262248902, "kl": 0.0263671875, "learning_rate": 9.99992428915273e-07, "loss": 0.0011, "reward": 1.9792187213897705, "reward_std": 0.10229627043008804, "rewards/accuracy_reward": 0.7917187809944153, "rewards/format_reward": 1.0, "step": 127 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.59375, "epoch": 0.0017654928897532448, "grad_norm": 2.742346947447369, "kl": 0.0245361328125, "learning_rate": 9.999923092164907e-07, "loss": 0.001, "reward": 1.9277499914169312, "reward_std": 0.09760669618844986, "rewards/accuracy_reward": 0.7340000867843628, "rewards/format_reward": 1.0, "step": 128 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.625, "epoch": 0.0017792858029544421, "grad_norm": 2.1089655193726706, "kl": 0.0234375, "learning_rate": 9.999921885789044e-07, "loss": 0.0009, "reward": 2.0311875343322754, "reward_std": 0.04695471376180649, "rewards/accuracy_reward": 0.8311874866485596, "rewards/format_reward": 1.0, "step": 129 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.53125, "epoch": 0.0017930787161556392, "grad_norm": 2.1255558819924354, "kl": 0.021240234375, "learning_rate": 9.999920670025139e-07, "loss": 0.0008, "reward": 2.0605311393737793, "reward_std": 0.10061533004045486, "rewards/accuracy_reward": 0.8667812943458557, "rewards/format_reward": 1.0, "step": 130 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.65625, "epoch": 0.0018068716293568364, "grad_norm": 2.3191532175637786, "kl": 0.02587890625, "learning_rate": 9.999919444873198e-07, "loss": 0.001, "reward": 2.0655312538146973, "reward_std": 0.05533017963171005, "rewards/accuracy_reward": 0.8655312657356262, "rewards/format_reward": 1.0, "step": 131 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.6875, "epoch": 0.0018206645425580337, "grad_norm": 2.2065081261455477, "kl": 0.0247802734375, "learning_rate": 9.999918210333217e-07, "loss": 0.001, "reward": 1.9338126182556152, "reward_std": 0.06746178865432739, "rewards/accuracy_reward": 0.7338124513626099, "rewards/format_reward": 1.0, "step": 132 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.65625, "epoch": 0.001834457455759231, "grad_norm": 2.006884300591598, "kl": 0.025390625, "learning_rate": 9.999916966405205e-07, "loss": 0.001, "reward": 1.9874374866485596, "reward_std": 0.08689753711223602, "rewards/accuracy_reward": 0.7999374866485596, "rewards/format_reward": 1.0, "step": 133 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 412.15625, "epoch": 0.001848250368960428, "grad_norm": 2.352634186327791, "kl": 0.02587890625, "learning_rate": 9.999915713089159e-07, "loss": 0.001, "reward": 1.9622812271118164, "reward_std": 0.09301012754440308, "rewards/accuracy_reward": 0.7685312032699585, "rewards/format_reward": 1.0, "step": 134 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.0018620432821616253, "grad_norm": 2.062812479729722, "kl": 0.0230712890625, "learning_rate": 9.999914450385084e-07, "loss": 0.0009, "reward": 1.9440312385559082, "reward_std": 0.08110732585191727, "rewards/accuracy_reward": 0.7440312504768372, "rewards/format_reward": 1.0, "step": 135 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 421.25, "epoch": 0.0018758361953628226, "grad_norm": 2.490263680455753, "kl": 0.023681640625, "learning_rate": 9.999913178292983e-07, "loss": 0.0009, "reward": 1.92787504196167, "reward_std": 0.11337137222290039, "rewards/accuracy_reward": 0.7466249465942383, "rewards/format_reward": 1.0, "step": 136 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 421.875, "epoch": 0.0018896291085640198, "grad_norm": 2.067767356062963, "kl": 0.0264892578125, "learning_rate": 9.999911896812856e-07, "loss": 0.0011, "reward": 2.0640313625335693, "reward_std": 0.0756748840212822, "rewards/accuracy_reward": 0.8765312433242798, "rewards/format_reward": 1.0, "step": 137 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.34375, "epoch": 0.0019034220217652171, "grad_norm": 3.6049516017328935, "kl": 0.0245361328125, "learning_rate": 9.99991060594471e-07, "loss": 0.001, "reward": 1.9338126182556152, "reward_std": 0.09241359680891037, "rewards/accuracy_reward": 0.7400625348091125, "rewards/format_reward": 1.0, "step": 138 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.59375, "epoch": 0.0019172149349664142, "grad_norm": 3.1162742378502424, "kl": 0.027099609375, "learning_rate": 9.99990930568854e-07, "loss": 0.0011, "reward": 2.0812811851501465, "reward_std": 0.0551903173327446, "rewards/accuracy_reward": 0.8875312805175781, "rewards/format_reward": 1.0, "step": 139 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 417.96875, "epoch": 0.0019310078481676114, "grad_norm": 2.0380321462001407, "kl": 0.026611328125, "learning_rate": 9.999907996044357e-07, "loss": 0.0011, "reward": 1.9214375019073486, "reward_std": 0.0846758633852005, "rewards/accuracy_reward": 0.7214374542236328, "rewards/format_reward": 1.0, "step": 140 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 428.5, "epoch": 0.0019448007613688087, "grad_norm": 2.1448785744585814, "kl": 0.024658203125, "learning_rate": 9.999906677012157e-07, "loss": 0.001, "reward": 2.0080313682556152, "reward_std": 0.08222126960754395, "rewards/accuracy_reward": 0.8142812252044678, "rewards/format_reward": 1.0, "step": 141 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.3125, "epoch": 0.001958593674570006, "grad_norm": 1.9206734428203467, "kl": 0.0277099609375, "learning_rate": 9.999905348591946e-07, "loss": 0.0011, "reward": 2.040156126022339, "reward_std": 0.09859336912631989, "rewards/accuracy_reward": 0.8464062809944153, "rewards/format_reward": 1.0, "step": 142 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 408.375, "epoch": 0.0019723865877712033, "grad_norm": 2.4906914721485616, "kl": 0.028076171875, "learning_rate": 9.999904010783723e-07, "loss": 0.0011, "reward": 1.946218729019165, "reward_std": 0.1150365024805069, "rewards/accuracy_reward": 0.764968752861023, "rewards/format_reward": 1.0, "step": 143 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.5625, "epoch": 0.0019861795009724005, "grad_norm": 2.170189757636946, "kl": 0.025634765625, "learning_rate": 9.999902663587494e-07, "loss": 0.001, "reward": 1.9680312871932983, "reward_std": 0.09352342784404755, "rewards/accuracy_reward": 0.7805312275886536, "rewards/format_reward": 1.0, "step": 144 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.001999972414173598, "grad_norm": 2.908442735405397, "kl": 0.031494140625, "learning_rate": 9.99990130700326e-07, "loss": 0.0013, "reward": 2.0344998836517334, "reward_std": 0.05818683281540871, "rewards/accuracy_reward": 0.8407500386238098, "rewards/format_reward": 1.0, "step": 145 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.71875, "epoch": 0.0020137653273747946, "grad_norm": 2.0502885124902677, "kl": 0.03076171875, "learning_rate": 9.999899941031024e-07, "loss": 0.0012, "reward": 1.9617501497268677, "reward_std": 0.06440916657447815, "rewards/accuracy_reward": 0.7617500424385071, "rewards/format_reward": 1.0, "step": 146 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.125, "epoch": 0.002027558240575992, "grad_norm": 2.2132029324250677, "kl": 0.031005859375, "learning_rate": 9.999898565670788e-07, "loss": 0.0012, "reward": 1.7540313005447388, "reward_std": 0.08751586079597473, "rewards/accuracy_reward": 0.5602812767028809, "rewards/format_reward": 1.0, "step": 147 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.96875, "epoch": 0.002041351153777189, "grad_norm": 3.884026341687608, "kl": 0.03125, "learning_rate": 9.999897180922556e-07, "loss": 0.0012, "reward": 2.0512187480926514, "reward_std": 0.09547102451324463, "rewards/accuracy_reward": 0.8512187004089355, "rewards/format_reward": 1.0, "step": 148 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.0020551440669783864, "grad_norm": 2.7578436540602835, "kl": 0.02783203125, "learning_rate": 9.99989578678633e-07, "loss": 0.0011, "reward": 1.974812626838684, "reward_std": 0.08471481502056122, "rewards/accuracy_reward": 0.7935625314712524, "rewards/format_reward": 1.0, "step": 149 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.125, "epoch": 0.0020689369801795837, "grad_norm": 2.332294025572221, "kl": 0.031005859375, "learning_rate": 9.99989438326211e-07, "loss": 0.0012, "reward": 2.0224063396453857, "reward_std": 0.05896058678627014, "rewards/accuracy_reward": 0.8286561965942383, "rewards/format_reward": 1.0, "step": 150 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 386.0, "epoch": 0.002082729893380781, "grad_norm": 41.82757073680713, "kl": 0.02685546875, "learning_rate": 9.9998929703499e-07, "loss": 0.0011, "reward": 1.8978750705718994, "reward_std": 0.07568147033452988, "rewards/accuracy_reward": 0.7041249871253967, "rewards/format_reward": 1.0, "step": 151 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.0625, "epoch": 0.0020965228065819783, "grad_norm": 1.8603595195863625, "kl": 0.02685546875, "learning_rate": 9.999891548049706e-07, "loss": 0.0011, "reward": 1.8349063396453857, "reward_std": 0.047719232738018036, "rewards/accuracy_reward": 0.6349062919616699, "rewards/format_reward": 1.0, "step": 152 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.25, "epoch": 0.0021103157197831755, "grad_norm": 2.053950040665595, "kl": 0.03125, "learning_rate": 9.999890116361529e-07, "loss": 0.0012, "reward": 1.8634687662124634, "reward_std": 0.09209747612476349, "rewards/accuracy_reward": 0.6697187423706055, "rewards/format_reward": 1.0, "step": 153 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.71875, "epoch": 0.002124108632984373, "grad_norm": 2.557083491821594, "kl": 0.0341796875, "learning_rate": 9.999888675285368e-07, "loss": 0.0014, "reward": 2.0903749465942383, "reward_std": 0.06731757521629333, "rewards/accuracy_reward": 0.8903749585151672, "rewards/format_reward": 1.0, "step": 154 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 394.65625, "epoch": 0.0021379015461855696, "grad_norm": 1.6808665412799852, "kl": 0.0238037109375, "learning_rate": 9.99988722482123e-07, "loss": 0.001, "reward": 1.9673436880111694, "reward_std": 0.05538776516914368, "rewards/accuracy_reward": 0.7860937118530273, "rewards/format_reward": 1.0, "step": 155 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.9375, "epoch": 0.002151694459386767, "grad_norm": 9.801379640847554, "kl": 0.029052734375, "learning_rate": 9.999885764969117e-07, "loss": 0.0012, "reward": 1.9736251831054688, "reward_std": 0.08686383068561554, "rewards/accuracy_reward": 0.7798749804496765, "rewards/format_reward": 1.0, "step": 156 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.625, "epoch": 0.002165487372587964, "grad_norm": 1.864255705124676, "kl": 0.024658203125, "learning_rate": 9.999884295729028e-07, "loss": 0.001, "reward": 2.064687728881836, "reward_std": 0.07195336371660233, "rewards/accuracy_reward": 0.8709374666213989, "rewards/format_reward": 1.0, "step": 157 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.65625, "epoch": 0.0021792802857891615, "grad_norm": 2.648487411963937, "kl": 0.027099609375, "learning_rate": 9.99988281710097e-07, "loss": 0.0011, "reward": 1.9994688034057617, "reward_std": 0.0530242919921875, "rewards/accuracy_reward": 0.7994687557220459, "rewards/format_reward": 1.0, "step": 158 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.96875, "epoch": 0.0021930731989903587, "grad_norm": 3.135546230672736, "kl": 0.0250244140625, "learning_rate": 9.999881329084944e-07, "loss": 0.001, "reward": 2.0918750762939453, "reward_std": 0.06598945707082748, "rewards/accuracy_reward": 0.8981249928474426, "rewards/format_reward": 1.0, "step": 159 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 374.25, "epoch": 0.002206866112191556, "grad_norm": 2.831192695495464, "kl": 0.0274658203125, "learning_rate": 9.999879831680955e-07, "loss": 0.0011, "reward": 2.026031494140625, "reward_std": 0.09407088905572891, "rewards/accuracy_reward": 0.8510312438011169, "rewards/format_reward": 1.0, "step": 160 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.5625, "epoch": 0.0022206590253927533, "grad_norm": 2.162999953097895, "kl": 0.0244140625, "learning_rate": 9.999878324889003e-07, "loss": 0.001, "reward": 2.0853123664855957, "reward_std": 0.08686895668506622, "rewards/accuracy_reward": 0.8978124856948853, "rewards/format_reward": 1.0, "step": 161 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.28125, "epoch": 0.0022344519385939505, "grad_norm": 2.597197155988873, "kl": 0.0263671875, "learning_rate": 9.999876808709093e-07, "loss": 0.0011, "reward": 2.0526249408721924, "reward_std": 0.08961372822523117, "rewards/accuracy_reward": 0.8713749647140503, "rewards/format_reward": 1.0, "step": 162 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.71875, "epoch": 0.002248244851795148, "grad_norm": 2.7874941055827342, "kl": 0.0255126953125, "learning_rate": 9.999875283141226e-07, "loss": 0.001, "reward": 2.056093692779541, "reward_std": 0.07780696451663971, "rewards/accuracy_reward": 0.8748437762260437, "rewards/format_reward": 1.0, "step": 163 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.1875, "epoch": 0.002262037764996345, "grad_norm": 2.14309924891198, "kl": 0.0277099609375, "learning_rate": 9.999873748185406e-07, "loss": 0.0011, "reward": 2.0775938034057617, "reward_std": 0.06876933574676514, "rewards/accuracy_reward": 0.8900938034057617, "rewards/format_reward": 1.0, "step": 164 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.03125, "epoch": 0.002275830678197542, "grad_norm": 2.387474108703245, "kl": 0.0286865234375, "learning_rate": 9.999872203841635e-07, "loss": 0.0011, "reward": 1.8728125095367432, "reward_std": 0.07647205889225006, "rewards/accuracy_reward": 0.6790624856948853, "rewards/format_reward": 1.0, "step": 165 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 385.8125, "epoch": 0.002289623591398739, "grad_norm": 6.624381647171409, "kl": 0.026123046875, "learning_rate": 9.999870650109917e-07, "loss": 0.001, "reward": 1.9303126335144043, "reward_std": 0.0732966959476471, "rewards/accuracy_reward": 0.7428125739097595, "rewards/format_reward": 1.0, "step": 166 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 375.3125, "epoch": 0.0023034165045999365, "grad_norm": 2.229535096022293, "kl": 0.0267333984375, "learning_rate": 9.999869086990253e-07, "loss": 0.0011, "reward": 2.0582189559936523, "reward_std": 0.09584961086511612, "rewards/accuracy_reward": 0.8769687414169312, "rewards/format_reward": 1.0, "step": 167 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.0023172094178011337, "grad_norm": 2.0984437457196923, "kl": 0.02783203125, "learning_rate": 9.99986751448265e-07, "loss": 0.0011, "reward": 2.0131874084472656, "reward_std": 0.07278886437416077, "rewards/accuracy_reward": 0.8131875395774841, "rewards/format_reward": 1.0, "step": 168 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.5, "epoch": 0.002331002331002331, "grad_norm": 2.128063773187203, "kl": 0.0269775390625, "learning_rate": 9.999865932587107e-07, "loss": 0.0011, "reward": 2.0375938415527344, "reward_std": 0.08132210373878479, "rewards/accuracy_reward": 0.8500937819480896, "rewards/format_reward": 1.0, "step": 169 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 417.75, "epoch": 0.0023447952442035283, "grad_norm": 2.067958849274226, "kl": 0.0289306640625, "learning_rate": 9.999864341303627e-07, "loss": 0.0012, "reward": 2.0456666946411133, "reward_std": 0.08951409161090851, "rewards/accuracy_reward": 0.8519166707992554, "rewards/format_reward": 1.0, "step": 170 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 383.6875, "epoch": 0.0023585881574047255, "grad_norm": 2.3054266852830847, "kl": 0.0296630859375, "learning_rate": 9.999862740632214e-07, "loss": 0.0012, "reward": 2.0493438243865967, "reward_std": 0.06850887835025787, "rewards/accuracy_reward": 0.868093729019165, "rewards/format_reward": 1.0, "step": 171 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.125, "epoch": 0.002372381070605923, "grad_norm": 2.3135349276359407, "kl": 0.029296875, "learning_rate": 9.999861130572873e-07, "loss": 0.0012, "reward": 2.0240001678466797, "reward_std": 0.07183652371168137, "rewards/accuracy_reward": 0.8364999890327454, "rewards/format_reward": 1.0, "step": 172 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.03125, "epoch": 0.00238617398380712, "grad_norm": 2.187890007907099, "kl": 0.03125, "learning_rate": 9.999859511125604e-07, "loss": 0.0012, "reward": 2.056281089782715, "reward_std": 0.058351606130599976, "rewards/accuracy_reward": 0.8562811613082886, "rewards/format_reward": 1.0, "step": 173 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 426.28125, "epoch": 0.002399966897008317, "grad_norm": 9.685102075276307, "kl": 0.031982421875, "learning_rate": 9.999857882290412e-07, "loss": 0.0013, "reward": 1.9387187957763672, "reward_std": 0.10506308078765869, "rewards/accuracy_reward": 0.7449687123298645, "rewards/format_reward": 1.0, "step": 174 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 433.5625, "epoch": 0.002413759810209514, "grad_norm": 1.9759584974122304, "kl": 0.029296875, "learning_rate": 9.999856244067297e-07, "loss": 0.0012, "reward": 1.9013750553131104, "reward_std": 0.07270276546478271, "rewards/accuracy_reward": 0.7201250195503235, "rewards/format_reward": 1.0, "step": 175 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.53125, "epoch": 0.0024275527234107115, "grad_norm": 2.1868257980136767, "kl": 0.029541015625, "learning_rate": 9.999854596456268e-07, "loss": 0.0012, "reward": 2.0413126945495605, "reward_std": 0.09028618782758713, "rewards/accuracy_reward": 0.8600624799728394, "rewards/format_reward": 1.0, "step": 176 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.90625, "epoch": 0.0024413456366119087, "grad_norm": 2.3516971050666235, "kl": 0.0303955078125, "learning_rate": 9.999852939457323e-07, "loss": 0.0012, "reward": 1.9848124980926514, "reward_std": 0.07637369632720947, "rewards/accuracy_reward": 0.8035625219345093, "rewards/format_reward": 1.0, "step": 177 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.53125, "epoch": 0.002455138549813106, "grad_norm": 1.930799227540862, "kl": 0.02978515625, "learning_rate": 9.999851273070466e-07, "loss": 0.0012, "reward": 1.9667500257492065, "reward_std": 0.06736990064382553, "rewards/accuracy_reward": 0.7667499780654907, "rewards/format_reward": 1.0, "step": 178 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.46875, "epoch": 0.0024689314630143033, "grad_norm": 2.098250753067024, "kl": 0.03271484375, "learning_rate": 9.9998495972957e-07, "loss": 0.0013, "reward": 1.9832812547683716, "reward_std": 0.04954220727086067, "rewards/accuracy_reward": 0.7832812070846558, "rewards/format_reward": 1.0, "step": 179 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 417.59375, "epoch": 0.0024827243762155005, "grad_norm": 2.2996429439129265, "kl": 0.031494140625, "learning_rate": 9.99984791213303e-07, "loss": 0.0013, "reward": 2.0596561431884766, "reward_std": 0.07544936239719391, "rewards/accuracy_reward": 0.8596562147140503, "rewards/format_reward": 1.0, "step": 180 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.59375, "epoch": 0.002496517289416698, "grad_norm": 4.634416433331346, "kl": 0.033447265625, "learning_rate": 9.999846217582459e-07, "loss": 0.0013, "reward": 2.0994374752044678, "reward_std": 0.08039168268442154, "rewards/accuracy_reward": 0.9119374752044678, "rewards/format_reward": 1.0, "step": 181 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.002510310202617895, "grad_norm": 4.544426596764613, "kl": 0.03076171875, "learning_rate": 9.999844513643989e-07, "loss": 0.0012, "reward": 2.1343436241149902, "reward_std": 0.03114665299654007, "rewards/accuracy_reward": 0.934343695640564, "rewards/format_reward": 1.0, "step": 182 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 426.40625, "epoch": 0.0025241031158190924, "grad_norm": 1.935963583219852, "kl": 0.032958984375, "learning_rate": 9.999842800317621e-07, "loss": 0.0013, "reward": 1.8970000743865967, "reward_std": 0.07467754185199738, "rewards/accuracy_reward": 0.715749979019165, "rewards/format_reward": 1.0, "step": 183 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.5, "epoch": 0.002537896029020289, "grad_norm": 2.3059422667549065, "kl": 0.0296630859375, "learning_rate": 9.999841077603362e-07, "loss": 0.0012, "reward": 1.9113438129425049, "reward_std": 0.07229673862457275, "rewards/accuracy_reward": 0.7113437652587891, "rewards/format_reward": 1.0, "step": 184 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.03125, "epoch": 0.0025516889422214865, "grad_norm": 1.8840305479355008, "kl": 0.031982421875, "learning_rate": 9.999839345501216e-07, "loss": 0.0013, "reward": 2.0431251525878906, "reward_std": 0.032546259462833405, "rewards/accuracy_reward": 0.8431249856948853, "rewards/format_reward": 1.0, "step": 185 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.90625, "epoch": 0.0025654818554226837, "grad_norm": 2.8036313506148747, "kl": 0.03515625, "learning_rate": 9.99983760401118e-07, "loss": 0.0014, "reward": 2.1312499046325684, "reward_std": 0.029460545629262924, "rewards/accuracy_reward": 0.9312500357627869, "rewards/format_reward": 1.0, "step": 186 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.84375, "epoch": 0.002579274768623881, "grad_norm": 2.3846755043463865, "kl": 0.0322265625, "learning_rate": 9.999835853133264e-07, "loss": 0.0013, "reward": 2.05440616607666, "reward_std": 0.05262136459350586, "rewards/accuracy_reward": 0.8544061779975891, "rewards/format_reward": 1.0, "step": 187 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.09375, "epoch": 0.0025930676818250783, "grad_norm": 3.0644871241090827, "kl": 0.033447265625, "learning_rate": 9.99983409286747e-07, "loss": 0.0013, "reward": 2.025156259536743, "reward_std": 0.04958387464284897, "rewards/accuracy_reward": 0.8251562118530273, "rewards/format_reward": 1.0, "step": 188 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.5, "epoch": 0.0026068605950262756, "grad_norm": 3.5594640926602206, "kl": 0.03369140625, "learning_rate": 9.999832323213797e-07, "loss": 0.0013, "reward": 2.0376248359680176, "reward_std": 0.08755264431238174, "rewards/accuracy_reward": 0.8438750505447388, "rewards/format_reward": 1.0, "step": 189 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.40625, "epoch": 0.002620653508227473, "grad_norm": 1.7232180978453093, "kl": 0.033203125, "learning_rate": 9.999830544172252e-07, "loss": 0.0013, "reward": 2.0407187938690186, "reward_std": 0.05778937414288521, "rewards/accuracy_reward": 0.8657187223434448, "rewards/format_reward": 1.0, "step": 190 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.6875, "epoch": 0.00263444642142867, "grad_norm": 2.6276855645539037, "kl": 0.0289306640625, "learning_rate": 9.999828755742839e-07, "loss": 0.0012, "reward": 2.010031223297119, "reward_std": 0.059785306453704834, "rewards/accuracy_reward": 0.816281259059906, "rewards/format_reward": 1.0, "step": 191 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.40625, "epoch": 0.0026482393346298674, "grad_norm": 1.886602129302959, "kl": 0.03564453125, "learning_rate": 9.99982695792556e-07, "loss": 0.0014, "reward": 2.1299374103546143, "reward_std": 0.04402173310518265, "rewards/accuracy_reward": 0.9361875057220459, "rewards/format_reward": 1.0, "step": 192 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 384.75, "epoch": 0.002662032247831064, "grad_norm": 2.5563845759958874, "kl": 0.034423828125, "learning_rate": 9.999825150720416e-07, "loss": 0.0014, "reward": 2.0019376277923584, "reward_std": 0.05592333897948265, "rewards/accuracy_reward": 0.8144375085830688, "rewards/format_reward": 1.0, "step": 193 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.375, "epoch": 0.0026758251610322615, "grad_norm": 1.9214993976702013, "kl": 0.034423828125, "learning_rate": 9.999823334127415e-07, "loss": 0.0014, "reward": 2.020249843597412, "reward_std": 0.07798199355602264, "rewards/accuracy_reward": 0.8264999985694885, "rewards/format_reward": 1.0, "step": 194 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.46875, "epoch": 0.0026896180742334587, "grad_norm": 2.8003295562485913, "kl": 0.03515625, "learning_rate": 9.999821508146558e-07, "loss": 0.0014, "reward": 2.056593894958496, "reward_std": 0.0458025187253952, "rewards/accuracy_reward": 0.8565937280654907, "rewards/format_reward": 1.0, "step": 195 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.4375, "epoch": 0.002703410987434656, "grad_norm": 2.148460935015114, "kl": 0.038330078125, "learning_rate": 9.999819672777847e-07, "loss": 0.0015, "reward": 1.9625625610351562, "reward_std": 0.0632413849234581, "rewards/accuracy_reward": 0.7625625133514404, "rewards/format_reward": 1.0, "step": 196 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 375.90625, "epoch": 0.0027172039006358533, "grad_norm": 2.074352951106199, "kl": 0.0341796875, "learning_rate": 9.99981782802129e-07, "loss": 0.0014, "reward": 1.9881561994552612, "reward_std": 0.05798901617527008, "rewards/accuracy_reward": 0.7944062948226929, "rewards/format_reward": 1.0, "step": 197 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.21875, "epoch": 0.0027309968138370506, "grad_norm": 2.220833697202844, "kl": 0.036865234375, "learning_rate": 9.999815973876887e-07, "loss": 0.0015, "reward": 2.0797812938690186, "reward_std": 0.030554696917533875, "rewards/accuracy_reward": 0.8797812461853027, "rewards/format_reward": 1.0, "step": 198 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 385.0, "epoch": 0.002744789727038248, "grad_norm": 2.778379290637898, "kl": 0.03515625, "learning_rate": 9.99981411034464e-07, "loss": 0.0014, "reward": 1.9388437271118164, "reward_std": 0.09497188031673431, "rewards/accuracy_reward": 0.7450937628746033, "rewards/format_reward": 1.0, "step": 199 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.28125, "epoch": 0.002758582640239445, "grad_norm": 2.967158316469084, "kl": 0.03515625, "learning_rate": 9.999812237424554e-07, "loss": 0.0014, "reward": 2.109405994415283, "reward_std": 0.05784162878990173, "rewards/accuracy_reward": 0.9156562089920044, "rewards/format_reward": 1.0, "step": 200 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.65625, "epoch": 0.0027723755534406424, "grad_norm": 2.4155405988825294, "kl": 0.0390625, "learning_rate": 9.999810355116635e-07, "loss": 0.0016, "reward": 1.9998124837875366, "reward_std": 0.07975615561008453, "rewards/accuracy_reward": 0.8060624599456787, "rewards/format_reward": 1.0, "step": 201 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.09375, "epoch": 0.002786168466641839, "grad_norm": 4.856548535126516, "kl": 0.036376953125, "learning_rate": 9.999808463420884e-07, "loss": 0.0015, "reward": 2.0610313415527344, "reward_std": 0.07405795156955719, "rewards/accuracy_reward": 0.8610312342643738, "rewards/format_reward": 1.0, "step": 202 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.46875, "epoch": 0.0027999613798430365, "grad_norm": 2.030169334494783, "kl": 0.03759765625, "learning_rate": 9.999806562337306e-07, "loss": 0.0015, "reward": 2.0602500438690186, "reward_std": 0.04804185777902603, "rewards/accuracy_reward": 0.8602499961853027, "rewards/format_reward": 1.0, "step": 203 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.71875, "epoch": 0.0028137542930442338, "grad_norm": 2.3858643652998035, "kl": 0.03515625, "learning_rate": 9.999804651865903e-07, "loss": 0.0014, "reward": 2.0386874675750732, "reward_std": 0.10029025375843048, "rewards/accuracy_reward": 0.8511874675750732, "rewards/format_reward": 1.0, "step": 204 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.71875, "epoch": 0.002827547206245431, "grad_norm": 2.231389844202713, "kl": 0.036376953125, "learning_rate": 9.999802732006677e-07, "loss": 0.0015, "reward": 2.1243438720703125, "reward_std": 0.04526836425065994, "rewards/accuracy_reward": 0.930593729019165, "rewards/format_reward": 1.0, "step": 205 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.71875, "epoch": 0.0028413401194466283, "grad_norm": 2.0523450510676393, "kl": 0.0400390625, "learning_rate": 9.999800802759637e-07, "loss": 0.0016, "reward": 2.1100001335144043, "reward_std": 0.03178774192929268, "rewards/accuracy_reward": 0.9100000262260437, "rewards/format_reward": 1.0, "step": 206 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.1875, "epoch": 0.0028551330326478256, "grad_norm": 2.7988057223980514, "kl": 0.038818359375, "learning_rate": 9.999798864124782e-07, "loss": 0.0016, "reward": 1.982968807220459, "reward_std": 0.05906786024570465, "rewards/accuracy_reward": 0.8017187118530273, "rewards/format_reward": 1.0, "step": 207 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.6875, "epoch": 0.002868925945849023, "grad_norm": 2.5171061772943553, "kl": 0.04150390625, "learning_rate": 9.999796916102117e-07, "loss": 0.0017, "reward": 2.094249963760376, "reward_std": 0.0296689011156559, "rewards/accuracy_reward": 0.8942500352859497, "rewards/format_reward": 1.0, "step": 208 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.28125, "epoch": 0.00288271885905022, "grad_norm": 2.094314720403659, "kl": 0.041748046875, "learning_rate": 9.999794958691644e-07, "loss": 0.0017, "reward": 2.1491875648498535, "reward_std": 0.03765999525785446, "rewards/accuracy_reward": 0.9554375410079956, "rewards/format_reward": 1.0, "step": 209 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.9375, "epoch": 0.0028965117722514174, "grad_norm": 1.6503501456992873, "kl": 0.040283203125, "learning_rate": 9.999792991893372e-07, "loss": 0.0016, "reward": 1.9469687938690186, "reward_std": 0.017904702574014664, "rewards/accuracy_reward": 0.7469687461853027, "rewards/format_reward": 1.0, "step": 210 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.34375, "epoch": 0.0029103046854526146, "grad_norm": 2.8394906789667065, "kl": 0.03564453125, "learning_rate": 9.9997910157073e-07, "loss": 0.0014, "reward": 2.040562629699707, "reward_std": 0.025202153250575066, "rewards/accuracy_reward": 0.8405624628067017, "rewards/format_reward": 1.0, "step": 211 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.8125, "epoch": 0.0029240975986538115, "grad_norm": 2.810870263184083, "kl": 0.035888671875, "learning_rate": 9.999789030133433e-07, "loss": 0.0014, "reward": 2.0199687480926514, "reward_std": 0.033414311707019806, "rewards/accuracy_reward": 0.8199687004089355, "rewards/format_reward": 1.0, "step": 212 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.25, "epoch": 0.0029378905118550088, "grad_norm": 3.6223491183378544, "kl": 0.0361328125, "learning_rate": 9.999787035171775e-07, "loss": 0.0014, "reward": 2.042468786239624, "reward_std": 0.06639298796653748, "rewards/accuracy_reward": 0.8487187623977661, "rewards/format_reward": 1.0, "step": 213 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.09375, "epoch": 0.002951683425056206, "grad_norm": 2.5290957847109166, "kl": 0.040283203125, "learning_rate": 9.999785030822327e-07, "loss": 0.0016, "reward": 2.0207812786102295, "reward_std": 0.009096596390008926, "rewards/accuracy_reward": 0.8207812905311584, "rewards/format_reward": 1.0, "step": 214 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 441.34375, "epoch": 0.0029654763382574033, "grad_norm": 2.324018121666656, "kl": 0.0380859375, "learning_rate": 9.999783017085097e-07, "loss": 0.0015, "reward": 1.940250039100647, "reward_std": 0.08123809099197388, "rewards/accuracy_reward": 0.7527499794960022, "rewards/format_reward": 1.0, "step": 215 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 439.0, "epoch": 0.0029792692514586006, "grad_norm": 2.941181310819628, "kl": 0.044189453125, "learning_rate": 9.999780993960087e-07, "loss": 0.0018, "reward": 2.0858750343322754, "reward_std": 0.05253898352384567, "rewards/accuracy_reward": 0.8858749866485596, "rewards/format_reward": 1.0, "step": 216 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.65625, "epoch": 0.002993062164659798, "grad_norm": 4.198951692269727, "kl": 0.037841796875, "learning_rate": 9.9997789614473e-07, "loss": 0.0015, "reward": 2.0808751583099365, "reward_std": 0.06408786028623581, "rewards/accuracy_reward": 0.8933749198913574, "rewards/format_reward": 1.0, "step": 217 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 436.84375, "epoch": 0.003006855077860995, "grad_norm": 2.1481688044496385, "kl": 0.0390625, "learning_rate": 9.999776919546741e-07, "loss": 0.0016, "reward": 1.968843698501587, "reward_std": 0.061460018157958984, "rewards/accuracy_reward": 0.7813437581062317, "rewards/format_reward": 1.0, "step": 218 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.9375, "epoch": 0.0030206479910621924, "grad_norm": 2.6496714616841377, "kl": 0.034912109375, "learning_rate": 9.999774868258415e-07, "loss": 0.0014, "reward": 2.1024374961853027, "reward_std": 0.05336809158325195, "rewards/accuracy_reward": 0.9086875319480896, "rewards/format_reward": 1.0, "step": 219 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.15625, "epoch": 0.0030344409042633897, "grad_norm": 3.146872570782692, "kl": 0.0361328125, "learning_rate": 9.99977280758232e-07, "loss": 0.0014, "reward": 2.088718891143799, "reward_std": 0.04650425165891647, "rewards/accuracy_reward": 0.8887187838554382, "rewards/format_reward": 1.0, "step": 220 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.15625, "epoch": 0.0030482338174645865, "grad_norm": 3.6577890864718205, "kl": 0.035888671875, "learning_rate": 9.999770737518467e-07, "loss": 0.0014, "reward": 1.9315937757492065, "reward_std": 0.05462124943733215, "rewards/accuracy_reward": 0.7378437519073486, "rewards/format_reward": 1.0, "step": 221 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.875, "epoch": 0.0030620267306657838, "grad_norm": 2.3048021507859753, "kl": 0.0390625, "learning_rate": 9.999768658066856e-07, "loss": 0.0016, "reward": 1.9279687404632568, "reward_std": 0.035580676048994064, "rewards/accuracy_reward": 0.7279687523841858, "rewards/format_reward": 1.0, "step": 222 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.625, "epoch": 0.003075819643866981, "grad_norm": 2.110741351371158, "kl": 0.03466796875, "learning_rate": 9.999766569227492e-07, "loss": 0.0014, "reward": 2.1274685859680176, "reward_std": 0.04286646842956543, "rewards/accuracy_reward": 0.9337186813354492, "rewards/format_reward": 1.0, "step": 223 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 437.4375, "epoch": 0.0030896125570681783, "grad_norm": 2.836998473383281, "kl": 0.035400390625, "learning_rate": 9.99976447100038e-07, "loss": 0.0014, "reward": 2.083156108856201, "reward_std": 0.08227825164794922, "rewards/accuracy_reward": 0.9144062399864197, "rewards/format_reward": 1.0, "step": 224 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.5625, "epoch": 0.0031034054702693756, "grad_norm": 2.7430622755902547, "kl": 0.03466796875, "learning_rate": 9.999762363385522e-07, "loss": 0.0014, "reward": 1.981812596321106, "reward_std": 0.050020813941955566, "rewards/accuracy_reward": 0.7880625128746033, "rewards/format_reward": 1.0, "step": 225 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.28125, "epoch": 0.003117198383470573, "grad_norm": 1.9800776020396267, "kl": 0.03515625, "learning_rate": 9.999760246382923e-07, "loss": 0.0014, "reward": 1.9923436641693115, "reward_std": 0.03193720430135727, "rewards/accuracy_reward": 0.79234379529953, "rewards/format_reward": 1.0, "step": 226 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.28125, "epoch": 0.00313099129667177, "grad_norm": 3.840783521055594, "kl": 0.037109375, "learning_rate": 9.999758119992585e-07, "loss": 0.0015, "reward": 2.0612502098083496, "reward_std": 0.05794785916805267, "rewards/accuracy_reward": 0.8675000071525574, "rewards/format_reward": 1.0, "step": 227 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.6875, "epoch": 0.0031447842098729674, "grad_norm": 2.8702125160240683, "kl": 0.033203125, "learning_rate": 9.999755984214516e-07, "loss": 0.0013, "reward": 1.9457812309265137, "reward_std": 0.04222777858376503, "rewards/accuracy_reward": 0.7520312070846558, "rewards/format_reward": 1.0, "step": 228 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.1875, "epoch": 0.0031585771230741647, "grad_norm": 2.286887849107822, "kl": 0.03271484375, "learning_rate": 9.999753839048714e-07, "loss": 0.0013, "reward": 2.0785627365112305, "reward_std": 0.05187535285949707, "rewards/accuracy_reward": 0.8785625100135803, "rewards/format_reward": 1.0, "step": 229 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.78125, "epoch": 0.003172370036275362, "grad_norm": 2.6577956362347734, "kl": 0.0361328125, "learning_rate": 9.99975168449519e-07, "loss": 0.0014, "reward": 1.9205312728881836, "reward_std": 0.06479774415493011, "rewards/accuracy_reward": 0.7267812490463257, "rewards/format_reward": 1.0, "step": 230 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.96875, "epoch": 0.0031861629494765588, "grad_norm": 2.126939914195246, "kl": 0.034423828125, "learning_rate": 9.999749520553944e-07, "loss": 0.0014, "reward": 2.0932188034057617, "reward_std": 0.04308244585990906, "rewards/accuracy_reward": 0.8932187557220459, "rewards/format_reward": 1.0, "step": 231 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.5625, "epoch": 0.003199955862677756, "grad_norm": 2.2119580904414464, "kl": 0.03955078125, "learning_rate": 9.999747347224981e-07, "loss": 0.0016, "reward": 2.0918750762939453, "reward_std": 0.05328710377216339, "rewards/accuracy_reward": 0.8918749690055847, "rewards/format_reward": 1.0, "step": 232 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.3125, "epoch": 0.0032137487758789533, "grad_norm": 2.3851822614862614, "kl": 0.0361328125, "learning_rate": 9.999745164508304e-07, "loss": 0.0014, "reward": 2.1035313606262207, "reward_std": 0.03318953141570091, "rewards/accuracy_reward": 0.9035312533378601, "rewards/format_reward": 1.0, "step": 233 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.6875, "epoch": 0.0032275416890801506, "grad_norm": 2.9115246098203573, "kl": 0.0361328125, "learning_rate": 9.999742972403918e-07, "loss": 0.0014, "reward": 1.9703125953674316, "reward_std": 0.05693824589252472, "rewards/accuracy_reward": 0.770312488079071, "rewards/format_reward": 1.0, "step": 234 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.78125, "epoch": 0.003241334602281348, "grad_norm": 2.2069067408045258, "kl": 0.03564453125, "learning_rate": 9.999740770911827e-07, "loss": 0.0014, "reward": 2.0199062824249268, "reward_std": 0.045900505036115646, "rewards/accuracy_reward": 0.8199062347412109, "rewards/format_reward": 1.0, "step": 235 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.875, "epoch": 0.003255127515482545, "grad_norm": 2.7380543468041236, "kl": 0.040283203125, "learning_rate": 9.999738560032035e-07, "loss": 0.0016, "reward": 2.006593704223633, "reward_std": 0.0754980742931366, "rewards/accuracy_reward": 0.8128437399864197, "rewards/format_reward": 1.0, "step": 236 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.34375, "epoch": 0.0032689204286837424, "grad_norm": 2.378886141549322, "kl": 0.04150390625, "learning_rate": 9.999736339764546e-07, "loss": 0.0017, "reward": 2.0571250915527344, "reward_std": 0.07030831277370453, "rewards/accuracy_reward": 0.8633750081062317, "rewards/format_reward": 1.0, "step": 237 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 385.46875, "epoch": 0.0032827133418849397, "grad_norm": 2.518868413202874, "kl": 0.0380859375, "learning_rate": 9.999734110109366e-07, "loss": 0.0015, "reward": 2.006500244140625, "reward_std": 0.07023480534553528, "rewards/accuracy_reward": 0.8127500414848328, "rewards/format_reward": 1.0, "step": 238 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.65625, "epoch": 0.003296506255086137, "grad_norm": 2.398354781072295, "kl": 0.044921875, "learning_rate": 9.999731871066497e-07, "loss": 0.0018, "reward": 2.0112500190734863, "reward_std": 0.05328204482793808, "rewards/accuracy_reward": 0.8112499713897705, "rewards/format_reward": 1.0, "step": 239 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.65625, "epoch": 0.0033102991682873338, "grad_norm": 2.030876481703043, "kl": 0.0341796875, "learning_rate": 9.999729622635945e-07, "loss": 0.0014, "reward": 2.0425000190734863, "reward_std": 0.04199083149433136, "rewards/accuracy_reward": 0.8425000309944153, "rewards/format_reward": 1.0, "step": 240 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 391.3125, "epoch": 0.003324092081488531, "grad_norm": 3.0114917592564114, "kl": 0.036376953125, "learning_rate": 9.999727364817711e-07, "loss": 0.0015, "reward": 1.9598751068115234, "reward_std": 0.07175479829311371, "rewards/accuracy_reward": 0.7723749876022339, "rewards/format_reward": 1.0, "step": 241 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.59375, "epoch": 0.0033378849946897283, "grad_norm": 2.23632288724753, "kl": 0.036376953125, "learning_rate": 9.999725097611803e-07, "loss": 0.0015, "reward": 2.0199689865112305, "reward_std": 0.04627995938062668, "rewards/accuracy_reward": 0.8262187242507935, "rewards/format_reward": 1.0, "step": 242 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.40625, "epoch": 0.0033516779078909256, "grad_norm": 2.086962635355399, "kl": 0.033203125, "learning_rate": 9.999722821018225e-07, "loss": 0.0013, "reward": 2.01981258392334, "reward_std": 0.052031271159648895, "rewards/accuracy_reward": 0.8260624408721924, "rewards/format_reward": 1.0, "step": 243 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 435.84375, "epoch": 0.003365470821092123, "grad_norm": 2.1725635795225635, "kl": 0.033447265625, "learning_rate": 9.999720535036976e-07, "loss": 0.0013, "reward": 2.106781482696533, "reward_std": 0.07235150039196014, "rewards/accuracy_reward": 0.9130312204360962, "rewards/format_reward": 1.0, "step": 244 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.71875, "epoch": 0.00337926373429332, "grad_norm": 2.057129382744587, "kl": 0.038330078125, "learning_rate": 9.999718239668067e-07, "loss": 0.0015, "reward": 1.9260313510894775, "reward_std": 0.08792833238840103, "rewards/accuracy_reward": 0.7447812557220459, "rewards/format_reward": 1.0, "step": 245 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.625, "epoch": 0.0033930566474945174, "grad_norm": 2.652843441716905, "kl": 0.032470703125, "learning_rate": 9.999715934911499e-07, "loss": 0.0013, "reward": 2.0135936737060547, "reward_std": 0.059460051357746124, "rewards/accuracy_reward": 0.8198436498641968, "rewards/format_reward": 1.0, "step": 246 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.46875, "epoch": 0.0034068495606957147, "grad_norm": 2.295464281374007, "kl": 0.03515625, "learning_rate": 9.999713620767277e-07, "loss": 0.0014, "reward": 2.00028133392334, "reward_std": 0.06256881356239319, "rewards/accuracy_reward": 0.8065312504768372, "rewards/format_reward": 1.0, "step": 247 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.3125, "epoch": 0.003420642473896912, "grad_norm": 2.0800237727160855, "kl": 0.03662109375, "learning_rate": 9.999711297235406e-07, "loss": 0.0015, "reward": 1.9807811975479126, "reward_std": 0.04289881885051727, "rewards/accuracy_reward": 0.7807812690734863, "rewards/format_reward": 1.0, "step": 248 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 426.4375, "epoch": 0.0034344353870981088, "grad_norm": 2.199975124800715, "kl": 0.037109375, "learning_rate": 9.999708964315887e-07, "loss": 0.0015, "reward": 2.0427188873291016, "reward_std": 0.0730399638414383, "rewards/accuracy_reward": 0.8489687442779541, "rewards/format_reward": 1.0, "step": 249 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 425.0, "epoch": 0.003448228300299306, "grad_norm": 2.0438191738697555, "kl": 0.03515625, "learning_rate": 9.999706622008729e-07, "loss": 0.0014, "reward": 2.087437629699707, "reward_std": 0.07310131192207336, "rewards/accuracy_reward": 0.8936874866485596, "rewards/format_reward": 1.0, "step": 250 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.46875, "epoch": 0.0034620212135005033, "grad_norm": 2.1852266242442493, "kl": 0.034423828125, "learning_rate": 9.999704270313933e-07, "loss": 0.0014, "reward": 2.0613436698913574, "reward_std": 0.05800023674964905, "rewards/accuracy_reward": 0.8613438010215759, "rewards/format_reward": 1.0, "step": 251 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.65625, "epoch": 0.0034758141267017006, "grad_norm": 2.0023596759487896, "kl": 0.037353515625, "learning_rate": 9.999701909231505e-07, "loss": 0.0015, "reward": 2.0703749656677246, "reward_std": 0.062112633138895035, "rewards/accuracy_reward": 0.8766250014305115, "rewards/format_reward": 1.0, "step": 252 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.65625, "epoch": 0.003489607039902898, "grad_norm": 2.109790177842566, "kl": 0.039306640625, "learning_rate": 9.99969953876145e-07, "loss": 0.0016, "reward": 2.0616250038146973, "reward_std": 0.04189714789390564, "rewards/accuracy_reward": 0.8616250157356262, "rewards/format_reward": 1.0, "step": 253 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.65625, "epoch": 0.003503399953104095, "grad_norm": 3.798202910178685, "kl": 0.043212890625, "learning_rate": 9.99969715890377e-07, "loss": 0.0017, "reward": 2.021312713623047, "reward_std": 0.058766983449459076, "rewards/accuracy_reward": 0.8213124871253967, "rewards/format_reward": 1.0, "step": 254 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.3125, "epoch": 0.0035171928663052924, "grad_norm": 2.5956728171758354, "kl": 0.04541015625, "learning_rate": 9.999694769658473e-07, "loss": 0.0018, "reward": 2.014124870300293, "reward_std": 0.0557776540517807, "rewards/accuracy_reward": 0.8141250610351562, "rewards/format_reward": 1.0, "step": 255 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.09375, "epoch": 0.0035309857795064897, "grad_norm": 2.2923097999579665, "kl": 0.04248046875, "learning_rate": 9.99969237102556e-07, "loss": 0.0017, "reward": 2.091750144958496, "reward_std": 0.04986914247274399, "rewards/accuracy_reward": 0.8917499780654907, "rewards/format_reward": 1.0, "step": 256 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.003544778692707687, "grad_norm": 2.571075561943788, "kl": 0.041259765625, "learning_rate": 9.99968996300504e-07, "loss": 0.0017, "reward": 1.8308438062667847, "reward_std": 0.07887184619903564, "rewards/accuracy_reward": 0.637093722820282, "rewards/format_reward": 1.0, "step": 257 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.3125, "epoch": 0.0035585716059088842, "grad_norm": 2.544621956527065, "kl": 0.041259765625, "learning_rate": 9.999687545596911e-07, "loss": 0.0016, "reward": 2.010499954223633, "reward_std": 0.05066101253032684, "rewards/accuracy_reward": 0.8167500495910645, "rewards/format_reward": 1.0, "step": 258 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.9375, "epoch": 0.003572364519110081, "grad_norm": 2.4619144524351153, "kl": 0.052734375, "learning_rate": 9.999685118801182e-07, "loss": 0.0021, "reward": 2.0476250648498535, "reward_std": 0.04513250291347504, "rewards/accuracy_reward": 0.8538749814033508, "rewards/format_reward": 1.0, "step": 259 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.5625, "epoch": 0.0035861574323112783, "grad_norm": 2.6809247747108422, "kl": 0.04052734375, "learning_rate": 9.999682682617857e-07, "loss": 0.0016, "reward": 2.0165936946868896, "reward_std": 0.05982508882880211, "rewards/accuracy_reward": 0.8165937662124634, "rewards/format_reward": 1.0, "step": 260 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.3125, "epoch": 0.0035999503455124756, "grad_norm": 2.167779313567494, "kl": 0.04345703125, "learning_rate": 9.999680237046943e-07, "loss": 0.0017, "reward": 2.0805938243865967, "reward_std": 0.03542504459619522, "rewards/accuracy_reward": 0.8805937767028809, "rewards/format_reward": 1.0, "step": 261 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.96875, "epoch": 0.003613743258713673, "grad_norm": 2.3331276442507054, "kl": 0.051025390625, "learning_rate": 9.99967778208844e-07, "loss": 0.002, "reward": 2.005687713623047, "reward_std": 0.05576542019844055, "rewards/accuracy_reward": 0.8119375109672546, "rewards/format_reward": 1.0, "step": 262 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.8125, "epoch": 0.00362753617191487, "grad_norm": 2.0182493850384673, "kl": 0.039306640625, "learning_rate": 9.999675317742354e-07, "loss": 0.0016, "reward": 1.909406304359436, "reward_std": 0.0761910229921341, "rewards/accuracy_reward": 0.7156562805175781, "rewards/format_reward": 1.0, "step": 263 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.5625, "epoch": 0.0036413290851160674, "grad_norm": 2.2441006666586385, "kl": 0.0419921875, "learning_rate": 9.999672844008688e-07, "loss": 0.0017, "reward": 2.105343818664551, "reward_std": 0.05280838906764984, "rewards/accuracy_reward": 0.9115937948226929, "rewards/format_reward": 1.0, "step": 264 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.0625, "epoch": 0.0036551219983172647, "grad_norm": 1.9324917564079633, "kl": 0.047607421875, "learning_rate": 9.999670360887451e-07, "loss": 0.0019, "reward": 1.9695937633514404, "reward_std": 0.06791258603334427, "rewards/accuracy_reward": 0.7883437871932983, "rewards/format_reward": 1.0, "step": 265 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.28125, "epoch": 0.003668914911518462, "grad_norm": 3.0008398320119687, "kl": 0.049072265625, "learning_rate": 9.999667868378646e-07, "loss": 0.002, "reward": 2.1115000247955322, "reward_std": 0.04329709708690643, "rewards/accuracy_reward": 0.9114999771118164, "rewards/format_reward": 1.0, "step": 266 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.90625, "epoch": 0.0036827078247196592, "grad_norm": 1.9823630409894832, "kl": 0.039794921875, "learning_rate": 9.999665366482277e-07, "loss": 0.0016, "reward": 2.0032811164855957, "reward_std": 0.046205341815948486, "rewards/accuracy_reward": 0.8157812356948853, "rewards/format_reward": 1.0, "step": 267 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.84375, "epoch": 0.003696500737920856, "grad_norm": 2.831452106077776, "kl": 0.037109375, "learning_rate": 9.999662855198347e-07, "loss": 0.0015, "reward": 2.02303147315979, "reward_std": 0.04871858283877373, "rewards/accuracy_reward": 0.829281210899353, "rewards/format_reward": 1.0, "step": 268 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.34375, "epoch": 0.0037102936511220533, "grad_norm": 3.2184165746431113, "kl": 0.0400390625, "learning_rate": 9.999660334526865e-07, "loss": 0.0016, "reward": 2.145625114440918, "reward_std": 0.03861492872238159, "rewards/accuracy_reward": 0.9456250071525574, "rewards/format_reward": 1.0, "step": 269 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.375, "epoch": 0.0037240865643232506, "grad_norm": 2.5247440595860047, "kl": 0.041748046875, "learning_rate": 9.999657804467831e-07, "loss": 0.0017, "reward": 2.060500144958496, "reward_std": 0.0684417188167572, "rewards/accuracy_reward": 0.8604999780654907, "rewards/format_reward": 1.0, "step": 270 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.0625, "epoch": 0.003737879477524448, "grad_norm": 3.294549136546471, "kl": 0.048828125, "learning_rate": 9.999655265021254e-07, "loss": 0.002, "reward": 2.0664689540863037, "reward_std": 0.059139810502529144, "rewards/accuracy_reward": 0.8789687752723694, "rewards/format_reward": 1.0, "step": 271 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.96875, "epoch": 0.003751672390725645, "grad_norm": 2.2085419354877507, "kl": 0.0419921875, "learning_rate": 9.999652716187134e-07, "loss": 0.0017, "reward": 2.113156318664551, "reward_std": 0.054125670343637466, "rewards/accuracy_reward": 0.913156270980835, "rewards/format_reward": 1.0, "step": 272 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.59375, "epoch": 0.0037654653039268424, "grad_norm": 2.0449996029290514, "kl": 0.045166015625, "learning_rate": 9.999650157965483e-07, "loss": 0.0018, "reward": 2.01143741607666, "reward_std": 0.034549862146377563, "rewards/accuracy_reward": 0.8114374876022339, "rewards/format_reward": 1.0, "step": 273 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.875, "epoch": 0.0037792582171280397, "grad_norm": 2.3059778358426963, "kl": 0.041259765625, "learning_rate": 9.999647590356297e-07, "loss": 0.0016, "reward": 2.101250171661377, "reward_std": 0.05277696251869202, "rewards/accuracy_reward": 0.9012499451637268, "rewards/format_reward": 1.0, "step": 274 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.003793051130329237, "grad_norm": 2.349047338736593, "kl": 0.0439453125, "learning_rate": 9.999645013359587e-07, "loss": 0.0018, "reward": 2.070218801498413, "reward_std": 0.0551038421690464, "rewards/accuracy_reward": 0.8702187538146973, "rewards/format_reward": 1.0, "step": 275 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 379.53125, "epoch": 0.0038068440435304342, "grad_norm": 2.7461992071751595, "kl": 0.0458984375, "learning_rate": 9.999642426975357e-07, "loss": 0.0018, "reward": 1.9736876487731934, "reward_std": 0.07994268089532852, "rewards/accuracy_reward": 0.7861874103546143, "rewards/format_reward": 1.0, "step": 276 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.21875, "epoch": 0.0038206369567316315, "grad_norm": 1.9620647616642628, "kl": 0.04443359375, "learning_rate": 9.999639831203607e-07, "loss": 0.0018, "reward": 2.0954062938690186, "reward_std": 0.07648126780986786, "rewards/accuracy_reward": 0.907906174659729, "rewards/format_reward": 1.0, "step": 277 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.0038344298699328283, "grad_norm": 2.202497343212584, "kl": 0.04931640625, "learning_rate": 9.999637226044352e-07, "loss": 0.002, "reward": 1.9760937690734863, "reward_std": 0.060630109161138535, "rewards/accuracy_reward": 0.7823437452316284, "rewards/format_reward": 1.0, "step": 278 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.34375, "epoch": 0.0038482227831340256, "grad_norm": 2.1483645409790677, "kl": 0.0439453125, "learning_rate": 9.999634611497585e-07, "loss": 0.0018, "reward": 2.135937452316284, "reward_std": 0.0832315981388092, "rewards/accuracy_reward": 0.948437511920929, "rewards/format_reward": 1.0, "step": 279 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.8125, "epoch": 0.003862015696335223, "grad_norm": 2.417496874935089, "kl": 0.0498046875, "learning_rate": 9.999631987563319e-07, "loss": 0.002, "reward": 2.0570626258850098, "reward_std": 0.04688289389014244, "rewards/accuracy_reward": 0.857062578201294, "rewards/format_reward": 1.0, "step": 280 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 413.3125, "epoch": 0.00387580860953642, "grad_norm": 1.95025615979101, "kl": 0.04541015625, "learning_rate": 9.999629354241556e-07, "loss": 0.0018, "reward": 2.0249063968658447, "reward_std": 0.060103025287389755, "rewards/accuracy_reward": 0.8249062895774841, "rewards/format_reward": 1.0, "step": 281 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.625, "epoch": 0.0038896015227376174, "grad_norm": 2.5190439820102304, "kl": 0.0517578125, "learning_rate": 9.999626711532301e-07, "loss": 0.0021, "reward": 2.060281276702881, "reward_std": 0.05729492008686066, "rewards/accuracy_reward": 0.860281229019165, "rewards/format_reward": 1.0, "step": 282 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.59375, "epoch": 0.0039033944359388147, "grad_norm": 2.455990250729256, "kl": 0.0517578125, "learning_rate": 9.99962405943556e-07, "loss": 0.0021, "reward": 2.0301876068115234, "reward_std": 0.03979829326272011, "rewards/accuracy_reward": 0.8301875591278076, "rewards/format_reward": 1.0, "step": 283 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.375, "epoch": 0.003917187349140012, "grad_norm": 2.208309017099772, "kl": 0.05078125, "learning_rate": 9.999621397951337e-07, "loss": 0.002, "reward": 1.9785938262939453, "reward_std": 0.07059404999017715, "rewards/accuracy_reward": 0.7785937786102295, "rewards/format_reward": 1.0, "step": 284 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.46875, "epoch": 0.003930980262341209, "grad_norm": 2.1968476853910732, "kl": 0.0458984375, "learning_rate": 9.999618727079638e-07, "loss": 0.0018, "reward": 2.0583438873291016, "reward_std": 0.048853449523448944, "rewards/accuracy_reward": 0.8583437204360962, "rewards/format_reward": 1.0, "step": 285 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.15625, "epoch": 0.0039447731755424065, "grad_norm": 1.6202956060079958, "kl": 0.046142578125, "learning_rate": 9.999616046820466e-07, "loss": 0.0018, "reward": 2.0592501163482666, "reward_std": 0.032233476638793945, "rewards/accuracy_reward": 0.859250009059906, "rewards/format_reward": 1.0, "step": 286 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.4375, "epoch": 0.003958566088743604, "grad_norm": 2.0637169765745687, "kl": 0.0498046875, "learning_rate": 9.99961335717383e-07, "loss": 0.002, "reward": 2.1058125495910645, "reward_std": 0.06778968125581741, "rewards/accuracy_reward": 0.9183124303817749, "rewards/format_reward": 1.0, "step": 287 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.96875, "epoch": 0.003972359001944801, "grad_norm": 1.9067833192651293, "kl": 0.045654296875, "learning_rate": 9.99961065813973e-07, "loss": 0.0018, "reward": 1.8669687509536743, "reward_std": 0.08422042429447174, "rewards/accuracy_reward": 0.6732187271118164, "rewards/format_reward": 1.0, "step": 288 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.4375, "epoch": 0.003986151915145998, "grad_norm": 5.707133364027419, "kl": 0.0498046875, "learning_rate": 9.999607949718175e-07, "loss": 0.002, "reward": 2.1269373893737793, "reward_std": 0.06713393330574036, "rewards/accuracy_reward": 0.9331874847412109, "rewards/format_reward": 1.0, "step": 289 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.96875, "epoch": 0.003999944828347196, "grad_norm": 1.9552374066531522, "kl": 0.0439453125, "learning_rate": 9.999605231909166e-07, "loss": 0.0018, "reward": 2.0127501487731934, "reward_std": 0.05280975252389908, "rewards/accuracy_reward": 0.825249969959259, "rewards/format_reward": 1.0, "step": 290 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.84375, "epoch": 0.004013737741548393, "grad_norm": 1.8859779206151355, "kl": 0.0458984375, "learning_rate": 9.99960250471271e-07, "loss": 0.0018, "reward": 1.9792814254760742, "reward_std": 0.03198267146945, "rewards/accuracy_reward": 0.7792812585830688, "rewards/format_reward": 1.0, "step": 291 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 424.40625, "epoch": 0.004027530654749589, "grad_norm": 2.9301732040098734, "kl": 0.04638671875, "learning_rate": 9.999599768128819e-07, "loss": 0.0019, "reward": 2.021250009536743, "reward_std": 0.07541981339454651, "rewards/accuracy_reward": 0.8212499618530273, "rewards/format_reward": 1.0, "step": 292 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.96875, "epoch": 0.0040413235679507865, "grad_norm": 2.170009226320114, "kl": 0.049560546875, "learning_rate": 9.999597022157487e-07, "loss": 0.002, "reward": 2.079031467437744, "reward_std": 0.03810182213783264, "rewards/accuracy_reward": 0.8852812647819519, "rewards/format_reward": 1.0, "step": 293 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.8125, "epoch": 0.004055116481151984, "grad_norm": 2.1153812969661048, "kl": 0.0478515625, "learning_rate": 9.999594266798726e-07, "loss": 0.0019, "reward": 2.0100626945495605, "reward_std": 0.07581374794244766, "rewards/accuracy_reward": 0.8163124918937683, "rewards/format_reward": 1.0, "step": 294 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.75, "epoch": 0.004068909394353181, "grad_norm": 1.5327578299175344, "kl": 0.0419921875, "learning_rate": 9.999591502052539e-07, "loss": 0.0017, "reward": 2.111968755722046, "reward_std": 0.03029218688607216, "rewards/accuracy_reward": 0.9119687676429749, "rewards/format_reward": 1.0, "step": 295 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.3125, "epoch": 0.004082702307554378, "grad_norm": 2.3809873329560984, "kl": 0.044921875, "learning_rate": 9.99958872791893e-07, "loss": 0.0018, "reward": 2.0056562423706055, "reward_std": 0.050386276096105576, "rewards/accuracy_reward": 0.8056562542915344, "rewards/format_reward": 1.0, "step": 296 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.46875, "epoch": 0.004096495220755576, "grad_norm": 2.259090133781256, "kl": 0.04443359375, "learning_rate": 9.999585944397908e-07, "loss": 0.0018, "reward": 1.942562460899353, "reward_std": 0.06388393044471741, "rewards/accuracy_reward": 0.7425624132156372, "rewards/format_reward": 1.0, "step": 297 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.0625, "epoch": 0.004110288133956773, "grad_norm": 2.187301874035118, "kl": 0.039306640625, "learning_rate": 9.999583151489474e-07, "loss": 0.0016, "reward": 2.076937437057495, "reward_std": 0.06230815127491951, "rewards/accuracy_reward": 0.8769375085830688, "rewards/format_reward": 1.0, "step": 298 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.0625, "epoch": 0.00412408104715797, "grad_norm": 2.9053013740819162, "kl": 0.03857421875, "learning_rate": 9.999580349193636e-07, "loss": 0.0016, "reward": 2.093437671661377, "reward_std": 0.06497351825237274, "rewards/accuracy_reward": 0.8996874690055847, "rewards/format_reward": 1.0, "step": 299 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.0041378739603591674, "grad_norm": 2.295198209247636, "kl": 0.04150390625, "learning_rate": 9.999577537510399e-07, "loss": 0.0017, "reward": 2.132187604904175, "reward_std": 0.04672250896692276, "rewards/accuracy_reward": 0.9384374618530273, "rewards/format_reward": 1.0, "step": 300 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.46875, "epoch": 0.004151666873560365, "grad_norm": 2.6954721125938272, "kl": 0.03857421875, "learning_rate": 9.999574716439767e-07, "loss": 0.0015, "reward": 2.0789687633514404, "reward_std": 0.04196634143590927, "rewards/accuracy_reward": 0.8789687752723694, "rewards/format_reward": 1.0, "step": 301 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.25, "epoch": 0.004165459786761562, "grad_norm": 7.385240465238868, "kl": 0.04443359375, "learning_rate": 9.999571885981745e-07, "loss": 0.0018, "reward": 2.0588436126708984, "reward_std": 0.06431737542152405, "rewards/accuracy_reward": 0.871343731880188, "rewards/format_reward": 1.0, "step": 302 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.75, "epoch": 0.004179252699962759, "grad_norm": 2.3460991055713345, "kl": 0.04052734375, "learning_rate": 9.999569046136342e-07, "loss": 0.0016, "reward": 2.053500175476074, "reward_std": 0.044138554483652115, "rewards/accuracy_reward": 0.8535000085830688, "rewards/format_reward": 1.0, "step": 303 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.90625, "epoch": 0.0041930456131639565, "grad_norm": 2.1793114997339096, "kl": 0.03759765625, "learning_rate": 9.999566196903558e-07, "loss": 0.0015, "reward": 2.0028750896453857, "reward_std": 0.05084363371133804, "rewards/accuracy_reward": 0.8028749227523804, "rewards/format_reward": 1.0, "step": 304 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 378.78125, "epoch": 0.004206838526365154, "grad_norm": 2.1235652494656043, "kl": 0.041748046875, "learning_rate": 9.999563338283403e-07, "loss": 0.0017, "reward": 1.9711250066757202, "reward_std": 0.06680046021938324, "rewards/accuracy_reward": 0.7961249947547913, "rewards/format_reward": 1.0, "step": 305 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.03125, "epoch": 0.004220631439566351, "grad_norm": 2.4226850161928932, "kl": 0.0419921875, "learning_rate": 9.99956047027588e-07, "loss": 0.0017, "reward": 2.0967187881469727, "reward_std": 0.04176875203847885, "rewards/accuracy_reward": 0.8967186808586121, "rewards/format_reward": 1.0, "step": 306 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.375, "epoch": 0.004234424352767548, "grad_norm": 2.003513508773817, "kl": 0.03564453125, "learning_rate": 9.999557592880993e-07, "loss": 0.0014, "reward": 2.124812602996826, "reward_std": 0.032197341322898865, "rewards/accuracy_reward": 0.9248125553131104, "rewards/format_reward": 1.0, "step": 307 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.40625, "epoch": 0.004248217265968746, "grad_norm": 1.9352775343816293, "kl": 0.035400390625, "learning_rate": 9.99955470609875e-07, "loss": 0.0014, "reward": 2.034749984741211, "reward_std": 0.038731303066015244, "rewards/accuracy_reward": 0.8347500562667847, "rewards/format_reward": 1.0, "step": 308 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.4375, "epoch": 0.004262010179169943, "grad_norm": 2.302998863040488, "kl": 0.04052734375, "learning_rate": 9.999551809929157e-07, "loss": 0.0016, "reward": 2.0734689235687256, "reward_std": 0.03724667429924011, "rewards/accuracy_reward": 0.8734687566757202, "rewards/format_reward": 1.0, "step": 309 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.004275803092371139, "grad_norm": 2.0008945162862513, "kl": 0.0400390625, "learning_rate": 9.999548904372217e-07, "loss": 0.0016, "reward": 2.108562469482422, "reward_std": 0.040712036192417145, "rewards/accuracy_reward": 0.9148125052452087, "rewards/format_reward": 1.0, "step": 310 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.3125, "epoch": 0.0042895960055723365, "grad_norm": 2.561165946126311, "kl": 0.03857421875, "learning_rate": 9.999545989427936e-07, "loss": 0.0015, "reward": 2.0611250400543213, "reward_std": 0.04077856242656708, "rewards/accuracy_reward": 0.8611249923706055, "rewards/format_reward": 1.0, "step": 311 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.96875, "epoch": 0.004303388918773534, "grad_norm": 2.351894978732953, "kl": 0.041748046875, "learning_rate": 9.999543065096318e-07, "loss": 0.0017, "reward": 1.9397813081741333, "reward_std": 0.07170985639095306, "rewards/accuracy_reward": 0.7460312843322754, "rewards/format_reward": 1.0, "step": 312 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.4375, "epoch": 0.004317181831974731, "grad_norm": 1.9769740402968916, "kl": 0.03759765625, "learning_rate": 9.999540131377374e-07, "loss": 0.0015, "reward": 2.079843759536743, "reward_std": 0.032624490559101105, "rewards/accuracy_reward": 0.8798437714576721, "rewards/format_reward": 1.0, "step": 313 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.75, "epoch": 0.004330974745175928, "grad_norm": 2.7979880124967864, "kl": 0.04345703125, "learning_rate": 9.999537188271105e-07, "loss": 0.0017, "reward": 1.9039688110351562, "reward_std": 0.05473501235246658, "rewards/accuracy_reward": 0.7164687514305115, "rewards/format_reward": 1.0, "step": 314 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.0625, "epoch": 0.004344767658377126, "grad_norm": 2.0207158163206063, "kl": 0.04345703125, "learning_rate": 9.999534235777517e-07, "loss": 0.0017, "reward": 2.02524995803833, "reward_std": 0.06028753146529198, "rewards/accuracy_reward": 0.8315000534057617, "rewards/format_reward": 1.0, "step": 315 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.75, "epoch": 0.004358560571578323, "grad_norm": 2.0737064057327186, "kl": 0.0390625, "learning_rate": 9.999531273896615e-07, "loss": 0.0016, "reward": 1.9908437728881836, "reward_std": 0.07311345636844635, "rewards/accuracy_reward": 0.8158438205718994, "rewards/format_reward": 1.0, "step": 316 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.09375, "epoch": 0.00437235348477952, "grad_norm": 2.2448866082612784, "kl": 0.038818359375, "learning_rate": 9.999528302628408e-07, "loss": 0.0016, "reward": 2.05831241607666, "reward_std": 0.05879085510969162, "rewards/accuracy_reward": 0.8645625114440918, "rewards/format_reward": 1.0, "step": 317 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.0043861463979807174, "grad_norm": 3.378150797647078, "kl": 0.042236328125, "learning_rate": 9.999525321972898e-07, "loss": 0.0017, "reward": 1.940500020980835, "reward_std": 0.04734080284833908, "rewards/accuracy_reward": 0.746749997138977, "rewards/format_reward": 1.0, "step": 318 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.875, "epoch": 0.004399939311181915, "grad_norm": 1.5939155190729113, "kl": 0.037109375, "learning_rate": 9.99952233193009e-07, "loss": 0.0015, "reward": 2.0513124465942383, "reward_std": 0.037394169718027115, "rewards/accuracy_reward": 0.8575624823570251, "rewards/format_reward": 1.0, "step": 319 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.8125, "epoch": 0.004413732224383112, "grad_norm": 2.331498153024052, "kl": 0.043701171875, "learning_rate": 9.999519332499994e-07, "loss": 0.0018, "reward": 2.0784687995910645, "reward_std": 0.02493938058614731, "rewards/accuracy_reward": 0.8784687519073486, "rewards/format_reward": 1.0, "step": 320 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.25, "epoch": 0.004427525137584309, "grad_norm": 2.185404643560563, "kl": 0.048583984375, "learning_rate": 9.999516323682612e-07, "loss": 0.0019, "reward": 1.9275312423706055, "reward_std": 0.08080125600099564, "rewards/accuracy_reward": 0.7400312423706055, "rewards/format_reward": 1.0, "step": 321 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 399.4375, "epoch": 0.0044413180507855065, "grad_norm": 2.0086037914799943, "kl": 0.042724609375, "learning_rate": 9.99951330547795e-07, "loss": 0.0017, "reward": 1.9912811517715454, "reward_std": 0.07223745435476303, "rewards/accuracy_reward": 0.797531247138977, "rewards/format_reward": 1.0, "step": 322 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.75, "epoch": 0.004455110963986704, "grad_norm": 3.5915516694367207, "kl": 0.04638671875, "learning_rate": 9.999510277886015e-07, "loss": 0.0019, "reward": 2.042468786239624, "reward_std": 0.05185335874557495, "rewards/accuracy_reward": 0.8424687385559082, "rewards/format_reward": 1.0, "step": 323 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.03125, "epoch": 0.004468903877187901, "grad_norm": 2.077750974468366, "kl": 0.038818359375, "learning_rate": 9.999507240906812e-07, "loss": 0.0016, "reward": 2.135718822479248, "reward_std": 0.045622505247592926, "rewards/accuracy_reward": 0.9419687390327454, "rewards/format_reward": 1.0, "step": 324 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.53125, "epoch": 0.004482696790389098, "grad_norm": 3.597813350929996, "kl": 0.042236328125, "learning_rate": 9.999504194540346e-07, "loss": 0.0017, "reward": 2.029062509536743, "reward_std": 0.03845563158392906, "rewards/accuracy_reward": 0.8290624618530273, "rewards/format_reward": 1.0, "step": 325 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.125, "epoch": 0.004496489703590296, "grad_norm": 2.6370243583822095, "kl": 0.0458984375, "learning_rate": 9.999501138786624e-07, "loss": 0.0018, "reward": 1.999406337738037, "reward_std": 0.032154206186532974, "rewards/accuracy_reward": 0.7994062900543213, "rewards/format_reward": 1.0, "step": 326 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.375, "epoch": 0.004510282616791493, "grad_norm": 3.7311115387817164, "kl": 0.044189453125, "learning_rate": 9.999498073645652e-07, "loss": 0.0018, "reward": 2.040843963623047, "reward_std": 0.06184101104736328, "rewards/accuracy_reward": 0.8470938205718994, "rewards/format_reward": 1.0, "step": 327 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.78125, "epoch": 0.00452407552999269, "grad_norm": 2.6801280911897805, "kl": 0.052490234375, "learning_rate": 9.999494999117433e-07, "loss": 0.0021, "reward": 2.0666344165802, "reward_std": 0.03997699171304703, "rewards/accuracy_reward": 0.8666344285011292, "rewards/format_reward": 1.0, "step": 328 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.78125, "epoch": 0.0045378684431938866, "grad_norm": 5.340731461425449, "kl": 0.0419921875, "learning_rate": 9.999491915201976e-07, "loss": 0.0017, "reward": 2.1009063720703125, "reward_std": 0.04983200132846832, "rewards/accuracy_reward": 0.9009062051773071, "rewards/format_reward": 1.0, "step": 329 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.75, "epoch": 0.004551661356395084, "grad_norm": 2.1024504570922193, "kl": 0.044677734375, "learning_rate": 9.999488821899285e-07, "loss": 0.0018, "reward": 2.0531249046325684, "reward_std": 0.05571488291025162, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 330 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.625, "epoch": 0.004565454269596281, "grad_norm": 1.9024488830001074, "kl": 0.04345703125, "learning_rate": 9.999485719209366e-07, "loss": 0.0017, "reward": 2.124000072479248, "reward_std": 0.04417997598648071, "rewards/accuracy_reward": 0.9239999651908875, "rewards/format_reward": 1.0, "step": 331 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.5, "epoch": 0.004579247182797478, "grad_norm": 2.763695348323022, "kl": 0.04296875, "learning_rate": 9.999482607132226e-07, "loss": 0.0017, "reward": 2.043750047683716, "reward_std": 0.05617400258779526, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 332 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.8125, "epoch": 0.004593040095998676, "grad_norm": 2.3001319702998178, "kl": 0.04345703125, "learning_rate": 9.99947948566787e-07, "loss": 0.0017, "reward": 1.8749687671661377, "reward_std": 0.07037438452243805, "rewards/accuracy_reward": 0.6749687790870667, "rewards/format_reward": 1.0, "step": 333 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.90625, "epoch": 0.004606833009199873, "grad_norm": 2.268465115773429, "kl": 0.04345703125, "learning_rate": 9.999476354816304e-07, "loss": 0.0017, "reward": 2.1285624504089355, "reward_std": 0.026720412075519562, "rewards/accuracy_reward": 0.9285624027252197, "rewards/format_reward": 1.0, "step": 334 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.5625, "epoch": 0.00462062592240107, "grad_norm": 2.4107656291581705, "kl": 0.03955078125, "learning_rate": 9.99947321457753e-07, "loss": 0.0016, "reward": 2.097562313079834, "reward_std": 0.06343509256839752, "rewards/accuracy_reward": 0.897562563419342, "rewards/format_reward": 1.0, "step": 335 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.53125, "epoch": 0.0046344188356022675, "grad_norm": 2.1513955728976346, "kl": 0.04150390625, "learning_rate": 9.999470064951562e-07, "loss": 0.0017, "reward": 2.031625270843506, "reward_std": 0.035630617290735245, "rewards/accuracy_reward": 0.8378750681877136, "rewards/format_reward": 1.0, "step": 336 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.875, "epoch": 0.004648211748803465, "grad_norm": 2.4460731986358324, "kl": 0.043701171875, "learning_rate": 9.9994669059384e-07, "loss": 0.0018, "reward": 2.067187786102295, "reward_std": 0.027138758450746536, "rewards/accuracy_reward": 0.8671875, "rewards/format_reward": 1.0, "step": 337 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.25, "epoch": 0.004662004662004662, "grad_norm": 1.9375163075273105, "kl": 0.03857421875, "learning_rate": 9.999463737538052e-07, "loss": 0.0015, "reward": 2.0592501163482666, "reward_std": 0.059936925768852234, "rewards/accuracy_reward": 0.859250009059906, "rewards/format_reward": 1.0, "step": 338 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.0, "epoch": 0.004675797575205859, "grad_norm": 2.1572194579167627, "kl": 0.043701171875, "learning_rate": 9.999460559750523e-07, "loss": 0.0017, "reward": 2.0362813472747803, "reward_std": 0.05789932236075401, "rewards/accuracy_reward": 0.8487812280654907, "rewards/format_reward": 1.0, "step": 339 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.0046895904884070565, "grad_norm": 2.231367884501955, "kl": 0.044677734375, "learning_rate": 9.999457372575818e-07, "loss": 0.0018, "reward": 1.9827499389648438, "reward_std": 0.05621706694364548, "rewards/accuracy_reward": 0.7827500104904175, "rewards/format_reward": 1.0, "step": 340 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.75, "epoch": 0.004703383401608254, "grad_norm": 1.8589563338021162, "kl": 0.039306640625, "learning_rate": 9.999454176013947e-07, "loss": 0.0016, "reward": 1.9488749504089355, "reward_std": 0.03878430649638176, "rewards/accuracy_reward": 0.7488750219345093, "rewards/format_reward": 1.0, "step": 341 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.53125, "epoch": 0.004717176314809451, "grad_norm": 2.380589736078398, "kl": 0.043212890625, "learning_rate": 9.999450970064912e-07, "loss": 0.0017, "reward": 2.026750087738037, "reward_std": 0.047637972980737686, "rewards/accuracy_reward": 0.8267499804496765, "rewards/format_reward": 1.0, "step": 342 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.9375, "epoch": 0.004730969228010648, "grad_norm": 2.577813399230479, "kl": 0.0419921875, "learning_rate": 9.99944775472872e-07, "loss": 0.0017, "reward": 2.017937660217285, "reward_std": 0.0620587095618248, "rewards/accuracy_reward": 0.8241875171661377, "rewards/format_reward": 1.0, "step": 343 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.5, "epoch": 0.004744762141211846, "grad_norm": 2.0542204616794226, "kl": 0.0390625, "learning_rate": 9.999444530005378e-07, "loss": 0.0016, "reward": 2.076906204223633, "reward_std": 0.0624992698431015, "rewards/accuracy_reward": 0.8769062757492065, "rewards/format_reward": 1.0, "step": 344 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.96875, "epoch": 0.004758555054413043, "grad_norm": 2.174883181770073, "kl": 0.045654296875, "learning_rate": 9.99944129589489e-07, "loss": 0.0018, "reward": 2.1073126792907715, "reward_std": 0.03396311402320862, "rewards/accuracy_reward": 0.9073124527931213, "rewards/format_reward": 1.0, "step": 345 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.375, "epoch": 0.00477234796761424, "grad_norm": 1.4385540274746251, "kl": 0.048828125, "learning_rate": 9.999438052397266e-07, "loss": 0.0019, "reward": 2.12890625, "reward_std": 0.01819523051381111, "rewards/accuracy_reward": 0.9289062023162842, "rewards/format_reward": 1.0, "step": 346 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.53125, "epoch": 0.0047861408808154374, "grad_norm": 2.0095998521417147, "kl": 0.0419921875, "learning_rate": 9.999434799512506e-07, "loss": 0.0017, "reward": 2.0420937538146973, "reward_std": 0.051832351833581924, "rewards/accuracy_reward": 0.8483437299728394, "rewards/format_reward": 1.0, "step": 347 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.004799933794016634, "grad_norm": 1.8658504636043052, "kl": 0.039794921875, "learning_rate": 9.999431537240623e-07, "loss": 0.0016, "reward": 2.1078126430511475, "reward_std": 0.03130917623639107, "rewards/accuracy_reward": 0.9078124761581421, "rewards/format_reward": 1.0, "step": 348 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.375, "epoch": 0.004813726707217831, "grad_norm": 2.259989569398058, "kl": 0.0458984375, "learning_rate": 9.99942826558162e-07, "loss": 0.0018, "reward": 2.055500030517578, "reward_std": 0.034889351576566696, "rewards/accuracy_reward": 0.8555000424385071, "rewards/format_reward": 1.0, "step": 349 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.0625, "epoch": 0.004827519620419028, "grad_norm": 2.6212407829420052, "kl": 0.048095703125, "learning_rate": 9.9994249845355e-07, "loss": 0.0019, "reward": 2.0044689178466797, "reward_std": 0.043663956224918365, "rewards/accuracy_reward": 0.8107187747955322, "rewards/format_reward": 1.0, "step": 350 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.0625, "epoch": 0.004841312533620226, "grad_norm": 1.6811831343354653, "kl": 0.04345703125, "learning_rate": 9.999421694102275e-07, "loss": 0.0017, "reward": 2.1256561279296875, "reward_std": 0.027446215972304344, "rewards/accuracy_reward": 0.9256561994552612, "rewards/format_reward": 1.0, "step": 351 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 439.3125, "epoch": 0.004855105446821423, "grad_norm": 2.7644306360908684, "kl": 0.05419921875, "learning_rate": 9.999418394281946e-07, "loss": 0.0022, "reward": 2.0047812461853027, "reward_std": 0.06064669042825699, "rewards/accuracy_reward": 0.8172812461853027, "rewards/format_reward": 1.0, "step": 352 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 452.3125, "epoch": 0.00486889836002262, "grad_norm": 2.1979924323956057, "kl": 0.04638671875, "learning_rate": 9.999415085074523e-07, "loss": 0.0019, "reward": 1.979468822479248, "reward_std": 0.06920531392097473, "rewards/accuracy_reward": 0.7919687628746033, "rewards/format_reward": 1.0, "step": 353 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.625, "epoch": 0.0048826912732238175, "grad_norm": 1.9030704052930756, "kl": 0.0380859375, "learning_rate": 9.99941176648001e-07, "loss": 0.0015, "reward": 1.8941562175750732, "reward_std": 0.050667401403188705, "rewards/accuracy_reward": 0.7004062533378601, "rewards/format_reward": 1.0, "step": 354 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 453.40625, "epoch": 0.004896484186425015, "grad_norm": 1.9964212543638618, "kl": 0.046875, "learning_rate": 9.999408438498415e-07, "loss": 0.0019, "reward": 1.894718885421753, "reward_std": 0.06702862679958344, "rewards/accuracy_reward": 0.7134687304496765, "rewards/format_reward": 1.0, "step": 355 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 443.5, "epoch": 0.004910277099626212, "grad_norm": 4.939967998037341, "kl": 0.0458984375, "learning_rate": 9.999405101129742e-07, "loss": 0.0018, "reward": 1.9465625286102295, "reward_std": 0.03200867399573326, "rewards/accuracy_reward": 0.7528125047683716, "rewards/format_reward": 1.0, "step": 356 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.375, "epoch": 0.004924070012827409, "grad_norm": 2.501571141337024, "kl": 0.050048828125, "learning_rate": 9.999401754373999e-07, "loss": 0.002, "reward": 2.0984063148498535, "reward_std": 0.037641845643520355, "rewards/accuracy_reward": 0.8984062671661377, "rewards/format_reward": 1.0, "step": 357 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.8125, "epoch": 0.0049378629260286066, "grad_norm": 2.255679817205866, "kl": 0.0498046875, "learning_rate": 9.999398398231195e-07, "loss": 0.002, "reward": 1.9163438081741333, "reward_std": 0.049837008118629456, "rewards/accuracy_reward": 0.7288438081741333, "rewards/format_reward": 1.0, "step": 358 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.5625, "epoch": 0.004951655839229804, "grad_norm": 2.2703938311771057, "kl": 0.0439453125, "learning_rate": 9.999395032701328e-07, "loss": 0.0018, "reward": 1.9667813777923584, "reward_std": 0.04559829831123352, "rewards/accuracy_reward": 0.766781210899353, "rewards/format_reward": 1.0, "step": 359 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.375, "epoch": 0.004965448752431001, "grad_norm": 1.8878858364150628, "kl": 0.044677734375, "learning_rate": 9.99939165778441e-07, "loss": 0.0018, "reward": 2.121593713760376, "reward_std": 0.05778314173221588, "rewards/accuracy_reward": 0.9403437972068787, "rewards/format_reward": 1.0, "step": 360 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.5625, "epoch": 0.004979241665632198, "grad_norm": 2.219397048364965, "kl": 0.043701171875, "learning_rate": 9.999388273480449e-07, "loss": 0.0018, "reward": 2.0616250038146973, "reward_std": 0.02956487610936165, "rewards/accuracy_reward": 0.8616249561309814, "rewards/format_reward": 1.0, "step": 361 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.90625, "epoch": 0.004993034578833396, "grad_norm": 2.568471466949617, "kl": 0.043212890625, "learning_rate": 9.999384879789447e-07, "loss": 0.0017, "reward": 1.998687505722046, "reward_std": 0.0748109221458435, "rewards/accuracy_reward": 0.8111875057220459, "rewards/format_reward": 1.0, "step": 362 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.8125, "epoch": 0.005006827492034593, "grad_norm": 2.2353484256112996, "kl": 0.047607421875, "learning_rate": 9.999381476711414e-07, "loss": 0.0019, "reward": 2.039968967437744, "reward_std": 0.052057087421417236, "rewards/accuracy_reward": 0.8462187051773071, "rewards/format_reward": 1.0, "step": 363 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.90625, "epoch": 0.00502062040523579, "grad_norm": 2.013509558162626, "kl": 0.044921875, "learning_rate": 9.999378064246355e-07, "loss": 0.0018, "reward": 2.121593713760376, "reward_std": 0.0262024886906147, "rewards/accuracy_reward": 0.9215937256813049, "rewards/format_reward": 1.0, "step": 364 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.53125, "epoch": 0.0050344133184369875, "grad_norm": 2.4318810132785944, "kl": 0.04736328125, "learning_rate": 9.999374642394275e-07, "loss": 0.0019, "reward": 2.132781505584717, "reward_std": 0.040902525186538696, "rewards/accuracy_reward": 0.9390312433242798, "rewards/format_reward": 1.0, "step": 365 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.40625, "epoch": 0.005048206231638185, "grad_norm": 2.2564770978193964, "kl": 0.038818359375, "learning_rate": 9.999371211155184e-07, "loss": 0.0016, "reward": 2.109250068664551, "reward_std": 0.038611844182014465, "rewards/accuracy_reward": 0.9092499613761902, "rewards/format_reward": 1.0, "step": 366 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.5625, "epoch": 0.005061999144839381, "grad_norm": 2.1334353736703653, "kl": 0.04150390625, "learning_rate": 9.999367770529083e-07, "loss": 0.0017, "reward": 1.989531397819519, "reward_std": 0.041811294853687286, "rewards/accuracy_reward": 0.7895312309265137, "rewards/format_reward": 1.0, "step": 367 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.1875, "epoch": 0.005075792058040578, "grad_norm": 3.16430389961693, "kl": 0.03955078125, "learning_rate": 9.999364320515982e-07, "loss": 0.0016, "reward": 2.0073437690734863, "reward_std": 0.04476203769445419, "rewards/accuracy_reward": 0.8073437213897705, "rewards/format_reward": 1.0, "step": 368 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.375, "epoch": 0.005089584971241776, "grad_norm": 8.463507163029279, "kl": 0.03662109375, "learning_rate": 9.99936086111589e-07, "loss": 0.0015, "reward": 1.9731249809265137, "reward_std": 0.07198931276798248, "rewards/accuracy_reward": 0.7918750047683716, "rewards/format_reward": 1.0, "step": 369 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.375, "epoch": 0.005103377884442973, "grad_norm": 2.1602753956462197, "kl": 0.03955078125, "learning_rate": 9.999357392328806e-07, "loss": 0.0016, "reward": 1.9954688549041748, "reward_std": 0.06441701948642731, "rewards/accuracy_reward": 0.8079687356948853, "rewards/format_reward": 1.0, "step": 370 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.65625, "epoch": 0.00511717079764417, "grad_norm": 2.1322790083067895, "kl": 0.04296875, "learning_rate": 9.999353914154744e-07, "loss": 0.0017, "reward": 2.1139376163482666, "reward_std": 0.03282049298286438, "rewards/accuracy_reward": 0.9139375686645508, "rewards/format_reward": 1.0, "step": 371 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 393.5, "epoch": 0.0051309637108453675, "grad_norm": 2.271743717303218, "kl": 0.04296875, "learning_rate": 9.999350426593707e-07, "loss": 0.0017, "reward": 2.0196876525878906, "reward_std": 0.0655524730682373, "rewards/accuracy_reward": 0.8384374976158142, "rewards/format_reward": 1.0, "step": 372 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.125, "epoch": 0.005144756624046565, "grad_norm": 2.2815111538786086, "kl": 0.041015625, "learning_rate": 9.999346929645704e-07, "loss": 0.0016, "reward": 2.080031394958496, "reward_std": 0.03502483665943146, "rewards/accuracy_reward": 0.8800312876701355, "rewards/format_reward": 1.0, "step": 373 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.84375, "epoch": 0.005158549537247762, "grad_norm": 2.2168277123145987, "kl": 0.0439453125, "learning_rate": 9.999343423310738e-07, "loss": 0.0018, "reward": 1.9021875858306885, "reward_std": 0.060183122754096985, "rewards/accuracy_reward": 0.7084375023841858, "rewards/format_reward": 1.0, "step": 374 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.65625, "epoch": 0.005172342450448959, "grad_norm": 4.977231438046553, "kl": 0.040771484375, "learning_rate": 9.999339907588816e-07, "loss": 0.0016, "reward": 1.981562614440918, "reward_std": 0.04548982158303261, "rewards/accuracy_reward": 0.7878125309944153, "rewards/format_reward": 1.0, "step": 375 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.0, "epoch": 0.0051861353636501566, "grad_norm": 2.0692604557532834, "kl": 0.0439453125, "learning_rate": 9.999336382479947e-07, "loss": 0.0018, "reward": 2.0643439292907715, "reward_std": 0.04345225915312767, "rewards/accuracy_reward": 0.8643437623977661, "rewards/format_reward": 1.0, "step": 376 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.84375, "epoch": 0.005199928276851354, "grad_norm": 3.214504459599678, "kl": 0.04248046875, "learning_rate": 9.999332847984137e-07, "loss": 0.0017, "reward": 1.96303129196167, "reward_std": 0.07805295288562775, "rewards/accuracy_reward": 0.769281268119812, "rewards/format_reward": 1.0, "step": 377 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.34375, "epoch": 0.005213721190052551, "grad_norm": 3.1058204816038035, "kl": 0.041015625, "learning_rate": 9.999329304101393e-07, "loss": 0.0016, "reward": 2.0320000648498535, "reward_std": 0.043971847742795944, "rewards/accuracy_reward": 0.8320000171661377, "rewards/format_reward": 1.0, "step": 378 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.25, "epoch": 0.005227514103253748, "grad_norm": 11.441206366840744, "kl": 0.041259765625, "learning_rate": 9.99932575083172e-07, "loss": 0.0017, "reward": 2.026218891143799, "reward_std": 0.03904066979885101, "rewards/accuracy_reward": 0.8262187242507935, "rewards/format_reward": 1.0, "step": 379 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.9375, "epoch": 0.005241307016454946, "grad_norm": 3.050170450819848, "kl": 0.040283203125, "learning_rate": 9.999322188175123e-07, "loss": 0.0016, "reward": 2.0570626258850098, "reward_std": 0.03695041313767433, "rewards/accuracy_reward": 0.8570624589920044, "rewards/format_reward": 1.0, "step": 380 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.8125, "epoch": 0.005255099929656143, "grad_norm": 3.2452300025301293, "kl": 0.044921875, "learning_rate": 9.999318616131614e-07, "loss": 0.0018, "reward": 2.020031213760376, "reward_std": 0.04126669466495514, "rewards/accuracy_reward": 0.8200312852859497, "rewards/format_reward": 1.0, "step": 381 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.40625, "epoch": 0.00526889284285734, "grad_norm": 2.584916142265051, "kl": 0.0419921875, "learning_rate": 9.999315034701196e-07, "loss": 0.0017, "reward": 2.0535311698913574, "reward_std": 0.022014515474438667, "rewards/accuracy_reward": 0.8535313010215759, "rewards/format_reward": 1.0, "step": 382 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.5625, "epoch": 0.0052826857560585375, "grad_norm": 2.9664971784253353, "kl": 0.04150390625, "learning_rate": 9.999311443883877e-07, "loss": 0.0017, "reward": 2.0848751068115234, "reward_std": 0.040109068155288696, "rewards/accuracy_reward": 0.891124963760376, "rewards/format_reward": 1.0, "step": 383 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.1875, "epoch": 0.005296478669259735, "grad_norm": 2.664493953814492, "kl": 0.03955078125, "learning_rate": 9.999307843679664e-07, "loss": 0.0016, "reward": 2.1409687995910645, "reward_std": 0.02972692996263504, "rewards/accuracy_reward": 0.9409687519073486, "rewards/format_reward": 1.0, "step": 384 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.90625, "epoch": 0.005310271582460932, "grad_norm": 2.3277999910080545, "kl": 0.03955078125, "learning_rate": 9.999304234088561e-07, "loss": 0.0016, "reward": 2.0722498893737793, "reward_std": 0.025868508964776993, "rewards/accuracy_reward": 0.872249960899353, "rewards/format_reward": 1.0, "step": 385 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.15625, "epoch": 0.005324064495662128, "grad_norm": 2.252197062196045, "kl": 0.046875, "learning_rate": 9.999300615110578e-07, "loss": 0.0019, "reward": 2.1024062633514404, "reward_std": 0.035415079444646835, "rewards/accuracy_reward": 0.9024063348770142, "rewards/format_reward": 1.0, "step": 386 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.1875, "epoch": 0.005337857408863326, "grad_norm": 2.2820609701546597, "kl": 0.0419921875, "learning_rate": 9.99929698674572e-07, "loss": 0.0017, "reward": 2.0091562271118164, "reward_std": 0.05368387699127197, "rewards/accuracy_reward": 0.8154062628746033, "rewards/format_reward": 1.0, "step": 387 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.28125, "epoch": 0.005351650322064523, "grad_norm": 2.182767295668779, "kl": 0.041259765625, "learning_rate": 9.999293348993995e-07, "loss": 0.0017, "reward": 2.0722813606262207, "reward_std": 0.02592349424958229, "rewards/accuracy_reward": 0.8722812533378601, "rewards/format_reward": 1.0, "step": 388 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.5, "epoch": 0.00536544323526572, "grad_norm": 2.582314560405083, "kl": 0.04150390625, "learning_rate": 9.999289701855409e-07, "loss": 0.0017, "reward": 1.9242812395095825, "reward_std": 0.050751689821481705, "rewards/accuracy_reward": 0.7305312156677246, "rewards/format_reward": 1.0, "step": 389 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.0053792361484669175, "grad_norm": 3.4203531627122596, "kl": 0.04150390625, "learning_rate": 9.99928604532997e-07, "loss": 0.0017, "reward": 2.04087495803833, "reward_std": 0.06903257966041565, "rewards/accuracy_reward": 0.859624981880188, "rewards/format_reward": 1.0, "step": 390 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.34375, "epoch": 0.005393029061668115, "grad_norm": 13.1145601444951, "kl": 0.04638671875, "learning_rate": 9.999282379417685e-07, "loss": 0.0019, "reward": 2.032437562942505, "reward_std": 0.04002109915018082, "rewards/accuracy_reward": 0.8324375152587891, "rewards/format_reward": 1.0, "step": 391 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.25, "epoch": 0.005406821974869312, "grad_norm": 2.993019833017041, "kl": 0.05029296875, "learning_rate": 9.999278704118556e-07, "loss": 0.002, "reward": 2.044187545776367, "reward_std": 0.039664123207330704, "rewards/accuracy_reward": 0.8441874980926514, "rewards/format_reward": 1.0, "step": 392 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.5625, "epoch": 0.005420614888070509, "grad_norm": 2.613621488610164, "kl": 0.044189453125, "learning_rate": 9.999275019432598e-07, "loss": 0.0018, "reward": 2.0289688110351562, "reward_std": 0.04123665764927864, "rewards/accuracy_reward": 0.8289687633514404, "rewards/format_reward": 1.0, "step": 393 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.46875, "epoch": 0.005434407801271707, "grad_norm": 2.747261864536508, "kl": 0.04638671875, "learning_rate": 9.999271325359812e-07, "loss": 0.0019, "reward": 2.077437400817871, "reward_std": 0.04278936609625816, "rewards/accuracy_reward": 0.8836874961853027, "rewards/format_reward": 1.0, "step": 394 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.34375, "epoch": 0.005448200714472904, "grad_norm": 2.764727325870423, "kl": 0.053466796875, "learning_rate": 9.999267621900207e-07, "loss": 0.0021, "reward": 2.0797812938690186, "reward_std": 0.03863271698355675, "rewards/accuracy_reward": 0.8797813057899475, "rewards/format_reward": 1.0, "step": 395 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.625, "epoch": 0.005461993627674101, "grad_norm": 3.18083080677714, "kl": 0.048828125, "learning_rate": 9.999263909053789e-07, "loss": 0.002, "reward": 2.1059374809265137, "reward_std": 0.05786753445863724, "rewards/accuracy_reward": 0.9184374809265137, "rewards/format_reward": 1.0, "step": 396 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 446.15625, "epoch": 0.005475786540875298, "grad_norm": 2.755189236401332, "kl": 0.04736328125, "learning_rate": 9.999260186820564e-07, "loss": 0.0019, "reward": 2.053593635559082, "reward_std": 0.06484928727149963, "rewards/accuracy_reward": 0.8723437786102295, "rewards/format_reward": 1.0, "step": 397 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.005489579454076496, "grad_norm": 2.3753921659387363, "kl": 0.048828125, "learning_rate": 9.999256455200544e-07, "loss": 0.002, "reward": 1.9841562509536743, "reward_std": 0.032505325973033905, "rewards/accuracy_reward": 0.7841562032699585, "rewards/format_reward": 1.0, "step": 398 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.005503372367277693, "grad_norm": 27.35305851641293, "kl": 0.04833984375, "learning_rate": 9.99925271419373e-07, "loss": 0.0019, "reward": 2.0696873664855957, "reward_std": 0.03292178362607956, "rewards/accuracy_reward": 0.869687557220459, "rewards/format_reward": 1.0, "step": 399 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.6875, "epoch": 0.00551716528047889, "grad_norm": 2.285216972280409, "kl": 0.04931640625, "learning_rate": 9.999248963800133e-07, "loss": 0.002, "reward": 2.0543124675750732, "reward_std": 0.04364452138543129, "rewards/accuracy_reward": 0.8605625033378601, "rewards/format_reward": 1.0, "step": 400 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.5625, "epoch": 0.0055309581936800875, "grad_norm": 2.3404213310167585, "kl": 0.0478515625, "learning_rate": 9.999245204019757e-07, "loss": 0.0019, "reward": 2.086937427520752, "reward_std": 0.0445425920188427, "rewards/accuracy_reward": 0.8931875228881836, "rewards/format_reward": 1.0, "step": 401 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.8125, "epoch": 0.005544751106881285, "grad_norm": 2.484126481563818, "kl": 0.055908203125, "learning_rate": 9.999241434852613e-07, "loss": 0.0022, "reward": 1.904750108718872, "reward_std": 0.055736370384693146, "rewards/accuracy_reward": 0.7172499895095825, "rewards/format_reward": 1.0, "step": 402 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.3125, "epoch": 0.005558544020082482, "grad_norm": 2.144216494759608, "kl": 0.04638671875, "learning_rate": 9.999237656298705e-07, "loss": 0.0019, "reward": 2.0600311756134033, "reward_std": 0.04742245748639107, "rewards/accuracy_reward": 0.866281270980835, "rewards/format_reward": 1.0, "step": 403 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.5625, "epoch": 0.005572336933283678, "grad_norm": 2.560120333611293, "kl": 0.051513671875, "learning_rate": 9.999233868358041e-07, "loss": 0.0021, "reward": 2.1044373512268066, "reward_std": 0.04191218316555023, "rewards/accuracy_reward": 0.9044374823570251, "rewards/format_reward": 1.0, "step": 404 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 430.03125, "epoch": 0.005586129846484876, "grad_norm": 2.395959713089716, "kl": 0.048095703125, "learning_rate": 9.999230071030626e-07, "loss": 0.0019, "reward": 2.0335311889648438, "reward_std": 0.0669226422905922, "rewards/accuracy_reward": 0.8460313081741333, "rewards/format_reward": 1.0, "step": 405 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.4375, "epoch": 0.005599922759686073, "grad_norm": 2.3524984561926985, "kl": 0.053466796875, "learning_rate": 9.999226264316472e-07, "loss": 0.0021, "reward": 2.025937557220459, "reward_std": 0.03249998390674591, "rewards/accuracy_reward": 0.8259375095367432, "rewards/format_reward": 1.0, "step": 406 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.5625, "epoch": 0.00561371567288727, "grad_norm": 2.5520112059443054, "kl": 0.0478515625, "learning_rate": 9.999222448215582e-07, "loss": 0.0019, "reward": 1.997406244277954, "reward_std": 0.06406331062316895, "rewards/accuracy_reward": 0.7974063158035278, "rewards/format_reward": 1.0, "step": 407 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.25, "epoch": 0.0056275085860884675, "grad_norm": 2.472502912832956, "kl": 0.0498046875, "learning_rate": 9.999218622727963e-07, "loss": 0.002, "reward": 2.0297188758850098, "reward_std": 0.038563478738069534, "rewards/accuracy_reward": 0.8297187089920044, "rewards/format_reward": 1.0, "step": 408 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.6875, "epoch": 0.005641301499289665, "grad_norm": 2.3682474499912076, "kl": 0.04541015625, "learning_rate": 9.999214787853625e-07, "loss": 0.0018, "reward": 2.0352814197540283, "reward_std": 0.04013541340827942, "rewards/accuracy_reward": 0.8477813005447388, "rewards/format_reward": 1.0, "step": 409 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 0.005655094412490862, "grad_norm": 3.2076033592997977, "kl": 0.047607421875, "learning_rate": 9.999210943592576e-07, "loss": 0.0019, "reward": 1.9645936489105225, "reward_std": 0.06348051130771637, "rewards/accuracy_reward": 0.7708437442779541, "rewards/format_reward": 1.0, "step": 410 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.71875, "epoch": 0.005668887325692059, "grad_norm": 2.212895591864534, "kl": 0.051025390625, "learning_rate": 9.999207089944818e-07, "loss": 0.002, "reward": 2.05790638923645, "reward_std": 0.06455381959676743, "rewards/accuracy_reward": 0.8579062223434448, "rewards/format_reward": 1.0, "step": 411 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.9375, "epoch": 0.005682680238893257, "grad_norm": 3.206891930895078, "kl": 0.051513671875, "learning_rate": 9.999203226910361e-07, "loss": 0.0021, "reward": 1.9501874446868896, "reward_std": 0.05105041712522507, "rewards/accuracy_reward": 0.7501874566078186, "rewards/format_reward": 1.0, "step": 412 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.125, "epoch": 0.005696473152094454, "grad_norm": 2.9394324763953406, "kl": 0.050048828125, "learning_rate": 9.999199354489215e-07, "loss": 0.002, "reward": 2.016031265258789, "reward_std": 0.07761611044406891, "rewards/accuracy_reward": 0.8285312652587891, "rewards/format_reward": 1.0, "step": 413 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.3125, "epoch": 0.005710266065295651, "grad_norm": 2.5975645006089283, "kl": 0.048828125, "learning_rate": 9.999195472681383e-07, "loss": 0.002, "reward": 2.047968864440918, "reward_std": 0.038813669234514236, "rewards/accuracy_reward": 0.8479686975479126, "rewards/format_reward": 1.0, "step": 414 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.4375, "epoch": 0.005724058978496848, "grad_norm": 2.988920329252644, "kl": 0.046142578125, "learning_rate": 9.999191581486875e-07, "loss": 0.0018, "reward": 2.096531391143799, "reward_std": 0.047278836369514465, "rewards/accuracy_reward": 0.9027812480926514, "rewards/format_reward": 1.0, "step": 415 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 373.28125, "epoch": 0.005737851891698046, "grad_norm": 2.2908968722017558, "kl": 0.050048828125, "learning_rate": 9.999187680905697e-07, "loss": 0.002, "reward": 2.0340938568115234, "reward_std": 0.05671568959951401, "rewards/accuracy_reward": 0.8465937376022339, "rewards/format_reward": 1.0, "step": 416 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.75, "epoch": 0.005751644804899243, "grad_norm": 2.799105331097807, "kl": 0.049560546875, "learning_rate": 9.999183770937856e-07, "loss": 0.002, "reward": 2.045562505722046, "reward_std": 0.04838100075721741, "rewards/accuracy_reward": 0.8455624580383301, "rewards/format_reward": 1.0, "step": 417 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.90625, "epoch": 0.00576543771810044, "grad_norm": 7.657015311901752, "kl": 0.051513671875, "learning_rate": 9.99917985158336e-07, "loss": 0.0021, "reward": 1.9908125400543213, "reward_std": 0.0635077953338623, "rewards/accuracy_reward": 0.8033124804496765, "rewards/format_reward": 1.0, "step": 418 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.8125, "epoch": 0.0057792306313016375, "grad_norm": 17.26965223929346, "kl": 0.04736328125, "learning_rate": 9.999175922842217e-07, "loss": 0.0019, "reward": 2.061718702316284, "reward_std": 0.05310092866420746, "rewards/accuracy_reward": 0.867968738079071, "rewards/format_reward": 1.0, "step": 419 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.0, "epoch": 0.005793023544502835, "grad_norm": 2.4078115494238856, "kl": 0.04833984375, "learning_rate": 9.999171984714434e-07, "loss": 0.0019, "reward": 2.0330312252044678, "reward_std": 0.05156610161066055, "rewards/accuracy_reward": 0.8330312967300415, "rewards/format_reward": 1.0, "step": 420 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.96875, "epoch": 0.005806816457704032, "grad_norm": 3.8663835271196727, "kl": 0.049072265625, "learning_rate": 9.99916803720002e-07, "loss": 0.002, "reward": 1.8866875171661377, "reward_std": 0.0813208669424057, "rewards/accuracy_reward": 0.7054375410079956, "rewards/format_reward": 1.0, "step": 421 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.0625, "epoch": 0.005820609370905229, "grad_norm": 2.4911639617772825, "kl": 0.0498046875, "learning_rate": 9.999164080298977e-07, "loss": 0.002, "reward": 2.026343822479248, "reward_std": 0.0677998960018158, "rewards/accuracy_reward": 0.8450937271118164, "rewards/format_reward": 1.0, "step": 422 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.25, "epoch": 0.005834402284106426, "grad_norm": 2.259744774870326, "kl": 0.048828125, "learning_rate": 9.999160114011318e-07, "loss": 0.0019, "reward": 1.9805938005447388, "reward_std": 0.050225235521793365, "rewards/accuracy_reward": 0.7868437170982361, "rewards/format_reward": 1.0, "step": 423 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.28125, "epoch": 0.005848195197307623, "grad_norm": 2.55697890877962, "kl": 0.053955078125, "learning_rate": 9.999156138337048e-07, "loss": 0.0022, "reward": 2.0245938301086426, "reward_std": 0.06332585215568542, "rewards/accuracy_reward": 0.8308436870574951, "rewards/format_reward": 1.0, "step": 424 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.625, "epoch": 0.00586198811050882, "grad_norm": 3.5955161814072785, "kl": 0.05859375, "learning_rate": 9.999152153276175e-07, "loss": 0.0023, "reward": 2.0122499465942383, "reward_std": 0.04555346816778183, "rewards/accuracy_reward": 0.812250018119812, "rewards/format_reward": 1.0, "step": 425 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.0, "epoch": 0.0058757810237100175, "grad_norm": 3.9940446175369098, "kl": 0.04638671875, "learning_rate": 9.999148158828707e-07, "loss": 0.0019, "reward": 2.091156244277954, "reward_std": 0.039717383682727814, "rewards/accuracy_reward": 0.8911562561988831, "rewards/format_reward": 1.0, "step": 426 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.84375, "epoch": 0.005889573936911215, "grad_norm": 1.9947022310523848, "kl": 0.0517578125, "learning_rate": 9.999144154994649e-07, "loss": 0.0021, "reward": 2.0534374713897705, "reward_std": 0.055101994425058365, "rewards/accuracy_reward": 0.8596875071525574, "rewards/format_reward": 1.0, "step": 427 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.75, "epoch": 0.005903366850112412, "grad_norm": 2.2748465518839103, "kl": 0.049072265625, "learning_rate": 9.999140141774013e-07, "loss": 0.002, "reward": 2.068281412124634, "reward_std": 0.0704391598701477, "rewards/accuracy_reward": 0.8807811737060547, "rewards/format_reward": 1.0, "step": 428 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.0625, "epoch": 0.005917159763313609, "grad_norm": 2.1250621120886413, "kl": 0.05224609375, "learning_rate": 9.999136119166803e-07, "loss": 0.0021, "reward": 2.05649995803833, "reward_std": 0.06229133531451225, "rewards/accuracy_reward": 0.8565000295639038, "rewards/format_reward": 1.0, "step": 429 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.5625, "epoch": 0.005930952676514807, "grad_norm": 2.182701815111434, "kl": 0.0546875, "learning_rate": 9.999132087173027e-07, "loss": 0.0022, "reward": 2.063281297683716, "reward_std": 0.04120098426938057, "rewards/accuracy_reward": 0.8632813096046448, "rewards/format_reward": 1.0, "step": 430 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.40625, "epoch": 0.005944745589716004, "grad_norm": 2.1766653424223033, "kl": 0.053955078125, "learning_rate": 9.999128045792693e-07, "loss": 0.0022, "reward": 2.0586562156677246, "reward_std": 0.047504741698503494, "rewards/accuracy_reward": 0.8586562871932983, "rewards/format_reward": 1.0, "step": 431 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.6875, "epoch": 0.005958538502917201, "grad_norm": 5.4769351934029755, "kl": 0.0576171875, "learning_rate": 9.999123995025808e-07, "loss": 0.0023, "reward": 2.085343837738037, "reward_std": 0.045142896473407745, "rewards/accuracy_reward": 0.8853437900543213, "rewards/format_reward": 1.0, "step": 432 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 434.9375, "epoch": 0.005972331416118398, "grad_norm": 1.4272156281185926, "kl": 0.04833984375, "learning_rate": 9.999119934872383e-07, "loss": 0.0019, "reward": 2.1294686794281006, "reward_std": 0.0611511692404747, "rewards/accuracy_reward": 0.9482187032699585, "rewards/format_reward": 1.0, "step": 433 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.46875, "epoch": 0.005986124329319596, "grad_norm": 1.6671806587009248, "kl": 0.0517578125, "learning_rate": 9.999115865332422e-07, "loss": 0.0021, "reward": 2.0610625743865967, "reward_std": 0.0455532930791378, "rewards/accuracy_reward": 0.8735624551773071, "rewards/format_reward": 1.0, "step": 434 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.875, "epoch": 0.005999917242520793, "grad_norm": 1.6128763317018815, "kl": 0.053466796875, "learning_rate": 9.99911178640593e-07, "loss": 0.0021, "reward": 2.0390000343322754, "reward_std": 0.03209829330444336, "rewards/accuracy_reward": 0.8452500104904175, "rewards/format_reward": 1.0, "step": 435 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 442.875, "epoch": 0.00601371015572199, "grad_norm": 2.0306932014553265, "kl": 0.05322265625, "learning_rate": 9.999107698092923e-07, "loss": 0.0021, "reward": 2.022843837738037, "reward_std": 0.05230832099914551, "rewards/accuracy_reward": 0.8415937423706055, "rewards/format_reward": 1.0, "step": 436 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 430.90625, "epoch": 0.0060275030689231875, "grad_norm": 2.6002569606902983, "kl": 0.0546875, "learning_rate": 9.9991036003934e-07, "loss": 0.0022, "reward": 2.075031280517578, "reward_std": 0.09695666283369064, "rewards/accuracy_reward": 0.8812812566757202, "rewards/format_reward": 1.0, "step": 437 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.96875, "epoch": 0.006041295982124385, "grad_norm": 2.190925618443959, "kl": 0.0537109375, "learning_rate": 9.999099493307377e-07, "loss": 0.0021, "reward": 2.0072813034057617, "reward_std": 0.05058792978525162, "rewards/accuracy_reward": 0.8135312795639038, "rewards/format_reward": 1.0, "step": 438 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.625, "epoch": 0.006055088895325582, "grad_norm": 2.43781612677234, "kl": 0.051513671875, "learning_rate": 9.999095376834854e-07, "loss": 0.0021, "reward": 1.889875054359436, "reward_std": 0.04314005374908447, "rewards/accuracy_reward": 0.689875066280365, "rewards/format_reward": 1.0, "step": 439 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.125, "epoch": 0.006068881808526779, "grad_norm": 2.273550710187853, "kl": 0.0478515625, "learning_rate": 9.999091250975845e-07, "loss": 0.0019, "reward": 2.0492501258850098, "reward_std": 0.06178450956940651, "rewards/accuracy_reward": 0.8617500066757202, "rewards/format_reward": 1.0, "step": 440 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.90625, "epoch": 0.006082674721727977, "grad_norm": 2.594133576549036, "kl": 0.05322265625, "learning_rate": 9.99908711573035e-07, "loss": 0.0021, "reward": 1.9610939025878906, "reward_std": 0.06257952749729156, "rewards/accuracy_reward": 0.7610937356948853, "rewards/format_reward": 1.0, "step": 441 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.46875, "epoch": 0.006096467634929173, "grad_norm": 2.1549078081288497, "kl": 0.052734375, "learning_rate": 9.999082971098384e-07, "loss": 0.0021, "reward": 2.1158125400543213, "reward_std": 0.051042407751083374, "rewards/accuracy_reward": 0.9220625162124634, "rewards/format_reward": 1.0, "step": 442 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 445.84375, "epoch": 0.00611026054813037, "grad_norm": 2.765044228336083, "kl": 0.052001953125, "learning_rate": 9.999078817079953e-07, "loss": 0.0021, "reward": 2.124281406402588, "reward_std": 0.04940464347600937, "rewards/accuracy_reward": 0.9305312633514404, "rewards/format_reward": 1.0, "step": 443 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.15625, "epoch": 0.0061240534613315675, "grad_norm": 1.822265017896965, "kl": 0.048095703125, "learning_rate": 9.999074653675064e-07, "loss": 0.0019, "reward": 1.9833126068115234, "reward_std": 0.02417147159576416, "rewards/accuracy_reward": 0.7833124399185181, "rewards/format_reward": 1.0, "step": 444 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 441.84375, "epoch": 0.006137846374532765, "grad_norm": 2.4254309036744006, "kl": 0.04541015625, "learning_rate": 9.999070480883725e-07, "loss": 0.0018, "reward": 2.0770626068115234, "reward_std": 0.0522807240486145, "rewards/accuracy_reward": 0.895812451839447, "rewards/format_reward": 1.0, "step": 445 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 438.4375, "epoch": 0.006151639287733962, "grad_norm": 2.483291261014134, "kl": 0.051513671875, "learning_rate": 9.999066298705942e-07, "loss": 0.0021, "reward": 2.022125005722046, "reward_std": 0.03747022897005081, "rewards/accuracy_reward": 0.8283750414848328, "rewards/format_reward": 1.0, "step": 446 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.4375, "epoch": 0.006165432200935159, "grad_norm": 2.1841883574910206, "kl": 0.050537109375, "learning_rate": 9.999062107141727e-07, "loss": 0.002, "reward": 2.1225314140319824, "reward_std": 0.03400640934705734, "rewards/accuracy_reward": 0.922531247138977, "rewards/format_reward": 1.0, "step": 447 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.78125, "epoch": 0.006179225114136357, "grad_norm": 2.078285857002367, "kl": 0.048583984375, "learning_rate": 9.999057906191084e-07, "loss": 0.0019, "reward": 2.0131564140319824, "reward_std": 0.1357610672712326, "rewards/accuracy_reward": 0.8506562113761902, "rewards/format_reward": 0.96875, "step": 448 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.3125, "epoch": 0.006193018027337554, "grad_norm": 2.044171548114657, "kl": 0.053955078125, "learning_rate": 9.999053695854023e-07, "loss": 0.0022, "reward": 2.037374973297119, "reward_std": 0.05236751586198807, "rewards/accuracy_reward": 0.8498750329017639, "rewards/format_reward": 1.0, "step": 449 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.53125, "epoch": 0.006206810940538751, "grad_norm": 2.060034989431705, "kl": 0.0517578125, "learning_rate": 9.999049476130551e-07, "loss": 0.0021, "reward": 2.0882186889648438, "reward_std": 0.05290935933589935, "rewards/accuracy_reward": 0.8944686651229858, "rewards/format_reward": 1.0, "step": 450 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 434.90625, "epoch": 0.006220603853739948, "grad_norm": 2.3513103702450238, "kl": 0.04833984375, "learning_rate": 9.999045247020678e-07, "loss": 0.0019, "reward": 2.1145312786102295, "reward_std": 0.041993848979473114, "rewards/accuracy_reward": 0.9145312905311584, "rewards/format_reward": 1.0, "step": 451 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.006234396766941146, "grad_norm": 54.847513188480065, "kl": 0.05078125, "learning_rate": 9.999041008524406e-07, "loss": 0.002, "reward": 2.066093921661377, "reward_std": 0.053901515901088715, "rewards/accuracy_reward": 0.8660937547683716, "rewards/format_reward": 1.0, "step": 452 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.46875, "epoch": 0.006248189680142343, "grad_norm": 2.132592722549295, "kl": 0.04736328125, "learning_rate": 9.999036760641752e-07, "loss": 0.0019, "reward": 2.060699939727783, "reward_std": 0.041433461010456085, "rewards/accuracy_reward": 0.8669500350952148, "rewards/format_reward": 1.0, "step": 453 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.90625, "epoch": 0.00626198259334354, "grad_norm": 2.4284293083629755, "kl": 0.05126953125, "learning_rate": 9.999032503372715e-07, "loss": 0.002, "reward": 1.9245938062667847, "reward_std": 0.06403053551912308, "rewards/accuracy_reward": 0.7245937585830688, "rewards/format_reward": 1.0, "step": 454 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.6875, "epoch": 0.0062757755065447375, "grad_norm": 2.3561161361147223, "kl": 0.0478515625, "learning_rate": 9.99902823671731e-07, "loss": 0.0019, "reward": 2.154250144958496, "reward_std": 0.029831662774086, "rewards/accuracy_reward": 0.9542500376701355, "rewards/format_reward": 1.0, "step": 455 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.5625, "epoch": 0.006289568419745935, "grad_norm": 1.5929271765042643, "kl": 0.052978515625, "learning_rate": 9.99902396067554e-07, "loss": 0.0021, "reward": 2.1213748455047607, "reward_std": 0.020033225417137146, "rewards/accuracy_reward": 0.921375036239624, "rewards/format_reward": 1.0, "step": 456 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.40625, "epoch": 0.006303361332947132, "grad_norm": 1.9981365615948745, "kl": 0.046630859375, "learning_rate": 9.999019675247416e-07, "loss": 0.0019, "reward": 2.0269999504089355, "reward_std": 0.02967972122132778, "rewards/accuracy_reward": 0.8270000219345093, "rewards/format_reward": 1.0, "step": 457 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.15625, "epoch": 0.006317154246148329, "grad_norm": 4.794670641556352, "kl": 0.04931640625, "learning_rate": 9.999015380432946e-07, "loss": 0.002, "reward": 2.1294374465942383, "reward_std": 0.03062836267054081, "rewards/accuracy_reward": 0.929437518119812, "rewards/format_reward": 1.0, "step": 458 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 433.09375, "epoch": 0.006330947159349527, "grad_norm": 7.432220349255496, "kl": 0.0556640625, "learning_rate": 9.999011076232136e-07, "loss": 0.0022, "reward": 2.0531561374664307, "reward_std": 0.05647537112236023, "rewards/accuracy_reward": 0.8656562566757202, "rewards/format_reward": 1.0, "step": 459 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.46875, "epoch": 0.006344740072550724, "grad_norm": 2.945291083078629, "kl": 0.0546875, "learning_rate": 9.999006762644997e-07, "loss": 0.0022, "reward": 1.9413437843322754, "reward_std": 0.053651705384254456, "rewards/accuracy_reward": 0.7475937604904175, "rewards/format_reward": 1.0, "step": 460 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.03125, "epoch": 0.00635853298575192, "grad_norm": 2.1417502172964897, "kl": 0.048828125, "learning_rate": 9.999002439671533e-07, "loss": 0.0019, "reward": 2.0660624504089355, "reward_std": 0.04758208990097046, "rewards/accuracy_reward": 0.8660625219345093, "rewards/format_reward": 1.0, "step": 461 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.09375, "epoch": 0.0063723258989531175, "grad_norm": 3.6130355249633133, "kl": 0.05029296875, "learning_rate": 9.998998107311757e-07, "loss": 0.002, "reward": 2.1443750858306885, "reward_std": 0.036351725459098816, "rewards/accuracy_reward": 0.9443750381469727, "rewards/format_reward": 1.0, "step": 462 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.125, "epoch": 0.006386118812154315, "grad_norm": 10.246810969712243, "kl": 0.052734375, "learning_rate": 9.998993765565672e-07, "loss": 0.0021, "reward": 1.9857499599456787, "reward_std": 0.05506417155265808, "rewards/accuracy_reward": 0.7920000553131104, "rewards/format_reward": 1.0, "step": 463 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.53125, "epoch": 0.006399911725355512, "grad_norm": 2.820966804537115, "kl": 0.05029296875, "learning_rate": 9.99898941443329e-07, "loss": 0.002, "reward": 2.0905938148498535, "reward_std": 0.05303804576396942, "rewards/accuracy_reward": 0.8968437910079956, "rewards/format_reward": 1.0, "step": 464 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.6875, "epoch": 0.006413704638556709, "grad_norm": 2.5974919895731445, "kl": 0.0458984375, "learning_rate": 9.998985053914617e-07, "loss": 0.0018, "reward": 2.091843843460083, "reward_std": 0.027569973841309547, "rewards/accuracy_reward": 0.8918437957763672, "rewards/format_reward": 1.0, "step": 465 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.5, "epoch": 0.006427497551757907, "grad_norm": 6.75225842098542, "kl": 0.05517578125, "learning_rate": 9.998980684009663e-07, "loss": 0.0022, "reward": 2.0234999656677246, "reward_std": 0.041552476584911346, "rewards/accuracy_reward": 0.8235000371932983, "rewards/format_reward": 1.0, "step": 466 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.0, "epoch": 0.006441290464959104, "grad_norm": 1.9156503328528103, "kl": 0.046142578125, "learning_rate": 9.998976304718435e-07, "loss": 0.0018, "reward": 2.1222500801086426, "reward_std": 0.05393138527870178, "rewards/accuracy_reward": 0.9284999370574951, "rewards/format_reward": 1.0, "step": 467 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.53125, "epoch": 0.006455083378160301, "grad_norm": 2.5002545029316927, "kl": 0.04931640625, "learning_rate": 9.998971916040944e-07, "loss": 0.002, "reward": 2.0030627250671387, "reward_std": 0.05505741760134697, "rewards/accuracy_reward": 0.8030624985694885, "rewards/format_reward": 1.0, "step": 468 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.96875, "epoch": 0.0064688762913614984, "grad_norm": 1.9322153099956494, "kl": 0.0478515625, "learning_rate": 9.998967517977195e-07, "loss": 0.0019, "reward": 2.015937566757202, "reward_std": 0.030973389744758606, "rewards/accuracy_reward": 0.8159375786781311, "rewards/format_reward": 1.0, "step": 469 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.40625, "epoch": 0.006482669204562696, "grad_norm": 15.973614871242363, "kl": 0.046875, "learning_rate": 9.998963110527196e-07, "loss": 0.0019, "reward": 2.044156312942505, "reward_std": 0.07589149475097656, "rewards/accuracy_reward": 0.8504062294960022, "rewards/format_reward": 1.0, "step": 470 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 383.6875, "epoch": 0.006496462117763893, "grad_norm": 2.4809930255128667, "kl": 0.04833984375, "learning_rate": 9.998958693690956e-07, "loss": 0.0019, "reward": 2.0883126258850098, "reward_std": 0.08860978484153748, "rewards/accuracy_reward": 0.913312554359436, "rewards/format_reward": 1.0, "step": 471 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.00651025503096509, "grad_norm": 2.4286191548195295, "kl": 0.04736328125, "learning_rate": 9.998954267468485e-07, "loss": 0.0019, "reward": 2.0695624351501465, "reward_std": 0.04460033029317856, "rewards/accuracy_reward": 0.8695625066757202, "rewards/format_reward": 1.0, "step": 472 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.25, "epoch": 0.0065240479441662875, "grad_norm": 2.00547489336181, "kl": 0.047119140625, "learning_rate": 9.998949831859789e-07, "loss": 0.0019, "reward": 2.0764999389648438, "reward_std": 0.04215770214796066, "rewards/accuracy_reward": 0.8827499151229858, "rewards/format_reward": 1.0, "step": 473 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.15625, "epoch": 0.006537840857367485, "grad_norm": 1.83465881304083, "kl": 0.048095703125, "learning_rate": 9.998945386864876e-07, "loss": 0.0019, "reward": 2.137765407562256, "reward_std": 0.04565926268696785, "rewards/accuracy_reward": 0.9377655982971191, "rewards/format_reward": 1.0, "step": 474 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 415.375, "epoch": 0.006551633770568682, "grad_norm": 2.3609545535445804, "kl": 0.047119140625, "learning_rate": 9.998940932483757e-07, "loss": 0.0019, "reward": 1.948468804359436, "reward_std": 0.09645549207925797, "rewards/accuracy_reward": 0.7672187089920044, "rewards/format_reward": 1.0, "step": 475 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.5625, "epoch": 0.006565426683769879, "grad_norm": 3.0135915486887495, "kl": 0.049072265625, "learning_rate": 9.99893646871644e-07, "loss": 0.002, "reward": 1.86328125, "reward_std": 0.05664583295583725, "rewards/accuracy_reward": 0.663281261920929, "rewards/format_reward": 1.0, "step": 476 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.71875, "epoch": 0.006579219596971077, "grad_norm": 2.192085318113336, "kl": 0.050048828125, "learning_rate": 9.99893199556293e-07, "loss": 0.002, "reward": 2.126999855041504, "reward_std": 0.029108930379152298, "rewards/accuracy_reward": 0.9269999861717224, "rewards/format_reward": 1.0, "step": 477 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.375, "epoch": 0.006593012510172274, "grad_norm": 2.421470546729685, "kl": 0.04345703125, "learning_rate": 9.99892751302324e-07, "loss": 0.0017, "reward": 2.1214687824249268, "reward_std": 0.05782489478588104, "rewards/accuracy_reward": 0.9214688539505005, "rewards/format_reward": 1.0, "step": 478 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 394.46875, "epoch": 0.00660680542337347, "grad_norm": 2.366607752823309, "kl": 0.05419921875, "learning_rate": 9.998923021097375e-07, "loss": 0.0022, "reward": 2.006187677383423, "reward_std": 0.052849989384412766, "rewards/accuracy_reward": 0.8186875581741333, "rewards/format_reward": 1.0, "step": 479 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.0066205983365746676, "grad_norm": 2.112964470212882, "kl": 0.050537109375, "learning_rate": 9.998918519785347e-07, "loss": 0.002, "reward": 2.0928125381469727, "reward_std": 0.031509097665548325, "rewards/accuracy_reward": 0.8928124308586121, "rewards/format_reward": 1.0, "step": 480 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.25, "epoch": 0.006634391249775865, "grad_norm": 2.353844124839015, "kl": 0.04638671875, "learning_rate": 9.99891400908716e-07, "loss": 0.0018, "reward": 1.9729686975479126, "reward_std": 0.0887097418308258, "rewards/accuracy_reward": 0.7917186617851257, "rewards/format_reward": 1.0, "step": 481 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 428.4375, "epoch": 0.006648184162977062, "grad_norm": 2.3856310335426376, "kl": 0.047607421875, "learning_rate": 9.998909489002825e-07, "loss": 0.0019, "reward": 2.118406295776367, "reward_std": 0.059468068182468414, "rewards/accuracy_reward": 0.9309061765670776, "rewards/format_reward": 1.0, "step": 482 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 427.4375, "epoch": 0.006661977076178259, "grad_norm": 2.4610388604462736, "kl": 0.0439453125, "learning_rate": 9.99890495953235e-07, "loss": 0.0018, "reward": 1.9573438167572021, "reward_std": 0.08177557587623596, "rewards/accuracy_reward": 0.7635937929153442, "rewards/format_reward": 1.0, "step": 483 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.28125, "epoch": 0.006675769989379457, "grad_norm": 2.6556486767346716, "kl": 0.04736328125, "learning_rate": 9.998900420675744e-07, "loss": 0.0019, "reward": 2.0717501640319824, "reward_std": 0.038730934262275696, "rewards/accuracy_reward": 0.8779999017715454, "rewards/format_reward": 1.0, "step": 484 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.65625, "epoch": 0.006689562902580654, "grad_norm": 1.2580648206517744, "kl": 0.04443359375, "learning_rate": 9.998895872433013e-07, "loss": 0.0018, "reward": 2.110187530517578, "reward_std": 0.031248370185494423, "rewards/accuracy_reward": 0.9226875305175781, "rewards/format_reward": 1.0, "step": 485 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.09375, "epoch": 0.006703355815781851, "grad_norm": 2.2712597437977213, "kl": 0.046142578125, "learning_rate": 9.99889131480417e-07, "loss": 0.0018, "reward": 2.094531297683716, "reward_std": 0.030032359063625336, "rewards/accuracy_reward": 0.89453125, "rewards/format_reward": 1.0, "step": 486 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.40625, "epoch": 0.0067171487289830484, "grad_norm": 5.188655484010513, "kl": 0.04296875, "learning_rate": 9.998886747789221e-07, "loss": 0.0017, "reward": 2.039187431335449, "reward_std": 0.05987890064716339, "rewards/accuracy_reward": 0.839187502861023, "rewards/format_reward": 1.0, "step": 487 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.21875, "epoch": 0.006730941642184246, "grad_norm": 1.7779537936075132, "kl": 0.04541015625, "learning_rate": 9.998882171388174e-07, "loss": 0.0018, "reward": 2.005812406539917, "reward_std": 0.02743227779865265, "rewards/accuracy_reward": 0.8058124780654907, "rewards/format_reward": 1.0, "step": 488 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.59375, "epoch": 0.006744734555385443, "grad_norm": 2.0033902325526656, "kl": 0.048095703125, "learning_rate": 9.998877585601037e-07, "loss": 0.0019, "reward": 2.11118745803833, "reward_std": 0.06951781362295151, "rewards/accuracy_reward": 0.9236875176429749, "rewards/format_reward": 1.0, "step": 489 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.90625, "epoch": 0.00675852746858664, "grad_norm": 2.103699142971979, "kl": 0.0478515625, "learning_rate": 9.998872990427823e-07, "loss": 0.0019, "reward": 2.0332188606262207, "reward_std": 0.043987613171339035, "rewards/accuracy_reward": 0.8332188129425049, "rewards/format_reward": 1.0, "step": 490 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.875, "epoch": 0.0067723203817878375, "grad_norm": 2.35698790048424, "kl": 0.04541015625, "learning_rate": 9.998868385868535e-07, "loss": 0.0018, "reward": 2.0375938415527344, "reward_std": 0.04837675765156746, "rewards/accuracy_reward": 0.8375937342643738, "rewards/format_reward": 1.0, "step": 491 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.6875, "epoch": 0.006786113294989035, "grad_norm": 2.1787245823489956, "kl": 0.0537109375, "learning_rate": 9.998863771923186e-07, "loss": 0.0021, "reward": 2.114500045776367, "reward_std": 0.02211102284491062, "rewards/accuracy_reward": 0.9144999980926514, "rewards/format_reward": 1.0, "step": 492 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.09375, "epoch": 0.006799906208190232, "grad_norm": 2.5024222784834946, "kl": 0.04638671875, "learning_rate": 9.998859148591782e-07, "loss": 0.0019, "reward": 2.083078384399414, "reward_std": 0.03898228704929352, "rewards/accuracy_reward": 0.8830780982971191, "rewards/format_reward": 1.0, "step": 493 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.75, "epoch": 0.006813699121391429, "grad_norm": 2.0398652968046016, "kl": 0.048828125, "learning_rate": 9.998854515874332e-07, "loss": 0.0019, "reward": 2.024218797683716, "reward_std": 0.039963144809007645, "rewards/accuracy_reward": 0.82421875, "rewards/format_reward": 1.0, "step": 494 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.71875, "epoch": 0.006827492034592627, "grad_norm": 3.065915206828661, "kl": 0.046630859375, "learning_rate": 9.998849873770847e-07, "loss": 0.0019, "reward": 1.9901562929153442, "reward_std": 0.04621211811900139, "rewards/accuracy_reward": 0.7901562452316284, "rewards/format_reward": 1.0, "step": 495 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.28125, "epoch": 0.006841284947793824, "grad_norm": 7.75104781114931, "kl": 0.0419921875, "learning_rate": 9.998845222281332e-07, "loss": 0.0017, "reward": 2.046968698501587, "reward_std": 0.033911436796188354, "rewards/accuracy_reward": 0.8469687700271606, "rewards/format_reward": 1.0, "step": 496 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.8125, "epoch": 0.006855077860995021, "grad_norm": 2.496730990316503, "kl": 0.05029296875, "learning_rate": 9.998840561405799e-07, "loss": 0.002, "reward": 2.03920316696167, "reward_std": 0.04264325648546219, "rewards/accuracy_reward": 0.8392031192779541, "rewards/format_reward": 1.0, "step": 497 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.0, "epoch": 0.0068688707741962176, "grad_norm": 1.8806490360063974, "kl": 0.047119140625, "learning_rate": 9.998835891144255e-07, "loss": 0.0019, "reward": 2.025240659713745, "reward_std": 0.017883004620671272, "rewards/accuracy_reward": 0.8252406120300293, "rewards/format_reward": 1.0, "step": 498 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.6875, "epoch": 0.006882663687397415, "grad_norm": 2.891641914101987, "kl": 0.04638671875, "learning_rate": 9.99883121149671e-07, "loss": 0.0019, "reward": 2.1169686317443848, "reward_std": 0.058275140821933746, "rewards/accuracy_reward": 0.9232187271118164, "rewards/format_reward": 1.0, "step": 499 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.9375, "epoch": 0.006896456600598612, "grad_norm": 2.022715313742651, "kl": 0.0380859375, "learning_rate": 9.99882652246317e-07, "loss": 0.0015, "reward": 1.9176876544952393, "reward_std": 0.04783179238438606, "rewards/accuracy_reward": 0.7176874876022339, "rewards/format_reward": 1.0, "step": 500 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 386.875, "epoch": 0.006910249513799809, "grad_norm": 2.396821625155436, "kl": 0.0478515625, "learning_rate": 9.998821824043647e-07, "loss": 0.0019, "reward": 2.0275156497955322, "reward_std": 0.0573999360203743, "rewards/accuracy_reward": 0.8400155901908875, "rewards/format_reward": 1.0, "step": 501 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.0, "epoch": 0.006924042427001007, "grad_norm": 2.548935980002161, "kl": 0.045166015625, "learning_rate": 9.99881711623815e-07, "loss": 0.0018, "reward": 2.0393593311309814, "reward_std": 0.06705594062805176, "rewards/accuracy_reward": 0.8456093668937683, "rewards/format_reward": 1.0, "step": 502 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.03125, "epoch": 0.006937835340202204, "grad_norm": 2.644585106614417, "kl": 0.048095703125, "learning_rate": 9.998812399046686e-07, "loss": 0.0019, "reward": 1.9927188158035278, "reward_std": 0.03509826958179474, "rewards/accuracy_reward": 0.7927187085151672, "rewards/format_reward": 1.0, "step": 503 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.875, "epoch": 0.006951628253403401, "grad_norm": 2.228992746332528, "kl": 0.0439453125, "learning_rate": 9.998807672469264e-07, "loss": 0.0018, "reward": 2.08760929107666, "reward_std": 0.04669293761253357, "rewards/accuracy_reward": 0.8876093626022339, "rewards/format_reward": 1.0, "step": 504 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.375, "epoch": 0.0069654211666045985, "grad_norm": 2.3573872042973876, "kl": 0.044189453125, "learning_rate": 9.998802936505892e-07, "loss": 0.0018, "reward": 2.047621965408325, "reward_std": 0.045699793845415115, "rewards/accuracy_reward": 0.8538718223571777, "rewards/format_reward": 1.0, "step": 505 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.25, "epoch": 0.006979214079805796, "grad_norm": 21.711347732871292, "kl": 0.05078125, "learning_rate": 9.99879819115658e-07, "loss": 0.002, "reward": 2.074406385421753, "reward_std": 0.0406007245182991, "rewards/accuracy_reward": 0.8744062781333923, "rewards/format_reward": 1.0, "step": 506 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.84375, "epoch": 0.006993006993006993, "grad_norm": 7.091894304132117, "kl": 0.04296875, "learning_rate": 9.99879343642134e-07, "loss": 0.0017, "reward": 2.1401562690734863, "reward_std": 0.06220386177301407, "rewards/accuracy_reward": 0.9589062333106995, "rewards/format_reward": 1.0, "step": 507 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.625, "epoch": 0.00700679990620819, "grad_norm": 3.450136063015016, "kl": 0.0439453125, "learning_rate": 9.998788672300176e-07, "loss": 0.0018, "reward": 2.0445780754089355, "reward_std": 0.029390225186944008, "rewards/accuracy_reward": 0.8508281707763672, "rewards/format_reward": 1.0, "step": 508 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.40625, "epoch": 0.0070205928194093875, "grad_norm": 2.7480832171875726, "kl": 0.04541015625, "learning_rate": 9.9987838987931e-07, "loss": 0.0018, "reward": 2.0546250343322754, "reward_std": 0.05259440466761589, "rewards/accuracy_reward": 0.8608750104904175, "rewards/format_reward": 1.0, "step": 509 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.125, "epoch": 0.007034385732610585, "grad_norm": 2.3446414048484945, "kl": 0.050048828125, "learning_rate": 9.998779115900118e-07, "loss": 0.002, "reward": 1.953874945640564, "reward_std": 0.053261689841747284, "rewards/accuracy_reward": 0.7538750171661377, "rewards/format_reward": 1.0, "step": 510 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.71875, "epoch": 0.007048178645811782, "grad_norm": 3.770161456594712, "kl": 0.048828125, "learning_rate": 9.998774323621242e-07, "loss": 0.002, "reward": 2.022125244140625, "reward_std": 0.07293341308832169, "rewards/accuracy_reward": 0.8221249580383301, "rewards/format_reward": 1.0, "step": 511 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.65625, "epoch": 0.007061971559012979, "grad_norm": 2.2196594447780655, "kl": 0.049560546875, "learning_rate": 9.998769521956479e-07, "loss": 0.002, "reward": 2.0625157356262207, "reward_std": 0.02569575235247612, "rewards/accuracy_reward": 0.8625156283378601, "rewards/format_reward": 1.0, "step": 512 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.0, "epoch": 0.007075764472214177, "grad_norm": 3.0104084161712774, "kl": 0.047607421875, "learning_rate": 9.99876471090584e-07, "loss": 0.0019, "reward": 2.020812511444092, "reward_std": 0.07906971871852875, "rewards/accuracy_reward": 0.8208125233650208, "rewards/format_reward": 1.0, "step": 513 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.0, "epoch": 0.007089557385415374, "grad_norm": 1.7182882745387882, "kl": 0.04541015625, "learning_rate": 9.998759890469332e-07, "loss": 0.0018, "reward": 2.116781234741211, "reward_std": 0.04438205435872078, "rewards/accuracy_reward": 0.9292812347412109, "rewards/format_reward": 1.0, "step": 514 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.71875, "epoch": 0.007103350298616571, "grad_norm": 2.1946667402693576, "kl": 0.0458984375, "learning_rate": 9.998755060646966e-07, "loss": 0.0018, "reward": 2.1357030868530273, "reward_std": 0.03673718869686127, "rewards/accuracy_reward": 0.9357030987739563, "rewards/format_reward": 1.0, "step": 515 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.90625, "epoch": 0.0071171432118177684, "grad_norm": 1.9680148320871198, "kl": 0.048828125, "learning_rate": 9.99875022143875e-07, "loss": 0.002, "reward": 2.1181249618530273, "reward_std": 0.023759758099913597, "rewards/accuracy_reward": 0.9181250333786011, "rewards/format_reward": 1.0, "step": 516 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.5625, "epoch": 0.007130936125018965, "grad_norm": 2.649771244215759, "kl": 0.04736328125, "learning_rate": 9.99874537284469e-07, "loss": 0.0019, "reward": 2.0860157012939453, "reward_std": 0.04784229397773743, "rewards/accuracy_reward": 0.8922655582427979, "rewards/format_reward": 1.0, "step": 517 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.96875, "epoch": 0.007144729038220162, "grad_norm": 1.9292168486626, "kl": 0.053466796875, "learning_rate": 9.998740514864804e-07, "loss": 0.0021, "reward": 2.099353313446045, "reward_std": 0.05245669186115265, "rewards/accuracy_reward": 0.9118530750274658, "rewards/format_reward": 1.0, "step": 518 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.65625, "epoch": 0.007158521951421359, "grad_norm": 2.1064973619491525, "kl": 0.04541015625, "learning_rate": 9.99873564749909e-07, "loss": 0.0018, "reward": 2.109562397003174, "reward_std": 0.04888321831822395, "rewards/accuracy_reward": 0.9158124923706055, "rewards/format_reward": 1.0, "step": 519 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 0.007172314864622557, "grad_norm": 1.714197419561755, "kl": 0.04736328125, "learning_rate": 9.998730770747564e-07, "loss": 0.0019, "reward": 1.8421564102172852, "reward_std": 0.012847058475017548, "rewards/accuracy_reward": 0.6421562433242798, "rewards/format_reward": 1.0, "step": 520 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.71875, "epoch": 0.007186107777823754, "grad_norm": 3.646128183924181, "kl": 0.053955078125, "learning_rate": 9.998725884610236e-07, "loss": 0.0022, "reward": 2.101968765258789, "reward_std": 0.03896723687648773, "rewards/accuracy_reward": 0.901968777179718, "rewards/format_reward": 1.0, "step": 521 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.21875, "epoch": 0.007199900691024951, "grad_norm": 1.7398977275314433, "kl": 0.045166015625, "learning_rate": 9.998720989087109e-07, "loss": 0.0018, "reward": 2.095125198364258, "reward_std": 0.04365184158086777, "rewards/accuracy_reward": 0.8951250314712524, "rewards/format_reward": 1.0, "step": 522 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.46875, "epoch": 0.0072136936042261485, "grad_norm": 3.76395542902141, "kl": 0.048095703125, "learning_rate": 9.998716084178198e-07, "loss": 0.0019, "reward": 2.0667343139648438, "reward_std": 0.033452246338129044, "rewards/accuracy_reward": 0.8667343854904175, "rewards/format_reward": 1.0, "step": 523 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.78125, "epoch": 0.007227486517427346, "grad_norm": 2.6288636793019116, "kl": 0.044677734375, "learning_rate": 9.99871116988351e-07, "loss": 0.0018, "reward": 2.0741562843322754, "reward_std": 0.04550555348396301, "rewards/accuracy_reward": 0.8741562366485596, "rewards/format_reward": 1.0, "step": 524 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 437.21875, "epoch": 0.007241279430628543, "grad_norm": 1.9456527226226872, "kl": 0.053466796875, "learning_rate": 9.998706246203054e-07, "loss": 0.0021, "reward": 1.9760468006134033, "reward_std": 0.052287109196186066, "rewards/accuracy_reward": 0.7885469198226929, "rewards/format_reward": 1.0, "step": 525 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.6875, "epoch": 0.00725507234382974, "grad_norm": 1.470474537643452, "kl": 0.0458984375, "learning_rate": 9.998701313136842e-07, "loss": 0.0018, "reward": 1.93568754196167, "reward_std": 0.02510208822786808, "rewards/accuracy_reward": 0.7356874942779541, "rewards/format_reward": 1.0, "step": 526 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.40625, "epoch": 0.0072688652570309376, "grad_norm": 2.282278706418286, "kl": 0.0478515625, "learning_rate": 9.998696370684877e-07, "loss": 0.0019, "reward": 2.0772969722747803, "reward_std": 0.04258961230516434, "rewards/accuracy_reward": 0.8772968053817749, "rewards/format_reward": 1.0, "step": 527 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.8125, "epoch": 0.007282658170232135, "grad_norm": 4.049097356206214, "kl": 0.050537109375, "learning_rate": 9.998691418847176e-07, "loss": 0.002, "reward": 2.0915937423706055, "reward_std": 0.0461568646132946, "rewards/accuracy_reward": 0.8978437781333923, "rewards/format_reward": 1.0, "step": 528 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.0625, "epoch": 0.007296451083433332, "grad_norm": 2.0837389439725857, "kl": 0.046630859375, "learning_rate": 9.998686457623741e-07, "loss": 0.0019, "reward": 2.094156265258789, "reward_std": 0.0520988292992115, "rewards/accuracy_reward": 0.9004062414169312, "rewards/format_reward": 1.0, "step": 529 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.90625, "epoch": 0.007310243996634529, "grad_norm": 2.120102960432839, "kl": 0.05517578125, "learning_rate": 9.99868148701459e-07, "loss": 0.0022, "reward": 1.9691874980926514, "reward_std": 0.03907351940870285, "rewards/accuracy_reward": 0.7691875100135803, "rewards/format_reward": 1.0, "step": 530 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.84375, "epoch": 0.007324036909835727, "grad_norm": 2.579244437249799, "kl": 0.04345703125, "learning_rate": 9.998676507019721e-07, "loss": 0.0017, "reward": 2.128000020980835, "reward_std": 0.048519209027290344, "rewards/accuracy_reward": 0.940500020980835, "rewards/format_reward": 1.0, "step": 531 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.59375, "epoch": 0.007337829823036924, "grad_norm": 1.9874039520070204, "kl": 0.046875, "learning_rate": 9.998671517639152e-07, "loss": 0.0019, "reward": 2.0330939292907715, "reward_std": 0.03748145326972008, "rewards/accuracy_reward": 0.839343786239624, "rewards/format_reward": 1.0, "step": 532 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.75, "epoch": 0.007351622736238121, "grad_norm": 2.813131197338738, "kl": 0.047607421875, "learning_rate": 9.998666518872892e-07, "loss": 0.0019, "reward": 2.1248438358306885, "reward_std": 0.023312415927648544, "rewards/accuracy_reward": 0.9248437881469727, "rewards/format_reward": 1.0, "step": 533 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.0, "epoch": 0.0073654156494393185, "grad_norm": 3.513338571103898, "kl": 0.044921875, "learning_rate": 9.998661510720944e-07, "loss": 0.0018, "reward": 2.1324687004089355, "reward_std": 0.027532529085874557, "rewards/accuracy_reward": 0.9324687123298645, "rewards/format_reward": 1.0, "step": 534 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.96875, "epoch": 0.007379208562640516, "grad_norm": 2.3471973342292993, "kl": 0.049560546875, "learning_rate": 9.998656493183325e-07, "loss": 0.002, "reward": 2.1390936374664307, "reward_std": 0.043466780334711075, "rewards/accuracy_reward": 0.9515937566757202, "rewards/format_reward": 1.0, "step": 535 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.65625, "epoch": 0.007393001475841712, "grad_norm": 2.340795022088105, "kl": 0.04443359375, "learning_rate": 9.99865146626004e-07, "loss": 0.0018, "reward": 2.1158437728881836, "reward_std": 0.04573627561330795, "rewards/accuracy_reward": 0.9158437252044678, "rewards/format_reward": 1.0, "step": 536 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.6875, "epoch": 0.007406794389042909, "grad_norm": 2.522644604003013, "kl": 0.0478515625, "learning_rate": 9.9986464299511e-07, "loss": 0.0019, "reward": 2.0786561965942383, "reward_std": 0.03554216027259827, "rewards/accuracy_reward": 0.878656268119812, "rewards/format_reward": 1.0, "step": 537 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.6875, "epoch": 0.007420587302244107, "grad_norm": 3.7604491957785813, "kl": 0.05322265625, "learning_rate": 9.998641384256515e-07, "loss": 0.0021, "reward": 2.0857813358306885, "reward_std": 0.01673278957605362, "rewards/accuracy_reward": 0.8857812285423279, "rewards/format_reward": 1.0, "step": 538 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.78125, "epoch": 0.007434380215445304, "grad_norm": 2.664318274849365, "kl": 0.048583984375, "learning_rate": 9.99863632917629e-07, "loss": 0.0019, "reward": 2.107687473297119, "reward_std": 0.04941736161708832, "rewards/accuracy_reward": 0.913937509059906, "rewards/format_reward": 1.0, "step": 539 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.5625, "epoch": 0.007448173128646501, "grad_norm": 7.968877856206847, "kl": 0.046875, "learning_rate": 9.998631264710441e-07, "loss": 0.0019, "reward": 1.9662500619888306, "reward_std": 0.048946212977170944, "rewards/accuracy_reward": 0.778749942779541, "rewards/format_reward": 1.0, "step": 540 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.28125, "epoch": 0.0074619660418476985, "grad_norm": 2.7712626618404315, "kl": 0.055419921875, "learning_rate": 9.998626190858974e-07, "loss": 0.0022, "reward": 1.972359538078308, "reward_std": 0.02387489750981331, "rewards/accuracy_reward": 0.7723593711853027, "rewards/format_reward": 1.0, "step": 541 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.8125, "epoch": 0.007475758955048896, "grad_norm": 1.505516232952706, "kl": 0.04638671875, "learning_rate": 9.998621107621898e-07, "loss": 0.0019, "reward": 2.043093681335449, "reward_std": 0.015246758237481117, "rewards/accuracy_reward": 0.843093752861023, "rewards/format_reward": 1.0, "step": 542 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.78125, "epoch": 0.007489551868250093, "grad_norm": 2.65668605502793, "kl": 0.047119140625, "learning_rate": 9.998616014999225e-07, "loss": 0.0019, "reward": 2.042343854904175, "reward_std": 0.029045548290014267, "rewards/accuracy_reward": 0.8423437476158142, "rewards/format_reward": 1.0, "step": 543 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.75, "epoch": 0.00750334478145129, "grad_norm": 2.5744088646619203, "kl": 0.047607421875, "learning_rate": 9.998610912990962e-07, "loss": 0.0019, "reward": 2.082843780517578, "reward_std": 0.030678531154990196, "rewards/accuracy_reward": 0.8890937566757202, "rewards/format_reward": 1.0, "step": 544 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.5, "epoch": 0.0075171376946524876, "grad_norm": 2.2398843932794756, "kl": 0.052490234375, "learning_rate": 9.99860580159712e-07, "loss": 0.0021, "reward": 2.0209062099456787, "reward_std": 0.03538673743605614, "rewards/accuracy_reward": 0.8271562457084656, "rewards/format_reward": 1.0, "step": 545 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.34375, "epoch": 0.007530930607853685, "grad_norm": 4.85026224221038, "kl": 0.04541015625, "learning_rate": 9.998600680817708e-07, "loss": 0.0018, "reward": 1.9804999828338623, "reward_std": 0.05869058519601822, "rewards/accuracy_reward": 0.7929999828338623, "rewards/format_reward": 1.0, "step": 546 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.34375, "epoch": 0.007544723521054882, "grad_norm": 2.268057446881576, "kl": 0.04541015625, "learning_rate": 9.998595550652735e-07, "loss": 0.0018, "reward": 2.0225625038146973, "reward_std": 0.05491924285888672, "rewards/accuracy_reward": 0.8288125395774841, "rewards/format_reward": 1.0, "step": 547 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.25, "epoch": 0.007558516434256079, "grad_norm": 2.1995625009748023, "kl": 0.040283203125, "learning_rate": 9.998590411102213e-07, "loss": 0.0016, "reward": 2.057593822479248, "reward_std": 0.036069631576538086, "rewards/accuracy_reward": 0.8575937747955322, "rewards/format_reward": 1.0, "step": 548 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.8125, "epoch": 0.007572309347457277, "grad_norm": 3.191414361308829, "kl": 0.04833984375, "learning_rate": 9.99858526216615e-07, "loss": 0.0019, "reward": 1.9870312213897705, "reward_std": 0.053150027990341187, "rewards/accuracy_reward": 0.7995312213897705, "rewards/format_reward": 1.0, "step": 549 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.25, "epoch": 0.007586102260658474, "grad_norm": 1.712975265678163, "kl": 0.049560546875, "learning_rate": 9.998580103844556e-07, "loss": 0.002, "reward": 2.132093906402588, "reward_std": 0.01549484208226204, "rewards/accuracy_reward": 0.9320937395095825, "rewards/format_reward": 1.0, "step": 550 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.15625, "epoch": 0.007599895173859671, "grad_norm": 2.742542474097328, "kl": 0.044677734375, "learning_rate": 9.99857493613744e-07, "loss": 0.0018, "reward": 2.094437599182129, "reward_std": 0.040200125426054, "rewards/accuracy_reward": 0.900687575340271, "rewards/format_reward": 1.0, "step": 551 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.90625, "epoch": 0.0076136880870608685, "grad_norm": 2.808473489984914, "kl": 0.044921875, "learning_rate": 9.998569759044813e-07, "loss": 0.0018, "reward": 1.9765312671661377, "reward_std": 0.043738193809986115, "rewards/accuracy_reward": 0.7765312194824219, "rewards/format_reward": 1.0, "step": 552 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 436.875, "epoch": 0.007627481000262066, "grad_norm": 5.35527647118084, "kl": 0.0458984375, "learning_rate": 9.998564572566682e-07, "loss": 0.0018, "reward": 2.000718832015991, "reward_std": 0.05669596791267395, "rewards/accuracy_reward": 0.8132187724113464, "rewards/format_reward": 1.0, "step": 553 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.5625, "epoch": 0.007641273913463263, "grad_norm": 5.8873742580104675, "kl": 0.047119140625, "learning_rate": 9.99855937670306e-07, "loss": 0.0019, "reward": 1.9873125553131104, "reward_std": 0.03839386999607086, "rewards/accuracy_reward": 0.7873125076293945, "rewards/format_reward": 1.0, "step": 554 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.21875, "epoch": 0.007655066826664459, "grad_norm": 2.6317964034594405, "kl": 0.042724609375, "learning_rate": 9.998554171453956e-07, "loss": 0.0017, "reward": 2.015031337738037, "reward_std": 0.034210801124572754, "rewards/accuracy_reward": 0.8150312900543213, "rewards/format_reward": 1.0, "step": 555 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.0, "epoch": 0.007668859739865657, "grad_norm": 2.805422766664735, "kl": 0.0478515625, "learning_rate": 9.998548956819379e-07, "loss": 0.0019, "reward": 2.1157655715942383, "reward_std": 0.03818315640091896, "rewards/accuracy_reward": 0.915765643119812, "rewards/format_reward": 1.0, "step": 556 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.03125, "epoch": 0.007682652653066854, "grad_norm": 2.18048530097695, "kl": 0.044921875, "learning_rate": 9.998543732799337e-07, "loss": 0.0018, "reward": 2.094156265258789, "reward_std": 0.019788896664977074, "rewards/accuracy_reward": 0.8941562175750732, "rewards/format_reward": 1.0, "step": 557 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.71875, "epoch": 0.007696445566268051, "grad_norm": 1.8605141579173605, "kl": 0.044921875, "learning_rate": 9.998538499393845e-07, "loss": 0.0018, "reward": 2.1350936889648438, "reward_std": 0.0146458949893713, "rewards/accuracy_reward": 0.9350938200950623, "rewards/format_reward": 1.0, "step": 558 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.6875, "epoch": 0.0077102384794692485, "grad_norm": 2.5594732844938166, "kl": 0.0478515625, "learning_rate": 9.998533256602907e-07, "loss": 0.0019, "reward": 2.0703125, "reward_std": 0.045187897980213165, "rewards/accuracy_reward": 0.8765624761581421, "rewards/format_reward": 1.0, "step": 559 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.125, "epoch": 0.007724031392670446, "grad_norm": 2.4909544263021246, "kl": 0.045654296875, "learning_rate": 9.998528004426535e-07, "loss": 0.0018, "reward": 2.095250129699707, "reward_std": 0.03287772461771965, "rewards/accuracy_reward": 0.8952499628067017, "rewards/format_reward": 1.0, "step": 560 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.8125, "epoch": 0.007737824305871643, "grad_norm": 2.6299525536405604, "kl": 0.05078125, "learning_rate": 9.998522742864744e-07, "loss": 0.002, "reward": 2.1489062309265137, "reward_std": 0.06338509917259216, "rewards/accuracy_reward": 0.9676562547683716, "rewards/format_reward": 1.0, "step": 561 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.125, "epoch": 0.00775161721907284, "grad_norm": 2.503623458440712, "kl": 0.04638671875, "learning_rate": 9.998517471917534e-07, "loss": 0.0019, "reward": 2.1376876831054688, "reward_std": 0.02024393156170845, "rewards/accuracy_reward": 0.9376875162124634, "rewards/format_reward": 1.0, "step": 562 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.71875, "epoch": 0.007765410132274038, "grad_norm": 2.4726764911485266, "kl": 0.050537109375, "learning_rate": 9.998512191584923e-07, "loss": 0.002, "reward": 2.056906223297119, "reward_std": 0.05861155688762665, "rewards/accuracy_reward": 0.8694062232971191, "rewards/format_reward": 1.0, "step": 563 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.90625, "epoch": 0.007779203045475235, "grad_norm": 2.799259390912739, "kl": 0.0546875, "learning_rate": 9.998506901866916e-07, "loss": 0.0022, "reward": 2.133718967437744, "reward_std": 0.03177863731980324, "rewards/accuracy_reward": 0.933718740940094, "rewards/format_reward": 1.0, "step": 564 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.21875, "epoch": 0.007792995958676432, "grad_norm": 1.957420647806705, "kl": 0.05615234375, "learning_rate": 9.998501602763528e-07, "loss": 0.0022, "reward": 2.0997812747955322, "reward_std": 0.031323112547397614, "rewards/accuracy_reward": 0.9060313105583191, "rewards/format_reward": 1.0, "step": 565 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 0.007806788871877629, "grad_norm": 2.6253991100437184, "kl": 0.053466796875, "learning_rate": 9.998496294274762e-07, "loss": 0.0021, "reward": 1.976812481880188, "reward_std": 0.0234046820551157, "rewards/accuracy_reward": 0.7768125534057617, "rewards/format_reward": 1.0, "step": 566 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.25, "epoch": 0.007820581785078827, "grad_norm": 2.466158240513804, "kl": 0.050048828125, "learning_rate": 9.998490976400635e-07, "loss": 0.002, "reward": 2.0518438816070557, "reward_std": 0.05854339152574539, "rewards/accuracy_reward": 0.8643437623977661, "rewards/format_reward": 1.0, "step": 567 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.90625, "epoch": 0.007834374698280024, "grad_norm": 5.0992276416627424, "kl": 0.052490234375, "learning_rate": 9.998485649141153e-07, "loss": 0.0021, "reward": 2.113156318664551, "reward_std": 0.03319736570119858, "rewards/accuracy_reward": 0.9194062948226929, "rewards/format_reward": 1.0, "step": 568 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.5625, "epoch": 0.007848167611481221, "grad_norm": 3.491355031460062, "kl": 0.05322265625, "learning_rate": 9.998480312496327e-07, "loss": 0.0021, "reward": 2.1549062728881836, "reward_std": 0.038089293986558914, "rewards/accuracy_reward": 0.9611562490463257, "rewards/format_reward": 1.0, "step": 569 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.5, "epoch": 0.007861960524682418, "grad_norm": 4.589181752927826, "kl": 0.05126953125, "learning_rate": 9.998474966466166e-07, "loss": 0.0021, "reward": 1.9963749647140503, "reward_std": 0.12457430362701416, "rewards/accuracy_reward": 0.827625036239624, "rewards/format_reward": 0.96875, "step": 570 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.875, "epoch": 0.007875753437883616, "grad_norm": 7.388231714019175, "kl": 0.045166015625, "learning_rate": 9.998469611050682e-07, "loss": 0.0018, "reward": 2.157437562942505, "reward_std": 0.019397925585508347, "rewards/accuracy_reward": 0.9574375152587891, "rewards/format_reward": 1.0, "step": 571 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.34375, "epoch": 0.007889546351084813, "grad_norm": 2.2361473804468326, "kl": 0.052734375, "learning_rate": 9.998464246249883e-07, "loss": 0.0021, "reward": 2.0688905715942383, "reward_std": 0.042852070182561874, "rewards/accuracy_reward": 0.8751406073570251, "rewards/format_reward": 1.0, "step": 572 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 432.65625, "epoch": 0.00790333926428601, "grad_norm": 2.0423230135814423, "kl": 0.052490234375, "learning_rate": 9.998458872063782e-07, "loss": 0.0021, "reward": 1.9822500944137573, "reward_std": 0.04722994565963745, "rewards/accuracy_reward": 0.8009999990463257, "rewards/format_reward": 1.0, "step": 573 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.75, "epoch": 0.007917132177487208, "grad_norm": 7.335567290498138, "kl": 0.051025390625, "learning_rate": 9.998453488492385e-07, "loss": 0.002, "reward": 2.095125198364258, "reward_std": 0.03222516179084778, "rewards/accuracy_reward": 0.8951249718666077, "rewards/format_reward": 1.0, "step": 574 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.78125, "epoch": 0.007930925090688405, "grad_norm": 2.407204150396676, "kl": 0.04541015625, "learning_rate": 9.998448095535705e-07, "loss": 0.0018, "reward": 2.1214687824249268, "reward_std": 0.025431903079152107, "rewards/accuracy_reward": 0.9214687943458557, "rewards/format_reward": 1.0, "step": 575 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.0625, "epoch": 0.007944718003889602, "grad_norm": 3.705734717247417, "kl": 0.051513671875, "learning_rate": 9.998442693193752e-07, "loss": 0.0021, "reward": 1.989281177520752, "reward_std": 0.04195724055171013, "rewards/accuracy_reward": 0.7892812490463257, "rewards/format_reward": 1.0, "step": 576 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.625, "epoch": 0.0079585109170908, "grad_norm": 1.9764430736463567, "kl": 0.048828125, "learning_rate": 9.998437281466533e-07, "loss": 0.002, "reward": 2.122041702270508, "reward_std": 0.03430842235684395, "rewards/accuracy_reward": 0.9282916784286499, "rewards/format_reward": 1.0, "step": 577 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.625, "epoch": 0.007972303830291997, "grad_norm": 3.2722723537661915, "kl": 0.04736328125, "learning_rate": 9.998431860354065e-07, "loss": 0.0019, "reward": 2.022031307220459, "reward_std": 0.0205271877348423, "rewards/accuracy_reward": 0.8220312595367432, "rewards/format_reward": 1.0, "step": 578 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.625, "epoch": 0.007986096743493194, "grad_norm": 2.7244174426346754, "kl": 0.04931640625, "learning_rate": 9.99842642985635e-07, "loss": 0.002, "reward": 2.069000244140625, "reward_std": 0.04791932925581932, "rewards/accuracy_reward": 0.875249981880188, "rewards/format_reward": 1.0, "step": 579 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.0, "epoch": 0.007999889656694391, "grad_norm": 2.5004396849143933, "kl": 0.05078125, "learning_rate": 9.998420989973404e-07, "loss": 0.002, "reward": 1.9372186660766602, "reward_std": 0.04446589946746826, "rewards/accuracy_reward": 0.7497186660766602, "rewards/format_reward": 1.0, "step": 580 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.53125, "epoch": 0.008013682569895588, "grad_norm": 2.0668142836736125, "kl": 0.04638671875, "learning_rate": 9.998415540705234e-07, "loss": 0.0019, "reward": 1.9620938301086426, "reward_std": 0.01525508426129818, "rewards/accuracy_reward": 0.7620937824249268, "rewards/format_reward": 1.0, "step": 581 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.21875, "epoch": 0.008027475483096786, "grad_norm": 2.201531762063597, "kl": 0.05029296875, "learning_rate": 9.998410082051853e-07, "loss": 0.002, "reward": 2.103203296661377, "reward_std": 0.030211403965950012, "rewards/accuracy_reward": 0.9032031297683716, "rewards/format_reward": 1.0, "step": 582 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.875, "epoch": 0.008041268396297981, "grad_norm": 2.386786833331952, "kl": 0.043701171875, "learning_rate": 9.99840461401327e-07, "loss": 0.0017, "reward": 1.972453236579895, "reward_std": 0.03474128618836403, "rewards/accuracy_reward": 0.7787030935287476, "rewards/format_reward": 1.0, "step": 583 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.46875, "epoch": 0.008055061309499179, "grad_norm": 3.68525999391454, "kl": 0.04443359375, "learning_rate": 9.998399136589495e-07, "loss": 0.0018, "reward": 2.1013126373291016, "reward_std": 0.023903727531433105, "rewards/accuracy_reward": 0.9013124704360962, "rewards/format_reward": 1.0, "step": 584 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.8125, "epoch": 0.008068854222700376, "grad_norm": 2.245951540590924, "kl": 0.04833984375, "learning_rate": 9.998393649780537e-07, "loss": 0.0019, "reward": 1.9754688739776611, "reward_std": 0.03574752062559128, "rewards/accuracy_reward": 0.7754687070846558, "rewards/format_reward": 1.0, "step": 585 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.625, "epoch": 0.008082647135901573, "grad_norm": 2.127053984711338, "kl": 0.049072265625, "learning_rate": 9.998388153586408e-07, "loss": 0.002, "reward": 2.1194686889648438, "reward_std": 0.01943955570459366, "rewards/accuracy_reward": 0.9194687008857727, "rewards/format_reward": 1.0, "step": 586 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.84375, "epoch": 0.00809644004910277, "grad_norm": 2.7237821133610662, "kl": 0.050537109375, "learning_rate": 9.998382648007117e-07, "loss": 0.002, "reward": 1.993346929550171, "reward_std": 0.036785904318094254, "rewards/accuracy_reward": 0.7933468222618103, "rewards/format_reward": 1.0, "step": 587 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.6875, "epoch": 0.008110232962303968, "grad_norm": 2.2448973926877733, "kl": 0.0595703125, "learning_rate": 9.998377133042676e-07, "loss": 0.0024, "reward": 2.096874952316284, "reward_std": 0.044183891266584396, "rewards/accuracy_reward": 0.903124988079071, "rewards/format_reward": 1.0, "step": 588 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.34375, "epoch": 0.008124025875505165, "grad_norm": 2.308035815596835, "kl": 0.05126953125, "learning_rate": 9.998371608693095e-07, "loss": 0.0021, "reward": 2.079078197479248, "reward_std": 0.017626050859689713, "rewards/accuracy_reward": 0.8790781497955322, "rewards/format_reward": 1.0, "step": 589 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.625, "epoch": 0.008137818788706362, "grad_norm": 2.745135395407473, "kl": 0.058349609375, "learning_rate": 9.998366074958383e-07, "loss": 0.0023, "reward": 1.977515697479248, "reward_std": 0.0437779426574707, "rewards/accuracy_reward": 0.7837656140327454, "rewards/format_reward": 1.0, "step": 590 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.65625, "epoch": 0.00815161170190756, "grad_norm": 2.5060009700025354, "kl": 0.05810546875, "learning_rate": 9.998360531838552e-07, "loss": 0.0023, "reward": 1.8797187805175781, "reward_std": 0.033545564860105515, "rewards/accuracy_reward": 0.6859687566757202, "rewards/format_reward": 1.0, "step": 591 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.03125, "epoch": 0.008165404615108757, "grad_norm": 2.232988056890812, "kl": 0.049072265625, "learning_rate": 9.998354979333613e-07, "loss": 0.002, "reward": 2.045015811920166, "reward_std": 0.037566304206848145, "rewards/accuracy_reward": 0.8512656092643738, "rewards/format_reward": 1.0, "step": 592 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.40625, "epoch": 0.008179197528309954, "grad_norm": 2.572411783666988, "kl": 0.053955078125, "learning_rate": 9.998349417443575e-07, "loss": 0.0022, "reward": 2.107781410217285, "reward_std": 0.030380673706531525, "rewards/accuracy_reward": 0.9140312075614929, "rewards/format_reward": 1.0, "step": 593 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.75, "epoch": 0.008192990441511151, "grad_norm": 2.674710046087842, "kl": 0.05322265625, "learning_rate": 9.998343846168447e-07, "loss": 0.0021, "reward": 2.0847561359405518, "reward_std": 0.06284406781196594, "rewards/accuracy_reward": 0.8910062313079834, "rewards/format_reward": 1.0, "step": 594 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.71875, "epoch": 0.008206783354712349, "grad_norm": 2.0566682170283324, "kl": 0.04931640625, "learning_rate": 9.998338265508243e-07, "loss": 0.002, "reward": 2.059046983718872, "reward_std": 0.031089063733816147, "rewards/accuracy_reward": 0.8590468764305115, "rewards/format_reward": 1.0, "step": 595 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.40625, "epoch": 0.008220576267913546, "grad_norm": 1.8868328817765734, "kl": 0.05224609375, "learning_rate": 9.99833267546297e-07, "loss": 0.0021, "reward": 2.0136561393737793, "reward_std": 0.04053781181573868, "rewards/accuracy_reward": 0.8199062347412109, "rewards/format_reward": 1.0, "step": 596 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.1875, "epoch": 0.008234369181114743, "grad_norm": 2.2404098172490117, "kl": 0.05810546875, "learning_rate": 9.998327076032642e-07, "loss": 0.0023, "reward": 1.9520626068115234, "reward_std": 0.037759751081466675, "rewards/accuracy_reward": 0.7583125233650208, "rewards/format_reward": 1.0, "step": 597 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.3125, "epoch": 0.00824816209431594, "grad_norm": 2.047875516471587, "kl": 0.053466796875, "learning_rate": 9.998321467217266e-07, "loss": 0.0021, "reward": 2.0779218673706055, "reward_std": 0.03276470676064491, "rewards/accuracy_reward": 0.8779218792915344, "rewards/format_reward": 1.0, "step": 598 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.3125, "epoch": 0.008261955007517138, "grad_norm": 6.302417864383523, "kl": 0.046875, "learning_rate": 9.998315849016854e-07, "loss": 0.0019, "reward": 2.083718776702881, "reward_std": 0.044975195080041885, "rewards/accuracy_reward": 0.889968752861023, "rewards/format_reward": 1.0, "step": 599 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.96875, "epoch": 0.008275747920718335, "grad_norm": 2.2765073500433557, "kl": 0.052978515625, "learning_rate": 9.998310221431418e-07, "loss": 0.0021, "reward": 2.118281364440918, "reward_std": 0.043207887560129166, "rewards/accuracy_reward": 0.9245312213897705, "rewards/format_reward": 1.0, "step": 600 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.875, "epoch": 0.008289540833919532, "grad_norm": 2.6319948678517413, "kl": 0.0537109375, "learning_rate": 9.998304584460966e-07, "loss": 0.0021, "reward": 2.1271250247955322, "reward_std": 0.03652356564998627, "rewards/accuracy_reward": 0.9333750605583191, "rewards/format_reward": 1.0, "step": 601 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.00830333374712073, "grad_norm": 2.7994406243118433, "kl": 0.05224609375, "learning_rate": 9.99829893810551e-07, "loss": 0.0021, "reward": 2.113593816757202, "reward_std": 0.017072822898626328, "rewards/accuracy_reward": 0.9135937094688416, "rewards/format_reward": 1.0, "step": 602 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 381.8125, "epoch": 0.008317126660321927, "grad_norm": 2.4668808887652482, "kl": 0.05712890625, "learning_rate": 9.99829328236506e-07, "loss": 0.0023, "reward": 1.9726030826568604, "reward_std": 0.04915139079093933, "rewards/accuracy_reward": 0.7851030826568604, "rewards/format_reward": 1.0, "step": 603 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.3125, "epoch": 0.008330919573523124, "grad_norm": 2.690390285458535, "kl": 0.056640625, "learning_rate": 9.998287617239629e-07, "loss": 0.0023, "reward": 2.0758280754089355, "reward_std": 0.0338432677090168, "rewards/accuracy_reward": 0.8758281469345093, "rewards/format_reward": 1.0, "step": 604 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.8125, "epoch": 0.008344712486724321, "grad_norm": 3.099596010851626, "kl": 0.0537109375, "learning_rate": 9.998281942729226e-07, "loss": 0.0021, "reward": 2.1123032569885254, "reward_std": 0.04457748681306839, "rewards/accuracy_reward": 0.9185531139373779, "rewards/format_reward": 1.0, "step": 605 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.84375, "epoch": 0.008358505399925518, "grad_norm": 2.85370263563775, "kl": 0.055419921875, "learning_rate": 9.998276258833858e-07, "loss": 0.0022, "reward": 2.034712553024292, "reward_std": 0.04444105923175812, "rewards/accuracy_reward": 0.8347125053405762, "rewards/format_reward": 1.0, "step": 606 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.0625, "epoch": 0.008372298313126716, "grad_norm": 3.112604431728429, "kl": 0.05810546875, "learning_rate": 9.998270565553542e-07, "loss": 0.0023, "reward": 2.0184688568115234, "reward_std": 0.03241810202598572, "rewards/accuracy_reward": 0.8184688091278076, "rewards/format_reward": 1.0, "step": 607 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.40625, "epoch": 0.008386091226327913, "grad_norm": 2.126485543392029, "kl": 0.054931640625, "learning_rate": 9.998264862888284e-07, "loss": 0.0022, "reward": 2.105687379837036, "reward_std": 0.033524829894304276, "rewards/accuracy_reward": 0.9056875109672546, "rewards/format_reward": 1.0, "step": 608 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.00839988413952911, "grad_norm": 1.9499234816441722, "kl": 0.05859375, "learning_rate": 9.998259150838097e-07, "loss": 0.0024, "reward": 2.1235408782958984, "reward_std": 0.02109311707317829, "rewards/accuracy_reward": 0.9235406517982483, "rewards/format_reward": 1.0, "step": 609 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.09375, "epoch": 0.008413677052730308, "grad_norm": 2.8183770161704684, "kl": 0.05810546875, "learning_rate": 9.998253429402993e-07, "loss": 0.0023, "reward": 2.0257344245910645, "reward_std": 0.03893623873591423, "rewards/accuracy_reward": 0.8257343769073486, "rewards/format_reward": 1.0, "step": 610 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.25, "epoch": 0.008427469965931505, "grad_norm": 2.9721138095369093, "kl": 0.060302734375, "learning_rate": 9.998247698582977e-07, "loss": 0.0024, "reward": 2.031135320663452, "reward_std": 0.055596478283405304, "rewards/accuracy_reward": 0.8436354398727417, "rewards/format_reward": 1.0, "step": 611 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.15625, "epoch": 0.008441262879132702, "grad_norm": 2.4769072314650846, "kl": 0.06640625, "learning_rate": 9.998241958378066e-07, "loss": 0.0026, "reward": 2.059593677520752, "reward_std": 0.020894348621368408, "rewards/accuracy_reward": 0.8595937490463257, "rewards/format_reward": 1.0, "step": 612 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.3125, "epoch": 0.0084550557923339, "grad_norm": 2.163008190514058, "kl": 0.064453125, "learning_rate": 9.998236208788268e-07, "loss": 0.0026, "reward": 1.9688438177108765, "reward_std": 0.04034517705440521, "rewards/accuracy_reward": 0.7750937342643738, "rewards/format_reward": 1.0, "step": 613 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.375, "epoch": 0.008468848705535097, "grad_norm": 2.478480975850513, "kl": 0.06494140625, "learning_rate": 9.998230449813596e-07, "loss": 0.0026, "reward": 1.978431224822998, "reward_std": 0.05483083054423332, "rewards/accuracy_reward": 0.7846812605857849, "rewards/format_reward": 1.0, "step": 614 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.5625, "epoch": 0.008482641618736294, "grad_norm": 2.682396192807337, "kl": 0.05615234375, "learning_rate": 9.998224681454056e-07, "loss": 0.0022, "reward": 2.003499984741211, "reward_std": 0.04995805397629738, "rewards/accuracy_reward": 0.8097500801086426, "rewards/format_reward": 1.0, "step": 615 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 387.96875, "epoch": 0.008496434531937491, "grad_norm": 2.4011236279659522, "kl": 0.0546875, "learning_rate": 9.998218903709665e-07, "loss": 0.0022, "reward": 2.003406286239624, "reward_std": 0.04750627651810646, "rewards/accuracy_reward": 0.8096562623977661, "rewards/format_reward": 1.0, "step": 616 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.09375, "epoch": 0.008510227445138688, "grad_norm": 2.3404149875572053, "kl": 0.06103515625, "learning_rate": 9.99821311658043e-07, "loss": 0.0024, "reward": 2.1420626640319824, "reward_std": 0.033706601709127426, "rewards/accuracy_reward": 0.942062497138977, "rewards/format_reward": 1.0, "step": 617 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.53125, "epoch": 0.008524020358339886, "grad_norm": 2.914850768053351, "kl": 0.04833984375, "learning_rate": 9.99820732006636e-07, "loss": 0.0019, "reward": 2.055187463760376, "reward_std": 0.0450524240732193, "rewards/accuracy_reward": 0.8614375591278076, "rewards/format_reward": 1.0, "step": 618 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.78125, "epoch": 0.008537813271541083, "grad_norm": 4.106220920970892, "kl": 0.055908203125, "learning_rate": 9.99820151416747e-07, "loss": 0.0022, "reward": 2.0269689559936523, "reward_std": 0.044902682304382324, "rewards/accuracy_reward": 0.8332187533378601, "rewards/format_reward": 1.0, "step": 619 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.125, "epoch": 0.008551606184742279, "grad_norm": 1.9764547817206366, "kl": 0.052490234375, "learning_rate": 9.998195698883771e-07, "loss": 0.0021, "reward": 2.1538748741149902, "reward_std": 0.015444188378751278, "rewards/accuracy_reward": 0.953874945640564, "rewards/format_reward": 1.0, "step": 620 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.4375, "epoch": 0.008565399097943476, "grad_norm": 5.064746496796814, "kl": 0.0537109375, "learning_rate": 9.99818987421527e-07, "loss": 0.0021, "reward": 2.0861563682556152, "reward_std": 0.03293747082352638, "rewards/accuracy_reward": 0.8861562013626099, "rewards/format_reward": 1.0, "step": 621 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.34375, "epoch": 0.008579192011144673, "grad_norm": 2.147137270741646, "kl": 0.05517578125, "learning_rate": 9.99818404016198e-07, "loss": 0.0022, "reward": 2.0408437252044678, "reward_std": 0.05207609012722969, "rewards/accuracy_reward": 0.8470937013626099, "rewards/format_reward": 1.0, "step": 622 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.4375, "epoch": 0.00859298492434587, "grad_norm": 1.626374703900478, "kl": 0.052978515625, "learning_rate": 9.998178196723914e-07, "loss": 0.0021, "reward": 2.0898125171661377, "reward_std": 0.015274680219590664, "rewards/accuracy_reward": 0.8898124694824219, "rewards/format_reward": 1.0, "step": 623 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.78125, "epoch": 0.008606777837547068, "grad_norm": 2.023997008424375, "kl": 0.06005859375, "learning_rate": 9.998172343901082e-07, "loss": 0.0024, "reward": 2.0917811393737793, "reward_std": 0.04930365830659866, "rewards/accuracy_reward": 0.8980312347412109, "rewards/format_reward": 1.0, "step": 624 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.71875, "epoch": 0.008620570750748265, "grad_norm": 2.2693899267378077, "kl": 0.05859375, "learning_rate": 9.998166481693492e-07, "loss": 0.0024, "reward": 1.9982812404632568, "reward_std": 0.03485594689846039, "rewards/accuracy_reward": 0.798281192779541, "rewards/format_reward": 1.0, "step": 625 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.875, "epoch": 0.008634363663949462, "grad_norm": 2.5685082513863056, "kl": 0.052734375, "learning_rate": 9.998160610101158e-07, "loss": 0.0021, "reward": 2.138312578201294, "reward_std": 0.034911856055259705, "rewards/accuracy_reward": 0.944562554359436, "rewards/format_reward": 1.0, "step": 626 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 439.78125, "epoch": 0.00864815657715066, "grad_norm": 2.4530750930755456, "kl": 0.0595703125, "learning_rate": 9.99815472912409e-07, "loss": 0.0024, "reward": 2.0868124961853027, "reward_std": 0.034031230956315994, "rewards/accuracy_reward": 0.8868125081062317, "rewards/format_reward": 1.0, "step": 627 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.25, "epoch": 0.008661949490351857, "grad_norm": 2.078663632867618, "kl": 0.05517578125, "learning_rate": 9.9981488387623e-07, "loss": 0.0022, "reward": 2.15415620803833, "reward_std": 0.022934025153517723, "rewards/accuracy_reward": 0.9541562795639038, "rewards/format_reward": 1.0, "step": 628 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.15625, "epoch": 0.008675742403553054, "grad_norm": 2.7524826792569055, "kl": 0.05615234375, "learning_rate": 9.998142939015798e-07, "loss": 0.0023, "reward": 2.1138124465942383, "reward_std": 0.039046578109264374, "rewards/accuracy_reward": 0.913812518119812, "rewards/format_reward": 1.0, "step": 629 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.40625, "epoch": 0.008689535316754251, "grad_norm": 2.22296900028479, "kl": 0.0576171875, "learning_rate": 9.998137029884592e-07, "loss": 0.0023, "reward": 2.1036875247955322, "reward_std": 0.04939688369631767, "rewards/accuracy_reward": 0.9099375009536743, "rewards/format_reward": 1.0, "step": 630 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.78125, "epoch": 0.008703328229955449, "grad_norm": 2.5642466104647212, "kl": 0.051513671875, "learning_rate": 9.9981311113687e-07, "loss": 0.0021, "reward": 1.984781265258789, "reward_std": 0.05748384818434715, "rewards/accuracy_reward": 0.7972812652587891, "rewards/format_reward": 1.0, "step": 631 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.46875, "epoch": 0.008717121143156646, "grad_norm": 1.8367440728380107, "kl": 0.053466796875, "learning_rate": 9.998125183468128e-07, "loss": 0.0021, "reward": 2.0717811584472656, "reward_std": 0.03157765418291092, "rewards/accuracy_reward": 0.8717812299728394, "rewards/format_reward": 1.0, "step": 632 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.28125, "epoch": 0.008730914056357843, "grad_norm": 2.438469423462321, "kl": 0.056884765625, "learning_rate": 9.99811924618289e-07, "loss": 0.0023, "reward": 2.087296962738037, "reward_std": 0.03855868801474571, "rewards/accuracy_reward": 0.8935469388961792, "rewards/format_reward": 1.0, "step": 633 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.625, "epoch": 0.00874470696955904, "grad_norm": 2.771776532314638, "kl": 0.04833984375, "learning_rate": 9.998113299512996e-07, "loss": 0.0019, "reward": 2.056421995162964, "reward_std": 0.03198815882205963, "rewards/accuracy_reward": 0.8564218878746033, "rewards/format_reward": 1.0, "step": 634 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.40625, "epoch": 0.008758499882760238, "grad_norm": 2.1101736607696138, "kl": 0.05126953125, "learning_rate": 9.998107343458456e-07, "loss": 0.0021, "reward": 2.0679280757904053, "reward_std": 0.01697995699942112, "rewards/accuracy_reward": 0.867928147315979, "rewards/format_reward": 1.0, "step": 635 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.5, "epoch": 0.008772292795961435, "grad_norm": 2.3569885993940654, "kl": 0.05859375, "learning_rate": 9.998101378019284e-07, "loss": 0.0023, "reward": 1.9881563186645508, "reward_std": 0.04332330450415611, "rewards/accuracy_reward": 0.7944062352180481, "rewards/format_reward": 1.0, "step": 636 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.65625, "epoch": 0.008786085709162632, "grad_norm": 2.090929273324986, "kl": 0.052490234375, "learning_rate": 9.998095403195487e-07, "loss": 0.0021, "reward": 2.1616876125335693, "reward_std": 0.03680138662457466, "rewards/accuracy_reward": 0.9679374694824219, "rewards/format_reward": 1.0, "step": 637 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.1875, "epoch": 0.00879987862236383, "grad_norm": 2.214879025935309, "kl": 0.055908203125, "learning_rate": 9.99808941898708e-07, "loss": 0.0022, "reward": 2.037156105041504, "reward_std": 0.03522922843694687, "rewards/accuracy_reward": 0.8434063196182251, "rewards/format_reward": 1.0, "step": 638 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.0625, "epoch": 0.008813671535565027, "grad_norm": 2.1205317647208646, "kl": 0.05224609375, "learning_rate": 9.998083425394072e-07, "loss": 0.0021, "reward": 2.1116719245910645, "reward_std": 0.04663609713315964, "rewards/accuracy_reward": 0.9179219007492065, "rewards/format_reward": 1.0, "step": 639 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.78125, "epoch": 0.008827464448766224, "grad_norm": 2.042338839071063, "kl": 0.05517578125, "learning_rate": 9.998077422416476e-07, "loss": 0.0022, "reward": 1.876499891281128, "reward_std": 0.03659955412149429, "rewards/accuracy_reward": 0.6827499270439148, "rewards/format_reward": 1.0, "step": 640 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.0, "epoch": 0.008841257361967421, "grad_norm": 2.1692973668032036, "kl": 0.052001953125, "learning_rate": 9.9980714100543e-07, "loss": 0.0021, "reward": 2.0929999351501465, "reward_std": 0.020299028605222702, "rewards/accuracy_reward": 0.8929999470710754, "rewards/format_reward": 1.0, "step": 641 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.15625, "epoch": 0.008855050275168619, "grad_norm": 8.145238013831461, "kl": 0.05419921875, "learning_rate": 9.99806538830756e-07, "loss": 0.0022, "reward": 1.9901564121246338, "reward_std": 0.027446966618299484, "rewards/accuracy_reward": 0.7901561856269836, "rewards/format_reward": 1.0, "step": 642 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.46875, "epoch": 0.008868843188369816, "grad_norm": 3.5870141350540905, "kl": 0.05322265625, "learning_rate": 9.998059357176264e-07, "loss": 0.0021, "reward": 1.9034688472747803, "reward_std": 0.10066314041614532, "rewards/accuracy_reward": 0.7159687280654907, "rewards/format_reward": 1.0, "step": 643 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.4375, "epoch": 0.008882636101571013, "grad_norm": 5.5183408870639825, "kl": 0.052734375, "learning_rate": 9.998053316660427e-07, "loss": 0.0021, "reward": 2.071718692779541, "reward_std": 0.051681824028491974, "rewards/accuracy_reward": 0.8717188239097595, "rewards/format_reward": 1.0, "step": 644 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.9375, "epoch": 0.00889642901477221, "grad_norm": 3.3821479870666344, "kl": 0.0595703125, "learning_rate": 9.998047266760055e-07, "loss": 0.0024, "reward": 2.026750087738037, "reward_std": 0.0515354759991169, "rewards/accuracy_reward": 0.8267500400543213, "rewards/format_reward": 1.0, "step": 645 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.34375, "epoch": 0.008910221927973408, "grad_norm": 2.653034654723109, "kl": 0.0537109375, "learning_rate": 9.998041207475162e-07, "loss": 0.0022, "reward": 2.0444374084472656, "reward_std": 0.03171418607234955, "rewards/accuracy_reward": 0.8444375991821289, "rewards/format_reward": 1.0, "step": 646 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.53125, "epoch": 0.008924014841174605, "grad_norm": 4.232587551749829, "kl": 0.05224609375, "learning_rate": 9.99803513880576e-07, "loss": 0.0021, "reward": 2.1110939979553223, "reward_std": 0.011543495580554008, "rewards/accuracy_reward": 0.9110937714576721, "rewards/format_reward": 1.0, "step": 647 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.46875, "epoch": 0.008937807754375802, "grad_norm": 2.1555621268191216, "kl": 0.053955078125, "learning_rate": 9.99802906075186e-07, "loss": 0.0022, "reward": 2.139343738555908, "reward_std": 0.04276597499847412, "rewards/accuracy_reward": 0.9518437385559082, "rewards/format_reward": 1.0, "step": 648 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.875, "epoch": 0.008951600667577, "grad_norm": 9.638718561777605, "kl": 0.050048828125, "learning_rate": 9.998022973313471e-07, "loss": 0.002, "reward": 2.0935935974121094, "reward_std": 0.035451896488666534, "rewards/accuracy_reward": 0.899843692779541, "rewards/format_reward": 1.0, "step": 649 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.5625, "epoch": 0.008965393580778197, "grad_norm": 2.136015591482648, "kl": 0.05615234375, "learning_rate": 9.998016876490608e-07, "loss": 0.0023, "reward": 2.0662031173706055, "reward_std": 0.022263051941990852, "rewards/accuracy_reward": 0.8662031292915344, "rewards/format_reward": 1.0, "step": 650 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.125, "epoch": 0.008979186493979394, "grad_norm": 2.5408916769689975, "kl": 0.052001953125, "learning_rate": 9.998010770283283e-07, "loss": 0.0021, "reward": 2.123812675476074, "reward_std": 0.01799178309738636, "rewards/accuracy_reward": 0.9238125085830688, "rewards/format_reward": 1.0, "step": 651 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.3125, "epoch": 0.008992979407180591, "grad_norm": 1.713585949738345, "kl": 0.05712890625, "learning_rate": 9.998004654691504e-07, "loss": 0.0023, "reward": 2.0980000495910645, "reward_std": 0.011981104500591755, "rewards/accuracy_reward": 0.8980000019073486, "rewards/format_reward": 1.0, "step": 652 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.59375, "epoch": 0.009006772320381789, "grad_norm": 1.8164043796813294, "kl": 0.05517578125, "learning_rate": 9.997998529715284e-07, "loss": 0.0022, "reward": 2.093562602996826, "reward_std": 0.03193996101617813, "rewards/accuracy_reward": 0.8935624957084656, "rewards/format_reward": 1.0, "step": 653 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.5625, "epoch": 0.009020565233582986, "grad_norm": 2.1294364480088603, "kl": 0.050537109375, "learning_rate": 9.997992395354634e-07, "loss": 0.002, "reward": 2.134812355041504, "reward_std": 0.02538403868675232, "rewards/accuracy_reward": 0.9348125457763672, "rewards/format_reward": 1.0, "step": 654 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.59375, "epoch": 0.009034358146784183, "grad_norm": 1.9213315905645845, "kl": 0.0498046875, "learning_rate": 9.997986251609567e-07, "loss": 0.002, "reward": 2.1349687576293945, "reward_std": 0.04420100897550583, "rewards/accuracy_reward": 0.9412187933921814, "rewards/format_reward": 1.0, "step": 655 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.5, "epoch": 0.00904815105998538, "grad_norm": 2.391636972620343, "kl": 0.0537109375, "learning_rate": 9.997980098480092e-07, "loss": 0.0021, "reward": 2.0859999656677246, "reward_std": 0.02876342460513115, "rewards/accuracy_reward": 0.8860000371932983, "rewards/format_reward": 1.0, "step": 656 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.46875, "epoch": 0.009061943973186578, "grad_norm": 2.517320094983817, "kl": 0.052001953125, "learning_rate": 9.997973935966224e-07, "loss": 0.0021, "reward": 2.0307188034057617, "reward_std": 0.04952946677803993, "rewards/accuracy_reward": 0.8369687795639038, "rewards/format_reward": 1.0, "step": 657 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.65625, "epoch": 0.009075736886387773, "grad_norm": 2.891569112880208, "kl": 0.060546875, "learning_rate": 9.997967764067972e-07, "loss": 0.0024, "reward": 2.065577983856201, "reward_std": 0.05062235891819, "rewards/accuracy_reward": 0.8718281388282776, "rewards/format_reward": 1.0, "step": 658 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.625, "epoch": 0.00908952979958897, "grad_norm": 3.7156255359803385, "kl": 0.05859375, "learning_rate": 9.99796158278535e-07, "loss": 0.0023, "reward": 2.0514373779296875, "reward_std": 0.02630566619336605, "rewards/accuracy_reward": 0.851437509059906, "rewards/format_reward": 1.0, "step": 659 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 428.65625, "epoch": 0.009103322712790168, "grad_norm": 2.9580197043360426, "kl": 0.05419921875, "learning_rate": 9.997955392118365e-07, "loss": 0.0022, "reward": 1.9528437852859497, "reward_std": 0.044448766857385635, "rewards/accuracy_reward": 0.7528437376022339, "rewards/format_reward": 1.0, "step": 660 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.25, "epoch": 0.009117115625991365, "grad_norm": 2.851442801582792, "kl": 0.052734375, "learning_rate": 9.997949192067032e-07, "loss": 0.0021, "reward": 2.1162500381469727, "reward_std": 0.026071693748235703, "rewards/accuracy_reward": 0.9162500500679016, "rewards/format_reward": 1.0, "step": 661 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.71875, "epoch": 0.009130908539192562, "grad_norm": 4.329190577327829, "kl": 0.060546875, "learning_rate": 9.997942982631363e-07, "loss": 0.0024, "reward": 2.097062587738037, "reward_std": 0.03074048087000847, "rewards/accuracy_reward": 0.9033124446868896, "rewards/format_reward": 1.0, "step": 662 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.65625, "epoch": 0.00914470145239376, "grad_norm": 2.6330735040202415, "kl": 0.05810546875, "learning_rate": 9.997936763811367e-07, "loss": 0.0023, "reward": 1.959281325340271, "reward_std": 0.07087946683168411, "rewards/accuracy_reward": 0.7717812061309814, "rewards/format_reward": 1.0, "step": 663 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.5625, "epoch": 0.009158494365594957, "grad_norm": 4.4723131732354355, "kl": 0.052734375, "learning_rate": 9.99793053560706e-07, "loss": 0.0021, "reward": 1.8886876106262207, "reward_std": 0.04330645874142647, "rewards/accuracy_reward": 0.6886875629425049, "rewards/format_reward": 1.0, "step": 664 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.71875, "epoch": 0.009172287278796154, "grad_norm": 2.5958242763529604, "kl": 0.05517578125, "learning_rate": 9.99792429801845e-07, "loss": 0.0022, "reward": 2.057281255722046, "reward_std": 0.034680403769016266, "rewards/accuracy_reward": 0.8572812676429749, "rewards/format_reward": 1.0, "step": 665 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.28125, "epoch": 0.009186080191997351, "grad_norm": 2.6017487494919274, "kl": 0.06298828125, "learning_rate": 9.99791805104555e-07, "loss": 0.0025, "reward": 2.075312614440918, "reward_std": 0.029237957671284676, "rewards/accuracy_reward": 0.8753124475479126, "rewards/format_reward": 1.0, "step": 666 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.21875, "epoch": 0.009199873105198549, "grad_norm": 2.6924402370209854, "kl": 0.05615234375, "learning_rate": 9.997911794688372e-07, "loss": 0.0022, "reward": 2.1150624752044678, "reward_std": 0.020551782101392746, "rewards/accuracy_reward": 0.9150625467300415, "rewards/format_reward": 1.0, "step": 667 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 447.5, "epoch": 0.009213666018399746, "grad_norm": 2.170205277153537, "kl": 0.059326171875, "learning_rate": 9.997905528946927e-07, "loss": 0.0024, "reward": 1.9899688959121704, "reward_std": 0.08910458534955978, "rewards/accuracy_reward": 0.821218729019165, "rewards/format_reward": 1.0, "step": 668 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.125, "epoch": 0.009227458931600943, "grad_norm": 2.2706751225097332, "kl": 0.05126953125, "learning_rate": 9.997899253821227e-07, "loss": 0.002, "reward": 2.0256874561309814, "reward_std": 0.01520544197410345, "rewards/accuracy_reward": 0.8256875276565552, "rewards/format_reward": 1.0, "step": 669 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 437.4375, "epoch": 0.00924125184480214, "grad_norm": 4.737529312533587, "kl": 0.05810546875, "learning_rate": 9.997892969311284e-07, "loss": 0.0023, "reward": 1.9027812480926514, "reward_std": 0.04217426851391792, "rewards/accuracy_reward": 0.7027812600135803, "rewards/format_reward": 1.0, "step": 670 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.5625, "epoch": 0.009255044758003338, "grad_norm": 1.6125817235718418, "kl": 0.053466796875, "learning_rate": 9.997886675417108e-07, "loss": 0.0021, "reward": 2.13212513923645, "reward_std": 0.03144735097885132, "rewards/accuracy_reward": 0.9321250319480896, "rewards/format_reward": 1.0, "step": 671 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.375, "epoch": 0.009268837671204535, "grad_norm": 2.6187150115702686, "kl": 0.0595703125, "learning_rate": 9.997880372138716e-07, "loss": 0.0024, "reward": 1.999000072479248, "reward_std": 0.021543916314840317, "rewards/accuracy_reward": 0.7990000247955322, "rewards/format_reward": 1.0, "step": 672 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 435.78125, "epoch": 0.009282630584405732, "grad_norm": 2.228889154565372, "kl": 0.05615234375, "learning_rate": 9.997874059476114e-07, "loss": 0.0023, "reward": 2.0667500495910645, "reward_std": 0.0515543594956398, "rewards/accuracy_reward": 0.8730000257492065, "rewards/format_reward": 1.0, "step": 673 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.00929642349760693, "grad_norm": 1.6510331416126716, "kl": 0.05322265625, "learning_rate": 9.997867737429317e-07, "loss": 0.0021, "reward": 2.1133124828338623, "reward_std": 0.019771721214056015, "rewards/accuracy_reward": 0.913312554359436, "rewards/format_reward": 1.0, "step": 674 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 429.28125, "epoch": 0.009310216410808127, "grad_norm": 1.6296588009543245, "kl": 0.05712890625, "learning_rate": 9.997861405998336e-07, "loss": 0.0023, "reward": 1.954437494277954, "reward_std": 0.02436612918972969, "rewards/accuracy_reward": 0.7544374465942383, "rewards/format_reward": 1.0, "step": 675 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.75, "epoch": 0.009324009324009324, "grad_norm": 1.6872785109755712, "kl": 0.052978515625, "learning_rate": 9.997855065183182e-07, "loss": 0.0021, "reward": 2.0685625076293945, "reward_std": 0.015411928296089172, "rewards/accuracy_reward": 0.8685625195503235, "rewards/format_reward": 1.0, "step": 676 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.125, "epoch": 0.009337802237210521, "grad_norm": 2.2346468409405906, "kl": 0.05712890625, "learning_rate": 9.99784871498387e-07, "loss": 0.0023, "reward": 2.0950937271118164, "reward_std": 0.034140318632125854, "rewards/accuracy_reward": 0.8950937986373901, "rewards/format_reward": 1.0, "step": 677 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.78125, "epoch": 0.009351595150411719, "grad_norm": 2.0608301032918446, "kl": 0.05126953125, "learning_rate": 9.997842355400407e-07, "loss": 0.002, "reward": 2.0921249389648438, "reward_std": 0.04572833329439163, "rewards/accuracy_reward": 0.8983750343322754, "rewards/format_reward": 1.0, "step": 678 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.4375, "epoch": 0.009365388063612916, "grad_norm": 2.551873666217073, "kl": 0.053466796875, "learning_rate": 9.99783598643281e-07, "loss": 0.0021, "reward": 1.9982187747955322, "reward_std": 0.030461709946393967, "rewards/accuracy_reward": 0.7982187271118164, "rewards/format_reward": 1.0, "step": 679 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.71875, "epoch": 0.009379180976814113, "grad_norm": 2.2994885154254847, "kl": 0.0556640625, "learning_rate": 9.997829608081088e-07, "loss": 0.0022, "reward": 2.111656427383423, "reward_std": 0.01765572652220726, "rewards/accuracy_reward": 0.9116563200950623, "rewards/format_reward": 1.0, "step": 680 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.90625, "epoch": 0.00939297389001531, "grad_norm": 1.755166055642458, "kl": 0.060546875, "learning_rate": 9.997823220345255e-07, "loss": 0.0024, "reward": 2.1281561851501465, "reward_std": 0.01356419362127781, "rewards/accuracy_reward": 0.9281562566757202, "rewards/format_reward": 1.0, "step": 681 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.75, "epoch": 0.009406766803216508, "grad_norm": 2.439398119219092, "kl": 0.052734375, "learning_rate": 9.99781682322532e-07, "loss": 0.0021, "reward": 2.07603120803833, "reward_std": 0.0280348751693964, "rewards/accuracy_reward": 0.876031219959259, "rewards/format_reward": 1.0, "step": 682 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.009420559716417705, "grad_norm": 2.0539492625115163, "kl": 0.05419921875, "learning_rate": 9.997810416721296e-07, "loss": 0.0022, "reward": 2.0603437423706055, "reward_std": 0.068871408700943, "rewards/accuracy_reward": 0.8790937662124634, "rewards/format_reward": 1.0, "step": 683 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 434.65625, "epoch": 0.009434352629618902, "grad_norm": 3.80888034345577, "kl": 0.05517578125, "learning_rate": 9.997804000833197e-07, "loss": 0.0022, "reward": 2.1111249923706055, "reward_std": 0.0444219671189785, "rewards/accuracy_reward": 0.9236249923706055, "rewards/format_reward": 1.0, "step": 684 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.0, "epoch": 0.0094481455428201, "grad_norm": 2.0338356248523928, "kl": 0.056640625, "learning_rate": 9.997797575561033e-07, "loss": 0.0023, "reward": 1.9871562719345093, "reward_std": 0.025368230417370796, "rewards/accuracy_reward": 0.7871562242507935, "rewards/format_reward": 1.0, "step": 685 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.90625, "epoch": 0.009461938456021297, "grad_norm": 14.893743614147475, "kl": 0.046630859375, "learning_rate": 9.99779114090482e-07, "loss": 0.0019, "reward": 2.054281234741211, "reward_std": 0.03852001577615738, "rewards/accuracy_reward": 0.860531210899353, "rewards/format_reward": 1.0, "step": 686 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.59375, "epoch": 0.009475731369222494, "grad_norm": 4.024717281836114, "kl": 0.05712890625, "learning_rate": 9.997784696864563e-07, "loss": 0.0023, "reward": 1.9925625324249268, "reward_std": 0.034247685223817825, "rewards/accuracy_reward": 0.7925624847412109, "rewards/format_reward": 1.0, "step": 687 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 441.5, "epoch": 0.009489524282423691, "grad_norm": 1.9772025997395857, "kl": 0.047119140625, "learning_rate": 9.99777824344028e-07, "loss": 0.0019, "reward": 2.0151562690734863, "reward_std": 0.06658585369586945, "rewards/accuracy_reward": 0.8276562690734863, "rewards/format_reward": 1.0, "step": 688 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.78125, "epoch": 0.009503317195624889, "grad_norm": 1.988712217809896, "kl": 0.0546875, "learning_rate": 9.99777178063198e-07, "loss": 0.0022, "reward": 2.114187717437744, "reward_std": 0.026240374892950058, "rewards/accuracy_reward": 0.9141875505447388, "rewards/format_reward": 1.0, "step": 689 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.6875, "epoch": 0.009517110108826086, "grad_norm": 2.703541324250646, "kl": 0.05419921875, "learning_rate": 9.997765308439677e-07, "loss": 0.0022, "reward": 2.0669374465942383, "reward_std": 0.03413967788219452, "rewards/accuracy_reward": 0.8731874227523804, "rewards/format_reward": 1.0, "step": 690 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.6875, "epoch": 0.009530903022027283, "grad_norm": 3.6799241491006187, "kl": 0.05810546875, "learning_rate": 9.99775882686338e-07, "loss": 0.0023, "reward": 2.0754687786102295, "reward_std": 0.02775740996003151, "rewards/accuracy_reward": 0.8754687309265137, "rewards/format_reward": 1.0, "step": 691 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.21875, "epoch": 0.00954469593522848, "grad_norm": 1.890937507143549, "kl": 0.049560546875, "learning_rate": 9.997752335903106e-07, "loss": 0.002, "reward": 2.0451250076293945, "reward_std": 0.0170753076672554, "rewards/accuracy_reward": 0.8451249599456787, "rewards/format_reward": 1.0, "step": 692 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.8125, "epoch": 0.009558488848429678, "grad_norm": 3.194346189271351, "kl": 0.05517578125, "learning_rate": 9.997745835558865e-07, "loss": 0.0022, "reward": 2.118000030517578, "reward_std": 0.03149932622909546, "rewards/accuracy_reward": 0.9180000424385071, "rewards/format_reward": 1.0, "step": 693 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.375, "epoch": 0.009572281761630875, "grad_norm": 2.331340300265999, "kl": 0.0546875, "learning_rate": 9.997739325830668e-07, "loss": 0.0022, "reward": 1.9761874675750732, "reward_std": 0.022366223856806755, "rewards/accuracy_reward": 0.776187539100647, "rewards/format_reward": 1.0, "step": 694 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.34375, "epoch": 0.00958607467483207, "grad_norm": 3.8794420702626975, "kl": 0.05322265625, "learning_rate": 9.997732806718528e-07, "loss": 0.0021, "reward": 2.000906467437744, "reward_std": 0.03193633258342743, "rewards/accuracy_reward": 0.8009061813354492, "rewards/format_reward": 1.0, "step": 695 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.0625, "epoch": 0.009599867588033268, "grad_norm": 2.201290748917426, "kl": 0.0498046875, "learning_rate": 9.997726278222459e-07, "loss": 0.002, "reward": 2.079531192779541, "reward_std": 0.03386136516928673, "rewards/accuracy_reward": 0.87953120470047, "rewards/format_reward": 1.0, "step": 696 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.875, "epoch": 0.009613660501234465, "grad_norm": 2.1247553847038083, "kl": 0.060546875, "learning_rate": 9.997719740342472e-07, "loss": 0.0024, "reward": 2.0337185859680176, "reward_std": 0.025783026590943336, "rewards/accuracy_reward": 0.8399688005447388, "rewards/format_reward": 1.0, "step": 697 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 419.4375, "epoch": 0.009627453414435662, "grad_norm": 2.356167965383202, "kl": 0.0634765625, "learning_rate": 9.997713193078577e-07, "loss": 0.0025, "reward": 2.0381250381469727, "reward_std": 0.06399115175008774, "rewards/accuracy_reward": 0.8506250381469727, "rewards/format_reward": 1.0, "step": 698 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.625, "epoch": 0.00964124632763686, "grad_norm": 3.843978059591917, "kl": 0.056396484375, "learning_rate": 9.99770663643079e-07, "loss": 0.0023, "reward": 2.1421561241149902, "reward_std": 0.04591123387217522, "rewards/accuracy_reward": 0.9484062790870667, "rewards/format_reward": 1.0, "step": 699 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.65625, "epoch": 0.009655039240838057, "grad_norm": 2.255131334928212, "kl": 0.05615234375, "learning_rate": 9.99770007039912e-07, "loss": 0.0022, "reward": 2.0332188606262207, "reward_std": 0.03118271194398403, "rewards/accuracy_reward": 0.8394687175750732, "rewards/format_reward": 1.0, "step": 700 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.75, "epoch": 0.009668832154039254, "grad_norm": 1.6758072177209355, "kl": 0.05712890625, "learning_rate": 9.997693494983582e-07, "loss": 0.0023, "reward": 1.9993125200271606, "reward_std": 0.03307871147990227, "rewards/accuracy_reward": 0.8055624961853027, "rewards/format_reward": 1.0, "step": 701 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.84375, "epoch": 0.009682625067240451, "grad_norm": 10.804572953693194, "kl": 0.0537109375, "learning_rate": 9.997686910184185e-07, "loss": 0.0021, "reward": 1.9668437242507935, "reward_std": 0.04112984985113144, "rewards/accuracy_reward": 0.7730937600135803, "rewards/format_reward": 1.0, "step": 702 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.25, "epoch": 0.009696417980441649, "grad_norm": 2.089305158194791, "kl": 0.0498046875, "learning_rate": 9.997680316000946e-07, "loss": 0.002, "reward": 2.128281354904175, "reward_std": 0.036959290504455566, "rewards/accuracy_reward": 0.9345312118530273, "rewards/format_reward": 1.0, "step": 703 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.03125, "epoch": 0.009710210893642846, "grad_norm": 2.3470899762622564, "kl": 0.05712890625, "learning_rate": 9.997673712433874e-07, "loss": 0.0023, "reward": 1.9417812824249268, "reward_std": 0.02425316348671913, "rewards/accuracy_reward": 0.7417812347412109, "rewards/format_reward": 1.0, "step": 704 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.15625, "epoch": 0.009724003806844043, "grad_norm": 3.184750897642642, "kl": 0.05859375, "learning_rate": 9.997667099482983e-07, "loss": 0.0023, "reward": 2.01743745803833, "reward_std": 0.026041537523269653, "rewards/accuracy_reward": 0.8174375295639038, "rewards/format_reward": 1.0, "step": 705 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.25, "epoch": 0.00973779672004524, "grad_norm": 2.278684680388614, "kl": 0.0625, "learning_rate": 9.997660477148284e-07, "loss": 0.0025, "reward": 2.0754687786102295, "reward_std": 0.039515845477581024, "rewards/accuracy_reward": 0.8754687905311584, "rewards/format_reward": 1.0, "step": 706 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.1875, "epoch": 0.009751589633246438, "grad_norm": 1.8181294863490107, "kl": 0.052734375, "learning_rate": 9.997653845429792e-07, "loss": 0.0021, "reward": 2.13993763923645, "reward_std": 0.015603876672685146, "rewards/accuracy_reward": 0.9399375319480896, "rewards/format_reward": 1.0, "step": 707 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.75, "epoch": 0.009765382546447635, "grad_norm": 4.958501690518395, "kl": 0.0615234375, "learning_rate": 9.997647204327515e-07, "loss": 0.0025, "reward": 1.9874688386917114, "reward_std": 0.023501278832554817, "rewards/accuracy_reward": 0.7874687910079956, "rewards/format_reward": 1.0, "step": 708 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.96875, "epoch": 0.009779175459648832, "grad_norm": 2.9193274904743327, "kl": 0.0498046875, "learning_rate": 9.997640553841471e-07, "loss": 0.002, "reward": 2.1104063987731934, "reward_std": 0.04291394725441933, "rewards/accuracy_reward": 0.9291562438011169, "rewards/format_reward": 1.0, "step": 709 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.65625, "epoch": 0.00979296837285003, "grad_norm": 3.403500909492158, "kl": 0.059326171875, "learning_rate": 9.997633893971668e-07, "loss": 0.0024, "reward": 2.084843873977661, "reward_std": 0.04556925222277641, "rewards/accuracy_reward": 0.8973437547683716, "rewards/format_reward": 1.0, "step": 710 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.3125, "epoch": 0.009806761286051227, "grad_norm": 6.669534614538442, "kl": 0.05615234375, "learning_rate": 9.99762722471812e-07, "loss": 0.0023, "reward": 2.0758438110351562, "reward_std": 0.029760783538222313, "rewards/accuracy_reward": 0.8758436441421509, "rewards/format_reward": 1.0, "step": 711 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.34375, "epoch": 0.009820554199252424, "grad_norm": 2.5793884394451636, "kl": 0.051513671875, "learning_rate": 9.99762054608084e-07, "loss": 0.0021, "reward": 2.076531410217285, "reward_std": 0.051275793462991714, "rewards/accuracy_reward": 0.8890312910079956, "rewards/format_reward": 1.0, "step": 712 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.21875, "epoch": 0.009834347112453621, "grad_norm": 3.030314055937417, "kl": 0.052001953125, "learning_rate": 9.997613858059842e-07, "loss": 0.0021, "reward": 2.060281276702881, "reward_std": 0.055342938750982285, "rewards/accuracy_reward": 0.8727812767028809, "rewards/format_reward": 1.0, "step": 713 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.84375, "epoch": 0.009848140025654819, "grad_norm": 4.943337065676028, "kl": 0.052490234375, "learning_rate": 9.997607160655133e-07, "loss": 0.0021, "reward": 2.0074687004089355, "reward_std": 0.014242958277463913, "rewards/accuracy_reward": 0.8074687719345093, "rewards/format_reward": 1.0, "step": 714 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.53125, "epoch": 0.009861932938856016, "grad_norm": 1.8960911088419596, "kl": 0.05517578125, "learning_rate": 9.997600453866732e-07, "loss": 0.0022, "reward": 2.0877187252044678, "reward_std": 0.021724281832575798, "rewards/accuracy_reward": 0.8939687609672546, "rewards/format_reward": 1.0, "step": 715 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.125, "epoch": 0.009875725852057213, "grad_norm": 6.662629453034917, "kl": 0.0556640625, "learning_rate": 9.99759373769465e-07, "loss": 0.0022, "reward": 2.0525624752044678, "reward_std": 0.031142540276050568, "rewards/accuracy_reward": 0.8525625467300415, "rewards/format_reward": 1.0, "step": 716 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.75, "epoch": 0.00988951876525841, "grad_norm": 5.982517409715113, "kl": 0.056396484375, "learning_rate": 9.997587012138895e-07, "loss": 0.0023, "reward": 1.9877188205718994, "reward_std": 0.0749661847949028, "rewards/accuracy_reward": 0.8002187609672546, "rewards/format_reward": 1.0, "step": 717 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.009903311678459608, "grad_norm": 2.10317602669681, "kl": 0.06103515625, "learning_rate": 9.997580277199486e-07, "loss": 0.0024, "reward": 2.118468761444092, "reward_std": 0.02256305329501629, "rewards/accuracy_reward": 0.918468713760376, "rewards/format_reward": 1.0, "step": 718 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.625, "epoch": 0.009917104591660805, "grad_norm": 1.5453569358256356, "kl": 0.05322265625, "learning_rate": 9.99757353287643e-07, "loss": 0.0021, "reward": 2.0855937004089355, "reward_std": 0.019628841429948807, "rewards/accuracy_reward": 0.8918437957763672, "rewards/format_reward": 1.0, "step": 719 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.46875, "epoch": 0.009930897504862002, "grad_norm": 2.899620411999658, "kl": 0.048828125, "learning_rate": 9.997566779169746e-07, "loss": 0.002, "reward": 2.058375120162964, "reward_std": 0.018247393891215324, "rewards/accuracy_reward": 0.8583749532699585, "rewards/format_reward": 1.0, "step": 720 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.0099446904180632, "grad_norm": 5.34747389152533, "kl": 0.05908203125, "learning_rate": 9.997560016079442e-07, "loss": 0.0024, "reward": 2.0670623779296875, "reward_std": 0.04352187365293503, "rewards/accuracy_reward": 0.8733124732971191, "rewards/format_reward": 1.0, "step": 721 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.21875, "epoch": 0.009958483331264397, "grad_norm": 4.095829187926484, "kl": 0.06494140625, "learning_rate": 9.997553243605532e-07, "loss": 0.0026, "reward": 2.0938751697540283, "reward_std": 0.036793891340494156, "rewards/accuracy_reward": 0.893875002861023, "rewards/format_reward": 1.0, "step": 722 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.15625, "epoch": 0.009972276244465594, "grad_norm": 2.8405226678825852, "kl": 0.052734375, "learning_rate": 9.997546461748028e-07, "loss": 0.0021, "reward": 2.002812385559082, "reward_std": 0.03477508947253227, "rewards/accuracy_reward": 0.8028125762939453, "rewards/format_reward": 1.0, "step": 723 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.0625, "epoch": 0.009986069157666791, "grad_norm": 2.184283144518582, "kl": 0.050537109375, "learning_rate": 9.997539670506942e-07, "loss": 0.002, "reward": 2.029843807220459, "reward_std": 0.01599831134080887, "rewards/accuracy_reward": 0.8298437595367432, "rewards/format_reward": 1.0, "step": 724 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.96875, "epoch": 0.009999862070867989, "grad_norm": 2.7305813676528734, "kl": 0.0595703125, "learning_rate": 9.99753286988229e-07, "loss": 0.0024, "reward": 2.0062499046325684, "reward_std": 0.011853743344545364, "rewards/accuracy_reward": 0.8062499761581421, "rewards/format_reward": 1.0, "step": 725 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.010013654984069186, "grad_norm": 6.0307912306653595, "kl": 0.05029296875, "learning_rate": 9.997526059874084e-07, "loss": 0.002, "reward": 2.0692501068115234, "reward_std": 0.03133557736873627, "rewards/accuracy_reward": 0.8755000233650208, "rewards/format_reward": 1.0, "step": 726 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.375, "epoch": 0.010027447897270383, "grad_norm": 2.588161925113926, "kl": 0.059814453125, "learning_rate": 9.997519240482333e-07, "loss": 0.0024, "reward": 2.10756254196167, "reward_std": 0.020189443603157997, "rewards/accuracy_reward": 0.9075624942779541, "rewards/format_reward": 1.0, "step": 727 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.6875, "epoch": 0.01004124081047158, "grad_norm": 8.244247699201978, "kl": 0.05322265625, "learning_rate": 9.997512411707055e-07, "loss": 0.0021, "reward": 2.0844063758850098, "reward_std": 0.026970503851771355, "rewards/accuracy_reward": 0.8844062089920044, "rewards/format_reward": 1.0, "step": 728 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.40625, "epoch": 0.010055033723672778, "grad_norm": 2.2085553536407003, "kl": 0.05126953125, "learning_rate": 9.997505573548258e-07, "loss": 0.0021, "reward": 2.1332812309265137, "reward_std": 0.028340646997094154, "rewards/accuracy_reward": 0.9332813024520874, "rewards/format_reward": 1.0, "step": 729 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.875, "epoch": 0.010068826636873975, "grad_norm": 3.971849469983568, "kl": 0.056396484375, "learning_rate": 9.997498726005957e-07, "loss": 0.0023, "reward": 1.9782187938690186, "reward_std": 0.05522511154413223, "rewards/accuracy_reward": 0.7782187461853027, "rewards/format_reward": 1.0, "step": 730 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.4375, "epoch": 0.010082619550075172, "grad_norm": 2.5351340929204755, "kl": 0.0576171875, "learning_rate": 9.997491869080166e-07, "loss": 0.0023, "reward": 2.088562488555908, "reward_std": 0.0400303453207016, "rewards/accuracy_reward": 0.8948124647140503, "rewards/format_reward": 1.0, "step": 731 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.8125, "epoch": 0.01009641246327637, "grad_norm": 3.3568771007081986, "kl": 0.05517578125, "learning_rate": 9.997485002770898e-07, "loss": 0.0022, "reward": 2.0677499771118164, "reward_std": 0.024040717631578445, "rewards/accuracy_reward": 0.8677499890327454, "rewards/format_reward": 1.0, "step": 732 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.1875, "epoch": 0.010110205376477565, "grad_norm": 4.250653356173924, "kl": 0.057373046875, "learning_rate": 9.997478127078162e-07, "loss": 0.0023, "reward": 2.102656364440918, "reward_std": 0.01822405681014061, "rewards/accuracy_reward": 0.9026561975479126, "rewards/format_reward": 1.0, "step": 733 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.010123998289678762, "grad_norm": 1.8931909200220496, "kl": 0.052978515625, "learning_rate": 9.997471242001975e-07, "loss": 0.0021, "reward": 2.045187473297119, "reward_std": 0.027692124247550964, "rewards/accuracy_reward": 0.8451874852180481, "rewards/format_reward": 1.0, "step": 734 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.01013779120287996, "grad_norm": 2.076250263276435, "kl": 0.0498046875, "learning_rate": 9.997464347542348e-07, "loss": 0.002, "reward": 2.143531322479248, "reward_std": 0.03511849790811539, "rewards/accuracy_reward": 0.943531334400177, "rewards/format_reward": 1.0, "step": 735 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.96875, "epoch": 0.010151584116081157, "grad_norm": 2.531616358469332, "kl": 0.061279296875, "learning_rate": 9.997457443699296e-07, "loss": 0.0025, "reward": 2.0248751640319824, "reward_std": 0.028868505731225014, "rewards/accuracy_reward": 0.824874997138977, "rewards/format_reward": 1.0, "step": 736 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.84375, "epoch": 0.010165377029282354, "grad_norm": 2.2248606975885394, "kl": 0.052734375, "learning_rate": 9.997450530472829e-07, "loss": 0.0021, "reward": 2.115875244140625, "reward_std": 0.024346042424440384, "rewards/accuracy_reward": 0.9158750176429749, "rewards/format_reward": 1.0, "step": 737 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.4375, "epoch": 0.010179169942483551, "grad_norm": 3.990291579350817, "kl": 0.06396484375, "learning_rate": 9.997443607862961e-07, "loss": 0.0026, "reward": 2.0698437690734863, "reward_std": 0.030115721747279167, "rewards/accuracy_reward": 0.8698437213897705, "rewards/format_reward": 1.0, "step": 738 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.0625, "epoch": 0.010192962855684749, "grad_norm": 2.1648224720026317, "kl": 0.0546875, "learning_rate": 9.997436675869708e-07, "loss": 0.0022, "reward": 2.111593723297119, "reward_std": 0.01349850557744503, "rewards/accuracy_reward": 0.9115937948226929, "rewards/format_reward": 1.0, "step": 739 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.875, "epoch": 0.010206755768885946, "grad_norm": 8.778738681829669, "kl": 0.05322265625, "learning_rate": 9.997429734493078e-07, "loss": 0.0021, "reward": 2.0667500495910645, "reward_std": 0.03505605831742287, "rewards/accuracy_reward": 0.8667500019073486, "rewards/format_reward": 1.0, "step": 740 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.65625, "epoch": 0.010220548682087143, "grad_norm": 3.157562387234862, "kl": 0.0693359375, "learning_rate": 9.997422783733088e-07, "loss": 0.0028, "reward": 2.147312641143799, "reward_std": 0.01136963814496994, "rewards/accuracy_reward": 0.9473124742507935, "rewards/format_reward": 1.0, "step": 741 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.5, "epoch": 0.01023434159528834, "grad_norm": 3.628114583517919, "kl": 0.060546875, "learning_rate": 9.997415823589749e-07, "loss": 0.0024, "reward": 2.0112812519073486, "reward_std": 0.037713997066020966, "rewards/accuracy_reward": 0.8112813234329224, "rewards/format_reward": 1.0, "step": 742 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.010248134508489538, "grad_norm": 2.744615880699793, "kl": 0.054931640625, "learning_rate": 9.997408854063074e-07, "loss": 0.0022, "reward": 2.139531373977661, "reward_std": 0.02022724784910679, "rewards/accuracy_reward": 0.9395312666893005, "rewards/format_reward": 1.0, "step": 743 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.875, "epoch": 0.010261927421690735, "grad_norm": 2.3453082636299487, "kl": 0.05419921875, "learning_rate": 9.997401875153077e-07, "loss": 0.0022, "reward": 2.168062686920166, "reward_std": 0.012410825118422508, "rewards/accuracy_reward": 0.9680625200271606, "rewards/format_reward": 1.0, "step": 744 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.21875, "epoch": 0.010275720334891932, "grad_norm": 7.420707809296835, "kl": 0.051025390625, "learning_rate": 9.99739488685977e-07, "loss": 0.002, "reward": 2.1364688873291016, "reward_std": 0.018790986388921738, "rewards/accuracy_reward": 0.936468780040741, "rewards/format_reward": 1.0, "step": 745 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.01028951324809313, "grad_norm": 2.5412024334759855, "kl": 0.058837890625, "learning_rate": 9.99738788918317e-07, "loss": 0.0023, "reward": 2.1006250381469727, "reward_std": 0.029206018894910812, "rewards/accuracy_reward": 0.9068750143051147, "rewards/format_reward": 1.0, "step": 746 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.40625, "epoch": 0.010303306161294327, "grad_norm": 2.6938347949444337, "kl": 0.058837890625, "learning_rate": 9.997380882123285e-07, "loss": 0.0024, "reward": 1.9860000610351562, "reward_std": 0.033022359013557434, "rewards/accuracy_reward": 0.7860000133514404, "rewards/format_reward": 1.0, "step": 747 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.6875, "epoch": 0.010317099074495524, "grad_norm": 3.2975122364936538, "kl": 0.053466796875, "learning_rate": 9.99737386568013e-07, "loss": 0.0021, "reward": 2.0756564140319824, "reward_std": 0.016209229826927185, "rewards/accuracy_reward": 0.875656247138977, "rewards/format_reward": 1.0, "step": 748 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 385.0625, "epoch": 0.010330891987696721, "grad_norm": 3.0682635457077487, "kl": 0.056640625, "learning_rate": 9.997366839853718e-07, "loss": 0.0023, "reward": 1.97740638256073, "reward_std": 0.04185992479324341, "rewards/accuracy_reward": 0.7899062633514404, "rewards/format_reward": 1.0, "step": 749 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.010344684900897919, "grad_norm": 7.465632250146028, "kl": 0.0517578125, "learning_rate": 9.997359804644064e-07, "loss": 0.0021, "reward": 2.0840938091278076, "reward_std": 0.021526526659727097, "rewards/accuracy_reward": 0.8840937614440918, "rewards/format_reward": 1.0, "step": 750 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.0, "epoch": 0.010358477814099116, "grad_norm": 2.715302356725908, "kl": 0.05419921875, "learning_rate": 9.997352760051178e-07, "loss": 0.0022, "reward": 2.1145310401916504, "reward_std": 0.03436526656150818, "rewards/accuracy_reward": 0.9207813143730164, "rewards/format_reward": 1.0, "step": 751 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.010372270727300313, "grad_norm": 2.3986932526547644, "kl": 0.052001953125, "learning_rate": 9.997345706075077e-07, "loss": 0.0021, "reward": 2.1231250762939453, "reward_std": 0.015583636239171028, "rewards/accuracy_reward": 0.9231250286102295, "rewards/format_reward": 1.0, "step": 752 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.3125, "epoch": 0.01038606364050151, "grad_norm": 2.4024529943184145, "kl": 0.0537109375, "learning_rate": 9.997338642715771e-07, "loss": 0.0022, "reward": 2.122406244277954, "reward_std": 0.04960613697767258, "rewards/accuracy_reward": 0.9349062442779541, "rewards/format_reward": 1.0, "step": 753 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.875, "epoch": 0.010399856553702708, "grad_norm": 3.143490293985404, "kl": 0.0517578125, "learning_rate": 9.997331569973274e-07, "loss": 0.0021, "reward": 2.130312442779541, "reward_std": 0.018957635387778282, "rewards/accuracy_reward": 0.9303125143051147, "rewards/format_reward": 1.0, "step": 754 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.96875, "epoch": 0.010413649466903905, "grad_norm": 4.741382801510338, "kl": 0.06298828125, "learning_rate": 9.997324487847603e-07, "loss": 0.0025, "reward": 2.081218719482422, "reward_std": 0.023498455062508583, "rewards/accuracy_reward": 0.8812187314033508, "rewards/format_reward": 1.0, "step": 755 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.9375, "epoch": 0.010427442380105102, "grad_norm": 2.928107559033569, "kl": 0.07666015625, "learning_rate": 9.997317396338764e-07, "loss": 0.0031, "reward": 1.9674999713897705, "reward_std": 0.03542211651802063, "rewards/accuracy_reward": 0.7737500071525574, "rewards/format_reward": 1.0, "step": 756 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.59375, "epoch": 0.0104412352933063, "grad_norm": 3.3315302316357243, "kl": 0.05419921875, "learning_rate": 9.997310295446777e-07, "loss": 0.0022, "reward": 1.958031177520752, "reward_std": 0.015777796506881714, "rewards/accuracy_reward": 0.7642812728881836, "rewards/format_reward": 1.0, "step": 757 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.78125, "epoch": 0.010455028206507497, "grad_norm": 18.474726687302905, "kl": 0.068359375, "learning_rate": 9.997303185171653e-07, "loss": 0.0027, "reward": 1.986781120300293, "reward_std": 0.015298012644052505, "rewards/accuracy_reward": 0.7867811918258667, "rewards/format_reward": 1.0, "step": 758 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.53125, "epoch": 0.010468821119708694, "grad_norm": 2.1589338309340205, "kl": 0.060791015625, "learning_rate": 9.997296065513403e-07, "loss": 0.0024, "reward": 2.0800938606262207, "reward_std": 0.01615123450756073, "rewards/accuracy_reward": 0.8800936937332153, "rewards/format_reward": 1.0, "step": 759 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.28125, "epoch": 0.010482614032909891, "grad_norm": 3.384677172187868, "kl": 0.055419921875, "learning_rate": 9.997288936472043e-07, "loss": 0.0022, "reward": 2.0602502822875977, "reward_std": 0.03399854525923729, "rewards/accuracy_reward": 0.8665000200271606, "rewards/format_reward": 1.0, "step": 760 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 442.5, "epoch": 0.010496406946111089, "grad_norm": 2.8940173775264744, "kl": 0.0625, "learning_rate": 9.997281798047588e-07, "loss": 0.0025, "reward": 1.9320937395095825, "reward_std": 0.22109809517860413, "rewards/accuracy_reward": 0.7820937633514404, "rewards/format_reward": 0.96875, "step": 761 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.59375, "epoch": 0.010510199859312286, "grad_norm": 2.32633618140947, "kl": 0.05517578125, "learning_rate": 9.997274650240046e-07, "loss": 0.0022, "reward": 1.9946562051773071, "reward_std": 0.02311072126030922, "rewards/accuracy_reward": 0.7946562767028809, "rewards/format_reward": 1.0, "step": 762 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.0, "epoch": 0.010523992772513483, "grad_norm": 3.7825694959885685, "kl": 0.054931640625, "learning_rate": 9.997267493049438e-07, "loss": 0.0022, "reward": 1.994343638420105, "reward_std": 0.040323495864868164, "rewards/accuracy_reward": 0.8005937337875366, "rewards/format_reward": 1.0, "step": 763 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.96875, "epoch": 0.01053778568571468, "grad_norm": 3.688005427656586, "kl": 0.06005859375, "learning_rate": 9.99726032647577e-07, "loss": 0.0024, "reward": 2.0537500381469727, "reward_std": 0.01016139890998602, "rewards/accuracy_reward": 0.8537499904632568, "rewards/format_reward": 1.0, "step": 764 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.40625, "epoch": 0.010551578598915878, "grad_norm": 2.3420805541161553, "kl": 0.060791015625, "learning_rate": 9.99725315051906e-07, "loss": 0.0024, "reward": 2.053562641143799, "reward_std": 0.029797010123729706, "rewards/accuracy_reward": 0.8598124980926514, "rewards/format_reward": 1.0, "step": 765 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 440.71875, "epoch": 0.010565371512117075, "grad_norm": 2.7380140124259524, "kl": 0.05224609375, "learning_rate": 9.997245965179316e-07, "loss": 0.0021, "reward": 2.095156192779541, "reward_std": 0.032796021550893784, "rewards/accuracy_reward": 0.9014062881469727, "rewards/format_reward": 1.0, "step": 766 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 435.28125, "epoch": 0.010579164425318272, "grad_norm": 2.1085361831035407, "kl": 0.050048828125, "learning_rate": 9.997238770456562e-07, "loss": 0.002, "reward": 2.065093755722046, "reward_std": 0.010781139135360718, "rewards/accuracy_reward": 0.8650937080383301, "rewards/format_reward": 1.0, "step": 767 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.4375, "epoch": 0.01059295733851947, "grad_norm": 3.910167632589713, "kl": 0.052978515625, "learning_rate": 9.997231566350802e-07, "loss": 0.0021, "reward": 2.0673437118530273, "reward_std": 0.01724417693912983, "rewards/accuracy_reward": 0.8673437237739563, "rewards/format_reward": 1.0, "step": 768 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 434.3125, "epoch": 0.010606750251720667, "grad_norm": 2.3933112136668817, "kl": 0.054931640625, "learning_rate": 9.997224352862051e-07, "loss": 0.0022, "reward": 2.1377811431884766, "reward_std": 0.02724858559668064, "rewards/accuracy_reward": 0.9440312385559082, "rewards/format_reward": 1.0, "step": 769 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.75, "epoch": 0.010620543164921864, "grad_norm": 1.7710301641581008, "kl": 0.05859375, "learning_rate": 9.997217129990325e-07, "loss": 0.0023, "reward": 2.1596250534057617, "reward_std": 0.030992865562438965, "rewards/accuracy_reward": 0.9596250057220459, "rewards/format_reward": 1.0, "step": 770 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 432.25, "epoch": 0.01063433607812306, "grad_norm": 2.2610593796980893, "kl": 0.053955078125, "learning_rate": 9.997209897735638e-07, "loss": 0.0022, "reward": 2.0071563720703125, "reward_std": 0.028937727212905884, "rewards/accuracy_reward": 0.813406229019165, "rewards/format_reward": 1.0, "step": 771 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.59375, "epoch": 0.010648128991324257, "grad_norm": 4.389775969791669, "kl": 0.060791015625, "learning_rate": 9.997202656098e-07, "loss": 0.0024, "reward": 2.0327188968658447, "reward_std": 0.016570717096328735, "rewards/accuracy_reward": 0.8327187299728394, "rewards/format_reward": 1.0, "step": 772 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.90625, "epoch": 0.010661921904525454, "grad_norm": 1.8096586543252338, "kl": 0.06103515625, "learning_rate": 9.99719540507743e-07, "loss": 0.0024, "reward": 2.1250314712524414, "reward_std": 0.013041210360825062, "rewards/accuracy_reward": 0.9250311851501465, "rewards/format_reward": 1.0, "step": 773 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 438.75, "epoch": 0.010675714817726651, "grad_norm": 2.3171812940722973, "kl": 0.05712890625, "learning_rate": 9.997188144673936e-07, "loss": 0.0023, "reward": 2.1441874504089355, "reward_std": 0.01430035475641489, "rewards/accuracy_reward": 0.9441875219345093, "rewards/format_reward": 1.0, "step": 774 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 437.21875, "epoch": 0.010689507730927849, "grad_norm": 1.8798741361342544, "kl": 0.0556640625, "learning_rate": 9.997180874887535e-07, "loss": 0.0022, "reward": 2.028437614440918, "reward_std": 0.03181103616952896, "rewards/accuracy_reward": 0.8346875309944153, "rewards/format_reward": 1.0, "step": 775 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 434.09375, "epoch": 0.010703300644129046, "grad_norm": 2.7731798347383156, "kl": 0.057373046875, "learning_rate": 9.99717359571824e-07, "loss": 0.0023, "reward": 2.0172500610351562, "reward_std": 0.017019499093294144, "rewards/accuracy_reward": 0.8172500133514404, "rewards/format_reward": 1.0, "step": 776 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.125, "epoch": 0.010717093557330243, "grad_norm": 1.9326760231771256, "kl": 0.05908203125, "learning_rate": 9.997166307166065e-07, "loss": 0.0024, "reward": 2.0292813777923584, "reward_std": 0.012397769838571548, "rewards/accuracy_reward": 0.8292812705039978, "rewards/format_reward": 1.0, "step": 777 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.5, "epoch": 0.01073088647053144, "grad_norm": 8.312144053152865, "kl": 0.064453125, "learning_rate": 9.997159009231022e-07, "loss": 0.0026, "reward": 2.036656379699707, "reward_std": 0.020144039765000343, "rewards/accuracy_reward": 0.8366562128067017, "rewards/format_reward": 1.0, "step": 778 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.40625, "epoch": 0.010744679383732638, "grad_norm": 2.4564014575073263, "kl": 0.057373046875, "learning_rate": 9.997151701913127e-07, "loss": 0.0023, "reward": 2.1264376640319824, "reward_std": 0.0199674554169178, "rewards/accuracy_reward": 0.926437497138977, "rewards/format_reward": 1.0, "step": 779 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.125, "epoch": 0.010758472296933835, "grad_norm": 1.9556915207208514, "kl": 0.07080078125, "learning_rate": 9.99714438521239e-07, "loss": 0.0028, "reward": 2.0612499713897705, "reward_std": 0.029282229021191597, "rewards/accuracy_reward": 0.8737499713897705, "rewards/format_reward": 1.0, "step": 780 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.28125, "epoch": 0.010772265210135032, "grad_norm": 3.3048727893573897, "kl": 0.0576171875, "learning_rate": 9.997137059128828e-07, "loss": 0.0023, "reward": 2.105250120162964, "reward_std": 0.02463524043560028, "rewards/accuracy_reward": 0.9052500128746033, "rewards/format_reward": 1.0, "step": 781 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.78125, "epoch": 0.01078605812333623, "grad_norm": 2.3225690396165812, "kl": 0.060791015625, "learning_rate": 9.997129723662456e-07, "loss": 0.0024, "reward": 2.0871875286102295, "reward_std": 0.007724268361926079, "rewards/accuracy_reward": 0.8871874809265137, "rewards/format_reward": 1.0, "step": 782 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.71875, "epoch": 0.010799851036537427, "grad_norm": 2.1293625413705377, "kl": 0.056640625, "learning_rate": 9.997122378813284e-07, "loss": 0.0023, "reward": 2.0953125953674316, "reward_std": 0.008481588214635849, "rewards/accuracy_reward": 0.8953125476837158, "rewards/format_reward": 1.0, "step": 783 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.0, "epoch": 0.010813643949738624, "grad_norm": 2.6298404697775455, "kl": 0.05908203125, "learning_rate": 9.997115024581327e-07, "loss": 0.0024, "reward": 2.0637500286102295, "reward_std": 0.023685740306973457, "rewards/accuracy_reward": 0.8637499809265137, "rewards/format_reward": 1.0, "step": 784 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.625, "epoch": 0.010827436862939821, "grad_norm": 2.928644321807091, "kl": 0.06787109375, "learning_rate": 9.997107660966605e-07, "loss": 0.0027, "reward": 2.034656286239624, "reward_std": 0.03632409870624542, "rewards/accuracy_reward": 0.8409062623977661, "rewards/format_reward": 1.0, "step": 785 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.71875, "epoch": 0.010841229776141019, "grad_norm": 2.513664047183415, "kl": 0.056396484375, "learning_rate": 9.99710028796912e-07, "loss": 0.0023, "reward": 2.0606563091278076, "reward_std": 0.01943240500986576, "rewards/accuracy_reward": 0.8606562614440918, "rewards/format_reward": 1.0, "step": 786 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.09375, "epoch": 0.010855022689342216, "grad_norm": 3.0382393213583283, "kl": 0.05712890625, "learning_rate": 9.997092905588895e-07, "loss": 0.0023, "reward": 2.1013126373291016, "reward_std": 0.027132606133818626, "rewards/accuracy_reward": 0.901312530040741, "rewards/format_reward": 1.0, "step": 787 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.0625, "epoch": 0.010868815602543413, "grad_norm": 2.0098023395296742, "kl": 0.059814453125, "learning_rate": 9.99708551382594e-07, "loss": 0.0024, "reward": 2.1075313091278076, "reward_std": 0.00969760399311781, "rewards/accuracy_reward": 0.9075312614440918, "rewards/format_reward": 1.0, "step": 788 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.84375, "epoch": 0.01088260851574461, "grad_norm": 2.468200383692465, "kl": 0.06298828125, "learning_rate": 9.997078112680268e-07, "loss": 0.0025, "reward": 2.1681876182556152, "reward_std": 0.028210487216711044, "rewards/accuracy_reward": 0.9744374752044678, "rewards/format_reward": 1.0, "step": 789 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.875, "epoch": 0.010896401428945808, "grad_norm": 7.955372158556385, "kl": 0.057861328125, "learning_rate": 9.997070702151897e-07, "loss": 0.0023, "reward": 2.0388126373291016, "reward_std": 0.039768535643815994, "rewards/accuracy_reward": 0.8513124585151672, "rewards/format_reward": 1.0, "step": 790 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.34375, "epoch": 0.010910194342147005, "grad_norm": 9.622073724562869, "kl": 0.06640625, "learning_rate": 9.997063282240839e-07, "loss": 0.0027, "reward": 1.9733126163482666, "reward_std": 0.0383220799267292, "rewards/accuracy_reward": 0.7858125567436218, "rewards/format_reward": 1.0, "step": 791 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.75, "epoch": 0.010923987255348202, "grad_norm": 3.022145262903997, "kl": 0.04931640625, "learning_rate": 9.997055852947107e-07, "loss": 0.002, "reward": 2.092062473297119, "reward_std": 0.015360655263066292, "rewards/accuracy_reward": 0.8920624852180481, "rewards/format_reward": 1.0, "step": 792 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.5, "epoch": 0.0109377801685494, "grad_norm": 2.6928963768556, "kl": 0.06005859375, "learning_rate": 9.997048414270715e-07, "loss": 0.0024, "reward": 2.125593662261963, "reward_std": 0.0725117027759552, "rewards/accuracy_reward": 0.9443436861038208, "rewards/format_reward": 1.0, "step": 793 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.71875, "epoch": 0.010951573081750597, "grad_norm": 3.8755091129550374, "kl": 0.0654296875, "learning_rate": 9.997040966211678e-07, "loss": 0.0026, "reward": 2.0360002517700195, "reward_std": 0.01876935362815857, "rewards/accuracy_reward": 0.8359999060630798, "rewards/format_reward": 1.0, "step": 794 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.28125, "epoch": 0.010965365994951794, "grad_norm": 3.137693646907833, "kl": 0.062255859375, "learning_rate": 9.99703350877001e-07, "loss": 0.0025, "reward": 2.1158437728881836, "reward_std": 0.015870871022343636, "rewards/accuracy_reward": 0.9158438444137573, "rewards/format_reward": 1.0, "step": 795 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.5625, "epoch": 0.010979158908152991, "grad_norm": 2.965191977296174, "kl": 0.053466796875, "learning_rate": 9.997026041945722e-07, "loss": 0.0021, "reward": 2.046750068664551, "reward_std": 0.03168531507253647, "rewards/accuracy_reward": 0.846750020980835, "rewards/format_reward": 1.0, "step": 796 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.65625, "epoch": 0.010992951821354189, "grad_norm": 4.391176315479191, "kl": 0.0625, "learning_rate": 9.997018565738833e-07, "loss": 0.0025, "reward": 2.0003747940063477, "reward_std": 0.015776025131344795, "rewards/accuracy_reward": 0.8003749847412109, "rewards/format_reward": 1.0, "step": 797 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.011006744734555386, "grad_norm": 3.0627841943932474, "kl": 0.06494140625, "learning_rate": 9.997011080149354e-07, "loss": 0.0026, "reward": 2.069531202316284, "reward_std": 0.013713781721889973, "rewards/accuracy_reward": 0.8695312738418579, "rewards/format_reward": 1.0, "step": 798 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.5, "epoch": 0.011020537647756583, "grad_norm": 3.137509558475994, "kl": 0.06103515625, "learning_rate": 9.9970035851773e-07, "loss": 0.0024, "reward": 1.9291250705718994, "reward_std": 0.01993734948337078, "rewards/accuracy_reward": 0.7291249632835388, "rewards/format_reward": 1.0, "step": 799 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.625, "epoch": 0.01103433056095778, "grad_norm": 2.4419319351111484, "kl": 0.0595703125, "learning_rate": 9.996996080822683e-07, "loss": 0.0024, "reward": 1.959031105041504, "reward_std": 0.03270441293716431, "rewards/accuracy_reward": 0.7652812600135803, "rewards/format_reward": 1.0, "step": 800 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.5, "epoch": 0.011048123474158978, "grad_norm": 5.040142494393657, "kl": 0.06884765625, "learning_rate": 9.99698856708552e-07, "loss": 0.0028, "reward": 2.0515313148498535, "reward_std": 0.04230183735489845, "rewards/accuracy_reward": 0.8640312552452087, "rewards/format_reward": 1.0, "step": 801 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.03125, "epoch": 0.011061916387360175, "grad_norm": 3.5266963973312677, "kl": 0.0625, "learning_rate": 9.996981043965824e-07, "loss": 0.0025, "reward": 1.939250111579895, "reward_std": 0.015386686660349369, "rewards/accuracy_reward": 0.7392499446868896, "rewards/format_reward": 1.0, "step": 802 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.21875, "epoch": 0.011075709300561372, "grad_norm": 2.166008242138284, "kl": 0.061767578125, "learning_rate": 9.996973511463607e-07, "loss": 0.0025, "reward": 2.100281238555908, "reward_std": 0.019825290888547897, "rewards/accuracy_reward": 0.9002813100814819, "rewards/format_reward": 1.0, "step": 803 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 415.40625, "epoch": 0.01108950221376257, "grad_norm": 3.7846259799298223, "kl": 0.056396484375, "learning_rate": 9.996965969578887e-07, "loss": 0.0023, "reward": 2.0434062480926514, "reward_std": 0.025049176067113876, "rewards/accuracy_reward": 0.8434062004089355, "rewards/format_reward": 1.0, "step": 804 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.71875, "epoch": 0.011103295126963767, "grad_norm": 2.9322276210941207, "kl": 0.06640625, "learning_rate": 9.996958418311676e-07, "loss": 0.0026, "reward": 2.0053439140319824, "reward_std": 0.025423455983400345, "rewards/accuracy_reward": 0.805343747138977, "rewards/format_reward": 1.0, "step": 805 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 0.011117088040164964, "grad_norm": 4.336962125472765, "kl": 0.06103515625, "learning_rate": 9.996950857661987e-07, "loss": 0.0024, "reward": 1.9846094846725464, "reward_std": 0.023516319692134857, "rewards/accuracy_reward": 0.784609317779541, "rewards/format_reward": 1.0, "step": 806 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.21875, "epoch": 0.011130880953366161, "grad_norm": 3.1024691624259213, "kl": 0.0634765625, "learning_rate": 9.996943287629837e-07, "loss": 0.0025, "reward": 1.9855624437332153, "reward_std": 0.01902967132627964, "rewards/accuracy_reward": 0.7855624556541443, "rewards/format_reward": 1.0, "step": 807 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.011144673866567357, "grad_norm": 1.9870172080452198, "kl": 0.0615234375, "learning_rate": 9.99693570821524e-07, "loss": 0.0025, "reward": 2.084656238555908, "reward_std": 0.014451270923018456, "rewards/accuracy_reward": 0.8846561908721924, "rewards/format_reward": 1.0, "step": 808 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.53125, "epoch": 0.011158466779768554, "grad_norm": 2.8292219474356006, "kl": 0.056640625, "learning_rate": 9.996928119418208e-07, "loss": 0.0023, "reward": 2.094749927520752, "reward_std": 0.017461363226175308, "rewards/accuracy_reward": 0.8947499394416809, "rewards/format_reward": 1.0, "step": 809 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.625, "epoch": 0.011172259692969751, "grad_norm": 2.007332182198467, "kl": 0.0625, "learning_rate": 9.996920521238757e-07, "loss": 0.0025, "reward": 2.117062568664551, "reward_std": 0.011572951450943947, "rewards/accuracy_reward": 0.917062520980835, "rewards/format_reward": 1.0, "step": 810 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.46875, "epoch": 0.011186052606170949, "grad_norm": 2.7758068869060026, "kl": 0.06005859375, "learning_rate": 9.9969129136769e-07, "loss": 0.0024, "reward": 1.9802188873291016, "reward_std": 0.01374787651002407, "rewards/accuracy_reward": 0.780218780040741, "rewards/format_reward": 1.0, "step": 811 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 433.78125, "epoch": 0.011199845519372146, "grad_norm": 2.6216665583719974, "kl": 0.06494140625, "learning_rate": 9.996905296732652e-07, "loss": 0.0026, "reward": 2.0626251697540283, "reward_std": 0.04599296674132347, "rewards/accuracy_reward": 0.8876250386238098, "rewards/format_reward": 1.0, "step": 812 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.0625, "epoch": 0.011213638432573343, "grad_norm": 4.126874798021307, "kl": 0.072265625, "learning_rate": 9.996897670406025e-07, "loss": 0.0029, "reward": 2.089968681335449, "reward_std": 0.02616027556359768, "rewards/accuracy_reward": 0.889968752861023, "rewards/format_reward": 1.0, "step": 813 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.40625, "epoch": 0.01122743134577454, "grad_norm": 4.051032796367354, "kl": 0.0654296875, "learning_rate": 9.996890034697039e-07, "loss": 0.0026, "reward": 1.9441564083099365, "reward_std": 0.024097414687275887, "rewards/accuracy_reward": 0.7441562414169312, "rewards/format_reward": 1.0, "step": 814 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.5625, "epoch": 0.011241224258975738, "grad_norm": 27.221526178339026, "kl": 0.0595703125, "learning_rate": 9.996882389605703e-07, "loss": 0.0024, "reward": 1.9472500085830688, "reward_std": 0.02749377116560936, "rewards/accuracy_reward": 0.7534999847412109, "rewards/format_reward": 1.0, "step": 815 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.03125, "epoch": 0.011255017172176935, "grad_norm": 2.322686938851381, "kl": 0.06298828125, "learning_rate": 9.996874735132034e-07, "loss": 0.0025, "reward": 1.9621562957763672, "reward_std": 0.01175643876194954, "rewards/accuracy_reward": 0.7621562480926514, "rewards/format_reward": 1.0, "step": 816 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.8125, "epoch": 0.011268810085378132, "grad_norm": 2.611679718853567, "kl": 0.05810546875, "learning_rate": 9.996867071276046e-07, "loss": 0.0023, "reward": 1.9129687547683716, "reward_std": 0.01250208169221878, "rewards/accuracy_reward": 0.7129687070846558, "rewards/format_reward": 1.0, "step": 817 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.75, "epoch": 0.01128260299857933, "grad_norm": 2.867111393698513, "kl": 0.05908203125, "learning_rate": 9.99685939803775e-07, "loss": 0.0024, "reward": 1.986781120300293, "reward_std": 0.013479959219694138, "rewards/accuracy_reward": 0.7867813110351562, "rewards/format_reward": 1.0, "step": 818 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.25, "epoch": 0.011296395911780527, "grad_norm": 2.394077205499161, "kl": 0.0625, "learning_rate": 9.996851715417168e-07, "loss": 0.0025, "reward": 2.090437650680542, "reward_std": 0.02494914084672928, "rewards/accuracy_reward": 0.8966874480247498, "rewards/format_reward": 1.0, "step": 819 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.34375, "epoch": 0.011310188824981724, "grad_norm": 2.5090999757553365, "kl": 0.05810546875, "learning_rate": 9.996844023414306e-07, "loss": 0.0023, "reward": 2.0095937252044678, "reward_std": 0.03689955547451973, "rewards/accuracy_reward": 0.8158438205718994, "rewards/format_reward": 1.0, "step": 820 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.96875, "epoch": 0.011323981738182921, "grad_norm": 2.3564867853669687, "kl": 0.060791015625, "learning_rate": 9.996836322029182e-07, "loss": 0.0024, "reward": 1.9346563816070557, "reward_std": 0.040891990065574646, "rewards/accuracy_reward": 0.7471562623977661, "rewards/format_reward": 1.0, "step": 821 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.0, "epoch": 0.011337774651384119, "grad_norm": 4.562292658819249, "kl": 0.059814453125, "learning_rate": 9.996828611261814e-07, "loss": 0.0024, "reward": 2.146843910217285, "reward_std": 0.03361503407359123, "rewards/accuracy_reward": 0.9530937075614929, "rewards/format_reward": 1.0, "step": 822 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.625, "epoch": 0.011351567564585316, "grad_norm": 6.197307201792726, "kl": 0.0615234375, "learning_rate": 9.996820891112212e-07, "loss": 0.0025, "reward": 2.099874973297119, "reward_std": 0.009361280128359795, "rewards/accuracy_reward": 0.8998750448226929, "rewards/format_reward": 1.0, "step": 823 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.4375, "epoch": 0.011365360477786513, "grad_norm": 2.558265503139382, "kl": 0.06640625, "learning_rate": 9.996813161580391e-07, "loss": 0.0027, "reward": 2.0712499618530273, "reward_std": 0.013410200364887714, "rewards/accuracy_reward": 0.8712499737739563, "rewards/format_reward": 1.0, "step": 824 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.0, "epoch": 0.01137915339098771, "grad_norm": 3.465438261958482, "kl": 0.06787109375, "learning_rate": 9.996805422666365e-07, "loss": 0.0027, "reward": 1.999906301498413, "reward_std": 0.03800682723522186, "rewards/accuracy_reward": 0.8061562776565552, "rewards/format_reward": 1.0, "step": 825 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.4375, "epoch": 0.011392946304188908, "grad_norm": 2.3990857135518113, "kl": 0.06640625, "learning_rate": 9.996797674370151e-07, "loss": 0.0027, "reward": 1.9601249694824219, "reward_std": 0.024164855480194092, "rewards/accuracy_reward": 0.7663750648498535, "rewards/format_reward": 1.0, "step": 826 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.5625, "epoch": 0.011406739217390105, "grad_norm": 2.6271078590462245, "kl": 0.0693359375, "learning_rate": 9.996789916691764e-07, "loss": 0.0028, "reward": 2.061406373977661, "reward_std": 0.01364709623157978, "rewards/accuracy_reward": 0.8614062070846558, "rewards/format_reward": 1.0, "step": 827 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.4375, "epoch": 0.011420532130591302, "grad_norm": 3.3369275583686573, "kl": 0.0625, "learning_rate": 9.996782149631215e-07, "loss": 0.0025, "reward": 1.9078125953674316, "reward_std": 0.05367925018072128, "rewards/accuracy_reward": 0.7203124761581421, "rewards/format_reward": 1.0, "step": 828 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.625, "epoch": 0.0114343250437925, "grad_norm": 2.778753896514087, "kl": 0.06884765625, "learning_rate": 9.996774373188521e-07, "loss": 0.0028, "reward": 2.059906482696533, "reward_std": 0.025252975523471832, "rewards/accuracy_reward": 0.8599063158035278, "rewards/format_reward": 1.0, "step": 829 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.8125, "epoch": 0.011448117956993697, "grad_norm": 2.7857368868609194, "kl": 0.06298828125, "learning_rate": 9.996766587363695e-07, "loss": 0.0025, "reward": 2.059281349182129, "reward_std": 0.01778646931052208, "rewards/accuracy_reward": 0.8592813014984131, "rewards/format_reward": 1.0, "step": 830 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.0625, "epoch": 0.011461910870194894, "grad_norm": 8.051504435818483, "kl": 0.0615234375, "learning_rate": 9.996758792156753e-07, "loss": 0.0025, "reward": 2.1093125343322754, "reward_std": 0.008717061020433903, "rewards/accuracy_reward": 0.9093124866485596, "rewards/format_reward": 1.0, "step": 831 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.46875, "epoch": 0.011475703783396091, "grad_norm": 1.604738557307816, "kl": 0.06396484375, "learning_rate": 9.99675098756771e-07, "loss": 0.0026, "reward": 2.133124828338623, "reward_std": 0.010685278102755547, "rewards/accuracy_reward": 0.9331250190734863, "rewards/format_reward": 1.0, "step": 832 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.65625, "epoch": 0.011489496696597289, "grad_norm": 2.7699740294123867, "kl": 0.05859375, "learning_rate": 9.99674317359658e-07, "loss": 0.0023, "reward": 2.027468681335449, "reward_std": 0.032107800245285034, "rewards/accuracy_reward": 0.8274686932563782, "rewards/format_reward": 1.0, "step": 833 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.011503289609798486, "grad_norm": 1.3438355317644886, "kl": 0.0556640625, "learning_rate": 9.996735350243378e-07, "loss": 0.0022, "reward": 1.981156349182129, "reward_std": 0.0073362309485673904, "rewards/accuracy_reward": 0.7811562418937683, "rewards/format_reward": 1.0, "step": 834 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.0625, "epoch": 0.011517082522999683, "grad_norm": 3.5618758890444435, "kl": 0.06103515625, "learning_rate": 9.996727517508116e-07, "loss": 0.0024, "reward": 2.1022188663482666, "reward_std": 0.03714028745889664, "rewards/accuracy_reward": 0.9022186994552612, "rewards/format_reward": 1.0, "step": 835 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.375, "epoch": 0.01153087543620088, "grad_norm": 8.264618607993818, "kl": 0.057373046875, "learning_rate": 9.996719675390814e-07, "loss": 0.0023, "reward": 1.9329688549041748, "reward_std": 0.013602283783257008, "rewards/accuracy_reward": 0.7329687476158142, "rewards/format_reward": 1.0, "step": 836 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.46875, "epoch": 0.011544668349402078, "grad_norm": 2.1389286903577553, "kl": 0.058837890625, "learning_rate": 9.996711823891481e-07, "loss": 0.0024, "reward": 2.0406875610351562, "reward_std": 0.02286362834274769, "rewards/accuracy_reward": 0.8406875133514404, "rewards/format_reward": 1.0, "step": 837 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.0, "epoch": 0.011558461262603275, "grad_norm": 2.0762559763495485, "kl": 0.0654296875, "learning_rate": 9.996703963010136e-07, "loss": 0.0026, "reward": 2.048468828201294, "reward_std": 0.03429223597049713, "rewards/accuracy_reward": 0.8547187447547913, "rewards/format_reward": 1.0, "step": 838 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.5, "epoch": 0.011572254175804472, "grad_norm": 2.382663184085568, "kl": 0.06298828125, "learning_rate": 9.996696092746794e-07, "loss": 0.0025, "reward": 2.084249973297119, "reward_std": 0.028589671477675438, "rewards/accuracy_reward": 0.8842499852180481, "rewards/format_reward": 1.0, "step": 839 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.90625, "epoch": 0.01158604708900567, "grad_norm": 2.1591170321915554, "kl": 0.060791015625, "learning_rate": 9.996688213101464e-07, "loss": 0.0024, "reward": 2.12709379196167, "reward_std": 0.010218516923487186, "rewards/accuracy_reward": 0.9270937442779541, "rewards/format_reward": 1.0, "step": 840 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.1875, "epoch": 0.011599840002206867, "grad_norm": 3.377154698949668, "kl": 0.0537109375, "learning_rate": 9.996680324074167e-07, "loss": 0.0021, "reward": 2.074718952178955, "reward_std": 0.046328362077474594, "rewards/accuracy_reward": 0.8809687495231628, "rewards/format_reward": 1.0, "step": 841 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.625, "epoch": 0.011613632915408064, "grad_norm": 4.962266065521929, "kl": 0.068359375, "learning_rate": 9.996672425664915e-07, "loss": 0.0027, "reward": 2.075937509536743, "reward_std": 0.027525397017598152, "rewards/accuracy_reward": 0.8821874856948853, "rewards/format_reward": 1.0, "step": 842 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.59375, "epoch": 0.011627425828609261, "grad_norm": 2.8390598240275886, "kl": 0.06201171875, "learning_rate": 9.996664517873725e-07, "loss": 0.0025, "reward": 2.1173439025878906, "reward_std": 0.028220897540450096, "rewards/accuracy_reward": 0.9173436760902405, "rewards/format_reward": 1.0, "step": 843 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.5, "epoch": 0.011641218741810459, "grad_norm": 1.6394258138671067, "kl": 0.060302734375, "learning_rate": 9.996656600700607e-07, "loss": 0.0024, "reward": 2.0991873741149902, "reward_std": 0.007098478265106678, "rewards/accuracy_reward": 0.8991875648498535, "rewards/format_reward": 1.0, "step": 844 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.40625, "epoch": 0.011655011655011656, "grad_norm": 2.733876167820925, "kl": 0.06640625, "learning_rate": 9.996648674145583e-07, "loss": 0.0026, "reward": 2.0859375, "reward_std": 0.034203723073005676, "rewards/accuracy_reward": 0.8859374523162842, "rewards/format_reward": 1.0, "step": 845 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.65625, "epoch": 0.011668804568212851, "grad_norm": 2.0153972049353643, "kl": 0.06201171875, "learning_rate": 9.996640738208661e-07, "loss": 0.0025, "reward": 2.0403125286102295, "reward_std": 0.02131342887878418, "rewards/accuracy_reward": 0.8465625047683716, "rewards/format_reward": 1.0, "step": 846 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.25, "epoch": 0.011682597481414049, "grad_norm": 2.581256825259971, "kl": 0.05810546875, "learning_rate": 9.99663279288986e-07, "loss": 0.0023, "reward": 2.120093822479248, "reward_std": 0.014586973935365677, "rewards/accuracy_reward": 0.9200937747955322, "rewards/format_reward": 1.0, "step": 847 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 381.375, "epoch": 0.011696390394615246, "grad_norm": 3.506499437442312, "kl": 0.06982421875, "learning_rate": 9.996624838189196e-07, "loss": 0.0028, "reward": 2.129031181335449, "reward_std": 0.07114508748054504, "rewards/accuracy_reward": 0.9477812051773071, "rewards/format_reward": 1.0, "step": 848 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.09375, "epoch": 0.011710183307816443, "grad_norm": 1.6740805467648918, "kl": 0.05908203125, "learning_rate": 9.99661687410668e-07, "loss": 0.0024, "reward": 2.091562509536743, "reward_std": 0.02857976034283638, "rewards/accuracy_reward": 0.9040625095367432, "rewards/format_reward": 1.0, "step": 849 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.28125, "epoch": 0.01172397622101764, "grad_norm": 1.9296226836547883, "kl": 0.06884765625, "learning_rate": 9.996608900642328e-07, "loss": 0.0028, "reward": 1.8899062871932983, "reward_std": 0.03519243001937866, "rewards/accuracy_reward": 0.6961562633514404, "rewards/format_reward": 1.0, "step": 850 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 381.09375, "epoch": 0.011737769134218838, "grad_norm": 3.350627610732348, "kl": 0.06103515625, "learning_rate": 9.996600917796156e-07, "loss": 0.0024, "reward": 2.073437452316284, "reward_std": 0.03993368148803711, "rewards/accuracy_reward": 0.8796875476837158, "rewards/format_reward": 1.0, "step": 851 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.65625, "epoch": 0.011751562047420035, "grad_norm": 2.6447759856591144, "kl": 0.05517578125, "learning_rate": 9.99659292556818e-07, "loss": 0.0022, "reward": 1.9443750381469727, "reward_std": 0.01881929114460945, "rewards/accuracy_reward": 0.7443750500679016, "rewards/format_reward": 1.0, "step": 852 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.011765354960621232, "grad_norm": 3.1281596440526633, "kl": 0.06591796875, "learning_rate": 9.996584923958413e-07, "loss": 0.0026, "reward": 2.0310001373291016, "reward_std": 0.03758327290415764, "rewards/accuracy_reward": 0.8372499346733093, "rewards/format_reward": 1.0, "step": 853 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.46875, "epoch": 0.01177914787382243, "grad_norm": 6.6734576809864965, "kl": 0.06201171875, "learning_rate": 9.99657691296687e-07, "loss": 0.0025, "reward": 2.076843738555908, "reward_std": 0.02001909539103508, "rewards/accuracy_reward": 0.8768438100814819, "rewards/format_reward": 1.0, "step": 854 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.40625, "epoch": 0.011792940787023627, "grad_norm": 6.191875399836288, "kl": 0.06689453125, "learning_rate": 9.996568892593568e-07, "loss": 0.0027, "reward": 2.0779685974121094, "reward_std": 0.013660861179232597, "rewards/accuracy_reward": 0.8779687881469727, "rewards/format_reward": 1.0, "step": 855 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.1875, "epoch": 0.011806733700224824, "grad_norm": 2.5483785326216037, "kl": 0.0615234375, "learning_rate": 9.996560862838518e-07, "loss": 0.0025, "reward": 2.1045312881469727, "reward_std": 0.0539567694067955, "rewards/accuracy_reward": 0.9170312881469727, "rewards/format_reward": 1.0, "step": 856 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.46875, "epoch": 0.011820526613426021, "grad_norm": 2.418062418205007, "kl": 0.068359375, "learning_rate": 9.99655282370174e-07, "loss": 0.0027, "reward": 1.9567968845367432, "reward_std": 0.034331951290369034, "rewards/accuracy_reward": 0.7630468606948853, "rewards/format_reward": 1.0, "step": 857 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.6875, "epoch": 0.011834319526627219, "grad_norm": 1.7809773108215838, "kl": 0.0634765625, "learning_rate": 9.99654477518325e-07, "loss": 0.0025, "reward": 1.9782187938690186, "reward_std": 0.009585888125002384, "rewards/accuracy_reward": 0.7782187461853027, "rewards/format_reward": 1.0, "step": 858 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.875, "epoch": 0.011848112439828416, "grad_norm": 0.11383412686078949, "kl": 0.06298828125, "learning_rate": 9.996536717283055e-07, "loss": 0.0025, "reward": 1.9325000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 0.7325000166893005, "rewards/format_reward": 1.0, "step": 859 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 445.21875, "epoch": 0.011861905353029613, "grad_norm": 2.972591786302114, "kl": 0.05322265625, "learning_rate": 9.99652865000118e-07, "loss": 0.0021, "reward": 2.0732500553131104, "reward_std": 0.0530330166220665, "rewards/accuracy_reward": 0.8919999599456787, "rewards/format_reward": 1.0, "step": 860 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.90625, "epoch": 0.01187569826623081, "grad_norm": 2.0954102476111642, "kl": 0.060546875, "learning_rate": 9.996520573337631e-07, "loss": 0.0024, "reward": 2.117062568664551, "reward_std": 0.012381434440612793, "rewards/accuracy_reward": 0.917062520980835, "rewards/format_reward": 1.0, "step": 861 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.78125, "epoch": 0.011889491179432008, "grad_norm": 2.9585560613407336, "kl": 0.06689453125, "learning_rate": 9.99651248729243e-07, "loss": 0.0027, "reward": 2.0201563835144043, "reward_std": 0.03696242719888687, "rewards/accuracy_reward": 0.8264062404632568, "rewards/format_reward": 1.0, "step": 862 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 431.3125, "epoch": 0.011903284092633205, "grad_norm": 2.0811959755561227, "kl": 0.0625, "learning_rate": 9.99650439186559e-07, "loss": 0.0025, "reward": 2.0782814025878906, "reward_std": 0.03882887214422226, "rewards/accuracy_reward": 0.8845313191413879, "rewards/format_reward": 1.0, "step": 863 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.1875, "epoch": 0.011917077005834402, "grad_norm": 1.9242990768048855, "kl": 0.0732421875, "learning_rate": 9.996496287057127e-07, "loss": 0.0029, "reward": 2.073765754699707, "reward_std": 0.012184280902147293, "rewards/accuracy_reward": 0.8737655878067017, "rewards/format_reward": 1.0, "step": 864 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.59375, "epoch": 0.0119308699190356, "grad_norm": 6.091985250089945, "kl": 0.0615234375, "learning_rate": 9.99648817286705e-07, "loss": 0.0025, "reward": 2.0103437900543213, "reward_std": 0.011906002648174763, "rewards/accuracy_reward": 0.8103437423706055, "rewards/format_reward": 1.0, "step": 865 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.625, "epoch": 0.011944662832236797, "grad_norm": 1.981395523690396, "kl": 0.0615234375, "learning_rate": 9.996480049295384e-07, "loss": 0.0025, "reward": 2.0708751678466797, "reward_std": 0.014063848182559013, "rewards/accuracy_reward": 0.8708749413490295, "rewards/format_reward": 1.0, "step": 866 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.3125, "epoch": 0.011958455745437994, "grad_norm": 1.7563604014664984, "kl": 0.05712890625, "learning_rate": 9.99647191634214e-07, "loss": 0.0023, "reward": 2.0634374618530273, "reward_std": 0.012944048270583153, "rewards/accuracy_reward": 0.8634375333786011, "rewards/format_reward": 1.0, "step": 867 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 435.5625, "epoch": 0.011972248658639191, "grad_norm": 2.4674253728508218, "kl": 0.057861328125, "learning_rate": 9.996463774007331e-07, "loss": 0.0023, "reward": 2.109468936920166, "reward_std": 0.027734071016311646, "rewards/accuracy_reward": 0.915718674659729, "rewards/format_reward": 1.0, "step": 868 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.96875, "epoch": 0.011986041571840389, "grad_norm": 11.56900616999347, "kl": 0.06298828125, "learning_rate": 9.996455622290975e-07, "loss": 0.0025, "reward": 1.950562596321106, "reward_std": 0.007412286940962076, "rewards/accuracy_reward": 0.7505625486373901, "rewards/format_reward": 1.0, "step": 869 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.875, "epoch": 0.011999834485041586, "grad_norm": 2.2332283400516744, "kl": 0.05712890625, "learning_rate": 9.996447461193084e-07, "loss": 0.0023, "reward": 2.0629374980926514, "reward_std": 0.014496359042823315, "rewards/accuracy_reward": 0.8629374504089355, "rewards/format_reward": 1.0, "step": 870 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.84375, "epoch": 0.012013627398242783, "grad_norm": 2.1193121691806938, "kl": 0.0537109375, "learning_rate": 9.996439290713677e-07, "loss": 0.0021, "reward": 2.001906394958496, "reward_std": 0.03647825866937637, "rewards/accuracy_reward": 0.8081561923027039, "rewards/format_reward": 1.0, "step": 871 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.78125, "epoch": 0.01202742031144398, "grad_norm": 3.803337052059349, "kl": 0.06103515625, "learning_rate": 9.996431110852773e-07, "loss": 0.0024, "reward": 2.1410937309265137, "reward_std": 0.014491010457277298, "rewards/accuracy_reward": 0.9410936832427979, "rewards/format_reward": 1.0, "step": 872 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.59375, "epoch": 0.012041213224645178, "grad_norm": 4.082902419146474, "kl": 0.0576171875, "learning_rate": 9.996422921610377e-07, "loss": 0.0023, "reward": 2.0197501182556152, "reward_std": 0.016605820506811142, "rewards/accuracy_reward": 0.8197500705718994, "rewards/format_reward": 1.0, "step": 873 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.21875, "epoch": 0.012055006137846375, "grad_norm": 2.6139498597955195, "kl": 0.056884765625, "learning_rate": 9.996414722986513e-07, "loss": 0.0023, "reward": 2.0, "reward_std": 0.02214580774307251, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 874 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.1875, "epoch": 0.012068799051047572, "grad_norm": 7.07829282131875, "kl": 0.057373046875, "learning_rate": 9.99640651498119e-07, "loss": 0.0023, "reward": 2.0888748168945312, "reward_std": 0.01722707599401474, "rewards/accuracy_reward": 0.8888750076293945, "rewards/format_reward": 1.0, "step": 875 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.21875, "epoch": 0.01208259196424877, "grad_norm": 10.635125583071112, "kl": 0.064453125, "learning_rate": 9.996398297594428e-07, "loss": 0.0026, "reward": 2.069718837738037, "reward_std": 0.02198335900902748, "rewards/accuracy_reward": 0.8697187900543213, "rewards/format_reward": 1.0, "step": 876 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.9375, "epoch": 0.012096384877449967, "grad_norm": 2.6385402371431486, "kl": 0.0654296875, "learning_rate": 9.996390070826243e-07, "loss": 0.0026, "reward": 1.980375051498413, "reward_std": 0.010692941024899483, "rewards/accuracy_reward": 0.7803750038146973, "rewards/format_reward": 1.0, "step": 877 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 414.84375, "epoch": 0.012110177790651164, "grad_norm": 1.3571584832192531, "kl": 0.0654296875, "learning_rate": 9.996381834676647e-07, "loss": 0.0026, "reward": 1.8616876602172852, "reward_std": 0.08793481439352036, "rewards/accuracy_reward": 0.6929374933242798, "rewards/format_reward": 0.96875, "step": 878 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.875, "epoch": 0.012123970703852361, "grad_norm": 4.154217259597983, "kl": 0.0537109375, "learning_rate": 9.99637358914566e-07, "loss": 0.0021, "reward": 1.8893749713897705, "reward_std": 0.015821736305952072, "rewards/accuracy_reward": 0.6893750429153442, "rewards/format_reward": 1.0, "step": 879 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.46875, "epoch": 0.012137763617053559, "grad_norm": 2.6638918320464606, "kl": 0.06298828125, "learning_rate": 9.99636533423329e-07, "loss": 0.0025, "reward": 1.9846875667572021, "reward_std": 0.029697462916374207, "rewards/accuracy_reward": 0.7909375429153442, "rewards/format_reward": 1.0, "step": 880 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.25, "epoch": 0.012151556530254756, "grad_norm": 3.9387543821551096, "kl": 0.06201171875, "learning_rate": 9.99635706993956e-07, "loss": 0.0025, "reward": 2.0455312728881836, "reward_std": 0.027177492156624794, "rewards/accuracy_reward": 0.8517812490463257, "rewards/format_reward": 1.0, "step": 881 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.71875, "epoch": 0.012165349443455953, "grad_norm": 1.654394573300401, "kl": 0.06201171875, "learning_rate": 9.996348796264481e-07, "loss": 0.0025, "reward": 1.9733750820159912, "reward_std": 0.002965821884572506, "rewards/accuracy_reward": 0.7733750343322754, "rewards/format_reward": 1.0, "step": 882 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.125, "epoch": 0.012179142356657149, "grad_norm": 2.99262038300314, "kl": 0.064453125, "learning_rate": 9.996340513208073e-07, "loss": 0.0026, "reward": 2.025437593460083, "reward_std": 0.01931835152208805, "rewards/accuracy_reward": 0.8254374861717224, "rewards/format_reward": 1.0, "step": 883 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.34375, "epoch": 0.012192935269858346, "grad_norm": 1.4806723113233027, "kl": 0.058349609375, "learning_rate": 9.996332220770344e-07, "loss": 0.0023, "reward": 2.1595935821533203, "reward_std": 0.0035929977893829346, "rewards/accuracy_reward": 0.9595937728881836, "rewards/format_reward": 1.0, "step": 884 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 0.012206728183059543, "grad_norm": 3.947854212704794, "kl": 0.064453125, "learning_rate": 9.996323918951317e-07, "loss": 0.0026, "reward": 2.083718776702881, "reward_std": 0.028584294021129608, "rewards/accuracy_reward": 0.883718729019165, "rewards/format_reward": 1.0, "step": 885 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.125, "epoch": 0.01222052109626074, "grad_norm": 2.096519763186262, "kl": 0.055908203125, "learning_rate": 9.996315607751006e-07, "loss": 0.0022, "reward": 2.052896022796631, "reward_std": 0.03550507500767708, "rewards/accuracy_reward": 0.8653957843780518, "rewards/format_reward": 1.0, "step": 886 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.59375, "epoch": 0.012234314009461938, "grad_norm": 4.83163986635347, "kl": 0.061279296875, "learning_rate": 9.996307287169422e-07, "loss": 0.0025, "reward": 2.0966875553131104, "reward_std": 0.023463400080800056, "rewards/accuracy_reward": 0.8966875076293945, "rewards/format_reward": 1.0, "step": 887 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.25, "epoch": 0.012248106922663135, "grad_norm": 3.505408970866594, "kl": 0.0634765625, "learning_rate": 9.996298957206589e-07, "loss": 0.0025, "reward": 2.085625171661377, "reward_std": 0.009287193417549133, "rewards/accuracy_reward": 0.8856250643730164, "rewards/format_reward": 1.0, "step": 888 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.6875, "epoch": 0.012261899835864332, "grad_norm": 1.884075769266571, "kl": 0.06494140625, "learning_rate": 9.996290617862513e-07, "loss": 0.0026, "reward": 1.9230313301086426, "reward_std": 0.028416823595762253, "rewards/accuracy_reward": 0.7292811870574951, "rewards/format_reward": 1.0, "step": 889 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.09375, "epoch": 0.01227569274906553, "grad_norm": 2.6083843425710413, "kl": 0.061279296875, "learning_rate": 9.996282269137218e-07, "loss": 0.0025, "reward": 2.0739998817443848, "reward_std": 0.04287020117044449, "rewards/accuracy_reward": 0.8865000009536743, "rewards/format_reward": 1.0, "step": 890 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.75, "epoch": 0.012289485662266727, "grad_norm": 1.848640109992965, "kl": 0.08251953125, "learning_rate": 9.996273911030713e-07, "loss": 0.0033, "reward": 2.0134999752044678, "reward_std": 0.015919554978609085, "rewards/accuracy_reward": 0.8134999871253967, "rewards/format_reward": 1.0, "step": 891 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.03125, "epoch": 0.012303278575467924, "grad_norm": 1.0093428303797902, "kl": 0.06298828125, "learning_rate": 9.996265543543017e-07, "loss": 0.0025, "reward": 2.1414999961853027, "reward_std": 0.01834392361342907, "rewards/accuracy_reward": 0.9477500319480896, "rewards/format_reward": 1.0, "step": 892 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.71875, "epoch": 0.012317071488669121, "grad_norm": 4.051860570895975, "kl": 0.0693359375, "learning_rate": 9.996257166674148e-07, "loss": 0.0028, "reward": 2.0829687118530273, "reward_std": 0.03916406258940697, "rewards/accuracy_reward": 0.889218807220459, "rewards/format_reward": 1.0, "step": 893 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.012330864401870319, "grad_norm": 2.0765005631738265, "kl": 0.05126953125, "learning_rate": 9.996248780424117e-07, "loss": 0.0021, "reward": 2.066218852996826, "reward_std": 0.024836335331201553, "rewards/accuracy_reward": 0.8724687099456787, "rewards/format_reward": 1.0, "step": 894 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.3125, "epoch": 0.012344657315071516, "grad_norm": 2.7762864087935673, "kl": 0.060546875, "learning_rate": 9.996240384792943e-07, "loss": 0.0024, "reward": 2.015218734741211, "reward_std": 0.016221510246396065, "rewards/accuracy_reward": 0.8152186870574951, "rewards/format_reward": 1.0, "step": 895 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.012358450228272713, "grad_norm": 3.3850701091769158, "kl": 0.0546875, "learning_rate": 9.996231979780638e-07, "loss": 0.0022, "reward": 2.149437665939331, "reward_std": 0.03538715839385986, "rewards/accuracy_reward": 0.9494374990463257, "rewards/format_reward": 1.0, "step": 896 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.5625, "epoch": 0.01237224314147391, "grad_norm": 2.405693173402508, "kl": 0.060546875, "learning_rate": 9.996223565387222e-07, "loss": 0.0024, "reward": 2.0840935707092285, "reward_std": 0.02343052811920643, "rewards/accuracy_reward": 0.8840937614440918, "rewards/format_reward": 1.0, "step": 897 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.625, "epoch": 0.012386036054675108, "grad_norm": 2.416222684892916, "kl": 0.06689453125, "learning_rate": 9.99621514161271e-07, "loss": 0.0027, "reward": 2.0687499046325684, "reward_std": 0.01389754842966795, "rewards/accuracy_reward": 0.8687499761581421, "rewards/format_reward": 1.0, "step": 898 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.0625, "epoch": 0.012399828967876305, "grad_norm": 3.563209303421202, "kl": 0.048583984375, "learning_rate": 9.996206708457117e-07, "loss": 0.0019, "reward": 2.1583125591278076, "reward_std": 0.021028826013207436, "rewards/accuracy_reward": 0.958312451839447, "rewards/format_reward": 1.0, "step": 899 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.65625, "epoch": 0.012413621881077502, "grad_norm": 2.5939201074146974, "kl": 0.06640625, "learning_rate": 9.996198265920458e-07, "loss": 0.0027, "reward": 2.1188435554504395, "reward_std": 0.017672449350357056, "rewards/accuracy_reward": 0.9188437461853027, "rewards/format_reward": 1.0, "step": 900 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.5, "epoch": 0.0124274147942787, "grad_norm": 1.8443103195923343, "kl": 0.06201171875, "learning_rate": 9.996189814002752e-07, "loss": 0.0025, "reward": 1.930375099182129, "reward_std": 0.009614366106688976, "rewards/accuracy_reward": 0.7303749918937683, "rewards/format_reward": 1.0, "step": 901 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.03125, "epoch": 0.012441207707479897, "grad_norm": 3.4247984712235717, "kl": 0.0625, "learning_rate": 9.99618135270401e-07, "loss": 0.0025, "reward": 2.0033750534057617, "reward_std": 0.030553072690963745, "rewards/accuracy_reward": 0.8096249103546143, "rewards/format_reward": 1.0, "step": 902 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.5, "epoch": 0.012455000620681094, "grad_norm": 1.7692753261289533, "kl": 0.07080078125, "learning_rate": 9.996172882024252e-07, "loss": 0.0028, "reward": 2.14634370803833, "reward_std": 0.007679288275539875, "rewards/accuracy_reward": 0.946343719959259, "rewards/format_reward": 1.0, "step": 903 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.012468793533882291, "grad_norm": 7.304207242130528, "kl": 0.06396484375, "learning_rate": 9.99616440196349e-07, "loss": 0.0025, "reward": 1.9845623970031738, "reward_std": 0.018476715311408043, "rewards/accuracy_reward": 0.7845624685287476, "rewards/format_reward": 1.0, "step": 904 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.75, "epoch": 0.012482586447083489, "grad_norm": 3.3505590735640407, "kl": 0.0595703125, "learning_rate": 9.996155912521747e-07, "loss": 0.0024, "reward": 2.034374952316284, "reward_std": 0.01827695034444332, "rewards/accuracy_reward": 0.8343750834465027, "rewards/format_reward": 1.0, "step": 905 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.6875, "epoch": 0.012496379360284686, "grad_norm": 2.71514943542268, "kl": 0.060302734375, "learning_rate": 9.99614741369903e-07, "loss": 0.0024, "reward": 2.1310312747955322, "reward_std": 0.011738812550902367, "rewards/accuracy_reward": 0.9310312271118164, "rewards/format_reward": 1.0, "step": 906 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.34375, "epoch": 0.012510172273485883, "grad_norm": 1.8794806001105664, "kl": 0.064453125, "learning_rate": 9.996138905495363e-07, "loss": 0.0026, "reward": 2.1114375591278076, "reward_std": 0.01660860888659954, "rewards/accuracy_reward": 0.9114375114440918, "rewards/format_reward": 1.0, "step": 907 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.15625, "epoch": 0.01252396518668708, "grad_norm": 3.447373044399844, "kl": 0.059814453125, "learning_rate": 9.996130387910754e-07, "loss": 0.0024, "reward": 2.13700008392334, "reward_std": 0.014783509075641632, "rewards/accuracy_reward": 0.9369999170303345, "rewards/format_reward": 1.0, "step": 908 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.375, "epoch": 0.012537758099888278, "grad_norm": 2.017053033461348, "kl": 0.052978515625, "learning_rate": 9.996121860945226e-07, "loss": 0.0021, "reward": 2.1075313091278076, "reward_std": 0.018499501049518585, "rewards/accuracy_reward": 0.9075311422348022, "rewards/format_reward": 1.0, "step": 909 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.012551551013089475, "grad_norm": 2.0980386920138114, "kl": 0.072265625, "learning_rate": 9.996113324598791e-07, "loss": 0.0029, "reward": 2.106375217437744, "reward_std": 0.00542815588414669, "rewards/accuracy_reward": 0.9063750505447388, "rewards/format_reward": 1.0, "step": 910 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.46875, "epoch": 0.012565343926290672, "grad_norm": 4.705855992698426, "kl": 0.056884765625, "learning_rate": 9.996104778871467e-07, "loss": 0.0023, "reward": 2.1244688034057617, "reward_std": 0.023335276171565056, "rewards/accuracy_reward": 0.9244687557220459, "rewards/format_reward": 1.0, "step": 911 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.01257913683949187, "grad_norm": 2.2043250916135095, "kl": 0.06201171875, "learning_rate": 9.996096223763269e-07, "loss": 0.0025, "reward": 2.062812566757202, "reward_std": 0.009765689261257648, "rewards/accuracy_reward": 0.8628124594688416, "rewards/format_reward": 1.0, "step": 912 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.21875, "epoch": 0.012592929752693067, "grad_norm": 1.788314218410025, "kl": 0.05712890625, "learning_rate": 9.996087659274213e-07, "loss": 0.0023, "reward": 2.172468662261963, "reward_std": 0.024640899151563644, "rewards/accuracy_reward": 0.9724687337875366, "rewards/format_reward": 1.0, "step": 913 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.0, "epoch": 0.012606722665894264, "grad_norm": 1.8442002636505592, "kl": 0.0556640625, "learning_rate": 9.996079085404316e-07, "loss": 0.0022, "reward": 2.065312623977661, "reward_std": 0.006495218258351088, "rewards/accuracy_reward": 0.8653125166893005, "rewards/format_reward": 1.0, "step": 914 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.53125, "epoch": 0.012620515579095461, "grad_norm": 2.4493533125284195, "kl": 0.060791015625, "learning_rate": 9.996070502153593e-07, "loss": 0.0024, "reward": 2.0648751258850098, "reward_std": 0.016605194658041, "rewards/accuracy_reward": 0.8648750185966492, "rewards/format_reward": 1.0, "step": 915 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.3125, "epoch": 0.012634308492296659, "grad_norm": 1.874395525977322, "kl": 0.060302734375, "learning_rate": 9.996061909522062e-07, "loss": 0.0024, "reward": 2.106156349182129, "reward_std": 0.01716047339141369, "rewards/accuracy_reward": 0.9061562418937683, "rewards/format_reward": 1.0, "step": 916 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.5625, "epoch": 0.012648101405497856, "grad_norm": 3.051945071161466, "kl": 0.06298828125, "learning_rate": 9.996053307509735e-07, "loss": 0.0025, "reward": 2.127500057220459, "reward_std": 0.02436526119709015, "rewards/accuracy_reward": 0.9337500333786011, "rewards/format_reward": 1.0, "step": 917 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 375.1875, "epoch": 0.012661894318699053, "grad_norm": 4.27526689440216, "kl": 0.068359375, "learning_rate": 9.996044696116634e-07, "loss": 0.0027, "reward": 2.120593786239624, "reward_std": 0.027109824120998383, "rewards/accuracy_reward": 0.9205936789512634, "rewards/format_reward": 1.0, "step": 918 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.09375, "epoch": 0.01267568723190025, "grad_norm": 4.5924682802658205, "kl": 0.06884765625, "learning_rate": 9.99603607534277e-07, "loss": 0.0028, "reward": 2.04296875, "reward_std": 0.016068341210484505, "rewards/accuracy_reward": 0.8429688215255737, "rewards/format_reward": 1.0, "step": 919 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.5, "epoch": 0.012689480145101448, "grad_norm": 2.3765006752269, "kl": 0.06787109375, "learning_rate": 9.996027445188163e-07, "loss": 0.0027, "reward": 2.05831241607666, "reward_std": 0.01175000797957182, "rewards/accuracy_reward": 0.8583124876022339, "rewards/format_reward": 1.0, "step": 920 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.53125, "epoch": 0.012703273058302643, "grad_norm": 2.593287870405021, "kl": 0.05712890625, "learning_rate": 9.996018805652824e-07, "loss": 0.0023, "reward": 2.1519062519073486, "reward_std": 0.02840704470872879, "rewards/accuracy_reward": 0.9581562876701355, "rewards/format_reward": 1.0, "step": 921 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.53125, "epoch": 0.01271706597150384, "grad_norm": 6.771963326537981, "kl": 0.06884765625, "learning_rate": 9.996010156736776e-07, "loss": 0.0028, "reward": 2.0836563110351562, "reward_std": 0.02402690052986145, "rewards/accuracy_reward": 0.8899062275886536, "rewards/format_reward": 1.0, "step": 922 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.0625, "epoch": 0.012730858884705038, "grad_norm": 2.3159900000790787, "kl": 0.055908203125, "learning_rate": 9.996001498440033e-07, "loss": 0.0022, "reward": 2.0596563816070557, "reward_std": 0.007038048934191465, "rewards/accuracy_reward": 0.8596562147140503, "rewards/format_reward": 1.0, "step": 923 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.75, "epoch": 0.012744651797906235, "grad_norm": 5.794802010367455, "kl": 0.06640625, "learning_rate": 9.995992830762607e-07, "loss": 0.0027, "reward": 2.088062286376953, "reward_std": 0.01949411630630493, "rewards/accuracy_reward": 0.8880625367164612, "rewards/format_reward": 1.0, "step": 924 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 361.0625, "epoch": 0.012758444711107432, "grad_norm": 3.8378583917643847, "kl": 0.06884765625, "learning_rate": 9.995984153704518e-07, "loss": 0.0028, "reward": 2.0709376335144043, "reward_std": 0.03367071971297264, "rewards/accuracy_reward": 0.8834375143051147, "rewards/format_reward": 1.0, "step": 925 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 364.53125, "epoch": 0.01277223762430863, "grad_norm": 2.4266878297545444, "kl": 0.06982421875, "learning_rate": 9.995975467265782e-07, "loss": 0.0028, "reward": 2.0297813415527344, "reward_std": 0.0686185285449028, "rewards/accuracy_reward": 0.8610312342643738, "rewards/format_reward": 1.0, "step": 926 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.78125, "epoch": 0.012786030537509827, "grad_norm": 3.76553231019906, "kl": 0.06396484375, "learning_rate": 9.995966771446417e-07, "loss": 0.0026, "reward": 2.0322186946868896, "reward_std": 0.03391903638839722, "rewards/accuracy_reward": 0.8322186470031738, "rewards/format_reward": 1.0, "step": 927 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.84375, "epoch": 0.012799823450711024, "grad_norm": 2.620665321086029, "kl": 0.060791015625, "learning_rate": 9.995958066246436e-07, "loss": 0.0024, "reward": 2.043375015258789, "reward_std": 0.026476388797163963, "rewards/accuracy_reward": 0.843375027179718, "rewards/format_reward": 1.0, "step": 928 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.53125, "epoch": 0.012813616363912221, "grad_norm": 3.013001107323946, "kl": 0.0673828125, "learning_rate": 9.995949351665856e-07, "loss": 0.0027, "reward": 2.1157186031341553, "reward_std": 0.03292486071586609, "rewards/accuracy_reward": 0.9219688177108765, "rewards/format_reward": 1.0, "step": 929 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.375, "epoch": 0.012827409277113419, "grad_norm": 2.0995687056642676, "kl": 0.0595703125, "learning_rate": 9.995940627704696e-07, "loss": 0.0024, "reward": 2.1374998092651367, "reward_std": 0.009665516205132008, "rewards/accuracy_reward": 0.9375000596046448, "rewards/format_reward": 1.0, "step": 930 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.3125, "epoch": 0.012841202190314616, "grad_norm": 1.895892621246368, "kl": 0.061767578125, "learning_rate": 9.995931894362968e-07, "loss": 0.0024, "reward": 2.1489062309265137, "reward_std": 0.021223776042461395, "rewards/accuracy_reward": 0.9551562666893005, "rewards/format_reward": 1.0, "step": 931 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.09375, "epoch": 0.012854995103515813, "grad_norm": 1.7104553026834026, "kl": 0.060791015625, "learning_rate": 9.995923151640693e-07, "loss": 0.0024, "reward": 2.0697498321533203, "reward_std": 0.03370888903737068, "rewards/accuracy_reward": 0.8822500705718994, "rewards/format_reward": 1.0, "step": 932 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.0625, "epoch": 0.01286878801671701, "grad_norm": 3.360395721276478, "kl": 0.06494140625, "learning_rate": 9.995914399537884e-07, "loss": 0.0026, "reward": 2.080531120300293, "reward_std": 0.010477394796907902, "rewards/accuracy_reward": 0.8805312514305115, "rewards/format_reward": 1.0, "step": 933 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.15625, "epoch": 0.012882580929918208, "grad_norm": 2.143762023639265, "kl": 0.061279296875, "learning_rate": 9.99590563805456e-07, "loss": 0.0024, "reward": 2.110562324523926, "reward_std": 0.019955508410930634, "rewards/accuracy_reward": 0.9105625152587891, "rewards/format_reward": 1.0, "step": 934 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.53125, "epoch": 0.012896373843119405, "grad_norm": 3.3071350463292495, "kl": 0.064453125, "learning_rate": 9.995896867190736e-07, "loss": 0.0026, "reward": 2.1410000324249268, "reward_std": 0.033771172165870667, "rewards/accuracy_reward": 0.9472500085830688, "rewards/format_reward": 1.0, "step": 935 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.6875, "epoch": 0.012910166756320602, "grad_norm": 2.0141665161157936, "kl": 0.061767578125, "learning_rate": 9.995888086946428e-07, "loss": 0.0025, "reward": 2.0889999866485596, "reward_std": 0.0042914170771837234, "rewards/accuracy_reward": 0.8889999389648438, "rewards/format_reward": 1.0, "step": 936 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.125, "epoch": 0.0129239596695218, "grad_norm": 1.6337262572752422, "kl": 0.056396484375, "learning_rate": 9.995879297321654e-07, "loss": 0.0023, "reward": 2.0865938663482666, "reward_std": 0.04895639047026634, "rewards/accuracy_reward": 0.8865936994552612, "rewards/format_reward": 1.0, "step": 937 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.09375, "epoch": 0.012937752582722997, "grad_norm": 2.0620506370125975, "kl": 0.05810546875, "learning_rate": 9.995870498316428e-07, "loss": 0.0023, "reward": 1.9742813110351562, "reward_std": 0.00941779650747776, "rewards/accuracy_reward": 0.7742812037467957, "rewards/format_reward": 1.0, "step": 938 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.28125, "epoch": 0.012951545495924194, "grad_norm": 2.5984474504990605, "kl": 0.054443359375, "learning_rate": 9.995861689930768e-07, "loss": 0.0022, "reward": 2.011218786239624, "reward_std": 0.026280401274561882, "rewards/accuracy_reward": 0.8112187385559082, "rewards/format_reward": 1.0, "step": 939 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.5625, "epoch": 0.012965338409125391, "grad_norm": 3.409584811066543, "kl": 0.06396484375, "learning_rate": 9.995852872164695e-07, "loss": 0.0026, "reward": 2.0370311737060547, "reward_std": 0.014715050347149372, "rewards/accuracy_reward": 0.8370311856269836, "rewards/format_reward": 1.0, "step": 940 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.125, "epoch": 0.012979131322326589, "grad_norm": 3.5886430068693205, "kl": 0.060791015625, "learning_rate": 9.995844045018216e-07, "loss": 0.0024, "reward": 2.1205625534057617, "reward_std": 0.019434409216046333, "rewards/accuracy_reward": 0.9205625057220459, "rewards/format_reward": 1.0, "step": 941 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.4375, "epoch": 0.012992924235527786, "grad_norm": 2.2954037921104824, "kl": 0.0654296875, "learning_rate": 9.995835208491356e-07, "loss": 0.0026, "reward": 2.111875057220459, "reward_std": 0.018479883670806885, "rewards/accuracy_reward": 0.9118750095367432, "rewards/format_reward": 1.0, "step": 942 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.3125, "epoch": 0.013006717148728983, "grad_norm": 2.167334040093991, "kl": 0.05712890625, "learning_rate": 9.995826362584129e-07, "loss": 0.0023, "reward": 1.9730000495910645, "reward_std": 0.006898774299770594, "rewards/accuracy_reward": 0.7730000019073486, "rewards/format_reward": 1.0, "step": 943 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.34375, "epoch": 0.01302051006193018, "grad_norm": 4.434470495093593, "kl": 0.0625, "learning_rate": 9.99581750729655e-07, "loss": 0.0025, "reward": 2.065218925476074, "reward_std": 0.040436986833810806, "rewards/accuracy_reward": 0.871468722820282, "rewards/format_reward": 1.0, "step": 944 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.34375, "epoch": 0.013034302975131378, "grad_norm": 2.6563888076754436, "kl": 0.0576171875, "learning_rate": 9.995808642628633e-07, "loss": 0.0023, "reward": 2.063312530517578, "reward_std": 0.030278170481324196, "rewards/accuracy_reward": 0.8695625066757202, "rewards/format_reward": 1.0, "step": 945 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.5, "epoch": 0.013048095888332575, "grad_norm": 4.080358092003208, "kl": 0.06005859375, "learning_rate": 9.995799768580403e-07, "loss": 0.0024, "reward": 2.103250026702881, "reward_std": 0.02897321805357933, "rewards/accuracy_reward": 0.9032500386238098, "rewards/format_reward": 1.0, "step": 946 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.3125, "epoch": 0.013061888801533772, "grad_norm": 0.96206234899221, "kl": 0.0517578125, "learning_rate": 9.995790885151871e-07, "loss": 0.0021, "reward": 2.141031265258789, "reward_std": 0.0031408481299877167, "rewards/accuracy_reward": 0.9410312175750732, "rewards/format_reward": 1.0, "step": 947 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.0, "epoch": 0.01307568171473497, "grad_norm": 1.9783221294050166, "kl": 0.05224609375, "learning_rate": 9.995781992343053e-07, "loss": 0.0021, "reward": 2.1516873836517334, "reward_std": 0.025221746414899826, "rewards/accuracy_reward": 0.957937479019165, "rewards/format_reward": 1.0, "step": 948 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.90625, "epoch": 0.013089474627936167, "grad_norm": 1.603459321759574, "kl": 0.05908203125, "learning_rate": 9.99577309015397e-07, "loss": 0.0024, "reward": 2.1642186641693115, "reward_std": 0.01903594098985195, "rewards/accuracy_reward": 0.9704688191413879, "rewards/format_reward": 1.0, "step": 949 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.53125, "epoch": 0.013103267541137364, "grad_norm": 1.891060649459561, "kl": 0.06494140625, "learning_rate": 9.995764178584633e-07, "loss": 0.0026, "reward": 2.1019062995910645, "reward_std": 0.01263848040252924, "rewards/accuracy_reward": 0.9019061923027039, "rewards/format_reward": 1.0, "step": 950 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.375, "epoch": 0.013117060454338561, "grad_norm": 4.133796988321448, "kl": 0.060546875, "learning_rate": 9.995755257635064e-07, "loss": 0.0024, "reward": 2.0258126258850098, "reward_std": 0.031173421069979668, "rewards/accuracy_reward": 0.8258125185966492, "rewards/format_reward": 1.0, "step": 951 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.21875, "epoch": 0.013130853367539759, "grad_norm": 2.0682174174411756, "kl": 0.068359375, "learning_rate": 9.995746327305276e-07, "loss": 0.0027, "reward": 2.1248438358306885, "reward_std": 0.03573830798268318, "rewards/accuracy_reward": 0.9373438358306885, "rewards/format_reward": 1.0, "step": 952 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.21875, "epoch": 0.013144646280740956, "grad_norm": 2.3116031717973557, "kl": 0.05419921875, "learning_rate": 9.99573738759529e-07, "loss": 0.0022, "reward": 2.05078125, "reward_std": 0.02314024046063423, "rewards/accuracy_reward": 0.8507813215255737, "rewards/format_reward": 1.0, "step": 953 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.53125, "epoch": 0.013158439193942153, "grad_norm": 1.9536978347300982, "kl": 0.05224609375, "learning_rate": 9.995728438505117e-07, "loss": 0.0021, "reward": 2.040968894958496, "reward_std": 0.02620755508542061, "rewards/accuracy_reward": 0.8409687280654907, "rewards/format_reward": 1.0, "step": 954 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.59375, "epoch": 0.01317223210714335, "grad_norm": 3.4775180665448615, "kl": 0.06591796875, "learning_rate": 9.99571948003478e-07, "loss": 0.0026, "reward": 2.121000051498413, "reward_std": 0.0229371078312397, "rewards/accuracy_reward": 0.921000063419342, "rewards/format_reward": 1.0, "step": 955 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.34375, "epoch": 0.013186025020344548, "grad_norm": 2.230545767425793, "kl": 0.058837890625, "learning_rate": 9.99571051218429e-07, "loss": 0.0023, "reward": 2.120968818664551, "reward_std": 0.04076975956559181, "rewards/accuracy_reward": 0.9272187352180481, "rewards/format_reward": 1.0, "step": 956 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.9375, "epoch": 0.013199817933545745, "grad_norm": 12.187507277584867, "kl": 0.058349609375, "learning_rate": 9.995701534953669e-07, "loss": 0.0023, "reward": 2.124687433242798, "reward_std": 0.03506990894675255, "rewards/accuracy_reward": 0.9246875047683716, "rewards/format_reward": 1.0, "step": 957 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.6875, "epoch": 0.01321361084674694, "grad_norm": 2.225453856215837, "kl": 0.060791015625, "learning_rate": 9.99569254834293e-07, "loss": 0.0024, "reward": 2.1204376220703125, "reward_std": 0.05662054196000099, "rewards/accuracy_reward": 0.9391874670982361, "rewards/format_reward": 1.0, "step": 958 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.375, "epoch": 0.013227403759948138, "grad_norm": 3.1531469343554335, "kl": 0.059326171875, "learning_rate": 9.995683552352092e-07, "loss": 0.0024, "reward": 2.123000144958496, "reward_std": 0.04171569645404816, "rewards/accuracy_reward": 0.9354999661445618, "rewards/format_reward": 1.0, "step": 959 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.6875, "epoch": 0.013241196673149335, "grad_norm": 2.510283560853735, "kl": 0.0546875, "learning_rate": 9.99567454698117e-07, "loss": 0.0022, "reward": 2.0232813358306885, "reward_std": 0.04163326323032379, "rewards/accuracy_reward": 0.8420312404632568, "rewards/format_reward": 1.0, "step": 960 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.0625, "epoch": 0.013254989586350532, "grad_norm": 2.8777407417439105, "kl": 0.0654296875, "learning_rate": 9.99566553223018e-07, "loss": 0.0026, "reward": 2.1313750743865967, "reward_std": 0.022761916741728783, "rewards/accuracy_reward": 0.9313749074935913, "rewards/format_reward": 1.0, "step": 961 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.4375, "epoch": 0.01326878249955173, "grad_norm": 2.4662369354427947, "kl": 0.06005859375, "learning_rate": 9.995656508099145e-07, "loss": 0.0024, "reward": 2.074718952178955, "reward_std": 0.05522676557302475, "rewards/accuracy_reward": 0.8934687376022339, "rewards/format_reward": 1.0, "step": 962 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 371.4375, "epoch": 0.013282575412752927, "grad_norm": 2.316676704282207, "kl": 0.0576171875, "learning_rate": 9.995647474588077e-07, "loss": 0.0023, "reward": 2.069406270980835, "reward_std": 0.040666162967681885, "rewards/accuracy_reward": 0.881906270980835, "rewards/format_reward": 1.0, "step": 963 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.0625, "epoch": 0.013296368325954124, "grad_norm": 6.451994796671183, "kl": 0.059814453125, "learning_rate": 9.995638431696994e-07, "loss": 0.0024, "reward": 2.08021879196167, "reward_std": 0.036402128636837006, "rewards/accuracy_reward": 0.8802187442779541, "rewards/format_reward": 1.0, "step": 964 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.0625, "epoch": 0.013310161239155321, "grad_norm": 2.1422472664307532, "kl": 0.0625, "learning_rate": 9.995629379425912e-07, "loss": 0.0025, "reward": 2.0102813243865967, "reward_std": 0.026051711291074753, "rewards/accuracy_reward": 0.816531240940094, "rewards/format_reward": 1.0, "step": 965 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 385.90625, "epoch": 0.013323954152356519, "grad_norm": 2.067149110197044, "kl": 0.0615234375, "learning_rate": 9.995620317774851e-07, "loss": 0.0025, "reward": 1.9296250343322754, "reward_std": 0.019551243633031845, "rewards/accuracy_reward": 0.7296250462532043, "rewards/format_reward": 1.0, "step": 966 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.75, "epoch": 0.013337747065557716, "grad_norm": 6.251715887985888, "kl": 0.059326171875, "learning_rate": 9.995611246743824e-07, "loss": 0.0024, "reward": 2.152656316757202, "reward_std": 0.01397896371781826, "rewards/accuracy_reward": 0.9526562690734863, "rewards/format_reward": 1.0, "step": 967 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 0.013351539978758913, "grad_norm": 2.5837081521457193, "kl": 0.0634765625, "learning_rate": 9.99560216633285e-07, "loss": 0.0025, "reward": 2.1388750076293945, "reward_std": 0.01405518501996994, "rewards/accuracy_reward": 0.9388750791549683, "rewards/format_reward": 1.0, "step": 968 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.8125, "epoch": 0.01336533289196011, "grad_norm": 3.4240178637681313, "kl": 0.0625, "learning_rate": 9.995593076541947e-07, "loss": 0.0025, "reward": 2.0802500247955322, "reward_std": 0.017234977334737778, "rewards/accuracy_reward": 0.8802499771118164, "rewards/format_reward": 1.0, "step": 969 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.75, "epoch": 0.013379125805161308, "grad_norm": 2.0050233770833965, "kl": 0.060546875, "learning_rate": 9.99558397737113e-07, "loss": 0.0024, "reward": 2.050093650817871, "reward_std": 0.020653944462537766, "rewards/accuracy_reward": 0.8500937819480896, "rewards/format_reward": 1.0, "step": 970 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.4375, "epoch": 0.013392918718362505, "grad_norm": 1.100530179238951, "kl": 0.06298828125, "learning_rate": 9.995574868820417e-07, "loss": 0.0025, "reward": 1.9181562662124634, "reward_std": 0.010097677819430828, "rewards/accuracy_reward": 0.7181562185287476, "rewards/format_reward": 1.0, "step": 971 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.96875, "epoch": 0.013406711631563702, "grad_norm": 2.3083452865416407, "kl": 0.06298828125, "learning_rate": 9.995565750889826e-07, "loss": 0.0025, "reward": 1.9631249904632568, "reward_std": 0.03356027603149414, "rewards/accuracy_reward": 0.7631250619888306, "rewards/format_reward": 1.0, "step": 972 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.8125, "epoch": 0.0134205045447649, "grad_norm": 1.975854283008749, "kl": 0.05517578125, "learning_rate": 9.995556623579373e-07, "loss": 0.0022, "reward": 2.106187343597412, "reward_std": 0.03825730085372925, "rewards/accuracy_reward": 0.9124375581741333, "rewards/format_reward": 1.0, "step": 973 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.0625, "epoch": 0.013434297457966097, "grad_norm": 2.530695759744838, "kl": 0.068359375, "learning_rate": 9.995547486889077e-07, "loss": 0.0027, "reward": 2.073500156402588, "reward_std": 0.033998943865299225, "rewards/accuracy_reward": 0.8735000491142273, "rewards/format_reward": 1.0, "step": 974 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.09375, "epoch": 0.013448090371167294, "grad_norm": 1.6458334499336997, "kl": 0.06396484375, "learning_rate": 9.995538340818951e-07, "loss": 0.0026, "reward": 2.151218891143799, "reward_std": 0.010997899807989597, "rewards/accuracy_reward": 0.9512187242507935, "rewards/format_reward": 1.0, "step": 975 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.375, "epoch": 0.013461883284368491, "grad_norm": 0.2477452730346337, "kl": 0.053955078125, "learning_rate": 9.995529185369018e-07, "loss": 0.0022, "reward": 1.9482500553131104, "reward_std": 0.0, "rewards/accuracy_reward": 0.7482500672340393, "rewards/format_reward": 1.0, "step": 976 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.40625, "epoch": 0.013475676197569689, "grad_norm": 3.0131001294551965, "kl": 0.052978515625, "learning_rate": 9.99552002053929e-07, "loss": 0.0021, "reward": 2.1441564559936523, "reward_std": 0.02559378556907177, "rewards/accuracy_reward": 0.9441561698913574, "rewards/format_reward": 1.0, "step": 977 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.0, "epoch": 0.013489469110770886, "grad_norm": 2.539167597786399, "kl": 0.056640625, "learning_rate": 9.995510846329788e-07, "loss": 0.0023, "reward": 2.0927810668945312, "reward_std": 0.029973864555358887, "rewards/accuracy_reward": 0.8927812576293945, "rewards/format_reward": 1.0, "step": 978 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.84375, "epoch": 0.013503262023972083, "grad_norm": 1.659274302788287, "kl": 0.0654296875, "learning_rate": 9.995501662740524e-07, "loss": 0.0026, "reward": 2.063000202178955, "reward_std": 0.006210608873516321, "rewards/accuracy_reward": 0.8630000352859497, "rewards/format_reward": 1.0, "step": 979 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.6875, "epoch": 0.01351705493717328, "grad_norm": 1.9213398583182117, "kl": 0.064453125, "learning_rate": 9.99549246977152e-07, "loss": 0.0026, "reward": 2.143531322479248, "reward_std": 0.07075823098421097, "rewards/accuracy_reward": 0.9497812986373901, "rewards/format_reward": 1.0, "step": 980 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.8125, "epoch": 0.013530847850374478, "grad_norm": 3.125976968410198, "kl": 0.0625, "learning_rate": 9.995483267422792e-07, "loss": 0.0025, "reward": 2.0835938453674316, "reward_std": 0.0372149720788002, "rewards/accuracy_reward": 0.8898437023162842, "rewards/format_reward": 1.0, "step": 981 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.96875, "epoch": 0.013544640763575675, "grad_norm": 2.500428329828002, "kl": 0.05859375, "learning_rate": 9.99547405569436e-07, "loss": 0.0023, "reward": 2.0518126487731934, "reward_std": 0.020919105038046837, "rewards/accuracy_reward": 0.8518125414848328, "rewards/format_reward": 1.0, "step": 982 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 387.71875, "epoch": 0.013558433676776872, "grad_norm": 2.1681708674973326, "kl": 0.061279296875, "learning_rate": 9.995464834586235e-07, "loss": 0.0025, "reward": 1.9480624198913574, "reward_std": 0.03235013037919998, "rewards/accuracy_reward": 0.7543125152587891, "rewards/format_reward": 1.0, "step": 983 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.5, "epoch": 0.01357222658997807, "grad_norm": 1.7307679603834816, "kl": 0.0673828125, "learning_rate": 9.99545560409844e-07, "loss": 0.0027, "reward": 2.0042500495910645, "reward_std": 0.012961498461663723, "rewards/accuracy_reward": 0.8042500019073486, "rewards/format_reward": 1.0, "step": 984 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.46875, "epoch": 0.013586019503179267, "grad_norm": 1.8536788783270453, "kl": 0.06396484375, "learning_rate": 9.995446364230991e-07, "loss": 0.0026, "reward": 2.066718816757202, "reward_std": 0.03827047348022461, "rewards/accuracy_reward": 0.8667187690734863, "rewards/format_reward": 1.0, "step": 985 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.59375, "epoch": 0.013599812416380464, "grad_norm": 2.6835941275324284, "kl": 0.06298828125, "learning_rate": 9.995437114983901e-07, "loss": 0.0025, "reward": 1.9026875495910645, "reward_std": 0.014580855146050453, "rewards/accuracy_reward": 0.7026875019073486, "rewards/format_reward": 1.0, "step": 986 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.5, "epoch": 0.013613605329581661, "grad_norm": 22.492767801161104, "kl": 0.057861328125, "learning_rate": 9.995427856357194e-07, "loss": 0.0023, "reward": 2.0933125019073486, "reward_std": 0.02505647763609886, "rewards/accuracy_reward": 0.8933125734329224, "rewards/format_reward": 1.0, "step": 987 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.65625, "epoch": 0.013627398242782859, "grad_norm": 4.966099600360118, "kl": 0.06494140625, "learning_rate": 9.995418588350884e-07, "loss": 0.0026, "reward": 1.888718843460083, "reward_std": 0.02759726345539093, "rewards/accuracy_reward": 0.688718855381012, "rewards/format_reward": 1.0, "step": 988 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.125, "epoch": 0.013641191155984056, "grad_norm": 2.1799339139432448, "kl": 0.06494140625, "learning_rate": 9.99540931096499e-07, "loss": 0.0026, "reward": 2.0843124389648438, "reward_std": 0.012447066605091095, "rewards/accuracy_reward": 0.8843125104904175, "rewards/format_reward": 1.0, "step": 989 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.8125, "epoch": 0.013654984069185253, "grad_norm": 3.6513041943975875, "kl": 0.0634765625, "learning_rate": 9.995400024199523e-07, "loss": 0.0025, "reward": 2.0113439559936523, "reward_std": 0.032104697078466415, "rewards/accuracy_reward": 0.8175938129425049, "rewards/format_reward": 1.0, "step": 990 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.5625, "epoch": 0.01366877698238645, "grad_norm": 1.3049986212995772, "kl": 0.06201171875, "learning_rate": 9.995390728054508e-07, "loss": 0.0025, "reward": 2.0866565704345703, "reward_std": 0.012286011129617691, "rewards/accuracy_reward": 0.8866562247276306, "rewards/format_reward": 1.0, "step": 991 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.46875, "epoch": 0.013682569895587648, "grad_norm": 2.130959311738575, "kl": 0.05908203125, "learning_rate": 9.995381422529962e-07, "loss": 0.0024, "reward": 1.9981250762939453, "reward_std": 0.05305713415145874, "rewards/accuracy_reward": 0.8043749928474426, "rewards/format_reward": 1.0, "step": 992 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.3125, "epoch": 0.013696362808788845, "grad_norm": 1.4435491083063408, "kl": 0.06640625, "learning_rate": 9.9953721076259e-07, "loss": 0.0027, "reward": 2.083531379699707, "reward_std": 0.009409750811755657, "rewards/accuracy_reward": 0.8835312128067017, "rewards/format_reward": 1.0, "step": 993 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.25, "epoch": 0.013710155721990042, "grad_norm": 2.35799571419742, "kl": 0.056884765625, "learning_rate": 9.99536278334234e-07, "loss": 0.0023, "reward": 2.058781147003174, "reward_std": 0.038040243089199066, "rewards/accuracy_reward": 0.8587812185287476, "rewards/format_reward": 1.0, "step": 994 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.46875, "epoch": 0.01372394863519124, "grad_norm": 3.1647969678637637, "kl": 0.05615234375, "learning_rate": 9.995353449679298e-07, "loss": 0.0022, "reward": 2.0491561889648438, "reward_std": 0.018156027421355247, "rewards/accuracy_reward": 0.8491562604904175, "rewards/format_reward": 1.0, "step": 995 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.875, "epoch": 0.013737741548392435, "grad_norm": 1.4698220414974874, "kl": 0.06689453125, "learning_rate": 9.995344106636793e-07, "loss": 0.0027, "reward": 1.8772813081741333, "reward_std": 0.014451442286372185, "rewards/accuracy_reward": 0.6772812604904175, "rewards/format_reward": 1.0, "step": 996 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.34375, "epoch": 0.013751534461593632, "grad_norm": 2.9435540764919885, "kl": 0.06494140625, "learning_rate": 9.995334754214843e-07, "loss": 0.0026, "reward": 2.0444061756134033, "reward_std": 0.041618090122938156, "rewards/accuracy_reward": 0.844406247138977, "rewards/format_reward": 1.0, "step": 997 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.84375, "epoch": 0.01376532737479483, "grad_norm": 2.3773181609999594, "kl": 0.058837890625, "learning_rate": 9.995325392413467e-07, "loss": 0.0024, "reward": 2.1249687671661377, "reward_std": 0.022040609270334244, "rewards/accuracy_reward": 0.9249687790870667, "rewards/format_reward": 1.0, "step": 998 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.013779120287996027, "grad_norm": 1.9611305368502028, "kl": 0.0634765625, "learning_rate": 9.995316021232678e-07, "loss": 0.0025, "reward": 2.076812505722046, "reward_std": 0.016642915084958076, "rewards/accuracy_reward": 0.8768124580383301, "rewards/format_reward": 1.0, "step": 999 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.71875, "epoch": 0.013792913201197224, "grad_norm": 2.2665532696317245, "kl": 0.064453125, "learning_rate": 9.995306640672497e-07, "loss": 0.0026, "reward": 2.14662504196167, "reward_std": 0.03353721648454666, "rewards/accuracy_reward": 0.9466249942779541, "rewards/format_reward": 1.0, "step": 1000 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.65625, "epoch": 0.013806706114398421, "grad_norm": 2.6841593636591816, "kl": 0.0615234375, "learning_rate": 9.99529725073294e-07, "loss": 0.0025, "reward": 1.918468713760376, "reward_std": 0.01867750473320484, "rewards/accuracy_reward": 0.7184686660766602, "rewards/format_reward": 1.0, "step": 1001 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.34375, "epoch": 0.013820499027599619, "grad_norm": 2.340833035537331, "kl": 0.058349609375, "learning_rate": 9.995287851414029e-07, "loss": 0.0023, "reward": 2.130500078201294, "reward_std": 0.035647280514240265, "rewards/accuracy_reward": 0.9305000305175781, "rewards/format_reward": 1.0, "step": 1002 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.21875, "epoch": 0.013834291940800816, "grad_norm": 2.382335624842158, "kl": 0.06298828125, "learning_rate": 9.995278442715775e-07, "loss": 0.0025, "reward": 2.090749979019165, "reward_std": 0.01863645389676094, "rewards/accuracy_reward": 0.890749990940094, "rewards/format_reward": 1.0, "step": 1003 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.0625, "epoch": 0.013848084854002013, "grad_norm": 3.1549631514710854, "kl": 0.0703125, "learning_rate": 9.9952690246382e-07, "loss": 0.0028, "reward": 2.057000160217285, "reward_std": 0.02115192636847496, "rewards/accuracy_reward": 0.8569999933242798, "rewards/format_reward": 1.0, "step": 1004 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.5625, "epoch": 0.01386187776720321, "grad_norm": 1.6145694170610638, "kl": 0.06689453125, "learning_rate": 9.99525959718132e-07, "loss": 0.0027, "reward": 2.0301876068115234, "reward_std": 0.014465948566794395, "rewards/accuracy_reward": 0.8301874995231628, "rewards/format_reward": 1.0, "step": 1005 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.53125, "epoch": 0.013875670680404408, "grad_norm": 1.7246037242703653, "kl": 0.0615234375, "learning_rate": 9.995250160345154e-07, "loss": 0.0025, "reward": 2.065687417984009, "reward_std": 0.053799569606781006, "rewards/accuracy_reward": 0.8656875491142273, "rewards/format_reward": 1.0, "step": 1006 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.03125, "epoch": 0.013889463593605605, "grad_norm": 2.274902733978332, "kl": 0.06982421875, "learning_rate": 9.995240714129719e-07, "loss": 0.0028, "reward": 2.0545313358306885, "reward_std": 0.020327426493167877, "rewards/accuracy_reward": 0.8545312285423279, "rewards/format_reward": 1.0, "step": 1007 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.375, "epoch": 0.013903256506806802, "grad_norm": 2.0253955247856483, "kl": 0.0654296875, "learning_rate": 9.995231258535033e-07, "loss": 0.0026, "reward": 2.073718786239624, "reward_std": 0.015995020046830177, "rewards/accuracy_reward": 0.8737187385559082, "rewards/format_reward": 1.0, "step": 1008 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 384.25, "epoch": 0.013917049420008, "grad_norm": 4.036634684637275, "kl": 0.06591796875, "learning_rate": 9.995221793561113e-07, "loss": 0.0026, "reward": 1.9777500629425049, "reward_std": 0.07585783302783966, "rewards/accuracy_reward": 0.7902500033378601, "rewards/format_reward": 1.0, "step": 1009 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.15625, "epoch": 0.013930842333209197, "grad_norm": 5.6564302480286575, "kl": 0.059814453125, "learning_rate": 9.995212319207976e-07, "loss": 0.0024, "reward": 2.071031332015991, "reward_std": 0.041754066944122314, "rewards/accuracy_reward": 0.8710312843322754, "rewards/format_reward": 1.0, "step": 1010 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.03125, "epoch": 0.013944635246410394, "grad_norm": 5.06472738897145, "kl": 0.06298828125, "learning_rate": 9.995202835475641e-07, "loss": 0.0025, "reward": 2.0567188262939453, "reward_std": 0.031211989000439644, "rewards/accuracy_reward": 0.8567187786102295, "rewards/format_reward": 1.0, "step": 1011 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.4375, "epoch": 0.013958428159611591, "grad_norm": 2.7941958010217474, "kl": 0.060546875, "learning_rate": 9.995193342364128e-07, "loss": 0.0024, "reward": 1.8352186679840088, "reward_std": 0.005374276079237461, "rewards/accuracy_reward": 0.6352187395095825, "rewards/format_reward": 1.0, "step": 1012 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.53125, "epoch": 0.013972221072812789, "grad_norm": 2.473876327437781, "kl": 0.0654296875, "learning_rate": 9.99518383987345e-07, "loss": 0.0026, "reward": 2.0870938301086426, "reward_std": 0.03920717164874077, "rewards/accuracy_reward": 0.8933437466621399, "rewards/format_reward": 1.0, "step": 1013 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.625, "epoch": 0.013986013986013986, "grad_norm": 1.925270923405343, "kl": 0.061767578125, "learning_rate": 9.99517432800363e-07, "loss": 0.0025, "reward": 2.0156874656677246, "reward_std": 0.027778197079896927, "rewards/accuracy_reward": 0.8219375014305115, "rewards/format_reward": 1.0, "step": 1014 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 394.9375, "epoch": 0.013999806899215183, "grad_norm": 1.862736498607258, "kl": 0.0576171875, "learning_rate": 9.99516480675468e-07, "loss": 0.0023, "reward": 1.8773751258850098, "reward_std": 0.008180994540452957, "rewards/accuracy_reward": 0.6773749589920044, "rewards/format_reward": 1.0, "step": 1015 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.5625, "epoch": 0.01401359981241638, "grad_norm": 2.3267801183908836, "kl": 0.05419921875, "learning_rate": 9.995155276126626e-07, "loss": 0.0022, "reward": 1.9841251373291016, "reward_std": 0.04302642494440079, "rewards/accuracy_reward": 0.784125030040741, "rewards/format_reward": 1.0, "step": 1016 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 379.53125, "epoch": 0.014027392725617578, "grad_norm": 2.4198964527178086, "kl": 0.057373046875, "learning_rate": 9.995145736119476e-07, "loss": 0.0023, "reward": 2.078843832015991, "reward_std": 0.04709043726325035, "rewards/accuracy_reward": 0.8850936889648438, "rewards/format_reward": 1.0, "step": 1017 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.71875, "epoch": 0.014041185638818775, "grad_norm": 2.7753958744593783, "kl": 0.057861328125, "learning_rate": 9.995136186733257e-07, "loss": 0.0023, "reward": 2.0915000438690186, "reward_std": 0.07932135462760925, "rewards/accuracy_reward": 0.8914999961853027, "rewards/format_reward": 1.0, "step": 1018 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.40625, "epoch": 0.014054978552019972, "grad_norm": 1.8747227301003553, "kl": 0.050537109375, "learning_rate": 9.99512662796798e-07, "loss": 0.002, "reward": 2.111187696456909, "reward_std": 0.030793720856308937, "rewards/accuracy_reward": 0.9174375534057617, "rewards/format_reward": 1.0, "step": 1019 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.4375, "epoch": 0.01406877146522117, "grad_norm": 3.018782971982781, "kl": 0.057373046875, "learning_rate": 9.995117059823667e-07, "loss": 0.0023, "reward": 2.0834999084472656, "reward_std": 0.025418316945433617, "rewards/accuracy_reward": 0.8834999799728394, "rewards/format_reward": 1.0, "step": 1020 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.375, "epoch": 0.014082564378422367, "grad_norm": 2.2283906563593, "kl": 0.061279296875, "learning_rate": 9.995107482300335e-07, "loss": 0.0025, "reward": 2.0917813777923584, "reward_std": 0.030742540955543518, "rewards/accuracy_reward": 0.8917812705039978, "rewards/format_reward": 1.0, "step": 1021 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 0.014096357291623564, "grad_norm": 3.7469031261924775, "kl": 0.0625, "learning_rate": 9.995097895398003e-07, "loss": 0.0025, "reward": 2.0813751220703125, "reward_std": 0.019066594541072845, "rewards/accuracy_reward": 0.8813750147819519, "rewards/format_reward": 1.0, "step": 1022 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.9375, "epoch": 0.014110150204824761, "grad_norm": 2.1316071164421024, "kl": 0.05615234375, "learning_rate": 9.995088299116688e-07, "loss": 0.0022, "reward": 2.1322813034057617, "reward_std": 0.03719237446784973, "rewards/accuracy_reward": 0.938531219959259, "rewards/format_reward": 1.0, "step": 1023 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.09375, "epoch": 0.014123943118025959, "grad_norm": 3.4566052885942553, "kl": 0.0615234375, "learning_rate": 9.995078693456407e-07, "loss": 0.0025, "reward": 2.0728750228881836, "reward_std": 0.04213777929544449, "rewards/accuracy_reward": 0.8791249990463257, "rewards/format_reward": 1.0, "step": 1024 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.625, "epoch": 0.014137736031227156, "grad_norm": 2.262560830772727, "kl": 0.060302734375, "learning_rate": 9.995069078417177e-07, "loss": 0.0024, "reward": 2.020249843597412, "reward_std": 0.013918399810791016, "rewards/accuracy_reward": 0.8202500343322754, "rewards/format_reward": 1.0, "step": 1025 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.25, "epoch": 0.014151528944428353, "grad_norm": 2.556370021374263, "kl": 0.056640625, "learning_rate": 9.99505945399902e-07, "loss": 0.0023, "reward": 2.003406047821045, "reward_std": 0.08778459578752518, "rewards/accuracy_reward": 0.8096562623977661, "rewards/format_reward": 1.0, "step": 1026 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.25, "epoch": 0.01416532185762955, "grad_norm": 2.747186719581895, "kl": 0.0615234375, "learning_rate": 9.995049820201952e-07, "loss": 0.0025, "reward": 2.0791561603546143, "reward_std": 0.02769346907734871, "rewards/accuracy_reward": 0.879156231880188, "rewards/format_reward": 1.0, "step": 1027 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.84375, "epoch": 0.014179114770830748, "grad_norm": 2.2228034115777238, "kl": 0.05517578125, "learning_rate": 9.99504017702599e-07, "loss": 0.0022, "reward": 2.146437644958496, "reward_std": 0.031086117029190063, "rewards/accuracy_reward": 0.9464374780654907, "rewards/format_reward": 1.0, "step": 1028 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.875, "epoch": 0.014192907684031945, "grad_norm": 2.4237383420672143, "kl": 0.0634765625, "learning_rate": 9.995030524471154e-07, "loss": 0.0025, "reward": 2.0984373092651367, "reward_std": 0.02709505707025528, "rewards/accuracy_reward": 0.8984375, "rewards/format_reward": 1.0, "step": 1029 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 383.125, "epoch": 0.014206700597233142, "grad_norm": 9.992156867116375, "kl": 0.0595703125, "learning_rate": 9.995020862537462e-07, "loss": 0.0024, "reward": 2.0556561946868896, "reward_std": 0.039962634444236755, "rewards/accuracy_reward": 0.8619062304496765, "rewards/format_reward": 1.0, "step": 1030 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.21875, "epoch": 0.01422049351043434, "grad_norm": 3.749501724210966, "kl": 0.06396484375, "learning_rate": 9.99501119122493e-07, "loss": 0.0026, "reward": 2.136625051498413, "reward_std": 0.028208408504724503, "rewards/accuracy_reward": 0.9366250038146973, "rewards/format_reward": 1.0, "step": 1031 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.6875, "epoch": 0.014234286423635537, "grad_norm": 2.5634091107812917, "kl": 0.06591796875, "learning_rate": 9.99500151053358e-07, "loss": 0.0026, "reward": 2.0800938606262207, "reward_std": 0.03827814757823944, "rewards/accuracy_reward": 0.8863437175750732, "rewards/format_reward": 1.0, "step": 1032 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.375, "epoch": 0.014248079336836734, "grad_norm": 2.2669041007151476, "kl": 0.0634765625, "learning_rate": 9.994991820463426e-07, "loss": 0.0025, "reward": 2.1167502403259277, "reward_std": 0.06366574764251709, "rewards/accuracy_reward": 0.9167500138282776, "rewards/format_reward": 1.0, "step": 1033 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.8125, "epoch": 0.01426187225003793, "grad_norm": 2.4915107432135724, "kl": 0.0576171875, "learning_rate": 9.994982121014487e-07, "loss": 0.0023, "reward": 1.968656301498413, "reward_std": 0.035532400012016296, "rewards/accuracy_reward": 0.7686562538146973, "rewards/format_reward": 1.0, "step": 1034 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.5, "epoch": 0.014275665163239127, "grad_norm": 2.1809811971570356, "kl": 0.0634765625, "learning_rate": 9.994972412186785e-07, "loss": 0.0025, "reward": 2.0294063091278076, "reward_std": 0.03878871724009514, "rewards/accuracy_reward": 0.8356562852859497, "rewards/format_reward": 1.0, "step": 1035 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.09375, "epoch": 0.014289458076440324, "grad_norm": 2.260253775706708, "kl": 0.056640625, "learning_rate": 9.994962693980333e-07, "loss": 0.0023, "reward": 1.9719061851501465, "reward_std": 0.04054107144474983, "rewards/accuracy_reward": 0.7844062447547913, "rewards/format_reward": 1.0, "step": 1036 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.15625, "epoch": 0.014303250989641522, "grad_norm": 6.934004361380599, "kl": 0.0546875, "learning_rate": 9.994952966395152e-07, "loss": 0.0022, "reward": 2.1446876525878906, "reward_std": 0.009687417186796665, "rewards/accuracy_reward": 0.9446874856948853, "rewards/format_reward": 1.0, "step": 1037 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.40625, "epoch": 0.014317043902842719, "grad_norm": 2.871732371385341, "kl": 0.05810546875, "learning_rate": 9.994943229431262e-07, "loss": 0.0023, "reward": 2.009031295776367, "reward_std": 0.06822541356086731, "rewards/accuracy_reward": 0.8152812719345093, "rewards/format_reward": 1.0, "step": 1038 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.625, "epoch": 0.014330836816043916, "grad_norm": 2.090810930347843, "kl": 0.06591796875, "learning_rate": 9.994933483088677e-07, "loss": 0.0026, "reward": 2.0885000228881836, "reward_std": 0.03708851337432861, "rewards/accuracy_reward": 0.8947499990463257, "rewards/format_reward": 1.0, "step": 1039 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.6875, "epoch": 0.014344629729245113, "grad_norm": 1.6883198717499386, "kl": 0.06298828125, "learning_rate": 9.994923727367418e-07, "loss": 0.0025, "reward": 2.1227500438690186, "reward_std": 0.025841422379016876, "rewards/accuracy_reward": 0.9352499842643738, "rewards/format_reward": 1.0, "step": 1040 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.90625, "epoch": 0.01435842264244631, "grad_norm": 3.2253503668773083, "kl": 0.0654296875, "learning_rate": 9.994913962267503e-07, "loss": 0.0026, "reward": 2.0452499389648438, "reward_std": 0.03652583062648773, "rewards/accuracy_reward": 0.8452500104904175, "rewards/format_reward": 1.0, "step": 1041 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.25, "epoch": 0.014372215555647508, "grad_norm": 1.0793983996727927, "kl": 0.051513671875, "learning_rate": 9.99490418778895e-07, "loss": 0.0021, "reward": 2.110781192779541, "reward_std": 0.009393594227731228, "rewards/accuracy_reward": 0.9107812643051147, "rewards/format_reward": 1.0, "step": 1042 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.34375, "epoch": 0.014386008468848705, "grad_norm": 3.567533867559961, "kl": 0.06298828125, "learning_rate": 9.994894403931778e-07, "loss": 0.0025, "reward": 2.071499824523926, "reward_std": 0.01883268915116787, "rewards/accuracy_reward": 0.8715000748634338, "rewards/format_reward": 1.0, "step": 1043 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.014399801382049902, "grad_norm": 1.9767863781483073, "kl": 0.056396484375, "learning_rate": 9.994884610696006e-07, "loss": 0.0023, "reward": 2.0631563663482666, "reward_std": 0.023641955107450485, "rewards/accuracy_reward": 0.863156259059906, "rewards/format_reward": 1.0, "step": 1044 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.21875, "epoch": 0.0144135942952511, "grad_norm": 2.3432420394467752, "kl": 0.059326171875, "learning_rate": 9.99487480808165e-07, "loss": 0.0024, "reward": 2.1525936126708984, "reward_std": 0.018496576696634293, "rewards/accuracy_reward": 0.9525937438011169, "rewards/format_reward": 1.0, "step": 1045 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.875, "epoch": 0.014427387208452297, "grad_norm": 2.11665813366599, "kl": 0.05712890625, "learning_rate": 9.99486499608873e-07, "loss": 0.0023, "reward": 2.023750066757202, "reward_std": 0.03328660875558853, "rewards/accuracy_reward": 0.8299999833106995, "rewards/format_reward": 1.0, "step": 1046 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.90625, "epoch": 0.014441180121653494, "grad_norm": 2.308078835490882, "kl": 0.06591796875, "learning_rate": 9.994855174717264e-07, "loss": 0.0026, "reward": 2.0730624198913574, "reward_std": 0.008193084970116615, "rewards/accuracy_reward": 0.8730625510215759, "rewards/format_reward": 1.0, "step": 1047 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.84375, "epoch": 0.014454973034854691, "grad_norm": 2.525293100444316, "kl": 0.05615234375, "learning_rate": 9.99484534396727e-07, "loss": 0.0023, "reward": 2.126281261444092, "reward_std": 0.0399869903922081, "rewards/accuracy_reward": 0.9450312256813049, "rewards/format_reward": 1.0, "step": 1048 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.0625, "epoch": 0.014468765948055889, "grad_norm": 2.313492910387635, "kl": 0.060791015625, "learning_rate": 9.994835503838768e-07, "loss": 0.0024, "reward": 1.9915001392364502, "reward_std": 0.01579016074538231, "rewards/accuracy_reward": 0.7915000915527344, "rewards/format_reward": 1.0, "step": 1049 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.78125, "epoch": 0.014482558861257086, "grad_norm": 4.643788930351187, "kl": 0.05859375, "learning_rate": 9.994825654331776e-07, "loss": 0.0023, "reward": 2.049499988555908, "reward_std": 0.01867077127099037, "rewards/accuracy_reward": 0.8495000600814819, "rewards/format_reward": 1.0, "step": 1050 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.014496351774458283, "grad_norm": 2.647180271597688, "kl": 0.05859375, "learning_rate": 9.99481579544631e-07, "loss": 0.0023, "reward": 2.068312644958496, "reward_std": 0.029035402461886406, "rewards/accuracy_reward": 0.8683124780654907, "rewards/format_reward": 1.0, "step": 1051 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.9375, "epoch": 0.01451014468765948, "grad_norm": 2.6491884664332597, "kl": 0.062255859375, "learning_rate": 9.994805927182393e-07, "loss": 0.0025, "reward": 2.0622501373291016, "reward_std": 0.04071832075715065, "rewards/accuracy_reward": 0.862250030040741, "rewards/format_reward": 1.0, "step": 1052 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.4375, "epoch": 0.014523937600860678, "grad_norm": 2.3423580431266395, "kl": 0.06640625, "learning_rate": 9.99479604954004e-07, "loss": 0.0026, "reward": 2.0285000801086426, "reward_std": 0.019985763356089592, "rewards/accuracy_reward": 0.8284999132156372, "rewards/format_reward": 1.0, "step": 1053 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.6875, "epoch": 0.014537730514061875, "grad_norm": 1.6756725204796774, "kl": 0.0546875, "learning_rate": 9.994786162519271e-07, "loss": 0.0022, "reward": 2.100437641143799, "reward_std": 0.021006863564252853, "rewards/accuracy_reward": 0.9004374146461487, "rewards/format_reward": 1.0, "step": 1054 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.84375, "epoch": 0.014551523427263072, "grad_norm": 3.8194351990252198, "kl": 0.072265625, "learning_rate": 9.994776266120104e-07, "loss": 0.0029, "reward": 2.0522189140319824, "reward_std": 0.023851297795772552, "rewards/accuracy_reward": 0.852218747138977, "rewards/format_reward": 1.0, "step": 1055 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.03125, "epoch": 0.01456531634046427, "grad_norm": 2.6374030353977003, "kl": 0.0595703125, "learning_rate": 9.994766360342558e-07, "loss": 0.0024, "reward": 2.118593692779541, "reward_std": 0.02647656947374344, "rewards/accuracy_reward": 0.9185937643051147, "rewards/format_reward": 1.0, "step": 1056 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.75, "epoch": 0.014579109253665467, "grad_norm": 2.6238147638786815, "kl": 0.056884765625, "learning_rate": 9.99475644518665e-07, "loss": 0.0023, "reward": 2.1460001468658447, "reward_std": 0.03200087323784828, "rewards/accuracy_reward": 0.9459999799728394, "rewards/format_reward": 1.0, "step": 1057 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.5625, "epoch": 0.014592902166866664, "grad_norm": 2.082613684206703, "kl": 0.0654296875, "learning_rate": 9.9947465206524e-07, "loss": 0.0026, "reward": 2.0904998779296875, "reward_std": 0.020687583833932877, "rewards/accuracy_reward": 0.8904999494552612, "rewards/format_reward": 1.0, "step": 1058 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.375, "epoch": 0.014606695080067861, "grad_norm": 3.450562654773857, "kl": 0.064453125, "learning_rate": 9.994736586739827e-07, "loss": 0.0026, "reward": 2.1506876945495605, "reward_std": 0.015783367678523064, "rewards/accuracy_reward": 0.9506875276565552, "rewards/format_reward": 1.0, "step": 1059 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.25, "epoch": 0.014620487993269059, "grad_norm": 1.9472301918914765, "kl": 0.06787109375, "learning_rate": 9.99472664344895e-07, "loss": 0.0027, "reward": 2.1585001945495605, "reward_std": 0.021822787821292877, "rewards/accuracy_reward": 0.9584999084472656, "rewards/format_reward": 1.0, "step": 1060 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.59375, "epoch": 0.014634280906470256, "grad_norm": 1.8273523890431835, "kl": 0.06640625, "learning_rate": 9.994716690779787e-07, "loss": 0.0027, "reward": 2.0393123626708984, "reward_std": 0.018948350101709366, "rewards/accuracy_reward": 0.8393124938011169, "rewards/format_reward": 1.0, "step": 1061 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.0625, "epoch": 0.014648073819671453, "grad_norm": 2.6782014668207665, "kl": 0.06298828125, "learning_rate": 9.994706728732355e-07, "loss": 0.0025, "reward": 2.1312811374664307, "reward_std": 0.031693197786808014, "rewards/accuracy_reward": 0.9312812089920044, "rewards/format_reward": 1.0, "step": 1062 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.625, "epoch": 0.01466186673287265, "grad_norm": 2.324852894599851, "kl": 0.06396484375, "learning_rate": 9.994696757306675e-07, "loss": 0.0026, "reward": 2.057000160217285, "reward_std": 0.06525996327400208, "rewards/accuracy_reward": 0.856999933719635, "rewards/format_reward": 1.0, "step": 1063 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.0, "epoch": 0.014675659646073848, "grad_norm": 3.29667561854513, "kl": 0.064453125, "learning_rate": 9.994686776502766e-07, "loss": 0.0026, "reward": 1.9528436660766602, "reward_std": 0.029975606128573418, "rewards/accuracy_reward": 0.7528437376022339, "rewards/format_reward": 1.0, "step": 1064 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.25, "epoch": 0.014689452559275045, "grad_norm": 1.7500356482894373, "kl": 0.0712890625, "learning_rate": 9.994676786320647e-07, "loss": 0.0028, "reward": 2.054281234741211, "reward_std": 0.028028924018144608, "rewards/accuracy_reward": 0.8605312705039978, "rewards/format_reward": 1.0, "step": 1065 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 432.28125, "epoch": 0.014703245472476242, "grad_norm": 9.309616225727984, "kl": 0.06396484375, "learning_rate": 9.994666786760333e-07, "loss": 0.0026, "reward": 2.1013126373291016, "reward_std": 0.03515523672103882, "rewards/accuracy_reward": 0.9075624942779541, "rewards/format_reward": 1.0, "step": 1066 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.375, "epoch": 0.01471703838567744, "grad_norm": 2.035352915275198, "kl": 0.06396484375, "learning_rate": 9.994656777821846e-07, "loss": 0.0026, "reward": 2.156437397003174, "reward_std": 0.025794094428420067, "rewards/accuracy_reward": 0.9626874923706055, "rewards/format_reward": 1.0, "step": 1067 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.75, "epoch": 0.014730831298878637, "grad_norm": 2.187782467298346, "kl": 0.059814453125, "learning_rate": 9.994646759505203e-07, "loss": 0.0024, "reward": 2.1257810592651367, "reward_std": 0.040661588311195374, "rewards/accuracy_reward": 0.92578125, "rewards/format_reward": 1.0, "step": 1068 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.28125, "epoch": 0.014744624212079834, "grad_norm": 2.5026453390955274, "kl": 0.06640625, "learning_rate": 9.994636731810427e-07, "loss": 0.0027, "reward": 1.9868437051773071, "reward_std": 0.023193754255771637, "rewards/accuracy_reward": 0.7868437767028809, "rewards/format_reward": 1.0, "step": 1069 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 429.1875, "epoch": 0.014758417125281031, "grad_norm": 2.288931551258801, "kl": 0.057861328125, "learning_rate": 9.99462669473753e-07, "loss": 0.0023, "reward": 2.1106252670288086, "reward_std": 0.050059713423252106, "rewards/accuracy_reward": 0.9168750047683716, "rewards/format_reward": 1.0, "step": 1070 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 437.3125, "epoch": 0.014772210038482227, "grad_norm": 2.057354487026471, "kl": 0.0673828125, "learning_rate": 9.994616648286536e-07, "loss": 0.0027, "reward": 2.0542187690734863, "reward_std": 0.04256404936313629, "rewards/accuracy_reward": 0.8604687452316284, "rewards/format_reward": 1.0, "step": 1071 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.09375, "epoch": 0.014786002951683424, "grad_norm": 2.0371520809223878, "kl": 0.060546875, "learning_rate": 9.994606592457463e-07, "loss": 0.0024, "reward": 2.103062629699707, "reward_std": 0.04283753037452698, "rewards/accuracy_reward": 0.9093124866485596, "rewards/format_reward": 1.0, "step": 1072 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.03125, "epoch": 0.014799795864884622, "grad_norm": 1.9452010049327786, "kl": 0.059814453125, "learning_rate": 9.99459652725033e-07, "loss": 0.0024, "reward": 2.104968786239624, "reward_std": 0.03442716225981712, "rewards/accuracy_reward": 0.9112187027931213, "rewards/format_reward": 1.0, "step": 1073 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.75, "epoch": 0.014813588778085819, "grad_norm": 1.2523213041177887, "kl": 0.0693359375, "learning_rate": 9.994586452665155e-07, "loss": 0.0028, "reward": 2.129312515258789, "reward_std": 0.033046651631593704, "rewards/accuracy_reward": 0.9418123960494995, "rewards/format_reward": 1.0, "step": 1074 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.65625, "epoch": 0.014827381691287016, "grad_norm": 2.8727954950597177, "kl": 0.060302734375, "learning_rate": 9.994576368701957e-07, "loss": 0.0024, "reward": 2.141531229019165, "reward_std": 0.04028373956680298, "rewards/accuracy_reward": 0.9477812647819519, "rewards/format_reward": 1.0, "step": 1075 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.59375, "epoch": 0.014841174604488213, "grad_norm": 1.5956311726419976, "kl": 0.06591796875, "learning_rate": 9.994566275360753e-07, "loss": 0.0026, "reward": 2.121875047683716, "reward_std": 0.009881383739411831, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1076 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.15625, "epoch": 0.01485496751768941, "grad_norm": 2.218568019552992, "kl": 0.0556640625, "learning_rate": 9.994556172641567e-07, "loss": 0.0022, "reward": 2.1162188053131104, "reward_std": 0.011133064515888691, "rewards/accuracy_reward": 0.9162187576293945, "rewards/format_reward": 1.0, "step": 1077 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.96875, "epoch": 0.014868760430890608, "grad_norm": 1.424588244485191, "kl": 0.07080078125, "learning_rate": 9.994546060544413e-07, "loss": 0.0028, "reward": 2.0124688148498535, "reward_std": 0.004626516252756119, "rewards/accuracy_reward": 0.8124687671661377, "rewards/format_reward": 1.0, "step": 1078 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.40625, "epoch": 0.014882553344091805, "grad_norm": 2.9556073070322904, "kl": 0.06201171875, "learning_rate": 9.994535939069313e-07, "loss": 0.0025, "reward": 2.0877811908721924, "reward_std": 0.01757834479212761, "rewards/accuracy_reward": 0.8877811431884766, "rewards/format_reward": 1.0, "step": 1079 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.4375, "epoch": 0.014896346257293002, "grad_norm": 2.400652816341641, "kl": 0.0625, "learning_rate": 9.994525808216286e-07, "loss": 0.0025, "reward": 2.096937417984009, "reward_std": 0.02723454311490059, "rewards/accuracy_reward": 0.8969374895095825, "rewards/format_reward": 1.0, "step": 1080 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.1875, "epoch": 0.0149101391704942, "grad_norm": 2.6900558056393655, "kl": 0.06640625, "learning_rate": 9.994515667985348e-07, "loss": 0.0027, "reward": 2.0487499237060547, "reward_std": 0.052178263664245605, "rewards/accuracy_reward": 0.8550000190734863, "rewards/format_reward": 1.0, "step": 1081 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.59375, "epoch": 0.014923932083695397, "grad_norm": 1.333579928012792, "kl": 0.0517578125, "learning_rate": 9.99450551837652e-07, "loss": 0.0021, "reward": 2.1450939178466797, "reward_std": 0.002078552497550845, "rewards/accuracy_reward": 0.9450937509536743, "rewards/format_reward": 1.0, "step": 1082 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.28125, "epoch": 0.014937724996896594, "grad_norm": 5.372646244504243, "kl": 0.0712890625, "learning_rate": 9.994495359389823e-07, "loss": 0.0028, "reward": 2.0963125228881836, "reward_std": 0.010374093428254128, "rewards/accuracy_reward": 0.8963124752044678, "rewards/format_reward": 1.0, "step": 1083 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.375, "epoch": 0.014951517910097792, "grad_norm": 2.4921541387065727, "kl": 0.06787109375, "learning_rate": 9.994485191025272e-07, "loss": 0.0027, "reward": 2.087218761444092, "reward_std": 0.024709578603506088, "rewards/accuracy_reward": 0.887218713760376, "rewards/format_reward": 1.0, "step": 1084 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.375, "epoch": 0.014965310823298989, "grad_norm": 1.48399931650834, "kl": 0.052490234375, "learning_rate": 9.99447501328289e-07, "loss": 0.0021, "reward": 2.174562454223633, "reward_std": 0.010739422403275967, "rewards/accuracy_reward": 0.9745625257492065, "rewards/format_reward": 1.0, "step": 1085 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.0625, "epoch": 0.014979103736500186, "grad_norm": 4.842247127748281, "kl": 0.0634765625, "learning_rate": 9.994464826162694e-07, "loss": 0.0025, "reward": 2.086437463760376, "reward_std": 0.03863893449306488, "rewards/accuracy_reward": 0.8926874399185181, "rewards/format_reward": 1.0, "step": 1086 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.014992896649701383, "grad_norm": 2.1795682781704797, "kl": 0.056884765625, "learning_rate": 9.994454629664704e-07, "loss": 0.0023, "reward": 2.114375114440918, "reward_std": 0.03267224133014679, "rewards/accuracy_reward": 0.9206250309944153, "rewards/format_reward": 1.0, "step": 1087 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.84375, "epoch": 0.01500668956290258, "grad_norm": 1.926170602872597, "kl": 0.06787109375, "learning_rate": 9.994444423788937e-07, "loss": 0.0027, "reward": 2.121062755584717, "reward_std": 0.017179129645228386, "rewards/accuracy_reward": 0.9210624098777771, "rewards/format_reward": 1.0, "step": 1088 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.71875, "epoch": 0.015020482476103778, "grad_norm": 1.3725523750611817, "kl": 0.057373046875, "learning_rate": 9.994434208535414e-07, "loss": 0.0023, "reward": 2.1212501525878906, "reward_std": 0.004670638125389814, "rewards/accuracy_reward": 0.9212501049041748, "rewards/format_reward": 1.0, "step": 1089 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.5, "epoch": 0.015034275389304975, "grad_norm": 1.8777286157731157, "kl": 0.0634765625, "learning_rate": 9.994423983904157e-07, "loss": 0.0025, "reward": 2.0565624237060547, "reward_std": 0.015134502202272415, "rewards/accuracy_reward": 0.8565624952316284, "rewards/format_reward": 1.0, "step": 1090 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.09375, "epoch": 0.015048068302506172, "grad_norm": 42.66234406750561, "kl": 0.0654296875, "learning_rate": 9.99441374989518e-07, "loss": 0.0026, "reward": 2.0629687309265137, "reward_std": 0.026116151362657547, "rewards/accuracy_reward": 0.8629687428474426, "rewards/format_reward": 1.0, "step": 1091 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.5, "epoch": 0.01506186121570737, "grad_norm": 10.41268494868566, "kl": 0.06494140625, "learning_rate": 9.994403506508503e-07, "loss": 0.0026, "reward": 2.0698747634887695, "reward_std": 0.03441910073161125, "rewards/accuracy_reward": 0.8698750138282776, "rewards/format_reward": 1.0, "step": 1092 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.03125, "epoch": 0.015075654128908567, "grad_norm": 2.4362879999026066, "kl": 0.06689453125, "learning_rate": 9.99439325374415e-07, "loss": 0.0027, "reward": 2.067187547683716, "reward_std": 0.024930741637945175, "rewards/accuracy_reward": 0.8671875596046448, "rewards/format_reward": 1.0, "step": 1093 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.015089447042109764, "grad_norm": 2.1130370523876048, "kl": 0.0673828125, "learning_rate": 9.994382991602133e-07, "loss": 0.0027, "reward": 2.142124891281128, "reward_std": 0.01581864058971405, "rewards/accuracy_reward": 0.9421250224113464, "rewards/format_reward": 1.0, "step": 1094 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 0.015103239955310961, "grad_norm": 3.1864484621376525, "kl": 0.0634765625, "learning_rate": 9.994372720082478e-07, "loss": 0.0025, "reward": 2.1130313873291016, "reward_std": 0.02044382505118847, "rewards/accuracy_reward": 0.9130312204360962, "rewards/format_reward": 1.0, "step": 1095 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.015117032868512159, "grad_norm": 2.1388219698734723, "kl": 0.05419921875, "learning_rate": 9.9943624391852e-07, "loss": 0.0022, "reward": 2.107875108718872, "reward_std": 0.05932911857962608, "rewards/accuracy_reward": 0.9203750491142273, "rewards/format_reward": 1.0, "step": 1096 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.015130825781713356, "grad_norm": 2.3839989599174833, "kl": 0.0625, "learning_rate": 9.99435214891032e-07, "loss": 0.0025, "reward": 2.0528125762939453, "reward_std": 0.019413847476243973, "rewards/accuracy_reward": 0.8528125286102295, "rewards/format_reward": 1.0, "step": 1097 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.09375, "epoch": 0.015144618694914553, "grad_norm": 2.033735055120275, "kl": 0.0703125, "learning_rate": 9.994341849257859e-07, "loss": 0.0028, "reward": 1.9528439044952393, "reward_std": 0.025748802348971367, "rewards/accuracy_reward": 0.7528437972068787, "rewards/format_reward": 1.0, "step": 1098 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.6875, "epoch": 0.01515841160811575, "grad_norm": 2.245073830736851, "kl": 0.064453125, "learning_rate": 9.994331540227832e-07, "loss": 0.0026, "reward": 2.001093864440918, "reward_std": 0.03333578258752823, "rewards/accuracy_reward": 0.8010937571525574, "rewards/format_reward": 1.0, "step": 1099 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.375, "epoch": 0.015172204521316948, "grad_norm": 2.496701802389481, "kl": 0.07373046875, "learning_rate": 9.994321221820263e-07, "loss": 0.003, "reward": 2.0866875648498535, "reward_std": 0.023032210767269135, "rewards/accuracy_reward": 0.8866875171661377, "rewards/format_reward": 1.0, "step": 1100 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.21875, "epoch": 0.015185997434518145, "grad_norm": 5.006268716123861, "kl": 0.0654296875, "learning_rate": 9.994310894035167e-07, "loss": 0.0026, "reward": 2.0892810821533203, "reward_std": 0.029601451009511948, "rewards/accuracy_reward": 0.895531177520752, "rewards/format_reward": 1.0, "step": 1101 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.40625, "epoch": 0.015199790347719342, "grad_norm": 1.865276240188177, "kl": 0.064453125, "learning_rate": 9.994300556872567e-07, "loss": 0.0026, "reward": 2.017937660217285, "reward_std": 0.03594556450843811, "rewards/accuracy_reward": 0.8241875171661377, "rewards/format_reward": 1.0, "step": 1102 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.25, "epoch": 0.01521358326092054, "grad_norm": 3.0769919361254736, "kl": 0.06787109375, "learning_rate": 9.994290210332478e-07, "loss": 0.0027, "reward": 2.0431251525878906, "reward_std": 0.05550801753997803, "rewards/accuracy_reward": 0.8493750095367432, "rewards/format_reward": 1.0, "step": 1103 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.21875, "epoch": 0.015227376174121737, "grad_norm": 1.1934991898044662, "kl": 0.0615234375, "learning_rate": 9.994279854414927e-07, "loss": 0.0025, "reward": 2.0102500915527344, "reward_std": 0.00590096740052104, "rewards/accuracy_reward": 0.8102500438690186, "rewards/format_reward": 1.0, "step": 1104 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.015241169087322934, "grad_norm": 2.783297836932153, "kl": 0.0712890625, "learning_rate": 9.994269489119926e-07, "loss": 0.0029, "reward": 2.067812442779541, "reward_std": 0.018586885184049606, "rewards/accuracy_reward": 0.8678125143051147, "rewards/format_reward": 1.0, "step": 1105 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.59375, "epoch": 0.015254962000524131, "grad_norm": 3.6209274001933975, "kl": 0.072265625, "learning_rate": 9.994259114447499e-07, "loss": 0.0029, "reward": 2.080812454223633, "reward_std": 0.019839555025100708, "rewards/accuracy_reward": 0.8808125257492065, "rewards/format_reward": 1.0, "step": 1106 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.5, "epoch": 0.015268754913725329, "grad_norm": 1.483393149084059, "kl": 0.06201171875, "learning_rate": 9.994248730397661e-07, "loss": 0.0025, "reward": 2.0144686698913574, "reward_std": 0.010401781648397446, "rewards/accuracy_reward": 0.8144687414169312, "rewards/format_reward": 1.0, "step": 1107 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.84375, "epoch": 0.015282547826926526, "grad_norm": 2.72084521752612, "kl": 0.060791015625, "learning_rate": 9.994238336970436e-07, "loss": 0.0024, "reward": 2.1058125495910645, "reward_std": 0.02686990797519684, "rewards/accuracy_reward": 0.9058125615119934, "rewards/format_reward": 1.0, "step": 1108 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.8125, "epoch": 0.015296340740127722, "grad_norm": 1.823953342188384, "kl": 0.0703125, "learning_rate": 9.994227934165843e-07, "loss": 0.0028, "reward": 1.9065937995910645, "reward_std": 0.014940287917852402, "rewards/accuracy_reward": 0.7065937519073486, "rewards/format_reward": 1.0, "step": 1109 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 0.015310133653328919, "grad_norm": 2.114621453977902, "kl": 0.06494140625, "learning_rate": 9.994217521983898e-07, "loss": 0.0026, "reward": 2.06278133392334, "reward_std": 0.02265346422791481, "rewards/accuracy_reward": 0.862781286239624, "rewards/format_reward": 1.0, "step": 1110 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.09375, "epoch": 0.015323926566530116, "grad_norm": 2.8239795437789477, "kl": 0.060546875, "learning_rate": 9.994207100424625e-07, "loss": 0.0024, "reward": 2.0698437690734863, "reward_std": 0.03149894252419472, "rewards/accuracy_reward": 0.8698437213897705, "rewards/format_reward": 1.0, "step": 1111 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.015337719479731313, "grad_norm": 1.7070307350029286, "kl": 0.06396484375, "learning_rate": 9.99419666948804e-07, "loss": 0.0025, "reward": 2.1032185554504395, "reward_std": 0.011139210313558578, "rewards/accuracy_reward": 0.9032188057899475, "rewards/format_reward": 1.0, "step": 1112 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.46875, "epoch": 0.01535151239293251, "grad_norm": 19.24588487725623, "kl": 0.06640625, "learning_rate": 9.994186229174164e-07, "loss": 0.0027, "reward": 2.032562494277954, "reward_std": 0.023525450378656387, "rewards/accuracy_reward": 0.8325625061988831, "rewards/format_reward": 1.0, "step": 1113 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.84375, "epoch": 0.015365305306133708, "grad_norm": 2.79638005343388, "kl": 0.06640625, "learning_rate": 9.994175779483017e-07, "loss": 0.0027, "reward": 2.038062572479248, "reward_std": 0.0161270871758461, "rewards/accuracy_reward": 0.8380624651908875, "rewards/format_reward": 1.0, "step": 1114 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.375, "epoch": 0.015379098219334905, "grad_norm": 2.8525139000897015, "kl": 0.0615234375, "learning_rate": 9.994165320414618e-07, "loss": 0.0025, "reward": 2.1016249656677246, "reward_std": 0.015592718496918678, "rewards/accuracy_reward": 0.9016249775886536, "rewards/format_reward": 1.0, "step": 1115 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.53125, "epoch": 0.015392891132536102, "grad_norm": 2.2731562507807763, "kl": 0.0654296875, "learning_rate": 9.994154851968988e-07, "loss": 0.0026, "reward": 2.048281192779541, "reward_std": 0.012053274549543858, "rewards/accuracy_reward": 0.8482812643051147, "rewards/format_reward": 1.0, "step": 1116 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.375, "epoch": 0.0154066840457373, "grad_norm": 6.008809417243853, "kl": 0.068359375, "learning_rate": 9.994144374146145e-07, "loss": 0.0027, "reward": 1.990875005722046, "reward_std": 0.03312664106488228, "rewards/accuracy_reward": 0.7908750176429749, "rewards/format_reward": 1.0, "step": 1117 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.21875, "epoch": 0.015420476958938497, "grad_norm": 2.406566575373047, "kl": 0.06298828125, "learning_rate": 9.994133886946108e-07, "loss": 0.0025, "reward": 2.0693750381469727, "reward_std": 0.029706541448831558, "rewards/accuracy_reward": 0.8693749904632568, "rewards/format_reward": 1.0, "step": 1118 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.875, "epoch": 0.015434269872139694, "grad_norm": 2.0678949573014638, "kl": 0.06591796875, "learning_rate": 9.994123390368899e-07, "loss": 0.0026, "reward": 2.019843578338623, "reward_std": 0.046620793640613556, "rewards/accuracy_reward": 0.8323436975479126, "rewards/format_reward": 1.0, "step": 1119 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.5, "epoch": 0.015448062785340892, "grad_norm": 1.73815244240117, "kl": 0.0556640625, "learning_rate": 9.994112884414537e-07, "loss": 0.0022, "reward": 2.124875068664551, "reward_std": 0.04046914353966713, "rewards/accuracy_reward": 0.937375009059906, "rewards/format_reward": 1.0, "step": 1120 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.03125, "epoch": 0.015461855698542089, "grad_norm": 6.3524144726152825, "kl": 0.06787109375, "learning_rate": 9.99410236908304e-07, "loss": 0.0027, "reward": 2.114734411239624, "reward_std": 0.041595689952373505, "rewards/accuracy_reward": 0.9209843277931213, "rewards/format_reward": 1.0, "step": 1121 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.59375, "epoch": 0.015475648611743286, "grad_norm": 1.9924175510208753, "kl": 0.06591796875, "learning_rate": 9.99409184437443e-07, "loss": 0.0026, "reward": 2.0250000953674316, "reward_std": 0.014884786680340767, "rewards/accuracy_reward": 0.8250000476837158, "rewards/format_reward": 1.0, "step": 1122 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.375, "epoch": 0.015489441524944483, "grad_norm": 1.826142853300095, "kl": 0.060791015625, "learning_rate": 9.994081310288725e-07, "loss": 0.0024, "reward": 2.028390645980835, "reward_std": 0.012495080009102821, "rewards/accuracy_reward": 0.8283905982971191, "rewards/format_reward": 1.0, "step": 1123 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.4375, "epoch": 0.01550323443814568, "grad_norm": 1.7060229216807339, "kl": 0.0712890625, "learning_rate": 9.994070766825946e-07, "loss": 0.0029, "reward": 2.027437448501587, "reward_std": 0.012202245183289051, "rewards/accuracy_reward": 0.8274375200271606, "rewards/format_reward": 1.0, "step": 1124 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.625, "epoch": 0.015517027351346878, "grad_norm": 2.2619932484529777, "kl": 0.0634765625, "learning_rate": 9.994060213986112e-07, "loss": 0.0025, "reward": 2.034468650817871, "reward_std": 0.025152001529932022, "rewards/accuracy_reward": 0.8344687223434448, "rewards/format_reward": 1.0, "step": 1125 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.71875, "epoch": 0.015530820264548075, "grad_norm": 3.162495559122797, "kl": 0.07470703125, "learning_rate": 9.994049651769245e-07, "loss": 0.003, "reward": 2.1051249504089355, "reward_std": 0.035846106708049774, "rewards/accuracy_reward": 0.9113749861717224, "rewards/format_reward": 1.0, "step": 1126 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.59375, "epoch": 0.015544613177749272, "grad_norm": 2.514282537535975, "kl": 0.06494140625, "learning_rate": 9.994039080175362e-07, "loss": 0.0026, "reward": 2.037234306335449, "reward_std": 0.0439925417304039, "rewards/accuracy_reward": 0.8434844017028809, "rewards/format_reward": 1.0, "step": 1127 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.65625, "epoch": 0.01555840609095047, "grad_norm": 2.797905476969195, "kl": 0.06494140625, "learning_rate": 9.994028499204482e-07, "loss": 0.0026, "reward": 1.9822499752044678, "reward_std": 0.01832308992743492, "rewards/accuracy_reward": 0.7822500467300415, "rewards/format_reward": 1.0, "step": 1128 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.3125, "epoch": 0.015572199004151667, "grad_norm": 2.265747457683481, "kl": 0.06884765625, "learning_rate": 9.994017908856632e-07, "loss": 0.0027, "reward": 2.004718780517578, "reward_std": 0.02151535265147686, "rewards/accuracy_reward": 0.8047187924385071, "rewards/format_reward": 1.0, "step": 1129 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.6875, "epoch": 0.015585991917352864, "grad_norm": 2.6598770148788073, "kl": 0.0732421875, "learning_rate": 9.994007309131822e-07, "loss": 0.0029, "reward": 2.0670626163482666, "reward_std": 0.034036919474601746, "rewards/accuracy_reward": 0.8733125925064087, "rewards/format_reward": 1.0, "step": 1130 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.59375, "epoch": 0.015599784830554062, "grad_norm": 2.296487548390693, "kl": 0.061767578125, "learning_rate": 9.993996700030078e-07, "loss": 0.0025, "reward": 2.053187370300293, "reward_std": 0.025254828855395317, "rewards/accuracy_reward": 0.8531875610351562, "rewards/format_reward": 1.0, "step": 1131 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.5625, "epoch": 0.015613577743755259, "grad_norm": 2.202847231501686, "kl": 0.064453125, "learning_rate": 9.993986081551419e-07, "loss": 0.0026, "reward": 1.945968747138977, "reward_std": 0.012108190916478634, "rewards/accuracy_reward": 0.745968759059906, "rewards/format_reward": 1.0, "step": 1132 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.0, "epoch": 0.015627370656956456, "grad_norm": 2.1923861337441486, "kl": 0.056396484375, "learning_rate": 9.993975453695863e-07, "loss": 0.0023, "reward": 2.101968765258789, "reward_std": 0.03147018700838089, "rewards/accuracy_reward": 0.9082187414169312, "rewards/format_reward": 1.0, "step": 1133 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.84375, "epoch": 0.015641163570157653, "grad_norm": 2.876764104519325, "kl": 0.072265625, "learning_rate": 9.993964816463433e-07, "loss": 0.0029, "reward": 2.119874954223633, "reward_std": 0.029321841895580292, "rewards/accuracy_reward": 0.9198750257492065, "rewards/format_reward": 1.0, "step": 1134 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.875, "epoch": 0.01565495648335885, "grad_norm": 2.588124826073845, "kl": 0.06787109375, "learning_rate": 9.993954169854146e-07, "loss": 0.0027, "reward": 1.9973751306533813, "reward_std": 0.019354483112692833, "rewards/accuracy_reward": 0.797374963760376, "rewards/format_reward": 1.0, "step": 1135 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.6875, "epoch": 0.015668749396560048, "grad_norm": 2.365653484663119, "kl": 0.0703125, "learning_rate": 9.993943513868023e-07, "loss": 0.0028, "reward": 1.9656875133514404, "reward_std": 0.025600366294384003, "rewards/accuracy_reward": 0.7656875252723694, "rewards/format_reward": 1.0, "step": 1136 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.71875, "epoch": 0.015682542309761245, "grad_norm": 2.76342915524858, "kl": 0.07421875, "learning_rate": 9.993932848505086e-07, "loss": 0.003, "reward": 2.1200313568115234, "reward_std": 0.021025152876973152, "rewards/accuracy_reward": 0.9200311899185181, "rewards/format_reward": 1.0, "step": 1137 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.5, "epoch": 0.015696335222962442, "grad_norm": 3.4983245109927177, "kl": 0.06982421875, "learning_rate": 9.993922173765353e-07, "loss": 0.0028, "reward": 2.0953750610351562, "reward_std": 0.01202378235757351, "rewards/accuracy_reward": 0.8953750729560852, "rewards/format_reward": 1.0, "step": 1138 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.125, "epoch": 0.01571012813616364, "grad_norm": 2.6349678223295143, "kl": 0.06884765625, "learning_rate": 9.993911489648844e-07, "loss": 0.0028, "reward": 2.112593650817871, "reward_std": 0.03881729394197464, "rewards/accuracy_reward": 0.9188437461853027, "rewards/format_reward": 1.0, "step": 1139 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 0.015723921049364837, "grad_norm": 5.279422495562205, "kl": 0.0673828125, "learning_rate": 9.99390079615558e-07, "loss": 0.0027, "reward": 1.995750069618225, "reward_std": 0.036774467676877975, "rewards/accuracy_reward": 0.7957500219345093, "rewards/format_reward": 1.0, "step": 1140 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.5625, "epoch": 0.015737713962566034, "grad_norm": 55.02813971989049, "kl": 0.05615234375, "learning_rate": 9.99389009328558e-07, "loss": 0.0022, "reward": 2.1155624389648438, "reward_std": 0.014095155522227287, "rewards/accuracy_reward": 0.9155624508857727, "rewards/format_reward": 1.0, "step": 1141 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.8125, "epoch": 0.01575150687576723, "grad_norm": 5.8619625184261075, "kl": 0.06201171875, "learning_rate": 9.993879381038865e-07, "loss": 0.0025, "reward": 2.0765938758850098, "reward_std": 0.04129648208618164, "rewards/accuracy_reward": 0.8765937089920044, "rewards/format_reward": 1.0, "step": 1142 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.8125, "epoch": 0.01576529978896843, "grad_norm": 2.0790611137040105, "kl": 0.0625, "learning_rate": 9.993868659415453e-07, "loss": 0.0025, "reward": 2.0629687309265137, "reward_std": 0.08024712651968002, "rewards/accuracy_reward": 0.8629688024520874, "rewards/format_reward": 1.0, "step": 1143 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.90625, "epoch": 0.015779092702169626, "grad_norm": 4.885438570305793, "kl": 0.060546875, "learning_rate": 9.993857928415368e-07, "loss": 0.0024, "reward": 2.044875144958496, "reward_std": 0.028586652129888535, "rewards/accuracy_reward": 0.8448749780654907, "rewards/format_reward": 1.0, "step": 1144 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.4375, "epoch": 0.015792885615370823, "grad_norm": 2.698557986663061, "kl": 0.064453125, "learning_rate": 9.993847188038628e-07, "loss": 0.0026, "reward": 2.0670313835144043, "reward_std": 0.027227405458688736, "rewards/accuracy_reward": 0.8670312166213989, "rewards/format_reward": 1.0, "step": 1145 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.71875, "epoch": 0.01580667852857202, "grad_norm": 2.430768731897271, "kl": 0.060302734375, "learning_rate": 9.993836438285255e-07, "loss": 0.0024, "reward": 2.037937641143799, "reward_std": 0.041231196373701096, "rewards/accuracy_reward": 0.8441874384880066, "rewards/format_reward": 1.0, "step": 1146 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.0625, "epoch": 0.015820471441773218, "grad_norm": 2.982746654651911, "kl": 0.05859375, "learning_rate": 9.993825679155262e-07, "loss": 0.0023, "reward": 2.077031373977661, "reward_std": 0.061495304107666016, "rewards/accuracy_reward": 0.8770313262939453, "rewards/format_reward": 1.0, "step": 1147 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.3125, "epoch": 0.015834264354974415, "grad_norm": 1.8186873686715441, "kl": 0.0576171875, "learning_rate": 9.99381491064868e-07, "loss": 0.0023, "reward": 1.9345937967300415, "reward_std": 0.020978011190891266, "rewards/accuracy_reward": 0.7345937490463257, "rewards/format_reward": 1.0, "step": 1148 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.015848057268175612, "grad_norm": 2.3525540442997595, "kl": 0.06494140625, "learning_rate": 9.993804132765521e-07, "loss": 0.0026, "reward": 2.0937187671661377, "reward_std": 0.020365837961435318, "rewards/accuracy_reward": 0.8937187790870667, "rewards/format_reward": 1.0, "step": 1149 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.78125, "epoch": 0.01586185018137681, "grad_norm": 14.01362629528508, "kl": 0.06640625, "learning_rate": 9.993793345505809e-07, "loss": 0.0026, "reward": 2.0889062881469727, "reward_std": 0.019671235233545303, "rewards/accuracy_reward": 0.8889062404632568, "rewards/format_reward": 1.0, "step": 1150 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.09375, "epoch": 0.015875643094578007, "grad_norm": 2.3662879399109267, "kl": 0.05810546875, "learning_rate": 9.993782548869562e-07, "loss": 0.0023, "reward": 1.9958750009536743, "reward_std": 0.03132371976971626, "rewards/accuracy_reward": 0.7958749532699585, "rewards/format_reward": 1.0, "step": 1151 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.015889436007779204, "grad_norm": 2.399341917265444, "kl": 0.062255859375, "learning_rate": 9.993771742856804e-07, "loss": 0.0025, "reward": 1.9700312614440918, "reward_std": 0.040816061198711395, "rewards/accuracy_reward": 0.7700312733650208, "rewards/format_reward": 1.0, "step": 1152 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.46875, "epoch": 0.0159032289209804, "grad_norm": 2.3654804095394653, "kl": 0.06201171875, "learning_rate": 9.993760927467553e-07, "loss": 0.0025, "reward": 2.0527501106262207, "reward_std": 0.03276272863149643, "rewards/accuracy_reward": 0.8527500033378601, "rewards/format_reward": 1.0, "step": 1153 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.0625, "epoch": 0.0159170218341816, "grad_norm": 1.8090014575254691, "kl": 0.060791015625, "learning_rate": 9.993750102701826e-07, "loss": 0.0024, "reward": 2.009671926498413, "reward_std": 0.026841901242733, "rewards/accuracy_reward": 0.8159218430519104, "rewards/format_reward": 1.0, "step": 1154 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.5, "epoch": 0.015930814747382796, "grad_norm": 4.245357779411806, "kl": 0.06201171875, "learning_rate": 9.993739268559647e-07, "loss": 0.0025, "reward": 2.044343948364258, "reward_std": 0.027243871241807938, "rewards/accuracy_reward": 0.8443437218666077, "rewards/format_reward": 1.0, "step": 1155 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 423.03125, "epoch": 0.015944607660583993, "grad_norm": 2.4299734333140215, "kl": 0.06591796875, "learning_rate": 9.99372842504104e-07, "loss": 0.0026, "reward": 1.964296817779541, "reward_std": 0.045913681387901306, "rewards/accuracy_reward": 0.7705468535423279, "rewards/format_reward": 1.0, "step": 1156 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.75, "epoch": 0.01595840057378519, "grad_norm": 2.4141429541700226, "kl": 0.0634765625, "learning_rate": 9.993717572146016e-07, "loss": 0.0025, "reward": 2.0583748817443848, "reward_std": 0.013394411653280258, "rewards/accuracy_reward": 0.8583750128746033, "rewards/format_reward": 1.0, "step": 1157 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.875, "epoch": 0.015972193486986388, "grad_norm": 2.7514518993397274, "kl": 0.06640625, "learning_rate": 9.993706709874602e-07, "loss": 0.0026, "reward": 1.9363125562667847, "reward_std": 0.032857369631528854, "rewards/accuracy_reward": 0.7425625324249268, "rewards/format_reward": 1.0, "step": 1158 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.78125, "epoch": 0.015985986400187585, "grad_norm": 1.3329847645543602, "kl": 0.055908203125, "learning_rate": 9.993695838226818e-07, "loss": 0.0022, "reward": 2.0935001373291016, "reward_std": 0.009721132926642895, "rewards/accuracy_reward": 0.893500030040741, "rewards/format_reward": 1.0, "step": 1159 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.09375, "epoch": 0.015999779313388782, "grad_norm": 5.65610004433882, "kl": 0.05908203125, "learning_rate": 9.993684957202684e-07, "loss": 0.0024, "reward": 2.0814061164855957, "reward_std": 0.03583874925971031, "rewards/accuracy_reward": 0.8876563310623169, "rewards/format_reward": 1.0, "step": 1160 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.625, "epoch": 0.01601357222658998, "grad_norm": 1.5261686316222318, "kl": 0.0595703125, "learning_rate": 9.99367406680222e-07, "loss": 0.0024, "reward": 2.120187282562256, "reward_std": 0.010566852986812592, "rewards/accuracy_reward": 0.9201875329017639, "rewards/format_reward": 1.0, "step": 1161 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 444.46875, "epoch": 0.016027365139791177, "grad_norm": 1.709002106042905, "kl": 0.058837890625, "learning_rate": 9.993663167025445e-07, "loss": 0.0024, "reward": 1.9316562414169312, "reward_std": 0.03195605427026749, "rewards/accuracy_reward": 0.737906277179718, "rewards/format_reward": 1.0, "step": 1162 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.15625, "epoch": 0.016041158052992374, "grad_norm": 1.8828778686583867, "kl": 0.06689453125, "learning_rate": 9.99365225787238e-07, "loss": 0.0027, "reward": 2.0196564197540283, "reward_std": 0.016404660418629646, "rewards/accuracy_reward": 0.8196563124656677, "rewards/format_reward": 1.0, "step": 1163 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 418.6875, "epoch": 0.01605495096619357, "grad_norm": 2.6157308153332623, "kl": 0.0673828125, "learning_rate": 9.993641339343048e-07, "loss": 0.0027, "reward": 1.8850313425064087, "reward_std": 0.02486385405063629, "rewards/accuracy_reward": 0.6850312948226929, "rewards/format_reward": 1.0, "step": 1164 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.25, "epoch": 0.016068743879394765, "grad_norm": 3.282839511851077, "kl": 0.06298828125, "learning_rate": 9.993630411437468e-07, "loss": 0.0025, "reward": 2.0717499256134033, "reward_std": 0.05713348463177681, "rewards/accuracy_reward": 0.878000020980835, "rewards/format_reward": 1.0, "step": 1165 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.375, "epoch": 0.016082536792595963, "grad_norm": 1.6727532247128405, "kl": 0.06494140625, "learning_rate": 9.993619474155658e-07, "loss": 0.0026, "reward": 1.9839375019073486, "reward_std": 0.012156892567873001, "rewards/accuracy_reward": 0.7839374542236328, "rewards/format_reward": 1.0, "step": 1166 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.78125, "epoch": 0.01609632970579716, "grad_norm": 1.7988053296143596, "kl": 0.0595703125, "learning_rate": 9.993608527497643e-07, "loss": 0.0024, "reward": 2.0970938205718994, "reward_std": 0.01126367598772049, "rewards/accuracy_reward": 0.8970937728881836, "rewards/format_reward": 1.0, "step": 1167 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.34375, "epoch": 0.016110122618998357, "grad_norm": 2.5477468422602287, "kl": 0.05615234375, "learning_rate": 9.99359757146344e-07, "loss": 0.0022, "reward": 2.0705623626708984, "reward_std": 0.02816617116332054, "rewards/accuracy_reward": 0.8705624938011169, "rewards/format_reward": 1.0, "step": 1168 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.71875, "epoch": 0.016123915532199554, "grad_norm": 5.685719393648389, "kl": 0.05712890625, "learning_rate": 9.99358660605307e-07, "loss": 0.0023, "reward": 2.0773439407348633, "reward_std": 0.036236897110939026, "rewards/accuracy_reward": 0.8773437738418579, "rewards/format_reward": 1.0, "step": 1169 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.0, "epoch": 0.01613770844540075, "grad_norm": 1.729303054238063, "kl": 0.05517578125, "learning_rate": 9.993575631266558e-07, "loss": 0.0022, "reward": 2.0693438053131104, "reward_std": 0.017319152131676674, "rewards/accuracy_reward": 0.8693437576293945, "rewards/format_reward": 1.0, "step": 1170 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.1875, "epoch": 0.01615150135860195, "grad_norm": 2.243796428548856, "kl": 0.062255859375, "learning_rate": 9.993564647103919e-07, "loss": 0.0025, "reward": 2.0122811794281006, "reward_std": 0.04178939759731293, "rewards/accuracy_reward": 0.8185312747955322, "rewards/format_reward": 1.0, "step": 1171 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.4375, "epoch": 0.016165294271803146, "grad_norm": 1.3220641681103404, "kl": 0.056396484375, "learning_rate": 9.993553653565175e-07, "loss": 0.0022, "reward": 2.1764373779296875, "reward_std": 0.0063827005214989185, "rewards/accuracy_reward": 0.9764374494552612, "rewards/format_reward": 1.0, "step": 1172 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.5625, "epoch": 0.016179087185004343, "grad_norm": 3.0231804717365582, "kl": 0.06005859375, "learning_rate": 9.99354265065035e-07, "loss": 0.0024, "reward": 2.061187744140625, "reward_std": 0.026546740904450417, "rewards/accuracy_reward": 0.8611875176429749, "rewards/format_reward": 1.0, "step": 1173 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.46875, "epoch": 0.01619288009820554, "grad_norm": 4.597139166488826, "kl": 0.0654296875, "learning_rate": 9.993531638359459e-07, "loss": 0.0026, "reward": 2.1100001335144043, "reward_std": 0.026963971555233, "rewards/accuracy_reward": 0.9100000262260437, "rewards/format_reward": 1.0, "step": 1174 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.3125, "epoch": 0.016206673011406738, "grad_norm": 6.21985943125025, "kl": 0.0537109375, "learning_rate": 9.993520616692526e-07, "loss": 0.0021, "reward": 2.04325008392334, "reward_std": 0.037057772278785706, "rewards/accuracy_reward": 0.8494999408721924, "rewards/format_reward": 1.0, "step": 1175 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.25, "epoch": 0.016220465924607935, "grad_norm": 2.414668465904221, "kl": 0.056640625, "learning_rate": 9.99350958564957e-07, "loss": 0.0023, "reward": 1.9773750305175781, "reward_std": 0.02781280316412449, "rewards/accuracy_reward": 0.7773749828338623, "rewards/format_reward": 1.0, "step": 1176 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.5, "epoch": 0.016234258837809133, "grad_norm": 2.31868472194358, "kl": 0.0537109375, "learning_rate": 9.993498545230616e-07, "loss": 0.0021, "reward": 2.1153438091278076, "reward_std": 0.015813734382390976, "rewards/accuracy_reward": 0.9153437614440918, "rewards/format_reward": 1.0, "step": 1177 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.25, "epoch": 0.01624805175101033, "grad_norm": 2.948206045648212, "kl": 0.06396484375, "learning_rate": 9.993487495435682e-07, "loss": 0.0026, "reward": 2.121500015258789, "reward_std": 0.026033833622932434, "rewards/accuracy_reward": 0.9214999675750732, "rewards/format_reward": 1.0, "step": 1178 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.4375, "epoch": 0.016261844664211527, "grad_norm": 2.127673203166328, "kl": 0.0654296875, "learning_rate": 9.993476436264787e-07, "loss": 0.0026, "reward": 2.1236250400543213, "reward_std": 0.024130120873451233, "rewards/accuracy_reward": 0.9236249923706055, "rewards/format_reward": 1.0, "step": 1179 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.96875, "epoch": 0.016275637577412724, "grad_norm": 2.5536922089990703, "kl": 0.060302734375, "learning_rate": 9.993465367717954e-07, "loss": 0.0024, "reward": 1.9992188215255737, "reward_std": 0.0426429882645607, "rewards/accuracy_reward": 0.8117187023162842, "rewards/format_reward": 1.0, "step": 1180 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.625, "epoch": 0.01628943049061392, "grad_norm": 6.645836548127712, "kl": 0.05517578125, "learning_rate": 9.993454289795205e-07, "loss": 0.0022, "reward": 2.098656177520752, "reward_std": 0.024374019354581833, "rewards/accuracy_reward": 0.8986561894416809, "rewards/format_reward": 1.0, "step": 1181 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.3125, "epoch": 0.01630322340381512, "grad_norm": 1.5768737030021052, "kl": 0.06005859375, "learning_rate": 9.993443202496557e-07, "loss": 0.0024, "reward": 2.083124876022339, "reward_std": 0.054747484624385834, "rewards/accuracy_reward": 0.8831250071525574, "rewards/format_reward": 1.0, "step": 1182 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.9375, "epoch": 0.016317016317016316, "grad_norm": 2.722133619504414, "kl": 0.06982421875, "learning_rate": 9.993432105822034e-07, "loss": 0.0028, "reward": 2.1630001068115234, "reward_std": 0.02354966662824154, "rewards/accuracy_reward": 0.969249963760376, "rewards/format_reward": 1.0, "step": 1183 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.40625, "epoch": 0.016330809230217513, "grad_norm": 2.656927516556769, "kl": 0.059326171875, "learning_rate": 9.993420999771654e-07, "loss": 0.0024, "reward": 2.0958125591278076, "reward_std": 0.021701566874980927, "rewards/accuracy_reward": 0.9020625352859497, "rewards/format_reward": 1.0, "step": 1184 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.28125, "epoch": 0.01634460214341871, "grad_norm": 2.4785063990663834, "kl": 0.06689453125, "learning_rate": 9.99340988434544e-07, "loss": 0.0027, "reward": 2.084343910217285, "reward_std": 0.0472121424973011, "rewards/accuracy_reward": 0.8905937671661377, "rewards/format_reward": 1.0, "step": 1185 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.0625, "epoch": 0.016358395056619908, "grad_norm": 3.2443550863611654, "kl": 0.0595703125, "learning_rate": 9.99339875954341e-07, "loss": 0.0024, "reward": 2.087296962738037, "reward_std": 0.019508250057697296, "rewards/accuracy_reward": 0.8872967958450317, "rewards/format_reward": 1.0, "step": 1186 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.21875, "epoch": 0.016372187969821105, "grad_norm": 2.005623501282883, "kl": 0.06640625, "learning_rate": 9.99338762536559e-07, "loss": 0.0026, "reward": 2.1083436012268066, "reward_std": 0.032916728407144547, "rewards/accuracy_reward": 0.9145937561988831, "rewards/format_reward": 1.0, "step": 1187 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.65625, "epoch": 0.016385980883022302, "grad_norm": 2.44556714816464, "kl": 0.056396484375, "learning_rate": 9.993376481812e-07, "loss": 0.0023, "reward": 2.055093765258789, "reward_std": 0.018108908087015152, "rewards/accuracy_reward": 0.8550937175750732, "rewards/format_reward": 1.0, "step": 1188 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.0163997737962235, "grad_norm": 5.325937417213604, "kl": 0.062255859375, "learning_rate": 9.993365328882655e-07, "loss": 0.0024, "reward": 2.1734375953674316, "reward_std": 0.012735157273709774, "rewards/accuracy_reward": 0.973437488079071, "rewards/format_reward": 1.0, "step": 1189 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.09375, "epoch": 0.016413566709424697, "grad_norm": 2.1502562404801715, "kl": 0.060791015625, "learning_rate": 9.993354166577582e-07, "loss": 0.0024, "reward": 2.113156318664551, "reward_std": 0.027573805302381516, "rewards/accuracy_reward": 0.9194062352180481, "rewards/format_reward": 1.0, "step": 1190 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.96875, "epoch": 0.016427359622625894, "grad_norm": 2.1803974872619025, "kl": 0.06787109375, "learning_rate": 9.9933429948968e-07, "loss": 0.0027, "reward": 2.0982186794281006, "reward_std": 0.018505975604057312, "rewards/accuracy_reward": 0.8982187509536743, "rewards/format_reward": 1.0, "step": 1191 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.5625, "epoch": 0.01644115253582709, "grad_norm": 2.707978402526472, "kl": 0.057861328125, "learning_rate": 9.99333181384033e-07, "loss": 0.0023, "reward": 2.0052812099456787, "reward_std": 0.011651430279016495, "rewards/accuracy_reward": 0.8052812814712524, "rewards/format_reward": 1.0, "step": 1192 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.0, "epoch": 0.01645494544902829, "grad_norm": 4.624640841441573, "kl": 0.0673828125, "learning_rate": 9.993320623408191e-07, "loss": 0.0027, "reward": 2.119000196456909, "reward_std": 0.030218057334423065, "rewards/accuracy_reward": 0.918999969959259, "rewards/format_reward": 1.0, "step": 1193 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.125, "epoch": 0.016468738362229486, "grad_norm": 1.9728416662091435, "kl": 0.060546875, "learning_rate": 9.993309423600406e-07, "loss": 0.0024, "reward": 2.122812509536743, "reward_std": 0.013722698204219341, "rewards/accuracy_reward": 0.9228125810623169, "rewards/format_reward": 1.0, "step": 1194 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.0, "epoch": 0.016482531275430683, "grad_norm": 5.512863984092441, "kl": 0.06396484375, "learning_rate": 9.993298214417e-07, "loss": 0.0026, "reward": 2.096468687057495, "reward_std": 0.018410982564091682, "rewards/accuracy_reward": 0.8964687585830688, "rewards/format_reward": 1.0, "step": 1195 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.21875, "epoch": 0.01649632418863188, "grad_norm": 2.3584568620794193, "kl": 0.06787109375, "learning_rate": 9.993286995857986e-07, "loss": 0.0027, "reward": 2.144625186920166, "reward_std": 0.019434140995144844, "rewards/accuracy_reward": 0.9446250200271606, "rewards/format_reward": 1.0, "step": 1196 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.5625, "epoch": 0.016510117101833078, "grad_norm": 3.0144953213459136, "kl": 0.06884765625, "learning_rate": 9.99327576792339e-07, "loss": 0.0028, "reward": 2.052968740463257, "reward_std": 0.053260281682014465, "rewards/accuracy_reward": 0.8592187762260437, "rewards/format_reward": 1.0, "step": 1197 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.75, "epoch": 0.016523910015034275, "grad_norm": 4.148022864199572, "kl": 0.05810546875, "learning_rate": 9.993264530613233e-07, "loss": 0.0023, "reward": 2.125500202178955, "reward_std": 0.042740464210510254, "rewards/accuracy_reward": 0.9255000352859497, "rewards/format_reward": 1.0, "step": 1198 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 381.25, "epoch": 0.016537702928235472, "grad_norm": 5.691347690396019, "kl": 0.05712890625, "learning_rate": 9.993253283927534e-07, "loss": 0.0023, "reward": 1.9095938205718994, "reward_std": 0.0367857925593853, "rewards/accuracy_reward": 0.7158437967300415, "rewards/format_reward": 1.0, "step": 1199 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.09375, "epoch": 0.01655149584143667, "grad_norm": 2.4616430536119296, "kl": 0.06005859375, "learning_rate": 9.993242027866317e-07, "loss": 0.0024, "reward": 2.0504064559936523, "reward_std": 0.021390054374933243, "rewards/accuracy_reward": 0.850406289100647, "rewards/format_reward": 1.0, "step": 1200 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.78125, "epoch": 0.016565288754637867, "grad_norm": 1.4475569873349592, "kl": 0.061767578125, "learning_rate": 9.993230762429601e-07, "loss": 0.0025, "reward": 2.050187587738037, "reward_std": 0.01267438754439354, "rewards/accuracy_reward": 0.8501875400543213, "rewards/format_reward": 1.0, "step": 1201 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.40625, "epoch": 0.016579081667839064, "grad_norm": 6.089982266676443, "kl": 0.05078125, "learning_rate": 9.993219487617407e-07, "loss": 0.002, "reward": 2.1338748931884766, "reward_std": 0.007224694825708866, "rewards/accuracy_reward": 0.9338750243186951, "rewards/format_reward": 1.0, "step": 1202 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.40625, "epoch": 0.01659287458104026, "grad_norm": 2.3111265386563793, "kl": 0.061767578125, "learning_rate": 9.993208203429758e-07, "loss": 0.0025, "reward": 2.145124912261963, "reward_std": 0.026629649102687836, "rewards/accuracy_reward": 0.951374888420105, "rewards/format_reward": 1.0, "step": 1203 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.125, "epoch": 0.01660666749424146, "grad_norm": 2.4251314351268163, "kl": 0.060791015625, "learning_rate": 9.993196909866671e-07, "loss": 0.0024, "reward": 2.0392813682556152, "reward_std": 0.018102699890732765, "rewards/accuracy_reward": 0.8392812609672546, "rewards/format_reward": 1.0, "step": 1204 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.71875, "epoch": 0.016620460407442656, "grad_norm": 2.3906094135948113, "kl": 0.0625, "learning_rate": 9.993185606928174e-07, "loss": 0.0025, "reward": 2.053281307220459, "reward_std": 0.028174404054880142, "rewards/accuracy_reward": 0.8532812595367432, "rewards/format_reward": 1.0, "step": 1205 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.1875, "epoch": 0.016634253320643853, "grad_norm": 2.043617698128752, "kl": 0.0634765625, "learning_rate": 9.99317429461428e-07, "loss": 0.0025, "reward": 2.0999374389648438, "reward_std": 0.019859399646520615, "rewards/accuracy_reward": 0.8999375104904175, "rewards/format_reward": 1.0, "step": 1206 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.625, "epoch": 0.01664804623384505, "grad_norm": 2.915775622168035, "kl": 0.060791015625, "learning_rate": 9.99316297292502e-07, "loss": 0.0024, "reward": 2.0141875743865967, "reward_std": 0.03311220556497574, "rewards/accuracy_reward": 0.8204374313354492, "rewards/format_reward": 1.0, "step": 1207 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.15625, "epoch": 0.016661839147046248, "grad_norm": 2.765239327061956, "kl": 0.06494140625, "learning_rate": 9.993151641860407e-07, "loss": 0.0026, "reward": 2.010312557220459, "reward_std": 0.023398401215672493, "rewards/accuracy_reward": 0.8103125691413879, "rewards/format_reward": 1.0, "step": 1208 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.75, "epoch": 0.016675632060247445, "grad_norm": 2.174605024045954, "kl": 0.0615234375, "learning_rate": 9.993140301420463e-07, "loss": 0.0025, "reward": 2.128781318664551, "reward_std": 0.023918021470308304, "rewards/accuracy_reward": 0.9350312948226929, "rewards/format_reward": 1.0, "step": 1209 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.21875, "epoch": 0.016689424973448642, "grad_norm": 2.5509507489870007, "kl": 0.06103515625, "learning_rate": 9.993128951605216e-07, "loss": 0.0024, "reward": 2.036937713623047, "reward_std": 0.016708016395568848, "rewards/accuracy_reward": 0.836937427520752, "rewards/format_reward": 1.0, "step": 1210 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.21875, "epoch": 0.01670321788664984, "grad_norm": 2.014689075082376, "kl": 0.052734375, "learning_rate": 9.99311759241468e-07, "loss": 0.0021, "reward": 2.140749931335449, "reward_std": 0.01507203932851553, "rewards/accuracy_reward": 0.9407499432563782, "rewards/format_reward": 1.0, "step": 1211 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.5625, "epoch": 0.016717010799851037, "grad_norm": 2.4945905041703442, "kl": 0.05712890625, "learning_rate": 9.993106223848879e-07, "loss": 0.0023, "reward": 2.0440311431884766, "reward_std": 0.029954148456454277, "rewards/accuracy_reward": 0.8440312743186951, "rewards/format_reward": 1.0, "step": 1212 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.6875, "epoch": 0.016730803713052234, "grad_norm": 3.1110711525459727, "kl": 0.0654296875, "learning_rate": 9.993094845907837e-07, "loss": 0.0026, "reward": 2.109499931335449, "reward_std": 0.01622195914387703, "rewards/accuracy_reward": 0.9094999432563782, "rewards/format_reward": 1.0, "step": 1213 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.40625, "epoch": 0.01674459662625343, "grad_norm": 2.1942249532110263, "kl": 0.064453125, "learning_rate": 9.993083458591572e-07, "loss": 0.0026, "reward": 2.1093125343322754, "reward_std": 0.04177458956837654, "rewards/accuracy_reward": 0.9155624508857727, "rewards/format_reward": 1.0, "step": 1214 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.71875, "epoch": 0.01675838953945463, "grad_norm": 2.645894489171604, "kl": 0.062255859375, "learning_rate": 9.993072061900104e-07, "loss": 0.0025, "reward": 2.1039376258850098, "reward_std": 0.02338177151978016, "rewards/accuracy_reward": 0.9101874828338623, "rewards/format_reward": 1.0, "step": 1215 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.84375, "epoch": 0.016772182452655826, "grad_norm": 2.111482114360971, "kl": 0.061279296875, "learning_rate": 9.993060655833459e-07, "loss": 0.0024, "reward": 2.0365939140319824, "reward_std": 0.01635849103331566, "rewards/accuracy_reward": 0.836593747138977, "rewards/format_reward": 1.0, "step": 1216 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.59375, "epoch": 0.016785975365857023, "grad_norm": 2.0913840288475183, "kl": 0.06640625, "learning_rate": 9.993049240391655e-07, "loss": 0.0027, "reward": 1.9040000438690186, "reward_std": 0.02014370635151863, "rewards/accuracy_reward": 0.7039999961853027, "rewards/format_reward": 1.0, "step": 1217 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.8125, "epoch": 0.01679976827905822, "grad_norm": 4.775042448556552, "kl": 0.058837890625, "learning_rate": 9.993037815574715e-07, "loss": 0.0024, "reward": 2.0795626640319824, "reward_std": 0.01768878847360611, "rewards/accuracy_reward": 0.879562497138977, "rewards/format_reward": 1.0, "step": 1218 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.84375, "epoch": 0.016813561192259418, "grad_norm": 2.2529042531407866, "kl": 0.0673828125, "learning_rate": 9.993026381382658e-07, "loss": 0.0027, "reward": 2.085343837738037, "reward_std": 0.020005101338028908, "rewards/accuracy_reward": 0.8853437900543213, "rewards/format_reward": 1.0, "step": 1219 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.125, "epoch": 0.016827354105460615, "grad_norm": 1.5735816650556906, "kl": 0.059326171875, "learning_rate": 9.993014937815509e-07, "loss": 0.0024, "reward": 2.1101250648498535, "reward_std": 0.009071951732039452, "rewards/accuracy_reward": 0.9101250171661377, "rewards/format_reward": 1.0, "step": 1220 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.84375, "epoch": 0.016841147018661812, "grad_norm": 3.4382738639985724, "kl": 0.061767578125, "learning_rate": 9.993003484873289e-07, "loss": 0.0025, "reward": 1.9953436851501465, "reward_std": 0.017380014061927795, "rewards/accuracy_reward": 0.7953437566757202, "rewards/format_reward": 1.0, "step": 1221 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.4375, "epoch": 0.01685493993186301, "grad_norm": 2.39306610291031, "kl": 0.0703125, "learning_rate": 9.992992022556017e-07, "loss": 0.0028, "reward": 2.045187473297119, "reward_std": 0.04146747663617134, "rewards/accuracy_reward": 0.863937497138977, "rewards/format_reward": 1.0, "step": 1222 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 404.65625, "epoch": 0.016868732845064207, "grad_norm": 2.9482258680987243, "kl": 0.06982421875, "learning_rate": 9.992980550863715e-07, "loss": 0.0028, "reward": 1.9826874732971191, "reward_std": 0.022615507245063782, "rewards/accuracy_reward": 0.7826874852180481, "rewards/format_reward": 1.0, "step": 1223 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.0, "epoch": 0.016882525758265404, "grad_norm": 1.8206569873960117, "kl": 0.0546875, "learning_rate": 9.992969069796407e-07, "loss": 0.0022, "reward": 2.1243438720703125, "reward_std": 0.012220284901559353, "rewards/accuracy_reward": 0.9243437647819519, "rewards/format_reward": 1.0, "step": 1224 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.09375, "epoch": 0.0168963186714666, "grad_norm": 4.18120246830379, "kl": 0.06396484375, "learning_rate": 9.992957579354114e-07, "loss": 0.0026, "reward": 1.9914062023162842, "reward_std": 0.02773621864616871, "rewards/accuracy_reward": 0.7914062738418579, "rewards/format_reward": 1.0, "step": 1225 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.75, "epoch": 0.0169101115846678, "grad_norm": 3.0317885713682604, "kl": 0.06298828125, "learning_rate": 9.992946079536855e-07, "loss": 0.0025, "reward": 2.1069374084472656, "reward_std": 0.04304736107587814, "rewards/accuracy_reward": 0.9131874442100525, "rewards/format_reward": 1.0, "step": 1226 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.96875, "epoch": 0.016923904497868996, "grad_norm": 2.044266394699229, "kl": 0.0634765625, "learning_rate": 9.992934570344651e-07, "loss": 0.0026, "reward": 2.1077187061309814, "reward_std": 0.0352020189166069, "rewards/accuracy_reward": 0.9139687418937683, "rewards/format_reward": 1.0, "step": 1227 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.016937697411070193, "grad_norm": 1.1515796291464742, "kl": 0.0556640625, "learning_rate": 9.992923051777529e-07, "loss": 0.0022, "reward": 2.0836875438690186, "reward_std": 0.010349041782319546, "rewards/accuracy_reward": 0.8836874961853027, "rewards/format_reward": 1.0, "step": 1228 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.28125, "epoch": 0.01695149032427139, "grad_norm": 3.4006158712178918, "kl": 0.06298828125, "learning_rate": 9.992911523835507e-07, "loss": 0.0025, "reward": 2.1156251430511475, "reward_std": 0.03228399157524109, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1229 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.0, "epoch": 0.016965283237472588, "grad_norm": 4.325247590107978, "kl": 0.064453125, "learning_rate": 9.992899986518605e-07, "loss": 0.0026, "reward": 2.1386563777923584, "reward_std": 0.020227182656526566, "rewards/accuracy_reward": 0.938656210899353, "rewards/format_reward": 1.0, "step": 1230 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.28125, "epoch": 0.016979076150673785, "grad_norm": 3.0243130824383457, "kl": 0.06396484375, "learning_rate": 9.99288843982685e-07, "loss": 0.0026, "reward": 2.0880000591278076, "reward_std": 0.03455888852477074, "rewards/accuracy_reward": 0.8942500352859497, "rewards/format_reward": 1.0, "step": 1231 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.09375, "epoch": 0.016992869063874982, "grad_norm": 2.6438696477737285, "kl": 0.06298828125, "learning_rate": 9.992876883760258e-07, "loss": 0.0025, "reward": 2.0804061889648438, "reward_std": 0.030565757304430008, "rewards/accuracy_reward": 0.8804062604904175, "rewards/format_reward": 1.0, "step": 1232 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.90625, "epoch": 0.01700666197707618, "grad_norm": 3.950742918219173, "kl": 0.05712890625, "learning_rate": 9.992865318318855e-07, "loss": 0.0023, "reward": 2.0841875076293945, "reward_std": 0.03223949298262596, "rewards/accuracy_reward": 0.8904374837875366, "rewards/format_reward": 1.0, "step": 1233 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.84375, "epoch": 0.017020454890277377, "grad_norm": 1.8229161674693881, "kl": 0.06982421875, "learning_rate": 9.99285374350266e-07, "loss": 0.0028, "reward": 1.9882187843322754, "reward_std": 0.015215826220810413, "rewards/accuracy_reward": 0.7882187366485596, "rewards/format_reward": 1.0, "step": 1234 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.03125, "epoch": 0.017034247803478574, "grad_norm": 2.2471449640736925, "kl": 0.06640625, "learning_rate": 9.992842159311695e-07, "loss": 0.0027, "reward": 2.0631251335144043, "reward_std": 0.030933350324630737, "rewards/accuracy_reward": 0.8693749904632568, "rewards/format_reward": 1.0, "step": 1235 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.09375, "epoch": 0.01704804071667977, "grad_norm": 4.386590279249822, "kl": 0.057861328125, "learning_rate": 9.992830565745982e-07, "loss": 0.0023, "reward": 2.1308436393737793, "reward_std": 0.061652712523937225, "rewards/accuracy_reward": 0.9495937824249268, "rewards/format_reward": 1.0, "step": 1236 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.53125, "epoch": 0.01706183362988097, "grad_norm": 2.391687245043779, "kl": 0.06201171875, "learning_rate": 9.992818962805543e-07, "loss": 0.0025, "reward": 2.048156261444092, "reward_std": 0.0197436586022377, "rewards/accuracy_reward": 0.848156213760376, "rewards/format_reward": 1.0, "step": 1237 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.0, "epoch": 0.017075626543082166, "grad_norm": 2.3230121239614365, "kl": 0.06689453125, "learning_rate": 9.9928073504904e-07, "loss": 0.0027, "reward": 2.084437370300293, "reward_std": 0.028027672320604324, "rewards/accuracy_reward": 0.8844375014305115, "rewards/format_reward": 1.0, "step": 1238 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 379.46875, "epoch": 0.017089419456283363, "grad_norm": 2.3082662250803563, "kl": 0.06884765625, "learning_rate": 9.992795728800576e-07, "loss": 0.0028, "reward": 2.079218864440918, "reward_std": 0.06573303788900375, "rewards/accuracy_reward": 0.8979687690734863, "rewards/format_reward": 1.0, "step": 1239 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.0625, "epoch": 0.017103212369484557, "grad_norm": 1.9339256322698573, "kl": 0.06591796875, "learning_rate": 9.992784097736092e-07, "loss": 0.0026, "reward": 2.145437479019165, "reward_std": 0.01848694123327732, "rewards/accuracy_reward": 0.9454375505447388, "rewards/format_reward": 1.0, "step": 1240 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.875, "epoch": 0.017117005282685754, "grad_norm": 4.37828493008526, "kl": 0.057861328125, "learning_rate": 9.992772457296967e-07, "loss": 0.0023, "reward": 1.9357500076293945, "reward_std": 0.04101315140724182, "rewards/accuracy_reward": 0.7419999837875366, "rewards/format_reward": 1.0, "step": 1241 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.9375, "epoch": 0.01713079819588695, "grad_norm": 2.5668631345595863, "kl": 0.061279296875, "learning_rate": 9.992760807483224e-07, "loss": 0.0025, "reward": 2.135531425476074, "reward_std": 0.0379534550011158, "rewards/accuracy_reward": 0.941781222820282, "rewards/format_reward": 1.0, "step": 1242 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.21875, "epoch": 0.01714459110908815, "grad_norm": 2.736856062421682, "kl": 0.0654296875, "learning_rate": 9.99274914829489e-07, "loss": 0.0026, "reward": 2.0041563510894775, "reward_std": 0.07743582129478455, "rewards/accuracy_reward": 0.8041561841964722, "rewards/format_reward": 1.0, "step": 1243 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.1875, "epoch": 0.017158384022289346, "grad_norm": 2.3318474891083314, "kl": 0.06640625, "learning_rate": 9.992737479731981e-07, "loss": 0.0027, "reward": 2.040781259536743, "reward_std": 0.029941465705633163, "rewards/accuracy_reward": 0.8407812118530273, "rewards/format_reward": 1.0, "step": 1244 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.125, "epoch": 0.017172176935490543, "grad_norm": 4.220386624098627, "kl": 0.0703125, "learning_rate": 9.99272580179452e-07, "loss": 0.0028, "reward": 2.015531301498413, "reward_std": 0.03705378621816635, "rewards/accuracy_reward": 0.8155312538146973, "rewards/format_reward": 1.0, "step": 1245 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.3125, "epoch": 0.01718596984869174, "grad_norm": 5.128074107949049, "kl": 0.07373046875, "learning_rate": 9.99271411448253e-07, "loss": 0.003, "reward": 2.011625051498413, "reward_std": 0.037562109529972076, "rewards/accuracy_reward": 0.8116250038146973, "rewards/format_reward": 1.0, "step": 1246 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.6875, "epoch": 0.017199762761892938, "grad_norm": 2.9667012211965473, "kl": 0.07080078125, "learning_rate": 9.992702417796035e-07, "loss": 0.0028, "reward": 2.0946874618530273, "reward_std": 0.028073575347661972, "rewards/accuracy_reward": 0.8946874141693115, "rewards/format_reward": 1.0, "step": 1247 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.3125, "epoch": 0.017213555675094135, "grad_norm": 2.2899867197105386, "kl": 0.0654296875, "learning_rate": 9.992690711735052e-07, "loss": 0.0026, "reward": 2.0166563987731934, "reward_std": 0.02354704961180687, "rewards/accuracy_reward": 0.816656231880188, "rewards/format_reward": 1.0, "step": 1248 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.0625, "epoch": 0.017227348588295333, "grad_norm": 2.291531281338938, "kl": 0.07080078125, "learning_rate": 9.992678996299607e-07, "loss": 0.0028, "reward": 2.161375045776367, "reward_std": 0.011862218379974365, "rewards/accuracy_reward": 0.9613750576972961, "rewards/format_reward": 1.0, "step": 1249 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.01724114150149653, "grad_norm": 2.1500935424260628, "kl": 0.06494140625, "learning_rate": 9.992667271489721e-07, "loss": 0.0026, "reward": 2.1350626945495605, "reward_std": 0.016322283074259758, "rewards/accuracy_reward": 0.9350624680519104, "rewards/format_reward": 1.0, "step": 1250 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.59375, "epoch": 0.017254934414697727, "grad_norm": 18.72490411726987, "kl": 0.07275390625, "learning_rate": 9.992655537305413e-07, "loss": 0.0029, "reward": 2.1120312213897705, "reward_std": 0.037210579961538315, "rewards/accuracy_reward": 0.9245312213897705, "rewards/format_reward": 1.0, "step": 1251 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.03125, "epoch": 0.017268727327898924, "grad_norm": 4.6832979827118635, "kl": 0.06591796875, "learning_rate": 9.99264379374671e-07, "loss": 0.0026, "reward": 2.071312427520752, "reward_std": 0.04111713543534279, "rewards/accuracy_reward": 0.8775625228881836, "rewards/format_reward": 1.0, "step": 1252 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.21875, "epoch": 0.01728252024110012, "grad_norm": 2.067493687168406, "kl": 0.0693359375, "learning_rate": 9.992632040813633e-07, "loss": 0.0028, "reward": 2.0321874618530273, "reward_std": 0.01625334843993187, "rewards/accuracy_reward": 0.8321874737739563, "rewards/format_reward": 1.0, "step": 1253 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.1875, "epoch": 0.01729631315430132, "grad_norm": 2.2615432777418345, "kl": 0.06884765625, "learning_rate": 9.992620278506202e-07, "loss": 0.0028, "reward": 2.069031238555908, "reward_std": 0.027360178530216217, "rewards/accuracy_reward": 0.8690312504768372, "rewards/format_reward": 1.0, "step": 1254 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.0625, "epoch": 0.017310106067502516, "grad_norm": 2.098946377415223, "kl": 0.06396484375, "learning_rate": 9.992608506824438e-07, "loss": 0.0026, "reward": 2.0982813835144043, "reward_std": 0.049889855086803436, "rewards/accuracy_reward": 0.9107812643051147, "rewards/format_reward": 1.0, "step": 1255 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.017323898980703713, "grad_norm": 1.9941983951998314, "kl": 0.06884765625, "learning_rate": 9.992596725768368e-07, "loss": 0.0028, "reward": 2.056062698364258, "reward_std": 0.03154001384973526, "rewards/accuracy_reward": 0.8623125553131104, "rewards/format_reward": 1.0, "step": 1256 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.65625, "epoch": 0.01733769189390491, "grad_norm": 1.7972413129910345, "kl": 0.072265625, "learning_rate": 9.992584935338008e-07, "loss": 0.0029, "reward": 2.067187547683716, "reward_std": 0.06780581921339035, "rewards/accuracy_reward": 0.8859375715255737, "rewards/format_reward": 1.0, "step": 1257 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.0625, "epoch": 0.017351484807106108, "grad_norm": 2.294427937116036, "kl": 0.0693359375, "learning_rate": 9.992573135533386e-07, "loss": 0.0028, "reward": 2.022125244140625, "reward_std": 0.026630530133843422, "rewards/accuracy_reward": 0.8283750414848328, "rewards/format_reward": 1.0, "step": 1258 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.96875, "epoch": 0.017365277720307305, "grad_norm": 4.086316547559066, "kl": 0.0673828125, "learning_rate": 9.99256132635452e-07, "loss": 0.0027, "reward": 2.0937812328338623, "reward_std": 0.02037150040268898, "rewards/accuracy_reward": 0.8937812447547913, "rewards/format_reward": 1.0, "step": 1259 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.09375, "epoch": 0.017379070633508503, "grad_norm": 7.153747393999852, "kl": 0.0712890625, "learning_rate": 9.992549507801434e-07, "loss": 0.0029, "reward": 1.9926562309265137, "reward_std": 0.03664803132414818, "rewards/accuracy_reward": 0.7926562428474426, "rewards/format_reward": 1.0, "step": 1260 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.8125, "epoch": 0.0173928635467097, "grad_norm": 2.5004406991188373, "kl": 0.072265625, "learning_rate": 9.99253767987415e-07, "loss": 0.0029, "reward": 2.1089377403259277, "reward_std": 0.03793904557824135, "rewards/accuracy_reward": 0.915187418460846, "rewards/format_reward": 1.0, "step": 1261 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.125, "epoch": 0.017406656459910897, "grad_norm": 15.38594194501659, "kl": 0.05859375, "learning_rate": 9.992525842572687e-07, "loss": 0.0023, "reward": 2.091468572616577, "reward_std": 0.045612677931785583, "rewards/accuracy_reward": 0.8977187275886536, "rewards/format_reward": 1.0, "step": 1262 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.75, "epoch": 0.017420449373112094, "grad_norm": 2.394367939462294, "kl": 0.0673828125, "learning_rate": 9.992513995897074e-07, "loss": 0.0027, "reward": 2.168375015258789, "reward_std": 0.030526449903845787, "rewards/accuracy_reward": 0.9746249914169312, "rewards/format_reward": 1.0, "step": 1263 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.59375, "epoch": 0.01743424228631329, "grad_norm": 7.5266991147920255, "kl": 0.06396484375, "learning_rate": 9.992502139847328e-07, "loss": 0.0026, "reward": 2.1299376487731934, "reward_std": 0.008113998919725418, "rewards/accuracy_reward": 0.9299376010894775, "rewards/format_reward": 1.0, "step": 1264 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.78125, "epoch": 0.01744803519951449, "grad_norm": 3.688791552019758, "kl": 0.0654296875, "learning_rate": 9.992490274423473e-07, "loss": 0.0026, "reward": 2.126187562942505, "reward_std": 0.04226308688521385, "rewards/accuracy_reward": 0.932437539100647, "rewards/format_reward": 1.0, "step": 1265 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.03125, "epoch": 0.017461828112715686, "grad_norm": 3.41515412758276, "kl": 0.06201171875, "learning_rate": 9.99247839962553e-07, "loss": 0.0025, "reward": 2.087437629699707, "reward_std": 0.01846667006611824, "rewards/accuracy_reward": 0.8874375820159912, "rewards/format_reward": 1.0, "step": 1266 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.75, "epoch": 0.017475621025916883, "grad_norm": 2.3076574682755697, "kl": 0.060302734375, "learning_rate": 9.992466515453525e-07, "loss": 0.0024, "reward": 2.0504374504089355, "reward_std": 0.034033939242362976, "rewards/accuracy_reward": 0.8566875457763672, "rewards/format_reward": 1.0, "step": 1267 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.34375, "epoch": 0.01748941393911808, "grad_norm": 2.2446307400242516, "kl": 0.0634765625, "learning_rate": 9.992454621907476e-07, "loss": 0.0025, "reward": 2.049093723297119, "reward_std": 0.04463730752468109, "rewards/accuracy_reward": 0.8490937948226929, "rewards/format_reward": 1.0, "step": 1268 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.03125, "epoch": 0.017503206852319278, "grad_norm": 2.5302112575013838, "kl": 0.0673828125, "learning_rate": 9.992442718987406e-07, "loss": 0.0027, "reward": 2.0161561965942383, "reward_std": 0.022656098008155823, "rewards/accuracy_reward": 0.816156268119812, "rewards/format_reward": 1.0, "step": 1269 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.5, "epoch": 0.017516999765520475, "grad_norm": 1.9859406652389073, "kl": 0.060546875, "learning_rate": 9.99243080669334e-07, "loss": 0.0024, "reward": 2.0910000801086426, "reward_std": 0.01576429232954979, "rewards/accuracy_reward": 0.8910000324249268, "rewards/format_reward": 1.0, "step": 1270 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.40625, "epoch": 0.017530792678721673, "grad_norm": 2.3011352442817077, "kl": 0.061279296875, "learning_rate": 9.992418885025298e-07, "loss": 0.0024, "reward": 2.1239686012268066, "reward_std": 0.015013029798865318, "rewards/accuracy_reward": 0.9239687323570251, "rewards/format_reward": 1.0, "step": 1271 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.53125, "epoch": 0.01754458559192287, "grad_norm": 2.4305872580025167, "kl": 0.06689453125, "learning_rate": 9.992406953983304e-07, "loss": 0.0027, "reward": 2.020437479019165, "reward_std": 0.025568563491106033, "rewards/accuracy_reward": 0.820437490940094, "rewards/format_reward": 1.0, "step": 1272 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 386.3125, "epoch": 0.017558378505124067, "grad_norm": 7.294911341535325, "kl": 0.0693359375, "learning_rate": 9.992395013567378e-07, "loss": 0.0028, "reward": 2.0815625190734863, "reward_std": 0.04772401601076126, "rewards/accuracy_reward": 0.8940625190734863, "rewards/format_reward": 1.0, "step": 1273 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.15625, "epoch": 0.017572171418325264, "grad_norm": 2.517702522351415, "kl": 0.06494140625, "learning_rate": 9.992383063777544e-07, "loss": 0.0026, "reward": 2.0726563930511475, "reward_std": 0.029531696811318398, "rewards/accuracy_reward": 0.8789063096046448, "rewards/format_reward": 1.0, "step": 1274 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.96875, "epoch": 0.01758596433152646, "grad_norm": 2.0161750867487, "kl": 0.0703125, "learning_rate": 9.992371104613823e-07, "loss": 0.0028, "reward": 2.066718816757202, "reward_std": 0.025595100596547127, "rewards/accuracy_reward": 0.8729687333106995, "rewards/format_reward": 1.0, "step": 1275 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.375, "epoch": 0.01759975724472766, "grad_norm": 2.971561624868294, "kl": 0.06787109375, "learning_rate": 9.99235913607624e-07, "loss": 0.0027, "reward": 2.0804686546325684, "reward_std": 0.03435364365577698, "rewards/accuracy_reward": 0.88671875, "rewards/format_reward": 1.0, "step": 1276 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.53125, "epoch": 0.017613550157928856, "grad_norm": 2.4949421162610768, "kl": 0.06396484375, "learning_rate": 9.992347158164817e-07, "loss": 0.0025, "reward": 2.132312297821045, "reward_std": 0.020098654553294182, "rewards/accuracy_reward": 0.932312548160553, "rewards/format_reward": 1.0, "step": 1277 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.0, "epoch": 0.017627343071130053, "grad_norm": 12.34088454702304, "kl": 0.06640625, "learning_rate": 9.992335170879576e-07, "loss": 0.0027, "reward": 2.1072187423706055, "reward_std": 0.021995626389980316, "rewards/accuracy_reward": 0.9072187542915344, "rewards/format_reward": 1.0, "step": 1278 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.15625, "epoch": 0.01764113598433125, "grad_norm": 3.1788434701871644, "kl": 0.0703125, "learning_rate": 9.992323174220538e-07, "loss": 0.0028, "reward": 2.1569061279296875, "reward_std": 0.025308357551693916, "rewards/accuracy_reward": 0.9569063186645508, "rewards/format_reward": 1.0, "step": 1279 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.15625, "epoch": 0.017654928897532448, "grad_norm": 1.9921322556515215, "kl": 0.0693359375, "learning_rate": 9.992311168187729e-07, "loss": 0.0028, "reward": 2.13671875, "reward_std": 0.034134991466999054, "rewards/accuracy_reward": 0.9429687261581421, "rewards/format_reward": 1.0, "step": 1280 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.90625, "epoch": 0.017668721810733645, "grad_norm": 1.7297271163764978, "kl": 0.0693359375, "learning_rate": 9.992299152781168e-07, "loss": 0.0028, "reward": 1.9619061946868896, "reward_std": 0.007677676156163216, "rewards/accuracy_reward": 0.7619062662124634, "rewards/format_reward": 1.0, "step": 1281 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.875, "epoch": 0.017682514723934842, "grad_norm": 2.503294213866577, "kl": 0.0634765625, "learning_rate": 9.992287128000877e-07, "loss": 0.0025, "reward": 2.106656312942505, "reward_std": 0.021199192851781845, "rewards/accuracy_reward": 0.9066562652587891, "rewards/format_reward": 1.0, "step": 1282 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.71875, "epoch": 0.01769630763713604, "grad_norm": 5.174851418183943, "kl": 0.06396484375, "learning_rate": 9.992275093846883e-07, "loss": 0.0025, "reward": 2.1569998264312744, "reward_std": 0.004118942655622959, "rewards/accuracy_reward": 0.9570000171661377, "rewards/format_reward": 1.0, "step": 1283 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.625, "epoch": 0.017710100550337237, "grad_norm": 3.1384155069768327, "kl": 0.0673828125, "learning_rate": 9.992263050319204e-07, "loss": 0.0027, "reward": 2.1176562309265137, "reward_std": 0.03260079398751259, "rewards/accuracy_reward": 0.9301563501358032, "rewards/format_reward": 1.0, "step": 1284 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.75, "epoch": 0.017723893463538434, "grad_norm": 6.765583590133097, "kl": 0.06396484375, "learning_rate": 9.992250997417866e-07, "loss": 0.0026, "reward": 2.0833749771118164, "reward_std": 0.01692761667072773, "rewards/accuracy_reward": 0.8833749890327454, "rewards/format_reward": 1.0, "step": 1285 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.09375, "epoch": 0.01773768637673963, "grad_norm": 2.1526087802081744, "kl": 0.0654296875, "learning_rate": 9.99223893514289e-07, "loss": 0.0026, "reward": 1.9795937538146973, "reward_std": 0.019519304856657982, "rewards/accuracy_reward": 0.7795937061309814, "rewards/format_reward": 1.0, "step": 1286 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.5, "epoch": 0.01775147928994083, "grad_norm": 1.741793970155477, "kl": 0.07275390625, "learning_rate": 9.9922268634943e-07, "loss": 0.0029, "reward": 2.1280312538146973, "reward_std": 0.024315692484378815, "rewards/accuracy_reward": 0.9342812299728394, "rewards/format_reward": 1.0, "step": 1287 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.59375, "epoch": 0.017765272203142026, "grad_norm": 2.9334320217091086, "kl": 0.06787109375, "learning_rate": 9.992214782472115e-07, "loss": 0.0027, "reward": 2.08203125, "reward_std": 0.019113700836896896, "rewards/accuracy_reward": 0.882031261920929, "rewards/format_reward": 1.0, "step": 1288 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.53125, "epoch": 0.017779065116343223, "grad_norm": 3.756170197221209, "kl": 0.0712890625, "learning_rate": 9.992202692076362e-07, "loss": 0.0029, "reward": 2.029968738555908, "reward_std": 0.02976238541305065, "rewards/accuracy_reward": 0.8299686908721924, "rewards/format_reward": 1.0, "step": 1289 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.71875, "epoch": 0.01779285802954442, "grad_norm": 2.3122942448900816, "kl": 0.068359375, "learning_rate": 9.992190592307063e-07, "loss": 0.0027, "reward": 2.0606250762939453, "reward_std": 0.012134727090597153, "rewards/accuracy_reward": 0.8606250286102295, "rewards/format_reward": 1.0, "step": 1290 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.53125, "epoch": 0.017806650942745618, "grad_norm": 12.674295098780371, "kl": 0.0712890625, "learning_rate": 9.992178483164238e-07, "loss": 0.0029, "reward": 2.0131874084472656, "reward_std": 0.02567855641245842, "rewards/accuracy_reward": 0.8131875991821289, "rewards/format_reward": 1.0, "step": 1291 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.4375, "epoch": 0.017820443855946815, "grad_norm": 2.4979549954721403, "kl": 0.056884765625, "learning_rate": 9.992166364647912e-07, "loss": 0.0023, "reward": 2.1094374656677246, "reward_std": 0.022530939429998398, "rewards/accuracy_reward": 0.9094374775886536, "rewards/format_reward": 1.0, "step": 1292 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.017834236769148012, "grad_norm": 2.2102403550860052, "kl": 0.0830078125, "learning_rate": 9.992154236758106e-07, "loss": 0.0033, "reward": 2.0909063816070557, "reward_std": 0.03599603846669197, "rewards/accuracy_reward": 0.8971562385559082, "rewards/format_reward": 1.0, "step": 1293 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.53125, "epoch": 0.01784802968234921, "grad_norm": 9.756326973723638, "kl": 0.0625, "learning_rate": 9.992142099494844e-07, "loss": 0.0025, "reward": 2.0747499465942383, "reward_std": 0.026643654331564903, "rewards/accuracy_reward": 0.874750018119812, "rewards/format_reward": 1.0, "step": 1294 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.9375, "epoch": 0.017861822595550407, "grad_norm": 1.861029523922298, "kl": 0.07373046875, "learning_rate": 9.99212995285815e-07, "loss": 0.003, "reward": 2.1524689197540283, "reward_std": 0.013114258646965027, "rewards/accuracy_reward": 0.952468752861023, "rewards/format_reward": 1.0, "step": 1295 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.6875, "epoch": 0.017875615508751604, "grad_norm": 2.3700312688285896, "kl": 0.068359375, "learning_rate": 9.992117796848047e-07, "loss": 0.0027, "reward": 2.1459686756134033, "reward_std": 0.036623407155275345, "rewards/accuracy_reward": 0.952218770980835, "rewards/format_reward": 1.0, "step": 1296 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.375, "epoch": 0.0178894084219528, "grad_norm": 2.4866139710005686, "kl": 0.07373046875, "learning_rate": 9.992105631464554e-07, "loss": 0.003, "reward": 2.1166563034057617, "reward_std": 0.015610732138156891, "rewards/accuracy_reward": 0.9166562557220459, "rewards/format_reward": 1.0, "step": 1297 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.90625, "epoch": 0.017903201335154, "grad_norm": 2.402165425162664, "kl": 0.0673828125, "learning_rate": 9.992093456707699e-07, "loss": 0.0027, "reward": 1.894124984741211, "reward_std": 0.03596854582428932, "rewards/accuracy_reward": 0.700374960899353, "rewards/format_reward": 1.0, "step": 1298 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.03125, "epoch": 0.017916994248355196, "grad_norm": 2.568921693645958, "kl": 0.0634765625, "learning_rate": 9.992081272577498e-07, "loss": 0.0025, "reward": 2.083437442779541, "reward_std": 0.025990627706050873, "rewards/accuracy_reward": 0.8834375143051147, "rewards/format_reward": 1.0, "step": 1299 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.6875, "epoch": 0.017930787161556393, "grad_norm": 2.7557299826414683, "kl": 0.064453125, "learning_rate": 9.99206907907398e-07, "loss": 0.0026, "reward": 2.1140940189361572, "reward_std": 0.021752499043941498, "rewards/accuracy_reward": 0.9140937328338623, "rewards/format_reward": 1.0, "step": 1300 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.4375, "epoch": 0.01794458007475759, "grad_norm": 3.385968019076272, "kl": 0.0703125, "learning_rate": 9.992056876197167e-07, "loss": 0.0028, "reward": 2.092531204223633, "reward_std": 0.05235935375094414, "rewards/accuracy_reward": 0.8987812399864197, "rewards/format_reward": 1.0, "step": 1301 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.40625, "epoch": 0.017958372987958788, "grad_norm": 3.629825429706229, "kl": 0.07080078125, "learning_rate": 9.992044663947078e-07, "loss": 0.0028, "reward": 2.1239686012268066, "reward_std": 0.020227909088134766, "rewards/accuracy_reward": 0.9239687919616699, "rewards/format_reward": 1.0, "step": 1302 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.5, "epoch": 0.017972165901159985, "grad_norm": 2.4798689201534225, "kl": 0.064453125, "learning_rate": 9.992032442323742e-07, "loss": 0.0026, "reward": 2.0557188987731934, "reward_std": 0.035645052790641785, "rewards/accuracy_reward": 0.8619687557220459, "rewards/format_reward": 1.0, "step": 1303 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.34375, "epoch": 0.017985958814361182, "grad_norm": 2.3368143161431996, "kl": 0.06396484375, "learning_rate": 9.992020211327177e-07, "loss": 0.0026, "reward": 2.0753438472747803, "reward_std": 0.013957293704152107, "rewards/accuracy_reward": 0.8753437995910645, "rewards/format_reward": 1.0, "step": 1304 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.5625, "epoch": 0.01799975172756238, "grad_norm": 2.295421557869091, "kl": 0.076171875, "learning_rate": 9.992007970957407e-07, "loss": 0.003, "reward": 2.040374994277954, "reward_std": 0.014951921999454498, "rewards/accuracy_reward": 0.8403750658035278, "rewards/format_reward": 1.0, "step": 1305 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.625, "epoch": 0.018013544640763577, "grad_norm": 2.0980857236969874, "kl": 0.0751953125, "learning_rate": 9.991995721214457e-07, "loss": 0.003, "reward": 2.080031394958496, "reward_std": 0.049908656626939774, "rewards/accuracy_reward": 0.8925312757492065, "rewards/format_reward": 1.0, "step": 1306 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.3125, "epoch": 0.018027337553964774, "grad_norm": 2.3412007801443706, "kl": 0.0791015625, "learning_rate": 9.991983462098349e-07, "loss": 0.0032, "reward": 1.9791876077651978, "reward_std": 0.03083454817533493, "rewards/accuracy_reward": 0.7854374647140503, "rewards/format_reward": 1.0, "step": 1307 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.8125, "epoch": 0.01804113046716597, "grad_norm": 1.9545182965726413, "kl": 0.061279296875, "learning_rate": 9.991971193609103e-07, "loss": 0.0024, "reward": 2.115281343460083, "reward_std": 0.03376578539609909, "rewards/accuracy_reward": 0.9215313196182251, "rewards/format_reward": 1.0, "step": 1308 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.21875, "epoch": 0.01805492338036717, "grad_norm": 4.991290932912927, "kl": 0.07275390625, "learning_rate": 9.991958915746747e-07, "loss": 0.0029, "reward": 2.1332499980926514, "reward_std": 0.041369445621967316, "rewards/accuracy_reward": 0.9394999742507935, "rewards/format_reward": 1.0, "step": 1309 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.25, "epoch": 0.018068716293568366, "grad_norm": 4.0592490101908565, "kl": 0.0673828125, "learning_rate": 9.9919466285113e-07, "loss": 0.0027, "reward": 2.0373125076293945, "reward_std": 0.03273964300751686, "rewards/accuracy_reward": 0.8435624837875366, "rewards/format_reward": 1.0, "step": 1310 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 380.625, "epoch": 0.018082509206769563, "grad_norm": 2.655849268189705, "kl": 0.06494140625, "learning_rate": 9.991934331902789e-07, "loss": 0.0026, "reward": 1.9785312414169312, "reward_std": 0.06187181547284126, "rewards/accuracy_reward": 0.7910313010215759, "rewards/format_reward": 1.0, "step": 1311 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.40625, "epoch": 0.01809630211997076, "grad_norm": 9.965632179626775, "kl": 0.06591796875, "learning_rate": 9.991922025921232e-07, "loss": 0.0026, "reward": 2.13253116607666, "reward_std": 0.02880311757326126, "rewards/accuracy_reward": 0.9387812614440918, "rewards/format_reward": 1.0, "step": 1312 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.09375, "epoch": 0.018110095033171958, "grad_norm": 3.5182005503082574, "kl": 0.0673828125, "learning_rate": 9.991909710566657e-07, "loss": 0.0027, "reward": 2.069031238555908, "reward_std": 0.027374345809221268, "rewards/accuracy_reward": 0.8690313100814819, "rewards/format_reward": 1.0, "step": 1313 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.9375, "epoch": 0.018123887946373155, "grad_norm": 2.5440027565075933, "kl": 0.062255859375, "learning_rate": 9.991897385839085e-07, "loss": 0.0025, "reward": 2.0666251182556152, "reward_std": 0.02783946879208088, "rewards/accuracy_reward": 0.8666249513626099, "rewards/format_reward": 1.0, "step": 1314 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.8125, "epoch": 0.01813768085957435, "grad_norm": 2.8868659514781947, "kl": 0.06982421875, "learning_rate": 9.99188505173854e-07, "loss": 0.0028, "reward": 2.0211873054504395, "reward_std": 0.018854837864637375, "rewards/accuracy_reward": 0.8211874961853027, "rewards/format_reward": 1.0, "step": 1315 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.875, "epoch": 0.018151473772775546, "grad_norm": 2.1503997331165827, "kl": 0.0693359375, "learning_rate": 9.991872708265041e-07, "loss": 0.0028, "reward": 2.095343828201294, "reward_std": 0.01398918591439724, "rewards/accuracy_reward": 0.8953437805175781, "rewards/format_reward": 1.0, "step": 1316 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.875, "epoch": 0.018165266685976744, "grad_norm": 2.0452298782375737, "kl": 0.06640625, "learning_rate": 9.991860355418618e-07, "loss": 0.0027, "reward": 2.054500102996826, "reward_std": 0.02902274951338768, "rewards/accuracy_reward": 0.8544999957084656, "rewards/format_reward": 1.0, "step": 1317 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.9375, "epoch": 0.01817905959917794, "grad_norm": 1.9163852138598063, "kl": 0.06396484375, "learning_rate": 9.99184799319929e-07, "loss": 0.0026, "reward": 2.1368439197540283, "reward_std": 0.012028755620121956, "rewards/accuracy_reward": 0.936843752861023, "rewards/format_reward": 1.0, "step": 1318 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.65625, "epoch": 0.018192852512379138, "grad_norm": 4.133464636525462, "kl": 0.06640625, "learning_rate": 9.991835621607082e-07, "loss": 0.0027, "reward": 2.1093125343322754, "reward_std": 0.03394792228937149, "rewards/accuracy_reward": 0.9155625104904175, "rewards/format_reward": 1.0, "step": 1319 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.09375, "epoch": 0.018206645425580335, "grad_norm": 1.7634421982488198, "kl": 0.06640625, "learning_rate": 9.991823240642014e-07, "loss": 0.0027, "reward": 2.129093885421753, "reward_std": 0.00860103964805603, "rewards/accuracy_reward": 0.9290937185287476, "rewards/format_reward": 1.0, "step": 1320 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.34375, "epoch": 0.018220438338781533, "grad_norm": 3.8155978824687433, "kl": 0.06396484375, "learning_rate": 9.991810850304112e-07, "loss": 0.0026, "reward": 2.1029064655303955, "reward_std": 0.018995974212884903, "rewards/accuracy_reward": 0.9029062390327454, "rewards/format_reward": 1.0, "step": 1321 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.6875, "epoch": 0.01823423125198273, "grad_norm": 1.7862235299039864, "kl": 0.06494140625, "learning_rate": 9.9917984505934e-07, "loss": 0.0026, "reward": 2.0381875038146973, "reward_std": 0.022393614053726196, "rewards/accuracy_reward": 0.8444375395774841, "rewards/format_reward": 1.0, "step": 1322 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.875, "epoch": 0.018248024165183927, "grad_norm": 5.701456101301914, "kl": 0.0703125, "learning_rate": 9.991786041509898e-07, "loss": 0.0028, "reward": 1.9686561822891235, "reward_std": 0.01742098480463028, "rewards/accuracy_reward": 0.7686562538146973, "rewards/format_reward": 1.0, "step": 1323 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.6875, "epoch": 0.018261817078385124, "grad_norm": 2.3585676788738743, "kl": 0.06494140625, "learning_rate": 9.991773623053632e-07, "loss": 0.0026, "reward": 2.10546875, "reward_std": 0.03171078488230705, "rewards/accuracy_reward": 0.9117187857627869, "rewards/format_reward": 1.0, "step": 1324 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.59375, "epoch": 0.01827560999158632, "grad_norm": 2.3819125062119038, "kl": 0.07275390625, "learning_rate": 9.991761195224626e-07, "loss": 0.0029, "reward": 2.0290937423706055, "reward_std": 0.03605687618255615, "rewards/accuracy_reward": 0.8353437781333923, "rewards/format_reward": 1.0, "step": 1325 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.40625, "epoch": 0.01828940290478752, "grad_norm": 1.9423954827040824, "kl": 0.06396484375, "learning_rate": 9.9917487580229e-07, "loss": 0.0026, "reward": 2.1045312881469727, "reward_std": 0.015334662050008774, "rewards/accuracy_reward": 0.9045312404632568, "rewards/format_reward": 1.0, "step": 1326 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.46875, "epoch": 0.018303195817988716, "grad_norm": 1.6208200237901333, "kl": 0.0703125, "learning_rate": 9.99173631144848e-07, "loss": 0.0028, "reward": 2.075000047683716, "reward_std": 0.01023872010409832, "rewards/accuracy_reward": 0.8749999403953552, "rewards/format_reward": 1.0, "step": 1327 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 432.375, "epoch": 0.018316988731189913, "grad_norm": 4.704455231200037, "kl": 0.0703125, "learning_rate": 9.99172385550139e-07, "loss": 0.0028, "reward": 2.101656436920166, "reward_std": 0.03354896605014801, "rewards/accuracy_reward": 0.9079062342643738, "rewards/format_reward": 1.0, "step": 1328 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.4375, "epoch": 0.01833078164439111, "grad_norm": 2.1562555900686378, "kl": 0.07177734375, "learning_rate": 9.991711390181652e-07, "loss": 0.0029, "reward": 2.119500160217285, "reward_std": 0.008087344467639923, "rewards/accuracy_reward": 0.9194999933242798, "rewards/format_reward": 1.0, "step": 1329 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.018344574557592308, "grad_norm": 2.360134470971612, "kl": 0.07275390625, "learning_rate": 9.991698915489286e-07, "loss": 0.0029, "reward": 2.034749984741211, "reward_std": 0.01637093722820282, "rewards/accuracy_reward": 0.8347499966621399, "rewards/format_reward": 1.0, "step": 1330 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 406.65625, "epoch": 0.018358367470793505, "grad_norm": 4.985515018094128, "kl": 0.07470703125, "learning_rate": 9.991686431424322e-07, "loss": 0.003, "reward": 1.9120937585830688, "reward_std": 0.051356006413698196, "rewards/accuracy_reward": 0.7120937705039978, "rewards/format_reward": 1.0, "step": 1331 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.09375, "epoch": 0.018372160383994703, "grad_norm": 2.2119426010538534, "kl": 0.078125, "learning_rate": 9.99167393798678e-07, "loss": 0.0031, "reward": 2.093062400817871, "reward_std": 0.05680582672357559, "rewards/accuracy_reward": 0.911812424659729, "rewards/format_reward": 1.0, "step": 1332 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 0.0183859532971959, "grad_norm": 2.287375733459683, "kl": 0.0791015625, "learning_rate": 9.991661435176684e-07, "loss": 0.0032, "reward": 2.141812324523926, "reward_std": 0.012004390358924866, "rewards/accuracy_reward": 0.9418124556541443, "rewards/format_reward": 1.0, "step": 1333 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.78125, "epoch": 0.018399746210397097, "grad_norm": 2.7929312945513716, "kl": 0.0673828125, "learning_rate": 9.991648922994056e-07, "loss": 0.0027, "reward": 2.064000129699707, "reward_std": 0.0319165475666523, "rewards/accuracy_reward": 0.8639999628067017, "rewards/format_reward": 1.0, "step": 1334 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.5625, "epoch": 0.018413539123598294, "grad_norm": 4.101596251767827, "kl": 0.07861328125, "learning_rate": 9.991636401438922e-07, "loss": 0.0032, "reward": 2.099343776702881, "reward_std": 0.028483789414167404, "rewards/accuracy_reward": 0.905593752861023, "rewards/format_reward": 1.0, "step": 1335 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.625, "epoch": 0.01842733203679949, "grad_norm": 3.101866598957218, "kl": 0.0703125, "learning_rate": 9.991623870511302e-07, "loss": 0.0028, "reward": 2.05078125, "reward_std": 0.01934993639588356, "rewards/accuracy_reward": 0.8507813215255737, "rewards/format_reward": 1.0, "step": 1336 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.875, "epoch": 0.01844112495000069, "grad_norm": 2.2444862501305423, "kl": 0.06396484375, "learning_rate": 9.991611330211225e-07, "loss": 0.0026, "reward": 2.1689376831054688, "reward_std": 0.021803725510835648, "rewards/accuracy_reward": 0.9689374566078186, "rewards/format_reward": 1.0, "step": 1337 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.4375, "epoch": 0.018454917863201886, "grad_norm": 2.272289361291398, "kl": 0.07373046875, "learning_rate": 9.991598780538711e-07, "loss": 0.0029, "reward": 2.101375102996826, "reward_std": 0.00968160480260849, "rewards/accuracy_reward": 0.9013750553131104, "rewards/format_reward": 1.0, "step": 1338 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.6875, "epoch": 0.018468710776403083, "grad_norm": 3.22700265443963, "kl": 0.07666015625, "learning_rate": 9.991586221493782e-07, "loss": 0.0031, "reward": 2.1269688606262207, "reward_std": 0.016268674284219742, "rewards/accuracy_reward": 0.9269687533378601, "rewards/format_reward": 1.0, "step": 1339 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.09375, "epoch": 0.01848250368960428, "grad_norm": 2.8701778210012976, "kl": 0.0712890625, "learning_rate": 9.991573653076465e-07, "loss": 0.0029, "reward": 2.0544373989105225, "reward_std": 0.0317423939704895, "rewards/accuracy_reward": 0.8606875538825989, "rewards/format_reward": 1.0, "step": 1340 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.018496296602805478, "grad_norm": 1.4197183012210006, "kl": 0.06689453125, "learning_rate": 9.991561075286782e-07, "loss": 0.0027, "reward": 2.1239686012268066, "reward_std": 0.007951047271490097, "rewards/accuracy_reward": 0.9239687323570251, "rewards/format_reward": 1.0, "step": 1341 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.3125, "epoch": 0.018510089516006675, "grad_norm": 2.0709104617009144, "kl": 0.07958984375, "learning_rate": 9.991548488124755e-07, "loss": 0.0032, "reward": 2.149718761444092, "reward_std": 0.04433050751686096, "rewards/accuracy_reward": 0.9622187614440918, "rewards/format_reward": 1.0, "step": 1342 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.875, "epoch": 0.018523882429207873, "grad_norm": 2.8063581849325834, "kl": 0.07421875, "learning_rate": 9.99153589159041e-07, "loss": 0.003, "reward": 2.1442501544952393, "reward_std": 0.008503405377268791, "rewards/accuracy_reward": 0.9442500472068787, "rewards/format_reward": 1.0, "step": 1343 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.90625, "epoch": 0.01853767534240907, "grad_norm": 5.452082990163365, "kl": 0.07861328125, "learning_rate": 9.99152328568377e-07, "loss": 0.0031, "reward": 2.1150002479553223, "reward_std": 0.030775217339396477, "rewards/accuracy_reward": 0.92125004529953, "rewards/format_reward": 1.0, "step": 1344 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.59375, "epoch": 0.018551468255610267, "grad_norm": 5.582527642050675, "kl": 0.078125, "learning_rate": 9.991510670404862e-07, "loss": 0.0031, "reward": 2.087531089782715, "reward_std": 0.021545231342315674, "rewards/accuracy_reward": 0.8875312805175781, "rewards/format_reward": 1.0, "step": 1345 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.125, "epoch": 0.018565261168811464, "grad_norm": 3.134959983375769, "kl": 0.0673828125, "learning_rate": 9.991498045753701e-07, "loss": 0.0027, "reward": 2.0568125247955322, "reward_std": 0.02381068468093872, "rewards/accuracy_reward": 0.8568124771118164, "rewards/format_reward": 1.0, "step": 1346 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.4375, "epoch": 0.01857905408201266, "grad_norm": 5.408913193256637, "kl": 0.07275390625, "learning_rate": 9.99148541173032e-07, "loss": 0.0029, "reward": 2.082937479019165, "reward_std": 0.02531331591308117, "rewards/accuracy_reward": 0.882937490940094, "rewards/format_reward": 1.0, "step": 1347 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.01859284699521386, "grad_norm": 2.8505764854003974, "kl": 0.06982421875, "learning_rate": 9.991472768334737e-07, "loss": 0.0028, "reward": 2.1494998931884766, "reward_std": 0.028745274990797043, "rewards/accuracy_reward": 0.9557499885559082, "rewards/format_reward": 1.0, "step": 1348 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.15625, "epoch": 0.018606639908415056, "grad_norm": 2.376338226753753, "kl": 0.0751953125, "learning_rate": 9.991460115566978e-07, "loss": 0.003, "reward": 2.035562515258789, "reward_std": 0.023025721311569214, "rewards/accuracy_reward": 0.835562527179718, "rewards/format_reward": 1.0, "step": 1349 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.28125, "epoch": 0.018620432821616253, "grad_norm": 3.263741667221038, "kl": 0.07763671875, "learning_rate": 9.991447453427067e-07, "loss": 0.0031, "reward": 2.0255937576293945, "reward_std": 0.017825044691562653, "rewards/accuracy_reward": 0.8255937695503235, "rewards/format_reward": 1.0, "step": 1350 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.9375, "epoch": 0.01863422573481745, "grad_norm": 1.983969538051493, "kl": 0.0751953125, "learning_rate": 9.991434781915026e-07, "loss": 0.003, "reward": 2.09012508392334, "reward_std": 0.01619267836213112, "rewards/accuracy_reward": 0.8901249766349792, "rewards/format_reward": 1.0, "step": 1351 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.28125, "epoch": 0.018648018648018648, "grad_norm": 2.1924901131100327, "kl": 0.0595703125, "learning_rate": 9.99142210103088e-07, "loss": 0.0024, "reward": 2.1401562690734863, "reward_std": 0.013448791578412056, "rewards/accuracy_reward": 0.9401562213897705, "rewards/format_reward": 1.0, "step": 1352 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.65625, "epoch": 0.018661811561219845, "grad_norm": 1.9370415001797936, "kl": 0.0693359375, "learning_rate": 9.991409410774653e-07, "loss": 0.0028, "reward": 2.129624843597412, "reward_std": 0.011086581274867058, "rewards/accuracy_reward": 0.9296250343322754, "rewards/format_reward": 1.0, "step": 1353 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.03125, "epoch": 0.018675604474421043, "grad_norm": 1.8252733188785442, "kl": 0.07421875, "learning_rate": 9.99139671114637e-07, "loss": 0.003, "reward": 2.087343692779541, "reward_std": 0.025723453611135483, "rewards/accuracy_reward": 0.8935937285423279, "rewards/format_reward": 1.0, "step": 1354 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.8125, "epoch": 0.01868939738762224, "grad_norm": 2.235551051279889, "kl": 0.06982421875, "learning_rate": 9.99138400214605e-07, "loss": 0.0028, "reward": 1.9066250324249268, "reward_std": 0.047897472977638245, "rewards/accuracy_reward": 0.719124972820282, "rewards/format_reward": 1.0, "step": 1355 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.9375, "epoch": 0.018703190300823437, "grad_norm": 2.0475582245280046, "kl": 0.068359375, "learning_rate": 9.991371283773721e-07, "loss": 0.0027, "reward": 2.0213751792907715, "reward_std": 0.030648333951830864, "rewards/accuracy_reward": 0.8276249766349792, "rewards/format_reward": 1.0, "step": 1356 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.8125, "epoch": 0.018716983214024634, "grad_norm": 3.0947143794301732, "kl": 0.07763671875, "learning_rate": 9.991358556029407e-07, "loss": 0.0031, "reward": 2.089531421661377, "reward_std": 0.021671392023563385, "rewards/accuracy_reward": 0.8895312547683716, "rewards/format_reward": 1.0, "step": 1357 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.625, "epoch": 0.01873077612722583, "grad_norm": 3.345173621844925, "kl": 0.07177734375, "learning_rate": 9.991345818913132e-07, "loss": 0.0029, "reward": 2.121281147003174, "reward_std": 0.03856559470295906, "rewards/accuracy_reward": 0.9275312423706055, "rewards/format_reward": 1.0, "step": 1358 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 428.0625, "epoch": 0.01874456904042703, "grad_norm": 2.419742359947703, "kl": 0.06494140625, "learning_rate": 9.991333072424917e-07, "loss": 0.0026, "reward": 2.026718854904175, "reward_std": 0.02642715349793434, "rewards/accuracy_reward": 0.8329687714576721, "rewards/format_reward": 1.0, "step": 1359 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.5, "epoch": 0.018758361953628226, "grad_norm": 3.4849116417821504, "kl": 0.06689453125, "learning_rate": 9.991320316564788e-07, "loss": 0.0027, "reward": 2.0764377117156982, "reward_std": 0.026358962059020996, "rewards/accuracy_reward": 0.8764375448226929, "rewards/format_reward": 1.0, "step": 1360 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.15625, "epoch": 0.018772154866829423, "grad_norm": 2.423372390081931, "kl": 0.0693359375, "learning_rate": 9.99130755133277e-07, "loss": 0.0028, "reward": 2.048156261444092, "reward_std": 0.023086393252015114, "rewards/accuracy_reward": 0.848156213760376, "rewards/format_reward": 1.0, "step": 1361 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.6875, "epoch": 0.01878594778003062, "grad_norm": 5.165924481832018, "kl": 0.06884765625, "learning_rate": 9.991294776728885e-07, "loss": 0.0028, "reward": 2.1153438091278076, "reward_std": 0.01083328202366829, "rewards/accuracy_reward": 0.9153437614440918, "rewards/format_reward": 1.0, "step": 1362 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.018799740693231818, "grad_norm": 2.5854302494235477, "kl": 0.080078125, "learning_rate": 9.991281992753158e-07, "loss": 0.0032, "reward": 2.1729061603546143, "reward_std": 0.015167968347668648, "rewards/accuracy_reward": 0.972906231880188, "rewards/format_reward": 1.0, "step": 1363 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.46875, "epoch": 0.018813533606433015, "grad_norm": 2.965826203940543, "kl": 0.0673828125, "learning_rate": 9.991269199405613e-07, "loss": 0.0027, "reward": 2.1620311737060547, "reward_std": 0.04841157793998718, "rewards/accuracy_reward": 0.9745312929153442, "rewards/format_reward": 1.0, "step": 1364 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 396.125, "epoch": 0.018827326519634213, "grad_norm": 2.9106077063294644, "kl": 0.07373046875, "learning_rate": 9.99125639668627e-07, "loss": 0.003, "reward": 1.9558436870574951, "reward_std": 0.020240500569343567, "rewards/accuracy_reward": 0.7558437585830688, "rewards/format_reward": 1.0, "step": 1365 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.21875, "epoch": 0.01884111943283541, "grad_norm": 1.9745329265337146, "kl": 0.07275390625, "learning_rate": 9.991243584595162e-07, "loss": 0.0029, "reward": 2.157531261444092, "reward_std": 0.016875505447387695, "rewards/accuracy_reward": 0.957531213760376, "rewards/format_reward": 1.0, "step": 1366 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.46875, "epoch": 0.018854912346036607, "grad_norm": 2.434004969758939, "kl": 0.06787109375, "learning_rate": 9.991230763132306e-07, "loss": 0.0027, "reward": 1.9667187929153442, "reward_std": 0.013308387249708176, "rewards/accuracy_reward": 0.7667186856269836, "rewards/format_reward": 1.0, "step": 1367 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.625, "epoch": 0.018868705259237804, "grad_norm": 2.5329570641590244, "kl": 0.06298828125, "learning_rate": 9.991217932297728e-07, "loss": 0.0025, "reward": 2.0630311965942383, "reward_std": 0.01589939184486866, "rewards/accuracy_reward": 0.863031268119812, "rewards/format_reward": 1.0, "step": 1368 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.21875, "epoch": 0.018882498172439, "grad_norm": 2.670051852264782, "kl": 0.06591796875, "learning_rate": 9.991205092091452e-07, "loss": 0.0026, "reward": 2.0581564903259277, "reward_std": 0.019017966464161873, "rewards/accuracy_reward": 0.8581562638282776, "rewards/format_reward": 1.0, "step": 1369 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.15625, "epoch": 0.0188962910856402, "grad_norm": 2.52800273343408, "kl": 0.06689453125, "learning_rate": 9.991192242513502e-07, "loss": 0.0027, "reward": 2.0528125762939453, "reward_std": 0.014065331779420376, "rewards/accuracy_reward": 0.8528125286102295, "rewards/format_reward": 1.0, "step": 1370 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 0.018910083998841396, "grad_norm": 2.59207789965161, "kl": 0.072265625, "learning_rate": 9.991179383563901e-07, "loss": 0.0029, "reward": 2.0565781593322754, "reward_std": 0.016641080379486084, "rewards/accuracy_reward": 0.8565781116485596, "rewards/format_reward": 1.0, "step": 1371 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.9375, "epoch": 0.018923876912042593, "grad_norm": 4.633462528320491, "kl": 0.06640625, "learning_rate": 9.991166515242677e-07, "loss": 0.0027, "reward": 2.0631561279296875, "reward_std": 0.032800182700157166, "rewards/accuracy_reward": 0.875656247138977, "rewards/format_reward": 1.0, "step": 1372 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.4375, "epoch": 0.01893766982524379, "grad_norm": 2.8645025271161555, "kl": 0.06640625, "learning_rate": 9.991153637549848e-07, "loss": 0.0027, "reward": 2.172656297683716, "reward_std": 0.030428048223257065, "rewards/accuracy_reward": 0.9789062142372131, "rewards/format_reward": 1.0, "step": 1373 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.78125, "epoch": 0.018951462738444988, "grad_norm": 3.001483966449212, "kl": 0.07177734375, "learning_rate": 9.991140750485445e-07, "loss": 0.0029, "reward": 2.0498437881469727, "reward_std": 0.018360059708356857, "rewards/accuracy_reward": 0.8498437404632568, "rewards/format_reward": 1.0, "step": 1374 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.96875, "epoch": 0.018965255651646185, "grad_norm": 6.86166035745758, "kl": 0.068359375, "learning_rate": 9.991127854049489e-07, "loss": 0.0027, "reward": 2.0992345809936523, "reward_std": 0.021194960922002792, "rewards/accuracy_reward": 0.899234414100647, "rewards/format_reward": 1.0, "step": 1375 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.28125, "epoch": 0.018979048564847383, "grad_norm": 4.352203720350787, "kl": 0.0712890625, "learning_rate": 9.991114948242002e-07, "loss": 0.0029, "reward": 2.065281391143799, "reward_std": 0.0284421369433403, "rewards/accuracy_reward": 0.8652812242507935, "rewards/format_reward": 1.0, "step": 1376 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.875, "epoch": 0.01899284147804858, "grad_norm": 21.245959448261683, "kl": 0.06884765625, "learning_rate": 9.99110203306301e-07, "loss": 0.0027, "reward": 2.0699687004089355, "reward_std": 0.016931701451539993, "rewards/accuracy_reward": 0.8699687719345093, "rewards/format_reward": 1.0, "step": 1377 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.019006634391249777, "grad_norm": 2.9893530560991017, "kl": 0.06787109375, "learning_rate": 9.99108910851254e-07, "loss": 0.0027, "reward": 2.089968681335449, "reward_std": 0.03087923862040043, "rewards/accuracy_reward": 0.8962187170982361, "rewards/format_reward": 1.0, "step": 1378 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.75, "epoch": 0.019020427304450974, "grad_norm": 2.2613369735417144, "kl": 0.06787109375, "learning_rate": 9.991076174590612e-07, "loss": 0.0027, "reward": 2.0407814979553223, "reward_std": 0.023062093183398247, "rewards/accuracy_reward": 0.8407812118530273, "rewards/format_reward": 1.0, "step": 1379 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.9375, "epoch": 0.01903422021765217, "grad_norm": 6.256814808683799, "kl": 0.06787109375, "learning_rate": 9.991063231297255e-07, "loss": 0.0027, "reward": 2.0450000762939453, "reward_std": 0.019958723336458206, "rewards/accuracy_reward": 0.8449999690055847, "rewards/format_reward": 1.0, "step": 1380 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.1875, "epoch": 0.01904801313085337, "grad_norm": 3.0508079933981547, "kl": 0.068359375, "learning_rate": 9.991050278632488e-07, "loss": 0.0027, "reward": 2.047468662261963, "reward_std": 0.04885120317339897, "rewards/accuracy_reward": 0.8599687814712524, "rewards/format_reward": 1.0, "step": 1381 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.375, "epoch": 0.019061806044054566, "grad_norm": 2.560188593035059, "kl": 0.06396484375, "learning_rate": 9.991037316596339e-07, "loss": 0.0025, "reward": 2.1327810287475586, "reward_std": 0.030704261735081673, "rewards/accuracy_reward": 0.9327812194824219, "rewards/format_reward": 1.0, "step": 1382 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.1875, "epoch": 0.019075598957255763, "grad_norm": 2.4127431924900837, "kl": 0.076171875, "learning_rate": 9.991024345188829e-07, "loss": 0.0031, "reward": 2.0745625495910645, "reward_std": 0.02189439721405506, "rewards/accuracy_reward": 0.8745625019073486, "rewards/format_reward": 1.0, "step": 1383 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.15625, "epoch": 0.01908939187045696, "grad_norm": 15.322939180686419, "kl": 0.06689453125, "learning_rate": 9.991011364409986e-07, "loss": 0.0027, "reward": 2.118593692779541, "reward_std": 0.006792397703975439, "rewards/accuracy_reward": 0.9185937643051147, "rewards/format_reward": 1.0, "step": 1384 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.0, "epoch": 0.019103184783658158, "grad_norm": 2.3089285319316883, "kl": 0.07421875, "learning_rate": 9.990998374259833e-07, "loss": 0.003, "reward": 2.094531297683716, "reward_std": 0.015446671284735203, "rewards/accuracy_reward": 0.89453125, "rewards/format_reward": 1.0, "step": 1385 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.8125, "epoch": 0.019116977696859355, "grad_norm": 2.24937093486185, "kl": 0.0615234375, "learning_rate": 9.990985374738395e-07, "loss": 0.0025, "reward": 2.1020936965942383, "reward_std": 0.01481853611767292, "rewards/accuracy_reward": 0.902093768119812, "rewards/format_reward": 1.0, "step": 1386 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.5, "epoch": 0.019130770610060552, "grad_norm": 4.4409468423841, "kl": 0.0625, "learning_rate": 9.990972365845694e-07, "loss": 0.0025, "reward": 2.0081875324249268, "reward_std": 0.018969684839248657, "rewards/accuracy_reward": 0.8081874251365662, "rewards/format_reward": 1.0, "step": 1387 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.875, "epoch": 0.01914456352326175, "grad_norm": 2.8495613782729103, "kl": 0.0654296875, "learning_rate": 9.990959347581756e-07, "loss": 0.0026, "reward": 2.044562339782715, "reward_std": 0.015596939250826836, "rewards/accuracy_reward": 0.8445624709129333, "rewards/format_reward": 1.0, "step": 1388 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.21875, "epoch": 0.019158356436462947, "grad_norm": 2.4038115640660442, "kl": 0.061767578125, "learning_rate": 9.990946319946606e-07, "loss": 0.0025, "reward": 2.0484752655029297, "reward_std": 0.009697951376438141, "rewards/accuracy_reward": 0.8484753370285034, "rewards/format_reward": 1.0, "step": 1389 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.75, "epoch": 0.01917214934966414, "grad_norm": 2.365709593829244, "kl": 0.06201171875, "learning_rate": 9.990933282940268e-07, "loss": 0.0025, "reward": 2.118499755859375, "reward_std": 0.028025349602103233, "rewards/accuracy_reward": 0.924750030040741, "rewards/format_reward": 1.0, "step": 1390 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.375, "epoch": 0.019185942262865338, "grad_norm": 2.760345800837128, "kl": 0.06640625, "learning_rate": 9.990920236562767e-07, "loss": 0.0026, "reward": 2.115687370300293, "reward_std": 0.02271743305027485, "rewards/accuracy_reward": 0.9219374656677246, "rewards/format_reward": 1.0, "step": 1391 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.90625, "epoch": 0.019199735176066535, "grad_norm": 2.4719968506959566, "kl": 0.06640625, "learning_rate": 9.990907180814127e-07, "loss": 0.0026, "reward": 2.14662504196167, "reward_std": 0.005435137078166008, "rewards/accuracy_reward": 0.9466249942779541, "rewards/format_reward": 1.0, "step": 1392 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.4375, "epoch": 0.019213528089267733, "grad_norm": 2.921653276635333, "kl": 0.06640625, "learning_rate": 9.990894115694373e-07, "loss": 0.0027, "reward": 2.1763126850128174, "reward_std": 0.024860410019755363, "rewards/accuracy_reward": 0.9825624823570251, "rewards/format_reward": 1.0, "step": 1393 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.84375, "epoch": 0.01922732100246893, "grad_norm": 4.695182684585688, "kl": 0.06982421875, "learning_rate": 9.990881041203527e-07, "loss": 0.0028, "reward": 2.120640754699707, "reward_std": 0.02664576843380928, "rewards/accuracy_reward": 0.9268906116485596, "rewards/format_reward": 1.0, "step": 1394 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.375, "epoch": 0.019241113915670127, "grad_norm": 2.948055774747822, "kl": 0.078125, "learning_rate": 9.990867957341617e-07, "loss": 0.0031, "reward": 2.053187370300293, "reward_std": 0.04791552573442459, "rewards/accuracy_reward": 0.8656874895095825, "rewards/format_reward": 1.0, "step": 1395 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.5625, "epoch": 0.019254906828871324, "grad_norm": 2.670385126230036, "kl": 0.07177734375, "learning_rate": 9.990854864108666e-07, "loss": 0.0029, "reward": 2.129499912261963, "reward_std": 0.025492768734693527, "rewards/accuracy_reward": 0.9357499480247498, "rewards/format_reward": 1.0, "step": 1396 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.40625, "epoch": 0.01926869974207252, "grad_norm": 2.258663069326255, "kl": 0.06298828125, "learning_rate": 9.9908417615047e-07, "loss": 0.0025, "reward": 2.064406394958496, "reward_std": 0.02151062712073326, "rewards/accuracy_reward": 0.8644062280654907, "rewards/format_reward": 1.0, "step": 1397 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.9375, "epoch": 0.01928249265527372, "grad_norm": 2.297977100366778, "kl": 0.06494140625, "learning_rate": 9.99082864952974e-07, "loss": 0.0026, "reward": 2.064687490463257, "reward_std": 0.028131868690252304, "rewards/accuracy_reward": 0.8709375262260437, "rewards/format_reward": 1.0, "step": 1398 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.09375, "epoch": 0.019296285568474916, "grad_norm": 3.004945067939647, "kl": 0.064453125, "learning_rate": 9.990815528183815e-07, "loss": 0.0026, "reward": 2.0157811641693115, "reward_std": 0.02041110396385193, "rewards/accuracy_reward": 0.8157812356948853, "rewards/format_reward": 1.0, "step": 1399 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.21875, "epoch": 0.019310078481676114, "grad_norm": 4.566409895909664, "kl": 0.072265625, "learning_rate": 9.990802397466946e-07, "loss": 0.0029, "reward": 2.031540632247925, "reward_std": 0.03331389278173447, "rewards/accuracy_reward": 0.8377906084060669, "rewards/format_reward": 1.0, "step": 1400 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 388.40625, "epoch": 0.01932387139487731, "grad_norm": 2.816010107802226, "kl": 0.06591796875, "learning_rate": 9.99078925737916e-07, "loss": 0.0026, "reward": 2.0303125381469727, "reward_std": 0.03370671719312668, "rewards/accuracy_reward": 0.8365625739097595, "rewards/format_reward": 1.0, "step": 1401 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.46875, "epoch": 0.019337664308078508, "grad_norm": 2.0093763034820484, "kl": 0.0634765625, "learning_rate": 9.99077610792048e-07, "loss": 0.0026, "reward": 2.117500066757202, "reward_std": 0.010711370036005974, "rewards/accuracy_reward": 0.9174999594688416, "rewards/format_reward": 1.0, "step": 1402 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.0625, "epoch": 0.019351457221279705, "grad_norm": 1.964705004863095, "kl": 0.06787109375, "learning_rate": 9.990762949090932e-07, "loss": 0.0027, "reward": 2.012312650680542, "reward_std": 0.026217876002192497, "rewards/accuracy_reward": 0.8185625076293945, "rewards/format_reward": 1.0, "step": 1403 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 384.53125, "epoch": 0.019365250134480903, "grad_norm": 3.8917036634762305, "kl": 0.06591796875, "learning_rate": 9.990749780890543e-07, "loss": 0.0026, "reward": 1.9648125171661377, "reward_std": 0.037845950573682785, "rewards/accuracy_reward": 0.7648124694824219, "rewards/format_reward": 1.0, "step": 1404 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.15625, "epoch": 0.0193790430476821, "grad_norm": 3.7791355827027897, "kl": 0.06396484375, "learning_rate": 9.990736603319332e-07, "loss": 0.0026, "reward": 2.101375102996826, "reward_std": 0.04703740030527115, "rewards/accuracy_reward": 0.9013749361038208, "rewards/format_reward": 1.0, "step": 1405 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.8125, "epoch": 0.019392835960883297, "grad_norm": 3.4782599406137877, "kl": 0.06298828125, "learning_rate": 9.990723416377326e-07, "loss": 0.0025, "reward": 1.9942188262939453, "reward_std": 0.010157547891139984, "rewards/accuracy_reward": 0.7942187786102295, "rewards/format_reward": 1.0, "step": 1406 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.0, "epoch": 0.019406628874084494, "grad_norm": 1.9183813775975525, "kl": 0.0654296875, "learning_rate": 9.990710220064553e-07, "loss": 0.0026, "reward": 2.082937717437744, "reward_std": 0.010085434652864933, "rewards/accuracy_reward": 0.8829374313354492, "rewards/format_reward": 1.0, "step": 1407 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.125, "epoch": 0.01942042178728569, "grad_norm": 2.841260601384254, "kl": 0.0673828125, "learning_rate": 9.990697014381034e-07, "loss": 0.0027, "reward": 2.0561251640319824, "reward_std": 0.030994638800621033, "rewards/accuracy_reward": 0.862375020980835, "rewards/format_reward": 1.0, "step": 1408 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.875, "epoch": 0.01943421470048689, "grad_norm": 7.877301682722908, "kl": 0.068359375, "learning_rate": 9.990683799326795e-07, "loss": 0.0027, "reward": 2.0743751525878906, "reward_std": 0.015688057988882065, "rewards/accuracy_reward": 0.8743749856948853, "rewards/format_reward": 1.0, "step": 1409 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.25, "epoch": 0.019448007613688086, "grad_norm": 2.6976833786107357, "kl": 0.0654296875, "learning_rate": 9.990670574901861e-07, "loss": 0.0026, "reward": 2.1114063262939453, "reward_std": 0.019553350284695625, "rewards/accuracy_reward": 0.9114062786102295, "rewards/format_reward": 1.0, "step": 1410 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.5, "epoch": 0.019461800526889284, "grad_norm": 3.851158144377856, "kl": 0.0703125, "learning_rate": 9.990657341106258e-07, "loss": 0.0028, "reward": 2.0808498859405518, "reward_std": 0.04106046259403229, "rewards/accuracy_reward": 0.8933500647544861, "rewards/format_reward": 1.0, "step": 1411 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.03125, "epoch": 0.01947559344009048, "grad_norm": 2.9251545264930496, "kl": 0.07275390625, "learning_rate": 9.99064409794001e-07, "loss": 0.0029, "reward": 2.12137508392334, "reward_std": 0.028615232557058334, "rewards/accuracy_reward": 0.9276250004768372, "rewards/format_reward": 1.0, "step": 1412 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.09375, "epoch": 0.019489386353291678, "grad_norm": 2.9823938749513297, "kl": 0.0693359375, "learning_rate": 9.990630845403139e-07, "loss": 0.0028, "reward": 2.1189374923706055, "reward_std": 0.015463745221495628, "rewards/accuracy_reward": 0.9189374446868896, "rewards/format_reward": 1.0, "step": 1413 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.40625, "epoch": 0.019503179266492875, "grad_norm": 3.2499441215325957, "kl": 0.0673828125, "learning_rate": 9.990617583495674e-07, "loss": 0.0027, "reward": 2.1347813606262207, "reward_std": 0.014841862954199314, "rewards/accuracy_reward": 0.9347811937332153, "rewards/format_reward": 1.0, "step": 1414 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.0625, "epoch": 0.019516972179694073, "grad_norm": 7.3485830510949395, "kl": 0.06689453125, "learning_rate": 9.990604312217636e-07, "loss": 0.0027, "reward": 2.137343645095825, "reward_std": 0.01708845980465412, "rewards/accuracy_reward": 0.9373437762260437, "rewards/format_reward": 1.0, "step": 1415 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.9375, "epoch": 0.01953076509289527, "grad_norm": 2.3316750023619908, "kl": 0.0693359375, "learning_rate": 9.990591031569055e-07, "loss": 0.0028, "reward": 2.0688748359680176, "reward_std": 0.014043771661818027, "rewards/accuracy_reward": 0.8688750267028809, "rewards/format_reward": 1.0, "step": 1416 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.90625, "epoch": 0.019544558006096467, "grad_norm": 5.388408852254975, "kl": 0.0673828125, "learning_rate": 9.990577741549952e-07, "loss": 0.0027, "reward": 2.095968723297119, "reward_std": 0.01867583580315113, "rewards/accuracy_reward": 0.8959687948226929, "rewards/format_reward": 1.0, "step": 1417 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.5, "epoch": 0.019558350919297664, "grad_norm": 2.0788566359784935, "kl": 0.06640625, "learning_rate": 9.990564442160352e-07, "loss": 0.0026, "reward": 2.0601251125335693, "reward_std": 0.01963699422776699, "rewards/accuracy_reward": 0.8601250648498535, "rewards/format_reward": 1.0, "step": 1418 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.71875, "epoch": 0.01957214383249886, "grad_norm": 2.800607541032042, "kl": 0.0712890625, "learning_rate": 9.990551133400283e-07, "loss": 0.0028, "reward": 2.000093936920166, "reward_std": 0.021569915115833282, "rewards/accuracy_reward": 0.8000937104225159, "rewards/format_reward": 1.0, "step": 1419 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.03125, "epoch": 0.01958593674570006, "grad_norm": 2.3001075444508965, "kl": 0.07275390625, "learning_rate": 9.990537815269765e-07, "loss": 0.0029, "reward": 2.0057811737060547, "reward_std": 0.010040796361863613, "rewards/accuracy_reward": 0.8057812452316284, "rewards/format_reward": 1.0, "step": 1420 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.5625, "epoch": 0.019599729658901256, "grad_norm": 2.520649600787214, "kl": 0.07568359375, "learning_rate": 9.99052448776883e-07, "loss": 0.003, "reward": 2.0252814292907715, "reward_std": 0.03313218802213669, "rewards/accuracy_reward": 0.831531286239624, "rewards/format_reward": 1.0, "step": 1421 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.71875, "epoch": 0.019613522572102454, "grad_norm": 3.0774828302666695, "kl": 0.07373046875, "learning_rate": 9.990511150897496e-07, "loss": 0.003, "reward": 2.1285157203674316, "reward_std": 0.01692095398902893, "rewards/accuracy_reward": 0.928515613079071, "rewards/format_reward": 1.0, "step": 1422 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.75, "epoch": 0.01962731548530365, "grad_norm": 2.367879250724881, "kl": 0.0712890625, "learning_rate": 9.990497804655793e-07, "loss": 0.0028, "reward": 2.1272499561309814, "reward_std": 0.02027575671672821, "rewards/accuracy_reward": 0.9272499680519104, "rewards/format_reward": 1.0, "step": 1423 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.1875, "epoch": 0.019641108398504848, "grad_norm": 2.406134215714481, "kl": 0.0654296875, "learning_rate": 9.990484449043744e-07, "loss": 0.0026, "reward": 2.0484063625335693, "reward_std": 0.02412530779838562, "rewards/accuracy_reward": 0.8484062552452087, "rewards/format_reward": 1.0, "step": 1424 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 426.0, "epoch": 0.019654901311706045, "grad_norm": 2.5933129417946357, "kl": 0.0732421875, "learning_rate": 9.990471084061373e-07, "loss": 0.0029, "reward": 2.052187442779541, "reward_std": 0.035337500274181366, "rewards/accuracy_reward": 0.8584374785423279, "rewards/format_reward": 1.0, "step": 1425 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.71875, "epoch": 0.019668694224907243, "grad_norm": 10.806961798226759, "kl": 0.0673828125, "learning_rate": 9.990457709708705e-07, "loss": 0.0027, "reward": 2.0996875762939453, "reward_std": 0.026638073846697807, "rewards/accuracy_reward": 0.9059374928474426, "rewards/format_reward": 1.0, "step": 1426 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.625, "epoch": 0.01968248713810844, "grad_norm": 3.427685274050611, "kl": 0.0703125, "learning_rate": 9.99044432598577e-07, "loss": 0.0028, "reward": 2.133406162261963, "reward_std": 0.02070443332195282, "rewards/accuracy_reward": 0.9334062337875366, "rewards/format_reward": 1.0, "step": 1427 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.1875, "epoch": 0.019696280051309637, "grad_norm": 2.544944003080472, "kl": 0.07177734375, "learning_rate": 9.990430932892588e-07, "loss": 0.0029, "reward": 2.085624933242798, "reward_std": 0.01857692375779152, "rewards/accuracy_reward": 0.8856250047683716, "rewards/format_reward": 1.0, "step": 1428 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.625, "epoch": 0.019710072964510834, "grad_norm": 2.8662737447680335, "kl": 0.07666015625, "learning_rate": 9.990417530429186e-07, "loss": 0.0031, "reward": 2.067312717437744, "reward_std": 0.021073510870337486, "rewards/accuracy_reward": 0.867312490940094, "rewards/format_reward": 1.0, "step": 1429 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.28125, "epoch": 0.01972386587771203, "grad_norm": 2.2717837997473125, "kl": 0.07470703125, "learning_rate": 9.990404118595587e-07, "loss": 0.003, "reward": 2.1215624809265137, "reward_std": 0.028076861053705215, "rewards/accuracy_reward": 0.9278125166893005, "rewards/format_reward": 1.0, "step": 1430 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.40625, "epoch": 0.01973765879091323, "grad_norm": 2.499405558427648, "kl": 0.07177734375, "learning_rate": 9.99039069739182e-07, "loss": 0.0029, "reward": 2.05049991607666, "reward_std": 0.014963004738092422, "rewards/accuracy_reward": 0.8504999876022339, "rewards/format_reward": 1.0, "step": 1431 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.625, "epoch": 0.019751451704114426, "grad_norm": 2.414492798561618, "kl": 0.080078125, "learning_rate": 9.99037726681791e-07, "loss": 0.0032, "reward": 1.9780938625335693, "reward_std": 0.03298702836036682, "rewards/accuracy_reward": 0.778093695640564, "rewards/format_reward": 1.0, "step": 1432 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.46875, "epoch": 0.019765244617315623, "grad_norm": 3.5838160270555, "kl": 0.072265625, "learning_rate": 9.990363826873877e-07, "loss": 0.0029, "reward": 2.1017813682556152, "reward_std": 0.02714308351278305, "rewards/accuracy_reward": 0.9017812609672546, "rewards/format_reward": 1.0, "step": 1433 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.75, "epoch": 0.01977903753051682, "grad_norm": 8.212660605744995, "kl": 0.078125, "learning_rate": 9.99035037755975e-07, "loss": 0.0031, "reward": 2.116875171661377, "reward_std": 0.030858095735311508, "rewards/accuracy_reward": 0.9231250286102295, "rewards/format_reward": 1.0, "step": 1434 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.21875, "epoch": 0.019792830443718018, "grad_norm": 2.530384796034006, "kl": 0.072265625, "learning_rate": 9.990336918875557e-07, "loss": 0.0029, "reward": 2.073406219482422, "reward_std": 0.033087823539972305, "rewards/accuracy_reward": 0.8796562552452087, "rewards/format_reward": 1.0, "step": 1435 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.40625, "epoch": 0.019806623356919215, "grad_norm": 5.862788582254299, "kl": 0.0830078125, "learning_rate": 9.990323450821317e-07, "loss": 0.0033, "reward": 2.1296873092651367, "reward_std": 0.017731118947267532, "rewards/accuracy_reward": 0.9296875, "rewards/format_reward": 1.0, "step": 1436 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.5, "epoch": 0.019820416270120413, "grad_norm": 2.553000715757157, "kl": 0.07958984375, "learning_rate": 9.99030997339706e-07, "loss": 0.0032, "reward": 2.11447811126709, "reward_std": 0.02133796736598015, "rewards/accuracy_reward": 0.9144781231880188, "rewards/format_reward": 1.0, "step": 1437 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.0625, "epoch": 0.01983420918332161, "grad_norm": 7.537701671431813, "kl": 0.07275390625, "learning_rate": 9.990296486602809e-07, "loss": 0.0029, "reward": 2.1310312747955322, "reward_std": 0.04073544591665268, "rewards/accuracy_reward": 0.9310312271118164, "rewards/format_reward": 1.0, "step": 1438 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.59375, "epoch": 0.019848002096522807, "grad_norm": 3.0150078328275876, "kl": 0.0693359375, "learning_rate": 9.99028299043859e-07, "loss": 0.0028, "reward": 2.076406240463257, "reward_std": 0.03086410090327263, "rewards/accuracy_reward": 0.8826562166213989, "rewards/format_reward": 1.0, "step": 1439 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.09375, "epoch": 0.019861795009724004, "grad_norm": 2.7278265141157676, "kl": 0.07666015625, "learning_rate": 9.99026948490443e-07, "loss": 0.0031, "reward": 2.0022811889648438, "reward_std": 0.01818978041410446, "rewards/accuracy_reward": 0.8022812604904175, "rewards/format_reward": 1.0, "step": 1440 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.9375, "epoch": 0.0198755879229252, "grad_norm": 3.0086712372134716, "kl": 0.0751953125, "learning_rate": 9.990255970000352e-07, "loss": 0.003, "reward": 2.0996251106262207, "reward_std": 0.0259021557867527, "rewards/accuracy_reward": 0.8996249437332153, "rewards/format_reward": 1.0, "step": 1441 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.34375, "epoch": 0.0198893808361264, "grad_norm": 2.1449807743936984, "kl": 0.0712890625, "learning_rate": 9.990242445726383e-07, "loss": 0.0028, "reward": 2.1070938110351562, "reward_std": 0.013833701610565186, "rewards/accuracy_reward": 0.9070937037467957, "rewards/format_reward": 1.0, "step": 1442 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 0.019903173749327596, "grad_norm": 2.2030681132618417, "kl": 0.078125, "learning_rate": 9.990228912082545e-07, "loss": 0.0031, "reward": 2.099375009536743, "reward_std": 0.029069416224956512, "rewards/accuracy_reward": 0.9056249856948853, "rewards/format_reward": 1.0, "step": 1443 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 375.5625, "epoch": 0.019916966662528793, "grad_norm": 3.09720987760733, "kl": 0.078125, "learning_rate": 9.99021536906887e-07, "loss": 0.0031, "reward": 2.0868749618530273, "reward_std": 0.01801455393433571, "rewards/accuracy_reward": 0.8868749737739563, "rewards/format_reward": 1.0, "step": 1444 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.01993075957572999, "grad_norm": 2.3280074806987727, "kl": 0.080078125, "learning_rate": 9.990201816685378e-07, "loss": 0.0032, "reward": 2.083343982696533, "reward_std": 0.015879862010478973, "rewards/accuracy_reward": 0.8833437561988831, "rewards/format_reward": 1.0, "step": 1445 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.1875, "epoch": 0.019944552488931188, "grad_norm": 2.1722469200182997, "kl": 0.0732421875, "learning_rate": 9.990188254932096e-07, "loss": 0.0029, "reward": 2.1130001544952393, "reward_std": 0.01783987134695053, "rewards/accuracy_reward": 0.9129999876022339, "rewards/format_reward": 1.0, "step": 1446 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.59375, "epoch": 0.019958345402132385, "grad_norm": 3.113658086628346, "kl": 0.072265625, "learning_rate": 9.990174683809047e-07, "loss": 0.0029, "reward": 2.028031349182129, "reward_std": 0.03220091015100479, "rewards/accuracy_reward": 0.8280312418937683, "rewards/format_reward": 1.0, "step": 1447 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.375, "epoch": 0.019972138315333583, "grad_norm": 2.7258957173500913, "kl": 0.0693359375, "learning_rate": 9.99016110331626e-07, "loss": 0.0028, "reward": 2.075906276702881, "reward_std": 0.030843686312437057, "rewards/accuracy_reward": 0.8821563124656677, "rewards/format_reward": 1.0, "step": 1448 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.59375, "epoch": 0.01998593122853478, "grad_norm": 2.1182130106951536, "kl": 0.0732421875, "learning_rate": 9.99014751345376e-07, "loss": 0.0029, "reward": 2.140812397003174, "reward_std": 0.03086215630173683, "rewards/accuracy_reward": 0.9470625519752502, "rewards/format_reward": 1.0, "step": 1449 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.46875, "epoch": 0.019999724141735977, "grad_norm": 4.51714947154425, "kl": 0.076171875, "learning_rate": 9.990133914221573e-07, "loss": 0.003, "reward": 2.0561561584472656, "reward_std": 0.025271710008382797, "rewards/accuracy_reward": 0.8561562895774841, "rewards/format_reward": 1.0, "step": 1450 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.6875, "epoch": 0.020013517054937174, "grad_norm": 5.747795951529524, "kl": 0.08203125, "learning_rate": 9.99012030561972e-07, "loss": 0.0033, "reward": 2.028437614440918, "reward_std": 0.022412512451410294, "rewards/accuracy_reward": 0.8284375071525574, "rewards/format_reward": 1.0, "step": 1451 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.65625, "epoch": 0.02002730996813837, "grad_norm": 2.3702601381037773, "kl": 0.080078125, "learning_rate": 9.990106687648234e-07, "loss": 0.0032, "reward": 2.0584373474121094, "reward_std": 0.050187185406684875, "rewards/accuracy_reward": 0.8709375262260437, "rewards/format_reward": 1.0, "step": 1452 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.96875, "epoch": 0.02004110288133957, "grad_norm": 2.870280503999781, "kl": 0.0771484375, "learning_rate": 9.990093060307134e-07, "loss": 0.0031, "reward": 2.102193832397461, "reward_std": 0.037157513201236725, "rewards/accuracy_reward": 0.9084437489509583, "rewards/format_reward": 1.0, "step": 1453 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.96875, "epoch": 0.020054895794540766, "grad_norm": 2.2391834679139433, "kl": 0.07373046875, "learning_rate": 9.99007942359645e-07, "loss": 0.003, "reward": 1.9770625829696655, "reward_std": 0.03263351693749428, "rewards/accuracy_reward": 0.789562463760376, "rewards/format_reward": 1.0, "step": 1454 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.875, "epoch": 0.020068688707741963, "grad_norm": 2.805966438635942, "kl": 0.07958984375, "learning_rate": 9.990065777516203e-07, "loss": 0.0032, "reward": 2.0721874237060547, "reward_std": 0.03986940532922745, "rewards/accuracy_reward": 0.8784375786781311, "rewards/format_reward": 1.0, "step": 1455 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.78125, "epoch": 0.02008248162094316, "grad_norm": 4.513762339417655, "kl": 0.0751953125, "learning_rate": 9.990052122066423e-07, "loss": 0.003, "reward": 2.035062313079834, "reward_std": 0.028803803026676178, "rewards/accuracy_reward": 0.8413124680519104, "rewards/format_reward": 1.0, "step": 1456 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.71875, "epoch": 0.020096274534144358, "grad_norm": 2.209519098465575, "kl": 0.0751953125, "learning_rate": 9.990038457247135e-07, "loss": 0.003, "reward": 2.059093952178955, "reward_std": 0.02936217002570629, "rewards/accuracy_reward": 0.8653438091278076, "rewards/format_reward": 1.0, "step": 1457 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.15625, "epoch": 0.020110067447345555, "grad_norm": 5.275383632147241, "kl": 0.0712890625, "learning_rate": 9.99002478305836e-07, "loss": 0.0029, "reward": 2.136593818664551, "reward_std": 0.029701193794608116, "rewards/accuracy_reward": 0.9428437948226929, "rewards/format_reward": 1.0, "step": 1458 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.1875, "epoch": 0.020123860360546753, "grad_norm": 2.569242678900793, "kl": 0.078125, "learning_rate": 9.99001109950013e-07, "loss": 0.0031, "reward": 2.114187479019165, "reward_std": 0.018061114475131035, "rewards/accuracy_reward": 0.9141875505447388, "rewards/format_reward": 1.0, "step": 1459 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.03125, "epoch": 0.02013765327374795, "grad_norm": 5.731445674137346, "kl": 0.06640625, "learning_rate": 9.989997406572467e-07, "loss": 0.0027, "reward": 2.0840625762939453, "reward_std": 0.02083110623061657, "rewards/accuracy_reward": 0.8840625286102295, "rewards/format_reward": 1.0, "step": 1460 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.25, "epoch": 0.020151446186949147, "grad_norm": 2.3202474142012237, "kl": 0.08349609375, "learning_rate": 9.9899837042754e-07, "loss": 0.0033, "reward": 1.8969063758850098, "reward_std": 0.049554333090782166, "rewards/accuracy_reward": 0.7156562209129333, "rewards/format_reward": 1.0, "step": 1461 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.75, "epoch": 0.020165239100150344, "grad_norm": 2.653689168150688, "kl": 0.072265625, "learning_rate": 9.989969992608952e-07, "loss": 0.0029, "reward": 2.0672812461853027, "reward_std": 0.0335346944630146, "rewards/accuracy_reward": 0.8735312819480896, "rewards/format_reward": 1.0, "step": 1462 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 395.71875, "epoch": 0.02017903201335154, "grad_norm": 2.8829537302895, "kl": 0.0830078125, "learning_rate": 9.989956271573146e-07, "loss": 0.0033, "reward": 2.0203750133514404, "reward_std": 0.031481314450502396, "rewards/accuracy_reward": 0.8266249895095825, "rewards/format_reward": 1.0, "step": 1463 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.02019282492655274, "grad_norm": 2.298391993201629, "kl": 0.0703125, "learning_rate": 9.989942541168014e-07, "loss": 0.0028, "reward": 2.0488126277923584, "reward_std": 0.030729303136467934, "rewards/accuracy_reward": 0.8550624847412109, "rewards/format_reward": 1.0, "step": 1464 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.5, "epoch": 0.020206617839753936, "grad_norm": 2.2312097748787223, "kl": 0.072265625, "learning_rate": 9.989928801393578e-07, "loss": 0.0029, "reward": 2.0637500286102295, "reward_std": 0.008092949166893959, "rewards/accuracy_reward": 0.8637499213218689, "rewards/format_reward": 1.0, "step": 1465 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.90625, "epoch": 0.02022041075295513, "grad_norm": 4.023196168787191, "kl": 0.0791015625, "learning_rate": 9.989915052249864e-07, "loss": 0.0032, "reward": 2.017031192779541, "reward_std": 0.034903790801763535, "rewards/accuracy_reward": 0.8170312643051147, "rewards/format_reward": 1.0, "step": 1466 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.34375, "epoch": 0.020234203666156327, "grad_norm": 3.2540815736474786, "kl": 0.076171875, "learning_rate": 9.989901293736897e-07, "loss": 0.0031, "reward": 2.05400013923645, "reward_std": 0.026853162795305252, "rewards/accuracy_reward": 0.8539999723434448, "rewards/format_reward": 1.0, "step": 1467 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 435.65625, "epoch": 0.020247996579357525, "grad_norm": 2.234032815486629, "kl": 0.076171875, "learning_rate": 9.989887525854708e-07, "loss": 0.003, "reward": 2.110374927520752, "reward_std": 0.018385866656899452, "rewards/accuracy_reward": 0.9103749990463257, "rewards/format_reward": 1.0, "step": 1468 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.020261789492558722, "grad_norm": 4.841013135337845, "kl": 0.07080078125, "learning_rate": 9.989873748603316e-07, "loss": 0.0028, "reward": 2.0072813034057617, "reward_std": 0.011211058124899864, "rewards/accuracy_reward": 0.8072812557220459, "rewards/format_reward": 1.0, "step": 1469 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.15625, "epoch": 0.02027558240575992, "grad_norm": 25.240793802416594, "kl": 0.0771484375, "learning_rate": 9.98985996198275e-07, "loss": 0.0031, "reward": 2.063593864440918, "reward_std": 0.023534409701824188, "rewards/accuracy_reward": 0.8635937571525574, "rewards/format_reward": 1.0, "step": 1470 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.84375, "epoch": 0.020289375318961116, "grad_norm": 7.417145160218178, "kl": 0.0751953125, "learning_rate": 9.989846165993037e-07, "loss": 0.003, "reward": 2.0546875, "reward_std": 0.017199421301484108, "rewards/accuracy_reward": 0.8546874523162842, "rewards/format_reward": 1.0, "step": 1471 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.03125, "epoch": 0.020303168232162314, "grad_norm": 2.3842360275911845, "kl": 0.0732421875, "learning_rate": 9.9898323606342e-07, "loss": 0.0029, "reward": 2.137937545776367, "reward_std": 0.012007412500679493, "rewards/accuracy_reward": 0.9379374980926514, "rewards/format_reward": 1.0, "step": 1472 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.0625, "epoch": 0.02031696114536351, "grad_norm": 2.649914949442856, "kl": 0.0791015625, "learning_rate": 9.989818545906269e-07, "loss": 0.0032, "reward": 2.088156223297119, "reward_std": 0.008715685456991196, "rewards/accuracy_reward": 0.8881562948226929, "rewards/format_reward": 1.0, "step": 1473 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.96875, "epoch": 0.020330754058564708, "grad_norm": 2.4476070083864347, "kl": 0.0771484375, "learning_rate": 9.989804721809266e-07, "loss": 0.0031, "reward": 2.1330626010894775, "reward_std": 0.029888728633522987, "rewards/accuracy_reward": 0.9393125176429749, "rewards/format_reward": 1.0, "step": 1474 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.020344546971765905, "grad_norm": 2.932398650127671, "kl": 0.07177734375, "learning_rate": 9.98979088834322e-07, "loss": 0.0029, "reward": 2.014343738555908, "reward_std": 0.022120721638202667, "rewards/accuracy_reward": 0.8143437504768372, "rewards/format_reward": 1.0, "step": 1475 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.15625, "epoch": 0.020358339884967103, "grad_norm": 2.650293900427481, "kl": 0.08203125, "learning_rate": 9.989777045508155e-07, "loss": 0.0033, "reward": 1.9688124656677246, "reward_std": 0.022652767598628998, "rewards/accuracy_reward": 0.7688125371932983, "rewards/format_reward": 1.0, "step": 1476 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.53125, "epoch": 0.0203721327981683, "grad_norm": 2.5421774639434624, "kl": 0.07763671875, "learning_rate": 9.989763193304094e-07, "loss": 0.0031, "reward": 2.1029999256134033, "reward_std": 0.017032857984304428, "rewards/accuracy_reward": 0.902999997138977, "rewards/format_reward": 1.0, "step": 1477 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.75, "epoch": 0.020385925711369497, "grad_norm": 2.0188034380453805, "kl": 0.0830078125, "learning_rate": 9.989749331731072e-07, "loss": 0.0033, "reward": 2.1459999084472656, "reward_std": 0.010570304468274117, "rewards/accuracy_reward": 0.9459999799728394, "rewards/format_reward": 1.0, "step": 1478 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.375, "epoch": 0.020399718624570694, "grad_norm": 2.4112312451120874, "kl": 0.07861328125, "learning_rate": 9.989735460789107e-07, "loss": 0.0031, "reward": 1.961625099182129, "reward_std": 0.020201044157147408, "rewards/accuracy_reward": 0.7616250514984131, "rewards/format_reward": 1.0, "step": 1479 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.375, "epoch": 0.020413511537771892, "grad_norm": 2.512817512563664, "kl": 0.08056640625, "learning_rate": 9.989721580478227e-07, "loss": 0.0032, "reward": 2.020843982696533, "reward_std": 0.021659670397639275, "rewards/accuracy_reward": 0.8208437561988831, "rewards/format_reward": 1.0, "step": 1480 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.5, "epoch": 0.02042730445097309, "grad_norm": 2.7077147134968245, "kl": 0.07861328125, "learning_rate": 9.989707690798458e-07, "loss": 0.0032, "reward": 2.1095938682556152, "reward_std": 0.013318461365997791, "rewards/accuracy_reward": 0.9095937013626099, "rewards/format_reward": 1.0, "step": 1481 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.3125, "epoch": 0.020441097364174286, "grad_norm": 6.392652014844335, "kl": 0.07958984375, "learning_rate": 9.98969379174983e-07, "loss": 0.0032, "reward": 1.992343783378601, "reward_std": 0.027757490053772926, "rewards/accuracy_reward": 0.79234379529953, "rewards/format_reward": 1.0, "step": 1482 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.03125, "epoch": 0.020454890277375484, "grad_norm": 2.46023771878258, "kl": 0.08544921875, "learning_rate": 9.989679883332361e-07, "loss": 0.0034, "reward": 1.988187551498413, "reward_std": 0.03203430399298668, "rewards/accuracy_reward": 0.7881875038146973, "rewards/format_reward": 1.0, "step": 1483 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.5, "epoch": 0.02046868319057668, "grad_norm": 2.1221391079412673, "kl": 0.0771484375, "learning_rate": 9.989665965546084e-07, "loss": 0.0031, "reward": 2.05078125, "reward_std": 0.03196130692958832, "rewards/accuracy_reward": 0.850781261920929, "rewards/format_reward": 1.0, "step": 1484 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.40625, "epoch": 0.020482476103777878, "grad_norm": 2.1026395380688934, "kl": 0.076171875, "learning_rate": 9.989652038391024e-07, "loss": 0.0031, "reward": 2.1292812824249268, "reward_std": 0.019329454749822617, "rewards/accuracy_reward": 0.9292811751365662, "rewards/format_reward": 1.0, "step": 1485 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.3125, "epoch": 0.020496269016979075, "grad_norm": 3.115015061843392, "kl": 0.0849609375, "learning_rate": 9.989638101867207e-07, "loss": 0.0034, "reward": 2.128218650817871, "reward_std": 0.018721235916018486, "rewards/accuracy_reward": 0.9282187223434448, "rewards/format_reward": 1.0, "step": 1486 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.8125, "epoch": 0.020510061930180273, "grad_norm": 2.5414559938831056, "kl": 0.0791015625, "learning_rate": 9.989624155974657e-07, "loss": 0.0032, "reward": 2.1322813034057617, "reward_std": 0.041062213480472565, "rewards/accuracy_reward": 0.938531219959259, "rewards/format_reward": 1.0, "step": 1487 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.09375, "epoch": 0.02052385484338147, "grad_norm": 2.369762796274225, "kl": 0.076171875, "learning_rate": 9.9896102007134e-07, "loss": 0.0031, "reward": 2.0397188663482666, "reward_std": 0.025427144020795822, "rewards/accuracy_reward": 0.839718759059906, "rewards/format_reward": 1.0, "step": 1488 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.71875, "epoch": 0.020537647756582667, "grad_norm": 2.7831538407754732, "kl": 0.08056640625, "learning_rate": 9.989596236083465e-07, "loss": 0.0032, "reward": 1.9581562280654907, "reward_std": 0.01716439239680767, "rewards/accuracy_reward": 0.7581562995910645, "rewards/format_reward": 1.0, "step": 1489 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.875, "epoch": 0.020551440669783864, "grad_norm": 3.1473385913761014, "kl": 0.07958984375, "learning_rate": 9.989582262084878e-07, "loss": 0.0032, "reward": 2.0429999828338623, "reward_std": 0.022510571405291557, "rewards/accuracy_reward": 0.8429999947547913, "rewards/format_reward": 1.0, "step": 1490 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.020565233582985062, "grad_norm": 2.552389909118091, "kl": 0.08984375, "learning_rate": 9.989568278717664e-07, "loss": 0.0036, "reward": 2.063718795776367, "reward_std": 0.02434457838535309, "rewards/accuracy_reward": 0.8637188076972961, "rewards/format_reward": 1.0, "step": 1491 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.53125, "epoch": 0.02057902649618626, "grad_norm": 2.2657698955735497, "kl": 0.0830078125, "learning_rate": 9.989554285981848e-07, "loss": 0.0033, "reward": 2.131812334060669, "reward_std": 0.01354505680501461, "rewards/accuracy_reward": 0.9318125247955322, "rewards/format_reward": 1.0, "step": 1492 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.020592819409387456, "grad_norm": 2.3538273729945547, "kl": 0.07861328125, "learning_rate": 9.989540283877458e-07, "loss": 0.0031, "reward": 2.0350937843322754, "reward_std": 0.02350785583257675, "rewards/accuracy_reward": 0.8350937366485596, "rewards/format_reward": 1.0, "step": 1493 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.71875, "epoch": 0.020606612322588654, "grad_norm": 2.6456278831917905, "kl": 0.08837890625, "learning_rate": 9.989526272404522e-07, "loss": 0.0035, "reward": 2.113781213760376, "reward_std": 0.01741741970181465, "rewards/accuracy_reward": 0.9137811660766602, "rewards/format_reward": 1.0, "step": 1494 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.9375, "epoch": 0.02062040523578985, "grad_norm": 3.48187233900594, "kl": 0.07275390625, "learning_rate": 9.989512251563063e-07, "loss": 0.0029, "reward": 2.0299062728881836, "reward_std": 0.02307567186653614, "rewards/accuracy_reward": 0.8299062252044678, "rewards/format_reward": 1.0, "step": 1495 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.59375, "epoch": 0.020634198148991048, "grad_norm": 2.009689987046221, "kl": 0.080078125, "learning_rate": 9.989498221353107e-07, "loss": 0.0032, "reward": 2.165250062942505, "reward_std": 0.011667201295495033, "rewards/accuracy_reward": 0.9652500152587891, "rewards/format_reward": 1.0, "step": 1496 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.21875, "epoch": 0.020647991062192245, "grad_norm": 3.7574275713972907, "kl": 0.08447265625, "learning_rate": 9.989484181774686e-07, "loss": 0.0034, "reward": 2.0981874465942383, "reward_std": 0.03144136816263199, "rewards/accuracy_reward": 0.898187518119812, "rewards/format_reward": 1.0, "step": 1497 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.90625, "epoch": 0.020661783975393443, "grad_norm": 3.0936045961612746, "kl": 0.080078125, "learning_rate": 9.98947013282782e-07, "loss": 0.0032, "reward": 2.120594024658203, "reward_std": 0.019680829718708992, "rewards/accuracy_reward": 0.9205937385559082, "rewards/format_reward": 1.0, "step": 1498 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.3125, "epoch": 0.02067557688859464, "grad_norm": 2.6038141207174923, "kl": 0.0830078125, "learning_rate": 9.989456074512537e-07, "loss": 0.0033, "reward": 2.0644373893737793, "reward_std": 0.021542729809880257, "rewards/accuracy_reward": 0.8644375205039978, "rewards/format_reward": 1.0, "step": 1499 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.25, "epoch": 0.020689369801795837, "grad_norm": 2.446497263212654, "kl": 0.09130859375, "learning_rate": 9.989442006828866e-07, "loss": 0.0037, "reward": 2.1802499294281006, "reward_std": 0.01756836473941803, "rewards/accuracy_reward": 0.9802500009536743, "rewards/format_reward": 1.0, "step": 1500 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.6875, "epoch": 0.020703162714997034, "grad_norm": 2.4412589030156613, "kl": 0.0771484375, "learning_rate": 9.98942792977683e-07, "loss": 0.0031, "reward": 2.081312656402588, "reward_std": 0.028154775500297546, "rewards/accuracy_reward": 0.8813124895095825, "rewards/format_reward": 1.0, "step": 1501 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 381.875, "epoch": 0.02071695562819823, "grad_norm": 1.9974109852631787, "kl": 0.07763671875, "learning_rate": 9.989413843356458e-07, "loss": 0.0031, "reward": 2.037781238555908, "reward_std": 0.0330815352499485, "rewards/accuracy_reward": 0.8440312743186951, "rewards/format_reward": 1.0, "step": 1502 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.875, "epoch": 0.02073074854139943, "grad_norm": 2.819648442168812, "kl": 0.0908203125, "learning_rate": 9.989399747567774e-07, "loss": 0.0036, "reward": 2.1148123741149902, "reward_std": 0.018034860491752625, "rewards/accuracy_reward": 0.9148125052452087, "rewards/format_reward": 1.0, "step": 1503 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.375, "epoch": 0.020744541454600626, "grad_norm": 17.664696052578673, "kl": 0.0771484375, "learning_rate": 9.989385642410806e-07, "loss": 0.0031, "reward": 2.1179685592651367, "reward_std": 0.038253627717494965, "rewards/accuracy_reward": 0.9242187738418579, "rewards/format_reward": 1.0, "step": 1504 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.25, "epoch": 0.020758334367801824, "grad_norm": 2.413352219735155, "kl": 0.080078125, "learning_rate": 9.989371527885582e-07, "loss": 0.0032, "reward": 2.0059375762939453, "reward_std": 0.0405728742480278, "rewards/accuracy_reward": 0.8059374690055847, "rewards/format_reward": 1.0, "step": 1505 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 376.375, "epoch": 0.02077212728100302, "grad_norm": 2.514804664128141, "kl": 0.07958984375, "learning_rate": 9.989357403992125e-07, "loss": 0.0032, "reward": 2.021353244781494, "reward_std": 0.04269827902317047, "rewards/accuracy_reward": 0.8276031017303467, "rewards/format_reward": 1.0, "step": 1506 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.84375, "epoch": 0.020785920194204218, "grad_norm": 2.4134918426041096, "kl": 0.08203125, "learning_rate": 9.989343270730466e-07, "loss": 0.0033, "reward": 1.898343801498413, "reward_std": 0.044561997056007385, "rewards/accuracy_reward": 0.7108438014984131, "rewards/format_reward": 1.0, "step": 1507 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.28125, "epoch": 0.020799713107405415, "grad_norm": 2.429433601102753, "kl": 0.07568359375, "learning_rate": 9.989329128100628e-07, "loss": 0.003, "reward": 2.0971250534057617, "reward_std": 0.030733685940504074, "rewards/accuracy_reward": 0.9033750295639038, "rewards/format_reward": 1.0, "step": 1508 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.34375, "epoch": 0.020813506020606613, "grad_norm": 2.4642712682123737, "kl": 0.0810546875, "learning_rate": 9.989314976102636e-07, "loss": 0.0032, "reward": 2.064687490463257, "reward_std": 0.021355709061026573, "rewards/accuracy_reward": 0.864687442779541, "rewards/format_reward": 1.0, "step": 1509 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.8125, "epoch": 0.02082729893380781, "grad_norm": 3.723201861583115, "kl": 0.0712890625, "learning_rate": 9.989300814736522e-07, "loss": 0.0028, "reward": 2.123906135559082, "reward_std": 0.037387050688266754, "rewards/accuracy_reward": 0.9301562905311584, "rewards/format_reward": 1.0, "step": 1510 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.020841091847009007, "grad_norm": 2.779478770399653, "kl": 0.0869140625, "learning_rate": 9.989286644002308e-07, "loss": 0.0035, "reward": 2.054593801498413, "reward_std": 0.03944680094718933, "rewards/accuracy_reward": 0.8608437180519104, "rewards/format_reward": 1.0, "step": 1511 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 371.03125, "epoch": 0.020854884760210204, "grad_norm": 3.104423079866213, "kl": 0.08251953125, "learning_rate": 9.989272463900024e-07, "loss": 0.0033, "reward": 2.1294689178466797, "reward_std": 0.0512034147977829, "rewards/accuracy_reward": 0.9419687986373901, "rewards/format_reward": 1.0, "step": 1512 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.71875, "epoch": 0.0208686776734114, "grad_norm": 3.0961399732755903, "kl": 0.083984375, "learning_rate": 9.989258274429693e-07, "loss": 0.0034, "reward": 1.995156168937683, "reward_std": 0.028719495981931686, "rewards/accuracy_reward": 0.7951561808586121, "rewards/format_reward": 1.0, "step": 1513 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.0625, "epoch": 0.0208824705866126, "grad_norm": 2.6640897284332556, "kl": 0.0810546875, "learning_rate": 9.989244075591345e-07, "loss": 0.0032, "reward": 2.100781202316284, "reward_std": 0.028524093329906464, "rewards/accuracy_reward": 0.9007812738418579, "rewards/format_reward": 1.0, "step": 1514 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.46875, "epoch": 0.020896263499813796, "grad_norm": 2.3801262206580476, "kl": 0.08154296875, "learning_rate": 9.989229867385003e-07, "loss": 0.0033, "reward": 2.1276001930236816, "reward_std": 0.020540155470371246, "rewards/accuracy_reward": 0.9276003241539001, "rewards/format_reward": 1.0, "step": 1515 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.90625, "epoch": 0.020910056413014994, "grad_norm": 5.495826883711522, "kl": 0.0830078125, "learning_rate": 9.989215649810697e-07, "loss": 0.0033, "reward": 2.0838751792907715, "reward_std": 0.03532567620277405, "rewards/accuracy_reward": 0.8838750123977661, "rewards/format_reward": 1.0, "step": 1516 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.0625, "epoch": 0.02092384932621619, "grad_norm": 2.9439855102347985, "kl": 0.0859375, "learning_rate": 9.989201422868455e-07, "loss": 0.0034, "reward": 2.0687501430511475, "reward_std": 0.020984133705496788, "rewards/accuracy_reward": 0.8687499165534973, "rewards/format_reward": 1.0, "step": 1517 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.3125, "epoch": 0.020937642239417388, "grad_norm": 2.8606934434411713, "kl": 0.0859375, "learning_rate": 9.989187186558299e-07, "loss": 0.0034, "reward": 2.0810000896453857, "reward_std": 0.03569136932492256, "rewards/accuracy_reward": 0.8810000419616699, "rewards/format_reward": 1.0, "step": 1518 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.03125, "epoch": 0.020951435152618585, "grad_norm": 2.8669744553649497, "kl": 0.09033203125, "learning_rate": 9.98917294088026e-07, "loss": 0.0036, "reward": 2.1474685668945312, "reward_std": 0.0178835391998291, "rewards/accuracy_reward": 0.9474687576293945, "rewards/format_reward": 1.0, "step": 1519 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.84375, "epoch": 0.020965228065819783, "grad_norm": 2.068133609218328, "kl": 0.08056640625, "learning_rate": 9.98915868583436e-07, "loss": 0.0032, "reward": 2.1226563453674316, "reward_std": 0.031642794609069824, "rewards/accuracy_reward": 0.928906261920929, "rewards/format_reward": 1.0, "step": 1520 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.9375, "epoch": 0.02097902097902098, "grad_norm": 2.226008061728883, "kl": 0.08447265625, "learning_rate": 9.989144421420628e-07, "loss": 0.0034, "reward": 2.137218713760376, "reward_std": 0.03068123385310173, "rewards/accuracy_reward": 0.9372187256813049, "rewards/format_reward": 1.0, "step": 1521 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.09375, "epoch": 0.020992813892222177, "grad_norm": 2.7235869240468302, "kl": 0.09619140625, "learning_rate": 9.989130147639092e-07, "loss": 0.0038, "reward": 2.0676562786102295, "reward_std": 0.024769969284534454, "rewards/accuracy_reward": 0.8676562309265137, "rewards/format_reward": 1.0, "step": 1522 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.5, "epoch": 0.021006606805423374, "grad_norm": 3.138919834922565, "kl": 0.083984375, "learning_rate": 9.98911586448978e-07, "loss": 0.0034, "reward": 2.1533126831054688, "reward_std": 0.019278643652796745, "rewards/accuracy_reward": 0.9533125162124634, "rewards/format_reward": 1.0, "step": 1523 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.5625, "epoch": 0.02102039971862457, "grad_norm": 3.0647395558481585, "kl": 0.091796875, "learning_rate": 9.989101571972716e-07, "loss": 0.0037, "reward": 2.104031562805176, "reward_std": 0.022861778736114502, "rewards/accuracy_reward": 0.9040312767028809, "rewards/format_reward": 1.0, "step": 1524 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.78125, "epoch": 0.02103419263182577, "grad_norm": 7.746727793069399, "kl": 0.09375, "learning_rate": 9.989087270087927e-07, "loss": 0.0038, "reward": 2.1284284591674805, "reward_std": 0.05125795304775238, "rewards/accuracy_reward": 0.9346784949302673, "rewards/format_reward": 1.0, "step": 1525 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.09375, "epoch": 0.021047985545026966, "grad_norm": 3.1206768926172566, "kl": 0.083984375, "learning_rate": 9.989072958835443e-07, "loss": 0.0034, "reward": 2.05078125, "reward_std": 0.026583846658468246, "rewards/accuracy_reward": 0.8507813215255737, "rewards/format_reward": 1.0, "step": 1526 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.125, "epoch": 0.021061778458228163, "grad_norm": 1.9482717232649231, "kl": 0.08251953125, "learning_rate": 9.989058638215286e-07, "loss": 0.0033, "reward": 2.0946874618530273, "reward_std": 0.02435433492064476, "rewards/accuracy_reward": 0.8946874141693115, "rewards/format_reward": 1.0, "step": 1527 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.9375, "epoch": 0.02107557137142936, "grad_norm": 1.8887445866403607, "kl": 0.09033203125, "learning_rate": 9.989044308227488e-07, "loss": 0.0036, "reward": 2.0756876468658447, "reward_std": 0.023495055735111237, "rewards/accuracy_reward": 0.8756874799728394, "rewards/format_reward": 1.0, "step": 1528 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.90625, "epoch": 0.021089364284630558, "grad_norm": 2.2610234204933506, "kl": 0.08203125, "learning_rate": 9.989029968872072e-07, "loss": 0.0033, "reward": 2.085343599319458, "reward_std": 0.036933641880750656, "rewards/accuracy_reward": 0.8915936946868896, "rewards/format_reward": 1.0, "step": 1529 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.34375, "epoch": 0.021103157197831755, "grad_norm": 2.8545865759476317, "kl": 0.0869140625, "learning_rate": 9.989015620149063e-07, "loss": 0.0035, "reward": 2.0868594646453857, "reward_std": 0.024989519268274307, "rewards/accuracy_reward": 0.8868594169616699, "rewards/format_reward": 1.0, "step": 1530 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.8125, "epoch": 0.021116950111032953, "grad_norm": 2.1157264923864005, "kl": 0.09375, "learning_rate": 9.989001262058495e-07, "loss": 0.0037, "reward": 2.1402812004089355, "reward_std": 0.012138539925217628, "rewards/accuracy_reward": 0.9402812719345093, "rewards/format_reward": 1.0, "step": 1531 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.59375, "epoch": 0.02113074302423415, "grad_norm": 2.017778594316767, "kl": 0.0869140625, "learning_rate": 9.988986894600391e-07, "loss": 0.0035, "reward": 2.113093852996826, "reward_std": 0.01661318726837635, "rewards/accuracy_reward": 0.9130936861038208, "rewards/format_reward": 1.0, "step": 1532 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.28125, "epoch": 0.021144535937435347, "grad_norm": 5.056252296759649, "kl": 0.087890625, "learning_rate": 9.988972517774777e-07, "loss": 0.0035, "reward": 2.0796875953674316, "reward_std": 0.023273160681128502, "rewards/accuracy_reward": 0.879687488079071, "rewards/format_reward": 1.0, "step": 1533 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.03125, "epoch": 0.021158328850636544, "grad_norm": 2.817885965012272, "kl": 0.0869140625, "learning_rate": 9.98895813158168e-07, "loss": 0.0035, "reward": 2.162374973297119, "reward_std": 0.01492896769195795, "rewards/accuracy_reward": 0.9623750448226929, "rewards/format_reward": 1.0, "step": 1534 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.02117212176383774, "grad_norm": 2.6637289325898323, "kl": 0.08740234375, "learning_rate": 9.988943736021128e-07, "loss": 0.0035, "reward": 1.9640939235687256, "reward_std": 0.020299989730119705, "rewards/accuracy_reward": 0.764093816280365, "rewards/format_reward": 1.0, "step": 1535 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.21875, "epoch": 0.02118591467703894, "grad_norm": 2.21331201887261, "kl": 0.08642578125, "learning_rate": 9.98892933109315e-07, "loss": 0.0035, "reward": 2.1415627002716064, "reward_std": 0.015297467820346355, "rewards/accuracy_reward": 0.9415625333786011, "rewards/format_reward": 1.0, "step": 1536 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.09375, "epoch": 0.021199707590240136, "grad_norm": 2.1446447253991785, "kl": 0.0810546875, "learning_rate": 9.988914916797772e-07, "loss": 0.0032, "reward": 2.1563124656677246, "reward_std": 0.016780951991677284, "rewards/accuracy_reward": 0.9563124775886536, "rewards/format_reward": 1.0, "step": 1537 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.5, "epoch": 0.021213500503441333, "grad_norm": 2.17152487773185, "kl": 0.080078125, "learning_rate": 9.988900493135018e-07, "loss": 0.0032, "reward": 2.1330623626708984, "reward_std": 0.014645039103925228, "rewards/accuracy_reward": 0.9330624938011169, "rewards/format_reward": 1.0, "step": 1538 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.25, "epoch": 0.02122729341664253, "grad_norm": 2.53137971321452, "kl": 0.08544921875, "learning_rate": 9.988886060104918e-07, "loss": 0.0034, "reward": 2.1524999141693115, "reward_std": 0.016380976885557175, "rewards/accuracy_reward": 0.9524999856948853, "rewards/format_reward": 1.0, "step": 1539 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.021241086329843728, "grad_norm": 3.2972521540024307, "kl": 0.0849609375, "learning_rate": 9.9888716177075e-07, "loss": 0.0034, "reward": 2.0991876125335693, "reward_std": 0.02933315560221672, "rewards/accuracy_reward": 0.899187445640564, "rewards/format_reward": 1.0, "step": 1540 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.5, "epoch": 0.021254879243044922, "grad_norm": 2.972382706822661, "kl": 0.0888671875, "learning_rate": 9.988857165942787e-07, "loss": 0.0036, "reward": 2.091531276702881, "reward_std": 0.022795991972088814, "rewards/accuracy_reward": 0.891531229019165, "rewards/format_reward": 1.0, "step": 1541 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.02126867215624612, "grad_norm": 6.029279464258887, "kl": 0.09326171875, "learning_rate": 9.98884270481081e-07, "loss": 0.0037, "reward": 2.1213126182556152, "reward_std": 0.021050382405519485, "rewards/accuracy_reward": 0.9213125109672546, "rewards/format_reward": 1.0, "step": 1542 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.71875, "epoch": 0.021282465069447316, "grad_norm": 4.856696953148061, "kl": 0.08203125, "learning_rate": 9.988828234311594e-07, "loss": 0.0033, "reward": 2.066850423812866, "reward_std": 0.024274926632642746, "rewards/accuracy_reward": 0.8668503165245056, "rewards/format_reward": 1.0, "step": 1543 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.125, "epoch": 0.021296257982648514, "grad_norm": 2.227717914949612, "kl": 0.083984375, "learning_rate": 9.98881375444517e-07, "loss": 0.0034, "reward": 2.162781238555908, "reward_std": 0.010355159640312195, "rewards/accuracy_reward": 0.9627813100814819, "rewards/format_reward": 1.0, "step": 1544 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.02131005089584971, "grad_norm": 3.5728001949392896, "kl": 0.07861328125, "learning_rate": 9.988799265211558e-07, "loss": 0.0031, "reward": 2.086062431335449, "reward_std": 0.027896970510482788, "rewards/accuracy_reward": 0.8860624432563782, "rewards/format_reward": 1.0, "step": 1545 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.1875, "epoch": 0.021323843809050908, "grad_norm": 2.10505269153072, "kl": 0.076171875, "learning_rate": 9.98878476661079e-07, "loss": 0.003, "reward": 2.0056562423706055, "reward_std": 0.01300123892724514, "rewards/accuracy_reward": 0.8056562542915344, "rewards/format_reward": 1.0, "step": 1546 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.4375, "epoch": 0.021337636722252105, "grad_norm": 3.313191567121275, "kl": 0.08056640625, "learning_rate": 9.988770258642896e-07, "loss": 0.0032, "reward": 2.024656295776367, "reward_std": 0.02149621583521366, "rewards/accuracy_reward": 0.8246562480926514, "rewards/format_reward": 1.0, "step": 1547 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.25, "epoch": 0.021351429635453303, "grad_norm": 2.7253332346190526, "kl": 0.0810546875, "learning_rate": 9.988755741307897e-07, "loss": 0.0032, "reward": 2.019312620162964, "reward_std": 0.02338443323969841, "rewards/accuracy_reward": 0.8193125128746033, "rewards/format_reward": 1.0, "step": 1548 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.75, "epoch": 0.0213652225486545, "grad_norm": 2.2993868047678974, "kl": 0.07763671875, "learning_rate": 9.988741214605823e-07, "loss": 0.0031, "reward": 2.093874931335449, "reward_std": 0.013927859254181385, "rewards/accuracy_reward": 0.893875002861023, "rewards/format_reward": 1.0, "step": 1549 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.78125, "epoch": 0.021379015461855697, "grad_norm": 3.218688257551977, "kl": 0.08447265625, "learning_rate": 9.988726678536702e-07, "loss": 0.0034, "reward": 2.112874984741211, "reward_std": 0.033188559114933014, "rewards/accuracy_reward": 0.9128749966621399, "rewards/format_reward": 1.0, "step": 1550 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.021392808375056895, "grad_norm": 5.225160932145518, "kl": 0.080078125, "learning_rate": 9.988712133100561e-07, "loss": 0.0032, "reward": 2.025177001953125, "reward_std": 0.04278388246893883, "rewards/accuracy_reward": 0.8251770734786987, "rewards/format_reward": 1.0, "step": 1551 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.03125, "epoch": 0.021406601288258092, "grad_norm": 7.388750952996294, "kl": 0.078125, "learning_rate": 9.988697578297426e-07, "loss": 0.0031, "reward": 1.8918238878250122, "reward_std": 0.07013053447008133, "rewards/accuracy_reward": 0.6918237805366516, "rewards/format_reward": 1.0, "step": 1552 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.9375, "epoch": 0.02142039420145929, "grad_norm": 2.0482494336792745, "kl": 0.08203125, "learning_rate": 9.988683014127327e-07, "loss": 0.0033, "reward": 2.0768749713897705, "reward_std": 0.012193598784506321, "rewards/accuracy_reward": 0.8768750429153442, "rewards/format_reward": 1.0, "step": 1553 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.53125, "epoch": 0.021434187114660486, "grad_norm": 1.9585988460520216, "kl": 0.0712890625, "learning_rate": 9.98866844059029e-07, "loss": 0.0029, "reward": 2.1084063053131104, "reward_std": 0.0719788521528244, "rewards/accuracy_reward": 0.9146562218666077, "rewards/format_reward": 1.0, "step": 1554 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.59375, "epoch": 0.021447980027861684, "grad_norm": 2.3520659607253167, "kl": 0.0751953125, "learning_rate": 9.988653857686339e-07, "loss": 0.003, "reward": 2.144406318664551, "reward_std": 0.014882270246744156, "rewards/accuracy_reward": 0.944406270980835, "rewards/format_reward": 1.0, "step": 1555 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.75, "epoch": 0.02146177294106288, "grad_norm": 2.06146643698252, "kl": 0.08251953125, "learning_rate": 9.988639265415508e-07, "loss": 0.0033, "reward": 2.0045623779296875, "reward_std": 0.014828743413090706, "rewards/accuracy_reward": 0.8045624494552612, "rewards/format_reward": 1.0, "step": 1556 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.375, "epoch": 0.021475565854264078, "grad_norm": 2.8216345989635174, "kl": 0.076171875, "learning_rate": 9.988624663777819e-07, "loss": 0.003, "reward": 2.0919063091278076, "reward_std": 0.047057442367076874, "rewards/accuracy_reward": 0.8919062614440918, "rewards/format_reward": 1.0, "step": 1557 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.15625, "epoch": 0.021489358767465275, "grad_norm": 4.570568080456706, "kl": 0.07958984375, "learning_rate": 9.9886100527733e-07, "loss": 0.0032, "reward": 2.027937412261963, "reward_std": 0.020956214517354965, "rewards/accuracy_reward": 0.8279374837875366, "rewards/format_reward": 1.0, "step": 1558 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.4375, "epoch": 0.021503151680666473, "grad_norm": 2.8117561792439165, "kl": 0.0810546875, "learning_rate": 9.988595432401982e-07, "loss": 0.0032, "reward": 2.113687515258789, "reward_std": 0.027239935472607613, "rewards/accuracy_reward": 0.9136874675750732, "rewards/format_reward": 1.0, "step": 1559 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.1875, "epoch": 0.02151694459386767, "grad_norm": 4.251963555947831, "kl": 0.07470703125, "learning_rate": 9.988580802663887e-07, "loss": 0.003, "reward": 2.114562511444092, "reward_std": 0.036585815250873566, "rewards/accuracy_reward": 0.9208124876022339, "rewards/format_reward": 1.0, "step": 1560 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.6875, "epoch": 0.021530737507068867, "grad_norm": 2.5219995177527545, "kl": 0.078125, "learning_rate": 9.988566163559049e-07, "loss": 0.0031, "reward": 2.0700626373291016, "reward_std": 0.02543047070503235, "rewards/accuracy_reward": 0.8700624704360962, "rewards/format_reward": 1.0, "step": 1561 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.90625, "epoch": 0.021544530420270065, "grad_norm": 5.526570331675459, "kl": 0.08154296875, "learning_rate": 9.98855151508749e-07, "loss": 0.0033, "reward": 2.047656297683716, "reward_std": 0.028637949377298355, "rewards/accuracy_reward": 0.84765625, "rewards/format_reward": 1.0, "step": 1562 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.4375, "epoch": 0.021558323333471262, "grad_norm": 3.0031201897300126, "kl": 0.07470703125, "learning_rate": 9.988536857249241e-07, "loss": 0.003, "reward": 2.0929062366485596, "reward_std": 0.03276754170656204, "rewards/accuracy_reward": 0.8991562128067017, "rewards/format_reward": 1.0, "step": 1563 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.84375, "epoch": 0.02157211624667246, "grad_norm": 1.983178904618438, "kl": 0.0859375, "learning_rate": 9.988522190044328e-07, "loss": 0.0034, "reward": 1.9270312786102295, "reward_std": 0.009660182520747185, "rewards/accuracy_reward": 0.7270312309265137, "rewards/format_reward": 1.0, "step": 1564 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.09375, "epoch": 0.021585909159873656, "grad_norm": 2.5858870856831278, "kl": 0.080078125, "learning_rate": 9.988507513472776e-07, "loss": 0.0032, "reward": 2.140899658203125, "reward_std": 0.012261909432709217, "rewards/accuracy_reward": 0.940899670124054, "rewards/format_reward": 1.0, "step": 1565 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.0625, "epoch": 0.021599702073074854, "grad_norm": 2.906625090866049, "kl": 0.0810546875, "learning_rate": 9.988492827534618e-07, "loss": 0.0033, "reward": 2.147156238555908, "reward_std": 0.014906302094459534, "rewards/accuracy_reward": 0.9471562504768372, "rewards/format_reward": 1.0, "step": 1566 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.8125, "epoch": 0.02161349498627605, "grad_norm": 2.5150170362313267, "kl": 0.07861328125, "learning_rate": 9.988478132229878e-07, "loss": 0.0031, "reward": 2.144124984741211, "reward_std": 0.03592435270547867, "rewards/accuracy_reward": 0.950374960899353, "rewards/format_reward": 1.0, "step": 1567 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.021627287899477248, "grad_norm": 3.3654542970723167, "kl": 0.07421875, "learning_rate": 9.988463427558584e-07, "loss": 0.003, "reward": 2.060187339782715, "reward_std": 0.024146951735019684, "rewards/accuracy_reward": 0.8601875305175781, "rewards/format_reward": 1.0, "step": 1568 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.71875, "epoch": 0.021641080812678445, "grad_norm": 3.6140742161554145, "kl": 0.07666015625, "learning_rate": 9.988448713520766e-07, "loss": 0.0031, "reward": 2.1519999504089355, "reward_std": 0.014973675832152367, "rewards/accuracy_reward": 0.9519999623298645, "rewards/format_reward": 1.0, "step": 1569 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.90625, "epoch": 0.021654873725879643, "grad_norm": 2.930493235792114, "kl": 0.080078125, "learning_rate": 9.988433990116446e-07, "loss": 0.0032, "reward": 2.0972187519073486, "reward_std": 0.01555462833493948, "rewards/accuracy_reward": 0.8972187638282776, "rewards/format_reward": 1.0, "step": 1570 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.90625, "epoch": 0.02166866663908084, "grad_norm": 3.499323046108649, "kl": 0.078125, "learning_rate": 9.98841925734566e-07, "loss": 0.0031, "reward": 2.034343719482422, "reward_std": 0.023150470107793808, "rewards/accuracy_reward": 0.8405937552452087, "rewards/format_reward": 1.0, "step": 1571 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.75, "epoch": 0.021682459552282037, "grad_norm": 2.2536647657995457, "kl": 0.0751953125, "learning_rate": 9.988404515208426e-07, "loss": 0.003, "reward": 2.067218780517578, "reward_std": 0.012301644310355186, "rewards/accuracy_reward": 0.8672187328338623, "rewards/format_reward": 1.0, "step": 1572 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.34375, "epoch": 0.021696252465483234, "grad_norm": 2.362339253643306, "kl": 0.078125, "learning_rate": 9.988389763704778e-07, "loss": 0.0031, "reward": 2.066531181335449, "reward_std": 0.016664590686559677, "rewards/accuracy_reward": 0.8665311932563782, "rewards/format_reward": 1.0, "step": 1573 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.0, "epoch": 0.021710045378684432, "grad_norm": 17.325235804524315, "kl": 0.07958984375, "learning_rate": 9.988375002834743e-07, "loss": 0.0032, "reward": 2.134531259536743, "reward_std": 0.01741349697113037, "rewards/accuracy_reward": 0.9345313310623169, "rewards/format_reward": 1.0, "step": 1574 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.125, "epoch": 0.02172383829188563, "grad_norm": 2.196936629027982, "kl": 0.08251953125, "learning_rate": 9.98836023259835e-07, "loss": 0.0033, "reward": 1.9609061479568481, "reward_std": 0.029735153540968895, "rewards/accuracy_reward": 0.7671563029289246, "rewards/format_reward": 1.0, "step": 1575 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.96875, "epoch": 0.021737631205086826, "grad_norm": 3.9764373039242655, "kl": 0.08251953125, "learning_rate": 9.988345452995622e-07, "loss": 0.0033, "reward": 2.1524062156677246, "reward_std": 0.015514640137553215, "rewards/accuracy_reward": 0.9524062871932983, "rewards/format_reward": 1.0, "step": 1576 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.3125, "epoch": 0.021751424118288024, "grad_norm": 5.140630134968848, "kl": 0.08837890625, "learning_rate": 9.988330664026589e-07, "loss": 0.0035, "reward": 2.1183438301086426, "reward_std": 0.024402325972914696, "rewards/accuracy_reward": 0.9183437824249268, "rewards/format_reward": 1.0, "step": 1577 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.40625, "epoch": 0.02176521703148922, "grad_norm": 3.699035372462222, "kl": 0.0771484375, "learning_rate": 9.98831586569128e-07, "loss": 0.0031, "reward": 2.1333439350128174, "reward_std": 0.018361661583185196, "rewards/accuracy_reward": 0.933343768119812, "rewards/format_reward": 1.0, "step": 1578 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.59375, "epoch": 0.021779009944690418, "grad_norm": 2.6309835563391295, "kl": 0.0869140625, "learning_rate": 9.988301057989724e-07, "loss": 0.0035, "reward": 2.1221251487731934, "reward_std": 0.021491248160600662, "rewards/accuracy_reward": 0.922124981880188, "rewards/format_reward": 1.0, "step": 1579 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.021792802857891615, "grad_norm": 2.161715625365343, "kl": 0.0693359375, "learning_rate": 9.988286240921946e-07, "loss": 0.0028, "reward": 2.1294686794281006, "reward_std": 0.012171508744359016, "rewards/accuracy_reward": 0.9294688105583191, "rewards/format_reward": 1.0, "step": 1580 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.8125, "epoch": 0.021806595771092813, "grad_norm": 2.5330929905394544, "kl": 0.0703125, "learning_rate": 9.988271414487974e-07, "loss": 0.0028, "reward": 2.0659375190734863, "reward_std": 0.015786802396178246, "rewards/accuracy_reward": 0.8659375309944153, "rewards/format_reward": 1.0, "step": 1581 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.5625, "epoch": 0.02182038868429401, "grad_norm": 5.226057624433558, "kl": 0.072265625, "learning_rate": 9.988256578687836e-07, "loss": 0.0029, "reward": 2.094562530517578, "reward_std": 0.01329224556684494, "rewards/accuracy_reward": 0.8945624828338623, "rewards/format_reward": 1.0, "step": 1582 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.28125, "epoch": 0.021834181597495207, "grad_norm": 2.0700563854025273, "kl": 0.07275390625, "learning_rate": 9.98824173352156e-07, "loss": 0.0029, "reward": 2.0830626487731934, "reward_std": 0.014956730417907238, "rewards/accuracy_reward": 0.883062481880188, "rewards/format_reward": 1.0, "step": 1583 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.15625, "epoch": 0.021847974510696404, "grad_norm": 1.7649587756278515, "kl": 0.076171875, "learning_rate": 9.988226878989176e-07, "loss": 0.003, "reward": 2.0845937728881836, "reward_std": 0.021020015701651573, "rewards/accuracy_reward": 0.8908436298370361, "rewards/format_reward": 1.0, "step": 1584 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.75, "epoch": 0.021861767423897602, "grad_norm": 1.5217863214531302, "kl": 0.07666015625, "learning_rate": 9.988212015090708e-07, "loss": 0.0031, "reward": 2.0310001373291016, "reward_std": 0.00146384141407907, "rewards/accuracy_reward": 0.831000030040741, "rewards/format_reward": 1.0, "step": 1585 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.46875, "epoch": 0.0218755603370988, "grad_norm": 8.103618418543745, "kl": 0.0703125, "learning_rate": 9.988197141826188e-07, "loss": 0.0028, "reward": 2.0257811546325684, "reward_std": 0.025264011695981026, "rewards/accuracy_reward": 0.8257813453674316, "rewards/format_reward": 1.0, "step": 1586 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.0, "epoch": 0.021889353250299996, "grad_norm": 2.332754055152769, "kl": 0.078125, "learning_rate": 9.988182259195642e-07, "loss": 0.0031, "reward": 2.0961251258850098, "reward_std": 0.03362717479467392, "rewards/accuracy_reward": 0.9086250066757202, "rewards/format_reward": 1.0, "step": 1587 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.65625, "epoch": 0.021903146163501194, "grad_norm": 2.6874105036521465, "kl": 0.07421875, "learning_rate": 9.988167367199096e-07, "loss": 0.003, "reward": 2.0760626792907715, "reward_std": 0.025703947991132736, "rewards/accuracy_reward": 0.8760625123977661, "rewards/format_reward": 1.0, "step": 1588 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.5625, "epoch": 0.02191693907670239, "grad_norm": 2.218049496575101, "kl": 0.0771484375, "learning_rate": 9.988152465836581e-07, "loss": 0.0031, "reward": 2.1312499046325684, "reward_std": 0.016386108472943306, "rewards/accuracy_reward": 0.9312500357627869, "rewards/format_reward": 1.0, "step": 1589 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.5, "epoch": 0.021930731989903588, "grad_norm": 3.282572888837597, "kl": 0.0751953125, "learning_rate": 9.988137555108124e-07, "loss": 0.003, "reward": 2.1472811698913574, "reward_std": 0.03093050792813301, "rewards/accuracy_reward": 0.9535312056541443, "rewards/format_reward": 1.0, "step": 1590 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.9375, "epoch": 0.021944524903104785, "grad_norm": 3.647449859136758, "kl": 0.07421875, "learning_rate": 9.988122635013753e-07, "loss": 0.003, "reward": 2.021718978881836, "reward_std": 0.03832070156931877, "rewards/accuracy_reward": 0.8279687762260437, "rewards/format_reward": 1.0, "step": 1591 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.5, "epoch": 0.021958317816305983, "grad_norm": 3.4390033611755206, "kl": 0.07373046875, "learning_rate": 9.988107705553495e-07, "loss": 0.0029, "reward": 1.9759063720703125, "reward_std": 0.027544600889086723, "rewards/accuracy_reward": 0.7759063243865967, "rewards/format_reward": 1.0, "step": 1592 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.125, "epoch": 0.02197211072950718, "grad_norm": 2.5797418416835933, "kl": 0.07373046875, "learning_rate": 9.98809276672738e-07, "loss": 0.003, "reward": 2.110781192779541, "reward_std": 0.018572986125946045, "rewards/accuracy_reward": 0.9107812643051147, "rewards/format_reward": 1.0, "step": 1593 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.8125, "epoch": 0.021985903642708377, "grad_norm": 2.1827013296789115, "kl": 0.08056640625, "learning_rate": 9.988077818535435e-07, "loss": 0.0032, "reward": 2.045562505722046, "reward_std": 0.031129637733101845, "rewards/accuracy_reward": 0.851812481880188, "rewards/format_reward": 1.0, "step": 1594 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.15625, "epoch": 0.021999696555909574, "grad_norm": 2.766342776421939, "kl": 0.07861328125, "learning_rate": 9.988062860977686e-07, "loss": 0.0031, "reward": 2.0461249351501465, "reward_std": 0.011592695489525795, "rewards/accuracy_reward": 0.8461249470710754, "rewards/format_reward": 1.0, "step": 1595 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.53125, "epoch": 0.02201348946911077, "grad_norm": 4.1656036893810775, "kl": 0.0712890625, "learning_rate": 9.988047894054164e-07, "loss": 0.0028, "reward": 2.0014686584472656, "reward_std": 0.006300115957856178, "rewards/accuracy_reward": 0.8014687299728394, "rewards/format_reward": 1.0, "step": 1596 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.21875, "epoch": 0.02202728238231197, "grad_norm": 1.9583783115355127, "kl": 0.08154296875, "learning_rate": 9.988032917764898e-07, "loss": 0.0033, "reward": 1.9821875095367432, "reward_std": 0.012056650593876839, "rewards/accuracy_reward": 0.7821875214576721, "rewards/format_reward": 1.0, "step": 1597 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.25, "epoch": 0.022041075295513166, "grad_norm": 3.5399851121544472, "kl": 0.07763671875, "learning_rate": 9.988017932109912e-07, "loss": 0.0031, "reward": 2.054874897003174, "reward_std": 0.020652741193771362, "rewards/accuracy_reward": 0.8548749685287476, "rewards/format_reward": 1.0, "step": 1598 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.03125, "epoch": 0.022054868208714364, "grad_norm": 4.678225616923067, "kl": 0.08544921875, "learning_rate": 9.988002937089238e-07, "loss": 0.0034, "reward": 2.0822813510894775, "reward_std": 0.015072397887706757, "rewards/accuracy_reward": 0.8822813034057617, "rewards/format_reward": 1.0, "step": 1599 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.71875, "epoch": 0.02206866112191556, "grad_norm": 2.5234663984438237, "kl": 0.07421875, "learning_rate": 9.987987932702902e-07, "loss": 0.003, "reward": 2.1270313262939453, "reward_std": 0.010079530067741871, "rewards/accuracy_reward": 0.9270312190055847, "rewards/format_reward": 1.0, "step": 1600 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.25, "epoch": 0.022082454035116758, "grad_norm": 2.0218743350459096, "kl": 0.06787109375, "learning_rate": 9.987972918950933e-07, "loss": 0.0027, "reward": 2.1231563091278076, "reward_std": 0.021622788161039352, "rewards/accuracy_reward": 0.9294062256813049, "rewards/format_reward": 1.0, "step": 1601 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.46875, "epoch": 0.022096246948317955, "grad_norm": 2.9313096507871768, "kl": 0.08203125, "learning_rate": 9.987957895833359e-07, "loss": 0.0033, "reward": 2.0399062633514404, "reward_std": 0.029750332236289978, "rewards/accuracy_reward": 0.8399062156677246, "rewards/format_reward": 1.0, "step": 1602 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.5625, "epoch": 0.022110039861519153, "grad_norm": 2.27480467227018, "kl": 0.07763671875, "learning_rate": 9.987942863350207e-07, "loss": 0.0031, "reward": 2.05009388923645, "reward_std": 0.016288653016090393, "rewards/accuracy_reward": 0.8500937819480896, "rewards/format_reward": 1.0, "step": 1603 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.125, "epoch": 0.02212383277472035, "grad_norm": 4.096795289350605, "kl": 0.0712890625, "learning_rate": 9.987927821501507e-07, "loss": 0.0028, "reward": 2.1626875400543213, "reward_std": 0.024405695497989655, "rewards/accuracy_reward": 0.9689375162124634, "rewards/format_reward": 1.0, "step": 1604 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.28125, "epoch": 0.022137625687921547, "grad_norm": 2.545853388278239, "kl": 0.0751953125, "learning_rate": 9.987912770287287e-07, "loss": 0.003, "reward": 2.1858439445495605, "reward_std": 0.02256779372692108, "rewards/accuracy_reward": 0.9920937418937683, "rewards/format_reward": 1.0, "step": 1605 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.21875, "epoch": 0.022151418601122744, "grad_norm": 5.533992855812773, "kl": 0.0712890625, "learning_rate": 9.987897709707576e-07, "loss": 0.0029, "reward": 2.010499954223633, "reward_std": 0.019612543284893036, "rewards/accuracy_reward": 0.8104999661445618, "rewards/format_reward": 1.0, "step": 1606 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.71875, "epoch": 0.02216521151432394, "grad_norm": 2.4654377695399012, "kl": 0.087890625, "learning_rate": 9.9878826397624e-07, "loss": 0.0035, "reward": 2.079312562942505, "reward_std": 0.01909957453608513, "rewards/accuracy_reward": 0.8793125152587891, "rewards/format_reward": 1.0, "step": 1607 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.09375, "epoch": 0.02217900442752514, "grad_norm": 3.64585957663724, "kl": 0.0771484375, "learning_rate": 9.987867560451787e-07, "loss": 0.0031, "reward": 2.0647499561309814, "reward_std": 0.023414820432662964, "rewards/accuracy_reward": 0.8710000514984131, "rewards/format_reward": 1.0, "step": 1608 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.46875, "epoch": 0.022192797340726336, "grad_norm": 2.8894580950626625, "kl": 0.0771484375, "learning_rate": 9.987852471775769e-07, "loss": 0.0031, "reward": 2.12850022315979, "reward_std": 0.038125794380903244, "rewards/accuracy_reward": 0.934749960899353, "rewards/format_reward": 1.0, "step": 1609 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.46875, "epoch": 0.022206590253927534, "grad_norm": 3.596958897335824, "kl": 0.080078125, "learning_rate": 9.987837373734372e-07, "loss": 0.0032, "reward": 2.0559375286102295, "reward_std": 0.01801699586212635, "rewards/accuracy_reward": 0.8559374809265137, "rewards/format_reward": 1.0, "step": 1610 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.4375, "epoch": 0.02222038316712873, "grad_norm": 3.916782219349192, "kl": 0.0771484375, "learning_rate": 9.987822266327624e-07, "loss": 0.0031, "reward": 2.0479373931884766, "reward_std": 0.017862647771835327, "rewards/accuracy_reward": 0.8479374647140503, "rewards/format_reward": 1.0, "step": 1611 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.0, "epoch": 0.022234176080329928, "grad_norm": 2.0505635447726283, "kl": 0.06982421875, "learning_rate": 9.987807149555554e-07, "loss": 0.0028, "reward": 2.1303749084472656, "reward_std": 0.023144150152802467, "rewards/accuracy_reward": 0.9303749203681946, "rewards/format_reward": 1.0, "step": 1612 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.0, "epoch": 0.022247968993531125, "grad_norm": 3.2604147332214755, "kl": 0.068359375, "learning_rate": 9.987792023418189e-07, "loss": 0.0027, "reward": 2.0581562519073486, "reward_std": 0.017807789146900177, "rewards/accuracy_reward": 0.8581562638282776, "rewards/format_reward": 1.0, "step": 1613 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.78125, "epoch": 0.022261761906732323, "grad_norm": 2.3481618660544084, "kl": 0.07177734375, "learning_rate": 9.98777688791556e-07, "loss": 0.0029, "reward": 2.0974998474121094, "reward_std": 0.02573578618466854, "rewards/accuracy_reward": 0.903749942779541, "rewards/format_reward": 1.0, "step": 1614 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.6875, "epoch": 0.02227555481993352, "grad_norm": 2.456537182121554, "kl": 0.0673828125, "learning_rate": 9.987761743047693e-07, "loss": 0.0027, "reward": 2.080843925476074, "reward_std": 0.012926635332405567, "rewards/accuracy_reward": 0.8808437585830688, "rewards/format_reward": 1.0, "step": 1615 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.0, "epoch": 0.022289347733134714, "grad_norm": 1.9540094434596145, "kl": 0.06494140625, "learning_rate": 9.98774658881462e-07, "loss": 0.0026, "reward": 2.0770626068115234, "reward_std": 0.011614246293902397, "rewards/accuracy_reward": 0.8770624995231628, "rewards/format_reward": 1.0, "step": 1616 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.21875, "epoch": 0.02230314064633591, "grad_norm": 3.468532753757021, "kl": 0.0703125, "learning_rate": 9.987731425216364e-07, "loss": 0.0028, "reward": 2.042187452316284, "reward_std": 0.02053939737379551, "rewards/accuracy_reward": 0.8421874046325684, "rewards/format_reward": 1.0, "step": 1617 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.625, "epoch": 0.022316933559537108, "grad_norm": 2.2356802284425163, "kl": 0.068359375, "learning_rate": 9.987716252252957e-07, "loss": 0.0027, "reward": 2.0247654914855957, "reward_std": 0.025083305314183235, "rewards/accuracy_reward": 0.8310155868530273, "rewards/format_reward": 1.0, "step": 1618 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.625, "epoch": 0.022330726472738305, "grad_norm": 2.40069343630425, "kl": 0.06787109375, "learning_rate": 9.987701069924427e-07, "loss": 0.0027, "reward": 2.1070001125335693, "reward_std": 0.014938032254576683, "rewards/accuracy_reward": 0.906999945640564, "rewards/format_reward": 1.0, "step": 1619 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.0625, "epoch": 0.022344519385939503, "grad_norm": 3.195540825530226, "kl": 0.0771484375, "learning_rate": 9.987685878230803e-07, "loss": 0.0031, "reward": 2.016718864440918, "reward_std": 0.02631540596485138, "rewards/accuracy_reward": 0.8229687213897705, "rewards/format_reward": 1.0, "step": 1620 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.4375, "epoch": 0.0223583122991407, "grad_norm": 3.6386246575180516, "kl": 0.068359375, "learning_rate": 9.987670677172113e-07, "loss": 0.0027, "reward": 2.091437339782715, "reward_std": 0.033959440886974335, "rewards/accuracy_reward": 0.8914374709129333, "rewards/format_reward": 1.0, "step": 1621 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.78125, "epoch": 0.022372105212341897, "grad_norm": 2.6455846219595873, "kl": 0.078125, "learning_rate": 9.987655466748384e-07, "loss": 0.0031, "reward": 2.0830626487731934, "reward_std": 0.032215796411037445, "rewards/accuracy_reward": 0.8893125057220459, "rewards/format_reward": 1.0, "step": 1622 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.75, "epoch": 0.022385898125543095, "grad_norm": 1.7424033586725158, "kl": 0.0751953125, "learning_rate": 9.987640246959648e-07, "loss": 0.003, "reward": 2.024531364440918, "reward_std": 0.021746691316366196, "rewards/accuracy_reward": 0.8307812213897705, "rewards/format_reward": 1.0, "step": 1623 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.03125, "epoch": 0.022399691038744292, "grad_norm": 2.4642272376215315, "kl": 0.0693359375, "learning_rate": 9.987625017805933e-07, "loss": 0.0028, "reward": 2.1683435440063477, "reward_std": 0.009530635550618172, "rewards/accuracy_reward": 0.9683437347412109, "rewards/format_reward": 1.0, "step": 1624 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.65625, "epoch": 0.02241348395194549, "grad_norm": 2.7652539374980445, "kl": 0.06640625, "learning_rate": 9.987609779287262e-07, "loss": 0.0027, "reward": 2.096062660217285, "reward_std": 0.016550809144973755, "rewards/accuracy_reward": 0.8960625529289246, "rewards/format_reward": 1.0, "step": 1625 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.0625, "epoch": 0.022427276865146686, "grad_norm": 4.588052411991899, "kl": 0.07861328125, "learning_rate": 9.98759453140367e-07, "loss": 0.0031, "reward": 2.0502500534057617, "reward_std": 0.011521900072693825, "rewards/accuracy_reward": 0.8502499461174011, "rewards/format_reward": 1.0, "step": 1626 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.84375, "epoch": 0.022441069778347884, "grad_norm": 4.2124360996096115, "kl": 0.0693359375, "learning_rate": 9.987579274155182e-07, "loss": 0.0028, "reward": 2.1075310707092285, "reward_std": 0.013847069814801216, "rewards/accuracy_reward": 0.9075312614440918, "rewards/format_reward": 1.0, "step": 1627 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.6875, "epoch": 0.02245486269154908, "grad_norm": 5.426314357669005, "kl": 0.0791015625, "learning_rate": 9.987564007541832e-07, "loss": 0.0032, "reward": 1.97475004196167, "reward_std": 0.03231143206357956, "rewards/accuracy_reward": 0.7810000777244568, "rewards/format_reward": 1.0, "step": 1628 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.0, "epoch": 0.022468655604750278, "grad_norm": 3.5415681066205216, "kl": 0.078125, "learning_rate": 9.98754873156364e-07, "loss": 0.0031, "reward": 2.0370936393737793, "reward_std": 0.03552011772990227, "rewards/accuracy_reward": 0.8433436751365662, "rewards/format_reward": 1.0, "step": 1629 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.34375, "epoch": 0.022482448517951475, "grad_norm": 4.3135285231021445, "kl": 0.06787109375, "learning_rate": 9.987533446220642e-07, "loss": 0.0027, "reward": 2.1143438816070557, "reward_std": 0.010626059956848621, "rewards/accuracy_reward": 0.9143437147140503, "rewards/format_reward": 1.0, "step": 1630 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.1875, "epoch": 0.022496241431152673, "grad_norm": 2.161792505344043, "kl": 0.07763671875, "learning_rate": 9.987518151512863e-07, "loss": 0.0031, "reward": 1.8996875286102295, "reward_std": 0.011843464337289333, "rewards/accuracy_reward": 0.6996874809265137, "rewards/format_reward": 1.0, "step": 1631 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.09375, "epoch": 0.02251003434435387, "grad_norm": 2.639045801220504, "kl": 0.072265625, "learning_rate": 9.987502847440335e-07, "loss": 0.0029, "reward": 2.098968982696533, "reward_std": 0.013721741735935211, "rewards/accuracy_reward": 0.8989687561988831, "rewards/format_reward": 1.0, "step": 1632 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.71875, "epoch": 0.022523827257555067, "grad_norm": 8.081977097473715, "kl": 0.0712890625, "learning_rate": 9.987487534003082e-07, "loss": 0.0028, "reward": 1.9724375009536743, "reward_std": 0.01910484954714775, "rewards/accuracy_reward": 0.7724375128746033, "rewards/format_reward": 1.0, "step": 1633 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.71875, "epoch": 0.022537620170756265, "grad_norm": 3.5172905275188038, "kl": 0.0791015625, "learning_rate": 9.987472211201139e-07, "loss": 0.0032, "reward": 2.007406234741211, "reward_std": 0.016517942771315575, "rewards/accuracy_reward": 0.8074062466621399, "rewards/format_reward": 1.0, "step": 1634 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.875, "epoch": 0.022551413083957462, "grad_norm": 26.11986204609202, "kl": 0.0751953125, "learning_rate": 9.987456879034526e-07, "loss": 0.003, "reward": 2.1055314540863037, "reward_std": 0.024459335952997208, "rewards/accuracy_reward": 0.9055312871932983, "rewards/format_reward": 1.0, "step": 1635 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.1875, "epoch": 0.02256520599715866, "grad_norm": 4.090719347099288, "kl": 0.07373046875, "learning_rate": 9.98744153750328e-07, "loss": 0.003, "reward": 2.1271252632141113, "reward_std": 0.03258293867111206, "rewards/accuracy_reward": 0.9333750009536743, "rewards/format_reward": 1.0, "step": 1636 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.34375, "epoch": 0.022578998910359856, "grad_norm": 2.976081529762625, "kl": 0.07958984375, "learning_rate": 9.987426186607425e-07, "loss": 0.0032, "reward": 2.054281234741211, "reward_std": 0.025163572281599045, "rewards/accuracy_reward": 0.8542812466621399, "rewards/format_reward": 1.0, "step": 1637 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.46875, "epoch": 0.022592791823561054, "grad_norm": 3.759704003109648, "kl": 0.07080078125, "learning_rate": 9.987410826346994e-07, "loss": 0.0028, "reward": 2.0804061889648438, "reward_std": 0.01303371973335743, "rewards/accuracy_reward": 0.8804062604904175, "rewards/format_reward": 1.0, "step": 1638 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.1875, "epoch": 0.02260658473676225, "grad_norm": 4.429015151705762, "kl": 0.07568359375, "learning_rate": 9.987395456722012e-07, "loss": 0.003, "reward": 1.9929375648498535, "reward_std": 0.011019095778465271, "rewards/accuracy_reward": 0.7929374575614929, "rewards/format_reward": 1.0, "step": 1639 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.28125, "epoch": 0.022620377649963448, "grad_norm": 2.672817395978639, "kl": 0.06982421875, "learning_rate": 9.98738007773251e-07, "loss": 0.0028, "reward": 2.138406276702881, "reward_std": 0.035480089485645294, "rewards/accuracy_reward": 0.9571563005447388, "rewards/format_reward": 1.0, "step": 1640 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.78125, "epoch": 0.022634170563164645, "grad_norm": 2.4441750624262193, "kl": 0.06640625, "learning_rate": 9.987364689378514e-07, "loss": 0.0027, "reward": 2.139937400817871, "reward_std": 0.04453915357589722, "rewards/accuracy_reward": 0.9524375200271606, "rewards/format_reward": 1.0, "step": 1641 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.5625, "epoch": 0.022647963476365843, "grad_norm": 5.124465866373513, "kl": 0.07080078125, "learning_rate": 9.987349291660058e-07, "loss": 0.0028, "reward": 2.0469062328338623, "reward_std": 0.0361442007124424, "rewards/accuracy_reward": 0.8531562089920044, "rewards/format_reward": 1.0, "step": 1642 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.46875, "epoch": 0.02266175638956704, "grad_norm": 2.353584137383195, "kl": 0.0712890625, "learning_rate": 9.987333884577165e-07, "loss": 0.0028, "reward": 1.9849374294281006, "reward_std": 0.009978788904845715, "rewards/accuracy_reward": 0.7849375605583191, "rewards/format_reward": 1.0, "step": 1643 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.0625, "epoch": 0.022675549302768237, "grad_norm": 2.1660351809070515, "kl": 0.06494140625, "learning_rate": 9.987318468129869e-07, "loss": 0.0026, "reward": 2.0594310760498047, "reward_std": 0.02263607829809189, "rewards/accuracy_reward": 0.865680992603302, "rewards/format_reward": 1.0, "step": 1644 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.78125, "epoch": 0.022689342215969435, "grad_norm": 5.839559912600374, "kl": 0.07763671875, "learning_rate": 9.987303042318197e-07, "loss": 0.0031, "reward": 2.0973124504089355, "reward_std": 0.015562057495117188, "rewards/accuracy_reward": 0.8973124623298645, "rewards/format_reward": 1.0, "step": 1645 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.21875, "epoch": 0.022703135129170632, "grad_norm": 3.1852166384659486, "kl": 0.068359375, "learning_rate": 9.987287607142175e-07, "loss": 0.0027, "reward": 2.091437339782715, "reward_std": 0.011914307251572609, "rewards/accuracy_reward": 0.8914375305175781, "rewards/format_reward": 1.0, "step": 1646 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.4375, "epoch": 0.02271692804237183, "grad_norm": 3.2110001823347436, "kl": 0.0673828125, "learning_rate": 9.987272162601836e-07, "loss": 0.0027, "reward": 2.0966248512268066, "reward_std": 0.02088780142366886, "rewards/accuracy_reward": 0.8966249227523804, "rewards/format_reward": 1.0, "step": 1647 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.03125, "epoch": 0.022730720955573026, "grad_norm": 2.0785619253009116, "kl": 0.0703125, "learning_rate": 9.98725670869721e-07, "loss": 0.0028, "reward": 2.1183531284332275, "reward_std": 0.015527507290244102, "rewards/accuracy_reward": 0.9183530807495117, "rewards/format_reward": 1.0, "step": 1648 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.34375, "epoch": 0.022744513868774224, "grad_norm": 2.7673417700482394, "kl": 0.07080078125, "learning_rate": 9.987241245428322e-07, "loss": 0.0028, "reward": 1.9821875095367432, "reward_std": 0.012552608735859394, "rewards/accuracy_reward": 0.7821875214576721, "rewards/format_reward": 1.0, "step": 1649 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.875, "epoch": 0.02275830678197542, "grad_norm": 33.221404162866214, "kl": 0.07080078125, "learning_rate": 9.987225772795201e-07, "loss": 0.0028, "reward": 2.113093614578247, "reward_std": 0.026470482349395752, "rewards/accuracy_reward": 0.9193437099456787, "rewards/format_reward": 1.0, "step": 1650 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.4375, "epoch": 0.022772099695176618, "grad_norm": 2.7226735814247207, "kl": 0.06494140625, "learning_rate": 9.98721029079788e-07, "loss": 0.0026, "reward": 2.0960001945495605, "reward_std": 0.013849593698978424, "rewards/accuracy_reward": 0.8959999680519104, "rewards/format_reward": 1.0, "step": 1651 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.03125, "epoch": 0.022785892608377815, "grad_norm": 2.0815498159063437, "kl": 0.072265625, "learning_rate": 9.987194799436386e-07, "loss": 0.0029, "reward": 2.0556564331054688, "reward_std": 0.01970680058002472, "rewards/accuracy_reward": 0.8556562662124634, "rewards/format_reward": 1.0, "step": 1652 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.90625, "epoch": 0.022799685521579013, "grad_norm": 1.9594509009400654, "kl": 0.07373046875, "learning_rate": 9.98717929871075e-07, "loss": 0.0029, "reward": 2.1013436317443848, "reward_std": 0.01352996937930584, "rewards/accuracy_reward": 0.9013437032699585, "rewards/format_reward": 1.0, "step": 1653 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.5, "epoch": 0.02281347843478021, "grad_norm": 2.070654489749832, "kl": 0.06640625, "learning_rate": 9.987163788620995e-07, "loss": 0.0027, "reward": 2.1234688758850098, "reward_std": 0.011304449290037155, "rewards/accuracy_reward": 0.923468828201294, "rewards/format_reward": 1.0, "step": 1654 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.022827271347981407, "grad_norm": 3.2764125669868234, "kl": 0.0693359375, "learning_rate": 9.987148269167157e-07, "loss": 0.0028, "reward": 2.1664376258850098, "reward_std": 0.015318438410758972, "rewards/accuracy_reward": 0.9664375185966492, "rewards/format_reward": 1.0, "step": 1655 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.40625, "epoch": 0.022841064261182605, "grad_norm": 1.8631941140950767, "kl": 0.0634765625, "learning_rate": 9.98713274034926e-07, "loss": 0.0025, "reward": 2.152937650680542, "reward_std": 0.012151816859841347, "rewards/accuracy_reward": 0.9529374837875366, "rewards/format_reward": 1.0, "step": 1656 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.84375, "epoch": 0.022854857174383802, "grad_norm": 11.236525264633915, "kl": 0.06298828125, "learning_rate": 9.987117202167337e-07, "loss": 0.0025, "reward": 2.1085000038146973, "reward_std": 0.010128715075552464, "rewards/accuracy_reward": 0.908500075340271, "rewards/format_reward": 1.0, "step": 1657 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.28125, "epoch": 0.022868650087585, "grad_norm": 2.313797345428018, "kl": 0.0712890625, "learning_rate": 9.987101654621416e-07, "loss": 0.0028, "reward": 2.087156295776367, "reward_std": 0.031427495181560516, "rewards/accuracy_reward": 0.8934062719345093, "rewards/format_reward": 1.0, "step": 1658 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.6875, "epoch": 0.022882443000786196, "grad_norm": 2.132083557201774, "kl": 0.07177734375, "learning_rate": 9.987086097711524e-07, "loss": 0.0029, "reward": 1.9529376029968262, "reward_std": 0.03312106430530548, "rewards/accuracy_reward": 0.7591875195503235, "rewards/format_reward": 1.0, "step": 1659 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.40625, "epoch": 0.022896235913987394, "grad_norm": 2.8942070323549114, "kl": 0.0654296875, "learning_rate": 9.987070531437694e-07, "loss": 0.0026, "reward": 1.9530937671661377, "reward_std": 0.018820933997631073, "rewards/accuracy_reward": 0.7530937194824219, "rewards/format_reward": 1.0, "step": 1660 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.40625, "epoch": 0.02291002882718859, "grad_norm": 2.655399528869471, "kl": 0.0732421875, "learning_rate": 9.987054955799952e-07, "loss": 0.0029, "reward": 2.097249984741211, "reward_std": 0.01258598268032074, "rewards/accuracy_reward": 0.8972499370574951, "rewards/format_reward": 1.0, "step": 1661 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.125, "epoch": 0.022923821740389788, "grad_norm": 4.205156707019659, "kl": 0.07421875, "learning_rate": 9.987039370798331e-07, "loss": 0.003, "reward": 2.1006250381469727, "reward_std": 0.01906784437596798, "rewards/accuracy_reward": 0.9006250500679016, "rewards/format_reward": 1.0, "step": 1662 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.875, "epoch": 0.022937614653590985, "grad_norm": 3.02287594681577, "kl": 0.07177734375, "learning_rate": 9.987023776432854e-07, "loss": 0.0029, "reward": 2.1303436756134033, "reward_std": 0.029743749648332596, "rewards/accuracy_reward": 0.9365937113761902, "rewards/format_reward": 1.0, "step": 1663 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.21875, "epoch": 0.022951407566792183, "grad_norm": 1.8434933484424019, "kl": 0.072265625, "learning_rate": 9.987008172703557e-07, "loss": 0.0029, "reward": 2.145343780517578, "reward_std": 0.0064971670508384705, "rewards/accuracy_reward": 0.9453437924385071, "rewards/format_reward": 1.0, "step": 1664 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.78125, "epoch": 0.02296520047999338, "grad_norm": 3.322838632058154, "kl": 0.0693359375, "learning_rate": 9.986992559610466e-07, "loss": 0.0028, "reward": 2.0964531898498535, "reward_std": 0.028336143121123314, "rewards/accuracy_reward": 0.9027031064033508, "rewards/format_reward": 1.0, "step": 1665 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.75, "epoch": 0.022978993393194577, "grad_norm": 2.218958646524337, "kl": 0.076171875, "learning_rate": 9.98697693715361e-07, "loss": 0.0031, "reward": 2.1302499771118164, "reward_std": 0.024848777800798416, "rewards/accuracy_reward": 0.9365000128746033, "rewards/format_reward": 1.0, "step": 1666 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.875, "epoch": 0.022992786306395775, "grad_norm": 2.373010030560566, "kl": 0.0673828125, "learning_rate": 9.98696130533302e-07, "loss": 0.0027, "reward": 2.1166560649871826, "reward_std": 0.00951197650283575, "rewards/accuracy_reward": 0.9166562557220459, "rewards/format_reward": 1.0, "step": 1667 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.4375, "epoch": 0.023006579219596972, "grad_norm": 2.108639863049619, "kl": 0.06640625, "learning_rate": 9.986945664148723e-07, "loss": 0.0027, "reward": 2.121509552001953, "reward_std": 0.03110520914196968, "rewards/accuracy_reward": 0.9277593493461609, "rewards/format_reward": 1.0, "step": 1668 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 440.53125, "epoch": 0.02302037213279817, "grad_norm": 5.028493232761288, "kl": 0.06494140625, "learning_rate": 9.986930013600752e-07, "loss": 0.0026, "reward": 2.084437370300293, "reward_std": 0.029759516939520836, "rewards/accuracy_reward": 0.8906875252723694, "rewards/format_reward": 1.0, "step": 1669 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.03125, "epoch": 0.023034165045999366, "grad_norm": 1.9456311677734903, "kl": 0.07763671875, "learning_rate": 9.986914353689134e-07, "loss": 0.0031, "reward": 2.120187759399414, "reward_std": 0.008283843286335468, "rewards/accuracy_reward": 0.9201874732971191, "rewards/format_reward": 1.0, "step": 1670 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.34375, "epoch": 0.023047957959200564, "grad_norm": 2.8738681609739443, "kl": 0.068359375, "learning_rate": 9.986898684413897e-07, "loss": 0.0027, "reward": 2.071624994277954, "reward_std": 0.018431399017572403, "rewards/accuracy_reward": 0.8716249465942383, "rewards/format_reward": 1.0, "step": 1671 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.25, "epoch": 0.02306175087240176, "grad_norm": 3.3355956777241453, "kl": 0.0673828125, "learning_rate": 9.98688300577507e-07, "loss": 0.0027, "reward": 2.13685941696167, "reward_std": 0.011823462322354317, "rewards/accuracy_reward": 0.9368594288825989, "rewards/format_reward": 1.0, "step": 1672 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.8125, "epoch": 0.023075543785602958, "grad_norm": 2.2224807510423186, "kl": 0.0693359375, "learning_rate": 9.986867317772689e-07, "loss": 0.0028, "reward": 2.0652291774749756, "reward_std": 0.01932460255920887, "rewards/accuracy_reward": 0.8652291893959045, "rewards/format_reward": 1.0, "step": 1673 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.0625, "epoch": 0.023089336698804155, "grad_norm": 2.85276188427479, "kl": 0.06591796875, "learning_rate": 9.986851620406775e-07, "loss": 0.0026, "reward": 2.0309062004089355, "reward_std": 0.012519195675849915, "rewards/accuracy_reward": 0.8309062719345093, "rewards/format_reward": 1.0, "step": 1674 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 422.21875, "epoch": 0.023103129612005353, "grad_norm": 3.1535972543580346, "kl": 0.0703125, "learning_rate": 9.986835913677363e-07, "loss": 0.0028, "reward": 2.0654375553131104, "reward_std": 0.016368605196475983, "rewards/accuracy_reward": 0.8654374480247498, "rewards/format_reward": 1.0, "step": 1675 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.84375, "epoch": 0.02311692252520655, "grad_norm": 1.4633776138349617, "kl": 0.07373046875, "learning_rate": 9.986820197584483e-07, "loss": 0.003, "reward": 2.0888750553131104, "reward_std": 0.004317393526434898, "rewards/accuracy_reward": 0.8888750076293945, "rewards/format_reward": 1.0, "step": 1676 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.0, "epoch": 0.023130715438407747, "grad_norm": 3.20145105519077, "kl": 0.0712890625, "learning_rate": 9.98680447212816e-07, "loss": 0.0028, "reward": 2.0140624046325684, "reward_std": 0.022028621286153793, "rewards/accuracy_reward": 0.8140624761581421, "rewards/format_reward": 1.0, "step": 1677 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.59375, "epoch": 0.023144508351608944, "grad_norm": 8.767472568874263, "kl": 0.072265625, "learning_rate": 9.986788737308425e-07, "loss": 0.0029, "reward": 2.1104063987731934, "reward_std": 0.013903023675084114, "rewards/accuracy_reward": 0.910406231880188, "rewards/format_reward": 1.0, "step": 1678 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.15625, "epoch": 0.023158301264810142, "grad_norm": 6.064668822241957, "kl": 0.07470703125, "learning_rate": 9.98677299312531e-07, "loss": 0.003, "reward": 2.0306248664855957, "reward_std": 0.016842491924762726, "rewards/accuracy_reward": 0.8306249976158142, "rewards/format_reward": 1.0, "step": 1679 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.3125, "epoch": 0.02317209417801134, "grad_norm": 5.605289672548259, "kl": 0.07861328125, "learning_rate": 9.986757239578841e-07, "loss": 0.0031, "reward": 2.0912375450134277, "reward_std": 0.009705105796456337, "rewards/accuracy_reward": 0.8912374973297119, "rewards/format_reward": 1.0, "step": 1680 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.023185887091212536, "grad_norm": 6.283973648744672, "kl": 0.0634765625, "learning_rate": 9.986741476669052e-07, "loss": 0.0026, "reward": 2.162749767303467, "reward_std": 0.04761098325252533, "rewards/accuracy_reward": 0.9752500057220459, "rewards/format_reward": 1.0, "step": 1681 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.0, "epoch": 0.023199680004413734, "grad_norm": 7.43221628672166, "kl": 0.06982421875, "learning_rate": 9.98672570439597e-07, "loss": 0.0028, "reward": 2.0488126277923584, "reward_std": 0.01502240914851427, "rewards/accuracy_reward": 0.8488125801086426, "rewards/format_reward": 1.0, "step": 1682 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.875, "epoch": 0.02321347291761493, "grad_norm": 2.6111387040900205, "kl": 0.07421875, "learning_rate": 9.986709922759624e-07, "loss": 0.003, "reward": 2.1423094272613525, "reward_std": 0.00808392558246851, "rewards/accuracy_reward": 0.9423093199729919, "rewards/format_reward": 1.0, "step": 1683 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 426.28125, "epoch": 0.023227265830816128, "grad_norm": 2.7536377421888663, "kl": 0.072265625, "learning_rate": 9.986694131760045e-07, "loss": 0.0029, "reward": 2.117374897003174, "reward_std": 0.013065105304121971, "rewards/accuracy_reward": 0.9173749685287476, "rewards/format_reward": 1.0, "step": 1684 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 428.5, "epoch": 0.023241058744017325, "grad_norm": 1.5389553475223623, "kl": 0.0703125, "learning_rate": 9.98667833139726e-07, "loss": 0.0028, "reward": 2.1069374084472656, "reward_std": 0.004994590766727924, "rewards/accuracy_reward": 0.9069375395774841, "rewards/format_reward": 1.0, "step": 1685 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.1875, "epoch": 0.023254851657218523, "grad_norm": 4.479515165985387, "kl": 0.080078125, "learning_rate": 9.986662521671302e-07, "loss": 0.0032, "reward": 2.1265311241149902, "reward_std": 0.013859818689525127, "rewards/accuracy_reward": 0.9265313148498535, "rewards/format_reward": 1.0, "step": 1686 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.4375, "epoch": 0.02326864457041972, "grad_norm": 2.503294160586001, "kl": 0.076171875, "learning_rate": 9.986646702582201e-07, "loss": 0.003, "reward": 2.153937339782715, "reward_std": 0.028295788913965225, "rewards/accuracy_reward": 0.960187554359436, "rewards/format_reward": 1.0, "step": 1687 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.5625, "epoch": 0.023282437483620917, "grad_norm": 4.441688203165198, "kl": 0.07373046875, "learning_rate": 9.986630874129983e-07, "loss": 0.0029, "reward": 2.115884304046631, "reward_std": 0.017986608669161797, "rewards/accuracy_reward": 0.9158843755722046, "rewards/format_reward": 1.0, "step": 1688 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.875, "epoch": 0.023296230396822114, "grad_norm": 2.807887076029559, "kl": 0.0673828125, "learning_rate": 9.986615036314682e-07, "loss": 0.0027, "reward": 2.0740625858306885, "reward_std": 0.008702855557203293, "rewards/accuracy_reward": 0.8740624785423279, "rewards/format_reward": 1.0, "step": 1689 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.65625, "epoch": 0.023310023310023312, "grad_norm": 2.3830464551744925, "kl": 0.07568359375, "learning_rate": 9.986599189136323e-07, "loss": 0.003, "reward": 2.084843635559082, "reward_std": 0.013038131408393383, "rewards/accuracy_reward": 0.8848437666893005, "rewards/format_reward": 1.0, "step": 1690 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.375, "epoch": 0.023323816223224506, "grad_norm": 1.9070433118527552, "kl": 0.0712890625, "learning_rate": 9.98658333259494e-07, "loss": 0.0029, "reward": 2.1456093788146973, "reward_std": 0.004530326928943396, "rewards/accuracy_reward": 0.9456093311309814, "rewards/format_reward": 1.0, "step": 1691 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.28125, "epoch": 0.023337609136425703, "grad_norm": 1.5934918988257967, "kl": 0.072265625, "learning_rate": 9.986567466690562e-07, "loss": 0.0029, "reward": 2.1080312728881836, "reward_std": 0.022230740636587143, "rewards/accuracy_reward": 0.9142812490463257, "rewards/format_reward": 1.0, "step": 1692 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.03125, "epoch": 0.0233514020496269, "grad_norm": 3.5332894745457737, "kl": 0.068359375, "learning_rate": 9.986551591423216e-07, "loss": 0.0027, "reward": 1.9298124313354492, "reward_std": 0.007952363230288029, "rewards/accuracy_reward": 0.729812502861023, "rewards/format_reward": 1.0, "step": 1693 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.5625, "epoch": 0.023365194962828097, "grad_norm": 2.6340685227274636, "kl": 0.078125, "learning_rate": 9.986535706792935e-07, "loss": 0.0031, "reward": 2.132171869277954, "reward_std": 0.008069278672337532, "rewards/accuracy_reward": 0.9321718215942383, "rewards/format_reward": 1.0, "step": 1694 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.65625, "epoch": 0.023378987876029295, "grad_norm": 2.8492318261380065, "kl": 0.0791015625, "learning_rate": 9.986519812799747e-07, "loss": 0.0032, "reward": 1.9759130477905273, "reward_std": 0.009217929095029831, "rewards/accuracy_reward": 0.7759132385253906, "rewards/format_reward": 1.0, "step": 1695 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.25, "epoch": 0.023392780789230492, "grad_norm": 5.846606065456442, "kl": 0.0751953125, "learning_rate": 9.986503909443681e-07, "loss": 0.003, "reward": 2.1065688133239746, "reward_std": 0.01671210117638111, "rewards/accuracy_reward": 0.9065687656402588, "rewards/format_reward": 1.0, "step": 1696 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.78125, "epoch": 0.02340657370243169, "grad_norm": 2.4142969651961774, "kl": 0.072265625, "learning_rate": 9.98648799672477e-07, "loss": 0.0029, "reward": 1.979062557220459, "reward_std": 0.005396238062530756, "rewards/accuracy_reward": 0.7790625095367432, "rewards/format_reward": 1.0, "step": 1697 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.15625, "epoch": 0.023420366615632886, "grad_norm": 2.5854546446988143, "kl": 0.068359375, "learning_rate": 9.986472074643042e-07, "loss": 0.0027, "reward": 2.093031167984009, "reward_std": 0.007706265430897474, "rewards/accuracy_reward": 0.8930312395095825, "rewards/format_reward": 1.0, "step": 1698 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.21875, "epoch": 0.023434159528834084, "grad_norm": 1.886258510438861, "kl": 0.068359375, "learning_rate": 9.986456143198527e-07, "loss": 0.0027, "reward": 2.100281238555908, "reward_std": 0.004760038107633591, "rewards/accuracy_reward": 0.9002811908721924, "rewards/format_reward": 1.0, "step": 1699 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.28125, "epoch": 0.02344795244203528, "grad_norm": 2.822688158817216, "kl": 0.08056640625, "learning_rate": 9.986440202391255e-07, "loss": 0.0032, "reward": 2.13421893119812, "reward_std": 0.010491998866200447, "rewards/accuracy_reward": 0.9342187643051147, "rewards/format_reward": 1.0, "step": 1700 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.875, "epoch": 0.023461745355236478, "grad_norm": 2.729248283060408, "kl": 0.076171875, "learning_rate": 9.986424252221254e-07, "loss": 0.003, "reward": 2.0096664428710938, "reward_std": 0.012444496154785156, "rewards/accuracy_reward": 0.8096666932106018, "rewards/format_reward": 1.0, "step": 1701 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.96875, "epoch": 0.023475538268437676, "grad_norm": 3.1092368401185713, "kl": 0.07861328125, "learning_rate": 9.98640829268856e-07, "loss": 0.0031, "reward": 2.069103240966797, "reward_std": 0.009965509176254272, "rewards/accuracy_reward": 0.8691031336784363, "rewards/format_reward": 1.0, "step": 1702 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.71875, "epoch": 0.023489331181638873, "grad_norm": 6.463143728540664, "kl": 0.0830078125, "learning_rate": 9.986392323793194e-07, "loss": 0.0033, "reward": 2.108466625213623, "reward_std": 0.014811607077717781, "rewards/accuracy_reward": 0.9084665775299072, "rewards/format_reward": 1.0, "step": 1703 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.21875, "epoch": 0.02350312409484007, "grad_norm": 4.436697105443859, "kl": 0.0693359375, "learning_rate": 9.986376345535193e-07, "loss": 0.0028, "reward": 2.0995874404907227, "reward_std": 0.01052086241543293, "rewards/accuracy_reward": 0.8995875120162964, "rewards/format_reward": 1.0, "step": 1704 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.34375, "epoch": 0.023516917008041267, "grad_norm": 4.539649593034272, "kl": 0.078125, "learning_rate": 9.986360357914585e-07, "loss": 0.0031, "reward": 2.1238560676574707, "reward_std": 0.009398294612765312, "rewards/accuracy_reward": 0.923856258392334, "rewards/format_reward": 1.0, "step": 1705 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.5625, "epoch": 0.023530709921242465, "grad_norm": 7.069869664039515, "kl": 0.0673828125, "learning_rate": 9.986344360931398e-07, "loss": 0.0027, "reward": 2.180812358856201, "reward_std": 0.00308728264644742, "rewards/accuracy_reward": 0.9808125495910645, "rewards/format_reward": 1.0, "step": 1706 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.023544502834443662, "grad_norm": 3.9522791206040804, "kl": 0.06787109375, "learning_rate": 9.986328354585665e-07, "loss": 0.0027, "reward": 2.066878080368042, "reward_std": 0.010748587548732758, "rewards/accuracy_reward": 0.8668781518936157, "rewards/format_reward": 1.0, "step": 1707 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.96875, "epoch": 0.02355829574764486, "grad_norm": 3.7781428036422646, "kl": 0.07958984375, "learning_rate": 9.986312338877415e-07, "loss": 0.0032, "reward": 2.0023488998413086, "reward_std": 0.011203627102077007, "rewards/accuracy_reward": 0.802348792552948, "rewards/format_reward": 1.0, "step": 1708 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.625, "epoch": 0.023572088660846056, "grad_norm": 3.4957114407424514, "kl": 0.072265625, "learning_rate": 9.98629631380668e-07, "loss": 0.0029, "reward": 2.0955874919891357, "reward_std": 0.010107596404850483, "rewards/accuracy_reward": 0.8955875039100647, "rewards/format_reward": 1.0, "step": 1709 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.1875, "epoch": 0.023585881574047254, "grad_norm": 1.9033069682896537, "kl": 0.0732421875, "learning_rate": 9.986280279373484e-07, "loss": 0.0029, "reward": 2.0947813987731934, "reward_std": 0.004276146646589041, "rewards/accuracy_reward": 0.894781231880188, "rewards/format_reward": 1.0, "step": 1710 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 379.3125, "epoch": 0.02359967448724845, "grad_norm": 5.867222020775312, "kl": 0.07421875, "learning_rate": 9.986264235577863e-07, "loss": 0.003, "reward": 1.8767969608306885, "reward_std": 0.028966933488845825, "rewards/accuracy_reward": 0.6830468773841858, "rewards/format_reward": 1.0, "step": 1711 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.53125, "epoch": 0.023613467400449648, "grad_norm": 2.731920737907278, "kl": 0.0673828125, "learning_rate": 9.986248182419845e-07, "loss": 0.0027, "reward": 2.0266876220703125, "reward_std": 0.006303221918642521, "rewards/accuracy_reward": 0.8266874551773071, "rewards/format_reward": 1.0, "step": 1712 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.3125, "epoch": 0.023627260313650846, "grad_norm": 2.626970184207237, "kl": 0.0771484375, "learning_rate": 9.986232119899461e-07, "loss": 0.0031, "reward": 2.134718894958496, "reward_std": 0.011585233733057976, "rewards/accuracy_reward": 0.9347187280654907, "rewards/format_reward": 1.0, "step": 1713 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.375, "epoch": 0.023641053226852043, "grad_norm": 2.552727687959991, "kl": 0.0732421875, "learning_rate": 9.986216048016742e-07, "loss": 0.0029, "reward": 1.9184062480926514, "reward_std": 0.021414421498775482, "rewards/accuracy_reward": 0.7184062004089355, "rewards/format_reward": 1.0, "step": 1714 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.28125, "epoch": 0.02365484614005324, "grad_norm": 13.777134298484489, "kl": 0.07177734375, "learning_rate": 9.986199966771713e-07, "loss": 0.0029, "reward": 2.0880000591278076, "reward_std": 0.01766117289662361, "rewards/accuracy_reward": 0.8880000114440918, "rewards/format_reward": 1.0, "step": 1715 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.25, "epoch": 0.023668639053254437, "grad_norm": 4.4655075172256655, "kl": 0.0693359375, "learning_rate": 9.98618387616441e-07, "loss": 0.0028, "reward": 2.0521562099456787, "reward_std": 0.006293314974755049, "rewards/accuracy_reward": 0.8521562814712524, "rewards/format_reward": 1.0, "step": 1716 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.03125, "epoch": 0.023682431966455635, "grad_norm": 2.6718997760957324, "kl": 0.08251953125, "learning_rate": 9.986167776194863e-07, "loss": 0.0033, "reward": 1.9139530658721924, "reward_std": 0.020417839288711548, "rewards/accuracy_reward": 0.7139531373977661, "rewards/format_reward": 1.0, "step": 1717 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.21875, "epoch": 0.023696224879656832, "grad_norm": 2.4677430153934004, "kl": 0.0703125, "learning_rate": 9.986151666863097e-07, "loss": 0.0028, "reward": 2.0550622940063477, "reward_std": 0.007356978952884674, "rewards/accuracy_reward": 0.8550626039505005, "rewards/format_reward": 1.0, "step": 1718 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.875, "epoch": 0.02371001779285803, "grad_norm": 2.1939899815669426, "kl": 0.068359375, "learning_rate": 9.986135548169149e-07, "loss": 0.0027, "reward": 2.0898125171661377, "reward_std": 0.007136219646781683, "rewards/accuracy_reward": 0.8898124694824219, "rewards/format_reward": 1.0, "step": 1719 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.40625, "epoch": 0.023723810706059226, "grad_norm": 2.515741061054989, "kl": 0.0771484375, "learning_rate": 9.986119420113043e-07, "loss": 0.0031, "reward": 2.101253032684326, "reward_std": 0.017434094101190567, "rewards/accuracy_reward": 0.9012531638145447, "rewards/format_reward": 1.0, "step": 1720 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.46875, "epoch": 0.023737603619260424, "grad_norm": 1.600265154399046, "kl": 0.060791015625, "learning_rate": 9.986103282694812e-07, "loss": 0.0024, "reward": 2.181375026702881, "reward_std": 0.00671817222610116, "rewards/accuracy_reward": 0.9813750386238098, "rewards/format_reward": 1.0, "step": 1721 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.9375, "epoch": 0.02375139653246162, "grad_norm": 3.0599808020254464, "kl": 0.076171875, "learning_rate": 9.986087135914488e-07, "loss": 0.003, "reward": 2.093647003173828, "reward_std": 0.017799291759729385, "rewards/accuracy_reward": 0.8936468362808228, "rewards/format_reward": 1.0, "step": 1722 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.125, "epoch": 0.023765189445662818, "grad_norm": 3.319295020392552, "kl": 0.0712890625, "learning_rate": 9.9860709797721e-07, "loss": 0.0029, "reward": 2.127687692642212, "reward_std": 0.02012709528207779, "rewards/accuracy_reward": 0.9276874661445618, "rewards/format_reward": 1.0, "step": 1723 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.78125, "epoch": 0.023778982358864015, "grad_norm": 2.7782832942336793, "kl": 0.07470703125, "learning_rate": 9.986054814267677e-07, "loss": 0.003, "reward": 2.1031250953674316, "reward_std": 0.02038407325744629, "rewards/accuracy_reward": 0.9031249284744263, "rewards/format_reward": 1.0, "step": 1724 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.5, "epoch": 0.023792775272065213, "grad_norm": 12.38354490808643, "kl": 0.0712890625, "learning_rate": 9.98603863940125e-07, "loss": 0.0029, "reward": 2.1373751163482666, "reward_std": 0.015513171441853046, "rewards/accuracy_reward": 0.937375009059906, "rewards/format_reward": 1.0, "step": 1725 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.02380656818526641, "grad_norm": 4.456827369299883, "kl": 0.078125, "learning_rate": 9.986022455172852e-07, "loss": 0.0031, "reward": 2.1432812213897705, "reward_std": 0.015676815062761307, "rewards/accuracy_reward": 0.9432812333106995, "rewards/format_reward": 1.0, "step": 1726 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.4375, "epoch": 0.023820361098467607, "grad_norm": 16.67054442452413, "kl": 0.0712890625, "learning_rate": 9.98600626158251e-07, "loss": 0.0029, "reward": 2.1101250648498535, "reward_std": 0.04009168967604637, "rewards/accuracy_reward": 0.9163749814033508, "rewards/format_reward": 1.0, "step": 1727 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.3125, "epoch": 0.023834154011668805, "grad_norm": 3.969631658516179, "kl": 0.0703125, "learning_rate": 9.985990058630255e-07, "loss": 0.0028, "reward": 2.138625144958496, "reward_std": 0.009320787154138088, "rewards/accuracy_reward": 0.9386250376701355, "rewards/format_reward": 1.0, "step": 1728 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.53125, "epoch": 0.023847946924870002, "grad_norm": 1.8686466282663734, "kl": 0.0732421875, "learning_rate": 9.985973846316118e-07, "loss": 0.0028, "reward": 2.099249839782715, "reward_std": 0.0196660328656435, "rewards/accuracy_reward": 0.9054999351501465, "rewards/format_reward": 1.0, "step": 1729 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 399.0, "epoch": 0.0238617398380712, "grad_norm": 4.4299319900147305, "kl": 0.07080078125, "learning_rate": 9.98595762464013e-07, "loss": 0.0028, "reward": 1.9225938320159912, "reward_std": 0.01270025409758091, "rewards/accuracy_reward": 0.7225937843322754, "rewards/format_reward": 1.0, "step": 1730 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.71875, "epoch": 0.023875532751272396, "grad_norm": 2.2146126289941153, "kl": 0.06982421875, "learning_rate": 9.985941393602324e-07, "loss": 0.0028, "reward": 2.0733437538146973, "reward_std": 0.016932286322116852, "rewards/accuracy_reward": 0.8733437657356262, "rewards/format_reward": 1.0, "step": 1731 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.023889325664473594, "grad_norm": 1.861701969324973, "kl": 0.06884765625, "learning_rate": 9.985925153202724e-07, "loss": 0.0028, "reward": 2.149656295776367, "reward_std": 0.008015827275812626, "rewards/accuracy_reward": 0.9496562480926514, "rewards/format_reward": 1.0, "step": 1732 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.4375, "epoch": 0.02390311857767479, "grad_norm": 1.4679235471717649, "kl": 0.0732421875, "learning_rate": 9.985908903441364e-07, "loss": 0.0029, "reward": 2.163968563079834, "reward_std": 0.019429108127951622, "rewards/accuracy_reward": 0.9702187180519104, "rewards/format_reward": 1.0, "step": 1733 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 425.46875, "epoch": 0.023916911490875988, "grad_norm": 5.2470395522560604, "kl": 0.07177734375, "learning_rate": 9.985892644318274e-07, "loss": 0.0029, "reward": 2.0458436012268066, "reward_std": 0.0353349968791008, "rewards/accuracy_reward": 0.8520936965942383, "rewards/format_reward": 1.0, "step": 1734 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.46875, "epoch": 0.023930704404077185, "grad_norm": 2.3126466981488973, "kl": 0.0732421875, "learning_rate": 9.985876375833486e-07, "loss": 0.0029, "reward": 2.09793758392334, "reward_std": 0.013394429348409176, "rewards/accuracy_reward": 0.8979374766349792, "rewards/format_reward": 1.0, "step": 1735 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.78125, "epoch": 0.023944497317278383, "grad_norm": 4.51834999228688, "kl": 0.06494140625, "learning_rate": 9.985860097987032e-07, "loss": 0.0026, "reward": 2.112375020980835, "reward_std": 0.03277178108692169, "rewards/accuracy_reward": 0.918624997138977, "rewards/format_reward": 1.0, "step": 1736 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.75, "epoch": 0.02395829023047958, "grad_norm": 2.5137681586952474, "kl": 0.0771484375, "learning_rate": 9.985843810778936e-07, "loss": 0.0031, "reward": 2.096968650817871, "reward_std": 0.016660798341035843, "rewards/accuracy_reward": 0.8969687223434448, "rewards/format_reward": 1.0, "step": 1737 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.53125, "epoch": 0.023972083143680777, "grad_norm": 4.453886849240466, "kl": 0.06640625, "learning_rate": 9.985827514209236e-07, "loss": 0.0026, "reward": 2.0604376792907715, "reward_std": 0.007918298244476318, "rewards/accuracy_reward": 0.8604375123977661, "rewards/format_reward": 1.0, "step": 1738 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.03125, "epoch": 0.023985876056881975, "grad_norm": 2.4819342067203856, "kl": 0.07666015625, "learning_rate": 9.985811208277957e-07, "loss": 0.0031, "reward": 1.9915937185287476, "reward_std": 0.00937240943312645, "rewards/accuracy_reward": 0.7915937900543213, "rewards/format_reward": 1.0, "step": 1739 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.03125, "epoch": 0.023999668970083172, "grad_norm": 3.6585757122772673, "kl": 0.068359375, "learning_rate": 9.985794892985131e-07, "loss": 0.0027, "reward": 2.1114063262939453, "reward_std": 0.031646594405174255, "rewards/accuracy_reward": 0.9114062786102295, "rewards/format_reward": 1.0, "step": 1740 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.0, "epoch": 0.02401346188328437, "grad_norm": 2.5450651610331296, "kl": 0.08447265625, "learning_rate": 9.985778568330791e-07, "loss": 0.0034, "reward": 2.0655624866485596, "reward_std": 0.04809194430708885, "rewards/accuracy_reward": 0.8780624866485596, "rewards/format_reward": 1.0, "step": 1741 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 425.09375, "epoch": 0.024027254796485566, "grad_norm": 2.530902592900987, "kl": 0.07275390625, "learning_rate": 9.985762234314967e-07, "loss": 0.0029, "reward": 1.9913125038146973, "reward_std": 0.011736673302948475, "rewards/accuracy_reward": 0.7913125157356262, "rewards/format_reward": 1.0, "step": 1742 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.4375, "epoch": 0.024041047709686764, "grad_norm": 2.6140164331444797, "kl": 0.0810546875, "learning_rate": 9.985745890937688e-07, "loss": 0.0033, "reward": 2.037656307220459, "reward_std": 0.0340898334980011, "rewards/accuracy_reward": 0.8501562476158142, "rewards/format_reward": 1.0, "step": 1743 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.65625, "epoch": 0.02405484062288796, "grad_norm": 1.7897864659100733, "kl": 0.07373046875, "learning_rate": 9.985729538198986e-07, "loss": 0.003, "reward": 2.1214065551757812, "reward_std": 0.02272145263850689, "rewards/accuracy_reward": 0.9276562333106995, "rewards/format_reward": 1.0, "step": 1744 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.90625, "epoch": 0.024068633536089158, "grad_norm": 2.446121041993616, "kl": 0.076171875, "learning_rate": 9.985713176098888e-07, "loss": 0.0031, "reward": 2.0507724285125732, "reward_std": 0.007350444328039885, "rewards/accuracy_reward": 0.8507723212242126, "rewards/format_reward": 1.0, "step": 1745 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.0625, "epoch": 0.024082426449290355, "grad_norm": 2.7705725535301227, "kl": 0.0673828125, "learning_rate": 9.98569680463743e-07, "loss": 0.0027, "reward": 2.0127811431884766, "reward_std": 0.03422151133418083, "rewards/accuracy_reward": 0.8190312385559082, "rewards/format_reward": 1.0, "step": 1746 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.0, "epoch": 0.024096219362491553, "grad_norm": 2.3872010424482477, "kl": 0.07470703125, "learning_rate": 9.985680423814642e-07, "loss": 0.003, "reward": 2.1339688301086426, "reward_std": 0.02170136198401451, "rewards/accuracy_reward": 0.9339687824249268, "rewards/format_reward": 1.0, "step": 1747 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.28125, "epoch": 0.02411001227569275, "grad_norm": 2.6316361218681465, "kl": 0.072265625, "learning_rate": 9.985664033630553e-07, "loss": 0.0029, "reward": 2.074937582015991, "reward_std": 0.029040640220046043, "rewards/accuracy_reward": 0.8811874985694885, "rewards/format_reward": 1.0, "step": 1748 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.90625, "epoch": 0.024123805188893947, "grad_norm": 3.405750619703905, "kl": 0.0712890625, "learning_rate": 9.985647634085194e-07, "loss": 0.0029, "reward": 2.0605313777923584, "reward_std": 0.03339112550020218, "rewards/accuracy_reward": 0.8667812943458557, "rewards/format_reward": 1.0, "step": 1749 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.03125, "epoch": 0.024137598102095145, "grad_norm": 2.91977031272727, "kl": 0.0751953125, "learning_rate": 9.985631225178598e-07, "loss": 0.003, "reward": 2.0946874618530273, "reward_std": 0.019197026267647743, "rewards/accuracy_reward": 0.8946875333786011, "rewards/format_reward": 1.0, "step": 1750 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.75, "epoch": 0.024151391015296342, "grad_norm": 3.604232767070265, "kl": 0.0810546875, "learning_rate": 9.985614806910792e-07, "loss": 0.0032, "reward": 2.076906204223633, "reward_std": 0.019419647753238678, "rewards/accuracy_reward": 0.8769062757492065, "rewards/format_reward": 1.0, "step": 1751 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.78125, "epoch": 0.02416518392849754, "grad_norm": 2.376437580329351, "kl": 0.0751953125, "learning_rate": 9.985598379281808e-07, "loss": 0.003, "reward": 2.0655312538146973, "reward_std": 0.01601465232670307, "rewards/accuracy_reward": 0.8655312061309814, "rewards/format_reward": 1.0, "step": 1752 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.28125, "epoch": 0.024178976841698736, "grad_norm": 4.116061292435113, "kl": 0.08251953125, "learning_rate": 9.985581942291675e-07, "loss": 0.0033, "reward": 2.052500009536743, "reward_std": 0.016271580010652542, "rewards/accuracy_reward": 0.8524999618530273, "rewards/format_reward": 1.0, "step": 1753 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.28125, "epoch": 0.024192769754899934, "grad_norm": 2.8459132192306806, "kl": 0.0888671875, "learning_rate": 9.985565495940431e-07, "loss": 0.0035, "reward": 2.0820937156677246, "reward_std": 0.025291379541158676, "rewards/accuracy_reward": 0.8820937275886536, "rewards/format_reward": 1.0, "step": 1754 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.375, "epoch": 0.02420656266810113, "grad_norm": 1.4439906845419863, "kl": 0.0693359375, "learning_rate": 9.985549040228101e-07, "loss": 0.0028, "reward": 2.022624969482422, "reward_std": 0.0030057914555072784, "rewards/accuracy_reward": 0.8226249814033508, "rewards/format_reward": 1.0, "step": 1755 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.46875, "epoch": 0.024220355581302328, "grad_norm": 2.435579747925683, "kl": 0.0849609375, "learning_rate": 9.985532575154717e-07, "loss": 0.0034, "reward": 2.1383438110351562, "reward_std": 0.018488572910428047, "rewards/accuracy_reward": 0.9383437633514404, "rewards/format_reward": 1.0, "step": 1756 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.125, "epoch": 0.024234148494503525, "grad_norm": 3.0120026475764483, "kl": 0.0771484375, "learning_rate": 9.98551610072031e-07, "loss": 0.0031, "reward": 2.035656213760376, "reward_std": 0.009117106907069683, "rewards/accuracy_reward": 0.8356562852859497, "rewards/format_reward": 1.0, "step": 1757 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.53125, "epoch": 0.024247941407704723, "grad_norm": 2.5538727615336247, "kl": 0.08203125, "learning_rate": 9.98549961692491e-07, "loss": 0.0033, "reward": 2.126584529876709, "reward_std": 0.0063272942788898945, "rewards/accuracy_reward": 0.9265843629837036, "rewards/format_reward": 1.0, "step": 1758 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.875, "epoch": 0.02426173432090592, "grad_norm": 8.37522776549557, "kl": 0.07177734375, "learning_rate": 9.98548312376855e-07, "loss": 0.0029, "reward": 2.1524062156677246, "reward_std": 0.010602997615933418, "rewards/accuracy_reward": 0.9524062275886536, "rewards/format_reward": 1.0, "step": 1759 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.125, "epoch": 0.024275527234107117, "grad_norm": 3.507332854838465, "kl": 0.08349609375, "learning_rate": 9.98546662125126e-07, "loss": 0.0033, "reward": 2.0226876735687256, "reward_std": 0.01573045551776886, "rewards/accuracy_reward": 0.8226875066757202, "rewards/format_reward": 1.0, "step": 1760 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.34375, "epoch": 0.024289320147308315, "grad_norm": 5.809003785345363, "kl": 0.08447265625, "learning_rate": 9.98545010937307e-07, "loss": 0.0034, "reward": 2.03515625, "reward_std": 0.019804697483778, "rewards/accuracy_reward": 0.835156261920929, "rewards/format_reward": 1.0, "step": 1761 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.78125, "epoch": 0.024303113060509512, "grad_norm": 3.0305832348923687, "kl": 0.0859375, "learning_rate": 9.98543358813401e-07, "loss": 0.0034, "reward": 2.052062511444092, "reward_std": 0.03612096980214119, "rewards/accuracy_reward": 0.8583125472068787, "rewards/format_reward": 1.0, "step": 1762 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 0.02431690597371071, "grad_norm": 3.5563079913533424, "kl": 0.08740234375, "learning_rate": 9.985417057534117e-07, "loss": 0.0035, "reward": 2.1191563606262207, "reward_std": 0.016575699672102928, "rewards/accuracy_reward": 0.9191562533378601, "rewards/format_reward": 1.0, "step": 1763 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.78125, "epoch": 0.024330698886911906, "grad_norm": 3.0645052581267507, "kl": 0.0888671875, "learning_rate": 9.985400517573417e-07, "loss": 0.0036, "reward": 2.044740676879883, "reward_std": 0.028551694005727768, "rewards/accuracy_reward": 0.8509905934333801, "rewards/format_reward": 1.0, "step": 1764 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.09375, "epoch": 0.024344491800113104, "grad_norm": 2.765856515806997, "kl": 0.08740234375, "learning_rate": 9.985383968251941e-07, "loss": 0.0035, "reward": 2.066312789916992, "reward_std": 0.01751692220568657, "rewards/accuracy_reward": 0.8663125038146973, "rewards/format_reward": 1.0, "step": 1765 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.375, "epoch": 0.024358284713314297, "grad_norm": 2.015853784114689, "kl": 0.08447265625, "learning_rate": 9.98536740956972e-07, "loss": 0.0034, "reward": 2.103749990463257, "reward_std": 0.013015479780733585, "rewards/accuracy_reward": 0.9037500023841858, "rewards/format_reward": 1.0, "step": 1766 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.0625, "epoch": 0.024372077626515495, "grad_norm": 3.3477428713056567, "kl": 0.087890625, "learning_rate": 9.985350841526788e-07, "loss": 0.0035, "reward": 2.0847811698913574, "reward_std": 0.023640643805265427, "rewards/accuracy_reward": 0.8847812414169312, "rewards/format_reward": 1.0, "step": 1767 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.75, "epoch": 0.024385870539716692, "grad_norm": 6.017892808909625, "kl": 0.08935546875, "learning_rate": 9.985334264123174e-07, "loss": 0.0036, "reward": 2.047187328338623, "reward_std": 0.014335026033222675, "rewards/accuracy_reward": 0.8471875190734863, "rewards/format_reward": 1.0, "step": 1768 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.8125, "epoch": 0.02439966345291789, "grad_norm": 2.698199071657015, "kl": 0.08349609375, "learning_rate": 9.985317677358908e-07, "loss": 0.0033, "reward": 2.010718822479248, "reward_std": 0.014230258762836456, "rewards/accuracy_reward": 0.8107187151908875, "rewards/format_reward": 1.0, "step": 1769 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.75, "epoch": 0.024413456366119086, "grad_norm": 2.304985842183476, "kl": 0.0751953125, "learning_rate": 9.985301081234025e-07, "loss": 0.003, "reward": 2.0739998817443848, "reward_std": 0.009021366946399212, "rewards/accuracy_reward": 0.8739999532699585, "rewards/format_reward": 1.0, "step": 1770 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.75, "epoch": 0.024427249279320284, "grad_norm": 2.616584686497504, "kl": 0.08056640625, "learning_rate": 9.985284475748552e-07, "loss": 0.0033, "reward": 2.149031162261963, "reward_std": 0.017312081530690193, "rewards/accuracy_reward": 0.9490312337875366, "rewards/format_reward": 1.0, "step": 1771 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.46875, "epoch": 0.02444104219252148, "grad_norm": 3.7454364850367723, "kl": 0.0791015625, "learning_rate": 9.985267860902522e-07, "loss": 0.0032, "reward": 2.0531249046325684, "reward_std": 0.012606385163962841, "rewards/accuracy_reward": 0.8531250357627869, "rewards/format_reward": 1.0, "step": 1772 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.21875, "epoch": 0.02445483510572268, "grad_norm": 2.1842734408082483, "kl": 0.08203125, "learning_rate": 9.985251236695967e-07, "loss": 0.0033, "reward": 2.0401875972747803, "reward_std": 0.02127690799534321, "rewards/accuracy_reward": 0.8464374542236328, "rewards/format_reward": 1.0, "step": 1773 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.34375, "epoch": 0.024468628018923876, "grad_norm": 6.173423938420589, "kl": 0.0791015625, "learning_rate": 9.985234603128917e-07, "loss": 0.0032, "reward": 2.0656564235687256, "reward_std": 0.018526801839470863, "rewards/accuracy_reward": 0.8656562566757202, "rewards/format_reward": 1.0, "step": 1774 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.4375, "epoch": 0.024482420932125073, "grad_norm": 2.7349810771321166, "kl": 0.08056640625, "learning_rate": 9.985217960201402e-07, "loss": 0.0032, "reward": 2.1108124256134033, "reward_std": 0.018724944442510605, "rewards/accuracy_reward": 0.910812497138977, "rewards/format_reward": 1.0, "step": 1775 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.5, "epoch": 0.02449621384532627, "grad_norm": 9.06553580363702, "kl": 0.0791015625, "learning_rate": 9.985201307913456e-07, "loss": 0.0031, "reward": 2.0666468143463135, "reward_std": 0.03265304118394852, "rewards/accuracy_reward": 0.8666468858718872, "rewards/format_reward": 1.0, "step": 1776 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.40625, "epoch": 0.024510006758527467, "grad_norm": 4.331300934590727, "kl": 0.07666015625, "learning_rate": 9.98518464626511e-07, "loss": 0.0031, "reward": 2.1221561431884766, "reward_std": 0.0323910228908062, "rewards/accuracy_reward": 0.9284062385559082, "rewards/format_reward": 1.0, "step": 1777 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.024523799671728665, "grad_norm": 2.4672398380474148, "kl": 0.078125, "learning_rate": 9.985167975256393e-07, "loss": 0.0031, "reward": 2.141625165939331, "reward_std": 0.014096378348767757, "rewards/accuracy_reward": 0.9416250586509705, "rewards/format_reward": 1.0, "step": 1778 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.6875, "epoch": 0.024537592584929862, "grad_norm": 34.324223710054575, "kl": 0.09765625, "learning_rate": 9.985151294887338e-07, "loss": 0.0039, "reward": 2.0224688053131104, "reward_std": 0.025731824338436127, "rewards/accuracy_reward": 0.8224687576293945, "rewards/format_reward": 1.0, "step": 1779 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.53125, "epoch": 0.02455138549813106, "grad_norm": 2.70846013690555, "kl": 0.083984375, "learning_rate": 9.985134605157975e-07, "loss": 0.0033, "reward": 2.0834689140319824, "reward_std": 0.016936348751187325, "rewards/accuracy_reward": 0.883468747138977, "rewards/format_reward": 1.0, "step": 1780 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.03125, "epoch": 0.024565178411332256, "grad_norm": 2.653429903702211, "kl": 0.080078125, "learning_rate": 9.985117906068337e-07, "loss": 0.0032, "reward": 2.087218761444092, "reward_std": 0.026929259300231934, "rewards/accuracy_reward": 0.8872188329696655, "rewards/format_reward": 1.0, "step": 1781 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.8125, "epoch": 0.024578971324533454, "grad_norm": 2.402392929045198, "kl": 0.08203125, "learning_rate": 9.985101197618456e-07, "loss": 0.0033, "reward": 1.9732187986373901, "reward_std": 0.02099413424730301, "rewards/accuracy_reward": 0.7732187509536743, "rewards/format_reward": 1.0, "step": 1782 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.25, "epoch": 0.02459276423773465, "grad_norm": 3.2217360796310057, "kl": 0.0771484375, "learning_rate": 9.98508447980836e-07, "loss": 0.0031, "reward": 2.0893125534057617, "reward_std": 0.01791352406144142, "rewards/accuracy_reward": 0.8893125057220459, "rewards/format_reward": 1.0, "step": 1783 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.0625, "epoch": 0.02460655715093585, "grad_norm": 2.683709335052171, "kl": 0.078125, "learning_rate": 9.985067752638084e-07, "loss": 0.0031, "reward": 2.1088438034057617, "reward_std": 0.021613825112581253, "rewards/accuracy_reward": 0.9088437557220459, "rewards/format_reward": 1.0, "step": 1784 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.75, "epoch": 0.024620350064137046, "grad_norm": 2.769526538229454, "kl": 0.08837890625, "learning_rate": 9.985051016107656e-07, "loss": 0.0035, "reward": 2.144731283187866, "reward_std": 0.007910585030913353, "rewards/accuracy_reward": 0.9447312355041504, "rewards/format_reward": 1.0, "step": 1785 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.46875, "epoch": 0.024634142977338243, "grad_norm": 1.985566695953307, "kl": 0.0830078125, "learning_rate": 9.985034270217112e-07, "loss": 0.0033, "reward": 2.150343656539917, "reward_std": 0.017256196588277817, "rewards/accuracy_reward": 0.9503437280654907, "rewards/format_reward": 1.0, "step": 1786 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.02464793589053944, "grad_norm": 2.725957722445619, "kl": 0.0849609375, "learning_rate": 9.985017514966478e-07, "loss": 0.0034, "reward": 2.0619373321533203, "reward_std": 0.022585805505514145, "rewards/accuracy_reward": 0.8619375228881836, "rewards/format_reward": 1.0, "step": 1787 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.34375, "epoch": 0.024661728803740637, "grad_norm": 2.725238061648846, "kl": 0.078125, "learning_rate": 9.98500075035579e-07, "loss": 0.0031, "reward": 2.0465939044952393, "reward_std": 0.026005594059824944, "rewards/accuracy_reward": 0.8528437614440918, "rewards/format_reward": 1.0, "step": 1788 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 0.024675521716941835, "grad_norm": 4.823457412712981, "kl": 0.0830078125, "learning_rate": 9.984983976385077e-07, "loss": 0.0033, "reward": 2.0460939407348633, "reward_std": 0.08084728568792343, "rewards/accuracy_reward": 0.8460937738418579, "rewards/format_reward": 1.0, "step": 1789 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.09375, "epoch": 0.024689314630143032, "grad_norm": 4.581116446149254, "kl": 0.07470703125, "learning_rate": 9.98496719305437e-07, "loss": 0.003, "reward": 2.1031155586242676, "reward_std": 0.03143152594566345, "rewards/accuracy_reward": 0.9093655943870544, "rewards/format_reward": 1.0, "step": 1790 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.96875, "epoch": 0.02470310754334423, "grad_norm": 2.876962599508138, "kl": 0.087890625, "learning_rate": 9.984950400363704e-07, "loss": 0.0035, "reward": 2.1196563243865967, "reward_std": 0.012997458688914776, "rewards/accuracy_reward": 0.9196562767028809, "rewards/format_reward": 1.0, "step": 1791 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.84375, "epoch": 0.024716900456545426, "grad_norm": 2.2778762938618273, "kl": 0.0751953125, "learning_rate": 9.984933598313105e-07, "loss": 0.003, "reward": 2.1229686737060547, "reward_std": 0.013324317522346973, "rewards/accuracy_reward": 0.9229687452316284, "rewards/format_reward": 1.0, "step": 1792 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.024730693369746624, "grad_norm": 4.2082699145935, "kl": 0.078125, "learning_rate": 9.98491678690261e-07, "loss": 0.0031, "reward": 2.080031394958496, "reward_std": 0.01834378018975258, "rewards/accuracy_reward": 0.8800312876701355, "rewards/format_reward": 1.0, "step": 1793 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 396.34375, "epoch": 0.02474448628294782, "grad_norm": 2.2298200484881514, "kl": 0.08837890625, "learning_rate": 9.984899966132247e-07, "loss": 0.0035, "reward": 2.0351874828338623, "reward_std": 0.03237765282392502, "rewards/accuracy_reward": 0.8414374589920044, "rewards/format_reward": 1.0, "step": 1794 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.09375, "epoch": 0.024758279196149018, "grad_norm": 4.421820727567943, "kl": 0.07421875, "learning_rate": 9.984883136002051e-07, "loss": 0.003, "reward": 2.133000135421753, "reward_std": 0.01489143818616867, "rewards/accuracy_reward": 0.9329999685287476, "rewards/format_reward": 1.0, "step": 1795 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.46875, "epoch": 0.024772072109350216, "grad_norm": 3.0742689286485527, "kl": 0.080078125, "learning_rate": 9.98486629651205e-07, "loss": 0.0032, "reward": 2.125718832015991, "reward_std": 0.014771716669201851, "rewards/accuracy_reward": 0.9257187247276306, "rewards/format_reward": 1.0, "step": 1796 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.09375, "epoch": 0.024785865022551413, "grad_norm": 4.08258152216409, "kl": 0.0693359375, "learning_rate": 9.984849447662278e-07, "loss": 0.0028, "reward": 2.0645313262939453, "reward_std": 0.029033970087766647, "rewards/accuracy_reward": 0.8645312786102295, "rewards/format_reward": 1.0, "step": 1797 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.53125, "epoch": 0.02479965793575261, "grad_norm": 2.077638826240048, "kl": 0.080078125, "learning_rate": 9.984832589452764e-07, "loss": 0.0032, "reward": 2.0334062576293945, "reward_std": 0.009484776295721531, "rewards/accuracy_reward": 0.8334062695503235, "rewards/format_reward": 1.0, "step": 1798 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.375, "epoch": 0.024813450848953807, "grad_norm": 3.024384104108754, "kl": 0.0771484375, "learning_rate": 9.984815721883542e-07, "loss": 0.0031, "reward": 2.0014376640319824, "reward_std": 0.022771289572119713, "rewards/accuracy_reward": 0.801437497138977, "rewards/format_reward": 1.0, "step": 1799 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.8125, "epoch": 0.024827243762155005, "grad_norm": 2.840072468474782, "kl": 0.0703125, "learning_rate": 9.984798844954642e-07, "loss": 0.0028, "reward": 2.10756254196167, "reward_std": 0.01857057586312294, "rewards/accuracy_reward": 0.9075624942779541, "rewards/format_reward": 1.0, "step": 1800 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.5625, "epoch": 0.024841036675356202, "grad_norm": 2.419564851903318, "kl": 0.080078125, "learning_rate": 9.9847819586661e-07, "loss": 0.0032, "reward": 2.0813751220703125, "reward_std": 0.02807619608938694, "rewards/accuracy_reward": 0.887624979019165, "rewards/format_reward": 1.0, "step": 1801 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.34375, "epoch": 0.0248548295885574, "grad_norm": 5.265284200214633, "kl": 0.072265625, "learning_rate": 9.984765063017942e-07, "loss": 0.0029, "reward": 2.071125030517578, "reward_std": 0.008159724995493889, "rewards/accuracy_reward": 0.8711249828338623, "rewards/format_reward": 1.0, "step": 1802 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 375.78125, "epoch": 0.024868622501758596, "grad_norm": 2.417742803497985, "kl": 0.078125, "learning_rate": 9.984748158010203e-07, "loss": 0.0031, "reward": 2.0822811126708984, "reward_std": 0.0606490820646286, "rewards/accuracy_reward": 0.9072812795639038, "rewards/format_reward": 1.0, "step": 1803 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 378.9375, "epoch": 0.024882415414959794, "grad_norm": 3.2660153386483612, "kl": 0.07470703125, "learning_rate": 9.984731243642913e-07, "loss": 0.003, "reward": 2.145656108856201, "reward_std": 0.0271601602435112, "rewards/accuracy_reward": 0.9456561803817749, "rewards/format_reward": 1.0, "step": 1804 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.75, "epoch": 0.02489620832816099, "grad_norm": 2.3149288294115666, "kl": 0.0751953125, "learning_rate": 9.984714319916105e-07, "loss": 0.003, "reward": 2.1265313625335693, "reward_std": 0.03469683229923248, "rewards/accuracy_reward": 0.9390312433242798, "rewards/format_reward": 1.0, "step": 1805 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.21875, "epoch": 0.024910001241362188, "grad_norm": 1.958748649606308, "kl": 0.072265625, "learning_rate": 9.98469738682981e-07, "loss": 0.0029, "reward": 2.1150312423706055, "reward_std": 0.019574537873268127, "rewards/accuracy_reward": 0.9212811589241028, "rewards/format_reward": 1.0, "step": 1806 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.40625, "epoch": 0.024923794154563386, "grad_norm": 3.432904245801903, "kl": 0.08203125, "learning_rate": 9.984680444384062e-07, "loss": 0.0033, "reward": 1.9821562767028809, "reward_std": 0.0369049608707428, "rewards/accuracy_reward": 0.788406252861023, "rewards/format_reward": 1.0, "step": 1807 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.71875, "epoch": 0.024937587067764583, "grad_norm": 2.953627641991977, "kl": 0.07421875, "learning_rate": 9.984663492578888e-07, "loss": 0.003, "reward": 2.057468891143799, "reward_std": 0.03296763077378273, "rewards/accuracy_reward": 0.8637187480926514, "rewards/format_reward": 1.0, "step": 1808 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.75, "epoch": 0.02495137998096578, "grad_norm": 2.482057072855879, "kl": 0.0732421875, "learning_rate": 9.984646531414325e-07, "loss": 0.0029, "reward": 1.9731563329696655, "reward_std": 0.00761070940643549, "rewards/accuracy_reward": 0.7731561660766602, "rewards/format_reward": 1.0, "step": 1809 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.4375, "epoch": 0.024965172894166977, "grad_norm": 2.1106669311753485, "kl": 0.07470703125, "learning_rate": 9.984629560890402e-07, "loss": 0.003, "reward": 2.0951249599456787, "reward_std": 0.013353114016354084, "rewards/accuracy_reward": 0.8951249718666077, "rewards/format_reward": 1.0, "step": 1810 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.0, "epoch": 0.024978965807368175, "grad_norm": 4.368961204311242, "kl": 0.07568359375, "learning_rate": 9.984612581007153e-07, "loss": 0.003, "reward": 2.0777499675750732, "reward_std": 0.02158227562904358, "rewards/accuracy_reward": 0.877750039100647, "rewards/format_reward": 1.0, "step": 1811 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.28125, "epoch": 0.024992758720569372, "grad_norm": 6.697641658711596, "kl": 0.0712890625, "learning_rate": 9.984595591764609e-07, "loss": 0.0029, "reward": 2.004000186920166, "reward_std": 0.03239420801401138, "rewards/accuracy_reward": 0.8164999485015869, "rewards/format_reward": 1.0, "step": 1812 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.9375, "epoch": 0.02500655163377057, "grad_norm": 4.610901858061098, "kl": 0.076171875, "learning_rate": 9.9845785931628e-07, "loss": 0.003, "reward": 2.079906463623047, "reward_std": 0.029418643563985825, "rewards/accuracy_reward": 0.8861563205718994, "rewards/format_reward": 1.0, "step": 1813 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.6875, "epoch": 0.025020344546971766, "grad_norm": 2.0085333775317196, "kl": 0.076171875, "learning_rate": 9.98456158520176e-07, "loss": 0.003, "reward": 2.151156425476074, "reward_std": 0.01612015813589096, "rewards/accuracy_reward": 0.9511562585830688, "rewards/format_reward": 1.0, "step": 1814 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.3125, "epoch": 0.025034137460172964, "grad_norm": 3.0711199374550273, "kl": 0.06982421875, "learning_rate": 9.984544567881517e-07, "loss": 0.0028, "reward": 2.0467188358306885, "reward_std": 0.028811011463403702, "rewards/accuracy_reward": 0.8467187881469727, "rewards/format_reward": 1.0, "step": 1815 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.34375, "epoch": 0.02504793037337416, "grad_norm": 2.169521221953622, "kl": 0.06787109375, "learning_rate": 9.98452754120211e-07, "loss": 0.0027, "reward": 1.9512187242507935, "reward_std": 0.014672722667455673, "rewards/accuracy_reward": 0.7512187957763672, "rewards/format_reward": 1.0, "step": 1816 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.34375, "epoch": 0.025061723286575358, "grad_norm": 3.755524547901593, "kl": 0.0751953125, "learning_rate": 9.984510505163565e-07, "loss": 0.003, "reward": 2.1437501907348633, "reward_std": 0.03614560514688492, "rewards/accuracy_reward": 0.9499999284744263, "rewards/format_reward": 1.0, "step": 1817 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.28125, "epoch": 0.025075516199776555, "grad_norm": 2.49891605222498, "kl": 0.08203125, "learning_rate": 9.984493459765917e-07, "loss": 0.0033, "reward": 2.1090939044952393, "reward_std": 0.01739683374762535, "rewards/accuracy_reward": 0.9090937376022339, "rewards/format_reward": 1.0, "step": 1818 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.59375, "epoch": 0.025089309112977753, "grad_norm": 2.5722757983522992, "kl": 0.068359375, "learning_rate": 9.984476405009197e-07, "loss": 0.0027, "reward": 2.1441564559936523, "reward_std": 0.03765767812728882, "rewards/accuracy_reward": 0.9504061937332153, "rewards/format_reward": 1.0, "step": 1819 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.46875, "epoch": 0.02510310202617895, "grad_norm": 1.66823560737798, "kl": 0.0712890625, "learning_rate": 9.984459340893436e-07, "loss": 0.0028, "reward": 2.1195626258850098, "reward_std": 0.022312071174383163, "rewards/accuracy_reward": 0.9258124828338623, "rewards/format_reward": 1.0, "step": 1820 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.65625, "epoch": 0.025116894939380147, "grad_norm": 2.2308678770960317, "kl": 0.0732421875, "learning_rate": 9.98444226741867e-07, "loss": 0.0029, "reward": 2.158874988555908, "reward_std": 0.015427597798407078, "rewards/accuracy_reward": 0.9588750004768372, "rewards/format_reward": 1.0, "step": 1821 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.5, "epoch": 0.025130687852581345, "grad_norm": 2.703205300148182, "kl": 0.07470703125, "learning_rate": 9.984425184584925e-07, "loss": 0.003, "reward": 2.1072187423706055, "reward_std": 0.014376457780599594, "rewards/accuracy_reward": 0.9072187542915344, "rewards/format_reward": 1.0, "step": 1822 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.21875, "epoch": 0.025144480765782542, "grad_norm": 3.481870794078983, "kl": 0.06884765625, "learning_rate": 9.984408092392239e-07, "loss": 0.0028, "reward": 2.1358749866485596, "reward_std": 0.0331958569586277, "rewards/accuracy_reward": 0.9358750581741333, "rewards/format_reward": 1.0, "step": 1823 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.8125, "epoch": 0.02515827367898374, "grad_norm": 2.5048217724213244, "kl": 0.0830078125, "learning_rate": 9.984390990840639e-07, "loss": 0.0033, "reward": 2.1587812900543213, "reward_std": 0.01900339126586914, "rewards/accuracy_reward": 0.9587812423706055, "rewards/format_reward": 1.0, "step": 1824 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.84375, "epoch": 0.025172066592184936, "grad_norm": 1.9915057057346952, "kl": 0.06982421875, "learning_rate": 9.984373879930161e-07, "loss": 0.0028, "reward": 2.1549999713897705, "reward_std": 0.015268120914697647, "rewards/accuracy_reward": 0.9550000429153442, "rewards/format_reward": 1.0, "step": 1825 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.03125, "epoch": 0.025185859505386134, "grad_norm": 3.2488064080797687, "kl": 0.0703125, "learning_rate": 9.984356759660836e-07, "loss": 0.0028, "reward": 2.0687813758850098, "reward_std": 0.016980387270450592, "rewards/accuracy_reward": 0.8687812685966492, "rewards/format_reward": 1.0, "step": 1826 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.53125, "epoch": 0.02519965241858733, "grad_norm": 4.68460593788176, "kl": 0.08349609375, "learning_rate": 9.984339630032694e-07, "loss": 0.0033, "reward": 2.090968608856201, "reward_std": 0.0176951065659523, "rewards/accuracy_reward": 0.8909687399864197, "rewards/format_reward": 1.0, "step": 1827 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.0, "epoch": 0.025213445331788528, "grad_norm": 2.2079619534532227, "kl": 0.076171875, "learning_rate": 9.98432249104577e-07, "loss": 0.003, "reward": 1.9787812232971191, "reward_std": 0.01229407824575901, "rewards/accuracy_reward": 0.7787812948226929, "rewards/format_reward": 1.0, "step": 1828 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.15625, "epoch": 0.025227238244989725, "grad_norm": 2.062163834178871, "kl": 0.06640625, "learning_rate": 9.984305342700096e-07, "loss": 0.0027, "reward": 2.051875114440918, "reward_std": 0.006034111138433218, "rewards/accuracy_reward": 0.8518750071525574, "rewards/format_reward": 1.0, "step": 1829 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.4375, "epoch": 0.025241031158190923, "grad_norm": 2.5534888133413096, "kl": 0.0751953125, "learning_rate": 9.984288184995705e-07, "loss": 0.003, "reward": 2.0749688148498535, "reward_std": 0.011115302331745625, "rewards/accuracy_reward": 0.8749687671661377, "rewards/format_reward": 1.0, "step": 1830 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.59375, "epoch": 0.02525482407139212, "grad_norm": 2.0625345609854295, "kl": 0.0849609375, "learning_rate": 9.984271017932624e-07, "loss": 0.0034, "reward": 2.1003124713897705, "reward_std": 0.010857428424060345, "rewards/accuracy_reward": 0.9003124833106995, "rewards/format_reward": 1.0, "step": 1831 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.625, "epoch": 0.025268616984593317, "grad_norm": 2.1961172659527617, "kl": 0.0810546875, "learning_rate": 9.98425384151089e-07, "loss": 0.0032, "reward": 2.1265311241149902, "reward_std": 0.01104404404759407, "rewards/accuracy_reward": 0.9265313148498535, "rewards/format_reward": 1.0, "step": 1832 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.4375, "epoch": 0.025282409897794515, "grad_norm": 2.757163489840819, "kl": 0.076171875, "learning_rate": 9.984236655730534e-07, "loss": 0.003, "reward": 2.0827717781066895, "reward_std": 0.021459490060806274, "rewards/accuracy_reward": 0.882771909236908, "rewards/format_reward": 1.0, "step": 1833 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.40625, "epoch": 0.025296202810995712, "grad_norm": 2.4049703952157393, "kl": 0.06884765625, "learning_rate": 9.98421946059159e-07, "loss": 0.0027, "reward": 2.0678749084472656, "reward_std": 0.023924771696329117, "rewards/accuracy_reward": 0.8678749799728394, "rewards/format_reward": 1.0, "step": 1834 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.875, "epoch": 0.02530999572419691, "grad_norm": 3.1736190750633786, "kl": 0.07275390625, "learning_rate": 9.984202256094085e-07, "loss": 0.0029, "reward": 2.141343832015991, "reward_std": 0.013485681265592575, "rewards/accuracy_reward": 0.9413437247276306, "rewards/format_reward": 1.0, "step": 1835 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.0, "epoch": 0.025323788637398106, "grad_norm": 2.889002064771838, "kl": 0.07470703125, "learning_rate": 9.984185042238058e-07, "loss": 0.003, "reward": 2.0661563873291016, "reward_std": 0.02142975851893425, "rewards/accuracy_reward": 0.8661562204360962, "rewards/format_reward": 1.0, "step": 1836 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.375, "epoch": 0.025337581550599304, "grad_norm": 2.4645181419460362, "kl": 0.0791015625, "learning_rate": 9.984167819023538e-07, "loss": 0.0032, "reward": 2.08774995803833, "reward_std": 0.011205369606614113, "rewards/accuracy_reward": 0.8877500295639038, "rewards/format_reward": 1.0, "step": 1837 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.0, "epoch": 0.0253513744638005, "grad_norm": 2.8926009107047204, "kl": 0.0703125, "learning_rate": 9.984150586450556e-07, "loss": 0.0028, "reward": 2.1521248817443848, "reward_std": 0.014543944038450718, "rewards/accuracy_reward": 0.952125072479248, "rewards/format_reward": 1.0, "step": 1838 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.9375, "epoch": 0.025365167377001698, "grad_norm": 3.813652772408831, "kl": 0.07763671875, "learning_rate": 9.984133344519147e-07, "loss": 0.0031, "reward": 2.0731563568115234, "reward_std": 0.017619047313928604, "rewards/accuracy_reward": 0.8731563091278076, "rewards/format_reward": 1.0, "step": 1839 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.5, "epoch": 0.025378960290202895, "grad_norm": 2.7147271815300904, "kl": 0.076171875, "learning_rate": 9.984116093229342e-07, "loss": 0.003, "reward": 2.1460626125335693, "reward_std": 0.021524153649806976, "rewards/accuracy_reward": 0.946062445640564, "rewards/format_reward": 1.0, "step": 1840 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.40625, "epoch": 0.02539275320340409, "grad_norm": 3.174118113098078, "kl": 0.0732421875, "learning_rate": 9.984098832581174e-07, "loss": 0.0029, "reward": 2.0907187461853027, "reward_std": 0.015461665578186512, "rewards/accuracy_reward": 0.8907187581062317, "rewards/format_reward": 1.0, "step": 1841 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.025406546116605287, "grad_norm": 3.598866702367068, "kl": 0.07861328125, "learning_rate": 9.984081562574673e-07, "loss": 0.0031, "reward": 2.096062660217285, "reward_std": 0.023558679968118668, "rewards/accuracy_reward": 0.8960624933242798, "rewards/format_reward": 1.0, "step": 1842 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.025420339029806484, "grad_norm": 2.0147814308743754, "kl": 0.07080078125, "learning_rate": 9.984064283209877e-07, "loss": 0.0028, "reward": 2.0739998817443848, "reward_std": 0.029729995876550674, "rewards/accuracy_reward": 0.8802499771118164, "rewards/format_reward": 1.0, "step": 1843 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.53125, "epoch": 0.02543413194300768, "grad_norm": 8.453329596243213, "kl": 0.0732421875, "learning_rate": 9.984046994486812e-07, "loss": 0.0029, "reward": 2.1384687423706055, "reward_std": 0.014828226529061794, "rewards/accuracy_reward": 0.9384686946868896, "rewards/format_reward": 1.0, "step": 1844 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 371.0625, "epoch": 0.02544792485620888, "grad_norm": 2.7529798438141717, "kl": 0.080078125, "learning_rate": 9.984029696405516e-07, "loss": 0.0032, "reward": 2.1325626373291016, "reward_std": 0.012229455634951591, "rewards/accuracy_reward": 0.932562530040741, "rewards/format_reward": 1.0, "step": 1845 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.65625, "epoch": 0.025461717769410076, "grad_norm": 2.1643419439161296, "kl": 0.07421875, "learning_rate": 9.984012388966016e-07, "loss": 0.003, "reward": 2.0810937881469727, "reward_std": 0.026225507259368896, "rewards/accuracy_reward": 0.8873437643051147, "rewards/format_reward": 1.0, "step": 1846 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.125, "epoch": 0.025475510682611273, "grad_norm": 3.835810762659808, "kl": 0.07763671875, "learning_rate": 9.98399507216835e-07, "loss": 0.0031, "reward": 2.127687454223633, "reward_std": 0.0294649638235569, "rewards/accuracy_reward": 0.9339374899864197, "rewards/format_reward": 1.0, "step": 1847 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.75, "epoch": 0.02548930359581247, "grad_norm": 3.8238763539846787, "kl": 0.08154296875, "learning_rate": 9.983977746012547e-07, "loss": 0.0033, "reward": 2.159843683242798, "reward_std": 0.03189468756318092, "rewards/accuracy_reward": 0.9660937786102295, "rewards/format_reward": 1.0, "step": 1848 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.84375, "epoch": 0.025503096509013667, "grad_norm": 2.6918177909269687, "kl": 0.0791015625, "learning_rate": 9.98396041049864e-07, "loss": 0.0032, "reward": 2.1157188415527344, "reward_std": 0.027348648756742477, "rewards/accuracy_reward": 0.9219688177108765, "rewards/format_reward": 1.0, "step": 1849 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 371.3125, "epoch": 0.025516889422214865, "grad_norm": 5.603041859939737, "kl": 0.07275390625, "learning_rate": 9.983943065626663e-07, "loss": 0.0029, "reward": 2.0533437728881836, "reward_std": 0.02138070948421955, "rewards/accuracy_reward": 0.8533437252044678, "rewards/format_reward": 1.0, "step": 1850 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.25, "epoch": 0.025530682335416062, "grad_norm": 1.9700967287976536, "kl": 0.07421875, "learning_rate": 9.983925711396646e-07, "loss": 0.003, "reward": 2.1324687004089355, "reward_std": 0.024304986000061035, "rewards/accuracy_reward": 0.9387187361717224, "rewards/format_reward": 1.0, "step": 1851 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.78125, "epoch": 0.02554447524861726, "grad_norm": 2.3963907426435225, "kl": 0.0849609375, "learning_rate": 9.983908347808626e-07, "loss": 0.0034, "reward": 2.0945937633514404, "reward_std": 0.02313227578997612, "rewards/accuracy_reward": 0.9008437395095825, "rewards/format_reward": 1.0, "step": 1852 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.25, "epoch": 0.025558268161818457, "grad_norm": 2.853813057104101, "kl": 0.0869140625, "learning_rate": 9.98389097486263e-07, "loss": 0.0035, "reward": 2.1250624656677246, "reward_std": 0.03674417734146118, "rewards/accuracy_reward": 0.9313124418258667, "rewards/format_reward": 1.0, "step": 1853 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.65625, "epoch": 0.025572061075019654, "grad_norm": 2.794679757154359, "kl": 0.076171875, "learning_rate": 9.983873592558694e-07, "loss": 0.0031, "reward": 2.022125244140625, "reward_std": 0.029511287808418274, "rewards/accuracy_reward": 0.828374981880188, "rewards/format_reward": 1.0, "step": 1854 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.46875, "epoch": 0.02558585398822085, "grad_norm": 2.1646527246673357, "kl": 0.07861328125, "learning_rate": 9.98385620089685e-07, "loss": 0.0031, "reward": 2.1636252403259277, "reward_std": 0.027691997587680817, "rewards/accuracy_reward": 0.9698750376701355, "rewards/format_reward": 1.0, "step": 1855 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.9375, "epoch": 0.02559964690142205, "grad_norm": 4.328330009642746, "kl": 0.0810546875, "learning_rate": 9.983838799877132e-07, "loss": 0.0032, "reward": 2.1452813148498535, "reward_std": 0.009981352835893631, "rewards/accuracy_reward": 0.9452812671661377, "rewards/format_reward": 1.0, "step": 1856 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.96875, "epoch": 0.025613439814623246, "grad_norm": 2.807801471900933, "kl": 0.08056640625, "learning_rate": 9.98382138949957e-07, "loss": 0.0032, "reward": 2.044468879699707, "reward_std": 0.030625659972429276, "rewards/accuracy_reward": 0.8507187366485596, "rewards/format_reward": 1.0, "step": 1857 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.03125, "epoch": 0.025627232727824443, "grad_norm": 2.664027744076353, "kl": 0.0849609375, "learning_rate": 9.983803969764199e-07, "loss": 0.0034, "reward": 2.067687511444092, "reward_std": 0.01151359174400568, "rewards/accuracy_reward": 0.8676875233650208, "rewards/format_reward": 1.0, "step": 1858 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.02564102564102564, "grad_norm": 2.185237919929018, "kl": 0.080078125, "learning_rate": 9.98378654067105e-07, "loss": 0.0032, "reward": 2.0583438873291016, "reward_std": 0.01906484179198742, "rewards/accuracy_reward": 0.8583437204360962, "rewards/format_reward": 1.0, "step": 1859 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.96875, "epoch": 0.025654818554226837, "grad_norm": 2.8348823693676866, "kl": 0.0751953125, "learning_rate": 9.983769102220157e-07, "loss": 0.003, "reward": 2.122781276702881, "reward_std": 0.013881700113415718, "rewards/accuracy_reward": 0.9227812886238098, "rewards/format_reward": 1.0, "step": 1860 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.875, "epoch": 0.025668611467428035, "grad_norm": 1.9281698057615821, "kl": 0.076171875, "learning_rate": 9.983751654411551e-07, "loss": 0.003, "reward": 2.167750120162964, "reward_std": 0.010406461544334888, "rewards/accuracy_reward": 0.9677499532699585, "rewards/format_reward": 1.0, "step": 1861 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.4375, "epoch": 0.025682404380629232, "grad_norm": 1.8748098234731816, "kl": 0.07861328125, "learning_rate": 9.983734197245267e-07, "loss": 0.0031, "reward": 2.0851564407348633, "reward_std": 0.029073134064674377, "rewards/accuracy_reward": 0.891406238079071, "rewards/format_reward": 1.0, "step": 1862 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.96875, "epoch": 0.02569619729383043, "grad_norm": 2.001995155868636, "kl": 0.07421875, "learning_rate": 9.983716730721336e-07, "loss": 0.003, "reward": 1.9856562614440918, "reward_std": 0.010835351422429085, "rewards/accuracy_reward": 0.7856562733650208, "rewards/format_reward": 1.0, "step": 1863 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.025709990207031626, "grad_norm": 6.9295445810275185, "kl": 0.07861328125, "learning_rate": 9.983699254839793e-07, "loss": 0.0031, "reward": 2.1544060707092285, "reward_std": 0.012011079117655754, "rewards/accuracy_reward": 0.9544063210487366, "rewards/format_reward": 1.0, "step": 1864 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.9375, "epoch": 0.025723783120232824, "grad_norm": 2.0486517003466322, "kl": 0.078125, "learning_rate": 9.983681769600668e-07, "loss": 0.0031, "reward": 2.121312379837036, "reward_std": 0.008215043693780899, "rewards/accuracy_reward": 0.9213125109672546, "rewards/format_reward": 1.0, "step": 1865 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.02573757603343402, "grad_norm": 2.794561678802557, "kl": 0.08447265625, "learning_rate": 9.983664275003997e-07, "loss": 0.0034, "reward": 2.0582499504089355, "reward_std": 0.016837604343891144, "rewards/accuracy_reward": 0.8582500219345093, "rewards/format_reward": 1.0, "step": 1866 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.65625, "epoch": 0.02575136894663522, "grad_norm": 3.0965585637611284, "kl": 0.078125, "learning_rate": 9.983646771049808e-07, "loss": 0.0031, "reward": 2.0903749465942383, "reward_std": 0.02898436412215233, "rewards/accuracy_reward": 0.8966250419616699, "rewards/format_reward": 1.0, "step": 1867 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 425.5, "epoch": 0.025765161859836416, "grad_norm": 2.431143881446561, "kl": 0.07958984375, "learning_rate": 9.983629257738139e-07, "loss": 0.0032, "reward": 2.1497812271118164, "reward_std": 0.008799711242318153, "rewards/accuracy_reward": 0.9497812390327454, "rewards/format_reward": 1.0, "step": 1868 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.375, "epoch": 0.025778954773037613, "grad_norm": 2.361456574092, "kl": 0.0869140625, "learning_rate": 9.98361173506902e-07, "loss": 0.0035, "reward": 2.0873751640319824, "reward_std": 0.013711605221033096, "rewards/accuracy_reward": 0.8873750567436218, "rewards/format_reward": 1.0, "step": 1869 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.34375, "epoch": 0.02579274768623881, "grad_norm": 12.117904620459306, "kl": 0.08935546875, "learning_rate": 9.983594203042485e-07, "loss": 0.0036, "reward": 2.124593734741211, "reward_std": 0.017907653003931046, "rewards/accuracy_reward": 0.9245937466621399, "rewards/format_reward": 1.0, "step": 1870 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.9375, "epoch": 0.025806540599440007, "grad_norm": 2.406914301836135, "kl": 0.072265625, "learning_rate": 9.983576661658567e-07, "loss": 0.0029, "reward": 2.1099376678466797, "reward_std": 0.016399363055825233, "rewards/accuracy_reward": 0.9099375009536743, "rewards/format_reward": 1.0, "step": 1871 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.21875, "epoch": 0.025820333512641205, "grad_norm": 2.542509352827263, "kl": 0.0771484375, "learning_rate": 9.983559110917297e-07, "loss": 0.0031, "reward": 2.1330313682556152, "reward_std": 0.026605045422911644, "rewards/accuracy_reward": 0.9392812252044678, "rewards/format_reward": 1.0, "step": 1872 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.375, "epoch": 0.025834126425842402, "grad_norm": 2.4547970890687894, "kl": 0.07958984375, "learning_rate": 9.983541550818711e-07, "loss": 0.0032, "reward": 2.084718704223633, "reward_std": 0.013663766905665398, "rewards/accuracy_reward": 0.8847187757492065, "rewards/format_reward": 1.0, "step": 1873 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.28125, "epoch": 0.0258479193390436, "grad_norm": 4.6467052152682635, "kl": 0.0810546875, "learning_rate": 9.98352398136284e-07, "loss": 0.0032, "reward": 2.175062656402588, "reward_std": 0.012637021951377392, "rewards/accuracy_reward": 0.9750624895095825, "rewards/format_reward": 1.0, "step": 1874 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.96875, "epoch": 0.025861712252244796, "grad_norm": 2.1704201810849226, "kl": 0.07177734375, "learning_rate": 9.983506402549719e-07, "loss": 0.0029, "reward": 2.114656448364258, "reward_std": 0.12084515392780304, "rewards/accuracy_reward": 0.9521563053131104, "rewards/format_reward": 0.96875, "step": 1875 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.40625, "epoch": 0.025875505165445994, "grad_norm": 3.0880592887395615, "kl": 0.076171875, "learning_rate": 9.983488814379376e-07, "loss": 0.0031, "reward": 2.118281364440918, "reward_std": 0.1004912480711937, "rewards/accuracy_reward": 0.9495313167572021, "rewards/format_reward": 0.96875, "step": 1876 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.65625, "epoch": 0.02588929807864719, "grad_norm": 2.2737975895682254, "kl": 0.0751953125, "learning_rate": 9.98347121685185e-07, "loss": 0.003, "reward": 2.0881876945495605, "reward_std": 0.0455729179084301, "rewards/accuracy_reward": 0.9006874561309814, "rewards/format_reward": 1.0, "step": 1877 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.0625, "epoch": 0.02590309099184839, "grad_norm": 2.3691075317193566, "kl": 0.083984375, "learning_rate": 9.983453609967173e-07, "loss": 0.0034, "reward": 2.094531297683716, "reward_std": 0.023566478863358498, "rewards/accuracy_reward": 0.9007812142372131, "rewards/format_reward": 1.0, "step": 1878 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.40625, "epoch": 0.025916883905049586, "grad_norm": 2.9056453960642004, "kl": 0.083984375, "learning_rate": 9.983435993725375e-07, "loss": 0.0034, "reward": 2.015625, "reward_std": 0.018177349120378494, "rewards/accuracy_reward": 0.8156250715255737, "rewards/format_reward": 1.0, "step": 1879 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 392.25, "epoch": 0.025930676818250783, "grad_norm": 6.836704224544842, "kl": 0.091796875, "learning_rate": 9.983418368126489e-07, "loss": 0.0037, "reward": 2.003187417984009, "reward_std": 0.1331649273633957, "rewards/accuracy_reward": 0.8469374179840088, "rewards/format_reward": 0.96875, "step": 1880 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.40625, "epoch": 0.02594446973145198, "grad_norm": 13.759491626563882, "kl": 0.07958984375, "learning_rate": 9.983400733170553e-07, "loss": 0.0032, "reward": 2.0055625438690186, "reward_std": 0.01725863665342331, "rewards/accuracy_reward": 0.8055624961853027, "rewards/format_reward": 1.0, "step": 1881 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.59375, "epoch": 0.025958262644653177, "grad_norm": 3.0019805753970417, "kl": 0.0810546875, "learning_rate": 9.983383088857594e-07, "loss": 0.0032, "reward": 2.1140313148498535, "reward_std": 0.017071839421987534, "rewards/accuracy_reward": 0.9140312671661377, "rewards/format_reward": 1.0, "step": 1882 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.125, "epoch": 0.025972055557854375, "grad_norm": 1.8277776417879426, "kl": 0.07421875, "learning_rate": 9.98336543518765e-07, "loss": 0.003, "reward": 2.1651248931884766, "reward_std": 0.009313899092376232, "rewards/accuracy_reward": 0.9651249647140503, "rewards/format_reward": 1.0, "step": 1883 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.375, "epoch": 0.025985848471055572, "grad_norm": 3.354512363534502, "kl": 0.08447265625, "learning_rate": 9.98334777216075e-07, "loss": 0.0034, "reward": 2.0831875801086426, "reward_std": 0.02759060077369213, "rewards/accuracy_reward": 0.8831875324249268, "rewards/format_reward": 1.0, "step": 1884 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.65625, "epoch": 0.02599964138425677, "grad_norm": 2.4912088620046022, "kl": 0.06396484375, "learning_rate": 9.983330099776929e-07, "loss": 0.0026, "reward": 2.1124062538146973, "reward_std": 0.02426830306649208, "rewards/accuracy_reward": 0.9124062657356262, "rewards/format_reward": 1.0, "step": 1885 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.875, "epoch": 0.026013434297457966, "grad_norm": 2.0002994483792147, "kl": 0.06787109375, "learning_rate": 9.983312418036225e-07, "loss": 0.0027, "reward": 2.1269376277923584, "reward_std": 0.008103366941213608, "rewards/accuracy_reward": 0.9269375801086426, "rewards/format_reward": 1.0, "step": 1886 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.375, "epoch": 0.026027227210659164, "grad_norm": 2.81766079112827, "kl": 0.0693359375, "learning_rate": 9.983294726938663e-07, "loss": 0.0028, "reward": 2.081249952316284, "reward_std": 0.012763720005750656, "rewards/accuracy_reward": 0.8812500238418579, "rewards/format_reward": 1.0, "step": 1887 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.25, "epoch": 0.02604102012386036, "grad_norm": 7.288633040377808, "kl": 0.08251953125, "learning_rate": 9.98327702648428e-07, "loss": 0.0033, "reward": 2.0316247940063477, "reward_std": 0.015839356929063797, "rewards/accuracy_reward": 0.8316250443458557, "rewards/format_reward": 1.0, "step": 1888 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.46875, "epoch": 0.02605481303706156, "grad_norm": 2.916198006875666, "kl": 0.06982421875, "learning_rate": 9.98325931667311e-07, "loss": 0.0028, "reward": 2.0199687480926514, "reward_std": 0.04930349811911583, "rewards/accuracy_reward": 0.8199687004089355, "rewards/format_reward": 1.0, "step": 1889 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.625, "epoch": 0.026068605950262756, "grad_norm": 3.220923308357919, "kl": 0.068359375, "learning_rate": 9.983241597505185e-07, "loss": 0.0027, "reward": 2.0854063034057617, "reward_std": 0.019993368536233902, "rewards/accuracy_reward": 0.8854062557220459, "rewards/format_reward": 1.0, "step": 1890 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.5, "epoch": 0.026082398863463953, "grad_norm": 2.0730445013968595, "kl": 0.06640625, "learning_rate": 9.98322386898054e-07, "loss": 0.0026, "reward": 2.062187671661377, "reward_std": 0.014803411439061165, "rewards/accuracy_reward": 0.8621875047683716, "rewards/format_reward": 1.0, "step": 1891 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.34375, "epoch": 0.02609619177666515, "grad_norm": 3.1754931276567464, "kl": 0.07568359375, "learning_rate": 9.983206131099205e-07, "loss": 0.003, "reward": 2.063093662261963, "reward_std": 0.015045160427689552, "rewards/accuracy_reward": 0.8630937933921814, "rewards/format_reward": 1.0, "step": 1892 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.1875, "epoch": 0.026109984689866347, "grad_norm": 3.3021426965852068, "kl": 0.0732421875, "learning_rate": 9.983188383861216e-07, "loss": 0.0029, "reward": 2.1875312328338623, "reward_std": 0.008857827633619308, "rewards/accuracy_reward": 0.987531304359436, "rewards/format_reward": 1.0, "step": 1893 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.1875, "epoch": 0.026123777603067545, "grad_norm": 4.970401986276893, "kl": 0.072265625, "learning_rate": 9.983170627266606e-07, "loss": 0.0029, "reward": 2.133812427520752, "reward_std": 0.004189165309071541, "rewards/accuracy_reward": 0.9338124990463257, "rewards/format_reward": 1.0, "step": 1894 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.5625, "epoch": 0.026137570516268742, "grad_norm": 3.3709887680311126, "kl": 0.07763671875, "learning_rate": 9.983152861315406e-07, "loss": 0.0031, "reward": 2.084937572479248, "reward_std": 0.021619021892547607, "rewards/accuracy_reward": 0.8849375247955322, "rewards/format_reward": 1.0, "step": 1895 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.875, "epoch": 0.02615136342946994, "grad_norm": 13.081294697888739, "kl": 0.0888671875, "learning_rate": 9.983135086007654e-07, "loss": 0.0035, "reward": 2.1125311851501465, "reward_std": 0.015462963841855526, "rewards/accuracy_reward": 0.9125312566757202, "rewards/format_reward": 1.0, "step": 1896 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.5625, "epoch": 0.026165156342671136, "grad_norm": 2.3875891853307722, "kl": 0.08154296875, "learning_rate": 9.98311730134338e-07, "loss": 0.0033, "reward": 2.1114063262939453, "reward_std": 0.0249087605625391, "rewards/accuracy_reward": 0.9176562428474426, "rewards/format_reward": 1.0, "step": 1897 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 370.0625, "epoch": 0.026178949255872334, "grad_norm": 2.028916751845437, "kl": 0.0732421875, "learning_rate": 9.983099507322618e-07, "loss": 0.0029, "reward": 2.0262811183929443, "reward_std": 0.054959043860435486, "rewards/accuracy_reward": 0.8450313210487366, "rewards/format_reward": 1.0, "step": 1898 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.46875, "epoch": 0.02619274216907353, "grad_norm": 3.6983934539591714, "kl": 0.07666015625, "learning_rate": 9.983081703945401e-07, "loss": 0.0031, "reward": 2.0888125896453857, "reward_std": 0.030467383563518524, "rewards/accuracy_reward": 0.8950624465942383, "rewards/format_reward": 1.0, "step": 1899 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.46875, "epoch": 0.026206535082274728, "grad_norm": 4.976831643531543, "kl": 0.07470703125, "learning_rate": 9.983063891211763e-07, "loss": 0.003, "reward": 2.088843822479248, "reward_std": 0.027769802138209343, "rewards/accuracy_reward": 0.8888437747955322, "rewards/format_reward": 1.0, "step": 1900 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.40625, "epoch": 0.026220327995475926, "grad_norm": 2.5997812010653827, "kl": 0.072265625, "learning_rate": 9.983046069121736e-07, "loss": 0.0029, "reward": 2.1284375190734863, "reward_std": 0.01613606885075569, "rewards/accuracy_reward": 0.9284374713897705, "rewards/format_reward": 1.0, "step": 1901 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.46875, "epoch": 0.026234120908677123, "grad_norm": 2.578028884006925, "kl": 0.07763671875, "learning_rate": 9.983028237675357e-07, "loss": 0.0031, "reward": 2.085906505584717, "reward_std": 0.03340889513492584, "rewards/accuracy_reward": 0.8921562433242798, "rewards/format_reward": 1.0, "step": 1902 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.65625, "epoch": 0.02624791382187832, "grad_norm": 4.561895732932041, "kl": 0.07666015625, "learning_rate": 9.983010396872657e-07, "loss": 0.0031, "reward": 2.1163125038146973, "reward_std": 0.04884226620197296, "rewards/accuracy_reward": 0.9288125038146973, "rewards/format_reward": 1.0, "step": 1903 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.28125, "epoch": 0.026261706735079517, "grad_norm": 4.270186995823431, "kl": 0.06494140625, "learning_rate": 9.982992546713669e-07, "loss": 0.0026, "reward": 2.1293437480926514, "reward_std": 0.022905055433511734, "rewards/accuracy_reward": 0.9293437600135803, "rewards/format_reward": 1.0, "step": 1904 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.59375, "epoch": 0.026275499648280715, "grad_norm": 5.4301956029983405, "kl": 0.06982421875, "learning_rate": 9.982974687198427e-07, "loss": 0.0028, "reward": 2.1040940284729004, "reward_std": 0.03425617143511772, "rewards/accuracy_reward": 0.9103437662124634, "rewards/format_reward": 1.0, "step": 1905 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.5625, "epoch": 0.026289292561481912, "grad_norm": 2.1826665071563474, "kl": 0.0732421875, "learning_rate": 9.982956818326966e-07, "loss": 0.0029, "reward": 2.1343750953674316, "reward_std": 0.013854065909981728, "rewards/accuracy_reward": 0.9343750476837158, "rewards/format_reward": 1.0, "step": 1906 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.75, "epoch": 0.02630308547468311, "grad_norm": 5.045341331330085, "kl": 0.0888671875, "learning_rate": 9.982938940099318e-07, "loss": 0.0036, "reward": 2.1560935974121094, "reward_std": 0.03527868166565895, "rewards/accuracy_reward": 0.962343692779541, "rewards/format_reward": 1.0, "step": 1907 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.03125, "epoch": 0.026316878387884306, "grad_norm": 2.585764075662836, "kl": 0.07763671875, "learning_rate": 9.982921052515515e-07, "loss": 0.0031, "reward": 2.1163125038146973, "reward_std": 0.01224626787006855, "rewards/accuracy_reward": 0.9163124561309814, "rewards/format_reward": 1.0, "step": 1908 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.5625, "epoch": 0.026330671301085504, "grad_norm": 4.009492226902157, "kl": 0.08203125, "learning_rate": 9.982903155575594e-07, "loss": 0.0033, "reward": 2.149656295776367, "reward_std": 0.02167775109410286, "rewards/accuracy_reward": 0.9496561884880066, "rewards/format_reward": 1.0, "step": 1909 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.8125, "epoch": 0.0263444642142867, "grad_norm": 4.192897607652903, "kl": 0.08251953125, "learning_rate": 9.982885249279588e-07, "loss": 0.0033, "reward": 2.1063437461853027, "reward_std": 0.038686685264110565, "rewards/accuracy_reward": 0.9125937819480896, "rewards/format_reward": 1.0, "step": 1910 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.4375, "epoch": 0.026358257127487898, "grad_norm": 1.8673974522114576, "kl": 0.0791015625, "learning_rate": 9.98286733362753e-07, "loss": 0.0032, "reward": 2.0999999046325684, "reward_std": 0.010693451389670372, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 1911 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.375, "epoch": 0.026372050040689096, "grad_norm": 1.6680172997767577, "kl": 0.076171875, "learning_rate": 9.982849408619452e-07, "loss": 0.0031, "reward": 2.171968936920166, "reward_std": 0.008573643863201141, "rewards/accuracy_reward": 0.9719687700271606, "rewards/format_reward": 1.0, "step": 1912 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.6875, "epoch": 0.026385842953890293, "grad_norm": 3.2316097982894387, "kl": 0.0908203125, "learning_rate": 9.982831474255392e-07, "loss": 0.0036, "reward": 2.0645623207092285, "reward_std": 0.04534504562616348, "rewards/accuracy_reward": 0.8770625591278076, "rewards/format_reward": 1.0, "step": 1913 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.4375, "epoch": 0.02639963586709149, "grad_norm": 3.4795134207215033, "kl": 0.08544921875, "learning_rate": 9.982813530535376e-07, "loss": 0.0034, "reward": 2.0827813148498535, "reward_std": 0.027300989255309105, "rewards/accuracy_reward": 0.8827812671661377, "rewards/format_reward": 1.0, "step": 1914 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.8125, "epoch": 0.026413428780292687, "grad_norm": 2.646759466219593, "kl": 0.087890625, "learning_rate": 9.982795577459445e-07, "loss": 0.0035, "reward": 2.0375313758850098, "reward_std": 0.014005608856678009, "rewards/accuracy_reward": 0.8375312685966492, "rewards/format_reward": 1.0, "step": 1915 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.375, "epoch": 0.02642722169349388, "grad_norm": 4.12686725948492, "kl": 0.08056640625, "learning_rate": 9.982777615027631e-07, "loss": 0.0032, "reward": 2.066093921661377, "reward_std": 0.020965585485100746, "rewards/accuracy_reward": 0.8660937547683716, "rewards/format_reward": 1.0, "step": 1916 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.375, "epoch": 0.02644101460669508, "grad_norm": 2.2022983580988065, "kl": 0.0693359375, "learning_rate": 9.982759643239965e-07, "loss": 0.0028, "reward": 2.045781135559082, "reward_std": 0.04319870471954346, "rewards/accuracy_reward": 0.8582812547683716, "rewards/format_reward": 1.0, "step": 1917 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.59375, "epoch": 0.026454807519896276, "grad_norm": 3.089738262417307, "kl": 0.078125, "learning_rate": 9.982741662096484e-07, "loss": 0.0031, "reward": 2.0027499198913574, "reward_std": 0.03057563677430153, "rewards/accuracy_reward": 0.8090000152587891, "rewards/format_reward": 1.0, "step": 1918 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.625, "epoch": 0.026468600433097473, "grad_norm": 1.5438678519468854, "kl": 0.0703125, "learning_rate": 9.98272367159722e-07, "loss": 0.0028, "reward": 2.1192188262939453, "reward_std": 0.005279357545077801, "rewards/accuracy_reward": 0.9192187786102295, "rewards/format_reward": 1.0, "step": 1919 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.0, "epoch": 0.02648239334629867, "grad_norm": 7.949626451142351, "kl": 0.0693359375, "learning_rate": 9.982705671742206e-07, "loss": 0.0028, "reward": 1.9359376430511475, "reward_std": 0.022330183535814285, "rewards/accuracy_reward": 0.7359374761581421, "rewards/format_reward": 1.0, "step": 1920 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.40625, "epoch": 0.026496186259499867, "grad_norm": 6.705376621560179, "kl": 0.07568359375, "learning_rate": 9.98268766253148e-07, "loss": 0.003, "reward": 2.0542500019073486, "reward_std": 0.04051809012889862, "rewards/accuracy_reward": 0.8667500615119934, "rewards/format_reward": 1.0, "step": 1921 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.71875, "epoch": 0.026509979172701065, "grad_norm": 2.781573787713504, "kl": 0.080078125, "learning_rate": 9.98266964396507e-07, "loss": 0.0032, "reward": 2.080312490463257, "reward_std": 0.020189763978123665, "rewards/accuracy_reward": 0.880312442779541, "rewards/format_reward": 1.0, "step": 1922 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.875, "epoch": 0.026523772085902262, "grad_norm": 3.9693455782903384, "kl": 0.07421875, "learning_rate": 9.982651616043012e-07, "loss": 0.003, "reward": 2.1007189750671387, "reward_std": 0.019428091123700142, "rewards/accuracy_reward": 0.9007187485694885, "rewards/format_reward": 1.0, "step": 1923 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.1875, "epoch": 0.02653756499910346, "grad_norm": 6.0958947140648325, "kl": 0.06982421875, "learning_rate": 9.982633578765343e-07, "loss": 0.0028, "reward": 2.0754687786102295, "reward_std": 0.02279149740934372, "rewards/accuracy_reward": 0.8754688501358032, "rewards/format_reward": 1.0, "step": 1924 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.9375, "epoch": 0.026551357912304657, "grad_norm": 2.7130420140475895, "kl": 0.07666015625, "learning_rate": 9.982615532132092e-07, "loss": 0.0031, "reward": 2.0829687118530273, "reward_std": 0.03561375290155411, "rewards/accuracy_reward": 0.889218807220459, "rewards/format_reward": 1.0, "step": 1925 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.8125, "epoch": 0.026565150825505854, "grad_norm": 2.733139048458819, "kl": 0.09228515625, "learning_rate": 9.982597476143297e-07, "loss": 0.0037, "reward": 2.0730624198913574, "reward_std": 0.013417109847068787, "rewards/accuracy_reward": 0.8730624318122864, "rewards/format_reward": 1.0, "step": 1926 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.34375, "epoch": 0.02657894373870705, "grad_norm": 5.9815436825189465, "kl": 0.078125, "learning_rate": 9.98257941079899e-07, "loss": 0.0031, "reward": 2.1597185134887695, "reward_std": 0.01997998356819153, "rewards/accuracy_reward": 0.9597187042236328, "rewards/format_reward": 1.0, "step": 1927 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.375, "epoch": 0.02659273665190825, "grad_norm": 3.226744660428045, "kl": 0.08251953125, "learning_rate": 9.982561336099203e-07, "loss": 0.0033, "reward": 2.098749876022339, "reward_std": 0.009890900924801826, "rewards/accuracy_reward": 0.8987500667572021, "rewards/format_reward": 1.0, "step": 1928 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.59375, "epoch": 0.026606529565109446, "grad_norm": 5.241779991096966, "kl": 0.08740234375, "learning_rate": 9.982543252043974e-07, "loss": 0.0035, "reward": 2.103968858718872, "reward_std": 0.016507325693964958, "rewards/accuracy_reward": 0.9039686918258667, "rewards/format_reward": 1.0, "step": 1929 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.03125, "epoch": 0.026620322478310643, "grad_norm": 1.595461150707015, "kl": 0.07421875, "learning_rate": 9.982525158633333e-07, "loss": 0.003, "reward": 2.1637187004089355, "reward_std": 0.008594900369644165, "rewards/accuracy_reward": 0.9637187719345093, "rewards/format_reward": 1.0, "step": 1930 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.34375, "epoch": 0.02663411539151184, "grad_norm": 3.701514816567279, "kl": 0.0791015625, "learning_rate": 9.982507055867317e-07, "loss": 0.0032, "reward": 2.0741562843322754, "reward_std": 0.021511593833565712, "rewards/accuracy_reward": 0.8741562366485596, "rewards/format_reward": 1.0, "step": 1931 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.026647908304713037, "grad_norm": 4.059367068286267, "kl": 0.08203125, "learning_rate": 9.98248894374596e-07, "loss": 0.0033, "reward": 2.122468948364258, "reward_std": 0.020731305703520775, "rewards/accuracy_reward": 0.9224687218666077, "rewards/format_reward": 1.0, "step": 1932 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.4375, "epoch": 0.026661701217914235, "grad_norm": 3.5167449646235784, "kl": 0.08251953125, "learning_rate": 9.982470822269294e-07, "loss": 0.0033, "reward": 2.0926249027252197, "reward_std": 0.01040612906217575, "rewards/accuracy_reward": 0.8926249742507935, "rewards/format_reward": 1.0, "step": 1933 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.4375, "epoch": 0.026675494131115432, "grad_norm": 5.159893282997334, "kl": 0.0732421875, "learning_rate": 9.982452691437354e-07, "loss": 0.0029, "reward": 2.0933749675750732, "reward_std": 0.01213439553976059, "rewards/accuracy_reward": 0.893375039100647, "rewards/format_reward": 1.0, "step": 1934 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.78125, "epoch": 0.02668928704431663, "grad_norm": 2.798177561141299, "kl": 0.07958984375, "learning_rate": 9.982434551250174e-07, "loss": 0.0032, "reward": 2.117500066757202, "reward_std": 0.016086172312498093, "rewards/accuracy_reward": 0.9175000190734863, "rewards/format_reward": 1.0, "step": 1935 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.15625, "epoch": 0.026703079957517827, "grad_norm": 5.82826113507385, "kl": 0.0810546875, "learning_rate": 9.982416401707787e-07, "loss": 0.0032, "reward": 2.0697813034057617, "reward_std": 0.016002286225557327, "rewards/accuracy_reward": 0.8697812557220459, "rewards/format_reward": 1.0, "step": 1936 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.026716872870719024, "grad_norm": 4.606238427406338, "kl": 0.0869140625, "learning_rate": 9.98239824281023e-07, "loss": 0.0035, "reward": 2.12250018119812, "reward_std": 0.023778196424245834, "rewards/accuracy_reward": 0.9225000143051147, "rewards/format_reward": 1.0, "step": 1937 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.03125, "epoch": 0.02673066578392022, "grad_norm": 3.3284601819839286, "kl": 0.07666015625, "learning_rate": 9.982380074557532e-07, "loss": 0.0031, "reward": 2.1208436489105225, "reward_std": 0.016521064564585686, "rewards/accuracy_reward": 0.9208437204360962, "rewards/format_reward": 1.0, "step": 1938 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.21875, "epoch": 0.02674445869712142, "grad_norm": 2.516936344554583, "kl": 0.07177734375, "learning_rate": 9.982361896949733e-07, "loss": 0.0029, "reward": 2.1640625, "reward_std": 0.02064230851829052, "rewards/accuracy_reward": 0.964062511920929, "rewards/format_reward": 1.0, "step": 1939 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.59375, "epoch": 0.026758251610322616, "grad_norm": 4.057452587358344, "kl": 0.08642578125, "learning_rate": 9.982343709986866e-07, "loss": 0.0035, "reward": 2.10728120803833, "reward_std": 0.016745882108807564, "rewards/accuracy_reward": 0.9072812795639038, "rewards/format_reward": 1.0, "step": 1940 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.09375, "epoch": 0.026772044523523813, "grad_norm": 4.760308136592588, "kl": 0.08349609375, "learning_rate": 9.98232551366896e-07, "loss": 0.0033, "reward": 2.116062641143799, "reward_std": 0.016728738322854042, "rewards/accuracy_reward": 0.9160624742507935, "rewards/format_reward": 1.0, "step": 1941 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.1875, "epoch": 0.02678583743672501, "grad_norm": 2.2878912312656414, "kl": 0.07275390625, "learning_rate": 9.982307307996054e-07, "loss": 0.0029, "reward": 2.1108436584472656, "reward_std": 0.012023009359836578, "rewards/accuracy_reward": 0.9108437895774841, "rewards/format_reward": 1.0, "step": 1942 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.8125, "epoch": 0.026799630349926207, "grad_norm": 2.37695671372471, "kl": 0.08740234375, "learning_rate": 9.98228909296818e-07, "loss": 0.0035, "reward": 2.096062660217285, "reward_std": 0.02040119841694832, "rewards/accuracy_reward": 0.8960624933242798, "rewards/format_reward": 1.0, "step": 1943 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.59375, "epoch": 0.026813423263127405, "grad_norm": 2.9096358192113403, "kl": 0.08154296875, "learning_rate": 9.982270868585376e-07, "loss": 0.0033, "reward": 2.075531244277954, "reward_std": 0.030106842517852783, "rewards/accuracy_reward": 0.881781280040741, "rewards/format_reward": 1.0, "step": 1944 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.625, "epoch": 0.026827216176328602, "grad_norm": 18.07686300226145, "kl": 0.0712890625, "learning_rate": 9.98225263484767e-07, "loss": 0.0028, "reward": 2.146228313446045, "reward_std": 0.011443981900811195, "rewards/accuracy_reward": 0.9462281465530396, "rewards/format_reward": 1.0, "step": 1945 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.78125, "epoch": 0.0268410090895298, "grad_norm": 4.711936782595007, "kl": 0.08154296875, "learning_rate": 9.982234391755101e-07, "loss": 0.0033, "reward": 2.153125047683716, "reward_std": 0.021105458959937096, "rewards/accuracy_reward": 0.9531249403953552, "rewards/format_reward": 1.0, "step": 1946 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.1875, "epoch": 0.026854802002730997, "grad_norm": 2.269853761924988, "kl": 0.08203125, "learning_rate": 9.982216139307704e-07, "loss": 0.0033, "reward": 2.098656177520752, "reward_std": 0.015547218732535839, "rewards/accuracy_reward": 0.8986562490463257, "rewards/format_reward": 1.0, "step": 1947 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.026868594915932194, "grad_norm": 4.844174160012034, "kl": 0.072265625, "learning_rate": 9.982197877505507e-07, "loss": 0.0029, "reward": 1.9913437366485596, "reward_std": 0.027762141078710556, "rewards/accuracy_reward": 0.7913437485694885, "rewards/format_reward": 1.0, "step": 1948 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.875, "epoch": 0.02688238782913339, "grad_norm": 3.9429491554399276, "kl": 0.0791015625, "learning_rate": 9.982179606348552e-07, "loss": 0.0031, "reward": 2.0159687995910645, "reward_std": 0.016044585034251213, "rewards/accuracy_reward": 0.8159687519073486, "rewards/format_reward": 1.0, "step": 1949 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.875, "epoch": 0.02689618074233459, "grad_norm": 2.5314859747253577, "kl": 0.083984375, "learning_rate": 9.982161325836867e-07, "loss": 0.0034, "reward": 2.1345314979553223, "reward_std": 0.013676775619387627, "rewards/accuracy_reward": 0.9345312118530273, "rewards/format_reward": 1.0, "step": 1950 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.34375, "epoch": 0.026909973655535786, "grad_norm": 3.7754952187315673, "kl": 0.0751953125, "learning_rate": 9.98214303597049e-07, "loss": 0.003, "reward": 2.1527812480926514, "reward_std": 0.009778826497495174, "rewards/accuracy_reward": 0.9527812004089355, "rewards/format_reward": 1.0, "step": 1951 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.6875, "epoch": 0.026923766568736983, "grad_norm": 3.894202600297443, "kl": 0.0703125, "learning_rate": 9.982124736749456e-07, "loss": 0.0028, "reward": 2.1665310859680176, "reward_std": 0.028231702744960785, "rewards/accuracy_reward": 0.972781240940094, "rewards/format_reward": 1.0, "step": 1952 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.21875, "epoch": 0.02693755948193818, "grad_norm": 3.0224362183706552, "kl": 0.07177734375, "learning_rate": 9.982106428173795e-07, "loss": 0.0029, "reward": 2.1128437519073486, "reward_std": 0.022281402722001076, "rewards/accuracy_reward": 0.9128437042236328, "rewards/format_reward": 1.0, "step": 1953 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 380.25, "epoch": 0.026951352395139377, "grad_norm": 2.7560229497486577, "kl": 0.080078125, "learning_rate": 9.982088110243547e-07, "loss": 0.0032, "reward": 2.126312494277954, "reward_std": 0.01885402575135231, "rewards/accuracy_reward": 0.9263124465942383, "rewards/format_reward": 1.0, "step": 1954 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.0625, "epoch": 0.026965145308340575, "grad_norm": 2.2702322374515997, "kl": 0.0712890625, "learning_rate": 9.982069782958743e-07, "loss": 0.0029, "reward": 2.132093906402588, "reward_std": 0.01684606447815895, "rewards/accuracy_reward": 0.9320937395095825, "rewards/format_reward": 1.0, "step": 1955 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 383.3125, "epoch": 0.026978938221541772, "grad_norm": 2.7368210978615095, "kl": 0.076171875, "learning_rate": 9.982051446319415e-07, "loss": 0.0031, "reward": 2.015718936920166, "reward_std": 0.03476415574550629, "rewards/accuracy_reward": 0.8219687938690186, "rewards/format_reward": 1.0, "step": 1956 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.9375, "epoch": 0.02699273113474297, "grad_norm": 12.12168128461032, "kl": 0.08984375, "learning_rate": 9.982033100325601e-07, "loss": 0.0036, "reward": 2.1455624103546143, "reward_std": 0.01730254851281643, "rewards/accuracy_reward": 0.945562481880188, "rewards/format_reward": 1.0, "step": 1957 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.875, "epoch": 0.027006524047944167, "grad_norm": 2.6107115616525496, "kl": 0.0791015625, "learning_rate": 9.982014744977338e-07, "loss": 0.0032, "reward": 2.1704063415527344, "reward_std": 0.013156077824532986, "rewards/accuracy_reward": 0.9704062938690186, "rewards/format_reward": 1.0, "step": 1958 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.9375, "epoch": 0.027020316961145364, "grad_norm": 1.9175153963905764, "kl": 0.07373046875, "learning_rate": 9.981996380274655e-07, "loss": 0.0029, "reward": 2.139937400817871, "reward_std": 0.029868323355913162, "rewards/accuracy_reward": 0.9399374723434448, "rewards/format_reward": 1.0, "step": 1959 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.3125, "epoch": 0.02703410987434656, "grad_norm": 4.01039788479685, "kl": 0.080078125, "learning_rate": 9.98197800621759e-07, "loss": 0.0032, "reward": 2.151249885559082, "reward_std": 0.02801770344376564, "rewards/accuracy_reward": 0.9574999809265137, "rewards/format_reward": 1.0, "step": 1960 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.03125, "epoch": 0.02704790278754776, "grad_norm": 1.8568752414585066, "kl": 0.07275390625, "learning_rate": 9.981959622806174e-07, "loss": 0.0029, "reward": 2.1570000648498535, "reward_std": 0.013699093833565712, "rewards/accuracy_reward": 0.9569999575614929, "rewards/format_reward": 1.0, "step": 1961 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 370.28125, "epoch": 0.027061695700748956, "grad_norm": 2.023916101610666, "kl": 0.08056640625, "learning_rate": 9.981941230040446e-07, "loss": 0.0032, "reward": 2.1745312213897705, "reward_std": 0.02012145332992077, "rewards/accuracy_reward": 0.9807811975479126, "rewards/format_reward": 1.0, "step": 1962 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.09375, "epoch": 0.027075488613950153, "grad_norm": 5.848948952517147, "kl": 0.08935546875, "learning_rate": 9.981922827920438e-07, "loss": 0.0036, "reward": 2.152250051498413, "reward_std": 0.01440475881099701, "rewards/accuracy_reward": 0.9522500038146973, "rewards/format_reward": 1.0, "step": 1963 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.8125, "epoch": 0.02708928152715135, "grad_norm": 5.063419446627412, "kl": 0.0849609375, "learning_rate": 9.981904416446183e-07, "loss": 0.0034, "reward": 2.0025312900543213, "reward_std": 0.03082101047039032, "rewards/accuracy_reward": 0.8025312423706055, "rewards/format_reward": 1.0, "step": 1964 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.5625, "epoch": 0.027103074440352547, "grad_norm": 2.479213858347537, "kl": 0.078125, "learning_rate": 9.981885995617718e-07, "loss": 0.0031, "reward": 2.1049375534057617, "reward_std": 0.022446513175964355, "rewards/accuracy_reward": 0.9049373865127563, "rewards/format_reward": 1.0, "step": 1965 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 387.59375, "epoch": 0.027116867353553745, "grad_norm": 2.600204433737937, "kl": 0.08349609375, "learning_rate": 9.981867565435079e-07, "loss": 0.0033, "reward": 2.0359063148498535, "reward_std": 0.07090350240468979, "rewards/accuracy_reward": 0.8609062433242798, "rewards/format_reward": 1.0, "step": 1966 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.027130660266754942, "grad_norm": 3.9609062881331445, "kl": 0.0751953125, "learning_rate": 9.981849125898296e-07, "loss": 0.003, "reward": 2.112093925476074, "reward_std": 0.015172101557254791, "rewards/accuracy_reward": 0.9120937585830688, "rewards/format_reward": 1.0, "step": 1967 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.71875, "epoch": 0.02714445317995614, "grad_norm": 3.9779621494863298, "kl": 0.072265625, "learning_rate": 9.981830677007407e-07, "loss": 0.0029, "reward": 1.975968837738037, "reward_std": 0.009230835363268852, "rewards/accuracy_reward": 0.7759687900543213, "rewards/format_reward": 1.0, "step": 1968 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.84375, "epoch": 0.027158246093157336, "grad_norm": 8.56251786215719, "kl": 0.07763671875, "learning_rate": 9.981812218762446e-07, "loss": 0.0031, "reward": 2.0310938358306885, "reward_std": 0.017720166593790054, "rewards/accuracy_reward": 0.8310937881469727, "rewards/format_reward": 1.0, "step": 1969 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 401.59375, "epoch": 0.027172039006358534, "grad_norm": 2.492009152427361, "kl": 0.0791015625, "learning_rate": 9.98179375116345e-07, "loss": 0.0032, "reward": 1.9724063873291016, "reward_std": 0.03215337544679642, "rewards/accuracy_reward": 0.7786562442779541, "rewards/format_reward": 1.0, "step": 1970 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.6875, "epoch": 0.02718583191955973, "grad_norm": 2.122142069281791, "kl": 0.07470703125, "learning_rate": 9.981775274210448e-07, "loss": 0.003, "reward": 2.152937412261963, "reward_std": 0.023467950522899628, "rewards/accuracy_reward": 0.9591875076293945, "rewards/format_reward": 1.0, "step": 1971 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.78125, "epoch": 0.02719962483276093, "grad_norm": 5.255352051053777, "kl": 0.0771484375, "learning_rate": 9.98175678790348e-07, "loss": 0.0031, "reward": 2.074718713760376, "reward_std": 0.023217612877488136, "rewards/accuracy_reward": 0.8747187256813049, "rewards/format_reward": 1.0, "step": 1972 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.40625, "epoch": 0.027213417745962126, "grad_norm": 3.306688164824961, "kl": 0.080078125, "learning_rate": 9.981738292242575e-07, "loss": 0.0032, "reward": 2.100562572479248, "reward_std": 0.03188871964812279, "rewards/accuracy_reward": 0.9068125486373901, "rewards/format_reward": 1.0, "step": 1973 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.027227210659163323, "grad_norm": 3.200523432744986, "kl": 0.076171875, "learning_rate": 9.981719787227777e-07, "loss": 0.003, "reward": 2.0820624828338623, "reward_std": 0.029341604560613632, "rewards/accuracy_reward": 0.882062554359436, "rewards/format_reward": 1.0, "step": 1974 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.90625, "epoch": 0.02724100357236452, "grad_norm": 2.942676915792152, "kl": 0.07470703125, "learning_rate": 9.98170127285911e-07, "loss": 0.003, "reward": 2.1136250495910645, "reward_std": 0.03603549301624298, "rewards/accuracy_reward": 0.9261250495910645, "rewards/format_reward": 1.0, "step": 1975 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.1875, "epoch": 0.027254796485565717, "grad_norm": 2.672573354475276, "kl": 0.08447265625, "learning_rate": 9.981682749136615e-07, "loss": 0.0034, "reward": 2.0636250972747803, "reward_std": 0.015592987649142742, "rewards/accuracy_reward": 0.8636249303817749, "rewards/format_reward": 1.0, "step": 1976 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.21875, "epoch": 0.027268589398766915, "grad_norm": 2.124067155647725, "kl": 0.08447265625, "learning_rate": 9.98166421606033e-07, "loss": 0.0034, "reward": 2.160562515258789, "reward_std": 0.00924752838909626, "rewards/accuracy_reward": 0.9605624675750732, "rewards/format_reward": 1.0, "step": 1977 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.027282382311968112, "grad_norm": 3.582608939553046, "kl": 0.0771484375, "learning_rate": 9.981645673630279e-07, "loss": 0.0031, "reward": 2.0482499599456787, "reward_std": 0.0560419037938118, "rewards/accuracy_reward": 0.8607499599456787, "rewards/format_reward": 1.0, "step": 1978 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.1875, "epoch": 0.02729617522516931, "grad_norm": 3.909923980094944, "kl": 0.068359375, "learning_rate": 9.981627121846507e-07, "loss": 0.0027, "reward": 2.1547813415527344, "reward_std": 0.008173666894435883, "rewards/accuracy_reward": 0.9547812938690186, "rewards/format_reward": 1.0, "step": 1979 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.28125, "epoch": 0.027309968138370506, "grad_norm": 2.298319390584412, "kl": 0.0703125, "learning_rate": 9.981608560709044e-07, "loss": 0.0028, "reward": 2.1258749961853027, "reward_std": 0.012761179357767105, "rewards/accuracy_reward": 0.9258750081062317, "rewards/format_reward": 1.0, "step": 1980 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.25, "epoch": 0.027323761051571704, "grad_norm": 3.62657996977868, "kl": 0.0830078125, "learning_rate": 9.981589990217927e-07, "loss": 0.0033, "reward": 2.145718574523926, "reward_std": 0.01764613762497902, "rewards/accuracy_reward": 0.9457187652587891, "rewards/format_reward": 1.0, "step": 1981 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.3125, "epoch": 0.0273375539647729, "grad_norm": 4.885730841161599, "kl": 0.08251953125, "learning_rate": 9.981571410373189e-07, "loss": 0.0033, "reward": 2.03696870803833, "reward_std": 0.02655961364507675, "rewards/accuracy_reward": 0.836968719959259, "rewards/format_reward": 1.0, "step": 1982 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.6875, "epoch": 0.0273513468779741, "grad_norm": 2.1726128678289283, "kl": 0.08203125, "learning_rate": 9.981552821174865e-07, "loss": 0.0033, "reward": 2.051968574523926, "reward_std": 0.043825335800647736, "rewards/accuracy_reward": 0.8707187175750732, "rewards/format_reward": 1.0, "step": 1983 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.125, "epoch": 0.027365139791175296, "grad_norm": 2.3507363545677222, "kl": 0.0791015625, "learning_rate": 9.98153422262299e-07, "loss": 0.0032, "reward": 1.9909999370574951, "reward_std": 0.01418959628790617, "rewards/accuracy_reward": 0.7910000085830688, "rewards/format_reward": 1.0, "step": 1984 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 431.90625, "epoch": 0.027378932704376493, "grad_norm": 2.3753046950223196, "kl": 0.07177734375, "learning_rate": 9.981515614717603e-07, "loss": 0.0029, "reward": 2.053093910217285, "reward_std": 0.01402326300740242, "rewards/accuracy_reward": 0.8530938029289246, "rewards/format_reward": 1.0, "step": 1985 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 409.90625, "epoch": 0.02739272561757769, "grad_norm": 3.391280828791311, "kl": 0.08203125, "learning_rate": 9.98149699745873e-07, "loss": 0.0033, "reward": 1.9285000562667847, "reward_std": 0.023256802931427956, "rewards/accuracy_reward": 0.7285000085830688, "rewards/format_reward": 1.0, "step": 1986 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.3125, "epoch": 0.027406518530778887, "grad_norm": 2.6992841528426923, "kl": 0.076171875, "learning_rate": 9.981478370846417e-07, "loss": 0.003, "reward": 2.1123123168945312, "reward_std": 0.016241051256656647, "rewards/accuracy_reward": 0.9123125076293945, "rewards/format_reward": 1.0, "step": 1987 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 417.6875, "epoch": 0.027420311443980085, "grad_norm": 3.2972260704656615, "kl": 0.08984375, "learning_rate": 9.98145973488069e-07, "loss": 0.0036, "reward": 2.121500015258789, "reward_std": 0.03570738807320595, "rewards/accuracy_reward": 0.9277500510215759, "rewards/format_reward": 1.0, "step": 1988 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.125, "epoch": 0.027434104357181282, "grad_norm": 1.9603578758219324, "kl": 0.0888671875, "learning_rate": 9.981441089561589e-07, "loss": 0.0036, "reward": 2.102656364440918, "reward_std": 0.025772742927074432, "rewards/accuracy_reward": 0.9089062213897705, "rewards/format_reward": 1.0, "step": 1989 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.25, "epoch": 0.02744789727038248, "grad_norm": 3.9682732469748, "kl": 0.076171875, "learning_rate": 9.981422434889147e-07, "loss": 0.0031, "reward": 2.013406276702881, "reward_std": 0.012101572938263416, "rewards/accuracy_reward": 0.813406229019165, "rewards/format_reward": 1.0, "step": 1990 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.21875, "epoch": 0.027461690183583673, "grad_norm": 3.5203845609518853, "kl": 0.07666015625, "learning_rate": 9.981403770863398e-07, "loss": 0.0031, "reward": 2.106062412261963, "reward_std": 0.015214774757623672, "rewards/accuracy_reward": 0.9060624837875366, "rewards/format_reward": 1.0, "step": 1991 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.65625, "epoch": 0.02747548309678487, "grad_norm": 4.103945117895234, "kl": 0.0771484375, "learning_rate": 9.98138509748438e-07, "loss": 0.0031, "reward": 2.1064374446868896, "reward_std": 0.008362555876374245, "rewards/accuracy_reward": 0.9064374566078186, "rewards/format_reward": 1.0, "step": 1992 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.96875, "epoch": 0.027489276009986068, "grad_norm": 1.8409485463186182, "kl": 0.07373046875, "learning_rate": 9.981366414752126e-07, "loss": 0.003, "reward": 1.9903438091278076, "reward_std": 0.02833339013159275, "rewards/accuracy_reward": 0.7965936660766602, "rewards/format_reward": 1.0, "step": 1993 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.78125, "epoch": 0.027503068923187265, "grad_norm": 1.507191170318596, "kl": 0.06787109375, "learning_rate": 9.98134772266667e-07, "loss": 0.0027, "reward": 2.097062587738037, "reward_std": 0.008244003169238567, "rewards/accuracy_reward": 0.8970625400543213, "rewards/format_reward": 1.0, "step": 1994 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.875, "epoch": 0.027516861836388462, "grad_norm": 6.4563752978414355, "kl": 0.08154296875, "learning_rate": 9.981329021228051e-07, "loss": 0.0033, "reward": 2.1385626792907715, "reward_std": 0.017914045602083206, "rewards/accuracy_reward": 0.9385625123977661, "rewards/format_reward": 1.0, "step": 1995 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.65625, "epoch": 0.02753065474958966, "grad_norm": 2.9601656070699685, "kl": 0.0771484375, "learning_rate": 9.9813103104363e-07, "loss": 0.0031, "reward": 2.1109063625335693, "reward_std": 0.02493947744369507, "rewards/accuracy_reward": 0.910906195640564, "rewards/format_reward": 1.0, "step": 1996 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.96875, "epoch": 0.027544447662790857, "grad_norm": 1.950536689661912, "kl": 0.076171875, "learning_rate": 9.981291590291456e-07, "loss": 0.003, "reward": 2.0784687995910645, "reward_std": 0.09255323559045792, "rewards/accuracy_reward": 0.9097187519073486, "rewards/format_reward": 0.96875, "step": 1997 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.6875, "epoch": 0.027558240575992054, "grad_norm": 2.8686894224680453, "kl": 0.08984375, "learning_rate": 9.981272860793552e-07, "loss": 0.0036, "reward": 2.078218936920166, "reward_std": 0.015872638672590256, "rewards/accuracy_reward": 0.8782187104225159, "rewards/format_reward": 1.0, "step": 1998 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.46875, "epoch": 0.02757203348919325, "grad_norm": 2.5342368902223744, "kl": 0.07421875, "learning_rate": 9.981254121942622e-07, "loss": 0.003, "reward": 2.1020312309265137, "reward_std": 0.010027805343270302, "rewards/accuracy_reward": 0.9020313024520874, "rewards/format_reward": 1.0, "step": 1999 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.0, "epoch": 0.02758582640239445, "grad_norm": 2.610870944141539, "kl": 0.08056640625, "learning_rate": 9.981235373738703e-07, "loss": 0.0032, "reward": 2.119406223297119, "reward_std": 0.014805818907916546, "rewards/accuracy_reward": 0.9194062352180481, "rewards/format_reward": 1.0, "step": 2000 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.4375, "epoch": 0.027599619315595646, "grad_norm": 2.8084775351197404, "kl": 0.076171875, "learning_rate": 9.98121661618183e-07, "loss": 0.003, "reward": 2.1069064140319824, "reward_std": 0.019730646163225174, "rewards/accuracy_reward": 0.906906247138977, "rewards/format_reward": 1.0, "step": 2001 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.65625, "epoch": 0.027613412228796843, "grad_norm": 2.7114356860055113, "kl": 0.083984375, "learning_rate": 9.981197849272038e-07, "loss": 0.0034, "reward": 2.0946874618530273, "reward_std": 0.0399150587618351, "rewards/accuracy_reward": 0.9009374976158142, "rewards/format_reward": 1.0, "step": 2002 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.6875, "epoch": 0.02762720514199804, "grad_norm": 2.3881314964794735, "kl": 0.07861328125, "learning_rate": 9.98117907300936e-07, "loss": 0.0032, "reward": 2.1321563720703125, "reward_std": 0.021678175777196884, "rewards/accuracy_reward": 0.9321562647819519, "rewards/format_reward": 1.0, "step": 2003 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.6875, "epoch": 0.027640998055199237, "grad_norm": 2.9228114885434158, "kl": 0.078125, "learning_rate": 9.981160287393835e-07, "loss": 0.0031, "reward": 2.160749912261963, "reward_std": 0.011518163606524467, "rewards/accuracy_reward": 0.9607499837875366, "rewards/format_reward": 1.0, "step": 2004 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.625, "epoch": 0.027654790968400435, "grad_norm": 2.3171069852133326, "kl": 0.072265625, "learning_rate": 9.981141492425496e-07, "loss": 0.0029, "reward": 2.1293439865112305, "reward_std": 0.017616232857108116, "rewards/accuracy_reward": 0.9293437004089355, "rewards/format_reward": 1.0, "step": 2005 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.09375, "epoch": 0.027668583881601632, "grad_norm": 2.69804513497294, "kl": 0.08203125, "learning_rate": 9.98112268810438e-07, "loss": 0.0033, "reward": 2.1133437156677246, "reward_std": 0.011599687859416008, "rewards/accuracy_reward": 0.9133437871932983, "rewards/format_reward": 1.0, "step": 2006 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.9375, "epoch": 0.02768237679480283, "grad_norm": 2.7925807919908587, "kl": 0.076171875, "learning_rate": 9.981103874430521e-07, "loss": 0.0031, "reward": 2.0713436603546143, "reward_std": 0.028428178280591965, "rewards/accuracy_reward": 0.8775937557220459, "rewards/format_reward": 1.0, "step": 2007 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.6875, "epoch": 0.027696169708004027, "grad_norm": 6.916744212955163, "kl": 0.08447265625, "learning_rate": 9.981085051403954e-07, "loss": 0.0034, "reward": 2.0361876487731934, "reward_std": 0.03168478608131409, "rewards/accuracy_reward": 0.8424375057220459, "rewards/format_reward": 1.0, "step": 2008 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.21875, "epoch": 0.027709962621205224, "grad_norm": 3.8174792734655085, "kl": 0.07568359375, "learning_rate": 9.981066219024715e-07, "loss": 0.003, "reward": 2.084437370300293, "reward_std": 0.008145329542458057, "rewards/accuracy_reward": 0.8844374418258667, "rewards/format_reward": 1.0, "step": 2009 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.0, "epoch": 0.02772375553440642, "grad_norm": 2.672495920194106, "kl": 0.08154296875, "learning_rate": 9.981047377292839e-07, "loss": 0.0033, "reward": 2.0176563262939453, "reward_std": 0.01224865484982729, "rewards/accuracy_reward": 0.8176562786102295, "rewards/format_reward": 1.0, "step": 2010 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.78125, "epoch": 0.02773754844760762, "grad_norm": 2.4021085534516007, "kl": 0.0869140625, "learning_rate": 9.981028526208362e-07, "loss": 0.0035, "reward": 2.1004064083099365, "reward_std": 0.029316674917936325, "rewards/accuracy_reward": 0.9066562652587891, "rewards/format_reward": 1.0, "step": 2011 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.6875, "epoch": 0.027751341360808816, "grad_norm": 2.8153433546616915, "kl": 0.07958984375, "learning_rate": 9.981009665771318e-07, "loss": 0.0032, "reward": 2.1179685592651367, "reward_std": 0.011852141469717026, "rewards/accuracy_reward": 0.9179688096046448, "rewards/format_reward": 1.0, "step": 2012 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.5, "epoch": 0.027765134274010013, "grad_norm": 3.044222433287558, "kl": 0.0869140625, "learning_rate": 9.980990795981744e-07, "loss": 0.0035, "reward": 2.079031467437744, "reward_std": 0.014792138710618019, "rewards/accuracy_reward": 0.879031240940094, "rewards/format_reward": 1.0, "step": 2013 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.15625, "epoch": 0.02777892718721121, "grad_norm": 2.981786770186084, "kl": 0.0751953125, "learning_rate": 9.980971916839676e-07, "loss": 0.003, "reward": 1.968656301498413, "reward_std": 0.027573253959417343, "rewards/accuracy_reward": 0.7811562418937683, "rewards/format_reward": 1.0, "step": 2014 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.0, "epoch": 0.027792720100412407, "grad_norm": 5.659418526181548, "kl": 0.08837890625, "learning_rate": 9.980953028345148e-07, "loss": 0.0035, "reward": 2.138031244277954, "reward_std": 0.03208222612738609, "rewards/accuracy_reward": 0.944281280040741, "rewards/format_reward": 1.0, "step": 2015 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.0625, "epoch": 0.027806513013613605, "grad_norm": 2.414807506931623, "kl": 0.07373046875, "learning_rate": 9.980934130498195e-07, "loss": 0.003, "reward": 2.043093681335449, "reward_std": 0.010804702527821064, "rewards/accuracy_reward": 0.843093752861023, "rewards/format_reward": 1.0, "step": 2016 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.625, "epoch": 0.027820305926814802, "grad_norm": 30.444039451052753, "kl": 0.0830078125, "learning_rate": 9.980915223298854e-07, "loss": 0.0033, "reward": 2.060999870300293, "reward_std": 0.02490059658885002, "rewards/accuracy_reward": 0.8610000014305115, "rewards/format_reward": 1.0, "step": 2017 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.25, "epoch": 0.027834098840016, "grad_norm": 3.5985905327780716, "kl": 0.09033203125, "learning_rate": 9.980896306747158e-07, "loss": 0.0036, "reward": 2.0984063148498535, "reward_std": 0.019991839304566383, "rewards/accuracy_reward": 0.8984062671661377, "rewards/format_reward": 1.0, "step": 2018 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.03125, "epoch": 0.027847891753217197, "grad_norm": 4.476733650490753, "kl": 0.1015625, "learning_rate": 9.980877380843145e-07, "loss": 0.0041, "reward": 1.99609375, "reward_std": 0.02948116697371006, "rewards/accuracy_reward": 0.8023437261581421, "rewards/format_reward": 1.0, "step": 2019 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.84375, "epoch": 0.027861684666418394, "grad_norm": 2.568336154292667, "kl": 0.08740234375, "learning_rate": 9.98085844558685e-07, "loss": 0.0035, "reward": 2.0034687519073486, "reward_std": 0.011369307525455952, "rewards/accuracy_reward": 0.8034687638282776, "rewards/format_reward": 1.0, "step": 2020 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 383.9375, "epoch": 0.02787547757961959, "grad_norm": 2.5557683115881105, "kl": 0.0732421875, "learning_rate": 9.98083950097831e-07, "loss": 0.0029, "reward": 2.0200624465942383, "reward_std": 0.04915841668844223, "rewards/accuracy_reward": 0.8325625061988831, "rewards/format_reward": 1.0, "step": 2021 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.40625, "epoch": 0.02788927049282079, "grad_norm": 15.391262802454044, "kl": 0.1025390625, "learning_rate": 9.980820547017557e-07, "loss": 0.0041, "reward": 2.0832810401916504, "reward_std": 0.014530522748827934, "rewards/accuracy_reward": 0.8832812309265137, "rewards/format_reward": 1.0, "step": 2022 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.40625, "epoch": 0.027903063406021986, "grad_norm": 3.5155203681260545, "kl": 0.09326171875, "learning_rate": 9.98080158370463e-07, "loss": 0.0037, "reward": 2.1206250190734863, "reward_std": 0.033267080783843994, "rewards/accuracy_reward": 0.9268750548362732, "rewards/format_reward": 1.0, "step": 2023 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.1875, "epoch": 0.027916856319223183, "grad_norm": 2.002417237529394, "kl": 0.09375, "learning_rate": 9.980782611039562e-07, "loss": 0.0038, "reward": 1.9477499723434448, "reward_std": 0.009035258553922176, "rewards/accuracy_reward": 0.7477500438690186, "rewards/format_reward": 1.0, "step": 2024 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.84375, "epoch": 0.02793064923242438, "grad_norm": 2.283592767109554, "kl": 0.0849609375, "learning_rate": 9.98076362902239e-07, "loss": 0.0034, "reward": 2.0442469120025635, "reward_std": 0.004505848977714777, "rewards/accuracy_reward": 0.8442468047142029, "rewards/format_reward": 1.0, "step": 2025 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.78125, "epoch": 0.027944442145625577, "grad_norm": 2.9378510321527544, "kl": 0.08984375, "learning_rate": 9.980744637653148e-07, "loss": 0.0036, "reward": 2.0288126468658447, "reward_std": 0.018277857452630997, "rewards/accuracy_reward": 0.8288124799728394, "rewards/format_reward": 1.0, "step": 2026 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.25, "epoch": 0.027958235058826775, "grad_norm": 2.3118866496318957, "kl": 0.08251953125, "learning_rate": 9.980725636931875e-07, "loss": 0.0033, "reward": 2.118375062942505, "reward_std": 0.027417782694101334, "rewards/accuracy_reward": 0.924625039100647, "rewards/format_reward": 1.0, "step": 2027 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.8125, "epoch": 0.027972027972027972, "grad_norm": 2.0557021536390807, "kl": 0.0869140625, "learning_rate": 9.980706626858607e-07, "loss": 0.0035, "reward": 2.1043436527252197, "reward_std": 0.0201911348849535, "rewards/accuracy_reward": 0.9105937480926514, "rewards/format_reward": 1.0, "step": 2028 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.84375, "epoch": 0.02798582088522917, "grad_norm": 2.411652807128667, "kl": 0.09033203125, "learning_rate": 9.980687607433373e-07, "loss": 0.0036, "reward": 2.1010470390319824, "reward_std": 0.03239306062459946, "rewards/accuracy_reward": 0.9072968363761902, "rewards/format_reward": 1.0, "step": 2029 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.25, "epoch": 0.027999613798430367, "grad_norm": 3.368887657065975, "kl": 0.0830078125, "learning_rate": 9.980668578656215e-07, "loss": 0.0033, "reward": 2.017500162124634, "reward_std": 0.01959807053208351, "rewards/accuracy_reward": 0.8174999952316284, "rewards/format_reward": 1.0, "step": 2030 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.28125, "epoch": 0.028013406711631564, "grad_norm": 2.336123805715452, "kl": 0.08203125, "learning_rate": 9.980649540527165e-07, "loss": 0.0033, "reward": 2.129593849182129, "reward_std": 0.015878131613135338, "rewards/accuracy_reward": 0.9295936822891235, "rewards/format_reward": 1.0, "step": 2031 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.6875, "epoch": 0.02802719962483276, "grad_norm": 3.1886311497815454, "kl": 0.0908203125, "learning_rate": 9.980630493046264e-07, "loss": 0.0036, "reward": 2.1080312728881836, "reward_std": 0.017673006281256676, "rewards/accuracy_reward": 0.9080312252044678, "rewards/format_reward": 1.0, "step": 2032 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.4375, "epoch": 0.02804099253803396, "grad_norm": 2.9214863905656006, "kl": 0.0830078125, "learning_rate": 9.980611436213544e-07, "loss": 0.0033, "reward": 2.0946249961853027, "reward_std": 0.018046777695417404, "rewards/accuracy_reward": 0.8946249485015869, "rewards/format_reward": 1.0, "step": 2033 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.25, "epoch": 0.028054785451235156, "grad_norm": 5.238866070088939, "kl": 0.08349609375, "learning_rate": 9.98059237002904e-07, "loss": 0.0033, "reward": 2.138718843460083, "reward_std": 0.017147937789559364, "rewards/accuracy_reward": 0.9387186765670776, "rewards/format_reward": 1.0, "step": 2034 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.53125, "epoch": 0.028068578364436353, "grad_norm": 3.306845612252958, "kl": 0.0908203125, "learning_rate": 9.980573294492787e-07, "loss": 0.0036, "reward": 1.9710625410079956, "reward_std": 0.03476882725954056, "rewards/accuracy_reward": 0.7773123979568481, "rewards/format_reward": 1.0, "step": 2035 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.9375, "epoch": 0.02808237127763755, "grad_norm": 2.9547635161751504, "kl": 0.08837890625, "learning_rate": 9.980554209604825e-07, "loss": 0.0035, "reward": 2.027468681335449, "reward_std": 0.012101997621357441, "rewards/accuracy_reward": 0.8274686932563782, "rewards/format_reward": 1.0, "step": 2036 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.0625, "epoch": 0.028096164190838747, "grad_norm": 7.854176775858453, "kl": 0.0859375, "learning_rate": 9.980535115365188e-07, "loss": 0.0034, "reward": 2.114281177520752, "reward_std": 0.009043807163834572, "rewards/accuracy_reward": 0.9142812490463257, "rewards/format_reward": 1.0, "step": 2037 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.6875, "epoch": 0.028109957104039945, "grad_norm": 2.904379586832252, "kl": 0.1025390625, "learning_rate": 9.98051601177391e-07, "loss": 0.0041, "reward": 2.0442187786102295, "reward_std": 0.020171722397208214, "rewards/accuracy_reward": 0.8442187309265137, "rewards/format_reward": 1.0, "step": 2038 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.21875, "epoch": 0.028123750017241142, "grad_norm": 2.8714500229049826, "kl": 0.09375, "learning_rate": 9.98049689883103e-07, "loss": 0.0037, "reward": 2.097871780395508, "reward_std": 0.02635156363248825, "rewards/accuracy_reward": 0.8978718519210815, "rewards/format_reward": 1.0, "step": 2039 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.625, "epoch": 0.02813754293044234, "grad_norm": 2.2630764513596566, "kl": 0.08740234375, "learning_rate": 9.98047777653658e-07, "loss": 0.0035, "reward": 2.1635000705718994, "reward_std": 0.01236058585345745, "rewards/accuracy_reward": 0.9635000228881836, "rewards/format_reward": 1.0, "step": 2040 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.96875, "epoch": 0.028151335843643537, "grad_norm": 3.0834774585446327, "kl": 0.0791015625, "learning_rate": 9.980458644890598e-07, "loss": 0.0032, "reward": 2.052968978881836, "reward_std": 0.008962081745266914, "rewards/accuracy_reward": 0.8529687523841858, "rewards/format_reward": 1.0, "step": 2041 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.25, "epoch": 0.028165128756844734, "grad_norm": 2.5839638634275297, "kl": 0.08935546875, "learning_rate": 9.980439503893124e-07, "loss": 0.0036, "reward": 2.104062557220459, "reward_std": 0.017595740035176277, "rewards/accuracy_reward": 0.9040625691413879, "rewards/format_reward": 1.0, "step": 2042 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.84375, "epoch": 0.02817892167004593, "grad_norm": 2.783328861502429, "kl": 0.08984375, "learning_rate": 9.980420353544186e-07, "loss": 0.0036, "reward": 2.1425342559814453, "reward_std": 0.011194610968232155, "rewards/accuracy_reward": 0.942534327507019, "rewards/format_reward": 1.0, "step": 2043 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.02819271458324713, "grad_norm": 2.971659369320894, "kl": 0.0859375, "learning_rate": 9.980401193843827e-07, "loss": 0.0034, "reward": 2.002406120300293, "reward_std": 0.03162484988570213, "rewards/accuracy_reward": 0.8024062514305115, "rewards/format_reward": 1.0, "step": 2044 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.028206507496448326, "grad_norm": 12.679050087466655, "kl": 0.0791015625, "learning_rate": 9.980382024792077e-07, "loss": 0.0032, "reward": 2.062312602996826, "reward_std": 0.020395871251821518, "rewards/accuracy_reward": 0.8623125553131104, "rewards/format_reward": 1.0, "step": 2045 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 375.34375, "epoch": 0.028220300409649523, "grad_norm": 2.196318607577958, "kl": 0.080078125, "learning_rate": 9.980362846388976e-07, "loss": 0.0032, "reward": 1.978562593460083, "reward_std": 0.02489536441862583, "rewards/accuracy_reward": 0.7848124504089355, "rewards/format_reward": 1.0, "step": 2046 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.1875, "epoch": 0.02823409332285072, "grad_norm": 1.9361530369501339, "kl": 0.0859375, "learning_rate": 9.98034365863456e-07, "loss": 0.0034, "reward": 2.115906238555908, "reward_std": 0.01409249659627676, "rewards/accuracy_reward": 0.9159062504768372, "rewards/format_reward": 1.0, "step": 2047 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.0625, "epoch": 0.028247886236051917, "grad_norm": 2.6855138861331804, "kl": 0.08642578125, "learning_rate": 9.980324461528865e-07, "loss": 0.0035, "reward": 2.106250047683716, "reward_std": 0.014943841844797134, "rewards/accuracy_reward": 0.9062499403953552, "rewards/format_reward": 1.0, "step": 2048 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.6875, "epoch": 0.028261679149253115, "grad_norm": 6.594156869797373, "kl": 0.083984375, "learning_rate": 9.980305255071922e-07, "loss": 0.0034, "reward": 2.0450313091278076, "reward_std": 0.020993392914533615, "rewards/accuracy_reward": 0.845031201839447, "rewards/format_reward": 1.0, "step": 2049 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 388.65625, "epoch": 0.028275472062454312, "grad_norm": 2.654019550042582, "kl": 0.08984375, "learning_rate": 9.980286039263774e-07, "loss": 0.0036, "reward": 2.011906147003174, "reward_std": 0.028995629400014877, "rewards/accuracy_reward": 0.8181562423706055, "rewards/format_reward": 1.0, "step": 2050 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.34375, "epoch": 0.02828926497565551, "grad_norm": 2.5163281369855546, "kl": 0.0712890625, "learning_rate": 9.980266814104454e-07, "loss": 0.0028, "reward": 2.113687515258789, "reward_std": 0.009443430230021477, "rewards/accuracy_reward": 0.9136874675750732, "rewards/format_reward": 1.0, "step": 2051 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.75, "epoch": 0.028303057888856707, "grad_norm": 2.9121186143889, "kl": 0.0810546875, "learning_rate": 9.980247579593996e-07, "loss": 0.0032, "reward": 2.0890936851501465, "reward_std": 0.015981599688529968, "rewards/accuracy_reward": 0.8890937566757202, "rewards/format_reward": 1.0, "step": 2052 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.03125, "epoch": 0.028316850802057904, "grad_norm": 2.1455831615776297, "kl": 0.080078125, "learning_rate": 9.98022833573244e-07, "loss": 0.0032, "reward": 2.1273436546325684, "reward_std": 0.013971472159028053, "rewards/accuracy_reward": 0.9273437261581421, "rewards/format_reward": 1.0, "step": 2053 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.90625, "epoch": 0.0283306437152591, "grad_norm": 2.1309348958758783, "kl": 0.076171875, "learning_rate": 9.980209082519822e-07, "loss": 0.003, "reward": 2.1307501792907715, "reward_std": 0.02510671317577362, "rewards/accuracy_reward": 0.9369999170303345, "rewards/format_reward": 1.0, "step": 2054 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.875, "epoch": 0.0283444366284603, "grad_norm": 2.789125978101275, "kl": 0.076171875, "learning_rate": 9.980189819956175e-07, "loss": 0.003, "reward": 2.135593891143799, "reward_std": 0.008521700277924538, "rewards/accuracy_reward": 0.9355937242507935, "rewards/format_reward": 1.0, "step": 2055 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.1875, "epoch": 0.028358229541661496, "grad_norm": 2.4557818433047824, "kl": 0.0771484375, "learning_rate": 9.980170548041536e-07, "loss": 0.0031, "reward": 2.1533126831054688, "reward_std": 0.029019678011536598, "rewards/accuracy_reward": 0.9595625400543213, "rewards/format_reward": 1.0, "step": 2056 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.375, "epoch": 0.028372022454862693, "grad_norm": 5.458354526153296, "kl": 0.078125, "learning_rate": 9.980151266775942e-07, "loss": 0.0031, "reward": 2.0703749656677246, "reward_std": 0.020792512223124504, "rewards/accuracy_reward": 0.8703750371932983, "rewards/format_reward": 1.0, "step": 2057 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.5625, "epoch": 0.02838581536806389, "grad_norm": 2.1994200102764365, "kl": 0.083984375, "learning_rate": 9.980131976159432e-07, "loss": 0.0034, "reward": 2.159250259399414, "reward_std": 0.009845316410064697, "rewards/accuracy_reward": 0.9592499732971191, "rewards/format_reward": 1.0, "step": 2058 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.03125, "epoch": 0.028399608281265087, "grad_norm": 6.677962248902495, "kl": 0.0791015625, "learning_rate": 9.980112676192037e-07, "loss": 0.0032, "reward": 1.9943437576293945, "reward_std": 0.018568377941846848, "rewards/accuracy_reward": 0.7943437695503235, "rewards/format_reward": 1.0, "step": 2059 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.75, "epoch": 0.028413401194466285, "grad_norm": 3.6875644046326532, "kl": 0.083984375, "learning_rate": 9.980093366873797e-07, "loss": 0.0034, "reward": 2.0176563262939453, "reward_std": 0.018136974424123764, "rewards/accuracy_reward": 0.8176563382148743, "rewards/format_reward": 1.0, "step": 2060 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.6875, "epoch": 0.028427194107667482, "grad_norm": 2.7034239656725623, "kl": 0.07861328125, "learning_rate": 9.980074048204744e-07, "loss": 0.0031, "reward": 2.0900626182556152, "reward_std": 0.009769640862941742, "rewards/accuracy_reward": 0.8900624513626099, "rewards/format_reward": 1.0, "step": 2061 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.6875, "epoch": 0.02844098702086868, "grad_norm": 3.2053751316199164, "kl": 0.076171875, "learning_rate": 9.98005472018492e-07, "loss": 0.0031, "reward": 2.142218589782715, "reward_std": 0.01818351447582245, "rewards/accuracy_reward": 0.9422187805175781, "rewards/format_reward": 1.0, "step": 2062 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.15625, "epoch": 0.028454779934069876, "grad_norm": 12.744656982507609, "kl": 0.08203125, "learning_rate": 9.980035382814357e-07, "loss": 0.0033, "reward": 2.081843852996826, "reward_std": 0.01705320179462433, "rewards/accuracy_reward": 0.8818437457084656, "rewards/format_reward": 1.0, "step": 2063 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.46875, "epoch": 0.028468572847271074, "grad_norm": 2.4460673091208354, "kl": 0.06982421875, "learning_rate": 9.980016036093093e-07, "loss": 0.0028, "reward": 2.0595312118530273, "reward_std": 0.012723948806524277, "rewards/accuracy_reward": 0.8595312237739563, "rewards/format_reward": 1.0, "step": 2064 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.5, "epoch": 0.02848236576047227, "grad_norm": 10.690303999779175, "kl": 0.08203125, "learning_rate": 9.979996680021166e-07, "loss": 0.0033, "reward": 2.1090002059936523, "reward_std": 0.02731136605143547, "rewards/accuracy_reward": 0.909000039100647, "rewards/format_reward": 1.0, "step": 2065 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.5, "epoch": 0.02849615867367347, "grad_norm": 2.263414737450947, "kl": 0.07470703125, "learning_rate": 9.97997731459861e-07, "loss": 0.003, "reward": 2.0491220951080322, "reward_std": 0.011150998994708061, "rewards/accuracy_reward": 0.8491218090057373, "rewards/format_reward": 1.0, "step": 2066 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.65625, "epoch": 0.028509951586874662, "grad_norm": 2.108701215577669, "kl": 0.072265625, "learning_rate": 9.97995793982546e-07, "loss": 0.0029, "reward": 2.09770131111145, "reward_std": 0.008197976276278496, "rewards/accuracy_reward": 0.8977012634277344, "rewards/format_reward": 1.0, "step": 2067 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.25, "epoch": 0.02852374450007586, "grad_norm": 3.4433795464372032, "kl": 0.0751953125, "learning_rate": 9.979938555701754e-07, "loss": 0.003, "reward": 2.0000624656677246, "reward_std": 0.013488207012414932, "rewards/accuracy_reward": 0.8000625371932983, "rewards/format_reward": 1.0, "step": 2068 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.375, "epoch": 0.028537537413277057, "grad_norm": 5.161791238612761, "kl": 0.08203125, "learning_rate": 9.97991916222753e-07, "loss": 0.0033, "reward": 2.088531255722046, "reward_std": 0.013438455760478973, "rewards/accuracy_reward": 0.8885312080383301, "rewards/format_reward": 1.0, "step": 2069 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.0625, "epoch": 0.028551330326478254, "grad_norm": 3.4610384000208443, "kl": 0.0751953125, "learning_rate": 9.979899759402823e-07, "loss": 0.003, "reward": 2.1161248683929443, "reward_std": 0.033605195581912994, "rewards/accuracy_reward": 0.9161250591278076, "rewards/format_reward": 1.0, "step": 2070 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.1875, "epoch": 0.02856512323967945, "grad_norm": 4.921646812201996, "kl": 0.080078125, "learning_rate": 9.979880347227669e-07, "loss": 0.0032, "reward": 2.1761562824249268, "reward_std": 0.011000405065715313, "rewards/accuracy_reward": 0.9761562943458557, "rewards/format_reward": 1.0, "step": 2071 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.9375, "epoch": 0.02857891615288065, "grad_norm": 7.141057473858071, "kl": 0.0771484375, "learning_rate": 9.979860925702105e-07, "loss": 0.0031, "reward": 2.113156318664551, "reward_std": 0.010698425583541393, "rewards/accuracy_reward": 0.913156270980835, "rewards/format_reward": 1.0, "step": 2072 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.15625, "epoch": 0.028592709066081846, "grad_norm": 2.7903819001852237, "kl": 0.087890625, "learning_rate": 9.979841494826167e-07, "loss": 0.0035, "reward": 2.1317501068115234, "reward_std": 0.0135076018050313, "rewards/accuracy_reward": 0.9317499995231628, "rewards/format_reward": 1.0, "step": 2073 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.40625, "epoch": 0.028606501979283043, "grad_norm": 2.0915499397023014, "kl": 0.080078125, "learning_rate": 9.979822054599894e-07, "loss": 0.0032, "reward": 2.0935935974121094, "reward_std": 0.024439077824354172, "rewards/accuracy_reward": 0.8998437523841858, "rewards/format_reward": 1.0, "step": 2074 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.84375, "epoch": 0.02862029489248424, "grad_norm": 4.254950816370984, "kl": 0.08203125, "learning_rate": 9.979802605023318e-07, "loss": 0.0033, "reward": 2.1291561126708984, "reward_std": 0.01947108283638954, "rewards/accuracy_reward": 0.9291563034057617, "rewards/format_reward": 1.0, "step": 2075 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.125, "epoch": 0.028634087805685438, "grad_norm": 3.9531415207398846, "kl": 0.08056640625, "learning_rate": 9.979783146096479e-07, "loss": 0.0032, "reward": 1.9294064044952393, "reward_std": 0.017338544130325317, "rewards/accuracy_reward": 0.7294062972068787, "rewards/format_reward": 1.0, "step": 2076 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.90625, "epoch": 0.028647880718886635, "grad_norm": 6.919585654920173, "kl": 0.07421875, "learning_rate": 9.97976367781941e-07, "loss": 0.003, "reward": 2.1209375858306885, "reward_std": 0.024540863931179047, "rewards/accuracy_reward": 0.9209375381469727, "rewards/format_reward": 1.0, "step": 2077 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.34375, "epoch": 0.028661673632087832, "grad_norm": 5.765836660587872, "kl": 0.07568359375, "learning_rate": 9.979744200192153e-07, "loss": 0.003, "reward": 2.125645875930786, "reward_std": 0.029877355322241783, "rewards/accuracy_reward": 0.9318958520889282, "rewards/format_reward": 1.0, "step": 2078 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.78125, "epoch": 0.02867546654528903, "grad_norm": 4.29473995907193, "kl": 0.07861328125, "learning_rate": 9.97972471321474e-07, "loss": 0.0031, "reward": 2.1382498741149902, "reward_std": 0.028349842876195908, "rewards/accuracy_reward": 0.9444999694824219, "rewards/format_reward": 1.0, "step": 2079 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.75, "epoch": 0.028689259458490227, "grad_norm": 13.77050613624554, "kl": 0.0791015625, "learning_rate": 9.97970521688721e-07, "loss": 0.0032, "reward": 2.0674374103546143, "reward_std": 0.04495127499103546, "rewards/accuracy_reward": 0.8736875057220459, "rewards/format_reward": 1.0, "step": 2080 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.21875, "epoch": 0.028703052371691424, "grad_norm": 1.9361973581944902, "kl": 0.083984375, "learning_rate": 9.979685711209597e-07, "loss": 0.0034, "reward": 2.1226563453674316, "reward_std": 0.024589134380221367, "rewards/accuracy_reward": 0.9289063215255737, "rewards/format_reward": 1.0, "step": 2081 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.96875, "epoch": 0.02871684528489262, "grad_norm": 2.599464014084937, "kl": 0.0693359375, "learning_rate": 9.97966619618194e-07, "loss": 0.0028, "reward": 2.1723437309265137, "reward_std": 0.02695685625076294, "rewards/accuracy_reward": 0.9785937070846558, "rewards/format_reward": 1.0, "step": 2082 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.0, "epoch": 0.02873063819809382, "grad_norm": 1.569643836200425, "kl": 0.07958984375, "learning_rate": 9.979646671804276e-07, "loss": 0.0032, "reward": 2.1110312938690186, "reward_std": 0.007149267941713333, "rewards/accuracy_reward": 0.9110312461853027, "rewards/format_reward": 1.0, "step": 2083 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.65625, "epoch": 0.028744431111295016, "grad_norm": 3.190030648318923, "kl": 0.08154296875, "learning_rate": 9.979627138076639e-07, "loss": 0.0033, "reward": 2.174093723297119, "reward_std": 0.007472606375813484, "rewards/accuracy_reward": 0.9740937948226929, "rewards/format_reward": 1.0, "step": 2084 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.875, "epoch": 0.028758224024496213, "grad_norm": 3.07316520867738, "kl": 0.07421875, "learning_rate": 9.979607594999069e-07, "loss": 0.003, "reward": 2.1448750495910645, "reward_std": 0.018620474264025688, "rewards/accuracy_reward": 0.9448750019073486, "rewards/format_reward": 1.0, "step": 2085 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.5625, "epoch": 0.02877201693769741, "grad_norm": 4.426303001381219, "kl": 0.07666015625, "learning_rate": 9.979588042571601e-07, "loss": 0.0031, "reward": 2.0162813663482666, "reward_std": 0.02568703331053257, "rewards/accuracy_reward": 0.8162811994552612, "rewards/format_reward": 1.0, "step": 2086 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.625, "epoch": 0.028785809850898608, "grad_norm": 2.108257890474877, "kl": 0.0810546875, "learning_rate": 9.97956848079427e-07, "loss": 0.0032, "reward": 2.1298437118530273, "reward_std": 0.008606940507888794, "rewards/accuracy_reward": 0.9298437833786011, "rewards/format_reward": 1.0, "step": 2087 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.6875, "epoch": 0.028799602764099805, "grad_norm": 3.807179077400949, "kl": 0.080078125, "learning_rate": 9.979548909667116e-07, "loss": 0.0032, "reward": 1.9619063138961792, "reward_std": 0.03733261674642563, "rewards/accuracy_reward": 0.7744062542915344, "rewards/format_reward": 1.0, "step": 2088 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.40625, "epoch": 0.028813395677301002, "grad_norm": 2.79635918152035, "kl": 0.08447265625, "learning_rate": 9.979529329190174e-07, "loss": 0.0034, "reward": 1.9932187795639038, "reward_std": 0.09017042815685272, "rewards/accuracy_reward": 0.824468731880188, "rewards/format_reward": 0.96875, "step": 2089 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.375, "epoch": 0.0288271885905022, "grad_norm": 3.1102944415937914, "kl": 0.0888671875, "learning_rate": 9.97950973936348e-07, "loss": 0.0036, "reward": 2.118000030517578, "reward_std": 0.03228549659252167, "rewards/accuracy_reward": 0.924250066280365, "rewards/format_reward": 1.0, "step": 2090 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.028840981503703397, "grad_norm": 4.213120767290845, "kl": 0.07470703125, "learning_rate": 9.979490140187073e-07, "loss": 0.003, "reward": 2.147218704223633, "reward_std": 0.02015913464128971, "rewards/accuracy_reward": 0.9472187161445618, "rewards/format_reward": 1.0, "step": 2091 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.90625, "epoch": 0.028854774416904594, "grad_norm": 2.640892061219349, "kl": 0.0771484375, "learning_rate": 9.979470531660986e-07, "loss": 0.0031, "reward": 2.114687442779541, "reward_std": 0.017453260719776154, "rewards/accuracy_reward": 0.9146875143051147, "rewards/format_reward": 1.0, "step": 2092 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.25, "epoch": 0.02886856733010579, "grad_norm": 4.1515280983760485, "kl": 0.0771484375, "learning_rate": 9.979450913785263e-07, "loss": 0.0031, "reward": 2.0978751182556152, "reward_std": 0.019949279725551605, "rewards/accuracy_reward": 0.8978749513626099, "rewards/format_reward": 1.0, "step": 2093 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 400.5625, "epoch": 0.02888236024330699, "grad_norm": 4.597083957292824, "kl": 0.07568359375, "learning_rate": 9.979431286559933e-07, "loss": 0.003, "reward": 2.062375068664551, "reward_std": 0.029642976820468903, "rewards/accuracy_reward": 0.8686250448226929, "rewards/format_reward": 1.0, "step": 2094 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.03125, "epoch": 0.028896153156508186, "grad_norm": 2.832273985170308, "kl": 0.09033203125, "learning_rate": 9.979411649985037e-07, "loss": 0.0036, "reward": 2.10518741607666, "reward_std": 0.013895487412810326, "rewards/accuracy_reward": 0.9051874279975891, "rewards/format_reward": 1.0, "step": 2095 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.6875, "epoch": 0.028909946069709383, "grad_norm": 3.199596924447709, "kl": 0.07958984375, "learning_rate": 9.97939200406061e-07, "loss": 0.0032, "reward": 2.0587189197540283, "reward_std": 0.014315973967313766, "rewards/accuracy_reward": 0.8587186932563782, "rewards/format_reward": 1.0, "step": 2096 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.375, "epoch": 0.02892373898291058, "grad_norm": 4.775581630970847, "kl": 0.08935546875, "learning_rate": 9.97937234878669e-07, "loss": 0.0036, "reward": 2.070718765258789, "reward_std": 0.02618570253252983, "rewards/accuracy_reward": 0.8769687414169312, "rewards/format_reward": 1.0, "step": 2097 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.6875, "epoch": 0.028937531896111778, "grad_norm": 2.747497169736035, "kl": 0.0830078125, "learning_rate": 9.979352684163316e-07, "loss": 0.0033, "reward": 2.1251564025878906, "reward_std": 0.010395649820566177, "rewards/accuracy_reward": 0.9251562356948853, "rewards/format_reward": 1.0, "step": 2098 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.125, "epoch": 0.028951324809312975, "grad_norm": 3.3409368244468824, "kl": 0.0771484375, "learning_rate": 9.979333010190519e-07, "loss": 0.0031, "reward": 2.117968797683716, "reward_std": 0.018976405262947083, "rewards/accuracy_reward": 0.9179686903953552, "rewards/format_reward": 1.0, "step": 2099 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.8125, "epoch": 0.028965117722514172, "grad_norm": 2.444924369555416, "kl": 0.0771484375, "learning_rate": 9.979313326868342e-07, "loss": 0.0031, "reward": 2.0849688053131104, "reward_std": 0.012484787032008171, "rewards/accuracy_reward": 0.8849687576293945, "rewards/format_reward": 1.0, "step": 2100 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.96875, "epoch": 0.02897891063571537, "grad_norm": 8.54544128521888, "kl": 0.087890625, "learning_rate": 9.979293634196819e-07, "loss": 0.0035, "reward": 2.1270313262939453, "reward_std": 0.017910931259393692, "rewards/accuracy_reward": 0.9270312786102295, "rewards/format_reward": 1.0, "step": 2101 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.03125, "epoch": 0.028992703548916567, "grad_norm": 2.783408219036573, "kl": 0.0751953125, "learning_rate": 9.979273932175987e-07, "loss": 0.003, "reward": 2.011593818664551, "reward_std": 0.017483575269579887, "rewards/accuracy_reward": 0.811593770980835, "rewards/format_reward": 1.0, "step": 2102 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.03125, "epoch": 0.029006496462117764, "grad_norm": 3.2457868839194166, "kl": 0.09521484375, "learning_rate": 9.979254220805883e-07, "loss": 0.0038, "reward": 2.0436248779296875, "reward_std": 0.019684679806232452, "rewards/accuracy_reward": 0.843625009059906, "rewards/format_reward": 1.0, "step": 2103 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.25, "epoch": 0.02902028937531896, "grad_norm": 3.340308167709199, "kl": 0.076171875, "learning_rate": 9.979234500086547e-07, "loss": 0.0031, "reward": 2.118781328201294, "reward_std": 0.023527873679995537, "rewards/accuracy_reward": 0.9187812805175781, "rewards/format_reward": 1.0, "step": 2104 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 0.02903408228852016, "grad_norm": 3.1616015013518313, "kl": 0.08251953125, "learning_rate": 9.97921477001801e-07, "loss": 0.0033, "reward": 2.0796875953674316, "reward_std": 0.01122305728495121, "rewards/accuracy_reward": 0.879687488079071, "rewards/format_reward": 1.0, "step": 2105 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.6875, "epoch": 0.029047875201721356, "grad_norm": 9.430061753508822, "kl": 0.08056640625, "learning_rate": 9.979195030600317e-07, "loss": 0.0032, "reward": 2.1826562881469727, "reward_std": 0.006362080108374357, "rewards/accuracy_reward": 0.9826562404632568, "rewards/format_reward": 1.0, "step": 2106 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.15625, "epoch": 0.029061668114922553, "grad_norm": 2.241999609078686, "kl": 0.09228515625, "learning_rate": 9.979175281833499e-07, "loss": 0.0037, "reward": 2.097156286239624, "reward_std": 0.014765438623726368, "rewards/accuracy_reward": 0.8971562385559082, "rewards/format_reward": 1.0, "step": 2107 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.03125, "epoch": 0.02907546102812375, "grad_norm": 4.23771388158449, "kl": 0.09033203125, "learning_rate": 9.979155523717593e-07, "loss": 0.0036, "reward": 2.1519999504089355, "reward_std": 0.03193982318043709, "rewards/accuracy_reward": 0.9582499861717224, "rewards/format_reward": 1.0, "step": 2108 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.90625, "epoch": 0.029089253941324947, "grad_norm": 2.6605842998706204, "kl": 0.0849609375, "learning_rate": 9.979135756252639e-07, "loss": 0.0034, "reward": 2.1091251373291016, "reward_std": 0.008270704187452793, "rewards/accuracy_reward": 0.9091250896453857, "rewards/format_reward": 1.0, "step": 2109 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.40625, "epoch": 0.029103046854526145, "grad_norm": 3.0487255267527202, "kl": 0.07275390625, "learning_rate": 9.979115979438673e-07, "loss": 0.0029, "reward": 2.1705470085144043, "reward_std": 0.016734443604946136, "rewards/accuracy_reward": 0.9705468416213989, "rewards/format_reward": 1.0, "step": 2110 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.84375, "epoch": 0.029116839767727342, "grad_norm": 3.710469228403074, "kl": 0.0849609375, "learning_rate": 9.979096193275731e-07, "loss": 0.0034, "reward": 2.0710625648498535, "reward_std": 0.009986454620957375, "rewards/accuracy_reward": 0.8710625171661377, "rewards/format_reward": 1.0, "step": 2111 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.96875, "epoch": 0.02913063268092854, "grad_norm": 3.9013126120312625, "kl": 0.080078125, "learning_rate": 9.979076397763852e-07, "loss": 0.0032, "reward": 2.0230000019073486, "reward_std": 0.03693739324808121, "rewards/accuracy_reward": 0.8292499780654907, "rewards/format_reward": 1.0, "step": 2112 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.34375, "epoch": 0.029144425594129737, "grad_norm": 2.3009510655209615, "kl": 0.08642578125, "learning_rate": 9.979056592903073e-07, "loss": 0.0035, "reward": 2.1267499923706055, "reward_std": 0.011796379461884499, "rewards/accuracy_reward": 0.9267499446868896, "rewards/format_reward": 1.0, "step": 2113 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.90625, "epoch": 0.029158218507330934, "grad_norm": 4.427301020722693, "kl": 0.080078125, "learning_rate": 9.97903677869343e-07, "loss": 0.0032, "reward": 2.052968740463257, "reward_std": 0.016182444989681244, "rewards/accuracy_reward": 0.8529687523841858, "rewards/format_reward": 1.0, "step": 2114 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.96875, "epoch": 0.02917201142053213, "grad_norm": 5.067117029900634, "kl": 0.09130859375, "learning_rate": 9.97901695513496e-07, "loss": 0.0037, "reward": 2.1168127059936523, "reward_std": 0.012572855688631535, "rewards/accuracy_reward": 0.916812539100647, "rewards/format_reward": 1.0, "step": 2115 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.09375, "epoch": 0.02918580433373333, "grad_norm": 2.4802782428883154, "kl": 0.0869140625, "learning_rate": 9.978997122227703e-07, "loss": 0.0035, "reward": 2.1394500732421875, "reward_std": 0.015019536018371582, "rewards/accuracy_reward": 0.9394500255584717, "rewards/format_reward": 1.0, "step": 2116 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.71875, "epoch": 0.029199597246934526, "grad_norm": 2.027608526349447, "kl": 0.08203125, "learning_rate": 9.978977279971692e-07, "loss": 0.0033, "reward": 2.159343719482422, "reward_std": 0.0072292909026145935, "rewards/accuracy_reward": 0.9593437314033508, "rewards/format_reward": 1.0, "step": 2117 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.4375, "epoch": 0.029213390160135723, "grad_norm": 4.56286517312335, "kl": 0.09033203125, "learning_rate": 9.97895742836697e-07, "loss": 0.0036, "reward": 2.1171875, "reward_std": 0.02136104926466942, "rewards/accuracy_reward": 0.9234374761581421, "rewards/format_reward": 1.0, "step": 2118 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.5, "epoch": 0.02922718307333692, "grad_norm": 3.189917825822055, "kl": 0.09326171875, "learning_rate": 9.978937567413567e-07, "loss": 0.0037, "reward": 2.085218667984009, "reward_std": 0.014136429876089096, "rewards/accuracy_reward": 0.8852187395095825, "rewards/format_reward": 1.0, "step": 2119 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.625, "epoch": 0.029240975986538117, "grad_norm": 2.415233532918492, "kl": 0.08349609375, "learning_rate": 9.978917697111527e-07, "loss": 0.0033, "reward": 2.05078125, "reward_std": 0.005621230695396662, "rewards/accuracy_reward": 0.8507812023162842, "rewards/format_reward": 1.0, "step": 2120 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.4375, "epoch": 0.029254768899739315, "grad_norm": 4.9699034633081665, "kl": 0.08837890625, "learning_rate": 9.978897817460883e-07, "loss": 0.0035, "reward": 1.9973125457763672, "reward_std": 0.022114025428891182, "rewards/accuracy_reward": 0.7973124980926514, "rewards/format_reward": 1.0, "step": 2121 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.625, "epoch": 0.029268561812940512, "grad_norm": 4.562417920530549, "kl": 0.07958984375, "learning_rate": 9.978877928461673e-07, "loss": 0.0032, "reward": 2.0763437747955322, "reward_std": 0.014324802905321121, "rewards/accuracy_reward": 0.8763437271118164, "rewards/format_reward": 1.0, "step": 2122 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.78125, "epoch": 0.02928235472614171, "grad_norm": 7.44604681003263, "kl": 0.0859375, "learning_rate": 9.978858030113935e-07, "loss": 0.0035, "reward": 2.1031250953674316, "reward_std": 0.017918944358825684, "rewards/accuracy_reward": 0.9031249284744263, "rewards/format_reward": 1.0, "step": 2123 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.8125, "epoch": 0.029296147639342907, "grad_norm": 3.179200665688222, "kl": 0.0888671875, "learning_rate": 9.978838122417707e-07, "loss": 0.0035, "reward": 2.1171250343322754, "reward_std": 0.008016312494874, "rewards/accuracy_reward": 0.9171249866485596, "rewards/format_reward": 1.0, "step": 2124 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.029309940552544104, "grad_norm": 3.784362134306023, "kl": 0.09033203125, "learning_rate": 9.978818205373028e-07, "loss": 0.0036, "reward": 2.0875000953674316, "reward_std": 0.015809625387191772, "rewards/accuracy_reward": 0.8875000476837158, "rewards/format_reward": 1.0, "step": 2125 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.28125, "epoch": 0.0293237334657453, "grad_norm": 2.538941098063826, "kl": 0.08642578125, "learning_rate": 9.97879827897993e-07, "loss": 0.0034, "reward": 2.1544687747955322, "reward_std": 0.01447913609445095, "rewards/accuracy_reward": 0.9544687867164612, "rewards/format_reward": 1.0, "step": 2126 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 405.1875, "epoch": 0.0293375263789465, "grad_norm": 5.166172666915555, "kl": 0.0830078125, "learning_rate": 9.978778343238455e-07, "loss": 0.0033, "reward": 2.1079373359680176, "reward_std": 0.01183334831148386, "rewards/accuracy_reward": 0.9079375267028809, "rewards/format_reward": 1.0, "step": 2127 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.09375, "epoch": 0.029351319292147696, "grad_norm": 2.3351985525684027, "kl": 0.09326171875, "learning_rate": 9.978758398148638e-07, "loss": 0.0037, "reward": 2.1147186756134033, "reward_std": 0.019157445058226585, "rewards/accuracy_reward": 0.914718747138977, "rewards/format_reward": 1.0, "step": 2128 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.125, "epoch": 0.029365112205348893, "grad_norm": 4.768021132238271, "kl": 0.0810546875, "learning_rate": 9.978738443710519e-07, "loss": 0.0033, "reward": 1.9914376735687256, "reward_std": 0.01594974473118782, "rewards/accuracy_reward": 0.7914375066757202, "rewards/format_reward": 1.0, "step": 2129 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 429.0, "epoch": 0.02937890511855009, "grad_norm": 3.117995350718515, "kl": 0.0888671875, "learning_rate": 9.978718479924135e-07, "loss": 0.0036, "reward": 2.059812545776367, "reward_std": 0.01983087696135044, "rewards/accuracy_reward": 0.8598124980926514, "rewards/format_reward": 1.0, "step": 2130 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.71875, "epoch": 0.029392698031751287, "grad_norm": 2.079176625623688, "kl": 0.0888671875, "learning_rate": 9.97869850678952e-07, "loss": 0.0036, "reward": 2.139343738555908, "reward_std": 0.03310300037264824, "rewards/accuracy_reward": 0.9518437385559082, "rewards/format_reward": 1.0, "step": 2131 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.03125, "epoch": 0.029406490944952485, "grad_norm": 2.882148557990783, "kl": 0.087890625, "learning_rate": 9.978678524306714e-07, "loss": 0.0035, "reward": 2.168375015258789, "reward_std": 0.028139302507042885, "rewards/accuracy_reward": 0.9683749675750732, "rewards/format_reward": 1.0, "step": 2132 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.84375, "epoch": 0.029420283858153682, "grad_norm": 2.9787532178845675, "kl": 0.087890625, "learning_rate": 9.978658532475756e-07, "loss": 0.0035, "reward": 2.125, "reward_std": 0.009319850243628025, "rewards/accuracy_reward": 0.9249999523162842, "rewards/format_reward": 1.0, "step": 2133 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.625, "epoch": 0.02943407677135488, "grad_norm": 2.905957153049684, "kl": 0.0869140625, "learning_rate": 9.978638531296681e-07, "loss": 0.0035, "reward": 2.0711874961853027, "reward_std": 0.011048303917050362, "rewards/accuracy_reward": 0.8711875081062317, "rewards/format_reward": 1.0, "step": 2134 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.3125, "epoch": 0.029447869684556077, "grad_norm": 2.1774469309483626, "kl": 0.08984375, "learning_rate": 9.978618520769528e-07, "loss": 0.0036, "reward": 2.070906162261963, "reward_std": 0.016848890110850334, "rewards/accuracy_reward": 0.8709062337875366, "rewards/format_reward": 1.0, "step": 2135 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.96875, "epoch": 0.029461662597757274, "grad_norm": 2.2377773396502945, "kl": 0.08056640625, "learning_rate": 9.978598500894334e-07, "loss": 0.0032, "reward": 2.088437557220459, "reward_std": 0.013646058738231659, "rewards/accuracy_reward": 0.8884374499320984, "rewards/format_reward": 1.0, "step": 2136 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.90625, "epoch": 0.02947545551095847, "grad_norm": 2.5584010881481705, "kl": 0.07958984375, "learning_rate": 9.978578471671138e-07, "loss": 0.0032, "reward": 2.0523438453674316, "reward_std": 0.011382254771888256, "rewards/accuracy_reward": 0.852343738079071, "rewards/format_reward": 1.0, "step": 2137 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.75, "epoch": 0.02948924842415967, "grad_norm": 2.875528615926683, "kl": 0.08544921875, "learning_rate": 9.978558433099974e-07, "loss": 0.0034, "reward": 2.112468719482422, "reward_std": 0.01609628088772297, "rewards/accuracy_reward": 0.9124687910079956, "rewards/format_reward": 1.0, "step": 2138 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.125, "epoch": 0.029503041337360866, "grad_norm": 2.877970166120491, "kl": 0.0849609375, "learning_rate": 9.978538385180884e-07, "loss": 0.0034, "reward": 2.0692501068115234, "reward_std": 0.012559600174427032, "rewards/accuracy_reward": 0.8692499399185181, "rewards/format_reward": 1.0, "step": 2139 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.59375, "epoch": 0.029516834250562063, "grad_norm": 3.1316881587225014, "kl": 0.08935546875, "learning_rate": 9.9785183279139e-07, "loss": 0.0036, "reward": 2.07450008392334, "reward_std": 0.031652599573135376, "rewards/accuracy_reward": 0.8807499408721924, "rewards/format_reward": 1.0, "step": 2140 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.59375, "epoch": 0.02953062716376326, "grad_norm": 2.8788656058468023, "kl": 0.08203125, "learning_rate": 9.978498261299066e-07, "loss": 0.0033, "reward": 2.1280312538146973, "reward_std": 0.02024359628558159, "rewards/accuracy_reward": 0.9280312061309814, "rewards/format_reward": 1.0, "step": 2141 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.59375, "epoch": 0.029544420076964454, "grad_norm": 5.9812467053196805, "kl": 0.07666015625, "learning_rate": 9.978478185336418e-07, "loss": 0.0031, "reward": 2.067593812942505, "reward_std": 0.029555248096585274, "rewards/accuracy_reward": 0.8738437294960022, "rewards/format_reward": 1.0, "step": 2142 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.1875, "epoch": 0.02955821299016565, "grad_norm": 2.1376543878891288, "kl": 0.087890625, "learning_rate": 9.97845810002599e-07, "loss": 0.0035, "reward": 2.0815000534057617, "reward_std": 0.02134629152715206, "rewards/accuracy_reward": 0.8814999461174011, "rewards/format_reward": 1.0, "step": 2143 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.02957200590336685, "grad_norm": 2.6101996283004336, "kl": 0.0888671875, "learning_rate": 9.978438005367824e-07, "loss": 0.0036, "reward": 2.0667812824249268, "reward_std": 0.012508269399404526, "rewards/accuracy_reward": 0.8667812347412109, "rewards/format_reward": 1.0, "step": 2144 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.46875, "epoch": 0.029585798816568046, "grad_norm": 2.3873864045505893, "kl": 0.0859375, "learning_rate": 9.978417901361957e-07, "loss": 0.0034, "reward": 2.0777812004089355, "reward_std": 0.0191669762134552, "rewards/accuracy_reward": 0.8777812123298645, "rewards/format_reward": 1.0, "step": 2145 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.029599591729769243, "grad_norm": 2.8541186010689317, "kl": 0.08251953125, "learning_rate": 9.978397788008424e-07, "loss": 0.0033, "reward": 2.085031032562256, "reward_std": 0.019557081162929535, "rewards/accuracy_reward": 0.8850312232971191, "rewards/format_reward": 1.0, "step": 2146 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.09375, "epoch": 0.02961338464297044, "grad_norm": 2.604685832409183, "kl": 0.072265625, "learning_rate": 9.978377665307267e-07, "loss": 0.0029, "reward": 2.024156093597412, "reward_std": 0.010080362670123577, "rewards/accuracy_reward": 0.8241562843322754, "rewards/format_reward": 1.0, "step": 2147 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.4375, "epoch": 0.029627177556171638, "grad_norm": 2.54893984392409, "kl": 0.07958984375, "learning_rate": 9.978357533258519e-07, "loss": 0.0032, "reward": 2.1401875019073486, "reward_std": 0.014890208840370178, "rewards/accuracy_reward": 0.9401875734329224, "rewards/format_reward": 1.0, "step": 2148 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 415.875, "epoch": 0.029640970469372835, "grad_norm": 3.840579320662775, "kl": 0.0849609375, "learning_rate": 9.978337391862222e-07, "loss": 0.0034, "reward": 2.01631236076355, "reward_std": 0.01764514110982418, "rewards/accuracy_reward": 0.8163124322891235, "rewards/format_reward": 1.0, "step": 2149 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.90625, "epoch": 0.029654763382574032, "grad_norm": 2.975806862608416, "kl": 0.0830078125, "learning_rate": 9.97831724111841e-07, "loss": 0.0033, "reward": 2.0737500190734863, "reward_std": 0.015361923724412918, "rewards/accuracy_reward": 0.8737500309944153, "rewards/format_reward": 1.0, "step": 2150 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.71875, "epoch": 0.02966855629577523, "grad_norm": 2.92006622831087, "kl": 0.08251953125, "learning_rate": 9.978297081027123e-07, "loss": 0.0033, "reward": 2.120093584060669, "reward_std": 0.018047591671347618, "rewards/accuracy_reward": 0.9200937747955322, "rewards/format_reward": 1.0, "step": 2151 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.1875, "epoch": 0.029682349208976427, "grad_norm": 5.266211221576084, "kl": 0.072265625, "learning_rate": 9.978276911588398e-07, "loss": 0.0029, "reward": 2.0843749046325684, "reward_std": 0.022663431242108345, "rewards/accuracy_reward": 0.8843749761581421, "rewards/format_reward": 1.0, "step": 2152 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.53125, "epoch": 0.029696142122177624, "grad_norm": 2.89893569138178, "kl": 0.0830078125, "learning_rate": 9.978256732802278e-07, "loss": 0.0033, "reward": 2.107375144958496, "reward_std": 0.024955984205007553, "rewards/accuracy_reward": 0.9073749780654907, "rewards/format_reward": 1.0, "step": 2153 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.875, "epoch": 0.02970993503537882, "grad_norm": 3.238170161374507, "kl": 0.08056640625, "learning_rate": 9.978236544668793e-07, "loss": 0.0032, "reward": 2.016906261444092, "reward_std": 0.028309836983680725, "rewards/accuracy_reward": 0.8231562376022339, "rewards/format_reward": 1.0, "step": 2154 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.09375, "epoch": 0.02972372794858002, "grad_norm": 2.935671648303275, "kl": 0.083984375, "learning_rate": 9.978216347187984e-07, "loss": 0.0034, "reward": 2.0879688262939453, "reward_std": 0.02018401399254799, "rewards/accuracy_reward": 0.8879687190055847, "rewards/format_reward": 1.0, "step": 2155 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.029737520861781216, "grad_norm": 8.163255279087053, "kl": 0.08984375, "learning_rate": 9.97819614035989e-07, "loss": 0.0036, "reward": 2.0929064750671387, "reward_std": 0.017653988674283028, "rewards/accuracy_reward": 0.8929062485694885, "rewards/format_reward": 1.0, "step": 2156 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.84375, "epoch": 0.029751313774982413, "grad_norm": 2.48398554660599, "kl": 0.0810546875, "learning_rate": 9.978175924184549e-07, "loss": 0.0032, "reward": 2.1097497940063477, "reward_std": 0.03194742649793625, "rewards/accuracy_reward": 0.9159999489784241, "rewards/format_reward": 1.0, "step": 2157 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.5, "epoch": 0.02976510668818361, "grad_norm": 6.499278903251688, "kl": 0.080078125, "learning_rate": 9.978155698661997e-07, "loss": 0.0032, "reward": 2.1356563568115234, "reward_std": 0.03120109811425209, "rewards/accuracy_reward": 0.9419062733650208, "rewards/format_reward": 1.0, "step": 2158 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.029778899601384808, "grad_norm": 6.924626612390532, "kl": 0.07958984375, "learning_rate": 9.978135463792272e-07, "loss": 0.0032, "reward": 2.1131250858306885, "reward_std": 0.028056159615516663, "rewards/accuracy_reward": 0.9131249189376831, "rewards/format_reward": 1.0, "step": 2159 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.3125, "epoch": 0.029792692514586005, "grad_norm": 2.768071757995682, "kl": 0.0810546875, "learning_rate": 9.978115219575415e-07, "loss": 0.0032, "reward": 2.124875068664551, "reward_std": 0.015134197659790516, "rewards/accuracy_reward": 0.924875020980835, "rewards/format_reward": 1.0, "step": 2160 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.15625, "epoch": 0.029806485427787202, "grad_norm": 2.2597427352509896, "kl": 0.095703125, "learning_rate": 9.978094966011462e-07, "loss": 0.0038, "reward": 2.1288750171661377, "reward_std": 0.014218784868717194, "rewards/accuracy_reward": 0.9288750290870667, "rewards/format_reward": 1.0, "step": 2161 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.65625, "epoch": 0.0298202783409884, "grad_norm": 2.995593591894307, "kl": 0.080078125, "learning_rate": 9.978074703100452e-07, "loss": 0.0032, "reward": 2.1445937156677246, "reward_std": 0.0158880315721035, "rewards/accuracy_reward": 0.9445937275886536, "rewards/format_reward": 1.0, "step": 2162 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.21875, "epoch": 0.029834071254189597, "grad_norm": 3.1360165301793668, "kl": 0.08544921875, "learning_rate": 9.97805443084242e-07, "loss": 0.0034, "reward": 2.1525938510894775, "reward_std": 0.008373921737074852, "rewards/accuracy_reward": 0.9525937438011169, "rewards/format_reward": 1.0, "step": 2163 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.029847864167390794, "grad_norm": 2.6505661273803542, "kl": 0.078125, "learning_rate": 9.978034149237408e-07, "loss": 0.0031, "reward": 2.113468647003174, "reward_std": 0.023791436105966568, "rewards/accuracy_reward": 0.9134687781333923, "rewards/format_reward": 1.0, "step": 2164 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.0, "epoch": 0.02986165708059199, "grad_norm": 2.8453187362795505, "kl": 0.08642578125, "learning_rate": 9.978013858285452e-07, "loss": 0.0035, "reward": 2.1449062824249268, "reward_std": 0.011698558926582336, "rewards/accuracy_reward": 0.9449062347412109, "rewards/format_reward": 1.0, "step": 2165 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.65625, "epoch": 0.02987544999379319, "grad_norm": 2.364622446033586, "kl": 0.07080078125, "learning_rate": 9.97799355798659e-07, "loss": 0.0028, "reward": 2.143531322479248, "reward_std": 0.013602443970739841, "rewards/accuracy_reward": 0.9435312747955322, "rewards/format_reward": 1.0, "step": 2166 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.21875, "epoch": 0.029889242906994386, "grad_norm": 2.0285015682756806, "kl": 0.07568359375, "learning_rate": 9.97797324834086e-07, "loss": 0.003, "reward": 2.140531063079834, "reward_std": 0.01572326198220253, "rewards/accuracy_reward": 0.9405312538146973, "rewards/format_reward": 1.0, "step": 2167 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.75, "epoch": 0.029903035820195583, "grad_norm": 2.509314338215299, "kl": 0.07568359375, "learning_rate": 9.977952929348303e-07, "loss": 0.003, "reward": 2.0768749713897705, "reward_std": 0.01655711978673935, "rewards/accuracy_reward": 0.8768749833106995, "rewards/format_reward": 1.0, "step": 2168 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.71875, "epoch": 0.02991682873339678, "grad_norm": 36.05552545242765, "kl": 0.07763671875, "learning_rate": 9.977932601008953e-07, "loss": 0.0031, "reward": 2.018249988555908, "reward_std": 0.028770923614501953, "rewards/accuracy_reward": 0.8182500004768372, "rewards/format_reward": 1.0, "step": 2169 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.65625, "epoch": 0.029930621646597978, "grad_norm": 1.9503543557451053, "kl": 0.078125, "learning_rate": 9.977912263322852e-07, "loss": 0.0031, "reward": 2.177093744277954, "reward_std": 0.025064757093787193, "rewards/accuracy_reward": 0.9833437204360962, "rewards/format_reward": 1.0, "step": 2170 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.96875, "epoch": 0.029944414559799175, "grad_norm": 2.1052868046048054, "kl": 0.080078125, "learning_rate": 9.977891916290034e-07, "loss": 0.0032, "reward": 2.100062847137451, "reward_std": 0.018956970423460007, "rewards/accuracy_reward": 0.9000625014305115, "rewards/format_reward": 1.0, "step": 2171 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.96875, "epoch": 0.029958207473000372, "grad_norm": 3.766676976135831, "kl": 0.080078125, "learning_rate": 9.97787155991054e-07, "loss": 0.0032, "reward": 2.112468719482422, "reward_std": 0.013913454487919807, "rewards/accuracy_reward": 0.9124687910079956, "rewards/format_reward": 1.0, "step": 2172 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.1875, "epoch": 0.02997200038620157, "grad_norm": 2.2995707450873875, "kl": 0.07275390625, "learning_rate": 9.977851194184408e-07, "loss": 0.0029, "reward": 2.087625026702881, "reward_std": 0.016663692891597748, "rewards/accuracy_reward": 0.8876249194145203, "rewards/format_reward": 1.0, "step": 2173 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.59375, "epoch": 0.029985793299402767, "grad_norm": 1.6701813818269653, "kl": 0.06591796875, "learning_rate": 9.977830819111677e-07, "loss": 0.0026, "reward": 2.17396879196167, "reward_std": 0.0061285244300961494, "rewards/accuracy_reward": 0.9739687442779541, "rewards/format_reward": 1.0, "step": 2174 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.96875, "epoch": 0.029999586212603964, "grad_norm": 4.620354032250096, "kl": 0.078125, "learning_rate": 9.977810434692382e-07, "loss": 0.0031, "reward": 2.079531192779541, "reward_std": 0.025339238345623016, "rewards/accuracy_reward": 0.8795312643051147, "rewards/format_reward": 1.0, "step": 2175 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 376.0, "epoch": 0.03001337912580516, "grad_norm": 3.038742812191827, "kl": 0.078125, "learning_rate": 9.977790040926564e-07, "loss": 0.0031, "reward": 1.9677188396453857, "reward_std": 0.01996118575334549, "rewards/accuracy_reward": 0.7677187323570251, "rewards/format_reward": 1.0, "step": 2176 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.59375, "epoch": 0.03002717203900636, "grad_norm": 5.010668778086776, "kl": 0.0791015625, "learning_rate": 9.977769637814262e-07, "loss": 0.0032, "reward": 1.9748749732971191, "reward_std": 0.01638328656554222, "rewards/accuracy_reward": 0.7748749256134033, "rewards/format_reward": 1.0, "step": 2177 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.25, "epoch": 0.030040964952207556, "grad_norm": 2.4600798300088615, "kl": 0.08056640625, "learning_rate": 9.977749225355513e-07, "loss": 0.0032, "reward": 2.120687484741211, "reward_std": 0.03754265978932381, "rewards/accuracy_reward": 0.9331875443458557, "rewards/format_reward": 1.0, "step": 2178 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.46875, "epoch": 0.030054757865408753, "grad_norm": 2.3294518004206477, "kl": 0.0673828125, "learning_rate": 9.977728803550354e-07, "loss": 0.0027, "reward": 2.146749973297119, "reward_std": 0.012294841930270195, "rewards/accuracy_reward": 0.9467499256134033, "rewards/format_reward": 1.0, "step": 2179 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.84375, "epoch": 0.03006855077860995, "grad_norm": 5.291889903710729, "kl": 0.076171875, "learning_rate": 9.977708372398825e-07, "loss": 0.003, "reward": 2.122593641281128, "reward_std": 0.043387528508901596, "rewards/accuracy_reward": 0.9225937724113464, "rewards/format_reward": 1.0, "step": 2180 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 386.09375, "epoch": 0.030082343691811148, "grad_norm": 2.3011057917786815, "kl": 0.0771484375, "learning_rate": 9.977687931900964e-07, "loss": 0.0031, "reward": 1.9800312519073486, "reward_std": 0.027691654860973358, "rewards/accuracy_reward": 0.7800313234329224, "rewards/format_reward": 1.0, "step": 2181 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.0625, "epoch": 0.030096136605012345, "grad_norm": 2.168541525310985, "kl": 0.083984375, "learning_rate": 9.97766748205681e-07, "loss": 0.0034, "reward": 2.115968704223633, "reward_std": 0.009785845875740051, "rewards/accuracy_reward": 0.9159688353538513, "rewards/format_reward": 1.0, "step": 2182 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 379.375, "epoch": 0.030109929518213542, "grad_norm": 2.4607620918111768, "kl": 0.0673828125, "learning_rate": 9.9776470228664e-07, "loss": 0.0027, "reward": 2.1579999923706055, "reward_std": 0.017274891957640648, "rewards/accuracy_reward": 0.9580000638961792, "rewards/format_reward": 1.0, "step": 2183 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.4375, "epoch": 0.03012372243141474, "grad_norm": 2.9249667299728586, "kl": 0.08349609375, "learning_rate": 9.977626554329774e-07, "loss": 0.0033, "reward": 2.0839061737060547, "reward_std": 0.032355956733226776, "rewards/accuracy_reward": 0.8901562690734863, "rewards/format_reward": 1.0, "step": 2184 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.0625, "epoch": 0.030137515344615937, "grad_norm": 3.122571621572205, "kl": 0.07275390625, "learning_rate": 9.97760607644697e-07, "loss": 0.0029, "reward": 2.141843795776367, "reward_std": 0.013540457934141159, "rewards/accuracy_reward": 0.9418437480926514, "rewards/format_reward": 1.0, "step": 2185 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.96875, "epoch": 0.030151308257817134, "grad_norm": 3.076471409176197, "kl": 0.08203125, "learning_rate": 9.977585589218027e-07, "loss": 0.0033, "reward": 2.1568126678466797, "reward_std": 0.011049201712012291, "rewards/accuracy_reward": 0.9568124413490295, "rewards/format_reward": 1.0, "step": 2186 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 375.0, "epoch": 0.03016510117101833, "grad_norm": 2.97496284292612, "kl": 0.087890625, "learning_rate": 9.97756509264298e-07, "loss": 0.0035, "reward": 2.0447187423706055, "reward_std": 0.04658292233943939, "rewards/accuracy_reward": 0.8572187423706055, "rewards/format_reward": 1.0, "step": 2187 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 370.84375, "epoch": 0.03017889408421953, "grad_norm": 2.719218977129783, "kl": 0.07861328125, "learning_rate": 9.97754458672187e-07, "loss": 0.0031, "reward": 1.9946563243865967, "reward_std": 0.01253945380449295, "rewards/accuracy_reward": 0.7946561574935913, "rewards/format_reward": 1.0, "step": 2188 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.0625, "epoch": 0.030192686997420726, "grad_norm": 2.3528926497907947, "kl": 0.07470703125, "learning_rate": 9.97752407145474e-07, "loss": 0.003, "reward": 2.14736270904541, "reward_std": 0.01700129732489586, "rewards/accuracy_reward": 0.9473625421524048, "rewards/format_reward": 1.0, "step": 2189 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.6875, "epoch": 0.030206479910621923, "grad_norm": 2.2119627206977417, "kl": 0.08056640625, "learning_rate": 9.97750354684162e-07, "loss": 0.0032, "reward": 2.154531240463257, "reward_std": 0.014431731775403023, "rewards/accuracy_reward": 0.9545312523841858, "rewards/format_reward": 1.0, "step": 2190 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.28125, "epoch": 0.03022027282382312, "grad_norm": 2.2972766701343414, "kl": 0.078125, "learning_rate": 9.977483012882555e-07, "loss": 0.0031, "reward": 2.142343759536743, "reward_std": 0.013355524279177189, "rewards/accuracy_reward": 0.9423437714576721, "rewards/format_reward": 1.0, "step": 2191 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.030234065737024318, "grad_norm": 3.0812410672646973, "kl": 0.07763671875, "learning_rate": 9.97746246957758e-07, "loss": 0.0031, "reward": 2.0666563510894775, "reward_std": 0.025859542191028595, "rewards/accuracy_reward": 0.8666562438011169, "rewards/format_reward": 1.0, "step": 2192 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.6875, "epoch": 0.030247858650225515, "grad_norm": 3.2406973025286776, "kl": 0.06884765625, "learning_rate": 9.977441916926734e-07, "loss": 0.0028, "reward": 2.0384063720703125, "reward_std": 0.034345947206020355, "rewards/accuracy_reward": 0.8384062051773071, "rewards/format_reward": 1.0, "step": 2193 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.875, "epoch": 0.030261651563426712, "grad_norm": 2.4193024626897324, "kl": 0.08056640625, "learning_rate": 9.977421354930056e-07, "loss": 0.0032, "reward": 2.0469374656677246, "reward_std": 0.01872834376990795, "rewards/accuracy_reward": 0.8469375371932983, "rewards/format_reward": 1.0, "step": 2194 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.28125, "epoch": 0.03027544447662791, "grad_norm": 2.515117819770639, "kl": 0.0751953125, "learning_rate": 9.977400783587586e-07, "loss": 0.003, "reward": 2.010406255722046, "reward_std": 0.020029190927743912, "rewards/accuracy_reward": 0.8104062676429749, "rewards/format_reward": 1.0, "step": 2195 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 0.030289237389829107, "grad_norm": 2.9456626807740247, "kl": 0.083984375, "learning_rate": 9.977380202899363e-07, "loss": 0.0034, "reward": 2.017125129699707, "reward_std": 0.01958318054676056, "rewards/accuracy_reward": 0.8171249628067017, "rewards/format_reward": 1.0, "step": 2196 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.4375, "epoch": 0.030303030303030304, "grad_norm": 3.5454949638624145, "kl": 0.07275390625, "learning_rate": 9.977359612865422e-07, "loss": 0.0029, "reward": 2.106968879699707, "reward_std": 0.03425057232379913, "rewards/accuracy_reward": 0.9132187366485596, "rewards/format_reward": 1.0, "step": 2197 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.65625, "epoch": 0.0303168232162315, "grad_norm": 3.4955659730397866, "kl": 0.08056640625, "learning_rate": 9.977339013485806e-07, "loss": 0.0032, "reward": 2.0889062881469727, "reward_std": 0.02780120261013508, "rewards/accuracy_reward": 0.8889062404632568, "rewards/format_reward": 1.0, "step": 2198 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 402.75, "epoch": 0.0303306161294327, "grad_norm": 1.976163071320007, "kl": 0.06689453125, "learning_rate": 9.97731840476055e-07, "loss": 0.0027, "reward": 2.068906307220459, "reward_std": 0.026243673637509346, "rewards/accuracy_reward": 0.8751562237739563, "rewards/format_reward": 1.0, "step": 2199 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.21875, "epoch": 0.030344409042633896, "grad_norm": 2.642038761372087, "kl": 0.0703125, "learning_rate": 9.977297786689695e-07, "loss": 0.0028, "reward": 2.145718574523926, "reward_std": 0.017455019056797028, "rewards/accuracy_reward": 0.9457187652587891, "rewards/format_reward": 1.0, "step": 2200 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 406.875, "epoch": 0.030358201955835093, "grad_norm": 2.138230163872008, "kl": 0.07861328125, "learning_rate": 9.97727715927328e-07, "loss": 0.0031, "reward": 2.1140313148498535, "reward_std": 0.02577412873506546, "rewards/accuracy_reward": 0.9140312075614929, "rewards/format_reward": 1.0, "step": 2201 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.71875, "epoch": 0.03037199486903629, "grad_norm": 2.8655290879303426, "kl": 0.07177734375, "learning_rate": 9.977256522511341e-07, "loss": 0.0029, "reward": 2.101468563079834, "reward_std": 0.016969265416264534, "rewards/accuracy_reward": 0.9014687538146973, "rewards/format_reward": 1.0, "step": 2202 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.75, "epoch": 0.030385787782237487, "grad_norm": 2.926329849080525, "kl": 0.07275390625, "learning_rate": 9.977235876403917e-07, "loss": 0.0029, "reward": 2.1384999752044678, "reward_std": 0.015922226011753082, "rewards/accuracy_reward": 0.9385000467300415, "rewards/format_reward": 1.0, "step": 2203 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.5625, "epoch": 0.030399580695438685, "grad_norm": 4.554514910715986, "kl": 0.078125, "learning_rate": 9.977215220951052e-07, "loss": 0.0031, "reward": 2.1174373626708984, "reward_std": 0.015788527205586433, "rewards/accuracy_reward": 0.9174374341964722, "rewards/format_reward": 1.0, "step": 2204 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.28125, "epoch": 0.030413373608639882, "grad_norm": 2.5669971551511543, "kl": 0.07275390625, "learning_rate": 9.97719455615278e-07, "loss": 0.0029, "reward": 1.9704687595367432, "reward_std": 0.02118171565234661, "rewards/accuracy_reward": 0.7704687714576721, "rewards/format_reward": 1.0, "step": 2205 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.46875, "epoch": 0.03042716652184108, "grad_norm": 1.8949751480227206, "kl": 0.072265625, "learning_rate": 9.977173882009139e-07, "loss": 0.0029, "reward": 2.1453280448913574, "reward_std": 0.008334027603268623, "rewards/accuracy_reward": 0.9453281164169312, "rewards/format_reward": 1.0, "step": 2206 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.3125, "epoch": 0.030440959435042277, "grad_norm": 7.686394691849239, "kl": 0.07568359375, "learning_rate": 9.977153198520169e-07, "loss": 0.003, "reward": 2.11928129196167, "reward_std": 0.02656150609254837, "rewards/accuracy_reward": 0.9192812442779541, "rewards/format_reward": 1.0, "step": 2207 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.9375, "epoch": 0.030454752348243474, "grad_norm": 2.459156397192004, "kl": 0.08154296875, "learning_rate": 9.977132505685912e-07, "loss": 0.0033, "reward": 2.0004687309265137, "reward_std": 0.033843837678432465, "rewards/accuracy_reward": 0.8067187666893005, "rewards/format_reward": 1.0, "step": 2208 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.71875, "epoch": 0.03046854526144467, "grad_norm": 8.18672049184234, "kl": 0.07470703125, "learning_rate": 9.977111803506404e-07, "loss": 0.003, "reward": 2.125718832015991, "reward_std": 0.01143190823495388, "rewards/accuracy_reward": 0.9257187843322754, "rewards/format_reward": 1.0, "step": 2209 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.3125, "epoch": 0.03048233817464587, "grad_norm": 3.1399075653630626, "kl": 0.072265625, "learning_rate": 9.977091091981683e-07, "loss": 0.0029, "reward": 2.0902187824249268, "reward_std": 0.0364406518638134, "rewards/accuracy_reward": 0.8964687585830688, "rewards/format_reward": 1.0, "step": 2210 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.6875, "epoch": 0.030496131087847066, "grad_norm": 2.179556823229944, "kl": 0.08203125, "learning_rate": 9.97707037111179e-07, "loss": 0.0033, "reward": 2.0192811489105225, "reward_std": 0.011065879836678505, "rewards/accuracy_reward": 0.819281280040741, "rewards/format_reward": 1.0, "step": 2211 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.030509924001048263, "grad_norm": 2.733655606703832, "kl": 0.0771484375, "learning_rate": 9.977049640896764e-07, "loss": 0.0031, "reward": 2.0240938663482666, "reward_std": 0.024245424196124077, "rewards/accuracy_reward": 0.8240938186645508, "rewards/format_reward": 1.0, "step": 2212 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.0, "epoch": 0.03052371691424946, "grad_norm": 2.6686014150167217, "kl": 0.076171875, "learning_rate": 9.977028901336642e-07, "loss": 0.0031, "reward": 2.1514062881469727, "reward_std": 0.016458529978990555, "rewards/accuracy_reward": 0.9514062404632568, "rewards/format_reward": 1.0, "step": 2213 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.1875, "epoch": 0.030537509827450657, "grad_norm": 2.513533602492548, "kl": 0.07666015625, "learning_rate": 9.977008152431464e-07, "loss": 0.0031, "reward": 2.1211252212524414, "reward_std": 0.020682761445641518, "rewards/accuracy_reward": 0.9211249947547913, "rewards/format_reward": 1.0, "step": 2214 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.5, "epoch": 0.030551302740651855, "grad_norm": 2.4577089806894867, "kl": 0.0830078125, "learning_rate": 9.976987394181268e-07, "loss": 0.0033, "reward": 2.0757501125335693, "reward_std": 0.02439242973923683, "rewards/accuracy_reward": 0.8757500052452087, "rewards/format_reward": 1.0, "step": 2215 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.96875, "epoch": 0.030565095653853052, "grad_norm": 2.0341885480550297, "kl": 0.07958984375, "learning_rate": 9.976966626586094e-07, "loss": 0.0032, "reward": 2.132687568664551, "reward_std": 0.010410184971988201, "rewards/accuracy_reward": 0.932687520980835, "rewards/format_reward": 1.0, "step": 2216 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.125, "epoch": 0.030578888567054246, "grad_norm": 2.528840489794019, "kl": 0.08251953125, "learning_rate": 9.976945849645982e-07, "loss": 0.0033, "reward": 2.0746562480926514, "reward_std": 0.021503344178199768, "rewards/accuracy_reward": 0.8746562600135803, "rewards/format_reward": 1.0, "step": 2217 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.46875, "epoch": 0.030592681480255443, "grad_norm": 2.1720790927046707, "kl": 0.08349609375, "learning_rate": 9.976925063360969e-07, "loss": 0.0033, "reward": 1.958031177520752, "reward_std": 0.01428062841296196, "rewards/accuracy_reward": 0.7580313086509705, "rewards/format_reward": 1.0, "step": 2218 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.875, "epoch": 0.03060647439345664, "grad_norm": 2.3651510960369455, "kl": 0.08740234375, "learning_rate": 9.976904267731094e-07, "loss": 0.0035, "reward": 2.058499813079834, "reward_std": 0.018771033734083176, "rewards/accuracy_reward": 0.8585000038146973, "rewards/format_reward": 1.0, "step": 2219 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.625, "epoch": 0.030620267306657838, "grad_norm": 1.7941257717792825, "kl": 0.08203125, "learning_rate": 9.976883462756396e-07, "loss": 0.0033, "reward": 2.1697187423706055, "reward_std": 0.013782726600766182, "rewards/accuracy_reward": 0.9697187542915344, "rewards/format_reward": 1.0, "step": 2220 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.030634060219859035, "grad_norm": 2.368408251382291, "kl": 0.078125, "learning_rate": 9.976862648436918e-07, "loss": 0.0031, "reward": 2.051968812942505, "reward_std": 0.007895168848335743, "rewards/accuracy_reward": 0.8519687056541443, "rewards/format_reward": 1.0, "step": 2221 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.46875, "epoch": 0.030647853133060232, "grad_norm": 2.4682423594997713, "kl": 0.0791015625, "learning_rate": 9.976841824772694e-07, "loss": 0.0032, "reward": 2.1348438262939453, "reward_std": 0.00763288140296936, "rewards/accuracy_reward": 0.9348437786102295, "rewards/format_reward": 1.0, "step": 2222 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 404.0625, "epoch": 0.03066164604626143, "grad_norm": 3.1077677241072155, "kl": 0.068359375, "learning_rate": 9.976820991763766e-07, "loss": 0.0027, "reward": 2.095749855041504, "reward_std": 0.026015982031822205, "rewards/accuracy_reward": 0.8957499861717224, "rewards/format_reward": 1.0, "step": 2223 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.21875, "epoch": 0.030675438959462627, "grad_norm": 2.4017392972771825, "kl": 0.06884765625, "learning_rate": 9.97680014941017e-07, "loss": 0.0028, "reward": 2.0806875228881836, "reward_std": 0.011542889289557934, "rewards/accuracy_reward": 0.8806875348091125, "rewards/format_reward": 1.0, "step": 2224 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.375, "epoch": 0.030689231872663824, "grad_norm": 2.5235671917056672, "kl": 0.07568359375, "learning_rate": 9.97677929771195e-07, "loss": 0.003, "reward": 2.0131564140319824, "reward_std": 0.03558385372161865, "rewards/accuracy_reward": 0.819406270980835, "rewards/format_reward": 1.0, "step": 2225 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.125, "epoch": 0.03070302478586502, "grad_norm": 2.93808277386678, "kl": 0.0771484375, "learning_rate": 9.976758436669143e-07, "loss": 0.0031, "reward": 2.1344687938690186, "reward_std": 0.024807695299386978, "rewards/accuracy_reward": 0.9344688057899475, "rewards/format_reward": 1.0, "step": 2226 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 379.4375, "epoch": 0.03071681769906622, "grad_norm": 2.971829394095093, "kl": 0.07568359375, "learning_rate": 9.976737566281786e-07, "loss": 0.003, "reward": 2.14634370803833, "reward_std": 0.019381701946258545, "rewards/accuracy_reward": 0.9463437795639038, "rewards/format_reward": 1.0, "step": 2227 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 372.09375, "epoch": 0.030730610612267416, "grad_norm": 1.827691353414307, "kl": 0.08349609375, "learning_rate": 9.97671668654992e-07, "loss": 0.0033, "reward": 2.1275625228881836, "reward_std": 0.029285648837685585, "rewards/accuracy_reward": 0.9338124990463257, "rewards/format_reward": 1.0, "step": 2228 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.3125, "epoch": 0.030744403525468613, "grad_norm": 5.5095614586256465, "kl": 0.07666015625, "learning_rate": 9.976695797473586e-07, "loss": 0.0031, "reward": 2.1415624618530273, "reward_std": 0.02668778970837593, "rewards/accuracy_reward": 0.9415625333786011, "rewards/format_reward": 1.0, "step": 2229 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.1875, "epoch": 0.03075819643866981, "grad_norm": 3.7832720213875586, "kl": 0.0693359375, "learning_rate": 9.97667489905282e-07, "loss": 0.0028, "reward": 2.133625030517578, "reward_std": 0.025699052959680557, "rewards/accuracy_reward": 0.9336249828338623, "rewards/format_reward": 1.0, "step": 2230 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.21875, "epoch": 0.030771989351871008, "grad_norm": 8.392595456984209, "kl": 0.0830078125, "learning_rate": 9.976653991287663e-07, "loss": 0.0033, "reward": 2.135312557220459, "reward_std": 0.015526149421930313, "rewards/accuracy_reward": 0.9353125095367432, "rewards/format_reward": 1.0, "step": 2231 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.34375, "epoch": 0.030785782265072205, "grad_norm": 2.6724588859096636, "kl": 0.08544921875, "learning_rate": 9.97663307417815e-07, "loss": 0.0034, "reward": 2.0517501831054688, "reward_std": 0.024781785905361176, "rewards/accuracy_reward": 0.8579999804496765, "rewards/format_reward": 1.0, "step": 2232 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.1875, "epoch": 0.030799575178273402, "grad_norm": 11.327668111461566, "kl": 0.08251953125, "learning_rate": 9.97661214772433e-07, "loss": 0.0033, "reward": 2.1013126373291016, "reward_std": 0.01635340228676796, "rewards/accuracy_reward": 0.901312530040741, "rewards/format_reward": 1.0, "step": 2233 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 365.90625, "epoch": 0.0308133680914746, "grad_norm": 2.1384325780673823, "kl": 0.0830078125, "learning_rate": 9.976591211926234e-07, "loss": 0.0033, "reward": 2.0364062786102295, "reward_std": 0.009607749991118908, "rewards/accuracy_reward": 0.8364062309265137, "rewards/format_reward": 1.0, "step": 2234 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.46875, "epoch": 0.030827161004675797, "grad_norm": 2.204461990673804, "kl": 0.07275390625, "learning_rate": 9.976570266783903e-07, "loss": 0.0029, "reward": 2.124406337738037, "reward_std": 0.022497011348605156, "rewards/accuracy_reward": 0.9244062304496765, "rewards/format_reward": 1.0, "step": 2235 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 367.84375, "epoch": 0.030840953917876994, "grad_norm": 2.7644044618056487, "kl": 0.07763671875, "learning_rate": 9.976549312297377e-07, "loss": 0.0031, "reward": 2.158937454223633, "reward_std": 0.014448843896389008, "rewards/accuracy_reward": 0.9589375257492065, "rewards/format_reward": 1.0, "step": 2236 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.28125, "epoch": 0.03085474683107819, "grad_norm": 3.145633406695606, "kl": 0.0849609375, "learning_rate": 9.976528348466696e-07, "loss": 0.0034, "reward": 2.080718755722046, "reward_std": 0.013893071562051773, "rewards/accuracy_reward": 0.8807187080383301, "rewards/format_reward": 1.0, "step": 2237 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 361.96875, "epoch": 0.03086853974427939, "grad_norm": 2.4205322345880638, "kl": 0.07958984375, "learning_rate": 9.976507375291899e-07, "loss": 0.0032, "reward": 2.1183125972747803, "reward_std": 0.04246281832456589, "rewards/accuracy_reward": 0.9308124780654907, "rewards/format_reward": 1.0, "step": 2238 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 360.90625, "epoch": 0.030882332657480586, "grad_norm": 2.844281725485797, "kl": 0.08154296875, "learning_rate": 9.976486392773024e-07, "loss": 0.0032, "reward": 2.0728437900543213, "reward_std": 0.016438495367765427, "rewards/accuracy_reward": 0.8728438019752502, "rewards/format_reward": 1.0, "step": 2239 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.5625, "epoch": 0.030896125570681783, "grad_norm": 4.621018941936662, "kl": 0.06982421875, "learning_rate": 9.976465400910112e-07, "loss": 0.0028, "reward": 2.077125072479248, "reward_std": 0.01715395227074623, "rewards/accuracy_reward": 0.877125084400177, "rewards/format_reward": 1.0, "step": 2240 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.59375, "epoch": 0.03090991848388298, "grad_norm": 2.0577509061327426, "kl": 0.08056640625, "learning_rate": 9.9764443997032e-07, "loss": 0.0032, "reward": 2.1116561889648438, "reward_std": 0.02548736147582531, "rewards/accuracy_reward": 0.9241563081741333, "rewards/format_reward": 1.0, "step": 2241 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 357.84375, "epoch": 0.030923711397084178, "grad_norm": 22.744508980270847, "kl": 0.0810546875, "learning_rate": 9.976423389152333e-07, "loss": 0.0033, "reward": 2.129218816757202, "reward_std": 0.04335203766822815, "rewards/accuracy_reward": 0.9479687213897705, "rewards/format_reward": 1.0, "step": 2242 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.0, "epoch": 0.030937504310285375, "grad_norm": 2.963337044240746, "kl": 0.0712890625, "learning_rate": 9.976402369257545e-07, "loss": 0.0028, "reward": 2.128093719482422, "reward_std": 0.03770013526082039, "rewards/accuracy_reward": 0.9405937194824219, "rewards/format_reward": 1.0, "step": 2243 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 359.25, "epoch": 0.030951297223486572, "grad_norm": 2.6960204318314487, "kl": 0.08251953125, "learning_rate": 9.976381340018879e-07, "loss": 0.0033, "reward": 2.1100311279296875, "reward_std": 0.04551228880882263, "rewards/accuracy_reward": 0.928781270980835, "rewards/format_reward": 1.0, "step": 2244 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.625, "epoch": 0.03096509013668777, "grad_norm": 8.652480811834844, "kl": 0.08056640625, "learning_rate": 9.976360301436371e-07, "loss": 0.0032, "reward": 2.0601563453674316, "reward_std": 0.03214862942695618, "rewards/accuracy_reward": 0.860156238079071, "rewards/format_reward": 1.0, "step": 2245 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.9375, "epoch": 0.030978883049888967, "grad_norm": 5.692512028898535, "kl": 0.0791015625, "learning_rate": 9.976339253510061e-07, "loss": 0.0032, "reward": 2.0611562728881836, "reward_std": 0.018124910071492195, "rewards/accuracy_reward": 0.8611562252044678, "rewards/format_reward": 1.0, "step": 2246 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.8125, "epoch": 0.030992675963090164, "grad_norm": 2.8356484923369267, "kl": 0.0712890625, "learning_rate": 9.976318196239993e-07, "loss": 0.0029, "reward": 2.118000030517578, "reward_std": 0.021000022068619728, "rewards/accuracy_reward": 0.9180000424385071, "rewards/format_reward": 1.0, "step": 2247 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.5, "epoch": 0.03100646887629136, "grad_norm": 1.9789276552945052, "kl": 0.07666015625, "learning_rate": 9.976297129626199e-07, "loss": 0.0031, "reward": 2.0753750801086426, "reward_std": 0.02469100058078766, "rewards/accuracy_reward": 0.8816250562667847, "rewards/format_reward": 1.0, "step": 2248 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.5, "epoch": 0.03102026178949256, "grad_norm": 2.752123285315237, "kl": 0.07763671875, "learning_rate": 9.976276053668727e-07, "loss": 0.0031, "reward": 2.0533127784729004, "reward_std": 0.02811865136027336, "rewards/accuracy_reward": 0.8595624566078186, "rewards/format_reward": 1.0, "step": 2249 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.0625, "epoch": 0.031034054702693756, "grad_norm": 1.968964903946242, "kl": 0.08154296875, "learning_rate": 9.97625496836761e-07, "loss": 0.0033, "reward": 2.125124931335449, "reward_std": 0.02524317428469658, "rewards/accuracy_reward": 0.9313750267028809, "rewards/format_reward": 1.0, "step": 2250 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 395.9375, "epoch": 0.031047847615894953, "grad_norm": 1.8646218285651153, "kl": 0.076171875, "learning_rate": 9.976233873722892e-07, "loss": 0.003, "reward": 2.0802500247955322, "reward_std": 0.00986083596944809, "rewards/accuracy_reward": 0.8802499771118164, "rewards/format_reward": 1.0, "step": 2251 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 0.03106164052909615, "grad_norm": 9.948187608748917, "kl": 0.07080078125, "learning_rate": 9.97621276973461e-07, "loss": 0.0028, "reward": 2.0847811698913574, "reward_std": 0.027061868458986282, "rewards/accuracy_reward": 0.8910312056541443, "rewards/format_reward": 1.0, "step": 2252 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.1875, "epoch": 0.031075433442297348, "grad_norm": 3.724281014273633, "kl": 0.072265625, "learning_rate": 9.976191656402802e-07, "loss": 0.0029, "reward": 2.12918758392334, "reward_std": 0.017832955345511436, "rewards/accuracy_reward": 0.9291874766349792, "rewards/format_reward": 1.0, "step": 2253 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.28125, "epoch": 0.031089226355498545, "grad_norm": 6.9227067913396585, "kl": 0.0771484375, "learning_rate": 9.976170533727514e-07, "loss": 0.0031, "reward": 2.1320314407348633, "reward_std": 0.01620614528656006, "rewards/accuracy_reward": 0.9320312142372131, "rewards/format_reward": 1.0, "step": 2254 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.34375, "epoch": 0.031103019268699742, "grad_norm": 2.7375972952163607, "kl": 0.072265625, "learning_rate": 9.976149401708779e-07, "loss": 0.0029, "reward": 1.988968849182129, "reward_std": 0.103922039270401, "rewards/accuracy_reward": 0.8202187418937683, "rewards/format_reward": 0.96875, "step": 2255 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.71875, "epoch": 0.03111681218190094, "grad_norm": 2.5645238861618904, "kl": 0.07421875, "learning_rate": 9.976128260346637e-07, "loss": 0.003, "reward": 2.0241875648498535, "reward_std": 0.020421762019395828, "rewards/accuracy_reward": 0.8241874575614929, "rewards/format_reward": 1.0, "step": 2256 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.8125, "epoch": 0.031130605095102137, "grad_norm": 1.7731716261583952, "kl": 0.080078125, "learning_rate": 9.976107109641133e-07, "loss": 0.0032, "reward": 2.1723437309265137, "reward_std": 0.015186430886387825, "rewards/accuracy_reward": 0.9723437428474426, "rewards/format_reward": 1.0, "step": 2257 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.8125, "epoch": 0.031144398008303334, "grad_norm": 7.339239918515213, "kl": 0.083984375, "learning_rate": 9.976085949592303e-07, "loss": 0.0034, "reward": 2.1562812328338623, "reward_std": 0.01189088262617588, "rewards/accuracy_reward": 0.956281304359436, "rewards/format_reward": 1.0, "step": 2258 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.46875, "epoch": 0.03115819092150453, "grad_norm": 4.072541252752485, "kl": 0.07177734375, "learning_rate": 9.976064780200188e-07, "loss": 0.0029, "reward": 2.0021250247955322, "reward_std": 0.024898849427700043, "rewards/accuracy_reward": 0.8021249771118164, "rewards/format_reward": 1.0, "step": 2259 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.1875, "epoch": 0.03117198383470573, "grad_norm": 2.2569326885036785, "kl": 0.0771484375, "learning_rate": 9.976043601464826e-07, "loss": 0.0031, "reward": 2.13462495803833, "reward_std": 0.020481616258621216, "rewards/accuracy_reward": 0.9346249103546143, "rewards/format_reward": 1.0, "step": 2260 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.15625, "epoch": 0.031185776747906926, "grad_norm": 3.0491014954147397, "kl": 0.0810546875, "learning_rate": 9.97602241338626e-07, "loss": 0.0032, "reward": 2.0787811279296875, "reward_std": 0.01796579174697399, "rewards/accuracy_reward": 0.878781259059906, "rewards/format_reward": 1.0, "step": 2261 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.25, "epoch": 0.031199569661108123, "grad_norm": 3.7707670295178732, "kl": 0.080078125, "learning_rate": 9.976001215964525e-07, "loss": 0.0032, "reward": 2.0914063453674316, "reward_std": 0.014145651832222939, "rewards/accuracy_reward": 0.891406238079071, "rewards/format_reward": 1.0, "step": 2262 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.84375, "epoch": 0.03121336257430932, "grad_norm": 2.9659502647996336, "kl": 0.078125, "learning_rate": 9.975980009199664e-07, "loss": 0.0031, "reward": 2.1424999237060547, "reward_std": 0.02005607634782791, "rewards/accuracy_reward": 0.9424999356269836, "rewards/format_reward": 1.0, "step": 2263 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.3125, "epoch": 0.031227155487510518, "grad_norm": 2.9546681724599595, "kl": 0.0908203125, "learning_rate": 9.975958793091719e-07, "loss": 0.0036, "reward": 2.0823750495910645, "reward_std": 0.014879917725920677, "rewards/accuracy_reward": 0.8823750019073486, "rewards/format_reward": 1.0, "step": 2264 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.25, "epoch": 0.031240948400711715, "grad_norm": 12.737518251876528, "kl": 0.076171875, "learning_rate": 9.975937567640724e-07, "loss": 0.003, "reward": 2.1417813301086426, "reward_std": 0.019783347845077515, "rewards/accuracy_reward": 0.941781222820282, "rewards/format_reward": 1.0, "step": 2265 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.6875, "epoch": 0.03125474131391291, "grad_norm": 5.537585660967187, "kl": 0.08251953125, "learning_rate": 9.975916332846723e-07, "loss": 0.0033, "reward": 2.128593683242798, "reward_std": 0.020362388342618942, "rewards/accuracy_reward": 0.9285937547683716, "rewards/format_reward": 1.0, "step": 2266 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.15625, "epoch": 0.031268534227114106, "grad_norm": 1.8428683071500345, "kl": 0.08203125, "learning_rate": 9.975895088709756e-07, "loss": 0.0033, "reward": 2.1715002059936523, "reward_std": 0.011392639949917793, "rewards/accuracy_reward": 0.9714999794960022, "rewards/format_reward": 1.0, "step": 2267 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.03128232714031531, "grad_norm": 4.211057476053346, "kl": 0.0791015625, "learning_rate": 9.97587383522986e-07, "loss": 0.0032, "reward": 2.09556245803833, "reward_std": 0.02641196735203266, "rewards/accuracy_reward": 0.9018125534057617, "rewards/format_reward": 1.0, "step": 2268 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.09375, "epoch": 0.0312961200535165, "grad_norm": 2.799815983078908, "kl": 0.0849609375, "learning_rate": 9.975852572407077e-07, "loss": 0.0034, "reward": 2.1171562671661377, "reward_std": 0.01794649288058281, "rewards/accuracy_reward": 0.9171561598777771, "rewards/format_reward": 1.0, "step": 2269 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 414.59375, "epoch": 0.0313099129667177, "grad_norm": 2.665871421606478, "kl": 0.0869140625, "learning_rate": 9.975831300241447e-07, "loss": 0.0035, "reward": 2.097909450531006, "reward_std": 0.01113520935177803, "rewards/accuracy_reward": 0.89790940284729, "rewards/format_reward": 1.0, "step": 2270 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.40625, "epoch": 0.031323705879918895, "grad_norm": 2.5675211108799796, "kl": 0.0810546875, "learning_rate": 9.975810018733011e-07, "loss": 0.0032, "reward": 2.088624954223633, "reward_std": 0.02920497953891754, "rewards/accuracy_reward": 0.8948750495910645, "rewards/format_reward": 1.0, "step": 2271 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.65625, "epoch": 0.031337498793120096, "grad_norm": 3.8790258981998296, "kl": 0.0849609375, "learning_rate": 9.975788727881805e-07, "loss": 0.0034, "reward": 2.1497812271118164, "reward_std": 0.024576518684625626, "rewards/accuracy_reward": 0.9497812390327454, "rewards/format_reward": 1.0, "step": 2272 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.25, "epoch": 0.03135129170632129, "grad_norm": 5.337841975385978, "kl": 0.08349609375, "learning_rate": 9.975767427687873e-07, "loss": 0.0033, "reward": 2.0717501640319824, "reward_std": 0.007796011865139008, "rewards/accuracy_reward": 0.8717499375343323, "rewards/format_reward": 1.0, "step": 2273 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.125, "epoch": 0.03136508461952249, "grad_norm": 4.468961239105371, "kl": 0.07568359375, "learning_rate": 9.975746118151253e-07, "loss": 0.003, "reward": 2.0844998359680176, "reward_std": 0.007305807434022427, "rewards/accuracy_reward": 0.8844999670982361, "rewards/format_reward": 1.0, "step": 2274 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.46875, "epoch": 0.031378877532723684, "grad_norm": 3.9144826966411412, "kl": 0.08154296875, "learning_rate": 9.975724799271985e-07, "loss": 0.0033, "reward": 2.139531135559082, "reward_std": 0.012412281706929207, "rewards/accuracy_reward": 0.9395312666893005, "rewards/format_reward": 1.0, "step": 2275 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.9375, "epoch": 0.031392670445924885, "grad_norm": 2.247217390979363, "kl": 0.0849609375, "learning_rate": 9.97570347105011e-07, "loss": 0.0034, "reward": 2.1615939140319824, "reward_std": 0.020095108076930046, "rewards/accuracy_reward": 0.961593747138977, "rewards/format_reward": 1.0, "step": 2276 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.15625, "epoch": 0.03140646335912608, "grad_norm": 6.598113178381127, "kl": 0.08984375, "learning_rate": 9.975682133485667e-07, "loss": 0.0036, "reward": 2.0371875762939453, "reward_std": 0.0338313914835453, "rewards/accuracy_reward": 0.8371874690055847, "rewards/format_reward": 1.0, "step": 2277 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.3125, "epoch": 0.03142025627232728, "grad_norm": 2.0167431145717507, "kl": 0.083984375, "learning_rate": 9.975660786578697e-07, "loss": 0.0034, "reward": 2.1931562423706055, "reward_std": 0.016381051391363144, "rewards/accuracy_reward": 0.9931561946868896, "rewards/format_reward": 1.0, "step": 2278 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.5625, "epoch": 0.03143404918552847, "grad_norm": 2.705724921182165, "kl": 0.0830078125, "learning_rate": 9.975639430329241e-07, "loss": 0.0033, "reward": 2.1126561164855957, "reward_std": 0.011554856784641743, "rewards/accuracy_reward": 0.912656307220459, "rewards/format_reward": 1.0, "step": 2279 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.28125, "epoch": 0.031447842098729674, "grad_norm": 2.268176490866149, "kl": 0.08740234375, "learning_rate": 9.975618064737336e-07, "loss": 0.0035, "reward": 2.15596866607666, "reward_std": 0.02884788066148758, "rewards/accuracy_reward": 0.9559687376022339, "rewards/format_reward": 1.0, "step": 2280 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.875, "epoch": 0.03146163501193087, "grad_norm": 4.933171101868903, "kl": 0.07177734375, "learning_rate": 9.975596689803024e-07, "loss": 0.0029, "reward": 2.1358752250671387, "reward_std": 0.026951689273118973, "rewards/accuracy_reward": 0.9421250224113464, "rewards/format_reward": 1.0, "step": 2281 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.6875, "epoch": 0.03147542792513207, "grad_norm": 3.983475689175812, "kl": 0.08251953125, "learning_rate": 9.975575305526347e-07, "loss": 0.0033, "reward": 2.136812448501587, "reward_std": 0.020964784547686577, "rewards/accuracy_reward": 0.9368124604225159, "rewards/format_reward": 1.0, "step": 2282 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.96875, "epoch": 0.03148922083833326, "grad_norm": 2.3215301439483107, "kl": 0.0966796875, "learning_rate": 9.975553911907339e-07, "loss": 0.0039, "reward": 2.1399688720703125, "reward_std": 0.02450164407491684, "rewards/accuracy_reward": 0.9462187886238098, "rewards/format_reward": 1.0, "step": 2283 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.78125, "epoch": 0.03150301375153446, "grad_norm": 2.3094136694587806, "kl": 0.083984375, "learning_rate": 9.975532508946049e-07, "loss": 0.0034, "reward": 2.033656358718872, "reward_std": 0.0077733565121889114, "rewards/accuracy_reward": 0.8336561918258667, "rewards/format_reward": 1.0, "step": 2284 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.34375, "epoch": 0.03151680666473566, "grad_norm": 3.0665324707304458, "kl": 0.087890625, "learning_rate": 9.97551109664251e-07, "loss": 0.0035, "reward": 2.1108436584472656, "reward_std": 0.028609005734324455, "rewards/accuracy_reward": 0.9170937538146973, "rewards/format_reward": 1.0, "step": 2285 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.03153059957793686, "grad_norm": 2.2930623873016764, "kl": 0.08154296875, "learning_rate": 9.975489674996766e-07, "loss": 0.0033, "reward": 2.0263748168945312, "reward_std": 0.03337153419852257, "rewards/accuracy_reward": 0.8263750076293945, "rewards/format_reward": 1.0, "step": 2286 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.0, "epoch": 0.03154439249113805, "grad_norm": 2.9391900907343995, "kl": 0.07763671875, "learning_rate": 9.975468244008854e-07, "loss": 0.0031, "reward": 2.142878293991089, "reward_std": 0.011515817604959011, "rewards/accuracy_reward": 0.9428781270980835, "rewards/format_reward": 1.0, "step": 2287 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.9375, "epoch": 0.03155818540433925, "grad_norm": 3.6925777305986007, "kl": 0.08984375, "learning_rate": 9.975446803678817e-07, "loss": 0.0036, "reward": 2.0121874809265137, "reward_std": 0.019458185881376266, "rewards/accuracy_reward": 0.8121875524520874, "rewards/format_reward": 1.0, "step": 2288 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.25, "epoch": 0.031571978317540446, "grad_norm": 2.4090613635973765, "kl": 0.08447265625, "learning_rate": 9.975425354006697e-07, "loss": 0.0034, "reward": 2.1460626125335693, "reward_std": 0.008709149435162544, "rewards/accuracy_reward": 0.9460625648498535, "rewards/format_reward": 1.0, "step": 2289 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.46875, "epoch": 0.03158577123074165, "grad_norm": 2.1557699965981576, "kl": 0.1025390625, "learning_rate": 9.97540389499253e-07, "loss": 0.0041, "reward": 2.096531391143799, "reward_std": 0.04128604754805565, "rewards/accuracy_reward": 0.9090312719345093, "rewards/format_reward": 1.0, "step": 2290 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.875, "epoch": 0.03159956414394284, "grad_norm": 2.2462730791901215, "kl": 0.09130859375, "learning_rate": 9.975382426636357e-07, "loss": 0.0036, "reward": 2.17396879196167, "reward_std": 0.03342505171895027, "rewards/accuracy_reward": 0.9802187085151672, "rewards/format_reward": 1.0, "step": 2291 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.15625, "epoch": 0.03161335705714404, "grad_norm": 3.416497516962503, "kl": 0.08984375, "learning_rate": 9.97536094893822e-07, "loss": 0.0036, "reward": 2.104281187057495, "reward_std": 0.013790383003652096, "rewards/accuracy_reward": 0.9042812585830688, "rewards/format_reward": 1.0, "step": 2292 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.5, "epoch": 0.031627149970345235, "grad_norm": 1.1200499903122947, "kl": 0.08203125, "learning_rate": 9.97533946189816e-07, "loss": 0.0033, "reward": 2.054093837738037, "reward_std": 0.0027220644988119602, "rewards/accuracy_reward": 0.8540937900543213, "rewards/format_reward": 1.0, "step": 2293 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.96875, "epoch": 0.031640942883546436, "grad_norm": 3.1924813112400607, "kl": 0.08984375, "learning_rate": 9.975317965516214e-07, "loss": 0.0036, "reward": 2.1097187995910645, "reward_std": 0.02264920435845852, "rewards/accuracy_reward": 0.9097187519073486, "rewards/format_reward": 1.0, "step": 2294 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.3125, "epoch": 0.03165473579674763, "grad_norm": 10.921711478924543, "kl": 0.0830078125, "learning_rate": 9.975296459792426e-07, "loss": 0.0033, "reward": 2.0443124771118164, "reward_std": 0.038417719304561615, "rewards/accuracy_reward": 0.8505625128746033, "rewards/format_reward": 1.0, "step": 2295 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.03125, "epoch": 0.03166852870994883, "grad_norm": 3.3469008903065585, "kl": 0.09326171875, "learning_rate": 9.975274944726833e-07, "loss": 0.0037, "reward": 2.1206250190734863, "reward_std": 0.03464517369866371, "rewards/accuracy_reward": 0.9268749952316284, "rewards/format_reward": 1.0, "step": 2296 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.40625, "epoch": 0.031682321623150024, "grad_norm": 2.8791720241301464, "kl": 0.0869140625, "learning_rate": 9.975253420319478e-07, "loss": 0.0035, "reward": 2.121906280517578, "reward_std": 0.030730130150914192, "rewards/accuracy_reward": 0.9281561970710754, "rewards/format_reward": 1.0, "step": 2297 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.75, "epoch": 0.031696114536351225, "grad_norm": 2.625697759684863, "kl": 0.083984375, "learning_rate": 9.9752318865704e-07, "loss": 0.0033, "reward": 2.1727187633514404, "reward_std": 0.008489701896905899, "rewards/accuracy_reward": 0.9727187156677246, "rewards/format_reward": 1.0, "step": 2298 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.1875, "epoch": 0.03170990744955242, "grad_norm": 1.9962552325377538, "kl": 0.0703125, "learning_rate": 9.975210343479641e-07, "loss": 0.0028, "reward": 2.1116561889648438, "reward_std": 0.012254114262759686, "rewards/accuracy_reward": 0.9116562008857727, "rewards/format_reward": 1.0, "step": 2299 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.71875, "epoch": 0.03172370036275362, "grad_norm": 3.357955183820964, "kl": 0.07568359375, "learning_rate": 9.975188791047241e-07, "loss": 0.003, "reward": 2.112656354904175, "reward_std": 0.0253610797226429, "rewards/accuracy_reward": 0.9126561880111694, "rewards/format_reward": 1.0, "step": 2300 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.1875, "epoch": 0.03173749327595481, "grad_norm": 3.1984752760470614, "kl": 0.08984375, "learning_rate": 9.97516722927324e-07, "loss": 0.0036, "reward": 2.0823750495910645, "reward_std": 0.023001380264759064, "rewards/accuracy_reward": 0.8823750019073486, "rewards/format_reward": 1.0, "step": 2301 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.15625, "epoch": 0.031751286189156014, "grad_norm": 9.980481463749479, "kl": 0.07958984375, "learning_rate": 9.975145658157677e-07, "loss": 0.0032, "reward": 2.1624999046325684, "reward_std": 0.00814671441912651, "rewards/accuracy_reward": 0.9624999761581421, "rewards/format_reward": 1.0, "step": 2302 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.875, "epoch": 0.03176507910235721, "grad_norm": 2.4775649729432954, "kl": 0.08837890625, "learning_rate": 9.975124077700594e-07, "loss": 0.0035, "reward": 2.1408438682556152, "reward_std": 0.026987893506884575, "rewards/accuracy_reward": 0.9470937848091125, "rewards/format_reward": 1.0, "step": 2303 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.84375, "epoch": 0.03177887201555841, "grad_norm": 2.6106013584134726, "kl": 0.078125, "learning_rate": 9.975102487902033e-07, "loss": 0.0031, "reward": 2.103562355041504, "reward_std": 0.010480975732207298, "rewards/accuracy_reward": 0.9035625457763672, "rewards/format_reward": 1.0, "step": 2304 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.28125, "epoch": 0.0317926649287596, "grad_norm": 2.739984500849049, "kl": 0.068359375, "learning_rate": 9.975080888762032e-07, "loss": 0.0028, "reward": 2.146937608718872, "reward_std": 0.009717043489217758, "rewards/accuracy_reward": 0.9469375014305115, "rewards/format_reward": 1.0, "step": 2305 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.90625, "epoch": 0.0318064578419608, "grad_norm": 2.6073931429806425, "kl": 0.0751953125, "learning_rate": 9.975059280280631e-07, "loss": 0.003, "reward": 2.1352500915527344, "reward_std": 0.024324417114257812, "rewards/accuracy_reward": 0.9415000677108765, "rewards/format_reward": 1.0, "step": 2306 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 394.90625, "epoch": 0.031820250755162, "grad_norm": 2.8339642457462753, "kl": 0.08740234375, "learning_rate": 9.975037662457874e-07, "loss": 0.0035, "reward": 2.108250141143799, "reward_std": 0.016800247132778168, "rewards/accuracy_reward": 0.9082499742507935, "rewards/format_reward": 1.0, "step": 2307 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.0625, "epoch": 0.0318340436683632, "grad_norm": 2.544390784965393, "kl": 0.08251953125, "learning_rate": 9.975016035293799e-07, "loss": 0.0033, "reward": 2.1215624809265137, "reward_std": 0.029493363574147224, "rewards/accuracy_reward": 0.9215625524520874, "rewards/format_reward": 1.0, "step": 2308 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.96875, "epoch": 0.03184783658156439, "grad_norm": 2.2567491718429893, "kl": 0.078125, "learning_rate": 9.974994398788447e-07, "loss": 0.0031, "reward": 2.0270938873291016, "reward_std": 0.009944726713001728, "rewards/accuracy_reward": 0.827093780040741, "rewards/format_reward": 1.0, "step": 2309 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.03125, "epoch": 0.03186162949476559, "grad_norm": 2.154549945332516, "kl": 0.078125, "learning_rate": 9.97497275294186e-07, "loss": 0.0031, "reward": 2.0022501945495605, "reward_std": 0.017113225534558296, "rewards/accuracy_reward": 0.8022500872612, "rewards/format_reward": 1.0, "step": 2310 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.84375, "epoch": 0.031875422407966786, "grad_norm": 2.058418541848099, "kl": 0.080078125, "learning_rate": 9.974951097754075e-07, "loss": 0.0032, "reward": 2.1064376831054688, "reward_std": 0.004897836595773697, "rewards/accuracy_reward": 0.9064375162124634, "rewards/format_reward": 1.0, "step": 2311 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.25, "epoch": 0.03188921532116799, "grad_norm": 9.32905612836491, "kl": 0.0751953125, "learning_rate": 9.974929433225138e-07, "loss": 0.0029, "reward": 2.0581769943237305, "reward_std": 0.013685942627489567, "rewards/accuracy_reward": 0.858177125453949, "rewards/format_reward": 1.0, "step": 2312 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.40625, "epoch": 0.03190300823436918, "grad_norm": 2.8502713062155256, "kl": 0.0771484375, "learning_rate": 9.974907759355086e-07, "loss": 0.0031, "reward": 1.98046875, "reward_std": 0.015279257670044899, "rewards/accuracy_reward": 0.780468761920929, "rewards/format_reward": 1.0, "step": 2313 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.34375, "epoch": 0.03191680114757038, "grad_norm": 3.855142522539016, "kl": 0.0869140625, "learning_rate": 9.974886076143962e-07, "loss": 0.0035, "reward": 2.1050000190734863, "reward_std": 0.019919203594326973, "rewards/accuracy_reward": 0.9049999713897705, "rewards/format_reward": 1.0, "step": 2314 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 396.3125, "epoch": 0.031930594060771575, "grad_norm": 2.362971603615965, "kl": 0.0732421875, "learning_rate": 9.974864383591804e-07, "loss": 0.0029, "reward": 2.1367406845092773, "reward_std": 0.009053431451320648, "rewards/accuracy_reward": 0.936740517616272, "rewards/format_reward": 1.0, "step": 2315 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.625, "epoch": 0.031944386973972776, "grad_norm": 2.5169657865996067, "kl": 0.0859375, "learning_rate": 9.974842681698653e-07, "loss": 0.0034, "reward": 2.120513677597046, "reward_std": 0.011337135918438435, "rewards/accuracy_reward": 0.9205137491226196, "rewards/format_reward": 1.0, "step": 2316 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.375, "epoch": 0.03195817988717397, "grad_norm": 1.4191570126918736, "kl": 0.07373046875, "learning_rate": 9.974820970464552e-07, "loss": 0.003, "reward": 2.1772499084472656, "reward_std": 0.008732639253139496, "rewards/accuracy_reward": 0.9772499799728394, "rewards/format_reward": 1.0, "step": 2317 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.53125, "epoch": 0.03197197280037517, "grad_norm": 4.580004600703628, "kl": 0.07275390625, "learning_rate": 9.974799249889542e-07, "loss": 0.0029, "reward": 2.0073437690734863, "reward_std": 0.027227478101849556, "rewards/accuracy_reward": 0.8073437809944153, "rewards/format_reward": 1.0, "step": 2318 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.031985765713576364, "grad_norm": 4.324348257257624, "kl": 0.0810546875, "learning_rate": 9.97477751997366e-07, "loss": 0.0032, "reward": 2.0601563453674316, "reward_std": 0.012123478576540947, "rewards/accuracy_reward": 0.860156238079071, "rewards/format_reward": 1.0, "step": 2319 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.9375, "epoch": 0.031999558626777565, "grad_norm": 2.4815305147866518, "kl": 0.0771484375, "learning_rate": 9.97475578071695e-07, "loss": 0.0031, "reward": 2.128218650817871, "reward_std": 0.007847452536225319, "rewards/accuracy_reward": 0.9282187223434448, "rewards/format_reward": 1.0, "step": 2320 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.53125, "epoch": 0.03201335153997876, "grad_norm": 2.0412791670306305, "kl": 0.08203125, "learning_rate": 9.974734032119452e-07, "loss": 0.0033, "reward": 2.1193437576293945, "reward_std": 0.01069062203168869, "rewards/accuracy_reward": 0.9193437099456787, "rewards/format_reward": 1.0, "step": 2321 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.03202714445317996, "grad_norm": 2.6274071169258737, "kl": 0.08349609375, "learning_rate": 9.974712274181208e-07, "loss": 0.0033, "reward": 2.089062452316284, "reward_std": 0.01782357506453991, "rewards/accuracy_reward": 0.8890624642372131, "rewards/format_reward": 1.0, "step": 2322 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.84375, "epoch": 0.03204093736638115, "grad_norm": 2.3248122663803006, "kl": 0.080078125, "learning_rate": 9.974690506902255e-07, "loss": 0.0032, "reward": 2.1310312747955322, "reward_std": 0.016628500074148178, "rewards/accuracy_reward": 0.9310312271118164, "rewards/format_reward": 1.0, "step": 2323 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.8125, "epoch": 0.032054730279582354, "grad_norm": 1.6996090600110827, "kl": 0.08251953125, "learning_rate": 9.974668730282638e-07, "loss": 0.0033, "reward": 2.033937454223633, "reward_std": 0.006229642312973738, "rewards/accuracy_reward": 0.8339375257492065, "rewards/format_reward": 1.0, "step": 2324 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.40625, "epoch": 0.03206852319278355, "grad_norm": 2.947229159632449, "kl": 0.08935546875, "learning_rate": 9.974646944322397e-07, "loss": 0.0036, "reward": 2.0913751125335693, "reward_std": 0.015313539654016495, "rewards/accuracy_reward": 0.8913750052452087, "rewards/format_reward": 1.0, "step": 2325 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.15625, "epoch": 0.03208231610598475, "grad_norm": 2.2098465689425444, "kl": 0.076171875, "learning_rate": 9.974625149021572e-07, "loss": 0.003, "reward": 2.11928129196167, "reward_std": 0.022272884845733643, "rewards/accuracy_reward": 0.9192811846733093, "rewards/format_reward": 1.0, "step": 2326 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.78125, "epoch": 0.03209610901918594, "grad_norm": 25.78150202858481, "kl": 0.08642578125, "learning_rate": 9.974603344380203e-07, "loss": 0.0035, "reward": 2.0751874446868896, "reward_std": 0.02650390937924385, "rewards/accuracy_reward": 0.8751873970031738, "rewards/format_reward": 1.0, "step": 2327 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.78125, "epoch": 0.03210990193238714, "grad_norm": 8.955326727526062, "kl": 0.0810546875, "learning_rate": 9.974581530398333e-07, "loss": 0.0033, "reward": 2.1094064712524414, "reward_std": 0.021358944475650787, "rewards/accuracy_reward": 0.909406304359436, "rewards/format_reward": 1.0, "step": 2328 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.03125, "epoch": 0.03212369484558834, "grad_norm": 2.9534450307683566, "kl": 0.08251953125, "learning_rate": 9.974559707076002e-07, "loss": 0.0033, "reward": 2.0166876316070557, "reward_std": 0.026313763111829758, "rewards/accuracy_reward": 0.8166874647140503, "rewards/format_reward": 1.0, "step": 2329 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.5625, "epoch": 0.03213748775878953, "grad_norm": 2.6769641930782786, "kl": 0.083984375, "learning_rate": 9.97453787441325e-07, "loss": 0.0034, "reward": 2.131093978881836, "reward_std": 0.013185089454054832, "rewards/accuracy_reward": 0.9310937523841858, "rewards/format_reward": 1.0, "step": 2330 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.0625, "epoch": 0.03215128067199073, "grad_norm": 5.887994643950803, "kl": 0.08203125, "learning_rate": 9.974516032410123e-07, "loss": 0.0033, "reward": 2.076218605041504, "reward_std": 0.017778923735022545, "rewards/accuracy_reward": 0.8762187957763672, "rewards/format_reward": 1.0, "step": 2331 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.96875, "epoch": 0.032165073585191925, "grad_norm": 3.930450991478668, "kl": 0.08251953125, "learning_rate": 9.974494181066655e-07, "loss": 0.0033, "reward": 2.0151877403259277, "reward_std": 0.0163571834564209, "rewards/accuracy_reward": 0.8151874542236328, "rewards/format_reward": 1.0, "step": 2332 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.0625, "epoch": 0.032178866498393126, "grad_norm": 3.4216881399245533, "kl": 0.091796875, "learning_rate": 9.97447232038289e-07, "loss": 0.0037, "reward": 2.0705313682556152, "reward_std": 0.02359245903789997, "rewards/accuracy_reward": 0.8705312013626099, "rewards/format_reward": 1.0, "step": 2333 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.125, "epoch": 0.03219265941159432, "grad_norm": 3.060793416795109, "kl": 0.076171875, "learning_rate": 9.97445045035887e-07, "loss": 0.0031, "reward": 2.0478124618530273, "reward_std": 0.021338466554880142, "rewards/accuracy_reward": 0.8478125333786011, "rewards/format_reward": 1.0, "step": 2334 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 403.875, "epoch": 0.03220645232479552, "grad_norm": 4.834611104144321, "kl": 0.08984375, "learning_rate": 9.974428570994636e-07, "loss": 0.0036, "reward": 2.109715461730957, "reward_std": 0.029342088848352432, "rewards/accuracy_reward": 0.9159656167030334, "rewards/format_reward": 1.0, "step": 2335 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.75, "epoch": 0.032220245237996714, "grad_norm": 3.8493422859776874, "kl": 0.07666015625, "learning_rate": 9.974406682290226e-07, "loss": 0.0031, "reward": 2.0732502937316895, "reward_std": 0.016811514273285866, "rewards/accuracy_reward": 0.8732500076293945, "rewards/format_reward": 1.0, "step": 2336 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.0625, "epoch": 0.032234038151197915, "grad_norm": 2.8234388747705745, "kl": 0.0849609375, "learning_rate": 9.974384784245687e-07, "loss": 0.0034, "reward": 1.93959379196167, "reward_std": 0.014548176899552345, "rewards/accuracy_reward": 0.7395937442779541, "rewards/format_reward": 1.0, "step": 2337 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.3125, "epoch": 0.03224783106439911, "grad_norm": 4.529598366145159, "kl": 0.083984375, "learning_rate": 9.974362876861054e-07, "loss": 0.0033, "reward": 2.0817813873291016, "reward_std": 0.01818380132317543, "rewards/accuracy_reward": 0.881781280040741, "rewards/format_reward": 1.0, "step": 2338 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.1875, "epoch": 0.03226162397760031, "grad_norm": 3.1938708582024438, "kl": 0.0869140625, "learning_rate": 9.97434096013637e-07, "loss": 0.0035, "reward": 2.0459351539611816, "reward_std": 0.015249264426529408, "rewards/accuracy_reward": 0.8459351062774658, "rewards/format_reward": 1.0, "step": 2339 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.8125, "epoch": 0.0322754168908015, "grad_norm": 2.126501769507135, "kl": 0.08349609375, "learning_rate": 9.974319034071679e-07, "loss": 0.0033, "reward": 2.0508615970611572, "reward_std": 0.01695430465042591, "rewards/accuracy_reward": 0.8508615493774414, "rewards/format_reward": 1.0, "step": 2340 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.9375, "epoch": 0.032289209804002704, "grad_norm": 3.851130384146306, "kl": 0.08837890625, "learning_rate": 9.974297098667017e-07, "loss": 0.0035, "reward": 2.1062188148498535, "reward_std": 0.01266256533563137, "rewards/accuracy_reward": 0.9062187075614929, "rewards/format_reward": 1.0, "step": 2341 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.03125, "epoch": 0.0323030027172039, "grad_norm": 2.9705387640655623, "kl": 0.0810546875, "learning_rate": 9.974275153922431e-07, "loss": 0.0032, "reward": 1.997593879699707, "reward_std": 0.011554213240742683, "rewards/accuracy_reward": 0.7975937724113464, "rewards/format_reward": 1.0, "step": 2342 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.65625, "epoch": 0.0323167956304051, "grad_norm": 1.668882262052337, "kl": 0.080078125, "learning_rate": 9.97425319983796e-07, "loss": 0.0032, "reward": 1.9397811889648438, "reward_std": 0.0055802930146455765, "rewards/accuracy_reward": 0.7397812604904175, "rewards/format_reward": 1.0, "step": 2343 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.28125, "epoch": 0.03233058854360629, "grad_norm": 2.6109228493523076, "kl": 0.087890625, "learning_rate": 9.974231236413643e-07, "loss": 0.0035, "reward": 2.106843948364258, "reward_std": 0.011872983537614346, "rewards/accuracy_reward": 0.9068437218666077, "rewards/format_reward": 1.0, "step": 2344 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 417.40625, "epoch": 0.03234438145680749, "grad_norm": 3.216842831639977, "kl": 0.08251953125, "learning_rate": 9.974209263649523e-07, "loss": 0.0033, "reward": 1.9984999895095825, "reward_std": 0.03902565687894821, "rewards/accuracy_reward": 0.8047500252723694, "rewards/format_reward": 1.0, "step": 2345 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.5, "epoch": 0.03235817437000869, "grad_norm": 2.215271794334236, "kl": 0.08251953125, "learning_rate": 9.974187281545642e-07, "loss": 0.0033, "reward": 2.013031482696533, "reward_std": 0.0055474755354225636, "rewards/accuracy_reward": 0.8130311965942383, "rewards/format_reward": 1.0, "step": 2346 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.5, "epoch": 0.03237196728320989, "grad_norm": 3.0852133313992103, "kl": 0.08740234375, "learning_rate": 9.974165290102039e-07, "loss": 0.0035, "reward": 2.111062526702881, "reward_std": 0.010986735112965107, "rewards/accuracy_reward": 0.911062479019165, "rewards/format_reward": 1.0, "step": 2347 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.3125, "epoch": 0.03238576019641108, "grad_norm": 2.507537334365763, "kl": 0.0849609375, "learning_rate": 9.974143289318756e-07, "loss": 0.0034, "reward": 2.009500026702881, "reward_std": 0.029866669327020645, "rewards/accuracy_reward": 0.8157499432563782, "rewards/format_reward": 1.0, "step": 2348 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 408.84375, "epoch": 0.03239955310961228, "grad_norm": 1.9951901921263475, "kl": 0.08984375, "learning_rate": 9.974121279195837e-07, "loss": 0.0036, "reward": 2.1469688415527344, "reward_std": 0.00911218486726284, "rewards/accuracy_reward": 0.9469687938690186, "rewards/format_reward": 1.0, "step": 2349 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.0, "epoch": 0.032413346022813476, "grad_norm": 1.4985005935974436, "kl": 0.08203125, "learning_rate": 9.974099259733322e-07, "loss": 0.0033, "reward": 2.085031509399414, "reward_std": 0.005332938861101866, "rewards/accuracy_reward": 0.8850312232971191, "rewards/format_reward": 1.0, "step": 2350 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.9375, "epoch": 0.03242713893601468, "grad_norm": 3.103144542429257, "kl": 0.083984375, "learning_rate": 9.97407723093125e-07, "loss": 0.0034, "reward": 2.149343967437744, "reward_std": 0.019839826971292496, "rewards/accuracy_reward": 0.9493438005447388, "rewards/format_reward": 1.0, "step": 2351 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.4375, "epoch": 0.03244093184921587, "grad_norm": 3.386565044206934, "kl": 0.0927734375, "learning_rate": 9.974055192789665e-07, "loss": 0.0037, "reward": 2.063624858856201, "reward_std": 0.017994562163949013, "rewards/accuracy_reward": 0.8636250495910645, "rewards/format_reward": 1.0, "step": 2352 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 401.40625, "epoch": 0.03245472476241707, "grad_norm": 2.2761309494338295, "kl": 0.08349609375, "learning_rate": 9.974033145308607e-07, "loss": 0.0033, "reward": 2.1413750648498535, "reward_std": 0.01586213707923889, "rewards/accuracy_reward": 0.9413750171661377, "rewards/format_reward": 1.0, "step": 2353 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.78125, "epoch": 0.032468517675618265, "grad_norm": 1.8432715728651696, "kl": 0.08642578125, "learning_rate": 9.974011088488117e-07, "loss": 0.0035, "reward": 2.132718801498413, "reward_std": 0.005831220652908087, "rewards/accuracy_reward": 0.9327187538146973, "rewards/format_reward": 1.0, "step": 2354 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.03125, "epoch": 0.032482310588819466, "grad_norm": 2.477588129396509, "kl": 0.08935546875, "learning_rate": 9.973989022328237e-07, "loss": 0.0036, "reward": 2.0822811126708984, "reward_std": 0.012081412598490715, "rewards/accuracy_reward": 0.8822813034057617, "rewards/format_reward": 1.0, "step": 2355 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.21875, "epoch": 0.03249610350202066, "grad_norm": 3.3365285601954326, "kl": 0.0869140625, "learning_rate": 9.97396694682901e-07, "loss": 0.0035, "reward": 2.106156349182129, "reward_std": 0.013339381664991379, "rewards/accuracy_reward": 0.9061561822891235, "rewards/format_reward": 1.0, "step": 2356 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.75, "epoch": 0.03250989641522186, "grad_norm": 11.849880391347611, "kl": 0.087890625, "learning_rate": 9.973944861990475e-07, "loss": 0.0035, "reward": 2.0523126125335693, "reward_std": 0.0322912335395813, "rewards/accuracy_reward": 0.8585624694824219, "rewards/format_reward": 1.0, "step": 2357 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.3125, "epoch": 0.032523689328423054, "grad_norm": 8.392127570633706, "kl": 0.08544921875, "learning_rate": 9.973922767812675e-07, "loss": 0.0034, "reward": 2.12681245803833, "reward_std": 0.018162531778216362, "rewards/accuracy_reward": 0.9268125295639038, "rewards/format_reward": 1.0, "step": 2358 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.96875, "epoch": 0.032537482241624255, "grad_norm": 2.943888458050703, "kl": 0.0908203125, "learning_rate": 9.97390066429565e-07, "loss": 0.0036, "reward": 2.1219375133514404, "reward_std": 0.016423191875219345, "rewards/accuracy_reward": 0.9219375848770142, "rewards/format_reward": 1.0, "step": 2359 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.21875, "epoch": 0.03255127515482545, "grad_norm": 14.287235924336045, "kl": 0.09521484375, "learning_rate": 9.973878551439444e-07, "loss": 0.0038, "reward": 2.0561251640319824, "reward_std": 0.015381023287773132, "rewards/accuracy_reward": 0.856124997138977, "rewards/format_reward": 1.0, "step": 2360 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.15625, "epoch": 0.03256506806802665, "grad_norm": 2.339159417704558, "kl": 0.0830078125, "learning_rate": 9.973856429244098e-07, "loss": 0.0033, "reward": 2.0890345573425293, "reward_std": 0.029502497985959053, "rewards/accuracy_reward": 0.8952842950820923, "rewards/format_reward": 1.0, "step": 2361 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.46875, "epoch": 0.03257886098122784, "grad_norm": 2.9886503479010162, "kl": 0.0830078125, "learning_rate": 9.973834297709653e-07, "loss": 0.0033, "reward": 2.120687484741211, "reward_std": 0.027790088206529617, "rewards/accuracy_reward": 0.9269375205039978, "rewards/format_reward": 1.0, "step": 2362 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.3125, "epoch": 0.032592653894429044, "grad_norm": 2.6831080041579374, "kl": 0.0869140625, "learning_rate": 9.973812156836148e-07, "loss": 0.0035, "reward": 2.1221563816070557, "reward_std": 0.0333104133605957, "rewards/accuracy_reward": 0.9284062385559082, "rewards/format_reward": 1.0, "step": 2363 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.0, "epoch": 0.03260644680763024, "grad_norm": 5.902226917081986, "kl": 0.0927734375, "learning_rate": 9.973790006623629e-07, "loss": 0.0037, "reward": 1.9923124313354492, "reward_std": 0.05784345418214798, "rewards/accuracy_reward": 0.804812490940094, "rewards/format_reward": 1.0, "step": 2364 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.09375, "epoch": 0.03262023972083144, "grad_norm": 2.927095845204925, "kl": 0.08935546875, "learning_rate": 9.973767847072133e-07, "loss": 0.0035, "reward": 2.138051986694336, "reward_std": 0.007921425625681877, "rewards/accuracy_reward": 0.9380520582199097, "rewards/format_reward": 1.0, "step": 2365 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.0, "epoch": 0.03263403263403263, "grad_norm": 3.079658610465056, "kl": 0.08642578125, "learning_rate": 9.973745678181704e-07, "loss": 0.0035, "reward": 2.058718681335449, "reward_std": 0.021353382617235184, "rewards/accuracy_reward": 0.8587188124656677, "rewards/format_reward": 1.0, "step": 2366 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 391.25, "epoch": 0.03264782554723383, "grad_norm": 1.9730684993662486, "kl": 0.08984375, "learning_rate": 9.973723499952384e-07, "loss": 0.0036, "reward": 2.1182498931884766, "reward_std": 0.0077733239158988, "rewards/accuracy_reward": 0.9182500243186951, "rewards/format_reward": 1.0, "step": 2367 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.875, "epoch": 0.03266161846043503, "grad_norm": 2.0341904079955055, "kl": 0.08056640625, "learning_rate": 9.973701312384215e-07, "loss": 0.0032, "reward": 2.124500036239624, "reward_std": 0.026494383811950684, "rewards/accuracy_reward": 0.937000036239624, "rewards/format_reward": 1.0, "step": 2368 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.0, "epoch": 0.03267541137363623, "grad_norm": 4.8726830959665435, "kl": 0.07470703125, "learning_rate": 9.973679115477238e-07, "loss": 0.003, "reward": 2.1194376945495605, "reward_std": 0.011935602873563766, "rewards/accuracy_reward": 0.9194374680519104, "rewards/format_reward": 1.0, "step": 2369 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 385.59375, "epoch": 0.03268920428683742, "grad_norm": 2.6967009035499054, "kl": 0.07568359375, "learning_rate": 9.973656909231495e-07, "loss": 0.003, "reward": 2.101531505584717, "reward_std": 0.010418438352644444, "rewards/accuracy_reward": 0.9015312194824219, "rewards/format_reward": 1.0, "step": 2370 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 375.25, "epoch": 0.03270299720003862, "grad_norm": 3.165494353344299, "kl": 0.08154296875, "learning_rate": 9.973634693647024e-07, "loss": 0.0033, "reward": 2.08134388923645, "reward_std": 0.050670918077230453, "rewards/accuracy_reward": 0.8938437700271606, "rewards/format_reward": 1.0, "step": 2371 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.1875, "epoch": 0.032716790113239816, "grad_norm": 1.8671773146818746, "kl": 0.08251953125, "learning_rate": 9.973612468723874e-07, "loss": 0.0033, "reward": 2.0793747901916504, "reward_std": 0.008107316680252552, "rewards/accuracy_reward": 0.8793749809265137, "rewards/format_reward": 1.0, "step": 2372 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.6875, "epoch": 0.03273058302644102, "grad_norm": 3.116751694895294, "kl": 0.08203125, "learning_rate": 9.97359023446208e-07, "loss": 0.0033, "reward": 2.0723750591278076, "reward_std": 0.011620712466537952, "rewards/accuracy_reward": 0.8723750114440918, "rewards/format_reward": 1.0, "step": 2373 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.78125, "epoch": 0.03274437593964221, "grad_norm": 2.672323119923527, "kl": 0.08447265625, "learning_rate": 9.973567990861687e-07, "loss": 0.0034, "reward": 2.1391220092773438, "reward_std": 0.027407145127654076, "rewards/accuracy_reward": 0.9453719258308411, "rewards/format_reward": 1.0, "step": 2374 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.59375, "epoch": 0.03275816885284341, "grad_norm": 3.8358348492990153, "kl": 0.0771484375, "learning_rate": 9.973545737922736e-07, "loss": 0.0031, "reward": 2.1338748931884766, "reward_std": 0.04213307797908783, "rewards/accuracy_reward": 0.9463750123977661, "rewards/format_reward": 1.0, "step": 2375 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.53125, "epoch": 0.032771961766044605, "grad_norm": 2.3489391410091094, "kl": 0.07421875, "learning_rate": 9.97352347564527e-07, "loss": 0.003, "reward": 2.1101250648498535, "reward_std": 0.022552819922566414, "rewards/accuracy_reward": 0.9163750410079956, "rewards/format_reward": 1.0, "step": 2376 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.8125, "epoch": 0.032785754679245806, "grad_norm": 5.4585844652772435, "kl": 0.0888671875, "learning_rate": 9.973501204029328e-07, "loss": 0.0036, "reward": 2.076124906539917, "reward_std": 0.02718494087457657, "rewards/accuracy_reward": 0.8761250376701355, "rewards/format_reward": 1.0, "step": 2377 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 390.875, "epoch": 0.032799547592447, "grad_norm": 4.143807061511893, "kl": 0.0869140625, "learning_rate": 9.973478923074957e-07, "loss": 0.0035, "reward": 2.1364998817443848, "reward_std": 0.031032584607601166, "rewards/accuracy_reward": 0.9427499175071716, "rewards/format_reward": 1.0, "step": 2378 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.90625, "epoch": 0.0328133405056482, "grad_norm": 2.6955632757511294, "kl": 0.06689453125, "learning_rate": 9.973456632782193e-07, "loss": 0.0027, "reward": 2.142406463623047, "reward_std": 0.025756757706403732, "rewards/accuracy_reward": 0.9486562013626099, "rewards/format_reward": 1.0, "step": 2379 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.9375, "epoch": 0.032827133418849394, "grad_norm": 2.047918124942534, "kl": 0.078125, "learning_rate": 9.97343433315108e-07, "loss": 0.0031, "reward": 2.1542186737060547, "reward_std": 0.004356837831437588, "rewards/accuracy_reward": 0.9542187452316284, "rewards/format_reward": 1.0, "step": 2380 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.9375, "epoch": 0.032840926332050595, "grad_norm": 4.510018267524575, "kl": 0.0791015625, "learning_rate": 9.97341202418166e-07, "loss": 0.0032, "reward": 2.045593738555908, "reward_std": 0.03318803757429123, "rewards/accuracy_reward": 0.8518437743186951, "rewards/format_reward": 1.0, "step": 2381 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.53125, "epoch": 0.03285471924525179, "grad_norm": 3.117937202605605, "kl": 0.080078125, "learning_rate": 9.973389705873975e-07, "loss": 0.0032, "reward": 2.1275312900543213, "reward_std": 0.021019957959651947, "rewards/accuracy_reward": 0.9337812662124634, "rewards/format_reward": 1.0, "step": 2382 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.4375, "epoch": 0.03286851215845299, "grad_norm": 3.7193578446461775, "kl": 0.07763671875, "learning_rate": 9.973367378228069e-07, "loss": 0.0031, "reward": 2.1307811737060547, "reward_std": 0.012470152229070663, "rewards/accuracy_reward": 0.9307812452316284, "rewards/format_reward": 1.0, "step": 2383 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.5625, "epoch": 0.03288230507165418, "grad_norm": 2.5658145371196333, "kl": 0.083984375, "learning_rate": 9.973345041243982e-07, "loss": 0.0034, "reward": 2.008593797683716, "reward_std": 0.032400086522102356, "rewards/accuracy_reward": 0.8148437738418579, "rewards/format_reward": 1.0, "step": 2384 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.0, "epoch": 0.032896097984855384, "grad_norm": 2.0921653834263645, "kl": 0.08349609375, "learning_rate": 9.973322694921753e-07, "loss": 0.0033, "reward": 2.1388437747955322, "reward_std": 0.005781481973826885, "rewards/accuracy_reward": 0.9388437271118164, "rewards/format_reward": 1.0, "step": 2385 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.875, "epoch": 0.03290989089805658, "grad_norm": 1.6942686118466461, "kl": 0.08837890625, "learning_rate": 9.973300339261428e-07, "loss": 0.0035, "reward": 2.186375141143799, "reward_std": 0.022686326876282692, "rewards/accuracy_reward": 0.9926249980926514, "rewards/format_reward": 1.0, "step": 2386 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.90625, "epoch": 0.03292368381125778, "grad_norm": 4.2585733558131125, "kl": 0.0927734375, "learning_rate": 9.973277974263048e-07, "loss": 0.0037, "reward": 2.040875196456909, "reward_std": 0.024384882301092148, "rewards/accuracy_reward": 0.840874969959259, "rewards/format_reward": 1.0, "step": 2387 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.6875, "epoch": 0.03293747672445897, "grad_norm": 2.1267470233836314, "kl": 0.08056640625, "learning_rate": 9.973255599926655e-07, "loss": 0.0032, "reward": 2.133406400680542, "reward_std": 0.02618633769452572, "rewards/accuracy_reward": 0.9396562576293945, "rewards/format_reward": 1.0, "step": 2388 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.59375, "epoch": 0.03295126963766017, "grad_norm": 5.008861536587949, "kl": 0.0927734375, "learning_rate": 9.97323321625229e-07, "loss": 0.0037, "reward": 2.045687675476074, "reward_std": 0.018814748153090477, "rewards/accuracy_reward": 0.8456875085830688, "rewards/format_reward": 1.0, "step": 2389 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 402.84375, "epoch": 0.03296506255086137, "grad_norm": 5.329722737658765, "kl": 0.083984375, "learning_rate": 9.973210823239996e-07, "loss": 0.0034, "reward": 2.143843650817871, "reward_std": 0.012380078434944153, "rewards/accuracy_reward": 0.9438437223434448, "rewards/format_reward": 1.0, "step": 2390 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.0625, "epoch": 0.03297885546406257, "grad_norm": 2.664733009476507, "kl": 0.0869140625, "learning_rate": 9.973188420889816e-07, "loss": 0.0035, "reward": 1.826812505722046, "reward_std": 0.014422931708395481, "rewards/accuracy_reward": 0.6268125176429749, "rewards/format_reward": 1.0, "step": 2391 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.6875, "epoch": 0.03299264837726376, "grad_norm": 2.8494405564210035, "kl": 0.07958984375, "learning_rate": 9.97316600920179e-07, "loss": 0.0032, "reward": 2.0310938358306885, "reward_std": 0.018070163205266, "rewards/accuracy_reward": 0.8310937285423279, "rewards/format_reward": 1.0, "step": 2392 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.3125, "epoch": 0.03300644129046496, "grad_norm": 2.216269019927954, "kl": 0.0869140625, "learning_rate": 9.97314358817596e-07, "loss": 0.0035, "reward": 2.1406564712524414, "reward_std": 0.03640886768698692, "rewards/accuracy_reward": 0.9531562924385071, "rewards/format_reward": 1.0, "step": 2393 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.25, "epoch": 0.033020234203666156, "grad_norm": 2.296214209825985, "kl": 0.0927734375, "learning_rate": 9.97312115781237e-07, "loss": 0.0037, "reward": 2.113687515258789, "reward_std": 0.014185428619384766, "rewards/accuracy_reward": 0.9136874675750732, "rewards/format_reward": 1.0, "step": 2394 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 380.59375, "epoch": 0.03303402711686736, "grad_norm": 3.9799257974870685, "kl": 0.1005859375, "learning_rate": 9.973098718111061e-07, "loss": 0.004, "reward": 2.1571249961853027, "reward_std": 0.016814254224300385, "rewards/accuracy_reward": 0.9571249485015869, "rewards/format_reward": 1.0, "step": 2395 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.9375, "epoch": 0.03304782003006855, "grad_norm": 2.3743539775718414, "kl": 0.08642578125, "learning_rate": 9.973076269072075e-07, "loss": 0.0035, "reward": 2.107343912124634, "reward_std": 0.017473481595516205, "rewards/accuracy_reward": 0.9073437452316284, "rewards/format_reward": 1.0, "step": 2396 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.625, "epoch": 0.03306161294326975, "grad_norm": 2.8013854200604658, "kl": 0.07958984375, "learning_rate": 9.973053810695457e-07, "loss": 0.0032, "reward": 2.112562417984009, "reward_std": 0.011191128753125668, "rewards/accuracy_reward": 0.9125624895095825, "rewards/format_reward": 1.0, "step": 2397 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 0.033075405856470945, "grad_norm": 5.330557156468669, "kl": 0.0771484375, "learning_rate": 9.973031342981244e-07, "loss": 0.0031, "reward": 2.1572813987731934, "reward_std": 0.012861261144280434, "rewards/accuracy_reward": 0.957281231880188, "rewards/format_reward": 1.0, "step": 2398 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.28125, "epoch": 0.033089198769672146, "grad_norm": 3.984579946953634, "kl": 0.08544921875, "learning_rate": 9.973008865929483e-07, "loss": 0.0034, "reward": 2.0656251907348633, "reward_std": 0.013595905154943466, "rewards/accuracy_reward": 0.8656249642372131, "rewards/format_reward": 1.0, "step": 2399 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.03125, "epoch": 0.03310299168287334, "grad_norm": 6.216136210817981, "kl": 0.08203125, "learning_rate": 9.97298637954021e-07, "loss": 0.0033, "reward": 2.07240629196167, "reward_std": 0.009323226287961006, "rewards/accuracy_reward": 0.8724062442779541, "rewards/format_reward": 1.0, "step": 2400 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.5625, "epoch": 0.03311678459607454, "grad_norm": 8.221543568756736, "kl": 0.07763671875, "learning_rate": 9.972963883813476e-07, "loss": 0.0031, "reward": 2.127593755722046, "reward_std": 0.008764777332544327, "rewards/accuracy_reward": 0.9275937676429749, "rewards/format_reward": 1.0, "step": 2401 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.40625, "epoch": 0.033130577509275734, "grad_norm": 2.8362306643516173, "kl": 0.09033203125, "learning_rate": 9.972941378749316e-07, "loss": 0.0036, "reward": 2.139437437057495, "reward_std": 0.015877075493335724, "rewards/accuracy_reward": 0.9394374489784241, "rewards/format_reward": 1.0, "step": 2402 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.40625, "epoch": 0.033144370422476935, "grad_norm": 2.3639772550253957, "kl": 0.08740234375, "learning_rate": 9.972918864347775e-07, "loss": 0.0035, "reward": 2.129312515258789, "reward_std": 0.02568642422556877, "rewards/accuracy_reward": 0.9355624914169312, "rewards/format_reward": 1.0, "step": 2403 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 385.28125, "epoch": 0.03315816333567813, "grad_norm": 2.6715838347248866, "kl": 0.087890625, "learning_rate": 9.972896340608894e-07, "loss": 0.0035, "reward": 2.1388437747955322, "reward_std": 0.01290091685950756, "rewards/accuracy_reward": 0.9388437271118164, "rewards/format_reward": 1.0, "step": 2404 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 373.53125, "epoch": 0.03317195624887932, "grad_norm": 1.8561072825430347, "kl": 0.08447265625, "learning_rate": 9.972873807532718e-07, "loss": 0.0034, "reward": 2.007625102996826, "reward_std": 0.026237372308969498, "rewards/accuracy_reward": 0.8076249957084656, "rewards/format_reward": 1.0, "step": 2405 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 363.75, "epoch": 0.03318574916208052, "grad_norm": 2.336971683307296, "kl": 0.09228515625, "learning_rate": 9.972851265119287e-07, "loss": 0.0037, "reward": 2.081125020980835, "reward_std": 0.052818939089775085, "rewards/accuracy_reward": 0.8998750448226929, "rewards/format_reward": 1.0, "step": 2406 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.59375, "epoch": 0.03319954207528172, "grad_norm": 1.9259696652231992, "kl": 0.0771484375, "learning_rate": 9.972828713368643e-07, "loss": 0.0031, "reward": 2.056999921798706, "reward_std": 0.005443733185529709, "rewards/accuracy_reward": 0.8569999933242798, "rewards/format_reward": 1.0, "step": 2407 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.75, "epoch": 0.03321333498848292, "grad_norm": 1.8827932239322343, "kl": 0.0732421875, "learning_rate": 9.972806152280834e-07, "loss": 0.0029, "reward": 2.170875072479248, "reward_std": 0.014248276129364967, "rewards/accuracy_reward": 0.9708750247955322, "rewards/format_reward": 1.0, "step": 2408 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.0625, "epoch": 0.03322712790168411, "grad_norm": 18.377650887285704, "kl": 0.0771484375, "learning_rate": 9.972783581855892e-07, "loss": 0.0031, "reward": 2.1256561279296875, "reward_std": 0.01474706269800663, "rewards/accuracy_reward": 0.9256561994552612, "rewards/format_reward": 1.0, "step": 2409 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.4375, "epoch": 0.03324092081488531, "grad_norm": 2.4538553611388956, "kl": 0.087890625, "learning_rate": 9.972761002093866e-07, "loss": 0.0035, "reward": 2.1155314445495605, "reward_std": 0.010522525757551193, "rewards/accuracy_reward": 0.9155312776565552, "rewards/format_reward": 1.0, "step": 2410 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 373.09375, "epoch": 0.033254713728086506, "grad_norm": 3.5926245855456234, "kl": 0.08203125, "learning_rate": 9.9727384129948e-07, "loss": 0.0033, "reward": 2.1695938110351562, "reward_std": 0.024917827919125557, "rewards/accuracy_reward": 0.9758437871932983, "rewards/format_reward": 1.0, "step": 2411 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.78125, "epoch": 0.03326850664128771, "grad_norm": 2.4851963418716174, "kl": 0.0947265625, "learning_rate": 9.972715814558732e-07, "loss": 0.0038, "reward": 2.1246564388275146, "reward_std": 0.011552146635949612, "rewards/accuracy_reward": 0.9246562719345093, "rewards/format_reward": 1.0, "step": 2412 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 370.90625, "epoch": 0.0332822995544889, "grad_norm": 4.107681301273479, "kl": 0.0966796875, "learning_rate": 9.972693206785706e-07, "loss": 0.0039, "reward": 2.0489375591278076, "reward_std": 0.022118523716926575, "rewards/accuracy_reward": 0.848937451839447, "rewards/format_reward": 1.0, "step": 2413 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 380.5625, "epoch": 0.0332960924676901, "grad_norm": 2.9656141275351806, "kl": 0.08935546875, "learning_rate": 9.972670589675766e-07, "loss": 0.0036, "reward": 2.144718885421753, "reward_std": 0.020112890750169754, "rewards/accuracy_reward": 0.9447187781333923, "rewards/format_reward": 1.0, "step": 2414 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.0625, "epoch": 0.033309885380891295, "grad_norm": 2.543859750446778, "kl": 0.08251953125, "learning_rate": 9.972647963228953e-07, "loss": 0.0033, "reward": 2.0807814598083496, "reward_std": 0.041097287088632584, "rewards/accuracy_reward": 0.8932812213897705, "rewards/format_reward": 1.0, "step": 2415 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.03125, "epoch": 0.033323678294092496, "grad_norm": 2.51712493253632, "kl": 0.08203125, "learning_rate": 9.972625327445307e-07, "loss": 0.0033, "reward": 2.1582188606262207, "reward_std": 0.02050146833062172, "rewards/accuracy_reward": 0.9644687175750732, "rewards/format_reward": 1.0, "step": 2416 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.59375, "epoch": 0.03333747120729369, "grad_norm": 2.722630365394544, "kl": 0.08154296875, "learning_rate": 9.972602682324877e-07, "loss": 0.0033, "reward": 2.1474685668945312, "reward_std": 0.02480340749025345, "rewards/accuracy_reward": 0.9537187218666077, "rewards/format_reward": 1.0, "step": 2417 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 371.125, "epoch": 0.03335126412049489, "grad_norm": 2.4782277425164527, "kl": 0.08251953125, "learning_rate": 9.9725800278677e-07, "loss": 0.0033, "reward": 2.103687286376953, "reward_std": 0.026489539071917534, "rewards/accuracy_reward": 0.9099375009536743, "rewards/format_reward": 1.0, "step": 2418 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.25, "epoch": 0.033365057033696084, "grad_norm": 3.1856597450066575, "kl": 0.095703125, "learning_rate": 9.972557364073819e-07, "loss": 0.0038, "reward": 2.0617189407348633, "reward_std": 0.018151069059967995, "rewards/accuracy_reward": 0.8617187738418579, "rewards/format_reward": 1.0, "step": 2419 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.875, "epoch": 0.033378849946897285, "grad_norm": 1.1518360391563398, "kl": 0.0732421875, "learning_rate": 9.97253469094328e-07, "loss": 0.0029, "reward": 2.1312499046325684, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 2420 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.625, "epoch": 0.03339264286009848, "grad_norm": 13.350230308940366, "kl": 0.0927734375, "learning_rate": 9.972512008476123e-07, "loss": 0.0037, "reward": 2.1233749389648438, "reward_std": 0.012214983813464642, "rewards/accuracy_reward": 0.9233750104904175, "rewards/format_reward": 1.0, "step": 2421 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.5625, "epoch": 0.03340643577329968, "grad_norm": 2.328168748796646, "kl": 0.0869140625, "learning_rate": 9.97248931667239e-07, "loss": 0.0035, "reward": 2.128406524658203, "reward_std": 0.025792112573981285, "rewards/accuracy_reward": 0.9346562623977661, "rewards/format_reward": 1.0, "step": 2422 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.9375, "epoch": 0.03342022868650087, "grad_norm": 2.6963226717099187, "kl": 0.087890625, "learning_rate": 9.972466615532124e-07, "loss": 0.0035, "reward": 2.050940752029419, "reward_std": 0.029490116983652115, "rewards/accuracy_reward": 0.8571906089782715, "rewards/format_reward": 1.0, "step": 2423 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.65625, "epoch": 0.033434021599702074, "grad_norm": 3.863580668796059, "kl": 0.08349609375, "learning_rate": 9.97244390505537e-07, "loss": 0.0033, "reward": 2.0817813873291016, "reward_std": 0.013757359236478806, "rewards/accuracy_reward": 0.8817812204360962, "rewards/format_reward": 1.0, "step": 2424 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 376.65625, "epoch": 0.03344781451290327, "grad_norm": 3.054151597245667, "kl": 0.0888671875, "learning_rate": 9.972421185242169e-07, "loss": 0.0036, "reward": 2.098656177520752, "reward_std": 0.048749424517154694, "rewards/accuracy_reward": 0.9111562967300415, "rewards/format_reward": 1.0, "step": 2425 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 379.4375, "epoch": 0.03346160742610447, "grad_norm": 2.577269652225504, "kl": 0.0830078125, "learning_rate": 9.972398456092562e-07, "loss": 0.0033, "reward": 2.1077189445495605, "reward_std": 0.01511446014046669, "rewards/accuracy_reward": 0.9077187180519104, "rewards/format_reward": 1.0, "step": 2426 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.375, "epoch": 0.03347540033930566, "grad_norm": 3.8511200990567733, "kl": 0.0810546875, "learning_rate": 9.972375717606593e-07, "loss": 0.0033, "reward": 2.142937660217285, "reward_std": 0.010026934556663036, "rewards/accuracy_reward": 0.9429374933242798, "rewards/format_reward": 1.0, "step": 2427 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.875, "epoch": 0.03348919325250686, "grad_norm": 2.088545057038509, "kl": 0.08447265625, "learning_rate": 9.972352969784305e-07, "loss": 0.0034, "reward": 2.131333351135254, "reward_std": 0.0233272984623909, "rewards/accuracy_reward": 0.937583327293396, "rewards/format_reward": 1.0, "step": 2428 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.34375, "epoch": 0.03350298616570806, "grad_norm": 2.3506554559714754, "kl": 0.0859375, "learning_rate": 9.97233021262574e-07, "loss": 0.0035, "reward": 2.0661563873291016, "reward_std": 0.02349940314888954, "rewards/accuracy_reward": 0.8724062442779541, "rewards/format_reward": 1.0, "step": 2429 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.375, "epoch": 0.03351677907890926, "grad_norm": 3.617462970624219, "kl": 0.087890625, "learning_rate": 9.972307446130944e-07, "loss": 0.0035, "reward": 2.082750082015991, "reward_std": 0.01732909306883812, "rewards/accuracy_reward": 0.8827500343322754, "rewards/format_reward": 1.0, "step": 2430 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.0625, "epoch": 0.03353057199211045, "grad_norm": 7.24485562526061, "kl": 0.08642578125, "learning_rate": 9.972284670299955e-07, "loss": 0.0034, "reward": 1.9272499084472656, "reward_std": 0.010798781178891659, "rewards/accuracy_reward": 0.7272499799728394, "rewards/format_reward": 1.0, "step": 2431 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.34375, "epoch": 0.03354436490531165, "grad_norm": 2.4207804705077756, "kl": 0.08203125, "learning_rate": 9.97226188513282e-07, "loss": 0.0033, "reward": 2.1395626068115234, "reward_std": 0.009129272773861885, "rewards/accuracy_reward": 0.9395624399185181, "rewards/format_reward": 1.0, "step": 2432 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 413.25, "epoch": 0.033558157818512846, "grad_norm": 2.4855238197036686, "kl": 0.08154296875, "learning_rate": 9.972239090629578e-07, "loss": 0.0033, "reward": 2.10868763923645, "reward_std": 0.0074102627113461494, "rewards/accuracy_reward": 0.9086875319480896, "rewards/format_reward": 1.0, "step": 2433 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.9375, "epoch": 0.03357195073171405, "grad_norm": 2.664360697631489, "kl": 0.08837890625, "learning_rate": 9.972216286790272e-07, "loss": 0.0035, "reward": 2.0164687633514404, "reward_std": 0.012277012690901756, "rewards/accuracy_reward": 0.8164688348770142, "rewards/format_reward": 1.0, "step": 2434 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 400.75, "epoch": 0.03358574364491524, "grad_norm": 3.51656537641986, "kl": 0.0869140625, "learning_rate": 9.972193473614948e-07, "loss": 0.0035, "reward": 2.0055625438690186, "reward_std": 0.0083012655377388, "rewards/accuracy_reward": 0.8055624961853027, "rewards/format_reward": 1.0, "step": 2435 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.625, "epoch": 0.03359953655811644, "grad_norm": 4.083725203407614, "kl": 0.08642578125, "learning_rate": 9.972170651103647e-07, "loss": 0.0035, "reward": 2.0890936851501465, "reward_std": 0.03324972838163376, "rewards/accuracy_reward": 0.8890937566757202, "rewards/format_reward": 1.0, "step": 2436 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.6875, "epoch": 0.033613329471317635, "grad_norm": 4.47755386213965, "kl": 0.0966796875, "learning_rate": 9.97214781925641e-07, "loss": 0.0039, "reward": 1.9827501773834229, "reward_std": 0.01662595197558403, "rewards/accuracy_reward": 0.7827500104904175, "rewards/format_reward": 1.0, "step": 2437 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.09375, "epoch": 0.033627122384518836, "grad_norm": 1.897925868444686, "kl": 0.0830078125, "learning_rate": 9.972124978073284e-07, "loss": 0.0033, "reward": 2.0619375705718994, "reward_std": 0.006030885502696037, "rewards/accuracy_reward": 0.8619375824928284, "rewards/format_reward": 1.0, "step": 2438 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.53125, "epoch": 0.03364091529772003, "grad_norm": 2.385725610135012, "kl": 0.0810546875, "learning_rate": 9.972102127554307e-07, "loss": 0.0032, "reward": 2.0602188110351562, "reward_std": 0.007102045696228743, "rewards/accuracy_reward": 0.8602187633514404, "rewards/format_reward": 1.0, "step": 2439 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.90625, "epoch": 0.03365470821092123, "grad_norm": 5.65282098650959, "kl": 0.0830078125, "learning_rate": 9.972079267699526e-07, "loss": 0.0033, "reward": 2.0046563148498535, "reward_std": 0.019091207534074783, "rewards/accuracy_reward": 0.8046562671661377, "rewards/format_reward": 1.0, "step": 2440 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.53125, "epoch": 0.033668501124122424, "grad_norm": 4.669345736260731, "kl": 0.0849609375, "learning_rate": 9.972056398508984e-07, "loss": 0.0034, "reward": 2.0408437252044678, "reward_std": 0.012706535868346691, "rewards/accuracy_reward": 0.8408437371253967, "rewards/format_reward": 1.0, "step": 2441 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 404.0625, "epoch": 0.033682294037323625, "grad_norm": 3.0949382951462696, "kl": 0.08251953125, "learning_rate": 9.97203351998272e-07, "loss": 0.0033, "reward": 2.168656349182129, "reward_std": 0.01767483353614807, "rewards/accuracy_reward": 0.9686563014984131, "rewards/format_reward": 1.0, "step": 2442 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 403.1875, "epoch": 0.03369608695052482, "grad_norm": 2.7724112619274925, "kl": 0.07666015625, "learning_rate": 9.97201063212078e-07, "loss": 0.0031, "reward": 2.1036248207092285, "reward_std": 0.01158314198255539, "rewards/accuracy_reward": 0.9036250114440918, "rewards/format_reward": 1.0, "step": 2443 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.875, "epoch": 0.03370987986372602, "grad_norm": 2.6676294727136107, "kl": 0.083984375, "learning_rate": 9.97198773492321e-07, "loss": 0.0034, "reward": 2.1239376068115234, "reward_std": 0.025089092552661896, "rewards/accuracy_reward": 0.9239374995231628, "rewards/format_reward": 1.0, "step": 2444 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.34375, "epoch": 0.03372367277692721, "grad_norm": 2.642678056326857, "kl": 0.08984375, "learning_rate": 9.971964828390047e-07, "loss": 0.0036, "reward": 2.026249885559082, "reward_std": 0.011902209371328354, "rewards/accuracy_reward": 0.8262499570846558, "rewards/format_reward": 1.0, "step": 2445 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.3125, "epoch": 0.033737465690128414, "grad_norm": 4.595366298878818, "kl": 0.07958984375, "learning_rate": 9.971941912521335e-07, "loss": 0.0032, "reward": 2.0620625019073486, "reward_std": 0.012353375554084778, "rewards/accuracy_reward": 0.8620625138282776, "rewards/format_reward": 1.0, "step": 2446 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.875, "epoch": 0.03375125860332961, "grad_norm": 3.0458713282711183, "kl": 0.0830078125, "learning_rate": 9.971918987317117e-07, "loss": 0.0033, "reward": 2.0361876487731934, "reward_std": 0.021022191271185875, "rewards/accuracy_reward": 0.8361876010894775, "rewards/format_reward": 1.0, "step": 2447 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 399.1875, "epoch": 0.03376505151653081, "grad_norm": 3.6574162160207377, "kl": 0.08984375, "learning_rate": 9.97189605277744e-07, "loss": 0.0036, "reward": 2.151750087738037, "reward_std": 0.011491785757243633, "rewards/accuracy_reward": 0.9517499804496765, "rewards/format_reward": 1.0, "step": 2448 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 0.033778844429732, "grad_norm": 2.63140295141255, "kl": 0.08056640625, "learning_rate": 9.971873108902345e-07, "loss": 0.0032, "reward": 2.1619999408721924, "reward_std": 0.007781158667057753, "rewards/accuracy_reward": 0.9620000123977661, "rewards/format_reward": 1.0, "step": 2449 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 390.40625, "epoch": 0.0337926373429332, "grad_norm": 3.093291900477158, "kl": 0.078125, "learning_rate": 9.971850155691874e-07, "loss": 0.0031, "reward": 2.122781276702881, "reward_std": 0.044731613248586655, "rewards/accuracy_reward": 0.922781229019165, "rewards/format_reward": 1.0, "step": 2450 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 386.9375, "epoch": 0.0338064302561344, "grad_norm": 6.463156689368654, "kl": 0.0703125, "learning_rate": 9.97182719314607e-07, "loss": 0.0028, "reward": 2.1404688358306885, "reward_std": 0.010282193310558796, "rewards/accuracy_reward": 0.9404687881469727, "rewards/format_reward": 1.0, "step": 2451 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.84375, "epoch": 0.0338202231693356, "grad_norm": 2.896773754705013, "kl": 0.0830078125, "learning_rate": 9.971804221264977e-07, "loss": 0.0033, "reward": 2.1138124465942383, "reward_std": 0.03136708587408066, "rewards/accuracy_reward": 0.9263125061988831, "rewards/format_reward": 1.0, "step": 2452 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 373.3125, "epoch": 0.03383401608253679, "grad_norm": 3.3747246201428163, "kl": 0.0869140625, "learning_rate": 9.971781240048639e-07, "loss": 0.0035, "reward": 2.1637189388275146, "reward_std": 0.0077643669210374355, "rewards/accuracy_reward": 0.9637187719345093, "rewards/format_reward": 1.0, "step": 2453 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 370.875, "epoch": 0.03384780899573799, "grad_norm": 3.9642644702356495, "kl": 0.0908203125, "learning_rate": 9.971758249497096e-07, "loss": 0.0036, "reward": 2.0025625228881836, "reward_std": 0.037515923380851746, "rewards/accuracy_reward": 0.8150624632835388, "rewards/format_reward": 1.0, "step": 2454 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.46875, "epoch": 0.033861601908939186, "grad_norm": 3.3502580157771495, "kl": 0.0830078125, "learning_rate": 9.971735249610394e-07, "loss": 0.0033, "reward": 2.0908751487731934, "reward_std": 0.024314774200320244, "rewards/accuracy_reward": 0.8971249461174011, "rewards/format_reward": 1.0, "step": 2455 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 380.375, "epoch": 0.03387539482214039, "grad_norm": 5.394240688036444, "kl": 0.0859375, "learning_rate": 9.971712240388576e-07, "loss": 0.0035, "reward": 2.1338438987731934, "reward_std": 0.017250115051865578, "rewards/accuracy_reward": 0.933843731880188, "rewards/format_reward": 1.0, "step": 2456 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.15625, "epoch": 0.03388918773534158, "grad_norm": 4.685872872380482, "kl": 0.08984375, "learning_rate": 9.971689221831684e-07, "loss": 0.0036, "reward": 2.1126251220703125, "reward_std": 0.02720843441784382, "rewards/accuracy_reward": 0.918874979019165, "rewards/format_reward": 1.0, "step": 2457 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.90625, "epoch": 0.03390298064854278, "grad_norm": 2.160279344611698, "kl": 0.07958984375, "learning_rate": 9.971666193939763e-07, "loss": 0.0032, "reward": 2.159656286239624, "reward_std": 0.0219342689961195, "rewards/accuracy_reward": 0.9659062623977661, "rewards/format_reward": 1.0, "step": 2458 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 379.6875, "epoch": 0.033916773561743975, "grad_norm": 2.057524035915639, "kl": 0.08447265625, "learning_rate": 9.971643156712855e-07, "loss": 0.0034, "reward": 2.165253162384033, "reward_std": 0.009145085699856281, "rewards/accuracy_reward": 0.9652531147003174, "rewards/format_reward": 1.0, "step": 2459 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.03125, "epoch": 0.033930566474945176, "grad_norm": 6.565121450329172, "kl": 0.0810546875, "learning_rate": 9.971620110151002e-07, "loss": 0.0033, "reward": 2.0455312728881836, "reward_std": 0.03897400200366974, "rewards/accuracy_reward": 0.8580312132835388, "rewards/format_reward": 1.0, "step": 2460 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.78125, "epoch": 0.03394435938814637, "grad_norm": 3.2723399725121682, "kl": 0.07080078125, "learning_rate": 9.971597054254248e-07, "loss": 0.0028, "reward": 2.092750072479248, "reward_std": 0.006710412912070751, "rewards/accuracy_reward": 0.8927499651908875, "rewards/format_reward": 1.0, "step": 2461 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.59375, "epoch": 0.03395815230134757, "grad_norm": 2.3234776316756336, "kl": 0.07275390625, "learning_rate": 9.97157398902264e-07, "loss": 0.0029, "reward": 2.103781223297119, "reward_std": 0.02257383055984974, "rewards/accuracy_reward": 0.9100311994552612, "rewards/format_reward": 1.0, "step": 2462 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.0, "epoch": 0.033971945214548764, "grad_norm": 2.3517598987425403, "kl": 0.0859375, "learning_rate": 9.971550914456217e-07, "loss": 0.0034, "reward": 2.134031295776367, "reward_std": 0.010852044448256493, "rewards/accuracy_reward": 0.9340312480926514, "rewards/format_reward": 1.0, "step": 2463 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.6875, "epoch": 0.033985738127749965, "grad_norm": 4.7973722135779475, "kl": 0.09033203125, "learning_rate": 9.971527830555021e-07, "loss": 0.0036, "reward": 2.040468692779541, "reward_std": 0.010141235776245594, "rewards/accuracy_reward": 0.84046870470047, "rewards/format_reward": 1.0, "step": 2464 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.15625, "epoch": 0.03399953104095116, "grad_norm": 3.0173789702295157, "kl": 0.08154296875, "learning_rate": 9.971504737319101e-07, "loss": 0.0033, "reward": 2.0005626678466797, "reward_std": 0.07080061733722687, "rewards/accuracy_reward": 0.8005625009536743, "rewards/format_reward": 1.0, "step": 2465 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.59375, "epoch": 0.03401332395415236, "grad_norm": 2.6133684855381283, "kl": 0.09521484375, "learning_rate": 9.971481634748495e-07, "loss": 0.0038, "reward": 2.0197813510894775, "reward_std": 0.03206191211938858, "rewards/accuracy_reward": 0.8260312676429749, "rewards/format_reward": 1.0, "step": 2466 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.03402711686735355, "grad_norm": 5.866873540986668, "kl": 0.08349609375, "learning_rate": 9.97145852284325e-07, "loss": 0.0033, "reward": 2.117906332015991, "reward_std": 0.007888181135058403, "rewards/accuracy_reward": 0.9179062843322754, "rewards/format_reward": 1.0, "step": 2467 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.78125, "epoch": 0.034040909780554754, "grad_norm": 5.0572905549114076, "kl": 0.08935546875, "learning_rate": 9.971435401603407e-07, "loss": 0.0036, "reward": 2.0903749465942383, "reward_std": 0.027681348845362663, "rewards/accuracy_reward": 0.8966250419616699, "rewards/format_reward": 1.0, "step": 2468 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.59375, "epoch": 0.03405470269375595, "grad_norm": 1.3156902998176172, "kl": 0.0810546875, "learning_rate": 9.971412271029013e-07, "loss": 0.0032, "reward": 2.1235625743865967, "reward_std": 0.023817671462893486, "rewards/accuracy_reward": 0.9360624551773071, "rewards/format_reward": 1.0, "step": 2469 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 411.9375, "epoch": 0.03406849560695715, "grad_norm": 3.7195421296306668, "kl": 0.08642578125, "learning_rate": 9.971389131120108e-07, "loss": 0.0035, "reward": 1.964093804359436, "reward_std": 0.06583546102046967, "rewards/accuracy_reward": 0.7640937566757202, "rewards/format_reward": 1.0, "step": 2470 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.15625, "epoch": 0.03408228852015834, "grad_norm": 3.0295248489458633, "kl": 0.08837890625, "learning_rate": 9.971365981876734e-07, "loss": 0.0035, "reward": 2.101830005645752, "reward_std": 0.02001786045730114, "rewards/accuracy_reward": 0.9018298387527466, "rewards/format_reward": 1.0, "step": 2471 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.5625, "epoch": 0.03409608143335954, "grad_norm": 7.1150072581499995, "kl": 0.08447265625, "learning_rate": 9.97134282329894e-07, "loss": 0.0034, "reward": 2.0817813873291016, "reward_std": 0.026779845356941223, "rewards/accuracy_reward": 0.881781280040741, "rewards/format_reward": 1.0, "step": 2472 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.71875, "epoch": 0.03410987434656074, "grad_norm": 4.195636147756919, "kl": 0.0869140625, "learning_rate": 9.971319655386763e-07, "loss": 0.0035, "reward": 2.0225937366485596, "reward_std": 0.029494451358914375, "rewards/accuracy_reward": 0.8288437128067017, "rewards/format_reward": 1.0, "step": 2473 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.46875, "epoch": 0.03412366725976194, "grad_norm": 2.3419742955447007, "kl": 0.09033203125, "learning_rate": 9.97129647814025e-07, "loss": 0.0036, "reward": 2.1741561889648438, "reward_std": 0.006099590100347996, "rewards/accuracy_reward": 0.9741562604904175, "rewards/format_reward": 1.0, "step": 2474 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 412.0625, "epoch": 0.03413746017296313, "grad_norm": 2.772833498800288, "kl": 0.08642578125, "learning_rate": 9.971273291559446e-07, "loss": 0.0035, "reward": 2.1161561012268066, "reward_std": 0.0149550661444664, "rewards/accuracy_reward": 0.9161562323570251, "rewards/format_reward": 1.0, "step": 2475 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.34375, "epoch": 0.03415125308616433, "grad_norm": 3.898858564895746, "kl": 0.09326171875, "learning_rate": 9.971250095644392e-07, "loss": 0.0037, "reward": 2.1189374923706055, "reward_std": 0.017431240528821945, "rewards/accuracy_reward": 0.9189375042915344, "rewards/format_reward": 1.0, "step": 2476 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.034165045999365526, "grad_norm": 2.9555917020149574, "kl": 0.09228515625, "learning_rate": 9.971226890395132e-07, "loss": 0.0037, "reward": 2.019031286239624, "reward_std": 0.013402904383838177, "rewards/accuracy_reward": 0.8190311789512634, "rewards/format_reward": 1.0, "step": 2477 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 412.75, "epoch": 0.03417883891256673, "grad_norm": 3.0685472343498463, "kl": 0.0791015625, "learning_rate": 9.97120367581171e-07, "loss": 0.0032, "reward": 1.9044687747955322, "reward_std": 0.04219035804271698, "rewards/accuracy_reward": 0.7107187509536743, "rewards/format_reward": 1.0, "step": 2478 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.71875, "epoch": 0.03419263182576792, "grad_norm": 3.2418424809581268, "kl": 0.0859375, "learning_rate": 9.971180451894172e-07, "loss": 0.0035, "reward": 1.980125069618225, "reward_std": 0.014232121407985687, "rewards/accuracy_reward": 0.7801250219345093, "rewards/format_reward": 1.0, "step": 2479 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 402.90625, "epoch": 0.034206424738969114, "grad_norm": 2.6745811352136974, "kl": 0.08447265625, "learning_rate": 9.971157218642556e-07, "loss": 0.0034, "reward": 1.9965624809265137, "reward_std": 0.010226715356111526, "rewards/accuracy_reward": 0.7965624332427979, "rewards/format_reward": 1.0, "step": 2480 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 416.84375, "epoch": 0.034220217652170315, "grad_norm": 4.930951633392177, "kl": 0.083984375, "learning_rate": 9.97113397605691e-07, "loss": 0.0034, "reward": 2.0781562328338623, "reward_std": 0.015688687562942505, "rewards/accuracy_reward": 0.8781561851501465, "rewards/format_reward": 1.0, "step": 2481 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.09375, "epoch": 0.03423401056537151, "grad_norm": 3.437921211682119, "kl": 0.07568359375, "learning_rate": 9.971110724137276e-07, "loss": 0.003, "reward": 2.119374990463257, "reward_std": 0.01475725881755352, "rewards/accuracy_reward": 0.9193750023841858, "rewards/format_reward": 1.0, "step": 2482 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 427.875, "epoch": 0.03424780347857271, "grad_norm": 3.3107338780351347, "kl": 0.078125, "learning_rate": 9.971087462883697e-07, "loss": 0.0031, "reward": 2.0736875534057617, "reward_std": 0.008253393694758415, "rewards/accuracy_reward": 0.8736873865127563, "rewards/format_reward": 1.0, "step": 2483 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 416.6875, "epoch": 0.0342615963917739, "grad_norm": 4.475249844210883, "kl": 0.08447265625, "learning_rate": 9.971064192296222e-07, "loss": 0.0034, "reward": 1.9825000762939453, "reward_std": 0.028541497886180878, "rewards/accuracy_reward": 0.7887499332427979, "rewards/format_reward": 1.0, "step": 2484 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.40625, "epoch": 0.034275389304975104, "grad_norm": 2.879428313194441, "kl": 0.087890625, "learning_rate": 9.971040912374886e-07, "loss": 0.0035, "reward": 2.1397812366485596, "reward_std": 0.013590086251497269, "rewards/accuracy_reward": 0.9397812485694885, "rewards/format_reward": 1.0, "step": 2485 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 420.59375, "epoch": 0.0342891822181763, "grad_norm": 6.317778461168252, "kl": 0.0810546875, "learning_rate": 9.971017623119739e-07, "loss": 0.0032, "reward": 2.13853120803833, "reward_std": 0.013636965304613113, "rewards/accuracy_reward": 0.9385312795639038, "rewards/format_reward": 1.0, "step": 2486 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.0, "epoch": 0.0343029751313775, "grad_norm": 2.5873515223000747, "kl": 0.08349609375, "learning_rate": 9.970994324530824e-07, "loss": 0.0033, "reward": 2.1447501182556152, "reward_std": 0.012977300211787224, "rewards/accuracy_reward": 0.9447499513626099, "rewards/format_reward": 1.0, "step": 2487 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 415.5625, "epoch": 0.03431676804457869, "grad_norm": 2.9246081118692167, "kl": 0.0869140625, "learning_rate": 9.97097101660818e-07, "loss": 0.0035, "reward": 2.1542186737060547, "reward_std": 0.015418612398207188, "rewards/accuracy_reward": 0.9542188048362732, "rewards/format_reward": 1.0, "step": 2488 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.5625, "epoch": 0.03433056095777989, "grad_norm": 2.0930674403234195, "kl": 0.08154296875, "learning_rate": 9.970947699351857e-07, "loss": 0.0033, "reward": 2.055093765258789, "reward_std": 0.030023913830518723, "rewards/accuracy_reward": 0.8613437414169312, "rewards/format_reward": 1.0, "step": 2489 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.4375, "epoch": 0.03434435387098109, "grad_norm": 3.9937996142284558, "kl": 0.0791015625, "learning_rate": 9.970924372761896e-07, "loss": 0.0032, "reward": 2.0469062328338623, "reward_std": 0.03693939745426178, "rewards/accuracy_reward": 0.8531562089920044, "rewards/format_reward": 1.0, "step": 2490 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 421.96875, "epoch": 0.03435814678418229, "grad_norm": 2.8254197294337, "kl": 0.07421875, "learning_rate": 9.970901036838342e-07, "loss": 0.003, "reward": 2.131687641143799, "reward_std": 0.010842893272638321, "rewards/accuracy_reward": 0.9316875338554382, "rewards/format_reward": 1.0, "step": 2491 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.15625, "epoch": 0.03437193969738348, "grad_norm": 1.9596384483936762, "kl": 0.08837890625, "learning_rate": 9.970877691581236e-07, "loss": 0.0035, "reward": 2.0266876220703125, "reward_std": 0.0414372943341732, "rewards/accuracy_reward": 0.839187502861023, "rewards/format_reward": 1.0, "step": 2492 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 407.28125, "epoch": 0.03438573261058468, "grad_norm": 2.2381669334726952, "kl": 0.07421875, "learning_rate": 9.970854336990624e-07, "loss": 0.003, "reward": 2.1756250858306885, "reward_std": 0.009924383834004402, "rewards/accuracy_reward": 0.9756249785423279, "rewards/format_reward": 1.0, "step": 2493 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.15625, "epoch": 0.034399525523785876, "grad_norm": 2.5692768372546446, "kl": 0.0830078125, "learning_rate": 9.970830973066549e-07, "loss": 0.0033, "reward": 2.101468801498413, "reward_std": 0.01245273556560278, "rewards/accuracy_reward": 0.9014687538146973, "rewards/format_reward": 1.0, "step": 2494 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 411.1875, "epoch": 0.03441331843698708, "grad_norm": 7.57162041956244, "kl": 0.07470703125, "learning_rate": 9.970807599809056e-07, "loss": 0.003, "reward": 2.079625129699707, "reward_std": 0.008041145280003548, "rewards/accuracy_reward": 0.8796249628067017, "rewards/format_reward": 1.0, "step": 2495 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.125, "epoch": 0.03442711135018827, "grad_norm": 2.3123518034331125, "kl": 0.083984375, "learning_rate": 9.97078421721819e-07, "loss": 0.0034, "reward": 2.1514687538146973, "reward_std": 0.009307511150836945, "rewards/accuracy_reward": 0.9514687657356262, "rewards/format_reward": 1.0, "step": 2496 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.03125, "epoch": 0.03444090426338947, "grad_norm": 4.411916140559651, "kl": 0.083984375, "learning_rate": 9.97076082529399e-07, "loss": 0.0034, "reward": 2.152156352996826, "reward_std": 0.010378116741776466, "rewards/accuracy_reward": 0.9521562457084656, "rewards/format_reward": 1.0, "step": 2497 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.65625, "epoch": 0.034454697176590665, "grad_norm": 2.4775239786426533, "kl": 0.0771484375, "learning_rate": 9.970737424036504e-07, "loss": 0.0031, "reward": 2.0715625286102295, "reward_std": 0.013272336684167385, "rewards/accuracy_reward": 0.8715626001358032, "rewards/format_reward": 1.0, "step": 2498 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 390.78125, "epoch": 0.034468490089791866, "grad_norm": 2.04170119441449, "kl": 0.08642578125, "learning_rate": 9.970714013445775e-07, "loss": 0.0035, "reward": 1.9000937938690186, "reward_std": 0.006583128124475479, "rewards/accuracy_reward": 0.7000937461853027, "rewards/format_reward": 1.0, "step": 2499 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.15625, "epoch": 0.03448228300299306, "grad_norm": 4.124821929364961, "kl": 0.0859375, "learning_rate": 9.970690593521847e-07, "loss": 0.0034, "reward": 2.112281322479248, "reward_std": 0.03347647190093994, "rewards/accuracy_reward": 0.9185312390327454, "rewards/format_reward": 1.0, "step": 2500 } ], "logging_steps": 1.0, "max_steps": 72501, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }