{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07602481449945263, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 385.515625, "epoch": 3.040992579978105e-05, "grad_norm": 1.46462129102653, "kl": 0.0, "learning_rate": 9.999999977182372e-07, "loss": -0.0, "reward": 1.6200919151306152, "reward_std": 0.3473181128501892, "rewards/accuracy_reward": 0.5732167959213257, "rewards/format_reward": 0.984375, "step": 1, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 349.140625, "epoch": 6.08198515995621e-05, "grad_norm": 1.5765784532645386, "kl": 0.0006866455078125, "learning_rate": 9.999999908729493e-07, "loss": 0.0, "reward": 1.6871967315673828, "reward_std": 0.15995854139328003, "rewards/accuracy_reward": 0.6090717315673828, "rewards/format_reward": 1.0, "step": 2, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 382.171875, "epoch": 9.122977739934314e-05, "grad_norm": 1.4479568746080858, "kl": 0.000553131103515625, "learning_rate": 9.99999979464136e-07, "loss": 0.0, "reward": 1.800853967666626, "reward_std": 0.3275423049926758, "rewards/accuracy_reward": 0.7696040868759155, "rewards/format_reward": 0.953125, "step": 3, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 301.59375, "epoch": 0.0001216397031991242, "grad_norm": 1.8873157921194488, "kl": 0.000637054443359375, "learning_rate": 9.999999634917972e-07, "loss": 0.0, "reward": 1.7625000476837158, "reward_std": 0.2836723327636719, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 4, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 340.03125, "epoch": 0.00015204962899890526, "grad_norm": 1.8722689099433998, "kl": 0.000545501708984375, "learning_rate": 9.999999429559338e-07, "loss": 0.0, "reward": 1.6502331495285034, "reward_std": 0.2827591896057129, "rewards/accuracy_reward": 0.6189831495285034, "rewards/format_reward": 0.984375, "step": 5, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 335.21875, "epoch": 0.0001824595547986863, "grad_norm": 1.562897442755551, "kl": 0.000797271728515625, "learning_rate": 9.999999178565453e-07, "loss": 0.0, "reward": 1.743882179260254, "reward_std": 0.3677159547805786, "rewards/accuracy_reward": 0.6720072031021118, "rewards/format_reward": 1.0, "step": 6, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 290.828125, "epoch": 0.00021286948059846735, "grad_norm": 1.7767724234648412, "kl": 0.000823974609375, "learning_rate": 9.999998881936323e-07, "loss": 0.0, "reward": 1.5342857837677002, "reward_std": 0.2276882380247116, "rewards/accuracy_reward": 0.5092858076095581, "rewards/format_reward": 1.0, "step": 7, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 431.515625, "epoch": 0.0002432794063982484, "grad_norm": 2.418736542031397, "kl": 0.0012359619140625, "learning_rate": 9.999998539671947e-07, "loss": 0.0, "reward": 1.4342398643493652, "reward_std": 0.3719263970851898, "rewards/accuracy_reward": 0.49986493587493896, "rewards/format_reward": 0.890625, "step": 8, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 370.890625, "epoch": 0.0002736893321980294, "grad_norm": 1.3838072184589625, "kl": 0.0007476806640625, "learning_rate": 9.999998151772334e-07, "loss": 0.0, "reward": 1.756640076637268, "reward_std": 0.20073392987251282, "rewards/accuracy_reward": 0.6941401362419128, "rewards/format_reward": 0.984375, "step": 9, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.09375, "epoch": 0.0003040992579978105, "grad_norm": 1.5727200865129225, "kl": 0.000728607177734375, "learning_rate": 9.99999771823748e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.22021089494228363, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 10, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 325.59375, "epoch": 0.00033450918379759155, "grad_norm": 2.688424452559186, "kl": 0.00084686279296875, "learning_rate": 9.999997239067396e-07, "loss": 0.0, "reward": 1.917578101158142, "reward_std": 0.17126643657684326, "rewards/accuracy_reward": 0.861328125, "rewards/format_reward": 1.0, "step": 11, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 333.3125, "epoch": 0.0003649191095973726, "grad_norm": 1.2016068474261656, "kl": 0.000919342041015625, "learning_rate": 9.999996714262083e-07, "loss": 0.0, "reward": 1.890625, "reward_std": 0.16929560899734497, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 12, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 335.25, "epoch": 0.0003953290353971536, "grad_norm": 1.501756467523148, "kl": 0.00115203857421875, "learning_rate": 9.999996143821546e-07, "loss": 0.0, "reward": 1.7435684204101562, "reward_std": 0.2695395350456238, "rewards/accuracy_reward": 0.7029433846473694, "rewards/format_reward": 1.0, "step": 13, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.421875, "epoch": 0.0004257389611969347, "grad_norm": 1.7218942478249426, "kl": 0.00122833251953125, "learning_rate": 9.999995527745788e-07, "loss": 0.0, "reward": 1.8532562255859375, "reward_std": 0.2433551549911499, "rewards/accuracy_reward": 0.7657562494277954, "rewards/format_reward": 1.0, "step": 14, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 392.9375, "epoch": 0.00045614888699671574, "grad_norm": 1.4130863054579237, "kl": 0.00146484375, "learning_rate": 9.99999486603482e-07, "loss": 0.0001, "reward": 1.7096762657165527, "reward_std": 0.1203475296497345, "rewards/accuracy_reward": 0.6503013372421265, "rewards/format_reward": 1.0, "step": 15, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 352.703125, "epoch": 0.0004865588127964968, "grad_norm": 2.2745796643908838, "kl": 0.00124359130859375, "learning_rate": 9.999994158688644e-07, "loss": 0.0, "reward": 1.6993539333343506, "reward_std": 0.15594330430030823, "rewards/accuracy_reward": 0.6274789571762085, "rewards/format_reward": 1.0, "step": 16, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 325.890625, "epoch": 0.0005169687385962778, "grad_norm": 1.3784606032507907, "kl": 0.00185394287109375, "learning_rate": 9.999993405707267e-07, "loss": 0.0001, "reward": 1.789963722229004, "reward_std": 0.16524848341941833, "rewards/accuracy_reward": 0.7243385910987854, "rewards/format_reward": 1.0, "step": 17, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 333.515625, "epoch": 0.0005473786643960588, "grad_norm": 1.8108028298378926, "kl": 0.001678466796875, "learning_rate": 9.999992607090697e-07, "loss": 0.0001, "reward": 1.6446179151535034, "reward_std": 0.3519408702850342, "rewards/accuracy_reward": 0.5883679389953613, "rewards/format_reward": 1.0, "step": 18, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 347.828125, "epoch": 0.0005777885901958399, "grad_norm": 1.8002379603895076, "kl": 0.0018463134765625, "learning_rate": 9.999991762838942e-07, "loss": 0.0001, "reward": 1.800812005996704, "reward_std": 0.2275523990392685, "rewards/accuracy_reward": 0.6851869821548462, "rewards/format_reward": 1.0, "step": 19, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 440.3125, "epoch": 0.000608198515995621, "grad_norm": 1.7096354165169774, "kl": 0.001556396484375, "learning_rate": 9.999990872952006e-07, "loss": 0.0001, "reward": 1.7975349426269531, "reward_std": 0.342560738325119, "rewards/accuracy_reward": 0.716284990310669, "rewards/format_reward": 1.0, "step": 20, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 378.140625, "epoch": 0.0006386084417954021, "grad_norm": 2.48989439056807, "kl": 0.001953125, "learning_rate": 9.999989937429902e-07, "loss": 0.0001, "reward": 1.5938849449157715, "reward_std": 0.24751313030719757, "rewards/accuracy_reward": 0.5095100402832031, "rewards/format_reward": 1.0, "step": 21, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 347.703125, "epoch": 0.0006690183675951831, "grad_norm": 1.5987316397281808, "kl": 0.0026397705078125, "learning_rate": 9.999988956272633e-07, "loss": 0.0001, "reward": 1.5236949920654297, "reward_std": 0.24698606133460999, "rewards/accuracy_reward": 0.4674449861049652, "rewards/format_reward": 1.0, "step": 22, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 316.28125, "epoch": 0.0006994282933949641, "grad_norm": 1.6727439953130954, "kl": 0.0034027099609375, "learning_rate": 9.999987929480213e-07, "loss": 0.0001, "reward": 1.9693775177001953, "reward_std": 0.21827057003974915, "rewards/accuracy_reward": 0.8818774223327637, "rewards/format_reward": 1.0, "step": 23, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 392.59375, "epoch": 0.0007298382191947452, "grad_norm": 1.5077457194830788, "kl": 0.003173828125, "learning_rate": 9.99998685705265e-07, "loss": 0.0001, "reward": 1.9068952798843384, "reward_std": 0.16635803878307343, "rewards/accuracy_reward": 0.8318951725959778, "rewards/format_reward": 0.96875, "step": 24, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.640625, "epoch": 0.0007602481449945262, "grad_norm": 1.9822459834217656, "kl": 0.00299072265625, "learning_rate": 9.999985738989952e-07, "loss": 0.0001, "reward": 1.6628453731536865, "reward_std": 0.1725778877735138, "rewards/accuracy_reward": 0.6472203135490417, "rewards/format_reward": 0.96875, "step": 25, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 335.65625, "epoch": 0.0007906580707943072, "grad_norm": 1.8945284918235032, "kl": 0.003997802734375, "learning_rate": 9.99998457529213e-07, "loss": 0.0002, "reward": 1.953125, "reward_std": 0.2697862684726715, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 26, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 313.484375, "epoch": 0.0008210679965940883, "grad_norm": 2.2296562874480825, "kl": 0.00518798828125, "learning_rate": 9.999983365959195e-07, "loss": 0.0002, "reward": 1.853124976158142, "reward_std": 0.21263141930103302, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 27, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 356.921875, "epoch": 0.0008514779223938694, "grad_norm": 1.374397718368643, "kl": 0.004486083984375, "learning_rate": 9.99998211099116e-07, "loss": 0.0002, "reward": 1.5906494855880737, "reward_std": 0.09352891147136688, "rewards/accuracy_reward": 0.5406495332717896, "rewards/format_reward": 0.984375, "step": 28, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 377.09375, "epoch": 0.0008818878481936504, "grad_norm": 2.1876852440717003, "kl": 0.004486083984375, "learning_rate": 9.99998081038803e-07, "loss": 0.0002, "reward": 1.6968750953674316, "reward_std": 0.24924103915691376, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 29, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 389.6875, "epoch": 0.0009122977739934315, "grad_norm": 1.3752351130491731, "kl": 0.00482177734375, "learning_rate": 9.999979464149825e-07, "loss": 0.0002, "reward": 1.596991777420044, "reward_std": 0.26820871233940125, "rewards/accuracy_reward": 0.5251166820526123, "rewards/format_reward": 1.0, "step": 30, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 386.921875, "epoch": 0.0009427076997932125, "grad_norm": 1.508165110944859, "kl": 0.00579833984375, "learning_rate": 9.999978072276553e-07, "loss": 0.0002, "reward": 1.718550443649292, "reward_std": 0.35128822922706604, "rewards/accuracy_reward": 0.6154255270957947, "rewards/format_reward": 1.0, "step": 31, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 377.65625, "epoch": 0.0009731176255929935, "grad_norm": 1.2888692440946512, "kl": 0.00555419921875, "learning_rate": 9.999976634768226e-07, "loss": 0.0002, "reward": 1.8919233083724976, "reward_std": 0.345920205116272, "rewards/accuracy_reward": 0.7700482606887817, "rewards/format_reward": 1.0, "step": 32, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.703125, "epoch": 0.0010035275513927747, "grad_norm": 1.232348639633421, "kl": 0.005157470703125, "learning_rate": 9.99997515162486e-07, "loss": 0.0002, "reward": 1.8963067531585693, "reward_std": 0.19907650351524353, "rewards/accuracy_reward": 0.7713068723678589, "rewards/format_reward": 1.0, "step": 33, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 438.5625, "epoch": 0.0010339374771925556, "grad_norm": 1.3578169430467355, "kl": 0.005340576171875, "learning_rate": 9.999973622846466e-07, "loss": 0.0002, "reward": 1.593675136566162, "reward_std": 0.2628445625305176, "rewards/accuracy_reward": 0.5311751365661621, "rewards/format_reward": 0.96875, "step": 34, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 384.328125, "epoch": 0.0010643474029923368, "grad_norm": 1.5866826727618084, "kl": 0.00689697265625, "learning_rate": 9.99997204843306e-07, "loss": 0.0003, "reward": 1.8410686254501343, "reward_std": 0.33037933707237244, "rewards/accuracy_reward": 0.7223186492919922, "rewards/format_reward": 1.0, "step": 35, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 367.734375, "epoch": 0.0010947573287921177, "grad_norm": 1.3910018979925194, "kl": 0.009521484375, "learning_rate": 9.999970428384654e-07, "loss": 0.0004, "reward": 1.7983285188674927, "reward_std": 0.2062988579273224, "rewards/accuracy_reward": 0.7202036380767822, "rewards/format_reward": 0.984375, "step": 36, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 416.1875, "epoch": 0.0011251672545918988, "grad_norm": 3.4750962646915196, "kl": 0.00799560546875, "learning_rate": 9.999968762701265e-07, "loss": 0.0003, "reward": 1.5581188201904297, "reward_std": 0.23245224356651306, "rewards/accuracy_reward": 0.47686874866485596, "rewards/format_reward": 1.0, "step": 37, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 408.4375, "epoch": 0.0011555771803916798, "grad_norm": 1.4937905037107568, "kl": 0.00823974609375, "learning_rate": 9.999967051382908e-07, "loss": 0.0003, "reward": 1.7101589441299438, "reward_std": 0.21186581254005432, "rewards/accuracy_reward": 0.6289088726043701, "rewards/format_reward": 1.0, "step": 38, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 369.0625, "epoch": 0.001185987106191461, "grad_norm": 1.5830653150849738, "kl": 0.01007080078125, "learning_rate": 9.999965294429597e-07, "loss": 0.0004, "reward": 1.6895959377288818, "reward_std": 0.1790069043636322, "rewards/accuracy_reward": 0.5833459496498108, "rewards/format_reward": 1.0, "step": 39, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 406.390625, "epoch": 0.001216397031991242, "grad_norm": 1.4961036906202025, "kl": 0.007720947265625, "learning_rate": 9.99996349184135e-07, "loss": 0.0003, "reward": 1.6968750953674316, "reward_std": 0.21975873410701752, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 40, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 396.578125, "epoch": 0.001246806957791023, "grad_norm": 1.8720107464553335, "kl": 0.00830078125, "learning_rate": 9.99996164361818e-07, "loss": 0.0003, "reward": 1.3554950952529907, "reward_std": 0.10378493368625641, "rewards/accuracy_reward": 0.305495023727417, "rewards/format_reward": 1.0, "step": 41, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 405.359375, "epoch": 0.0012772168835908041, "grad_norm": 1.9376029926340375, "kl": 0.0111083984375, "learning_rate": 9.999959749760108e-07, "loss": 0.0004, "reward": 1.6944656372070312, "reward_std": 0.2876342236995697, "rewards/accuracy_reward": 0.5757156610488892, "rewards/format_reward": 1.0, "step": 42, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 352.34375, "epoch": 0.001307626809390585, "grad_norm": 1.247138658977461, "kl": 0.01123046875, "learning_rate": 9.99995781026715e-07, "loss": 0.0004, "reward": 2.0968751907348633, "reward_std": 0.15718582272529602, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 43, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 395.4375, "epoch": 0.0013380367351903662, "grad_norm": 1.1259095782576038, "kl": 0.01202392578125, "learning_rate": 9.999955825139322e-07, "loss": 0.0005, "reward": 1.735487699508667, "reward_std": 0.1868601143360138, "rewards/accuracy_reward": 0.6354876756668091, "rewards/format_reward": 0.984375, "step": 44, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 435.25, "epoch": 0.001368446660990147, "grad_norm": 1.4793116261961232, "kl": 0.00897216796875, "learning_rate": 9.999953794376643e-07, "loss": 0.0004, "reward": 1.7550036907196045, "reward_std": 0.37853968143463135, "rewards/accuracy_reward": 0.6425036191940308, "rewards/format_reward": 0.984375, "step": 45, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 413.34375, "epoch": 0.0013988565867899283, "grad_norm": 1.441985044139194, "kl": 0.01287841796875, "learning_rate": 9.999951717979134e-07, "loss": 0.0005, "reward": 1.7012786865234375, "reward_std": 0.22611986100673676, "rewards/accuracy_reward": 0.5919036269187927, "rewards/format_reward": 1.0, "step": 46, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 437.171875, "epoch": 0.0014292665125897092, "grad_norm": 2.566395975798807, "kl": 0.0111083984375, "learning_rate": 9.99994959594681e-07, "loss": 0.0004, "reward": 1.748474359512329, "reward_std": 0.3263983130455017, "rewards/accuracy_reward": 0.6359742879867554, "rewards/format_reward": 1.0, "step": 47, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 469.3125, "epoch": 0.0014596764383894903, "grad_norm": 1.2051169690229298, "kl": 0.0089111328125, "learning_rate": 9.999947428279694e-07, "loss": 0.0004, "reward": 1.3944621086120605, "reward_std": 0.32815349102020264, "rewards/accuracy_reward": 0.38508710265159607, "rewards/format_reward": 0.921875, "step": 48, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 420.796875, "epoch": 0.0014900863641892715, "grad_norm": 1.2857179475078493, "kl": 0.01416015625, "learning_rate": 9.999945214977802e-07, "loss": 0.0006, "reward": 1.730210304260254, "reward_std": 0.15597693622112274, "rewards/accuracy_reward": 0.633335292339325, "rewards/format_reward": 1.0, "step": 49, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 341.71875, "epoch": 0.0015204962899890524, "grad_norm": 1.6077028970954936, "kl": 0.0172119140625, "learning_rate": 9.999942956041157e-07, "loss": 0.0007, "reward": 1.4790146350860596, "reward_std": 0.20343194901943207, "rewards/accuracy_reward": 0.41338956356048584, "rewards/format_reward": 1.0, "step": 50, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 427.390625, "epoch": 0.0015509062157888335, "grad_norm": 1.2833602919748015, "kl": 0.0155029296875, "learning_rate": 9.999940651469777e-07, "loss": 0.0006, "reward": 1.6171908378601074, "reward_std": 0.16841214895248413, "rewards/accuracy_reward": 0.5328157544136047, "rewards/format_reward": 0.984375, "step": 51, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 473.90625, "epoch": 0.0015813161415886145, "grad_norm": 1.5347386121422006, "kl": 0.010498046875, "learning_rate": 9.999938301263689e-07, "loss": 0.0004, "reward": 1.446211576461792, "reward_std": 0.3887838125228882, "rewards/accuracy_reward": 0.45246145129203796, "rewards/format_reward": 0.90625, "step": 52, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.109375, "epoch": 0.0016117260673883956, "grad_norm": 1.3010726103581274, "kl": 0.017333984375, "learning_rate": 9.999935905422906e-07, "loss": 0.0007, "reward": 2.03125, "reward_std": 0.19370707869529724, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 53, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 375.0, "epoch": 0.0016421359931881765, "grad_norm": 1.8165008374461562, "kl": 0.01531982421875, "learning_rate": 9.999933463947457e-07, "loss": 0.0006, "reward": 1.7314881086349487, "reward_std": 0.254728227853775, "rewards/accuracy_reward": 0.6252381205558777, "rewards/format_reward": 1.0, "step": 54, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 448.5625, "epoch": 0.0016725459189879577, "grad_norm": 1.1805774442651564, "kl": 0.011962890625, "learning_rate": 9.99993097683736e-07, "loss": 0.0005, "reward": 1.5063714981079102, "reward_std": 0.17219069600105286, "rewards/accuracy_reward": 0.4501214325428009, "rewards/format_reward": 0.96875, "step": 55, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 419.53125, "epoch": 0.0017029558447877388, "grad_norm": 0.6537447709750156, "kl": 0.01507568359375, "learning_rate": 9.99992844409264e-07, "loss": 0.0006, "reward": 1.8656249046325684, "reward_std": 0.16710473597049713, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 56, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 421.296875, "epoch": 0.0017333657705875197, "grad_norm": 1.0263463835692015, "kl": 0.010986328125, "learning_rate": 9.99992586571332e-07, "loss": 0.0004, "reward": 1.945468783378601, "reward_std": 0.2802686393260956, "rewards/accuracy_reward": 0.7985937595367432, "rewards/format_reward": 1.0, "step": 57, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 437.375, "epoch": 0.0017637756963873009, "grad_norm": 1.076073320456093, "kl": 0.0130615234375, "learning_rate": 9.999923241699422e-07, "loss": 0.0005, "reward": 1.686951994895935, "reward_std": 0.19162070751190186, "rewards/accuracy_reward": 0.5869519114494324, "rewards/format_reward": 1.0, "step": 58, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 445.4375, "epoch": 0.0017941856221870818, "grad_norm": 0.9960617743408544, "kl": 0.0123291015625, "learning_rate": 9.99992057205097e-07, "loss": 0.0005, "reward": 1.6391254663467407, "reward_std": 0.24785034358501434, "rewards/accuracy_reward": 0.573500394821167, "rewards/format_reward": 0.9375, "step": 59, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 420.875, "epoch": 0.001824595547986863, "grad_norm": 6.483373705035169, "kl": 0.01336669921875, "learning_rate": 9.99991785676799e-07, "loss": 0.0005, "reward": 1.6313416957855225, "reward_std": 0.20466020703315735, "rewards/accuracy_reward": 0.5407165884971619, "rewards/format_reward": 1.0, "step": 60, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 375.78125, "epoch": 0.0018550054737866439, "grad_norm": 2.093180961005048, "kl": 0.015625, "learning_rate": 9.999915095850507e-07, "loss": 0.0006, "reward": 1.7097656726837158, "reward_std": 0.19712693989276886, "rewards/accuracy_reward": 0.603515625, "rewards/format_reward": 1.0, "step": 61, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 382.09375, "epoch": 0.001885415399586425, "grad_norm": 0.9596082118279999, "kl": 0.01544189453125, "learning_rate": 9.999912289298542e-07, "loss": 0.0006, "reward": 1.7470896244049072, "reward_std": 0.13650238513946533, "rewards/accuracy_reward": 0.6252144575119019, "rewards/format_reward": 1.0, "step": 62, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.953125, "epoch": 0.001915825325386206, "grad_norm": 0.9498841030661145, "kl": 0.015625, "learning_rate": 9.999909437112127e-07, "loss": 0.0006, "reward": 2.0062499046325684, "reward_std": 0.15554746985435486, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 63, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.390625, "epoch": 0.001946235251185987, "grad_norm": 1.4807535326882024, "kl": 0.01416015625, "learning_rate": 9.999906539291283e-07, "loss": 0.0006, "reward": 1.8768644332885742, "reward_std": 0.21333277225494385, "rewards/accuracy_reward": 0.7331144213676453, "rewards/format_reward": 1.0, "step": 64, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 427.296875, "epoch": 0.001976645176985768, "grad_norm": 1.214398911591177, "kl": 0.0101318359375, "learning_rate": 9.999903595836038e-07, "loss": 0.0004, "reward": 1.7070368528366089, "reward_std": 0.34790247678756714, "rewards/accuracy_reward": 0.6070367693901062, "rewards/format_reward": 1.0, "step": 65, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 461.140625, "epoch": 0.0020070551027855494, "grad_norm": 1.4959046818170132, "kl": 0.01165771484375, "learning_rate": 9.99990060674642e-07, "loss": 0.0005, "reward": 1.6366686820983887, "reward_std": 0.1298898160457611, "rewards/accuracy_reward": 0.5710436105728149, "rewards/format_reward": 0.953125, "step": 66, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 438.390625, "epoch": 0.0020374650285853303, "grad_norm": 4.538453262127269, "kl": 0.00933837890625, "learning_rate": 9.999897572022456e-07, "loss": 0.0004, "reward": 1.4098670482635498, "reward_std": 0.1763109415769577, "rewards/accuracy_reward": 0.34111708402633667, "rewards/format_reward": 0.984375, "step": 67, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 435.34375, "epoch": 0.0020678749543851112, "grad_norm": 2.0874902548005756, "kl": 0.0096435546875, "learning_rate": 9.99989449166417e-07, "loss": 0.0004, "reward": 1.7215529680252075, "reward_std": 0.21077515184879303, "rewards/accuracy_reward": 0.6215529441833496, "rewards/format_reward": 1.0, "step": 68, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 436.984375, "epoch": 0.002098284880184892, "grad_norm": 1.7146351828596011, "kl": 0.0113525390625, "learning_rate": 9.999891365671596e-07, "loss": 0.0005, "reward": 1.6222567558288574, "reward_std": 0.27506405115127563, "rewards/accuracy_reward": 0.5222567915916443, "rewards/format_reward": 1.0, "step": 69, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.46875, "epoch": 0.0021286948059846735, "grad_norm": 2.197154462353113, "kl": 0.0166015625, "learning_rate": 9.99988819404476e-07, "loss": 0.0007, "reward": 1.8892011642456055, "reward_std": 0.23192350566387177, "rewards/accuracy_reward": 0.7548261284828186, "rewards/format_reward": 1.0, "step": 70, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 394.96875, "epoch": 0.0021591047317844544, "grad_norm": 1.1189918218015957, "kl": 0.013671875, "learning_rate": 9.999884976783688e-07, "loss": 0.0005, "reward": 1.357374668121338, "reward_std": 0.22493042051792145, "rewards/accuracy_reward": 0.332374632358551, "rewards/format_reward": 0.96875, "step": 71, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.453125, "epoch": 0.0021895146575842354, "grad_norm": 1.451347471947504, "kl": 0.015869140625, "learning_rate": 9.999881713888414e-07, "loss": 0.0006, "reward": 1.8340792655944824, "reward_std": 0.17378279566764832, "rewards/accuracy_reward": 0.6965792179107666, "rewards/format_reward": 1.0, "step": 72, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 388.84375, "epoch": 0.0022199245833840167, "grad_norm": 1.685924842797364, "kl": 0.01220703125, "learning_rate": 9.999878405358967e-07, "loss": 0.0005, "reward": 1.6226558685302734, "reward_std": 0.18531137704849243, "rewards/accuracy_reward": 0.5382808446884155, "rewards/format_reward": 1.0, "step": 73, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 463.265625, "epoch": 0.0022503345091837977, "grad_norm": 1.6190723344090046, "kl": 0.01031494140625, "learning_rate": 9.999875051195372e-07, "loss": 0.0004, "reward": 1.6120021343231201, "reward_std": 0.3168191909790039, "rewards/accuracy_reward": 0.524502158164978, "rewards/format_reward": 1.0, "step": 74, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 451.765625, "epoch": 0.0022807444349835786, "grad_norm": 3.5254170467954453, "kl": 0.01220703125, "learning_rate": 9.999871651397667e-07, "loss": 0.0005, "reward": 1.6127300262451172, "reward_std": 0.15051430463790894, "rewards/accuracy_reward": 0.5596050024032593, "rewards/format_reward": 0.984375, "step": 75, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 383.015625, "epoch": 0.0023111543607833595, "grad_norm": 0.8987896270440928, "kl": 0.016357421875, "learning_rate": 9.999868205965878e-07, "loss": 0.0007, "reward": 1.6875, "reward_std": 0.21905139088630676, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 76, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 385.859375, "epoch": 0.002341564286583141, "grad_norm": 1.2105123411895744, "kl": 0.0140380859375, "learning_rate": 9.99986471490004e-07, "loss": 0.0006, "reward": 1.7625000476837158, "reward_std": 0.20695412158966064, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 77, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 391.59375, "epoch": 0.002371974212382922, "grad_norm": 1.6022385015193465, "kl": 0.01318359375, "learning_rate": 9.99986117820018e-07, "loss": 0.0005, "reward": 1.801504373550415, "reward_std": 0.24511218070983887, "rewards/accuracy_reward": 0.6733792424201965, "rewards/format_reward": 1.0, "step": 78, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 427.078125, "epoch": 0.0024023841381827027, "grad_norm": 1.2043719522172864, "kl": 0.01116943359375, "learning_rate": 9.999857595866335e-07, "loss": 0.0004, "reward": 1.7467107772827148, "reward_std": 0.17070460319519043, "rewards/accuracy_reward": 0.6467106938362122, "rewards/format_reward": 1.0, "step": 79, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 431.390625, "epoch": 0.002432794063982484, "grad_norm": 2.108227862622634, "kl": 0.01446533203125, "learning_rate": 9.999853967898537e-07, "loss": 0.0006, "reward": 1.3918615579605103, "reward_std": 0.11718933284282684, "rewards/accuracy_reward": 0.32623645663261414, "rewards/format_reward": 1.0, "step": 80, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 404.5, "epoch": 0.002463203989782265, "grad_norm": 0.9692539817597077, "kl": 0.01318359375, "learning_rate": 9.999850294296816e-07, "loss": 0.0005, "reward": 1.8312500715255737, "reward_std": 0.16712023317813873, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 81, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 402.109375, "epoch": 0.002493613915582046, "grad_norm": 0.8761166873709011, "kl": 0.01409912109375, "learning_rate": 9.99984657506121e-07, "loss": 0.0006, "reward": 1.6949537992477417, "reward_std": 0.12685883045196533, "rewards/accuracy_reward": 0.6105788350105286, "rewards/format_reward": 1.0, "step": 82, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 435.03125, "epoch": 0.002524023841381827, "grad_norm": 2.2100798784549505, "kl": 0.01092529296875, "learning_rate": 9.999842810191747e-07, "loss": 0.0004, "reward": 1.64670991897583, "reward_std": 0.2460305094718933, "rewards/accuracy_reward": 0.540459930896759, "rewards/format_reward": 1.0, "step": 83, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 395.890625, "epoch": 0.0025544337671816082, "grad_norm": 1.3487257761624414, "kl": 0.01507568359375, "learning_rate": 9.999838999688465e-07, "loss": 0.0006, "reward": 1.8760416507720947, "reward_std": 0.13139429688453674, "rewards/accuracy_reward": 0.7635416984558105, "rewards/format_reward": 1.0, "step": 84, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 425.90625, "epoch": 0.002584843692981389, "grad_norm": 1.4077493051045469, "kl": 0.012451171875, "learning_rate": 9.999835143551402e-07, "loss": 0.0005, "reward": 1.8503814935684204, "reward_std": 0.19670003652572632, "rewards/accuracy_reward": 0.7191314101219177, "rewards/format_reward": 1.0, "step": 85, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 409.921875, "epoch": 0.00261525361878117, "grad_norm": 1.326853347999934, "kl": 0.01263427734375, "learning_rate": 9.999831241780588e-07, "loss": 0.0005, "reward": 1.9051461219787598, "reward_std": 0.13108865916728973, "rewards/accuracy_reward": 0.7707711458206177, "rewards/format_reward": 1.0, "step": 86, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 408.6875, "epoch": 0.0026456635445809514, "grad_norm": 1.5100265978927194, "kl": 0.01513671875, "learning_rate": 9.99982729437606e-07, "loss": 0.0006, "reward": 1.7437500953674316, "reward_std": 0.31241440773010254, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.9375, "step": 87, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 389.625, "epoch": 0.0026760734703807324, "grad_norm": 1.3926909394883846, "kl": 0.01434326171875, "learning_rate": 9.999823301337855e-07, "loss": 0.0006, "reward": 1.8625000715255737, "reward_std": 0.14150118827819824, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 88, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 431.109375, "epoch": 0.0027064833961805133, "grad_norm": 1.859221024826941, "kl": 0.0111083984375, "learning_rate": 9.99981926266601e-07, "loss": 0.0004, "reward": 1.78244948387146, "reward_std": 0.28553903102874756, "rewards/accuracy_reward": 0.666824460029602, "rewards/format_reward": 0.984375, "step": 89, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 412.21875, "epoch": 0.002736893321980294, "grad_norm": 1.620012331539679, "kl": 0.01214599609375, "learning_rate": 9.99981517836056e-07, "loss": 0.0005, "reward": 1.9375, "reward_std": 0.20358122885227203, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 90, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 399.875, "epoch": 0.0027673032477800756, "grad_norm": 1.0283293281513692, "kl": 0.01190185546875, "learning_rate": 9.999811048421543e-07, "loss": 0.0005, "reward": 1.671875, "reward_std": 0.08074887096881866, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 91, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 434.25, "epoch": 0.0027977131735798565, "grad_norm": 1.2589396013442424, "kl": 0.01177978515625, "learning_rate": 9.999806872848996e-07, "loss": 0.0005, "reward": 1.740625023841858, "reward_std": 0.20894074440002441, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 92, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.765625, "epoch": 0.0028281230993796374, "grad_norm": 1.2290289686000981, "kl": 0.01214599609375, "learning_rate": 9.999802651642959e-07, "loss": 0.0005, "reward": 1.870416522026062, "reward_std": 0.19986608624458313, "rewards/accuracy_reward": 0.8079164624214172, "rewards/format_reward": 0.953125, "step": 93, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 421.109375, "epoch": 0.0028585330251794184, "grad_norm": 1.6494777381764956, "kl": 0.012939453125, "learning_rate": 9.999798384803468e-07, "loss": 0.0005, "reward": 1.7553774118423462, "reward_std": 0.3744775652885437, "rewards/accuracy_reward": 0.6335023045539856, "rewards/format_reward": 1.0, "step": 94, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 390.984375, "epoch": 0.0028889429509791997, "grad_norm": 1.2261005372553526, "kl": 0.0135498046875, "learning_rate": 9.999794072330564e-07, "loss": 0.0005, "reward": 1.821874976158142, "reward_std": 0.3170676827430725, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "step": 95, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.6875, "epoch": 0.0029193528767789806, "grad_norm": 1.320081343511887, "kl": 0.0120849609375, "learning_rate": 9.999789714224287e-07, "loss": 0.0005, "reward": 1.955208420753479, "reward_std": 0.2944028377532959, "rewards/accuracy_reward": 0.8020833134651184, "rewards/format_reward": 1.0, "step": 96, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.765625, "epoch": 0.0029497628025787616, "grad_norm": 1.7815217711393854, "kl": 0.015869140625, "learning_rate": 9.999785310484673e-07, "loss": 0.0006, "reward": 2.0625, "reward_std": 0.16938352584838867, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 97, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 400.140625, "epoch": 0.002980172728378543, "grad_norm": 1.8925236124886895, "kl": 0.01336669921875, "learning_rate": 9.999780861111765e-07, "loss": 0.0005, "reward": 1.8936694860458374, "reward_std": 0.21802827715873718, "rewards/accuracy_reward": 0.7467944622039795, "rewards/format_reward": 1.0, "step": 98, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 425.28125, "epoch": 0.003010582654178324, "grad_norm": 1.3099964918558389, "kl": 0.0133056640625, "learning_rate": 9.999776366105606e-07, "loss": 0.0005, "reward": 1.855330467224121, "reward_std": 0.3692500591278076, "rewards/accuracy_reward": 0.7553303837776184, "rewards/format_reward": 0.96875, "step": 99, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 371.8125, "epoch": 0.0030409925799781048, "grad_norm": 1.6107059297561335, "kl": 0.01300048828125, "learning_rate": 9.999771825466232e-07, "loss": 0.0005, "reward": 1.9406250715255737, "reward_std": 0.2936359941959381, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 100, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 381.0, "epoch": 0.0030714025057778857, "grad_norm": 1.1435506156259134, "kl": 0.01336669921875, "learning_rate": 9.999767239193688e-07, "loss": 0.0005, "reward": 1.6163592338562012, "reward_std": 0.10960143804550171, "rewards/accuracy_reward": 0.5069841742515564, "rewards/format_reward": 1.0, "step": 101, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.046875, "epoch": 0.003101812431577667, "grad_norm": 1.191072112005555, "kl": 0.012939453125, "learning_rate": 9.999762607288014e-07, "loss": 0.0005, "reward": 2.0187501907348633, "reward_std": 0.18259362876415253, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 102, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.6875, "epoch": 0.003132222357377448, "grad_norm": 1.2120608878998178, "kl": 0.0130615234375, "learning_rate": 9.999757929749252e-07, "loss": 0.0005, "reward": 1.8628289699554443, "reward_std": 0.11512254178524017, "rewards/accuracy_reward": 0.7128288745880127, "rewards/format_reward": 1.0, "step": 103, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 365.703125, "epoch": 0.003162632283177229, "grad_norm": 1.5119521598542058, "kl": 0.0135498046875, "learning_rate": 9.999753206577448e-07, "loss": 0.0005, "reward": 1.9562499523162842, "reward_std": 0.2622116506099701, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 104, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 454.21875, "epoch": 0.0031930422089770103, "grad_norm": 1.4115809218417046, "kl": 0.0108642578125, "learning_rate": 9.99974843777264e-07, "loss": 0.0004, "reward": 1.4562711715698242, "reward_std": 0.34898149967193604, "rewards/accuracy_reward": 0.40627118945121765, "rewards/format_reward": 0.96875, "step": 105, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 375.03125, "epoch": 0.003223452134776791, "grad_norm": 1.1943710506900078, "kl": 0.01202392578125, "learning_rate": 9.999743623334876e-07, "loss": 0.0005, "reward": 1.7741870880126953, "reward_std": 0.17306214570999146, "rewards/accuracy_reward": 0.6648120880126953, "rewards/format_reward": 1.0, "step": 106, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 378.46875, "epoch": 0.003253862060576572, "grad_norm": 1.4729281075548473, "kl": 0.0103759765625, "learning_rate": 9.999738763264197e-07, "loss": 0.0004, "reward": 1.9126482009887695, "reward_std": 0.32268238067626953, "rewards/accuracy_reward": 0.7657730579376221, "rewards/format_reward": 0.984375, "step": 107, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 446.046875, "epoch": 0.003284271986376353, "grad_norm": 1.705785582140939, "kl": 0.00927734375, "learning_rate": 9.999733857560649e-07, "loss": 0.0004, "reward": 1.613050103187561, "reward_std": 0.27703195810317993, "rewards/accuracy_reward": 0.5661750435829163, "rewards/format_reward": 0.953125, "step": 108, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 499.96875, "epoch": 0.0033146819121761344, "grad_norm": 0.9622434953213093, "kl": 0.00909423828125, "learning_rate": 9.999728906224276e-07, "loss": 0.0004, "reward": 1.2282872200012207, "reward_std": 0.09929470717906952, "rewards/accuracy_reward": 0.23766222596168518, "rewards/format_reward": 0.96875, "step": 109, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 438.3125, "epoch": 0.0033450918379759153, "grad_norm": 1.697222097259729, "kl": 0.010498046875, "learning_rate": 9.999723909255124e-07, "loss": 0.0004, "reward": 1.6912615299224854, "reward_std": 0.24412010610103607, "rewards/accuracy_reward": 0.6318864822387695, "rewards/format_reward": 0.953125, "step": 110, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 412.703125, "epoch": 0.0033755017637756963, "grad_norm": 1.2369543491609039, "kl": 0.01123046875, "learning_rate": 9.999718866653237e-07, "loss": 0.0005, "reward": 1.7088282108306885, "reward_std": 0.3435198664665222, "rewards/accuracy_reward": 0.624453067779541, "rewards/format_reward": 0.96875, "step": 111, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 422.1875, "epoch": 0.0034059116895754776, "grad_norm": 1.8507689567213426, "kl": 0.01190185546875, "learning_rate": 9.999713778418665e-07, "loss": 0.0005, "reward": 1.9233193397521973, "reward_std": 0.22450970113277435, "rewards/accuracy_reward": 0.8108192682266235, "rewards/format_reward": 0.984375, "step": 112, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 404.984375, "epoch": 0.0034363216153752586, "grad_norm": 1.399880525066186, "kl": 0.0133056640625, "learning_rate": 9.999708644551448e-07, "loss": 0.0005, "reward": 1.4375, "reward_std": 0.17550253868103027, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 113, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.15625, "epoch": 0.0034667315411750395, "grad_norm": 1.8157402050726867, "kl": 0.01275634765625, "learning_rate": 9.999703465051638e-07, "loss": 0.0005, "reward": 1.6812500953674316, "reward_std": 0.27547648549079895, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 114, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 374.140625, "epoch": 0.0034971414669748204, "grad_norm": 1.3256303335408497, "kl": 0.01416015625, "learning_rate": 9.999698239919282e-07, "loss": 0.0006, "reward": 1.5202667713165283, "reward_std": 0.23908200860023499, "rewards/accuracy_reward": 0.44214165210723877, "rewards/format_reward": 1.0, "step": 115, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.421875, "epoch": 0.0035275513927746018, "grad_norm": 1.2711021744130893, "kl": 0.01104736328125, "learning_rate": 9.999692969154423e-07, "loss": 0.0004, "reward": 1.7680635452270508, "reward_std": 0.337743878364563, "rewards/accuracy_reward": 0.6586884260177612, "rewards/format_reward": 1.0, "step": 116, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 433.625, "epoch": 0.0035579613185743827, "grad_norm": 0.9873440205784932, "kl": 0.00787353515625, "learning_rate": 9.999687652757116e-07, "loss": 0.0003, "reward": 1.649999976158142, "reward_std": 0.15633204579353333, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 117, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 384.71875, "epoch": 0.0035883712443741636, "grad_norm": 1.1390425401656346, "kl": 0.01513671875, "learning_rate": 9.999682290727402e-07, "loss": 0.0006, "reward": 1.738194465637207, "reward_std": 0.1970047652721405, "rewards/accuracy_reward": 0.6163194179534912, "rewards/format_reward": 1.0, "step": 118, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 379.828125, "epoch": 0.003618781170173945, "grad_norm": 1.5954146720231357, "kl": 0.010986328125, "learning_rate": 9.999676883065339e-07, "loss": 0.0004, "reward": 1.8111308813095093, "reward_std": 0.20661289989948273, "rewards/accuracy_reward": 0.6767559051513672, "rewards/format_reward": 1.0, "step": 119, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 395.625, "epoch": 0.003649191095973726, "grad_norm": 1.0695744299357113, "kl": 0.014892578125, "learning_rate": 9.999671429770967e-07, "loss": 0.0006, "reward": 1.6961580514907837, "reward_std": 0.14148113131523132, "rewards/accuracy_reward": 0.6180330514907837, "rewards/format_reward": 0.96875, "step": 120, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.875, "epoch": 0.003679601021773507, "grad_norm": 1.945354071536962, "kl": 0.0123291015625, "learning_rate": 9.999665930844342e-07, "loss": 0.0005, "reward": 1.8152601718902588, "reward_std": 0.23605464398860931, "rewards/accuracy_reward": 0.7152600288391113, "rewards/format_reward": 0.96875, "step": 121, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.671875, "epoch": 0.0037100109475732878, "grad_norm": 1.0968894228795836, "kl": 0.01275634765625, "learning_rate": 9.999660386285512e-07, "loss": 0.0005, "reward": 1.753679871559143, "reward_std": 0.25608769059181213, "rewards/accuracy_reward": 0.6568049192428589, "rewards/format_reward": 0.984375, "step": 122, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.671875, "epoch": 0.003740420873373069, "grad_norm": 1.4673063371872073, "kl": 0.01275634765625, "learning_rate": 9.999654796094527e-07, "loss": 0.0005, "reward": 1.9900388717651367, "reward_std": 0.1983608454465866, "rewards/accuracy_reward": 0.8587887287139893, "rewards/format_reward": 1.0, "step": 123, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 381.640625, "epoch": 0.00377083079917285, "grad_norm": 1.6416560271510383, "kl": 0.012939453125, "learning_rate": 9.99964916027144e-07, "loss": 0.0005, "reward": 1.9000000953674316, "reward_std": 0.17067179083824158, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 124, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 457.40625, "epoch": 0.003801240724972631, "grad_norm": 1.0225803997486793, "kl": 0.012451171875, "learning_rate": 9.999643478816301e-07, "loss": 0.0005, "reward": 1.763511061668396, "reward_std": 0.2219434380531311, "rewards/accuracy_reward": 0.7291359901428223, "rewards/format_reward": 0.921875, "step": 125, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 448.859375, "epoch": 0.003831650650772412, "grad_norm": 8.352785593061897, "kl": 0.01104736328125, "learning_rate": 9.999637751729162e-07, "loss": 0.0004, "reward": 1.6567270755767822, "reward_std": 0.14242099225521088, "rewards/accuracy_reward": 0.5473519563674927, "rewards/format_reward": 1.0, "step": 126, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 438.0, "epoch": 0.0038620605765721933, "grad_norm": 1.0799127843001628, "kl": 0.00970458984375, "learning_rate": 9.999631979010075e-07, "loss": 0.0004, "reward": 1.755429744720459, "reward_std": 0.33935219049453735, "rewards/accuracy_reward": 0.6710547208786011, "rewards/format_reward": 0.96875, "step": 127, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 441.359375, "epoch": 0.003892470502371974, "grad_norm": 1.0661924261754825, "kl": 0.01324462890625, "learning_rate": 9.999626160659094e-07, "loss": 0.0005, "reward": 1.5311402082443237, "reward_std": 0.22228728234767914, "rewards/accuracy_reward": 0.49051526188850403, "rewards/format_reward": 0.96875, "step": 128, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 400.703125, "epoch": 0.003922880428171755, "grad_norm": 1.3569775529091876, "kl": 0.01318359375, "learning_rate": 9.999620296676273e-07, "loss": 0.0005, "reward": 1.6112319231033325, "reward_std": 0.16228729486465454, "rewards/accuracy_reward": 0.5049818754196167, "rewards/format_reward": 1.0, "step": 129, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.71875, "epoch": 0.003953290353971536, "grad_norm": 1.3836269306579518, "kl": 0.01495361328125, "learning_rate": 9.99961438706166e-07, "loss": 0.0006, "reward": 1.8607784509658813, "reward_std": 0.11711832135915756, "rewards/accuracy_reward": 0.7232784628868103, "rewards/format_reward": 1.0, "step": 130, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 394.15625, "epoch": 0.003983700279771317, "grad_norm": 1.4107358207677978, "kl": 0.015625, "learning_rate": 9.999608431815313e-07, "loss": 0.0006, "reward": 1.8250000476837158, "reward_std": 0.18652412295341492, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 131, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 367.390625, "epoch": 0.004014110205571099, "grad_norm": 1.1726251380447597, "kl": 0.0177001953125, "learning_rate": 9.99960243093729e-07, "loss": 0.0007, "reward": 1.8555676937103271, "reward_std": 0.1450468897819519, "rewards/accuracy_reward": 0.7180677056312561, "rewards/format_reward": 1.0, "step": 132, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.953125, "epoch": 0.00404452013137088, "grad_norm": 1.3575715734166978, "kl": 0.015380859375, "learning_rate": 9.999596384427637e-07, "loss": 0.0006, "reward": 1.835242748260498, "reward_std": 0.2508818805217743, "rewards/accuracy_reward": 0.7227427363395691, "rewards/format_reward": 1.0, "step": 133, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 377.84375, "epoch": 0.004074930057170661, "grad_norm": 1.139747620935481, "kl": 0.014404296875, "learning_rate": 9.999590292286418e-07, "loss": 0.0006, "reward": 1.6535420417785645, "reward_std": 0.2498963177204132, "rewards/accuracy_reward": 0.5535419583320618, "rewards/format_reward": 1.0, "step": 134, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 415.859375, "epoch": 0.0041053399829704415, "grad_norm": 1.402961959420838, "kl": 0.01483154296875, "learning_rate": 9.999584154513682e-07, "loss": 0.0006, "reward": 1.9603800773620605, "reward_std": 0.050045616924762726, "rewards/accuracy_reward": 0.8135050535202026, "rewards/format_reward": 1.0, "step": 135, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 388.9375, "epoch": 0.0041357499087702225, "grad_norm": 2.2987765574575882, "kl": 0.01361083984375, "learning_rate": 9.999577971109489e-07, "loss": 0.0005, "reward": 1.7411651611328125, "reward_std": 0.11923021078109741, "rewards/accuracy_reward": 0.6224152445793152, "rewards/format_reward": 1.0, "step": 136, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 422.671875, "epoch": 0.004166159834570003, "grad_norm": 1.1635082350786998, "kl": 0.01416015625, "learning_rate": 9.999571742073893e-07, "loss": 0.0006, "reward": 1.6062500476837158, "reward_std": 0.2297685146331787, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 137, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 462.015625, "epoch": 0.004196569760369784, "grad_norm": 1.639453929022012, "kl": 0.01263427734375, "learning_rate": 9.999565467406952e-07, "loss": 0.0005, "reward": 1.8578273057937622, "reward_std": 0.09192578494548798, "rewards/accuracy_reward": 0.7234523296356201, "rewards/format_reward": 1.0, "step": 138, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 499.546875, "epoch": 0.004226979686169566, "grad_norm": 1.7563537775719584, "kl": 0.00970458984375, "learning_rate": 9.999559147108725e-07, "loss": 0.0004, "reward": 1.5782231092453003, "reward_std": 0.18151439726352692, "rewards/accuracy_reward": 0.5344730615615845, "rewards/format_reward": 0.9375, "step": 139, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 392.390625, "epoch": 0.004257389611969347, "grad_norm": 1.1406199565944355, "kl": 0.01708984375, "learning_rate": 9.999552781179266e-07, "loss": 0.0007, "reward": 1.5926470756530762, "reward_std": 0.18690401315689087, "rewards/accuracy_reward": 0.49264708161354065, "rewards/format_reward": 1.0, "step": 140, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 375.65625, "epoch": 0.004287799537769128, "grad_norm": 1.3605958847837936, "kl": 0.0224609375, "learning_rate": 9.999546369618637e-07, "loss": 0.0009, "reward": 1.915426254272461, "reward_std": 0.11051365733146667, "rewards/accuracy_reward": 0.7591762542724609, "rewards/format_reward": 1.0, "step": 141, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 472.109375, "epoch": 0.004318209463568909, "grad_norm": 0.9698083088398921, "kl": 0.01263427734375, "learning_rate": 9.999539912426892e-07, "loss": 0.0005, "reward": 1.4461592435836792, "reward_std": 0.0903548151254654, "rewards/accuracy_reward": 0.3805343210697174, "rewards/format_reward": 1.0, "step": 142, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.484375, "epoch": 0.00434861938936869, "grad_norm": 1.040035997424916, "kl": 0.014404296875, "learning_rate": 9.999533409604093e-07, "loss": 0.0006, "reward": 1.631250023841858, "reward_std": 0.11976678669452667, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 143, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 450.484375, "epoch": 0.004379029315168471, "grad_norm": 1.1421509953959035, "kl": 0.0157470703125, "learning_rate": 9.9995268611503e-07, "loss": 0.0006, "reward": 1.6714529991149902, "reward_std": 0.12337745726108551, "rewards/accuracy_reward": 0.5558279156684875, "rewards/format_reward": 1.0, "step": 144, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 423.390625, "epoch": 0.004409439240968252, "grad_norm": 1.0759991753626605, "kl": 0.0196533203125, "learning_rate": 9.999520267065572e-07, "loss": 0.0008, "reward": 1.9781577587127686, "reward_std": 0.1568175107240677, "rewards/accuracy_reward": 0.8344077467918396, "rewards/format_reward": 1.0, "step": 145, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 452.640625, "epoch": 0.0044398491667680335, "grad_norm": 0.8519314536708666, "kl": 0.018798828125, "learning_rate": 9.999513627349967e-07, "loss": 0.0008, "reward": 1.6944670677185059, "reward_std": 0.12818582355976105, "rewards/accuracy_reward": 0.63509202003479, "rewards/format_reward": 0.9375, "step": 146, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 451.046875, "epoch": 0.004470259092567814, "grad_norm": 0.90631585986892, "kl": 0.01171875, "learning_rate": 9.999506942003547e-07, "loss": 0.0005, "reward": 1.667960524559021, "reward_std": 0.19165682792663574, "rewards/accuracy_reward": 0.5523355007171631, "rewards/format_reward": 1.0, "step": 147, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 424.140625, "epoch": 0.004500669018367595, "grad_norm": 1.132809510162569, "kl": 0.0189208984375, "learning_rate": 9.999500211026376e-07, "loss": 0.0008, "reward": 1.8894298076629639, "reward_std": 0.1842423528432846, "rewards/accuracy_reward": 0.7863047122955322, "rewards/format_reward": 0.953125, "step": 148, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 409.46875, "epoch": 0.004531078944167376, "grad_norm": 1.0125047844435144, "kl": 0.0203857421875, "learning_rate": 9.999493434418508e-07, "loss": 0.0008, "reward": 1.9479596614837646, "reward_std": 0.1112360879778862, "rewards/accuracy_reward": 0.7948344945907593, "rewards/format_reward": 1.0, "step": 149, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 492.828125, "epoch": 0.004561488869967157, "grad_norm": 1.3245815716926614, "kl": 0.00982666015625, "learning_rate": 9.999486612180015e-07, "loss": 0.0004, "reward": 1.860871434211731, "reward_std": 0.26773563027381897, "rewards/accuracy_reward": 0.7546213865280151, "rewards/format_reward": 0.984375, "step": 150, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 427.171875, "epoch": 0.004591898795766938, "grad_norm": 1.2321894970714469, "kl": 0.01611328125, "learning_rate": 9.99947974431095e-07, "loss": 0.0006, "reward": 1.4892878532409668, "reward_std": 0.16884982585906982, "rewards/accuracy_reward": 0.4111628234386444, "rewards/format_reward": 1.0, "step": 151, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 428.9375, "epoch": 0.004622308721566719, "grad_norm": 1.1213278922266443, "kl": 0.01519775390625, "learning_rate": 9.999472830811382e-07, "loss": 0.0006, "reward": 1.8488322496414185, "reward_std": 0.09654758870601654, "rewards/accuracy_reward": 0.7113322615623474, "rewards/format_reward": 1.0, "step": 152, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 391.171875, "epoch": 0.004652718647366501, "grad_norm": 0.8304602358235011, "kl": 0.0218505859375, "learning_rate": 9.99946587168137e-07, "loss": 0.0009, "reward": 1.941306710243225, "reward_std": 0.029227139428257942, "rewards/accuracy_reward": 0.775681734085083, "rewards/format_reward": 1.0, "step": 153, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 418.453125, "epoch": 0.004683128573166282, "grad_norm": 1.2554274834074644, "kl": 0.01446533203125, "learning_rate": 9.99945886692098e-07, "loss": 0.0006, "reward": 1.9105966091156006, "reward_std": 0.14083711802959442, "rewards/accuracy_reward": 0.7605965733528137, "rewards/format_reward": 1.0, "step": 154, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 417.09375, "epoch": 0.004713538498966063, "grad_norm": 1.405928031553138, "kl": 0.0135498046875, "learning_rate": 9.999451816530275e-07, "loss": 0.0005, "reward": 1.6618525981903076, "reward_std": 0.21694952249526978, "rewards/accuracy_reward": 0.5556026101112366, "rewards/format_reward": 0.984375, "step": 155, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 373.140625, "epoch": 0.004743948424765844, "grad_norm": 1.0697919086566865, "kl": 0.0196533203125, "learning_rate": 9.99944472050932e-07, "loss": 0.0008, "reward": 1.821874976158142, "reward_std": 0.147711843252182, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 156, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.296875, "epoch": 0.0047743583505656245, "grad_norm": 1.3137637121377133, "kl": 0.0184326171875, "learning_rate": 9.99943757885818e-07, "loss": 0.0007, "reward": 1.8315324783325195, "reward_std": 0.17180569469928741, "rewards/accuracy_reward": 0.6971573829650879, "rewards/format_reward": 1.0, "step": 157, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 450.078125, "epoch": 0.0048047682763654054, "grad_norm": 1.0325763822116654, "kl": 0.014404296875, "learning_rate": 9.999430391576918e-07, "loss": 0.0006, "reward": 1.594119668006897, "reward_std": 0.18696415424346924, "rewards/accuracy_reward": 0.515994668006897, "rewards/format_reward": 1.0, "step": 158, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 376.640625, "epoch": 0.004835178202165186, "grad_norm": 0.996193873185499, "kl": 0.019287109375, "learning_rate": 9.9994231586656e-07, "loss": 0.0008, "reward": 1.7409956455230713, "reward_std": 0.12510690093040466, "rewards/accuracy_reward": 0.6159957647323608, "rewards/format_reward": 1.0, "step": 159, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.015625, "epoch": 0.004865588127964968, "grad_norm": 1.1598439380998047, "kl": 0.019775390625, "learning_rate": 9.999415880124296e-07, "loss": 0.0008, "reward": 1.9418286085128784, "reward_std": 0.2405511885881424, "rewards/accuracy_reward": 0.7762036323547363, "rewards/format_reward": 1.0, "step": 160, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 397.25, "epoch": 0.004895998053764749, "grad_norm": 1.2265609150598025, "kl": 0.017822265625, "learning_rate": 9.999408555953066e-07, "loss": 0.0007, "reward": 1.9905532598495483, "reward_std": 0.05020171031355858, "rewards/accuracy_reward": 0.8124282360076904, "rewards/format_reward": 1.0, "step": 161, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 405.765625, "epoch": 0.00492640797956453, "grad_norm": 0.5407968570662303, "kl": 0.014892578125, "learning_rate": 9.999401186151983e-07, "loss": 0.0006, "reward": 1.7437500953674316, "reward_std": 0.09669842571020126, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 162, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 412.28125, "epoch": 0.004956817905364311, "grad_norm": 0.9864724322545501, "kl": 0.01495361328125, "learning_rate": 9.99939377072111e-07, "loss": 0.0006, "reward": 1.6354146003723145, "reward_std": 0.28268858790397644, "rewards/accuracy_reward": 0.5416646003723145, "rewards/format_reward": 0.984375, "step": 163, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.84375, "epoch": 0.004987227831164092, "grad_norm": 1.0647402377184647, "kl": 0.0205078125, "learning_rate": 9.999386309660518e-07, "loss": 0.0008, "reward": 1.8802317380905151, "reward_std": 0.18236258625984192, "rewards/accuracy_reward": 0.7396067976951599, "rewards/format_reward": 0.984375, "step": 164, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 403.796875, "epoch": 0.005017637756963873, "grad_norm": 1.3360443785792222, "kl": 0.017822265625, "learning_rate": 9.999378802970273e-07, "loss": 0.0007, "reward": 1.7311592102050781, "reward_std": 0.37493041157722473, "rewards/accuracy_reward": 0.6342841386795044, "rewards/format_reward": 0.984375, "step": 165, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 414.21875, "epoch": 0.005048047682763654, "grad_norm": 1.227267912895048, "kl": 0.015625, "learning_rate": 9.999371250650443e-07, "loss": 0.0006, "reward": 1.7867743968963623, "reward_std": 0.10373984277248383, "rewards/accuracy_reward": 0.6555243730545044, "rewards/format_reward": 1.0, "step": 166, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 381.265625, "epoch": 0.0050784576085634355, "grad_norm": 5.573274860294901, "kl": 0.0186767578125, "learning_rate": 9.999363652701097e-07, "loss": 0.0007, "reward": 1.8781249523162842, "reward_std": 0.189039409160614, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 167, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.1875, "epoch": 0.0051088675343632164, "grad_norm": 1.5245968420451688, "kl": 0.015869140625, "learning_rate": 9.999356009122307e-07, "loss": 0.0006, "reward": 1.8024578094482422, "reward_std": 0.19867169857025146, "rewards/accuracy_reward": 0.6774578094482422, "rewards/format_reward": 1.0, "step": 168, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 438.484375, "epoch": 0.005139277460162997, "grad_norm": 1.5126479755429134, "kl": 0.0174560546875, "learning_rate": 9.999348319914141e-07, "loss": 0.0007, "reward": 1.550065279006958, "reward_std": 0.353948712348938, "rewards/accuracy_reward": 0.4906902611255646, "rewards/format_reward": 0.984375, "step": 169, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 449.078125, "epoch": 0.005169687385962778, "grad_norm": 1.621401769931139, "kl": 0.01019287109375, "learning_rate": 9.999340585076667e-07, "loss": 0.0004, "reward": 1.446197271347046, "reward_std": 0.22781604528427124, "rewards/accuracy_reward": 0.3868221938610077, "rewards/format_reward": 1.0, "step": 170, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 400.5, "epoch": 0.005200097311762559, "grad_norm": 1.008489149973811, "kl": 0.0159912109375, "learning_rate": 9.99933280460996e-07, "loss": 0.0006, "reward": 1.9821999073028564, "reward_std": 0.17775297164916992, "rewards/accuracy_reward": 0.8384499549865723, "rewards/format_reward": 0.96875, "step": 171, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.265625, "epoch": 0.00523050723756234, "grad_norm": 1.2521057745061377, "kl": 0.01446533203125, "learning_rate": 9.999324978514087e-07, "loss": 0.0006, "reward": 2.0507450103759766, "reward_std": 0.08263193815946579, "rewards/accuracy_reward": 0.8913698196411133, "rewards/format_reward": 1.0, "step": 172, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 362.390625, "epoch": 0.005260917163362121, "grad_norm": 0.9522233747661588, "kl": 0.0205078125, "learning_rate": 9.999317106789121e-07, "loss": 0.0008, "reward": 1.8781249523162842, "reward_std": 0.10790814459323883, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 173, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 463.890625, "epoch": 0.005291327089161903, "grad_norm": 1.562755596088275, "kl": 0.017333984375, "learning_rate": 9.999309189435137e-07, "loss": 0.0007, "reward": 1.5901439189910889, "reward_std": 0.32178163528442383, "rewards/accuracy_reward": 0.5495189428329468, "rewards/format_reward": 0.953125, "step": 174, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 430.8125, "epoch": 0.005321737014961684, "grad_norm": 1.3739334321952938, "kl": 0.0133056640625, "learning_rate": 9.9993012264522e-07, "loss": 0.0005, "reward": 1.6702098846435547, "reward_std": 0.2349352240562439, "rewards/accuracy_reward": 0.5733348727226257, "rewards/format_reward": 0.984375, "step": 175, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 416.921875, "epoch": 0.005352146940761465, "grad_norm": 1.1652981075424256, "kl": 0.0115966796875, "learning_rate": 9.99929321784039e-07, "loss": 0.0005, "reward": 1.8125, "reward_std": 0.29565125703811646, "rewards/accuracy_reward": 0.6968750357627869, "rewards/format_reward": 1.0, "step": 176, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 404.59375, "epoch": 0.005382556866561246, "grad_norm": 1.3406466440273301, "kl": 0.01239013671875, "learning_rate": 9.999285163599776e-07, "loss": 0.0005, "reward": 1.3471534252166748, "reward_std": 0.15532881021499634, "rewards/accuracy_reward": 0.2877783477306366, "rewards/format_reward": 1.0, "step": 177, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 430.625, "epoch": 0.005412966792361027, "grad_norm": 1.3887344535264228, "kl": 0.01336669921875, "learning_rate": 9.999277063730433e-07, "loss": 0.0005, "reward": 1.6237156391143799, "reward_std": 0.21755698323249817, "rewards/accuracy_reward": 0.570590615272522, "rewards/format_reward": 0.953125, "step": 178, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 409.625, "epoch": 0.0054433767181608075, "grad_norm": 2.0743859063561723, "kl": 0.01324462890625, "learning_rate": 9.999268918232434e-07, "loss": 0.0005, "reward": 1.8434568643569946, "reward_std": 0.10064193606376648, "rewards/accuracy_reward": 0.6872068643569946, "rewards/format_reward": 1.0, "step": 179, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 379.015625, "epoch": 0.005473786643960588, "grad_norm": 0.7532915053197624, "kl": 0.01214599609375, "learning_rate": 9.999260727105853e-07, "loss": 0.0005, "reward": 1.8430216312408447, "reward_std": 0.05529624968767166, "rewards/accuracy_reward": 0.7055215239524841, "rewards/format_reward": 1.0, "step": 180, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 356.453125, "epoch": 0.005504196569760369, "grad_norm": 1.7075214346164662, "kl": 0.0218505859375, "learning_rate": 9.999252490350766e-07, "loss": 0.0009, "reward": 1.6724622249603271, "reward_std": 0.25920727849006653, "rewards/accuracy_reward": 0.5787121057510376, "rewards/format_reward": 1.0, "step": 181, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 358.28125, "epoch": 0.005534606495560151, "grad_norm": 2.042184463125017, "kl": 0.0133056640625, "learning_rate": 9.99924420796725e-07, "loss": 0.0005, "reward": 1.8975837230682373, "reward_std": 0.09810338169336319, "rewards/accuracy_reward": 0.7507086992263794, "rewards/format_reward": 1.0, "step": 182, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 423.890625, "epoch": 0.005565016421359932, "grad_norm": 1.3303510784474357, "kl": 0.013916015625, "learning_rate": 9.999235879955376e-07, "loss": 0.0006, "reward": 1.4585750102996826, "reward_std": 0.36167454719543457, "rewards/accuracy_reward": 0.3898250460624695, "rewards/format_reward": 1.0, "step": 183, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 400.59375, "epoch": 0.005595426347159713, "grad_norm": 1.557466390721163, "kl": 0.01129150390625, "learning_rate": 9.999227506315222e-07, "loss": 0.0005, "reward": 1.8849766254425049, "reward_std": 0.25979411602020264, "rewards/accuracy_reward": 0.7599766254425049, "rewards/format_reward": 1.0, "step": 184, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 430.90625, "epoch": 0.005625836272959494, "grad_norm": 1.2477656111925082, "kl": 0.01007080078125, "learning_rate": 9.999219087046865e-07, "loss": 0.0004, "reward": 1.7110908031463623, "reward_std": 0.164617657661438, "rewards/accuracy_reward": 0.582965612411499, "rewards/format_reward": 1.0, "step": 185, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 382.5625, "epoch": 0.005656246198759275, "grad_norm": 1.3597665407127855, "kl": 0.0146484375, "learning_rate": 9.999210622150383e-07, "loss": 0.0006, "reward": 1.634385347366333, "reward_std": 0.28478699922561646, "rewards/accuracy_reward": 0.540635347366333, "rewards/format_reward": 1.0, "step": 186, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 437.296875, "epoch": 0.005686656124559056, "grad_norm": 8.877258215680504, "kl": 0.0115966796875, "learning_rate": 9.999202111625851e-07, "loss": 0.0005, "reward": 1.5340607166290283, "reward_std": 0.2130703628063202, "rewards/accuracy_reward": 0.4715607166290283, "rewards/format_reward": 0.9375, "step": 187, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 369.34375, "epoch": 0.005717066050358837, "grad_norm": 1.0737367954086672, "kl": 0.013916015625, "learning_rate": 9.999193555473349e-07, "loss": 0.0006, "reward": 1.984375, "reward_std": 0.13876724243164062, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 188, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 406.46875, "epoch": 0.0057474759761586185, "grad_norm": 1.1086347245193864, "kl": 0.01226806640625, "learning_rate": 9.999184953692952e-07, "loss": 0.0005, "reward": 1.414463758468628, "reward_std": 0.28794142603874207, "rewards/accuracy_reward": 0.33633872866630554, "rewards/format_reward": 1.0, "step": 189, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 360.484375, "epoch": 0.005777885901958399, "grad_norm": 1.101609083427379, "kl": 0.0162353515625, "learning_rate": 9.99917630628474e-07, "loss": 0.0007, "reward": 1.4500000476837158, "reward_std": 0.20526908338069916, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 190, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 411.03125, "epoch": 0.00580829582775818, "grad_norm": 1.7051722738234822, "kl": 0.013916015625, "learning_rate": 9.999167613248794e-07, "loss": 0.0006, "reward": 1.6270763874053955, "reward_std": 0.2157573401927948, "rewards/accuracy_reward": 0.5395764112472534, "rewards/format_reward": 1.0, "step": 191, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 386.984375, "epoch": 0.005838705753557961, "grad_norm": 1.302695433524478, "kl": 0.01422119140625, "learning_rate": 9.99915887458519e-07, "loss": 0.0006, "reward": 1.5817657709121704, "reward_std": 0.28352606296539307, "rewards/accuracy_reward": 0.5036407709121704, "rewards/format_reward": 1.0, "step": 192, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.28125, "epoch": 0.005869115679357742, "grad_norm": 1.7590269488643204, "kl": 0.0137939453125, "learning_rate": 9.999150090294008e-07, "loss": 0.0006, "reward": 1.7564142942428589, "reward_std": 0.1505160629749298, "rewards/accuracy_reward": 0.6282892227172852, "rewards/format_reward": 0.984375, "step": 193, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 363.796875, "epoch": 0.005899525605157523, "grad_norm": 1.920205592023229, "kl": 0.018310546875, "learning_rate": 9.99914126037533e-07, "loss": 0.0007, "reward": 1.7245876789093018, "reward_std": 0.2171836495399475, "rewards/accuracy_reward": 0.5933376550674438, "rewards/format_reward": 1.0, "step": 194, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 402.03125, "epoch": 0.005929935530957304, "grad_norm": 2.708794554667455, "kl": 0.0142822265625, "learning_rate": 9.999132384829237e-07, "loss": 0.0006, "reward": 1.6385501623153687, "reward_std": 0.34167760610580444, "rewards/accuracy_reward": 0.5385500192642212, "rewards/format_reward": 1.0, "step": 195, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 387.15625, "epoch": 0.005960345456757086, "grad_norm": 0.7565696081840521, "kl": 0.0157470703125, "learning_rate": 9.99912346365581e-07, "loss": 0.0006, "reward": 1.8814079761505127, "reward_std": 0.1055237203836441, "rewards/accuracy_reward": 0.7220330238342285, "rewards/format_reward": 1.0, "step": 196, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 415.578125, "epoch": 0.005990755382556867, "grad_norm": 1.3716921990656596, "kl": 0.0145263671875, "learning_rate": 9.999114496855125e-07, "loss": 0.0006, "reward": 1.6889476776123047, "reward_std": 0.06109042093157768, "rewards/accuracy_reward": 0.5764476656913757, "rewards/format_reward": 1.0, "step": 197, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 408.953125, "epoch": 0.006021165308356648, "grad_norm": 1.2696577001009746, "kl": 0.0157470703125, "learning_rate": 9.999105484427273e-07, "loss": 0.0006, "reward": 1.703125, "reward_std": 0.3058618903160095, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "step": 198, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.25, "epoch": 0.006051575234156429, "grad_norm": 1.2521705961099014, "kl": 0.0169677734375, "learning_rate": 9.999096426372328e-07, "loss": 0.0007, "reward": 1.9564234018325806, "reward_std": 0.029600605368614197, "rewards/accuracy_reward": 0.7939233183860779, "rewards/format_reward": 1.0, "step": 199, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 385.28125, "epoch": 0.0060819851599562096, "grad_norm": 1.4527944675901368, "kl": 0.0147705078125, "learning_rate": 9.999087322690379e-07, "loss": 0.0006, "reward": 1.8885259628295898, "reward_std": 0.12971588969230652, "rewards/accuracy_reward": 0.7322760224342346, "rewards/format_reward": 1.0, "step": 200, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 438.09375, "epoch": 0.0061123950857559905, "grad_norm": 1.3893010615237968, "kl": 0.0123291015625, "learning_rate": 9.999078173381504e-07, "loss": 0.0005, "reward": 1.719128131866455, "reward_std": 0.270042359828949, "rewards/accuracy_reward": 0.6066280603408813, "rewards/format_reward": 1.0, "step": 201, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 374.125, "epoch": 0.006142805011555771, "grad_norm": 1.2857928784234862, "kl": 0.0186767578125, "learning_rate": 9.99906897844579e-07, "loss": 0.0007, "reward": 1.8464598655700684, "reward_std": 0.22821247577667236, "rewards/accuracy_reward": 0.7464597821235657, "rewards/format_reward": 0.96875, "step": 202, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 484.828125, "epoch": 0.006173214937355553, "grad_norm": 1.161916327769571, "kl": 0.0106201171875, "learning_rate": 9.99905973788332e-07, "loss": 0.0004, "reward": 1.7661173343658447, "reward_std": 0.3755035996437073, "rewards/accuracy_reward": 0.6723674535751343, "rewards/format_reward": 0.984375, "step": 203, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 383.40625, "epoch": 0.006203624863155334, "grad_norm": 0.8321996024052432, "kl": 0.018310546875, "learning_rate": 9.999050451694177e-07, "loss": 0.0007, "reward": 2.125, "reward_std": 0.09940993785858154, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 204, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 401.453125, "epoch": 0.006234034788955115, "grad_norm": 0.9804543956579949, "kl": 0.018310546875, "learning_rate": 9.999041119878447e-07, "loss": 0.0007, "reward": 1.7361085414886475, "reward_std": 0.17073804140090942, "rewards/accuracy_reward": 0.6204833984375, "rewards/format_reward": 1.0, "step": 205, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 391.109375, "epoch": 0.006264444714754896, "grad_norm": 1.2028693885976411, "kl": 0.0213623046875, "learning_rate": 9.999031742436215e-07, "loss": 0.0009, "reward": 1.984070062637329, "reward_std": 0.11129516363143921, "rewards/accuracy_reward": 0.837195098400116, "rewards/format_reward": 1.0, "step": 206, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 437.390625, "epoch": 0.006294854640554677, "grad_norm": 1.5376020965816457, "kl": 0.0137939453125, "learning_rate": 9.999022319367567e-07, "loss": 0.0006, "reward": 1.6291178464889526, "reward_std": 0.2104966640472412, "rewards/accuracy_reward": 0.5009927749633789, "rewards/format_reward": 0.984375, "step": 207, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 410.703125, "epoch": 0.006325264566354458, "grad_norm": 2.099640306710403, "kl": 0.017578125, "learning_rate": 9.999012850672587e-07, "loss": 0.0007, "reward": 1.7741410732269287, "reward_std": 0.16794037818908691, "rewards/accuracy_reward": 0.6491410136222839, "rewards/format_reward": 1.0, "step": 208, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 448.515625, "epoch": 0.006355674492154239, "grad_norm": 1.3803918216890592, "kl": 0.017578125, "learning_rate": 9.999003336351364e-07, "loss": 0.0007, "reward": 1.7693995237350464, "reward_std": 0.1516171246767044, "rewards/accuracy_reward": 0.6381495594978333, "rewards/format_reward": 1.0, "step": 209, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 432.078125, "epoch": 0.0063860844179540206, "grad_norm": 2.0535011409234167, "kl": 0.020263671875, "learning_rate": 9.998993776403983e-07, "loss": 0.0008, "reward": 1.74228835105896, "reward_std": 0.21149301528930664, "rewards/accuracy_reward": 0.6422882676124573, "rewards/format_reward": 1.0, "step": 210, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 395.046875, "epoch": 0.0064164943437538015, "grad_norm": 1.7156220404120721, "kl": 0.0220947265625, "learning_rate": 9.998984170830533e-07, "loss": 0.0009, "reward": 1.7945313453674316, "reward_std": 0.146955668926239, "rewards/accuracy_reward": 0.66015625, "rewards/format_reward": 1.0, "step": 211, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.703125, "epoch": 0.006446904269553582, "grad_norm": 1.0191415850799448, "kl": 0.020263671875, "learning_rate": 9.998974519631103e-07, "loss": 0.0008, "reward": 1.734459400177002, "reward_std": 0.10649251192808151, "rewards/accuracy_reward": 0.6063344478607178, "rewards/format_reward": 1.0, "step": 212, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 426.109375, "epoch": 0.006477314195353363, "grad_norm": 1.1212553986619302, "kl": 0.0216064453125, "learning_rate": 9.998964822805774e-07, "loss": 0.0009, "reward": 1.6878325939178467, "reward_std": 0.1365116685628891, "rewards/accuracy_reward": 0.6440826058387756, "rewards/format_reward": 0.9375, "step": 213, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 422.859375, "epoch": 0.006507724121153144, "grad_norm": 6.258472389668914, "kl": 0.021728515625, "learning_rate": 9.998955080354644e-07, "loss": 0.0009, "reward": 1.714188575744629, "reward_std": 0.14747735857963562, "rewards/accuracy_reward": 0.5891884565353394, "rewards/format_reward": 1.0, "step": 214, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 438.265625, "epoch": 0.006538134046952925, "grad_norm": 1.7274532008253825, "kl": 0.017822265625, "learning_rate": 9.998945292277793e-07, "loss": 0.0007, "reward": 1.6116645336151123, "reward_std": 0.1820090264081955, "rewards/accuracy_reward": 0.5491644740104675, "rewards/format_reward": 0.921875, "step": 215, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.0, "epoch": 0.006568543972752706, "grad_norm": 1.35340326061819, "kl": 0.020751953125, "learning_rate": 9.998935458575316e-07, "loss": 0.0008, "reward": 1.7151023149490356, "reward_std": 0.26303189992904663, "rewards/accuracy_reward": 0.5651022791862488, "rewards/format_reward": 1.0, "step": 216, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 382.65625, "epoch": 0.006598953898552488, "grad_norm": 0.9891291843789567, "kl": 0.02490234375, "learning_rate": 9.998925579247302e-07, "loss": 0.001, "reward": 2.028125047683716, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 217, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 415.53125, "epoch": 0.006629363824352269, "grad_norm": 0.6999098048725213, "kl": 0.017578125, "learning_rate": 9.99891565429384e-07, "loss": 0.0007, "reward": 1.8948068618774414, "reward_std": 0.0846766009926796, "rewards/accuracy_reward": 0.751056969165802, "rewards/format_reward": 1.0, "step": 218, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 421.96875, "epoch": 0.00665977375015205, "grad_norm": 4.3048021553854285, "kl": 0.0167236328125, "learning_rate": 9.998905683715021e-07, "loss": 0.0007, "reward": 1.7319144010543823, "reward_std": 0.10828442871570587, "rewards/accuracy_reward": 0.6100394129753113, "rewards/format_reward": 1.0, "step": 219, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 378.0625, "epoch": 0.006690183675951831, "grad_norm": 0.8868201339092495, "kl": 0.020751953125, "learning_rate": 9.998895667510938e-07, "loss": 0.0008, "reward": 1.78125, "reward_std": 0.1530359536409378, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 220, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 411.09375, "epoch": 0.006720593601751612, "grad_norm": 1.1557171687279029, "kl": 0.020751953125, "learning_rate": 9.998885605681676e-07, "loss": 0.0008, "reward": 1.7820312976837158, "reward_std": 0.08407025039196014, "rewards/accuracy_reward": 0.66015625, "rewards/format_reward": 1.0, "step": 221, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 449.71875, "epoch": 0.0067510035275513925, "grad_norm": 1.0569183354789908, "kl": 0.012939453125, "learning_rate": 9.998875498227337e-07, "loss": 0.0005, "reward": 1.6942498683929443, "reward_std": 0.22362014651298523, "rewards/accuracy_reward": 0.5879997611045837, "rewards/format_reward": 1.0, "step": 222, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 410.625, "epoch": 0.0067814134533511735, "grad_norm": 7.906802442704991, "kl": 0.01708984375, "learning_rate": 9.998865345148004e-07, "loss": 0.0007, "reward": 1.969193458557129, "reward_std": 0.15727870166301727, "rewards/accuracy_reward": 0.7973184585571289, "rewards/format_reward": 1.0, "step": 223, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 432.359375, "epoch": 0.006811823379150955, "grad_norm": 2.96437483469161, "kl": 0.0146484375, "learning_rate": 9.998855146443775e-07, "loss": 0.0006, "reward": 1.7564566135406494, "reward_std": 0.11943941563367844, "rewards/accuracy_reward": 0.6283315420150757, "rewards/format_reward": 1.0, "step": 224, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 413.984375, "epoch": 0.006842233304950736, "grad_norm": 1.266059182595279, "kl": 0.0191650390625, "learning_rate": 9.998844902114739e-07, "loss": 0.0008, "reward": 1.9190821647644043, "reward_std": 0.20085208117961884, "rewards/accuracy_reward": 0.7784572243690491, "rewards/format_reward": 0.984375, "step": 225, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 397.484375, "epoch": 0.006872643230750517, "grad_norm": 1.5112630298156946, "kl": 0.0220947265625, "learning_rate": 9.998834612160993e-07, "loss": 0.0009, "reward": 1.7913932800292969, "reward_std": 0.15194737911224365, "rewards/accuracy_reward": 0.669518232345581, "rewards/format_reward": 1.0, "step": 226, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 389.40625, "epoch": 0.006903053156550298, "grad_norm": 1.0268766912676646, "kl": 0.0213623046875, "learning_rate": 9.99882427658263e-07, "loss": 0.0009, "reward": 2.030564785003662, "reward_std": 0.09639369696378708, "rewards/accuracy_reward": 0.8493149280548096, "rewards/format_reward": 1.0, "step": 227, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 431.8125, "epoch": 0.006933463082350079, "grad_norm": 1.679679765943165, "kl": 0.01544189453125, "learning_rate": 9.998813895379742e-07, "loss": 0.0006, "reward": 1.6168794631958008, "reward_std": 0.29984116554260254, "rewards/accuracy_reward": 0.526254415512085, "rewards/format_reward": 1.0, "step": 228, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.375, "epoch": 0.00696387300814986, "grad_norm": 1.7846381547403725, "kl": 0.0181884765625, "learning_rate": 9.998803468552428e-07, "loss": 0.0007, "reward": 1.8887107372283936, "reward_std": 0.12382345646619797, "rewards/accuracy_reward": 0.7418356537818909, "rewards/format_reward": 1.0, "step": 229, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 374.84375, "epoch": 0.006994282933949641, "grad_norm": 1.1428493463383278, "kl": 0.0179443359375, "learning_rate": 9.998792996100782e-07, "loss": 0.0007, "reward": 1.7624480724334717, "reward_std": 0.12163154035806656, "rewards/accuracy_reward": 0.6468230485916138, "rewards/format_reward": 1.0, "step": 230, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 431.71875, "epoch": 0.007024692859749423, "grad_norm": 1.166922419567413, "kl": 0.0172119140625, "learning_rate": 9.998782478024897e-07, "loss": 0.0007, "reward": 1.7511425018310547, "reward_std": 0.22296537458896637, "rewards/accuracy_reward": 0.6386424899101257, "rewards/format_reward": 1.0, "step": 231, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 380.34375, "epoch": 0.0070551027855492035, "grad_norm": 0.9734376500868925, "kl": 0.018310546875, "learning_rate": 9.99877191432487e-07, "loss": 0.0007, "reward": 1.859375, "reward_std": 0.1939256191253662, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 232, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 373.015625, "epoch": 0.0070855127113489845, "grad_norm": 3.1585609162562522, "kl": 0.0177001953125, "learning_rate": 9.9987613050008e-07, "loss": 0.0007, "reward": 1.720524549484253, "reward_std": 0.24647434055805206, "rewards/accuracy_reward": 0.5986495614051819, "rewards/format_reward": 1.0, "step": 233, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 415.484375, "epoch": 0.007115922637148765, "grad_norm": 1.1413550607333405, "kl": 0.0159912109375, "learning_rate": 9.99875065005278e-07, "loss": 0.0006, "reward": 1.5806015729904175, "reward_std": 0.10819113254547119, "rewards/accuracy_reward": 0.4931015968322754, "rewards/format_reward": 1.0, "step": 234, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 415.046875, "epoch": 0.007146332562948546, "grad_norm": 1.263984754486529, "kl": 0.01336669921875, "learning_rate": 9.99873994948091e-07, "loss": 0.0005, "reward": 1.5381163358688354, "reward_std": 0.27806341648101807, "rewards/accuracy_reward": 0.4349912703037262, "rewards/format_reward": 1.0, "step": 235, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 387.78125, "epoch": 0.007176742488748327, "grad_norm": 1.168037503685382, "kl": 0.017333984375, "learning_rate": 9.998729203285287e-07, "loss": 0.0007, "reward": 1.6493332386016846, "reward_std": 0.11238259077072144, "rewards/accuracy_reward": 0.5399582386016846, "rewards/format_reward": 1.0, "step": 236, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 373.359375, "epoch": 0.007207152414548108, "grad_norm": 1.268387927184248, "kl": 0.0177001953125, "learning_rate": 9.998718411466008e-07, "loss": 0.0007, "reward": 1.8947432041168213, "reward_std": 0.10737168788909912, "rewards/accuracy_reward": 0.7384930849075317, "rewards/format_reward": 1.0, "step": 237, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.59375, "epoch": 0.00723756234034789, "grad_norm": 1.5050301211427486, "kl": 0.0155029296875, "learning_rate": 9.998707574023172e-07, "loss": 0.0006, "reward": 1.7658064365386963, "reward_std": 0.09879247844219208, "rewards/accuracy_reward": 0.6376813650131226, "rewards/format_reward": 1.0, "step": 238, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 369.6875, "epoch": 0.007267972266147671, "grad_norm": 2.6382105945747507, "kl": 0.0179443359375, "learning_rate": 9.99869669095688e-07, "loss": 0.0007, "reward": 1.6472656726837158, "reward_std": 0.17413204908370972, "rewards/accuracy_reward": 0.556640625, "rewards/format_reward": 1.0, "step": 239, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 408.78125, "epoch": 0.007298382191947452, "grad_norm": 1.3865649348822708, "kl": 0.014892578125, "learning_rate": 9.998685762267229e-07, "loss": 0.0006, "reward": 1.8254108428955078, "reward_std": 0.2866128385066986, "rewards/accuracy_reward": 0.6879106760025024, "rewards/format_reward": 1.0, "step": 240, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 425.6875, "epoch": 0.007328792117747233, "grad_norm": 1.4692098400653377, "kl": 0.0135498046875, "learning_rate": 9.998674787954318e-07, "loss": 0.0005, "reward": 1.6054425239562988, "reward_std": 0.21674001216888428, "rewards/accuracy_reward": 0.5054423809051514, "rewards/format_reward": 0.984375, "step": 241, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.109375, "epoch": 0.007359202043547014, "grad_norm": 1.8934079492849498, "kl": 0.01806640625, "learning_rate": 9.99866376801825e-07, "loss": 0.0007, "reward": 1.8729619979858398, "reward_std": 0.08135965466499329, "rewards/accuracy_reward": 0.7323369383811951, "rewards/format_reward": 1.0, "step": 242, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 412.734375, "epoch": 0.007389611969346795, "grad_norm": 1.304983648567823, "kl": 0.01519775390625, "learning_rate": 9.998652702459123e-07, "loss": 0.0006, "reward": 1.7894978523254395, "reward_std": 0.23549628257751465, "rewards/accuracy_reward": 0.6394978761672974, "rewards/format_reward": 1.0, "step": 243, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.515625, "epoch": 0.0074200218951465755, "grad_norm": 1.1114601985571375, "kl": 0.01513671875, "learning_rate": 9.998641591277038e-07, "loss": 0.0006, "reward": 2.0155651569366455, "reward_std": 0.04978543519973755, "rewards/accuracy_reward": 0.8624401688575745, "rewards/format_reward": 1.0, "step": 244, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 383.65625, "epoch": 0.007450431820946357, "grad_norm": 1.4515246684859342, "kl": 0.0196533203125, "learning_rate": 9.9986304344721e-07, "loss": 0.0008, "reward": 2.065331220626831, "reward_std": 0.19604066014289856, "rewards/accuracy_reward": 0.9028311967849731, "rewards/format_reward": 1.0, "step": 245, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 424.9375, "epoch": 0.007480841746746138, "grad_norm": 1.0988957051601167, "kl": 0.0159912109375, "learning_rate": 9.998619232044405e-07, "loss": 0.0006, "reward": 1.6412570476531982, "reward_std": 0.19408860802650452, "rewards/accuracy_reward": 0.5475069284439087, "rewards/format_reward": 0.96875, "step": 246, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 416.75, "epoch": 0.007511251672545919, "grad_norm": 1.435719151402882, "kl": 0.0185546875, "learning_rate": 9.998607983994061e-07, "loss": 0.0007, "reward": 1.9047449827194214, "reward_std": 0.10657308995723724, "rewards/accuracy_reward": 0.7391198873519897, "rewards/format_reward": 1.0, "step": 247, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.890625, "epoch": 0.0075416615983457, "grad_norm": 1.3455637978204695, "kl": 0.0211181640625, "learning_rate": 9.998596690321168e-07, "loss": 0.0008, "reward": 1.9215421676635742, "reward_std": 0.24901898205280304, "rewards/accuracy_reward": 0.7809171676635742, "rewards/format_reward": 1.0, "step": 248, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 397.828125, "epoch": 0.007572071524145481, "grad_norm": 1.167171903141408, "kl": 0.0205078125, "learning_rate": 9.998585351025829e-07, "loss": 0.0008, "reward": 1.6909074783325195, "reward_std": 0.17762073874473572, "rewards/accuracy_reward": 0.5877823829650879, "rewards/format_reward": 1.0, "step": 249, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 382.71875, "epoch": 0.007602481449945262, "grad_norm": 2.0764422873513295, "kl": 0.0191650390625, "learning_rate": 9.998573966108147e-07, "loss": 0.0008, "reward": 1.6750352382659912, "reward_std": 0.30122262239456177, "rewards/accuracy_reward": 0.5500352382659912, "rewards/format_reward": 1.0, "step": 250, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.703125, "epoch": 0.007632891375745043, "grad_norm": 0.8997983074065692, "kl": 0.018798828125, "learning_rate": 9.998562535568227e-07, "loss": 0.0007, "reward": 1.9275240898132324, "reward_std": 0.17176565527915955, "rewards/accuracy_reward": 0.7806490659713745, "rewards/format_reward": 1.0, "step": 251, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 413.671875, "epoch": 0.007663301301544824, "grad_norm": 1.5932879533627549, "kl": 0.0186767578125, "learning_rate": 9.998551059406174e-07, "loss": 0.0007, "reward": 1.5218663215637207, "reward_std": 0.1883080154657364, "rewards/accuracy_reward": 0.45311635732650757, "rewards/format_reward": 0.984375, "step": 252, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 434.140625, "epoch": 0.007693711227344606, "grad_norm": 1.742823374375136, "kl": 0.017822265625, "learning_rate": 9.998539537622093e-07, "loss": 0.0007, "reward": 1.7532962560653687, "reward_std": 0.14785994589328766, "rewards/accuracy_reward": 0.6095462441444397, "rewards/format_reward": 1.0, "step": 253, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 394.921875, "epoch": 0.0077241211531443865, "grad_norm": 1.2722874219875149, "kl": 0.0191650390625, "learning_rate": 9.998527970216084e-07, "loss": 0.0008, "reward": 1.9215774536132812, "reward_std": 0.1415565460920334, "rewards/accuracy_reward": 0.7715773582458496, "rewards/format_reward": 1.0, "step": 254, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 377.578125, "epoch": 0.0077545310789441674, "grad_norm": 0.6936276319667944, "kl": 0.022705078125, "learning_rate": 9.998516357188258e-07, "loss": 0.0009, "reward": 2.043750047683716, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 255, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 406.96875, "epoch": 0.007784941004743948, "grad_norm": 1.2276230997416127, "kl": 0.0177001953125, "learning_rate": 9.99850469853872e-07, "loss": 0.0007, "reward": 2.046505928039551, "reward_std": 0.07997314631938934, "rewards/accuracy_reward": 0.8777557611465454, "rewards/format_reward": 1.0, "step": 256, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 379.4375, "epoch": 0.00781535093054373, "grad_norm": 0.8746516396380941, "kl": 0.0164794921875, "learning_rate": 9.998492994267576e-07, "loss": 0.0007, "reward": 1.9320472478866577, "reward_std": 0.1163143664598465, "rewards/accuracy_reward": 0.7757972478866577, "rewards/format_reward": 1.0, "step": 257, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 392.140625, "epoch": 0.00784576085634351, "grad_norm": 1.6968203059364477, "kl": 0.0185546875, "learning_rate": 9.998481244374932e-07, "loss": 0.0007, "reward": 1.6855379343032837, "reward_std": 0.15177245438098907, "rewards/accuracy_reward": 0.5824129581451416, "rewards/format_reward": 1.0, "step": 258, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.46875, "epoch": 0.007876170782143291, "grad_norm": 4.023832821261256, "kl": 0.021240234375, "learning_rate": 9.998469448860898e-07, "loss": 0.0008, "reward": 1.9996700286865234, "reward_std": 0.23446165025234222, "rewards/accuracy_reward": 0.8340448141098022, "rewards/format_reward": 1.0, "step": 259, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 395.90625, "epoch": 0.007906580707943072, "grad_norm": 1.468067958819823, "kl": 0.0174560546875, "learning_rate": 9.998457607725577e-07, "loss": 0.0007, "reward": 1.5739922523498535, "reward_std": 0.3005222678184509, "rewards/accuracy_reward": 0.4864921271800995, "rewards/format_reward": 0.984375, "step": 260, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 407.109375, "epoch": 0.007936990633742853, "grad_norm": 1.4478560274594938, "kl": 0.0172119140625, "learning_rate": 9.998445720969082e-07, "loss": 0.0007, "reward": 1.7547173500061035, "reward_std": 0.0737149715423584, "rewards/accuracy_reward": 0.6390923857688904, "rewards/format_reward": 1.0, "step": 261, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.046875, "epoch": 0.007967400559542634, "grad_norm": 1.2995545752815922, "kl": 0.016845703125, "learning_rate": 9.998433788591516e-07, "loss": 0.0007, "reward": 2.0538532733917236, "reward_std": 0.07212623953819275, "rewards/accuracy_reward": 0.8757283091545105, "rewards/format_reward": 1.0, "step": 262, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 392.75, "epoch": 0.007997810485342417, "grad_norm": 1.0039865621466422, "kl": 0.0198974609375, "learning_rate": 9.998421810592995e-07, "loss": 0.0008, "reward": 1.850000023841858, "reward_std": 0.13277359306812286, "rewards/accuracy_reward": 0.7125000357627869, "rewards/format_reward": 1.0, "step": 263, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 387.984375, "epoch": 0.008028220411142198, "grad_norm": 1.6787583724136423, "kl": 0.0201416015625, "learning_rate": 9.998409786973622e-07, "loss": 0.0008, "reward": 1.7388439178466797, "reward_std": 0.14969630539417267, "rewards/accuracy_reward": 0.5950939655303955, "rewards/format_reward": 1.0, "step": 264, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.359375, "epoch": 0.008058630336941978, "grad_norm": 3.58921592621572, "kl": 0.0162353515625, "learning_rate": 9.99839771773351e-07, "loss": 0.0006, "reward": 1.7742525339126587, "reward_std": 0.21629348397254944, "rewards/accuracy_reward": 0.6117525696754456, "rewards/format_reward": 1.0, "step": 265, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 430.421875, "epoch": 0.00808904026274176, "grad_norm": 9.729282074480514, "kl": 0.0166015625, "learning_rate": 9.998385602872768e-07, "loss": 0.0007, "reward": 1.6444361209869385, "reward_std": 0.16006812453269958, "rewards/accuracy_reward": 0.5475611090660095, "rewards/format_reward": 1.0, "step": 266, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 454.734375, "epoch": 0.00811945018854154, "grad_norm": 1.2696450990003763, "kl": 0.0162353515625, "learning_rate": 9.998373442391505e-07, "loss": 0.0007, "reward": 1.5544025897979736, "reward_std": 0.2473534494638443, "rewards/accuracy_reward": 0.5262776613235474, "rewards/format_reward": 0.921875, "step": 267, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.625, "epoch": 0.008149860114341321, "grad_norm": 1.0757583736109715, "kl": 0.021240234375, "learning_rate": 9.998361236289836e-07, "loss": 0.0009, "reward": 1.7983217239379883, "reward_std": 0.17950627207756042, "rewards/accuracy_reward": 0.6826967000961304, "rewards/format_reward": 1.0, "step": 268, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 385.765625, "epoch": 0.008180270040141102, "grad_norm": 88.34558035216274, "kl": 0.017333984375, "learning_rate": 9.99834898456787e-07, "loss": 0.0007, "reward": 1.6330974102020264, "reward_std": 0.10395858436822891, "rewards/accuracy_reward": 0.5080972909927368, "rewards/format_reward": 1.0, "step": 269, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 355.359375, "epoch": 0.008210679965940883, "grad_norm": 0.9639375732062316, "kl": 0.025146484375, "learning_rate": 9.99833668722572e-07, "loss": 0.001, "reward": 2.1500000953674316, "reward_std": 0.09121407568454742, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 270, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 430.390625, "epoch": 0.008241089891740664, "grad_norm": 1.3672529823270474, "kl": 0.0169677734375, "learning_rate": 9.998324344263497e-07, "loss": 0.0007, "reward": 1.742522954940796, "reward_std": 0.1812654584646225, "rewards/accuracy_reward": 0.6175230741500854, "rewards/format_reward": 1.0, "step": 271, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 441.78125, "epoch": 0.008271499817540445, "grad_norm": 6.506897812501092, "kl": 0.015625, "learning_rate": 9.998311955681312e-07, "loss": 0.0006, "reward": 1.6561243534088135, "reward_std": 0.13107216358184814, "rewards/accuracy_reward": 0.5498743057250977, "rewards/format_reward": 1.0, "step": 272, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 410.734375, "epoch": 0.008301909743340226, "grad_norm": 2.049900258813113, "kl": 0.01806640625, "learning_rate": 9.998299521479281e-07, "loss": 0.0007, "reward": 1.8084383010864258, "reward_std": 0.1195424497127533, "rewards/accuracy_reward": 0.6771881580352783, "rewards/format_reward": 1.0, "step": 273, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.09375, "epoch": 0.008332319669140007, "grad_norm": 1.1151104799809013, "kl": 0.0213623046875, "learning_rate": 9.998287041657515e-07, "loss": 0.0009, "reward": 2.0338916778564453, "reward_std": 0.10179020464420319, "rewards/accuracy_reward": 0.8495166897773743, "rewards/format_reward": 1.0, "step": 274, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 381.390625, "epoch": 0.008362729594939788, "grad_norm": 0.633587659886379, "kl": 0.01806640625, "learning_rate": 9.998274516216132e-07, "loss": 0.0007, "reward": 1.8968749046325684, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 275, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 364.25, "epoch": 0.008393139520739569, "grad_norm": 1.2549106407714368, "kl": 0.0169677734375, "learning_rate": 9.998261945155245e-07, "loss": 0.0007, "reward": 1.7268636226654053, "reward_std": 0.20624911785125732, "rewards/accuracy_reward": 0.6081135272979736, "rewards/format_reward": 1.0, "step": 276, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 369.34375, "epoch": 0.008423549446539351, "grad_norm": 1.3437726173468763, "kl": 0.02294921875, "learning_rate": 9.998249328474964e-07, "loss": 0.0009, "reward": 1.5311367511749268, "reward_std": 0.2296334058046341, "rewards/accuracy_reward": 0.449886679649353, "rewards/format_reward": 0.984375, "step": 277, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 373.078125, "epoch": 0.008453959372339132, "grad_norm": 0.9114216670248432, "kl": 0.02197265625, "learning_rate": 9.99823666617541e-07, "loss": 0.0009, "reward": 2.0062501430511475, "reward_std": 0.08711418509483337, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 278, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 374.671875, "epoch": 0.008484369298138913, "grad_norm": 1.1859561845566342, "kl": 0.0196533203125, "learning_rate": 9.998223958256695e-07, "loss": 0.0008, "reward": 1.903586745262146, "reward_std": 0.22309716045856476, "rewards/accuracy_reward": 0.731711745262146, "rewards/format_reward": 1.0, "step": 279, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 364.375, "epoch": 0.008514779223938694, "grad_norm": 1.6769116808674795, "kl": 0.021240234375, "learning_rate": 9.998211204718936e-07, "loss": 0.0009, "reward": 2.087890625, "reward_std": 0.188189297914505, "rewards/accuracy_reward": 0.916015625, "rewards/format_reward": 1.0, "step": 280, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.609375, "epoch": 0.008545189149738475, "grad_norm": 0.6488468337757527, "kl": 0.019287109375, "learning_rate": 9.99819840556225e-07, "loss": 0.0008, "reward": 1.7400803565979004, "reward_std": 0.12728886306285858, "rewards/accuracy_reward": 0.6182053089141846, "rewards/format_reward": 0.984375, "step": 281, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.734375, "epoch": 0.008575599075538256, "grad_norm": 1.179660191289629, "kl": 0.02001953125, "learning_rate": 9.998185560786753e-07, "loss": 0.0008, "reward": 1.8793513774871826, "reward_std": 0.2097109854221344, "rewards/accuracy_reward": 0.7324762344360352, "rewards/format_reward": 1.0, "step": 282, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 380.921875, "epoch": 0.008606009001338037, "grad_norm": 0.9254812804034066, "kl": 0.02001953125, "learning_rate": 9.998172670392566e-07, "loss": 0.0008, "reward": 2.0477325916290283, "reward_std": 0.06443040072917938, "rewards/accuracy_reward": 0.8664825558662415, "rewards/format_reward": 1.0, "step": 283, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 389.8125, "epoch": 0.008636418927137818, "grad_norm": 2.637937194392917, "kl": 0.0203857421875, "learning_rate": 9.9981597343798e-07, "loss": 0.0008, "reward": 1.5260801315307617, "reward_std": 0.2240482121706009, "rewards/accuracy_reward": 0.43233007192611694, "rewards/format_reward": 1.0, "step": 284, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.890625, "epoch": 0.008666828852937599, "grad_norm": 1.1072693246546153, "kl": 0.016845703125, "learning_rate": 9.998146752748577e-07, "loss": 0.0007, "reward": 1.9428386688232422, "reward_std": 0.24031490087509155, "rewards/accuracy_reward": 0.7897135019302368, "rewards/format_reward": 1.0, "step": 285, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 399.96875, "epoch": 0.00869723877873738, "grad_norm": 0.8546295331496868, "kl": 0.01806640625, "learning_rate": 9.998133725499014e-07, "loss": 0.0007, "reward": 2.0550358295440674, "reward_std": 0.1342320740222931, "rewards/accuracy_reward": 0.9050357341766357, "rewards/format_reward": 1.0, "step": 286, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 369.359375, "epoch": 0.00872764870453716, "grad_norm": 0.42319947894545923, "kl": 0.018798828125, "learning_rate": 9.998120652631231e-07, "loss": 0.0008, "reward": 1.875, "reward_std": 0.051754921674728394, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 287, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 408.703125, "epoch": 0.008758058630336941, "grad_norm": 0.5448274786954306, "kl": 0.0205078125, "learning_rate": 9.998107534145348e-07, "loss": 0.0008, "reward": 2.077310800552368, "reward_std": 0.09646253287792206, "rewards/accuracy_reward": 0.8991856575012207, "rewards/format_reward": 1.0, "step": 288, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 394.796875, "epoch": 0.008788468556136722, "grad_norm": 1.263692114643898, "kl": 0.0177001953125, "learning_rate": 9.998094370041485e-07, "loss": 0.0007, "reward": 1.9167423248291016, "reward_std": 0.07768350094556808, "rewards/accuracy_reward": 0.7511173486709595, "rewards/format_reward": 1.0, "step": 289, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 413.4375, "epoch": 0.008818878481936503, "grad_norm": 1.4574613980867532, "kl": 0.0206298828125, "learning_rate": 9.998081160319759e-07, "loss": 0.0008, "reward": 1.9200654029846191, "reward_std": 0.1518048793077469, "rewards/accuracy_reward": 0.7919403910636902, "rewards/format_reward": 0.984375, "step": 290, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.375, "epoch": 0.008849288407736284, "grad_norm": 1.6967769004986903, "kl": 0.0238037109375, "learning_rate": 9.998067904980293e-07, "loss": 0.001, "reward": 1.9124999046325684, "reward_std": 0.23418307304382324, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 291, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 386.203125, "epoch": 0.008879698333536067, "grad_norm": 1.5270866574207245, "kl": 0.01708984375, "learning_rate": 9.998054604023208e-07, "loss": 0.0007, "reward": 1.6708673238754272, "reward_std": 0.22572845220565796, "rewards/accuracy_reward": 0.5583672523498535, "rewards/format_reward": 1.0, "step": 292, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.6875, "epoch": 0.008910108259335848, "grad_norm": 1.2254023006663615, "kl": 0.019287109375, "learning_rate": 9.998041257448623e-07, "loss": 0.0008, "reward": 1.7597155570983887, "reward_std": 0.26602789759635925, "rewards/accuracy_reward": 0.6597157120704651, "rewards/format_reward": 0.984375, "step": 293, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.9375, "epoch": 0.008940518185135629, "grad_norm": 1.5121729070637349, "kl": 0.0225830078125, "learning_rate": 9.998027865256664e-07, "loss": 0.0009, "reward": 1.9640789031982422, "reward_std": 0.2045850157737732, "rewards/accuracy_reward": 0.8047038912773132, "rewards/format_reward": 1.0, "step": 294, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 443.25, "epoch": 0.00897092811093541, "grad_norm": 2.6542982099965604, "kl": 0.020751953125, "learning_rate": 9.99801442744745e-07, "loss": 0.0008, "reward": 1.6407462358474731, "reward_std": 0.30159279704093933, "rewards/accuracy_reward": 0.5438711643218994, "rewards/format_reward": 1.0, "step": 295, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 376.734375, "epoch": 0.00900133803673519, "grad_norm": 1.0356791095074909, "kl": 0.0234375, "learning_rate": 9.998000944021105e-07, "loss": 0.0009, "reward": 2.190624952316284, "reward_std": 0.026516513898968697, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 296, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.59375, "epoch": 0.009031747962534972, "grad_norm": 1.0930553306787711, "kl": 0.0203857421875, "learning_rate": 9.997987414977751e-07, "loss": 0.0008, "reward": 2.0114102363586426, "reward_std": 0.03505685180425644, "rewards/accuracy_reward": 0.8270350694656372, "rewards/format_reward": 1.0, "step": 297, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 383.453125, "epoch": 0.009062157888334752, "grad_norm": 1.4956913686960942, "kl": 0.02587890625, "learning_rate": 9.997973840317512e-07, "loss": 0.001, "reward": 1.884374976158142, "reward_std": 0.2237825095653534, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 298, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 403.875, "epoch": 0.009092567814134533, "grad_norm": 2.1483223023180926, "kl": 0.0233154296875, "learning_rate": 9.997960220040513e-07, "loss": 0.0009, "reward": 1.805154800415039, "reward_std": 0.06720379739999771, "rewards/accuracy_reward": 0.6739048957824707, "rewards/format_reward": 1.0, "step": 299, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.75, "epoch": 0.009122977739934314, "grad_norm": 1.0463953955814098, "kl": 0.021728515625, "learning_rate": 9.99794655414688e-07, "loss": 0.0009, "reward": 1.9292975664138794, "reward_std": 0.20763559639453888, "rewards/accuracy_reward": 0.7730474472045898, "rewards/format_reward": 1.0, "step": 300, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 394.484375, "epoch": 0.009153387665734095, "grad_norm": 3.1512766598196396, "kl": 0.0267333984375, "learning_rate": 9.99793284263673e-07, "loss": 0.0011, "reward": 1.430694580078125, "reward_std": 0.14200130105018616, "rewards/accuracy_reward": 0.36819449067115784, "rewards/format_reward": 1.0, "step": 301, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 406.609375, "epoch": 0.009183797591533876, "grad_norm": 1.0791137430105968, "kl": 0.02587890625, "learning_rate": 9.997919085510195e-07, "loss": 0.001, "reward": 1.6759614944458008, "reward_std": 0.17698103189468384, "rewards/accuracy_reward": 0.5697115063667297, "rewards/format_reward": 1.0, "step": 302, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 444.8125, "epoch": 0.009214207517333657, "grad_norm": 1.4418307326155777, "kl": 0.016357421875, "learning_rate": 9.9979052827674e-07, "loss": 0.0007, "reward": 1.655404806137085, "reward_std": 0.1988653838634491, "rewards/accuracy_reward": 0.524154782295227, "rewards/format_reward": 1.0, "step": 303, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 447.109375, "epoch": 0.009244617443133438, "grad_norm": 1.248057559674378, "kl": 0.025390625, "learning_rate": 9.997891434408468e-07, "loss": 0.001, "reward": 1.6764607429504395, "reward_std": 0.25445371866226196, "rewards/accuracy_reward": 0.5983356833457947, "rewards/format_reward": 0.984375, "step": 304, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 397.0625, "epoch": 0.009275027368933219, "grad_norm": 1.2817247996072059, "kl": 0.03125, "learning_rate": 9.997877540433528e-07, "loss": 0.0012, "reward": 1.4456782341003418, "reward_std": 0.11583495885133743, "rewards/accuracy_reward": 0.37380316853523254, "rewards/format_reward": 1.0, "step": 305, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 408.078125, "epoch": 0.009305437294733002, "grad_norm": 1.2715245243088085, "kl": 0.0279541015625, "learning_rate": 9.997863600842707e-07, "loss": 0.0011, "reward": 1.8627594709396362, "reward_std": 0.307189404964447, "rewards/accuracy_reward": 0.7315094470977783, "rewards/format_reward": 1.0, "step": 306, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 387.203125, "epoch": 0.009335847220532783, "grad_norm": 1.2061878735481548, "kl": 0.0250244140625, "learning_rate": 9.99784961563613e-07, "loss": 0.001, "reward": 1.8187501430511475, "reward_std": 0.2811111509799957, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 307, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 383.828125, "epoch": 0.009366257146332563, "grad_norm": 0.5985537422618991, "kl": 0.03125, "learning_rate": 9.997835584813926e-07, "loss": 0.0013, "reward": 1.84726881980896, "reward_std": 0.004020115826278925, "rewards/accuracy_reward": 0.6972686648368835, "rewards/format_reward": 1.0, "step": 308, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 411.046875, "epoch": 0.009396667072132344, "grad_norm": 1.0704198498017132, "kl": 0.020751953125, "learning_rate": 9.997821508376224e-07, "loss": 0.0008, "reward": 1.5381271839141846, "reward_std": 0.06827206909656525, "rewards/accuracy_reward": 0.4193771779537201, "rewards/format_reward": 1.0, "step": 309, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 421.84375, "epoch": 0.009427076997932125, "grad_norm": 0.6895041807599859, "kl": 0.025634765625, "learning_rate": 9.99780738632315e-07, "loss": 0.001, "reward": 1.9787968397140503, "reward_std": 0.1919824779033661, "rewards/accuracy_reward": 0.8475468158721924, "rewards/format_reward": 0.96875, "step": 310, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 373.671875, "epoch": 0.009457486923731906, "grad_norm": 1.3122637558664003, "kl": 0.0247802734375, "learning_rate": 9.997793218654834e-07, "loss": 0.001, "reward": 1.9734376668930054, "reward_std": 0.19686976075172424, "rewards/accuracy_reward": 0.8140624761581421, "rewards/format_reward": 1.0, "step": 311, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 416.859375, "epoch": 0.009487896849531687, "grad_norm": 4.109287020477817, "kl": 0.0233154296875, "learning_rate": 9.997779005371407e-07, "loss": 0.0009, "reward": 1.9256929159164429, "reward_std": 0.06598345935344696, "rewards/accuracy_reward": 0.7819429039955139, "rewards/format_reward": 1.0, "step": 312, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 424.21875, "epoch": 0.009518306775331468, "grad_norm": 1.8737314780510137, "kl": 0.0205078125, "learning_rate": 9.997764746472996e-07, "loss": 0.0008, "reward": 1.8117625713348389, "reward_std": 0.3156144320964813, "rewards/accuracy_reward": 0.664887547492981, "rewards/format_reward": 1.0, "step": 313, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 399.59375, "epoch": 0.009548716701131249, "grad_norm": 1.7064761384703606, "kl": 0.0250244140625, "learning_rate": 9.997750441959734e-07, "loss": 0.001, "reward": 1.916944980621338, "reward_std": 0.14504948258399963, "rewards/accuracy_reward": 0.7700700163841248, "rewards/format_reward": 1.0, "step": 314, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 375.734375, "epoch": 0.00957912662693103, "grad_norm": 1.1583862872424482, "kl": 0.0242919921875, "learning_rate": 9.99773609183175e-07, "loss": 0.001, "reward": 1.4302083253860474, "reward_std": 0.17158865928649902, "rewards/accuracy_reward": 0.3645833432674408, "rewards/format_reward": 1.0, "step": 315, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.671875, "epoch": 0.009609536552730811, "grad_norm": 1.043983484483266, "kl": 0.0185546875, "learning_rate": 9.997721696089173e-07, "loss": 0.0007, "reward": 1.8526010513305664, "reward_std": 0.25343698263168335, "rewards/accuracy_reward": 0.7088510394096375, "rewards/format_reward": 1.0, "step": 316, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.34375, "epoch": 0.009639946478530592, "grad_norm": 1.2435685205190956, "kl": 0.024169921875, "learning_rate": 9.997707254732137e-07, "loss": 0.001, "reward": 2.0125956535339355, "reward_std": 0.08711028099060059, "rewards/accuracy_reward": 0.8625956177711487, "rewards/format_reward": 1.0, "step": 317, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.453125, "epoch": 0.009670356404330373, "grad_norm": 1.2073808849375935, "kl": 0.0191650390625, "learning_rate": 9.997692767760773e-07, "loss": 0.0008, "reward": 1.483243465423584, "reward_std": 0.22430793941020966, "rewards/accuracy_reward": 0.38949358463287354, "rewards/format_reward": 1.0, "step": 318, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 414.921875, "epoch": 0.009700766330130154, "grad_norm": 1.7543338198623577, "kl": 0.01458740234375, "learning_rate": 9.997678235175215e-07, "loss": 0.0006, "reward": 2.0304794311523438, "reward_std": 0.15252670645713806, "rewards/accuracy_reward": 0.8648544549942017, "rewards/format_reward": 1.0, "step": 319, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 394.359375, "epoch": 0.009731176255929936, "grad_norm": 1.1622687202292914, "kl": 0.0203857421875, "learning_rate": 9.997663656975592e-07, "loss": 0.0008, "reward": 1.6910561323165894, "reward_std": 0.03899677097797394, "rewards/accuracy_reward": 0.5723061561584473, "rewards/format_reward": 1.0, "step": 320, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 398.734375, "epoch": 0.009761586181729717, "grad_norm": 0.978334555688099, "kl": 0.01904296875, "learning_rate": 9.997649033162042e-07, "loss": 0.0008, "reward": 1.9593749046325684, "reward_std": 0.11948156356811523, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 321, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 370.0625, "epoch": 0.009791996107529498, "grad_norm": 1.3129654251741718, "kl": 0.021484375, "learning_rate": 9.997634363734692e-07, "loss": 0.0009, "reward": 1.6187500953674316, "reward_std": 0.21988226473331451, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 322, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.53125, "epoch": 0.009822406033329279, "grad_norm": 2.10137825726634, "kl": 0.0247802734375, "learning_rate": 9.997619648693682e-07, "loss": 0.001, "reward": 1.8324732780456543, "reward_std": 0.17295102775096893, "rewards/accuracy_reward": 0.6949732303619385, "rewards/format_reward": 1.0, "step": 323, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 424.796875, "epoch": 0.00985281595912906, "grad_norm": 1.2091792124021508, "kl": 0.0169677734375, "learning_rate": 9.997604888039145e-07, "loss": 0.0007, "reward": 1.9597744941711426, "reward_std": 0.07805581390857697, "rewards/accuracy_reward": 0.8097744584083557, "rewards/format_reward": 1.0, "step": 324, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 468.359375, "epoch": 0.009883225884928841, "grad_norm": 1.470298354998882, "kl": 0.01531982421875, "learning_rate": 9.99759008177121e-07, "loss": 0.0006, "reward": 1.7124056816101074, "reward_std": 0.17168018221855164, "rewards/accuracy_reward": 0.5780308246612549, "rewards/format_reward": 0.984375, "step": 325, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.046875, "epoch": 0.009913635810728622, "grad_norm": 1.4401498106282375, "kl": 0.0263671875, "learning_rate": 9.99757522989002e-07, "loss": 0.0011, "reward": 1.896331548690796, "reward_std": 0.16039817035198212, "rewards/accuracy_reward": 0.7588316202163696, "rewards/format_reward": 1.0, "step": 326, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.09375, "epoch": 0.009944045736528403, "grad_norm": 1.4028236008621378, "kl": 0.0242919921875, "learning_rate": 9.997560332395708e-07, "loss": 0.001, "reward": 1.8496155738830566, "reward_std": 0.17857444286346436, "rewards/accuracy_reward": 0.7121155858039856, "rewards/format_reward": 1.0, "step": 327, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 413.84375, "epoch": 0.009974455662328184, "grad_norm": 1.5576510603963416, "kl": 0.019775390625, "learning_rate": 9.997545389288406e-07, "loss": 0.0008, "reward": 1.7248263359069824, "reward_std": 0.1685350239276886, "rewards/accuracy_reward": 0.5998262166976929, "rewards/format_reward": 1.0, "step": 328, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 390.234375, "epoch": 0.010004865588127965, "grad_norm": 0.9574485456894085, "kl": 0.0191650390625, "learning_rate": 9.997530400568254e-07, "loss": 0.0008, "reward": 1.6866984367370605, "reward_std": 0.12370496988296509, "rewards/accuracy_reward": 0.5523233413696289, "rewards/format_reward": 1.0, "step": 329, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 385.109375, "epoch": 0.010035275513927746, "grad_norm": 0.9200797909039734, "kl": 0.0262451171875, "learning_rate": 9.99751536623539e-07, "loss": 0.0011, "reward": 2.0815439224243164, "reward_std": 0.03328409790992737, "rewards/accuracy_reward": 0.9034188389778137, "rewards/format_reward": 1.0, "step": 330, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 359.296875, "epoch": 0.010065685439727527, "grad_norm": 0.5969029669857804, "kl": 0.027099609375, "learning_rate": 9.99750028628995e-07, "loss": 0.0011, "reward": 1.6343750953674316, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 331, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 434.65625, "epoch": 0.010096095365527307, "grad_norm": 1.3629394053333748, "kl": 0.018310546875, "learning_rate": 9.997485160732069e-07, "loss": 0.0007, "reward": 1.4190571308135986, "reward_std": 0.19542253017425537, "rewards/accuracy_reward": 0.32218223810195923, "rewards/format_reward": 1.0, "step": 332, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.75, "epoch": 0.010126505291327088, "grad_norm": 1.5797602130890627, "kl": 0.0220947265625, "learning_rate": 9.997469989561888e-07, "loss": 0.0009, "reward": 2.048142910003662, "reward_std": 0.0324062705039978, "rewards/accuracy_reward": 0.8512676954269409, "rewards/format_reward": 1.0, "step": 333, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.375, "epoch": 0.010156915217126871, "grad_norm": 1.0092773286353647, "kl": 0.0205078125, "learning_rate": 9.997454772779548e-07, "loss": 0.0008, "reward": 1.8166532516479492, "reward_std": 0.23889750242233276, "rewards/accuracy_reward": 0.6729031801223755, "rewards/format_reward": 1.0, "step": 334, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 399.84375, "epoch": 0.010187325142926652, "grad_norm": 2.470337565938291, "kl": 0.0179443359375, "learning_rate": 9.99743951038518e-07, "loss": 0.0007, "reward": 1.7218533754348755, "reward_std": 0.32006701827049255, "rewards/accuracy_reward": 0.5968533754348755, "rewards/format_reward": 1.0, "step": 335, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 389.3125, "epoch": 0.010217735068726433, "grad_norm": 3.1610778618689848, "kl": 0.0260009765625, "learning_rate": 9.99742420237893e-07, "loss": 0.001, "reward": 1.7296210527420044, "reward_std": 0.15104272961616516, "rewards/accuracy_reward": 0.6108710169792175, "rewards/format_reward": 1.0, "step": 336, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.421875, "epoch": 0.010248144994526214, "grad_norm": 1.6150632585954299, "kl": 0.0244140625, "learning_rate": 9.997408848760936e-07, "loss": 0.001, "reward": 1.8327126502990723, "reward_std": 0.30116862058639526, "rewards/accuracy_reward": 0.6858375072479248, "rewards/format_reward": 1.0, "step": 337, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.25, "epoch": 0.010278554920325995, "grad_norm": 1.3024991197748887, "kl": 0.0281982421875, "learning_rate": 9.997393449531337e-07, "loss": 0.0011, "reward": 1.9110532999038696, "reward_std": 0.15766188502311707, "rewards/accuracy_reward": 0.7516781687736511, "rewards/format_reward": 1.0, "step": 338, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.671875, "epoch": 0.010308964846125776, "grad_norm": 0.8607159669595827, "kl": 0.0218505859375, "learning_rate": 9.997378004690274e-07, "loss": 0.0009, "reward": 2.0114333629608154, "reward_std": 0.15601055324077606, "rewards/accuracy_reward": 0.8676832318305969, "rewards/format_reward": 1.0, "step": 339, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.3125, "epoch": 0.010339374771925557, "grad_norm": 2.3863769002162916, "kl": 0.027099609375, "learning_rate": 9.997362514237888e-07, "loss": 0.0011, "reward": 2.0123977661132812, "reward_std": 0.0494222529232502, "rewards/accuracy_reward": 0.8405227661132812, "rewards/format_reward": 1.0, "step": 340, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 375.75, "epoch": 0.010369784697725338, "grad_norm": 1.8138844938133896, "kl": 0.0284423828125, "learning_rate": 9.997346978174322e-07, "loss": 0.0011, "reward": 1.8250000476837158, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 341, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 417.328125, "epoch": 0.010400194623525118, "grad_norm": 1.4936494375604366, "kl": 0.02099609375, "learning_rate": 9.997331396499716e-07, "loss": 0.0008, "reward": 1.6359374523162842, "reward_std": 0.24820901453495026, "rewards/accuracy_reward": 0.48281240463256836, "rewards/format_reward": 1.0, "step": 342, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.6875, "epoch": 0.0104306045493249, "grad_norm": 1.656477444273406, "kl": 0.022216796875, "learning_rate": 9.997315769214212e-07, "loss": 0.0009, "reward": 1.8265624046325684, "reward_std": 0.24232600629329681, "rewards/accuracy_reward": 0.6796874403953552, "rewards/format_reward": 1.0, "step": 343, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.859375, "epoch": 0.01046101447512468, "grad_norm": 1.6049510398086544, "kl": 0.0281982421875, "learning_rate": 9.997300096317954e-07, "loss": 0.0011, "reward": 1.8283483982086182, "reward_std": 0.08479934930801392, "rewards/accuracy_reward": 0.6908482313156128, "rewards/format_reward": 1.0, "step": 344, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.4375, "epoch": 0.010491424400924461, "grad_norm": 1.4004504887545426, "kl": 0.0240478515625, "learning_rate": 9.997284377811084e-07, "loss": 0.001, "reward": 2.0796875953674316, "reward_std": 0.1020514965057373, "rewards/accuracy_reward": 0.8984375, "rewards/format_reward": 1.0, "step": 345, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 377.40625, "epoch": 0.010521834326724242, "grad_norm": 0.7153849752855842, "kl": 0.0191650390625, "learning_rate": 9.997268613693746e-07, "loss": 0.0008, "reward": 1.6875001192092896, "reward_std": 0.17433080077171326, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 346, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 389.78125, "epoch": 0.010552244252524023, "grad_norm": 0.9256000342917721, "kl": 0.023681640625, "learning_rate": 9.997252803966083e-07, "loss": 0.0009, "reward": 1.892232060432434, "reward_std": 0.13832026720046997, "rewards/accuracy_reward": 0.7422319650650024, "rewards/format_reward": 1.0, "step": 347, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 399.59375, "epoch": 0.010582654178323806, "grad_norm": 0.8294889522002576, "kl": 0.021240234375, "learning_rate": 9.99723694862824e-07, "loss": 0.0009, "reward": 1.9756220579147339, "reward_std": 0.11665228009223938, "rewards/accuracy_reward": 0.7974969744682312, "rewards/format_reward": 1.0, "step": 348, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.859375, "epoch": 0.010613064104123587, "grad_norm": 0.9378216077644611, "kl": 0.02734375, "learning_rate": 9.997221047680362e-07, "loss": 0.0011, "reward": 2.0281248092651367, "reward_std": 0.1600036919116974, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 349, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 417.171875, "epoch": 0.010643474029923368, "grad_norm": 0.6503517655191055, "kl": 0.0208740234375, "learning_rate": 9.997205101122594e-07, "loss": 0.0008, "reward": 1.8544048070907593, "reward_std": 0.14091598987579346, "rewards/accuracy_reward": 0.7169046401977539, "rewards/format_reward": 1.0, "step": 350, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 421.921875, "epoch": 0.010673883955723149, "grad_norm": 1.050949439707556, "kl": 0.02001953125, "learning_rate": 9.997189108955081e-07, "loss": 0.0008, "reward": 1.70163893699646, "reward_std": 0.29886701703071594, "rewards/accuracy_reward": 0.6078888773918152, "rewards/format_reward": 0.984375, "step": 351, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 391.34375, "epoch": 0.01070429388152293, "grad_norm": 1.3003666008893664, "kl": 0.025390625, "learning_rate": 9.99717307117797e-07, "loss": 0.001, "reward": 1.7807180881500244, "reward_std": 0.12660560011863708, "rewards/accuracy_reward": 0.649468183517456, "rewards/format_reward": 1.0, "step": 352, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 385.890625, "epoch": 0.01073470380732271, "grad_norm": 1.5194493284569137, "kl": 0.02783203125, "learning_rate": 9.997156987791406e-07, "loss": 0.0011, "reward": 1.8250924348831177, "reward_std": 0.23386509716510773, "rewards/accuracy_reward": 0.693842351436615, "rewards/format_reward": 1.0, "step": 353, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 404.515625, "epoch": 0.010765113733122491, "grad_norm": 0.9493138487580918, "kl": 0.020751953125, "learning_rate": 9.997140858795538e-07, "loss": 0.0008, "reward": 1.6468260288238525, "reward_std": 0.08402933180332184, "rewards/accuracy_reward": 0.5187010765075684, "rewards/format_reward": 1.0, "step": 354, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 368.328125, "epoch": 0.010795523658922272, "grad_norm": 3.903013956535791, "kl": 0.02783203125, "learning_rate": 9.997124684190512e-07, "loss": 0.0011, "reward": 1.6581554412841797, "reward_std": 0.08747033774852753, "rewards/accuracy_reward": 0.5550304055213928, "rewards/format_reward": 1.0, "step": 355, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.890625, "epoch": 0.010825933584722053, "grad_norm": 1.2268900661624917, "kl": 0.0213623046875, "learning_rate": 9.997108463976473e-07, "loss": 0.0009, "reward": 1.7474215030670166, "reward_std": 0.1925657093524933, "rewards/accuracy_reward": 0.6192964315414429, "rewards/format_reward": 1.0, "step": 356, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 384.9375, "epoch": 0.010856343510521834, "grad_norm": 0.9530083357359483, "kl": 0.0189208984375, "learning_rate": 9.997092198153571e-07, "loss": 0.0008, "reward": 1.9738308191299438, "reward_std": 0.08408787846565247, "rewards/accuracy_reward": 0.7988307476043701, "rewards/format_reward": 1.0, "step": 357, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.875, "epoch": 0.010886753436321615, "grad_norm": 1.58182358242398, "kl": 0.023193359375, "learning_rate": 9.997075886721957e-07, "loss": 0.0009, "reward": 1.8325740098953247, "reward_std": 0.15860262513160706, "rewards/accuracy_reward": 0.6856989860534668, "rewards/format_reward": 1.0, "step": 358, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.96875, "epoch": 0.010917163362121396, "grad_norm": 1.2671627397589489, "kl": 0.0224609375, "learning_rate": 9.997059529681776e-07, "loss": 0.0009, "reward": 1.828125, "reward_std": 0.11202043294906616, "rewards/accuracy_reward": 0.684374988079071, "rewards/format_reward": 1.0, "step": 359, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.859375, "epoch": 0.010947573287921177, "grad_norm": 1.0842802704398273, "kl": 0.018310546875, "learning_rate": 9.99704312703318e-07, "loss": 0.0007, "reward": 1.8854668140411377, "reward_std": 0.08531834185123444, "rewards/accuracy_reward": 0.7167167663574219, "rewards/format_reward": 1.0, "step": 360, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 366.734375, "epoch": 0.010977983213720958, "grad_norm": 1.1785682686143464, "kl": 0.0242919921875, "learning_rate": 9.99702667877632e-07, "loss": 0.001, "reward": 2.0142135620117188, "reward_std": 0.07085821777582169, "rewards/accuracy_reward": 0.8548387289047241, "rewards/format_reward": 1.0, "step": 361, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 443.515625, "epoch": 0.011008393139520739, "grad_norm": 0.7952036734766956, "kl": 0.017578125, "learning_rate": 9.99701018491134e-07, "loss": 0.0007, "reward": 1.9316582679748535, "reward_std": 0.09770317375659943, "rewards/accuracy_reward": 0.7660331130027771, "rewards/format_reward": 1.0, "step": 362, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 384.125, "epoch": 0.011038803065320521, "grad_norm": 1.0050400149275307, "kl": 0.0218505859375, "learning_rate": 9.996993645438396e-07, "loss": 0.0009, "reward": 1.9343750476837158, "reward_std": 0.19957314431667328, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 363, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 423.703125, "epoch": 0.011069212991120302, "grad_norm": 0.7840730194261892, "kl": 0.0181884765625, "learning_rate": 9.996977060357637e-07, "loss": 0.0007, "reward": 1.7778847217559814, "reward_std": 0.17245075106620789, "rewards/accuracy_reward": 0.677884578704834, "rewards/format_reward": 0.96875, "step": 364, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 367.078125, "epoch": 0.011099622916920083, "grad_norm": 1.4423948934019022, "kl": 0.0242919921875, "learning_rate": 9.996960429669215e-07, "loss": 0.001, "reward": 2.010911464691162, "reward_std": 0.10332072526216507, "rewards/accuracy_reward": 0.8296614289283752, "rewards/format_reward": 1.0, "step": 365, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.140625, "epoch": 0.011130032842719864, "grad_norm": 1.3013899508472213, "kl": 0.0185546875, "learning_rate": 9.99694375337328e-07, "loss": 0.0007, "reward": 1.6582443714141846, "reward_std": 0.1563914716243744, "rewards/accuracy_reward": 0.5051193833351135, "rewards/format_reward": 1.0, "step": 366, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 397.0, "epoch": 0.011160442768519645, "grad_norm": 3.4850421876109796, "kl": 0.024169921875, "learning_rate": 9.996927031469988e-07, "loss": 0.001, "reward": 1.6948763132095337, "reward_std": 0.13153031468391418, "rewards/accuracy_reward": 0.55112624168396, "rewards/format_reward": 1.0, "step": 367, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 404.53125, "epoch": 0.011190852694319426, "grad_norm": 1.3760804113238818, "kl": 0.02099609375, "learning_rate": 9.99691026395949e-07, "loss": 0.0008, "reward": 1.8155430555343628, "reward_std": 0.1822381168603897, "rewards/accuracy_reward": 0.6624179482460022, "rewards/format_reward": 1.0, "step": 368, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 405.78125, "epoch": 0.011221262620119207, "grad_norm": 1.4806080781566082, "kl": 0.025634765625, "learning_rate": 9.996893450841937e-07, "loss": 0.001, "reward": 1.8327884674072266, "reward_std": 0.13730138540267944, "rewards/accuracy_reward": 0.6827883124351501, "rewards/format_reward": 1.0, "step": 369, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.5, "epoch": 0.011251672545918988, "grad_norm": 1.2268312466231384, "kl": 0.020751953125, "learning_rate": 9.996876592117482e-07, "loss": 0.0008, "reward": 1.7721421718597412, "reward_std": 0.09305316209793091, "rewards/accuracy_reward": 0.6252670288085938, "rewards/format_reward": 1.0, "step": 370, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 391.734375, "epoch": 0.011282082471718769, "grad_norm": 1.1203238277974892, "kl": 0.0260009765625, "learning_rate": 9.996859687786283e-07, "loss": 0.001, "reward": 1.7975209951400757, "reward_std": 0.07121521234512329, "rewards/accuracy_reward": 0.6443960070610046, "rewards/format_reward": 1.0, "step": 371, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 456.015625, "epoch": 0.01131249239751855, "grad_norm": 0.7459272316525624, "kl": 0.0194091796875, "learning_rate": 9.996842737848492e-07, "loss": 0.0008, "reward": 1.6215076446533203, "reward_std": 0.15464895963668823, "rewards/accuracy_reward": 0.5621325969696045, "rewards/format_reward": 0.953125, "step": 372, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.9375, "epoch": 0.01134290232331833, "grad_norm": 0.7493862450152716, "kl": 0.0242919921875, "learning_rate": 9.996825742304262e-07, "loss": 0.001, "reward": 1.8313472270965576, "reward_std": 0.05798782408237457, "rewards/accuracy_reward": 0.6969722509384155, "rewards/format_reward": 1.0, "step": 373, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 396.140625, "epoch": 0.011373312249118112, "grad_norm": 0.9609906093811658, "kl": 0.02392578125, "learning_rate": 9.996808701153752e-07, "loss": 0.001, "reward": 1.934267282485962, "reward_std": 0.17643608152866364, "rewards/accuracy_reward": 0.7717671990394592, "rewards/format_reward": 1.0, "step": 374, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.375, "completion_length": 399.515625, "epoch": 0.011403722174917892, "grad_norm": 1.2941091441374422, "kl": 0.0234375, "learning_rate": 9.996791614397114e-07, "loss": 0.0009, "reward": 1.3692656755447388, "reward_std": 0.20688988268375397, "rewards/accuracy_reward": 0.3130156397819519, "rewards/format_reward": 1.0, "step": 375, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.078125, "epoch": 0.011434132100717673, "grad_norm": 1.2065358317355055, "kl": 0.0252685546875, "learning_rate": 9.996774482034504e-07, "loss": 0.001, "reward": 1.7968661785125732, "reward_std": 0.19386743009090424, "rewards/accuracy_reward": 0.6562411785125732, "rewards/format_reward": 1.0, "step": 376, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 398.765625, "epoch": 0.011464542026517456, "grad_norm": 0.7803615770999071, "kl": 0.02099609375, "learning_rate": 9.996757304066082e-07, "loss": 0.0008, "reward": 1.9000000953674316, "reward_std": 0.1769564002752304, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 377, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 368.890625, "epoch": 0.011494951952317237, "grad_norm": 1.0723777077267151, "kl": 0.03173828125, "learning_rate": 9.996740080492e-07, "loss": 0.0013, "reward": 1.9687501192092896, "reward_std": 0.13897547125816345, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 378, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 377.515625, "epoch": 0.011525361878117018, "grad_norm": 1.3345379575740723, "kl": 0.0255126953125, "learning_rate": 9.996722811312421e-07, "loss": 0.001, "reward": 1.8573659658432007, "reward_std": 0.13177335262298584, "rewards/accuracy_reward": 0.7073659300804138, "rewards/format_reward": 1.0, "step": 379, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 392.296875, "epoch": 0.011555771803916799, "grad_norm": 0.9948255629135296, "kl": 0.0247802734375, "learning_rate": 9.996705496527498e-07, "loss": 0.001, "reward": 1.844771385192871, "reward_std": 0.11077163368463516, "rewards/accuracy_reward": 0.7041463851928711, "rewards/format_reward": 1.0, "step": 380, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.171875, "epoch": 0.01158618172971658, "grad_norm": 1.529416177804708, "kl": 0.026123046875, "learning_rate": 9.996688136137389e-07, "loss": 0.001, "reward": 1.8979039192199707, "reward_std": 0.11607788503170013, "rewards/accuracy_reward": 0.7166540026664734, "rewards/format_reward": 1.0, "step": 381, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 379.0, "epoch": 0.01161659165551636, "grad_norm": 1.0169282213750153, "kl": 0.021240234375, "learning_rate": 9.996670730142253e-07, "loss": 0.0008, "reward": 1.9374959468841553, "reward_std": 0.08553895354270935, "rewards/accuracy_reward": 0.7718709707260132, "rewards/format_reward": 1.0, "step": 382, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 393.03125, "epoch": 0.011647001581316142, "grad_norm": 3.0262990016823834, "kl": 0.0169677734375, "learning_rate": 9.996653278542252e-07, "loss": 0.0007, "reward": 1.9877909421920776, "reward_std": 0.17407485842704773, "rewards/accuracy_reward": 0.8284158706665039, "rewards/format_reward": 1.0, "step": 383, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.140625, "epoch": 0.011677411507115923, "grad_norm": 0.8052365679051469, "kl": 0.0234375, "learning_rate": 9.99663578133754e-07, "loss": 0.0009, "reward": 1.8718750476837158, "reward_std": 0.13130834698677063, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 384, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 392.59375, "epoch": 0.011707821432915703, "grad_norm": 2.0517034069494704, "kl": 0.0184326171875, "learning_rate": 9.996618238528282e-07, "loss": 0.0007, "reward": 1.7625436782836914, "reward_std": 0.14881935715675354, "rewards/accuracy_reward": 0.6156686544418335, "rewards/format_reward": 1.0, "step": 385, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 382.296875, "epoch": 0.011738231358715484, "grad_norm": 0.7977540648656763, "kl": 0.0255126953125, "learning_rate": 9.996600650114638e-07, "loss": 0.001, "reward": 1.8375000953674316, "reward_std": 0.08920513093471527, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 386, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 396.5, "epoch": 0.011768641284515265, "grad_norm": 0.7601343344208239, "kl": 0.022216796875, "learning_rate": 9.996583016096763e-07, "loss": 0.0009, "reward": 1.6375000476837158, "reward_std": 0.20795778930187225, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 387, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.609375, "epoch": 0.011799051210315046, "grad_norm": 1.1953722749572813, "kl": 0.025146484375, "learning_rate": 9.99656533647482e-07, "loss": 0.001, "reward": 1.6994366645812988, "reward_std": 0.07241673022508621, "rewards/accuracy_reward": 0.5744366645812988, "rewards/format_reward": 1.0, "step": 388, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 401.328125, "epoch": 0.011829461136114827, "grad_norm": 0.8176248441056455, "kl": 0.0205078125, "learning_rate": 9.996547611248974e-07, "loss": 0.0008, "reward": 1.8029773235321045, "reward_std": 0.04074126109480858, "rewards/accuracy_reward": 0.6404772996902466, "rewards/format_reward": 1.0, "step": 389, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 437.15625, "epoch": 0.011859871061914608, "grad_norm": 1.384536645365741, "kl": 0.0224609375, "learning_rate": 9.996529840419384e-07, "loss": 0.0009, "reward": 1.684781551361084, "reward_std": 0.14926466345787048, "rewards/accuracy_reward": 0.5441564321517944, "rewards/format_reward": 1.0, "step": 390, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 349.1875, "epoch": 0.01189028098771439, "grad_norm": 1.6887365664547622, "kl": 0.02880859375, "learning_rate": 9.99651202398621e-07, "loss": 0.0012, "reward": 1.8038419485092163, "reward_std": 0.04790571331977844, "rewards/accuracy_reward": 0.6413419842720032, "rewards/format_reward": 1.0, "step": 391, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 382.53125, "epoch": 0.011920690913514172, "grad_norm": 0.9981259372501113, "kl": 0.02978515625, "learning_rate": 9.99649416194962e-07, "loss": 0.0012, "reward": 1.868749976158142, "reward_std": 0.16536714136600494, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 392, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 353.765625, "epoch": 0.011951100839313953, "grad_norm": 1.8448058299490133, "kl": 0.0260009765625, "learning_rate": 9.996476254309773e-07, "loss": 0.001, "reward": 2.0181920528411865, "reward_std": 0.11770347505807877, "rewards/accuracy_reward": 0.8588169813156128, "rewards/format_reward": 1.0, "step": 393, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 400.84375, "epoch": 0.011981510765113734, "grad_norm": 1.4341111466534822, "kl": 0.0225830078125, "learning_rate": 9.996458301066835e-07, "loss": 0.0009, "reward": 1.7502045631408691, "reward_std": 0.2051336169242859, "rewards/accuracy_reward": 0.6220794916152954, "rewards/format_reward": 1.0, "step": 394, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 371.28125, "epoch": 0.012011920690913514, "grad_norm": 2.315779754381853, "kl": 0.0264892578125, "learning_rate": 9.996440302220967e-07, "loss": 0.0011, "reward": 1.6628079414367676, "reward_std": 0.1338011622428894, "rewards/accuracy_reward": 0.5503078699111938, "rewards/format_reward": 1.0, "step": 395, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 385.75, "epoch": 0.012042330616713295, "grad_norm": 0.49131978481123745, "kl": 0.0240478515625, "learning_rate": 9.996422257772335e-07, "loss": 0.001, "reward": 1.8250000476837158, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 396, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 394.03125, "epoch": 0.012072740542513076, "grad_norm": 3.388649828030671, "kl": 0.0218505859375, "learning_rate": 9.996404167721101e-07, "loss": 0.0009, "reward": 1.9327692985534668, "reward_std": 0.1615990549325943, "rewards/accuracy_reward": 0.7577692270278931, "rewards/format_reward": 1.0, "step": 397, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 361.90625, "epoch": 0.012103150468312857, "grad_norm": 9.815904613753887, "kl": 0.029296875, "learning_rate": 9.996386032067437e-07, "loss": 0.0012, "reward": 1.9656250476837158, "reward_std": 0.12840762734413147, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 398, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 371.828125, "epoch": 0.012133560394112638, "grad_norm": 1.2110059490884901, "kl": 0.0242919921875, "learning_rate": 9.996367850811503e-07, "loss": 0.001, "reward": 1.875, "reward_std": 0.06460576504468918, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 399, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.921875, "epoch": 0.012163970319912419, "grad_norm": 5.484870431099293, "kl": 0.028564453125, "learning_rate": 9.996349623953464e-07, "loss": 0.0011, "reward": 2.0436489582061768, "reward_std": 0.08788136392831802, "rewards/accuracy_reward": 0.8686488270759583, "rewards/format_reward": 1.0, "step": 400, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.5, "epoch": 0.0121943802457122, "grad_norm": 0.9080088211059828, "kl": 0.0296630859375, "learning_rate": 9.996331351493488e-07, "loss": 0.0012, "reward": 2.0423500537872314, "reward_std": 0.11549904942512512, "rewards/accuracy_reward": 0.8673499822616577, "rewards/format_reward": 1.0, "step": 401, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.15625, "epoch": 0.012224790171511981, "grad_norm": 1.1423769537719999, "kl": 0.02099609375, "learning_rate": 9.996313033431744e-07, "loss": 0.0008, "reward": 1.8997764587402344, "reward_std": 0.1524738222360611, "rewards/accuracy_reward": 0.7310263514518738, "rewards/format_reward": 1.0, "step": 402, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.59375, "epoch": 0.012255200097311762, "grad_norm": 2.2975493526546202, "kl": 0.0294189453125, "learning_rate": 9.996294669768398e-07, "loss": 0.0012, "reward": 1.864615559577942, "reward_std": 0.08533743023872375, "rewards/accuracy_reward": 0.6833655834197998, "rewards/format_reward": 1.0, "step": 403, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 361.84375, "epoch": 0.012285610023111543, "grad_norm": 1.6653609743276963, "kl": 0.0286865234375, "learning_rate": 9.996276260503616e-07, "loss": 0.0011, "reward": 1.8635417222976685, "reward_std": 0.20986217260360718, "rewards/accuracy_reward": 0.7447916865348816, "rewards/format_reward": 0.984375, "step": 404, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 440.765625, "epoch": 0.012316019948911325, "grad_norm": 1.2945540418279038, "kl": 0.0211181640625, "learning_rate": 9.996257805637564e-07, "loss": 0.0008, "reward": 1.7538068294525146, "reward_std": 0.16644743084907532, "rewards/accuracy_reward": 0.622556746006012, "rewards/format_reward": 1.0, "step": 405, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.296875, "epoch": 0.012346429874711106, "grad_norm": 0.9565332068062118, "kl": 0.029541015625, "learning_rate": 9.996239305170417e-07, "loss": 0.0012, "reward": 1.7602752447128296, "reward_std": 0.1293303519487381, "rewards/accuracy_reward": 0.6227751970291138, "rewards/format_reward": 1.0, "step": 406, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 380.984375, "epoch": 0.012376839800510887, "grad_norm": 1.1816180748076128, "kl": 0.023193359375, "learning_rate": 9.99622075910234e-07, "loss": 0.0009, "reward": 1.972031831741333, "reward_std": 0.08351900428533554, "rewards/accuracy_reward": 0.8157817125320435, "rewards/format_reward": 1.0, "step": 407, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.609375, "epoch": 0.012407249726310668, "grad_norm": 1.5642399928788158, "kl": 0.0262451171875, "learning_rate": 9.9962021674335e-07, "loss": 0.0011, "reward": 1.9522716999053955, "reward_std": 0.1168014332652092, "rewards/accuracy_reward": 0.7835216522216797, "rewards/format_reward": 1.0, "step": 408, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.59375, "epoch": 0.01243765965211045, "grad_norm": 3.2963069861183865, "kl": 0.02294921875, "learning_rate": 9.996183530164074e-07, "loss": 0.0009, "reward": 1.9045774936676025, "reward_std": 0.10382448881864548, "rewards/accuracy_reward": 0.7483274936676025, "rewards/format_reward": 1.0, "step": 409, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.078125, "epoch": 0.01246806957791023, "grad_norm": 1.677188381525764, "kl": 0.0264892578125, "learning_rate": 9.996164847294222e-07, "loss": 0.0011, "reward": 2.065524101257324, "reward_std": 0.07782972604036331, "rewards/accuracy_reward": 0.8811489939689636, "rewards/format_reward": 1.0, "step": 410, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.625, "epoch": 0.012498479503710011, "grad_norm": 2.1613245758812982, "kl": 0.0244140625, "learning_rate": 9.996146118824123e-07, "loss": 0.001, "reward": 1.9531219005584717, "reward_std": 0.22167542576789856, "rewards/accuracy_reward": 0.8156219720840454, "rewards/format_reward": 1.0, "step": 411, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.921875, "epoch": 0.012528889429509792, "grad_norm": 1.4698928795060002, "kl": 0.02001953125, "learning_rate": 9.996127344753944e-07, "loss": 0.0008, "reward": 2.070927858352661, "reward_std": 0.08682312071323395, "rewards/accuracy_reward": 0.8865528702735901, "rewards/format_reward": 1.0, "step": 412, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.28125, "epoch": 0.012559299355309573, "grad_norm": 0.9000164852037478, "kl": 0.02734375, "learning_rate": 9.996108525083857e-07, "loss": 0.0011, "reward": 2.0280327796936035, "reward_std": 0.07457979023456573, "rewards/accuracy_reward": 0.859282910823822, "rewards/format_reward": 1.0, "step": 413, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 368.4375, "epoch": 0.012589709281109354, "grad_norm": 0.9333505076634158, "kl": 0.0252685546875, "learning_rate": 9.996089659814032e-07, "loss": 0.001, "reward": 1.778926134109497, "reward_std": 0.10284383594989777, "rewards/accuracy_reward": 0.6445510387420654, "rewards/format_reward": 1.0, "step": 414, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 368.453125, "epoch": 0.012620119206909135, "grad_norm": 1.1109114673997336, "kl": 0.0250244140625, "learning_rate": 9.996070748944646e-07, "loss": 0.001, "reward": 1.7941220998764038, "reward_std": 0.03619522601366043, "rewards/accuracy_reward": 0.6503720283508301, "rewards/format_reward": 1.0, "step": 415, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 374.171875, "epoch": 0.012650529132708916, "grad_norm": 4.067140537038911, "kl": 0.0242919921875, "learning_rate": 9.996051792475869e-07, "loss": 0.001, "reward": 2.0318899154663086, "reward_std": 0.13213837146759033, "rewards/accuracy_reward": 0.850639820098877, "rewards/format_reward": 1.0, "step": 416, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 394.578125, "epoch": 0.012680939058508697, "grad_norm": 1.0856540189127113, "kl": 0.0196533203125, "learning_rate": 9.99603279040787e-07, "loss": 0.0008, "reward": 1.801923394203186, "reward_std": 0.1127195805311203, "rewards/accuracy_reward": 0.6362982988357544, "rewards/format_reward": 1.0, "step": 417, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 401.921875, "epoch": 0.012711348984308478, "grad_norm": 0.9105330854135775, "kl": 0.022216796875, "learning_rate": 9.99601374274083e-07, "loss": 0.0009, "reward": 1.8045833110809326, "reward_std": 0.12587976455688477, "rewards/accuracy_reward": 0.6670833230018616, "rewards/format_reward": 1.0, "step": 418, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 431.546875, "epoch": 0.01274175891010826, "grad_norm": 1.1347127894406621, "kl": 0.0179443359375, "learning_rate": 9.995994649474918e-07, "loss": 0.0007, "reward": 1.8670748472213745, "reward_std": 0.0404692068696022, "rewards/accuracy_reward": 0.7170748114585876, "rewards/format_reward": 1.0, "step": 419, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 394.5, "epoch": 0.012772168835908041, "grad_norm": 0.9449873289506605, "kl": 0.025634765625, "learning_rate": 9.995975510610307e-07, "loss": 0.001, "reward": 1.836775302886963, "reward_std": 0.1473839282989502, "rewards/accuracy_reward": 0.6992752552032471, "rewards/format_reward": 0.984375, "step": 420, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 375.265625, "epoch": 0.012802578761707822, "grad_norm": 1.0516091777259904, "kl": 0.0244140625, "learning_rate": 9.995956326147176e-07, "loss": 0.001, "reward": 1.7887868881225586, "reward_std": 0.07155824452638626, "rewards/accuracy_reward": 0.663786768913269, "rewards/format_reward": 1.0, "step": 421, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 377.953125, "epoch": 0.012832988687507603, "grad_norm": 0.5694380019233464, "kl": 0.024169921875, "learning_rate": 9.995937096085697e-07, "loss": 0.001, "reward": 1.7468750476837158, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 422, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 424.359375, "epoch": 0.012863398613307384, "grad_norm": 1.127248002853664, "kl": 0.015869140625, "learning_rate": 9.995917820426047e-07, "loss": 0.0006, "reward": 1.7372026443481445, "reward_std": 0.17061707377433777, "rewards/accuracy_reward": 0.5997025966644287, "rewards/format_reward": 1.0, "step": 423, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 413.609375, "epoch": 0.012893808539107165, "grad_norm": 3.974309207935761, "kl": 0.0201416015625, "learning_rate": 9.995898499168402e-07, "loss": 0.0008, "reward": 1.7879559993743896, "reward_std": 0.194219708442688, "rewards/accuracy_reward": 0.6504560708999634, "rewards/format_reward": 1.0, "step": 424, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 376.859375, "epoch": 0.012924218464906946, "grad_norm": 0.622142736111521, "kl": 0.027099609375, "learning_rate": 9.995879132312936e-07, "loss": 0.0011, "reward": 1.9343750476837158, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 425, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 375.203125, "epoch": 0.012954628390706727, "grad_norm": 1.5903279331050733, "kl": 0.026123046875, "learning_rate": 9.99585971985983e-07, "loss": 0.001, "reward": 1.6797423362731934, "reward_std": 0.150284543633461, "rewards/accuracy_reward": 0.5516172647476196, "rewards/format_reward": 1.0, "step": 426, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 380.953125, "epoch": 0.012985038316506508, "grad_norm": 1.062176531375013, "kl": 0.0216064453125, "learning_rate": 9.995840261809256e-07, "loss": 0.0009, "reward": 1.8180136680603027, "reward_std": 0.08744699507951736, "rewards/accuracy_reward": 0.6523886919021606, "rewards/format_reward": 1.0, "step": 427, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 369.65625, "epoch": 0.013015448242306289, "grad_norm": 0.8337367026433918, "kl": 0.0274658203125, "learning_rate": 9.995820758161396e-07, "loss": 0.0011, "reward": 1.8878841400146484, "reward_std": 0.1377033144235611, "rewards/accuracy_reward": 0.7347592115402222, "rewards/format_reward": 1.0, "step": 428, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 384.75, "epoch": 0.01304585816810607, "grad_norm": 0.6084481475037314, "kl": 0.0211181640625, "learning_rate": 9.995801208916426e-07, "loss": 0.0008, "reward": 1.8877935409545898, "reward_std": 0.09616396576166153, "rewards/accuracy_reward": 0.7315435409545898, "rewards/format_reward": 1.0, "step": 429, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 377.203125, "epoch": 0.01307626809390585, "grad_norm": 1.098252325031774, "kl": 0.02490234375, "learning_rate": 9.995781614074525e-07, "loss": 0.001, "reward": 1.779937505722046, "reward_std": 0.08466672152280807, "rewards/accuracy_reward": 0.6393124461174011, "rewards/format_reward": 1.0, "step": 430, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 407.640625, "epoch": 0.013106678019705631, "grad_norm": 1.8846486597481995, "kl": 0.0233154296875, "learning_rate": 9.995761973635873e-07, "loss": 0.0009, "reward": 1.7590913772583008, "reward_std": 0.23392775654792786, "rewards/accuracy_reward": 0.6340913772583008, "rewards/format_reward": 1.0, "step": 431, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.203125, "epoch": 0.013137087945505412, "grad_norm": 1.2557707122278616, "kl": 0.0308837890625, "learning_rate": 9.995742287600645e-07, "loss": 0.0012, "reward": 1.8699300289154053, "reward_std": 0.10238337516784668, "rewards/accuracy_reward": 0.7168049812316895, "rewards/format_reward": 1.0, "step": 432, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 371.921875, "epoch": 0.013167497871305193, "grad_norm": 1.5699627776590146, "kl": 0.0301513671875, "learning_rate": 9.995722555969026e-07, "loss": 0.0012, "reward": 1.8518974781036377, "reward_std": 0.2162868082523346, "rewards/accuracy_reward": 0.6925222277641296, "rewards/format_reward": 1.0, "step": 433, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 380.046875, "epoch": 0.013197907797104976, "grad_norm": 0.9288802923072665, "kl": 0.0225830078125, "learning_rate": 9.995702778741192e-07, "loss": 0.0009, "reward": 1.7594242095947266, "reward_std": 0.07744473218917847, "rewards/accuracy_reward": 0.6219242215156555, "rewards/format_reward": 1.0, "step": 434, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.71875, "epoch": 0.013228317722904757, "grad_norm": 1.3675505091144904, "kl": 0.0220947265625, "learning_rate": 9.995682955917326e-07, "loss": 0.0009, "reward": 1.9266667366027832, "reward_std": 0.09359733015298843, "rewards/accuracy_reward": 0.7422915697097778, "rewards/format_reward": 1.0, "step": 435, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.484375, "epoch": 0.013258727648704538, "grad_norm": 2.23819519846205, "kl": 0.0238037109375, "learning_rate": 9.995663087497608e-07, "loss": 0.001, "reward": 1.9015331268310547, "reward_std": 0.03403928130865097, "rewards/accuracy_reward": 0.7484080791473389, "rewards/format_reward": 1.0, "step": 436, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 375.9375, "epoch": 0.013289137574504319, "grad_norm": 4.706405211413954, "kl": 0.025390625, "learning_rate": 9.995643173482218e-07, "loss": 0.001, "reward": 1.5572465658187866, "reward_std": 0.03346993029117584, "rewards/accuracy_reward": 0.4416215419769287, "rewards/format_reward": 1.0, "step": 437, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 404.921875, "epoch": 0.0133195475003041, "grad_norm": 0.7703207420764284, "kl": 0.01953125, "learning_rate": 9.99562321387134e-07, "loss": 0.0008, "reward": 1.5982862710952759, "reward_std": 0.06301999092102051, "rewards/accuracy_reward": 0.48578622937202454, "rewards/format_reward": 1.0, "step": 438, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.0625, "epoch": 0.01334995742610388, "grad_norm": 1.468654125223099, "kl": 0.0218505859375, "learning_rate": 9.995603208665157e-07, "loss": 0.0009, "reward": 1.6567028760910034, "reward_std": 0.11169316619634628, "rewards/accuracy_reward": 0.5223277807235718, "rewards/format_reward": 1.0, "step": 439, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 360.3125, "epoch": 0.013380367351903661, "grad_norm": 1.126958068422436, "kl": 0.022216796875, "learning_rate": 9.995583157863848e-07, "loss": 0.0009, "reward": 1.9796874523162842, "reward_std": 0.08926186710596085, "rewards/accuracy_reward": 0.8203125, "rewards/format_reward": 1.0, "step": 440, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 397.078125, "epoch": 0.013410777277703442, "grad_norm": 2.989665454494861, "kl": 0.0213623046875, "learning_rate": 9.995563061467597e-07, "loss": 0.0009, "reward": 1.781590461730957, "reward_std": 0.08468598127365112, "rewards/accuracy_reward": 0.6190904378890991, "rewards/format_reward": 1.0, "step": 441, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 369.140625, "epoch": 0.013441187203503223, "grad_norm": 1.0396161274260947, "kl": 0.023681640625, "learning_rate": 9.995542919476592e-07, "loss": 0.0009, "reward": 1.790672779083252, "reward_std": 0.08840832859277725, "rewards/accuracy_reward": 0.6500478386878967, "rewards/format_reward": 1.0, "step": 442, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 390.53125, "epoch": 0.013471597129303004, "grad_norm": 1.4791970001769736, "kl": 0.0238037109375, "learning_rate": 9.99552273189101e-07, "loss": 0.001, "reward": 1.8770506381988525, "reward_std": 0.06989116966724396, "rewards/accuracy_reward": 0.7114256024360657, "rewards/format_reward": 1.0, "step": 443, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.296875, "epoch": 0.013502007055102785, "grad_norm": 1.1384169772377273, "kl": 0.02294921875, "learning_rate": 9.99550249871104e-07, "loss": 0.0009, "reward": 1.9211235046386719, "reward_std": 0.20084112882614136, "rewards/accuracy_reward": 0.7679983973503113, "rewards/format_reward": 1.0, "step": 444, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 395.890625, "epoch": 0.013532416980902566, "grad_norm": 0.6153453961417708, "kl": 0.02001953125, "learning_rate": 9.995482219936864e-07, "loss": 0.0008, "reward": 2.0562500953674316, "reward_std": 0.09121407568454742, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 445, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.984375, "epoch": 0.013562826906702347, "grad_norm": 0.8618215192613391, "kl": 0.0218505859375, "learning_rate": 9.995461895568668e-07, "loss": 0.0009, "reward": 1.9375, "reward_std": 0.15492431819438934, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 446, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 399.984375, "epoch": 0.013593236832502128, "grad_norm": 1.5945336232652259, "kl": 0.0235595703125, "learning_rate": 9.99544152560664e-07, "loss": 0.0009, "reward": 1.7949377298355103, "reward_std": 0.05537300929427147, "rewards/accuracy_reward": 0.6355627775192261, "rewards/format_reward": 1.0, "step": 447, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 396.71875, "epoch": 0.01362364675830191, "grad_norm": 0.8008103675423062, "kl": 0.0216064453125, "learning_rate": 9.99542111005096e-07, "loss": 0.0009, "reward": 1.7746574878692627, "reward_std": 0.07270940393209457, "rewards/accuracy_reward": 0.6527825593948364, "rewards/format_reward": 1.0, "step": 448, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.5625, "epoch": 0.013654056684101691, "grad_norm": 1.105572543787844, "kl": 0.0233154296875, "learning_rate": 9.995400648901823e-07, "loss": 0.0009, "reward": 1.7727954387664795, "reward_std": 0.08363926410675049, "rewards/accuracy_reward": 0.6415454149246216, "rewards/format_reward": 1.0, "step": 449, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 373.46875, "epoch": 0.013684466609901472, "grad_norm": 0.6018386909568895, "kl": 0.027099609375, "learning_rate": 9.995380142159406e-07, "loss": 0.0011, "reward": 1.896875023841858, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 450, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 368.0625, "epoch": 0.013714876535701253, "grad_norm": 0.8380762449116346, "kl": 0.0277099609375, "learning_rate": 9.995359589823904e-07, "loss": 0.0011, "reward": 1.9687501192092896, "reward_std": 0.0978560596704483, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 451, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 401.296875, "epoch": 0.013745286461501034, "grad_norm": 2.0286423199653627, "kl": 0.0225830078125, "learning_rate": 9.995338991895498e-07, "loss": 0.0009, "reward": 1.6641693115234375, "reward_std": 0.165618896484375, "rewards/accuracy_reward": 0.5485442876815796, "rewards/format_reward": 1.0, "step": 452, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 402.984375, "epoch": 0.013775696387300815, "grad_norm": 1.1305742453533707, "kl": 0.0228271484375, "learning_rate": 9.995318348374384e-07, "loss": 0.0009, "reward": 1.8092817068099976, "reward_std": 0.11414626240730286, "rewards/accuracy_reward": 0.6780316233634949, "rewards/format_reward": 1.0, "step": 453, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.90625, "epoch": 0.013806106313100596, "grad_norm": 1.1221976435534433, "kl": 0.0244140625, "learning_rate": 9.995297659260742e-07, "loss": 0.001, "reward": 1.8606196641921997, "reward_std": 0.1809464693069458, "rewards/accuracy_reward": 0.7231194972991943, "rewards/format_reward": 1.0, "step": 454, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.34375, "epoch": 0.013836516238900377, "grad_norm": 1.314333836821333, "kl": 0.0263671875, "learning_rate": 9.995276924554767e-07, "loss": 0.0011, "reward": 1.8475112915039062, "reward_std": 0.23364703357219696, "rewards/accuracy_reward": 0.7006362676620483, "rewards/format_reward": 1.0, "step": 455, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 390.71875, "epoch": 0.013866926164700158, "grad_norm": 0.7309687189504123, "kl": 0.0257568359375, "learning_rate": 9.995256144256644e-07, "loss": 0.001, "reward": 1.9031249284744263, "reward_std": 0.14942099153995514, "rewards/accuracy_reward": 0.7593749761581421, "rewards/format_reward": 1.0, "step": 456, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 390.5, "epoch": 0.013897336090499939, "grad_norm": 0.9400969584701695, "kl": 0.0262451171875, "learning_rate": 9.995235318366566e-07, "loss": 0.0011, "reward": 1.8725000619888306, "reward_std": 0.07008225470781326, "rewards/accuracy_reward": 0.7318750023841858, "rewards/format_reward": 1.0, "step": 457, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 371.84375, "epoch": 0.01392774601629972, "grad_norm": 1.0806275889339196, "kl": 0.0289306640625, "learning_rate": 9.995214446884722e-07, "loss": 0.0012, "reward": 2.171875, "reward_std": 0.07344460487365723, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 458, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 385.015625, "epoch": 0.0139581559420995, "grad_norm": 0.8312789204091494, "kl": 0.03173828125, "learning_rate": 9.995193529811301e-07, "loss": 0.0013, "reward": 1.9650859832763672, "reward_std": 0.018264738842844963, "rewards/accuracy_reward": 0.7963359355926514, "rewards/format_reward": 1.0, "step": 459, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 400.46875, "epoch": 0.013988565867899282, "grad_norm": 1.5996490673044275, "kl": 0.02294921875, "learning_rate": 9.995172567146496e-07, "loss": 0.0009, "reward": 1.7979313135147095, "reward_std": 0.09172330796718597, "rewards/accuracy_reward": 0.6416813135147095, "rewards/format_reward": 1.0, "step": 460, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 383.796875, "epoch": 0.014018975793699063, "grad_norm": 1.2351942913074625, "kl": 0.0281982421875, "learning_rate": 9.995151558890495e-07, "loss": 0.0011, "reward": 1.851953148841858, "reward_std": 0.18310530483722687, "rewards/accuracy_reward": 0.705078125, "rewards/format_reward": 1.0, "step": 461, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 443.703125, "epoch": 0.014049385719498845, "grad_norm": 1.7411484741607874, "kl": 0.02490234375, "learning_rate": 9.995130505043495e-07, "loss": 0.001, "reward": 1.2498527765274048, "reward_std": 0.13263455033302307, "rewards/accuracy_reward": 0.20610277354717255, "rewards/format_reward": 1.0, "step": 462, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 380.859375, "epoch": 0.014079795645298626, "grad_norm": 1.2495323644891858, "kl": 0.0302734375, "learning_rate": 9.995109405605683e-07, "loss": 0.0012, "reward": 1.4718749523162842, "reward_std": 0.101327084004879, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 463, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 403.015625, "epoch": 0.014110205571098407, "grad_norm": 0.4181846124701794, "kl": 0.0247802734375, "learning_rate": 9.995088260577257e-07, "loss": 0.001, "reward": 1.9343749284744263, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 464, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 387.28125, "epoch": 0.014140615496898188, "grad_norm": 0.7953206755988361, "kl": 0.029052734375, "learning_rate": 9.995067069958404e-07, "loss": 0.0012, "reward": 1.8015625476837158, "reward_std": 0.08693268150091171, "rewards/accuracy_reward": 0.6703125238418579, "rewards/format_reward": 1.0, "step": 465, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 377.359375, "epoch": 0.014171025422697969, "grad_norm": 0.9408394023777019, "kl": 0.025634765625, "learning_rate": 9.995045833749323e-07, "loss": 0.001, "reward": 2.1937499046325684, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 466, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 404.890625, "epoch": 0.01420143534849775, "grad_norm": 1.0781516326422151, "kl": 0.026611328125, "learning_rate": 9.995024551950203e-07, "loss": 0.0011, "reward": 1.794966220855713, "reward_std": 0.14499084651470184, "rewards/accuracy_reward": 0.6574661731719971, "rewards/format_reward": 1.0, "step": 467, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 411.578125, "epoch": 0.01423184527429753, "grad_norm": 1.8664424004576747, "kl": 0.0252685546875, "learning_rate": 9.99500322456124e-07, "loss": 0.001, "reward": 1.8546946048736572, "reward_std": 0.10674580186605453, "rewards/accuracy_reward": 0.6796945929527283, "rewards/format_reward": 1.0, "step": 468, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 402.0, "epoch": 0.014262255200097312, "grad_norm": 8.499584285454972, "kl": 0.0262451171875, "learning_rate": 9.99498185158263e-07, "loss": 0.001, "reward": 1.8758907318115234, "reward_std": 0.13766837120056152, "rewards/accuracy_reward": 0.7321406602859497, "rewards/format_reward": 1.0, "step": 469, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 403.109375, "epoch": 0.014292665125897093, "grad_norm": 1.8972264783382558, "kl": 0.024169921875, "learning_rate": 9.994960433014568e-07, "loss": 0.001, "reward": 1.4283539056777954, "reward_std": 0.15578557550907135, "rewards/accuracy_reward": 0.3627288341522217, "rewards/format_reward": 1.0, "step": 470, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.09375, "epoch": 0.014323075051696874, "grad_norm": 0.9301703657198602, "kl": 0.024658203125, "learning_rate": 9.994938968857248e-07, "loss": 0.001, "reward": 1.8406250476837158, "reward_std": 0.183067187666893, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 471, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 432.453125, "epoch": 0.014353484977496654, "grad_norm": 1.2812077327499334, "kl": 0.0206298828125, "learning_rate": 9.994917459110866e-07, "loss": 0.0008, "reward": 1.6236194372177124, "reward_std": 0.25698375701904297, "rewards/accuracy_reward": 0.5392444729804993, "rewards/format_reward": 0.96875, "step": 472, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 397.03125, "epoch": 0.014383894903296435, "grad_norm": 1.476101342228793, "kl": 0.018798828125, "learning_rate": 9.994895903775618e-07, "loss": 0.0008, "reward": 1.9296013116836548, "reward_std": 0.165361687541008, "rewards/accuracy_reward": 0.7671012878417969, "rewards/format_reward": 1.0, "step": 473, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.5625, "epoch": 0.014414304829096216, "grad_norm": 1.2054324526869955, "kl": 0.030517578125, "learning_rate": 9.994874302851704e-07, "loss": 0.0012, "reward": 1.8375000953674316, "reward_std": 0.12992841005325317, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 474, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.21875, "epoch": 0.014444714754895997, "grad_norm": 0.9605522805925136, "kl": 0.022216796875, "learning_rate": 9.994852656339317e-07, "loss": 0.0009, "reward": 1.8764104843139648, "reward_std": 0.08165568858385086, "rewards/accuracy_reward": 0.7107853889465332, "rewards/format_reward": 1.0, "step": 475, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.3125, "epoch": 0.01447512468069578, "grad_norm": 1.5557925777476829, "kl": 0.0234375, "learning_rate": 9.994830964238655e-07, "loss": 0.0009, "reward": 1.9632638692855835, "reward_std": 0.13806749880313873, "rewards/accuracy_reward": 0.8070138096809387, "rewards/format_reward": 1.0, "step": 476, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 407.546875, "epoch": 0.01450553460649556, "grad_norm": 1.594750723693762, "kl": 0.023193359375, "learning_rate": 9.99480922654992e-07, "loss": 0.0009, "reward": 1.8550639152526855, "reward_std": 0.18018555641174316, "rewards/accuracy_reward": 0.7019388675689697, "rewards/format_reward": 1.0, "step": 477, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.125, "epoch": 0.014535944532295342, "grad_norm": 1.1981462114456876, "kl": 0.0203857421875, "learning_rate": 9.994787443273305e-07, "loss": 0.0008, "reward": 1.9415209293365479, "reward_std": 0.15329420566558838, "rewards/accuracy_reward": 0.7727709412574768, "rewards/format_reward": 1.0, "step": 478, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 366.703125, "epoch": 0.014566354458095123, "grad_norm": 1.0124550289266019, "kl": 0.0235595703125, "learning_rate": 9.994765614409013e-07, "loss": 0.0009, "reward": 1.772862434387207, "reward_std": 0.05820607393980026, "rewards/accuracy_reward": 0.6228625178337097, "rewards/format_reward": 1.0, "step": 479, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 364.421875, "epoch": 0.014596764383894904, "grad_norm": 1.73310972399627, "kl": 0.026611328125, "learning_rate": 9.99474373995724e-07, "loss": 0.0011, "reward": 1.9468750953674316, "reward_std": 0.15362296998500824, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 480, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 367.984375, "epoch": 0.014627174309694685, "grad_norm": 1.278776072064968, "kl": 0.0240478515625, "learning_rate": 9.99472181991819e-07, "loss": 0.001, "reward": 1.6767091751098633, "reward_std": 0.08327358961105347, "rewards/accuracy_reward": 0.5485841631889343, "rewards/format_reward": 1.0, "step": 481, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 361.75, "epoch": 0.014657584235494465, "grad_norm": 1.1354116964573329, "kl": 0.0301513671875, "learning_rate": 9.99469985429206e-07, "loss": 0.0012, "reward": 2.0374999046325684, "reward_std": 0.029250435531139374, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 482, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 368.296875, "epoch": 0.014687994161294246, "grad_norm": 0.9816022633769056, "kl": 0.022705078125, "learning_rate": 9.994677843079049e-07, "loss": 0.0009, "reward": 1.9828240871429443, "reward_std": 0.14439547061920166, "rewards/accuracy_reward": 0.7984490394592285, "rewards/format_reward": 1.0, "step": 483, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 357.4375, "epoch": 0.014718404087094027, "grad_norm": 1.3762233724222692, "kl": 0.0238037109375, "learning_rate": 9.99465578627936e-07, "loss": 0.001, "reward": 1.73703932762146, "reward_std": 0.22470009326934814, "rewards/accuracy_reward": 0.6151642799377441, "rewards/format_reward": 1.0, "step": 484, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 341.515625, "epoch": 0.014748814012893808, "grad_norm": 21.483624729352442, "kl": 0.0299072265625, "learning_rate": 9.994633683893194e-07, "loss": 0.0012, "reward": 1.890625, "reward_std": 0.020411595702171326, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 485, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 350.671875, "epoch": 0.01477922393869359, "grad_norm": 4.750600359663863, "kl": 0.030517578125, "learning_rate": 9.994611535920755e-07, "loss": 0.0012, "reward": 1.8562500476837158, "reward_std": 0.1672925502061844, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 486, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 345.40625, "epoch": 0.01480963386449337, "grad_norm": 0.4490329278190297, "kl": 0.032470703125, "learning_rate": 9.994589342362242e-07, "loss": 0.0013, "reward": 1.8968751430511475, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 487, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.328125, "epoch": 0.014840043790293151, "grad_norm": 0.8811612692288447, "kl": 0.025146484375, "learning_rate": 9.994567103217857e-07, "loss": 0.001, "reward": 2.1192474365234375, "reward_std": 0.02281036786735058, "rewards/accuracy_reward": 0.9192472696304321, "rewards/format_reward": 1.0, "step": 488, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 409.390625, "epoch": 0.014870453716092932, "grad_norm": 1.449497617084126, "kl": 0.0244140625, "learning_rate": 9.994544818487807e-07, "loss": 0.001, "reward": 1.4860035181045532, "reward_std": 0.030808432027697563, "rewards/accuracy_reward": 0.40787848830223083, "rewards/format_reward": 1.0, "step": 489, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 369.1875, "epoch": 0.014900863641892715, "grad_norm": 1.197418595850855, "kl": 0.026123046875, "learning_rate": 9.994522488172292e-07, "loss": 0.001, "reward": 1.9457591772079468, "reward_std": 0.13133002817630768, "rewards/accuracy_reward": 0.7832591533660889, "rewards/format_reward": 1.0, "step": 490, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 434.578125, "epoch": 0.014931273567692496, "grad_norm": 3.8766259808301777, "kl": 0.0167236328125, "learning_rate": 9.994500112271515e-07, "loss": 0.0007, "reward": 1.9357054233551025, "reward_std": 0.2655298113822937, "rewards/accuracy_reward": 0.8013304471969604, "rewards/format_reward": 0.984375, "step": 491, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 356.578125, "epoch": 0.014961683493492276, "grad_norm": 2.0308594187354307, "kl": 0.0277099609375, "learning_rate": 9.994477690785683e-07, "loss": 0.0011, "reward": 1.9444613456726074, "reward_std": 0.13510005176067352, "rewards/accuracy_reward": 0.7725862264633179, "rewards/format_reward": 1.0, "step": 492, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 380.0625, "epoch": 0.014992093419292057, "grad_norm": 0.7251449247281695, "kl": 0.0322265625, "learning_rate": 9.994455223715e-07, "loss": 0.0013, "reward": 1.8531250953674316, "reward_std": 0.15011084079742432, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 493, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 366.515625, "epoch": 0.015022503345091838, "grad_norm": 1.3961813675517114, "kl": 0.0228271484375, "learning_rate": 9.99443271105967e-07, "loss": 0.0009, "reward": 1.7663193941116333, "reward_std": 0.2004338800907135, "rewards/accuracy_reward": 0.647569477558136, "rewards/format_reward": 1.0, "step": 494, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 398.46875, "epoch": 0.01505291327089162, "grad_norm": 0.43603587661087234, "kl": 0.0238037109375, "learning_rate": 9.994410152819901e-07, "loss": 0.001, "reward": 1.6700551509857178, "reward_std": 0.011016301810741425, "rewards/accuracy_reward": 0.5513050556182861, "rewards/format_reward": 1.0, "step": 495, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.078125, "epoch": 0.0150833231966914, "grad_norm": 6.528817867688138, "kl": 0.0238037109375, "learning_rate": 9.994387548995895e-07, "loss": 0.001, "reward": 1.7028368711471558, "reward_std": 0.0879361554980278, "rewards/accuracy_reward": 0.5903368592262268, "rewards/format_reward": 1.0, "step": 496, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 383.515625, "epoch": 0.015113733122491181, "grad_norm": 1.1438669635370577, "kl": 0.0283203125, "learning_rate": 9.99436489958786e-07, "loss": 0.0011, "reward": 1.7709039449691772, "reward_std": 0.08125047385692596, "rewards/accuracy_reward": 0.6240289211273193, "rewards/format_reward": 1.0, "step": 497, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 373.375, "epoch": 0.015144143048290962, "grad_norm": 0.7407721238302988, "kl": 0.02587890625, "learning_rate": 9.994342204596005e-07, "loss": 0.001, "reward": 2.113842248916626, "reward_std": 0.012486360967159271, "rewards/accuracy_reward": 0.9169672131538391, "rewards/format_reward": 1.0, "step": 498, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 391.59375, "epoch": 0.015174552974090743, "grad_norm": 1.7547240765527865, "kl": 0.023193359375, "learning_rate": 9.994319464020535e-07, "loss": 0.0009, "reward": 1.6313037872314453, "reward_std": 0.10591939091682434, "rewards/accuracy_reward": 0.47817879915237427, "rewards/format_reward": 1.0, "step": 499, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 417.078125, "epoch": 0.015204962899890524, "grad_norm": 1.7671884927395058, "kl": 0.0224609375, "learning_rate": 9.994296677861656e-07, "loss": 0.0009, "reward": 1.5905811786651611, "reward_std": 0.28221431374549866, "rewards/accuracy_reward": 0.49995630979537964, "rewards/format_reward": 1.0, "step": 500, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.390625, "epoch": 0.015235372825690305, "grad_norm": 1.0579745722302552, "kl": 0.02099609375, "learning_rate": 9.99427384611958e-07, "loss": 0.0008, "reward": 1.6781747341156006, "reward_std": 0.16024881601333618, "rewards/accuracy_reward": 0.5719246864318848, "rewards/format_reward": 1.0, "step": 501, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 410.296875, "epoch": 0.015265782751490086, "grad_norm": 0.9557962176132624, "kl": 0.026611328125, "learning_rate": 9.99425096879451e-07, "loss": 0.0011, "reward": 1.6614211797714233, "reward_std": 0.08476591855287552, "rewards/accuracy_reward": 0.5457960367202759, "rewards/format_reward": 1.0, "step": 502, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 414.578125, "epoch": 0.015296192677289867, "grad_norm": 2.5270387407852226, "kl": 0.0174560546875, "learning_rate": 9.99422804588666e-07, "loss": 0.0007, "reward": 1.5413379669189453, "reward_std": 0.1443595290184021, "rewards/accuracy_reward": 0.4319629669189453, "rewards/format_reward": 1.0, "step": 503, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 388.6875, "epoch": 0.015326602603089648, "grad_norm": 3.212793562040323, "kl": 0.0262451171875, "learning_rate": 9.994205077396235e-07, "loss": 0.001, "reward": 1.7812501192092896, "reward_std": 0.2039220929145813, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 504, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 382.34375, "epoch": 0.01535701252888943, "grad_norm": 0.8830435979823414, "kl": 0.02197265625, "learning_rate": 9.99418206332345e-07, "loss": 0.0009, "reward": 2.081249952316284, "reward_std": 0.1801677793264389, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 505, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 408.140625, "epoch": 0.015387422454689211, "grad_norm": 1.7651640875101753, "kl": 0.025146484375, "learning_rate": 9.99415900366851e-07, "loss": 0.001, "reward": 1.701768159866333, "reward_std": 0.10204082727432251, "rewards/accuracy_reward": 0.561143159866333, "rewards/format_reward": 1.0, "step": 506, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 387.796875, "epoch": 0.015417832380488992, "grad_norm": 1.2433370198476819, "kl": 0.0286865234375, "learning_rate": 9.994135898431626e-07, "loss": 0.0011, "reward": 1.8230243921279907, "reward_std": 0.030501363798975945, "rewards/accuracy_reward": 0.6698993444442749, "rewards/format_reward": 1.0, "step": 507, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 400.53125, "epoch": 0.015448242306288773, "grad_norm": 1.601411041562304, "kl": 0.0244140625, "learning_rate": 9.994112747613013e-07, "loss": 0.001, "reward": 1.6756925582885742, "reward_std": 0.19744716584682465, "rewards/accuracy_reward": 0.5475676655769348, "rewards/format_reward": 1.0, "step": 508, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.765625, "epoch": 0.015478652232088554, "grad_norm": 1.334225481217231, "kl": 0.033447265625, "learning_rate": 9.994089551212878e-07, "loss": 0.0013, "reward": 1.836044192314148, "reward_std": 0.20293495059013367, "rewards/accuracy_reward": 0.7047942280769348, "rewards/format_reward": 1.0, "step": 509, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 421.140625, "epoch": 0.015509062157888335, "grad_norm": 2.2315658848306748, "kl": 0.0224609375, "learning_rate": 9.994066309231433e-07, "loss": 0.0009, "reward": 1.5848779678344727, "reward_std": 0.1835135668516159, "rewards/accuracy_reward": 0.4786280393600464, "rewards/format_reward": 1.0, "step": 510, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 414.875, "epoch": 0.015539472083688116, "grad_norm": 1.0259196284542582, "kl": 0.02587890625, "learning_rate": 9.994043021668893e-07, "loss": 0.001, "reward": 1.8112499713897705, "reward_std": 0.10919193923473358, "rewards/accuracy_reward": 0.6893749237060547, "rewards/format_reward": 1.0, "step": 511, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 386.5, "epoch": 0.015569882009487897, "grad_norm": 4.604892286988543, "kl": 0.0244140625, "learning_rate": 9.994019688525468e-07, "loss": 0.001, "reward": 1.8986676931381226, "reward_std": 0.18019667267799377, "rewards/accuracy_reward": 0.7330426573753357, "rewards/format_reward": 1.0, "step": 512, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 412.03125, "epoch": 0.015600291935287678, "grad_norm": 0.7396815554950271, "kl": 0.01806640625, "learning_rate": 9.993996309801372e-07, "loss": 0.0007, "reward": 1.6681134700775146, "reward_std": 0.08704882115125656, "rewards/accuracy_reward": 0.5431134700775146, "rewards/format_reward": 1.0, "step": 513, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 360.265625, "epoch": 0.01563070186108746, "grad_norm": 1.6690888405541142, "kl": 0.02978515625, "learning_rate": 9.993972885496818e-07, "loss": 0.0012, "reward": 1.791684627532959, "reward_std": 0.09275756776332855, "rewards/accuracy_reward": 0.6291844844818115, "rewards/format_reward": 1.0, "step": 514, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 379.96875, "epoch": 0.01566111178688724, "grad_norm": 0.9714671847747159, "kl": 0.0245361328125, "learning_rate": 9.99394941561202e-07, "loss": 0.001, "reward": 1.7595596313476562, "reward_std": 0.26944100856781006, "rewards/accuracy_reward": 0.6251845359802246, "rewards/format_reward": 1.0, "step": 515, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.109375, "epoch": 0.01569152171268702, "grad_norm": 0.9952776018834041, "kl": 0.02294921875, "learning_rate": 9.993925900147192e-07, "loss": 0.0009, "reward": 1.7211538553237915, "reward_std": 0.19977575540542603, "rewards/accuracy_reward": 0.611778974533081, "rewards/format_reward": 0.984375, "step": 516, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 366.453125, "epoch": 0.015721931638486803, "grad_norm": 1.1271872799825466, "kl": 0.02392578125, "learning_rate": 9.99390233910255e-07, "loss": 0.001, "reward": 1.9171892404556274, "reward_std": 0.23139682412147522, "rewards/accuracy_reward": 0.7515642642974854, "rewards/format_reward": 1.0, "step": 517, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 407.421875, "epoch": 0.015752341564286582, "grad_norm": 1.4148442953292637, "kl": 0.02197265625, "learning_rate": 9.993878732478307e-07, "loss": 0.0009, "reward": 1.814260482788086, "reward_std": 0.2154199779033661, "rewards/accuracy_reward": 0.6330103874206543, "rewards/format_reward": 1.0, "step": 518, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 379.9375, "epoch": 0.015782751490086365, "grad_norm": 0.9117037230310154, "kl": 0.025390625, "learning_rate": 9.99385508027468e-07, "loss": 0.001, "reward": 1.9541666507720947, "reward_std": 0.09727580100297928, "rewards/accuracy_reward": 0.7979166507720947, "rewards/format_reward": 1.0, "step": 519, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 392.796875, "epoch": 0.015813161415886144, "grad_norm": 1.8840561359969978, "kl": 0.027099609375, "learning_rate": 9.993831382491884e-07, "loss": 0.0011, "reward": 1.9901095628738403, "reward_std": 0.21440814435482025, "rewards/accuracy_reward": 0.8244844675064087, "rewards/format_reward": 1.0, "step": 520, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.15625, "epoch": 0.015843571341685927, "grad_norm": 1.0692597127521983, "kl": 0.02587890625, "learning_rate": 9.993807639130133e-07, "loss": 0.001, "reward": 2.0625, "reward_std": 0.16474656760692596, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 521, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 402.5, "epoch": 0.015873981267485706, "grad_norm": 1.7035038134937677, "kl": 0.022705078125, "learning_rate": 9.99378385018965e-07, "loss": 0.0009, "reward": 1.8963849544525146, "reward_std": 0.3370119035243988, "rewards/accuracy_reward": 0.7557598352432251, "rewards/format_reward": 0.984375, "step": 522, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 387.078125, "epoch": 0.01590439119328549, "grad_norm": 0.780901457286748, "kl": 0.0177001953125, "learning_rate": 9.993760015670644e-07, "loss": 0.0007, "reward": 1.6112170219421387, "reward_std": 0.23193469643592834, "rewards/accuracy_reward": 0.5112169981002808, "rewards/format_reward": 0.984375, "step": 523, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 389.984375, "epoch": 0.015934801119085268, "grad_norm": 0.9625130944234312, "kl": 0.02392578125, "learning_rate": 9.993736135573341e-07, "loss": 0.001, "reward": 1.765625, "reward_std": 0.21148672699928284, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 524, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 383.953125, "epoch": 0.01596521104488505, "grad_norm": 1.1593850326921535, "kl": 0.0274658203125, "learning_rate": 9.993712209897954e-07, "loss": 0.0011, "reward": 1.8734374046325684, "reward_std": 0.1397111713886261, "rewards/accuracy_reward": 0.7296875715255737, "rewards/format_reward": 1.0, "step": 525, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 382.8125, "epoch": 0.015995620970684833, "grad_norm": 0.3328221255135476, "kl": 0.02490234375, "learning_rate": 9.9936882386447e-07, "loss": 0.001, "reward": 2.0218751430511475, "reward_std": 0.05077523738145828, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 526, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 366.40625, "epoch": 0.016026030896484612, "grad_norm": 0.7461185627324085, "kl": 0.03369140625, "learning_rate": 9.993664221813801e-07, "loss": 0.0014, "reward": 1.8531250953674316, "reward_std": 0.11738668382167816, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 527, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 408.703125, "epoch": 0.016056440822284395, "grad_norm": 1.085743044665237, "kl": 0.02197265625, "learning_rate": 9.993640159405476e-07, "loss": 0.0009, "reward": 1.609356164932251, "reward_std": 0.16089139878749847, "rewards/accuracy_reward": 0.49685609340667725, "rewards/format_reward": 1.0, "step": 528, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.40625, "epoch": 0.016086850748084174, "grad_norm": 7.074046320438292, "kl": 0.0244140625, "learning_rate": 9.993616051419943e-07, "loss": 0.001, "reward": 2.0406250953674316, "reward_std": 0.2205134928226471, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 529, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.9375, "epoch": 0.016117260673883957, "grad_norm": 0.9390608563156165, "kl": 0.029052734375, "learning_rate": 9.993591897857423e-07, "loss": 0.0012, "reward": 1.84375, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 530, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 375.03125, "epoch": 0.016147670599683736, "grad_norm": 1.3584662554495432, "kl": 0.0238037109375, "learning_rate": 9.993567698718136e-07, "loss": 0.001, "reward": 1.8705666065216064, "reward_std": 0.03462255001068115, "rewards/accuracy_reward": 0.7111915946006775, "rewards/format_reward": 1.0, "step": 531, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 382.0625, "epoch": 0.01617808052548352, "grad_norm": 1.6917243155579849, "kl": 0.02587890625, "learning_rate": 9.993543454002303e-07, "loss": 0.001, "reward": 1.7641172409057617, "reward_std": 0.09114737808704376, "rewards/accuracy_reward": 0.635992169380188, "rewards/format_reward": 1.0, "step": 532, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.84375, "epoch": 0.016208490451283298, "grad_norm": 1.1409825426174645, "kl": 0.0242919921875, "learning_rate": 9.993519163710146e-07, "loss": 0.001, "reward": 1.9512312412261963, "reward_std": 0.1779123693704605, "rewards/accuracy_reward": 0.7918561697006226, "rewards/format_reward": 1.0, "step": 533, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 363.59375, "epoch": 0.01623890037708308, "grad_norm": 2.974921497398216, "kl": 0.021484375, "learning_rate": 9.993494827841886e-07, "loss": 0.0009, "reward": 1.935035228729248, "reward_std": 0.08776319026947021, "rewards/accuracy_reward": 0.769410252571106, "rewards/format_reward": 1.0, "step": 534, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 353.0625, "epoch": 0.01626931030288286, "grad_norm": 5.620758565555608, "kl": 0.0279541015625, "learning_rate": 9.993470446397744e-07, "loss": 0.0011, "reward": 2.1281251907348633, "reward_std": 0.11572164297103882, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 535, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 378.5, "epoch": 0.016299720228682642, "grad_norm": 1.0278039332585507, "kl": 0.0244140625, "learning_rate": 9.993446019377947e-07, "loss": 0.001, "reward": 1.8778808116912842, "reward_std": 0.016999786719679832, "rewards/accuracy_reward": 0.7060058116912842, "rewards/format_reward": 1.0, "step": 536, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 444.421875, "epoch": 0.01633013015448242, "grad_norm": 6.56766491234196, "kl": 0.0167236328125, "learning_rate": 9.99342154678271e-07, "loss": 0.0007, "reward": 1.6468727588653564, "reward_std": 0.2497006356716156, "rewards/accuracy_reward": 0.5624977350234985, "rewards/format_reward": 0.96875, "step": 537, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.765625, "epoch": 0.016360540080282204, "grad_norm": 0.9944108109794813, "kl": 0.0277099609375, "learning_rate": 9.993397028612264e-07, "loss": 0.0011, "reward": 1.8656251430511475, "reward_std": 0.17158037424087524, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 538, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 390.5, "epoch": 0.016390950006081983, "grad_norm": 1.1496669474447838, "kl": 0.021240234375, "learning_rate": 9.99337246486683e-07, "loss": 0.0009, "reward": 1.96875, "reward_std": 0.12482766062021255, "rewards/accuracy_reward": 0.7999999523162842, "rewards/format_reward": 1.0, "step": 539, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 396.203125, "epoch": 0.016421359931881766, "grad_norm": 1.8325948958130853, "kl": 0.0211181640625, "learning_rate": 9.993347855546632e-07, "loss": 0.0008, "reward": 1.988114833831787, "reward_std": 0.1748911738395691, "rewards/accuracy_reward": 0.8256146907806396, "rewards/format_reward": 1.0, "step": 540, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 411.5, "epoch": 0.01645176985768155, "grad_norm": 4.3836195932159026, "kl": 0.0201416015625, "learning_rate": 9.993323200651894e-07, "loss": 0.0008, "reward": 1.5890107154846191, "reward_std": 0.01648581773042679, "rewards/accuracy_reward": 0.46401065587997437, "rewards/format_reward": 1.0, "step": 541, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.15625, "epoch": 0.016482179783481328, "grad_norm": 2.2965032825537826, "kl": 0.0296630859375, "learning_rate": 9.99329850018284e-07, "loss": 0.0012, "reward": 1.887526512145996, "reward_std": 0.13017582893371582, "rewards/accuracy_reward": 0.7406514883041382, "rewards/format_reward": 1.0, "step": 542, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.734375, "epoch": 0.01651258970928111, "grad_norm": 1.3938738879017516, "kl": 0.0255126953125, "learning_rate": 9.9932737541397e-07, "loss": 0.001, "reward": 1.8326916694641113, "reward_std": 0.1102554127573967, "rewards/accuracy_reward": 0.6639417409896851, "rewards/format_reward": 1.0, "step": 543, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.0625, "epoch": 0.01654299963508089, "grad_norm": 1.3633646897373013, "kl": 0.0174560546875, "learning_rate": 9.993248962522695e-07, "loss": 0.0007, "reward": 1.8517284393310547, "reward_std": 0.11528775095939636, "rewards/accuracy_reward": 0.7017285227775574, "rewards/format_reward": 1.0, "step": 544, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.125, "epoch": 0.016573409560880673, "grad_norm": 1.032095326324485, "kl": 0.023681640625, "learning_rate": 9.993224125332056e-07, "loss": 0.0009, "reward": 1.9763847589492798, "reward_std": 0.10624010860919952, "rewards/accuracy_reward": 0.8107597827911377, "rewards/format_reward": 1.0, "step": 545, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 380.0625, "epoch": 0.01660381948668045, "grad_norm": 1.2163105023389043, "kl": 0.02734375, "learning_rate": 9.993199242568005e-07, "loss": 0.0011, "reward": 1.9005695581436157, "reward_std": 0.09277185052633286, "rewards/accuracy_reward": 0.7474446296691895, "rewards/format_reward": 1.0, "step": 546, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.328125, "epoch": 0.016634229412480234, "grad_norm": 1.5977639445846574, "kl": 0.0299072265625, "learning_rate": 9.99317431423077e-07, "loss": 0.0012, "reward": 1.9183369874954224, "reward_std": 0.1112891435623169, "rewards/accuracy_reward": 0.7464619278907776, "rewards/format_reward": 1.0, "step": 547, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 401.28125, "epoch": 0.016664639338280014, "grad_norm": 0.8467991021030187, "kl": 0.0218505859375, "learning_rate": 9.993149340320582e-07, "loss": 0.0009, "reward": 1.6057817935943604, "reward_std": 0.09839224815368652, "rewards/accuracy_reward": 0.4807818531990051, "rewards/format_reward": 1.0, "step": 548, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 372.84375, "epoch": 0.016695049264079796, "grad_norm": 0.8357612606352839, "kl": 0.0255126953125, "learning_rate": 9.993124320837664e-07, "loss": 0.001, "reward": 1.8343751430511475, "reward_std": 0.09804397076368332, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 549, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.5625, "epoch": 0.016725459189879575, "grad_norm": 1.407874959884774, "kl": 0.0201416015625, "learning_rate": 9.993099255782247e-07, "loss": 0.0008, "reward": 1.9874300956726074, "reward_std": 0.053691502660512924, "rewards/accuracy_reward": 0.8155549764633179, "rewards/format_reward": 1.0, "step": 550, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 413.171875, "epoch": 0.016755869115679358, "grad_norm": 1.2613798808352086, "kl": 0.0186767578125, "learning_rate": 9.993074145154563e-07, "loss": 0.0007, "reward": 1.7065178155899048, "reward_std": 0.10608058422803879, "rewards/accuracy_reward": 0.5658928155899048, "rewards/format_reward": 1.0, "step": 551, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 385.9375, "epoch": 0.016786279041479137, "grad_norm": 1.323970138211582, "kl": 0.0272216796875, "learning_rate": 9.993048988954833e-07, "loss": 0.0011, "reward": 2.006477117538452, "reward_std": 0.043110501021146774, "rewards/accuracy_reward": 0.8533520698547363, "rewards/format_reward": 1.0, "step": 552, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 395.234375, "epoch": 0.01681668896727892, "grad_norm": 1.1793579264954002, "kl": 0.02783203125, "learning_rate": 9.993023787183295e-07, "loss": 0.0011, "reward": 1.9462419748306274, "reward_std": 0.07720139622688293, "rewards/accuracy_reward": 0.7837419509887695, "rewards/format_reward": 1.0, "step": 553, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 365.03125, "epoch": 0.016847098893078703, "grad_norm": 0.8565651825875189, "kl": 0.027099609375, "learning_rate": 9.992998539840173e-07, "loss": 0.0011, "reward": 1.7223215103149414, "reward_std": 0.185908704996109, "rewards/accuracy_reward": 0.6004464626312256, "rewards/format_reward": 1.0, "step": 554, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.5625, "epoch": 0.016877508818878482, "grad_norm": 1.3027761956048955, "kl": 0.0196533203125, "learning_rate": 9.992973246925703e-07, "loss": 0.0008, "reward": 1.6851897239685059, "reward_std": 0.28823140263557434, "rewards/accuracy_reward": 0.54768967628479, "rewards/format_reward": 1.0, "step": 555, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 365.421875, "epoch": 0.016907918744678264, "grad_norm": 1.3601459447578725, "kl": 0.026611328125, "learning_rate": 9.99294790844011e-07, "loss": 0.0011, "reward": 1.6862847805023193, "reward_std": 0.08702725172042847, "rewards/accuracy_reward": 0.5581597089767456, "rewards/format_reward": 1.0, "step": 556, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 400.640625, "epoch": 0.016938328670478044, "grad_norm": 1.2224024179890598, "kl": 0.022705078125, "learning_rate": 9.992922524383628e-07, "loss": 0.0009, "reward": 1.5042027235031128, "reward_std": 0.2732207775115967, "rewards/accuracy_reward": 0.41045260429382324, "rewards/format_reward": 1.0, "step": 557, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 392.28125, "epoch": 0.016968738596277826, "grad_norm": 0.7617721605575297, "kl": 0.0205078125, "learning_rate": 9.992897094756492e-07, "loss": 0.0008, "reward": 1.9370760917663574, "reward_std": 0.15735886991024017, "rewards/accuracy_reward": 0.7745760679244995, "rewards/format_reward": 1.0, "step": 558, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.78125, "epoch": 0.016999148522077605, "grad_norm": 1.3418875652939304, "kl": 0.023681640625, "learning_rate": 9.992871619558926e-07, "loss": 0.0009, "reward": 2.1012680530548096, "reward_std": 0.10392493009567261, "rewards/accuracy_reward": 0.9168931245803833, "rewards/format_reward": 1.0, "step": 559, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 347.984375, "epoch": 0.017029558447877388, "grad_norm": 1.847504137402203, "kl": 0.029296875, "learning_rate": 9.992846098791172e-07, "loss": 0.0012, "reward": 1.8774423599243164, "reward_std": 0.2313249707221985, "rewards/accuracy_reward": 0.7211923599243164, "rewards/format_reward": 1.0, "step": 560, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 384.15625, "epoch": 0.017059968373677167, "grad_norm": 5.146573220678828, "kl": 0.02490234375, "learning_rate": 9.992820532453453e-07, "loss": 0.001, "reward": 1.8488876819610596, "reward_std": 0.13339930772781372, "rewards/accuracy_reward": 0.7145127058029175, "rewards/format_reward": 1.0, "step": 561, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.203125, "epoch": 0.01709037829947695, "grad_norm": 1.2601285290402524, "kl": 0.0250244140625, "learning_rate": 9.992794920546013e-07, "loss": 0.001, "reward": 1.9891787767410278, "reward_std": 0.0844581127166748, "rewards/accuracy_reward": 0.8298037052154541, "rewards/format_reward": 1.0, "step": 562, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 361.28125, "epoch": 0.01712078822527673, "grad_norm": 1.297508343100705, "kl": 0.0255126953125, "learning_rate": 9.992769263069076e-07, "loss": 0.001, "reward": 1.6500000953674316, "reward_std": 0.09531005471944809, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 563, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 418.65625, "epoch": 0.017151198151076512, "grad_norm": 1.13360117142651, "kl": 0.024658203125, "learning_rate": 9.992743560022883e-07, "loss": 0.001, "reward": 1.8644129037857056, "reward_std": 0.17236268520355225, "rewards/accuracy_reward": 0.7237879037857056, "rewards/format_reward": 1.0, "step": 564, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.34375, "epoch": 0.01718160807687629, "grad_norm": 0.7635775165151959, "kl": 0.0267333984375, "learning_rate": 9.992717811407664e-07, "loss": 0.0011, "reward": 1.8516591787338257, "reward_std": 0.012908020056784153, "rewards/accuracy_reward": 0.6985341310501099, "rewards/format_reward": 1.0, "step": 565, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 365.53125, "epoch": 0.017212018002676074, "grad_norm": 0.6447127086817688, "kl": 0.0244140625, "learning_rate": 9.992692017223659e-07, "loss": 0.001, "reward": 1.8047688007354736, "reward_std": 0.017000392079353333, "rewards/accuracy_reward": 0.657893717288971, "rewards/format_reward": 1.0, "step": 566, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 401.953125, "epoch": 0.017242427928475853, "grad_norm": 1.5188734796293817, "kl": 0.0228271484375, "learning_rate": 9.992666177471098e-07, "loss": 0.0009, "reward": 1.7812390327453613, "reward_std": 0.1712251752614975, "rewards/accuracy_reward": 0.6343639492988586, "rewards/format_reward": 1.0, "step": 567, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 369.734375, "epoch": 0.017272837854275636, "grad_norm": 1.201776688689362, "kl": 0.028076171875, "learning_rate": 9.99264029215022e-07, "loss": 0.0011, "reward": 1.91015625, "reward_std": 0.08285161852836609, "rewards/accuracy_reward": 0.757031261920929, "rewards/format_reward": 1.0, "step": 568, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 388.75, "epoch": 0.017303247780075418, "grad_norm": 3.863676781286814, "kl": 0.0269775390625, "learning_rate": 9.992614361261262e-07, "loss": 0.0011, "reward": 1.845820426940918, "reward_std": 0.10071505606174469, "rewards/accuracy_reward": 0.7083203792572021, "rewards/format_reward": 1.0, "step": 569, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 377.28125, "epoch": 0.017333657705875197, "grad_norm": 1.9407281459184562, "kl": 0.0302734375, "learning_rate": 9.992588384804456e-07, "loss": 0.0012, "reward": 2.1265625953674316, "reward_std": 0.12014105916023254, "rewards/accuracy_reward": 0.9390624761581421, "rewards/format_reward": 1.0, "step": 570, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 403.953125, "epoch": 0.01736406763167498, "grad_norm": 1.2471813411969082, "kl": 0.0289306640625, "learning_rate": 9.992562362780047e-07, "loss": 0.0012, "reward": 1.7252793312072754, "reward_std": 0.12868757545948029, "rewards/accuracy_reward": 0.6065292358398438, "rewards/format_reward": 1.0, "step": 571, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 387.25, "epoch": 0.01739447755747476, "grad_norm": 1.044655844615009, "kl": 0.0274658203125, "learning_rate": 9.992536295188265e-07, "loss": 0.0011, "reward": 1.7438277006149292, "reward_std": 0.22789573669433594, "rewards/accuracy_reward": 0.6188276410102844, "rewards/format_reward": 1.0, "step": 572, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.4375, "epoch": 0.017424887483274542, "grad_norm": 1.7835064565347276, "kl": 0.0228271484375, "learning_rate": 9.99251018202935e-07, "loss": 0.0009, "reward": 1.7491974830627441, "reward_std": 0.1960187554359436, "rewards/accuracy_reward": 0.6085724234580994, "rewards/format_reward": 1.0, "step": 573, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 402.25, "epoch": 0.01745529740907432, "grad_norm": 0.6861679592366509, "kl": 0.020751953125, "learning_rate": 9.992484023303542e-07, "loss": 0.0008, "reward": 1.6845179796218872, "reward_std": 0.15099084377288818, "rewards/accuracy_reward": 0.5626429319381714, "rewards/format_reward": 1.0, "step": 574, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 404.5, "epoch": 0.017485707334874104, "grad_norm": 0.8191226747115254, "kl": 0.026123046875, "learning_rate": 9.99245781901108e-07, "loss": 0.001, "reward": 1.9397214651107788, "reward_std": 0.07836076617240906, "rewards/accuracy_reward": 0.7803462743759155, "rewards/format_reward": 1.0, "step": 575, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 383.8125, "epoch": 0.017516117260673883, "grad_norm": 1.2632142879600834, "kl": 0.02587890625, "learning_rate": 9.992431569152201e-07, "loss": 0.001, "reward": 1.7038289308547974, "reward_std": 0.13528932631015778, "rewards/accuracy_reward": 0.5725789666175842, "rewards/format_reward": 1.0, "step": 576, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 368.359375, "epoch": 0.017546527186473666, "grad_norm": 1.320968696932847, "kl": 0.02978515625, "learning_rate": 9.992405273727147e-07, "loss": 0.0012, "reward": 1.8704546689987183, "reward_std": 0.16585060954093933, "rewards/accuracy_reward": 0.7329545617103577, "rewards/format_reward": 1.0, "step": 577, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.234375, "epoch": 0.017576937112273445, "grad_norm": 1.0909852075508653, "kl": 0.022705078125, "learning_rate": 9.992378932736154e-07, "loss": 0.0009, "reward": 1.6427083015441895, "reward_std": 0.11842279136180878, "rewards/accuracy_reward": 0.5364583730697632, "rewards/format_reward": 1.0, "step": 578, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 358.265625, "epoch": 0.017607347038073227, "grad_norm": 0.6798318982385767, "kl": 0.031982421875, "learning_rate": 9.992352546179466e-07, "loss": 0.0013, "reward": 2.043750047683716, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 579, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.609375, "epoch": 0.017637756963873007, "grad_norm": 0.9814548381346477, "kl": 0.025634765625, "learning_rate": 9.992326114057325e-07, "loss": 0.001, "reward": 2.1030664443969727, "reward_std": 0.06711506098508835, "rewards/accuracy_reward": 0.9093164801597595, "rewards/format_reward": 1.0, "step": 580, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 353.421875, "epoch": 0.01766816688967279, "grad_norm": 1.7524928879387054, "kl": 0.030517578125, "learning_rate": 9.992299636369968e-07, "loss": 0.0012, "reward": 1.9520833492279053, "reward_std": 0.22887161374092102, "rewards/accuracy_reward": 0.7864583730697632, "rewards/format_reward": 1.0, "step": 581, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 397.109375, "epoch": 0.01769857681547257, "grad_norm": 1.8441756014952742, "kl": 0.0238037109375, "learning_rate": 9.99227311311764e-07, "loss": 0.001, "reward": 1.8849236965179443, "reward_std": 0.09047336131334305, "rewards/accuracy_reward": 0.7286736965179443, "rewards/format_reward": 1.0, "step": 582, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 357.21875, "epoch": 0.01772898674127235, "grad_norm": 0.7585881955602598, "kl": 0.033203125, "learning_rate": 9.99224654430058e-07, "loss": 0.0013, "reward": 1.8382327556610107, "reward_std": 0.03417164087295532, "rewards/accuracy_reward": 0.6913577318191528, "rewards/format_reward": 1.0, "step": 583, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 390.90625, "epoch": 0.017759396667072134, "grad_norm": 1.4220417144545328, "kl": 0.028564453125, "learning_rate": 9.992219929919034e-07, "loss": 0.0011, "reward": 1.718446135520935, "reward_std": 0.09436649829149246, "rewards/accuracy_reward": 0.5903211832046509, "rewards/format_reward": 1.0, "step": 584, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.328125, "epoch": 0.017789806592871913, "grad_norm": 1.1784537671132156, "kl": 0.028564453125, "learning_rate": 9.992193269973245e-07, "loss": 0.0011, "reward": 1.747064471244812, "reward_std": 0.17472191154956818, "rewards/accuracy_reward": 0.6251895427703857, "rewards/format_reward": 1.0, "step": 585, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 365.203125, "epoch": 0.017820216518671696, "grad_norm": 1.2275023818182955, "kl": 0.036865234375, "learning_rate": 9.99216656446345e-07, "loss": 0.0015, "reward": 1.877621054649353, "reward_std": 0.23138678073883057, "rewards/accuracy_reward": 0.7182459831237793, "rewards/format_reward": 1.0, "step": 586, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 366.140625, "epoch": 0.017850626444471475, "grad_norm": 0.8485411730675455, "kl": 0.043212890625, "learning_rate": 9.9921398133899e-07, "loss": 0.0017, "reward": 1.920138955116272, "reward_std": 0.10352161526679993, "rewards/accuracy_reward": 0.7638888955116272, "rewards/format_reward": 1.0, "step": 587, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.171875, "epoch": 0.017881036370271258, "grad_norm": 0.9688169079096972, "kl": 0.0419921875, "learning_rate": 9.992113016752836e-07, "loss": 0.0017, "reward": 2.0319104194641113, "reward_std": 0.10207494348287582, "rewards/accuracy_reward": 0.8475353717803955, "rewards/format_reward": 1.0, "step": 588, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 380.109375, "epoch": 0.017911446296071037, "grad_norm": 2.049074380961269, "kl": 0.044921875, "learning_rate": 9.992086174552505e-07, "loss": 0.0018, "reward": 1.8737730979919434, "reward_std": 0.12766709923744202, "rewards/accuracy_reward": 0.7175230383872986, "rewards/format_reward": 1.0, "step": 589, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.609375, "epoch": 0.01794185622187082, "grad_norm": 1.079514699091543, "kl": 0.044921875, "learning_rate": 9.992059286789147e-07, "loss": 0.0018, "reward": 1.8300812244415283, "reward_std": 0.1272916942834854, "rewards/accuracy_reward": 0.7175811529159546, "rewards/format_reward": 0.9375, "step": 590, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 362.0625, "epoch": 0.0179722661476706, "grad_norm": 1.184987882050577, "kl": 0.05029296875, "learning_rate": 9.992032353463013e-07, "loss": 0.002, "reward": 2.0025362968444824, "reward_std": 0.09839466959238052, "rewards/accuracy_reward": 0.8306612372398376, "rewards/format_reward": 1.0, "step": 591, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 363.546875, "epoch": 0.01800267607347038, "grad_norm": 1.4697981828305116, "kl": 0.041015625, "learning_rate": 9.992005374574345e-07, "loss": 0.0016, "reward": 1.752118468284607, "reward_std": 0.23267632722854614, "rewards/accuracy_reward": 0.6052433252334595, "rewards/format_reward": 1.0, "step": 592, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 350.546875, "epoch": 0.01803308599927016, "grad_norm": 1.127629496102784, "kl": 0.049560546875, "learning_rate": 9.991978350123391e-07, "loss": 0.002, "reward": 2.040572166442871, "reward_std": 0.09483858942985535, "rewards/accuracy_reward": 0.871821939945221, "rewards/format_reward": 1.0, "step": 593, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 375.453125, "epoch": 0.018063495925069943, "grad_norm": 1.0819872217855888, "kl": 0.043701171875, "learning_rate": 9.991951280110396e-07, "loss": 0.0018, "reward": 1.5016298294067383, "reward_std": 0.09979674220085144, "rewards/accuracy_reward": 0.40787991881370544, "rewards/format_reward": 1.0, "step": 594, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 361.296875, "epoch": 0.018093905850869722, "grad_norm": 2.299313232393947, "kl": 0.048095703125, "learning_rate": 9.99192416453561e-07, "loss": 0.0019, "reward": 2.0348854064941406, "reward_std": 0.04809385538101196, "rewards/accuracy_reward": 0.8536352515220642, "rewards/format_reward": 1.0, "step": 595, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.90625, "epoch": 0.018124315776669505, "grad_norm": 1.7685658559941535, "kl": 0.046142578125, "learning_rate": 9.991897003399279e-07, "loss": 0.0018, "reward": 1.8747129440307617, "reward_std": 0.2227209508419037, "rewards/accuracy_reward": 0.7215878963470459, "rewards/format_reward": 1.0, "step": 596, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.125, "epoch": 0.018154725702469288, "grad_norm": 2.0385906210110503, "kl": 0.051513671875, "learning_rate": 9.991869796701648e-07, "loss": 0.0021, "reward": 1.8681142330169678, "reward_std": 0.04913956671953201, "rewards/accuracy_reward": 0.708739161491394, "rewards/format_reward": 1.0, "step": 597, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 402.71875, "epoch": 0.018185135628269067, "grad_norm": 1.042504465270702, "kl": 0.0419921875, "learning_rate": 9.99184254444297e-07, "loss": 0.0017, "reward": 1.8252263069152832, "reward_std": 0.15027400851249695, "rewards/accuracy_reward": 0.6783512830734253, "rewards/format_reward": 1.0, "step": 598, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.859375, "epoch": 0.01821554555406885, "grad_norm": 1.290359576104531, "kl": 0.048095703125, "learning_rate": 9.991815246623493e-07, "loss": 0.0019, "reward": 1.8686763048171997, "reward_std": 0.13960713148117065, "rewards/accuracy_reward": 0.706176221370697, "rewards/format_reward": 1.0, "step": 599, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 413.21875, "epoch": 0.01824595547986863, "grad_norm": 0.9688752544396189, "kl": 0.038330078125, "learning_rate": 9.991787903243463e-07, "loss": 0.0015, "reward": 1.536142349243164, "reward_std": 0.01596335880458355, "rewards/accuracy_reward": 0.4236423671245575, "rewards/format_reward": 1.0, "step": 600, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 370.296875, "epoch": 0.01827636540566841, "grad_norm": 1.3873465516859986, "kl": 0.04443359375, "learning_rate": 9.991760514303132e-07, "loss": 0.0018, "reward": 1.8008612394332886, "reward_std": 0.2821829617023468, "rewards/accuracy_reward": 0.660236120223999, "rewards/format_reward": 1.0, "step": 601, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 404.25, "epoch": 0.01830677533146819, "grad_norm": 0.9322885161877308, "kl": 0.0294189453125, "learning_rate": 9.991733079802748e-07, "loss": 0.0012, "reward": 1.9062501192092896, "reward_std": 0.2160109579563141, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 602, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 396.359375, "epoch": 0.018337185257267973, "grad_norm": 1.2546804699904273, "kl": 0.0308837890625, "learning_rate": 9.991705599742564e-07, "loss": 0.0012, "reward": 1.4027683734893799, "reward_std": 0.24704982340335846, "rewards/accuracy_reward": 0.3371432423591614, "rewards/format_reward": 0.984375, "step": 603, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 403.15625, "epoch": 0.018367595183067752, "grad_norm": 0.959492014454973, "kl": 0.033203125, "learning_rate": 9.991678074122828e-07, "loss": 0.0013, "reward": 2.008178234100342, "reward_std": 0.027123089879751205, "rewards/accuracy_reward": 0.8269281387329102, "rewards/format_reward": 1.0, "step": 604, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 387.0625, "epoch": 0.018398005108867535, "grad_norm": 1.1539315805318402, "kl": 0.0240478515625, "learning_rate": 9.991650502943795e-07, "loss": 0.001, "reward": 1.9461710453033447, "reward_std": 0.07115110754966736, "rewards/accuracy_reward": 0.7680459022521973, "rewards/format_reward": 1.0, "step": 605, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 395.578125, "epoch": 0.018428415034667314, "grad_norm": 0.6221506136131344, "kl": 0.0252685546875, "learning_rate": 9.991622886205714e-07, "loss": 0.001, "reward": 1.8969091176986694, "reward_std": 0.08935505896806717, "rewards/accuracy_reward": 0.7344090938568115, "rewards/format_reward": 1.0, "step": 606, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 387.0, "epoch": 0.018458824960467097, "grad_norm": 1.4029656502284182, "kl": 0.029541015625, "learning_rate": 9.991595223908837e-07, "loss": 0.0012, "reward": 1.8726065158843994, "reward_std": 0.13873696327209473, "rewards/accuracy_reward": 0.7132315635681152, "rewards/format_reward": 1.0, "step": 607, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 381.609375, "epoch": 0.018489234886266876, "grad_norm": 0.946494106311795, "kl": 0.03466796875, "learning_rate": 9.991567516053417e-07, "loss": 0.0014, "reward": 1.7780226469039917, "reward_std": 0.042960405349731445, "rewards/accuracy_reward": 0.6342726349830627, "rewards/format_reward": 1.0, "step": 608, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 390.015625, "epoch": 0.01851964481206666, "grad_norm": 1.037715430840855, "kl": 0.031982421875, "learning_rate": 9.991539762639706e-07, "loss": 0.0013, "reward": 1.4132628440856934, "reward_std": 0.11848343908786774, "rewards/accuracy_reward": 0.34138786792755127, "rewards/format_reward": 1.0, "step": 609, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 369.578125, "epoch": 0.018550054737866438, "grad_norm": 0.7840780869029643, "kl": 0.0283203125, "learning_rate": 9.99151196366796e-07, "loss": 0.0011, "reward": 1.8250000476837158, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 610, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 387.046875, "epoch": 0.01858046466366622, "grad_norm": 1.0261828832554214, "kl": 0.03173828125, "learning_rate": 9.991484119138432e-07, "loss": 0.0013, "reward": 1.7433415651321411, "reward_std": 0.1010158360004425, "rewards/accuracy_reward": 0.5995915532112122, "rewards/format_reward": 1.0, "step": 611, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 379.671875, "epoch": 0.018610874589466003, "grad_norm": 0.9730297737043242, "kl": 0.035400390625, "learning_rate": 9.99145622905137e-07, "loss": 0.0014, "reward": 2.0426056385040283, "reward_std": 0.024341296404600143, "rewards/accuracy_reward": 0.8769806623458862, "rewards/format_reward": 1.0, "step": 612, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 408.453125, "epoch": 0.018641284515265782, "grad_norm": 1.9103427318156068, "kl": 0.0233154296875, "learning_rate": 9.991428293407039e-07, "loss": 0.0009, "reward": 1.6917380094528198, "reward_std": 0.13564084470272064, "rewards/accuracy_reward": 0.5792379379272461, "rewards/format_reward": 1.0, "step": 613, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 388.0625, "epoch": 0.018671694441065565, "grad_norm": 0.9620525986388728, "kl": 0.032470703125, "learning_rate": 9.991400312205686e-07, "loss": 0.0013, "reward": 1.8695738315582275, "reward_std": 0.030544929206371307, "rewards/accuracy_reward": 0.7101988196372986, "rewards/format_reward": 1.0, "step": 614, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 397.671875, "epoch": 0.018702104366865344, "grad_norm": 1.2947269515086028, "kl": 0.0289306640625, "learning_rate": 9.99137228544757e-07, "loss": 0.0012, "reward": 1.5723220109939575, "reward_std": 0.14814886450767517, "rewards/accuracy_reward": 0.4660719037055969, "rewards/format_reward": 1.0, "step": 615, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 397.28125, "epoch": 0.018732514292665127, "grad_norm": 1.0433791479046082, "kl": 0.0279541015625, "learning_rate": 9.991344213132945e-07, "loss": 0.0011, "reward": 1.583266258239746, "reward_std": 0.03504175692796707, "rewards/accuracy_reward": 0.47701627016067505, "rewards/format_reward": 1.0, "step": 616, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 365.15625, "epoch": 0.018762924218464906, "grad_norm": 2.5383366534843463, "kl": 0.02587890625, "learning_rate": 9.991316095262068e-07, "loss": 0.001, "reward": 1.7088302373886108, "reward_std": 0.076881043612957, "rewards/accuracy_reward": 0.5650802850723267, "rewards/format_reward": 1.0, "step": 617, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 395.234375, "epoch": 0.01879333414426469, "grad_norm": 1.3724658192691543, "kl": 0.0240478515625, "learning_rate": 9.991287931835197e-07, "loss": 0.001, "reward": 1.4778103828430176, "reward_std": 0.25440776348114014, "rewards/accuracy_reward": 0.36218535900115967, "rewards/format_reward": 1.0, "step": 618, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 388.8125, "epoch": 0.018823744070064468, "grad_norm": 0.7453644057197045, "kl": 0.0244140625, "learning_rate": 9.991259722852585e-07, "loss": 0.001, "reward": 1.6281249523162842, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 619, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 406.125, "epoch": 0.01885415399586425, "grad_norm": 1.2339587864139192, "kl": 0.02294921875, "learning_rate": 9.991231468314494e-07, "loss": 0.0009, "reward": 2.0086283683776855, "reward_std": 0.12564092874526978, "rewards/accuracy_reward": 0.836753249168396, "rewards/format_reward": 1.0, "step": 620, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 364.390625, "epoch": 0.01888456392166403, "grad_norm": 0.95049834766344, "kl": 0.022216796875, "learning_rate": 9.99120316822118e-07, "loss": 0.0009, "reward": 1.5004464387893677, "reward_std": 0.2071518450975418, "rewards/accuracy_reward": 0.4129464626312256, "rewards/format_reward": 1.0, "step": 621, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.734375, "epoch": 0.018914973847463813, "grad_norm": 1.2009901201034807, "kl": 0.0311279296875, "learning_rate": 9.9911748225729e-07, "loss": 0.0012, "reward": 1.916428804397583, "reward_std": 0.11666810512542725, "rewards/accuracy_reward": 0.750803530216217, "rewards/format_reward": 1.0, "step": 622, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.90625, "epoch": 0.01894538377326359, "grad_norm": 1.3025178576870173, "kl": 0.0240478515625, "learning_rate": 9.991146431369917e-07, "loss": 0.001, "reward": 1.8494185209274292, "reward_std": 0.09329584240913391, "rewards/accuracy_reward": 0.6994185447692871, "rewards/format_reward": 1.0, "step": 623, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 359.1875, "epoch": 0.018975793699063374, "grad_norm": 1.0312548818320586, "kl": 0.02587890625, "learning_rate": 9.991117994612484e-07, "loss": 0.001, "reward": 1.743749976158142, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 624, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 355.953125, "epoch": 0.019006203624863157, "grad_norm": 1.586765348164984, "kl": 0.0274658203125, "learning_rate": 9.991089512300866e-07, "loss": 0.0011, "reward": 1.4598374366760254, "reward_std": 0.21847301721572876, "rewards/accuracy_reward": 0.387962281703949, "rewards/format_reward": 1.0, "step": 625, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 363.765625, "epoch": 0.019036613550662936, "grad_norm": 1.6605294163803124, "kl": 0.02587890625, "learning_rate": 9.991060984435318e-07, "loss": 0.001, "reward": 1.5627896785736084, "reward_std": 0.03303666040301323, "rewards/accuracy_reward": 0.46591466665267944, "rewards/format_reward": 1.0, "step": 626, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 363.453125, "epoch": 0.01906702347646272, "grad_norm": 0.6995419795539591, "kl": 0.02587890625, "learning_rate": 9.991032411016104e-07, "loss": 0.001, "reward": 1.896875023841858, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 627, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.828125, "epoch": 0.019097433402262498, "grad_norm": 1.4560804773452942, "kl": 0.02490234375, "learning_rate": 9.991003792043487e-07, "loss": 0.001, "reward": 1.8818953037261963, "reward_std": 0.11065559089183807, "rewards/accuracy_reward": 0.700645387172699, "rewards/format_reward": 1.0, "step": 628, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 398.65625, "epoch": 0.01912784332806228, "grad_norm": 1.208267871406542, "kl": 0.0213623046875, "learning_rate": 9.990975127517722e-07, "loss": 0.0009, "reward": 1.6488420963287354, "reward_std": 0.22068798542022705, "rewards/accuracy_reward": 0.5269670486450195, "rewards/format_reward": 1.0, "step": 629, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 387.65625, "epoch": 0.01915825325386206, "grad_norm": 1.5769873858782995, "kl": 0.0223388671875, "learning_rate": 9.990946417439074e-07, "loss": 0.0009, "reward": 1.6526292562484741, "reward_std": 0.17980273067951202, "rewards/accuracy_reward": 0.5338792204856873, "rewards/format_reward": 1.0, "step": 630, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 372.796875, "epoch": 0.019188663179661843, "grad_norm": 2.0264850387179063, "kl": 0.025146484375, "learning_rate": 9.990917661807804e-07, "loss": 0.001, "reward": 1.82393217086792, "reward_std": 0.2524424195289612, "rewards/accuracy_reward": 0.6833070516586304, "rewards/format_reward": 1.0, "step": 631, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 390.421875, "epoch": 0.019219073105461622, "grad_norm": 1.258693282315969, "kl": 0.02294921875, "learning_rate": 9.990888860624177e-07, "loss": 0.0009, "reward": 1.90625, "reward_std": 0.15655069053173065, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 632, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 362.28125, "epoch": 0.019249483031261404, "grad_norm": 1.5277042477361833, "kl": 0.0281982421875, "learning_rate": 9.990860013888451e-07, "loss": 0.0011, "reward": 1.8355580568313599, "reward_std": 0.27930980920791626, "rewards/accuracy_reward": 0.6980580687522888, "rewards/format_reward": 1.0, "step": 633, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 384.796875, "epoch": 0.019279892957061184, "grad_norm": 2.950728724003849, "kl": 0.02685546875, "learning_rate": 9.990831121600895e-07, "loss": 0.0011, "reward": 1.9457731246948242, "reward_std": 0.11338792741298676, "rewards/accuracy_reward": 0.7707730531692505, "rewards/format_reward": 1.0, "step": 634, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 358.8125, "epoch": 0.019310302882860966, "grad_norm": 1.5388133363613137, "kl": 0.0283203125, "learning_rate": 9.99080218376177e-07, "loss": 0.0011, "reward": 1.7218749523162842, "reward_std": 0.17485956847667694, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 635, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.390625, "epoch": 0.019340712808660745, "grad_norm": 2.165420667802019, "kl": 0.025146484375, "learning_rate": 9.99077320037134e-07, "loss": 0.001, "reward": 1.8524532318115234, "reward_std": 0.13530555367469788, "rewards/accuracy_reward": 0.6774532198905945, "rewards/format_reward": 1.0, "step": 636, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.015625, "epoch": 0.019371122734460528, "grad_norm": 1.24873374676275, "kl": 0.0250244140625, "learning_rate": 9.990744171429867e-07, "loss": 0.001, "reward": 2.0134856700897217, "reward_std": 0.19197435677051544, "rewards/accuracy_reward": 0.8416104912757874, "rewards/format_reward": 1.0, "step": 637, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 376.625, "epoch": 0.019401532660260307, "grad_norm": 1.5195214629791318, "kl": 0.02685546875, "learning_rate": 9.99071509693762e-07, "loss": 0.0011, "reward": 1.7479314804077148, "reward_std": 0.19491618871688843, "rewards/accuracy_reward": 0.6041814088821411, "rewards/format_reward": 1.0, "step": 638, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 413.71875, "epoch": 0.01943194258606009, "grad_norm": 1.4126931807642382, "kl": 0.019287109375, "learning_rate": 9.990685976894865e-07, "loss": 0.0008, "reward": 1.5989735126495361, "reward_std": 0.208685964345932, "rewards/accuracy_reward": 0.4958483576774597, "rewards/format_reward": 1.0, "step": 639, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 399.015625, "epoch": 0.019462352511859873, "grad_norm": 2.458838051486715, "kl": 0.0260009765625, "learning_rate": 9.990656811301864e-07, "loss": 0.001, "reward": 1.6153169870376587, "reward_std": 0.19008949398994446, "rewards/accuracy_reward": 0.5059419870376587, "rewards/format_reward": 1.0, "step": 640, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 383.3125, "epoch": 0.019492762437659652, "grad_norm": 0.5127573351760508, "kl": 0.028076171875, "learning_rate": 9.990627600158885e-07, "loss": 0.0011, "reward": 1.8062500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 641, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 379.890625, "epoch": 0.019523172363459435, "grad_norm": 3.630182420875963, "kl": 0.0302734375, "learning_rate": 9.990598343466197e-07, "loss": 0.0012, "reward": 2.026771306991577, "reward_std": 0.1488124430179596, "rewards/accuracy_reward": 0.8392711877822876, "rewards/format_reward": 1.0, "step": 642, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 375.140625, "epoch": 0.019553582289259214, "grad_norm": 0.829760302162651, "kl": 0.0291748046875, "learning_rate": 9.990569041224063e-07, "loss": 0.0012, "reward": 2.075000047683716, "reward_std": 0.02314549870789051, "rewards/accuracy_reward": 0.8937500715255737, "rewards/format_reward": 1.0, "step": 643, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 410.203125, "epoch": 0.019583992215058996, "grad_norm": 0.8511825180955228, "kl": 0.02294921875, "learning_rate": 9.990539693432748e-07, "loss": 0.0009, "reward": 1.6409912109375, "reward_std": 0.15731123089790344, "rewards/accuracy_reward": 0.5284910202026367, "rewards/format_reward": 1.0, "step": 644, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 363.265625, "epoch": 0.019614402140858776, "grad_norm": 1.3223514324793195, "kl": 0.036376953125, "learning_rate": 9.99051030009253e-07, "loss": 0.0015, "reward": 1.950523018836975, "reward_std": 0.03516820818185806, "rewards/accuracy_reward": 0.7880231142044067, "rewards/format_reward": 1.0, "step": 645, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.78125, "epoch": 0.019644812066658558, "grad_norm": 2.099031418276086, "kl": 0.0252685546875, "learning_rate": 9.990480861203668e-07, "loss": 0.001, "reward": 2.1360092163085938, "reward_std": 0.06928229331970215, "rewards/accuracy_reward": 0.9516341090202332, "rewards/format_reward": 1.0, "step": 646, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 407.46875, "epoch": 0.019675221992458337, "grad_norm": 4.173178054074948, "kl": 0.0281982421875, "learning_rate": 9.990451376766434e-07, "loss": 0.0011, "reward": 1.7876081466674805, "reward_std": 0.08835624903440475, "rewards/accuracy_reward": 0.6407331824302673, "rewards/format_reward": 1.0, "step": 647, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 402.890625, "epoch": 0.01970563191825812, "grad_norm": 0.8465399637980721, "kl": 0.0281982421875, "learning_rate": 9.990421846781098e-07, "loss": 0.0011, "reward": 1.890625, "reward_std": 0.026516513898968697, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 648, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 395.78125, "epoch": 0.0197360418440579, "grad_norm": 0.8197573744592871, "kl": 0.03271484375, "learning_rate": 9.990392271247927e-07, "loss": 0.0013, "reward": 1.964296817779541, "reward_std": 0.08782364428043365, "rewards/accuracy_reward": 0.7799217104911804, "rewards/format_reward": 1.0, "step": 649, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 389.84375, "epoch": 0.019766451769857682, "grad_norm": 0.5187919632812292, "kl": 0.02978515625, "learning_rate": 9.990362650167194e-07, "loss": 0.0012, "reward": 2.106008291244507, "reward_std": 0.006497703958302736, "rewards/accuracy_reward": 0.906008243560791, "rewards/format_reward": 1.0, "step": 650, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 404.734375, "epoch": 0.01979686169565746, "grad_norm": 0.556164541443606, "kl": 0.036376953125, "learning_rate": 9.990332983539168e-07, "loss": 0.0015, "reward": 1.8004518747329712, "reward_std": 0.012578457593917847, "rewards/accuracy_reward": 0.6660767793655396, "rewards/format_reward": 1.0, "step": 651, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 451.1875, "epoch": 0.019827271621457244, "grad_norm": 1.5232993585917596, "kl": 0.0174560546875, "learning_rate": 9.990303271364116e-07, "loss": 0.0007, "reward": 1.9128844738006592, "reward_std": 0.1640322506427765, "rewards/accuracy_reward": 0.7691343426704407, "rewards/format_reward": 1.0, "step": 652, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 382.109375, "epoch": 0.019857681547257023, "grad_norm": 1.0257064360469872, "kl": 0.0283203125, "learning_rate": 9.990273513642315e-07, "loss": 0.0011, "reward": 1.9710252285003662, "reward_std": 0.026936471462249756, "rewards/accuracy_reward": 0.7991501688957214, "rewards/format_reward": 1.0, "step": 653, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 408.796875, "epoch": 0.019888091473056806, "grad_norm": 1.03674946322046, "kl": 0.0269775390625, "learning_rate": 9.990243710374036e-07, "loss": 0.0011, "reward": 1.6634440422058105, "reward_std": 0.19483540952205658, "rewards/accuracy_reward": 0.5540690422058105, "rewards/format_reward": 1.0, "step": 654, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 400.4375, "epoch": 0.01991850139885659, "grad_norm": 0.7579534517188027, "kl": 0.034423828125, "learning_rate": 9.990213861559548e-07, "loss": 0.0014, "reward": 1.5534179210662842, "reward_std": 0.07709024846553802, "rewards/accuracy_reward": 0.44716793298721313, "rewards/format_reward": 1.0, "step": 655, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 391.015625, "epoch": 0.019948911324656367, "grad_norm": 0.9517862587511295, "kl": 0.03515625, "learning_rate": 9.990183967199125e-07, "loss": 0.0014, "reward": 1.9356319904327393, "reward_std": 0.1179971694946289, "rewards/accuracy_reward": 0.7731318473815918, "rewards/format_reward": 1.0, "step": 656, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 419.609375, "epoch": 0.01997932125045615, "grad_norm": 1.0037908166550191, "kl": 0.027099609375, "learning_rate": 9.99015402729304e-07, "loss": 0.0011, "reward": 1.7795934677124023, "reward_std": 0.28604164719581604, "rewards/accuracy_reward": 0.6514685153961182, "rewards/format_reward": 1.0, "step": 657, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 416.734375, "epoch": 0.02000973117625593, "grad_norm": 1.1948288809149632, "kl": 0.03173828125, "learning_rate": 9.990124041841564e-07, "loss": 0.0013, "reward": 1.727430820465088, "reward_std": 0.14140310883522034, "rewards/accuracy_reward": 0.5868057608604431, "rewards/format_reward": 1.0, "step": 658, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 409.234375, "epoch": 0.020040141102055712, "grad_norm": 12.87615769115042, "kl": 0.027099609375, "learning_rate": 9.990094010844975e-07, "loss": 0.0011, "reward": 1.7602542638778687, "reward_std": 0.083908312022686, "rewards/accuracy_reward": 0.610254168510437, "rewards/format_reward": 1.0, "step": 659, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 416.15625, "epoch": 0.02007055102785549, "grad_norm": 1.4571503638723753, "kl": 0.031005859375, "learning_rate": 9.990063934303542e-07, "loss": 0.0012, "reward": 1.7599095106124878, "reward_std": 0.2226928323507309, "rewards/accuracy_reward": 0.6067845225334167, "rewards/format_reward": 0.984375, "step": 660, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 426.453125, "epoch": 0.020100960953655274, "grad_norm": 1.1290797322221129, "kl": 0.0262451171875, "learning_rate": 9.990033812217543e-07, "loss": 0.001, "reward": 1.7170798778533936, "reward_std": 0.12519504129886627, "rewards/accuracy_reward": 0.5920798778533936, "rewards/format_reward": 1.0, "step": 661, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.84375, "epoch": 0.020131370879455053, "grad_norm": 0.6663544267287709, "kl": 0.0235595703125, "learning_rate": 9.990003644587252e-07, "loss": 0.0009, "reward": 2.0290255546569824, "reward_std": 0.06814910471439362, "rewards/accuracy_reward": 0.8540254235267639, "rewards/format_reward": 1.0, "step": 662, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 402.265625, "epoch": 0.020161780805254836, "grad_norm": 1.0553800931832154, "kl": 0.0284423828125, "learning_rate": 9.989973431412945e-07, "loss": 0.0011, "reward": 2.031787872314453, "reward_std": 0.06939591467380524, "rewards/accuracy_reward": 0.8411630392074585, "rewards/format_reward": 1.0, "step": 663, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 404.59375, "epoch": 0.020192190731054615, "grad_norm": 0.962805620934257, "kl": 0.02490234375, "learning_rate": 9.9899431726949e-07, "loss": 0.001, "reward": 1.6338539123535156, "reward_std": 0.1546269953250885, "rewards/accuracy_reward": 0.5119788646697998, "rewards/format_reward": 1.0, "step": 664, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 399.890625, "epoch": 0.020222600656854398, "grad_norm": 1.3139052096558272, "kl": 0.0245361328125, "learning_rate": 9.989912868433386e-07, "loss": 0.001, "reward": 1.5590829849243164, "reward_std": 0.10083663463592529, "rewards/accuracy_reward": 0.44658297300338745, "rewards/format_reward": 1.0, "step": 665, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 366.609375, "epoch": 0.020253010582654177, "grad_norm": 0.5207536826700402, "kl": 0.033203125, "learning_rate": 9.989882518628686e-07, "loss": 0.0013, "reward": 2.0875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 666, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 402.3125, "epoch": 0.02028342050845396, "grad_norm": 1.0881756933706128, "kl": 0.0225830078125, "learning_rate": 9.989852123281073e-07, "loss": 0.0009, "reward": 1.759714961051941, "reward_std": 0.1360561102628708, "rewards/accuracy_reward": 0.6034649610519409, "rewards/format_reward": 1.0, "step": 667, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 389.125, "epoch": 0.020313830434253742, "grad_norm": 3.0831469477607323, "kl": 0.026123046875, "learning_rate": 9.989821682390829e-07, "loss": 0.001, "reward": 1.7557421922683716, "reward_std": 0.17218458652496338, "rewards/accuracy_reward": 0.6057420969009399, "rewards/format_reward": 1.0, "step": 668, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 362.265625, "epoch": 0.02034424036005352, "grad_norm": 1.060380386817024, "kl": 0.024658203125, "learning_rate": 9.989791195958227e-07, "loss": 0.001, "reward": 2.1773133277893066, "reward_std": 0.02901316061615944, "rewards/accuracy_reward": 0.9835633039474487, "rewards/format_reward": 1.0, "step": 669, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 379.546875, "epoch": 0.020374650285853304, "grad_norm": 1.0791148780455193, "kl": 0.029052734375, "learning_rate": 9.98976066398355e-07, "loss": 0.0012, "reward": 1.5806500911712646, "reward_std": 0.19715918600559235, "rewards/accuracy_reward": 0.4744000732898712, "rewards/format_reward": 1.0, "step": 670, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.125, "epoch": 0.020405060211653083, "grad_norm": 1.64637513725008, "kl": 0.024169921875, "learning_rate": 9.989730086467071e-07, "loss": 0.001, "reward": 1.9743870496749878, "reward_std": 0.04748433828353882, "rewards/accuracy_reward": 0.790012001991272, "rewards/format_reward": 1.0, "step": 671, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 373.71875, "epoch": 0.020435470137452866, "grad_norm": 2.473573119854195, "kl": 0.0245361328125, "learning_rate": 9.989699463409074e-07, "loss": 0.001, "reward": 2.0678670406341553, "reward_std": 0.09563078731298447, "rewards/accuracy_reward": 0.8897418975830078, "rewards/format_reward": 1.0, "step": 672, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 356.8125, "epoch": 0.020465880063252645, "grad_norm": 1.2070813582027315, "kl": 0.02490234375, "learning_rate": 9.989668794809838e-07, "loss": 0.001, "reward": 2.0343751907348633, "reward_std": 0.03808927163481712, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 673, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 356.3125, "epoch": 0.020496289989052428, "grad_norm": 1.2888948442110533, "kl": 0.0322265625, "learning_rate": 9.98963808066964e-07, "loss": 0.0013, "reward": 1.9281251430511475, "reward_std": 0.33044448494911194, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 674, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 415.765625, "epoch": 0.020526699914852207, "grad_norm": 1.3479555356873087, "kl": 0.0238037109375, "learning_rate": 9.989607320988762e-07, "loss": 0.001, "reward": 1.6207882165908813, "reward_std": 0.1512431800365448, "rewards/accuracy_reward": 0.5020381808280945, "rewards/format_reward": 1.0, "step": 675, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 360.890625, "epoch": 0.02055710984065199, "grad_norm": 0.9397730997851568, "kl": 0.0279541015625, "learning_rate": 9.989576515767484e-07, "loss": 0.0011, "reward": 1.7625000476837158, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 676, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 389.078125, "epoch": 0.02058751976645177, "grad_norm": 3.9149500611579713, "kl": 0.0272216796875, "learning_rate": 9.98954566500609e-07, "loss": 0.0011, "reward": 1.7727192640304565, "reward_std": 0.12443836778402328, "rewards/accuracy_reward": 0.6477192044258118, "rewards/format_reward": 1.0, "step": 677, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 367.609375, "epoch": 0.02061792969225155, "grad_norm": 1.5797261197006918, "kl": 0.0267333984375, "learning_rate": 9.989514768704858e-07, "loss": 0.0011, "reward": 1.9609650373458862, "reward_std": 0.20830951631069183, "rewards/accuracy_reward": 0.7922148704528809, "rewards/format_reward": 1.0, "step": 678, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 383.640625, "epoch": 0.02064833961805133, "grad_norm": 1.004336027718628, "kl": 0.028564453125, "learning_rate": 9.989483826864072e-07, "loss": 0.0011, "reward": 1.7489676475524902, "reward_std": 0.01990433782339096, "rewards/accuracy_reward": 0.6270925998687744, "rewards/format_reward": 1.0, "step": 679, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 381.453125, "epoch": 0.020678749543851113, "grad_norm": 0.8176627402165275, "kl": 0.02490234375, "learning_rate": 9.989452839484013e-07, "loss": 0.001, "reward": 1.8937500715255737, "reward_std": 0.12730026245117188, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 680, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 368.1875, "epoch": 0.020709159469650892, "grad_norm": 1.2394771992348648, "kl": 0.0286865234375, "learning_rate": 9.989421806564965e-07, "loss": 0.0012, "reward": 1.9444711208343506, "reward_std": 0.18188361823558807, "rewards/accuracy_reward": 0.7788462042808533, "rewards/format_reward": 1.0, "step": 681, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.765625, "epoch": 0.020739569395450675, "grad_norm": 0.7868972656018047, "kl": 0.021484375, "learning_rate": 9.98939072810721e-07, "loss": 0.0009, "reward": 1.8661730289459229, "reward_std": 0.14778602123260498, "rewards/accuracy_reward": 0.7067980766296387, "rewards/format_reward": 1.0, "step": 682, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 357.390625, "epoch": 0.020769979321250458, "grad_norm": 1.0699316632670048, "kl": 0.0299072265625, "learning_rate": 9.989359604111033e-07, "loss": 0.0012, "reward": 1.7626953125, "reward_std": 0.0924317017197609, "rewards/accuracy_reward": 0.6376953125, "rewards/format_reward": 1.0, "step": 683, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 417.78125, "epoch": 0.020800389247050237, "grad_norm": 0.7743100420567133, "kl": 0.0228271484375, "learning_rate": 9.98932843457672e-07, "loss": 0.0009, "reward": 1.8919696807861328, "reward_std": 0.1535273790359497, "rewards/accuracy_reward": 0.7482196688652039, "rewards/format_reward": 1.0, "step": 684, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.671875, "epoch": 0.02083079917285002, "grad_norm": 1.348400187622915, "kl": 0.0299072265625, "learning_rate": 9.98929721950455e-07, "loss": 0.0012, "reward": 1.7625000476837158, "reward_std": 0.1414213478565216, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 685, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 374.234375, "epoch": 0.0208612090986498, "grad_norm": 1.076024899431847, "kl": 0.0245361328125, "learning_rate": 9.989265958894813e-07, "loss": 0.001, "reward": 1.8858520984649658, "reward_std": 0.1646895855665207, "rewards/accuracy_reward": 0.7233520150184631, "rewards/format_reward": 1.0, "step": 686, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 410.859375, "epoch": 0.02089161902444958, "grad_norm": 1.8198815639881512, "kl": 0.02880859375, "learning_rate": 9.989234652747792e-07, "loss": 0.0012, "reward": 1.7496097087860107, "reward_std": 0.1555100828409195, "rewards/accuracy_reward": 0.5871097445487976, "rewards/format_reward": 1.0, "step": 687, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 384.375, "epoch": 0.02092202895024936, "grad_norm": 0.9495578796368424, "kl": 0.027587890625, "learning_rate": 9.98920330106377e-07, "loss": 0.0011, "reward": 1.796875, "reward_std": 0.19501903653144836, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 688, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.46875, "epoch": 0.020952438876049143, "grad_norm": 1.3903452606585855, "kl": 0.028076171875, "learning_rate": 9.989171903843041e-07, "loss": 0.0011, "reward": 1.9002516269683838, "reward_std": 0.03786957263946533, "rewards/accuracy_reward": 0.7033764123916626, "rewards/format_reward": 1.0, "step": 689, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.265625, "epoch": 0.020982848801848922, "grad_norm": 0.9129695414268254, "kl": 0.02978515625, "learning_rate": 9.989140461085884e-07, "loss": 0.0012, "reward": 1.9258019924163818, "reward_std": 0.06725609302520752, "rewards/accuracy_reward": 0.7601770162582397, "rewards/format_reward": 1.0, "step": 690, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 370.09375, "epoch": 0.021013258727648705, "grad_norm": 1.5291332037197616, "kl": 0.0302734375, "learning_rate": 9.989108972792588e-07, "loss": 0.0012, "reward": 2.1048214435577393, "reward_std": 0.018811214715242386, "rewards/accuracy_reward": 0.9110714197158813, "rewards/format_reward": 1.0, "step": 691, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 385.78125, "epoch": 0.021043668653448484, "grad_norm": 1.4911174892484735, "kl": 0.0250244140625, "learning_rate": 9.989077438963446e-07, "loss": 0.001, "reward": 1.972328543663025, "reward_std": 0.1256667971611023, "rewards/accuracy_reward": 0.7942036390304565, "rewards/format_reward": 1.0, "step": 692, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 364.59375, "epoch": 0.021074078579248267, "grad_norm": 4.0742315991079225, "kl": 0.035400390625, "learning_rate": 9.989045859598735e-07, "loss": 0.0014, "reward": 1.6659669876098633, "reward_std": 0.2535577714443207, "rewards/accuracy_reward": 0.5378419160842896, "rewards/format_reward": 1.0, "step": 693, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 396.109375, "epoch": 0.021104488505048046, "grad_norm": 0.6510446015733851, "kl": 0.023681640625, "learning_rate": 9.98901423469875e-07, "loss": 0.0009, "reward": 1.9562500715255737, "reward_std": 0.13476528227329254, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 694, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 379.6875, "epoch": 0.02113489843084783, "grad_norm": 1.284707243433621, "kl": 0.0341796875, "learning_rate": 9.988982564263783e-07, "loss": 0.0014, "reward": 1.9819912910461426, "reward_std": 0.02816206030547619, "rewards/accuracy_reward": 0.810116171836853, "rewards/format_reward": 1.0, "step": 695, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 420.765625, "epoch": 0.02116530835664761, "grad_norm": 1.0228858927903428, "kl": 0.025634765625, "learning_rate": 9.988950848294115e-07, "loss": 0.001, "reward": 1.956160068511963, "reward_std": 0.1683378666639328, "rewards/accuracy_reward": 0.7905351519584656, "rewards/format_reward": 1.0, "step": 696, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.140625, "epoch": 0.02119571828244739, "grad_norm": 6.953942738895297, "kl": 0.0306396484375, "learning_rate": 9.988919086790042e-07, "loss": 0.0012, "reward": 1.9083714485168457, "reward_std": 0.14154362678527832, "rewards/accuracy_reward": 0.7552464008331299, "rewards/format_reward": 1.0, "step": 697, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 413.65625, "epoch": 0.021226128208247173, "grad_norm": 2.0502753068091013, "kl": 0.02685546875, "learning_rate": 9.98888727975185e-07, "loss": 0.0011, "reward": 1.7411872148513794, "reward_std": 0.1346178948879242, "rewards/accuracy_reward": 0.5818121433258057, "rewards/format_reward": 1.0, "step": 698, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 398.671875, "epoch": 0.021256538134046953, "grad_norm": 1.1108207427788948, "kl": 0.0286865234375, "learning_rate": 9.98885542717983e-07, "loss": 0.0011, "reward": 1.6812796592712402, "reward_std": 0.19948555529117584, "rewards/accuracy_reward": 0.5594046711921692, "rewards/format_reward": 1.0, "step": 699, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 370.796875, "epoch": 0.021286948059846735, "grad_norm": 1.1051913229462709, "kl": 0.037841796875, "learning_rate": 9.988823529074273e-07, "loss": 0.0015, "reward": 1.8820419311523438, "reward_std": 0.028230367228388786, "rewards/accuracy_reward": 0.7007919549942017, "rewards/format_reward": 1.0, "step": 700, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 388.8125, "epoch": 0.021317357985646514, "grad_norm": 1.3189130744438085, "kl": 0.0286865234375, "learning_rate": 9.988791585435473e-07, "loss": 0.0011, "reward": 1.8846724033355713, "reward_std": 0.15534169971942902, "rewards/accuracy_reward": 0.7252973318099976, "rewards/format_reward": 1.0, "step": 701, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.015625, "epoch": 0.021347767911446297, "grad_norm": 1.2850311736204318, "kl": 0.0281982421875, "learning_rate": 9.988759596263716e-07, "loss": 0.0011, "reward": 1.9652068614959717, "reward_std": 0.14845925569534302, "rewards/accuracy_reward": 0.7995818257331848, "rewards/format_reward": 1.0, "step": 702, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 366.5, "epoch": 0.021378177837246076, "grad_norm": 0.7473492927736173, "kl": 0.0277099609375, "learning_rate": 9.988727561559298e-07, "loss": 0.0011, "reward": 1.8022041320800781, "reward_std": 0.012764747254550457, "rewards/accuracy_reward": 0.6522042155265808, "rewards/format_reward": 1.0, "step": 703, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.625, "epoch": 0.02140858776304586, "grad_norm": 3.5135745491854133, "kl": 0.0277099609375, "learning_rate": 9.98869548132251e-07, "loss": 0.0011, "reward": 2.0574450492858887, "reward_std": 0.18382766842842102, "rewards/accuracy_reward": 0.8824450373649597, "rewards/format_reward": 1.0, "step": 704, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 374.953125, "epoch": 0.021438997688845638, "grad_norm": 0.8270584942057014, "kl": 0.031982421875, "learning_rate": 9.988663355553646e-07, "loss": 0.0013, "reward": 2.015568733215332, "reward_std": 0.07329375296831131, "rewards/accuracy_reward": 0.8311936259269714, "rewards/format_reward": 1.0, "step": 705, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 422.6875, "epoch": 0.02146940761464542, "grad_norm": 1.2004283484774476, "kl": 0.0255126953125, "learning_rate": 9.988631184252998e-07, "loss": 0.001, "reward": 1.7782607078552246, "reward_std": 0.23250901699066162, "rewards/accuracy_reward": 0.6470108032226562, "rewards/format_reward": 1.0, "step": 706, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 402.140625, "epoch": 0.0214998175404452, "grad_norm": 0.7550582140447133, "kl": 0.0291748046875, "learning_rate": 9.988598967420859e-07, "loss": 0.0012, "reward": 1.8572123050689697, "reward_std": 0.09363141655921936, "rewards/accuracy_reward": 0.7072123885154724, "rewards/format_reward": 1.0, "step": 707, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 355.953125, "epoch": 0.021530227466244983, "grad_norm": 1.354910754421629, "kl": 0.029296875, "learning_rate": 9.988566705057524e-07, "loss": 0.0012, "reward": 1.6400147676467896, "reward_std": 0.10470054298639297, "rewards/accuracy_reward": 0.5243897438049316, "rewards/format_reward": 1.0, "step": 708, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 403.5625, "epoch": 0.021560637392044762, "grad_norm": 1.7470952393041894, "kl": 0.0247802734375, "learning_rate": 9.988534397163288e-07, "loss": 0.001, "reward": 1.6837437152862549, "reward_std": 0.11966129392385483, "rewards/accuracy_reward": 0.5649937391281128, "rewards/format_reward": 1.0, "step": 709, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 368.421875, "epoch": 0.021591047317844544, "grad_norm": 0.9576535290302793, "kl": 0.034912109375, "learning_rate": 9.988502043738446e-07, "loss": 0.0014, "reward": 1.9031682014465332, "reward_std": 0.1091449111700058, "rewards/accuracy_reward": 0.7312930822372437, "rewards/format_reward": 1.0, "step": 710, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 360.046875, "epoch": 0.021621457243644327, "grad_norm": 1.9683335088547544, "kl": 0.0284423828125, "learning_rate": 9.988469644783292e-07, "loss": 0.0011, "reward": 1.421875, "reward_std": 0.3600984215736389, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 711, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.453125, "epoch": 0.021651867169444106, "grad_norm": 1.1191351287384375, "kl": 0.03271484375, "learning_rate": 9.988437200298122e-07, "loss": 0.0013, "reward": 1.8150949478149414, "reward_std": 0.0747203379869461, "rewards/accuracy_reward": 0.6494698524475098, "rewards/format_reward": 1.0, "step": 712, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 396.359375, "epoch": 0.02168227709524389, "grad_norm": 1.5410124151734454, "kl": 0.027587890625, "learning_rate": 9.988404710283233e-07, "loss": 0.0011, "reward": 1.8281421661376953, "reward_std": 0.1140567883849144, "rewards/accuracy_reward": 0.6875171661376953, "rewards/format_reward": 1.0, "step": 713, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.640625, "epoch": 0.021712687021043668, "grad_norm": 1.0159516208860988, "kl": 0.034423828125, "learning_rate": 9.988372174738921e-07, "loss": 0.0014, "reward": 1.9187500476837158, "reward_std": 0.14878928661346436, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 714, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 383.875, "epoch": 0.02174309694684345, "grad_norm": 1.3544330017464423, "kl": 0.028564453125, "learning_rate": 9.988339593665483e-07, "loss": 0.0011, "reward": 1.9659550189971924, "reward_std": 0.18172678351402283, "rewards/accuracy_reward": 0.8034549355506897, "rewards/format_reward": 1.0, "step": 715, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 391.9375, "epoch": 0.02177350687264323, "grad_norm": 2.0999879748810115, "kl": 0.032958984375, "learning_rate": 9.988306967063215e-07, "loss": 0.0013, "reward": 1.8330107927322388, "reward_std": 0.05549243837594986, "rewards/accuracy_reward": 0.689260721206665, "rewards/format_reward": 1.0, "step": 716, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.828125, "epoch": 0.021803916798443013, "grad_norm": 2.892890202982373, "kl": 0.03125, "learning_rate": 9.988274294932418e-07, "loss": 0.0013, "reward": 1.875, "reward_std": 0.1660207211971283, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 717, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 387.078125, "epoch": 0.021834326724242792, "grad_norm": 1.2159550958942147, "kl": 0.033447265625, "learning_rate": 9.988241577273387e-07, "loss": 0.0013, "reward": 1.9819549322128296, "reward_std": 0.11317240446805954, "rewards/accuracy_reward": 0.8007049560546875, "rewards/format_reward": 1.0, "step": 718, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 367.78125, "epoch": 0.021864736650042575, "grad_norm": 1.6662803638531312, "kl": 0.0302734375, "learning_rate": 9.988208814086423e-07, "loss": 0.0012, "reward": 2.0187501907348633, "reward_std": 0.07902076840400696, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 719, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 367.5625, "epoch": 0.021895146575842354, "grad_norm": 1.3977024575206578, "kl": 0.03564453125, "learning_rate": 9.988176005371823e-07, "loss": 0.0014, "reward": 1.7965492010116577, "reward_std": 0.1774827092885971, "rewards/accuracy_reward": 0.6621740460395813, "rewards/format_reward": 1.0, "step": 720, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 393.78125, "epoch": 0.021925556501642136, "grad_norm": 1.8056817244750996, "kl": 0.031494140625, "learning_rate": 9.98814315112989e-07, "loss": 0.0013, "reward": 1.934206485748291, "reward_std": 0.09965212643146515, "rewards/accuracy_reward": 0.7592064142227173, "rewards/format_reward": 1.0, "step": 721, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 356.09375, "epoch": 0.021955966427441916, "grad_norm": 0.9567560347553676, "kl": 0.035400390625, "learning_rate": 9.988110251360917e-07, "loss": 0.0014, "reward": 2.190624952316284, "reward_std": 0.026516513898968697, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 722, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 361.21875, "epoch": 0.021986376353241698, "grad_norm": 0.8158320054946443, "kl": 0.035400390625, "learning_rate": 9.98807730606521e-07, "loss": 0.0014, "reward": 1.723860740661621, "reward_std": 0.040238410234451294, "rewards/accuracy_reward": 0.58636075258255, "rewards/format_reward": 1.0, "step": 723, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 398.65625, "epoch": 0.022016786279041477, "grad_norm": 1.1098093094231731, "kl": 0.0289306640625, "learning_rate": 9.98804431524307e-07, "loss": 0.0012, "reward": 1.4754552841186523, "reward_std": 0.1270744353532791, "rewards/accuracy_reward": 0.3785802721977234, "rewards/format_reward": 1.0, "step": 724, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 373.25, "epoch": 0.02204719620484126, "grad_norm": 1.2183616601207778, "kl": 0.0380859375, "learning_rate": 9.988011278894795e-07, "loss": 0.0015, "reward": 1.7102231979370117, "reward_std": 0.08546829968690872, "rewards/accuracy_reward": 0.5852231383323669, "rewards/format_reward": 1.0, "step": 725, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 399.03125, "epoch": 0.022077606130641043, "grad_norm": 1.1681630281894226, "kl": 0.0283203125, "learning_rate": 9.987978197020687e-07, "loss": 0.0011, "reward": 1.501389741897583, "reward_std": 0.1450280249118805, "rewards/accuracy_reward": 0.4013897180557251, "rewards/format_reward": 1.0, "step": 726, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.1875, "epoch": 0.022108016056440822, "grad_norm": 1.1411547253118972, "kl": 0.0299072265625, "learning_rate": 9.98794506962105e-07, "loss": 0.0012, "reward": 2.012653350830078, "reward_std": 0.1220325455069542, "rewards/accuracy_reward": 0.8220282793045044, "rewards/format_reward": 1.0, "step": 727, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 377.0, "epoch": 0.022138425982240605, "grad_norm": 1.3514340209932092, "kl": 0.034423828125, "learning_rate": 9.987911896696185e-07, "loss": 0.0014, "reward": 2.0093750953674316, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 728, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 412.1875, "epoch": 0.022168835908040384, "grad_norm": 1.1702099934196326, "kl": 0.033203125, "learning_rate": 9.987878678246393e-07, "loss": 0.0013, "reward": 1.6856060028076172, "reward_std": 0.15356150269508362, "rewards/accuracy_reward": 0.5699810981750488, "rewards/format_reward": 1.0, "step": 729, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 383.234375, "epoch": 0.022199245833840166, "grad_norm": 0.7273505110212117, "kl": 0.031494140625, "learning_rate": 9.987845414271981e-07, "loss": 0.0013, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 730, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 379.8125, "epoch": 0.022229655759639946, "grad_norm": 1.4263464375075543, "kl": 0.03515625, "learning_rate": 9.987812104773252e-07, "loss": 0.0014, "reward": 1.7749428749084473, "reward_std": 0.1686939001083374, "rewards/accuracy_reward": 0.6218177080154419, "rewards/format_reward": 1.0, "step": 731, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 402.015625, "epoch": 0.02226006568543973, "grad_norm": 0.8913883866530306, "kl": 0.0380859375, "learning_rate": 9.987778749750506e-07, "loss": 0.0015, "reward": 1.7426148653030396, "reward_std": 0.007226511836051941, "rewards/accuracy_reward": 0.5957398414611816, "rewards/format_reward": 1.0, "step": 732, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 394.3125, "epoch": 0.022290475611239507, "grad_norm": 1.8338488816478984, "kl": 0.0361328125, "learning_rate": 9.987745349204053e-07, "loss": 0.0014, "reward": 1.711248517036438, "reward_std": 0.1885174810886383, "rewards/accuracy_reward": 0.5737485289573669, "rewards/format_reward": 1.0, "step": 733, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.140625, "epoch": 0.02232088553703929, "grad_norm": 0.9094313823446758, "kl": 0.0361328125, "learning_rate": 9.987711903134193e-07, "loss": 0.0014, "reward": 1.84375, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 734, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 381.296875, "epoch": 0.02235129546283907, "grad_norm": 1.1464659738770122, "kl": 0.03515625, "learning_rate": 9.987678411541234e-07, "loss": 0.0014, "reward": 1.6514378786087036, "reward_std": 0.18804897367954254, "rewards/accuracy_reward": 0.5201878547668457, "rewards/format_reward": 1.0, "step": 735, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.984375, "epoch": 0.022381705388638852, "grad_norm": 1.291010346276496, "kl": 0.0302734375, "learning_rate": 9.98764487442548e-07, "loss": 0.0012, "reward": 1.6768947839736938, "reward_std": 0.08444131910800934, "rewards/accuracy_reward": 0.555019736289978, "rewards/format_reward": 1.0, "step": 736, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 391.421875, "epoch": 0.02241211531443863, "grad_norm": 0.823421836041912, "kl": 0.036865234375, "learning_rate": 9.98761129178724e-07, "loss": 0.0015, "reward": 1.7976568937301636, "reward_std": 0.06957726180553436, "rewards/accuracy_reward": 0.6601568460464478, "rewards/format_reward": 1.0, "step": 737, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.328125, "epoch": 0.022442525240238414, "grad_norm": 2.2622801393055396, "kl": 0.0322265625, "learning_rate": 9.987577663626817e-07, "loss": 0.0013, "reward": 1.716848611831665, "reward_std": 0.21006464958190918, "rewards/accuracy_reward": 0.6074735522270203, "rewards/format_reward": 1.0, "step": 738, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 365.921875, "epoch": 0.022472935166038197, "grad_norm": 3.015272825477005, "kl": 0.031494140625, "learning_rate": 9.987543989944519e-07, "loss": 0.0013, "reward": 1.9758110046386719, "reward_std": 0.0849425345659256, "rewards/accuracy_reward": 0.807060956954956, "rewards/format_reward": 1.0, "step": 739, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 389.578125, "epoch": 0.022503345091837976, "grad_norm": 1.0688015856224915, "kl": 0.0264892578125, "learning_rate": 9.987510270740654e-07, "loss": 0.0011, "reward": 1.8392271995544434, "reward_std": 0.07759998738765717, "rewards/accuracy_reward": 0.6861021518707275, "rewards/format_reward": 1.0, "step": 740, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 372.515625, "epoch": 0.02253375501763776, "grad_norm": 1.1500346694003214, "kl": 0.034912109375, "learning_rate": 9.98747650601553e-07, "loss": 0.0014, "reward": 1.7619264125823975, "reward_std": 0.19373206794261932, "rewards/accuracy_reward": 0.63067626953125, "rewards/format_reward": 1.0, "step": 741, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 375.953125, "epoch": 0.022564164943437538, "grad_norm": 1.1035879546104315, "kl": 0.03369140625, "learning_rate": 9.987442695769455e-07, "loss": 0.0013, "reward": 1.7696813344955444, "reward_std": 0.19338145852088928, "rewards/accuracy_reward": 0.6321812868118286, "rewards/format_reward": 1.0, "step": 742, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 363.71875, "epoch": 0.02259457486923732, "grad_norm": 1.2437793328857525, "kl": 0.030517578125, "learning_rate": 9.987408840002737e-07, "loss": 0.0012, "reward": 1.8453125953674316, "reward_std": 0.042508676648139954, "rewards/accuracy_reward": 0.7015625834465027, "rewards/format_reward": 1.0, "step": 743, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 370.59375, "epoch": 0.0226249847950371, "grad_norm": 1.8445164487446828, "kl": 0.02587890625, "learning_rate": 9.987374938715685e-07, "loss": 0.001, "reward": 1.6270833015441895, "reward_std": 0.18902161717414856, "rewards/accuracy_reward": 0.5208333134651184, "rewards/format_reward": 1.0, "step": 744, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 393.625, "epoch": 0.022655394720836882, "grad_norm": 3.890205553589082, "kl": 0.025390625, "learning_rate": 9.98734099190861e-07, "loss": 0.001, "reward": 1.6375000476837158, "reward_std": 0.19854024052619934, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 745, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 340.171875, "epoch": 0.02268580464663666, "grad_norm": 1.8007790811440603, "kl": 0.0291748046875, "learning_rate": 9.98730699958182e-07, "loss": 0.0012, "reward": 1.8573578596115112, "reward_std": 0.12742909789085388, "rewards/accuracy_reward": 0.7167328000068665, "rewards/format_reward": 1.0, "step": 746, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 402.375, "epoch": 0.022716214572436444, "grad_norm": 4.872981637051921, "kl": 0.0211181640625, "learning_rate": 9.987272961735625e-07, "loss": 0.0008, "reward": 1.6657042503356934, "reward_std": 0.2179035246372223, "rewards/accuracy_reward": 0.5719541311264038, "rewards/format_reward": 1.0, "step": 747, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 349.0625, "epoch": 0.022746624498236223, "grad_norm": 1.3989106953302732, "kl": 0.027587890625, "learning_rate": 9.987238878370337e-07, "loss": 0.0011, "reward": 1.798216462135315, "reward_std": 0.18179690837860107, "rewards/accuracy_reward": 0.6794663667678833, "rewards/format_reward": 1.0, "step": 748, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 357.046875, "epoch": 0.022777034424036006, "grad_norm": 1.9984254089382694, "kl": 0.025146484375, "learning_rate": 9.987204749486267e-07, "loss": 0.001, "reward": 1.9562500715255737, "reward_std": 0.22807425260543823, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 749, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 372.6875, "epoch": 0.022807444349835785, "grad_norm": 1.9903949992535945, "kl": 0.0255126953125, "learning_rate": 9.987170575083727e-07, "loss": 0.001, "reward": 1.6590054035186768, "reward_std": 0.16362865269184113, "rewards/accuracy_reward": 0.5183802843093872, "rewards/format_reward": 1.0, "step": 750, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 385.125, "epoch": 0.022837854275635568, "grad_norm": 1.1454249383111754, "kl": 0.0252685546875, "learning_rate": 9.987136355163027e-07, "loss": 0.001, "reward": 1.9045586585998535, "reward_std": 0.21874359250068665, "rewards/accuracy_reward": 0.7358086705207825, "rewards/format_reward": 1.0, "step": 751, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.296875, "epoch": 0.022868264201435347, "grad_norm": 0.8814794824068796, "kl": 0.0274658203125, "learning_rate": 9.987102089724479e-07, "loss": 0.0011, "reward": 2.184375047683716, "reward_std": 0.030616413801908493, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 752, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 404.015625, "epoch": 0.02289867412723513, "grad_norm": 0.8128564540040495, "kl": 0.022216796875, "learning_rate": 9.987067778768398e-07, "loss": 0.0009, "reward": 1.7192349433898926, "reward_std": 0.1248355358839035, "rewards/accuracy_reward": 0.6004849076271057, "rewards/format_reward": 1.0, "step": 753, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 363.25, "epoch": 0.022929084053034912, "grad_norm": 1.892492029894602, "kl": 0.030029296875, "learning_rate": 9.987033422295096e-07, "loss": 0.0012, "reward": 1.783818006515503, "reward_std": 0.09978213906288147, "rewards/accuracy_reward": 0.6619428396224976, "rewards/format_reward": 1.0, "step": 754, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.921875, "epoch": 0.02295949397883469, "grad_norm": 1.3484429636558195, "kl": 0.026611328125, "learning_rate": 9.986999020304889e-07, "loss": 0.0011, "reward": 1.9940892457962036, "reward_std": 0.11476308107376099, "rewards/accuracy_reward": 0.8128392100334167, "rewards/format_reward": 1.0, "step": 755, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 395.40625, "epoch": 0.022989903904634474, "grad_norm": 1.1782680560420162, "kl": 0.0257568359375, "learning_rate": 9.986964572798087e-07, "loss": 0.001, "reward": 1.671875, "reward_std": 0.3408779203891754, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 756, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 397.046875, "epoch": 0.023020313830434253, "grad_norm": 3.9010688571281094, "kl": 0.029052734375, "learning_rate": 9.986930079775006e-07, "loss": 0.0012, "reward": 1.7876451015472412, "reward_std": 0.3001384735107422, "rewards/accuracy_reward": 0.6532700061798096, "rewards/format_reward": 1.0, "step": 757, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.734375, "epoch": 0.023050723756234036, "grad_norm": 0.8314108017978707, "kl": 0.0250244140625, "learning_rate": 9.986895541235962e-07, "loss": 0.001, "reward": 1.9951221942901611, "reward_std": 0.0914997085928917, "rewards/accuracy_reward": 0.8263722062110901, "rewards/format_reward": 1.0, "step": 758, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.125, "epoch": 0.023081133682033815, "grad_norm": 1.8539042371917176, "kl": 0.032470703125, "learning_rate": 9.98686095718127e-07, "loss": 0.0013, "reward": 1.8104023933410645, "reward_std": 0.2301657497882843, "rewards/accuracy_reward": 0.6791523694992065, "rewards/format_reward": 1.0, "step": 759, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 399.890625, "epoch": 0.023111543607833598, "grad_norm": 0.8654292889895803, "kl": 0.0224609375, "learning_rate": 9.986826327611243e-07, "loss": 0.0009, "reward": 1.9511767625808716, "reward_std": 0.25309449434280396, "rewards/accuracy_reward": 0.7886766791343689, "rewards/format_reward": 1.0, "step": 760, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 395.953125, "epoch": 0.023141953533633377, "grad_norm": 1.0741326810940508, "kl": 0.0264892578125, "learning_rate": 9.9867916525262e-07, "loss": 0.0011, "reward": 1.8732432126998901, "reward_std": 0.09512462466955185, "rewards/accuracy_reward": 0.710743248462677, "rewards/format_reward": 1.0, "step": 761, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 378.4375, "epoch": 0.02317236345943316, "grad_norm": 1.3741871806333228, "kl": 0.0279541015625, "learning_rate": 9.986756931926455e-07, "loss": 0.0011, "reward": 1.613971471786499, "reward_std": 0.17764733731746674, "rewards/accuracy_reward": 0.48897144198417664, "rewards/format_reward": 1.0, "step": 762, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 381.3125, "epoch": 0.02320277338523294, "grad_norm": 0.7373601609617522, "kl": 0.028564453125, "learning_rate": 9.986722165812326e-07, "loss": 0.0011, "reward": 1.734375, "reward_std": 0.08797704428434372, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 763, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 393.34375, "epoch": 0.02323318331103272, "grad_norm": 1.7867541502119688, "kl": 0.0260009765625, "learning_rate": 9.986687354184132e-07, "loss": 0.001, "reward": 1.7962843179702759, "reward_std": 0.246465265750885, "rewards/accuracy_reward": 0.6619092226028442, "rewards/format_reward": 1.0, "step": 764, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 365.578125, "epoch": 0.0232635932368325, "grad_norm": 0.5880294556167397, "kl": 0.0286865234375, "learning_rate": 9.986652497042189e-07, "loss": 0.0011, "reward": 2.1031250953674316, "reward_std": 0.07372426986694336, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 765, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.609375, "epoch": 0.023294003162632283, "grad_norm": 1.4468293109569579, "kl": 0.0267333984375, "learning_rate": 9.986617594386817e-07, "loss": 0.0011, "reward": 1.8983525037765503, "reward_std": 0.18144752085208893, "rewards/accuracy_reward": 0.7264774441719055, "rewards/format_reward": 1.0, "step": 766, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 363.8125, "epoch": 0.023324413088432066, "grad_norm": 0.4200898096974516, "kl": 0.0283203125, "learning_rate": 9.98658264621833e-07, "loss": 0.0011, "reward": 1.8593751192092896, "reward_std": 0.06805657595396042, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 767, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 378.609375, "epoch": 0.023354823014231845, "grad_norm": 1.149190483234591, "kl": 0.0281982421875, "learning_rate": 9.986547652537052e-07, "loss": 0.0011, "reward": 1.7759602069854736, "reward_std": 0.07735943049192429, "rewards/accuracy_reward": 0.6478351354598999, "rewards/format_reward": 1.0, "step": 768, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 386.78125, "epoch": 0.023385232940031628, "grad_norm": 1.9985330531319843, "kl": 0.0264892578125, "learning_rate": 9.9865126133433e-07, "loss": 0.0011, "reward": 1.7804362773895264, "reward_std": 0.13158154487609863, "rewards/accuracy_reward": 0.6398111581802368, "rewards/format_reward": 1.0, "step": 769, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.75, "epoch": 0.023415642865831407, "grad_norm": 1.7796850612062007, "kl": 0.0286865234375, "learning_rate": 9.986477528637397e-07, "loss": 0.0011, "reward": 1.851261854171753, "reward_std": 0.25341296195983887, "rewards/accuracy_reward": 0.7137618064880371, "rewards/format_reward": 1.0, "step": 770, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.3125, "epoch": 0.02344605279163119, "grad_norm": 1.6343019910511987, "kl": 0.0272216796875, "learning_rate": 9.986442398419657e-07, "loss": 0.0011, "reward": 1.7222223281860352, "reward_std": 0.07617852836847305, "rewards/accuracy_reward": 0.6128472089767456, "rewards/format_reward": 1.0, "step": 771, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 416.90625, "epoch": 0.02347646271743097, "grad_norm": 0.8128435781265602, "kl": 0.02099609375, "learning_rate": 9.986407222690405e-07, "loss": 0.0008, "reward": 1.7283798456192017, "reward_std": 0.03378346562385559, "rewards/accuracy_reward": 0.5877547860145569, "rewards/format_reward": 1.0, "step": 772, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.53125, "epoch": 0.02350687264323075, "grad_norm": 1.5830758083146865, "kl": 0.027099609375, "learning_rate": 9.98637200144996e-07, "loss": 0.0011, "reward": 1.847960352897644, "reward_std": 0.17717812955379486, "rewards/accuracy_reward": 0.6885852813720703, "rewards/format_reward": 1.0, "step": 773, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 391.578125, "epoch": 0.02353728256903053, "grad_norm": 0.9616305243958918, "kl": 0.023681640625, "learning_rate": 9.986336734698648e-07, "loss": 0.0009, "reward": 1.908273696899414, "reward_std": 0.11014175415039062, "rewards/accuracy_reward": 0.7551485896110535, "rewards/format_reward": 1.0, "step": 774, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 437.609375, "epoch": 0.023567692494830313, "grad_norm": 2.3915666241249225, "kl": 0.0235595703125, "learning_rate": 9.986301422436784e-07, "loss": 0.0009, "reward": 1.853381633758545, "reward_std": 0.11915645748376846, "rewards/accuracy_reward": 0.7002565860748291, "rewards/format_reward": 1.0, "step": 775, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 376.5, "epoch": 0.023598102420630093, "grad_norm": 1.2744706011582818, "kl": 0.0269775390625, "learning_rate": 9.986266064664697e-07, "loss": 0.0011, "reward": 2.0259487628936768, "reward_std": 0.08181364834308624, "rewards/accuracy_reward": 0.8384487628936768, "rewards/format_reward": 1.0, "step": 776, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 399.578125, "epoch": 0.023628512346429875, "grad_norm": 0.7514170902259528, "kl": 0.023193359375, "learning_rate": 9.986230661382704e-07, "loss": 0.0009, "reward": 1.8625000715255737, "reward_std": 0.22470125555992126, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 777, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 397.03125, "epoch": 0.023658922272229654, "grad_norm": 1.0399599372562451, "kl": 0.021240234375, "learning_rate": 9.986195212591131e-07, "loss": 0.0008, "reward": 1.4069530963897705, "reward_std": 0.1705554723739624, "rewards/accuracy_reward": 0.3444531261920929, "rewards/format_reward": 1.0, "step": 778, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 387.21875, "epoch": 0.023689332198029437, "grad_norm": 1.4523965467122186, "kl": 0.0306396484375, "learning_rate": 9.986159718290302e-07, "loss": 0.0012, "reward": 1.8218750953674316, "reward_std": 0.34781375527381897, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 779, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 374.125, "epoch": 0.023719742123829216, "grad_norm": 1.1810049410022392, "kl": 0.03466796875, "learning_rate": 9.98612417848054e-07, "loss": 0.0014, "reward": 2.0662689208984375, "reward_std": 0.10660039633512497, "rewards/accuracy_reward": 0.875643789768219, "rewards/format_reward": 1.0, "step": 780, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.90625, "epoch": 0.023750152049629, "grad_norm": 0.1340575940459647, "kl": 0.03271484375, "learning_rate": 9.98608859316217e-07, "loss": 0.0013, "reward": 2.008333206176758, "reward_std": 0.0, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 781, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 404.109375, "epoch": 0.02378056197542878, "grad_norm": 1.2327608833746468, "kl": 0.0284423828125, "learning_rate": 9.986052962335516e-07, "loss": 0.0011, "reward": 1.5544676780700684, "reward_std": 0.14577242732048035, "rewards/accuracy_reward": 0.42946767807006836, "rewards/format_reward": 1.0, "step": 782, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 436.4375, "epoch": 0.02381097190122856, "grad_norm": 1.1078770998810665, "kl": 0.026123046875, "learning_rate": 9.986017286000902e-07, "loss": 0.001, "reward": 1.6280699968338013, "reward_std": 0.2018967717885971, "rewards/accuracy_reward": 0.5530700087547302, "rewards/format_reward": 0.96875, "step": 783, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 358.9375, "epoch": 0.023841381827028343, "grad_norm": 0.7931463344084059, "kl": 0.03369140625, "learning_rate": 9.985981564158656e-07, "loss": 0.0014, "reward": 1.9065475463867188, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.7596726417541504, "rewards/format_reward": 1.0, "step": 784, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 376.359375, "epoch": 0.023871791752828123, "grad_norm": 0.5884591963971636, "kl": 0.0263671875, "learning_rate": 9.985945796809103e-07, "loss": 0.0011, "reward": 1.931249976158142, "reward_std": 0.07283870130777359, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 785, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 358.28125, "epoch": 0.023902201678627905, "grad_norm": 1.4184127451505397, "kl": 0.0277099609375, "learning_rate": 9.98590998395257e-07, "loss": 0.0011, "reward": 1.9197499752044678, "reward_std": 0.17344173789024353, "rewards/accuracy_reward": 0.7572498917579651, "rewards/format_reward": 1.0, "step": 786, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 361.28125, "epoch": 0.023932611604427684, "grad_norm": 1.011397119800329, "kl": 0.0277099609375, "learning_rate": 9.985874125589383e-07, "loss": 0.0011, "reward": 1.9830851554870605, "reward_std": 0.08638297021389008, "rewards/accuracy_reward": 0.8112100958824158, "rewards/format_reward": 1.0, "step": 787, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 388.375, "epoch": 0.023963021530227467, "grad_norm": 1.5502893031665737, "kl": 0.0244140625, "learning_rate": 9.985838221719871e-07, "loss": 0.001, "reward": 1.7328448295593262, "reward_std": 0.19366195797920227, "rewards/accuracy_reward": 0.6390948295593262, "rewards/format_reward": 0.96875, "step": 788, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 406.6875, "epoch": 0.023993431456027246, "grad_norm": 1.1517971177658946, "kl": 0.027099609375, "learning_rate": 9.985802272344359e-07, "loss": 0.0011, "reward": 1.4072890281677246, "reward_std": 0.17669269442558289, "rewards/accuracy_reward": 0.35728907585144043, "rewards/format_reward": 0.984375, "step": 789, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 363.40625, "epoch": 0.02402384138182703, "grad_norm": 2.1427116200520406, "kl": 0.02685546875, "learning_rate": 9.985766277463179e-07, "loss": 0.0011, "reward": 1.7281250953674316, "reward_std": 0.22230380773544312, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 790, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.140625, "epoch": 0.024054251307626808, "grad_norm": 2.4593138913151082, "kl": 0.0279541015625, "learning_rate": 9.985730237076655e-07, "loss": 0.0011, "reward": 1.7292461395263672, "reward_std": 0.11161977797746658, "rewards/accuracy_reward": 0.6104961037635803, "rewards/format_reward": 1.0, "step": 791, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 363.015625, "epoch": 0.02408466123342659, "grad_norm": 1.5380651608787437, "kl": 0.031005859375, "learning_rate": 9.985694151185117e-07, "loss": 0.0012, "reward": 1.8104652166366577, "reward_std": 0.17807050049304962, "rewards/accuracy_reward": 0.6635902523994446, "rewards/format_reward": 1.0, "step": 792, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 355.859375, "epoch": 0.02411507115922637, "grad_norm": 2.905784489643889, "kl": 0.0257568359375, "learning_rate": 9.985658019788897e-07, "loss": 0.001, "reward": 1.4597008228302002, "reward_std": 0.27356308698654175, "rewards/accuracy_reward": 0.3722008168697357, "rewards/format_reward": 1.0, "step": 793, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 345.1875, "epoch": 0.024145481085026153, "grad_norm": 0.8647291599259485, "kl": 0.0252685546875, "learning_rate": 9.985621842888324e-07, "loss": 0.001, "reward": 1.9494946002960205, "reward_std": 0.03824981302022934, "rewards/accuracy_reward": 0.7869945168495178, "rewards/format_reward": 1.0, "step": 794, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 395.875, "epoch": 0.024175891010825932, "grad_norm": 1.3702880013868752, "kl": 0.027099609375, "learning_rate": 9.985585620483725e-07, "loss": 0.0011, "reward": 1.6429288387298584, "reward_std": 0.32906872034072876, "rewards/accuracy_reward": 0.5116787552833557, "rewards/format_reward": 1.0, "step": 795, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 392.03125, "epoch": 0.024206300936625715, "grad_norm": 1.3157251153830898, "kl": 0.023193359375, "learning_rate": 9.985549352575435e-07, "loss": 0.0009, "reward": 1.9093750715255737, "reward_std": 0.16394595801830292, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 796, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 377.890625, "epoch": 0.024236710862425497, "grad_norm": 1.3937358038483803, "kl": 0.0262451171875, "learning_rate": 9.985513039163782e-07, "loss": 0.001, "reward": 1.801854133605957, "reward_std": 0.15722987055778503, "rewards/accuracy_reward": 0.6674790382385254, "rewards/format_reward": 1.0, "step": 797, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.40625, "epoch": 0.024267120788225276, "grad_norm": 1.7339451275938458, "kl": 0.02490234375, "learning_rate": 9.9854766802491e-07, "loss": 0.001, "reward": 1.799006700515747, "reward_std": 0.1200207993388176, "rewards/accuracy_reward": 0.6490066647529602, "rewards/format_reward": 1.0, "step": 798, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 367.296875, "epoch": 0.02429753071402506, "grad_norm": 1.401229118477146, "kl": 0.031494140625, "learning_rate": 9.985440275831716e-07, "loss": 0.0013, "reward": 1.8512508869171143, "reward_std": 0.08396824449300766, "rewards/accuracy_reward": 0.6825007796287537, "rewards/format_reward": 1.0, "step": 799, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.5, "epoch": 0.024327940639824838, "grad_norm": 1.2778062111740731, "kl": 0.0294189453125, "learning_rate": 9.985403825911968e-07, "loss": 0.0012, "reward": 1.7474007606506348, "reward_std": 0.13472574949264526, "rewards/accuracy_reward": 0.5974007248878479, "rewards/format_reward": 1.0, "step": 800, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 360.15625, "epoch": 0.02435835056562462, "grad_norm": 0.8422667755068607, "kl": 0.0380859375, "learning_rate": 9.985367330490184e-07, "loss": 0.0015, "reward": 2.1187500953674316, "reward_std": 0.09175113588571548, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 801, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 469.96875, "epoch": 0.0243887604914244, "grad_norm": 2.044532105927981, "kl": 0.0194091796875, "learning_rate": 9.985330789566701e-07, "loss": 0.0008, "reward": 1.5624651908874512, "reward_std": 0.30480289459228516, "rewards/accuracy_reward": 0.4562152624130249, "rewards/format_reward": 1.0, "step": 802, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 437.9375, "epoch": 0.024419170417224183, "grad_norm": 0.936276620147263, "kl": 0.024658203125, "learning_rate": 9.985294203141851e-07, "loss": 0.001, "reward": 1.817041039466858, "reward_std": 0.09996174275875092, "rewards/accuracy_reward": 0.6670409440994263, "rewards/format_reward": 1.0, "step": 803, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.578125, "epoch": 0.024449580343023962, "grad_norm": 2.7390871030767387, "kl": 0.03564453125, "learning_rate": 9.985257571215969e-07, "loss": 0.0014, "reward": 1.8532803058624268, "reward_std": 0.16679328680038452, "rewards/accuracy_reward": 0.7001551389694214, "rewards/format_reward": 1.0, "step": 804, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.421875, "epoch": 0.024479990268823745, "grad_norm": 1.1350548686394575, "kl": 0.0281982421875, "learning_rate": 9.985220893789386e-07, "loss": 0.0011, "reward": 1.8072664737701416, "reward_std": 0.08884067088365555, "rewards/accuracy_reward": 0.6416415572166443, "rewards/format_reward": 1.0, "step": 805, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 381.078125, "epoch": 0.024510400194623524, "grad_norm": 1.1838546809098474, "kl": 0.0341796875, "learning_rate": 9.985184170862439e-07, "loss": 0.0014, "reward": 1.8751509189605713, "reward_std": 0.12401609122753143, "rewards/accuracy_reward": 0.7189009189605713, "rewards/format_reward": 1.0, "step": 806, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 416.984375, "epoch": 0.024540810120423306, "grad_norm": 0.7113854330107958, "kl": 0.025390625, "learning_rate": 9.985147402435465e-07, "loss": 0.001, "reward": 1.860893964767456, "reward_std": 0.10788382589817047, "rewards/accuracy_reward": 0.7171440124511719, "rewards/format_reward": 1.0, "step": 807, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 379.625, "epoch": 0.024571220046223086, "grad_norm": 0.8621736558472874, "kl": 0.0291748046875, "learning_rate": 9.985110588508795e-07, "loss": 0.0012, "reward": 1.6844508647918701, "reward_std": 0.022079434245824814, "rewards/accuracy_reward": 0.5594507455825806, "rewards/format_reward": 1.0, "step": 808, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 374.359375, "epoch": 0.02460162997202287, "grad_norm": 0.7071942765346917, "kl": 0.0286865234375, "learning_rate": 9.98507372908277e-07, "loss": 0.0011, "reward": 1.803125023841858, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 809, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 378.078125, "epoch": 0.02463203989782265, "grad_norm": 0.753317685037029, "kl": 0.031494140625, "learning_rate": 9.985036824157721e-07, "loss": 0.0013, "reward": 1.9469069242477417, "reward_std": 0.0033762247767299414, "rewards/accuracy_reward": 0.771906852722168, "rewards/format_reward": 1.0, "step": 810, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 408.3125, "epoch": 0.02466244982362243, "grad_norm": 1.3133526037882994, "kl": 0.025390625, "learning_rate": 9.98499987373399e-07, "loss": 0.001, "reward": 1.4260823726654053, "reward_std": 0.2575654983520508, "rewards/accuracy_reward": 0.357332319021225, "rewards/format_reward": 1.0, "step": 811, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 383.046875, "epoch": 0.024692859749422213, "grad_norm": 0.9405671198543698, "kl": 0.026611328125, "learning_rate": 9.984962877811912e-07, "loss": 0.0011, "reward": 1.973444938659668, "reward_std": 0.0744757130742073, "rewards/accuracy_reward": 0.8078199028968811, "rewards/format_reward": 1.0, "step": 812, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 362.8125, "epoch": 0.024723269675221992, "grad_norm": 1.3366797214305564, "kl": 0.034912109375, "learning_rate": 9.984925836391824e-07, "loss": 0.0014, "reward": 1.5078840255737305, "reward_std": 0.09823894500732422, "rewards/accuracy_reward": 0.42350900173187256, "rewards/format_reward": 1.0, "step": 813, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 385.515625, "epoch": 0.024753679601021775, "grad_norm": 11.787372693678307, "kl": 0.032470703125, "learning_rate": 9.984888749474065e-07, "loss": 0.0013, "reward": 1.6988606452941895, "reward_std": 0.07482944428920746, "rewards/accuracy_reward": 0.5332356095314026, "rewards/format_reward": 1.0, "step": 814, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 391.84375, "epoch": 0.024784089526821554, "grad_norm": 1.6503160923401785, "kl": 0.0301513671875, "learning_rate": 9.984851617058973e-07, "loss": 0.0012, "reward": 1.8883610963821411, "reward_std": 0.14864513278007507, "rewards/accuracy_reward": 0.7321109771728516, "rewards/format_reward": 1.0, "step": 815, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 380.890625, "epoch": 0.024814499452621337, "grad_norm": 1.6862114208862402, "kl": 0.0234375, "learning_rate": 9.984814439146888e-07, "loss": 0.0009, "reward": 1.6855181455612183, "reward_std": 0.06899987906217575, "rewards/accuracy_reward": 0.5636430978775024, "rewards/format_reward": 1.0, "step": 816, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 357.9375, "epoch": 0.024844909378421116, "grad_norm": 1.2532849972569562, "kl": 0.027587890625, "learning_rate": 9.984777215738149e-07, "loss": 0.0011, "reward": 1.9966022968292236, "reward_std": 0.17398184537887573, "rewards/accuracy_reward": 0.8216022253036499, "rewards/format_reward": 1.0, "step": 817, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 369.25, "epoch": 0.0248753193042209, "grad_norm": 1.242709080230352, "kl": 0.0281982421875, "learning_rate": 9.984739946833095e-07, "loss": 0.0011, "reward": 1.7294418811798096, "reward_std": 0.10268831253051758, "rewards/accuracy_reward": 0.57319176197052, "rewards/format_reward": 1.0, "step": 818, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 404.46875, "epoch": 0.024905729230020678, "grad_norm": 4.666140804016954, "kl": 0.0252685546875, "learning_rate": 9.984702632432065e-07, "loss": 0.001, "reward": 1.5115324258804321, "reward_std": 0.18961066007614136, "rewards/accuracy_reward": 0.42715737223625183, "rewards/format_reward": 1.0, "step": 819, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 354.65625, "epoch": 0.02493613915582046, "grad_norm": 1.20173530491368, "kl": 0.03369140625, "learning_rate": 9.984665272535401e-07, "loss": 0.0013, "reward": 1.7843750715255737, "reward_std": 0.20894074440002441, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 820, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 373.640625, "epoch": 0.02496654908162024, "grad_norm": 2.175721887242528, "kl": 0.030517578125, "learning_rate": 9.984627867143448e-07, "loss": 0.0012, "reward": 1.6507654190063477, "reward_std": 0.006912535987794399, "rewards/accuracy_reward": 0.5257654190063477, "rewards/format_reward": 1.0, "step": 821, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 378.921875, "epoch": 0.024996959007420022, "grad_norm": 1.1819830292655331, "kl": 0.029052734375, "learning_rate": 9.98459041625654e-07, "loss": 0.0012, "reward": 1.6184587478637695, "reward_std": 0.07483232021331787, "rewards/accuracy_reward": 0.4997085928916931, "rewards/format_reward": 1.0, "step": 822, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.625, "completion_length": 385.234375, "epoch": 0.0250273689332198, "grad_norm": 0.5890580778114546, "kl": 0.0257568359375, "learning_rate": 9.984552919875023e-07, "loss": 0.001, "reward": 1.350223183631897, "reward_std": 0.0022408082149922848, "rewards/accuracy_reward": 0.2752231955528259, "rewards/format_reward": 1.0, "step": 823, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.3125, "epoch": 0.025057778859019584, "grad_norm": 1.0626272463859665, "kl": 0.03173828125, "learning_rate": 9.984515377999238e-07, "loss": 0.0013, "reward": 1.7680878639221191, "reward_std": 0.21717840433120728, "rewards/accuracy_reward": 0.6399627923965454, "rewards/format_reward": 1.0, "step": 824, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 372.359375, "epoch": 0.025088188784819367, "grad_norm": 0.860881227803636, "kl": 0.034423828125, "learning_rate": 9.984477790629528e-07, "loss": 0.0014, "reward": 2.045442581176758, "reward_std": 0.0174080953001976, "rewards/accuracy_reward": 0.848567545413971, "rewards/format_reward": 1.0, "step": 825, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 357.078125, "epoch": 0.025118598710619146, "grad_norm": 1.027399425939752, "kl": 0.0281982421875, "learning_rate": 9.984440157766237e-07, "loss": 0.0011, "reward": 1.892415165901184, "reward_std": 0.16436290740966797, "rewards/accuracy_reward": 0.742415189743042, "rewards/format_reward": 1.0, "step": 826, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 371.8125, "epoch": 0.02514900863641893, "grad_norm": 0.6642525007865296, "kl": 0.0284423828125, "learning_rate": 9.984402479409706e-07, "loss": 0.0011, "reward": 1.9496476650238037, "reward_std": 0.008385919965803623, "rewards/accuracy_reward": 0.7808976173400879, "rewards/format_reward": 1.0, "step": 827, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 418.21875, "epoch": 0.025179418562218708, "grad_norm": 0.7685288268290463, "kl": 0.021728515625, "learning_rate": 9.984364755560283e-07, "loss": 0.0009, "reward": 1.576933741569519, "reward_std": 0.05900060757994652, "rewards/accuracy_reward": 0.4644337594509125, "rewards/format_reward": 1.0, "step": 828, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.546875, "epoch": 0.02520982848801849, "grad_norm": 0.8542773475084886, "kl": 0.0291748046875, "learning_rate": 9.984326986218307e-07, "loss": 0.0012, "reward": 1.8766294717788696, "reward_std": 0.11274223029613495, "rewards/accuracy_reward": 0.7297544479370117, "rewards/format_reward": 1.0, "step": 829, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 382.609375, "epoch": 0.02524023841381827, "grad_norm": 0.6556835507549637, "kl": 0.0284423828125, "learning_rate": 9.984289171384128e-07, "loss": 0.0011, "reward": 1.78634512424469, "reward_std": 0.10739859938621521, "rewards/accuracy_reward": 0.6582200527191162, "rewards/format_reward": 0.984375, "step": 830, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.015625, "epoch": 0.025270648339618052, "grad_norm": 2.0655646454483185, "kl": 0.023681640625, "learning_rate": 9.984251311058089e-07, "loss": 0.0009, "reward": 1.7772659063339233, "reward_std": 0.08064363896846771, "rewards/accuracy_reward": 0.6335158348083496, "rewards/format_reward": 1.0, "step": 831, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 365.46875, "epoch": 0.02530105826541783, "grad_norm": 2.027330325770631, "kl": 0.028564453125, "learning_rate": 9.984213405240533e-07, "loss": 0.0011, "reward": 1.849196434020996, "reward_std": 0.08454695343971252, "rewards/accuracy_reward": 0.699196457862854, "rewards/format_reward": 1.0, "step": 832, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.21875, "epoch": 0.025331468191217614, "grad_norm": 1.4262445633930974, "kl": 0.0281982421875, "learning_rate": 9.98417545393181e-07, "loss": 0.0011, "reward": 1.7781119346618652, "reward_std": 0.17312592267990112, "rewards/accuracy_reward": 0.6374868750572205, "rewards/format_reward": 1.0, "step": 833, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 364.625, "epoch": 0.025361878117017393, "grad_norm": 3.5321082833203796, "kl": 0.0269775390625, "learning_rate": 9.984137457132266e-07, "loss": 0.0011, "reward": 1.9796721935272217, "reward_std": 0.27583861351013184, "rewards/accuracy_reward": 0.8171721696853638, "rewards/format_reward": 1.0, "step": 834, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.375, "epoch": 0.025392288042817176, "grad_norm": 3.262304699600032, "kl": 0.031494140625, "learning_rate": 9.984099414842244e-07, "loss": 0.0013, "reward": 1.8777580261230469, "reward_std": 0.0923653244972229, "rewards/accuracy_reward": 0.693382978439331, "rewards/format_reward": 1.0, "step": 835, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 371.15625, "epoch": 0.025422697968616955, "grad_norm": 0.8056051442761981, "kl": 0.02587890625, "learning_rate": 9.984061327062093e-07, "loss": 0.001, "reward": 1.77370023727417, "reward_std": 0.01660957932472229, "rewards/accuracy_reward": 0.6299501657485962, "rewards/format_reward": 1.0, "step": 836, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 406.03125, "epoch": 0.025453107894416738, "grad_norm": 1.111456845465687, "kl": 0.0250244140625, "learning_rate": 9.984023193792163e-07, "loss": 0.001, "reward": 1.9238277673721313, "reward_std": 0.1628345102071762, "rewards/accuracy_reward": 0.7613277435302734, "rewards/format_reward": 1.0, "step": 837, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 398.84375, "epoch": 0.02548351782021652, "grad_norm": 1.367743610546074, "kl": 0.023193359375, "learning_rate": 9.983985015032799e-07, "loss": 0.0009, "reward": 1.8928800821304321, "reward_std": 0.12182924151420593, "rewards/accuracy_reward": 0.7366300821304321, "rewards/format_reward": 1.0, "step": 838, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.71875, "epoch": 0.0255139277460163, "grad_norm": 1.0246144840746412, "kl": 0.0322265625, "learning_rate": 9.983946790784352e-07, "loss": 0.0013, "reward": 1.9377695322036743, "reward_std": 0.15529388189315796, "rewards/accuracy_reward": 0.7721444964408875, "rewards/format_reward": 1.0, "step": 839, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 394.296875, "epoch": 0.025544337671816082, "grad_norm": 1.2972349728565007, "kl": 0.026123046875, "learning_rate": 9.983908521047169e-07, "loss": 0.001, "reward": 1.5154740810394287, "reward_std": 0.1842728555202484, "rewards/accuracy_reward": 0.45297402143478394, "rewards/format_reward": 0.984375, "step": 840, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.140625, "epoch": 0.02557474759761586, "grad_norm": 1.4475498402613012, "kl": 0.028564453125, "learning_rate": 9.9838702058216e-07, "loss": 0.0011, "reward": 1.7000000476837158, "reward_std": 0.21167466044425964, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 841, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.125, "epoch": 0.025605157523415644, "grad_norm": 0.8377675227701067, "kl": 0.02490234375, "learning_rate": 9.983831845107994e-07, "loss": 0.001, "reward": 1.9406315088272095, "reward_std": 0.14284569025039673, "rewards/accuracy_reward": 0.7625064849853516, "rewards/format_reward": 1.0, "step": 842, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 409.328125, "epoch": 0.025635567449215423, "grad_norm": 1.265027170882157, "kl": 0.02587890625, "learning_rate": 9.983793438906701e-07, "loss": 0.001, "reward": 1.6790452003479004, "reward_std": 0.12033594399690628, "rewards/accuracy_reward": 0.5352951884269714, "rewards/format_reward": 1.0, "step": 843, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 365.953125, "epoch": 0.025665977375015206, "grad_norm": 1.3220933558296946, "kl": 0.0306396484375, "learning_rate": 9.983754987218077e-07, "loss": 0.0012, "reward": 1.543589472770691, "reward_std": 0.13156212866306305, "rewards/accuracy_reward": 0.45608940720558167, "rewards/format_reward": 1.0, "step": 844, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 384.328125, "epoch": 0.025696387300814985, "grad_norm": 1.298653333589377, "kl": 0.027099609375, "learning_rate": 9.983716490042464e-07, "loss": 0.0011, "reward": 1.7942698001861572, "reward_std": 0.08681254088878632, "rewards/accuracy_reward": 0.6036449074745178, "rewards/format_reward": 1.0, "step": 845, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 385.90625, "epoch": 0.025726797226614768, "grad_norm": 0.41777919201746394, "kl": 0.0230712890625, "learning_rate": 9.983677947380218e-07, "loss": 0.0009, "reward": 1.9500000476837158, "reward_std": 0.08920513093471527, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 846, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 409.40625, "epoch": 0.025757207152414547, "grad_norm": 1.7224468249949592, "kl": 0.024169921875, "learning_rate": 9.983639359231691e-07, "loss": 0.001, "reward": 1.5336899757385254, "reward_std": 0.32330387830734253, "rewards/accuracy_reward": 0.45243996381759644, "rewards/format_reward": 0.984375, "step": 847, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 382.78125, "epoch": 0.02578761707821433, "grad_norm": 2.0676796538377635, "kl": 0.0216064453125, "learning_rate": 9.983600725597234e-07, "loss": 0.0009, "reward": 1.8787537813186646, "reward_std": 0.216922789812088, "rewards/accuracy_reward": 0.7131287455558777, "rewards/format_reward": 1.0, "step": 848, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 369.625, "epoch": 0.02581802700401411, "grad_norm": 1.5231400523181586, "kl": 0.02734375, "learning_rate": 9.983562046477202e-07, "loss": 0.0011, "reward": 1.9757616519927979, "reward_std": 0.02236121892929077, "rewards/accuracy_reward": 0.8070114850997925, "rewards/format_reward": 1.0, "step": 849, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 353.671875, "epoch": 0.02584843692981389, "grad_norm": 1.1869500179620096, "kl": 0.03125, "learning_rate": 9.983523321871944e-07, "loss": 0.0013, "reward": 1.8624999523162842, "reward_std": 0.14150117337703705, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 850, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 390.125, "epoch": 0.02587884685561367, "grad_norm": 1.929123493203054, "kl": 0.030029296875, "learning_rate": 9.983484551781818e-07, "loss": 0.0012, "reward": 1.647362470626831, "reward_std": 0.17034035921096802, "rewards/accuracy_reward": 0.5286123752593994, "rewards/format_reward": 1.0, "step": 851, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 380.640625, "epoch": 0.025909256781413453, "grad_norm": 7.419320574363224, "kl": 0.02099609375, "learning_rate": 9.983445736207172e-07, "loss": 0.0008, "reward": 1.5718750953674316, "reward_std": 0.24147427082061768, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 852, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.59375, "epoch": 0.025939666707213236, "grad_norm": 1.1592747817606803, "kl": 0.0294189453125, "learning_rate": 9.983406875148367e-07, "loss": 0.0012, "reward": 1.7848021984100342, "reward_std": 0.10883790254592896, "rewards/accuracy_reward": 0.6348022222518921, "rewards/format_reward": 1.0, "step": 853, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 406.015625, "epoch": 0.025970076633013015, "grad_norm": 1.5065848739738974, "kl": 0.0279541015625, "learning_rate": 9.983367968605754e-07, "loss": 0.0011, "reward": 1.7275400161743164, "reward_std": 0.2939084470272064, "rewards/accuracy_reward": 0.6025399565696716, "rewards/format_reward": 1.0, "step": 854, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.625, "epoch": 0.026000486558812798, "grad_norm": 2.5496701802517463, "kl": 0.026611328125, "learning_rate": 9.983329016579687e-07, "loss": 0.0011, "reward": 1.9937500953674316, "reward_std": 0.23289713263511658, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 855, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.359375, "epoch": 0.026030896484612577, "grad_norm": 2.2007921262656795, "kl": 0.0257568359375, "learning_rate": 9.983290019070525e-07, "loss": 0.001, "reward": 1.7394740581512451, "reward_std": 0.1827305555343628, "rewards/accuracy_reward": 0.6144740581512451, "rewards/format_reward": 1.0, "step": 856, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.59375, "epoch": 0.02606130641041236, "grad_norm": 1.4893904361420967, "kl": 0.0274658203125, "learning_rate": 9.98325097607862e-07, "loss": 0.0011, "reward": 2.02575945854187, "reward_std": 0.09400910139083862, "rewards/accuracy_reward": 0.8507594466209412, "rewards/format_reward": 1.0, "step": 857, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.21875, "epoch": 0.02609171633621214, "grad_norm": 1.1138096367871577, "kl": 0.0281982421875, "learning_rate": 9.983211887604332e-07, "loss": 0.0011, "reward": 1.7625000476837158, "reward_std": 0.2082977592945099, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 858, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 368.703125, "epoch": 0.02612212626201192, "grad_norm": 1.3294540815621967, "kl": 0.0294189453125, "learning_rate": 9.983172753648018e-07, "loss": 0.0012, "reward": 1.6704678535461426, "reward_std": 0.18212249875068665, "rewards/accuracy_reward": 0.5579677820205688, "rewards/format_reward": 1.0, "step": 859, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 379.03125, "epoch": 0.0261525361878117, "grad_norm": 1.2317845208704672, "kl": 0.029052734375, "learning_rate": 9.98313357421003e-07, "loss": 0.0012, "reward": 1.6934876441955566, "reward_std": 0.2625080347061157, "rewards/accuracy_reward": 0.5559875965118408, "rewards/format_reward": 1.0, "step": 860, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 360.75, "epoch": 0.026182946113611483, "grad_norm": 0.7743985481564009, "kl": 0.031982421875, "learning_rate": 9.98309434929073e-07, "loss": 0.0013, "reward": 1.999851107597351, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.8311011791229248, "rewards/format_reward": 1.0, "step": 861, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 364.125, "epoch": 0.026213356039411263, "grad_norm": 2.4909999872254844, "kl": 0.034423828125, "learning_rate": 9.983055078890475e-07, "loss": 0.0014, "reward": 1.85933518409729, "reward_std": 0.12884247303009033, "rewards/accuracy_reward": 0.7093351483345032, "rewards/format_reward": 1.0, "step": 862, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 437.5625, "epoch": 0.026243765965211045, "grad_norm": 0.8997173943274382, "kl": 0.0277099609375, "learning_rate": 9.983015763009623e-07, "loss": 0.0011, "reward": 1.7145646810531616, "reward_std": 0.20130015909671783, "rewards/accuracy_reward": 0.6583147048950195, "rewards/format_reward": 0.9375, "step": 863, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 365.71875, "epoch": 0.026274175891010824, "grad_norm": 1.2334851397104887, "kl": 0.02978515625, "learning_rate": 9.982976401648533e-07, "loss": 0.0012, "reward": 1.8901915550231934, "reward_std": 0.037181075662374496, "rewards/accuracy_reward": 0.7276915311813354, "rewards/format_reward": 1.0, "step": 864, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 342.625, "epoch": 0.026304585816810607, "grad_norm": 1.9919030755272673, "kl": 0.027587890625, "learning_rate": 9.982936994807566e-07, "loss": 0.0011, "reward": 1.853270411491394, "reward_std": 0.08195285499095917, "rewards/accuracy_reward": 0.6907703876495361, "rewards/format_reward": 1.0, "step": 865, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 374.171875, "epoch": 0.026334995742610386, "grad_norm": 1.2138233272698584, "kl": 0.02392578125, "learning_rate": 9.982897542487079e-07, "loss": 0.001, "reward": 2.02734375, "reward_std": 0.19783607125282288, "rewards/accuracy_reward": 0.8648437261581421, "rewards/format_reward": 1.0, "step": 866, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.25, "epoch": 0.02636540566841017, "grad_norm": 1.5505146892100785, "kl": 0.0322265625, "learning_rate": 9.982858044687432e-07, "loss": 0.0013, "reward": 2.0061817169189453, "reward_std": 0.15434777736663818, "rewards/accuracy_reward": 0.821806788444519, "rewards/format_reward": 1.0, "step": 867, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.171875, "epoch": 0.02639581559420995, "grad_norm": 0.9624729377131497, "kl": 0.025390625, "learning_rate": 9.982818501408987e-07, "loss": 0.001, "reward": 2.0065112113952637, "reward_std": 0.14402265846729279, "rewards/accuracy_reward": 0.8283862471580505, "rewards/format_reward": 1.0, "step": 868, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 356.4375, "epoch": 0.02642622552000973, "grad_norm": 1.0769681146869727, "kl": 0.03125, "learning_rate": 9.982778912652107e-07, "loss": 0.0013, "reward": 1.9249999523162842, "reward_std": 0.09531004726886749, "rewards/accuracy_reward": 0.7718750238418579, "rewards/format_reward": 1.0, "step": 869, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 397.53125, "epoch": 0.026456635445809513, "grad_norm": 4.884932430551105, "kl": 0.0263671875, "learning_rate": 9.982739278417146e-07, "loss": 0.0011, "reward": 1.9031250476837158, "reward_std": 0.3369966745376587, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 870, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 362.359375, "epoch": 0.026487045371609293, "grad_norm": 0.8320703244626642, "kl": 0.0263671875, "learning_rate": 9.982699598704475e-07, "loss": 0.0011, "reward": 1.873399019241333, "reward_std": 0.00807617325335741, "rewards/accuracy_reward": 0.7265239357948303, "rewards/format_reward": 1.0, "step": 871, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 396.78125, "epoch": 0.026517455297409075, "grad_norm": 1.085670421688828, "kl": 0.02099609375, "learning_rate": 9.982659873514452e-07, "loss": 0.0008, "reward": 1.5830128192901611, "reward_std": 0.08244168758392334, "rewards/accuracy_reward": 0.48926281929016113, "rewards/format_reward": 1.0, "step": 872, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 382.4375, "epoch": 0.026547865223208855, "grad_norm": 0.6720713691095094, "kl": 0.0272216796875, "learning_rate": 9.982620102847438e-07, "loss": 0.0011, "reward": 1.709375023841858, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 873, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 350.609375, "epoch": 0.026578275149008637, "grad_norm": 3.2857768722209326, "kl": 0.036865234375, "learning_rate": 9.982580286703796e-07, "loss": 0.0015, "reward": 1.75, "reward_std": 0.28431224822998047, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 874, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 356.8125, "epoch": 0.026608685074808416, "grad_norm": 1.2195104293218655, "kl": 0.028564453125, "learning_rate": 9.982540425083894e-07, "loss": 0.0011, "reward": 1.9500000476837158, "reward_std": 0.1508890688419342, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 875, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 380.5, "epoch": 0.0266390950006082, "grad_norm": 1.8105289352966827, "kl": 0.02587890625, "learning_rate": 9.98250051798809e-07, "loss": 0.001, "reward": 1.6753591299057007, "reward_std": 0.257985383272171, "rewards/accuracy_reward": 0.5441091060638428, "rewards/format_reward": 1.0, "step": 876, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 360.953125, "epoch": 0.026669504926407978, "grad_norm": 0.6975135252933212, "kl": 0.030029296875, "learning_rate": 9.982460565416753e-07, "loss": 0.0012, "reward": 1.725000023841858, "reward_std": 0.11225074529647827, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 877, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 343.71875, "epoch": 0.02669991485220776, "grad_norm": 1.0205203035006727, "kl": 0.02978515625, "learning_rate": 9.982420567370245e-07, "loss": 0.0012, "reward": 2.0251989364624023, "reward_std": 0.12982305884361267, "rewards/accuracy_reward": 0.8564489483833313, "rewards/format_reward": 1.0, "step": 878, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 354.265625, "epoch": 0.02673032477800754, "grad_norm": 1.2860595347360626, "kl": 0.02783203125, "learning_rate": 9.98238052384893e-07, "loss": 0.0011, "reward": 1.7984168529510498, "reward_std": 0.24946370720863342, "rewards/accuracy_reward": 0.6609169244766235, "rewards/format_reward": 1.0, "step": 879, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.765625, "epoch": 0.026760734703807323, "grad_norm": 0.9075033343656573, "kl": 0.021728515625, "learning_rate": 9.982340434853177e-07, "loss": 0.0009, "reward": 2.022137403488159, "reward_std": 0.1549217402935028, "rewards/accuracy_reward": 0.8315123319625854, "rewards/format_reward": 1.0, "step": 880, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.59375, "epoch": 0.026791144629607105, "grad_norm": 3.393599770983227, "kl": 0.0264892578125, "learning_rate": 9.982300300383347e-07, "loss": 0.0011, "reward": 1.8451385498046875, "reward_std": 0.09243814647197723, "rewards/accuracy_reward": 0.688888430595398, "rewards/format_reward": 1.0, "step": 881, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 352.015625, "epoch": 0.026821554555406885, "grad_norm": 1.2437494482901101, "kl": 0.027099609375, "learning_rate": 9.98226012043981e-07, "loss": 0.0011, "reward": 1.821874976158142, "reward_std": 0.22308871150016785, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 882, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 354.296875, "epoch": 0.026851964481206667, "grad_norm": 3.117784740205964, "kl": 0.0262451171875, "learning_rate": 9.982219895022934e-07, "loss": 0.001, "reward": 1.6497461795806885, "reward_std": 0.2257424294948578, "rewards/accuracy_reward": 0.5341210961341858, "rewards/format_reward": 1.0, "step": 883, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.96875, "epoch": 0.026882374407006446, "grad_norm": 0.9489396380701225, "kl": 0.021484375, "learning_rate": 9.982179624133084e-07, "loss": 0.0009, "reward": 1.8611946105957031, "reward_std": 0.037532929331064224, "rewards/accuracy_reward": 0.6924446821212769, "rewards/format_reward": 1.0, "step": 884, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.046875, "epoch": 0.02691278433280623, "grad_norm": 1.3298126166294117, "kl": 0.030029296875, "learning_rate": 9.982139307770627e-07, "loss": 0.0012, "reward": 1.7973594665527344, "reward_std": 0.14469589293003082, "rewards/accuracy_reward": 0.6536096334457397, "rewards/format_reward": 1.0, "step": 885, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 375.515625, "epoch": 0.02694319425860601, "grad_norm": 0.926950321876323, "kl": 0.0277099609375, "learning_rate": 9.98209894593593e-07, "loss": 0.0011, "reward": 1.623437523841858, "reward_std": 0.0865686684846878, "rewards/accuracy_reward": 0.5359374284744263, "rewards/format_reward": 1.0, "step": 886, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 373.75, "epoch": 0.02697360418440579, "grad_norm": 1.4128885644530433, "kl": 0.0286865234375, "learning_rate": 9.982058538629366e-07, "loss": 0.0011, "reward": 1.5056250095367432, "reward_std": 0.10134813189506531, "rewards/accuracy_reward": 0.4274999797344208, "rewards/format_reward": 1.0, "step": 887, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 363.640625, "epoch": 0.02700401411020557, "grad_norm": 1.7655377487403774, "kl": 0.034423828125, "learning_rate": 9.982018085851297e-07, "loss": 0.0014, "reward": 1.578125, "reward_std": 0.2865731120109558, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 888, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 372.578125, "epoch": 0.027034424036005353, "grad_norm": 2.6417907518264063, "kl": 0.027587890625, "learning_rate": 9.9819775876021e-07, "loss": 0.0011, "reward": 2.0992777347564697, "reward_std": 0.012263868935406208, "rewards/accuracy_reward": 0.9024028182029724, "rewards/format_reward": 1.0, "step": 889, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 389.53125, "epoch": 0.027064833961805132, "grad_norm": 4.688509124322886, "kl": 0.0264892578125, "learning_rate": 9.98193704388214e-07, "loss": 0.0011, "reward": 1.7051362991333008, "reward_std": 0.14113397896289825, "rewards/accuracy_reward": 0.5863862037658691, "rewards/format_reward": 1.0, "step": 890, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 375.671875, "epoch": 0.027095243887604915, "grad_norm": 0.6634671932252996, "kl": 0.03076171875, "learning_rate": 9.981896454691786e-07, "loss": 0.0012, "reward": 1.8781250715255737, "reward_std": 0.14495554566383362, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 891, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 381.1875, "epoch": 0.027125653813404694, "grad_norm": 0.6385036944999951, "kl": 0.03173828125, "learning_rate": 9.98185582003141e-07, "loss": 0.0013, "reward": 1.7580617666244507, "reward_std": 0.07697310298681259, "rewards/accuracy_reward": 0.623686671257019, "rewards/format_reward": 1.0, "step": 892, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 390.84375, "epoch": 0.027156063739204477, "grad_norm": 0.5334310474622267, "kl": 0.0302734375, "learning_rate": 9.981815139901383e-07, "loss": 0.0012, "reward": 1.5406250953674316, "reward_std": 0.14316077530384064, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 893, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 380.484375, "epoch": 0.027186473665004256, "grad_norm": 0.8584856914674049, "kl": 0.030517578125, "learning_rate": 9.981774414302079e-07, "loss": 0.0012, "reward": 1.8052207231521606, "reward_std": 0.0214085653424263, "rewards/accuracy_reward": 0.6677207946777344, "rewards/format_reward": 1.0, "step": 894, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 373.8125, "epoch": 0.02721688359080404, "grad_norm": 1.2925027242072804, "kl": 0.0272216796875, "learning_rate": 9.981733643233866e-07, "loss": 0.0011, "reward": 1.7487808465957642, "reward_std": 0.16004620492458344, "rewards/accuracy_reward": 0.6081558465957642, "rewards/format_reward": 1.0, "step": 895, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 398.40625, "epoch": 0.02724729351660382, "grad_norm": 2.043120244435532, "kl": 0.028564453125, "learning_rate": 9.981692826697115e-07, "loss": 0.0011, "reward": 1.5589139461517334, "reward_std": 0.3679611384868622, "rewards/accuracy_reward": 0.4651640057563782, "rewards/format_reward": 0.984375, "step": 896, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 366.71875, "epoch": 0.0272777034424036, "grad_norm": 1.1291064780764344, "kl": 0.03173828125, "learning_rate": 9.981651964692202e-07, "loss": 0.0013, "reward": 1.9500001668930054, "reward_std": 0.24376732110977173, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 897, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 388.765625, "epoch": 0.027308113368203383, "grad_norm": 1.3750506665939297, "kl": 0.0291748046875, "learning_rate": 9.9816110572195e-07, "loss": 0.0012, "reward": 1.89055335521698, "reward_std": 0.1391996592283249, "rewards/accuracy_reward": 0.7436783909797668, "rewards/format_reward": 1.0, "step": 898, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 377.65625, "epoch": 0.027338523294003162, "grad_norm": 1.1864955791058833, "kl": 0.031494140625, "learning_rate": 9.98157010427938e-07, "loss": 0.0013, "reward": 1.6825969219207764, "reward_std": 0.10635487735271454, "rewards/accuracy_reward": 0.5513469576835632, "rewards/format_reward": 1.0, "step": 899, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 388.859375, "epoch": 0.027368933219802945, "grad_norm": 0.922919824150924, "kl": 0.029052734375, "learning_rate": 9.981529105872216e-07, "loss": 0.0012, "reward": 1.6239583492279053, "reward_std": 0.10447327792644501, "rewards/accuracy_reward": 0.5177083015441895, "rewards/format_reward": 1.0, "step": 900, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 358.328125, "epoch": 0.027399343145602724, "grad_norm": 1.654188308627461, "kl": 0.0306396484375, "learning_rate": 9.981488061998382e-07, "loss": 0.0012, "reward": 1.4632987976074219, "reward_std": 0.19796308875083923, "rewards/accuracy_reward": 0.3914237916469574, "rewards/format_reward": 1.0, "step": 901, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 376.703125, "epoch": 0.027429753071402507, "grad_norm": 1.0854570867395315, "kl": 0.0311279296875, "learning_rate": 9.981446972658257e-07, "loss": 0.0012, "reward": 1.6490330696105957, "reward_std": 0.055693116039037704, "rewards/accuracy_reward": 0.5271580219268799, "rewards/format_reward": 1.0, "step": 902, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 370.296875, "epoch": 0.027460162997202286, "grad_norm": 1.127809195168914, "kl": 0.0301513671875, "learning_rate": 9.981405837852209e-07, "loss": 0.0012, "reward": 1.7267836332321167, "reward_std": 0.04569198936223984, "rewards/accuracy_reward": 0.5924085974693298, "rewards/format_reward": 1.0, "step": 903, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 376.296875, "epoch": 0.02749057292300207, "grad_norm": 0.9358964274327413, "kl": 0.034423828125, "learning_rate": 9.98136465758062e-07, "loss": 0.0014, "reward": 1.7383935451507568, "reward_std": 0.06318999826908112, "rewards/accuracy_reward": 0.6133936047554016, "rewards/format_reward": 1.0, "step": 904, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 354.96875, "epoch": 0.027520982848801848, "grad_norm": 0.8511361597802979, "kl": 0.0341796875, "learning_rate": 9.98132343184386e-07, "loss": 0.0014, "reward": 2.0176825523376465, "reward_std": 0.02414887025952339, "rewards/accuracy_reward": 0.8489325046539307, "rewards/format_reward": 1.0, "step": 905, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 383.109375, "epoch": 0.02755139277460163, "grad_norm": 0.9745485135989337, "kl": 0.0308837890625, "learning_rate": 9.98128216064231e-07, "loss": 0.0012, "reward": 1.8669123649597168, "reward_std": 0.10205531865358353, "rewards/accuracy_reward": 0.7106623649597168, "rewards/format_reward": 1.0, "step": 906, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 370.25, "epoch": 0.02758180270040141, "grad_norm": 1.895851937140255, "kl": 0.030517578125, "learning_rate": 9.981240843976345e-07, "loss": 0.0012, "reward": 1.9984097480773926, "reward_std": 0.1495390683412552, "rewards/accuracy_reward": 0.8171596527099609, "rewards/format_reward": 1.0, "step": 907, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 358.015625, "epoch": 0.027612212626201192, "grad_norm": 1.6098516042847661, "kl": 0.0267333984375, "learning_rate": 9.98119948184634e-07, "loss": 0.0011, "reward": 1.914100170135498, "reward_std": 0.23505143821239471, "rewards/accuracy_reward": 0.7641000747680664, "rewards/format_reward": 1.0, "step": 908, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.796875, "epoch": 0.027642622552000975, "grad_norm": 1.033574192365843, "kl": 0.0281982421875, "learning_rate": 9.981158074252678e-07, "loss": 0.0011, "reward": 1.990525245666504, "reward_std": 0.06795720010995865, "rewards/accuracy_reward": 0.8217751979827881, "rewards/format_reward": 1.0, "step": 909, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 416.203125, "epoch": 0.027673032477800754, "grad_norm": 0.9914155496887159, "kl": 0.023193359375, "learning_rate": 9.98111662119573e-07, "loss": 0.0009, "reward": 1.9707400798797607, "reward_std": 0.09584079682826996, "rewards/accuracy_reward": 0.8019900918006897, "rewards/format_reward": 1.0, "step": 910, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 377.703125, "epoch": 0.027703442403600537, "grad_norm": 1.5349799012658385, "kl": 0.025390625, "learning_rate": 9.98107512267588e-07, "loss": 0.001, "reward": 1.6978265047073364, "reward_std": 0.1508125364780426, "rewards/accuracy_reward": 0.5665765404701233, "rewards/format_reward": 1.0, "step": 911, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 365.4375, "epoch": 0.027733852329400316, "grad_norm": 1.2936510644957224, "kl": 0.0308837890625, "learning_rate": 9.981033578693504e-07, "loss": 0.0012, "reward": 2.0045530796051025, "reward_std": 0.09096268564462662, "rewards/accuracy_reward": 0.8264280557632446, "rewards/format_reward": 1.0, "step": 912, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 356.234375, "epoch": 0.0277642622552001, "grad_norm": 1.3718693769008454, "kl": 0.03466796875, "learning_rate": 9.980991989248982e-07, "loss": 0.0014, "reward": 1.7949542999267578, "reward_std": 0.16163060069084167, "rewards/accuracy_reward": 0.6387042999267578, "rewards/format_reward": 1.0, "step": 913, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.359375, "epoch": 0.027794672180999878, "grad_norm": 3.6198798411398494, "kl": 0.02978515625, "learning_rate": 9.980950354342693e-07, "loss": 0.0012, "reward": 1.7999999523162842, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 914, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.59375, "epoch": 0.02782508210679966, "grad_norm": 0.6991926328229966, "kl": 0.029541015625, "learning_rate": 9.980908673975016e-07, "loss": 0.0012, "reward": 1.882638931274414, "reward_std": 0.08100926131010056, "rewards/accuracy_reward": 0.7326388955116272, "rewards/format_reward": 1.0, "step": 915, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 363.53125, "epoch": 0.02785549203259944, "grad_norm": 0.8133604325907676, "kl": 0.0291748046875, "learning_rate": 9.980866948146333e-07, "loss": 0.0012, "reward": 1.521875023841858, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 916, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.671875, "epoch": 0.027885901958399222, "grad_norm": 1.7215792826521052, "kl": 0.0277099609375, "learning_rate": 9.980825176857026e-07, "loss": 0.0011, "reward": 1.8953166007995605, "reward_std": 0.1273740828037262, "rewards/accuracy_reward": 0.7296916842460632, "rewards/format_reward": 1.0, "step": 917, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 379.578125, "epoch": 0.027916311884199, "grad_norm": 0.9323764979423356, "kl": 0.030517578125, "learning_rate": 9.980783360107474e-07, "loss": 0.0012, "reward": 1.6779088973999023, "reward_std": 0.21271350979804993, "rewards/accuracy_reward": 0.5685338377952576, "rewards/format_reward": 1.0, "step": 918, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 363.5, "epoch": 0.027946721809998784, "grad_norm": 0.5036526044285736, "kl": 0.037109375, "learning_rate": 9.980741497898059e-07, "loss": 0.0015, "reward": 2.0406250953674316, "reward_std": 0.012938733212649822, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 919, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 449.09375, "epoch": 0.027977131735798563, "grad_norm": 1.1181503590706576, "kl": 0.02685546875, "learning_rate": 9.980699590229163e-07, "loss": 0.0011, "reward": 1.7752344608306885, "reward_std": 0.30081695318222046, "rewards/accuracy_reward": 0.6533594727516174, "rewards/format_reward": 1.0, "step": 920, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.703125, "epoch": 0.028007541661598346, "grad_norm": 2.2400200286490155, "kl": 0.0291748046875, "learning_rate": 9.98065763710117e-07, "loss": 0.0012, "reward": 1.8694725036621094, "reward_std": 0.10489445924758911, "rewards/accuracy_reward": 0.6975974440574646, "rewards/format_reward": 1.0, "step": 921, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 383.90625, "epoch": 0.028037951587398125, "grad_norm": 1.449539652320286, "kl": 0.03173828125, "learning_rate": 9.980615638514462e-07, "loss": 0.0013, "reward": 1.7131584882736206, "reward_std": 0.07114633917808533, "rewards/accuracy_reward": 0.5694084167480469, "rewards/format_reward": 1.0, "step": 922, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 358.46875, "epoch": 0.028068361513197908, "grad_norm": 1.0254694345337798, "kl": 0.033447265625, "learning_rate": 9.98057359446942e-07, "loss": 0.0013, "reward": 1.725000023841858, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 923, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 375.5, "epoch": 0.02809877143899769, "grad_norm": 0.9636333571347802, "kl": 0.038330078125, "learning_rate": 9.980531504966435e-07, "loss": 0.0015, "reward": 1.806096076965332, "reward_std": 0.026446864008903503, "rewards/accuracy_reward": 0.6685960292816162, "rewards/format_reward": 1.0, "step": 924, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 387.703125, "epoch": 0.02812918136479747, "grad_norm": 1.2327195958197117, "kl": 0.02978515625, "learning_rate": 9.980489370005882e-07, "loss": 0.0012, "reward": 1.584425449371338, "reward_std": 0.12056782841682434, "rewards/accuracy_reward": 0.4906753897666931, "rewards/format_reward": 1.0, "step": 925, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 420.625, "epoch": 0.028159591290597252, "grad_norm": 1.7836051233030696, "kl": 0.034912109375, "learning_rate": 9.98044718958815e-07, "loss": 0.0014, "reward": 1.9043868780136108, "reward_std": 0.09988686442375183, "rewards/accuracy_reward": 0.7637618780136108, "rewards/format_reward": 1.0, "step": 926, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.046875, "epoch": 0.02819000121639703, "grad_norm": 2.0495932481021004, "kl": 0.031494140625, "learning_rate": 9.980404963713625e-07, "loss": 0.0013, "reward": 1.7250001430511475, "reward_std": 0.16474655270576477, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 927, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 416.359375, "epoch": 0.028220411142196814, "grad_norm": 0.6958913598805665, "kl": 0.0277099609375, "learning_rate": 9.98036269238269e-07, "loss": 0.0011, "reward": 1.612518310546875, "reward_std": 0.15618416666984558, "rewards/accuracy_reward": 0.518768310546875, "rewards/format_reward": 0.984375, "step": 928, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.359375, "epoch": 0.028250821067996593, "grad_norm": 1.8605528943159149, "kl": 0.029541015625, "learning_rate": 9.980320375595733e-07, "loss": 0.0012, "reward": 2.0207390785217285, "reward_std": 0.040223851799964905, "rewards/accuracy_reward": 0.8394889831542969, "rewards/format_reward": 1.0, "step": 929, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 411.875, "epoch": 0.028281230993796376, "grad_norm": 1.4510236521538862, "kl": 0.0308837890625, "learning_rate": 9.980278013353138e-07, "loss": 0.0012, "reward": 1.917789101600647, "reward_std": 0.14712709188461304, "rewards/accuracy_reward": 0.7896640300750732, "rewards/format_reward": 1.0, "step": 930, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 364.546875, "epoch": 0.028311640919596155, "grad_norm": 0.9839097008232094, "kl": 0.0380859375, "learning_rate": 9.980235605655293e-07, "loss": 0.0015, "reward": 2.121875047683716, "reward_std": 0.13950422406196594, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 931, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 398.515625, "epoch": 0.028342050845395938, "grad_norm": 0.6689879422126677, "kl": 0.029052734375, "learning_rate": 9.980193152502587e-07, "loss": 0.0012, "reward": 1.7489992380142212, "reward_std": 0.055807724595069885, "rewards/accuracy_reward": 0.6271241903305054, "rewards/format_reward": 1.0, "step": 932, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.703125, "epoch": 0.028372460771195717, "grad_norm": 0.8188485686718167, "kl": 0.0299072265625, "learning_rate": 9.980150653895402e-07, "loss": 0.0012, "reward": 2.054932117462158, "reward_std": 0.08002617210149765, "rewards/accuracy_reward": 0.8705570697784424, "rewards/format_reward": 1.0, "step": 933, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 371.21875, "epoch": 0.0284028706969955, "grad_norm": 1.2216375959528853, "kl": 0.0419921875, "learning_rate": 9.980108109834131e-07, "loss": 0.0017, "reward": 2.077817440032959, "reward_std": 0.10074599087238312, "rewards/accuracy_reward": 0.8934422731399536, "rewards/format_reward": 1.0, "step": 934, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 392.28125, "epoch": 0.02843328062279528, "grad_norm": 0.7211319772695124, "kl": 0.0311279296875, "learning_rate": 9.98006552031916e-07, "loss": 0.0012, "reward": 1.7468750476837158, "reward_std": 0.147711843252182, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 935, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 399.125, "epoch": 0.02846369054859506, "grad_norm": 1.3210966850576604, "kl": 0.03369140625, "learning_rate": 9.98002288535088e-07, "loss": 0.0013, "reward": 1.5802547931671143, "reward_std": 0.1403290033340454, "rewards/accuracy_reward": 0.46462976932525635, "rewards/format_reward": 1.0, "step": 936, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 401.671875, "epoch": 0.02849410047439484, "grad_norm": 1.01077858694633, "kl": 0.0255126953125, "learning_rate": 9.979980204929675e-07, "loss": 0.001, "reward": 1.5889317989349365, "reward_std": 0.22281545400619507, "rewards/accuracy_reward": 0.4608067274093628, "rewards/format_reward": 1.0, "step": 937, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 378.515625, "epoch": 0.028524510400194623, "grad_norm": 0.8103053302764397, "kl": 0.0322265625, "learning_rate": 9.97993747905594e-07, "loss": 0.0013, "reward": 1.9020813703536987, "reward_std": 0.011060738936066628, "rewards/accuracy_reward": 0.7270812392234802, "rewards/format_reward": 1.0, "step": 938, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 394.21875, "epoch": 0.028554920325994406, "grad_norm": 0.6608990672443955, "kl": 0.03125, "learning_rate": 9.979894707730062e-07, "loss": 0.0012, "reward": 2.038043737411499, "reward_std": 0.0746963694691658, "rewards/accuracy_reward": 0.8599188327789307, "rewards/format_reward": 1.0, "step": 939, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 390.734375, "epoch": 0.028585330251794185, "grad_norm": 1.912762350397319, "kl": 0.0279541015625, "learning_rate": 9.979851890952433e-07, "loss": 0.0011, "reward": 1.6827574968338013, "reward_std": 0.199858158826828, "rewards/accuracy_reward": 0.5546325445175171, "rewards/format_reward": 1.0, "step": 940, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 372.421875, "epoch": 0.028615740177593968, "grad_norm": 0.48009103444448936, "kl": 0.0400390625, "learning_rate": 9.979809028723442e-07, "loss": 0.0016, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 941, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 408.71875, "epoch": 0.028646150103393747, "grad_norm": 0.7777220778370562, "kl": 0.034912109375, "learning_rate": 9.979766121043482e-07, "loss": 0.0014, "reward": 1.7216309309005737, "reward_std": 0.027476558461785316, "rewards/accuracy_reward": 0.596630871295929, "rewards/format_reward": 1.0, "step": 942, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 404.71875, "epoch": 0.02867656002919353, "grad_norm": 0.9198329151539716, "kl": 0.0294189453125, "learning_rate": 9.979723167912943e-07, "loss": 0.0012, "reward": 1.9346178770065308, "reward_std": 0.07425513863563538, "rewards/accuracy_reward": 0.7752428650856018, "rewards/format_reward": 1.0, "step": 943, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.96875, "epoch": 0.02870696995499331, "grad_norm": 1.2492067383093468, "kl": 0.023681640625, "learning_rate": 9.979680169332218e-07, "loss": 0.0009, "reward": 1.7629592418670654, "reward_std": 0.2209610491991043, "rewards/accuracy_reward": 0.6317091584205627, "rewards/format_reward": 1.0, "step": 944, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 375.53125, "epoch": 0.02873737988079309, "grad_norm": 1.0021986293836167, "kl": 0.034423828125, "learning_rate": 9.9796371253017e-07, "loss": 0.0014, "reward": 1.9547169208526611, "reward_std": 0.10674916207790375, "rewards/accuracy_reward": 0.7890918850898743, "rewards/format_reward": 1.0, "step": 945, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 392.765625, "epoch": 0.02876778980659287, "grad_norm": 0.42457408637622385, "kl": 0.03076171875, "learning_rate": 9.97959403582178e-07, "loss": 0.0012, "reward": 1.7281250953674316, "reward_std": 0.05250425264239311, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 946, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 439.171875, "epoch": 0.028798199732392653, "grad_norm": 0.721668805203199, "kl": 0.0250244140625, "learning_rate": 9.979550900892855e-07, "loss": 0.001, "reward": 1.771875023841858, "reward_std": 0.23347899317741394, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 947, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 405.15625, "epoch": 0.028828609658192433, "grad_norm": 1.2583794360604859, "kl": 0.03173828125, "learning_rate": 9.979507720515315e-07, "loss": 0.0013, "reward": 1.8047316074371338, "reward_std": 0.047892432659864426, "rewards/accuracy_reward": 0.6453566551208496, "rewards/format_reward": 1.0, "step": 948, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 392.703125, "epoch": 0.028859019583992215, "grad_norm": 1.0734142744654365, "kl": 0.0257568359375, "learning_rate": 9.979464494689554e-07, "loss": 0.001, "reward": 1.8413039445877075, "reward_std": 0.09166154265403748, "rewards/accuracy_reward": 0.6913039088249207, "rewards/format_reward": 1.0, "step": 949, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 393.8125, "epoch": 0.028889429509791995, "grad_norm": 1.083258443809071, "kl": 0.02587890625, "learning_rate": 9.979421223415969e-07, "loss": 0.001, "reward": 1.851592779159546, "reward_std": 0.13468001782894135, "rewards/accuracy_reward": 0.6953428387641907, "rewards/format_reward": 1.0, "step": 950, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 388.65625, "epoch": 0.028919839435591777, "grad_norm": 3.9233771180464427, "kl": 0.0291748046875, "learning_rate": 9.979377906694951e-07, "loss": 0.0012, "reward": 1.6732618808746338, "reward_std": 0.037542469799518585, "rewards/accuracy_reward": 0.5420119762420654, "rewards/format_reward": 1.0, "step": 951, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 404.671875, "epoch": 0.02895024936139156, "grad_norm": 0.9794857387291974, "kl": 0.0267333984375, "learning_rate": 9.979334544526902e-07, "loss": 0.0011, "reward": 1.6966025829315186, "reward_std": 0.20698893070220947, "rewards/accuracy_reward": 0.5684775114059448, "rewards/format_reward": 1.0, "step": 952, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.59375, "epoch": 0.02898065928719134, "grad_norm": 1.668871628374678, "kl": 0.0294189453125, "learning_rate": 9.979291136912213e-07, "loss": 0.0012, "reward": 1.8008915185928345, "reward_std": 0.04139992594718933, "rewards/accuracy_reward": 0.610266387462616, "rewards/format_reward": 1.0, "step": 953, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 437.78125, "epoch": 0.02901106921299112, "grad_norm": 1.38145164616484, "kl": 0.02880859375, "learning_rate": 9.97924768385128e-07, "loss": 0.0011, "reward": 1.749072551727295, "reward_std": 0.14466246962547302, "rewards/accuracy_reward": 0.6178224682807922, "rewards/format_reward": 1.0, "step": 954, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 379.484375, "epoch": 0.0290414791387909, "grad_norm": 23.099239772447394, "kl": 0.02783203125, "learning_rate": 9.9792041853445e-07, "loss": 0.0011, "reward": 1.8895816802978516, "reward_std": 0.08888044953346252, "rewards/accuracy_reward": 0.7302066683769226, "rewards/format_reward": 1.0, "step": 955, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 386.34375, "epoch": 0.029071889064590684, "grad_norm": 1.9618639377814826, "kl": 0.033447265625, "learning_rate": 9.979160641392273e-07, "loss": 0.0013, "reward": 1.6772538423538208, "reward_std": 0.12570199370384216, "rewards/accuracy_reward": 0.5491288900375366, "rewards/format_reward": 1.0, "step": 956, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.234375, "epoch": 0.029102298990390463, "grad_norm": 1.1989404253767613, "kl": 0.03369140625, "learning_rate": 9.979117051994991e-07, "loss": 0.0013, "reward": 1.9542853832244873, "reward_std": 0.12954586744308472, "rewards/accuracy_reward": 0.7730352878570557, "rewards/format_reward": 1.0, "step": 957, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 376.234375, "epoch": 0.029132708916190245, "grad_norm": 0.6999490027694096, "kl": 0.0240478515625, "learning_rate": 9.979073417153056e-07, "loss": 0.001, "reward": 1.915624976158142, "reward_std": 0.19945894181728363, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 958, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 362.265625, "epoch": 0.029163118841990025, "grad_norm": 0.7369000499348929, "kl": 0.0267333984375, "learning_rate": 9.979029736866865e-07, "loss": 0.0011, "reward": 2.10657000541687, "reward_std": 0.017150744795799255, "rewards/accuracy_reward": 0.909695029258728, "rewards/format_reward": 1.0, "step": 959, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 427.71875, "epoch": 0.029193528767789807, "grad_norm": 1.2702683362614735, "kl": 0.0203857421875, "learning_rate": 9.97898601113682e-07, "loss": 0.0008, "reward": 1.6314494609832764, "reward_std": 0.2217612862586975, "rewards/accuracy_reward": 0.5814494490623474, "rewards/format_reward": 0.96875, "step": 960, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 362.421875, "epoch": 0.029223938693589586, "grad_norm": 1.3454688418066623, "kl": 0.031005859375, "learning_rate": 9.978942239963313e-07, "loss": 0.0012, "reward": 1.9503661394119263, "reward_std": 0.0223162192851305, "rewards/accuracy_reward": 0.7909910678863525, "rewards/format_reward": 1.0, "step": 961, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.46875, "epoch": 0.02925434861938937, "grad_norm": 1.1394228365817627, "kl": 0.025390625, "learning_rate": 9.97889842334675e-07, "loss": 0.001, "reward": 1.8466341495513916, "reward_std": 0.14077790081501007, "rewards/accuracy_reward": 0.6997590065002441, "rewards/format_reward": 1.0, "step": 962, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 365.21875, "epoch": 0.02928475854518915, "grad_norm": 1.1849044273331357, "kl": 0.0279541015625, "learning_rate": 9.978854561287526e-07, "loss": 0.0011, "reward": 1.8964686393737793, "reward_std": 0.09825307875871658, "rewards/accuracy_reward": 0.7308435440063477, "rewards/format_reward": 1.0, "step": 963, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 372.234375, "epoch": 0.02931516847098893, "grad_norm": 1.4882034685366563, "kl": 0.0264892578125, "learning_rate": 9.978810653786047e-07, "loss": 0.0011, "reward": 2.1031250953674316, "reward_std": 0.1590200662612915, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 964, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 353.125, "epoch": 0.02934557839678871, "grad_norm": 1.1399864272692561, "kl": 0.02880859375, "learning_rate": 9.978766700842708e-07, "loss": 0.0012, "reward": 1.975847840309143, "reward_std": 0.022725991904735565, "rewards/accuracy_reward": 0.8164727687835693, "rewards/format_reward": 1.0, "step": 965, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.5, "epoch": 0.029375988322588493, "grad_norm": 1.3343373972984818, "kl": 0.02685546875, "learning_rate": 9.978722702457912e-07, "loss": 0.0011, "reward": 1.9047565460205078, "reward_std": 0.14533251523971558, "rewards/accuracy_reward": 0.7453814744949341, "rewards/format_reward": 1.0, "step": 966, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.734375, "epoch": 0.029406398248388275, "grad_norm": 0.8460601083173706, "kl": 0.0203857421875, "learning_rate": 9.978678658632062e-07, "loss": 0.0008, "reward": 1.8355995416641235, "reward_std": 0.128595232963562, "rewards/accuracy_reward": 0.6824743747711182, "rewards/format_reward": 1.0, "step": 967, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 372.421875, "epoch": 0.029436808174188055, "grad_norm": 1.475867147831558, "kl": 0.02783203125, "learning_rate": 9.978634569365559e-07, "loss": 0.0011, "reward": 1.8253848552703857, "reward_std": 0.24557790160179138, "rewards/accuracy_reward": 0.6691348552703857, "rewards/format_reward": 1.0, "step": 968, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.796875, "epoch": 0.029467218099987837, "grad_norm": 2.169297660446578, "kl": 0.027587890625, "learning_rate": 9.978590434658806e-07, "loss": 0.0011, "reward": 1.9281532764434814, "reward_std": 0.18303149938583374, "rewards/accuracy_reward": 0.7594032883644104, "rewards/format_reward": 1.0, "step": 969, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 345.953125, "epoch": 0.029497628025787617, "grad_norm": 0.7151006176558011, "kl": 0.035888671875, "learning_rate": 9.978546254512206e-07, "loss": 0.0014, "reward": 1.6122395992279053, "reward_std": 0.017303230240941048, "rewards/accuracy_reward": 0.5091145634651184, "rewards/format_reward": 1.0, "step": 970, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 392.96875, "epoch": 0.0295280379515874, "grad_norm": 1.156622711966878, "kl": 0.023193359375, "learning_rate": 9.97850202892616e-07, "loss": 0.0009, "reward": 1.827118158340454, "reward_std": 0.15230073034763336, "rewards/accuracy_reward": 0.6614930629730225, "rewards/format_reward": 1.0, "step": 971, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.0, "epoch": 0.02955844787738718, "grad_norm": 1.2567777112654557, "kl": 0.025146484375, "learning_rate": 9.978457757901074e-07, "loss": 0.001, "reward": 1.953125, "reward_std": 0.2253442257642746, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 972, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.3125, "epoch": 0.02958885780318696, "grad_norm": 5.286549021007393, "kl": 0.03271484375, "learning_rate": 9.978413441437352e-07, "loss": 0.0013, "reward": 1.8352673053741455, "reward_std": 0.06420054286718369, "rewards/accuracy_reward": 0.6790173053741455, "rewards/format_reward": 1.0, "step": 973, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 366.203125, "epoch": 0.02961926772898674, "grad_norm": 0.7401289635294794, "kl": 0.02880859375, "learning_rate": 9.978369079535397e-07, "loss": 0.0012, "reward": 1.9489858150482178, "reward_std": 0.07714703679084778, "rewards/accuracy_reward": 0.789610743522644, "rewards/format_reward": 1.0, "step": 974, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 364.09375, "epoch": 0.029649677654786523, "grad_norm": 1.1255922200526791, "kl": 0.0267333984375, "learning_rate": 9.978324672195615e-07, "loss": 0.0011, "reward": 2.1031250953674316, "reward_std": 0.190526083111763, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 975, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.484375, "epoch": 0.029680087580586302, "grad_norm": 1.1122777567497524, "kl": 0.03173828125, "learning_rate": 9.97828021941841e-07, "loss": 0.0013, "reward": 1.7939478158950806, "reward_std": 0.17696912586688995, "rewards/accuracy_reward": 0.653322696685791, "rewards/format_reward": 1.0, "step": 976, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 356.796875, "epoch": 0.029710497506386085, "grad_norm": 0.7538102238125617, "kl": 0.0296630859375, "learning_rate": 9.97823572120419e-07, "loss": 0.0012, "reward": 1.975000023841858, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 977, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.34375, "epoch": 0.029740907432185864, "grad_norm": 1.1599605620501257, "kl": 0.03173828125, "learning_rate": 9.97819117755336e-07, "loss": 0.0013, "reward": 1.9171385765075684, "reward_std": 0.09751737117767334, "rewards/accuracy_reward": 0.7640135288238525, "rewards/format_reward": 1.0, "step": 978, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 379.75, "epoch": 0.029771317357985647, "grad_norm": 4.503875807685217, "kl": 0.0294189453125, "learning_rate": 9.978146588466325e-07, "loss": 0.0012, "reward": 1.6928553581237793, "reward_std": 0.09279040992259979, "rewards/accuracy_reward": 0.549105167388916, "rewards/format_reward": 1.0, "step": 979, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 393.25, "epoch": 0.02980172728378543, "grad_norm": 1.8302686911428612, "kl": 0.030029296875, "learning_rate": 9.978101953943493e-07, "loss": 0.0012, "reward": 1.5455598831176758, "reward_std": 0.0946376621723175, "rewards/accuracy_reward": 0.46118488907814026, "rewards/format_reward": 1.0, "step": 980, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 414.140625, "epoch": 0.02983213720958521, "grad_norm": 1.6506077815154478, "kl": 0.028564453125, "learning_rate": 9.978057273985274e-07, "loss": 0.0011, "reward": 1.7312986850738525, "reward_std": 0.1738753467798233, "rewards/accuracy_reward": 0.5969235897064209, "rewards/format_reward": 1.0, "step": 981, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 408.578125, "epoch": 0.02986254713538499, "grad_norm": 1.833667770887371, "kl": 0.028076171875, "learning_rate": 9.978012548592072e-07, "loss": 0.0011, "reward": 1.5394816398620605, "reward_std": 0.18644912540912628, "rewards/accuracy_reward": 0.4238566756248474, "rewards/format_reward": 1.0, "step": 982, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 364.96875, "epoch": 0.02989295706118477, "grad_norm": 1.5160733703126137, "kl": 0.031982421875, "learning_rate": 9.977967777764298e-07, "loss": 0.0013, "reward": 1.6664032936096191, "reward_std": 0.08078866451978683, "rewards/accuracy_reward": 0.5101531744003296, "rewards/format_reward": 1.0, "step": 983, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 360.359375, "epoch": 0.029923366986984553, "grad_norm": 0.44449218073993657, "kl": 0.033203125, "learning_rate": 9.97792296150236e-07, "loss": 0.0013, "reward": 1.9976552724838257, "reward_std": 0.005704321898519993, "rewards/accuracy_reward": 0.822655200958252, "rewards/format_reward": 1.0, "step": 984, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 414.625, "epoch": 0.029953776912784332, "grad_norm": 1.1091560585910236, "kl": 0.0264892578125, "learning_rate": 9.977878099806665e-07, "loss": 0.0011, "reward": 1.70172917842865, "reward_std": 0.1716819703578949, "rewards/accuracy_reward": 0.5673540830612183, "rewards/format_reward": 1.0, "step": 985, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 358.9375, "epoch": 0.029984186838584115, "grad_norm": 0.3466471642938768, "kl": 0.027587890625, "learning_rate": 9.977833192677623e-07, "loss": 0.0011, "reward": 1.9562499523162842, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 986, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 369.8125, "epoch": 0.030014596764383894, "grad_norm": 1.5107527162047532, "kl": 0.0230712890625, "learning_rate": 9.977788240115646e-07, "loss": 0.0009, "reward": 1.8345935344696045, "reward_std": 0.18656717240810394, "rewards/accuracy_reward": 0.6877186298370361, "rewards/format_reward": 1.0, "step": 987, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 369.1875, "epoch": 0.030045006690183677, "grad_norm": 0.29539111343017155, "kl": 0.0311279296875, "learning_rate": 9.977743242121142e-07, "loss": 0.0012, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 988, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 400.765625, "epoch": 0.030075416615983456, "grad_norm": 1.1081499466093632, "kl": 0.0269775390625, "learning_rate": 9.977698198694523e-07, "loss": 0.0011, "reward": 1.6687896251678467, "reward_std": 0.04964016377925873, "rewards/accuracy_reward": 0.5156646370887756, "rewards/format_reward": 1.0, "step": 989, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 365.296875, "epoch": 0.03010582654178324, "grad_norm": 0.6733295323828469, "kl": 0.02978515625, "learning_rate": 9.977653109836201e-07, "loss": 0.0012, "reward": 1.8562500476837158, "reward_std": 0.11763877421617508, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 990, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 390.515625, "epoch": 0.030136236467583018, "grad_norm": 0.6981963351715742, "kl": 0.026123046875, "learning_rate": 9.977607975546588e-07, "loss": 0.001, "reward": 2.081507682800293, "reward_std": 0.12283474206924438, "rewards/accuracy_reward": 0.9002578258514404, "rewards/format_reward": 1.0, "step": 991, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 369.796875, "epoch": 0.0301666463933828, "grad_norm": 1.1929917405349333, "kl": 0.033447265625, "learning_rate": 9.97756279582609e-07, "loss": 0.0013, "reward": 1.9112459421157837, "reward_std": 0.10382983088493347, "rewards/accuracy_reward": 0.75187087059021, "rewards/format_reward": 1.0, "step": 992, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.6875, "epoch": 0.03019705631918258, "grad_norm": 7.433376278489804, "kl": 0.0380859375, "learning_rate": 9.977517570675128e-07, "loss": 0.0015, "reward": 1.873798131942749, "reward_std": 0.1491905152797699, "rewards/accuracy_reward": 0.7331730723381042, "rewards/format_reward": 1.0, "step": 993, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 399.46875, "epoch": 0.030227466244982362, "grad_norm": 0.7285474661582106, "kl": 0.02685546875, "learning_rate": 9.977472300094108e-07, "loss": 0.0011, "reward": 2.1031250953674316, "reward_std": 0.08647121489048004, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 994, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 368.421875, "epoch": 0.030257876170782145, "grad_norm": 0.6211543500311653, "kl": 0.03662109375, "learning_rate": 9.977426984083445e-07, "loss": 0.0015, "reward": 1.7468750476837158, "reward_std": 0.15315142273902893, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 995, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 374.15625, "epoch": 0.030288286096581924, "grad_norm": 3.3128506331826095, "kl": 0.034423828125, "learning_rate": 9.977381622643555e-07, "loss": 0.0014, "reward": 1.5265625715255737, "reward_std": 0.18119041621685028, "rewards/accuracy_reward": 0.4453125, "rewards/format_reward": 1.0, "step": 996, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 394.265625, "epoch": 0.030318696022381707, "grad_norm": 0.8180593751576957, "kl": 0.02783203125, "learning_rate": 9.97733621577485e-07, "loss": 0.0011, "reward": 2.05006742477417, "reward_std": 0.09958094358444214, "rewards/accuracy_reward": 0.8656922578811646, "rewards/format_reward": 1.0, "step": 997, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 385.703125, "epoch": 0.030349105948181486, "grad_norm": 1.7214226749397117, "kl": 0.033447265625, "learning_rate": 9.977290763477744e-07, "loss": 0.0013, "reward": 1.66288423538208, "reward_std": 0.23061969876289368, "rewards/accuracy_reward": 0.5597591400146484, "rewards/format_reward": 0.953125, "step": 998, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 375.15625, "epoch": 0.03037951587398127, "grad_norm": 1.5463239044841857, "kl": 0.0289306640625, "learning_rate": 9.977245265752652e-07, "loss": 0.0012, "reward": 2.054236888885498, "reward_std": 0.013707850128412247, "rewards/accuracy_reward": 0.8823617696762085, "rewards/format_reward": 1.0, "step": 999, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 357.15625, "epoch": 0.030409925799781048, "grad_norm": 0.782399850106308, "kl": 0.02880859375, "learning_rate": 9.977199722599991e-07, "loss": 0.0012, "reward": 2.106250047683716, "reward_std": 0.1332113891839981, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1000, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.15625, "epoch": 0.03044033572558083, "grad_norm": 2.5352426202895546, "kl": 0.0284423828125, "learning_rate": 9.977154134020174e-07, "loss": 0.0011, "reward": 1.8904694318771362, "reward_std": 0.07783059775829315, "rewards/accuracy_reward": 0.7342193722724915, "rewards/format_reward": 1.0, "step": 1001, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 351.65625, "epoch": 0.03047074565138061, "grad_norm": 1.1322854843580028, "kl": 0.03125, "learning_rate": 9.97710850001362e-07, "loss": 0.0012, "reward": 1.9802082777023315, "reward_std": 0.07344460487365723, "rewards/accuracy_reward": 0.8177083730697632, "rewards/format_reward": 1.0, "step": 1002, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 390.6875, "epoch": 0.030501155577180392, "grad_norm": 1.2207954424876948, "kl": 0.028564453125, "learning_rate": 9.977062820580744e-07, "loss": 0.0011, "reward": 1.7218749523162842, "reward_std": 0.10816285014152527, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1003, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 363.734375, "epoch": 0.03053156550298017, "grad_norm": 0.8529650625732379, "kl": 0.034912109375, "learning_rate": 9.977017095721963e-07, "loss": 0.0014, "reward": 2.074571132659912, "reward_std": 0.08991102874279022, "rewards/accuracy_reward": 0.8933210968971252, "rewards/format_reward": 1.0, "step": 1004, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 355.828125, "epoch": 0.030561975428779954, "grad_norm": 1.389363713669344, "kl": 0.0306396484375, "learning_rate": 9.976971325437693e-07, "loss": 0.0012, "reward": 2.0461008548736572, "reward_std": 0.045329876244068146, "rewards/accuracy_reward": 0.8586008548736572, "rewards/format_reward": 1.0, "step": 1005, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 393.6875, "epoch": 0.030592385354579733, "grad_norm": 1.4876171469465767, "kl": 0.028076171875, "learning_rate": 9.976925509728354e-07, "loss": 0.0011, "reward": 1.9939799308776855, "reward_std": 0.15717732906341553, "rewards/accuracy_reward": 0.847105085849762, "rewards/format_reward": 1.0, "step": 1006, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 374.765625, "epoch": 0.030622795280379516, "grad_norm": 1.0638612113186632, "kl": 0.0308837890625, "learning_rate": 9.976879648594362e-07, "loss": 0.0012, "reward": 1.9098339080810547, "reward_std": 0.04801425337791443, "rewards/accuracy_reward": 0.7473338842391968, "rewards/format_reward": 1.0, "step": 1007, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 387.8125, "epoch": 0.030653205206179295, "grad_norm": 1.1338305280574497, "kl": 0.03369140625, "learning_rate": 9.976833742036138e-07, "loss": 0.0014, "reward": 2.0214643478393555, "reward_std": 0.05749175697565079, "rewards/accuracy_reward": 0.8245891332626343, "rewards/format_reward": 1.0, "step": 1008, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 371.34375, "epoch": 0.030683615131979078, "grad_norm": 0.8910110238535314, "kl": 0.0301513671875, "learning_rate": 9.976787790054102e-07, "loss": 0.0012, "reward": 1.709375023841858, "reward_std": 0.16666369140148163, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1009, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 390.28125, "epoch": 0.03071402505777886, "grad_norm": 1.4706095789329003, "kl": 0.0281982421875, "learning_rate": 9.976741792648668e-07, "loss": 0.0011, "reward": 1.5338377952575684, "reward_std": 0.23000706732273102, "rewards/accuracy_reward": 0.4275878369808197, "rewards/format_reward": 1.0, "step": 1010, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 375.359375, "epoch": 0.03074443498357864, "grad_norm": 1.0386717397737988, "kl": 0.032470703125, "learning_rate": 9.97669574982026e-07, "loss": 0.0013, "reward": 1.5888352394104004, "reward_std": 0.06130202114582062, "rewards/accuracy_reward": 0.4669603705406189, "rewards/format_reward": 1.0, "step": 1011, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.390625, "epoch": 0.030774844909378422, "grad_norm": 1.7763034198469754, "kl": 0.03271484375, "learning_rate": 9.976649661569298e-07, "loss": 0.0013, "reward": 1.8490240573883057, "reward_std": 0.11306708306074142, "rewards/accuracy_reward": 0.6896489858627319, "rewards/format_reward": 1.0, "step": 1012, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 382.34375, "epoch": 0.0308052548351782, "grad_norm": 1.1689508955512573, "kl": 0.033447265625, "learning_rate": 9.976603527896202e-07, "loss": 0.0013, "reward": 1.7343809604644775, "reward_std": 0.12506583333015442, "rewards/accuracy_reward": 0.6125059723854065, "rewards/format_reward": 1.0, "step": 1013, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 366.4375, "epoch": 0.030835664760977984, "grad_norm": 0.941164418094868, "kl": 0.04150390625, "learning_rate": 9.976557348801392e-07, "loss": 0.0017, "reward": 2.0537610054016113, "reward_std": 0.09884816408157349, "rewards/accuracy_reward": 0.8631359338760376, "rewards/format_reward": 1.0, "step": 1014, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 381.390625, "epoch": 0.030866074686777763, "grad_norm": 0.831857428020724, "kl": 0.026123046875, "learning_rate": 9.97651112428529e-07, "loss": 0.001, "reward": 1.8866114616394043, "reward_std": 0.10539491474628448, "rewards/accuracy_reward": 0.7209863662719727, "rewards/format_reward": 1.0, "step": 1015, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 393.8125, "epoch": 0.030896484612577546, "grad_norm": 1.253263403381405, "kl": 0.0322265625, "learning_rate": 9.97646485434832e-07, "loss": 0.0013, "reward": 1.6774048805236816, "reward_std": 0.20186689496040344, "rewards/accuracy_reward": 0.5336548089981079, "rewards/format_reward": 1.0, "step": 1016, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.890625, "epoch": 0.030926894538377325, "grad_norm": 0.8894819683205699, "kl": 0.0306396484375, "learning_rate": 9.9764185389909e-07, "loss": 0.0012, "reward": 1.9500000476837158, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1017, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 379.53125, "epoch": 0.030957304464177108, "grad_norm": 1.2104989301853364, "kl": 0.03515625, "learning_rate": 9.976372178213456e-07, "loss": 0.0014, "reward": 1.8262748718261719, "reward_std": 0.08429563790559769, "rewards/accuracy_reward": 0.679399847984314, "rewards/format_reward": 1.0, "step": 1018, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.03125, "epoch": 0.030987714389976887, "grad_norm": 1.1865058840825473, "kl": 0.031494140625, "learning_rate": 9.976325772016414e-07, "loss": 0.0013, "reward": 1.8211631774902344, "reward_std": 0.1438705325126648, "rewards/accuracy_reward": 0.686788022518158, "rewards/format_reward": 1.0, "step": 1019, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.15625, "epoch": 0.03101812431577667, "grad_norm": 0.4914112043612559, "kl": 0.031494140625, "learning_rate": 9.976279320400192e-07, "loss": 0.0013, "reward": 2.1187500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1020, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.625, "epoch": 0.03104853424157645, "grad_norm": 1.0128239611337386, "kl": 0.03369140625, "learning_rate": 9.976232823365215e-07, "loss": 0.0014, "reward": 2.06553316116333, "reward_std": 0.04186427593231201, "rewards/accuracy_reward": 0.8749080300331116, "rewards/format_reward": 1.0, "step": 1021, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.9375, "epoch": 0.03107894416737623, "grad_norm": 1.0307083875752883, "kl": 0.027587890625, "learning_rate": 9.97618628091191e-07, "loss": 0.0011, "reward": 1.8793494701385498, "reward_std": 0.15918007493019104, "rewards/accuracy_reward": 0.7230994701385498, "rewards/format_reward": 1.0, "step": 1022, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 385.75, "epoch": 0.031109354093176014, "grad_norm": 1.4294363581755307, "kl": 0.033203125, "learning_rate": 9.976139693040697e-07, "loss": 0.0013, "reward": 1.8375000953674316, "reward_std": 0.09531005471944809, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1023, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 392.5625, "epoch": 0.031139764018975793, "grad_norm": 1.2074880237842038, "kl": 0.02783203125, "learning_rate": 9.976093059752009e-07, "loss": 0.0011, "reward": 1.7406575679779053, "reward_std": 0.12536031007766724, "rewards/accuracy_reward": 0.6000326871871948, "rewards/format_reward": 1.0, "step": 1024, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.625, "epoch": 0.031170173944775576, "grad_norm": 1.0327862152264016, "kl": 0.03564453125, "learning_rate": 9.976046381046264e-07, "loss": 0.0014, "reward": 1.8537046909332275, "reward_std": 0.13305805623531342, "rewards/accuracy_reward": 0.713079571723938, "rewards/format_reward": 1.0, "step": 1025, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 388.375, "epoch": 0.031200583870575355, "grad_norm": 3.101474064043986, "kl": 0.033203125, "learning_rate": 9.97599965692389e-07, "loss": 0.0013, "reward": 1.8433763980865479, "reward_std": 0.11379536986351013, "rewards/accuracy_reward": 0.6715012788772583, "rewards/format_reward": 1.0, "step": 1026, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 381.5, "epoch": 0.031230993796375138, "grad_norm": 0.7608621621800262, "kl": 0.02685546875, "learning_rate": 9.975952887385317e-07, "loss": 0.0011, "reward": 1.8340480327606201, "reward_std": 0.1912408322095871, "rewards/accuracy_reward": 0.7059230804443359, "rewards/format_reward": 0.984375, "step": 1027, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 348.328125, "epoch": 0.03126140372217492, "grad_norm": 1.1145174861487677, "kl": 0.033935546875, "learning_rate": 9.975906072430969e-07, "loss": 0.0014, "reward": 2.02040433883667, "reward_std": 0.0999375432729721, "rewards/accuracy_reward": 0.8391544222831726, "rewards/format_reward": 1.0, "step": 1028, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.625, "epoch": 0.031291813647974696, "grad_norm": 1.430521498035193, "kl": 0.02734375, "learning_rate": 9.975859212061274e-07, "loss": 0.0011, "reward": 2.1035537719726562, "reward_std": 0.07151352614164352, "rewards/accuracy_reward": 0.9129289388656616, "rewards/format_reward": 1.0, "step": 1029, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 366.09375, "epoch": 0.03132222357377448, "grad_norm": 0.9725369091114492, "kl": 0.0291748046875, "learning_rate": 9.97581230627666e-07, "loss": 0.0012, "reward": 2.0565624237060547, "reward_std": 0.05702798068523407, "rewards/accuracy_reward": 0.8846875429153442, "rewards/format_reward": 1.0, "step": 1030, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 379.5625, "epoch": 0.03135263349957426, "grad_norm": 1.3794703306002054, "kl": 0.035888671875, "learning_rate": 9.975765355077553e-07, "loss": 0.0014, "reward": 1.93522047996521, "reward_std": 0.13645440340042114, "rewards/accuracy_reward": 0.7633453607559204, "rewards/format_reward": 1.0, "step": 1031, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 373.09375, "epoch": 0.03138304342537404, "grad_norm": 1.5917229873345842, "kl": 0.031005859375, "learning_rate": 9.975718358464383e-07, "loss": 0.0012, "reward": 1.8549714088439941, "reward_std": 0.19620639085769653, "rewards/accuracy_reward": 0.6862212419509888, "rewards/format_reward": 1.0, "step": 1032, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 379.421875, "epoch": 0.03141345335117382, "grad_norm": 1.4726606412692054, "kl": 0.031494140625, "learning_rate": 9.97567131643758e-07, "loss": 0.0013, "reward": 1.9913734197616577, "reward_std": 0.1471506804227829, "rewards/accuracy_reward": 0.8507484197616577, "rewards/format_reward": 1.0, "step": 1033, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.484375, "epoch": 0.031443863276973606, "grad_norm": 1.259856833652236, "kl": 0.026123046875, "learning_rate": 9.97562422899757e-07, "loss": 0.001, "reward": 1.8706974983215332, "reward_std": 0.1301310956478119, "rewards/accuracy_reward": 0.6988223791122437, "rewards/format_reward": 1.0, "step": 1034, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 404.84375, "epoch": 0.031474273202773385, "grad_norm": 2.2608351361129797, "kl": 0.0260009765625, "learning_rate": 9.975577096144788e-07, "loss": 0.001, "reward": 1.7881382703781128, "reward_std": 0.15508222579956055, "rewards/accuracy_reward": 0.6725132465362549, "rewards/format_reward": 1.0, "step": 1035, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 371.265625, "epoch": 0.031504683128573165, "grad_norm": 0.48728254589879094, "kl": 0.0322265625, "learning_rate": 9.97552991787966e-07, "loss": 0.0013, "reward": 2.1239399909973145, "reward_std": 0.007922534830868244, "rewards/accuracy_reward": 0.9239398837089539, "rewards/format_reward": 1.0, "step": 1036, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 374.421875, "epoch": 0.03153509305437295, "grad_norm": 1.3629466911378965, "kl": 0.02734375, "learning_rate": 9.975482694202618e-07, "loss": 0.0011, "reward": 1.2980787754058838, "reward_std": 0.11241129040718079, "rewards/accuracy_reward": 0.2137036919593811, "rewards/format_reward": 1.0, "step": 1037, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 406.625, "epoch": 0.03156550298017273, "grad_norm": 1.461056211536323, "kl": 0.0283203125, "learning_rate": 9.975435425114092e-07, "loss": 0.0011, "reward": 1.4827834367752075, "reward_std": 0.10858327895402908, "rewards/accuracy_reward": 0.4015333950519562, "rewards/format_reward": 1.0, "step": 1038, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 397.109375, "epoch": 0.03159591290597251, "grad_norm": 0.66771279792573, "kl": 0.0247802734375, "learning_rate": 9.975388110614514e-07, "loss": 0.001, "reward": 1.9582293033599854, "reward_std": 0.004301874432712793, "rewards/accuracy_reward": 0.7832292318344116, "rewards/format_reward": 1.0, "step": 1039, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 389.328125, "epoch": 0.03162632283177229, "grad_norm": 0.9623246814350269, "kl": 0.027587890625, "learning_rate": 9.975340750704317e-07, "loss": 0.0011, "reward": 1.9350848197937012, "reward_std": 0.15852871537208557, "rewards/accuracy_reward": 0.7663347721099854, "rewards/format_reward": 1.0, "step": 1040, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 388.734375, "epoch": 0.031656732757572074, "grad_norm": 0.29921358495941097, "kl": 0.032958984375, "learning_rate": 9.975293345383933e-07, "loss": 0.0013, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1041, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 383.53125, "epoch": 0.031687142683371854, "grad_norm": 6.553914097257682, "kl": 0.04052734375, "learning_rate": 9.975245894653795e-07, "loss": 0.0016, "reward": 2.112783432006836, "reward_std": 0.006405085325241089, "rewards/accuracy_reward": 0.912783145904541, "rewards/format_reward": 1.0, "step": 1042, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 421.921875, "epoch": 0.03171755260917163, "grad_norm": 0.8658028152057612, "kl": 0.029296875, "learning_rate": 9.975198398514333e-07, "loss": 0.0012, "reward": 1.869927167892456, "reward_std": 0.12430602312088013, "rewards/accuracy_reward": 0.7261770963668823, "rewards/format_reward": 0.984375, "step": 1043, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 374.65625, "epoch": 0.03174796253497141, "grad_norm": 1.5865512189991238, "kl": 0.0419921875, "learning_rate": 9.975150856965984e-07, "loss": 0.0017, "reward": 1.9823377132415771, "reward_std": 0.007232592906802893, "rewards/accuracy_reward": 0.8073376417160034, "rewards/format_reward": 1.0, "step": 1044, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 404.84375, "epoch": 0.0317783724607712, "grad_norm": 1.33193976633155, "kl": 0.029052734375, "learning_rate": 9.97510327000918e-07, "loss": 0.0012, "reward": 1.6656694412231445, "reward_std": 0.10279634594917297, "rewards/accuracy_reward": 0.5187945365905762, "rewards/format_reward": 1.0, "step": 1045, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.328125, "epoch": 0.03180878238657098, "grad_norm": 1.3035393755697025, "kl": 0.0390625, "learning_rate": 9.975055637644355e-07, "loss": 0.0016, "reward": 1.786503791809082, "reward_std": 0.111859530210495, "rewards/accuracy_reward": 0.6458787322044373, "rewards/format_reward": 1.0, "step": 1046, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 397.125, "epoch": 0.03183919231237076, "grad_norm": 1.0076990893591982, "kl": 0.033935546875, "learning_rate": 9.975007959871947e-07, "loss": 0.0014, "reward": 1.810640811920166, "reward_std": 0.21970966458320618, "rewards/accuracy_reward": 0.6637657284736633, "rewards/format_reward": 1.0, "step": 1047, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 405.453125, "epoch": 0.031869602238170536, "grad_norm": 1.1632123679178106, "kl": 0.03369140625, "learning_rate": 9.974960236692387e-07, "loss": 0.0013, "reward": 1.686184287071228, "reward_std": 0.051657333970069885, "rewards/accuracy_reward": 0.5424342155456543, "rewards/format_reward": 1.0, "step": 1048, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 401.875, "epoch": 0.03190001216397032, "grad_norm": 0.7782241410854358, "kl": 0.03857421875, "learning_rate": 9.974912468106112e-07, "loss": 0.0015, "reward": 1.9845863580703735, "reward_std": 0.08298208564519882, "rewards/accuracy_reward": 0.7939612865447998, "rewards/format_reward": 1.0, "step": 1049, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 401.765625, "epoch": 0.0319304220897701, "grad_norm": 0.2278528084097704, "kl": 0.04052734375, "learning_rate": 9.97486465411356e-07, "loss": 0.0016, "reward": 2.0031251907348633, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.953125, "step": 1050, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 385.921875, "epoch": 0.03196083201556988, "grad_norm": 1.0086817191493935, "kl": 0.03955078125, "learning_rate": 9.974816794715165e-07, "loss": 0.0016, "reward": 2.0194334983825684, "reward_std": 0.06533440947532654, "rewards/accuracy_reward": 0.8256836533546448, "rewards/format_reward": 1.0, "step": 1051, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 386.625, "epoch": 0.031991241941369666, "grad_norm": 0.5998748827167423, "kl": 0.03466796875, "learning_rate": 9.974768889911365e-07, "loss": 0.0014, "reward": 2.1642045974731445, "reward_std": 0.061623118817806244, "rewards/accuracy_reward": 0.9673295021057129, "rewards/format_reward": 1.0, "step": 1052, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.234375, "epoch": 0.032021651867169446, "grad_norm": 1.1801284648311607, "kl": 0.04052734375, "learning_rate": 9.974720939702596e-07, "loss": 0.0016, "reward": 1.8989980220794678, "reward_std": 0.12210077047348022, "rewards/accuracy_reward": 0.7208729982376099, "rewards/format_reward": 1.0, "step": 1053, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 375.828125, "epoch": 0.032052061792969225, "grad_norm": 3.0655197929301248, "kl": 0.0361328125, "learning_rate": 9.974672944089296e-07, "loss": 0.0014, "reward": 1.7496037483215332, "reward_std": 0.16746900975704193, "rewards/accuracy_reward": 0.6214788556098938, "rewards/format_reward": 1.0, "step": 1054, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 365.484375, "epoch": 0.032082471718769004, "grad_norm": 0.7612209929918888, "kl": 0.034423828125, "learning_rate": 9.974624903071903e-07, "loss": 0.0014, "reward": 1.8343307971954346, "reward_std": 0.013469927944242954, "rewards/accuracy_reward": 0.6843306422233582, "rewards/format_reward": 1.0, "step": 1055, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 399.65625, "epoch": 0.03211288164456879, "grad_norm": 1.6204578480917131, "kl": 0.03271484375, "learning_rate": 9.97457681665086e-07, "loss": 0.0013, "reward": 1.718801736831665, "reward_std": 0.21143373847007751, "rewards/accuracy_reward": 0.6063016653060913, "rewards/format_reward": 1.0, "step": 1056, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 392.265625, "epoch": 0.03214329157036857, "grad_norm": 1.722078742051551, "kl": 0.031494140625, "learning_rate": 9.974528684826598e-07, "loss": 0.0013, "reward": 1.7573037147521973, "reward_std": 0.18160060048103333, "rewards/accuracy_reward": 0.6073037385940552, "rewards/format_reward": 0.984375, "step": 1057, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 375.984375, "epoch": 0.03217370149616835, "grad_norm": 1.2005784098615955, "kl": 0.031494140625, "learning_rate": 9.97448050759956e-07, "loss": 0.0013, "reward": 1.8566406965255737, "reward_std": 0.08184340596199036, "rewards/accuracy_reward": 0.7035156488418579, "rewards/format_reward": 1.0, "step": 1058, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 348.078125, "epoch": 0.03220411142196813, "grad_norm": 1.1694386500197025, "kl": 0.034423828125, "learning_rate": 9.97443228497019e-07, "loss": 0.0014, "reward": 2.0843751430511475, "reward_std": 0.1595131754875183, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1059, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 367.21875, "epoch": 0.032234521347767914, "grad_norm": 1.5085682650996186, "kl": 0.0341796875, "learning_rate": 9.97438401693892e-07, "loss": 0.0014, "reward": 1.805967926979065, "reward_std": 0.043219514191150665, "rewards/accuracy_reward": 0.6715928912162781, "rewards/format_reward": 1.0, "step": 1060, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.4375, "epoch": 0.03226493127356769, "grad_norm": 1.2843785197775819, "kl": 0.0296630859375, "learning_rate": 9.974335703506195e-07, "loss": 0.0012, "reward": 1.9324777126312256, "reward_std": 0.1777973622083664, "rewards/accuracy_reward": 0.7762277126312256, "rewards/format_reward": 1.0, "step": 1061, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 353.578125, "epoch": 0.03229534119936747, "grad_norm": 0.9274474149210893, "kl": 0.0286865234375, "learning_rate": 9.974287344672458e-07, "loss": 0.0011, "reward": 2.0343751907348633, "reward_std": 0.030616413801908493, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1062, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 358.453125, "epoch": 0.03232575112516725, "grad_norm": 4.099988383137438, "kl": 0.031494140625, "learning_rate": 9.974238940438145e-07, "loss": 0.0013, "reward": 1.8780081272125244, "reward_std": 0.2630694806575775, "rewards/accuracy_reward": 0.7248830795288086, "rewards/format_reward": 1.0, "step": 1063, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 348.640625, "epoch": 0.03235616105096704, "grad_norm": 2.4818476415134567, "kl": 0.03662109375, "learning_rate": 9.974190490803702e-07, "loss": 0.0015, "reward": 1.8492457866668701, "reward_std": 0.11250324547290802, "rewards/accuracy_reward": 0.7117456793785095, "rewards/format_reward": 1.0, "step": 1064, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 365.6875, "epoch": 0.03238657097676682, "grad_norm": 0.8569010092934155, "kl": 0.03271484375, "learning_rate": 9.97414199576957e-07, "loss": 0.0013, "reward": 1.609375, "reward_std": 0.14089259505271912, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1065, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 383.125, "epoch": 0.032416980902566596, "grad_norm": 2.2040386956667173, "kl": 0.03515625, "learning_rate": 9.974093455336192e-07, "loss": 0.0014, "reward": 1.6146278381347656, "reward_std": 0.19844332337379456, "rewards/accuracy_reward": 0.48650285601615906, "rewards/format_reward": 1.0, "step": 1066, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 361.375, "epoch": 0.03244739082836638, "grad_norm": 0.7779248216512246, "kl": 0.036376953125, "learning_rate": 9.97404486950401e-07, "loss": 0.0015, "reward": 1.9718750715255737, "reward_std": 0.08901721239089966, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1067, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.234375, "epoch": 0.03247780075416616, "grad_norm": 1.3818472136952866, "kl": 0.026123046875, "learning_rate": 9.973996238273468e-07, "loss": 0.001, "reward": 1.8044815063476562, "reward_std": 0.11264000087976456, "rewards/accuracy_reward": 0.6576066017150879, "rewards/format_reward": 1.0, "step": 1068, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 374.734375, "epoch": 0.03250821067996594, "grad_norm": 1.4298278780276268, "kl": 0.0341796875, "learning_rate": 9.973947561645008e-07, "loss": 0.0014, "reward": 1.896875023841858, "reward_std": 0.2362026572227478, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1069, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.3125, "epoch": 0.03253862060576572, "grad_norm": 0.6803816360331187, "kl": 0.0281982421875, "learning_rate": 9.97389883961908e-07, "loss": 0.0011, "reward": 1.7533247470855713, "reward_std": 0.08520626276731491, "rewards/accuracy_reward": 0.6158246994018555, "rewards/format_reward": 1.0, "step": 1070, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 391.5, "epoch": 0.032569030531565506, "grad_norm": 1.050297100744761, "kl": 0.033447265625, "learning_rate": 9.973850072196122e-07, "loss": 0.0013, "reward": 2.0400681495666504, "reward_std": 0.013605024665594101, "rewards/accuracy_reward": 0.8838181495666504, "rewards/format_reward": 1.0, "step": 1071, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 378.859375, "epoch": 0.032599440457365285, "grad_norm": 0.8194215031196004, "kl": 0.037841796875, "learning_rate": 9.973801259376583e-07, "loss": 0.0015, "reward": 1.8406963348388672, "reward_std": 0.05681546777486801, "rewards/accuracy_reward": 0.696946382522583, "rewards/format_reward": 1.0, "step": 1072, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 407.265625, "epoch": 0.032629850383165064, "grad_norm": 0.5931019624130202, "kl": 0.037109375, "learning_rate": 9.973752401160905e-07, "loss": 0.0015, "reward": 1.8343359231948853, "reward_std": 0.052666086703538895, "rewards/accuracy_reward": 0.6874608993530273, "rewards/format_reward": 1.0, "step": 1073, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 371.75, "epoch": 0.03266026030896484, "grad_norm": 0.5850531213485309, "kl": 0.037353515625, "learning_rate": 9.97370349754954e-07, "loss": 0.0015, "reward": 1.970604419708252, "reward_std": 0.006674332078546286, "rewards/accuracy_reward": 0.7956044673919678, "rewards/format_reward": 1.0, "step": 1074, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 400.5, "epoch": 0.03269067023476463, "grad_norm": 1.110155660909042, "kl": 0.0247802734375, "learning_rate": 9.973654548542927e-07, "loss": 0.001, "reward": 1.8497488498687744, "reward_std": 0.19115684926509857, "rewards/accuracy_reward": 0.6747488379478455, "rewards/format_reward": 1.0, "step": 1075, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 365.15625, "epoch": 0.03272108016056441, "grad_norm": 1.0284487521612546, "kl": 0.031494140625, "learning_rate": 9.97360555414152e-07, "loss": 0.0013, "reward": 1.8062500953674316, "reward_std": 0.19190602004528046, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1076, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.734375, "epoch": 0.03275149008636419, "grad_norm": 1.9472521362140927, "kl": 0.03759765625, "learning_rate": 9.973556514345761e-07, "loss": 0.0015, "reward": 1.9555892944335938, "reward_std": 0.07708688080310822, "rewards/accuracy_reward": 0.7587141990661621, "rewards/format_reward": 1.0, "step": 1077, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 387.1875, "epoch": 0.03278190001216397, "grad_norm": 1.3884403244974035, "kl": 0.0274658203125, "learning_rate": 9.973507429156098e-07, "loss": 0.0011, "reward": 1.819960355758667, "reward_std": 0.21372120082378387, "rewards/accuracy_reward": 0.663710355758667, "rewards/format_reward": 1.0, "step": 1078, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 390.0625, "epoch": 0.03281230993796375, "grad_norm": 0.8200422254215559, "kl": 0.03173828125, "learning_rate": 9.973458298572982e-07, "loss": 0.0013, "reward": 1.631159782409668, "reward_std": 0.08605692535638809, "rewards/accuracy_reward": 0.5155346393585205, "rewards/format_reward": 1.0, "step": 1079, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 365.890625, "epoch": 0.03284271986376353, "grad_norm": 0.8075935578414538, "kl": 0.0341796875, "learning_rate": 9.973409122596858e-07, "loss": 0.0014, "reward": 2.0218749046325684, "reward_std": 0.07018192857503891, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1080, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 414.28125, "epoch": 0.03287312978956331, "grad_norm": 2.164588831509964, "kl": 0.0264892578125, "learning_rate": 9.97335990122818e-07, "loss": 0.0011, "reward": 1.4646395444869995, "reward_std": 0.1339268684387207, "rewards/accuracy_reward": 0.38338956236839294, "rewards/format_reward": 0.984375, "step": 1081, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.96875, "epoch": 0.0329035397153631, "grad_norm": 0.5506757118454917, "kl": 0.03369140625, "learning_rate": 9.97331063446739e-07, "loss": 0.0013, "reward": 1.9762752056121826, "reward_std": 0.024736447259783745, "rewards/accuracy_reward": 0.8012750148773193, "rewards/format_reward": 1.0, "step": 1082, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 385.546875, "epoch": 0.03293394964116288, "grad_norm": 1.4689588116782917, "kl": 0.0308837890625, "learning_rate": 9.973261322314942e-07, "loss": 0.0012, "reward": 1.714081048965454, "reward_std": 0.05578683689236641, "rewards/accuracy_reward": 0.5484559535980225, "rewards/format_reward": 1.0, "step": 1083, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 386.59375, "epoch": 0.032964359566962656, "grad_norm": 0.7729783315586961, "kl": 0.0301513671875, "learning_rate": 9.973211964771286e-07, "loss": 0.0012, "reward": 1.6049872636795044, "reward_std": 0.07922689616680145, "rewards/accuracy_reward": 0.495612233877182, "rewards/format_reward": 1.0, "step": 1084, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 364.09375, "epoch": 0.032994769492762435, "grad_norm": 0.08579283258951795, "kl": 0.03271484375, "learning_rate": 9.973162561836872e-07, "loss": 0.0013, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1085, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 360.4375, "epoch": 0.03302517941856222, "grad_norm": 0.46954585572884366, "kl": 0.0341796875, "learning_rate": 9.973113113512152e-07, "loss": 0.0014, "reward": 1.896875023841858, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1086, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 389.40625, "epoch": 0.033055589344362, "grad_norm": 1.7091455195897645, "kl": 0.0267333984375, "learning_rate": 9.973063619797576e-07, "loss": 0.0011, "reward": 1.5719795227050781, "reward_std": 0.13767774403095245, "rewards/accuracy_reward": 0.4501045346260071, "rewards/format_reward": 1.0, "step": 1087, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 398.09375, "epoch": 0.03308599927016178, "grad_norm": 1.0289699015560325, "kl": 0.025390625, "learning_rate": 9.973014080693593e-07, "loss": 0.001, "reward": 1.8027890920639038, "reward_std": 0.12817265093326569, "rewards/accuracy_reward": 0.6559140086174011, "rewards/format_reward": 1.0, "step": 1088, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 395.015625, "epoch": 0.03311640919596156, "grad_norm": 1.5449118187480184, "kl": 0.032470703125, "learning_rate": 9.972964496200661e-07, "loss": 0.0013, "reward": 1.8438479900360107, "reward_std": 0.09966101497411728, "rewards/accuracy_reward": 0.6594729423522949, "rewards/format_reward": 1.0, "step": 1089, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 363.921875, "epoch": 0.033146819121761345, "grad_norm": 0.7394582997275666, "kl": 0.032470703125, "learning_rate": 9.972914866319228e-07, "loss": 0.0013, "reward": 1.6560384035110474, "reward_std": 0.013002442196011543, "rewards/accuracy_reward": 0.5341634750366211, "rewards/format_reward": 1.0, "step": 1090, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 387.796875, "epoch": 0.033177229047561124, "grad_norm": 5.706732301032811, "kl": 0.0361328125, "learning_rate": 9.97286519104975e-07, "loss": 0.0014, "reward": 1.5088940858840942, "reward_std": 0.16414456069469452, "rewards/accuracy_reward": 0.40264415740966797, "rewards/format_reward": 1.0, "step": 1091, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 389.453125, "epoch": 0.0332076389733609, "grad_norm": 0.7165977625533464, "kl": 0.0301513671875, "learning_rate": 9.972815470392679e-07, "loss": 0.0012, "reward": 1.870192289352417, "reward_std": 0.08161399513483047, "rewards/accuracy_reward": 0.729567289352417, "rewards/format_reward": 1.0, "step": 1092, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.5625, "epoch": 0.03323804889916068, "grad_norm": 0.9248391407503694, "kl": 0.03076171875, "learning_rate": 9.972765704348467e-07, "loss": 0.0012, "reward": 1.8346686363220215, "reward_std": 0.16912472248077393, "rewards/accuracy_reward": 0.6784186363220215, "rewards/format_reward": 1.0, "step": 1093, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 364.015625, "epoch": 0.03326845882496047, "grad_norm": 0.974307748350617, "kl": 0.03466796875, "learning_rate": 9.972715892917572e-07, "loss": 0.0014, "reward": 1.7006316184997559, "reward_std": 0.13532617688179016, "rewards/accuracy_reward": 0.5662564635276794, "rewards/format_reward": 1.0, "step": 1094, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 349.9375, "epoch": 0.03329886875076025, "grad_norm": 0.8915227577354273, "kl": 0.03271484375, "learning_rate": 9.972666036100444e-07, "loss": 0.0013, "reward": 1.7798829078674316, "reward_std": 0.02489185519516468, "rewards/accuracy_reward": 0.6455078125, "rewards/format_reward": 1.0, "step": 1095, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 379.859375, "epoch": 0.03332927867656003, "grad_norm": 0.48060170225431265, "kl": 0.0262451171875, "learning_rate": 9.972616133897543e-07, "loss": 0.0011, "reward": 1.8218750953674316, "reward_std": 0.08901721239089966, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1096, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 395.0, "epoch": 0.03335968860235981, "grad_norm": 1.2592743953890917, "kl": 0.030517578125, "learning_rate": 9.972566186309321e-07, "loss": 0.0012, "reward": 1.6198643445968628, "reward_std": 0.10171298682689667, "rewards/accuracy_reward": 0.494864284992218, "rewards/format_reward": 1.0, "step": 1097, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 388.71875, "epoch": 0.03339009852815959, "grad_norm": 0.6234882876885078, "kl": 0.0301513671875, "learning_rate": 9.972516193336235e-07, "loss": 0.0012, "reward": 1.9183714389801025, "reward_std": 0.08268841356039047, "rewards/accuracy_reward": 0.7683714628219604, "rewards/format_reward": 1.0, "step": 1098, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.6875, "epoch": 0.03342050845395937, "grad_norm": 1.1879629268265015, "kl": 0.0308837890625, "learning_rate": 9.972466154978743e-07, "loss": 0.0012, "reward": 1.8996248245239258, "reward_std": 0.1979600489139557, "rewards/accuracy_reward": 0.758999764919281, "rewards/format_reward": 1.0, "step": 1099, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 372.125, "epoch": 0.03345091837975915, "grad_norm": 0.6969750381163629, "kl": 0.030517578125, "learning_rate": 9.972416071237297e-07, "loss": 0.0012, "reward": 1.9986071586608887, "reward_std": 0.05326046049594879, "rewards/accuracy_reward": 0.8111070394515991, "rewards/format_reward": 1.0, "step": 1100, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 355.21875, "epoch": 0.03348132830555894, "grad_norm": 1.1670169454417925, "kl": 0.036376953125, "learning_rate": 9.97236594211236e-07, "loss": 0.0015, "reward": 1.9805370569229126, "reward_std": 0.047131411731243134, "rewards/accuracy_reward": 0.8117871284484863, "rewards/format_reward": 1.0, "step": 1101, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 393.71875, "epoch": 0.033511738231358716, "grad_norm": 1.1557842585686968, "kl": 0.029296875, "learning_rate": 9.972315767604384e-07, "loss": 0.0012, "reward": 1.475762963294983, "reward_std": 0.18541356921195984, "rewards/accuracy_reward": 0.391387939453125, "rewards/format_reward": 1.0, "step": 1102, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 398.5625, "epoch": 0.033542148157158495, "grad_norm": 1.6982091192995499, "kl": 0.028564453125, "learning_rate": 9.97226554771383e-07, "loss": 0.0011, "reward": 1.6949204206466675, "reward_std": 0.1318046748638153, "rewards/accuracy_reward": 0.5542953610420227, "rewards/format_reward": 1.0, "step": 1103, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 387.984375, "epoch": 0.033572558082958275, "grad_norm": 0.712282533668454, "kl": 0.03173828125, "learning_rate": 9.972215282441159e-07, "loss": 0.0013, "reward": 1.7991609573364258, "reward_std": 0.016065146774053574, "rewards/accuracy_reward": 0.6522858738899231, "rewards/format_reward": 1.0, "step": 1104, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 394.421875, "epoch": 0.03360296800875806, "grad_norm": 1.136228153510139, "kl": 0.0303955078125, "learning_rate": 9.972164971786824e-07, "loss": 0.0012, "reward": 1.6297039985656738, "reward_std": 0.01870308443903923, "rewards/accuracy_reward": 0.5047040581703186, "rewards/format_reward": 1.0, "step": 1105, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 409.9375, "epoch": 0.03363337793455784, "grad_norm": 0.7446650634925455, "kl": 0.0255126953125, "learning_rate": 9.972114615751287e-07, "loss": 0.001, "reward": 1.7607765197753906, "reward_std": 0.1566496044397354, "rewards/accuracy_reward": 0.6357765197753906, "rewards/format_reward": 0.984375, "step": 1106, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 380.375, "epoch": 0.03366378786035762, "grad_norm": 0.9641752956345843, "kl": 0.031494140625, "learning_rate": 9.97206421433501e-07, "loss": 0.0013, "reward": 1.8046131134033203, "reward_std": 0.08519918471574783, "rewards/accuracy_reward": 0.6733629703521729, "rewards/format_reward": 0.984375, "step": 1107, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 378.15625, "epoch": 0.033694197786157405, "grad_norm": 0.7673943013304966, "kl": 0.02978515625, "learning_rate": 9.972013767538448e-07, "loss": 0.0012, "reward": 1.8758015632629395, "reward_std": 0.08757517486810684, "rewards/accuracy_reward": 0.7226765155792236, "rewards/format_reward": 1.0, "step": 1108, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 376.984375, "epoch": 0.033724607711957184, "grad_norm": 0.963322040028811, "kl": 0.0286865234375, "learning_rate": 9.971963275362062e-07, "loss": 0.0011, "reward": 1.5670509338378906, "reward_std": 0.13995975255966187, "rewards/accuracy_reward": 0.4545508623123169, "rewards/format_reward": 1.0, "step": 1109, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.453125, "epoch": 0.033755017637756964, "grad_norm": 2.672311078318137, "kl": 0.0306396484375, "learning_rate": 9.971912737806319e-07, "loss": 0.0012, "reward": 1.9038267135620117, "reward_std": 0.1121816486120224, "rewards/accuracy_reward": 0.7350767850875854, "rewards/format_reward": 1.0, "step": 1110, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 363.453125, "epoch": 0.03378542756355674, "grad_norm": 1.2558479431204714, "kl": 0.03271484375, "learning_rate": 9.971862154871674e-07, "loss": 0.0013, "reward": 1.658834457397461, "reward_std": 0.060252152383327484, "rewards/accuracy_reward": 0.527584433555603, "rewards/format_reward": 1.0, "step": 1111, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 378.34375, "epoch": 0.03381583748935653, "grad_norm": 1.1270021494138065, "kl": 0.026611328125, "learning_rate": 9.97181152655859e-07, "loss": 0.0011, "reward": 1.5101438760757446, "reward_std": 0.25951337814331055, "rewards/accuracy_reward": 0.4351438581943512, "rewards/format_reward": 0.984375, "step": 1112, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 387.75, "epoch": 0.03384624741515631, "grad_norm": 1.2117361639038522, "kl": 0.0291748046875, "learning_rate": 9.971760852867532e-07, "loss": 0.0012, "reward": 1.5003330707550049, "reward_std": 0.20664426684379578, "rewards/accuracy_reward": 0.412833034992218, "rewards/format_reward": 1.0, "step": 1113, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 396.265625, "epoch": 0.03387665734095609, "grad_norm": 1.1755606365907483, "kl": 0.02392578125, "learning_rate": 9.97171013379896e-07, "loss": 0.001, "reward": 1.7332415580749512, "reward_std": 0.08857608586549759, "rewards/accuracy_reward": 0.5894915461540222, "rewards/format_reward": 1.0, "step": 1114, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 374.640625, "epoch": 0.033907067266755866, "grad_norm": 0.9210505530670403, "kl": 0.0296630859375, "learning_rate": 9.971659369353336e-07, "loss": 0.0012, "reward": 1.5143427848815918, "reward_std": 0.05233721062541008, "rewards/accuracy_reward": 0.4205928146839142, "rewards/format_reward": 1.0, "step": 1115, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.625, "completion_length": 354.625, "epoch": 0.03393747719255565, "grad_norm": 0.611951872528932, "kl": 0.030517578125, "learning_rate": 9.971608559531125e-07, "loss": 0.0012, "reward": 1.375, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 1116, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.171875, "epoch": 0.03396788711835543, "grad_norm": 1.3663988661679252, "kl": 0.033203125, "learning_rate": 9.97155770433279e-07, "loss": 0.0013, "reward": 1.788399338722229, "reward_std": 0.15740957856178284, "rewards/accuracy_reward": 0.6602743268013, "rewards/format_reward": 1.0, "step": 1117, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 417.25, "epoch": 0.03399829704415521, "grad_norm": 2.9571042805531276, "kl": 0.029296875, "learning_rate": 9.971506803758795e-07, "loss": 0.0012, "reward": 1.8625000715255737, "reward_std": 0.08448860794305801, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1118, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 375.453125, "epoch": 0.03402870696995499, "grad_norm": 1.6445221111201525, "kl": 0.02783203125, "learning_rate": 9.971455857809606e-07, "loss": 0.0011, "reward": 1.9543342590332031, "reward_std": 0.05220496654510498, "rewards/accuracy_reward": 0.8043341040611267, "rewards/format_reward": 1.0, "step": 1119, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 395.453125, "epoch": 0.034059116895754776, "grad_norm": 1.4751750111053377, "kl": 0.0257568359375, "learning_rate": 9.971404866485687e-07, "loss": 0.001, "reward": 1.582068920135498, "reward_std": 0.29019424319267273, "rewards/accuracy_reward": 0.4851939082145691, "rewards/format_reward": 1.0, "step": 1120, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 358.0625, "epoch": 0.034089526821554555, "grad_norm": 1.0335957607718362, "kl": 0.0272216796875, "learning_rate": 9.971353829787504e-07, "loss": 0.0011, "reward": 1.9825429916381836, "reward_std": 0.015851331874728203, "rewards/accuracy_reward": 0.810667872428894, "rewards/format_reward": 1.0, "step": 1121, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 375.09375, "epoch": 0.034119936747354335, "grad_norm": 1.0574268139005814, "kl": 0.0322265625, "learning_rate": 9.97130274771552e-07, "loss": 0.0013, "reward": 1.6531250476837158, "reward_std": 0.2007448673248291, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1122, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 357.59375, "epoch": 0.03415034667315412, "grad_norm": 1.0290707698268704, "kl": 0.02783203125, "learning_rate": 9.971251620270205e-07, "loss": 0.0011, "reward": 2.1875, "reward_std": 0.029250433668494225, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1123, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 365.671875, "epoch": 0.0341807565989539, "grad_norm": 0.9078286142575763, "kl": 0.031494140625, "learning_rate": 9.971200447452025e-07, "loss": 0.0013, "reward": 1.725000023841858, "reward_std": 0.06134308874607086, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1124, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.59375, "epoch": 0.03421116652475368, "grad_norm": 2.0567512180567147, "kl": 0.0247802734375, "learning_rate": 9.971149229261444e-07, "loss": 0.001, "reward": 1.7211308479309082, "reward_std": 0.13100098073482513, "rewards/accuracy_reward": 0.5898808240890503, "rewards/format_reward": 1.0, "step": 1125, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 351.0625, "epoch": 0.03424157645055346, "grad_norm": 1.1963595173639654, "kl": 0.0322265625, "learning_rate": 9.971097965698934e-07, "loss": 0.0013, "reward": 2.094332218170166, "reward_std": 0.04028134047985077, "rewards/accuracy_reward": 0.9068320989608765, "rewards/format_reward": 1.0, "step": 1126, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 373.25, "epoch": 0.034271986376353245, "grad_norm": 1.7077131554355405, "kl": 0.0322265625, "learning_rate": 9.971046656764958e-07, "loss": 0.0013, "reward": 1.8754020929336548, "reward_std": 0.17054632306098938, "rewards/accuracy_reward": 0.7129021883010864, "rewards/format_reward": 1.0, "step": 1127, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 364.296875, "epoch": 0.034302396302153024, "grad_norm": 0.7235772236845203, "kl": 0.0283203125, "learning_rate": 9.97099530245999e-07, "loss": 0.0011, "reward": 1.96875, "reward_std": 0.08611097186803818, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1128, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.515625, "epoch": 0.0343328062279528, "grad_norm": 3.449172454198428, "kl": 0.0224609375, "learning_rate": 9.970943902784493e-07, "loss": 0.0009, "reward": 1.8469563722610474, "reward_std": 0.1868019700050354, "rewards/accuracy_reward": 0.6875813603401184, "rewards/format_reward": 1.0, "step": 1129, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 400.796875, "epoch": 0.03436321615375258, "grad_norm": 1.2091136047387152, "kl": 0.033203125, "learning_rate": 9.97089245773894e-07, "loss": 0.0013, "reward": 1.7847669124603271, "reward_std": 0.209543377161026, "rewards/accuracy_reward": 0.6410167813301086, "rewards/format_reward": 1.0, "step": 1130, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 364.5, "epoch": 0.03439362607955237, "grad_norm": 0.9410007787577985, "kl": 0.032470703125, "learning_rate": 9.9708409673238e-07, "loss": 0.0013, "reward": 1.9010366201400757, "reward_std": 0.017607901245355606, "rewards/accuracy_reward": 0.751036524772644, "rewards/format_reward": 1.0, "step": 1131, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 406.734375, "epoch": 0.03442403600535215, "grad_norm": 0.908586090698521, "kl": 0.029296875, "learning_rate": 9.970789431539539e-07, "loss": 0.0012, "reward": 1.8408238887786865, "reward_std": 0.09544786810874939, "rewards/accuracy_reward": 0.6908238530158997, "rewards/format_reward": 1.0, "step": 1132, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 380.125, "epoch": 0.03445444593115193, "grad_norm": 0.49611828600933333, "kl": 0.0294189453125, "learning_rate": 9.970737850386633e-07, "loss": 0.0012, "reward": 1.6937501430511475, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1133, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 379.28125, "epoch": 0.034484855856951706, "grad_norm": 0.5520796247701365, "kl": 0.033935546875, "learning_rate": 9.97068622386555e-07, "loss": 0.0014, "reward": 1.823991298675537, "reward_std": 0.012359250336885452, "rewards/accuracy_reward": 0.6802411675453186, "rewards/format_reward": 1.0, "step": 1134, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.96875, "epoch": 0.03451526578275149, "grad_norm": 0.971961980451252, "kl": 0.0296630859375, "learning_rate": 9.97063455197676e-07, "loss": 0.0012, "reward": 1.8294472694396973, "reward_std": 0.0805341899394989, "rewards/accuracy_reward": 0.6575722694396973, "rewards/format_reward": 1.0, "step": 1135, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 365.390625, "epoch": 0.03454567570855127, "grad_norm": 1.0852322847656206, "kl": 0.0322265625, "learning_rate": 9.970582834720736e-07, "loss": 0.0013, "reward": 2.0802955627441406, "reward_std": 0.07524856925010681, "rewards/accuracy_reward": 0.8896706104278564, "rewards/format_reward": 1.0, "step": 1136, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 380.234375, "epoch": 0.03457608563435105, "grad_norm": 1.3975940309707076, "kl": 0.034912109375, "learning_rate": 9.970531072097952e-07, "loss": 0.0014, "reward": 1.7180752754211426, "reward_std": 0.033700451254844666, "rewards/accuracy_reward": 0.561825156211853, "rewards/format_reward": 1.0, "step": 1137, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 411.203125, "epoch": 0.034606495560150836, "grad_norm": 0.3978940051681478, "kl": 0.0283203125, "learning_rate": 9.970479264108878e-07, "loss": 0.0011, "reward": 1.415454626083374, "reward_std": 0.13413015007972717, "rewards/accuracy_reward": 0.3810797333717346, "rewards/format_reward": 0.953125, "step": 1138, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 371.5, "epoch": 0.034636905485950616, "grad_norm": 1.1529001480369871, "kl": 0.032958984375, "learning_rate": 9.970427410753987e-07, "loss": 0.0013, "reward": 1.921875, "reward_std": 0.15858273208141327, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1139, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.0, "epoch": 0.034667315411750395, "grad_norm": 9.181256947299897, "kl": 0.034423828125, "learning_rate": 9.970375512033753e-07, "loss": 0.0014, "reward": 1.745079755783081, "reward_std": 0.110328808426857, "rewards/accuracy_reward": 0.6013297438621521, "rewards/format_reward": 1.0, "step": 1140, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.84375, "epoch": 0.034697725337550174, "grad_norm": 1.0932684617786967, "kl": 0.0264892578125, "learning_rate": 9.97032356794865e-07, "loss": 0.0011, "reward": 2.0882415771484375, "reward_std": 0.0599309504032135, "rewards/accuracy_reward": 0.8913666605949402, "rewards/format_reward": 1.0, "step": 1141, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 359.53125, "epoch": 0.03472813526334996, "grad_norm": 0.48494704730474314, "kl": 0.031494140625, "learning_rate": 9.97027157849915e-07, "loss": 0.0013, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1142, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 372.671875, "epoch": 0.03475854518914974, "grad_norm": 1.5052958700844847, "kl": 0.036865234375, "learning_rate": 9.97021954368573e-07, "loss": 0.0015, "reward": 1.7848329544067383, "reward_std": 0.17025971412658691, "rewards/accuracy_reward": 0.6410830020904541, "rewards/format_reward": 1.0, "step": 1143, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 368.25, "epoch": 0.03478895511494952, "grad_norm": 1.633993810720194, "kl": 0.02978515625, "learning_rate": 9.970167463508865e-07, "loss": 0.0012, "reward": 1.8181629180908203, "reward_std": 0.024760911241173744, "rewards/accuracy_reward": 0.6712878942489624, "rewards/format_reward": 1.0, "step": 1144, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 365.796875, "epoch": 0.0348193650407493, "grad_norm": 0.8787459677537164, "kl": 0.030517578125, "learning_rate": 9.970115337969027e-07, "loss": 0.0012, "reward": 1.7283766269683838, "reward_std": 0.09927521646022797, "rewards/accuracy_reward": 0.5971266031265259, "rewards/format_reward": 1.0, "step": 1145, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.140625, "epoch": 0.034849774966549084, "grad_norm": 2.1798777171219155, "kl": 0.0286865234375, "learning_rate": 9.970063167066696e-07, "loss": 0.0011, "reward": 1.9034209251403809, "reward_std": 0.02453031577169895, "rewards/accuracy_reward": 0.7315459251403809, "rewards/format_reward": 1.0, "step": 1146, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 388.3125, "epoch": 0.03488018489234886, "grad_norm": 0.6235675806737009, "kl": 0.0277099609375, "learning_rate": 9.970010950802346e-07, "loss": 0.0011, "reward": 1.9968749284744263, "reward_std": 0.07151715457439423, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1147, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 385.984375, "epoch": 0.03491059481814864, "grad_norm": 0.8614620498175211, "kl": 0.032958984375, "learning_rate": 9.969958689176452e-07, "loss": 0.0013, "reward": 1.7910230159759521, "reward_std": 0.06380719691514969, "rewards/accuracy_reward": 0.6503980159759521, "rewards/format_reward": 1.0, "step": 1148, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 360.96875, "epoch": 0.03494100474394842, "grad_norm": 6.9891784666690615, "kl": 0.03564453125, "learning_rate": 9.969906382189494e-07, "loss": 0.0014, "reward": 1.7764062881469727, "reward_std": 0.04640388488769531, "rewards/accuracy_reward": 0.6514062285423279, "rewards/format_reward": 1.0, "step": 1149, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.40625, "epoch": 0.03497141466974821, "grad_norm": 1.1492934451768142, "kl": 0.03173828125, "learning_rate": 9.969854029841948e-07, "loss": 0.0013, "reward": 1.7899036407470703, "reward_std": 0.11600731313228607, "rewards/accuracy_reward": 0.6399034857749939, "rewards/format_reward": 1.0, "step": 1150, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.140625, "epoch": 0.03500182459554799, "grad_norm": 1.1792462099638712, "kl": 0.034912109375, "learning_rate": 9.96980163213429e-07, "loss": 0.0014, "reward": 1.8117589950561523, "reward_std": 0.15627166628837585, "rewards/accuracy_reward": 0.6680090427398682, "rewards/format_reward": 1.0, "step": 1151, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 386.421875, "epoch": 0.035032234521347766, "grad_norm": 0.8516265507210926, "kl": 0.027099609375, "learning_rate": 9.969749189067005e-07, "loss": 0.0011, "reward": 1.8625000715255737, "reward_std": 0.2108437716960907, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1152, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.0, "epoch": 0.03506264444714755, "grad_norm": 1.2170402308144335, "kl": 0.0301513671875, "learning_rate": 9.969696700640565e-07, "loss": 0.0012, "reward": 1.8844451904296875, "reward_std": 0.029813367873430252, "rewards/accuracy_reward": 0.7250702381134033, "rewards/format_reward": 1.0, "step": 1153, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 415.671875, "epoch": 0.03509305437294733, "grad_norm": 0.7148363338812642, "kl": 0.035400390625, "learning_rate": 9.969644166855449e-07, "loss": 0.0014, "reward": 1.4465069770812988, "reward_std": 0.11390846967697144, "rewards/accuracy_reward": 0.37150701880455017, "rewards/format_reward": 0.984375, "step": 1154, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 376.125, "epoch": 0.03512346429874711, "grad_norm": 0.8814426695820301, "kl": 0.03076171875, "learning_rate": 9.969591587712138e-07, "loss": 0.0012, "reward": 1.5531249046325684, "reward_std": 0.11738666892051697, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 1155, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 377.4375, "epoch": 0.03515387422454689, "grad_norm": 2.503417425861633, "kl": 0.038330078125, "learning_rate": 9.969538963211114e-07, "loss": 0.0015, "reward": 1.6409857273101807, "reward_std": 0.21098706126213074, "rewards/accuracy_reward": 0.5128607153892517, "rewards/format_reward": 1.0, "step": 1156, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 376.625, "epoch": 0.035184284150346676, "grad_norm": 1.6032154488382135, "kl": 0.04248046875, "learning_rate": 9.969486293352855e-07, "loss": 0.0017, "reward": 1.4589827060699463, "reward_std": 0.08787427842617035, "rewards/accuracy_reward": 0.34648269414901733, "rewards/format_reward": 1.0, "step": 1157, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 386.4375, "epoch": 0.035214694076146455, "grad_norm": 1.00080809949175, "kl": 0.032958984375, "learning_rate": 9.969433578137843e-07, "loss": 0.0013, "reward": 1.6479413509368896, "reward_std": 0.026308942586183548, "rewards/accuracy_reward": 0.5385662913322449, "rewards/format_reward": 1.0, "step": 1158, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 355.734375, "epoch": 0.035245104001946234, "grad_norm": 1.1514138514700039, "kl": 0.040283203125, "learning_rate": 9.96938081756656e-07, "loss": 0.0016, "reward": 2.0572447776794434, "reward_std": 0.11053717881441116, "rewards/accuracy_reward": 0.8759948015213013, "rewards/format_reward": 1.0, "step": 1159, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 450.25, "epoch": 0.03527551392774601, "grad_norm": 0.6436159700997038, "kl": 0.027099609375, "learning_rate": 9.969328011639483e-07, "loss": 0.0011, "reward": 1.5360925197601318, "reward_std": 0.21513505280017853, "rewards/accuracy_reward": 0.4704675078392029, "rewards/format_reward": 0.984375, "step": 1160, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 383.40625, "epoch": 0.0353059238535458, "grad_norm": 0.7721102492301887, "kl": 0.025390625, "learning_rate": 9.969275160357099e-07, "loss": 0.001, "reward": 1.6437499523162842, "reward_std": 0.0909954383969307, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1161, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 370.734375, "epoch": 0.03533633377934558, "grad_norm": 0.6252239478250365, "kl": 0.03271484375, "learning_rate": 9.969222263719888e-07, "loss": 0.0013, "reward": 1.7224416732788086, "reward_std": 0.012419304810464382, "rewards/accuracy_reward": 0.5974416732788086, "rewards/format_reward": 1.0, "step": 1162, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.109375, "epoch": 0.03536674370514536, "grad_norm": 0.7509517056107452, "kl": 0.0289306640625, "learning_rate": 9.969169321728332e-07, "loss": 0.0012, "reward": 1.6746838092803955, "reward_std": 0.058494821190834045, "rewards/accuracy_reward": 0.5590587258338928, "rewards/format_reward": 1.0, "step": 1163, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 399.5, "epoch": 0.03539715363094514, "grad_norm": 1.0944056297816556, "kl": 0.03125, "learning_rate": 9.969116334382916e-07, "loss": 0.0012, "reward": 1.7903258800506592, "reward_std": 0.10033317655324936, "rewards/accuracy_reward": 0.6309508085250854, "rewards/format_reward": 1.0, "step": 1164, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 381.828125, "epoch": 0.03542756355674492, "grad_norm": 1.4459090095298563, "kl": 0.03173828125, "learning_rate": 9.969063301684122e-07, "loss": 0.0013, "reward": 1.8457082509994507, "reward_std": 0.17786380648612976, "rewards/accuracy_reward": 0.701958179473877, "rewards/format_reward": 1.0, "step": 1165, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 364.171875, "epoch": 0.0354579734825447, "grad_norm": 0.6495847827642469, "kl": 0.033203125, "learning_rate": 9.969010223632438e-07, "loss": 0.0013, "reward": 1.8812499046325684, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1166, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 374.609375, "epoch": 0.03548838340834448, "grad_norm": 0.9979204737646302, "kl": 0.03662109375, "learning_rate": 9.968957100228341e-07, "loss": 0.0015, "reward": 1.7093751430511475, "reward_std": 0.2606995701789856, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1167, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 381.640625, "epoch": 0.03551879333414427, "grad_norm": 0.9898733601082234, "kl": 0.0301513671875, "learning_rate": 9.968903931472322e-07, "loss": 0.0012, "reward": 1.8159818649291992, "reward_std": 0.1083039790391922, "rewards/accuracy_reward": 0.6847319006919861, "rewards/format_reward": 0.984375, "step": 1168, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 375.484375, "epoch": 0.03554920325994405, "grad_norm": 1.3182098032840643, "kl": 0.0294189453125, "learning_rate": 9.968850717364868e-07, "loss": 0.0012, "reward": 1.5841600894927979, "reward_std": 0.07941659539937973, "rewards/accuracy_reward": 0.4622851014137268, "rewards/format_reward": 1.0, "step": 1169, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 363.328125, "epoch": 0.035579613185743826, "grad_norm": 0.380305527813375, "kl": 0.03662109375, "learning_rate": 9.968797457906458e-07, "loss": 0.0015, "reward": 1.8250000476837158, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1170, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 372.5625, "epoch": 0.035610023111543605, "grad_norm": 0.8590394745775125, "kl": 0.035888671875, "learning_rate": 9.968744153097582e-07, "loss": 0.0014, "reward": 1.941326379776001, "reward_std": 0.025385325774550438, "rewards/accuracy_reward": 0.7757013440132141, "rewards/format_reward": 1.0, "step": 1171, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 406.84375, "epoch": 0.03564043303734339, "grad_norm": 11.466341526099667, "kl": 0.03271484375, "learning_rate": 9.968690802938726e-07, "loss": 0.0013, "reward": 1.7135803699493408, "reward_std": 0.15738029778003693, "rewards/accuracy_reward": 0.5635802745819092, "rewards/format_reward": 1.0, "step": 1172, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 371.078125, "epoch": 0.03567084296314317, "grad_norm": 0.8272982077412819, "kl": 0.0301513671875, "learning_rate": 9.968637407430375e-07, "loss": 0.0012, "reward": 2.1204514503479004, "reward_std": 0.015612217597663403, "rewards/accuracy_reward": 0.9267013669013977, "rewards/format_reward": 1.0, "step": 1173, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 387.96875, "epoch": 0.03570125288894295, "grad_norm": 0.6965215441219149, "kl": 0.02978515625, "learning_rate": 9.96858396657302e-07, "loss": 0.0012, "reward": 1.8656251430511475, "reward_std": 0.07523491233587265, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1174, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.625, "epoch": 0.03573166281474273, "grad_norm": 1.068305855827611, "kl": 0.0303955078125, "learning_rate": 9.968530480367147e-07, "loss": 0.0012, "reward": 1.9180628061294556, "reward_std": 0.16402453184127808, "rewards/accuracy_reward": 0.7430627346038818, "rewards/format_reward": 1.0, "step": 1175, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 380.34375, "epoch": 0.035762072740542515, "grad_norm": 1.4461056573254005, "kl": 0.028564453125, "learning_rate": 9.968476948813244e-07, "loss": 0.0011, "reward": 1.9896197319030762, "reward_std": 0.10615414381027222, "rewards/accuracy_reward": 0.8083697557449341, "rewards/format_reward": 1.0, "step": 1176, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 398.0625, "epoch": 0.035792482666342294, "grad_norm": 1.099603268497178, "kl": 0.0284423828125, "learning_rate": 9.9684233719118e-07, "loss": 0.0011, "reward": 1.6466586589813232, "reward_std": 0.2337760478258133, "rewards/accuracy_reward": 0.5279086232185364, "rewards/format_reward": 1.0, "step": 1177, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 411.828125, "epoch": 0.035822892592142073, "grad_norm": 1.0809137541164098, "kl": 0.029052734375, "learning_rate": 9.968369749663303e-07, "loss": 0.0012, "reward": 1.6710968017578125, "reward_std": 0.07263951003551483, "rewards/accuracy_reward": 0.561721682548523, "rewards/format_reward": 0.984375, "step": 1178, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.453125, "epoch": 0.03585330251794186, "grad_norm": 1.2441350221369591, "kl": 0.03759765625, "learning_rate": 9.968316082068245e-07, "loss": 0.0015, "reward": 1.8768274784088135, "reward_std": 0.09971503168344498, "rewards/accuracy_reward": 0.7268275618553162, "rewards/format_reward": 1.0, "step": 1179, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 369.203125, "epoch": 0.03588371244374164, "grad_norm": 1.4171047280074038, "kl": 0.030029296875, "learning_rate": 9.968262369127113e-07, "loss": 0.0012, "reward": 1.7928521633148193, "reward_std": 0.1550390124320984, "rewards/accuracy_reward": 0.6553521156311035, "rewards/format_reward": 1.0, "step": 1180, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 406.171875, "epoch": 0.03591412236954142, "grad_norm": 1.8007463661266467, "kl": 0.029052734375, "learning_rate": 9.968208610840395e-07, "loss": 0.0012, "reward": 1.8497192859649658, "reward_std": 0.157828226685524, "rewards/accuracy_reward": 0.7122193574905396, "rewards/format_reward": 1.0, "step": 1181, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 410.765625, "epoch": 0.0359445322953412, "grad_norm": 1.142855906925365, "kl": 0.029296875, "learning_rate": 9.968154807208587e-07, "loss": 0.0012, "reward": 1.6572691202163696, "reward_std": 0.22080466151237488, "rewards/accuracy_reward": 0.5447690486907959, "rewards/format_reward": 0.984375, "step": 1182, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 359.046875, "epoch": 0.03597494222114098, "grad_norm": 1.1347860660889892, "kl": 0.03173828125, "learning_rate": 9.968100958232178e-07, "loss": 0.0013, "reward": 1.7593750953674316, "reward_std": 0.13471055030822754, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1183, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 376.703125, "epoch": 0.03600535214694076, "grad_norm": 0.7133977275239562, "kl": 0.023681640625, "learning_rate": 9.96804706391166e-07, "loss": 0.0009, "reward": 1.8214243650436401, "reward_std": 0.09029192477464676, "rewards/accuracy_reward": 0.6870493292808533, "rewards/format_reward": 1.0, "step": 1184, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 345.71875, "epoch": 0.03603576207274054, "grad_norm": 0.6967976294770689, "kl": 0.03515625, "learning_rate": 9.967993124247523e-07, "loss": 0.0014, "reward": 1.8968751430511475, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1185, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 359.015625, "epoch": 0.03606617199854032, "grad_norm": 1.1186051043445568, "kl": 0.0306396484375, "learning_rate": 9.967939139240263e-07, "loss": 0.0012, "reward": 2.089137554168701, "reward_std": 0.017244329676032066, "rewards/accuracy_reward": 0.8953874111175537, "rewards/format_reward": 1.0, "step": 1186, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.796875, "epoch": 0.03609658192434011, "grad_norm": 1.3285576154380563, "kl": 0.02978515625, "learning_rate": 9.967885108890366e-07, "loss": 0.0012, "reward": 1.7848801612854004, "reward_std": 0.1768539845943451, "rewards/accuracy_reward": 0.6536300778388977, "rewards/format_reward": 1.0, "step": 1187, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 356.5, "epoch": 0.036126991850139886, "grad_norm": 1.4087343721938757, "kl": 0.035888671875, "learning_rate": 9.967831033198332e-07, "loss": 0.0014, "reward": 1.6132287979125977, "reward_std": 0.14344462752342224, "rewards/accuracy_reward": 0.5101038217544556, "rewards/format_reward": 1.0, "step": 1188, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 398.171875, "epoch": 0.036157401775939665, "grad_norm": 1.6305690544642972, "kl": 0.0299072265625, "learning_rate": 9.967776912164654e-07, "loss": 0.0012, "reward": 1.664294719696045, "reward_std": 0.22947821021080017, "rewards/accuracy_reward": 0.5486696362495422, "rewards/format_reward": 1.0, "step": 1189, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 388.359375, "epoch": 0.036187811701739445, "grad_norm": 0.9320953940462177, "kl": 0.029541015625, "learning_rate": 9.967722745789821e-07, "loss": 0.0012, "reward": 2.0042881965637207, "reward_std": 0.026993395760655403, "rewards/accuracy_reward": 0.8324130773544312, "rewards/format_reward": 1.0, "step": 1190, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 370.34375, "epoch": 0.03621822162753923, "grad_norm": 0.5975919574030307, "kl": 0.031982421875, "learning_rate": 9.96766853407433e-07, "loss": 0.0013, "reward": 2.144791603088379, "reward_std": 0.03830162435770035, "rewards/accuracy_reward": 0.9479167461395264, "rewards/format_reward": 1.0, "step": 1191, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 399.90625, "epoch": 0.03624863155333901, "grad_norm": 1.0764676435999878, "kl": 0.0322265625, "learning_rate": 9.967614277018679e-07, "loss": 0.0013, "reward": 1.988457441329956, "reward_std": 0.08253973722457886, "rewards/accuracy_reward": 0.8353323936462402, "rewards/format_reward": 1.0, "step": 1192, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.40625, "epoch": 0.03627904147913879, "grad_norm": 1.2341022368480987, "kl": 0.033935546875, "learning_rate": 9.967559974623359e-07, "loss": 0.0014, "reward": 1.8015625476837158, "reward_std": 0.2570255696773529, "rewards/accuracy_reward": 0.6703125238418579, "rewards/format_reward": 1.0, "step": 1193, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 362.078125, "epoch": 0.036309451404938575, "grad_norm": 0.5645158834454872, "kl": 0.041015625, "learning_rate": 9.967505626888866e-07, "loss": 0.0016, "reward": 2.153818130493164, "reward_std": 0.00623175036162138, "rewards/accuracy_reward": 0.9538180828094482, "rewards/format_reward": 1.0, "step": 1194, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 369.671875, "epoch": 0.036339861330738354, "grad_norm": 0.7895940874474386, "kl": 0.0286865234375, "learning_rate": 9.967451233815697e-07, "loss": 0.0011, "reward": 2.038499355316162, "reward_std": 0.056512411683797836, "rewards/accuracy_reward": 0.8634991645812988, "rewards/format_reward": 1.0, "step": 1195, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 379.25, "epoch": 0.036370271256538134, "grad_norm": 1.3076750391527348, "kl": 0.03515625, "learning_rate": 9.96739679540435e-07, "loss": 0.0014, "reward": 1.9959135055541992, "reward_std": 0.09294147789478302, "rewards/accuracy_reward": 0.8177883625030518, "rewards/format_reward": 1.0, "step": 1196, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 375.75, "epoch": 0.03640068118233791, "grad_norm": 0.32484330199761985, "kl": 0.03515625, "learning_rate": 9.967342311655317e-07, "loss": 0.0014, "reward": 1.859375, "reward_std": 0.06805656850337982, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1197, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 401.5, "epoch": 0.0364310911081377, "grad_norm": 0.9620007160341915, "kl": 0.0361328125, "learning_rate": 9.967287782569103e-07, "loss": 0.0014, "reward": 1.834375023841858, "reward_std": 0.1532049924135208, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1198, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 386.4375, "epoch": 0.03646150103393748, "grad_norm": 0.8673758016973084, "kl": 0.03271484375, "learning_rate": 9.967233208146198e-07, "loss": 0.0013, "reward": 1.709754228591919, "reward_std": 0.08199620991945267, "rewards/accuracy_reward": 0.5785042643547058, "rewards/format_reward": 1.0, "step": 1199, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 408.84375, "epoch": 0.03649191095973726, "grad_norm": 1.1171750789337782, "kl": 0.035400390625, "learning_rate": 9.967178588387105e-07, "loss": 0.0014, "reward": 1.7569730281829834, "reward_std": 0.1504843682050705, "rewards/accuracy_reward": 0.6382229328155518, "rewards/format_reward": 1.0, "step": 1200, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 378.9375, "epoch": 0.03652232088553704, "grad_norm": 1.3634616646782756, "kl": 0.0322265625, "learning_rate": 9.96712392329232e-07, "loss": 0.0013, "reward": 1.7582263946533203, "reward_std": 0.14932964742183685, "rewards/accuracy_reward": 0.6301013231277466, "rewards/format_reward": 1.0, "step": 1201, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 407.421875, "epoch": 0.03655273081133682, "grad_norm": 0.8687880973062533, "kl": 0.022705078125, "learning_rate": 9.967069212862344e-07, "loss": 0.0009, "reward": 1.4812500476837158, "reward_std": 0.18988311290740967, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 1202, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 417.859375, "epoch": 0.0365831407371366, "grad_norm": 0.7836273469848917, "kl": 0.031005859375, "learning_rate": 9.967014457097674e-07, "loss": 0.0012, "reward": 1.3569281101226807, "reward_std": 0.14372044801712036, "rewards/accuracy_reward": 0.3131781220436096, "rewards/format_reward": 0.96875, "step": 1203, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.984375, "epoch": 0.03661355066293638, "grad_norm": 0.8026837070655322, "kl": 0.035888671875, "learning_rate": 9.96695965599881e-07, "loss": 0.0014, "reward": 1.8945523500442505, "reward_std": 0.08972810953855515, "rewards/accuracy_reward": 0.7383023500442505, "rewards/format_reward": 1.0, "step": 1204, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 382.125, "epoch": 0.03664396058873616, "grad_norm": 4.463699689997463, "kl": 0.03369140625, "learning_rate": 9.966904809566254e-07, "loss": 0.0013, "reward": 1.925453543663025, "reward_std": 0.027894020080566406, "rewards/accuracy_reward": 0.7567035555839539, "rewards/format_reward": 1.0, "step": 1205, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 433.03125, "epoch": 0.036674370514535946, "grad_norm": 0.9793168422046096, "kl": 0.0262451171875, "learning_rate": 9.966849917800506e-07, "loss": 0.001, "reward": 1.7894985675811768, "reward_std": 0.11389000713825226, "rewards/accuracy_reward": 0.7519985437393188, "rewards/format_reward": 0.875, "step": 1206, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.265625, "epoch": 0.036704780440335726, "grad_norm": 2.0818830897901486, "kl": 0.033935546875, "learning_rate": 9.966794980702068e-07, "loss": 0.0014, "reward": 1.6943612098693848, "reward_std": 0.035793375223875046, "rewards/accuracy_reward": 0.581861138343811, "rewards/format_reward": 1.0, "step": 1207, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.84375, "epoch": 0.036735190366135505, "grad_norm": 1.178900126330261, "kl": 0.029541015625, "learning_rate": 9.96673999827144e-07, "loss": 0.0012, "reward": 1.8115098476409912, "reward_std": 0.05396632105112076, "rewards/accuracy_reward": 0.6771348714828491, "rewards/format_reward": 1.0, "step": 1208, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 352.1875, "epoch": 0.03676560029193529, "grad_norm": 1.4387193720931983, "kl": 0.03173828125, "learning_rate": 9.96668497050912e-07, "loss": 0.0013, "reward": 1.8645833730697632, "reward_std": 0.10012082010507584, "rewards/accuracy_reward": 0.7239583730697632, "rewards/format_reward": 1.0, "step": 1209, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 365.390625, "epoch": 0.03679601021773507, "grad_norm": 1.1363495127150767, "kl": 0.0284423828125, "learning_rate": 9.966629897415617e-07, "loss": 0.0011, "reward": 1.5947306156158447, "reward_std": 0.07874157279729843, "rewards/accuracy_reward": 0.4853556156158447, "rewards/format_reward": 1.0, "step": 1210, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 387.78125, "epoch": 0.03682642014353485, "grad_norm": 1.208442634663853, "kl": 0.0302734375, "learning_rate": 9.966574778991432e-07, "loss": 0.0012, "reward": 2.0115935802459717, "reward_std": 0.03436083719134331, "rewards/accuracy_reward": 0.8334685564041138, "rewards/format_reward": 1.0, "step": 1211, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 362.875, "epoch": 0.03685683006933463, "grad_norm": 2.8758779767557012, "kl": 0.038330078125, "learning_rate": 9.966519615237065e-07, "loss": 0.0015, "reward": 1.7900360822677612, "reward_std": 0.04290841519832611, "rewards/accuracy_reward": 0.6556610465049744, "rewards/format_reward": 1.0, "step": 1212, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 390.9375, "epoch": 0.036887239995134415, "grad_norm": 1.5541432406253806, "kl": 0.035400390625, "learning_rate": 9.966464406153021e-07, "loss": 0.0014, "reward": 1.955958366394043, "reward_std": 0.14124098420143127, "rewards/accuracy_reward": 0.8215833902359009, "rewards/format_reward": 0.96875, "step": 1213, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 369.375, "epoch": 0.036917649920934194, "grad_norm": 1.0424339826756504, "kl": 0.03173828125, "learning_rate": 9.966409151739806e-07, "loss": 0.0013, "reward": 1.946874976158142, "reward_std": 0.16336661577224731, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1214, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.359375, "epoch": 0.03694805984673397, "grad_norm": 1.58038302498911, "kl": 0.03125, "learning_rate": 9.966353851997923e-07, "loss": 0.0012, "reward": 1.8651032447814941, "reward_std": 0.1377798616886139, "rewards/accuracy_reward": 0.7057281732559204, "rewards/format_reward": 1.0, "step": 1215, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 419.5625, "epoch": 0.03697846977253375, "grad_norm": 1.0880074538339308, "kl": 0.02392578125, "learning_rate": 9.966298506927872e-07, "loss": 0.001, "reward": 1.787438154220581, "reward_std": 0.21425844728946686, "rewards/accuracy_reward": 0.6655632257461548, "rewards/format_reward": 1.0, "step": 1216, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 374.015625, "epoch": 0.03700887969833354, "grad_norm": 3.861972156329695, "kl": 0.02685546875, "learning_rate": 9.966243116530166e-07, "loss": 0.0011, "reward": 1.7696444988250732, "reward_std": 0.15385319292545319, "rewards/accuracy_reward": 0.6352695226669312, "rewards/format_reward": 1.0, "step": 1217, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 413.0625, "epoch": 0.03703928962413332, "grad_norm": 0.906261229520691, "kl": 0.028564453125, "learning_rate": 9.966187680805306e-07, "loss": 0.0011, "reward": 1.5482540130615234, "reward_std": 0.15232303738594055, "rewards/accuracy_reward": 0.45762890577316284, "rewards/format_reward": 1.0, "step": 1218, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 360.515625, "epoch": 0.0370696995499331, "grad_norm": 1.7421324539687955, "kl": 0.036376953125, "learning_rate": 9.966132199753801e-07, "loss": 0.0015, "reward": 2.078726053237915, "reward_std": 0.10794822126626968, "rewards/accuracy_reward": 0.8943510055541992, "rewards/format_reward": 1.0, "step": 1219, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.953125, "epoch": 0.037100109475732876, "grad_norm": 0.5037320591843865, "kl": 0.0284423828125, "learning_rate": 9.966076673376154e-07, "loss": 0.0011, "reward": 2.046875, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1220, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 373.28125, "epoch": 0.03713051940153266, "grad_norm": 2.1830184615082286, "kl": 0.03076171875, "learning_rate": 9.96602110167287e-07, "loss": 0.0012, "reward": 1.8402504920959473, "reward_std": 0.11787135899066925, "rewards/accuracy_reward": 0.6715005040168762, "rewards/format_reward": 1.0, "step": 1221, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 364.8125, "epoch": 0.03716092932733244, "grad_norm": 0.6624015563055085, "kl": 0.032470703125, "learning_rate": 9.965965484644463e-07, "loss": 0.0013, "reward": 1.6531250476837158, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1222, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.296875, "epoch": 0.03719133925313222, "grad_norm": 0.6628709858215975, "kl": 0.0301513671875, "learning_rate": 9.965909822291436e-07, "loss": 0.0012, "reward": 2.046875, "reward_std": 0.1374930739402771, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1223, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.328125, "epoch": 0.037221749178932007, "grad_norm": 0.7873412826967261, "kl": 0.0244140625, "learning_rate": 9.9658541146143e-07, "loss": 0.001, "reward": 1.96875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1224, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 393.671875, "epoch": 0.037252159104731786, "grad_norm": 0.7330801616056122, "kl": 0.036865234375, "learning_rate": 9.965798361613558e-07, "loss": 0.0015, "reward": 2.0437426567077637, "reward_std": 0.01713588461279869, "rewards/accuracy_reward": 0.8499925136566162, "rewards/format_reward": 1.0, "step": 1225, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 373.765625, "epoch": 0.037282569030531565, "grad_norm": 0.29470607435970175, "kl": 0.03173828125, "learning_rate": 9.965742563289723e-07, "loss": 0.0013, "reward": 1.8593751192092896, "reward_std": 0.06805657595396042, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1226, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 373.765625, "epoch": 0.037312978956331344, "grad_norm": 0.7671066076076499, "kl": 0.0284423828125, "learning_rate": 9.965686719643306e-07, "loss": 0.0011, "reward": 1.6811611652374268, "reward_std": 0.13532957434654236, "rewards/accuracy_reward": 0.5530362129211426, "rewards/format_reward": 1.0, "step": 1227, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 413.875, "epoch": 0.03734338888213113, "grad_norm": 0.5839433990046675, "kl": 0.0269775390625, "learning_rate": 9.965630830674811e-07, "loss": 0.0011, "reward": 2.007234811782837, "reward_std": 0.059669144451618195, "rewards/accuracy_reward": 0.8322348594665527, "rewards/format_reward": 1.0, "step": 1228, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 398.296875, "epoch": 0.03737379880793091, "grad_norm": 1.04504483458555, "kl": 0.0302734375, "learning_rate": 9.965574896384752e-07, "loss": 0.0012, "reward": 1.8041876554489136, "reward_std": 0.07023001462221146, "rewards/accuracy_reward": 0.6635626554489136, "rewards/format_reward": 1.0, "step": 1229, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 379.96875, "epoch": 0.03740420873373069, "grad_norm": 0.9801820992728698, "kl": 0.027587890625, "learning_rate": 9.96551891677364e-07, "loss": 0.0011, "reward": 1.927573561668396, "reward_std": 0.08493957668542862, "rewards/accuracy_reward": 0.7775735259056091, "rewards/format_reward": 1.0, "step": 1230, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 361.53125, "epoch": 0.03743461865953047, "grad_norm": 0.9744591801053886, "kl": 0.034423828125, "learning_rate": 9.965462891841985e-07, "loss": 0.0014, "reward": 1.8536531925201416, "reward_std": 0.07960119098424911, "rewards/accuracy_reward": 0.6974031925201416, "rewards/format_reward": 1.0, "step": 1231, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 401.890625, "epoch": 0.037465028585330254, "grad_norm": 1.190577939491672, "kl": 0.0299072265625, "learning_rate": 9.965406821590297e-07, "loss": 0.0012, "reward": 1.827261209487915, "reward_std": 0.1324939727783203, "rewards/accuracy_reward": 0.6678860783576965, "rewards/format_reward": 1.0, "step": 1232, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 352.140625, "epoch": 0.03749543851113003, "grad_norm": 1.0470058319377902, "kl": 0.032470703125, "learning_rate": 9.965350706019089e-07, "loss": 0.0013, "reward": 1.8218339681625366, "reward_std": 0.03220607712864876, "rewards/accuracy_reward": 0.6749589443206787, "rewards/format_reward": 1.0, "step": 1233, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 390.578125, "epoch": 0.03752584843692981, "grad_norm": 1.035546006376402, "kl": 0.0296630859375, "learning_rate": 9.965294545128874e-07, "loss": 0.0012, "reward": 1.9316580295562744, "reward_std": 0.10213324427604675, "rewards/accuracy_reward": 0.7566580176353455, "rewards/format_reward": 1.0, "step": 1234, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 371.578125, "epoch": 0.03755625836272959, "grad_norm": 1.4642897719935828, "kl": 0.03076171875, "learning_rate": 9.965238338920164e-07, "loss": 0.0012, "reward": 1.7755050659179688, "reward_std": 0.22341889142990112, "rewards/accuracy_reward": 0.6692550778388977, "rewards/format_reward": 0.984375, "step": 1235, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 383.484375, "epoch": 0.03758666828852938, "grad_norm": 7.040237983053658, "kl": 0.030029296875, "learning_rate": 9.96518208739347e-07, "loss": 0.0012, "reward": 1.5384817123413086, "reward_std": 0.315449059009552, "rewards/accuracy_reward": 0.41973167657852173, "rewards/format_reward": 1.0, "step": 1236, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 353.296875, "epoch": 0.03761707821432916, "grad_norm": 0.530121268428862, "kl": 0.031982421875, "learning_rate": 9.965125790549306e-07, "loss": 0.0013, "reward": 1.8968749046325684, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1237, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 366.078125, "epoch": 0.037647488140128936, "grad_norm": 2.641844831376638, "kl": 0.03466796875, "learning_rate": 9.96506944838819e-07, "loss": 0.0014, "reward": 1.5409669876098633, "reward_std": 0.17587411403656006, "rewards/accuracy_reward": 0.43471699953079224, "rewards/format_reward": 1.0, "step": 1238, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 368.984375, "epoch": 0.03767789806592872, "grad_norm": 10.781444891274246, "kl": 0.0267333984375, "learning_rate": 9.965013060910632e-07, "loss": 0.0011, "reward": 1.7990696430206299, "reward_std": 0.12815704941749573, "rewards/accuracy_reward": 0.667819619178772, "rewards/format_reward": 1.0, "step": 1239, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 369.921875, "epoch": 0.0377083079917285, "grad_norm": 1.3664953017098478, "kl": 0.031982421875, "learning_rate": 9.964956628117147e-07, "loss": 0.0013, "reward": 1.7144029140472412, "reward_std": 0.16515496373176575, "rewards/accuracy_reward": 0.570652961730957, "rewards/format_reward": 1.0, "step": 1240, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 336.6875, "epoch": 0.03773871791752828, "grad_norm": 1.6518992062464046, "kl": 0.034912109375, "learning_rate": 9.96490015000825e-07, "loss": 0.0014, "reward": 1.9328126907348633, "reward_std": 0.1499861478805542, "rewards/accuracy_reward": 0.8046875, "rewards/format_reward": 1.0, "step": 1241, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.453125, "epoch": 0.03776912784332806, "grad_norm": 1.8315896123879818, "kl": 0.0322265625, "learning_rate": 9.964843626584459e-07, "loss": 0.0013, "reward": 1.7527251243591309, "reward_std": 0.056719888001680374, "rewards/accuracy_reward": 0.6089749932289124, "rewards/format_reward": 1.0, "step": 1242, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 360.40625, "epoch": 0.037799537769127846, "grad_norm": 0.9440690682440332, "kl": 0.0284423828125, "learning_rate": 9.96478705784629e-07, "loss": 0.0011, "reward": 1.820537805557251, "reward_std": 0.014862941578030586, "rewards/accuracy_reward": 0.6736628413200378, "rewards/format_reward": 1.0, "step": 1243, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.609375, "epoch": 0.037829947694927625, "grad_norm": 4.170969734855603, "kl": 0.0308837890625, "learning_rate": 9.964730443794254e-07, "loss": 0.0012, "reward": 1.7946250438690186, "reward_std": 0.059131920337677, "rewards/accuracy_reward": 0.6446249485015869, "rewards/format_reward": 1.0, "step": 1244, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 367.609375, "epoch": 0.037860357620727404, "grad_norm": 0.8027578930664081, "kl": 0.0216064453125, "learning_rate": 9.964673784428872e-07, "loss": 0.0009, "reward": 1.7999999523162842, "reward_std": 0.20830951631069183, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1245, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.359375, "epoch": 0.03789076754652718, "grad_norm": 1.4802423240245264, "kl": 0.0269775390625, "learning_rate": 9.964617079750662e-07, "loss": 0.0011, "reward": 1.9007866382598877, "reward_std": 0.10542438179254532, "rewards/accuracy_reward": 0.7382866144180298, "rewards/format_reward": 1.0, "step": 1246, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.71875, "epoch": 0.03792117747232697, "grad_norm": 1.2888714387797215, "kl": 0.034423828125, "learning_rate": 9.964560329760137e-07, "loss": 0.0014, "reward": 1.8350915908813477, "reward_std": 0.0758938416838646, "rewards/accuracy_reward": 0.6975915431976318, "rewards/format_reward": 1.0, "step": 1247, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 387.078125, "epoch": 0.03795158739812675, "grad_norm": 1.3056266517770545, "kl": 0.033447265625, "learning_rate": 9.96450353445782e-07, "loss": 0.0013, "reward": 2.0613996982574463, "reward_std": 0.1021379679441452, "rewards/accuracy_reward": 0.8801496028900146, "rewards/format_reward": 1.0, "step": 1248, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 356.828125, "epoch": 0.03798199732392653, "grad_norm": 0.6565394948757017, "kl": 0.03369140625, "learning_rate": 9.964446693844226e-07, "loss": 0.0013, "reward": 2.049384355545044, "reward_std": 0.08088602870702744, "rewards/accuracy_reward": 0.8587594032287598, "rewards/format_reward": 1.0, "step": 1249, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 391.390625, "epoch": 0.038012407249726314, "grad_norm": 0.6279364459235974, "kl": 0.0224609375, "learning_rate": 9.964389807919877e-07, "loss": 0.0009, "reward": 1.7124006748199463, "reward_std": 0.15352453291416168, "rewards/accuracy_reward": 0.5967757701873779, "rewards/format_reward": 1.0, "step": 1250, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 360.734375, "epoch": 0.03804281717552609, "grad_norm": 0.571488166215603, "kl": 0.028564453125, "learning_rate": 9.964332876685288e-07, "loss": 0.0011, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1251, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 384.484375, "epoch": 0.03807322710132587, "grad_norm": 0.86930372825186, "kl": 0.026123046875, "learning_rate": 9.964275900140983e-07, "loss": 0.001, "reward": 1.76670241355896, "reward_std": 0.13029633462429047, "rewards/accuracy_reward": 0.6323273777961731, "rewards/format_reward": 1.0, "step": 1252, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.03125, "epoch": 0.03810363702712565, "grad_norm": 1.0540194890672043, "kl": 0.0322265625, "learning_rate": 9.964218878287478e-07, "loss": 0.0013, "reward": 2.004464864730835, "reward_std": 0.05205019190907478, "rewards/accuracy_reward": 0.8044646978378296, "rewards/format_reward": 1.0, "step": 1253, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.53125, "epoch": 0.03813404695292544, "grad_norm": 0.32555587211172576, "kl": 0.0302734375, "learning_rate": 9.964161811125296e-07, "loss": 0.0012, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1254, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 386.015625, "epoch": 0.03816445687872522, "grad_norm": 0.9538431294251063, "kl": 0.02587890625, "learning_rate": 9.964104698654958e-07, "loss": 0.001, "reward": 1.943750023841858, "reward_std": 0.07353248447179794, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1255, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.703125, "epoch": 0.038194866804524996, "grad_norm": 0.5389047522384508, "kl": 0.031982421875, "learning_rate": 9.964047540876984e-07, "loss": 0.0013, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1256, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 363.453125, "epoch": 0.038225276730324775, "grad_norm": 1.1029472720095477, "kl": 0.0390625, "learning_rate": 9.963990337791895e-07, "loss": 0.0016, "reward": 1.788149118423462, "reward_std": 0.11239831149578094, "rewards/accuracy_reward": 0.6381491422653198, "rewards/format_reward": 1.0, "step": 1257, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 412.859375, "epoch": 0.03825568665612456, "grad_norm": 0.38396649116414483, "kl": 0.0257568359375, "learning_rate": 9.963933089400215e-07, "loss": 0.001, "reward": 1.59375, "reward_std": 0.12730026245117188, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1258, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 351.375, "epoch": 0.03828609658192434, "grad_norm": 0.732037344403108, "kl": 0.033935546875, "learning_rate": 9.963875795702466e-07, "loss": 0.0014, "reward": 2.0218751430511475, "reward_std": 0.0659717470407486, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1259, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 380.03125, "epoch": 0.03831650650772412, "grad_norm": 1.2927377093265666, "kl": 0.02880859375, "learning_rate": 9.963818456699169e-07, "loss": 0.0012, "reward": 2.0321199893951416, "reward_std": 0.054939642548561096, "rewards/accuracy_reward": 0.8414947986602783, "rewards/format_reward": 1.0, "step": 1260, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 385.734375, "epoch": 0.0383469164335239, "grad_norm": 2.7591561724178435, "kl": 0.03369140625, "learning_rate": 9.963761072390852e-07, "loss": 0.0013, "reward": 1.5007102489471436, "reward_std": 0.1544691026210785, "rewards/accuracy_reward": 0.4007102847099304, "rewards/format_reward": 1.0, "step": 1261, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 378.71875, "epoch": 0.038377326359323685, "grad_norm": 1.444546466930967, "kl": 0.0322265625, "learning_rate": 9.963703642778033e-07, "loss": 0.0013, "reward": 1.5905085802078247, "reward_std": 0.15695898234844208, "rewards/accuracy_reward": 0.46550852060317993, "rewards/format_reward": 1.0, "step": 1262, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 368.1875, "epoch": 0.038407736285123464, "grad_norm": 0.5595674624429317, "kl": 0.0281982421875, "learning_rate": 9.96364616786124e-07, "loss": 0.0011, "reward": 1.7468750476837158, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1263, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 357.390625, "epoch": 0.038438146210923244, "grad_norm": 1.1833190077655955, "kl": 0.033447265625, "learning_rate": 9.963588647640995e-07, "loss": 0.0013, "reward": 1.7588131427764893, "reward_std": 0.17466342449188232, "rewards/accuracy_reward": 0.6119380593299866, "rewards/format_reward": 1.0, "step": 1264, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 348.375, "epoch": 0.03846855613672303, "grad_norm": 1.602818390916467, "kl": 0.0322265625, "learning_rate": 9.963531082117826e-07, "loss": 0.0013, "reward": 1.8966145515441895, "reward_std": 0.18168765306472778, "rewards/accuracy_reward": 0.7528645992279053, "rewards/format_reward": 1.0, "step": 1265, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 353.859375, "epoch": 0.03849896606252281, "grad_norm": 0.6380081656374837, "kl": 0.0306396484375, "learning_rate": 9.963473471292257e-07, "loss": 0.0012, "reward": 1.9627233743667603, "reward_std": 0.008838835172355175, "rewards/accuracy_reward": 0.7908483743667603, "rewards/format_reward": 1.0, "step": 1266, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 378.15625, "epoch": 0.03852937598832259, "grad_norm": 2.10252359125003, "kl": 0.037109375, "learning_rate": 9.963415815164814e-07, "loss": 0.0015, "reward": 1.583824634552002, "reward_std": 0.2249050736427307, "rewards/accuracy_reward": 0.46819961071014404, "rewards/format_reward": 1.0, "step": 1267, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 359.90625, "epoch": 0.03855978591412237, "grad_norm": 1.0805066568184425, "kl": 0.03369140625, "learning_rate": 9.963358113736022e-07, "loss": 0.0014, "reward": 1.4713525772094727, "reward_std": 0.08763913810253143, "rewards/accuracy_reward": 0.38697749376296997, "rewards/format_reward": 1.0, "step": 1268, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 356.0, "epoch": 0.03859019583992215, "grad_norm": 0.9904265559039437, "kl": 0.0303955078125, "learning_rate": 9.963300367006409e-07, "loss": 0.0012, "reward": 1.5116394758224487, "reward_std": 0.11188942193984985, "rewards/accuracy_reward": 0.41163942217826843, "rewards/format_reward": 1.0, "step": 1269, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 352.703125, "epoch": 0.03862060576572193, "grad_norm": 1.166406805130573, "kl": 0.03271484375, "learning_rate": 9.9632425749765e-07, "loss": 0.0013, "reward": 1.9444780349731445, "reward_std": 0.1057472825050354, "rewards/accuracy_reward": 0.7788530588150024, "rewards/format_reward": 1.0, "step": 1270, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 356.265625, "epoch": 0.03865101569152171, "grad_norm": 0.057644823582353386, "kl": 0.027099609375, "learning_rate": 9.963184737646826e-07, "loss": 0.0011, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1271, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 348.875, "epoch": 0.03868142561732149, "grad_norm": 1.1198136967958012, "kl": 0.033203125, "learning_rate": 9.96312685501791e-07, "loss": 0.0013, "reward": 1.9925200939178467, "reward_std": 0.040434785187244415, "rewards/accuracy_reward": 0.8175201416015625, "rewards/format_reward": 1.0, "step": 1272, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 389.703125, "epoch": 0.03871183554312128, "grad_norm": 0.7362419005442138, "kl": 0.032470703125, "learning_rate": 9.963068927090287e-07, "loss": 0.0013, "reward": 1.8125, "reward_std": 0.11499667167663574, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.90625, "step": 1273, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 406.375, "epoch": 0.038742245468921056, "grad_norm": 0.865928850864484, "kl": 0.02685546875, "learning_rate": 9.963010953864481e-07, "loss": 0.0011, "reward": 1.7434850931167603, "reward_std": 0.08635497093200684, "rewards/accuracy_reward": 0.5997350811958313, "rewards/format_reward": 1.0, "step": 1274, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.078125, "epoch": 0.038772655394720835, "grad_norm": 1.0034849049090346, "kl": 0.0302734375, "learning_rate": 9.962952935341022e-07, "loss": 0.0012, "reward": 1.953125, "reward_std": 0.14205020666122437, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1275, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 374.65625, "epoch": 0.038803065320520615, "grad_norm": 2.2836808464223552, "kl": 0.03466796875, "learning_rate": 9.962894871520439e-07, "loss": 0.0014, "reward": 1.7178198099136353, "reward_std": 0.023216933012008667, "rewards/accuracy_reward": 0.5740697979927063, "rewards/format_reward": 1.0, "step": 1276, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 423.96875, "epoch": 0.0388334752463204, "grad_norm": 0.8396630741137664, "kl": 0.0238037109375, "learning_rate": 9.962836762403263e-07, "loss": 0.001, "reward": 1.4961453676223755, "reward_std": 0.1826830357313156, "rewards/accuracy_reward": 0.41489535570144653, "rewards/format_reward": 0.984375, "step": 1277, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 361.734375, "epoch": 0.03886388517212018, "grad_norm": 0.6423896831532357, "kl": 0.039794921875, "learning_rate": 9.962778607990024e-07, "loss": 0.0016, "reward": 2.1280763149261475, "reward_std": 0.015616388991475105, "rewards/accuracy_reward": 0.9312012791633606, "rewards/format_reward": 1.0, "step": 1278, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 353.90625, "epoch": 0.03889429509791996, "grad_norm": 2.1235720959667717, "kl": 0.03515625, "learning_rate": 9.962720408281252e-07, "loss": 0.0014, "reward": 1.3937880992889404, "reward_std": 0.12773333489894867, "rewards/accuracy_reward": 0.3156631588935852, "rewards/format_reward": 1.0, "step": 1279, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.453125, "epoch": 0.038924705023719745, "grad_norm": 1.0389428900399809, "kl": 0.036376953125, "learning_rate": 9.96266216327748e-07, "loss": 0.0015, "reward": 1.963141679763794, "reward_std": 0.029613522812724113, "rewards/accuracy_reward": 0.7631416320800781, "rewards/format_reward": 1.0, "step": 1280, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 390.140625, "epoch": 0.038955114949519525, "grad_norm": 1.1055251004611069, "kl": 0.030517578125, "learning_rate": 9.962603872979238e-07, "loss": 0.0012, "reward": 1.6734874248504639, "reward_std": 0.16549795866012573, "rewards/accuracy_reward": 0.5328624248504639, "rewards/format_reward": 1.0, "step": 1281, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.390625, "epoch": 0.038985524875319304, "grad_norm": 0.4316540582810598, "kl": 0.034423828125, "learning_rate": 9.962545537387058e-07, "loss": 0.0014, "reward": 1.9568827152252197, "reward_std": 0.0059239245019853115, "rewards/accuracy_reward": 0.7818825840950012, "rewards/format_reward": 1.0, "step": 1282, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 353.484375, "epoch": 0.03901593480111908, "grad_norm": 0.9183092673750451, "kl": 0.039306640625, "learning_rate": 9.962487156501474e-07, "loss": 0.0016, "reward": 1.9093750715255737, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1283, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 398.046875, "epoch": 0.03904634472691887, "grad_norm": 1.7521040314494982, "kl": 0.031005859375, "learning_rate": 9.962428730323017e-07, "loss": 0.0012, "reward": 1.639801025390625, "reward_std": 0.11338851600885391, "rewards/accuracy_reward": 0.533551037311554, "rewards/format_reward": 0.984375, "step": 1284, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 388.296875, "epoch": 0.03907675465271865, "grad_norm": 0.7235301366831329, "kl": 0.039794921875, "learning_rate": 9.96237025885222e-07, "loss": 0.0016, "reward": 1.8937499523162842, "reward_std": 0.0530330054461956, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1285, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 373.40625, "epoch": 0.03910716457851843, "grad_norm": 0.5115175019287024, "kl": 0.041015625, "learning_rate": 9.96231174208962e-07, "loss": 0.0016, "reward": 2.003525733947754, "reward_std": 0.005139642860740423, "rewards/accuracy_reward": 0.8285256624221802, "rewards/format_reward": 1.0, "step": 1286, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.234375, "epoch": 0.03913757450431821, "grad_norm": 0.50161282244396, "kl": 0.03857421875, "learning_rate": 9.962253180035747e-07, "loss": 0.0015, "reward": 2.0642921924591064, "reward_std": 0.0130257373675704, "rewards/accuracy_reward": 0.8642921447753906, "rewards/format_reward": 1.0, "step": 1287, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 412.0625, "epoch": 0.03916798443011799, "grad_norm": 0.607991605189762, "kl": 0.03173828125, "learning_rate": 9.962194572691139e-07, "loss": 0.0013, "reward": 1.7259693145751953, "reward_std": 0.1509249210357666, "rewards/accuracy_reward": 0.5853441953659058, "rewards/format_reward": 1.0, "step": 1288, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 400.796875, "epoch": 0.03919839435591777, "grad_norm": 0.7244747857353911, "kl": 0.0274658203125, "learning_rate": 9.962135920056328e-07, "loss": 0.0011, "reward": 1.9484374523162842, "reward_std": 0.147711843252182, "rewards/accuracy_reward": 0.7890625, "rewards/format_reward": 1.0, "step": 1289, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 406.5625, "epoch": 0.03922880428171755, "grad_norm": 0.7626932135649339, "kl": 0.030517578125, "learning_rate": 9.962077222131852e-07, "loss": 0.0012, "reward": 1.8417868614196777, "reward_std": 0.06385098397731781, "rewards/accuracy_reward": 0.7136617302894592, "rewards/format_reward": 1.0, "step": 1290, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 356.875, "epoch": 0.03925921420751733, "grad_norm": 0.7256042486802683, "kl": 0.04931640625, "learning_rate": 9.962018478918244e-07, "loss": 0.002, "reward": 2.1593751907348633, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1291, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 365.59375, "epoch": 0.039289624133317116, "grad_norm": 0.8201631840296107, "kl": 0.0341796875, "learning_rate": 9.961959690416041e-07, "loss": 0.0014, "reward": 1.6178460121154785, "reward_std": 0.06796128302812576, "rewards/accuracy_reward": 0.5115960836410522, "rewards/format_reward": 1.0, "step": 1292, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 362.625, "epoch": 0.039320034059116896, "grad_norm": 0.08207798670455557, "kl": 0.03759765625, "learning_rate": 9.961900856625782e-07, "loss": 0.0015, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1293, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 365.921875, "epoch": 0.039350443984916675, "grad_norm": 1.3529287547770732, "kl": 0.0419921875, "learning_rate": 9.961841977548e-07, "loss": 0.0017, "reward": 1.6968610286712646, "reward_std": 0.07324168086051941, "rewards/accuracy_reward": 0.5687360763549805, "rewards/format_reward": 1.0, "step": 1294, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 379.375, "epoch": 0.03938085391071646, "grad_norm": 0.7799968902292901, "kl": 0.028564453125, "learning_rate": 9.961783053183235e-07, "loss": 0.0011, "reward": 1.9093749523162842, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1295, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 346.4375, "epoch": 0.03941126383651624, "grad_norm": 0.8301073063285291, "kl": 0.0361328125, "learning_rate": 9.961724083532025e-07, "loss": 0.0014, "reward": 2.0250000953674316, "reward_std": 0.06134308874607086, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1296, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 352.890625, "epoch": 0.03944167376231602, "grad_norm": 5.88803715755647, "kl": 0.03369140625, "learning_rate": 9.961665068594906e-07, "loss": 0.0013, "reward": 2.1190028190612793, "reward_std": 0.09822721779346466, "rewards/accuracy_reward": 0.9283777475357056, "rewards/format_reward": 1.0, "step": 1297, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.203125, "epoch": 0.0394720836881158, "grad_norm": 1.2677164076099372, "kl": 0.036376953125, "learning_rate": 9.96160600837242e-07, "loss": 0.0015, "reward": 1.8633277416229248, "reward_std": 0.13791996240615845, "rewards/accuracy_reward": 0.7133276462554932, "rewards/format_reward": 1.0, "step": 1298, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 374.28125, "epoch": 0.039502493613915585, "grad_norm": 1.3311327494857392, "kl": 0.0289306640625, "learning_rate": 9.961546902865102e-07, "loss": 0.0012, "reward": 1.8705925941467285, "reward_std": 0.06272601336240768, "rewards/accuracy_reward": 0.7268426418304443, "rewards/format_reward": 1.0, "step": 1299, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 372.796875, "epoch": 0.039532903539715364, "grad_norm": 0.5729202564215646, "kl": 0.033935546875, "learning_rate": 9.961487752073493e-07, "loss": 0.0014, "reward": 1.8011949062347412, "reward_std": 0.09193049371242523, "rewards/accuracy_reward": 0.6668198704719543, "rewards/format_reward": 1.0, "step": 1300, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 349.125, "epoch": 0.03956331346551514, "grad_norm": 2.240176080055285, "kl": 0.033447265625, "learning_rate": 9.961428555998137e-07, "loss": 0.0013, "reward": 2.001201868057251, "reward_std": 0.1575872302055359, "rewards/accuracy_reward": 0.8293269276618958, "rewards/format_reward": 1.0, "step": 1301, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 345.484375, "epoch": 0.03959372339131492, "grad_norm": 0.95299612843412, "kl": 0.0322265625, "learning_rate": 9.96136931463957e-07, "loss": 0.0013, "reward": 2.0250000953674316, "reward_std": 0.12246952205896378, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1302, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 421.78125, "epoch": 0.03962413331711471, "grad_norm": 1.1609403597347412, "kl": 0.02490234375, "learning_rate": 9.96131002799833e-07, "loss": 0.001, "reward": 1.5474251508712769, "reward_std": 0.1381864994764328, "rewards/accuracy_reward": 0.43180012702941895, "rewards/format_reward": 1.0, "step": 1303, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 363.171875, "epoch": 0.03965454324291449, "grad_norm": 1.501789258861905, "kl": 0.029541015625, "learning_rate": 9.961250696074963e-07, "loss": 0.0012, "reward": 1.6882401704788208, "reward_std": 0.16457030177116394, "rewards/accuracy_reward": 0.5694901347160339, "rewards/format_reward": 1.0, "step": 1304, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 365.1875, "epoch": 0.03968495316871427, "grad_norm": 1.4016039121676045, "kl": 0.028076171875, "learning_rate": 9.96119131887001e-07, "loss": 0.0011, "reward": 1.5366151332855225, "reward_std": 0.01780317723751068, "rewards/accuracy_reward": 0.4397400915622711, "rewards/format_reward": 1.0, "step": 1305, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 362.796875, "epoch": 0.039715363094514046, "grad_norm": 1.2074679919637177, "kl": 0.032958984375, "learning_rate": 9.961131896384013e-07, "loss": 0.0013, "reward": 1.9812500476837158, "reward_std": 0.13402831554412842, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1306, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 355.328125, "epoch": 0.03974577302031383, "grad_norm": 0.563225816831639, "kl": 0.0289306640625, "learning_rate": 9.96107242861751e-07, "loss": 0.0012, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1307, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.6875, "epoch": 0.03977618294611361, "grad_norm": 2.038779470571373, "kl": 0.037109375, "learning_rate": 9.96101291557105e-07, "loss": 0.0015, "reward": 1.816410779953003, "reward_std": 0.054240547120571136, "rewards/accuracy_reward": 0.6539105772972107, "rewards/format_reward": 1.0, "step": 1308, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 338.421875, "epoch": 0.03980659287191339, "grad_norm": 0.9253009394332102, "kl": 0.03466796875, "learning_rate": 9.960953357245173e-07, "loss": 0.0014, "reward": 1.6031250953674316, "reward_std": 0.067339688539505, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1309, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 389.59375, "epoch": 0.03983700279771318, "grad_norm": 1.4771398119031744, "kl": 0.031494140625, "learning_rate": 9.960893753640422e-07, "loss": 0.0013, "reward": 1.8139492273330688, "reward_std": 0.14914552867412567, "rewards/accuracy_reward": 0.6576991677284241, "rewards/format_reward": 1.0, "step": 1310, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 387.3125, "epoch": 0.039867412723512956, "grad_norm": 1.2975032790808658, "kl": 0.035888671875, "learning_rate": 9.960834104757342e-07, "loss": 0.0014, "reward": 1.7364957332611084, "reward_std": 0.18749645352363586, "rewards/accuracy_reward": 0.5802457332611084, "rewards/format_reward": 1.0, "step": 1311, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 399.15625, "epoch": 0.039897822649312735, "grad_norm": 2.7914229876516967, "kl": 0.032958984375, "learning_rate": 9.960774410596477e-07, "loss": 0.0013, "reward": 1.7136642932891846, "reward_std": 0.15386910736560822, "rewards/accuracy_reward": 0.6511642932891846, "rewards/format_reward": 0.921875, "step": 1312, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 397.765625, "epoch": 0.039928232575112514, "grad_norm": 2.2107138683935825, "kl": 0.028564453125, "learning_rate": 9.960714671158373e-07, "loss": 0.0011, "reward": 1.7863473892211914, "reward_std": 0.11173753440380096, "rewards/accuracy_reward": 0.6519724130630493, "rewards/format_reward": 1.0, "step": 1313, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.28125, "epoch": 0.0399586425009123, "grad_norm": 1.0169353253453348, "kl": 0.02783203125, "learning_rate": 9.960654886443573e-07, "loss": 0.0011, "reward": 1.8392975330352783, "reward_std": 0.19117602705955505, "rewards/accuracy_reward": 0.7080473899841309, "rewards/format_reward": 1.0, "step": 1314, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 357.375, "epoch": 0.03998905242671208, "grad_norm": 1.1942419532258988, "kl": 0.0400390625, "learning_rate": 9.960595056452625e-07, "loss": 0.0016, "reward": 1.8291510343551636, "reward_std": 0.072826087474823, "rewards/accuracy_reward": 0.6854010820388794, "rewards/format_reward": 1.0, "step": 1315, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 396.953125, "epoch": 0.04001946235251186, "grad_norm": 1.0337391045249644, "kl": 0.0244140625, "learning_rate": 9.960535181186075e-07, "loss": 0.001, "reward": 1.7338731288909912, "reward_std": 0.1774049699306488, "rewards/accuracy_reward": 0.6088730096817017, "rewards/format_reward": 1.0, "step": 1316, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 381.46875, "epoch": 0.04004987227831164, "grad_norm": 0.9211275913192079, "kl": 0.032470703125, "learning_rate": 9.960475260644468e-07, "loss": 0.0013, "reward": 1.9375, "reward_std": 0.10867312550544739, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1317, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 360.84375, "epoch": 0.040080282204111424, "grad_norm": 3.445572032114775, "kl": 0.0299072265625, "learning_rate": 9.96041529482835e-07, "loss": 0.0012, "reward": 2.0757908821105957, "reward_std": 0.0918976366519928, "rewards/accuracy_reward": 0.885165810585022, "rewards/format_reward": 1.0, "step": 1318, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 366.125, "epoch": 0.0401106921299112, "grad_norm": 3.3390643076864177, "kl": 0.0283203125, "learning_rate": 9.960355283738272e-07, "loss": 0.0011, "reward": 1.9224340915679932, "reward_std": 0.0858631283044815, "rewards/accuracy_reward": 0.7536839842796326, "rewards/format_reward": 1.0, "step": 1319, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.265625, "epoch": 0.04014110205571098, "grad_norm": 0.9561689460043742, "kl": 0.0341796875, "learning_rate": 9.96029522737478e-07, "loss": 0.0014, "reward": 1.9840564727783203, "reward_std": 0.10123483836650848, "rewards/accuracy_reward": 0.8153064250946045, "rewards/format_reward": 1.0, "step": 1320, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 371.5, "epoch": 0.04017151198151077, "grad_norm": 1.1189274987324662, "kl": 0.033935546875, "learning_rate": 9.960235125738418e-07, "loss": 0.0014, "reward": 1.816159963607788, "reward_std": 0.003463847329840064, "rewards/accuracy_reward": 0.666159987449646, "rewards/format_reward": 1.0, "step": 1321, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.96875, "epoch": 0.04020192190731055, "grad_norm": 0.9504857076018339, "kl": 0.037109375, "learning_rate": 9.960174978829742e-07, "loss": 0.0015, "reward": 1.8848531246185303, "reward_std": 0.03111434541642666, "rewards/accuracy_reward": 0.7004780769348145, "rewards/format_reward": 1.0, "step": 1322, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 398.03125, "epoch": 0.04023233183311033, "grad_norm": 2.36202888937614, "kl": 0.033447265625, "learning_rate": 9.960114786649293e-07, "loss": 0.0013, "reward": 1.8598071336746216, "reward_std": 0.15658432245254517, "rewards/accuracy_reward": 0.703557014465332, "rewards/format_reward": 1.0, "step": 1323, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.015625, "epoch": 0.040262741758910106, "grad_norm": 1.2317196139737694, "kl": 0.037353515625, "learning_rate": 9.960054549197628e-07, "loss": 0.0015, "reward": 1.915244460105896, "reward_std": 0.13039034605026245, "rewards/accuracy_reward": 0.7371194362640381, "rewards/format_reward": 1.0, "step": 1324, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 349.078125, "epoch": 0.04029315168470989, "grad_norm": 1.6130517078895361, "kl": 0.03125, "learning_rate": 9.959994266475293e-07, "loss": 0.0013, "reward": 1.9375, "reward_std": 0.22470125555992126, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1325, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 369.953125, "epoch": 0.04032356161050967, "grad_norm": 0.8290163003969956, "kl": 0.030517578125, "learning_rate": 9.95993393848284e-07, "loss": 0.0012, "reward": 1.8490631580352783, "reward_std": 0.05882937088608742, "rewards/accuracy_reward": 0.6896881461143494, "rewards/format_reward": 1.0, "step": 1326, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 358.921875, "epoch": 0.04035397153630945, "grad_norm": 0.7771225119765722, "kl": 0.0341796875, "learning_rate": 9.959873565220814e-07, "loss": 0.0014, "reward": 1.7544031143188477, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.6325280666351318, "rewards/format_reward": 1.0, "step": 1327, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 370.1875, "epoch": 0.04038438146210923, "grad_norm": 1.2361075223311364, "kl": 0.03271484375, "learning_rate": 9.959813146689774e-07, "loss": 0.0013, "reward": 1.999735713005066, "reward_std": 0.10615263879299164, "rewards/accuracy_reward": 0.8122357130050659, "rewards/format_reward": 1.0, "step": 1328, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 363.140625, "epoch": 0.040414791387909016, "grad_norm": 1.1421687131580691, "kl": 0.035888671875, "learning_rate": 9.959752682890264e-07, "loss": 0.0014, "reward": 2.098296642303467, "reward_std": 0.10883867740631104, "rewards/accuracy_reward": 0.9076717495918274, "rewards/format_reward": 1.0, "step": 1329, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 349.765625, "epoch": 0.040445201313708795, "grad_norm": 0.08169214426988264, "kl": 0.037353515625, "learning_rate": 9.959692173822843e-07, "loss": 0.0015, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1330, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 359.84375, "epoch": 0.040475611239508574, "grad_norm": 2.5703292667113584, "kl": 0.035888671875, "learning_rate": 9.959631619488056e-07, "loss": 0.0014, "reward": 1.8603743314743042, "reward_std": 0.07285212725400925, "rewards/accuracy_reward": 0.6947493553161621, "rewards/format_reward": 1.0, "step": 1331, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 377.84375, "epoch": 0.040506021165308354, "grad_norm": 0.7104214303797572, "kl": 0.0263671875, "learning_rate": 9.959571019886463e-07, "loss": 0.0011, "reward": 1.8565657138824463, "reward_std": 0.07906949520111084, "rewards/accuracy_reward": 0.7034406661987305, "rewards/format_reward": 1.0, "step": 1332, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 395.671875, "epoch": 0.04053643109110814, "grad_norm": 1.1342694561075992, "kl": 0.0289306640625, "learning_rate": 9.959510375018611e-07, "loss": 0.0012, "reward": 1.6762497425079346, "reward_std": 0.09515200555324554, "rewards/accuracy_reward": 0.5543747544288635, "rewards/format_reward": 1.0, "step": 1333, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 351.390625, "epoch": 0.04056684101690792, "grad_norm": 2.9223507356928864, "kl": 0.033203125, "learning_rate": 9.959449684885056e-07, "loss": 0.0013, "reward": 1.9155181646347046, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.7561431527137756, "rewards/format_reward": 1.0, "step": 1334, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 414.28125, "epoch": 0.0405972509427077, "grad_norm": 2.0226103270529796, "kl": 0.025634765625, "learning_rate": 9.959388949486355e-07, "loss": 0.001, "reward": 1.719913125038147, "reward_std": 0.17510588467121124, "rewards/accuracy_reward": 0.5480380058288574, "rewards/format_reward": 1.0, "step": 1335, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 346.375, "epoch": 0.040627660868507484, "grad_norm": 0.9813379076880263, "kl": 0.03515625, "learning_rate": 9.959328168823058e-07, "loss": 0.0014, "reward": 1.8983631134033203, "reward_std": 0.013047808781266212, "rewards/accuracy_reward": 0.7514880895614624, "rewards/format_reward": 1.0, "step": 1336, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 378.515625, "epoch": 0.04065807079430726, "grad_norm": 0.6908030927388166, "kl": 0.0263671875, "learning_rate": 9.95926734289572e-07, "loss": 0.0011, "reward": 2.0971951484680176, "reward_std": 0.029635250568389893, "rewards/accuracy_reward": 0.8971948623657227, "rewards/format_reward": 1.0, "step": 1337, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 397.09375, "epoch": 0.04068848072010704, "grad_norm": 1.5392314568723093, "kl": 0.033203125, "learning_rate": 9.959206471704898e-07, "loss": 0.0013, "reward": 1.6306530237197876, "reward_std": 0.0899733230471611, "rewards/accuracy_reward": 0.47440293431282043, "rewards/format_reward": 1.0, "step": 1338, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 362.109375, "epoch": 0.04071889064590682, "grad_norm": 1.5594762859452738, "kl": 0.03173828125, "learning_rate": 9.959145555251147e-07, "loss": 0.0013, "reward": 1.9375, "reward_std": 0.14424708485603333, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1339, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 358.546875, "epoch": 0.04074930057170661, "grad_norm": 1.1133703627868659, "kl": 0.034423828125, "learning_rate": 9.959084593535022e-07, "loss": 0.0014, "reward": 1.8486685752868652, "reward_std": 0.07270568609237671, "rewards/accuracy_reward": 0.6861684322357178, "rewards/format_reward": 1.0, "step": 1340, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 361.25, "epoch": 0.04077971049750639, "grad_norm": 1.4476454114524349, "kl": 0.033203125, "learning_rate": 9.95902358655708e-07, "loss": 0.0013, "reward": 1.9710166454315186, "reward_std": 0.02923586405813694, "rewards/accuracy_reward": 0.8085166215896606, "rewards/format_reward": 1.0, "step": 1341, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 353.6875, "epoch": 0.040810120423306166, "grad_norm": 0.37439872223183535, "kl": 0.0299072265625, "learning_rate": 9.958962534317879e-07, "loss": 0.0012, "reward": 1.881250023841858, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1342, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 368.59375, "epoch": 0.040840530349105945, "grad_norm": 1.2093084754691545, "kl": 0.03369140625, "learning_rate": 9.958901436817976e-07, "loss": 0.0013, "reward": 1.6858181953430176, "reward_std": 0.09985166788101196, "rewards/accuracy_reward": 0.5639432668685913, "rewards/format_reward": 1.0, "step": 1343, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 347.921875, "epoch": 0.04087094027490573, "grad_norm": 1.335591765419783, "kl": 0.041015625, "learning_rate": 9.958840294057926e-07, "loss": 0.0016, "reward": 1.785186767578125, "reward_std": 0.25591838359832764, "rewards/accuracy_reward": 0.6539367437362671, "rewards/format_reward": 1.0, "step": 1344, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 364.015625, "epoch": 0.04090135020070551, "grad_norm": 0.7149604691909549, "kl": 0.031494140625, "learning_rate": 9.958779106038288e-07, "loss": 0.0013, "reward": 2.0577006340026855, "reward_std": 0.08434312045574188, "rewards/accuracy_reward": 0.8670756816864014, "rewards/format_reward": 1.0, "step": 1345, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 378.6875, "epoch": 0.04093176012650529, "grad_norm": 2.7239104313696214, "kl": 0.03369140625, "learning_rate": 9.958717872759624e-07, "loss": 0.0013, "reward": 1.8692957162857056, "reward_std": 0.1602056920528412, "rewards/accuracy_reward": 0.7067955732345581, "rewards/format_reward": 1.0, "step": 1346, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 386.421875, "epoch": 0.04096217005230507, "grad_norm": 4.060058364152238, "kl": 0.0267333984375, "learning_rate": 9.95865659422249e-07, "loss": 0.0011, "reward": 1.7184205055236816, "reward_std": 0.1956622451543808, "rewards/accuracy_reward": 0.5840454697608948, "rewards/format_reward": 1.0, "step": 1347, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.296875, "epoch": 0.040992579978104855, "grad_norm": 1.084755669443812, "kl": 0.031982421875, "learning_rate": 9.958595270427444e-07, "loss": 0.0013, "reward": 1.909855842590332, "reward_std": 0.2032507061958313, "rewards/accuracy_reward": 0.7536057829856873, "rewards/format_reward": 1.0, "step": 1348, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 375.765625, "epoch": 0.041022989903904634, "grad_norm": 1.023990420386371, "kl": 0.036865234375, "learning_rate": 9.958533901375048e-07, "loss": 0.0015, "reward": 1.8748741149902344, "reward_std": 0.01003176998347044, "rewards/accuracy_reward": 0.6998740434646606, "rewards/format_reward": 1.0, "step": 1349, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 382.15625, "epoch": 0.041053399829704414, "grad_norm": 0.7862832436070752, "kl": 0.031494140625, "learning_rate": 9.958472487065861e-07, "loss": 0.0013, "reward": 2.0239477157592773, "reward_std": 0.13823741674423218, "rewards/accuracy_reward": 0.8395727276802063, "rewards/format_reward": 1.0, "step": 1350, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.25, "epoch": 0.0410838097555042, "grad_norm": 0.9568973862143455, "kl": 0.036865234375, "learning_rate": 9.958411027500444e-07, "loss": 0.0015, "reward": 1.834375023841858, "reward_std": 0.25955602526664734, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1351, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 375.234375, "epoch": 0.04111421968130398, "grad_norm": 0.5052447963655162, "kl": 0.04052734375, "learning_rate": 9.958349522679358e-07, "loss": 0.0016, "reward": 1.9608256816864014, "reward_std": 0.005701042246073484, "rewards/accuracy_reward": 0.7858256697654724, "rewards/format_reward": 1.0, "step": 1352, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 357.6875, "epoch": 0.04114462960710376, "grad_norm": 0.09187506500515366, "kl": 0.03759765625, "learning_rate": 9.958287972603165e-07, "loss": 0.0015, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1353, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 374.265625, "epoch": 0.04117503953290354, "grad_norm": 1.358335769314926, "kl": 0.0294189453125, "learning_rate": 9.958226377272425e-07, "loss": 0.0012, "reward": 1.6300781965255737, "reward_std": 0.11866331845521927, "rewards/accuracy_reward": 0.517578125, "rewards/format_reward": 1.0, "step": 1354, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 357.21875, "epoch": 0.041205449458703323, "grad_norm": 0.0747002453531493, "kl": 0.036865234375, "learning_rate": 9.958164736687702e-07, "loss": 0.0015, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1355, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 412.09375, "epoch": 0.0412358593845031, "grad_norm": 0.912485963733874, "kl": 0.025634765625, "learning_rate": 9.958103050849556e-07, "loss": 0.001, "reward": 1.639655351638794, "reward_std": 0.09232194721698761, "rewards/accuracy_reward": 0.527155339717865, "rewards/format_reward": 1.0, "step": 1356, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.125, "epoch": 0.04126626931030288, "grad_norm": 0.8962499470123171, "kl": 0.033203125, "learning_rate": 9.958041319758553e-07, "loss": 0.0013, "reward": 1.8270025253295898, "reward_std": 0.1693069189786911, "rewards/accuracy_reward": 0.6707525253295898, "rewards/format_reward": 1.0, "step": 1357, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 363.890625, "epoch": 0.04129667923610266, "grad_norm": 1.1640764569792519, "kl": 0.0289306640625, "learning_rate": 9.957979543415256e-07, "loss": 0.0012, "reward": 2.0755209922790527, "reward_std": 0.15776163339614868, "rewards/accuracy_reward": 0.9036458730697632, "rewards/format_reward": 1.0, "step": 1358, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 400.578125, "epoch": 0.04132708916190245, "grad_norm": 0.8224545222029924, "kl": 0.03125, "learning_rate": 9.957917721820228e-07, "loss": 0.0013, "reward": 1.8485685586929321, "reward_std": 0.07196305692195892, "rewards/accuracy_reward": 0.686068594455719, "rewards/format_reward": 1.0, "step": 1359, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 381.46875, "epoch": 0.041357499087702226, "grad_norm": 1.0215253339886545, "kl": 0.03173828125, "learning_rate": 9.957855854974031e-07, "loss": 0.0013, "reward": 1.8232738971710205, "reward_std": 0.16958290338516235, "rewards/accuracy_reward": 0.6732739210128784, "rewards/format_reward": 1.0, "step": 1360, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 366.703125, "epoch": 0.041387909013502006, "grad_norm": 1.1505760209014952, "kl": 0.036376953125, "learning_rate": 9.957793942877234e-07, "loss": 0.0015, "reward": 1.9686541557312012, "reward_std": 0.1127479076385498, "rewards/accuracy_reward": 0.8030290603637695, "rewards/format_reward": 1.0, "step": 1361, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 358.078125, "epoch": 0.041418318939301785, "grad_norm": 0.9797118931668743, "kl": 0.046142578125, "learning_rate": 9.9577319855304e-07, "loss": 0.0018, "reward": 1.8024139404296875, "reward_std": 0.03240561857819557, "rewards/accuracy_reward": 0.6711639165878296, "rewards/format_reward": 1.0, "step": 1362, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 382.59375, "epoch": 0.04144872886510157, "grad_norm": 1.1800745232733343, "kl": 0.031494140625, "learning_rate": 9.957669982934095e-07, "loss": 0.0013, "reward": 1.8131953477859497, "reward_std": 0.10240954160690308, "rewards/accuracy_reward": 0.6569451689720154, "rewards/format_reward": 1.0, "step": 1363, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.296875, "epoch": 0.04147913879090135, "grad_norm": 1.579373767967266, "kl": 0.0283203125, "learning_rate": 9.957607935088881e-07, "loss": 0.0011, "reward": 2.082505702972412, "reward_std": 0.08248256146907806, "rewards/accuracy_reward": 0.8950057029724121, "rewards/format_reward": 1.0, "step": 1364, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.875, "epoch": 0.04150954871670113, "grad_norm": 1.0994258225115032, "kl": 0.03271484375, "learning_rate": 9.95754584199533e-07, "loss": 0.0013, "reward": 1.946874976158142, "reward_std": 0.20484475791454315, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1365, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 380.625, "epoch": 0.041539958642500915, "grad_norm": 2.142313473407584, "kl": 0.03466796875, "learning_rate": 9.957483703654007e-07, "loss": 0.0014, "reward": 1.8287945985794067, "reward_std": 0.09270096570253372, "rewards/accuracy_reward": 0.681919515132904, "rewards/format_reward": 1.0, "step": 1366, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 404.796875, "epoch": 0.041570368568300695, "grad_norm": 4.070596419153116, "kl": 0.030517578125, "learning_rate": 9.957421520065478e-07, "loss": 0.0012, "reward": 1.7388451099395752, "reward_std": 0.2863381505012512, "rewards/accuracy_reward": 0.6232200860977173, "rewards/format_reward": 0.984375, "step": 1367, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 364.5, "epoch": 0.041600778494100474, "grad_norm": 0.6985405625163128, "kl": 0.03564453125, "learning_rate": 9.957359291230312e-07, "loss": 0.0014, "reward": 2.117159605026245, "reward_std": 0.06562963873147964, "rewards/accuracy_reward": 0.9202845096588135, "rewards/format_reward": 1.0, "step": 1368, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 353.5, "epoch": 0.04163118841990025, "grad_norm": 0.8939929837988022, "kl": 0.044921875, "learning_rate": 9.957297017149074e-07, "loss": 0.0018, "reward": 2.1624999046325684, "reward_std": 0.1060660183429718, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1369, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.640625, "epoch": 0.04166159834570004, "grad_norm": 0.7851483279213706, "kl": 0.03369140625, "learning_rate": 9.957234697822336e-07, "loss": 0.0014, "reward": 1.9848215579986572, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.8191964626312256, "rewards/format_reward": 1.0, "step": 1370, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 380.890625, "epoch": 0.04169200827149982, "grad_norm": 0.9947247043089857, "kl": 0.038818359375, "learning_rate": 9.957172333250665e-07, "loss": 0.0016, "reward": 1.597571611404419, "reward_std": 0.019277328625321388, "rewards/accuracy_reward": 0.47882160544395447, "rewards/format_reward": 1.0, "step": 1371, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 390.3125, "epoch": 0.0417224181972996, "grad_norm": 1.784295128440716, "kl": 0.0380859375, "learning_rate": 9.95710992343463e-07, "loss": 0.0015, "reward": 1.5076708793640137, "reward_std": 0.26120397448539734, "rewards/accuracy_reward": 0.41392096877098083, "rewards/format_reward": 0.984375, "step": 1372, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 349.171875, "epoch": 0.04175282812309938, "grad_norm": 0.689767236641378, "kl": 0.038818359375, "learning_rate": 9.957047468374802e-07, "loss": 0.0016, "reward": 2.0194854736328125, "reward_std": 0.059365928173065186, "rewards/accuracy_reward": 0.8538603186607361, "rewards/format_reward": 1.0, "step": 1373, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.484375, "epoch": 0.04178323804889916, "grad_norm": 0.8476792433817267, "kl": 0.035400390625, "learning_rate": 9.95698496807175e-07, "loss": 0.0014, "reward": 1.9301159381866455, "reward_std": 0.21660631895065308, "rewards/accuracy_reward": 0.7801159620285034, "rewards/format_reward": 1.0, "step": 1374, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 361.75, "epoch": 0.04181364797469894, "grad_norm": 1.918046473143597, "kl": 0.038330078125, "learning_rate": 9.956922422526044e-07, "loss": 0.0015, "reward": 1.8562500476837158, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1375, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 355.578125, "epoch": 0.04184405790049872, "grad_norm": 0.9779042951842699, "kl": 0.037353515625, "learning_rate": 9.956859831738255e-07, "loss": 0.0015, "reward": 2.104365110397339, "reward_std": 0.12784358859062195, "rewards/accuracy_reward": 0.9231151342391968, "rewards/format_reward": 1.0, "step": 1376, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 362.0625, "epoch": 0.0418744678262985, "grad_norm": 1.2036717444706044, "kl": 0.0302734375, "learning_rate": 9.956797195708956e-07, "loss": 0.0012, "reward": 2.0006837844848633, "reward_std": 0.22426101565361023, "rewards/accuracy_reward": 0.8381837606430054, "rewards/format_reward": 1.0, "step": 1377, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 375.03125, "epoch": 0.041904877752098287, "grad_norm": 1.1425535973095085, "kl": 0.031494140625, "learning_rate": 9.956734514438717e-07, "loss": 0.0013, "reward": 1.9820313453674316, "reward_std": 0.23114120960235596, "rewards/accuracy_reward": 0.828906238079071, "rewards/format_reward": 1.0, "step": 1378, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 346.921875, "epoch": 0.041935287677898066, "grad_norm": 1.4992635776053853, "kl": 0.041015625, "learning_rate": 9.95667178792811e-07, "loss": 0.0016, "reward": 1.8628125190734863, "reward_std": 0.15551820397377014, "rewards/accuracy_reward": 0.7190625071525574, "rewards/format_reward": 1.0, "step": 1379, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 368.296875, "epoch": 0.041965697603697845, "grad_norm": 1.5928241173498965, "kl": 0.032470703125, "learning_rate": 9.956609016177708e-07, "loss": 0.0013, "reward": 1.941906452178955, "reward_std": 0.08737563341856003, "rewards/accuracy_reward": 0.8012814521789551, "rewards/format_reward": 1.0, "step": 1380, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 353.84375, "epoch": 0.04199610752949763, "grad_norm": 1.0913982525005104, "kl": 0.03271484375, "learning_rate": 9.956546199188085e-07, "loss": 0.0013, "reward": 2.0999999046325684, "reward_std": 0.1508890688419342, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 1381, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 369.953125, "epoch": 0.04202651745529741, "grad_norm": 1.0337523453864121, "kl": 0.03173828125, "learning_rate": 9.956483336959813e-07, "loss": 0.0013, "reward": 1.7231431007385254, "reward_std": 0.13377638161182404, "rewards/accuracy_reward": 0.6043931245803833, "rewards/format_reward": 1.0, "step": 1382, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 360.609375, "epoch": 0.04205692738109719, "grad_norm": 0.8907339087513076, "kl": 0.031494140625, "learning_rate": 9.956420429493464e-07, "loss": 0.0013, "reward": 1.7444961071014404, "reward_std": 0.10254321247339249, "rewards/accuracy_reward": 0.6038711071014404, "rewards/format_reward": 1.0, "step": 1383, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 341.296875, "epoch": 0.04208733730689697, "grad_norm": 1.1065106945712984, "kl": 0.03759765625, "learning_rate": 9.956357476789617e-07, "loss": 0.0015, "reward": 1.6664738655090332, "reward_std": 0.05185055360198021, "rewards/accuracy_reward": 0.5414737462997437, "rewards/format_reward": 1.0, "step": 1384, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.84375, "epoch": 0.042117747232696755, "grad_norm": 1.2391149565154294, "kl": 0.043212890625, "learning_rate": 9.956294478848841e-07, "loss": 0.0017, "reward": 1.938907504081726, "reward_std": 0.14079871773719788, "rewards/accuracy_reward": 0.7795325517654419, "rewards/format_reward": 1.0, "step": 1385, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.3125, "epoch": 0.042148157158496534, "grad_norm": 1.2384501016988423, "kl": 0.0341796875, "learning_rate": 9.956231435671714e-07, "loss": 0.0014, "reward": 1.827519416809082, "reward_std": 0.19984927773475647, "rewards/accuracy_reward": 0.6743943691253662, "rewards/format_reward": 1.0, "step": 1386, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 354.828125, "epoch": 0.04217856708429631, "grad_norm": 0.7968053280738694, "kl": 0.032470703125, "learning_rate": 9.956168347258813e-07, "loss": 0.0013, "reward": 1.7843750715255737, "reward_std": 0.18571564555168152, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1387, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 371.734375, "epoch": 0.04220897701009609, "grad_norm": 0.6675766688700255, "kl": 0.03515625, "learning_rate": 9.95610521361071e-07, "loss": 0.0014, "reward": 2.075000047683716, "reward_std": 0.12449951469898224, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1388, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.265625, "epoch": 0.04223938693589588, "grad_norm": 0.7818800502448621, "kl": 0.0341796875, "learning_rate": 9.956042034727985e-07, "loss": 0.0014, "reward": 2.066763401031494, "reward_std": 0.07203345000743866, "rewards/accuracy_reward": 0.8855133056640625, "rewards/format_reward": 1.0, "step": 1389, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 356.90625, "epoch": 0.04226979686169566, "grad_norm": 0.0724866816835963, "kl": 0.029541015625, "learning_rate": 9.955978810611211e-07, "loss": 0.0012, "reward": 2.0140368938446045, "reward_std": 0.0, "rewards/accuracy_reward": 0.8390368819236755, "rewards/format_reward": 1.0, "step": 1390, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 345.578125, "epoch": 0.04230020678749544, "grad_norm": 1.7992814818664191, "kl": 0.038330078125, "learning_rate": 9.955915541260969e-07, "loss": 0.0015, "reward": 1.8659237623214722, "reward_std": 0.0924883484840393, "rewards/accuracy_reward": 0.7127987742424011, "rewards/format_reward": 1.0, "step": 1391, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 373.734375, "epoch": 0.04233061671329522, "grad_norm": 0.4208736047719149, "kl": 0.03515625, "learning_rate": 9.955852226677832e-07, "loss": 0.0014, "reward": 1.59375, "reward_std": 0.09669842571020126, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1392, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 350.5, "epoch": 0.042361026639095, "grad_norm": 1.0526480713520352, "kl": 0.03369140625, "learning_rate": 9.95578886686238e-07, "loss": 0.0014, "reward": 1.7999999523162842, "reward_std": 0.19530820846557617, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1393, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 378.640625, "epoch": 0.04239143656489478, "grad_norm": 2.990754731618439, "kl": 0.035888671875, "learning_rate": 9.955725461815194e-07, "loss": 0.0014, "reward": 1.8154511451721191, "reward_std": 0.08033843338489532, "rewards/accuracy_reward": 0.6873260736465454, "rewards/format_reward": 1.0, "step": 1394, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 376.953125, "epoch": 0.04242184649069456, "grad_norm": 0.5710888968463236, "kl": 0.0281982421875, "learning_rate": 9.955662011536848e-07, "loss": 0.0011, "reward": 1.7468750476837158, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1395, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 358.75, "epoch": 0.04245225641649435, "grad_norm": 1.8230811471045338, "kl": 0.0308837890625, "learning_rate": 9.955598516027923e-07, "loss": 0.0012, "reward": 1.7721840143203735, "reward_std": 0.09068720787763596, "rewards/accuracy_reward": 0.631558895111084, "rewards/format_reward": 1.0, "step": 1396, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 351.359375, "epoch": 0.042482666342294126, "grad_norm": 1.0493647396721761, "kl": 0.034912109375, "learning_rate": 9.955534975289e-07, "loss": 0.0014, "reward": 1.920665979385376, "reward_std": 0.07833994925022125, "rewards/accuracy_reward": 0.764415979385376, "rewards/format_reward": 1.0, "step": 1397, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 383.234375, "epoch": 0.042513076268093905, "grad_norm": 1.4743487872088243, "kl": 0.040283203125, "learning_rate": 9.955471389320657e-07, "loss": 0.0016, "reward": 1.8181214332580566, "reward_std": 0.15854354202747345, "rewards/accuracy_reward": 0.639996349811554, "rewards/format_reward": 1.0, "step": 1398, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.09375, "epoch": 0.042543486193893684, "grad_norm": 1.001258748020277, "kl": 0.035888671875, "learning_rate": 9.955407758123476e-07, "loss": 0.0014, "reward": 1.9713821411132812, "reward_std": 0.0642324835062027, "rewards/accuracy_reward": 0.802632212638855, "rewards/format_reward": 1.0, "step": 1399, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 356.15625, "epoch": 0.04257389611969347, "grad_norm": 1.4115221072547501, "kl": 0.04052734375, "learning_rate": 9.955344081698036e-07, "loss": 0.0016, "reward": 1.909414291381836, "reward_std": 0.16459053754806519, "rewards/accuracy_reward": 0.7531642913818359, "rewards/format_reward": 1.0, "step": 1400, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 377.953125, "epoch": 0.04260430604549325, "grad_norm": 0.905302787419834, "kl": 0.03271484375, "learning_rate": 9.95528036004492e-07, "loss": 0.0013, "reward": 1.9376168251037598, "reward_std": 0.0854758620262146, "rewards/accuracy_reward": 0.7532418966293335, "rewards/format_reward": 1.0, "step": 1401, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 351.796875, "epoch": 0.04263471597129303, "grad_norm": 1.1842627419073168, "kl": 0.0380859375, "learning_rate": 9.955216593164707e-07, "loss": 0.0015, "reward": 2.0881142616271973, "reward_std": 0.09037096053361893, "rewards/accuracy_reward": 0.8974892497062683, "rewards/format_reward": 1.0, "step": 1402, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 353.078125, "epoch": 0.04266512589709281, "grad_norm": 1.0831424984539992, "kl": 0.036376953125, "learning_rate": 9.955152781057982e-07, "loss": 0.0015, "reward": 2.078812599182129, "reward_std": 0.18137730658054352, "rewards/accuracy_reward": 0.8881875872612, "rewards/format_reward": 1.0, "step": 1403, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 394.328125, "epoch": 0.042695535822892594, "grad_norm": 1.0464001971870835, "kl": 0.032958984375, "learning_rate": 9.955088923725326e-07, "loss": 0.0013, "reward": 1.8705427646636963, "reward_std": 0.08714998513460159, "rewards/accuracy_reward": 0.71741783618927, "rewards/format_reward": 1.0, "step": 1404, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 393.8125, "epoch": 0.04272594574869237, "grad_norm": 0.6390348862019546, "kl": 0.0274658203125, "learning_rate": 9.955025021167322e-07, "loss": 0.0011, "reward": 1.6036441326141357, "reward_std": 0.0829862654209137, "rewards/accuracy_reward": 0.491144061088562, "rewards/format_reward": 1.0, "step": 1405, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 357.421875, "epoch": 0.04275635567449215, "grad_norm": 0.7531338243810934, "kl": 0.03125, "learning_rate": 9.95496107338455e-07, "loss": 0.0013, "reward": 1.859375, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1406, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.625, "epoch": 0.04278676560029194, "grad_norm": 12.204370174330474, "kl": 0.03125, "learning_rate": 9.9548970803776e-07, "loss": 0.0012, "reward": 1.9285497665405273, "reward_std": 0.061849795281887054, "rewards/accuracy_reward": 0.7597997784614563, "rewards/format_reward": 1.0, "step": 1407, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 367.921875, "epoch": 0.04281717552609172, "grad_norm": 1.0758398476025555, "kl": 0.03662109375, "learning_rate": 9.954833042147053e-07, "loss": 0.0015, "reward": 1.641932725906372, "reward_std": 0.04902216047048569, "rewards/accuracy_reward": 0.4981827735900879, "rewards/format_reward": 1.0, "step": 1408, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 380.125, "epoch": 0.0428475854518915, "grad_norm": 0.8605003205951494, "kl": 0.030029296875, "learning_rate": 9.954768958693493e-07, "loss": 0.0012, "reward": 1.8602197170257568, "reward_std": 0.09333300590515137, "rewards/accuracy_reward": 0.7070946097373962, "rewards/format_reward": 1.0, "step": 1409, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 383.28125, "epoch": 0.042877995377691276, "grad_norm": 1.0673170244771828, "kl": 0.03564453125, "learning_rate": 9.954704830017504e-07, "loss": 0.0014, "reward": 1.8254494667053223, "reward_std": 0.040953077375888824, "rewards/accuracy_reward": 0.6598243713378906, "rewards/format_reward": 1.0, "step": 1410, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 395.1875, "epoch": 0.04290840530349106, "grad_norm": 1.8896723220098544, "kl": 0.038330078125, "learning_rate": 9.954640656119674e-07, "loss": 0.0015, "reward": 1.643940806388855, "reward_std": 0.1714984029531479, "rewards/accuracy_reward": 0.5251908302307129, "rewards/format_reward": 1.0, "step": 1411, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.640625, "epoch": 0.04293881522929084, "grad_norm": 0.627655840559016, "kl": 0.0301513671875, "learning_rate": 9.954576437000586e-07, "loss": 0.0012, "reward": 1.85562264919281, "reward_std": 0.07370621711015701, "rewards/accuracy_reward": 0.6993725895881653, "rewards/format_reward": 1.0, "step": 1412, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 360.984375, "epoch": 0.04296922515509062, "grad_norm": 0.8558489581272183, "kl": 0.029052734375, "learning_rate": 9.954512172660827e-07, "loss": 0.0012, "reward": 1.881404161453247, "reward_std": 0.021881647408008575, "rewards/accuracy_reward": 0.7126542329788208, "rewards/format_reward": 1.0, "step": 1413, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 352.890625, "epoch": 0.0429996350808904, "grad_norm": 1.2511134402312547, "kl": 0.033935546875, "learning_rate": 9.954447863100986e-07, "loss": 0.0014, "reward": 1.7999999523162842, "reward_std": 0.10819403827190399, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1414, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 371.1875, "epoch": 0.043030045006690186, "grad_norm": 0.5241430332937208, "kl": 0.03271484375, "learning_rate": 9.954383508321646e-07, "loss": 0.0013, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1415, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.203125, "epoch": 0.043060454932489965, "grad_norm": 0.8918081103657807, "kl": 0.0294189453125, "learning_rate": 9.954319108323396e-07, "loss": 0.0012, "reward": 2.0093750953674316, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1416, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.043090864858289744, "grad_norm": 1.2253805508398652, "kl": 0.032958984375, "learning_rate": 9.954254663106825e-07, "loss": 0.0013, "reward": 1.8449311256408691, "reward_std": 0.23291127383708954, "rewards/accuracy_reward": 0.6793060302734375, "rewards/format_reward": 1.0, "step": 1417, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 366.78125, "epoch": 0.043121274784089524, "grad_norm": 1.9654714484621454, "kl": 0.039306640625, "learning_rate": 9.95419017267252e-07, "loss": 0.0016, "reward": 1.7403266429901123, "reward_std": 0.23603031039237976, "rewards/accuracy_reward": 0.5934516191482544, "rewards/format_reward": 1.0, "step": 1418, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 374.828125, "epoch": 0.04315168470988931, "grad_norm": 0.7252879064139763, "kl": 0.03173828125, "learning_rate": 9.954125637021069e-07, "loss": 0.0013, "reward": 1.8968749046325684, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1419, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 379.5625, "epoch": 0.04318209463568909, "grad_norm": 1.291367281172599, "kl": 0.0263671875, "learning_rate": 9.954061056153062e-07, "loss": 0.0011, "reward": 1.8468750715255737, "reward_std": 0.3137286603450775, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1420, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 367.1875, "epoch": 0.04321250456148887, "grad_norm": 0.7431683708603904, "kl": 0.036376953125, "learning_rate": 9.95399643006909e-07, "loss": 0.0015, "reward": 1.6340558528900146, "reward_std": 0.020328953862190247, "rewards/accuracy_reward": 0.5184307098388672, "rewards/format_reward": 1.0, "step": 1421, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.65625, "epoch": 0.043242914487288654, "grad_norm": 3.2642062472615523, "kl": 0.030517578125, "learning_rate": 9.95393175876974e-07, "loss": 0.0012, "reward": 1.7997121810913086, "reward_std": 0.15055255591869354, "rewards/accuracy_reward": 0.6622122526168823, "rewards/format_reward": 1.0, "step": 1422, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 381.828125, "epoch": 0.04327332441308843, "grad_norm": 1.3301865353911098, "kl": 0.036865234375, "learning_rate": 9.953867042255603e-07, "loss": 0.0015, "reward": 1.807694673538208, "reward_std": 0.10424507409334183, "rewards/accuracy_reward": 0.6264446973800659, "rewards/format_reward": 1.0, "step": 1423, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 363.78125, "epoch": 0.04330373433888821, "grad_norm": 1.1818196479726868, "kl": 0.032958984375, "learning_rate": 9.95380228052727e-07, "loss": 0.0013, "reward": 2.0754377841949463, "reward_std": 0.19192728400230408, "rewards/accuracy_reward": 0.8910626173019409, "rewards/format_reward": 1.0, "step": 1424, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 367.25, "epoch": 0.04333414426468799, "grad_norm": 1.6843413045092313, "kl": 0.029052734375, "learning_rate": 9.953737473585334e-07, "loss": 0.0012, "reward": 1.8454101085662842, "reward_std": 0.14871634542942047, "rewards/accuracy_reward": 0.70166015625, "rewards/format_reward": 1.0, "step": 1425, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 372.234375, "epoch": 0.04336455419048778, "grad_norm": 1.276331012003874, "kl": 0.0341796875, "learning_rate": 9.95367262143038e-07, "loss": 0.0014, "reward": 1.659361481666565, "reward_std": 0.031196530908346176, "rewards/accuracy_reward": 0.5374864339828491, "rewards/format_reward": 1.0, "step": 1426, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 361.34375, "epoch": 0.04339496411628756, "grad_norm": 0.8055627452554797, "kl": 0.0322265625, "learning_rate": 9.95360772406301e-07, "loss": 0.0013, "reward": 2.0843749046325684, "reward_std": 0.15845370292663574, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1427, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 385.609375, "epoch": 0.043425374042087336, "grad_norm": 0.8059394819486126, "kl": 0.03173828125, "learning_rate": 9.953542781483807e-07, "loss": 0.0013, "reward": 1.8274681568145752, "reward_std": 0.0897180512547493, "rewards/accuracy_reward": 0.6837181448936462, "rewards/format_reward": 1.0, "step": 1428, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 377.84375, "epoch": 0.043455783967887116, "grad_norm": 0.9276925565644119, "kl": 0.028564453125, "learning_rate": 9.953477793693369e-07, "loss": 0.0011, "reward": 1.7710769176483154, "reward_std": 0.1064361035823822, "rewards/accuracy_reward": 0.627327024936676, "rewards/format_reward": 1.0, "step": 1429, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 388.109375, "epoch": 0.0434861938936869, "grad_norm": 3.2544571345879536, "kl": 0.0291748046875, "learning_rate": 9.953412760692289e-07, "loss": 0.0012, "reward": 1.865625023841858, "reward_std": 0.06670259684324265, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "step": 1430, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 366.265625, "epoch": 0.04351660381948668, "grad_norm": 1.3343617979110485, "kl": 0.03173828125, "learning_rate": 9.953347682481156e-07, "loss": 0.0013, "reward": 1.7437500953674316, "reward_std": 0.20756080746650696, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1431, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 394.453125, "epoch": 0.04354701374528646, "grad_norm": 1.450442021026999, "kl": 0.035888671875, "learning_rate": 9.953282559060569e-07, "loss": 0.0014, "reward": 1.8605952262878418, "reward_std": 0.1639946699142456, "rewards/accuracy_reward": 0.6887201070785522, "rewards/format_reward": 1.0, "step": 1432, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 366.234375, "epoch": 0.04357742367108624, "grad_norm": 0.987437961895171, "kl": 0.0277099609375, "learning_rate": 9.95321739043112e-07, "loss": 0.0011, "reward": 1.969940423965454, "reward_std": 0.17938214540481567, "rewards/accuracy_reward": 0.804315447807312, "rewards/format_reward": 1.0, "step": 1433, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 358.984375, "epoch": 0.043607833596886025, "grad_norm": 2.2142698310124245, "kl": 0.031494140625, "learning_rate": 9.953152176593407e-07, "loss": 0.0013, "reward": 2.014552116394043, "reward_std": 0.0031000934541225433, "rewards/accuracy_reward": 0.8145519495010376, "rewards/format_reward": 1.0, "step": 1434, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 363.3125, "epoch": 0.043638243522685805, "grad_norm": 0.9724225933249551, "kl": 0.0283203125, "learning_rate": 9.953086917548022e-07, "loss": 0.0011, "reward": 1.7427167892456055, "reward_std": 0.1048978716135025, "rewards/accuracy_reward": 0.6114667654037476, "rewards/format_reward": 1.0, "step": 1435, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 373.875, "epoch": 0.043668653448485584, "grad_norm": 4.108701607153318, "kl": 0.035400390625, "learning_rate": 9.953021613295559e-07, "loss": 0.0014, "reward": 1.7902079820632935, "reward_std": 0.11418286710977554, "rewards/accuracy_reward": 0.6402080059051514, "rewards/format_reward": 1.0, "step": 1436, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 363.46875, "epoch": 0.04369906337428537, "grad_norm": 1.1077681661337027, "kl": 0.033203125, "learning_rate": 9.952956263836617e-07, "loss": 0.0013, "reward": 1.600822925567627, "reward_std": 0.10736281424760818, "rewards/accuracy_reward": 0.4664478302001953, "rewards/format_reward": 1.0, "step": 1437, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 367.890625, "epoch": 0.04372947330008515, "grad_norm": 1.1097933376317737, "kl": 0.03173828125, "learning_rate": 9.952890869171794e-07, "loss": 0.0013, "reward": 1.7584264278411865, "reward_std": 0.07671654224395752, "rewards/accuracy_reward": 0.6115514636039734, "rewards/format_reward": 1.0, "step": 1438, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 346.234375, "epoch": 0.04375988322588493, "grad_norm": 0.6745761165047038, "kl": 0.0299072265625, "learning_rate": 9.952825429301682e-07, "loss": 0.0012, "reward": 1.7468750476837158, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1439, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 351.6875, "epoch": 0.04379029315168471, "grad_norm": 0.9534164564023454, "kl": 0.0380859375, "learning_rate": 9.95275994422688e-07, "loss": 0.0015, "reward": 1.8374111652374268, "reward_std": 0.02175387367606163, "rewards/accuracy_reward": 0.6999111771583557, "rewards/format_reward": 1.0, "step": 1440, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 347.359375, "epoch": 0.043820703077484494, "grad_norm": 2.998131961463864, "kl": 0.031982421875, "learning_rate": 9.95269441394799e-07, "loss": 0.0013, "reward": 1.722355842590332, "reward_std": 0.14863130450248718, "rewards/accuracy_reward": 0.5973557233810425, "rewards/format_reward": 1.0, "step": 1441, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 370.671875, "epoch": 0.04385111300328427, "grad_norm": 1.1586711093082385, "kl": 0.03173828125, "learning_rate": 9.952628838465605e-07, "loss": 0.0013, "reward": 1.5950384140014648, "reward_std": 0.1618431806564331, "rewards/accuracy_reward": 0.4950385093688965, "rewards/format_reward": 1.0, "step": 1442, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 367.703125, "epoch": 0.04388152292908405, "grad_norm": 1.26135537462759, "kl": 0.03173828125, "learning_rate": 9.952563217780325e-07, "loss": 0.0013, "reward": 1.8301081657409668, "reward_std": 0.045346699655056, "rewards/accuracy_reward": 0.6676081418991089, "rewards/format_reward": 1.0, "step": 1443, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 390.59375, "epoch": 0.04391193285488383, "grad_norm": 2.282157513676405, "kl": 0.0311279296875, "learning_rate": 9.95249755189275e-07, "loss": 0.0012, "reward": 1.777186632156372, "reward_std": 0.1454283595085144, "rewards/accuracy_reward": 0.6365616321563721, "rewards/format_reward": 1.0, "step": 1444, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 389.0, "epoch": 0.04394234278068362, "grad_norm": 0.8608242583449095, "kl": 0.031982421875, "learning_rate": 9.952431840803476e-07, "loss": 0.0013, "reward": 1.6358062028884888, "reward_std": 0.1612795889377594, "rewards/accuracy_reward": 0.5014311075210571, "rewards/format_reward": 1.0, "step": 1445, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 359.421875, "epoch": 0.043972752706483396, "grad_norm": 0.910701737075201, "kl": 0.03662109375, "learning_rate": 9.952366084513108e-07, "loss": 0.0015, "reward": 1.6645950078964233, "reward_std": 0.06247019022703171, "rewards/accuracy_reward": 0.5458450317382812, "rewards/format_reward": 1.0, "step": 1446, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 358.8125, "epoch": 0.044003162632283176, "grad_norm": 0.9767069762444006, "kl": 0.0341796875, "learning_rate": 9.952300283022242e-07, "loss": 0.0014, "reward": 1.949601411819458, "reward_std": 0.079706110060215, "rewards/accuracy_reward": 0.7621012926101685, "rewards/format_reward": 1.0, "step": 1447, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 381.4375, "epoch": 0.044033572558082955, "grad_norm": 1.9448086284847381, "kl": 0.0390625, "learning_rate": 9.952234436331478e-07, "loss": 0.0016, "reward": 1.7736895084381104, "reward_std": 0.09731435775756836, "rewards/accuracy_reward": 0.6424395442008972, "rewards/format_reward": 1.0, "step": 1448, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 362.296875, "epoch": 0.04406398248388274, "grad_norm": 1.808786301810168, "kl": 0.033935546875, "learning_rate": 9.952168544441423e-07, "loss": 0.0014, "reward": 1.677612543106079, "reward_std": 0.07382393628358841, "rewards/accuracy_reward": 0.5588625073432922, "rewards/format_reward": 1.0, "step": 1449, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.84375, "epoch": 0.04409439240968252, "grad_norm": 1.4327497868631311, "kl": 0.03369140625, "learning_rate": 9.952102607352673e-07, "loss": 0.0014, "reward": 1.9167944192886353, "reward_std": 0.09248004108667374, "rewards/accuracy_reward": 0.7386693954467773, "rewards/format_reward": 1.0, "step": 1450, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 395.109375, "epoch": 0.0441248023354823, "grad_norm": 0.7832087926653342, "kl": 0.0281982421875, "learning_rate": 9.952036625065828e-07, "loss": 0.0011, "reward": 1.6026294231414795, "reward_std": 0.1323229968547821, "rewards/accuracy_reward": 0.4932544231414795, "rewards/format_reward": 1.0, "step": 1451, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.171875, "epoch": 0.044155212261282085, "grad_norm": 1.6010255559511783, "kl": 0.029541015625, "learning_rate": 9.951970597581498e-07, "loss": 0.0012, "reward": 1.840694546699524, "reward_std": 0.09693321585655212, "rewards/accuracy_reward": 0.6844445466995239, "rewards/format_reward": 1.0, "step": 1452, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 355.8125, "epoch": 0.044185622187081865, "grad_norm": 0.5271208169338722, "kl": 0.034912109375, "learning_rate": 9.95190452490028e-07, "loss": 0.0014, "reward": 1.8937500715255737, "reward_std": 0.011572758667171001, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1453, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 345.859375, "epoch": 0.044216032112881644, "grad_norm": 0.44727881987488494, "kl": 0.038818359375, "learning_rate": 9.951838407022776e-07, "loss": 0.0016, "reward": 2.1750001907348633, "reward_std": 0.051754921674728394, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1454, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 348.4375, "epoch": 0.04424644203868142, "grad_norm": 0.10081682621202316, "kl": 0.03955078125, "learning_rate": 9.951772243949592e-07, "loss": 0.0016, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1455, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 364.03125, "epoch": 0.04427685196448121, "grad_norm": 0.9522053523432532, "kl": 0.04248046875, "learning_rate": 9.951706035681332e-07, "loss": 0.0017, "reward": 2.031747817993164, "reward_std": 0.015882009640336037, "rewards/accuracy_reward": 0.8348726034164429, "rewards/format_reward": 1.0, "step": 1456, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 368.296875, "epoch": 0.04430726189028099, "grad_norm": 1.0419885343848692, "kl": 0.02783203125, "learning_rate": 9.9516397822186e-07, "loss": 0.0011, "reward": 2.1232306957244873, "reward_std": 0.09801394492387772, "rewards/accuracy_reward": 0.935730516910553, "rewards/format_reward": 1.0, "step": 1457, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 392.578125, "epoch": 0.04433767181608077, "grad_norm": 1.8517418142135063, "kl": 0.029296875, "learning_rate": 9.951573483561999e-07, "loss": 0.0012, "reward": 1.9896379709243774, "reward_std": 0.07773250341415405, "rewards/accuracy_reward": 0.7990127801895142, "rewards/format_reward": 1.0, "step": 1458, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 365.453125, "epoch": 0.04436808174188055, "grad_norm": 1.2500016498987283, "kl": 0.0361328125, "learning_rate": 9.951507139712135e-07, "loss": 0.0014, "reward": 1.8249154090881348, "reward_std": 0.1567482054233551, "rewards/accuracy_reward": 0.6592903137207031, "rewards/format_reward": 1.0, "step": 1459, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 352.515625, "epoch": 0.04439849166768033, "grad_norm": 1.0363516727280049, "kl": 0.041748046875, "learning_rate": 9.951440750669617e-07, "loss": 0.0017, "reward": 1.7068523168563843, "reward_std": 0.13696826994419098, "rewards/accuracy_reward": 0.5724773406982422, "rewards/format_reward": 1.0, "step": 1460, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 375.46875, "epoch": 0.04442890159348011, "grad_norm": 1.1356562122457683, "kl": 0.0299072265625, "learning_rate": 9.951374316435044e-07, "loss": 0.0012, "reward": 1.8911783695220947, "reward_std": 0.15338963270187378, "rewards/accuracy_reward": 0.7349283695220947, "rewards/format_reward": 1.0, "step": 1461, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 357.140625, "epoch": 0.04445931151927989, "grad_norm": 0.7839827207417605, "kl": 0.031005859375, "learning_rate": 9.95130783700903e-07, "loss": 0.0012, "reward": 2.117067337036133, "reward_std": 0.08855694532394409, "rewards/accuracy_reward": 0.932692289352417, "rewards/format_reward": 1.0, "step": 1462, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.453125, "epoch": 0.04448972144507968, "grad_norm": 1.5040226846370996, "kl": 0.0322265625, "learning_rate": 9.951241312392175e-07, "loss": 0.0013, "reward": 1.926222801208496, "reward_std": 0.11093716323375702, "rewards/accuracy_reward": 0.7730978727340698, "rewards/format_reward": 1.0, "step": 1463, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 376.34375, "epoch": 0.04452013137087946, "grad_norm": 1.1288581066217378, "kl": 0.031982421875, "learning_rate": 9.95117474258509e-07, "loss": 0.0013, "reward": 1.8765690326690674, "reward_std": 0.16353511810302734, "rewards/accuracy_reward": 0.7328190803527832, "rewards/format_reward": 1.0, "step": 1464, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.1875, "epoch": 0.044550541296679236, "grad_norm": 1.0099541617495906, "kl": 0.041015625, "learning_rate": 9.951108127588384e-07, "loss": 0.0016, "reward": 1.8732120990753174, "reward_std": 0.03758494183421135, "rewards/accuracy_reward": 0.7075871229171753, "rewards/format_reward": 1.0, "step": 1465, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 357.671875, "epoch": 0.044580951222479015, "grad_norm": 1.07112950703109, "kl": 0.039794921875, "learning_rate": 9.951041467402662e-07, "loss": 0.0016, "reward": 1.6632583141326904, "reward_std": 0.14806610345840454, "rewards/accuracy_reward": 0.5288832187652588, "rewards/format_reward": 1.0, "step": 1466, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 399.53125, "epoch": 0.0446113611482788, "grad_norm": 1.0910665962681687, "kl": 0.03662109375, "learning_rate": 9.950974762028531e-07, "loss": 0.0015, "reward": 1.8939828872680664, "reward_std": 0.1992325782775879, "rewards/accuracy_reward": 0.7221077680587769, "rewards/format_reward": 1.0, "step": 1467, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 390.03125, "epoch": 0.04464177107407858, "grad_norm": 0.9213723719201844, "kl": 0.027099609375, "learning_rate": 9.950908011466604e-07, "loss": 0.0011, "reward": 1.8288965225219727, "reward_std": 0.10781900584697723, "rewards/accuracy_reward": 0.6820214986801147, "rewards/format_reward": 0.984375, "step": 1468, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 425.453125, "epoch": 0.04467218099987836, "grad_norm": 1.6946295284192618, "kl": 0.03564453125, "learning_rate": 9.950841215717486e-07, "loss": 0.0014, "reward": 1.7433699369430542, "reward_std": 0.15014700591564178, "rewards/accuracy_reward": 0.5839948654174805, "rewards/format_reward": 1.0, "step": 1469, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 366.265625, "epoch": 0.04470259092567814, "grad_norm": 0.6684982927040791, "kl": 0.028564453125, "learning_rate": 9.950774374781791e-07, "loss": 0.0011, "reward": 2.000401020050049, "reward_std": 0.017368635162711143, "rewards/accuracy_reward": 0.8285259008407593, "rewards/format_reward": 1.0, "step": 1470, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 357.140625, "epoch": 0.044733000851477925, "grad_norm": 0.4506155515392534, "kl": 0.0308837890625, "learning_rate": 9.950707488660128e-07, "loss": 0.0012, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1471, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 358.453125, "epoch": 0.044763410777277704, "grad_norm": 1.2973050479991999, "kl": 0.03466796875, "learning_rate": 9.950640557353105e-07, "loss": 0.0014, "reward": 1.909334421157837, "reward_std": 0.12173619866371155, "rewards/accuracy_reward": 0.7593344449996948, "rewards/format_reward": 1.0, "step": 1472, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 406.8125, "epoch": 0.04479382070307748, "grad_norm": 0.9705975908672091, "kl": 0.0301513671875, "learning_rate": 9.950573580861334e-07, "loss": 0.0012, "reward": 1.8421473503112793, "reward_std": 0.15772631764411926, "rewards/accuracy_reward": 0.7108972668647766, "rewards/format_reward": 0.96875, "step": 1473, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 365.234375, "epoch": 0.04482423062887726, "grad_norm": 1.7751361517240847, "kl": 0.03759765625, "learning_rate": 9.950506559185427e-07, "loss": 0.0015, "reward": 1.8007440567016602, "reward_std": 0.13253821432590485, "rewards/accuracy_reward": 0.6601190567016602, "rewards/format_reward": 1.0, "step": 1474, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 351.75, "epoch": 0.04485464055467705, "grad_norm": 1.0231435562934674, "kl": 0.032470703125, "learning_rate": 9.950439492325996e-07, "loss": 0.0013, "reward": 1.9802191257476807, "reward_std": 0.04725078493356705, "rewards/accuracy_reward": 0.8052189350128174, "rewards/format_reward": 1.0, "step": 1475, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 385.609375, "epoch": 0.04488505048047683, "grad_norm": 6.207218521375364, "kl": 0.03515625, "learning_rate": 9.950372380283652e-07, "loss": 0.0014, "reward": 1.627403736114502, "reward_std": 0.19693249464035034, "rewards/accuracy_reward": 0.5055287480354309, "rewards/format_reward": 1.0, "step": 1476, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.953125, "epoch": 0.04491546040627661, "grad_norm": 2.93981818317281, "kl": 0.0361328125, "learning_rate": 9.950305223059006e-07, "loss": 0.0014, "reward": 1.8260126113891602, "reward_std": 0.14023137092590332, "rewards/accuracy_reward": 0.6760127544403076, "rewards/format_reward": 1.0, "step": 1477, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 400.578125, "epoch": 0.04494587033207639, "grad_norm": 1.439020652240854, "kl": 0.02880859375, "learning_rate": 9.950238020652674e-07, "loss": 0.0012, "reward": 1.4315876960754395, "reward_std": 0.2973781228065491, "rewards/accuracy_reward": 0.3378376364707947, "rewards/format_reward": 1.0, "step": 1478, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 349.578125, "epoch": 0.04497628025787617, "grad_norm": 1.1143370649172963, "kl": 0.036376953125, "learning_rate": 9.95017077306527e-07, "loss": 0.0015, "reward": 1.6362563371658325, "reward_std": 0.02975967898964882, "rewards/accuracy_reward": 0.5206311941146851, "rewards/format_reward": 1.0, "step": 1479, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 348.265625, "epoch": 0.04500669018367595, "grad_norm": 52.814293823995925, "kl": 0.03173828125, "learning_rate": 9.950103480297404e-07, "loss": 0.0013, "reward": 1.6327314376831055, "reward_std": 0.14577078819274902, "rewards/accuracy_reward": 0.5171064138412476, "rewards/format_reward": 1.0, "step": 1480, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 359.15625, "epoch": 0.04503710010947573, "grad_norm": 1.4963436162267556, "kl": 0.030517578125, "learning_rate": 9.950036142349695e-07, "loss": 0.0012, "reward": 1.9544835090637207, "reward_std": 0.14945869147777557, "rewards/accuracy_reward": 0.7951084971427917, "rewards/format_reward": 1.0, "step": 1481, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 356.234375, "epoch": 0.04506751003527552, "grad_norm": 0.879946397617088, "kl": 0.0301513671875, "learning_rate": 9.949968759222753e-07, "loss": 0.0012, "reward": 2.13305926322937, "reward_std": 0.07978584617376328, "rewards/accuracy_reward": 0.9424341917037964, "rewards/format_reward": 1.0, "step": 1482, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.078125, "epoch": 0.045097919961075296, "grad_norm": 1.0536098493703574, "kl": 0.029052734375, "learning_rate": 9.949901330917194e-07, "loss": 0.0012, "reward": 1.7526874542236328, "reward_std": 0.18059033155441284, "rewards/accuracy_reward": 0.6308124661445618, "rewards/format_reward": 1.0, "step": 1483, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 346.390625, "epoch": 0.045128329886875075, "grad_norm": 1.2153050194043369, "kl": 0.0390625, "learning_rate": 9.949833857433635e-07, "loss": 0.0016, "reward": 2.0583395957946777, "reward_std": 0.12933349609375, "rewards/accuracy_reward": 0.870839536190033, "rewards/format_reward": 1.0, "step": 1484, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 393.375, "epoch": 0.045158739812674854, "grad_norm": 3.9483433435626623, "kl": 0.0230712890625, "learning_rate": 9.949766338772691e-07, "loss": 0.0009, "reward": 1.6679595708847046, "reward_std": 0.17675693333148956, "rewards/accuracy_reward": 0.5617096424102783, "rewards/format_reward": 1.0, "step": 1485, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 353.265625, "epoch": 0.04518914973847464, "grad_norm": 2.083860712177062, "kl": 0.032470703125, "learning_rate": 9.94969877493498e-07, "loss": 0.0013, "reward": 1.760359287261963, "reward_std": 0.13927516341209412, "rewards/accuracy_reward": 0.6353593468666077, "rewards/format_reward": 1.0, "step": 1486, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 377.203125, "epoch": 0.04521955966427442, "grad_norm": 0.8942759642987876, "kl": 0.026611328125, "learning_rate": 9.949631165921114e-07, "loss": 0.0011, "reward": 1.9829045534133911, "reward_std": 0.07493706792593002, "rewards/accuracy_reward": 0.8235296010971069, "rewards/format_reward": 1.0, "step": 1487, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.578125, "epoch": 0.0452499695900742, "grad_norm": 6.610008308402162, "kl": 0.0380859375, "learning_rate": 9.949563511731716e-07, "loss": 0.0015, "reward": 1.9175341129302979, "reward_std": 0.09986317157745361, "rewards/accuracy_reward": 0.7550340294837952, "rewards/format_reward": 1.0, "step": 1488, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 347.21875, "epoch": 0.04528037951587398, "grad_norm": 0.8990897090069695, "kl": 0.036865234375, "learning_rate": 9.9494958123674e-07, "loss": 0.0015, "reward": 1.7609374523162842, "reward_std": 0.05287374183535576, "rewards/accuracy_reward": 0.6390625238418579, "rewards/format_reward": 1.0, "step": 1489, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.6875, "epoch": 0.045310789441673764, "grad_norm": 0.9662755782125431, "kl": 0.037841796875, "learning_rate": 9.949428067828783e-07, "loss": 0.0015, "reward": 1.7970421314239502, "reward_std": 0.15590900182724, "rewards/accuracy_reward": 0.6407921314239502, "rewards/format_reward": 1.0, "step": 1490, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 385.21875, "epoch": 0.04534119936747354, "grad_norm": 2.9788792687465406, "kl": 0.032958984375, "learning_rate": 9.949360278116485e-07, "loss": 0.0013, "reward": 1.8427278995513916, "reward_std": 0.07297234237194061, "rewards/accuracy_reward": 0.6771028637886047, "rewards/format_reward": 1.0, "step": 1491, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.65625, "epoch": 0.04537160929327332, "grad_norm": 1.1218463639658613, "kl": 0.038330078125, "learning_rate": 9.949292443231126e-07, "loss": 0.0015, "reward": 1.8783656358718872, "reward_std": 0.15227922797203064, "rewards/accuracy_reward": 0.7064905762672424, "rewards/format_reward": 1.0, "step": 1492, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 357.1875, "epoch": 0.04540201921907311, "grad_norm": 1.131043023543099, "kl": 0.036376953125, "learning_rate": 9.949224563173324e-07, "loss": 0.0015, "reward": 1.9406839609146118, "reward_std": 0.13163180649280548, "rewards/accuracy_reward": 0.7469338774681091, "rewards/format_reward": 1.0, "step": 1493, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 350.875, "epoch": 0.04543242914487289, "grad_norm": 1.3116104831704865, "kl": 0.0458984375, "learning_rate": 9.949156637943697e-07, "loss": 0.0018, "reward": 1.8398926258087158, "reward_std": 0.17847460508346558, "rewards/accuracy_reward": 0.6836427450180054, "rewards/format_reward": 1.0, "step": 1494, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 366.609375, "epoch": 0.04546283907067267, "grad_norm": 1.1352401894710695, "kl": 0.030029296875, "learning_rate": 9.949088667542864e-07, "loss": 0.0012, "reward": 1.9238330125808716, "reward_std": 0.13391874730587006, "rewards/accuracy_reward": 0.7582079768180847, "rewards/format_reward": 1.0, "step": 1495, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 373.78125, "epoch": 0.045493248996472446, "grad_norm": 1.0897315557312168, "kl": 0.0322265625, "learning_rate": 9.949020651971452e-07, "loss": 0.0013, "reward": 2.01607084274292, "reward_std": 0.016686681658029556, "rewards/accuracy_reward": 0.8191959857940674, "rewards/format_reward": 1.0, "step": 1496, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 369.578125, "epoch": 0.04552365892227223, "grad_norm": 0.6101103446317552, "kl": 0.0277099609375, "learning_rate": 9.948952591230072e-07, "loss": 0.0011, "reward": 2.178379535675049, "reward_std": 0.058348141610622406, "rewards/accuracy_reward": 0.981504499912262, "rewards/format_reward": 1.0, "step": 1497, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 376.125, "epoch": 0.04555406884807201, "grad_norm": 1.1957024426430471, "kl": 0.03759765625, "learning_rate": 9.948884485319354e-07, "loss": 0.0015, "reward": 1.9359761476516724, "reward_std": 0.23552243411540985, "rewards/accuracy_reward": 0.7703511118888855, "rewards/format_reward": 1.0, "step": 1498, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 382.140625, "epoch": 0.04558447877387179, "grad_norm": 2.422783128951826, "kl": 0.041748046875, "learning_rate": 9.948816334239916e-07, "loss": 0.0017, "reward": 1.7369554042816162, "reward_std": 0.17986533045768738, "rewards/accuracy_reward": 0.5869553089141846, "rewards/format_reward": 1.0, "step": 1499, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 344.9375, "epoch": 0.04561488869967157, "grad_norm": 0.573565526773741, "kl": 0.03466796875, "learning_rate": 9.94874813799238e-07, "loss": 0.0014, "reward": 2.0133042335510254, "reward_std": 0.015123185701668262, "rewards/accuracy_reward": 0.8383040428161621, "rewards/format_reward": 1.0, "step": 1500, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 385.609375, "epoch": 0.045645298625471356, "grad_norm": 1.6344611968450313, "kl": 0.0244140625, "learning_rate": 9.948679896577368e-07, "loss": 0.001, "reward": 1.7537634372711182, "reward_std": 0.20297253131866455, "rewards/accuracy_reward": 0.6100134253501892, "rewards/format_reward": 1.0, "step": 1501, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 339.75, "epoch": 0.045675708551271135, "grad_norm": 1.1897831822289011, "kl": 0.03564453125, "learning_rate": 9.948611609995503e-07, "loss": 0.0014, "reward": 1.9249999523162842, "reward_std": 0.15665315091609955, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1502, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 384.671875, "epoch": 0.045706118477070914, "grad_norm": 1.458419400798581, "kl": 0.031494140625, "learning_rate": 9.94854327824741e-07, "loss": 0.0013, "reward": 1.8189011812210083, "reward_std": 0.15998052060604095, "rewards/accuracy_reward": 0.6845262050628662, "rewards/format_reward": 1.0, "step": 1503, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 359.59375, "epoch": 0.045736528402870694, "grad_norm": 1.258710233209215, "kl": 0.03955078125, "learning_rate": 9.94847490133371e-07, "loss": 0.0016, "reward": 1.9312500953674316, "reward_std": 0.1571877896785736, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1504, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 345.5, "epoch": 0.04576693832867048, "grad_norm": 4.849787897971567, "kl": 0.03857421875, "learning_rate": 9.948406479255029e-07, "loss": 0.0015, "reward": 1.6372532844543457, "reward_std": 0.03142615407705307, "rewards/accuracy_reward": 0.524753212928772, "rewards/format_reward": 1.0, "step": 1505, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 384.265625, "epoch": 0.04579734825447026, "grad_norm": 2.482537746801645, "kl": 0.0400390625, "learning_rate": 9.94833801201199e-07, "loss": 0.0016, "reward": 1.8902778625488281, "reward_std": 0.2431999146938324, "rewards/accuracy_reward": 0.7621527910232544, "rewards/format_reward": 1.0, "step": 1506, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 347.328125, "epoch": 0.04582775818027004, "grad_norm": 1.562474174681093, "kl": 0.038818359375, "learning_rate": 9.94826949960522e-07, "loss": 0.0015, "reward": 1.8534408807754517, "reward_std": 0.2663414180278778, "rewards/accuracy_reward": 0.706565797328949, "rewards/format_reward": 1.0, "step": 1507, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 360.4375, "epoch": 0.045858168106069824, "grad_norm": 1.0211026200244981, "kl": 0.03564453125, "learning_rate": 9.948200942035342e-07, "loss": 0.0014, "reward": 1.84375, "reward_std": 0.25313490629196167, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1508, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 381.546875, "epoch": 0.045888578031869603, "grad_norm": 1.2173857017865652, "kl": 0.03515625, "learning_rate": 9.948132339302984e-07, "loss": 0.0014, "reward": 1.7156915664672852, "reward_std": 0.24871021509170532, "rewards/accuracy_reward": 0.5969415903091431, "rewards/format_reward": 1.0, "step": 1509, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 360.828125, "epoch": 0.04591898795766938, "grad_norm": 1.1388940868021875, "kl": 0.039306640625, "learning_rate": 9.94806369140877e-07, "loss": 0.0016, "reward": 1.704660177230835, "reward_std": 0.08171486854553223, "rewards/accuracy_reward": 0.5859100818634033, "rewards/format_reward": 1.0, "step": 1510, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 356.828125, "epoch": 0.04594939788346916, "grad_norm": 0.8114983621048736, "kl": 0.03369140625, "learning_rate": 9.947994998353326e-07, "loss": 0.0013, "reward": 2.020199775695801, "reward_std": 0.07879830151796341, "rewards/accuracy_reward": 0.8389497995376587, "rewards/format_reward": 1.0, "step": 1511, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 379.921875, "epoch": 0.04597980780926895, "grad_norm": 0.7829033317428011, "kl": 0.037109375, "learning_rate": 9.947926260137284e-07, "loss": 0.0015, "reward": 1.8514877557754517, "reward_std": 0.08532886207103729, "rewards/accuracy_reward": 0.6889877319335938, "rewards/format_reward": 1.0, "step": 1512, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 384.515625, "epoch": 0.04601021773506873, "grad_norm": 1.1191746145809083, "kl": 0.04052734375, "learning_rate": 9.947857476761265e-07, "loss": 0.0016, "reward": 1.9793094396591187, "reward_std": 0.11250028759241104, "rewards/accuracy_reward": 0.8293095231056213, "rewards/format_reward": 1.0, "step": 1513, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 420.625, "epoch": 0.046040627660868506, "grad_norm": 0.7137288567475439, "kl": 0.0322265625, "learning_rate": 9.9477886482259e-07, "loss": 0.0013, "reward": 1.4401919841766357, "reward_std": 0.19135333597660065, "rewards/accuracy_reward": 0.3714420199394226, "rewards/format_reward": 1.0, "step": 1514, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 375.765625, "epoch": 0.046071037586668286, "grad_norm": 1.0300857628555844, "kl": 0.035400390625, "learning_rate": 9.94771977453182e-07, "loss": 0.0014, "reward": 1.5888595581054688, "reward_std": 0.17372721433639526, "rewards/accuracy_reward": 0.45448437333106995, "rewards/format_reward": 1.0, "step": 1515, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 391.171875, "epoch": 0.04610144751246807, "grad_norm": 2.5347021875005296, "kl": 0.03466796875, "learning_rate": 9.947650855679649e-07, "loss": 0.0014, "reward": 1.8931080102920532, "reward_std": 0.141493558883667, "rewards/accuracy_reward": 0.7368580102920532, "rewards/format_reward": 1.0, "step": 1516, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 366.484375, "epoch": 0.04613185743826785, "grad_norm": 0.41198028832444783, "kl": 0.038330078125, "learning_rate": 9.947581891670015e-07, "loss": 0.0015, "reward": 1.8006374835968018, "reward_std": 0.008857602253556252, "rewards/accuracy_reward": 0.6537624597549438, "rewards/format_reward": 1.0, "step": 1517, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 416.1875, "epoch": 0.04616226736406763, "grad_norm": 1.2508945703684131, "kl": 0.03515625, "learning_rate": 9.94751288250355e-07, "loss": 0.0014, "reward": 1.9455528259277344, "reward_std": 0.14354023337364197, "rewards/accuracy_reward": 0.7799278497695923, "rewards/format_reward": 1.0, "step": 1518, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 368.546875, "epoch": 0.04619267728986741, "grad_norm": 0.8131875773706615, "kl": 0.030517578125, "learning_rate": 9.947443828180886e-07, "loss": 0.0012, "reward": 1.8515865802764893, "reward_std": 0.059592120349407196, "rewards/accuracy_reward": 0.7015864849090576, "rewards/format_reward": 1.0, "step": 1519, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 425.578125, "epoch": 0.046223087215667195, "grad_norm": 1.220134639430691, "kl": 0.035888671875, "learning_rate": 9.947374728702651e-07, "loss": 0.0014, "reward": 1.7307689189910889, "reward_std": 0.05556311830878258, "rewards/accuracy_reward": 0.5870188474655151, "rewards/format_reward": 1.0, "step": 1520, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 344.8125, "epoch": 0.046253497141466975, "grad_norm": 0.56030815958017, "kl": 0.032958984375, "learning_rate": 9.947305584069473e-07, "loss": 0.0013, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1521, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 377.0, "epoch": 0.046283907067266754, "grad_norm": 1.5560728709672904, "kl": 0.035888671875, "learning_rate": 9.94723639428199e-07, "loss": 0.0014, "reward": 1.624798059463501, "reward_std": 0.1477717161178589, "rewards/accuracy_reward": 0.47479814291000366, "rewards/format_reward": 1.0, "step": 1522, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 372.859375, "epoch": 0.04631431699306654, "grad_norm": 0.902267021153162, "kl": 0.03466796875, "learning_rate": 9.947167159340825e-07, "loss": 0.0014, "reward": 2.014842987060547, "reward_std": 0.009716305881738663, "rewards/accuracy_reward": 0.8148430585861206, "rewards/format_reward": 1.0, "step": 1523, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 383.265625, "epoch": 0.04634472691886632, "grad_norm": 0.7320523541022059, "kl": 0.034423828125, "learning_rate": 9.947097879246618e-07, "loss": 0.0014, "reward": 2.110445499420166, "reward_std": 0.05745057016611099, "rewards/accuracy_reward": 0.9135704040527344, "rewards/format_reward": 1.0, "step": 1524, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 402.890625, "epoch": 0.0463751368446661, "grad_norm": 3.5547059022242555, "kl": 0.0311279296875, "learning_rate": 9.947028553999995e-07, "loss": 0.0012, "reward": 1.5954759120941162, "reward_std": 0.0381922647356987, "rewards/accuracy_reward": 0.49235087633132935, "rewards/format_reward": 1.0, "step": 1525, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 367.828125, "epoch": 0.04640554677046588, "grad_norm": 1.7253306447220629, "kl": 0.0322265625, "learning_rate": 9.946959183601593e-07, "loss": 0.0013, "reward": 2.029130458831787, "reward_std": 0.13583141565322876, "rewards/accuracy_reward": 0.8541305065155029, "rewards/format_reward": 0.984375, "step": 1526, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 376.109375, "epoch": 0.046435956696265664, "grad_norm": 1.5534313564861473, "kl": 0.03466796875, "learning_rate": 9.946889768052041e-07, "loss": 0.0014, "reward": 1.6638532876968384, "reward_std": 0.1377621740102768, "rewards/accuracy_reward": 0.4857282042503357, "rewards/format_reward": 1.0, "step": 1527, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.140625, "epoch": 0.04646636662206544, "grad_norm": 0.9531581665984314, "kl": 0.03125, "learning_rate": 9.946820307351977e-07, "loss": 0.0013, "reward": 1.7577677965164185, "reward_std": 0.09089983999729156, "rewards/accuracy_reward": 0.6202678084373474, "rewards/format_reward": 1.0, "step": 1528, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 363.625, "epoch": 0.04649677654786522, "grad_norm": 1.089614049147969, "kl": 0.035888671875, "learning_rate": 9.946750801502033e-07, "loss": 0.0014, "reward": 1.7125000953674316, "reward_std": 0.21213886141777039, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 1529, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 357.640625, "epoch": 0.046527186473665, "grad_norm": 1.0914333916999122, "kl": 0.03369140625, "learning_rate": 9.946681250502842e-07, "loss": 0.0013, "reward": 1.9557760953903198, "reward_std": 0.16583147644996643, "rewards/accuracy_reward": 0.790151059627533, "rewards/format_reward": 1.0, "step": 1530, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 381.140625, "epoch": 0.04655759639946479, "grad_norm": 2.191393194528012, "kl": 0.031494140625, "learning_rate": 9.946611654355041e-07, "loss": 0.0013, "reward": 1.735191822052002, "reward_std": 0.07497971504926682, "rewards/accuracy_reward": 0.5914418697357178, "rewards/format_reward": 1.0, "step": 1531, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.40625, "epoch": 0.04658800632526457, "grad_norm": 1.1549654360251704, "kl": 0.0380859375, "learning_rate": 9.946542013059265e-07, "loss": 0.0015, "reward": 1.988822102546692, "reward_std": 0.17995887994766235, "rewards/accuracy_reward": 0.826322078704834, "rewards/format_reward": 1.0, "step": 1532, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 357.125, "epoch": 0.046618416251064346, "grad_norm": 0.6888523543826557, "kl": 0.0322265625, "learning_rate": 9.946472326616147e-07, "loss": 0.0013, "reward": 2.0143518447875977, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.8206018805503845, "rewards/format_reward": 1.0, "step": 1533, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 367.03125, "epoch": 0.04664882617686413, "grad_norm": 1.1025587349348267, "kl": 0.03662109375, "learning_rate": 9.946402595026327e-07, "loss": 0.0015, "reward": 1.8937666416168213, "reward_std": 0.021150335669517517, "rewards/accuracy_reward": 0.7187666296958923, "rewards/format_reward": 1.0, "step": 1534, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.421875, "epoch": 0.04667923610266391, "grad_norm": 1.2546239586102172, "kl": 0.05224609375, "learning_rate": 9.946332818290438e-07, "loss": 0.0021, "reward": 1.9164773225784302, "reward_std": 0.16829003393650055, "rewards/accuracy_reward": 0.7602272033691406, "rewards/format_reward": 1.0, "step": 1535, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 382.390625, "epoch": 0.04670964602846369, "grad_norm": 1.0558509420492939, "kl": 0.032470703125, "learning_rate": 9.94626299640912e-07, "loss": 0.0013, "reward": 1.4431209564208984, "reward_std": 0.08103935420513153, "rewards/accuracy_reward": 0.33999598026275635, "rewards/format_reward": 1.0, "step": 1536, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 347.703125, "epoch": 0.04674005595426347, "grad_norm": 1.162069068539975, "kl": 0.036376953125, "learning_rate": 9.946193129383007e-07, "loss": 0.0015, "reward": 1.872774362564087, "reward_std": 0.14167305827140808, "rewards/accuracy_reward": 0.7165243625640869, "rewards/format_reward": 1.0, "step": 1537, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 363.0625, "epoch": 0.046770465880063256, "grad_norm": 1.559317282402883, "kl": 0.034423828125, "learning_rate": 9.946123217212738e-07, "loss": 0.0014, "reward": 1.8008460998535156, "reward_std": 0.1059165969491005, "rewards/accuracy_reward": 0.6477210521697998, "rewards/format_reward": 1.0, "step": 1538, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 355.5, "epoch": 0.046800875805863035, "grad_norm": 2.384803006452768, "kl": 0.04443359375, "learning_rate": 9.946053259898951e-07, "loss": 0.0018, "reward": 1.8696527481079102, "reward_std": 0.19570326805114746, "rewards/accuracy_reward": 0.7071527242660522, "rewards/format_reward": 1.0, "step": 1539, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 355.65625, "epoch": 0.046831285731662814, "grad_norm": 0.9068824518139893, "kl": 0.03564453125, "learning_rate": 9.945983257442287e-07, "loss": 0.0014, "reward": 1.9635319709777832, "reward_std": 0.02752988040447235, "rewards/accuracy_reward": 0.7979068756103516, "rewards/format_reward": 1.0, "step": 1540, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 363.890625, "epoch": 0.04686169565746259, "grad_norm": 1.6378855985034637, "kl": 0.03662109375, "learning_rate": 9.94591320984338e-07, "loss": 0.0015, "reward": 1.8875000476837158, "reward_std": 0.26964086294174194, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1541, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 363.375, "epoch": 0.04689210558326238, "grad_norm": 1.5233192189089795, "kl": 0.033203125, "learning_rate": 9.945843117102874e-07, "loss": 0.0013, "reward": 1.8291280269622803, "reward_std": 0.25905275344848633, "rewards/accuracy_reward": 0.6947530508041382, "rewards/format_reward": 1.0, "step": 1542, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 352.125, "epoch": 0.04692251550906216, "grad_norm": 0.9741589064791435, "kl": 0.03173828125, "learning_rate": 9.945772979221407e-07, "loss": 0.0013, "reward": 2.1334705352783203, "reward_std": 0.128914937376976, "rewards/accuracy_reward": 0.9428454637527466, "rewards/format_reward": 1.0, "step": 1543, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 390.453125, "epoch": 0.04695292543486194, "grad_norm": 1.6243864728538238, "kl": 0.031982421875, "learning_rate": 9.945702796199617e-07, "loss": 0.0013, "reward": 1.7554842233657837, "reward_std": 0.183892622590065, "rewards/accuracy_reward": 0.6086091995239258, "rewards/format_reward": 1.0, "step": 1544, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 357.234375, "epoch": 0.04698333536066172, "grad_norm": 1.2922284262832764, "kl": 0.032958984375, "learning_rate": 9.945632568038149e-07, "loss": 0.0013, "reward": 1.774712085723877, "reward_std": 0.18133237957954407, "rewards/accuracy_reward": 0.6372120380401611, "rewards/format_reward": 1.0, "step": 1545, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.40625, "epoch": 0.0470137452864615, "grad_norm": 0.9967112339669894, "kl": 0.03466796875, "learning_rate": 9.945562294737639e-07, "loss": 0.0014, "reward": 1.9337646961212158, "reward_std": 0.013676613569259644, "rewards/accuracy_reward": 0.7368895411491394, "rewards/format_reward": 1.0, "step": 1546, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 358.0, "epoch": 0.04704415521226128, "grad_norm": 1.312940533018157, "kl": 0.037353515625, "learning_rate": 9.945491976298734e-07, "loss": 0.0015, "reward": 1.8718750476837158, "reward_std": 0.07344460487365723, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1547, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 355.8125, "epoch": 0.04707456513806106, "grad_norm": 1.5552138544455676, "kl": 0.035400390625, "learning_rate": 9.945421612722069e-07, "loss": 0.0014, "reward": 1.6254463195800781, "reward_std": 0.22079315781593323, "rewards/accuracy_reward": 0.5223214626312256, "rewards/format_reward": 1.0, "step": 1548, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 386.578125, "epoch": 0.04710497506386085, "grad_norm": 0.8717021672417284, "kl": 0.03564453125, "learning_rate": 9.945351204008292e-07, "loss": 0.0014, "reward": 1.8744988441467285, "reward_std": 0.13295477628707886, "rewards/accuracy_reward": 0.7088736891746521, "rewards/format_reward": 1.0, "step": 1549, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.796875, "epoch": 0.04713538498966063, "grad_norm": 1.2833396405627877, "kl": 0.033447265625, "learning_rate": 9.945280750158044e-07, "loss": 0.0013, "reward": 1.9808827638626099, "reward_std": 0.17337007820606232, "rewards/accuracy_reward": 0.8027576804161072, "rewards/format_reward": 1.0, "step": 1550, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 362.796875, "epoch": 0.047165794915460406, "grad_norm": 1.8074974147216338, "kl": 0.03515625, "learning_rate": 9.945210251171966e-07, "loss": 0.0014, "reward": 1.885331392288208, "reward_std": 0.15647625923156738, "rewards/accuracy_reward": 0.729081392288208, "rewards/format_reward": 1.0, "step": 1551, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 357.0625, "epoch": 0.047196204841260185, "grad_norm": 1.244653719852763, "kl": 0.035400390625, "learning_rate": 9.945139707050705e-07, "loss": 0.0014, "reward": 1.7670929431915283, "reward_std": 0.0673820897936821, "rewards/accuracy_reward": 0.623342752456665, "rewards/format_reward": 1.0, "step": 1552, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 351.09375, "epoch": 0.04722661476705997, "grad_norm": 0.8202775806155235, "kl": 0.037109375, "learning_rate": 9.945069117794903e-07, "loss": 0.0015, "reward": 2.1208202838897705, "reward_std": 0.09091229736804962, "rewards/accuracy_reward": 0.930195152759552, "rewards/format_reward": 1.0, "step": 1553, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 356.046875, "epoch": 0.04725702469285975, "grad_norm": 3.1772189255974483, "kl": 0.033935546875, "learning_rate": 9.9449984834052e-07, "loss": 0.0014, "reward": 1.6708016395568848, "reward_std": 0.08724631369113922, "rewards/accuracy_reward": 0.5551766157150269, "rewards/format_reward": 1.0, "step": 1554, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 371.71875, "epoch": 0.04728743461865953, "grad_norm": 0.28866260078484823, "kl": 0.034423828125, "learning_rate": 9.944927803882249e-07, "loss": 0.0014, "reward": 2.1422150135040283, "reward_std": 0.011572758667171001, "rewards/accuracy_reward": 0.9484648704528809, "rewards/format_reward": 1.0, "step": 1555, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 362.328125, "epoch": 0.04731784454445931, "grad_norm": 1.3497709191765361, "kl": 0.037841796875, "learning_rate": 9.944857079226689e-07, "loss": 0.0015, "reward": 2.0602612495422363, "reward_std": 0.07537347078323364, "rewards/accuracy_reward": 0.8665112853050232, "rewards/format_reward": 1.0, "step": 1556, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 390.21875, "epoch": 0.047348254470259095, "grad_norm": 1.067983478909884, "kl": 0.03076171875, "learning_rate": 9.944786309439167e-07, "loss": 0.0012, "reward": 1.662770390510559, "reward_std": 0.07826808094978333, "rewards/accuracy_reward": 0.5565203428268433, "rewards/format_reward": 1.0, "step": 1557, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 353.625, "epoch": 0.047378664396058874, "grad_norm": 0.8628796162250926, "kl": 0.03662109375, "learning_rate": 9.94471549452033e-07, "loss": 0.0015, "reward": 1.9475369453430176, "reward_std": 0.04816683754324913, "rewards/accuracy_reward": 0.7819119095802307, "rewards/format_reward": 1.0, "step": 1558, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 411.703125, "epoch": 0.04740907432185865, "grad_norm": 1.912954806317993, "kl": 0.035888671875, "learning_rate": 9.944644634470825e-07, "loss": 0.0014, "reward": 1.7232180833816528, "reward_std": 0.1536242961883545, "rewards/accuracy_reward": 0.5950931310653687, "rewards/format_reward": 0.984375, "step": 1559, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 393.0625, "epoch": 0.04743948424765843, "grad_norm": 1.5214355237572437, "kl": 0.034423828125, "learning_rate": 9.944573729291296e-07, "loss": 0.0014, "reward": 1.5150384902954102, "reward_std": 0.1093657910823822, "rewards/accuracy_reward": 0.4025384485721588, "rewards/format_reward": 1.0, "step": 1560, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 358.21875, "epoch": 0.04746989417345822, "grad_norm": 0.7276105776498598, "kl": 0.03369140625, "learning_rate": 9.944502778982394e-07, "loss": 0.0013, "reward": 1.952887773513794, "reward_std": 0.012567606754601002, "rewards/accuracy_reward": 0.7810126543045044, "rewards/format_reward": 1.0, "step": 1561, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.578125, "epoch": 0.047500304099258, "grad_norm": 0.9841566008588063, "kl": 0.0341796875, "learning_rate": 9.944431783544762e-07, "loss": 0.0014, "reward": 1.8032851219177246, "reward_std": 0.10920310020446777, "rewards/accuracy_reward": 0.6657851934432983, "rewards/format_reward": 1.0, "step": 1562, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 364.21875, "epoch": 0.04753071402505778, "grad_norm": 1.1366244187951449, "kl": 0.035888671875, "learning_rate": 9.94436074297905e-07, "loss": 0.0014, "reward": 1.8437501192092896, "reward_std": 0.23605698347091675, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1563, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 346.796875, "epoch": 0.04756112395085756, "grad_norm": 2.506955296941956, "kl": 0.035888671875, "learning_rate": 9.944289657285905e-07, "loss": 0.0014, "reward": 1.674835205078125, "reward_std": 0.06749535351991653, "rewards/accuracy_reward": 0.5529600977897644, "rewards/format_reward": 1.0, "step": 1564, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 375.78125, "epoch": 0.04759153387665734, "grad_norm": 1.023928068461891, "kl": 0.0380859375, "learning_rate": 9.944218526465979e-07, "loss": 0.0015, "reward": 1.7856860160827637, "reward_std": 0.06369657814502716, "rewards/accuracy_reward": 0.6575610637664795, "rewards/format_reward": 0.984375, "step": 1565, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 378.359375, "epoch": 0.04762194380245712, "grad_norm": 1.1274449806554883, "kl": 0.0341796875, "learning_rate": 9.94414735051992e-07, "loss": 0.0014, "reward": 1.6358880996704102, "reward_std": 0.09547819942235947, "rewards/accuracy_reward": 0.4921380281448364, "rewards/format_reward": 1.0, "step": 1566, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 341.65625, "epoch": 0.0476523537282569, "grad_norm": 0.9240323697666751, "kl": 0.0361328125, "learning_rate": 9.944076129448378e-07, "loss": 0.0014, "reward": 1.618647575378418, "reward_std": 0.10974804311990738, "rewards/accuracy_reward": 0.5155225992202759, "rewards/format_reward": 1.0, "step": 1567, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 371.734375, "epoch": 0.04768276365405669, "grad_norm": 0.9265472677305572, "kl": 0.0341796875, "learning_rate": 9.944004863252001e-07, "loss": 0.0014, "reward": 1.7358182668685913, "reward_std": 0.09713871031999588, "rewards/accuracy_reward": 0.5795682668685913, "rewards/format_reward": 1.0, "step": 1568, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 381.9375, "epoch": 0.047713173579856466, "grad_norm": 1.7176708853116256, "kl": 0.037353515625, "learning_rate": 9.94393355193144e-07, "loss": 0.0015, "reward": 1.582282304763794, "reward_std": 0.08384598791599274, "rewards/accuracy_reward": 0.4791572093963623, "rewards/format_reward": 1.0, "step": 1569, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 371.984375, "epoch": 0.047743583505656245, "grad_norm": 2.4280290319859237, "kl": 0.042724609375, "learning_rate": 9.943862195487347e-07, "loss": 0.0017, "reward": 1.8999028205871582, "reward_std": 0.05048826336860657, "rewards/accuracy_reward": 0.7186528444290161, "rewards/format_reward": 1.0, "step": 1570, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 357.796875, "epoch": 0.047773993431456024, "grad_norm": 1.240472439429232, "kl": 0.039306640625, "learning_rate": 9.943790793920373e-07, "loss": 0.0016, "reward": 1.9381531476974487, "reward_std": 0.04622669890522957, "rewards/accuracy_reward": 0.794403076171875, "rewards/format_reward": 1.0, "step": 1571, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 392.9375, "epoch": 0.04780440335725581, "grad_norm": 1.096641580971876, "kl": 0.0322265625, "learning_rate": 9.94371934723117e-07, "loss": 0.0013, "reward": 1.6043202877044678, "reward_std": 0.08812005817890167, "rewards/accuracy_reward": 0.473070353269577, "rewards/format_reward": 1.0, "step": 1572, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 350.0, "epoch": 0.04783481328305559, "grad_norm": 0.06746722896096136, "kl": 0.036865234375, "learning_rate": 9.943647855420388e-07, "loss": 0.0015, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1573, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 382.609375, "epoch": 0.04786522320885537, "grad_norm": 0.8781529264859942, "kl": 0.033935546875, "learning_rate": 9.943576318488682e-07, "loss": 0.0014, "reward": 2.0207979679107666, "reward_std": 0.12056371569633484, "rewards/accuracy_reward": 0.8364229798316956, "rewards/format_reward": 1.0, "step": 1574, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 343.171875, "epoch": 0.04789563313465515, "grad_norm": 0.7497062115884768, "kl": 0.0361328125, "learning_rate": 9.943504736436705e-07, "loss": 0.0014, "reward": 2.133333206176758, "reward_std": 0.06134308874607086, "rewards/accuracy_reward": 0.9427083730697632, "rewards/format_reward": 1.0, "step": 1575, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 361.265625, "epoch": 0.047926043060454934, "grad_norm": 1.7256290312998543, "kl": 0.037109375, "learning_rate": 9.943433109265106e-07, "loss": 0.0015, "reward": 1.874631643295288, "reward_std": 0.025582769885659218, "rewards/accuracy_reward": 0.6996316313743591, "rewards/format_reward": 1.0, "step": 1576, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 380.953125, "epoch": 0.04795645298625471, "grad_norm": 1.1265997050732721, "kl": 0.032470703125, "learning_rate": 9.943361436974546e-07, "loss": 0.0013, "reward": 1.8008129596710205, "reward_std": 0.18737894296646118, "rewards/accuracy_reward": 0.6414377689361572, "rewards/format_reward": 1.0, "step": 1577, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 368.765625, "epoch": 0.04798686291205449, "grad_norm": 0.6464948738567164, "kl": 0.046875, "learning_rate": 9.943289719565673e-07, "loss": 0.0019, "reward": 1.6754918098449707, "reward_std": 0.00784970447421074, "rewards/accuracy_reward": 0.5504918694496155, "rewards/format_reward": 1.0, "step": 1578, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 353.84375, "epoch": 0.04801727283785428, "grad_norm": 0.5136057368711837, "kl": 0.034423828125, "learning_rate": 9.943217957039146e-07, "loss": 0.0014, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 1579, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.171875, "epoch": 0.04804768276365406, "grad_norm": 2.5784092896690325, "kl": 0.034423828125, "learning_rate": 9.943146149395616e-07, "loss": 0.0014, "reward": 2.0359914302825928, "reward_std": 0.14440687000751495, "rewards/accuracy_reward": 0.8484913110733032, "rewards/format_reward": 1.0, "step": 1580, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 404.9375, "epoch": 0.04807809268945384, "grad_norm": 0.6828462339522391, "kl": 0.0306396484375, "learning_rate": 9.943074296635738e-07, "loss": 0.0012, "reward": 1.7997803688049316, "reward_std": 0.15681302547454834, "rewards/accuracy_reward": 0.6560302972793579, "rewards/format_reward": 1.0, "step": 1581, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 354.921875, "epoch": 0.048108502615253616, "grad_norm": 1.4667237569274554, "kl": 0.039794921875, "learning_rate": 9.943002398760174e-07, "loss": 0.0016, "reward": 1.7875001430511475, "reward_std": 0.14961488544940948, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1582, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 371.515625, "epoch": 0.0481389125410534, "grad_norm": 0.6238793384298421, "kl": 0.0380859375, "learning_rate": 9.942930455769574e-07, "loss": 0.0015, "reward": 2.0402345657348633, "reward_std": 0.008086705580353737, "rewards/accuracy_reward": 0.865234375, "rewards/format_reward": 1.0, "step": 1583, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 347.96875, "epoch": 0.04816932246685318, "grad_norm": 0.6221197383175237, "kl": 0.04150390625, "learning_rate": 9.942858467664596e-07, "loss": 0.0017, "reward": 1.806249976158142, "reward_std": 0.10079033672809601, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 1584, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 405.640625, "epoch": 0.04819973239265296, "grad_norm": 1.031585548205416, "kl": 0.038330078125, "learning_rate": 9.9427864344459e-07, "loss": 0.0015, "reward": 1.6910154819488525, "reward_std": 0.14263184368610382, "rewards/accuracy_reward": 0.5910155177116394, "rewards/format_reward": 0.96875, "step": 1585, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 366.890625, "epoch": 0.04823014231845274, "grad_norm": 1.223275608439595, "kl": 0.037109375, "learning_rate": 9.942714356114142e-07, "loss": 0.0015, "reward": 1.6380391120910645, "reward_std": 0.17795586585998535, "rewards/accuracy_reward": 0.5099140405654907, "rewards/format_reward": 1.0, "step": 1586, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 368.328125, "epoch": 0.048260552244252526, "grad_norm": 0.6520116864443368, "kl": 0.0322265625, "learning_rate": 9.942642232669976e-07, "loss": 0.0013, "reward": 2.113842248916626, "reward_std": 0.07689541578292847, "rewards/accuracy_reward": 0.926342248916626, "rewards/format_reward": 1.0, "step": 1587, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 357.46875, "epoch": 0.048290962170052305, "grad_norm": 1.1704457966371444, "kl": 0.034912109375, "learning_rate": 9.942570064114065e-07, "loss": 0.0014, "reward": 1.9099608659744263, "reward_std": 0.08057739585638046, "rewards/accuracy_reward": 0.7568359375, "rewards/format_reward": 1.0, "step": 1588, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 363.390625, "epoch": 0.048321372095852085, "grad_norm": 0.8072791272124485, "kl": 0.033447265625, "learning_rate": 9.942497850447066e-07, "loss": 0.0013, "reward": 1.8170771598815918, "reward_std": 0.11260746419429779, "rewards/accuracy_reward": 0.6733270883560181, "rewards/format_reward": 1.0, "step": 1589, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 363.109375, "epoch": 0.048351782021651864, "grad_norm": 0.880259150837379, "kl": 0.033447265625, "learning_rate": 9.942425591669638e-07, "loss": 0.0013, "reward": 1.975000023841858, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1590, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 388.078125, "epoch": 0.04838219194745165, "grad_norm": 1.5455032945479896, "kl": 0.0322265625, "learning_rate": 9.942353287782443e-07, "loss": 0.0013, "reward": 1.6080572605133057, "reward_std": 0.10116233676671982, "rewards/accuracy_reward": 0.4893072843551636, "rewards/format_reward": 1.0, "step": 1591, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.796875, "epoch": 0.04841260187325143, "grad_norm": 1.5330375534733567, "kl": 0.041015625, "learning_rate": 9.942280938786136e-07, "loss": 0.0016, "reward": 1.827918529510498, "reward_std": 0.075978122651577, "rewards/accuracy_reward": 0.693543553352356, "rewards/format_reward": 1.0, "step": 1592, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 351.0, "epoch": 0.04844301179905121, "grad_norm": 1.873313210746184, "kl": 0.0390625, "learning_rate": 9.942208544681382e-07, "loss": 0.0016, "reward": 1.9183484315872192, "reward_std": 0.03680071234703064, "rewards/accuracy_reward": 0.7558484077453613, "rewards/format_reward": 1.0, "step": 1593, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.40625, "epoch": 0.048473421724850994, "grad_norm": 1.2466522950877232, "kl": 0.03173828125, "learning_rate": 9.942136105468836e-07, "loss": 0.0013, "reward": 1.8782905340194702, "reward_std": 0.06371896713972092, "rewards/accuracy_reward": 0.7095404863357544, "rewards/format_reward": 1.0, "step": 1594, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 345.734375, "epoch": 0.048503831650650774, "grad_norm": 1.0127522486589493, "kl": 0.032958984375, "learning_rate": 9.942063621149167e-07, "loss": 0.0013, "reward": 1.6687500476837158, "reward_std": 0.09785604476928711, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 1595, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 352.703125, "epoch": 0.04853424157645055, "grad_norm": 0.5239190601959732, "kl": 0.029541015625, "learning_rate": 9.94199109172303e-07, "loss": 0.0012, "reward": 1.8967673778533936, "reward_std": 0.0019953041337430477, "rewards/accuracy_reward": 0.7467672228813171, "rewards/format_reward": 1.0, "step": 1596, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 353.78125, "epoch": 0.04856465150225033, "grad_norm": 0.5223022458695523, "kl": 0.03271484375, "learning_rate": 9.94191851719109e-07, "loss": 0.0013, "reward": 1.8195064067840576, "reward_std": 0.003923232667148113, "rewards/accuracy_reward": 0.669506311416626, "rewards/format_reward": 1.0, "step": 1597, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 407.46875, "epoch": 0.04859506142805012, "grad_norm": 1.0477238042492734, "kl": 0.0311279296875, "learning_rate": 9.94184589755401e-07, "loss": 0.0012, "reward": 1.7822928428649902, "reward_std": 0.20271551609039307, "rewards/accuracy_reward": 0.6729178428649902, "rewards/format_reward": 0.984375, "step": 1598, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 355.203125, "epoch": 0.0486254713538499, "grad_norm": 3.0963659139314945, "kl": 0.033203125, "learning_rate": 9.94177323281245e-07, "loss": 0.0013, "reward": 2.0595688819885254, "reward_std": 0.0841984823346138, "rewards/accuracy_reward": 0.8720687627792358, "rewards/format_reward": 1.0, "step": 1599, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 353.640625, "epoch": 0.048655881279649676, "grad_norm": 2.478039505589137, "kl": 0.030029296875, "learning_rate": 9.941700522967075e-07, "loss": 0.0012, "reward": 1.7853107452392578, "reward_std": 0.07803639024496078, "rewards/accuracy_reward": 0.6415607929229736, "rewards/format_reward": 1.0, "step": 1600, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 355.859375, "epoch": 0.048686291205449456, "grad_norm": 1.3484416111241089, "kl": 0.03955078125, "learning_rate": 9.94162776801855e-07, "loss": 0.0016, "reward": 2.017160415649414, "reward_std": 0.03981923684477806, "rewards/accuracy_reward": 0.829660177230835, "rewards/format_reward": 1.0, "step": 1601, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 353.4375, "epoch": 0.04871670113124924, "grad_norm": 3.7241698901285076, "kl": 0.03955078125, "learning_rate": 9.941554967967536e-07, "loss": 0.0016, "reward": 1.8958606719970703, "reward_std": 0.16589069366455078, "rewards/accuracy_reward": 0.7396107316017151, "rewards/format_reward": 1.0, "step": 1602, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 391.5625, "epoch": 0.04874711105704902, "grad_norm": 1.002616647619633, "kl": 0.0302734375, "learning_rate": 9.9414821228147e-07, "loss": 0.0012, "reward": 1.8509879112243652, "reward_std": 0.06839513778686523, "rewards/accuracy_reward": 0.697862982749939, "rewards/format_reward": 1.0, "step": 1603, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 360.0625, "epoch": 0.0487775209828488, "grad_norm": 5.538458759235147, "kl": 0.0303955078125, "learning_rate": 9.941409232560706e-07, "loss": 0.0012, "reward": 2.104374885559082, "reward_std": 0.10199768841266632, "rewards/accuracy_reward": 0.9168750047683716, "rewards/format_reward": 1.0, "step": 1604, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.875, "epoch": 0.048807930908648586, "grad_norm": 4.299657580781597, "kl": 0.032958984375, "learning_rate": 9.941336297206218e-07, "loss": 0.0013, "reward": 1.7654898166656494, "reward_std": 0.035659000277519226, "rewards/accuracy_reward": 0.6248648166656494, "rewards/format_reward": 1.0, "step": 1605, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 421.078125, "epoch": 0.048838340834448365, "grad_norm": 1.3709310345231727, "kl": 0.028564453125, "learning_rate": 9.941263316751903e-07, "loss": 0.0011, "reward": 1.5403196811676025, "reward_std": 0.16956695914268494, "rewards/accuracy_reward": 0.4246945381164551, "rewards/format_reward": 1.0, "step": 1606, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 377.8125, "epoch": 0.048868750760248145, "grad_norm": 1.2575989970300037, "kl": 0.0439453125, "learning_rate": 9.941190291198427e-07, "loss": 0.0018, "reward": 1.6765680313110352, "reward_std": 0.1317962110042572, "rewards/accuracy_reward": 0.5546929836273193, "rewards/format_reward": 1.0, "step": 1607, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 356.9375, "epoch": 0.048899160686047924, "grad_norm": 0.5253648932133926, "kl": 0.036376953125, "learning_rate": 9.941117220546456e-07, "loss": 0.0015, "reward": 2.0561275482177734, "reward_std": 0.00507401255890727, "rewards/accuracy_reward": 0.8811274766921997, "rewards/format_reward": 1.0, "step": 1608, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 381.171875, "epoch": 0.04892957061184771, "grad_norm": 1.2907963887867246, "kl": 0.028076171875, "learning_rate": 9.94104410479666e-07, "loss": 0.0011, "reward": 1.7683851718902588, "reward_std": 0.1850762516260147, "rewards/accuracy_reward": 0.6340101957321167, "rewards/format_reward": 1.0, "step": 1609, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 390.25, "epoch": 0.04895998053764749, "grad_norm": 1.048965739826735, "kl": 0.035888671875, "learning_rate": 9.9409709439497e-07, "loss": 0.0014, "reward": 1.8398423194885254, "reward_std": 0.06410342454910278, "rewards/accuracy_reward": 0.6710922122001648, "rewards/format_reward": 1.0, "step": 1610, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.390625, "epoch": 0.04899039046344727, "grad_norm": 1.4907542786223664, "kl": 0.03271484375, "learning_rate": 9.940897738006249e-07, "loss": 0.0013, "reward": 2.0062501430511475, "reward_std": 0.1197132021188736, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1611, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 364.640625, "epoch": 0.04902080038924705, "grad_norm": 1.1426833643894796, "kl": 0.04150390625, "learning_rate": 9.940824486966973e-07, "loss": 0.0017, "reward": 1.9526190757751465, "reward_std": 0.01772783324122429, "rewards/accuracy_reward": 0.7807440161705017, "rewards/format_reward": 1.0, "step": 1612, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 382.125, "epoch": 0.049051210315046834, "grad_norm": 1.06352829898061, "kl": 0.03662109375, "learning_rate": 9.940751190832542e-07, "loss": 0.0015, "reward": 1.483259916305542, "reward_std": 0.04168295860290527, "rewards/accuracy_reward": 0.36763477325439453, "rewards/format_reward": 1.0, "step": 1613, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 369.609375, "epoch": 0.04908162024084661, "grad_norm": 0.8671115988600048, "kl": 0.0361328125, "learning_rate": 9.940677849603623e-07, "loss": 0.0014, "reward": 2.078125, "reward_std": 0.1436041295528412, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1614, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 366.0625, "epoch": 0.04911203016664639, "grad_norm": 1.4189014494137373, "kl": 0.03759765625, "learning_rate": 9.940604463280887e-07, "loss": 0.0015, "reward": 2.071859359741211, "reward_std": 0.17294278740882874, "rewards/accuracy_reward": 0.893734335899353, "rewards/format_reward": 1.0, "step": 1615, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 385.375, "epoch": 0.04914244009244617, "grad_norm": 1.4834780542250774, "kl": 0.0419921875, "learning_rate": 9.940531031865003e-07, "loss": 0.0017, "reward": 1.362673282623291, "reward_std": 0.05739261955022812, "rewards/accuracy_reward": 0.2564231753349304, "rewards/format_reward": 1.0, "step": 1616, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 363.296875, "epoch": 0.04917285001824596, "grad_norm": 0.4688582738854757, "kl": 0.033447265625, "learning_rate": 9.940457555356643e-07, "loss": 0.0013, "reward": 2.110405445098877, "reward_std": 0.008229927159845829, "rewards/accuracy_reward": 0.9104055166244507, "rewards/format_reward": 1.0, "step": 1617, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.5625, "epoch": 0.04920325994404574, "grad_norm": 1.7435215978858059, "kl": 0.048828125, "learning_rate": 9.940384033756475e-07, "loss": 0.002, "reward": 2.054636001586914, "reward_std": 0.1404547244310379, "rewards/accuracy_reward": 0.8671360611915588, "rewards/format_reward": 1.0, "step": 1618, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.546875, "epoch": 0.049233669869845516, "grad_norm": 0.7308266162859263, "kl": 0.03515625, "learning_rate": 9.940310467065168e-07, "loss": 0.0014, "reward": 1.9130616188049316, "reward_std": 0.08688535541296005, "rewards/accuracy_reward": 0.7568116188049316, "rewards/format_reward": 1.0, "step": 1619, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.0, "epoch": 0.0492640797956453, "grad_norm": 0.8249601837489324, "kl": 0.0284423828125, "learning_rate": 9.940236855283402e-07, "loss": 0.0011, "reward": 2.0044503211975098, "reward_std": 0.09639055281877518, "rewards/accuracy_reward": 0.8138253092765808, "rewards/format_reward": 1.0, "step": 1620, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 426.625, "epoch": 0.04929448972144508, "grad_norm": 0.48947654096508736, "kl": 0.0286865234375, "learning_rate": 9.940163198411841e-07, "loss": 0.0011, "reward": 1.6551868915557861, "reward_std": 0.009914111346006393, "rewards/accuracy_reward": 0.5333119034767151, "rewards/format_reward": 1.0, "step": 1621, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 380.21875, "epoch": 0.04932489964724486, "grad_norm": 1.0236904942326734, "kl": 0.030517578125, "learning_rate": 9.940089496451158e-07, "loss": 0.0012, "reward": 1.7944661378860474, "reward_std": 0.1585567742586136, "rewards/accuracy_reward": 0.6444660425186157, "rewards/format_reward": 1.0, "step": 1622, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 393.09375, "epoch": 0.04935530957304464, "grad_norm": 1.3957201955326277, "kl": 0.044189453125, "learning_rate": 9.94001574940203e-07, "loss": 0.0018, "reward": 1.7461063861846924, "reward_std": 0.03529345989227295, "rewards/accuracy_reward": 0.5773564577102661, "rewards/format_reward": 1.0, "step": 1623, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.65625, "epoch": 0.049385719498844426, "grad_norm": 1.6625290436694786, "kl": 0.040771484375, "learning_rate": 9.939941957265126e-07, "loss": 0.0016, "reward": 1.807074785232544, "reward_std": 0.1034763753414154, "rewards/accuracy_reward": 0.6383247971534729, "rewards/format_reward": 1.0, "step": 1624, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 369.71875, "epoch": 0.049416129424644205, "grad_norm": 4.565557020090409, "kl": 0.03564453125, "learning_rate": 9.93986812004112e-07, "loss": 0.0014, "reward": 1.9766405820846558, "reward_std": 0.08576418459415436, "rewards/accuracy_reward": 0.8016405701637268, "rewards/format_reward": 1.0, "step": 1625, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 350.890625, "epoch": 0.049446539350443984, "grad_norm": 0.7262923651673935, "kl": 0.037109375, "learning_rate": 9.939794237730691e-07, "loss": 0.0015, "reward": 1.9906251430511475, "reward_std": 0.08647121489048004, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1626, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 366.59375, "epoch": 0.04947694927624376, "grad_norm": 0.8330422454538527, "kl": 0.0308837890625, "learning_rate": 9.939720310334508e-07, "loss": 0.0012, "reward": 2.1593751907348633, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 1627, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 381.984375, "epoch": 0.04950735920204355, "grad_norm": 1.1228799183154856, "kl": 0.03564453125, "learning_rate": 9.939646337853245e-07, "loss": 0.0014, "reward": 1.8919830322265625, "reward_std": 0.025667762383818626, "rewards/accuracy_reward": 0.72635817527771, "rewards/format_reward": 1.0, "step": 1628, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.453125, "epoch": 0.04953776912784333, "grad_norm": 1.131050375483854, "kl": 0.03564453125, "learning_rate": 9.939572320287578e-07, "loss": 0.0014, "reward": 1.849438190460205, "reward_std": 0.05381552875041962, "rewards/accuracy_reward": 0.680688202381134, "rewards/format_reward": 1.0, "step": 1629, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 351.296875, "epoch": 0.04956817905364311, "grad_norm": 0.8007424778596781, "kl": 0.035888671875, "learning_rate": 9.939498257638187e-07, "loss": 0.0014, "reward": 1.8370535373687744, "reward_std": 0.09386977553367615, "rewards/accuracy_reward": 0.6964285969734192, "rewards/format_reward": 1.0, "step": 1630, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 357.984375, "epoch": 0.04959858897944289, "grad_norm": 1.298788173921301, "kl": 0.0322265625, "learning_rate": 9.939424149905742e-07, "loss": 0.0013, "reward": 1.822864055633545, "reward_std": 0.15720722079277039, "rewards/accuracy_reward": 0.6791141033172607, "rewards/format_reward": 1.0, "step": 1631, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 347.5625, "epoch": 0.04962899890524267, "grad_norm": 0.5766538041005346, "kl": 0.0380859375, "learning_rate": 9.939349997090925e-07, "loss": 0.0015, "reward": 1.9670263528823853, "reward_std": 0.009457984007894993, "rewards/accuracy_reward": 0.7920262813568115, "rewards/format_reward": 1.0, "step": 1632, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 346.890625, "epoch": 0.04965940883104245, "grad_norm": 0.9738371822219221, "kl": 0.041259765625, "learning_rate": 9.939275799194406e-07, "loss": 0.0017, "reward": 1.9765625, "reward_std": 0.060398709028959274, "rewards/accuracy_reward": 0.8046875, "rewards/format_reward": 1.0, "step": 1633, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 383.84375, "epoch": 0.04968981875684223, "grad_norm": 2.8621775161227014, "kl": 0.032958984375, "learning_rate": 9.93920155621687e-07, "loss": 0.0013, "reward": 1.8008538484573364, "reward_std": 0.05842326208949089, "rewards/accuracy_reward": 0.6102288365364075, "rewards/format_reward": 1.0, "step": 1634, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 332.28125, "epoch": 0.04972022868264202, "grad_norm": 1.5371610531744304, "kl": 0.04443359375, "learning_rate": 9.939127268158987e-07, "loss": 0.0018, "reward": 1.8770833015441895, "reward_std": 0.17122013866901398, "rewards/accuracy_reward": 0.7395833730697632, "rewards/format_reward": 1.0, "step": 1635, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 342.03125, "epoch": 0.0497506386084418, "grad_norm": 1.5899053902015894, "kl": 0.033935546875, "learning_rate": 9.939052935021442e-07, "loss": 0.0014, "reward": 2.01462459564209, "reward_std": 0.10628823935985565, "rewards/accuracy_reward": 0.8396245241165161, "rewards/format_reward": 1.0, "step": 1636, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 379.453125, "epoch": 0.049781048534241576, "grad_norm": 1.1009704959983944, "kl": 0.031494140625, "learning_rate": 9.938978556804907e-07, "loss": 0.0013, "reward": 1.9638290405273438, "reward_std": 0.03070160374045372, "rewards/accuracy_reward": 0.7950790524482727, "rewards/format_reward": 1.0, "step": 1637, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 355.875, "epoch": 0.049811458460041355, "grad_norm": 0.30376802399305636, "kl": 0.03369140625, "learning_rate": 9.938904133510066e-07, "loss": 0.0014, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1638, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 365.359375, "epoch": 0.04984186838584114, "grad_norm": 1.1232362296676783, "kl": 0.029541015625, "learning_rate": 9.938829665137595e-07, "loss": 0.0012, "reward": 2.049867868423462, "reward_std": 0.1196543425321579, "rewards/accuracy_reward": 0.8686177730560303, "rewards/format_reward": 1.0, "step": 1639, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 393.546875, "epoch": 0.04987227831164092, "grad_norm": 0.6126305221586202, "kl": 0.033447265625, "learning_rate": 9.938755151688176e-07, "loss": 0.0013, "reward": 1.6714158058166504, "reward_std": 0.14979159832000732, "rewards/accuracy_reward": 0.5526658296585083, "rewards/format_reward": 1.0, "step": 1640, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 384.921875, "epoch": 0.0499026882374407, "grad_norm": 0.6966865177064817, "kl": 0.03564453125, "learning_rate": 9.938680593162488e-07, "loss": 0.0014, "reward": 1.8735941648483276, "reward_std": 0.13001015782356262, "rewards/accuracy_reward": 0.7267191410064697, "rewards/format_reward": 1.0, "step": 1641, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 351.640625, "epoch": 0.04993309816324048, "grad_norm": 0.8710328699575767, "kl": 0.04150390625, "learning_rate": 9.938605989561212e-07, "loss": 0.0017, "reward": 1.839176893234253, "reward_std": 0.07993703335523605, "rewards/accuracy_reward": 0.6985518336296082, "rewards/format_reward": 1.0, "step": 1642, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 354.046875, "epoch": 0.049963508089040265, "grad_norm": 0.3330535540502741, "kl": 0.0301513671875, "learning_rate": 9.938531340885028e-07, "loss": 0.0012, "reward": 1.8765519857406616, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.7109269499778748, "rewards/format_reward": 1.0, "step": 1643, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 355.640625, "epoch": 0.049993918014840044, "grad_norm": 4.889179501029402, "kl": 0.033935546875, "learning_rate": 9.938456647134617e-07, "loss": 0.0014, "reward": 1.636674165725708, "reward_std": 0.2371462881565094, "rewards/accuracy_reward": 0.5179240703582764, "rewards/format_reward": 1.0, "step": 1644, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.5625, "epoch": 0.05002432794063982, "grad_norm": 4.442556743116582, "kl": 0.036376953125, "learning_rate": 9.938381908310663e-07, "loss": 0.0015, "reward": 1.79168701171875, "reward_std": 0.0852556899189949, "rewards/accuracy_reward": 0.632311999797821, "rewards/format_reward": 1.0, "step": 1645, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 381.265625, "epoch": 0.0500547378664396, "grad_norm": 1.340102870247926, "kl": 0.038818359375, "learning_rate": 9.938307124413846e-07, "loss": 0.0016, "reward": 1.8142801523208618, "reward_std": 0.06440731137990952, "rewards/accuracy_reward": 0.673655092716217, "rewards/format_reward": 1.0, "step": 1646, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 406.9375, "epoch": 0.05008514779223939, "grad_norm": 1.039049590517804, "kl": 0.032958984375, "learning_rate": 9.938232295444849e-07, "loss": 0.0013, "reward": 1.477534532546997, "reward_std": 0.23548895120620728, "rewards/accuracy_reward": 0.3775344491004944, "rewards/format_reward": 1.0, "step": 1647, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 379.625, "epoch": 0.05011555771803917, "grad_norm": 0.8045936546376954, "kl": 0.0294189453125, "learning_rate": 9.938157421404354e-07, "loss": 0.0012, "reward": 1.8295786380767822, "reward_std": 0.09341376274824142, "rewards/accuracy_reward": 0.6889535188674927, "rewards/format_reward": 1.0, "step": 1648, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 373.171875, "epoch": 0.05014596764383895, "grad_norm": 0.24953388076486008, "kl": 0.031494140625, "learning_rate": 9.938082502293045e-07, "loss": 0.0013, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1649, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 354.15625, "epoch": 0.05017637756963873, "grad_norm": 0.4183402418758457, "kl": 0.03515625, "learning_rate": 9.93800753811161e-07, "loss": 0.0014, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1650, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 371.375, "epoch": 0.05020678749543851, "grad_norm": 0.6121153856507449, "kl": 0.037353515625, "learning_rate": 9.937932528860727e-07, "loss": 0.0015, "reward": 1.9039881229400635, "reward_std": 0.0864611566066742, "rewards/accuracy_reward": 0.741487979888916, "rewards/format_reward": 1.0, "step": 1651, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 365.515625, "epoch": 0.05023719742123829, "grad_norm": 1.6587345494518946, "kl": 0.042236328125, "learning_rate": 9.937857474541083e-07, "loss": 0.0017, "reward": 1.5650074481964111, "reward_std": 0.0547744557261467, "rewards/accuracy_reward": 0.4431324601173401, "rewards/format_reward": 1.0, "step": 1652, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 361.40625, "epoch": 0.05026760734703807, "grad_norm": 1.0617490451283724, "kl": 0.036376953125, "learning_rate": 9.937782375153364e-07, "loss": 0.0015, "reward": 1.8342570066452026, "reward_std": 0.07604838907718658, "rewards/accuracy_reward": 0.6811319589614868, "rewards/format_reward": 1.0, "step": 1653, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 353.375, "epoch": 0.05029801727283786, "grad_norm": 0.8310994657658896, "kl": 0.043212890625, "learning_rate": 9.937707230698255e-07, "loss": 0.0017, "reward": 1.8482410907745361, "reward_std": 0.15723612904548645, "rewards/accuracy_reward": 0.701366126537323, "rewards/format_reward": 1.0, "step": 1654, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 373.09375, "epoch": 0.050328427198637636, "grad_norm": 3.6399266148591574, "kl": 0.037841796875, "learning_rate": 9.93763204117644e-07, "loss": 0.0015, "reward": 1.7966499328613281, "reward_std": 0.0966249331831932, "rewards/accuracy_reward": 0.6560249328613281, "rewards/format_reward": 1.0, "step": 1655, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 361.0, "epoch": 0.050358837124437415, "grad_norm": 1.178986597138529, "kl": 0.042236328125, "learning_rate": 9.937556806588606e-07, "loss": 0.0017, "reward": 1.697963833808899, "reward_std": 0.17579889297485352, "rewards/accuracy_reward": 0.5729637145996094, "rewards/format_reward": 1.0, "step": 1656, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 390.15625, "epoch": 0.050389247050237194, "grad_norm": 2.9622499482698816, "kl": 0.0380859375, "learning_rate": 9.937481526935443e-07, "loss": 0.0015, "reward": 1.8066219091415405, "reward_std": 0.09898646175861359, "rewards/accuracy_reward": 0.6566218733787537, "rewards/format_reward": 1.0, "step": 1657, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 374.109375, "epoch": 0.05041965697603698, "grad_norm": 1.473096996494932, "kl": 0.045166015625, "learning_rate": 9.937406202217635e-07, "loss": 0.0018, "reward": 1.7280230522155762, "reward_std": 0.1014149859547615, "rewards/accuracy_reward": 0.5936480760574341, "rewards/format_reward": 1.0, "step": 1658, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 365.328125, "epoch": 0.05045006690183676, "grad_norm": 0.9597011332437813, "kl": 0.0322265625, "learning_rate": 9.93733083243587e-07, "loss": 0.0013, "reward": 1.4885691404342651, "reward_std": 0.10112345963716507, "rewards/accuracy_reward": 0.37919408082962036, "rewards/format_reward": 1.0, "step": 1659, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 354.9375, "epoch": 0.05048047682763654, "grad_norm": 0.6986928189945019, "kl": 0.03515625, "learning_rate": 9.937255417590835e-07, "loss": 0.0014, "reward": 1.766253113746643, "reward_std": 0.01196515467017889, "rewards/accuracy_reward": 0.6193780899047852, "rewards/format_reward": 1.0, "step": 1660, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 390.90625, "epoch": 0.05051088675343632, "grad_norm": 3.511848636702321, "kl": 0.039794921875, "learning_rate": 9.937179957683218e-07, "loss": 0.0016, "reward": 1.614233136177063, "reward_std": 0.12450411170721054, "rewards/accuracy_reward": 0.4704830050468445, "rewards/format_reward": 1.0, "step": 1661, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 377.59375, "epoch": 0.050541296679236104, "grad_norm": 1.6828295610164712, "kl": 0.033935546875, "learning_rate": 9.937104452713711e-07, "loss": 0.0014, "reward": 1.6807746887207031, "reward_std": 0.20051831007003784, "rewards/accuracy_reward": 0.562024712562561, "rewards/format_reward": 1.0, "step": 1662, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 354.203125, "epoch": 0.050571706605035883, "grad_norm": 1.1319360294225356, "kl": 0.03759765625, "learning_rate": 9.937028902683e-07, "loss": 0.0015, "reward": 2.0091750621795654, "reward_std": 0.07335405051708221, "rewards/accuracy_reward": 0.8466750383377075, "rewards/format_reward": 1.0, "step": 1663, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 340.390625, "epoch": 0.05060211653083566, "grad_norm": 1.828628224142654, "kl": 0.039794921875, "learning_rate": 9.936953307591777e-07, "loss": 0.0016, "reward": 1.8635308742523193, "reward_std": 0.12882153689861298, "rewards/accuracy_reward": 0.7197808027267456, "rewards/format_reward": 1.0, "step": 1664, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 356.53125, "epoch": 0.05063252645663545, "grad_norm": 1.1458638914066333, "kl": 0.043212890625, "learning_rate": 9.93687766744073e-07, "loss": 0.0017, "reward": 1.7048448324203491, "reward_std": 0.11007291078567505, "rewards/accuracy_reward": 0.5798447132110596, "rewards/format_reward": 1.0, "step": 1665, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 352.078125, "epoch": 0.05066293638243523, "grad_norm": 1.4288198379279586, "kl": 0.03369140625, "learning_rate": 9.936801982230548e-07, "loss": 0.0013, "reward": 1.9037678241729736, "reward_std": 0.018663080409169197, "rewards/accuracy_reward": 0.7287677526473999, "rewards/format_reward": 1.0, "step": 1666, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 356.21875, "epoch": 0.05069334630823501, "grad_norm": 0.7863880348225815, "kl": 0.038330078125, "learning_rate": 9.936726251961925e-07, "loss": 0.0015, "reward": 2.039496660232544, "reward_std": 0.07071912288665771, "rewards/accuracy_reward": 0.8644964694976807, "rewards/format_reward": 1.0, "step": 1667, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 365.390625, "epoch": 0.050723756234034786, "grad_norm": 3.966981127952393, "kl": 0.038330078125, "learning_rate": 9.93665047663555e-07, "loss": 0.0015, "reward": 1.8164305686950684, "reward_std": 0.020390480756759644, "rewards/accuracy_reward": 0.6414304375648499, "rewards/format_reward": 1.0, "step": 1668, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 379.453125, "epoch": 0.05075416615983457, "grad_norm": 1.1533730851878448, "kl": 0.0269775390625, "learning_rate": 9.936574656252116e-07, "loss": 0.0011, "reward": 2.027372360229492, "reward_std": 0.09050028771162033, "rewards/accuracy_reward": 0.8429971933364868, "rewards/format_reward": 1.0, "step": 1669, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 342.671875, "epoch": 0.05078457608563435, "grad_norm": 2.2878501541360743, "kl": 0.037841796875, "learning_rate": 9.936498790812316e-07, "loss": 0.0015, "reward": 1.8312500715255737, "reward_std": 0.09080372750759125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 1670, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 385.671875, "epoch": 0.05081498601143413, "grad_norm": 1.1168035576399926, "kl": 0.034912109375, "learning_rate": 9.93642288031684e-07, "loss": 0.0014, "reward": 1.9249169826507568, "reward_std": 0.21349960565567017, "rewards/accuracy_reward": 0.7686669826507568, "rewards/format_reward": 1.0, "step": 1671, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 383.953125, "epoch": 0.05084539593723391, "grad_norm": 0.5943885498576946, "kl": 0.03515625, "learning_rate": 9.936346924766383e-07, "loss": 0.0014, "reward": 1.8754420280456543, "reward_std": 0.07013367861509323, "rewards/accuracy_reward": 0.709817111492157, "rewards/format_reward": 1.0, "step": 1672, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.984375, "epoch": 0.050875805863033696, "grad_norm": 1.0790132512070234, "kl": 0.03759765625, "learning_rate": 9.936270924161635e-07, "loss": 0.0015, "reward": 1.8200020790100098, "reward_std": 0.1979990005493164, "rewards/accuracy_reward": 0.7012521624565125, "rewards/format_reward": 0.984375, "step": 1673, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 370.265625, "epoch": 0.050906215788833475, "grad_norm": 0.9482184940203908, "kl": 0.040283203125, "learning_rate": 9.936194878503291e-07, "loss": 0.0016, "reward": 1.6085577011108398, "reward_std": 0.1292867809534073, "rewards/accuracy_reward": 0.49293267726898193, "rewards/format_reward": 1.0, "step": 1674, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.265625, "epoch": 0.050936625714633255, "grad_norm": 0.7133985073444051, "kl": 0.03271484375, "learning_rate": 9.936118787792049e-07, "loss": 0.0013, "reward": 1.9343750476837158, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 1675, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 362.71875, "epoch": 0.05096703564043304, "grad_norm": 0.3971615580548201, "kl": 0.03955078125, "learning_rate": 9.936042652028597e-07, "loss": 0.0016, "reward": 1.9452810287475586, "reward_std": 0.011235001496970654, "rewards/accuracy_reward": 0.7734060287475586, "rewards/format_reward": 1.0, "step": 1676, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 371.09375, "epoch": 0.05099744556623282, "grad_norm": 2.816785325438691, "kl": 0.0380859375, "learning_rate": 9.935966471213634e-07, "loss": 0.0015, "reward": 2.0082387924194336, "reward_std": 0.07206667214632034, "rewards/accuracy_reward": 0.8394886255264282, "rewards/format_reward": 1.0, "step": 1677, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 351.546875, "epoch": 0.0510278554920326, "grad_norm": 0.9377350789267656, "kl": 0.03662109375, "learning_rate": 9.935890245347856e-07, "loss": 0.0015, "reward": 1.9370989799499512, "reward_std": 0.08515331894159317, "rewards/accuracy_reward": 0.765224039554596, "rewards/format_reward": 1.0, "step": 1678, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 391.8125, "epoch": 0.05105826541783238, "grad_norm": 1.4591388121684274, "kl": 0.037841796875, "learning_rate": 9.935813974431957e-07, "loss": 0.0015, "reward": 1.8176357746124268, "reward_std": 0.18189215660095215, "rewards/accuracy_reward": 0.6832607388496399, "rewards/format_reward": 0.96875, "step": 1679, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 374.4375, "epoch": 0.051088675343632164, "grad_norm": 0.6936870528033658, "kl": 0.04345703125, "learning_rate": 9.935737658466631e-07, "loss": 0.0017, "reward": 1.5233813524246216, "reward_std": 0.05789122357964516, "rewards/accuracy_reward": 0.43900638818740845, "rewards/format_reward": 1.0, "step": 1680, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 352.65625, "epoch": 0.051119085269431944, "grad_norm": 0.9509372394399741, "kl": 0.0361328125, "learning_rate": 9.93566129745258e-07, "loss": 0.0014, "reward": 2.036555051803589, "reward_std": 0.01791897602379322, "rewards/accuracy_reward": 0.836555004119873, "rewards/format_reward": 1.0, "step": 1681, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 371.078125, "epoch": 0.05114949519523172, "grad_norm": 0.3951769342742871, "kl": 0.042724609375, "learning_rate": 9.935584891390495e-07, "loss": 0.0017, "reward": 1.791914463043213, "reward_std": 0.009663645178079605, "rewards/accuracy_reward": 0.660664439201355, "rewards/format_reward": 1.0, "step": 1682, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 385.203125, "epoch": 0.0511799051210315, "grad_norm": 1.3605734298930652, "kl": 0.04931640625, "learning_rate": 9.935508440281078e-07, "loss": 0.002, "reward": 1.6233460903167725, "reward_std": 0.11581914871931076, "rewards/accuracy_reward": 0.49834609031677246, "rewards/format_reward": 1.0, "step": 1683, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 364.796875, "epoch": 0.05121031504683129, "grad_norm": 2.890698984590165, "kl": 0.0294189453125, "learning_rate": 9.935431944125024e-07, "loss": 0.0012, "reward": 1.9088068008422852, "reward_std": 0.19097597897052765, "rewards/accuracy_reward": 0.7556818127632141, "rewards/format_reward": 1.0, "step": 1684, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.828125, "epoch": 0.05124072497263107, "grad_norm": 1.7211652321826534, "kl": 0.03955078125, "learning_rate": 9.935355402923033e-07, "loss": 0.0016, "reward": 1.7310891151428223, "reward_std": 0.1575307846069336, "rewards/accuracy_reward": 0.6373391151428223, "rewards/format_reward": 0.96875, "step": 1685, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 359.40625, "epoch": 0.05127113489843085, "grad_norm": 4.429784977710128, "kl": 0.0390625, "learning_rate": 9.935278816675802e-07, "loss": 0.0016, "reward": 2.050593852996826, "reward_std": 0.02281137928366661, "rewards/accuracy_reward": 0.8537189364433289, "rewards/format_reward": 1.0, "step": 1686, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 363.8125, "epoch": 0.051301544824230626, "grad_norm": 7.602922751970457, "kl": 0.03759765625, "learning_rate": 9.935202185384031e-07, "loss": 0.0015, "reward": 1.6680340766906738, "reward_std": 0.08603070676326752, "rewards/accuracy_reward": 0.533659040927887, "rewards/format_reward": 1.0, "step": 1687, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 353.09375, "epoch": 0.05133195475003041, "grad_norm": 1.5179480810969561, "kl": 0.036376953125, "learning_rate": 9.935125509048418e-07, "loss": 0.0015, "reward": 2.0531249046325684, "reward_std": 0.18494783341884613, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 1688, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 349.890625, "epoch": 0.05136236467583019, "grad_norm": 0.9065629458172986, "kl": 0.038330078125, "learning_rate": 9.935048787669665e-07, "loss": 0.0015, "reward": 1.7286990880966187, "reward_std": 0.09804330766201019, "rewards/accuracy_reward": 0.6036989688873291, "rewards/format_reward": 1.0, "step": 1689, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 354.921875, "epoch": 0.05139277460162997, "grad_norm": 1.52422660389779, "kl": 0.045654296875, "learning_rate": 9.93497202124847e-07, "loss": 0.0018, "reward": 1.7389616966247559, "reward_std": 0.05179315805435181, "rewards/accuracy_reward": 0.5827116966247559, "rewards/format_reward": 1.0, "step": 1690, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 345.796875, "epoch": 0.051423184527429756, "grad_norm": 1.070619639271041, "kl": 0.0361328125, "learning_rate": 9.934895209785536e-07, "loss": 0.0014, "reward": 2.0241079330444336, "reward_std": 0.07567884773015976, "rewards/accuracy_reward": 0.8366081118583679, "rewards/format_reward": 1.0, "step": 1691, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 354.640625, "epoch": 0.051453594453229536, "grad_norm": 1.5113803828379757, "kl": 0.03955078125, "learning_rate": 9.934818353281564e-07, "loss": 0.0016, "reward": 2.0800211429595947, "reward_std": 0.031228920444846153, "rewards/accuracy_reward": 0.8893961310386658, "rewards/format_reward": 1.0, "step": 1692, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.0, "epoch": 0.051484004379029315, "grad_norm": 1.5163073362737587, "kl": 0.032470703125, "learning_rate": 9.934741451737254e-07, "loss": 0.0013, "reward": 1.7957069873809814, "reward_std": 0.08248253166675568, "rewards/accuracy_reward": 0.6394569277763367, "rewards/format_reward": 1.0, "step": 1693, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 351.546875, "epoch": 0.051514414304829094, "grad_norm": 0.8653184565169834, "kl": 0.03466796875, "learning_rate": 9.934664505153308e-07, "loss": 0.0014, "reward": 2.0211806297302246, "reward_std": 0.03529752790927887, "rewards/accuracy_reward": 0.8430556058883667, "rewards/format_reward": 1.0, "step": 1694, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 400.484375, "epoch": 0.05154482423062888, "grad_norm": 0.7886583539947964, "kl": 0.036865234375, "learning_rate": 9.934587513530428e-07, "loss": 0.0015, "reward": 1.811471939086914, "reward_std": 0.09145061671733856, "rewards/accuracy_reward": 0.6614718437194824, "rewards/format_reward": 1.0, "step": 1695, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 338.28125, "epoch": 0.05157523415642866, "grad_norm": 0.9356359876134492, "kl": 0.0390625, "learning_rate": 9.934510476869315e-07, "loss": 0.0016, "reward": 1.938044548034668, "reward_std": 0.036843061447143555, "rewards/accuracy_reward": 0.7755445241928101, "rewards/format_reward": 1.0, "step": 1696, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.765625, "epoch": 0.05160564408222844, "grad_norm": 1.1611883627854689, "kl": 0.037841796875, "learning_rate": 9.934433395170677e-07, "loss": 0.0015, "reward": 1.7988560199737549, "reward_std": 0.18911992013454437, "rewards/accuracy_reward": 0.6551059484481812, "rewards/format_reward": 1.0, "step": 1697, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.921875, "epoch": 0.05163605400802822, "grad_norm": 0.9630382646793093, "kl": 0.0341796875, "learning_rate": 9.934356268435215e-07, "loss": 0.0014, "reward": 1.9741320610046387, "reward_std": 0.13312062621116638, "rewards/accuracy_reward": 0.8116319179534912, "rewards/format_reward": 1.0, "step": 1698, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 358.984375, "epoch": 0.051666463933828004, "grad_norm": 0.503210643045451, "kl": 0.034423828125, "learning_rate": 9.934279096663634e-07, "loss": 0.0014, "reward": 1.970725178718567, "reward_std": 0.004311002790927887, "rewards/accuracy_reward": 0.7957251667976379, "rewards/format_reward": 1.0, "step": 1699, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 368.15625, "epoch": 0.05169687385962778, "grad_norm": 0.8565713178714169, "kl": 0.04052734375, "learning_rate": 9.934201879856634e-07, "loss": 0.0016, "reward": 2.0955491065979004, "reward_std": 0.029428184032440186, "rewards/accuracy_reward": 0.9111740589141846, "rewards/format_reward": 1.0, "step": 1700, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 369.921875, "epoch": 0.05172728378542756, "grad_norm": 0.8591133475865472, "kl": 0.03662109375, "learning_rate": 9.934124618014925e-07, "loss": 0.0015, "reward": 1.446874976158142, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 1701, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 359.921875, "epoch": 0.05175769371122734, "grad_norm": 1.5590615335120794, "kl": 0.0400390625, "learning_rate": 9.93404731113921e-07, "loss": 0.0016, "reward": 2.0147361755371094, "reward_std": 0.038761723786592484, "rewards/accuracy_reward": 0.827235996723175, "rewards/format_reward": 1.0, "step": 1702, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 364.390625, "epoch": 0.05178810363702713, "grad_norm": 0.9501324925560295, "kl": 0.042236328125, "learning_rate": 9.933969959230195e-07, "loss": 0.0017, "reward": 1.7727097272872925, "reward_std": 0.215582937002182, "rewards/accuracy_reward": 0.6477096080780029, "rewards/format_reward": 1.0, "step": 1703, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 376.328125, "epoch": 0.05181851356282691, "grad_norm": 0.5988409479043928, "kl": 0.03857421875, "learning_rate": 9.933892562288584e-07, "loss": 0.0015, "reward": 1.9458026885986328, "reward_std": 0.1047879308462143, "rewards/accuracy_reward": 0.7833025455474854, "rewards/format_reward": 1.0, "step": 1704, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 356.203125, "epoch": 0.051848923488626686, "grad_norm": 1.265448084548528, "kl": 0.037109375, "learning_rate": 9.933815120315087e-07, "loss": 0.0015, "reward": 1.775796890258789, "reward_std": 0.1951175034046173, "rewards/accuracy_reward": 0.6351718306541443, "rewards/format_reward": 1.0, "step": 1705, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 346.34375, "epoch": 0.05187933341442647, "grad_norm": 1.0977140443811335, "kl": 0.04345703125, "learning_rate": 9.93373763331041e-07, "loss": 0.0017, "reward": 1.5187499523162842, "reward_std": 0.15782485902309418, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 1706, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 390.578125, "epoch": 0.05190974334022625, "grad_norm": 0.4317568777250948, "kl": 0.03662109375, "learning_rate": 9.933660101275257e-07, "loss": 0.0015, "reward": 1.497149109840393, "reward_std": 0.00949166901409626, "rewards/accuracy_reward": 0.40339910984039307, "rewards/format_reward": 1.0, "step": 1707, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.65625, "epoch": 0.05194015326602603, "grad_norm": 1.479935862634438, "kl": 0.039794921875, "learning_rate": 9.933582524210337e-07, "loss": 0.0016, "reward": 1.7873013019561768, "reward_std": 0.16633044183254242, "rewards/accuracy_reward": 0.6185513138771057, "rewards/format_reward": 1.0, "step": 1708, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 374.828125, "epoch": 0.05197056319182581, "grad_norm": 0.8580855559527292, "kl": 0.040771484375, "learning_rate": 9.933504902116361e-07, "loss": 0.0016, "reward": 1.97991144657135, "reward_std": 0.044846419245004654, "rewards/accuracy_reward": 0.8049113750457764, "rewards/format_reward": 1.0, "step": 1709, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 383.703125, "epoch": 0.052000973117625596, "grad_norm": 1.1836716505058824, "kl": 0.043212890625, "learning_rate": 9.933427234994034e-07, "loss": 0.0017, "reward": 1.8689963817596436, "reward_std": 0.08528483659029007, "rewards/accuracy_reward": 0.7221212387084961, "rewards/format_reward": 1.0, "step": 1710, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 398.21875, "epoch": 0.052031383043425375, "grad_norm": 1.3660456902660463, "kl": 0.041748046875, "learning_rate": 9.933349522844066e-07, "loss": 0.0017, "reward": 1.752043604850769, "reward_std": 0.1173986941576004, "rewards/accuracy_reward": 0.5895435810089111, "rewards/format_reward": 1.0, "step": 1711, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.984375, "epoch": 0.052061792969225154, "grad_norm": 1.9349183026458403, "kl": 0.0390625, "learning_rate": 9.933271765667167e-07, "loss": 0.0016, "reward": 1.799690842628479, "reward_std": 0.11679378896951675, "rewards/accuracy_reward": 0.643440842628479, "rewards/format_reward": 1.0, "step": 1712, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 362.28125, "epoch": 0.05209220289502493, "grad_norm": 1.0192001306401566, "kl": 0.0400390625, "learning_rate": 9.933193963464046e-07, "loss": 0.0016, "reward": 1.844870686531067, "reward_std": 0.19211235642433167, "rewards/accuracy_reward": 0.6917456388473511, "rewards/format_reward": 1.0, "step": 1713, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 383.21875, "epoch": 0.05212261282082472, "grad_norm": 0.9398283249237044, "kl": 0.040283203125, "learning_rate": 9.933116116235414e-07, "loss": 0.0016, "reward": 2.019998550415039, "reward_std": 0.06577799469232559, "rewards/accuracy_reward": 0.8293734192848206, "rewards/format_reward": 1.0, "step": 1714, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 368.15625, "epoch": 0.0521530227466245, "grad_norm": 0.929083364351505, "kl": 0.04052734375, "learning_rate": 9.93303822398198e-07, "loss": 0.0016, "reward": 2.0230250358581543, "reward_std": 0.028195075690746307, "rewards/accuracy_reward": 0.8292750716209412, "rewards/format_reward": 1.0, "step": 1715, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 396.75, "epoch": 0.05218343267242428, "grad_norm": 1.33212459144287, "kl": 0.04150390625, "learning_rate": 9.932960286704457e-07, "loss": 0.0017, "reward": 1.7906250953674316, "reward_std": 0.20484083890914917, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.953125, "step": 1716, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 340.9375, "epoch": 0.05221384259822406, "grad_norm": 1.4571616004143157, "kl": 0.0478515625, "learning_rate": 9.932882304403554e-07, "loss": 0.0019, "reward": 1.9562500715255737, "reward_std": 0.17180021107196808, "rewards/accuracy_reward": 0.7874999642372131, "rewards/format_reward": 1.0, "step": 1717, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 385.203125, "epoch": 0.05224425252402384, "grad_norm": 1.5984187397557628, "kl": 0.0380859375, "learning_rate": 9.932804277079983e-07, "loss": 0.0015, "reward": 1.8924496173858643, "reward_std": 0.16073644161224365, "rewards/accuracy_reward": 0.7486996650695801, "rewards/format_reward": 1.0, "step": 1718, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 351.421875, "epoch": 0.05227466244982362, "grad_norm": 0.5998310364904058, "kl": 0.042724609375, "learning_rate": 9.932726204734458e-07, "loss": 0.0017, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1719, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.890625, "epoch": 0.0523050723756234, "grad_norm": 1.777480766828006, "kl": 0.056396484375, "learning_rate": 9.93264808736769e-07, "loss": 0.0023, "reward": 2.00716495513916, "reward_std": 0.0291978120803833, "rewards/accuracy_reward": 0.8102896809577942, "rewards/format_reward": 1.0, "step": 1720, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 377.984375, "epoch": 0.05233548230142319, "grad_norm": 1.0364725589308517, "kl": 0.040771484375, "learning_rate": 9.932569924980394e-07, "loss": 0.0016, "reward": 1.6118416786193848, "reward_std": 0.09988240897655487, "rewards/accuracy_reward": 0.4930917024612427, "rewards/format_reward": 1.0, "step": 1721, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 393.9375, "epoch": 0.05236589222722297, "grad_norm": 3.5388999185689984, "kl": 0.04931640625, "learning_rate": 9.932491717573282e-07, "loss": 0.002, "reward": 1.7885299921035767, "reward_std": 0.06719963997602463, "rewards/accuracy_reward": 0.6041549444198608, "rewards/format_reward": 1.0, "step": 1722, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 397.109375, "epoch": 0.052396302153022746, "grad_norm": 0.8395984169324521, "kl": 0.04052734375, "learning_rate": 9.932413465147065e-07, "loss": 0.0016, "reward": 1.7591776847839355, "reward_std": 0.07111681252717972, "rewards/accuracy_reward": 0.6310526728630066, "rewards/format_reward": 1.0, "step": 1723, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 367.203125, "epoch": 0.052426712078822525, "grad_norm": 0.728277070326255, "kl": 0.045654296875, "learning_rate": 9.932335167702461e-07, "loss": 0.0018, "reward": 1.5781887769699097, "reward_std": 0.15444427728652954, "rewards/accuracy_reward": 0.4688137471675873, "rewards/format_reward": 1.0, "step": 1724, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 377.8125, "epoch": 0.05245712200462231, "grad_norm": 0.81287027871598, "kl": 0.040283203125, "learning_rate": 9.932256825240185e-07, "loss": 0.0016, "reward": 1.8304009437561035, "reward_std": 0.059511780738830566, "rewards/accuracy_reward": 0.6866510510444641, "rewards/format_reward": 1.0, "step": 1725, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 363.75, "epoch": 0.05248753193042209, "grad_norm": 0.7594044705066947, "kl": 0.045654296875, "learning_rate": 9.93217843776095e-07, "loss": 0.0018, "reward": 1.8916666507720947, "reward_std": 0.0824957937002182, "rewards/accuracy_reward": 0.7447916865348816, "rewards/format_reward": 1.0, "step": 1726, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 362.140625, "epoch": 0.05251794185622187, "grad_norm": 1.1798908039225517, "kl": 0.04345703125, "learning_rate": 9.932100005265472e-07, "loss": 0.0017, "reward": 1.8173259496688843, "reward_std": 0.09162682294845581, "rewards/accuracy_reward": 0.6517008543014526, "rewards/format_reward": 1.0, "step": 1727, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 364.515625, "epoch": 0.05254835178202165, "grad_norm": 0.8548169847406715, "kl": 0.04736328125, "learning_rate": 9.932021527754466e-07, "loss": 0.0019, "reward": 1.9537043571472168, "reward_std": 0.010699999518692493, "rewards/accuracy_reward": 0.781829297542572, "rewards/format_reward": 1.0, "step": 1728, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 371.15625, "epoch": 0.052578761707821435, "grad_norm": 0.6180866662946092, "kl": 0.051025390625, "learning_rate": 9.93194300522865e-07, "loss": 0.002, "reward": 1.6797244548797607, "reward_std": 0.019820820540189743, "rewards/accuracy_reward": 0.5609744191169739, "rewards/format_reward": 1.0, "step": 1729, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.078125, "epoch": 0.052609171633621214, "grad_norm": 0.6542545233984774, "kl": 0.04736328125, "learning_rate": 9.931864437688738e-07, "loss": 0.0019, "reward": 1.9587702751159668, "reward_std": 0.05685633420944214, "rewards/accuracy_reward": 0.7868952751159668, "rewards/format_reward": 1.0, "step": 1730, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.140625, "epoch": 0.05263958155942099, "grad_norm": 1.4650487754531858, "kl": 0.044189453125, "learning_rate": 9.93178582513545e-07, "loss": 0.0018, "reward": 1.7392561435699463, "reward_std": 0.11198993027210236, "rewards/accuracy_reward": 0.5861310958862305, "rewards/format_reward": 1.0, "step": 1731, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.34375, "epoch": 0.05266999148522077, "grad_norm": 1.058491516946553, "kl": 0.047607421875, "learning_rate": 9.931707167569503e-07, "loss": 0.0019, "reward": 1.8322917222976685, "reward_std": 0.16913866996765137, "rewards/accuracy_reward": 0.6822916269302368, "rewards/format_reward": 1.0, "step": 1732, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 357.015625, "epoch": 0.05270040141102056, "grad_norm": 3.739528004352349, "kl": 0.04345703125, "learning_rate": 9.931628464991616e-07, "loss": 0.0017, "reward": 1.9121160507202148, "reward_std": 0.028857268393039703, "rewards/accuracy_reward": 0.7246160507202148, "rewards/format_reward": 1.0, "step": 1733, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 398.828125, "epoch": 0.05273081133682034, "grad_norm": 1.1230772315853261, "kl": 0.043212890625, "learning_rate": 9.931549717402504e-07, "loss": 0.0017, "reward": 1.6522834300994873, "reward_std": 0.15697012841701508, "rewards/accuracy_reward": 0.5179083347320557, "rewards/format_reward": 1.0, "step": 1734, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 399.8125, "epoch": 0.05276122126262012, "grad_norm": 0.7357951914998079, "kl": 0.03271484375, "learning_rate": 9.931470924802887e-07, "loss": 0.0013, "reward": 1.8989508152008057, "reward_std": 0.09077323228120804, "rewards/accuracy_reward": 0.7364507913589478, "rewards/format_reward": 1.0, "step": 1735, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 354.890625, "epoch": 0.0527916311884199, "grad_norm": 0.5080282171341534, "kl": 0.037841796875, "learning_rate": 9.931392087193484e-07, "loss": 0.0015, "reward": 1.8555550575256348, "reward_std": 0.025247374549508095, "rewards/accuracy_reward": 0.7055552005767822, "rewards/format_reward": 1.0, "step": 1736, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 357.140625, "epoch": 0.05282204111421968, "grad_norm": 0.3716966480196633, "kl": 0.03515625, "learning_rate": 9.931313204575015e-07, "loss": 0.0014, "reward": 1.881250023841858, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 1737, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 393.71875, "epoch": 0.05285245104001946, "grad_norm": 2.6006393805487193, "kl": 0.034423828125, "learning_rate": 9.9312342769482e-07, "loss": 0.0014, "reward": 1.902111291885376, "reward_std": 0.1417084038257599, "rewards/accuracy_reward": 0.7396112680435181, "rewards/format_reward": 1.0, "step": 1738, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 411.484375, "epoch": 0.05288286096581924, "grad_norm": 0.6575456730527551, "kl": 0.0303955078125, "learning_rate": 9.931155304313762e-07, "loss": 0.0012, "reward": 1.7260732650756836, "reward_std": 0.22020688652992249, "rewards/accuracy_reward": 0.638573169708252, "rewards/format_reward": 0.96875, "step": 1739, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 389.78125, "epoch": 0.05291327089161903, "grad_norm": 1.1017322765261257, "kl": 0.037841796875, "learning_rate": 9.931076286672417e-07, "loss": 0.0015, "reward": 1.7297368049621582, "reward_std": 0.01664542779326439, "rewards/accuracy_reward": 0.5734866857528687, "rewards/format_reward": 1.0, "step": 1740, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 376.078125, "epoch": 0.052943680817418806, "grad_norm": 0.9491881887325085, "kl": 0.026611328125, "learning_rate": 9.930997224024888e-07, "loss": 0.0011, "reward": 1.8082174062728882, "reward_std": 0.011353096924722195, "rewards/accuracy_reward": 0.6613423228263855, "rewards/format_reward": 1.0, "step": 1741, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 358.078125, "epoch": 0.052974090743218585, "grad_norm": 0.8417268891664833, "kl": 0.036865234375, "learning_rate": 9.9309181163719e-07, "loss": 0.0015, "reward": 1.6787805557250977, "reward_std": 0.19126306474208832, "rewards/accuracy_reward": 0.5631555914878845, "rewards/format_reward": 1.0, "step": 1742, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 380.546875, "epoch": 0.053004500669018365, "grad_norm": 1.7648883421342274, "kl": 0.037841796875, "learning_rate": 9.93083896371417e-07, "loss": 0.0015, "reward": 1.7168731689453125, "reward_std": 0.104706771671772, "rewards/accuracy_reward": 0.576248049736023, "rewards/format_reward": 1.0, "step": 1743, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 393.578125, "epoch": 0.05303491059481815, "grad_norm": 0.9970850148866124, "kl": 0.0284423828125, "learning_rate": 9.93075976605242e-07, "loss": 0.0011, "reward": 1.9124999046325684, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1744, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 374.96875, "epoch": 0.05306532052061793, "grad_norm": 2.140672869454024, "kl": 0.033203125, "learning_rate": 9.93068052338738e-07, "loss": 0.0013, "reward": 1.739622712135315, "reward_std": 0.1683429777622223, "rewards/accuracy_reward": 0.5864977240562439, "rewards/format_reward": 1.0, "step": 1745, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 347.609375, "epoch": 0.05309573044641771, "grad_norm": 0.4239443399894489, "kl": 0.0361328125, "learning_rate": 9.930601235719765e-07, "loss": 0.0014, "reward": 1.65625, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1746, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 417.78125, "epoch": 0.053126140372217495, "grad_norm": 0.8991270107431673, "kl": 0.0264892578125, "learning_rate": 9.930521903050304e-07, "loss": 0.0011, "reward": 1.7762322425842285, "reward_std": 0.08783479034900665, "rewards/accuracy_reward": 0.6231071949005127, "rewards/format_reward": 1.0, "step": 1747, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 400.71875, "epoch": 0.053156550298017274, "grad_norm": 1.016355773539133, "kl": 0.03515625, "learning_rate": 9.930442525379719e-07, "loss": 0.0014, "reward": 1.7676626443862915, "reward_std": 0.17786872386932373, "rewards/accuracy_reward": 0.6426626443862915, "rewards/format_reward": 1.0, "step": 1748, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 365.765625, "epoch": 0.053186960223817054, "grad_norm": 0.5349088688643145, "kl": 0.03466796875, "learning_rate": 9.930363102708734e-07, "loss": 0.0014, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1749, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 354.484375, "epoch": 0.05321737014961683, "grad_norm": 1.1797717237448997, "kl": 0.0322265625, "learning_rate": 9.930283635038077e-07, "loss": 0.0013, "reward": 2.163480043411255, "reward_std": 0.04172266274690628, "rewards/accuracy_reward": 0.9759801030158997, "rewards/format_reward": 1.0, "step": 1750, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.171875, "epoch": 0.05324778007541662, "grad_norm": 0.7196814619998144, "kl": 0.032958984375, "learning_rate": 9.93020412236847e-07, "loss": 0.0013, "reward": 1.7657501697540283, "reward_std": 0.1526947170495987, "rewards/accuracy_reward": 0.6345000267028809, "rewards/format_reward": 1.0, "step": 1751, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 396.078125, "epoch": 0.0532781900012164, "grad_norm": 1.865299134301001, "kl": 0.03369140625, "learning_rate": 9.930124564700636e-07, "loss": 0.0014, "reward": 1.6824376583099365, "reward_std": 0.09923285245895386, "rewards/accuracy_reward": 0.5574377179145813, "rewards/format_reward": 1.0, "step": 1752, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 357.234375, "epoch": 0.05330859992701618, "grad_norm": 0.5525320267292863, "kl": 0.036865234375, "learning_rate": 9.930044962035309e-07, "loss": 0.0015, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1753, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 392.734375, "epoch": 0.053339009852815956, "grad_norm": 1.5410236252655487, "kl": 0.03955078125, "learning_rate": 9.92996531437321e-07, "loss": 0.0016, "reward": 1.6641496419906616, "reward_std": 0.03943108394742012, "rewards/accuracy_reward": 0.5047745704650879, "rewards/format_reward": 1.0, "step": 1754, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 348.640625, "epoch": 0.05336941977861574, "grad_norm": 0.8023339347746173, "kl": 0.0361328125, "learning_rate": 9.929885621715069e-07, "loss": 0.0015, "reward": 2.085416793823242, "reward_std": 0.10849304497241974, "rewards/accuracy_reward": 0.8947916626930237, "rewards/format_reward": 1.0, "step": 1755, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 384.78125, "epoch": 0.05339982970441552, "grad_norm": 2.8337067074041045, "kl": 0.038330078125, "learning_rate": 9.929805884061611e-07, "loss": 0.0015, "reward": 1.8345284461975098, "reward_std": 0.0697808712720871, "rewards/accuracy_reward": 0.675153374671936, "rewards/format_reward": 1.0, "step": 1756, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 362.265625, "epoch": 0.0534302396302153, "grad_norm": 0.8957468575906574, "kl": 0.036376953125, "learning_rate": 9.929726101413563e-07, "loss": 0.0015, "reward": 1.7416856288909912, "reward_std": 0.10728268325328827, "rewards/accuracy_reward": 0.6041855812072754, "rewards/format_reward": 1.0, "step": 1757, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 364.59375, "epoch": 0.05346064955601508, "grad_norm": 0.5772351467096913, "kl": 0.040283203125, "learning_rate": 9.929646273771655e-07, "loss": 0.0016, "reward": 2.1126341819763184, "reward_std": 0.0031856982968747616, "rewards/accuracy_reward": 0.9126341342926025, "rewards/format_reward": 1.0, "step": 1758, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.203125, "epoch": 0.053491059481814866, "grad_norm": 1.0483151816965295, "kl": 0.034423828125, "learning_rate": 9.929566401136616e-07, "loss": 0.0014, "reward": 1.7578452825546265, "reward_std": 0.10913911461830139, "rewards/accuracy_reward": 0.6109702587127686, "rewards/format_reward": 1.0, "step": 1759, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 365.8125, "epoch": 0.053521469407614645, "grad_norm": 1.0780376133259406, "kl": 0.04638671875, "learning_rate": 9.929486483509172e-07, "loss": 0.0019, "reward": 1.592857837677002, "reward_std": 0.12637543678283691, "rewards/accuracy_reward": 0.47098276019096375, "rewards/format_reward": 1.0, "step": 1760, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 363.703125, "epoch": 0.053551879333414425, "grad_norm": 1.1411611807689201, "kl": 0.03955078125, "learning_rate": 9.929406520890057e-07, "loss": 0.0016, "reward": 1.7194530963897705, "reward_std": 0.09455876052379608, "rewards/accuracy_reward": 0.5850780010223389, "rewards/format_reward": 1.0, "step": 1761, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 376.59375, "epoch": 0.05358228925921421, "grad_norm": 1.0990709893710588, "kl": 0.03857421875, "learning_rate": 9.929326513279998e-07, "loss": 0.0015, "reward": 1.7798888683319092, "reward_std": 0.030979227274656296, "rewards/accuracy_reward": 0.6392639875411987, "rewards/format_reward": 1.0, "step": 1762, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.375, "epoch": 0.05361269918501399, "grad_norm": 1.5619074600195828, "kl": 0.0458984375, "learning_rate": 9.929246460679726e-07, "loss": 0.0018, "reward": 1.8410710096359253, "reward_std": 0.13639596104621887, "rewards/accuracy_reward": 0.7004460096359253, "rewards/format_reward": 1.0, "step": 1763, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 369.890625, "epoch": 0.05364310911081377, "grad_norm": 1.267018239322046, "kl": 0.04931640625, "learning_rate": 9.92916636308997e-07, "loss": 0.002, "reward": 1.548771858215332, "reward_std": 0.10878238081932068, "rewards/accuracy_reward": 0.4550217390060425, "rewards/format_reward": 1.0, "step": 1764, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 357.5625, "epoch": 0.05367351903661355, "grad_norm": 1.10040002668136, "kl": 0.04052734375, "learning_rate": 9.929086220511464e-07, "loss": 0.0016, "reward": 1.6640499830245972, "reward_std": 0.04192690551280975, "rewards/accuracy_reward": 0.5140500068664551, "rewards/format_reward": 1.0, "step": 1765, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 393.9375, "epoch": 0.053703928962413335, "grad_norm": 1.258009812354391, "kl": 0.039306640625, "learning_rate": 9.929006032944937e-07, "loss": 0.0016, "reward": 1.750711441040039, "reward_std": 0.19465500116348267, "rewards/accuracy_reward": 0.6069614291191101, "rewards/format_reward": 1.0, "step": 1766, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 358.140625, "epoch": 0.053734338888213114, "grad_norm": 0.979811231693195, "kl": 0.032958984375, "learning_rate": 9.928925800391122e-07, "loss": 0.0013, "reward": 1.9893229007720947, "reward_std": 0.1280553936958313, "rewards/accuracy_reward": 0.8268229365348816, "rewards/format_reward": 1.0, "step": 1767, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 368.734375, "epoch": 0.05376474881401289, "grad_norm": 1.245326969352413, "kl": 0.03857421875, "learning_rate": 9.92884552285075e-07, "loss": 0.0015, "reward": 1.9602339267730713, "reward_std": 0.020391281694173813, "rewards/accuracy_reward": 0.7977339029312134, "rewards/format_reward": 1.0, "step": 1768, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 355.5, "epoch": 0.05379515873981267, "grad_norm": 1.4377474901201928, "kl": 0.046875, "learning_rate": 9.928765200324556e-07, "loss": 0.0019, "reward": 1.336641550064087, "reward_std": 0.12639351189136505, "rewards/accuracy_reward": 0.2585165798664093, "rewards/format_reward": 1.0, "step": 1769, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 379.125, "epoch": 0.05382556866561246, "grad_norm": 1.131896108581665, "kl": 0.041259765625, "learning_rate": 9.928684832813271e-07, "loss": 0.0016, "reward": 1.3194055557250977, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.2662805914878845, "rewards/format_reward": 1.0, "step": 1770, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 393.765625, "epoch": 0.05385597859141224, "grad_norm": 1.3392822171928767, "kl": 0.037841796875, "learning_rate": 9.92860442031763e-07, "loss": 0.0015, "reward": 1.6653803586959839, "reward_std": 0.09766694158315659, "rewards/accuracy_reward": 0.5153802633285522, "rewards/format_reward": 1.0, "step": 1771, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.703125, "epoch": 0.05388638851721202, "grad_norm": 0.9429016451987904, "kl": 0.0361328125, "learning_rate": 9.928523962838367e-07, "loss": 0.0014, "reward": 1.9241161346435547, "reward_std": 0.0734207034111023, "rewards/accuracy_reward": 0.7584910988807678, "rewards/format_reward": 1.0, "step": 1772, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 377.84375, "epoch": 0.053916798443011796, "grad_norm": 0.6449840137534874, "kl": 0.043701171875, "learning_rate": 9.928443460376214e-07, "loss": 0.0018, "reward": 1.4719064235687256, "reward_std": 0.02093832567334175, "rewards/accuracy_reward": 0.39378130435943604, "rewards/format_reward": 1.0, "step": 1773, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 359.5, "epoch": 0.05394720836881158, "grad_norm": 0.8619882894784859, "kl": 0.03515625, "learning_rate": 9.928362912931908e-07, "loss": 0.0014, "reward": 1.666562557220459, "reward_std": 0.1892632246017456, "rewards/accuracy_reward": 0.5478125214576721, "rewards/format_reward": 1.0, "step": 1774, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 363.609375, "epoch": 0.05397761829461136, "grad_norm": 0.7894507492899807, "kl": 0.032470703125, "learning_rate": 9.928282320506184e-07, "loss": 0.0013, "reward": 1.5625, "reward_std": 0.20830951631069183, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 1775, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 360.0, "epoch": 0.05400802822041114, "grad_norm": 0.6439435545284175, "kl": 0.04638671875, "learning_rate": 9.928201683099776e-07, "loss": 0.0019, "reward": 2.0901756286621094, "reward_std": 0.08176927268505096, "rewards/accuracy_reward": 0.8995506763458252, "rewards/format_reward": 1.0, "step": 1776, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 375.0625, "epoch": 0.054038438146210926, "grad_norm": 1.0162005352654053, "kl": 0.038330078125, "learning_rate": 9.92812100071342e-07, "loss": 0.0015, "reward": 1.9773123264312744, "reward_std": 0.1322588324546814, "rewards/accuracy_reward": 0.8085622191429138, "rewards/format_reward": 1.0, "step": 1777, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 378.78125, "epoch": 0.054068848072010706, "grad_norm": 1.018356078030503, "kl": 0.03369140625, "learning_rate": 9.928040273347857e-07, "loss": 0.0013, "reward": 2.0540552139282227, "reward_std": 0.07261952757835388, "rewards/accuracy_reward": 0.8603051900863647, "rewards/format_reward": 1.0, "step": 1778, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.765625, "epoch": 0.054099257997810485, "grad_norm": 10.306130800978577, "kl": 0.0498046875, "learning_rate": 9.927959501003818e-07, "loss": 0.002, "reward": 1.8654594421386719, "reward_std": 0.024972496554255486, "rewards/accuracy_reward": 0.6967092752456665, "rewards/format_reward": 1.0, "step": 1779, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 370.5, "epoch": 0.054129667923610264, "grad_norm": 0.5380322217986514, "kl": 0.033447265625, "learning_rate": 9.927878683682042e-07, "loss": 0.0013, "reward": 1.8211686611175537, "reward_std": 0.014143557287752628, "rewards/accuracy_reward": 0.6742936372756958, "rewards/format_reward": 1.0, "step": 1780, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 415.828125, "epoch": 0.05416007784941005, "grad_norm": 0.8417510469660456, "kl": 0.03271484375, "learning_rate": 9.927797821383268e-07, "loss": 0.0013, "reward": 1.7898240089416504, "reward_std": 0.1088612824678421, "rewards/accuracy_reward": 0.6585740447044373, "rewards/format_reward": 1.0, "step": 1781, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 369.21875, "epoch": 0.05419048777520983, "grad_norm": 2.5172960805917612, "kl": 0.041015625, "learning_rate": 9.927716914108234e-07, "loss": 0.0016, "reward": 1.954413652420044, "reward_std": 0.17351588606834412, "rewards/accuracy_reward": 0.782538652420044, "rewards/format_reward": 1.0, "step": 1782, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 343.0625, "epoch": 0.05422089770100961, "grad_norm": 1.786912565328946, "kl": 0.04345703125, "learning_rate": 9.927635961857676e-07, "loss": 0.0017, "reward": 1.84542977809906, "reward_std": 0.13848485052585602, "rewards/accuracy_reward": 0.7110546827316284, "rewards/format_reward": 0.984375, "step": 1783, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.34375, "epoch": 0.05425130762680939, "grad_norm": 1.0705404911157932, "kl": 0.031982421875, "learning_rate": 9.927554964632336e-07, "loss": 0.0013, "reward": 1.9792534112930298, "reward_std": 0.14146408438682556, "rewards/accuracy_reward": 0.8042534589767456, "rewards/format_reward": 1.0, "step": 1784, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 375.1875, "epoch": 0.054281717552609174, "grad_norm": 0.8772968620722489, "kl": 0.035400390625, "learning_rate": 9.92747392243295e-07, "loss": 0.0014, "reward": 1.7880208492279053, "reward_std": 0.15328487753868103, "rewards/accuracy_reward": 0.6661458015441895, "rewards/format_reward": 1.0, "step": 1785, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 360.0, "epoch": 0.05431212747840895, "grad_norm": 0.26052674188374403, "kl": 0.032958984375, "learning_rate": 9.92739283526026e-07, "loss": 0.0013, "reward": 1.9937500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1786, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 379.234375, "epoch": 0.05434253740420873, "grad_norm": 2.4959708676620846, "kl": 0.04345703125, "learning_rate": 9.927311703115006e-07, "loss": 0.0017, "reward": 1.6713738441467285, "reward_std": 0.024273546412587166, "rewards/accuracy_reward": 0.5276238918304443, "rewards/format_reward": 1.0, "step": 1787, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 381.65625, "epoch": 0.05437294733000851, "grad_norm": 1.3009136153894694, "kl": 0.03369140625, "learning_rate": 9.927230525997927e-07, "loss": 0.0013, "reward": 1.5975360870361328, "reward_std": 0.07986532151699066, "rewards/accuracy_reward": 0.5037860870361328, "rewards/format_reward": 1.0, "step": 1788, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 375.25, "epoch": 0.0544033572558083, "grad_norm": 0.9331215541727185, "kl": 0.04052734375, "learning_rate": 9.927149303909766e-07, "loss": 0.0016, "reward": 1.784458041191101, "reward_std": 0.1636240929365158, "rewards/accuracy_reward": 0.650083065032959, "rewards/format_reward": 1.0, "step": 1789, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 359.828125, "epoch": 0.05443376718160808, "grad_norm": 1.3705318478471595, "kl": 0.039306640625, "learning_rate": 9.927068036851262e-07, "loss": 0.0016, "reward": 1.7698920965194702, "reward_std": 0.20690850913524628, "rewards/accuracy_reward": 0.6292670369148254, "rewards/format_reward": 1.0, "step": 1790, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 376.1875, "epoch": 0.054464177107407856, "grad_norm": 1.4125366598956606, "kl": 0.041259765625, "learning_rate": 9.92698672482316e-07, "loss": 0.0017, "reward": 1.678125023841858, "reward_std": 0.24642017483711243, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1791, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.390625, "epoch": 0.05449458703320764, "grad_norm": 2.480509526962815, "kl": 0.03955078125, "learning_rate": 9.9269053678262e-07, "loss": 0.0016, "reward": 1.7759034633636475, "reward_std": 0.08361143618822098, "rewards/accuracy_reward": 0.6352784037590027, "rewards/format_reward": 1.0, "step": 1792, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 356.890625, "epoch": 0.05452499695900742, "grad_norm": 1.5671263197704786, "kl": 0.049560546875, "learning_rate": 9.926823965861122e-07, "loss": 0.002, "reward": 1.751999020576477, "reward_std": 0.17759668827056885, "rewards/accuracy_reward": 0.5894989371299744, "rewards/format_reward": 1.0, "step": 1793, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 357.828125, "epoch": 0.0545554068848072, "grad_norm": 1.5383738378679102, "kl": 0.054931640625, "learning_rate": 9.926742518928673e-07, "loss": 0.0022, "reward": 1.701249599456787, "reward_std": 0.14524880051612854, "rewards/accuracy_reward": 0.5699995756149292, "rewards/format_reward": 1.0, "step": 1794, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 356.953125, "epoch": 0.05458581681060698, "grad_norm": 2.012750203932684, "kl": 0.046142578125, "learning_rate": 9.926661027029595e-07, "loss": 0.0018, "reward": 1.6020137071609497, "reward_std": 0.18549156188964844, "rewards/accuracy_reward": 0.4957636594772339, "rewards/format_reward": 1.0, "step": 1795, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 364.015625, "epoch": 0.054616226736406766, "grad_norm": 1.3396600748411733, "kl": 0.0380859375, "learning_rate": 9.92657949016463e-07, "loss": 0.0015, "reward": 1.900700330734253, "reward_std": 0.04315320774912834, "rewards/accuracy_reward": 0.7319502234458923, "rewards/format_reward": 1.0, "step": 1796, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 358.984375, "epoch": 0.054646636662206545, "grad_norm": 0.8127969025071565, "kl": 0.04296875, "learning_rate": 9.926497908334525e-07, "loss": 0.0017, "reward": 2.0303053855895996, "reward_std": 0.006462138146162033, "rewards/accuracy_reward": 0.8303053379058838, "rewards/format_reward": 1.0, "step": 1797, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 341.328125, "epoch": 0.054677046588006324, "grad_norm": 0.9717605064989127, "kl": 0.0439453125, "learning_rate": 9.926416281540025e-07, "loss": 0.0018, "reward": 2.0343751907348633, "reward_std": 0.030616411939263344, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1798, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 352.578125, "epoch": 0.0547074565138061, "grad_norm": 0.8735632889812265, "kl": 0.039794921875, "learning_rate": 9.926334609781872e-07, "loss": 0.0016, "reward": 1.994710922241211, "reward_std": 0.020471710711717606, "rewards/accuracy_reward": 0.8197108507156372, "rewards/format_reward": 1.0, "step": 1799, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 374.953125, "epoch": 0.05473786643960589, "grad_norm": 0.7983037343834086, "kl": 0.03662109375, "learning_rate": 9.926252893060813e-07, "loss": 0.0015, "reward": 1.7769169807434082, "reward_std": 0.1593465358018875, "rewards/accuracy_reward": 0.6394169330596924, "rewards/format_reward": 1.0, "step": 1800, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 355.328125, "epoch": 0.05476827636540567, "grad_norm": 1.5435004475522043, "kl": 0.042236328125, "learning_rate": 9.926171131377595e-07, "loss": 0.0017, "reward": 1.6208317279815674, "reward_std": 0.07936879992485046, "rewards/accuracy_reward": 0.47395679354667664, "rewards/format_reward": 1.0, "step": 1801, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 361.15625, "epoch": 0.05479868629120545, "grad_norm": 1.048033469920985, "kl": 0.0380859375, "learning_rate": 9.926089324732962e-07, "loss": 0.0015, "reward": 1.7613670825958252, "reward_std": 0.10836216807365417, "rewards/accuracy_reward": 0.6238670349121094, "rewards/format_reward": 1.0, "step": 1802, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 353.015625, "epoch": 0.05482909621700523, "grad_norm": 1.3398989619898078, "kl": 0.0380859375, "learning_rate": 9.926007473127662e-07, "loss": 0.0015, "reward": 1.7222988605499268, "reward_std": 0.22579775750637054, "rewards/accuracy_reward": 0.6035487055778503, "rewards/format_reward": 1.0, "step": 1803, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.296875, "epoch": 0.05485950614280501, "grad_norm": 1.456614292818029, "kl": 0.04443359375, "learning_rate": 9.92592557656244e-07, "loss": 0.0018, "reward": 1.8075824975967407, "reward_std": 0.12256352603435516, "rewards/accuracy_reward": 0.6419574618339539, "rewards/format_reward": 1.0, "step": 1804, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.8125, "epoch": 0.05488991606860479, "grad_norm": 1.2932988406157064, "kl": 0.032470703125, "learning_rate": 9.925843635038048e-07, "loss": 0.0013, "reward": 1.895079255104065, "reward_std": 0.17520296573638916, "rewards/accuracy_reward": 0.7263292670249939, "rewards/format_reward": 1.0, "step": 1805, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.140625, "epoch": 0.05492032599440457, "grad_norm": 1.4258968783611523, "kl": 0.041748046875, "learning_rate": 9.92576164855523e-07, "loss": 0.0017, "reward": 1.9859092235565186, "reward_std": 0.14932841062545776, "rewards/accuracy_reward": 0.8015341758728027, "rewards/format_reward": 1.0, "step": 1806, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 352.53125, "epoch": 0.05495073592020436, "grad_norm": 1.1249191140109915, "kl": 0.041015625, "learning_rate": 9.925679617114735e-07, "loss": 0.0016, "reward": 1.961499810218811, "reward_std": 0.1550062596797943, "rewards/accuracy_reward": 0.8021246790885925, "rewards/format_reward": 1.0, "step": 1807, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 348.25, "epoch": 0.05498114584600414, "grad_norm": 1.640943635415484, "kl": 0.040771484375, "learning_rate": 9.925597540717311e-07, "loss": 0.0016, "reward": 2.075655698776245, "reward_std": 0.13321137428283691, "rewards/accuracy_reward": 0.8912806510925293, "rewards/format_reward": 1.0, "step": 1808, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 390.140625, "epoch": 0.055011555771803916, "grad_norm": 0.8769442291243054, "kl": 0.041259765625, "learning_rate": 9.925515419363711e-07, "loss": 0.0016, "reward": 1.645674705505371, "reward_std": 0.016574416309595108, "rewards/accuracy_reward": 0.4956746995449066, "rewards/format_reward": 1.0, "step": 1809, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 378.21875, "epoch": 0.055041965697603695, "grad_norm": 0.7106836727263328, "kl": 0.03662109375, "learning_rate": 9.92543325305468e-07, "loss": 0.0015, "reward": 1.5473958253860474, "reward_std": 0.09293131530284882, "rewards/accuracy_reward": 0.4598958194255829, "rewards/format_reward": 1.0, "step": 1810, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 382.25, "epoch": 0.05507237562340348, "grad_norm": 0.54736202046218, "kl": 0.031982421875, "learning_rate": 9.92535104179097e-07, "loss": 0.0013, "reward": 1.9531251192092896, "reward_std": 0.17300525307655334, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 1811, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 350.6875, "epoch": 0.05510278554920326, "grad_norm": 0.977392856100019, "kl": 0.03662109375, "learning_rate": 9.925268785573332e-07, "loss": 0.0015, "reward": 1.919791579246521, "reward_std": 0.16896700859069824, "rewards/accuracy_reward": 0.7604167461395264, "rewards/format_reward": 1.0, "step": 1812, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 365.46875, "epoch": 0.05513319547500304, "grad_norm": 0.8476268963499787, "kl": 0.0400390625, "learning_rate": 9.925186484402512e-07, "loss": 0.0016, "reward": 1.9810937643051147, "reward_std": 0.02394336462020874, "rewards/accuracy_reward": 0.8092187643051147, "rewards/format_reward": 1.0, "step": 1813, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 369.5625, "epoch": 0.05516360540080282, "grad_norm": 0.3222451085297914, "kl": 0.03515625, "learning_rate": 9.925104138279268e-07, "loss": 0.0014, "reward": 1.8624999523162842, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1814, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 366.484375, "epoch": 0.055194015326602605, "grad_norm": 1.4283901752006507, "kl": 0.043701171875, "learning_rate": 9.92502174720435e-07, "loss": 0.0018, "reward": 1.6837177276611328, "reward_std": 0.05716827139258385, "rewards/accuracy_reward": 0.5587177872657776, "rewards/format_reward": 1.0, "step": 1815, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 353.109375, "epoch": 0.055224425252402384, "grad_norm": 1.007097999743465, "kl": 0.043212890625, "learning_rate": 9.924939311178506e-07, "loss": 0.0017, "reward": 1.838458776473999, "reward_std": 0.04184095561504364, "rewards/accuracy_reward": 0.6759586930274963, "rewards/format_reward": 1.0, "step": 1816, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 397.796875, "epoch": 0.055254835178202164, "grad_norm": 2.4294998715474314, "kl": 0.0294189453125, "learning_rate": 9.92485683020249e-07, "loss": 0.0012, "reward": 1.9912397861480713, "reward_std": 0.10232331603765488, "rewards/accuracy_reward": 0.8037397861480713, "rewards/format_reward": 1.0, "step": 1817, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.046875, "epoch": 0.05528524510400195, "grad_norm": 1.6600904990874867, "kl": 0.033203125, "learning_rate": 9.92477430427706e-07, "loss": 0.0013, "reward": 1.7595914602279663, "reward_std": 0.052386827766895294, "rewards/accuracy_reward": 0.6095914244651794, "rewards/format_reward": 1.0, "step": 1818, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 377.296875, "epoch": 0.05531565502980173, "grad_norm": 1.6004028763061766, "kl": 0.041748046875, "learning_rate": 9.92469173340296e-07, "loss": 0.0017, "reward": 1.6154446601867676, "reward_std": 0.08654943853616714, "rewards/accuracy_reward": 0.5279447436332703, "rewards/format_reward": 1.0, "step": 1819, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 356.75, "epoch": 0.05534606495560151, "grad_norm": 1.377380782239242, "kl": 0.04541015625, "learning_rate": 9.924609117580951e-07, "loss": 0.0018, "reward": 1.8948674201965332, "reward_std": 0.07723085582256317, "rewards/accuracy_reward": 0.7261173725128174, "rewards/format_reward": 1.0, "step": 1820, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.0625, "epoch": 0.05537647488140129, "grad_norm": 0.8839340175894997, "kl": 0.041259765625, "learning_rate": 9.924526456811785e-07, "loss": 0.0017, "reward": 2.0152053833007812, "reward_std": 0.12524709105491638, "rewards/accuracy_reward": 0.8308305144309998, "rewards/format_reward": 1.0, "step": 1821, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 347.734375, "epoch": 0.05540688480720107, "grad_norm": 1.8549139139246353, "kl": 0.038818359375, "learning_rate": 9.924443751096215e-07, "loss": 0.0015, "reward": 1.9861880540847778, "reward_std": 0.11136253923177719, "rewards/accuracy_reward": 0.8174380660057068, "rewards/format_reward": 1.0, "step": 1822, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 354.71875, "epoch": 0.05543729473300085, "grad_norm": 2.3093435679670873, "kl": 0.036865234375, "learning_rate": 9.924361000434997e-07, "loss": 0.0015, "reward": 2.072706699371338, "reward_std": 0.08866246789693832, "rewards/accuracy_reward": 0.8883317708969116, "rewards/format_reward": 1.0, "step": 1823, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 373.4375, "epoch": 0.05546770465880063, "grad_norm": 1.2994203906027615, "kl": 0.03662109375, "learning_rate": 9.924278204828885e-07, "loss": 0.0015, "reward": 1.8100841045379639, "reward_std": 0.11531638354063034, "rewards/accuracy_reward": 0.6600841283798218, "rewards/format_reward": 1.0, "step": 1824, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 379.328125, "epoch": 0.05549811458460041, "grad_norm": 2.927134591359527, "kl": 0.0439453125, "learning_rate": 9.92419536427864e-07, "loss": 0.0018, "reward": 1.6755608320236206, "reward_std": 0.04908619821071625, "rewards/accuracy_reward": 0.5255608558654785, "rewards/format_reward": 1.0, "step": 1825, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.40625, "epoch": 0.0555285245104002, "grad_norm": 0.8268663404142803, "kl": 0.033203125, "learning_rate": 9.924112478785011e-07, "loss": 0.0013, "reward": 1.9913171529769897, "reward_std": 0.01231265440583229, "rewards/accuracy_reward": 0.8100671172142029, "rewards/format_reward": 1.0, "step": 1826, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 419.484375, "epoch": 0.055558934436199976, "grad_norm": 1.9931406517003176, "kl": 0.04345703125, "learning_rate": 9.924029548348758e-07, "loss": 0.0017, "reward": 1.6861860752105713, "reward_std": 0.06438237428665161, "rewards/accuracy_reward": 0.5236860513687134, "rewards/format_reward": 1.0, "step": 1827, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 348.390625, "epoch": 0.055589344361999755, "grad_norm": 1.230808951654158, "kl": 0.035888671875, "learning_rate": 9.923946572970636e-07, "loss": 0.0014, "reward": 2.0843749046325684, "reward_std": 0.19508321583271027, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 1828, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 346.609375, "epoch": 0.055619754287799535, "grad_norm": 1.0487314531643683, "kl": 0.0380859375, "learning_rate": 9.923863552651405e-07, "loss": 0.0015, "reward": 1.7979985475540161, "reward_std": 0.06891971826553345, "rewards/accuracy_reward": 0.6636235117912292, "rewards/format_reward": 1.0, "step": 1829, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 355.984375, "epoch": 0.05565016421359932, "grad_norm": 1.1607839160185394, "kl": 0.03857421875, "learning_rate": 9.923780487391822e-07, "loss": 0.0015, "reward": 1.7312500476837158, "reward_std": 0.30233362317085266, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1830, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 353.828125, "epoch": 0.0556805741393991, "grad_norm": 1.0089524585810061, "kl": 0.0478515625, "learning_rate": 9.923697377192644e-07, "loss": 0.0019, "reward": 2.0357604026794434, "reward_std": 0.09623770415782928, "rewards/accuracy_reward": 0.8513851165771484, "rewards/format_reward": 1.0, "step": 1831, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 343.390625, "epoch": 0.05571098406519888, "grad_norm": 0.9049065601240881, "kl": 0.033203125, "learning_rate": 9.92361422205463e-07, "loss": 0.0013, "reward": 1.953125, "reward_std": 0.08647121489048004, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1832, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 342.703125, "epoch": 0.055741393990998665, "grad_norm": 0.5486600557676367, "kl": 0.054443359375, "learning_rate": 9.923531021978537e-07, "loss": 0.0022, "reward": 2.044959545135498, "reward_std": 0.0018665710231289268, "rewards/accuracy_reward": 0.8699596524238586, "rewards/format_reward": 1.0, "step": 1833, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 353.171875, "epoch": 0.055771803916798444, "grad_norm": 1.2334516884932032, "kl": 0.03955078125, "learning_rate": 9.92344777696513e-07, "loss": 0.0016, "reward": 1.823790192604065, "reward_std": 0.029252495616674423, "rewards/accuracy_reward": 0.6706651449203491, "rewards/format_reward": 1.0, "step": 1834, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 347.0, "epoch": 0.055802213842598224, "grad_norm": 0.48596692961575844, "kl": 0.03466796875, "learning_rate": 9.923364487015162e-07, "loss": 0.0014, "reward": 1.9562500715255737, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1835, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 354.8125, "epoch": 0.055832623768398, "grad_norm": 0.7778282609104304, "kl": 0.037841796875, "learning_rate": 9.923281152129399e-07, "loss": 0.0015, "reward": 2.06136155128479, "reward_std": 0.028856799006462097, "rewards/accuracy_reward": 0.8644865155220032, "rewards/format_reward": 1.0, "step": 1836, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 359.828125, "epoch": 0.05586303369419779, "grad_norm": 1.04456295719975, "kl": 0.041259765625, "learning_rate": 9.923197772308597e-07, "loss": 0.0017, "reward": 1.8501803874969482, "reward_std": 0.1517704427242279, "rewards/accuracy_reward": 0.7095552682876587, "rewards/format_reward": 1.0, "step": 1837, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 410.828125, "epoch": 0.05589344361999757, "grad_norm": 3.336341985898304, "kl": 0.03173828125, "learning_rate": 9.92311434755352e-07, "loss": 0.0013, "reward": 1.5871260166168213, "reward_std": 0.08487197756767273, "rewards/accuracy_reward": 0.46525105834007263, "rewards/format_reward": 1.0, "step": 1838, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 363.28125, "epoch": 0.05592385354579735, "grad_norm": 1.4883900247881565, "kl": 0.04638671875, "learning_rate": 9.923030877864928e-07, "loss": 0.0019, "reward": 1.895857334136963, "reward_std": 0.06589207798242569, "rewards/accuracy_reward": 0.7333572506904602, "rewards/format_reward": 1.0, "step": 1839, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 412.71875, "epoch": 0.05595426347159713, "grad_norm": 3.0778497463352097, "kl": 0.031982421875, "learning_rate": 9.922947363243585e-07, "loss": 0.0013, "reward": 1.6286427974700928, "reward_std": 0.11927562206983566, "rewards/accuracy_reward": 0.4755178689956665, "rewards/format_reward": 1.0, "step": 1840, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 407.546875, "epoch": 0.05598467339739691, "grad_norm": 0.9575660204521547, "kl": 0.0289306640625, "learning_rate": 9.922863803690249e-07, "loss": 0.0012, "reward": 1.580139398574829, "reward_std": 0.28412267565727234, "rewards/accuracy_reward": 0.5051394104957581, "rewards/format_reward": 0.953125, "step": 1841, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 354.9375, "epoch": 0.05601508332319669, "grad_norm": 1.0755044015423803, "kl": 0.0322265625, "learning_rate": 9.922780199205688e-07, "loss": 0.0013, "reward": 1.7700700759887695, "reward_std": 0.2082536518573761, "rewards/accuracy_reward": 0.6325700283050537, "rewards/format_reward": 1.0, "step": 1842, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 349.125, "epoch": 0.05604549324899647, "grad_norm": 1.8343532000513065, "kl": 0.0380859375, "learning_rate": 9.92269654979066e-07, "loss": 0.0015, "reward": 2.0218751430511475, "reward_std": 0.21231761574745178, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1843, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 349.515625, "epoch": 0.05607590317479625, "grad_norm": 1.2330015511660366, "kl": 0.03515625, "learning_rate": 9.922612855445931e-07, "loss": 0.0014, "reward": 1.953125, "reward_std": 0.2253442406654358, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1844, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 358.171875, "epoch": 0.056106313100596036, "grad_norm": 1.5973750873190569, "kl": 0.039794921875, "learning_rate": 9.922529116172265e-07, "loss": 0.0016, "reward": 1.7458016872406006, "reward_std": 0.22305285930633545, "rewards/accuracy_reward": 0.6083016395568848, "rewards/format_reward": 1.0, "step": 1845, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 373.3125, "epoch": 0.056136723026395816, "grad_norm": 2.6075486990186185, "kl": 0.039794921875, "learning_rate": 9.922445331970426e-07, "loss": 0.0016, "reward": 1.8482511043548584, "reward_std": 0.11192609369754791, "rewards/accuracy_reward": 0.6763760447502136, "rewards/format_reward": 1.0, "step": 1846, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 366.078125, "epoch": 0.056167132952195595, "grad_norm": 0.9001168650455822, "kl": 0.043701171875, "learning_rate": 9.922361502841178e-07, "loss": 0.0018, "reward": 1.7739583253860474, "reward_std": 0.22493413090705872, "rewards/accuracy_reward": 0.6458333134651184, "rewards/format_reward": 1.0, "step": 1847, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 372.953125, "epoch": 0.05619754287799538, "grad_norm": 1.0874942647285268, "kl": 0.0380859375, "learning_rate": 9.922277628785287e-07, "loss": 0.0015, "reward": 1.8757933378219604, "reward_std": 0.17411655187606812, "rewards/accuracy_reward": 0.7195432782173157, "rewards/format_reward": 1.0, "step": 1848, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 359.15625, "epoch": 0.05622795280379516, "grad_norm": 1.1551137955046786, "kl": 0.03466796875, "learning_rate": 9.922193709803519e-07, "loss": 0.0014, "reward": 1.875568151473999, "reward_std": 0.10124483704566956, "rewards/accuracy_reward": 0.7443181872367859, "rewards/format_reward": 0.984375, "step": 1849, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 357.1875, "epoch": 0.05625836272959494, "grad_norm": 0.6220895176656418, "kl": 0.042236328125, "learning_rate": 9.922109745896637e-07, "loss": 0.0017, "reward": 1.5498721599578857, "reward_std": 0.048731740564107895, "rewards/accuracy_reward": 0.4498721957206726, "rewards/format_reward": 1.0, "step": 1850, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 343.671875, "epoch": 0.05628877265539472, "grad_norm": 25.13358638185564, "kl": 0.040771484375, "learning_rate": 9.922025737065412e-07, "loss": 0.0016, "reward": 2.0662150382995605, "reward_std": 0.15824121236801147, "rewards/accuracy_reward": 0.8880901336669922, "rewards/format_reward": 1.0, "step": 1851, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 379.984375, "epoch": 0.056319182581194505, "grad_norm": 0.9934314366436088, "kl": 0.03662109375, "learning_rate": 9.921941683310605e-07, "loss": 0.0015, "reward": 1.7997488975524902, "reward_std": 0.18747317790985107, "rewards/accuracy_reward": 0.6778739094734192, "rewards/format_reward": 0.984375, "step": 1852, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 362.765625, "epoch": 0.056349592506994284, "grad_norm": 1.5625271495605662, "kl": 0.05712890625, "learning_rate": 9.92185758463299e-07, "loss": 0.0023, "reward": 1.7489309310913086, "reward_std": 0.23080140352249146, "rewards/accuracy_reward": 0.6020558476448059, "rewards/format_reward": 1.0, "step": 1853, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 353.203125, "epoch": 0.05638000243279406, "grad_norm": 0.8748453337552218, "kl": 0.040283203125, "learning_rate": 9.92177344103333e-07, "loss": 0.0016, "reward": 2.0786030292510986, "reward_std": 0.08555250614881516, "rewards/accuracy_reward": 0.8911029100418091, "rewards/format_reward": 1.0, "step": 1854, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 365.6875, "epoch": 0.05641041235859384, "grad_norm": 0.4326961918771979, "kl": 0.04248046875, "learning_rate": 9.921689252512393e-07, "loss": 0.0017, "reward": 1.8231035470962524, "reward_std": 0.009484430775046349, "rewards/accuracy_reward": 0.6762285828590393, "rewards/format_reward": 1.0, "step": 1855, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 380.6875, "epoch": 0.05644082228439363, "grad_norm": 1.8029994511917542, "kl": 0.0400390625, "learning_rate": 9.92160501907095e-07, "loss": 0.0016, "reward": 1.9482872486114502, "reward_std": 0.05767810717225075, "rewards/accuracy_reward": 0.7701621651649475, "rewards/format_reward": 1.0, "step": 1856, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 384.25, "epoch": 0.05647123221019341, "grad_norm": 1.2766406489627038, "kl": 0.04541015625, "learning_rate": 9.921520740709765e-07, "loss": 0.0018, "reward": 1.8384604454040527, "reward_std": 0.07768689841032028, "rewards/accuracy_reward": 0.6853355169296265, "rewards/format_reward": 1.0, "step": 1857, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 363.953125, "epoch": 0.05650164213599319, "grad_norm": 1.5418241194115512, "kl": 0.04052734375, "learning_rate": 9.921436417429612e-07, "loss": 0.0016, "reward": 1.9332107305526733, "reward_std": 0.06883864849805832, "rewards/accuracy_reward": 0.7769607305526733, "rewards/format_reward": 1.0, "step": 1858, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 363.796875, "epoch": 0.056532052061792966, "grad_norm": 1.2806297958598352, "kl": 0.041259765625, "learning_rate": 9.92135204923126e-07, "loss": 0.0017, "reward": 1.867650032043457, "reward_std": 0.17836429178714752, "rewards/accuracy_reward": 0.7051499485969543, "rewards/format_reward": 1.0, "step": 1859, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 344.65625, "epoch": 0.05656246198759275, "grad_norm": 1.4074167664440507, "kl": 0.039794921875, "learning_rate": 9.921267636115478e-07, "loss": 0.0016, "reward": 1.9787335395812988, "reward_std": 0.20133064687252045, "rewards/accuracy_reward": 0.8006083965301514, "rewards/format_reward": 1.0, "step": 1860, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 370.765625, "epoch": 0.05659287191339253, "grad_norm": 1.3488314866156224, "kl": 0.048095703125, "learning_rate": 9.921183178083035e-07, "loss": 0.0019, "reward": 1.932004451751709, "reward_std": 0.09696999192237854, "rewards/accuracy_reward": 0.7413793802261353, "rewards/format_reward": 1.0, "step": 1861, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.203125, "epoch": 0.05662328183919231, "grad_norm": 0.9534472962813201, "kl": 0.04443359375, "learning_rate": 9.921098675134704e-07, "loss": 0.0018, "reward": 2.071725368499756, "reward_std": 0.037764161825180054, "rewards/accuracy_reward": 0.877975344657898, "rewards/format_reward": 1.0, "step": 1862, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 347.03125, "epoch": 0.056653691764992097, "grad_norm": 0.4915780859728136, "kl": 0.03515625, "learning_rate": 9.921014127271257e-07, "loss": 0.0014, "reward": 1.7437500953674316, "reward_std": 0.011572758667171001, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1863, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 336.765625, "epoch": 0.056684101690791876, "grad_norm": 0.9366182842437619, "kl": 0.052490234375, "learning_rate": 9.920929534493463e-07, "loss": 0.0021, "reward": 1.96875, "reward_std": 0.0978560596704483, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1864, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 341.40625, "epoch": 0.056714511616591655, "grad_norm": 1.1694495623434176, "kl": 0.044189453125, "learning_rate": 9.920844896802093e-07, "loss": 0.0018, "reward": 2.043750047683716, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1865, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 364.09375, "epoch": 0.056744921542391434, "grad_norm": 1.11981809197019, "kl": 0.04150390625, "learning_rate": 9.920760214197924e-07, "loss": 0.0017, "reward": 1.7781250476837158, "reward_std": 0.1972883939743042, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 1866, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 377.28125, "epoch": 0.05677533146819122, "grad_norm": 0.8966699107727735, "kl": 0.0478515625, "learning_rate": 9.920675486681728e-07, "loss": 0.0019, "reward": 1.7971857786178589, "reward_std": 0.017440611496567726, "rewards/accuracy_reward": 0.6253107786178589, "rewards/format_reward": 1.0, "step": 1867, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 341.78125, "epoch": 0.056805741393991, "grad_norm": 1.70372319647996, "kl": 0.034912109375, "learning_rate": 9.920590714254275e-07, "loss": 0.0014, "reward": 2.125, "reward_std": 0.17550252377986908, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 1868, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 353.296875, "epoch": 0.05683615131979078, "grad_norm": 1.0618694031823293, "kl": 0.04052734375, "learning_rate": 9.920505896916342e-07, "loss": 0.0016, "reward": 1.8476537466049194, "reward_std": 0.14259874820709229, "rewards/accuracy_reward": 0.7007787227630615, "rewards/format_reward": 1.0, "step": 1869, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 351.078125, "epoch": 0.05686656124559056, "grad_norm": 1.12369152246337, "kl": 0.033935546875, "learning_rate": 9.920421034668703e-07, "loss": 0.0014, "reward": 1.9709336757659912, "reward_std": 0.17443126440048218, "rewards/accuracy_reward": 0.7959335446357727, "rewards/format_reward": 1.0, "step": 1870, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 361.09375, "epoch": 0.056896971171390344, "grad_norm": 0.6805041296573715, "kl": 0.035888671875, "learning_rate": 9.92033612751213e-07, "loss": 0.0014, "reward": 1.6889848709106445, "reward_std": 0.04650196060538292, "rewards/accuracy_reward": 0.5639848113059998, "rewards/format_reward": 1.0, "step": 1871, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 375.015625, "epoch": 0.05692738109719012, "grad_norm": 1.3111713081537106, "kl": 0.02734375, "learning_rate": 9.920251175447397e-07, "loss": 0.0011, "reward": 1.8788461685180664, "reward_std": 0.22712530195713043, "rewards/accuracy_reward": 0.7319711446762085, "rewards/format_reward": 1.0, "step": 1872, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 380.59375, "epoch": 0.0569577910229899, "grad_norm": 0.7721329306152038, "kl": 0.03271484375, "learning_rate": 9.920166178475287e-07, "loss": 0.0013, "reward": 1.8742027282714844, "reward_std": 0.09498561173677444, "rewards/accuracy_reward": 0.7273277640342712, "rewards/format_reward": 1.0, "step": 1873, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.03125, "epoch": 0.05698820094878968, "grad_norm": 0.9944406872404637, "kl": 0.03173828125, "learning_rate": 9.920081136596569e-07, "loss": 0.0013, "reward": 1.798855185508728, "reward_std": 0.1442265510559082, "rewards/accuracy_reward": 0.6394801139831543, "rewards/format_reward": 1.0, "step": 1874, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 344.484375, "epoch": 0.05701861087458947, "grad_norm": 0.6691698710531648, "kl": 0.040283203125, "learning_rate": 9.919996049812017e-07, "loss": 0.0016, "reward": 2.105757236480713, "reward_std": 0.009038932621479034, "rewards/accuracy_reward": 0.905756950378418, "rewards/format_reward": 1.0, "step": 1875, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 390.296875, "epoch": 0.05704902080038925, "grad_norm": 0.647064639184316, "kl": 0.0294189453125, "learning_rate": 9.919910918122415e-07, "loss": 0.0012, "reward": 1.8351638317108154, "reward_std": 0.025424057617783546, "rewards/accuracy_reward": 0.7070387601852417, "rewards/format_reward": 1.0, "step": 1876, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.515625, "epoch": 0.057079430726189026, "grad_norm": 0.8856239279835408, "kl": 0.0303955078125, "learning_rate": 9.919825741528533e-07, "loss": 0.0012, "reward": 2.016319513320923, "reward_std": 0.008788714185357094, "rewards/accuracy_reward": 0.8163194060325623, "rewards/format_reward": 1.0, "step": 1877, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 384.46875, "epoch": 0.05710984065198881, "grad_norm": 1.0406720879063511, "kl": 0.036376953125, "learning_rate": 9.919740520031155e-07, "loss": 0.0015, "reward": 1.980074405670166, "reward_std": 0.09344036132097244, "rewards/accuracy_reward": 0.7925743460655212, "rewards/format_reward": 1.0, "step": 1878, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 383.203125, "epoch": 0.05714025057778859, "grad_norm": 0.7667431274639929, "kl": 0.034423828125, "learning_rate": 9.919655253631054e-07, "loss": 0.0014, "reward": 1.8458385467529297, "reward_std": 0.15162408351898193, "rewards/accuracy_reward": 0.6927134990692139, "rewards/format_reward": 1.0, "step": 1879, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.25, "epoch": 0.05717066050358837, "grad_norm": 1.14080870389853, "kl": 0.048095703125, "learning_rate": 9.919569942329009e-07, "loss": 0.0019, "reward": 1.7443220615386963, "reward_std": 0.06011394411325455, "rewards/accuracy_reward": 0.6005719900131226, "rewards/format_reward": 1.0, "step": 1880, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 375.75, "epoch": 0.05720107042938815, "grad_norm": 2.637647472482071, "kl": 0.0390625, "learning_rate": 9.919484586125802e-07, "loss": 0.0016, "reward": 1.7118557691574097, "reward_std": 0.13622045516967773, "rewards/accuracy_reward": 0.5774807929992676, "rewards/format_reward": 1.0, "step": 1881, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.71875, "epoch": 0.057231480355187936, "grad_norm": 0.5519170912122575, "kl": 0.033203125, "learning_rate": 9.919399185022206e-07, "loss": 0.0013, "reward": 1.9000000953674316, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1882, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 364.078125, "epoch": 0.057261890280987715, "grad_norm": 0.07778799895529925, "kl": 0.034423828125, "learning_rate": 9.919313739019005e-07, "loss": 0.0014, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1883, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 356.5625, "epoch": 0.057292300206787494, "grad_norm": 0.4071381724904059, "kl": 0.033447265625, "learning_rate": 9.919228248116979e-07, "loss": 0.0013, "reward": 1.6375000476837158, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1884, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.625, "epoch": 0.05732271013258727, "grad_norm": 1.3221228163380951, "kl": 0.03271484375, "learning_rate": 9.919142712316908e-07, "loss": 0.0013, "reward": 1.8824599981307983, "reward_std": 0.10024935007095337, "rewards/accuracy_reward": 0.7137100100517273, "rewards/format_reward": 1.0, "step": 1885, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 360.84375, "epoch": 0.05735312005838706, "grad_norm": 0.3438298259669171, "kl": 0.0302734375, "learning_rate": 9.91905713161957e-07, "loss": 0.0012, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 1886, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 364.703125, "epoch": 0.05738352998418684, "grad_norm": 0.697426835202751, "kl": 0.03759765625, "learning_rate": 9.918971506025748e-07, "loss": 0.0015, "reward": 1.9042613506317139, "reward_std": 0.0547148659825325, "rewards/accuracy_reward": 0.7542614340782166, "rewards/format_reward": 1.0, "step": 1887, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 391.25, "epoch": 0.05741393990998662, "grad_norm": 0.7603068597705075, "kl": 0.032470703125, "learning_rate": 9.918885835536224e-07, "loss": 0.0013, "reward": 1.788675308227539, "reward_std": 0.06308997422456741, "rewards/accuracy_reward": 0.6574252247810364, "rewards/format_reward": 1.0, "step": 1888, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 384.96875, "epoch": 0.057444349835786404, "grad_norm": 1.2720786266932729, "kl": 0.034423828125, "learning_rate": 9.91880012015178e-07, "loss": 0.0014, "reward": 2.0269055366516113, "reward_std": 0.032083842903375626, "rewards/accuracy_reward": 0.8362805247306824, "rewards/format_reward": 1.0, "step": 1889, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 376.46875, "epoch": 0.05747475976158618, "grad_norm": 1.1097446528975026, "kl": 0.0311279296875, "learning_rate": 9.918714359873196e-07, "loss": 0.0012, "reward": 1.5291284322738647, "reward_std": 0.21159903705120087, "rewards/accuracy_reward": 0.4385033845901489, "rewards/format_reward": 1.0, "step": 1890, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 348.8125, "epoch": 0.05750516968738596, "grad_norm": 0.742983427881877, "kl": 0.03125, "learning_rate": 9.918628554701255e-07, "loss": 0.0013, "reward": 1.912500023841858, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 1891, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 348.375, "epoch": 0.05753557961318574, "grad_norm": 0.8924434227276397, "kl": 0.0400390625, "learning_rate": 9.918542704636746e-07, "loss": 0.0016, "reward": 2.1175732612609863, "reward_std": 0.0632951632142067, "rewards/accuracy_reward": 0.9206979870796204, "rewards/format_reward": 1.0, "step": 1892, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 358.796875, "epoch": 0.05756598953898553, "grad_norm": 0.3597485326232882, "kl": 0.034423828125, "learning_rate": 9.918456809680444e-07, "loss": 0.0014, "reward": 1.953125, "reward_std": 0.07372426986694336, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 1893, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 384.390625, "epoch": 0.05759639946478531, "grad_norm": 1.4960860461058496, "kl": 0.029052734375, "learning_rate": 9.91837086983314e-07, "loss": 0.0012, "reward": 1.9725404977798462, "reward_std": 0.1434636116027832, "rewards/accuracy_reward": 0.8100404143333435, "rewards/format_reward": 1.0, "step": 1894, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 366.21875, "epoch": 0.057626809390585086, "grad_norm": 0.9198639198174171, "kl": 0.02783203125, "learning_rate": 9.918284885095614e-07, "loss": 0.0011, "reward": 1.578125, "reward_std": 0.19380906224250793, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 1895, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 372.015625, "epoch": 0.057657219316384865, "grad_norm": 1.7578506511545606, "kl": 0.0289306640625, "learning_rate": 9.918198855468652e-07, "loss": 0.0012, "reward": 1.428168773651123, "reward_std": 0.06693951785564423, "rewards/accuracy_reward": 0.35316866636276245, "rewards/format_reward": 1.0, "step": 1896, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 346.125, "epoch": 0.05768762924218465, "grad_norm": 0.8962215786927146, "kl": 0.03173828125, "learning_rate": 9.91811278095304e-07, "loss": 0.0013, "reward": 2.0250000953674316, "reward_std": 0.070710688829422, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1897, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 343.890625, "epoch": 0.05771803916798443, "grad_norm": 1.8427826830397054, "kl": 0.03662109375, "learning_rate": 9.918026661549561e-07, "loss": 0.0015, "reward": 2.0277183055877686, "reward_std": 0.10206834971904755, "rewards/accuracy_reward": 0.8339681625366211, "rewards/format_reward": 1.0, "step": 1898, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 358.40625, "epoch": 0.05774844909378421, "grad_norm": 1.0199748147036032, "kl": 0.027587890625, "learning_rate": 9.917940497259003e-07, "loss": 0.0011, "reward": 1.5790156126022339, "reward_std": 0.07046148926019669, "rewards/accuracy_reward": 0.4821406900882721, "rewards/format_reward": 1.0, "step": 1899, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 345.109375, "epoch": 0.05777885901958399, "grad_norm": 0.7844493641141327, "kl": 0.0380859375, "learning_rate": 9.917854288082154e-07, "loss": 0.0015, "reward": 2.105393886566162, "reward_std": 0.021596401929855347, "rewards/accuracy_reward": 0.9116440415382385, "rewards/format_reward": 1.0, "step": 1900, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 348.125, "epoch": 0.057809268945383775, "grad_norm": 0.8513497946067383, "kl": 0.0306396484375, "learning_rate": 9.9177680340198e-07, "loss": 0.0012, "reward": 2.09395170211792, "reward_std": 0.021317126229405403, "rewards/accuracy_reward": 0.8939515948295593, "rewards/format_reward": 1.0, "step": 1901, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 365.34375, "epoch": 0.057839678871183554, "grad_norm": 1.2822183774256382, "kl": 0.031494140625, "learning_rate": 9.917681735072726e-07, "loss": 0.0013, "reward": 1.9283008575439453, "reward_std": 0.12729044258594513, "rewards/accuracy_reward": 0.7501758337020874, "rewards/format_reward": 1.0, "step": 1902, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 376.265625, "epoch": 0.057870088796983334, "grad_norm": 1.1855024797954445, "kl": 0.036376953125, "learning_rate": 9.917595391241721e-07, "loss": 0.0015, "reward": 1.8624359369277954, "reward_std": 0.04997137933969498, "rewards/accuracy_reward": 0.67806077003479, "rewards/format_reward": 1.0, "step": 1903, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 356.828125, "epoch": 0.05790049872278312, "grad_norm": 1.6942335084396216, "kl": 0.0361328125, "learning_rate": 9.917509002527575e-07, "loss": 0.0014, "reward": 2.035306930541992, "reward_std": 0.12590652704238892, "rewards/accuracy_reward": 0.8603069186210632, "rewards/format_reward": 1.0, "step": 1904, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.34375, "epoch": 0.0579309086485829, "grad_norm": 0.8028863580163638, "kl": 0.026123046875, "learning_rate": 9.917422568931075e-07, "loss": 0.001, "reward": 1.9750001430511475, "reward_std": 0.235443115234375, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 1905, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 373.171875, "epoch": 0.05796131857438268, "grad_norm": 0.6193118223896407, "kl": 0.02978515625, "learning_rate": 9.917336090453008e-07, "loss": 0.0012, "reward": 1.600000023841858, "reward_std": 0.2220800668001175, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 1906, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 379.078125, "epoch": 0.05799172850018246, "grad_norm": 1.1171733928028575, "kl": 0.03076171875, "learning_rate": 9.917249567094164e-07, "loss": 0.0012, "reward": 1.8750169277191162, "reward_std": 0.0764056071639061, "rewards/accuracy_reward": 0.7093918919563293, "rewards/format_reward": 1.0, "step": 1907, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 350.75, "epoch": 0.05802213842598224, "grad_norm": 0.972485497035434, "kl": 0.03662109375, "learning_rate": 9.917162998855335e-07, "loss": 0.0015, "reward": 1.7945833206176758, "reward_std": 0.09081462025642395, "rewards/accuracy_reward": 0.6539583206176758, "rewards/format_reward": 1.0, "step": 1908, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.546875, "epoch": 0.05805254835178202, "grad_norm": 2.3575955652391416, "kl": 0.0289306640625, "learning_rate": 9.917076385737311e-07, "loss": 0.0012, "reward": 1.9692280292510986, "reward_std": 0.14977966248989105, "rewards/accuracy_reward": 0.7879780530929565, "rewards/format_reward": 1.0, "step": 1909, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 374.0, "epoch": 0.0580829582775818, "grad_norm": 0.9416126752498891, "kl": 0.03662109375, "learning_rate": 9.916989727740878e-07, "loss": 0.0015, "reward": 1.8093750476837158, "reward_std": 0.18150544166564941, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1910, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 366.234375, "epoch": 0.05811336820338158, "grad_norm": 0.6814228686218095, "kl": 0.0294189453125, "learning_rate": 9.916903024866833e-07, "loss": 0.0012, "reward": 1.734375, "reward_std": 0.06568655371665955, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 1911, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 368.421875, "epoch": 0.05814377812918137, "grad_norm": 1.3087947491934386, "kl": 0.030517578125, "learning_rate": 9.916816277115963e-07, "loss": 0.0012, "reward": 1.7895748615264893, "reward_std": 0.23183944821357727, "rewards/accuracy_reward": 0.6364498138427734, "rewards/format_reward": 1.0, "step": 1912, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 359.3125, "epoch": 0.058174188054981146, "grad_norm": 1.089610792522334, "kl": 0.033203125, "learning_rate": 9.916729484489062e-07, "loss": 0.0013, "reward": 1.93362295627594, "reward_std": 0.04842543601989746, "rewards/accuracy_reward": 0.7648729085922241, "rewards/format_reward": 1.0, "step": 1913, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 340.359375, "epoch": 0.058204597980780926, "grad_norm": 1.7154099383940173, "kl": 0.03955078125, "learning_rate": 9.916642646986922e-07, "loss": 0.0016, "reward": 2.028125047683716, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1914, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 380.46875, "epoch": 0.058235007906580705, "grad_norm": 1.4605282242671804, "kl": 0.0263671875, "learning_rate": 9.916555764610332e-07, "loss": 0.0011, "reward": 1.6188099384307861, "reward_std": 0.25562745332717896, "rewards/accuracy_reward": 0.4875600039958954, "rewards/format_reward": 1.0, "step": 1915, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 365.953125, "epoch": 0.05826541783238049, "grad_norm": 1.6940972369812064, "kl": 0.03662109375, "learning_rate": 9.91646883736009e-07, "loss": 0.0015, "reward": 1.7356996536254883, "reward_std": 0.042403820902109146, "rewards/accuracy_reward": 0.5669495463371277, "rewards/format_reward": 1.0, "step": 1916, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 335.171875, "epoch": 0.05829582775818027, "grad_norm": 6.254087126604567, "kl": 0.048095703125, "learning_rate": 9.916381865236989e-07, "loss": 0.0019, "reward": 1.859375, "reward_std": 0.06805657595396042, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 1917, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 358.5, "epoch": 0.05832623768398005, "grad_norm": 0.5788029293989937, "kl": 0.02783203125, "learning_rate": 9.916294848241819e-07, "loss": 0.0011, "reward": 1.7988590002059937, "reward_std": 0.0588177852332592, "rewards/accuracy_reward": 0.6519839763641357, "rewards/format_reward": 1.0, "step": 1918, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 377.40625, "epoch": 0.058356647609779835, "grad_norm": 1.5719701391524767, "kl": 0.0380859375, "learning_rate": 9.916207786375377e-07, "loss": 0.0015, "reward": 1.7020118236541748, "reward_std": 0.03771716356277466, "rewards/accuracy_reward": 0.5551367402076721, "rewards/format_reward": 1.0, "step": 1919, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 360.046875, "epoch": 0.058387057535579615, "grad_norm": 1.1006374956824891, "kl": 0.04052734375, "learning_rate": 9.916120679638456e-07, "loss": 0.0016, "reward": 1.7598072290420532, "reward_std": 0.03736592084169388, "rewards/accuracy_reward": 0.6348072290420532, "rewards/format_reward": 1.0, "step": 1920, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 348.328125, "epoch": 0.058417467461379394, "grad_norm": 1.4631790548379138, "kl": 0.03369140625, "learning_rate": 9.916033528031853e-07, "loss": 0.0013, "reward": 1.846264123916626, "reward_std": 0.1404450386762619, "rewards/accuracy_reward": 0.6806390285491943, "rewards/format_reward": 1.0, "step": 1921, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 377.21875, "epoch": 0.05844787738717917, "grad_norm": 0.5298400163797515, "kl": 0.033935546875, "learning_rate": 9.915946331556362e-07, "loss": 0.0014, "reward": 1.6437499523162842, "reward_std": 0.07353248447179794, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 1922, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 349.265625, "epoch": 0.05847828731297896, "grad_norm": 2.9732511633120375, "kl": 0.0439453125, "learning_rate": 9.915859090212779e-07, "loss": 0.0018, "reward": 2.1839284896850586, "reward_std": 0.02880534529685974, "rewards/accuracy_reward": 0.9933035969734192, "rewards/format_reward": 1.0, "step": 1923, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 345.828125, "epoch": 0.05850869723877874, "grad_norm": 0.6937139034444461, "kl": 0.041259765625, "learning_rate": 9.915771804001902e-07, "loss": 0.0017, "reward": 2.03125, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 1924, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 334.0625, "epoch": 0.05853910716457852, "grad_norm": 4.768468154562458, "kl": 0.0478515625, "learning_rate": 9.915684472924527e-07, "loss": 0.0019, "reward": 2.1249351501464844, "reward_std": 0.11453960835933685, "rewards/accuracy_reward": 0.9405601024627686, "rewards/format_reward": 1.0, "step": 1925, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 353.703125, "epoch": 0.0585695170903783, "grad_norm": 0.06815154368267241, "kl": 0.035888671875, "learning_rate": 9.915597096981447e-07, "loss": 0.0014, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1926, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.234375, "epoch": 0.05859992701617808, "grad_norm": 0.6822074231209386, "kl": 0.0302734375, "learning_rate": 9.915509676173463e-07, "loss": 0.0012, "reward": 1.8250000476837158, "reward_std": 0.2222006767988205, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 1927, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 340.9375, "epoch": 0.05863033694197786, "grad_norm": 1.349772442957914, "kl": 0.04052734375, "learning_rate": 9.915422210501375e-07, "loss": 0.0016, "reward": 1.6343750953674316, "reward_std": 0.21926461160182953, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 1928, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 355.421875, "epoch": 0.05866074686777764, "grad_norm": 0.5661528089209161, "kl": 0.0291748046875, "learning_rate": 9.91533469996598e-07, "loss": 0.0012, "reward": 1.8465278148651123, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.6996527910232544, "rewards/format_reward": 1.0, "step": 1929, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 399.515625, "epoch": 0.05869115679357742, "grad_norm": 1.0536658473140443, "kl": 0.035400390625, "learning_rate": 9.915247144568072e-07, "loss": 0.0014, "reward": 1.5600395202636719, "reward_std": 0.09187215566635132, "rewards/accuracy_reward": 0.46316447854042053, "rewards/format_reward": 1.0, "step": 1930, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.984375, "epoch": 0.058721566719377206, "grad_norm": 1.0803966285212137, "kl": 0.039794921875, "learning_rate": 9.915159544308457e-07, "loss": 0.0016, "reward": 1.9145196676254272, "reward_std": 0.08712279796600342, "rewards/accuracy_reward": 0.7520195245742798, "rewards/format_reward": 1.0, "step": 1931, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.453125, "epoch": 0.058751976645176986, "grad_norm": 1.7890682422940305, "kl": 0.03173828125, "learning_rate": 9.915071899187928e-07, "loss": 0.0013, "reward": 1.9411742687225342, "reward_std": 0.10596342384815216, "rewards/accuracy_reward": 0.7567992210388184, "rewards/format_reward": 1.0, "step": 1932, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 363.984375, "epoch": 0.058782386570976765, "grad_norm": 4.275883135800628, "kl": 0.045166015625, "learning_rate": 9.91498420920729e-07, "loss": 0.0018, "reward": 2.0919463634490967, "reward_std": 0.026878736913204193, "rewards/accuracy_reward": 0.8950713276863098, "rewards/format_reward": 1.0, "step": 1933, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.40625, "epoch": 0.05881279649677655, "grad_norm": 1.5820421822262587, "kl": 0.04443359375, "learning_rate": 9.914896474367342e-07, "loss": 0.0018, "reward": 2.0290799140930176, "reward_std": 0.14544382691383362, "rewards/accuracy_reward": 0.8478297591209412, "rewards/format_reward": 1.0, "step": 1934, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 385.140625, "epoch": 0.05884320642257633, "grad_norm": 1.1529414658385009, "kl": 0.04052734375, "learning_rate": 9.914808694668883e-07, "loss": 0.0016, "reward": 1.892958641052246, "reward_std": 0.15377146005630493, "rewards/accuracy_reward": 0.7460835576057434, "rewards/format_reward": 1.0, "step": 1935, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 368.9375, "epoch": 0.05887361634837611, "grad_norm": 0.9532487787940221, "kl": 0.03369140625, "learning_rate": 9.914720870112717e-07, "loss": 0.0013, "reward": 1.615625023841858, "reward_std": 0.183067187666893, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 1936, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 348.5625, "epoch": 0.05890402627417589, "grad_norm": 0.5182027767039996, "kl": 0.037109375, "learning_rate": 9.914633000699643e-07, "loss": 0.0015, "reward": 1.9623512029647827, "reward_std": 0.017505919560790062, "rewards/accuracy_reward": 0.7873511910438538, "rewards/format_reward": 1.0, "step": 1937, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 363.640625, "epoch": 0.058934436199975675, "grad_norm": 0.718950047998924, "kl": 0.03564453125, "learning_rate": 9.914545086430466e-07, "loss": 0.0014, "reward": 1.804757833480835, "reward_std": 0.12973397970199585, "rewards/accuracy_reward": 0.6641327142715454, "rewards/format_reward": 1.0, "step": 1938, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 397.546875, "epoch": 0.058964846125775454, "grad_norm": 0.8850242151250404, "kl": 0.036865234375, "learning_rate": 9.914457127305985e-07, "loss": 0.0015, "reward": 1.7022138833999634, "reward_std": 0.08681733906269073, "rewards/accuracy_reward": 0.5740888714790344, "rewards/format_reward": 1.0, "step": 1939, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 389.84375, "epoch": 0.05899525605157523, "grad_norm": 1.2719879470450386, "kl": 0.03271484375, "learning_rate": 9.914369123327004e-07, "loss": 0.0013, "reward": 1.7510440349578857, "reward_std": 0.16377565264701843, "rewards/accuracy_reward": 0.6041690707206726, "rewards/format_reward": 1.0, "step": 1940, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 339.953125, "epoch": 0.05902566597737501, "grad_norm": 0.06809387626321121, "kl": 0.04541015625, "learning_rate": 9.914281074494329e-07, "loss": 0.0018, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1941, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 367.90625, "epoch": 0.0590560759031748, "grad_norm": 0.9466723355131045, "kl": 0.03466796875, "learning_rate": 9.914192980808758e-07, "loss": 0.0014, "reward": 1.8051339387893677, "reward_std": 0.13636811077594757, "rewards/accuracy_reward": 0.6707589626312256, "rewards/format_reward": 1.0, "step": 1942, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 386.453125, "epoch": 0.05908648582897458, "grad_norm": 1.5251316131290986, "kl": 0.040283203125, "learning_rate": 9.914104842271101e-07, "loss": 0.0016, "reward": 1.8705720901489258, "reward_std": 0.147018164396286, "rewards/accuracy_reward": 0.7143219709396362, "rewards/format_reward": 1.0, "step": 1943, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 396.15625, "epoch": 0.05911689575477436, "grad_norm": 1.6906761069258769, "kl": 0.041259765625, "learning_rate": 9.914016658882158e-07, "loss": 0.0017, "reward": 1.7677013874053955, "reward_std": 0.15818242728710175, "rewards/accuracy_reward": 0.6302014589309692, "rewards/format_reward": 1.0, "step": 1944, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 349.984375, "epoch": 0.059147305680574136, "grad_norm": 0.46971376991558067, "kl": 0.04052734375, "learning_rate": 9.913928430642734e-07, "loss": 0.0016, "reward": 2.13149356842041, "reward_std": 0.005983040202409029, "rewards/accuracy_reward": 0.9314936399459839, "rewards/format_reward": 1.0, "step": 1945, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 353.53125, "epoch": 0.05917771560637392, "grad_norm": 1.6114911660061315, "kl": 0.042724609375, "learning_rate": 9.91384015755364e-07, "loss": 0.0017, "reward": 1.6861538887023926, "reward_std": 0.024565791711211205, "rewards/accuracy_reward": 0.5486539006233215, "rewards/format_reward": 1.0, "step": 1946, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 385.8125, "epoch": 0.0592081255321737, "grad_norm": 0.7909405850546506, "kl": 0.045166015625, "learning_rate": 9.913751839615673e-07, "loss": 0.0018, "reward": 1.8833873271942139, "reward_std": 0.07349240034818649, "rewards/accuracy_reward": 0.7240123748779297, "rewards/format_reward": 0.984375, "step": 1947, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 340.578125, "epoch": 0.05923853545797348, "grad_norm": 0.4588488007189282, "kl": 0.041748046875, "learning_rate": 9.913663476829645e-07, "loss": 0.0017, "reward": 1.7468750476837158, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1948, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 361.5625, "epoch": 0.05926894538377327, "grad_norm": 4.684657630115788, "kl": 0.03759765625, "learning_rate": 9.913575069196363e-07, "loss": 0.0015, "reward": 1.7546768188476562, "reward_std": 0.035466305911540985, "rewards/accuracy_reward": 0.6046769618988037, "rewards/format_reward": 1.0, "step": 1949, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 365.375, "epoch": 0.059299355309573046, "grad_norm": 2.1722017627324415, "kl": 0.048095703125, "learning_rate": 9.91348661671663e-07, "loss": 0.0019, "reward": 1.660821557044983, "reward_std": 0.12771271169185638, "rewards/accuracy_reward": 0.5201965570449829, "rewards/format_reward": 1.0, "step": 1950, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 350.109375, "epoch": 0.059329765235372825, "grad_norm": 0.06402290048245014, "kl": 0.040283203125, "learning_rate": 9.913398119391256e-07, "loss": 0.0016, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1951, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 363.1875, "epoch": 0.059360175161172604, "grad_norm": 3.8520892959687667, "kl": 0.039794921875, "learning_rate": 9.913309577221048e-07, "loss": 0.0016, "reward": 1.764976978302002, "reward_std": 0.2561027705669403, "rewards/accuracy_reward": 0.6181020140647888, "rewards/format_reward": 1.0, "step": 1952, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 393.09375, "epoch": 0.05939058508697239, "grad_norm": 0.9379537163565117, "kl": 0.0361328125, "learning_rate": 9.913220990206813e-07, "loss": 0.0014, "reward": 1.7286500930786133, "reward_std": 0.13317827880382538, "rewards/accuracy_reward": 0.6036500334739685, "rewards/format_reward": 1.0, "step": 1953, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 344.828125, "epoch": 0.05942099501277217, "grad_norm": 0.6042287195765036, "kl": 0.049072265625, "learning_rate": 9.91313235834936e-07, "loss": 0.002, "reward": 2.1427083015441895, "reward_std": 0.08057864755392075, "rewards/accuracy_reward": 0.9520833492279053, "rewards/format_reward": 1.0, "step": 1954, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 387.1875, "epoch": 0.05945140493857195, "grad_norm": 1.603273903753444, "kl": 0.037109375, "learning_rate": 9.913043681649502e-07, "loss": 0.0015, "reward": 1.6594336032867432, "reward_std": 0.028177479282021523, "rewards/accuracy_reward": 0.5125585794448853, "rewards/format_reward": 1.0, "step": 1955, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 367.84375, "epoch": 0.05948181486437173, "grad_norm": 0.9216119961149345, "kl": 0.04736328125, "learning_rate": 9.912954960108042e-07, "loss": 0.0019, "reward": 1.6091248989105225, "reward_std": 0.18067726492881775, "rewards/accuracy_reward": 0.4903748333454132, "rewards/format_reward": 1.0, "step": 1956, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 396.59375, "epoch": 0.059512224790171514, "grad_norm": 1.8395779007242576, "kl": 0.03515625, "learning_rate": 9.912866193725795e-07, "loss": 0.0014, "reward": 1.501940131187439, "reward_std": 0.1749444603919983, "rewards/accuracy_reward": 0.39569011330604553, "rewards/format_reward": 1.0, "step": 1957, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 386.671875, "epoch": 0.05954263471597129, "grad_norm": 1.441771828459778, "kl": 0.03857421875, "learning_rate": 9.912777382503565e-07, "loss": 0.0015, "reward": 1.822148084640503, "reward_std": 0.1737963706254959, "rewards/accuracy_reward": 0.6565229892730713, "rewards/format_reward": 1.0, "step": 1958, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.515625, "epoch": 0.05957304464177107, "grad_norm": 1.0875873324737508, "kl": 0.040771484375, "learning_rate": 9.91268852644217e-07, "loss": 0.0016, "reward": 1.97079598903656, "reward_std": 0.11148596554994583, "rewards/accuracy_reward": 0.8020459413528442, "rewards/format_reward": 1.0, "step": 1959, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 359.53125, "epoch": 0.05960345456757086, "grad_norm": 1.5087855442379052, "kl": 0.041015625, "learning_rate": 9.912599625542416e-07, "loss": 0.0016, "reward": 1.9784926176071167, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.8097426891326904, "rewards/format_reward": 1.0, "step": 1960, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 350.96875, "epoch": 0.05963386449337064, "grad_norm": 0.6169738650856209, "kl": 0.0439453125, "learning_rate": 9.912510679805116e-07, "loss": 0.0018, "reward": 1.7751202583312988, "reward_std": 0.014631202444434166, "rewards/accuracy_reward": 0.6376201510429382, "rewards/format_reward": 1.0, "step": 1961, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 378.796875, "epoch": 0.05966427441917042, "grad_norm": 1.2295601976912733, "kl": 0.0322265625, "learning_rate": 9.912421689231082e-07, "loss": 0.0013, "reward": 1.6150445938110352, "reward_std": 0.16728758811950684, "rewards/accuracy_reward": 0.518169641494751, "rewards/format_reward": 1.0, "step": 1962, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 346.09375, "epoch": 0.059694684344970196, "grad_norm": 0.4205179484664445, "kl": 0.037841796875, "learning_rate": 9.912332653821124e-07, "loss": 0.0015, "reward": 1.9937500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1963, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 362.46875, "epoch": 0.05972509427076998, "grad_norm": 1.0197033484652347, "kl": 0.04345703125, "learning_rate": 9.91224357357606e-07, "loss": 0.0017, "reward": 1.9127107858657837, "reward_std": 0.12639778852462769, "rewards/accuracy_reward": 0.7470858097076416, "rewards/format_reward": 1.0, "step": 1964, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 349.015625, "epoch": 0.05975550419656976, "grad_norm": 0.9501448060590215, "kl": 0.0537109375, "learning_rate": 9.912154448496696e-07, "loss": 0.0021, "reward": 2.068230390548706, "reward_std": 0.07340776920318604, "rewards/accuracy_reward": 0.8744803667068481, "rewards/format_reward": 1.0, "step": 1965, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 373.484375, "epoch": 0.05978591412236954, "grad_norm": 1.1271847528819734, "kl": 0.0595703125, "learning_rate": 9.912065278583853e-07, "loss": 0.0024, "reward": 1.7667450904846191, "reward_std": 0.024641025811433792, "rewards/accuracy_reward": 0.638620138168335, "rewards/format_reward": 1.0, "step": 1966, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 340.984375, "epoch": 0.05981632404816932, "grad_norm": 1.2533839993985527, "kl": 0.050048828125, "learning_rate": 9.911976063838338e-07, "loss": 0.002, "reward": 1.9451099634170532, "reward_std": 0.09429675340652466, "rewards/accuracy_reward": 0.7794848680496216, "rewards/format_reward": 1.0, "step": 1967, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 371.359375, "epoch": 0.059846733973969106, "grad_norm": 1.0457200137808438, "kl": 0.0478515625, "learning_rate": 9.91188680426097e-07, "loss": 0.0019, "reward": 1.8587749004364014, "reward_std": 0.018019117414951324, "rewards/accuracy_reward": 0.6868999004364014, "rewards/format_reward": 1.0, "step": 1968, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.5, "completion_length": 386.125, "epoch": 0.059877143899768885, "grad_norm": 1.0431837283311127, "kl": 0.03857421875, "learning_rate": 9.911797499852559e-07, "loss": 0.0015, "reward": 1.321475863456726, "reward_std": 0.10865139961242676, "rewards/accuracy_reward": 0.2527258098125458, "rewards/format_reward": 1.0, "step": 1969, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 357.671875, "epoch": 0.059907553825568664, "grad_norm": 1.142398935898857, "kl": 0.042236328125, "learning_rate": 9.911708150613924e-07, "loss": 0.0017, "reward": 1.7703170776367188, "reward_std": 0.009297682903707027, "rewards/accuracy_reward": 0.6203169822692871, "rewards/format_reward": 1.0, "step": 1970, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.84375, "epoch": 0.059937963751368444, "grad_norm": 0.8206907445941592, "kl": 0.031494140625, "learning_rate": 9.91161875654588e-07, "loss": 0.0013, "reward": 1.8650250434875488, "reward_std": 0.0807153582572937, "rewards/accuracy_reward": 0.7181500196456909, "rewards/format_reward": 1.0, "step": 1971, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 375.109375, "epoch": 0.05996837367716823, "grad_norm": 0.6414932018732011, "kl": 0.032470703125, "learning_rate": 9.91152931764924e-07, "loss": 0.0013, "reward": 1.8312499523162842, "reward_std": 0.15609951317310333, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1972, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 355.671875, "epoch": 0.05999878360296801, "grad_norm": 0.8738284334042088, "kl": 0.045166015625, "learning_rate": 9.911439833924826e-07, "loss": 0.0018, "reward": 1.677626609802246, "reward_std": 0.02369932271540165, "rewards/accuracy_reward": 0.5588765144348145, "rewards/format_reward": 1.0, "step": 1973, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 368.828125, "epoch": 0.06002919352876779, "grad_norm": 1.5296967126642902, "kl": 0.04443359375, "learning_rate": 9.91135030537345e-07, "loss": 0.0018, "reward": 1.8013516664505005, "reward_std": 0.08538375049829483, "rewards/accuracy_reward": 0.6419765949249268, "rewards/format_reward": 1.0, "step": 1974, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 370.28125, "epoch": 0.060059603454567574, "grad_norm": 1.0523610924261948, "kl": 0.029541015625, "learning_rate": 9.911260731995929e-07, "loss": 0.0012, "reward": 1.8492486476898193, "reward_std": 0.12775130569934845, "rewards/accuracy_reward": 0.6804986000061035, "rewards/format_reward": 1.0, "step": 1975, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 372.046875, "epoch": 0.06009001338036735, "grad_norm": 0.5109180259835793, "kl": 0.03369140625, "learning_rate": 9.911171113793084e-07, "loss": 0.0013, "reward": 1.9937500953674316, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1976, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 390.40625, "epoch": 0.06012042330616713, "grad_norm": 1.102908024981143, "kl": 0.0341796875, "learning_rate": 9.91108145076573e-07, "loss": 0.0014, "reward": 2.0, "reward_std": 0.12268617749214172, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 1977, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.234375, "epoch": 0.06015083323196691, "grad_norm": 1.709610783822741, "kl": 0.03662109375, "learning_rate": 9.910991742914688e-07, "loss": 0.0015, "reward": 1.8974268436431885, "reward_std": 0.08651846647262573, "rewards/accuracy_reward": 0.7130517959594727, "rewards/format_reward": 1.0, "step": 1978, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 347.984375, "epoch": 0.0601812431577667, "grad_norm": 0.06368488202440593, "kl": 0.037841796875, "learning_rate": 9.910901990240772e-07, "loss": 0.0015, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 1979, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 383.375, "epoch": 0.06021165308356648, "grad_norm": 2.3757866306151434, "kl": 0.044921875, "learning_rate": 9.910812192744807e-07, "loss": 0.0018, "reward": 1.869271159172058, "reward_std": 0.05167567729949951, "rewards/accuracy_reward": 0.7098960876464844, "rewards/format_reward": 1.0, "step": 1980, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 406.703125, "epoch": 0.060242063009366256, "grad_norm": 1.3654657594927055, "kl": 0.0308837890625, "learning_rate": 9.910722350427609e-07, "loss": 0.0012, "reward": 1.6654720306396484, "reward_std": 0.22080287337303162, "rewards/accuracy_reward": 0.5623469352722168, "rewards/format_reward": 1.0, "step": 1981, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 372.015625, "epoch": 0.060272472935166035, "grad_norm": 0.6070560587513565, "kl": 0.0400390625, "learning_rate": 9.910632463289999e-07, "loss": 0.0016, "reward": 1.9875001907348633, "reward_std": 0.08435788005590439, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 1982, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 347.4375, "epoch": 0.06030288286096582, "grad_norm": 1.3307857984054687, "kl": 0.041259765625, "learning_rate": 9.910542531332795e-07, "loss": 0.0017, "reward": 1.9093294143676758, "reward_std": 0.11756882816553116, "rewards/accuracy_reward": 0.7437043786048889, "rewards/format_reward": 1.0, "step": 1983, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 354.453125, "epoch": 0.0603332927867656, "grad_norm": 17.77065210727443, "kl": 0.04150390625, "learning_rate": 9.910452554556822e-07, "loss": 0.0017, "reward": 1.9469906091690063, "reward_std": 0.1214180737733841, "rewards/accuracy_reward": 0.7876155972480774, "rewards/format_reward": 1.0, "step": 1984, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 366.703125, "epoch": 0.06036370271256538, "grad_norm": 0.9472929521263453, "kl": 0.0390625, "learning_rate": 9.910362532962898e-07, "loss": 0.0016, "reward": 1.5874080657958984, "reward_std": 0.18145638704299927, "rewards/accuracy_reward": 0.47803300619125366, "rewards/format_reward": 1.0, "step": 1985, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.1875, "epoch": 0.06039411263836516, "grad_norm": 0.9138971312077004, "kl": 0.0303955078125, "learning_rate": 9.910272466551847e-07, "loss": 0.0012, "reward": 1.9122934341430664, "reward_std": 0.11233077198266983, "rewards/accuracy_reward": 0.7372933030128479, "rewards/format_reward": 1.0, "step": 1986, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.421875, "epoch": 0.060424522564164945, "grad_norm": 0.9864659679933918, "kl": 0.04052734375, "learning_rate": 9.91018235532449e-07, "loss": 0.0016, "reward": 1.989978551864624, "reward_std": 0.17572543025016785, "rewards/accuracy_reward": 0.8337284326553345, "rewards/format_reward": 1.0, "step": 1987, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 397.953125, "epoch": 0.060454932489964724, "grad_norm": 2.4991825660598708, "kl": 0.040283203125, "learning_rate": 9.91009219928165e-07, "loss": 0.0016, "reward": 1.6769561767578125, "reward_std": 0.08373412489891052, "rewards/accuracy_reward": 0.5238310694694519, "rewards/format_reward": 1.0, "step": 1988, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 394.484375, "epoch": 0.060485342415764504, "grad_norm": 2.0879254561320177, "kl": 0.03662109375, "learning_rate": 9.910001998424148e-07, "loss": 0.0015, "reward": 1.6113088130950928, "reward_std": 0.2243492603302002, "rewards/accuracy_reward": 0.46443384885787964, "rewards/format_reward": 1.0, "step": 1989, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 373.21875, "epoch": 0.06051575234156429, "grad_norm": 0.996015561494989, "kl": 0.046142578125, "learning_rate": 9.90991175275281e-07, "loss": 0.0019, "reward": 1.8631010055541992, "reward_std": 0.041487954556941986, "rewards/accuracy_reward": 0.7006008625030518, "rewards/format_reward": 1.0, "step": 1990, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 383.515625, "epoch": 0.06054616226736407, "grad_norm": 0.7343137120308841, "kl": 0.033203125, "learning_rate": 9.909821462268457e-07, "loss": 0.0013, "reward": 1.7302956581115723, "reward_std": 0.1389673501253128, "rewards/accuracy_reward": 0.5959206819534302, "rewards/format_reward": 1.0, "step": 1991, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 355.75, "epoch": 0.06057657219316385, "grad_norm": 0.776607792500057, "kl": 0.03662109375, "learning_rate": 9.909731126971913e-07, "loss": 0.0015, "reward": 1.9947916269302368, "reward_std": 0.0681459903717041, "rewards/accuracy_reward": 0.8229166269302368, "rewards/format_reward": 1.0, "step": 1992, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 381.515625, "epoch": 0.06060698211896363, "grad_norm": 0.835727328002826, "kl": 0.03564453125, "learning_rate": 9.909640746864008e-07, "loss": 0.0014, "reward": 1.8006861209869385, "reward_std": 0.08327779918909073, "rewards/accuracy_reward": 0.6538111567497253, "rewards/format_reward": 1.0, "step": 1993, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 386.125, "epoch": 0.060637392044763413, "grad_norm": 1.0524091159144835, "kl": 0.0308837890625, "learning_rate": 9.90955032194556e-07, "loss": 0.0012, "reward": 1.646376609802246, "reward_std": 0.15505598485469818, "rewards/accuracy_reward": 0.524501621723175, "rewards/format_reward": 1.0, "step": 1994, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 389.0, "epoch": 0.06066780197056319, "grad_norm": 3.573113734574524, "kl": 0.04052734375, "learning_rate": 9.909459852217396e-07, "loss": 0.0016, "reward": 1.9151726961135864, "reward_std": 0.12631858885288239, "rewards/accuracy_reward": 0.7589226961135864, "rewards/format_reward": 1.0, "step": 1995, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 371.09375, "epoch": 0.06069821189636297, "grad_norm": 3.962360284402888, "kl": 0.035888671875, "learning_rate": 9.909369337680344e-07, "loss": 0.0014, "reward": 2.110389232635498, "reward_std": 0.02068474516272545, "rewards/accuracy_reward": 0.9197641015052795, "rewards/format_reward": 1.0, "step": 1996, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 373.46875, "epoch": 0.06072862182216275, "grad_norm": 0.7652132452324627, "kl": 0.03857421875, "learning_rate": 9.90927877833523e-07, "loss": 0.0015, "reward": 1.683347225189209, "reward_std": 0.14815086126327515, "rewards/accuracy_reward": 0.5708472728729248, "rewards/format_reward": 1.0, "step": 1997, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 358.109375, "epoch": 0.06075903174796254, "grad_norm": 0.9326094194701183, "kl": 0.029296875, "learning_rate": 9.909188174182876e-07, "loss": 0.0012, "reward": 1.8406250476837158, "reward_std": 0.17485956847667694, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 1998, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.703125, "epoch": 0.060789441673762316, "grad_norm": 1.3427057846942594, "kl": 0.03662109375, "learning_rate": 9.909097525224113e-07, "loss": 0.0015, "reward": 1.9200384616851807, "reward_std": 0.08428885787725449, "rewards/accuracy_reward": 0.7294133901596069, "rewards/format_reward": 1.0, "step": 1999, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 398.5625, "epoch": 0.060819851599562096, "grad_norm": 3.0749503768137565, "kl": 0.037353515625, "learning_rate": 9.90900683145977e-07, "loss": 0.0015, "reward": 1.6705085039138794, "reward_std": 0.11880546808242798, "rewards/accuracy_reward": 0.5173835158348083, "rewards/format_reward": 1.0, "step": 2000, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 389.625, "epoch": 0.060850261525361875, "grad_norm": 2.4584141836633067, "kl": 0.0322265625, "learning_rate": 9.908916092890672e-07, "loss": 0.0013, "reward": 1.7610342502593994, "reward_std": 0.07611941546201706, "rewards/accuracy_reward": 0.6235341429710388, "rewards/format_reward": 1.0, "step": 2001, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 395.984375, "epoch": 0.06088067145116166, "grad_norm": 0.8096807580529091, "kl": 0.03076171875, "learning_rate": 9.908825309517646e-07, "loss": 0.0012, "reward": 1.9089552164077759, "reward_std": 0.14473243057727814, "rewards/accuracy_reward": 0.7495800256729126, "rewards/format_reward": 1.0, "step": 2002, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 366.078125, "epoch": 0.06091108137696144, "grad_norm": 1.4661966747165747, "kl": 0.034912109375, "learning_rate": 9.908734481341524e-07, "loss": 0.0014, "reward": 1.7146159410476685, "reward_std": 0.03164929151535034, "rewards/accuracy_reward": 0.5708659291267395, "rewards/format_reward": 1.0, "step": 2003, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 346.859375, "epoch": 0.06094149130276122, "grad_norm": 2.0676895372245974, "kl": 0.037841796875, "learning_rate": 9.908643608363131e-07, "loss": 0.0015, "reward": 1.975151538848877, "reward_std": 0.1236010193824768, "rewards/accuracy_reward": 0.8095265030860901, "rewards/format_reward": 1.0, "step": 2004, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 403.5, "epoch": 0.060971901228561005, "grad_norm": 1.8778853673451223, "kl": 0.03173828125, "learning_rate": 9.9085526905833e-07, "loss": 0.0013, "reward": 1.8376152515411377, "reward_std": 0.21920059621334076, "rewards/accuracy_reward": 0.6876152157783508, "rewards/format_reward": 1.0, "step": 2005, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.328125, "epoch": 0.061002311154360785, "grad_norm": 1.1192917042278074, "kl": 0.036865234375, "learning_rate": 9.90846172800286e-07, "loss": 0.0015, "reward": 1.84938383102417, "reward_std": 0.12319977581501007, "rewards/accuracy_reward": 0.6837587952613831, "rewards/format_reward": 1.0, "step": 2006, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 388.84375, "epoch": 0.061032721080160564, "grad_norm": 6.192662454100659, "kl": 0.032470703125, "learning_rate": 9.90837072062264e-07, "loss": 0.0013, "reward": 1.8039448261260986, "reward_std": 0.27431347966194153, "rewards/accuracy_reward": 0.6914447546005249, "rewards/format_reward": 0.984375, "step": 2007, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 375.328125, "epoch": 0.06106313100596034, "grad_norm": 0.6153442989840169, "kl": 0.031494140625, "learning_rate": 9.908279668443471e-07, "loss": 0.0013, "reward": 1.9864578247070312, "reward_std": 0.047391265630722046, "rewards/accuracy_reward": 0.8114577531814575, "rewards/format_reward": 1.0, "step": 2008, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 345.40625, "epoch": 0.06109354093176013, "grad_norm": 0.9402845941055297, "kl": 0.045166015625, "learning_rate": 9.908188571466184e-07, "loss": 0.0018, "reward": 1.9312500953674316, "reward_std": 0.08711418509483337, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 2009, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 345.015625, "epoch": 0.06112395085755991, "grad_norm": 1.4668053882271297, "kl": 0.03564453125, "learning_rate": 9.908097429691612e-07, "loss": 0.0014, "reward": 1.7497222423553467, "reward_std": 0.25806403160095215, "rewards/accuracy_reward": 0.6122223138809204, "rewards/format_reward": 1.0, "step": 2010, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 360.15625, "epoch": 0.06115436078335969, "grad_norm": 1.0136508075622583, "kl": 0.037841796875, "learning_rate": 9.908006243120586e-07, "loss": 0.0015, "reward": 1.815248727798462, "reward_std": 0.003950612619519234, "rewards/accuracy_reward": 0.6652487516403198, "rewards/format_reward": 1.0, "step": 2011, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 408.921875, "epoch": 0.06118477070915947, "grad_norm": 2.067966896442725, "kl": 0.042724609375, "learning_rate": 9.907915011753935e-07, "loss": 0.0017, "reward": 1.6213836669921875, "reward_std": 0.16359512507915497, "rewards/accuracy_reward": 0.5620086789131165, "rewards/format_reward": 0.90625, "step": 2012, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 353.625, "epoch": 0.06121518063495925, "grad_norm": 0.8826839892327382, "kl": 0.028564453125, "learning_rate": 9.907823735592497e-07, "loss": 0.0011, "reward": 1.632556676864624, "reward_std": 0.1384935975074768, "rewards/accuracy_reward": 0.523181676864624, "rewards/format_reward": 1.0, "step": 2013, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 346.09375, "epoch": 0.06124559056075903, "grad_norm": 1.2708261860073458, "kl": 0.03857421875, "learning_rate": 9.907732414637102e-07, "loss": 0.0015, "reward": 1.9701991081237793, "reward_std": 0.04589265584945679, "rewards/accuracy_reward": 0.8014490008354187, "rewards/format_reward": 1.0, "step": 2014, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 339.9375, "epoch": 0.06127600048655881, "grad_norm": 0.8143599457597835, "kl": 0.033447265625, "learning_rate": 9.907641048888585e-07, "loss": 0.0013, "reward": 1.96875, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2015, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 356.140625, "epoch": 0.06130641041235859, "grad_norm": 2.451855780717952, "kl": 0.029541015625, "learning_rate": 9.907549638347778e-07, "loss": 0.0012, "reward": 1.9939371347427368, "reward_std": 0.1658138632774353, "rewards/accuracy_reward": 0.8158120512962341, "rewards/format_reward": 1.0, "step": 2016, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 391.09375, "epoch": 0.06133682033815838, "grad_norm": 1.2448856001567048, "kl": 0.03955078125, "learning_rate": 9.907458183015516e-07, "loss": 0.0016, "reward": 1.7002171277999878, "reward_std": 0.14296643435955048, "rewards/accuracy_reward": 0.590842068195343, "rewards/format_reward": 0.984375, "step": 2017, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 364.078125, "epoch": 0.061367230263958156, "grad_norm": 1.534647785388403, "kl": 0.038818359375, "learning_rate": 9.907366682892636e-07, "loss": 0.0015, "reward": 1.750182867050171, "reward_std": 0.15653809905052185, "rewards/accuracy_reward": 0.6095578670501709, "rewards/format_reward": 1.0, "step": 2018, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.90625, "epoch": 0.061397640189757935, "grad_norm": 1.178917090393725, "kl": 0.03173828125, "learning_rate": 9.90727513797997e-07, "loss": 0.0013, "reward": 2.0434062480926514, "reward_std": 0.11369597166776657, "rewards/accuracy_reward": 0.8621562719345093, "rewards/format_reward": 1.0, "step": 2019, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 368.9375, "epoch": 0.06142805011555772, "grad_norm": 1.0157961915253606, "kl": 0.03662109375, "learning_rate": 9.907183548278353e-07, "loss": 0.0015, "reward": 1.7323836088180542, "reward_std": 0.13221773505210876, "rewards/accuracy_reward": 0.6073836088180542, "rewards/format_reward": 1.0, "step": 2020, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 368.09375, "epoch": 0.0614584600413575, "grad_norm": 1.6582395335683926, "kl": 0.03759765625, "learning_rate": 9.907091913788625e-07, "loss": 0.0015, "reward": 1.8728384971618652, "reward_std": 0.12552928924560547, "rewards/accuracy_reward": 0.7322134971618652, "rewards/format_reward": 1.0, "step": 2021, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 393.59375, "epoch": 0.06148886996715728, "grad_norm": 0.7855824877121248, "kl": 0.037109375, "learning_rate": 9.907000234511618e-07, "loss": 0.0015, "reward": 1.8731685876846313, "reward_std": 0.12509721517562866, "rewards/accuracy_reward": 0.7419184446334839, "rewards/format_reward": 0.984375, "step": 2022, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 363.328125, "epoch": 0.06151927989295706, "grad_norm": 34.315817412333004, "kl": 0.03515625, "learning_rate": 9.906908510448174e-07, "loss": 0.0014, "reward": 2.013446092605591, "reward_std": 0.09947584569454193, "rewards/accuracy_reward": 0.8384460210800171, "rewards/format_reward": 1.0, "step": 2023, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 355.078125, "epoch": 0.061549689818756845, "grad_norm": 0.9354044754653187, "kl": 0.0294189453125, "learning_rate": 9.906816741599125e-07, "loss": 0.0012, "reward": 1.9156250953674316, "reward_std": 0.13130834698677063, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2024, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 342.453125, "epoch": 0.061580099744556624, "grad_norm": 0.684202130500259, "kl": 0.031494140625, "learning_rate": 9.906724927965308e-07, "loss": 0.0013, "reward": 1.829469919204712, "reward_std": 0.011335247196257114, "rewards/accuracy_reward": 0.6794699430465698, "rewards/format_reward": 1.0, "step": 2025, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 342.359375, "epoch": 0.0616105096703564, "grad_norm": 0.7968202961327147, "kl": 0.03271484375, "learning_rate": 9.906633069547567e-07, "loss": 0.0013, "reward": 1.8194878101348877, "reward_std": 0.09470692276954651, "rewards/accuracy_reward": 0.685112714767456, "rewards/format_reward": 1.0, "step": 2026, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 356.515625, "epoch": 0.06164091959615618, "grad_norm": 1.0279680363230617, "kl": 0.03759765625, "learning_rate": 9.906541166346736e-07, "loss": 0.0015, "reward": 1.9176373481750488, "reward_std": 0.07220220565795898, "rewards/accuracy_reward": 0.7551373243331909, "rewards/format_reward": 1.0, "step": 2027, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 374.78125, "epoch": 0.06167132952195597, "grad_norm": 1.1702449263467285, "kl": 0.035888671875, "learning_rate": 9.906449218363653e-07, "loss": 0.0014, "reward": 1.6825354099273682, "reward_std": 0.25893667340278625, "rewards/accuracy_reward": 0.5481604337692261, "rewards/format_reward": 1.0, "step": 2028, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 350.03125, "epoch": 0.06170173944775575, "grad_norm": 0.5640948683912339, "kl": 0.031982421875, "learning_rate": 9.906357225599162e-07, "loss": 0.0013, "reward": 1.7125000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 2029, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 350.5625, "epoch": 0.06173214937355553, "grad_norm": 0.7092419109420526, "kl": 0.037841796875, "learning_rate": 9.906265188054097e-07, "loss": 0.0015, "reward": 2.1659364700317383, "reward_std": 0.0021651671268045902, "rewards/accuracy_reward": 0.965936541557312, "rewards/format_reward": 1.0, "step": 2030, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 347.015625, "epoch": 0.06176255929935531, "grad_norm": 0.8401065484932992, "kl": 0.041015625, "learning_rate": 9.906173105729301e-07, "loss": 0.0016, "reward": 2.039583206176758, "reward_std": 0.11958510428667068, "rewards/accuracy_reward": 0.8645833134651184, "rewards/format_reward": 1.0, "step": 2031, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 367.4375, "epoch": 0.06179296922515509, "grad_norm": 1.0132838206894346, "kl": 0.0341796875, "learning_rate": 9.906080978625615e-07, "loss": 0.0014, "reward": 2.149937629699707, "reward_std": 0.004149340093135834, "rewards/accuracy_reward": 0.9499375224113464, "rewards/format_reward": 1.0, "step": 2032, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 381.25, "epoch": 0.06182337915095487, "grad_norm": 1.2705715676491562, "kl": 0.039306640625, "learning_rate": 9.90598880674388e-07, "loss": 0.0016, "reward": 1.6753382682800293, "reward_std": 0.14991800487041473, "rewards/accuracy_reward": 0.5534632205963135, "rewards/format_reward": 1.0, "step": 2033, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 385.75, "epoch": 0.06185378907675465, "grad_norm": 1.075710889191878, "kl": 0.038330078125, "learning_rate": 9.905896590084934e-07, "loss": 0.0015, "reward": 1.6953051090240479, "reward_std": 0.03766397386789322, "rewards/accuracy_reward": 0.5515551567077637, "rewards/format_reward": 1.0, "step": 2034, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 372.453125, "epoch": 0.06188419900255444, "grad_norm": 0.7289916645503076, "kl": 0.045654296875, "learning_rate": 9.905804328649621e-07, "loss": 0.0018, "reward": 1.9435954093933105, "reward_std": 0.008036581799387932, "rewards/accuracy_reward": 0.77484530210495, "rewards/format_reward": 1.0, "step": 2035, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 401.609375, "epoch": 0.061914608928354216, "grad_norm": 0.9882589464269553, "kl": 0.033203125, "learning_rate": 9.905712022438782e-07, "loss": 0.0013, "reward": 1.436063528060913, "reward_std": 0.05676914006471634, "rewards/accuracy_reward": 0.34231358766555786, "rewards/format_reward": 1.0, "step": 2036, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 384.59375, "epoch": 0.061945018854153995, "grad_norm": 0.9299777172077761, "kl": 0.033203125, "learning_rate": 9.905619671453262e-07, "loss": 0.0013, "reward": 1.765625, "reward_std": 0.21148672699928284, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 2037, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 410.578125, "epoch": 0.061975428779953774, "grad_norm": 1.3601296207923972, "kl": 0.0286865234375, "learning_rate": 9.905527275693902e-07, "loss": 0.0011, "reward": 1.7845923900604248, "reward_std": 0.24145135283470154, "rewards/accuracy_reward": 0.6189673542976379, "rewards/format_reward": 1.0, "step": 2038, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 381.25, "epoch": 0.06200583870575356, "grad_norm": 1.1309604043384553, "kl": 0.03369140625, "learning_rate": 9.905434835161545e-07, "loss": 0.0014, "reward": 1.7005425691604614, "reward_std": 0.23524481058120728, "rewards/accuracy_reward": 0.5692925453186035, "rewards/format_reward": 1.0, "step": 2039, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 380.453125, "epoch": 0.06203624863155334, "grad_norm": 0.8838880236497924, "kl": 0.03515625, "learning_rate": 9.905342349857036e-07, "loss": 0.0014, "reward": 1.648337483406067, "reward_std": 0.025409521535038948, "rewards/accuracy_reward": 0.4983374774456024, "rewards/format_reward": 1.0, "step": 2040, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 378.03125, "epoch": 0.06206665855735312, "grad_norm": 0.8097388042890695, "kl": 0.03466796875, "learning_rate": 9.905249819781216e-07, "loss": 0.0014, "reward": 1.9192222356796265, "reward_std": 0.1308070570230484, "rewards/accuracy_reward": 0.7535971403121948, "rewards/format_reward": 1.0, "step": 2041, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 352.84375, "epoch": 0.0620970684831529, "grad_norm": 0.7684804653042219, "kl": 0.032958984375, "learning_rate": 9.905157244934934e-07, "loss": 0.0013, "reward": 2.1422853469848633, "reward_std": 0.07965405285358429, "rewards/accuracy_reward": 0.95166015625, "rewards/format_reward": 1.0, "step": 2042, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 355.515625, "epoch": 0.062127478408952684, "grad_norm": 0.8021499880881202, "kl": 0.035888671875, "learning_rate": 9.905064625319035e-07, "loss": 0.0014, "reward": 1.7461738586425781, "reward_std": 0.13696521520614624, "rewards/accuracy_reward": 0.6086738705635071, "rewards/format_reward": 1.0, "step": 2043, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 345.953125, "epoch": 0.06215788833475246, "grad_norm": 0.7005308999840786, "kl": 0.03173828125, "learning_rate": 9.904971960934359e-07, "loss": 0.0013, "reward": 1.728124976158142, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2044, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 374.765625, "epoch": 0.06218829826055224, "grad_norm": 0.7247497247406496, "kl": 0.03515625, "learning_rate": 9.904879251781755e-07, "loss": 0.0014, "reward": 2.00933837890625, "reward_std": 0.06618594378232956, "rewards/accuracy_reward": 0.8187133073806763, "rewards/format_reward": 1.0, "step": 2045, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 369.046875, "epoch": 0.06221870818635203, "grad_norm": 0.93673334733384, "kl": 0.03466796875, "learning_rate": 9.90478649786207e-07, "loss": 0.0014, "reward": 1.8385248184204102, "reward_std": 0.15765385329723358, "rewards/accuracy_reward": 0.6853996515274048, "rewards/format_reward": 1.0, "step": 2046, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.8125, "epoch": 0.06224911811215181, "grad_norm": 0.928052213753772, "kl": 0.03369140625, "learning_rate": 9.90469369917615e-07, "loss": 0.0013, "reward": 1.9800981283187866, "reward_std": 0.08810873329639435, "rewards/accuracy_reward": 0.7925980687141418, "rewards/format_reward": 1.0, "step": 2047, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 362.921875, "epoch": 0.06227952803795159, "grad_norm": 1.131193053003704, "kl": 0.0303955078125, "learning_rate": 9.90460085572484e-07, "loss": 0.0012, "reward": 1.949488878250122, "reward_std": 0.13756681978702545, "rewards/accuracy_reward": 0.7682388424873352, "rewards/format_reward": 1.0, "step": 2048, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 390.65625, "epoch": 0.062309937963751366, "grad_norm": 1.366775535007421, "kl": 0.03662109375, "learning_rate": 9.90450796750899e-07, "loss": 0.0015, "reward": 1.9264273643493652, "reward_std": 0.0442509651184082, "rewards/accuracy_reward": 0.757677435874939, "rewards/format_reward": 1.0, "step": 2049, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 383.40625, "epoch": 0.06234034788955115, "grad_norm": 0.5207658131457866, "kl": 0.026123046875, "learning_rate": 9.904415034529447e-07, "loss": 0.001, "reward": 1.7875001430511475, "reward_std": 0.155264750123024, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2050, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 366.875, "epoch": 0.06237075781535093, "grad_norm": 1.5279307884748905, "kl": 0.03369140625, "learning_rate": 9.90432205678706e-07, "loss": 0.0014, "reward": 1.8474767208099365, "reward_std": 0.05910363793373108, "rewards/accuracy_reward": 0.6974766850471497, "rewards/format_reward": 1.0, "step": 2051, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.1875, "epoch": 0.06240116774115071, "grad_norm": 0.9275379111156308, "kl": 0.03271484375, "learning_rate": 9.904229034282673e-07, "loss": 0.0013, "reward": 1.78125, "reward_std": 0.1612270474433899, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2052, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 343.546875, "epoch": 0.06243157766695049, "grad_norm": 1.291331794933179, "kl": 0.045654296875, "learning_rate": 9.904135967017143e-07, "loss": 0.0018, "reward": 1.9118304252624512, "reward_std": 0.033199384808540344, "rewards/accuracy_reward": 0.7618303894996643, "rewards/format_reward": 1.0, "step": 2053, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 369.046875, "epoch": 0.062461987592750276, "grad_norm": 0.718810920160123, "kl": 0.0341796875, "learning_rate": 9.904042854991312e-07, "loss": 0.0014, "reward": 1.859375, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2054, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 357.96875, "epoch": 0.062492397518550055, "grad_norm": 2.311793653923513, "kl": 0.037841796875, "learning_rate": 9.903949698206035e-07, "loss": 0.0015, "reward": 1.9187151193618774, "reward_std": 0.07667052745819092, "rewards/accuracy_reward": 0.7530900239944458, "rewards/format_reward": 1.0, "step": 2055, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 356.40625, "epoch": 0.06252280744434983, "grad_norm": 0.759435649454247, "kl": 0.032958984375, "learning_rate": 9.90385649666216e-07, "loss": 0.0013, "reward": 1.8229310512542725, "reward_std": 0.045942895114421844, "rewards/accuracy_reward": 0.6729309558868408, "rewards/format_reward": 1.0, "step": 2056, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 355.640625, "epoch": 0.06255321737014961, "grad_norm": 0.7958682915224414, "kl": 0.033203125, "learning_rate": 9.903763250360538e-07, "loss": 0.0013, "reward": 1.9847657680511475, "reward_std": 0.0980965867638588, "rewards/accuracy_reward": 0.822265625, "rewards/format_reward": 1.0, "step": 2057, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 363.265625, "epoch": 0.06258362729594939, "grad_norm": 0.8520046863322858, "kl": 0.04052734375, "learning_rate": 9.903669959302017e-07, "loss": 0.0016, "reward": 1.6726926565170288, "reward_std": 0.013306278735399246, "rewards/accuracy_reward": 0.550817608833313, "rewards/format_reward": 1.0, "step": 2058, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 357.6875, "epoch": 0.06261403722174919, "grad_norm": 0.7023387105896836, "kl": 0.03515625, "learning_rate": 9.903576623487455e-07, "loss": 0.0014, "reward": 2.033048629760742, "reward_std": 0.010209137573838234, "rewards/accuracy_reward": 0.8330484628677368, "rewards/format_reward": 1.0, "step": 2059, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 356.328125, "epoch": 0.06264444714754897, "grad_norm": 0.5469953797085015, "kl": 0.03369140625, "learning_rate": 9.903483242917698e-07, "loss": 0.0013, "reward": 1.8161206245422363, "reward_std": 0.011126866564154625, "rewards/accuracy_reward": 0.6692456007003784, "rewards/format_reward": 1.0, "step": 2060, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 360.296875, "epoch": 0.06267485707334874, "grad_norm": 1.4401091309350895, "kl": 0.040771484375, "learning_rate": 9.9033898175936e-07, "loss": 0.0016, "reward": 1.7744289636611938, "reward_std": 0.06001918390393257, "rewards/accuracy_reward": 0.6306790113449097, "rewards/format_reward": 1.0, "step": 2061, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 373.421875, "epoch": 0.06270526699914852, "grad_norm": 0.9827097306894552, "kl": 0.037353515625, "learning_rate": 9.903296347516018e-07, "loss": 0.0015, "reward": 1.7161130905151367, "reward_std": 0.11669593304395676, "rewards/accuracy_reward": 0.5879881381988525, "rewards/format_reward": 0.984375, "step": 2062, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 384.015625, "epoch": 0.0627356769249483, "grad_norm": 1.2227015708915272, "kl": 0.042236328125, "learning_rate": 9.903202832685798e-07, "loss": 0.0017, "reward": 1.7291587591171265, "reward_std": 0.13431653380393982, "rewards/accuracy_reward": 0.5947836637496948, "rewards/format_reward": 1.0, "step": 2063, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 362.5625, "epoch": 0.06276608685074808, "grad_norm": 2.1060742388772558, "kl": 0.042236328125, "learning_rate": 9.903109273103798e-07, "loss": 0.0017, "reward": 1.665625810623169, "reward_std": 0.1045437604188919, "rewards/accuracy_reward": 0.5500007271766663, "rewards/format_reward": 1.0, "step": 2064, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 364.3125, "epoch": 0.06279649677654786, "grad_norm": 1.3639985564743895, "kl": 0.0341796875, "learning_rate": 9.90301566877087e-07, "loss": 0.0014, "reward": 1.7351657152175903, "reward_std": 0.14109373092651367, "rewards/accuracy_reward": 0.5757906436920166, "rewards/format_reward": 1.0, "step": 2065, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 346.3125, "epoch": 0.06282690670234764, "grad_norm": 0.06390044476202389, "kl": 0.03173828125, "learning_rate": 9.90292201968787e-07, "loss": 0.0013, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2066, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 344.09375, "epoch": 0.06285731662814743, "grad_norm": 1.342029890330942, "kl": 0.037109375, "learning_rate": 9.90282832585565e-07, "loss": 0.0015, "reward": 2.028125047683716, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2067, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.96875, "epoch": 0.06288772655394721, "grad_norm": 0.9718487235941278, "kl": 0.035400390625, "learning_rate": 9.902734587275068e-07, "loss": 0.0014, "reward": 1.9866228103637695, "reward_std": 0.01950971409678459, "rewards/accuracy_reward": 0.7897477149963379, "rewards/format_reward": 1.0, "step": 2068, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 398.609375, "epoch": 0.06291813647974699, "grad_norm": 2.543297548995273, "kl": 0.03369140625, "learning_rate": 9.902640803946979e-07, "loss": 0.0014, "reward": 1.7768049240112305, "reward_std": 0.10847268998622894, "rewards/accuracy_reward": 0.6174298524856567, "rewards/format_reward": 1.0, "step": 2069, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.90625, "epoch": 0.06294854640554677, "grad_norm": 1.6285946401437221, "kl": 0.0322265625, "learning_rate": 9.90254697587224e-07, "loss": 0.0013, "reward": 1.8467861413955688, "reward_std": 0.1362738013267517, "rewards/accuracy_reward": 0.6686611175537109, "rewards/format_reward": 1.0, "step": 2070, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 355.28125, "epoch": 0.06297895633134655, "grad_norm": 0.6953878849235884, "kl": 0.037109375, "learning_rate": 9.902453103051702e-07, "loss": 0.0015, "reward": 1.8449684381484985, "reward_std": 0.01673966459929943, "rewards/accuracy_reward": 0.6980934143066406, "rewards/format_reward": 1.0, "step": 2071, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 361.703125, "epoch": 0.06300936625714633, "grad_norm": 0.7441769832189555, "kl": 0.035888671875, "learning_rate": 9.90235918548623e-07, "loss": 0.0014, "reward": 1.853124976158142, "reward_std": 0.16566048562526703, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2072, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 393.125, "epoch": 0.06303977618294611, "grad_norm": 0.8136621660333894, "kl": 0.0274658203125, "learning_rate": 9.902265223176673e-07, "loss": 0.0011, "reward": 1.7177050113677979, "reward_std": 0.004691427107900381, "rewards/accuracy_reward": 0.5677049160003662, "rewards/format_reward": 1.0, "step": 2073, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 343.140625, "epoch": 0.0630701861087459, "grad_norm": 0.8260918679094278, "kl": 0.04345703125, "learning_rate": 9.902171216123894e-07, "loss": 0.0017, "reward": 2.1187500953674316, "reward_std": 0.09175113588571548, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 2074, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 349.4375, "epoch": 0.06310059603454568, "grad_norm": 8.673867948774213, "kl": 0.03857421875, "learning_rate": 9.90207716432875e-07, "loss": 0.0015, "reward": 1.8274343013763428, "reward_std": 0.02573998272418976, "rewards/accuracy_reward": 0.6711843013763428, "rewards/format_reward": 1.0, "step": 2075, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 374.765625, "epoch": 0.06313100596034546, "grad_norm": 0.8607301410124202, "kl": 0.0267333984375, "learning_rate": 9.901983067792098e-07, "loss": 0.0011, "reward": 1.5982356071472168, "reward_std": 0.08568847179412842, "rewards/accuracy_reward": 0.48573556542396545, "rewards/format_reward": 1.0, "step": 2076, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 356.703125, "epoch": 0.06316141588614524, "grad_norm": 12.275259101725174, "kl": 0.036865234375, "learning_rate": 9.9018889265148e-07, "loss": 0.0015, "reward": 1.9509810209274292, "reward_std": 0.058495841920375824, "rewards/accuracy_reward": 0.7791059613227844, "rewards/format_reward": 1.0, "step": 2077, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 344.578125, "epoch": 0.06319182581194502, "grad_norm": 1.1744025894552494, "kl": 0.03369140625, "learning_rate": 9.90179474049771e-07, "loss": 0.0014, "reward": 1.7909038066864014, "reward_std": 0.054379355162382126, "rewards/accuracy_reward": 0.640903890132904, "rewards/format_reward": 1.0, "step": 2078, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 390.140625, "epoch": 0.0632222357377448, "grad_norm": 1.7583312403110702, "kl": 0.03125, "learning_rate": 9.901700509741693e-07, "loss": 0.0013, "reward": 1.8378750085830688, "reward_std": 0.12378955632448196, "rewards/accuracy_reward": 0.6878749132156372, "rewards/format_reward": 1.0, "step": 2079, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 345.28125, "epoch": 0.06325264566354458, "grad_norm": 0.6215106477964852, "kl": 0.037841796875, "learning_rate": 9.901606234247604e-07, "loss": 0.0015, "reward": 1.9750001430511475, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2080, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 335.546875, "epoch": 0.06328305558934436, "grad_norm": 0.6009034709044881, "kl": 0.033935546875, "learning_rate": 9.901511914016308e-07, "loss": 0.0014, "reward": 2.1781249046325684, "reward_std": 0.05250425264239311, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2081, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 349.875, "epoch": 0.06331346551514415, "grad_norm": 2.2534356564330698, "kl": 0.035400390625, "learning_rate": 9.901417549048664e-07, "loss": 0.0014, "reward": 1.4485280513763428, "reward_std": 0.07305482029914856, "rewards/accuracy_reward": 0.35790300369262695, "rewards/format_reward": 1.0, "step": 2082, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 344.984375, "epoch": 0.06334387544094393, "grad_norm": 1.4331067924758507, "kl": 0.035400390625, "learning_rate": 9.901323139345531e-07, "loss": 0.0014, "reward": 2.029134511947632, "reward_std": 0.15779872238636017, "rewards/accuracy_reward": 0.8572593927383423, "rewards/format_reward": 1.0, "step": 2083, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.46875, "epoch": 0.06337428536674371, "grad_norm": 2.205659566841763, "kl": 0.03515625, "learning_rate": 9.901228684907775e-07, "loss": 0.0014, "reward": 1.809924602508545, "reward_std": 0.030358213931322098, "rewards/accuracy_reward": 0.6380495429039001, "rewards/format_reward": 1.0, "step": 2084, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 354.609375, "epoch": 0.06340469529254349, "grad_norm": 0.9553728749173171, "kl": 0.0341796875, "learning_rate": 9.901134185736256e-07, "loss": 0.0014, "reward": 2.0278735160827637, "reward_std": 0.057059720158576965, "rewards/accuracy_reward": 0.8278735876083374, "rewards/format_reward": 1.0, "step": 2085, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 365.46875, "epoch": 0.06343510521834327, "grad_norm": 1.3824417312938044, "kl": 0.03466796875, "learning_rate": 9.901039641831836e-07, "loss": 0.0014, "reward": 2.0466792583465576, "reward_std": 0.020899798721075058, "rewards/accuracy_reward": 0.8498042821884155, "rewards/format_reward": 1.0, "step": 2086, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 359.5, "epoch": 0.06346551514414304, "grad_norm": 1.5241662407322047, "kl": 0.03369140625, "learning_rate": 9.900945053195378e-07, "loss": 0.0014, "reward": 1.943068265914917, "reward_std": 0.11704853177070618, "rewards/accuracy_reward": 0.7743180990219116, "rewards/format_reward": 1.0, "step": 2087, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 355.109375, "epoch": 0.06349592506994282, "grad_norm": 0.6135670801153557, "kl": 0.031005859375, "learning_rate": 9.900850419827747e-07, "loss": 0.0012, "reward": 2.1196517944335938, "reward_std": 0.004342858679592609, "rewards/accuracy_reward": 0.9196515679359436, "rewards/format_reward": 1.0, "step": 2088, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 363.171875, "epoch": 0.06352633499574262, "grad_norm": 0.8937525302381855, "kl": 0.03564453125, "learning_rate": 9.900755741729804e-07, "loss": 0.0014, "reward": 2.026134729385376, "reward_std": 0.009340699762105942, "rewards/accuracy_reward": 0.8261346817016602, "rewards/format_reward": 1.0, "step": 2089, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 346.21875, "epoch": 0.0635567449215424, "grad_norm": 1.4483716685126036, "kl": 0.043212890625, "learning_rate": 9.900661018902414e-07, "loss": 0.0017, "reward": 1.7777183055877686, "reward_std": 0.024914588779211044, "rewards/accuracy_reward": 0.6120931506156921, "rewards/format_reward": 1.0, "step": 2090, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 382.859375, "epoch": 0.06358715484734218, "grad_norm": 1.6430604346136035, "kl": 0.03173828125, "learning_rate": 9.900566251346443e-07, "loss": 0.0013, "reward": 1.821995735168457, "reward_std": 0.10360874980688095, "rewards/accuracy_reward": 0.6688706278800964, "rewards/format_reward": 1.0, "step": 2091, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 348.671875, "epoch": 0.06361756477314195, "grad_norm": 0.3690318413230103, "kl": 0.03369140625, "learning_rate": 9.900471439062756e-07, "loss": 0.0013, "reward": 1.8625000715255737, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2092, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 341.65625, "epoch": 0.06364797469894173, "grad_norm": 0.8819279449608347, "kl": 0.037841796875, "learning_rate": 9.900376582052218e-07, "loss": 0.0015, "reward": 1.6831932067871094, "reward_std": 0.01790298894047737, "rewards/accuracy_reward": 0.5613182187080383, "rewards/format_reward": 1.0, "step": 2093, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 394.578125, "epoch": 0.06367838462474151, "grad_norm": 1.3029635730757725, "kl": 0.03466796875, "learning_rate": 9.900281680315693e-07, "loss": 0.0014, "reward": 1.5591199398040771, "reward_std": 0.15607582032680511, "rewards/accuracy_reward": 0.42474496364593506, "rewards/format_reward": 1.0, "step": 2094, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 347.5625, "epoch": 0.06370879455054129, "grad_norm": 0.3845888449514648, "kl": 0.034912109375, "learning_rate": 9.900186733854048e-07, "loss": 0.0014, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2095, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 389.78125, "epoch": 0.06373920447634107, "grad_norm": 1.1474210159127085, "kl": 0.03369140625, "learning_rate": 9.90009174266815e-07, "loss": 0.0013, "reward": 1.6258864402770996, "reward_std": 0.13403303921222687, "rewards/accuracy_reward": 0.49151140451431274, "rewards/format_reward": 1.0, "step": 2096, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 371.328125, "epoch": 0.06376961440214086, "grad_norm": 0.38733837417301736, "kl": 0.04443359375, "learning_rate": 9.899996706758865e-07, "loss": 0.0018, "reward": 1.8476699590682983, "reward_std": 0.021822111681103706, "rewards/accuracy_reward": 0.7039200067520142, "rewards/format_reward": 1.0, "step": 2097, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 358.390625, "epoch": 0.06380002432794064, "grad_norm": 0.6300871223918846, "kl": 0.032470703125, "learning_rate": 9.899901626127063e-07, "loss": 0.0013, "reward": 1.446874976158142, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 2098, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 386.59375, "epoch": 0.06383043425374042, "grad_norm": 1.1097295586857874, "kl": 0.03955078125, "learning_rate": 9.89980650077361e-07, "loss": 0.0016, "reward": 1.9942893981933594, "reward_std": 0.04625324904918671, "rewards/accuracy_reward": 0.8005393743515015, "rewards/format_reward": 1.0, "step": 2099, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 346.953125, "epoch": 0.0638608441795402, "grad_norm": 0.843517437587786, "kl": 0.04052734375, "learning_rate": 9.899711330699374e-07, "loss": 0.0016, "reward": 2.0448660850524902, "reward_std": 0.05009492486715317, "rewards/accuracy_reward": 0.8573660254478455, "rewards/format_reward": 1.0, "step": 2100, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 378.5, "epoch": 0.06389125410533998, "grad_norm": 1.6616304646340512, "kl": 0.0322265625, "learning_rate": 9.899616115905222e-07, "loss": 0.0013, "reward": 1.7805960178375244, "reward_std": 0.061432041227817535, "rewards/accuracy_reward": 0.6493459343910217, "rewards/format_reward": 1.0, "step": 2101, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 360.796875, "epoch": 0.06392166403113976, "grad_norm": 1.179245753967449, "kl": 0.035888671875, "learning_rate": 9.899520856392028e-07, "loss": 0.0014, "reward": 1.8624999523162842, "reward_std": 0.26953840255737305, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2102, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.921875, "epoch": 0.06395207395693954, "grad_norm": 0.8710482028837138, "kl": 0.036376953125, "learning_rate": 9.899425552160658e-07, "loss": 0.0015, "reward": 2.018749952316284, "reward_std": 0.16463366150856018, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "step": 2103, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 351.765625, "epoch": 0.06398248388273933, "grad_norm": 0.7966493759385025, "kl": 0.04345703125, "learning_rate": 9.89933020321198e-07, "loss": 0.0017, "reward": 1.963721513748169, "reward_std": 0.010661628097295761, "rewards/accuracy_reward": 0.78872150182724, "rewards/format_reward": 1.0, "step": 2104, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 345.625, "epoch": 0.06401289380853911, "grad_norm": 0.9052513817437199, "kl": 0.0341796875, "learning_rate": 9.89923480954687e-07, "loss": 0.0014, "reward": 1.9406579732894897, "reward_std": 0.030457256361842155, "rewards/accuracy_reward": 0.7750329375267029, "rewards/format_reward": 1.0, "step": 2105, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 363.0625, "epoch": 0.06404330373433889, "grad_norm": 1.0707722038346783, "kl": 0.03271484375, "learning_rate": 9.899139371166192e-07, "loss": 0.0013, "reward": 1.9431431293487549, "reward_std": 0.07140569388866425, "rewards/accuracy_reward": 0.7743930220603943, "rewards/format_reward": 1.0, "step": 2106, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 371.25, "epoch": 0.06407371366013867, "grad_norm": 2.1538361115591074, "kl": 0.0390625, "learning_rate": 9.89904388807082e-07, "loss": 0.0016, "reward": 1.637770175933838, "reward_std": 0.15319609642028809, "rewards/accuracy_reward": 0.5252702236175537, "rewards/format_reward": 1.0, "step": 2107, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 391.59375, "epoch": 0.06410412358593845, "grad_norm": 1.481038484699785, "kl": 0.041748046875, "learning_rate": 9.898948360261629e-07, "loss": 0.0017, "reward": 1.7597084045410156, "reward_std": 0.14945609867572784, "rewards/accuracy_reward": 0.5972083806991577, "rewards/format_reward": 1.0, "step": 2108, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 375.859375, "epoch": 0.06413453351173823, "grad_norm": 2.2386871613419284, "kl": 0.041015625, "learning_rate": 9.898852787739486e-07, "loss": 0.0016, "reward": 1.6676735877990723, "reward_std": 0.16083982586860657, "rewards/accuracy_reward": 0.5551736354827881, "rewards/format_reward": 1.0, "step": 2109, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 354.59375, "epoch": 0.06416494343753801, "grad_norm": 0.6016372562348676, "kl": 0.032958984375, "learning_rate": 9.898757170505265e-07, "loss": 0.0013, "reward": 1.747809886932373, "reward_std": 0.004055412020534277, "rewards/accuracy_reward": 0.6228098273277283, "rewards/format_reward": 1.0, "step": 2110, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 349.625, "epoch": 0.06419535336333779, "grad_norm": 0.49644899713280927, "kl": 0.035888671875, "learning_rate": 9.898661508559838e-07, "loss": 0.0014, "reward": 1.9078915119171143, "reward_std": 0.00012754539784509689, "rewards/accuracy_reward": 0.7578914165496826, "rewards/format_reward": 1.0, "step": 2111, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 347.359375, "epoch": 0.06422576328913758, "grad_norm": 1.0031867975779312, "kl": 0.03662109375, "learning_rate": 9.89856580190408e-07, "loss": 0.0015, "reward": 1.8902798891067505, "reward_std": 0.07828083634376526, "rewards/accuracy_reward": 0.7340297698974609, "rewards/format_reward": 1.0, "step": 2112, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 374.109375, "epoch": 0.06425617321493736, "grad_norm": 4.923614028240135, "kl": 0.041015625, "learning_rate": 9.89847005053886e-07, "loss": 0.0016, "reward": 1.8250000476837158, "reward_std": 0.15982910990715027, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2113, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 334.203125, "epoch": 0.06428658314073714, "grad_norm": 1.8741822864756679, "kl": 0.035400390625, "learning_rate": 9.89837425446506e-07, "loss": 0.0014, "reward": 1.44376802444458, "reward_std": 0.1725069284439087, "rewards/accuracy_reward": 0.36876797676086426, "rewards/format_reward": 1.0, "step": 2114, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 338.421875, "epoch": 0.06431699306653692, "grad_norm": 1.6241743383818346, "kl": 0.033447265625, "learning_rate": 9.898278413683546e-07, "loss": 0.0013, "reward": 1.7639122009277344, "reward_std": 0.09392809867858887, "rewards/accuracy_reward": 0.6357871294021606, "rewards/format_reward": 1.0, "step": 2115, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 355.234375, "epoch": 0.0643474029923367, "grad_norm": 0.9209420410759019, "kl": 0.0380859375, "learning_rate": 9.898182528195198e-07, "loss": 0.0015, "reward": 2.0031251907348633, "reward_std": 0.11362677067518234, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2116, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 383.5625, "epoch": 0.06437781291813648, "grad_norm": 1.16749473761527, "kl": 0.033203125, "learning_rate": 9.89808659800089e-07, "loss": 0.0013, "reward": 1.8255112171173096, "reward_std": 0.14966201782226562, "rewards/accuracy_reward": 0.6473861932754517, "rewards/format_reward": 1.0, "step": 2117, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 333.34375, "epoch": 0.06440822284393626, "grad_norm": 1.1352405885948824, "kl": 0.04296875, "learning_rate": 9.897990623101496e-07, "loss": 0.0017, "reward": 2.0041370391845703, "reward_std": 0.08043959736824036, "rewards/accuracy_reward": 0.8322620391845703, "rewards/format_reward": 1.0, "step": 2118, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 365.6875, "epoch": 0.06443863276973605, "grad_norm": 1.0439086736504721, "kl": 0.031494140625, "learning_rate": 9.897894603497892e-07, "loss": 0.0013, "reward": 1.7449532747268677, "reward_std": 0.09416370838880539, "rewards/accuracy_reward": 0.6230782270431519, "rewards/format_reward": 1.0, "step": 2119, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 359.734375, "epoch": 0.06446904269553583, "grad_norm": 0.9559940599663257, "kl": 0.032470703125, "learning_rate": 9.897798539190957e-07, "loss": 0.0013, "reward": 1.9458057880401611, "reward_std": 0.015595316886901855, "rewards/accuracy_reward": 0.7739309072494507, "rewards/format_reward": 1.0, "step": 2120, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 379.625, "epoch": 0.0644994526213356, "grad_norm": 1.3634067621699748, "kl": 0.041259765625, "learning_rate": 9.897702430181566e-07, "loss": 0.0017, "reward": 1.7735090255737305, "reward_std": 0.13696910440921783, "rewards/accuracy_reward": 0.6297589540481567, "rewards/format_reward": 1.0, "step": 2121, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 351.859375, "epoch": 0.06452986254713539, "grad_norm": 0.9028272330045392, "kl": 0.03076171875, "learning_rate": 9.897606276470593e-07, "loss": 0.0012, "reward": 1.8594133853912354, "reward_std": 0.08496332168579102, "rewards/accuracy_reward": 0.7062882781028748, "rewards/format_reward": 1.0, "step": 2122, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 358.84375, "epoch": 0.06456027247293517, "grad_norm": 3.2060297209328836, "kl": 0.03369140625, "learning_rate": 9.897510078058922e-07, "loss": 0.0013, "reward": 1.8825440406799316, "reward_std": 0.08603426069021225, "rewards/accuracy_reward": 0.7106691002845764, "rewards/format_reward": 1.0, "step": 2123, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 331.484375, "epoch": 0.06459068239873494, "grad_norm": 1.8349227523760465, "kl": 0.049072265625, "learning_rate": 9.897413834947427e-07, "loss": 0.002, "reward": 1.9387884140014648, "reward_std": 0.2935318350791931, "rewards/accuracy_reward": 0.7887883186340332, "rewards/format_reward": 1.0, "step": 2124, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 367.578125, "epoch": 0.06462109232453472, "grad_norm": 1.3528003309215921, "kl": 0.04443359375, "learning_rate": 9.897317547136988e-07, "loss": 0.0018, "reward": 1.8656044006347656, "reward_std": 0.14573940634727478, "rewards/accuracy_reward": 0.7031044363975525, "rewards/format_reward": 1.0, "step": 2125, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 379.125, "epoch": 0.0646515022503345, "grad_norm": 0.8343527303239737, "kl": 0.044189453125, "learning_rate": 9.897221214628484e-07, "loss": 0.0018, "reward": 1.6528196334838867, "reward_std": 0.10057816654443741, "rewards/accuracy_reward": 0.5278196334838867, "rewards/format_reward": 1.0, "step": 2126, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.9375, "epoch": 0.0646819121761343, "grad_norm": 9.679084889740105, "kl": 0.04443359375, "learning_rate": 9.897124837422792e-07, "loss": 0.0018, "reward": 1.7644470930099487, "reward_std": 0.16008460521697998, "rewards/accuracy_reward": 0.6394469738006592, "rewards/format_reward": 1.0, "step": 2127, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 374.8125, "epoch": 0.06471232210193407, "grad_norm": 0.6768542554282132, "kl": 0.04443359375, "learning_rate": 9.897028415520792e-07, "loss": 0.0018, "reward": 1.7843749523162842, "reward_std": 0.25249195098876953, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2128, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 376.859375, "epoch": 0.06474273202773385, "grad_norm": 2.4895048492459355, "kl": 0.046630859375, "learning_rate": 9.896931948923366e-07, "loss": 0.0019, "reward": 1.5706278085708618, "reward_std": 0.14404921233654022, "rewards/accuracy_reward": 0.43625274300575256, "rewards/format_reward": 1.0, "step": 2129, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 363.125, "epoch": 0.06477314195353363, "grad_norm": 1.8097071372255122, "kl": 0.042236328125, "learning_rate": 9.896835437631394e-07, "loss": 0.0017, "reward": 2.0391857624053955, "reward_std": 0.08121491223573685, "rewards/accuracy_reward": 0.8485608100891113, "rewards/format_reward": 1.0, "step": 2130, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 343.5, "epoch": 0.06480355187933341, "grad_norm": 0.5344462016106117, "kl": 0.042236328125, "learning_rate": 9.896738881645757e-07, "loss": 0.0017, "reward": 1.959800362586975, "reward_std": 0.008134474977850914, "rewards/accuracy_reward": 0.7848003506660461, "rewards/format_reward": 1.0, "step": 2131, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 352.421875, "epoch": 0.06483396180513319, "grad_norm": 1.9622838639713318, "kl": 0.05029296875, "learning_rate": 9.896642280967334e-07, "loss": 0.002, "reward": 1.695159912109375, "reward_std": 0.12434754520654678, "rewards/accuracy_reward": 0.5451599359512329, "rewards/format_reward": 1.0, "step": 2132, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 390.859375, "epoch": 0.06486437173093297, "grad_norm": 1.1807476420805225, "kl": 0.048095703125, "learning_rate": 9.896545635597008e-07, "loss": 0.0019, "reward": 1.6558191776275635, "reward_std": 0.12571552395820618, "rewards/accuracy_reward": 0.527694046497345, "rewards/format_reward": 1.0, "step": 2133, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 389.859375, "epoch": 0.06489478165673276, "grad_norm": 2.2794949432996567, "kl": 0.0439453125, "learning_rate": 9.896448945535666e-07, "loss": 0.0018, "reward": 1.6754416227340698, "reward_std": 0.07964123785495758, "rewards/accuracy_reward": 0.48481661081314087, "rewards/format_reward": 1.0, "step": 2134, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 350.546875, "epoch": 0.06492519158253254, "grad_norm": 0.39648808946114733, "kl": 0.0537109375, "learning_rate": 9.89635221078418e-07, "loss": 0.0022, "reward": 1.915624976158142, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2135, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 347.984375, "epoch": 0.06495560150833232, "grad_norm": 0.6958096708660882, "kl": 0.040283203125, "learning_rate": 9.896255431343441e-07, "loss": 0.0016, "reward": 2.0572917461395264, "reward_std": 0.03830162435770035, "rewards/accuracy_reward": 0.8854166865348816, "rewards/format_reward": 1.0, "step": 2136, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 353.234375, "epoch": 0.0649860114341321, "grad_norm": 1.3734723441313546, "kl": 0.04931640625, "learning_rate": 9.896158607214332e-07, "loss": 0.002, "reward": 1.982656717300415, "reward_std": 0.1639942079782486, "rewards/accuracy_reward": 0.8045315146446228, "rewards/format_reward": 1.0, "step": 2137, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 361.0, "epoch": 0.06501642135993188, "grad_norm": 1.0542050232427072, "kl": 0.046630859375, "learning_rate": 9.89606173839773e-07, "loss": 0.0019, "reward": 1.6927616596221924, "reward_std": 0.04832878336310387, "rewards/accuracy_reward": 0.5521366596221924, "rewards/format_reward": 1.0, "step": 2138, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 365.703125, "epoch": 0.06504683128573166, "grad_norm": 2.208117656458588, "kl": 0.04345703125, "learning_rate": 9.895964824894529e-07, "loss": 0.0017, "reward": 1.6826171875, "reward_std": 0.1700640767812729, "rewards/accuracy_reward": 0.5732421875, "rewards/format_reward": 1.0, "step": 2139, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 347.71875, "epoch": 0.06507724121153144, "grad_norm": 1.884391376198748, "kl": 0.047607421875, "learning_rate": 9.895867866705608e-07, "loss": 0.0019, "reward": 2.0107929706573486, "reward_std": 0.06136559322476387, "rewards/accuracy_reward": 0.845167875289917, "rewards/format_reward": 1.0, "step": 2140, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 345.546875, "epoch": 0.06510765113733122, "grad_norm": 0.5364680597865498, "kl": 0.0537109375, "learning_rate": 9.895770863831851e-07, "loss": 0.0021, "reward": 1.8968751430511475, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2141, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 372.046875, "epoch": 0.06513806106313101, "grad_norm": 1.8535781240385485, "kl": 0.047119140625, "learning_rate": 9.895673816274144e-07, "loss": 0.0019, "reward": 1.7141551971435547, "reward_std": 0.06155017763376236, "rewards/accuracy_reward": 0.5922801494598389, "rewards/format_reward": 1.0, "step": 2142, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 362.015625, "epoch": 0.06516847098893079, "grad_norm": 8.912827278175117, "kl": 0.040283203125, "learning_rate": 9.895576724033376e-07, "loss": 0.0016, "reward": 1.8562500476837158, "reward_std": 0.08100926131010056, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2143, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 388.375, "epoch": 0.06519888091473057, "grad_norm": 1.0740971487586592, "kl": 0.043701171875, "learning_rate": 9.89547958711043e-07, "loss": 0.0017, "reward": 1.686990737915039, "reward_std": 0.0771588385105133, "rewards/accuracy_reward": 0.568240761756897, "rewards/format_reward": 1.0, "step": 2144, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 385.015625, "epoch": 0.06522929084053035, "grad_norm": 0.9200854899031182, "kl": 0.042236328125, "learning_rate": 9.89538240550619e-07, "loss": 0.0017, "reward": 1.6555705070495605, "reward_std": 0.11085636913776398, "rewards/accuracy_reward": 0.5493203997612, "rewards/format_reward": 0.984375, "step": 2145, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 368.25, "epoch": 0.06525970076633013, "grad_norm": 0.7352006757712228, "kl": 0.033203125, "learning_rate": 9.89528517922155e-07, "loss": 0.0013, "reward": 1.743749976158142, "reward_std": 0.28969788551330566, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2146, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 368.8125, "epoch": 0.06529011069212991, "grad_norm": 0.45823322028731206, "kl": 0.04736328125, "learning_rate": 9.895187908257394e-07, "loss": 0.0019, "reward": 1.6906250715255737, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 2147, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 342.84375, "epoch": 0.06532052061792969, "grad_norm": 0.7338883775498909, "kl": 0.042724609375, "learning_rate": 9.895090592614608e-07, "loss": 0.0017, "reward": 1.880029559135437, "reward_std": 0.0859389528632164, "rewards/accuracy_reward": 0.7237794995307922, "rewards/format_reward": 1.0, "step": 2148, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 363.640625, "epoch": 0.06535093054372948, "grad_norm": 1.4019951462933893, "kl": 0.036865234375, "learning_rate": 9.89499323229408e-07, "loss": 0.0015, "reward": 1.7609286308288574, "reward_std": 0.37728893756866455, "rewards/accuracy_reward": 0.6203036308288574, "rewards/format_reward": 1.0, "step": 2149, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 336.4375, "epoch": 0.06538134046952926, "grad_norm": 0.06367687563436776, "kl": 0.03515625, "learning_rate": 9.894895827296703e-07, "loss": 0.0014, "reward": 1.600000023841858, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2150, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 351.859375, "epoch": 0.06541175039532904, "grad_norm": 0.9252965806375493, "kl": 0.037109375, "learning_rate": 9.89479837762336e-07, "loss": 0.0015, "reward": 1.7753630876541138, "reward_std": 0.12900525331497192, "rewards/accuracy_reward": 0.6441130638122559, "rewards/format_reward": 1.0, "step": 2151, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 358.484375, "epoch": 0.06544216032112882, "grad_norm": 1.1239139810826408, "kl": 0.04052734375, "learning_rate": 9.894700883274948e-07, "loss": 0.0016, "reward": 1.5656399726867676, "reward_std": 0.09243050217628479, "rewards/accuracy_reward": 0.4562648832798004, "rewards/format_reward": 1.0, "step": 2152, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 430.609375, "epoch": 0.0654725702469286, "grad_norm": 0.5322576485153732, "kl": 0.027099609375, "learning_rate": 9.89460334425235e-07, "loss": 0.0011, "reward": 1.8913309574127197, "reward_std": 0.2120649814605713, "rewards/accuracy_reward": 0.8007059097290039, "rewards/format_reward": 0.9375, "step": 2153, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 376.375, "epoch": 0.06550298017272838, "grad_norm": 2.0366863215589825, "kl": 0.043212890625, "learning_rate": 9.89450576055646e-07, "loss": 0.0017, "reward": 1.8033990859985352, "reward_std": 0.17502304911613464, "rewards/accuracy_reward": 0.6502741575241089, "rewards/format_reward": 1.0, "step": 2154, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 389.859375, "epoch": 0.06553339009852815, "grad_norm": 1.1680208213511205, "kl": 0.03125, "learning_rate": 9.894408132188167e-07, "loss": 0.0013, "reward": 1.9641649723052979, "reward_std": 0.17212986946105957, "rewards/accuracy_reward": 0.798539936542511, "rewards/format_reward": 1.0, "step": 2155, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 335.453125, "epoch": 0.06556380002432793, "grad_norm": 1.5757252253835234, "kl": 0.044677734375, "learning_rate": 9.894310459148363e-07, "loss": 0.0018, "reward": 1.915624976158142, "reward_std": 0.13130834698677063, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2156, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 356.5, "epoch": 0.06559420995012773, "grad_norm": 1.092003969800466, "kl": 0.03125, "learning_rate": 9.894212741437937e-07, "loss": 0.0012, "reward": 1.9278312921524048, "reward_std": 0.07658499479293823, "rewards/accuracy_reward": 0.7747061848640442, "rewards/format_reward": 1.0, "step": 2157, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 366.40625, "epoch": 0.0656246198759275, "grad_norm": 0.6802086568691329, "kl": 0.037353515625, "learning_rate": 9.894114979057786e-07, "loss": 0.0015, "reward": 1.9343750476837158, "reward_std": 0.1467086374759674, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 2158, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 342.78125, "epoch": 0.06565502980172729, "grad_norm": 1.2788573230426565, "kl": 0.035400390625, "learning_rate": 9.894017172008798e-07, "loss": 0.0014, "reward": 1.8436254262924194, "reward_std": 0.24868270754814148, "rewards/accuracy_reward": 0.6873753666877747, "rewards/format_reward": 1.0, "step": 2159, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.671875, "epoch": 0.06568543972752706, "grad_norm": 2.2527442517709186, "kl": 0.03955078125, "learning_rate": 9.893919320291864e-07, "loss": 0.0016, "reward": 1.9951844215393066, "reward_std": 0.03023550845682621, "rewards/accuracy_reward": 0.8170594573020935, "rewards/format_reward": 1.0, "step": 2160, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.390625, "epoch": 0.06571584965332684, "grad_norm": 1.077928703248801, "kl": 0.040283203125, "learning_rate": 9.893821423907884e-07, "loss": 0.0016, "reward": 2.0173511505126953, "reward_std": 0.07441163063049316, "rewards/accuracy_reward": 0.8298512697219849, "rewards/format_reward": 1.0, "step": 2161, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 361.5, "epoch": 0.06574625957912662, "grad_norm": 1.1783431411999512, "kl": 0.04345703125, "learning_rate": 9.893723482857746e-07, "loss": 0.0017, "reward": 1.921875, "reward_std": 0.11572164297103882, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2162, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 351.46875, "epoch": 0.0657766695049264, "grad_norm": 3.157449754843279, "kl": 0.046142578125, "learning_rate": 9.893625497142345e-07, "loss": 0.0018, "reward": 1.8677895069122314, "reward_std": 0.0447576679289341, "rewards/accuracy_reward": 0.7052894830703735, "rewards/format_reward": 1.0, "step": 2163, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 343.28125, "epoch": 0.0658070794307262, "grad_norm": 1.0470054634982928, "kl": 0.037841796875, "learning_rate": 9.893527466762575e-07, "loss": 0.0015, "reward": 1.8531250953674316, "reward_std": 0.13950422406196594, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2164, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 379.734375, "epoch": 0.06583748935652597, "grad_norm": 1.2186945096011434, "kl": 0.03857421875, "learning_rate": 9.89342939171933e-07, "loss": 0.0015, "reward": 1.7369017601013184, "reward_std": 0.04994848370552063, "rewards/accuracy_reward": 0.5994017720222473, "rewards/format_reward": 1.0, "step": 2165, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.578125, "epoch": 0.06586789928232575, "grad_norm": 0.5178990465717023, "kl": 0.0250244140625, "learning_rate": 9.89333127201351e-07, "loss": 0.001, "reward": 1.9000000953674316, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2166, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.234375, "epoch": 0.06589830920812553, "grad_norm": 0.9568610125398591, "kl": 0.044921875, "learning_rate": 9.893233107646006e-07, "loss": 0.0018, "reward": 1.8965741395950317, "reward_std": 0.11531075835227966, "rewards/accuracy_reward": 0.740324079990387, "rewards/format_reward": 1.0, "step": 2167, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 345.828125, "epoch": 0.06592871913392531, "grad_norm": 1.0975191212109643, "kl": 0.048095703125, "learning_rate": 9.893134898617714e-07, "loss": 0.0019, "reward": 2.094196319580078, "reward_std": 0.08892960101366043, "rewards/accuracy_reward": 0.9098214507102966, "rewards/format_reward": 1.0, "step": 2168, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 339.328125, "epoch": 0.06595912905972509, "grad_norm": 1.4292095285075697, "kl": 0.03759765625, "learning_rate": 9.893036644929534e-07, "loss": 0.0015, "reward": 1.9753940105438232, "reward_std": 0.15661275386810303, "rewards/accuracy_reward": 0.8160189390182495, "rewards/format_reward": 1.0, "step": 2169, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 355.984375, "epoch": 0.06598953898552487, "grad_norm": 1.1597201143826492, "kl": 0.034423828125, "learning_rate": 9.892938346582357e-07, "loss": 0.0014, "reward": 2.027740001678467, "reward_std": 0.10970516502857208, "rewards/accuracy_reward": 0.8433650135993958, "rewards/format_reward": 1.0, "step": 2170, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 381.0, "epoch": 0.06601994891132465, "grad_norm": 0.8697326730523995, "kl": 0.041015625, "learning_rate": 9.892840003577083e-07, "loss": 0.0016, "reward": 1.9620141983032227, "reward_std": 0.03178410977125168, "rewards/accuracy_reward": 0.7745140790939331, "rewards/format_reward": 1.0, "step": 2171, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 376.09375, "epoch": 0.06605035883712444, "grad_norm": 0.8429770118738523, "kl": 0.044189453125, "learning_rate": 9.892741615914613e-07, "loss": 0.0018, "reward": 1.7590655088424683, "reward_std": 0.06664355099201202, "rewards/accuracy_reward": 0.6184403896331787, "rewards/format_reward": 1.0, "step": 2172, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 357.28125, "epoch": 0.06608076876292422, "grad_norm": 1.308018558209081, "kl": 0.04443359375, "learning_rate": 9.89264318359584e-07, "loss": 0.0018, "reward": 1.8214643001556396, "reward_std": 0.16325393319129944, "rewards/accuracy_reward": 0.6714643836021423, "rewards/format_reward": 1.0, "step": 2173, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 366.59375, "epoch": 0.066111178688724, "grad_norm": 1.9116182256325975, "kl": 0.03955078125, "learning_rate": 9.892544706621665e-07, "loss": 0.0016, "reward": 1.6888889074325562, "reward_std": 0.15620636940002441, "rewards/accuracy_reward": 0.5607638955116272, "rewards/format_reward": 1.0, "step": 2174, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 366.09375, "epoch": 0.06614158861452378, "grad_norm": 0.9579836230289398, "kl": 0.041015625, "learning_rate": 9.892446184992987e-07, "loss": 0.0016, "reward": 1.55078125, "reward_std": 0.20485760271549225, "rewards/accuracy_reward": 0.45703125, "rewards/format_reward": 1.0, "step": 2175, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 362.96875, "epoch": 0.06617199854032356, "grad_norm": 0.9432513111254556, "kl": 0.03857421875, "learning_rate": 9.8923476187107e-07, "loss": 0.0015, "reward": 1.9913735389709473, "reward_std": 0.07396546006202698, "rewards/accuracy_reward": 0.7976235151290894, "rewards/format_reward": 1.0, "step": 2176, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 345.40625, "epoch": 0.06620240846612334, "grad_norm": 1.5921816086785212, "kl": 0.032470703125, "learning_rate": 9.892249007775713e-07, "loss": 0.0013, "reward": 1.6612179279327393, "reward_std": 0.1332113891839981, "rewards/accuracy_reward": 0.5518429279327393, "rewards/format_reward": 1.0, "step": 2177, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 422.984375, "epoch": 0.06623281839192312, "grad_norm": 1.7138908857733282, "kl": 0.0301513671875, "learning_rate": 9.89215035218892e-07, "loss": 0.0012, "reward": 1.5531978607177734, "reward_std": 0.2504054009914398, "rewards/accuracy_reward": 0.4344477653503418, "rewards/format_reward": 1.0, "step": 2178, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 350.234375, "epoch": 0.06626322831772291, "grad_norm": 7.959994559815919, "kl": 0.034423828125, "learning_rate": 9.89205165195122e-07, "loss": 0.0014, "reward": 1.8781249523162842, "reward_std": 0.15590772032737732, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2179, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 368.78125, "epoch": 0.06629363824352269, "grad_norm": 0.9551556346840174, "kl": 0.02978515625, "learning_rate": 9.891952907063518e-07, "loss": 0.0012, "reward": 1.8961539268493652, "reward_std": 0.10754971206188202, "rewards/accuracy_reward": 0.7524038553237915, "rewards/format_reward": 1.0, "step": 2180, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 358.671875, "epoch": 0.06632404816932247, "grad_norm": 1.2049701553116061, "kl": 0.03173828125, "learning_rate": 9.891854117526713e-07, "loss": 0.0013, "reward": 1.6755940914154053, "reward_std": 0.09196175634860992, "rewards/accuracy_reward": 0.5599690079689026, "rewards/format_reward": 1.0, "step": 2181, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 341.125, "epoch": 0.06635445809512225, "grad_norm": 0.5221770240749765, "kl": 0.0361328125, "learning_rate": 9.891755283341707e-07, "loss": 0.0014, "reward": 1.8968749046325684, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2182, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 397.859375, "epoch": 0.06638486802092203, "grad_norm": 0.5736903572413794, "kl": 0.033203125, "learning_rate": 9.891656404509403e-07, "loss": 0.0013, "reward": 1.8772469758987427, "reward_std": 0.0691828578710556, "rewards/accuracy_reward": 0.780371904373169, "rewards/format_reward": 0.921875, "step": 2183, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 347.203125, "epoch": 0.0664152779467218, "grad_norm": 1.5212429188427288, "kl": 0.037841796875, "learning_rate": 9.891557481030703e-07, "loss": 0.0015, "reward": 1.7114999294281006, "reward_std": 0.024166982620954514, "rewards/accuracy_reward": 0.570874810218811, "rewards/format_reward": 1.0, "step": 2184, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 375.625, "epoch": 0.06644568787252159, "grad_norm": 1.0463700484591476, "kl": 0.037109375, "learning_rate": 9.891458512906509e-07, "loss": 0.0015, "reward": 1.8772956132888794, "reward_std": 0.10785648226737976, "rewards/accuracy_reward": 0.7241705656051636, "rewards/format_reward": 1.0, "step": 2185, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 371.4375, "epoch": 0.06647609779832137, "grad_norm": 5.969664624863523, "kl": 0.04345703125, "learning_rate": 9.891359500137723e-07, "loss": 0.0017, "reward": 1.7193341255187988, "reward_std": 0.0828140452504158, "rewards/accuracy_reward": 0.5880841016769409, "rewards/format_reward": 1.0, "step": 2186, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 352.59375, "epoch": 0.06650650772412116, "grad_norm": 0.9211553063389013, "kl": 0.03271484375, "learning_rate": 9.891260442725252e-07, "loss": 0.0013, "reward": 1.9913378953933716, "reward_std": 0.054313041269779205, "rewards/accuracy_reward": 0.8163378834724426, "rewards/format_reward": 1.0, "step": 2187, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 359.734375, "epoch": 0.06653691764992094, "grad_norm": 1.233020580684019, "kl": 0.043212890625, "learning_rate": 9.89116134067e-07, "loss": 0.0017, "reward": 1.7299096584320068, "reward_std": 0.1557638943195343, "rewards/accuracy_reward": 0.5861595273017883, "rewards/format_reward": 1.0, "step": 2188, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 353.296875, "epoch": 0.06656732757572072, "grad_norm": 0.8674757407568687, "kl": 0.046630859375, "learning_rate": 9.891062193972868e-07, "loss": 0.0019, "reward": 1.7094061374664307, "reward_std": 0.01996174082159996, "rewards/accuracy_reward": 0.575031042098999, "rewards/format_reward": 1.0, "step": 2189, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.296875, "epoch": 0.0665977375015205, "grad_norm": 1.006863413214628, "kl": 0.032958984375, "learning_rate": 9.890963002634765e-07, "loss": 0.0013, "reward": 1.8406250476837158, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2190, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 348.453125, "epoch": 0.06662814742732028, "grad_norm": 0.7694364367798958, "kl": 0.034423828125, "learning_rate": 9.890863766656594e-07, "loss": 0.0014, "reward": 1.8406250476837158, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2191, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 385.03125, "epoch": 0.06665855735312005, "grad_norm": 0.8216668764442004, "kl": 0.036865234375, "learning_rate": 9.890764486039261e-07, "loss": 0.0015, "reward": 1.9977678060531616, "reward_std": 0.1139097586274147, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.984375, "step": 2192, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 351.65625, "epoch": 0.06668896727891983, "grad_norm": 1.37466984034403, "kl": 0.039306640625, "learning_rate": 9.890665160783671e-07, "loss": 0.0016, "reward": 1.9375, "reward_std": 0.09785604476928711, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 2193, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 377.875, "epoch": 0.06671937720471963, "grad_norm": 1.2030926972845888, "kl": 0.04150390625, "learning_rate": 9.890565790890733e-07, "loss": 0.0017, "reward": 2.0089097023010254, "reward_std": 0.04396862909197807, "rewards/accuracy_reward": 0.8307846188545227, "rewards/format_reward": 1.0, "step": 2194, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 336.3125, "epoch": 0.0667497871305194, "grad_norm": 0.8532780041171667, "kl": 0.046875, "learning_rate": 9.890466376361351e-07, "loss": 0.0019, "reward": 2.1932692527770996, "reward_std": 0.015992917120456696, "rewards/accuracy_reward": 0.9963942170143127, "rewards/format_reward": 1.0, "step": 2195, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 358.515625, "epoch": 0.06678019705631918, "grad_norm": 1.2342594756815422, "kl": 0.034912109375, "learning_rate": 9.890366917196436e-07, "loss": 0.0014, "reward": 2.0475234985351562, "reward_std": 0.14059799909591675, "rewards/accuracy_reward": 0.8631485104560852, "rewards/format_reward": 1.0, "step": 2196, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 335.703125, "epoch": 0.06681060698211896, "grad_norm": 2.8716440725062133, "kl": 0.03759765625, "learning_rate": 9.890267413396893e-07, "loss": 0.0015, "reward": 1.8114757537841797, "reward_std": 0.01881350390613079, "rewards/accuracy_reward": 0.6708507537841797, "rewards/format_reward": 1.0, "step": 2197, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 339.734375, "epoch": 0.06684101690791874, "grad_norm": 0.7395256291530461, "kl": 0.045166015625, "learning_rate": 9.890167864963633e-07, "loss": 0.0018, "reward": 2.153125047683716, "reward_std": 0.08237524330615997, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2198, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 357.015625, "epoch": 0.06687142683371852, "grad_norm": 1.0732227696801029, "kl": 0.044921875, "learning_rate": 9.89006827189756e-07, "loss": 0.0018, "reward": 1.86215341091156, "reward_std": 0.07089664041996002, "rewards/accuracy_reward": 0.7090283632278442, "rewards/format_reward": 1.0, "step": 2199, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 344.578125, "epoch": 0.0669018367595183, "grad_norm": 2.965122436784346, "kl": 0.03271484375, "learning_rate": 9.889968634199586e-07, "loss": 0.0013, "reward": 1.8812501430511475, "reward_std": 0.22724726796150208, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2200, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 353.671875, "epoch": 0.0669322466853181, "grad_norm": 1.2118531159484083, "kl": 0.04736328125, "learning_rate": 9.889868951870621e-07, "loss": 0.0019, "reward": 1.8041009902954102, "reward_std": 0.12547039985656738, "rewards/accuracy_reward": 0.6478509902954102, "rewards/format_reward": 1.0, "step": 2201, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 369.21875, "epoch": 0.06696265661111787, "grad_norm": 0.9639575110053586, "kl": 0.03369140625, "learning_rate": 9.889769224911575e-07, "loss": 0.0013, "reward": 1.6743358373641968, "reward_std": 0.07426143437623978, "rewards/accuracy_reward": 0.5430857539176941, "rewards/format_reward": 1.0, "step": 2202, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 355.1875, "epoch": 0.06699306653691765, "grad_norm": 0.9287571709700312, "kl": 0.041015625, "learning_rate": 9.889669453323355e-07, "loss": 0.0016, "reward": 1.9153801202774048, "reward_std": 0.08854761719703674, "rewards/accuracy_reward": 0.7528800368309021, "rewards/format_reward": 1.0, "step": 2203, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 361.984375, "epoch": 0.06702347646271743, "grad_norm": 0.3649377336082976, "kl": 0.038818359375, "learning_rate": 9.889569637106874e-07, "loss": 0.0016, "reward": 2.1271228790283203, "reward_std": 0.010082371532917023, "rewards/accuracy_reward": 0.9271230697631836, "rewards/format_reward": 1.0, "step": 2204, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 376.375, "epoch": 0.06705388638851721, "grad_norm": 1.173058807002147, "kl": 0.037353515625, "learning_rate": 9.889469776263044e-07, "loss": 0.0015, "reward": 1.5968750715255737, "reward_std": 0.19126304984092712, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2205, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 393.484375, "epoch": 0.06708429631431699, "grad_norm": 0.43675101533037486, "kl": 0.034912109375, "learning_rate": 9.889369870792774e-07, "loss": 0.0014, "reward": 1.8212820291519165, "reward_std": 0.009965362027287483, "rewards/accuracy_reward": 0.6744069457054138, "rewards/format_reward": 1.0, "step": 2206, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.296875, "epoch": 0.06711470624011677, "grad_norm": 0.9683387610166221, "kl": 0.036376953125, "learning_rate": 9.889269920696977e-07, "loss": 0.0015, "reward": 1.911933183670044, "reward_std": 0.06308604031801224, "rewards/accuracy_reward": 0.7400580644607544, "rewards/format_reward": 1.0, "step": 2207, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 387.453125, "epoch": 0.06714511616591655, "grad_norm": 1.2543030334557612, "kl": 0.03466796875, "learning_rate": 9.889169925976567e-07, "loss": 0.0014, "reward": 1.7569324970245361, "reward_std": 0.19314423203468323, "rewards/accuracy_reward": 0.6350574493408203, "rewards/format_reward": 1.0, "step": 2208, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 357.140625, "epoch": 0.06717552609171634, "grad_norm": 1.5859139862542995, "kl": 0.0419921875, "learning_rate": 9.889069886632453e-07, "loss": 0.0017, "reward": 1.8361048698425293, "reward_std": 0.006304196082055569, "rewards/accuracy_reward": 0.6861048936843872, "rewards/format_reward": 1.0, "step": 2209, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 380.296875, "epoch": 0.06720593601751612, "grad_norm": 0.8509601010539325, "kl": 0.041748046875, "learning_rate": 9.88896980266555e-07, "loss": 0.0017, "reward": 1.695457935333252, "reward_std": 0.08584828674793243, "rewards/accuracy_reward": 0.5579578876495361, "rewards/format_reward": 1.0, "step": 2210, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 357.75, "epoch": 0.0672363459433159, "grad_norm": 0.8604834821913739, "kl": 0.03466796875, "learning_rate": 9.888869674076772e-07, "loss": 0.0014, "reward": 1.7046375274658203, "reward_std": 0.08134608715772629, "rewards/accuracy_reward": 0.5765125751495361, "rewards/format_reward": 1.0, "step": 2211, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.53125, "epoch": 0.06726675586911568, "grad_norm": 0.9715926231048938, "kl": 0.0380859375, "learning_rate": 9.888769500867031e-07, "loss": 0.0015, "reward": 2.097578525543213, "reward_std": 0.06918932497501373, "rewards/accuracy_reward": 0.9007035493850708, "rewards/format_reward": 1.0, "step": 2212, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 374.953125, "epoch": 0.06729716579491546, "grad_norm": 1.2766166004314237, "kl": 0.033447265625, "learning_rate": 9.888669283037245e-07, "loss": 0.0013, "reward": 2.029348373413086, "reward_std": 0.01156461052596569, "rewards/accuracy_reward": 0.8293483257293701, "rewards/format_reward": 1.0, "step": 2213, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.515625, "epoch": 0.06732757572071524, "grad_norm": 0.9215003525237704, "kl": 0.03955078125, "learning_rate": 9.888569020588325e-07, "loss": 0.0016, "reward": 2.0311098098754883, "reward_std": 0.09168177098035812, "rewards/accuracy_reward": 0.8467345237731934, "rewards/format_reward": 1.0, "step": 2214, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 374.234375, "epoch": 0.06735798564651502, "grad_norm": 0.774566980708394, "kl": 0.036865234375, "learning_rate": 9.888468713521188e-07, "loss": 0.0015, "reward": 1.965592622756958, "reward_std": 0.005947811529040337, "rewards/accuracy_reward": 0.7905925512313843, "rewards/format_reward": 1.0, "step": 2215, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 406.90625, "epoch": 0.06738839557231481, "grad_norm": 1.3057221687884633, "kl": 0.033203125, "learning_rate": 9.888368361836748e-07, "loss": 0.0013, "reward": 1.836895227432251, "reward_std": 0.0696188285946846, "rewards/accuracy_reward": 0.6868952512741089, "rewards/format_reward": 1.0, "step": 2216, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 357.578125, "epoch": 0.06741880549811459, "grad_norm": 0.7122588737406516, "kl": 0.04638671875, "learning_rate": 9.888267965535921e-07, "loss": 0.0019, "reward": 1.7175116539001465, "reward_std": 0.08245132863521576, "rewards/accuracy_reward": 0.5800116658210754, "rewards/format_reward": 1.0, "step": 2217, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 377.28125, "epoch": 0.06744921542391437, "grad_norm": 1.6554297291100282, "kl": 0.03564453125, "learning_rate": 9.888167524619626e-07, "loss": 0.0014, "reward": 1.6717925071716309, "reward_std": 0.1376330703496933, "rewards/accuracy_reward": 0.5342925786972046, "rewards/format_reward": 1.0, "step": 2218, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 348.296875, "epoch": 0.06747962534971415, "grad_norm": 1.1253100796109392, "kl": 0.0380859375, "learning_rate": 9.888067039088776e-07, "loss": 0.0015, "reward": 2.065624952316284, "reward_std": 0.15590772032737732, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2219, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 363.09375, "epoch": 0.06751003527551393, "grad_norm": 1.0485963127786813, "kl": 0.037109375, "learning_rate": 9.887966508944292e-07, "loss": 0.0015, "reward": 1.8107998371124268, "reward_std": 0.014005981385707855, "rewards/accuracy_reward": 0.6357998847961426, "rewards/format_reward": 1.0, "step": 2220, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 346.234375, "epoch": 0.0675404452013137, "grad_norm": 1.645339150782058, "kl": 0.0439453125, "learning_rate": 9.887865934187089e-07, "loss": 0.0018, "reward": 1.7654517889022827, "reward_std": 0.13604041934013367, "rewards/accuracy_reward": 0.6404517889022827, "rewards/format_reward": 1.0, "step": 2221, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 355.828125, "epoch": 0.06757085512711349, "grad_norm": 2.903908420632924, "kl": 0.043701171875, "learning_rate": 9.887765314818085e-07, "loss": 0.0017, "reward": 1.962499976158142, "reward_std": 0.14223814010620117, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2222, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 375.9375, "epoch": 0.06760126505291326, "grad_norm": 1.3954033852374013, "kl": 0.04248046875, "learning_rate": 9.887664650838198e-07, "loss": 0.0017, "reward": 1.7894814014434814, "reward_std": 0.05271490290760994, "rewards/accuracy_reward": 0.6488563418388367, "rewards/format_reward": 1.0, "step": 2223, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 364.65625, "epoch": 0.06763167497871306, "grad_norm": 0.8941798708553818, "kl": 0.035400390625, "learning_rate": 9.88756394224835e-07, "loss": 0.0014, "reward": 1.834233045578003, "reward_std": 0.0618002787232399, "rewards/accuracy_reward": 0.6904829144477844, "rewards/format_reward": 1.0, "step": 2224, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 362.171875, "epoch": 0.06766208490451284, "grad_norm": 1.312360301298235, "kl": 0.033447265625, "learning_rate": 9.887463189049457e-07, "loss": 0.0013, "reward": 1.465134859085083, "reward_std": 0.015524456277489662, "rewards/accuracy_reward": 0.38388481736183167, "rewards/format_reward": 1.0, "step": 2225, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 371.1875, "epoch": 0.06769249483031262, "grad_norm": 0.4693052695392991, "kl": 0.03271484375, "learning_rate": 9.887362391242437e-07, "loss": 0.0013, "reward": 1.9920763969421387, "reward_std": 0.0659717470407486, "rewards/accuracy_reward": 0.8295762538909912, "rewards/format_reward": 1.0, "step": 2226, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 349.0625, "epoch": 0.0677229047561124, "grad_norm": 0.6616559652401559, "kl": 0.036865234375, "learning_rate": 9.887261548828215e-07, "loss": 0.0015, "reward": 2.179513931274414, "reward_std": 0.05624767020344734, "rewards/accuracy_reward": 0.9826388955116272, "rewards/format_reward": 1.0, "step": 2227, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 352.4375, "epoch": 0.06775331468191217, "grad_norm": 1.1631021820362066, "kl": 0.0380859375, "learning_rate": 9.887160661807708e-07, "loss": 0.0015, "reward": 1.8558531999588013, "reward_std": 0.1292649805545807, "rewards/accuracy_reward": 0.7058532238006592, "rewards/format_reward": 1.0, "step": 2228, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 343.984375, "epoch": 0.06778372460771195, "grad_norm": 1.5298475687083963, "kl": 0.0419921875, "learning_rate": 9.887059730181838e-07, "loss": 0.0017, "reward": 1.9083261489868164, "reward_std": 0.26008477807044983, "rewards/accuracy_reward": 0.7489511370658875, "rewards/format_reward": 1.0, "step": 2229, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 343.859375, "epoch": 0.06781413453351173, "grad_norm": 1.116183878692776, "kl": 0.043701171875, "learning_rate": 9.886958753951524e-07, "loss": 0.0017, "reward": 1.984375, "reward_std": 0.09804397821426392, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 2230, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 355.484375, "epoch": 0.06784454445931153, "grad_norm": 1.0151983832541056, "kl": 0.0400390625, "learning_rate": 9.886857733117692e-07, "loss": 0.0016, "reward": 1.9531069993972778, "reward_std": 0.06591516733169556, "rewards/accuracy_reward": 0.7843568921089172, "rewards/format_reward": 1.0, "step": 2231, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 348.15625, "epoch": 0.0678749543851113, "grad_norm": 0.75267016829093, "kl": 0.037353515625, "learning_rate": 9.886756667681259e-07, "loss": 0.0015, "reward": 2.0093750953674316, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2232, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 356.21875, "epoch": 0.06790536431091108, "grad_norm": 1.6709839581364976, "kl": 0.04638671875, "learning_rate": 9.886655557643151e-07, "loss": 0.0019, "reward": 1.9906619787216187, "reward_std": 0.11100949347019196, "rewards/accuracy_reward": 0.8125368356704712, "rewards/format_reward": 1.0, "step": 2233, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 368.25, "epoch": 0.06793577423671086, "grad_norm": 3.6082573841813654, "kl": 0.03076171875, "learning_rate": 9.88655440300429e-07, "loss": 0.0012, "reward": 1.8297780752182007, "reward_std": 0.15095004439353943, "rewards/accuracy_reward": 0.6797781586647034, "rewards/format_reward": 1.0, "step": 2234, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 386.8125, "epoch": 0.06796618416251064, "grad_norm": 2.6515970005504057, "kl": 0.038818359375, "learning_rate": 9.8864532037656e-07, "loss": 0.0016, "reward": 1.528320550918579, "reward_std": 0.05963604524731636, "rewards/accuracy_reward": 0.42207059264183044, "rewards/format_reward": 1.0, "step": 2235, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.34375, "epoch": 0.06799659408831042, "grad_norm": 0.7370335966791741, "kl": 0.0400390625, "learning_rate": 9.886351959928001e-07, "loss": 0.0016, "reward": 1.914493441581726, "reward_std": 0.011152930557727814, "rewards/accuracy_reward": 0.7426183819770813, "rewards/format_reward": 1.0, "step": 2236, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 367.015625, "epoch": 0.0680270040141102, "grad_norm": 0.5074762813946876, "kl": 0.0400390625, "learning_rate": 9.88625067149242e-07, "loss": 0.0016, "reward": 1.8968749046325684, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.746874988079071, "rewards/format_reward": 1.0, "step": 2237, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 362.953125, "epoch": 0.06805741393990998, "grad_norm": 1.9127370249608193, "kl": 0.0498046875, "learning_rate": 9.886149338459782e-07, "loss": 0.002, "reward": 1.6760433912277222, "reward_std": 0.05108101665973663, "rewards/accuracy_reward": 0.5385434031486511, "rewards/format_reward": 1.0, "step": 2238, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 376.609375, "epoch": 0.06808782386570977, "grad_norm": 0.4184130412938746, "kl": 0.0311279296875, "learning_rate": 9.88604796083101e-07, "loss": 0.0012, "reward": 2.1468749046325684, "reward_std": 0.08237523585557938, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 2239, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 401.84375, "epoch": 0.06811823379150955, "grad_norm": 1.805844491970183, "kl": 0.033203125, "learning_rate": 9.885946538607031e-07, "loss": 0.0013, "reward": 1.682572603225708, "reward_std": 0.07227236032485962, "rewards/accuracy_reward": 0.5513225197792053, "rewards/format_reward": 1.0, "step": 2240, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 352.859375, "epoch": 0.06814864371730933, "grad_norm": 0.4046895602290211, "kl": 0.046142578125, "learning_rate": 9.88584507178877e-07, "loss": 0.0018, "reward": 2.1315250396728516, "reward_std": 0.010544582270085812, "rewards/accuracy_reward": 0.934650182723999, "rewards/format_reward": 1.0, "step": 2241, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.59375, "epoch": 0.06817905364310911, "grad_norm": 1.144219701701698, "kl": 0.032958984375, "learning_rate": 9.885743560377153e-07, "loss": 0.0013, "reward": 1.8402928113937378, "reward_std": 0.15710125863552094, "rewards/accuracy_reward": 0.6902925968170166, "rewards/format_reward": 0.984375, "step": 2242, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 353.734375, "epoch": 0.06820946356890889, "grad_norm": 0.7724977110124849, "kl": 0.040283203125, "learning_rate": 9.885642004373107e-07, "loss": 0.0016, "reward": 2.106642961502075, "reward_std": 0.016255296766757965, "rewards/accuracy_reward": 0.9097679853439331, "rewards/format_reward": 1.0, "step": 2243, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 354.53125, "epoch": 0.06823987349470867, "grad_norm": 1.3876557518155939, "kl": 0.045166015625, "learning_rate": 9.885540403777556e-07, "loss": 0.0018, "reward": 2.0749754905700684, "reward_std": 0.03171054273843765, "rewards/accuracy_reward": 0.8874753713607788, "rewards/format_reward": 1.0, "step": 2244, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 368.171875, "epoch": 0.06827028342050845, "grad_norm": 1.077688281664119, "kl": 0.044921875, "learning_rate": 9.885438758591431e-07, "loss": 0.0018, "reward": 1.725000023841858, "reward_std": 0.12246952205896378, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 2245, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 376.53125, "epoch": 0.06830069334630824, "grad_norm": 0.4644101306656919, "kl": 0.04052734375, "learning_rate": 9.88533706881566e-07, "loss": 0.0016, "reward": 2.049999952316284, "reward_std": 0.11206510663032532, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2246, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 384.84375, "epoch": 0.06833110327210802, "grad_norm": 0.7051081157141439, "kl": 0.033447265625, "learning_rate": 9.88523533445117e-07, "loss": 0.0013, "reward": 2.0969135761260986, "reward_std": 0.017027081921696663, "rewards/accuracy_reward": 0.9094133973121643, "rewards/format_reward": 1.0, "step": 2247, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 350.390625, "epoch": 0.0683615131979078, "grad_norm": 0.6781086551716938, "kl": 0.03564453125, "learning_rate": 9.885133555498887e-07, "loss": 0.0014, "reward": 2.046875, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2248, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 345.75, "epoch": 0.06839192312370758, "grad_norm": 0.7634153846345358, "kl": 0.039794921875, "learning_rate": 9.885031731959741e-07, "loss": 0.0016, "reward": 1.7843749523162842, "reward_std": 0.16666369140148163, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2249, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 393.28125, "epoch": 0.06842233304950736, "grad_norm": 1.9460064065270672, "kl": 0.04345703125, "learning_rate": 9.884929863834665e-07, "loss": 0.0017, "reward": 1.6914310455322266, "reward_std": 0.15358980000019073, "rewards/accuracy_reward": 0.5601810216903687, "rewards/format_reward": 1.0, "step": 2250, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 398.421875, "epoch": 0.06845274297530714, "grad_norm": 1.2804890304173684, "kl": 0.03515625, "learning_rate": 9.884827951124585e-07, "loss": 0.0014, "reward": 1.759111762046814, "reward_std": 0.21260187029838562, "rewards/accuracy_reward": 0.6466116905212402, "rewards/format_reward": 1.0, "step": 2251, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 378.09375, "epoch": 0.06848315290110692, "grad_norm": 1.4745686258508393, "kl": 0.048095703125, "learning_rate": 9.884725993830433e-07, "loss": 0.0019, "reward": 1.7369842529296875, "reward_std": 0.13547685742378235, "rewards/accuracy_reward": 0.5869842767715454, "rewards/format_reward": 1.0, "step": 2252, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 359.75, "epoch": 0.0685135628269067, "grad_norm": 0.8871923028985719, "kl": 0.041748046875, "learning_rate": 9.884623991953137e-07, "loss": 0.0017, "reward": 1.8182101249694824, "reward_std": 0.01499113067984581, "rewards/accuracy_reward": 0.6744600534439087, "rewards/format_reward": 1.0, "step": 2253, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 356.0, "epoch": 0.06854397275270649, "grad_norm": 1.2229444103101907, "kl": 0.053955078125, "learning_rate": 9.884521945493633e-07, "loss": 0.0022, "reward": 1.7477614879608154, "reward_std": 0.19921521842479706, "rewards/accuracy_reward": 0.6102614998817444, "rewards/format_reward": 1.0, "step": 2254, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 350.296875, "epoch": 0.06857438267850627, "grad_norm": 0.46578868748278857, "kl": 0.03662109375, "learning_rate": 9.884419854452849e-07, "loss": 0.0015, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2255, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 362.78125, "epoch": 0.06860479260430605, "grad_norm": 0.7420801254894409, "kl": 0.044189453125, "learning_rate": 9.884317718831715e-07, "loss": 0.0018, "reward": 2.1480114459991455, "reward_std": 0.10553726553916931, "rewards/accuracy_reward": 0.9573863744735718, "rewards/format_reward": 1.0, "step": 2256, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 392.0, "epoch": 0.06863520253010583, "grad_norm": 1.2064680745875855, "kl": 0.033447265625, "learning_rate": 9.884215538631165e-07, "loss": 0.0013, "reward": 1.5687499046325684, "reward_std": 0.28721606731414795, "rewards/accuracy_reward": 0.4781249761581421, "rewards/format_reward": 1.0, "step": 2257, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 350.703125, "epoch": 0.0686656124559056, "grad_norm": 0.8532761410665195, "kl": 0.037353515625, "learning_rate": 9.884113313852133e-07, "loss": 0.0015, "reward": 1.8167037963867188, "reward_std": 0.09854865074157715, "rewards/accuracy_reward": 0.6698288917541504, "rewards/format_reward": 1.0, "step": 2258, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 353.53125, "epoch": 0.06869602238170538, "grad_norm": 1.6448761973557864, "kl": 0.037353515625, "learning_rate": 9.88401104449555e-07, "loss": 0.0015, "reward": 1.7667994499206543, "reward_std": 0.15542975068092346, "rewards/accuracy_reward": 0.6167994141578674, "rewards/format_reward": 1.0, "step": 2259, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 350.921875, "epoch": 0.06872643230750516, "grad_norm": 0.6316409767517929, "kl": 0.042236328125, "learning_rate": 9.883908730562351e-07, "loss": 0.0017, "reward": 1.9796427488327026, "reward_std": 0.012135233730077744, "rewards/accuracy_reward": 0.8108928203582764, "rewards/format_reward": 1.0, "step": 2260, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 347.84375, "epoch": 0.06875684223330496, "grad_norm": 0.8056503372911228, "kl": 0.045654296875, "learning_rate": 9.88380637205347e-07, "loss": 0.0018, "reward": 1.846882700920105, "reward_std": 0.0741848573088646, "rewards/accuracy_reward": 0.690632700920105, "rewards/format_reward": 1.0, "step": 2261, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 367.890625, "epoch": 0.06878725215910474, "grad_norm": 1.181723048855504, "kl": 0.04443359375, "learning_rate": 9.883703968969839e-07, "loss": 0.0018, "reward": 1.7667608261108398, "reward_std": 0.022528760135173798, "rewards/accuracy_reward": 0.5948858261108398, "rewards/format_reward": 1.0, "step": 2262, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 336.5625, "epoch": 0.06881766208490452, "grad_norm": 1.0834613786889569, "kl": 0.041015625, "learning_rate": 9.883601521312393e-07, "loss": 0.0016, "reward": 1.9125001430511475, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2263, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.546875, "epoch": 0.0688480720107043, "grad_norm": 0.34115646537216643, "kl": 0.032470703125, "learning_rate": 9.88349902908207e-07, "loss": 0.0013, "reward": 1.9593751430511475, "reward_std": 0.184493750333786, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.953125, "step": 2264, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 358.578125, "epoch": 0.06887848193650407, "grad_norm": 0.8852695569139788, "kl": 0.03271484375, "learning_rate": 9.883396492279803e-07, "loss": 0.0013, "reward": 1.9052592515945435, "reward_std": 0.0291459821164608, "rewards/accuracy_reward": 0.7333841919898987, "rewards/format_reward": 1.0, "step": 2265, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 343.328125, "epoch": 0.06890889186230385, "grad_norm": 0.8182329343389032, "kl": 0.0294189453125, "learning_rate": 9.883293910906528e-07, "loss": 0.0012, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 2266, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 370.375, "epoch": 0.06893930178810363, "grad_norm": 1.0205765365184616, "kl": 0.03466796875, "learning_rate": 9.883191284963182e-07, "loss": 0.0014, "reward": 1.835097074508667, "reward_std": 0.02325272560119629, "rewards/accuracy_reward": 0.6632219552993774, "rewards/format_reward": 1.0, "step": 2267, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 353.703125, "epoch": 0.06896971171390341, "grad_norm": 0.9226313677957126, "kl": 0.035888671875, "learning_rate": 9.8830886144507e-07, "loss": 0.0014, "reward": 1.9139372110366821, "reward_std": 0.07032033056020737, "rewards/accuracy_reward": 0.738937258720398, "rewards/format_reward": 1.0, "step": 2268, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 336.21875, "epoch": 0.0690001216397032, "grad_norm": 0.591537663424875, "kl": 0.04248046875, "learning_rate": 9.88298589937002e-07, "loss": 0.0017, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "step": 2269, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 367.34375, "epoch": 0.06903053156550298, "grad_norm": 1.5404692508277464, "kl": 0.038818359375, "learning_rate": 9.882883139722083e-07, "loss": 0.0016, "reward": 1.9613518714904785, "reward_std": 0.13811716437339783, "rewards/accuracy_reward": 0.7738518118858337, "rewards/format_reward": 1.0, "step": 2270, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 346.546875, "epoch": 0.06906094149130276, "grad_norm": 1.8829819033304787, "kl": 0.04443359375, "learning_rate": 9.88278033550782e-07, "loss": 0.0018, "reward": 1.876361608505249, "reward_std": 0.07933309674263, "rewards/accuracy_reward": 0.7419865131378174, "rewards/format_reward": 1.0, "step": 2271, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 350.6875, "epoch": 0.06909135141710254, "grad_norm": 1.3919123389713304, "kl": 0.034423828125, "learning_rate": 9.882677486728176e-07, "loss": 0.0014, "reward": 1.8618614673614502, "reward_std": 0.2066417634487152, "rewards/accuracy_reward": 0.705611526966095, "rewards/format_reward": 1.0, "step": 2272, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 352.125, "epoch": 0.06912176134290232, "grad_norm": 0.4518602644679097, "kl": 0.037109375, "learning_rate": 9.882574593384086e-07, "loss": 0.0015, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2273, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 351.265625, "epoch": 0.0691521712687021, "grad_norm": 0.7774460175595154, "kl": 0.037109375, "learning_rate": 9.88247165547649e-07, "loss": 0.0015, "reward": 1.9488685131072998, "reward_std": 0.010748323053121567, "rewards/accuracy_reward": 0.776993453502655, "rewards/format_reward": 1.0, "step": 2274, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 403.359375, "epoch": 0.06918258119450188, "grad_norm": 0.7830753434879931, "kl": 0.041259765625, "learning_rate": 9.882368673006325e-07, "loss": 0.0017, "reward": 1.7278258800506592, "reward_std": 0.15781942009925842, "rewards/accuracy_reward": 0.6153258085250854, "rewards/format_reward": 1.0, "step": 2275, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 354.484375, "epoch": 0.06921299112030167, "grad_norm": 0.6532487036716937, "kl": 0.0419921875, "learning_rate": 9.882265645974536e-07, "loss": 0.0017, "reward": 1.9704697132110596, "reward_std": 0.002899857936426997, "rewards/accuracy_reward": 0.7954696416854858, "rewards/format_reward": 1.0, "step": 2276, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 372.046875, "epoch": 0.06924340104610145, "grad_norm": 1.217019881469137, "kl": 0.040283203125, "learning_rate": 9.882162574382058e-07, "loss": 0.0016, "reward": 1.7814278602600098, "reward_std": 0.12052673101425171, "rewards/accuracy_reward": 0.6001777052879333, "rewards/format_reward": 1.0, "step": 2277, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 357.8125, "epoch": 0.06927381097190123, "grad_norm": 0.6157950169981056, "kl": 0.04248046875, "learning_rate": 9.882059458229838e-07, "loss": 0.0017, "reward": 1.8441228866577148, "reward_std": 0.04940905421972275, "rewards/accuracy_reward": 0.6941227912902832, "rewards/format_reward": 1.0, "step": 2278, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 351.078125, "epoch": 0.06930422089770101, "grad_norm": 1.1635963289211058, "kl": 0.04443359375, "learning_rate": 9.88195629751881e-07, "loss": 0.0018, "reward": 1.970986247062683, "reward_std": 0.1586020141839981, "rewards/accuracy_reward": 0.7959861755371094, "rewards/format_reward": 1.0, "step": 2279, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 388.84375, "epoch": 0.06933463082350079, "grad_norm": 1.1869323599480415, "kl": 0.04052734375, "learning_rate": 9.881853092249922e-07, "loss": 0.0016, "reward": 1.7687737941741943, "reward_std": 0.1427783966064453, "rewards/accuracy_reward": 0.6281486749649048, "rewards/format_reward": 1.0, "step": 2280, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 389.5625, "epoch": 0.06936504074930057, "grad_norm": 0.986237530240388, "kl": 0.033935546875, "learning_rate": 9.881749842424109e-07, "loss": 0.0014, "reward": 1.7048877477645874, "reward_std": 0.1004287600517273, "rewards/accuracy_reward": 0.5798878073692322, "rewards/format_reward": 1.0, "step": 2281, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 360.65625, "epoch": 0.06939545067510035, "grad_norm": 2.1843880955563932, "kl": 0.036865234375, "learning_rate": 9.88164654804232e-07, "loss": 0.0015, "reward": 1.6312499046325684, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 2282, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 382.140625, "epoch": 0.06942586060090013, "grad_norm": 1.7483121261870007, "kl": 0.03564453125, "learning_rate": 9.881543209105496e-07, "loss": 0.0014, "reward": 1.6724202632904053, "reward_std": 0.17130136489868164, "rewards/accuracy_reward": 0.5380451083183289, "rewards/format_reward": 1.0, "step": 2283, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 384.890625, "epoch": 0.06945627052669992, "grad_norm": 0.9984343534733315, "kl": 0.033447265625, "learning_rate": 9.881439825614577e-07, "loss": 0.0013, "reward": 1.9319651126861572, "reward_std": 0.167019322514534, "rewards/accuracy_reward": 0.7538400292396545, "rewards/format_reward": 1.0, "step": 2284, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 366.953125, "epoch": 0.0694866804524997, "grad_norm": 1.2063902084423044, "kl": 0.036376953125, "learning_rate": 9.88133639757051e-07, "loss": 0.0015, "reward": 1.652312994003296, "reward_std": 0.14295431971549988, "rewards/accuracy_reward": 0.5273130536079407, "rewards/format_reward": 1.0, "step": 2285, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 394.9375, "epoch": 0.06951709037829948, "grad_norm": 1.3198663744728758, "kl": 0.034912109375, "learning_rate": 9.881232924974238e-07, "loss": 0.0014, "reward": 1.7320127487182617, "reward_std": 0.3195357620716095, "rewards/accuracy_reward": 0.5820126533508301, "rewards/format_reward": 1.0, "step": 2286, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 362.015625, "epoch": 0.06954750030409926, "grad_norm": 1.9331218919662314, "kl": 0.0439453125, "learning_rate": 9.881129407826708e-07, "loss": 0.0018, "reward": 1.5314611196517944, "reward_std": 0.003886499907821417, "rewards/accuracy_reward": 0.4314611554145813, "rewards/format_reward": 1.0, "step": 2287, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 380.78125, "epoch": 0.06957791022989904, "grad_norm": 1.4882716048161213, "kl": 0.040283203125, "learning_rate": 9.88102584612886e-07, "loss": 0.0016, "reward": 1.8347229957580566, "reward_std": 0.12433423101902008, "rewards/accuracy_reward": 0.6534730195999146, "rewards/format_reward": 1.0, "step": 2288, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 374.828125, "epoch": 0.06960832015569882, "grad_norm": 1.0978709226254382, "kl": 0.03271484375, "learning_rate": 9.880922239881638e-07, "loss": 0.0013, "reward": 1.4976376295089722, "reward_std": 0.12915125489234924, "rewards/accuracy_reward": 0.4163876473903656, "rewards/format_reward": 1.0, "step": 2289, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 366.796875, "epoch": 0.0696387300814986, "grad_norm": 0.24775669143239068, "kl": 0.03125, "learning_rate": 9.880818589085995e-07, "loss": 0.0013, "reward": 1.8968751430511475, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2290, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 340.296875, "epoch": 0.06966914000729839, "grad_norm": 1.2598423543594697, "kl": 0.0400390625, "learning_rate": 9.880714893742873e-07, "loss": 0.0016, "reward": 1.6704634428024292, "reward_std": 0.03241485357284546, "rewards/accuracy_reward": 0.5392134189605713, "rewards/format_reward": 1.0, "step": 2291, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 362.609375, "epoch": 0.06969954993309817, "grad_norm": 0.49159107932310486, "kl": 0.03564453125, "learning_rate": 9.880611153853216e-07, "loss": 0.0014, "reward": 1.96297025680542, "reward_std": 0.003289412474259734, "rewards/accuracy_reward": 0.7879700660705566, "rewards/format_reward": 1.0, "step": 2292, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 367.453125, "epoch": 0.06972995985889795, "grad_norm": 0.7469250089504683, "kl": 0.035400390625, "learning_rate": 9.880507369417974e-07, "loss": 0.0014, "reward": 1.953125, "reward_std": 0.08647121489048004, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2293, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 357.28125, "epoch": 0.06976036978469773, "grad_norm": 0.3083134033801743, "kl": 0.031494140625, "learning_rate": 9.880403540438094e-07, "loss": 0.0013, "reward": 1.8062500953674316, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 2294, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 341.953125, "epoch": 0.0697907797104975, "grad_norm": 3.291994363812802, "kl": 0.046142578125, "learning_rate": 9.880299666914522e-07, "loss": 0.0018, "reward": 1.849205732345581, "reward_std": 0.07967593520879745, "rewards/accuracy_reward": 0.7054557204246521, "rewards/format_reward": 1.0, "step": 2295, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 369.625, "epoch": 0.06982118963629728, "grad_norm": 0.6766648081826101, "kl": 0.03662109375, "learning_rate": 9.88019574884821e-07, "loss": 0.0015, "reward": 2.065624952316284, "reward_std": 0.20126797258853912, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2296, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 370.390625, "epoch": 0.06985159956209706, "grad_norm": 0.6515051735410049, "kl": 0.036376953125, "learning_rate": 9.880091786240102e-07, "loss": 0.0015, "reward": 1.7697710990905762, "reward_std": 0.08235938847064972, "rewards/accuracy_reward": 0.6416460275650024, "rewards/format_reward": 1.0, "step": 2297, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.25, "completion_length": 378.703125, "epoch": 0.06988200948789684, "grad_norm": 1.080129129026624, "kl": 0.03369140625, "learning_rate": 9.879987779091148e-07, "loss": 0.0014, "reward": 1.3913724422454834, "reward_std": 0.08132602274417877, "rewards/accuracy_reward": 0.2976223826408386, "rewards/format_reward": 1.0, "step": 2298, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 362.03125, "epoch": 0.06991241941369664, "grad_norm": 0.09025643559834735, "kl": 0.03369140625, "learning_rate": 9.879883727402298e-07, "loss": 0.0013, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2299, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 354.921875, "epoch": 0.06994282933949642, "grad_norm": 0.4976207438461873, "kl": 0.033935546875, "learning_rate": 9.879779631174503e-07, "loss": 0.0014, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 2300, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 384.53125, "epoch": 0.0699732392652962, "grad_norm": 1.1354118354307317, "kl": 0.03662109375, "learning_rate": 9.87967549040871e-07, "loss": 0.0015, "reward": 1.648857593536377, "reward_std": 0.1248646154999733, "rewards/accuracy_reward": 0.5238576531410217, "rewards/format_reward": 1.0, "step": 2301, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 374.15625, "epoch": 0.07000364919109597, "grad_norm": 0.750317047560824, "kl": 0.03955078125, "learning_rate": 9.879571305105873e-07, "loss": 0.0016, "reward": 1.9010951519012451, "reward_std": 0.02355922758579254, "rewards/accuracy_reward": 0.7260951399803162, "rewards/format_reward": 1.0, "step": 2302, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 358.34375, "epoch": 0.07003405911689575, "grad_norm": 0.6651510985141867, "kl": 0.037353515625, "learning_rate": 9.879467075266938e-07, "loss": 0.0015, "reward": 1.9865939617156982, "reward_std": 0.0065956078469753265, "rewards/accuracy_reward": 0.8147189617156982, "rewards/format_reward": 1.0, "step": 2303, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 348.90625, "epoch": 0.07006446904269553, "grad_norm": 0.651526694453003, "kl": 0.05126953125, "learning_rate": 9.879362800892862e-07, "loss": 0.0021, "reward": 1.84375, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 2304, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 398.6875, "epoch": 0.07009487896849531, "grad_norm": 3.927001490134674, "kl": 0.043212890625, "learning_rate": 9.879258481984592e-07, "loss": 0.0017, "reward": 1.845004677772522, "reward_std": 0.09620370715856552, "rewards/accuracy_reward": 0.7043795585632324, "rewards/format_reward": 1.0, "step": 2305, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 367.734375, "epoch": 0.0701252888942951, "grad_norm": 1.2058228718167345, "kl": 0.0439453125, "learning_rate": 9.879154118543083e-07, "loss": 0.0018, "reward": 2.0927352905273438, "reward_std": 0.021979494020342827, "rewards/accuracy_reward": 0.9021104574203491, "rewards/format_reward": 1.0, "step": 2306, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 424.46875, "epoch": 0.07015569882009488, "grad_norm": 1.3795550673046424, "kl": 0.031982421875, "learning_rate": 9.879049710569287e-07, "loss": 0.0013, "reward": 1.5257447957992554, "reward_std": 0.1564824879169464, "rewards/accuracy_reward": 0.5163697600364685, "rewards/format_reward": 0.890625, "step": 2307, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 380.28125, "epoch": 0.07018610874589466, "grad_norm": 0.8802635739787507, "kl": 0.0341796875, "learning_rate": 9.878945258064154e-07, "loss": 0.0014, "reward": 2.061544418334961, "reward_std": 0.08267122507095337, "rewards/accuracy_reward": 0.8740445375442505, "rewards/format_reward": 1.0, "step": 2308, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.625, "completion_length": 352.265625, "epoch": 0.07021651867169444, "grad_norm": 0.7675436573627277, "kl": 0.04345703125, "learning_rate": 9.878840761028645e-07, "loss": 0.0017, "reward": 1.2972195148468018, "reward_std": 0.08386655151844025, "rewards/accuracy_reward": 0.23471945524215698, "rewards/format_reward": 1.0, "step": 2309, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 367.328125, "epoch": 0.07024692859749422, "grad_norm": 3.5503186019566915, "kl": 0.0322265625, "learning_rate": 9.878736219463705e-07, "loss": 0.0013, "reward": 2.0423545837402344, "reward_std": 0.03579375520348549, "rewards/accuracy_reward": 0.8611045479774475, "rewards/format_reward": 1.0, "step": 2310, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 380.46875, "epoch": 0.070277338523294, "grad_norm": 0.9870104926777629, "kl": 0.0341796875, "learning_rate": 9.878631633370293e-07, "loss": 0.0014, "reward": 1.8088619709014893, "reward_std": 0.06886190176010132, "rewards/accuracy_reward": 0.6682369709014893, "rewards/format_reward": 1.0, "step": 2311, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 366.796875, "epoch": 0.07030774844909378, "grad_norm": 0.9453462225409254, "kl": 0.046630859375, "learning_rate": 9.878527002749361e-07, "loss": 0.0019, "reward": 2.0369269847869873, "reward_std": 0.03608742728829384, "rewards/accuracy_reward": 0.8525518774986267, "rewards/format_reward": 1.0, "step": 2312, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.0625, "epoch": 0.07033815837489356, "grad_norm": 0.7991887605374727, "kl": 0.0302734375, "learning_rate": 9.878422327601866e-07, "loss": 0.0012, "reward": 1.8520312309265137, "reward_std": 0.08910862356424332, "rewards/accuracy_reward": 0.7020313143730164, "rewards/format_reward": 1.0, "step": 2313, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 369.21875, "epoch": 0.07036856830069335, "grad_norm": 1.2514406253694041, "kl": 0.038330078125, "learning_rate": 9.878317607928765e-07, "loss": 0.0015, "reward": 1.8699041604995728, "reward_std": 0.030581310391426086, "rewards/accuracy_reward": 0.698029100894928, "rewards/format_reward": 1.0, "step": 2314, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.96875, "epoch": 0.07039897822649313, "grad_norm": 1.1645371452224047, "kl": 0.033203125, "learning_rate": 9.87821284373101e-07, "loss": 0.0013, "reward": 1.763006567955017, "reward_std": 0.05664077401161194, "rewards/accuracy_reward": 0.6130064725875854, "rewards/format_reward": 1.0, "step": 2315, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 381.875, "epoch": 0.07042938815229291, "grad_norm": 0.8407366591462966, "kl": 0.04638671875, "learning_rate": 9.878108035009558e-07, "loss": 0.0019, "reward": 1.5228604078292847, "reward_std": 0.1503661870956421, "rewards/accuracy_reward": 0.4197354316711426, "rewards/format_reward": 1.0, "step": 2316, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 382.609375, "epoch": 0.07045979807809269, "grad_norm": 0.7703236414610714, "kl": 0.0517578125, "learning_rate": 9.878003181765369e-07, "loss": 0.0021, "reward": 1.7848243713378906, "reward_std": 0.028871191665530205, "rewards/accuracy_reward": 0.6598243117332458, "rewards/format_reward": 1.0, "step": 2317, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 349.234375, "epoch": 0.07049020800389247, "grad_norm": 0.8097166648062635, "kl": 0.034912109375, "learning_rate": 9.877898283999394e-07, "loss": 0.0014, "reward": 1.8730545043945312, "reward_std": 0.09398944675922394, "rewards/accuracy_reward": 0.7261793613433838, "rewards/format_reward": 1.0, "step": 2318, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 341.953125, "epoch": 0.07052061792969225, "grad_norm": 1.3759789838541443, "kl": 0.037841796875, "learning_rate": 9.877793341712596e-07, "loss": 0.0015, "reward": 1.929140329360962, "reward_std": 0.029561541974544525, "rewards/accuracy_reward": 0.7760151624679565, "rewards/format_reward": 1.0, "step": 2319, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 338.046875, "epoch": 0.07055102785549203, "grad_norm": 0.763708230796555, "kl": 0.042236328125, "learning_rate": 9.877688354905928e-07, "loss": 0.0017, "reward": 2.07489013671875, "reward_std": 0.09200280159711838, "rewards/accuracy_reward": 0.88739013671875, "rewards/format_reward": 1.0, "step": 2320, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 343.203125, "epoch": 0.07058143778129182, "grad_norm": 1.260315895288304, "kl": 0.036865234375, "learning_rate": 9.877583323580355e-07, "loss": 0.0015, "reward": 1.9929577112197876, "reward_std": 0.08860299736261368, "rewards/accuracy_reward": 0.827332615852356, "rewards/format_reward": 1.0, "step": 2321, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.015625, "epoch": 0.0706118477070916, "grad_norm": 1.071296103423229, "kl": 0.040283203125, "learning_rate": 9.877478247736828e-07, "loss": 0.0016, "reward": 1.8927083015441895, "reward_std": 0.14467142522335052, "rewards/accuracy_reward": 0.7552083134651184, "rewards/format_reward": 1.0, "step": 2322, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 338.203125, "epoch": 0.07064225763289138, "grad_norm": 0.07085687593540116, "kl": 0.058837890625, "learning_rate": 9.87737312737631e-07, "loss": 0.0024, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 2323, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 371.125, "epoch": 0.07067266755869116, "grad_norm": 1.2852428148380417, "kl": 0.049072265625, "learning_rate": 9.87726796249976e-07, "loss": 0.002, "reward": 1.7640430927276611, "reward_std": 0.04223271459341049, "rewards/accuracy_reward": 0.6015430688858032, "rewards/format_reward": 1.0, "step": 2324, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 387.453125, "epoch": 0.07070307748449094, "grad_norm": 0.7617805200476044, "kl": 0.0308837890625, "learning_rate": 9.877162753108138e-07, "loss": 0.0012, "reward": 1.834062099456787, "reward_std": 0.006440363824367523, "rewards/accuracy_reward": 0.6840620040893555, "rewards/format_reward": 1.0, "step": 2325, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 366.90625, "epoch": 0.07073348741029072, "grad_norm": 2.091456419932376, "kl": 0.0380859375, "learning_rate": 9.877057499202404e-07, "loss": 0.0015, "reward": 1.7536214590072632, "reward_std": 0.08217963576316833, "rewards/accuracy_reward": 0.6161214113235474, "rewards/format_reward": 1.0, "step": 2326, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 345.125, "epoch": 0.0707638973360905, "grad_norm": 0.6566244678597222, "kl": 0.046875, "learning_rate": 9.87695220078352e-07, "loss": 0.0019, "reward": 2.083054542541504, "reward_std": 0.013435021974146366, "rewards/accuracy_reward": 0.8893045783042908, "rewards/format_reward": 1.0, "step": 2327, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 377.3125, "epoch": 0.07079430726189027, "grad_norm": 0.9711363118362093, "kl": 0.0306396484375, "learning_rate": 9.876846857852443e-07, "loss": 0.0012, "reward": 1.742506742477417, "reward_std": 0.08601991832256317, "rewards/accuracy_reward": 0.6050066947937012, "rewards/format_reward": 1.0, "step": 2328, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 344.046875, "epoch": 0.07082471718769007, "grad_norm": 0.08032897808819615, "kl": 0.046875, "learning_rate": 9.876741470410138e-07, "loss": 0.0019, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2329, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 364.28125, "epoch": 0.07085512711348985, "grad_norm": 1.0768505102580797, "kl": 0.0308837890625, "learning_rate": 9.876636038457566e-07, "loss": 0.0012, "reward": 1.8829131126403809, "reward_std": 0.17062914371490479, "rewards/accuracy_reward": 0.7360380291938782, "rewards/format_reward": 1.0, "step": 2330, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 380.65625, "epoch": 0.07088553703928963, "grad_norm": 1.2643904268130357, "kl": 0.047119140625, "learning_rate": 9.87653056199569e-07, "loss": 0.0019, "reward": 1.9835827350616455, "reward_std": 0.10681791603565216, "rewards/accuracy_reward": 0.8054577708244324, "rewards/format_reward": 1.0, "step": 2331, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 370.234375, "epoch": 0.0709159469650894, "grad_norm": 0.9412618504801109, "kl": 0.031982421875, "learning_rate": 9.876425041025472e-07, "loss": 0.0013, "reward": 1.82041335105896, "reward_std": 0.07400663197040558, "rewards/accuracy_reward": 0.676663339138031, "rewards/format_reward": 1.0, "step": 2332, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 381.46875, "epoch": 0.07094635689088918, "grad_norm": 0.07042972212435189, "kl": 0.0419921875, "learning_rate": 9.876319475547876e-07, "loss": 0.0017, "reward": 2.047159194946289, "reward_std": 0.0, "rewards/accuracy_reward": 0.8721591234207153, "rewards/format_reward": 1.0, "step": 2333, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 370.171875, "epoch": 0.07097676681668896, "grad_norm": 0.7192414015903144, "kl": 0.035400390625, "learning_rate": 9.876213865563862e-07, "loss": 0.0014, "reward": 2.0867929458618164, "reward_std": 0.055555880069732666, "rewards/accuracy_reward": 0.8899179697036743, "rewards/format_reward": 1.0, "step": 2334, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 382.09375, "epoch": 0.07100717674248874, "grad_norm": 1.1933814829009333, "kl": 0.045166015625, "learning_rate": 9.876108211074398e-07, "loss": 0.0018, "reward": 1.5807031393051147, "reward_std": 0.17194490134716034, "rewards/accuracy_reward": 0.4588281214237213, "rewards/format_reward": 1.0, "step": 2335, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 429.140625, "epoch": 0.07103758666828854, "grad_norm": 1.4223185922453216, "kl": 0.0341796875, "learning_rate": 9.876002512080449e-07, "loss": 0.0014, "reward": 1.7764413356781006, "reward_std": 0.13451741635799408, "rewards/accuracy_reward": 0.6420663595199585, "rewards/format_reward": 0.984375, "step": 2336, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 372.078125, "epoch": 0.07106799659408831, "grad_norm": 0.6744907246660446, "kl": 0.032470703125, "learning_rate": 9.875896768582973e-07, "loss": 0.0013, "reward": 1.5616071224212646, "reward_std": 0.0969124287366867, "rewards/accuracy_reward": 0.4616071581840515, "rewards/format_reward": 1.0, "step": 2337, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 367.84375, "epoch": 0.0710984065198881, "grad_norm": 1.1304911063540322, "kl": 0.033203125, "learning_rate": 9.875790980582944e-07, "loss": 0.0013, "reward": 1.7306983470916748, "reward_std": 0.15087859332561493, "rewards/accuracy_reward": 0.6056983470916748, "rewards/format_reward": 1.0, "step": 2338, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 388.09375, "epoch": 0.07112881644568787, "grad_norm": 0.6199517743156713, "kl": 0.0294189453125, "learning_rate": 9.875685148081321e-07, "loss": 0.0012, "reward": 1.8539000749588013, "reward_std": 0.07059960067272186, "rewards/accuracy_reward": 0.7070250511169434, "rewards/format_reward": 1.0, "step": 2339, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 371.84375, "epoch": 0.07115922637148765, "grad_norm": 3.0928287263719443, "kl": 0.054443359375, "learning_rate": 9.875579271079072e-07, "loss": 0.0022, "reward": 1.7377976179122925, "reward_std": 0.09587763249874115, "rewards/accuracy_reward": 0.5752975940704346, "rewards/format_reward": 1.0, "step": 2340, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 352.015625, "epoch": 0.07118963629728743, "grad_norm": 0.8860976766744139, "kl": 0.04443359375, "learning_rate": 9.875473349577163e-07, "loss": 0.0018, "reward": 1.9976170063018799, "reward_std": 0.013091685250401497, "rewards/accuracy_reward": 0.8257418274879456, "rewards/format_reward": 1.0, "step": 2341, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 368.515625, "epoch": 0.07122004622308721, "grad_norm": 1.7454497199042656, "kl": 0.03271484375, "learning_rate": 9.875367383576562e-07, "loss": 0.0013, "reward": 1.8415366411209106, "reward_std": 0.12381254881620407, "rewards/accuracy_reward": 0.6884115934371948, "rewards/format_reward": 1.0, "step": 2342, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 367.171875, "epoch": 0.071250456148887, "grad_norm": 1.3917214673059821, "kl": 0.043701171875, "learning_rate": 9.875261373078235e-07, "loss": 0.0017, "reward": 1.6809732913970947, "reward_std": 0.08920764923095703, "rewards/accuracy_reward": 0.5559731721878052, "rewards/format_reward": 1.0, "step": 2343, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 388.09375, "epoch": 0.07128086607468678, "grad_norm": 1.0896570997166224, "kl": 0.0361328125, "learning_rate": 9.875155318083153e-07, "loss": 0.0014, "reward": 1.7034600973129272, "reward_std": 0.0824342593550682, "rewards/accuracy_reward": 0.5628350973129272, "rewards/format_reward": 1.0, "step": 2344, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 357.453125, "epoch": 0.07131127600048656, "grad_norm": 1.4010655658252515, "kl": 0.041015625, "learning_rate": 9.875049218592277e-07, "loss": 0.0016, "reward": 1.7863812446594238, "reward_std": 0.2668065130710602, "rewards/accuracy_reward": 0.6520061492919922, "rewards/format_reward": 1.0, "step": 2345, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 361.921875, "epoch": 0.07134168592628634, "grad_norm": 0.6438169748067052, "kl": 0.037109375, "learning_rate": 9.87494307460658e-07, "loss": 0.0015, "reward": 1.7492815256118774, "reward_std": 0.08504504710435867, "rewards/accuracy_reward": 0.6149064302444458, "rewards/format_reward": 1.0, "step": 2346, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 398.5, "epoch": 0.07137209585208612, "grad_norm": 2.071403256386345, "kl": 0.03857421875, "learning_rate": 9.874836886127032e-07, "loss": 0.0015, "reward": 1.9620819091796875, "reward_std": 0.04787423089146614, "rewards/accuracy_reward": 0.802706778049469, "rewards/format_reward": 0.984375, "step": 2347, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 348.171875, "epoch": 0.0714025057778859, "grad_norm": 1.6185414907538251, "kl": 0.038818359375, "learning_rate": 9.8747306531546e-07, "loss": 0.0016, "reward": 1.967228651046753, "reward_std": 0.06347106397151947, "rewards/accuracy_reward": 0.7953536510467529, "rewards/format_reward": 1.0, "step": 2348, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 387.125, "epoch": 0.07143291570368568, "grad_norm": 0.18744414504786844, "kl": 0.033447265625, "learning_rate": 9.874624375690252e-07, "loss": 0.0013, "reward": 1.5758750438690186, "reward_std": 0.06229490786790848, "rewards/accuracy_reward": 0.5071249008178711, "rewards/format_reward": 0.96875, "step": 2349, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 412.078125, "epoch": 0.07146332562948546, "grad_norm": 0.9290609344598316, "kl": 0.03564453125, "learning_rate": 9.87451805373496e-07, "loss": 0.0014, "reward": 1.8494861125946045, "reward_std": 0.17398807406425476, "rewards/accuracy_reward": 0.7057360410690308, "rewards/format_reward": 1.0, "step": 2350, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.671875, "epoch": 0.07149373555528525, "grad_norm": 2.0475402304796755, "kl": 0.043212890625, "learning_rate": 9.874411687289696e-07, "loss": 0.0017, "reward": 1.849299669265747, "reward_std": 0.11398116499185562, "rewards/accuracy_reward": 0.6899245977401733, "rewards/format_reward": 1.0, "step": 2351, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 346.640625, "epoch": 0.07152414548108503, "grad_norm": 0.7994846754047048, "kl": 0.041259765625, "learning_rate": 9.87430527635543e-07, "loss": 0.0017, "reward": 1.8551548719406128, "reward_std": 0.014731397852301598, "rewards/accuracy_reward": 0.7082799077033997, "rewards/format_reward": 1.0, "step": 2352, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 368.859375, "epoch": 0.07155455540688481, "grad_norm": 0.19205016763221117, "kl": 0.036376953125, "learning_rate": 9.874198820933132e-07, "loss": 0.0015, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2353, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 361.78125, "epoch": 0.07158496533268459, "grad_norm": 0.6406138732344626, "kl": 0.04638671875, "learning_rate": 9.87409232102377e-07, "loss": 0.0019, "reward": 2.0625, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 2354, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 392.015625, "epoch": 0.07161537525848437, "grad_norm": 1.184033148557307, "kl": 0.034912109375, "learning_rate": 9.873985776628324e-07, "loss": 0.0014, "reward": 1.9982727766036987, "reward_std": 0.10642261058092117, "rewards/accuracy_reward": 0.8263977766036987, "rewards/format_reward": 1.0, "step": 2355, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 363.625, "epoch": 0.07164578518428415, "grad_norm": 0.3001301702894199, "kl": 0.0400390625, "learning_rate": 9.87387918774776e-07, "loss": 0.0016, "reward": 2.1750001907348633, "reward_std": 0.051754921674728394, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2356, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 384.265625, "epoch": 0.07167619511008393, "grad_norm": 2.241678968184171, "kl": 0.0361328125, "learning_rate": 9.873772554383055e-07, "loss": 0.0014, "reward": 1.979071021080017, "reward_std": 0.02370750904083252, "rewards/accuracy_reward": 0.8103209137916565, "rewards/format_reward": 1.0, "step": 2357, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 347.78125, "epoch": 0.07170660503588372, "grad_norm": 1.759500690501111, "kl": 0.036376953125, "learning_rate": 9.873665876535178e-07, "loss": 0.0015, "reward": 2.059232711791992, "reward_std": 0.0892532616853714, "rewards/accuracy_reward": 0.8748578429222107, "rewards/format_reward": 1.0, "step": 2358, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 333.453125, "epoch": 0.0717370149616835, "grad_norm": 1.7260963301864547, "kl": 0.0361328125, "learning_rate": 9.873559154205107e-07, "loss": 0.0014, "reward": 1.78125, "reward_std": 0.08711418509483337, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2359, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 351.734375, "epoch": 0.07176742488748328, "grad_norm": 0.3294159860598936, "kl": 0.03515625, "learning_rate": 9.873452387393815e-07, "loss": 0.0014, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2360, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 358.015625, "epoch": 0.07179783481328306, "grad_norm": 1.2387888396808895, "kl": 0.034912109375, "learning_rate": 9.873345576102275e-07, "loss": 0.0014, "reward": 1.760685920715332, "reward_std": 0.13407106697559357, "rewards/accuracy_reward": 0.6231858730316162, "rewards/format_reward": 1.0, "step": 2361, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 343.359375, "epoch": 0.07182824473908284, "grad_norm": 1.243195193673154, "kl": 0.037353515625, "learning_rate": 9.873238720331461e-07, "loss": 0.0015, "reward": 2.008072853088379, "reward_std": 0.11858770251274109, "rewards/accuracy_reward": 0.8424479365348816, "rewards/format_reward": 1.0, "step": 2362, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 399.328125, "epoch": 0.07185865466488262, "grad_norm": 0.7347466020250426, "kl": 0.036865234375, "learning_rate": 9.873131820082353e-07, "loss": 0.0015, "reward": 1.5863829851150513, "reward_std": 0.12548309564590454, "rewards/accuracy_reward": 0.4926328659057617, "rewards/format_reward": 0.984375, "step": 2363, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 339.78125, "epoch": 0.0718890645906824, "grad_norm": 2.430927093095686, "kl": 0.04443359375, "learning_rate": 9.87302487535592e-07, "loss": 0.0018, "reward": 1.8776071071624756, "reward_std": 0.1703161895275116, "rewards/accuracy_reward": 0.7307321429252625, "rewards/format_reward": 1.0, "step": 2364, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 390.328125, "epoch": 0.07191947451648217, "grad_norm": 0.726168856376468, "kl": 0.04052734375, "learning_rate": 9.872917886153143e-07, "loss": 0.0016, "reward": 1.73752760887146, "reward_std": 0.013866622932255268, "rewards/accuracy_reward": 0.5906525254249573, "rewards/format_reward": 1.0, "step": 2365, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 348.53125, "epoch": 0.07194988444228197, "grad_norm": 1.4092102397226216, "kl": 0.03857421875, "learning_rate": 9.872810852474998e-07, "loss": 0.0015, "reward": 1.7188366651535034, "reward_std": 0.14098027348518372, "rewards/accuracy_reward": 0.5907115936279297, "rewards/format_reward": 1.0, "step": 2366, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 375.140625, "epoch": 0.07198029436808175, "grad_norm": 1.008741601967172, "kl": 0.033447265625, "learning_rate": 9.872703774322459e-07, "loss": 0.0013, "reward": 1.8539319038391113, "reward_std": 0.1849345564842224, "rewards/accuracy_reward": 0.6976817846298218, "rewards/format_reward": 1.0, "step": 2367, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 400.046875, "epoch": 0.07201070429388153, "grad_norm": 1.1956523031934239, "kl": 0.049560546875, "learning_rate": 9.872596651696507e-07, "loss": 0.002, "reward": 1.8618621826171875, "reward_std": 0.10856911540031433, "rewards/accuracy_reward": 0.6899871230125427, "rewards/format_reward": 1.0, "step": 2368, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 357.359375, "epoch": 0.0720411142196813, "grad_norm": 0.7288300839194058, "kl": 0.04296875, "learning_rate": 9.872489484598118e-07, "loss": 0.0017, "reward": 1.7676124572753906, "reward_std": 0.08025988936424255, "rewards/accuracy_reward": 0.6269873976707458, "rewards/format_reward": 1.0, "step": 2369, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 406.53125, "epoch": 0.07207152414548108, "grad_norm": 0.42346943927514236, "kl": 0.027587890625, "learning_rate": 9.872382273028271e-07, "loss": 0.0011, "reward": 2.159374952316284, "reward_std": 0.07344459742307663, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "step": 2370, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 386.859375, "epoch": 0.07210193407128086, "grad_norm": 0.5160970530543679, "kl": 0.035888671875, "learning_rate": 9.87227501698794e-07, "loss": 0.0014, "reward": 1.6359111070632935, "reward_std": 0.07127352058887482, "rewards/accuracy_reward": 0.5046610832214355, "rewards/format_reward": 1.0, "step": 2371, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 371.71875, "epoch": 0.07213234399708064, "grad_norm": 0.7304731625868155, "kl": 0.037353515625, "learning_rate": 9.87216771647811e-07, "loss": 0.0015, "reward": 1.7207286357879639, "reward_std": 0.042031966149806976, "rewards/accuracy_reward": 0.5832285284996033, "rewards/format_reward": 1.0, "step": 2372, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 372.796875, "epoch": 0.07216275392288043, "grad_norm": 1.305029130028423, "kl": 0.036865234375, "learning_rate": 9.87206037149976e-07, "loss": 0.0015, "reward": 1.7617288827896118, "reward_std": 0.13543274998664856, "rewards/accuracy_reward": 0.6273539066314697, "rewards/format_reward": 1.0, "step": 2373, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 349.859375, "epoch": 0.07219316384868021, "grad_norm": 0.9322076558580428, "kl": 0.0400390625, "learning_rate": 9.871952982053864e-07, "loss": 0.0016, "reward": 1.9011987447738647, "reward_std": 0.08437976986169815, "rewards/accuracy_reward": 0.7386987209320068, "rewards/format_reward": 1.0, "step": 2374, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 371.5, "epoch": 0.07222357377448, "grad_norm": 3.781595040304778, "kl": 0.048828125, "learning_rate": 9.871845548141407e-07, "loss": 0.002, "reward": 1.8638901710510254, "reward_std": 0.09927506744861603, "rewards/accuracy_reward": 0.7138901948928833, "rewards/format_reward": 1.0, "step": 2375, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 352.109375, "epoch": 0.07225398370027977, "grad_norm": 0.9940906767625175, "kl": 0.04736328125, "learning_rate": 9.871738069763369e-07, "loss": 0.0019, "reward": 1.9334869384765625, "reward_std": 0.0318918414413929, "rewards/accuracy_reward": 0.7678618431091309, "rewards/format_reward": 1.0, "step": 2376, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 396.0, "epoch": 0.07228439362607955, "grad_norm": 1.9071408267644623, "kl": 0.052490234375, "learning_rate": 9.87163054692073e-07, "loss": 0.0021, "reward": 1.8087539672851562, "reward_std": 0.14542332291603088, "rewards/accuracy_reward": 0.6681289672851562, "rewards/format_reward": 0.96875, "step": 2377, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 376.890625, "epoch": 0.07231480355187933, "grad_norm": 0.27206154736830773, "kl": 0.037841796875, "learning_rate": 9.87152297961447e-07, "loss": 0.0015, "reward": 2.1061627864837646, "reward_std": 0.005692124832421541, "rewards/accuracy_reward": 0.9311627149581909, "rewards/format_reward": 1.0, "step": 2378, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 342.234375, "epoch": 0.07234521347767911, "grad_norm": 1.3631018169865634, "kl": 0.03515625, "learning_rate": 9.871415367845576e-07, "loss": 0.0014, "reward": 1.9156250953674316, "reward_std": 0.13130834698677063, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 2379, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 435.15625, "epoch": 0.07237562340347889, "grad_norm": 2.0533319360748763, "kl": 0.03662109375, "learning_rate": 9.871307711615025e-07, "loss": 0.0015, "reward": 1.707545518875122, "reward_std": 0.06655619293451309, "rewards/accuracy_reward": 0.5606704354286194, "rewards/format_reward": 1.0, "step": 2380, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 345.625, "epoch": 0.07240603332927868, "grad_norm": 0.9042574155831559, "kl": 0.047607421875, "learning_rate": 9.871200010923802e-07, "loss": 0.0019, "reward": 1.7468749284744263, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2381, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 372.484375, "epoch": 0.07243644325507846, "grad_norm": 1.053134833408383, "kl": 0.04931640625, "learning_rate": 9.871092265772886e-07, "loss": 0.002, "reward": 1.7778041362762451, "reward_std": 0.09956423938274384, "rewards/accuracy_reward": 0.6246791481971741, "rewards/format_reward": 1.0, "step": 2382, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 361.015625, "epoch": 0.07246685318087824, "grad_norm": 1.411380564106867, "kl": 0.04248046875, "learning_rate": 9.870984476163266e-07, "loss": 0.0017, "reward": 1.9268344640731812, "reward_std": 0.06711134314537048, "rewards/accuracy_reward": 0.7612094879150391, "rewards/format_reward": 1.0, "step": 2383, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 379.734375, "epoch": 0.07249726310667802, "grad_norm": 1.7399343535562768, "kl": 0.037109375, "learning_rate": 9.870876642095924e-07, "loss": 0.0015, "reward": 1.6735461950302124, "reward_std": 0.20593903958797455, "rewards/accuracy_reward": 0.5391713380813599, "rewards/format_reward": 1.0, "step": 2384, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 363.46875, "epoch": 0.0725276730324778, "grad_norm": 1.2494202082179664, "kl": 0.039794921875, "learning_rate": 9.870768763571845e-07, "loss": 0.0016, "reward": 1.6788194179534912, "reward_std": 0.2000548392534256, "rewards/accuracy_reward": 0.5538194179534912, "rewards/format_reward": 1.0, "step": 2385, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 364.453125, "epoch": 0.07255808295827758, "grad_norm": 0.7192996518087594, "kl": 0.03515625, "learning_rate": 9.870660840592009e-07, "loss": 0.0014, "reward": 1.725977897644043, "reward_std": 0.016961591318249702, "rewards/accuracy_reward": 0.5791028738021851, "rewards/format_reward": 1.0, "step": 2386, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 409.828125, "epoch": 0.07258849288407736, "grad_norm": 1.1885091410867556, "kl": 0.030029296875, "learning_rate": 9.870552873157406e-07, "loss": 0.0012, "reward": 1.8222078084945679, "reward_std": 0.10220152884721756, "rewards/accuracy_reward": 0.684707760810852, "rewards/format_reward": 1.0, "step": 2387, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 341.78125, "epoch": 0.07261890280987715, "grad_norm": 0.5542245890413094, "kl": 0.041259765625, "learning_rate": 9.870444861269019e-07, "loss": 0.0016, "reward": 2.0445313453674316, "reward_std": 0.0022097111213952303, "rewards/accuracy_reward": 0.8695312142372131, "rewards/format_reward": 1.0, "step": 2388, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 382.671875, "epoch": 0.07264931273567693, "grad_norm": 0.789375085898076, "kl": 0.03125, "learning_rate": 9.870336804927835e-07, "loss": 0.0012, "reward": 1.931337594985962, "reward_std": 0.08027233928442001, "rewards/accuracy_reward": 0.768837571144104, "rewards/format_reward": 1.0, "step": 2389, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 365.34375, "epoch": 0.07267972266147671, "grad_norm": 1.4420029424731065, "kl": 0.03515625, "learning_rate": 9.870228704134838e-07, "loss": 0.0014, "reward": 1.7843749523162842, "reward_std": 0.305536687374115, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 2390, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 381.828125, "epoch": 0.07271013258727649, "grad_norm": 1.3907322998654708, "kl": 0.037109375, "learning_rate": 9.87012055889102e-07, "loss": 0.0015, "reward": 1.6607921123504639, "reward_std": 0.3719618320465088, "rewards/accuracy_reward": 0.576417088508606, "rewards/format_reward": 0.984375, "step": 2391, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 352.421875, "epoch": 0.07274054251307627, "grad_norm": 1.0067768056594717, "kl": 0.040283203125, "learning_rate": 9.87001236919736e-07, "loss": 0.0016, "reward": 1.801041603088379, "reward_std": 0.09032879024744034, "rewards/accuracy_reward": 0.6697916984558105, "rewards/format_reward": 1.0, "step": 2392, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 388.171875, "epoch": 0.07277095243887605, "grad_norm": 3.321728941018141, "kl": 0.03515625, "learning_rate": 9.869904135054852e-07, "loss": 0.0014, "reward": 1.7820522785186768, "reward_std": 0.08742889016866684, "rewards/accuracy_reward": 0.6164272427558899, "rewards/format_reward": 1.0, "step": 2393, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 365.0625, "epoch": 0.07280136236467583, "grad_norm": 1.1074975879512672, "kl": 0.03662109375, "learning_rate": 9.869795856464483e-07, "loss": 0.0015, "reward": 2.0538861751556396, "reward_std": 0.19253724813461304, "rewards/accuracy_reward": 0.8788861036300659, "rewards/format_reward": 1.0, "step": 2394, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 344.9375, "epoch": 0.0728317722904756, "grad_norm": 0.06032410292837829, "kl": 0.038818359375, "learning_rate": 9.869687533427238e-07, "loss": 0.0015, "reward": 1.600000023841858, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2395, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 373.015625, "epoch": 0.0728621822162754, "grad_norm": 0.5321502698427486, "kl": 0.04296875, "learning_rate": 9.86957916594411e-07, "loss": 0.0017, "reward": 1.5008922815322876, "reward_std": 0.04707634076476097, "rewards/accuracy_reward": 0.40089231729507446, "rewards/format_reward": 1.0, "step": 2396, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 355.578125, "epoch": 0.07289259214207518, "grad_norm": 1.8741475856880374, "kl": 0.048095703125, "learning_rate": 9.869470754016084e-07, "loss": 0.0019, "reward": 1.8517272472381592, "reward_std": 0.09261726588010788, "rewards/accuracy_reward": 0.6954772472381592, "rewards/format_reward": 1.0, "step": 2397, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 368.640625, "epoch": 0.07292300206787496, "grad_norm": 1.2045149246166644, "kl": 0.044189453125, "learning_rate": 9.86936229764415e-07, "loss": 0.0018, "reward": 1.7932454347610474, "reward_std": 0.06748578697443008, "rewards/accuracy_reward": 0.6494954824447632, "rewards/format_reward": 1.0, "step": 2398, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 356.140625, "epoch": 0.07295341199367474, "grad_norm": 0.7142624481867306, "kl": 0.035888671875, "learning_rate": 9.869253796829301e-07, "loss": 0.0014, "reward": 1.7468750476837158, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2399, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 377.203125, "epoch": 0.07298382191947451, "grad_norm": 1.181274004680584, "kl": 0.0439453125, "learning_rate": 9.869145251572526e-07, "loss": 0.0018, "reward": 1.4649507999420166, "reward_std": 0.12162395566701889, "rewards/accuracy_reward": 0.3774508237838745, "rewards/format_reward": 1.0, "step": 2400, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 339.515625, "epoch": 0.0730142318452743, "grad_norm": 1.0074126329381317, "kl": 0.04150390625, "learning_rate": 9.869036661874814e-07, "loss": 0.0017, "reward": 1.9090144634246826, "reward_std": 0.03275389224290848, "rewards/accuracy_reward": 0.7590144276618958, "rewards/format_reward": 1.0, "step": 2401, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 355.6875, "epoch": 0.07304464177107407, "grad_norm": 1.4011800320960943, "kl": 0.04541015625, "learning_rate": 9.868928027737159e-07, "loss": 0.0018, "reward": 1.8778297901153564, "reward_std": 0.14642588794231415, "rewards/accuracy_reward": 0.7122048139572144, "rewards/format_reward": 1.0, "step": 2402, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 376.84375, "epoch": 0.07307505169687387, "grad_norm": 0.7152743718494116, "kl": 0.03564453125, "learning_rate": 9.868819349160548e-07, "loss": 0.0014, "reward": 1.9742333889007568, "reward_std": 0.0811690092086792, "rewards/accuracy_reward": 0.8117333054542542, "rewards/format_reward": 1.0, "step": 2403, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 364.9375, "epoch": 0.07310546162267365, "grad_norm": 2.224080701981514, "kl": 0.042236328125, "learning_rate": 9.868710626145977e-07, "loss": 0.0017, "reward": 1.6684293746948242, "reward_std": 0.16979125142097473, "rewards/accuracy_reward": 0.5434294939041138, "rewards/format_reward": 1.0, "step": 2404, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 357.734375, "epoch": 0.07313587154847342, "grad_norm": 0.8129115621368378, "kl": 0.043212890625, "learning_rate": 9.868601858694437e-07, "loss": 0.0017, "reward": 1.9687224626541138, "reward_std": 0.07925340533256531, "rewards/accuracy_reward": 0.796847403049469, "rewards/format_reward": 1.0, "step": 2405, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 380.4375, "epoch": 0.0731662814742732, "grad_norm": 0.4888095820681017, "kl": 0.04833984375, "learning_rate": 9.868493046806921e-07, "loss": 0.0019, "reward": 1.9721442461013794, "reward_std": 0.02054297924041748, "rewards/accuracy_reward": 0.8002692461013794, "rewards/format_reward": 1.0, "step": 2406, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 356.046875, "epoch": 0.07319669140007298, "grad_norm": 0.900442763788821, "kl": 0.03955078125, "learning_rate": 9.868384190484422e-07, "loss": 0.0016, "reward": 1.803674578666687, "reward_std": 0.05646608769893646, "rewards/accuracy_reward": 0.6567996144294739, "rewards/format_reward": 1.0, "step": 2407, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 379.75, "epoch": 0.07322710132587276, "grad_norm": 1.04362222171391, "kl": 0.035400390625, "learning_rate": 9.868275289727931e-07, "loss": 0.0014, "reward": 1.8784642219543457, "reward_std": 0.03106926567852497, "rewards/accuracy_reward": 0.7159643173217773, "rewards/format_reward": 1.0, "step": 2408, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 368.203125, "epoch": 0.07325751125167254, "grad_norm": 0.9722738501812692, "kl": 0.0341796875, "learning_rate": 9.868166344538447e-07, "loss": 0.0014, "reward": 2.010758876800537, "reward_std": 0.10203946381807327, "rewards/accuracy_reward": 0.8263838887214661, "rewards/format_reward": 1.0, "step": 2409, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 373.578125, "epoch": 0.07328792117747232, "grad_norm": 0.848783283002575, "kl": 0.03125, "learning_rate": 9.86805735491696e-07, "loss": 0.0013, "reward": 1.890326738357544, "reward_std": 0.008600450120866299, "rewards/accuracy_reward": 0.7153265476226807, "rewards/format_reward": 1.0, "step": 2410, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 349.03125, "epoch": 0.07331833110327211, "grad_norm": 0.44144534797573287, "kl": 0.043701171875, "learning_rate": 9.867948320864468e-07, "loss": 0.0017, "reward": 1.8229761123657227, "reward_std": 0.010387420654296875, "rewards/accuracy_reward": 0.67610102891922, "rewards/format_reward": 1.0, "step": 2411, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 365.3125, "epoch": 0.07334874102907189, "grad_norm": 1.4943415954741863, "kl": 0.03173828125, "learning_rate": 9.867839242381964e-07, "loss": 0.0013, "reward": 1.769737958908081, "reward_std": 0.08797435462474823, "rewards/accuracy_reward": 0.6384879946708679, "rewards/format_reward": 1.0, "step": 2412, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 349.90625, "epoch": 0.07337915095487167, "grad_norm": 0.6053844622722483, "kl": 0.038818359375, "learning_rate": 9.867730119470443e-07, "loss": 0.0016, "reward": 2.082291603088379, "reward_std": 0.09599603712558746, "rewards/accuracy_reward": 0.9010416865348816, "rewards/format_reward": 1.0, "step": 2413, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 338.9375, "epoch": 0.07340956088067145, "grad_norm": 0.047609186335330136, "kl": 0.041015625, "learning_rate": 9.867620952130903e-07, "loss": 0.0016, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2414, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 365.1875, "epoch": 0.07343997080647123, "grad_norm": 1.14244099142403, "kl": 0.04931640625, "learning_rate": 9.86751174036434e-07, "loss": 0.002, "reward": 1.7373353242874146, "reward_std": 0.04766164720058441, "rewards/accuracy_reward": 0.5685853958129883, "rewards/format_reward": 1.0, "step": 2415, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 368.203125, "epoch": 0.07347038073227101, "grad_norm": 2.6817091003209006, "kl": 0.034423828125, "learning_rate": 9.86740248417175e-07, "loss": 0.0014, "reward": 1.9533547163009644, "reward_std": 0.01623806357383728, "rewards/accuracy_reward": 0.7533547282218933, "rewards/format_reward": 1.0, "step": 2416, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 357.828125, "epoch": 0.07350079065807079, "grad_norm": 0.5245537740639242, "kl": 0.0390625, "learning_rate": 9.867293183554128e-07, "loss": 0.0016, "reward": 1.827418327331543, "reward_std": 0.0031466642394661903, "rewards/accuracy_reward": 0.6774182915687561, "rewards/format_reward": 1.0, "step": 2417, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 374.765625, "epoch": 0.07353120058387058, "grad_norm": 1.3460949967649667, "kl": 0.04638671875, "learning_rate": 9.867183838512477e-07, "loss": 0.0019, "reward": 1.7104425430297852, "reward_std": 0.11293171346187592, "rewards/accuracy_reward": 0.5573174953460693, "rewards/format_reward": 1.0, "step": 2418, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 372.046875, "epoch": 0.07356161050967036, "grad_norm": 1.7305739883484743, "kl": 0.04150390625, "learning_rate": 9.867074449047791e-07, "loss": 0.0017, "reward": 1.7051334381103516, "reward_std": 0.00995232630521059, "rewards/accuracy_reward": 0.5551333427429199, "rewards/format_reward": 1.0, "step": 2419, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 381.609375, "epoch": 0.07359202043547014, "grad_norm": 1.4617132159168726, "kl": 0.035400390625, "learning_rate": 9.86696501516107e-07, "loss": 0.0014, "reward": 1.825235366821289, "reward_std": 0.14546802639961243, "rewards/accuracy_reward": 0.6721101999282837, "rewards/format_reward": 1.0, "step": 2420, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 351.28125, "epoch": 0.07362243036126992, "grad_norm": 0.8256672478524869, "kl": 0.029541015625, "learning_rate": 9.866855536853312e-07, "loss": 0.0012, "reward": 1.7546409368515015, "reward_std": 0.015134266577661037, "rewards/accuracy_reward": 0.6077659130096436, "rewards/format_reward": 1.0, "step": 2421, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 369.0, "epoch": 0.0736528402870697, "grad_norm": 1.9125822630779419, "kl": 0.0322265625, "learning_rate": 9.866746014125515e-07, "loss": 0.0013, "reward": 1.5, "reward_std": 0.08256310969591141, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 2422, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 342.96875, "epoch": 0.07368325021286948, "grad_norm": 0.4376545276079412, "kl": 0.042236328125, "learning_rate": 9.866636446978682e-07, "loss": 0.0017, "reward": 2.0044641494750977, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.8325892686843872, "rewards/format_reward": 1.0, "step": 2423, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 366.078125, "epoch": 0.07371366013866926, "grad_norm": 2.040210001686647, "kl": 0.04052734375, "learning_rate": 9.866526835413811e-07, "loss": 0.0016, "reward": 1.6171314716339111, "reward_std": 0.02809916064143181, "rewards/accuracy_reward": 0.473381370306015, "rewards/format_reward": 1.0, "step": 2424, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 388.421875, "epoch": 0.07374407006446904, "grad_norm": 1.4886639303523426, "kl": 0.04248046875, "learning_rate": 9.866417179431903e-07, "loss": 0.0017, "reward": 1.590973138809204, "reward_std": 0.18717995285987854, "rewards/accuracy_reward": 0.5534729957580566, "rewards/format_reward": 0.9375, "step": 2425, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 338.46875, "epoch": 0.07377447999026883, "grad_norm": 2.464349857867224, "kl": 0.03955078125, "learning_rate": 9.866307479033954e-07, "loss": 0.0016, "reward": 1.96875, "reward_std": 0.08611097186803818, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2426, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 340.3125, "epoch": 0.07380488991606861, "grad_norm": 4.064602911142833, "kl": 0.034912109375, "learning_rate": 9.866197734220975e-07, "loss": 0.0014, "reward": 2.0093750953674316, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2427, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 387.375, "epoch": 0.07383529984186839, "grad_norm": 0.901584615767375, "kl": 0.038330078125, "learning_rate": 9.86608794499396e-07, "loss": 0.0015, "reward": 1.4975647926330566, "reward_std": 0.2287033200263977, "rewards/accuracy_reward": 0.406939834356308, "rewards/format_reward": 1.0, "step": 2428, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 356.453125, "epoch": 0.07386570976766817, "grad_norm": 0.6777475619072667, "kl": 0.03759765625, "learning_rate": 9.865978111353914e-07, "loss": 0.0015, "reward": 2.0374999046325684, "reward_std": 0.029250433668494225, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2429, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 389.0, "epoch": 0.07389611969346795, "grad_norm": 1.550111489281346, "kl": 0.0458984375, "learning_rate": 9.865868233301838e-07, "loss": 0.0018, "reward": 1.83012056350708, "reward_std": 0.10616391152143478, "rewards/accuracy_reward": 0.6832455396652222, "rewards/format_reward": 1.0, "step": 2430, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 378.25, "epoch": 0.07392652961926773, "grad_norm": 0.9195488957637191, "kl": 0.035888671875, "learning_rate": 9.865758310838737e-07, "loss": 0.0014, "reward": 2.0080654621124268, "reward_std": 0.06260258704423904, "rewards/accuracy_reward": 0.8111904263496399, "rewards/format_reward": 1.0, "step": 2431, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 383.390625, "epoch": 0.0739569395450675, "grad_norm": 1.2083513160361266, "kl": 0.0390625, "learning_rate": 9.86564834396561e-07, "loss": 0.0016, "reward": 1.6922341585159302, "reward_std": 0.09212476760149002, "rewards/accuracy_reward": 0.5734841227531433, "rewards/format_reward": 1.0, "step": 2432, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 356.515625, "epoch": 0.0739873494708673, "grad_norm": 2.318027168455351, "kl": 0.04296875, "learning_rate": 9.865538332683465e-07, "loss": 0.0017, "reward": 1.7383453845977783, "reward_std": 0.230462446808815, "rewards/accuracy_reward": 0.6102203726768494, "rewards/format_reward": 1.0, "step": 2433, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 357.015625, "epoch": 0.07401775939666708, "grad_norm": 1.3237742822388896, "kl": 0.03515625, "learning_rate": 9.865428276993305e-07, "loss": 0.0014, "reward": 1.816334843635559, "reward_std": 0.1322784125804901, "rewards/accuracy_reward": 0.6600847244262695, "rewards/format_reward": 1.0, "step": 2434, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 369.84375, "epoch": 0.07404816932246686, "grad_norm": 1.0224552754149703, "kl": 0.03271484375, "learning_rate": 9.865318176896137e-07, "loss": 0.0013, "reward": 2.000845432281494, "reward_std": 0.088492751121521, "rewards/accuracy_reward": 0.8102203607559204, "rewards/format_reward": 1.0, "step": 2435, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 413.859375, "epoch": 0.07407857924826663, "grad_norm": 1.1050400285244213, "kl": 0.050537109375, "learning_rate": 9.865208032392959e-07, "loss": 0.002, "reward": 1.7133132219314575, "reward_std": 0.12883175909519196, "rewards/accuracy_reward": 0.5695631504058838, "rewards/format_reward": 1.0, "step": 2436, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 367.203125, "epoch": 0.07410898917406641, "grad_norm": 0.6038902397687032, "kl": 0.050048828125, "learning_rate": 9.865097843484782e-07, "loss": 0.002, "reward": 1.8265496492385864, "reward_std": 0.060750119388103485, "rewards/accuracy_reward": 0.6859246492385864, "rewards/format_reward": 1.0, "step": 2437, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 398.625, "epoch": 0.0741393990998662, "grad_norm": 1.2487994778651919, "kl": 0.041748046875, "learning_rate": 9.86498761017261e-07, "loss": 0.0017, "reward": 1.7737808227539062, "reward_std": 0.1029369905591011, "rewards/accuracy_reward": 0.633155882358551, "rewards/format_reward": 1.0, "step": 2438, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 364.953125, "epoch": 0.07416980902566597, "grad_norm": 3.0184637238313567, "kl": 0.042724609375, "learning_rate": 9.864877332457447e-07, "loss": 0.0017, "reward": 1.7953405380249023, "reward_std": 0.013075281865894794, "rewards/accuracy_reward": 0.6547154188156128, "rewards/format_reward": 1.0, "step": 2439, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.5, "completion_length": 378.515625, "epoch": 0.07420021895146575, "grad_norm": 0.6273756324300105, "kl": 0.033935546875, "learning_rate": 9.864767010340303e-07, "loss": 0.0014, "reward": 1.5625, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 2440, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 371.625, "epoch": 0.07423062887726554, "grad_norm": 0.7182830683150857, "kl": 0.043701171875, "learning_rate": 9.864656643822185e-07, "loss": 0.0017, "reward": 1.7364407777786255, "reward_std": 0.012584841810166836, "rewards/accuracy_reward": 0.5864406228065491, "rewards/format_reward": 1.0, "step": 2441, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 354.6875, "epoch": 0.07426103880306532, "grad_norm": 0.064951927096182, "kl": 0.03515625, "learning_rate": 9.864546232904097e-07, "loss": 0.0014, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 2442, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 377.109375, "epoch": 0.0742914487288651, "grad_norm": 0.289331297116656, "kl": 0.033447265625, "learning_rate": 9.86443577758705e-07, "loss": 0.0013, "reward": 1.975000023841858, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2443, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 380.3125, "epoch": 0.07432185865466488, "grad_norm": 0.8101507871600769, "kl": 0.03466796875, "learning_rate": 9.86432527787205e-07, "loss": 0.0014, "reward": 1.5771369934082031, "reward_std": 0.17012745141983032, "rewards/accuracy_reward": 0.4615119695663452, "rewards/format_reward": 1.0, "step": 2444, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 362.15625, "epoch": 0.07435226858046466, "grad_norm": 1.4048447865835865, "kl": 0.037353515625, "learning_rate": 9.864214733760105e-07, "loss": 0.0015, "reward": 1.9670672416687012, "reward_std": 0.05414704233407974, "rewards/accuracy_reward": 0.792067289352417, "rewards/format_reward": 1.0, "step": 2445, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 377.296875, "epoch": 0.07438267850626444, "grad_norm": 1.5141240510946599, "kl": 0.03662109375, "learning_rate": 9.864104145252228e-07, "loss": 0.0015, "reward": 1.74039626121521, "reward_std": 0.14393679797649384, "rewards/accuracy_reward": 0.6060212254524231, "rewards/format_reward": 1.0, "step": 2446, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 382.96875, "epoch": 0.07441308843206422, "grad_norm": 1.1626758733908558, "kl": 0.0419921875, "learning_rate": 9.863993512349422e-07, "loss": 0.0017, "reward": 1.73267662525177, "reward_std": 0.18755152821540833, "rewards/accuracy_reward": 0.5983015298843384, "rewards/format_reward": 1.0, "step": 2447, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 375.3125, "epoch": 0.07444349835786401, "grad_norm": 0.3516838495711569, "kl": 0.030517578125, "learning_rate": 9.863882835052703e-07, "loss": 0.0012, "reward": 1.7468750476837158, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2448, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 372.09375, "epoch": 0.07447390828366379, "grad_norm": 1.8457732712207748, "kl": 0.047119140625, "learning_rate": 9.863772113363079e-07, "loss": 0.0019, "reward": 1.8109742403030396, "reward_std": 0.09137019515037537, "rewards/accuracy_reward": 0.657849133014679, "rewards/format_reward": 1.0, "step": 2449, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 375.5, "epoch": 0.07450431820946357, "grad_norm": 0.9893721340588104, "kl": 0.04150390625, "learning_rate": 9.863661347281559e-07, "loss": 0.0017, "reward": 1.7946966886520386, "reward_std": 0.012107347138226032, "rewards/accuracy_reward": 0.6540716886520386, "rewards/format_reward": 1.0, "step": 2450, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 355.96875, "epoch": 0.07453472813526335, "grad_norm": 1.1707529776595416, "kl": 0.03271484375, "learning_rate": 9.863550536809155e-07, "loss": 0.0013, "reward": 1.6749060153961182, "reward_std": 0.15124866366386414, "rewards/accuracy_reward": 0.5530309081077576, "rewards/format_reward": 1.0, "step": 2451, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 346.421875, "epoch": 0.07456513806106313, "grad_norm": 0.3316130915493195, "kl": 0.039794921875, "learning_rate": 9.863439681946876e-07, "loss": 0.0016, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 2452, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 359.015625, "epoch": 0.07459554798686291, "grad_norm": 1.1169364165201192, "kl": 0.0361328125, "learning_rate": 9.86332878269574e-07, "loss": 0.0014, "reward": 1.777503490447998, "reward_std": 0.09107542037963867, "rewards/accuracy_reward": 0.6275034546852112, "rewards/format_reward": 1.0, "step": 2453, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 368.265625, "epoch": 0.07462595791266269, "grad_norm": 2.2794141345578227, "kl": 0.057861328125, "learning_rate": 9.863217839056753e-07, "loss": 0.0023, "reward": 1.635258674621582, "reward_std": 0.17192363739013672, "rewards/accuracy_reward": 0.5321337580680847, "rewards/format_reward": 1.0, "step": 2454, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 349.484375, "epoch": 0.07465636783846247, "grad_norm": 1.0567344106151233, "kl": 0.05029296875, "learning_rate": 9.863106851030931e-07, "loss": 0.002, "reward": 1.9281251430511475, "reward_std": 0.11234867572784424, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 2455, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 336.734375, "epoch": 0.07468677776426226, "grad_norm": 2.044713339948245, "kl": 0.043701171875, "learning_rate": 9.862995818619285e-07, "loss": 0.0017, "reward": 1.8125, "reward_std": 0.10942880809307098, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 2456, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 348.9375, "epoch": 0.07471718769006204, "grad_norm": 1.154382217205233, "kl": 0.037353515625, "learning_rate": 9.86288474182283e-07, "loss": 0.0015, "reward": 1.718045711517334, "reward_std": 0.06686752289533615, "rewards/accuracy_reward": 0.5805457830429077, "rewards/format_reward": 1.0, "step": 2457, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 359.765625, "epoch": 0.07474759761586182, "grad_norm": 0.7270482487687907, "kl": 0.047119140625, "learning_rate": 9.862773620642579e-07, "loss": 0.0019, "reward": 1.9517834186553955, "reward_std": 0.024797961115837097, "rewards/accuracy_reward": 0.7924084663391113, "rewards/format_reward": 1.0, "step": 2458, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 372.609375, "epoch": 0.0747780075416616, "grad_norm": 0.5208613711512166, "kl": 0.0341796875, "learning_rate": 9.862662455079547e-07, "loss": 0.0014, "reward": 1.615625023841858, "reward_std": 0.21106873452663422, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 2459, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 375.34375, "epoch": 0.07480841746746138, "grad_norm": 1.0840508866033822, "kl": 0.054443359375, "learning_rate": 9.862551245134748e-07, "loss": 0.0022, "reward": 1.8168368339538574, "reward_std": 0.13141664862632751, "rewards/accuracy_reward": 0.6699618697166443, "rewards/format_reward": 1.0, "step": 2460, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 372.359375, "epoch": 0.07483882739326116, "grad_norm": 1.7590025770012876, "kl": 0.040283203125, "learning_rate": 9.862439990809194e-07, "loss": 0.0016, "reward": 1.701322078704834, "reward_std": 0.14187799394130707, "rewards/accuracy_reward": 0.5700720548629761, "rewards/format_reward": 1.0, "step": 2461, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 387.109375, "epoch": 0.07486923731906094, "grad_norm": 0.7714404109126028, "kl": 0.031982421875, "learning_rate": 9.862328692103906e-07, "loss": 0.0013, "reward": 1.7373809814453125, "reward_std": 0.10618994385004044, "rewards/accuracy_reward": 0.6030058860778809, "rewards/format_reward": 1.0, "step": 2462, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 359.921875, "epoch": 0.07489964724486073, "grad_norm": 0.4179774930762378, "kl": 0.04052734375, "learning_rate": 9.862217349019898e-07, "loss": 0.0016, "reward": 1.9695611000061035, "reward_std": 0.007892224006354809, "rewards/accuracy_reward": 0.7945610284805298, "rewards/format_reward": 1.0, "step": 2463, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 360.765625, "epoch": 0.07493005717066051, "grad_norm": 0.8514652690839146, "kl": 0.04443359375, "learning_rate": 9.862105961558181e-07, "loss": 0.0018, "reward": 1.8496713638305664, "reward_std": 0.08928145468235016, "rewards/accuracy_reward": 0.6871712803840637, "rewards/format_reward": 1.0, "step": 2464, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 360.171875, "epoch": 0.07496046709646029, "grad_norm": 1.8925500168398144, "kl": 0.049560546875, "learning_rate": 9.861994529719782e-07, "loss": 0.002, "reward": 1.9960846900939941, "reward_std": 0.08680392801761627, "rewards/accuracy_reward": 0.80545973777771, "rewards/format_reward": 1.0, "step": 2465, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 367.109375, "epoch": 0.07499087702226007, "grad_norm": 2.2419958708900634, "kl": 0.044677734375, "learning_rate": 9.861883053505709e-07, "loss": 0.0018, "reward": 1.7002004384994507, "reward_std": 0.09157991409301758, "rewards/accuracy_reward": 0.5689502954483032, "rewards/format_reward": 1.0, "step": 2466, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 355.40625, "epoch": 0.07502128694805985, "grad_norm": 1.5234000671636032, "kl": 0.05419921875, "learning_rate": 9.861771532916982e-07, "loss": 0.0022, "reward": 2.0240349769592285, "reward_std": 0.15204796195030212, "rewards/accuracy_reward": 0.8490350246429443, "rewards/format_reward": 1.0, "step": 2467, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 364.140625, "epoch": 0.07505169687385962, "grad_norm": 1.3566997573333124, "kl": 0.0537109375, "learning_rate": 9.86165996795462e-07, "loss": 0.0022, "reward": 1.8291666507720947, "reward_std": 0.08725392818450928, "rewards/accuracy_reward": 0.6760416626930237, "rewards/format_reward": 1.0, "step": 2468, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 360.84375, "epoch": 0.0750821067996594, "grad_norm": 1.6983741691325775, "kl": 0.04736328125, "learning_rate": 9.861548358619642e-07, "loss": 0.0019, "reward": 2.0466551780700684, "reward_std": 0.026137743145227432, "rewards/accuracy_reward": 0.846655011177063, "rewards/format_reward": 1.0, "step": 2469, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 369.9375, "epoch": 0.07511251672545918, "grad_norm": 1.489412317472571, "kl": 0.0478515625, "learning_rate": 9.861436704913064e-07, "loss": 0.0019, "reward": 1.935652494430542, "reward_std": 0.1458202600479126, "rewards/accuracy_reward": 0.7512774467468262, "rewards/format_reward": 1.0, "step": 2470, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 351.859375, "epoch": 0.07514292665125898, "grad_norm": 5.077931611877617, "kl": 0.05712890625, "learning_rate": 9.861325006835907e-07, "loss": 0.0023, "reward": 1.911177635192871, "reward_std": 0.154936283826828, "rewards/accuracy_reward": 0.7393025159835815, "rewards/format_reward": 0.984375, "step": 2471, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 378.515625, "epoch": 0.07517333657705876, "grad_norm": 0.7860913426598098, "kl": 0.045654296875, "learning_rate": 9.86121326438919e-07, "loss": 0.0018, "reward": 1.776305079460144, "reward_std": 0.07419613003730774, "rewards/accuracy_reward": 0.641930103302002, "rewards/format_reward": 1.0, "step": 2472, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 376.953125, "epoch": 0.07520374650285853, "grad_norm": 8.485478521573564, "kl": 0.041748046875, "learning_rate": 9.861101477573932e-07, "loss": 0.0017, "reward": 1.8207122087478638, "reward_std": 0.13071081042289734, "rewards/accuracy_reward": 0.680087149143219, "rewards/format_reward": 1.0, "step": 2473, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 352.03125, "epoch": 0.07523415642865831, "grad_norm": 0.07271371098965215, "kl": 0.046875, "learning_rate": 9.860989646391154e-07, "loss": 0.0019, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 2474, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 365.1875, "epoch": 0.07526456635445809, "grad_norm": 0.5426966961277859, "kl": 0.04833984375, "learning_rate": 9.860877770841877e-07, "loss": 0.0019, "reward": 1.9754856824874878, "reward_std": 0.0027033109217882156, "rewards/accuracy_reward": 0.8004856109619141, "rewards/format_reward": 1.0, "step": 2475, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 344.0, "epoch": 0.07529497628025787, "grad_norm": 0.05713668037224509, "kl": 0.038330078125, "learning_rate": 9.860765850927123e-07, "loss": 0.0015, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2476, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 359.046875, "epoch": 0.07532538620605765, "grad_norm": 1.1855176625270283, "kl": 0.0625, "learning_rate": 9.86065388664791e-07, "loss": 0.0025, "reward": 1.717117428779602, "reward_std": 0.07998567819595337, "rewards/accuracy_reward": 0.5858674049377441, "rewards/format_reward": 1.0, "step": 2477, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 358.015625, "epoch": 0.07535579613185744, "grad_norm": 0.6243011914501678, "kl": 0.035400390625, "learning_rate": 9.860541878005265e-07, "loss": 0.0014, "reward": 1.7988369464874268, "reward_std": 0.005458469968289137, "rewards/accuracy_reward": 0.6488368511199951, "rewards/format_reward": 1.0, "step": 2478, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 372.171875, "epoch": 0.07538620605765722, "grad_norm": 0.5188317705038367, "kl": 0.034423828125, "learning_rate": 9.860429825000205e-07, "loss": 0.0014, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "step": 2479, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 377.078125, "epoch": 0.075416615983457, "grad_norm": 1.4250837798306268, "kl": 0.045654296875, "learning_rate": 9.860317727633755e-07, "loss": 0.0018, "reward": 1.8872348070144653, "reward_std": 0.1437542587518692, "rewards/accuracy_reward": 0.7372347712516785, "rewards/format_reward": 1.0, "step": 2480, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 386.703125, "epoch": 0.07544702590925678, "grad_norm": 0.9946974212222288, "kl": 0.0390625, "learning_rate": 9.86020558590694e-07, "loss": 0.0016, "reward": 1.620572566986084, "reward_std": 0.16380320489406586, "rewards/accuracy_reward": 0.5018225908279419, "rewards/format_reward": 1.0, "step": 2481, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 370.84375, "epoch": 0.07547743583505656, "grad_norm": 0.953768195585671, "kl": 0.040283203125, "learning_rate": 9.860093399820782e-07, "loss": 0.0016, "reward": 1.8818761110305786, "reward_std": 0.025369292125105858, "rewards/accuracy_reward": 0.713126003742218, "rewards/format_reward": 1.0, "step": 2482, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 355.5, "epoch": 0.07550784576085634, "grad_norm": 1.0774331163796875, "kl": 0.04052734375, "learning_rate": 9.859981169376306e-07, "loss": 0.0016, "reward": 2.023257255554199, "reward_std": 0.15300512313842773, "rewards/accuracy_reward": 0.8357572555541992, "rewards/format_reward": 1.0, "step": 2483, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 359.609375, "epoch": 0.07553825568665612, "grad_norm": 0.5521875415032471, "kl": 0.041259765625, "learning_rate": 9.859868894574532e-07, "loss": 0.0016, "reward": 1.975000023841858, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 2484, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 353.015625, "epoch": 0.07556866561245591, "grad_norm": 1.070579889736262, "kl": 0.049072265625, "learning_rate": 9.85975657541649e-07, "loss": 0.002, "reward": 2.1401352882385254, "reward_std": 0.09388431906700134, "rewards/accuracy_reward": 0.9526352882385254, "rewards/format_reward": 1.0, "step": 2485, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 372.015625, "epoch": 0.07559907553825569, "grad_norm": 1.1370533121142035, "kl": 0.051025390625, "learning_rate": 9.859644211903202e-07, "loss": 0.002, "reward": 1.786617636680603, "reward_std": 0.15352703630924225, "rewards/accuracy_reward": 0.6147426962852478, "rewards/format_reward": 1.0, "step": 2486, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 354.890625, "epoch": 0.07562948546405547, "grad_norm": 1.1693893115212104, "kl": 0.051513671875, "learning_rate": 9.859531804035696e-07, "loss": 0.0021, "reward": 1.9025732278823853, "reward_std": 0.035607125610113144, "rewards/accuracy_reward": 0.7338231205940247, "rewards/format_reward": 1.0, "step": 2487, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 362.65625, "epoch": 0.07565989538985525, "grad_norm": 0.5970097315123322, "kl": 0.04931640625, "learning_rate": 9.859419351814996e-07, "loss": 0.002, "reward": 1.8781249523162842, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 2488, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 360.265625, "epoch": 0.07569030531565503, "grad_norm": 0.5531070588891508, "kl": 0.033203125, "learning_rate": 9.859306855242128e-07, "loss": 0.0013, "reward": 1.5968749523162842, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 2489, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 344.3125, "epoch": 0.07572071524145481, "grad_norm": 1.1684490824231821, "kl": 0.041259765625, "learning_rate": 9.85919431431812e-07, "loss": 0.0016, "reward": 1.991852879524231, "reward_std": 0.1611950695514679, "rewards/accuracy_reward": 0.8168528079986572, "rewards/format_reward": 1.0, "step": 2490, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 347.984375, "epoch": 0.07575112516725459, "grad_norm": 1.6776006745663348, "kl": 0.0439453125, "learning_rate": 9.859081729044e-07, "loss": 0.0018, "reward": 1.9742608070373535, "reward_std": 0.0359986275434494, "rewards/accuracy_reward": 0.7773858904838562, "rewards/format_reward": 1.0, "step": 2491, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 355.46875, "epoch": 0.07578153509305437, "grad_norm": 0.7869383393780497, "kl": 0.032958984375, "learning_rate": 9.858969099420792e-07, "loss": 0.0013, "reward": 2.1308512687683105, "reward_std": 0.016522862017154694, "rewards/accuracy_reward": 0.9339761137962341, "rewards/format_reward": 1.0, "step": 2492, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 383.90625, "epoch": 0.07581194501885416, "grad_norm": 1.2402048641405226, "kl": 0.048583984375, "learning_rate": 9.858856425449528e-07, "loss": 0.0019, "reward": 1.7456811666488647, "reward_std": 0.10677331686019897, "rewards/accuracy_reward": 0.5863059759140015, "rewards/format_reward": 1.0, "step": 2493, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 370.21875, "epoch": 0.07584235494465394, "grad_norm": 1.1866071006884658, "kl": 0.04443359375, "learning_rate": 9.858743707131235e-07, "loss": 0.0018, "reward": 1.8085421323776245, "reward_std": 0.0016743600135669112, "rewards/accuracy_reward": 0.6585420966148376, "rewards/format_reward": 1.0, "step": 2494, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 400.640625, "epoch": 0.07587276487045372, "grad_norm": 1.9023615267742355, "kl": 0.034423828125, "learning_rate": 9.858630944466942e-07, "loss": 0.0014, "reward": 1.9092881679534912, "reward_std": 0.1398414969444275, "rewards/accuracy_reward": 0.7561631202697754, "rewards/format_reward": 1.0, "step": 2495, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 346.015625, "epoch": 0.0759031747962535, "grad_norm": 0.7567730238111163, "kl": 0.035400390625, "learning_rate": 9.858518137457676e-07, "loss": 0.0014, "reward": 1.859375, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 2496, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 370.28125, "epoch": 0.07593358472205328, "grad_norm": 3.4977687100284096, "kl": 0.04052734375, "learning_rate": 9.85840528610447e-07, "loss": 0.0016, "reward": 2.1546006202697754, "reward_std": 0.08007238060235977, "rewards/accuracy_reward": 0.963975727558136, "rewards/format_reward": 1.0, "step": 2497, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 367.046875, "epoch": 0.07596399464785306, "grad_norm": 1.1870415818959628, "kl": 0.05224609375, "learning_rate": 9.85829239040835e-07, "loss": 0.0021, "reward": 1.6631851196289062, "reward_std": 0.028778309002518654, "rewards/accuracy_reward": 0.5194350481033325, "rewards/format_reward": 1.0, "step": 2498, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 380.140625, "epoch": 0.07599440457365284, "grad_norm": 4.9208788569425765, "kl": 0.041015625, "learning_rate": 9.858179450370352e-07, "loss": 0.0016, "reward": 1.4496848583221436, "reward_std": 0.12111330777406693, "rewards/accuracy_reward": 0.36530983448028564, "rewards/format_reward": 0.984375, "step": 2499, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 372.3125, "epoch": 0.07602481449945263, "grad_norm": 0.09674058622159516, "kl": 0.03857421875, "learning_rate": 9.858066465991502e-07, "loss": 0.0015, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 2500, "temperature": 1.0 } ], "logging_steps": 1.0, "max_steps": 32884, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }