| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 10, |
| "global_step": 1875, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 549.5593883514405, |
| "epoch": 0.016, |
| "grad_norm": 0.6740879416465759, |
| "kl": 0.00035033226013183596, |
| "learning_rate": 1.5957446808510638e-07, |
| "loss": 0.0, |
| "reward": 0.42916667833924294, |
| "reward_std": 0.34028950408101083, |
| "rewards/accuracy_reward": 0.3166666737757623, |
| "rewards/format_reward": 0.11250000298023224, |
| "step": 10 |
| }, |
| { |
| "completion_length": 527.5224136352539, |
| "epoch": 0.032, |
| "grad_norm": 0.2563185691833496, |
| "kl": 0.0004585623741149902, |
| "learning_rate": 3.1914893617021275e-07, |
| "loss": 0.0, |
| "reward": 0.4945312611758709, |
| "reward_std": 0.35083559062331915, |
| "rewards/accuracy_reward": 0.37005209350027146, |
| "rewards/format_reward": 0.12447916958481073, |
| "step": 20 |
| }, |
| { |
| "completion_length": 538.003141784668, |
| "epoch": 0.048, |
| "grad_norm": 0.33752796053886414, |
| "kl": 0.005180943012237549, |
| "learning_rate": 4.787234042553192e-07, |
| "loss": 0.0002, |
| "reward": 0.5481770986691117, |
| "reward_std": 0.3370666664093733, |
| "rewards/accuracy_reward": 0.2927083420334384, |
| "rewards/format_reward": 0.25546875870786606, |
| "step": 30 |
| }, |
| { |
| "completion_length": 258.59141426086427, |
| "epoch": 0.064, |
| "grad_norm": 0.354353666305542, |
| "kl": 0.0585662841796875, |
| "learning_rate": 6.382978723404255e-07, |
| "loss": 0.0023, |
| "reward": 0.8239583492279052, |
| "reward_std": 0.27262834142893555, |
| "rewards/accuracy_reward": 0.06171875221189112, |
| "rewards/format_reward": 0.7622395977377892, |
| "step": 40 |
| }, |
| { |
| "completion_length": 161.72448558807372, |
| "epoch": 0.08, |
| "grad_norm": 0.24595139920711517, |
| "kl": 0.0718231201171875, |
| "learning_rate": 7.978723404255319e-07, |
| "loss": 0.0029, |
| "reward": 0.9528646096587181, |
| "reward_std": 0.18603200055658817, |
| "rewards/accuracy_reward": 0.020572917093522845, |
| "rewards/format_reward": 0.9322916835546493, |
| "step": 50 |
| }, |
| { |
| "completion_length": 187.65990085601806, |
| "epoch": 0.096, |
| "grad_norm": 0.18100768327713013, |
| "kl": 0.0560791015625, |
| "learning_rate": 9.574468085106384e-07, |
| "loss": 0.0022, |
| "reward": 0.9700521141290664, |
| "reward_std": 0.23906342964619398, |
| "rewards/accuracy_reward": 0.05546875142026693, |
| "rewards/format_reward": 0.9145833522081375, |
| "step": 60 |
| }, |
| { |
| "completion_length": 197.76198501586913, |
| "epoch": 0.112, |
| "grad_norm": 0.2552257180213928, |
| "kl": 0.067510986328125, |
| "learning_rate": 1.1170212765957447e-06, |
| "loss": 0.0027, |
| "reward": 1.0585937827825547, |
| "reward_std": 0.278127851895988, |
| "rewards/accuracy_reward": 0.11562500302679837, |
| "rewards/format_reward": 0.9429687693715095, |
| "step": 70 |
| }, |
| { |
| "completion_length": 207.72917366027832, |
| "epoch": 0.128, |
| "grad_norm": 0.23932258784770966, |
| "kl": 0.08233642578125, |
| "learning_rate": 1.276595744680851e-06, |
| "loss": 0.0033, |
| "reward": 1.1739583641290665, |
| "reward_std": 0.31914406083524227, |
| "rewards/accuracy_reward": 0.2052083393326029, |
| "rewards/format_reward": 0.9687500178813935, |
| "step": 80 |
| }, |
| { |
| "completion_length": 254.02891464233397, |
| "epoch": 0.144, |
| "grad_norm": 0.13784578442573547, |
| "kl": 0.077899169921875, |
| "learning_rate": 1.4361702127659576e-06, |
| "loss": 0.0031, |
| "reward": 1.254687537252903, |
| "reward_std": 0.3285429562442005, |
| "rewards/accuracy_reward": 0.2760416746838018, |
| "rewards/format_reward": 0.9786458492279053, |
| "step": 90 |
| }, |
| { |
| "completion_length": 313.156778717041, |
| "epoch": 0.16, |
| "grad_norm": 0.14116181433200836, |
| "kl": 0.087481689453125, |
| "learning_rate": 1.5957446808510639e-06, |
| "loss": 0.0035, |
| "reward": 1.398958370089531, |
| "reward_std": 0.364690675213933, |
| "rewards/accuracy_reward": 0.43567709643393754, |
| "rewards/format_reward": 0.9632812708616256, |
| "step": 100 |
| }, |
| { |
| "completion_length": 317.794021987915, |
| "epoch": 0.176, |
| "grad_norm": 0.17862962186336517, |
| "kl": 0.09073486328125, |
| "learning_rate": 1.7553191489361702e-06, |
| "loss": 0.0036, |
| "reward": 1.4677083671092988, |
| "reward_std": 0.37521998956799507, |
| "rewards/accuracy_reward": 0.517708345875144, |
| "rewards/format_reward": 0.9500000178813934, |
| "step": 110 |
| }, |
| { |
| "completion_length": 310.7653747558594, |
| "epoch": 0.192, |
| "grad_norm": 0.160443514585495, |
| "kl": 0.095281982421875, |
| "learning_rate": 1.9148936170212767e-06, |
| "loss": 0.0038, |
| "reward": 1.5169271260499955, |
| "reward_std": 0.3428271571174264, |
| "rewards/accuracy_reward": 0.5375000145286322, |
| "rewards/format_reward": 0.979427094757557, |
| "step": 120 |
| }, |
| { |
| "completion_length": 293.892195892334, |
| "epoch": 0.208, |
| "grad_norm": 0.12249578535556793, |
| "kl": 0.093798828125, |
| "learning_rate": 2.074468085106383e-06, |
| "loss": 0.0037, |
| "reward": 1.5312500417232513, |
| "reward_std": 0.3389985624700785, |
| "rewards/accuracy_reward": 0.539322929829359, |
| "rewards/format_reward": 0.9919271007180214, |
| "step": 130 |
| }, |
| { |
| "completion_length": 325.6273529052734, |
| "epoch": 0.224, |
| "grad_norm": 0.14869874715805054, |
| "kl": 0.08721923828125, |
| "learning_rate": 2.2340425531914894e-06, |
| "loss": 0.0035, |
| "reward": 1.4973958760499955, |
| "reward_std": 0.32006428195163605, |
| "rewards/accuracy_reward": 0.5270833484828472, |
| "rewards/format_reward": 0.9703125178813934, |
| "step": 140 |
| }, |
| { |
| "completion_length": 302.0961029052734, |
| "epoch": 0.24, |
| "grad_norm": 0.15774066746234894, |
| "kl": 0.09039306640625, |
| "learning_rate": 2.3936170212765957e-06, |
| "loss": 0.0036, |
| "reward": 1.546093797683716, |
| "reward_std": 0.33077279273420573, |
| "rewards/accuracy_reward": 0.5557291835546494, |
| "rewards/format_reward": 0.9903646007180213, |
| "step": 150 |
| }, |
| { |
| "completion_length": 377.5010528564453, |
| "epoch": 0.256, |
| "grad_norm": 0.1264476478099823, |
| "kl": 0.083544921875, |
| "learning_rate": 2.553191489361702e-06, |
| "loss": 0.0033, |
| "reward": 1.4979167014360428, |
| "reward_std": 0.3458104237914085, |
| "rewards/accuracy_reward": 0.5213541844394058, |
| "rewards/format_reward": 0.9765625163912773, |
| "step": 160 |
| }, |
| { |
| "completion_length": 333.62370681762695, |
| "epoch": 0.272, |
| "grad_norm": 0.0962129607796669, |
| "kl": 0.0868896484375, |
| "learning_rate": 2.7127659574468088e-06, |
| "loss": 0.0035, |
| "reward": 1.55364588201046, |
| "reward_std": 0.3210131399333477, |
| "rewards/accuracy_reward": 0.5757812647148967, |
| "rewards/format_reward": 0.9778646036982537, |
| "step": 170 |
| }, |
| { |
| "completion_length": 372.2838665008545, |
| "epoch": 0.288, |
| "grad_norm": 0.23018397390842438, |
| "kl": 0.09453125, |
| "learning_rate": 2.872340425531915e-06, |
| "loss": 0.0038, |
| "reward": 1.533854204416275, |
| "reward_std": 0.3298664506524801, |
| "rewards/accuracy_reward": 0.5500000137835741, |
| "rewards/format_reward": 0.9838541820645332, |
| "step": 180 |
| }, |
| { |
| "completion_length": 352.70756072998046, |
| "epoch": 0.304, |
| "grad_norm": 0.14402048289775848, |
| "kl": 0.0962158203125, |
| "learning_rate": 2.999989596239813e-06, |
| "loss": 0.0039, |
| "reward": 1.5427083760499953, |
| "reward_std": 0.3494082003831863, |
| "rewards/accuracy_reward": 0.575781268440187, |
| "rewards/format_reward": 0.9669270977377892, |
| "step": 190 |
| }, |
| { |
| "completion_length": 340.1216236114502, |
| "epoch": 0.32, |
| "grad_norm": 0.12887056171894073, |
| "kl": 0.12586669921875, |
| "learning_rate": 2.9996254797863878e-06, |
| "loss": 0.005, |
| "reward": 1.4958333730697633, |
| "reward_std": 0.3522159656509757, |
| "rewards/accuracy_reward": 0.5208333488553762, |
| "rewards/format_reward": 0.9750000193715096, |
| "step": 200 |
| }, |
| { |
| "completion_length": 345.5015693664551, |
| "epoch": 0.336, |
| "grad_norm": 0.1841161698102951, |
| "kl": 0.13616943359375, |
| "learning_rate": 2.9987413196322384e-06, |
| "loss": 0.0054, |
| "reward": 1.5750000387430192, |
| "reward_std": 0.3604385921731591, |
| "rewards/accuracy_reward": 0.6200521003454924, |
| "rewards/format_reward": 0.9549479350447655, |
| "step": 210 |
| }, |
| { |
| "completion_length": 351.95599975585935, |
| "epoch": 0.352, |
| "grad_norm": 0.21690769493579865, |
| "kl": 0.113519287109375, |
| "learning_rate": 2.9973374223885316e-06, |
| "loss": 0.0045, |
| "reward": 1.456250038743019, |
| "reward_std": 0.3830007821321487, |
| "rewards/accuracy_reward": 0.5210937664844095, |
| "rewards/format_reward": 0.9351562723517418, |
| "step": 220 |
| }, |
| { |
| "completion_length": 338.8109489440918, |
| "epoch": 0.368, |
| "grad_norm": 0.15305453538894653, |
| "kl": 0.15860595703125, |
| "learning_rate": 2.9954142749021024e-06, |
| "loss": 0.0063, |
| "reward": 1.4955729603767396, |
| "reward_std": 0.3856545228511095, |
| "rewards/accuracy_reward": 0.540625018067658, |
| "rewards/format_reward": 0.9549479365348816, |
| "step": 230 |
| }, |
| { |
| "completion_length": 340.34349899291993, |
| "epoch": 0.384, |
| "grad_norm": 0.12304379045963287, |
| "kl": 0.13148193359375, |
| "learning_rate": 2.9929725440866226e-06, |
| "loss": 0.0053, |
| "reward": 1.5273437917232513, |
| "reward_std": 0.33333041463047264, |
| "rewards/accuracy_reward": 0.5708333509741351, |
| "rewards/format_reward": 0.9565104365348815, |
| "step": 240 |
| }, |
| { |
| "completion_length": 328.22422676086427, |
| "epoch": 0.4, |
| "grad_norm": 0.1456129401922226, |
| "kl": 0.16090087890625, |
| "learning_rate": 2.990013076691329e-06, |
| "loss": 0.0064, |
| "reward": 1.4192708760499955, |
| "reward_std": 0.38170270454138516, |
| "rewards/accuracy_reward": 0.48046876322478055, |
| "rewards/format_reward": 0.9388021036982537, |
| "step": 250 |
| }, |
| { |
| "completion_length": 346.2437599182129, |
| "epoch": 0.416, |
| "grad_norm": 0.14456409215927124, |
| "kl": 0.15968017578125, |
| "learning_rate": 2.986536899007383e-06, |
| "loss": 0.0064, |
| "reward": 1.4953125447034836, |
| "reward_std": 0.4336598340421915, |
| "rewards/accuracy_reward": 0.6145833546295763, |
| "rewards/format_reward": 0.8807291924953461, |
| "step": 260 |
| }, |
| { |
| "completion_length": 269.3726661682129, |
| "epoch": 0.432, |
| "grad_norm": 0.46196305751800537, |
| "kl": 0.2087890625, |
| "learning_rate": 2.982545216511974e-06, |
| "loss": 0.0084, |
| "reward": 1.3755208745598793, |
| "reward_std": 0.399412408657372, |
| "rewards/accuracy_reward": 0.46328125838190315, |
| "rewards/format_reward": 0.9122396051883698, |
| "step": 270 |
| }, |
| { |
| "completion_length": 343.7578243255615, |
| "epoch": 0.448, |
| "grad_norm": 0.14162611961364746, |
| "kl": 0.259228515625, |
| "learning_rate": 2.978039413450278e-06, |
| "loss": 0.0104, |
| "reward": 1.2468750298023223, |
| "reward_std": 0.5331707052886486, |
| "rewards/accuracy_reward": 0.40807292954996227, |
| "rewards/format_reward": 0.8388021022081376, |
| "step": 280 |
| }, |
| { |
| "completion_length": 283.88282241821287, |
| "epoch": 0.464, |
| "grad_norm": 0.12917938828468323, |
| "kl": 0.24149169921875, |
| "learning_rate": 2.9730210523554276e-06, |
| "loss": 0.0097, |
| "reward": 1.4640625447034836, |
| "reward_std": 0.39531214712187646, |
| "rewards/accuracy_reward": 0.5385416830889881, |
| "rewards/format_reward": 0.9255208507180214, |
| "step": 290 |
| }, |
| { |
| "completion_length": 348.46693687438966, |
| "epoch": 0.48, |
| "grad_norm": 0.10320460051298141, |
| "kl": 0.212744140625, |
| "learning_rate": 2.967491873506653e-06, |
| "loss": 0.0085, |
| "reward": 1.4231771245598792, |
| "reward_std": 0.4722843859344721, |
| "rewards/accuracy_reward": 0.5325520962476731, |
| "rewards/format_reward": 0.8906250193715095, |
| "step": 300 |
| }, |
| { |
| "completion_length": 385.8747501373291, |
| "epoch": 0.496, |
| "grad_norm": 0.09853401780128479, |
| "kl": 0.2241943359375, |
| "learning_rate": 2.9614537943257835e-06, |
| "loss": 0.009, |
| "reward": 1.4325521185994148, |
| "reward_std": 0.48100620210170747, |
| "rewards/accuracy_reward": 0.5578125163912773, |
| "rewards/format_reward": 0.8747396007180214, |
| "step": 310 |
| }, |
| { |
| "completion_length": 340.27865371704104, |
| "epoch": 0.512, |
| "grad_norm": 240231.90625, |
| "kl": 4838.629223632813, |
| "learning_rate": 2.9549089087123195e-06, |
| "loss": 193.2151, |
| "reward": 1.402343785762787, |
| "reward_std": 0.49140210878103974, |
| "rewards/accuracy_reward": 0.5153646014630795, |
| "rewards/format_reward": 0.8869791895151138, |
| "step": 320 |
| }, |
| { |
| "completion_length": 385.4213642120361, |
| "epoch": 0.528, |
| "grad_norm": 0.07590307295322418, |
| "kl": 12.74246826171875, |
| "learning_rate": 2.947859486317304e-06, |
| "loss": 0.5097, |
| "reward": 1.433593787252903, |
| "reward_std": 0.5454600352793932, |
| "rewards/accuracy_reward": 0.5776041799690574, |
| "rewards/format_reward": 0.8559896066784859, |
| "step": 330 |
| }, |
| { |
| "completion_length": 366.5921993255615, |
| "epoch": 0.544, |
| "grad_norm": 0.0868915244936943, |
| "kl": 0.27060546875, |
| "learning_rate": 2.9403079717562495e-06, |
| "loss": 0.0108, |
| "reward": 1.4158854514360428, |
| "reward_std": 0.5097951695322991, |
| "rewards/accuracy_reward": 0.5526041839271784, |
| "rewards/format_reward": 0.8632812693715095, |
| "step": 340 |
| }, |
| { |
| "completion_length": 342.1976657867432, |
| "epoch": 0.56, |
| "grad_norm": 0.09720102697610855, |
| "kl": 0.24512939453125, |
| "learning_rate": 2.9322569837613867e-06, |
| "loss": 0.0098, |
| "reward": 1.4158854573965072, |
| "reward_std": 0.4330069116316736, |
| "rewards/accuracy_reward": 0.5171875095693395, |
| "rewards/format_reward": 0.8986979350447655, |
| "step": 350 |
| }, |
| { |
| "completion_length": 304.5101634979248, |
| "epoch": 0.576, |
| "grad_norm": 0.1557752639055252, |
| "kl": 0.273779296875, |
| "learning_rate": 2.9237093142735355e-06, |
| "loss": 0.011, |
| "reward": 1.4440104573965074, |
| "reward_std": 0.45580658232793214, |
| "rewards/accuracy_reward": 0.5408854335546494, |
| "rewards/format_reward": 0.9031250193715096, |
| "step": 360 |
| }, |
| { |
| "completion_length": 337.13073883056643, |
| "epoch": 0.592, |
| "grad_norm": 0.07957520335912704, |
| "kl": 0.2844482421875, |
| "learning_rate": 2.914667927473909e-06, |
| "loss": 0.0114, |
| "reward": 1.4494792118668556, |
| "reward_std": 0.4396442363038659, |
| "rewards/accuracy_reward": 0.5385416800854728, |
| "rewards/format_reward": 0.9109375163912773, |
| "step": 370 |
| }, |
| { |
| "completion_length": 348.5554767608643, |
| "epoch": 0.608, |
| "grad_norm": 0.10430486500263214, |
| "kl": 0.2727783203125, |
| "learning_rate": 2.905135958756186e-06, |
| "loss": 0.0109, |
| "reward": 1.416666714847088, |
| "reward_std": 0.5037212844938039, |
| "rewards/accuracy_reward": 0.5440104316920042, |
| "rewards/format_reward": 0.8726562723517418, |
| "step": 380 |
| }, |
| { |
| "completion_length": 296.6213638305664, |
| "epoch": 0.624, |
| "grad_norm": 0.1668756604194641, |
| "kl": 1331.5044677734375, |
| "learning_rate": 2.8951167136392134e-06, |
| "loss": 53.281, |
| "reward": 1.3690104529261589, |
| "reward_std": 0.5218944691121579, |
| "rewards/accuracy_reward": 0.49401043094694613, |
| "rewards/format_reward": 0.8750000208616256, |
| "step": 390 |
| }, |
| { |
| "completion_length": 283.71016426086425, |
| "epoch": 0.64, |
| "grad_norm": 0.11033165454864502, |
| "kl": 0.2045654296875, |
| "learning_rate": 2.8846136666207118e-06, |
| "loss": 0.0082, |
| "reward": 1.5940104514360427, |
| "reward_std": 0.36332854218780997, |
| "rewards/accuracy_reward": 0.6401041854172945, |
| "rewards/format_reward": 0.9539062738418579, |
| "step": 400 |
| }, |
| { |
| "completion_length": 357.7877712249756, |
| "epoch": 0.656, |
| "grad_norm": 0.1104319766163826, |
| "kl": 74.237353515625, |
| "learning_rate": 2.873630459972376e-06, |
| "loss": 2.9658, |
| "reward": 1.471875038743019, |
| "reward_std": 0.4806873705238104, |
| "rewards/accuracy_reward": 0.5885416785255074, |
| "rewards/format_reward": 0.8833333522081375, |
| "step": 410 |
| }, |
| { |
| "completion_length": 314.5830821990967, |
| "epoch": 0.672, |
| "grad_norm": 0.1443065106868744, |
| "kl": 1.9264404296875, |
| "learning_rate": 2.8621709024768054e-06, |
| "loss": 0.0774, |
| "reward": 1.4846354559063912, |
| "reward_std": 0.4601195017807186, |
| "rewards/accuracy_reward": 0.578645845502615, |
| "rewards/format_reward": 0.9059896066784858, |
| "step": 420 |
| }, |
| { |
| "completion_length": 352.45912475585936, |
| "epoch": 0.688, |
| "grad_norm": 0.08876121789216995, |
| "kl": 0.278662109375, |
| "learning_rate": 2.8502389681066806e-06, |
| "loss": 0.0111, |
| "reward": 1.4117187947034835, |
| "reward_std": 0.46184736788272857, |
| "rewards/accuracy_reward": 0.5140625171363353, |
| "rewards/format_reward": 0.8976562708616257, |
| "step": 430 |
| }, |
| { |
| "completion_length": 285.142195892334, |
| "epoch": 0.704, |
| "grad_norm": 0.07999342679977417, |
| "kl": 2598.7319091796876, |
| "learning_rate": 2.8378387946466623e-06, |
| "loss": 103.7532, |
| "reward": 1.3726562917232514, |
| "reward_std": 0.49423595853149893, |
| "rewards/accuracy_reward": 0.47968751210719346, |
| "rewards/format_reward": 0.8929687663912773, |
| "step": 440 |
| }, |
| { |
| "completion_length": 417.71849822998047, |
| "epoch": 0.72, |
| "grad_norm": 0.08980146795511246, |
| "kl": 0.1867431640625, |
| "learning_rate": 2.8249746822584788e-06, |
| "loss": 0.0075, |
| "reward": 1.4705729529261589, |
| "reward_std": 0.4550738349556923, |
| "rewards/accuracy_reward": 0.5778646010905504, |
| "rewards/format_reward": 0.8927083507180213, |
| "step": 450 |
| }, |
| { |
| "completion_length": 499.32371215820314, |
| "epoch": 0.736, |
| "grad_norm": 0.09107893705368042, |
| "kl": 0.347216796875, |
| "learning_rate": 2.811651091989708e-06, |
| "loss": 0.0139, |
| "reward": 1.2427083723247052, |
| "reward_std": 0.6114235140383244, |
| "rewards/accuracy_reward": 0.48880209363996985, |
| "rewards/format_reward": 0.7539062686264515, |
| "step": 460 |
| }, |
| { |
| "completion_length": 363.605997467041, |
| "epoch": 0.752, |
| "grad_norm": 0.10554559528827667, |
| "kl": 1.361376953125, |
| "learning_rate": 2.797872644226761e-06, |
| "loss": 0.0545, |
| "reward": 1.5098958760499954, |
| "reward_std": 0.38472358537837864, |
| "rewards/accuracy_reward": 0.5791666805744171, |
| "rewards/format_reward": 0.9307291820645333, |
| "step": 470 |
| }, |
| { |
| "completion_length": 343.4364669799805, |
| "epoch": 0.768, |
| "grad_norm": 0.08596952259540558, |
| "kl": 0.2862548828125, |
| "learning_rate": 2.7836441170926177e-06, |
| "loss": 0.0114, |
| "reward": 1.4947917029261588, |
| "reward_std": 0.45332635566592216, |
| "rewards/accuracy_reward": 0.5903646016027778, |
| "rewards/format_reward": 0.904427108168602, |
| "step": 480 |
| }, |
| { |
| "completion_length": 372.5755313873291, |
| "epoch": 0.784, |
| "grad_norm": 0.12259896844625473, |
| "kl": 0.3115966796875, |
| "learning_rate": 2.768970444789855e-06, |
| "loss": 0.0125, |
| "reward": 1.3859375461935997, |
| "reward_std": 0.5074398431926965, |
| "rewards/accuracy_reward": 0.521354182693176, |
| "rewards/format_reward": 0.8645833596587181, |
| "step": 490 |
| }, |
| { |
| "completion_length": 343.65287551879885, |
| "epoch": 0.8, |
| "grad_norm": 0.32125625014305115, |
| "kl": 0.2789794921875, |
| "learning_rate": 2.753856715889552e-06, |
| "loss": 0.0112, |
| "reward": 1.3770833745598794, |
| "reward_std": 0.5012526127509773, |
| "rewards/accuracy_reward": 0.5059895979240536, |
| "rewards/format_reward": 0.8710937723517418, |
| "step": 500 |
| }, |
| { |
| "completion_length": 250.93281898498535, |
| "epoch": 0.816, |
| "grad_norm": 0.19703888893127441, |
| "kl": 4480.266870117188, |
| "learning_rate": 2.738308171566667e-06, |
| "loss": 179.2807, |
| "reward": 1.4348958730697632, |
| "reward_std": 0.4180175280198455, |
| "rewards/accuracy_reward": 0.4963541803881526, |
| "rewards/format_reward": 0.9385416880249977, |
| "step": 510 |
| }, |
| { |
| "completion_length": 276.1153732299805, |
| "epoch": 0.832, |
| "grad_norm": 0.10936518013477325, |
| "kl": 0.19671630859375, |
| "learning_rate": 2.7223302037824845e-06, |
| "loss": 0.0079, |
| "reward": 1.521875038743019, |
| "reward_std": 0.33053973913192747, |
| "rewards/accuracy_reward": 0.5567708510905505, |
| "rewards/format_reward": 0.9651041850447655, |
| "step": 520 |
| }, |
| { |
| "completion_length": 339.7869876861572, |
| "epoch": 0.848, |
| "grad_norm": 0.1296410709619522, |
| "kl": 0.8387939453125, |
| "learning_rate": 2.705928353414784e-06, |
| "loss": 0.0335, |
| "reward": 1.2877604514360428, |
| "reward_std": 0.5975749351084232, |
| "rewards/accuracy_reward": 0.4781250134110451, |
| "rewards/format_reward": 0.8096354395151139, |
| "step": 530 |
| }, |
| { |
| "completion_length": 333.3028755187988, |
| "epoch": 0.864, |
| "grad_norm": 0.20703841745853424, |
| "kl": 0.3625732421875, |
| "learning_rate": 2.6891083083363554e-06, |
| "loss": 0.0145, |
| "reward": 1.3070312827825545, |
| "reward_std": 0.5399560488760471, |
| "rewards/accuracy_reward": 0.45468751080334185, |
| "rewards/format_reward": 0.8523437708616257, |
| "step": 540 |
| }, |
| { |
| "completion_length": 272.563028717041, |
| "epoch": 0.88, |
| "grad_norm": 0.16747455298900604, |
| "kl": 0.30728759765625, |
| "learning_rate": 2.6718759014425513e-06, |
| "loss": 0.0123, |
| "reward": 1.534114620089531, |
| "reward_std": 0.3431927986443043, |
| "rewards/accuracy_reward": 0.5791666842997074, |
| "rewards/format_reward": 0.9549479365348816, |
| "step": 550 |
| }, |
| { |
| "completion_length": 317.7333419799805, |
| "epoch": 0.896, |
| "grad_norm": 0.3575699031352997, |
| "kl": 0.20316162109375, |
| "learning_rate": 2.6542371086285347e-06, |
| "loss": 0.0081, |
| "reward": 1.5716146260499955, |
| "reward_std": 0.3063303453847766, |
| "rewards/accuracy_reward": 0.6052083535119891, |
| "rewards/format_reward": 0.9664062723517418, |
| "step": 560 |
| }, |
| { |
| "completion_length": 380.475789642334, |
| "epoch": 0.912, |
| "grad_norm": 0.3914913237094879, |
| "kl": 0.68067626953125, |
| "learning_rate": 2.6361980467169505e-06, |
| "loss": 0.0273, |
| "reward": 1.397135455906391, |
| "reward_std": 0.4914455428719521, |
| "rewards/accuracy_reward": 0.5210937639698386, |
| "rewards/format_reward": 0.8760416865348816, |
| "step": 570 |
| }, |
| { |
| "completion_length": 375.67553367614744, |
| "epoch": 0.928, |
| "grad_norm": 0.31454476714134216, |
| "kl": 0.549169921875, |
| "learning_rate": 2.6177649713367136e-06, |
| "loss": 0.0219, |
| "reward": 1.503125038743019, |
| "reward_std": 0.5160485468804836, |
| "rewards/accuracy_reward": 0.6070312697440385, |
| "rewards/format_reward": 0.8960937678813934, |
| "step": 580 |
| }, |
| { |
| "completion_length": 362.68178100585936, |
| "epoch": 0.944, |
| "grad_norm": 0.08374358713626862, |
| "kl": 0.245947265625, |
| "learning_rate": 2.5989442747536697e-06, |
| "loss": 0.0098, |
| "reward": 1.5033854618668556, |
| "reward_std": 0.4191708039492369, |
| "rewards/accuracy_reward": 0.5708333518356085, |
| "rewards/format_reward": 0.9325521036982536, |
| "step": 590 |
| }, |
| { |
| "completion_length": 348.230997467041, |
| "epoch": 0.96, |
| "grad_norm": 0.08352825045585632, |
| "kl": 0.3398193359375, |
| "learning_rate": 2.5797424836538714e-06, |
| "loss": 0.0136, |
| "reward": 1.5054687887430191, |
| "reward_std": 0.40789004862308503, |
| "rewards/accuracy_reward": 0.5828125124797225, |
| "rewards/format_reward": 0.9226562663912773, |
| "step": 600 |
| }, |
| { |
| "completion_length": 348.3810001373291, |
| "epoch": 0.976, |
| "grad_norm": 0.09684169292449951, |
| "kl": 0.5585693359375, |
| "learning_rate": 2.560166256880234e-06, |
| "loss": 0.0224, |
| "reward": 1.5184896290302277, |
| "reward_std": 0.4260748438537121, |
| "rewards/accuracy_reward": 0.5945312634110451, |
| "rewards/format_reward": 0.9239583536982536, |
| "step": 610 |
| }, |
| { |
| "completion_length": 342.1513130187988, |
| "epoch": 0.992, |
| "grad_norm": 0.0948108583688736, |
| "kl": 0.31746826171875, |
| "learning_rate": 2.5402223831233723e-06, |
| "loss": 0.0127, |
| "reward": 1.5098958641290665, |
| "reward_std": 0.4502595506608486, |
| "rewards/accuracy_reward": 0.5950520992279053, |
| "rewards/format_reward": 0.9148437663912773, |
| "step": 620 |
| }, |
| { |
| "completion_length": 360.778133392334, |
| "epoch": 1.008, |
| "grad_norm": 0.14951100945472717, |
| "kl": 0.272119140625, |
| "learning_rate": 2.5199177785673957e-06, |
| "loss": 0.0109, |
| "reward": 1.5187500447034836, |
| "reward_std": 0.4504229260608554, |
| "rewards/accuracy_reward": 0.6013020960614085, |
| "rewards/format_reward": 0.91744794100523, |
| "step": 630 |
| }, |
| { |
| "completion_length": 363.9461048126221, |
| "epoch": 1.024, |
| "grad_norm": 0.22019609808921814, |
| "kl": 0.6080078125, |
| "learning_rate": 2.4992594844915022e-06, |
| "loss": 0.0243, |
| "reward": 1.4822917118668557, |
| "reward_std": 0.4976062387228012, |
| "rewards/accuracy_reward": 0.5966145992279053, |
| "rewards/format_reward": 0.8856771007180214, |
| "step": 640 |
| }, |
| { |
| "completion_length": 303.2153751373291, |
| "epoch": 1.04, |
| "grad_norm": 0.1117577999830246, |
| "kl": 0.28231201171875, |
| "learning_rate": 2.4782546648281847e-06, |
| "loss": 0.0113, |
| "reward": 1.504687535762787, |
| "reward_std": 0.3277318266220391, |
| "rewards/accuracy_reward": 0.5388020966434851, |
| "rewards/format_reward": 0.9658854335546494, |
| "step": 650 |
| }, |
| { |
| "completion_length": 301.4893325805664, |
| "epoch": 1.056, |
| "grad_norm": 0.09414847195148468, |
| "kl": 0.19752197265625, |
| "learning_rate": 2.4569106036789064e-06, |
| "loss": 0.0079, |
| "reward": 1.5304687917232513, |
| "reward_std": 0.28476983550935986, |
| "rewards/accuracy_reward": 0.5505208492279052, |
| "rewards/format_reward": 0.9799479380249977, |
| "step": 660 |
| }, |
| { |
| "completion_length": 380.32605056762696, |
| "epoch": 1.072, |
| "grad_norm": 0.08695989847183228, |
| "kl": 0.3002685546875, |
| "learning_rate": 2.4352347027881005e-06, |
| "loss": 0.012, |
| "reward": 1.4942708730697631, |
| "reward_std": 0.4118765268474817, |
| "rewards/accuracy_reward": 0.559375013038516, |
| "rewards/format_reward": 0.9348958566784858, |
| "step": 670 |
| }, |
| { |
| "completion_length": 362.15209197998047, |
| "epoch": 1.088, |
| "grad_norm": 0.10074878484010696, |
| "kl": 0.3934326171875, |
| "learning_rate": 2.413234478976379e-06, |
| "loss": 0.0157, |
| "reward": 1.5263021305203437, |
| "reward_std": 0.3859816137701273, |
| "rewards/accuracy_reward": 0.5898437647148966, |
| "rewards/format_reward": 0.9364583566784859, |
| "step": 680 |
| }, |
| { |
| "completion_length": 342.3330821990967, |
| "epoch": 1.104, |
| "grad_norm": 0.11779873073101044, |
| "kl": 0.4483154296875, |
| "learning_rate": 2.3909175615338297e-06, |
| "loss": 0.018, |
| "reward": 1.5794271260499955, |
| "reward_std": 0.43667961433529856, |
| "rewards/accuracy_reward": 0.6510416842997074, |
| "rewards/format_reward": 0.9283854380249977, |
| "step": 690 |
| }, |
| { |
| "completion_length": 364.8481903076172, |
| "epoch": 1.12, |
| "grad_norm": 0.2739701271057129, |
| "kl": 0.5840087890625, |
| "learning_rate": 2.368291689574312e-06, |
| "loss": 0.0233, |
| "reward": 1.4492187917232513, |
| "reward_std": 0.5059375043958425, |
| "rewards/accuracy_reward": 0.5679687663912774, |
| "rewards/format_reward": 0.8812500208616256, |
| "step": 700 |
| }, |
| { |
| "completion_length": 299.41016693115233, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.11518736928701401, |
| "kl": 0.24130859375, |
| "learning_rate": 2.3453647093516705e-06, |
| "loss": 0.0096, |
| "reward": 1.6536458730697632, |
| "reward_std": 0.3213042883202434, |
| "rewards/accuracy_reward": 0.6833333440124989, |
| "rewards/format_reward": 0.9703125163912774, |
| "step": 710 |
| }, |
| { |
| "completion_length": 322.81980018615724, |
| "epoch": 1.152, |
| "grad_norm": 0.09168912470340729, |
| "kl": 0.46943359375, |
| "learning_rate": 2.322144571538792e-06, |
| "loss": 0.0188, |
| "reward": 1.5898437827825547, |
| "reward_std": 0.39194310661405324, |
| "rewards/accuracy_reward": 0.6382812660187482, |
| "rewards/format_reward": 0.9515625208616256, |
| "step": 720 |
| }, |
| { |
| "completion_length": 383.13386459350585, |
| "epoch": 1.168, |
| "grad_norm": 0.2942081093788147, |
| "kl": 0.3664794921875, |
| "learning_rate": 2.2986393284704496e-06, |
| "loss": 0.0147, |
| "reward": 1.4242187917232514, |
| "reward_std": 0.519075758382678, |
| "rewards/accuracy_reward": 0.5468750189524144, |
| "rewards/format_reward": 0.8773437738418579, |
| "step": 730 |
| }, |
| { |
| "completion_length": 323.92474937438965, |
| "epoch": 1.184, |
| "grad_norm": 0.7609843015670776, |
| "kl": 0.9059814453125, |
| "learning_rate": 2.2748571313509e-06, |
| "loss": 0.0363, |
| "reward": 1.4992187842726707, |
| "reward_std": 0.4844189383089542, |
| "rewards/accuracy_reward": 0.5958333497866988, |
| "rewards/format_reward": 0.9033854320645333, |
| "step": 740 |
| }, |
| { |
| "completion_length": 312.4018325805664, |
| "epoch": 1.2, |
| "grad_norm": 0.26806285977363586, |
| "kl": 0.352099609375, |
| "learning_rate": 2.2508062274271832e-06, |
| "loss": 0.0141, |
| "reward": 1.5622396349906922, |
| "reward_std": 0.3421931225806475, |
| "rewards/accuracy_reward": 0.6031250144354999, |
| "rewards/format_reward": 0.959114608168602, |
| "step": 750 |
| }, |
| { |
| "completion_length": 371.5671989440918, |
| "epoch": 1.216, |
| "grad_norm": 0.09768559783697128, |
| "kl": 0.418310546875, |
| "learning_rate": 2.2264949571291272e-06, |
| "loss": 0.0167, |
| "reward": 1.5015625447034835, |
| "reward_std": 0.4462232066318393, |
| "rewards/accuracy_reward": 0.5955729261040688, |
| "rewards/format_reward": 0.9059896036982537, |
| "step": 760 |
| }, |
| { |
| "completion_length": 357.2544361114502, |
| "epoch": 1.232, |
| "grad_norm": 1.3492963314056396, |
| "kl": 0.36956787109375, |
| "learning_rate": 2.2019317511770334e-06, |
| "loss": 0.0148, |
| "reward": 1.5296875447034837, |
| "reward_std": 0.38438957259058953, |
| "rewards/accuracy_reward": 0.5861979339271783, |
| "rewards/format_reward": 0.943489608168602, |
| "step": 770 |
| }, |
| { |
| "completion_length": 324.1862083435059, |
| "epoch": 1.248, |
| "grad_norm": 0.0821109488606453, |
| "kl": 0.37724609375, |
| "learning_rate": 2.1771251276580473e-06, |
| "loss": 0.0151, |
| "reward": 1.563802120089531, |
| "reward_std": 0.38602438326925037, |
| "rewards/accuracy_reward": 0.6151041872799397, |
| "rewards/format_reward": 0.9486979365348815, |
| "step": 780 |
| }, |
| { |
| "completion_length": 343.0974063873291, |
| "epoch": 1.264, |
| "grad_norm": 0.11053454130887985, |
| "kl": 0.3335693359375, |
| "learning_rate": 2.152083689072242e-06, |
| "loss": 0.0134, |
| "reward": 1.500260454416275, |
| "reward_std": 0.45653478614985943, |
| "rewards/accuracy_reward": 0.5726562667638063, |
| "rewards/format_reward": 0.92760419100523, |
| "step": 790 |
| }, |
| { |
| "completion_length": 333.49376106262207, |
| "epoch": 1.28, |
| "grad_norm": 0.09054333716630936, |
| "kl": 0.4177734375, |
| "learning_rate": 2.126816119349417e-06, |
| "loss": 0.0167, |
| "reward": 1.53333338201046, |
| "reward_std": 0.4124399437569082, |
| "rewards/accuracy_reward": 0.5903645984828472, |
| "rewards/format_reward": 0.9429687693715095, |
| "step": 800 |
| }, |
| { |
| "completion_length": 309.3979267120361, |
| "epoch": 1.296, |
| "grad_norm": 0.1404261738061905, |
| "kl": 0.30087890625, |
| "learning_rate": 2.1013311808376683e-06, |
| "loss": 0.012, |
| "reward": 1.5450521230697631, |
| "reward_std": 0.3885490225628018, |
| "rewards/accuracy_reward": 0.5989583499729634, |
| "rewards/format_reward": 0.9460937708616257, |
| "step": 810 |
| }, |
| { |
| "completion_length": 355.4174579620361, |
| "epoch": 1.312, |
| "grad_norm": 0.21735621988773346, |
| "kl": 396.079345703125, |
| "learning_rate": 2.075637711264759e-06, |
| "loss": 15.8661, |
| "reward": 1.4781250327825546, |
| "reward_std": 0.475801320374012, |
| "rewards/accuracy_reward": 0.5750000145286321, |
| "rewards/format_reward": 0.9031250223517417, |
| "step": 820 |
| }, |
| { |
| "completion_length": 340.7921970367432, |
| "epoch": 1.328, |
| "grad_norm": 0.41135042905807495, |
| "kl": 0.43037109375, |
| "learning_rate": 2.04974462067335e-06, |
| "loss": 0.0172, |
| "reward": 1.5763021230697631, |
| "reward_std": 0.45917638950049877, |
| "rewards/accuracy_reward": 0.6552083536982536, |
| "rewards/format_reward": 0.9210937678813934, |
| "step": 830 |
| }, |
| { |
| "completion_length": 345.1257911682129, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.11835141479969025, |
| "kl": 0.3072998046875, |
| "learning_rate": 2.023660888331156e-06, |
| "loss": 0.0123, |
| "reward": 1.5898437917232513, |
| "reward_std": 0.41639630161225794, |
| "rewards/accuracy_reward": 0.6492187671363354, |
| "rewards/format_reward": 0.9406250193715096, |
| "step": 840 |
| }, |
| { |
| "completion_length": 314.01824111938475, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.11381607502698898, |
| "kl": 0.320751953125, |
| "learning_rate": 1.997395559617093e-06, |
| "loss": 0.0128, |
| "reward": 1.594791704416275, |
| "reward_std": 0.34149147048592565, |
| "rewards/accuracy_reward": 0.6294271037913859, |
| "rewards/format_reward": 0.9653646007180214, |
| "step": 850 |
| }, |
| { |
| "completion_length": 324.5421962738037, |
| "epoch": 1.376, |
| "grad_norm": 0.23615330457687378, |
| "kl": 0.45950927734375, |
| "learning_rate": 1.9709577428844986e-06, |
| "loss": 0.0184, |
| "reward": 1.5335937917232514, |
| "reward_std": 0.38021210972219704, |
| "rewards/accuracy_reward": 0.5817708492279052, |
| "rewards/format_reward": 0.9518229350447655, |
| "step": 860 |
| }, |
| { |
| "completion_length": 347.88047943115237, |
| "epoch": 1.392, |
| "grad_norm": 0.09267138689756393, |
| "kl": 0.2773193359375, |
| "learning_rate": 1.9443566063025173e-06, |
| "loss": 0.0111, |
| "reward": 1.4914062932133674, |
| "reward_std": 0.4475890576839447, |
| "rewards/accuracy_reward": 0.5679687656462192, |
| "rewards/format_reward": 0.9234375193715095, |
| "step": 870 |
| }, |
| { |
| "completion_length": 324.02969856262206, |
| "epoch": 1.408, |
| "grad_norm": 0.08601139485836029, |
| "kl": 0.30263671875, |
| "learning_rate": 1.9176013746767422e-06, |
| "loss": 0.0121, |
| "reward": 1.6111979484558105, |
| "reward_std": 0.40976997539401055, |
| "rewards/accuracy_reward": 0.665104179084301, |
| "rewards/format_reward": 0.9460937723517417, |
| "step": 880 |
| }, |
| { |
| "completion_length": 313.2916774749756, |
| "epoch": 1.424, |
| "grad_norm": 0.09291204810142517, |
| "kl": 0.31865234375, |
| "learning_rate": 1.8907013262502107e-06, |
| "loss": 0.0127, |
| "reward": 1.5414062976837157, |
| "reward_std": 0.38301597684621813, |
| "rewards/accuracy_reward": 0.5854166820645332, |
| "rewards/format_reward": 0.9559895992279053, |
| "step": 890 |
| }, |
| { |
| "completion_length": 341.86511344909667, |
| "epoch": 1.44, |
| "grad_norm": 0.08695019036531448, |
| "kl": 0.4276123046875, |
| "learning_rate": 1.8636657894858784e-06, |
| "loss": 0.0171, |
| "reward": 1.5312500506639481, |
| "reward_std": 0.3857471447438002, |
| "rewards/accuracy_reward": 0.593750013038516, |
| "rewards/format_reward": 0.9375000178813935, |
| "step": 900 |
| }, |
| { |
| "completion_length": 328.9026153564453, |
| "epoch": 1.456, |
| "grad_norm": 0.1095338836312294, |
| "kl": 1.388330078125, |
| "learning_rate": 1.8365041398316678e-06, |
| "loss": 0.0556, |
| "reward": 1.488802120089531, |
| "reward_std": 0.4116742081940174, |
| "rewards/accuracy_reward": 0.5505208489485085, |
| "rewards/format_reward": 0.9382812678813934, |
| "step": 910 |
| }, |
| { |
| "completion_length": 348.82787590026857, |
| "epoch": 1.472, |
| "grad_norm": 0.12603142857551575, |
| "kl": 0.378857421875, |
| "learning_rate": 1.8092257964692304e-06, |
| "loss": 0.0152, |
| "reward": 1.4481771260499954, |
| "reward_std": 0.49320419803261756, |
| "rewards/accuracy_reward": 0.545833345502615, |
| "rewards/format_reward": 0.9023437693715095, |
| "step": 920 |
| }, |
| { |
| "completion_length": 336.4057384490967, |
| "epoch": 1.488, |
| "grad_norm": 0.06563200801610947, |
| "kl": 0.296435546875, |
| "learning_rate": 1.781840219047541e-06, |
| "loss": 0.0119, |
| "reward": 1.5815104573965073, |
| "reward_std": 0.4233523942530155, |
| "rewards/accuracy_reward": 0.6494791835546494, |
| "rewards/format_reward": 0.9320312693715096, |
| "step": 930 |
| }, |
| { |
| "completion_length": 389.78751373291016, |
| "epoch": 1.504, |
| "grad_norm": 0.13051985204219818, |
| "kl": 0.4625, |
| "learning_rate": 1.7543569044024565e-06, |
| "loss": 0.0185, |
| "reward": 1.4177083730697633, |
| "reward_std": 0.5342824589461088, |
| "rewards/accuracy_reward": 0.5640625163912774, |
| "rewards/format_reward": 0.8536458507180213, |
| "step": 940 |
| }, |
| { |
| "completion_length": 309.4536560058594, |
| "epoch": 1.52, |
| "grad_norm": 0.2592553496360779, |
| "kl": 0.3264892578125, |
| "learning_rate": 1.7267853832633819e-06, |
| "loss": 0.0131, |
| "reward": 1.510416704416275, |
| "reward_std": 0.4337383009493351, |
| "rewards/accuracy_reward": 0.594010425824672, |
| "rewards/format_reward": 0.9164062738418579, |
| "step": 950 |
| }, |
| { |
| "completion_length": 287.96485176086424, |
| "epoch": 1.536, |
| "grad_norm": 0.1538826823234558, |
| "kl": 0.4028076171875, |
| "learning_rate": 1.6991352169481808e-06, |
| "loss": 0.0161, |
| "reward": 1.5739583730697633, |
| "reward_std": 0.3966914664953947, |
| "rewards/accuracy_reward": 0.6291666841134429, |
| "rewards/format_reward": 0.9447916880249977, |
| "step": 960 |
| }, |
| { |
| "completion_length": 341.0164161682129, |
| "epoch": 1.552, |
| "grad_norm": 0.18513712286949158, |
| "kl": 0.560888671875, |
| "learning_rate": 1.6714159940474768e-06, |
| "loss": 0.0224, |
| "reward": 1.5395833611488343, |
| "reward_std": 0.4434517964720726, |
| "rewards/accuracy_reward": 0.6177083533257246, |
| "rewards/format_reward": 0.9218750253319741, |
| "step": 970 |
| }, |
| { |
| "completion_length": 350.5575626373291, |
| "epoch": 1.568, |
| "grad_norm": 0.07839391380548477, |
| "kl": 0.440478515625, |
| "learning_rate": 1.6436373270995033e-06, |
| "loss": 0.0176, |
| "reward": 1.5424479573965073, |
| "reward_std": 0.4582442186772823, |
| "rewards/accuracy_reward": 0.6197916835546493, |
| "rewards/format_reward": 0.9226562708616257, |
| "step": 980 |
| }, |
| { |
| "completion_length": 336.3362071990967, |
| "epoch": 1.584, |
| "grad_norm": 0.11262823641300201, |
| "kl": 0.4291748046875, |
| "learning_rate": 1.61580884925664e-06, |
| "loss": 0.0172, |
| "reward": 1.5526042044162751, |
| "reward_std": 0.44057157076895237, |
| "rewards/accuracy_reward": 0.6263021004851907, |
| "rewards/format_reward": 0.9263021007180214, |
| "step": 990 |
| }, |
| { |
| "completion_length": 385.09662437438965, |
| "epoch": 1.6, |
| "grad_norm": 0.09378820657730103, |
| "kl": 40.2127197265625, |
| "learning_rate": 1.5879402109448092e-06, |
| "loss": 1.6059, |
| "reward": 1.472916704416275, |
| "reward_std": 0.49806156009435654, |
| "rewards/accuracy_reward": 0.5927083514630794, |
| "rewards/format_reward": 0.8802083507180214, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 320.54193649291994, |
| "epoch": 1.616, |
| "grad_norm": 0.19597963988780975, |
| "kl": 0.37021484375, |
| "learning_rate": 1.5600410765168756e-06, |
| "loss": 0.0148, |
| "reward": 1.566406288743019, |
| "reward_std": 0.39799929689615965, |
| "rewards/accuracy_reward": 0.6265625130385161, |
| "rewards/format_reward": 0.9398437708616256, |
| "step": 1010 |
| }, |
| { |
| "completion_length": 274.95834197998045, |
| "epoch": 1.6320000000000001, |
| "grad_norm": 0.10498908162117004, |
| "kl": 1.54697265625, |
| "learning_rate": 1.53212112090122e-06, |
| "loss": 0.0618, |
| "reward": 1.6328125432133676, |
| "reward_std": 0.311348158121109, |
| "rewards/accuracy_reward": 0.6619791840668767, |
| "rewards/format_reward": 0.9708333551883698, |
| "step": 1020 |
| }, |
| { |
| "completion_length": 319.71302909851073, |
| "epoch": 1.6480000000000001, |
| "grad_norm": 0.09473922103643417, |
| "kl": 0.5324951171875, |
| "learning_rate": 1.5041900262466447e-06, |
| "loss": 0.0213, |
| "reward": 1.5330729544162751, |
| "reward_std": 0.32746845642104744, |
| "rewards/accuracy_reward": 0.5736979268491268, |
| "rewards/format_reward": 0.9593750178813935, |
| "step": 1030 |
| }, |
| { |
| "completion_length": 355.3851661682129, |
| "epoch": 1.6640000000000001, |
| "grad_norm": 0.0937461331486702, |
| "kl": 0.4345703125, |
| "learning_rate": 1.4762574785647733e-06, |
| "loss": 0.0174, |
| "reward": 1.4458333671092987, |
| "reward_std": 0.46327271312475204, |
| "rewards/accuracy_reward": 0.5424479328095912, |
| "rewards/format_reward": 0.9033854380249977, |
| "step": 1040 |
| }, |
| { |
| "completion_length": 351.1898559570312, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 0.0828389897942543, |
| "kl": 0.48388671875, |
| "learning_rate": 1.448333164371115e-06, |
| "loss": 0.0194, |
| "reward": 1.484895871579647, |
| "reward_std": 0.47403750047087667, |
| "rewards/accuracy_reward": 0.5729166816920042, |
| "rewards/format_reward": 0.91197919100523, |
| "step": 1050 |
| }, |
| { |
| "completion_length": 354.74141693115234, |
| "epoch": 1.696, |
| "grad_norm": 0.168987438082695, |
| "kl": 0.7384521484375, |
| "learning_rate": 1.4204267673259495e-06, |
| "loss": 0.0295, |
| "reward": 1.4638021171092988, |
| "reward_std": 0.5317467883229255, |
| "rewards/accuracy_reward": 0.5744791775941849, |
| "rewards/format_reward": 0.8893229335546493, |
| "step": 1060 |
| }, |
| { |
| "completion_length": 304.412247467041, |
| "epoch": 1.712, |
| "grad_norm": 5.448803901672363, |
| "kl": 0.60498046875, |
| "learning_rate": 1.3925479648762055e-06, |
| "loss": 0.0242, |
| "reward": 1.3429687917232513, |
| "reward_std": 0.5209484387189149, |
| "rewards/accuracy_reward": 0.45364584792405366, |
| "rewards/format_reward": 0.8893229365348816, |
| "step": 1070 |
| }, |
| { |
| "completion_length": 412.5049598693848, |
| "epoch": 1.728, |
| "grad_norm": 0.12441123276948929, |
| "kl": 0.9303955078125, |
| "learning_rate": 1.364706424899492e-06, |
| "loss": 0.0372, |
| "reward": 1.1434896126389504, |
| "reward_std": 0.6584706656634808, |
| "rewards/accuracy_reward": 0.40442709531635046, |
| "rewards/format_reward": 0.7390625201165676, |
| "step": 1080 |
| }, |
| { |
| "completion_length": 301.4174571990967, |
| "epoch": 1.744, |
| "grad_norm": 0.2378871887922287, |
| "kl": 0.33017578125, |
| "learning_rate": 1.3369118023514485e-06, |
| "loss": 0.0132, |
| "reward": 1.4455729559063912, |
| "reward_std": 0.4887973885983229, |
| "rewards/accuracy_reward": 0.533854181971401, |
| "rewards/format_reward": 0.9117187678813934, |
| "step": 1090 |
| }, |
| { |
| "completion_length": 302.38151969909666, |
| "epoch": 1.76, |
| "grad_norm": 0.15944868326187134, |
| "kl": 0.4225830078125, |
| "learning_rate": 1.3091737359175766e-06, |
| "loss": 0.0169, |
| "reward": 1.437239620089531, |
| "reward_std": 0.5037642396986485, |
| "rewards/accuracy_reward": 0.5348958517191932, |
| "rewards/format_reward": 0.9023437738418579, |
| "step": 1100 |
| }, |
| { |
| "completion_length": 448.40626373291013, |
| "epoch": 1.776, |
| "grad_norm": 0.2753123342990875, |
| "kl": 0.99765625, |
| "learning_rate": 1.2815018446707142e-06, |
| "loss": 0.0399, |
| "reward": 1.141927120089531, |
| "reward_std": 0.6968318119645118, |
| "rewards/accuracy_reward": 0.4669270968064666, |
| "rewards/format_reward": 0.6750000223517418, |
| "step": 1110 |
| }, |
| { |
| "completion_length": 335.4994888305664, |
| "epoch": 1.792, |
| "grad_norm": 0.13646121323108673, |
| "kl": 0.5754638671875, |
| "learning_rate": 1.253905724735309e-06, |
| "loss": 0.023, |
| "reward": 1.4078125417232514, |
| "reward_std": 0.550966077670455, |
| "rewards/accuracy_reward": 0.5497395996004343, |
| "rewards/format_reward": 0.8580729365348816, |
| "step": 1120 |
| }, |
| { |
| "completion_length": 287.5849044799805, |
| "epoch": 1.808, |
| "grad_norm": 0.116419717669487, |
| "kl": 0.38251953125, |
| "learning_rate": 1.2263949459596545e-06, |
| "loss": 0.0153, |
| "reward": 1.5557292193174361, |
| "reward_std": 0.370727850869298, |
| "rewards/accuracy_reward": 0.600781269185245, |
| "rewards/format_reward": 0.9549479365348816, |
| "step": 1130 |
| }, |
| { |
| "completion_length": 267.22839508056643, |
| "epoch": 1.8239999999999998, |
| "grad_norm": 0.22047029435634613, |
| "kl": 0.32794189453125, |
| "learning_rate": 1.1989790485972312e-06, |
| "loss": 0.0131, |
| "reward": 1.5026041999459268, |
| "reward_std": 0.3551323272287846, |
| "rewards/accuracy_reward": 0.5403646015096456, |
| "rewards/format_reward": 0.9622396036982537, |
| "step": 1140 |
| }, |
| { |
| "completion_length": 308.1588619232178, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.3132161498069763, |
| "kl": 0.46527099609375, |
| "learning_rate": 1.171667539998318e-06, |
| "loss": 0.0186, |
| "reward": 1.5867187917232513, |
| "reward_std": 0.36549231559038164, |
| "rewards/accuracy_reward": 0.6335937686264514, |
| "rewards/format_reward": 0.9531250223517418, |
| "step": 1150 |
| }, |
| { |
| "completion_length": 339.93542861938477, |
| "epoch": 1.8559999999999999, |
| "grad_norm": 0.10203922539949417, |
| "kl": 0.5742431640625, |
| "learning_rate": 1.1444698913130093e-06, |
| "loss": 0.023, |
| "reward": 1.501041704416275, |
| "reward_std": 0.48126591108739375, |
| "rewards/accuracy_reward": 0.5880208533257246, |
| "rewards/format_reward": 0.9130208522081376, |
| "step": 1160 |
| }, |
| { |
| "completion_length": 402.17761306762696, |
| "epoch": 1.8719999999999999, |
| "grad_norm": 0.10859699547290802, |
| "kl": 0.718212890625, |
| "learning_rate": 1.1173955342067857e-06, |
| "loss": 0.0287, |
| "reward": 1.335677121579647, |
| "reward_std": 0.6192147806286812, |
| "rewards/accuracy_reward": 0.5356770996004343, |
| "rewards/format_reward": 0.8000000193715096, |
| "step": 1170 |
| }, |
| { |
| "completion_length": 330.6448013305664, |
| "epoch": 1.888, |
| "grad_norm": 0.09255196154117584, |
| "kl": 0.36551513671875, |
| "learning_rate": 1.090453857589783e-06, |
| "loss": 0.0146, |
| "reward": 1.5846354484558105, |
| "reward_std": 0.43822822347283363, |
| "rewards/accuracy_reward": 0.6536458495538682, |
| "rewards/format_reward": 0.9309895977377891, |
| "step": 1180 |
| }, |
| { |
| "completion_length": 307.15391540527344, |
| "epoch": 1.904, |
| "grad_norm": 0.10617883503437042, |
| "kl": 0.38302001953125, |
| "learning_rate": 1.0636542043608775e-06, |
| "loss": 0.0153, |
| "reward": 1.607552121579647, |
| "reward_std": 0.38010403923690317, |
| "rewards/accuracy_reward": 0.6497395992279053, |
| "rewards/format_reward": 0.9578125238418579, |
| "step": 1190 |
| }, |
| { |
| "completion_length": 332.0044364929199, |
| "epoch": 1.92, |
| "grad_norm": 0.22778630256652832, |
| "kl": 0.4427490234375, |
| "learning_rate": 1.0370058681677376e-06, |
| "loss": 0.0177, |
| "reward": 1.5015625342726708, |
| "reward_std": 0.42089375481009483, |
| "rewards/accuracy_reward": 0.5770833486691117, |
| "rewards/format_reward": 0.9244791865348816, |
| "step": 1200 |
| }, |
| { |
| "completion_length": 315.778914642334, |
| "epoch": 1.936, |
| "grad_norm": 0.15557731688022614, |
| "kl": 0.33568115234375, |
| "learning_rate": 1.0105180901839485e-06, |
| "loss": 0.0134, |
| "reward": 1.5648437857627868, |
| "reward_std": 0.4563456516712904, |
| "rewards/accuracy_reward": 0.6401041850447655, |
| "rewards/format_reward": 0.9247396036982536, |
| "step": 1210 |
| }, |
| { |
| "completion_length": 329.52292518615724, |
| "epoch": 1.952, |
| "grad_norm": 0.35561373829841614, |
| "kl": 0.4545166015625, |
| "learning_rate": 9.84200055904337e-07, |
| "loss": 0.0182, |
| "reward": 1.542187537252903, |
| "reward_std": 0.39646931514143946, |
| "rewards/accuracy_reward": 0.6049479361623525, |
| "rewards/format_reward": 0.9372396036982537, |
| "step": 1220 |
| }, |
| { |
| "completion_length": 343.41563415527344, |
| "epoch": 1.968, |
| "grad_norm": 0.3309305012226105, |
| "kl": 0.49996337890625, |
| "learning_rate": 9.58060891959604e-07, |
| "loss": 0.02, |
| "reward": 1.484375037252903, |
| "reward_std": 0.44305979572236537, |
| "rewards/accuracy_reward": 0.5619791811332107, |
| "rewards/format_reward": 0.9223958522081375, |
| "step": 1230 |
| }, |
| { |
| "completion_length": 340.05495872497556, |
| "epoch": 1.984, |
| "grad_norm": 0.17401637136936188, |
| "kl": 0.4303955078125, |
| "learning_rate": 9.321096629513677e-07, |
| "loss": 0.0172, |
| "reward": 1.4763021290302276, |
| "reward_std": 0.4661326684057713, |
| "rewards/accuracy_reward": 0.5549479309469462, |
| "rewards/format_reward": 0.9213541880249977, |
| "step": 1240 |
| }, |
| { |
| "completion_length": 335.0432403564453, |
| "epoch": 2.0, |
| "grad_norm": 0.10272786021232605, |
| "kl": 0.5320556640625, |
| "learning_rate": 9.063553683087214e-07, |
| "loss": 0.0213, |
| "reward": 1.549739608168602, |
| "reward_std": 0.46059494726359845, |
| "rewards/accuracy_reward": 0.6302083522081375, |
| "rewards/format_reward": 0.9195312708616257, |
| "step": 1250 |
| }, |
| { |
| "completion_length": 314.4708419799805, |
| "epoch": 2.016, |
| "grad_norm": 0.11312547326087952, |
| "kl": 0.3264892578125, |
| "learning_rate": 8.808069391673894e-07, |
| "loss": 0.0131, |
| "reward": 1.5486979544162751, |
| "reward_std": 0.42191329710185527, |
| "rewards/accuracy_reward": 0.6192708492279053, |
| "rewards/format_reward": 0.929427108168602, |
| "step": 1260 |
| }, |
| { |
| "completion_length": 324.0794334411621, |
| "epoch": 2.032, |
| "grad_norm": 0.19456617534160614, |
| "kl": 0.3976806640625, |
| "learning_rate": 8.55473235272566e-07, |
| "loss": 0.0159, |
| "reward": 1.5580729603767396, |
| "reward_std": 0.45290672667324544, |
| "rewards/accuracy_reward": 0.6231770989950747, |
| "rewards/format_reward": 0.934895858168602, |
| "step": 1270 |
| }, |
| { |
| "completion_length": 341.376053237915, |
| "epoch": 2.048, |
| "grad_norm": 0.1282634139060974, |
| "kl": 0.478564453125, |
| "learning_rate": 8.303630419065136e-07, |
| "loss": 0.0191, |
| "reward": 1.5843750417232514, |
| "reward_std": 0.4248688681051135, |
| "rewards/accuracy_reward": 0.6598958525806665, |
| "rewards/format_reward": 0.9244791910052299, |
| "step": 1280 |
| }, |
| { |
| "completion_length": 351.30183334350585, |
| "epoch": 2.064, |
| "grad_norm": 0.09928528219461441, |
| "kl": 0.44130859375, |
| "learning_rate": 8.054850668419788e-07, |
| "loss": 0.0177, |
| "reward": 1.515104205906391, |
| "reward_std": 0.4964366652071476, |
| "rewards/accuracy_reward": 0.6174479354172945, |
| "rewards/format_reward": 0.8976562708616257, |
| "step": 1290 |
| }, |
| { |
| "completion_length": 350.6877723693848, |
| "epoch": 2.08, |
| "grad_norm": 0.10390625149011612, |
| "kl": 0.4825927734375, |
| "learning_rate": 7.808479373224925e-07, |
| "loss": 0.0193, |
| "reward": 1.493229202926159, |
| "reward_std": 0.4417869906872511, |
| "rewards/accuracy_reward": 0.5815104300854728, |
| "rewards/format_reward": 0.9117187693715095, |
| "step": 1300 |
| }, |
| { |
| "completion_length": 334.73959312438967, |
| "epoch": 2.096, |
| "grad_norm": 0.1110270768404007, |
| "kl": 0.76192626953125, |
| "learning_rate": 7.564601970705929e-07, |
| "loss": 0.0306, |
| "reward": 1.510677120089531, |
| "reward_std": 0.4166031703352928, |
| "rewards/accuracy_reward": 0.5739583484828472, |
| "rewards/format_reward": 0.9367187678813934, |
| "step": 1310 |
| }, |
| { |
| "completion_length": 293.86120681762696, |
| "epoch": 2.112, |
| "grad_norm": 0.1455606073141098, |
| "kl": 0.4578857421875, |
| "learning_rate": 7.323303033250134e-07, |
| "loss": 0.0183, |
| "reward": 1.517187537252903, |
| "reward_std": 0.3795573392882943, |
| "rewards/accuracy_reward": 0.5739583510439843, |
| "rewards/format_reward": 0.9432291880249977, |
| "step": 1320 |
| }, |
| { |
| "completion_length": 309.1671974182129, |
| "epoch": 2.128, |
| "grad_norm": 0.19481982290744781, |
| "kl": 0.418798828125, |
| "learning_rate": 7.0846662390786e-07, |
| "loss": 0.0167, |
| "reward": 1.4848958790302276, |
| "reward_std": 0.40688302349299194, |
| "rewards/accuracy_reward": 0.5481770981103182, |
| "rewards/format_reward": 0.9367187663912773, |
| "step": 1330 |
| }, |
| { |
| "completion_length": 330.8593837738037, |
| "epoch": 2.144, |
| "grad_norm": 0.09199036657810211, |
| "kl": 0.44635009765625, |
| "learning_rate": 6.848774343228007e-07, |
| "loss": 0.0179, |
| "reward": 1.4914062917232513, |
| "reward_std": 0.45433705151081083, |
| "rewards/accuracy_reward": 0.5721354318782688, |
| "rewards/format_reward": 0.9192708536982537, |
| "step": 1340 |
| }, |
| { |
| "completion_length": 338.9346446990967, |
| "epoch": 2.16, |
| "grad_norm": 0.10064072906970978, |
| "kl": 0.5522216796875, |
| "learning_rate": 6.615709148852632e-07, |
| "loss": 0.0221, |
| "reward": 1.5203125387430192, |
| "reward_std": 0.4799085700884461, |
| "rewards/accuracy_reward": 0.6000000141561032, |
| "rewards/format_reward": 0.9203125193715096, |
| "step": 1350 |
| }, |
| { |
| "completion_length": 322.1612045288086, |
| "epoch": 2.176, |
| "grad_norm": 0.10341834276914597, |
| "kl": 0.4356201171875, |
| "learning_rate": 6.385551478856481e-07, |
| "loss": 0.0174, |
| "reward": 1.5250000357627869, |
| "reward_std": 0.42158898171037434, |
| "rewards/accuracy_reward": 0.5856770953163505, |
| "rewards/format_reward": 0.9393229365348816, |
| "step": 1360 |
| }, |
| { |
| "completion_length": 319.66303024291994, |
| "epoch": 2.192, |
| "grad_norm": 0.08911896497011185, |
| "kl": 0.33355712890625, |
| "learning_rate": 6.158381147865313e-07, |
| "loss": 0.0134, |
| "reward": 1.6200521171092988, |
| "reward_std": 0.3689839508384466, |
| "rewards/accuracy_reward": 0.6757812671363354, |
| "rewards/format_reward": 0.9442708596587182, |
| "step": 1370 |
| }, |
| { |
| "completion_length": 306.39219665527344, |
| "epoch": 2.208, |
| "grad_norm": 0.07315315306186676, |
| "kl": 0.65413818359375, |
| "learning_rate": 5.934276934548348e-07, |
| "loss": 0.0262, |
| "reward": 1.5802083760499954, |
| "reward_std": 0.35810978449881076, |
| "rewards/accuracy_reward": 0.6257812708616257, |
| "rewards/format_reward": 0.9544270992279053, |
| "step": 1380 |
| }, |
| { |
| "completion_length": 351.4132900238037, |
| "epoch": 2.224, |
| "grad_norm": 0.08047836273908615, |
| "kl": 0.442431640625, |
| "learning_rate": 5.713316554299203e-07, |
| "loss": 0.0177, |
| "reward": 1.5190104573965073, |
| "reward_std": 0.4470030918717384, |
| "rewards/accuracy_reward": 0.5908854331821203, |
| "rewards/format_reward": 0.9281250193715096, |
| "step": 1390 |
| }, |
| { |
| "completion_length": 360.53099937438964, |
| "epoch": 2.24, |
| "grad_norm": 0.1132533922791481, |
| "kl": 0.3490234375, |
| "learning_rate": 5.495576632285572e-07, |
| "loss": 0.014, |
| "reward": 1.596875047683716, |
| "reward_std": 0.42701252046972515, |
| "rewards/accuracy_reward": 0.6643229309469462, |
| "rewards/format_reward": 0.9325521036982536, |
| "step": 1400 |
| }, |
| { |
| "completion_length": 329.94688377380373, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 0.09439625591039658, |
| "kl": 0.3957763671875, |
| "learning_rate": 5.281132676876946e-07, |
| "loss": 0.0158, |
| "reward": 1.559114620089531, |
| "reward_std": 0.4072124421596527, |
| "rewards/accuracy_reward": 0.6242187682539224, |
| "rewards/format_reward": 0.9348958551883697, |
| "step": 1410 |
| }, |
| { |
| "completion_length": 331.89454040527346, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 0.2693132162094116, |
| "kl": 0.326708984375, |
| "learning_rate": 5.070059053459672e-07, |
| "loss": 0.0131, |
| "reward": 1.5700521126389504, |
| "reward_std": 0.41562592387199404, |
| "rewards/accuracy_reward": 0.6338541865348816, |
| "rewards/format_reward": 0.9361979395151139, |
| "step": 1420 |
| }, |
| { |
| "completion_length": 315.82110328674315, |
| "epoch": 2.288, |
| "grad_norm": 0.31827375292778015, |
| "kl": 0.413037109375, |
| "learning_rate": 4.862428958648314e-07, |
| "loss": 0.0165, |
| "reward": 1.5156250312924384, |
| "reward_std": 0.4274031076580286, |
| "rewards/accuracy_reward": 0.5851562664844095, |
| "rewards/format_reward": 0.9304687708616257, |
| "step": 1430 |
| }, |
| { |
| "completion_length": 309.80261077880857, |
| "epoch": 2.304, |
| "grad_norm": 0.09511108696460724, |
| "kl": 0.5197265625, |
| "learning_rate": 4.6583143949023923e-07, |
| "loss": 0.0208, |
| "reward": 1.5911458671092986, |
| "reward_std": 0.41279490403831004, |
| "rewards/accuracy_reward": 0.6476562671363354, |
| "rewards/format_reward": 0.9434896051883698, |
| "step": 1440 |
| }, |
| { |
| "completion_length": 317.4851661682129, |
| "epoch": 2.32, |
| "grad_norm": 0.0870954692363739, |
| "kl": 0.705419921875, |
| "learning_rate": 4.4577861455571625e-07, |
| "loss": 0.0282, |
| "reward": 1.541927120089531, |
| "reward_std": 0.44012959226965903, |
| "rewards/accuracy_reward": 0.6156250163912773, |
| "rewards/format_reward": 0.9263021036982536, |
| "step": 1450 |
| }, |
| { |
| "completion_length": 334.9497497558594, |
| "epoch": 2.336, |
| "grad_norm": 0.10319157689809799, |
| "kl": 0.32333984375, |
| "learning_rate": 4.2609137502772247e-07, |
| "loss": 0.0129, |
| "reward": 1.5619792103767396, |
| "reward_std": 0.4711644366383553, |
| "rewards/accuracy_reward": 0.6453125212341547, |
| "rewards/format_reward": 0.9166666910052299, |
| "step": 1460 |
| }, |
| { |
| "completion_length": 320.07735481262205, |
| "epoch": 2.352, |
| "grad_norm": 0.1767028421163559, |
| "kl": 0.49638671875, |
| "learning_rate": 4.0677654809413873e-07, |
| "loss": 0.0199, |
| "reward": 1.5223958685994148, |
| "reward_std": 0.40760069973766805, |
| "rewards/accuracy_reward": 0.5927083479939028, |
| "rewards/format_reward": 0.9296875223517418, |
| "step": 1470 |
| }, |
| { |
| "completion_length": 327.24089736938475, |
| "epoch": 2.368, |
| "grad_norm": 0.13433118164539337, |
| "kl": 0.6244384765625, |
| "learning_rate": 3.878408317967177e-07, |
| "loss": 0.025, |
| "reward": 1.5828125387430192, |
| "reward_std": 0.4388178702443838, |
| "rewards/accuracy_reward": 0.6481770999729634, |
| "rewards/format_reward": 0.9346354365348816, |
| "step": 1480 |
| }, |
| { |
| "completion_length": 366.6963653564453, |
| "epoch": 2.384, |
| "grad_norm": 0.10366253554821014, |
| "kl": 0.475390625, |
| "learning_rate": 3.6929079270832173e-07, |
| "loss": 0.019, |
| "reward": 1.441927120089531, |
| "reward_std": 0.5121089570224285, |
| "rewards/accuracy_reward": 0.5583333490416408, |
| "rewards/format_reward": 0.8835937723517417, |
| "step": 1490 |
| }, |
| { |
| "completion_length": 362.4604267120361, |
| "epoch": 2.4, |
| "grad_norm": 0.24114616215229034, |
| "kl": 0.5971923828125, |
| "learning_rate": 3.511328636557509e-07, |
| "loss": 0.0239, |
| "reward": 1.459895870089531, |
| "reward_std": 0.5535307381302118, |
| "rewards/accuracy_reward": 0.5890625186264515, |
| "rewards/format_reward": 0.8708333536982537, |
| "step": 1500 |
| }, |
| { |
| "completion_length": 378.812771987915, |
| "epoch": 2.416, |
| "grad_norm": 0.5140863060951233, |
| "kl": 0.47861328125, |
| "learning_rate": 3.3337334148895143e-07, |
| "loss": 0.0191, |
| "reward": 1.4708333671092988, |
| "reward_std": 0.5573876537382603, |
| "rewards/accuracy_reward": 0.604427096247673, |
| "rewards/format_reward": 0.8664062723517418, |
| "step": 1510 |
| }, |
| { |
| "completion_length": 295.7984432220459, |
| "epoch": 2.432, |
| "grad_norm": 0.0825420618057251, |
| "kl": 0.3201416015625, |
| "learning_rate": 3.160183848973795e-07, |
| "loss": 0.0128, |
| "reward": 1.6070312827825546, |
| "reward_std": 0.3443769380450249, |
| "rewards/accuracy_reward": 0.6369791872799396, |
| "rewards/format_reward": 0.9700521051883697, |
| "step": 1520 |
| }, |
| { |
| "completion_length": 308.06641616821287, |
| "epoch": 2.448, |
| "grad_norm": 0.13088344037532806, |
| "kl": 0.3154541015625, |
| "learning_rate": 2.990740122742765e-07, |
| "loss": 0.0126, |
| "reward": 1.5820313006639481, |
| "reward_std": 0.32235752418637276, |
| "rewards/accuracy_reward": 0.6179687663912773, |
| "rewards/format_reward": 0.9640625208616257, |
| "step": 1530 |
| }, |
| { |
| "completion_length": 313.82579040527344, |
| "epoch": 2.464, |
| "grad_norm": 0.13155396282672882, |
| "kl": 0.2889404296875, |
| "learning_rate": 2.82546099629595e-07, |
| "loss": 0.0116, |
| "reward": 1.5177083805203437, |
| "reward_std": 0.38207798628136513, |
| "rewards/accuracy_reward": 0.5718750163912774, |
| "rewards/format_reward": 0.9458333477377892, |
| "step": 1540 |
| }, |
| { |
| "completion_length": 335.9320430755615, |
| "epoch": 2.48, |
| "grad_norm": 0.13609492778778076, |
| "kl": 0.692578125, |
| "learning_rate": 2.664403785523046e-07, |
| "loss": 0.0277, |
| "reward": 1.5653646260499954, |
| "reward_std": 0.457539339363575, |
| "rewards/accuracy_reward": 0.6510416842997074, |
| "rewards/format_reward": 0.9143229365348816, |
| "step": 1550 |
| }, |
| { |
| "completion_length": 335.92474937438965, |
| "epoch": 2.496, |
| "grad_norm": 0.10897226631641388, |
| "kl": 0.4633544921875, |
| "learning_rate": 2.507624342227748e-07, |
| "loss": 0.0185, |
| "reward": 1.5093750327825546, |
| "reward_std": 0.48894391059875486, |
| "rewards/accuracy_reward": 0.6005208481103181, |
| "rewards/format_reward": 0.9088541865348816, |
| "step": 1560 |
| }, |
| { |
| "completion_length": 346.82943687438967, |
| "epoch": 2.512, |
| "grad_norm": 0.3290146589279175, |
| "kl": 0.521923828125, |
| "learning_rate": 2.3551770347593443e-07, |
| "loss": 0.0209, |
| "reward": 1.5062500298023225, |
| "reward_std": 0.49438975416123865, |
| "rewards/accuracy_reward": 0.6013021029531955, |
| "rewards/format_reward": 0.9049479335546493, |
| "step": 1570 |
| }, |
| { |
| "completion_length": 297.566414642334, |
| "epoch": 2.528, |
| "grad_norm": 0.11054307222366333, |
| "kl": 0.3962158203125, |
| "learning_rate": 2.2071147291587318e-07, |
| "loss": 0.0158, |
| "reward": 1.5473958730697632, |
| "reward_std": 0.39565564077347515, |
| "rewards/accuracy_reward": 0.6039062641561032, |
| "rewards/format_reward": 0.9434896051883698, |
| "step": 1580 |
| }, |
| { |
| "completion_length": 284.1966236114502, |
| "epoch": 2.544, |
| "grad_norm": 0.13115550577640533, |
| "kl": 0.2541015625, |
| "learning_rate": 2.0634887708253957e-07, |
| "loss": 0.0102, |
| "reward": 1.6270833700895309, |
| "reward_std": 0.37217756249010564, |
| "rewards/accuracy_reward": 0.669791679829359, |
| "rewards/format_reward": 0.9572916865348816, |
| "step": 1590 |
| }, |
| { |
| "completion_length": 299.0880302429199, |
| "epoch": 2.56, |
| "grad_norm": 0.17235107719898224, |
| "kl": 0.345361328125, |
| "learning_rate": 1.9243489667117404e-07, |
| "loss": 0.0138, |
| "reward": 1.496354204416275, |
| "reward_std": 0.4093624118715525, |
| "rewards/accuracy_reward": 0.559635432716459, |
| "rewards/format_reward": 0.9367187663912773, |
| "step": 1600 |
| }, |
| { |
| "completion_length": 347.20313453674316, |
| "epoch": 2.576, |
| "grad_norm": 0.0926247164607048, |
| "kl": 0.487939453125, |
| "learning_rate": 1.7897435680509044e-07, |
| "loss": 0.0195, |
| "reward": 1.5723958775401115, |
| "reward_std": 0.47892273142933844, |
| "rewards/accuracy_reward": 0.6585937669500709, |
| "rewards/format_reward": 0.9138020992279052, |
| "step": 1610 |
| }, |
| { |
| "completion_length": 353.61902084350584, |
| "epoch": 2.592, |
| "grad_norm": 0.10807590931653976, |
| "kl": 509.08798828125, |
| "learning_rate": 1.6597192536240918e-07, |
| "loss": 20.4646, |
| "reward": 1.5908854603767395, |
| "reward_std": 0.398144971113652, |
| "rewards/accuracy_reward": 0.649739598762244, |
| "rewards/format_reward": 0.9411458477377892, |
| "step": 1620 |
| }, |
| { |
| "completion_length": 305.7497501373291, |
| "epoch": 2.608, |
| "grad_norm": 0.05855726823210716, |
| "kl": 0.271142578125, |
| "learning_rate": 1.5343211135731894e-07, |
| "loss": 0.0108, |
| "reward": 1.616406297683716, |
| "reward_std": 0.3559125494211912, |
| "rewards/accuracy_reward": 0.6567708469927311, |
| "rewards/format_reward": 0.9596354380249977, |
| "step": 1630 |
| }, |
| { |
| "completion_length": 330.2669376373291, |
| "epoch": 2.624, |
| "grad_norm": 0.2727009654045105, |
| "kl": 0.37744140625, |
| "learning_rate": 1.413592633764292e-07, |
| "loss": 0.0151, |
| "reward": 1.6255208760499955, |
| "reward_std": 0.4100566331297159, |
| "rewards/accuracy_reward": 0.6791666850447655, |
| "rewards/format_reward": 0.9463541865348816, |
| "step": 1640 |
| }, |
| { |
| "completion_length": 314.27292404174807, |
| "epoch": 2.64, |
| "grad_norm": 0.18453815579414368, |
| "kl": 0.3802490234375, |
| "learning_rate": 1.2975756807075945e-07, |
| "loss": 0.0152, |
| "reward": 1.5565104544162751, |
| "reward_std": 0.35327375028282404, |
| "rewards/accuracy_reward": 0.6018229356966913, |
| "rewards/format_reward": 0.9546875223517418, |
| "step": 1650 |
| }, |
| { |
| "completion_length": 310.15131092071533, |
| "epoch": 2.656, |
| "grad_norm": 0.16150705516338348, |
| "kl": 0.432373046875, |
| "learning_rate": 1.1863104870387903e-07, |
| "loss": 0.0173, |
| "reward": 1.5888021260499954, |
| "reward_std": 0.389334324374795, |
| "rewards/accuracy_reward": 0.6361979298293591, |
| "rewards/format_reward": 0.9526041895151138, |
| "step": 1660 |
| }, |
| { |
| "completion_length": 343.99792633056643, |
| "epoch": 2.672, |
| "grad_norm": 0.06966466456651688, |
| "kl": 0.6599609375, |
| "learning_rate": 1.0798356375671254e-07, |
| "loss": 0.0264, |
| "reward": 1.5226562947034836, |
| "reward_std": 0.42317282818257806, |
| "rewards/accuracy_reward": 0.5893229354172945, |
| "rewards/format_reward": 0.9333333522081375, |
| "step": 1670 |
| }, |
| { |
| "completion_length": 347.6976665496826, |
| "epoch": 2.6879999999999997, |
| "grad_norm": 0.08115794509649277, |
| "kl": 0.47548828125, |
| "learning_rate": 9.781880558948619e-08, |
| "loss": 0.019, |
| "reward": 1.5763021200895309, |
| "reward_std": 0.42064056508243086, |
| "rewards/accuracy_reward": 0.6372396051883698, |
| "rewards/format_reward": 0.9390625178813934, |
| "step": 1680 |
| }, |
| { |
| "completion_length": 333.13386459350585, |
| "epoch": 2.7039999999999997, |
| "grad_norm": 0.31740495562553406, |
| "kl": 0.3748291015625, |
| "learning_rate": 8.81402991612813e-08, |
| "loss": 0.015, |
| "reward": 1.6104167073965072, |
| "reward_std": 0.4134950406849384, |
| "rewards/accuracy_reward": 0.6638021025806665, |
| "rewards/format_reward": 0.9466146036982537, |
| "step": 1690 |
| }, |
| { |
| "completion_length": 349.80261688232423, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 0.15794631838798523, |
| "kl": 0.6106201171875, |
| "learning_rate": 7.895140080764201e-08, |
| "loss": 0.0244, |
| "reward": 1.4783854544162751, |
| "reward_std": 0.5441196266561746, |
| "rewards/accuracy_reward": 0.5888020968064666, |
| "rewards/format_reward": 0.8895833522081376, |
| "step": 1700 |
| }, |
| { |
| "completion_length": 341.8935005187988, |
| "epoch": 2.7359999999999998, |
| "grad_norm": 0.13493210077285767, |
| "kl": 0.650341796875, |
| "learning_rate": 7.02552970766569e-08, |
| "loss": 0.026, |
| "reward": 1.546093788743019, |
| "reward_std": 0.5227277502417564, |
| "rewards/accuracy_reward": 0.6377604328095913, |
| "rewards/format_reward": 0.9083333551883698, |
| "step": 1710 |
| }, |
| { |
| "completion_length": 316.40026931762696, |
| "epoch": 2.752, |
| "grad_norm": 0.17133887112140656, |
| "kl": 0.316015625, |
| "learning_rate": 6.205500362391853e-08, |
| "loss": 0.0126, |
| "reward": 1.5994792133569717, |
| "reward_std": 0.39476869329810144, |
| "rewards/accuracy_reward": 0.6406250201165676, |
| "rewards/format_reward": 0.9588541895151138, |
| "step": 1720 |
| }, |
| { |
| "completion_length": 309.6942783355713, |
| "epoch": 2.768, |
| "grad_norm": 0.07311931252479553, |
| "kl": 0.47498779296875, |
| "learning_rate": 5.435336416674985e-08, |
| "loss": 0.019, |
| "reward": 1.528906300663948, |
| "reward_std": 0.34818257931619884, |
| "rewards/accuracy_reward": 0.5666666861623526, |
| "rewards/format_reward": 0.9622396036982537, |
| "step": 1730 |
| }, |
| { |
| "completion_length": 317.2898525238037, |
| "epoch": 2.784, |
| "grad_norm": 0.15522390604019165, |
| "kl": 0.29674072265625, |
| "learning_rate": 4.7153049498051546e-08, |
| "loss": 0.0119, |
| "reward": 1.6304687857627869, |
| "reward_std": 0.35606151502579453, |
| "rewards/accuracy_reward": 0.6640625163912773, |
| "rewards/format_reward": 0.9664062723517418, |
| "step": 1740 |
| }, |
| { |
| "completion_length": 306.6278762817383, |
| "epoch": 2.8, |
| "grad_norm": 0.09380000084638596, |
| "kl": 0.2547119140625, |
| "learning_rate": 4.0456556560117874e-08, |
| "loss": 0.0102, |
| "reward": 1.6122396111488342, |
| "reward_std": 0.3659005742520094, |
| "rewards/accuracy_reward": 0.6453125214204192, |
| "rewards/format_reward": 0.9669271066784859, |
| "step": 1750 |
| }, |
| { |
| "completion_length": 322.19063262939454, |
| "epoch": 2.816, |
| "grad_norm": 0.16257521510124207, |
| "kl": 0.37265625, |
| "learning_rate": 3.426620757874266e-08, |
| "loss": 0.0149, |
| "reward": 1.596875038743019, |
| "reward_std": 0.3646328579634428, |
| "rewards/accuracy_reward": 0.6367187672294676, |
| "rewards/format_reward": 0.9601562708616257, |
| "step": 1760 |
| }, |
| { |
| "completion_length": 369.0770942687988, |
| "epoch": 2.832, |
| "grad_norm": 1.67650306224823, |
| "kl": 0.4777587890625, |
| "learning_rate": 2.858414925791014e-08, |
| "loss": 0.0191, |
| "reward": 1.5520833730697632, |
| "reward_std": 0.5112588044255972, |
| "rewards/accuracy_reward": 0.6450520984828472, |
| "rewards/format_reward": 0.9070312693715096, |
| "step": 1770 |
| }, |
| { |
| "completion_length": 395.6838626861572, |
| "epoch": 2.848, |
| "grad_norm": 0.08218298107385635, |
| "kl": 0.565185546875, |
| "learning_rate": 2.3412352035357797e-08, |
| "loss": 0.0226, |
| "reward": 1.4523437827825547, |
| "reward_std": 0.5854175344109536, |
| "rewards/accuracy_reward": 0.5934895996004343, |
| "rewards/format_reward": 0.8588541835546494, |
| "step": 1780 |
| }, |
| { |
| "completion_length": 364.1682384490967, |
| "epoch": 2.864, |
| "grad_norm": 0.1337290108203888, |
| "kl": 0.66181640625, |
| "learning_rate": 1.87526093992616e-08, |
| "loss": 0.0265, |
| "reward": 1.4619792073965072, |
| "reward_std": 0.4972026661038399, |
| "rewards/accuracy_reward": 0.5640625156462192, |
| "rewards/format_reward": 0.8979166850447655, |
| "step": 1790 |
| }, |
| { |
| "completion_length": 313.5166757583618, |
| "epoch": 2.88, |
| "grad_norm": 0.4620625972747803, |
| "kl": 0.6297607421875, |
| "learning_rate": 1.4606537266287522e-08, |
| "loss": 0.0252, |
| "reward": 1.5656250357627868, |
| "reward_std": 0.42052843160927295, |
| "rewards/accuracy_reward": 0.6260416898876429, |
| "rewards/format_reward": 0.9395833551883698, |
| "step": 1800 |
| }, |
| { |
| "completion_length": 283.5046951293945, |
| "epoch": 2.896, |
| "grad_norm": 0.14225490391254425, |
| "kl": 0.53448486328125, |
| "learning_rate": 1.0975573421218632e-08, |
| "loss": 0.0214, |
| "reward": 1.6221354603767395, |
| "reward_std": 0.35582950357347726, |
| "rewards/accuracy_reward": 0.6682291835546493, |
| "rewards/format_reward": 0.9539062708616257, |
| "step": 1810 |
| }, |
| { |
| "completion_length": 329.02943687438966, |
| "epoch": 2.912, |
| "grad_norm": 0.08317083865404129, |
| "kl": 0.330859375, |
| "learning_rate": 7.860977018357751e-09, |
| "loss": 0.0132, |
| "reward": 1.6283854573965073, |
| "reward_std": 0.393698890786618, |
| "rewards/accuracy_reward": 0.6736979335546494, |
| "rewards/format_reward": 0.9546875268220901, |
| "step": 1820 |
| }, |
| { |
| "completion_length": 336.3953224182129, |
| "epoch": 2.928, |
| "grad_norm": 0.1257566511631012, |
| "kl": 0.5287353515625, |
| "learning_rate": 5.263828144873917e-09, |
| "loss": 0.0212, |
| "reward": 1.5440104603767395, |
| "reward_std": 0.4020043730735779, |
| "rewards/accuracy_reward": 0.5953125171363354, |
| "rewards/format_reward": 0.9486979380249977, |
| "step": 1830 |
| }, |
| { |
| "completion_length": 359.2218837738037, |
| "epoch": 2.944, |
| "grad_norm": 0.20077994465827942, |
| "kl": 0.4537841796875, |
| "learning_rate": 3.1850274462484896e-09, |
| "loss": 0.0181, |
| "reward": 1.5343750327825547, |
| "reward_std": 0.40093739330768585, |
| "rewards/accuracy_reward": 0.5877604320645332, |
| "rewards/format_reward": 0.9466146022081375, |
| "step": 1840 |
| }, |
| { |
| "completion_length": 352.94141693115233, |
| "epoch": 2.96, |
| "grad_norm": 0.08914138376712799, |
| "kl": 0.3108154296875, |
| "learning_rate": 1.6252958139456597e-09, |
| "loss": 0.0124, |
| "reward": 1.654687538743019, |
| "reward_std": 0.3981630776077509, |
| "rewards/accuracy_reward": 0.7039062689989806, |
| "rewards/format_reward": 0.9507812678813934, |
| "step": 1850 |
| }, |
| { |
| "completion_length": 343.89792518615724, |
| "epoch": 2.976, |
| "grad_norm": 0.07618039101362228, |
| "kl": 0.2248779296875, |
| "learning_rate": 5.85174135421418e-10, |
| "loss": 0.009, |
| "reward": 1.6197917073965074, |
| "reward_std": 0.3660704350098968, |
| "rewards/accuracy_reward": 0.6570312656462193, |
| "rewards/format_reward": 0.9627604365348816, |
| "step": 1860 |
| }, |
| { |
| "completion_length": 344.7703212738037, |
| "epoch": 2.992, |
| "grad_norm": 0.16367003321647644, |
| "kl": 0.4246337890625, |
| "learning_rate": 6.502310655193133e-11, |
| "loss": 0.017, |
| "reward": 1.587239620089531, |
| "reward_std": 0.4006639949977398, |
| "rewards/accuracy_reward": 0.6445312701165676, |
| "rewards/format_reward": 0.9427083522081375, |
| "step": 1870 |
| }, |
| { |
| "completion_length": 341.2281364440918, |
| "epoch": 3.0, |
| "kl": 0.54423828125, |
| "reward": 1.5484375357627869, |
| "reward_std": 0.4545022763311863, |
| "rewards/accuracy_reward": 0.6192708536982536, |
| "rewards/format_reward": 0.9291666805744171, |
| "step": 1875, |
| "total_flos": 0.0, |
| "train_loss": 0.11509215348958969, |
| "train_runtime": 61764.0604, |
| "train_samples_per_second": 0.364, |
| "train_steps_per_second": 0.03 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1875, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|