| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8028904054596547, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 155.3333339691162, | |
| "epoch": 0.008028904054596548, | |
| "grad_norm": 0.8266991671440796, | |
| "kl": 0.0, | |
| "learning_rate": 3.0000000000000004e-08, | |
| "loss": 0.0, | |
| "reward": 2.3694444835186004, | |
| "reward_std": 0.706648226082325, | |
| "rewards/accuracy_reward_log": 1.4888889163732528, | |
| "rewards/format_number_reward": 0.4138888940215111, | |
| "rewards/format_reasoning_reward": 0.46666666865348816, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 177.53333435058593, | |
| "epoch": 0.016057808109193095, | |
| "grad_norm": 0.853670554176349, | |
| "kl": 0.0, | |
| "learning_rate": 6.000000000000001e-08, | |
| "loss": 0.0, | |
| "reward": 2.3472222685813904, | |
| "reward_std": 0.6901098385453224, | |
| "rewards/accuracy_reward_log": 1.500000025331974, | |
| "rewards/format_number_reward": 0.4000000059604645, | |
| "rewards/format_reasoning_reward": 0.4472222253680229, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 160.18333435058594, | |
| "epoch": 0.02408671216378964, | |
| "grad_norm": 0.6327729129725634, | |
| "kl": 0.00010757744312286376, | |
| "learning_rate": 9e-08, | |
| "loss": 0.0, | |
| "reward": 2.4138889491558073, | |
| "reward_std": 0.5638244189321995, | |
| "rewards/accuracy_reward_log": 1.5222222447395324, | |
| "rewards/format_number_reward": 0.4194444492459297, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 167.59444580078124, | |
| "epoch": 0.03211561621838619, | |
| "grad_norm": 0.6798573023319466, | |
| "kl": 0.00010120868682861328, | |
| "learning_rate": 1.2000000000000002e-07, | |
| "loss": 0.0, | |
| "reward": 2.20833335518837, | |
| "reward_std": 0.5129089742898941, | |
| "rewards/accuracy_reward_log": 1.333333359658718, | |
| "rewards/format_number_reward": 0.4027777835726738, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 147.56666870117186, | |
| "epoch": 0.04014452027298274, | |
| "grad_norm": 0.5872669908882053, | |
| "kl": 9.968876838684082e-05, | |
| "learning_rate": 1.5000000000000002e-07, | |
| "loss": 0.0, | |
| "reward": 2.4361111402511595, | |
| "reward_std": 0.48325310088694096, | |
| "rewards/accuracy_reward_log": 1.5333333551883697, | |
| "rewards/format_number_reward": 0.42222222685813904, | |
| "rewards/format_reasoning_reward": 0.48055555671453476, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 168.4333324432373, | |
| "epoch": 0.04817342432757928, | |
| "grad_norm": 0.842231210713151, | |
| "kl": 0.000108298659324646, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": 1.9972222298383713, | |
| "reward_std": 0.6861602704972029, | |
| "rewards/accuracy_reward_log": 1.188888917118311, | |
| "rewards/format_number_reward": 0.35555556118488313, | |
| "rewards/format_reasoning_reward": 0.45277778059244156, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 161.47222175598145, | |
| "epoch": 0.056202328382175835, | |
| "grad_norm": 0.6301911132102126, | |
| "kl": 0.00010362863540649414, | |
| "learning_rate": 2.1000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 2.344444477558136, | |
| "reward_std": 0.5787256445735693, | |
| "rewards/accuracy_reward_log": 1.455555585026741, | |
| "rewards/format_number_reward": 0.41111111640930176, | |
| "rewards/format_reasoning_reward": 0.47777777910232544, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 178.705558013916, | |
| "epoch": 0.06423123243677238, | |
| "grad_norm": 0.7904000046666478, | |
| "kl": 0.00015439987182617188, | |
| "learning_rate": 2.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 2.033333358168602, | |
| "reward_std": 0.6374398373067379, | |
| "rewards/accuracy_reward_log": 1.2000000223517417, | |
| "rewards/format_number_reward": 0.37777778320014477, | |
| "rewards/format_reasoning_reward": 0.4555555582046509, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 154.6055564880371, | |
| "epoch": 0.07226013649136893, | |
| "grad_norm": 0.8113384704385274, | |
| "kl": 0.00010915249586105347, | |
| "learning_rate": 2.7e-07, | |
| "loss": 0.0, | |
| "reward": 2.4722222566604612, | |
| "reward_std": 0.38092764765024184, | |
| "rewards/accuracy_reward_log": 1.5777777969837188, | |
| "rewards/format_number_reward": 0.4194444492459297, | |
| "rewards/format_reasoning_reward": 0.4750000014901161, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 155.86111221313476, | |
| "epoch": 0.08028904054596547, | |
| "grad_norm": 0.6734680070959974, | |
| "kl": 0.00010448098182678223, | |
| "learning_rate": 3.0000000000000004e-07, | |
| "loss": 0.0, | |
| "reward": 2.3611111462116243, | |
| "reward_std": 0.5814844127744436, | |
| "rewards/accuracy_reward_log": 1.4777778029441833, | |
| "rewards/format_number_reward": 0.413888893276453, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 167.8611110687256, | |
| "epoch": 0.08831794460056203, | |
| "grad_norm": 0.6192549717392329, | |
| "kl": 0.0002144932746887207, | |
| "learning_rate": 3.3e-07, | |
| "loss": 0.0, | |
| "reward": 2.3972222745418548, | |
| "reward_std": 0.5403781462460756, | |
| "rewards/accuracy_reward_log": 1.511111134290695, | |
| "rewards/format_number_reward": 0.4194444492459297, | |
| "rewards/format_reasoning_reward": 0.46666666865348816, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 159.29444694519043, | |
| "epoch": 0.09634684865515857, | |
| "grad_norm": 0.6219588398194654, | |
| "kl": 0.00014747381210327147, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": 2.22777781188488, | |
| "reward_std": 0.6599382009357214, | |
| "rewards/accuracy_reward_log": 1.3777778007090091, | |
| "rewards/format_number_reward": 0.3805555604398251, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 163.36666717529297, | |
| "epoch": 0.10437575270975512, | |
| "grad_norm": 0.5303599869712791, | |
| "kl": 0.0001802980899810791, | |
| "learning_rate": 3.9e-07, | |
| "loss": 0.0, | |
| "reward": 2.388888931274414, | |
| "reward_std": 0.38289184793829917, | |
| "rewards/accuracy_reward_log": 1.4888889119029045, | |
| "rewards/format_number_reward": 0.4111111145466566, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 153.45000038146972, | |
| "epoch": 0.11240465676435167, | |
| "grad_norm": 0.5530377333718409, | |
| "kl": 0.0004338264465332031, | |
| "learning_rate": 4.2000000000000006e-07, | |
| "loss": 0.0, | |
| "reward": 2.569444465637207, | |
| "reward_std": 0.42339018881320956, | |
| "rewards/accuracy_reward_log": 1.6333333551883698, | |
| "rewards/format_number_reward": 0.45277778059244156, | |
| "rewards/format_reasoning_reward": 0.4833333343267441, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 165.07777938842773, | |
| "epoch": 0.12043356081894821, | |
| "grad_norm": 0.49102029519105816, | |
| "kl": 0.0006180524826049805, | |
| "learning_rate": 4.5e-07, | |
| "loss": 0.0, | |
| "reward": 2.597222250699997, | |
| "reward_std": 0.3478126596659422, | |
| "rewards/accuracy_reward_log": 1.6444444566965104, | |
| "rewards/format_number_reward": 0.46388889104127884, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 164.92778129577636, | |
| "epoch": 0.12846246487354476, | |
| "grad_norm": 0.6338598904220616, | |
| "kl": 0.0007488012313842773, | |
| "learning_rate": 4.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 2.319444465637207, | |
| "reward_std": 0.5170031324028969, | |
| "rewards/accuracy_reward_log": 1.4222222447395325, | |
| "rewards/format_number_reward": 0.4277777820825577, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 156.144446182251, | |
| "epoch": 0.13649136892814132, | |
| "grad_norm": 0.5574851133438189, | |
| "kl": 0.0012537002563476562, | |
| "learning_rate": 5.100000000000001e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5555556058883666, | |
| "reward_std": 0.44343185126781465, | |
| "rewards/accuracy_reward_log": 1.6222222417593002, | |
| "rewards/format_number_reward": 0.45000000223517417, | |
| "rewards/format_reasoning_reward": 0.4833333343267441, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 153.38888931274414, | |
| "epoch": 0.14452027298273787, | |
| "grad_norm": 0.6029065735373917, | |
| "kl": 0.001799297332763672, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0001, | |
| "reward": 2.4750000178813933, | |
| "reward_std": 0.5368539825081825, | |
| "rewards/accuracy_reward_log": 1.5444444686174392, | |
| "rewards/format_number_reward": 0.45000000298023224, | |
| "rewards/format_reasoning_reward": 0.48055555671453476, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 170.45000228881835, | |
| "epoch": 0.1525491770373344, | |
| "grad_norm": 0.6238637844164427, | |
| "kl": 0.0022980213165283204, | |
| "learning_rate": 5.7e-07, | |
| "loss": 0.0001, | |
| "reward": 2.3472222328186034, | |
| "reward_std": 0.5638264730572701, | |
| "rewards/accuracy_reward_log": 1.4444444686174394, | |
| "rewards/format_number_reward": 0.4333333373069763, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 146.12777786254884, | |
| "epoch": 0.16057808109193095, | |
| "grad_norm": 0.4196477311949672, | |
| "kl": 0.00455629825592041, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 0.0002, | |
| "reward": 2.5250000238418577, | |
| "reward_std": 0.27503596656024454, | |
| "rewards/accuracy_reward_log": 1.5888888999819755, | |
| "rewards/format_number_reward": 0.44722222462296485, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 179.50000228881837, | |
| "epoch": 0.1686069851465275, | |
| "grad_norm": 0.5891348987729047, | |
| "kl": 0.0013088226318359376, | |
| "learning_rate": 6.3e-07, | |
| "loss": 0.0001, | |
| "reward": 2.4500000417232513, | |
| "reward_std": 0.5685264855623245, | |
| "rewards/accuracy_reward_log": 1.5666666895151138, | |
| "rewards/format_number_reward": 0.43611111491918564, | |
| "rewards/format_reasoning_reward": 0.4472222253680229, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 167.82222518920898, | |
| "epoch": 0.17663588920112405, | |
| "grad_norm": 0.5080236888533528, | |
| "kl": 0.0015691757202148438, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0001, | |
| "reward": 2.4805555999279023, | |
| "reward_std": 0.38282057382166385, | |
| "rewards/accuracy_reward_log": 1.5333333551883697, | |
| "rewards/format_number_reward": 0.4694444462656975, | |
| "rewards/format_reasoning_reward": 0.47777777910232544, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 159.62222480773926, | |
| "epoch": 0.1846647932557206, | |
| "grad_norm": 0.5410426478942184, | |
| "kl": 0.0024078369140625, | |
| "learning_rate": 6.900000000000001e-07, | |
| "loss": 0.0001, | |
| "reward": 2.6277777940034865, | |
| "reward_std": 0.332701800763607, | |
| "rewards/accuracy_reward_log": 1.6777777917683125, | |
| "rewards/format_number_reward": 0.46388889029622077, | |
| "rewards/format_reasoning_reward": 0.4861111119389534, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 157.90000114440917, | |
| "epoch": 0.19269369731031713, | |
| "grad_norm": 0.5882381452711394, | |
| "kl": 0.002088165283203125, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0001, | |
| "reward": 2.6722222566604614, | |
| "reward_std": 0.38149141892790794, | |
| "rewards/accuracy_reward_log": 1.7111111283302307, | |
| "rewards/format_number_reward": 0.48055555671453476, | |
| "rewards/format_reasoning_reward": 0.48055555671453476, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 148.97222328186035, | |
| "epoch": 0.20072260136491368, | |
| "grad_norm": 0.46880008245690374, | |
| "kl": 0.002790069580078125, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0001, | |
| "reward": 2.6666667222976685, | |
| "reward_std": 0.42418686002492906, | |
| "rewards/accuracy_reward_log": 1.7111111283302307, | |
| "rewards/format_number_reward": 0.46388889104127884, | |
| "rewards/format_reasoning_reward": 0.49166666716337204, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 166.62222595214843, | |
| "epoch": 0.20875150541951024, | |
| "grad_norm": 0.6233073148075161, | |
| "kl": 0.003025054931640625, | |
| "learning_rate": 7.8e-07, | |
| "loss": 0.0001, | |
| "reward": 2.544444477558136, | |
| "reward_std": 0.5540193915367126, | |
| "rewards/accuracy_reward_log": 1.6111111342906952, | |
| "rewards/format_number_reward": 0.45277778059244156, | |
| "rewards/format_reasoning_reward": 0.48055555671453476, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 153.51666755676268, | |
| "epoch": 0.2167804094741068, | |
| "grad_norm": 0.4559712721597118, | |
| "kl": 0.003208160400390625, | |
| "learning_rate": 8.100000000000001e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5555555880069734, | |
| "reward_std": 0.34781265556812285, | |
| "rewards/accuracy_reward_log": 1.5888889104127883, | |
| "rewards/format_number_reward": 0.4750000014901161, | |
| "rewards/format_reasoning_reward": 0.49166666716337204, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 172.8500011444092, | |
| "epoch": 0.22480931352870334, | |
| "grad_norm": 0.5449553128247491, | |
| "kl": 0.0030437469482421874, | |
| "learning_rate": 8.400000000000001e-07, | |
| "loss": 0.0001, | |
| "reward": 2.4750000476837157, | |
| "reward_std": 0.3911139152944088, | |
| "rewards/accuracy_reward_log": 1.544444465637207, | |
| "rewards/format_number_reward": 0.45000000223517417, | |
| "rewards/format_reasoning_reward": 0.48055555671453476, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 164.09999961853026, | |
| "epoch": 0.23283821758329987, | |
| "grad_norm": 0.5619230788736052, | |
| "kl": 0.00468292236328125, | |
| "learning_rate": 8.699999999999999e-07, | |
| "loss": 0.0002, | |
| "reward": 2.502777820825577, | |
| "reward_std": 0.46127643398940565, | |
| "rewards/accuracy_reward_log": 1.5666666895151138, | |
| "rewards/format_number_reward": 0.46388889104127884, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 151.90000228881837, | |
| "epoch": 0.24086712163789642, | |
| "grad_norm": 0.7524148953577687, | |
| "kl": 0.0029834747314453126, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0001, | |
| "reward": 2.7583333969116213, | |
| "reward_std": 0.31546305269002917, | |
| "rewards/accuracy_reward_log": 1.7777777910232544, | |
| "rewards/format_number_reward": 0.4861111119389534, | |
| "rewards/format_reasoning_reward": 0.49444444477558136, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 150.7055564880371, | |
| "epoch": 0.24889602569249297, | |
| "grad_norm": 1.70651807869166, | |
| "kl": 0.013134002685546875, | |
| "learning_rate": 9.3e-07, | |
| "loss": 0.0005, | |
| "reward": 2.8583333492279053, | |
| "reward_std": 0.1264950528740883, | |
| "rewards/accuracy_reward_log": 1.8666666746139526, | |
| "rewards/format_number_reward": 0.49444444477558136, | |
| "rewards/format_reasoning_reward": 0.4972222223877907, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 169.57222366333008, | |
| "epoch": 0.2569249297470895, | |
| "grad_norm": 0.41158376714433204, | |
| "kl": 0.004754638671875, | |
| "learning_rate": 9.600000000000001e-07, | |
| "loss": 0.0002, | |
| "reward": 2.6888889491558077, | |
| "reward_std": 0.29488888159394266, | |
| "rewards/accuracy_reward_log": 1.722222238779068, | |
| "rewards/format_number_reward": 0.4833333343267441, | |
| "rewards/format_reasoning_reward": 0.4833333343267441, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 160.33333473205568, | |
| "epoch": 0.2649538338016861, | |
| "grad_norm": 0.5042425185498995, | |
| "kl": 0.005771636962890625, | |
| "learning_rate": 9.9e-07, | |
| "loss": 0.0002, | |
| "reward": 2.6055556178092956, | |
| "reward_std": 0.33884221240878104, | |
| "rewards/accuracy_reward_log": 1.644444465637207, | |
| "rewards/format_number_reward": 0.4750000014901161, | |
| "rewards/format_reasoning_reward": 0.4861111119389534, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 165.0555561065674, | |
| "epoch": 0.27298273785628263, | |
| "grad_norm": 0.5229992920065437, | |
| "kl": 0.006603240966796875, | |
| "learning_rate": 1.0200000000000002e-06, | |
| "loss": 0.0003, | |
| "reward": 2.663888943195343, | |
| "reward_std": 0.4261931136250496, | |
| "rewards/accuracy_reward_log": 1.6888889074325562, | |
| "rewards/format_number_reward": 0.4833333343267441, | |
| "rewards/format_reasoning_reward": 0.49166666716337204, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 164.63888969421387, | |
| "epoch": 0.2810116419108792, | |
| "grad_norm": 0.4700519781333619, | |
| "kl": 0.006055450439453125, | |
| "learning_rate": 1.05e-06, | |
| "loss": 0.0002, | |
| "reward": 2.6194444715976717, | |
| "reward_std": 0.34433056265115736, | |
| "rewards/accuracy_reward_log": 1.6444444596767425, | |
| "rewards/format_number_reward": 0.4861111119389534, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 195.3277786254883, | |
| "epoch": 0.28904054596547574, | |
| "grad_norm": 0.5312518629436677, | |
| "kl": 0.005257415771484375, | |
| "learning_rate": 1.08e-06, | |
| "loss": 0.0002, | |
| "reward": 2.4138889372348786, | |
| "reward_std": 0.4737018562853336, | |
| "rewards/accuracy_reward_log": 1.5000000193715095, | |
| "rewards/format_number_reward": 0.45277778059244156, | |
| "rewards/format_reasoning_reward": 0.4611111134290695, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 162.9777805328369, | |
| "epoch": 0.29706945002007223, | |
| "grad_norm": 0.3947867705421231, | |
| "kl": 0.00660400390625, | |
| "learning_rate": 1.11e-06, | |
| "loss": 0.0003, | |
| "reward": 2.8111111521720886, | |
| "reward_std": 0.21169509664177893, | |
| "rewards/accuracy_reward_log": 1.8222222328186035, | |
| "rewards/format_number_reward": 0.49166666716337204, | |
| "rewards/format_reasoning_reward": 0.4972222223877907, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 171.81111373901368, | |
| "epoch": 0.3050983540746688, | |
| "grad_norm": 0.49680622935913366, | |
| "kl": 0.0083709716796875, | |
| "learning_rate": 1.14e-06, | |
| "loss": 0.0003, | |
| "reward": 2.600000059604645, | |
| "reward_std": 0.3381901502609253, | |
| "rewards/accuracy_reward_log": 1.6333333551883698, | |
| "rewards/format_number_reward": 0.47777777910232544, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 159.78333473205566, | |
| "epoch": 0.31312725812926534, | |
| "grad_norm": 0.4841125000891351, | |
| "kl": 0.01015167236328125, | |
| "learning_rate": 1.17e-06, | |
| "loss": 0.0004, | |
| "reward": 2.7166666865348814, | |
| "reward_std": 0.25784134939312936, | |
| "rewards/accuracy_reward_log": 1.7555555701255798, | |
| "rewards/format_number_reward": 0.4750000014901161, | |
| "rewards/format_reasoning_reward": 0.4861111119389534, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 160.47777938842773, | |
| "epoch": 0.3211561621838619, | |
| "grad_norm": 0.4051878172293322, | |
| "kl": 0.00718231201171875, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 0.0003, | |
| "reward": 2.6527778089046476, | |
| "reward_std": 0.27083261907100675, | |
| "rewards/accuracy_reward_log": 1.6777777969837189, | |
| "rewards/format_number_reward": 0.4861111119389534, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 164.8000015258789, | |
| "epoch": 0.32918506623845845, | |
| "grad_norm": 0.474618622817464, | |
| "kl": 0.006967926025390625, | |
| "learning_rate": 1.2299999999999999e-06, | |
| "loss": 0.0003, | |
| "reward": 2.7750000238418577, | |
| "reward_std": 0.24537386298179625, | |
| "rewards/accuracy_reward_log": 1.800000011920929, | |
| "rewards/format_number_reward": 0.4861111119389534, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 166.5111141204834, | |
| "epoch": 0.337213970293055, | |
| "grad_norm": 0.3629585253598538, | |
| "kl": 0.007363128662109375, | |
| "learning_rate": 1.26e-06, | |
| "loss": 0.0003, | |
| "reward": 2.63055557012558, | |
| "reward_std": 0.22272009253501893, | |
| "rewards/accuracy_reward_log": 1.6555555701255797, | |
| "rewards/format_number_reward": 0.4861111119389534, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 166.7388900756836, | |
| "epoch": 0.34524287434765155, | |
| "grad_norm": 0.4008326525358567, | |
| "kl": 0.009012603759765625, | |
| "learning_rate": 1.29e-06, | |
| "loss": 0.0004, | |
| "reward": 2.708333361148834, | |
| "reward_std": 0.29148011431097987, | |
| "rewards/accuracy_reward_log": 1.7444444596767426, | |
| "rewards/format_number_reward": 0.47777777910232544, | |
| "rewards/format_reasoning_reward": 0.4861111119389534, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 185.93888893127442, | |
| "epoch": 0.3532717784022481, | |
| "grad_norm": 0.5017342406963871, | |
| "kl": 0.00979156494140625, | |
| "learning_rate": 1.32e-06, | |
| "loss": 0.0004, | |
| "reward": 2.5722222566604613, | |
| "reward_std": 0.3904368232935667, | |
| "rewards/accuracy_reward_log": 1.6555555760860443, | |
| "rewards/format_number_reward": 0.4472222253680229, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 170.97777862548827, | |
| "epoch": 0.36130068245684466, | |
| "grad_norm": 0.36508724491351696, | |
| "kl": 0.00971527099609375, | |
| "learning_rate": 1.35e-06, | |
| "loss": 0.0004, | |
| "reward": 2.680555593967438, | |
| "reward_std": 0.1924500897526741, | |
| "rewards/accuracy_reward_log": 1.7111111253499984, | |
| "rewards/format_number_reward": 0.48055555671453476, | |
| "rewards/format_reasoning_reward": 0.4888888895511627, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 200.09444770812988, | |
| "epoch": 0.3693295865114412, | |
| "grad_norm": 0.5859797241606328, | |
| "kl": 0.01236724853515625, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "loss": 0.0005, | |
| "reward": 2.4972222447395325, | |
| "reward_std": 0.4179751716554165, | |
| "rewards/accuracy_reward_log": 1.588888904452324, | |
| "rewards/format_number_reward": 0.4527777798473835, | |
| "rewards/format_reasoning_reward": 0.4555555582046509, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 190.55000267028808, | |
| "epoch": 0.37735849056603776, | |
| "grad_norm": 0.6735251651031329, | |
| "kl": 0.013592529296875, | |
| "learning_rate": 1.41e-06, | |
| "loss": 0.0005, | |
| "reward": 2.525000035762787, | |
| "reward_std": 0.39592516496777536, | |
| "rewards/accuracy_reward_log": 1.6000000178813933, | |
| "rewards/format_number_reward": 0.4611111134290695, | |
| "rewards/format_reasoning_reward": 0.46388889104127884, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 177.14444732666016, | |
| "epoch": 0.38538739462063426, | |
| "grad_norm": 0.4013696331659362, | |
| "kl": 0.01161346435546875, | |
| "learning_rate": 1.44e-06, | |
| "loss": 0.0005, | |
| "reward": 2.725000035762787, | |
| "reward_std": 0.30932264029979706, | |
| "rewards/accuracy_reward_log": 1.7666666805744171, | |
| "rewards/format_number_reward": 0.4750000014901161, | |
| "rewards/format_reasoning_reward": 0.4833333343267441, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 183.58333511352538, | |
| "epoch": 0.3934162986752308, | |
| "grad_norm": 0.413245950296645, | |
| "kl": 0.0140655517578125, | |
| "learning_rate": 1.4700000000000001e-06, | |
| "loss": 0.0006, | |
| "reward": 2.547222238779068, | |
| "reward_std": 0.28799803368747234, | |
| "rewards/accuracy_reward_log": 1.6000000208616256, | |
| "rewards/format_number_reward": 0.4694444462656975, | |
| "rewards/format_reasoning_reward": 0.47777777910232544, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 179.96666717529297, | |
| "epoch": 0.40144520272982737, | |
| "grad_norm": 0.41408601639844395, | |
| "kl": 0.01335296630859375, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.0005, | |
| "reward": 2.658333349227905, | |
| "reward_std": 0.3410413548350334, | |
| "rewards/accuracy_reward_log": 1.7222222357988357, | |
| "rewards/format_number_reward": 0.46388889104127884, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 185.75556030273438, | |
| "epoch": 0.4094741067844239, | |
| "grad_norm": 0.4053783571755214, | |
| "kl": 0.0148193359375, | |
| "learning_rate": 1.53e-06, | |
| "loss": 0.0006, | |
| "reward": 2.6555556058883667, | |
| "reward_std": 0.35194680131971834, | |
| "rewards/accuracy_reward_log": 1.722222238779068, | |
| "rewards/format_number_reward": 0.46388889104127884, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 209.23333358764648, | |
| "epoch": 0.41750301083902047, | |
| "grad_norm": 0.55024118806201, | |
| "kl": 0.018719482421875, | |
| "learning_rate": 1.56e-06, | |
| "loss": 0.0007, | |
| "reward": 2.4083333551883697, | |
| "reward_std": 0.5498852420598268, | |
| "rewards/accuracy_reward_log": 1.5333333522081376, | |
| "rewards/format_number_reward": 0.430555559694767, | |
| "rewards/format_reasoning_reward": 0.4444444477558136, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 201.15000343322754, | |
| "epoch": 0.425531914893617, | |
| "grad_norm": 0.4665326355575322, | |
| "kl": 0.01983795166015625, | |
| "learning_rate": 1.59e-06, | |
| "loss": 0.0008, | |
| "reward": 2.5222222208976746, | |
| "reward_std": 0.5347743809223175, | |
| "rewards/accuracy_reward_log": 1.6333333492279052, | |
| "rewards/format_number_reward": 0.4416666693985462, | |
| "rewards/format_reasoning_reward": 0.44722222462296485, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 192.3111145019531, | |
| "epoch": 0.4335608189482136, | |
| "grad_norm": 0.4264344321145415, | |
| "kl": 0.02133026123046875, | |
| "learning_rate": 1.6200000000000002e-06, | |
| "loss": 0.0009, | |
| "reward": 2.566666692495346, | |
| "reward_std": 0.47150271385908127, | |
| "rewards/accuracy_reward_log": 1.6555555760860443, | |
| "rewards/format_number_reward": 0.45277778059244156, | |
| "rewards/format_reasoning_reward": 0.4583333358168602, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 207.86111373901366, | |
| "epoch": 0.44158972300281013, | |
| "grad_norm": 0.41848310682566847, | |
| "kl": 0.0194488525390625, | |
| "learning_rate": 1.65e-06, | |
| "loss": 0.0008, | |
| "reward": 2.525000047683716, | |
| "reward_std": 0.5073827020823956, | |
| "rewards/accuracy_reward_log": 1.6333333492279052, | |
| "rewards/format_number_reward": 0.4444444477558136, | |
| "rewards/format_reasoning_reward": 0.4472222253680229, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 218.4888916015625, | |
| "epoch": 0.4496186270574067, | |
| "grad_norm": 0.493209444659726, | |
| "kl": 0.0301055908203125, | |
| "learning_rate": 1.6800000000000002e-06, | |
| "loss": 0.0012, | |
| "reward": 2.4250000417232513, | |
| "reward_std": 0.6275906786322594, | |
| "rewards/accuracy_reward_log": 1.5777777969837188, | |
| "rewards/format_number_reward": 0.41388889253139494, | |
| "rewards/format_reasoning_reward": 0.4333333373069763, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 207.85555877685547, | |
| "epoch": 0.45764753111200324, | |
| "grad_norm": 0.4781117283691274, | |
| "kl": 0.028985595703125, | |
| "learning_rate": 1.71e-06, | |
| "loss": 0.0012, | |
| "reward": 2.4833333492279053, | |
| "reward_std": 0.4261951830238104, | |
| "rewards/accuracy_reward_log": 1.588888907432556, | |
| "rewards/format_number_reward": 0.4444444477558136, | |
| "rewards/format_reasoning_reward": 0.45000000298023224, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 207.41666984558105, | |
| "epoch": 0.46567643516659973, | |
| "grad_norm": 9.7179275371274, | |
| "kl": 0.066973876953125, | |
| "learning_rate": 1.7399999999999999e-06, | |
| "loss": 0.0027, | |
| "reward": 2.4972222566604616, | |
| "reward_std": 0.6165656700730324, | |
| "rewards/accuracy_reward_log": 1.6222222447395325, | |
| "rewards/format_number_reward": 0.4333333373069763, | |
| "rewards/format_reasoning_reward": 0.4416666701436043, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 199.16111221313477, | |
| "epoch": 0.4737053392211963, | |
| "grad_norm": 0.4594537727679852, | |
| "kl": 0.03179931640625, | |
| "learning_rate": 1.77e-06, | |
| "loss": 0.0013, | |
| "reward": 2.5444444954395293, | |
| "reward_std": 0.4083905890583992, | |
| "rewards/accuracy_reward_log": 1.6333333522081375, | |
| "rewards/format_number_reward": 0.45000000298023224, | |
| "rewards/format_reasoning_reward": 0.4611111134290695, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 185.04444580078126, | |
| "epoch": 0.48173424327579284, | |
| "grad_norm": 0.35591417052929925, | |
| "kl": 0.02755126953125, | |
| "learning_rate": 1.8e-06, | |
| "loss": 0.0011, | |
| "reward": 2.708333361148834, | |
| "reward_std": 0.30322221145033834, | |
| "rewards/accuracy_reward_log": 1.7555555701255798, | |
| "rewards/format_number_reward": 0.4750000014901161, | |
| "rewards/format_reasoning_reward": 0.47777777910232544, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 217.3444480895996, | |
| "epoch": 0.4897631473303894, | |
| "grad_norm": 0.47620034091897007, | |
| "kl": 0.043768310546875, | |
| "learning_rate": 1.83e-06, | |
| "loss": 0.0017, | |
| "reward": 2.45833335518837, | |
| "reward_std": 0.48677313327789307, | |
| "rewards/accuracy_reward_log": 1.5777777925133705, | |
| "rewards/format_number_reward": 0.4305555589497089, | |
| "rewards/format_reasoning_reward": 0.45000000298023224, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 190.76111221313477, | |
| "epoch": 0.49779205138498595, | |
| "grad_norm": 0.381306777730851, | |
| "kl": 0.03514404296875, | |
| "learning_rate": 1.86e-06, | |
| "loss": 0.0014, | |
| "reward": 2.694444453716278, | |
| "reward_std": 0.2467763565480709, | |
| "rewards/accuracy_reward_log": 1.7555555671453476, | |
| "rewards/format_number_reward": 0.4694444462656975, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 187.97777671813964, | |
| "epoch": 0.5058209554395825, | |
| "grad_norm": 0.594069782615144, | |
| "kl": 0.039031982421875, | |
| "learning_rate": 1.8900000000000001e-06, | |
| "loss": 0.0016, | |
| "reward": 2.6111111283302306, | |
| "reward_std": 0.39384557902812956, | |
| "rewards/accuracy_reward_log": 1.6888889014720916, | |
| "rewards/format_number_reward": 0.4555555582046509, | |
| "rewards/format_reasoning_reward": 0.46666666865348816, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 193.00555725097655, | |
| "epoch": 0.513849859494179, | |
| "grad_norm": 0.5261946833334382, | |
| "kl": 0.04935302734375, | |
| "learning_rate": 1.9200000000000003e-06, | |
| "loss": 0.002, | |
| "reward": 2.586111146211624, | |
| "reward_std": 0.4021389245986938, | |
| "rewards/accuracy_reward_log": 1.6777777940034866, | |
| "rewards/format_number_reward": 0.45277778059244156, | |
| "rewards/format_reasoning_reward": 0.4555555582046509, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 193.02222366333007, | |
| "epoch": 0.5218787635487756, | |
| "grad_norm": 0.4720147986556917, | |
| "kl": 0.0471923828125, | |
| "learning_rate": 1.95e-06, | |
| "loss": 0.0019, | |
| "reward": 2.6138889133930205, | |
| "reward_std": 0.444037689268589, | |
| "rewards/accuracy_reward_log": 1.7000000149011611, | |
| "rewards/format_number_reward": 0.4555555582046509, | |
| "rewards/format_reasoning_reward": 0.4583333358168602, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 210.9277801513672, | |
| "epoch": 0.5299076676033722, | |
| "grad_norm": 0.5124703016443702, | |
| "kl": 0.06097412109375, | |
| "learning_rate": 1.98e-06, | |
| "loss": 0.0024, | |
| "reward": 2.4833333492279053, | |
| "reward_std": 0.5216697975993156, | |
| "rewards/accuracy_reward_log": 1.6111111342906952, | |
| "rewards/format_number_reward": 0.4277777820825577, | |
| "rewards/format_reasoning_reward": 0.4444444477558136, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 195.95000152587892, | |
| "epoch": 0.5379365716579687, | |
| "grad_norm": 0.3914341348179692, | |
| "kl": 0.05845947265625, | |
| "learning_rate": 2.0100000000000002e-06, | |
| "loss": 0.0023, | |
| "reward": 2.5666666865348815, | |
| "reward_std": 0.4269164353609085, | |
| "rewards/accuracy_reward_log": 1.666666680574417, | |
| "rewards/format_number_reward": 0.4416666701436043, | |
| "rewards/format_reasoning_reward": 0.4583333358168602, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 189.8333354949951, | |
| "epoch": 0.5459654757125653, | |
| "grad_norm": 1.8409712600042354, | |
| "kl": 0.104864501953125, | |
| "learning_rate": 2.0400000000000004e-06, | |
| "loss": 0.0042, | |
| "reward": 2.6055555939674377, | |
| "reward_std": 0.3877051591873169, | |
| "rewards/accuracy_reward_log": 1.7000000119209289, | |
| "rewards/format_number_reward": 0.45000000298023224, | |
| "rewards/format_reasoning_reward": 0.4555555582046509, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 216.18333587646484, | |
| "epoch": 0.5539943797671618, | |
| "grad_norm": 1.234592815069376, | |
| "kl": 0.11241455078125, | |
| "learning_rate": 2.07e-06, | |
| "loss": 0.0045, | |
| "reward": 2.3138889193534853, | |
| "reward_std": 0.5511893726885319, | |
| "rewards/accuracy_reward_log": 1.46666669100523, | |
| "rewards/format_number_reward": 0.4138888940215111, | |
| "rewards/format_reasoning_reward": 0.4333333373069763, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 216.01666946411132, | |
| "epoch": 0.5620232838217584, | |
| "grad_norm": 0.47189027920553445, | |
| "kl": 0.086163330078125, | |
| "learning_rate": 2.1e-06, | |
| "loss": 0.0034, | |
| "reward": 2.4277777910232543, | |
| "reward_std": 0.5567510481923819, | |
| "rewards/accuracy_reward_log": 1.588888907432556, | |
| "rewards/format_number_reward": 0.413888893276453, | |
| "rewards/format_reasoning_reward": 0.4250000037252903, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 180.42222366333007, | |
| "epoch": 0.5700521878763549, | |
| "grad_norm": 0.5661866241275311, | |
| "kl": 0.055902099609375, | |
| "learning_rate": 2.13e-06, | |
| "loss": 0.0022, | |
| "reward": 2.508333349227905, | |
| "reward_std": 0.416572679579258, | |
| "rewards/accuracy_reward_log": 1.6000000208616256, | |
| "rewards/format_number_reward": 0.4472222253680229, | |
| "rewards/format_reasoning_reward": 0.4611111134290695, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 212.58333511352538, | |
| "epoch": 0.5780810919309515, | |
| "grad_norm": 0.5852592346007427, | |
| "kl": 0.108624267578125, | |
| "learning_rate": 2.16e-06, | |
| "loss": 0.0043, | |
| "reward": 2.2777777791023253, | |
| "reward_std": 0.53003438860178, | |
| "rewards/accuracy_reward_log": 1.4888889133930205, | |
| "rewards/format_number_reward": 0.38611111640930174, | |
| "rewards/format_reasoning_reward": 0.40277778208255766, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 203.32777824401856, | |
| "epoch": 0.586109995985548, | |
| "grad_norm": 1.0262474142127427, | |
| "kl": 0.104498291015625, | |
| "learning_rate": 2.19e-06, | |
| "loss": 0.0042, | |
| "reward": 2.4166666686534883, | |
| "reward_std": 0.40687683820724485, | |
| "rewards/accuracy_reward_log": 1.5777777969837188, | |
| "rewards/format_number_reward": 0.4166666708886623, | |
| "rewards/format_reasoning_reward": 0.42222222611308097, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 205.4111114501953, | |
| "epoch": 0.5941389000401445, | |
| "grad_norm": 0.6749287604370076, | |
| "kl": 0.106732177734375, | |
| "learning_rate": 2.22e-06, | |
| "loss": 0.0043, | |
| "reward": 2.2611111223697664, | |
| "reward_std": 0.5402376987040043, | |
| "rewards/accuracy_reward_log": 1.4333333566784858, | |
| "rewards/format_number_reward": 0.4027777828276157, | |
| "rewards/format_reasoning_reward": 0.42500000447034836, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 204.6277805328369, | |
| "epoch": 0.602167804094741, | |
| "grad_norm": 1.3836757790747922, | |
| "kl": 0.118878173828125, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.0048, | |
| "reward": 2.211111146211624, | |
| "reward_std": 0.7402307014912367, | |
| "rewards/accuracy_reward_log": 1.4000000268220902, | |
| "rewards/format_number_reward": 0.38055556192994117, | |
| "rewards/format_reasoning_reward": 0.430555559694767, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 192.80555725097656, | |
| "epoch": 0.6101967081493376, | |
| "grad_norm": 0.5732652733178961, | |
| "kl": 0.081439208984375, | |
| "learning_rate": 2.28e-06, | |
| "loss": 0.0033, | |
| "reward": 2.444444465637207, | |
| "reward_std": 0.5093889623880387, | |
| "rewards/accuracy_reward_log": 1.533333358168602, | |
| "rewards/format_number_reward": 0.45277778059244156, | |
| "rewards/format_reasoning_reward": 0.4583333358168602, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 159.32222518920898, | |
| "epoch": 0.6182256122039341, | |
| "grad_norm": 0.4884865339246385, | |
| "kl": 0.057391357421875, | |
| "learning_rate": 2.31e-06, | |
| "loss": 0.0023, | |
| "reward": 2.5805555820465087, | |
| "reward_std": 0.16979632191359997, | |
| "rewards/accuracy_reward_log": 1.611111131310463, | |
| "rewards/format_number_reward": 0.47777777910232544, | |
| "rewards/format_reasoning_reward": 0.49166666716337204, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 190.20000076293945, | |
| "epoch": 0.6262545162585307, | |
| "grad_norm": 0.5209974936705293, | |
| "kl": 0.067266845703125, | |
| "learning_rate": 2.34e-06, | |
| "loss": 0.0027, | |
| "reward": 2.4777778029441833, | |
| "reward_std": 0.4935527116060257, | |
| "rewards/accuracy_reward_log": 1.5555555790662765, | |
| "rewards/format_number_reward": 0.4583333358168602, | |
| "rewards/format_reasoning_reward": 0.46388889104127884, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 169.23889122009277, | |
| "epoch": 0.6342834203131272, | |
| "grad_norm": 0.4459209602371647, | |
| "kl": 0.05482177734375, | |
| "learning_rate": 2.37e-06, | |
| "loss": 0.0022, | |
| "reward": 2.6750000178813935, | |
| "reward_std": 0.23374302312731743, | |
| "rewards/accuracy_reward_log": 1.7000000149011611, | |
| "rewards/format_number_reward": 0.4833333343267441, | |
| "rewards/format_reasoning_reward": 0.49166666716337204, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 182.30000267028808, | |
| "epoch": 0.6423123243677238, | |
| "grad_norm": 0.45736704423541613, | |
| "kl": 0.07120361328125, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": 0.0028, | |
| "reward": 2.5750000059604643, | |
| "reward_std": 0.31974178850650786, | |
| "rewards/accuracy_reward_log": 1.6444444626569747, | |
| "rewards/format_number_reward": 0.46388889104127884, | |
| "rewards/format_reasoning_reward": 0.46666666865348816, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 163.07778053283693, | |
| "epoch": 0.6503412284223203, | |
| "grad_norm": 0.4682785728868757, | |
| "kl": 0.06102294921875, | |
| "learning_rate": 2.43e-06, | |
| "loss": 0.0024, | |
| "reward": 2.686111146211624, | |
| "reward_std": 0.3670576632022858, | |
| "rewards/accuracy_reward_log": 1.7444444596767426, | |
| "rewards/format_number_reward": 0.4694444462656975, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 165.8722225189209, | |
| "epoch": 0.6583701324769169, | |
| "grad_norm": 0.509179920003658, | |
| "kl": 0.061614990234375, | |
| "learning_rate": 2.4599999999999997e-06, | |
| "loss": 0.0025, | |
| "reward": 2.6250000596046448, | |
| "reward_std": 0.402138914167881, | |
| "rewards/accuracy_reward_log": 1.6777777969837189, | |
| "rewards/format_number_reward": 0.4694444462656975, | |
| "rewards/format_reasoning_reward": 0.47777777910232544, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 159.80555725097656, | |
| "epoch": 0.6663990365315134, | |
| "grad_norm": 0.5994985713489502, | |
| "kl": 0.054705810546875, | |
| "learning_rate": 2.49e-06, | |
| "loss": 0.0022, | |
| "reward": 2.6194444835186004, | |
| "reward_std": 0.4117614269256592, | |
| "rewards/accuracy_reward_log": 1.6555555760860443, | |
| "rewards/format_number_reward": 0.48055555671453476, | |
| "rewards/format_reasoning_reward": 0.4833333343267441, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 175.34444770812988, | |
| "epoch": 0.67442794058611, | |
| "grad_norm": 0.518979425053283, | |
| "kl": 0.08248291015625, | |
| "learning_rate": 2.52e-06, | |
| "loss": 0.0033, | |
| "reward": 2.5138889074325563, | |
| "reward_std": 0.4138893112540245, | |
| "rewards/accuracy_reward_log": 1.6000000208616256, | |
| "rewards/format_number_reward": 0.45277778059244156, | |
| "rewards/format_reasoning_reward": 0.4611111134290695, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 165.01666755676268, | |
| "epoch": 0.6824568446407066, | |
| "grad_norm": 0.49612998471580416, | |
| "kl": 0.065625, | |
| "learning_rate": 2.55e-06, | |
| "loss": 0.0026, | |
| "reward": 2.469444477558136, | |
| "reward_std": 0.5141268767416477, | |
| "rewards/accuracy_reward_log": 1.588888907432556, | |
| "rewards/format_number_reward": 0.43333333656191825, | |
| "rewards/format_reasoning_reward": 0.44722222462296485, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 165.6388900756836, | |
| "epoch": 0.6904857486953031, | |
| "grad_norm": 0.552553452651074, | |
| "kl": 0.06807861328125, | |
| "learning_rate": 2.58e-06, | |
| "loss": 0.0027, | |
| "reward": 2.4944444805383683, | |
| "reward_std": 0.37747681848704817, | |
| "rewards/accuracy_reward_log": 1.577777798473835, | |
| "rewards/format_number_reward": 0.4527777798473835, | |
| "rewards/format_reasoning_reward": 0.46388889104127884, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 155.51666641235352, | |
| "epoch": 0.6985146527498997, | |
| "grad_norm": 0.5440640607374735, | |
| "kl": 0.060748291015625, | |
| "learning_rate": 2.61e-06, | |
| "loss": 0.0024, | |
| "reward": 2.4611111283302307, | |
| "reward_std": 0.5409881249070168, | |
| "rewards/accuracy_reward_log": 1.577777799963951, | |
| "rewards/format_number_reward": 0.43888889253139496, | |
| "rewards/format_reasoning_reward": 0.4444444477558136, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 150.6777774810791, | |
| "epoch": 0.7065435568044962, | |
| "grad_norm": 0.4538101109952547, | |
| "kl": 0.05191650390625, | |
| "learning_rate": 2.64e-06, | |
| "loss": 0.0021, | |
| "reward": 2.6416666924953462, | |
| "reward_std": 0.2997001264244318, | |
| "rewards/accuracy_reward_log": 1.7000000149011611, | |
| "rewards/format_number_reward": 0.4694444462656975, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 165.80000228881835, | |
| "epoch": 0.7145724608590928, | |
| "grad_norm": 0.5131296250407812, | |
| "kl": 0.06552734375, | |
| "learning_rate": 2.6700000000000003e-06, | |
| "loss": 0.0026, | |
| "reward": 2.600000035762787, | |
| "reward_std": 0.48052144795656204, | |
| "rewards/accuracy_reward_log": 1.6777777969837189, | |
| "rewards/format_number_reward": 0.4583333358168602, | |
| "rewards/format_reasoning_reward": 0.46388889104127884, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 151.10555686950684, | |
| "epoch": 0.7226013649136893, | |
| "grad_norm": 0.5165638439339917, | |
| "kl": 0.0578857421875, | |
| "learning_rate": 2.7e-06, | |
| "loss": 0.0023, | |
| "reward": 2.6555556058883667, | |
| "reward_std": 0.362246410548687, | |
| "rewards/accuracy_reward_log": 1.7111111283302307, | |
| "rewards/format_number_reward": 0.4722222238779068, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 159.4166687011719, | |
| "epoch": 0.7306302689682859, | |
| "grad_norm": 0.49295723950300413, | |
| "kl": 0.058404541015625, | |
| "learning_rate": 2.73e-06, | |
| "loss": 0.0023, | |
| "reward": 2.5833333671092986, | |
| "reward_std": 0.44544019252061845, | |
| "rewards/accuracy_reward_log": 1.6444444626569747, | |
| "rewards/format_number_reward": 0.4694444462656975, | |
| "rewards/format_reasoning_reward": 0.4694444462656975, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 159.53333625793456, | |
| "epoch": 0.7386591730228824, | |
| "grad_norm": 1.0273725644592662, | |
| "kl": 0.06968994140625, | |
| "learning_rate": 2.7600000000000003e-06, | |
| "loss": 0.0028, | |
| "reward": 2.5111111402511597, | |
| "reward_std": 0.44476102106273174, | |
| "rewards/accuracy_reward_log": 1.577777799963951, | |
| "rewards/format_number_reward": 0.4611111134290695, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 154.76666793823242, | |
| "epoch": 0.746688077077479, | |
| "grad_norm": 0.3153647070205455, | |
| "kl": 0.053741455078125, | |
| "learning_rate": 2.7900000000000004e-06, | |
| "loss": 0.0021, | |
| "reward": 2.688888907432556, | |
| "reward_std": 0.2062800731509924, | |
| "rewards/accuracy_reward_log": 1.733333346247673, | |
| "rewards/format_number_reward": 0.4750000014901161, | |
| "rewards/format_reasoning_reward": 0.48055555671453476, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 166.42777900695802, | |
| "epoch": 0.7547169811320755, | |
| "grad_norm": 0.5748531305032764, | |
| "kl": 0.066802978515625, | |
| "learning_rate": 2.82e-06, | |
| "loss": 0.0027, | |
| "reward": 2.600000041723251, | |
| "reward_std": 0.3458063915371895, | |
| "rewards/accuracy_reward_log": 1.655555573105812, | |
| "rewards/format_number_reward": 0.4722222238779068, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 158.03888969421388, | |
| "epoch": 0.762745885186672, | |
| "grad_norm": 0.5545909018402391, | |
| "kl": 0.066021728515625, | |
| "learning_rate": 2.85e-06, | |
| "loss": 0.0026, | |
| "reward": 2.7111111342906953, | |
| "reward_std": 0.3430013954639435, | |
| "rewards/accuracy_reward_log": 1.7666666775941848, | |
| "rewards/format_number_reward": 0.4722222238779068, | |
| "rewards/format_reasoning_reward": 0.4722222238779068, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 165.80000228881835, | |
| "epoch": 0.7707747892412685, | |
| "grad_norm": 0.42499585100246984, | |
| "kl": 0.05565185546875, | |
| "learning_rate": 2.88e-06, | |
| "loss": 0.0022, | |
| "reward": 2.602777808904648, | |
| "reward_std": 0.3154630549252033, | |
| "rewards/accuracy_reward_log": 1.6444444626569747, | |
| "rewards/format_number_reward": 0.47777777910232544, | |
| "rewards/format_reasoning_reward": 0.48055555671453476, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 174.88333435058593, | |
| "epoch": 0.7788036932958651, | |
| "grad_norm": 0.5630736371263277, | |
| "kl": 0.07933349609375, | |
| "learning_rate": 2.91e-06, | |
| "loss": 0.0032, | |
| "reward": 2.394444453716278, | |
| "reward_std": 0.4866618663072586, | |
| "rewards/accuracy_reward_log": 1.4777778044342995, | |
| "rewards/format_number_reward": 0.4555555582046509, | |
| "rewards/format_reasoning_reward": 0.4611111134290695, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 194.8888916015625, | |
| "epoch": 0.7868325973504616, | |
| "grad_norm": 0.5354591296941339, | |
| "kl": 0.08121337890625, | |
| "learning_rate": 2.9400000000000002e-06, | |
| "loss": 0.0032, | |
| "reward": 2.4166667103767394, | |
| "reward_std": 0.467490179464221, | |
| "rewards/accuracy_reward_log": 1.5444444686174392, | |
| "rewards/format_number_reward": 0.4333333373069763, | |
| "rewards/format_reasoning_reward": 0.43888889253139496, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 198.76111335754393, | |
| "epoch": 0.7948615014050582, | |
| "grad_norm": 0.7579425034647131, | |
| "kl": 0.092218017578125, | |
| "learning_rate": 2.97e-06, | |
| "loss": 0.0037, | |
| "reward": 2.472222238779068, | |
| "reward_std": 0.44544018656015394, | |
| "rewards/accuracy_reward_log": 1.577777799963951, | |
| "rewards/format_number_reward": 0.4472222253680229, | |
| "rewards/format_reasoning_reward": 0.4472222253680229, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 181.00000228881837, | |
| "epoch": 0.8028904054596547, | |
| "grad_norm": 3.752908048104343, | |
| "kl": 0.29696044921875, | |
| "learning_rate": 3e-06, | |
| "loss": 0.0119, | |
| "reward": 2.644444489479065, | |
| "reward_std": 0.3877051673829556, | |
| "rewards/accuracy_reward_log": 1.722222238779068, | |
| "rewards/format_number_reward": 0.4611111134290695, | |
| "rewards/format_reasoning_reward": 0.4611111134290695, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |