| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9987438399845395, |
| "eval_steps": 100, |
| "global_step": 646, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 895.9870979309082, |
| "epoch": 0.015460430959512996, |
| "grad_norm": 0.0634842514468751, |
| "kl": 0.00048552751541137696, |
| "learning_rate": 3.0769230769230774e-06, |
| "loss": 0.0, |
| "reward": 0.2128826495842077, |
| "reward_std": 0.19821357885375618, |
| "rewards/accuracy_reward": 0.2128826495842077, |
| "rewards/format_reward": 0.0, |
| "step": 10 |
| }, |
| { |
| "completion_length": 813.6550861358643, |
| "epoch": 0.03092086191902599, |
| "grad_norm": 0.05144004581802371, |
| "kl": 0.00542140007019043, |
| "learning_rate": 6.153846153846155e-06, |
| "loss": 0.0002, |
| "reward": 0.3605867292615585, |
| "reward_std": 0.2062750784214586, |
| "rewards/accuracy_reward": 0.3605867292615585, |
| "rewards/format_reward": 0.0, |
| "step": 20 |
| }, |
| { |
| "completion_length": 784.7251134872437, |
| "epoch": 0.04638129287853899, |
| "grad_norm": 0.058006301230646685, |
| "kl": 0.008408355712890624, |
| "learning_rate": 9.230769230769232e-06, |
| "loss": 0.0003, |
| "reward": 0.42461733864620327, |
| "reward_std": 0.21439061006531118, |
| "rewards/accuracy_reward": 0.42461733864620327, |
| "rewards/format_reward": 0.0, |
| "step": 30 |
| }, |
| { |
| "completion_length": 752.7404174804688, |
| "epoch": 0.06184172383805198, |
| "grad_norm": 0.06100438489544683, |
| "kl": 0.013679313659667968, |
| "learning_rate": 1.230769230769231e-05, |
| "loss": 0.0005, |
| "reward": 0.48864794997498395, |
| "reward_std": 0.2098829376511276, |
| "rewards/accuracy_reward": 0.48864794997498395, |
| "rewards/format_reward": 0.0, |
| "step": 40 |
| }, |
| { |
| "completion_length": 703.8393978118896, |
| "epoch": 0.07730215479756498, |
| "grad_norm": 0.06182390025666779, |
| "kl": 0.024402618408203125, |
| "learning_rate": 1.5384615384615387e-05, |
| "loss": 0.001, |
| "reward": 0.5262754996772856, |
| "reward_std": 0.23188330424018205, |
| "rewards/accuracy_reward": 0.5262754996772856, |
| "rewards/format_reward": 0.0, |
| "step": 50 |
| }, |
| { |
| "completion_length": 732.7664375305176, |
| "epoch": 0.09276258575707798, |
| "grad_norm": 0.0653391395105433, |
| "kl": 0.03628692626953125, |
| "learning_rate": 1.8461538461538465e-05, |
| "loss": 0.0015, |
| "reward": 0.5107142759487033, |
| "reward_std": 0.2243131298571825, |
| "rewards/accuracy_reward": 0.5107142759487033, |
| "rewards/format_reward": 0.0, |
| "step": 60 |
| }, |
| { |
| "completion_length": 718.8909275054932, |
| "epoch": 0.10822301671659097, |
| "grad_norm": 0.16972060323025048, |
| "kl": 0.14796981811523438, |
| "learning_rate": 1.999634547413886e-05, |
| "loss": 0.0059, |
| "reward": 0.5130101933144033, |
| "reward_std": 0.2463569703977555, |
| "rewards/accuracy_reward": 0.5130101933144033, |
| "rewards/format_reward": 0.0, |
| "step": 70 |
| }, |
| { |
| "completion_length": 776.1557273864746, |
| "epoch": 0.12368344767610397, |
| "grad_norm": 0.7559210983180688, |
| "kl": 0.1969940185546875, |
| "learning_rate": 1.9967125291968495e-05, |
| "loss": 0.0079, |
| "reward": 0.46645407350733875, |
| "reward_std": 0.23498255694285036, |
| "rewards/accuracy_reward": 0.46645407350733875, |
| "rewards/format_reward": 0.0, |
| "step": 80 |
| }, |
| { |
| "completion_length": 709.4320009231567, |
| "epoch": 0.13914387863561697, |
| "grad_norm": 0.1603065487133038, |
| "kl": 0.27840576171875, |
| "learning_rate": 1.990877034074683e-05, |
| "loss": 0.0111, |
| "reward": 0.4230867256294005, |
| "reward_std": 0.2592574997805059, |
| "rewards/accuracy_reward": 0.4230867256294005, |
| "rewards/format_reward": 0.0, |
| "step": 90 |
| }, |
| { |
| "completion_length": 724.9899085998535, |
| "epoch": 0.15460430959512997, |
| "grad_norm": 0.14050811846040184, |
| "kl": 0.38084716796875, |
| "learning_rate": 1.9821451197042028e-05, |
| "loss": 0.0152, |
| "reward": 0.3531887684832327, |
| "reward_std": 0.24415079602040352, |
| "rewards/accuracy_reward": 0.3531887684832327, |
| "rewards/format_reward": 0.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.15460430959512997, |
| "eval_completion_length": 714.497265625, |
| "eval_kl": 0.21029296875, |
| "eval_loss": 0.008998678997159004, |
| "eval_reward": 0.41000001683831216, |
| "eval_reward_std": 0.25652114123106, |
| "eval_rewards/accuracy_reward": 0.41000001683831216, |
| "eval_rewards/format_reward": 0.0, |
| "eval_runtime": 117.6798, |
| "eval_samples_per_second": 0.841, |
| "eval_steps_per_second": 0.034, |
| "step": 100 |
| }, |
| { |
| "completion_length": 744.1831466674805, |
| "epoch": 0.17006474055464296, |
| "grad_norm": 0.10648109851296413, |
| "kl": 0.22943115234375, |
| "learning_rate": 1.9705423102261324e-05, |
| "loss": 0.0092, |
| "reward": 0.40063774921000006, |
| "reward_std": 0.22866119714453817, |
| "rewards/accuracy_reward": 0.40063774921000006, |
| "rewards/format_reward": 0.0, |
| "step": 110 |
| }, |
| { |
| "completion_length": 676.4876100540162, |
| "epoch": 0.18552517151415596, |
| "grad_norm": 0.07603341715798907, |
| "kl": 0.182135009765625, |
| "learning_rate": 1.956102521655831e-05, |
| "loss": 0.0073, |
| "reward": 0.45714284805580974, |
| "reward_std": 0.24492393187247216, |
| "rewards/accuracy_reward": 0.45714284805580974, |
| "rewards/format_reward": 0.0, |
| "step": 120 |
| }, |
| { |
| "completion_length": 573.8422079086304, |
| "epoch": 0.20098560247366895, |
| "grad_norm": 0.08767607432549998, |
| "kl": 0.2532989501953125, |
| "learning_rate": 1.9388679627438486e-05, |
| "loss": 0.0101, |
| "reward": 0.450510194664821, |
| "reward_std": 0.24328936655074357, |
| "rewards/accuracy_reward": 0.450510194664821, |
| "rewards/format_reward": 0.0, |
| "step": 130 |
| }, |
| { |
| "completion_length": 441.023459815979, |
| "epoch": 0.21644603343318194, |
| "grad_norm": 0.09806805953108236, |
| "kl": 0.26416015625, |
| "learning_rate": 1.9188890115960967e-05, |
| "loss": 0.0106, |
| "reward": 0.47551019601523875, |
| "reward_std": 0.2608696824405342, |
| "rewards/accuracy_reward": 0.47551019601523875, |
| "rewards/format_reward": 0.0, |
| "step": 140 |
| }, |
| { |
| "completion_length": 458.37256927490233, |
| "epoch": 0.23190646439269494, |
| "grad_norm": 0.1513486117044564, |
| "kl": 0.3147705078125, |
| "learning_rate": 1.8962240684142923e-05, |
| "loss": 0.0126, |
| "reward": 0.4642857049591839, |
| "reward_std": 0.2634817813988775, |
| "rewards/accuracy_reward": 0.4642857049591839, |
| "rewards/format_reward": 0.0, |
| "step": 150 |
| }, |
| { |
| "completion_length": 497.4808575630188, |
| "epoch": 0.24736689535220793, |
| "grad_norm": 0.8199204469447977, |
| "kl": 4.927545166015625, |
| "learning_rate": 1.8709393847871146e-05, |
| "loss": 0.1972, |
| "reward": 0.4331632573157549, |
| "reward_std": 0.28318220381625, |
| "rewards/accuracy_reward": 0.4331632573157549, |
| "rewards/format_reward": 0.0, |
| "step": 160 |
| }, |
| { |
| "completion_length": 492.83812828063964, |
| "epoch": 0.26282732631172095, |
| "grad_norm": 0.13685058175751932, |
| "kl": 0.540545654296875, |
| "learning_rate": 1.8431088700310846e-05, |
| "loss": 0.0216, |
| "reward": 0.5280612137168645, |
| "reward_std": 0.25305321919731794, |
| "rewards/accuracy_reward": 0.5280612137168645, |
| "rewards/format_reward": 0.0, |
| "step": 170 |
| }, |
| { |
| "completion_length": 749.1317470550537, |
| "epoch": 0.27828775727123395, |
| "grad_norm": 0.10491062132812622, |
| "kl": 0.122552490234375, |
| "learning_rate": 1.8128138751472432e-05, |
| "loss": 0.0049, |
| "reward": 0.434948971029371, |
| "reward_std": 0.2707877185661346, |
| "rewards/accuracy_reward": 0.434948971029371, |
| "rewards/format_reward": 0.0, |
| "step": 180 |
| }, |
| { |
| "completion_length": 729.5617179870605, |
| "epoch": 0.29374818823074694, |
| "grad_norm": 152.95600809970867, |
| "kl": 2.2434478759765626, |
| "learning_rate": 1.780142955025139e-05, |
| "loss": 0.0897, |
| "reward": 0.43596938010305164, |
| "reward_std": 0.2726593071129173, |
| "rewards/accuracy_reward": 0.43596938010305164, |
| "rewards/format_reward": 0.0, |
| "step": 190 |
| }, |
| { |
| "completion_length": 715.5297044754028, |
| "epoch": 0.30920861919025994, |
| "grad_norm": 0.3781996804891015, |
| "kl": 1.44560546875, |
| "learning_rate": 1.745191609589231e-05, |
| "loss": 0.0578, |
| "reward": 0.42602040050551293, |
| "reward_std": 0.29439413188956676, |
| "rewards/accuracy_reward": 0.42602040050551293, |
| "rewards/format_reward": 0.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.30920861919025994, |
| "eval_completion_length": 699.7232763671875, |
| "eval_kl": 0.60046875, |
| "eval_loss": 0.023748639971017838, |
| "eval_reward": 0.44000002443790437, |
| "eval_reward_std": 0.29031212598085404, |
| "eval_rewards/accuracy_reward": 0.44000002443790437, |
| "eval_rewards/format_reward": 0.0, |
| "eval_runtime": 117.9304, |
| "eval_samples_per_second": 0.839, |
| "eval_steps_per_second": 0.034, |
| "step": 200 |
| }, |
| { |
| "completion_length": 651.5467977523804, |
| "epoch": 0.32466905014977293, |
| "grad_norm": 0.16011847933591739, |
| "kl": 0.3212890625, |
| "learning_rate": 1.7080620046443503e-05, |
| "loss": 0.0128, |
| "reward": 0.46645407294854524, |
| "reward_std": 0.2662083923816681, |
| "rewards/accuracy_reward": 0.46645407294854524, |
| "rewards/format_reward": 0.0, |
| "step": 210 |
| }, |
| { |
| "completion_length": 610.8702682495117, |
| "epoch": 0.3401294811092859, |
| "grad_norm": 0.11241315704764059, |
| "kl": 0.200762939453125, |
| "learning_rate": 1.6688626732362192e-05, |
| "loss": 0.008, |
| "reward": 0.46709182686172424, |
| "reward_std": 0.2868866888806224, |
| "rewards/accuracy_reward": 0.46709182686172424, |
| "rewards/format_reward": 0.0, |
| "step": 220 |
| }, |
| { |
| "completion_length": 573.5464164733887, |
| "epoch": 0.3555899120687989, |
| "grad_norm": 0.4545690819582109, |
| "kl": 0.3180419921875, |
| "learning_rate": 1.6277081983999742e-05, |
| "loss": 0.0127, |
| "reward": 0.46811223728582263, |
| "reward_std": 0.27821872364729644, |
| "rewards/accuracy_reward": 0.46811223728582263, |
| "rewards/format_reward": 0.0, |
| "step": 230 |
| }, |
| { |
| "completion_length": 591.5071292877197, |
| "epoch": 0.3710503430283119, |
| "grad_norm": 3.8443277156368407, |
| "kl": 0.709783935546875, |
| "learning_rate": 1.5847188782240473e-05, |
| "loss": 0.0284, |
| "reward": 0.4396683592349291, |
| "reward_std": 0.2964717396069318, |
| "rewards/accuracy_reward": 0.4396683592349291, |
| "rewards/format_reward": 0.0, |
| "step": 240 |
| }, |
| { |
| "completion_length": 581.6975629806518, |
| "epoch": 0.3865107739878249, |
| "grad_norm": 0.3623625896893634, |
| "kl": 0.77828369140625, |
| "learning_rate": 1.5400203742084508e-05, |
| "loss": 0.0311, |
| "reward": 0.4667091774288565, |
| "reward_std": 0.29322946835309266, |
| "rewards/accuracy_reward": 0.4667091774288565, |
| "rewards/format_reward": 0.0, |
| "step": 250 |
| }, |
| { |
| "completion_length": 592.2187366485596, |
| "epoch": 0.4019712049473379, |
| "grad_norm": 0.2809541447384242, |
| "kl": 0.5046875, |
| "learning_rate": 1.4937433439453465e-05, |
| "loss": 0.0202, |
| "reward": 0.46109692989848555, |
| "reward_std": 0.297008786117658, |
| "rewards/accuracy_reward": 0.46109692989848555, |
| "rewards/format_reward": 0.0, |
| "step": 260 |
| }, |
| { |
| "completion_length": 551.9140211105347, |
| "epoch": 0.4174316359068509, |
| "grad_norm": 0.11113688922654333, |
| "kl": 0.397216796875, |
| "learning_rate": 1.4460230591956097e-05, |
| "loss": 0.0159, |
| "reward": 0.5364795843139291, |
| "reward_std": 0.250957741914317, |
| "rewards/accuracy_reward": 0.5364795843139291, |
| "rewards/format_reward": 0.0, |
| "step": 270 |
| }, |
| { |
| "completion_length": 568.6547088623047, |
| "epoch": 0.4328920668663639, |
| "grad_norm": 0.24708378647232437, |
| "kl": 0.163128662109375, |
| "learning_rate": 1.3969990104777712e-05, |
| "loss": 0.0065, |
| "reward": 0.509438766585663, |
| "reward_std": 0.25802470711059866, |
| "rewards/accuracy_reward": 0.509438766585663, |
| "rewards/format_reward": 0.0, |
| "step": 280 |
| }, |
| { |
| "completion_length": 545.0108312606811, |
| "epoch": 0.4483524978258769, |
| "grad_norm": 5.507612845198049, |
| "kl": 1.26832275390625, |
| "learning_rate": 1.3468144993251735e-05, |
| "loss": 0.0508, |
| "reward": 0.45420917579904196, |
| "reward_std": 0.27056095115840434, |
| "rewards/accuracy_reward": 0.45420917579904196, |
| "rewards/format_reward": 0.0, |
| "step": 290 |
| }, |
| { |
| "completion_length": 554.2693771362304, |
| "epoch": 0.4638129287853899, |
| "grad_norm": 0.6042026046483606, |
| "kl": 0.679656982421875, |
| "learning_rate": 1.295616219403197e-05, |
| "loss": 0.0272, |
| "reward": 0.4521683591417968, |
| "reward_std": 0.28633297309279443, |
| "rewards/accuracy_reward": 0.4521683591417968, |
| "rewards/format_reward": 0.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4638129287853899, |
| "eval_completion_length": 527.6724047851562, |
| "eval_kl": 0.7890625, |
| "eval_loss": 0.030249282717704773, |
| "eval_reward": 0.4671428832411766, |
| "eval_reward_std": 0.30915650248527526, |
| "eval_rewards/accuracy_reward": 0.4671428832411766, |
| "eval_rewards/format_reward": 0.0, |
| "eval_runtime": 112.1004, |
| "eval_samples_per_second": 0.883, |
| "eval_steps_per_second": 0.036, |
| "step": 300 |
| }, |
| { |
| "completion_length": 559.3555992126464, |
| "epoch": 0.47927335974490287, |
| "grad_norm": 0.3223266455258349, |
| "kl": 0.563067626953125, |
| "learning_rate": 1.2435538277109919e-05, |
| "loss": 0.0225, |
| "reward": 0.4325255031813867, |
| "reward_std": 0.2710647247266024, |
| "rewards/accuracy_reward": 0.4325255031813867, |
| "rewards/format_reward": 0.0, |
| "step": 310 |
| }, |
| { |
| "completion_length": 567.4769004821777, |
| "epoch": 0.49473379070441587, |
| "grad_norm": 0.32191489256962375, |
| "kl": 0.478643798828125, |
| "learning_rate": 1.19077950712113e-05, |
| "loss": 0.0191, |
| "reward": 0.5024234588257969, |
| "reward_std": 0.2630208498798311, |
| "rewards/accuracy_reward": 0.5024234588257969, |
| "rewards/format_reward": 0.0, |
| "step": 320 |
| }, |
| { |
| "completion_length": 593.5192455291748, |
| "epoch": 0.5101942216639289, |
| "grad_norm": 0.3534014177091543, |
| "kl": 0.58690185546875, |
| "learning_rate": 1.137447521535908e-05, |
| "loss": 0.0235, |
| "reward": 0.49515305291861295, |
| "reward_std": 0.2701924462337047, |
| "rewards/accuracy_reward": 0.49515305291861295, |
| "rewards/format_reward": 0.0, |
| "step": 330 |
| }, |
| { |
| "completion_length": 580.6474365234375, |
| "epoch": 0.5256546526234419, |
| "grad_norm": 0.5793452639869299, |
| "kl": 0.567840576171875, |
| "learning_rate": 1.0837137649606241e-05, |
| "loss": 0.0227, |
| "reward": 0.48864794997498395, |
| "reward_std": 0.25743518364615736, |
| "rewards/accuracy_reward": 0.48864794997498395, |
| "rewards/format_reward": 0.0, |
| "step": 340 |
| }, |
| { |
| "completion_length": 568.6904249191284, |
| "epoch": 0.5411150835829549, |
| "grad_norm": 0.18695646465398727, |
| "kl": 0.605303955078125, |
| "learning_rate": 1.0297353058119209e-05, |
| "loss": 0.0242, |
| "reward": 0.4730867262929678, |
| "reward_std": 0.2723493260331452, |
| "rewards/accuracy_reward": 0.4730867262929678, |
| "rewards/format_reward": 0.0, |
| "step": 350 |
| }, |
| { |
| "completion_length": 544.8482028961182, |
| "epoch": 0.5565755145424679, |
| "grad_norm": 0.40854460643950763, |
| "kl": 0.607025146484375, |
| "learning_rate": 9.756699277932196e-06, |
| "loss": 0.0243, |
| "reward": 0.4915816240012646, |
| "reward_std": 0.2733304013963789, |
| "rewards/accuracy_reward": 0.4915816240012646, |
| "rewards/format_reward": 0.0, |
| "step": 360 |
| }, |
| { |
| "completion_length": 527.0141460418702, |
| "epoch": 0.5720359455019809, |
| "grad_norm": 0.5971511752861693, |
| "kl": 0.598443603515625, |
| "learning_rate": 9.216756686793163e-06, |
| "loss": 0.0239, |
| "reward": 0.5142857053317129, |
| "reward_std": 0.25178592149168255, |
| "rewards/accuracy_reward": 0.5142857053317129, |
| "rewards/format_reward": 0.0, |
| "step": 370 |
| }, |
| { |
| "completion_length": 544.3457790374756, |
| "epoch": 0.5874963764614939, |
| "grad_norm": 0.26757335867195303, |
| "kl": 0.774627685546875, |
| "learning_rate": 8.67910358358298e-06, |
| "loss": 0.031, |
| "reward": 0.49706631591543554, |
| "reward_std": 0.2742634845431894, |
| "rewards/accuracy_reward": 0.49706631591543554, |
| "rewards/format_reward": 0.0, |
| "step": 380 |
| }, |
| { |
| "completion_length": 549.8956525802612, |
| "epoch": 0.6029568074210069, |
| "grad_norm": 1.6833932480130032, |
| "kl": 0.77420654296875, |
| "learning_rate": 8.145311574811325e-06, |
| "loss": 0.031, |
| "reward": 0.48048468669876454, |
| "reward_std": 0.26254036352038385, |
| "rewards/accuracy_reward": 0.48048468669876454, |
| "rewards/format_reward": 0.0, |
| "step": 390 |
| }, |
| { |
| "completion_length": 526.9630001068115, |
| "epoch": 0.6184172383805199, |
| "grad_norm": 1.314728368359331, |
| "kl": 0.8248779296875, |
| "learning_rate": 7.616940980675004e-06, |
| "loss": 0.033, |
| "reward": 0.4784438674338162, |
| "reward_std": 0.28052179743535816, |
| "rewards/accuracy_reward": 0.4784438674338162, |
| "rewards/format_reward": 0.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6184172383805199, |
| "eval_completion_length": 513.1444018554688, |
| "eval_kl": 1.018828125, |
| "eval_loss": 0.04034169018268585, |
| "eval_reward": 0.4842857384681702, |
| "eval_reward_std": 0.31372432827949526, |
| "eval_rewards/accuracy_reward": 0.4842857384681702, |
| "eval_rewards/format_reward": 0.0, |
| "eval_runtime": 112.3631, |
| "eval_samples_per_second": 0.881, |
| "eval_steps_per_second": 0.036, |
| "step": 400 |
| }, |
| { |
| "completion_length": 500.106876373291, |
| "epoch": 0.6338776693400329, |
| "grad_norm": 0.3314015554378421, |
| "kl": 0.7260986328125, |
| "learning_rate": 7.095536274107046e-06, |
| "loss": 0.029, |
| "reward": 0.5030612162780017, |
| "reward_std": 0.2555597382131964, |
| "rewards/accuracy_reward": 0.5030612162780017, |
| "rewards/format_reward": 0.0, |
| "step": 410 |
| }, |
| { |
| "completion_length": 572.6313653945923, |
| "epoch": 0.6493381002995459, |
| "grad_norm": 1.83967459416783, |
| "kl": 0.6177490234375, |
| "learning_rate": 6.58262156614881e-06, |
| "loss": 0.0247, |
| "reward": 0.4937499940395355, |
| "reward_std": 0.2641662787180394, |
| "rewards/accuracy_reward": 0.4937499940395355, |
| "rewards/format_reward": 0.0, |
| "step": 420 |
| }, |
| { |
| "completion_length": 558.3020294189453, |
| "epoch": 0.6647985312590589, |
| "grad_norm": 0.2328326516837162, |
| "kl": 0.5453857421875, |
| "learning_rate": 6.079696150841634e-06, |
| "loss": 0.0218, |
| "reward": 0.4604591763578355, |
| "reward_std": 0.25486370851285756, |
| "rewards/accuracy_reward": 0.4604591763578355, |
| "rewards/format_reward": 0.0, |
| "step": 430 |
| }, |
| { |
| "completion_length": 495.181622505188, |
| "epoch": 0.6802589622185718, |
| "grad_norm": 0.45411943739355637, |
| "kl": 0.489031982421875, |
| "learning_rate": 5.588230122660672e-06, |
| "loss": 0.0196, |
| "reward": 0.501403052546084, |
| "reward_std": 0.2606009878218174, |
| "rewards/accuracy_reward": 0.501403052546084, |
| "rewards/format_reward": 0.0, |
| "step": 440 |
| }, |
| { |
| "completion_length": 521.8503698348999, |
| "epoch": 0.6957193931780848, |
| "grad_norm": 0.6920580933707077, |
| "kl": 0.69771728515625, |
| "learning_rate": 5.109660079301668e-06, |
| "loss": 0.0279, |
| "reward": 0.48520407294854523, |
| "reward_std": 0.2722197940573096, |
| "rewards/accuracy_reward": 0.48520407294854523, |
| "rewards/format_reward": 0.0, |
| "step": 450 |
| }, |
| { |
| "completion_length": 516.6503719329834, |
| "epoch": 0.7111798241375978, |
| "grad_norm": 0.7655139656662532, |
| "kl": 0.59906005859375, |
| "learning_rate": 4.64538492238166e-06, |
| "loss": 0.024, |
| "reward": 0.4839285622350872, |
| "reward_std": 0.2701169220265001, |
| "rewards/accuracy_reward": 0.4839285622350872, |
| "rewards/format_reward": 0.0, |
| "step": 460 |
| }, |
| { |
| "completion_length": 532.4928466796875, |
| "epoch": 0.7266402550971108, |
| "grad_norm": 0.42334345776819876, |
| "kl": 0.660205078125, |
| "learning_rate": 4.196761768328599e-06, |
| "loss": 0.0264, |
| "reward": 0.4859693797305226, |
| "reward_std": 0.26866118256002663, |
| "rewards/accuracy_reward": 0.4859693797305226, |
| "rewards/format_reward": 0.0, |
| "step": 470 |
| }, |
| { |
| "completion_length": 534.1709095001221, |
| "epoch": 0.7421006860566238, |
| "grad_norm": 0.48787507323058743, |
| "kl": 0.65863037109375, |
| "learning_rate": 3.7651019814126656e-06, |
| "loss": 0.0263, |
| "reward": 0.4844387672841549, |
| "reward_std": 0.27398421289399266, |
| "rewards/accuracy_reward": 0.4844387672841549, |
| "rewards/format_reward": 0.0, |
| "step": 480 |
| }, |
| { |
| "completion_length": 534.6030525207519, |
| "epoch": 0.7575611170161368, |
| "grad_norm": 0.5465188749045764, |
| "kl": 0.7282470703125, |
| "learning_rate": 3.3516673405151546e-06, |
| "loss": 0.0291, |
| "reward": 0.49260203279554843, |
| "reward_std": 0.27119873408228157, |
| "rewards/accuracy_reward": 0.49260203279554843, |
| "rewards/format_reward": 0.0, |
| "step": 490 |
| }, |
| { |
| "completion_length": 531.7174621582031, |
| "epoch": 0.7730215479756498, |
| "grad_norm": 1.4984058691914603, |
| "kl": 0.571331787109375, |
| "learning_rate": 2.957666350839663e-06, |
| "loss": 0.0229, |
| "reward": 0.4956632579676807, |
| "reward_std": 0.25490435254760085, |
| "rewards/accuracy_reward": 0.4956632579676807, |
| "rewards/format_reward": 0.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7730215479756498, |
| "eval_completion_length": 518.6042163085938, |
| "eval_kl": 0.613046875, |
| "eval_loss": 0.024254189804196358, |
| "eval_reward": 0.5128571724891663, |
| "eval_reward_std": 0.2827815467119217, |
| "eval_rewards/accuracy_reward": 0.5128571724891663, |
| "eval_rewards/format_reward": 0.0, |
| "eval_runtime": 112.0796, |
| "eval_samples_per_second": 0.883, |
| "eval_steps_per_second": 0.036, |
| "step": 500 |
| }, |
| { |
| "completion_length": 565.8936117172241, |
| "epoch": 0.7884819789351628, |
| "grad_norm": 1.3357020339930779, |
| "kl": 0.752362060546875, |
| "learning_rate": 2.5842507113469307e-06, |
| "loss": 0.0301, |
| "reward": 0.48163264396134764, |
| "reward_std": 0.2701051170937717, |
| "rewards/accuracy_reward": 0.48163264396134764, |
| "rewards/format_reward": 0.0, |
| "step": 510 |
| }, |
| { |
| "completion_length": 549.8587898254394, |
| "epoch": 0.8039424098946758, |
| "grad_norm": 0.27711780573814004, |
| "kl": 0.6265380859375, |
| "learning_rate": 2.2325119482391466e-06, |
| "loss": 0.0251, |
| "reward": 0.48915815316140654, |
| "reward_std": 0.2548467483371496, |
| "rewards/accuracy_reward": 0.48915815316140654, |
| "rewards/format_reward": 0.0, |
| "step": 520 |
| }, |
| { |
| "completion_length": 531.7180992126465, |
| "epoch": 0.8194028408541888, |
| "grad_norm": 0.5958984702375305, |
| "kl": 0.599822998046875, |
| "learning_rate": 1.9034782243345074e-06, |
| "loss": 0.024, |
| "reward": 0.513903050404042, |
| "reward_std": 0.2422366608865559, |
| "rewards/accuracy_reward": 0.513903050404042, |
| "rewards/format_reward": 0.0, |
| "step": 530 |
| }, |
| { |
| "completion_length": 536.6831537246704, |
| "epoch": 0.8348632718137018, |
| "grad_norm": 0.9836698663565713, |
| "kl": 0.66021728515625, |
| "learning_rate": 1.5981113336584041e-06, |
| "loss": 0.0264, |
| "reward": 0.5011479528620839, |
| "reward_std": 0.2621664395555854, |
| "rewards/accuracy_reward": 0.5011479528620839, |
| "rewards/format_reward": 0.0, |
| "step": 540 |
| }, |
| { |
| "completion_length": 530.7587900161743, |
| "epoch": 0.8503237027732148, |
| "grad_norm": 0.644004276722614, |
| "kl": 0.666015625, |
| "learning_rate": 1.3173038900362977e-06, |
| "loss": 0.0266, |
| "reward": 0.5116071328520775, |
| "reward_std": 0.264158633723855, |
| "rewards/accuracy_reward": 0.5116071328520775, |
| "rewards/format_reward": 0.0, |
| "step": 550 |
| }, |
| { |
| "completion_length": 537.9271587371826, |
| "epoch": 0.8657841337327278, |
| "grad_norm": 0.708668814006566, |
| "kl": 0.61654052734375, |
| "learning_rate": 1.0618767179063416e-06, |
| "loss": 0.0246, |
| "reward": 0.49528060380835087, |
| "reward_std": 0.25958121265284717, |
| "rewards/accuracy_reward": 0.49528060380835087, |
| "rewards/format_reward": 0.0, |
| "step": 560 |
| }, |
| { |
| "completion_length": 550.545781326294, |
| "epoch": 0.8812445646922408, |
| "grad_norm": 0.7500579172257139, |
| "kl": 0.71644287109375, |
| "learning_rate": 8.325764529785851e-07, |
| "loss": 0.0287, |
| "reward": 0.4839285627938807, |
| "reward_std": 0.25909215547144415, |
| "rewards/accuracy_reward": 0.4839285627938807, |
| "rewards/format_reward": 0.0, |
| "step": 570 |
| }, |
| { |
| "completion_length": 546.6188653945923, |
| "epoch": 0.8967049956517538, |
| "grad_norm": 0.37217374017534643, |
| "kl": 0.65179443359375, |
| "learning_rate": 6.300733597542086e-07, |
| "loss": 0.0261, |
| "reward": 0.49107142109423874, |
| "reward_std": 0.2726855373941362, |
| "rewards/accuracy_reward": 0.49107142109423874, |
| "rewards/format_reward": 0.0, |
| "step": 580 |
| }, |
| { |
| "completion_length": 548.421669960022, |
| "epoch": 0.9121654266112668, |
| "grad_norm": 0.2522060016676339, |
| "kl": 0.7209228515625, |
| "learning_rate": 4.549593722844492e-07, |
| "loss": 0.0288, |
| "reward": 0.48686223682016133, |
| "reward_std": 0.2682306385599077, |
| "rewards/accuracy_reward": 0.48686223682016133, |
| "rewards/format_reward": 0.0, |
| "step": 590 |
| }, |
| { |
| "completion_length": 536.7123596191407, |
| "epoch": 0.9276258575707798, |
| "grad_norm": 0.506532147702234, |
| "kl": 0.7099609375, |
| "learning_rate": 3.0774636389618196e-07, |
| "loss": 0.0284, |
| "reward": 0.508545909030363, |
| "reward_std": 0.26957538770511746, |
| "rewards/accuracy_reward": 0.508545909030363, |
| "rewards/format_reward": 0.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9276258575707798, |
| "eval_completion_length": 519.1669702148438, |
| "eval_kl": 0.608203125, |
| "eval_loss": 0.024479562416672707, |
| "eval_reward": 0.49571430921554566, |
| "eval_reward_std": 0.3164041242003441, |
| "eval_rewards/accuracy_reward": 0.49571430921554566, |
| "eval_rewards/format_reward": 0.0, |
| "eval_runtime": 118.8299, |
| "eval_samples_per_second": 0.833, |
| "eval_steps_per_second": 0.034, |
| "step": 600 |
| }, |
| { |
| "completion_length": 539.3131271362305, |
| "epoch": 0.9430862885302927, |
| "grad_norm": 0.5791165478383384, |
| "kl": 0.691119384765625, |
| "learning_rate": 1.8886465094192895e-07, |
| "loss": 0.0276, |
| "reward": 0.490943868085742, |
| "reward_std": 0.25967655666172507, |
| "rewards/accuracy_reward": 0.490943868085742, |
| "rewards/format_reward": 0.0, |
| "step": 610 |
| }, |
| { |
| "completion_length": 538.3966737747193, |
| "epoch": 0.9585467194898057, |
| "grad_norm": 0.7640246067553359, |
| "kl": 0.67713623046875, |
| "learning_rate": 9.866173494794462e-08, |
| "loss": 0.0271, |
| "reward": 0.5010203978512436, |
| "reward_std": 0.2573148904833943, |
| "rewards/accuracy_reward": 0.5010203978512436, |
| "rewards/format_reward": 0.0, |
| "step": 620 |
| }, |
| { |
| "completion_length": 544.9163146972656, |
| "epoch": 0.9740071504493187, |
| "grad_norm": 0.32057223426264814, |
| "kl": 0.733837890625, |
| "learning_rate": 3.7401286837214224e-08, |
| "loss": 0.0294, |
| "reward": 0.49795917570590975, |
| "reward_std": 0.2614025991875678, |
| "rewards/accuracy_reward": 0.49795917570590975, |
| "rewards/format_reward": 0.0, |
| "step": 630 |
| }, |
| { |
| "completion_length": 538.4007551193238, |
| "epoch": 0.9894675814088317, |
| "grad_norm": 0.46047905547332224, |
| "kl": 0.66102294921875, |
| "learning_rate": 5.262376196544239e-09, |
| "loss": 0.0264, |
| "reward": 0.486479582823813, |
| "reward_std": 0.27064854740165173, |
| "rewards/accuracy_reward": 0.486479582823813, |
| "rewards/format_reward": 0.0, |
| "step": 640 |
| }, |
| { |
| "completion_length": 533.0607868830363, |
| "epoch": 0.9987438399845395, |
| "kl": 0.6852213541666666, |
| "reward": 0.4989370664892097, |
| "reward_std": 0.26216560679798323, |
| "rewards/accuracy_reward": 0.4989370664892097, |
| "rewards/format_reward": 0.0, |
| "step": 646, |
| "total_flos": 0.0, |
| "train_loss": 0.02447813622547251, |
| "train_runtime": 60911.9541, |
| "train_samples_per_second": 1.189, |
| "train_steps_per_second": 0.011 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 646, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 7, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|