| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.007072135785007072, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 228.94644165039062, |
| "epoch": 7.072135785007072e-05, |
| "grad_norm": 5.001040503294373, |
| "kl": 0.0, |
| "learning_rate": 9.999961446907352e-07, |
| "loss": -0.0, |
| "reward": 1.427711844444275, |
| "reward_std": 0.43736881017684937, |
| "rewards/accuracy_reward": 0.5027117729187012, |
| "rewards/format_reward": 0.7410714626312256, |
| "step": 1, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 243.33929443359375, |
| "epoch": 0.00014144271570014144, |
| "grad_norm": 2.8203915378362665, |
| "kl": 0.000698089599609375, |
| "learning_rate": 9.999845788223948e-07, |
| "loss": 0.0, |
| "reward": 0.9055423736572266, |
| "reward_std": 0.4032002389431, |
| "rewards/accuracy_reward": 0.1912565976381302, |
| "rewards/format_reward": 0.6964285969734192, |
| "step": 2, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 304.08929443359375, |
| "epoch": 0.00021216407355021216, |
| "grad_norm": 3.229814928119297, |
| "kl": 0.000713348388671875, |
| "learning_rate": 9.999653025733385e-07, |
| "loss": 0.0, |
| "reward": 1.260606288909912, |
| "reward_std": 0.4095536172389984, |
| "rewards/accuracy_reward": 0.29989194869995117, |
| "rewards/format_reward": 0.848214328289032, |
| "step": 3, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 269.51788330078125, |
| "epoch": 0.0002828854314002829, |
| "grad_norm": 2.4757985886885496, |
| "kl": 0.001068115234375, |
| "learning_rate": 9.999383162408303e-07, |
| "loss": 0.0, |
| "reward": 1.1197317838668823, |
| "reward_std": 0.2747226357460022, |
| "rewards/accuracy_reward": 0.23401744663715363, |
| "rewards/format_reward": 0.8035714626312256, |
| "step": 4, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.7142857142857143, |
| "completion_length": 265.3214416503906, |
| "epoch": 0.0003536067892503536, |
| "grad_norm": 5.853654137415192, |
| "kl": 0.001983642578125, |
| "learning_rate": 9.999036202410323e-07, |
| "loss": 0.0001, |
| "reward": 0.9350484013557434, |
| "reward_std": 0.36939921975135803, |
| "rewards/accuracy_reward": 0.22254842519760132, |
| "rewards/format_reward": 0.6696428656578064, |
| "step": 5, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 308.64288330078125, |
| "epoch": 0.0004243281471004243, |
| "grad_norm": 4.2687495467742265, |
| "kl": 0.0017852783203125, |
| "learning_rate": 9.998612151090002e-07, |
| "loss": 0.0001, |
| "reward": 1.3212175369262695, |
| "reward_std": 0.21474608778953552, |
| "rewards/accuracy_reward": 0.4033604562282562, |
| "rewards/format_reward": 0.910714328289032, |
| "step": 6, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 269.9464416503906, |
| "epoch": 0.0004950495049504951, |
| "grad_norm": 1.8718838996670093, |
| "kl": 0.003631591796875, |
| "learning_rate": 9.998111014986734e-07, |
| "loss": 0.0001, |
| "reward": 1.1820130348205566, |
| "reward_std": 0.40140631794929504, |
| "rewards/accuracy_reward": 0.3248700797557831, |
| "rewards/format_reward": 0.8392857313156128, |
| "step": 7, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 261.5357360839844, |
| "epoch": 0.0005657708628005657, |
| "grad_norm": 33.10783433713444, |
| "kl": 0.0037689208984375, |
| "learning_rate": 9.997532801828658e-07, |
| "loss": 0.0002, |
| "reward": 1.4435728788375854, |
| "reward_std": 0.2840394377708435, |
| "rewards/accuracy_reward": 0.4275014102458954, |
| "rewards/format_reward": 0.9375000596046448, |
| "step": 8, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 229.94644165039062, |
| "epoch": 0.0006364922206506365, |
| "grad_norm": 2.322068277450259, |
| "kl": 0.00531005859375, |
| "learning_rate": 9.996877520532534e-07, |
| "loss": 0.0002, |
| "reward": 1.2075822353363037, |
| "reward_std": 0.12404467165470123, |
| "rewards/accuracy_reward": 0.17365358769893646, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 9, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 302.5, |
| "epoch": 0.0007072135785007072, |
| "grad_norm": 1.903621240007123, |
| "kl": 0.002532958984375, |
| "learning_rate": 9.996145181203615e-07, |
| "loss": 0.0001, |
| "reward": 1.2701517343521118, |
| "reward_std": 0.18076138198375702, |
| "rewards/accuracy_reward": 0.2558659315109253, |
| "rewards/format_reward": 0.910714328289032, |
| "step": 10, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 219.00001525878906, |
| "epoch": 0.0007779349363507779, |
| "grad_norm": 4.3082825593447485, |
| "kl": 0.00830078125, |
| "learning_rate": 9.995335795135475e-07, |
| "loss": 0.0003, |
| "reward": 1.4937142133712769, |
| "reward_std": 0.1519784927368164, |
| "rewards/accuracy_reward": 0.4294286072254181, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 11, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 166.7857208251953, |
| "epoch": 0.0008486562942008486, |
| "grad_norm": 1.7997888001626345, |
| "kl": 0.01519775390625, |
| "learning_rate": 9.99444937480985e-07, |
| "loss": 0.0006, |
| "reward": 1.480262279510498, |
| "reward_std": 0.17392753064632416, |
| "rewards/accuracy_reward": 0.4463335871696472, |
| "rewards/format_reward": 1.0, |
| "step": 12, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 279.6071472167969, |
| "epoch": 0.0009193776520509194, |
| "grad_norm": 11.803394899600724, |
| "kl": 0.00799560546875, |
| "learning_rate": 9.993485933896437e-07, |
| "loss": 0.0003, |
| "reward": 1.4244685173034668, |
| "reward_std": 0.19528640806674957, |
| "rewards/accuracy_reward": 0.23161137104034424, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 13, |
| "temporal_rewards": 0.7857142686843872 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 237.6428680419922, |
| "epoch": 0.0009900990099009901, |
| "grad_norm": 1.8875797724521384, |
| "kl": 0.01171875, |
| "learning_rate": 9.99244548725269e-07, |
| "loss": 0.0005, |
| "reward": 1.4068427085876465, |
| "reward_std": 0.26988983154296875, |
| "rewards/accuracy_reward": 0.3711283504962921, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 14, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 240.3035888671875, |
| "epoch": 0.0010608203677510608, |
| "grad_norm": 2.3109230196325914, |
| "kl": 0.015380859375, |
| "learning_rate": 9.99132805092358e-07, |
| "loss": 0.0006, |
| "reward": 1.1580581665039062, |
| "reward_std": 0.13049285113811493, |
| "rewards/accuracy_reward": 0.07948664575815201, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 15, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 293.4107360839844, |
| "epoch": 0.0011315417256011315, |
| "grad_norm": 4.0653699922309885, |
| "kl": 0.007232666015625, |
| "learning_rate": 9.990133642141357e-07, |
| "loss": 0.0003, |
| "reward": 1.4591931104660034, |
| "reward_std": 0.11937069892883301, |
| "rewards/accuracy_reward": 0.46455028653144836, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 16, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 209.5178680419922, |
| "epoch": 0.0012022630834512022, |
| "grad_norm": 3.010805401903002, |
| "kl": 0.006134033203125, |
| "learning_rate": 9.988862279325284e-07, |
| "loss": 0.0002, |
| "reward": 1.2082678079605103, |
| "reward_std": 0.18017704784870148, |
| "rewards/accuracy_reward": 0.1743391901254654, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 17, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 304.6607360839844, |
| "epoch": 0.001272984441301273, |
| "grad_norm": 2.8112272464882273, |
| "kl": 0.0086669921875, |
| "learning_rate": 9.98751398208135e-07, |
| "loss": 0.0003, |
| "reward": 1.2044445276260376, |
| "reward_std": 0.12818890810012817, |
| "rewards/accuracy_reward": 0.24908724427223206, |
| "rewards/format_reward": 0.848214328289032, |
| "step": 18, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 245.10714721679688, |
| "epoch": 0.0013437057991513438, |
| "grad_norm": 4.69936573628269, |
| "kl": 0.00701904296875, |
| "learning_rate": 9.986088771201963e-07, |
| "loss": 0.0003, |
| "reward": 1.3298383951187134, |
| "reward_std": 0.19001968204975128, |
| "rewards/accuracy_reward": 0.37983840703964233, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 19, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 268.3571472167969, |
| "epoch": 0.0014144271570014145, |
| "grad_norm": 5.228045202997088, |
| "kl": 0.00811767578125, |
| "learning_rate": 9.98458666866564e-07, |
| "loss": 0.0003, |
| "reward": 1.3763288259506226, |
| "reward_std": 0.16895097494125366, |
| "rewards/accuracy_reward": 0.2674001157283783, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 20, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 289.21429443359375, |
| "epoch": 0.0014851485148514852, |
| "grad_norm": 1.991182727557945, |
| "kl": 0.007293701171875, |
| "learning_rate": 9.983007697636658e-07, |
| "loss": 0.0003, |
| "reward": 1.26073157787323, |
| "reward_std": 0.242427796125412, |
| "rewards/accuracy_reward": 0.23573148250579834, |
| "rewards/format_reward": 1.0, |
| "step": 21, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 302.375, |
| "epoch": 0.0015558698727015559, |
| "grad_norm": 2.228743183846523, |
| "kl": 0.0084228515625, |
| "learning_rate": 9.981351882464707e-07, |
| "loss": 0.0003, |
| "reward": 1.2869592905044556, |
| "reward_std": 0.3141626715660095, |
| "rewards/accuracy_reward": 0.2548163831233978, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 22, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 316.01788330078125, |
| "epoch": 0.0016265912305516265, |
| "grad_norm": 3.8272666470851493, |
| "kl": 0.0081787109375, |
| "learning_rate": 9.979619248684501e-07, |
| "loss": 0.0003, |
| "reward": 1.33269202709198, |
| "reward_std": 0.13754969835281372, |
| "rewards/accuracy_reward": 0.3916205167770386, |
| "rewards/format_reward": 0.8571429252624512, |
| "step": 23, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 308.3571472167969, |
| "epoch": 0.0016973125884016972, |
| "grad_norm": 2.7805250038590934, |
| "kl": 0.0098876953125, |
| "learning_rate": 9.9778098230154e-07, |
| "loss": 0.0004, |
| "reward": 1.541878581047058, |
| "reward_std": 0.16751347482204437, |
| "rewards/accuracy_reward": 0.3454500734806061, |
| "rewards/format_reward": 1.0, |
| "step": 24, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 240.5178680419922, |
| "epoch": 0.001768033946251768, |
| "grad_norm": 7.148190397206045, |
| "kl": 0.0142822265625, |
| "learning_rate": 9.975923633360984e-07, |
| "loss": 0.0006, |
| "reward": 1.4286309480667114, |
| "reward_std": 0.1863246113061905, |
| "rewards/accuracy_reward": 0.38934507966041565, |
| "rewards/format_reward": 1.0, |
| "step": 25, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 314.125, |
| "epoch": 0.0018387553041018388, |
| "grad_norm": 2.7600877644928796, |
| "kl": 0.01080322265625, |
| "learning_rate": 9.973960708808631e-07, |
| "loss": 0.0004, |
| "reward": 1.4452130794525146, |
| "reward_std": 0.29430562257766724, |
| "rewards/accuracy_reward": 0.3523559868335724, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 26, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 287.375, |
| "epoch": 0.0019094766619519095, |
| "grad_norm": 2.1898194956106405, |
| "kl": 0.0128173828125, |
| "learning_rate": 9.971921079629069e-07, |
| "loss": 0.0005, |
| "reward": 1.3160984516143799, |
| "reward_std": 0.1425773948431015, |
| "rewards/accuracy_reward": 0.22145557403564453, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 27, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 323.1785888671875, |
| "epoch": 0.0019801980198019802, |
| "grad_norm": 2.3090632087802643, |
| "kl": 0.00872802734375, |
| "learning_rate": 9.969804777275898e-07, |
| "loss": 0.0004, |
| "reward": 1.4885876178741455, |
| "reward_std": 0.1416071355342865, |
| "rewards/accuracy_reward": 0.48501619696617126, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 28, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 277.71429443359375, |
| "epoch": 0.002050919377652051, |
| "grad_norm": 2.563040364496041, |
| "kl": 0.01495361328125, |
| "learning_rate": 9.967611834385122e-07, |
| "loss": 0.0006, |
| "reward": 1.5186972618103027, |
| "reward_std": 0.21927528083324432, |
| "rewards/accuracy_reward": 0.5436971187591553, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 29, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 279.3035888671875, |
| "epoch": 0.0021216407355021216, |
| "grad_norm": 5.357871739418545, |
| "kl": 0.010009765625, |
| "learning_rate": 9.965342284774631e-07, |
| "loss": 0.0004, |
| "reward": 1.4721060991287231, |
| "reward_std": 0.11375095695257187, |
| "rewards/accuracy_reward": 0.3828202784061432, |
| "rewards/format_reward": 1.0, |
| "step": 30, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 273.8035888671875, |
| "epoch": 0.0021923620933521925, |
| "grad_norm": 16.74341524407899, |
| "kl": 0.01470947265625, |
| "learning_rate": 9.962996163443688e-07, |
| "loss": 0.0006, |
| "reward": 1.4044029712677002, |
| "reward_std": 0.3334062397480011, |
| "rewards/accuracy_reward": 0.3222600221633911, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 31, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.8571428571428571, |
| "all_wrong": 0.0, |
| "completion_length": 234.19644165039062, |
| "epoch": 0.002263083451202263, |
| "grad_norm": 0.8595633494464696, |
| "kl": 0.01556396484375, |
| "learning_rate": 9.960573506572389e-07, |
| "loss": 0.0006, |
| "reward": 1.9745900630950928, |
| "reward_std": 0.0232665054500103, |
| "rewards/accuracy_reward": 0.8781614899635315, |
| "rewards/format_reward": 1.0, |
| "step": 32, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 419.2321472167969, |
| "epoch": 0.002333804809052334, |
| "grad_norm": 1.5338994037252882, |
| "kl": 0.0084228515625, |
| "learning_rate": 9.958074351521096e-07, |
| "loss": 0.0003, |
| "reward": 1.3846343755722046, |
| "reward_std": 0.33544743061065674, |
| "rewards/accuracy_reward": 0.3524913489818573, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 33, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 322.75, |
| "epoch": 0.0024045261669024044, |
| "grad_norm": 3.835179363483981, |
| "kl": 0.01275634765625, |
| "learning_rate": 9.955498736829874e-07, |
| "loss": 0.0005, |
| "reward": 1.4323281049728394, |
| "reward_std": 0.29070112109184265, |
| "rewards/accuracy_reward": 0.43232807517051697, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 34, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 338.375, |
| "epoch": 0.0024752475247524753, |
| "grad_norm": 2.5354088480923265, |
| "kl": 0.01483154296875, |
| "learning_rate": 9.952846702217885e-07, |
| "loss": 0.0006, |
| "reward": 1.4049376249313354, |
| "reward_std": 0.26813623309135437, |
| "rewards/accuracy_reward": 0.3192232847213745, |
| "rewards/format_reward": 1.0, |
| "step": 35, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 257.1071472167969, |
| "epoch": 0.002545968882602546, |
| "grad_norm": 2.1307554136020066, |
| "kl": 0.017822265625, |
| "learning_rate": 9.950118288582787e-07, |
| "loss": 0.0007, |
| "reward": 1.4344677925109863, |
| "reward_std": 0.20196352899074554, |
| "rewards/accuracy_reward": 0.3826819956302643, |
| "rewards/format_reward": 1.0, |
| "step": 36, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 197.37501525878906, |
| "epoch": 0.0026166902404526167, |
| "grad_norm": 1.055288306131669, |
| "kl": 0.01806640625, |
| "learning_rate": 9.947313538000092e-07, |
| "loss": 0.0007, |
| "reward": 1.4836164712905884, |
| "reward_std": 0.10975757986307144, |
| "rewards/accuracy_reward": 0.47290223836898804, |
| "rewards/format_reward": 1.0, |
| "step": 37, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 244.23214721679688, |
| "epoch": 0.0026874115983026876, |
| "grad_norm": 2.103619483245465, |
| "kl": 0.0272216796875, |
| "learning_rate": 9.944432493722524e-07, |
| "loss": 0.0011, |
| "reward": 1.3943806886672974, |
| "reward_std": 0.0798025131225586, |
| "rewards/accuracy_reward": 0.3586663007736206, |
| "rewards/format_reward": 1.0, |
| "step": 38, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 363.5714416503906, |
| "epoch": 0.002758132956152758, |
| "grad_norm": 1.7740744148730756, |
| "kl": 0.0142822265625, |
| "learning_rate": 9.941475200179346e-07, |
| "loss": 0.0006, |
| "reward": 1.3610663414001465, |
| "reward_std": 0.15090212225914001, |
| "rewards/accuracy_reward": 0.33428052067756653, |
| "rewards/format_reward": 0.9553571939468384, |
| "step": 39, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 346.76788330078125, |
| "epoch": 0.002828854314002829, |
| "grad_norm": 2.37933402889077, |
| "kl": 0.01202392578125, |
| "learning_rate": 9.938441702975689e-07, |
| "loss": 0.0005, |
| "reward": 1.4156906604766846, |
| "reward_std": 0.1760970950126648, |
| "rewards/accuracy_reward": 0.38533341884613037, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 40, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 341.7321472167969, |
| "epoch": 0.0028995756718528994, |
| "grad_norm": 1.80289901611109, |
| "kl": 0.01226806640625, |
| "learning_rate": 9.935332048891826e-07, |
| "loss": 0.0005, |
| "reward": 1.5363633632659912, |
| "reward_std": 0.15127059817314148, |
| "rewards/accuracy_reward": 0.4327918589115143, |
| "rewards/format_reward": 1.0, |
| "step": 41, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.6785888671875, |
| "epoch": 0.0029702970297029703, |
| "grad_norm": 6.9951116771129564, |
| "kl": 0.01312255859375, |
| "learning_rate": 9.932146285882476e-07, |
| "loss": 0.0005, |
| "reward": 1.4107944965362549, |
| "reward_std": 0.2239990383386612, |
| "rewards/accuracy_reward": 0.2965086102485657, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 42, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 361.9821472167969, |
| "epoch": 0.003041018387553041, |
| "grad_norm": 14.405879650265113, |
| "kl": 0.01422119140625, |
| "learning_rate": 9.928884463076043e-07, |
| "loss": 0.0006, |
| "reward": 1.4136697053909302, |
| "reward_std": 0.2486438900232315, |
| "rewards/accuracy_reward": 0.3136696517467499, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 43, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 350.0714416503906, |
| "epoch": 0.0031117397454031117, |
| "grad_norm": 27.566161680922317, |
| "kl": 0.0147705078125, |
| "learning_rate": 9.925546630773868e-07, |
| "loss": 0.0006, |
| "reward": 1.2689402103424072, |
| "reward_std": 0.16928793489933014, |
| "rewards/accuracy_reward": 0.2725115716457367, |
| "rewards/format_reward": 0.8750000596046448, |
| "step": 44, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 385.83929443359375, |
| "epoch": 0.0031824611032531826, |
| "grad_norm": 12.206659175892238, |
| "kl": 0.0146484375, |
| "learning_rate": 9.922132840449458e-07, |
| "loss": 0.0006, |
| "reward": 1.1907211542129517, |
| "reward_std": 0.2152532935142517, |
| "rewards/accuracy_reward": 0.13179250061511993, |
| "rewards/format_reward": 0.9196429252624512, |
| "step": 45, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.9107360839844, |
| "epoch": 0.003253182461103253, |
| "grad_norm": 2.4250446537484573, |
| "kl": 0.01446533203125, |
| "learning_rate": 9.91864314474768e-07, |
| "loss": 0.0006, |
| "reward": 1.376240611076355, |
| "reward_std": 0.20905308425426483, |
| "rewards/accuracy_reward": 0.28338348865509033, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 46, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 433.7321472167969, |
| "epoch": 0.003323903818953324, |
| "grad_norm": 4.7936130008464275, |
| "kl": 0.0084228515625, |
| "learning_rate": 9.915077597483958e-07, |
| "loss": 0.0003, |
| "reward": 1.392529845237732, |
| "reward_std": 0.2959822416305542, |
| "rewards/accuracy_reward": 0.369315505027771, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 47, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 423.4821472167969, |
| "epoch": 0.0033946251768033945, |
| "grad_norm": 1.1925199674831055, |
| "kl": 0.00927734375, |
| "learning_rate": 9.911436253643443e-07, |
| "loss": 0.0004, |
| "reward": 1.4139832258224487, |
| "reward_std": 0.2553583085536957, |
| "rewards/accuracy_reward": 0.4854118824005127, |
| "rewards/format_reward": 0.8928571939468384, |
| "step": 48, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 360.1071472167969, |
| "epoch": 0.0034653465346534654, |
| "grad_norm": 6.970466371308957, |
| "kl": 0.01214599609375, |
| "learning_rate": 9.907719169380162e-07, |
| "loss": 0.0005, |
| "reward": 1.4226962327957153, |
| "reward_std": 0.15648144483566284, |
| "rewards/accuracy_reward": 0.29769620299339294, |
| "rewards/format_reward": 1.0, |
| "step": 49, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 245.75001525878906, |
| "epoch": 0.003536067892503536, |
| "grad_norm": 11.750858368118319, |
| "kl": 0.016845703125, |
| "learning_rate": 9.90392640201615e-07, |
| "loss": 0.0007, |
| "reward": 1.2471274137496948, |
| "reward_std": 0.20660826563835144, |
| "rewards/accuracy_reward": 0.2274845540523529, |
| "rewards/format_reward": 1.0, |
| "step": 50, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 296.96429443359375, |
| "epoch": 0.0036067892503536068, |
| "grad_norm": 1.9771563601864632, |
| "kl": 0.01513671875, |
| "learning_rate": 9.900058010040577e-07, |
| "loss": 0.0006, |
| "reward": 1.4364941120147705, |
| "reward_std": 0.2468460500240326, |
| "rewards/accuracy_reward": 0.3489939868450165, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 51, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 282.7857360839844, |
| "epoch": 0.0036775106082036777, |
| "grad_norm": 2.490876400127454, |
| "kl": 0.0164794921875, |
| "learning_rate": 9.89611405310883e-07, |
| "loss": 0.0007, |
| "reward": 1.4430785179138184, |
| "reward_std": 0.1914571076631546, |
| "rewards/accuracy_reward": 0.34843552112579346, |
| "rewards/format_reward": 1.0, |
| "step": 52, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 255.35714721679688, |
| "epoch": 0.003748231966053748, |
| "grad_norm": 1.9265114881852106, |
| "kl": 0.0167236328125, |
| "learning_rate": 9.8920945920416e-07, |
| "loss": 0.0007, |
| "reward": 1.6397331953048706, |
| "reward_std": 0.16647037863731384, |
| "rewards/accuracy_reward": 0.5183045864105225, |
| "rewards/format_reward": 1.0, |
| "step": 53, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 328.51788330078125, |
| "epoch": 0.003818953323903819, |
| "grad_norm": 1.9871436921834855, |
| "kl": 0.01239013671875, |
| "learning_rate": 9.887999688823954e-07, |
| "loss": 0.0005, |
| "reward": 1.36974036693573, |
| "reward_std": 0.1781034767627716, |
| "rewards/accuracy_reward": 0.34116891026496887, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 54, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 255.71429443359375, |
| "epoch": 0.0038896746817538895, |
| "grad_norm": 1.9418580621335162, |
| "kl": 0.015869140625, |
| "learning_rate": 9.883829406604361e-07, |
| "loss": 0.0006, |
| "reward": 1.2471375465393066, |
| "reward_std": 0.20031888782978058, |
| "rewards/accuracy_reward": 0.2346375435590744, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 55, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 309.5535888671875, |
| "epoch": 0.0039603960396039604, |
| "grad_norm": 2.0813942224085347, |
| "kl": 0.0146484375, |
| "learning_rate": 9.879583809693736e-07, |
| "loss": 0.0006, |
| "reward": 1.356779932975769, |
| "reward_std": 0.13400611281394958, |
| "rewards/accuracy_reward": 0.29785144329071045, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 56, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 371.83929443359375, |
| "epoch": 0.004031117397454031, |
| "grad_norm": 2.64005633723031, |
| "kl": 0.01141357421875, |
| "learning_rate": 9.875262963564435e-07, |
| "loss": 0.0005, |
| "reward": 1.4918296337127686, |
| "reward_std": 0.37153175473213196, |
| "rewards/accuracy_reward": 0.5061153769493103, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 57, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 272.1071472167969, |
| "epoch": 0.004101838755304102, |
| "grad_norm": 2.3327288177522463, |
| "kl": 0.019775390625, |
| "learning_rate": 9.870866934849246e-07, |
| "loss": 0.0008, |
| "reward": 1.3908780813217163, |
| "reward_std": 0.18408146500587463, |
| "rewards/accuracy_reward": 0.35516369342803955, |
| "rewards/format_reward": 1.0, |
| "step": 58, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 361.7500305175781, |
| "epoch": 0.004172560113154172, |
| "grad_norm": 2.4090170910345385, |
| "kl": 0.016845703125, |
| "learning_rate": 9.866395791340374e-07, |
| "loss": 0.0007, |
| "reward": 1.3034508228302002, |
| "reward_std": 0.23496000468730927, |
| "rewards/accuracy_reward": 0.22845058143138885, |
| "rewards/format_reward": 0.973214328289032, |
| "step": 59, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 338.3035888671875, |
| "epoch": 0.004243281471004243, |
| "grad_norm": 2.0023139679744824, |
| "kl": 0.0140380859375, |
| "learning_rate": 9.861849601988383e-07, |
| "loss": 0.0006, |
| "reward": 1.3882596492767334, |
| "reward_std": 0.1839255839586258, |
| "rewards/accuracy_reward": 0.4061168134212494, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 60, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 273.375, |
| "epoch": 0.004314002828854314, |
| "grad_norm": 2.2569790201906055, |
| "kl": 0.0203857421875, |
| "learning_rate": 9.857228436901134e-07, |
| "loss": 0.0008, |
| "reward": 1.6071990728378296, |
| "reward_std": 0.1727701723575592, |
| "rewards/accuracy_reward": 0.4821990430355072, |
| "rewards/format_reward": 1.0, |
| "step": 61, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 281.625, |
| "epoch": 0.004384724186704385, |
| "grad_norm": 1.6524578433340213, |
| "kl": 0.0125732421875, |
| "learning_rate": 9.852532367342712e-07, |
| "loss": 0.0005, |
| "reward": 1.617965579032898, |
| "reward_std": 0.2325797826051712, |
| "rewards/accuracy_reward": 0.5501083731651306, |
| "rewards/format_reward": 1.0, |
| "step": 62, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 237.1785888671875, |
| "epoch": 0.004455445544554455, |
| "grad_norm": 2.83202259372297, |
| "kl": 0.02685546875, |
| "learning_rate": 9.847761465732316e-07, |
| "loss": 0.0011, |
| "reward": 1.5921242237091064, |
| "reward_std": 0.13505886495113373, |
| "rewards/accuracy_reward": 0.5099811553955078, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 63, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 280.3214416503906, |
| "epoch": 0.004526166902404526, |
| "grad_norm": 6.503797696065202, |
| "kl": 0.0166015625, |
| "learning_rate": 9.842915805643156e-07, |
| "loss": 0.0007, |
| "reward": 1.2465910911560059, |
| "reward_std": 0.15690433979034424, |
| "rewards/accuracy_reward": 0.2215910702943802, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 64, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 299.71429443359375, |
| "epoch": 0.004596888260254597, |
| "grad_norm": 2.597081597057807, |
| "kl": 0.01318359375, |
| "learning_rate": 9.837995461801299e-07, |
| "loss": 0.0005, |
| "reward": 1.2907322645187378, |
| "reward_std": 0.18411685526371002, |
| "rewards/accuracy_reward": 0.24787509441375732, |
| "rewards/format_reward": 1.0, |
| "step": 65, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 254.4285888671875, |
| "epoch": 0.004667609618104668, |
| "grad_norm": 3.0012617684710476, |
| "kl": 0.0184326171875, |
| "learning_rate": 9.833000510084537e-07, |
| "loss": 0.0007, |
| "reward": 1.5473493337631226, |
| "reward_std": 0.28692588210105896, |
| "rewards/accuracy_reward": 0.43663495779037476, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 66, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 310.3571472167969, |
| "epoch": 0.004738330975954739, |
| "grad_norm": 2.5744316518646677, |
| "kl": 0.0164794921875, |
| "learning_rate": 9.827931027521203e-07, |
| "loss": 0.0007, |
| "reward": 1.5562368631362915, |
| "reward_std": 0.056368716061115265, |
| "rewards/accuracy_reward": 0.4740940034389496, |
| "rewards/format_reward": 1.0, |
| "step": 67, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 285.1607360839844, |
| "epoch": 0.004809052333804809, |
| "grad_norm": 4.4218169067197675, |
| "kl": 0.02099609375, |
| "learning_rate": 9.82278709228899e-07, |
| "loss": 0.0008, |
| "reward": 1.6907390356063843, |
| "reward_std": 0.14986543357372284, |
| "rewards/accuracy_reward": 0.48716750741004944, |
| "rewards/format_reward": 1.0, |
| "step": 68, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 268.83929443359375, |
| "epoch": 0.00487977369165488, |
| "grad_norm": 36.58319370122495, |
| "kl": 0.0150146484375, |
| "learning_rate": 9.817568783713743e-07, |
| "loss": 0.0006, |
| "reward": 1.422197699546814, |
| "reward_std": 0.1646842360496521, |
| "rewards/accuracy_reward": 0.29362624883651733, |
| "rewards/format_reward": 1.0, |
| "step": 69, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 404.3214416503906, |
| "epoch": 0.0049504950495049506, |
| "grad_norm": 1.9404067621178873, |
| "kl": 0.0120849609375, |
| "learning_rate": 9.812276182268236e-07, |
| "loss": 0.0005, |
| "reward": 1.441391110420227, |
| "reward_std": 0.3151356279850006, |
| "rewards/accuracy_reward": 0.3735339939594269, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 70, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 287.5714416503906, |
| "epoch": 0.0050212164073550215, |
| "grad_norm": 1.5992858525788907, |
| "kl": 0.0206298828125, |
| "learning_rate": 9.80690936957093e-07, |
| "loss": 0.0008, |
| "reward": 1.2283703088760376, |
| "reward_std": 0.19715102016925812, |
| "rewards/accuracy_reward": 0.1426558941602707, |
| "rewards/format_reward": 1.0, |
| "step": 71, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 234.46429443359375, |
| "epoch": 0.005091937765205092, |
| "grad_norm": 3.2452184263458235, |
| "kl": 0.020263671875, |
| "learning_rate": 9.801468428384716e-07, |
| "loss": 0.0008, |
| "reward": 1.4352792501449585, |
| "reward_std": 0.1383657157421112, |
| "rewards/accuracy_reward": 0.4245648682117462, |
| "rewards/format_reward": 1.0, |
| "step": 72, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 226.6607208251953, |
| "epoch": 0.005162659123055162, |
| "grad_norm": 3.2995827121320156, |
| "kl": 0.0206298828125, |
| "learning_rate": 9.795953442615637e-07, |
| "loss": 0.0008, |
| "reward": 1.4032503366470337, |
| "reward_std": 0.18326207995414734, |
| "rewards/accuracy_reward": 0.30860739946365356, |
| "rewards/format_reward": 1.0, |
| "step": 73, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 273.4821472167969, |
| "epoch": 0.005233380480905233, |
| "grad_norm": 2.014919533244763, |
| "kl": 0.01708984375, |
| "learning_rate": 9.790364497311595e-07, |
| "loss": 0.0007, |
| "reward": 1.494827151298523, |
| "reward_std": 0.2938782870769501, |
| "rewards/accuracy_reward": 0.47696998715400696, |
| "rewards/format_reward": 1.0, |
| "step": 74, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 333.3035888671875, |
| "epoch": 0.005304101838755304, |
| "grad_norm": 2.149668464492747, |
| "kl": 0.0152587890625, |
| "learning_rate": 9.784701678661044e-07, |
| "loss": 0.0006, |
| "reward": 1.6914342641830444, |
| "reward_std": 0.2628679871559143, |
| "rewards/accuracy_reward": 0.5807199478149414, |
| "rewards/format_reward": 1.0, |
| "step": 75, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 198.37501525878906, |
| "epoch": 0.005374823196605375, |
| "grad_norm": 2.941637870652214, |
| "kl": 0.023681640625, |
| "learning_rate": 9.77896507399165e-07, |
| "loss": 0.0009, |
| "reward": 1.6653074026107788, |
| "reward_std": 0.17388883233070374, |
| "rewards/accuracy_reward": 0.6617358326911926, |
| "rewards/format_reward": 1.0, |
| "step": 76, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 292.9107360839844, |
| "epoch": 0.005445544554455445, |
| "grad_norm": 2.575279782360163, |
| "kl": 0.022216796875, |
| "learning_rate": 9.773154771768955e-07, |
| "loss": 0.0009, |
| "reward": 1.3348206281661987, |
| "reward_std": 0.2901800870895386, |
| "rewards/accuracy_reward": 0.24732069671154022, |
| "rewards/format_reward": 1.0, |
| "step": 77, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.9285888671875, |
| "epoch": 0.005516265912305516, |
| "grad_norm": 2.25007048941082, |
| "kl": 0.014892578125, |
| "learning_rate": 9.767270861595004e-07, |
| "loss": 0.0006, |
| "reward": 1.4193731546401978, |
| "reward_std": 0.2683948278427124, |
| "rewards/accuracy_reward": 0.26937323808670044, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 78, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 348.3035888671875, |
| "epoch": 0.005586987270155587, |
| "grad_norm": 5.781423242439702, |
| "kl": 0.02099609375, |
| "learning_rate": 9.761313434206977e-07, |
| "loss": 0.0008, |
| "reward": 1.4807766675949097, |
| "reward_std": 0.09172937273979187, |
| "rewards/accuracy_reward": 0.5164910554885864, |
| "rewards/format_reward": 0.8392857313156128, |
| "step": 79, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 330.75, |
| "epoch": 0.005657708628005658, |
| "grad_norm": 3.517785506448118, |
| "kl": 0.011962890625, |
| "learning_rate": 9.755282581475767e-07, |
| "loss": 0.0005, |
| "reward": 1.233546495437622, |
| "reward_std": 0.3027758002281189, |
| "rewards/accuracy_reward": 0.27283215522766113, |
| "rewards/format_reward": 0.9285714626312256, |
| "step": 80, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 346.1607360839844, |
| "epoch": 0.005728429985855729, |
| "grad_norm": 1.6029567912993616, |
| "kl": 0.0162353515625, |
| "learning_rate": 9.749178396404588e-07, |
| "loss": 0.0007, |
| "reward": 1.4679009914398193, |
| "reward_std": 0.15201067924499512, |
| "rewards/accuracy_reward": 0.43218663334846497, |
| "rewards/format_reward": 0.9642857313156128, |
| "step": 81, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 235.75001525878906, |
| "epoch": 0.005799151343705799, |
| "grad_norm": 1.4899619569405949, |
| "kl": 0.0201416015625, |
| "learning_rate": 9.743000973127523e-07, |
| "loss": 0.0008, |
| "reward": 1.4392858743667603, |
| "reward_std": 0.2590990662574768, |
| "rewards/accuracy_reward": 0.3999999761581421, |
| "rewards/format_reward": 1.0, |
| "step": 82, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 229.08929443359375, |
| "epoch": 0.00586987270155587, |
| "grad_norm": 2.3086478791631824, |
| "kl": 0.0255126953125, |
| "learning_rate": 9.73675040690808e-07, |
| "loss": 0.001, |
| "reward": 1.3768136501312256, |
| "reward_std": 0.2634865641593933, |
| "rewards/accuracy_reward": 0.31967079639434814, |
| "rewards/format_reward": 1.0, |
| "step": 83, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 282.125, |
| "epoch": 0.005940594059405941, |
| "grad_norm": 2.016686773728144, |
| "kl": 0.0206298828125, |
| "learning_rate": 9.730426794137726e-07, |
| "loss": 0.0008, |
| "reward": 1.5875691175460815, |
| "reward_std": 0.17722778022289276, |
| "rewards/accuracy_reward": 0.5679263472557068, |
| "rewards/format_reward": 0.9910714626312256, |
| "step": 84, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 272.5535888671875, |
| "epoch": 0.006011315417256012, |
| "grad_norm": 2.4549952656895093, |
| "kl": 0.0245361328125, |
| "learning_rate": 9.72403023233439e-07, |
| "loss": 0.001, |
| "reward": 1.4239870309829712, |
| "reward_std": 0.1376960575580597, |
| "rewards/accuracy_reward": 0.35255834460258484, |
| "rewards/format_reward": 1.0, |
| "step": 85, |
| "temporal_rewards": 0.5714285373687744 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 334.625, |
| "epoch": 0.006082036775106082, |
| "grad_norm": 4.839946932590676, |
| "kl": 0.0185546875, |
| "learning_rate": 9.717560820140968e-07, |
| "loss": 0.0007, |
| "reward": 1.5320364236831665, |
| "reward_std": 0.18894881010055542, |
| "rewards/accuracy_reward": 0.4463222026824951, |
| "rewards/format_reward": 1.0, |
| "step": 86, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 302.9285888671875, |
| "epoch": 0.0061527581329561525, |
| "grad_norm": 14.12798359766534, |
| "kl": 0.025634765625, |
| "learning_rate": 9.711018657323798e-07, |
| "loss": 0.001, |
| "reward": 1.564679503440857, |
| "reward_std": 0.1213698536157608, |
| "rewards/accuracy_reward": 0.3789650797843933, |
| "rewards/format_reward": 1.0, |
| "step": 87, |
| "temporal_rewards": 0.714285671710968 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 369.3571472167969, |
| "epoch": 0.006223479490806223, |
| "grad_norm": 1.1787384215611914, |
| "kl": 0.01434326171875, |
| "learning_rate": 9.704403844771127e-07, |
| "loss": 0.0006, |
| "reward": 1.3432508707046509, |
| "reward_std": 0.24372150003910065, |
| "rewards/accuracy_reward": 0.3253936171531677, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 88, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.0, |
| "completion_length": 285.46429443359375, |
| "epoch": 0.006294200848656294, |
| "grad_norm": 2.1919194899374355, |
| "kl": 0.02587890625, |
| "learning_rate": 9.697716484491545e-07, |
| "loss": 0.001, |
| "reward": 1.250388503074646, |
| "reward_std": 0.09060105681419373, |
| "rewards/accuracy_reward": 0.21824556589126587, |
| "rewards/format_reward": 1.0, |
| "step": 89, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.0, |
| "completion_length": 256.39288330078125, |
| "epoch": 0.006364922206506365, |
| "grad_norm": 2.4092854894258298, |
| "kl": 0.0218505859375, |
| "learning_rate": 9.69095667961242e-07, |
| "loss": 0.0009, |
| "reward": 1.5603086948394775, |
| "reward_std": 0.2149655967950821, |
| "rewards/accuracy_reward": 0.53352290391922, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 90, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 353.64288330078125, |
| "epoch": 0.006435643564356435, |
| "grad_norm": 2.2816781220518316, |
| "kl": 0.0189208984375, |
| "learning_rate": 9.684124534378306e-07, |
| "loss": 0.0008, |
| "reward": 1.5044071674346924, |
| "reward_std": 0.3355371654033661, |
| "rewards/accuracy_reward": 0.4311927855014801, |
| "rewards/format_reward": 1.0, |
| "step": 91, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 383.0000305175781, |
| "epoch": 0.006506364922206506, |
| "grad_norm": 3.1153295514142254, |
| "kl": 0.0198974609375, |
| "learning_rate": 9.677220154149337e-07, |
| "loss": 0.0008, |
| "reward": 1.3627312183380127, |
| "reward_std": 0.22414493560791016, |
| "rewards/accuracy_reward": 0.30380263924598694, |
| "rewards/format_reward": 0.9553571939468384, |
| "step": 92, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 390.08929443359375, |
| "epoch": 0.006577086280056577, |
| "grad_norm": 2.2845054552496777, |
| "kl": 0.019775390625, |
| "learning_rate": 9.670243645399592e-07, |
| "loss": 0.0008, |
| "reward": 1.2227200269699097, |
| "reward_std": 0.11306477338075638, |
| "rewards/accuracy_reward": 0.27629145979881287, |
| "rewards/format_reward": 0.8571429252624512, |
| "step": 93, |
| "temporal_rewards": 0.4285714328289032 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 329.5357360839844, |
| "epoch": 0.006647807637906648, |
| "grad_norm": 4.302480420224573, |
| "kl": 0.0201416015625, |
| "learning_rate": 9.66319511571547e-07, |
| "loss": 0.0008, |
| "reward": 1.516809105873108, |
| "reward_std": 0.19412218034267426, |
| "rewards/accuracy_reward": 0.36680904030799866, |
| "rewards/format_reward": 1.0, |
| "step": 94, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.2857142857142857, |
| "completion_length": 381.8214416503906, |
| "epoch": 0.006718528995756719, |
| "grad_norm": 2.8590305141700845, |
| "kl": 0.01556396484375, |
| "learning_rate": 9.656074673794017e-07, |
| "loss": 0.0006, |
| "reward": 1.4072080850601196, |
| "reward_std": 0.26467329263687134, |
| "rewards/accuracy_reward": 0.32327938079833984, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 95, |
| "temporal_rewards": 0.6428571343421936 |
| }, |
| { |
| "all_correct": 0.0, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 380.4464416503906, |
| "epoch": 0.006789250353606789, |
| "grad_norm": 1.8422918377808415, |
| "kl": 0.017333984375, |
| "learning_rate": 9.648882429441256e-07, |
| "loss": 0.0007, |
| "reward": 1.3689515590667725, |
| "reward_std": 0.08133874088525772, |
| "rewards/accuracy_reward": 0.29752302169799805, |
| "rewards/format_reward": 1.0, |
| "step": 96, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 268.71429443359375, |
| "epoch": 0.00685997171145686, |
| "grad_norm": 1.9635116777359027, |
| "kl": 0.0234375, |
| "learning_rate": 9.641618493570494e-07, |
| "loss": 0.0009, |
| "reward": 1.5267155170440674, |
| "reward_std": 0.1548340767621994, |
| "rewards/accuracy_reward": 0.5052869319915771, |
| "rewards/format_reward": 1.0, |
| "step": 97, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.2857142857142857, |
| "all_wrong": 0.0, |
| "completion_length": 317.7857360839844, |
| "epoch": 0.006930693069306931, |
| "grad_norm": 3.1660820324790895, |
| "kl": 0.0223388671875, |
| "learning_rate": 9.634282978200603e-07, |
| "loss": 0.0009, |
| "reward": 1.617742896080017, |
| "reward_std": 0.18577314913272858, |
| "rewards/accuracy_reward": 0.5284570455551147, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 98, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.42857142857142855, |
| "completion_length": 383.9821472167969, |
| "epoch": 0.007001414427157002, |
| "grad_norm": 1.737409775959772, |
| "kl": 0.0186767578125, |
| "learning_rate": 9.62687599645431e-07, |
| "loss": 0.0007, |
| "reward": 1.293892741203308, |
| "reward_std": 0.15461252629756927, |
| "rewards/accuracy_reward": 0.3153212070465088, |
| "rewards/format_reward": 0.9464285969734192, |
| "step": 99, |
| "temporal_rewards": 0.5 |
| }, |
| { |
| "all_correct": 0.14285714285714285, |
| "all_wrong": 0.14285714285714285, |
| "completion_length": 371.6964416503906, |
| "epoch": 0.007072135785007072, |
| "grad_norm": 2.208121527474034, |
| "kl": 0.0196533203125, |
| "learning_rate": 9.619397662556433e-07, |
| "loss": 0.0008, |
| "reward": 1.5583664178848267, |
| "reward_std": 0.17507988214492798, |
| "rewards/accuracy_reward": 0.47622358798980713, |
| "rewards/format_reward": 0.9821429252624512, |
| "step": 100, |
| "temporal_rewards": 0.5714285373687744 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|