{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 361.75, "epoch": 0.004, "grad_norm": 0.0693558007478714, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.02345000021159649, "reward_std": 0.04690000042319298, "rewards/pot_combined_reward": 0.02345000021159649, "step": 1 }, { "completion_length": 371.375, "epoch": 0.008, "grad_norm": 0.08951307833194733, "kl": 0.0, "learning_rate": 5.000000000000001e-07, "loss": -0.0, "reward": 0.026133334264159203, "reward_std": 0.052266668528318405, "rewards/pot_combined_reward": 0.026133334264159203, "step": 2 }, { "completion_length": 374.0, "epoch": 0.012, "grad_norm": 0.001663331058807671, "kl": 0.0005528016190510243, "learning_rate": 1.0000000000000002e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 3 }, { "completion_length": 374.0, "epoch": 0.016, "grad_norm": 0.0016158577054738998, "kl": 0.0005013404006604105, "learning_rate": 1.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 4 }, { "completion_length": 373.375, "epoch": 0.02, "grad_norm": 0.0030017346143722534, "kl": 0.0005660907772835344, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 5 }, { "completion_length": 373.5625, "epoch": 0.024, "grad_norm": 0.0015043334569782019, "kl": 0.0005426810312201269, "learning_rate": 2.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 6 }, { "completion_length": 364.5, "epoch": 0.028, "grad_norm": 0.08007726073265076, "kl": 0.0005026786457165144, "learning_rate": 3e-06, "loss": 0.0001, "reward": 0.03146666660904884, "reward_std": 0.06293333321809769, "rewards/pot_combined_reward": 0.03146666660904884, "step": 7 }, { "completion_length": 374.0, "epoch": 0.032, "grad_norm": 0.0016093035228550434, "kl": 0.0005015511706005782, "learning_rate": 3.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 8 }, { "completion_length": 353.0625, "epoch": 0.036, "grad_norm": 0.001784446300007403, "kl": 0.0003549655375536531, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 9 }, { "completion_length": 374.0, "epoch": 0.04, "grad_norm": 0.0020515809301286936, "kl": 0.0005762250584666617, "learning_rate": 4.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 10 }, { "completion_length": 374.0, "epoch": 0.044, "grad_norm": 0.06735244393348694, "kl": 0.0005008808220736682, "learning_rate": 5e-06, "loss": 0.0001, "reward": 0.01808333396911621, "reward_std": 0.03616666793823242, "rewards/pot_combined_reward": 0.01808333396911621, "step": 11 }, { "completion_length": 342.5, "epoch": 0.048, "grad_norm": 0.0024033007211983204, "kl": 0.00046441886661341414, "learning_rate": 4.99847706754774e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 12 }, { "completion_length": 374.0, "epoch": 0.052, "grad_norm": 0.0014815045287832618, "kl": 0.0004604290661518462, "learning_rate": 4.993910125649561e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 13 }, { "completion_length": 374.0, "epoch": 0.056, "grad_norm": 0.001548771746456623, "kl": 0.0004989306180505082, "learning_rate": 4.986304738420684e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 14 }, { "completion_length": 360.8125, "epoch": 0.06, "grad_norm": 0.001822226564399898, "kl": 0.00046228048086049967, "learning_rate": 4.975670171853926e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 15 }, { "completion_length": 374.0, "epoch": 0.064, "grad_norm": 0.09072617441415787, "kl": 0.000476861278002616, "learning_rate": 4.962019382530521e-06, "loss": 0.0, "reward": 0.012600000016391277, "reward_std": 0.025200000032782555, "rewards/pot_combined_reward": 0.012600000016391277, "step": 16 }, { "completion_length": 374.0, "epoch": 0.068, "grad_norm": 0.0015437575057148933, "kl": 0.0005299622716847807, "learning_rate": 4.9453690018345144e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 17 }, { "completion_length": 374.0, "epoch": 0.072, "grad_norm": 0.0016528957057744265, "kl": 0.0005346549514797516, "learning_rate": 4.925739315689991e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 18 }, { "completion_length": 374.0, "epoch": 0.076, "grad_norm": 0.0017183530144393444, "kl": 0.0004841076224693097, "learning_rate": 4.903154239845798e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 19 }, { "completion_length": 373.5625, "epoch": 0.08, "grad_norm": 0.08476348221302032, "kl": 0.000539450986252632, "learning_rate": 4.8776412907378845e-06, "loss": 0.0001, "reward": 0.026249999180436134, "reward_std": 0.05249999836087227, "rewards/pot_combined_reward": 0.026249999180436134, "step": 20 }, { "completion_length": 374.0, "epoch": 0.084, "grad_norm": 0.07820821553468704, "kl": 0.0005957721295999363, "learning_rate": 4.849231551964771e-06, "loss": 0.0001, "reward": 0.07116249948740005, "reward_std": 0.1423249989748001, "rewards/pot_combined_reward": 0.07116249948740005, "step": 21 }, { "completion_length": 373.5625, "epoch": 0.088, "grad_norm": 0.0016008545644581318, "kl": 0.000533243379322812, "learning_rate": 4.817959636416969e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 22 }, { "completion_length": 374.0, "epoch": 0.092, "grad_norm": 0.0018651616992428899, "kl": 0.0005620143201667815, "learning_rate": 4.783863644106502e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 23 }, { "completion_length": 374.0, "epoch": 0.096, "grad_norm": 0.001895356923341751, "kl": 0.00047788477240828797, "learning_rate": 4.746985115747918e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 24 }, { "completion_length": 369.75, "epoch": 0.1, "grad_norm": 0.11337540298700333, "kl": 0.00047047801490407437, "learning_rate": 4.707368982147318e-06, "loss": 0.0, "reward": 0.04736666567623615, "reward_std": 0.0947333313524723, "rewards/pot_combined_reward": 0.04736666567623615, "step": 25 }, { "completion_length": 368.625, "epoch": 0.104, "grad_norm": 0.01079186424612999, "kl": 0.0006513141634059139, "learning_rate": 4.665063509461098e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 26 }, { "completion_length": 374.0, "epoch": 0.108, "grad_norm": 0.0016566955018788576, "kl": 0.0005965056043351069, "learning_rate": 4.620120240391065e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 27 }, { "completion_length": 374.0, "epoch": 0.112, "grad_norm": 0.001792517607100308, "kl": 0.0005393773099058308, "learning_rate": 4.572593931387604e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 28 }, { "completion_length": 374.0, "epoch": 0.116, "grad_norm": 0.0017438618233427405, "kl": 0.0005095232809253503, "learning_rate": 4.522542485937369e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 29 }, { "completion_length": 374.0, "epoch": 0.12, "grad_norm": 0.0015684061218053102, "kl": 0.00047055614413693547, "learning_rate": 4.470026884016805e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 30 }, { "completion_length": 374.0, "epoch": 0.124, "grad_norm": 0.0019608919974416494, "kl": 0.0005961552087683231, "learning_rate": 4.415111107797445e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 31 }, { "completion_length": 372.0625, "epoch": 0.128, "grad_norm": 0.0015237935585901141, "kl": 0.0005326158570824191, "learning_rate": 4.357862063693486e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 32 }, { "completion_length": 374.0, "epoch": 0.132, "grad_norm": 0.002203061943873763, "kl": 0.0006071907628211193, "learning_rate": 4.2983495008466285e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 33 }, { "completion_length": 370.5, "epoch": 0.136, "grad_norm": 0.10126212984323502, "kl": 0.0006130525580374524, "learning_rate": 4.236645926147493e-06, "loss": 0.0001, "reward": 0.061249999329447746, "reward_std": 0.12249999865889549, "rewards/pot_combined_reward": 0.061249999329447746, "step": 34 }, { "completion_length": 374.0, "epoch": 0.14, "grad_norm": 0.08621055632829666, "kl": 0.0004927485424559563, "learning_rate": 4.172826515897146e-06, "loss": 0.0, "reward": 0.024966666474938393, "reward_std": 0.049933332949876785, "rewards/pot_combined_reward": 0.024966666474938393, "step": 35 }, { "completion_length": 374.0, "epoch": 0.144, "grad_norm": 0.0017127083847299218, "kl": 0.0005035524372942746, "learning_rate": 4.106969024216348e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 36 }, { "completion_length": 374.0, "epoch": 0.148, "grad_norm": 0.0018064226023852825, "kl": 0.0005530964990612119, "learning_rate": 4.039153688314146e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 37 }, { "completion_length": 373.8125, "epoch": 0.152, "grad_norm": 0.0019408023217692971, "kl": 0.0006417437689378858, "learning_rate": 3.969463130731183e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 38 }, { "completion_length": 374.0, "epoch": 0.156, "grad_norm": 0.08421237021684647, "kl": 0.0006241849769139662, "learning_rate": 3.897982258676867e-06, "loss": 0.0001, "reward": 0.07303333282470703, "reward_std": 0.057444244623184204, "rewards/pot_combined_reward": 0.07303333282470703, "step": 39 }, { "completion_length": 374.0, "epoch": 0.16, "grad_norm": 0.08112610131502151, "kl": 0.0005679467285517603, "learning_rate": 3.824798160583012e-06, "loss": 0.0001, "reward": 0.014233333058655262, "reward_std": 0.028466666117310524, "rewards/pot_combined_reward": 0.014233333058655262, "step": 40 }, { "completion_length": 374.0, "epoch": 0.164, "grad_norm": 0.001669050194323063, "kl": 0.0005617116403300315, "learning_rate": 3.7500000000000005e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 41 }, { "completion_length": 374.0, "epoch": 0.168, "grad_norm": 0.002355287317186594, "kl": 0.0006023006426403299, "learning_rate": 3.6736789069647273e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 42 }, { "completion_length": 328.8125, "epoch": 0.172, "grad_norm": 0.0023054229095578194, "kl": 0.0005200250307098031, "learning_rate": 3.595927866972694e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 43 }, { "completion_length": 374.0, "epoch": 0.176, "grad_norm": 0.11641528457403183, "kl": 0.0005873750924365595, "learning_rate": 3.516841607689501e-06, "loss": 0.0001, "reward": 0.07468749955296516, "reward_std": 0.14937499910593033, "rewards/pot_combined_reward": 0.07468749955296516, "step": 44 }, { "completion_length": 374.0, "epoch": 0.18, "grad_norm": 0.001737726735882461, "kl": 0.0005044558856752701, "learning_rate": 3.436516483539781e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 45 }, { "completion_length": 370.9375, "epoch": 0.184, "grad_norm": 0.0019563438836485147, "kl": 0.000592117925407365, "learning_rate": 3.3550503583141726e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 46 }, { "completion_length": 372.4375, "epoch": 0.188, "grad_norm": 0.0026125519070774317, "kl": 0.0005959889385849237, "learning_rate": 3.272542485937369e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 47 }, { "completion_length": 372.75, "epoch": 0.192, "grad_norm": 0.0820600688457489, "kl": 0.0005973696243017912, "learning_rate": 3.189093389542498e-06, "loss": 0.0001, "reward": 0.03968749940395355, "reward_std": 0.0793749988079071, "rewards/pot_combined_reward": 0.03968749940395355, "step": 48 }, { "completion_length": 374.0, "epoch": 0.196, "grad_norm": 0.06782057881355286, "kl": 0.0007923852826934308, "learning_rate": 3.1048047389991693e-06, "loss": 0.0001, "reward": 0.026249999180436134, "reward_std": 0.05249999836087227, "rewards/pot_combined_reward": 0.026249999180436134, "step": 49 }, { "completion_length": 374.0, "epoch": 0.2, "grad_norm": 0.10189752280712128, "kl": 0.0004997247888240963, "learning_rate": 3.019779227044398e-06, "loss": 0.0, "reward": 0.06999999843537807, "reward_std": 0.13999999687075615, "rewards/pot_combined_reward": 0.06999999843537807, "step": 50 }, { "completion_length": 374.0, "epoch": 0.204, "grad_norm": 0.004633176140487194, "kl": 0.0005932025596848689, "learning_rate": 2.9341204441673267e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 51 }, { "completion_length": 373.3125, "epoch": 0.208, "grad_norm": 0.0021235239692032337, "kl": 0.0006392140567186289, "learning_rate": 2.847932752400164e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 52 }, { "completion_length": 348.5, "epoch": 0.212, "grad_norm": 0.001950100064277649, "kl": 0.0006046331109246239, "learning_rate": 2.761321158169134e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 53 }, { "completion_length": 351.875, "epoch": 0.216, "grad_norm": 0.009500402957201004, "kl": 0.0005020819662604481, "learning_rate": 2.6743911843603134e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 54 }, { "completion_length": 374.0, "epoch": 0.22, "grad_norm": 0.001888828701339662, "kl": 0.0005405374467954971, "learning_rate": 2.587248741756253e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 55 }, { "completion_length": 371.125, "epoch": 0.224, "grad_norm": 0.10614389926195145, "kl": 0.0007851378031773493, "learning_rate": 2.5e-06, "loss": 0.0001, "reward": 0.18312916532158852, "reward_std": 0.27463946491479874, "rewards/pot_combined_reward": 0.18312916532158852, "step": 56 }, { "completion_length": 374.0, "epoch": 0.228, "grad_norm": 0.002482097828760743, "kl": 0.0006323368143057451, "learning_rate": 2.4127512582437486e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 57 }, { "completion_length": 374.0, "epoch": 0.232, "grad_norm": 0.0019110125722363591, "kl": 0.0005738867403124459, "learning_rate": 2.325608815639687e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 58 }, { "completion_length": 374.0, "epoch": 0.236, "grad_norm": 0.13201673328876495, "kl": 0.0008093079959508032, "learning_rate": 2.238678841830867e-06, "loss": 0.0001, "reward": 0.09951666742563248, "reward_std": 0.1495332196354866, "rewards/pot_combined_reward": 0.09951666742563248, "step": 59 }, { "completion_length": 373.75, "epoch": 0.24, "grad_norm": 0.10325703024864197, "kl": 0.0007794436533004045, "learning_rate": 2.1520672475998374e-06, "loss": 0.0001, "reward": 0.0652166660875082, "reward_std": 0.1304333321750164, "rewards/pot_combined_reward": 0.0652166660875082, "step": 60 }, { "completion_length": 374.0, "epoch": 0.244, "grad_norm": 0.07717566192150116, "kl": 0.0006568977987626567, "learning_rate": 2.0658795558326745e-06, "loss": 0.0001, "reward": 0.03968749940395355, "reward_std": 0.0793749988079071, "rewards/pot_combined_reward": 0.03968749940395355, "step": 61 }, { "completion_length": 373.0625, "epoch": 0.248, "grad_norm": 0.0719151571393013, "kl": 0.0006853110971860588, "learning_rate": 1.9802207729556023e-06, "loss": 0.0001, "reward": 0.026249999180436134, "reward_std": 0.05249999836087227, "rewards/pot_combined_reward": 0.026249999180436134, "step": 62 }, { "completion_length": 373.625, "epoch": 0.252, "grad_norm": 0.07498770952224731, "kl": 0.0007669057231396437, "learning_rate": 1.895195261000831e-06, "loss": 0.0001, "reward": 0.013883333653211594, "reward_std": 0.027766667306423187, "rewards/pot_combined_reward": 0.013883333653211594, "step": 63 }, { "completion_length": 372.375, "epoch": 0.256, "grad_norm": 0.0024490871001034975, "kl": 0.0007131954189389944, "learning_rate": 1.8109066104575023e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 64 }, { "completion_length": 374.0, "epoch": 0.26, "grad_norm": 0.09898767620325089, "kl": 0.0008127802575472742, "learning_rate": 1.7274575140626318e-06, "loss": 0.0001, "reward": 0.06544999964535236, "reward_std": 0.09778991341590881, "rewards/pot_combined_reward": 0.06544999964535236, "step": 65 }, { "completion_length": 374.0, "epoch": 0.264, "grad_norm": 0.08986019343137741, "kl": 0.000880417152075097, "learning_rate": 1.6449496416858285e-06, "loss": 0.0001, "reward": 0.04374999925494194, "reward_std": 0.08749999850988388, "rewards/pot_combined_reward": 0.04374999925494194, "step": 66 }, { "completion_length": 374.0, "epoch": 0.268, "grad_norm": 0.15933048725128174, "kl": 0.0012064649199601263, "learning_rate": 1.56348351646022e-06, "loss": 0.0001, "reward": 0.14419999904930592, "reward_std": 0.28839999809861183, "rewards/pot_combined_reward": 0.14419999904930592, "step": 67 }, { "completion_length": 366.875, "epoch": 0.272, "grad_norm": 0.0023835976608097553, "kl": 0.0007049251580610871, "learning_rate": 1.4831583923105e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 68 }, { "completion_length": 364.9375, "epoch": 0.276, "grad_norm": 0.07123439759016037, "kl": 0.0006189760897541419, "learning_rate": 1.4040721330273063e-06, "loss": 0.0001, "reward": 0.08187499642372131, "reward_std": 0.16374999284744263, "rewards/pot_combined_reward": 0.08187499642372131, "step": 69 }, { "completion_length": 371.6875, "epoch": 0.28, "grad_norm": 0.09466277807950974, "kl": 0.0009991034894483164, "learning_rate": 1.3263210930352737e-06, "loss": 0.0001, "reward": 0.1346083376556635, "reward_std": 0.2251959629356861, "rewards/pot_combined_reward": 0.1346083376556635, "step": 70 }, { "completion_length": 374.0, "epoch": 0.284, "grad_norm": 0.09791545569896698, "kl": 0.001197919569676742, "learning_rate": 1.2500000000000007e-06, "loss": 0.0001, "reward": 0.04223333299160004, "reward_std": 0.08446666970849037, "rewards/pot_combined_reward": 0.04223333299160004, "step": 71 }, { "completion_length": 373.5, "epoch": 0.288, "grad_norm": 0.09683706611394882, "kl": 0.001184802589705214, "learning_rate": 1.1752018394169882e-06, "loss": 0.0001, "reward": 0.05285000056028366, "reward_std": 0.10570000112056732, "rewards/pot_combined_reward": 0.05285000056028366, "step": 72 }, { "completion_length": 367.25, "epoch": 0.292, "grad_norm": 0.0033558327704668045, "kl": 0.0008135271054925397, "learning_rate": 1.1020177413231334e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 73 }, { "completion_length": 374.0, "epoch": 0.296, "grad_norm": 0.07025693356990814, "kl": 0.0007950128638185561, "learning_rate": 1.0305368692688175e-06, "loss": 0.0001, "reward": 0.04374999925494194, "reward_std": 0.08749999850988388, "rewards/pot_combined_reward": 0.04374999925494194, "step": 74 }, { "completion_length": 374.0, "epoch": 0.3, "grad_norm": 0.07400605827569962, "kl": 0.0009908084612106904, "learning_rate": 9.608463116858544e-07, "loss": 0.0001, "reward": 0.04339999705553055, "reward_std": 0.05020662397146225, "rewards/pot_combined_reward": 0.04339999705553055, "step": 75 }, { "completion_length": 374.0, "epoch": 0.304, "grad_norm": 0.09700194746255875, "kl": 0.0008199879812309518, "learning_rate": 8.930309757836517e-07, "loss": 0.0001, "reward": 0.061249999329447746, "reward_std": 0.12249999865889549, "rewards/pot_combined_reward": 0.061249999329447746, "step": 76 }, { "completion_length": 374.0, "epoch": 0.308, "grad_norm": 0.11501616984605789, "kl": 0.001101230038329959, "learning_rate": 8.271734841028553e-07, "loss": 0.0001, "reward": 0.12562499567866325, "reward_std": 0.2512499913573265, "rewards/pot_combined_reward": 0.12562499567866325, "step": 77 }, { "completion_length": 369.5, "epoch": 0.312, "grad_norm": 0.07562494277954102, "kl": 0.0013928793196100742, "learning_rate": 7.633540738525066e-07, "loss": 0.0001, "reward": 0.021816667169332504, "reward_std": 0.04363333433866501, "rewards/pot_combined_reward": 0.021816667169332504, "step": 78 }, { "completion_length": 374.0, "epoch": 0.316, "grad_norm": 0.0032410642597824335, "kl": 0.0008720912301214412, "learning_rate": 7.016504991533727e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 79 }, { "completion_length": 331.1875, "epoch": 0.32, "grad_norm": 0.004254704806953669, "kl": 0.0010208465537289158, "learning_rate": 6.421379363065142e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 80 }, { "completion_length": 373.75, "epoch": 0.324, "grad_norm": 0.002906290115788579, "kl": 0.0007839756435714662, "learning_rate": 5.848888922025553e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 81 }, { "completion_length": 374.0, "epoch": 0.328, "grad_norm": 0.13924577832221985, "kl": 0.001414620659488719, "learning_rate": 5.299731159831953e-07, "loss": 0.0001, "reward": 0.15825624577701092, "reward_std": 0.31651249155402184, "rewards/pot_combined_reward": 0.15825624577701092, "step": 82 }, { "completion_length": 374.0, "epoch": 0.332, "grad_norm": 0.14953076839447021, "kl": 0.0011789420095738024, "learning_rate": 4.774575140626317e-07, "loss": 0.0001, "reward": 0.1184374988079071, "reward_std": 0.2368749976158142, "rewards/pot_combined_reward": 0.1184374988079071, "step": 83 }, { "completion_length": 374.0, "epoch": 0.336, "grad_norm": 0.16725599765777588, "kl": 0.0009174157894449309, "learning_rate": 4.27406068612396e-07, "loss": 0.0001, "reward": 0.10500000044703484, "reward_std": 0.21000000089406967, "rewards/pot_combined_reward": 0.10500000044703484, "step": 84 }, { "completion_length": 370.6875, "epoch": 0.34, "grad_norm": 0.08203138411045074, "kl": 0.0009189353149849921, "learning_rate": 3.798797596089351e-07, "loss": 0.0001, "reward": 0.04374999925494194, "reward_std": 0.08749999850988388, "rewards/pot_combined_reward": 0.04374999925494194, "step": 85 }, { "completion_length": 371.5625, "epoch": 0.344, "grad_norm": 0.07274218648672104, "kl": 0.0008360931678907946, "learning_rate": 3.3493649053890325e-07, "loss": 0.0001, "reward": 0.08656249940395355, "reward_std": 0.1731249988079071, "rewards/pot_combined_reward": 0.08656249940395355, "step": 86 }, { "completion_length": 374.0, "epoch": 0.348, "grad_norm": 0.12336569279432297, "kl": 0.0012354136852081865, "learning_rate": 2.9263101785268253e-07, "loss": 0.0001, "reward": 0.09249166399240494, "reward_std": 0.18498332425951958, "rewards/pot_combined_reward": 0.09249166399240494, "step": 87 }, { "completion_length": 367.875, "epoch": 0.352, "grad_norm": 0.13889098167419434, "kl": 0.0017657574207987636, "learning_rate": 2.53014884252083e-07, "loss": 0.0002, "reward": 0.2926562614738941, "reward_std": 0.4413819834589958, "rewards/pot_combined_reward": 0.2926562614738941, "step": 88 }, { "completion_length": 374.0, "epoch": 0.356, "grad_norm": 0.06699243932962418, "kl": 0.001279524396522902, "learning_rate": 2.1613635589349756e-07, "loss": 0.0001, "reward": 0.08187499642372131, "reward_std": 0.16374999284744263, "rewards/pot_combined_reward": 0.08187499642372131, "step": 89 }, { "completion_length": 374.0, "epoch": 0.36, "grad_norm": 0.006427991669625044, "kl": 0.0013257948303362355, "learning_rate": 1.8204036358303173e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 90 }, { "completion_length": 370.9375, "epoch": 0.364, "grad_norm": 0.1104293167591095, "kl": 0.0018631000712048262, "learning_rate": 1.507684480352292e-07, "loss": 0.0002, "reward": 0.058799998834729195, "reward_std": 0.11759999766945839, "rewards/pot_combined_reward": 0.058799998834729195, "step": 91 }, { "completion_length": 361.9375, "epoch": 0.368, "grad_norm": 0.07179438322782516, "kl": 0.0009191952631226741, "learning_rate": 1.223587092621162e-07, "loss": 0.0001, "reward": 0.08343750238418579, "reward_std": 0.09881261736154556, "rewards/pot_combined_reward": 0.08343750238418579, "step": 92 }, { "completion_length": 374.0, "epoch": 0.372, "grad_norm": 0.06493149697780609, "kl": 0.0012784109567292035, "learning_rate": 9.684576015420277e-08, "loss": 0.0001, "reward": 0.11232500523328781, "reward_std": 0.17172931134700775, "rewards/pot_combined_reward": 0.11232500523328781, "step": 93 }, { "completion_length": 374.0, "epoch": 0.376, "grad_norm": 0.12131404131650925, "kl": 0.0012257406779099256, "learning_rate": 7.426068431000883e-08, "loss": 0.0001, "reward": 0.1303124949336052, "reward_std": 0.2606249898672104, "rewards/pot_combined_reward": 0.1303124949336052, "step": 94 }, { "completion_length": 368.0625, "epoch": 0.38, "grad_norm": 0.1013365238904953, "kl": 0.0013991060986882076, "learning_rate": 5.463099816548578e-08, "loss": 0.0001, "reward": 0.10476666688919067, "reward_std": 0.15946697443723679, "rewards/pot_combined_reward": 0.10476666688919067, "step": 95 }, { "completion_length": 374.0, "epoch": 0.384, "grad_norm": 0.06952964514493942, "kl": 0.0014077925588935614, "learning_rate": 3.798061746947995e-08, "loss": 0.0001, "reward": 0.11687499284744263, "reward_std": 0.1551528126001358, "rewards/pot_combined_reward": 0.11687499284744263, "step": 96 }, { "completion_length": 370.25, "epoch": 0.388, "grad_norm": 0.09735430032014847, "kl": 0.0009093381959246472, "learning_rate": 2.4329828146074096e-08, "loss": 0.0001, "reward": 0.026249999180436134, "reward_std": 0.05249999836087227, "rewards/pot_combined_reward": 0.026249999180436134, "step": 97 }, { "completion_length": 374.0, "epoch": 0.392, "grad_norm": 0.14300031960010529, "kl": 0.0015978575684130192, "learning_rate": 1.3695261579316776e-08, "loss": 0.0002, "reward": 0.23843749426305294, "reward_std": 0.33464543148875237, "rewards/pot_combined_reward": 0.23843749426305294, "step": 98 }, { "completion_length": 374.0, "epoch": 0.396, "grad_norm": 0.13475055992603302, "kl": 0.0017390053835697472, "learning_rate": 6.089874350439507e-09, "loss": 0.0002, "reward": 0.15921874903142452, "reward_std": 0.31843749806284904, "rewards/pot_combined_reward": 0.15921874903142452, "step": 99 }, { "completion_length": 374.0, "epoch": 0.4, "grad_norm": 0.07442972809076309, "kl": 0.0010832307452801615, "learning_rate": 1.5229324522605949e-09, "loss": 0.0001, "reward": 0.019833333790302277, "reward_std": 0.03966666758060455, "rewards/pot_combined_reward": 0.019833333790302277, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }