{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 361.75, "epoch": 0.004, "grad_norm": 0.0693558007478714, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.02345000021159649, "reward_std": 0.04690000042319298, "rewards/pot_combined_reward": 0.02345000021159649, "step": 1 }, { "completion_length": 371.375, "epoch": 0.008, "grad_norm": 0.08951307833194733, "kl": 0.0, "learning_rate": 5.000000000000001e-07, "loss": -0.0, "reward": 0.026133334264159203, "reward_std": 0.052266668528318405, "rewards/pot_combined_reward": 0.026133334264159203, "step": 2 }, { "completion_length": 374.0, "epoch": 0.012, "grad_norm": 0.001663331058807671, "kl": 0.0005528016190510243, "learning_rate": 1.0000000000000002e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 3 }, { "completion_length": 374.0, "epoch": 0.016, "grad_norm": 0.0016158577054738998, "kl": 0.0005013404006604105, "learning_rate": 1.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 4 }, { "completion_length": 373.375, "epoch": 0.02, "grad_norm": 0.0030017346143722534, "kl": 0.0005660907772835344, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 5 }, { "completion_length": 373.5625, "epoch": 0.024, "grad_norm": 0.0015043334569782019, "kl": 0.0005426810312201269, "learning_rate": 2.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 6 }, { "completion_length": 364.5, "epoch": 0.028, "grad_norm": 0.08007726073265076, "kl": 0.0005026786457165144, "learning_rate": 3e-06, "loss": 0.0001, "reward": 0.03146666660904884, "reward_std": 0.06293333321809769, "rewards/pot_combined_reward": 0.03146666660904884, "step": 7 }, { "completion_length": 374.0, "epoch": 0.032, "grad_norm": 0.0016093035228550434, "kl": 0.0005015511706005782, "learning_rate": 3.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 8 }, { "completion_length": 353.0625, "epoch": 0.036, "grad_norm": 0.001784446300007403, "kl": 0.0003549655375536531, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 9 }, { "completion_length": 374.0, "epoch": 0.04, "grad_norm": 0.0020515809301286936, "kl": 0.0005762250584666617, "learning_rate": 4.5e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 10 }, { "completion_length": 374.0, "epoch": 0.044, "grad_norm": 0.06735244393348694, "kl": 0.0005008808220736682, "learning_rate": 5e-06, "loss": 0.0001, "reward": 0.01808333396911621, "reward_std": 0.03616666793823242, "rewards/pot_combined_reward": 0.01808333396911621, "step": 11 }, { "completion_length": 342.5, "epoch": 0.048, "grad_norm": 0.0024033007211983204, "kl": 0.00046441886661341414, "learning_rate": 4.99847706754774e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 12 }, { "completion_length": 374.0, "epoch": 0.052, "grad_norm": 0.0014815045287832618, "kl": 0.0004604290661518462, "learning_rate": 4.993910125649561e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 13 }, { "completion_length": 374.0, "epoch": 0.056, "grad_norm": 0.001548771746456623, "kl": 0.0004989306180505082, "learning_rate": 4.986304738420684e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 14 }, { "completion_length": 360.8125, "epoch": 0.06, "grad_norm": 0.001822226564399898, "kl": 0.00046228048086049967, "learning_rate": 4.975670171853926e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 15 }, { "completion_length": 374.0, "epoch": 0.064, "grad_norm": 0.09072617441415787, "kl": 0.000476861278002616, "learning_rate": 4.962019382530521e-06, "loss": 0.0, "reward": 0.012600000016391277, "reward_std": 0.025200000032782555, "rewards/pot_combined_reward": 0.012600000016391277, "step": 16 }, { "completion_length": 374.0, "epoch": 0.068, "grad_norm": 0.0015437575057148933, "kl": 0.0005299622716847807, "learning_rate": 4.9453690018345144e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 17 }, { "completion_length": 374.0, "epoch": 0.072, "grad_norm": 0.0016528957057744265, "kl": 0.0005346549514797516, "learning_rate": 4.925739315689991e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 18 }, { "completion_length": 374.0, "epoch": 0.076, "grad_norm": 0.0017183530144393444, "kl": 0.0004841076224693097, "learning_rate": 4.903154239845798e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 19 }, { "completion_length": 373.5625, "epoch": 0.08, "grad_norm": 0.08476348221302032, "kl": 0.000539450986252632, "learning_rate": 4.8776412907378845e-06, "loss": 0.0001, "reward": 0.026249999180436134, "reward_std": 0.05249999836087227, "rewards/pot_combined_reward": 0.026249999180436134, "step": 20 }, { "completion_length": 374.0, "epoch": 0.084, "grad_norm": 0.07820821553468704, "kl": 0.0005957721295999363, "learning_rate": 4.849231551964771e-06, "loss": 0.0001, "reward": 0.07116249948740005, "reward_std": 0.1423249989748001, "rewards/pot_combined_reward": 0.07116249948740005, "step": 21 }, { "completion_length": 373.5625, "epoch": 0.088, "grad_norm": 0.0016008545644581318, "kl": 0.000533243379322812, "learning_rate": 4.817959636416969e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 22 }, { "completion_length": 374.0, "epoch": 0.092, "grad_norm": 0.0018651616992428899, "kl": 0.0005620143201667815, "learning_rate": 4.783863644106502e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 23 }, { "completion_length": 374.0, "epoch": 0.096, "grad_norm": 0.001895356923341751, "kl": 0.00047788477240828797, "learning_rate": 4.746985115747918e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 24 }, { "completion_length": 369.75, "epoch": 0.1, "grad_norm": 0.11337540298700333, "kl": 0.00047047801490407437, "learning_rate": 4.707368982147318e-06, "loss": 0.0, "reward": 0.04736666567623615, "reward_std": 0.0947333313524723, "rewards/pot_combined_reward": 0.04736666567623615, "step": 25 }, { "completion_length": 368.625, "epoch": 0.104, "grad_norm": 0.01079186424612999, "kl": 0.0006513141634059139, "learning_rate": 4.665063509461098e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 26 }, { "completion_length": 374.0, "epoch": 0.108, "grad_norm": 0.0016566955018788576, "kl": 0.0005965056043351069, "learning_rate": 4.620120240391065e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 27 }, { "completion_length": 374.0, "epoch": 0.112, "grad_norm": 0.001792517607100308, "kl": 0.0005393773099058308, "learning_rate": 4.572593931387604e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 28 }, { "completion_length": 374.0, "epoch": 0.116, "grad_norm": 0.0017438618233427405, "kl": 0.0005095232809253503, "learning_rate": 4.522542485937369e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 29 }, { "completion_length": 374.0, "epoch": 0.12, "grad_norm": 0.0015684061218053102, "kl": 0.00047055614413693547, "learning_rate": 4.470026884016805e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 30 }, { "completion_length": 374.0, "epoch": 0.124, "grad_norm": 0.0019608919974416494, "kl": 0.0005961552087683231, "learning_rate": 4.415111107797445e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 31 }, { "completion_length": 372.0625, "epoch": 0.128, "grad_norm": 0.0015237935585901141, "kl": 0.0005326158570824191, "learning_rate": 4.357862063693486e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 32 }, { "completion_length": 374.0, "epoch": 0.132, "grad_norm": 0.002203061943873763, "kl": 0.0006071907628211193, "learning_rate": 4.2983495008466285e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 33 }, { "completion_length": 370.5, "epoch": 0.136, "grad_norm": 0.10126212984323502, "kl": 0.0006130525580374524, "learning_rate": 4.236645926147493e-06, "loss": 0.0001, "reward": 0.061249999329447746, "reward_std": 0.12249999865889549, "rewards/pot_combined_reward": 0.061249999329447746, "step": 34 }, { "completion_length": 374.0, "epoch": 0.14, "grad_norm": 0.08621055632829666, "kl": 0.0004927485424559563, "learning_rate": 4.172826515897146e-06, "loss": 0.0, "reward": 0.024966666474938393, "reward_std": 0.049933332949876785, "rewards/pot_combined_reward": 0.024966666474938393, "step": 35 }, { "completion_length": 374.0, "epoch": 0.144, "grad_norm": 0.0017127083847299218, "kl": 0.0005035524372942746, "learning_rate": 4.106969024216348e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 36 }, { "completion_length": 374.0, "epoch": 0.148, "grad_norm": 0.0018064226023852825, "kl": 0.0005530964990612119, "learning_rate": 4.039153688314146e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 37 }, { "completion_length": 373.8125, "epoch": 0.152, "grad_norm": 0.0019408023217692971, "kl": 0.0006417437689378858, "learning_rate": 3.969463130731183e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 38 }, { "completion_length": 374.0, "epoch": 0.156, "grad_norm": 0.08421237021684647, "kl": 0.0006241849769139662, "learning_rate": 3.897982258676867e-06, "loss": 0.0001, "reward": 0.07303333282470703, "reward_std": 0.057444244623184204, "rewards/pot_combined_reward": 0.07303333282470703, "step": 39 }, { "completion_length": 374.0, "epoch": 0.16, "grad_norm": 0.08112610131502151, "kl": 0.0005679467285517603, "learning_rate": 3.824798160583012e-06, "loss": 0.0001, "reward": 0.014233333058655262, "reward_std": 0.028466666117310524, "rewards/pot_combined_reward": 0.014233333058655262, "step": 40 }, { "completion_length": 374.0, "epoch": 0.164, "grad_norm": 0.001669050194323063, "kl": 0.0005617116403300315, "learning_rate": 3.7500000000000005e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 41 }, { "completion_length": 374.0, "epoch": 0.168, "grad_norm": 0.002355287317186594, "kl": 0.0006023006426403299, "learning_rate": 3.6736789069647273e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 42 }, { "completion_length": 328.8125, "epoch": 0.172, "grad_norm": 0.0023054229095578194, "kl": 0.0005200250307098031, "learning_rate": 3.595927866972694e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 43 }, { "completion_length": 374.0, "epoch": 0.176, "grad_norm": 0.11641528457403183, "kl": 0.0005873750924365595, "learning_rate": 3.516841607689501e-06, "loss": 0.0001, "reward": 0.07468749955296516, "reward_std": 0.14937499910593033, "rewards/pot_combined_reward": 0.07468749955296516, "step": 44 }, { "completion_length": 374.0, "epoch": 0.18, "grad_norm": 0.001737726735882461, "kl": 0.0005044558856752701, "learning_rate": 3.436516483539781e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 45 }, { "completion_length": 370.9375, "epoch": 0.184, "grad_norm": 0.0019563438836485147, "kl": 0.000592117925407365, "learning_rate": 3.3550503583141726e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 46 }, { "completion_length": 372.4375, "epoch": 0.188, "grad_norm": 0.0026125519070774317, "kl": 0.0005959889385849237, "learning_rate": 3.272542485937369e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/pot_combined_reward": 0.0, "step": 47 }, { "completion_length": 372.75, "epoch": 0.192, "grad_norm": 0.0820600688457489, "kl": 0.0005973696243017912, "learning_rate": 3.189093389542498e-06, "loss": 0.0001, "reward": 0.03968749940395355, "reward_std": 0.0793749988079071, "rewards/pot_combined_reward": 0.03968749940395355, "step": 48 }, { "completion_length": 374.0, "epoch": 0.196, "grad_norm": 0.06782057881355286, "kl": 0.0007923852826934308, "learning_rate": 3.1048047389991693e-06, "loss": 0.0001, "reward": 0.026249999180436134, "reward_std": 0.05249999836087227, "rewards/pot_combined_reward": 0.026249999180436134, "step": 49 }, { "completion_length": 374.0, "epoch": 0.2, "grad_norm": 0.10189752280712128, "kl": 0.0004997247888240963, "learning_rate": 3.019779227044398e-06, "loss": 0.0, "reward": 0.06999999843537807, "reward_std": 0.13999999687075615, "rewards/pot_combined_reward": 0.06999999843537807, "step": 50 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }