{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9512485136741974, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 414.0390625, "epoch": 0.009512485136741973, "grad_norm": 18.50773734319143, "kl": 3.324449062347412e-05, "learning_rate": 0.0, "loss": 0.0, "reward": 1.931640625, "reward_std": 0.7540743527933955, "rewards/accuracy_reward": 0.501953125, "rewards/format_reward": 0.720703125, "rewards/influence_reward": 0.318359375, "rewards/len_reward": 0.390625, "step": 1 }, { "completion_length": 406.5244140625, "epoch": 0.04756242568370987, "grad_norm": 12.240397137360413, "kl": 0.0038472674787044525, "learning_rate": 1.818181818181818e-07, "loss": 0.0002, "reward": 2.06787109375, "reward_std": 0.6699417636264116, "rewards/accuracy_reward": 0.552734375, "rewards/format_reward": 0.73681640625, "rewards/influence_reward": 0.35595703125, "rewards/len_reward": 0.42236328125, "step": 5 }, { "completion_length": 408.440625, "epoch": 0.09512485136741974, "grad_norm": 10.899643199929397, "kl": 0.051489830017089844, "learning_rate": 4.090909090909091e-07, "loss": 0.0021, "reward": 2.095703125, "reward_std": 0.6716910980641841, "rewards/accuracy_reward": 0.564453125, "rewards/format_reward": 0.756640625, "rewards/influence_reward": 0.368359375, "rewards/len_reward": 0.40625, "step": 10 }, { "completion_length": 402.093359375, "epoch": 0.1426872770511296, "grad_norm": 56.657034116967125, "kl": 0.9256591796875, "learning_rate": 6.363636363636363e-07, "loss": 0.037, "reward": 2.055859375, "reward_std": 0.6135061264038086, "rewards/accuracy_reward": 0.537109375, "rewards/format_reward": 0.77890625, "rewards/influence_reward": 0.355859375, "rewards/len_reward": 0.383984375, "step": 15 }, { "completion_length": 390.37109375, "epoch": 0.1902497027348395, "grad_norm": 4.269559046091982, "kl": 2.8193359375, "learning_rate": 8.636363636363636e-07, "loss": 0.1128, "reward": 2.1671875, "reward_std": 0.5897074935957789, "rewards/accuracy_reward": 0.56015625, "rewards/format_reward": 0.815234375, "rewards/influence_reward": 0.384765625, "rewards/len_reward": 0.40703125, "step": 20 }, { "completion_length": 355.8265625, "epoch": 0.23781212841854935, "grad_norm": 5.36647948775656, "kl": 2.3447265625, "learning_rate": 9.99726628670463e-07, "loss": 0.0938, "reward": 2.289453125, "reward_std": 0.5657233998179436, "rewards/accuracy_reward": 0.54140625, "rewards/format_reward": 0.87578125, "rewards/influence_reward": 0.405859375, "rewards/len_reward": 0.46640625, "step": 25 }, { "completion_length": 312.60859375, "epoch": 0.2853745541022592, "grad_norm": 93.06936011551063, "kl": 3.04345703125, "learning_rate": 9.966546331768192e-07, "loss": 0.1218, "reward": 2.478125, "reward_std": 0.5446455283090472, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.953515625, "rewards/influence_reward": 0.451171875, "rewards/len_reward": 0.5109375, "step": 30 }, { "completion_length": 299.95390625, "epoch": 0.3329369797859691, "grad_norm": 4.81826375681457, "kl": 2.7115234375, "learning_rate": 9.901899829374047e-07, "loss": 0.1085, "reward": 2.5625, "reward_std": 0.5693813040852547, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.962109375, "rewards/influence_reward": 0.44296875, "rewards/len_reward": 0.610546875, "step": 35 }, { "completion_length": 276.19296875, "epoch": 0.380499405469679, "grad_norm": 2.854100596289783, "kl": 2.379248046875, "learning_rate": 9.803768380684242e-07, "loss": 0.0952, "reward": 2.51484375, "reward_std": 0.5233275255188345, "rewards/accuracy_reward": 0.50859375, "rewards/format_reward": 0.9640625, "rewards/influence_reward": 0.4140625, "rewards/len_reward": 0.628125, "step": 40 }, { "completion_length": 281.1796875, "epoch": 0.4280618311533888, "grad_norm": 3.461375360266424, "kl": 2.206005859375, "learning_rate": 9.672822322997304e-07, "loss": 0.0882, "reward": 2.471875, "reward_std": 0.5379180932417512, "rewards/accuracy_reward": 0.49296875, "rewards/format_reward": 0.944140625, "rewards/influence_reward": 0.3921875, "rewards/len_reward": 0.642578125, "step": 45 }, { "completion_length": 297.3796875, "epoch": 0.4756242568370987, "grad_norm": 3.3937919439490454, "kl": 2.230615234375, "learning_rate": 9.509956150664795e-07, "loss": 0.0892, "reward": 2.546875, "reward_std": 0.5505968105047941, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.923046875, "rewards/influence_reward": 0.416015625, "rewards/len_reward": 0.6765625, "step": 50 }, { "completion_length": 300.020703125, "epoch": 0.5231866825208086, "grad_norm": 4.255030544730029, "kl": 2.759912109375, "learning_rate": 9.316282404787869e-07, "loss": 0.1104, "reward": 2.500390625, "reward_std": 0.5426989603787661, "rewards/accuracy_reward": 0.522265625, "rewards/format_reward": 0.933984375, "rewards/influence_reward": 0.414453125, "rewards/len_reward": 0.6296875, "step": 55 }, { "completion_length": 309.028515625, "epoch": 0.5707491082045184, "grad_norm": 2.734847983664629, "kl": 2.937158203125, "learning_rate": 9.093124073433462e-07, "loss": 0.1175, "reward": 2.381640625, "reward_std": 0.5930636901408434, "rewards/accuracy_reward": 0.46953125, "rewards/format_reward": 0.93359375, "rewards/influence_reward": 0.3640625, "rewards/len_reward": 0.614453125, "step": 60 }, { "completion_length": 305.84296875, "epoch": 0.6183115338882283, "grad_norm": 4.019506444773187, "kl": 3.4853515625, "learning_rate": 8.842005554284295e-07, "loss": 0.1394, "reward": 2.45859375, "reward_std": 0.560142171010375, "rewards/accuracy_reward": 0.49609375, "rewards/format_reward": 0.93671875, "rewards/influence_reward": 0.396484375, "rewards/len_reward": 0.629296875, "step": 65 }, { "completion_length": 305.07265625, "epoch": 0.6658739595719382, "grad_norm": 4.116522469679006, "kl": 3.28466796875, "learning_rate": 8.564642241456986e-07, "loss": 0.1314, "reward": 2.435546875, "reward_std": 0.5443418994545937, "rewards/accuracy_reward": 0.48515625, "rewards/format_reward": 0.940625, "rewards/influence_reward": 0.383203125, "rewards/len_reward": 0.6265625, "step": 70 }, { "completion_length": 298.075390625, "epoch": 0.713436385255648, "grad_norm": 2.9394867850708772, "kl": 3.50244140625, "learning_rate": 8.262928807620843e-07, "loss": 0.1401, "reward": 2.416796875, "reward_std": 0.5376573745161295, "rewards/accuracy_reward": 0.48515625, "rewards/format_reward": 0.950390625, "rewards/influence_reward": 0.37890625, "rewards/len_reward": 0.60234375, "step": 75 }, { "completion_length": 299.733984375, "epoch": 0.760998810939358, "grad_norm": 3.301846350005546, "kl": 3.5205078125, "learning_rate": 7.938926261462365e-07, "loss": 0.1408, "reward": 2.404296875, "reward_std": 0.5362825602293014, "rewards/accuracy_reward": 0.4703125, "rewards/format_reward": 0.94765625, "rewards/influence_reward": 0.3671875, "rewards/len_reward": 0.619140625, "step": 80 }, { "completion_length": 308.55546875, "epoch": 0.8085612366230678, "grad_norm": 4.193561063299626, "kl": 3.41328125, "learning_rate": 7.594847868906076e-07, "loss": 0.1365, "reward": 2.408203125, "reward_std": 0.535981552861631, "rewards/accuracy_reward": 0.46796875, "rewards/format_reward": 0.945703125, "rewards/influence_reward": 0.365625, "rewards/len_reward": 0.62890625, "step": 85 }, { "completion_length": 305.84140625, "epoch": 0.8561236623067776, "grad_norm": 20.941270697353453, "kl": 3.77119140625, "learning_rate": 7.233044034264033e-07, "loss": 0.1509, "reward": 2.408203125, "reward_std": 0.5046488767489791, "rewards/accuracy_reward": 0.475, "rewards/format_reward": 0.957421875, "rewards/influence_reward": 0.37890625, "rewards/len_reward": 0.596875, "step": 90 }, { "completion_length": 318.782421875, "epoch": 0.9036860879904876, "grad_norm": 5.799431376928617, "kl": 3.83603515625, "learning_rate": 6.855986244591103e-07, "loss": 0.1534, "reward": 2.423046875, "reward_std": 0.5394395122304558, "rewards/accuracy_reward": 0.503515625, "rewards/format_reward": 0.943359375, "rewards/influence_reward": 0.3859375, "rewards/len_reward": 0.590234375, "step": 95 }, { "completion_length": 318.328515625, "epoch": 0.9512485136741974, "grad_norm": 304.60647459145224, "kl": 4.10732421875, "learning_rate": 6.466250186922324e-07, "loss": 0.1643, "reward": 2.353125, "reward_std": 0.5590785862877965, "rewards/accuracy_reward": 0.461328125, "rewards/format_reward": 0.94296875, "rewards/influence_reward": 0.359765625, "rewards/len_reward": 0.5890625, "step": 100 } ], "logging_steps": 5, "max_steps": 212, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }