{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.12358882947118241, "eval_steps": 62, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030897207367795603, "grad_norm": 4588.7900390625, "learning_rate": 4.0000000000000004e-11, "loss": 11.1378, "step": 1 }, { "epoch": 0.006179441473559121, "grad_norm": 4484.97314453125, "learning_rate": 8.000000000000001e-11, "loss": 11.1465, "step": 2 }, { "epoch": 0.009269162210338681, "grad_norm": 4465.8408203125, "learning_rate": 1.2e-10, "loss": 11.0439, "step": 3 }, { "epoch": 0.012358882947118241, "grad_norm": 4807.01171875, "learning_rate": 1.6000000000000002e-10, "loss": 11.3464, "step": 4 }, { "epoch": 0.015448603683897801, "grad_norm": 4590.3369140625, "learning_rate": 2e-10, "loss": 11.0523, "step": 5 }, { "epoch": 0.018538324420677363, "grad_norm": 4261.99658203125, "learning_rate": 1.9984815164333162e-10, "loss": 10.9142, "step": 6 }, { "epoch": 0.02162804515745692, "grad_norm": 4459.41455078125, "learning_rate": 1.9939306773179497e-10, "loss": 11.0083, "step": 7 }, { "epoch": 0.024717765894236483, "grad_norm": 4372.3876953125, "learning_rate": 1.9863613034027225e-10, "loss": 10.9957, "step": 8 }, { "epoch": 0.027807486631016044, "grad_norm": 4308.26953125, "learning_rate": 1.9757963826274357e-10, "loss": 10.9736, "step": 9 }, { "epoch": 0.030897207367795602, "grad_norm": 3821.595458984375, "learning_rate": 1.9622680003092504e-10, "loss": 10.4572, "step": 10 }, { "epoch": 0.03398692810457516, "grad_norm": 4184.8388671875, "learning_rate": 1.9458172417006345e-10, "loss": 10.7733, "step": 11 }, { "epoch": 0.037076648841354726, "grad_norm": 4909.9609375, "learning_rate": 1.9264940672148017e-10, "loss": 11.6039, "step": 12 }, { "epoch": 0.040166369578134284, "grad_norm": 4097.865234375, "learning_rate": 1.9043571606975777e-10, "loss": 10.7497, "step": 13 }, { "epoch": 0.04325609031491384, "grad_norm": 4388.50244140625, "learning_rate": 1.879473751206489e-10, "loss": 10.8997, "step": 14 }, { "epoch": 0.04634581105169341, "grad_norm": 4527.54150390625, "learning_rate": 1.851919408838327e-10, "loss": 11.0818, "step": 15 }, { "epoch": 0.049435531788472965, "grad_norm": 4139.16552734375, "learning_rate": 1.821777815225245e-10, "loss": 10.6874, "step": 16 }, { "epoch": 0.052525252525252523, "grad_norm": 4184.986328125, "learning_rate": 1.7891405093963936e-10, "loss": 10.6914, "step": 17 }, { "epoch": 0.05561497326203209, "grad_norm": 4698.89599609375, "learning_rate": 1.7541066097768963e-10, "loss": 11.3234, "step": 18 }, { "epoch": 0.05870469399881165, "grad_norm": 4441.7099609375, "learning_rate": 1.7167825131684515e-10, "loss": 11.0614, "step": 19 }, { "epoch": 0.061794414735591205, "grad_norm": 4341.03955078125, "learning_rate": 1.6772815716257412e-10, "loss": 10.9571, "step": 20 }, { "epoch": 0.06488413547237076, "grad_norm": 4568.626953125, "learning_rate": 1.6357237482099684e-10, "loss": 11.1655, "step": 21 }, { "epoch": 0.06797385620915032, "grad_norm": 4392.01953125, "learning_rate": 1.59223525266498e-10, "loss": 11.0342, "step": 22 }, { "epoch": 0.0710635769459299, "grad_norm": 4315.537109375, "learning_rate": 1.5469481581224272e-10, "loss": 11.0743, "step": 23 }, { "epoch": 0.07415329768270945, "grad_norm": 4450.1845703125, "learning_rate": 1.5e-10, "loss": 10.9424, "step": 24 }, { "epoch": 0.07724301841948901, "grad_norm": 4252.31982421875, "learning_rate": 1.4515333583108894e-10, "loss": 10.8202, "step": 25 }, { "epoch": 0.08033273915626857, "grad_norm": 4481.5, "learning_rate": 1.4016954246529695e-10, "loss": 11.0462, "step": 26 }, { "epoch": 0.08342245989304813, "grad_norm": 4287.03515625, "learning_rate": 1.3506375551927545e-10, "loss": 11.0053, "step": 27 }, { "epoch": 0.08651218062982768, "grad_norm": 4513.53662109375, "learning_rate": 1.2985148110016947e-10, "loss": 11.1842, "step": 28 }, { "epoch": 0.08960190136660724, "grad_norm": 4208.63330078125, "learning_rate": 1.2454854871407994e-10, "loss": 10.9974, "step": 29 }, { "epoch": 0.09269162210338681, "grad_norm": 4587.828125, "learning_rate": 1.1917106319237384e-10, "loss": 11.3144, "step": 30 }, { "epoch": 0.09578134284016637, "grad_norm": 4346.50341796875, "learning_rate": 1.1373535578184083e-10, "loss": 11.0087, "step": 31 }, { "epoch": 0.09887106357694593, "grad_norm": 4239.73974609375, "learning_rate": 1.0825793454723326e-10, "loss": 10.8437, "step": 32 }, { "epoch": 0.10196078431372549, "grad_norm": 4778.1728515625, "learning_rate": 1.0275543423681622e-10, "loss": 11.3694, "step": 33 }, { "epoch": 0.10505050505050505, "grad_norm": 4481.9033203125, "learning_rate": 9.724456576318382e-11, "loss": 11.1087, "step": 34 }, { "epoch": 0.1081402257872846, "grad_norm": 4602.31005859375, "learning_rate": 9.174206545276678e-11, "loss": 11.3214, "step": 35 }, { "epoch": 0.11122994652406418, "grad_norm": 4783.39501953125, "learning_rate": 8.626464421815918e-11, "loss": 11.2677, "step": 36 }, { "epoch": 0.11431966726084374, "grad_norm": 4314.626953125, "learning_rate": 8.082893680762619e-11, "loss": 10.9266, "step": 37 }, { "epoch": 0.1174093879976233, "grad_norm": 4121.65478515625, "learning_rate": 7.54514512859201e-11, "loss": 10.7766, "step": 38 }, { "epoch": 0.12049910873440285, "grad_norm": 4387.51318359375, "learning_rate": 7.014851889983058e-11, "loss": 11.1114, "step": 39 }, { "epoch": 0.12358882947118241, "grad_norm": 4680.94140625, "learning_rate": 6.493624448072457e-11, "loss": 11.3017, "step": 40 } ], "logging_steps": 1, "max_steps": 62, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.637517228998656e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }