{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9955555555555555, "eval_steps": 100, "global_step": 562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.035555555555555556, "grad_norm": 11.767955780029297, "learning_rate": 0.00019679715302491104, "loss": 33.6554, "step": 10 }, { "epoch": 0.07111111111111111, "grad_norm": 9.438668251037598, "learning_rate": 0.0001932384341637011, "loss": 33.8787, "step": 20 }, { "epoch": 0.10666666666666667, "grad_norm": 10.174617767333984, "learning_rate": 0.00018967971530249112, "loss": 33.8919, "step": 30 }, { "epoch": 0.14222222222222222, "grad_norm": 8.84274673461914, "learning_rate": 0.00018612099644128114, "loss": 33.7011, "step": 40 }, { "epoch": 0.17777777777777778, "grad_norm": 10.169342041015625, "learning_rate": 0.0001825622775800712, "loss": 33.6306, "step": 50 }, { "epoch": 0.21333333333333335, "grad_norm": 9.339362144470215, "learning_rate": 0.0001790035587188612, "loss": 33.5378, "step": 60 }, { "epoch": 0.24888888888888888, "grad_norm": 10.399051666259766, "learning_rate": 0.00017544483985765125, "loss": 33.1223, "step": 70 }, { "epoch": 0.28444444444444444, "grad_norm": 8.772202491760254, "learning_rate": 0.00017188612099644127, "loss": 34.3864, "step": 80 }, { "epoch": 0.32, "grad_norm": 9.338233947753906, "learning_rate": 0.00016832740213523133, "loss": 33.2955, "step": 90 }, { "epoch": 0.35555555555555557, "grad_norm": 9.439739227294922, "learning_rate": 0.00016476868327402135, "loss": 33.229, "step": 100 }, { "epoch": 0.35555555555555557, "eval_loss": 2.133469820022583, "eval_runtime": 296.1668, "eval_samples_per_second": 3.376, "eval_steps_per_second": 0.422, "step": 100 }, { "epoch": 0.39111111111111113, "grad_norm": 9.046673774719238, "learning_rate": 0.0001612099644128114, "loss": 33.3667, "step": 110 }, { "epoch": 0.4266666666666667, "grad_norm": 8.99227237701416, "learning_rate": 0.00015765124555160143, "loss": 32.6701, "step": 120 }, { "epoch": 0.4622222222222222, "grad_norm": 7.6904144287109375, "learning_rate": 0.00015409252669039148, "loss": 33.2927, "step": 130 }, { "epoch": 0.49777777777777776, "grad_norm": 8.012206077575684, "learning_rate": 0.00015053380782918148, "loss": 33.2934, "step": 140 }, { "epoch": 0.5333333333333333, "grad_norm": 10.931622505187988, "learning_rate": 0.00014697508896797153, "loss": 33.3676, "step": 150 }, { "epoch": 0.5688888888888889, "grad_norm": 7.606035232543945, "learning_rate": 0.00014341637010676156, "loss": 34.1758, "step": 160 }, { "epoch": 0.6044444444444445, "grad_norm": 9.531214714050293, "learning_rate": 0.0001398576512455516, "loss": 33.0847, "step": 170 }, { "epoch": 0.64, "grad_norm": 8.761300086975098, "learning_rate": 0.00013629893238434164, "loss": 33.5206, "step": 180 }, { "epoch": 0.6755555555555556, "grad_norm": 9.155729293823242, "learning_rate": 0.0001327402135231317, "loss": 33.2403, "step": 190 }, { "epoch": 0.7111111111111111, "grad_norm": 9.354476928710938, "learning_rate": 0.00012918149466192172, "loss": 33.5548, "step": 200 }, { "epoch": 0.7111111111111111, "eval_loss": 2.126850128173828, "eval_runtime": 296.1679, "eval_samples_per_second": 3.376, "eval_steps_per_second": 0.422, "step": 200 }, { "epoch": 0.7466666666666667, "grad_norm": 8.922224998474121, "learning_rate": 0.00012562277580071177, "loss": 33.279, "step": 210 }, { "epoch": 0.7822222222222223, "grad_norm": 9.973633766174316, "learning_rate": 0.00012206405693950178, "loss": 33.5481, "step": 220 }, { "epoch": 0.8177777777777778, "grad_norm": 8.771803855895996, "learning_rate": 0.00011850533807829183, "loss": 33.1058, "step": 230 }, { "epoch": 0.8533333333333334, "grad_norm": 10.16543960571289, "learning_rate": 0.00011494661921708185, "loss": 33.3706, "step": 240 }, { "epoch": 0.8888888888888888, "grad_norm": 9.286821365356445, "learning_rate": 0.0001113879003558719, "loss": 33.3456, "step": 250 }, { "epoch": 0.9244444444444444, "grad_norm": 9.520956039428711, "learning_rate": 0.00010782918149466192, "loss": 33.5781, "step": 260 }, { "epoch": 0.96, "grad_norm": 10.376456260681152, "learning_rate": 0.00010427046263345198, "loss": 32.9687, "step": 270 }, { "epoch": 0.9955555555555555, "grad_norm": 8.36178207397461, "learning_rate": 0.00010071174377224199, "loss": 33.7239, "step": 280 }, { "epoch": 1.0284444444444445, "grad_norm": 10.113052368164062, "learning_rate": 9.715302491103203e-05, "loss": 29.9997, "step": 290 }, { "epoch": 1.064, "grad_norm": 11.123631477355957, "learning_rate": 9.359430604982207e-05, "loss": 32.5004, "step": 300 }, { "epoch": 1.064, "eval_loss": 2.122236490249634, "eval_runtime": 296.127, "eval_samples_per_second": 3.377, "eval_steps_per_second": 0.422, "step": 300 }, { "epoch": 1.0995555555555556, "grad_norm": 9.897551536560059, "learning_rate": 9.00355871886121e-05, "loss": 32.5046, "step": 310 }, { "epoch": 1.1351111111111112, "grad_norm": 9.53073501586914, "learning_rate": 8.647686832740213e-05, "loss": 32.2727, "step": 320 }, { "epoch": 1.1706666666666667, "grad_norm": 10.394311904907227, "learning_rate": 8.291814946619217e-05, "loss": 32.688, "step": 330 }, { "epoch": 1.2062222222222223, "grad_norm": 9.498970031738281, "learning_rate": 7.935943060498221e-05, "loss": 33.6316, "step": 340 }, { "epoch": 1.2417777777777779, "grad_norm": 10.150975227355957, "learning_rate": 7.580071174377225e-05, "loss": 33.0713, "step": 350 }, { "epoch": 1.2773333333333334, "grad_norm": 9.899177551269531, "learning_rate": 7.224199288256229e-05, "loss": 32.4769, "step": 360 }, { "epoch": 1.3128888888888888, "grad_norm": 9.39831829071045, "learning_rate": 6.868327402135231e-05, "loss": 32.2654, "step": 370 }, { "epoch": 1.3484444444444446, "grad_norm": 10.761151313781738, "learning_rate": 6.512455516014235e-05, "loss": 32.491, "step": 380 }, { "epoch": 1.384, "grad_norm": 9.932414054870605, "learning_rate": 6.156583629893239e-05, "loss": 33.5308, "step": 390 }, { "epoch": 1.4195555555555557, "grad_norm": 11.054327011108398, "learning_rate": 5.8007117437722425e-05, "loss": 31.7061, "step": 400 }, { "epoch": 1.4195555555555557, "eval_loss": 2.120673418045044, "eval_runtime": 296.1092, "eval_samples_per_second": 3.377, "eval_steps_per_second": 0.422, "step": 400 }, { "epoch": 1.455111111111111, "grad_norm": 10.89476203918457, "learning_rate": 5.4448398576512464e-05, "loss": 32.485, "step": 410 }, { "epoch": 1.4906666666666666, "grad_norm": 9.823376655578613, "learning_rate": 5.0889679715302496e-05, "loss": 32.9951, "step": 420 }, { "epoch": 1.5262222222222221, "grad_norm": 11.316079139709473, "learning_rate": 4.733096085409253e-05, "loss": 32.3443, "step": 430 }, { "epoch": 1.561777777777778, "grad_norm": 11.608524322509766, "learning_rate": 4.377224199288256e-05, "loss": 32.2948, "step": 440 }, { "epoch": 1.5973333333333333, "grad_norm": 11.020298957824707, "learning_rate": 4.02135231316726e-05, "loss": 32.6702, "step": 450 }, { "epoch": 1.6328888888888888, "grad_norm": 9.804555892944336, "learning_rate": 3.665480427046263e-05, "loss": 31.6452, "step": 460 }, { "epoch": 1.6684444444444444, "grad_norm": 11.037073135375977, "learning_rate": 3.309608540925267e-05, "loss": 32.479, "step": 470 }, { "epoch": 1.704, "grad_norm": 9.837021827697754, "learning_rate": 2.9537366548042704e-05, "loss": 32.72, "step": 480 }, { "epoch": 1.7395555555555555, "grad_norm": 11.720721244812012, "learning_rate": 2.597864768683274e-05, "loss": 32.6789, "step": 490 }, { "epoch": 1.775111111111111, "grad_norm": 11.738125801086426, "learning_rate": 2.2419928825622775e-05, "loss": 33.3128, "step": 500 }, { "epoch": 1.775111111111111, "eval_loss": 2.11881947517395, "eval_runtime": 296.1216, "eval_samples_per_second": 3.377, "eval_steps_per_second": 0.422, "step": 500 }, { "epoch": 1.8106666666666666, "grad_norm": 11.249613761901855, "learning_rate": 1.8861209964412814e-05, "loss": 31.9298, "step": 510 }, { "epoch": 1.8462222222222222, "grad_norm": 11.530637741088867, "learning_rate": 1.530249110320285e-05, "loss": 31.8878, "step": 520 }, { "epoch": 1.8817777777777778, "grad_norm": 11.147592544555664, "learning_rate": 1.1743772241992882e-05, "loss": 32.6852, "step": 530 }, { "epoch": 1.9173333333333333, "grad_norm": 9.81916332244873, "learning_rate": 8.185053380782918e-06, "loss": 32.1578, "step": 540 }, { "epoch": 1.952888888888889, "grad_norm": 10.557317733764648, "learning_rate": 4.626334519572954e-06, "loss": 32.2151, "step": 550 }, { "epoch": 1.9884444444444445, "grad_norm": 10.493524551391602, "learning_rate": 1.0676156583629894e-06, "loss": 31.9549, "step": 560 } ], "logging_steps": 10, "max_steps": 562, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.93400073703424e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }