{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 496, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004032258064516129, "grad_norm": 215.24710014620507, "learning_rate": 4.0000000000000003e-07, "loss": 1.6159, "step": 1 }, { "epoch": 0.020161290322580645, "grad_norm": 312.9268127676858, "learning_rate": 2.0000000000000003e-06, "loss": 1.4896, "step": 5 }, { "epoch": 0.04032258064516129, "grad_norm": 22.150019622688806, "learning_rate": 4.000000000000001e-06, "loss": 1.4524, "step": 10 }, { "epoch": 0.06048387096774194, "grad_norm": 6.1881183003709035, "learning_rate": 6e-06, "loss": 1.2634, "step": 15 }, { "epoch": 0.08064516129032258, "grad_norm": 4.300533001219337, "learning_rate": 8.000000000000001e-06, "loss": 1.1845, "step": 20 }, { "epoch": 0.10080645161290322, "grad_norm": 3.355604785739065, "learning_rate": 1e-05, "loss": 1.1828, "step": 25 }, { "epoch": 0.12096774193548387, "grad_norm": 3.1071264175265796, "learning_rate": 1.2e-05, "loss": 1.1377, "step": 30 }, { "epoch": 0.14112903225806453, "grad_norm": 3.739675462647032, "learning_rate": 1.4e-05, "loss": 1.1406, "step": 35 }, { "epoch": 0.16129032258064516, "grad_norm": 3.5497603101186597, "learning_rate": 1.6000000000000003e-05, "loss": 1.1189, "step": 40 }, { "epoch": 0.1814516129032258, "grad_norm": 3.459217793654087, "learning_rate": 1.8e-05, "loss": 1.1015, "step": 45 }, { "epoch": 0.20161290322580644, "grad_norm": 2.5061828560304673, "learning_rate": 2e-05, "loss": 1.1509, "step": 50 }, { "epoch": 0.2217741935483871, "grad_norm": 3.071258628815723, "learning_rate": 1.999379852284651e-05, "loss": 1.1428, "step": 55 }, { "epoch": 0.24193548387096775, "grad_norm": 2.1526518734405915, "learning_rate": 1.9975201783049804e-05, "loss": 1.0857, "step": 60 }, { "epoch": 0.2620967741935484, "grad_norm": 3.1257781855467806, "learning_rate": 1.9944232846061284e-05, "loss": 1.1007, "step": 65 }, { "epoch": 0.28225806451612906, "grad_norm": 2.650156490646313, "learning_rate": 1.9900930122511993e-05, "loss": 1.1763, "step": 70 }, { "epoch": 0.3024193548387097, "grad_norm": 2.222582022247095, "learning_rate": 1.984534732057208e-05, "loss": 1.0812, "step": 75 }, { "epoch": 0.3225806451612903, "grad_norm": 3.755219640137959, "learning_rate": 1.977755337933682e-05, "loss": 1.1358, "step": 80 }, { "epoch": 0.34274193548387094, "grad_norm": 2.9861330009858578, "learning_rate": 1.9697632383321755e-05, "loss": 1.0906, "step": 85 }, { "epoch": 0.3629032258064516, "grad_norm": 4.930667988082775, "learning_rate": 1.960568345817306e-05, "loss": 1.0903, "step": 90 }, { "epoch": 0.38306451612903225, "grad_norm": 2.2881396839609565, "learning_rate": 1.9501820647722458e-05, "loss": 1.1276, "step": 95 }, { "epoch": 0.4032258064516129, "grad_norm": 2.699315706389293, "learning_rate": 1.9386172772539162e-05, "loss": 1.0391, "step": 100 }, { "epoch": 0.4032258064516129, "eval_loss": 1.1010785102844238, "eval_runtime": 5.7903, "eval_samples_per_second": 68.563, "eval_steps_per_second": 2.245, "step": 100 }, { "epoch": 0.42338709677419356, "grad_norm": 2.3951607960351318, "learning_rate": 1.925888327015434e-05, "loss": 1.1294, "step": 105 }, { "epoch": 0.4435483870967742, "grad_norm": 2.590506218859771, "learning_rate": 1.9120110017156172e-05, "loss": 1.1039, "step": 110 }, { "epoch": 0.4637096774193548, "grad_norm": 2.3642899736042855, "learning_rate": 1.8970025133376252e-05, "loss": 1.0845, "step": 115 }, { "epoch": 0.4838709677419355, "grad_norm": 2.4194055228848623, "learning_rate": 1.8808814768410157e-05, "loss": 1.1614, "step": 120 }, { "epoch": 0.5040322580645161, "grad_norm": 2.3158075822843545, "learning_rate": 1.8636678870736928e-05, "loss": 1.0667, "step": 125 }, { "epoch": 0.5241935483870968, "grad_norm": 2.2181248045354165, "learning_rate": 1.8453830939723913e-05, "loss": 1.0932, "step": 130 }, { "epoch": 0.5443548387096774, "grad_norm": 2.4950408748993653, "learning_rate": 1.826049776082446e-05, "loss": 1.102, "step": 135 }, { "epoch": 0.5645161290322581, "grad_norm": 2.9826288289990774, "learning_rate": 1.8056919124296957e-05, "loss": 1.1153, "step": 140 }, { "epoch": 0.5846774193548387, "grad_norm": 4.789513209353997, "learning_rate": 1.784334752779408e-05, "loss": 1.0892, "step": 145 }, { "epoch": 0.6048387096774194, "grad_norm": 2.153748931520763, "learning_rate": 1.76200478631911e-05, "loss": 1.0512, "step": 150 }, { "epoch": 0.625, "grad_norm": 2.1449515128315584, "learning_rate": 1.7387297088041696e-05, "loss": 1.0708, "step": 155 }, { "epoch": 0.6451612903225806, "grad_norm": 2.568672641778878, "learning_rate": 1.714538388206878e-05, "loss": 1.1534, "step": 160 }, { "epoch": 0.6653225806451613, "grad_norm": 2.4066286973710818, "learning_rate": 1.6894608289116344e-05, "loss": 1.0782, "step": 165 }, { "epoch": 0.6854838709677419, "grad_norm": 2.469873074784075, "learning_rate": 1.663528134500646e-05, "loss": 1.1579, "step": 170 }, { "epoch": 0.7056451612903226, "grad_norm": 2.546868439730841, "learning_rate": 1.6367724691762967e-05, "loss": 1.0727, "step": 175 }, { "epoch": 0.7258064516129032, "grad_norm": 2.610421464248938, "learning_rate": 1.609227017868033e-05, "loss": 1.0766, "step": 180 }, { "epoch": 0.7459677419354839, "grad_norm": 2.4332563313436486, "learning_rate": 1.5809259450732495e-05, "loss": 1.1252, "step": 185 }, { "epoch": 0.7661290322580645, "grad_norm": 2.273728572874247, "learning_rate": 1.551904352483217e-05, "loss": 1.1049, "step": 190 }, { "epoch": 0.7862903225806451, "grad_norm": 1.9228388068167408, "learning_rate": 1.5221982354466172e-05, "loss": 1.0661, "step": 195 }, { "epoch": 0.8064516129032258, "grad_norm": 2.1988965883386573, "learning_rate": 1.4918444383246738e-05, "loss": 1.1256, "step": 200 }, { "epoch": 0.8064516129032258, "eval_loss": 1.0620791912078857, "eval_runtime": 5.8005, "eval_samples_per_second": 68.443, "eval_steps_per_second": 2.241, "step": 200 }, { "epoch": 0.8266129032258065, "grad_norm": 2.4001670804799273, "learning_rate": 1.460880608793262e-05, "loss": 1.069, "step": 205 }, { "epoch": 0.8467741935483871, "grad_norm": 2.361908189355991, "learning_rate": 1.4293451511486658e-05, "loss": 1.0566, "step": 210 }, { "epoch": 0.8669354838709677, "grad_norm": 2.1713973708275955, "learning_rate": 1.3972771786749074e-05, "loss": 1.0593, "step": 215 }, { "epoch": 0.8870967741935484, "grad_norm": 2.0277729000613403, "learning_rate": 1.3647164651317178e-05, "loss": 1.0812, "step": 220 }, { "epoch": 0.907258064516129, "grad_norm": 2.4350034774265326, "learning_rate": 1.3317033954233246e-05, "loss": 1.1498, "step": 225 }, { "epoch": 0.9274193548387096, "grad_norm": 2.359441580453533, "learning_rate": 1.2982789155092407e-05, "loss": 1.1063, "step": 230 }, { "epoch": 0.9475806451612904, "grad_norm": 2.337686204567925, "learning_rate": 1.264484481619177e-05, "loss": 1.0411, "step": 235 }, { "epoch": 0.967741935483871, "grad_norm": 2.1962971145230297, "learning_rate": 1.23036200883507e-05, "loss": 1.0567, "step": 240 }, { "epoch": 0.9879032258064516, "grad_norm": 2.8727159078532507, "learning_rate": 1.1959538191039986e-05, "loss": 1.09, "step": 245 }, { "epoch": 1.0080645161290323, "grad_norm": 3.2425805939567804, "learning_rate": 1.1613025887464642e-05, "loss": 0.8776, "step": 250 }, { "epoch": 1.028225806451613, "grad_norm": 3.0688033959726506, "learning_rate": 1.1264512955251479e-05, "loss": 0.5624, "step": 255 }, { "epoch": 1.0483870967741935, "grad_norm": 2.310668387893831, "learning_rate": 1.0914431653397856e-05, "loss": 0.5702, "step": 260 }, { "epoch": 1.0685483870967742, "grad_norm": 1.810288404740645, "learning_rate": 1.056321618614284e-05, "loss": 0.5516, "step": 265 }, { "epoch": 1.0887096774193548, "grad_norm": 2.029102262007128, "learning_rate": 1.0211302164425657e-05, "loss": 0.492, "step": 270 }, { "epoch": 1.1088709677419355, "grad_norm": 2.1849097454980977, "learning_rate": 9.859126065599435e-06, "loss": 0.5087, "step": 275 }, { "epoch": 1.129032258064516, "grad_norm": 2.0350424139394456, "learning_rate": 9.507124692070356e-06, "loss": 0.5514, "step": 280 }, { "epoch": 1.1491935483870968, "grad_norm": 2.1657967134210643, "learning_rate": 9.155734629533612e-06, "loss": 0.4923, "step": 285 }, { "epoch": 1.1693548387096775, "grad_norm": 2.3019047292629065, "learning_rate": 8.805391705478149e-06, "loss": 0.5185, "step": 290 }, { "epoch": 1.189516129032258, "grad_norm": 2.342813194456275, "learning_rate": 8.456530448631856e-06, "loss": 0.5622, "step": 295 }, { "epoch": 1.2096774193548387, "grad_norm": 2.4435792193657373, "learning_rate": 8.10958355001755e-06, "loss": 0.5037, "step": 300 }, { "epoch": 1.2096774193548387, "eval_loss": 1.0970051288604736, "eval_runtime": 5.7929, "eval_samples_per_second": 68.532, "eval_steps_per_second": 2.244, "step": 300 }, { "epoch": 1.2298387096774193, "grad_norm": 2.0093621809495943, "learning_rate": 7.764981326288273e-06, "loss": 0.5044, "step": 305 }, { "epoch": 1.25, "grad_norm": 2.217076831970701, "learning_rate": 7.423151186007527e-06, "loss": 0.4857, "step": 310 }, { "epoch": 1.2701612903225805, "grad_norm": 2.711127350014508, "learning_rate": 7.084517099536378e-06, "loss": 0.5131, "step": 315 }, { "epoch": 1.2903225806451613, "grad_norm": 2.2254535872260925, "learning_rate": 6.749499073184957e-06, "loss": 0.5338, "step": 320 }, { "epoch": 1.310483870967742, "grad_norm": 2.438095643634648, "learning_rate": 6.418512628280544e-06, "loss": 0.5052, "step": 325 }, { "epoch": 1.3306451612903225, "grad_norm": 2.1227128290496045, "learning_rate": 6.09196828579838e-06, "loss": 0.4932, "step": 330 }, { "epoch": 1.3508064516129032, "grad_norm": 2.157875906897965, "learning_rate": 5.7702710571943695e-06, "loss": 0.488, "step": 335 }, { "epoch": 1.370967741935484, "grad_norm": 2.1989233445083176, "learning_rate": 5.453819942071212e-06, "loss": 0.4638, "step": 340 }, { "epoch": 1.3911290322580645, "grad_norm": 1.7426003114674993, "learning_rate": 5.1430074333010346e-06, "loss": 0.5005, "step": 345 }, { "epoch": 1.4112903225806452, "grad_norm": 2.4552831246664453, "learning_rate": 4.838219030218274e-06, "loss": 0.4814, "step": 350 }, { "epoch": 1.4314516129032258, "grad_norm": 2.3809447566869113, "learning_rate": 4.5398327604866056e-06, "loss": 0.4956, "step": 355 }, { "epoch": 1.4516129032258065, "grad_norm": 1.9931477191927476, "learning_rate": 4.248218711232952e-06, "loss": 0.5095, "step": 360 }, { "epoch": 1.471774193548387, "grad_norm": 2.0758952034120104, "learning_rate": 3.963738570030135e-06, "loss": 0.5031, "step": 365 }, { "epoch": 1.4919354838709677, "grad_norm": 2.113141005154832, "learning_rate": 3.6867451762974117e-06, "loss": 0.4858, "step": 370 }, { "epoch": 1.5120967741935485, "grad_norm": 2.033150384960035, "learning_rate": 3.417582083675365e-06, "loss": 0.4838, "step": 375 }, { "epoch": 1.532258064516129, "grad_norm": 2.6578869400642984, "learning_rate": 3.1565831339178844e-06, "loss": 0.4981, "step": 380 }, { "epoch": 1.5524193548387095, "grad_norm": 2.4259980364912557, "learning_rate": 2.9040720428297754e-06, "loss": 0.4953, "step": 385 }, { "epoch": 1.5725806451612905, "grad_norm": 2.188171842523612, "learning_rate": 2.6603619987635087e-06, "loss": 0.4886, "step": 390 }, { "epoch": 1.592741935483871, "grad_norm": 2.4509085938481645, "learning_rate": 2.4257552741731593e-06, "loss": 0.4919, "step": 395 }, { "epoch": 1.6129032258064515, "grad_norm": 2.057354914567607, "learning_rate": 2.200542850707247e-06, "loss": 0.5057, "step": 400 }, { "epoch": 1.6129032258064515, "eval_loss": 1.0593500137329102, "eval_runtime": 5.7784, "eval_samples_per_second": 68.704, "eval_steps_per_second": 2.25, "step": 400 }, { "epoch": 1.6330645161290323, "grad_norm": 1.797397264265373, "learning_rate": 1.985004058305535e-06, "loss": 0.4598, "step": 405 }, { "epoch": 1.653225806451613, "grad_norm": 2.243456315367938, "learning_rate": 1.7794062287473734e-06, "loss": 0.5226, "step": 410 }, { "epoch": 1.6733870967741935, "grad_norm": 2.220033645347878, "learning_rate": 1.5840043640813274e-06, "loss": 0.5404, "step": 415 }, { "epoch": 1.6935483870967742, "grad_norm": 1.9945052495277809, "learning_rate": 1.3990408203472938e-06, "loss": 0.4725, "step": 420 }, { "epoch": 1.713709677419355, "grad_norm": 2.2666732532368217, "learning_rate": 1.2247450069834077e-06, "loss": 0.4773, "step": 425 }, { "epoch": 1.7338709677419355, "grad_norm": 1.907392372458416, "learning_rate": 1.061333102290576e-06, "loss": 0.4776, "step": 430 }, { "epoch": 1.754032258064516, "grad_norm": 2.6145484574587083, "learning_rate": 9.090077853075119e-07, "loss": 0.4864, "step": 435 }, { "epoch": 1.7741935483870968, "grad_norm": 2.06555720808374, "learning_rate": 7.679579844288509e-07, "loss": 0.4692, "step": 440 }, { "epoch": 1.7943548387096775, "grad_norm": 2.1245728247554982, "learning_rate": 6.383586430781196e-07, "loss": 0.5071, "step": 445 }, { "epoch": 1.814516129032258, "grad_norm": 2.0064654819537298, "learning_rate": 5.203705027262185e-07, "loss": 0.4659, "step": 450 }, { "epoch": 1.8346774193548387, "grad_norm": 2.1497980743373906, "learning_rate": 4.141399035245053e-07, "loss": 0.5124, "step": 455 }, { "epoch": 1.8548387096774195, "grad_norm": 2.1598292791262064, "learning_rate": 3.197986027997657e-07, "loss": 0.4742, "step": 460 }, { "epoch": 1.875, "grad_norm": 2.01644524585016, "learning_rate": 2.3746361163621723e-07, "loss": 0.4833, "step": 465 }, { "epoch": 1.8951612903225805, "grad_norm": 1.9818198911248883, "learning_rate": 1.6723704974718758e-07, "loss": 0.5176, "step": 470 }, { "epoch": 1.9153225806451613, "grad_norm": 2.0731118119143126, "learning_rate": 1.0920601881650006e-07, "loss": 0.4895, "step": 475 }, { "epoch": 1.935483870967742, "grad_norm": 1.9849538964737026, "learning_rate": 6.344249446665673e-08, "loss": 0.5093, "step": 480 }, { "epoch": 1.9556451612903225, "grad_norm": 1.8778229972783114, "learning_rate": 3.0003236987802276e-08, "loss": 0.4298, "step": 485 }, { "epoch": 1.9758064516129032, "grad_norm": 2.0981658218579873, "learning_rate": 8.929720938193331e-09, "loss": 0.524, "step": 490 }, { "epoch": 1.995967741935484, "grad_norm": 2.1714963758794936, "learning_rate": 2.48083703494606e-10, "loss": 0.523, "step": 495 }, { "epoch": 2.0, "step": 496, "total_flos": 20768208814080.0, "train_loss": 0.8128850824169574, "train_runtime": 1218.4539, "train_samples_per_second": 13.005, "train_steps_per_second": 0.407 } ], "logging_steps": 5, "max_steps": 496, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 20768208814080.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }