{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 528, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028409090909090908, "grad_norm": 10.896637192000782, "learning_rate": 7.4074074074074075e-06, "loss": 1.6734, "step": 5 }, { "epoch": 0.056818181818181816, "grad_norm": 1.5163857276357569, "learning_rate": 1.6666666666666667e-05, "loss": 1.2095, "step": 10 }, { "epoch": 0.08522727272727272, "grad_norm": 0.6030083388420293, "learning_rate": 2.5925925925925925e-05, "loss": 0.8954, "step": 15 }, { "epoch": 0.11363636363636363, "grad_norm": 0.4146436929136456, "learning_rate": 3.518518518518519e-05, "loss": 0.7715, "step": 20 }, { "epoch": 0.14204545454545456, "grad_norm": 0.2636145188710264, "learning_rate": 4.4444444444444447e-05, "loss": 0.7127, "step": 25 }, { "epoch": 0.17045454545454544, "grad_norm": 0.2023043123134589, "learning_rate": 4.99982305792533e-05, "loss": 0.6778, "step": 30 }, { "epoch": 0.19886363636363635, "grad_norm": 0.16510970301238276, "learning_rate": 4.997832779179051e-05, "loss": 0.6409, "step": 35 }, { "epoch": 0.22727272727272727, "grad_norm": 0.13082269602718508, "learning_rate": 4.993633006948142e-05, "loss": 0.6336, "step": 40 }, { "epoch": 0.2556818181818182, "grad_norm": 0.14831898432802543, "learning_rate": 4.987227869373036e-05, "loss": 0.6166, "step": 45 }, { "epoch": 0.2840909090909091, "grad_norm": 0.10897233346547917, "learning_rate": 4.9786236623445434e-05, "loss": 0.6118, "step": 50 }, { "epoch": 0.3125, "grad_norm": 0.11184138744287996, "learning_rate": 4.967828843315348e-05, "loss": 0.5983, "step": 55 }, { "epoch": 0.3409090909090909, "grad_norm": 0.11134807726458239, "learning_rate": 4.954854022986805e-05, "loss": 0.5891, "step": 60 }, { "epoch": 0.3693181818181818, "grad_norm": 0.09921365604301072, "learning_rate": 4.939711954879211e-05, "loss": 0.5876, "step": 65 }, { "epoch": 0.3977272727272727, "grad_norm": 0.1055432451773935, "learning_rate": 4.922417522795821e-05, "loss": 0.5801, "step": 70 }, { "epoch": 0.42613636363636365, "grad_norm": 0.09914093623755263, "learning_rate": 4.902987726192893e-05, "loss": 0.5781, "step": 75 }, { "epoch": 0.45454545454545453, "grad_norm": 0.08828127246038743, "learning_rate": 4.881441663470182e-05, "loss": 0.5649, "step": 80 }, { "epoch": 0.48295454545454547, "grad_norm": 0.10055317591789738, "learning_rate": 4.857800513198288e-05, "loss": 0.5625, "step": 85 }, { "epoch": 0.5113636363636364, "grad_norm": 0.08874927350125936, "learning_rate": 4.832087513301302e-05, "loss": 0.5631, "step": 90 }, { "epoch": 0.5397727272727273, "grad_norm": 0.1118239441455913, "learning_rate": 4.804327938215235e-05, "loss": 0.5591, "step": 95 }, { "epoch": 0.5681818181818182, "grad_norm": 0.09351012207556161, "learning_rate": 4.7745490740446676e-05, "loss": 0.5566, "step": 100 }, { "epoch": 0.5965909090909091, "grad_norm": 0.10852517138960499, "learning_rate": 4.742780191742036e-05, "loss": 0.5607, "step": 105 }, { "epoch": 0.625, "grad_norm": 0.11702640688817323, "learning_rate": 4.7090525183359297e-05, "loss": 0.5539, "step": 110 }, { "epoch": 0.6534090909090909, "grad_norm": 0.09785419980049836, "learning_rate": 4.6733992062366766e-05, "loss": 0.5419, "step": 115 }, { "epoch": 0.6818181818181818, "grad_norm": 0.10711167883214251, "learning_rate": 4.635855300649382e-05, "loss": 0.5445, "step": 120 }, { "epoch": 0.7102272727272727, "grad_norm": 0.10796834846017456, "learning_rate": 4.596457705126459e-05, "loss": 0.5448, "step": 125 }, { "epoch": 0.7386363636363636, "grad_norm": 0.11744855439211235, "learning_rate": 4.555245145293509e-05, "loss": 0.5411, "step": 130 }, { "epoch": 0.7670454545454546, "grad_norm": 0.11356156360640139, "learning_rate": 4.512258130784199e-05, "loss": 0.5434, "step": 135 }, { "epoch": 0.7954545454545454, "grad_norm": 0.12169855888778969, "learning_rate": 4.4675389154215744e-05, "loss": 0.5421, "step": 140 }, { "epoch": 0.8238636363636364, "grad_norm": 0.10341006801601364, "learning_rate": 4.4211314556849136e-05, "loss": 0.5355, "step": 145 }, { "epoch": 0.8522727272727273, "grad_norm": 0.10759924912275785, "learning_rate": 4.373081367502981e-05, "loss": 0.5326, "step": 150 }, { "epoch": 0.8806818181818182, "grad_norm": 0.10861205860545389, "learning_rate": 4.323435881416124e-05, "loss": 0.537, "step": 155 }, { "epoch": 0.9090909090909091, "grad_norm": 0.09957047692339929, "learning_rate": 4.272243796151305e-05, "loss": 0.533, "step": 160 }, { "epoch": 0.9375, "grad_norm": 0.12359375999999637, "learning_rate": 4.219555430655693e-05, "loss": 0.5351, "step": 165 }, { "epoch": 0.9659090909090909, "grad_norm": 0.09258501782228747, "learning_rate": 4.165422574635959e-05, "loss": 0.5317, "step": 170 }, { "epoch": 0.9943181818181818, "grad_norm": 0.11203309080231412, "learning_rate": 4.1098984376519e-05, "loss": 0.5292, "step": 175 }, { "epoch": 1.0227272727272727, "grad_norm": 0.08337277679519309, "learning_rate": 4.053037596814432e-05, "loss": 0.5205, "step": 180 }, { "epoch": 1.0511363636363635, "grad_norm": 0.08922972769023695, "learning_rate": 3.9948959431393454e-05, "loss": 0.5107, "step": 185 }, { "epoch": 1.0795454545454546, "grad_norm": 0.09256444997193415, "learning_rate": 3.935530626609582e-05, "loss": 0.5091, "step": 190 }, { "epoch": 1.1079545454545454, "grad_norm": 0.08761092083420882, "learning_rate": 3.875e-05, "loss": 0.5118, "step": 195 }, { "epoch": 1.1363636363636362, "grad_norm": 0.09766222638240604, "learning_rate": 3.813363561519876e-05, "loss": 0.5086, "step": 200 }, { "epoch": 1.1647727272727273, "grad_norm": 0.08469321954291688, "learning_rate": 3.750681896329504e-05, "loss": 0.5106, "step": 205 }, { "epoch": 1.1931818181818181, "grad_norm": 0.0961560094422636, "learning_rate": 3.687016616988379e-05, "loss": 0.51, "step": 210 }, { "epoch": 1.2215909090909092, "grad_norm": 0.0862194194928744, "learning_rate": 3.6224303028935106e-05, "loss": 0.5103, "step": 215 }, { "epoch": 1.25, "grad_norm": 0.07956278803606433, "learning_rate": 3.556986438767389e-05, "loss": 0.5037, "step": 220 }, { "epoch": 1.2784090909090908, "grad_norm": 0.0903181198584574, "learning_rate": 3.490749352256063e-05, "loss": 0.509, "step": 225 }, { "epoch": 1.3068181818181819, "grad_norm": 0.0944984329203277, "learning_rate": 3.4237841506986744e-05, "loss": 0.5027, "step": 230 }, { "epoch": 1.3352272727272727, "grad_norm": 0.08882033124741437, "learning_rate": 3.356156657130596e-05, "loss": 0.4961, "step": 235 }, { "epoch": 1.3636363636363638, "grad_norm": 0.09550461777333721, "learning_rate": 3.287933345583085e-05, "loss": 0.5069, "step": 240 }, { "epoch": 1.3920454545454546, "grad_norm": 0.08946536218539883, "learning_rate": 3.219181275743034e-05, "loss": 0.5086, "step": 245 }, { "epoch": 1.4204545454545454, "grad_norm": 0.09325584287334789, "learning_rate": 3.1499680270370706e-05, "loss": 0.5044, "step": 250 }, { "epoch": 1.4488636363636362, "grad_norm": 0.08223842512135068, "learning_rate": 3.0803616322047654e-05, "loss": 0.5052, "step": 255 }, { "epoch": 1.4772727272727273, "grad_norm": 0.08270798846301625, "learning_rate": 3.0104305104262698e-05, "loss": 0.4985, "step": 260 }, { "epoch": 1.5056818181818183, "grad_norm": 0.08262838460793459, "learning_rate": 2.940243400070103e-05, "loss": 0.4994, "step": 265 }, { "epoch": 1.5340909090909092, "grad_norm": 0.09058860519349171, "learning_rate": 2.8698692911271897e-05, "loss": 0.5023, "step": 270 }, { "epoch": 1.5625, "grad_norm": 0.09371800013120092, "learning_rate": 2.7993773573975728e-05, "loss": 0.4992, "step": 275 }, { "epoch": 1.5909090909090908, "grad_norm": 0.08456948817551574, "learning_rate": 2.7288368884964475e-05, "loss": 0.503, "step": 280 }, { "epoch": 1.6193181818181817, "grad_norm": 0.09101003159864386, "learning_rate": 2.658317221746361e-05, "loss": 0.4954, "step": 285 }, { "epoch": 1.6477272727272727, "grad_norm": 0.08160457721074796, "learning_rate": 2.5878876740225116e-05, "loss": 0.4986, "step": 290 }, { "epoch": 1.6761363636363638, "grad_norm": 0.08078176783826871, "learning_rate": 2.5176174736181536e-05, "loss": 0.4971, "step": 295 }, { "epoch": 1.7045454545454546, "grad_norm": 0.08864892274469409, "learning_rate": 2.4475756921970653e-05, "loss": 0.4988, "step": 300 }, { "epoch": 1.7329545454545454, "grad_norm": 0.09144520680898066, "learning_rate": 2.377831176899986e-05, "loss": 0.4987, "step": 305 }, { "epoch": 1.7613636363636362, "grad_norm": 0.0913519780551606, "learning_rate": 2.3084524826717317e-05, "loss": 0.4962, "step": 310 }, { "epoch": 1.7897727272727273, "grad_norm": 0.08330604163992984, "learning_rate": 2.23950780487554e-05, "loss": 0.5021, "step": 315 }, { "epoch": 1.8181818181818183, "grad_norm": 0.08280936051442833, "learning_rate": 2.171064912260849e-05, "loss": 0.4983, "step": 320 }, { "epoch": 1.8465909090909092, "grad_norm": 0.07571686269473353, "learning_rate": 2.103191080350419e-05, "loss": 0.499, "step": 325 }, { "epoch": 1.875, "grad_norm": 0.08525336953750823, "learning_rate": 2.0359530253122738e-05, "loss": 0.4923, "step": 330 }, { "epoch": 1.9034090909090908, "grad_norm": 0.10358699532249785, "learning_rate": 1.9694168383814492e-05, "loss": 0.5043, "step": 335 }, { "epoch": 1.9318181818181817, "grad_norm": 0.08301020877209554, "learning_rate": 1.9036479208960127e-05, "loss": 0.4946, "step": 340 }, { "epoch": 1.9602272727272727, "grad_norm": 0.08553632563551307, "learning_rate": 1.838710920011227e-05, "loss": 0.4969, "step": 345 }, { "epoch": 1.9886363636363638, "grad_norm": 0.08303295503235186, "learning_rate": 1.7746696651550143e-05, "loss": 0.4997, "step": 350 }, { "epoch": 2.0170454545454546, "grad_norm": 0.08033143740448973, "learning_rate": 1.7115871052872204e-05, "loss": 0.4867, "step": 355 }, { "epoch": 2.0454545454545454, "grad_norm": 0.08063264186240293, "learning_rate": 1.6495252470243134e-05, "loss": 0.487, "step": 360 }, { "epoch": 2.0738636363636362, "grad_norm": 0.07841584895617655, "learning_rate": 1.5885450936903586e-05, "loss": 0.4837, "step": 365 }, { "epoch": 2.102272727272727, "grad_norm": 0.07460661702402475, "learning_rate": 1.528706585354177e-05, "loss": 0.4788, "step": 370 }, { "epoch": 2.1306818181818183, "grad_norm": 0.08030048639361652, "learning_rate": 1.4700685399116238e-05, "loss": 0.4803, "step": 375 }, { "epoch": 2.159090909090909, "grad_norm": 0.07655857860636041, "learning_rate": 1.412688595270887e-05, "loss": 0.4782, "step": 380 }, { "epoch": 2.1875, "grad_norm": 0.0741239677057519, "learning_rate": 1.3566231526976714e-05, "loss": 0.4833, "step": 385 }, { "epoch": 2.215909090909091, "grad_norm": 0.07552080480637674, "learning_rate": 1.3019273213759081e-05, "loss": 0.4817, "step": 390 }, { "epoch": 2.2443181818181817, "grad_norm": 0.087506481602976, "learning_rate": 1.2486548642385274e-05, "loss": 0.4855, "step": 395 }, { "epoch": 2.2727272727272725, "grad_norm": 0.07825929401017973, "learning_rate": 1.1968581451215065e-05, "loss": 0.4783, "step": 400 }, { "epoch": 2.3011363636363638, "grad_norm": 0.07347633398325569, "learning_rate": 1.1465880772931601e-05, "loss": 0.484, "step": 405 }, { "epoch": 2.3295454545454546, "grad_norm": 0.07075822069959536, "learning_rate": 1.0978940734092554e-05, "loss": 0.478, "step": 410 }, { "epoch": 2.3579545454545454, "grad_norm": 0.07267042198965988, "learning_rate": 1.050823996943144e-05, "loss": 0.4783, "step": 415 }, { "epoch": 2.3863636363636362, "grad_norm": 0.07341626200460101, "learning_rate": 1.0054241151386492e-05, "loss": 0.4756, "step": 420 }, { "epoch": 2.4147727272727275, "grad_norm": 0.0706063205931748, "learning_rate": 9.617390535319684e-06, "loss": 0.4772, "step": 425 }, { "epoch": 2.4431818181818183, "grad_norm": 0.06987948971233222, "learning_rate": 9.198117520872698e-06, "loss": 0.477, "step": 430 }, { "epoch": 2.471590909090909, "grad_norm": 0.07780450930302736, "learning_rate": 8.796834229891206e-06, "loss": 0.4819, "step": 435 }, { "epoch": 2.5, "grad_norm": 0.07385728736608703, "learning_rate": 8.413935101332198e-06, "loss": 0.476, "step": 440 }, { "epoch": 2.528409090909091, "grad_norm": 0.06912845080364564, "learning_rate": 8.049796503552626e-06, "loss": 0.4776, "step": 445 }, { "epoch": 2.5568181818181817, "grad_norm": 0.07115097048034667, "learning_rate": 7.704776364360454e-06, "loss": 0.4806, "step": 450 }, { "epoch": 2.5852272727272725, "grad_norm": 0.0719603929691263, "learning_rate": 7.379213819191676e-06, "loss": 0.4805, "step": 455 }, { "epoch": 2.6136363636363638, "grad_norm": 0.06733281311431526, "learning_rate": 7.07342887775922e-06, "loss": 0.4771, "step": 460 }, { "epoch": 2.6420454545454546, "grad_norm": 0.06812256929077086, "learning_rate": 6.787722109501379e-06, "loss": 0.4769, "step": 465 }, { "epoch": 2.6704545454545454, "grad_norm": 0.06945313043383605, "learning_rate": 6.522374348138882e-06, "loss": 0.4791, "step": 470 }, { "epoch": 2.6988636363636362, "grad_norm": 0.07082851639261782, "learning_rate": 6.277646415631148e-06, "loss": 0.4785, "step": 475 }, { "epoch": 2.7272727272727275, "grad_norm": 0.07019381014525443, "learning_rate": 6.053778865802907e-06, "loss": 0.4787, "step": 480 }, { "epoch": 2.7556818181818183, "grad_norm": 0.06814585572621572, "learning_rate": 5.850991747893292e-06, "loss": 0.4788, "step": 485 }, { "epoch": 2.784090909090909, "grad_norm": 0.06724675412918013, "learning_rate": 5.669484390259793e-06, "loss": 0.4744, "step": 490 }, { "epoch": 2.8125, "grad_norm": 0.06951904971525688, "learning_rate": 5.509435204449666e-06, "loss": 0.4852, "step": 495 }, { "epoch": 2.840909090909091, "grad_norm": 0.06771580623936355, "learning_rate": 5.37100150983139e-06, "loss": 0.4748, "step": 500 }, { "epoch": 2.8693181818181817, "grad_norm": 0.06692159909264785, "learning_rate": 5.254319378958563e-06, "loss": 0.475, "step": 505 }, { "epoch": 2.8977272727272725, "grad_norm": 0.06940269513307211, "learning_rate": 5.159503503818199e-06, "loss": 0.4809, "step": 510 }, { "epoch": 2.9261363636363638, "grad_norm": 0.06707771857169355, "learning_rate": 5.086647083094966e-06, "loss": 0.4793, "step": 515 }, { "epoch": 2.9545454545454546, "grad_norm": 0.06794955106454185, "learning_rate": 5.03582173056209e-06, "loss": 0.4781, "step": 520 }, { "epoch": 2.9829545454545454, "grad_norm": 0.06951889463364248, "learning_rate": 5.00707740468904e-06, "loss": 0.4807, "step": 525 }, { "epoch": 3.0, "step": 528, "total_flos": 962656789856256.0, "train_loss": 0.5394951967578946, "train_runtime": 11008.0222, "train_samples_per_second": 6.123, "train_steps_per_second": 0.048 } ], "logging_steps": 5, "max_steps": 528, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 962656789856256.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }