{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 528,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.028409090909090908,
      "grad_norm": 10.896637192000782,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 1.6734,
      "step": 5
    },
    {
      "epoch": 0.056818181818181816,
      "grad_norm": 1.5163857276357569,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.2095,
      "step": 10
    },
    {
      "epoch": 0.08522727272727272,
      "grad_norm": 0.6030083388420293,
      "learning_rate": 2.5925925925925925e-05,
      "loss": 0.8954,
      "step": 15
    },
    {
      "epoch": 0.11363636363636363,
      "grad_norm": 0.4146436929136456,
      "learning_rate": 3.518518518518519e-05,
      "loss": 0.7715,
      "step": 20
    },
    {
      "epoch": 0.14204545454545456,
      "grad_norm": 0.2636145188710264,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 0.7127,
      "step": 25
    },
    {
      "epoch": 0.17045454545454544,
      "grad_norm": 0.2023043123134589,
      "learning_rate": 4.99982305792533e-05,
      "loss": 0.6778,
      "step": 30
    },
    {
      "epoch": 0.19886363636363635,
      "grad_norm": 0.16510970301238276,
      "learning_rate": 4.997832779179051e-05,
      "loss": 0.6409,
      "step": 35
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.13082269602718508,
      "learning_rate": 4.993633006948142e-05,
      "loss": 0.6336,
      "step": 40
    },
    {
      "epoch": 0.2556818181818182,
      "grad_norm": 0.14831898432802543,
      "learning_rate": 4.987227869373036e-05,
      "loss": 0.6166,
      "step": 45
    },
    {
      "epoch": 0.2840909090909091,
      "grad_norm": 0.10897233346547917,
      "learning_rate": 4.9786236623445434e-05,
      "loss": 0.6118,
      "step": 50
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.11184138744287996,
      "learning_rate": 4.967828843315348e-05,
      "loss": 0.5983,
      "step": 55
    },
    {
      "epoch": 0.3409090909090909,
      "grad_norm": 0.11134807726458239,
      "learning_rate": 4.954854022986805e-05,
      "loss": 0.5891,
      "step": 60
    },
    {
      "epoch": 0.3693181818181818,
      "grad_norm": 0.09921365604301072,
      "learning_rate": 4.939711954879211e-05,
      "loss": 0.5876,
      "step": 65
    },
    {
      "epoch": 0.3977272727272727,
      "grad_norm": 0.1055432451773935,
      "learning_rate": 4.922417522795821e-05,
      "loss": 0.5801,
      "step": 70
    },
    {
      "epoch": 0.42613636363636365,
      "grad_norm": 0.09914093623755263,
      "learning_rate": 4.902987726192893e-05,
      "loss": 0.5781,
      "step": 75
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.08828127246038743,
      "learning_rate": 4.881441663470182e-05,
      "loss": 0.5649,
      "step": 80
    },
    {
      "epoch": 0.48295454545454547,
      "grad_norm": 0.10055317591789738,
      "learning_rate": 4.857800513198288e-05,
      "loss": 0.5625,
      "step": 85
    },
    {
      "epoch": 0.5113636363636364,
      "grad_norm": 0.08874927350125936,
      "learning_rate": 4.832087513301302e-05,
      "loss": 0.5631,
      "step": 90
    },
    {
      "epoch": 0.5397727272727273,
      "grad_norm": 0.1118239441455913,
      "learning_rate": 4.804327938215235e-05,
      "loss": 0.5591,
      "step": 95
    },
    {
      "epoch": 0.5681818181818182,
      "grad_norm": 0.09351012207556161,
      "learning_rate": 4.7745490740446676e-05,
      "loss": 0.5566,
      "step": 100
    },
    {
      "epoch": 0.5965909090909091,
      "grad_norm": 0.10852517138960499,
      "learning_rate": 4.742780191742036e-05,
      "loss": 0.5607,
      "step": 105
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.11702640688817323,
      "learning_rate": 4.7090525183359297e-05,
      "loss": 0.5539,
      "step": 110
    },
    {
      "epoch": 0.6534090909090909,
      "grad_norm": 0.09785419980049836,
      "learning_rate": 4.6733992062366766e-05,
      "loss": 0.5419,
      "step": 115
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 0.10711167883214251,
      "learning_rate": 4.635855300649382e-05,
      "loss": 0.5445,
      "step": 120
    },
    {
      "epoch": 0.7102272727272727,
      "grad_norm": 0.10796834846017456,
      "learning_rate": 4.596457705126459e-05,
      "loss": 0.5448,
      "step": 125
    },
    {
      "epoch": 0.7386363636363636,
      "grad_norm": 0.11744855439211235,
      "learning_rate": 4.555245145293509e-05,
      "loss": 0.5411,
      "step": 130
    },
    {
      "epoch": 0.7670454545454546,
      "grad_norm": 0.11356156360640139,
      "learning_rate": 4.512258130784199e-05,
      "loss": 0.5434,
      "step": 135
    },
    {
      "epoch": 0.7954545454545454,
      "grad_norm": 0.12169855888778969,
      "learning_rate": 4.4675389154215744e-05,
      "loss": 0.5421,
      "step": 140
    },
    {
      "epoch": 0.8238636363636364,
      "grad_norm": 0.10341006801601364,
      "learning_rate": 4.4211314556849136e-05,
      "loss": 0.5355,
      "step": 145
    },
    {
      "epoch": 0.8522727272727273,
      "grad_norm": 0.10759924912275785,
      "learning_rate": 4.373081367502981e-05,
      "loss": 0.5326,
      "step": 150
    },
    {
      "epoch": 0.8806818181818182,
      "grad_norm": 0.10861205860545389,
      "learning_rate": 4.323435881416124e-05,
      "loss": 0.537,
      "step": 155
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.09957047692339929,
      "learning_rate": 4.272243796151305e-05,
      "loss": 0.533,
      "step": 160
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.12359375999999637,
      "learning_rate": 4.219555430655693e-05,
      "loss": 0.5351,
      "step": 165
    },
    {
      "epoch": 0.9659090909090909,
      "grad_norm": 0.09258501782228747,
      "learning_rate": 4.165422574635959e-05,
      "loss": 0.5317,
      "step": 170
    },
    {
      "epoch": 0.9943181818181818,
      "grad_norm": 0.11203309080231412,
      "learning_rate": 4.1098984376519e-05,
      "loss": 0.5292,
      "step": 175
    },
    {
      "epoch": 1.0227272727272727,
      "grad_norm": 0.08337277679519309,
      "learning_rate": 4.053037596814432e-05,
      "loss": 0.5205,
      "step": 180
    },
    {
      "epoch": 1.0511363636363635,
      "grad_norm": 0.08922972769023695,
      "learning_rate": 3.9948959431393454e-05,
      "loss": 0.5107,
      "step": 185
    },
    {
      "epoch": 1.0795454545454546,
      "grad_norm": 0.09256444997193415,
      "learning_rate": 3.935530626609582e-05,
      "loss": 0.5091,
      "step": 190
    },
    {
      "epoch": 1.1079545454545454,
      "grad_norm": 0.08761092083420882,
      "learning_rate": 3.875e-05,
      "loss": 0.5118,
      "step": 195
    },
    {
      "epoch": 1.1363636363636362,
      "grad_norm": 0.09766222638240604,
      "learning_rate": 3.813363561519876e-05,
      "loss": 0.5086,
      "step": 200
    },
    {
      "epoch": 1.1647727272727273,
      "grad_norm": 0.08469321954291688,
      "learning_rate": 3.750681896329504e-05,
      "loss": 0.5106,
      "step": 205
    },
    {
      "epoch": 1.1931818181818181,
      "grad_norm": 0.0961560094422636,
      "learning_rate": 3.687016616988379e-05,
      "loss": 0.51,
      "step": 210
    },
    {
      "epoch": 1.2215909090909092,
      "grad_norm": 0.0862194194928744,
      "learning_rate": 3.6224303028935106e-05,
      "loss": 0.5103,
      "step": 215
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.07956278803606433,
      "learning_rate": 3.556986438767389e-05,
      "loss": 0.5037,
      "step": 220
    },
    {
      "epoch": 1.2784090909090908,
      "grad_norm": 0.0903181198584574,
      "learning_rate": 3.490749352256063e-05,
      "loss": 0.509,
      "step": 225
    },
    {
      "epoch": 1.3068181818181819,
      "grad_norm": 0.0944984329203277,
      "learning_rate": 3.4237841506986744e-05,
      "loss": 0.5027,
      "step": 230
    },
    {
      "epoch": 1.3352272727272727,
      "grad_norm": 0.08882033124741437,
      "learning_rate": 3.356156657130596e-05,
      "loss": 0.4961,
      "step": 235
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 0.09550461777333721,
      "learning_rate": 3.287933345583085e-05,
      "loss": 0.5069,
      "step": 240
    },
    {
      "epoch": 1.3920454545454546,
      "grad_norm": 0.08946536218539883,
      "learning_rate": 3.219181275743034e-05,
      "loss": 0.5086,
      "step": 245
    },
    {
      "epoch": 1.4204545454545454,
      "grad_norm": 0.09325584287334789,
      "learning_rate": 3.1499680270370706e-05,
      "loss": 0.5044,
      "step": 250
    },
    {
      "epoch": 1.4488636363636362,
      "grad_norm": 0.08223842512135068,
      "learning_rate": 3.0803616322047654e-05,
      "loss": 0.5052,
      "step": 255
    },
    {
      "epoch": 1.4772727272727273,
      "grad_norm": 0.08270798846301625,
      "learning_rate": 3.0104305104262698e-05,
      "loss": 0.4985,
      "step": 260
    },
    {
      "epoch": 1.5056818181818183,
      "grad_norm": 0.08262838460793459,
      "learning_rate": 2.940243400070103e-05,
      "loss": 0.4994,
      "step": 265
    },
    {
      "epoch": 1.5340909090909092,
      "grad_norm": 0.09058860519349171,
      "learning_rate": 2.8698692911271897e-05,
      "loss": 0.5023,
      "step": 270
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.09371800013120092,
      "learning_rate": 2.7993773573975728e-05,
      "loss": 0.4992,
      "step": 275
    },
    {
      "epoch": 1.5909090909090908,
      "grad_norm": 0.08456948817551574,
      "learning_rate": 2.7288368884964475e-05,
      "loss": 0.503,
      "step": 280
    },
    {
      "epoch": 1.6193181818181817,
      "grad_norm": 0.09101003159864386,
      "learning_rate": 2.658317221746361e-05,
      "loss": 0.4954,
      "step": 285
    },
    {
      "epoch": 1.6477272727272727,
      "grad_norm": 0.08160457721074796,
      "learning_rate": 2.5878876740225116e-05,
      "loss": 0.4986,
      "step": 290
    },
    {
      "epoch": 1.6761363636363638,
      "grad_norm": 0.08078176783826871,
      "learning_rate": 2.5176174736181536e-05,
      "loss": 0.4971,
      "step": 295
    },
    {
      "epoch": 1.7045454545454546,
      "grad_norm": 0.08864892274469409,
      "learning_rate": 2.4475756921970653e-05,
      "loss": 0.4988,
      "step": 300
    },
    {
      "epoch": 1.7329545454545454,
      "grad_norm": 0.09144520680898066,
      "learning_rate": 2.377831176899986e-05,
      "loss": 0.4987,
      "step": 305
    },
    {
      "epoch": 1.7613636363636362,
      "grad_norm": 0.0913519780551606,
      "learning_rate": 2.3084524826717317e-05,
      "loss": 0.4962,
      "step": 310
    },
    {
      "epoch": 1.7897727272727273,
      "grad_norm": 0.08330604163992984,
      "learning_rate": 2.23950780487554e-05,
      "loss": 0.5021,
      "step": 315
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.08280936051442833,
      "learning_rate": 2.171064912260849e-05,
      "loss": 0.4983,
      "step": 320
    },
    {
      "epoch": 1.8465909090909092,
      "grad_norm": 0.07571686269473353,
      "learning_rate": 2.103191080350419e-05,
      "loss": 0.499,
      "step": 325
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.08525336953750823,
      "learning_rate": 2.0359530253122738e-05,
      "loss": 0.4923,
      "step": 330
    },
    {
      "epoch": 1.9034090909090908,
      "grad_norm": 0.10358699532249785,
      "learning_rate": 1.9694168383814492e-05,
      "loss": 0.5043,
      "step": 335
    },
    {
      "epoch": 1.9318181818181817,
      "grad_norm": 0.08301020877209554,
      "learning_rate": 1.9036479208960127e-05,
      "loss": 0.4946,
      "step": 340
    },
    {
      "epoch": 1.9602272727272727,
      "grad_norm": 0.08553632563551307,
      "learning_rate": 1.838710920011227e-05,
      "loss": 0.4969,
      "step": 345
    },
    {
      "epoch": 1.9886363636363638,
      "grad_norm": 0.08303295503235186,
      "learning_rate": 1.7746696651550143e-05,
      "loss": 0.4997,
      "step": 350
    },
    {
      "epoch": 2.0170454545454546,
      "grad_norm": 0.08033143740448973,
      "learning_rate": 1.7115871052872204e-05,
      "loss": 0.4867,
      "step": 355
    },
    {
      "epoch": 2.0454545454545454,
      "grad_norm": 0.08063264186240293,
      "learning_rate": 1.6495252470243134e-05,
      "loss": 0.487,
      "step": 360
    },
    {
      "epoch": 2.0738636363636362,
      "grad_norm": 0.07841584895617655,
      "learning_rate": 1.5885450936903586e-05,
      "loss": 0.4837,
      "step": 365
    },
    {
      "epoch": 2.102272727272727,
      "grad_norm": 0.07460661702402475,
      "learning_rate": 1.528706585354177e-05,
      "loss": 0.4788,
      "step": 370
    },
    {
      "epoch": 2.1306818181818183,
      "grad_norm": 0.08030048639361652,
      "learning_rate": 1.4700685399116238e-05,
      "loss": 0.4803,
      "step": 375
    },
    {
      "epoch": 2.159090909090909,
      "grad_norm": 0.07655857860636041,
      "learning_rate": 1.412688595270887e-05,
      "loss": 0.4782,
      "step": 380
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.0741239677057519,
      "learning_rate": 1.3566231526976714e-05,
      "loss": 0.4833,
      "step": 385
    },
    {
      "epoch": 2.215909090909091,
      "grad_norm": 0.07552080480637674,
      "learning_rate": 1.3019273213759081e-05,
      "loss": 0.4817,
      "step": 390
    },
    {
      "epoch": 2.2443181818181817,
      "grad_norm": 0.087506481602976,
      "learning_rate": 1.2486548642385274e-05,
      "loss": 0.4855,
      "step": 395
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 0.07825929401017973,
      "learning_rate": 1.1968581451215065e-05,
      "loss": 0.4783,
      "step": 400
    },
    {
      "epoch": 2.3011363636363638,
      "grad_norm": 0.07347633398325569,
      "learning_rate": 1.1465880772931601e-05,
      "loss": 0.484,
      "step": 405
    },
    {
      "epoch": 2.3295454545454546,
      "grad_norm": 0.07075822069959536,
      "learning_rate": 1.0978940734092554e-05,
      "loss": 0.478,
      "step": 410
    },
    {
      "epoch": 2.3579545454545454,
      "grad_norm": 0.07267042198965988,
      "learning_rate": 1.050823996943144e-05,
      "loss": 0.4783,
      "step": 415
    },
    {
      "epoch": 2.3863636363636362,
      "grad_norm": 0.07341626200460101,
      "learning_rate": 1.0054241151386492e-05,
      "loss": 0.4756,
      "step": 420
    },
    {
      "epoch": 2.4147727272727275,
      "grad_norm": 0.0706063205931748,
      "learning_rate": 9.617390535319684e-06,
      "loss": 0.4772,
      "step": 425
    },
    {
      "epoch": 2.4431818181818183,
      "grad_norm": 0.06987948971233222,
      "learning_rate": 9.198117520872698e-06,
      "loss": 0.477,
      "step": 430
    },
    {
      "epoch": 2.471590909090909,
      "grad_norm": 0.07780450930302736,
      "learning_rate": 8.796834229891206e-06,
      "loss": 0.4819,
      "step": 435
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.07385728736608703,
      "learning_rate": 8.413935101332198e-06,
      "loss": 0.476,
      "step": 440
    },
    {
      "epoch": 2.528409090909091,
      "grad_norm": 0.06912845080364564,
      "learning_rate": 8.049796503552626e-06,
      "loss": 0.4776,
      "step": 445
    },
    {
      "epoch": 2.5568181818181817,
      "grad_norm": 0.07115097048034667,
      "learning_rate": 7.704776364360454e-06,
      "loss": 0.4806,
      "step": 450
    },
    {
      "epoch": 2.5852272727272725,
      "grad_norm": 0.0719603929691263,
      "learning_rate": 7.379213819191676e-06,
      "loss": 0.4805,
      "step": 455
    },
    {
      "epoch": 2.6136363636363638,
      "grad_norm": 0.06733281311431526,
      "learning_rate": 7.07342887775922e-06,
      "loss": 0.4771,
      "step": 460
    },
    {
      "epoch": 2.6420454545454546,
      "grad_norm": 0.06812256929077086,
      "learning_rate": 6.787722109501379e-06,
      "loss": 0.4769,
      "step": 465
    },
    {
      "epoch": 2.6704545454545454,
      "grad_norm": 0.06945313043383605,
      "learning_rate": 6.522374348138882e-06,
      "loss": 0.4791,
      "step": 470
    },
    {
      "epoch": 2.6988636363636362,
      "grad_norm": 0.07082851639261782,
      "learning_rate": 6.277646415631148e-06,
      "loss": 0.4785,
      "step": 475
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.07019381014525443,
      "learning_rate": 6.053778865802907e-06,
      "loss": 0.4787,
      "step": 480
    },
    {
      "epoch": 2.7556818181818183,
      "grad_norm": 0.06814585572621572,
      "learning_rate": 5.850991747893292e-06,
      "loss": 0.4788,
      "step": 485
    },
    {
      "epoch": 2.784090909090909,
      "grad_norm": 0.06724675412918013,
      "learning_rate": 5.669484390259793e-06,
      "loss": 0.4744,
      "step": 490
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.06951904971525688,
      "learning_rate": 5.509435204449666e-06,
      "loss": 0.4852,
      "step": 495
    },
    {
      "epoch": 2.840909090909091,
      "grad_norm": 0.06771580623936355,
      "learning_rate": 5.37100150983139e-06,
      "loss": 0.4748,
      "step": 500
    },
    {
      "epoch": 2.8693181818181817,
      "grad_norm": 0.06692159909264785,
      "learning_rate": 5.254319378958563e-06,
      "loss": 0.475,
      "step": 505
    },
    {
      "epoch": 2.8977272727272725,
      "grad_norm": 0.06940269513307211,
      "learning_rate": 5.159503503818199e-06,
      "loss": 0.4809,
      "step": 510
    },
    {
      "epoch": 2.9261363636363638,
      "grad_norm": 0.06707771857169355,
      "learning_rate": 5.086647083094966e-06,
      "loss": 0.4793,
      "step": 515
    },
    {
      "epoch": 2.9545454545454546,
      "grad_norm": 0.06794955106454185,
      "learning_rate": 5.03582173056209e-06,
      "loss": 0.4781,
      "step": 520
    },
    {
      "epoch": 2.9829545454545454,
      "grad_norm": 0.06951889463364248,
      "learning_rate": 5.00707740468904e-06,
      "loss": 0.4807,
      "step": 525
    },
    {
      "epoch": 3.0,
      "step": 528,
      "total_flos": 962656789856256.0,
      "train_loss": 0.5394951967578946,
      "train_runtime": 11008.0222,
      "train_samples_per_second": 6.123,
      "train_steps_per_second": 0.048
    }
  ],
  "logging_steps": 5,
  "max_steps": 528,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 962656789856256.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}