{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997450352701209, "eval_steps": 500, "global_step": 3529, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002832941443100371, "grad_norm": 174.56146240234375, "learning_rate": 9.433962264150944e-07, "loss": 17.5632, "step": 10 }, { "epoch": 0.005665882886200742, "grad_norm": 99.76522064208984, "learning_rate": 1.8867924528301889e-06, "loss": 16.5212, "step": 20 }, { "epoch": 0.008498824329301113, "grad_norm": 61.843570709228516, "learning_rate": 2.830188679245283e-06, "loss": 14.66, "step": 30 }, { "epoch": 0.011331765772401484, "grad_norm": 58.26314926147461, "learning_rate": 3.7735849056603777e-06, "loss": 12.8877, "step": 40 }, { "epoch": 0.014164707215501856, "grad_norm": 64.67306518554688, "learning_rate": 4.716981132075472e-06, "loss": 12.6102, "step": 50 }, { "epoch": 0.016997648658602225, "grad_norm": 40.95338439941406, "learning_rate": 5.660377358490566e-06, "loss": 10.3169, "step": 60 }, { "epoch": 0.019830590101702596, "grad_norm": 59.99547576904297, "learning_rate": 6.60377358490566e-06, "loss": 10.4793, "step": 70 }, { "epoch": 0.022663531544802967, "grad_norm": 69.4089584350586, "learning_rate": 7.5471698113207555e-06, "loss": 10.2298, "step": 80 }, { "epoch": 0.02549647298790334, "grad_norm": 38.2027702331543, "learning_rate": 8.49056603773585e-06, "loss": 8.5679, "step": 90 }, { "epoch": 0.028329414431003713, "grad_norm": 41.05867004394531, "learning_rate": 9.433962264150944e-06, "loss": 8.6834, "step": 100 }, { "epoch": 0.031162355874104083, "grad_norm": 45.03547286987305, "learning_rate": 9.999966306552455e-06, "loss": 7.8071, "step": 110 }, { "epoch": 0.03399529731720445, "grad_norm": 43.03623580932617, "learning_rate": 9.999587260482597e-06, "loss": 8.2945, "step": 120 }, { "epoch": 0.036828238760304825, "grad_norm": 37.273048400878906, "learning_rate": 9.998787083568112e-06, "loss": 8.0273, "step": 130 }, { "epoch": 0.03966118020340519, "grad_norm": 55.94173049926758, "learning_rate": 9.997565843210401e-06, "loss": 7.1597, "step": 140 }, { "epoch": 0.04249412164650557, "grad_norm": 50.30799102783203, "learning_rate": 9.995923642278351e-06, "loss": 7.2908, "step": 150 }, { "epoch": 0.045327063089605935, "grad_norm": 53.113243103027344, "learning_rate": 9.993860619099673e-06, "loss": 5.9006, "step": 160 }, { "epoch": 0.04816000453270631, "grad_norm": 51.57769012451172, "learning_rate": 9.991376947449254e-06, "loss": 8.8304, "step": 170 }, { "epoch": 0.05099294597580668, "grad_norm": 38.17790985107422, "learning_rate": 9.988472836534509e-06, "loss": 8.6644, "step": 180 }, { "epoch": 0.05382588741890705, "grad_norm": 34.28203582763672, "learning_rate": 9.985148530977767e-06, "loss": 6.6272, "step": 190 }, { "epoch": 0.056658828862007425, "grad_norm": 34.18405532836914, "learning_rate": 9.981404310795667e-06, "loss": 8.2372, "step": 200 }, { "epoch": 0.05949177030510779, "grad_norm": 40.83757400512695, "learning_rate": 9.97724049137556e-06, "loss": 5.3928, "step": 210 }, { "epoch": 0.06232471174820817, "grad_norm": 36.30077362060547, "learning_rate": 9.972657423448961e-06, "loss": 6.5196, "step": 220 }, { "epoch": 0.06515765319130853, "grad_norm": 38.03015899658203, "learning_rate": 9.96765549306199e-06, "loss": 7.1923, "step": 230 }, { "epoch": 0.0679905946344089, "grad_norm": 42.84525680541992, "learning_rate": 9.962235121542858e-06, "loss": 6.9849, "step": 240 }, { "epoch": 0.07082353607750928, "grad_norm": 34.071800231933594, "learning_rate": 9.956396765466382e-06, "loss": 7.0978, "step": 250 }, { "epoch": 0.07365647752060965, "grad_norm": 37.807029724121094, "learning_rate": 9.950140916615526e-06, "loss": 7.5077, "step": 260 }, { "epoch": 0.07648941896371002, "grad_norm": 36.55296325683594, "learning_rate": 9.943468101939968e-06, "loss": 6.6867, "step": 270 }, { "epoch": 0.07932236040681039, "grad_norm": 31.735977172851562, "learning_rate": 9.936378883511722e-06, "loss": 8.5626, "step": 280 }, { "epoch": 0.08215530184991077, "grad_norm": 51.97509002685547, "learning_rate": 9.92887385847779e-06, "loss": 7.1104, "step": 290 }, { "epoch": 0.08498824329301113, "grad_norm": 56.030025482177734, "learning_rate": 9.920953659009863e-06, "loss": 6.6099, "step": 300 }, { "epoch": 0.0878211847361115, "grad_norm": 37.08720016479492, "learning_rate": 9.912618952251071e-06, "loss": 4.9933, "step": 310 }, { "epoch": 0.09065412617921187, "grad_norm": 34.61451721191406, "learning_rate": 9.903870440259787e-06, "loss": 5.7727, "step": 320 }, { "epoch": 0.09348706762231225, "grad_norm": 35.92675018310547, "learning_rate": 9.89470885995049e-06, "loss": 4.8536, "step": 330 }, { "epoch": 0.09632000906541262, "grad_norm": 31.899490356445312, "learning_rate": 9.885134983031694e-06, "loss": 5.6988, "step": 340 }, { "epoch": 0.09915295050851299, "grad_norm": 37.71702194213867, "learning_rate": 9.875149615940943e-06, "loss": 8.0547, "step": 350 }, { "epoch": 0.10198589195161337, "grad_norm": 32.81459426879883, "learning_rate": 9.864753599776883e-06, "loss": 5.7466, "step": 360 }, { "epoch": 0.10481883339471373, "grad_norm": 34.12638854980469, "learning_rate": 9.853947810228416e-06, "loss": 6.3535, "step": 370 }, { "epoch": 0.1076517748378141, "grad_norm": 34.04792022705078, "learning_rate": 9.842733157500932e-06, "loss": 5.7424, "step": 380 }, { "epoch": 0.11048471628091447, "grad_norm": 33.2330322265625, "learning_rate": 9.831110586239643e-06, "loss": 5.4935, "step": 390 }, { "epoch": 0.11331765772401485, "grad_norm": 33.370567321777344, "learning_rate": 9.819081075450014e-06, "loss": 5.5257, "step": 400 }, { "epoch": 0.11615059916711522, "grad_norm": 34.091304779052734, "learning_rate": 9.806645638415302e-06, "loss": 6.1631, "step": 410 }, { "epoch": 0.11898354061021559, "grad_norm": 28.293777465820312, "learning_rate": 9.79380532261119e-06, "loss": 6.2594, "step": 420 }, { "epoch": 0.12181648205331595, "grad_norm": 31.169191360473633, "learning_rate": 9.780561209617569e-06, "loss": 5.428, "step": 430 }, { "epoch": 0.12464942349641633, "grad_norm": 31.67852783203125, "learning_rate": 9.766914415027426e-06, "loss": 6.3704, "step": 440 }, { "epoch": 0.1274823649395167, "grad_norm": 24.92995262145996, "learning_rate": 9.752866088352882e-06, "loss": 6.3413, "step": 450 }, { "epoch": 0.13031530638261707, "grad_norm": 30.267122268676758, "learning_rate": 9.738417412928348e-06, "loss": 6.1918, "step": 460 }, { "epoch": 0.13314824782571744, "grad_norm": 34.71146011352539, "learning_rate": 9.72356960581087e-06, "loss": 5.2388, "step": 470 }, { "epoch": 0.1359811892688178, "grad_norm": 32.87137985229492, "learning_rate": 9.7083239176776e-06, "loss": 4.2622, "step": 480 }, { "epoch": 0.13881413071191817, "grad_norm": 37.83769226074219, "learning_rate": 9.692681632720448e-06, "loss": 4.1838, "step": 490 }, { "epoch": 0.14164707215501857, "grad_norm": 29.84713363647461, "learning_rate": 9.676644068537915e-06, "loss": 6.1015, "step": 500 }, { "epoch": 0.14448001359811893, "grad_norm": 37.30936050415039, "learning_rate": 9.660212576024102e-06, "loss": 6.1121, "step": 510 }, { "epoch": 0.1473129550412193, "grad_norm": 43.746490478515625, "learning_rate": 9.64338853925493e-06, "loss": 6.0385, "step": 520 }, { "epoch": 0.15014589648431967, "grad_norm": 32.99515151977539, "learning_rate": 9.62617337537154e-06, "loss": 6.1911, "step": 530 }, { "epoch": 0.15297883792742004, "grad_norm": 46.957340240478516, "learning_rate": 9.608568534460938e-06, "loss": 5.822, "step": 540 }, { "epoch": 0.1558117793705204, "grad_norm": 33.011844635009766, "learning_rate": 9.590575499433837e-06, "loss": 7.1735, "step": 550 }, { "epoch": 0.15864472081362077, "grad_norm": 31.042083740234375, "learning_rate": 9.572195785899756e-06, "loss": 6.8695, "step": 560 }, { "epoch": 0.16147766225672117, "grad_norm": 26.289737701416016, "learning_rate": 9.553430942039352e-06, "loss": 6.9135, "step": 570 }, { "epoch": 0.16431060369982153, "grad_norm": 29.071701049804688, "learning_rate": 9.534282548474008e-06, "loss": 5.075, "step": 580 }, { "epoch": 0.1671435451429219, "grad_norm": 46.65534973144531, "learning_rate": 9.514752218132703e-06, "loss": 7.0842, "step": 590 }, { "epoch": 0.16997648658602227, "grad_norm": 31.195234298706055, "learning_rate": 9.494841596116138e-06, "loss": 4.9555, "step": 600 }, { "epoch": 0.17280942802912264, "grad_norm": 26.25963020324707, "learning_rate": 9.474552359558167e-06, "loss": 8.0829, "step": 610 }, { "epoch": 0.175642369472223, "grad_norm": 25.132131576538086, "learning_rate": 9.453886217484536e-06, "loss": 4.0549, "step": 620 }, { "epoch": 0.17847531091532337, "grad_norm": 47.06711196899414, "learning_rate": 9.432844910668914e-06, "loss": 6.1125, "step": 630 }, { "epoch": 0.18130825235842374, "grad_norm": 41.20506286621094, "learning_rate": 9.41143021148627e-06, "loss": 6.7009, "step": 640 }, { "epoch": 0.18414119380152413, "grad_norm": 27.037729263305664, "learning_rate": 9.389643923763573e-06, "loss": 6.8328, "step": 650 }, { "epoch": 0.1869741352446245, "grad_norm": 44.91098403930664, "learning_rate": 9.367487882627866e-06, "loss": 5.0284, "step": 660 }, { "epoch": 0.18980707668772487, "grad_norm": 32.71237564086914, "learning_rate": 9.344963954351662e-06, "loss": 6.0377, "step": 670 }, { "epoch": 0.19264001813082524, "grad_norm": 24.608020782470703, "learning_rate": 9.32207403619577e-06, "loss": 3.9539, "step": 680 }, { "epoch": 0.1954729595739256, "grad_norm": 45.37845230102539, "learning_rate": 9.298820056249459e-06, "loss": 7.6906, "step": 690 }, { "epoch": 0.19830590101702597, "grad_norm": 26.40629768371582, "learning_rate": 9.275203973268064e-06, "loss": 5.7302, "step": 700 }, { "epoch": 0.20113884246012634, "grad_norm": 25.433490753173828, "learning_rate": 9.251227776507989e-06, "loss": 5.6252, "step": 710 }, { "epoch": 0.20397178390322673, "grad_norm": 25.276575088500977, "learning_rate": 9.226893485559146e-06, "loss": 5.8884, "step": 720 }, { "epoch": 0.2068047253463271, "grad_norm": 45.13107681274414, "learning_rate": 9.202203150174836e-06, "loss": 9.215, "step": 730 }, { "epoch": 0.20963766678942747, "grad_norm": 26.52821922302246, "learning_rate": 9.177158850099099e-06, "loss": 5.7232, "step": 740 }, { "epoch": 0.21247060823252784, "grad_norm": 43.29339599609375, "learning_rate": 9.151762694891522e-06, "loss": 6.7846, "step": 750 }, { "epoch": 0.2153035496756282, "grad_norm": 29.308732986450195, "learning_rate": 9.12601682374955e-06, "loss": 5.8371, "step": 760 }, { "epoch": 0.21813649111872857, "grad_norm": 25.373172760009766, "learning_rate": 9.099923405328293e-06, "loss": 3.9846, "step": 770 }, { "epoch": 0.22096943256182894, "grad_norm": 33.12062454223633, "learning_rate": 9.073484637557852e-06, "loss": 4.8174, "step": 780 }, { "epoch": 0.2238023740049293, "grad_norm": 24.834850311279297, "learning_rate": 9.046702747458186e-06, "loss": 5.8073, "step": 790 }, { "epoch": 0.2266353154480297, "grad_norm": 23.760942459106445, "learning_rate": 9.019579990951514e-06, "loss": 5.6668, "step": 800 }, { "epoch": 0.22946825689113007, "grad_norm": 25.230995178222656, "learning_rate": 8.992118652672302e-06, "loss": 5.6386, "step": 810 }, { "epoch": 0.23230119833423044, "grad_norm": 27.033655166625977, "learning_rate": 8.964321045774808e-06, "loss": 5.1316, "step": 820 }, { "epoch": 0.2351341397773308, "grad_norm": 36.77193832397461, "learning_rate": 8.936189511738254e-06, "loss": 5.0568, "step": 830 }, { "epoch": 0.23796708122043117, "grad_norm": 21.841785430908203, "learning_rate": 8.907726420169583e-06, "loss": 5.5521, "step": 840 }, { "epoch": 0.24080002266353154, "grad_norm": 46.69823455810547, "learning_rate": 8.878934168603865e-06, "loss": 6.7058, "step": 850 }, { "epoch": 0.2436329641066319, "grad_norm": 23.228717803955078, "learning_rate": 8.849815182302345e-06, "loss": 7.8944, "step": 860 }, { "epoch": 0.2464659055497323, "grad_norm": 33.86655807495117, "learning_rate": 8.820371914048153e-06, "loss": 4.8468, "step": 870 }, { "epoch": 0.24929884699283267, "grad_norm": 25.241182327270508, "learning_rate": 8.790606843939705e-06, "loss": 3.709, "step": 880 }, { "epoch": 0.25213178843593304, "grad_norm": 25.6811580657959, "learning_rate": 8.760522479181784e-06, "loss": 4.844, "step": 890 }, { "epoch": 0.2549647298790334, "grad_norm": 43.929115295410156, "learning_rate": 8.730121353874365e-06, "loss": 6.7687, "step": 900 }, { "epoch": 0.25779767132213377, "grad_norm": 25.351106643676758, "learning_rate": 8.69940602879915e-06, "loss": 3.7733, "step": 910 }, { "epoch": 0.26063061276523414, "grad_norm": 45.543373107910156, "learning_rate": 8.66837909120387e-06, "loss": 6.5226, "step": 920 }, { "epoch": 0.2634635542083345, "grad_norm": 35.3692626953125, "learning_rate": 8.637043154584351e-06, "loss": 7.782, "step": 930 }, { "epoch": 0.2662964956514349, "grad_norm": 23.175140380859375, "learning_rate": 8.60540085846437e-06, "loss": 3.7581, "step": 940 }, { "epoch": 0.26912943709453524, "grad_norm": 22.461284637451172, "learning_rate": 8.573454868173325e-06, "loss": 3.8114, "step": 950 }, { "epoch": 0.2719623785376356, "grad_norm": 30.49061393737793, "learning_rate": 8.541207874621718e-06, "loss": 5.6752, "step": 960 }, { "epoch": 0.274795319980736, "grad_norm": 43.03390121459961, "learning_rate": 8.508662594074496e-06, "loss": 5.8459, "step": 970 }, { "epoch": 0.27762826142383634, "grad_norm": 144.74916076660156, "learning_rate": 8.475821767922254e-06, "loss": 6.1604, "step": 980 }, { "epoch": 0.28046120286693677, "grad_norm": 26.303081512451172, "learning_rate": 8.442688162450315e-06, "loss": 4.7632, "step": 990 }, { "epoch": 0.28329414431003713, "grad_norm": 28.066007614135742, "learning_rate": 8.409264568605714e-06, "loss": 8.0062, "step": 1000 }, { "epoch": 0.2861270857531375, "grad_norm": 23.32230567932129, "learning_rate": 8.375553801762119e-06, "loss": 3.9505, "step": 1010 }, { "epoch": 0.28896002719623787, "grad_norm": 53.46368408203125, "learning_rate": 8.34155870148267e-06, "loss": 4.7727, "step": 1020 }, { "epoch": 0.29179296863933823, "grad_norm": 28.557096481323242, "learning_rate": 8.307282131280805e-06, "loss": 3.876, "step": 1030 }, { "epoch": 0.2946259100824386, "grad_norm": 26.72674560546875, "learning_rate": 8.272726978379049e-06, "loss": 3.6362, "step": 1040 }, { "epoch": 0.29745885152553897, "grad_norm": 33.64091110229492, "learning_rate": 8.23789615346582e-06, "loss": 4.8435, "step": 1050 }, { "epoch": 0.30029179296863934, "grad_norm": 25.513519287109375, "learning_rate": 8.202792590450246e-06, "loss": 4.8615, "step": 1060 }, { "epoch": 0.3031247344117397, "grad_norm": 26.183082580566406, "learning_rate": 8.167419246215042e-06, "loss": 3.7897, "step": 1070 }, { "epoch": 0.30595767585484007, "grad_norm": 37.91279983520508, "learning_rate": 8.131779100367438e-06, "loss": 3.8092, "step": 1080 }, { "epoch": 0.30879061729794044, "grad_norm": 20.643037796020508, "learning_rate": 8.09587515498819e-06, "loss": 5.8217, "step": 1090 }, { "epoch": 0.3116235587410408, "grad_norm": 24.009424209594727, "learning_rate": 8.059710434378717e-06, "loss": 4.6594, "step": 1100 }, { "epoch": 0.3144565001841412, "grad_norm": 26.472389221191406, "learning_rate": 8.02328798480635e-06, "loss": 6.5989, "step": 1110 }, { "epoch": 0.31728944162724154, "grad_norm": 42.69245529174805, "learning_rate": 7.986610874247736e-06, "loss": 6.7969, "step": 1120 }, { "epoch": 0.3201223830703419, "grad_norm": 20.50579833984375, "learning_rate": 7.949682192130407e-06, "loss": 7.532, "step": 1130 }, { "epoch": 0.32295532451344233, "grad_norm": 33.428890228271484, "learning_rate": 7.912505049072559e-06, "loss": 5.5098, "step": 1140 }, { "epoch": 0.3257882659565427, "grad_norm": 33.04521560668945, "learning_rate": 7.875082576621024e-06, "loss": 5.7852, "step": 1150 }, { "epoch": 0.32862120739964307, "grad_norm": 29.72992706298828, "learning_rate": 7.837417926987496e-06, "loss": 3.8586, "step": 1160 }, { "epoch": 0.33145414884274343, "grad_norm": 22.467132568359375, "learning_rate": 7.799514272783014e-06, "loss": 5.6287, "step": 1170 }, { "epoch": 0.3342870902858438, "grad_norm": 25.866819381713867, "learning_rate": 7.761374806750712e-06, "loss": 3.7462, "step": 1180 }, { "epoch": 0.33712003172894417, "grad_norm": 21.801698684692383, "learning_rate": 7.723002741496892e-06, "loss": 5.6068, "step": 1190 }, { "epoch": 0.33995297317204454, "grad_norm": 41.93526840209961, "learning_rate": 7.684401309220416e-06, "loss": 5.8573, "step": 1200 }, { "epoch": 0.3427859146151449, "grad_norm": 23.245235443115234, "learning_rate": 7.645573761440444e-06, "loss": 4.5851, "step": 1210 }, { "epoch": 0.34561885605824527, "grad_norm": 24.502330780029297, "learning_rate": 7.606523368722554e-06, "loss": 6.4644, "step": 1220 }, { "epoch": 0.34845179750134564, "grad_norm": 16.042354583740234, "learning_rate": 7.567253420403249e-06, "loss": 5.6877, "step": 1230 }, { "epoch": 0.351284738944446, "grad_norm": 26.405628204345703, "learning_rate": 7.527767224312883e-06, "loss": 4.764, "step": 1240 }, { "epoch": 0.3541176803875464, "grad_norm": 40.40938186645508, "learning_rate": 7.488068106497035e-06, "loss": 5.8002, "step": 1250 }, { "epoch": 0.35695062183064674, "grad_norm": 25.338321685791016, "learning_rate": 7.448159410936348e-06, "loss": 5.5113, "step": 1260 }, { "epoch": 0.3597835632737471, "grad_norm": 39.411128997802734, "learning_rate": 7.4080444992648534e-06, "loss": 5.5444, "step": 1270 }, { "epoch": 0.3626165047168475, "grad_norm": 22.218137741088867, "learning_rate": 7.3677267504868055e-06, "loss": 4.4882, "step": 1280 }, { "epoch": 0.3654494461599479, "grad_norm": 43.15862274169922, "learning_rate": 7.327209560692063e-06, "loss": 6.6107, "step": 1290 }, { "epoch": 0.36828238760304827, "grad_norm": 20.51604652404785, "learning_rate": 7.2864963427700284e-06, "loss": 5.6351, "step": 1300 }, { "epoch": 0.37111532904614863, "grad_norm": 21.392065048217773, "learning_rate": 7.2455905261221585e-06, "loss": 5.7755, "step": 1310 }, { "epoch": 0.373948270489249, "grad_norm": 28.160072326660156, "learning_rate": 7.204495556373106e-06, "loss": 6.5779, "step": 1320 }, { "epoch": 0.37678121193234937, "grad_norm": 41.15205764770508, "learning_rate": 7.163214895080479e-06, "loss": 6.4435, "step": 1330 }, { "epoch": 0.37961415337544974, "grad_norm": 26.100757598876953, "learning_rate": 7.121752019443266e-06, "loss": 6.5864, "step": 1340 }, { "epoch": 0.3824470948185501, "grad_norm": 41.462791442871094, "learning_rate": 7.080110422008937e-06, "loss": 5.6488, "step": 1350 }, { "epoch": 0.38528003626165047, "grad_norm": 22.369388580322266, "learning_rate": 7.038293610379255e-06, "loss": 4.4922, "step": 1360 }, { "epoch": 0.38811297770475084, "grad_norm": 19.927444458007812, "learning_rate": 6.996305106914824e-06, "loss": 4.5791, "step": 1370 }, { "epoch": 0.3909459191478512, "grad_norm": 25.871030807495117, "learning_rate": 6.954148448438389e-06, "loss": 4.5578, "step": 1380 }, { "epoch": 0.3937788605909516, "grad_norm": 18.420751571655273, "learning_rate": 6.911827185936914e-06, "loss": 4.6252, "step": 1390 }, { "epoch": 0.39661180203405194, "grad_norm": 27.263010025024414, "learning_rate": 6.869344884262473e-06, "loss": 5.5235, "step": 1400 }, { "epoch": 0.3994447434771523, "grad_norm": 24.479764938354492, "learning_rate": 6.8267051218319766e-06, "loss": 5.6514, "step": 1410 }, { "epoch": 0.4022776849202527, "grad_norm": 23.21695899963379, "learning_rate": 6.7839114903257404e-06, "loss": 7.5326, "step": 1420 }, { "epoch": 0.40511062636335304, "grad_norm": 21.287368774414062, "learning_rate": 6.74096759438496e-06, "loss": 3.895, "step": 1430 }, { "epoch": 0.40794356780645347, "grad_norm": 25.839454650878906, "learning_rate": 6.697877051308067e-06, "loss": 6.3928, "step": 1440 }, { "epoch": 0.41077650924955383, "grad_norm": 22.896682739257812, "learning_rate": 6.654643490746042e-06, "loss": 4.5232, "step": 1450 }, { "epoch": 0.4136094506926542, "grad_norm": 25.252422332763672, "learning_rate": 6.611270554396676e-06, "loss": 6.5998, "step": 1460 }, { "epoch": 0.41644239213575457, "grad_norm": 24.610836029052734, "learning_rate": 6.567761895697816e-06, "loss": 4.6121, "step": 1470 }, { "epoch": 0.41927533357885494, "grad_norm": 19.092580795288086, "learning_rate": 6.524121179519625e-06, "loss": 3.6029, "step": 1480 }, { "epoch": 0.4221082750219553, "grad_norm": 22.915136337280273, "learning_rate": 6.480352081855884e-06, "loss": 3.6352, "step": 1490 }, { "epoch": 0.42494121646505567, "grad_norm": 29.044233322143555, "learning_rate": 6.436458289514342e-06, "loss": 4.6979, "step": 1500 }, { "epoch": 0.42777415790815604, "grad_norm": 39.80937194824219, "learning_rate": 6.392443499806175e-06, "loss": 4.6673, "step": 1510 }, { "epoch": 0.4306070993512564, "grad_norm": 22.760765075683594, "learning_rate": 6.348311420234542e-06, "loss": 4.6801, "step": 1520 }, { "epoch": 0.4334400407943568, "grad_norm": 21.216337203979492, "learning_rate": 6.304065768182295e-06, "loss": 5.7451, "step": 1530 }, { "epoch": 0.43627298223745714, "grad_norm": 20.622943878173828, "learning_rate": 6.259710270598848e-06, "loss": 5.6216, "step": 1540 }, { "epoch": 0.4391059236805575, "grad_norm": 40.299949645996094, "learning_rate": 6.215248663686251e-06, "loss": 6.5508, "step": 1550 }, { "epoch": 0.4419388651236579, "grad_norm": 28.81671905517578, "learning_rate": 6.170684692584469e-06, "loss": 3.5039, "step": 1560 }, { "epoch": 0.44477180656675824, "grad_norm": 43.010169982910156, "learning_rate": 6.126022111055929e-06, "loss": 6.4925, "step": 1570 }, { "epoch": 0.4476047480098586, "grad_norm": 23.351240158081055, "learning_rate": 6.081264681169317e-06, "loss": 3.4456, "step": 1580 }, { "epoch": 0.45043768945295903, "grad_norm": 40.19292449951172, "learning_rate": 6.0364161729826905e-06, "loss": 4.4953, "step": 1590 }, { "epoch": 0.4532706308960594, "grad_norm": 25.595369338989258, "learning_rate": 5.991480364225924e-06, "loss": 6.2619, "step": 1600 }, { "epoch": 0.45610357233915977, "grad_norm": 32.5233268737793, "learning_rate": 5.946461039982485e-06, "loss": 5.5702, "step": 1610 }, { "epoch": 0.45893651378226014, "grad_norm": 25.565658569335938, "learning_rate": 5.901361992370614e-06, "loss": 3.5389, "step": 1620 }, { "epoch": 0.4617694552253605, "grad_norm": 21.443763732910156, "learning_rate": 5.856187020223901e-06, "loss": 4.6532, "step": 1630 }, { "epoch": 0.46460239666846087, "grad_norm": 26.775903701782227, "learning_rate": 5.8109399287712935e-06, "loss": 5.7745, "step": 1640 }, { "epoch": 0.46743533811156124, "grad_norm": 20.02845001220703, "learning_rate": 5.765624529316573e-06, "loss": 5.506, "step": 1650 }, { "epoch": 0.4702682795546616, "grad_norm": 22.177770614624023, "learning_rate": 5.7202446389173225e-06, "loss": 3.5255, "step": 1660 }, { "epoch": 0.473101220997762, "grad_norm": 27.885957717895508, "learning_rate": 5.674804080063392e-06, "loss": 3.5088, "step": 1670 }, { "epoch": 0.47593416244086234, "grad_norm": 33.34544372558594, "learning_rate": 5.62930668035493e-06, "loss": 4.4746, "step": 1680 }, { "epoch": 0.4787671038839627, "grad_norm": 24.865848541259766, "learning_rate": 5.5837562721799644e-06, "loss": 6.4182, "step": 1690 }, { "epoch": 0.4816000453270631, "grad_norm": 20.06027603149414, "learning_rate": 5.538156692391592e-06, "loss": 3.499, "step": 1700 }, { "epoch": 0.48443298677016344, "grad_norm": 28.240829467773438, "learning_rate": 5.4925117819847925e-06, "loss": 5.4651, "step": 1710 }, { "epoch": 0.4872659282132638, "grad_norm": 39.07200241088867, "learning_rate": 5.44682538577288e-06, "loss": 4.7134, "step": 1720 }, { "epoch": 0.4900988696563642, "grad_norm": 31.383825302124023, "learning_rate": 5.4011013520636466e-06, "loss": 4.4705, "step": 1730 }, { "epoch": 0.4929318110994646, "grad_norm": 40.832984924316406, "learning_rate": 5.355343532335215e-06, "loss": 7.2469, "step": 1740 }, { "epoch": 0.49576475254256497, "grad_norm": 20.33405303955078, "learning_rate": 5.309555780911604e-06, "loss": 5.4482, "step": 1750 }, { "epoch": 0.49859769398566534, "grad_norm": 22.8585262298584, "learning_rate": 5.263741954638072e-06, "loss": 4.4573, "step": 1760 }, { "epoch": 0.5014306354287656, "grad_norm": 42.46244430541992, "learning_rate": 5.217905912556248e-06, "loss": 5.5277, "step": 1770 }, { "epoch": 0.5042635768718661, "grad_norm": 21.30562973022461, "learning_rate": 5.172051515579065e-06, "loss": 5.4764, "step": 1780 }, { "epoch": 0.5070965183149664, "grad_norm": 18.9359130859375, "learning_rate": 5.126182626165547e-06, "loss": 6.4232, "step": 1790 }, { "epoch": 0.5099294597580668, "grad_norm": 33.8026123046875, "learning_rate": 5.080303107995461e-06, "loss": 6.6042, "step": 1800 }, { "epoch": 0.5127624012011671, "grad_norm": 40.52323913574219, "learning_rate": 5.034416825643868e-06, "loss": 5.5848, "step": 1810 }, { "epoch": 0.5155953426442675, "grad_norm": 27.342744827270508, "learning_rate": 4.988527644255591e-06, "loss": 5.2504, "step": 1820 }, { "epoch": 0.518428284087368, "grad_norm": 19.118297576904297, "learning_rate": 4.942639429219661e-06, "loss": 4.5668, "step": 1830 }, { "epoch": 0.5212612255304683, "grad_norm": 41.146236419677734, "learning_rate": 4.896756045843698e-06, "loss": 6.0831, "step": 1840 }, { "epoch": 0.5240941669735687, "grad_norm": 19.685937881469727, "learning_rate": 4.85088135902834e-06, "loss": 5.5025, "step": 1850 }, { "epoch": 0.526927108416669, "grad_norm": 22.97096061706543, "learning_rate": 4.805019232941689e-06, "loss": 4.4157, "step": 1860 }, { "epoch": 0.5297600498597694, "grad_norm": 27.627784729003906, "learning_rate": 4.7591735306938144e-06, "loss": 4.3861, "step": 1870 }, { "epoch": 0.5325929913028697, "grad_norm": 25.308032989501953, "learning_rate": 4.713348114011357e-06, "loss": 7.2963, "step": 1880 }, { "epoch": 0.5354259327459702, "grad_norm": 19.11351203918457, "learning_rate": 4.667546842912239e-06, "loss": 4.2907, "step": 1890 }, { "epoch": 0.5382588741890705, "grad_norm": 28.81739044189453, "learning_rate": 4.6217735753805235e-06, "loss": 4.5385, "step": 1900 }, { "epoch": 0.5410918156321709, "grad_norm": 20.510547637939453, "learning_rate": 4.576032167041452e-06, "loss": 7.2043, "step": 1910 }, { "epoch": 0.5439247570752712, "grad_norm": 26.19765281677246, "learning_rate": 4.530326470836659e-06, "loss": 4.3494, "step": 1920 }, { "epoch": 0.5467576985183716, "grad_norm": 25.779802322387695, "learning_rate": 4.484660336699638e-06, "loss": 5.3226, "step": 1930 }, { "epoch": 0.549590639961472, "grad_norm": 26.97022247314453, "learning_rate": 4.439037611231448e-06, "loss": 6.5069, "step": 1940 }, { "epoch": 0.5524235814045724, "grad_norm": 26.32407569885254, "learning_rate": 4.393462137376696e-06, "loss": 3.545, "step": 1950 }, { "epoch": 0.5552565228476727, "grad_norm": 30.962535858154297, "learning_rate": 4.347937754099841e-06, "loss": 4.4292, "step": 1960 }, { "epoch": 0.5580894642907731, "grad_norm": 38.1851921081543, "learning_rate": 4.302468296061823e-06, "loss": 4.3079, "step": 1970 }, { "epoch": 0.5609224057338735, "grad_norm": 21.038278579711914, "learning_rate": 4.257057593297055e-06, "loss": 4.5294, "step": 1980 }, { "epoch": 0.5637553471769738, "grad_norm": 20.618942260742188, "learning_rate": 4.211709470890815e-06, "loss": 7.2449, "step": 1990 }, { "epoch": 0.5665882886200743, "grad_norm": 21.230995178222656, "learning_rate": 4.166427748657034e-06, "loss": 4.3681, "step": 2000 }, { "epoch": 0.5694212300631746, "grad_norm": 20.577428817749023, "learning_rate": 4.121216240816559e-06, "loss": 5.3925, "step": 2010 }, { "epoch": 0.572254171506275, "grad_norm": 21.1496524810791, "learning_rate": 4.076078755675852e-06, "loss": 5.0495, "step": 2020 }, { "epoch": 0.5750871129493753, "grad_norm": 26.215744018554688, "learning_rate": 4.0310190953062155e-06, "loss": 5.5832, "step": 2030 }, { "epoch": 0.5779200543924757, "grad_norm": 33.668174743652344, "learning_rate": 3.986041055223526e-06, "loss": 5.1639, "step": 2040 }, { "epoch": 0.580752995835576, "grad_norm": 28.786453247070312, "learning_rate": 3.9411484240685315e-06, "loss": 3.3797, "step": 2050 }, { "epoch": 0.5835859372786765, "grad_norm": 24.81963348388672, "learning_rate": 3.8963449832877164e-06, "loss": 6.3189, "step": 2060 }, { "epoch": 0.5864188787217768, "grad_norm": 25.143753051757812, "learning_rate": 3.851634506814782e-06, "loss": 6.463, "step": 2070 }, { "epoch": 0.5892518201648772, "grad_norm": 39.29959487915039, "learning_rate": 3.8070207607527587e-06, "loss": 7.5255, "step": 2080 }, { "epoch": 0.5920847616079775, "grad_norm": 19.030284881591797, "learning_rate": 3.7625075030567683e-06, "loss": 4.2513, "step": 2090 }, { "epoch": 0.5949177030510779, "grad_norm": 24.105989456176758, "learning_rate": 3.718098483217484e-06, "loss": 3.3586, "step": 2100 }, { "epoch": 0.5977506444941783, "grad_norm": 38.95778274536133, "learning_rate": 3.673797441945304e-06, "loss": 4.2773, "step": 2110 }, { "epoch": 0.6005835859372787, "grad_norm": 42.26526641845703, "learning_rate": 3.629608110855248e-06, "loss": 5.2586, "step": 2120 }, { "epoch": 0.6034165273803791, "grad_norm": 21.60348892211914, "learning_rate": 3.585534212152643e-06, "loss": 4.4408, "step": 2130 }, { "epoch": 0.6062494688234794, "grad_norm": 39.41062545776367, "learning_rate": 3.5415794583195846e-06, "loss": 4.5132, "step": 2140 }, { "epoch": 0.6090824102665798, "grad_norm": 32.25893783569336, "learning_rate": 3.497747551802221e-06, "loss": 8.4284, "step": 2150 }, { "epoch": 0.6119153517096801, "grad_norm": 34.556373596191406, "learning_rate": 3.4540421846988916e-06, "loss": 6.3801, "step": 2160 }, { "epoch": 0.6147482931527806, "grad_norm": 27.99374771118164, "learning_rate": 3.4104670384491234e-06, "loss": 5.5573, "step": 2170 }, { "epoch": 0.6175812345958809, "grad_norm": 23.997901916503906, "learning_rate": 3.367025783523534e-06, "loss": 4.2779, "step": 2180 }, { "epoch": 0.6204141760389813, "grad_norm": 41.76970291137695, "learning_rate": 3.3237220791146597e-06, "loss": 5.241, "step": 2190 }, { "epoch": 0.6232471174820816, "grad_norm": 27.922670364379883, "learning_rate": 3.2805595728287255e-06, "loss": 4.2649, "step": 2200 }, { "epoch": 0.626080058925182, "grad_norm": 33.54890060424805, "learning_rate": 3.2375419003783957e-06, "loss": 6.0635, "step": 2210 }, { "epoch": 0.6289130003682823, "grad_norm": 21.987178802490234, "learning_rate": 3.1946726852765325e-06, "loss": 5.1542, "step": 2220 }, { "epoch": 0.6317459418113828, "grad_norm": 35.2348518371582, "learning_rate": 3.1519555385309685e-06, "loss": 4.2332, "step": 2230 }, { "epoch": 0.6345788832544831, "grad_norm": 39.060691833496094, "learning_rate": 3.1093940583403447e-06, "loss": 8.0693, "step": 2240 }, { "epoch": 0.6374118246975835, "grad_norm": 20.76451873779297, "learning_rate": 3.066991829791024e-06, "loss": 5.3108, "step": 2250 }, { "epoch": 0.6402447661406838, "grad_norm": 40.92884826660156, "learning_rate": 3.024752424555105e-06, "loss": 4.2548, "step": 2260 }, { "epoch": 0.6430777075837842, "grad_norm": 24.043121337890625, "learning_rate": 2.982679400589569e-06, "loss": 5.3648, "step": 2270 }, { "epoch": 0.6459106490268847, "grad_norm": 22.929412841796875, "learning_rate": 2.9407763018365854e-06, "loss": 4.2817, "step": 2280 }, { "epoch": 0.648743590469985, "grad_norm": 36.0571174621582, "learning_rate": 2.899046657924992e-06, "loss": 7.9167, "step": 2290 }, { "epoch": 0.6515765319130854, "grad_norm": 23.849647521972656, "learning_rate": 2.8574939838729844e-06, "loss": 4.44, "step": 2300 }, { "epoch": 0.6544094733561857, "grad_norm": 42.65750503540039, "learning_rate": 2.8161217797920304e-06, "loss": 5.6655, "step": 2310 }, { "epoch": 0.6572424147992861, "grad_norm": 23.45660400390625, "learning_rate": 2.774933530592054e-06, "loss": 5.4841, "step": 2320 }, { "epoch": 0.6600753562423864, "grad_norm": 21.22451400756836, "learning_rate": 2.733932705687883e-06, "loss": 3.3468, "step": 2330 }, { "epoch": 0.6629082976854869, "grad_norm": 37.178993225097656, "learning_rate": 2.693122758707013e-06, "loss": 5.1606, "step": 2340 }, { "epoch": 0.6657412391285872, "grad_norm": 24.34912109375, "learning_rate": 2.652507127198689e-06, "loss": 7.2961, "step": 2350 }, { "epoch": 0.6685741805716876, "grad_norm": 40.61592483520508, "learning_rate": 2.612089232344371e-06, "loss": 6.3695, "step": 2360 }, { "epoch": 0.6714071220147879, "grad_norm": 20.37811279296875, "learning_rate": 2.571872478669528e-06, "loss": 3.3039, "step": 2370 }, { "epoch": 0.6742400634578883, "grad_norm": 25.745912551879883, "learning_rate": 2.5318602537568904e-06, "loss": 4.2973, "step": 2380 }, { "epoch": 0.6770730049009887, "grad_norm": 22.395126342773438, "learning_rate": 2.4920559279610886e-06, "loss": 4.1162, "step": 2390 }, { "epoch": 0.6799059463440891, "grad_norm": 34.32621383666992, "learning_rate": 2.452462854124758e-06, "loss": 4.1658, "step": 2400 }, { "epoch": 0.6827388877871894, "grad_norm": 39.03499984741211, "learning_rate": 2.413084367296127e-06, "loss": 6.3083, "step": 2410 }, { "epoch": 0.6855718292302898, "grad_norm": 47.788394927978516, "learning_rate": 2.373923784448089e-06, "loss": 4.2861, "step": 2420 }, { "epoch": 0.6884047706733902, "grad_norm": 26.90192413330078, "learning_rate": 2.3349844041988044e-06, "loss": 4.3008, "step": 2430 }, { "epoch": 0.6912377121164905, "grad_norm": 22.178869247436523, "learning_rate": 2.296269506533846e-06, "loss": 5.2767, "step": 2440 }, { "epoch": 0.694070653559591, "grad_norm": 21.529335021972656, "learning_rate": 2.2577823525299205e-06, "loss": 7.1097, "step": 2450 }, { "epoch": 0.6969035950026913, "grad_norm": 20.215675354003906, "learning_rate": 2.2195261840801757e-06, "loss": 7.1815, "step": 2460 }, { "epoch": 0.6997365364457917, "grad_norm": 21.300861358642578, "learning_rate": 2.18150422362112e-06, "loss": 6.9142, "step": 2470 }, { "epoch": 0.702569477888892, "grad_norm": 30.098453521728516, "learning_rate": 2.1437196738611958e-06, "loss": 4.4774, "step": 2480 }, { "epoch": 0.7054024193319924, "grad_norm": 25.317970275878906, "learning_rate": 2.1061757175110024e-06, "loss": 4.4772, "step": 2490 }, { "epoch": 0.7082353607750927, "grad_norm": 30.881681442260742, "learning_rate": 2.0688755170152e-06, "loss": 4.2296, "step": 2500 }, { "epoch": 0.7110683022181932, "grad_norm": 23.95901107788086, "learning_rate": 2.031822214286134e-06, "loss": 5.0405, "step": 2510 }, { "epoch": 0.7139012436612935, "grad_norm": 41.624210357666016, "learning_rate": 1.9950189304391855e-06, "loss": 6.3358, "step": 2520 }, { "epoch": 0.7167341851043939, "grad_norm": 34.76797866821289, "learning_rate": 1.958468765529853e-06, "loss": 5.061, "step": 2530 }, { "epoch": 0.7195671265474942, "grad_norm": 20.406444549560547, "learning_rate": 1.9221747982926493e-06, "loss": 5.1701, "step": 2540 }, { "epoch": 0.7224000679905946, "grad_norm": 24.22311782836914, "learning_rate": 1.8861400858817508e-06, "loss": 4.2621, "step": 2550 }, { "epoch": 0.725233009433695, "grad_norm": 37.65345001220703, "learning_rate": 1.8503676636134882e-06, "loss": 6.1661, "step": 2560 }, { "epoch": 0.7280659508767954, "grad_norm": 20.813777923583984, "learning_rate": 1.81486054471068e-06, "loss": 5.3045, "step": 2570 }, { "epoch": 0.7308988923198958, "grad_norm": 39.82976150512695, "learning_rate": 1.7796217200488114e-06, "loss": 6.4348, "step": 2580 }, { "epoch": 0.7337318337629961, "grad_norm": 25.495925903320312, "learning_rate": 1.7446541579041048e-06, "loss": 4.2349, "step": 2590 }, { "epoch": 0.7365647752060965, "grad_norm": 38.05914306640625, "learning_rate": 1.7099608037034953e-06, "loss": 5.2485, "step": 2600 }, { "epoch": 0.7393977166491968, "grad_norm": 22.876413345336914, "learning_rate": 1.6755445797765286e-06, "loss": 4.263, "step": 2610 }, { "epoch": 0.7422306580922973, "grad_norm": 22.402753829956055, "learning_rate": 1.6414083851091973e-06, "loss": 4.3153, "step": 2620 }, { "epoch": 0.7450635995353976, "grad_norm": 20.86781883239746, "learning_rate": 1.6075550950997592e-06, "loss": 4.4095, "step": 2630 }, { "epoch": 0.747896540978498, "grad_norm": 39.51744842529297, "learning_rate": 1.5739875613165283e-06, "loss": 6.2356, "step": 2640 }, { "epoch": 0.7507294824215983, "grad_norm": 26.651187896728516, "learning_rate": 1.5407086112576813e-06, "loss": 4.1033, "step": 2650 }, { "epoch": 0.7535624238646987, "grad_norm": 46.947757720947266, "learning_rate": 1.5077210481130815e-06, "loss": 8.1815, "step": 2660 }, { "epoch": 0.756395365307799, "grad_norm": 41.29295349121094, "learning_rate": 1.475027650528168e-06, "loss": 6.1637, "step": 2670 }, { "epoch": 0.7592283067508995, "grad_norm": 39.40729522705078, "learning_rate": 1.442631172369896e-06, "loss": 7.1273, "step": 2680 }, { "epoch": 0.7620612481939998, "grad_norm": 39.58256912231445, "learning_rate": 1.4105343424947654e-06, "loss": 5.187, "step": 2690 }, { "epoch": 0.7648941896371002, "grad_norm": 39.836185455322266, "learning_rate": 1.378739864518971e-06, "loss": 3.8889, "step": 2700 }, { "epoch": 0.7677271310802005, "grad_norm": 40.20053482055664, "learning_rate": 1.3472504165906614e-06, "loss": 5.3001, "step": 2710 }, { "epoch": 0.7705600725233009, "grad_norm": 23.571002960205078, "learning_rate": 1.3160686511643505e-06, "loss": 4.0238, "step": 2720 }, { "epoch": 0.7733930139664014, "grad_norm": 23.623443603515625, "learning_rate": 1.2851971947774987e-06, "loss": 5.1091, "step": 2730 }, { "epoch": 0.7762259554095017, "grad_norm": 31.367658615112305, "learning_rate": 1.2546386478292604e-06, "loss": 4.1048, "step": 2740 }, { "epoch": 0.7790588968526021, "grad_norm": 39.296226501464844, "learning_rate": 1.2243955843614558e-06, "loss": 4.271, "step": 2750 }, { "epoch": 0.7818918382957024, "grad_norm": 28.69118881225586, "learning_rate": 1.1944705518417466e-06, "loss": 4.0739, "step": 2760 }, { "epoch": 0.7847247797388028, "grad_norm": 32.27414321899414, "learning_rate": 1.1648660709490538e-06, "loss": 5.1998, "step": 2770 }, { "epoch": 0.7875577211819031, "grad_norm": 24.473217010498047, "learning_rate": 1.135584635361232e-06, "loss": 4.9601, "step": 2780 }, { "epoch": 0.7903906626250036, "grad_norm": 27.856367111206055, "learning_rate": 1.1066287115450242e-06, "loss": 4.9381, "step": 2790 }, { "epoch": 0.7932236040681039, "grad_norm": 17.640838623046875, "learning_rate": 1.0780007385483005e-06, "loss": 4.2145, "step": 2800 }, { "epoch": 0.7960565455112043, "grad_norm": 34.375091552734375, "learning_rate": 1.0497031277946062e-06, "loss": 8.3028, "step": 2810 }, { "epoch": 0.7988894869543046, "grad_norm": 23.346403121948242, "learning_rate": 1.0217382628800465e-06, "loss": 6.9337, "step": 2820 }, { "epoch": 0.801722428397405, "grad_norm": 25.259016036987305, "learning_rate": 9.94108499372507e-07, "loss": 3.1855, "step": 2830 }, { "epoch": 0.8045553698405054, "grad_norm": 33.022727966308594, "learning_rate": 9.668161646132296e-07, "loss": 5.2408, "step": 2840 }, { "epoch": 0.8073883112836058, "grad_norm": 30.2951717376709, "learning_rate": 9.398635575207854e-07, "loss": 3.1828, "step": 2850 }, { "epoch": 0.8102212527267061, "grad_norm": 51.273616790771484, "learning_rate": 9.132529483974217e-07, "loss": 5.0485, "step": 2860 }, { "epoch": 0.8130541941698065, "grad_norm": 39.878597259521484, "learning_rate": 8.869865787378262e-07, "loss": 6.3068, "step": 2870 }, { "epoch": 0.8158871356129069, "grad_norm": 21.064966201782227, "learning_rate": 8.61066661040324e-07, "loss": 3.3587, "step": 2880 }, { "epoch": 0.8187200770560072, "grad_norm": 22.18380355834961, "learning_rate": 8.354953786205133e-07, "loss": 4.242, "step": 2890 }, { "epoch": 0.8215530184991077, "grad_norm": 41.24013137817383, "learning_rate": 8.102748854273468e-07, "loss": 4.1017, "step": 2900 }, { "epoch": 0.824385959942208, "grad_norm": 23.30076789855957, "learning_rate": 7.854073058617112e-07, "loss": 5.3308, "step": 2910 }, { "epoch": 0.8272189013853084, "grad_norm": 21.42025375366211, "learning_rate": 7.60894734597476e-07, "loss": 8.113, "step": 2920 }, { "epoch": 0.8300518428284087, "grad_norm": 18.741073608398438, "learning_rate": 7.367392364050485e-07, "loss": 5.1848, "step": 2930 }, { "epoch": 0.8328847842715091, "grad_norm": 23.857194900512695, "learning_rate": 7.129428459774618e-07, "loss": 7.1581, "step": 2940 }, { "epoch": 0.8357177257146094, "grad_norm": 28.530094146728516, "learning_rate": 6.895075677589791e-07, "loss": 6.2661, "step": 2950 }, { "epoch": 0.8385506671577099, "grad_norm": 40.354949951171875, "learning_rate": 6.664353757762515e-07, "loss": 4.2647, "step": 2960 }, { "epoch": 0.8413836086008102, "grad_norm": 21.305288314819336, "learning_rate": 6.437282134720479e-07, "loss": 4.9122, "step": 2970 }, { "epoch": 0.8442165500439106, "grad_norm": 40.32603454589844, "learning_rate": 6.21387993541544e-07, "loss": 6.2095, "step": 2980 }, { "epoch": 0.8470494914870109, "grad_norm": 39.293067932128906, "learning_rate": 5.994165977712175e-07, "loss": 4.1365, "step": 2990 }, { "epoch": 0.8498824329301113, "grad_norm": 25.006118774414062, "learning_rate": 5.778158768803294e-07, "loss": 3.4504, "step": 3000 }, { "epoch": 0.8527153743732117, "grad_norm": 38.37477111816406, "learning_rate": 5.565876503650442e-07, "loss": 4.2214, "step": 3010 }, { "epoch": 0.8555483158163121, "grad_norm": 40.921207427978516, "learning_rate": 5.357337063451601e-07, "loss": 5.1103, "step": 3020 }, { "epoch": 0.8583812572594125, "grad_norm": 26.225017547607422, "learning_rate": 5.152558014134906e-07, "loss": 5.9913, "step": 3030 }, { "epoch": 0.8612141987025128, "grad_norm": 22.678930282592773, "learning_rate": 4.951556604879049e-07, "loss": 4.3731, "step": 3040 }, { "epoch": 0.8640471401456132, "grad_norm": 29.937393188476562, "learning_rate": 4.754349766660299e-07, "loss": 4.2301, "step": 3050 }, { "epoch": 0.8668800815887135, "grad_norm": 26.465091705322266, "learning_rate": 4.5609541108263377e-07, "loss": 6.0091, "step": 3060 }, { "epoch": 0.869713023031814, "grad_norm": 25.58681297302246, "learning_rate": 4.3713859276971026e-07, "loss": 6.979, "step": 3070 }, { "epoch": 0.8725459644749143, "grad_norm": 22.564706802368164, "learning_rate": 4.1856611851925245e-07, "loss": 5.0316, "step": 3080 }, { "epoch": 0.8753789059180147, "grad_norm": 40.97758102416992, "learning_rate": 4.003795527487508e-07, "loss": 8.964, "step": 3090 }, { "epoch": 0.878211847361115, "grad_norm": 32.80113220214844, "learning_rate": 3.8258042736942446e-07, "loss": 3.1517, "step": 3100 }, { "epoch": 0.8810447888042154, "grad_norm": 30.950176239013672, "learning_rate": 3.651702416571762e-07, "loss": 4.345, "step": 3110 }, { "epoch": 0.8838777302473158, "grad_norm": 21.242090225219727, "learning_rate": 3.481504621263049e-07, "loss": 6.1642, "step": 3120 }, { "epoch": 0.8867106716904162, "grad_norm": 21.0790958404541, "learning_rate": 3.315225224059809e-07, "loss": 5.1734, "step": 3130 }, { "epoch": 0.8895436131335165, "grad_norm": 41.8050537109375, "learning_rate": 3.1528782311948226e-07, "loss": 5.0608, "step": 3140 }, { "epoch": 0.8923765545766169, "grad_norm": 23.527942657470703, "learning_rate": 2.9944773176621756e-07, "loss": 5.9961, "step": 3150 }, { "epoch": 0.8952094960197172, "grad_norm": 28.754201889038086, "learning_rate": 2.840035826065368e-07, "loss": 3.8781, "step": 3160 }, { "epoch": 0.8980424374628176, "grad_norm": 26.580829620361328, "learning_rate": 2.689566765493451e-07, "loss": 4.1426, "step": 3170 }, { "epoch": 0.9008753789059181, "grad_norm": 18.550945281982422, "learning_rate": 2.5430828104251684e-07, "loss": 4.9139, "step": 3180 }, { "epoch": 0.9037083203490184, "grad_norm": 20.301895141601562, "learning_rate": 2.4005962996614174e-07, "loss": 3.1654, "step": 3190 }, { "epoch": 0.9065412617921188, "grad_norm": 37.804969787597656, "learning_rate": 2.2621192352858702e-07, "loss": 5.0736, "step": 3200 }, { "epoch": 0.9093742032352191, "grad_norm": 29.193897247314453, "learning_rate": 2.1276632816540077e-07, "loss": 5.2175, "step": 3210 }, { "epoch": 0.9122071446783195, "grad_norm": 23.378320693969727, "learning_rate": 1.9972397644106023e-07, "loss": 5.2508, "step": 3220 }, { "epoch": 0.9150400861214198, "grad_norm": 18.92923355102539, "learning_rate": 1.870859669535724e-07, "loss": 5.2554, "step": 3230 }, { "epoch": 0.9178730275645203, "grad_norm": 18.03963279724121, "learning_rate": 1.7485336424193366e-07, "loss": 5.1253, "step": 3240 }, { "epoch": 0.9207059690076206, "grad_norm": 38.961456298828125, "learning_rate": 1.6302719869646432e-07, "loss": 5.0659, "step": 3250 }, { "epoch": 0.923538910450721, "grad_norm": 20.624431610107422, "learning_rate": 1.5160846647201132e-07, "loss": 4.1776, "step": 3260 }, { "epoch": 0.9263718518938213, "grad_norm": 21.755279541015625, "learning_rate": 1.4059812940404093e-07, "loss": 3.142, "step": 3270 }, { "epoch": 0.9292047933369217, "grad_norm": 20.076051712036133, "learning_rate": 1.2999711492762079e-07, "loss": 5.2161, "step": 3280 }, { "epoch": 0.932037734780022, "grad_norm": 25.841142654418945, "learning_rate": 1.198063159992996e-07, "loss": 5.3184, "step": 3290 }, { "epoch": 0.9348706762231225, "grad_norm": 35.23577117919922, "learning_rate": 1.1002659102188784e-07, "loss": 3.3098, "step": 3300 }, { "epoch": 0.9377036176662228, "grad_norm": 20.789785385131836, "learning_rate": 1.006587637721551e-07, "loss": 3.1742, "step": 3310 }, { "epoch": 0.9405365591093232, "grad_norm": 40.675296783447266, "learning_rate": 9.170362333143778e-08, "loss": 7.3385, "step": 3320 }, { "epoch": 0.9433695005524236, "grad_norm": 24.71589469909668, "learning_rate": 8.316192401917667e-08, "loss": 5.3478, "step": 3330 }, { "epoch": 0.946202441995524, "grad_norm": 38.48093032836914, "learning_rate": 7.503438532937169e-08, "loss": 6.069, "step": 3340 }, { "epoch": 0.9490353834386244, "grad_norm": 26.636127471923828, "learning_rate": 6.732169186998372e-08, "loss": 4.1179, "step": 3350 }, { "epoch": 0.9518683248817247, "grad_norm": 42.95631790161133, "learning_rate": 6.002449330526294e-08, "loss": 6.9268, "step": 3360 }, { "epoch": 0.9547012663248251, "grad_norm": 20.64594268798828, "learning_rate": 5.31434043010276e-08, "loss": 3.1192, "step": 3370 }, { "epoch": 0.9575342077679254, "grad_norm": 44.144744873046875, "learning_rate": 4.667900447288931e-08, "loss": 6.0163, "step": 3380 }, { "epoch": 0.9603671492110258, "grad_norm": 41.0361442565918, "learning_rate": 4.0631838337427675e-08, "loss": 5.265, "step": 3390 }, { "epoch": 0.9632000906541262, "grad_norm": 28.538305282592773, "learning_rate": 3.500241526632753e-08, "loss": 5.07, "step": 3400 }, { "epoch": 0.9660330320972266, "grad_norm": 24.23358154296875, "learning_rate": 2.979120944346936e-08, "loss": 4.3623, "step": 3410 }, { "epoch": 0.9688659735403269, "grad_norm": 26.74643898010254, "learning_rate": 2.499865982499128e-08, "loss": 3.3373, "step": 3420 }, { "epoch": 0.9716989149834273, "grad_norm": 22.635358810424805, "learning_rate": 2.0625170102309687e-08, "loss": 4.2529, "step": 3430 }, { "epoch": 0.9745318564265276, "grad_norm": 25.663415908813477, "learning_rate": 1.6671108668119828e-08, "loss": 4.1368, "step": 3440 }, { "epoch": 0.977364797869628, "grad_norm": 28.40155029296875, "learning_rate": 1.3136808585361149e-08, "loss": 5.9535, "step": 3450 }, { "epoch": 0.9801977393127284, "grad_norm": 21.457969665527344, "learning_rate": 1.0022567559164198e-08, "loss": 6.1661, "step": 3460 }, { "epoch": 0.9830306807558288, "grad_norm": 26.77224349975586, "learning_rate": 7.328647911774567e-09, "loss": 4.1479, "step": 3470 }, { "epoch": 0.9858636221989292, "grad_norm": 34.7308464050293, "learning_rate": 5.055276560454459e-09, "loss": 3.216, "step": 3480 }, { "epoch": 0.9886965636420295, "grad_norm": 27.362573623657227, "learning_rate": 3.202644998370752e-09, "loss": 5.132, "step": 3490 }, { "epoch": 0.9915295050851299, "grad_norm": 33.32588195800781, "learning_rate": 1.770909278464017e-09, "loss": 6.1225, "step": 3500 }, { "epoch": 0.9943624465282302, "grad_norm": 19.929155349731445, "learning_rate": 7.601900003051388e-10, "loss": 3.1458, "step": 3510 }, { "epoch": 0.9971953879713307, "grad_norm": 26.834556579589844, "learning_rate": 1.7057229993344693e-10, "loss": 7.2351, "step": 3520 }, { "epoch": 0.9997450352701209, "step": 3529, "total_flos": 1.1540349697243742e+19, "train_loss": 5.616373471816494, "train_runtime": 84659.9545, "train_samples_per_second": 2.502, "train_steps_per_second": 0.042 } ], "logging_steps": 10, "max_steps": 3529, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1540349697243742e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }