SpatialStack / Order_Ablations /E116 /trainer_state.json
Journey9ni's picture
Upload Order_Ablations without checkpoints
19d4cfa verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997450352701209,
"eval_steps": 500,
"global_step": 3529,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002832941443100371,
"grad_norm": 174.56146240234375,
"learning_rate": 9.433962264150944e-07,
"loss": 17.5632,
"step": 10
},
{
"epoch": 0.005665882886200742,
"grad_norm": 99.76522064208984,
"learning_rate": 1.8867924528301889e-06,
"loss": 16.5212,
"step": 20
},
{
"epoch": 0.008498824329301113,
"grad_norm": 61.843570709228516,
"learning_rate": 2.830188679245283e-06,
"loss": 14.66,
"step": 30
},
{
"epoch": 0.011331765772401484,
"grad_norm": 58.26314926147461,
"learning_rate": 3.7735849056603777e-06,
"loss": 12.8877,
"step": 40
},
{
"epoch": 0.014164707215501856,
"grad_norm": 64.67306518554688,
"learning_rate": 4.716981132075472e-06,
"loss": 12.6102,
"step": 50
},
{
"epoch": 0.016997648658602225,
"grad_norm": 40.95338439941406,
"learning_rate": 5.660377358490566e-06,
"loss": 10.3169,
"step": 60
},
{
"epoch": 0.019830590101702596,
"grad_norm": 59.99547576904297,
"learning_rate": 6.60377358490566e-06,
"loss": 10.4793,
"step": 70
},
{
"epoch": 0.022663531544802967,
"grad_norm": 69.4089584350586,
"learning_rate": 7.5471698113207555e-06,
"loss": 10.2298,
"step": 80
},
{
"epoch": 0.02549647298790334,
"grad_norm": 38.2027702331543,
"learning_rate": 8.49056603773585e-06,
"loss": 8.5679,
"step": 90
},
{
"epoch": 0.028329414431003713,
"grad_norm": 41.05867004394531,
"learning_rate": 9.433962264150944e-06,
"loss": 8.6834,
"step": 100
},
{
"epoch": 0.031162355874104083,
"grad_norm": 45.03547286987305,
"learning_rate": 9.999966306552455e-06,
"loss": 7.8071,
"step": 110
},
{
"epoch": 0.03399529731720445,
"grad_norm": 43.03623580932617,
"learning_rate": 9.999587260482597e-06,
"loss": 8.2945,
"step": 120
},
{
"epoch": 0.036828238760304825,
"grad_norm": 37.273048400878906,
"learning_rate": 9.998787083568112e-06,
"loss": 8.0273,
"step": 130
},
{
"epoch": 0.03966118020340519,
"grad_norm": 55.94173049926758,
"learning_rate": 9.997565843210401e-06,
"loss": 7.1597,
"step": 140
},
{
"epoch": 0.04249412164650557,
"grad_norm": 50.30799102783203,
"learning_rate": 9.995923642278351e-06,
"loss": 7.2908,
"step": 150
},
{
"epoch": 0.045327063089605935,
"grad_norm": 53.113243103027344,
"learning_rate": 9.993860619099673e-06,
"loss": 5.9006,
"step": 160
},
{
"epoch": 0.04816000453270631,
"grad_norm": 51.57769012451172,
"learning_rate": 9.991376947449254e-06,
"loss": 8.8304,
"step": 170
},
{
"epoch": 0.05099294597580668,
"grad_norm": 38.17790985107422,
"learning_rate": 9.988472836534509e-06,
"loss": 8.6644,
"step": 180
},
{
"epoch": 0.05382588741890705,
"grad_norm": 34.28203582763672,
"learning_rate": 9.985148530977767e-06,
"loss": 6.6272,
"step": 190
},
{
"epoch": 0.056658828862007425,
"grad_norm": 34.18405532836914,
"learning_rate": 9.981404310795667e-06,
"loss": 8.2372,
"step": 200
},
{
"epoch": 0.05949177030510779,
"grad_norm": 40.83757400512695,
"learning_rate": 9.97724049137556e-06,
"loss": 5.3928,
"step": 210
},
{
"epoch": 0.06232471174820817,
"grad_norm": 36.30077362060547,
"learning_rate": 9.972657423448961e-06,
"loss": 6.5196,
"step": 220
},
{
"epoch": 0.06515765319130853,
"grad_norm": 38.03015899658203,
"learning_rate": 9.96765549306199e-06,
"loss": 7.1923,
"step": 230
},
{
"epoch": 0.0679905946344089,
"grad_norm": 42.84525680541992,
"learning_rate": 9.962235121542858e-06,
"loss": 6.9849,
"step": 240
},
{
"epoch": 0.07082353607750928,
"grad_norm": 34.071800231933594,
"learning_rate": 9.956396765466382e-06,
"loss": 7.0978,
"step": 250
},
{
"epoch": 0.07365647752060965,
"grad_norm": 37.807029724121094,
"learning_rate": 9.950140916615526e-06,
"loss": 7.5077,
"step": 260
},
{
"epoch": 0.07648941896371002,
"grad_norm": 36.55296325683594,
"learning_rate": 9.943468101939968e-06,
"loss": 6.6867,
"step": 270
},
{
"epoch": 0.07932236040681039,
"grad_norm": 31.735977172851562,
"learning_rate": 9.936378883511722e-06,
"loss": 8.5626,
"step": 280
},
{
"epoch": 0.08215530184991077,
"grad_norm": 51.97509002685547,
"learning_rate": 9.92887385847779e-06,
"loss": 7.1104,
"step": 290
},
{
"epoch": 0.08498824329301113,
"grad_norm": 56.030025482177734,
"learning_rate": 9.920953659009863e-06,
"loss": 6.6099,
"step": 300
},
{
"epoch": 0.0878211847361115,
"grad_norm": 37.08720016479492,
"learning_rate": 9.912618952251071e-06,
"loss": 4.9933,
"step": 310
},
{
"epoch": 0.09065412617921187,
"grad_norm": 34.61451721191406,
"learning_rate": 9.903870440259787e-06,
"loss": 5.7727,
"step": 320
},
{
"epoch": 0.09348706762231225,
"grad_norm": 35.92675018310547,
"learning_rate": 9.89470885995049e-06,
"loss": 4.8536,
"step": 330
},
{
"epoch": 0.09632000906541262,
"grad_norm": 31.899490356445312,
"learning_rate": 9.885134983031694e-06,
"loss": 5.6988,
"step": 340
},
{
"epoch": 0.09915295050851299,
"grad_norm": 37.71702194213867,
"learning_rate": 9.875149615940943e-06,
"loss": 8.0547,
"step": 350
},
{
"epoch": 0.10198589195161337,
"grad_norm": 32.81459426879883,
"learning_rate": 9.864753599776883e-06,
"loss": 5.7466,
"step": 360
},
{
"epoch": 0.10481883339471373,
"grad_norm": 34.12638854980469,
"learning_rate": 9.853947810228416e-06,
"loss": 6.3535,
"step": 370
},
{
"epoch": 0.1076517748378141,
"grad_norm": 34.04792022705078,
"learning_rate": 9.842733157500932e-06,
"loss": 5.7424,
"step": 380
},
{
"epoch": 0.11048471628091447,
"grad_norm": 33.2330322265625,
"learning_rate": 9.831110586239643e-06,
"loss": 5.4935,
"step": 390
},
{
"epoch": 0.11331765772401485,
"grad_norm": 33.370567321777344,
"learning_rate": 9.819081075450014e-06,
"loss": 5.5257,
"step": 400
},
{
"epoch": 0.11615059916711522,
"grad_norm": 34.091304779052734,
"learning_rate": 9.806645638415302e-06,
"loss": 6.1631,
"step": 410
},
{
"epoch": 0.11898354061021559,
"grad_norm": 28.293777465820312,
"learning_rate": 9.79380532261119e-06,
"loss": 6.2594,
"step": 420
},
{
"epoch": 0.12181648205331595,
"grad_norm": 31.169191360473633,
"learning_rate": 9.780561209617569e-06,
"loss": 5.428,
"step": 430
},
{
"epoch": 0.12464942349641633,
"grad_norm": 31.67852783203125,
"learning_rate": 9.766914415027426e-06,
"loss": 6.3704,
"step": 440
},
{
"epoch": 0.1274823649395167,
"grad_norm": 24.92995262145996,
"learning_rate": 9.752866088352882e-06,
"loss": 6.3413,
"step": 450
},
{
"epoch": 0.13031530638261707,
"grad_norm": 30.267122268676758,
"learning_rate": 9.738417412928348e-06,
"loss": 6.1918,
"step": 460
},
{
"epoch": 0.13314824782571744,
"grad_norm": 34.71146011352539,
"learning_rate": 9.72356960581087e-06,
"loss": 5.2388,
"step": 470
},
{
"epoch": 0.1359811892688178,
"grad_norm": 32.87137985229492,
"learning_rate": 9.7083239176776e-06,
"loss": 4.2622,
"step": 480
},
{
"epoch": 0.13881413071191817,
"grad_norm": 37.83769226074219,
"learning_rate": 9.692681632720448e-06,
"loss": 4.1838,
"step": 490
},
{
"epoch": 0.14164707215501857,
"grad_norm": 29.84713363647461,
"learning_rate": 9.676644068537915e-06,
"loss": 6.1015,
"step": 500
},
{
"epoch": 0.14448001359811893,
"grad_norm": 37.30936050415039,
"learning_rate": 9.660212576024102e-06,
"loss": 6.1121,
"step": 510
},
{
"epoch": 0.1473129550412193,
"grad_norm": 43.746490478515625,
"learning_rate": 9.64338853925493e-06,
"loss": 6.0385,
"step": 520
},
{
"epoch": 0.15014589648431967,
"grad_norm": 32.99515151977539,
"learning_rate": 9.62617337537154e-06,
"loss": 6.1911,
"step": 530
},
{
"epoch": 0.15297883792742004,
"grad_norm": 46.957340240478516,
"learning_rate": 9.608568534460938e-06,
"loss": 5.822,
"step": 540
},
{
"epoch": 0.1558117793705204,
"grad_norm": 33.011844635009766,
"learning_rate": 9.590575499433837e-06,
"loss": 7.1735,
"step": 550
},
{
"epoch": 0.15864472081362077,
"grad_norm": 31.042083740234375,
"learning_rate": 9.572195785899756e-06,
"loss": 6.8695,
"step": 560
},
{
"epoch": 0.16147766225672117,
"grad_norm": 26.289737701416016,
"learning_rate": 9.553430942039352e-06,
"loss": 6.9135,
"step": 570
},
{
"epoch": 0.16431060369982153,
"grad_norm": 29.071701049804688,
"learning_rate": 9.534282548474008e-06,
"loss": 5.075,
"step": 580
},
{
"epoch": 0.1671435451429219,
"grad_norm": 46.65534973144531,
"learning_rate": 9.514752218132703e-06,
"loss": 7.0842,
"step": 590
},
{
"epoch": 0.16997648658602227,
"grad_norm": 31.195234298706055,
"learning_rate": 9.494841596116138e-06,
"loss": 4.9555,
"step": 600
},
{
"epoch": 0.17280942802912264,
"grad_norm": 26.25963020324707,
"learning_rate": 9.474552359558167e-06,
"loss": 8.0829,
"step": 610
},
{
"epoch": 0.175642369472223,
"grad_norm": 25.132131576538086,
"learning_rate": 9.453886217484536e-06,
"loss": 4.0549,
"step": 620
},
{
"epoch": 0.17847531091532337,
"grad_norm": 47.06711196899414,
"learning_rate": 9.432844910668914e-06,
"loss": 6.1125,
"step": 630
},
{
"epoch": 0.18130825235842374,
"grad_norm": 41.20506286621094,
"learning_rate": 9.41143021148627e-06,
"loss": 6.7009,
"step": 640
},
{
"epoch": 0.18414119380152413,
"grad_norm": 27.037729263305664,
"learning_rate": 9.389643923763573e-06,
"loss": 6.8328,
"step": 650
},
{
"epoch": 0.1869741352446245,
"grad_norm": 44.91098403930664,
"learning_rate": 9.367487882627866e-06,
"loss": 5.0284,
"step": 660
},
{
"epoch": 0.18980707668772487,
"grad_norm": 32.71237564086914,
"learning_rate": 9.344963954351662e-06,
"loss": 6.0377,
"step": 670
},
{
"epoch": 0.19264001813082524,
"grad_norm": 24.608020782470703,
"learning_rate": 9.32207403619577e-06,
"loss": 3.9539,
"step": 680
},
{
"epoch": 0.1954729595739256,
"grad_norm": 45.37845230102539,
"learning_rate": 9.298820056249459e-06,
"loss": 7.6906,
"step": 690
},
{
"epoch": 0.19830590101702597,
"grad_norm": 26.40629768371582,
"learning_rate": 9.275203973268064e-06,
"loss": 5.7302,
"step": 700
},
{
"epoch": 0.20113884246012634,
"grad_norm": 25.433490753173828,
"learning_rate": 9.251227776507989e-06,
"loss": 5.6252,
"step": 710
},
{
"epoch": 0.20397178390322673,
"grad_norm": 25.276575088500977,
"learning_rate": 9.226893485559146e-06,
"loss": 5.8884,
"step": 720
},
{
"epoch": 0.2068047253463271,
"grad_norm": 45.13107681274414,
"learning_rate": 9.202203150174836e-06,
"loss": 9.215,
"step": 730
},
{
"epoch": 0.20963766678942747,
"grad_norm": 26.52821922302246,
"learning_rate": 9.177158850099099e-06,
"loss": 5.7232,
"step": 740
},
{
"epoch": 0.21247060823252784,
"grad_norm": 43.29339599609375,
"learning_rate": 9.151762694891522e-06,
"loss": 6.7846,
"step": 750
},
{
"epoch": 0.2153035496756282,
"grad_norm": 29.308732986450195,
"learning_rate": 9.12601682374955e-06,
"loss": 5.8371,
"step": 760
},
{
"epoch": 0.21813649111872857,
"grad_norm": 25.373172760009766,
"learning_rate": 9.099923405328293e-06,
"loss": 3.9846,
"step": 770
},
{
"epoch": 0.22096943256182894,
"grad_norm": 33.12062454223633,
"learning_rate": 9.073484637557852e-06,
"loss": 4.8174,
"step": 780
},
{
"epoch": 0.2238023740049293,
"grad_norm": 24.834850311279297,
"learning_rate": 9.046702747458186e-06,
"loss": 5.8073,
"step": 790
},
{
"epoch": 0.2266353154480297,
"grad_norm": 23.760942459106445,
"learning_rate": 9.019579990951514e-06,
"loss": 5.6668,
"step": 800
},
{
"epoch": 0.22946825689113007,
"grad_norm": 25.230995178222656,
"learning_rate": 8.992118652672302e-06,
"loss": 5.6386,
"step": 810
},
{
"epoch": 0.23230119833423044,
"grad_norm": 27.033655166625977,
"learning_rate": 8.964321045774808e-06,
"loss": 5.1316,
"step": 820
},
{
"epoch": 0.2351341397773308,
"grad_norm": 36.77193832397461,
"learning_rate": 8.936189511738254e-06,
"loss": 5.0568,
"step": 830
},
{
"epoch": 0.23796708122043117,
"grad_norm": 21.841785430908203,
"learning_rate": 8.907726420169583e-06,
"loss": 5.5521,
"step": 840
},
{
"epoch": 0.24080002266353154,
"grad_norm": 46.69823455810547,
"learning_rate": 8.878934168603865e-06,
"loss": 6.7058,
"step": 850
},
{
"epoch": 0.2436329641066319,
"grad_norm": 23.228717803955078,
"learning_rate": 8.849815182302345e-06,
"loss": 7.8944,
"step": 860
},
{
"epoch": 0.2464659055497323,
"grad_norm": 33.86655807495117,
"learning_rate": 8.820371914048153e-06,
"loss": 4.8468,
"step": 870
},
{
"epoch": 0.24929884699283267,
"grad_norm": 25.241182327270508,
"learning_rate": 8.790606843939705e-06,
"loss": 3.709,
"step": 880
},
{
"epoch": 0.25213178843593304,
"grad_norm": 25.6811580657959,
"learning_rate": 8.760522479181784e-06,
"loss": 4.844,
"step": 890
},
{
"epoch": 0.2549647298790334,
"grad_norm": 43.929115295410156,
"learning_rate": 8.730121353874365e-06,
"loss": 6.7687,
"step": 900
},
{
"epoch": 0.25779767132213377,
"grad_norm": 25.351106643676758,
"learning_rate": 8.69940602879915e-06,
"loss": 3.7733,
"step": 910
},
{
"epoch": 0.26063061276523414,
"grad_norm": 45.543373107910156,
"learning_rate": 8.66837909120387e-06,
"loss": 6.5226,
"step": 920
},
{
"epoch": 0.2634635542083345,
"grad_norm": 35.3692626953125,
"learning_rate": 8.637043154584351e-06,
"loss": 7.782,
"step": 930
},
{
"epoch": 0.2662964956514349,
"grad_norm": 23.175140380859375,
"learning_rate": 8.60540085846437e-06,
"loss": 3.7581,
"step": 940
},
{
"epoch": 0.26912943709453524,
"grad_norm": 22.461284637451172,
"learning_rate": 8.573454868173325e-06,
"loss": 3.8114,
"step": 950
},
{
"epoch": 0.2719623785376356,
"grad_norm": 30.49061393737793,
"learning_rate": 8.541207874621718e-06,
"loss": 5.6752,
"step": 960
},
{
"epoch": 0.274795319980736,
"grad_norm": 43.03390121459961,
"learning_rate": 8.508662594074496e-06,
"loss": 5.8459,
"step": 970
},
{
"epoch": 0.27762826142383634,
"grad_norm": 144.74916076660156,
"learning_rate": 8.475821767922254e-06,
"loss": 6.1604,
"step": 980
},
{
"epoch": 0.28046120286693677,
"grad_norm": 26.303081512451172,
"learning_rate": 8.442688162450315e-06,
"loss": 4.7632,
"step": 990
},
{
"epoch": 0.28329414431003713,
"grad_norm": 28.066007614135742,
"learning_rate": 8.409264568605714e-06,
"loss": 8.0062,
"step": 1000
},
{
"epoch": 0.2861270857531375,
"grad_norm": 23.32230567932129,
"learning_rate": 8.375553801762119e-06,
"loss": 3.9505,
"step": 1010
},
{
"epoch": 0.28896002719623787,
"grad_norm": 53.46368408203125,
"learning_rate": 8.34155870148267e-06,
"loss": 4.7727,
"step": 1020
},
{
"epoch": 0.29179296863933823,
"grad_norm": 28.557096481323242,
"learning_rate": 8.307282131280805e-06,
"loss": 3.876,
"step": 1030
},
{
"epoch": 0.2946259100824386,
"grad_norm": 26.72674560546875,
"learning_rate": 8.272726978379049e-06,
"loss": 3.6362,
"step": 1040
},
{
"epoch": 0.29745885152553897,
"grad_norm": 33.64091110229492,
"learning_rate": 8.23789615346582e-06,
"loss": 4.8435,
"step": 1050
},
{
"epoch": 0.30029179296863934,
"grad_norm": 25.513519287109375,
"learning_rate": 8.202792590450246e-06,
"loss": 4.8615,
"step": 1060
},
{
"epoch": 0.3031247344117397,
"grad_norm": 26.183082580566406,
"learning_rate": 8.167419246215042e-06,
"loss": 3.7897,
"step": 1070
},
{
"epoch": 0.30595767585484007,
"grad_norm": 37.91279983520508,
"learning_rate": 8.131779100367438e-06,
"loss": 3.8092,
"step": 1080
},
{
"epoch": 0.30879061729794044,
"grad_norm": 20.643037796020508,
"learning_rate": 8.09587515498819e-06,
"loss": 5.8217,
"step": 1090
},
{
"epoch": 0.3116235587410408,
"grad_norm": 24.009424209594727,
"learning_rate": 8.059710434378717e-06,
"loss": 4.6594,
"step": 1100
},
{
"epoch": 0.3144565001841412,
"grad_norm": 26.472389221191406,
"learning_rate": 8.02328798480635e-06,
"loss": 6.5989,
"step": 1110
},
{
"epoch": 0.31728944162724154,
"grad_norm": 42.69245529174805,
"learning_rate": 7.986610874247736e-06,
"loss": 6.7969,
"step": 1120
},
{
"epoch": 0.3201223830703419,
"grad_norm": 20.50579833984375,
"learning_rate": 7.949682192130407e-06,
"loss": 7.532,
"step": 1130
},
{
"epoch": 0.32295532451344233,
"grad_norm": 33.428890228271484,
"learning_rate": 7.912505049072559e-06,
"loss": 5.5098,
"step": 1140
},
{
"epoch": 0.3257882659565427,
"grad_norm": 33.04521560668945,
"learning_rate": 7.875082576621024e-06,
"loss": 5.7852,
"step": 1150
},
{
"epoch": 0.32862120739964307,
"grad_norm": 29.72992706298828,
"learning_rate": 7.837417926987496e-06,
"loss": 3.8586,
"step": 1160
},
{
"epoch": 0.33145414884274343,
"grad_norm": 22.467132568359375,
"learning_rate": 7.799514272783014e-06,
"loss": 5.6287,
"step": 1170
},
{
"epoch": 0.3342870902858438,
"grad_norm": 25.866819381713867,
"learning_rate": 7.761374806750712e-06,
"loss": 3.7462,
"step": 1180
},
{
"epoch": 0.33712003172894417,
"grad_norm": 21.801698684692383,
"learning_rate": 7.723002741496892e-06,
"loss": 5.6068,
"step": 1190
},
{
"epoch": 0.33995297317204454,
"grad_norm": 41.93526840209961,
"learning_rate": 7.684401309220416e-06,
"loss": 5.8573,
"step": 1200
},
{
"epoch": 0.3427859146151449,
"grad_norm": 23.245235443115234,
"learning_rate": 7.645573761440444e-06,
"loss": 4.5851,
"step": 1210
},
{
"epoch": 0.34561885605824527,
"grad_norm": 24.502330780029297,
"learning_rate": 7.606523368722554e-06,
"loss": 6.4644,
"step": 1220
},
{
"epoch": 0.34845179750134564,
"grad_norm": 16.042354583740234,
"learning_rate": 7.567253420403249e-06,
"loss": 5.6877,
"step": 1230
},
{
"epoch": 0.351284738944446,
"grad_norm": 26.405628204345703,
"learning_rate": 7.527767224312883e-06,
"loss": 4.764,
"step": 1240
},
{
"epoch": 0.3541176803875464,
"grad_norm": 40.40938186645508,
"learning_rate": 7.488068106497035e-06,
"loss": 5.8002,
"step": 1250
},
{
"epoch": 0.35695062183064674,
"grad_norm": 25.338321685791016,
"learning_rate": 7.448159410936348e-06,
"loss": 5.5113,
"step": 1260
},
{
"epoch": 0.3597835632737471,
"grad_norm": 39.411128997802734,
"learning_rate": 7.4080444992648534e-06,
"loss": 5.5444,
"step": 1270
},
{
"epoch": 0.3626165047168475,
"grad_norm": 22.218137741088867,
"learning_rate": 7.3677267504868055e-06,
"loss": 4.4882,
"step": 1280
},
{
"epoch": 0.3654494461599479,
"grad_norm": 43.15862274169922,
"learning_rate": 7.327209560692063e-06,
"loss": 6.6107,
"step": 1290
},
{
"epoch": 0.36828238760304827,
"grad_norm": 20.51604652404785,
"learning_rate": 7.2864963427700284e-06,
"loss": 5.6351,
"step": 1300
},
{
"epoch": 0.37111532904614863,
"grad_norm": 21.392065048217773,
"learning_rate": 7.2455905261221585e-06,
"loss": 5.7755,
"step": 1310
},
{
"epoch": 0.373948270489249,
"grad_norm": 28.160072326660156,
"learning_rate": 7.204495556373106e-06,
"loss": 6.5779,
"step": 1320
},
{
"epoch": 0.37678121193234937,
"grad_norm": 41.15205764770508,
"learning_rate": 7.163214895080479e-06,
"loss": 6.4435,
"step": 1330
},
{
"epoch": 0.37961415337544974,
"grad_norm": 26.100757598876953,
"learning_rate": 7.121752019443266e-06,
"loss": 6.5864,
"step": 1340
},
{
"epoch": 0.3824470948185501,
"grad_norm": 41.462791442871094,
"learning_rate": 7.080110422008937e-06,
"loss": 5.6488,
"step": 1350
},
{
"epoch": 0.38528003626165047,
"grad_norm": 22.369388580322266,
"learning_rate": 7.038293610379255e-06,
"loss": 4.4922,
"step": 1360
},
{
"epoch": 0.38811297770475084,
"grad_norm": 19.927444458007812,
"learning_rate": 6.996305106914824e-06,
"loss": 4.5791,
"step": 1370
},
{
"epoch": 0.3909459191478512,
"grad_norm": 25.871030807495117,
"learning_rate": 6.954148448438389e-06,
"loss": 4.5578,
"step": 1380
},
{
"epoch": 0.3937788605909516,
"grad_norm": 18.420751571655273,
"learning_rate": 6.911827185936914e-06,
"loss": 4.6252,
"step": 1390
},
{
"epoch": 0.39661180203405194,
"grad_norm": 27.263010025024414,
"learning_rate": 6.869344884262473e-06,
"loss": 5.5235,
"step": 1400
},
{
"epoch": 0.3994447434771523,
"grad_norm": 24.479764938354492,
"learning_rate": 6.8267051218319766e-06,
"loss": 5.6514,
"step": 1410
},
{
"epoch": 0.4022776849202527,
"grad_norm": 23.21695899963379,
"learning_rate": 6.7839114903257404e-06,
"loss": 7.5326,
"step": 1420
},
{
"epoch": 0.40511062636335304,
"grad_norm": 21.287368774414062,
"learning_rate": 6.74096759438496e-06,
"loss": 3.895,
"step": 1430
},
{
"epoch": 0.40794356780645347,
"grad_norm": 25.839454650878906,
"learning_rate": 6.697877051308067e-06,
"loss": 6.3928,
"step": 1440
},
{
"epoch": 0.41077650924955383,
"grad_norm": 22.896682739257812,
"learning_rate": 6.654643490746042e-06,
"loss": 4.5232,
"step": 1450
},
{
"epoch": 0.4136094506926542,
"grad_norm": 25.252422332763672,
"learning_rate": 6.611270554396676e-06,
"loss": 6.5998,
"step": 1460
},
{
"epoch": 0.41644239213575457,
"grad_norm": 24.610836029052734,
"learning_rate": 6.567761895697816e-06,
"loss": 4.6121,
"step": 1470
},
{
"epoch": 0.41927533357885494,
"grad_norm": 19.092580795288086,
"learning_rate": 6.524121179519625e-06,
"loss": 3.6029,
"step": 1480
},
{
"epoch": 0.4221082750219553,
"grad_norm": 22.915136337280273,
"learning_rate": 6.480352081855884e-06,
"loss": 3.6352,
"step": 1490
},
{
"epoch": 0.42494121646505567,
"grad_norm": 29.044233322143555,
"learning_rate": 6.436458289514342e-06,
"loss": 4.6979,
"step": 1500
},
{
"epoch": 0.42777415790815604,
"grad_norm": 39.80937194824219,
"learning_rate": 6.392443499806175e-06,
"loss": 4.6673,
"step": 1510
},
{
"epoch": 0.4306070993512564,
"grad_norm": 22.760765075683594,
"learning_rate": 6.348311420234542e-06,
"loss": 4.6801,
"step": 1520
},
{
"epoch": 0.4334400407943568,
"grad_norm": 21.216337203979492,
"learning_rate": 6.304065768182295e-06,
"loss": 5.7451,
"step": 1530
},
{
"epoch": 0.43627298223745714,
"grad_norm": 20.622943878173828,
"learning_rate": 6.259710270598848e-06,
"loss": 5.6216,
"step": 1540
},
{
"epoch": 0.4391059236805575,
"grad_norm": 40.299949645996094,
"learning_rate": 6.215248663686251e-06,
"loss": 6.5508,
"step": 1550
},
{
"epoch": 0.4419388651236579,
"grad_norm": 28.81671905517578,
"learning_rate": 6.170684692584469e-06,
"loss": 3.5039,
"step": 1560
},
{
"epoch": 0.44477180656675824,
"grad_norm": 43.010169982910156,
"learning_rate": 6.126022111055929e-06,
"loss": 6.4925,
"step": 1570
},
{
"epoch": 0.4476047480098586,
"grad_norm": 23.351240158081055,
"learning_rate": 6.081264681169317e-06,
"loss": 3.4456,
"step": 1580
},
{
"epoch": 0.45043768945295903,
"grad_norm": 40.19292449951172,
"learning_rate": 6.0364161729826905e-06,
"loss": 4.4953,
"step": 1590
},
{
"epoch": 0.4532706308960594,
"grad_norm": 25.595369338989258,
"learning_rate": 5.991480364225924e-06,
"loss": 6.2619,
"step": 1600
},
{
"epoch": 0.45610357233915977,
"grad_norm": 32.5233268737793,
"learning_rate": 5.946461039982485e-06,
"loss": 5.5702,
"step": 1610
},
{
"epoch": 0.45893651378226014,
"grad_norm": 25.565658569335938,
"learning_rate": 5.901361992370614e-06,
"loss": 3.5389,
"step": 1620
},
{
"epoch": 0.4617694552253605,
"grad_norm": 21.443763732910156,
"learning_rate": 5.856187020223901e-06,
"loss": 4.6532,
"step": 1630
},
{
"epoch": 0.46460239666846087,
"grad_norm": 26.775903701782227,
"learning_rate": 5.8109399287712935e-06,
"loss": 5.7745,
"step": 1640
},
{
"epoch": 0.46743533811156124,
"grad_norm": 20.02845001220703,
"learning_rate": 5.765624529316573e-06,
"loss": 5.506,
"step": 1650
},
{
"epoch": 0.4702682795546616,
"grad_norm": 22.177770614624023,
"learning_rate": 5.7202446389173225e-06,
"loss": 3.5255,
"step": 1660
},
{
"epoch": 0.473101220997762,
"grad_norm": 27.885957717895508,
"learning_rate": 5.674804080063392e-06,
"loss": 3.5088,
"step": 1670
},
{
"epoch": 0.47593416244086234,
"grad_norm": 33.34544372558594,
"learning_rate": 5.62930668035493e-06,
"loss": 4.4746,
"step": 1680
},
{
"epoch": 0.4787671038839627,
"grad_norm": 24.865848541259766,
"learning_rate": 5.5837562721799644e-06,
"loss": 6.4182,
"step": 1690
},
{
"epoch": 0.4816000453270631,
"grad_norm": 20.06027603149414,
"learning_rate": 5.538156692391592e-06,
"loss": 3.499,
"step": 1700
},
{
"epoch": 0.48443298677016344,
"grad_norm": 28.240829467773438,
"learning_rate": 5.4925117819847925e-06,
"loss": 5.4651,
"step": 1710
},
{
"epoch": 0.4872659282132638,
"grad_norm": 39.07200241088867,
"learning_rate": 5.44682538577288e-06,
"loss": 4.7134,
"step": 1720
},
{
"epoch": 0.4900988696563642,
"grad_norm": 31.383825302124023,
"learning_rate": 5.4011013520636466e-06,
"loss": 4.4705,
"step": 1730
},
{
"epoch": 0.4929318110994646,
"grad_norm": 40.832984924316406,
"learning_rate": 5.355343532335215e-06,
"loss": 7.2469,
"step": 1740
},
{
"epoch": 0.49576475254256497,
"grad_norm": 20.33405303955078,
"learning_rate": 5.309555780911604e-06,
"loss": 5.4482,
"step": 1750
},
{
"epoch": 0.49859769398566534,
"grad_norm": 22.8585262298584,
"learning_rate": 5.263741954638072e-06,
"loss": 4.4573,
"step": 1760
},
{
"epoch": 0.5014306354287656,
"grad_norm": 42.46244430541992,
"learning_rate": 5.217905912556248e-06,
"loss": 5.5277,
"step": 1770
},
{
"epoch": 0.5042635768718661,
"grad_norm": 21.30562973022461,
"learning_rate": 5.172051515579065e-06,
"loss": 5.4764,
"step": 1780
},
{
"epoch": 0.5070965183149664,
"grad_norm": 18.9359130859375,
"learning_rate": 5.126182626165547e-06,
"loss": 6.4232,
"step": 1790
},
{
"epoch": 0.5099294597580668,
"grad_norm": 33.8026123046875,
"learning_rate": 5.080303107995461e-06,
"loss": 6.6042,
"step": 1800
},
{
"epoch": 0.5127624012011671,
"grad_norm": 40.52323913574219,
"learning_rate": 5.034416825643868e-06,
"loss": 5.5848,
"step": 1810
},
{
"epoch": 0.5155953426442675,
"grad_norm": 27.342744827270508,
"learning_rate": 4.988527644255591e-06,
"loss": 5.2504,
"step": 1820
},
{
"epoch": 0.518428284087368,
"grad_norm": 19.118297576904297,
"learning_rate": 4.942639429219661e-06,
"loss": 4.5668,
"step": 1830
},
{
"epoch": 0.5212612255304683,
"grad_norm": 41.146236419677734,
"learning_rate": 4.896756045843698e-06,
"loss": 6.0831,
"step": 1840
},
{
"epoch": 0.5240941669735687,
"grad_norm": 19.685937881469727,
"learning_rate": 4.85088135902834e-06,
"loss": 5.5025,
"step": 1850
},
{
"epoch": 0.526927108416669,
"grad_norm": 22.97096061706543,
"learning_rate": 4.805019232941689e-06,
"loss": 4.4157,
"step": 1860
},
{
"epoch": 0.5297600498597694,
"grad_norm": 27.627784729003906,
"learning_rate": 4.7591735306938144e-06,
"loss": 4.3861,
"step": 1870
},
{
"epoch": 0.5325929913028697,
"grad_norm": 25.308032989501953,
"learning_rate": 4.713348114011357e-06,
"loss": 7.2963,
"step": 1880
},
{
"epoch": 0.5354259327459702,
"grad_norm": 19.11351203918457,
"learning_rate": 4.667546842912239e-06,
"loss": 4.2907,
"step": 1890
},
{
"epoch": 0.5382588741890705,
"grad_norm": 28.81739044189453,
"learning_rate": 4.6217735753805235e-06,
"loss": 4.5385,
"step": 1900
},
{
"epoch": 0.5410918156321709,
"grad_norm": 20.510547637939453,
"learning_rate": 4.576032167041452e-06,
"loss": 7.2043,
"step": 1910
},
{
"epoch": 0.5439247570752712,
"grad_norm": 26.19765281677246,
"learning_rate": 4.530326470836659e-06,
"loss": 4.3494,
"step": 1920
},
{
"epoch": 0.5467576985183716,
"grad_norm": 25.779802322387695,
"learning_rate": 4.484660336699638e-06,
"loss": 5.3226,
"step": 1930
},
{
"epoch": 0.549590639961472,
"grad_norm": 26.97022247314453,
"learning_rate": 4.439037611231448e-06,
"loss": 6.5069,
"step": 1940
},
{
"epoch": 0.5524235814045724,
"grad_norm": 26.32407569885254,
"learning_rate": 4.393462137376696e-06,
"loss": 3.545,
"step": 1950
},
{
"epoch": 0.5552565228476727,
"grad_norm": 30.962535858154297,
"learning_rate": 4.347937754099841e-06,
"loss": 4.4292,
"step": 1960
},
{
"epoch": 0.5580894642907731,
"grad_norm": 38.1851921081543,
"learning_rate": 4.302468296061823e-06,
"loss": 4.3079,
"step": 1970
},
{
"epoch": 0.5609224057338735,
"grad_norm": 21.038278579711914,
"learning_rate": 4.257057593297055e-06,
"loss": 4.5294,
"step": 1980
},
{
"epoch": 0.5637553471769738,
"grad_norm": 20.618942260742188,
"learning_rate": 4.211709470890815e-06,
"loss": 7.2449,
"step": 1990
},
{
"epoch": 0.5665882886200743,
"grad_norm": 21.230995178222656,
"learning_rate": 4.166427748657034e-06,
"loss": 4.3681,
"step": 2000
},
{
"epoch": 0.5694212300631746,
"grad_norm": 20.577428817749023,
"learning_rate": 4.121216240816559e-06,
"loss": 5.3925,
"step": 2010
},
{
"epoch": 0.572254171506275,
"grad_norm": 21.1496524810791,
"learning_rate": 4.076078755675852e-06,
"loss": 5.0495,
"step": 2020
},
{
"epoch": 0.5750871129493753,
"grad_norm": 26.215744018554688,
"learning_rate": 4.0310190953062155e-06,
"loss": 5.5832,
"step": 2030
},
{
"epoch": 0.5779200543924757,
"grad_norm": 33.668174743652344,
"learning_rate": 3.986041055223526e-06,
"loss": 5.1639,
"step": 2040
},
{
"epoch": 0.580752995835576,
"grad_norm": 28.786453247070312,
"learning_rate": 3.9411484240685315e-06,
"loss": 3.3797,
"step": 2050
},
{
"epoch": 0.5835859372786765,
"grad_norm": 24.81963348388672,
"learning_rate": 3.8963449832877164e-06,
"loss": 6.3189,
"step": 2060
},
{
"epoch": 0.5864188787217768,
"grad_norm": 25.143753051757812,
"learning_rate": 3.851634506814782e-06,
"loss": 6.463,
"step": 2070
},
{
"epoch": 0.5892518201648772,
"grad_norm": 39.29959487915039,
"learning_rate": 3.8070207607527587e-06,
"loss": 7.5255,
"step": 2080
},
{
"epoch": 0.5920847616079775,
"grad_norm": 19.030284881591797,
"learning_rate": 3.7625075030567683e-06,
"loss": 4.2513,
"step": 2090
},
{
"epoch": 0.5949177030510779,
"grad_norm": 24.105989456176758,
"learning_rate": 3.718098483217484e-06,
"loss": 3.3586,
"step": 2100
},
{
"epoch": 0.5977506444941783,
"grad_norm": 38.95778274536133,
"learning_rate": 3.673797441945304e-06,
"loss": 4.2773,
"step": 2110
},
{
"epoch": 0.6005835859372787,
"grad_norm": 42.26526641845703,
"learning_rate": 3.629608110855248e-06,
"loss": 5.2586,
"step": 2120
},
{
"epoch": 0.6034165273803791,
"grad_norm": 21.60348892211914,
"learning_rate": 3.585534212152643e-06,
"loss": 4.4408,
"step": 2130
},
{
"epoch": 0.6062494688234794,
"grad_norm": 39.41062545776367,
"learning_rate": 3.5415794583195846e-06,
"loss": 4.5132,
"step": 2140
},
{
"epoch": 0.6090824102665798,
"grad_norm": 32.25893783569336,
"learning_rate": 3.497747551802221e-06,
"loss": 8.4284,
"step": 2150
},
{
"epoch": 0.6119153517096801,
"grad_norm": 34.556373596191406,
"learning_rate": 3.4540421846988916e-06,
"loss": 6.3801,
"step": 2160
},
{
"epoch": 0.6147482931527806,
"grad_norm": 27.99374771118164,
"learning_rate": 3.4104670384491234e-06,
"loss": 5.5573,
"step": 2170
},
{
"epoch": 0.6175812345958809,
"grad_norm": 23.997901916503906,
"learning_rate": 3.367025783523534e-06,
"loss": 4.2779,
"step": 2180
},
{
"epoch": 0.6204141760389813,
"grad_norm": 41.76970291137695,
"learning_rate": 3.3237220791146597e-06,
"loss": 5.241,
"step": 2190
},
{
"epoch": 0.6232471174820816,
"grad_norm": 27.922670364379883,
"learning_rate": 3.2805595728287255e-06,
"loss": 4.2649,
"step": 2200
},
{
"epoch": 0.626080058925182,
"grad_norm": 33.54890060424805,
"learning_rate": 3.2375419003783957e-06,
"loss": 6.0635,
"step": 2210
},
{
"epoch": 0.6289130003682823,
"grad_norm": 21.987178802490234,
"learning_rate": 3.1946726852765325e-06,
"loss": 5.1542,
"step": 2220
},
{
"epoch": 0.6317459418113828,
"grad_norm": 35.2348518371582,
"learning_rate": 3.1519555385309685e-06,
"loss": 4.2332,
"step": 2230
},
{
"epoch": 0.6345788832544831,
"grad_norm": 39.060691833496094,
"learning_rate": 3.1093940583403447e-06,
"loss": 8.0693,
"step": 2240
},
{
"epoch": 0.6374118246975835,
"grad_norm": 20.76451873779297,
"learning_rate": 3.066991829791024e-06,
"loss": 5.3108,
"step": 2250
},
{
"epoch": 0.6402447661406838,
"grad_norm": 40.92884826660156,
"learning_rate": 3.024752424555105e-06,
"loss": 4.2548,
"step": 2260
},
{
"epoch": 0.6430777075837842,
"grad_norm": 24.043121337890625,
"learning_rate": 2.982679400589569e-06,
"loss": 5.3648,
"step": 2270
},
{
"epoch": 0.6459106490268847,
"grad_norm": 22.929412841796875,
"learning_rate": 2.9407763018365854e-06,
"loss": 4.2817,
"step": 2280
},
{
"epoch": 0.648743590469985,
"grad_norm": 36.0571174621582,
"learning_rate": 2.899046657924992e-06,
"loss": 7.9167,
"step": 2290
},
{
"epoch": 0.6515765319130854,
"grad_norm": 23.849647521972656,
"learning_rate": 2.8574939838729844e-06,
"loss": 4.44,
"step": 2300
},
{
"epoch": 0.6544094733561857,
"grad_norm": 42.65750503540039,
"learning_rate": 2.8161217797920304e-06,
"loss": 5.6655,
"step": 2310
},
{
"epoch": 0.6572424147992861,
"grad_norm": 23.45660400390625,
"learning_rate": 2.774933530592054e-06,
"loss": 5.4841,
"step": 2320
},
{
"epoch": 0.6600753562423864,
"grad_norm": 21.22451400756836,
"learning_rate": 2.733932705687883e-06,
"loss": 3.3468,
"step": 2330
},
{
"epoch": 0.6629082976854869,
"grad_norm": 37.178993225097656,
"learning_rate": 2.693122758707013e-06,
"loss": 5.1606,
"step": 2340
},
{
"epoch": 0.6657412391285872,
"grad_norm": 24.34912109375,
"learning_rate": 2.652507127198689e-06,
"loss": 7.2961,
"step": 2350
},
{
"epoch": 0.6685741805716876,
"grad_norm": 40.61592483520508,
"learning_rate": 2.612089232344371e-06,
"loss": 6.3695,
"step": 2360
},
{
"epoch": 0.6714071220147879,
"grad_norm": 20.37811279296875,
"learning_rate": 2.571872478669528e-06,
"loss": 3.3039,
"step": 2370
},
{
"epoch": 0.6742400634578883,
"grad_norm": 25.745912551879883,
"learning_rate": 2.5318602537568904e-06,
"loss": 4.2973,
"step": 2380
},
{
"epoch": 0.6770730049009887,
"grad_norm": 22.395126342773438,
"learning_rate": 2.4920559279610886e-06,
"loss": 4.1162,
"step": 2390
},
{
"epoch": 0.6799059463440891,
"grad_norm": 34.32621383666992,
"learning_rate": 2.452462854124758e-06,
"loss": 4.1658,
"step": 2400
},
{
"epoch": 0.6827388877871894,
"grad_norm": 39.03499984741211,
"learning_rate": 2.413084367296127e-06,
"loss": 6.3083,
"step": 2410
},
{
"epoch": 0.6855718292302898,
"grad_norm": 47.788394927978516,
"learning_rate": 2.373923784448089e-06,
"loss": 4.2861,
"step": 2420
},
{
"epoch": 0.6884047706733902,
"grad_norm": 26.90192413330078,
"learning_rate": 2.3349844041988044e-06,
"loss": 4.3008,
"step": 2430
},
{
"epoch": 0.6912377121164905,
"grad_norm": 22.178869247436523,
"learning_rate": 2.296269506533846e-06,
"loss": 5.2767,
"step": 2440
},
{
"epoch": 0.694070653559591,
"grad_norm": 21.529335021972656,
"learning_rate": 2.2577823525299205e-06,
"loss": 7.1097,
"step": 2450
},
{
"epoch": 0.6969035950026913,
"grad_norm": 20.215675354003906,
"learning_rate": 2.2195261840801757e-06,
"loss": 7.1815,
"step": 2460
},
{
"epoch": 0.6997365364457917,
"grad_norm": 21.300861358642578,
"learning_rate": 2.18150422362112e-06,
"loss": 6.9142,
"step": 2470
},
{
"epoch": 0.702569477888892,
"grad_norm": 30.098453521728516,
"learning_rate": 2.1437196738611958e-06,
"loss": 4.4774,
"step": 2480
},
{
"epoch": 0.7054024193319924,
"grad_norm": 25.317970275878906,
"learning_rate": 2.1061757175110024e-06,
"loss": 4.4772,
"step": 2490
},
{
"epoch": 0.7082353607750927,
"grad_norm": 30.881681442260742,
"learning_rate": 2.0688755170152e-06,
"loss": 4.2296,
"step": 2500
},
{
"epoch": 0.7110683022181932,
"grad_norm": 23.95901107788086,
"learning_rate": 2.031822214286134e-06,
"loss": 5.0405,
"step": 2510
},
{
"epoch": 0.7139012436612935,
"grad_norm": 41.624210357666016,
"learning_rate": 1.9950189304391855e-06,
"loss": 6.3358,
"step": 2520
},
{
"epoch": 0.7167341851043939,
"grad_norm": 34.76797866821289,
"learning_rate": 1.958468765529853e-06,
"loss": 5.061,
"step": 2530
},
{
"epoch": 0.7195671265474942,
"grad_norm": 20.406444549560547,
"learning_rate": 1.9221747982926493e-06,
"loss": 5.1701,
"step": 2540
},
{
"epoch": 0.7224000679905946,
"grad_norm": 24.22311782836914,
"learning_rate": 1.8861400858817508e-06,
"loss": 4.2621,
"step": 2550
},
{
"epoch": 0.725233009433695,
"grad_norm": 37.65345001220703,
"learning_rate": 1.8503676636134882e-06,
"loss": 6.1661,
"step": 2560
},
{
"epoch": 0.7280659508767954,
"grad_norm": 20.813777923583984,
"learning_rate": 1.81486054471068e-06,
"loss": 5.3045,
"step": 2570
},
{
"epoch": 0.7308988923198958,
"grad_norm": 39.82976150512695,
"learning_rate": 1.7796217200488114e-06,
"loss": 6.4348,
"step": 2580
},
{
"epoch": 0.7337318337629961,
"grad_norm": 25.495925903320312,
"learning_rate": 1.7446541579041048e-06,
"loss": 4.2349,
"step": 2590
},
{
"epoch": 0.7365647752060965,
"grad_norm": 38.05914306640625,
"learning_rate": 1.7099608037034953e-06,
"loss": 5.2485,
"step": 2600
},
{
"epoch": 0.7393977166491968,
"grad_norm": 22.876413345336914,
"learning_rate": 1.6755445797765286e-06,
"loss": 4.263,
"step": 2610
},
{
"epoch": 0.7422306580922973,
"grad_norm": 22.402753829956055,
"learning_rate": 1.6414083851091973e-06,
"loss": 4.3153,
"step": 2620
},
{
"epoch": 0.7450635995353976,
"grad_norm": 20.86781883239746,
"learning_rate": 1.6075550950997592e-06,
"loss": 4.4095,
"step": 2630
},
{
"epoch": 0.747896540978498,
"grad_norm": 39.51744842529297,
"learning_rate": 1.5739875613165283e-06,
"loss": 6.2356,
"step": 2640
},
{
"epoch": 0.7507294824215983,
"grad_norm": 26.651187896728516,
"learning_rate": 1.5407086112576813e-06,
"loss": 4.1033,
"step": 2650
},
{
"epoch": 0.7535624238646987,
"grad_norm": 46.947757720947266,
"learning_rate": 1.5077210481130815e-06,
"loss": 8.1815,
"step": 2660
},
{
"epoch": 0.756395365307799,
"grad_norm": 41.29295349121094,
"learning_rate": 1.475027650528168e-06,
"loss": 6.1637,
"step": 2670
},
{
"epoch": 0.7592283067508995,
"grad_norm": 39.40729522705078,
"learning_rate": 1.442631172369896e-06,
"loss": 7.1273,
"step": 2680
},
{
"epoch": 0.7620612481939998,
"grad_norm": 39.58256912231445,
"learning_rate": 1.4105343424947654e-06,
"loss": 5.187,
"step": 2690
},
{
"epoch": 0.7648941896371002,
"grad_norm": 39.836185455322266,
"learning_rate": 1.378739864518971e-06,
"loss": 3.8889,
"step": 2700
},
{
"epoch": 0.7677271310802005,
"grad_norm": 40.20053482055664,
"learning_rate": 1.3472504165906614e-06,
"loss": 5.3001,
"step": 2710
},
{
"epoch": 0.7705600725233009,
"grad_norm": 23.571002960205078,
"learning_rate": 1.3160686511643505e-06,
"loss": 4.0238,
"step": 2720
},
{
"epoch": 0.7733930139664014,
"grad_norm": 23.623443603515625,
"learning_rate": 1.2851971947774987e-06,
"loss": 5.1091,
"step": 2730
},
{
"epoch": 0.7762259554095017,
"grad_norm": 31.367658615112305,
"learning_rate": 1.2546386478292604e-06,
"loss": 4.1048,
"step": 2740
},
{
"epoch": 0.7790588968526021,
"grad_norm": 39.296226501464844,
"learning_rate": 1.2243955843614558e-06,
"loss": 4.271,
"step": 2750
},
{
"epoch": 0.7818918382957024,
"grad_norm": 28.69118881225586,
"learning_rate": 1.1944705518417466e-06,
"loss": 4.0739,
"step": 2760
},
{
"epoch": 0.7847247797388028,
"grad_norm": 32.27414321899414,
"learning_rate": 1.1648660709490538e-06,
"loss": 5.1998,
"step": 2770
},
{
"epoch": 0.7875577211819031,
"grad_norm": 24.473217010498047,
"learning_rate": 1.135584635361232e-06,
"loss": 4.9601,
"step": 2780
},
{
"epoch": 0.7903906626250036,
"grad_norm": 27.856367111206055,
"learning_rate": 1.1066287115450242e-06,
"loss": 4.9381,
"step": 2790
},
{
"epoch": 0.7932236040681039,
"grad_norm": 17.640838623046875,
"learning_rate": 1.0780007385483005e-06,
"loss": 4.2145,
"step": 2800
},
{
"epoch": 0.7960565455112043,
"grad_norm": 34.375091552734375,
"learning_rate": 1.0497031277946062e-06,
"loss": 8.3028,
"step": 2810
},
{
"epoch": 0.7988894869543046,
"grad_norm": 23.346403121948242,
"learning_rate": 1.0217382628800465e-06,
"loss": 6.9337,
"step": 2820
},
{
"epoch": 0.801722428397405,
"grad_norm": 25.259016036987305,
"learning_rate": 9.94108499372507e-07,
"loss": 3.1855,
"step": 2830
},
{
"epoch": 0.8045553698405054,
"grad_norm": 33.022727966308594,
"learning_rate": 9.668161646132296e-07,
"loss": 5.2408,
"step": 2840
},
{
"epoch": 0.8073883112836058,
"grad_norm": 30.2951717376709,
"learning_rate": 9.398635575207854e-07,
"loss": 3.1828,
"step": 2850
},
{
"epoch": 0.8102212527267061,
"grad_norm": 51.273616790771484,
"learning_rate": 9.132529483974217e-07,
"loss": 5.0485,
"step": 2860
},
{
"epoch": 0.8130541941698065,
"grad_norm": 39.878597259521484,
"learning_rate": 8.869865787378262e-07,
"loss": 6.3068,
"step": 2870
},
{
"epoch": 0.8158871356129069,
"grad_norm": 21.064966201782227,
"learning_rate": 8.61066661040324e-07,
"loss": 3.3587,
"step": 2880
},
{
"epoch": 0.8187200770560072,
"grad_norm": 22.18380355834961,
"learning_rate": 8.354953786205133e-07,
"loss": 4.242,
"step": 2890
},
{
"epoch": 0.8215530184991077,
"grad_norm": 41.24013137817383,
"learning_rate": 8.102748854273468e-07,
"loss": 4.1017,
"step": 2900
},
{
"epoch": 0.824385959942208,
"grad_norm": 23.30076789855957,
"learning_rate": 7.854073058617112e-07,
"loss": 5.3308,
"step": 2910
},
{
"epoch": 0.8272189013853084,
"grad_norm": 21.42025375366211,
"learning_rate": 7.60894734597476e-07,
"loss": 8.113,
"step": 2920
},
{
"epoch": 0.8300518428284087,
"grad_norm": 18.741073608398438,
"learning_rate": 7.367392364050485e-07,
"loss": 5.1848,
"step": 2930
},
{
"epoch": 0.8328847842715091,
"grad_norm": 23.857194900512695,
"learning_rate": 7.129428459774618e-07,
"loss": 7.1581,
"step": 2940
},
{
"epoch": 0.8357177257146094,
"grad_norm": 28.530094146728516,
"learning_rate": 6.895075677589791e-07,
"loss": 6.2661,
"step": 2950
},
{
"epoch": 0.8385506671577099,
"grad_norm": 40.354949951171875,
"learning_rate": 6.664353757762515e-07,
"loss": 4.2647,
"step": 2960
},
{
"epoch": 0.8413836086008102,
"grad_norm": 21.305288314819336,
"learning_rate": 6.437282134720479e-07,
"loss": 4.9122,
"step": 2970
},
{
"epoch": 0.8442165500439106,
"grad_norm": 40.32603454589844,
"learning_rate": 6.21387993541544e-07,
"loss": 6.2095,
"step": 2980
},
{
"epoch": 0.8470494914870109,
"grad_norm": 39.293067932128906,
"learning_rate": 5.994165977712175e-07,
"loss": 4.1365,
"step": 2990
},
{
"epoch": 0.8498824329301113,
"grad_norm": 25.006118774414062,
"learning_rate": 5.778158768803294e-07,
"loss": 3.4504,
"step": 3000
},
{
"epoch": 0.8527153743732117,
"grad_norm": 38.37477111816406,
"learning_rate": 5.565876503650442e-07,
"loss": 4.2214,
"step": 3010
},
{
"epoch": 0.8555483158163121,
"grad_norm": 40.921207427978516,
"learning_rate": 5.357337063451601e-07,
"loss": 5.1103,
"step": 3020
},
{
"epoch": 0.8583812572594125,
"grad_norm": 26.225017547607422,
"learning_rate": 5.152558014134906e-07,
"loss": 5.9913,
"step": 3030
},
{
"epoch": 0.8612141987025128,
"grad_norm": 22.678930282592773,
"learning_rate": 4.951556604879049e-07,
"loss": 4.3731,
"step": 3040
},
{
"epoch": 0.8640471401456132,
"grad_norm": 29.937393188476562,
"learning_rate": 4.754349766660299e-07,
"loss": 4.2301,
"step": 3050
},
{
"epoch": 0.8668800815887135,
"grad_norm": 26.465091705322266,
"learning_rate": 4.5609541108263377e-07,
"loss": 6.0091,
"step": 3060
},
{
"epoch": 0.869713023031814,
"grad_norm": 25.58681297302246,
"learning_rate": 4.3713859276971026e-07,
"loss": 6.979,
"step": 3070
},
{
"epoch": 0.8725459644749143,
"grad_norm": 22.564706802368164,
"learning_rate": 4.1856611851925245e-07,
"loss": 5.0316,
"step": 3080
},
{
"epoch": 0.8753789059180147,
"grad_norm": 40.97758102416992,
"learning_rate": 4.003795527487508e-07,
"loss": 8.964,
"step": 3090
},
{
"epoch": 0.878211847361115,
"grad_norm": 32.80113220214844,
"learning_rate": 3.8258042736942446e-07,
"loss": 3.1517,
"step": 3100
},
{
"epoch": 0.8810447888042154,
"grad_norm": 30.950176239013672,
"learning_rate": 3.651702416571762e-07,
"loss": 4.345,
"step": 3110
},
{
"epoch": 0.8838777302473158,
"grad_norm": 21.242090225219727,
"learning_rate": 3.481504621263049e-07,
"loss": 6.1642,
"step": 3120
},
{
"epoch": 0.8867106716904162,
"grad_norm": 21.0790958404541,
"learning_rate": 3.315225224059809e-07,
"loss": 5.1734,
"step": 3130
},
{
"epoch": 0.8895436131335165,
"grad_norm": 41.8050537109375,
"learning_rate": 3.1528782311948226e-07,
"loss": 5.0608,
"step": 3140
},
{
"epoch": 0.8923765545766169,
"grad_norm": 23.527942657470703,
"learning_rate": 2.9944773176621756e-07,
"loss": 5.9961,
"step": 3150
},
{
"epoch": 0.8952094960197172,
"grad_norm": 28.754201889038086,
"learning_rate": 2.840035826065368e-07,
"loss": 3.8781,
"step": 3160
},
{
"epoch": 0.8980424374628176,
"grad_norm": 26.580829620361328,
"learning_rate": 2.689566765493451e-07,
"loss": 4.1426,
"step": 3170
},
{
"epoch": 0.9008753789059181,
"grad_norm": 18.550945281982422,
"learning_rate": 2.5430828104251684e-07,
"loss": 4.9139,
"step": 3180
},
{
"epoch": 0.9037083203490184,
"grad_norm": 20.301895141601562,
"learning_rate": 2.4005962996614174e-07,
"loss": 3.1654,
"step": 3190
},
{
"epoch": 0.9065412617921188,
"grad_norm": 37.804969787597656,
"learning_rate": 2.2621192352858702e-07,
"loss": 5.0736,
"step": 3200
},
{
"epoch": 0.9093742032352191,
"grad_norm": 29.193897247314453,
"learning_rate": 2.1276632816540077e-07,
"loss": 5.2175,
"step": 3210
},
{
"epoch": 0.9122071446783195,
"grad_norm": 23.378320693969727,
"learning_rate": 1.9972397644106023e-07,
"loss": 5.2508,
"step": 3220
},
{
"epoch": 0.9150400861214198,
"grad_norm": 18.92923355102539,
"learning_rate": 1.870859669535724e-07,
"loss": 5.2554,
"step": 3230
},
{
"epoch": 0.9178730275645203,
"grad_norm": 18.03963279724121,
"learning_rate": 1.7485336424193366e-07,
"loss": 5.1253,
"step": 3240
},
{
"epoch": 0.9207059690076206,
"grad_norm": 38.961456298828125,
"learning_rate": 1.6302719869646432e-07,
"loss": 5.0659,
"step": 3250
},
{
"epoch": 0.923538910450721,
"grad_norm": 20.624431610107422,
"learning_rate": 1.5160846647201132e-07,
"loss": 4.1776,
"step": 3260
},
{
"epoch": 0.9263718518938213,
"grad_norm": 21.755279541015625,
"learning_rate": 1.4059812940404093e-07,
"loss": 3.142,
"step": 3270
},
{
"epoch": 0.9292047933369217,
"grad_norm": 20.076051712036133,
"learning_rate": 1.2999711492762079e-07,
"loss": 5.2161,
"step": 3280
},
{
"epoch": 0.932037734780022,
"grad_norm": 25.841142654418945,
"learning_rate": 1.198063159992996e-07,
"loss": 5.3184,
"step": 3290
},
{
"epoch": 0.9348706762231225,
"grad_norm": 35.23577117919922,
"learning_rate": 1.1002659102188784e-07,
"loss": 3.3098,
"step": 3300
},
{
"epoch": 0.9377036176662228,
"grad_norm": 20.789785385131836,
"learning_rate": 1.006587637721551e-07,
"loss": 3.1742,
"step": 3310
},
{
"epoch": 0.9405365591093232,
"grad_norm": 40.675296783447266,
"learning_rate": 9.170362333143778e-08,
"loss": 7.3385,
"step": 3320
},
{
"epoch": 0.9433695005524236,
"grad_norm": 24.71589469909668,
"learning_rate": 8.316192401917667e-08,
"loss": 5.3478,
"step": 3330
},
{
"epoch": 0.946202441995524,
"grad_norm": 38.48093032836914,
"learning_rate": 7.503438532937169e-08,
"loss": 6.069,
"step": 3340
},
{
"epoch": 0.9490353834386244,
"grad_norm": 26.636127471923828,
"learning_rate": 6.732169186998372e-08,
"loss": 4.1179,
"step": 3350
},
{
"epoch": 0.9518683248817247,
"grad_norm": 42.95631790161133,
"learning_rate": 6.002449330526294e-08,
"loss": 6.9268,
"step": 3360
},
{
"epoch": 0.9547012663248251,
"grad_norm": 20.64594268798828,
"learning_rate": 5.31434043010276e-08,
"loss": 3.1192,
"step": 3370
},
{
"epoch": 0.9575342077679254,
"grad_norm": 44.144744873046875,
"learning_rate": 4.667900447288931e-08,
"loss": 6.0163,
"step": 3380
},
{
"epoch": 0.9603671492110258,
"grad_norm": 41.0361442565918,
"learning_rate": 4.0631838337427675e-08,
"loss": 5.265,
"step": 3390
},
{
"epoch": 0.9632000906541262,
"grad_norm": 28.538305282592773,
"learning_rate": 3.500241526632753e-08,
"loss": 5.07,
"step": 3400
},
{
"epoch": 0.9660330320972266,
"grad_norm": 24.23358154296875,
"learning_rate": 2.979120944346936e-08,
"loss": 4.3623,
"step": 3410
},
{
"epoch": 0.9688659735403269,
"grad_norm": 26.74643898010254,
"learning_rate": 2.499865982499128e-08,
"loss": 3.3373,
"step": 3420
},
{
"epoch": 0.9716989149834273,
"grad_norm": 22.635358810424805,
"learning_rate": 2.0625170102309687e-08,
"loss": 4.2529,
"step": 3430
},
{
"epoch": 0.9745318564265276,
"grad_norm": 25.663415908813477,
"learning_rate": 1.6671108668119828e-08,
"loss": 4.1368,
"step": 3440
},
{
"epoch": 0.977364797869628,
"grad_norm": 28.40155029296875,
"learning_rate": 1.3136808585361149e-08,
"loss": 5.9535,
"step": 3450
},
{
"epoch": 0.9801977393127284,
"grad_norm": 21.457969665527344,
"learning_rate": 1.0022567559164198e-08,
"loss": 6.1661,
"step": 3460
},
{
"epoch": 0.9830306807558288,
"grad_norm": 26.77224349975586,
"learning_rate": 7.328647911774567e-09,
"loss": 4.1479,
"step": 3470
},
{
"epoch": 0.9858636221989292,
"grad_norm": 34.7308464050293,
"learning_rate": 5.055276560454459e-09,
"loss": 3.216,
"step": 3480
},
{
"epoch": 0.9886965636420295,
"grad_norm": 27.362573623657227,
"learning_rate": 3.202644998370752e-09,
"loss": 5.132,
"step": 3490
},
{
"epoch": 0.9915295050851299,
"grad_norm": 33.32588195800781,
"learning_rate": 1.770909278464017e-09,
"loss": 6.1225,
"step": 3500
},
{
"epoch": 0.9943624465282302,
"grad_norm": 19.929155349731445,
"learning_rate": 7.601900003051388e-10,
"loss": 3.1458,
"step": 3510
},
{
"epoch": 0.9971953879713307,
"grad_norm": 26.834556579589844,
"learning_rate": 1.7057229993344693e-10,
"loss": 7.2351,
"step": 3520
},
{
"epoch": 0.9997450352701209,
"step": 3529,
"total_flos": 1.1540349697243742e+19,
"train_loss": 5.616373471816494,
"train_runtime": 84659.9545,
"train_samples_per_second": 2.502,
"train_steps_per_second": 0.042
}
],
"logging_steps": 10,
"max_steps": 3529,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1540349697243742e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}