P2_baseline_with_retrieval / trainer_state.json
AliHmlii's picture
Model save
71b9d5b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1777,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005627462014631402,
"grad_norm": 0.96278315782547,
"learning_rate": 1.1235955056179775e-06,
"loss": 0.8197,
"step": 1
},
{
"epoch": 0.0028137310073157004,
"grad_norm": 1.2738590240478516,
"learning_rate": 5.617977528089888e-06,
"loss": 1.1947,
"step": 5
},
{
"epoch": 0.005627462014631401,
"grad_norm": 2.892261028289795,
"learning_rate": 1.1235955056179776e-05,
"loss": 1.2592,
"step": 10
},
{
"epoch": 0.008441193021947102,
"grad_norm": 1.4733870029449463,
"learning_rate": 1.6853932584269665e-05,
"loss": 1.2391,
"step": 15
},
{
"epoch": 0.011254924029262802,
"grad_norm": 1.0532560348510742,
"learning_rate": 2.2471910112359552e-05,
"loss": 1.5268,
"step": 20
},
{
"epoch": 0.014068655036578503,
"grad_norm": 1.5811922550201416,
"learning_rate": 2.8089887640449443e-05,
"loss": 1.0691,
"step": 25
},
{
"epoch": 0.016882386043894203,
"grad_norm": 1.4303256273269653,
"learning_rate": 3.370786516853933e-05,
"loss": 0.6871,
"step": 30
},
{
"epoch": 0.019696117051209903,
"grad_norm": 2.0926148891448975,
"learning_rate": 3.9325842696629214e-05,
"loss": 0.6074,
"step": 35
},
{
"epoch": 0.022509848058525603,
"grad_norm": 2.149613857269287,
"learning_rate": 4.4943820224719104e-05,
"loss": 0.4506,
"step": 40
},
{
"epoch": 0.025323579065841307,
"grad_norm": 1.4852650165557861,
"learning_rate": 5.0561797752808995e-05,
"loss": 0.6363,
"step": 45
},
{
"epoch": 0.028137310073157007,
"grad_norm": 0.8294332027435303,
"learning_rate": 5.6179775280898885e-05,
"loss": 0.3258,
"step": 50
},
{
"epoch": 0.030951041080472707,
"grad_norm": 0.9797491431236267,
"learning_rate": 6.179775280898876e-05,
"loss": 0.4315,
"step": 55
},
{
"epoch": 0.03376477208778841,
"grad_norm": 0.8157183527946472,
"learning_rate": 6.741573033707866e-05,
"loss": 0.5252,
"step": 60
},
{
"epoch": 0.03657850309510411,
"grad_norm": 0.988738477230072,
"learning_rate": 7.303370786516854e-05,
"loss": 0.3002,
"step": 65
},
{
"epoch": 0.03939223410241981,
"grad_norm": 1.035030484199524,
"learning_rate": 7.865168539325843e-05,
"loss": 0.3721,
"step": 70
},
{
"epoch": 0.04220596510973551,
"grad_norm": 1.2941393852233887,
"learning_rate": 8.426966292134831e-05,
"loss": 0.3631,
"step": 75
},
{
"epoch": 0.04501969611705121,
"grad_norm": 0.8939509391784668,
"learning_rate": 8.988764044943821e-05,
"loss": 0.3648,
"step": 80
},
{
"epoch": 0.04783342712436691,
"grad_norm": 1.7211397886276245,
"learning_rate": 9.550561797752809e-05,
"loss": 0.3593,
"step": 85
},
{
"epoch": 0.050647158131682614,
"grad_norm": 0.7832581996917725,
"learning_rate": 0.00010112359550561799,
"loss": 0.2104,
"step": 90
},
{
"epoch": 0.05346088913899831,
"grad_norm": 1.1701756715774536,
"learning_rate": 0.00010674157303370786,
"loss": 0.3957,
"step": 95
},
{
"epoch": 0.056274620146314014,
"grad_norm": 0.7531750798225403,
"learning_rate": 0.00011235955056179777,
"loss": 0.3513,
"step": 100
},
{
"epoch": 0.05908835115362971,
"grad_norm": 0.7287600636482239,
"learning_rate": 0.00011797752808988764,
"loss": 0.3221,
"step": 105
},
{
"epoch": 0.061902082160945414,
"grad_norm": 1.7473770380020142,
"learning_rate": 0.00012359550561797752,
"loss": 0.1973,
"step": 110
},
{
"epoch": 0.06471581316826111,
"grad_norm": 1.0901485681533813,
"learning_rate": 0.00012921348314606744,
"loss": 0.4353,
"step": 115
},
{
"epoch": 0.06752954417557681,
"grad_norm": 0.7513278722763062,
"learning_rate": 0.00013483146067415732,
"loss": 0.3628,
"step": 120
},
{
"epoch": 0.07034327518289252,
"grad_norm": 1.2991347312927246,
"learning_rate": 0.0001404494382022472,
"loss": 0.3113,
"step": 125
},
{
"epoch": 0.07315700619020822,
"grad_norm": 0.6642701029777527,
"learning_rate": 0.0001460674157303371,
"loss": 0.4762,
"step": 130
},
{
"epoch": 0.07597073719752391,
"grad_norm": 1.3134933710098267,
"learning_rate": 0.00015168539325842697,
"loss": 0.3526,
"step": 135
},
{
"epoch": 0.07878446820483961,
"grad_norm": 1.0081052780151367,
"learning_rate": 0.00015730337078651685,
"loss": 0.3115,
"step": 140
},
{
"epoch": 0.08159819921215532,
"grad_norm": 0.2705545723438263,
"learning_rate": 0.00016292134831460674,
"loss": 0.2489,
"step": 145
},
{
"epoch": 0.08441193021947102,
"grad_norm": 1.097110390663147,
"learning_rate": 0.00016853932584269662,
"loss": 0.4727,
"step": 150
},
{
"epoch": 0.08722566122678672,
"grad_norm": 0.8656176328659058,
"learning_rate": 0.00017415730337078653,
"loss": 0.4242,
"step": 155
},
{
"epoch": 0.09003939223410241,
"grad_norm": 0.8114811778068542,
"learning_rate": 0.00017977528089887642,
"loss": 0.3033,
"step": 160
},
{
"epoch": 0.09285312324141812,
"grad_norm": 1.072106957435608,
"learning_rate": 0.0001853932584269663,
"loss": 0.3359,
"step": 165
},
{
"epoch": 0.09566685424873382,
"grad_norm": 0.6422829031944275,
"learning_rate": 0.00019101123595505618,
"loss": 0.2874,
"step": 170
},
{
"epoch": 0.09848058525604952,
"grad_norm": 1.2826429605484009,
"learning_rate": 0.00019662921348314607,
"loss": 0.3826,
"step": 175
},
{
"epoch": 0.10129431626336523,
"grad_norm": 0.24743008613586426,
"learning_rate": 0.00019999922797341667,
"loss": 0.1672,
"step": 180
},
{
"epoch": 0.10410804727068092,
"grad_norm": 0.9253625273704529,
"learning_rate": 0.00019999054281125283,
"loss": 0.2247,
"step": 185
},
{
"epoch": 0.10692177827799662,
"grad_norm": 0.6312052011489868,
"learning_rate": 0.0001999722082946312,
"loss": 0.4353,
"step": 190
},
{
"epoch": 0.10973550928531232,
"grad_norm": 0.7200600504875183,
"learning_rate": 0.00019994422619288159,
"loss": 0.2801,
"step": 195
},
{
"epoch": 0.11254924029262803,
"grad_norm": 0.46093428134918213,
"learning_rate": 0.00019990659920635152,
"loss": 0.2552,
"step": 200
},
{
"epoch": 0.11536297129994373,
"grad_norm": 2.7881016731262207,
"learning_rate": 0.00019985933096614578,
"loss": 0.4449,
"step": 205
},
{
"epoch": 0.11817670230725942,
"grad_norm": 0.7893804311752319,
"learning_rate": 0.00019980242603377573,
"loss": 0.2687,
"step": 210
},
{
"epoch": 0.12099043331457512,
"grad_norm": 0.644159197807312,
"learning_rate": 0.00019973588990071937,
"loss": 0.2525,
"step": 215
},
{
"epoch": 0.12380416432189083,
"grad_norm": 0.7028160095214844,
"learning_rate": 0.00019965972898789125,
"loss": 0.4349,
"step": 220
},
{
"epoch": 0.12661789532920653,
"grad_norm": 0.9951076507568359,
"learning_rate": 0.0001995739506450229,
"loss": 0.3944,
"step": 225
},
{
"epoch": 0.12943162633652222,
"grad_norm": 0.9317317605018616,
"learning_rate": 0.00019947856314995349,
"loss": 0.2633,
"step": 230
},
{
"epoch": 0.13224535734383794,
"grad_norm": 0.426104336977005,
"learning_rate": 0.00019937357570783107,
"loss": 0.2667,
"step": 235
},
{
"epoch": 0.13505908835115363,
"grad_norm": 0.7403711676597595,
"learning_rate": 0.0001992589984502243,
"loss": 0.4544,
"step": 240
},
{
"epoch": 0.13787281935846932,
"grad_norm": 0.7271831035614014,
"learning_rate": 0.0001991348424341445,
"loss": 0.3209,
"step": 245
},
{
"epoch": 0.14068655036578503,
"grad_norm": 2.071850299835205,
"learning_rate": 0.00019900111964097893,
"loss": 0.4721,
"step": 250
},
{
"epoch": 0.14350028137310072,
"grad_norm": 0.5502871870994568,
"learning_rate": 0.0001988578429753342,
"loss": 0.2698,
"step": 255
},
{
"epoch": 0.14631401238041644,
"grad_norm": 1.0685575008392334,
"learning_rate": 0.00019870502626379127,
"loss": 0.3656,
"step": 260
},
{
"epoch": 0.14912774338773213,
"grad_norm": 0.521375834941864,
"learning_rate": 0.00019854268425357105,
"loss": 0.3065,
"step": 265
},
{
"epoch": 0.15194147439504782,
"grad_norm": 0.7491894960403442,
"learning_rate": 0.0001983708326111111,
"loss": 0.3863,
"step": 270
},
{
"epoch": 0.15475520540236354,
"grad_norm": 0.8233822584152222,
"learning_rate": 0.0001981894879205539,
"loss": 0.3434,
"step": 275
},
{
"epoch": 0.15756893640967923,
"grad_norm": 0.9702492952346802,
"learning_rate": 0.0001979986676821465,
"loss": 0.3103,
"step": 280
},
{
"epoch": 0.16038266741699495,
"grad_norm": 1.0381535291671753,
"learning_rate": 0.00019779839031055157,
"loss": 0.4879,
"step": 285
},
{
"epoch": 0.16319639842431063,
"grad_norm": 0.594953179359436,
"learning_rate": 0.00019758867513307047,
"loss": 0.2185,
"step": 290
},
{
"epoch": 0.16601012943162632,
"grad_norm": 0.9089880585670471,
"learning_rate": 0.00019736954238777792,
"loss": 0.4932,
"step": 295
},
{
"epoch": 0.16882386043894204,
"grad_norm": 0.8703072667121887,
"learning_rate": 0.00019714101322156915,
"loss": 0.3456,
"step": 300
},
{
"epoch": 0.17163759144625773,
"grad_norm": 0.9151054620742798,
"learning_rate": 0.00019690310968811914,
"loss": 0.3413,
"step": 305
},
{
"epoch": 0.17445132245357345,
"grad_norm": 0.7803131341934204,
"learning_rate": 0.0001966558547457543,
"loss": 0.2144,
"step": 310
},
{
"epoch": 0.17726505346088914,
"grad_norm": 4.146880626678467,
"learning_rate": 0.00019639927225523698,
"loss": 0.2838,
"step": 315
},
{
"epoch": 0.18007878446820483,
"grad_norm": 0.8946036696434021,
"learning_rate": 0.00019613338697746285,
"loss": 0.3069,
"step": 320
},
{
"epoch": 0.18289251547552055,
"grad_norm": 1.0504130125045776,
"learning_rate": 0.00019585822457107138,
"loss": 0.1849,
"step": 325
},
{
"epoch": 0.18570624648283623,
"grad_norm": 0.5320996642112732,
"learning_rate": 0.0001955738115899698,
"loss": 0.4705,
"step": 330
},
{
"epoch": 0.18851997749015195,
"grad_norm": 1.0972635746002197,
"learning_rate": 0.00019528017548077045,
"loss": 0.1279,
"step": 335
},
{
"epoch": 0.19133370849746764,
"grad_norm": 0.9836655855178833,
"learning_rate": 0.00019497734458014216,
"loss": 0.3454,
"step": 340
},
{
"epoch": 0.19414743950478333,
"grad_norm": 0.9435672163963318,
"learning_rate": 0.00019466534811207569,
"loss": 0.1713,
"step": 345
},
{
"epoch": 0.19696117051209905,
"grad_norm": 0.8675717115402222,
"learning_rate": 0.00019434421618506358,
"loss": 0.4799,
"step": 350
},
{
"epoch": 0.19977490151941474,
"grad_norm": 0.5335102677345276,
"learning_rate": 0.00019401397978919453,
"loss": 0.242,
"step": 355
},
{
"epoch": 0.20258863252673046,
"grad_norm": 0.7480678558349609,
"learning_rate": 0.00019367467079316279,
"loss": 0.3568,
"step": 360
},
{
"epoch": 0.20540236353404615,
"grad_norm": 0.23846450448036194,
"learning_rate": 0.0001933263219411928,
"loss": 0.4519,
"step": 365
},
{
"epoch": 0.20821609454136183,
"grad_norm": 0.24460311233997345,
"learning_rate": 0.00019296896684987925,
"loss": 0.3763,
"step": 370
},
{
"epoch": 0.21102982554867755,
"grad_norm": 0.6336620450019836,
"learning_rate": 0.0001926026400049429,
"loss": 0.3754,
"step": 375
},
{
"epoch": 0.21384355655599324,
"grad_norm": 0.4195510745048523,
"learning_rate": 0.00019222737675790276,
"loss": 0.2576,
"step": 380
},
{
"epoch": 0.21665728756330896,
"grad_norm": 1.3396929502487183,
"learning_rate": 0.00019184321332266452,
"loss": 0.3267,
"step": 385
},
{
"epoch": 0.21947101857062465,
"grad_norm": 0.49325576424598694,
"learning_rate": 0.0001914501867720258,
"loss": 0.2602,
"step": 390
},
{
"epoch": 0.22228474957794034,
"grad_norm": 0.5282377004623413,
"learning_rate": 0.00019104833503409848,
"loss": 0.3498,
"step": 395
},
{
"epoch": 0.22509848058525606,
"grad_norm": 0.6364492774009705,
"learning_rate": 0.00019063769688864866,
"loss": 0.2147,
"step": 400
},
{
"epoch": 0.22791221159257175,
"grad_norm": 0.8976377248764038,
"learning_rate": 0.00019021831196335418,
"loss": 0.328,
"step": 405
},
{
"epoch": 0.23072594259988746,
"grad_norm": 0.4209904372692108,
"learning_rate": 0.0001897902207299805,
"loss": 0.2822,
"step": 410
},
{
"epoch": 0.23353967360720315,
"grad_norm": 0.5531566739082336,
"learning_rate": 0.0001893534645004751,
"loss": 0.4366,
"step": 415
},
{
"epoch": 0.23635340461451884,
"grad_norm": 0.592050313949585,
"learning_rate": 0.00018890808542298073,
"loss": 0.381,
"step": 420
},
{
"epoch": 0.23916713562183456,
"grad_norm": 0.8051882982254028,
"learning_rate": 0.00018845412647776794,
"loss": 0.3602,
"step": 425
},
{
"epoch": 0.24198086662915025,
"grad_norm": 0.7593362331390381,
"learning_rate": 0.0001879916314730875,
"loss": 0.3809,
"step": 430
},
{
"epoch": 0.24479459763646597,
"grad_norm": 1.2135759592056274,
"learning_rate": 0.00018752064504094272,
"loss": 0.2138,
"step": 435
},
{
"epoch": 0.24760832864378166,
"grad_norm": 0.14413990080356598,
"learning_rate": 0.00018704121263278227,
"loss": 0.3506,
"step": 440
},
{
"epoch": 0.2504220596510974,
"grad_norm": 0.6321181058883667,
"learning_rate": 0.00018655338051511413,
"loss": 0.3232,
"step": 445
},
{
"epoch": 0.25323579065841306,
"grad_norm": 0.7276772856712341,
"learning_rate": 0.00018605719576504065,
"loss": 0.2345,
"step": 450
},
{
"epoch": 0.25604952166572875,
"grad_norm": 0.324861079454422,
"learning_rate": 0.00018555270626571555,
"loss": 0.1345,
"step": 455
},
{
"epoch": 0.25886325267304444,
"grad_norm": 0.7779459953308105,
"learning_rate": 0.000185039960701723,
"loss": 0.2958,
"step": 460
},
{
"epoch": 0.26167698368036013,
"grad_norm": 0.6974682211875916,
"learning_rate": 0.0001845190085543795,
"loss": 0.2257,
"step": 465
},
{
"epoch": 0.2644907146876759,
"grad_norm": 0.9312912821769714,
"learning_rate": 0.0001839899000969587,
"loss": 0.353,
"step": 470
},
{
"epoch": 0.26730444569499157,
"grad_norm": 0.49484914541244507,
"learning_rate": 0.00018345268638984003,
"loss": 0.2321,
"step": 475
},
{
"epoch": 0.27011817670230726,
"grad_norm": 0.24110960960388184,
"learning_rate": 0.00018290741927558113,
"loss": 0.2501,
"step": 480
},
{
"epoch": 0.27293190770962295,
"grad_norm": 0.5313132405281067,
"learning_rate": 0.00018235415137391497,
"loss": 0.2477,
"step": 485
},
{
"epoch": 0.27574563871693863,
"grad_norm": 0.6360633373260498,
"learning_rate": 0.00018179293607667178,
"loss": 0.2846,
"step": 490
},
{
"epoch": 0.2785593697242544,
"grad_norm": 0.5810567140579224,
"learning_rate": 0.00018122382754262681,
"loss": 0.2196,
"step": 495
},
{
"epoch": 0.28137310073157007,
"grad_norm": 0.7277317047119141,
"learning_rate": 0.00018064688069227368,
"loss": 0.2656,
"step": 500
},
{
"epoch": 0.28418683173888576,
"grad_norm": 0.7561081051826477,
"learning_rate": 0.00018006215120252453,
"loss": 0.3004,
"step": 505
},
{
"epoch": 0.28700056274620145,
"grad_norm": 0.8930642604827881,
"learning_rate": 0.0001794696955013369,
"loss": 0.445,
"step": 510
},
{
"epoch": 0.28981429375351714,
"grad_norm": 0.9028257727622986,
"learning_rate": 0.00017886957076226838,
"loss": 0.3362,
"step": 515
},
{
"epoch": 0.2926280247608329,
"grad_norm": 0.6070359945297241,
"learning_rate": 0.0001782618348989593,
"loss": 0.1993,
"step": 520
},
{
"epoch": 0.2954417557681486,
"grad_norm": 0.9078888893127441,
"learning_rate": 0.0001776465465595437,
"loss": 0.2554,
"step": 525
},
{
"epoch": 0.29825548677546426,
"grad_norm": 0.7235105633735657,
"learning_rate": 0.0001770237651209898,
"loss": 0.1807,
"step": 530
},
{
"epoch": 0.30106921778277995,
"grad_norm": 0.37401115894317627,
"learning_rate": 0.00017639355068336987,
"loss": 0.153,
"step": 535
},
{
"epoch": 0.30388294879009564,
"grad_norm": 0.4220016896724701,
"learning_rate": 0.00017575596406406048,
"loss": 0.2249,
"step": 540
},
{
"epoch": 0.3066966797974114,
"grad_norm": 0.6732789278030396,
"learning_rate": 0.00017511106679187334,
"loss": 0.1663,
"step": 545
},
{
"epoch": 0.3095104108047271,
"grad_norm": 0.8022940754890442,
"learning_rate": 0.00017445892110111783,
"loss": 0.3083,
"step": 550
},
{
"epoch": 0.31232414181204277,
"grad_norm": 2.0369653701782227,
"learning_rate": 0.00017379958992559493,
"loss": 0.3939,
"step": 555
},
{
"epoch": 0.31513787281935846,
"grad_norm": 1.5206650495529175,
"learning_rate": 0.00017313313689252418,
"loss": 0.3079,
"step": 560
},
{
"epoch": 0.31795160382667415,
"grad_norm": 0.343148410320282,
"learning_rate": 0.00017245962631640341,
"loss": 0.2408,
"step": 565
},
{
"epoch": 0.3207653348339899,
"grad_norm": 0.7586761116981506,
"learning_rate": 0.00017177912319280217,
"loss": 0.2298,
"step": 570
},
{
"epoch": 0.3235790658413056,
"grad_norm": 0.5147440433502197,
"learning_rate": 0.00017109169319208948,
"loss": 0.3067,
"step": 575
},
{
"epoch": 0.32639279684862127,
"grad_norm": 0.5967961549758911,
"learning_rate": 0.0001703974026530966,
"loss": 0.3291,
"step": 580
},
{
"epoch": 0.32920652785593696,
"grad_norm": 0.8705066442489624,
"learning_rate": 0.00016969631857671497,
"loss": 0.3395,
"step": 585
},
{
"epoch": 0.33202025886325265,
"grad_norm": 0.36913836002349854,
"learning_rate": 0.00016898850861943058,
"loss": 0.15,
"step": 590
},
{
"epoch": 0.3348339898705684,
"grad_norm": 1.0335655212402344,
"learning_rate": 0.00016827404108679485,
"loss": 0.2577,
"step": 595
},
{
"epoch": 0.3376477208778841,
"grad_norm": 0.44228196144104004,
"learning_rate": 0.00016755298492683308,
"loss": 0.3735,
"step": 600
},
{
"epoch": 0.3404614518851998,
"grad_norm": 0.7567837238311768,
"learning_rate": 0.0001668254097233907,
"loss": 0.233,
"step": 605
},
{
"epoch": 0.34327518289251546,
"grad_norm": 0.7530750632286072,
"learning_rate": 0.00016609138568941809,
"loss": 0.34,
"step": 610
},
{
"epoch": 0.34608891389983115,
"grad_norm": 0.5381020307540894,
"learning_rate": 0.0001653509836601952,
"loss": 0.3743,
"step": 615
},
{
"epoch": 0.3489026449071469,
"grad_norm": 1.5347527265548706,
"learning_rate": 0.00016460427508649546,
"loss": 0.3224,
"step": 620
},
{
"epoch": 0.3517163759144626,
"grad_norm": 0.7328157424926758,
"learning_rate": 0.0001638513320276907,
"loss": 0.4606,
"step": 625
},
{
"epoch": 0.3545301069217783,
"grad_norm": 0.5422099232673645,
"learning_rate": 0.0001630922271447972,
"loss": 0.239,
"step": 630
},
{
"epoch": 0.35734383792909397,
"grad_norm": 0.12207705527544022,
"learning_rate": 0.0001623270336934638,
"loss": 0.1777,
"step": 635
},
{
"epoch": 0.36015756893640966,
"grad_norm": 0.7163983583450317,
"learning_rate": 0.00016155582551690236,
"loss": 0.308,
"step": 640
},
{
"epoch": 0.3629712999437254,
"grad_norm": 0.5855613350868225,
"learning_rate": 0.00016077867703876182,
"loss": 0.3146,
"step": 645
},
{
"epoch": 0.3657850309510411,
"grad_norm": 0.3779029846191406,
"learning_rate": 0.0001599956632559461,
"loss": 0.2512,
"step": 650
},
{
"epoch": 0.3685987619583568,
"grad_norm": 0.4623885452747345,
"learning_rate": 0.00015920685973137673,
"loss": 0.3344,
"step": 655
},
{
"epoch": 0.37141249296567247,
"grad_norm": 0.5071548819541931,
"learning_rate": 0.00015841234258670065,
"loss": 0.2301,
"step": 660
},
{
"epoch": 0.37422622397298816,
"grad_norm": 0.8452264070510864,
"learning_rate": 0.0001576121884949446,
"loss": 0.3051,
"step": 665
},
{
"epoch": 0.3770399549803039,
"grad_norm": 0.3451974391937256,
"learning_rate": 0.00015680647467311557,
"loss": 0.2358,
"step": 670
},
{
"epoch": 0.3798536859876196,
"grad_norm": 0.7894652485847473,
"learning_rate": 0.0001559952788747495,
"loss": 0.2995,
"step": 675
},
{
"epoch": 0.3826674169949353,
"grad_norm": 0.6503499746322632,
"learning_rate": 0.00015517867938240763,
"loss": 0.3838,
"step": 680
},
{
"epoch": 0.385481148002251,
"grad_norm": 0.8255408406257629,
"learning_rate": 0.00015435675500012212,
"loss": 0.3326,
"step": 685
},
{
"epoch": 0.38829487900956666,
"grad_norm": 0.9840317368507385,
"learning_rate": 0.00015352958504579123,
"loss": 0.3544,
"step": 690
},
{
"epoch": 0.3911086100168824,
"grad_norm": 0.5513814687728882,
"learning_rate": 0.00015269724934352497,
"loss": 0.3565,
"step": 695
},
{
"epoch": 0.3939223410241981,
"grad_norm": 0.9206532835960388,
"learning_rate": 0.00015185982821594175,
"loss": 0.6995,
"step": 700
},
{
"epoch": 0.3967360720315138,
"grad_norm": 0.6624305248260498,
"learning_rate": 0.00015101740247641714,
"loss": 0.236,
"step": 705
},
{
"epoch": 0.3995498030388295,
"grad_norm": 0.4557129442691803,
"learning_rate": 0.00015017005342128517,
"loss": 0.2185,
"step": 710
},
{
"epoch": 0.40236353404614517,
"grad_norm": 0.5107508301734924,
"learning_rate": 0.000149317862821993,
"loss": 0.2376,
"step": 715
},
{
"epoch": 0.4051772650534609,
"grad_norm": 0.6542500257492065,
"learning_rate": 0.00014846091291720957,
"loss": 0.3315,
"step": 720
},
{
"epoch": 0.4079909960607766,
"grad_norm": 1.1817783117294312,
"learning_rate": 0.00014759928640488965,
"loss": 0.27,
"step": 725
},
{
"epoch": 0.4108047270680923,
"grad_norm": 0.47976672649383545,
"learning_rate": 0.00014673306643429314,
"loss": 0.2458,
"step": 730
},
{
"epoch": 0.413618458075408,
"grad_norm": 0.9391474723815918,
"learning_rate": 0.00014586233659796087,
"loss": 0.424,
"step": 735
},
{
"epoch": 0.41643218908272367,
"grad_norm": 0.6755409240722656,
"learning_rate": 0.0001449871809236478,
"loss": 0.3009,
"step": 740
},
{
"epoch": 0.4192459200900394,
"grad_norm": 1.1786988973617554,
"learning_rate": 0.000144107683866214,
"loss": 0.2926,
"step": 745
},
{
"epoch": 0.4220596510973551,
"grad_norm": 0.4564046561717987,
"learning_rate": 0.00014322393029947468,
"loss": 0.3663,
"step": 750
},
{
"epoch": 0.4248733821046708,
"grad_norm": 0.650117814540863,
"learning_rate": 0.00014233600550800962,
"loss": 0.2522,
"step": 755
},
{
"epoch": 0.4276871131119865,
"grad_norm": 0.35542795062065125,
"learning_rate": 0.0001414439951789328,
"loss": 0.1902,
"step": 760
},
{
"epoch": 0.4305008441193022,
"grad_norm": 0.6666870713233948,
"learning_rate": 0.00014054798539362356,
"loss": 0.3625,
"step": 765
},
{
"epoch": 0.4333145751266179,
"grad_norm": 1.3364002704620361,
"learning_rate": 0.00013964806261941944,
"loss": 0.2144,
"step": 770
},
{
"epoch": 0.4361283061339336,
"grad_norm": 0.30019038915634155,
"learning_rate": 0.00013874431370127188,
"loss": 0.205,
"step": 775
},
{
"epoch": 0.4389420371412493,
"grad_norm": 0.7001076936721802,
"learning_rate": 0.0001378368258533654,
"loss": 0.4068,
"step": 780
},
{
"epoch": 0.441755768148565,
"grad_norm": 1.1424932479858398,
"learning_rate": 0.0001369256866507012,
"loss": 0.2298,
"step": 785
},
{
"epoch": 0.4445694991558807,
"grad_norm": 0.8599133491516113,
"learning_rate": 0.00013601098402064607,
"loss": 0.2843,
"step": 790
},
{
"epoch": 0.4473832301631964,
"grad_norm": 0.8263369798660278,
"learning_rate": 0.00013509280623444695,
"loss": 0.2451,
"step": 795
},
{
"epoch": 0.4501969611705121,
"grad_norm": 0.9365822672843933,
"learning_rate": 0.00013417124189871272,
"loss": 0.2621,
"step": 800
},
{
"epoch": 0.4530106921778278,
"grad_norm": 0.47065469622612,
"learning_rate": 0.00013324637994686326,
"loss": 0.191,
"step": 805
},
{
"epoch": 0.4558244231851435,
"grad_norm": 0.6455582976341248,
"learning_rate": 0.00013231830963054722,
"loss": 0.2073,
"step": 810
},
{
"epoch": 0.4586381541924592,
"grad_norm": 0.8905434012413025,
"learning_rate": 0.0001313871205110291,
"loss": 0.3213,
"step": 815
},
{
"epoch": 0.4614518851997749,
"grad_norm": 0.44368186593055725,
"learning_rate": 0.0001304529024505461,
"loss": 0.4487,
"step": 820
},
{
"epoch": 0.4642656162070906,
"grad_norm": 0.3236369788646698,
"learning_rate": 0.00012951574560363636,
"loss": 0.2343,
"step": 825
},
{
"epoch": 0.4670793472144063,
"grad_norm": 0.3316313624382019,
"learning_rate": 0.00012857574040843876,
"loss": 0.1704,
"step": 830
},
{
"epoch": 0.469893078221722,
"grad_norm": 1.180114984512329,
"learning_rate": 0.0001276329775779655,
"loss": 0.2944,
"step": 835
},
{
"epoch": 0.4727068092290377,
"grad_norm": 0.4699708819389343,
"learning_rate": 0.00012668754809134773,
"loss": 0.1709,
"step": 840
},
{
"epoch": 0.47552054023635343,
"grad_norm": 0.5087912082672119,
"learning_rate": 0.00012573954318505624,
"loss": 0.2753,
"step": 845
},
{
"epoch": 0.4783342712436691,
"grad_norm": 0.21406421065330505,
"learning_rate": 0.00012478905434409662,
"loss": 0.2955,
"step": 850
},
{
"epoch": 0.4811480022509848,
"grad_norm": 0.8056962490081787,
"learning_rate": 0.0001238361732931808,
"loss": 0.275,
"step": 855
},
{
"epoch": 0.4839617332583005,
"grad_norm": 0.7347704768180847,
"learning_rate": 0.00012288099198787532,
"loss": 0.2448,
"step": 860
},
{
"epoch": 0.4867754642656162,
"grad_norm": 0.43679895997047424,
"learning_rate": 0.0001219236026057275,
"loss": 0.4004,
"step": 865
},
{
"epoch": 0.48958919527293193,
"grad_norm": 0.4202831983566284,
"learning_rate": 0.00012096409753736991,
"loss": 0.1963,
"step": 870
},
{
"epoch": 0.4924029262802476,
"grad_norm": 0.8716102838516235,
"learning_rate": 0.00012000256937760445,
"loss": 0.225,
"step": 875
},
{
"epoch": 0.4952166572875633,
"grad_norm": 0.2482863813638687,
"learning_rate": 0.00011903911091646684,
"loss": 0.2338,
"step": 880
},
{
"epoch": 0.498030388294879,
"grad_norm": 0.6226937174797058,
"learning_rate": 0.000118073815130272,
"loss": 0.3606,
"step": 885
},
{
"epoch": 0.5008441193021947,
"grad_norm": 0.4387325942516327,
"learning_rate": 0.0001171067751726416,
"loss": 0.231,
"step": 890
},
{
"epoch": 0.5036578503095104,
"grad_norm": 0.26261425018310547,
"learning_rate": 0.00011613808436551454,
"loss": 0.1239,
"step": 895
},
{
"epoch": 0.5064715813168261,
"grad_norm": 1.2383506298065186,
"learning_rate": 0.00011516783619014109,
"loss": 0.2496,
"step": 900
},
{
"epoch": 0.5092853123241418,
"grad_norm": 1.853761911392212,
"learning_rate": 0.00011419612427806172,
"loss": 0.3915,
"step": 905
},
{
"epoch": 0.5120990433314575,
"grad_norm": 0.330138623714447,
"learning_rate": 0.00011322304240207145,
"loss": 0.0917,
"step": 910
},
{
"epoch": 0.5149127743387732,
"grad_norm": 0.41656142473220825,
"learning_rate": 0.00011224868446717036,
"loss": 0.1754,
"step": 915
},
{
"epoch": 0.5177265053460889,
"grad_norm": 0.6251401901245117,
"learning_rate": 0.00011127314450150175,
"loss": 0.3901,
"step": 920
},
{
"epoch": 0.5205402363534046,
"grad_norm": 1.254900336265564,
"learning_rate": 0.00011029651664727798,
"loss": 0.2828,
"step": 925
},
{
"epoch": 0.5233539673607203,
"grad_norm": 0.9572696089744568,
"learning_rate": 0.00010931889515169555,
"loss": 0.2235,
"step": 930
},
{
"epoch": 0.526167698368036,
"grad_norm": 0.8414142727851868,
"learning_rate": 0.00010834037435784008,
"loss": 0.2718,
"step": 935
},
{
"epoch": 0.5289814293753518,
"grad_norm": 0.4331166744232178,
"learning_rate": 0.00010736104869558176,
"loss": 0.2558,
"step": 940
},
{
"epoch": 0.5317951603826674,
"grad_norm": 0.32980430126190186,
"learning_rate": 0.00010638101267246283,
"loss": 0.1117,
"step": 945
},
{
"epoch": 0.5346088913899831,
"grad_norm": 0.7335298657417297,
"learning_rate": 0.00010540036086457723,
"loss": 0.4412,
"step": 950
},
{
"epoch": 0.5374226223972988,
"grad_norm": 0.6139857769012451,
"learning_rate": 0.00010441918790744372,
"loss": 0.2925,
"step": 955
},
{
"epoch": 0.5402363534046145,
"grad_norm": 0.3401097059249878,
"learning_rate": 0.00010343758848687341,
"loss": 0.2625,
"step": 960
},
{
"epoch": 0.5430500844119303,
"grad_norm": 0.3688424229621887,
"learning_rate": 0.00010245565732983227,
"loss": 0.211,
"step": 965
},
{
"epoch": 0.5458638154192459,
"grad_norm": 1.3460103273391724,
"learning_rate": 0.00010147348919529969,
"loss": 0.3091,
"step": 970
},
{
"epoch": 0.5486775464265616,
"grad_norm": 0.4599795937538147,
"learning_rate": 0.00010049117886512404,
"loss": 0.2301,
"step": 975
},
{
"epoch": 0.5514912774338773,
"grad_norm": 0.5787628293037415,
"learning_rate": 9.950882113487598e-05,
"loss": 0.258,
"step": 980
},
{
"epoch": 0.554305008441193,
"grad_norm": 0.8748778104782104,
"learning_rate": 9.852651080470033e-05,
"loss": 0.3606,
"step": 985
},
{
"epoch": 0.5571187394485088,
"grad_norm": 0.4328353703022003,
"learning_rate": 9.754434267016775e-05,
"loss": 0.2004,
"step": 990
},
{
"epoch": 0.5599324704558244,
"grad_norm": 0.9542059898376465,
"learning_rate": 9.656241151312661e-05,
"loss": 0.2206,
"step": 995
},
{
"epoch": 0.5627462014631401,
"grad_norm": 0.3367530405521393,
"learning_rate": 9.558081209255629e-05,
"loss": 0.1936,
"step": 1000
},
{
"epoch": 0.5655599324704558,
"grad_norm": 0.3511320650577545,
"learning_rate": 9.459963913542279e-05,
"loss": 0.1467,
"step": 1005
},
{
"epoch": 0.5683736634777715,
"grad_norm": 0.5722060799598694,
"learning_rate": 9.361898732753716e-05,
"loss": 0.3173,
"step": 1010
},
{
"epoch": 0.5711873944850873,
"grad_norm": 0.5380959510803223,
"learning_rate": 9.263895130441826e-05,
"loss": 0.2697,
"step": 1015
},
{
"epoch": 0.5740011254924029,
"grad_norm": 0.7701444625854492,
"learning_rate": 9.165962564215993e-05,
"loss": 0.2513,
"step": 1020
},
{
"epoch": 0.5768148564997186,
"grad_norm": 0.44029852747917175,
"learning_rate": 9.068110484830447e-05,
"loss": 0.3467,
"step": 1025
},
{
"epoch": 0.5796285875070343,
"grad_norm": 0.6167469620704651,
"learning_rate": 8.970348335272203e-05,
"loss": 0.3191,
"step": 1030
},
{
"epoch": 0.58244231851435,
"grad_norm": 0.8046761751174927,
"learning_rate": 8.872685549849827e-05,
"loss": 0.2874,
"step": 1035
},
{
"epoch": 0.5852560495216658,
"grad_norm": 0.6045218110084534,
"learning_rate": 8.775131553282965e-05,
"loss": 0.2724,
"step": 1040
},
{
"epoch": 0.5880697805289814,
"grad_norm": 0.23991712927818298,
"learning_rate": 8.67769575979286e-05,
"loss": 0.0839,
"step": 1045
},
{
"epoch": 0.5908835115362971,
"grad_norm": 0.5629101395606995,
"learning_rate": 8.580387572193829e-05,
"loss": 0.2608,
"step": 1050
},
{
"epoch": 0.5936972425436128,
"grad_norm": 0.7069487571716309,
"learning_rate": 8.483216380985895e-05,
"loss": 0.359,
"step": 1055
},
{
"epoch": 0.5965109735509285,
"grad_norm": 1.0714657306671143,
"learning_rate": 8.386191563448548e-05,
"loss": 0.2144,
"step": 1060
},
{
"epoch": 0.5993247045582443,
"grad_norm": 0.8178947567939758,
"learning_rate": 8.289322482735844e-05,
"loss": 0.3105,
"step": 1065
},
{
"epoch": 0.6021384355655599,
"grad_norm": 0.7573699951171875,
"learning_rate": 8.192618486972803e-05,
"loss": 0.2918,
"step": 1070
},
{
"epoch": 0.6049521665728756,
"grad_norm": 0.3417803645133972,
"learning_rate": 8.096088908353315e-05,
"loss": 0.1382,
"step": 1075
},
{
"epoch": 0.6077658975801913,
"grad_norm": 0.7476038336753845,
"learning_rate": 7.999743062239557e-05,
"loss": 0.4213,
"step": 1080
},
{
"epoch": 0.610579628587507,
"grad_norm": 0.3231750726699829,
"learning_rate": 7.90359024626301e-05,
"loss": 0.2874,
"step": 1085
},
{
"epoch": 0.6133933595948228,
"grad_norm": 0.5958102345466614,
"learning_rate": 7.807639739427251e-05,
"loss": 0.2656,
"step": 1090
},
{
"epoch": 0.6162070906021384,
"grad_norm": 0.4588276743888855,
"learning_rate": 7.711900801212466e-05,
"loss": 0.1933,
"step": 1095
},
{
"epoch": 0.6190208216094542,
"grad_norm": 0.5570498704910278,
"learning_rate": 7.616382670681924e-05,
"loss": 0.3897,
"step": 1100
},
{
"epoch": 0.6218345526167698,
"grad_norm": 0.41902509331703186,
"learning_rate": 7.521094565590338e-05,
"loss": 0.2403,
"step": 1105
},
{
"epoch": 0.6246482836240855,
"grad_norm": 0.9511467814445496,
"learning_rate": 7.426045681494378e-05,
"loss": 0.3146,
"step": 1110
},
{
"epoch": 0.6274620146314013,
"grad_norm": 1.1212773323059082,
"learning_rate": 7.33124519086523e-05,
"loss": 0.2424,
"step": 1115
},
{
"epoch": 0.6302757456387169,
"grad_norm": 0.8666883111000061,
"learning_rate": 7.236702242203457e-05,
"loss": 0.319,
"step": 1120
},
{
"epoch": 0.6330894766460327,
"grad_norm": 1.2638081312179565,
"learning_rate": 7.142425959156125e-05,
"loss": 0.1587,
"step": 1125
},
{
"epoch": 0.6359032076533483,
"grad_norm": 0.33488303422927856,
"learning_rate": 7.04842543963637e-05,
"loss": 0.2139,
"step": 1130
},
{
"epoch": 0.638716938660664,
"grad_norm": 0.6867479681968689,
"learning_rate": 6.954709754945394e-05,
"loss": 0.2332,
"step": 1135
},
{
"epoch": 0.6415306696679798,
"grad_norm": 0.4780934751033783,
"learning_rate": 6.861287948897091e-05,
"loss": 0.1127,
"step": 1140
},
{
"epoch": 0.6443444006752954,
"grad_norm": 0.8691847920417786,
"learning_rate": 6.768169036945277e-05,
"loss": 0.3039,
"step": 1145
},
{
"epoch": 0.6471581316826112,
"grad_norm": 0.4771972894668579,
"learning_rate": 6.675362005313677e-05,
"loss": 0.2787,
"step": 1150
},
{
"epoch": 0.6499718626899268,
"grad_norm": 0.5366829037666321,
"learning_rate": 6.58287581012873e-05,
"loss": 0.1824,
"step": 1155
},
{
"epoch": 0.6527855936972425,
"grad_norm": 0.28026753664016724,
"learning_rate": 6.490719376555305e-05,
"loss": 0.2074,
"step": 1160
},
{
"epoch": 0.6555993247045583,
"grad_norm": 0.9920913577079773,
"learning_rate": 6.398901597935393e-05,
"loss": 0.3188,
"step": 1165
},
{
"epoch": 0.6584130557118739,
"grad_norm": 0.5217199921607971,
"learning_rate": 6.30743133492988e-05,
"loss": 0.2846,
"step": 1170
},
{
"epoch": 0.6612267867191897,
"grad_norm": 0.5738883018493652,
"learning_rate": 6.216317414663463e-05,
"loss": 0.1972,
"step": 1175
},
{
"epoch": 0.6640405177265053,
"grad_norm": 0.3134082853794098,
"learning_rate": 6.125568629872813e-05,
"loss": 0.1806,
"step": 1180
},
{
"epoch": 0.666854248733821,
"grad_norm": 0.4762999415397644,
"learning_rate": 6.035193738058056e-05,
"loss": 0.2386,
"step": 1185
},
{
"epoch": 0.6696679797411368,
"grad_norm": 0.48775815963745117,
"learning_rate": 5.945201460637645e-05,
"loss": 0.1261,
"step": 1190
},
{
"epoch": 0.6724817107484524,
"grad_norm": 0.5460477471351624,
"learning_rate": 5.855600482106721e-05,
"loss": 0.3201,
"step": 1195
},
{
"epoch": 0.6752954417557682,
"grad_norm": 0.41563519835472107,
"learning_rate": 5.766399449199037e-05,
"loss": 0.2287,
"step": 1200
},
{
"epoch": 0.6781091727630838,
"grad_norm": 0.832744300365448,
"learning_rate": 5.677606970052529e-05,
"loss": 0.5409,
"step": 1205
},
{
"epoch": 0.6809229037703995,
"grad_norm": 0.8101387023925781,
"learning_rate": 5.5892316133786005e-05,
"loss": 0.1934,
"step": 1210
},
{
"epoch": 0.6837366347777153,
"grad_norm": 0.9781274795532227,
"learning_rate": 5.501281907635223e-05,
"loss": 0.1842,
"step": 1215
},
{
"epoch": 0.6865503657850309,
"grad_norm": 0.36751049757003784,
"learning_rate": 5.413766340203914e-05,
"loss": 0.2631,
"step": 1220
},
{
"epoch": 0.6893640967923467,
"grad_norm": 0.3681579828262329,
"learning_rate": 5.3266933565706865e-05,
"loss": 0.2639,
"step": 1225
},
{
"epoch": 0.6921778277996623,
"grad_norm": 0.7795785069465637,
"learning_rate": 5.240071359511035e-05,
"loss": 0.3817,
"step": 1230
},
{
"epoch": 0.694991558806978,
"grad_norm": 0.6714096069335938,
"learning_rate": 5.153908708279045e-05,
"loss": 0.2655,
"step": 1235
},
{
"epoch": 0.6978052898142938,
"grad_norm": 0.6018862724304199,
"learning_rate": 5.0682137178007025e-05,
"loss": 0.2517,
"step": 1240
},
{
"epoch": 0.7006190208216094,
"grad_norm": 0.7170803546905518,
"learning_rate": 4.9829946578714825e-05,
"loss": 0.3097,
"step": 1245
},
{
"epoch": 0.7034327518289252,
"grad_norm": 0.37394005060195923,
"learning_rate": 4.898259752358287e-05,
"loss": 0.1883,
"step": 1250
},
{
"epoch": 0.7062464828362408,
"grad_norm": 2.812126874923706,
"learning_rate": 4.814017178405829e-05,
"loss": 0.1431,
"step": 1255
},
{
"epoch": 0.7090602138435566,
"grad_norm": 0.5540988445281982,
"learning_rate": 4.730275065647506e-05,
"loss": 0.2109,
"step": 1260
},
{
"epoch": 0.7118739448508723,
"grad_norm": 0.9915019869804382,
"learning_rate": 4.6470414954208785e-05,
"loss": 0.2857,
"step": 1265
},
{
"epoch": 0.7146876758581879,
"grad_norm": 1.248504400253296,
"learning_rate": 4.56432449998779e-05,
"loss": 0.4287,
"step": 1270
},
{
"epoch": 0.7175014068655037,
"grad_norm": 0.5127077102661133,
"learning_rate": 4.482132061759239e-05,
"loss": 0.204,
"step": 1275
},
{
"epoch": 0.7203151378728193,
"grad_norm": 0.6224874258041382,
"learning_rate": 4.400472112525051e-05,
"loss": 0.2376,
"step": 1280
},
{
"epoch": 0.7231288688801351,
"grad_norm": 0.3423043191432953,
"learning_rate": 4.3193525326884435e-05,
"loss": 0.1957,
"step": 1285
},
{
"epoch": 0.7259425998874508,
"grad_norm": 0.41790780425071716,
"learning_rate": 4.238781150505542e-05,
"loss": 0.3171,
"step": 1290
},
{
"epoch": 0.7287563308947664,
"grad_norm": 0.550262451171875,
"learning_rate": 4.158765741329935e-05,
"loss": 0.2016,
"step": 1295
},
{
"epoch": 0.7315700619020822,
"grad_norm": 0.5064123868942261,
"learning_rate": 4.079314026862331e-05,
"loss": 0.2747,
"step": 1300
},
{
"epoch": 0.7343837929093978,
"grad_norm": 0.6976874470710754,
"learning_rate": 4.000433674405392e-05,
"loss": 0.2478,
"step": 1305
},
{
"epoch": 0.7371975239167136,
"grad_norm": 0.8951148986816406,
"learning_rate": 3.9221322961238213e-05,
"loss": 0.1747,
"step": 1310
},
{
"epoch": 0.7400112549240293,
"grad_norm": 0.5888150930404663,
"learning_rate": 3.8444174483097675e-05,
"loss": 0.2342,
"step": 1315
},
{
"epoch": 0.7428249859313449,
"grad_norm": 0.39759594202041626,
"learning_rate": 3.7672966306536226e-05,
"loss": 0.2402,
"step": 1320
},
{
"epoch": 0.7456387169386607,
"grad_norm": 1.4384478330612183,
"learning_rate": 3.690777285520281e-05,
"loss": 0.3132,
"step": 1325
},
{
"epoch": 0.7484524479459763,
"grad_norm": 0.4053248465061188,
"learning_rate": 3.614866797230935e-05,
"loss": 0.1615,
"step": 1330
},
{
"epoch": 0.7512661789532921,
"grad_norm": 0.4696710407733917,
"learning_rate": 3.5395724913504545e-05,
"loss": 0.1633,
"step": 1335
},
{
"epoch": 0.7540799099606078,
"grad_norm": 0.3567434847354889,
"learning_rate": 3.464901633980484e-05,
"loss": 0.2388,
"step": 1340
},
{
"epoch": 0.7568936409679234,
"grad_norm": 0.8810656070709229,
"learning_rate": 3.3908614310581924e-05,
"loss": 0.3078,
"step": 1345
},
{
"epoch": 0.7597073719752392,
"grad_norm": 0.9257289171218872,
"learning_rate": 3.3174590276609355e-05,
"loss": 0.4227,
"step": 1350
},
{
"epoch": 0.7625211029825548,
"grad_norm": 0.3970353305339813,
"learning_rate": 3.24470150731669e-05,
"loss": 0.108,
"step": 1355
},
{
"epoch": 0.7653348339898706,
"grad_norm": 0.4147047996520996,
"learning_rate": 3.1725958913205166e-05,
"loss": 0.3138,
"step": 1360
},
{
"epoch": 0.7681485649971863,
"grad_norm": 0.29604053497314453,
"learning_rate": 3.1011491380569425e-05,
"loss": 0.1246,
"step": 1365
},
{
"epoch": 0.770962296004502,
"grad_norm": 0.7796684503555298,
"learning_rate": 3.0303681423285068e-05,
"loss": 0.3338,
"step": 1370
},
{
"epoch": 0.7737760270118177,
"grad_norm": 0.5329720973968506,
"learning_rate": 2.9602597346903406e-05,
"loss": 0.3101,
"step": 1375
},
{
"epoch": 0.7765897580191333,
"grad_norm": 0.6327192187309265,
"learning_rate": 2.8908306807910534e-05,
"loss": 0.1832,
"step": 1380
},
{
"epoch": 0.7794034890264491,
"grad_norm": 0.6063408851623535,
"learning_rate": 2.822087680719783e-05,
"loss": 0.2447,
"step": 1385
},
{
"epoch": 0.7822172200337648,
"grad_norm": 0.3461267650127411,
"learning_rate": 2.754037368359661e-05,
"loss": 0.274,
"step": 1390
},
{
"epoch": 0.7850309510410804,
"grad_norm": 0.598047137260437,
"learning_rate": 2.6866863107475803e-05,
"loss": 0.173,
"step": 1395
},
{
"epoch": 0.7878446820483962,
"grad_norm": 0.7208033800125122,
"learning_rate": 2.620041007440508e-05,
"loss": 0.2908,
"step": 1400
},
{
"epoch": 0.7906584130557118,
"grad_norm": 0.3856890797615051,
"learning_rate": 2.5541078898882187e-05,
"loss": 0.1546,
"step": 1405
},
{
"epoch": 0.7934721440630276,
"grad_norm": 0.9550760388374329,
"learning_rate": 2.4888933208126663e-05,
"loss": 0.2096,
"step": 1410
},
{
"epoch": 0.7962858750703433,
"grad_norm": 0.4413495361804962,
"learning_rate": 2.4244035935939547e-05,
"loss": 0.2607,
"step": 1415
},
{
"epoch": 0.799099606077659,
"grad_norm": 1.8407310247421265,
"learning_rate": 2.360644931663014e-05,
"loss": 0.3178,
"step": 1420
},
{
"epoch": 0.8019133370849747,
"grad_norm": 0.6831107139587402,
"learning_rate": 2.2976234879010218e-05,
"loss": 0.2287,
"step": 1425
},
{
"epoch": 0.8047270680922903,
"grad_norm": 0.7173850536346436,
"learning_rate": 2.2353453440456316e-05,
"loss": 0.2534,
"step": 1430
},
{
"epoch": 0.8075407990996061,
"grad_norm": 0.5183877944946289,
"learning_rate": 2.173816510104073e-05,
"loss": 0.2009,
"step": 1435
},
{
"epoch": 0.8103545301069218,
"grad_norm": 0.50481116771698,
"learning_rate": 2.113042923773164e-05,
"loss": 0.088,
"step": 1440
},
{
"epoch": 0.8131682611142375,
"grad_norm": 0.648539125919342,
"learning_rate": 2.0530304498663143e-05,
"loss": 0.1828,
"step": 1445
},
{
"epoch": 0.8159819921215532,
"grad_norm": 0.6068935990333557,
"learning_rate": 1.9937848797475488e-05,
"loss": 0.2655,
"step": 1450
},
{
"epoch": 0.8187957231288688,
"grad_norm": 0.36234456300735474,
"learning_rate": 1.935311930772632e-05,
"loss": 0.1955,
"step": 1455
},
{
"epoch": 0.8216094541361846,
"grad_norm": 0.5231152176856995,
"learning_rate": 1.877617245737321e-05,
"loss": 0.2071,
"step": 1460
},
{
"epoch": 0.8244231851435003,
"grad_norm": 0.21570482850074768,
"learning_rate": 1.8207063923328237e-05,
"loss": 0.1853,
"step": 1465
},
{
"epoch": 0.827236916150816,
"grad_norm": 0.4301048815250397,
"learning_rate": 1.764584862608507e-05,
"loss": 0.2832,
"step": 1470
},
{
"epoch": 0.8300506471581317,
"grad_norm": 0.44830775260925293,
"learning_rate": 1.7092580724418882e-05,
"loss": 0.2344,
"step": 1475
},
{
"epoch": 0.8328643781654473,
"grad_norm": 0.42212042212486267,
"learning_rate": 1.6547313610159986e-05,
"loss": 0.2679,
"step": 1480
},
{
"epoch": 0.8356781091727631,
"grad_norm": 0.7017850875854492,
"learning_rate": 1.6010099903041332e-05,
"loss": 0.2124,
"step": 1485
},
{
"epoch": 0.8384918401800788,
"grad_norm": 0.8840892910957336,
"learning_rate": 1.5480991445620542e-05,
"loss": 0.2939,
"step": 1490
},
{
"epoch": 0.8413055711873945,
"grad_norm": 0.8503584265708923,
"learning_rate": 1.4960039298277029e-05,
"loss": 0.2429,
"step": 1495
},
{
"epoch": 0.8441193021947102,
"grad_norm": 1.0028765201568604,
"learning_rate": 1.4447293734284474e-05,
"loss": 0.3548,
"step": 1500
},
{
"epoch": 0.8469330332020258,
"grad_norm": 0.5684967637062073,
"learning_rate": 1.3942804234959373e-05,
"loss": 0.2871,
"step": 1505
},
{
"epoch": 0.8497467642093416,
"grad_norm": 0.7405120730400085,
"learning_rate": 1.3446619484885903e-05,
"loss": 0.1683,
"step": 1510
},
{
"epoch": 0.8525604952166573,
"grad_norm": 0.6290687918663025,
"learning_rate": 1.2958787367217751e-05,
"loss": 0.2926,
"step": 1515
},
{
"epoch": 0.855374226223973,
"grad_norm": 0.7487866878509521,
"learning_rate": 1.2479354959057298e-05,
"loss": 0.4543,
"step": 1520
},
{
"epoch": 0.8581879572312887,
"grad_norm": 0.2865360379219055,
"learning_rate": 1.2008368526912506e-05,
"loss": 0.1499,
"step": 1525
},
{
"epoch": 0.8610016882386043,
"grad_norm": 0.7912615537643433,
"learning_rate": 1.1545873522232053e-05,
"loss": 0.4047,
"step": 1530
},
{
"epoch": 0.8638154192459201,
"grad_norm": 0.4929727017879486,
"learning_rate": 1.1091914577019302e-05,
"loss": 0.3078,
"step": 1535
},
{
"epoch": 0.8666291502532358,
"grad_norm": 0.590064287185669,
"learning_rate": 1.0646535499524902e-05,
"loss": 0.2642,
"step": 1540
},
{
"epoch": 0.8694428812605515,
"grad_norm": 0.7222818732261658,
"learning_rate": 1.0209779270019525e-05,
"loss": 0.241,
"step": 1545
},
{
"epoch": 0.8722566122678672,
"grad_norm": 0.8655977249145508,
"learning_rate": 9.781688036645842e-06,
"loss": 0.2761,
"step": 1550
},
{
"epoch": 0.8750703432751828,
"grad_norm": 0.4627645015716553,
"learning_rate": 9.362303111351378e-06,
"loss": 0.1023,
"step": 1555
},
{
"epoch": 0.8778840742824986,
"grad_norm": 0.07925199717283249,
"learning_rate": 8.95166496590153e-06,
"loss": 0.1914,
"step": 1560
},
{
"epoch": 0.8806978052898143,
"grad_norm": 0.3665456771850586,
"learning_rate": 8.549813227974247e-06,
"loss": 0.2617,
"step": 1565
},
{
"epoch": 0.88351153629713,
"grad_norm": 0.8194103240966797,
"learning_rate": 8.156786677335493e-06,
"loss": 0.3987,
"step": 1570
},
{
"epoch": 0.8863252673044457,
"grad_norm": 0.4855242669582367,
"learning_rate": 7.772623242097277e-06,
"loss": 0.2692,
"step": 1575
},
{
"epoch": 0.8891389983117614,
"grad_norm": 0.531732976436615,
"learning_rate": 7.397359995057118e-06,
"loss": 0.2017,
"step": 1580
},
{
"epoch": 0.8919527293190771,
"grad_norm": 0.23579372465610504,
"learning_rate": 7.03103315012078e-06,
"loss": 0.183,
"step": 1585
},
{
"epoch": 0.8947664603263928,
"grad_norm": 0.38668301701545715,
"learning_rate": 6.673678058807198e-06,
"loss": 0.1825,
"step": 1590
},
{
"epoch": 0.8975801913337085,
"grad_norm": 0.5998560190200806,
"learning_rate": 6.325329206837216e-06,
"loss": 0.3018,
"step": 1595
},
{
"epoch": 0.9003939223410242,
"grad_norm": 0.96495521068573,
"learning_rate": 5.986020210805488e-06,
"loss": 0.2978,
"step": 1600
},
{
"epoch": 0.9032076533483399,
"grad_norm": 0.6704295873641968,
"learning_rate": 5.655783814936433e-06,
"loss": 0.1745,
"step": 1605
},
{
"epoch": 0.9060213843556556,
"grad_norm": 0.502069890499115,
"learning_rate": 5.334651887924324e-06,
"loss": 0.1923,
"step": 1610
},
{
"epoch": 0.9088351153629713,
"grad_norm": 0.1441662758588791,
"learning_rate": 5.0226554198578576e-06,
"loss": 0.1279,
"step": 1615
},
{
"epoch": 0.911648846370287,
"grad_norm": 0.6545499563217163,
"learning_rate": 4.719824519229554e-06,
"loss": 0.2999,
"step": 1620
},
{
"epoch": 0.9144625773776027,
"grad_norm": 0.4436165690422058,
"learning_rate": 4.426188410030196e-06,
"loss": 0.2404,
"step": 1625
},
{
"epoch": 0.9172763083849184,
"grad_norm": 0.2450067549943924,
"learning_rate": 4.1417754289286184e-06,
"loss": 0.3257,
"step": 1630
},
{
"epoch": 0.9200900393922341,
"grad_norm": 0.5148952603340149,
"learning_rate": 3.866613022537169e-06,
"loss": 0.1532,
"step": 1635
},
{
"epoch": 0.9229037703995498,
"grad_norm": 0.39606383442878723,
"learning_rate": 3.600727744763044e-06,
"loss": 0.3313,
"step": 1640
},
{
"epoch": 0.9257175014068655,
"grad_norm": 0.5422732830047607,
"learning_rate": 3.344145254245723e-06,
"loss": 0.1593,
"step": 1645
},
{
"epoch": 0.9285312324141812,
"grad_norm": 0.565556526184082,
"learning_rate": 3.0968903118808622e-06,
"loss": 0.3292,
"step": 1650
},
{
"epoch": 0.9313449634214969,
"grad_norm": 0.9481168389320374,
"learning_rate": 2.85898677843085e-06,
"loss": 0.1793,
"step": 1655
},
{
"epoch": 0.9341586944288126,
"grad_norm": 0.5848947167396545,
"learning_rate": 2.6304576122221035e-06,
"loss": 0.2746,
"step": 1660
},
{
"epoch": 0.9369724254361284,
"grad_norm": 0.840390145778656,
"learning_rate": 2.411324866929543e-06,
"loss": 0.3995,
"step": 1665
},
{
"epoch": 0.939786156443444,
"grad_norm": 0.5747278928756714,
"learning_rate": 2.201609689448425e-06,
"loss": 0.2925,
"step": 1670
},
{
"epoch": 0.9425998874507597,
"grad_norm": 0.518104612827301,
"learning_rate": 2.0013323178535102e-06,
"loss": 0.2362,
"step": 1675
},
{
"epoch": 0.9454136184580754,
"grad_norm": 0.5568994879722595,
"learning_rate": 1.810512079446125e-06,
"loss": 0.2395,
"step": 1680
},
{
"epoch": 0.9482273494653911,
"grad_norm": 0.2408752143383026,
"learning_rate": 1.6291673888889302e-06,
"loss": 0.1682,
"step": 1685
},
{
"epoch": 0.9510410804727069,
"grad_norm": 0.3361740708351135,
"learning_rate": 1.4573157464289554e-06,
"loss": 0.1792,
"step": 1690
},
{
"epoch": 0.9538548114800225,
"grad_norm": 0.46246424317359924,
"learning_rate": 1.2949737362087156e-06,
"loss": 0.1895,
"step": 1695
},
{
"epoch": 0.9566685424873382,
"grad_norm": 0.6043664813041687,
"learning_rate": 1.1421570246658242e-06,
"loss": 0.2753,
"step": 1700
},
{
"epoch": 0.9594822734946539,
"grad_norm": 0.5759782195091248,
"learning_rate": 9.988803590211037e-07,
"loss": 0.2,
"step": 1705
},
{
"epoch": 0.9622960045019696,
"grad_norm": 0.7253143787384033,
"learning_rate": 8.6515756585549e-07,
"loss": 0.3226,
"step": 1710
},
{
"epoch": 0.9651097355092854,
"grad_norm": 0.4216267466545105,
"learning_rate": 7.410015497756994e-07,
"loss": 0.2211,
"step": 1715
},
{
"epoch": 0.967923466516601,
"grad_norm": 0.6309015154838562,
"learning_rate": 6.264242921689257e-07,
"loss": 0.2258,
"step": 1720
},
{
"epoch": 0.9707371975239167,
"grad_norm": 0.4388352930545807,
"learning_rate": 5.214368500465305e-07,
"loss": 0.2074,
"step": 1725
},
{
"epoch": 0.9735509285312324,
"grad_norm": 0.36881500482559204,
"learning_rate": 4.260493549771316e-07,
"loss": 0.3136,
"step": 1730
},
{
"epoch": 0.9763646595385481,
"grad_norm": 0.612010657787323,
"learning_rate": 3.4027101210876155e-07,
"loss": 0.29,
"step": 1735
},
{
"epoch": 0.9791783905458639,
"grad_norm": 0.5004868507385254,
"learning_rate": 2.6411009928064556e-07,
"loss": 0.2251,
"step": 1740
},
{
"epoch": 0.9819921215531795,
"grad_norm": 0.3668440878391266,
"learning_rate": 1.9757396622428482e-07,
"loss": 0.1813,
"step": 1745
},
{
"epoch": 0.9848058525604952,
"grad_norm": 0.45063304901123047,
"learning_rate": 1.406690338542349e-07,
"loss": 0.1792,
"step": 1750
},
{
"epoch": 0.9876195835678109,
"grad_norm": 0.5779865384101868,
"learning_rate": 9.340079364847931e-08,
"loss": 0.2583,
"step": 1755
},
{
"epoch": 0.9904333145751266,
"grad_norm": 0.5747032165527344,
"learning_rate": 5.5773807118442154e-08,
"loss": 0.2567,
"step": 1760
},
{
"epoch": 0.9932470455824424,
"grad_norm": 0.49229690432548523,
"learning_rate": 2.7791705368818143e-08,
"loss": 0.1704,
"step": 1765
},
{
"epoch": 0.996060776589758,
"grad_norm": 0.24515922367572784,
"learning_rate": 9.457188747186151e-09,
"loss": 0.1709,
"step": 1770
},
{
"epoch": 0.9988745075970737,
"grad_norm": 0.45681869983673096,
"learning_rate": 7.720265833510709e-10,
"loss": 0.2311,
"step": 1775
},
{
"epoch": 1.0,
"step": 1777,
"total_flos": 6.548002937199657e+17,
"train_loss": 0.29464965595847514,
"train_runtime": 5546.9812,
"train_samples_per_second": 2.563,
"train_steps_per_second": 0.32
}
],
"logging_steps": 5,
"max_steps": 1777,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.548002937199657e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}