test_model_3B / trainer_state.json
liyang619's picture
Upload folder using huggingface_hub
54b792a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.8377807640622,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007679017085813016,
"grad_norm": 9.727725365375315,
"learning_rate": 1.9969278033794163e-05,
"loss": 1.0813,
"step": 10
},
{
"epoch": 0.015358034171626032,
"grad_norm": 5.891390510276554,
"learning_rate": 1.993855606758833e-05,
"loss": 0.2409,
"step": 20
},
{
"epoch": 0.02303705125743905,
"grad_norm": 5.316733043697516,
"learning_rate": 1.990783410138249e-05,
"loss": 0.1393,
"step": 30
},
{
"epoch": 0.030716068343252065,
"grad_norm": 1.7740681981145512,
"learning_rate": 1.9877112135176652e-05,
"loss": 0.0493,
"step": 40
},
{
"epoch": 0.03839508542906508,
"grad_norm": 0.21916356325831438,
"learning_rate": 1.9846390168970814e-05,
"loss": 0.0162,
"step": 50
},
{
"epoch": 0.0460741025148781,
"grad_norm": 0.10526533851381863,
"learning_rate": 1.981566820276498e-05,
"loss": 0.0091,
"step": 60
},
{
"epoch": 0.05375311960069111,
"grad_norm": 0.09814045024832824,
"learning_rate": 1.978494623655914e-05,
"loss": 0.0059,
"step": 70
},
{
"epoch": 0.06143213668650413,
"grad_norm": 0.07005501460032271,
"learning_rate": 1.9754224270353303e-05,
"loss": 0.004,
"step": 80
},
{
"epoch": 0.06911115377231715,
"grad_norm": 0.05895087166496951,
"learning_rate": 1.9723502304147465e-05,
"loss": 0.0029,
"step": 90
},
{
"epoch": 0.07679017085813016,
"grad_norm": 0.05370999256341492,
"learning_rate": 1.969278033794163e-05,
"loss": 0.0021,
"step": 100
},
{
"epoch": 0.08446918794394317,
"grad_norm": 0.03834816398116016,
"learning_rate": 1.9662058371735792e-05,
"loss": 0.0017,
"step": 110
},
{
"epoch": 0.0921482050297562,
"grad_norm": 0.02734932863664795,
"learning_rate": 1.9631336405529954e-05,
"loss": 0.0013,
"step": 120
},
{
"epoch": 0.0998272221155692,
"grad_norm": 0.036636985676517964,
"learning_rate": 1.960061443932412e-05,
"loss": 0.0011,
"step": 130
},
{
"epoch": 0.10750623920138222,
"grad_norm": 0.02624781225455215,
"learning_rate": 1.956989247311828e-05,
"loss": 0.001,
"step": 140
},
{
"epoch": 0.11518525628719524,
"grad_norm": 0.020107478390890325,
"learning_rate": 1.9539170506912443e-05,
"loss": 0.0009,
"step": 150
},
{
"epoch": 0.12286427337300826,
"grad_norm": 0.01800807960296447,
"learning_rate": 1.9508448540706605e-05,
"loss": 0.0008,
"step": 160
},
{
"epoch": 0.13054329045882127,
"grad_norm": 0.020228009242971072,
"learning_rate": 1.947772657450077e-05,
"loss": 0.0008,
"step": 170
},
{
"epoch": 0.1382223075446343,
"grad_norm": 0.014493824912246944,
"learning_rate": 1.9447004608294932e-05,
"loss": 0.0007,
"step": 180
},
{
"epoch": 0.1459013246304473,
"grad_norm": 0.017644672369553523,
"learning_rate": 1.9416282642089094e-05,
"loss": 0.0007,
"step": 190
},
{
"epoch": 0.1535803417162603,
"grad_norm": 0.010751743718197211,
"learning_rate": 1.9385560675883256e-05,
"loss": 0.0007,
"step": 200
},
{
"epoch": 0.16125935880207334,
"grad_norm": 0.010740308456505745,
"learning_rate": 1.935483870967742e-05,
"loss": 0.0006,
"step": 210
},
{
"epoch": 0.16893837588788635,
"grad_norm": 0.012713528482636717,
"learning_rate": 1.9324116743471583e-05,
"loss": 0.0006,
"step": 220
},
{
"epoch": 0.17661739297369936,
"grad_norm": 0.015802969815143914,
"learning_rate": 1.9293394777265745e-05,
"loss": 0.0006,
"step": 230
},
{
"epoch": 0.1842964100595124,
"grad_norm": 0.014091331879241371,
"learning_rate": 1.926267281105991e-05,
"loss": 0.0006,
"step": 240
},
{
"epoch": 0.1919754271453254,
"grad_norm": 0.015514382294291343,
"learning_rate": 1.923195084485407e-05,
"loss": 0.0006,
"step": 250
},
{
"epoch": 0.1996544442311384,
"grad_norm": 0.013206328203358594,
"learning_rate": 1.9201228878648233e-05,
"loss": 0.0005,
"step": 260
},
{
"epoch": 0.20733346131695143,
"grad_norm": 0.009242022580630511,
"learning_rate": 1.91705069124424e-05,
"loss": 0.0006,
"step": 270
},
{
"epoch": 0.21501247840276444,
"grad_norm": 0.020571611640529425,
"learning_rate": 1.913978494623656e-05,
"loss": 0.0006,
"step": 280
},
{
"epoch": 0.22269149548857747,
"grad_norm": 0.010955465950333863,
"learning_rate": 1.9109062980030722e-05,
"loss": 0.0006,
"step": 290
},
{
"epoch": 0.23037051257439048,
"grad_norm": 0.010007446119499878,
"learning_rate": 1.9078341013824884e-05,
"loss": 0.0005,
"step": 300
},
{
"epoch": 0.23804952966020348,
"grad_norm": 0.007350763980205989,
"learning_rate": 1.904761904761905e-05,
"loss": 0.0005,
"step": 310
},
{
"epoch": 0.24572854674601652,
"grad_norm": 0.01779827202427705,
"learning_rate": 1.901689708141321e-05,
"loss": 0.0005,
"step": 320
},
{
"epoch": 0.25340756383182955,
"grad_norm": 0.011252332653060129,
"learning_rate": 1.8986175115207373e-05,
"loss": 0.0005,
"step": 330
},
{
"epoch": 0.26108658091764253,
"grad_norm": 0.008048724916684773,
"learning_rate": 1.895545314900154e-05,
"loss": 0.0005,
"step": 340
},
{
"epoch": 0.26876559800345556,
"grad_norm": 0.008708169011978722,
"learning_rate": 1.89247311827957e-05,
"loss": 0.0005,
"step": 350
},
{
"epoch": 0.2764446150892686,
"grad_norm": 0.011334018261947772,
"learning_rate": 1.8894009216589862e-05,
"loss": 0.0005,
"step": 360
},
{
"epoch": 0.2841236321750816,
"grad_norm": 0.012541055386254458,
"learning_rate": 1.8863287250384027e-05,
"loss": 0.0005,
"step": 370
},
{
"epoch": 0.2918026492608946,
"grad_norm": 0.00797864009319901,
"learning_rate": 1.883256528417819e-05,
"loss": 0.0005,
"step": 380
},
{
"epoch": 0.29948166634670764,
"grad_norm": 0.020543218292001533,
"learning_rate": 1.880184331797235e-05,
"loss": 0.0005,
"step": 390
},
{
"epoch": 0.3071606834325206,
"grad_norm": 0.008833716167768964,
"learning_rate": 1.8771121351766516e-05,
"loss": 0.0005,
"step": 400
},
{
"epoch": 0.31483970051833365,
"grad_norm": 0.01304769806714082,
"learning_rate": 1.8740399385560678e-05,
"loss": 0.0005,
"step": 410
},
{
"epoch": 0.3225187176041467,
"grad_norm": 0.0069234640254174085,
"learning_rate": 1.870967741935484e-05,
"loss": 0.0005,
"step": 420
},
{
"epoch": 0.33019773468995967,
"grad_norm": 0.006673799615850355,
"learning_rate": 1.8678955453149005e-05,
"loss": 0.0005,
"step": 430
},
{
"epoch": 0.3378767517757727,
"grad_norm": 0.008446803942572372,
"learning_rate": 1.8648233486943167e-05,
"loss": 0.0005,
"step": 440
},
{
"epoch": 0.34555576886158573,
"grad_norm": 0.007927264290360291,
"learning_rate": 1.861751152073733e-05,
"loss": 0.0005,
"step": 450
},
{
"epoch": 0.3532347859473987,
"grad_norm": 0.008214795632921428,
"learning_rate": 1.858678955453149e-05,
"loss": 0.0005,
"step": 460
},
{
"epoch": 0.36091380303321174,
"grad_norm": 0.0753894373723428,
"learning_rate": 1.8556067588325656e-05,
"loss": 0.0005,
"step": 470
},
{
"epoch": 0.3685928201190248,
"grad_norm": 0.023758962572301385,
"learning_rate": 1.8525345622119818e-05,
"loss": 0.0011,
"step": 480
},
{
"epoch": 0.37627183720483776,
"grad_norm": 0.008016402676739795,
"learning_rate": 1.849462365591398e-05,
"loss": 0.0007,
"step": 490
},
{
"epoch": 0.3839508542906508,
"grad_norm": 0.008412657527867743,
"learning_rate": 1.8463901689708145e-05,
"loss": 0.0005,
"step": 500
},
{
"epoch": 0.3916298713764638,
"grad_norm": 0.009837140434012532,
"learning_rate": 1.8433179723502307e-05,
"loss": 0.0005,
"step": 510
},
{
"epoch": 0.3993088884622768,
"grad_norm": 0.007645537883711128,
"learning_rate": 1.840245775729647e-05,
"loss": 0.0005,
"step": 520
},
{
"epoch": 0.40698790554808983,
"grad_norm": 0.009833102435437406,
"learning_rate": 1.837173579109063e-05,
"loss": 0.0005,
"step": 530
},
{
"epoch": 0.41466692263390287,
"grad_norm": 0.011054813245860173,
"learning_rate": 1.8341013824884796e-05,
"loss": 0.0004,
"step": 540
},
{
"epoch": 0.4223459397197159,
"grad_norm": 0.007490270162929283,
"learning_rate": 1.8310291858678958e-05,
"loss": 0.0004,
"step": 550
},
{
"epoch": 0.4300249568055289,
"grad_norm": 0.004870404631077036,
"learning_rate": 1.827956989247312e-05,
"loss": 0.0004,
"step": 560
},
{
"epoch": 0.4377039738913419,
"grad_norm": 0.0066189666941918876,
"learning_rate": 1.8248847926267285e-05,
"loss": 0.0004,
"step": 570
},
{
"epoch": 0.44538299097715495,
"grad_norm": 0.006057086083441172,
"learning_rate": 1.8218125960061447e-05,
"loss": 0.0004,
"step": 580
},
{
"epoch": 0.4530620080629679,
"grad_norm": 0.013285236122907804,
"learning_rate": 1.818740399385561e-05,
"loss": 0.0004,
"step": 590
},
{
"epoch": 0.46074102514878096,
"grad_norm": 0.0060099205496900975,
"learning_rate": 1.815668202764977e-05,
"loss": 0.0004,
"step": 600
},
{
"epoch": 0.468420042234594,
"grad_norm": 0.014047318223341236,
"learning_rate": 1.8125960061443936e-05,
"loss": 0.0004,
"step": 610
},
{
"epoch": 0.47609905932040697,
"grad_norm": 0.006607072647025668,
"learning_rate": 1.8095238095238097e-05,
"loss": 0.0004,
"step": 620
},
{
"epoch": 0.48377807640622,
"grad_norm": 0.006231396108942757,
"learning_rate": 1.806451612903226e-05,
"loss": 0.0004,
"step": 630
},
{
"epoch": 0.49145709349203304,
"grad_norm": 0.007977818806990321,
"learning_rate": 1.803379416282642e-05,
"loss": 0.0004,
"step": 640
},
{
"epoch": 0.499136110577846,
"grad_norm": 0.004452562656745065,
"learning_rate": 1.8003072196620586e-05,
"loss": 0.0004,
"step": 650
},
{
"epoch": 0.5068151276636591,
"grad_norm": 0.014825032602307369,
"learning_rate": 1.7972350230414748e-05,
"loss": 0.0004,
"step": 660
},
{
"epoch": 0.5144941447494721,
"grad_norm": 0.007726258660291533,
"learning_rate": 1.794162826420891e-05,
"loss": 0.0004,
"step": 670
},
{
"epoch": 0.5221731618352851,
"grad_norm": 0.004901555909874864,
"learning_rate": 1.7910906298003075e-05,
"loss": 0.0004,
"step": 680
},
{
"epoch": 0.5298521789210981,
"grad_norm": 0.008374026896139112,
"learning_rate": 1.7880184331797237e-05,
"loss": 0.0004,
"step": 690
},
{
"epoch": 0.5375311960069111,
"grad_norm": 0.012547419917709076,
"learning_rate": 1.78494623655914e-05,
"loss": 0.0004,
"step": 700
},
{
"epoch": 0.5452102130927241,
"grad_norm": 0.006258152294015092,
"learning_rate": 1.781874039938556e-05,
"loss": 0.0003,
"step": 710
},
{
"epoch": 0.5528892301785372,
"grad_norm": 0.005083584292665606,
"learning_rate": 1.7788018433179726e-05,
"loss": 0.0004,
"step": 720
},
{
"epoch": 0.5605682472643502,
"grad_norm": 0.004891854049254944,
"learning_rate": 1.7757296466973888e-05,
"loss": 0.0003,
"step": 730
},
{
"epoch": 0.5682472643501632,
"grad_norm": 0.0072081474858841825,
"learning_rate": 1.772657450076805e-05,
"loss": 0.0003,
"step": 740
},
{
"epoch": 0.5759262814359762,
"grad_norm": 0.007757147865401733,
"learning_rate": 1.7695852534562215e-05,
"loss": 0.0003,
"step": 750
},
{
"epoch": 0.5836052985217892,
"grad_norm": 0.009479916603674836,
"learning_rate": 1.7665130568356377e-05,
"loss": 0.0003,
"step": 760
},
{
"epoch": 0.5912843156076022,
"grad_norm": 0.0032364838449939247,
"learning_rate": 1.763440860215054e-05,
"loss": 0.0003,
"step": 770
},
{
"epoch": 0.5989633326934153,
"grad_norm": 0.006901552705099111,
"learning_rate": 1.76036866359447e-05,
"loss": 0.0003,
"step": 780
},
{
"epoch": 0.6066423497792283,
"grad_norm": 0.015422089144709121,
"learning_rate": 1.7572964669738866e-05,
"loss": 0.0004,
"step": 790
},
{
"epoch": 0.6143213668650412,
"grad_norm": 0.008708065391951672,
"learning_rate": 1.7542242703533028e-05,
"loss": 0.0003,
"step": 800
},
{
"epoch": 0.6220003839508543,
"grad_norm": 0.005612862812218234,
"learning_rate": 1.751152073732719e-05,
"loss": 0.0004,
"step": 810
},
{
"epoch": 0.6296794010366673,
"grad_norm": 0.004164448177684985,
"learning_rate": 1.748079877112135e-05,
"loss": 0.0003,
"step": 820
},
{
"epoch": 0.6373584181224803,
"grad_norm": 0.005805489468439844,
"learning_rate": 1.7450076804915517e-05,
"loss": 0.0003,
"step": 830
},
{
"epoch": 0.6450374352082934,
"grad_norm": 0.0058336408998533274,
"learning_rate": 1.741935483870968e-05,
"loss": 0.0003,
"step": 840
},
{
"epoch": 0.6527164522941064,
"grad_norm": 0.004581976763955479,
"learning_rate": 1.738863287250384e-05,
"loss": 0.0003,
"step": 850
},
{
"epoch": 0.6603954693799193,
"grad_norm": 0.007306575244655952,
"learning_rate": 1.7357910906298005e-05,
"loss": 0.0003,
"step": 860
},
{
"epoch": 0.6680744864657324,
"grad_norm": 0.003820631484850308,
"learning_rate": 1.7327188940092167e-05,
"loss": 0.0003,
"step": 870
},
{
"epoch": 0.6757535035515454,
"grad_norm": 0.0051408749682255814,
"learning_rate": 1.729646697388633e-05,
"loss": 0.0003,
"step": 880
},
{
"epoch": 0.6834325206373584,
"grad_norm": 0.006921982048724673,
"learning_rate": 1.726574500768049e-05,
"loss": 0.0003,
"step": 890
},
{
"epoch": 0.6911115377231715,
"grad_norm": 0.004784894049846067,
"learning_rate": 1.7235023041474656e-05,
"loss": 0.0003,
"step": 900
},
{
"epoch": 0.6987905548089844,
"grad_norm": 0.0031159854313391635,
"learning_rate": 1.7204301075268818e-05,
"loss": 0.0003,
"step": 910
},
{
"epoch": 0.7064695718947974,
"grad_norm": 0.006605791714352256,
"learning_rate": 1.717357910906298e-05,
"loss": 0.0003,
"step": 920
},
{
"epoch": 0.7141485889806105,
"grad_norm": 0.004936488600986064,
"learning_rate": 1.7142857142857142e-05,
"loss": 0.0003,
"step": 930
},
{
"epoch": 0.7218276060664235,
"grad_norm": 0.007674349562955001,
"learning_rate": 1.7112135176651307e-05,
"loss": 0.0003,
"step": 940
},
{
"epoch": 0.7295066231522365,
"grad_norm": 0.0076727356784045695,
"learning_rate": 1.708141321044547e-05,
"loss": 0.0003,
"step": 950
},
{
"epoch": 0.7371856402380496,
"grad_norm": 0.00553702977922986,
"learning_rate": 1.705069124423963e-05,
"loss": 0.0003,
"step": 960
},
{
"epoch": 0.7448646573238625,
"grad_norm": 0.007299313911910154,
"learning_rate": 1.7019969278033796e-05,
"loss": 0.0003,
"step": 970
},
{
"epoch": 0.7525436744096755,
"grad_norm": 0.004258926555903475,
"learning_rate": 1.6989247311827958e-05,
"loss": 0.0003,
"step": 980
},
{
"epoch": 0.7602226914954886,
"grad_norm": 0.004547740732820229,
"learning_rate": 1.695852534562212e-05,
"loss": 0.0003,
"step": 990
},
{
"epoch": 0.7679017085813016,
"grad_norm": 0.005203964533047756,
"learning_rate": 1.6927803379416285e-05,
"loss": 0.0003,
"step": 1000
},
{
"epoch": 0.7755807256671146,
"grad_norm": 0.01302332966364172,
"learning_rate": 1.6897081413210447e-05,
"loss": 0.0003,
"step": 1010
},
{
"epoch": 0.7832597427529276,
"grad_norm": 0.0093180048231896,
"learning_rate": 1.686635944700461e-05,
"loss": 0.0003,
"step": 1020
},
{
"epoch": 0.7909387598387406,
"grad_norm": 0.008803117247590506,
"learning_rate": 1.683563748079877e-05,
"loss": 0.0003,
"step": 1030
},
{
"epoch": 0.7986177769245536,
"grad_norm": 0.0048423473942718395,
"learning_rate": 1.6804915514592936e-05,
"loss": 0.0003,
"step": 1040
},
{
"epoch": 0.8062967940103667,
"grad_norm": 0.00871594900669087,
"learning_rate": 1.6774193548387098e-05,
"loss": 0.0003,
"step": 1050
},
{
"epoch": 0.8139758110961797,
"grad_norm": 0.004213591332826992,
"learning_rate": 1.674347158218126e-05,
"loss": 0.0003,
"step": 1060
},
{
"epoch": 0.8216548281819928,
"grad_norm": 0.008233410874863824,
"learning_rate": 1.6712749615975425e-05,
"loss": 0.0003,
"step": 1070
},
{
"epoch": 0.8293338452678057,
"grad_norm": 0.004842583832484554,
"learning_rate": 1.6682027649769587e-05,
"loss": 0.0003,
"step": 1080
},
{
"epoch": 0.8370128623536187,
"grad_norm": 0.012551492004310723,
"learning_rate": 1.665130568356375e-05,
"loss": 0.0003,
"step": 1090
},
{
"epoch": 0.8446918794394318,
"grad_norm": 0.010999047619194315,
"learning_rate": 1.6620583717357914e-05,
"loss": 0.0003,
"step": 1100
},
{
"epoch": 0.8523708965252448,
"grad_norm": 0.00940161449318046,
"learning_rate": 1.6589861751152075e-05,
"loss": 0.0003,
"step": 1110
},
{
"epoch": 0.8600499136110578,
"grad_norm": 0.005629135501887116,
"learning_rate": 1.6559139784946237e-05,
"loss": 0.0003,
"step": 1120
},
{
"epoch": 0.8677289306968708,
"grad_norm": 0.005695864014209226,
"learning_rate": 1.6528417818740403e-05,
"loss": 0.0003,
"step": 1130
},
{
"epoch": 0.8754079477826838,
"grad_norm": 0.008141397000999681,
"learning_rate": 1.6497695852534564e-05,
"loss": 0.0003,
"step": 1140
},
{
"epoch": 0.8830869648684968,
"grad_norm": 0.010312822716836551,
"learning_rate": 1.6466973886328726e-05,
"loss": 0.0003,
"step": 1150
},
{
"epoch": 0.8907659819543099,
"grad_norm": 0.004299526696605698,
"learning_rate": 1.643625192012289e-05,
"loss": 0.0003,
"step": 1160
},
{
"epoch": 0.8984449990401229,
"grad_norm": 0.007880227129562899,
"learning_rate": 1.6405529953917053e-05,
"loss": 0.0003,
"step": 1170
},
{
"epoch": 0.9061240161259358,
"grad_norm": 0.0038386080131062204,
"learning_rate": 1.6374807987711215e-05,
"loss": 0.0003,
"step": 1180
},
{
"epoch": 0.9138030332117489,
"grad_norm": 0.005902343886207709,
"learning_rate": 1.6344086021505377e-05,
"loss": 0.0003,
"step": 1190
},
{
"epoch": 0.9214820502975619,
"grad_norm": 0.005315908218014497,
"learning_rate": 1.6313364055299542e-05,
"loss": 0.0003,
"step": 1200
},
{
"epoch": 0.9291610673833749,
"grad_norm": 0.004817638329770463,
"learning_rate": 1.6282642089093704e-05,
"loss": 0.0003,
"step": 1210
},
{
"epoch": 0.936840084469188,
"grad_norm": 0.004282276912369252,
"learning_rate": 1.6251920122887866e-05,
"loss": 0.0003,
"step": 1220
},
{
"epoch": 0.944519101555001,
"grad_norm": 0.0030553467454727474,
"learning_rate": 1.622119815668203e-05,
"loss": 0.0003,
"step": 1230
},
{
"epoch": 0.9521981186408139,
"grad_norm": 0.003669066757736195,
"learning_rate": 1.6190476190476193e-05,
"loss": 0.0003,
"step": 1240
},
{
"epoch": 0.959877135726627,
"grad_norm": 0.007997281797816693,
"learning_rate": 1.6159754224270355e-05,
"loss": 0.0003,
"step": 1250
},
{
"epoch": 0.96755615281244,
"grad_norm": 0.0033293025096136717,
"learning_rate": 1.6129032258064517e-05,
"loss": 0.0003,
"step": 1260
},
{
"epoch": 0.975235169898253,
"grad_norm": 0.0090924908378621,
"learning_rate": 1.6098310291858682e-05,
"loss": 0.0003,
"step": 1270
},
{
"epoch": 0.9829141869840661,
"grad_norm": 0.01333845903017061,
"learning_rate": 1.6067588325652844e-05,
"loss": 0.0003,
"step": 1280
},
{
"epoch": 0.990593204069879,
"grad_norm": 0.004317437784916082,
"learning_rate": 1.6036866359447006e-05,
"loss": 0.0003,
"step": 1290
},
{
"epoch": 0.998272221155692,
"grad_norm": 0.006218136692693676,
"learning_rate": 1.600614439324117e-05,
"loss": 0.0003,
"step": 1300
},
{
"epoch": 1.005375311960069,
"grad_norm": 0.012722862337500376,
"learning_rate": 1.5975422427035333e-05,
"loss": 0.0003,
"step": 1310
},
{
"epoch": 1.0130543290458822,
"grad_norm": 0.012347661878448313,
"learning_rate": 1.5944700460829495e-05,
"loss": 0.0003,
"step": 1320
},
{
"epoch": 1.0207333461316952,
"grad_norm": 0.013680655581236744,
"learning_rate": 1.5913978494623657e-05,
"loss": 0.0003,
"step": 1330
},
{
"epoch": 1.0284123632175082,
"grad_norm": 0.008212234787974039,
"learning_rate": 1.5883256528417822e-05,
"loss": 0.0003,
"step": 1340
},
{
"epoch": 1.0360913803033212,
"grad_norm": 0.004326507441533446,
"learning_rate": 1.5852534562211984e-05,
"loss": 0.0003,
"step": 1350
},
{
"epoch": 1.0437703973891341,
"grad_norm": 0.006702741273813344,
"learning_rate": 1.5821812596006145e-05,
"loss": 0.0003,
"step": 1360
},
{
"epoch": 1.051449414474947,
"grad_norm": 0.007151020722561525,
"learning_rate": 1.5791090629800307e-05,
"loss": 0.0003,
"step": 1370
},
{
"epoch": 1.0591284315607603,
"grad_norm": 0.012743255489323653,
"learning_rate": 1.5760368663594473e-05,
"loss": 0.0003,
"step": 1380
},
{
"epoch": 1.0668074486465733,
"grad_norm": 0.008818497275557231,
"learning_rate": 1.5729646697388634e-05,
"loss": 0.0003,
"step": 1390
},
{
"epoch": 1.0744864657323863,
"grad_norm": 0.002839593309524096,
"learning_rate": 1.5698924731182796e-05,
"loss": 0.0003,
"step": 1400
},
{
"epoch": 1.0821654828181992,
"grad_norm": 0.006499416407964513,
"learning_rate": 1.566820276497696e-05,
"loss": 0.0003,
"step": 1410
},
{
"epoch": 1.0898444999040122,
"grad_norm": 0.0043902156072960176,
"learning_rate": 1.5637480798771123e-05,
"loss": 0.0003,
"step": 1420
},
{
"epoch": 1.0975235169898252,
"grad_norm": 0.011284861249978177,
"learning_rate": 1.5606758832565285e-05,
"loss": 0.0003,
"step": 1430
},
{
"epoch": 1.1052025340756384,
"grad_norm": 0.004036096758704275,
"learning_rate": 1.5576036866359447e-05,
"loss": 0.0003,
"step": 1440
},
{
"epoch": 1.1128815511614514,
"grad_norm": 0.006193352867852987,
"learning_rate": 1.5545314900153612e-05,
"loss": 0.0003,
"step": 1450
},
{
"epoch": 1.1205605682472644,
"grad_norm": 0.0043243328755989815,
"learning_rate": 1.5514592933947774e-05,
"loss": 0.0003,
"step": 1460
},
{
"epoch": 1.1282395853330773,
"grad_norm": 0.005942275681812123,
"learning_rate": 1.5483870967741936e-05,
"loss": 0.0003,
"step": 1470
},
{
"epoch": 1.1359186024188903,
"grad_norm": 0.004708153262372667,
"learning_rate": 1.5453149001536098e-05,
"loss": 0.0003,
"step": 1480
},
{
"epoch": 1.1435976195047033,
"grad_norm": 0.007691673975032789,
"learning_rate": 1.5422427035330263e-05,
"loss": 0.0003,
"step": 1490
},
{
"epoch": 1.1512766365905165,
"grad_norm": 0.012308448692961365,
"learning_rate": 1.5391705069124425e-05,
"loss": 0.0003,
"step": 1500
},
{
"epoch": 1.1589556536763295,
"grad_norm": 0.008574072497790838,
"learning_rate": 1.5360983102918587e-05,
"loss": 0.0003,
"step": 1510
},
{
"epoch": 1.1666346707621424,
"grad_norm": 0.0022349584876796874,
"learning_rate": 1.5330261136712752e-05,
"loss": 0.0003,
"step": 1520
},
{
"epoch": 1.1743136878479554,
"grad_norm": 0.0039426732001856726,
"learning_rate": 1.5299539170506914e-05,
"loss": 0.0003,
"step": 1530
},
{
"epoch": 1.1819927049337684,
"grad_norm": 0.0076643241645843055,
"learning_rate": 1.5268817204301076e-05,
"loss": 0.0003,
"step": 1540
},
{
"epoch": 1.1896717220195816,
"grad_norm": 0.00801480580123044,
"learning_rate": 1.523809523809524e-05,
"loss": 0.0003,
"step": 1550
},
{
"epoch": 1.1973507391053946,
"grad_norm": 0.0063236711582617235,
"learning_rate": 1.5207373271889403e-05,
"loss": 0.0003,
"step": 1560
},
{
"epoch": 1.2050297561912076,
"grad_norm": 0.0026532800936888,
"learning_rate": 1.5176651305683565e-05,
"loss": 0.0003,
"step": 1570
},
{
"epoch": 1.2127087732770205,
"grad_norm": 0.004399678225568667,
"learning_rate": 1.5145929339477728e-05,
"loss": 0.0003,
"step": 1580
},
{
"epoch": 1.2203877903628335,
"grad_norm": 0.003175980432596953,
"learning_rate": 1.511520737327189e-05,
"loss": 0.0003,
"step": 1590
},
{
"epoch": 1.2280668074486465,
"grad_norm": 0.003690809213724381,
"learning_rate": 1.5084485407066054e-05,
"loss": 0.0003,
"step": 1600
},
{
"epoch": 1.2357458245344595,
"grad_norm": 0.002885795037112433,
"learning_rate": 1.5053763440860215e-05,
"loss": 0.0003,
"step": 1610
},
{
"epoch": 1.2434248416202727,
"grad_norm": 0.004662223632430856,
"learning_rate": 1.5023041474654379e-05,
"loss": 0.0003,
"step": 1620
},
{
"epoch": 1.2511038587060856,
"grad_norm": 0.0038277697276091482,
"learning_rate": 1.4992319508448543e-05,
"loss": 0.0003,
"step": 1630
},
{
"epoch": 1.2587828757918986,
"grad_norm": 0.004926059671498492,
"learning_rate": 1.4961597542242704e-05,
"loss": 0.0003,
"step": 1640
},
{
"epoch": 1.2664618928777116,
"grad_norm": 0.00463367088053743,
"learning_rate": 1.4930875576036868e-05,
"loss": 0.0003,
"step": 1650
},
{
"epoch": 1.2741409099635246,
"grad_norm": 0.005726418059037605,
"learning_rate": 1.490015360983103e-05,
"loss": 0.0003,
"step": 1660
},
{
"epoch": 1.2818199270493378,
"grad_norm": 0.005247011723204908,
"learning_rate": 1.4869431643625193e-05,
"loss": 0.0003,
"step": 1670
},
{
"epoch": 1.2894989441351508,
"grad_norm": 0.007164559324630275,
"learning_rate": 1.4838709677419357e-05,
"loss": 0.0003,
"step": 1680
},
{
"epoch": 1.2971779612209637,
"grad_norm": 0.005041996622130852,
"learning_rate": 1.4807987711213519e-05,
"loss": 0.0003,
"step": 1690
},
{
"epoch": 1.3048569783067767,
"grad_norm": 0.005487598186113812,
"learning_rate": 1.477726574500768e-05,
"loss": 0.0003,
"step": 1700
},
{
"epoch": 1.3125359953925897,
"grad_norm": 0.003135256998223166,
"learning_rate": 1.4746543778801846e-05,
"loss": 0.0003,
"step": 1710
},
{
"epoch": 1.320215012478403,
"grad_norm": 0.0069210555218662635,
"learning_rate": 1.4715821812596008e-05,
"loss": 0.0003,
"step": 1720
},
{
"epoch": 1.3278940295642156,
"grad_norm": 0.003417481046551585,
"learning_rate": 1.468509984639017e-05,
"loss": 0.0003,
"step": 1730
},
{
"epoch": 1.3355730466500289,
"grad_norm": 0.0076945646315396184,
"learning_rate": 1.4654377880184335e-05,
"loss": 0.0003,
"step": 1740
},
{
"epoch": 1.3432520637358418,
"grad_norm": 0.004664070136688662,
"learning_rate": 1.4623655913978497e-05,
"loss": 0.0003,
"step": 1750
},
{
"epoch": 1.3509310808216548,
"grad_norm": 0.003957679244338998,
"learning_rate": 1.4592933947772658e-05,
"loss": 0.0003,
"step": 1760
},
{
"epoch": 1.3586100979074678,
"grad_norm": 0.003471942169350295,
"learning_rate": 1.456221198156682e-05,
"loss": 0.0003,
"step": 1770
},
{
"epoch": 1.3662891149932808,
"grad_norm": 0.006157537653481742,
"learning_rate": 1.4531490015360986e-05,
"loss": 0.0003,
"step": 1780
},
{
"epoch": 1.373968132079094,
"grad_norm": 0.008728892287992982,
"learning_rate": 1.4500768049155147e-05,
"loss": 0.0003,
"step": 1790
},
{
"epoch": 1.381647149164907,
"grad_norm": 0.00566547815883834,
"learning_rate": 1.447004608294931e-05,
"loss": 0.0003,
"step": 1800
},
{
"epoch": 1.38932616625072,
"grad_norm": 0.008658651412825806,
"learning_rate": 1.4439324116743471e-05,
"loss": 0.0003,
"step": 1810
},
{
"epoch": 1.397005183336533,
"grad_norm": 0.005166116309061509,
"learning_rate": 1.4408602150537636e-05,
"loss": 0.0003,
"step": 1820
},
{
"epoch": 1.4046842004223459,
"grad_norm": 0.0028075866905398405,
"learning_rate": 1.4377880184331798e-05,
"loss": 0.0003,
"step": 1830
},
{
"epoch": 1.412363217508159,
"grad_norm": 0.005329755957597182,
"learning_rate": 1.434715821812596e-05,
"loss": 0.0003,
"step": 1840
},
{
"epoch": 1.420042234593972,
"grad_norm": 0.0025789510506500093,
"learning_rate": 1.4316436251920125e-05,
"loss": 0.0003,
"step": 1850
},
{
"epoch": 1.427721251679785,
"grad_norm": 0.00305980645847167,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.0003,
"step": 1860
},
{
"epoch": 1.435400268765598,
"grad_norm": 0.0070892296996524756,
"learning_rate": 1.4254992319508449e-05,
"loss": 0.0003,
"step": 1870
},
{
"epoch": 1.443079285851411,
"grad_norm": 0.004636892035159531,
"learning_rate": 1.422427035330261e-05,
"loss": 0.0003,
"step": 1880
},
{
"epoch": 1.450758302937224,
"grad_norm": 0.007196283698693798,
"learning_rate": 1.4193548387096776e-05,
"loss": 0.0003,
"step": 1890
},
{
"epoch": 1.458437320023037,
"grad_norm": 0.002244250906729436,
"learning_rate": 1.4162826420890938e-05,
"loss": 0.0003,
"step": 1900
},
{
"epoch": 1.4661163371088501,
"grad_norm": 0.003662139205215455,
"learning_rate": 1.41321044546851e-05,
"loss": 0.0003,
"step": 1910
},
{
"epoch": 1.4737953541946631,
"grad_norm": 0.00899296235470264,
"learning_rate": 1.4101382488479263e-05,
"loss": 0.0003,
"step": 1920
},
{
"epoch": 1.481474371280476,
"grad_norm": 0.007554011023119674,
"learning_rate": 1.4070660522273427e-05,
"loss": 0.0003,
"step": 1930
},
{
"epoch": 1.489153388366289,
"grad_norm": 0.006664687059239841,
"learning_rate": 1.4039938556067589e-05,
"loss": 0.0003,
"step": 1940
},
{
"epoch": 1.496832405452102,
"grad_norm": 0.003654880039156468,
"learning_rate": 1.4009216589861752e-05,
"loss": 0.0003,
"step": 1950
},
{
"epoch": 1.5045114225379153,
"grad_norm": 0.007520628392798319,
"learning_rate": 1.3978494623655916e-05,
"loss": 0.0003,
"step": 1960
},
{
"epoch": 1.512190439623728,
"grad_norm": 0.0074722854847562,
"learning_rate": 1.3947772657450078e-05,
"loss": 0.0003,
"step": 1970
},
{
"epoch": 1.5198694567095412,
"grad_norm": 0.009940245405058647,
"learning_rate": 1.3917050691244241e-05,
"loss": 0.0003,
"step": 1980
},
{
"epoch": 1.5275484737953542,
"grad_norm": 0.006304573730328701,
"learning_rate": 1.3886328725038403e-05,
"loss": 0.0003,
"step": 1990
},
{
"epoch": 1.5352274908811672,
"grad_norm": 0.0055550452573098165,
"learning_rate": 1.3855606758832567e-05,
"loss": 0.0003,
"step": 2000
},
{
"epoch": 1.5429065079669804,
"grad_norm": 0.008909260445155288,
"learning_rate": 1.382488479262673e-05,
"loss": 0.0003,
"step": 2010
},
{
"epoch": 1.5505855250527931,
"grad_norm": 0.002599064360242108,
"learning_rate": 1.3794162826420892e-05,
"loss": 0.0003,
"step": 2020
},
{
"epoch": 1.5582645421386063,
"grad_norm": 0.0034038539972756267,
"learning_rate": 1.3763440860215056e-05,
"loss": 0.0003,
"step": 2030
},
{
"epoch": 1.5659435592244193,
"grad_norm": 0.013808170488498894,
"learning_rate": 1.3732718894009217e-05,
"loss": 0.0003,
"step": 2040
},
{
"epoch": 1.5736225763102323,
"grad_norm": 0.0017545504012708383,
"learning_rate": 1.3701996927803381e-05,
"loss": 0.0003,
"step": 2050
},
{
"epoch": 1.5813015933960453,
"grad_norm": 0.003842048923808368,
"learning_rate": 1.3671274961597543e-05,
"loss": 0.0003,
"step": 2060
},
{
"epoch": 1.5889806104818582,
"grad_norm": 0.004326236565390302,
"learning_rate": 1.3640552995391706e-05,
"loss": 0.0003,
"step": 2070
},
{
"epoch": 1.5966596275676714,
"grad_norm": 0.0035407621847916644,
"learning_rate": 1.360983102918587e-05,
"loss": 0.0003,
"step": 2080
},
{
"epoch": 1.6043386446534842,
"grad_norm": 0.004974256724902474,
"learning_rate": 1.3579109062980032e-05,
"loss": 0.0003,
"step": 2090
},
{
"epoch": 1.6120176617392974,
"grad_norm": 0.002810217043936606,
"learning_rate": 1.3548387096774194e-05,
"loss": 0.0003,
"step": 2100
},
{
"epoch": 1.6196966788251104,
"grad_norm": 0.002954993487071089,
"learning_rate": 1.3517665130568359e-05,
"loss": 0.0003,
"step": 2110
},
{
"epoch": 1.6273756959109233,
"grad_norm": 0.003028755651427543,
"learning_rate": 1.348694316436252e-05,
"loss": 0.0003,
"step": 2120
},
{
"epoch": 1.6350547129967365,
"grad_norm": 0.0026674905313092702,
"learning_rate": 1.3456221198156683e-05,
"loss": 0.0003,
"step": 2130
},
{
"epoch": 1.6427337300825493,
"grad_norm": 0.006409597579717579,
"learning_rate": 1.3425499231950848e-05,
"loss": 0.0003,
"step": 2140
},
{
"epoch": 1.6504127471683625,
"grad_norm": 0.0036828809413129507,
"learning_rate": 1.339477726574501e-05,
"loss": 0.0003,
"step": 2150
},
{
"epoch": 1.6580917642541755,
"grad_norm": 0.007891670676920814,
"learning_rate": 1.3364055299539171e-05,
"loss": 0.0003,
"step": 2160
},
{
"epoch": 1.6657707813399885,
"grad_norm": 0.006626613367159732,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0003,
"step": 2170
},
{
"epoch": 1.6734497984258017,
"grad_norm": 0.003534695341590609,
"learning_rate": 1.3302611367127499e-05,
"loss": 0.0003,
"step": 2180
},
{
"epoch": 1.6811288155116144,
"grad_norm": 0.002799573693646372,
"learning_rate": 1.327188940092166e-05,
"loss": 0.0003,
"step": 2190
},
{
"epoch": 1.6888078325974276,
"grad_norm": 0.007920181925607207,
"learning_rate": 1.3241167434715822e-05,
"loss": 0.0003,
"step": 2200
},
{
"epoch": 1.6964868496832406,
"grad_norm": 0.003197909917687604,
"learning_rate": 1.3210445468509984e-05,
"loss": 0.0003,
"step": 2210
},
{
"epoch": 1.7041658667690536,
"grad_norm": 0.0019020952054658064,
"learning_rate": 1.317972350230415e-05,
"loss": 0.0003,
"step": 2220
},
{
"epoch": 1.7118448838548666,
"grad_norm": 0.003430345573430971,
"learning_rate": 1.3149001536098311e-05,
"loss": 0.0003,
"step": 2230
},
{
"epoch": 1.7195239009406795,
"grad_norm": 0.005966901533330741,
"learning_rate": 1.3118279569892473e-05,
"loss": 0.0003,
"step": 2240
},
{
"epoch": 1.7272029180264927,
"grad_norm": 0.003453325797399688,
"learning_rate": 1.3087557603686638e-05,
"loss": 0.0003,
"step": 2250
},
{
"epoch": 1.7348819351123055,
"grad_norm": 0.004117138090969933,
"learning_rate": 1.30568356374808e-05,
"loss": 0.0003,
"step": 2260
},
{
"epoch": 1.7425609521981187,
"grad_norm": 0.015933078305414367,
"learning_rate": 1.3026113671274962e-05,
"loss": 0.0003,
"step": 2270
},
{
"epoch": 1.7502399692839317,
"grad_norm": 0.00695674170034877,
"learning_rate": 1.2995391705069126e-05,
"loss": 0.0003,
"step": 2280
},
{
"epoch": 1.7579189863697446,
"grad_norm": 0.0030599729808705334,
"learning_rate": 1.2964669738863289e-05,
"loss": 0.0003,
"step": 2290
},
{
"epoch": 1.7655980034555578,
"grad_norm": 0.005764241880693109,
"learning_rate": 1.2933947772657451e-05,
"loss": 0.0003,
"step": 2300
},
{
"epoch": 1.7732770205413706,
"grad_norm": 0.0024572213319480774,
"learning_rate": 1.2903225806451613e-05,
"loss": 0.0003,
"step": 2310
},
{
"epoch": 1.7809560376271838,
"grad_norm": 0.005442108190635756,
"learning_rate": 1.2872503840245776e-05,
"loss": 0.0003,
"step": 2320
},
{
"epoch": 1.7886350547129968,
"grad_norm": 0.005249849493945225,
"learning_rate": 1.284178187403994e-05,
"loss": 0.0003,
"step": 2330
},
{
"epoch": 1.7963140717988098,
"grad_norm": 0.003760686952515486,
"learning_rate": 1.2811059907834102e-05,
"loss": 0.0003,
"step": 2340
},
{
"epoch": 1.8039930888846227,
"grad_norm": 0.003326565830912095,
"learning_rate": 1.2780337941628265e-05,
"loss": 0.0003,
"step": 2350
},
{
"epoch": 1.8116721059704357,
"grad_norm": 0.00687928832968373,
"learning_rate": 1.2749615975422429e-05,
"loss": 0.0003,
"step": 2360
},
{
"epoch": 1.819351123056249,
"grad_norm": 0.002523920816120679,
"learning_rate": 1.271889400921659e-05,
"loss": 0.0003,
"step": 2370
},
{
"epoch": 1.8270301401420617,
"grad_norm": 0.0035826335995729104,
"learning_rate": 1.2688172043010754e-05,
"loss": 0.0003,
"step": 2380
},
{
"epoch": 1.8347091572278749,
"grad_norm": 0.0042202748472642045,
"learning_rate": 1.2657450076804916e-05,
"loss": 0.0003,
"step": 2390
},
{
"epoch": 1.8423881743136878,
"grad_norm": 0.00433061878504225,
"learning_rate": 1.262672811059908e-05,
"loss": 0.0003,
"step": 2400
},
{
"epoch": 1.8500671913995008,
"grad_norm": 0.006032498913999335,
"learning_rate": 1.2596006144393243e-05,
"loss": 0.0003,
"step": 2410
},
{
"epoch": 1.857746208485314,
"grad_norm": 0.002731884836982076,
"learning_rate": 1.2565284178187405e-05,
"loss": 0.0003,
"step": 2420
},
{
"epoch": 1.8654252255711268,
"grad_norm": 0.003451471394001781,
"learning_rate": 1.2534562211981567e-05,
"loss": 0.0003,
"step": 2430
},
{
"epoch": 1.87310424265694,
"grad_norm": 0.002472343027285463,
"learning_rate": 1.2503840245775732e-05,
"loss": 0.0003,
"step": 2440
},
{
"epoch": 1.880783259742753,
"grad_norm": 0.003320944102999081,
"learning_rate": 1.2473118279569894e-05,
"loss": 0.0003,
"step": 2450
},
{
"epoch": 1.888462276828566,
"grad_norm": 0.0018486199226611809,
"learning_rate": 1.2442396313364056e-05,
"loss": 0.0003,
"step": 2460
},
{
"epoch": 1.896141293914379,
"grad_norm": 0.0024515391018999654,
"learning_rate": 1.2411674347158221e-05,
"loss": 0.0003,
"step": 2470
},
{
"epoch": 1.903820311000192,
"grad_norm": 0.0039409109050494015,
"learning_rate": 1.2380952380952383e-05,
"loss": 0.0003,
"step": 2480
},
{
"epoch": 1.911499328086005,
"grad_norm": 0.0020024603662128597,
"learning_rate": 1.2350230414746545e-05,
"loss": 0.0003,
"step": 2490
},
{
"epoch": 1.9191783451718178,
"grad_norm": 0.004837460528352513,
"learning_rate": 1.2319508448540707e-05,
"loss": 0.0003,
"step": 2500
},
{
"epoch": 1.926857362257631,
"grad_norm": 0.001684979658088187,
"learning_rate": 1.2288786482334872e-05,
"loss": 0.0003,
"step": 2510
},
{
"epoch": 1.934536379343444,
"grad_norm": 0.0036424135207038444,
"learning_rate": 1.2258064516129034e-05,
"loss": 0.0003,
"step": 2520
},
{
"epoch": 1.942215396429257,
"grad_norm": 0.003460021636897484,
"learning_rate": 1.2227342549923195e-05,
"loss": 0.0003,
"step": 2530
},
{
"epoch": 1.9498944135150702,
"grad_norm": 0.0012857496658247277,
"learning_rate": 1.2196620583717357e-05,
"loss": 0.0003,
"step": 2540
},
{
"epoch": 1.957573430600883,
"grad_norm": 0.006405995384537319,
"learning_rate": 1.2165898617511523e-05,
"loss": 0.0003,
"step": 2550
},
{
"epoch": 1.9652524476866962,
"grad_norm": 0.0027885557407680427,
"learning_rate": 1.2135176651305684e-05,
"loss": 0.0003,
"step": 2560
},
{
"epoch": 1.9729314647725091,
"grad_norm": 0.0046197235806071674,
"learning_rate": 1.2104454685099846e-05,
"loss": 0.0003,
"step": 2570
},
{
"epoch": 1.9806104818583221,
"grad_norm": 0.002923714387690518,
"learning_rate": 1.2073732718894012e-05,
"loss": 0.0003,
"step": 2580
},
{
"epoch": 1.9882894989441353,
"grad_norm": 0.004546312959559587,
"learning_rate": 1.2043010752688173e-05,
"loss": 0.0003,
"step": 2590
},
{
"epoch": 1.995968516029948,
"grad_norm": 0.0027543773295028978,
"learning_rate": 1.2012288786482335e-05,
"loss": 0.0003,
"step": 2600
},
{
"epoch": 2.0030716068343253,
"grad_norm": 0.00445399533446676,
"learning_rate": 1.1981566820276497e-05,
"loss": 0.0002,
"step": 2610
},
{
"epoch": 2.010750623920138,
"grad_norm": 0.009066513947313037,
"learning_rate": 1.1950844854070662e-05,
"loss": 0.0003,
"step": 2620
},
{
"epoch": 2.0184296410059512,
"grad_norm": 0.0022089803112435333,
"learning_rate": 1.1920122887864824e-05,
"loss": 0.0003,
"step": 2630
},
{
"epoch": 2.0261086580917644,
"grad_norm": 0.007859382516213974,
"learning_rate": 1.1889400921658986e-05,
"loss": 0.0003,
"step": 2640
},
{
"epoch": 2.033787675177577,
"grad_norm": 0.002280973635314502,
"learning_rate": 1.185867895545315e-05,
"loss": 0.0003,
"step": 2650
},
{
"epoch": 2.0414666922633904,
"grad_norm": 0.003812714680306737,
"learning_rate": 1.1827956989247313e-05,
"loss": 0.0003,
"step": 2660
},
{
"epoch": 2.049145709349203,
"grad_norm": 0.003865003763224919,
"learning_rate": 1.1797235023041475e-05,
"loss": 0.0003,
"step": 2670
},
{
"epoch": 2.0568247264350163,
"grad_norm": 0.0056210714836978015,
"learning_rate": 1.1766513056835639e-05,
"loss": 0.0003,
"step": 2680
},
{
"epoch": 2.0645037435208295,
"grad_norm": 0.0029786676822015994,
"learning_rate": 1.1735791090629802e-05,
"loss": 0.0003,
"step": 2690
},
{
"epoch": 2.0721827606066423,
"grad_norm": 0.007577207373633205,
"learning_rate": 1.1705069124423964e-05,
"loss": 0.0003,
"step": 2700
},
{
"epoch": 2.0798617776924555,
"grad_norm": 0.004911935203582005,
"learning_rate": 1.1674347158218127e-05,
"loss": 0.0003,
"step": 2710
},
{
"epoch": 2.0875407947782683,
"grad_norm": 0.002348567372539777,
"learning_rate": 1.164362519201229e-05,
"loss": 0.0003,
"step": 2720
},
{
"epoch": 2.0952198118640815,
"grad_norm": 0.0021975557273255456,
"learning_rate": 1.1612903225806453e-05,
"loss": 0.0003,
"step": 2730
},
{
"epoch": 2.102898828949894,
"grad_norm": 0.004404237994021701,
"learning_rate": 1.1582181259600616e-05,
"loss": 0.0003,
"step": 2740
},
{
"epoch": 2.1105778460357074,
"grad_norm": 0.0018039936664214415,
"learning_rate": 1.1551459293394778e-05,
"loss": 0.0003,
"step": 2750
},
{
"epoch": 2.1182568631215206,
"grad_norm": 0.010542570288986714,
"learning_rate": 1.152073732718894e-05,
"loss": 0.0003,
"step": 2760
},
{
"epoch": 2.1259358802073334,
"grad_norm": 0.007212811136834576,
"learning_rate": 1.1490015360983104e-05,
"loss": 0.0003,
"step": 2770
},
{
"epoch": 2.1336148972931466,
"grad_norm": 0.002463133011224361,
"learning_rate": 1.1459293394777267e-05,
"loss": 0.0003,
"step": 2780
},
{
"epoch": 2.1412939143789593,
"grad_norm": 0.0030969432185738934,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.0003,
"step": 2790
},
{
"epoch": 2.1489729314647725,
"grad_norm": 0.0009640256914357676,
"learning_rate": 1.1397849462365593e-05,
"loss": 0.0003,
"step": 2800
},
{
"epoch": 2.1566519485505857,
"grad_norm": 0.0062156621929774095,
"learning_rate": 1.1367127496159756e-05,
"loss": 0.0003,
"step": 2810
},
{
"epoch": 2.1643309656363985,
"grad_norm": 0.006538407111075363,
"learning_rate": 1.1336405529953918e-05,
"loss": 0.0003,
"step": 2820
},
{
"epoch": 2.1720099827222117,
"grad_norm": 0.0028212937587065077,
"learning_rate": 1.130568356374808e-05,
"loss": 0.0003,
"step": 2830
},
{
"epoch": 2.1796889998080244,
"grad_norm": 0.005023107122791538,
"learning_rate": 1.1274961597542245e-05,
"loss": 0.0003,
"step": 2840
},
{
"epoch": 2.1873680168938376,
"grad_norm": 0.0011156564572317208,
"learning_rate": 1.1244239631336407e-05,
"loss": 0.0003,
"step": 2850
},
{
"epoch": 2.1950470339796504,
"grad_norm": 0.001942852230799726,
"learning_rate": 1.1213517665130569e-05,
"loss": 0.0003,
"step": 2860
},
{
"epoch": 2.2027260510654636,
"grad_norm": 0.008378616547487394,
"learning_rate": 1.118279569892473e-05,
"loss": 0.0003,
"step": 2870
},
{
"epoch": 2.210405068151277,
"grad_norm": 0.002600735996609255,
"learning_rate": 1.1152073732718896e-05,
"loss": 0.0003,
"step": 2880
},
{
"epoch": 2.2180840852370896,
"grad_norm": 0.0015136314382080984,
"learning_rate": 1.1121351766513058e-05,
"loss": 0.0003,
"step": 2890
},
{
"epoch": 2.2257631023229028,
"grad_norm": 0.007485965818712587,
"learning_rate": 1.109062980030722e-05,
"loss": 0.0003,
"step": 2900
},
{
"epoch": 2.2334421194087155,
"grad_norm": 0.006013194482848518,
"learning_rate": 1.1059907834101385e-05,
"loss": 0.0003,
"step": 2910
},
{
"epoch": 2.2411211364945287,
"grad_norm": 0.005253526138111572,
"learning_rate": 1.1029185867895547e-05,
"loss": 0.0003,
"step": 2920
},
{
"epoch": 2.248800153580342,
"grad_norm": 0.0030844920407275436,
"learning_rate": 1.0998463901689708e-05,
"loss": 0.0003,
"step": 2930
},
{
"epoch": 2.2564791706661547,
"grad_norm": 0.00448225831555134,
"learning_rate": 1.096774193548387e-05,
"loss": 0.0003,
"step": 2940
},
{
"epoch": 2.264158187751968,
"grad_norm": 0.002766089015696827,
"learning_rate": 1.0937019969278036e-05,
"loss": 0.0003,
"step": 2950
},
{
"epoch": 2.2718372048377806,
"grad_norm": 0.00432406954984362,
"learning_rate": 1.0906298003072197e-05,
"loss": 0.0003,
"step": 2960
},
{
"epoch": 2.279516221923594,
"grad_norm": 0.005297571043727681,
"learning_rate": 1.087557603686636e-05,
"loss": 0.0003,
"step": 2970
},
{
"epoch": 2.2871952390094066,
"grad_norm": 0.005051571714735924,
"learning_rate": 1.0844854070660523e-05,
"loss": 0.0003,
"step": 2980
},
{
"epoch": 2.2948742560952198,
"grad_norm": 0.0036863856281938016,
"learning_rate": 1.0814132104454686e-05,
"loss": 0.0003,
"step": 2990
},
{
"epoch": 2.302553273181033,
"grad_norm": 0.0024327974031678975,
"learning_rate": 1.0783410138248848e-05,
"loss": 0.0003,
"step": 3000
},
{
"epoch": 2.3102322902668457,
"grad_norm": 0.0032986912710764884,
"learning_rate": 1.0752688172043012e-05,
"loss": 0.0003,
"step": 3010
},
{
"epoch": 2.317911307352659,
"grad_norm": 0.002895373170239971,
"learning_rate": 1.0721966205837175e-05,
"loss": 0.0003,
"step": 3020
},
{
"epoch": 2.3255903244384717,
"grad_norm": 0.0037467096830764678,
"learning_rate": 1.0691244239631337e-05,
"loss": 0.0003,
"step": 3030
},
{
"epoch": 2.333269341524285,
"grad_norm": 0.0041338587084730925,
"learning_rate": 1.0660522273425499e-05,
"loss": 0.0003,
"step": 3040
},
{
"epoch": 2.340948358610098,
"grad_norm": 0.004584463378907932,
"learning_rate": 1.0629800307219663e-05,
"loss": 0.0003,
"step": 3050
},
{
"epoch": 2.348627375695911,
"grad_norm": 0.005390217173101364,
"learning_rate": 1.0599078341013826e-05,
"loss": 0.0003,
"step": 3060
},
{
"epoch": 2.356306392781724,
"grad_norm": 0.006574519516791052,
"learning_rate": 1.0568356374807988e-05,
"loss": 0.0003,
"step": 3070
},
{
"epoch": 2.363985409867537,
"grad_norm": 0.0031029456990706457,
"learning_rate": 1.0537634408602151e-05,
"loss": 0.0003,
"step": 3080
},
{
"epoch": 2.37166442695335,
"grad_norm": 0.003485382502658449,
"learning_rate": 1.0506912442396313e-05,
"loss": 0.0003,
"step": 3090
},
{
"epoch": 2.379343444039163,
"grad_norm": 0.0033652977877385503,
"learning_rate": 1.0476190476190477e-05,
"loss": 0.0003,
"step": 3100
},
{
"epoch": 2.387022461124976,
"grad_norm": 0.001994262459548078,
"learning_rate": 1.044546850998464e-05,
"loss": 0.0003,
"step": 3110
},
{
"epoch": 2.394701478210789,
"grad_norm": 0.0027030822644481534,
"learning_rate": 1.0414746543778802e-05,
"loss": 0.0003,
"step": 3120
},
{
"epoch": 2.402380495296602,
"grad_norm": 0.0075004858676295996,
"learning_rate": 1.0384024577572966e-05,
"loss": 0.0003,
"step": 3130
},
{
"epoch": 2.410059512382415,
"grad_norm": 0.005277345871616036,
"learning_rate": 1.035330261136713e-05,
"loss": 0.0003,
"step": 3140
},
{
"epoch": 2.4177385294682283,
"grad_norm": 0.0034500505732812984,
"learning_rate": 1.0322580645161291e-05,
"loss": 0.0003,
"step": 3150
},
{
"epoch": 2.425417546554041,
"grad_norm": 0.004072496071322172,
"learning_rate": 1.0291858678955453e-05,
"loss": 0.0003,
"step": 3160
},
{
"epoch": 2.4330965636398543,
"grad_norm": 0.0037519391939282247,
"learning_rate": 1.0261136712749618e-05,
"loss": 0.0003,
"step": 3170
},
{
"epoch": 2.440775580725667,
"grad_norm": 0.0029126430566717857,
"learning_rate": 1.023041474654378e-05,
"loss": 0.0003,
"step": 3180
},
{
"epoch": 2.4484545978114802,
"grad_norm": 0.006804725581164672,
"learning_rate": 1.0199692780337942e-05,
"loss": 0.0003,
"step": 3190
},
{
"epoch": 2.456133614897293,
"grad_norm": 0.003394434324862927,
"learning_rate": 1.0168970814132104e-05,
"loss": 0.0003,
"step": 3200
},
{
"epoch": 2.463812631983106,
"grad_norm": 0.0047839322011928136,
"learning_rate": 1.0138248847926269e-05,
"loss": 0.0003,
"step": 3210
},
{
"epoch": 2.471491649068919,
"grad_norm": 0.0021047452286355496,
"learning_rate": 1.0107526881720431e-05,
"loss": 0.0003,
"step": 3220
},
{
"epoch": 2.479170666154732,
"grad_norm": 0.002910893106529187,
"learning_rate": 1.0076804915514593e-05,
"loss": 0.0003,
"step": 3230
},
{
"epoch": 2.4868496832405453,
"grad_norm": 0.0025188863038248495,
"learning_rate": 1.0046082949308758e-05,
"loss": 0.0003,
"step": 3240
},
{
"epoch": 2.494528700326358,
"grad_norm": 0.005110455562647162,
"learning_rate": 1.001536098310292e-05,
"loss": 0.0003,
"step": 3250
},
{
"epoch": 2.5022077174121713,
"grad_norm": 0.0035194967193856925,
"learning_rate": 9.984639016897082e-06,
"loss": 0.0003,
"step": 3260
},
{
"epoch": 2.509886734497984,
"grad_norm": 0.004112839294946175,
"learning_rate": 9.953917050691245e-06,
"loss": 0.0003,
"step": 3270
},
{
"epoch": 2.5175657515837973,
"grad_norm": 0.003219601043538841,
"learning_rate": 9.923195084485407e-06,
"loss": 0.0003,
"step": 3280
},
{
"epoch": 2.5252447686696105,
"grad_norm": 0.0021615101929151476,
"learning_rate": 9.89247311827957e-06,
"loss": 0.0003,
"step": 3290
},
{
"epoch": 2.532923785755423,
"grad_norm": 0.0026956859004236954,
"learning_rate": 9.861751152073733e-06,
"loss": 0.0003,
"step": 3300
},
{
"epoch": 2.5406028028412364,
"grad_norm": 0.0012769547330191422,
"learning_rate": 9.831029185867896e-06,
"loss": 0.0003,
"step": 3310
},
{
"epoch": 2.548281819927049,
"grad_norm": 0.004837667895037462,
"learning_rate": 9.80030721966206e-06,
"loss": 0.0003,
"step": 3320
},
{
"epoch": 2.5559608370128624,
"grad_norm": 0.004331759593272771,
"learning_rate": 9.769585253456221e-06,
"loss": 0.0003,
"step": 3330
},
{
"epoch": 2.5636398540986756,
"grad_norm": 0.003806749090919161,
"learning_rate": 9.738863287250385e-06,
"loss": 0.0003,
"step": 3340
},
{
"epoch": 2.5713188711844883,
"grad_norm": 0.003405808798578046,
"learning_rate": 9.708141321044547e-06,
"loss": 0.0003,
"step": 3350
},
{
"epoch": 2.5789978882703015,
"grad_norm": 0.0018090209587433655,
"learning_rate": 9.67741935483871e-06,
"loss": 0.0003,
"step": 3360
},
{
"epoch": 2.5866769053561143,
"grad_norm": 0.0024778977279488216,
"learning_rate": 9.646697388632872e-06,
"loss": 0.0003,
"step": 3370
},
{
"epoch": 2.5943559224419275,
"grad_norm": 0.005359718689721543,
"learning_rate": 9.615975422427036e-06,
"loss": 0.0003,
"step": 3380
},
{
"epoch": 2.6020349395277407,
"grad_norm": 0.0012868512440999584,
"learning_rate": 9.5852534562212e-06,
"loss": 0.0003,
"step": 3390
},
{
"epoch": 2.6097139566135534,
"grad_norm": 0.005696079089497556,
"learning_rate": 9.554531490015361e-06,
"loss": 0.0003,
"step": 3400
},
{
"epoch": 2.6173929736993666,
"grad_norm": 0.0032109625254054023,
"learning_rate": 9.523809523809525e-06,
"loss": 0.0003,
"step": 3410
},
{
"epoch": 2.6250719907851794,
"grad_norm": 0.002903890729728573,
"learning_rate": 9.493087557603687e-06,
"loss": 0.0003,
"step": 3420
},
{
"epoch": 2.6327510078709926,
"grad_norm": 0.003523170524614984,
"learning_rate": 9.46236559139785e-06,
"loss": 0.0003,
"step": 3430
},
{
"epoch": 2.640430024956806,
"grad_norm": 0.0036252760580902602,
"learning_rate": 9.431643625192014e-06,
"loss": 0.0003,
"step": 3440
},
{
"epoch": 2.6481090420426185,
"grad_norm": 0.002714708446513513,
"learning_rate": 9.400921658986176e-06,
"loss": 0.0003,
"step": 3450
},
{
"epoch": 2.6557880591284313,
"grad_norm": 0.0037700018885341927,
"learning_rate": 9.370199692780339e-06,
"loss": 0.0003,
"step": 3460
},
{
"epoch": 2.6634670762142445,
"grad_norm": 0.005122776482125785,
"learning_rate": 9.339477726574503e-06,
"loss": 0.0003,
"step": 3470
},
{
"epoch": 2.6711460933000577,
"grad_norm": 0.0013201671036341795,
"learning_rate": 9.308755760368664e-06,
"loss": 0.0003,
"step": 3480
},
{
"epoch": 2.6788251103858705,
"grad_norm": 0.003020186145598462,
"learning_rate": 9.278033794162828e-06,
"loss": 0.0003,
"step": 3490
},
{
"epoch": 2.6865041274716837,
"grad_norm": 0.011069671357401941,
"learning_rate": 9.24731182795699e-06,
"loss": 0.0003,
"step": 3500
},
{
"epoch": 2.6941831445574964,
"grad_norm": 0.007485690656388163,
"learning_rate": 9.216589861751153e-06,
"loss": 0.0003,
"step": 3510
},
{
"epoch": 2.7018621616433096,
"grad_norm": 0.00636654094660092,
"learning_rate": 9.185867895545315e-06,
"loss": 0.0003,
"step": 3520
},
{
"epoch": 2.709541178729123,
"grad_norm": 0.0039619478349876185,
"learning_rate": 9.155145929339479e-06,
"loss": 0.0003,
"step": 3530
},
{
"epoch": 2.7172201958149356,
"grad_norm": 0.0054654628329376094,
"learning_rate": 9.124423963133642e-06,
"loss": 0.0003,
"step": 3540
},
{
"epoch": 2.7248992129007488,
"grad_norm": 0.0023486404063996135,
"learning_rate": 9.093701996927804e-06,
"loss": 0.0003,
"step": 3550
},
{
"epoch": 2.7325782299865615,
"grad_norm": 0.002849175524143881,
"learning_rate": 9.062980030721968e-06,
"loss": 0.0003,
"step": 3560
},
{
"epoch": 2.7402572470723747,
"grad_norm": 0.0040654911920692495,
"learning_rate": 9.03225806451613e-06,
"loss": 0.0003,
"step": 3570
},
{
"epoch": 2.747936264158188,
"grad_norm": 0.003611352214922628,
"learning_rate": 9.001536098310293e-06,
"loss": 0.0003,
"step": 3580
},
{
"epoch": 2.7556152812440007,
"grad_norm": 0.002668802943203528,
"learning_rate": 8.970814132104455e-06,
"loss": 0.0003,
"step": 3590
},
{
"epoch": 2.763294298329814,
"grad_norm": 0.0033794836750422907,
"learning_rate": 8.940092165898619e-06,
"loss": 0.0003,
"step": 3600
},
{
"epoch": 2.7709733154156266,
"grad_norm": 0.0019714078072412106,
"learning_rate": 8.90937019969278e-06,
"loss": 0.0003,
"step": 3610
},
{
"epoch": 2.77865233250144,
"grad_norm": 0.0036424341616288908,
"learning_rate": 8.878648233486944e-06,
"loss": 0.0003,
"step": 3620
},
{
"epoch": 2.786331349587253,
"grad_norm": 0.0025731242429069758,
"learning_rate": 8.847926267281107e-06,
"loss": 0.0003,
"step": 3630
},
{
"epoch": 2.794010366673066,
"grad_norm": 0.003112848869608825,
"learning_rate": 8.81720430107527e-06,
"loss": 0.0003,
"step": 3640
},
{
"epoch": 2.801689383758879,
"grad_norm": 0.0024507056173105482,
"learning_rate": 8.786482334869433e-06,
"loss": 0.0003,
"step": 3650
},
{
"epoch": 2.8093684008446917,
"grad_norm": 0.003942355547919082,
"learning_rate": 8.755760368663595e-06,
"loss": 0.0003,
"step": 3660
},
{
"epoch": 2.817047417930505,
"grad_norm": 0.007985074482092179,
"learning_rate": 8.725038402457758e-06,
"loss": 0.0003,
"step": 3670
},
{
"epoch": 2.824726435016318,
"grad_norm": 0.006570601645436398,
"learning_rate": 8.69431643625192e-06,
"loss": 0.0003,
"step": 3680
},
{
"epoch": 2.832405452102131,
"grad_norm": 0.002711244143718766,
"learning_rate": 8.663594470046084e-06,
"loss": 0.0003,
"step": 3690
},
{
"epoch": 2.840084469187944,
"grad_norm": 0.0021197096154111942,
"learning_rate": 8.632872503840246e-06,
"loss": 0.0003,
"step": 3700
},
{
"epoch": 2.847763486273757,
"grad_norm": 0.0030651493700618623,
"learning_rate": 8.602150537634409e-06,
"loss": 0.0003,
"step": 3710
},
{
"epoch": 2.85544250335957,
"grad_norm": 0.0038564973802781943,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0003,
"step": 3720
},
{
"epoch": 2.8631215204453833,
"grad_norm": 0.0019345140882013855,
"learning_rate": 8.540706605222734e-06,
"loss": 0.0003,
"step": 3730
},
{
"epoch": 2.870800537531196,
"grad_norm": 0.003530046987716614,
"learning_rate": 8.509984639016898e-06,
"loss": 0.0003,
"step": 3740
},
{
"epoch": 2.8784795546170088,
"grad_norm": 0.0045638471491203396,
"learning_rate": 8.47926267281106e-06,
"loss": 0.0003,
"step": 3750
},
{
"epoch": 2.886158571702822,
"grad_norm": 0.002548977990330342,
"learning_rate": 8.448540706605223e-06,
"loss": 0.0003,
"step": 3760
},
{
"epoch": 2.893837588788635,
"grad_norm": 0.004637676739276992,
"learning_rate": 8.417818740399385e-06,
"loss": 0.0003,
"step": 3770
},
{
"epoch": 2.901516605874448,
"grad_norm": 0.004249182216049807,
"learning_rate": 8.387096774193549e-06,
"loss": 0.0003,
"step": 3780
},
{
"epoch": 2.909195622960261,
"grad_norm": 0.0020834658623780523,
"learning_rate": 8.356374807987712e-06,
"loss": 0.0003,
"step": 3790
},
{
"epoch": 2.916874640046074,
"grad_norm": 0.005570319306302508,
"learning_rate": 8.325652841781874e-06,
"loss": 0.0003,
"step": 3800
},
{
"epoch": 2.924553657131887,
"grad_norm": 0.004441691840249149,
"learning_rate": 8.294930875576038e-06,
"loss": 0.0003,
"step": 3810
},
{
"epoch": 2.9322326742177003,
"grad_norm": 0.004852997098567165,
"learning_rate": 8.264208909370201e-06,
"loss": 0.0003,
"step": 3820
},
{
"epoch": 2.939911691303513,
"grad_norm": 0.004796673169729304,
"learning_rate": 8.233486943164363e-06,
"loss": 0.0003,
"step": 3830
},
{
"epoch": 2.9475907083893262,
"grad_norm": 0.00303272221365764,
"learning_rate": 8.202764976958527e-06,
"loss": 0.0003,
"step": 3840
},
{
"epoch": 2.955269725475139,
"grad_norm": 0.009776414261676187,
"learning_rate": 8.172043010752689e-06,
"loss": 0.0003,
"step": 3850
},
{
"epoch": 2.962948742560952,
"grad_norm": 0.0019734177655631514,
"learning_rate": 8.141321044546852e-06,
"loss": 0.0003,
"step": 3860
},
{
"epoch": 2.9706277596467654,
"grad_norm": 0.005414209178109374,
"learning_rate": 8.110599078341016e-06,
"loss": 0.0003,
"step": 3870
},
{
"epoch": 2.978306776732578,
"grad_norm": 0.0020584308196663873,
"learning_rate": 8.079877112135177e-06,
"loss": 0.0003,
"step": 3880
},
{
"epoch": 2.9859857938183914,
"grad_norm": 0.002197200567243655,
"learning_rate": 8.049155145929341e-06,
"loss": 0.0003,
"step": 3890
},
{
"epoch": 2.993664810904204,
"grad_norm": 0.0012909809217368014,
"learning_rate": 8.018433179723503e-06,
"loss": 0.0003,
"step": 3900
},
{
"epoch": 3.0007679017085813,
"grad_norm": 0.005912347993649129,
"learning_rate": 7.987711213517666e-06,
"loss": 0.0003,
"step": 3910
},
{
"epoch": 3.0084469187943945,
"grad_norm": 0.0024746506269591262,
"learning_rate": 7.956989247311828e-06,
"loss": 0.0003,
"step": 3920
},
{
"epoch": 3.0161259358802073,
"grad_norm": 0.003743031987316316,
"learning_rate": 7.926267281105992e-06,
"loss": 0.0003,
"step": 3930
},
{
"epoch": 3.0238049529660205,
"grad_norm": 0.0020760288460637535,
"learning_rate": 7.895545314900154e-06,
"loss": 0.0003,
"step": 3940
},
{
"epoch": 3.0314839700518332,
"grad_norm": 0.010111413246967849,
"learning_rate": 7.864823348694317e-06,
"loss": 0.0003,
"step": 3950
},
{
"epoch": 3.0391629871376464,
"grad_norm": 0.00321449927030987,
"learning_rate": 7.83410138248848e-06,
"loss": 0.0003,
"step": 3960
},
{
"epoch": 3.046842004223459,
"grad_norm": 0.0022266492617884176,
"learning_rate": 7.803379416282643e-06,
"loss": 0.0003,
"step": 3970
},
{
"epoch": 3.0545210213092724,
"grad_norm": 0.3330681312874651,
"learning_rate": 7.772657450076806e-06,
"loss": 0.0006,
"step": 3980
},
{
"epoch": 3.0622000383950856,
"grad_norm": 0.0929644329244732,
"learning_rate": 7.741935483870968e-06,
"loss": 0.0031,
"step": 3990
},
{
"epoch": 3.0698790554808983,
"grad_norm": 2.5444639341537942,
"learning_rate": 7.711213517665132e-06,
"loss": 0.0499,
"step": 4000
},
{
"epoch": 3.0775580725667115,
"grad_norm": 2.706681402967684,
"learning_rate": 7.680491551459293e-06,
"loss": 0.0123,
"step": 4010
},
{
"epoch": 3.0852370896525243,
"grad_norm": 0.3106533133384103,
"learning_rate": 7.649769585253457e-06,
"loss": 0.012,
"step": 4020
},
{
"epoch": 3.0929161067383375,
"grad_norm": 0.011659651180711347,
"learning_rate": 7.61904761904762e-06,
"loss": 0.0007,
"step": 4030
},
{
"epoch": 3.1005951238241507,
"grad_norm": 0.01130908041600874,
"learning_rate": 7.588325652841782e-06,
"loss": 0.0004,
"step": 4040
},
{
"epoch": 3.1082741409099635,
"grad_norm": 0.008227145166831094,
"learning_rate": 7.557603686635945e-06,
"loss": 0.0003,
"step": 4050
},
{
"epoch": 3.1159531579957767,
"grad_norm": 0.0055737792826447054,
"learning_rate": 7.526881720430108e-06,
"loss": 0.0003,
"step": 4060
},
{
"epoch": 3.1236321750815894,
"grad_norm": 0.002555421706411845,
"learning_rate": 7.496159754224271e-06,
"loss": 0.0003,
"step": 4070
},
{
"epoch": 3.1313111921674026,
"grad_norm": 0.006555677014598616,
"learning_rate": 7.465437788018434e-06,
"loss": 0.0003,
"step": 4080
},
{
"epoch": 3.1389902092532154,
"grad_norm": 0.009434120428132338,
"learning_rate": 7.434715821812597e-06,
"loss": 0.0003,
"step": 4090
},
{
"epoch": 3.1466692263390286,
"grad_norm": 0.0041818007155617395,
"learning_rate": 7.403993855606759e-06,
"loss": 0.0003,
"step": 4100
},
{
"epoch": 3.1543482434248418,
"grad_norm": 0.002877849745067617,
"learning_rate": 7.373271889400923e-06,
"loss": 0.0003,
"step": 4110
},
{
"epoch": 3.1620272605106545,
"grad_norm": 0.0030447348846358723,
"learning_rate": 7.342549923195085e-06,
"loss": 0.0003,
"step": 4120
},
{
"epoch": 3.1697062775964677,
"grad_norm": 0.0060218718801864175,
"learning_rate": 7.311827956989248e-06,
"loss": 0.0003,
"step": 4130
},
{
"epoch": 3.1773852946822805,
"grad_norm": 0.006332525678995614,
"learning_rate": 7.28110599078341e-06,
"loss": 0.0003,
"step": 4140
},
{
"epoch": 3.1850643117680937,
"grad_norm": 0.003215146996700883,
"learning_rate": 7.250384024577574e-06,
"loss": 0.0003,
"step": 4150
},
{
"epoch": 3.192743328853907,
"grad_norm": 0.005935997258576502,
"learning_rate": 7.2196620583717355e-06,
"loss": 0.0003,
"step": 4160
},
{
"epoch": 3.2004223459397196,
"grad_norm": 0.004157021390061587,
"learning_rate": 7.188940092165899e-06,
"loss": 0.0003,
"step": 4170
},
{
"epoch": 3.208101363025533,
"grad_norm": 0.004859684777513284,
"learning_rate": 7.158218125960063e-06,
"loss": 0.0003,
"step": 4180
},
{
"epoch": 3.2157803801113456,
"grad_norm": 0.004857019863143934,
"learning_rate": 7.1274961597542245e-06,
"loss": 0.0003,
"step": 4190
},
{
"epoch": 3.223459397197159,
"grad_norm": 0.003878455168237981,
"learning_rate": 7.096774193548388e-06,
"loss": 0.0003,
"step": 4200
},
{
"epoch": 3.231138414282972,
"grad_norm": 0.004425680070729378,
"learning_rate": 7.06605222734255e-06,
"loss": 0.0003,
"step": 4210
},
{
"epoch": 3.2388174313687847,
"grad_norm": 0.0012776092639396753,
"learning_rate": 7.0353302611367134e-06,
"loss": 0.0003,
"step": 4220
},
{
"epoch": 3.246496448454598,
"grad_norm": 0.0032977925007112736,
"learning_rate": 7.004608294930876e-06,
"loss": 0.0003,
"step": 4230
},
{
"epoch": 3.2541754655404107,
"grad_norm": 0.00520607446673023,
"learning_rate": 6.973886328725039e-06,
"loss": 0.0003,
"step": 4240
},
{
"epoch": 3.261854482626224,
"grad_norm": 0.005908417082190133,
"learning_rate": 6.9431643625192015e-06,
"loss": 0.0003,
"step": 4250
},
{
"epoch": 3.269533499712037,
"grad_norm": 0.005104388415579662,
"learning_rate": 6.912442396313365e-06,
"loss": 0.0003,
"step": 4260
},
{
"epoch": 3.27721251679785,
"grad_norm": 0.00279843439440394,
"learning_rate": 6.881720430107528e-06,
"loss": 0.0003,
"step": 4270
},
{
"epoch": 3.284891533883663,
"grad_norm": 0.004221914284256829,
"learning_rate": 6.8509984639016905e-06,
"loss": 0.0003,
"step": 4280
},
{
"epoch": 3.292570550969476,
"grad_norm": 0.002789017236217476,
"learning_rate": 6.820276497695853e-06,
"loss": 0.0003,
"step": 4290
},
{
"epoch": 3.300249568055289,
"grad_norm": 0.003591747007618594,
"learning_rate": 6.789554531490016e-06,
"loss": 0.0003,
"step": 4300
},
{
"epoch": 3.3079285851411018,
"grad_norm": 0.0056326237606971275,
"learning_rate": 6.758832565284179e-06,
"loss": 0.0003,
"step": 4310
},
{
"epoch": 3.315607602226915,
"grad_norm": 0.0027487580853665806,
"learning_rate": 6.728110599078341e-06,
"loss": 0.0003,
"step": 4320
},
{
"epoch": 3.3232866193127277,
"grad_norm": 0.005221571566163444,
"learning_rate": 6.697388632872505e-06,
"loss": 0.0003,
"step": 4330
},
{
"epoch": 3.330965636398541,
"grad_norm": 0.005764385454293689,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0003,
"step": 4340
},
{
"epoch": 3.338644653484354,
"grad_norm": 0.0019604837190694527,
"learning_rate": 6.63594470046083e-06,
"loss": 0.0003,
"step": 4350
},
{
"epoch": 3.346323670570167,
"grad_norm": 0.002896385918252742,
"learning_rate": 6.605222734254992e-06,
"loss": 0.0003,
"step": 4360
},
{
"epoch": 3.35400268765598,
"grad_norm": 0.00606820751300495,
"learning_rate": 6.574500768049156e-06,
"loss": 0.0003,
"step": 4370
},
{
"epoch": 3.361681704741793,
"grad_norm": 0.004591568141116001,
"learning_rate": 6.543778801843319e-06,
"loss": 0.0003,
"step": 4380
},
{
"epoch": 3.369360721827606,
"grad_norm": 0.002433120666611045,
"learning_rate": 6.513056835637481e-06,
"loss": 0.0003,
"step": 4390
},
{
"epoch": 3.3770397389134192,
"grad_norm": 0.00531173620847677,
"learning_rate": 6.4823348694316445e-06,
"loss": 0.0003,
"step": 4400
},
{
"epoch": 3.384718755999232,
"grad_norm": 0.005122482104609612,
"learning_rate": 6.451612903225806e-06,
"loss": 0.0003,
"step": 4410
},
{
"epoch": 3.392397773085045,
"grad_norm": 0.004100500912973401,
"learning_rate": 6.42089093701997e-06,
"loss": 0.0003,
"step": 4420
},
{
"epoch": 3.400076790170858,
"grad_norm": 0.0023809509840975367,
"learning_rate": 6.390168970814133e-06,
"loss": 0.0003,
"step": 4430
},
{
"epoch": 3.407755807256671,
"grad_norm": 0.0026348003137030064,
"learning_rate": 6.359447004608295e-06,
"loss": 0.0003,
"step": 4440
},
{
"epoch": 3.4154348243424844,
"grad_norm": 0.0013064532447140981,
"learning_rate": 6.328725038402458e-06,
"loss": 0.0003,
"step": 4450
},
{
"epoch": 3.423113841428297,
"grad_norm": 0.004540940802478115,
"learning_rate": 6.2980030721966216e-06,
"loss": 0.0003,
"step": 4460
},
{
"epoch": 3.4307928585141103,
"grad_norm": 0.0026972329703791845,
"learning_rate": 6.267281105990783e-06,
"loss": 0.0003,
"step": 4470
},
{
"epoch": 3.438471875599923,
"grad_norm": 0.004636444780767564,
"learning_rate": 6.236559139784947e-06,
"loss": 0.0003,
"step": 4480
},
{
"epoch": 3.4461508926857363,
"grad_norm": 0.003960381641830104,
"learning_rate": 6.2058371735791105e-06,
"loss": 0.0003,
"step": 4490
},
{
"epoch": 3.4538299097715495,
"grad_norm": 0.00474082033962727,
"learning_rate": 6.175115207373272e-06,
"loss": 0.0003,
"step": 4500
},
{
"epoch": 3.4615089268573622,
"grad_norm": 0.0038782360963232256,
"learning_rate": 6.144393241167436e-06,
"loss": 0.0003,
"step": 4510
},
{
"epoch": 3.4691879439431754,
"grad_norm": 0.0035297720880386315,
"learning_rate": 6.113671274961598e-06,
"loss": 0.0003,
"step": 4520
},
{
"epoch": 3.476866961028988,
"grad_norm": 0.004232110212456926,
"learning_rate": 6.082949308755761e-06,
"loss": 0.0003,
"step": 4530
},
{
"epoch": 3.4845459781148014,
"grad_norm": 0.002775233265931185,
"learning_rate": 6.052227342549923e-06,
"loss": 0.0002,
"step": 4540
},
{
"epoch": 3.492224995200614,
"grad_norm": 0.0022484687297900418,
"learning_rate": 6.021505376344087e-06,
"loss": 0.0003,
"step": 4550
},
{
"epoch": 3.4999040122864273,
"grad_norm": 0.010059652937527065,
"learning_rate": 5.9907834101382485e-06,
"loss": 0.0003,
"step": 4560
},
{
"epoch": 3.50758302937224,
"grad_norm": 0.005860058288903284,
"learning_rate": 5.960061443932412e-06,
"loss": 0.0003,
"step": 4570
},
{
"epoch": 3.5152620464580533,
"grad_norm": 0.001771873531568768,
"learning_rate": 5.929339477726575e-06,
"loss": 0.0003,
"step": 4580
},
{
"epoch": 3.5229410635438665,
"grad_norm": 0.0037856677349401477,
"learning_rate": 5.8986175115207375e-06,
"loss": 0.0003,
"step": 4590
},
{
"epoch": 3.5306200806296792,
"grad_norm": 0.00262376974978298,
"learning_rate": 5.867895545314901e-06,
"loss": 0.0003,
"step": 4600
},
{
"epoch": 3.5382990977154924,
"grad_norm": 0.003097531567235156,
"learning_rate": 5.837173579109064e-06,
"loss": 0.0003,
"step": 4610
},
{
"epoch": 3.545978114801305,
"grad_norm": 0.0019368382797062594,
"learning_rate": 5.806451612903226e-06,
"loss": 0.0003,
"step": 4620
},
{
"epoch": 3.5536571318871184,
"grad_norm": 0.004336527548689335,
"learning_rate": 5.775729646697389e-06,
"loss": 0.0003,
"step": 4630
},
{
"epoch": 3.5613361489729316,
"grad_norm": 0.001542836625256801,
"learning_rate": 5.745007680491552e-06,
"loss": 0.0003,
"step": 4640
},
{
"epoch": 3.5690151660587444,
"grad_norm": 0.006404316601820908,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.0003,
"step": 4650
},
{
"epoch": 3.5766941831445576,
"grad_norm": 0.0029147588687753185,
"learning_rate": 5.683563748079878e-06,
"loss": 0.0003,
"step": 4660
},
{
"epoch": 3.5843732002303703,
"grad_norm": 0.009747259780473038,
"learning_rate": 5.65284178187404e-06,
"loss": 0.0003,
"step": 4670
},
{
"epoch": 3.5920522173161835,
"grad_norm": 0.005435655230219059,
"learning_rate": 5.6221198156682035e-06,
"loss": 0.0003,
"step": 4680
},
{
"epoch": 3.5997312344019967,
"grad_norm": 0.0011809589959859185,
"learning_rate": 5.591397849462365e-06,
"loss": 0.0003,
"step": 4690
},
{
"epoch": 3.6074102514878095,
"grad_norm": 0.0043535225911655855,
"learning_rate": 5.560675883256529e-06,
"loss": 0.0003,
"step": 4700
},
{
"epoch": 3.6150892685736227,
"grad_norm": 0.002409686221015935,
"learning_rate": 5.529953917050692e-06,
"loss": 0.0003,
"step": 4710
},
{
"epoch": 3.6227682856594354,
"grad_norm": 0.0021895075558719937,
"learning_rate": 5.499231950844854e-06,
"loss": 0.0003,
"step": 4720
},
{
"epoch": 3.6304473027452486,
"grad_norm": 0.002325096639061682,
"learning_rate": 5.468509984639018e-06,
"loss": 0.0003,
"step": 4730
},
{
"epoch": 3.638126319831062,
"grad_norm": 0.005217362404820908,
"learning_rate": 5.43778801843318e-06,
"loss": 0.0003,
"step": 4740
},
{
"epoch": 3.6458053369168746,
"grad_norm": 0.0025128558421436426,
"learning_rate": 5.407066052227343e-06,
"loss": 0.0003,
"step": 4750
},
{
"epoch": 3.653484354002688,
"grad_norm": 0.003260458572828957,
"learning_rate": 5.376344086021506e-06,
"loss": 0.0003,
"step": 4760
},
{
"epoch": 3.6611633710885005,
"grad_norm": 0.004920360483939583,
"learning_rate": 5.345622119815669e-06,
"loss": 0.0003,
"step": 4770
},
{
"epoch": 3.6688423881743137,
"grad_norm": 0.006017572517411142,
"learning_rate": 5.314900153609831e-06,
"loss": 0.0003,
"step": 4780
},
{
"epoch": 3.676521405260127,
"grad_norm": 0.003921825806007615,
"learning_rate": 5.284178187403994e-06,
"loss": 0.0003,
"step": 4790
},
{
"epoch": 3.6842004223459397,
"grad_norm": 0.0026317828696058375,
"learning_rate": 5.253456221198157e-06,
"loss": 0.0003,
"step": 4800
},
{
"epoch": 3.691879439431753,
"grad_norm": 0.0018516494095851901,
"learning_rate": 5.22273425499232e-06,
"loss": 0.0003,
"step": 4810
},
{
"epoch": 3.6995584565175657,
"grad_norm": 0.0018334101081003715,
"learning_rate": 5.192012288786483e-06,
"loss": 0.0003,
"step": 4820
},
{
"epoch": 3.707237473603379,
"grad_norm": 0.0026102899650512555,
"learning_rate": 5.161290322580646e-06,
"loss": 0.0003,
"step": 4830
},
{
"epoch": 3.714916490689192,
"grad_norm": 0.0041980444717698105,
"learning_rate": 5.130568356374809e-06,
"loss": 0.0002,
"step": 4840
},
{
"epoch": 3.722595507775005,
"grad_norm": 0.002162407360619667,
"learning_rate": 5.099846390168971e-06,
"loss": 0.0003,
"step": 4850
},
{
"epoch": 3.7302745248608176,
"grad_norm": 0.003927892142640596,
"learning_rate": 5.0691244239631346e-06,
"loss": 0.0003,
"step": 4860
},
{
"epoch": 3.7379535419466308,
"grad_norm": 0.006489628564389115,
"learning_rate": 5.038402457757296e-06,
"loss": 0.0003,
"step": 4870
},
{
"epoch": 3.745632559032444,
"grad_norm": 0.0021265975651571616,
"learning_rate": 5.00768049155146e-06,
"loss": 0.0003,
"step": 4880
},
{
"epoch": 3.7533115761182567,
"grad_norm": 0.008523718567678173,
"learning_rate": 4.976958525345623e-06,
"loss": 0.0002,
"step": 4890
},
{
"epoch": 3.76099059320407,
"grad_norm": 0.00219483779562531,
"learning_rate": 4.946236559139785e-06,
"loss": 0.0003,
"step": 4900
},
{
"epoch": 3.7686696102898827,
"grad_norm": 0.0026242522484158557,
"learning_rate": 4.915514592933948e-06,
"loss": 0.0003,
"step": 4910
},
{
"epoch": 3.776348627375696,
"grad_norm": 0.0030264706878712897,
"learning_rate": 4.884792626728111e-06,
"loss": 0.0003,
"step": 4920
},
{
"epoch": 3.784027644461509,
"grad_norm": 0.006051061980646295,
"learning_rate": 4.8540706605222734e-06,
"loss": 0.0003,
"step": 4930
},
{
"epoch": 3.791706661547322,
"grad_norm": 0.0076497369041367514,
"learning_rate": 4.823348694316436e-06,
"loss": 0.0003,
"step": 4940
},
{
"epoch": 3.799385678633135,
"grad_norm": 0.003304104700477534,
"learning_rate": 4.7926267281106e-06,
"loss": 0.0003,
"step": 4950
},
{
"epoch": 3.807064695718948,
"grad_norm": 0.001277891356393671,
"learning_rate": 4.761904761904762e-06,
"loss": 0.0003,
"step": 4960
},
{
"epoch": 3.814743712804761,
"grad_norm": 0.0016109781447346389,
"learning_rate": 4.731182795698925e-06,
"loss": 0.0002,
"step": 4970
},
{
"epoch": 3.822422729890574,
"grad_norm": 0.0032221247449133765,
"learning_rate": 4.700460829493088e-06,
"loss": 0.0003,
"step": 4980
},
{
"epoch": 3.830101746976387,
"grad_norm": 0.0023617321542409296,
"learning_rate": 4.669738863287251e-06,
"loss": 0.0003,
"step": 4990
},
{
"epoch": 3.8377807640622,
"grad_norm": 0.0016875889883648163,
"learning_rate": 4.639016897081414e-06,
"loss": 0.0002,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 6510,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.088806659181773e+16,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}