final_full_2b_422 / trainer_state.json
LHL3341's picture
upload
a98cc5e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 8004,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0037488284910965324,
"grad_norm": 5.261307048988544,
"learning_rate": 1.1235955056179776e-07,
"loss": 0.6121,
"step": 10
},
{
"epoch": 0.007497656982193065,
"grad_norm": 5.15614665529525,
"learning_rate": 2.3720349563046193e-07,
"loss": 0.609,
"step": 20
},
{
"epoch": 0.011246485473289597,
"grad_norm": 4.807893824154755,
"learning_rate": 3.620474406991261e-07,
"loss": 0.6041,
"step": 30
},
{
"epoch": 0.01499531396438613,
"grad_norm": 3.702189980946651,
"learning_rate": 4.868913857677903e-07,
"loss": 0.5865,
"step": 40
},
{
"epoch": 0.01874414245548266,
"grad_norm": 3.348870199432412,
"learning_rate": 6.117353308364544e-07,
"loss": 0.5738,
"step": 50
},
{
"epoch": 0.022492970946579195,
"grad_norm": 1.3717180565873228,
"learning_rate": 7.365792759051186e-07,
"loss": 0.5303,
"step": 60
},
{
"epoch": 0.026241799437675725,
"grad_norm": 1.2344497017475258,
"learning_rate": 8.614232209737828e-07,
"loss": 0.5046,
"step": 70
},
{
"epoch": 0.02999062792877226,
"grad_norm": 0.9972181073865652,
"learning_rate": 9.86267166042447e-07,
"loss": 0.4776,
"step": 80
},
{
"epoch": 0.033739456419868794,
"grad_norm": 0.5107539966752407,
"learning_rate": 1.111111111111111e-06,
"loss": 0.4698,
"step": 90
},
{
"epoch": 0.03748828491096532,
"grad_norm": 0.38982019488173025,
"learning_rate": 1.2359550561797752e-06,
"loss": 0.464,
"step": 100
},
{
"epoch": 0.041237113402061855,
"grad_norm": 0.3325139789027463,
"learning_rate": 1.3607990012484395e-06,
"loss": 0.4446,
"step": 110
},
{
"epoch": 0.04498594189315839,
"grad_norm": 0.27751561949048015,
"learning_rate": 1.4856429463171037e-06,
"loss": 0.4386,
"step": 120
},
{
"epoch": 0.04873477038425492,
"grad_norm": 0.26409663300412245,
"learning_rate": 1.6104868913857679e-06,
"loss": 0.4394,
"step": 130
},
{
"epoch": 0.05248359887535145,
"grad_norm": 0.25322270864108015,
"learning_rate": 1.735330836454432e-06,
"loss": 0.4347,
"step": 140
},
{
"epoch": 0.056232427366447985,
"grad_norm": 0.24385171974327005,
"learning_rate": 1.8601747815230963e-06,
"loss": 0.4232,
"step": 150
},
{
"epoch": 0.05998125585754452,
"grad_norm": 0.2507796927254884,
"learning_rate": 1.9850187265917605e-06,
"loss": 0.4278,
"step": 160
},
{
"epoch": 0.06373008434864105,
"grad_norm": 0.24135744232005513,
"learning_rate": 2.1098626716604245e-06,
"loss": 0.4194,
"step": 170
},
{
"epoch": 0.06747891283973759,
"grad_norm": 0.23616171555517834,
"learning_rate": 2.234706616729089e-06,
"loss": 0.4154,
"step": 180
},
{
"epoch": 0.07122774133083412,
"grad_norm": 0.23673999658027137,
"learning_rate": 2.359550561797753e-06,
"loss": 0.4135,
"step": 190
},
{
"epoch": 0.07497656982193064,
"grad_norm": 0.23052101463745947,
"learning_rate": 2.484394506866417e-06,
"loss": 0.4129,
"step": 200
},
{
"epoch": 0.07872539831302718,
"grad_norm": 0.2475027892360779,
"learning_rate": 2.6092384519350818e-06,
"loss": 0.4082,
"step": 210
},
{
"epoch": 0.08247422680412371,
"grad_norm": 0.2455114921254245,
"learning_rate": 2.7340823970037454e-06,
"loss": 0.4052,
"step": 220
},
{
"epoch": 0.08622305529522024,
"grad_norm": 0.24513693066233658,
"learning_rate": 2.8589263420724094e-06,
"loss": 0.4046,
"step": 230
},
{
"epoch": 0.08997188378631678,
"grad_norm": 0.2375006603458473,
"learning_rate": 2.9837702871410738e-06,
"loss": 0.4046,
"step": 240
},
{
"epoch": 0.09372071227741331,
"grad_norm": 0.23868492253713985,
"learning_rate": 3.1086142322097378e-06,
"loss": 0.3975,
"step": 250
},
{
"epoch": 0.09746954076850985,
"grad_norm": 0.2326427861557998,
"learning_rate": 3.233458177278402e-06,
"loss": 0.3999,
"step": 260
},
{
"epoch": 0.10121836925960637,
"grad_norm": 0.23731014220097904,
"learning_rate": 3.358302122347066e-06,
"loss": 0.3974,
"step": 270
},
{
"epoch": 0.1049671977507029,
"grad_norm": 0.2548285517151653,
"learning_rate": 3.4831460674157306e-06,
"loss": 0.3984,
"step": 280
},
{
"epoch": 0.10871602624179943,
"grad_norm": 0.2415337350559026,
"learning_rate": 3.6079900124843946e-06,
"loss": 0.3916,
"step": 290
},
{
"epoch": 0.11246485473289597,
"grad_norm": 0.26258238150885815,
"learning_rate": 3.732833957553059e-06,
"loss": 0.3906,
"step": 300
},
{
"epoch": 0.1162136832239925,
"grad_norm": 0.26012016485357875,
"learning_rate": 3.857677902621723e-06,
"loss": 0.3889,
"step": 310
},
{
"epoch": 0.11996251171508904,
"grad_norm": 0.2569432757214338,
"learning_rate": 3.9825218476903875e-06,
"loss": 0.3873,
"step": 320
},
{
"epoch": 0.12371134020618557,
"grad_norm": 0.23813642779308677,
"learning_rate": 4.107365792759052e-06,
"loss": 0.3879,
"step": 330
},
{
"epoch": 0.1274601686972821,
"grad_norm": 0.2620384468843467,
"learning_rate": 4.2322097378277155e-06,
"loss": 0.3841,
"step": 340
},
{
"epoch": 0.13120899718837864,
"grad_norm": 0.2544756212917662,
"learning_rate": 4.35705368289638e-06,
"loss": 0.3881,
"step": 350
},
{
"epoch": 0.13495782567947517,
"grad_norm": 0.23871678308328853,
"learning_rate": 4.481897627965044e-06,
"loss": 0.3819,
"step": 360
},
{
"epoch": 0.1387066541705717,
"grad_norm": 0.27307451356383844,
"learning_rate": 4.606741573033709e-06,
"loss": 0.3849,
"step": 370
},
{
"epoch": 0.14245548266166824,
"grad_norm": 0.2982815099279342,
"learning_rate": 4.731585518102372e-06,
"loss": 0.384,
"step": 380
},
{
"epoch": 0.14620431115276475,
"grad_norm": 0.27841785889657866,
"learning_rate": 4.856429463171037e-06,
"loss": 0.3797,
"step": 390
},
{
"epoch": 0.14995313964386128,
"grad_norm": 0.2727860073948262,
"learning_rate": 4.9812734082397e-06,
"loss": 0.3826,
"step": 400
},
{
"epoch": 0.15370196813495782,
"grad_norm": 0.25678949790726463,
"learning_rate": 5.106117353308366e-06,
"loss": 0.38,
"step": 410
},
{
"epoch": 0.15745079662605435,
"grad_norm": 0.31173528136138384,
"learning_rate": 5.230961298377028e-06,
"loss": 0.3775,
"step": 420
},
{
"epoch": 0.16119962511715089,
"grad_norm": 0.2952941783305471,
"learning_rate": 5.355805243445693e-06,
"loss": 0.3821,
"step": 430
},
{
"epoch": 0.16494845360824742,
"grad_norm": 0.2908243225639686,
"learning_rate": 5.480649188514357e-06,
"loss": 0.3767,
"step": 440
},
{
"epoch": 0.16869728209934395,
"grad_norm": 0.25818770698112115,
"learning_rate": 5.6054931335830224e-06,
"loss": 0.3753,
"step": 450
},
{
"epoch": 0.1724461105904405,
"grad_norm": 0.3211832737453995,
"learning_rate": 5.730337078651685e-06,
"loss": 0.3763,
"step": 460
},
{
"epoch": 0.17619493908153702,
"grad_norm": 0.3105251087664963,
"learning_rate": 5.85518102372035e-06,
"loss": 0.3758,
"step": 470
},
{
"epoch": 0.17994376757263356,
"grad_norm": 0.2764457811664437,
"learning_rate": 5.980024968789014e-06,
"loss": 0.3703,
"step": 480
},
{
"epoch": 0.1836925960637301,
"grad_norm": 0.28650193996460066,
"learning_rate": 6.104868913857679e-06,
"loss": 0.3772,
"step": 490
},
{
"epoch": 0.18744142455482662,
"grad_norm": 0.29068625167257756,
"learning_rate": 6.229712858926342e-06,
"loss": 0.3734,
"step": 500
},
{
"epoch": 0.19119025304592316,
"grad_norm": 0.3259945542580016,
"learning_rate": 6.3545568039950064e-06,
"loss": 0.3734,
"step": 510
},
{
"epoch": 0.1949390815370197,
"grad_norm": 0.2921563403060802,
"learning_rate": 6.479400749063671e-06,
"loss": 0.3708,
"step": 520
},
{
"epoch": 0.19868791002811623,
"grad_norm": 0.28384494127638144,
"learning_rate": 6.6042446941323344e-06,
"loss": 0.3703,
"step": 530
},
{
"epoch": 0.20243673851921273,
"grad_norm": 0.35311062890426737,
"learning_rate": 6.729088639200999e-06,
"loss": 0.3686,
"step": 540
},
{
"epoch": 0.20618556701030927,
"grad_norm": 0.3645818485106796,
"learning_rate": 6.853932584269663e-06,
"loss": 0.3665,
"step": 550
},
{
"epoch": 0.2099343955014058,
"grad_norm": 0.29958381926712185,
"learning_rate": 6.978776529338328e-06,
"loss": 0.3703,
"step": 560
},
{
"epoch": 0.21368322399250234,
"grad_norm": 0.2943930481951806,
"learning_rate": 7.103620474406991e-06,
"loss": 0.3686,
"step": 570
},
{
"epoch": 0.21743205248359887,
"grad_norm": 0.33743210095771675,
"learning_rate": 7.228464419475656e-06,
"loss": 0.364,
"step": 580
},
{
"epoch": 0.2211808809746954,
"grad_norm": 0.30279035276936334,
"learning_rate": 7.35330836454432e-06,
"loss": 0.3664,
"step": 590
},
{
"epoch": 0.22492970946579194,
"grad_norm": 0.3222972752742645,
"learning_rate": 7.4781523096129846e-06,
"loss": 0.3644,
"step": 600
},
{
"epoch": 0.22867853795688847,
"grad_norm": 0.38710479557491634,
"learning_rate": 7.602996254681648e-06,
"loss": 0.3647,
"step": 610
},
{
"epoch": 0.232427366447985,
"grad_norm": 0.37149432246081854,
"learning_rate": 7.727840199750313e-06,
"loss": 0.3654,
"step": 620
},
{
"epoch": 0.23617619493908154,
"grad_norm": 0.3783560186385295,
"learning_rate": 7.852684144818978e-06,
"loss": 0.362,
"step": 630
},
{
"epoch": 0.23992502343017807,
"grad_norm": 0.37006311884362064,
"learning_rate": 7.97752808988764e-06,
"loss": 0.3615,
"step": 640
},
{
"epoch": 0.2436738519212746,
"grad_norm": 0.34763658374372114,
"learning_rate": 8.102372034956305e-06,
"loss": 0.3632,
"step": 650
},
{
"epoch": 0.24742268041237114,
"grad_norm": 0.29073397245455634,
"learning_rate": 8.22721598002497e-06,
"loss": 0.3583,
"step": 660
},
{
"epoch": 0.2511715089034677,
"grad_norm": 0.35203426090002876,
"learning_rate": 8.352059925093634e-06,
"loss": 0.3647,
"step": 670
},
{
"epoch": 0.2549203373945642,
"grad_norm": 0.437139066758181,
"learning_rate": 8.476903870162298e-06,
"loss": 0.3585,
"step": 680
},
{
"epoch": 0.25866916588566075,
"grad_norm": 0.3142538887122429,
"learning_rate": 8.601747815230963e-06,
"loss": 0.3568,
"step": 690
},
{
"epoch": 0.2624179943767573,
"grad_norm": 0.4424671561181043,
"learning_rate": 8.726591760299627e-06,
"loss": 0.3592,
"step": 700
},
{
"epoch": 0.2661668228678538,
"grad_norm": 0.4870824456249142,
"learning_rate": 8.851435705368292e-06,
"loss": 0.359,
"step": 710
},
{
"epoch": 0.26991565135895035,
"grad_norm": 0.37766488794403613,
"learning_rate": 8.976279650436954e-06,
"loss": 0.3573,
"step": 720
},
{
"epoch": 0.2736644798500469,
"grad_norm": 0.37439610464624506,
"learning_rate": 9.101123595505619e-06,
"loss": 0.3618,
"step": 730
},
{
"epoch": 0.2774133083411434,
"grad_norm": 0.4004032917727574,
"learning_rate": 9.225967540574283e-06,
"loss": 0.361,
"step": 740
},
{
"epoch": 0.28116213683223995,
"grad_norm": 0.3373876121827577,
"learning_rate": 9.350811485642946e-06,
"loss": 0.359,
"step": 750
},
{
"epoch": 0.2849109653233365,
"grad_norm": 0.36216174893173675,
"learning_rate": 9.475655430711612e-06,
"loss": 0.357,
"step": 760
},
{
"epoch": 0.28865979381443296,
"grad_norm": 0.42153204399869865,
"learning_rate": 9.600499375780276e-06,
"loss": 0.3577,
"step": 770
},
{
"epoch": 0.2924086223055295,
"grad_norm": 0.35740754175627953,
"learning_rate": 9.72534332084894e-06,
"loss": 0.3547,
"step": 780
},
{
"epoch": 0.29615745079662603,
"grad_norm": 0.32327770051907684,
"learning_rate": 9.850187265917604e-06,
"loss": 0.357,
"step": 790
},
{
"epoch": 0.29990627928772257,
"grad_norm": 0.460840791598725,
"learning_rate": 9.975031210986268e-06,
"loss": 0.3556,
"step": 800
},
{
"epoch": 0.3036551077788191,
"grad_norm": 0.42772161445538665,
"learning_rate": 9.99996956365783e-06,
"loss": 0.3552,
"step": 810
},
{
"epoch": 0.30740393626991563,
"grad_norm": 0.3837967533130784,
"learning_rate": 9.999845916652828e-06,
"loss": 0.3537,
"step": 820
},
{
"epoch": 0.31115276476101217,
"grad_norm": 0.3587190063419423,
"learning_rate": 9.999627159063904e-06,
"loss": 0.3522,
"step": 830
},
{
"epoch": 0.3149015932521087,
"grad_norm": 0.37997976078876,
"learning_rate": 9.999313295052418e-06,
"loss": 0.354,
"step": 840
},
{
"epoch": 0.31865042174320524,
"grad_norm": 0.33994105517602696,
"learning_rate": 9.998904330588908e-06,
"loss": 0.352,
"step": 850
},
{
"epoch": 0.32239925023430177,
"grad_norm": 0.3874016806923687,
"learning_rate": 9.998400273452987e-06,
"loss": 0.3528,
"step": 860
},
{
"epoch": 0.3261480787253983,
"grad_norm": 0.3219534280341289,
"learning_rate": 9.997801133233184e-06,
"loss": 0.3534,
"step": 870
},
{
"epoch": 0.32989690721649484,
"grad_norm": 0.359125234314906,
"learning_rate": 9.997106921326764e-06,
"loss": 0.3537,
"step": 880
},
{
"epoch": 0.3336457357075914,
"grad_norm": 0.3184315740360875,
"learning_rate": 9.996317650939515e-06,
"loss": 0.3528,
"step": 890
},
{
"epoch": 0.3373945641986879,
"grad_norm": 0.5591690891723508,
"learning_rate": 9.995433337085492e-06,
"loss": 0.3522,
"step": 900
},
{
"epoch": 0.34114339268978444,
"grad_norm": 0.3423810176135369,
"learning_rate": 9.994453996586737e-06,
"loss": 0.3501,
"step": 910
},
{
"epoch": 0.344892221180881,
"grad_norm": 0.40486938076977924,
"learning_rate": 9.99337964807295e-06,
"loss": 0.3496,
"step": 920
},
{
"epoch": 0.3486410496719775,
"grad_norm": 0.4088267861862101,
"learning_rate": 9.992210311981148e-06,
"loss": 0.357,
"step": 930
},
{
"epoch": 0.35238987816307404,
"grad_norm": 0.3795197690451968,
"learning_rate": 9.99094601055526e-06,
"loss": 0.3495,
"step": 940
},
{
"epoch": 0.3561387066541706,
"grad_norm": 0.37097930414190233,
"learning_rate": 9.989586767845721e-06,
"loss": 0.3469,
"step": 950
},
{
"epoch": 0.3598875351452671,
"grad_norm": 0.3513423854824207,
"learning_rate": 9.988132609708999e-06,
"loss": 0.3457,
"step": 960
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.36447802251950095,
"learning_rate": 9.986583563807109e-06,
"loss": 0.3486,
"step": 970
},
{
"epoch": 0.3673851921274602,
"grad_norm": 0.3729932757859573,
"learning_rate": 9.984939659607095e-06,
"loss": 0.3507,
"step": 980
},
{
"epoch": 0.3711340206185567,
"grad_norm": 0.29469670304955903,
"learning_rate": 9.983200928380461e-06,
"loss": 0.3459,
"step": 990
},
{
"epoch": 0.37488284910965325,
"grad_norm": 0.3654383892479426,
"learning_rate": 9.981367403202569e-06,
"loss": 0.3449,
"step": 1000
},
{
"epoch": 0.3786316776007498,
"grad_norm": 0.36618034043522385,
"learning_rate": 9.979439118952026e-06,
"loss": 0.3452,
"step": 1010
},
{
"epoch": 0.3823805060918463,
"grad_norm": 0.3577145469220018,
"learning_rate": 9.977416112310012e-06,
"loss": 0.3464,
"step": 1020
},
{
"epoch": 0.38612933458294285,
"grad_norm": 0.556582145142578,
"learning_rate": 9.97529842175958e-06,
"loss": 0.3446,
"step": 1030
},
{
"epoch": 0.3898781630740394,
"grad_norm": 0.40442147020098534,
"learning_rate": 9.973086087584929e-06,
"loss": 0.3454,
"step": 1040
},
{
"epoch": 0.3936269915651359,
"grad_norm": 0.34956913654247584,
"learning_rate": 9.970779151870634e-06,
"loss": 0.3454,
"step": 1050
},
{
"epoch": 0.39737582005623245,
"grad_norm": 0.40777603350082636,
"learning_rate": 9.96837765850085e-06,
"loss": 0.3459,
"step": 1060
},
{
"epoch": 0.40112464854732893,
"grad_norm": 0.38403660939167644,
"learning_rate": 9.96588165315847e-06,
"loss": 0.3464,
"step": 1070
},
{
"epoch": 0.40487347703842547,
"grad_norm": 0.4080672955719013,
"learning_rate": 9.963291183324264e-06,
"loss": 0.3441,
"step": 1080
},
{
"epoch": 0.408622305529522,
"grad_norm": 0.3685239162827897,
"learning_rate": 9.960606298275968e-06,
"loss": 0.3464,
"step": 1090
},
{
"epoch": 0.41237113402061853,
"grad_norm": 0.3380919841293949,
"learning_rate": 9.957827049087357e-06,
"loss": 0.3433,
"step": 1100
},
{
"epoch": 0.41611996251171507,
"grad_norm": 0.3315173931094776,
"learning_rate": 9.954953488627258e-06,
"loss": 0.3486,
"step": 1110
},
{
"epoch": 0.4198687910028116,
"grad_norm": 0.4016275857570284,
"learning_rate": 9.951985671558559e-06,
"loss": 0.3426,
"step": 1120
},
{
"epoch": 0.42361761949390814,
"grad_norm": 0.35593601511143413,
"learning_rate": 9.948923654337167e-06,
"loss": 0.3453,
"step": 1130
},
{
"epoch": 0.42736644798500467,
"grad_norm": 0.32764050829507113,
"learning_rate": 9.945767495210921e-06,
"loss": 0.3427,
"step": 1140
},
{
"epoch": 0.4311152764761012,
"grad_norm": 0.3022647428699131,
"learning_rate": 9.942517254218503e-06,
"loss": 0.3407,
"step": 1150
},
{
"epoch": 0.43486410496719774,
"grad_norm": 0.33454020574954163,
"learning_rate": 9.93917299318828e-06,
"loss": 0.3411,
"step": 1160
},
{
"epoch": 0.4386129334582943,
"grad_norm": 0.41746428448570383,
"learning_rate": 9.935734775737136e-06,
"loss": 0.3386,
"step": 1170
},
{
"epoch": 0.4423617619493908,
"grad_norm": 0.4736902430441777,
"learning_rate": 9.932202667269259e-06,
"loss": 0.3437,
"step": 1180
},
{
"epoch": 0.44611059044048734,
"grad_norm": 0.36913024892238444,
"learning_rate": 9.928576734974903e-06,
"loss": 0.3444,
"step": 1190
},
{
"epoch": 0.4498594189315839,
"grad_norm": 0.3085872877246692,
"learning_rate": 9.924857047829097e-06,
"loss": 0.3407,
"step": 1200
},
{
"epoch": 0.4536082474226804,
"grad_norm": 0.5014477248612932,
"learning_rate": 9.92104367659035e-06,
"loss": 0.3402,
"step": 1210
},
{
"epoch": 0.45735707591377694,
"grad_norm": 0.37208177374221185,
"learning_rate": 9.917136693799287e-06,
"loss": 0.3443,
"step": 1220
},
{
"epoch": 0.4611059044048735,
"grad_norm": 0.4736108253772323,
"learning_rate": 9.91313617377728e-06,
"loss": 0.3402,
"step": 1230
},
{
"epoch": 0.46485473289597,
"grad_norm": 0.3989185456576625,
"learning_rate": 9.909042192625038e-06,
"loss": 0.3387,
"step": 1240
},
{
"epoch": 0.46860356138706655,
"grad_norm": 0.4452446526616405,
"learning_rate": 9.904854828221142e-06,
"loss": 0.3444,
"step": 1250
},
{
"epoch": 0.4723523898781631,
"grad_norm": 0.32502934619385765,
"learning_rate": 9.900574160220589e-06,
"loss": 0.3349,
"step": 1260
},
{
"epoch": 0.4761012183692596,
"grad_norm": 0.3626693442578511,
"learning_rate": 9.896200270053248e-06,
"loss": 0.3414,
"step": 1270
},
{
"epoch": 0.47985004686035615,
"grad_norm": 0.32869562107129774,
"learning_rate": 9.891733240922336e-06,
"loss": 0.3422,
"step": 1280
},
{
"epoch": 0.4835988753514527,
"grad_norm": 0.37637832215175443,
"learning_rate": 9.887173157802823e-06,
"loss": 0.3366,
"step": 1290
},
{
"epoch": 0.4873477038425492,
"grad_norm": 0.3508655971923107,
"learning_rate": 9.882520107439813e-06,
"loss": 0.3386,
"step": 1300
},
{
"epoch": 0.49109653233364575,
"grad_norm": 0.34109879872339993,
"learning_rate": 9.877774178346901e-06,
"loss": 0.3363,
"step": 1310
},
{
"epoch": 0.4948453608247423,
"grad_norm": 0.32811254268165824,
"learning_rate": 9.87293546080449e-06,
"loss": 0.3395,
"step": 1320
},
{
"epoch": 0.4985941893158388,
"grad_norm": 0.4254861700788238,
"learning_rate": 9.868004046858063e-06,
"loss": 0.3371,
"step": 1330
},
{
"epoch": 0.5023430178069354,
"grad_norm": 0.3785996603686958,
"learning_rate": 9.862980030316445e-06,
"loss": 0.3375,
"step": 1340
},
{
"epoch": 0.5060918462980318,
"grad_norm": 0.3514746852402266,
"learning_rate": 9.857863506750008e-06,
"loss": 0.3373,
"step": 1350
},
{
"epoch": 0.5098406747891284,
"grad_norm": 0.39720746535746887,
"learning_rate": 9.852654573488865e-06,
"loss": 0.3361,
"step": 1360
},
{
"epoch": 0.5135895032802249,
"grad_norm": 0.5262745224774735,
"learning_rate": 9.847353329621001e-06,
"loss": 0.338,
"step": 1370
},
{
"epoch": 0.5173383317713215,
"grad_norm": 0.39442611541662853,
"learning_rate": 9.841959875990406e-06,
"loss": 0.3354,
"step": 1380
},
{
"epoch": 0.521087160262418,
"grad_norm": 0.4302778358785363,
"learning_rate": 9.836474315195148e-06,
"loss": 0.3354,
"step": 1390
},
{
"epoch": 0.5248359887535146,
"grad_norm": 0.35819495417141983,
"learning_rate": 9.830896751585419e-06,
"loss": 0.3385,
"step": 1400
},
{
"epoch": 0.528584817244611,
"grad_norm": 0.4018817834079902,
"learning_rate": 9.825227291261555e-06,
"loss": 0.3372,
"step": 1410
},
{
"epoch": 0.5323336457357076,
"grad_norm": 0.45163931451375233,
"learning_rate": 9.819466042072016e-06,
"loss": 0.3361,
"step": 1420
},
{
"epoch": 0.5360824742268041,
"grad_norm": 0.41628377691191,
"learning_rate": 9.813613113611336e-06,
"loss": 0.3335,
"step": 1430
},
{
"epoch": 0.5398313027179007,
"grad_norm": 0.32539571890495544,
"learning_rate": 9.807668617218033e-06,
"loss": 0.3357,
"step": 1440
},
{
"epoch": 0.5435801312089972,
"grad_norm": 0.3226911203286496,
"learning_rate": 9.801632665972496e-06,
"loss": 0.3369,
"step": 1450
},
{
"epoch": 0.5473289597000938,
"grad_norm": 0.39760989762825094,
"learning_rate": 9.795505374694833e-06,
"loss": 0.3361,
"step": 1460
},
{
"epoch": 0.5510777881911902,
"grad_norm": 0.4106794192966664,
"learning_rate": 9.78928685994269e-06,
"loss": 0.3334,
"step": 1470
},
{
"epoch": 0.5548266166822868,
"grad_norm": 0.34122967448980807,
"learning_rate": 9.78297724000902e-06,
"loss": 0.3338,
"step": 1480
},
{
"epoch": 0.5585754451733833,
"grad_norm": 0.3568238195196395,
"learning_rate": 9.776576634919853e-06,
"loss": 0.3337,
"step": 1490
},
{
"epoch": 0.5623242736644799,
"grad_norm": 0.4345852471251167,
"learning_rate": 9.770085166431998e-06,
"loss": 0.3302,
"step": 1500
},
{
"epoch": 0.5660731021555764,
"grad_norm": 0.3501750906447824,
"learning_rate": 9.763502958030733e-06,
"loss": 0.3336,
"step": 1510
},
{
"epoch": 0.569821930646673,
"grad_norm": 0.4280185568028667,
"learning_rate": 9.75683013492745e-06,
"loss": 0.3345,
"step": 1520
},
{
"epoch": 0.5735707591377694,
"grad_norm": 0.33687559054684085,
"learning_rate": 9.750066824057286e-06,
"loss": 0.3319,
"step": 1530
},
{
"epoch": 0.5773195876288659,
"grad_norm": 0.4603527064195899,
"learning_rate": 9.74321315407669e-06,
"loss": 0.3362,
"step": 1540
},
{
"epoch": 0.5810684161199625,
"grad_norm": 0.37047815009428875,
"learning_rate": 9.736269255360993e-06,
"loss": 0.3308,
"step": 1550
},
{
"epoch": 0.584817244611059,
"grad_norm": 0.3618655285759886,
"learning_rate": 9.729235260001919e-06,
"loss": 0.333,
"step": 1560
},
{
"epoch": 0.5885660731021556,
"grad_norm": 0.3818659087459316,
"learning_rate": 9.72211130180507e-06,
"loss": 0.3323,
"step": 1570
},
{
"epoch": 0.5923149015932521,
"grad_norm": 0.3996155158067246,
"learning_rate": 9.714897516287392e-06,
"loss": 0.3346,
"step": 1580
},
{
"epoch": 0.5960637300843487,
"grad_norm": 0.37123696273112744,
"learning_rate": 9.707594040674577e-06,
"loss": 0.3361,
"step": 1590
},
{
"epoch": 0.5998125585754451,
"grad_norm": 0.3452230221109262,
"learning_rate": 9.700201013898478e-06,
"loss": 0.3349,
"step": 1600
},
{
"epoch": 0.6035613870665417,
"grad_norm": 0.500409003679961,
"learning_rate": 9.692718576594447e-06,
"loss": 0.3323,
"step": 1610
},
{
"epoch": 0.6073102155576382,
"grad_norm": 0.5303403579247664,
"learning_rate": 9.685146871098663e-06,
"loss": 0.3303,
"step": 1620
},
{
"epoch": 0.6110590440487348,
"grad_norm": 0.3376647352530898,
"learning_rate": 9.677486041445436e-06,
"loss": 0.3329,
"step": 1630
},
{
"epoch": 0.6148078725398313,
"grad_norm": 0.41119673564751275,
"learning_rate": 9.669736233364448e-06,
"loss": 0.3349,
"step": 1640
},
{
"epoch": 0.6185567010309279,
"grad_norm": 0.3574831617883723,
"learning_rate": 9.661897594278e-06,
"loss": 0.3345,
"step": 1650
},
{
"epoch": 0.6223055295220243,
"grad_norm": 0.3748467608566312,
"learning_rate": 9.653970273298197e-06,
"loss": 0.3325,
"step": 1660
},
{
"epoch": 0.6260543580131209,
"grad_norm": 0.3884786451043299,
"learning_rate": 9.645954421224106e-06,
"loss": 0.3316,
"step": 1670
},
{
"epoch": 0.6298031865042174,
"grad_norm": 0.35595037812559693,
"learning_rate": 9.637850190538904e-06,
"loss": 0.3268,
"step": 1680
},
{
"epoch": 0.633552014995314,
"grad_norm": 0.34174290421878073,
"learning_rate": 9.629657735406964e-06,
"loss": 0.3279,
"step": 1690
},
{
"epoch": 0.6373008434864105,
"grad_norm": 0.36083476878974785,
"learning_rate": 9.621377211670926e-06,
"loss": 0.3307,
"step": 1700
},
{
"epoch": 0.6410496719775071,
"grad_norm": 0.3289272824820436,
"learning_rate": 9.613008776848734e-06,
"loss": 0.329,
"step": 1710
},
{
"epoch": 0.6447985004686035,
"grad_norm": 0.35744054622945465,
"learning_rate": 9.604552590130638e-06,
"loss": 0.3291,
"step": 1720
},
{
"epoch": 0.6485473289597001,
"grad_norm": 0.43446933745858823,
"learning_rate": 9.596008812376167e-06,
"loss": 0.3314,
"step": 1730
},
{
"epoch": 0.6522961574507966,
"grad_norm": 0.38294138333824385,
"learning_rate": 9.587377606111067e-06,
"loss": 0.3259,
"step": 1740
},
{
"epoch": 0.6560449859418932,
"grad_norm": 0.3524604957942587,
"learning_rate": 9.578659135524214e-06,
"loss": 0.3354,
"step": 1750
},
{
"epoch": 0.6597938144329897,
"grad_norm": 0.42690501583378293,
"learning_rate": 9.569853566464482e-06,
"loss": 0.3282,
"step": 1760
},
{
"epoch": 0.6635426429240863,
"grad_norm": 0.48509167399052666,
"learning_rate": 9.560961066437595e-06,
"loss": 0.3328,
"step": 1770
},
{
"epoch": 0.6672914714151827,
"grad_norm": 0.3313886004708506,
"learning_rate": 9.551981804602943e-06,
"loss": 0.327,
"step": 1780
},
{
"epoch": 0.6710402999062793,
"grad_norm": 0.4278444007339863,
"learning_rate": 9.542915951770356e-06,
"loss": 0.3298,
"step": 1790
},
{
"epoch": 0.6747891283973758,
"grad_norm": 0.43003182010366847,
"learning_rate": 9.533763680396857e-06,
"loss": 0.3299,
"step": 1800
},
{
"epoch": 0.6785379568884724,
"grad_norm": 0.38848251404003764,
"learning_rate": 9.524525164583389e-06,
"loss": 0.3299,
"step": 1810
},
{
"epoch": 0.6822867853795689,
"grad_norm": 0.3812957689804934,
"learning_rate": 9.515200580071495e-06,
"loss": 0.3303,
"step": 1820
},
{
"epoch": 0.6860356138706654,
"grad_norm": 0.5538363802153154,
"learning_rate": 9.505790104239975e-06,
"loss": 0.3289,
"step": 1830
},
{
"epoch": 0.689784442361762,
"grad_norm": 0.4099362555822008,
"learning_rate": 9.496293916101516e-06,
"loss": 0.3303,
"step": 1840
},
{
"epoch": 0.6935332708528584,
"grad_norm": 0.511739746908337,
"learning_rate": 9.486712196299285e-06,
"loss": 0.3312,
"step": 1850
},
{
"epoch": 0.697282099343955,
"grad_norm": 0.38232455442549296,
"learning_rate": 9.477045127103495e-06,
"loss": 0.3305,
"step": 1860
},
{
"epoch": 0.7010309278350515,
"grad_norm": 0.46907822057098875,
"learning_rate": 9.467292892407926e-06,
"loss": 0.327,
"step": 1870
},
{
"epoch": 0.7047797563261481,
"grad_norm": 0.333174592156295,
"learning_rate": 9.457455677726447e-06,
"loss": 0.3276,
"step": 1880
},
{
"epoch": 0.7085285848172446,
"grad_norm": 0.4504078695656366,
"learning_rate": 9.447533670189472e-06,
"loss": 0.3298,
"step": 1890
},
{
"epoch": 0.7122774133083412,
"grad_norm": 0.36606560970104657,
"learning_rate": 9.437527058540398e-06,
"loss": 0.3317,
"step": 1900
},
{
"epoch": 0.7160262417994376,
"grad_norm": 0.31085511164293034,
"learning_rate": 9.427436033132033e-06,
"loss": 0.3256,
"step": 1910
},
{
"epoch": 0.7197750702905342,
"grad_norm": 0.3485865828995088,
"learning_rate": 9.417260785922953e-06,
"loss": 0.3262,
"step": 1920
},
{
"epoch": 0.7235238987816307,
"grad_norm": 0.45177189070417023,
"learning_rate": 9.407001510473861e-06,
"loss": 0.3282,
"step": 1930
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.4145405256703436,
"learning_rate": 9.396658401943913e-06,
"loss": 0.3269,
"step": 1940
},
{
"epoch": 0.7310215557638238,
"grad_norm": 0.39368332737774225,
"learning_rate": 9.386231657086984e-06,
"loss": 0.3267,
"step": 1950
},
{
"epoch": 0.7347703842549204,
"grad_norm": 0.4086874229698859,
"learning_rate": 9.37572147424795e-06,
"loss": 0.3301,
"step": 1960
},
{
"epoch": 0.7385192127460168,
"grad_norm": 0.41879040405476353,
"learning_rate": 9.365128053358896e-06,
"loss": 0.3267,
"step": 1970
},
{
"epoch": 0.7422680412371134,
"grad_norm": 0.48491627309563456,
"learning_rate": 9.35445159593532e-06,
"loss": 0.3279,
"step": 1980
},
{
"epoch": 0.7460168697282099,
"grad_norm": 0.3400392892381988,
"learning_rate": 9.343692305072302e-06,
"loss": 0.3257,
"step": 1990
},
{
"epoch": 0.7497656982193065,
"grad_norm": 0.38335608109171726,
"learning_rate": 9.332850385440637e-06,
"loss": 0.3272,
"step": 2000
},
{
"epoch": 0.753514526710403,
"grad_norm": 0.3872685747801623,
"learning_rate": 9.32192604328294e-06,
"loss": 0.3303,
"step": 2010
},
{
"epoch": 0.7572633552014996,
"grad_norm": 0.43431054195807284,
"learning_rate": 9.31091948640973e-06,
"loss": 0.3252,
"step": 2020
},
{
"epoch": 0.761012183692596,
"grad_norm": 0.3987919949548056,
"learning_rate": 9.299830924195468e-06,
"loss": 0.3295,
"step": 2030
},
{
"epoch": 0.7647610121836926,
"grad_norm": 0.43495362781321384,
"learning_rate": 9.28866056757458e-06,
"loss": 0.3271,
"step": 2040
},
{
"epoch": 0.7685098406747891,
"grad_norm": 0.5319949922755799,
"learning_rate": 9.277408629037442e-06,
"loss": 0.3211,
"step": 2050
},
{
"epoch": 0.7722586691658857,
"grad_norm": 0.4399645846756319,
"learning_rate": 9.266075322626338e-06,
"loss": 0.3252,
"step": 2060
},
{
"epoch": 0.7760074976569822,
"grad_norm": 0.6456342041580845,
"learning_rate": 9.254660863931392e-06,
"loss": 0.3252,
"step": 2070
},
{
"epoch": 0.7797563261480788,
"grad_norm": 0.36547086873410367,
"learning_rate": 9.243165470086463e-06,
"loss": 0.3264,
"step": 2080
},
{
"epoch": 0.7835051546391752,
"grad_norm": 0.3872999565414706,
"learning_rate": 9.23158935976501e-06,
"loss": 0.3253,
"step": 2090
},
{
"epoch": 0.7872539831302718,
"grad_norm": 0.35985858368454743,
"learning_rate": 9.219932753175944e-06,
"loss": 0.3252,
"step": 2100
},
{
"epoch": 0.7910028116213683,
"grad_norm": 0.4412647339459317,
"learning_rate": 9.208195872059429e-06,
"loss": 0.324,
"step": 2110
},
{
"epoch": 0.7947516401124649,
"grad_norm": 0.3973882373774354,
"learning_rate": 9.19637893968267e-06,
"loss": 0.3261,
"step": 2120
},
{
"epoch": 0.7985004686035614,
"grad_norm": 0.357118509545678,
"learning_rate": 9.184482180835662e-06,
"loss": 0.3238,
"step": 2130
},
{
"epoch": 0.8022492970946579,
"grad_norm": 0.3223398993083127,
"learning_rate": 9.172505821826911e-06,
"loss": 0.325,
"step": 2140
},
{
"epoch": 0.8059981255857545,
"grad_norm": 0.3256784650503759,
"learning_rate": 9.160450090479144e-06,
"loss": 0.3229,
"step": 2150
},
{
"epoch": 0.8097469540768509,
"grad_norm": 0.550415905591969,
"learning_rate": 9.148315216124954e-06,
"loss": 0.3275,
"step": 2160
},
{
"epoch": 0.8134957825679475,
"grad_norm": 0.42518457436926477,
"learning_rate": 9.136101429602451e-06,
"loss": 0.327,
"step": 2170
},
{
"epoch": 0.817244611059044,
"grad_norm": 0.4261965775006577,
"learning_rate": 9.123808963250873e-06,
"loss": 0.3225,
"step": 2180
},
{
"epoch": 0.8209934395501406,
"grad_norm": 0.43150733510100164,
"learning_rate": 9.111438050906151e-06,
"loss": 0.3279,
"step": 2190
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.37038125521844484,
"learning_rate": 9.09898892789648e-06,
"loss": 0.324,
"step": 2200
},
{
"epoch": 0.8284910965323337,
"grad_norm": 0.40692784507847907,
"learning_rate": 9.08646183103783e-06,
"loss": 0.3296,
"step": 2210
},
{
"epoch": 0.8322399250234301,
"grad_norm": 0.386868671740811,
"learning_rate": 9.07385699862944e-06,
"loss": 0.3215,
"step": 2220
},
{
"epoch": 0.8359887535145267,
"grad_norm": 0.3350868838762515,
"learning_rate": 9.061174670449298e-06,
"loss": 0.3246,
"step": 2230
},
{
"epoch": 0.8397375820056232,
"grad_norm": 0.34427971717421035,
"learning_rate": 9.048415087749565e-06,
"loss": 0.3242,
"step": 2240
},
{
"epoch": 0.8434864104967198,
"grad_norm": 0.3856288956960091,
"learning_rate": 9.03557849325199e-06,
"loss": 0.326,
"step": 2250
},
{
"epoch": 0.8472352389878163,
"grad_norm": 0.39608051805449296,
"learning_rate": 9.022665131143303e-06,
"loss": 0.3229,
"step": 2260
},
{
"epoch": 0.8509840674789129,
"grad_norm": 0.41794409702724045,
"learning_rate": 9.00967524707055e-06,
"loss": 0.3232,
"step": 2270
},
{
"epoch": 0.8547328959700093,
"grad_norm": 0.43169401522169315,
"learning_rate": 8.996609088136444e-06,
"loss": 0.3255,
"step": 2280
},
{
"epoch": 0.8584817244611059,
"grad_norm": 0.35305226663749295,
"learning_rate": 8.98346690289464e-06,
"loss": 0.3222,
"step": 2290
},
{
"epoch": 0.8622305529522024,
"grad_norm": 0.40183352362359215,
"learning_rate": 8.970248941345028e-06,
"loss": 0.3201,
"step": 2300
},
{
"epoch": 0.865979381443299,
"grad_norm": 0.37025187300501505,
"learning_rate": 8.956955454928966e-06,
"loss": 0.3226,
"step": 2310
},
{
"epoch": 0.8697282099343955,
"grad_norm": 0.4229158348114115,
"learning_rate": 8.943586696524495e-06,
"loss": 0.3227,
"step": 2320
},
{
"epoch": 0.8734770384254921,
"grad_norm": 0.33306767880493077,
"learning_rate": 8.930142920441536e-06,
"loss": 0.3233,
"step": 2330
},
{
"epoch": 0.8772258669165885,
"grad_norm": 0.3664337194336669,
"learning_rate": 8.916624382417052e-06,
"loss": 0.3244,
"step": 2340
},
{
"epoch": 0.8809746954076851,
"grad_norm": 0.39368217476089484,
"learning_rate": 8.903031339610172e-06,
"loss": 0.3205,
"step": 2350
},
{
"epoch": 0.8847235238987816,
"grad_norm": 0.35736567969388083,
"learning_rate": 8.889364050597315e-06,
"loss": 0.3208,
"step": 2360
},
{
"epoch": 0.8884723523898782,
"grad_norm": 0.33000582790021804,
"learning_rate": 8.87562277536726e-06,
"loss": 0.3224,
"step": 2370
},
{
"epoch": 0.8922211808809747,
"grad_norm": 0.4427745690627731,
"learning_rate": 8.861807775316205e-06,
"loss": 0.3203,
"step": 2380
},
{
"epoch": 0.8959700093720713,
"grad_norm": 0.4066534130717824,
"learning_rate": 8.847919313242792e-06,
"loss": 0.3211,
"step": 2390
},
{
"epoch": 0.8997188378631678,
"grad_norm": 0.34844243986285434,
"learning_rate": 8.833957653343112e-06,
"loss": 0.3199,
"step": 2400
},
{
"epoch": 0.9034676663542643,
"grad_norm": 0.3594011818287422,
"learning_rate": 8.819923061205674e-06,
"loss": 0.321,
"step": 2410
},
{
"epoch": 0.9072164948453608,
"grad_norm": 0.34488001664929596,
"learning_rate": 8.805815803806353e-06,
"loss": 0.3255,
"step": 2420
},
{
"epoch": 0.9109653233364574,
"grad_norm": 0.3409284776369216,
"learning_rate": 8.791636149503322e-06,
"loss": 0.3195,
"step": 2430
},
{
"epoch": 0.9147141518275539,
"grad_norm": 0.3853629161293025,
"learning_rate": 8.777384368031929e-06,
"loss": 0.3216,
"step": 2440
},
{
"epoch": 0.9184629803186504,
"grad_norm": 0.34448988964699645,
"learning_rate": 8.763060730499582e-06,
"loss": 0.322,
"step": 2450
},
{
"epoch": 0.922211808809747,
"grad_norm": 0.3382752164401171,
"learning_rate": 8.748665509380582e-06,
"loss": 0.3213,
"step": 2460
},
{
"epoch": 0.9259606373008434,
"grad_norm": 0.3396778964486745,
"learning_rate": 8.73419897851095e-06,
"loss": 0.3236,
"step": 2470
},
{
"epoch": 0.92970946579194,
"grad_norm": 0.37725271996102505,
"learning_rate": 8.7196614130832e-06,
"loss": 0.3189,
"step": 2480
},
{
"epoch": 0.9334582942830365,
"grad_norm": 0.32462914306061436,
"learning_rate": 8.705053089641125e-06,
"loss": 0.3221,
"step": 2490
},
{
"epoch": 0.9372071227741331,
"grad_norm": 0.4369048990730118,
"learning_rate": 8.690374286074522e-06,
"loss": 0.3188,
"step": 2500
},
{
"epoch": 0.9409559512652296,
"grad_norm": 0.3760926470610608,
"learning_rate": 8.675625281613914e-06,
"loss": 0.3212,
"step": 2510
},
{
"epoch": 0.9447047797563262,
"grad_norm": 0.34740401778415597,
"learning_rate": 8.660806356825226e-06,
"loss": 0.3248,
"step": 2520
},
{
"epoch": 0.9484536082474226,
"grad_norm": 0.417666450800287,
"learning_rate": 8.64591779360447e-06,
"loss": 0.3233,
"step": 2530
},
{
"epoch": 0.9522024367385192,
"grad_norm": 0.42158210771195226,
"learning_rate": 8.63095987517236e-06,
"loss": 0.3221,
"step": 2540
},
{
"epoch": 0.9559512652296157,
"grad_norm": 0.4056842055888519,
"learning_rate": 8.615932886068936e-06,
"loss": 0.3244,
"step": 2550
},
{
"epoch": 0.9597000937207123,
"grad_norm": 0.2986528504566679,
"learning_rate": 8.600837112148147e-06,
"loss": 0.3244,
"step": 2560
},
{
"epoch": 0.9634489222118088,
"grad_norm": 0.4221810983370233,
"learning_rate": 8.585672840572418e-06,
"loss": 0.3171,
"step": 2570
},
{
"epoch": 0.9671977507029054,
"grad_norm": 0.32724721456300626,
"learning_rate": 8.570440359807185e-06,
"loss": 0.3221,
"step": 2580
},
{
"epoch": 0.9709465791940018,
"grad_norm": 0.4057823948549215,
"learning_rate": 8.555139959615404e-06,
"loss": 0.3161,
"step": 2590
},
{
"epoch": 0.9746954076850984,
"grad_norm": 0.30760675235109797,
"learning_rate": 8.539771931052042e-06,
"loss": 0.3202,
"step": 2600
},
{
"epoch": 0.9784442361761949,
"grad_norm": 0.4037579844772881,
"learning_rate": 8.524336566458546e-06,
"loss": 0.3189,
"step": 2610
},
{
"epoch": 0.9821930646672915,
"grad_norm": 0.3410849671328779,
"learning_rate": 8.50883415945727e-06,
"loss": 0.3196,
"step": 2620
},
{
"epoch": 0.985941893158388,
"grad_norm": 0.3450287677089191,
"learning_rate": 8.493265004945896e-06,
"loss": 0.3221,
"step": 2630
},
{
"epoch": 0.9896907216494846,
"grad_norm": 0.42284107643629854,
"learning_rate": 8.477629399091829e-06,
"loss": 0.3207,
"step": 2640
},
{
"epoch": 0.993439550140581,
"grad_norm": 0.3054813070655616,
"learning_rate": 8.461927639326557e-06,
"loss": 0.3167,
"step": 2650
},
{
"epoch": 0.9971883786316776,
"grad_norm": 0.37328989726555284,
"learning_rate": 8.446160024339991e-06,
"loss": 0.3236,
"step": 2660
},
{
"epoch": 1.0007497656982194,
"grad_norm": 0.30788577364316916,
"learning_rate": 8.430326854074787e-06,
"loss": 0.3214,
"step": 2670
},
{
"epoch": 1.0044985941893159,
"grad_norm": 0.35820789954813653,
"learning_rate": 8.41442842972064e-06,
"loss": 0.3168,
"step": 2680
},
{
"epoch": 1.0082474226804123,
"grad_norm": 0.3817197374504937,
"learning_rate": 8.398465053708555e-06,
"loss": 0.3161,
"step": 2690
},
{
"epoch": 1.0119962511715088,
"grad_norm": 0.4062171398157765,
"learning_rate": 8.382437029705096e-06,
"loss": 0.3155,
"step": 2700
},
{
"epoch": 1.0157450796626055,
"grad_norm": 0.3540693852002706,
"learning_rate": 8.3663446626066e-06,
"loss": 0.3144,
"step": 2710
},
{
"epoch": 1.019493908153702,
"grad_norm": 0.4029712327717952,
"learning_rate": 8.350188258533387e-06,
"loss": 0.3162,
"step": 2720
},
{
"epoch": 1.0232427366447985,
"grad_norm": 0.3159333545246974,
"learning_rate": 8.333968124823935e-06,
"loss": 0.3168,
"step": 2730
},
{
"epoch": 1.026991565135895,
"grad_norm": 0.33529821419279554,
"learning_rate": 8.31768457002903e-06,
"loss": 0.3144,
"step": 2740
},
{
"epoch": 1.0307403936269917,
"grad_norm": 0.29642181276226026,
"learning_rate": 8.301337903905895e-06,
"loss": 0.3123,
"step": 2750
},
{
"epoch": 1.0344892221180881,
"grad_norm": 0.3454172596114213,
"learning_rate": 8.28492843741231e-06,
"loss": 0.315,
"step": 2760
},
{
"epoch": 1.0382380506091846,
"grad_norm": 0.34066649262255677,
"learning_rate": 8.26845648270068e-06,
"loss": 0.3142,
"step": 2770
},
{
"epoch": 1.041986879100281,
"grad_norm": 0.43684514447857653,
"learning_rate": 8.251922353112108e-06,
"loss": 0.3146,
"step": 2780
},
{
"epoch": 1.0457357075913778,
"grad_norm": 0.3242821399353603,
"learning_rate": 8.235326363170428e-06,
"loss": 0.313,
"step": 2790
},
{
"epoch": 1.0494845360824743,
"grad_norm": 0.3789281542297383,
"learning_rate": 8.21866882857623e-06,
"loss": 0.31,
"step": 2800
},
{
"epoch": 1.0532333645735708,
"grad_norm": 0.3248576385401445,
"learning_rate": 8.201950066200848e-06,
"loss": 0.311,
"step": 2810
},
{
"epoch": 1.0569821930646672,
"grad_norm": 0.36161121635161686,
"learning_rate": 8.185170394080331e-06,
"loss": 0.3153,
"step": 2820
},
{
"epoch": 1.060731021555764,
"grad_norm": 0.4276056542533018,
"learning_rate": 8.168330131409401e-06,
"loss": 0.3123,
"step": 2830
},
{
"epoch": 1.0644798500468604,
"grad_norm": 0.3567320922642298,
"learning_rate": 8.15142959853537e-06,
"loss": 0.314,
"step": 2840
},
{
"epoch": 1.0682286785379569,
"grad_norm": 0.3412615684823743,
"learning_rate": 8.134469116952058e-06,
"loss": 0.3138,
"step": 2850
},
{
"epoch": 1.0719775070290534,
"grad_norm": 0.3526545824695297,
"learning_rate": 8.117449009293668e-06,
"loss": 0.3137,
"step": 2860
},
{
"epoch": 1.0757263355201498,
"grad_norm": 0.3601042619190153,
"learning_rate": 8.100369599328653e-06,
"loss": 0.3145,
"step": 2870
},
{
"epoch": 1.0794751640112465,
"grad_norm": 0.4413371086788278,
"learning_rate": 8.083231211953556e-06,
"loss": 0.3183,
"step": 2880
},
{
"epoch": 1.083223992502343,
"grad_norm": 0.35663942982642843,
"learning_rate": 8.06603417318683e-06,
"loss": 0.3137,
"step": 2890
},
{
"epoch": 1.0869728209934395,
"grad_norm": 0.30103176112371444,
"learning_rate": 8.048778810162638e-06,
"loss": 0.3158,
"step": 2900
},
{
"epoch": 1.090721649484536,
"grad_norm": 0.5017990471408166,
"learning_rate": 8.031465451124623e-06,
"loss": 0.3104,
"step": 2910
},
{
"epoch": 1.0944704779756327,
"grad_norm": 0.30760999555771873,
"learning_rate": 8.014094425419672e-06,
"loss": 0.3093,
"step": 2920
},
{
"epoch": 1.0982193064667292,
"grad_norm": 0.3744914287772446,
"learning_rate": 7.99666606349165e-06,
"loss": 0.3143,
"step": 2930
},
{
"epoch": 1.1019681349578256,
"grad_norm": 0.321847870026043,
"learning_rate": 7.979180696875107e-06,
"loss": 0.3148,
"step": 2940
},
{
"epoch": 1.1057169634489221,
"grad_norm": 0.34119760664325655,
"learning_rate": 7.961638658188982e-06,
"loss": 0.3156,
"step": 2950
},
{
"epoch": 1.1094657919400188,
"grad_norm": 0.3046924003401573,
"learning_rate": 7.944040281130266e-06,
"loss": 0.3112,
"step": 2960
},
{
"epoch": 1.1132146204311153,
"grad_norm": 0.34431924035163775,
"learning_rate": 7.926385900467656e-06,
"loss": 0.308,
"step": 2970
},
{
"epoch": 1.1169634489222118,
"grad_norm": 0.322861633348364,
"learning_rate": 7.908675852035198e-06,
"loss": 0.3115,
"step": 2980
},
{
"epoch": 1.1207122774133083,
"grad_norm": 0.4011613509203003,
"learning_rate": 7.89091047272588e-06,
"loss": 0.3141,
"step": 2990
},
{
"epoch": 1.124461105904405,
"grad_norm": 0.4644811139880584,
"learning_rate": 7.873090100485235e-06,
"loss": 0.3119,
"step": 3000
},
{
"epoch": 1.1282099343955014,
"grad_norm": 0.3288163794026555,
"learning_rate": 7.855215074304913e-06,
"loss": 0.3115,
"step": 3010
},
{
"epoch": 1.131958762886598,
"grad_norm": 0.3862750722060379,
"learning_rate": 7.837285734216228e-06,
"loss": 0.3108,
"step": 3020
},
{
"epoch": 1.1357075913776944,
"grad_norm": 0.4449512249542337,
"learning_rate": 7.819302421283692e-06,
"loss": 0.3121,
"step": 3030
},
{
"epoch": 1.139456419868791,
"grad_norm": 0.306482330717066,
"learning_rate": 7.801265477598525e-06,
"loss": 0.3155,
"step": 3040
},
{
"epoch": 1.1432052483598876,
"grad_norm": 0.3537936797605087,
"learning_rate": 7.783175246272151e-06,
"loss": 0.3137,
"step": 3050
},
{
"epoch": 1.146954076850984,
"grad_norm": 0.39316069541271836,
"learning_rate": 7.765032071429669e-06,
"loss": 0.3158,
"step": 3060
},
{
"epoch": 1.1507029053420805,
"grad_norm": 0.3496204259814267,
"learning_rate": 7.7468362982033e-06,
"loss": 0.3148,
"step": 3070
},
{
"epoch": 1.1544517338331772,
"grad_norm": 0.34318492953087437,
"learning_rate": 7.728588272725838e-06,
"loss": 0.3143,
"step": 3080
},
{
"epoch": 1.1582005623242737,
"grad_norm": 0.33547854591563525,
"learning_rate": 7.710288342124053e-06,
"loss": 0.3117,
"step": 3090
},
{
"epoch": 1.1619493908153702,
"grad_norm": 0.33238683759141413,
"learning_rate": 7.691936854512089e-06,
"loss": 0.3122,
"step": 3100
},
{
"epoch": 1.1656982193064667,
"grad_norm": 0.3364307963397178,
"learning_rate": 7.673534158984843e-06,
"loss": 0.3118,
"step": 3110
},
{
"epoch": 1.1694470477975631,
"grad_norm": 0.3388871161978866,
"learning_rate": 7.655080605611326e-06,
"loss": 0.3107,
"step": 3120
},
{
"epoch": 1.1731958762886598,
"grad_norm": 0.32235529671903823,
"learning_rate": 7.636576545428006e-06,
"loss": 0.3115,
"step": 3130
},
{
"epoch": 1.1769447047797563,
"grad_norm": 0.41170615872376826,
"learning_rate": 7.618022330432122e-06,
"loss": 0.3112,
"step": 3140
},
{
"epoch": 1.1806935332708528,
"grad_norm": 0.38112866816887303,
"learning_rate": 7.599418313574997e-06,
"loss": 0.3134,
"step": 3150
},
{
"epoch": 1.1844423617619495,
"grad_norm": 0.4177346294198595,
"learning_rate": 7.580764848755315e-06,
"loss": 0.3129,
"step": 3160
},
{
"epoch": 1.188191190253046,
"grad_norm": 0.44163782423530795,
"learning_rate": 7.5620622908124e-06,
"loss": 0.3129,
"step": 3170
},
{
"epoch": 1.1919400187441425,
"grad_norm": 0.32956971930691387,
"learning_rate": 7.543310995519457e-06,
"loss": 0.3116,
"step": 3180
},
{
"epoch": 1.195688847235239,
"grad_norm": 0.3452689739953157,
"learning_rate": 7.524511319576808e-06,
"loss": 0.3118,
"step": 3190
},
{
"epoch": 1.1994376757263354,
"grad_norm": 0.3304621485731618,
"learning_rate": 7.5056636206051014e-06,
"loss": 0.3151,
"step": 3200
},
{
"epoch": 1.2031865042174321,
"grad_norm": 0.32694078694361833,
"learning_rate": 7.486768257138519e-06,
"loss": 0.3126,
"step": 3210
},
{
"epoch": 1.2069353327085286,
"grad_norm": 0.298213189088602,
"learning_rate": 7.4678255886179495e-06,
"loss": 0.3104,
"step": 3220
},
{
"epoch": 1.210684161199625,
"grad_norm": 0.3102729131161918,
"learning_rate": 7.44883597538415e-06,
"loss": 0.3119,
"step": 3230
},
{
"epoch": 1.2144329896907216,
"grad_norm": 0.3615016329359893,
"learning_rate": 7.429799778670892e-06,
"loss": 0.312,
"step": 3240
},
{
"epoch": 1.2181818181818183,
"grad_norm": 0.27320951055269593,
"learning_rate": 7.410717360598091e-06,
"loss": 0.3122,
"step": 3250
},
{
"epoch": 1.2219306466729147,
"grad_norm": 0.32659263816751305,
"learning_rate": 7.3915890841649185e-06,
"loss": 0.3091,
"step": 3260
},
{
"epoch": 1.2256794751640112,
"grad_norm": 0.3334234925194134,
"learning_rate": 7.3724153132429e-06,
"loss": 0.3098,
"step": 3270
},
{
"epoch": 1.2294283036551077,
"grad_norm": 0.2954664212502977,
"learning_rate": 7.353196412568981e-06,
"loss": 0.3109,
"step": 3280
},
{
"epoch": 1.2331771321462044,
"grad_norm": 0.3625209301919828,
"learning_rate": 7.333932747738604e-06,
"loss": 0.3111,
"step": 3290
},
{
"epoch": 1.2369259606373009,
"grad_norm": 0.3777090990507428,
"learning_rate": 7.314624685198739e-06,
"loss": 0.3123,
"step": 3300
},
{
"epoch": 1.2406747891283973,
"grad_norm": 0.3213218051422179,
"learning_rate": 7.295272592240931e-06,
"loss": 0.3135,
"step": 3310
},
{
"epoch": 1.2444236176194938,
"grad_norm": 0.3517340125442994,
"learning_rate": 7.275876836994293e-06,
"loss": 0.3098,
"step": 3320
},
{
"epoch": 1.2481724461105905,
"grad_norm": 0.2958254932033676,
"learning_rate": 7.256437788418518e-06,
"loss": 0.3117,
"step": 3330
},
{
"epoch": 1.251921274601687,
"grad_norm": 0.36561257727299235,
"learning_rate": 7.236955816296853e-06,
"loss": 0.3122,
"step": 3340
},
{
"epoch": 1.2556701030927835,
"grad_norm": 0.3058483367622925,
"learning_rate": 7.217431291229068e-06,
"loss": 0.312,
"step": 3350
},
{
"epoch": 1.25941893158388,
"grad_norm": 0.360042705506358,
"learning_rate": 7.197864584624404e-06,
"loss": 0.3098,
"step": 3360
},
{
"epoch": 1.2631677600749764,
"grad_norm": 0.33352021186668945,
"learning_rate": 7.178256068694511e-06,
"loss": 0.3103,
"step": 3370
},
{
"epoch": 1.2669165885660731,
"grad_norm": 0.35734952787552315,
"learning_rate": 7.158606116446364e-06,
"loss": 0.3102,
"step": 3380
},
{
"epoch": 1.2706654170571696,
"grad_norm": 0.39449641298830695,
"learning_rate": 7.138915101675165e-06,
"loss": 0.3102,
"step": 3390
},
{
"epoch": 1.274414245548266,
"grad_norm": 0.3714471907535279,
"learning_rate": 7.1191833989572435e-06,
"loss": 0.3123,
"step": 3400
},
{
"epoch": 1.2781630740393628,
"grad_norm": 0.3042416177198767,
"learning_rate": 7.099411383642918e-06,
"loss": 0.312,
"step": 3410
},
{
"epoch": 1.2819119025304593,
"grad_norm": 0.32047368301258783,
"learning_rate": 7.079599431849364e-06,
"loss": 0.3121,
"step": 3420
},
{
"epoch": 1.2856607310215558,
"grad_norm": 0.41019651538058094,
"learning_rate": 7.059747920453458e-06,
"loss": 0.3103,
"step": 3430
},
{
"epoch": 1.2894095595126522,
"grad_norm": 0.3551928742197105,
"learning_rate": 7.0398572270846034e-06,
"loss": 0.3111,
"step": 3440
},
{
"epoch": 1.2931583880037487,
"grad_norm": 0.3565239012658027,
"learning_rate": 7.019927730117553e-06,
"loss": 0.3108,
"step": 3450
},
{
"epoch": 1.2969072164948454,
"grad_norm": 0.40917892127860045,
"learning_rate": 6.999959808665208e-06,
"loss": 0.3095,
"step": 3460
},
{
"epoch": 1.300656044985942,
"grad_norm": 0.3769832501031786,
"learning_rate": 6.979953842571409e-06,
"loss": 0.3091,
"step": 3470
},
{
"epoch": 1.3044048734770384,
"grad_norm": 0.370975430040192,
"learning_rate": 6.959910212403708e-06,
"loss": 0.3081,
"step": 3480
},
{
"epoch": 1.308153701968135,
"grad_norm": 0.33276491617365694,
"learning_rate": 6.939829299446127e-06,
"loss": 0.3099,
"step": 3490
},
{
"epoch": 1.3119025304592316,
"grad_norm": 0.37547647443532434,
"learning_rate": 6.919711485691909e-06,
"loss": 0.3101,
"step": 3500
},
{
"epoch": 1.315651358950328,
"grad_norm": 0.38523049314373486,
"learning_rate": 6.899557153836252e-06,
"loss": 0.3104,
"step": 3510
},
{
"epoch": 1.3194001874414245,
"grad_norm": 0.32838375873093845,
"learning_rate": 6.8793666872690224e-06,
"loss": 0.3095,
"step": 3520
},
{
"epoch": 1.323149015932521,
"grad_norm": 0.3183180402450787,
"learning_rate": 6.859140470067471e-06,
"loss": 0.3096,
"step": 3530
},
{
"epoch": 1.3268978444236177,
"grad_norm": 0.3629948359100712,
"learning_rate": 6.838878886988921e-06,
"loss": 0.3121,
"step": 3540
},
{
"epoch": 1.3306466729147142,
"grad_norm": 0.3533555709563143,
"learning_rate": 6.818582323463447e-06,
"loss": 0.3123,
"step": 3550
},
{
"epoch": 1.3343955014058106,
"grad_norm": 0.33208001817653265,
"learning_rate": 6.798251165586554e-06,
"loss": 0.3049,
"step": 3560
},
{
"epoch": 1.3381443298969073,
"grad_norm": 0.3002792502712748,
"learning_rate": 6.777885800111814e-06,
"loss": 0.3142,
"step": 3570
},
{
"epoch": 1.3418931583880038,
"grad_norm": 0.35269336353853314,
"learning_rate": 6.757486614443528e-06,
"loss": 0.3098,
"step": 3580
},
{
"epoch": 1.3456419868791003,
"grad_norm": 0.3461464681174475,
"learning_rate": 6.737053996629349e-06,
"loss": 0.3127,
"step": 3590
},
{
"epoch": 1.3493908153701968,
"grad_norm": 0.3240065140465353,
"learning_rate": 6.716588335352894e-06,
"loss": 0.3112,
"step": 3600
},
{
"epoch": 1.3531396438612933,
"grad_norm": 0.3906540626649883,
"learning_rate": 6.69609001992636e-06,
"loss": 0.3067,
"step": 3610
},
{
"epoch": 1.3568884723523897,
"grad_norm": 0.2896675578566779,
"learning_rate": 6.675559440283115e-06,
"loss": 0.3088,
"step": 3620
},
{
"epoch": 1.3606373008434864,
"grad_norm": 0.37220517825735044,
"learning_rate": 6.654996986970277e-06,
"loss": 0.3116,
"step": 3630
},
{
"epoch": 1.364386129334583,
"grad_norm": 0.36451423055829396,
"learning_rate": 6.634403051141287e-06,
"loss": 0.3096,
"step": 3640
},
{
"epoch": 1.3681349578256794,
"grad_norm": 0.33555700523774556,
"learning_rate": 6.613778024548471e-06,
"loss": 0.3059,
"step": 3650
},
{
"epoch": 1.371883786316776,
"grad_norm": 0.3589735145982526,
"learning_rate": 6.593122299535583e-06,
"loss": 0.3082,
"step": 3660
},
{
"epoch": 1.3756326148078726,
"grad_norm": 0.3535758021471356,
"learning_rate": 6.572436269030349e-06,
"loss": 0.3056,
"step": 3670
},
{
"epoch": 1.379381443298969,
"grad_norm": 0.3139302849140221,
"learning_rate": 6.55172032653698e-06,
"loss": 0.3071,
"step": 3680
},
{
"epoch": 1.3831302717900655,
"grad_norm": 0.30876175987731125,
"learning_rate": 6.530974866128699e-06,
"loss": 0.3119,
"step": 3690
},
{
"epoch": 1.386879100281162,
"grad_norm": 0.29612839549415565,
"learning_rate": 6.510200282440235e-06,
"loss": 0.3089,
"step": 3700
},
{
"epoch": 1.3906279287722587,
"grad_norm": 0.367321799885979,
"learning_rate": 6.489396970660327e-06,
"loss": 0.3101,
"step": 3710
},
{
"epoch": 1.3943767572633552,
"grad_norm": 0.3185186823770553,
"learning_rate": 6.4685653265241965e-06,
"loss": 0.3111,
"step": 3720
},
{
"epoch": 1.3981255857544517,
"grad_norm": 0.3767241891962025,
"learning_rate": 6.447705746306022e-06,
"loss": 0.3094,
"step": 3730
},
{
"epoch": 1.4018744142455484,
"grad_norm": 0.33019970567321594,
"learning_rate": 6.426818626811402e-06,
"loss": 0.3132,
"step": 3740
},
{
"epoch": 1.4056232427366449,
"grad_norm": 0.32094766671797814,
"learning_rate": 6.405904365369807e-06,
"loss": 0.3088,
"step": 3750
},
{
"epoch": 1.4093720712277413,
"grad_norm": 0.2826846139255292,
"learning_rate": 6.384963359827023e-06,
"loss": 0.3081,
"step": 3760
},
{
"epoch": 1.4131208997188378,
"grad_norm": 0.32741886750116905,
"learning_rate": 6.3639960085375765e-06,
"loss": 0.3053,
"step": 3770
},
{
"epoch": 1.4168697282099343,
"grad_norm": 0.317490371811339,
"learning_rate": 6.343002710357164e-06,
"loss": 0.3074,
"step": 3780
},
{
"epoch": 1.420618556701031,
"grad_norm": 0.41306311997301026,
"learning_rate": 6.321983864635064e-06,
"loss": 0.3057,
"step": 3790
},
{
"epoch": 1.4243673851921275,
"grad_norm": 0.38554380688703854,
"learning_rate": 6.300939871206534e-06,
"loss": 0.3105,
"step": 3800
},
{
"epoch": 1.428116213683224,
"grad_norm": 0.32564252337266814,
"learning_rate": 6.279871130385212e-06,
"loss": 0.3072,
"step": 3810
},
{
"epoch": 1.4318650421743206,
"grad_norm": 0.2827430544551273,
"learning_rate": 6.258778042955498e-06,
"loss": 0.3086,
"step": 3820
},
{
"epoch": 1.4356138706654171,
"grad_norm": 0.29456325510951775,
"learning_rate": 6.2376610101649286e-06,
"loss": 0.3037,
"step": 3830
},
{
"epoch": 1.4393626991565136,
"grad_norm": 0.3965462936666553,
"learning_rate": 6.216520433716544e-06,
"loss": 0.3078,
"step": 3840
},
{
"epoch": 1.44311152764761,
"grad_norm": 0.35168327410996947,
"learning_rate": 6.1953567157612546e-06,
"loss": 0.304,
"step": 3850
},
{
"epoch": 1.4468603561387066,
"grad_norm": 0.28589685002558846,
"learning_rate": 6.174170258890183e-06,
"loss": 0.3084,
"step": 3860
},
{
"epoch": 1.4506091846298033,
"grad_norm": 0.3031381651642691,
"learning_rate": 6.152961466127003e-06,
"loss": 0.3082,
"step": 3870
},
{
"epoch": 1.4543580131208997,
"grad_norm": 0.2921304333612235,
"learning_rate": 6.131730740920281e-06,
"loss": 0.3082,
"step": 3880
},
{
"epoch": 1.4581068416119962,
"grad_norm": 0.30283235717875956,
"learning_rate": 6.110478487135798e-06,
"loss": 0.3061,
"step": 3890
},
{
"epoch": 1.461855670103093,
"grad_norm": 0.332497742843437,
"learning_rate": 6.0892051090488635e-06,
"loss": 0.3084,
"step": 3900
},
{
"epoch": 1.4656044985941894,
"grad_norm": 0.41697991725406525,
"learning_rate": 6.067911011336631e-06,
"loss": 0.3071,
"step": 3910
},
{
"epoch": 1.4693533270852859,
"grad_norm": 0.4051234101656245,
"learning_rate": 6.046596599070401e-06,
"loss": 0.3104,
"step": 3920
},
{
"epoch": 1.4731021555763824,
"grad_norm": 0.3164549035040791,
"learning_rate": 6.0252622777079035e-06,
"loss": 0.3099,
"step": 3930
},
{
"epoch": 1.4768509840674788,
"grad_norm": 0.31241166834227563,
"learning_rate": 6.003908453085601e-06,
"loss": 0.311,
"step": 3940
},
{
"epoch": 1.4805998125585753,
"grad_norm": 0.32222188905870913,
"learning_rate": 5.9825355314109526e-06,
"loss": 0.3072,
"step": 3950
},
{
"epoch": 1.484348641049672,
"grad_norm": 0.3227944538376381,
"learning_rate": 5.961143919254703e-06,
"loss": 0.308,
"step": 3960
},
{
"epoch": 1.4880974695407685,
"grad_norm": 0.3300037675651502,
"learning_rate": 5.939734023543136e-06,
"loss": 0.3046,
"step": 3970
},
{
"epoch": 1.491846298031865,
"grad_norm": 0.30075953903254604,
"learning_rate": 5.918306251550339e-06,
"loss": 0.302,
"step": 3980
},
{
"epoch": 1.4955951265229617,
"grad_norm": 0.31317933873421516,
"learning_rate": 5.8968610108904544e-06,
"loss": 0.3067,
"step": 3990
},
{
"epoch": 1.4993439550140581,
"grad_norm": 0.29399450034399577,
"learning_rate": 5.8753987095099265e-06,
"loss": 0.3083,
"step": 4000
},
{
"epoch": 1.5030927835051546,
"grad_norm": 0.3323948341742506,
"learning_rate": 5.85391975567974e-06,
"loss": 0.3062,
"step": 4010
},
{
"epoch": 1.506841611996251,
"grad_norm": 0.3084482430218866,
"learning_rate": 5.832424557987656e-06,
"loss": 0.3099,
"step": 4020
},
{
"epoch": 1.5105904404873476,
"grad_norm": 0.3207010067585515,
"learning_rate": 5.810913525330431e-06,
"loss": 0.3052,
"step": 4030
},
{
"epoch": 1.5143392689784443,
"grad_norm": 0.31177483150319013,
"learning_rate": 5.789387066906058e-06,
"loss": 0.3075,
"step": 4040
},
{
"epoch": 1.5180880974695408,
"grad_norm": 0.29934212917396297,
"learning_rate": 5.7678455922059555e-06,
"loss": 0.3052,
"step": 4050
},
{
"epoch": 1.5218369259606375,
"grad_norm": 0.34568865059481235,
"learning_rate": 5.746289511007203e-06,
"loss": 0.309,
"step": 4060
},
{
"epoch": 1.525585754451734,
"grad_norm": 0.3456913152992093,
"learning_rate": 5.724719233364731e-06,
"loss": 0.311,
"step": 4070
},
{
"epoch": 1.5293345829428304,
"grad_norm": 0.27251969528129666,
"learning_rate": 5.703135169603522e-06,
"loss": 0.3062,
"step": 4080
},
{
"epoch": 1.533083411433927,
"grad_norm": 0.2690943892304097,
"learning_rate": 5.681537730310811e-06,
"loss": 0.3074,
"step": 4090
},
{
"epoch": 1.5368322399250234,
"grad_norm": 0.278333988084598,
"learning_rate": 5.659927326328272e-06,
"loss": 0.3068,
"step": 4100
},
{
"epoch": 1.5405810684161199,
"grad_norm": 0.3244786550521004,
"learning_rate": 5.6383043687442045e-06,
"loss": 0.3056,
"step": 4110
},
{
"epoch": 1.5443298969072163,
"grad_norm": 0.2872353797592551,
"learning_rate": 5.616669268885704e-06,
"loss": 0.3106,
"step": 4120
},
{
"epoch": 1.548078725398313,
"grad_norm": 0.2849148407943691,
"learning_rate": 5.595022438310853e-06,
"loss": 0.3048,
"step": 4130
},
{
"epoch": 1.5518275538894095,
"grad_norm": 0.2796372211580368,
"learning_rate": 5.573364288800879e-06,
"loss": 0.3053,
"step": 4140
},
{
"epoch": 1.5555763823805062,
"grad_norm": 0.30531840338480354,
"learning_rate": 5.551695232352325e-06,
"loss": 0.3087,
"step": 4150
},
{
"epoch": 1.5593252108716027,
"grad_norm": 0.4238218939540311,
"learning_rate": 5.530015681169221e-06,
"loss": 0.3055,
"step": 4160
},
{
"epoch": 1.5630740393626992,
"grad_norm": 0.3815305304277874,
"learning_rate": 5.508326047655228e-06,
"loss": 0.3057,
"step": 4170
},
{
"epoch": 1.5668228678537957,
"grad_norm": 0.3138756321656075,
"learning_rate": 5.486626744405803e-06,
"loss": 0.3041,
"step": 4180
},
{
"epoch": 1.5705716963448921,
"grad_norm": 0.33872416612121964,
"learning_rate": 5.464918184200346e-06,
"loss": 0.3051,
"step": 4190
},
{
"epoch": 1.5743205248359886,
"grad_norm": 0.28251389314936254,
"learning_rate": 5.443200779994352e-06,
"loss": 0.3056,
"step": 4200
},
{
"epoch": 1.5780693533270853,
"grad_norm": 0.30129264913596,
"learning_rate": 5.42147494491155e-06,
"loss": 0.3093,
"step": 4210
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.3521670880883061,
"learning_rate": 5.399741092236048e-06,
"loss": 0.3048,
"step": 4220
},
{
"epoch": 1.5855670103092785,
"grad_norm": 0.3228669758788861,
"learning_rate": 5.377999635404471e-06,
"loss": 0.3053,
"step": 4230
},
{
"epoch": 1.589315838800375,
"grad_norm": 0.310469703159362,
"learning_rate": 5.356250987998096e-06,
"loss": 0.305,
"step": 4240
},
{
"epoch": 1.5930646672914714,
"grad_norm": 0.30212536889324887,
"learning_rate": 5.334495563734982e-06,
"loss": 0.3053,
"step": 4250
},
{
"epoch": 1.596813495782568,
"grad_norm": 0.286958233099421,
"learning_rate": 5.312733776462104e-06,
"loss": 0.3076,
"step": 4260
},
{
"epoch": 1.6005623242736644,
"grad_norm": 0.28804728629879095,
"learning_rate": 5.290966040147478e-06,
"loss": 0.3043,
"step": 4270
},
{
"epoch": 1.6043111527647609,
"grad_norm": 0.30051995708100954,
"learning_rate": 5.269192768872287e-06,
"loss": 0.3052,
"step": 4280
},
{
"epoch": 1.6080599812558576,
"grad_norm": 0.28056826093377685,
"learning_rate": 5.247414376823002e-06,
"loss": 0.307,
"step": 4290
},
{
"epoch": 1.611808809746954,
"grad_norm": 0.2917800338604193,
"learning_rate": 5.225631278283509e-06,
"loss": 0.3081,
"step": 4300
},
{
"epoch": 1.6155576382380508,
"grad_norm": 0.32427902637956957,
"learning_rate": 5.203843887627223e-06,
"loss": 0.3056,
"step": 4310
},
{
"epoch": 1.6193064667291472,
"grad_norm": 0.3284938009296222,
"learning_rate": 5.1820526193092035e-06,
"loss": 0.3,
"step": 4320
},
{
"epoch": 1.6230552952202437,
"grad_norm": 0.2994176202208045,
"learning_rate": 5.160257887858278e-06,
"loss": 0.3055,
"step": 4330
},
{
"epoch": 1.6268041237113402,
"grad_norm": 0.30335734202532266,
"learning_rate": 5.138460107869144e-06,
"loss": 0.3063,
"step": 4340
},
{
"epoch": 1.6305529522024367,
"grad_norm": 0.2981285125458323,
"learning_rate": 5.116659693994502e-06,
"loss": 0.3075,
"step": 4350
},
{
"epoch": 1.6343017806935332,
"grad_norm": 0.31311406252278756,
"learning_rate": 5.09485706093715e-06,
"loss": 0.305,
"step": 4360
},
{
"epoch": 1.6380506091846299,
"grad_norm": 0.34474359679529554,
"learning_rate": 5.073052623442102e-06,
"loss": 0.3058,
"step": 4370
},
{
"epoch": 1.6417994376757263,
"grad_norm": 0.28599712118797227,
"learning_rate": 5.0512467962886925e-06,
"loss": 0.3061,
"step": 4380
},
{
"epoch": 1.6455482661668228,
"grad_norm": 0.2943654382080562,
"learning_rate": 5.029439994282698e-06,
"loss": 0.3066,
"step": 4390
},
{
"epoch": 1.6492970946579195,
"grad_norm": 0.29496328633119784,
"learning_rate": 5.007632632248435e-06,
"loss": 0.3041,
"step": 4400
},
{
"epoch": 1.653045923149016,
"grad_norm": 0.3047275804837925,
"learning_rate": 4.985825125020875e-06,
"loss": 0.3057,
"step": 4410
},
{
"epoch": 1.6567947516401125,
"grad_norm": 0.3144745132241415,
"learning_rate": 4.9640178874377555e-06,
"loss": 0.3076,
"step": 4420
},
{
"epoch": 1.660543580131209,
"grad_norm": 0.3080039277090962,
"learning_rate": 4.942211334331673e-06,
"loss": 0.3042,
"step": 4430
},
{
"epoch": 1.6642924086223054,
"grad_norm": 0.2933857618191826,
"learning_rate": 4.920405880522216e-06,
"loss": 0.3013,
"step": 4440
},
{
"epoch": 1.668041237113402,
"grad_norm": 0.2654508570280346,
"learning_rate": 4.898601940808054e-06,
"loss": 0.3061,
"step": 4450
},
{
"epoch": 1.6717900656044986,
"grad_norm": 0.2612577806304869,
"learning_rate": 4.876799929959056e-06,
"loss": 0.3051,
"step": 4460
},
{
"epoch": 1.675538894095595,
"grad_norm": 0.2941200079354783,
"learning_rate": 4.855000262708403e-06,
"loss": 0.3058,
"step": 4470
},
{
"epoch": 1.6792877225866918,
"grad_norm": 0.292477790531897,
"learning_rate": 4.833203353744685e-06,
"loss": 0.3003,
"step": 4480
},
{
"epoch": 1.6830365510777883,
"grad_norm": 0.3182850195508283,
"learning_rate": 4.811409617704031e-06,
"loss": 0.3055,
"step": 4490
},
{
"epoch": 1.6867853795688847,
"grad_norm": 0.295963046759259,
"learning_rate": 4.789619469162207e-06,
"loss": 0.3038,
"step": 4500
},
{
"epoch": 1.6905342080599812,
"grad_norm": 0.30453062664454494,
"learning_rate": 4.767833322626739e-06,
"loss": 0.3077,
"step": 4510
},
{
"epoch": 1.6942830365510777,
"grad_norm": 0.32102961685654363,
"learning_rate": 4.746051592529024e-06,
"loss": 0.3045,
"step": 4520
},
{
"epoch": 1.6980318650421742,
"grad_norm": 0.3860426253093947,
"learning_rate": 4.72427469321644e-06,
"loss": 0.304,
"step": 4530
},
{
"epoch": 1.7017806935332709,
"grad_norm": 0.31976534291795883,
"learning_rate": 4.702503038944477e-06,
"loss": 0.3065,
"step": 4540
},
{
"epoch": 1.7055295220243674,
"grad_norm": 0.30020103673628445,
"learning_rate": 4.680737043868847e-06,
"loss": 0.3051,
"step": 4550
},
{
"epoch": 1.709278350515464,
"grad_norm": 0.3397891462095839,
"learning_rate": 4.658977122037613e-06,
"loss": 0.304,
"step": 4560
},
{
"epoch": 1.7130271790065605,
"grad_norm": 0.27912887922833707,
"learning_rate": 4.637223687383301e-06,
"loss": 0.3037,
"step": 4570
},
{
"epoch": 1.716776007497657,
"grad_norm": 0.3191792078436299,
"learning_rate": 4.6154771537150395e-06,
"loss": 0.3045,
"step": 4580
},
{
"epoch": 1.7205248359887535,
"grad_norm": 0.33133767848366,
"learning_rate": 4.593737934710682e-06,
"loss": 0.3054,
"step": 4590
},
{
"epoch": 1.72427366447985,
"grad_norm": 0.3304449062895558,
"learning_rate": 4.572006443908931e-06,
"loss": 0.3074,
"step": 4600
},
{
"epoch": 1.7280224929709465,
"grad_norm": 0.31412432430168175,
"learning_rate": 4.550283094701486e-06,
"loss": 0.3014,
"step": 4610
},
{
"epoch": 1.7317713214620432,
"grad_norm": 0.3205845577239349,
"learning_rate": 4.528568300325163e-06,
"loss": 0.3048,
"step": 4620
},
{
"epoch": 1.7355201499531396,
"grad_norm": 0.2978584746193892,
"learning_rate": 4.506862473854051e-06,
"loss": 0.3073,
"step": 4630
},
{
"epoch": 1.7392689784442363,
"grad_norm": 0.31780026167650316,
"learning_rate": 4.485166028191635e-06,
"loss": 0.3042,
"step": 4640
},
{
"epoch": 1.7430178069353328,
"grad_norm": 0.29696017195351765,
"learning_rate": 4.46347937606296e-06,
"loss": 0.3033,
"step": 4650
},
{
"epoch": 1.7467666354264293,
"grad_norm": 0.3034994709800072,
"learning_rate": 4.441802930006769e-06,
"loss": 0.3019,
"step": 4660
},
{
"epoch": 1.7505154639175258,
"grad_norm": 0.27513107950398713,
"learning_rate": 4.420137102367655e-06,
"loss": 0.3013,
"step": 4670
},
{
"epoch": 1.7542642924086223,
"grad_norm": 0.27992210471747286,
"learning_rate": 4.3984823052882275e-06,
"loss": 0.3078,
"step": 4680
},
{
"epoch": 1.7580131208997187,
"grad_norm": 0.26668301924819904,
"learning_rate": 4.376838950701253e-06,
"loss": 0.3048,
"step": 4690
},
{
"epoch": 1.7617619493908152,
"grad_norm": 0.366231766614244,
"learning_rate": 4.355207450321843e-06,
"loss": 0.3062,
"step": 4700
},
{
"epoch": 1.765510777881912,
"grad_norm": 0.3017858158791989,
"learning_rate": 4.333588215639602e-06,
"loss": 0.3075,
"step": 4710
},
{
"epoch": 1.7692596063730084,
"grad_norm": 0.346236001649392,
"learning_rate": 4.3119816579108105e-06,
"loss": 0.3051,
"step": 4720
},
{
"epoch": 1.773008434864105,
"grad_norm": 0.2643095034172052,
"learning_rate": 4.290388188150602e-06,
"loss": 0.3076,
"step": 4730
},
{
"epoch": 1.7767572633552016,
"grad_norm": 0.29431607280457334,
"learning_rate": 4.268808217125135e-06,
"loss": 0.2987,
"step": 4740
},
{
"epoch": 1.780506091846298,
"grad_norm": 0.2876827526865779,
"learning_rate": 4.247242155343791e-06,
"loss": 0.3025,
"step": 4750
},
{
"epoch": 1.7842549203373945,
"grad_norm": 0.3017890755638588,
"learning_rate": 4.225690413051357e-06,
"loss": 0.3047,
"step": 4760
},
{
"epoch": 1.788003748828491,
"grad_norm": 0.2569896947025125,
"learning_rate": 4.204153400220226e-06,
"loss": 0.3018,
"step": 4770
},
{
"epoch": 1.7917525773195875,
"grad_norm": 0.3176997642563989,
"learning_rate": 4.1826315265426e-06,
"loss": 0.3029,
"step": 4780
},
{
"epoch": 1.7955014058106842,
"grad_norm": 0.26253963065958663,
"learning_rate": 4.161125201422685e-06,
"loss": 0.3056,
"step": 4790
},
{
"epoch": 1.7992502343017807,
"grad_norm": 0.2999248709623152,
"learning_rate": 4.139634833968918e-06,
"loss": 0.3034,
"step": 4800
},
{
"epoch": 1.8029990627928774,
"grad_norm": 0.2625190575233833,
"learning_rate": 4.118160832986178e-06,
"loss": 0.3066,
"step": 4810
},
{
"epoch": 1.8067478912839738,
"grad_norm": 0.28396845470603027,
"learning_rate": 4.096703606968007e-06,
"loss": 0.2975,
"step": 4820
},
{
"epoch": 1.8104967197750703,
"grad_norm": 0.28114557639064414,
"learning_rate": 4.075263564088841e-06,
"loss": 0.3057,
"step": 4830
},
{
"epoch": 1.8142455482661668,
"grad_norm": 0.28640490820378106,
"learning_rate": 4.05384111219625e-06,
"loss": 0.304,
"step": 4840
},
{
"epoch": 1.8179943767572633,
"grad_norm": 0.3121384034406098,
"learning_rate": 4.032436658803175e-06,
"loss": 0.3018,
"step": 4850
},
{
"epoch": 1.8217432052483598,
"grad_norm": 0.3535199694389629,
"learning_rate": 4.011050611080173e-06,
"loss": 0.3016,
"step": 4860
},
{
"epoch": 1.8254920337394565,
"grad_norm": 0.2763793407172489,
"learning_rate": 3.989683375847681e-06,
"loss": 0.305,
"step": 4870
},
{
"epoch": 1.829240862230553,
"grad_norm": 0.2650103173219085,
"learning_rate": 3.968335359568267e-06,
"loss": 0.3053,
"step": 4880
},
{
"epoch": 1.8329896907216496,
"grad_norm": 0.2963214885097785,
"learning_rate": 3.947006968338904e-06,
"loss": 0.3026,
"step": 4890
},
{
"epoch": 1.8367385192127461,
"grad_norm": 0.2628968628669574,
"learning_rate": 3.9256986078832445e-06,
"loss": 0.3013,
"step": 4900
},
{
"epoch": 1.8404873477038426,
"grad_norm": 0.2681562658141619,
"learning_rate": 3.9044106835439e-06,
"loss": 0.3043,
"step": 4910
},
{
"epoch": 1.844236176194939,
"grad_norm": 0.2778911339445839,
"learning_rate": 3.883143600274737e-06,
"loss": 0.3065,
"step": 4920
},
{
"epoch": 1.8479850046860355,
"grad_norm": 0.31108432801110153,
"learning_rate": 3.861897762633158e-06,
"loss": 0.3047,
"step": 4930
},
{
"epoch": 1.851733833177132,
"grad_norm": 0.3091817436636937,
"learning_rate": 3.840673574772427e-06,
"loss": 0.3016,
"step": 4940
},
{
"epoch": 1.8554826616682287,
"grad_norm": 0.2779952989085048,
"learning_rate": 3.819471440433963e-06,
"loss": 0.3059,
"step": 4950
},
{
"epoch": 1.8592314901593252,
"grad_norm": 0.30082155927958976,
"learning_rate": 3.798291762939672e-06,
"loss": 0.3037,
"step": 4960
},
{
"epoch": 1.862980318650422,
"grad_norm": 0.2714535755041495,
"learning_rate": 3.7771349451842706e-06,
"loss": 0.3024,
"step": 4970
},
{
"epoch": 1.8667291471415184,
"grad_norm": 0.2646413815703557,
"learning_rate": 3.7560013896276154e-06,
"loss": 0.3062,
"step": 4980
},
{
"epoch": 1.8704779756326149,
"grad_norm": 0.27989036154054725,
"learning_rate": 3.7348914982870598e-06,
"loss": 0.3014,
"step": 4990
},
{
"epoch": 1.8742268041237113,
"grad_norm": 0.249918156959456,
"learning_rate": 3.7138056727297966e-06,
"loss": 0.3066,
"step": 5000
},
{
"epoch": 1.8779756326148078,
"grad_norm": 0.2756181408313783,
"learning_rate": 3.6927443140652243e-06,
"loss": 0.2989,
"step": 5010
},
{
"epoch": 1.8817244611059043,
"grad_norm": 0.28120277663989135,
"learning_rate": 3.6717078229373094e-06,
"loss": 0.3058,
"step": 5020
},
{
"epoch": 1.8854732895970008,
"grad_norm": 0.30112216057324276,
"learning_rate": 3.6506965995169778e-06,
"loss": 0.302,
"step": 5030
},
{
"epoch": 1.8892221180880975,
"grad_norm": 0.25724383909329807,
"learning_rate": 3.6297110434944937e-06,
"loss": 0.3009,
"step": 5040
},
{
"epoch": 1.892970946579194,
"grad_norm": 0.25981738649741903,
"learning_rate": 3.6087515540718533e-06,
"loss": 0.3055,
"step": 5050
},
{
"epoch": 1.8967197750702907,
"grad_norm": 0.25652902199859806,
"learning_rate": 3.587818529955203e-06,
"loss": 0.3025,
"step": 5060
},
{
"epoch": 1.9004686035613871,
"grad_norm": 0.2673452955037198,
"learning_rate": 3.5669123693472386e-06,
"loss": 0.3048,
"step": 5070
},
{
"epoch": 1.9042174320524836,
"grad_norm": 0.26507376497776003,
"learning_rate": 3.5460334699396486e-06,
"loss": 0.299,
"step": 5080
},
{
"epoch": 1.90796626054358,
"grad_norm": 0.28370264506275916,
"learning_rate": 3.525182228905532e-06,
"loss": 0.3026,
"step": 5090
},
{
"epoch": 1.9117150890346766,
"grad_norm": 0.2712966167311274,
"learning_rate": 3.5043590428918543e-06,
"loss": 0.3016,
"step": 5100
},
{
"epoch": 1.915463917525773,
"grad_norm": 0.2553393508783198,
"learning_rate": 3.4835643080119035e-06,
"loss": 0.3007,
"step": 5110
},
{
"epoch": 1.9192127460168698,
"grad_norm": 0.27034735418826117,
"learning_rate": 3.4627984198377397e-06,
"loss": 0.3007,
"step": 5120
},
{
"epoch": 1.9229615745079662,
"grad_norm": 0.30646341895729456,
"learning_rate": 3.4420617733926897e-06,
"loss": 0.3,
"step": 5130
},
{
"epoch": 1.926710402999063,
"grad_norm": 0.2512695351382934,
"learning_rate": 3.421354763143817e-06,
"loss": 0.306,
"step": 5140
},
{
"epoch": 1.9304592314901594,
"grad_norm": 0.295107786520028,
"learning_rate": 3.40067778299443e-06,
"loss": 0.3018,
"step": 5150
},
{
"epoch": 1.934208059981256,
"grad_norm": 0.26954378841421245,
"learning_rate": 3.380031226276579e-06,
"loss": 0.2959,
"step": 5160
},
{
"epoch": 1.9379568884723524,
"grad_norm": 0.2817857686250554,
"learning_rate": 3.3594154857435824e-06,
"loss": 0.2995,
"step": 5170
},
{
"epoch": 1.9417057169634488,
"grad_norm": 0.26193810687692165,
"learning_rate": 3.33883095356255e-06,
"loss": 0.3015,
"step": 5180
},
{
"epoch": 1.9454545454545453,
"grad_norm": 0.24259516878097934,
"learning_rate": 3.318278021306921e-06,
"loss": 0.3054,
"step": 5190
},
{
"epoch": 1.949203373945642,
"grad_norm": 0.25065102402789546,
"learning_rate": 3.297757079949024e-06,
"loss": 0.3009,
"step": 5200
},
{
"epoch": 1.9529522024367385,
"grad_norm": 0.2983273686405371,
"learning_rate": 3.27726851985263e-06,
"loss": 0.3014,
"step": 5210
},
{
"epoch": 1.9567010309278352,
"grad_norm": 0.2822433040270773,
"learning_rate": 3.2568127307655332e-06,
"loss": 0.3047,
"step": 5220
},
{
"epoch": 1.9604498594189317,
"grad_norm": 0.2921957259400507,
"learning_rate": 3.236390101812137e-06,
"loss": 0.3045,
"step": 5230
},
{
"epoch": 1.9641986879100282,
"grad_norm": 0.3033695868527522,
"learning_rate": 3.2160010214860415e-06,
"loss": 0.3,
"step": 5240
},
{
"epoch": 1.9679475164011246,
"grad_norm": 0.3020931816317373,
"learning_rate": 3.1956458776426704e-06,
"loss": 0.2982,
"step": 5250
},
{
"epoch": 1.9716963448922211,
"grad_norm": 0.3154539666343415,
"learning_rate": 3.1753250574918755e-06,
"loss": 0.3045,
"step": 5260
},
{
"epoch": 1.9754451733833176,
"grad_norm": 0.3079829291596006,
"learning_rate": 3.1550389475905884e-06,
"loss": 0.3033,
"step": 5270
},
{
"epoch": 1.9791940018744143,
"grad_norm": 0.2795592575657611,
"learning_rate": 3.1347879338354474e-06,
"loss": 0.3013,
"step": 5280
},
{
"epoch": 1.9829428303655108,
"grad_norm": 0.28905850248906373,
"learning_rate": 3.114572401455476e-06,
"loss": 0.3018,
"step": 5290
},
{
"epoch": 1.9866916588566073,
"grad_norm": 0.2917801204906715,
"learning_rate": 3.094392735004742e-06,
"loss": 0.3077,
"step": 5300
},
{
"epoch": 1.990440487347704,
"grad_norm": 0.2869880836395518,
"learning_rate": 3.074249318355046e-06,
"loss": 0.305,
"step": 5310
},
{
"epoch": 1.9941893158388004,
"grad_norm": 0.266193620073341,
"learning_rate": 3.0541425346886234e-06,
"loss": 0.3042,
"step": 5320
},
{
"epoch": 1.997938144329897,
"grad_norm": 0.24366671055144762,
"learning_rate": 3.0340727664908437e-06,
"loss": 0.3031,
"step": 5330
},
{
"epoch": 2.0014995313964388,
"grad_norm": 0.27572825512606464,
"learning_rate": 3.0140403955429498e-06,
"loss": 0.2986,
"step": 5340
},
{
"epoch": 2.0052483598875352,
"grad_norm": 0.27324353099912724,
"learning_rate": 2.9940458029147833e-06,
"loss": 0.2982,
"step": 5350
},
{
"epoch": 2.0089971883786317,
"grad_norm": 0.2737011310465298,
"learning_rate": 2.974089368957542e-06,
"loss": 0.2987,
"step": 5360
},
{
"epoch": 2.012746016869728,
"grad_norm": 0.24331428459490873,
"learning_rate": 2.954171473296543e-06,
"loss": 0.299,
"step": 5370
},
{
"epoch": 2.0164948453608247,
"grad_norm": 0.26924383948866965,
"learning_rate": 2.934292494823997e-06,
"loss": 0.2999,
"step": 5380
},
{
"epoch": 2.020243673851921,
"grad_norm": 0.26880279888098063,
"learning_rate": 2.9144528116918114e-06,
"loss": 0.2982,
"step": 5390
},
{
"epoch": 2.0239925023430176,
"grad_norm": 0.26881181242331237,
"learning_rate": 2.894652801304382e-06,
"loss": 0.2985,
"step": 5400
},
{
"epoch": 2.027741330834114,
"grad_norm": 0.2759806088124146,
"learning_rate": 2.8748928403114274e-06,
"loss": 0.2983,
"step": 5410
},
{
"epoch": 2.031490159325211,
"grad_norm": 0.25861803616143986,
"learning_rate": 2.855173304600817e-06,
"loss": 0.299,
"step": 5420
},
{
"epoch": 2.0352389878163075,
"grad_norm": 0.2522194380092259,
"learning_rate": 2.835494569291423e-06,
"loss": 0.3015,
"step": 5430
},
{
"epoch": 2.038987816307404,
"grad_norm": 0.28629406566171184,
"learning_rate": 2.8158570087259825e-06,
"loss": 0.2968,
"step": 5440
},
{
"epoch": 2.0427366447985005,
"grad_norm": 0.25257171184095567,
"learning_rate": 2.796260996463975e-06,
"loss": 0.2972,
"step": 5450
},
{
"epoch": 2.046485473289597,
"grad_norm": 0.292661843719565,
"learning_rate": 2.7767069052745267e-06,
"loss": 0.2982,
"step": 5460
},
{
"epoch": 2.0502343017806934,
"grad_norm": 0.29909777284030853,
"learning_rate": 2.7571951071293015e-06,
"loss": 0.2961,
"step": 5470
},
{
"epoch": 2.05398313027179,
"grad_norm": 0.27936272946203394,
"learning_rate": 2.737725973195442e-06,
"loss": 0.3,
"step": 5480
},
{
"epoch": 2.0577319587628864,
"grad_norm": 0.2722976980702845,
"learning_rate": 2.718299873828505e-06,
"loss": 0.2995,
"step": 5490
},
{
"epoch": 2.0614807872539833,
"grad_norm": 0.2556623687563956,
"learning_rate": 2.698917178565403e-06,
"loss": 0.3004,
"step": 5500
},
{
"epoch": 2.06522961574508,
"grad_norm": 0.24143453577190127,
"learning_rate": 2.6795782561173946e-06,
"loss": 0.3012,
"step": 5510
},
{
"epoch": 2.0689784442361763,
"grad_norm": 0.2554088706670706,
"learning_rate": 2.6602834743630567e-06,
"loss": 0.3007,
"step": 5520
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.24613264972717522,
"learning_rate": 2.6410332003412953e-06,
"loss": 0.2997,
"step": 5530
},
{
"epoch": 2.0764761012183692,
"grad_norm": 0.2851526682441657,
"learning_rate": 2.6218278002443513e-06,
"loss": 0.2985,
"step": 5540
},
{
"epoch": 2.0802249297094657,
"grad_norm": 0.2672191793824331,
"learning_rate": 2.602667639410849e-06,
"loss": 0.2998,
"step": 5550
},
{
"epoch": 2.083973758200562,
"grad_norm": 0.26199548768893466,
"learning_rate": 2.5835530823188393e-06,
"loss": 0.2977,
"step": 5560
},
{
"epoch": 2.0877225866916587,
"grad_norm": 0.26975447619905446,
"learning_rate": 2.5644844925788605e-06,
"loss": 0.2979,
"step": 5570
},
{
"epoch": 2.0914714151827556,
"grad_norm": 0.25886114170392355,
"learning_rate": 2.5454622329270354e-06,
"loss": 0.301,
"step": 5580
},
{
"epoch": 2.095220243673852,
"grad_norm": 0.2739516733416926,
"learning_rate": 2.5264866652181572e-06,
"loss": 0.2986,
"step": 5590
},
{
"epoch": 2.0989690721649485,
"grad_norm": 0.26592574679314473,
"learning_rate": 2.5075581504188162e-06,
"loss": 0.297,
"step": 5600
},
{
"epoch": 2.102717900656045,
"grad_norm": 0.2604176424953529,
"learning_rate": 2.4886770486005283e-06,
"loss": 0.3002,
"step": 5610
},
{
"epoch": 2.1064667291471415,
"grad_norm": 0.2750968013788369,
"learning_rate": 2.469843718932883e-06,
"loss": 0.2955,
"step": 5620
},
{
"epoch": 2.110215557638238,
"grad_norm": 0.2526155437469898,
"learning_rate": 2.45105851967672e-06,
"loss": 0.3008,
"step": 5630
},
{
"epoch": 2.1139643861293345,
"grad_norm": 0.2410587895663488,
"learning_rate": 2.432321808177304e-06,
"loss": 0.3005,
"step": 5640
},
{
"epoch": 2.117713214620431,
"grad_norm": 0.2895237677114124,
"learning_rate": 2.413633940857535e-06,
"loss": 0.2981,
"step": 5650
},
{
"epoch": 2.121462043111528,
"grad_norm": 0.24128988947398597,
"learning_rate": 2.394995273211159e-06,
"loss": 0.2935,
"step": 5660
},
{
"epoch": 2.1252108716026243,
"grad_norm": 0.2507583674367069,
"learning_rate": 2.376406159796018e-06,
"loss": 0.2964,
"step": 5670
},
{
"epoch": 2.128959700093721,
"grad_norm": 0.2412689179502715,
"learning_rate": 2.357866954227297e-06,
"loss": 0.2981,
"step": 5680
},
{
"epoch": 2.1327085285848173,
"grad_norm": 0.2522987442122012,
"learning_rate": 2.3393780091707925e-06,
"loss": 0.3003,
"step": 5690
},
{
"epoch": 2.1364573570759138,
"grad_norm": 0.2933683784691293,
"learning_rate": 2.32093967633622e-06,
"loss": 0.2976,
"step": 5700
},
{
"epoch": 2.1402061855670103,
"grad_norm": 0.24892001274036304,
"learning_rate": 2.3025523064705054e-06,
"loss": 0.2938,
"step": 5710
},
{
"epoch": 2.1439550140581067,
"grad_norm": 0.2486503075797692,
"learning_rate": 2.284216249351125e-06,
"loss": 0.3002,
"step": 5720
},
{
"epoch": 2.147703842549203,
"grad_norm": 0.2408285712986302,
"learning_rate": 2.265931853779449e-06,
"loss": 0.298,
"step": 5730
},
{
"epoch": 2.1514526710402997,
"grad_norm": 0.2539542237070198,
"learning_rate": 2.2476994675741032e-06,
"loss": 0.2995,
"step": 5740
},
{
"epoch": 2.1552014995313966,
"grad_norm": 0.25802690903176057,
"learning_rate": 2.2295194375643574e-06,
"loss": 0.2986,
"step": 5750
},
{
"epoch": 2.158950328022493,
"grad_norm": 0.2434560170701692,
"learning_rate": 2.21139210958352e-06,
"loss": 0.2974,
"step": 5760
},
{
"epoch": 2.1626991565135896,
"grad_norm": 0.25695023854646953,
"learning_rate": 2.1933178284623696e-06,
"loss": 0.2967,
"step": 5770
},
{
"epoch": 2.166447985004686,
"grad_norm": 0.23288003046372271,
"learning_rate": 2.175296938022586e-06,
"loss": 0.3007,
"step": 5780
},
{
"epoch": 2.1701968134957825,
"grad_norm": 0.23121575998389227,
"learning_rate": 2.1573297810702178e-06,
"loss": 0.2961,
"step": 5790
},
{
"epoch": 2.173945641986879,
"grad_norm": 0.24000439116988645,
"learning_rate": 2.139416699389153e-06,
"loss": 0.2958,
"step": 5800
},
{
"epoch": 2.1776944704779755,
"grad_norm": 0.2871636238090685,
"learning_rate": 2.121558033734626e-06,
"loss": 0.2992,
"step": 5810
},
{
"epoch": 2.181443298969072,
"grad_norm": 0.26976350838721214,
"learning_rate": 2.103754123826729e-06,
"loss": 0.2952,
"step": 5820
},
{
"epoch": 2.185192127460169,
"grad_norm": 0.2521119119225825,
"learning_rate": 2.0860053083439523e-06,
"loss": 0.2992,
"step": 5830
},
{
"epoch": 2.1889409559512654,
"grad_norm": 0.23812422246398923,
"learning_rate": 2.0683119249167444e-06,
"loss": 0.2959,
"step": 5840
},
{
"epoch": 2.192689784442362,
"grad_norm": 0.24908411598662353,
"learning_rate": 2.0506743101210786e-06,
"loss": 0.2992,
"step": 5850
},
{
"epoch": 2.1964386129334583,
"grad_norm": 0.24276676952619408,
"learning_rate": 2.033092799472065e-06,
"loss": 0.2951,
"step": 5860
},
{
"epoch": 2.200187441424555,
"grad_norm": 0.26493805047445046,
"learning_rate": 2.0155677274175607e-06,
"loss": 0.2977,
"step": 5870
},
{
"epoch": 2.2039362699156513,
"grad_norm": 0.2557649875393899,
"learning_rate": 1.9980994273318033e-06,
"loss": 0.301,
"step": 5880
},
{
"epoch": 2.2076850984067478,
"grad_norm": 0.25974462261893655,
"learning_rate": 1.9806882315090796e-06,
"loss": 0.2986,
"step": 5890
},
{
"epoch": 2.2114339268978442,
"grad_norm": 0.2788427522001163,
"learning_rate": 1.963334471157395e-06,
"loss": 0.3006,
"step": 5900
},
{
"epoch": 2.215182755388941,
"grad_norm": 0.2639877192794817,
"learning_rate": 1.946038476392179e-06,
"loss": 0.2939,
"step": 5910
},
{
"epoch": 2.2189315838800376,
"grad_norm": 0.2552096590711124,
"learning_rate": 1.9288005762300034e-06,
"loss": 0.2977,
"step": 5920
},
{
"epoch": 2.222680412371134,
"grad_norm": 0.2618656152750493,
"learning_rate": 1.9116210985823234e-06,
"loss": 0.2977,
"step": 5930
},
{
"epoch": 2.2264292408622306,
"grad_norm": 0.23852460637945597,
"learning_rate": 1.894500370249242e-06,
"loss": 0.2989,
"step": 5940
},
{
"epoch": 2.230178069353327,
"grad_norm": 0.24929690435833737,
"learning_rate": 1.8774387169132858e-06,
"loss": 0.2999,
"step": 5950
},
{
"epoch": 2.2339268978444236,
"grad_norm": 0.23828564971308522,
"learning_rate": 1.8604364631332216e-06,
"loss": 0.297,
"step": 5960
},
{
"epoch": 2.23767572633552,
"grad_norm": 0.24494166996235048,
"learning_rate": 1.8434939323378715e-06,
"loss": 0.2973,
"step": 5970
},
{
"epoch": 2.2414245548266165,
"grad_norm": 0.24086882859135658,
"learning_rate": 1.8266114468199692e-06,
"loss": 0.295,
"step": 5980
},
{
"epoch": 2.245173383317713,
"grad_norm": 0.2544357426841843,
"learning_rate": 1.80978932773002e-06,
"loss": 0.2973,
"step": 5990
},
{
"epoch": 2.24892221180881,
"grad_norm": 0.23643151878409177,
"learning_rate": 1.7930278950701997e-06,
"loss": 0.3002,
"step": 6000
},
{
"epoch": 2.2526710402999064,
"grad_norm": 0.23975129409481882,
"learning_rate": 1.7763274676882647e-06,
"loss": 0.2979,
"step": 6010
},
{
"epoch": 2.256419868791003,
"grad_norm": 0.24291773824786236,
"learning_rate": 1.7596883632714852e-06,
"loss": 0.2932,
"step": 6020
},
{
"epoch": 2.2601686972820993,
"grad_norm": 0.25153396655313853,
"learning_rate": 1.7431108983406036e-06,
"loss": 0.298,
"step": 6030
},
{
"epoch": 2.263917525773196,
"grad_norm": 0.23197295943529667,
"learning_rate": 1.7265953882438086e-06,
"loss": 0.2988,
"step": 6040
},
{
"epoch": 2.2676663542642923,
"grad_norm": 0.24970808760082508,
"learning_rate": 1.7101421471507457e-06,
"loss": 0.2939,
"step": 6050
},
{
"epoch": 2.271415182755389,
"grad_norm": 0.228549562748791,
"learning_rate": 1.6937514880465355e-06,
"loss": 0.2989,
"step": 6060
},
{
"epoch": 2.2751640112464857,
"grad_norm": 0.24315653299360235,
"learning_rate": 1.6774237227258144e-06,
"loss": 0.2973,
"step": 6070
},
{
"epoch": 2.278912839737582,
"grad_norm": 0.24556558850550259,
"learning_rate": 1.6611591617868162e-06,
"loss": 0.2959,
"step": 6080
},
{
"epoch": 2.2826616682286787,
"grad_norm": 0.2148170791770767,
"learning_rate": 1.6449581146254496e-06,
"loss": 0.2991,
"step": 6090
},
{
"epoch": 2.286410496719775,
"grad_norm": 0.24023542138163662,
"learning_rate": 1.628820889429426e-06,
"loss": 0.2949,
"step": 6100
},
{
"epoch": 2.2901593252108716,
"grad_norm": 0.23817289823295282,
"learning_rate": 1.6127477931723857e-06,
"loss": 0.2959,
"step": 6110
},
{
"epoch": 2.293908153701968,
"grad_norm": 0.2431772216034719,
"learning_rate": 1.596739131608065e-06,
"loss": 0.2973,
"step": 6120
},
{
"epoch": 2.2976569821930646,
"grad_norm": 0.2382144152110202,
"learning_rate": 1.5807952092644795e-06,
"loss": 0.2994,
"step": 6130
},
{
"epoch": 2.301405810684161,
"grad_norm": 0.24054816884904162,
"learning_rate": 1.564916329438128e-06,
"loss": 0.2936,
"step": 6140
},
{
"epoch": 2.3051546391752575,
"grad_norm": 0.26274953712674814,
"learning_rate": 1.549102794188228e-06,
"loss": 0.2973,
"step": 6150
},
{
"epoch": 2.3089034676663545,
"grad_norm": 0.2806875425573502,
"learning_rate": 1.5333549043309592e-06,
"loss": 0.2977,
"step": 6160
},
{
"epoch": 2.312652296157451,
"grad_norm": 0.24329932907358484,
"learning_rate": 1.5176729594337575e-06,
"loss": 0.2958,
"step": 6170
},
{
"epoch": 2.3164011246485474,
"grad_norm": 0.24003607606344493,
"learning_rate": 1.5020572578095999e-06,
"loss": 0.2986,
"step": 6180
},
{
"epoch": 2.320149953139644,
"grad_norm": 0.26644077871725813,
"learning_rate": 1.4865080965113415e-06,
"loss": 0.2998,
"step": 6190
},
{
"epoch": 2.3238987816307404,
"grad_norm": 0.24781415510678054,
"learning_rate": 1.4710257713260623e-06,
"loss": 0.3022,
"step": 6200
},
{
"epoch": 2.327647610121837,
"grad_norm": 0.24420941065044707,
"learning_rate": 1.4556105767694317e-06,
"loss": 0.2981,
"step": 6210
},
{
"epoch": 2.3313964386129333,
"grad_norm": 0.26073417382172037,
"learning_rate": 1.44026280608012e-06,
"loss": 0.2984,
"step": 6220
},
{
"epoch": 2.33514526710403,
"grad_norm": 0.25521419311104987,
"learning_rate": 1.42498275121421e-06,
"loss": 0.2954,
"step": 6230
},
{
"epoch": 2.3388940955951263,
"grad_norm": 0.23848221017188856,
"learning_rate": 1.4097707028396496e-06,
"loss": 0.3005,
"step": 6240
},
{
"epoch": 2.342642924086223,
"grad_norm": 0.26233800895757997,
"learning_rate": 1.394626950330713e-06,
"loss": 0.2989,
"step": 6250
},
{
"epoch": 2.3463917525773197,
"grad_norm": 0.2548396023100127,
"learning_rate": 1.3795517817625088e-06,
"loss": 0.2978,
"step": 6260
},
{
"epoch": 2.350140581068416,
"grad_norm": 0.22752855679283976,
"learning_rate": 1.3645454839054921e-06,
"loss": 0.2967,
"step": 6270
},
{
"epoch": 2.3538894095595126,
"grad_norm": 0.2664643076296192,
"learning_rate": 1.3496083422200085e-06,
"loss": 0.2978,
"step": 6280
},
{
"epoch": 2.357638238050609,
"grad_norm": 0.2543566847419294,
"learning_rate": 1.3347406408508695e-06,
"loss": 0.2953,
"step": 6290
},
{
"epoch": 2.3613870665417056,
"grad_norm": 0.263835168702037,
"learning_rate": 1.3199426626219407e-06,
"loss": 0.2989,
"step": 6300
},
{
"epoch": 2.365135895032802,
"grad_norm": 0.25548238955558084,
"learning_rate": 1.3052146890307683e-06,
"loss": 0.2981,
"step": 6310
},
{
"epoch": 2.368884723523899,
"grad_norm": 0.23791649446797647,
"learning_rate": 1.2905570002432188e-06,
"loss": 0.2963,
"step": 6320
},
{
"epoch": 2.3726335520149955,
"grad_norm": 0.25494794302918333,
"learning_rate": 1.2759698750881533e-06,
"loss": 0.2973,
"step": 6330
},
{
"epoch": 2.376382380506092,
"grad_norm": 0.23772018287469734,
"learning_rate": 1.261453591052123e-06,
"loss": 0.2975,
"step": 6340
},
{
"epoch": 2.3801312089971884,
"grad_norm": 0.22552210252604188,
"learning_rate": 1.2470084242740848e-06,
"loss": 0.2971,
"step": 6350
},
{
"epoch": 2.383880037488285,
"grad_norm": 0.2497054243868162,
"learning_rate": 1.2326346495401587e-06,
"loss": 0.2989,
"step": 6360
},
{
"epoch": 2.3876288659793814,
"grad_norm": 0.23428555088684583,
"learning_rate": 1.2183325402783892e-06,
"loss": 0.2974,
"step": 6370
},
{
"epoch": 2.391377694470478,
"grad_norm": 0.24172427166914104,
"learning_rate": 1.2041023685535557e-06,
"loss": 0.2959,
"step": 6380
},
{
"epoch": 2.3951265229615744,
"grad_norm": 0.24675551010483163,
"learning_rate": 1.1899444050619891e-06,
"loss": 0.2967,
"step": 6390
},
{
"epoch": 2.398875351452671,
"grad_norm": 0.2375966890453732,
"learning_rate": 1.1758589191264214e-06,
"loss": 0.2994,
"step": 6400
},
{
"epoch": 2.4026241799437678,
"grad_norm": 0.23458923294261602,
"learning_rate": 1.1618461786908698e-06,
"loss": 0.2989,
"step": 6410
},
{
"epoch": 2.4063730084348642,
"grad_norm": 0.22080305330098507,
"learning_rate": 1.1479064503155335e-06,
"loss": 0.2957,
"step": 6420
},
{
"epoch": 2.4101218369259607,
"grad_norm": 0.24659042545889448,
"learning_rate": 1.1340399991717266e-06,
"loss": 0.2928,
"step": 6430
},
{
"epoch": 2.413870665417057,
"grad_norm": 0.2393208054211518,
"learning_rate": 1.1202470890368283e-06,
"loss": 0.2952,
"step": 6440
},
{
"epoch": 2.4176194939081537,
"grad_norm": 0.2452372681591504,
"learning_rate": 1.1065279822892732e-06,
"loss": 0.2989,
"step": 6450
},
{
"epoch": 2.42136832239925,
"grad_norm": 0.24729288324568552,
"learning_rate": 1.0928829399035563e-06,
"loss": 0.2954,
"step": 6460
},
{
"epoch": 2.4251171508903466,
"grad_norm": 0.24671451339099365,
"learning_rate": 1.0793122214452617e-06,
"loss": 0.2957,
"step": 6470
},
{
"epoch": 2.428865979381443,
"grad_norm": 0.24427659099939275,
"learning_rate": 1.0658160850661408e-06,
"loss": 0.3004,
"step": 6480
},
{
"epoch": 2.4326148078725396,
"grad_norm": 0.21374771872871845,
"learning_rate": 1.0523947874991842e-06,
"loss": 0.2984,
"step": 6490
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.22329887893290565,
"learning_rate": 1.0390485840537506e-06,
"loss": 0.297,
"step": 6500
},
{
"epoch": 2.440112464854733,
"grad_norm": 0.22533705247175104,
"learning_rate": 1.0257777286107045e-06,
"loss": 0.2943,
"step": 6510
},
{
"epoch": 2.4438612933458295,
"grad_norm": 0.2423245266897566,
"learning_rate": 1.0125824736175877e-06,
"loss": 0.3007,
"step": 6520
},
{
"epoch": 2.447610121836926,
"grad_norm": 0.22780962278138578,
"learning_rate": 9.994630700838175e-07,
"loss": 0.2967,
"step": 6530
},
{
"epoch": 2.4513589503280224,
"grad_norm": 0.22430754799850255,
"learning_rate": 9.864197675759096e-07,
"loss": 0.2966,
"step": 6540
},
{
"epoch": 2.455107778819119,
"grad_norm": 0.22784149833015035,
"learning_rate": 9.734528142127353e-07,
"loss": 0.2983,
"step": 6550
},
{
"epoch": 2.4588566073102154,
"grad_norm": 0.22292639589506671,
"learning_rate": 9.605624566607951e-07,
"loss": 0.2951,
"step": 6560
},
{
"epoch": 2.4626054358013123,
"grad_norm": 0.2236123311897104,
"learning_rate": 9.477489401295331e-07,
"loss": 0.2965,
"step": 6570
},
{
"epoch": 2.466354264292409,
"grad_norm": 0.22000628796913552,
"learning_rate": 9.350125083666711e-07,
"loss": 0.2976,
"step": 6580
},
{
"epoch": 2.4701030927835053,
"grad_norm": 0.24493160622717997,
"learning_rate": 9.223534036535636e-07,
"loss": 0.297,
"step": 6590
},
{
"epoch": 2.4738519212746017,
"grad_norm": 0.2455063807901648,
"learning_rate": 9.09771866800604e-07,
"loss": 0.2995,
"step": 6600
},
{
"epoch": 2.477600749765698,
"grad_norm": 0.24548065358066581,
"learning_rate": 8.972681371426273e-07,
"loss": 0.2953,
"step": 6610
},
{
"epoch": 2.4813495782567947,
"grad_norm": 0.2521783316460835,
"learning_rate": 8.848424525343713e-07,
"loss": 0.2946,
"step": 6620
},
{
"epoch": 2.485098406747891,
"grad_norm": 0.2553417419798155,
"learning_rate": 8.724950493459439e-07,
"loss": 0.2963,
"step": 6630
},
{
"epoch": 2.4888472352389877,
"grad_norm": 0.24486438815130893,
"learning_rate": 8.60226162458328e-07,
"loss": 0.2964,
"step": 6640
},
{
"epoch": 2.492596063730084,
"grad_norm": 0.22843671738308705,
"learning_rate": 8.48036025258917e-07,
"loss": 0.2972,
"step": 6650
},
{
"epoch": 2.496344892221181,
"grad_norm": 0.22729569350727913,
"learning_rate": 8.359248696370676e-07,
"loss": 0.298,
"step": 6660
},
{
"epoch": 2.5000937207122775,
"grad_norm": 0.21994248905891314,
"learning_rate": 8.238929259796991e-07,
"loss": 0.2988,
"step": 6670
},
{
"epoch": 2.503842549203374,
"grad_norm": 0.21862264017292513,
"learning_rate": 8.119404231668987e-07,
"loss": 0.2961,
"step": 6680
},
{
"epoch": 2.5075913776944705,
"grad_norm": 0.22932451090242004,
"learning_rate": 8.000675885675812e-07,
"loss": 0.2957,
"step": 6690
},
{
"epoch": 2.511340206185567,
"grad_norm": 0.2346294431862581,
"learning_rate": 7.882746480351499e-07,
"loss": 0.2989,
"step": 6700
},
{
"epoch": 2.5150890346766634,
"grad_norm": 0.2507513804674915,
"learning_rate": 7.765618259032115e-07,
"loss": 0.2967,
"step": 6710
},
{
"epoch": 2.51883786316776,
"grad_norm": 0.2301520077398978,
"learning_rate": 7.649293449813022e-07,
"loss": 0.2989,
"step": 6720
},
{
"epoch": 2.522586691658857,
"grad_norm": 0.22017138623299468,
"learning_rate": 7.533774265506528e-07,
"loss": 0.3008,
"step": 6730
},
{
"epoch": 2.526335520149953,
"grad_norm": 0.23466327607015292,
"learning_rate": 7.419062903599766e-07,
"loss": 0.2973,
"step": 6740
},
{
"epoch": 2.53008434864105,
"grad_norm": 0.23339557992179352,
"learning_rate": 7.305161546212891e-07,
"loss": 0.2998,
"step": 6750
},
{
"epoch": 2.5338331771321463,
"grad_norm": 0.228073145422222,
"learning_rate": 7.192072360057601e-07,
"loss": 0.2976,
"step": 6760
},
{
"epoch": 2.5375820056232428,
"grad_norm": 0.22403329120047444,
"learning_rate": 7.079797496395913e-07,
"loss": 0.2991,
"step": 6770
},
{
"epoch": 2.5413308341143392,
"grad_norm": 0.21653330730415413,
"learning_rate": 6.968339090999188e-07,
"loss": 0.301,
"step": 6780
},
{
"epoch": 2.5450796626054357,
"grad_norm": 0.22798842242544992,
"learning_rate": 6.857699264107592e-07,
"loss": 0.2961,
"step": 6790
},
{
"epoch": 2.548828491096532,
"grad_norm": 0.22152088362625078,
"learning_rate": 6.747880120389671e-07,
"loss": 0.2969,
"step": 6800
},
{
"epoch": 2.5525773195876287,
"grad_norm": 0.24941395004953096,
"learning_rate": 6.638883748902386e-07,
"loss": 0.2942,
"step": 6810
},
{
"epoch": 2.5563261480787256,
"grad_norm": 0.2352854991109166,
"learning_rate": 6.530712223051345e-07,
"loss": 0.2993,
"step": 6820
},
{
"epoch": 2.560074976569822,
"grad_norm": 0.21713333345854474,
"learning_rate": 6.423367600551356e-07,
"loss": 0.2965,
"step": 6830
},
{
"epoch": 2.5638238050609186,
"grad_norm": 0.21808108904670195,
"learning_rate": 6.316851923387302e-07,
"loss": 0.295,
"step": 6840
},
{
"epoch": 2.567572633552015,
"grad_norm": 0.22265553162824864,
"learning_rate": 6.211167217775255e-07,
"loss": 0.2999,
"step": 6850
},
{
"epoch": 2.5713214620431115,
"grad_norm": 0.2122649455456037,
"learning_rate": 6.106315494123999e-07,
"loss": 0.2972,
"step": 6860
},
{
"epoch": 2.575070290534208,
"grad_norm": 0.24269035288607615,
"learning_rate": 6.00229874699671e-07,
"loss": 0.2977,
"step": 6870
},
{
"epoch": 2.5788191190253045,
"grad_norm": 0.2197453016555571,
"learning_rate": 5.899118955073108e-07,
"loss": 0.2946,
"step": 6880
},
{
"epoch": 2.5825679475164014,
"grad_norm": 0.2161395378073621,
"learning_rate": 5.796778081111693e-07,
"loss": 0.2977,
"step": 6890
},
{
"epoch": 2.5863167760074974,
"grad_norm": 0.21287035374436375,
"learning_rate": 5.695278071912541e-07,
"loss": 0.296,
"step": 6900
},
{
"epoch": 2.5900656044985944,
"grad_norm": 0.2318471089563035,
"learning_rate": 5.59462085828017e-07,
"loss": 0.297,
"step": 6910
},
{
"epoch": 2.593814432989691,
"grad_norm": 0.22185763220188276,
"learning_rate": 5.494808354986869e-07,
"loss": 0.2966,
"step": 6920
},
{
"epoch": 2.5975632614807873,
"grad_norm": 0.2258427132260489,
"learning_rate": 5.395842460736251e-07,
"loss": 0.2977,
"step": 6930
},
{
"epoch": 2.601312089971884,
"grad_norm": 0.2214557018015068,
"learning_rate": 5.297725058127101e-07,
"loss": 0.2984,
"step": 6940
},
{
"epoch": 2.6050609184629803,
"grad_norm": 0.22757821840990952,
"learning_rate": 5.200458013617648e-07,
"loss": 0.2967,
"step": 6950
},
{
"epoch": 2.6088097469540767,
"grad_norm": 0.21756167779737182,
"learning_rate": 5.104043177490003e-07,
"loss": 0.2962,
"step": 6960
},
{
"epoch": 2.6125585754451732,
"grad_norm": 0.22089636199337112,
"learning_rate": 5.008482383814934e-07,
"loss": 0.2954,
"step": 6970
},
{
"epoch": 2.61630740393627,
"grad_norm": 0.23596951276418235,
"learning_rate": 4.913777450417051e-07,
"loss": 0.2958,
"step": 6980
},
{
"epoch": 2.620056232427366,
"grad_norm": 0.23285613087396778,
"learning_rate": 4.819930178840171e-07,
"loss": 0.3001,
"step": 6990
},
{
"epoch": 2.623805060918463,
"grad_norm": 0.22413547230254402,
"learning_rate": 4.726942354313063e-07,
"loss": 0.2977,
"step": 7000
},
{
"epoch": 2.6275538894095596,
"grad_norm": 0.21404799569429234,
"learning_rate": 4.634815745715504e-07,
"loss": 0.2977,
"step": 7010
},
{
"epoch": 2.631302717900656,
"grad_norm": 0.2220467447552054,
"learning_rate": 4.5435521055446077e-07,
"loss": 0.3002,
"step": 7020
},
{
"epoch": 2.6350515463917525,
"grad_norm": 0.22203820025276308,
"learning_rate": 4.4531531698815025e-07,
"loss": 0.2985,
"step": 7030
},
{
"epoch": 2.638800374882849,
"grad_norm": 0.21412842926369127,
"learning_rate": 4.3636206583582755e-07,
"loss": 0.2941,
"step": 7040
},
{
"epoch": 2.6425492033739455,
"grad_norm": 0.21845241517697617,
"learning_rate": 4.2749562741253194e-07,
"loss": 0.2983,
"step": 7050
},
{
"epoch": 2.646298031865042,
"grad_norm": 0.23326009884971552,
"learning_rate": 4.1871617038188704e-07,
"loss": 0.2967,
"step": 7060
},
{
"epoch": 2.650046860356139,
"grad_norm": 0.22789428750694415,
"learning_rate": 4.100238617528973e-07,
"loss": 0.2965,
"step": 7070
},
{
"epoch": 2.6537956888472354,
"grad_norm": 0.2310757479907784,
"learning_rate": 4.014188668767671e-07,
"loss": 0.2956,
"step": 7080
},
{
"epoch": 2.657544517338332,
"grad_norm": 0.21314866127121557,
"learning_rate": 3.9290134944375834e-07,
"loss": 0.2975,
"step": 7090
},
{
"epoch": 2.6612933458294283,
"grad_norm": 0.21262546928012396,
"learning_rate": 3.8447147148007735e-07,
"loss": 0.2991,
"step": 7100
},
{
"epoch": 2.665042174320525,
"grad_norm": 0.2185036816100282,
"learning_rate": 3.761293933447868e-07,
"loss": 0.2959,
"step": 7110
},
{
"epoch": 2.6687910028116213,
"grad_norm": 0.22509882595410716,
"learning_rate": 3.678752737267627e-07,
"loss": 0.2961,
"step": 7120
},
{
"epoch": 2.6725398313027178,
"grad_norm": 0.2209295427900408,
"learning_rate": 3.597092696416704e-07,
"loss": 0.296,
"step": 7130
},
{
"epoch": 2.6762886597938147,
"grad_norm": 0.20839767657345984,
"learning_rate": 3.5163153642898073e-07,
"loss": 0.2986,
"step": 7140
},
{
"epoch": 2.6800374882849107,
"grad_norm": 0.2170005019845449,
"learning_rate": 3.4364222774901366e-07,
"loss": 0.2969,
"step": 7150
},
{
"epoch": 2.6837863167760077,
"grad_norm": 0.22790793305608095,
"learning_rate": 3.357414955800148e-07,
"loss": 0.2967,
"step": 7160
},
{
"epoch": 2.687535145267104,
"grad_norm": 0.2179962319863983,
"learning_rate": 3.2792949021526686e-07,
"loss": 0.2965,
"step": 7170
},
{
"epoch": 2.6912839737582006,
"grad_norm": 0.2176296940618202,
"learning_rate": 3.202063602602262e-07,
"loss": 0.2982,
"step": 7180
},
{
"epoch": 2.695032802249297,
"grad_norm": 0.22847934720081933,
"learning_rate": 3.1257225262970146e-07,
"loss": 0.2946,
"step": 7190
},
{
"epoch": 2.6987816307403936,
"grad_norm": 0.20161821165065855,
"learning_rate": 3.050273125450537e-07,
"loss": 0.2958,
"step": 7200
},
{
"epoch": 2.70253045923149,
"grad_norm": 0.21754547873420735,
"learning_rate": 2.9757168353143795e-07,
"loss": 0.2931,
"step": 7210
},
{
"epoch": 2.7062792877225865,
"grad_norm": 0.20927718552782254,
"learning_rate": 2.9020550741507003e-07,
"loss": 0.296,
"step": 7220
},
{
"epoch": 2.7100281162136834,
"grad_norm": 0.21673136214793481,
"learning_rate": 2.829289243205313e-07,
"loss": 0.2977,
"step": 7230
},
{
"epoch": 2.7137769447047795,
"grad_norm": 0.21738597303662568,
"learning_rate": 2.7574207266810095e-07,
"loss": 0.297,
"step": 7240
},
{
"epoch": 2.7175257731958764,
"grad_norm": 0.22090904296834485,
"learning_rate": 2.686450891711223e-07,
"loss": 0.3019,
"step": 7250
},
{
"epoch": 2.721274601686973,
"grad_norm": 0.22588390372378073,
"learning_rate": 2.6163810883340633e-07,
"loss": 0.2992,
"step": 7260
},
{
"epoch": 2.7250234301780694,
"grad_norm": 0.2171446833278576,
"learning_rate": 2.547212649466568e-07,
"loss": 0.2958,
"step": 7270
},
{
"epoch": 2.728772258669166,
"grad_norm": 0.20897228290385925,
"learning_rate": 2.478946890879419e-07,
"loss": 0.2987,
"step": 7280
},
{
"epoch": 2.7325210871602623,
"grad_norm": 0.21930393993687972,
"learning_rate": 2.4115851111718767e-07,
"loss": 0.2984,
"step": 7290
},
{
"epoch": 2.736269915651359,
"grad_norm": 0.2086577507401556,
"learning_rate": 2.3451285917470478e-07,
"loss": 0.2948,
"step": 7300
},
{
"epoch": 2.7400187441424553,
"grad_norm": 0.24689056017428762,
"learning_rate": 2.2795785967875794e-07,
"loss": 0.2949,
"step": 7310
},
{
"epoch": 2.743767572633552,
"grad_norm": 0.2202426775150526,
"learning_rate": 2.214936373231552e-07,
"loss": 0.2964,
"step": 7320
},
{
"epoch": 2.7475164011246487,
"grad_norm": 0.22114657231547902,
"learning_rate": 2.151203150748793e-07,
"loss": 0.295,
"step": 7330
},
{
"epoch": 2.751265229615745,
"grad_norm": 0.20989352158816843,
"learning_rate": 2.08838014171745e-07,
"loss": 0.2948,
"step": 7340
},
{
"epoch": 2.7550140581068416,
"grad_norm": 0.21653508921554077,
"learning_rate": 2.0264685412009765e-07,
"loss": 0.2913,
"step": 7350
},
{
"epoch": 2.758762886597938,
"grad_norm": 0.20856496439474423,
"learning_rate": 1.9654695269253676e-07,
"loss": 0.2966,
"step": 7360
},
{
"epoch": 2.7625117150890346,
"grad_norm": 0.21697328504317934,
"learning_rate": 1.9053842592567372e-07,
"loss": 0.2935,
"step": 7370
},
{
"epoch": 2.766260543580131,
"grad_norm": 0.223018905717748,
"learning_rate": 1.846213881179304e-07,
"loss": 0.2943,
"step": 7380
},
{
"epoch": 2.770009372071228,
"grad_norm": 0.22471887950418276,
"learning_rate": 1.7879595182735853e-07,
"loss": 0.294,
"step": 7390
},
{
"epoch": 2.773758200562324,
"grad_norm": 0.2274675818451194,
"learning_rate": 1.7306222786950266e-07,
"loss": 0.2976,
"step": 7400
},
{
"epoch": 2.777507029053421,
"grad_norm": 0.21745114552538827,
"learning_rate": 1.6742032531529117e-07,
"loss": 0.2984,
"step": 7410
},
{
"epoch": 2.7812558575445174,
"grad_norm": 0.209520126032932,
"learning_rate": 1.618703514889608e-07,
"loss": 0.294,
"step": 7420
},
{
"epoch": 2.785004686035614,
"grad_norm": 0.2152117690983746,
"learning_rate": 1.5641241196601542e-07,
"loss": 0.2947,
"step": 7430
},
{
"epoch": 2.7887535145267104,
"grad_norm": 0.20856820453640396,
"learning_rate": 1.5104661057121605e-07,
"loss": 0.2934,
"step": 7440
},
{
"epoch": 2.792502343017807,
"grad_norm": 0.2087726976922736,
"learning_rate": 1.457730493766113e-07,
"loss": 0.2997,
"step": 7450
},
{
"epoch": 2.7962511715089033,
"grad_norm": 0.21039795736062336,
"learning_rate": 1.4059182869958776e-07,
"loss": 0.2944,
"step": 7460
},
{
"epoch": 2.8,
"grad_norm": 0.24255936306012296,
"learning_rate": 1.355030471009683e-07,
"loss": 0.2952,
"step": 7470
},
{
"epoch": 2.8037488284910967,
"grad_norm": 0.228361544622829,
"learning_rate": 1.3050680138313398e-07,
"loss": 0.2968,
"step": 7480
},
{
"epoch": 2.8074976569821932,
"grad_norm": 0.2788669586751136,
"learning_rate": 1.2560318658818238e-07,
"loss": 0.2919,
"step": 7490
},
{
"epoch": 2.8112464854732897,
"grad_norm": 0.21485061261933228,
"learning_rate": 1.2079229599612274e-07,
"loss": 0.2948,
"step": 7500
},
{
"epoch": 2.814995313964386,
"grad_norm": 0.20880096710948615,
"learning_rate": 1.160742211230964e-07,
"loss": 0.2966,
"step": 7510
},
{
"epoch": 2.8187441424554827,
"grad_norm": 0.20194538376919882,
"learning_rate": 1.1144905171964149e-07,
"loss": 0.2975,
"step": 7520
},
{
"epoch": 2.822492970946579,
"grad_norm": 0.20774178992713427,
"learning_rate": 1.0691687576898202e-07,
"loss": 0.2982,
"step": 7530
},
{
"epoch": 2.8262417994376756,
"grad_norm": 0.208737099957496,
"learning_rate": 1.0247777948535432e-07,
"loss": 0.2925,
"step": 7540
},
{
"epoch": 2.829990627928772,
"grad_norm": 0.2176776156807812,
"learning_rate": 9.813184731236935e-08,
"loss": 0.2932,
"step": 7550
},
{
"epoch": 2.8337394564198686,
"grad_norm": 0.207539461770748,
"learning_rate": 9.3879161921403e-08,
"loss": 0.295,
"step": 7560
},
{
"epoch": 2.8374882849109655,
"grad_norm": 0.2118079972476284,
"learning_rate": 8.971980421002779e-08,
"loss": 0.2946,
"step": 7570
},
{
"epoch": 2.841237113402062,
"grad_norm": 0.20851272394405806,
"learning_rate": 8.565385330046915e-08,
"loss": 0.2997,
"step": 7580
},
{
"epoch": 2.8449859418931585,
"grad_norm": 0.21224194574719543,
"learning_rate": 8.168138653810387e-08,
"loss": 0.2966,
"step": 7590
},
{
"epoch": 2.848734770384255,
"grad_norm": 0.20967024625449956,
"learning_rate": 7.780247948998788e-08,
"loss": 0.2993,
"step": 7600
},
{
"epoch": 2.8524835988753514,
"grad_norm": 0.2261170709858784,
"learning_rate": 7.401720594341688e-08,
"loss": 0.2947,
"step": 7610
},
{
"epoch": 2.856232427366448,
"grad_norm": 0.2001298846468726,
"learning_rate": 7.032563790452585e-08,
"loss": 0.2976,
"step": 7620
},
{
"epoch": 2.8599812558575444,
"grad_norm": 0.22681383953814865,
"learning_rate": 6.672784559691725e-08,
"loss": 0.2983,
"step": 7630
},
{
"epoch": 2.8637300843486413,
"grad_norm": 0.21103315361356145,
"learning_rate": 6.322389746032608e-08,
"loss": 0.2956,
"step": 7640
},
{
"epoch": 2.8674789128397373,
"grad_norm": 0.202377754899562,
"learning_rate": 5.981386014931645e-08,
"loss": 0.297,
"step": 7650
},
{
"epoch": 2.8712277413308342,
"grad_norm": 0.21978839641973172,
"learning_rate": 5.649779853201587e-08,
"loss": 0.2957,
"step": 7660
},
{
"epoch": 2.8749765698219307,
"grad_norm": 0.20879696172885226,
"learning_rate": 5.3275775688879096e-08,
"loss": 0.2951,
"step": 7670
},
{
"epoch": 2.878725398313027,
"grad_norm": 0.20388100110101098,
"learning_rate": 5.0147852911489606e-08,
"loss": 0.2975,
"step": 7680
},
{
"epoch": 2.8824742268041237,
"grad_norm": 0.21370045275380867,
"learning_rate": 4.7114089701393864e-08,
"loss": 0.2948,
"step": 7690
},
{
"epoch": 2.88622305529522,
"grad_norm": 0.2060746309945781,
"learning_rate": 4.4174543768968346e-08,
"loss": 0.2945,
"step": 7700
},
{
"epoch": 2.8899718837863166,
"grad_norm": 0.20704697522314686,
"learning_rate": 4.132927103232209e-08,
"loss": 0.2966,
"step": 7710
},
{
"epoch": 2.893720712277413,
"grad_norm": 0.22365022572567808,
"learning_rate": 3.857832561623309e-08,
"loss": 0.2972,
"step": 7720
},
{
"epoch": 2.89746954076851,
"grad_norm": 0.20659583365699133,
"learning_rate": 3.592175985111968e-08,
"loss": 0.2961,
"step": 7730
},
{
"epoch": 2.9012183692596065,
"grad_norm": 0.20618218716255435,
"learning_rate": 3.3359624272042976e-08,
"loss": 0.2952,
"step": 7740
},
{
"epoch": 2.904967197750703,
"grad_norm": 0.2149240713982645,
"learning_rate": 3.089196761774715e-08,
"loss": 0.2955,
"step": 7750
},
{
"epoch": 2.9087160262417995,
"grad_norm": 0.22355201181105996,
"learning_rate": 2.8518836829732332e-08,
"loss": 0.2977,
"step": 7760
},
{
"epoch": 2.912464854732896,
"grad_norm": 0.22264076611026476,
"learning_rate": 2.6240277051359788e-08,
"loss": 0.2987,
"step": 7770
},
{
"epoch": 2.9162136832239924,
"grad_norm": 0.2033040105643027,
"learning_rate": 2.4056331626995943e-08,
"loss": 0.2967,
"step": 7780
},
{
"epoch": 2.919962511715089,
"grad_norm": 0.2029744325807162,
"learning_rate": 2.1967042101185832e-08,
"loss": 0.2984,
"step": 7790
},
{
"epoch": 2.923711340206186,
"grad_norm": 0.2207170499349933,
"learning_rate": 1.9972448217863706e-08,
"loss": 0.2944,
"step": 7800
},
{
"epoch": 2.927460168697282,
"grad_norm": 0.2190536979775703,
"learning_rate": 1.807258791959643e-08,
"loss": 0.299,
"step": 7810
},
{
"epoch": 2.931208997188379,
"grad_norm": 0.20453932822962492,
"learning_rate": 1.626749734686295e-08,
"loss": 0.2965,
"step": 7820
},
{
"epoch": 2.9349578256794753,
"grad_norm": 0.21158692274365276,
"learning_rate": 1.4557210837364278e-08,
"loss": 0.2971,
"step": 7830
},
{
"epoch": 2.9387066541705718,
"grad_norm": 0.22314299534375273,
"learning_rate": 1.2941760925372914e-08,
"loss": 0.296,
"step": 7840
},
{
"epoch": 2.9424554826616682,
"grad_norm": 0.21201880468903128,
"learning_rate": 1.1421178341112781e-08,
"loss": 0.2981,
"step": 7850
},
{
"epoch": 2.9462043111527647,
"grad_norm": 0.19995417097694335,
"learning_rate": 9.995492010175245e-09,
"loss": 0.2969,
"step": 7860
},
{
"epoch": 2.949953139643861,
"grad_norm": 0.2032618514771412,
"learning_rate": 8.66472905296678e-09,
"loss": 0.2958,
"step": 7870
},
{
"epoch": 2.9537019681349577,
"grad_norm": 0.22054734778678423,
"learning_rate": 7.428914784197161e-09,
"loss": 0.2984,
"step": 7880
},
{
"epoch": 2.9574507966260546,
"grad_norm": 0.21531333241349596,
"learning_rate": 6.288072712393734e-09,
"loss": 0.2961,
"step": 7890
},
{
"epoch": 2.9611996251171506,
"grad_norm": 0.21334038501182684,
"learning_rate": 5.24222453945622e-09,
"loss": 0.2968,
"step": 7900
},
{
"epoch": 2.9649484536082475,
"grad_norm": 0.21379304132484478,
"learning_rate": 4.291390160243713e-09,
"loss": 0.2987,
"step": 7910
},
{
"epoch": 2.968697282099344,
"grad_norm": 0.21586668554028848,
"learning_rate": 3.435587662196094e-09,
"loss": 0.2963,
"step": 7920
},
{
"epoch": 2.9724461105904405,
"grad_norm": 0.21612407035729198,
"learning_rate": 2.6748333249909665e-09,
"loss": 0.2947,
"step": 7930
},
{
"epoch": 2.976194939081537,
"grad_norm": 0.20332605208597299,
"learning_rate": 2.0091416202316918e-09,
"loss": 0.2988,
"step": 7940
},
{
"epoch": 2.9799437675726335,
"grad_norm": 0.2079341599363278,
"learning_rate": 1.4385252111737136e-09,
"loss": 0.2919,
"step": 7950
},
{
"epoch": 2.98369259606373,
"grad_norm": 0.20606235066584186,
"learning_rate": 9.629949524830873e-10,
"loss": 0.2952,
"step": 7960
},
{
"epoch": 2.9874414245548264,
"grad_norm": 0.2280988323439713,
"learning_rate": 5.825598900316421e-10,
"loss": 0.2963,
"step": 7970
},
{
"epoch": 2.9911902530459233,
"grad_norm": 0.21299797878972357,
"learning_rate": 2.972272607221216e-10,
"loss": 0.2919,
"step": 7980
},
{
"epoch": 2.99493908153702,
"grad_norm": 0.21214297989590156,
"learning_rate": 1.0700249235218175e-10,
"loss": 0.2962,
"step": 7990
},
{
"epoch": 2.9986879100281163,
"grad_norm": 0.21387066401777824,
"learning_rate": 1.1889203511139536e-11,
"loss": 0.299,
"step": 8000
}
],
"logging_steps": 10,
"max_steps": 8004,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.990281269536358e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}