ODA-Fin-SFT-8B / trainer_state.json
chuxuecao's picture
Upload folder using huggingface_hub
7ddbd9b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 7470,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004017576898932831,
"grad_norm": 5.806783098701457,
"learning_rate": 1.204819277108434e-07,
"loss": 1.0662,
"step": 10
},
{
"epoch": 0.008035153797865662,
"grad_norm": 6.1737495782500815,
"learning_rate": 2.5435073627844717e-07,
"loss": 1.0418,
"step": 20
},
{
"epoch": 0.012052730696798493,
"grad_norm": 5.869188839514599,
"learning_rate": 3.8821954484605087e-07,
"loss": 1.0476,
"step": 30
},
{
"epoch": 0.016070307595731324,
"grad_norm": 4.067430650309112,
"learning_rate": 5.220883534136546e-07,
"loss": 1.0186,
"step": 40
},
{
"epoch": 0.020087884494664157,
"grad_norm": 1.9993360792772212,
"learning_rate": 6.559571619812584e-07,
"loss": 0.9457,
"step": 50
},
{
"epoch": 0.024105461393596987,
"grad_norm": 1.0897504400503495,
"learning_rate": 7.898259705488621e-07,
"loss": 0.8694,
"step": 60
},
{
"epoch": 0.02812303829252982,
"grad_norm": 0.8480278075524572,
"learning_rate": 9.236947791164659e-07,
"loss": 0.8453,
"step": 70
},
{
"epoch": 0.03214061519146265,
"grad_norm": 0.5847985812870466,
"learning_rate": 1.0575635876840697e-06,
"loss": 0.8091,
"step": 80
},
{
"epoch": 0.03615819209039548,
"grad_norm": 0.5632601125338154,
"learning_rate": 1.1914323962516733e-06,
"loss": 0.7981,
"step": 90
},
{
"epoch": 0.040175768989328314,
"grad_norm": 0.4998168225005524,
"learning_rate": 1.3253012048192773e-06,
"loss": 0.7646,
"step": 100
},
{
"epoch": 0.04419334588826114,
"grad_norm": 0.6903822114162349,
"learning_rate": 1.4591700133868811e-06,
"loss": 0.7619,
"step": 110
},
{
"epoch": 0.04821092278719397,
"grad_norm": 0.4815863955585346,
"learning_rate": 1.593038821954485e-06,
"loss": 0.7679,
"step": 120
},
{
"epoch": 0.052228499686126806,
"grad_norm": 0.4828754779519579,
"learning_rate": 1.7269076305220885e-06,
"loss": 0.7406,
"step": 130
},
{
"epoch": 0.05624607658505964,
"grad_norm": 0.5272829904340466,
"learning_rate": 1.8607764390896923e-06,
"loss": 0.7451,
"step": 140
},
{
"epoch": 0.060263653483992465,
"grad_norm": 0.48100813622874905,
"learning_rate": 1.994645247657296e-06,
"loss": 0.7053,
"step": 150
},
{
"epoch": 0.0642812303829253,
"grad_norm": 0.49305369454902975,
"learning_rate": 2.1285140562248997e-06,
"loss": 0.7217,
"step": 160
},
{
"epoch": 0.06829880728185812,
"grad_norm": 0.4711986293293931,
"learning_rate": 2.2623828647925037e-06,
"loss": 0.7368,
"step": 170
},
{
"epoch": 0.07231638418079096,
"grad_norm": 0.5025185766261334,
"learning_rate": 2.3962516733601073e-06,
"loss": 0.7179,
"step": 180
},
{
"epoch": 0.07633396107972379,
"grad_norm": 0.5264169172616818,
"learning_rate": 2.530120481927711e-06,
"loss": 0.7374,
"step": 190
},
{
"epoch": 0.08035153797865663,
"grad_norm": 0.46948548462020134,
"learning_rate": 2.6639892904953145e-06,
"loss": 0.72,
"step": 200
},
{
"epoch": 0.08436911487758945,
"grad_norm": 0.463248294175632,
"learning_rate": 2.7978580990629185e-06,
"loss": 0.7115,
"step": 210
},
{
"epoch": 0.08838669177652228,
"grad_norm": 0.45484667965794356,
"learning_rate": 2.931726907630522e-06,
"loss": 0.7034,
"step": 220
},
{
"epoch": 0.09240426867545512,
"grad_norm": 0.4357030076439862,
"learning_rate": 3.0655957161981257e-06,
"loss": 0.7126,
"step": 230
},
{
"epoch": 0.09642184557438795,
"grad_norm": 0.6453731431637157,
"learning_rate": 3.1994645247657297e-06,
"loss": 0.7097,
"step": 240
},
{
"epoch": 0.10043942247332077,
"grad_norm": 0.4849489606416127,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7023,
"step": 250
},
{
"epoch": 0.10445699937225361,
"grad_norm": 0.42549798364358465,
"learning_rate": 3.4672021419009373e-06,
"loss": 0.6924,
"step": 260
},
{
"epoch": 0.10847457627118644,
"grad_norm": 0.4773000774187341,
"learning_rate": 3.601070950468541e-06,
"loss": 0.6968,
"step": 270
},
{
"epoch": 0.11249215317011928,
"grad_norm": 0.5240441729923269,
"learning_rate": 3.7349397590361445e-06,
"loss": 0.7045,
"step": 280
},
{
"epoch": 0.1165097300690521,
"grad_norm": 0.4910140491378646,
"learning_rate": 3.8688085676037485e-06,
"loss": 0.6933,
"step": 290
},
{
"epoch": 0.12052730696798493,
"grad_norm": 0.4580543272940571,
"learning_rate": 4.002677376171352e-06,
"loss": 0.6762,
"step": 300
},
{
"epoch": 0.12454488386691777,
"grad_norm": 0.4638793548670153,
"learning_rate": 4.136546184738956e-06,
"loss": 0.6807,
"step": 310
},
{
"epoch": 0.1285624607658506,
"grad_norm": 0.48226078090248237,
"learning_rate": 4.270414993306559e-06,
"loss": 0.6836,
"step": 320
},
{
"epoch": 0.13258003766478343,
"grad_norm": 0.5205992514175658,
"learning_rate": 4.404283801874164e-06,
"loss": 0.7006,
"step": 330
},
{
"epoch": 0.13659761456371625,
"grad_norm": 0.5157658854686866,
"learning_rate": 4.538152610441767e-06,
"loss": 0.6777,
"step": 340
},
{
"epoch": 0.1406151914626491,
"grad_norm": 0.5186893129183067,
"learning_rate": 4.672021419009371e-06,
"loss": 0.6863,
"step": 350
},
{
"epoch": 0.14463276836158193,
"grad_norm": 0.48228160242697454,
"learning_rate": 4.8058902275769745e-06,
"loss": 0.692,
"step": 360
},
{
"epoch": 0.14865034526051477,
"grad_norm": 0.48252742197899023,
"learning_rate": 4.939759036144578e-06,
"loss": 0.6866,
"step": 370
},
{
"epoch": 0.15266792215944758,
"grad_norm": 0.4457961570378318,
"learning_rate": 5.0736278447121826e-06,
"loss": 0.6786,
"step": 380
},
{
"epoch": 0.15668549905838042,
"grad_norm": 0.475655993068204,
"learning_rate": 5.207496653279787e-06,
"loss": 0.6899,
"step": 390
},
{
"epoch": 0.16070307595731326,
"grad_norm": 0.44137190174512886,
"learning_rate": 5.34136546184739e-06,
"loss": 0.6673,
"step": 400
},
{
"epoch": 0.16472065285624607,
"grad_norm": 0.49163743462647375,
"learning_rate": 5.475234270414994e-06,
"loss": 0.6696,
"step": 410
},
{
"epoch": 0.1687382297551789,
"grad_norm": 0.5274225683716718,
"learning_rate": 5.609103078982597e-06,
"loss": 0.6604,
"step": 420
},
{
"epoch": 0.17275580665411175,
"grad_norm": 0.4371201249615473,
"learning_rate": 5.742971887550201e-06,
"loss": 0.6623,
"step": 430
},
{
"epoch": 0.17677338355304456,
"grad_norm": 0.496866788031066,
"learning_rate": 5.876840696117805e-06,
"loss": 0.6733,
"step": 440
},
{
"epoch": 0.1807909604519774,
"grad_norm": 0.5029900924311191,
"learning_rate": 6.010709504685409e-06,
"loss": 0.6593,
"step": 450
},
{
"epoch": 0.18480853735091024,
"grad_norm": 0.5057790072232172,
"learning_rate": 6.144578313253012e-06,
"loss": 0.684,
"step": 460
},
{
"epoch": 0.18882611424984305,
"grad_norm": 1.210542334820464,
"learning_rate": 6.2784471218206166e-06,
"loss": 0.6743,
"step": 470
},
{
"epoch": 0.1928436911487759,
"grad_norm": 0.4979782025477802,
"learning_rate": 6.41231593038822e-06,
"loss": 0.6645,
"step": 480
},
{
"epoch": 0.19686126804770873,
"grad_norm": 0.4671679965356099,
"learning_rate": 6.546184738955825e-06,
"loss": 0.6593,
"step": 490
},
{
"epoch": 0.20087884494664154,
"grad_norm": 0.47107607146399,
"learning_rate": 6.680053547523427e-06,
"loss": 0.653,
"step": 500
},
{
"epoch": 0.20489642184557438,
"grad_norm": 0.5355141556215653,
"learning_rate": 6.813922356091032e-06,
"loss": 0.6555,
"step": 510
},
{
"epoch": 0.20891399874450722,
"grad_norm": 0.4770424360600091,
"learning_rate": 6.9477911646586345e-06,
"loss": 0.6426,
"step": 520
},
{
"epoch": 0.21293157564344006,
"grad_norm": 0.4799836242336395,
"learning_rate": 7.081659973226239e-06,
"loss": 0.6565,
"step": 530
},
{
"epoch": 0.21694915254237288,
"grad_norm": 0.5167694867415858,
"learning_rate": 7.2155287817938426e-06,
"loss": 0.6609,
"step": 540
},
{
"epoch": 0.22096672944130571,
"grad_norm": 0.5086826009179188,
"learning_rate": 7.349397590361447e-06,
"loss": 0.6494,
"step": 550
},
{
"epoch": 0.22498430634023855,
"grad_norm": 0.4922827776447584,
"learning_rate": 7.48326639892905e-06,
"loss": 0.6558,
"step": 560
},
{
"epoch": 0.22900188323917137,
"grad_norm": 0.48344938995436526,
"learning_rate": 7.617135207496654e-06,
"loss": 0.6481,
"step": 570
},
{
"epoch": 0.2330194601381042,
"grad_norm": 0.507502223485786,
"learning_rate": 7.751004016064258e-06,
"loss": 0.6529,
"step": 580
},
{
"epoch": 0.23703703703703705,
"grad_norm": 0.49027674360195406,
"learning_rate": 7.884872824631861e-06,
"loss": 0.6475,
"step": 590
},
{
"epoch": 0.24105461393596986,
"grad_norm": 0.47662111002737323,
"learning_rate": 8.018741633199465e-06,
"loss": 0.6606,
"step": 600
},
{
"epoch": 0.2450721908349027,
"grad_norm": 0.5178759139169029,
"learning_rate": 8.152610441767069e-06,
"loss": 0.6507,
"step": 610
},
{
"epoch": 0.24908976773383554,
"grad_norm": 0.4913437654692537,
"learning_rate": 8.286479250334672e-06,
"loss": 0.6676,
"step": 620
},
{
"epoch": 0.25310734463276835,
"grad_norm": 0.4983838425660273,
"learning_rate": 8.420348058902277e-06,
"loss": 0.6431,
"step": 630
},
{
"epoch": 0.2571249215317012,
"grad_norm": 0.5296861218593591,
"learning_rate": 8.55421686746988e-06,
"loss": 0.6469,
"step": 640
},
{
"epoch": 0.26114249843063403,
"grad_norm": 0.49470648462227873,
"learning_rate": 8.688085676037485e-06,
"loss": 0.653,
"step": 650
},
{
"epoch": 0.26516007532956687,
"grad_norm": 0.4977505266791689,
"learning_rate": 8.821954484605088e-06,
"loss": 0.6506,
"step": 660
},
{
"epoch": 0.2691776522284997,
"grad_norm": 0.5181442610979835,
"learning_rate": 8.955823293172692e-06,
"loss": 0.6389,
"step": 670
},
{
"epoch": 0.2731952291274325,
"grad_norm": 0.5401626680115751,
"learning_rate": 9.089692101740295e-06,
"loss": 0.644,
"step": 680
},
{
"epoch": 0.27721280602636533,
"grad_norm": 0.5294876220592581,
"learning_rate": 9.223560910307899e-06,
"loss": 0.6476,
"step": 690
},
{
"epoch": 0.2812303829252982,
"grad_norm": 0.5001749586085277,
"learning_rate": 9.357429718875503e-06,
"loss": 0.6469,
"step": 700
},
{
"epoch": 0.285247959824231,
"grad_norm": 0.49514793732187534,
"learning_rate": 9.491298527443106e-06,
"loss": 0.6412,
"step": 710
},
{
"epoch": 0.28926553672316385,
"grad_norm": 0.5451601544807843,
"learning_rate": 9.62516733601071e-06,
"loss": 0.6342,
"step": 720
},
{
"epoch": 0.2932831136220967,
"grad_norm": 0.5386128345367395,
"learning_rate": 9.759036144578315e-06,
"loss": 0.6422,
"step": 730
},
{
"epoch": 0.29730069052102953,
"grad_norm": 0.5295404573044942,
"learning_rate": 9.892904953145917e-06,
"loss": 0.6418,
"step": 740
},
{
"epoch": 0.3013182674199623,
"grad_norm": 0.48815626124299877,
"learning_rate": 9.999997816397962e-06,
"loss": 0.649,
"step": 750
},
{
"epoch": 0.30533584431889516,
"grad_norm": 0.5070827630467095,
"learning_rate": 9.999921390526839e-06,
"loss": 0.6453,
"step": 760
},
{
"epoch": 0.309353421217828,
"grad_norm": 0.5070143645241298,
"learning_rate": 9.999735786460982e-06,
"loss": 0.6302,
"step": 770
},
{
"epoch": 0.31337099811676083,
"grad_norm": 0.4790968559270035,
"learning_rate": 9.999441008253238e-06,
"loss": 0.632,
"step": 780
},
{
"epoch": 0.3173885750156937,
"grad_norm": 0.48852106728621275,
"learning_rate": 9.999037062340376e-06,
"loss": 0.6436,
"step": 790
},
{
"epoch": 0.3214061519146265,
"grad_norm": 0.4972110031536491,
"learning_rate": 9.998523957542955e-06,
"loss": 0.6411,
"step": 800
},
{
"epoch": 0.3254237288135593,
"grad_norm": 0.533109673228536,
"learning_rate": 9.997901705065118e-06,
"loss": 0.6422,
"step": 810
},
{
"epoch": 0.32944130571249214,
"grad_norm": 0.528721261444126,
"learning_rate": 9.997170318494362e-06,
"loss": 0.6457,
"step": 820
},
{
"epoch": 0.333458882611425,
"grad_norm": 0.7706123304017218,
"learning_rate": 9.996329813801233e-06,
"loss": 0.6479,
"step": 830
},
{
"epoch": 0.3374764595103578,
"grad_norm": 0.6304185980981377,
"learning_rate": 9.995380209338973e-06,
"loss": 0.639,
"step": 840
},
{
"epoch": 0.34149403640929066,
"grad_norm": 0.47993658132408695,
"learning_rate": 9.99432152584313e-06,
"loss": 0.6232,
"step": 850
},
{
"epoch": 0.3455116133082235,
"grad_norm": 0.5189489672799563,
"learning_rate": 9.993153786431098e-06,
"loss": 0.6457,
"step": 860
},
{
"epoch": 0.3495291902071563,
"grad_norm": 0.490298021772909,
"learning_rate": 9.991877016601612e-06,
"loss": 0.6489,
"step": 870
},
{
"epoch": 0.3535467671060891,
"grad_norm": 0.4733551505574991,
"learning_rate": 9.990491244234197e-06,
"loss": 0.6327,
"step": 880
},
{
"epoch": 0.35756434400502196,
"grad_norm": 0.4812850798556578,
"learning_rate": 9.988996499588556e-06,
"loss": 0.6325,
"step": 890
},
{
"epoch": 0.3615819209039548,
"grad_norm": 0.5139202954592238,
"learning_rate": 9.987392815303903e-06,
"loss": 0.6302,
"step": 900
},
{
"epoch": 0.36559949780288764,
"grad_norm": 0.4950874659702236,
"learning_rate": 9.985680226398261e-06,
"loss": 0.641,
"step": 910
},
{
"epoch": 0.3696170747018205,
"grad_norm": 0.5079828516156794,
"learning_rate": 9.98385877026769e-06,
"loss": 0.6384,
"step": 920
},
{
"epoch": 0.3736346516007533,
"grad_norm": 0.49658973438025256,
"learning_rate": 9.981928486685477e-06,
"loss": 0.6365,
"step": 930
},
{
"epoch": 0.3776522284996861,
"grad_norm": 0.46532869938151694,
"learning_rate": 9.979889417801257e-06,
"loss": 0.64,
"step": 940
},
{
"epoch": 0.38166980539861894,
"grad_norm": 0.5187436115265832,
"learning_rate": 9.9777416081401e-06,
"loss": 0.6268,
"step": 950
},
{
"epoch": 0.3856873822975518,
"grad_norm": 0.46664875039431214,
"learning_rate": 9.975485104601544e-06,
"loss": 0.6302,
"step": 960
},
{
"epoch": 0.3897049591964846,
"grad_norm": 0.47880441558880193,
"learning_rate": 9.973119956458558e-06,
"loss": 0.6238,
"step": 970
},
{
"epoch": 0.39372253609541746,
"grad_norm": 0.4691906979131477,
"learning_rate": 9.970646215356477e-06,
"loss": 0.6422,
"step": 980
},
{
"epoch": 0.3977401129943503,
"grad_norm": 0.5000299551978495,
"learning_rate": 9.968063935311865e-06,
"loss": 0.6329,
"step": 990
},
{
"epoch": 0.4017576898932831,
"grad_norm": 0.5484904742238748,
"learning_rate": 9.965373172711343e-06,
"loss": 0.6317,
"step": 1000
},
{
"epoch": 0.4057752667922159,
"grad_norm": 0.517325966851464,
"learning_rate": 9.96257398631036e-06,
"loss": 0.6404,
"step": 1010
},
{
"epoch": 0.40979284369114877,
"grad_norm": 0.4507627813502968,
"learning_rate": 9.959666437231895e-06,
"loss": 0.6303,
"step": 1020
},
{
"epoch": 0.4138104205900816,
"grad_norm": 0.5071861791885447,
"learning_rate": 9.95665058896514e-06,
"loss": 0.6135,
"step": 1030
},
{
"epoch": 0.41782799748901445,
"grad_norm": 0.45341474710439855,
"learning_rate": 9.953526507364106e-06,
"loss": 0.619,
"step": 1040
},
{
"epoch": 0.4218455743879473,
"grad_norm": 0.46797706523025123,
"learning_rate": 9.95029426064618e-06,
"loss": 0.6253,
"step": 1050
},
{
"epoch": 0.4258631512868801,
"grad_norm": 0.5018403383029635,
"learning_rate": 9.946953919390648e-06,
"loss": 0.6363,
"step": 1060
},
{
"epoch": 0.4298807281858129,
"grad_norm": 0.4563294653990798,
"learning_rate": 9.94350555653714e-06,
"loss": 0.6223,
"step": 1070
},
{
"epoch": 0.43389830508474575,
"grad_norm": 0.5142438260172338,
"learning_rate": 9.939949247384046e-06,
"loss": 0.636,
"step": 1080
},
{
"epoch": 0.4379158819836786,
"grad_norm": 0.4974669445120328,
"learning_rate": 9.93628506958687e-06,
"loss": 0.6242,
"step": 1090
},
{
"epoch": 0.44193345888261143,
"grad_norm": 0.5980709429827091,
"learning_rate": 9.932513103156532e-06,
"loss": 0.6408,
"step": 1100
},
{
"epoch": 0.44595103578154427,
"grad_norm": 0.5581732511219715,
"learning_rate": 9.928633430457628e-06,
"loss": 0.6139,
"step": 1110
},
{
"epoch": 0.4499686126804771,
"grad_norm": 0.5080904502728457,
"learning_rate": 9.924646136206617e-06,
"loss": 0.628,
"step": 1120
},
{
"epoch": 0.4539861895794099,
"grad_norm": 0.5539298313516415,
"learning_rate": 9.920551307469987e-06,
"loss": 0.6212,
"step": 1130
},
{
"epoch": 0.45800376647834273,
"grad_norm": 0.5135075682143369,
"learning_rate": 9.916349033662349e-06,
"loss": 0.6207,
"step": 1140
},
{
"epoch": 0.4620213433772756,
"grad_norm": 0.5431528274267999,
"learning_rate": 9.912039406544477e-06,
"loss": 0.6268,
"step": 1150
},
{
"epoch": 0.4660389202762084,
"grad_norm": 0.46293456683258105,
"learning_rate": 9.907622520221312e-06,
"loss": 0.6168,
"step": 1160
},
{
"epoch": 0.47005649717514125,
"grad_norm": 0.5265783067042777,
"learning_rate": 9.903098471139903e-06,
"loss": 0.611,
"step": 1170
},
{
"epoch": 0.4740740740740741,
"grad_norm": 0.4964165005454209,
"learning_rate": 9.89846735808731e-06,
"loss": 0.6209,
"step": 1180
},
{
"epoch": 0.47809165097300693,
"grad_norm": 0.44720379185402914,
"learning_rate": 9.893729282188433e-06,
"loss": 0.6274,
"step": 1190
},
{
"epoch": 0.4821092278719397,
"grad_norm": 0.5160429448873454,
"learning_rate": 9.888884346903813e-06,
"loss": 0.618,
"step": 1200
},
{
"epoch": 0.48612680477087256,
"grad_norm": 0.483511156492776,
"learning_rate": 9.883932658027374e-06,
"loss": 0.621,
"step": 1210
},
{
"epoch": 0.4901443816698054,
"grad_norm": 0.5158419978797311,
"learning_rate": 9.8788743236841e-06,
"loss": 0.6295,
"step": 1220
},
{
"epoch": 0.49416195856873824,
"grad_norm": 0.5784261169598502,
"learning_rate": 9.873709454327697e-06,
"loss": 0.6215,
"step": 1230
},
{
"epoch": 0.4981795354676711,
"grad_norm": 0.5198565153146563,
"learning_rate": 9.868438162738154e-06,
"loss": 0.6264,
"step": 1240
},
{
"epoch": 0.5021971123666039,
"grad_norm": 0.510434585062711,
"learning_rate": 9.863060564019305e-06,
"loss": 0.6149,
"step": 1250
},
{
"epoch": 0.5062146892655367,
"grad_norm": 0.4593962120443787,
"learning_rate": 9.8575767755963e-06,
"loss": 0.625,
"step": 1260
},
{
"epoch": 0.5102322661644696,
"grad_norm": 0.498172586068187,
"learning_rate": 9.851986917213044e-06,
"loss": 0.6143,
"step": 1270
},
{
"epoch": 0.5142498430634024,
"grad_norm": 0.4620438331915131,
"learning_rate": 9.846291110929586e-06,
"loss": 0.6313,
"step": 1280
},
{
"epoch": 0.5182674199623352,
"grad_norm": 0.5343634450415627,
"learning_rate": 9.840489481119452e-06,
"loss": 0.6182,
"step": 1290
},
{
"epoch": 0.5222849968612681,
"grad_norm": 0.48909140457697287,
"learning_rate": 9.834582154466927e-06,
"loss": 0.6325,
"step": 1300
},
{
"epoch": 0.5263025737602008,
"grad_norm": 0.4678287985367856,
"learning_rate": 9.828569259964291e-06,
"loss": 0.6307,
"step": 1310
},
{
"epoch": 0.5303201506591337,
"grad_norm": 0.4920124660634079,
"learning_rate": 9.822450928909e-06,
"loss": 0.6108,
"step": 1320
},
{
"epoch": 0.5343377275580665,
"grad_norm": 1.4526222205367336,
"learning_rate": 9.816227294900822e-06,
"loss": 0.6213,
"step": 1330
},
{
"epoch": 0.5383553044569994,
"grad_norm": 0.471616985395006,
"learning_rate": 9.809898493838923e-06,
"loss": 0.6169,
"step": 1340
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.4421867425241258,
"learning_rate": 9.803464663918886e-06,
"loss": 0.6093,
"step": 1350
},
{
"epoch": 0.546390458254865,
"grad_norm": 0.5351347284551249,
"learning_rate": 9.796925945629711e-06,
"loss": 0.6143,
"step": 1360
},
{
"epoch": 0.5504080351537979,
"grad_norm": 0.4877248363865589,
"learning_rate": 9.79028248175073e-06,
"loss": 0.6192,
"step": 1370
},
{
"epoch": 0.5544256120527307,
"grad_norm": 0.5009708952846684,
"learning_rate": 9.783534417348507e-06,
"loss": 0.6143,
"step": 1380
},
{
"epoch": 0.5584431889516636,
"grad_norm": 0.5178945086806888,
"learning_rate": 9.776681899773652e-06,
"loss": 0.6205,
"step": 1390
},
{
"epoch": 0.5624607658505963,
"grad_norm": 0.4762670817917884,
"learning_rate": 9.769725078657622e-06,
"loss": 0.6173,
"step": 1400
},
{
"epoch": 0.5664783427495292,
"grad_norm": 0.4613547006315256,
"learning_rate": 9.762664105909434e-06,
"loss": 0.6251,
"step": 1410
},
{
"epoch": 0.570495919648462,
"grad_norm": 0.4702754249747577,
"learning_rate": 9.755499135712368e-06,
"loss": 0.6183,
"step": 1420
},
{
"epoch": 0.5745134965473948,
"grad_norm": 0.5373232333646776,
"learning_rate": 9.748230324520585e-06,
"loss": 0.6132,
"step": 1430
},
{
"epoch": 0.5785310734463277,
"grad_norm": 0.5626349604991109,
"learning_rate": 9.740857831055715e-06,
"loss": 0.621,
"step": 1440
},
{
"epoch": 0.5825486503452605,
"grad_norm": 0.5658213472247587,
"learning_rate": 9.733381816303395e-06,
"loss": 0.6138,
"step": 1450
},
{
"epoch": 0.5865662272441934,
"grad_norm": 0.45081662608826584,
"learning_rate": 9.725802443509753e-06,
"loss": 0.616,
"step": 1460
},
{
"epoch": 0.5905838041431262,
"grad_norm": 0.44740848559688695,
"learning_rate": 9.718119878177837e-06,
"loss": 0.6129,
"step": 1470
},
{
"epoch": 0.5946013810420591,
"grad_norm": 0.5237403744825302,
"learning_rate": 9.710334288064007e-06,
"loss": 0.6136,
"step": 1480
},
{
"epoch": 0.5986189579409918,
"grad_norm": 0.48924900162227497,
"learning_rate": 9.702445843174274e-06,
"loss": 0.6196,
"step": 1490
},
{
"epoch": 0.6026365348399246,
"grad_norm": 0.49368923400575526,
"learning_rate": 9.694454715760573e-06,
"loss": 0.6187,
"step": 1500
},
{
"epoch": 0.6066541117388575,
"grad_norm": 0.487126510015843,
"learning_rate": 9.686361080317029e-06,
"loss": 0.6172,
"step": 1510
},
{
"epoch": 0.6106716886377903,
"grad_norm": 0.45477094460777745,
"learning_rate": 9.678165113576114e-06,
"loss": 0.6056,
"step": 1520
},
{
"epoch": 0.6146892655367232,
"grad_norm": 0.49141680431275075,
"learning_rate": 9.669866994504818e-06,
"loss": 0.6043,
"step": 1530
},
{
"epoch": 0.618706842435656,
"grad_norm": 0.7376167208477882,
"learning_rate": 9.66146690430072e-06,
"loss": 0.6208,
"step": 1540
},
{
"epoch": 0.6227244193345888,
"grad_norm": 0.4931059951127201,
"learning_rate": 9.652965026388039e-06,
"loss": 0.6097,
"step": 1550
},
{
"epoch": 0.6267419962335217,
"grad_norm": 0.48305824703353156,
"learning_rate": 9.644361546413635e-06,
"loss": 0.6081,
"step": 1560
},
{
"epoch": 0.6307595731324545,
"grad_norm": 0.46866086349351754,
"learning_rate": 9.635656652242938e-06,
"loss": 0.6187,
"step": 1570
},
{
"epoch": 0.6347771500313873,
"grad_norm": 0.48653033771244636,
"learning_rate": 9.626850533955864e-06,
"loss": 0.6039,
"step": 1580
},
{
"epoch": 0.6387947269303201,
"grad_norm": 0.4839769721956582,
"learning_rate": 9.617943383842659e-06,
"loss": 0.617,
"step": 1590
},
{
"epoch": 0.642812303829253,
"grad_norm": 0.4887686378157136,
"learning_rate": 9.608935396399692e-06,
"loss": 0.6043,
"step": 1600
},
{
"epoch": 0.6468298807281858,
"grad_norm": 0.46040947244346264,
"learning_rate": 9.599826768325218e-06,
"loss": 0.6088,
"step": 1610
},
{
"epoch": 0.6508474576271186,
"grad_norm": 0.4882314027569112,
"learning_rate": 9.590617698515077e-06,
"loss": 0.6084,
"step": 1620
},
{
"epoch": 0.6548650345260515,
"grad_norm": 0.46985181649935615,
"learning_rate": 9.581308388058354e-06,
"loss": 0.6029,
"step": 1630
},
{
"epoch": 0.6588826114249843,
"grad_norm": 0.48794194189773543,
"learning_rate": 9.571899040232989e-06,
"loss": 0.6088,
"step": 1640
},
{
"epoch": 0.6629001883239172,
"grad_norm": 0.46766933522748133,
"learning_rate": 9.56238986050133e-06,
"loss": 0.6149,
"step": 1650
},
{
"epoch": 0.66691776522285,
"grad_norm": 0.48282374325088395,
"learning_rate": 9.552781056505662e-06,
"loss": 0.6101,
"step": 1660
},
{
"epoch": 0.6709353421217829,
"grad_norm": 0.5292742570379373,
"learning_rate": 9.543072838063655e-06,
"loss": 0.6128,
"step": 1670
},
{
"epoch": 0.6749529190207156,
"grad_norm": 0.5260262320491007,
"learning_rate": 9.533265417163793e-06,
"loss": 0.6234,
"step": 1680
},
{
"epoch": 0.6789704959196484,
"grad_norm": 0.48584608333045604,
"learning_rate": 9.523359007960748e-06,
"loss": 0.6116,
"step": 1690
},
{
"epoch": 0.6829880728185813,
"grad_norm": 0.4743066347549614,
"learning_rate": 9.513353826770695e-06,
"loss": 0.5959,
"step": 1700
},
{
"epoch": 0.6870056497175141,
"grad_norm": 0.4856299177120577,
"learning_rate": 9.503250092066592e-06,
"loss": 0.6204,
"step": 1710
},
{
"epoch": 0.691023226616447,
"grad_norm": 0.5046231694012181,
"learning_rate": 9.493048024473413e-06,
"loss": 0.6126,
"step": 1720
},
{
"epoch": 0.6950408035153798,
"grad_norm": 0.4928237045269149,
"learning_rate": 9.48274784676332e-06,
"loss": 0.6089,
"step": 1730
},
{
"epoch": 0.6990583804143126,
"grad_norm": 0.4766754598686215,
"learning_rate": 9.472349783850815e-06,
"loss": 0.6061,
"step": 1740
},
{
"epoch": 0.7030759573132455,
"grad_norm": 0.4631337703939171,
"learning_rate": 9.461854062787812e-06,
"loss": 0.6121,
"step": 1750
},
{
"epoch": 0.7070935342121782,
"grad_norm": 0.4961256195106074,
"learning_rate": 9.451260912758695e-06,
"loss": 0.6037,
"step": 1760
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.511885819703456,
"learning_rate": 9.440570565075295e-06,
"loss": 0.6145,
"step": 1770
},
{
"epoch": 0.7151286880100439,
"grad_norm": 0.48629463779046733,
"learning_rate": 9.429783253171855e-06,
"loss": 0.5966,
"step": 1780
},
{
"epoch": 0.7191462649089768,
"grad_norm": 0.4792036473525835,
"learning_rate": 9.418899212599928e-06,
"loss": 0.608,
"step": 1790
},
{
"epoch": 0.7231638418079096,
"grad_norm": 0.5083453945346657,
"learning_rate": 9.407918681023229e-06,
"loss": 0.6095,
"step": 1800
},
{
"epoch": 0.7271814187068424,
"grad_norm": 0.4685060218692666,
"learning_rate": 9.396841898212452e-06,
"loss": 0.6111,
"step": 1810
},
{
"epoch": 0.7311989956057753,
"grad_norm": 0.5007535253551997,
"learning_rate": 9.38566910604003e-06,
"loss": 0.6175,
"step": 1820
},
{
"epoch": 0.7352165725047081,
"grad_norm": 0.4721707017133051,
"learning_rate": 9.374400548474853e-06,
"loss": 0.6105,
"step": 1830
},
{
"epoch": 0.739234149403641,
"grad_norm": 0.45194793890207013,
"learning_rate": 9.363036471576945e-06,
"loss": 0.5976,
"step": 1840
},
{
"epoch": 0.7432517263025737,
"grad_norm": 0.48023710070315084,
"learning_rate": 9.351577123492087e-06,
"loss": 0.606,
"step": 1850
},
{
"epoch": 0.7472693032015066,
"grad_norm": 0.4562292402628347,
"learning_rate": 9.3400227544464e-06,
"loss": 0.6204,
"step": 1860
},
{
"epoch": 0.7512868801004394,
"grad_norm": 0.5199018316758173,
"learning_rate": 9.328373616740884e-06,
"loss": 0.6061,
"step": 1870
},
{
"epoch": 0.7553044569993722,
"grad_norm": 0.45403478079118165,
"learning_rate": 9.3166299647459e-06,
"loss": 0.5977,
"step": 1880
},
{
"epoch": 0.7593220338983051,
"grad_norm": 0.48026121094218754,
"learning_rate": 9.304792054895627e-06,
"loss": 0.6046,
"step": 1890
},
{
"epoch": 0.7633396107972379,
"grad_norm": 0.5273730174734627,
"learning_rate": 9.292860145682451e-06,
"loss": 0.6016,
"step": 1900
},
{
"epoch": 0.7673571876961708,
"grad_norm": 0.45635254897692096,
"learning_rate": 9.280834497651334e-06,
"loss": 0.6049,
"step": 1910
},
{
"epoch": 0.7713747645951036,
"grad_norm": 0.4952893916325239,
"learning_rate": 9.26871537339411e-06,
"loss": 0.6108,
"step": 1920
},
{
"epoch": 0.7753923414940365,
"grad_norm": 0.5020795358241874,
"learning_rate": 9.25650303754376e-06,
"loss": 0.6066,
"step": 1930
},
{
"epoch": 0.7794099183929692,
"grad_norm": 0.4519568302601775,
"learning_rate": 9.244197756768638e-06,
"loss": 0.6048,
"step": 1940
},
{
"epoch": 0.783427495291902,
"grad_norm": 0.4730934017356477,
"learning_rate": 9.231799799766633e-06,
"loss": 0.6205,
"step": 1950
},
{
"epoch": 0.7874450721908349,
"grad_norm": 0.43205824907881557,
"learning_rate": 9.219309437259312e-06,
"loss": 0.6094,
"step": 1960
},
{
"epoch": 0.7914626490897677,
"grad_norm": 0.49712695410471086,
"learning_rate": 9.206726941986012e-06,
"loss": 0.6177,
"step": 1970
},
{
"epoch": 0.7954802259887006,
"grad_norm": 0.5220660443409905,
"learning_rate": 9.194052588697877e-06,
"loss": 0.6101,
"step": 1980
},
{
"epoch": 0.7994978028876334,
"grad_norm": 0.5154062082113726,
"learning_rate": 9.18128665415186e-06,
"loss": 0.5928,
"step": 1990
},
{
"epoch": 0.8035153797865662,
"grad_norm": 0.466497275783982,
"learning_rate": 9.16842941710468e-06,
"loss": 0.5976,
"step": 2000
},
{
"epoch": 0.8075329566854991,
"grad_norm": 0.4710841642629808,
"learning_rate": 9.155481158306736e-06,
"loss": 0.5989,
"step": 2010
},
{
"epoch": 0.8115505335844319,
"grad_norm": 0.5011683049021086,
"learning_rate": 9.142442160495981e-06,
"loss": 0.602,
"step": 2020
},
{
"epoch": 0.8155681104833647,
"grad_norm": 0.48222917749993743,
"learning_rate": 9.129312708391735e-06,
"loss": 0.5991,
"step": 2030
},
{
"epoch": 0.8195856873822975,
"grad_norm": 0.5023833093181953,
"learning_rate": 9.116093088688486e-06,
"loss": 0.603,
"step": 2040
},
{
"epoch": 0.8236032642812304,
"grad_norm": 0.5057506213682262,
"learning_rate": 9.102783590049613e-06,
"loss": 0.6074,
"step": 2050
},
{
"epoch": 0.8276208411801632,
"grad_norm": 0.4465714041839266,
"learning_rate": 9.08938450310109e-06,
"loss": 0.6117,
"step": 2060
},
{
"epoch": 0.831638418079096,
"grad_norm": 0.45774188769330276,
"learning_rate": 9.075896120425144e-06,
"loss": 0.5982,
"step": 2070
},
{
"epoch": 0.8356559949780289,
"grad_norm": 0.4956125400496556,
"learning_rate": 9.06231873655386e-06,
"loss": 0.6131,
"step": 2080
},
{
"epoch": 0.8396735718769617,
"grad_norm": 0.503936504338912,
"learning_rate": 9.04865264796275e-06,
"loss": 0.6067,
"step": 2090
},
{
"epoch": 0.8436911487758946,
"grad_norm": 0.4966401695744208,
"learning_rate": 9.034898153064281e-06,
"loss": 0.5982,
"step": 2100
},
{
"epoch": 0.8477087256748274,
"grad_norm": 0.48606930633171735,
"learning_rate": 9.021055552201364e-06,
"loss": 0.6015,
"step": 2110
},
{
"epoch": 0.8517263025737603,
"grad_norm": 0.5102880215310355,
"learning_rate": 9.00712514764078e-06,
"loss": 0.6084,
"step": 2120
},
{
"epoch": 0.855743879472693,
"grad_norm": 0.5874497994476533,
"learning_rate": 8.993107243566599e-06,
"loss": 0.6014,
"step": 2130
},
{
"epoch": 0.8597614563716258,
"grad_norm": 0.45526275583074516,
"learning_rate": 8.979002146073526e-06,
"loss": 0.6047,
"step": 2140
},
{
"epoch": 0.8637790332705587,
"grad_norm": 0.44057367739611536,
"learning_rate": 8.964810163160218e-06,
"loss": 0.6023,
"step": 2150
},
{
"epoch": 0.8677966101694915,
"grad_norm": 0.47276989533109426,
"learning_rate": 8.95053160472256e-06,
"loss": 0.5996,
"step": 2160
},
{
"epoch": 0.8718141870684244,
"grad_norm": 0.47310493449075497,
"learning_rate": 8.936166782546907e-06,
"loss": 0.6053,
"step": 2170
},
{
"epoch": 0.8758317639673572,
"grad_norm": 0.4417798231937385,
"learning_rate": 8.921716010303255e-06,
"loss": 0.6075,
"step": 2180
},
{
"epoch": 0.87984934086629,
"grad_norm": 0.47514268261185605,
"learning_rate": 8.907179603538411e-06,
"loss": 0.5892,
"step": 2190
},
{
"epoch": 0.8838669177652229,
"grad_norm": 0.4830403204501155,
"learning_rate": 8.892557879669097e-06,
"loss": 0.5962,
"step": 2200
},
{
"epoch": 0.8878844946641556,
"grad_norm": 0.46322055953346064,
"learning_rate": 8.877851157975017e-06,
"loss": 0.6027,
"step": 2210
},
{
"epoch": 0.8919020715630885,
"grad_norm": 0.4722416444611542,
"learning_rate": 8.86305975959188e-06,
"loss": 0.5949,
"step": 2220
},
{
"epoch": 0.8959196484620213,
"grad_norm": 0.5039262176010645,
"learning_rate": 8.848184007504404e-06,
"loss": 0.5983,
"step": 2230
},
{
"epoch": 0.8999372253609542,
"grad_norm": 0.5011682351198766,
"learning_rate": 8.833224226539246e-06,
"loss": 0.5902,
"step": 2240
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.46884988549973994,
"learning_rate": 8.818180743357915e-06,
"loss": 0.6043,
"step": 2250
},
{
"epoch": 0.9079723791588198,
"grad_norm": 0.45627355824791144,
"learning_rate": 8.803053886449644e-06,
"loss": 0.609,
"step": 2260
},
{
"epoch": 0.9119899560577527,
"grad_norm": 0.4489855391892701,
"learning_rate": 8.787843986124214e-06,
"loss": 0.5945,
"step": 2270
},
{
"epoch": 0.9160075329566855,
"grad_norm": 0.5195815180128369,
"learning_rate": 8.772551374504736e-06,
"loss": 0.6032,
"step": 2280
},
{
"epoch": 0.9200251098556184,
"grad_norm": 0.4547182251787525,
"learning_rate": 8.757176385520406e-06,
"loss": 0.6071,
"step": 2290
},
{
"epoch": 0.9240426867545511,
"grad_norm": 0.4672155983783131,
"learning_rate": 8.741719354899214e-06,
"loss": 0.6026,
"step": 2300
},
{
"epoch": 0.928060263653484,
"grad_norm": 0.44811195796882736,
"learning_rate": 8.7261806201606e-06,
"loss": 0.5903,
"step": 2310
},
{
"epoch": 0.9320778405524168,
"grad_norm": 0.4609033648332187,
"learning_rate": 8.710560520608106e-06,
"loss": 0.5954,
"step": 2320
},
{
"epoch": 0.9360954174513496,
"grad_norm": 0.5031025381027067,
"learning_rate": 8.694859397321947e-06,
"loss": 0.5971,
"step": 2330
},
{
"epoch": 0.9401129943502825,
"grad_norm": 0.45508717131932036,
"learning_rate": 8.67907759315157e-06,
"loss": 0.6009,
"step": 2340
},
{
"epoch": 0.9441305712492153,
"grad_norm": 0.46492655160451346,
"learning_rate": 8.663215452708173e-06,
"loss": 0.5971,
"step": 2350
},
{
"epoch": 0.9481481481481482,
"grad_norm": 0.4891914193609098,
"learning_rate": 8.647273322357174e-06,
"loss": 0.5854,
"step": 2360
},
{
"epoch": 0.952165725047081,
"grad_norm": 0.45300916670077845,
"learning_rate": 8.631251550210645e-06,
"loss": 0.6073,
"step": 2370
},
{
"epoch": 0.9561833019460139,
"grad_norm": 0.46574627999413143,
"learning_rate": 8.61515048611972e-06,
"loss": 0.5973,
"step": 2380
},
{
"epoch": 0.9602008788449466,
"grad_norm": 0.46780579054245386,
"learning_rate": 8.598970481666949e-06,
"loss": 0.5903,
"step": 2390
},
{
"epoch": 0.9642184557438794,
"grad_norm": 0.49368155945672554,
"learning_rate": 8.582711890158622e-06,
"loss": 0.5918,
"step": 2400
},
{
"epoch": 0.9682360326428123,
"grad_norm": 0.4981441902973779,
"learning_rate": 8.566375066617056e-06,
"loss": 0.5849,
"step": 2410
},
{
"epoch": 0.9722536095417451,
"grad_norm": 0.4940426996715437,
"learning_rate": 8.549960367772836e-06,
"loss": 0.5983,
"step": 2420
},
{
"epoch": 0.976271186440678,
"grad_norm": 0.4785729651530905,
"learning_rate": 8.533468152057037e-06,
"loss": 0.5886,
"step": 2430
},
{
"epoch": 0.9802887633396108,
"grad_norm": 0.47987909811753693,
"learning_rate": 8.51689877959339e-06,
"loss": 0.5934,
"step": 2440
},
{
"epoch": 0.9843063402385436,
"grad_norm": 0.5163558607429957,
"learning_rate": 8.500252612190416e-06,
"loss": 0.5996,
"step": 2450
},
{
"epoch": 0.9883239171374765,
"grad_norm": 0.51101675304883,
"learning_rate": 8.48353001333353e-06,
"loss": 0.5914,
"step": 2460
},
{
"epoch": 0.9923414940364093,
"grad_norm": 0.5029778228075064,
"learning_rate": 8.466731348177106e-06,
"loss": 0.5941,
"step": 2470
},
{
"epoch": 0.9963590709353422,
"grad_norm": 0.4522455217876264,
"learning_rate": 8.4498569835365e-06,
"loss": 0.597,
"step": 2480
},
{
"epoch": 1.0,
"grad_norm": 0.4817425446921033,
"learning_rate": 8.432907287880033e-06,
"loss": 0.6044,
"step": 2490
},
{
"epoch": 1.0040175768989328,
"grad_norm": 0.5586942369932902,
"learning_rate": 8.415882631320963e-06,
"loss": 0.5356,
"step": 2500
},
{
"epoch": 1.0080351537978656,
"grad_norm": 0.4841445380910929,
"learning_rate": 8.398783385609386e-06,
"loss": 0.5324,
"step": 2510
},
{
"epoch": 1.0120527306967986,
"grad_norm": 0.5199870071061111,
"learning_rate": 8.38160992412413e-06,
"loss": 0.5408,
"step": 2520
},
{
"epoch": 1.0160703075957314,
"grad_norm": 0.4803423902226154,
"learning_rate": 8.364362621864595e-06,
"loss": 0.5454,
"step": 2530
},
{
"epoch": 1.0200878844946641,
"grad_norm": 0.5170354608787521,
"learning_rate": 8.347041855442565e-06,
"loss": 0.5438,
"step": 2540
},
{
"epoch": 1.024105461393597,
"grad_norm": 0.48879934808926573,
"learning_rate": 8.329648003073991e-06,
"loss": 0.5409,
"step": 2550
},
{
"epoch": 1.0281230382925297,
"grad_norm": 0.5352031093199895,
"learning_rate": 8.312181444570722e-06,
"loss": 0.5379,
"step": 2560
},
{
"epoch": 1.0321406151914627,
"grad_norm": 0.503404555579561,
"learning_rate": 8.29464256133222e-06,
"loss": 0.5339,
"step": 2570
},
{
"epoch": 1.0361581920903955,
"grad_norm": 0.5412849211631844,
"learning_rate": 8.277031736337229e-06,
"loss": 0.537,
"step": 2580
},
{
"epoch": 1.0401757689893283,
"grad_norm": 0.49624211963806875,
"learning_rate": 8.259349354135408e-06,
"loss": 0.5365,
"step": 2590
},
{
"epoch": 1.044193345888261,
"grad_norm": 0.47456260927212846,
"learning_rate": 8.241595800838945e-06,
"loss": 0.5331,
"step": 2600
},
{
"epoch": 1.048210922787194,
"grad_norm": 0.5253188216147088,
"learning_rate": 8.223771464114114e-06,
"loss": 0.5407,
"step": 2610
},
{
"epoch": 1.0522284996861269,
"grad_norm": 0.48560865787460045,
"learning_rate": 8.205876733172813e-06,
"loss": 0.5358,
"step": 2620
},
{
"epoch": 1.0562460765850596,
"grad_norm": 0.4552759788335426,
"learning_rate": 8.187911998764073e-06,
"loss": 0.5383,
"step": 2630
},
{
"epoch": 1.0602636534839924,
"grad_norm": 0.5238408302939632,
"learning_rate": 8.169877653165512e-06,
"loss": 0.5432,
"step": 2640
},
{
"epoch": 1.0642812303829252,
"grad_norm": 0.5143828197718291,
"learning_rate": 8.15177409017478e-06,
"loss": 0.5449,
"step": 2650
},
{
"epoch": 1.0682988072818582,
"grad_norm": 0.5003350510607426,
"learning_rate": 8.13360170510096e-06,
"loss": 0.5379,
"step": 2660
},
{
"epoch": 1.072316384180791,
"grad_norm": 0.4862183359830462,
"learning_rate": 8.115360894755928e-06,
"loss": 0.5313,
"step": 2670
},
{
"epoch": 1.0763339610797238,
"grad_norm": 0.525472490342403,
"learning_rate": 8.097052057445696e-06,
"loss": 0.5324,
"step": 2680
},
{
"epoch": 1.0803515379786566,
"grad_norm": 0.4609085774775871,
"learning_rate": 8.07867559296171e-06,
"loss": 0.5339,
"step": 2690
},
{
"epoch": 1.0843691148775894,
"grad_norm": 0.47498380053763667,
"learning_rate": 8.060231902572123e-06,
"loss": 0.5416,
"step": 2700
},
{
"epoch": 1.0883866917765224,
"grad_norm": 0.4774259144620562,
"learning_rate": 8.041721389013029e-06,
"loss": 0.5315,
"step": 2710
},
{
"epoch": 1.0924042686754551,
"grad_norm": 0.4535031254083697,
"learning_rate": 8.023144456479677e-06,
"loss": 0.5337,
"step": 2720
},
{
"epoch": 1.096421845574388,
"grad_norm": 0.45369844007905547,
"learning_rate": 8.004501510617631e-06,
"loss": 0.5286,
"step": 2730
},
{
"epoch": 1.1004394224733207,
"grad_norm": 0.47676753007555456,
"learning_rate": 7.985792958513932e-06,
"loss": 0.5316,
"step": 2740
},
{
"epoch": 1.1044569993722537,
"grad_norm": 0.49166946653996263,
"learning_rate": 7.967019208688187e-06,
"loss": 0.534,
"step": 2750
},
{
"epoch": 1.1084745762711865,
"grad_norm": 0.47818247256990665,
"learning_rate": 7.948180671083665e-06,
"loss": 0.5372,
"step": 2760
},
{
"epoch": 1.1124921531701193,
"grad_norm": 0.49991506616495146,
"learning_rate": 7.92927775705834e-06,
"loss": 0.5497,
"step": 2770
},
{
"epoch": 1.116509730069052,
"grad_norm": 0.5015448494254134,
"learning_rate": 7.910310879375906e-06,
"loss": 0.5335,
"step": 2780
},
{
"epoch": 1.1205273069679849,
"grad_norm": 0.5009679314406517,
"learning_rate": 7.891280452196767e-06,
"loss": 0.5349,
"step": 2790
},
{
"epoch": 1.1245448838669179,
"grad_norm": 0.46077740198691347,
"learning_rate": 7.872186891068997e-06,
"loss": 0.5474,
"step": 2800
},
{
"epoch": 1.1285624607658507,
"grad_norm": 0.47629174007493424,
"learning_rate": 7.85303061291925e-06,
"loss": 0.5352,
"step": 2810
},
{
"epoch": 1.1325800376647834,
"grad_norm": 0.5001826254949262,
"learning_rate": 7.833812036043684e-06,
"loss": 0.5253,
"step": 2820
},
{
"epoch": 1.1365976145637162,
"grad_norm": 0.48877881868647444,
"learning_rate": 7.814531580098799e-06,
"loss": 0.5405,
"step": 2830
},
{
"epoch": 1.140615191462649,
"grad_norm": 0.4525660243228381,
"learning_rate": 7.795189666092286e-06,
"loss": 0.5392,
"step": 2840
},
{
"epoch": 1.144632768361582,
"grad_norm": 0.5016062315339999,
"learning_rate": 7.77578671637384e-06,
"loss": 0.5392,
"step": 2850
},
{
"epoch": 1.1486503452605148,
"grad_norm": 0.46664182486781586,
"learning_rate": 7.756323154625927e-06,
"loss": 0.5307,
"step": 2860
},
{
"epoch": 1.1526679221594476,
"grad_norm": 0.4851667799865368,
"learning_rate": 7.736799405854531e-06,
"loss": 0.5249,
"step": 2870
},
{
"epoch": 1.1566854990583804,
"grad_norm": 0.44756010484495995,
"learning_rate": 7.71721589637989e-06,
"loss": 0.5423,
"step": 2880
},
{
"epoch": 1.1607030759573131,
"grad_norm": 0.4810612674475816,
"learning_rate": 7.697573053827163e-06,
"loss": 0.5346,
"step": 2890
},
{
"epoch": 1.1647206528562462,
"grad_norm": 0.5005415716085619,
"learning_rate": 7.677871307117117e-06,
"loss": 0.5277,
"step": 2900
},
{
"epoch": 1.168738229755179,
"grad_norm": 0.48046892345205033,
"learning_rate": 7.658111086456738e-06,
"loss": 0.5372,
"step": 2910
},
{
"epoch": 1.1727558066541117,
"grad_norm": 0.5231466496543029,
"learning_rate": 7.638292823329861e-06,
"loss": 0.5349,
"step": 2920
},
{
"epoch": 1.1767733835530445,
"grad_norm": 0.47426409377347806,
"learning_rate": 7.6184169504877195e-06,
"loss": 0.5335,
"step": 2930
},
{
"epoch": 1.1807909604519775,
"grad_norm": 0.4644152310984778,
"learning_rate": 7.598483901939525e-06,
"loss": 0.5375,
"step": 2940
},
{
"epoch": 1.1848085373509103,
"grad_norm": 0.5016059422510154,
"learning_rate": 7.5784941129429715e-06,
"loss": 0.5336,
"step": 2950
},
{
"epoch": 1.188826114249843,
"grad_norm": 0.4893646800410941,
"learning_rate": 7.558448019994733e-06,
"loss": 0.5427,
"step": 2960
},
{
"epoch": 1.1928436911487759,
"grad_norm": 0.4964262727258161,
"learning_rate": 7.5383460608209444e-06,
"loss": 0.5362,
"step": 2970
},
{
"epoch": 1.1968612680477086,
"grad_norm": 0.542942008787974,
"learning_rate": 7.518188674367628e-06,
"loss": 0.5474,
"step": 2980
},
{
"epoch": 1.2008788449466414,
"grad_norm": 0.5299626906544336,
"learning_rate": 7.497976300791114e-06,
"loss": 0.5431,
"step": 2990
},
{
"epoch": 1.2048964218455744,
"grad_norm": 0.45657822017276745,
"learning_rate": 7.477709381448436e-06,
"loss": 0.5207,
"step": 3000
},
{
"epoch": 1.2089139987445072,
"grad_norm": 0.5192739282525728,
"learning_rate": 7.457388358887682e-06,
"loss": 0.5389,
"step": 3010
},
{
"epoch": 1.21293157564344,
"grad_norm": 0.5108336845381785,
"learning_rate": 7.437013676838345e-06,
"loss": 0.5427,
"step": 3020
},
{
"epoch": 1.2169491525423728,
"grad_norm": 0.4490173262151658,
"learning_rate": 7.416585780201615e-06,
"loss": 0.541,
"step": 3030
},
{
"epoch": 1.2209667294413058,
"grad_norm": 0.4823167719289656,
"learning_rate": 7.396105115040684e-06,
"loss": 0.5396,
"step": 3040
},
{
"epoch": 1.2249843063402386,
"grad_norm": 0.4926557447124054,
"learning_rate": 7.37557212857099e-06,
"loss": 0.5413,
"step": 3050
},
{
"epoch": 1.2290018832391714,
"grad_norm": 0.4846891664452503,
"learning_rate": 7.3549872691504646e-06,
"loss": 0.5448,
"step": 3060
},
{
"epoch": 1.2330194601381042,
"grad_norm": 0.5250489652016472,
"learning_rate": 7.3343509862697295e-06,
"loss": 0.5402,
"step": 3070
},
{
"epoch": 1.237037037037037,
"grad_norm": 0.4588507155594117,
"learning_rate": 7.313663730542295e-06,
"loss": 0.5404,
"step": 3080
},
{
"epoch": 1.24105461393597,
"grad_norm": 0.5009891938060933,
"learning_rate": 7.292925953694705e-06,
"loss": 0.5363,
"step": 3090
},
{
"epoch": 1.2450721908349027,
"grad_norm": 0.5090944716207311,
"learning_rate": 7.272138108556691e-06,
"loss": 0.5284,
"step": 3100
},
{
"epoch": 1.2490897677338355,
"grad_norm": 0.449393206268348,
"learning_rate": 7.25130064905127e-06,
"loss": 0.5296,
"step": 3110
},
{
"epoch": 1.2531073446327683,
"grad_norm": 0.5002200548002171,
"learning_rate": 7.230414030184835e-06,
"loss": 0.531,
"step": 3120
},
{
"epoch": 1.2571249215317013,
"grad_norm": 0.5114004159834465,
"learning_rate": 7.209478708037225e-06,
"loss": 0.5458,
"step": 3130
},
{
"epoch": 1.261142498430634,
"grad_norm": 0.5166427993736156,
"learning_rate": 7.1884951397517664e-06,
"loss": 0.5309,
"step": 3140
},
{
"epoch": 1.2651600753295669,
"grad_norm": 0.4851353630236827,
"learning_rate": 7.167463783525282e-06,
"loss": 0.5375,
"step": 3150
},
{
"epoch": 1.2691776522284997,
"grad_norm": 0.508201816800996,
"learning_rate": 7.146385098598092e-06,
"loss": 0.5356,
"step": 3160
},
{
"epoch": 1.2731952291274324,
"grad_norm": 0.47326774384605347,
"learning_rate": 7.12525954524399e-06,
"loss": 0.5281,
"step": 3170
},
{
"epoch": 1.2772128060263652,
"grad_norm": 0.4964394626079393,
"learning_rate": 7.1040875847601775e-06,
"loss": 0.5339,
"step": 3180
},
{
"epoch": 1.2812303829252982,
"grad_norm": 0.4965199644569434,
"learning_rate": 7.082869679457214e-06,
"loss": 0.5373,
"step": 3190
},
{
"epoch": 1.285247959824231,
"grad_norm": 0.48647921670810396,
"learning_rate": 7.061606292648899e-06,
"loss": 0.5368,
"step": 3200
},
{
"epoch": 1.2892655367231638,
"grad_norm": 0.4709672064019513,
"learning_rate": 7.040297888642172e-06,
"loss": 0.5401,
"step": 3210
},
{
"epoch": 1.2932831136220968,
"grad_norm": 0.4710864047305743,
"learning_rate": 7.018944932726963e-06,
"loss": 0.538,
"step": 3220
},
{
"epoch": 1.2973006905210296,
"grad_norm": 0.507624692636882,
"learning_rate": 6.997547891166041e-06,
"loss": 0.5333,
"step": 3230
},
{
"epoch": 1.3013182674199624,
"grad_norm": 0.4692990883467198,
"learning_rate": 6.976107231184823e-06,
"loss": 0.5412,
"step": 3240
},
{
"epoch": 1.3053358443188952,
"grad_norm": 0.5133848969883468,
"learning_rate": 6.954623420961179e-06,
"loss": 0.5254,
"step": 3250
},
{
"epoch": 1.309353421217828,
"grad_norm": 0.5377405201588225,
"learning_rate": 6.933096929615211e-06,
"loss": 0.5304,
"step": 3260
},
{
"epoch": 1.3133709981167607,
"grad_norm": 0.4449757913251056,
"learning_rate": 6.911528227199e-06,
"loss": 0.5345,
"step": 3270
},
{
"epoch": 1.3173885750156937,
"grad_norm": 0.48594757234752356,
"learning_rate": 6.88991778468635e-06,
"loss": 0.5313,
"step": 3280
},
{
"epoch": 1.3214061519146265,
"grad_norm": 0.44216301043487133,
"learning_rate": 6.868266073962497e-06,
"loss": 0.5301,
"step": 3290
},
{
"epoch": 1.3254237288135593,
"grad_norm": 0.46671237233856316,
"learning_rate": 6.846573567813819e-06,
"loss": 0.5414,
"step": 3300
},
{
"epoch": 1.329441305712492,
"grad_norm": 0.4812496937883229,
"learning_rate": 6.8248407399174865e-06,
"loss": 0.5364,
"step": 3310
},
{
"epoch": 1.333458882611425,
"grad_norm": 0.5038329328787501,
"learning_rate": 6.803068064831149e-06,
"loss": 0.5425,
"step": 3320
},
{
"epoch": 1.3374764595103579,
"grad_norm": 0.45630407505785825,
"learning_rate": 6.781256017982555e-06,
"loss": 0.5367,
"step": 3330
},
{
"epoch": 1.3414940364092907,
"grad_norm": 0.49858693203976323,
"learning_rate": 6.759405075659165e-06,
"loss": 0.539,
"step": 3340
},
{
"epoch": 1.3455116133082234,
"grad_norm": 0.4890924664342059,
"learning_rate": 6.7375157149977755e-06,
"loss": 0.5206,
"step": 3350
},
{
"epoch": 1.3495291902071562,
"grad_norm": 0.4844858973506766,
"learning_rate": 6.715588413974073e-06,
"loss": 0.533,
"step": 3360
},
{
"epoch": 1.353546767106089,
"grad_norm": 0.456108199064706,
"learning_rate": 6.693623651392216e-06,
"loss": 0.54,
"step": 3370
},
{
"epoch": 1.357564344005022,
"grad_norm": 0.44821643945703865,
"learning_rate": 6.671621906874366e-06,
"loss": 0.5313,
"step": 3380
},
{
"epoch": 1.3615819209039548,
"grad_norm": 0.44366483367693854,
"learning_rate": 6.649583660850232e-06,
"loss": 0.5445,
"step": 3390
},
{
"epoch": 1.3655994978028876,
"grad_norm": 0.5007318603356307,
"learning_rate": 6.627509394546558e-06,
"loss": 0.5253,
"step": 3400
},
{
"epoch": 1.3696170747018206,
"grad_norm": 0.47701484999347654,
"learning_rate": 6.605399589976631e-06,
"loss": 0.5432,
"step": 3410
},
{
"epoch": 1.3736346516007534,
"grad_norm": 0.533932151873456,
"learning_rate": 6.583254729929756e-06,
"loss": 0.5362,
"step": 3420
},
{
"epoch": 1.3776522284996862,
"grad_norm": 0.4885239761359488,
"learning_rate": 6.5610752979607e-06,
"loss": 0.5393,
"step": 3430
},
{
"epoch": 1.381669805398619,
"grad_norm": 0.534453189520691,
"learning_rate": 6.538861778379147e-06,
"loss": 0.538,
"step": 3440
},
{
"epoch": 1.3856873822975517,
"grad_norm": 0.55879808060491,
"learning_rate": 6.516614656239115e-06,
"loss": 0.5379,
"step": 3450
},
{
"epoch": 1.3897049591964845,
"grad_norm": 0.5105051602697208,
"learning_rate": 6.49433441732837e-06,
"loss": 0.5434,
"step": 3460
},
{
"epoch": 1.3937225360954175,
"grad_norm": 0.5849851114309839,
"learning_rate": 6.472021548157812e-06,
"loss": 0.5309,
"step": 3470
},
{
"epoch": 1.3977401129943503,
"grad_norm": 0.4862484792197367,
"learning_rate": 6.4496765359508575e-06,
"loss": 0.5403,
"step": 3480
},
{
"epoch": 1.401757689893283,
"grad_norm": 0.4785944624308064,
"learning_rate": 6.427299868632795e-06,
"loss": 0.5315,
"step": 3490
},
{
"epoch": 1.4057752667922159,
"grad_norm": 0.5206432193262734,
"learning_rate": 6.404892034820134e-06,
"loss": 0.5363,
"step": 3500
},
{
"epoch": 1.4097928436911489,
"grad_norm": 0.5223412310453764,
"learning_rate": 6.382453523809939e-06,
"loss": 0.5409,
"step": 3510
},
{
"epoch": 1.4138104205900817,
"grad_norm": 0.5251307070269818,
"learning_rate": 6.359984825569138e-06,
"loss": 0.5286,
"step": 3520
},
{
"epoch": 1.4178279974890144,
"grad_norm": 0.512721201250328,
"learning_rate": 6.3374864307238235e-06,
"loss": 0.5261,
"step": 3530
},
{
"epoch": 1.4218455743879472,
"grad_norm": 0.5026376332917479,
"learning_rate": 6.3149588305485475e-06,
"loss": 0.5208,
"step": 3540
},
{
"epoch": 1.42586315128688,
"grad_norm": 0.4829796384358809,
"learning_rate": 6.2924025169555916e-06,
"loss": 0.5433,
"step": 3550
},
{
"epoch": 1.4298807281858128,
"grad_norm": 0.4957733273389861,
"learning_rate": 6.269817982484212e-06,
"loss": 0.529,
"step": 3560
},
{
"epoch": 1.4338983050847458,
"grad_norm": 0.45279662345918176,
"learning_rate": 6.247205720289907e-06,
"loss": 0.5292,
"step": 3570
},
{
"epoch": 1.4379158819836786,
"grad_norm": 0.45766889598494964,
"learning_rate": 6.224566224133632e-06,
"loss": 0.5358,
"step": 3580
},
{
"epoch": 1.4419334588826114,
"grad_norm": 0.4809643743051477,
"learning_rate": 6.201899988371022e-06,
"loss": 0.544,
"step": 3590
},
{
"epoch": 1.4459510357815444,
"grad_norm": 0.5019231578335686,
"learning_rate": 6.1792075079416e-06,
"loss": 0.5357,
"step": 3600
},
{
"epoch": 1.4499686126804772,
"grad_norm": 0.4966841368294494,
"learning_rate": 6.156489278357967e-06,
"loss": 0.5315,
"step": 3610
},
{
"epoch": 1.45398618957941,
"grad_norm": 0.5111737253668558,
"learning_rate": 6.1337457956949774e-06,
"loss": 0.5231,
"step": 3620
},
{
"epoch": 1.4580037664783427,
"grad_norm": 0.5050504504365466,
"learning_rate": 6.1109775565789164e-06,
"loss": 0.5354,
"step": 3630
},
{
"epoch": 1.4620213433772755,
"grad_norm": 0.49236484658873236,
"learning_rate": 6.0881850581766515e-06,
"loss": 0.5243,
"step": 3640
},
{
"epoch": 1.4660389202762083,
"grad_norm": 0.4748265818153898,
"learning_rate": 6.065368798184771e-06,
"loss": 0.5391,
"step": 3650
},
{
"epoch": 1.4700564971751413,
"grad_norm": 0.5182967229817997,
"learning_rate": 6.042529274818724e-06,
"loss": 0.5294,
"step": 3660
},
{
"epoch": 1.474074074074074,
"grad_norm": 0.4752424911930453,
"learning_rate": 6.019666986801936e-06,
"loss": 0.5281,
"step": 3670
},
{
"epoch": 1.4780916509730069,
"grad_norm": 0.47120824092970764,
"learning_rate": 5.996782433354923e-06,
"loss": 0.5253,
"step": 3680
},
{
"epoch": 1.4821092278719397,
"grad_norm": 0.46585234442500195,
"learning_rate": 5.973876114184388e-06,
"loss": 0.5202,
"step": 3690
},
{
"epoch": 1.4861268047708727,
"grad_norm": 0.47339586038582876,
"learning_rate": 5.95094852947231e-06,
"loss": 0.5288,
"step": 3700
},
{
"epoch": 1.4901443816698055,
"grad_norm": 0.44170633595442677,
"learning_rate": 5.928000179865024e-06,
"loss": 0.531,
"step": 3710
},
{
"epoch": 1.4941619585687382,
"grad_norm": 0.509631075940108,
"learning_rate": 5.905031566462279e-06,
"loss": 0.5371,
"step": 3720
},
{
"epoch": 1.498179535467671,
"grad_norm": 0.4730072164920337,
"learning_rate": 5.882043190806314e-06,
"loss": 0.5275,
"step": 3730
},
{
"epoch": 1.5021971123666038,
"grad_norm": 0.4821028115884746,
"learning_rate": 5.859035554870893e-06,
"loss": 0.5337,
"step": 3740
},
{
"epoch": 1.5062146892655366,
"grad_norm": 0.4834994282281791,
"learning_rate": 5.836009161050342e-06,
"loss": 0.5289,
"step": 3750
},
{
"epoch": 1.5102322661644696,
"grad_norm": 0.45785972034921696,
"learning_rate": 5.812964512148589e-06,
"loss": 0.5399,
"step": 3760
},
{
"epoch": 1.5142498430634024,
"grad_norm": 0.4766787260315672,
"learning_rate": 5.78990211136818e-06,
"loss": 0.538,
"step": 3770
},
{
"epoch": 1.5182674199623352,
"grad_norm": 0.4892932237467062,
"learning_rate": 5.766822462299286e-06,
"loss": 0.5393,
"step": 3780
},
{
"epoch": 1.5222849968612682,
"grad_norm": 0.4837638271264737,
"learning_rate": 5.743726068908717e-06,
"loss": 0.5229,
"step": 3790
},
{
"epoch": 1.526302573760201,
"grad_norm": 0.4868620820757227,
"learning_rate": 5.72061343552891e-06,
"loss": 0.5353,
"step": 3800
},
{
"epoch": 1.5303201506591337,
"grad_norm": 0.49287861664200744,
"learning_rate": 5.697485066846914e-06,
"loss": 0.5407,
"step": 3810
},
{
"epoch": 1.5343377275580665,
"grad_norm": 0.5070216858712217,
"learning_rate": 5.674341467893378e-06,
"loss": 0.5322,
"step": 3820
},
{
"epoch": 1.5383553044569993,
"grad_norm": 0.48075109583598735,
"learning_rate": 5.6511831440315215e-06,
"loss": 0.5318,
"step": 3830
},
{
"epoch": 1.542372881355932,
"grad_norm": 0.487828149268802,
"learning_rate": 5.628010600946088e-06,
"loss": 0.5367,
"step": 3840
},
{
"epoch": 1.5463904582548649,
"grad_norm": 0.4434100223228771,
"learning_rate": 5.604824344632319e-06,
"loss": 0.5413,
"step": 3850
},
{
"epoch": 1.5504080351537979,
"grad_norm": 0.46224887215867433,
"learning_rate": 5.581624881384897e-06,
"loss": 0.5287,
"step": 3860
},
{
"epoch": 1.5544256120527307,
"grad_norm": 0.5122729795251854,
"learning_rate": 5.55841271778689e-06,
"loss": 0.5365,
"step": 3870
},
{
"epoch": 1.5584431889516637,
"grad_norm": 0.690970740866929,
"learning_rate": 5.535188360698687e-06,
"loss": 0.5467,
"step": 3880
},
{
"epoch": 1.5624607658505965,
"grad_norm": 0.4794120185813089,
"learning_rate": 5.511952317246941e-06,
"loss": 0.5348,
"step": 3890
},
{
"epoch": 1.5664783427495292,
"grad_norm": 0.4818371908690834,
"learning_rate": 5.4887050948134825e-06,
"loss": 0.5412,
"step": 3900
},
{
"epoch": 1.570495919648462,
"grad_norm": 0.486538488375387,
"learning_rate": 5.465447201024248e-06,
"loss": 0.5362,
"step": 3910
},
{
"epoch": 1.5745134965473948,
"grad_norm": 0.5061137169976885,
"learning_rate": 5.442179143738193e-06,
"loss": 0.5363,
"step": 3920
},
{
"epoch": 1.5785310734463276,
"grad_norm": 0.46226895825091646,
"learning_rate": 5.418901431036205e-06,
"loss": 0.5277,
"step": 3930
},
{
"epoch": 1.5825486503452604,
"grad_norm": 0.49850901564672195,
"learning_rate": 5.395614571210004e-06,
"loss": 0.5253,
"step": 3940
},
{
"epoch": 1.5865662272441934,
"grad_norm": 0.49839262038652726,
"learning_rate": 5.372319072751046e-06,
"loss": 0.5217,
"step": 3950
},
{
"epoch": 1.5905838041431262,
"grad_norm": 0.4540519023429122,
"learning_rate": 5.349015444339429e-06,
"loss": 0.5174,
"step": 3960
},
{
"epoch": 1.5946013810420592,
"grad_norm": 0.4615246890403801,
"learning_rate": 5.325704194832759e-06,
"loss": 0.5399,
"step": 3970
},
{
"epoch": 1.598618957940992,
"grad_norm": 0.5069766547949516,
"learning_rate": 5.302385833255076e-06,
"loss": 0.5377,
"step": 3980
},
{
"epoch": 1.6026365348399247,
"grad_norm": 0.5034072911043822,
"learning_rate": 5.2790608687857034e-06,
"loss": 0.5312,
"step": 3990
},
{
"epoch": 1.6066541117388575,
"grad_norm": 0.478063165462391,
"learning_rate": 5.2557298107481536e-06,
"loss": 0.5235,
"step": 4000
},
{
"epoch": 1.6106716886377903,
"grad_norm": 0.5051927530264109,
"learning_rate": 5.2323931685989945e-06,
"loss": 0.5282,
"step": 4010
},
{
"epoch": 1.614689265536723,
"grad_norm": 0.449944668715227,
"learning_rate": 5.209051451916733e-06,
"loss": 0.5391,
"step": 4020
},
{
"epoch": 1.6187068424356559,
"grad_norm": 0.4987609704482517,
"learning_rate": 5.185705170390677e-06,
"loss": 0.5401,
"step": 4030
},
{
"epoch": 1.6227244193345887,
"grad_norm": 0.5129818470578882,
"learning_rate": 5.162354833809815e-06,
"loss": 0.5389,
"step": 4040
},
{
"epoch": 1.6267419962335217,
"grad_norm": 0.46834889576653455,
"learning_rate": 5.139000952051686e-06,
"loss": 0.551,
"step": 4050
},
{
"epoch": 1.6307595731324545,
"grad_norm": 0.5100548420871484,
"learning_rate": 5.115644035071234e-06,
"loss": 0.5353,
"step": 4060
},
{
"epoch": 1.6347771500313875,
"grad_norm": 0.5091440448579789,
"learning_rate": 5.0922845928896865e-06,
"loss": 0.5312,
"step": 4070
},
{
"epoch": 1.6387947269303202,
"grad_norm": 0.5011348467216399,
"learning_rate": 5.068923135583405e-06,
"loss": 0.5379,
"step": 4080
},
{
"epoch": 1.642812303829253,
"grad_norm": 0.4879211850299191,
"learning_rate": 5.04556017327276e-06,
"loss": 0.5259,
"step": 4090
},
{
"epoch": 1.6468298807281858,
"grad_norm": 0.47580521291496164,
"learning_rate": 5.022196216110978e-06,
"loss": 0.5264,
"step": 4100
},
{
"epoch": 1.6508474576271186,
"grad_norm": 0.4836039036319484,
"learning_rate": 4.998831774273016e-06,
"loss": 0.5245,
"step": 4110
},
{
"epoch": 1.6548650345260514,
"grad_norm": 0.45734991734522173,
"learning_rate": 4.975467357944412e-06,
"loss": 0.5347,
"step": 4120
},
{
"epoch": 1.6588826114249842,
"grad_norm": 0.45580879464211926,
"learning_rate": 4.9521034773101405e-06,
"loss": 0.5281,
"step": 4130
},
{
"epoch": 1.6629001883239172,
"grad_norm": 0.5088749078327436,
"learning_rate": 4.928740642543491e-06,
"loss": 0.5203,
"step": 4140
},
{
"epoch": 1.66691776522285,
"grad_norm": 0.5023597172357365,
"learning_rate": 4.905379363794907e-06,
"loss": 0.5323,
"step": 4150
},
{
"epoch": 1.670935342121783,
"grad_norm": 0.5160005322831623,
"learning_rate": 4.882020151180852e-06,
"loss": 0.5354,
"step": 4160
},
{
"epoch": 1.6749529190207157,
"grad_norm": 0.48241576610764997,
"learning_rate": 4.858663514772684e-06,
"loss": 0.5256,
"step": 4170
},
{
"epoch": 1.6789704959196485,
"grad_norm": 0.4350265055622632,
"learning_rate": 4.8353099645855e-06,
"loss": 0.5343,
"step": 4180
},
{
"epoch": 1.6829880728185813,
"grad_norm": 0.5156247900477684,
"learning_rate": 4.811960010567005e-06,
"loss": 0.5235,
"step": 4190
},
{
"epoch": 1.687005649717514,
"grad_norm": 0.49845097220709156,
"learning_rate": 4.788614162586379e-06,
"loss": 0.5311,
"step": 4200
},
{
"epoch": 1.6910232266164469,
"grad_norm": 0.4684264084420117,
"learning_rate": 4.76527293042315e-06,
"loss": 0.5361,
"step": 4210
},
{
"epoch": 1.6950408035153797,
"grad_norm": 0.5161852081555511,
"learning_rate": 4.741936823756046e-06,
"loss": 0.5207,
"step": 4220
},
{
"epoch": 1.6990583804143125,
"grad_norm": 0.6119074989682534,
"learning_rate": 4.718606352151874e-06,
"loss": 0.5221,
"step": 4230
},
{
"epoch": 1.7030759573132455,
"grad_norm": 0.4419977523098354,
"learning_rate": 4.695282025054406e-06,
"loss": 0.5336,
"step": 4240
},
{
"epoch": 1.7070935342121782,
"grad_norm": 0.4776310585207038,
"learning_rate": 4.671964351773229e-06,
"loss": 0.5254,
"step": 4250
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.4362671430805064,
"learning_rate": 4.648653841472643e-06,
"loss": 0.5368,
"step": 4260
},
{
"epoch": 1.715128688010044,
"grad_norm": 0.4926985303907698,
"learning_rate": 4.625351003160539e-06,
"loss": 0.529,
"step": 4270
},
{
"epoch": 1.7191462649089768,
"grad_norm": 0.5037843279607946,
"learning_rate": 4.60205634567728e-06,
"loss": 0.5266,
"step": 4280
},
{
"epoch": 1.7231638418079096,
"grad_norm": 0.48300010587173975,
"learning_rate": 4.578770377684593e-06,
"loss": 0.5308,
"step": 4290
},
{
"epoch": 1.7271814187068424,
"grad_norm": 0.4952905848146038,
"learning_rate": 4.555493607654463e-06,
"loss": 0.5348,
"step": 4300
},
{
"epoch": 1.7311989956057752,
"grad_norm": 0.509232593416316,
"learning_rate": 4.532226543858025e-06,
"loss": 0.5363,
"step": 4310
},
{
"epoch": 1.735216572504708,
"grad_norm": 0.5186412413734403,
"learning_rate": 4.508969694354472e-06,
"loss": 0.5158,
"step": 4320
},
{
"epoch": 1.739234149403641,
"grad_norm": 0.4648730018824965,
"learning_rate": 4.485723566979959e-06,
"loss": 0.5205,
"step": 4330
},
{
"epoch": 1.7432517263025737,
"grad_norm": 0.487919260567548,
"learning_rate": 4.462488669336507e-06,
"loss": 0.5292,
"step": 4340
},
{
"epoch": 1.7472693032015068,
"grad_norm": 0.4741644363249272,
"learning_rate": 4.439265508780932e-06,
"loss": 0.5283,
"step": 4350
},
{
"epoch": 1.7512868801004395,
"grad_norm": 0.49035056338707034,
"learning_rate": 4.416054592413755e-06,
"loss": 0.538,
"step": 4360
},
{
"epoch": 1.7553044569993723,
"grad_norm": 0.4755513975974018,
"learning_rate": 4.392856427068132e-06,
"loss": 0.5297,
"step": 4370
},
{
"epoch": 1.759322033898305,
"grad_norm": 0.46435677929151326,
"learning_rate": 4.3696715192987904e-06,
"loss": 0.5247,
"step": 4380
},
{
"epoch": 1.7633396107972379,
"grad_norm": 0.48979753506487095,
"learning_rate": 4.346500375370966e-06,
"loss": 0.5165,
"step": 4390
},
{
"epoch": 1.7673571876961707,
"grad_norm": 0.4487673109128978,
"learning_rate": 4.323343501249346e-06,
"loss": 0.5317,
"step": 4400
},
{
"epoch": 1.7713747645951035,
"grad_norm": 0.5113864337117118,
"learning_rate": 4.300201402587019e-06,
"loss": 0.5382,
"step": 4410
},
{
"epoch": 1.7753923414940365,
"grad_norm": 0.483652205814584,
"learning_rate": 4.277074584714447e-06,
"loss": 0.5311,
"step": 4420
},
{
"epoch": 1.7794099183929692,
"grad_norm": 0.4759761657301343,
"learning_rate": 4.253963552628411e-06,
"loss": 0.5351,
"step": 4430
},
{
"epoch": 1.783427495291902,
"grad_norm": 0.4821299331080685,
"learning_rate": 4.230868810980997e-06,
"loss": 0.5342,
"step": 4440
},
{
"epoch": 1.787445072190835,
"grad_norm": 0.5245032360028585,
"learning_rate": 4.207790864068573e-06,
"loss": 0.5237,
"step": 4450
},
{
"epoch": 1.7914626490897678,
"grad_norm": 0.49328875619112256,
"learning_rate": 4.184730215820782e-06,
"loss": 0.5317,
"step": 4460
},
{
"epoch": 1.7954802259887006,
"grad_norm": 0.5008680438944126,
"learning_rate": 4.161687369789526e-06,
"loss": 0.517,
"step": 4470
},
{
"epoch": 1.7994978028876334,
"grad_norm": 0.47287999035048983,
"learning_rate": 4.138662829137984e-06,
"loss": 0.5327,
"step": 4480
},
{
"epoch": 1.8035153797865662,
"grad_norm": 0.49099568575427033,
"learning_rate": 4.115657096629615e-06,
"loss": 0.5302,
"step": 4490
},
{
"epoch": 1.807532956685499,
"grad_norm": 0.4518440123720032,
"learning_rate": 4.092670674617187e-06,
"loss": 0.5153,
"step": 4500
},
{
"epoch": 1.8115505335844317,
"grad_norm": 0.4893643353710452,
"learning_rate": 4.069704065031804e-06,
"loss": 0.5354,
"step": 4510
},
{
"epoch": 1.8155681104833647,
"grad_norm": 0.4938753269976269,
"learning_rate": 4.0467577693719436e-06,
"loss": 0.5304,
"step": 4520
},
{
"epoch": 1.8195856873822975,
"grad_norm": 0.48759036445701953,
"learning_rate": 4.023832288692512e-06,
"loss": 0.5333,
"step": 4530
},
{
"epoch": 1.8236032642812305,
"grad_norm": 0.48751563543723775,
"learning_rate": 4.000928123593898e-06,
"loss": 0.5385,
"step": 4540
},
{
"epoch": 1.8276208411801633,
"grad_norm": 0.512128536559346,
"learning_rate": 3.978045774211043e-06,
"loss": 0.5438,
"step": 4550
},
{
"epoch": 1.831638418079096,
"grad_norm": 0.48148117588240913,
"learning_rate": 3.9551857402025215e-06,
"loss": 0.5321,
"step": 4560
},
{
"epoch": 1.835655994978029,
"grad_norm": 0.5036763909061966,
"learning_rate": 3.932348520739633e-06,
"loss": 0.5321,
"step": 4570
},
{
"epoch": 1.8396735718769617,
"grad_norm": 0.4850093548850179,
"learning_rate": 3.909534614495495e-06,
"loss": 0.5212,
"step": 4580
},
{
"epoch": 1.8436911487758945,
"grad_norm": 0.5089724945679945,
"learning_rate": 3.886744519634157e-06,
"loss": 0.526,
"step": 4590
},
{
"epoch": 1.8477087256748272,
"grad_norm": 0.4736889376864286,
"learning_rate": 3.86397873379973e-06,
"loss": 0.5355,
"step": 4600
},
{
"epoch": 1.8517263025737603,
"grad_norm": 0.48275206618553307,
"learning_rate": 3.841237754105508e-06,
"loss": 0.5375,
"step": 4610
},
{
"epoch": 1.855743879472693,
"grad_norm": 0.5070260134880437,
"learning_rate": 3.818522077123119e-06,
"loss": 0.5256,
"step": 4620
},
{
"epoch": 1.8597614563716258,
"grad_norm": 0.48382223029891325,
"learning_rate": 3.795832198871682e-06,
"loss": 0.5272,
"step": 4630
},
{
"epoch": 1.8637790332705588,
"grad_norm": 0.4533070548630681,
"learning_rate": 3.7731686148069768e-06,
"loss": 0.529,
"step": 4640
},
{
"epoch": 1.8677966101694916,
"grad_norm": 0.5136994231340827,
"learning_rate": 3.7505318198106226e-06,
"loss": 0.5259,
"step": 4650
},
{
"epoch": 1.8718141870684244,
"grad_norm": 0.4891813686506932,
"learning_rate": 3.727922308179275e-06,
"loss": 0.528,
"step": 4660
},
{
"epoch": 1.8758317639673572,
"grad_norm": 0.4784464468836338,
"learning_rate": 3.7053405736138228e-06,
"loss": 0.5239,
"step": 4670
},
{
"epoch": 1.87984934086629,
"grad_norm": 0.4439894514511087,
"learning_rate": 3.6827871092086283e-06,
"loss": 0.5278,
"step": 4680
},
{
"epoch": 1.8838669177652227,
"grad_norm": 0.42133586965144204,
"learning_rate": 3.6602624074407354e-06,
"loss": 0.525,
"step": 4690
},
{
"epoch": 1.8878844946641555,
"grad_norm": 0.4452525569153129,
"learning_rate": 3.6377669601591314e-06,
"loss": 0.5271,
"step": 4700
},
{
"epoch": 1.8919020715630885,
"grad_norm": 0.4781861294441116,
"learning_rate": 3.615301258574009e-06,
"loss": 0.5244,
"step": 4710
},
{
"epoch": 1.8959196484620213,
"grad_norm": 0.442730967082675,
"learning_rate": 3.5928657932460252e-06,
"loss": 0.5245,
"step": 4720
},
{
"epoch": 1.8999372253609543,
"grad_norm": 0.45356827607495354,
"learning_rate": 3.5704610540756035e-06,
"loss": 0.5226,
"step": 4730
},
{
"epoch": 1.9039548022598871,
"grad_norm": 0.44691195704566716,
"learning_rate": 3.5480875302922296e-06,
"loss": 0.5383,
"step": 4740
},
{
"epoch": 1.90797237915882,
"grad_norm": 0.48059730824092567,
"learning_rate": 3.525745710443774e-06,
"loss": 0.5224,
"step": 4750
},
{
"epoch": 1.9119899560577527,
"grad_norm": 0.447518168596057,
"learning_rate": 3.503436082385817e-06,
"loss": 0.529,
"step": 4760
},
{
"epoch": 1.9160075329566855,
"grad_norm": 0.45980480004977614,
"learning_rate": 3.4811591332710003e-06,
"loss": 0.5283,
"step": 4770
},
{
"epoch": 1.9200251098556183,
"grad_norm": 0.4641653971039642,
"learning_rate": 3.4589153495383916e-06,
"loss": 0.524,
"step": 4780
},
{
"epoch": 1.924042686754551,
"grad_norm": 0.4608734211286836,
"learning_rate": 3.4367052169028557e-06,
"loss": 0.5154,
"step": 4790
},
{
"epoch": 1.928060263653484,
"grad_norm": 0.45628159677386826,
"learning_rate": 3.414529220344455e-06,
"loss": 0.5246,
"step": 4800
},
{
"epoch": 1.9320778405524168,
"grad_norm": 0.4571910097492241,
"learning_rate": 3.3923878440978563e-06,
"loss": 0.5355,
"step": 4810
},
{
"epoch": 1.9360954174513496,
"grad_norm": 0.4701204963184072,
"learning_rate": 3.370281571641759e-06,
"loss": 0.519,
"step": 4820
},
{
"epoch": 1.9401129943502826,
"grad_norm": 0.4644809253665488,
"learning_rate": 3.348210885688337e-06,
"loss": 0.5444,
"step": 4830
},
{
"epoch": 1.9441305712492154,
"grad_norm": 0.4505954165735258,
"learning_rate": 3.3261762681726955e-06,
"loss": 0.5288,
"step": 4840
},
{
"epoch": 1.9481481481481482,
"grad_norm": 0.5016011741209488,
"learning_rate": 3.304178200242351e-06,
"loss": 0.5279,
"step": 4850
},
{
"epoch": 1.952165725047081,
"grad_norm": 0.48172105905267193,
"learning_rate": 3.282217162246726e-06,
"loss": 0.5331,
"step": 4860
},
{
"epoch": 1.9561833019460138,
"grad_norm": 0.45055441596952966,
"learning_rate": 3.260293633726656e-06,
"loss": 0.5312,
"step": 4870
},
{
"epoch": 1.9602008788449465,
"grad_norm": 0.4553856152933495,
"learning_rate": 3.2384080934039193e-06,
"loss": 0.5301,
"step": 4880
},
{
"epoch": 1.9642184557438793,
"grad_norm": 0.468059642341407,
"learning_rate": 3.2165610191707872e-06,
"loss": 0.5265,
"step": 4890
},
{
"epoch": 1.9682360326428123,
"grad_norm": 0.42050027272205776,
"learning_rate": 3.194752888079585e-06,
"loss": 0.5212,
"step": 4900
},
{
"epoch": 1.9722536095417451,
"grad_norm": 0.49679149221807994,
"learning_rate": 3.1729841763322776e-06,
"loss": 0.5298,
"step": 4910
},
{
"epoch": 1.9762711864406781,
"grad_norm": 0.4716285847340891,
"learning_rate": 3.1512553592700622e-06,
"loss": 0.5203,
"step": 4920
},
{
"epoch": 1.980288763339611,
"grad_norm": 0.4740722369561679,
"learning_rate": 3.129566911363009e-06,
"loss": 0.5208,
"step": 4930
},
{
"epoch": 1.9843063402385437,
"grad_norm": 0.4553213513296392,
"learning_rate": 3.1079193061996803e-06,
"loss": 0.5241,
"step": 4940
},
{
"epoch": 1.9883239171374765,
"grad_norm": 0.48169953080880973,
"learning_rate": 3.086313016476794e-06,
"loss": 0.5418,
"step": 4950
},
{
"epoch": 1.9923414940364093,
"grad_norm": 0.5764706666287279,
"learning_rate": 3.0647485139889145e-06,
"loss": 0.5259,
"step": 4960
},
{
"epoch": 1.996359070935342,
"grad_norm": 0.43220459247826987,
"learning_rate": 3.0432262696181336e-06,
"loss": 0.522,
"step": 4970
},
{
"epoch": 2.0,
"grad_norm": 0.4710113996508926,
"learning_rate": 3.0217467533237956e-06,
"loss": 0.5142,
"step": 4980
},
{
"epoch": 2.004017576898933,
"grad_norm": 0.5202233442297398,
"learning_rate": 3.000310434132237e-06,
"loss": 0.4811,
"step": 4990
},
{
"epoch": 2.0080351537978656,
"grad_norm": 0.5384401881895943,
"learning_rate": 2.9789177801265455e-06,
"loss": 0.4769,
"step": 5000
},
{
"epoch": 2.0120527306967984,
"grad_norm": 0.4756295523176075,
"learning_rate": 2.9575692584363337e-06,
"loss": 0.4755,
"step": 5010
},
{
"epoch": 2.016070307595731,
"grad_norm": 0.5172206673937627,
"learning_rate": 2.9362653352275405e-06,
"loss": 0.4813,
"step": 5020
},
{
"epoch": 2.0200878844946644,
"grad_norm": 0.49483001522371817,
"learning_rate": 2.915006475692256e-06,
"loss": 0.472,
"step": 5030
},
{
"epoch": 2.024105461393597,
"grad_norm": 0.4704791978578869,
"learning_rate": 2.89379314403856e-06,
"loss": 0.4747,
"step": 5040
},
{
"epoch": 2.02812303829253,
"grad_norm": 0.5050782711725152,
"learning_rate": 2.8726258034803866e-06,
"loss": 0.4794,
"step": 5050
},
{
"epoch": 2.0321406151914627,
"grad_norm": 0.4712269294430662,
"learning_rate": 2.8515049162274057e-06,
"loss": 0.4722,
"step": 5060
},
{
"epoch": 2.0361581920903955,
"grad_norm": 0.48881220535426967,
"learning_rate": 2.83043094347494e-06,
"loss": 0.4678,
"step": 5070
},
{
"epoch": 2.0401757689893283,
"grad_norm": 0.48771842977610164,
"learning_rate": 2.8094043453938844e-06,
"loss": 0.4665,
"step": 5080
},
{
"epoch": 2.044193345888261,
"grad_norm": 0.5202925253687711,
"learning_rate": 2.7884255811206584e-06,
"loss": 0.4763,
"step": 5090
},
{
"epoch": 2.048210922787194,
"grad_norm": 0.46874669055521,
"learning_rate": 2.7674951087471858e-06,
"loss": 0.4833,
"step": 5100
},
{
"epoch": 2.0522284996861266,
"grad_norm": 0.47453927342332336,
"learning_rate": 2.7466133853108935e-06,
"loss": 0.4598,
"step": 5110
},
{
"epoch": 2.0562460765850594,
"grad_norm": 0.45817909400058926,
"learning_rate": 2.725780866784722e-06,
"loss": 0.4719,
"step": 5120
},
{
"epoch": 2.0602636534839927,
"grad_norm": 0.5261764803454092,
"learning_rate": 2.704998008067177e-06,
"loss": 0.4634,
"step": 5130
},
{
"epoch": 2.0642812303829254,
"grad_norm": 0.47433900887045005,
"learning_rate": 2.6842652629723907e-06,
"loss": 0.4785,
"step": 5140
},
{
"epoch": 2.068298807281858,
"grad_norm": 0.4711156322673265,
"learning_rate": 2.6635830842202182e-06,
"loss": 0.4625,
"step": 5150
},
{
"epoch": 2.072316384180791,
"grad_norm": 0.4620663705804988,
"learning_rate": 2.642951923426348e-06,
"loss": 0.4775,
"step": 5160
},
{
"epoch": 2.076333961079724,
"grad_norm": 0.4617355126174571,
"learning_rate": 2.622372231092437e-06,
"loss": 0.4817,
"step": 5170
},
{
"epoch": 2.0803515379786566,
"grad_norm": 0.5411857216699084,
"learning_rate": 2.6018444565962885e-06,
"loss": 0.4731,
"step": 5180
},
{
"epoch": 2.0843691148775894,
"grad_norm": 0.4943277164316023,
"learning_rate": 2.5813690481820184e-06,
"loss": 0.4693,
"step": 5190
},
{
"epoch": 2.088386691776522,
"grad_norm": 0.5102055113637535,
"learning_rate": 2.5609464529502815e-06,
"loss": 0.4805,
"step": 5200
},
{
"epoch": 2.092404268675455,
"grad_norm": 0.49132645013613546,
"learning_rate": 2.540577116848505e-06,
"loss": 0.4694,
"step": 5210
},
{
"epoch": 2.096421845574388,
"grad_norm": 0.49577954933166757,
"learning_rate": 2.52026148466115e-06,
"loss": 0.4825,
"step": 5220
},
{
"epoch": 2.100439422473321,
"grad_norm": 0.4840304988230105,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.4775,
"step": 5230
},
{
"epoch": 2.1044569993722537,
"grad_norm": 0.5036218210876587,
"learning_rate": 2.4797931052944755e-06,
"loss": 0.472,
"step": 5240
},
{
"epoch": 2.1084745762711865,
"grad_norm": 0.4502304265079634,
"learning_rate": 2.4596412417819708e-06,
"loss": 0.4685,
"step": 5250
},
{
"epoch": 2.1124921531701193,
"grad_norm": 0.5218870877367079,
"learning_rate": 2.4395448494982198e-06,
"loss": 0.4817,
"step": 5260
},
{
"epoch": 2.116509730069052,
"grad_norm": 0.49197993587084365,
"learning_rate": 2.419504367267689e-06,
"loss": 0.4744,
"step": 5270
},
{
"epoch": 2.120527306967985,
"grad_norm": 0.5349529463351869,
"learning_rate": 2.3995202326939866e-06,
"loss": 0.4872,
"step": 5280
},
{
"epoch": 2.1245448838669176,
"grad_norm": 0.48741903300666545,
"learning_rate": 2.3795928821503275e-06,
"loss": 0.4688,
"step": 5290
},
{
"epoch": 2.1285624607658504,
"grad_norm": 0.4999227078942744,
"learning_rate": 2.359722750769981e-06,
"loss": 0.4793,
"step": 5300
},
{
"epoch": 2.132580037664783,
"grad_norm": 0.45470893654183736,
"learning_rate": 2.339910272436782e-06,
"loss": 0.4755,
"step": 5310
},
{
"epoch": 2.1365976145637164,
"grad_norm": 0.4824984736023203,
"learning_rate": 2.3201558797756602e-06,
"loss": 0.472,
"step": 5320
},
{
"epoch": 2.1406151914626492,
"grad_norm": 0.47042980851352273,
"learning_rate": 2.300460004143182e-06,
"loss": 0.477,
"step": 5330
},
{
"epoch": 2.144632768361582,
"grad_norm": 0.47770309689595364,
"learning_rate": 2.2808230756181344e-06,
"loss": 0.4678,
"step": 5340
},
{
"epoch": 2.148650345260515,
"grad_norm": 0.5137315970001541,
"learning_rate": 2.261245522992141e-06,
"loss": 0.4718,
"step": 5350
},
{
"epoch": 2.1526679221594476,
"grad_norm": 0.47959163673834015,
"learning_rate": 2.2417277737602967e-06,
"loss": 0.4777,
"step": 5360
},
{
"epoch": 2.1566854990583804,
"grad_norm": 0.5938128736434867,
"learning_rate": 2.222270254111825e-06,
"loss": 0.4573,
"step": 5370
},
{
"epoch": 2.160703075957313,
"grad_norm": 0.4891138485781613,
"learning_rate": 2.2028733889207787e-06,
"loss": 0.4767,
"step": 5380
},
{
"epoch": 2.164720652856246,
"grad_norm": 0.501211033879534,
"learning_rate": 2.1835376017367665e-06,
"loss": 0.4735,
"step": 5390
},
{
"epoch": 2.1687382297551787,
"grad_norm": 0.5297335059398043,
"learning_rate": 2.1642633147756894e-06,
"loss": 0.4824,
"step": 5400
},
{
"epoch": 2.172755806654112,
"grad_norm": 0.4658949552573407,
"learning_rate": 2.145050948910536e-06,
"loss": 0.4757,
"step": 5410
},
{
"epoch": 2.1767733835530447,
"grad_norm": 0.5130444470120785,
"learning_rate": 2.1259009236621857e-06,
"loss": 0.4804,
"step": 5420
},
{
"epoch": 2.1807909604519775,
"grad_norm": 0.5328006329035961,
"learning_rate": 2.1068136571902527e-06,
"loss": 0.4714,
"step": 5430
},
{
"epoch": 2.1848085373509103,
"grad_norm": 0.49974994460658273,
"learning_rate": 2.0877895662839477e-06,
"loss": 0.4661,
"step": 5440
},
{
"epoch": 2.188826114249843,
"grad_norm": 0.504181506688872,
"learning_rate": 2.0688290663529813e-06,
"loss": 0.469,
"step": 5450
},
{
"epoch": 2.192843691148776,
"grad_norm": 0.5249615841875294,
"learning_rate": 2.049932571418494e-06,
"loss": 0.4784,
"step": 5460
},
{
"epoch": 2.1968612680477086,
"grad_norm": 0.4718284348814741,
"learning_rate": 2.031100494104014e-06,
"loss": 0.4784,
"step": 5470
},
{
"epoch": 2.2008788449466414,
"grad_norm": 0.5193890352090972,
"learning_rate": 2.0123332456264473e-06,
"loss": 0.4818,
"step": 5480
},
{
"epoch": 2.204896421845574,
"grad_norm": 0.48046763032522966,
"learning_rate": 1.9936312357870962e-06,
"loss": 0.4802,
"step": 5490
},
{
"epoch": 2.2089139987445074,
"grad_norm": 0.47752926213638963,
"learning_rate": 1.9749948729627188e-06,
"loss": 0.4686,
"step": 5500
},
{
"epoch": 2.2129315756434402,
"grad_norm": 0.4800555208436017,
"learning_rate": 1.956424564096602e-06,
"loss": 0.482,
"step": 5510
},
{
"epoch": 2.216949152542373,
"grad_norm": 0.443284090953342,
"learning_rate": 1.9379207146896827e-06,
"loss": 0.4733,
"step": 5520
},
{
"epoch": 2.220966729441306,
"grad_norm": 0.5002931907484188,
"learning_rate": 1.9194837287916817e-06,
"loss": 0.4776,
"step": 5530
},
{
"epoch": 2.2249843063402386,
"grad_norm": 0.45028976156882144,
"learning_rate": 1.9011140089923013e-06,
"loss": 0.4785,
"step": 5540
},
{
"epoch": 2.2290018832391714,
"grad_norm": 0.4619774988789297,
"learning_rate": 1.8828119564124159e-06,
"loss": 0.475,
"step": 5550
},
{
"epoch": 2.233019460138104,
"grad_norm": 0.47661490970644393,
"learning_rate": 1.8645779706953188e-06,
"loss": 0.4824,
"step": 5560
},
{
"epoch": 2.237037037037037,
"grad_norm": 0.49398577636680097,
"learning_rate": 1.8464124499980013e-06,
"loss": 0.4719,
"step": 5570
},
{
"epoch": 2.2410546139359697,
"grad_norm": 0.4799156749603329,
"learning_rate": 1.8283157909824517e-06,
"loss": 0.4787,
"step": 5580
},
{
"epoch": 2.2450721908349025,
"grad_norm": 0.4903571200855578,
"learning_rate": 1.8102883888069917e-06,
"loss": 0.481,
"step": 5590
},
{
"epoch": 2.2490897677338357,
"grad_norm": 0.4595319743138882,
"learning_rate": 1.7923306371176542e-06,
"loss": 0.4722,
"step": 5600
},
{
"epoch": 2.2531073446327685,
"grad_norm": 0.4815148560520921,
"learning_rate": 1.7744429280395903e-06,
"loss": 0.4804,
"step": 5610
},
{
"epoch": 2.2571249215317013,
"grad_norm": 0.49411596599772084,
"learning_rate": 1.7566256521684966e-06,
"loss": 0.4837,
"step": 5620
},
{
"epoch": 2.261142498430634,
"grad_norm": 0.5331117323575773,
"learning_rate": 1.7388791985620922e-06,
"loss": 0.4705,
"step": 5630
},
{
"epoch": 2.265160075329567,
"grad_norm": 0.52376875529828,
"learning_rate": 1.721203954731624e-06,
"loss": 0.4723,
"step": 5640
},
{
"epoch": 2.2691776522284997,
"grad_norm": 0.48551462166212467,
"learning_rate": 1.7036003066334012e-06,
"loss": 0.4853,
"step": 5650
},
{
"epoch": 2.2731952291274324,
"grad_norm": 0.5014457542958235,
"learning_rate": 1.6860686386603719e-06,
"loss": 0.4733,
"step": 5660
},
{
"epoch": 2.277212806026365,
"grad_norm": 0.4996867895329777,
"learning_rate": 1.6686093336337256e-06,
"loss": 0.4741,
"step": 5670
},
{
"epoch": 2.281230382925298,
"grad_norm": 0.48719109638855057,
"learning_rate": 1.6512227727945391e-06,
"loss": 0.4831,
"step": 5680
},
{
"epoch": 2.285247959824231,
"grad_norm": 0.4856082508478335,
"learning_rate": 1.6339093357954455e-06,
"loss": 0.4833,
"step": 5690
},
{
"epoch": 2.289265536723164,
"grad_norm": 0.5118289081317942,
"learning_rate": 1.6166694006923479e-06,
"loss": 0.4845,
"step": 5700
},
{
"epoch": 2.293283113622097,
"grad_norm": 0.5338262164693043,
"learning_rate": 1.5995033439361623e-06,
"loss": 0.4725,
"step": 5710
},
{
"epoch": 2.2973006905210296,
"grad_norm": 0.5147815344933698,
"learning_rate": 1.5824115403646e-06,
"loss": 0.471,
"step": 5720
},
{
"epoch": 2.3013182674199624,
"grad_norm": 0.4925335147001341,
"learning_rate": 1.5653943631939806e-06,
"loss": 0.4748,
"step": 5730
},
{
"epoch": 2.305335844318895,
"grad_norm": 0.5223852201900643,
"learning_rate": 1.5484521840110812e-06,
"loss": 0.4799,
"step": 5740
},
{
"epoch": 2.309353421217828,
"grad_norm": 0.5102348798882654,
"learning_rate": 1.5315853727650283e-06,
"loss": 0.4734,
"step": 5750
},
{
"epoch": 2.3133709981167607,
"grad_norm": 0.4806877999022709,
"learning_rate": 1.5147942977592111e-06,
"loss": 0.4793,
"step": 5760
},
{
"epoch": 2.3173885750156935,
"grad_norm": 0.47394343796609417,
"learning_rate": 1.4980793256432474e-06,
"loss": 0.4778,
"step": 5770
},
{
"epoch": 2.3214061519146263,
"grad_norm": 0.5169128981837072,
"learning_rate": 1.4814408214049674e-06,
"loss": 0.4756,
"step": 5780
},
{
"epoch": 2.3254237288135595,
"grad_norm": 0.4653905202296831,
"learning_rate": 1.4648791483624586e-06,
"loss": 0.476,
"step": 5790
},
{
"epoch": 2.3294413057124923,
"grad_norm": 0.49306875398820615,
"learning_rate": 1.4483946681561178e-06,
"loss": 0.4686,
"step": 5800
},
{
"epoch": 2.333458882611425,
"grad_norm": 0.48126484624398314,
"learning_rate": 1.4319877407407623e-06,
"loss": 0.4757,
"step": 5810
},
{
"epoch": 2.337476459510358,
"grad_norm": 0.4857946344608446,
"learning_rate": 1.415658724377767e-06,
"loss": 0.4707,
"step": 5820
},
{
"epoch": 2.3414940364092907,
"grad_norm": 0.508812865037136,
"learning_rate": 1.3994079756272467e-06,
"loss": 0.4716,
"step": 5830
},
{
"epoch": 2.3455116133082234,
"grad_norm": 0.4924478822718041,
"learning_rate": 1.3832358493402591e-06,
"loss": 0.4788,
"step": 5840
},
{
"epoch": 2.3495291902071562,
"grad_norm": 0.5636558365101663,
"learning_rate": 1.3671426986510667e-06,
"loss": 0.4791,
"step": 5850
},
{
"epoch": 2.353546767106089,
"grad_norm": 0.4787221048450779,
"learning_rate": 1.3511288749694245e-06,
"loss": 0.4774,
"step": 5860
},
{
"epoch": 2.357564344005022,
"grad_norm": 0.509596234617302,
"learning_rate": 1.3351947279729016e-06,
"loss": 0.4738,
"step": 5870
},
{
"epoch": 2.361581920903955,
"grad_norm": 0.48376004159048386,
"learning_rate": 1.3193406055992485e-06,
"loss": 0.4828,
"step": 5880
},
{
"epoch": 2.365599497802888,
"grad_norm": 0.5038494271293078,
"learning_rate": 1.3035668540388002e-06,
"loss": 0.4864,
"step": 5890
},
{
"epoch": 2.3696170747018206,
"grad_norm": 0.5316770270179247,
"learning_rate": 1.2878738177269156e-06,
"loss": 0.4785,
"step": 5900
},
{
"epoch": 2.3736346516007534,
"grad_norm": 0.5142512043455278,
"learning_rate": 1.2722618393364572e-06,
"loss": 0.4817,
"step": 5910
},
{
"epoch": 2.377652228499686,
"grad_norm": 0.468789477239248,
"learning_rate": 1.2567312597703063e-06,
"loss": 0.4735,
"step": 5920
},
{
"epoch": 2.381669805398619,
"grad_norm": 0.4736300406705453,
"learning_rate": 1.2412824181539256e-06,
"loss": 0.467,
"step": 5930
},
{
"epoch": 2.3856873822975517,
"grad_norm": 0.49508031018022314,
"learning_rate": 1.2259156518279452e-06,
"loss": 0.4854,
"step": 5940
},
{
"epoch": 2.3897049591964845,
"grad_norm": 0.4889357358622553,
"learning_rate": 1.2106312963408024e-06,
"loss": 0.4683,
"step": 5950
},
{
"epoch": 2.3937225360954173,
"grad_norm": 0.5186581775849328,
"learning_rate": 1.1954296854414111e-06,
"loss": 0.4743,
"step": 5960
},
{
"epoch": 2.3977401129943505,
"grad_norm": 0.534928567712559,
"learning_rate": 1.1803111510718774e-06,
"loss": 0.4713,
"step": 5970
},
{
"epoch": 2.401757689893283,
"grad_norm": 0.46937598704982375,
"learning_rate": 1.1652760233602495e-06,
"loss": 0.4773,
"step": 5980
},
{
"epoch": 2.405775266792216,
"grad_norm": 0.49187943461524203,
"learning_rate": 1.1503246306133099e-06,
"loss": 0.48,
"step": 5990
},
{
"epoch": 2.409792843691149,
"grad_norm": 0.4699512441083907,
"learning_rate": 1.1354572993094031e-06,
"loss": 0.4752,
"step": 6000
},
{
"epoch": 2.4138104205900817,
"grad_norm": 0.514478387098126,
"learning_rate": 1.1206743540913144e-06,
"loss": 0.4735,
"step": 6010
},
{
"epoch": 2.4178279974890144,
"grad_norm": 0.4912678185053618,
"learning_rate": 1.1059761177591727e-06,
"loss": 0.4738,
"step": 6020
},
{
"epoch": 2.4218455743879472,
"grad_norm": 0.45184357921612245,
"learning_rate": 1.0913629112634045e-06,
"loss": 0.4764,
"step": 6030
},
{
"epoch": 2.42586315128688,
"grad_norm": 0.45539851634790796,
"learning_rate": 1.076835053697728e-06,
"loss": 0.4758,
"step": 6040
},
{
"epoch": 2.429880728185813,
"grad_norm": 0.5152318496750359,
"learning_rate": 1.0623928622921825e-06,
"loss": 0.4732,
"step": 6050
},
{
"epoch": 2.4338983050847456,
"grad_norm": 0.5016552767543895,
"learning_rate": 1.0480366524062041e-06,
"loss": 0.483,
"step": 6060
},
{
"epoch": 2.4379158819836784,
"grad_norm": 0.47698174699672474,
"learning_rate": 1.0337667375217353e-06,
"loss": 0.4737,
"step": 6070
},
{
"epoch": 2.4419334588826116,
"grad_norm": 0.5215110019193456,
"learning_rate": 1.0195834292363881e-06,
"loss": 0.4717,
"step": 6080
},
{
"epoch": 2.4459510357815444,
"grad_norm": 0.5003135573061556,
"learning_rate": 1.0054870372566273e-06,
"loss": 0.4711,
"step": 6090
},
{
"epoch": 2.449968612680477,
"grad_norm": 0.5154412726359013,
"learning_rate": 9.914778693910165e-07,
"loss": 0.4738,
"step": 6100
},
{
"epoch": 2.45398618957941,
"grad_norm": 0.5155157663249713,
"learning_rate": 9.775562315435005e-07,
"loss": 0.481,
"step": 6110
},
{
"epoch": 2.4580037664783427,
"grad_norm": 0.4896545010247639,
"learning_rate": 9.637224277067142e-07,
"loss": 0.4869,
"step": 6120
},
{
"epoch": 2.4620213433772755,
"grad_norm": 0.4603343227039726,
"learning_rate": 9.499767599553528e-07,
"loss": 0.4817,
"step": 6130
},
{
"epoch": 2.4660389202762083,
"grad_norm": 0.4600261678419497,
"learning_rate": 9.363195284395732e-07,
"loss": 0.4679,
"step": 6140
},
{
"epoch": 2.470056497175141,
"grad_norm": 0.4989876951619158,
"learning_rate": 9.227510313784405e-07,
"loss": 0.4805,
"step": 6150
},
{
"epoch": 2.474074074074074,
"grad_norm": 0.49701651082881143,
"learning_rate": 9.092715650534162e-07,
"loss": 0.474,
"step": 6160
},
{
"epoch": 2.478091650973007,
"grad_norm": 0.47689581346361015,
"learning_rate": 8.958814238018864e-07,
"loss": 0.4735,
"step": 6170
},
{
"epoch": 2.48210922787194,
"grad_norm": 0.48466102861465504,
"learning_rate": 8.825809000107382e-07,
"loss": 0.4823,
"step": 6180
},
{
"epoch": 2.4861268047708727,
"grad_norm": 0.43949823990630094,
"learning_rate": 8.693702841099744e-07,
"loss": 0.468,
"step": 6190
},
{
"epoch": 2.4901443816698055,
"grad_norm": 0.4734599690753599,
"learning_rate": 8.56249864566368e-07,
"loss": 0.4716,
"step": 6200
},
{
"epoch": 2.4941619585687382,
"grad_norm": 0.4837246488118121,
"learning_rate": 8.432199278771679e-07,
"loss": 0.4727,
"step": 6210
},
{
"epoch": 2.498179535467671,
"grad_norm": 0.5212860772248176,
"learning_rate": 8.302807585638401e-07,
"loss": 0.4781,
"step": 6220
},
{
"epoch": 2.502197112366604,
"grad_norm": 0.4952468033104653,
"learning_rate": 8.174326391658561e-07,
"loss": 0.4742,
"step": 6230
},
{
"epoch": 2.5062146892655366,
"grad_norm": 0.4867979964745898,
"learning_rate": 8.04675850234523e-07,
"loss": 0.4731,
"step": 6240
},
{
"epoch": 2.5102322661644694,
"grad_norm": 0.48350910576661243,
"learning_rate": 7.92010670326856e-07,
"loss": 0.4793,
"step": 6250
},
{
"epoch": 2.5142498430634026,
"grad_norm": 0.48063080647615003,
"learning_rate": 7.794373759995017e-07,
"loss": 0.4814,
"step": 6260
},
{
"epoch": 2.518267419962335,
"grad_norm": 0.5071850927755938,
"learning_rate": 7.669562418026905e-07,
"loss": 0.4726,
"step": 6270
},
{
"epoch": 2.522284996861268,
"grad_norm": 0.5007552952463317,
"learning_rate": 7.545675402742464e-07,
"loss": 0.4701,
"step": 6280
},
{
"epoch": 2.526302573760201,
"grad_norm": 0.49193921591367956,
"learning_rate": 7.422715419336374e-07,
"loss": 0.4798,
"step": 6290
},
{
"epoch": 2.5303201506591337,
"grad_norm": 0.4631014183740996,
"learning_rate": 7.30068515276064e-07,
"loss": 0.4783,
"step": 6300
},
{
"epoch": 2.5343377275580665,
"grad_norm": 0.4461109021704027,
"learning_rate": 7.179587267665999e-07,
"loss": 0.4807,
"step": 6310
},
{
"epoch": 2.5383553044569993,
"grad_norm": 0.5615597716996266,
"learning_rate": 7.059424408343713e-07,
"loss": 0.476,
"step": 6320
},
{
"epoch": 2.542372881355932,
"grad_norm": 0.4681517030493916,
"learning_rate": 6.940199198667863e-07,
"loss": 0.4746,
"step": 6330
},
{
"epoch": 2.546390458254865,
"grad_norm": 0.4270578942801569,
"learning_rate": 6.821914242038013e-07,
"loss": 0.47,
"step": 6340
},
{
"epoch": 2.550408035153798,
"grad_norm": 0.5120970629852971,
"learning_rate": 6.704572121322356e-07,
"loss": 0.4661,
"step": 6350
},
{
"epoch": 2.5544256120527304,
"grad_norm": 0.4577342016167032,
"learning_rate": 6.588175398801356e-07,
"loss": 0.4778,
"step": 6360
},
{
"epoch": 2.5584431889516637,
"grad_norm": 0.5112843701769388,
"learning_rate": 6.472726616111797e-07,
"loss": 0.4774,
"step": 6370
},
{
"epoch": 2.5624607658505965,
"grad_norm": 0.5002523546947816,
"learning_rate": 6.358228294191248e-07,
"loss": 0.4745,
"step": 6380
},
{
"epoch": 2.5664783427495292,
"grad_norm": 0.4844209522259677,
"learning_rate": 6.244682933223023e-07,
"loss": 0.4743,
"step": 6390
},
{
"epoch": 2.570495919648462,
"grad_norm": 0.5054412286540757,
"learning_rate": 6.13209301258162e-07,
"loss": 0.4689,
"step": 6400
},
{
"epoch": 2.574513496547395,
"grad_norm": 0.4613555569290718,
"learning_rate": 6.020460990778537e-07,
"loss": 0.4711,
"step": 6410
},
{
"epoch": 2.5785310734463276,
"grad_norm": 0.47785054223407225,
"learning_rate": 5.909789305408631e-07,
"loss": 0.476,
"step": 6420
},
{
"epoch": 2.5825486503452604,
"grad_norm": 0.5097922646263033,
"learning_rate": 5.800080373096839e-07,
"loss": 0.4628,
"step": 6430
},
{
"epoch": 2.5865662272441936,
"grad_norm": 0.460211806295128,
"learning_rate": 5.691336589445485e-07,
"loss": 0.4693,
"step": 6440
},
{
"epoch": 2.590583804143126,
"grad_norm": 0.47963475329302363,
"learning_rate": 5.583560328981885e-07,
"loss": 0.4741,
"step": 6450
},
{
"epoch": 2.594601381042059,
"grad_norm": 0.521520019445044,
"learning_rate": 5.476753945106556e-07,
"loss": 0.4763,
"step": 6460
},
{
"epoch": 2.598618957940992,
"grad_norm": 0.4716378386067206,
"learning_rate": 5.370919770041799e-07,
"loss": 0.4742,
"step": 6470
},
{
"epoch": 2.6026365348399247,
"grad_norm": 0.5095512360791891,
"learning_rate": 5.266060114780774e-07,
"loss": 0.4769,
"step": 6480
},
{
"epoch": 2.6066541117388575,
"grad_norm": 0.49008584458884724,
"learning_rate": 5.162177269037061e-07,
"loss": 0.4695,
"step": 6490
},
{
"epoch": 2.6106716886377903,
"grad_norm": 0.4430564461046631,
"learning_rate": 5.059273501194622e-07,
"loss": 0.4738,
"step": 6500
},
{
"epoch": 2.614689265536723,
"grad_norm": 0.48041503000167013,
"learning_rate": 4.95735105825833e-07,
"loss": 0.4671,
"step": 6510
},
{
"epoch": 2.618706842435656,
"grad_norm": 0.47058785476417425,
"learning_rate": 4.856412165804824e-07,
"loss": 0.4656,
"step": 6520
},
{
"epoch": 2.6227244193345887,
"grad_norm": 0.5186436231128471,
"learning_rate": 4.756459027933974e-07,
"loss": 0.4795,
"step": 6530
},
{
"epoch": 2.6267419962335214,
"grad_norm": 0.4971208460235255,
"learning_rate": 4.657493827220705e-07,
"loss": 0.4745,
"step": 6540
},
{
"epoch": 2.6307595731324547,
"grad_norm": 0.46196697310421064,
"learning_rate": 4.559518724667411e-07,
"loss": 0.4788,
"step": 6550
},
{
"epoch": 2.6347771500313875,
"grad_norm": 0.516003177023342,
"learning_rate": 4.462535859656675e-07,
"loss": 0.476,
"step": 6560
},
{
"epoch": 2.6387947269303202,
"grad_norm": 0.5015505649590103,
"learning_rate": 4.36654734990461e-07,
"loss": 0.4818,
"step": 6570
},
{
"epoch": 2.642812303829253,
"grad_norm": 0.517201945888056,
"learning_rate": 4.271555291414636e-07,
"loss": 0.4564,
"step": 6580
},
{
"epoch": 2.646829880728186,
"grad_norm": 0.4978218856523734,
"learning_rate": 4.1775617584316476e-07,
"loss": 0.4713,
"step": 6590
},
{
"epoch": 2.6508474576271186,
"grad_norm": 0.4876775749761943,
"learning_rate": 4.0845688033967435e-07,
"loss": 0.4753,
"step": 6600
},
{
"epoch": 2.6548650345260514,
"grad_norm": 0.49046863554031045,
"learning_rate": 3.992578456902452e-07,
"loss": 0.4719,
"step": 6610
},
{
"epoch": 2.658882611424984,
"grad_norm": 0.4828102602774462,
"learning_rate": 3.901592727648351e-07,
"loss": 0.471,
"step": 6620
},
{
"epoch": 2.662900188323917,
"grad_norm": 0.5128903958409182,
"learning_rate": 3.811613602397202e-07,
"loss": 0.4799,
"step": 6630
},
{
"epoch": 2.66691776522285,
"grad_norm": 0.485862558152459,
"learning_rate": 3.7226430459315957e-07,
"loss": 0.4682,
"step": 6640
},
{
"epoch": 2.670935342121783,
"grad_norm": 0.48320591671258245,
"learning_rate": 3.634683001011019e-07,
"loss": 0.4802,
"step": 6650
},
{
"epoch": 2.6749529190207157,
"grad_norm": 0.4382446479240708,
"learning_rate": 3.547735388329443e-07,
"loss": 0.4728,
"step": 6660
},
{
"epoch": 2.6789704959196485,
"grad_norm": 0.4939760563538167,
"learning_rate": 3.461802106473411e-07,
"loss": 0.4811,
"step": 6670
},
{
"epoch": 2.6829880728185813,
"grad_norm": 0.50894968077572,
"learning_rate": 3.3768850318805224e-07,
"loss": 0.4666,
"step": 6680
},
{
"epoch": 2.687005649717514,
"grad_norm": 0.5297225230888177,
"learning_rate": 3.2929860187985216e-07,
"loss": 0.4712,
"step": 6690
},
{
"epoch": 2.691023226616447,
"grad_norm": 0.49359696533604985,
"learning_rate": 3.210106899244775e-07,
"loss": 0.4808,
"step": 6700
},
{
"epoch": 2.6950408035153797,
"grad_norm": 0.47433607917767673,
"learning_rate": 3.1282494829662556e-07,
"loss": 0.4676,
"step": 6710
},
{
"epoch": 2.6990583804143125,
"grad_norm": 0.4777730701958091,
"learning_rate": 3.047415557400057e-07,
"loss": 0.4777,
"step": 6720
},
{
"epoch": 2.7030759573132457,
"grad_norm": 0.4944173220055023,
"learning_rate": 2.967606887634344e-07,
"loss": 0.4736,
"step": 6730
},
{
"epoch": 2.707093534212178,
"grad_norm": 0.5904235702447377,
"learning_rate": 2.888825216369806e-07,
"loss": 0.4772,
"step": 6740
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.4952766389802285,
"learning_rate": 2.811072263881615e-07,
"loss": 0.485,
"step": 6750
},
{
"epoch": 2.715128688010044,
"grad_norm": 1.0088921588490039,
"learning_rate": 2.7343497279818833e-07,
"loss": 0.4695,
"step": 6760
},
{
"epoch": 2.719146264908977,
"grad_norm": 0.4788614551696112,
"learning_rate": 2.658659283982523e-07,
"loss": 0.4737,
"step": 6770
},
{
"epoch": 2.7231638418079096,
"grad_norm": 0.5299426999271306,
"learning_rate": 2.58400258465874e-07,
"loss": 0.4835,
"step": 6780
},
{
"epoch": 2.7271814187068424,
"grad_norm": 0.496725834314719,
"learning_rate": 2.510381260212874e-07,
"loss": 0.4714,
"step": 6790
},
{
"epoch": 2.731198995605775,
"grad_norm": 0.5015477715189429,
"learning_rate": 2.4377969182388774e-07,
"loss": 0.4692,
"step": 6800
},
{
"epoch": 2.735216572504708,
"grad_norm": 0.5250384332220221,
"learning_rate": 2.3662511436871538e-07,
"loss": 0.4749,
"step": 6810
},
{
"epoch": 2.739234149403641,
"grad_norm": 0.4422929805667402,
"learning_rate": 2.295745498829949e-07,
"loss": 0.475,
"step": 6820
},
{
"epoch": 2.7432517263025735,
"grad_norm": 0.5083194750205056,
"learning_rate": 2.2262815232272916e-07,
"loss": 0.4683,
"step": 6830
},
{
"epoch": 2.7472693032015068,
"grad_norm": 0.459034326672265,
"learning_rate": 2.1578607336933177e-07,
"loss": 0.4776,
"step": 6840
},
{
"epoch": 2.7512868801004395,
"grad_norm": 0.4944061747494303,
"learning_rate": 2.090484624263167e-07,
"loss": 0.4686,
"step": 6850
},
{
"epoch": 2.7553044569993723,
"grad_norm": 0.5465184475852697,
"learning_rate": 2.0241546661603605e-07,
"loss": 0.4694,
"step": 6860
},
{
"epoch": 2.759322033898305,
"grad_norm": 0.4897764666579114,
"learning_rate": 1.9588723077646976e-07,
"loss": 0.4711,
"step": 6870
},
{
"epoch": 2.763339610797238,
"grad_norm": 0.49639087360500034,
"learning_rate": 1.8946389745805983e-07,
"loss": 0.4747,
"step": 6880
},
{
"epoch": 2.7673571876961707,
"grad_norm": 0.5022906703230215,
"learning_rate": 1.8314560692059836e-07,
"loss": 0.4735,
"step": 6890
},
{
"epoch": 2.7713747645951035,
"grad_norm": 0.48564181193501227,
"learning_rate": 1.7693249713016558e-07,
"loss": 0.466,
"step": 6900
},
{
"epoch": 2.7753923414940367,
"grad_norm": 0.5530530885750234,
"learning_rate": 1.7082470375611614e-07,
"loss": 0.4815,
"step": 6910
},
{
"epoch": 2.779409918392969,
"grad_norm": 0.48580647047733116,
"learning_rate": 1.648223601681176e-07,
"loss": 0.4858,
"step": 6920
},
{
"epoch": 2.7834274952919023,
"grad_norm": 0.4942106873900172,
"learning_rate": 1.589255974332382e-07,
"loss": 0.4755,
"step": 6930
},
{
"epoch": 2.787445072190835,
"grad_norm": 0.5077413435223396,
"learning_rate": 1.5313454431308494e-07,
"loss": 0.4762,
"step": 6940
},
{
"epoch": 2.791462649089768,
"grad_norm": 0.49459375051900323,
"learning_rate": 1.4744932726099005e-07,
"loss": 0.4678,
"step": 6950
},
{
"epoch": 2.7954802259887006,
"grad_norm": 0.5103917447607162,
"learning_rate": 1.4187007041925328e-07,
"loss": 0.4734,
"step": 6960
},
{
"epoch": 2.7994978028876334,
"grad_norm": 0.47776810942745174,
"learning_rate": 1.363968956164269e-07,
"loss": 0.4736,
"step": 6970
},
{
"epoch": 2.803515379786566,
"grad_norm": 0.5055359246952604,
"learning_rate": 1.310299223646594e-07,
"loss": 0.4675,
"step": 6980
},
{
"epoch": 2.807532956685499,
"grad_norm": 0.5159323211802336,
"learning_rate": 1.2576926785708321e-07,
"loss": 0.4796,
"step": 6990
},
{
"epoch": 2.8115505335844317,
"grad_norm": 0.4717892098536317,
"learning_rate": 1.2061504696525617e-07,
"loss": 0.4752,
"step": 7000
},
{
"epoch": 2.8155681104833645,
"grad_norm": 0.47314656128582044,
"learning_rate": 1.1556737223665515e-07,
"loss": 0.4715,
"step": 7010
},
{
"epoch": 2.8195856873822978,
"grad_norm": 0.47908474784592364,
"learning_rate": 1.1062635389221588e-07,
"loss": 0.4865,
"step": 7020
},
{
"epoch": 2.8236032642812305,
"grad_norm": 0.4460236716075369,
"learning_rate": 1.0579209982392757e-07,
"loss": 0.4692,
"step": 7030
},
{
"epoch": 2.8276208411801633,
"grad_norm": 0.5320629331138326,
"learning_rate": 1.0106471559247433e-07,
"loss": 0.4692,
"step": 7040
},
{
"epoch": 2.831638418079096,
"grad_norm": 0.5095821610850613,
"learning_rate": 9.644430442493636e-08,
"loss": 0.4635,
"step": 7050
},
{
"epoch": 2.835655994978029,
"grad_norm": 0.45919702634359383,
"learning_rate": 9.193096721252903e-08,
"loss": 0.4623,
"step": 7060
},
{
"epoch": 2.8396735718769617,
"grad_norm": 0.5030328915586799,
"learning_rate": 8.752480250840411e-08,
"loss": 0.4738,
"step": 7070
},
{
"epoch": 2.8436911487758945,
"grad_norm": 0.5239356661209714,
"learning_rate": 8.322590652549478e-08,
"loss": 0.4717,
"step": 7080
},
{
"epoch": 2.8477087256748272,
"grad_norm": 0.5003917496304626,
"learning_rate": 7.903437313441842e-08,
"loss": 0.4857,
"step": 7090
},
{
"epoch": 2.85172630257376,
"grad_norm": 0.4708502707470253,
"learning_rate": 7.495029386142382e-08,
"loss": 0.4724,
"step": 7100
},
{
"epoch": 2.8557438794726933,
"grad_norm": 0.5171601825808555,
"learning_rate": 7.097375788639227e-08,
"loss": 0.4655,
"step": 7110
},
{
"epoch": 2.8597614563716256,
"grad_norm": 0.49349820404810807,
"learning_rate": 6.710485204089456e-08,
"loss": 0.4701,
"step": 7120
},
{
"epoch": 2.863779033270559,
"grad_norm": 0.5150356271171084,
"learning_rate": 6.334366080628873e-08,
"loss": 0.482,
"step": 7130
},
{
"epoch": 2.8677966101694916,
"grad_norm": 0.45255498690118884,
"learning_rate": 5.96902663118798e-08,
"loss": 0.4696,
"step": 7140
},
{
"epoch": 2.8718141870684244,
"grad_norm": 0.4833344072840721,
"learning_rate": 5.614474833312622e-08,
"loss": 0.4686,
"step": 7150
},
{
"epoch": 2.875831763967357,
"grad_norm": 0.44938418775925026,
"learning_rate": 5.270718428989463e-08,
"loss": 0.4671,
"step": 7160
},
{
"epoch": 2.87984934086629,
"grad_norm": 0.5005428826464499,
"learning_rate": 4.937764924477284e-08,
"loss": 0.4757,
"step": 7170
},
{
"epoch": 2.8838669177652227,
"grad_norm": 0.511470087270601,
"learning_rate": 4.615621590142838e-08,
"loss": 0.488,
"step": 7180
},
{
"epoch": 2.8878844946641555,
"grad_norm": 0.5016274179565011,
"learning_rate": 4.3042954603023655e-08,
"loss": 0.4717,
"step": 7190
},
{
"epoch": 2.8919020715630888,
"grad_norm": 0.46282357124816725,
"learning_rate": 4.003793333067607e-08,
"loss": 0.47,
"step": 7200
},
{
"epoch": 2.895919648462021,
"grad_norm": 0.5009456286050996,
"learning_rate": 3.714121770197754e-08,
"loss": 0.467,
"step": 7210
},
{
"epoch": 2.8999372253609543,
"grad_norm": 0.49427390713109254,
"learning_rate": 3.435287096955897e-08,
"loss": 0.4703,
"step": 7220
},
{
"epoch": 2.903954802259887,
"grad_norm": 0.4756544687033634,
"learning_rate": 3.167295401970971e-08,
"loss": 0.475,
"step": 7230
},
{
"epoch": 2.90797237915882,
"grad_norm": 0.4729939555426576,
"learning_rate": 2.9101525371049154e-08,
"loss": 0.4851,
"step": 7240
},
{
"epoch": 2.9119899560577527,
"grad_norm": 0.4655382923368333,
"learning_rate": 2.663864117324777e-08,
"loss": 0.4755,
"step": 7250
},
{
"epoch": 2.9160075329566855,
"grad_norm": 0.47231627841883606,
"learning_rate": 2.42843552058003e-08,
"loss": 0.4677,
"step": 7260
},
{
"epoch": 2.9200251098556183,
"grad_norm": 0.48596292370132727,
"learning_rate": 2.203871887685449e-08,
"loss": 0.4744,
"step": 7270
},
{
"epoch": 2.924042686754551,
"grad_norm": 0.4968533928087242,
"learning_rate": 1.9901781222084192e-08,
"loss": 0.4755,
"step": 7280
},
{
"epoch": 2.9280602636534843,
"grad_norm": 0.5186135008077664,
"learning_rate": 1.7873588903623006e-08,
"loss": 0.479,
"step": 7290
},
{
"epoch": 2.9320778405524166,
"grad_norm": 0.5196627425255889,
"learning_rate": 1.5954186209042323e-08,
"loss": 0.4684,
"step": 7300
},
{
"epoch": 2.93609541745135,
"grad_norm": 0.4875506685262467,
"learning_rate": 1.4143615050384862e-08,
"loss": 0.4619,
"step": 7310
},
{
"epoch": 2.9401129943502826,
"grad_norm": 0.4803641492705183,
"learning_rate": 1.2441914963250423e-08,
"loss": 0.4753,
"step": 7320
},
{
"epoch": 2.9441305712492154,
"grad_norm": 0.4438429839046257,
"learning_rate": 1.0849123105931558e-08,
"loss": 0.4772,
"step": 7330
},
{
"epoch": 2.948148148148148,
"grad_norm": 0.47157812956867295,
"learning_rate": 9.365274258604229e-09,
"loss": 0.4743,
"step": 7340
},
{
"epoch": 2.952165725047081,
"grad_norm": 0.5144283649963357,
"learning_rate": 7.990400822564525e-09,
"loss": 0.4898,
"step": 7350
},
{
"epoch": 2.9561833019460138,
"grad_norm": 0.4994406655154466,
"learning_rate": 6.7245328195247875e-09,
"loss": 0.4807,
"step": 7360
},
{
"epoch": 2.9602008788449465,
"grad_norm": 0.4789005591934425,
"learning_rate": 5.567697890955792e-09,
"loss": 0.4809,
"step": 7370
},
{
"epoch": 2.9642184557438793,
"grad_norm": 0.4556018148546147,
"learning_rate": 4.519921297484464e-09,
"loss": 0.4687,
"step": 7380
},
{
"epoch": 2.968236032642812,
"grad_norm": 0.5209374897753106,
"learning_rate": 3.5812259183426457e-09,
"loss": 0.47,
"step": 7390
},
{
"epoch": 2.9722536095417453,
"grad_norm": 0.4460089771421974,
"learning_rate": 2.751632250865832e-09,
"loss": 0.4778,
"step": 7400
},
{
"epoch": 2.976271186440678,
"grad_norm": 0.4783096492145474,
"learning_rate": 2.0311584100457526e-09,
"loss": 0.4753,
"step": 7410
},
{
"epoch": 2.980288763339611,
"grad_norm": 0.4882741296424901,
"learning_rate": 1.4198201281373503e-09,
"loss": 0.484,
"step": 7420
},
{
"epoch": 2.9843063402385437,
"grad_norm": 0.4605012097308224,
"learning_rate": 9.17630754312393e-10,
"loss": 0.4795,
"step": 7430
},
{
"epoch": 2.9883239171374765,
"grad_norm": 0.7503308211144039,
"learning_rate": 5.246012543680401e-10,
"loss": 0.4806,
"step": 7440
},
{
"epoch": 2.9923414940364093,
"grad_norm": 0.4845828672261576,
"learning_rate": 2.4074021049091954e-10,
"loss": 0.4753,
"step": 7450
},
{
"epoch": 2.996359070935342,
"grad_norm": 0.525299815132595,
"learning_rate": 6.605382106505964e-11,
"loss": 0.4796,
"step": 7460
},
{
"epoch": 3.0,
"grad_norm": 0.5136490807064724,
"learning_rate": 5.459005397723261e-13,
"loss": 0.4664,
"step": 7470
},
{
"epoch": 3.0,
"step": 7470,
"total_flos": 4.845415158505275e+18,
"train_loss": 0.5501844393999541,
"train_runtime": 257337.3533,
"train_samples_per_second": 3.714,
"train_steps_per_second": 0.029
}
],
"logging_steps": 10,
"max_steps": 7470,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 24890,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.845415158505275e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}