{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5571030640668524,
"eval_steps": 5000,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011142061281337048,
"grad_norm": 526387360.0,
"learning_rate": 2.0000000000000002e-07,
"loss": 8.3558,
"step": 10
},
{
"epoch": 0.0022284122562674096,
"grad_norm": 10082223104.0,
"learning_rate": 4.0000000000000003e-07,
"loss": 9.7414,
"step": 20
},
{
"epoch": 0.003342618384401114,
"grad_norm": 394851360.0,
"learning_rate": 6.000000000000001e-07,
"loss": 8.3657,
"step": 30
},
{
"epoch": 0.004456824512534819,
"grad_norm": 550555200.0,
"learning_rate": 8.000000000000001e-07,
"loss": 7.7018,
"step": 40
},
{
"epoch": 0.005571030640668524,
"grad_norm": 6702706176.0,
"learning_rate": 1.0000000000000002e-06,
"loss": 7.8417,
"step": 50
},
{
"epoch": 0.006685236768802228,
"grad_norm": 984629184.0,
"learning_rate": 1.2000000000000002e-06,
"loss": 7.2844,
"step": 60
},
{
"epoch": 0.007799442896935933,
"grad_norm": 136705808.0,
"learning_rate": 1.4000000000000001e-06,
"loss": 6.072,
"step": 70
},
{
"epoch": 0.008913649025069638,
"grad_norm": 2020311040.0,
"learning_rate": 1.6000000000000001e-06,
"loss": 6.2002,
"step": 80
},
{
"epoch": 0.010027855153203343,
"grad_norm": 284398432.0,
"learning_rate": 1.8000000000000001e-06,
"loss": 5.9848,
"step": 90
},
{
"epoch": 0.011142061281337047,
"grad_norm": 101863704.0,
"learning_rate": 2.0000000000000003e-06,
"loss": 5.6746,
"step": 100
},
{
"epoch": 0.012256267409470752,
"grad_norm": 289118624.0,
"learning_rate": 2.2e-06,
"loss": 4.8833,
"step": 110
},
{
"epoch": 0.013370473537604457,
"grad_norm": 1703932416.0,
"learning_rate": 2.4000000000000003e-06,
"loss": 5.7605,
"step": 120
},
{
"epoch": 0.014484679665738161,
"grad_norm": 2717028864.0,
"learning_rate": 2.6e-06,
"loss": 5.2174,
"step": 130
},
{
"epoch": 0.015598885793871866,
"grad_norm": 73946976.0,
"learning_rate": 2.8000000000000003e-06,
"loss": 4.5718,
"step": 140
},
{
"epoch": 0.016713091922005572,
"grad_norm": 19827058.0,
"learning_rate": 3e-06,
"loss": 3.9569,
"step": 150
},
{
"epoch": 0.017827298050139277,
"grad_norm": 41642140.0,
"learning_rate": 3.2000000000000003e-06,
"loss": 3.0831,
"step": 160
},
{
"epoch": 0.01894150417827298,
"grad_norm": 29684964.0,
"learning_rate": 3.4000000000000005e-06,
"loss": 3.0789,
"step": 170
},
{
"epoch": 0.020055710306406686,
"grad_norm": 23762328.0,
"learning_rate": 3.6000000000000003e-06,
"loss": 2.6555,
"step": 180
},
{
"epoch": 0.02116991643454039,
"grad_norm": 9484590.0,
"learning_rate": 3.8000000000000005e-06,
"loss": 2.3382,
"step": 190
},
{
"epoch": 0.022284122562674095,
"grad_norm": 69793928.0,
"learning_rate": 4.000000000000001e-06,
"loss": 2.2238,
"step": 200
},
{
"epoch": 0.0233983286908078,
"grad_norm": 117296832.0,
"learning_rate": 4.2000000000000004e-06,
"loss": 2.1228,
"step": 210
},
{
"epoch": 0.024512534818941504,
"grad_norm": 12182392.0,
"learning_rate": 4.4e-06,
"loss": 1.9407,
"step": 220
},
{
"epoch": 0.02562674094707521,
"grad_norm": 16288649.0,
"learning_rate": 4.600000000000001e-06,
"loss": 1.837,
"step": 230
},
{
"epoch": 0.026740947075208913,
"grad_norm": 3740553216.0,
"learning_rate": 4.800000000000001e-06,
"loss": 1.7806,
"step": 240
},
{
"epoch": 0.027855153203342618,
"grad_norm": 16179237.0,
"learning_rate": 5e-06,
"loss": 2.3418,
"step": 250
},
{
"epoch": 0.028969359331476322,
"grad_norm": 3983458.25,
"learning_rate": 5.2e-06,
"loss": 1.4532,
"step": 260
},
{
"epoch": 0.030083565459610027,
"grad_norm": 41428304.0,
"learning_rate": 5.400000000000001e-06,
"loss": 1.3844,
"step": 270
},
{
"epoch": 0.03119777158774373,
"grad_norm": 11067015.0,
"learning_rate": 5.600000000000001e-06,
"loss": 1.218,
"step": 280
},
{
"epoch": 0.03231197771587744,
"grad_norm": 3763499.0,
"learning_rate": 5.8e-06,
"loss": 1.0569,
"step": 290
},
{
"epoch": 0.033426183844011144,
"grad_norm": 3374811.25,
"learning_rate": 6e-06,
"loss": 0.9546,
"step": 300
},
{
"epoch": 0.03454038997214485,
"grad_norm": 956919.375,
"learning_rate": 6.200000000000001e-06,
"loss": 0.7927,
"step": 310
},
{
"epoch": 0.03565459610027855,
"grad_norm": 3694079.75,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.8144,
"step": 320
},
{
"epoch": 0.03676880222841226,
"grad_norm": 11441138.0,
"learning_rate": 6.600000000000001e-06,
"loss": 0.7171,
"step": 330
},
{
"epoch": 0.03788300835654596,
"grad_norm": 3600018.75,
"learning_rate": 6.800000000000001e-06,
"loss": 0.6414,
"step": 340
},
{
"epoch": 0.03899721448467967,
"grad_norm": 2904519.75,
"learning_rate": 7e-06,
"loss": 0.6656,
"step": 350
},
{
"epoch": 0.04011142061281337,
"grad_norm": 508876.46875,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.7313,
"step": 360
},
{
"epoch": 0.041225626740947076,
"grad_norm": 1115024.25,
"learning_rate": 7.4e-06,
"loss": 0.6118,
"step": 370
},
{
"epoch": 0.04233983286908078,
"grad_norm": 705881.875,
"learning_rate": 7.600000000000001e-06,
"loss": 0.576,
"step": 380
},
{
"epoch": 0.043454038997214485,
"grad_norm": 1243545.875,
"learning_rate": 7.800000000000002e-06,
"loss": 0.6819,
"step": 390
},
{
"epoch": 0.04456824512534819,
"grad_norm": 403263.125,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6145,
"step": 400
},
{
"epoch": 0.045682451253481894,
"grad_norm": 604421.6875,
"learning_rate": 8.2e-06,
"loss": 0.5955,
"step": 410
},
{
"epoch": 0.0467966573816156,
"grad_norm": 659902.5,
"learning_rate": 8.400000000000001e-06,
"loss": 0.5932,
"step": 420
},
{
"epoch": 0.0479108635097493,
"grad_norm": 636760.1875,
"learning_rate": 8.6e-06,
"loss": 0.5849,
"step": 430
},
{
"epoch": 0.04902506963788301,
"grad_norm": 657968.4375,
"learning_rate": 8.8e-06,
"loss": 0.5557,
"step": 440
},
{
"epoch": 0.05013927576601671,
"grad_norm": 345233.0,
"learning_rate": 9e-06,
"loss": 0.5214,
"step": 450
},
{
"epoch": 0.05125348189415042,
"grad_norm": 6643789.5,
"learning_rate": 9.200000000000002e-06,
"loss": 0.5415,
"step": 460
},
{
"epoch": 0.05236768802228412,
"grad_norm": 260660.28125,
"learning_rate": 9.4e-06,
"loss": 0.598,
"step": 470
},
{
"epoch": 0.053481894150417826,
"grad_norm": 337898.15625,
"learning_rate": 9.600000000000001e-06,
"loss": 0.5329,
"step": 480
},
{
"epoch": 0.05459610027855153,
"grad_norm": 262814.03125,
"learning_rate": 9.800000000000001e-06,
"loss": 0.5255,
"step": 490
},
{
"epoch": 0.055710306406685235,
"grad_norm": 378706.5625,
"learning_rate": 1e-05,
"loss": 0.5261,
"step": 500
},
{
"epoch": 0.05682451253481894,
"grad_norm": 405478.5,
"learning_rate": 9.988200589970503e-06,
"loss": 0.5366,
"step": 510
},
{
"epoch": 0.057938718662952644,
"grad_norm": 299159.59375,
"learning_rate": 9.976401179941004e-06,
"loss": 0.5714,
"step": 520
},
{
"epoch": 0.05905292479108635,
"grad_norm": 353445.59375,
"learning_rate": 9.964601769911504e-06,
"loss": 0.4717,
"step": 530
},
{
"epoch": 0.06016713091922005,
"grad_norm": 320303.59375,
"learning_rate": 9.952802359882007e-06,
"loss": 0.5686,
"step": 540
},
{
"epoch": 0.06128133704735376,
"grad_norm": 233915.875,
"learning_rate": 9.941002949852509e-06,
"loss": 0.6282,
"step": 550
},
{
"epoch": 0.06239554317548746,
"grad_norm": 300448.0625,
"learning_rate": 9.92920353982301e-06,
"loss": 0.4815,
"step": 560
},
{
"epoch": 0.06350974930362117,
"grad_norm": 283011.96875,
"learning_rate": 9.917404129793512e-06,
"loss": 0.5632,
"step": 570
},
{
"epoch": 0.06462395543175488,
"grad_norm": 695608.625,
"learning_rate": 9.905604719764012e-06,
"loss": 0.4856,
"step": 580
},
{
"epoch": 0.06573816155988858,
"grad_norm": 216388.484375,
"learning_rate": 9.893805309734514e-06,
"loss": 0.5146,
"step": 590
},
{
"epoch": 0.06685236768802229,
"grad_norm": 235648.84375,
"learning_rate": 9.882005899705015e-06,
"loss": 0.508,
"step": 600
},
{
"epoch": 0.06796657381615599,
"grad_norm": 271049.1875,
"learning_rate": 9.870206489675517e-06,
"loss": 0.5011,
"step": 610
},
{
"epoch": 0.0690807799442897,
"grad_norm": 228041.625,
"learning_rate": 9.858407079646018e-06,
"loss": 0.486,
"step": 620
},
{
"epoch": 0.0701949860724234,
"grad_norm": 217752.53125,
"learning_rate": 9.84660766961652e-06,
"loss": 0.5339,
"step": 630
},
{
"epoch": 0.0713091922005571,
"grad_norm": 44443688.0,
"learning_rate": 9.83480825958702e-06,
"loss": 0.4714,
"step": 640
},
{
"epoch": 0.07242339832869081,
"grad_norm": 222843.84375,
"learning_rate": 9.823008849557523e-06,
"loss": 0.4784,
"step": 650
},
{
"epoch": 0.07353760445682452,
"grad_norm": 214836.625,
"learning_rate": 9.811209439528024e-06,
"loss": 0.5541,
"step": 660
},
{
"epoch": 0.07465181058495822,
"grad_norm": 358109.9375,
"learning_rate": 9.799410029498526e-06,
"loss": 0.5086,
"step": 670
},
{
"epoch": 0.07576601671309192,
"grad_norm": 218794.015625,
"learning_rate": 9.787610619469026e-06,
"loss": 0.5288,
"step": 680
},
{
"epoch": 0.07688022284122563,
"grad_norm": 233655.0,
"learning_rate": 9.775811209439529e-06,
"loss": 0.4452,
"step": 690
},
{
"epoch": 0.07799442896935933,
"grad_norm": 314280.78125,
"learning_rate": 9.764011799410031e-06,
"loss": 0.5537,
"step": 700
},
{
"epoch": 0.07910863509749304,
"grad_norm": 256511.78125,
"learning_rate": 9.752212389380532e-06,
"loss": 0.4991,
"step": 710
},
{
"epoch": 0.08022284122562674,
"grad_norm": 227226.875,
"learning_rate": 9.740412979351032e-06,
"loss": 0.5703,
"step": 720
},
{
"epoch": 0.08133704735376045,
"grad_norm": 9358421.0,
"learning_rate": 9.728613569321534e-06,
"loss": 0.5663,
"step": 730
},
{
"epoch": 0.08245125348189415,
"grad_norm": 179491.125,
"learning_rate": 9.716814159292037e-06,
"loss": 0.4795,
"step": 740
},
{
"epoch": 0.08356545961002786,
"grad_norm": 247549.875,
"learning_rate": 9.705014749262537e-06,
"loss": 0.4769,
"step": 750
},
{
"epoch": 0.08467966573816156,
"grad_norm": 237436.28125,
"learning_rate": 9.693215339233038e-06,
"loss": 0.5177,
"step": 760
},
{
"epoch": 0.08579387186629527,
"grad_norm": 201981.609375,
"learning_rate": 9.68141592920354e-06,
"loss": 0.4623,
"step": 770
},
{
"epoch": 0.08690807799442897,
"grad_norm": 260105.90625,
"learning_rate": 9.669616519174042e-06,
"loss": 0.4948,
"step": 780
},
{
"epoch": 0.08802228412256267,
"grad_norm": 205071.875,
"learning_rate": 9.657817109144543e-06,
"loss": 0.4359,
"step": 790
},
{
"epoch": 0.08913649025069638,
"grad_norm": 223004.84375,
"learning_rate": 9.646017699115045e-06,
"loss": 0.5101,
"step": 800
},
{
"epoch": 0.09025069637883008,
"grad_norm": 258591.375,
"learning_rate": 9.634218289085546e-06,
"loss": 0.4504,
"step": 810
},
{
"epoch": 0.09136490250696379,
"grad_norm": 1028943.1875,
"learning_rate": 9.622418879056048e-06,
"loss": 0.4848,
"step": 820
},
{
"epoch": 0.09247910863509749,
"grad_norm": 173033.28125,
"learning_rate": 9.61061946902655e-06,
"loss": 0.467,
"step": 830
},
{
"epoch": 0.0935933147632312,
"grad_norm": 404133.6875,
"learning_rate": 9.598820058997051e-06,
"loss": 0.4951,
"step": 840
},
{
"epoch": 0.0947075208913649,
"grad_norm": 189107.265625,
"learning_rate": 9.587020648967552e-06,
"loss": 0.4616,
"step": 850
},
{
"epoch": 0.0958217270194986,
"grad_norm": 239286.09375,
"learning_rate": 9.575221238938054e-06,
"loss": 0.4294,
"step": 860
},
{
"epoch": 0.09693593314763231,
"grad_norm": 240974.71875,
"learning_rate": 9.563421828908556e-06,
"loss": 0.5314,
"step": 870
},
{
"epoch": 0.09805013927576602,
"grad_norm": 197173.859375,
"learning_rate": 9.551622418879057e-06,
"loss": 0.4884,
"step": 880
},
{
"epoch": 0.09916434540389972,
"grad_norm": 202271.03125,
"learning_rate": 9.539823008849557e-06,
"loss": 0.509,
"step": 890
},
{
"epoch": 0.10027855153203342,
"grad_norm": 16256707.0,
"learning_rate": 9.52802359882006e-06,
"loss": 0.5174,
"step": 900
},
{
"epoch": 0.10139275766016713,
"grad_norm": 243519.046875,
"learning_rate": 9.516224188790562e-06,
"loss": 0.4546,
"step": 910
},
{
"epoch": 0.10250696378830083,
"grad_norm": 250017.1875,
"learning_rate": 9.504424778761062e-06,
"loss": 0.4463,
"step": 920
},
{
"epoch": 0.10362116991643454,
"grad_norm": 213143.515625,
"learning_rate": 9.492625368731565e-06,
"loss": 0.4715,
"step": 930
},
{
"epoch": 0.10473537604456824,
"grad_norm": 195005.078125,
"learning_rate": 9.480825958702065e-06,
"loss": 0.4921,
"step": 940
},
{
"epoch": 0.10584958217270195,
"grad_norm": 256061.859375,
"learning_rate": 9.469026548672568e-06,
"loss": 0.5129,
"step": 950
},
{
"epoch": 0.10696378830083565,
"grad_norm": 223323.75,
"learning_rate": 9.457227138643068e-06,
"loss": 0.4535,
"step": 960
},
{
"epoch": 0.10807799442896936,
"grad_norm": 167112.203125,
"learning_rate": 9.44542772861357e-06,
"loss": 0.4698,
"step": 970
},
{
"epoch": 0.10919220055710306,
"grad_norm": 223777.640625,
"learning_rate": 9.433628318584071e-06,
"loss": 0.4681,
"step": 980
},
{
"epoch": 0.11030640668523677,
"grad_norm": 606988.4375,
"learning_rate": 9.421828908554573e-06,
"loss": 0.5525,
"step": 990
},
{
"epoch": 0.11142061281337047,
"grad_norm": 3532775.25,
"learning_rate": 9.410029498525074e-06,
"loss": 0.5028,
"step": 1000
},
{
"epoch": 0.11253481894150417,
"grad_norm": 234763.8125,
"learning_rate": 9.398230088495576e-06,
"loss": 0.5049,
"step": 1010
},
{
"epoch": 0.11364902506963788,
"grad_norm": 187823.65625,
"learning_rate": 9.386430678466077e-06,
"loss": 0.4735,
"step": 1020
},
{
"epoch": 0.11476323119777158,
"grad_norm": 177220.46875,
"learning_rate": 9.374631268436579e-06,
"loss": 0.453,
"step": 1030
},
{
"epoch": 0.11587743732590529,
"grad_norm": 222257.65625,
"learning_rate": 9.36283185840708e-06,
"loss": 0.5109,
"step": 1040
},
{
"epoch": 0.116991643454039,
"grad_norm": 197302.859375,
"learning_rate": 9.351032448377582e-06,
"loss": 0.521,
"step": 1050
},
{
"epoch": 0.1181058495821727,
"grad_norm": 206071.125,
"learning_rate": 9.339233038348084e-06,
"loss": 0.4685,
"step": 1060
},
{
"epoch": 0.1192200557103064,
"grad_norm": 217538.734375,
"learning_rate": 9.327433628318585e-06,
"loss": 0.4788,
"step": 1070
},
{
"epoch": 0.1203342618384401,
"grad_norm": 175732.96875,
"learning_rate": 9.315634218289085e-06,
"loss": 0.4497,
"step": 1080
},
{
"epoch": 0.12144846796657381,
"grad_norm": 215663.546875,
"learning_rate": 9.303834808259587e-06,
"loss": 0.4988,
"step": 1090
},
{
"epoch": 0.12256267409470752,
"grad_norm": 184019.109375,
"learning_rate": 9.29203539823009e-06,
"loss": 0.4578,
"step": 1100
},
{
"epoch": 0.12367688022284122,
"grad_norm": 215794.1875,
"learning_rate": 9.28023598820059e-06,
"loss": 0.4774,
"step": 1110
},
{
"epoch": 0.12479108635097493,
"grad_norm": 220917.953125,
"learning_rate": 9.268436578171091e-06,
"loss": 0.4589,
"step": 1120
},
{
"epoch": 0.12590529247910864,
"grad_norm": 247261.0625,
"learning_rate": 9.256637168141593e-06,
"loss": 0.4599,
"step": 1130
},
{
"epoch": 0.12701949860724235,
"grad_norm": 210848.296875,
"learning_rate": 9.244837758112095e-06,
"loss": 0.5151,
"step": 1140
},
{
"epoch": 0.12813370473537605,
"grad_norm": 202988.421875,
"learning_rate": 9.233038348082598e-06,
"loss": 0.4908,
"step": 1150
},
{
"epoch": 0.12924791086350976,
"grad_norm": 205763.890625,
"learning_rate": 9.221238938053098e-06,
"loss": 0.4931,
"step": 1160
},
{
"epoch": 0.13036211699164346,
"grad_norm": 228740.640625,
"learning_rate": 9.209439528023599e-06,
"loss": 0.4155,
"step": 1170
},
{
"epoch": 0.13147632311977717,
"grad_norm": 247188.296875,
"learning_rate": 9.197640117994101e-06,
"loss": 0.4654,
"step": 1180
},
{
"epoch": 0.13259052924791087,
"grad_norm": 192504.578125,
"learning_rate": 9.185840707964603e-06,
"loss": 0.4207,
"step": 1190
},
{
"epoch": 0.13370473537604458,
"grad_norm": 163095.4375,
"learning_rate": 9.174041297935104e-06,
"loss": 0.476,
"step": 1200
},
{
"epoch": 0.13481894150417828,
"grad_norm": 206575.328125,
"learning_rate": 9.162241887905605e-06,
"loss": 0.4762,
"step": 1210
},
{
"epoch": 0.13593314763231198,
"grad_norm": 220094.328125,
"learning_rate": 9.150442477876107e-06,
"loss": 0.5028,
"step": 1220
},
{
"epoch": 0.1370473537604457,
"grad_norm": 179223.671875,
"learning_rate": 9.138643067846609e-06,
"loss": 0.4679,
"step": 1230
},
{
"epoch": 0.1381615598885794,
"grad_norm": 159205.375,
"learning_rate": 9.12684365781711e-06,
"loss": 0.4489,
"step": 1240
},
{
"epoch": 0.1392757660167131,
"grad_norm": 239964.90625,
"learning_rate": 9.11504424778761e-06,
"loss": 0.4944,
"step": 1250
},
{
"epoch": 0.1403899721448468,
"grad_norm": 1860553.375,
"learning_rate": 9.103244837758113e-06,
"loss": 0.4799,
"step": 1260
},
{
"epoch": 0.1415041782729805,
"grad_norm": 215564.03125,
"learning_rate": 9.091445427728615e-06,
"loss": 0.4839,
"step": 1270
},
{
"epoch": 0.1426183844011142,
"grad_norm": 165737.609375,
"learning_rate": 9.079646017699115e-06,
"loss": 0.4782,
"step": 1280
},
{
"epoch": 0.14373259052924792,
"grad_norm": 227705.28125,
"learning_rate": 9.067846607669618e-06,
"loss": 0.4569,
"step": 1290
},
{
"epoch": 0.14484679665738162,
"grad_norm": 239591.640625,
"learning_rate": 9.056047197640118e-06,
"loss": 0.4656,
"step": 1300
},
{
"epoch": 0.14596100278551533,
"grad_norm": 228280.484375,
"learning_rate": 9.04424778761062e-06,
"loss": 0.4438,
"step": 1310
},
{
"epoch": 0.14707520891364903,
"grad_norm": 211993.90625,
"learning_rate": 9.032448377581121e-06,
"loss": 0.4636,
"step": 1320
},
{
"epoch": 0.14818941504178273,
"grad_norm": 225698.015625,
"learning_rate": 9.020648967551623e-06,
"loss": 0.445,
"step": 1330
},
{
"epoch": 0.14930362116991644,
"grad_norm": 201884.421875,
"learning_rate": 9.008849557522124e-06,
"loss": 0.4308,
"step": 1340
},
{
"epoch": 0.15041782729805014,
"grad_norm": 177903.1875,
"learning_rate": 8.997050147492626e-06,
"loss": 0.4389,
"step": 1350
},
{
"epoch": 0.15153203342618385,
"grad_norm": 160066.296875,
"learning_rate": 8.985250737463127e-06,
"loss": 0.4577,
"step": 1360
},
{
"epoch": 0.15264623955431755,
"grad_norm": 209065.21875,
"learning_rate": 8.973451327433629e-06,
"loss": 0.4774,
"step": 1370
},
{
"epoch": 0.15376044568245126,
"grad_norm": 174174.75,
"learning_rate": 8.961651917404131e-06,
"loss": 0.4316,
"step": 1380
},
{
"epoch": 0.15487465181058496,
"grad_norm": 207085.890625,
"learning_rate": 8.949852507374632e-06,
"loss": 0.4979,
"step": 1390
},
{
"epoch": 0.15598885793871867,
"grad_norm": 165266.171875,
"learning_rate": 8.938053097345133e-06,
"loss": 0.4555,
"step": 1400
},
{
"epoch": 0.15710306406685237,
"grad_norm": 187202.796875,
"learning_rate": 8.926253687315635e-06,
"loss": 0.4285,
"step": 1410
},
{
"epoch": 0.15821727019498608,
"grad_norm": 241471.515625,
"learning_rate": 8.914454277286137e-06,
"loss": 0.4079,
"step": 1420
},
{
"epoch": 0.15933147632311978,
"grad_norm": 186964.390625,
"learning_rate": 8.902654867256638e-06,
"loss": 0.4669,
"step": 1430
},
{
"epoch": 0.16044568245125349,
"grad_norm": 187827.359375,
"learning_rate": 8.890855457227138e-06,
"loss": 0.469,
"step": 1440
},
{
"epoch": 0.1615598885793872,
"grad_norm": 185109.921875,
"learning_rate": 8.87905604719764e-06,
"loss": 0.4418,
"step": 1450
},
{
"epoch": 0.1626740947075209,
"grad_norm": 163606.359375,
"learning_rate": 8.867256637168143e-06,
"loss": 0.4303,
"step": 1460
},
{
"epoch": 0.1637883008356546,
"grad_norm": 301747.125,
"learning_rate": 8.855457227138643e-06,
"loss": 0.5794,
"step": 1470
},
{
"epoch": 0.1649025069637883,
"grad_norm": 197552.6875,
"learning_rate": 8.843657817109144e-06,
"loss": 0.4615,
"step": 1480
},
{
"epoch": 0.166016713091922,
"grad_norm": 173249.578125,
"learning_rate": 8.831858407079646e-06,
"loss": 0.4011,
"step": 1490
},
{
"epoch": 0.1671309192200557,
"grad_norm": 182496.59375,
"learning_rate": 8.820058997050148e-06,
"loss": 0.4486,
"step": 1500
},
{
"epoch": 0.16824512534818942,
"grad_norm": 210268.046875,
"learning_rate": 8.80825958702065e-06,
"loss": 0.5297,
"step": 1510
},
{
"epoch": 0.16935933147632312,
"grad_norm": 187245.1875,
"learning_rate": 8.796460176991151e-06,
"loss": 0.5058,
"step": 1520
},
{
"epoch": 0.17047353760445683,
"grad_norm": 151204.625,
"learning_rate": 8.784660766961652e-06,
"loss": 0.4313,
"step": 1530
},
{
"epoch": 0.17158774373259053,
"grad_norm": 199678.75,
"learning_rate": 8.772861356932154e-06,
"loss": 0.4923,
"step": 1540
},
{
"epoch": 0.17270194986072424,
"grad_norm": 216835.015625,
"learning_rate": 8.761061946902656e-06,
"loss": 0.4496,
"step": 1550
},
{
"epoch": 0.17381615598885794,
"grad_norm": 220194.328125,
"learning_rate": 8.749262536873157e-06,
"loss": 0.451,
"step": 1560
},
{
"epoch": 0.17493036211699164,
"grad_norm": 257876.03125,
"learning_rate": 8.737463126843658e-06,
"loss": 0.4967,
"step": 1570
},
{
"epoch": 0.17604456824512535,
"grad_norm": 247228.328125,
"learning_rate": 8.72566371681416e-06,
"loss": 0.4585,
"step": 1580
},
{
"epoch": 0.17715877437325905,
"grad_norm": 199109.828125,
"learning_rate": 8.713864306784662e-06,
"loss": 0.4796,
"step": 1590
},
{
"epoch": 0.17827298050139276,
"grad_norm": 168323.46875,
"learning_rate": 8.702064896755163e-06,
"loss": 0.4713,
"step": 1600
},
{
"epoch": 0.17938718662952646,
"grad_norm": 217545.203125,
"learning_rate": 8.690265486725665e-06,
"loss": 0.5065,
"step": 1610
},
{
"epoch": 0.18050139275766017,
"grad_norm": 170031.875,
"learning_rate": 8.678466076696166e-06,
"loss": 0.4523,
"step": 1620
},
{
"epoch": 0.18161559888579387,
"grad_norm": 173914.984375,
"learning_rate": 8.666666666666668e-06,
"loss": 0.4428,
"step": 1630
},
{
"epoch": 0.18272980501392758,
"grad_norm": 328774.90625,
"learning_rate": 8.654867256637168e-06,
"loss": 0.4636,
"step": 1640
},
{
"epoch": 0.18384401114206128,
"grad_norm": 160804.953125,
"learning_rate": 8.64306784660767e-06,
"loss": 0.4819,
"step": 1650
},
{
"epoch": 0.18495821727019499,
"grad_norm": 178090.625,
"learning_rate": 8.631268436578171e-06,
"loss": 0.3609,
"step": 1660
},
{
"epoch": 0.1860724233983287,
"grad_norm": 229922.359375,
"learning_rate": 8.619469026548674e-06,
"loss": 0.544,
"step": 1670
},
{
"epoch": 0.1871866295264624,
"grad_norm": 220525.125,
"learning_rate": 8.607669616519174e-06,
"loss": 0.4629,
"step": 1680
},
{
"epoch": 0.1883008356545961,
"grad_norm": 200167.140625,
"learning_rate": 8.595870206489676e-06,
"loss": 0.4388,
"step": 1690
},
{
"epoch": 0.1894150417827298,
"grad_norm": 162366.34375,
"learning_rate": 8.584070796460177e-06,
"loss": 0.4031,
"step": 1700
},
{
"epoch": 0.1905292479108635,
"grad_norm": 168463.046875,
"learning_rate": 8.57227138643068e-06,
"loss": 0.4573,
"step": 1710
},
{
"epoch": 0.1916434540389972,
"grad_norm": 186504.0625,
"learning_rate": 8.56047197640118e-06,
"loss": 0.4124,
"step": 1720
},
{
"epoch": 0.19275766016713092,
"grad_norm": 172114.3125,
"learning_rate": 8.548672566371682e-06,
"loss": 0.4513,
"step": 1730
},
{
"epoch": 0.19387186629526462,
"grad_norm": 180931.875,
"learning_rate": 8.536873156342184e-06,
"loss": 0.4853,
"step": 1740
},
{
"epoch": 0.19498607242339833,
"grad_norm": 161106.390625,
"learning_rate": 8.525073746312685e-06,
"loss": 0.4688,
"step": 1750
},
{
"epoch": 0.19610027855153203,
"grad_norm": 309931.21875,
"learning_rate": 8.513274336283186e-06,
"loss": 0.4299,
"step": 1760
},
{
"epoch": 0.19721448467966574,
"grad_norm": 187258.265625,
"learning_rate": 8.501474926253688e-06,
"loss": 0.3971,
"step": 1770
},
{
"epoch": 0.19832869080779944,
"grad_norm": 189410.328125,
"learning_rate": 8.48967551622419e-06,
"loss": 0.4437,
"step": 1780
},
{
"epoch": 0.19944289693593314,
"grad_norm": 227058.671875,
"learning_rate": 8.47787610619469e-06,
"loss": 0.4647,
"step": 1790
},
{
"epoch": 0.20055710306406685,
"grad_norm": 212547.15625,
"learning_rate": 8.466076696165191e-06,
"loss": 0.3928,
"step": 1800
},
{
"epoch": 0.20167130919220055,
"grad_norm": 243853.921875,
"learning_rate": 8.454277286135693e-06,
"loss": 0.4864,
"step": 1810
},
{
"epoch": 0.20278551532033426,
"grad_norm": 155955.109375,
"learning_rate": 8.442477876106196e-06,
"loss": 0.402,
"step": 1820
},
{
"epoch": 0.20389972144846796,
"grad_norm": 202079.875,
"learning_rate": 8.430678466076696e-06,
"loss": 0.4701,
"step": 1830
},
{
"epoch": 0.20501392757660167,
"grad_norm": 714132.75,
"learning_rate": 8.418879056047199e-06,
"loss": 0.4677,
"step": 1840
},
{
"epoch": 0.20612813370473537,
"grad_norm": 193706.546875,
"learning_rate": 8.4070796460177e-06,
"loss": 0.4514,
"step": 1850
},
{
"epoch": 0.20724233983286908,
"grad_norm": 185646.90625,
"learning_rate": 8.395280235988201e-06,
"loss": 0.5203,
"step": 1860
},
{
"epoch": 0.20835654596100278,
"grad_norm": 200083.578125,
"learning_rate": 8.383480825958704e-06,
"loss": 0.4839,
"step": 1870
},
{
"epoch": 0.20947075208913649,
"grad_norm": 188911.671875,
"learning_rate": 8.371681415929204e-06,
"loss": 0.4435,
"step": 1880
},
{
"epoch": 0.2105849582172702,
"grad_norm": 181933.046875,
"learning_rate": 8.359882005899705e-06,
"loss": 0.4596,
"step": 1890
},
{
"epoch": 0.2116991643454039,
"grad_norm": 220758.59375,
"learning_rate": 8.348082595870207e-06,
"loss": 0.4758,
"step": 1900
},
{
"epoch": 0.2128133704735376,
"grad_norm": 220606.0,
"learning_rate": 8.33628318584071e-06,
"loss": 0.4422,
"step": 1910
},
{
"epoch": 0.2139275766016713,
"grad_norm": 206306.265625,
"learning_rate": 8.32448377581121e-06,
"loss": 0.379,
"step": 1920
},
{
"epoch": 0.215041782729805,
"grad_norm": 181457.03125,
"learning_rate": 8.31268436578171e-06,
"loss": 0.5551,
"step": 1930
},
{
"epoch": 0.2161559888579387,
"grad_norm": 228577.40625,
"learning_rate": 8.300884955752213e-06,
"loss": 0.4185,
"step": 1940
},
{
"epoch": 0.21727019498607242,
"grad_norm": 213855.328125,
"learning_rate": 8.289085545722715e-06,
"loss": 0.4823,
"step": 1950
},
{
"epoch": 0.21838440111420612,
"grad_norm": 230408.421875,
"learning_rate": 8.277286135693216e-06,
"loss": 0.4441,
"step": 1960
},
{
"epoch": 0.21949860724233983,
"grad_norm": 2382021.0,
"learning_rate": 8.265486725663718e-06,
"loss": 0.4433,
"step": 1970
},
{
"epoch": 0.22061281337047353,
"grad_norm": 205788.359375,
"learning_rate": 8.253687315634219e-06,
"loss": 0.4529,
"step": 1980
},
{
"epoch": 0.22172701949860724,
"grad_norm": 188394.21875,
"learning_rate": 8.24188790560472e-06,
"loss": 0.4691,
"step": 1990
},
{
"epoch": 0.22284122562674094,
"grad_norm": 160699.125,
"learning_rate": 8.230088495575221e-06,
"loss": 0.3891,
"step": 2000
},
{
"epoch": 0.22395543175487465,
"grad_norm": 177135.328125,
"learning_rate": 8.218289085545724e-06,
"loss": 0.4791,
"step": 2010
},
{
"epoch": 0.22506963788300835,
"grad_norm": 176526.390625,
"learning_rate": 8.206489675516224e-06,
"loss": 0.48,
"step": 2020
},
{
"epoch": 0.22618384401114205,
"grad_norm": 176218.734375,
"learning_rate": 8.194690265486727e-06,
"loss": 0.4568,
"step": 2030
},
{
"epoch": 0.22729805013927576,
"grad_norm": 212857.703125,
"learning_rate": 8.182890855457227e-06,
"loss": 0.5493,
"step": 2040
},
{
"epoch": 0.22841225626740946,
"grad_norm": 169878.546875,
"learning_rate": 8.17109144542773e-06,
"loss": 0.5106,
"step": 2050
},
{
"epoch": 0.22952646239554317,
"grad_norm": 191395.078125,
"learning_rate": 8.15929203539823e-06,
"loss": 0.4612,
"step": 2060
},
{
"epoch": 0.23064066852367687,
"grad_norm": 202636.328125,
"learning_rate": 8.147492625368732e-06,
"loss": 0.4216,
"step": 2070
},
{
"epoch": 0.23175487465181058,
"grad_norm": 190389.3125,
"learning_rate": 8.135693215339233e-06,
"loss": 0.47,
"step": 2080
},
{
"epoch": 0.23286908077994428,
"grad_norm": 169839.28125,
"learning_rate": 8.123893805309735e-06,
"loss": 0.3739,
"step": 2090
},
{
"epoch": 0.233983286908078,
"grad_norm": 193950.34375,
"learning_rate": 8.112094395280237e-06,
"loss": 0.4212,
"step": 2100
},
{
"epoch": 0.2350974930362117,
"grad_norm": 244072.90625,
"learning_rate": 8.100294985250738e-06,
"loss": 0.4404,
"step": 2110
},
{
"epoch": 0.2362116991643454,
"grad_norm": 187252.015625,
"learning_rate": 8.088495575221239e-06,
"loss": 0.4064,
"step": 2120
},
{
"epoch": 0.2373259052924791,
"grad_norm": 198252.640625,
"learning_rate": 8.07669616519174e-06,
"loss": 0.4886,
"step": 2130
},
{
"epoch": 0.2384401114206128,
"grad_norm": 166305.921875,
"learning_rate": 8.064896755162243e-06,
"loss": 0.4645,
"step": 2140
},
{
"epoch": 0.2395543175487465,
"grad_norm": 170926.28125,
"learning_rate": 8.053097345132744e-06,
"loss": 0.4812,
"step": 2150
},
{
"epoch": 0.2406685236768802,
"grad_norm": 186730.828125,
"learning_rate": 8.041297935103244e-06,
"loss": 0.4462,
"step": 2160
},
{
"epoch": 0.24178272980501392,
"grad_norm": 166617.265625,
"learning_rate": 8.029498525073746e-06,
"loss": 0.4239,
"step": 2170
},
{
"epoch": 0.24289693593314762,
"grad_norm": 231541.78125,
"learning_rate": 8.017699115044249e-06,
"loss": 0.494,
"step": 2180
},
{
"epoch": 0.24401114206128133,
"grad_norm": 244759.0625,
"learning_rate": 8.005899705014751e-06,
"loss": 0.4842,
"step": 2190
},
{
"epoch": 0.24512534818941503,
"grad_norm": 213151.8125,
"learning_rate": 7.994100294985252e-06,
"loss": 0.4053,
"step": 2200
},
{
"epoch": 0.24623955431754874,
"grad_norm": 177919.25,
"learning_rate": 7.982300884955752e-06,
"loss": 0.4367,
"step": 2210
},
{
"epoch": 0.24735376044568244,
"grad_norm": 189381.546875,
"learning_rate": 7.970501474926254e-06,
"loss": 0.4456,
"step": 2220
},
{
"epoch": 0.24846796657381615,
"grad_norm": 182038.265625,
"learning_rate": 7.958702064896757e-06,
"loss": 0.4758,
"step": 2230
},
{
"epoch": 0.24958217270194985,
"grad_norm": 194923.015625,
"learning_rate": 7.946902654867257e-06,
"loss": 0.4544,
"step": 2240
},
{
"epoch": 0.25069637883008355,
"grad_norm": 213895.234375,
"learning_rate": 7.935103244837758e-06,
"loss": 0.4855,
"step": 2250
},
{
"epoch": 0.2518105849582173,
"grad_norm": 176202.59375,
"learning_rate": 7.92330383480826e-06,
"loss": 0.4411,
"step": 2260
},
{
"epoch": 0.25292479108635096,
"grad_norm": 173177.484375,
"learning_rate": 7.911504424778762e-06,
"loss": 0.408,
"step": 2270
},
{
"epoch": 0.2540389972144847,
"grad_norm": 165414.84375,
"learning_rate": 7.899705014749263e-06,
"loss": 0.4193,
"step": 2280
},
{
"epoch": 0.2551532033426184,
"grad_norm": 187442.96875,
"learning_rate": 7.887905604719764e-06,
"loss": 0.4563,
"step": 2290
},
{
"epoch": 0.2562674094707521,
"grad_norm": 174169.46875,
"learning_rate": 7.876106194690266e-06,
"loss": 0.469,
"step": 2300
},
{
"epoch": 0.2573816155988858,
"grad_norm": 178938.265625,
"learning_rate": 7.864306784660768e-06,
"loss": 0.4368,
"step": 2310
},
{
"epoch": 0.2584958217270195,
"grad_norm": 180309.984375,
"learning_rate": 7.852507374631269e-06,
"loss": 0.3941,
"step": 2320
},
{
"epoch": 0.2596100278551532,
"grad_norm": 163068.09375,
"learning_rate": 7.840707964601771e-06,
"loss": 0.4593,
"step": 2330
},
{
"epoch": 0.2607242339832869,
"grad_norm": 438010.625,
"learning_rate": 7.828908554572272e-06,
"loss": 0.3922,
"step": 2340
},
{
"epoch": 0.2618384401114206,
"grad_norm": 160514.765625,
"learning_rate": 7.817109144542774e-06,
"loss": 0.414,
"step": 2350
},
{
"epoch": 0.26295264623955433,
"grad_norm": 183856.734375,
"learning_rate": 7.805309734513274e-06,
"loss": 0.4401,
"step": 2360
},
{
"epoch": 0.264066852367688,
"grad_norm": 148409.546875,
"learning_rate": 7.793510324483777e-06,
"loss": 0.4098,
"step": 2370
},
{
"epoch": 0.26518105849582174,
"grad_norm": 141153.078125,
"learning_rate": 7.781710914454277e-06,
"loss": 0.4056,
"step": 2380
},
{
"epoch": 0.2662952646239554,
"grad_norm": 180253.265625,
"learning_rate": 7.76991150442478e-06,
"loss": 0.4528,
"step": 2390
},
{
"epoch": 0.26740947075208915,
"grad_norm": 197980.125,
"learning_rate": 7.75811209439528e-06,
"loss": 0.5156,
"step": 2400
},
{
"epoch": 0.26852367688022283,
"grad_norm": 240508.0625,
"learning_rate": 7.746312684365782e-06,
"loss": 0.5328,
"step": 2410
},
{
"epoch": 0.26963788300835656,
"grad_norm": 185720.15625,
"learning_rate": 7.734513274336285e-06,
"loss": 0.4215,
"step": 2420
},
{
"epoch": 0.27075208913649024,
"grad_norm": 202803.9375,
"learning_rate": 7.722713864306785e-06,
"loss": 0.4465,
"step": 2430
},
{
"epoch": 0.27186629526462397,
"grad_norm": 224614.953125,
"learning_rate": 7.710914454277286e-06,
"loss": 0.4462,
"step": 2440
},
{
"epoch": 0.27298050139275765,
"grad_norm": 180010.578125,
"learning_rate": 7.699115044247788e-06,
"loss": 0.4352,
"step": 2450
},
{
"epoch": 0.2740947075208914,
"grad_norm": 163524.25,
"learning_rate": 7.68731563421829e-06,
"loss": 0.4927,
"step": 2460
},
{
"epoch": 0.27520891364902506,
"grad_norm": 207432.9375,
"learning_rate": 7.675516224188791e-06,
"loss": 0.4633,
"step": 2470
},
{
"epoch": 0.2763231197771588,
"grad_norm": 204331.390625,
"learning_rate": 7.663716814159292e-06,
"loss": 0.3958,
"step": 2480
},
{
"epoch": 0.27743732590529246,
"grad_norm": 190513.875,
"learning_rate": 7.651917404129794e-06,
"loss": 0.4297,
"step": 2490
},
{
"epoch": 0.2785515320334262,
"grad_norm": 217766.4375,
"learning_rate": 7.640117994100296e-06,
"loss": 0.5156,
"step": 2500
},
{
"epoch": 0.2796657381615599,
"grad_norm": 154360.34375,
"learning_rate": 7.6283185840707975e-06,
"loss": 0.4699,
"step": 2510
},
{
"epoch": 0.2807799442896936,
"grad_norm": 191062.34375,
"learning_rate": 7.616519174041298e-06,
"loss": 0.4874,
"step": 2520
},
{
"epoch": 0.2818941504178273,
"grad_norm": 172361.28125,
"learning_rate": 7.6047197640117995e-06,
"loss": 0.4375,
"step": 2530
},
{
"epoch": 0.283008356545961,
"grad_norm": 184886.234375,
"learning_rate": 7.592920353982302e-06,
"loss": 0.4439,
"step": 2540
},
{
"epoch": 0.2841225626740947,
"grad_norm": 219691.9375,
"learning_rate": 7.581120943952803e-06,
"loss": 0.4365,
"step": 2550
},
{
"epoch": 0.2852367688022284,
"grad_norm": 196143.03125,
"learning_rate": 7.569321533923304e-06,
"loss": 0.429,
"step": 2560
},
{
"epoch": 0.2863509749303621,
"grad_norm": 208180.90625,
"learning_rate": 7.557522123893806e-06,
"loss": 0.4469,
"step": 2570
},
{
"epoch": 0.28746518105849583,
"grad_norm": 171706.4375,
"learning_rate": 7.5457227138643075e-06,
"loss": 0.3754,
"step": 2580
},
{
"epoch": 0.2885793871866295,
"grad_norm": 191040.453125,
"learning_rate": 7.533923303834809e-06,
"loss": 0.3943,
"step": 2590
},
{
"epoch": 0.28969359331476324,
"grad_norm": 242674.359375,
"learning_rate": 7.5221238938053095e-06,
"loss": 0.4126,
"step": 2600
},
{
"epoch": 0.2908077994428969,
"grad_norm": 176212.265625,
"learning_rate": 7.510324483775812e-06,
"loss": 0.5054,
"step": 2610
},
{
"epoch": 0.29192200557103065,
"grad_norm": 160570.40625,
"learning_rate": 7.498525073746313e-06,
"loss": 0.4517,
"step": 2620
},
{
"epoch": 0.29303621169916433,
"grad_norm": 225097.109375,
"learning_rate": 7.4867256637168155e-06,
"loss": 0.4204,
"step": 2630
},
{
"epoch": 0.29415041782729806,
"grad_norm": 212929.71875,
"learning_rate": 7.474926253687316e-06,
"loss": 0.3645,
"step": 2640
},
{
"epoch": 0.29526462395543174,
"grad_norm": 302795.75,
"learning_rate": 7.4631268436578175e-06,
"loss": 0.4555,
"step": 2650
},
{
"epoch": 0.29637883008356547,
"grad_norm": 274847.28125,
"learning_rate": 7.451327433628319e-06,
"loss": 0.4636,
"step": 2660
},
{
"epoch": 0.29749303621169915,
"grad_norm": 247978.859375,
"learning_rate": 7.439528023598821e-06,
"loss": 0.497,
"step": 2670
},
{
"epoch": 0.2986072423398329,
"grad_norm": 235642.4375,
"learning_rate": 7.427728613569322e-06,
"loss": 0.4798,
"step": 2680
},
{
"epoch": 0.29972144846796656,
"grad_norm": 175856.296875,
"learning_rate": 7.415929203539823e-06,
"loss": 0.3984,
"step": 2690
},
{
"epoch": 0.3008356545961003,
"grad_norm": 164871.171875,
"learning_rate": 7.4041297935103254e-06,
"loss": 0.4505,
"step": 2700
},
{
"epoch": 0.30194986072423396,
"grad_norm": 235865.71875,
"learning_rate": 7.392330383480827e-06,
"loss": 0.4366,
"step": 2710
},
{
"epoch": 0.3030640668523677,
"grad_norm": 210909.359375,
"learning_rate": 7.3805309734513274e-06,
"loss": 0.4519,
"step": 2720
},
{
"epoch": 0.3041782729805014,
"grad_norm": 233273.6875,
"learning_rate": 7.368731563421829e-06,
"loss": 0.5225,
"step": 2730
},
{
"epoch": 0.3052924791086351,
"grad_norm": 182172.390625,
"learning_rate": 7.356932153392331e-06,
"loss": 0.4502,
"step": 2740
},
{
"epoch": 0.3064066852367688,
"grad_norm": 148367.265625,
"learning_rate": 7.3451327433628326e-06,
"loss": 0.4645,
"step": 2750
},
{
"epoch": 0.3075208913649025,
"grad_norm": 153879.859375,
"learning_rate": 7.333333333333333e-06,
"loss": 0.4434,
"step": 2760
},
{
"epoch": 0.3086350974930362,
"grad_norm": 166923.828125,
"learning_rate": 7.321533923303835e-06,
"loss": 0.4275,
"step": 2770
},
{
"epoch": 0.3097493036211699,
"grad_norm": 181019.078125,
"learning_rate": 7.309734513274337e-06,
"loss": 0.4433,
"step": 2780
},
{
"epoch": 0.3108635097493036,
"grad_norm": 174066.703125,
"learning_rate": 7.297935103244838e-06,
"loss": 0.4151,
"step": 2790
},
{
"epoch": 0.31197771587743733,
"grad_norm": 182453.328125,
"learning_rate": 7.28613569321534e-06,
"loss": 0.4393,
"step": 2800
},
{
"epoch": 0.313091922005571,
"grad_norm": 221355.9375,
"learning_rate": 7.274336283185841e-06,
"loss": 0.4509,
"step": 2810
},
{
"epoch": 0.31420612813370474,
"grad_norm": 161181.375,
"learning_rate": 7.2625368731563425e-06,
"loss": 0.4331,
"step": 2820
},
{
"epoch": 0.3153203342618384,
"grad_norm": 180448.265625,
"learning_rate": 7.250737463126845e-06,
"loss": 0.3847,
"step": 2830
},
{
"epoch": 0.31643454038997215,
"grad_norm": 195976.234375,
"learning_rate": 7.238938053097345e-06,
"loss": 0.4582,
"step": 2840
},
{
"epoch": 0.31754874651810583,
"grad_norm": 225505.5,
"learning_rate": 7.227138643067847e-06,
"loss": 0.4278,
"step": 2850
},
{
"epoch": 0.31866295264623956,
"grad_norm": 197629.5,
"learning_rate": 7.215339233038349e-06,
"loss": 0.427,
"step": 2860
},
{
"epoch": 0.31977715877437324,
"grad_norm": 186139.46875,
"learning_rate": 7.2035398230088505e-06,
"loss": 0.4538,
"step": 2870
},
{
"epoch": 0.32089136490250697,
"grad_norm": 242211.59375,
"learning_rate": 7.191740412979351e-06,
"loss": 0.4626,
"step": 2880
},
{
"epoch": 0.32200557103064065,
"grad_norm": 219988.5625,
"learning_rate": 7.1799410029498525e-06,
"loss": 0.4787,
"step": 2890
},
{
"epoch": 0.3231197771587744,
"grad_norm": 248857.53125,
"learning_rate": 7.168141592920355e-06,
"loss": 0.4291,
"step": 2900
},
{
"epoch": 0.32423398328690806,
"grad_norm": 189503.140625,
"learning_rate": 7.156342182890856e-06,
"loss": 0.4115,
"step": 2910
},
{
"epoch": 0.3253481894150418,
"grad_norm": 187285.109375,
"learning_rate": 7.144542772861357e-06,
"loss": 0.409,
"step": 2920
},
{
"epoch": 0.32646239554317547,
"grad_norm": 165844.203125,
"learning_rate": 7.132743362831859e-06,
"loss": 0.4235,
"step": 2930
},
{
"epoch": 0.3275766016713092,
"grad_norm": 143210.59375,
"learning_rate": 7.1209439528023605e-06,
"loss": 0.4236,
"step": 2940
},
{
"epoch": 0.3286908077994429,
"grad_norm": 189349.671875,
"learning_rate": 7.109144542772862e-06,
"loss": 0.4673,
"step": 2950
},
{
"epoch": 0.3298050139275766,
"grad_norm": 200229.21875,
"learning_rate": 7.0973451327433625e-06,
"loss": 0.4638,
"step": 2960
},
{
"epoch": 0.3309192200557103,
"grad_norm": 211443.546875,
"learning_rate": 7.085545722713865e-06,
"loss": 0.4817,
"step": 2970
},
{
"epoch": 0.332033426183844,
"grad_norm": 163127.796875,
"learning_rate": 7.073746312684366e-06,
"loss": 0.4673,
"step": 2980
},
{
"epoch": 0.3331476323119777,
"grad_norm": 204851.25,
"learning_rate": 7.0619469026548685e-06,
"loss": 0.3908,
"step": 2990
},
{
"epoch": 0.3342618384401114,
"grad_norm": 197754.03125,
"learning_rate": 7.050147492625369e-06,
"loss": 0.4242,
"step": 3000
},
{
"epoch": 0.3353760445682451,
"grad_norm": 169182.234375,
"learning_rate": 7.0383480825958705e-06,
"loss": 0.3986,
"step": 3010
},
{
"epoch": 0.33649025069637883,
"grad_norm": 203432.265625,
"learning_rate": 7.026548672566372e-06,
"loss": 0.4509,
"step": 3020
},
{
"epoch": 0.3376044568245125,
"grad_norm": 151958.265625,
"learning_rate": 7.014749262536874e-06,
"loss": 0.4076,
"step": 3030
},
{
"epoch": 0.33871866295264624,
"grad_norm": 175936.0,
"learning_rate": 7.002949852507375e-06,
"loss": 0.4503,
"step": 3040
},
{
"epoch": 0.3398328690807799,
"grad_norm": 235924.75,
"learning_rate": 6.991150442477876e-06,
"loss": 0.5033,
"step": 3050
},
{
"epoch": 0.34094707520891365,
"grad_norm": 233674.546875,
"learning_rate": 6.9793510324483784e-06,
"loss": 0.4037,
"step": 3060
},
{
"epoch": 0.34206128133704733,
"grad_norm": 219473.421875,
"learning_rate": 6.96755162241888e-06,
"loss": 0.5048,
"step": 3070
},
{
"epoch": 0.34317548746518106,
"grad_norm": 174127.296875,
"learning_rate": 6.9557522123893805e-06,
"loss": 0.4419,
"step": 3080
},
{
"epoch": 0.34428969359331474,
"grad_norm": 192749.65625,
"learning_rate": 6.943952802359883e-06,
"loss": 0.4122,
"step": 3090
},
{
"epoch": 0.34540389972144847,
"grad_norm": 162299.4375,
"learning_rate": 6.932153392330384e-06,
"loss": 0.3922,
"step": 3100
},
{
"epoch": 0.34651810584958215,
"grad_norm": 164610.609375,
"learning_rate": 6.9203539823008856e-06,
"loss": 0.4145,
"step": 3110
},
{
"epoch": 0.3476323119777159,
"grad_norm": 216083.84375,
"learning_rate": 6.908554572271386e-06,
"loss": 0.4466,
"step": 3120
},
{
"epoch": 0.34874651810584956,
"grad_norm": 164565.40625,
"learning_rate": 6.8967551622418884e-06,
"loss": 0.4169,
"step": 3130
},
{
"epoch": 0.3498607242339833,
"grad_norm": 186178.03125,
"learning_rate": 6.88495575221239e-06,
"loss": 0.4237,
"step": 3140
},
{
"epoch": 0.35097493036211697,
"grad_norm": 214979.140625,
"learning_rate": 6.873156342182892e-06,
"loss": 0.4219,
"step": 3150
},
{
"epoch": 0.3520891364902507,
"grad_norm": 189355.046875,
"learning_rate": 6.861356932153393e-06,
"loss": 0.4277,
"step": 3160
},
{
"epoch": 0.3532033426183844,
"grad_norm": 146298.015625,
"learning_rate": 6.849557522123894e-06,
"loss": 0.3793,
"step": 3170
},
{
"epoch": 0.3543175487465181,
"grad_norm": 161813.9375,
"learning_rate": 6.8377581120943956e-06,
"loss": 0.4261,
"step": 3180
},
{
"epoch": 0.3554317548746518,
"grad_norm": 190914.546875,
"learning_rate": 6.825958702064898e-06,
"loss": 0.4386,
"step": 3190
},
{
"epoch": 0.3565459610027855,
"grad_norm": 192714.109375,
"learning_rate": 6.814159292035398e-06,
"loss": 0.3358,
"step": 3200
},
{
"epoch": 0.3576601671309192,
"grad_norm": 230763.09375,
"learning_rate": 6.8023598820059e-06,
"loss": 0.395,
"step": 3210
},
{
"epoch": 0.3587743732590529,
"grad_norm": 205893.734375,
"learning_rate": 6.790560471976402e-06,
"loss": 0.3984,
"step": 3220
},
{
"epoch": 0.3598885793871866,
"grad_norm": 197603.359375,
"learning_rate": 6.7787610619469035e-06,
"loss": 0.4227,
"step": 3230
},
{
"epoch": 0.36100278551532033,
"grad_norm": 201566.0625,
"learning_rate": 6.766961651917404e-06,
"loss": 0.4909,
"step": 3240
},
{
"epoch": 0.362116991643454,
"grad_norm": 198001.234375,
"learning_rate": 6.7551622418879055e-06,
"loss": 0.4225,
"step": 3250
},
{
"epoch": 0.36323119777158774,
"grad_norm": 202295.75,
"learning_rate": 6.743362831858408e-06,
"loss": 0.4214,
"step": 3260
},
{
"epoch": 0.3643454038997215,
"grad_norm": 232146.796875,
"learning_rate": 6.731563421828909e-06,
"loss": 0.4536,
"step": 3270
},
{
"epoch": 0.36545961002785515,
"grad_norm": 176061.234375,
"learning_rate": 6.71976401179941e-06,
"loss": 0.565,
"step": 3280
},
{
"epoch": 0.3665738161559889,
"grad_norm": 186836.6875,
"learning_rate": 6.707964601769912e-06,
"loss": 0.4747,
"step": 3290
},
{
"epoch": 0.36768802228412256,
"grad_norm": 209567.4375,
"learning_rate": 6.6961651917404135e-06,
"loss": 0.5102,
"step": 3300
},
{
"epoch": 0.3688022284122563,
"grad_norm": 168502.078125,
"learning_rate": 6.684365781710915e-06,
"loss": 0.4422,
"step": 3310
},
{
"epoch": 0.36991643454038997,
"grad_norm": 140973.546875,
"learning_rate": 6.672566371681416e-06,
"loss": 0.4106,
"step": 3320
},
{
"epoch": 0.3710306406685237,
"grad_norm": 198291.03125,
"learning_rate": 6.660766961651918e-06,
"loss": 0.4064,
"step": 3330
},
{
"epoch": 0.3721448467966574,
"grad_norm": 223377.78125,
"learning_rate": 6.648967551622419e-06,
"loss": 0.4166,
"step": 3340
},
{
"epoch": 0.3732590529247911,
"grad_norm": 153720.453125,
"learning_rate": 6.6371681415929215e-06,
"loss": 0.4166,
"step": 3350
},
{
"epoch": 0.3743732590529248,
"grad_norm": 201311.625,
"learning_rate": 6.625368731563422e-06,
"loss": 0.4366,
"step": 3360
},
{
"epoch": 0.3754874651810585,
"grad_norm": 231880.140625,
"learning_rate": 6.6135693215339235e-06,
"loss": 0.4424,
"step": 3370
},
{
"epoch": 0.3766016713091922,
"grad_norm": 215704.15625,
"learning_rate": 6.601769911504426e-06,
"loss": 0.4231,
"step": 3380
},
{
"epoch": 0.37771587743732593,
"grad_norm": 218725.96875,
"learning_rate": 6.589970501474927e-06,
"loss": 0.4443,
"step": 3390
},
{
"epoch": 0.3788300835654596,
"grad_norm": 172654.328125,
"learning_rate": 6.578171091445428e-06,
"loss": 0.4628,
"step": 3400
},
{
"epoch": 0.37994428969359334,
"grad_norm": 255787.09375,
"learning_rate": 6.566371681415929e-06,
"loss": 0.3853,
"step": 3410
},
{
"epoch": 0.381058495821727,
"grad_norm": 214523.421875,
"learning_rate": 6.5545722713864315e-06,
"loss": 0.3911,
"step": 3420
},
{
"epoch": 0.38217270194986075,
"grad_norm": 171143.546875,
"learning_rate": 6.542772861356933e-06,
"loss": 0.3911,
"step": 3430
},
{
"epoch": 0.3832869080779944,
"grad_norm": 208521.5625,
"learning_rate": 6.5309734513274335e-06,
"loss": 0.4602,
"step": 3440
},
{
"epoch": 0.38440111420612816,
"grad_norm": 164048.6875,
"learning_rate": 6.519174041297936e-06,
"loss": 0.3663,
"step": 3450
},
{
"epoch": 0.38551532033426184,
"grad_norm": 175852.65625,
"learning_rate": 6.507374631268437e-06,
"loss": 0.4219,
"step": 3460
},
{
"epoch": 0.38662952646239557,
"grad_norm": 155706.46875,
"learning_rate": 6.495575221238939e-06,
"loss": 0.4241,
"step": 3470
},
{
"epoch": 0.38774373259052924,
"grad_norm": 208711.96875,
"learning_rate": 6.483775811209439e-06,
"loss": 0.4556,
"step": 3480
},
{
"epoch": 0.388857938718663,
"grad_norm": 190895.765625,
"learning_rate": 6.4719764011799414e-06,
"loss": 0.4824,
"step": 3490
},
{
"epoch": 0.38997214484679665,
"grad_norm": 190407.703125,
"learning_rate": 6.460176991150443e-06,
"loss": 0.3921,
"step": 3500
},
{
"epoch": 0.3910863509749304,
"grad_norm": 189109.84375,
"learning_rate": 6.448377581120945e-06,
"loss": 0.4369,
"step": 3510
},
{
"epoch": 0.39220055710306406,
"grad_norm": 164608.40625,
"learning_rate": 6.436578171091446e-06,
"loss": 0.4079,
"step": 3520
},
{
"epoch": 0.3933147632311978,
"grad_norm": 195097.40625,
"learning_rate": 6.424778761061947e-06,
"loss": 0.439,
"step": 3530
},
{
"epoch": 0.39442896935933147,
"grad_norm": 161624.625,
"learning_rate": 6.4129793510324486e-06,
"loss": 0.4484,
"step": 3540
},
{
"epoch": 0.3955431754874652,
"grad_norm": 183812.625,
"learning_rate": 6.401179941002951e-06,
"loss": 0.4078,
"step": 3550
},
{
"epoch": 0.3966573816155989,
"grad_norm": 160959.0625,
"learning_rate": 6.389380530973451e-06,
"loss": 0.4448,
"step": 3560
},
{
"epoch": 0.3977715877437326,
"grad_norm": 172894.578125,
"learning_rate": 6.377581120943953e-06,
"loss": 0.4315,
"step": 3570
},
{
"epoch": 0.3988857938718663,
"grad_norm": 167681.3125,
"learning_rate": 6.365781710914455e-06,
"loss": 0.5089,
"step": 3580
},
{
"epoch": 0.4,
"grad_norm": 210563.953125,
"learning_rate": 6.3539823008849565e-06,
"loss": 0.4593,
"step": 3590
},
{
"epoch": 0.4011142061281337,
"grad_norm": 190382.875,
"learning_rate": 6.342182890855457e-06,
"loss": 0.4257,
"step": 3600
},
{
"epoch": 0.40222841225626743,
"grad_norm": 146628.546875,
"learning_rate": 6.330383480825959e-06,
"loss": 0.4464,
"step": 3610
},
{
"epoch": 0.4033426183844011,
"grad_norm": 185890.65625,
"learning_rate": 6.318584070796461e-06,
"loss": 0.4525,
"step": 3620
},
{
"epoch": 0.40445682451253484,
"grad_norm": 168210.1875,
"learning_rate": 6.306784660766962e-06,
"loss": 0.4163,
"step": 3630
},
{
"epoch": 0.4055710306406685,
"grad_norm": 186928.890625,
"learning_rate": 6.294985250737463e-06,
"loss": 0.4371,
"step": 3640
},
{
"epoch": 0.40668523676880225,
"grad_norm": 156051.6875,
"learning_rate": 6.283185840707965e-06,
"loss": 0.3911,
"step": 3650
},
{
"epoch": 0.4077994428969359,
"grad_norm": 199442.53125,
"learning_rate": 6.2713864306784665e-06,
"loss": 0.459,
"step": 3660
},
{
"epoch": 0.40891364902506966,
"grad_norm": 545817.75,
"learning_rate": 6.259587020648969e-06,
"loss": 0.4288,
"step": 3670
},
{
"epoch": 0.41002785515320334,
"grad_norm": 167921.96875,
"learning_rate": 6.247787610619469e-06,
"loss": 0.4002,
"step": 3680
},
{
"epoch": 0.41114206128133707,
"grad_norm": 150013.328125,
"learning_rate": 6.235988200589971e-06,
"loss": 0.3893,
"step": 3690
},
{
"epoch": 0.41225626740947074,
"grad_norm": 197072.15625,
"learning_rate": 6.224188790560472e-06,
"loss": 0.4764,
"step": 3700
},
{
"epoch": 0.4133704735376045,
"grad_norm": 173856.421875,
"learning_rate": 6.2123893805309745e-06,
"loss": 0.4133,
"step": 3710
},
{
"epoch": 0.41448467966573815,
"grad_norm": 189156.15625,
"learning_rate": 6.200589970501475e-06,
"loss": 0.4095,
"step": 3720
},
{
"epoch": 0.4155988857938719,
"grad_norm": 154648.484375,
"learning_rate": 6.1887905604719765e-06,
"loss": 0.4359,
"step": 3730
},
{
"epoch": 0.41671309192200556,
"grad_norm": 204105.28125,
"learning_rate": 6.176991150442479e-06,
"loss": 0.4027,
"step": 3740
},
{
"epoch": 0.4178272980501393,
"grad_norm": 143494.203125,
"learning_rate": 6.16519174041298e-06,
"loss": 0.4136,
"step": 3750
},
{
"epoch": 0.41894150417827297,
"grad_norm": 207321.671875,
"learning_rate": 6.153392330383481e-06,
"loss": 0.4809,
"step": 3760
},
{
"epoch": 0.4200557103064067,
"grad_norm": 188413.15625,
"learning_rate": 6.141592920353982e-06,
"loss": 0.4515,
"step": 3770
},
{
"epoch": 0.4211699164345404,
"grad_norm": 154952.078125,
"learning_rate": 6.1297935103244845e-06,
"loss": 0.3791,
"step": 3780
},
{
"epoch": 0.4222841225626741,
"grad_norm": 166050.15625,
"learning_rate": 6.117994100294986e-06,
"loss": 0.4097,
"step": 3790
},
{
"epoch": 0.4233983286908078,
"grad_norm": 182377.765625,
"learning_rate": 6.1061946902654865e-06,
"loss": 0.3749,
"step": 3800
},
{
"epoch": 0.4245125348189415,
"grad_norm": 208522.78125,
"learning_rate": 6.094395280235989e-06,
"loss": 0.3636,
"step": 3810
},
{
"epoch": 0.4256267409470752,
"grad_norm": 182704.5,
"learning_rate": 6.08259587020649e-06,
"loss": 0.4477,
"step": 3820
},
{
"epoch": 0.42674094707520893,
"grad_norm": 163660.265625,
"learning_rate": 6.070796460176992e-06,
"loss": 0.4134,
"step": 3830
},
{
"epoch": 0.4278551532033426,
"grad_norm": 191465.125,
"learning_rate": 6.058997050147493e-06,
"loss": 0.4783,
"step": 3840
},
{
"epoch": 0.42896935933147634,
"grad_norm": 185075.90625,
"learning_rate": 6.0471976401179945e-06,
"loss": 0.4293,
"step": 3850
},
{
"epoch": 0.43008356545961,
"grad_norm": 221350.125,
"learning_rate": 6.035398230088496e-06,
"loss": 0.3962,
"step": 3860
},
{
"epoch": 0.43119777158774375,
"grad_norm": 175274.96875,
"learning_rate": 6.023598820058998e-06,
"loss": 0.4292,
"step": 3870
},
{
"epoch": 0.4323119777158774,
"grad_norm": 206309.9375,
"learning_rate": 6.011799410029499e-06,
"loss": 0.4278,
"step": 3880
},
{
"epoch": 0.43342618384401116,
"grad_norm": 184827.65625,
"learning_rate": 6e-06,
"loss": 0.4033,
"step": 3890
},
{
"epoch": 0.43454038997214484,
"grad_norm": 192188.78125,
"learning_rate": 5.9882005899705024e-06,
"loss": 0.4204,
"step": 3900
},
{
"epoch": 0.43565459610027857,
"grad_norm": 171022.75,
"learning_rate": 5.976401179941004e-06,
"loss": 0.443,
"step": 3910
},
{
"epoch": 0.43676880222841225,
"grad_norm": 174545.4375,
"learning_rate": 5.9646017699115044e-06,
"loss": 0.3965,
"step": 3920
},
{
"epoch": 0.437883008356546,
"grad_norm": 196021.375,
"learning_rate": 5.952802359882006e-06,
"loss": 0.4329,
"step": 3930
},
{
"epoch": 0.43899721448467965,
"grad_norm": 201056.484375,
"learning_rate": 5.941002949852508e-06,
"loss": 0.4765,
"step": 3940
},
{
"epoch": 0.4401114206128134,
"grad_norm": 169847.234375,
"learning_rate": 5.9292035398230096e-06,
"loss": 0.3634,
"step": 3950
},
{
"epoch": 0.44122562674094706,
"grad_norm": 164063.359375,
"learning_rate": 5.91740412979351e-06,
"loss": 0.4535,
"step": 3960
},
{
"epoch": 0.4423398328690808,
"grad_norm": 165311.09375,
"learning_rate": 5.905604719764012e-06,
"loss": 0.3972,
"step": 3970
},
{
"epoch": 0.4434540389972145,
"grad_norm": 154916.328125,
"learning_rate": 5.893805309734514e-06,
"loss": 0.4292,
"step": 3980
},
{
"epoch": 0.4445682451253482,
"grad_norm": 242260.9375,
"learning_rate": 5.882005899705015e-06,
"loss": 0.4778,
"step": 3990
},
{
"epoch": 0.4456824512534819,
"grad_norm": 182793.953125,
"learning_rate": 5.870206489675516e-06,
"loss": 0.411,
"step": 4000
},
{
"epoch": 0.4467966573816156,
"grad_norm": 162919.3125,
"learning_rate": 5.858407079646018e-06,
"loss": 0.471,
"step": 4010
},
{
"epoch": 0.4479108635097493,
"grad_norm": 195205.6875,
"learning_rate": 5.8466076696165195e-06,
"loss": 0.4646,
"step": 4020
},
{
"epoch": 0.449025069637883,
"grad_norm": 184059.46875,
"learning_rate": 5.834808259587022e-06,
"loss": 0.4894,
"step": 4030
},
{
"epoch": 0.4501392757660167,
"grad_norm": 160941.640625,
"learning_rate": 5.823008849557522e-06,
"loss": 0.3985,
"step": 4040
},
{
"epoch": 0.45125348189415043,
"grad_norm": 173049.71875,
"learning_rate": 5.811209439528024e-06,
"loss": 0.3436,
"step": 4050
},
{
"epoch": 0.4523676880222841,
"grad_norm": 174724.65625,
"learning_rate": 5.799410029498525e-06,
"loss": 0.3968,
"step": 4060
},
{
"epoch": 0.45348189415041784,
"grad_norm": 225425.4375,
"learning_rate": 5.7876106194690275e-06,
"loss": 0.4066,
"step": 4070
},
{
"epoch": 0.4545961002785515,
"grad_norm": 165040.125,
"learning_rate": 5.775811209439528e-06,
"loss": 0.4007,
"step": 4080
},
{
"epoch": 0.45571030640668525,
"grad_norm": 163311.609375,
"learning_rate": 5.7640117994100295e-06,
"loss": 0.3548,
"step": 4090
},
{
"epoch": 0.4568245125348189,
"grad_norm": 165943.046875,
"learning_rate": 5.752212389380532e-06,
"loss": 0.4311,
"step": 4100
},
{
"epoch": 0.45793871866295266,
"grad_norm": 150324.8125,
"learning_rate": 5.740412979351033e-06,
"loss": 0.4099,
"step": 4110
},
{
"epoch": 0.45905292479108634,
"grad_norm": 172383.046875,
"learning_rate": 5.728613569321534e-06,
"loss": 0.4104,
"step": 4120
},
{
"epoch": 0.46016713091922007,
"grad_norm": 183318.265625,
"learning_rate": 5.716814159292036e-06,
"loss": 0.4281,
"step": 4130
},
{
"epoch": 0.46128133704735375,
"grad_norm": 152074.421875,
"learning_rate": 5.7050147492625375e-06,
"loss": 0.4459,
"step": 4140
},
{
"epoch": 0.4623955431754875,
"grad_norm": 191926.703125,
"learning_rate": 5.693215339233039e-06,
"loss": 0.3893,
"step": 4150
},
{
"epoch": 0.46350974930362115,
"grad_norm": 163396.796875,
"learning_rate": 5.6814159292035395e-06,
"loss": 0.4141,
"step": 4160
},
{
"epoch": 0.4646239554317549,
"grad_norm": 192465.65625,
"learning_rate": 5.669616519174042e-06,
"loss": 0.4562,
"step": 4170
},
{
"epoch": 0.46573816155988856,
"grad_norm": 218218.90625,
"learning_rate": 5.657817109144543e-06,
"loss": 0.4242,
"step": 4180
},
{
"epoch": 0.4668523676880223,
"grad_norm": 202565.15625,
"learning_rate": 5.6460176991150455e-06,
"loss": 0.4162,
"step": 4190
},
{
"epoch": 0.467966573816156,
"grad_norm": 173776.546875,
"learning_rate": 5.634218289085546e-06,
"loss": 0.3934,
"step": 4200
},
{
"epoch": 0.4690807799442897,
"grad_norm": 176373.203125,
"learning_rate": 5.6224188790560475e-06,
"loss": 0.4091,
"step": 4210
},
{
"epoch": 0.4701949860724234,
"grad_norm": 212039.5625,
"learning_rate": 5.610619469026549e-06,
"loss": 0.4188,
"step": 4220
},
{
"epoch": 0.4713091922005571,
"grad_norm": 197447.328125,
"learning_rate": 5.598820058997051e-06,
"loss": 0.3962,
"step": 4230
},
{
"epoch": 0.4724233983286908,
"grad_norm": 173795.234375,
"learning_rate": 5.587020648967552e-06,
"loss": 0.3441,
"step": 4240
},
{
"epoch": 0.4735376044568245,
"grad_norm": 210427.46875,
"learning_rate": 5.575221238938053e-06,
"loss": 0.409,
"step": 4250
},
{
"epoch": 0.4746518105849582,
"grad_norm": 185594.359375,
"learning_rate": 5.5634218289085554e-06,
"loss": 0.4243,
"step": 4260
},
{
"epoch": 0.47576601671309193,
"grad_norm": 160850.046875,
"learning_rate": 5.551622418879057e-06,
"loss": 0.4024,
"step": 4270
},
{
"epoch": 0.4768802228412256,
"grad_norm": 165540.953125,
"learning_rate": 5.5398230088495574e-06,
"loss": 0.4727,
"step": 4280
},
{
"epoch": 0.47799442896935934,
"grad_norm": 207796.59375,
"learning_rate": 5.528023598820059e-06,
"loss": 0.4451,
"step": 4290
},
{
"epoch": 0.479108635097493,
"grad_norm": 200276.796875,
"learning_rate": 5.516224188790561e-06,
"loss": 0.5015,
"step": 4300
},
{
"epoch": 0.48022284122562675,
"grad_norm": 177270.5625,
"learning_rate": 5.5044247787610626e-06,
"loss": 0.4163,
"step": 4310
},
{
"epoch": 0.4813370473537604,
"grad_norm": 169286.03125,
"learning_rate": 5.492625368731563e-06,
"loss": 0.4025,
"step": 4320
},
{
"epoch": 0.48245125348189416,
"grad_norm": 159905.65625,
"learning_rate": 5.480825958702065e-06,
"loss": 0.4096,
"step": 4330
},
{
"epoch": 0.48356545961002784,
"grad_norm": 168440.375,
"learning_rate": 5.469026548672567e-06,
"loss": 0.4202,
"step": 4340
},
{
"epoch": 0.48467966573816157,
"grad_norm": 168913.28125,
"learning_rate": 5.457227138643068e-06,
"loss": 0.3979,
"step": 4350
},
{
"epoch": 0.48579387186629525,
"grad_norm": 165448.90625,
"learning_rate": 5.44542772861357e-06,
"loss": 0.4168,
"step": 4360
},
{
"epoch": 0.486908077994429,
"grad_norm": 183158.03125,
"learning_rate": 5.433628318584071e-06,
"loss": 0.3508,
"step": 4370
},
{
"epoch": 0.48802228412256266,
"grad_norm": 194381.09375,
"learning_rate": 5.4218289085545725e-06,
"loss": 0.4704,
"step": 4380
},
{
"epoch": 0.4891364902506964,
"grad_norm": 179988.703125,
"learning_rate": 5.410029498525075e-06,
"loss": 0.3706,
"step": 4390
},
{
"epoch": 0.49025069637883006,
"grad_norm": 150438.375,
"learning_rate": 5.398230088495575e-06,
"loss": 0.4081,
"step": 4400
},
{
"epoch": 0.4913649025069638,
"grad_norm": 151454.5625,
"learning_rate": 5.386430678466077e-06,
"loss": 0.409,
"step": 4410
},
{
"epoch": 0.4924791086350975,
"grad_norm": 183632.453125,
"learning_rate": 5.374631268436579e-06,
"loss": 0.454,
"step": 4420
},
{
"epoch": 0.4935933147632312,
"grad_norm": 159092.578125,
"learning_rate": 5.3628318584070805e-06,
"loss": 0.4194,
"step": 4430
},
{
"epoch": 0.4947075208913649,
"grad_norm": 165830.4375,
"learning_rate": 5.351032448377581e-06,
"loss": 0.3769,
"step": 4440
},
{
"epoch": 0.4958217270194986,
"grad_norm": 155446.09375,
"learning_rate": 5.3392330383480825e-06,
"loss": 0.3807,
"step": 4450
},
{
"epoch": 0.4969359331476323,
"grad_norm": 154877.59375,
"learning_rate": 5.327433628318585e-06,
"loss": 0.4705,
"step": 4460
},
{
"epoch": 0.498050139275766,
"grad_norm": 248433.34375,
"learning_rate": 5.315634218289086e-06,
"loss": 0.4605,
"step": 4470
},
{
"epoch": 0.4991643454038997,
"grad_norm": 173858.453125,
"learning_rate": 5.303834808259587e-06,
"loss": 0.4343,
"step": 4480
},
{
"epoch": 0.5002785515320334,
"grad_norm": 173157.5625,
"learning_rate": 5.292035398230089e-06,
"loss": 0.4337,
"step": 4490
},
{
"epoch": 0.5013927576601671,
"grad_norm": 173380.3125,
"learning_rate": 5.2802359882005905e-06,
"loss": 0.3652,
"step": 4500
},
{
"epoch": 0.5025069637883008,
"grad_norm": 173306.734375,
"learning_rate": 5.268436578171092e-06,
"loss": 0.4024,
"step": 4510
},
{
"epoch": 0.5036211699164346,
"grad_norm": 162839.046875,
"learning_rate": 5.2566371681415925e-06,
"loss": 0.424,
"step": 4520
},
{
"epoch": 0.5047353760445682,
"grad_norm": 188280.625,
"learning_rate": 5.244837758112095e-06,
"loss": 0.4393,
"step": 4530
},
{
"epoch": 0.5058495821727019,
"grad_norm": 151708.1875,
"learning_rate": 5.233038348082596e-06,
"loss": 0.3599,
"step": 4540
},
{
"epoch": 0.5069637883008357,
"grad_norm": 212805.03125,
"learning_rate": 5.2212389380530985e-06,
"loss": 0.4578,
"step": 4550
},
{
"epoch": 0.5080779944289694,
"grad_norm": 157613.765625,
"learning_rate": 5.209439528023599e-06,
"loss": 0.4536,
"step": 4560
},
{
"epoch": 0.509192200557103,
"grad_norm": 183449.875,
"learning_rate": 5.1976401179941005e-06,
"loss": 0.4293,
"step": 4570
},
{
"epoch": 0.5103064066852367,
"grad_norm": 195998.046875,
"learning_rate": 5.185840707964602e-06,
"loss": 0.4089,
"step": 4580
},
{
"epoch": 0.5114206128133705,
"grad_norm": 168456.9375,
"learning_rate": 5.174041297935104e-06,
"loss": 0.4112,
"step": 4590
},
{
"epoch": 0.5125348189415042,
"grad_norm": 151747.875,
"learning_rate": 5.162241887905605e-06,
"loss": 0.4146,
"step": 4600
},
{
"epoch": 0.5136490250696378,
"grad_norm": 181618.5,
"learning_rate": 5.150442477876106e-06,
"loss": 0.4189,
"step": 4610
},
{
"epoch": 0.5147632311977716,
"grad_norm": 193827.984375,
"learning_rate": 5.1386430678466084e-06,
"loss": 0.4167,
"step": 4620
},
{
"epoch": 0.5158774373259053,
"grad_norm": 171558.84375,
"learning_rate": 5.12684365781711e-06,
"loss": 0.4338,
"step": 4630
},
{
"epoch": 0.516991643454039,
"grad_norm": 175443.65625,
"learning_rate": 5.1150442477876105e-06,
"loss": 0.3747,
"step": 4640
},
{
"epoch": 0.5181058495821727,
"grad_norm": 171425.59375,
"learning_rate": 5.103244837758113e-06,
"loss": 0.4327,
"step": 4650
},
{
"epoch": 0.5192200557103064,
"grad_norm": 179066.078125,
"learning_rate": 5.091445427728614e-06,
"loss": 0.4302,
"step": 4660
},
{
"epoch": 0.5203342618384401,
"grad_norm": 174975.703125,
"learning_rate": 5.0796460176991156e-06,
"loss": 0.4329,
"step": 4670
},
{
"epoch": 0.5214484679665738,
"grad_norm": 195037.359375,
"learning_rate": 5.067846607669616e-06,
"loss": 0.4322,
"step": 4680
},
{
"epoch": 0.5225626740947075,
"grad_norm": 175203.125,
"learning_rate": 5.0560471976401184e-06,
"loss": 0.4506,
"step": 4690
},
{
"epoch": 0.5236768802228412,
"grad_norm": 172270.53125,
"learning_rate": 5.04424778761062e-06,
"loss": 0.3653,
"step": 4700
},
{
"epoch": 0.5247910863509749,
"grad_norm": 203312.03125,
"learning_rate": 5.032448377581122e-06,
"loss": 0.4235,
"step": 4710
},
{
"epoch": 0.5259052924791087,
"grad_norm": 193338.390625,
"learning_rate": 5.020648967551623e-06,
"loss": 0.497,
"step": 4720
},
{
"epoch": 0.5270194986072423,
"grad_norm": 181869.234375,
"learning_rate": 5.008849557522124e-06,
"loss": 0.4106,
"step": 4730
},
{
"epoch": 0.528133704735376,
"grad_norm": 185743.0,
"learning_rate": 4.9970501474926256e-06,
"loss": 0.4143,
"step": 4740
},
{
"epoch": 0.5292479108635098,
"grad_norm": 187086.46875,
"learning_rate": 4.985250737463127e-06,
"loss": 0.4683,
"step": 4750
},
{
"epoch": 0.5303621169916435,
"grad_norm": 203731.9375,
"learning_rate": 4.973451327433628e-06,
"loss": 0.4548,
"step": 4760
},
{
"epoch": 0.5314763231197771,
"grad_norm": 175473.453125,
"learning_rate": 4.96165191740413e-06,
"loss": 0.4167,
"step": 4770
},
{
"epoch": 0.5325905292479108,
"grad_norm": 181863.0625,
"learning_rate": 4.949852507374632e-06,
"loss": 0.4219,
"step": 4780
},
{
"epoch": 0.5337047353760446,
"grad_norm": 170986.390625,
"learning_rate": 4.938053097345133e-06,
"loss": 0.4189,
"step": 4790
},
{
"epoch": 0.5348189415041783,
"grad_norm": 172215.15625,
"learning_rate": 4.926253687315635e-06,
"loss": 0.4129,
"step": 4800
},
{
"epoch": 0.5359331476323119,
"grad_norm": 180747.453125,
"learning_rate": 4.9144542772861355e-06,
"loss": 0.4347,
"step": 4810
},
{
"epoch": 0.5370473537604457,
"grad_norm": 176216.703125,
"learning_rate": 4.902654867256638e-06,
"loss": 0.4296,
"step": 4820
},
{
"epoch": 0.5381615598885794,
"grad_norm": 206918.265625,
"learning_rate": 4.890855457227139e-06,
"loss": 0.4342,
"step": 4830
},
{
"epoch": 0.5392757660167131,
"grad_norm": 184958.484375,
"learning_rate": 4.879056047197641e-06,
"loss": 0.4113,
"step": 4840
},
{
"epoch": 0.5403899721448467,
"grad_norm": 140585.65625,
"learning_rate": 4.867256637168142e-06,
"loss": 0.449,
"step": 4850
},
{
"epoch": 0.5415041782729805,
"grad_norm": 192482.1875,
"learning_rate": 4.8554572271386435e-06,
"loss": 0.3664,
"step": 4860
},
{
"epoch": 0.5426183844011142,
"grad_norm": 191947.328125,
"learning_rate": 4.843657817109145e-06,
"loss": 0.4395,
"step": 4870
},
{
"epoch": 0.5437325905292479,
"grad_norm": 202158.28125,
"learning_rate": 4.831858407079646e-06,
"loss": 0.4609,
"step": 4880
},
{
"epoch": 0.5448467966573816,
"grad_norm": 147651.859375,
"learning_rate": 4.820058997050148e-06,
"loss": 0.4156,
"step": 4890
},
{
"epoch": 0.5459610027855153,
"grad_norm": 176702.09375,
"learning_rate": 4.808259587020649e-06,
"loss": 0.3789,
"step": 4900
},
{
"epoch": 0.547075208913649,
"grad_norm": 196184.625,
"learning_rate": 4.796460176991151e-06,
"loss": 0.4384,
"step": 4910
},
{
"epoch": 0.5481894150417828,
"grad_norm": 188953.125,
"learning_rate": 4.784660766961652e-06,
"loss": 0.4238,
"step": 4920
},
{
"epoch": 0.5493036211699165,
"grad_norm": 202452.03125,
"learning_rate": 4.7728613569321535e-06,
"loss": 0.4208,
"step": 4930
},
{
"epoch": 0.5504178272980501,
"grad_norm": 196150.65625,
"learning_rate": 4.761061946902655e-06,
"loss": 0.4011,
"step": 4940
},
{
"epoch": 0.5515320334261838,
"grad_norm": 156776.75,
"learning_rate": 4.749262536873156e-06,
"loss": 0.3853,
"step": 4950
},
{
"epoch": 0.5526462395543176,
"grad_norm": 178294.34375,
"learning_rate": 4.737463126843659e-06,
"loss": 0.4161,
"step": 4960
},
{
"epoch": 0.5537604456824513,
"grad_norm": 191155.40625,
"learning_rate": 4.725663716814159e-06,
"loss": 0.3637,
"step": 4970
},
{
"epoch": 0.5548746518105849,
"grad_norm": 168218.421875,
"learning_rate": 4.7138643067846615e-06,
"loss": 0.4271,
"step": 4980
},
{
"epoch": 0.5559888579387187,
"grad_norm": 157993.203125,
"learning_rate": 4.702064896755162e-06,
"loss": 0.4609,
"step": 4990
},
{
"epoch": 0.5571030640668524,
"grad_norm": 177861.5625,
"learning_rate": 4.690265486725664e-06,
"loss": 0.3956,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 8975,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.707023777792e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}