{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4302,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00697350069735007,
"grad_norm": 13.183924674987793,
"learning_rate": 1.9974895397489544e-05,
"loss": 3.835,
"step": 10
},
{
"epoch": 0.01394700139470014,
"grad_norm": 14.83541202545166,
"learning_rate": 1.9947001394700142e-05,
"loss": 3.7714,
"step": 20
},
{
"epoch": 0.02092050209205021,
"grad_norm": 9.84648323059082,
"learning_rate": 1.991910739191074e-05,
"loss": 3.1422,
"step": 30
},
{
"epoch": 0.02789400278940028,
"grad_norm": 11.044331550598145,
"learning_rate": 1.989121338912134e-05,
"loss": 2.7074,
"step": 40
},
{
"epoch": 0.03486750348675035,
"grad_norm": 8.12322998046875,
"learning_rate": 1.986331938633194e-05,
"loss": 2.0078,
"step": 50
},
{
"epoch": 0.04184100418410042,
"grad_norm": 4.419774532318115,
"learning_rate": 1.983542538354254e-05,
"loss": 2.2417,
"step": 60
},
{
"epoch": 0.04881450488145049,
"grad_norm": 8.11199951171875,
"learning_rate": 1.980753138075314e-05,
"loss": 2.0051,
"step": 70
},
{
"epoch": 0.05578800557880056,
"grad_norm": 3.7404990196228027,
"learning_rate": 1.9779637377963737e-05,
"loss": 1.2384,
"step": 80
},
{
"epoch": 0.06276150627615062,
"grad_norm": 5.447963714599609,
"learning_rate": 1.975174337517434e-05,
"loss": 1.2396,
"step": 90
},
{
"epoch": 0.0697350069735007,
"grad_norm": 4.791909217834473,
"learning_rate": 1.9723849372384937e-05,
"loss": 1.0734,
"step": 100
},
{
"epoch": 0.07670850767085077,
"grad_norm": 6.843250274658203,
"learning_rate": 1.9695955369595538e-05,
"loss": 1.0683,
"step": 110
},
{
"epoch": 0.08368200836820083,
"grad_norm": 4.563827991485596,
"learning_rate": 1.966806136680614e-05,
"loss": 0.9533,
"step": 120
},
{
"epoch": 0.09065550906555091,
"grad_norm": 2.3097805976867676,
"learning_rate": 1.9640167364016738e-05,
"loss": 1.0514,
"step": 130
},
{
"epoch": 0.09762900976290098,
"grad_norm": 2.343554973602295,
"learning_rate": 1.961227336122734e-05,
"loss": 0.8546,
"step": 140
},
{
"epoch": 0.10460251046025104,
"grad_norm": 4.010223388671875,
"learning_rate": 1.9584379358437937e-05,
"loss": 0.6854,
"step": 150
},
{
"epoch": 0.11157601115760112,
"grad_norm": 2.506309747695923,
"learning_rate": 1.955648535564854e-05,
"loss": 0.9116,
"step": 160
},
{
"epoch": 0.11854951185495119,
"grad_norm": 1.248282551765442,
"learning_rate": 1.9528591352859137e-05,
"loss": 0.6762,
"step": 170
},
{
"epoch": 0.12552301255230125,
"grad_norm": 2.629155397415161,
"learning_rate": 1.9500697350069738e-05,
"loss": 0.7382,
"step": 180
},
{
"epoch": 0.13249651324965134,
"grad_norm": 2.2664334774017334,
"learning_rate": 1.9472803347280336e-05,
"loss": 0.5232,
"step": 190
},
{
"epoch": 0.1394700139470014,
"grad_norm": 2.295131206512451,
"learning_rate": 1.9444909344490938e-05,
"loss": 0.6542,
"step": 200
},
{
"epoch": 0.14644351464435146,
"grad_norm": 2.584252119064331,
"learning_rate": 1.9417015341701536e-05,
"loss": 0.5236,
"step": 210
},
{
"epoch": 0.15341701534170155,
"grad_norm": 3.074542999267578,
"learning_rate": 1.9389121338912137e-05,
"loss": 0.6076,
"step": 220
},
{
"epoch": 0.1603905160390516,
"grad_norm": 2.1027579307556152,
"learning_rate": 1.9361227336122735e-05,
"loss": 0.4767,
"step": 230
},
{
"epoch": 0.16736401673640167,
"grad_norm": 1.3120466470718384,
"learning_rate": 1.9333333333333333e-05,
"loss": 0.4428,
"step": 240
},
{
"epoch": 0.17433751743375175,
"grad_norm": 2.9058358669281006,
"learning_rate": 1.9305439330543935e-05,
"loss": 0.4286,
"step": 250
},
{
"epoch": 0.18131101813110181,
"grad_norm": 3.063037157058716,
"learning_rate": 1.9277545327754533e-05,
"loss": 0.5119,
"step": 260
},
{
"epoch": 0.18828451882845187,
"grad_norm": 1.2567652463912964,
"learning_rate": 1.9249651324965134e-05,
"loss": 0.3206,
"step": 270
},
{
"epoch": 0.19525801952580196,
"grad_norm": 1.3967201709747314,
"learning_rate": 1.9221757322175733e-05,
"loss": 0.3629,
"step": 280
},
{
"epoch": 0.20223152022315202,
"grad_norm": 1.4811025857925415,
"learning_rate": 1.9193863319386334e-05,
"loss": 0.3026,
"step": 290
},
{
"epoch": 0.20920502092050208,
"grad_norm": 1.2193363904953003,
"learning_rate": 1.9165969316596932e-05,
"loss": 0.3382,
"step": 300
},
{
"epoch": 0.21617852161785217,
"grad_norm": 1.5240554809570312,
"learning_rate": 1.9138075313807534e-05,
"loss": 0.2803,
"step": 310
},
{
"epoch": 0.22315202231520223,
"grad_norm": 2.093574285507202,
"learning_rate": 1.911018131101813e-05,
"loss": 0.1951,
"step": 320
},
{
"epoch": 0.2301255230125523,
"grad_norm": 0.9359086155891418,
"learning_rate": 1.9082287308228733e-05,
"loss": 0.2116,
"step": 330
},
{
"epoch": 0.23709902370990238,
"grad_norm": 1.443814992904663,
"learning_rate": 1.905439330543933e-05,
"loss": 0.2668,
"step": 340
},
{
"epoch": 0.24407252440725244,
"grad_norm": 1.0664631128311157,
"learning_rate": 1.9026499302649933e-05,
"loss": 0.3009,
"step": 350
},
{
"epoch": 0.2510460251046025,
"grad_norm": 0.7187846899032593,
"learning_rate": 1.899860529986053e-05,
"loss": 0.1704,
"step": 360
},
{
"epoch": 0.2580195258019526,
"grad_norm": 3.616009473800659,
"learning_rate": 1.8970711297071132e-05,
"loss": 0.2365,
"step": 370
},
{
"epoch": 0.2649930264993027,
"grad_norm": 1.6073416471481323,
"learning_rate": 1.894281729428173e-05,
"loss": 0.3163,
"step": 380
},
{
"epoch": 0.2719665271966527,
"grad_norm": 3.3605103492736816,
"learning_rate": 1.8914923291492332e-05,
"loss": 0.2076,
"step": 390
},
{
"epoch": 0.2789400278940028,
"grad_norm": 0.6530489921569824,
"learning_rate": 1.888702928870293e-05,
"loss": 0.1256,
"step": 400
},
{
"epoch": 0.2859135285913529,
"grad_norm": 0.6921319365501404,
"learning_rate": 1.885913528591353e-05,
"loss": 0.228,
"step": 410
},
{
"epoch": 0.2928870292887029,
"grad_norm": 0.6970381736755371,
"learning_rate": 1.883124128312413e-05,
"loss": 0.208,
"step": 420
},
{
"epoch": 0.299860529986053,
"grad_norm": 3.324526071548462,
"learning_rate": 1.880334728033473e-05,
"loss": 0.1737,
"step": 430
},
{
"epoch": 0.3068340306834031,
        "grad_norm": null,
"learning_rate": 1.877545327754533e-05,
"loss": 0.2422,
"step": 440
},
{
"epoch": 0.3138075313807531,
"grad_norm": 1.2172619104385376,
"learning_rate": 1.8747559274755927e-05,
"loss": 0.174,
"step": 450
},
{
"epoch": 0.3207810320781032,
"grad_norm": 0.3495887219905853,
"learning_rate": 1.871966527196653e-05,
"loss": 0.1924,
"step": 460
},
{
"epoch": 0.3277545327754533,
"grad_norm": 0.41428402066230774,
"learning_rate": 1.8691771269177127e-05,
"loss": 0.0965,
"step": 470
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.6227982044219971,
"learning_rate": 1.8663877266387728e-05,
"loss": 0.1796,
"step": 480
},
{
"epoch": 0.3417015341701534,
"grad_norm": 4.252852439880371,
"learning_rate": 1.8635983263598326e-05,
"loss": 0.1918,
"step": 490
},
{
"epoch": 0.3486750348675035,
"grad_norm": 2.1391079425811768,
"learning_rate": 1.8608089260808928e-05,
"loss": 0.2705,
"step": 500
},
{
"epoch": 0.35564853556485354,
"grad_norm": 3.4021589756011963,
"learning_rate": 1.8580195258019526e-05,
"loss": 0.0966,
"step": 510
},
{
"epoch": 0.36262203626220363,
"grad_norm": 4.478765487670898,
"learning_rate": 1.8552301255230127e-05,
"loss": 0.1666,
"step": 520
},
{
"epoch": 0.3695955369595537,
"grad_norm": 3.3071353435516357,
"learning_rate": 1.8524407252440725e-05,
"loss": 0.1089,
"step": 530
},
{
"epoch": 0.37656903765690375,
"grad_norm": 6.9056267738342285,
"learning_rate": 1.8496513249651327e-05,
"loss": 0.1055,
"step": 540
},
{
"epoch": 0.38354253835425384,
"grad_norm": 1.1857019662857056,
"learning_rate": 1.8468619246861925e-05,
"loss": 0.1114,
"step": 550
},
{
"epoch": 0.3905160390516039,
"grad_norm": 2.9364492893218994,
"learning_rate": 1.8440725244072526e-05,
"loss": 0.2345,
"step": 560
},
{
"epoch": 0.39748953974895396,
"grad_norm": 0.46063023805618286,
"learning_rate": 1.8412831241283128e-05,
"loss": 0.0427,
"step": 570
},
{
"epoch": 0.40446304044630405,
"grad_norm": 0.4743267893791199,
"learning_rate": 1.8384937238493726e-05,
"loss": 0.0756,
"step": 580
},
{
"epoch": 0.41143654114365413,
"grad_norm": 0.6988991498947144,
"learning_rate": 1.8357043235704327e-05,
"loss": 0.1294,
"step": 590
},
{
"epoch": 0.41841004184100417,
"grad_norm": 2.7544264793395996,
"learning_rate": 1.8329149232914925e-05,
"loss": 0.2031,
"step": 600
},
{
"epoch": 0.42538354253835425,
"grad_norm": 3.5098769664764404,
"learning_rate": 1.8301255230125527e-05,
"loss": 0.1283,
"step": 610
},
{
"epoch": 0.43235704323570434,
"grad_norm": 3.013094663619995,
"learning_rate": 1.8273361227336125e-05,
"loss": 0.1626,
"step": 620
},
{
"epoch": 0.4393305439330544,
"grad_norm": 2.9749810695648193,
"learning_rate": 1.8245467224546723e-05,
"loss": 0.0544,
"step": 630
},
{
"epoch": 0.44630404463040446,
"grad_norm": 0.1516144722700119,
"learning_rate": 1.8217573221757325e-05,
"loss": 0.0864,
"step": 640
},
{
"epoch": 0.45327754532775455,
"grad_norm": 3.139366626739502,
"learning_rate": 1.8189679218967923e-05,
"loss": 0.082,
"step": 650
},
{
"epoch": 0.4602510460251046,
"grad_norm": 3.0818684101104736,
"learning_rate": 1.816178521617852e-05,
"loss": 0.0263,
"step": 660
},
{
"epoch": 0.46722454672245467,
"grad_norm": 0.23410484194755554,
"learning_rate": 1.8133891213389122e-05,
"loss": 0.2833,
"step": 670
},
{
"epoch": 0.47419804741980476,
"grad_norm": 0.900560200214386,
"learning_rate": 1.810599721059972e-05,
"loss": 0.073,
"step": 680
},
{
"epoch": 0.4811715481171548,
"grad_norm": 11.757755279541016,
"learning_rate": 1.8078103207810322e-05,
"loss": 0.1058,
"step": 690
},
{
"epoch": 0.4881450488145049,
"grad_norm": 0.34188902378082275,
"learning_rate": 1.805020920502092e-05,
"loss": 0.209,
"step": 700
},
{
"epoch": 0.49511854951185497,
"grad_norm": 0.1428012251853943,
"learning_rate": 1.802231520223152e-05,
"loss": 0.0565,
"step": 710
},
{
"epoch": 0.502092050209205,
"grad_norm": 8.282142639160156,
"learning_rate": 1.7994421199442123e-05,
"loss": 0.1955,
"step": 720
},
{
"epoch": 0.5090655509065551,
"grad_norm": 0.49546220898628235,
"learning_rate": 1.796652719665272e-05,
"loss": 0.2029,
"step": 730
},
{
"epoch": 0.5160390516039052,
"grad_norm": 0.4430611729621887,
"learning_rate": 1.7938633193863322e-05,
"loss": 0.154,
"step": 740
},
{
"epoch": 0.5230125523012552,
"grad_norm": 2.5573275089263916,
"learning_rate": 1.791073919107392e-05,
"loss": 0.1891,
"step": 750
},
{
"epoch": 0.5299860529986054,
"grad_norm": 0.5785364508628845,
"learning_rate": 1.7882845188284522e-05,
"loss": 0.0325,
"step": 760
},
{
"epoch": 0.5369595536959554,
"grad_norm": 2.325209856033325,
"learning_rate": 1.785495118549512e-05,
"loss": 0.1057,
"step": 770
},
{
"epoch": 0.5439330543933054,
"grad_norm": 0.2150566428899765,
"learning_rate": 1.782705718270572e-05,
"loss": 0.1506,
"step": 780
},
{
"epoch": 0.5509065550906556,
"grad_norm": 0.37389442324638367,
"learning_rate": 1.779916317991632e-05,
"loss": 0.0842,
"step": 790
},
{
"epoch": 0.5578800557880056,
"grad_norm": 6.229526519775391,
"learning_rate": 1.777126917712692e-05,
"loss": 0.1864,
"step": 800
},
{
"epoch": 0.5648535564853556,
"grad_norm": 0.1626635193824768,
"learning_rate": 1.774337517433752e-05,
"loss": 0.0813,
"step": 810
},
{
"epoch": 0.5718270571827058,
"grad_norm": 10.923450469970703,
"learning_rate": 1.771548117154812e-05,
"loss": 0.1013,
"step": 820
},
{
"epoch": 0.5788005578800558,
"grad_norm": 4.7759294509887695,
"learning_rate": 1.768758716875872e-05,
"loss": 0.0739,
"step": 830
},
{
"epoch": 0.5857740585774058,
"grad_norm": 11.055768966674805,
"learning_rate": 1.7659693165969317e-05,
"loss": 0.1342,
"step": 840
},
{
"epoch": 0.592747559274756,
"grad_norm": 1.800850510597229,
"learning_rate": 1.7631799163179918e-05,
"loss": 0.0595,
"step": 850
},
{
"epoch": 0.599721059972106,
"grad_norm": 6.058688163757324,
"learning_rate": 1.7603905160390516e-05,
"loss": 0.16,
"step": 860
},
{
"epoch": 0.606694560669456,
"grad_norm": 0.05257971212267876,
"learning_rate": 1.7576011157601118e-05,
"loss": 0.0637,
"step": 870
},
{
"epoch": 0.6136680613668062,
"grad_norm": 0.12368276715278625,
"learning_rate": 1.7548117154811716e-05,
"loss": 0.1242,
"step": 880
},
{
"epoch": 0.6206415620641562,
"grad_norm": 0.06948111951351166,
"learning_rate": 1.7520223152022317e-05,
"loss": 0.081,
"step": 890
},
{
"epoch": 0.6276150627615062,
"grad_norm": 5.596299171447754,
"learning_rate": 1.7492329149232915e-05,
"loss": 0.1923,
"step": 900
},
{
"epoch": 0.6345885634588564,
"grad_norm": 2.5386273860931396,
"learning_rate": 1.7464435146443517e-05,
"loss": 0.0898,
"step": 910
},
{
"epoch": 0.6415620641562064,
"grad_norm": 0.06753092259168625,
"learning_rate": 1.7436541143654115e-05,
"loss": 0.0224,
"step": 920
},
{
"epoch": 0.6485355648535565,
"grad_norm": 0.07754819095134735,
"learning_rate": 1.7408647140864716e-05,
"loss": 0.0242,
"step": 930
},
{
"epoch": 0.6555090655509066,
"grad_norm": 0.06803246587514877,
"learning_rate": 1.7380753138075315e-05,
"loss": 0.0439,
"step": 940
},
{
"epoch": 0.6624825662482566,
"grad_norm": 0.08195364475250244,
"learning_rate": 1.7352859135285916e-05,
"loss": 0.0437,
"step": 950
},
{
"epoch": 0.6694560669456067,
"grad_norm": 15.50480842590332,
"learning_rate": 1.7324965132496514e-05,
"loss": 0.1555,
"step": 960
},
{
"epoch": 0.6764295676429568,
"grad_norm": 1.3638603687286377,
"learning_rate": 1.7297071129707116e-05,
"loss": 0.1336,
"step": 970
},
{
"epoch": 0.6834030683403068,
"grad_norm": 0.08070117235183716,
"learning_rate": 1.7269177126917714e-05,
"loss": 0.0136,
"step": 980
},
{
"epoch": 0.6903765690376569,
"grad_norm": 0.0671255961060524,
"learning_rate": 1.7241283124128315e-05,
"loss": 0.0094,
"step": 990
},
{
"epoch": 0.697350069735007,
"grad_norm": 0.020939119160175323,
"learning_rate": 1.7213389121338913e-05,
"loss": 0.0177,
"step": 1000
},
{
"epoch": 0.704323570432357,
"grad_norm": 0.0943412110209465,
"learning_rate": 1.7185495118549515e-05,
"loss": 0.1003,
"step": 1010
},
{
"epoch": 0.7112970711297071,
"grad_norm": 0.05611201003193855,
"learning_rate": 1.7157601115760113e-05,
"loss": 0.0223,
"step": 1020
},
{
"epoch": 0.7182705718270572,
"grad_norm": 0.10727556049823761,
"learning_rate": 1.7129707112970714e-05,
"loss": 0.1631,
"step": 1030
},
{
"epoch": 0.7252440725244073,
"grad_norm": 0.06021064519882202,
"learning_rate": 1.7101813110181312e-05,
"loss": 0.0181,
"step": 1040
},
{
"epoch": 0.7322175732217573,
"grad_norm": 4.713542938232422,
"learning_rate": 1.707391910739191e-05,
"loss": 0.1523,
"step": 1050
},
{
"epoch": 0.7391910739191074,
"grad_norm": 0.07371031492948532,
"learning_rate": 1.7046025104602512e-05,
"loss": 0.2023,
"step": 1060
},
{
"epoch": 0.7461645746164575,
"grad_norm": 0.10017743706703186,
"learning_rate": 1.701813110181311e-05,
"loss": 0.1815,
"step": 1070
},
{
"epoch": 0.7531380753138075,
"grad_norm": 0.17356480658054352,
"learning_rate": 1.699023709902371e-05,
"loss": 0.0531,
"step": 1080
},
{
"epoch": 0.7601115760111576,
"grad_norm": 6.460514068603516,
"learning_rate": 1.696234309623431e-05,
"loss": 0.1038,
"step": 1090
},
{
"epoch": 0.7670850767085077,
"grad_norm": 1.358777403831482,
"learning_rate": 1.693444909344491e-05,
"loss": 0.0947,
"step": 1100
},
{
"epoch": 0.7740585774058577,
"grad_norm": 0.11568786948919296,
"learning_rate": 1.690655509065551e-05,
"loss": 0.174,
"step": 1110
},
{
"epoch": 0.7810320781032078,
"grad_norm": 1.7592473030090332,
"learning_rate": 1.687866108786611e-05,
"loss": 0.0833,
"step": 1120
},
{
"epoch": 0.7880055788005579,
"grad_norm": 0.06832170486450195,
"learning_rate": 1.685076708507671e-05,
"loss": 0.1201,
"step": 1130
},
{
"epoch": 0.7949790794979079,
"grad_norm": 0.04899122938513756,
"learning_rate": 1.682287308228731e-05,
"loss": 0.1153,
"step": 1140
},
{
"epoch": 0.8019525801952581,
"grad_norm": 8.826611518859863,
"learning_rate": 1.6794979079497908e-05,
"loss": 0.0943,
"step": 1150
},
{
"epoch": 0.8089260808926081,
"grad_norm": 1.2349894046783447,
"learning_rate": 1.676708507670851e-05,
"loss": 0.0512,
"step": 1160
},
{
"epoch": 0.8158995815899581,
"grad_norm": 0.04042017459869385,
"learning_rate": 1.6739191073919108e-05,
"loss": 0.0176,
"step": 1170
},
{
"epoch": 0.8228730822873083,
"grad_norm": 3.0442705154418945,
"learning_rate": 1.671129707112971e-05,
"loss": 0.0891,
"step": 1180
},
{
"epoch": 0.8298465829846583,
"grad_norm": 0.11100644618272781,
"learning_rate": 1.668340306834031e-05,
"loss": 0.1416,
"step": 1190
},
{
"epoch": 0.8368200836820083,
"grad_norm": 2.700934410095215,
"learning_rate": 1.665550906555091e-05,
"loss": 0.1365,
"step": 1200
},
{
"epoch": 0.8437935843793585,
"grad_norm": 0.2526993155479431,
"learning_rate": 1.6627615062761507e-05,
"loss": 0.0633,
"step": 1210
},
{
"epoch": 0.8507670850767085,
"grad_norm": 0.034327197819948196,
"learning_rate": 1.6599721059972108e-05,
"loss": 0.0566,
"step": 1220
},
{
"epoch": 0.8577405857740585,
"grad_norm": 6.361912250518799,
"learning_rate": 1.6571827057182706e-05,
"loss": 0.0814,
"step": 1230
},
{
"epoch": 0.8647140864714087,
"grad_norm": 0.2599698007106781,
"learning_rate": 1.6543933054393308e-05,
"loss": 0.0523,
"step": 1240
},
{
"epoch": 0.8716875871687587,
"grad_norm": 0.1802450269460678,
"learning_rate": 1.6516039051603906e-05,
"loss": 0.0672,
"step": 1250
},
{
"epoch": 0.8786610878661087,
"grad_norm": 0.05986528471112251,
"learning_rate": 1.6488145048814504e-05,
"loss": 0.031,
"step": 1260
},
{
"epoch": 0.8856345885634589,
"grad_norm": 5.647293567657471,
"learning_rate": 1.6460251046025105e-05,
"loss": 0.1258,
"step": 1270
},
{
"epoch": 0.8926080892608089,
"grad_norm": 0.18946939706802368,
"learning_rate": 1.6432357043235704e-05,
"loss": 0.1959,
"step": 1280
},
{
"epoch": 0.899581589958159,
"grad_norm": 0.03759470209479332,
"learning_rate": 1.6404463040446305e-05,
"loss": 0.0162,
"step": 1290
},
{
"epoch": 0.9065550906555091,
"grad_norm": 0.320306658744812,
"learning_rate": 1.6376569037656903e-05,
"loss": 0.0548,
"step": 1300
},
{
"epoch": 0.9135285913528591,
"grad_norm": 0.04905885457992554,
"learning_rate": 1.6348675034867505e-05,
"loss": 0.0596,
"step": 1310
},
{
"epoch": 0.9205020920502092,
"grad_norm": 0.03322403505444527,
"learning_rate": 1.6320781032078103e-05,
"loss": 0.1531,
"step": 1320
},
{
"epoch": 0.9274755927475593,
"grad_norm": 0.051219817250967026,
"learning_rate": 1.6292887029288704e-05,
"loss": 0.0047,
"step": 1330
},
{
"epoch": 0.9344490934449093,
"grad_norm": 0.1239551454782486,
"learning_rate": 1.6264993026499306e-05,
"loss": 0.1413,
"step": 1340
},
{
"epoch": 0.9414225941422594,
"grad_norm": 0.1667717546224594,
"learning_rate": 1.6237099023709904e-05,
"loss": 0.1345,
"step": 1350
},
{
"epoch": 0.9483960948396095,
"grad_norm": 5.152453899383545,
"learning_rate": 1.6209205020920505e-05,
"loss": 0.038,
"step": 1360
},
{
"epoch": 0.9553695955369595,
"grad_norm": 10.803096771240234,
"learning_rate": 1.6181311018131103e-05,
"loss": 0.0478,
"step": 1370
},
{
"epoch": 0.9623430962343096,
"grad_norm": 5.05436372756958,
"learning_rate": 1.6153417015341705e-05,
"loss": 0.0641,
"step": 1380
},
{
"epoch": 0.9693165969316597,
"grad_norm": 0.05341633781790733,
"learning_rate": 1.6125523012552303e-05,
"loss": 0.1159,
"step": 1390
},
{
"epoch": 0.9762900976290098,
"grad_norm": 0.030432600528001785,
"learning_rate": 1.6097629009762904e-05,
"loss": 0.2422,
"step": 1400
},
{
"epoch": 0.9832635983263598,
"grad_norm": 0.0488433800637722,
"learning_rate": 1.6069735006973502e-05,
"loss": 0.017,
"step": 1410
},
{
"epoch": 0.9902370990237099,
"grad_norm": 0.11435980349779129,
"learning_rate": 1.6041841004184104e-05,
"loss": 0.0678,
"step": 1420
},
{
"epoch": 0.99721059972106,
"grad_norm": 0.04273105785250664,
"learning_rate": 1.6013947001394702e-05,
"loss": 0.148,
"step": 1430
},
{
"epoch": 1.00418410041841,
"grad_norm": 0.03464385122060776,
"learning_rate": 1.59860529986053e-05,
"loss": 0.0116,
"step": 1440
},
{
"epoch": 1.0111576011157601,
"grad_norm": 0.051146648824214935,
"learning_rate": 1.59581589958159e-05,
"loss": 0.0471,
"step": 1450
},
{
"epoch": 1.0181311018131103,
"grad_norm": 0.03880644217133522,
"learning_rate": 1.59302649930265e-05,
"loss": 0.0805,
"step": 1460
},
{
"epoch": 1.0251046025104602,
"grad_norm": 0.04440051317214966,
"learning_rate": 1.5902370990237098e-05,
"loss": 0.0185,
"step": 1470
},
{
"epoch": 1.0320781032078103,
"grad_norm": 0.04550454020500183,
"learning_rate": 1.58744769874477e-05,
"loss": 0.0093,
"step": 1480
},
{
"epoch": 1.0390516039051605,
"grad_norm": 0.062290117144584656,
"learning_rate": 1.58465829846583e-05,
"loss": 0.0061,
"step": 1490
},
{
"epoch": 1.0460251046025104,
"grad_norm": 0.06050781160593033,
"learning_rate": 1.58186889818689e-05,
"loss": 0.0609,
"step": 1500
},
{
"epoch": 1.0529986052998606,
"grad_norm": 0.7175278067588806,
"learning_rate": 1.57907949790795e-05,
"loss": 0.0077,
"step": 1510
},
{
"epoch": 1.0599721059972107,
"grad_norm": 4.573083877563477,
"learning_rate": 1.5762900976290098e-05,
"loss": 0.0284,
"step": 1520
},
{
"epoch": 1.0669456066945606,
"grad_norm": 5.817536354064941,
"learning_rate": 1.57350069735007e-05,
"loss": 0.0581,
"step": 1530
},
{
"epoch": 1.0739191073919108,
"grad_norm": 1.3729101419448853,
"learning_rate": 1.5707112970711298e-05,
"loss": 0.1075,
"step": 1540
},
{
"epoch": 1.080892608089261,
"grad_norm": 0.019089030101895332,
"learning_rate": 1.56792189679219e-05,
"loss": 0.1176,
"step": 1550
},
{
"epoch": 1.0878661087866108,
"grad_norm": 0.05113361030817032,
"learning_rate": 1.5651324965132497e-05,
"loss": 0.011,
"step": 1560
},
{
"epoch": 1.094839609483961,
"grad_norm": 0.15870462357997894,
"learning_rate": 1.56234309623431e-05,
"loss": 0.0555,
"step": 1570
},
{
"epoch": 1.1018131101813111,
"grad_norm": 5.806714057922363,
"learning_rate": 1.5595536959553697e-05,
"loss": 0.1076,
"step": 1580
},
{
"epoch": 1.108786610878661,
"grad_norm": 0.043709512799978256,
"learning_rate": 1.55676429567643e-05,
"loss": 0.1412,
"step": 1590
},
{
"epoch": 1.1157601115760112,
"grad_norm": 3.896087884902954,
"learning_rate": 1.5539748953974896e-05,
"loss": 0.2978,
"step": 1600
},
{
"epoch": 1.1227336122733613,
"grad_norm": 0.6703562140464783,
"learning_rate": 1.5511854951185498e-05,
"loss": 0.0219,
"step": 1610
},
{
"epoch": 1.1297071129707112,
"grad_norm": 0.13185322284698486,
"learning_rate": 1.5483960948396096e-05,
"loss": 0.0379,
"step": 1620
},
{
"epoch": 1.1366806136680614,
"grad_norm": 0.07004066556692123,
"learning_rate": 1.5456066945606697e-05,
"loss": 0.0063,
"step": 1630
},
{
"epoch": 1.1436541143654115,
"grad_norm": 0.06793645024299622,
"learning_rate": 1.5428172942817296e-05,
"loss": 0.0618,
"step": 1640
},
{
"epoch": 1.1506276150627615,
"grad_norm": 0.12030527740716934,
"learning_rate": 1.5400278940027894e-05,
"loss": 0.111,
"step": 1650
},
{
"epoch": 1.1576011157601116,
"grad_norm": 0.018711797893047333,
"learning_rate": 1.5372384937238495e-05,
"loss": 0.0263,
"step": 1660
},
{
"epoch": 1.1645746164574617,
"grad_norm": 0.01281982846558094,
"learning_rate": 1.5344490934449093e-05,
"loss": 0.0676,
"step": 1670
},
{
"epoch": 1.1715481171548117,
"grad_norm": 10.88961410522461,
"learning_rate": 1.5316596931659695e-05,
"loss": 0.039,
"step": 1680
},
{
"epoch": 1.1785216178521618,
"grad_norm": 0.0368611216545105,
"learning_rate": 1.5288702928870293e-05,
"loss": 0.0299,
"step": 1690
},
{
"epoch": 1.185495118549512,
"grad_norm": 0.43271970748901367,
"learning_rate": 1.5260808926080894e-05,
"loss": 0.0125,
"step": 1700
},
{
"epoch": 1.1924686192468619,
"grad_norm": 0.038611456751823425,
"learning_rate": 1.5232914923291492e-05,
"loss": 0.0692,
"step": 1710
},
{
"epoch": 1.199442119944212,
"grad_norm": 0.05871947854757309,
"learning_rate": 1.5205020920502094e-05,
"loss": 0.0361,
"step": 1720
},
{
"epoch": 1.2064156206415622,
"grad_norm": 0.13699010014533997,
"learning_rate": 1.5177126917712692e-05,
"loss": 0.018,
"step": 1730
},
{
"epoch": 1.213389121338912,
"grad_norm": 0.028825167566537857,
"learning_rate": 1.5149232914923293e-05,
"loss": 0.0237,
"step": 1740
},
{
"epoch": 1.2203626220362622,
"grad_norm": 6.847581386566162,
"learning_rate": 1.5121338912133891e-05,
"loss": 0.1168,
"step": 1750
},
{
"epoch": 1.2273361227336124,
"grad_norm": 0.048287533223629,
"learning_rate": 1.5093444909344493e-05,
"loss": 0.0206,
"step": 1760
},
{
"epoch": 1.2343096234309623,
"grad_norm": 0.5911905169487,
"learning_rate": 1.5065550906555091e-05,
"loss": 0.004,
"step": 1770
},
{
"epoch": 1.2412831241283124,
"grad_norm": 2.227172374725342,
"learning_rate": 1.5037656903765692e-05,
"loss": 0.0032,
"step": 1780
},
{
"epoch": 1.2482566248256626,
"grad_norm": 0.014612744562327862,
"learning_rate": 1.5009762900976292e-05,
"loss": 0.1276,
"step": 1790
},
{
"epoch": 1.2552301255230125,
"grad_norm": 0.04997061565518379,
"learning_rate": 1.498186889818689e-05,
"loss": 0.07,
"step": 1800
},
{
"epoch": 1.2622036262203626,
"grad_norm": 0.033887382596731186,
"learning_rate": 1.4953974895397492e-05,
"loss": 0.0372,
"step": 1810
},
{
"epoch": 1.2691771269177128,
"grad_norm": 0.017684003338217735,
"learning_rate": 1.492608089260809e-05,
"loss": 0.0411,
"step": 1820
},
{
"epoch": 1.2761506276150627,
"grad_norm": 7.278536319732666,
"learning_rate": 1.4898186889818691e-05,
"loss": 0.1178,
"step": 1830
},
{
"epoch": 1.2831241283124128,
"grad_norm": 0.05286577343940735,
"learning_rate": 1.487029288702929e-05,
"loss": 0.2894,
"step": 1840
},
{
"epoch": 1.2900976290097628,
"grad_norm": 0.13287349045276642,
"learning_rate": 1.4842398884239891e-05,
"loss": 0.1081,
"step": 1850
},
{
"epoch": 1.297071129707113,
"grad_norm": 0.031001577153801918,
"learning_rate": 1.4814504881450489e-05,
"loss": 0.0861,
"step": 1860
},
{
"epoch": 1.304044630404463,
"grad_norm": 0.08806217461824417,
"learning_rate": 1.4786610878661089e-05,
"loss": 0.1862,
"step": 1870
},
{
"epoch": 1.3110181311018132,
"grad_norm": 0.025183405727148056,
"learning_rate": 1.4758716875871689e-05,
"loss": 0.0808,
"step": 1880
},
{
"epoch": 1.3179916317991631,
"grad_norm": 0.04694396257400513,
"learning_rate": 1.4730822873082288e-05,
"loss": 0.161,
"step": 1890
},
{
"epoch": 1.3249651324965133,
"grad_norm": 17.191736221313477,
"learning_rate": 1.4702928870292888e-05,
"loss": 0.0253,
"step": 1900
},
{
"epoch": 1.3319386331938632,
"grad_norm": 0.03222784027457237,
"learning_rate": 1.4675034867503488e-05,
"loss": 0.0053,
"step": 1910
},
{
"epoch": 1.3389121338912133,
"grad_norm": 0.033389899879693985,
"learning_rate": 1.4647140864714086e-05,
"loss": 0.2636,
"step": 1920
},
{
"epoch": 1.3458856345885635,
"grad_norm": 0.5124267935752869,
"learning_rate": 1.4619246861924687e-05,
"loss": 0.0165,
"step": 1930
},
{
"epoch": 1.3528591352859136,
"grad_norm": 0.01550813764333725,
"learning_rate": 1.4591352859135289e-05,
"loss": 0.1907,
"step": 1940
},
{
"epoch": 1.3598326359832635,
"grad_norm": 0.47707241773605347,
"learning_rate": 1.4563458856345887e-05,
"loss": 0.1509,
"step": 1950
},
{
"epoch": 1.3668061366806137,
"grad_norm": 0.030653545632958412,
"learning_rate": 1.4535564853556487e-05,
"loss": 0.0977,
"step": 1960
},
{
"epoch": 1.3737796373779636,
"grad_norm": 0.2786136865615845,
"learning_rate": 1.4507670850767087e-05,
"loss": 0.0355,
"step": 1970
},
{
"epoch": 1.3807531380753137,
"grad_norm": 0.13970093429088593,
"learning_rate": 1.4479776847977686e-05,
"loss": 0.0175,
"step": 1980
},
{
"epoch": 1.387726638772664,
"grad_norm": 1.529632329940796,
"learning_rate": 1.4451882845188286e-05,
"loss": 0.1932,
"step": 1990
},
{
"epoch": 1.394700139470014,
"grad_norm": 0.02077634632587433,
"learning_rate": 1.4423988842398886e-05,
"loss": 0.0691,
"step": 2000
},
{
"epoch": 1.401673640167364,
"grad_norm": 0.529776394367218,
"learning_rate": 1.4396094839609484e-05,
"loss": 0.0948,
"step": 2010
},
{
"epoch": 1.408647140864714,
"grad_norm": 0.006440140772610903,
"learning_rate": 1.4368200836820085e-05,
"loss": 0.1153,
"step": 2020
},
{
"epoch": 1.415620641562064,
"grad_norm": 0.049642570316791534,
"learning_rate": 1.4340306834030684e-05,
"loss": 0.002,
"step": 2030
},
{
"epoch": 1.4225941422594142,
"grad_norm": 0.017692767083644867,
"learning_rate": 1.4312412831241285e-05,
"loss": 0.005,
"step": 2040
},
{
"epoch": 1.4295676429567643,
"grad_norm": 3.4156153202056885,
"learning_rate": 1.4284518828451883e-05,
"loss": 0.0185,
"step": 2050
},
{
"epoch": 1.4365411436541144,
"grad_norm": 0.01563469134271145,
"learning_rate": 1.4256624825662485e-05,
"loss": 0.0187,
"step": 2060
},
{
"epoch": 1.4435146443514644,
"grad_norm": 0.016272351145744324,
"learning_rate": 1.4228730822873083e-05,
"loss": 0.0831,
"step": 2070
},
{
"epoch": 1.4504881450488145,
"grad_norm": 0.02539738453924656,
"learning_rate": 1.4200836820083682e-05,
"loss": 0.0104,
"step": 2080
},
{
"epoch": 1.4574616457461644,
"grad_norm": 0.036670733243227005,
"learning_rate": 1.4172942817294282e-05,
"loss": 0.0592,
"step": 2090
},
{
"epoch": 1.4644351464435146,
"grad_norm": 0.04291674867272377,
"learning_rate": 1.4145048814504882e-05,
"loss": 0.0906,
"step": 2100
},
{
"epoch": 1.4714086471408647,
"grad_norm": 9.656791687011719,
"learning_rate": 1.4117154811715483e-05,
"loss": 0.0653,
"step": 2110
},
{
"epoch": 1.4783821478382149,
"grad_norm": 0.19923704862594604,
"learning_rate": 1.4089260808926082e-05,
"loss": 0.0298,
"step": 2120
},
{
"epoch": 1.4853556485355648,
"grad_norm": 19.535799026489258,
"learning_rate": 1.4061366806136683e-05,
"loss": 0.1084,
"step": 2130
},
{
"epoch": 1.492329149232915,
"grad_norm": 0.007252114824950695,
"learning_rate": 1.4033472803347281e-05,
"loss": 0.0036,
"step": 2140
},
{
"epoch": 1.499302649930265,
"grad_norm": 7.753361701965332,
"learning_rate": 1.4005578800557883e-05,
"loss": 0.0369,
"step": 2150
},
{
"epoch": 1.506276150627615,
"grad_norm": 2.255980968475342,
"learning_rate": 1.397768479776848e-05,
"loss": 0.0174,
"step": 2160
},
{
"epoch": 1.5132496513249651,
"grad_norm": 0.9268475770950317,
"learning_rate": 1.394979079497908e-05,
"loss": 0.119,
"step": 2170
},
{
"epoch": 1.5202231520223153,
"grad_norm": 1.6389845609664917,
"learning_rate": 1.392189679218968e-05,
"loss": 0.0027,
"step": 2180
},
{
"epoch": 1.5271966527196654,
"grad_norm": 13.793027877807617,
"learning_rate": 1.389400278940028e-05,
"loss": 0.0421,
"step": 2190
},
{
"epoch": 1.5341701534170153,
"grad_norm": 15.437527656555176,
"learning_rate": 1.386610878661088e-05,
"loss": 0.1532,
"step": 2200
},
{
"epoch": 1.5411436541143653,
"grad_norm": 0.054035939276218414,
"learning_rate": 1.383821478382148e-05,
"loss": 0.0961,
"step": 2210
},
{
"epoch": 1.5481171548117154,
"grad_norm": 0.01988278515636921,
"learning_rate": 1.3810320781032078e-05,
"loss": 0.0214,
"step": 2220
},
{
"epoch": 1.5550906555090656,
"grad_norm": 3.2793631553649902,
"learning_rate": 1.3782426778242679e-05,
"loss": 0.115,
"step": 2230
},
{
"epoch": 1.5620641562064157,
"grad_norm": 12.380880355834961,
"learning_rate": 1.3754532775453277e-05,
"loss": 0.1112,
"step": 2240
},
{
"epoch": 1.5690376569037658,
"grad_norm": 0.30685436725616455,
"learning_rate": 1.3726638772663879e-05,
"loss": 0.0509,
"step": 2250
},
{
"epoch": 1.5760111576011158,
"grad_norm": 0.05449102073907852,
"learning_rate": 1.3698744769874478e-05,
"loss": 0.0088,
"step": 2260
},
{
"epoch": 1.5829846582984657,
"grad_norm": 18.799972534179688,
"learning_rate": 1.3670850767085078e-05,
"loss": 0.0667,
"step": 2270
},
{
"epoch": 1.5899581589958158,
"grad_norm": 0.01865663006901741,
"learning_rate": 1.3642956764295678e-05,
"loss": 0.141,
"step": 2280
},
{
"epoch": 1.596931659693166,
"grad_norm": 0.021439887583255768,
"learning_rate": 1.3615062761506278e-05,
"loss": 0.0022,
"step": 2290
},
{
"epoch": 1.6039051603905161,
"grad_norm": 0.1574893295764923,
"learning_rate": 1.3587168758716878e-05,
"loss": 0.0027,
"step": 2300
},
{
"epoch": 1.6108786610878663,
"grad_norm": 6.708405494689941,
"learning_rate": 1.3559274755927476e-05,
"loss": 0.0474,
"step": 2310
},
{
"epoch": 1.6178521617852162,
"grad_norm": 0.020400822162628174,
"learning_rate": 1.3531380753138077e-05,
"loss": 0.0521,
"step": 2320
},
{
"epoch": 1.624825662482566,
"grad_norm": 0.012889823876321316,
"learning_rate": 1.3503486750348675e-05,
"loss": 0.0589,
"step": 2330
},
{
"epoch": 1.6317991631799162,
"grad_norm": 0.03564007952809334,
"learning_rate": 1.3475592747559277e-05,
"loss": 0.0519,
"step": 2340
},
{
"epoch": 1.6387726638772664,
"grad_norm": 0.03289749100804329,
"learning_rate": 1.3447698744769875e-05,
"loss": 0.1251,
"step": 2350
},
{
"epoch": 1.6457461645746165,
"grad_norm": 0.011527667753398418,
"learning_rate": 1.3419804741980476e-05,
"loss": 0.0593,
"step": 2360
},
{
"epoch": 1.6527196652719667,
"grad_norm": 0.007492201868444681,
"learning_rate": 1.3391910739191074e-05,
"loss": 0.0047,
"step": 2370
},
{
"epoch": 1.6596931659693166,
"grad_norm": 0.02754775807261467,
"learning_rate": 1.3364016736401674e-05,
"loss": 0.0077,
"step": 2380
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.006202343851327896,
"learning_rate": 1.3336122733612274e-05,
"loss": 0.0031,
"step": 2390
},
{
"epoch": 1.6736401673640167,
"grad_norm": 0.008386103436350822,
"learning_rate": 1.3308228730822874e-05,
"loss": 0.0455,
"step": 2400
},
{
"epoch": 1.6806136680613668,
"grad_norm": 0.027837343513965607,
"learning_rate": 1.3280334728033475e-05,
"loss": 0.0758,
"step": 2410
},
{
"epoch": 1.687587168758717,
"grad_norm": 0.027335721999406815,
"learning_rate": 1.3252440725244073e-05,
"loss": 0.1159,
"step": 2420
},
{
"epoch": 1.694560669456067,
"grad_norm": 0.030841577798128128,
"learning_rate": 1.3224546722454675e-05,
"loss": 0.0242,
"step": 2430
},
{
"epoch": 1.701534170153417,
"grad_norm": 0.1417429894208908,
"learning_rate": 1.3196652719665273e-05,
"loss": 0.005,
"step": 2440
},
{
"epoch": 1.708507670850767,
"grad_norm": 0.05901753529906273,
"learning_rate": 1.3168758716875874e-05,
"loss": 0.0349,
"step": 2450
},
{
"epoch": 1.715481171548117,
"grad_norm": 9.147476196289062,
"learning_rate": 1.3140864714086472e-05,
"loss": 0.0876,
"step": 2460
},
{
"epoch": 1.7224546722454672,
"grad_norm": 0.012487313710153103,
"learning_rate": 1.3112970711297072e-05,
"loss": 0.0508,
"step": 2470
},
{
"epoch": 1.7294281729428174,
"grad_norm": 0.045956723392009735,
"learning_rate": 1.3085076708507672e-05,
"loss": 0.0589,
"step": 2480
},
{
"epoch": 1.7364016736401675,
"grad_norm": 0.013152926228940487,
"learning_rate": 1.3057182705718272e-05,
"loss": 0.0377,
"step": 2490
},
{
"epoch": 1.7433751743375174,
"grad_norm": 0.13610075414180756,
"learning_rate": 1.3029288702928871e-05,
"loss": 0.1634,
"step": 2500
},
{
"epoch": 1.7503486750348674,
"grad_norm": 0.012711451388895512,
"learning_rate": 1.3001394700139471e-05,
"loss": 0.0012,
"step": 2510
},
{
"epoch": 1.7573221757322175,
"grad_norm": 15.870648384094238,
"learning_rate": 1.297350069735007e-05,
"loss": 0.0779,
"step": 2520
},
{
"epoch": 1.7642956764295676,
"grad_norm": 0.06364299356937408,
"learning_rate": 1.294560669456067e-05,
"loss": 0.0842,
"step": 2530
},
{
"epoch": 1.7712691771269178,
"grad_norm": 0.19660770893096924,
"learning_rate": 1.2917712691771269e-05,
"loss": 0.0031,
"step": 2540
},
{
"epoch": 1.778242677824268,
"grad_norm": 0.012577497400343418,
"learning_rate": 1.288981868898187e-05,
"loss": 0.0342,
"step": 2550
},
{
"epoch": 1.7852161785216178,
"grad_norm": 0.02324119582772255,
"learning_rate": 1.286192468619247e-05,
"loss": 0.0026,
"step": 2560
},
{
"epoch": 1.7921896792189678,
"grad_norm": 0.010656113736331463,
"learning_rate": 1.283403068340307e-05,
"loss": 0.0023,
"step": 2570
},
{
"epoch": 1.799163179916318,
"grad_norm": 0.007635409012436867,
"learning_rate": 1.280613668061367e-05,
"loss": 0.0607,
"step": 2580
},
{
"epoch": 1.806136680613668,
"grad_norm": 1.153235912322998,
"learning_rate": 1.2778242677824268e-05,
"loss": 0.2041,
"step": 2590
},
{
"epoch": 1.8131101813110182,
"grad_norm": 11.79737663269043,
"learning_rate": 1.275034867503487e-05,
"loss": 0.0801,
"step": 2600
},
{
"epoch": 1.8200836820083683,
"grad_norm": 1.4480602741241455,
"learning_rate": 1.2722454672245467e-05,
"loss": 0.0129,
"step": 2610
},
{
"epoch": 1.8270571827057183,
"grad_norm": 0.05854364112019539,
"learning_rate": 1.2694560669456069e-05,
"loss": 0.0485,
"step": 2620
},
{
"epoch": 1.8340306834030682,
"grad_norm": 0.01310009602457285,
"learning_rate": 1.2666666666666667e-05,
"loss": 0.065,
"step": 2630
},
{
"epoch": 1.8410041841004183,
"grad_norm": 0.12124790251255035,
"learning_rate": 1.2638772663877268e-05,
"loss": 0.0031,
"step": 2640
},
{
"epoch": 1.8479776847977685,
"grad_norm": 0.06004326790571213,
"learning_rate": 1.2610878661087866e-05,
"loss": 0.0682,
"step": 2650
},
{
"epoch": 1.8549511854951186,
"grad_norm": 0.015955684706568718,
"learning_rate": 1.2582984658298468e-05,
"loss": 0.0437,
"step": 2660
},
{
"epoch": 1.8619246861924688,
"grad_norm": 0.005588918924331665,
"learning_rate": 1.2555090655509066e-05,
"loss": 0.0257,
"step": 2670
},
{
"epoch": 1.8688981868898187,
"grad_norm": 0.028774168342351913,
"learning_rate": 1.2527196652719666e-05,
"loss": 0.0281,
"step": 2680
},
{
"epoch": 1.8758716875871686,
"grad_norm": 0.01642877236008644,
"learning_rate": 1.2499302649930265e-05,
"loss": 0.0204,
"step": 2690
},
{
"epoch": 1.8828451882845187,
"grad_norm": 0.3172270357608795,
"learning_rate": 1.2471408647140865e-05,
"loss": 0.0508,
"step": 2700
},
{
"epoch": 1.8898186889818689,
"grad_norm": 0.049273423850536346,
"learning_rate": 1.2443514644351467e-05,
"loss": 0.0508,
"step": 2710
},
{
"epoch": 1.896792189679219,
"grad_norm": 0.018894419074058533,
"learning_rate": 1.2415620641562065e-05,
"loss": 0.0015,
"step": 2720
},
{
"epoch": 1.9037656903765692,
"grad_norm": 0.04853112995624542,
"learning_rate": 1.2387726638772666e-05,
"loss": 0.0027,
"step": 2730
},
{
"epoch": 1.910739191073919,
"grad_norm": 0.017383117228746414,
"learning_rate": 1.2359832635983264e-05,
"loss": 0.0127,
"step": 2740
},
{
"epoch": 1.917712691771269,
"grad_norm": 0.003187231719493866,
"learning_rate": 1.2331938633193866e-05,
"loss": 0.0153,
"step": 2750
},
{
"epoch": 1.9246861924686192,
"grad_norm": 0.03848472237586975,
"learning_rate": 1.2304044630404464e-05,
"loss": 0.0011,
"step": 2760
},
{
"epoch": 1.9316596931659693,
"grad_norm": 2.478860378265381,
"learning_rate": 1.2276150627615064e-05,
"loss": 0.1436,
"step": 2770
},
{
"epoch": 1.9386331938633194,
"grad_norm": 0.010695680044591427,
"learning_rate": 1.2248256624825663e-05,
"loss": 0.0103,
"step": 2780
},
{
"epoch": 1.9456066945606696,
"grad_norm": 0.2534022927284241,
"learning_rate": 1.2220362622036263e-05,
"loss": 0.0396,
"step": 2790
},
{
"epoch": 1.9525801952580195,
"grad_norm": 0.04518653079867363,
"learning_rate": 1.2192468619246863e-05,
"loss": 0.0089,
"step": 2800
},
{
"epoch": 1.9595536959553694,
"grad_norm": 0.040403205901384354,
"learning_rate": 1.2164574616457463e-05,
"loss": 0.0028,
"step": 2810
},
{
"epoch": 1.9665271966527196,
"grad_norm": 0.004626311827450991,
"learning_rate": 1.2136680613668061e-05,
"loss": 0.1684,
"step": 2820
},
{
"epoch": 1.9735006973500697,
"grad_norm": 0.009998406283557415,
"learning_rate": 1.2108786610878662e-05,
"loss": 0.0401,
"step": 2830
},
{
"epoch": 1.9804741980474199,
"grad_norm": 0.02394164726138115,
"learning_rate": 1.208089260808926e-05,
"loss": 0.0128,
"step": 2840
},
{
"epoch": 1.98744769874477,
"grad_norm": 0.203586608171463,
"learning_rate": 1.2052998605299862e-05,
"loss": 0.0039,
"step": 2850
},
{
"epoch": 1.99442119944212,
"grad_norm": 0.6173217296600342,
"learning_rate": 1.2025104602510462e-05,
"loss": 0.0021,
"step": 2860
},
{
"epoch": 2.00139470013947,
"grad_norm": 0.01286329049617052,
"learning_rate": 1.1997210599721061e-05,
"loss": 0.0445,
"step": 2870
},
{
"epoch": 2.00836820083682,
"grad_norm": 0.019031327217817307,
"learning_rate": 1.1969316596931661e-05,
"loss": 0.0255,
"step": 2880
},
{
"epoch": 2.01534170153417,
"grad_norm": 3.7492589950561523,
"learning_rate": 1.194142259414226e-05,
"loss": 0.0061,
"step": 2890
},
{
"epoch": 2.0223152022315203,
"grad_norm": 0.015447934158146381,
"learning_rate": 1.191352859135286e-05,
"loss": 0.004,
"step": 2900
},
{
"epoch": 2.0292887029288704,
"grad_norm": 0.010356171987950802,
"learning_rate": 1.1885634588563459e-05,
"loss": 0.0311,
"step": 2910
},
{
"epoch": 2.0362622036262206,
"grad_norm": 0.020356524735689163,
"learning_rate": 1.185774058577406e-05,
"loss": 0.0018,
"step": 2920
},
{
"epoch": 2.0432357043235703,
"grad_norm": 0.3535953164100647,
"learning_rate": 1.1829846582984658e-05,
"loss": 0.0014,
"step": 2930
},
{
"epoch": 2.0502092050209204,
"grad_norm": 0.014560551382601261,
"learning_rate": 1.180195258019526e-05,
"loss": 0.03,
"step": 2940
},
{
"epoch": 2.0571827057182706,
"grad_norm": 9.034151077270508,
"learning_rate": 1.1774058577405858e-05,
"loss": 0.0116,
"step": 2950
},
{
"epoch": 2.0641562064156207,
"grad_norm": 6.865650177001953,
"learning_rate": 1.174616457461646e-05,
"loss": 0.1078,
"step": 2960
},
{
"epoch": 2.071129707112971,
"grad_norm": 0.040994029492139816,
"learning_rate": 1.1718270571827058e-05,
"loss": 0.05,
"step": 2970
},
{
"epoch": 2.078103207810321,
"grad_norm": 0.004967757500708103,
"learning_rate": 1.1690376569037657e-05,
"loss": 0.0031,
"step": 2980
},
{
"epoch": 2.0850767085076707,
"grad_norm": 0.01656835526227951,
"learning_rate": 1.1662482566248257e-05,
"loss": 0.1353,
"step": 2990
},
{
"epoch": 2.092050209205021,
"grad_norm": 0.020232422277331352,
"learning_rate": 1.1634588563458857e-05,
"loss": 0.0378,
"step": 3000
},
{
"epoch": 2.099023709902371,
"grad_norm": 0.006969059351831675,
"learning_rate": 1.1606694560669458e-05,
"loss": 0.1451,
"step": 3010
},
{
"epoch": 2.105997210599721,
"grad_norm": 0.011694218032062054,
"learning_rate": 1.1578800557880056e-05,
"loss": 0.0263,
"step": 3020
},
{
"epoch": 2.1129707112970713,
"grad_norm": 0.07425375282764435,
"learning_rate": 1.1550906555090658e-05,
"loss": 0.0022,
"step": 3030
},
{
"epoch": 2.1199442119944214,
"grad_norm": 0.10199136286973953,
"learning_rate": 1.1523012552301256e-05,
"loss": 0.0621,
"step": 3040
},
{
"epoch": 2.126917712691771,
"grad_norm": 0.10195229947566986,
"learning_rate": 1.1495118549511857e-05,
"loss": 0.0342,
"step": 3050
},
{
"epoch": 2.1338912133891212,
"grad_norm": 0.012728706002235413,
"learning_rate": 1.1467224546722456e-05,
"loss": 0.1303,
"step": 3060
},
{
"epoch": 2.1408647140864714,
"grad_norm": 0.018413392826914787,
"learning_rate": 1.1439330543933055e-05,
"loss": 0.0026,
"step": 3070
},
{
"epoch": 2.1478382147838215,
"grad_norm": 0.00421161437407136,
"learning_rate": 1.1411436541143655e-05,
"loss": 0.0795,
"step": 3080
},
{
"epoch": 2.1548117154811717,
"grad_norm": 0.006007787771522999,
"learning_rate": 1.1383542538354255e-05,
"loss": 0.018,
"step": 3090
},
{
"epoch": 2.161785216178522,
"grad_norm": 0.004907716065645218,
"learning_rate": 1.1355648535564853e-05,
"loss": 0.0009,
"step": 3100
},
{
"epoch": 2.1687587168758715,
"grad_norm": 5.030208110809326,
"learning_rate": 1.1327754532775454e-05,
"loss": 0.0816,
"step": 3110
},
{
"epoch": 2.1757322175732217,
"grad_norm": 0.021412838250398636,
"learning_rate": 1.1299860529986053e-05,
"loss": 0.0573,
"step": 3120
},
{
"epoch": 2.182705718270572,
"grad_norm": 0.020408878102898598,
"learning_rate": 1.1271966527196654e-05,
"loss": 0.0005,
"step": 3130
},
{
"epoch": 2.189679218967922,
"grad_norm": 0.20092400908470154,
"learning_rate": 1.1244072524407252e-05,
"loss": 0.0013,
"step": 3140
},
{
"epoch": 2.196652719665272,
"grad_norm": 0.007184051908552647,
"learning_rate": 1.1216178521617854e-05,
"loss": 0.0143,
"step": 3150
},
{
"epoch": 2.2036262203626222,
"grad_norm": 0.0019369281362742186,
"learning_rate": 1.1188284518828453e-05,
"loss": 0.0068,
"step": 3160
},
{
"epoch": 2.210599721059972,
"grad_norm": 0.008907733485102654,
"learning_rate": 1.1160390516039053e-05,
"loss": 0.0017,
"step": 3170
},
{
"epoch": 2.217573221757322,
"grad_norm": 8.365190505981445,
"learning_rate": 1.1132496513249653e-05,
"loss": 0.0422,
"step": 3180
},
{
"epoch": 2.224546722454672,
"grad_norm": 1.0398991107940674,
"learning_rate": 1.1104602510460251e-05,
"loss": 0.0015,
"step": 3190
},
{
"epoch": 2.2315202231520224,
"grad_norm": 0.01677248626947403,
"learning_rate": 1.1076708507670852e-05,
"loss": 0.0053,
"step": 3200
},
{
"epoch": 2.2384937238493725,
"grad_norm": 0.05190812796354294,
"learning_rate": 1.104881450488145e-05,
"loss": 0.0005,
"step": 3210
},
{
"epoch": 2.2454672245467227,
"grad_norm": 4.997675895690918,
"learning_rate": 1.1020920502092052e-05,
"loss": 0.2023,
"step": 3220
},
{
"epoch": 2.2524407252440724,
"grad_norm": 0.006259276531636715,
"learning_rate": 1.099302649930265e-05,
"loss": 0.004,
"step": 3230
},
{
"epoch": 2.2594142259414225,
"grad_norm": 0.008285343647003174,
"learning_rate": 1.0965132496513252e-05,
"loss": 0.0159,
"step": 3240
},
{
"epoch": 2.2663877266387726,
"grad_norm": 3.1502225399017334,
"learning_rate": 1.093723849372385e-05,
"loss": 0.0056,
"step": 3250
},
{
"epoch": 2.273361227336123,
"grad_norm": 1.8060733079910278,
"learning_rate": 1.0909344490934451e-05,
"loss": 0.003,
"step": 3260
},
{
"epoch": 2.280334728033473,
"grad_norm": 0.6619565486907959,
"learning_rate": 1.088145048814505e-05,
"loss": 0.0021,
"step": 3270
},
{
"epoch": 2.287308228730823,
"grad_norm": 0.008702186867594719,
"learning_rate": 1.0853556485355649e-05,
"loss": 0.0678,
"step": 3280
},
{
"epoch": 2.2942817294281728,
"grad_norm": 0.006757175084203482,
"learning_rate": 1.0825662482566249e-05,
"loss": 0.0019,
"step": 3290
},
{
"epoch": 2.301255230125523,
"grad_norm": 0.04931594431400299,
"learning_rate": 1.0797768479776849e-05,
"loss": 0.0014,
"step": 3300
},
{
"epoch": 2.308228730822873,
"grad_norm": 0.4247874319553375,
"learning_rate": 1.0769874476987448e-05,
"loss": 0.1117,
"step": 3310
},
{
"epoch": 2.315202231520223,
"grad_norm": 0.006627683062106371,
"learning_rate": 1.0741980474198048e-05,
"loss": 0.0004,
"step": 3320
},
{
"epoch": 2.3221757322175733,
"grad_norm": 0.07560670375823975,
"learning_rate": 1.071408647140865e-05,
"loss": 0.0014,
"step": 3330
},
{
"epoch": 2.3291492329149235,
"grad_norm": 0.00642388267442584,
"learning_rate": 1.0686192468619248e-05,
"loss": 0.0008,
"step": 3340
},
{
"epoch": 2.336122733612273,
"grad_norm": 0.010923785157501698,
"learning_rate": 1.0658298465829849e-05,
"loss": 0.0005,
"step": 3350
},
{
"epoch": 2.3430962343096233,
"grad_norm": 0.015175443142652512,
"learning_rate": 1.0630404463040447e-05,
"loss": 0.0223,
"step": 3360
},
{
"epoch": 2.3500697350069735,
"grad_norm": 0.004983542487025261,
"learning_rate": 1.0602510460251047e-05,
"loss": 0.0004,
"step": 3370
},
{
"epoch": 2.3570432357043236,
"grad_norm": 0.08845807611942291,
"learning_rate": 1.0574616457461647e-05,
"loss": 0.0568,
"step": 3380
},
{
"epoch": 2.3640167364016738,
"grad_norm": 1.122623085975647,
"learning_rate": 1.0546722454672247e-05,
"loss": 0.0016,
"step": 3390
},
{
"epoch": 2.370990237099024,
"grad_norm": 0.012382916174829006,
"learning_rate": 1.0518828451882845e-05,
"loss": 0.0397,
"step": 3400
},
{
"epoch": 2.3779637377963736,
"grad_norm": 0.052785713225603104,
"learning_rate": 1.0490934449093446e-05,
"loss": 0.0308,
"step": 3410
},
{
"epoch": 2.3849372384937237,
"grad_norm": 0.003585915081202984,
"learning_rate": 1.0463040446304044e-05,
"loss": 0.0005,
"step": 3420
},
{
"epoch": 2.391910739191074,
"grad_norm": 0.13946089148521423,
"learning_rate": 1.0435146443514646e-05,
"loss": 0.0496,
"step": 3430
},
{
"epoch": 2.398884239888424,
"grad_norm": 0.014828328974545002,
"learning_rate": 1.0407252440725244e-05,
"loss": 0.0888,
"step": 3440
},
{
"epoch": 2.405857740585774,
"grad_norm": 5.3019585609436035,
"learning_rate": 1.0379358437935845e-05,
"loss": 0.189,
"step": 3450
},
{
"epoch": 2.4128312412831243,
"grad_norm": 0.019477086141705513,
"learning_rate": 1.0351464435146443e-05,
"loss": 0.0021,
"step": 3460
},
{
"epoch": 2.419804741980474,
"grad_norm": 0.004096941091120243,
"learning_rate": 1.0323570432357045e-05,
"loss": 0.0014,
"step": 3470
},
{
"epoch": 2.426778242677824,
"grad_norm": 0.012808839790523052,
"learning_rate": 1.0295676429567645e-05,
"loss": 0.0013,
"step": 3480
},
{
"epoch": 2.4337517433751743,
"grad_norm": 0.027664173394441605,
"learning_rate": 1.0267782426778243e-05,
"loss": 0.0017,
"step": 3490
},
{
"epoch": 2.4407252440725244,
"grad_norm": 0.002692542504519224,
"learning_rate": 1.0239888423988844e-05,
"loss": 0.0006,
"step": 3500
},
{
"epoch": 2.4476987447698746,
"grad_norm": 0.013481782749295235,
"learning_rate": 1.0211994421199442e-05,
"loss": 0.0006,
"step": 3510
},
{
"epoch": 2.4546722454672247,
"grad_norm": 0.04460657387971878,
"learning_rate": 1.0184100418410044e-05,
"loss": 0.1172,
"step": 3520
},
{
"epoch": 2.4616457461645744,
"grad_norm": 0.009075530804693699,
"learning_rate": 1.0156206415620642e-05,
"loss": 0.0003,
"step": 3530
},
{
"epoch": 2.4686192468619246,
"grad_norm": 0.05851946026086807,
"learning_rate": 1.0128312412831243e-05,
"loss": 0.0385,
"step": 3540
},
{
"epoch": 2.4755927475592747,
"grad_norm": 0.011403707787394524,
"learning_rate": 1.0100418410041841e-05,
"loss": 0.0553,
"step": 3550
},
{
"epoch": 2.482566248256625,
"grad_norm": 0.008116642013192177,
"learning_rate": 1.0072524407252443e-05,
"loss": 0.0455,
"step": 3560
},
{
"epoch": 2.489539748953975,
"grad_norm": 0.01469523087143898,
"learning_rate": 1.004463040446304e-05,
"loss": 0.09,
"step": 3570
},
{
"epoch": 2.496513249651325,
"grad_norm": 0.009429101832211018,
"learning_rate": 1.001673640167364e-05,
"loss": 0.0003,
"step": 3580
},
{
"epoch": 2.5034867503486753,
"grad_norm": 0.002064500702545047,
"learning_rate": 9.98884239888424e-06,
"loss": 0.0277,
"step": 3590
},
{
"epoch": 2.510460251046025,
"grad_norm": 0.0018807955784723163,
"learning_rate": 9.96094839609484e-06,
"loss": 0.0035,
"step": 3600
},
{
"epoch": 2.517433751743375,
"grad_norm": 0.0022944663651287556,
"learning_rate": 9.93305439330544e-06,
"loss": 0.0659,
"step": 3610
},
{
"epoch": 2.5244072524407253,
"grad_norm": 0.004956633783876896,
"learning_rate": 9.90516039051604e-06,
"loss": 0.0005,
"step": 3620
},
{
"epoch": 2.5313807531380754,
"grad_norm": 0.009603263810276985,
"learning_rate": 9.87726638772664e-06,
"loss": 0.0008,
"step": 3630
},
{
"epoch": 2.5383542538354256,
"grad_norm": 0.00588383199647069,
"learning_rate": 9.84937238493724e-06,
"loss": 0.0004,
"step": 3640
},
{
"epoch": 2.5453277545327753,
"grad_norm": 0.0017828900599852204,
"learning_rate": 9.821478382147839e-06,
"loss": 0.0769,
"step": 3650
},
{
"epoch": 2.5523012552301254,
"grad_norm": 2.002347946166992,
"learning_rate": 9.793584379358439e-06,
"loss": 0.0405,
"step": 3660
},
{
"epoch": 2.5592747559274756,
"grad_norm": 0.08858360350131989,
"learning_rate": 9.765690376569039e-06,
"loss": 0.0006,
"step": 3670
},
{
"epoch": 2.5662482566248257,
"grad_norm": 0.0016570795560255647,
"learning_rate": 9.737796373779638e-06,
"loss": 0.0156,
"step": 3680
},
{
"epoch": 2.573221757322176,
"grad_norm": 0.008444487117230892,
"learning_rate": 9.709902370990238e-06,
"loss": 0.0004,
"step": 3690
},
{
"epoch": 2.5801952580195255,
"grad_norm": 0.009042381308972836,
"learning_rate": 9.682008368200838e-06,
"loss": 0.0639,
"step": 3700
},
{
"epoch": 2.587168758716876,
"grad_norm": 0.01534841675311327,
"learning_rate": 9.654114365411438e-06,
"loss": 0.008,
"step": 3710
},
{
"epoch": 2.594142259414226,
"grad_norm": 0.004281465895473957,
"learning_rate": 9.626220362622038e-06,
"loss": 0.001,
"step": 3720
},
{
"epoch": 2.601115760111576,
"grad_norm": 0.012187506072223186,
"learning_rate": 9.598326359832637e-06,
"loss": 0.0013,
"step": 3730
},
{
"epoch": 2.608089260808926,
"grad_norm": 0.007811861112713814,
"learning_rate": 9.570432357043237e-06,
"loss": 0.0761,
"step": 3740
},
{
"epoch": 2.6150627615062763,
"grad_norm": 0.0038925069384276867,
"learning_rate": 9.542538354253837e-06,
"loss": 0.0054,
"step": 3750
},
{
"epoch": 2.6220362622036264,
"grad_norm": 0.004444095306098461,
"learning_rate": 9.514644351464437e-06,
"loss": 0.0115,
"step": 3760
},
{
"epoch": 2.629009762900976,
"grad_norm": 0.035112250596284866,
"learning_rate": 9.486750348675036e-06,
"loss": 0.0016,
"step": 3770
},
{
"epoch": 2.6359832635983262,
"grad_norm": 13.684135437011719,
"learning_rate": 9.458856345885634e-06,
"loss": 0.073,
"step": 3780
},
{
"epoch": 2.6429567642956764,
"grad_norm": 0.012164488434791565,
"learning_rate": 9.430962343096234e-06,
"loss": 0.0013,
"step": 3790
},
{
"epoch": 2.6499302649930265,
"grad_norm": 0.052418239414691925,
"learning_rate": 9.403068340306834e-06,
"loss": 0.0007,
"step": 3800
},
{
"epoch": 2.6569037656903767,
"grad_norm": 0.00277147744782269,
"learning_rate": 9.375174337517434e-06,
"loss": 0.1436,
"step": 3810
},
{
"epoch": 2.6638772663877264,
"grad_norm": 0.0020822423975914717,
"learning_rate": 9.347280334728034e-06,
"loss": 0.029,
"step": 3820
},
{
"epoch": 2.670850767085077,
"grad_norm": 0.04161955416202545,
"learning_rate": 9.319386331938633e-06,
"loss": 0.0618,
"step": 3830
},
{
"epoch": 2.6778242677824267,
"grad_norm": 0.03144453093409538,
"learning_rate": 9.291492329149233e-06,
"loss": 0.0429,
"step": 3840
},
{
"epoch": 2.684797768479777,
"grad_norm": 0.01364242285490036,
"learning_rate": 9.263598326359835e-06,
"loss": 0.0201,
"step": 3850
},
{
"epoch": 2.691771269177127,
"grad_norm": 0.007967864163219929,
"learning_rate": 9.235704323570434e-06,
"loss": 0.0029,
"step": 3860
},
{
"epoch": 2.698744769874477,
"grad_norm": 0.006493957247585058,
"learning_rate": 9.207810320781032e-06,
"loss": 0.052,
"step": 3870
},
{
"epoch": 2.7057182705718272,
"grad_norm": 0.07452358305454254,
"learning_rate": 9.179916317991632e-06,
"loss": 0.0026,
"step": 3880
},
{
"epoch": 2.712691771269177,
"grad_norm": 0.005458319094032049,
"learning_rate": 9.152022315202232e-06,
"loss": 0.0017,
"step": 3890
},
{
"epoch": 2.719665271966527,
"grad_norm": 0.010741036385297775,
"learning_rate": 9.124128312412832e-06,
"loss": 0.0732,
"step": 3900
},
{
"epoch": 2.726638772663877,
"grad_norm": 0.007957357913255692,
"learning_rate": 9.096234309623432e-06,
"loss": 0.0003,
"step": 3910
},
{
"epoch": 2.7336122733612274,
"grad_norm": 0.31968942284584045,
"learning_rate": 9.068340306834031e-06,
"loss": 0.0171,
"step": 3920
},
{
"epoch": 2.7405857740585775,
"grad_norm": 0.0007297178963199258,
"learning_rate": 9.040446304044631e-06,
"loss": 0.0003,
"step": 3930
},
{
"epoch": 2.747559274755927,
"grad_norm": 0.009427106007933617,
"learning_rate": 9.012552301255231e-06,
"loss": 0.0219,
"step": 3940
},
{
"epoch": 2.754532775453278,
"grad_norm": 0.001962635898962617,
"learning_rate": 8.98465829846583e-06,
"loss": 0.0371,
"step": 3950
},
{
"epoch": 2.7615062761506275,
"grad_norm": 0.020612264052033424,
"learning_rate": 8.95676429567643e-06,
"loss": 0.0006,
"step": 3960
},
{
"epoch": 2.7684797768479776,
"grad_norm": 0.004890106618404388,
"learning_rate": 8.92887029288703e-06,
"loss": 0.0016,
"step": 3970
},
{
"epoch": 2.775453277545328,
"grad_norm": 0.03778740391135216,
"learning_rate": 8.90097629009763e-06,
"loss": 0.0061,
"step": 3980
},
{
"epoch": 2.782426778242678,
"grad_norm": 0.0058617801405489445,
"learning_rate": 8.873082287308228e-06,
"loss": 0.0445,
"step": 3990
},
{
"epoch": 2.789400278940028,
"grad_norm": 0.005100315902382135,
"learning_rate": 8.84518828451883e-06,
"loss": 0.087,
"step": 4000
},
{
"epoch": 2.7963737796373778,
"grad_norm": 0.030730150640010834,
"learning_rate": 8.81729428172943e-06,
"loss": 0.0791,
"step": 4010
},
{
"epoch": 2.803347280334728,
"grad_norm": 0.00628610560670495,
"learning_rate": 8.789400278940029e-06,
"loss": 0.1605,
"step": 4020
},
{
"epoch": 2.810320781032078,
"grad_norm": 0.0032315838616341352,
"learning_rate": 8.761506276150629e-06,
"loss": 0.0085,
"step": 4030
},
{
"epoch": 2.817294281729428,
"grad_norm": 0.0032316127326339483,
"learning_rate": 8.733612273361229e-06,
"loss": 0.0006,
"step": 4040
},
{
"epoch": 2.8242677824267783,
"grad_norm": 0.0020475969649851322,
"learning_rate": 8.705718270571828e-06,
"loss": 0.0293,
"step": 4050
},
{
"epoch": 2.831241283124128,
"grad_norm": 0.032252971082925797,
"learning_rate": 8.677824267782428e-06,
"loss": 0.0309,
"step": 4060
},
{
"epoch": 2.8382147838214786,
"grad_norm": 0.07949300855398178,
"learning_rate": 8.649930264993028e-06,
"loss": 0.0026,
"step": 4070
},
{
"epoch": 2.8451882845188283,
"grad_norm": 0.0017817869083955884,
"learning_rate": 8.622036262203626e-06,
"loss": 0.0005,
"step": 4080
},
{
"epoch": 2.8521617852161785,
"grad_norm": 0.07673770934343338,
"learning_rate": 8.594142259414226e-06,
"loss": 0.0016,
"step": 4090
},
{
"epoch": 2.8591352859135286,
"grad_norm": 13.104846954345703,
"learning_rate": 8.566248256624826e-06,
"loss": 0.0546,
"step": 4100
},
{
"epoch": 2.8661087866108788,
"grad_norm": 14.739638328552246,
"learning_rate": 8.538354253835425e-06,
"loss": 0.0658,
"step": 4110
},
{
"epoch": 2.873082287308229,
"grad_norm": 0.018161823973059654,
"learning_rate": 8.510460251046025e-06,
"loss": 0.0839,
"step": 4120
},
{
"epoch": 2.8800557880055786,
"grad_norm": 0.02036408893764019,
"learning_rate": 8.482566248256625e-06,
"loss": 0.0007,
"step": 4130
},
{
"epoch": 2.8870292887029287,
"grad_norm": 0.007715345360338688,
"learning_rate": 8.454672245467225e-06,
"loss": 0.0009,
"step": 4140
},
{
"epoch": 2.894002789400279,
"grad_norm": 0.3437242805957794,
"learning_rate": 8.426778242677825e-06,
"loss": 0.0007,
"step": 4150
},
{
"epoch": 2.900976290097629,
"grad_norm": 0.027546469122171402,
"learning_rate": 8.398884239888424e-06,
"loss": 0.0124,
"step": 4160
},
{
"epoch": 2.907949790794979,
"grad_norm": 0.0030263513326644897,
"learning_rate": 8.370990237099024e-06,
"loss": 0.0808,
"step": 4170
},
{
"epoch": 2.914923291492329,
"grad_norm": 0.0032276464626193047,
"learning_rate": 8.343096234309624e-06,
"loss": 0.0014,
"step": 4180
},
{
"epoch": 2.9218967921896795,
"grad_norm": 0.0074035353027284145,
"learning_rate": 8.315202231520224e-06,
"loss": 0.0004,
"step": 4190
},
{
"epoch": 2.928870292887029,
"grad_norm": 0.016391828656196594,
"learning_rate": 8.287308228730823e-06,
"loss": 0.0314,
"step": 4200
},
{
"epoch": 2.9358437935843793,
"grad_norm": 0.9994223713874817,
"learning_rate": 8.259414225941423e-06,
"loss": 0.0013,
"step": 4210
},
{
"epoch": 2.9428172942817294,
"grad_norm": 0.006372373551130295,
"learning_rate": 8.231520223152023e-06,
"loss": 0.0654,
"step": 4220
},
{
"epoch": 2.9497907949790796,
"grad_norm": 0.006764199584722519,
"learning_rate": 8.203626220362623e-06,
"loss": 0.0004,
"step": 4230
},
{
"epoch": 2.9567642956764297,
"grad_norm": 0.012071878649294376,
"learning_rate": 8.175732217573223e-06,
"loss": 0.0341,
"step": 4240
},
{
"epoch": 2.9637377963737794,
"grad_norm": 0.10174605250358582,
"learning_rate": 8.147838214783822e-06,
"loss": 0.0285,
"step": 4250
},
{
"epoch": 2.9707112970711296,
"grad_norm": 0.010248345322906971,
"learning_rate": 8.119944211994422e-06,
"loss": 0.0931,
"step": 4260
},
{
"epoch": 2.9776847977684797,
"grad_norm": 0.06934584677219391,
"learning_rate": 8.092050209205022e-06,
"loss": 0.0008,
"step": 4270
},
{
"epoch": 2.98465829846583,
"grad_norm": 0.04618504270911217,
"learning_rate": 8.064156206415622e-06,
"loss": 0.0534,
"step": 4280
},
{
"epoch": 2.99163179916318,
"grad_norm": 0.04851532354950905,
"learning_rate": 8.03626220362622e-06,
"loss": 0.0679,
"step": 4290
},
{
"epoch": 2.99860529986053,
"grad_norm": 0.004045933019369841,
"learning_rate": 8.00836820083682e-06,
"loss": 0.0816,
"step": 4300
}
],
"logging_steps": 10,
"max_steps": 7170,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4652091396864000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}