gr00t-folding-clothes / trainer_state.json
maxzand's picture
Upload folder using huggingface_hub
954c86b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.476683937823834,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006476683937823834,
"grad_norm": 12.527985572814941,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.244,
"step": 10
},
{
"epoch": 0.012953367875647668,
"grad_norm": 19.27385139465332,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2301,
"step": 20
},
{
"epoch": 0.019430051813471502,
"grad_norm": 6.885942459106445,
"learning_rate": 6e-06,
"loss": 1.169,
"step": 30
},
{
"epoch": 0.025906735751295335,
"grad_norm": 5.766233921051025,
"learning_rate": 8.000000000000001e-06,
"loss": 0.831,
"step": 40
},
{
"epoch": 0.03238341968911917,
"grad_norm": 3.2696096897125244,
"learning_rate": 1e-05,
"loss": 0.5349,
"step": 50
},
{
"epoch": 0.038860103626943004,
"grad_norm": 3.4554696083068848,
"learning_rate": 1.2e-05,
"loss": 0.4294,
"step": 60
},
{
"epoch": 0.04533678756476684,
"grad_norm": 2.4693260192871094,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.3355,
"step": 70
},
{
"epoch": 0.05181347150259067,
"grad_norm": 1.4052163362503052,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.3158,
"step": 80
},
{
"epoch": 0.05829015544041451,
"grad_norm": 2.1825428009033203,
"learning_rate": 1.8e-05,
"loss": 0.2853,
"step": 90
},
{
"epoch": 0.06476683937823834,
"grad_norm": 1.7514649629592896,
"learning_rate": 2e-05,
"loss": 0.2427,
"step": 100
},
{
"epoch": 0.07124352331606218,
"grad_norm": 4.749204158782959,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.2211,
"step": 110
},
{
"epoch": 0.07772020725388601,
"grad_norm": 2.262394428253174,
"learning_rate": 2.4e-05,
"loss": 0.2275,
"step": 120
},
{
"epoch": 0.08419689119170984,
"grad_norm": 2.971313238143921,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.1767,
"step": 130
},
{
"epoch": 0.09067357512953368,
"grad_norm": 1.2210947275161743,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.167,
"step": 140
},
{
"epoch": 0.09715025906735751,
"grad_norm": 6.283907890319824,
"learning_rate": 3e-05,
"loss": 0.1677,
"step": 150
},
{
"epoch": 0.10362694300518134,
"grad_norm": 0.9573088884353638,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.1766,
"step": 160
},
{
"epoch": 0.11010362694300518,
"grad_norm": 1.4948713779449463,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.149,
"step": 170
},
{
"epoch": 0.11658031088082901,
"grad_norm": 1.1040873527526855,
"learning_rate": 3.6e-05,
"loss": 0.1549,
"step": 180
},
{
"epoch": 0.12305699481865284,
"grad_norm": 0.8945522904396057,
"learning_rate": 3.8e-05,
"loss": 0.1424,
"step": 190
},
{
"epoch": 0.12953367875647667,
"grad_norm": 1.265764594078064,
"learning_rate": 4e-05,
"loss": 0.1503,
"step": 200
},
{
"epoch": 0.13601036269430053,
"grad_norm": 1.3545417785644531,
"learning_rate": 4.2e-05,
"loss": 0.1511,
"step": 210
},
{
"epoch": 0.14248704663212436,
"grad_norm": 2.2851409912109375,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.1424,
"step": 220
},
{
"epoch": 0.1489637305699482,
"grad_norm": 1.2114957571029663,
"learning_rate": 4.600000000000001e-05,
"loss": 0.1295,
"step": 230
},
{
"epoch": 0.15544041450777202,
"grad_norm": 0.7779485583305359,
"learning_rate": 4.8e-05,
"loss": 0.1347,
"step": 240
},
{
"epoch": 0.16191709844559585,
"grad_norm": 0.9195041060447693,
"learning_rate": 5e-05,
"loss": 0.1205,
"step": 250
},
{
"epoch": 0.16839378238341968,
"grad_norm": 1.378207802772522,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.128,
"step": 260
},
{
"epoch": 0.17487046632124353,
"grad_norm": 0.9869899153709412,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.1199,
"step": 270
},
{
"epoch": 0.18134715025906736,
"grad_norm": 2.6648809909820557,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.1442,
"step": 280
},
{
"epoch": 0.1878238341968912,
"grad_norm": 1.7314049005508423,
"learning_rate": 5.8e-05,
"loss": 0.1287,
"step": 290
},
{
"epoch": 0.19430051813471502,
"grad_norm": 1.1444119215011597,
"learning_rate": 6e-05,
"loss": 0.1206,
"step": 300
},
{
"epoch": 0.20077720207253885,
"grad_norm": 0.8097096085548401,
"learning_rate": 6.2e-05,
"loss": 0.1139,
"step": 310
},
{
"epoch": 0.20725388601036268,
"grad_norm": 1.2885841131210327,
"learning_rate": 6.400000000000001e-05,
"loss": 0.1244,
"step": 320
},
{
"epoch": 0.21373056994818654,
"grad_norm": 0.8578065037727356,
"learning_rate": 6.6e-05,
"loss": 0.1112,
"step": 330
},
{
"epoch": 0.22020725388601037,
"grad_norm": 1.3751784563064575,
"learning_rate": 6.800000000000001e-05,
"loss": 0.1033,
"step": 340
},
{
"epoch": 0.2266839378238342,
"grad_norm": 1.1707627773284912,
"learning_rate": 7e-05,
"loss": 0.1162,
"step": 350
},
{
"epoch": 0.23316062176165803,
"grad_norm": 1.7474905252456665,
"learning_rate": 7.2e-05,
"loss": 0.1038,
"step": 360
},
{
"epoch": 0.23963730569948186,
"grad_norm": 0.5757717490196228,
"learning_rate": 7.4e-05,
"loss": 0.0896,
"step": 370
},
{
"epoch": 0.24611398963730569,
"grad_norm": 1.0151946544647217,
"learning_rate": 7.6e-05,
"loss": 0.1136,
"step": 380
},
{
"epoch": 0.25259067357512954,
"grad_norm": 0.9157730937004089,
"learning_rate": 7.800000000000001e-05,
"loss": 0.1207,
"step": 390
},
{
"epoch": 0.25906735751295334,
"grad_norm": 0.9437503218650818,
"learning_rate": 8e-05,
"loss": 0.1058,
"step": 400
},
{
"epoch": 0.2655440414507772,
"grad_norm": 0.8575088381767273,
"learning_rate": 8.2e-05,
"loss": 0.1019,
"step": 410
},
{
"epoch": 0.27202072538860106,
"grad_norm": 1.1373648643493652,
"learning_rate": 8.4e-05,
"loss": 0.1168,
"step": 420
},
{
"epoch": 0.27849740932642486,
"grad_norm": 0.9613192677497864,
"learning_rate": 8.6e-05,
"loss": 0.1048,
"step": 430
},
{
"epoch": 0.2849740932642487,
"grad_norm": 1.4302594661712646,
"learning_rate": 8.800000000000001e-05,
"loss": 0.1037,
"step": 440
},
{
"epoch": 0.2914507772020725,
"grad_norm": 1.0947346687316895,
"learning_rate": 9e-05,
"loss": 0.1023,
"step": 450
},
{
"epoch": 0.2979274611398964,
"grad_norm": 1.21216881275177,
"learning_rate": 9.200000000000001e-05,
"loss": 0.0955,
"step": 460
},
{
"epoch": 0.30440414507772023,
"grad_norm": 1.4514696598052979,
"learning_rate": 9.4e-05,
"loss": 0.1054,
"step": 470
},
{
"epoch": 0.31088082901554404,
"grad_norm": 1.2861961126327515,
"learning_rate": 9.6e-05,
"loss": 0.1016,
"step": 480
},
{
"epoch": 0.3173575129533679,
"grad_norm": 0.5643912553787231,
"learning_rate": 9.8e-05,
"loss": 0.1003,
"step": 490
},
{
"epoch": 0.3238341968911917,
"grad_norm": 0.9163880348205566,
"learning_rate": 0.0001,
"loss": 0.0849,
"step": 500
},
{
"epoch": 0.33031088082901555,
"grad_norm": 1.2243982553482056,
"learning_rate": 9.999972660400536e-05,
"loss": 0.0903,
"step": 510
},
{
"epoch": 0.33678756476683935,
"grad_norm": 0.9769583940505981,
"learning_rate": 9.999890641901125e-05,
"loss": 0.0926,
"step": 520
},
{
"epoch": 0.3432642487046632,
"grad_norm": 1.0890876054763794,
"learning_rate": 9.999753945398704e-05,
"loss": 0.09,
"step": 530
},
{
"epoch": 0.34974093264248707,
"grad_norm": 0.701673150062561,
"learning_rate": 9.99956257238817e-05,
"loss": 0.105,
"step": 540
},
{
"epoch": 0.35621761658031087,
"grad_norm": 0.6724784970283508,
"learning_rate": 9.999316524962345e-05,
"loss": 0.0862,
"step": 550
},
{
"epoch": 0.3626943005181347,
"grad_norm": 0.6740795373916626,
"learning_rate": 9.999015805811965e-05,
"loss": 0.0933,
"step": 560
},
{
"epoch": 0.36917098445595853,
"grad_norm": 0.9289199113845825,
"learning_rate": 9.998660418225645e-05,
"loss": 0.1064,
"step": 570
},
{
"epoch": 0.3756476683937824,
"grad_norm": 0.9368308186531067,
"learning_rate": 9.998250366089848e-05,
"loss": 0.1032,
"step": 580
},
{
"epoch": 0.38212435233160624,
"grad_norm": 0.688762903213501,
"learning_rate": 9.997785653888835e-05,
"loss": 0.0899,
"step": 590
},
{
"epoch": 0.38860103626943004,
"grad_norm": 0.9816915988922119,
"learning_rate": 9.997266286704631e-05,
"loss": 0.0998,
"step": 600
},
{
"epoch": 0.3950777202072539,
"grad_norm": 0.8907963037490845,
"learning_rate": 9.996692270216947e-05,
"loss": 0.0987,
"step": 610
},
{
"epoch": 0.4015544041450777,
"grad_norm": 1.1411789655685425,
"learning_rate": 9.996063610703137e-05,
"loss": 0.0936,
"step": 620
},
{
"epoch": 0.40803108808290156,
"grad_norm": 0.7415697574615479,
"learning_rate": 9.995380315038119e-05,
"loss": 0.084,
"step": 630
},
{
"epoch": 0.41450777202072536,
"grad_norm": 0.856239378452301,
"learning_rate": 9.994642390694308e-05,
"loss": 0.0881,
"step": 640
},
{
"epoch": 0.4209844559585492,
"grad_norm": 0.8964262008666992,
"learning_rate": 9.993849845741524e-05,
"loss": 0.0872,
"step": 650
},
{
"epoch": 0.4274611398963731,
"grad_norm": 1.0198777914047241,
"learning_rate": 9.993002688846913e-05,
"loss": 0.086,
"step": 660
},
{
"epoch": 0.4339378238341969,
"grad_norm": 0.7242151498794556,
"learning_rate": 9.992100929274846e-05,
"loss": 0.0695,
"step": 670
},
{
"epoch": 0.44041450777202074,
"grad_norm": 0.6688233017921448,
"learning_rate": 9.991144576886823e-05,
"loss": 0.0869,
"step": 680
},
{
"epoch": 0.44689119170984454,
"grad_norm": 1.0634338855743408,
"learning_rate": 9.990133642141359e-05,
"loss": 0.074,
"step": 690
},
{
"epoch": 0.4533678756476684,
"grad_norm": 0.9007896184921265,
"learning_rate": 9.989068136093873e-05,
"loss": 0.0795,
"step": 700
},
{
"epoch": 0.45984455958549225,
"grad_norm": 0.9645273685455322,
"learning_rate": 9.987948070396571e-05,
"loss": 0.0748,
"step": 710
},
{
"epoch": 0.46632124352331605,
"grad_norm": 0.6668915748596191,
"learning_rate": 9.986773457298311e-05,
"loss": 0.0724,
"step": 720
},
{
"epoch": 0.4727979274611399,
"grad_norm": 0.6029163599014282,
"learning_rate": 9.985544309644475e-05,
"loss": 0.0914,
"step": 730
},
{
"epoch": 0.4792746113989637,
"grad_norm": 0.4224916398525238,
"learning_rate": 9.984260640876821e-05,
"loss": 0.0698,
"step": 740
},
{
"epoch": 0.48575129533678757,
"grad_norm": 0.5059609413146973,
"learning_rate": 9.98292246503335e-05,
"loss": 0.0735,
"step": 750
},
{
"epoch": 0.49222797927461137,
"grad_norm": 0.6006156802177429,
"learning_rate": 9.981529796748134e-05,
"loss": 0.0738,
"step": 760
},
{
"epoch": 0.49870466321243523,
"grad_norm": 0.6188303828239441,
"learning_rate": 9.980082651251175e-05,
"loss": 0.0757,
"step": 770
},
{
"epoch": 0.5051813471502591,
"grad_norm": 0.5524353981018066,
"learning_rate": 9.97858104436822e-05,
"loss": 0.0759,
"step": 780
},
{
"epoch": 0.5116580310880829,
"grad_norm": 0.7834250330924988,
"learning_rate": 9.977024992520602e-05,
"loss": 0.0798,
"step": 790
},
{
"epoch": 0.5181347150259067,
"grad_norm": 0.5550430417060852,
"learning_rate": 9.975414512725057e-05,
"loss": 0.0829,
"step": 800
},
{
"epoch": 0.5246113989637305,
"grad_norm": 0.4284016788005829,
"learning_rate": 9.973749622593534e-05,
"loss": 0.0737,
"step": 810
},
{
"epoch": 0.5310880829015544,
"grad_norm": 0.6256512403488159,
"learning_rate": 9.972030340333001e-05,
"loss": 0.0911,
"step": 820
},
{
"epoch": 0.5375647668393783,
"grad_norm": 0.7751624584197998,
"learning_rate": 9.970256684745258e-05,
"loss": 0.0744,
"step": 830
},
{
"epoch": 0.5440414507772021,
"grad_norm": 0.9001058340072632,
"learning_rate": 9.968428675226714e-05,
"loss": 0.0587,
"step": 840
},
{
"epoch": 0.5505181347150259,
"grad_norm": 0.46805211901664734,
"learning_rate": 9.966546331768191e-05,
"loss": 0.0704,
"step": 850
},
{
"epoch": 0.5569948186528497,
"grad_norm": 0.6288490891456604,
"learning_rate": 9.964609674954696e-05,
"loss": 0.0778,
"step": 860
},
{
"epoch": 0.5634715025906736,
"grad_norm": 0.6661942601203918,
"learning_rate": 9.962618725965196e-05,
"loss": 0.0722,
"step": 870
},
{
"epoch": 0.5699481865284974,
"grad_norm": 1.02120041847229,
"learning_rate": 9.96057350657239e-05,
"loss": 0.0733,
"step": 880
},
{
"epoch": 0.5764248704663213,
"grad_norm": 0.5569030046463013,
"learning_rate": 9.95847403914247e-05,
"loss": 0.0744,
"step": 890
},
{
"epoch": 0.582901554404145,
"grad_norm": 0.6581366062164307,
"learning_rate": 9.956320346634876e-05,
"loss": 0.0641,
"step": 900
},
{
"epoch": 0.5893782383419689,
"grad_norm": 0.8849700689315796,
"learning_rate": 9.954112452602045e-05,
"loss": 0.0638,
"step": 910
},
{
"epoch": 0.5958549222797928,
"grad_norm": 0.6778038740158081,
"learning_rate": 9.95185038118915e-05,
"loss": 0.074,
"step": 920
},
{
"epoch": 0.6023316062176166,
"grad_norm": 0.8255287408828735,
"learning_rate": 9.949534157133844e-05,
"loss": 0.0886,
"step": 930
},
{
"epoch": 0.6088082901554405,
"grad_norm": 0.6942656636238098,
"learning_rate": 9.94716380576598e-05,
"loss": 0.0708,
"step": 940
},
{
"epoch": 0.6152849740932642,
"grad_norm": 0.5593329071998596,
"learning_rate": 9.944739353007344e-05,
"loss": 0.071,
"step": 950
},
{
"epoch": 0.6217616580310881,
"grad_norm": 0.9311866164207458,
"learning_rate": 9.942260825371358e-05,
"loss": 0.0684,
"step": 960
},
{
"epoch": 0.6282383419689119,
"grad_norm": 0.5894584655761719,
"learning_rate": 9.939728249962807e-05,
"loss": 0.0676,
"step": 970
},
{
"epoch": 0.6347150259067358,
"grad_norm": 0.40771910548210144,
"learning_rate": 9.937141654477528e-05,
"loss": 0.0638,
"step": 980
},
{
"epoch": 0.6411917098445595,
"grad_norm": 0.4833804666996002,
"learning_rate": 9.934501067202117e-05,
"loss": 0.0656,
"step": 990
},
{
"epoch": 0.6476683937823834,
"grad_norm": 0.5921794772148132,
"learning_rate": 9.931806517013612e-05,
"loss": 0.0591,
"step": 1000
},
{
"epoch": 0.6541450777202072,
"grad_norm": 0.40994754433631897,
"learning_rate": 9.929058033379181e-05,
"loss": 0.077,
"step": 1010
},
{
"epoch": 0.6606217616580311,
"grad_norm": 1.0852352380752563,
"learning_rate": 9.926255646355804e-05,
"loss": 0.0573,
"step": 1020
},
{
"epoch": 0.667098445595855,
"grad_norm": 0.833574116230011,
"learning_rate": 9.923399386589933e-05,
"loss": 0.0586,
"step": 1030
},
{
"epoch": 0.6735751295336787,
"grad_norm": 0.9025760889053345,
"learning_rate": 9.92048928531717e-05,
"loss": 0.0627,
"step": 1040
},
{
"epoch": 0.6800518134715026,
"grad_norm": 0.525789737701416,
"learning_rate": 9.917525374361912e-05,
"loss": 0.0599,
"step": 1050
},
{
"epoch": 0.6865284974093264,
"grad_norm": 0.4664829969406128,
"learning_rate": 9.914507686137019e-05,
"loss": 0.0635,
"step": 1060
},
{
"epoch": 0.6930051813471503,
"grad_norm": 0.29409900307655334,
"learning_rate": 9.911436253643445e-05,
"loss": 0.0528,
"step": 1070
},
{
"epoch": 0.6994818652849741,
"grad_norm": 0.7681676149368286,
"learning_rate": 9.90831111046988e-05,
"loss": 0.062,
"step": 1080
},
{
"epoch": 0.7059585492227979,
"grad_norm": 0.741635799407959,
"learning_rate": 9.905132290792394e-05,
"loss": 0.0615,
"step": 1090
},
{
"epoch": 0.7124352331606217,
"grad_norm": 0.4648561477661133,
"learning_rate": 9.901899829374047e-05,
"loss": 0.0627,
"step": 1100
},
{
"epoch": 0.7189119170984456,
"grad_norm": 0.40356361865997314,
"learning_rate": 9.89861376156452e-05,
"loss": 0.0577,
"step": 1110
},
{
"epoch": 0.7253886010362695,
"grad_norm": 0.3577727675437927,
"learning_rate": 9.895274123299723e-05,
"loss": 0.0538,
"step": 1120
},
{
"epoch": 0.7318652849740933,
"grad_norm": 0.3680018186569214,
"learning_rate": 9.891880951101407e-05,
"loss": 0.0538,
"step": 1130
},
{
"epoch": 0.7383419689119171,
"grad_norm": 0.5400322675704956,
"learning_rate": 9.888434282076758e-05,
"loss": 0.0541,
"step": 1140
},
{
"epoch": 0.7448186528497409,
"grad_norm": 0.4998588562011719,
"learning_rate": 9.884934153917997e-05,
"loss": 0.0582,
"step": 1150
},
{
"epoch": 0.7512953367875648,
"grad_norm": 0.6761271953582764,
"learning_rate": 9.881380604901964e-05,
"loss": 0.0557,
"step": 1160
},
{
"epoch": 0.7577720207253886,
"grad_norm": 0.751621425151825,
"learning_rate": 9.877773673889701e-05,
"loss": 0.0627,
"step": 1170
},
{
"epoch": 0.7642487046632125,
"grad_norm": 0.563669741153717,
"learning_rate": 9.87411340032603e-05,
"loss": 0.0572,
"step": 1180
},
{
"epoch": 0.7707253886010362,
"grad_norm": 0.4242180585861206,
"learning_rate": 9.870399824239117e-05,
"loss": 0.0572,
"step": 1190
},
{
"epoch": 0.7772020725388601,
"grad_norm": 0.37374168634414673,
"learning_rate": 9.86663298624003e-05,
"loss": 0.058,
"step": 1200
},
{
"epoch": 0.783678756476684,
"grad_norm": 0.4612903296947479,
"learning_rate": 9.862812927522309e-05,
"loss": 0.0657,
"step": 1210
},
{
"epoch": 0.7901554404145078,
"grad_norm": 0.6304490566253662,
"learning_rate": 9.858939689861506e-05,
"loss": 0.0573,
"step": 1220
},
{
"epoch": 0.7966321243523317,
"grad_norm": 0.5008482336997986,
"learning_rate": 9.855013315614725e-05,
"loss": 0.0564,
"step": 1230
},
{
"epoch": 0.8031088082901554,
"grad_norm": 0.6502981185913086,
"learning_rate": 9.851033847720166e-05,
"loss": 0.0688,
"step": 1240
},
{
"epoch": 0.8095854922279793,
"grad_norm": 0.6085125803947449,
"learning_rate": 9.847001329696653e-05,
"loss": 0.0539,
"step": 1250
},
{
"epoch": 0.8160621761658031,
"grad_norm": 0.4931461215019226,
"learning_rate": 9.842915805643155e-05,
"loss": 0.0491,
"step": 1260
},
{
"epoch": 0.822538860103627,
"grad_norm": 0.6507974863052368,
"learning_rate": 9.838777320238312e-05,
"loss": 0.0514,
"step": 1270
},
{
"epoch": 0.8290155440414507,
"grad_norm": 0.753237783908844,
"learning_rate": 9.834585918739936e-05,
"loss": 0.0632,
"step": 1280
},
{
"epoch": 0.8354922279792746,
"grad_norm": 0.5048700571060181,
"learning_rate": 9.830341646984521e-05,
"loss": 0.0604,
"step": 1290
},
{
"epoch": 0.8419689119170984,
"grad_norm": 0.5395866632461548,
"learning_rate": 9.826044551386744e-05,
"loss": 0.0596,
"step": 1300
},
{
"epoch": 0.8484455958549223,
"grad_norm": 0.3940522074699402,
"learning_rate": 9.821694678938953e-05,
"loss": 0.0547,
"step": 1310
},
{
"epoch": 0.8549222797927462,
"grad_norm": 0.4183802902698517,
"learning_rate": 9.817292077210659e-05,
"loss": 0.0542,
"step": 1320
},
{
"epoch": 0.8613989637305699,
"grad_norm": 0.6100337505340576,
"learning_rate": 9.812836794348004e-05,
"loss": 0.0688,
"step": 1330
},
{
"epoch": 0.8678756476683938,
"grad_norm": 0.5487945675849915,
"learning_rate": 9.808328879073251e-05,
"loss": 0.0597,
"step": 1340
},
{
"epoch": 0.8743523316062176,
"grad_norm": 0.6972073912620544,
"learning_rate": 9.803768380684242e-05,
"loss": 0.0526,
"step": 1350
},
{
"epoch": 0.8808290155440415,
"grad_norm": 0.6903045773506165,
"learning_rate": 9.799155349053851e-05,
"loss": 0.0608,
"step": 1360
},
{
"epoch": 0.8873056994818653,
"grad_norm": 0.5447720885276794,
"learning_rate": 9.794489834629455e-05,
"loss": 0.047,
"step": 1370
},
{
"epoch": 0.8937823834196891,
"grad_norm": 0.6312441825866699,
"learning_rate": 9.789771888432375e-05,
"loss": 0.0508,
"step": 1380
},
{
"epoch": 0.9002590673575129,
"grad_norm": 0.6527279615402222,
"learning_rate": 9.785001562057309e-05,
"loss": 0.06,
"step": 1390
},
{
"epoch": 0.9067357512953368,
"grad_norm": 0.5341177582740784,
"learning_rate": 9.780178907671789e-05,
"loss": 0.055,
"step": 1400
},
{
"epoch": 0.9132124352331606,
"grad_norm": 0.7597373723983765,
"learning_rate": 9.775303978015585e-05,
"loss": 0.0677,
"step": 1410
},
{
"epoch": 0.9196891191709845,
"grad_norm": 0.6934833526611328,
"learning_rate": 9.77037682640015e-05,
"loss": 0.0528,
"step": 1420
},
{
"epoch": 0.9261658031088082,
"grad_norm": 0.6315605640411377,
"learning_rate": 9.765397506708023e-05,
"loss": 0.0607,
"step": 1430
},
{
"epoch": 0.9326424870466321,
"grad_norm": 0.5957879424095154,
"learning_rate": 9.760366073392246e-05,
"loss": 0.0428,
"step": 1440
},
{
"epoch": 0.939119170984456,
"grad_norm": 0.4082203209400177,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0534,
"step": 1450
},
{
"epoch": 0.9455958549222798,
"grad_norm": 0.5609003305435181,
"learning_rate": 9.750147086550844e-05,
"loss": 0.0489,
"step": 1460
},
{
"epoch": 0.9520725388601037,
"grad_norm": 0.5083820819854736,
"learning_rate": 9.744959644778422e-05,
"loss": 0.0452,
"step": 1470
},
{
"epoch": 0.9585492227979274,
"grad_norm": 0.48094549775123596,
"learning_rate": 9.739720312887535e-05,
"loss": 0.0551,
"step": 1480
},
{
"epoch": 0.9650259067357513,
"grad_norm": 0.46845394372940063,
"learning_rate": 9.734429148174675e-05,
"loss": 0.0459,
"step": 1490
},
{
"epoch": 0.9715025906735751,
"grad_norm": 0.46226462721824646,
"learning_rate": 9.729086208503174e-05,
"loss": 0.0565,
"step": 1500
},
{
"epoch": 0.977979274611399,
"grad_norm": 0.3774145543575287,
"learning_rate": 9.723691552302562e-05,
"loss": 0.0445,
"step": 1510
},
{
"epoch": 0.9844559585492227,
"grad_norm": 0.6989986300468445,
"learning_rate": 9.718245238567939e-05,
"loss": 0.0513,
"step": 1520
},
{
"epoch": 0.9909326424870466,
"grad_norm": 0.31146273016929626,
"learning_rate": 9.712747326859315e-05,
"loss": 0.0415,
"step": 1530
},
{
"epoch": 0.9974093264248705,
"grad_norm": 0.5306687951087952,
"learning_rate": 9.707197877300974e-05,
"loss": 0.043,
"step": 1540
},
{
"epoch": 1.0038860103626943,
"grad_norm": 0.4852505922317505,
"learning_rate": 9.701596950580806e-05,
"loss": 0.0519,
"step": 1550
},
{
"epoch": 1.0103626943005182,
"grad_norm": 0.38807374238967896,
"learning_rate": 9.695944607949649e-05,
"loss": 0.0486,
"step": 1560
},
{
"epoch": 1.016839378238342,
"grad_norm": 0.4936634600162506,
"learning_rate": 9.690240911220618e-05,
"loss": 0.0459,
"step": 1570
},
{
"epoch": 1.0233160621761659,
"grad_norm": 0.5796962976455688,
"learning_rate": 9.684485922768422e-05,
"loss": 0.0515,
"step": 1580
},
{
"epoch": 1.0297927461139897,
"grad_norm": 0.38532519340515137,
"learning_rate": 9.6786797055287e-05,
"loss": 0.0498,
"step": 1590
},
{
"epoch": 1.0362694300518134,
"grad_norm": 0.45021045207977295,
"learning_rate": 9.672822322997305e-05,
"loss": 0.0497,
"step": 1600
},
{
"epoch": 1.0427461139896372,
"grad_norm": 0.5176680684089661,
"learning_rate": 9.66691383922964e-05,
"loss": 0.0556,
"step": 1610
},
{
"epoch": 1.049222797927461,
"grad_norm": 0.4210398495197296,
"learning_rate": 9.660954318839933e-05,
"loss": 0.0531,
"step": 1620
},
{
"epoch": 1.055699481865285,
"grad_norm": 0.4386137127876282,
"learning_rate": 9.654943827000548e-05,
"loss": 0.0497,
"step": 1630
},
{
"epoch": 1.0621761658031088,
"grad_norm": 0.37542179226875305,
"learning_rate": 9.648882429441257e-05,
"loss": 0.0471,
"step": 1640
},
{
"epoch": 1.0686528497409327,
"grad_norm": 0.5773810744285583,
"learning_rate": 9.642770192448536e-05,
"loss": 0.0387,
"step": 1650
},
{
"epoch": 1.0751295336787565,
"grad_norm": 0.6515969038009644,
"learning_rate": 9.636607182864827e-05,
"loss": 0.0575,
"step": 1660
},
{
"epoch": 1.0816062176165804,
"grad_norm": 0.28263774514198303,
"learning_rate": 9.630393468087818e-05,
"loss": 0.0435,
"step": 1670
},
{
"epoch": 1.0880829015544042,
"grad_norm": 0.49530646204948425,
"learning_rate": 9.624129116069694e-05,
"loss": 0.0452,
"step": 1680
},
{
"epoch": 1.0945595854922279,
"grad_norm": 0.5361151695251465,
"learning_rate": 9.617814195316411e-05,
"loss": 0.0685,
"step": 1690
},
{
"epoch": 1.1010362694300517,
"grad_norm": 0.9678163528442383,
"learning_rate": 9.611448774886924e-05,
"loss": 0.048,
"step": 1700
},
{
"epoch": 1.1075129533678756,
"grad_norm": 0.3357471823692322,
"learning_rate": 9.605032924392457e-05,
"loss": 0.0492,
"step": 1710
},
{
"epoch": 1.1139896373056994,
"grad_norm": 0.7007777094841003,
"learning_rate": 9.598566713995718e-05,
"loss": 0.0489,
"step": 1720
},
{
"epoch": 1.1204663212435233,
"grad_norm": 0.2847372591495514,
"learning_rate": 9.59205021441015e-05,
"loss": 0.0519,
"step": 1730
},
{
"epoch": 1.1269430051813472,
"grad_norm": 0.41937583684921265,
"learning_rate": 9.58548349689915e-05,
"loss": 0.0533,
"step": 1740
},
{
"epoch": 1.133419689119171,
"grad_norm": 0.6489237546920776,
"learning_rate": 9.578866633275288e-05,
"loss": 0.0455,
"step": 1750
},
{
"epoch": 1.1398963730569949,
"grad_norm": 0.5252217054367065,
"learning_rate": 9.572199695899522e-05,
"loss": 0.0512,
"step": 1760
},
{
"epoch": 1.1463730569948187,
"grad_norm": 0.44682297110557556,
"learning_rate": 9.565482757680415e-05,
"loss": 0.0479,
"step": 1770
},
{
"epoch": 1.1528497409326426,
"grad_norm": 0.7223442196846008,
"learning_rate": 9.558715892073323e-05,
"loss": 0.0427,
"step": 1780
},
{
"epoch": 1.1593264248704664,
"grad_norm": 0.4472532570362091,
"learning_rate": 9.551899173079607e-05,
"loss": 0.0549,
"step": 1790
},
{
"epoch": 1.16580310880829,
"grad_norm": 0.4524969160556793,
"learning_rate": 9.545032675245813e-05,
"loss": 0.0543,
"step": 1800
},
{
"epoch": 1.172279792746114,
"grad_norm": 0.5706340670585632,
"learning_rate": 9.538116473662861e-05,
"loss": 0.0442,
"step": 1810
},
{
"epoch": 1.1787564766839378,
"grad_norm": 0.655547559261322,
"learning_rate": 9.531150643965223e-05,
"loss": 0.0355,
"step": 1820
},
{
"epoch": 1.1852331606217616,
"grad_norm": 0.3323609530925751,
"learning_rate": 9.524135262330098e-05,
"loss": 0.0508,
"step": 1830
},
{
"epoch": 1.1917098445595855,
"grad_norm": 0.5258129239082336,
"learning_rate": 9.517070405476575e-05,
"loss": 0.0491,
"step": 1840
},
{
"epoch": 1.1981865284974094,
"grad_norm": 0.4815446734428406,
"learning_rate": 9.509956150664796e-05,
"loss": 0.0492,
"step": 1850
},
{
"epoch": 1.2046632124352332,
"grad_norm": 0.3571079969406128,
"learning_rate": 9.502792575695112e-05,
"loss": 0.0478,
"step": 1860
},
{
"epoch": 1.211139896373057,
"grad_norm": 0.5302177667617798,
"learning_rate": 9.49557975890723e-05,
"loss": 0.0405,
"step": 1870
},
{
"epoch": 1.2176165803108807,
"grad_norm": 0.3246991038322449,
"learning_rate": 9.488317779179361e-05,
"loss": 0.0413,
"step": 1880
},
{
"epoch": 1.2240932642487046,
"grad_norm": 0.5374770760536194,
"learning_rate": 9.481006715927351e-05,
"loss": 0.0502,
"step": 1890
},
{
"epoch": 1.2305699481865284,
"grad_norm": 0.410244882106781,
"learning_rate": 9.473646649103818e-05,
"loss": 0.0502,
"step": 1900
},
{
"epoch": 1.2370466321243523,
"grad_norm": 0.5318041443824768,
"learning_rate": 9.46623765919727e-05,
"loss": 0.0429,
"step": 1910
},
{
"epoch": 1.2435233160621761,
"grad_norm": 0.42394232749938965,
"learning_rate": 9.458779827231237e-05,
"loss": 0.0394,
"step": 1920
},
{
"epoch": 1.25,
"grad_norm": 0.4311422109603882,
"learning_rate": 9.451273234763371e-05,
"loss": 0.0438,
"step": 1930
},
{
"epoch": 1.2564766839378239,
"grad_norm": 0.5800412893295288,
"learning_rate": 9.443717963884569e-05,
"loss": 0.0403,
"step": 1940
},
{
"epoch": 1.2629533678756477,
"grad_norm": 0.48299023509025574,
"learning_rate": 9.43611409721806e-05,
"loss": 0.0549,
"step": 1950
},
{
"epoch": 1.2694300518134716,
"grad_norm": 0.5357002019882202,
"learning_rate": 9.428461717918511e-05,
"loss": 0.0403,
"step": 1960
},
{
"epoch": 1.2759067357512954,
"grad_norm": 0.43265822529792786,
"learning_rate": 9.420760909671118e-05,
"loss": 0.0529,
"step": 1970
},
{
"epoch": 1.2823834196891193,
"grad_norm": 0.36043480038642883,
"learning_rate": 9.413011756690685e-05,
"loss": 0.0476,
"step": 1980
},
{
"epoch": 1.2888601036269431,
"grad_norm": 0.43068262934684753,
"learning_rate": 9.405214343720707e-05,
"loss": 0.0443,
"step": 1990
},
{
"epoch": 1.2953367875647668,
"grad_norm": 0.5082911849021912,
"learning_rate": 9.397368756032445e-05,
"loss": 0.0466,
"step": 2000
},
{
"epoch": 1.3018134715025906,
"grad_norm": 0.5428886413574219,
"learning_rate": 9.389475079423988e-05,
"loss": 0.0441,
"step": 2010
},
{
"epoch": 1.3082901554404145,
"grad_norm": 0.4764600098133087,
"learning_rate": 9.381533400219318e-05,
"loss": 0.041,
"step": 2020
},
{
"epoch": 1.3147668393782384,
"grad_norm": 0.40374669432640076,
"learning_rate": 9.373543805267368e-05,
"loss": 0.0446,
"step": 2030
},
{
"epoch": 1.3212435233160622,
"grad_norm": 0.31766676902770996,
"learning_rate": 9.365506381941066e-05,
"loss": 0.0416,
"step": 2040
},
{
"epoch": 1.327720207253886,
"grad_norm": 0.4188934862613678,
"learning_rate": 9.357421218136386e-05,
"loss": 0.0381,
"step": 2050
},
{
"epoch": 1.33419689119171,
"grad_norm": 0.3089483380317688,
"learning_rate": 9.349288402271388e-05,
"loss": 0.044,
"step": 2060
},
{
"epoch": 1.3406735751295336,
"grad_norm": 0.2659463882446289,
"learning_rate": 9.341108023285238e-05,
"loss": 0.0487,
"step": 2070
},
{
"epoch": 1.3471502590673574,
"grad_norm": 0.589297354221344,
"learning_rate": 9.332880170637252e-05,
"loss": 0.0472,
"step": 2080
},
{
"epoch": 1.3536269430051813,
"grad_norm": 0.39319437742233276,
"learning_rate": 9.32460493430591e-05,
"loss": 0.0377,
"step": 2090
},
{
"epoch": 1.3601036269430051,
"grad_norm": 0.3916621506214142,
"learning_rate": 9.316282404787871e-05,
"loss": 0.0404,
"step": 2100
},
{
"epoch": 1.366580310880829,
"grad_norm": 0.6271021366119385,
"learning_rate": 9.30791267309698e-05,
"loss": 0.0467,
"step": 2110
},
{
"epoch": 1.3730569948186528,
"grad_norm": 0.44995781779289246,
"learning_rate": 9.299495830763286e-05,
"loss": 0.0521,
"step": 2120
},
{
"epoch": 1.3795336787564767,
"grad_norm": 0.46163469552993774,
"learning_rate": 9.291031969832026e-05,
"loss": 0.0477,
"step": 2130
},
{
"epoch": 1.3860103626943006,
"grad_norm": 0.43207502365112305,
"learning_rate": 9.282521182862629e-05,
"loss": 0.0383,
"step": 2140
},
{
"epoch": 1.3924870466321244,
"grad_norm": 0.2974700331687927,
"learning_rate": 9.273963562927695e-05,
"loss": 0.038,
"step": 2150
},
{
"epoch": 1.3989637305699483,
"grad_norm": 0.45704299211502075,
"learning_rate": 9.265359203611987e-05,
"loss": 0.0553,
"step": 2160
},
{
"epoch": 1.4054404145077721,
"grad_norm": 0.4036669135093689,
"learning_rate": 9.256708199011401e-05,
"loss": 0.0366,
"step": 2170
},
{
"epoch": 1.411917098445596,
"grad_norm": 0.44508373737335205,
"learning_rate": 9.248010643731935e-05,
"loss": 0.0307,
"step": 2180
},
{
"epoch": 1.4183937823834196,
"grad_norm": 0.31299716234207153,
"learning_rate": 9.239266632888659e-05,
"loss": 0.0363,
"step": 2190
},
{
"epoch": 1.4248704663212435,
"grad_norm": 0.3911212086677551,
"learning_rate": 9.230476262104677e-05,
"loss": 0.0377,
"step": 2200
},
{
"epoch": 1.4313471502590673,
"grad_norm": 0.4919986128807068,
"learning_rate": 9.221639627510076e-05,
"loss": 0.0417,
"step": 2210
},
{
"epoch": 1.4378238341968912,
"grad_norm": 0.5026757717132568,
"learning_rate": 9.212756825740873e-05,
"loss": 0.0426,
"step": 2220
},
{
"epoch": 1.444300518134715,
"grad_norm": 0.4060062766075134,
"learning_rate": 9.20382795393797e-05,
"loss": 0.0518,
"step": 2230
},
{
"epoch": 1.450777202072539,
"grad_norm": 0.7231489419937134,
"learning_rate": 9.194853109746074e-05,
"loss": 0.0405,
"step": 2240
},
{
"epoch": 1.4572538860103628,
"grad_norm": 0.42704254388809204,
"learning_rate": 9.185832391312644e-05,
"loss": 0.042,
"step": 2250
},
{
"epoch": 1.4637305699481864,
"grad_norm": 0.5011823773384094,
"learning_rate": 9.176765897286813e-05,
"loss": 0.055,
"step": 2260
},
{
"epoch": 1.4702072538860103,
"grad_norm": 0.49078866839408875,
"learning_rate": 9.167653726818305e-05,
"loss": 0.0464,
"step": 2270
},
{
"epoch": 1.4766839378238341,
"grad_norm": 0.37776052951812744,
"learning_rate": 9.158495979556358e-05,
"loss": 0.0435,
"step": 2280
},
{
"epoch": 1.483160621761658,
"grad_norm": 0.3427102267742157,
"learning_rate": 9.14929275564863e-05,
"loss": 0.0433,
"step": 2290
},
{
"epoch": 1.4896373056994818,
"grad_norm": 0.3482416570186615,
"learning_rate": 9.140044155740101e-05,
"loss": 0.0398,
"step": 2300
},
{
"epoch": 1.4961139896373057,
"grad_norm": 0.36297425627708435,
"learning_rate": 9.130750280971978e-05,
"loss": 0.0412,
"step": 2310
},
{
"epoch": 1.5025906735751295,
"grad_norm": 0.6208764910697937,
"learning_rate": 9.121411232980588e-05,
"loss": 0.0494,
"step": 2320
},
{
"epoch": 1.5090673575129534,
"grad_norm": 0.3590516149997711,
"learning_rate": 9.112027113896262e-05,
"loss": 0.0495,
"step": 2330
},
{
"epoch": 1.5155440414507773,
"grad_norm": 0.4894999861717224,
"learning_rate": 9.102598026342222e-05,
"loss": 0.0384,
"step": 2340
},
{
"epoch": 1.5220207253886011,
"grad_norm": 0.34248238801956177,
"learning_rate": 9.093124073433463e-05,
"loss": 0.0417,
"step": 2350
},
{
"epoch": 1.528497409326425,
"grad_norm": 0.333857923746109,
"learning_rate": 9.083605358775612e-05,
"loss": 0.0398,
"step": 2360
},
{
"epoch": 1.5349740932642488,
"grad_norm": 0.3952425420284271,
"learning_rate": 9.074041986463808e-05,
"loss": 0.0416,
"step": 2370
},
{
"epoch": 1.5414507772020727,
"grad_norm": 0.5747725963592529,
"learning_rate": 9.064434061081562e-05,
"loss": 0.0383,
"step": 2380
},
{
"epoch": 1.5479274611398963,
"grad_norm": 0.49283695220947266,
"learning_rate": 9.0547816876996e-05,
"loss": 0.045,
"step": 2390
},
{
"epoch": 1.5544041450777202,
"grad_norm": 0.3009427487850189,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0387,
"step": 2400
},
{
"epoch": 1.560880829015544,
"grad_norm": 0.37806493043899536,
"learning_rate": 9.035344019648702e-05,
"loss": 0.0399,
"step": 2410
},
{
"epoch": 1.567357512953368,
"grad_norm": 0.425051748752594,
"learning_rate": 9.025558937546988e-05,
"loss": 0.0418,
"step": 2420
},
{
"epoch": 1.5738341968911918,
"grad_norm": 0.24157755076885223,
"learning_rate": 9.015729832577681e-05,
"loss": 0.0344,
"step": 2430
},
{
"epoch": 1.5803108808290154,
"grad_norm": 0.2987273335456848,
"learning_rate": 9.005856812230304e-05,
"loss": 0.0323,
"step": 2440
},
{
"epoch": 1.5867875647668392,
"grad_norm": 0.27894994616508484,
"learning_rate": 8.995939984474624e-05,
"loss": 0.0467,
"step": 2450
},
{
"epoch": 1.593264248704663,
"grad_norm": 0.30715203285217285,
"learning_rate": 8.98597945775948e-05,
"loss": 0.0447,
"step": 2460
},
{
"epoch": 1.599740932642487,
"grad_norm": 0.4254259467124939,
"learning_rate": 8.975975341011596e-05,
"loss": 0.0401,
"step": 2470
},
{
"epoch": 1.6062176165803108,
"grad_norm": 0.4341532289981842,
"learning_rate": 8.965927743634391e-05,
"loss": 0.0435,
"step": 2480
},
{
"epoch": 1.6126943005181347,
"grad_norm": 0.4242204427719116,
"learning_rate": 8.955836775506776e-05,
"loss": 0.0437,
"step": 2490
},
{
"epoch": 1.6191709844559585,
"grad_norm": 0.4756164252758026,
"learning_rate": 8.945702546981969e-05,
"loss": 0.0428,
"step": 2500
},
{
"epoch": 1.6256476683937824,
"grad_norm": 0.6672074794769287,
"learning_rate": 8.935525168886262e-05,
"loss": 0.0448,
"step": 2510
},
{
"epoch": 1.6321243523316062,
"grad_norm": 0.38436517119407654,
"learning_rate": 8.92530475251784e-05,
"loss": 0.0357,
"step": 2520
},
{
"epoch": 1.63860103626943,
"grad_norm": 0.5012999773025513,
"learning_rate": 8.91504140964553e-05,
"loss": 0.046,
"step": 2530
},
{
"epoch": 1.645077720207254,
"grad_norm": 0.4836990237236023,
"learning_rate": 8.90473525250761e-05,
"loss": 0.0497,
"step": 2540
},
{
"epoch": 1.6515544041450778,
"grad_norm": 0.318345844745636,
"learning_rate": 8.894386393810563e-05,
"loss": 0.0362,
"step": 2550
},
{
"epoch": 1.6580310880829017,
"grad_norm": 0.43974965810775757,
"learning_rate": 8.883994946727849e-05,
"loss": 0.0463,
"step": 2560
},
{
"epoch": 1.6645077720207255,
"grad_norm": 0.5346915125846863,
"learning_rate": 8.873561024898668e-05,
"loss": 0.0447,
"step": 2570
},
{
"epoch": 1.6709844559585494,
"grad_norm": 0.45229285955429077,
"learning_rate": 8.863084742426719e-05,
"loss": 0.0431,
"step": 2580
},
{
"epoch": 1.677461139896373,
"grad_norm": 0.41755497455596924,
"learning_rate": 8.852566213878947e-05,
"loss": 0.0321,
"step": 2590
},
{
"epoch": 1.6839378238341969,
"grad_norm": 0.25371554493904114,
"learning_rate": 8.842005554284296e-05,
"loss": 0.0462,
"step": 2600
},
{
"epoch": 1.6904145077720207,
"grad_norm": 0.40556564927101135,
"learning_rate": 8.831402879132446e-05,
"loss": 0.0377,
"step": 2610
},
{
"epoch": 1.6968911917098446,
"grad_norm": 0.3499738574028015,
"learning_rate": 8.820758304372557e-05,
"loss": 0.0371,
"step": 2620
},
{
"epoch": 1.7033678756476682,
"grad_norm": 0.4621606171131134,
"learning_rate": 8.810071946411989e-05,
"loss": 0.0339,
"step": 2630
},
{
"epoch": 1.709844559585492,
"grad_norm": 0.2640065848827362,
"learning_rate": 8.799343922115044e-05,
"loss": 0.04,
"step": 2640
},
{
"epoch": 1.716321243523316,
"grad_norm": 0.33107438683509827,
"learning_rate": 8.788574348801675e-05,
"loss": 0.0433,
"step": 2650
},
{
"epoch": 1.7227979274611398,
"grad_norm": 0.3660358786582947,
"learning_rate": 8.77776334424621e-05,
"loss": 0.0331,
"step": 2660
},
{
"epoch": 1.7292746113989637,
"grad_norm": 0.40852832794189453,
"learning_rate": 8.766911026676064e-05,
"loss": 0.0404,
"step": 2670
},
{
"epoch": 1.7357512953367875,
"grad_norm": 0.33427226543426514,
"learning_rate": 8.756017514770443e-05,
"loss": 0.037,
"step": 2680
},
{
"epoch": 1.7422279792746114,
"grad_norm": 0.2343767136335373,
"learning_rate": 8.745082927659047e-05,
"loss": 0.0334,
"step": 2690
},
{
"epoch": 1.7487046632124352,
"grad_norm": 0.3142069876194,
"learning_rate": 8.73410738492077e-05,
"loss": 0.0349,
"step": 2700
},
{
"epoch": 1.755181347150259,
"grad_norm": 0.43764960765838623,
"learning_rate": 8.723091006582389e-05,
"loss": 0.0318,
"step": 2710
},
{
"epoch": 1.761658031088083,
"grad_norm": 0.509016752243042,
"learning_rate": 8.71203391311725e-05,
"loss": 0.0462,
"step": 2720
},
{
"epoch": 1.7681347150259068,
"grad_norm": 0.29676973819732666,
"learning_rate": 8.700936225443959e-05,
"loss": 0.0277,
"step": 2730
},
{
"epoch": 1.7746113989637307,
"grad_norm": 0.3989731967449188,
"learning_rate": 8.689798064925049e-05,
"loss": 0.0285,
"step": 2740
},
{
"epoch": 1.7810880829015545,
"grad_norm": 0.3644524812698364,
"learning_rate": 8.678619553365659e-05,
"loss": 0.0372,
"step": 2750
},
{
"epoch": 1.7875647668393784,
"grad_norm": 0.4097837209701538,
"learning_rate": 8.6674008130122e-05,
"loss": 0.038,
"step": 2760
},
{
"epoch": 1.7940414507772022,
"grad_norm": 0.2878129482269287,
"learning_rate": 8.656141966551019e-05,
"loss": 0.0331,
"step": 2770
},
{
"epoch": 1.8005181347150259,
"grad_norm": 0.2862109839916229,
"learning_rate": 8.644843137107059e-05,
"loss": 0.0335,
"step": 2780
},
{
"epoch": 1.8069948186528497,
"grad_norm": 0.43468615412712097,
"learning_rate": 8.633504448242505e-05,
"loss": 0.0348,
"step": 2790
},
{
"epoch": 1.8134715025906736,
"grad_norm": 0.5068714022636414,
"learning_rate": 8.622126023955446e-05,
"loss": 0.0402,
"step": 2800
},
{
"epoch": 1.8199481865284974,
"grad_norm": 0.5587580800056458,
"learning_rate": 8.610707988678503e-05,
"loss": 0.0344,
"step": 2810
},
{
"epoch": 1.8264248704663213,
"grad_norm": 0.6059616804122925,
"learning_rate": 8.599250467277483e-05,
"loss": 0.0369,
"step": 2820
},
{
"epoch": 1.832901554404145,
"grad_norm": 0.35330525040626526,
"learning_rate": 8.587753585050004e-05,
"loss": 0.0379,
"step": 2830
},
{
"epoch": 1.8393782383419688,
"grad_norm": 0.41709282994270325,
"learning_rate": 8.576217467724128e-05,
"loss": 0.0395,
"step": 2840
},
{
"epoch": 1.8458549222797926,
"grad_norm": 0.43537795543670654,
"learning_rate": 8.564642241456986e-05,
"loss": 0.035,
"step": 2850
},
{
"epoch": 1.8523316062176165,
"grad_norm": 0.4171069264411926,
"learning_rate": 8.553028032833397e-05,
"loss": 0.0376,
"step": 2860
},
{
"epoch": 1.8588082901554404,
"grad_norm": 0.4365420341491699,
"learning_rate": 8.541374968864487e-05,
"loss": 0.0403,
"step": 2870
},
{
"epoch": 1.8652849740932642,
"grad_norm": 0.4330903887748718,
"learning_rate": 8.529683176986295e-05,
"loss": 0.0356,
"step": 2880
},
{
"epoch": 1.871761658031088,
"grad_norm": 0.28101474046707153,
"learning_rate": 8.517952785058385e-05,
"loss": 0.0401,
"step": 2890
},
{
"epoch": 1.878238341968912,
"grad_norm": 0.45936867594718933,
"learning_rate": 8.506183921362443e-05,
"loss": 0.0354,
"step": 2900
},
{
"epoch": 1.8847150259067358,
"grad_norm": 0.35999923944473267,
"learning_rate": 8.494376714600878e-05,
"loss": 0.0384,
"step": 2910
},
{
"epoch": 1.8911917098445596,
"grad_norm": 0.36796870827674866,
"learning_rate": 8.482531293895412e-05,
"loss": 0.0399,
"step": 2920
},
{
"epoch": 1.8976683937823835,
"grad_norm": 0.443658709526062,
"learning_rate": 8.470647788785665e-05,
"loss": 0.0345,
"step": 2930
},
{
"epoch": 1.9041450777202074,
"grad_norm": 0.4713057279586792,
"learning_rate": 8.458726329227747e-05,
"loss": 0.0409,
"step": 2940
},
{
"epoch": 1.9106217616580312,
"grad_norm": 0.540421724319458,
"learning_rate": 8.44676704559283e-05,
"loss": 0.0502,
"step": 2950
},
{
"epoch": 1.917098445595855,
"grad_norm": 0.4227132499217987,
"learning_rate": 8.434770068665723e-05,
"loss": 0.0405,
"step": 2960
},
{
"epoch": 1.9235751295336787,
"grad_norm": 0.40846768021583557,
"learning_rate": 8.422735529643444e-05,
"loss": 0.0412,
"step": 2970
},
{
"epoch": 1.9300518134715026,
"grad_norm": 0.3132789134979248,
"learning_rate": 8.410663560133784e-05,
"loss": 0.0286,
"step": 2980
},
{
"epoch": 1.9365284974093264,
"grad_norm": 0.3604018986225128,
"learning_rate": 8.398554292153866e-05,
"loss": 0.0414,
"step": 2990
},
{
"epoch": 1.9430051813471503,
"grad_norm": 0.6764439940452576,
"learning_rate": 8.386407858128706e-05,
"loss": 0.0541,
"step": 3000
},
{
"epoch": 1.9494818652849741,
"grad_norm": 0.3984512388706207,
"learning_rate": 8.37422439088976e-05,
"loss": 0.039,
"step": 3010
},
{
"epoch": 1.9559585492227978,
"grad_norm": 0.31599175930023193,
"learning_rate": 8.362004023673474e-05,
"loss": 0.0412,
"step": 3020
},
{
"epoch": 1.9624352331606216,
"grad_norm": 0.34263965487480164,
"learning_rate": 8.349746890119826e-05,
"loss": 0.0343,
"step": 3030
},
{
"epoch": 1.9689119170984455,
"grad_norm": 0.2634800672531128,
"learning_rate": 8.337453124270863e-05,
"loss": 0.0374,
"step": 3040
},
{
"epoch": 1.9753886010362693,
"grad_norm": 0.2721804976463318,
"learning_rate": 8.32512286056924e-05,
"loss": 0.0463,
"step": 3050
},
{
"epoch": 1.9818652849740932,
"grad_norm": 0.37321698665618896,
"learning_rate": 8.31275623385675e-05,
"loss": 0.0363,
"step": 3060
},
{
"epoch": 1.988341968911917,
"grad_norm": 0.40666913986206055,
"learning_rate": 8.300353379372834e-05,
"loss": 0.0326,
"step": 3070
},
{
"epoch": 1.994818652849741,
"grad_norm": 0.32317647337913513,
"learning_rate": 8.287914432753123e-05,
"loss": 0.0496,
"step": 3080
},
{
"epoch": 2.0012953367875648,
"grad_norm": 0.23770615458488464,
"learning_rate": 8.275439530027948e-05,
"loss": 0.0421,
"step": 3090
},
{
"epoch": 2.0077720207253886,
"grad_norm": 0.31836286187171936,
"learning_rate": 8.262928807620843e-05,
"loss": 0.0399,
"step": 3100
},
{
"epoch": 2.0142487046632125,
"grad_norm": 0.3128102719783783,
"learning_rate": 8.250382402347065e-05,
"loss": 0.0338,
"step": 3110
},
{
"epoch": 2.0207253886010363,
"grad_norm": 0.45987972617149353,
"learning_rate": 8.237800451412095e-05,
"loss": 0.035,
"step": 3120
},
{
"epoch": 2.02720207253886,
"grad_norm": 0.35997146368026733,
"learning_rate": 8.225183092410128e-05,
"loss": 0.0318,
"step": 3130
},
{
"epoch": 2.033678756476684,
"grad_norm": 0.5645427703857422,
"learning_rate": 8.212530463322583e-05,
"loss": 0.0329,
"step": 3140
},
{
"epoch": 2.040155440414508,
"grad_norm": 0.5178828239440918,
"learning_rate": 8.199842702516583e-05,
"loss": 0.0354,
"step": 3150
},
{
"epoch": 2.0466321243523318,
"grad_norm": 0.48487406969070435,
"learning_rate": 8.18711994874345e-05,
"loss": 0.0288,
"step": 3160
},
{
"epoch": 2.0531088082901556,
"grad_norm": 0.4110161066055298,
"learning_rate": 8.174362341137177e-05,
"loss": 0.0348,
"step": 3170
},
{
"epoch": 2.0595854922279795,
"grad_norm": 0.26325592398643494,
"learning_rate": 8.161570019212921e-05,
"loss": 0.0344,
"step": 3180
},
{
"epoch": 2.066062176165803,
"grad_norm": 0.2679133713245392,
"learning_rate": 8.148743122865463e-05,
"loss": 0.0263,
"step": 3190
},
{
"epoch": 2.0725388601036268,
"grad_norm": 0.33095425367355347,
"learning_rate": 8.135881792367686e-05,
"loss": 0.0349,
"step": 3200
},
{
"epoch": 2.0790155440414506,
"grad_norm": 0.3910488784313202,
"learning_rate": 8.12298616836904e-05,
"loss": 0.0313,
"step": 3210
},
{
"epoch": 2.0854922279792745,
"grad_norm": 0.378434956073761,
"learning_rate": 8.110056391894005e-05,
"loss": 0.026,
"step": 3220
},
{
"epoch": 2.0919689119170983,
"grad_norm": 0.3993297815322876,
"learning_rate": 8.097092604340542e-05,
"loss": 0.0304,
"step": 3230
},
{
"epoch": 2.098445595854922,
"grad_norm": 0.5249981880187988,
"learning_rate": 8.084094947478556e-05,
"loss": 0.0283,
"step": 3240
},
{
"epoch": 2.104922279792746,
"grad_norm": 0.2571522295475006,
"learning_rate": 8.07106356344834e-05,
"loss": 0.034,
"step": 3250
},
{
"epoch": 2.11139896373057,
"grad_norm": 0.5822983384132385,
"learning_rate": 8.057998594759022e-05,
"loss": 0.0274,
"step": 3260
},
{
"epoch": 2.1178756476683938,
"grad_norm": 0.40026816725730896,
"learning_rate": 8.044900184287007e-05,
"loss": 0.0348,
"step": 3270
},
{
"epoch": 2.1243523316062176,
"grad_norm": 0.29010850191116333,
"learning_rate": 8.031768475274413e-05,
"loss": 0.0305,
"step": 3280
},
{
"epoch": 2.1308290155440415,
"grad_norm": 0.4505520164966583,
"learning_rate": 8.018603611327504e-05,
"loss": 0.0353,
"step": 3290
},
{
"epoch": 2.1373056994818653,
"grad_norm": 0.49468860030174255,
"learning_rate": 8.005405736415126e-05,
"loss": 0.0385,
"step": 3300
},
{
"epoch": 2.143782383419689,
"grad_norm": 0.27332326769828796,
"learning_rate": 7.992174994867123e-05,
"loss": 0.0319,
"step": 3310
},
{
"epoch": 2.150259067357513,
"grad_norm": 0.30603963136672974,
"learning_rate": 7.978911531372765e-05,
"loss": 0.0351,
"step": 3320
},
{
"epoch": 2.156735751295337,
"grad_norm": 0.4821016490459442,
"learning_rate": 7.965615490979163e-05,
"loss": 0.0343,
"step": 3330
},
{
"epoch": 2.1632124352331608,
"grad_norm": 0.2607490122318268,
"learning_rate": 7.952287019089685e-05,
"loss": 0.025,
"step": 3340
},
{
"epoch": 2.1696891191709846,
"grad_norm": 0.44392070174217224,
"learning_rate": 7.938926261462366e-05,
"loss": 0.0318,
"step": 3350
},
{
"epoch": 2.1761658031088085,
"grad_norm": 0.5164742469787598,
"learning_rate": 7.925533364208309e-05,
"loss": 0.0386,
"step": 3360
},
{
"epoch": 2.1826424870466323,
"grad_norm": 0.41299721598625183,
"learning_rate": 7.912108473790092e-05,
"loss": 0.0374,
"step": 3370
},
{
"epoch": 2.1891191709844557,
"grad_norm": 0.5575032234191895,
"learning_rate": 7.898651737020166e-05,
"loss": 0.0312,
"step": 3380
},
{
"epoch": 2.1955958549222796,
"grad_norm": 0.27284616231918335,
"learning_rate": 7.88516330105925e-05,
"loss": 0.0273,
"step": 3390
},
{
"epoch": 2.2020725388601035,
"grad_norm": 0.39631810784339905,
"learning_rate": 7.871643313414718e-05,
"loss": 0.0313,
"step": 3400
},
{
"epoch": 2.2085492227979273,
"grad_norm": 0.2716298997402191,
"learning_rate": 7.858091921938988e-05,
"loss": 0.0332,
"step": 3410
},
{
"epoch": 2.215025906735751,
"grad_norm": 0.35479772090911865,
"learning_rate": 7.844509274827907e-05,
"loss": 0.0343,
"step": 3420
},
{
"epoch": 2.221502590673575,
"grad_norm": 0.3278186321258545,
"learning_rate": 7.830895520619128e-05,
"loss": 0.0295,
"step": 3430
},
{
"epoch": 2.227979274611399,
"grad_norm": 0.36920130252838135,
"learning_rate": 7.817250808190483e-05,
"loss": 0.0411,
"step": 3440
},
{
"epoch": 2.2344559585492227,
"grad_norm": 0.49877047538757324,
"learning_rate": 7.803575286758364e-05,
"loss": 0.0338,
"step": 3450
},
{
"epoch": 2.2409326424870466,
"grad_norm": 0.3202963173389435,
"learning_rate": 7.789869105876083e-05,
"loss": 0.028,
"step": 3460
},
{
"epoch": 2.2474093264248705,
"grad_norm": 0.33412453532218933,
"learning_rate": 7.776132415432234e-05,
"loss": 0.0386,
"step": 3470
},
{
"epoch": 2.2538860103626943,
"grad_norm": 0.46674206852912903,
"learning_rate": 7.762365365649067e-05,
"loss": 0.0272,
"step": 3480
},
{
"epoch": 2.260362694300518,
"grad_norm": 0.2830505073070526,
"learning_rate": 7.748568107080832e-05,
"loss": 0.0319,
"step": 3490
},
{
"epoch": 2.266839378238342,
"grad_norm": 0.4614030122756958,
"learning_rate": 7.734740790612136e-05,
"loss": 0.0376,
"step": 3500
},
{
"epoch": 2.273316062176166,
"grad_norm": 0.4553893804550171,
"learning_rate": 7.720883567456298e-05,
"loss": 0.0362,
"step": 3510
},
{
"epoch": 2.2797927461139897,
"grad_norm": 0.36776071786880493,
"learning_rate": 7.70699658915369e-05,
"loss": 0.0392,
"step": 3520
},
{
"epoch": 2.2862694300518136,
"grad_norm": 0.6267045736312866,
"learning_rate": 7.693080007570084e-05,
"loss": 0.0256,
"step": 3530
},
{
"epoch": 2.2927461139896375,
"grad_norm": 0.36080533266067505,
"learning_rate": 7.679133974894983e-05,
"loss": 0.0374,
"step": 3540
},
{
"epoch": 2.2992227979274613,
"grad_norm": 0.32267022132873535,
"learning_rate": 7.66515864363997e-05,
"loss": 0.04,
"step": 3550
},
{
"epoch": 2.305699481865285,
"grad_norm": 0.344371497631073,
"learning_rate": 7.651154166637025e-05,
"loss": 0.0338,
"step": 3560
},
{
"epoch": 2.312176165803109,
"grad_norm": 0.43632611632347107,
"learning_rate": 7.637120697036866e-05,
"loss": 0.0351,
"step": 3570
},
{
"epoch": 2.318652849740933,
"grad_norm": 0.35309430956840515,
"learning_rate": 7.623058388307269e-05,
"loss": 0.036,
"step": 3580
},
{
"epoch": 2.3251295336787563,
"grad_norm": 0.301886647939682,
"learning_rate": 7.608967394231387e-05,
"loss": 0.0319,
"step": 3590
},
{
"epoch": 2.33160621761658,
"grad_norm": 0.31548261642456055,
"learning_rate": 7.594847868906076e-05,
"loss": 0.0329,
"step": 3600
},
{
"epoch": 2.338082901554404,
"grad_norm": 0.3690776526927948,
"learning_rate": 7.580699966740201e-05,
"loss": 0.0331,
"step": 3610
},
{
"epoch": 2.344559585492228,
"grad_norm": 0.3857821226119995,
"learning_rate": 7.566523842452958e-05,
"loss": 0.0329,
"step": 3620
},
{
"epoch": 2.3510362694300517,
"grad_norm": 0.3122609555721283,
"learning_rate": 7.552319651072164e-05,
"loss": 0.0351,
"step": 3630
},
{
"epoch": 2.3575129533678756,
"grad_norm": 0.23658838868141174,
"learning_rate": 7.538087547932585e-05,
"loss": 0.0246,
"step": 3640
},
{
"epoch": 2.3639896373056994,
"grad_norm": 0.40285107493400574,
"learning_rate": 7.52382768867422e-05,
"loss": 0.0349,
"step": 3650
},
{
"epoch": 2.3704663212435233,
"grad_norm": 0.17549413442611694,
"learning_rate": 7.509540229240601e-05,
"loss": 0.0226,
"step": 3660
},
{
"epoch": 2.376943005181347,
"grad_norm": 0.3721100091934204,
"learning_rate": 7.495225325877103e-05,
"loss": 0.0302,
"step": 3670
},
{
"epoch": 2.383419689119171,
"grad_norm": 0.24715149402618408,
"learning_rate": 7.480883135129211e-05,
"loss": 0.0313,
"step": 3680
},
{
"epoch": 2.389896373056995,
"grad_norm": 0.3196186423301697,
"learning_rate": 7.466513813840825e-05,
"loss": 0.0284,
"step": 3690
},
{
"epoch": 2.3963730569948187,
"grad_norm": 0.2939792275428772,
"learning_rate": 7.452117519152542e-05,
"loss": 0.0302,
"step": 3700
},
{
"epoch": 2.4028497409326426,
"grad_norm": 0.31848374009132385,
"learning_rate": 7.437694408499933e-05,
"loss": 0.0347,
"step": 3710
},
{
"epoch": 2.4093264248704664,
"grad_norm": 0.32931575179100037,
"learning_rate": 7.423244639611826e-05,
"loss": 0.0413,
"step": 3720
},
{
"epoch": 2.4158031088082903,
"grad_norm": 0.45900219678878784,
"learning_rate": 7.408768370508576e-05,
"loss": 0.0299,
"step": 3730
},
{
"epoch": 2.422279792746114,
"grad_norm": 0.35719117522239685,
"learning_rate": 7.394265759500348e-05,
"loss": 0.0345,
"step": 3740
},
{
"epoch": 2.428756476683938,
"grad_norm": 0.25340452790260315,
"learning_rate": 7.379736965185368e-05,
"loss": 0.0301,
"step": 3750
},
{
"epoch": 2.4352331606217614,
"grad_norm": 0.2732887268066406,
"learning_rate": 7.365182146448205e-05,
"loss": 0.0258,
"step": 3760
},
{
"epoch": 2.4417098445595853,
"grad_norm": 0.41965925693511963,
"learning_rate": 7.350601462458024e-05,
"loss": 0.0258,
"step": 3770
},
{
"epoch": 2.448186528497409,
"grad_norm": 0.22055207192897797,
"learning_rate": 7.335995072666848e-05,
"loss": 0.0279,
"step": 3780
},
{
"epoch": 2.454663212435233,
"grad_norm": 0.23329317569732666,
"learning_rate": 7.32136313680782e-05,
"loss": 0.0295,
"step": 3790
},
{
"epoch": 2.461139896373057,
"grad_norm": 0.4226686358451843,
"learning_rate": 7.30670581489344e-05,
"loss": 0.0421,
"step": 3800
},
{
"epoch": 2.4676165803108807,
"grad_norm": 0.2794199585914612,
"learning_rate": 7.292023267213835e-05,
"loss": 0.0327,
"step": 3810
},
{
"epoch": 2.4740932642487046,
"grad_norm": 0.45654261112213135,
"learning_rate": 7.277315654334997e-05,
"loss": 0.0312,
"step": 3820
},
{
"epoch": 2.4805699481865284,
"grad_norm": 0.5298384428024292,
"learning_rate": 7.262583137097018e-05,
"loss": 0.0298,
"step": 3830
},
{
"epoch": 2.4870466321243523,
"grad_norm": 0.4858406186103821,
"learning_rate": 7.247825876612353e-05,
"loss": 0.0376,
"step": 3840
},
{
"epoch": 2.493523316062176,
"grad_norm": 0.40908846259117126,
"learning_rate": 7.233044034264034e-05,
"loss": 0.0289,
"step": 3850
},
{
"epoch": 2.5,
"grad_norm": 0.6489147543907166,
"learning_rate": 7.218237771703921e-05,
"loss": 0.0374,
"step": 3860
},
{
"epoch": 2.506476683937824,
"grad_norm": 0.5571417808532715,
"learning_rate": 7.203407250850928e-05,
"loss": 0.0313,
"step": 3870
},
{
"epoch": 2.5129533678756477,
"grad_norm": 0.4606262743473053,
"learning_rate": 7.188552633889259e-05,
"loss": 0.0362,
"step": 3880
},
{
"epoch": 2.5194300518134716,
"grad_norm": 0.45915576815605164,
"learning_rate": 7.173674083266624e-05,
"loss": 0.0344,
"step": 3890
},
{
"epoch": 2.5259067357512954,
"grad_norm": 0.312846839427948,
"learning_rate": 7.158771761692464e-05,
"loss": 0.0287,
"step": 3900
},
{
"epoch": 2.5323834196891193,
"grad_norm": 0.3359382748603821,
"learning_rate": 7.143845832136188e-05,
"loss": 0.0296,
"step": 3910
},
{
"epoch": 2.538860103626943,
"grad_norm": 0.33793967962265015,
"learning_rate": 7.128896457825364e-05,
"loss": 0.0249,
"step": 3920
},
{
"epoch": 2.545336787564767,
"grad_norm": 0.6286918520927429,
"learning_rate": 7.113923802243957e-05,
"loss": 0.0321,
"step": 3930
},
{
"epoch": 2.551813471502591,
"grad_norm": 0.27349719405174255,
"learning_rate": 7.09892802913053e-05,
"loss": 0.0387,
"step": 3940
},
{
"epoch": 2.5582901554404147,
"grad_norm": 0.20686767995357513,
"learning_rate": 7.083909302476453e-05,
"loss": 0.0406,
"step": 3950
},
{
"epoch": 2.5647668393782386,
"grad_norm": 0.491794615983963,
"learning_rate": 7.068867786524116e-05,
"loss": 0.0449,
"step": 3960
},
{
"epoch": 2.5712435233160624,
"grad_norm": 0.28735771775245667,
"learning_rate": 7.053803645765128e-05,
"loss": 0.0351,
"step": 3970
},
{
"epoch": 2.5777202072538863,
"grad_norm": 0.2941044270992279,
"learning_rate": 7.038717044938519e-05,
"loss": 0.0275,
"step": 3980
},
{
"epoch": 2.5841968911917097,
"grad_norm": 0.3536795973777771,
"learning_rate": 7.023608149028937e-05,
"loss": 0.0305,
"step": 3990
},
{
"epoch": 2.5906735751295336,
"grad_norm": 0.3308267593383789,
"learning_rate": 7.008477123264848e-05,
"loss": 0.0276,
"step": 4000
},
{
"epoch": 2.5971502590673574,
"grad_norm": 0.2571580410003662,
"learning_rate": 6.993324133116726e-05,
"loss": 0.0286,
"step": 4010
},
{
"epoch": 2.6036269430051813,
"grad_norm": 0.21292202174663544,
"learning_rate": 6.978149344295242e-05,
"loss": 0.0264,
"step": 4020
},
{
"epoch": 2.610103626943005,
"grad_norm": 0.27990859746932983,
"learning_rate": 6.962952922749457e-05,
"loss": 0.0327,
"step": 4030
},
{
"epoch": 2.616580310880829,
"grad_norm": 0.4610576629638672,
"learning_rate": 6.947735034665002e-05,
"loss": 0.033,
"step": 4040
},
{
"epoch": 2.623056994818653,
"grad_norm": 0.2786068618297577,
"learning_rate": 6.932495846462261e-05,
"loss": 0.0367,
"step": 4050
},
{
"epoch": 2.6295336787564767,
"grad_norm": 0.21466326713562012,
"learning_rate": 6.917235524794558e-05,
"loss": 0.0366,
"step": 4060
},
{
"epoch": 2.6360103626943006,
"grad_norm": 0.4376002252101898,
"learning_rate": 6.901954236546323e-05,
"loss": 0.0415,
"step": 4070
},
{
"epoch": 2.6424870466321244,
"grad_norm": 0.327411025762558,
"learning_rate": 6.886652148831279e-05,
"loss": 0.0307,
"step": 4080
},
{
"epoch": 2.6489637305699483,
"grad_norm": 0.32535767555236816,
"learning_rate": 6.871329428990602e-05,
"loss": 0.0327,
"step": 4090
},
{
"epoch": 2.655440414507772,
"grad_norm": 0.17626729607582092,
"learning_rate": 6.855986244591104e-05,
"loss": 0.0301,
"step": 4100
},
{
"epoch": 2.661917098445596,
"grad_norm": 0.3351651430130005,
"learning_rate": 6.840622763423391e-05,
"loss": 0.0304,
"step": 4110
},
{
"epoch": 2.66839378238342,
"grad_norm": 0.41209253668785095,
"learning_rate": 6.825239153500029e-05,
"loss": 0.0278,
"step": 4120
},
{
"epoch": 2.6748704663212433,
"grad_norm": 0.33279556035995483,
"learning_rate": 6.809835583053715e-05,
"loss": 0.0221,
"step": 4130
},
{
"epoch": 2.681347150259067,
"grad_norm": 0.3693098723888397,
"learning_rate": 6.794412220535426e-05,
"loss": 0.0312,
"step": 4140
},
{
"epoch": 2.687823834196891,
"grad_norm": 0.5105347037315369,
"learning_rate": 6.778969234612584e-05,
"loss": 0.0296,
"step": 4150
},
{
"epoch": 2.694300518134715,
"grad_norm": 0.32539427280426025,
"learning_rate": 6.763506794167208e-05,
"loss": 0.0298,
"step": 4160
},
{
"epoch": 2.7007772020725387,
"grad_norm": 0.455147922039032,
"learning_rate": 6.748025068294067e-05,
"loss": 0.0251,
"step": 4170
},
{
"epoch": 2.7072538860103625,
"grad_norm": 0.21031919121742249,
"learning_rate": 6.732524226298841e-05,
"loss": 0.0265,
"step": 4180
},
{
"epoch": 2.7137305699481864,
"grad_norm": 0.36876460909843445,
"learning_rate": 6.71700443769625e-05,
"loss": 0.0242,
"step": 4190
},
{
"epoch": 2.7202072538860103,
"grad_norm": 0.3641934096813202,
"learning_rate": 6.701465872208216e-05,
"loss": 0.0236,
"step": 4200
},
{
"epoch": 2.726683937823834,
"grad_norm": 0.3689476251602173,
"learning_rate": 6.685908699762002e-05,
"loss": 0.0261,
"step": 4210
},
{
"epoch": 2.733160621761658,
"grad_norm": 0.40867921710014343,
"learning_rate": 6.670333090488356e-05,
"loss": 0.0268,
"step": 4220
},
{
"epoch": 2.739637305699482,
"grad_norm": 0.29465481638908386,
"learning_rate": 6.654739214719641e-05,
"loss": 0.0245,
"step": 4230
},
{
"epoch": 2.7461139896373057,
"grad_norm": 0.33505406975746155,
"learning_rate": 6.639127242987988e-05,
"loss": 0.0249,
"step": 4240
},
{
"epoch": 2.7525906735751295,
"grad_norm": 0.21609602868556976,
"learning_rate": 6.623497346023418e-05,
"loss": 0.03,
"step": 4250
},
{
"epoch": 2.7590673575129534,
"grad_norm": 0.29220953583717346,
"learning_rate": 6.607849694751977e-05,
"loss": 0.0276,
"step": 4260
},
{
"epoch": 2.7655440414507773,
"grad_norm": 0.23505854606628418,
"learning_rate": 6.592184460293877e-05,
"loss": 0.0261,
"step": 4270
},
{
"epoch": 2.772020725388601,
"grad_norm": 0.2939005196094513,
"learning_rate": 6.576501813961609e-05,
"loss": 0.032,
"step": 4280
},
{
"epoch": 2.778497409326425,
"grad_norm": 0.24093054234981537,
"learning_rate": 6.56080192725808e-05,
"loss": 0.0398,
"step": 4290
},
{
"epoch": 2.784974093264249,
"grad_norm": 0.16263030469417572,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0303,
"step": 4300
},
{
"epoch": 2.7914507772020727,
"grad_norm": 0.2163333296775818,
"learning_rate": 6.529351119689688e-05,
"loss": 0.0215,
"step": 4310
},
{
"epoch": 2.7979274611398965,
"grad_norm": 0.17953196167945862,
"learning_rate": 6.513600542765817e-05,
"loss": 0.0284,
"step": 4320
},
{
"epoch": 2.8044041450777204,
"grad_norm": 0.2565161883831024,
"learning_rate": 6.497833413348909e-05,
"loss": 0.0275,
"step": 4330
},
{
"epoch": 2.8108808290155443,
"grad_norm": 0.30938777327537537,
"learning_rate": 6.48204990386577e-05,
"loss": 0.0232,
"step": 4340
},
{
"epoch": 2.817357512953368,
"grad_norm": 0.4033896028995514,
"learning_rate": 6.466250186922325e-05,
"loss": 0.0245,
"step": 4350
},
{
"epoch": 2.823834196891192,
"grad_norm": 0.3601841628551483,
"learning_rate": 6.450434435301751e-05,
"loss": 0.0288,
"step": 4360
},
{
"epoch": 2.8303108808290154,
"grad_norm": 0.4404318332672119,
"learning_rate": 6.43460282196257e-05,
"loss": 0.032,
"step": 4370
},
{
"epoch": 2.8367875647668392,
"grad_norm": 0.37518399953842163,
"learning_rate": 6.418755520036775e-05,
"loss": 0.0301,
"step": 4380
},
{
"epoch": 2.843264248704663,
"grad_norm": 0.27603596448898315,
"learning_rate": 6.402892702827916e-05,
"loss": 0.023,
"step": 4390
},
{
"epoch": 2.849740932642487,
"grad_norm": 0.36747151613235474,
"learning_rate": 6.387014543809223e-05,
"loss": 0.0301,
"step": 4400
},
{
"epoch": 2.856217616580311,
"grad_norm": 0.28517892956733704,
"learning_rate": 6.371121216621698e-05,
"loss": 0.0239,
"step": 4410
},
{
"epoch": 2.8626943005181347,
"grad_norm": 0.5214961171150208,
"learning_rate": 6.355212895072223e-05,
"loss": 0.0268,
"step": 4420
},
{
"epoch": 2.8691709844559585,
"grad_norm": 0.35717862844467163,
"learning_rate": 6.339289753131649e-05,
"loss": 0.0253,
"step": 4430
},
{
"epoch": 2.8756476683937824,
"grad_norm": 0.23553046584129333,
"learning_rate": 6.323351964932908e-05,
"loss": 0.0291,
"step": 4440
},
{
"epoch": 2.8821243523316062,
"grad_norm": 0.3337346613407135,
"learning_rate": 6.307399704769099e-05,
"loss": 0.0265,
"step": 4450
},
{
"epoch": 2.88860103626943,
"grad_norm": 0.31009089946746826,
"learning_rate": 6.291433147091583e-05,
"loss": 0.022,
"step": 4460
},
{
"epoch": 2.895077720207254,
"grad_norm": 0.3828548789024353,
"learning_rate": 6.275452466508077e-05,
"loss": 0.0279,
"step": 4470
},
{
"epoch": 2.901554404145078,
"grad_norm": 0.28669974207878113,
"learning_rate": 6.259457837780742e-05,
"loss": 0.0225,
"step": 4480
},
{
"epoch": 2.9080310880829017,
"grad_norm": 0.31521838903427124,
"learning_rate": 6.243449435824276e-05,
"loss": 0.0255,
"step": 4490
},
{
"epoch": 2.9145077720207255,
"grad_norm": 0.308748334646225,
"learning_rate": 6.227427435703997e-05,
"loss": 0.0368,
"step": 4500
},
{
"epoch": 2.9209844559585494,
"grad_norm": 0.3998130261898041,
"learning_rate": 6.211392012633932e-05,
"loss": 0.0224,
"step": 4510
},
{
"epoch": 2.927461139896373,
"grad_norm": 0.440463662147522,
"learning_rate": 6.195343341974899e-05,
"loss": 0.0344,
"step": 4520
},
{
"epoch": 2.9339378238341967,
"grad_norm": 0.3381267786026001,
"learning_rate": 6.179281599232591e-05,
"loss": 0.0276,
"step": 4530
},
{
"epoch": 2.9404145077720205,
"grad_norm": 0.2546345293521881,
"learning_rate": 6.163206960055651e-05,
"loss": 0.021,
"step": 4540
},
{
"epoch": 2.9468911917098444,
"grad_norm": 0.49094951152801514,
"learning_rate": 6.147119600233758e-05,
"loss": 0.0307,
"step": 4550
},
{
"epoch": 2.9533678756476682,
"grad_norm": 0.506216824054718,
"learning_rate": 6.131019695695702e-05,
"loss": 0.0281,
"step": 4560
},
{
"epoch": 2.959844559585492,
"grad_norm": 0.33828750252723694,
"learning_rate": 6.11490742250746e-05,
"loss": 0.0285,
"step": 4570
},
{
"epoch": 2.966321243523316,
"grad_norm": 0.43117380142211914,
"learning_rate": 6.0987829568702656e-05,
"loss": 0.0275,
"step": 4580
},
{
"epoch": 2.97279792746114,
"grad_norm": 0.32091742753982544,
"learning_rate": 6.0826464751186994e-05,
"loss": 0.0235,
"step": 4590
},
{
"epoch": 2.9792746113989637,
"grad_norm": 0.3023090958595276,
"learning_rate": 6.066498153718735e-05,
"loss": 0.0222,
"step": 4600
},
{
"epoch": 2.9857512953367875,
"grad_norm": 0.3170013725757599,
"learning_rate": 6.05033816926583e-05,
"loss": 0.0357,
"step": 4610
},
{
"epoch": 2.9922279792746114,
"grad_norm": 0.39176446199417114,
"learning_rate": 6.034166698482984e-05,
"loss": 0.0222,
"step": 4620
},
{
"epoch": 2.9987046632124352,
"grad_norm": 0.3850434422492981,
"learning_rate": 6.017983918218812e-05,
"loss": 0.0268,
"step": 4630
},
{
"epoch": 3.005181347150259,
"grad_norm": 0.37918686866760254,
"learning_rate": 6.001790005445607e-05,
"loss": 0.0195,
"step": 4640
},
{
"epoch": 3.011658031088083,
"grad_norm": 0.21127471327781677,
"learning_rate": 5.985585137257401e-05,
"loss": 0.0311,
"step": 4650
},
{
"epoch": 3.018134715025907,
"grad_norm": 0.3583742082118988,
"learning_rate": 5.969369490868042e-05,
"loss": 0.0314,
"step": 4660
},
{
"epoch": 3.0246113989637307,
"grad_norm": 0.43034079670906067,
"learning_rate": 5.953143243609235e-05,
"loss": 0.0289,
"step": 4670
},
{
"epoch": 3.0310880829015545,
"grad_norm": 0.20105870068073273,
"learning_rate": 5.9369065729286245e-05,
"loss": 0.0239,
"step": 4680
},
{
"epoch": 3.0375647668393784,
"grad_norm": 0.4128076732158661,
"learning_rate": 5.9206596563878357e-05,
"loss": 0.0315,
"step": 4690
},
{
"epoch": 3.0440414507772022,
"grad_norm": 0.3665732145309448,
"learning_rate": 5.90440267166055e-05,
"loss": 0.0254,
"step": 4700
},
{
"epoch": 3.050518134715026,
"grad_norm": 0.3067231774330139,
"learning_rate": 5.888135796530544e-05,
"loss": 0.0287,
"step": 4710
},
{
"epoch": 3.05699481865285,
"grad_norm": 0.3856498897075653,
"learning_rate": 5.871859208889759e-05,
"loss": 0.0221,
"step": 4720
},
{
"epoch": 3.063471502590674,
"grad_norm": 0.31735512614250183,
"learning_rate": 5.85557308673635e-05,
"loss": 0.0266,
"step": 4730
},
{
"epoch": 3.069948186528497,
"grad_norm": 0.4213178753852844,
"learning_rate": 5.8392776081727385e-05,
"loss": 0.0279,
"step": 4740
},
{
"epoch": 3.076424870466321,
"grad_norm": 0.33281782269477844,
"learning_rate": 5.8229729514036705e-05,
"loss": 0.0196,
"step": 4750
},
{
"epoch": 3.082901554404145,
"grad_norm": 0.4126596748828888,
"learning_rate": 5.8066592947342555e-05,
"loss": 0.0202,
"step": 4760
},
{
"epoch": 3.089378238341969,
"grad_norm": 0.2282947152853012,
"learning_rate": 5.7903368165680327e-05,
"loss": 0.0226,
"step": 4770
},
{
"epoch": 3.0958549222797926,
"grad_norm": 0.44935131072998047,
"learning_rate": 5.7740056954050084e-05,
"loss": 0.0303,
"step": 4780
},
{
"epoch": 3.1023316062176165,
"grad_norm": 0.37293264269828796,
"learning_rate": 5.757666109839702e-05,
"loss": 0.0353,
"step": 4790
},
{
"epoch": 3.1088082901554404,
"grad_norm": 0.20915338397026062,
"learning_rate": 5.74131823855921e-05,
"loss": 0.0267,
"step": 4800
},
{
"epoch": 3.115284974093264,
"grad_norm": 0.26994702219963074,
"learning_rate": 5.72496226034123e-05,
"loss": 0.0247,
"step": 4810
},
{
"epoch": 3.121761658031088,
"grad_norm": 0.3534125089645386,
"learning_rate": 5.7085983540521216e-05,
"loss": 0.0231,
"step": 4820
},
{
"epoch": 3.128238341968912,
"grad_norm": 0.3730353116989136,
"learning_rate": 5.692226698644938e-05,
"loss": 0.0241,
"step": 4830
},
{
"epoch": 3.134715025906736,
"grad_norm": 0.25073689222335815,
"learning_rate": 5.675847473157485e-05,
"loss": 0.0228,
"step": 4840
},
{
"epoch": 3.1411917098445596,
"grad_norm": 0.23234787583351135,
"learning_rate": 5.6594608567103456e-05,
"loss": 0.0165,
"step": 4850
},
{
"epoch": 3.1476683937823835,
"grad_norm": 0.2170400619506836,
"learning_rate": 5.6430670285049314e-05,
"loss": 0.0188,
"step": 4860
},
{
"epoch": 3.1541450777202074,
"grad_norm": 0.38324370980262756,
"learning_rate": 5.6266661678215216e-05,
"loss": 0.0313,
"step": 4870
},
{
"epoch": 3.160621761658031,
"grad_norm": 0.3521007001399994,
"learning_rate": 5.6102584540173006e-05,
"loss": 0.0239,
"step": 4880
},
{
"epoch": 3.167098445595855,
"grad_norm": 0.2577424943447113,
"learning_rate": 5.5938440665244006e-05,
"loss": 0.0212,
"step": 4890
},
{
"epoch": 3.173575129533679,
"grad_norm": 0.36752060055732727,
"learning_rate": 5.577423184847932e-05,
"loss": 0.0226,
"step": 4900
},
{
"epoch": 3.180051813471503,
"grad_norm": 0.3478221595287323,
"learning_rate": 5.560995988564023e-05,
"loss": 0.024,
"step": 4910
},
{
"epoch": 3.186528497409326,
"grad_norm": 0.31752872467041016,
"learning_rate": 5.544562657317863e-05,
"loss": 0.0301,
"step": 4920
},
{
"epoch": 3.19300518134715,
"grad_norm": 0.22945484519004822,
"learning_rate": 5.52812337082173e-05,
"loss": 0.0319,
"step": 4930
},
{
"epoch": 3.199481865284974,
"grad_norm": 0.3376820385456085,
"learning_rate": 5.511678308853026e-05,
"loss": 0.0229,
"step": 4940
},
{
"epoch": 3.2059585492227978,
"grad_norm": 0.4235116243362427,
"learning_rate": 5.495227651252315e-05,
"loss": 0.0223,
"step": 4950
},
{
"epoch": 3.2124352331606216,
"grad_norm": 0.3130274713039398,
"learning_rate": 5.478771577921351e-05,
"loss": 0.0302,
"step": 4960
},
{
"epoch": 3.2189119170984455,
"grad_norm": 0.27809765934944153,
"learning_rate": 5.462310268821118e-05,
"loss": 0.0223,
"step": 4970
},
{
"epoch": 3.2253886010362693,
"grad_norm": 0.2564016580581665,
"learning_rate": 5.445843903969854e-05,
"loss": 0.0232,
"step": 4980
},
{
"epoch": 3.231865284974093,
"grad_norm": 0.28008344769477844,
"learning_rate": 5.4293726634410855e-05,
"loss": 0.0211,
"step": 4990
},
{
"epoch": 3.238341968911917,
"grad_norm": 0.19657611846923828,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.0251,
"step": 5000
},
{
"epoch": 3.244818652849741,
"grad_norm": 0.2586984634399414,
"learning_rate": 5.396416275909779e-05,
"loss": 0.0249,
"step": 5010
},
{
"epoch": 3.2512953367875648,
"grad_norm": 0.474925696849823,
"learning_rate": 5.379931489313016e-05,
"loss": 0.0205,
"step": 5020
},
{
"epoch": 3.2577720207253886,
"grad_norm": 0.4486338198184967,
"learning_rate": 5.363442547846356e-05,
"loss": 0.0274,
"step": 5030
},
{
"epoch": 3.2642487046632125,
"grad_norm": 0.23376742005348206,
"learning_rate": 5.3469496318302204e-05,
"loss": 0.0199,
"step": 5040
},
{
"epoch": 3.2707253886010363,
"grad_norm": 0.19954350590705872,
"learning_rate": 5.330452921628497e-05,
"loss": 0.0203,
"step": 5050
},
{
"epoch": 3.27720207253886,
"grad_norm": 0.2920505404472351,
"learning_rate": 5.313952597646568e-05,
"loss": 0.0211,
"step": 5060
},
{
"epoch": 3.283678756476684,
"grad_norm": 0.4120335578918457,
"learning_rate": 5.297448840329329e-05,
"loss": 0.026,
"step": 5070
},
{
"epoch": 3.290155440414508,
"grad_norm": 0.3703533411026001,
"learning_rate": 5.280941830159227e-05,
"loss": 0.0249,
"step": 5080
},
{
"epoch": 3.2966321243523318,
"grad_norm": 0.342776894569397,
"learning_rate": 5.264431747654284e-05,
"loss": 0.0255,
"step": 5090
},
{
"epoch": 3.3031088082901556,
"grad_norm": 0.42008474469184875,
"learning_rate": 5.247918773366112e-05,
"loss": 0.0381,
"step": 5100
},
{
"epoch": 3.3095854922279795,
"grad_norm": 0.3160938620567322,
"learning_rate": 5.231403087877955e-05,
"loss": 0.0235,
"step": 5110
},
{
"epoch": 3.3160621761658033,
"grad_norm": 0.2879834473133087,
"learning_rate": 5.214884871802703e-05,
"loss": 0.0253,
"step": 5120
},
{
"epoch": 3.3225388601036268,
"grad_norm": 0.43620437383651733,
"learning_rate": 5.198364305780922e-05,
"loss": 0.0222,
"step": 5130
},
{
"epoch": 3.3290155440414506,
"grad_norm": 0.3343920111656189,
"learning_rate": 5.1818415704788725e-05,
"loss": 0.0233,
"step": 5140
},
{
"epoch": 3.3354922279792745,
"grad_norm": 0.2971336543560028,
"learning_rate": 5.165316846586541e-05,
"loss": 0.03,
"step": 5150
},
{
"epoch": 3.3419689119170983,
"grad_norm": 0.28571948409080505,
"learning_rate": 5.148790314815663e-05,
"loss": 0.021,
"step": 5160
},
{
"epoch": 3.348445595854922,
"grad_norm": 0.25450941920280457,
"learning_rate": 5.132262155897739e-05,
"loss": 0.0251,
"step": 5170
},
{
"epoch": 3.354922279792746,
"grad_norm": 0.3214353322982788,
"learning_rate": 5.1157325505820694e-05,
"loss": 0.0312,
"step": 5180
},
{
"epoch": 3.36139896373057,
"grad_norm": 0.20665358006954193,
"learning_rate": 5.0992016796337686e-05,
"loss": 0.0186,
"step": 5190
},
{
"epoch": 3.3678756476683938,
"grad_norm": 0.40207940340042114,
"learning_rate": 5.0826697238317935e-05,
"loss": 0.0267,
"step": 5200
},
{
"epoch": 3.3743523316062176,
"grad_norm": 0.3291640281677246,
"learning_rate": 5.066136863966963e-05,
"loss": 0.0237,
"step": 5210
},
{
"epoch": 3.3808290155440415,
"grad_norm": 0.2767675817012787,
"learning_rate": 5.0496032808399815e-05,
"loss": 0.0211,
"step": 5220
},
{
"epoch": 3.3873056994818653,
"grad_norm": 0.20625481009483337,
"learning_rate": 5.033069155259471e-05,
"loss": 0.0306,
"step": 5230
},
{
"epoch": 3.393782383419689,
"grad_norm": 0.39488154649734497,
"learning_rate": 5.016534668039976e-05,
"loss": 0.0219,
"step": 5240
},
{
"epoch": 3.400259067357513,
"grad_norm": 0.20591653883457184,
"learning_rate": 5e-05,
"loss": 0.0179,
"step": 5250
},
{
"epoch": 3.406735751295337,
"grad_norm": 0.26007452607154846,
"learning_rate": 4.9834653319600246e-05,
"loss": 0.0249,
"step": 5260
},
{
"epoch": 3.4132124352331608,
"grad_norm": 0.26599636673927307,
"learning_rate": 4.96693084474053e-05,
"loss": 0.0206,
"step": 5270
},
{
"epoch": 3.4196891191709846,
"grad_norm": 0.13836756348609924,
"learning_rate": 4.950396719160018e-05,
"loss": 0.0168,
"step": 5280
},
{
"epoch": 3.4261658031088085,
"grad_norm": 0.2556383013725281,
"learning_rate": 4.93386313603304e-05,
"loss": 0.0159,
"step": 5290
},
{
"epoch": 3.432642487046632,
"grad_norm": 0.23129259049892426,
"learning_rate": 4.917330276168208e-05,
"loss": 0.0231,
"step": 5300
},
{
"epoch": 3.4391191709844557,
"grad_norm": 0.2568337619304657,
"learning_rate": 4.9007983203662326e-05,
"loss": 0.0398,
"step": 5310
},
{
"epoch": 3.4455958549222796,
"grad_norm": 0.31978926062583923,
"learning_rate": 4.884267449417931e-05,
"loss": 0.0227,
"step": 5320
},
{
"epoch": 3.4520725388601035,
"grad_norm": 0.3868251442909241,
"learning_rate": 4.867737844102261e-05,
"loss": 0.0331,
"step": 5330
},
{
"epoch": 3.4585492227979273,
"grad_norm": 0.28642046451568604,
"learning_rate": 4.851209685184338e-05,
"loss": 0.0249,
"step": 5340
},
{
"epoch": 3.465025906735751,
"grad_norm": 0.3905954360961914,
"learning_rate": 4.834683153413459e-05,
"loss": 0.0211,
"step": 5350
},
{
"epoch": 3.471502590673575,
"grad_norm": 0.2978500425815582,
"learning_rate": 4.818158429521129e-05,
"loss": 0.0321,
"step": 5360
},
{
"epoch": 3.477979274611399,
"grad_norm": 0.40893664956092834,
"learning_rate": 4.801635694219079e-05,
"loss": 0.0187,
"step": 5370
},
{
"epoch": 3.4844559585492227,
"grad_norm": 0.33391204476356506,
"learning_rate": 4.785115128197298e-05,
"loss": 0.0272,
"step": 5380
},
{
"epoch": 3.4909326424870466,
"grad_norm": 0.4223330616950989,
"learning_rate": 4.7685969121220456e-05,
"loss": 0.0247,
"step": 5390
},
{
"epoch": 3.4974093264248705,
"grad_norm": 0.23924241960048676,
"learning_rate": 4.7520812266338885e-05,
"loss": 0.0256,
"step": 5400
},
{
"epoch": 3.5038860103626943,
"grad_norm": 0.21827708184719086,
"learning_rate": 4.735568252345718e-05,
"loss": 0.0241,
"step": 5410
},
{
"epoch": 3.510362694300518,
"grad_norm": 0.36227914690971375,
"learning_rate": 4.7190581698407725e-05,
"loss": 0.0276,
"step": 5420
},
{
"epoch": 3.516839378238342,
"grad_norm": 0.2625763416290283,
"learning_rate": 4.702551159670672e-05,
"loss": 0.0239,
"step": 5430
},
{
"epoch": 3.523316062176166,
"grad_norm": 0.2994100749492645,
"learning_rate": 4.6860474023534335e-05,
"loss": 0.0245,
"step": 5440
},
{
"epoch": 3.5297927461139897,
"grad_norm": 0.31954896450042725,
"learning_rate": 4.669547078371504e-05,
"loss": 0.023,
"step": 5450
},
{
"epoch": 3.5362694300518136,
"grad_norm": 0.22966216504573822,
"learning_rate": 4.65305036816978e-05,
"loss": 0.0181,
"step": 5460
},
{
"epoch": 3.5427461139896375,
"grad_norm": 0.35159385204315186,
"learning_rate": 4.6365574521536445e-05,
"loss": 0.0242,
"step": 5470
},
{
"epoch": 3.5492227979274613,
"grad_norm": 0.2604275047779083,
"learning_rate": 4.620068510686985e-05,
"loss": 0.021,
"step": 5480
},
{
"epoch": 3.555699481865285,
"grad_norm": 0.27099111676216125,
"learning_rate": 4.60358372409022e-05,
"loss": 0.0187,
"step": 5490
},
{
"epoch": 3.562176165803109,
"grad_norm": 0.36249786615371704,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.0174,
"step": 5500
},
{
"epoch": 3.568652849740933,
"grad_norm": 0.20331737399101257,
"learning_rate": 4.570627336558915e-05,
"loss": 0.0215,
"step": 5510
},
{
"epoch": 3.5751295336787567,
"grad_norm": 0.1920534372329712,
"learning_rate": 4.554156096030149e-05,
"loss": 0.0199,
"step": 5520
},
{
"epoch": 3.58160621761658,
"grad_norm": 0.24783368408679962,
"learning_rate": 4.537689731178883e-05,
"loss": 0.0177,
"step": 5530
},
{
"epoch": 3.588082901554404,
"grad_norm": 0.41117021441459656,
"learning_rate": 4.5212284220786494e-05,
"loss": 0.0217,
"step": 5540
},
{
"epoch": 3.594559585492228,
"grad_norm": 0.28663378953933716,
"learning_rate": 4.504772348747687e-05,
"loss": 0.0195,
"step": 5550
},
{
"epoch": 3.6010362694300517,
"grad_norm": 0.1578565090894699,
"learning_rate": 4.488321691146975e-05,
"loss": 0.0245,
"step": 5560
},
{
"epoch": 3.6075129533678756,
"grad_norm": 0.2837083637714386,
"learning_rate": 4.471876629178273e-05,
"loss": 0.0244,
"step": 5570
},
{
"epoch": 3.6139896373056994,
"grad_norm": 0.41021156311035156,
"learning_rate": 4.4554373426821374e-05,
"loss": 0.0176,
"step": 5580
},
{
"epoch": 3.6204663212435233,
"grad_norm": 0.459164023399353,
"learning_rate": 4.439004011435979e-05,
"loss": 0.022,
"step": 5590
},
{
"epoch": 3.626943005181347,
"grad_norm": 0.3070141077041626,
"learning_rate": 4.4225768151520694e-05,
"loss": 0.0207,
"step": 5600
},
{
"epoch": 3.633419689119171,
"grad_norm": 0.2605834901332855,
"learning_rate": 4.406155933475599e-05,
"loss": 0.0201,
"step": 5610
},
{
"epoch": 3.639896373056995,
"grad_norm": 0.33735018968582153,
"learning_rate": 4.3897415459827e-05,
"loss": 0.0235,
"step": 5620
},
{
"epoch": 3.6463730569948187,
"grad_norm": 0.1962365359067917,
"learning_rate": 4.373333832178478e-05,
"loss": 0.0143,
"step": 5630
},
{
"epoch": 3.6528497409326426,
"grad_norm": 0.21969126164913177,
"learning_rate": 4.3569329714950704e-05,
"loss": 0.0201,
"step": 5640
},
{
"epoch": 3.6593264248704664,
"grad_norm": 0.255919486284256,
"learning_rate": 4.3405391432896555e-05,
"loss": 0.0237,
"step": 5650
},
{
"epoch": 3.6658031088082903,
"grad_norm": 0.21406243741512299,
"learning_rate": 4.324152526842517e-05,
"loss": 0.0213,
"step": 5660
},
{
"epoch": 3.6722797927461137,
"grad_norm": 0.18419137597084045,
"learning_rate": 4.307773301355062e-05,
"loss": 0.0192,
"step": 5670
},
{
"epoch": 3.6787564766839376,
"grad_norm": 0.28473156690597534,
"learning_rate": 4.291401645947879e-05,
"loss": 0.0305,
"step": 5680
},
{
"epoch": 3.6852331606217614,
"grad_norm": 0.16764795780181885,
"learning_rate": 4.275037739658771e-05,
"loss": 0.0162,
"step": 5690
},
{
"epoch": 3.6917098445595853,
"grad_norm": 0.1837482899427414,
"learning_rate": 4.2586817614407895e-05,
"loss": 0.0218,
"step": 5700
},
{
"epoch": 3.698186528497409,
"grad_norm": 0.3757379651069641,
"learning_rate": 4.2423338901602985e-05,
"loss": 0.0295,
"step": 5710
},
{
"epoch": 3.704663212435233,
"grad_norm": 0.284368097782135,
"learning_rate": 4.2259943045949934e-05,
"loss": 0.0203,
"step": 5720
},
{
"epoch": 3.711139896373057,
"grad_norm": 0.3066781759262085,
"learning_rate": 4.209663183431969e-05,
"loss": 0.0198,
"step": 5730
},
{
"epoch": 3.7176165803108807,
"grad_norm": 0.26480358839035034,
"learning_rate": 4.1933407052657456e-05,
"loss": 0.0204,
"step": 5740
},
{
"epoch": 3.7240932642487046,
"grad_norm": 0.3328205347061157,
"learning_rate": 4.17702704859633e-05,
"loss": 0.0213,
"step": 5750
},
{
"epoch": 3.7305699481865284,
"grad_norm": 0.3852354884147644,
"learning_rate": 4.160722391827262e-05,
"loss": 0.0206,
"step": 5760
},
{
"epoch": 3.7370466321243523,
"grad_norm": 0.1848895400762558,
"learning_rate": 4.14442691326365e-05,
"loss": 0.0156,
"step": 5770
},
{
"epoch": 3.743523316062176,
"grad_norm": 0.4053875207901001,
"learning_rate": 4.1281407911102425e-05,
"loss": 0.0242,
"step": 5780
},
{
"epoch": 3.75,
"grad_norm": 0.21550576388835907,
"learning_rate": 4.111864203469457e-05,
"loss": 0.0179,
"step": 5790
},
{
"epoch": 3.756476683937824,
"grad_norm": 0.3576870262622833,
"learning_rate": 4.095597328339452e-05,
"loss": 0.0233,
"step": 5800
},
{
"epoch": 3.7629533678756477,
"grad_norm": 0.5374760627746582,
"learning_rate": 4.079340343612165e-05,
"loss": 0.0249,
"step": 5810
},
{
"epoch": 3.7694300518134716,
"grad_norm": 0.2569693922996521,
"learning_rate": 4.063093427071376e-05,
"loss": 0.029,
"step": 5820
},
{
"epoch": 3.7759067357512954,
"grad_norm": 0.2154322862625122,
"learning_rate": 4.046856756390767e-05,
"loss": 0.0232,
"step": 5830
},
{
"epoch": 3.7823834196891193,
"grad_norm": 0.22250936925411224,
"learning_rate": 4.0306305091319595e-05,
"loss": 0.019,
"step": 5840
},
{
"epoch": 3.788860103626943,
"grad_norm": 0.29183244705200195,
"learning_rate": 4.0144148627425993e-05,
"loss": 0.0203,
"step": 5850
},
{
"epoch": 3.795336787564767,
"grad_norm": 0.3605186939239502,
"learning_rate": 3.9982099945543945e-05,
"loss": 0.0202,
"step": 5860
},
{
"epoch": 3.801813471502591,
"grad_norm": 0.19650620222091675,
"learning_rate": 3.982016081781189e-05,
"loss": 0.0169,
"step": 5870
},
{
"epoch": 3.8082901554404147,
"grad_norm": 0.289876788854599,
"learning_rate": 3.965833301517017e-05,
"loss": 0.0272,
"step": 5880
},
{
"epoch": 3.8147668393782386,
"grad_norm": 0.291120320558548,
"learning_rate": 3.949661830734172e-05,
"loss": 0.0159,
"step": 5890
},
{
"epoch": 3.8212435233160624,
"grad_norm": 0.32480406761169434,
"learning_rate": 3.933501846281267e-05,
"loss": 0.0191,
"step": 5900
},
{
"epoch": 3.8277202072538863,
"grad_norm": 0.449545681476593,
"learning_rate": 3.917353524881302e-05,
"loss": 0.0297,
"step": 5910
},
{
"epoch": 3.8341968911917097,
"grad_norm": 0.26695263385772705,
"learning_rate": 3.901217043129735e-05,
"loss": 0.0255,
"step": 5920
},
{
"epoch": 3.8406735751295336,
"grad_norm": 0.4865301549434662,
"learning_rate": 3.8850925774925425e-05,
"loss": 0.0219,
"step": 5930
},
{
"epoch": 3.8471502590673574,
"grad_norm": 0.31623756885528564,
"learning_rate": 3.8689803043043e-05,
"loss": 0.0226,
"step": 5940
},
{
"epoch": 3.8536269430051813,
"grad_norm": 0.19429579377174377,
"learning_rate": 3.852880399766243e-05,
"loss": 0.0173,
"step": 5950
},
{
"epoch": 3.860103626943005,
"grad_norm": 0.24352151155471802,
"learning_rate": 3.836793039944349e-05,
"loss": 0.0211,
"step": 5960
},
{
"epoch": 3.866580310880829,
"grad_norm": 0.24858231842517853,
"learning_rate": 3.820718400767409e-05,
"loss": 0.02,
"step": 5970
},
{
"epoch": 3.873056994818653,
"grad_norm": 0.22467108070850372,
"learning_rate": 3.8046566580251e-05,
"loss": 0.0145,
"step": 5980
},
{
"epoch": 3.8795336787564767,
"grad_norm": 0.23540061712265015,
"learning_rate": 3.788607987366069e-05,
"loss": 0.0191,
"step": 5990
},
{
"epoch": 3.8860103626943006,
"grad_norm": 0.22473685443401337,
"learning_rate": 3.772572564296005e-05,
"loss": 0.0221,
"step": 6000
},
{
"epoch": 3.8924870466321244,
"grad_norm": 0.40158477425575256,
"learning_rate": 3.756550564175727e-05,
"loss": 0.0235,
"step": 6010
},
{
"epoch": 3.8989637305699483,
"grad_norm": 0.3086557686328888,
"learning_rate": 3.74054216221926e-05,
"loss": 0.0166,
"step": 6020
},
{
"epoch": 3.905440414507772,
"grad_norm": 0.3648836612701416,
"learning_rate": 3.7245475334919246e-05,
"loss": 0.0264,
"step": 6030
},
{
"epoch": 3.911917098445596,
"grad_norm": 0.12361598759889603,
"learning_rate": 3.7085668529084184e-05,
"loss": 0.026,
"step": 6040
},
{
"epoch": 3.91839378238342,
"grad_norm": 0.25430572032928467,
"learning_rate": 3.6926002952309016e-05,
"loss": 0.0198,
"step": 6050
},
{
"epoch": 3.9248704663212433,
"grad_norm": 0.2055133730173111,
"learning_rate": 3.676648035067093e-05,
"loss": 0.0211,
"step": 6060
},
{
"epoch": 3.931347150259067,
"grad_norm": 0.20963652431964874,
"learning_rate": 3.6607102468683526e-05,
"loss": 0.0213,
"step": 6070
},
{
"epoch": 3.937823834196891,
"grad_norm": 0.2894296646118164,
"learning_rate": 3.6447871049277796e-05,
"loss": 0.016,
"step": 6080
},
{
"epoch": 3.944300518134715,
"grad_norm": 0.31744205951690674,
"learning_rate": 3.628878783378302e-05,
"loss": 0.0177,
"step": 6090
},
{
"epoch": 3.9507772020725387,
"grad_norm": 0.2509874105453491,
"learning_rate": 3.612985456190778e-05,
"loss": 0.0282,
"step": 6100
},
{
"epoch": 3.9572538860103625,
"grad_norm": 0.22488081455230713,
"learning_rate": 3.597107297172084e-05,
"loss": 0.0259,
"step": 6110
},
{
"epoch": 3.9637305699481864,
"grad_norm": 0.255126953125,
"learning_rate": 3.581244479963225e-05,
"loss": 0.0209,
"step": 6120
},
{
"epoch": 3.9702072538860103,
"grad_norm": 0.2088916003704071,
"learning_rate": 3.5653971780374295e-05,
"loss": 0.0233,
"step": 6130
},
{
"epoch": 3.976683937823834,
"grad_norm": 0.24006441235542297,
"learning_rate": 3.5495655646982505e-05,
"loss": 0.025,
"step": 6140
},
{
"epoch": 3.983160621761658,
"grad_norm": 0.25505682826042175,
"learning_rate": 3.533749813077677e-05,
"loss": 0.0187,
"step": 6150
},
{
"epoch": 3.989637305699482,
"grad_norm": 0.3427753746509552,
"learning_rate": 3.517950096134232e-05,
"loss": 0.02,
"step": 6160
},
{
"epoch": 3.9961139896373057,
"grad_norm": 0.1884257197380066,
"learning_rate": 3.5021665866510925e-05,
"loss": 0.0175,
"step": 6170
},
{
"epoch": 4.0025906735751295,
"grad_norm": 0.45834600925445557,
"learning_rate": 3.4863994572341843e-05,
"loss": 0.0235,
"step": 6180
},
{
"epoch": 4.009067357512953,
"grad_norm": 0.3222751319408417,
"learning_rate": 3.470648880310313e-05,
"loss": 0.0142,
"step": 6190
},
{
"epoch": 4.015544041450777,
"grad_norm": 0.3138159215450287,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.0215,
"step": 6200
},
{
"epoch": 4.022020725388601,
"grad_norm": 0.2751568853855133,
"learning_rate": 3.439198072741921e-05,
"loss": 0.0263,
"step": 6210
},
{
"epoch": 4.028497409326425,
"grad_norm": 0.3685873746871948,
"learning_rate": 3.423498186038393e-05,
"loss": 0.0241,
"step": 6220
},
{
"epoch": 4.034974093264249,
"grad_norm": 0.243320032954216,
"learning_rate": 3.407815539706124e-05,
"loss": 0.0201,
"step": 6230
},
{
"epoch": 4.041450777202073,
"grad_norm": 0.26391202211380005,
"learning_rate": 3.392150305248024e-05,
"loss": 0.0206,
"step": 6240
},
{
"epoch": 4.0479274611398965,
"grad_norm": 0.3051769733428955,
"learning_rate": 3.3765026539765834e-05,
"loss": 0.0174,
"step": 6250
},
{
"epoch": 4.05440414507772,
"grad_norm": 0.2602379620075226,
"learning_rate": 3.360872757012011e-05,
"loss": 0.0206,
"step": 6260
},
{
"epoch": 4.060880829015544,
"grad_norm": 0.37795671820640564,
"learning_rate": 3.3452607852803584e-05,
"loss": 0.0216,
"step": 6270
},
{
"epoch": 4.067357512953368,
"grad_norm": 0.20276519656181335,
"learning_rate": 3.329666909511645e-05,
"loss": 0.022,
"step": 6280
},
{
"epoch": 4.073834196891192,
"grad_norm": 0.2749174237251282,
"learning_rate": 3.3140913002379995e-05,
"loss": 0.019,
"step": 6290
},
{
"epoch": 4.080310880829016,
"grad_norm": 0.27063703536987305,
"learning_rate": 3.298534127791785e-05,
"loss": 0.0164,
"step": 6300
},
{
"epoch": 4.08678756476684,
"grad_norm": 0.2670196294784546,
"learning_rate": 3.282995562303754e-05,
"loss": 0.0178,
"step": 6310
},
{
"epoch": 4.0932642487046635,
"grad_norm": 0.24421292543411255,
"learning_rate": 3.267475773701161e-05,
"loss": 0.0149,
"step": 6320
},
{
"epoch": 4.099740932642487,
"grad_norm": 0.16270811855793,
"learning_rate": 3.251974931705933e-05,
"loss": 0.0202,
"step": 6330
},
{
"epoch": 4.106217616580311,
"grad_norm": 0.27116742730140686,
"learning_rate": 3.236493205832795e-05,
"loss": 0.0228,
"step": 6340
},
{
"epoch": 4.112694300518135,
"grad_norm": 0.29166123270988464,
"learning_rate": 3.221030765387417e-05,
"loss": 0.0225,
"step": 6350
},
{
"epoch": 4.119170984455959,
"grad_norm": 0.30937689542770386,
"learning_rate": 3.205587779464576e-05,
"loss": 0.0195,
"step": 6360
},
{
"epoch": 4.125647668393782,
"grad_norm": 0.32410162687301636,
"learning_rate": 3.190164416946285e-05,
"loss": 0.0154,
"step": 6370
},
{
"epoch": 4.132124352331606,
"grad_norm": 0.37122395634651184,
"learning_rate": 3.1747608464999725e-05,
"loss": 0.0197,
"step": 6380
},
{
"epoch": 4.13860103626943,
"grad_norm": 0.2892199754714966,
"learning_rate": 3.1593772365766105e-05,
"loss": 0.0253,
"step": 6390
},
{
"epoch": 4.1450777202072535,
"grad_norm": 0.20431406795978546,
"learning_rate": 3.144013755408895e-05,
"loss": 0.0181,
"step": 6400
},
{
"epoch": 4.151554404145077,
"grad_norm": 0.2995465099811554,
"learning_rate": 3.128670571009399e-05,
"loss": 0.0209,
"step": 6410
},
{
"epoch": 4.158031088082901,
"grad_norm": 0.24306261539459229,
"learning_rate": 3.113347851168721e-05,
"loss": 0.0207,
"step": 6420
},
{
"epoch": 4.164507772020725,
"grad_norm": 0.19058877229690552,
"learning_rate": 3.098045763453678e-05,
"loss": 0.0173,
"step": 6430
},
{
"epoch": 4.170984455958549,
"grad_norm": 0.15538224577903748,
"learning_rate": 3.082764475205442e-05,
"loss": 0.018,
"step": 6440
},
{
"epoch": 4.177461139896373,
"grad_norm": 0.3089154064655304,
"learning_rate": 3.0675041535377405e-05,
"loss": 0.0178,
"step": 6450
},
{
"epoch": 4.183937823834197,
"grad_norm": 0.19737383723258972,
"learning_rate": 3.052264965335e-05,
"loss": 0.015,
"step": 6460
},
{
"epoch": 4.1904145077720205,
"grad_norm": 0.25178325176239014,
"learning_rate": 3.0370470772505433e-05,
"loss": 0.0141,
"step": 6470
},
{
"epoch": 4.196891191709844,
"grad_norm": 0.16084274649620056,
"learning_rate": 3.0218506557047598e-05,
"loss": 0.0133,
"step": 6480
},
{
"epoch": 4.203367875647668,
"grad_norm": 0.18579982221126556,
"learning_rate": 3.006675866883275e-05,
"loss": 0.0147,
"step": 6490
},
{
"epoch": 4.209844559585492,
"grad_norm": 0.48777055740356445,
"learning_rate": 2.991522876735154e-05,
"loss": 0.0136,
"step": 6500
},
{
"epoch": 4.216321243523316,
"grad_norm": 0.30258408188819885,
"learning_rate": 2.976391850971065e-05,
"loss": 0.0269,
"step": 6510
},
{
"epoch": 4.22279792746114,
"grad_norm": 0.13562749326229095,
"learning_rate": 2.9612829550614836e-05,
"loss": 0.0258,
"step": 6520
},
{
"epoch": 4.229274611398964,
"grad_norm": 0.25811293721199036,
"learning_rate": 2.9461963542348737e-05,
"loss": 0.0143,
"step": 6530
},
{
"epoch": 4.2357512953367875,
"grad_norm": 0.32385483384132385,
"learning_rate": 2.931132213475884e-05,
"loss": 0.0241,
"step": 6540
},
{
"epoch": 4.242227979274611,
"grad_norm": 0.19972571730613708,
"learning_rate": 2.916090697523549e-05,
"loss": 0.0171,
"step": 6550
},
{
"epoch": 4.248704663212435,
"grad_norm": 0.21418355405330658,
"learning_rate": 2.9010719708694722e-05,
"loss": 0.0141,
"step": 6560
},
{
"epoch": 4.255181347150259,
"grad_norm": 0.430324524641037,
"learning_rate": 2.8860761977560436e-05,
"loss": 0.0191,
"step": 6570
},
{
"epoch": 4.261658031088083,
"grad_norm": 0.20250950753688812,
"learning_rate": 2.8711035421746367e-05,
"loss": 0.0195,
"step": 6580
},
{
"epoch": 4.268134715025907,
"grad_norm": 0.3518775701522827,
"learning_rate": 2.8561541678638142e-05,
"loss": 0.0195,
"step": 6590
},
{
"epoch": 4.274611398963731,
"grad_norm": 0.28056877851486206,
"learning_rate": 2.8412282383075363e-05,
"loss": 0.0145,
"step": 6600
},
{
"epoch": 4.2810880829015545,
"grad_norm": 0.17302846908569336,
"learning_rate": 2.8263259167333777e-05,
"loss": 0.014,
"step": 6610
},
{
"epoch": 4.287564766839378,
"grad_norm": 0.2156630903482437,
"learning_rate": 2.811447366110741e-05,
"loss": 0.0194,
"step": 6620
},
{
"epoch": 4.294041450777202,
"grad_norm": 0.20651696622371674,
"learning_rate": 2.7965927491490705e-05,
"loss": 0.0154,
"step": 6630
},
{
"epoch": 4.300518134715026,
"grad_norm": 0.24434152245521545,
"learning_rate": 2.7817622282960815e-05,
"loss": 0.0175,
"step": 6640
},
{
"epoch": 4.30699481865285,
"grad_norm": 0.3232467770576477,
"learning_rate": 2.766955965735968e-05,
"loss": 0.016,
"step": 6650
},
{
"epoch": 4.313471502590674,
"grad_norm": 0.2705603837966919,
"learning_rate": 2.7521741233876496e-05,
"loss": 0.0198,
"step": 6660
},
{
"epoch": 4.319948186528498,
"grad_norm": 0.302611380815506,
"learning_rate": 2.7374168629029813e-05,
"loss": 0.0221,
"step": 6670
},
{
"epoch": 4.3264248704663215,
"grad_norm": 0.23167212307453156,
"learning_rate": 2.7226843456650037e-05,
"loss": 0.0134,
"step": 6680
},
{
"epoch": 4.332901554404145,
"grad_norm": 0.21923419833183289,
"learning_rate": 2.707976732786166e-05,
"loss": 0.0194,
"step": 6690
},
{
"epoch": 4.339378238341969,
"grad_norm": 0.22099579870700836,
"learning_rate": 2.693294185106562e-05,
"loss": 0.0178,
"step": 6700
},
{
"epoch": 4.345854922279793,
"grad_norm": 0.2990250885486603,
"learning_rate": 2.6786368631921836e-05,
"loss": 0.0192,
"step": 6710
},
{
"epoch": 4.352331606217617,
"grad_norm": 0.32166576385498047,
"learning_rate": 2.6640049273331515e-05,
"loss": 0.0207,
"step": 6720
},
{
"epoch": 4.358808290155441,
"grad_norm": 0.24853219091892242,
"learning_rate": 2.6493985375419778e-05,
"loss": 0.0196,
"step": 6730
},
{
"epoch": 4.365284974093265,
"grad_norm": 0.223812997341156,
"learning_rate": 2.6348178535517966e-05,
"loss": 0.0244,
"step": 6740
},
{
"epoch": 4.3717616580310885,
"grad_norm": 0.1604139655828476,
"learning_rate": 2.6202630348146324e-05,
"loss": 0.0174,
"step": 6750
},
{
"epoch": 4.3782383419689115,
"grad_norm": 0.19458530843257904,
"learning_rate": 2.6057342404996522e-05,
"loss": 0.0176,
"step": 6760
},
{
"epoch": 4.384715025906735,
"grad_norm": 0.3442190885543823,
"learning_rate": 2.591231629491423e-05,
"loss": 0.0232,
"step": 6770
},
{
"epoch": 4.391191709844559,
"grad_norm": 0.2418828159570694,
"learning_rate": 2.5767553603881767e-05,
"loss": 0.0138,
"step": 6780
},
{
"epoch": 4.397668393782383,
"grad_norm": 0.28616422414779663,
"learning_rate": 2.562305591500069e-05,
"loss": 0.0182,
"step": 6790
},
{
"epoch": 4.404145077720207,
"grad_norm": 0.35665300488471985,
"learning_rate": 2.547882480847461e-05,
"loss": 0.0256,
"step": 6800
},
{
"epoch": 4.410621761658031,
"grad_norm": 0.28347641229629517,
"learning_rate": 2.5334861861591753e-05,
"loss": 0.0181,
"step": 6810
},
{
"epoch": 4.417098445595855,
"grad_norm": 0.2930223047733307,
"learning_rate": 2.5191168648707887e-05,
"loss": 0.0181,
"step": 6820
},
{
"epoch": 4.4235751295336785,
"grad_norm": 0.2513889968395233,
"learning_rate": 2.5047746741228978e-05,
"loss": 0.0209,
"step": 6830
},
{
"epoch": 4.430051813471502,
"grad_norm": 0.22858624160289764,
"learning_rate": 2.490459770759398e-05,
"loss": 0.0186,
"step": 6840
},
{
"epoch": 4.436528497409326,
"grad_norm": 0.2156023532152176,
"learning_rate": 2.476172311325783e-05,
"loss": 0.0154,
"step": 6850
},
{
"epoch": 4.44300518134715,
"grad_norm": 0.21967634558677673,
"learning_rate": 2.4619124520674146e-05,
"loss": 0.0192,
"step": 6860
},
{
"epoch": 4.449481865284974,
"grad_norm": 0.16934919357299805,
"learning_rate": 2.447680348927837e-05,
"loss": 0.013,
"step": 6870
},
{
"epoch": 4.455958549222798,
"grad_norm": 0.18204748630523682,
"learning_rate": 2.433476157547044e-05,
"loss": 0.0128,
"step": 6880
},
{
"epoch": 4.462435233160622,
"grad_norm": 0.2556453049182892,
"learning_rate": 2.419300033259798e-05,
"loss": 0.0242,
"step": 6890
},
{
"epoch": 4.4689119170984455,
"grad_norm": 0.30375412106513977,
"learning_rate": 2.405152131093926e-05,
"loss": 0.0123,
"step": 6900
},
{
"epoch": 4.475388601036269,
"grad_norm": 0.19570554792881012,
"learning_rate": 2.3910326057686127e-05,
"loss": 0.02,
"step": 6910
},
{
"epoch": 4.481865284974093,
"grad_norm": 0.20033107697963715,
"learning_rate": 2.3769416116927335e-05,
"loss": 0.0199,
"step": 6920
},
{
"epoch": 4.488341968911917,
"grad_norm": 0.22169610857963562,
"learning_rate": 2.362879302963135e-05,
"loss": 0.0155,
"step": 6930
},
{
"epoch": 4.494818652849741,
"grad_norm": 0.20770221948623657,
"learning_rate": 2.3488458333629777e-05,
"loss": 0.0183,
"step": 6940
},
{
"epoch": 4.501295336787565,
"grad_norm": 0.2045334279537201,
"learning_rate": 2.3348413563600325e-05,
"loss": 0.0137,
"step": 6950
},
{
"epoch": 4.507772020725389,
"grad_norm": 0.2747853696346283,
"learning_rate": 2.3208660251050158e-05,
"loss": 0.0139,
"step": 6960
},
{
"epoch": 4.5142487046632125,
"grad_norm": 0.2516135573387146,
"learning_rate": 2.3069199924299174e-05,
"loss": 0.0185,
"step": 6970
},
{
"epoch": 4.520725388601036,
"grad_norm": 0.28163227438926697,
"learning_rate": 2.29300341084631e-05,
"loss": 0.0194,
"step": 6980
},
{
"epoch": 4.52720207253886,
"grad_norm": 0.18591704964637756,
"learning_rate": 2.279116432543705e-05,
"loss": 0.026,
"step": 6990
},
{
"epoch": 4.533678756476684,
"grad_norm": 0.2261771857738495,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.0208,
"step": 7000
},
{
"epoch": 4.540155440414508,
"grad_norm": 0.22614069283008575,
"learning_rate": 2.251431892919171e-05,
"loss": 0.0154,
"step": 7010
},
{
"epoch": 4.546632124352332,
"grad_norm": 0.2026568055152893,
"learning_rate": 2.237634634350934e-05,
"loss": 0.019,
"step": 7020
},
{
"epoch": 4.553108808290156,
"grad_norm": 0.16602849960327148,
"learning_rate": 2.2238675845677663e-05,
"loss": 0.0121,
"step": 7030
},
{
"epoch": 4.5595854922279795,
"grad_norm": 0.32185062766075134,
"learning_rate": 2.2101308941239203e-05,
"loss": 0.0189,
"step": 7040
},
{
"epoch": 4.566062176165803,
"grad_norm": 0.1680583357810974,
"learning_rate": 2.196424713241637e-05,
"loss": 0.013,
"step": 7050
},
{
"epoch": 4.572538860103627,
"grad_norm": 0.16130873560905457,
"learning_rate": 2.182749191809518e-05,
"loss": 0.0171,
"step": 7060
},
{
"epoch": 4.579015544041451,
"grad_norm": 0.276980459690094,
"learning_rate": 2.1691044793808734e-05,
"loss": 0.0154,
"step": 7070
},
{
"epoch": 4.585492227979275,
"grad_norm": 0.2915078401565552,
"learning_rate": 2.1554907251720945e-05,
"loss": 0.0162,
"step": 7080
},
{
"epoch": 4.591968911917099,
"grad_norm": 0.5446373224258423,
"learning_rate": 2.1419080780610123e-05,
"loss": 0.0263,
"step": 7090
},
{
"epoch": 4.598445595854923,
"grad_norm": 0.3362918794155121,
"learning_rate": 2.128356686585282e-05,
"loss": 0.0156,
"step": 7100
},
{
"epoch": 4.6049222797927465,
"grad_norm": 0.2678106427192688,
"learning_rate": 2.1148366989407496e-05,
"loss": 0.0197,
"step": 7110
},
{
"epoch": 4.61139896373057,
"grad_norm": 0.18777549266815186,
"learning_rate": 2.1013482629798333e-05,
"loss": 0.0167,
"step": 7120
},
{
"epoch": 4.617875647668393,
"grad_norm": 0.17594756186008453,
"learning_rate": 2.0878915262099098e-05,
"loss": 0.0215,
"step": 7130
},
{
"epoch": 4.624352331606218,
"grad_norm": 0.22304749488830566,
"learning_rate": 2.0744666357916925e-05,
"loss": 0.0226,
"step": 7140
},
{
"epoch": 4.630829015544041,
"grad_norm": 0.18945784866809845,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0134,
"step": 7150
},
{
"epoch": 4.637305699481866,
"grad_norm": 0.152085542678833,
"learning_rate": 2.0477129809103147e-05,
"loss": 0.0175,
"step": 7160
},
{
"epoch": 4.643782383419689,
"grad_norm": 0.1946026086807251,
"learning_rate": 2.0343845090208368e-05,
"loss": 0.0223,
"step": 7170
},
{
"epoch": 4.650259067357513,
"grad_norm": 0.17056028544902802,
"learning_rate": 2.0210884686272368e-05,
"loss": 0.017,
"step": 7180
},
{
"epoch": 4.6567357512953365,
"grad_norm": 0.20490196347236633,
"learning_rate": 2.0078250051328784e-05,
"loss": 0.0146,
"step": 7190
},
{
"epoch": 4.66321243523316,
"grad_norm": 0.2622607350349426,
"learning_rate": 1.9945942635848748e-05,
"loss": 0.0184,
"step": 7200
},
{
"epoch": 4.669689119170984,
"grad_norm": 0.27773723006248474,
"learning_rate": 1.981396388672496e-05,
"loss": 0.0193,
"step": 7210
},
{
"epoch": 4.676165803108808,
"grad_norm": 0.14281229674816132,
"learning_rate": 1.9682315247255894e-05,
"loss": 0.0115,
"step": 7220
},
{
"epoch": 4.682642487046632,
"grad_norm": 0.40963441133499146,
"learning_rate": 1.9550998157129946e-05,
"loss": 0.0148,
"step": 7230
},
{
"epoch": 4.689119170984456,
"grad_norm": 0.164134681224823,
"learning_rate": 1.942001405240979e-05,
"loss": 0.0128,
"step": 7240
},
{
"epoch": 4.69559585492228,
"grad_norm": 0.32639560103416443,
"learning_rate": 1.928936436551661e-05,
"loss": 0.0153,
"step": 7250
},
{
"epoch": 4.7020725388601035,
"grad_norm": 0.3040475845336914,
"learning_rate": 1.9159050525214452e-05,
"loss": 0.0217,
"step": 7260
},
{
"epoch": 4.708549222797927,
"grad_norm": 0.15811248123645782,
"learning_rate": 1.9029073956594606e-05,
"loss": 0.0205,
"step": 7270
},
{
"epoch": 4.715025906735751,
"grad_norm": 0.21602000296115875,
"learning_rate": 1.8899436081059975e-05,
"loss": 0.0129,
"step": 7280
},
{
"epoch": 4.721502590673575,
"grad_norm": 0.25439000129699707,
"learning_rate": 1.877013831630961e-05,
"loss": 0.0165,
"step": 7290
},
{
"epoch": 4.727979274611399,
"grad_norm": 0.2095918357372284,
"learning_rate": 1.8641182076323148e-05,
"loss": 0.024,
"step": 7300
},
{
"epoch": 4.734455958549223,
"grad_norm": 0.20923274755477905,
"learning_rate": 1.851256877134538e-05,
"loss": 0.0162,
"step": 7310
},
{
"epoch": 4.740932642487047,
"grad_norm": 0.2099110335111618,
"learning_rate": 1.838429980787081e-05,
"loss": 0.0158,
"step": 7320
},
{
"epoch": 4.7474093264248705,
"grad_norm": 0.30646830797195435,
"learning_rate": 1.8256376588628238e-05,
"loss": 0.0134,
"step": 7330
},
{
"epoch": 4.753886010362694,
"grad_norm": 0.10917850583791733,
"learning_rate": 1.8128800512565513e-05,
"loss": 0.0203,
"step": 7340
},
{
"epoch": 4.760362694300518,
"grad_norm": 0.30095070600509644,
"learning_rate": 1.800157297483417e-05,
"loss": 0.0173,
"step": 7350
},
{
"epoch": 4.766839378238342,
"grad_norm": 0.2577114701271057,
"learning_rate": 1.787469536677419e-05,
"loss": 0.0197,
"step": 7360
},
{
"epoch": 4.773316062176166,
"grad_norm": 0.3354087471961975,
"learning_rate": 1.774816907589873e-05,
"loss": 0.0186,
"step": 7370
},
{
"epoch": 4.77979274611399,
"grad_norm": 0.2139454483985901,
"learning_rate": 1.7621995485879062e-05,
"loss": 0.0207,
"step": 7380
},
{
"epoch": 4.786269430051814,
"grad_norm": 0.15414099395275116,
"learning_rate": 1.749617597652934e-05,
"loss": 0.0174,
"step": 7390
},
{
"epoch": 4.7927461139896375,
"grad_norm": 0.23587022721767426,
"learning_rate": 1.7370711923791567e-05,
"loss": 0.0161,
"step": 7400
},
{
"epoch": 4.799222797927461,
"grad_norm": 0.32957470417022705,
"learning_rate": 1.7245604699720535e-05,
"loss": 0.016,
"step": 7410
},
{
"epoch": 4.805699481865285,
"grad_norm": 0.381789892911911,
"learning_rate": 1.712085567246878e-05,
"loss": 0.0278,
"step": 7420
},
{
"epoch": 4.812176165803109,
"grad_norm": 0.10974530875682831,
"learning_rate": 1.699646620627168e-05,
"loss": 0.0204,
"step": 7430
},
{
"epoch": 4.818652849740933,
"grad_norm": 0.13477617502212524,
"learning_rate": 1.6872437661432517e-05,
"loss": 0.013,
"step": 7440
},
{
"epoch": 4.825129533678757,
"grad_norm": 0.24634115397930145,
"learning_rate": 1.6748771394307585e-05,
"loss": 0.0168,
"step": 7450
},
{
"epoch": 4.831606217616581,
"grad_norm": 0.20444193482398987,
"learning_rate": 1.662546875729138e-05,
"loss": 0.0128,
"step": 7460
},
{
"epoch": 4.8380829015544045,
"grad_norm": 0.21118474006652832,
"learning_rate": 1.6502531098801753e-05,
"loss": 0.0149,
"step": 7470
},
{
"epoch": 4.844559585492228,
"grad_norm": 0.21043027937412262,
"learning_rate": 1.637995976326527e-05,
"loss": 0.0235,
"step": 7480
},
{
"epoch": 4.851036269430052,
"grad_norm": 0.23634518682956696,
"learning_rate": 1.62577560911024e-05,
"loss": 0.0152,
"step": 7490
},
{
"epoch": 4.857512953367876,
"grad_norm": 0.11259549856185913,
"learning_rate": 1.6135921418712956e-05,
"loss": 0.0175,
"step": 7500
},
{
"epoch": 4.8639896373057,
"grad_norm": 0.210161030292511,
"learning_rate": 1.6014457078461353e-05,
"loss": 0.016,
"step": 7510
},
{
"epoch": 4.870466321243523,
"grad_norm": 0.26830533146858215,
"learning_rate": 1.5893364398662176e-05,
"loss": 0.02,
"step": 7520
},
{
"epoch": 4.876943005181348,
"grad_norm": 0.2090412825345993,
"learning_rate": 1.5772644703565565e-05,
"loss": 0.019,
"step": 7530
},
{
"epoch": 4.883419689119171,
"grad_norm": 0.22939516603946686,
"learning_rate": 1.5652299313342773e-05,
"loss": 0.0136,
"step": 7540
},
{
"epoch": 4.889896373056995,
"grad_norm": 0.1718941479921341,
"learning_rate": 1.553232954407171e-05,
"loss": 0.0176,
"step": 7550
},
{
"epoch": 4.896373056994818,
"grad_norm": 0.17308102548122406,
"learning_rate": 1.5412736707722537e-05,
"loss": 0.0176,
"step": 7560
},
{
"epoch": 4.902849740932642,
"grad_norm": 0.11239796131849289,
"learning_rate": 1.5293522112143373e-05,
"loss": 0.0121,
"step": 7570
},
{
"epoch": 4.909326424870466,
"grad_norm": 0.35807281732559204,
"learning_rate": 1.517468706104589e-05,
"loss": 0.0146,
"step": 7580
},
{
"epoch": 4.91580310880829,
"grad_norm": 0.34626898169517517,
"learning_rate": 1.5056232853991209e-05,
"loss": 0.0176,
"step": 7590
},
{
"epoch": 4.922279792746114,
"grad_norm": 0.19787181913852692,
"learning_rate": 1.4938160786375572e-05,
"loss": 0.0148,
"step": 7600
},
{
"epoch": 4.928756476683938,
"grad_norm": 0.1723092943429947,
"learning_rate": 1.4820472149416154e-05,
"loss": 0.0154,
"step": 7610
},
{
"epoch": 4.935233160621761,
"grad_norm": 0.12467465549707413,
"learning_rate": 1.470316823013707e-05,
"loss": 0.0125,
"step": 7620
},
{
"epoch": 4.941709844559585,
"grad_norm": 0.36818578839302063,
"learning_rate": 1.4586250311355132e-05,
"loss": 0.0238,
"step": 7630
},
{
"epoch": 4.948186528497409,
"grad_norm": 0.17660541832447052,
"learning_rate": 1.4469719671666043e-05,
"loss": 0.0141,
"step": 7640
},
{
"epoch": 4.954663212435233,
"grad_norm": 0.24274510145187378,
"learning_rate": 1.435357758543015e-05,
"loss": 0.0171,
"step": 7650
},
{
"epoch": 4.961139896373057,
"grad_norm": 0.3167835772037506,
"learning_rate": 1.4237825322758736e-05,
"loss": 0.0215,
"step": 7660
},
{
"epoch": 4.967616580310881,
"grad_norm": 0.20456522703170776,
"learning_rate": 1.412246414949997e-05,
"loss": 0.012,
"step": 7670
},
{
"epoch": 4.974093264248705,
"grad_norm": 0.08046074211597443,
"learning_rate": 1.4007495327225162e-05,
"loss": 0.0123,
"step": 7680
},
{
"epoch": 4.980569948186528,
"grad_norm": 0.27661871910095215,
"learning_rate": 1.389292011321498e-05,
"loss": 0.0143,
"step": 7690
},
{
"epoch": 4.987046632124352,
"grad_norm": 0.33071455359458923,
"learning_rate": 1.3778739760445552e-05,
"loss": 0.0148,
"step": 7700
},
{
"epoch": 4.993523316062176,
"grad_norm": 0.33523082733154297,
"learning_rate": 1.3664955517574968e-05,
"loss": 0.0136,
"step": 7710
},
{
"epoch": 5.0,
"grad_norm": 0.15871787071228027,
"learning_rate": 1.3551568628929434e-05,
"loss": 0.0193,
"step": 7720
},
{
"epoch": 5.006476683937824,
"grad_norm": 0.20482423901557922,
"learning_rate": 1.343858033448982e-05,
"loss": 0.0152,
"step": 7730
},
{
"epoch": 5.012953367875648,
"grad_norm": 0.2882786691188812,
"learning_rate": 1.3325991869878013e-05,
"loss": 0.0127,
"step": 7740
},
{
"epoch": 5.019430051813472,
"grad_norm": 0.22831624746322632,
"learning_rate": 1.3213804466343421e-05,
"loss": 0.0184,
"step": 7750
},
{
"epoch": 5.025906735751295,
"grad_norm": 0.12518496811389923,
"learning_rate": 1.3102019350749528e-05,
"loss": 0.0141,
"step": 7760
},
{
"epoch": 5.032383419689119,
"grad_norm": 0.13528479635715485,
"learning_rate": 1.299063774556042e-05,
"loss": 0.0085,
"step": 7770
},
{
"epoch": 5.038860103626943,
"grad_norm": 0.1275235116481781,
"learning_rate": 1.2879660868827508e-05,
"loss": 0.0117,
"step": 7780
},
{
"epoch": 5.045336787564767,
"grad_norm": 0.20098711550235748,
"learning_rate": 1.2769089934176126e-05,
"loss": 0.0167,
"step": 7790
},
{
"epoch": 5.051813471502591,
"grad_norm": 0.09308230131864548,
"learning_rate": 1.2658926150792322e-05,
"loss": 0.0135,
"step": 7800
},
{
"epoch": 5.058290155440415,
"grad_norm": 0.1514425277709961,
"learning_rate": 1.2549170723409549e-05,
"loss": 0.0182,
"step": 7810
},
{
"epoch": 5.064766839378239,
"grad_norm": 0.3412984013557434,
"learning_rate": 1.243982485229559e-05,
"loss": 0.0125,
"step": 7820
},
{
"epoch": 5.071243523316062,
"grad_norm": 0.08874571323394775,
"learning_rate": 1.233088973323937e-05,
"loss": 0.0105,
"step": 7830
},
{
"epoch": 5.077720207253886,
"grad_norm": 0.11828942596912384,
"learning_rate": 1.2222366557537911e-05,
"loss": 0.0158,
"step": 7840
},
{
"epoch": 5.08419689119171,
"grad_norm": 0.24079908430576324,
"learning_rate": 1.2114256511983274e-05,
"loss": 0.0128,
"step": 7850
},
{
"epoch": 5.090673575129534,
"grad_norm": 0.12796323001384735,
"learning_rate": 1.2006560778849578e-05,
"loss": 0.0158,
"step": 7860
},
{
"epoch": 5.097150259067358,
"grad_norm": 0.4417702555656433,
"learning_rate": 1.1899280535880119e-05,
"loss": 0.0192,
"step": 7870
},
{
"epoch": 5.103626943005182,
"grad_norm": 0.29181721806526184,
"learning_rate": 1.1792416956274444e-05,
"loss": 0.0148,
"step": 7880
},
{
"epoch": 5.110103626943006,
"grad_norm": 0.15089385211467743,
"learning_rate": 1.1685971208675539e-05,
"loss": 0.0133,
"step": 7890
},
{
"epoch": 5.116580310880829,
"grad_norm": 0.22751882672309875,
"learning_rate": 1.157994445715706e-05,
"loss": 0.0179,
"step": 7900
},
{
"epoch": 5.123056994818652,
"grad_norm": 0.18326793611049652,
"learning_rate": 1.1474337861210543e-05,
"loss": 0.0135,
"step": 7910
},
{
"epoch": 5.129533678756476,
"grad_norm": 0.22920016944408417,
"learning_rate": 1.1369152575732822e-05,
"loss": 0.0128,
"step": 7920
},
{
"epoch": 5.1360103626943,
"grad_norm": 0.3829505145549774,
"learning_rate": 1.1264389751013326e-05,
"loss": 0.0118,
"step": 7930
},
{
"epoch": 5.142487046632124,
"grad_norm": 0.26591065526008606,
"learning_rate": 1.1160050532721528e-05,
"loss": 0.0176,
"step": 7940
},
{
"epoch": 5.148963730569948,
"grad_norm": 0.1697339564561844,
"learning_rate": 1.1056136061894384e-05,
"loss": 0.0161,
"step": 7950
},
{
"epoch": 5.155440414507772,
"grad_norm": 0.23472066223621368,
"learning_rate": 1.095264747492391e-05,
"loss": 0.0098,
"step": 7960
},
{
"epoch": 5.1619170984455955,
"grad_norm": 0.2254534214735031,
"learning_rate": 1.0849585903544706e-05,
"loss": 0.0155,
"step": 7970
},
{
"epoch": 5.168393782383419,
"grad_norm": 0.1349540650844574,
"learning_rate": 1.0746952474821614e-05,
"loss": 0.0127,
"step": 7980
},
{
"epoch": 5.174870466321243,
"grad_norm": 0.27007216215133667,
"learning_rate": 1.0644748311137376e-05,
"loss": 0.0146,
"step": 7990
},
{
"epoch": 5.181347150259067,
"grad_norm": 0.19731518626213074,
"learning_rate": 1.0542974530180327e-05,
"loss": 0.0158,
"step": 8000
},
{
"epoch": 5.187823834196891,
"grad_norm": 0.2689395844936371,
"learning_rate": 1.0441632244932237e-05,
"loss": 0.0154,
"step": 8010
},
{
"epoch": 5.194300518134715,
"grad_norm": 0.2564564645290375,
"learning_rate": 1.0340722563656107e-05,
"loss": 0.014,
"step": 8020
},
{
"epoch": 5.200777202072539,
"grad_norm": 0.1243646964430809,
"learning_rate": 1.0240246589884044e-05,
"loss": 0.0123,
"step": 8030
},
{
"epoch": 5.2072538860103625,
"grad_norm": 0.2494179755449295,
"learning_rate": 1.0140205422405214e-05,
"loss": 0.0139,
"step": 8040
},
{
"epoch": 5.213730569948186,
"grad_norm": 0.18832044303417206,
"learning_rate": 1.0040600155253765e-05,
"loss": 0.0143,
"step": 8050
},
{
"epoch": 5.22020725388601,
"grad_norm": 0.17419497668743134,
"learning_rate": 9.941431877696955e-06,
"loss": 0.0131,
"step": 8060
},
{
"epoch": 5.226683937823834,
"grad_norm": 0.42716658115386963,
"learning_rate": 9.842701674223187e-06,
"loss": 0.0209,
"step": 8070
},
{
"epoch": 5.233160621761658,
"grad_norm": 0.21540680527687073,
"learning_rate": 9.744410624530148e-06,
"loss": 0.0169,
"step": 8080
},
{
"epoch": 5.239637305699482,
"grad_norm": 0.1386623978614807,
"learning_rate": 9.646559803512994e-06,
"loss": 0.0137,
"step": 8090
},
{
"epoch": 5.246113989637306,
"grad_norm": 0.21496275067329407,
"learning_rate": 9.549150281252633e-06,
"loss": 0.0121,
"step": 8100
},
{
"epoch": 5.2525906735751295,
"grad_norm": 0.204043909907341,
"learning_rate": 9.452183123004e-06,
"loss": 0.0188,
"step": 8110
},
{
"epoch": 5.259067357512953,
"grad_norm": 0.22977150976657867,
"learning_rate": 9.355659389184396e-06,
"loss": 0.018,
"step": 8120
},
{
"epoch": 5.265544041450777,
"grad_norm": 0.18128716945648193,
"learning_rate": 9.259580135361929e-06,
"loss": 0.0166,
"step": 8130
},
{
"epoch": 5.272020725388601,
"grad_norm": 0.10355131328105927,
"learning_rate": 9.163946412243896e-06,
"loss": 0.0138,
"step": 8140
},
{
"epoch": 5.278497409326425,
"grad_norm": 0.1374073177576065,
"learning_rate": 9.068759265665384e-06,
"loss": 0.0139,
"step": 8150
},
{
"epoch": 5.284974093264249,
"grad_norm": 0.27741244435310364,
"learning_rate": 8.974019736577777e-06,
"loss": 0.0103,
"step": 8160
},
{
"epoch": 5.291450777202073,
"grad_norm": 0.14168044924736023,
"learning_rate": 8.879728861037384e-06,
"loss": 0.0146,
"step": 8170
},
{
"epoch": 5.2979274611398965,
"grad_norm": 0.15541480481624603,
"learning_rate": 8.785887670194138e-06,
"loss": 0.0094,
"step": 8180
},
{
"epoch": 5.30440414507772,
"grad_norm": 0.16454242169857025,
"learning_rate": 8.692497190280224e-06,
"loss": 0.0173,
"step": 8190
},
{
"epoch": 5.310880829015544,
"grad_norm": 0.16756625473499298,
"learning_rate": 8.599558442598998e-06,
"loss": 0.0165,
"step": 8200
},
{
"epoch": 5.317357512953368,
"grad_norm": 0.3439781069755554,
"learning_rate": 8.507072443513702e-06,
"loss": 0.0156,
"step": 8210
},
{
"epoch": 5.323834196891192,
"grad_norm": 0.18753990530967712,
"learning_rate": 8.415040204436426e-06,
"loss": 0.0141,
"step": 8220
},
{
"epoch": 5.330310880829016,
"grad_norm": 0.18207696080207825,
"learning_rate": 8.323462731816961e-06,
"loss": 0.015,
"step": 8230
},
{
"epoch": 5.33678756476684,
"grad_norm": 0.15657085180282593,
"learning_rate": 8.232341027131885e-06,
"loss": 0.0137,
"step": 8240
},
{
"epoch": 5.3432642487046635,
"grad_norm": 0.10936085879802704,
"learning_rate": 8.141676086873572e-06,
"loss": 0.0172,
"step": 8250
},
{
"epoch": 5.349740932642487,
"grad_norm": 0.26479002833366394,
"learning_rate": 8.051468902539272e-06,
"loss": 0.0099,
"step": 8260
},
{
"epoch": 5.356217616580311,
"grad_norm": 0.23503267765045166,
"learning_rate": 7.96172046062032e-06,
"loss": 0.0136,
"step": 8270
},
{
"epoch": 5.362694300518135,
"grad_norm": 0.19794131815433502,
"learning_rate": 7.872431742591268e-06,
"loss": 0.0165,
"step": 8280
},
{
"epoch": 5.369170984455959,
"grad_norm": 0.24394656717777252,
"learning_rate": 7.783603724899257e-06,
"loss": 0.0191,
"step": 8290
},
{
"epoch": 5.375647668393782,
"grad_norm": 0.17414677143096924,
"learning_rate": 7.695237378953223e-06,
"loss": 0.0108,
"step": 8300
},
{
"epoch": 5.382124352331607,
"grad_norm": 0.13544592261314392,
"learning_rate": 7.607333671113409e-06,
"loss": 0.0082,
"step": 8310
},
{
"epoch": 5.38860103626943,
"grad_norm": 0.1350071132183075,
"learning_rate": 7.519893562680663e-06,
"loss": 0.0077,
"step": 8320
},
{
"epoch": 5.3950777202072535,
"grad_norm": 0.20169180631637573,
"learning_rate": 7.432918009885997e-06,
"loss": 0.0178,
"step": 8330
},
{
"epoch": 5.401554404145077,
"grad_norm": 0.19569142162799835,
"learning_rate": 7.3464079638801365e-06,
"loss": 0.0186,
"step": 8340
},
{
"epoch": 5.408031088082901,
"grad_norm": 0.31749585270881653,
"learning_rate": 7.260364370723044e-06,
"loss": 0.0155,
"step": 8350
},
{
"epoch": 5.414507772020725,
"grad_norm": 0.18974344432353973,
"learning_rate": 7.174788171373731e-06,
"loss": 0.0136,
"step": 8360
},
{
"epoch": 5.420984455958549,
"grad_norm": 0.2608870267868042,
"learning_rate": 7.089680301679752e-06,
"loss": 0.0234,
"step": 8370
},
{
"epoch": 5.427461139896373,
"grad_norm": 0.08300212025642395,
"learning_rate": 7.005041692367154e-06,
"loss": 0.0143,
"step": 8380
},
{
"epoch": 5.433937823834197,
"grad_norm": 0.17654754221439362,
"learning_rate": 6.92087326903022e-06,
"loss": 0.0104,
"step": 8390
},
{
"epoch": 5.4404145077720205,
"grad_norm": 0.109446220099926,
"learning_rate": 6.837175952121306e-06,
"loss": 0.0106,
"step": 8400
},
{
"epoch": 5.446891191709844,
"grad_norm": 0.3720182776451111,
"learning_rate": 6.753950656940905e-06,
"loss": 0.012,
"step": 8410
},
{
"epoch": 5.453367875647668,
"grad_norm": 0.17289039492607117,
"learning_rate": 6.671198293627479e-06,
"loss": 0.0119,
"step": 8420
},
{
"epoch": 5.459844559585492,
"grad_norm": 0.21983198821544647,
"learning_rate": 6.588919767147639e-06,
"loss": 0.0111,
"step": 8430
},
{
"epoch": 5.466321243523316,
"grad_norm": 0.14437580108642578,
"learning_rate": 6.5071159772861436e-06,
"loss": 0.0152,
"step": 8440
},
{
"epoch": 5.47279792746114,
"grad_norm": 0.30240964889526367,
"learning_rate": 6.425787818636131e-06,
"loss": 0.014,
"step": 8450
},
{
"epoch": 5.479274611398964,
"grad_norm": 0.3552602529525757,
"learning_rate": 6.344936180589351e-06,
"loss": 0.0085,
"step": 8460
},
{
"epoch": 5.4857512953367875,
"grad_norm": 0.3316027820110321,
"learning_rate": 6.264561947326331e-06,
"loss": 0.0128,
"step": 8470
},
{
"epoch": 5.492227979274611,
"grad_norm": 0.17865358293056488,
"learning_rate": 6.184665997806832e-06,
"loss": 0.0135,
"step": 8480
},
{
"epoch": 5.498704663212435,
"grad_norm": 0.336990624666214,
"learning_rate": 6.1052492057601275e-06,
"loss": 0.0138,
"step": 8490
},
{
"epoch": 5.505181347150259,
"grad_norm": 0.23426495492458344,
"learning_rate": 6.026312439675552e-06,
"loss": 0.0149,
"step": 8500
},
{
"epoch": 5.511658031088083,
"grad_norm": 0.2391207218170166,
"learning_rate": 5.947856562792925e-06,
"loss": 0.0171,
"step": 8510
},
{
"epoch": 5.518134715025907,
"grad_norm": 0.1208881288766861,
"learning_rate": 5.869882433093155e-06,
"loss": 0.0166,
"step": 8520
},
{
"epoch": 5.524611398963731,
"grad_norm": 0.15781645476818085,
"learning_rate": 5.79239090328883e-06,
"loss": 0.0109,
"step": 8530
},
{
"epoch": 5.5310880829015545,
"grad_norm": 0.31300756335258484,
"learning_rate": 5.715382820814885e-06,
"loss": 0.0173,
"step": 8540
},
{
"epoch": 5.537564766839378,
"grad_norm": 0.13253220915794373,
"learning_rate": 5.6388590278194096e-06,
"loss": 0.0112,
"step": 8550
},
{
"epoch": 5.544041450777202,
"grad_norm": 0.1631617695093155,
"learning_rate": 5.562820361154314e-06,
"loss": 0.0117,
"step": 8560
},
{
"epoch": 5.550518134715026,
"grad_norm": 0.2857914865016937,
"learning_rate": 5.48726765236629e-06,
"loss": 0.011,
"step": 8570
},
{
"epoch": 5.55699481865285,
"grad_norm": 0.286710649728775,
"learning_rate": 5.412201727687644e-06,
"loss": 0.0109,
"step": 8580
},
{
"epoch": 5.563471502590674,
"grad_norm": 0.1950463354587555,
"learning_rate": 5.337623408027293e-06,
"loss": 0.0143,
"step": 8590
},
{
"epoch": 5.569948186528498,
"grad_norm": 0.1861039400100708,
"learning_rate": 5.263533508961827e-06,
"loss": 0.0098,
"step": 8600
},
{
"epoch": 5.5764248704663215,
"grad_norm": 0.266740083694458,
"learning_rate": 5.1899328407264855e-06,
"loss": 0.014,
"step": 8610
},
{
"epoch": 5.582901554404145,
"grad_norm": 0.2564321756362915,
"learning_rate": 5.116822208206396e-06,
"loss": 0.0104,
"step": 8620
},
{
"epoch": 5.589378238341969,
"grad_norm": 0.20271873474121094,
"learning_rate": 5.044202410927706e-06,
"loss": 0.017,
"step": 8630
},
{
"epoch": 5.595854922279793,
"grad_norm": 0.12181955575942993,
"learning_rate": 4.972074243048897e-06,
"loss": 0.0093,
"step": 8640
},
{
"epoch": 5.602331606217617,
"grad_norm": 0.28861793875694275,
"learning_rate": 4.900438493352055e-06,
"loss": 0.0105,
"step": 8650
},
{
"epoch": 5.608808290155441,
"grad_norm": 0.17971809208393097,
"learning_rate": 4.829295945234258e-06,
"loss": 0.011,
"step": 8660
},
{
"epoch": 5.615284974093264,
"grad_norm": 0.10693217068910599,
"learning_rate": 4.758647376699032e-06,
"loss": 0.0072,
"step": 8670
},
{
"epoch": 5.6217616580310885,
"grad_norm": 0.1231376975774765,
"learning_rate": 4.688493560347773e-06,
"loss": 0.0125,
"step": 8680
},
{
"epoch": 5.6282383419689115,
"grad_norm": 0.31137993931770325,
"learning_rate": 4.618835263371396e-06,
"loss": 0.0141,
"step": 8690
},
{
"epoch": 5.634715025906736,
"grad_norm": 0.10346370935440063,
"learning_rate": 4.549673247541875e-06,
"loss": 0.0096,
"step": 8700
},
{
"epoch": 5.641191709844559,
"grad_norm": 0.16310732066631317,
"learning_rate": 4.48100826920394e-06,
"loss": 0.0119,
"step": 8710
},
{
"epoch": 5.647668393782383,
"grad_norm": 0.12703819572925568,
"learning_rate": 4.412841079266777e-06,
"loss": 0.0097,
"step": 8720
},
{
"epoch": 5.654145077720207,
"grad_norm": 0.35846859216690063,
"learning_rate": 4.3451724231958644e-06,
"loss": 0.0152,
"step": 8730
},
{
"epoch": 5.660621761658031,
"grad_norm": 0.12785233557224274,
"learning_rate": 4.27800304100478e-06,
"loss": 0.0099,
"step": 8740
},
{
"epoch": 5.667098445595855,
"grad_norm": 0.17708571255207062,
"learning_rate": 4.2113336672471245e-06,
"loss": 0.013,
"step": 8750
},
{
"epoch": 5.6735751295336785,
"grad_norm": 0.4389305114746094,
"learning_rate": 4.145165031008508e-06,
"loss": 0.0092,
"step": 8760
},
{
"epoch": 5.680051813471502,
"grad_norm": 0.1950322389602661,
"learning_rate": 4.079497855898501e-06,
"loss": 0.0133,
"step": 8770
},
{
"epoch": 5.686528497409326,
"grad_norm": 0.2934739589691162,
"learning_rate": 4.01433286004283e-06,
"loss": 0.018,
"step": 8780
},
{
"epoch": 5.69300518134715,
"grad_norm": 0.21868778765201569,
"learning_rate": 3.949670756075447e-06,
"loss": 0.0178,
"step": 8790
},
{
"epoch": 5.699481865284974,
"grad_norm": 0.12526535987854004,
"learning_rate": 3.885512251130763e-06,
"loss": 0.0144,
"step": 8800
},
{
"epoch": 5.705958549222798,
"grad_norm": 0.3391956090927124,
"learning_rate": 3.821858046835913e-06,
"loss": 0.0216,
"step": 8810
},
{
"epoch": 5.712435233160622,
"grad_norm": 0.22200612723827362,
"learning_rate": 3.75870883930306e-06,
"loss": 0.0102,
"step": 8820
},
{
"epoch": 5.7189119170984455,
"grad_norm": 0.08342672139406204,
"learning_rate": 3.696065319121833e-06,
"loss": 0.0141,
"step": 8830
},
{
"epoch": 5.725388601036269,
"grad_norm": 0.3359827399253845,
"learning_rate": 3.6339281713517303e-06,
"loss": 0.0173,
"step": 8840
},
{
"epoch": 5.731865284974093,
"grad_norm": 0.2735726237297058,
"learning_rate": 3.5722980755146517e-06,
"loss": 0.0186,
"step": 8850
},
{
"epoch": 5.738341968911917,
"grad_norm": 0.2652912735939026,
"learning_rate": 3.511175705587433e-06,
"loss": 0.0101,
"step": 8860
},
{
"epoch": 5.744818652849741,
"grad_norm": 0.16694048047065735,
"learning_rate": 3.4505617299945336e-06,
"loss": 0.0114,
"step": 8870
},
{
"epoch": 5.751295336787565,
"grad_norm": 0.268002450466156,
"learning_rate": 3.390456811600673e-06,
"loss": 0.0199,
"step": 8880
},
{
"epoch": 5.757772020725389,
"grad_norm": 0.1921738237142563,
"learning_rate": 3.3308616077036115e-06,
"loss": 0.0119,
"step": 8890
},
{
"epoch": 5.7642487046632125,
"grad_norm": 0.17599613964557648,
"learning_rate": 3.271776770026963e-06,
"loss": 0.0137,
"step": 8900
},
{
"epoch": 5.770725388601036,
"grad_norm": 0.2630617618560791,
"learning_rate": 3.213202944713023e-06,
"loss": 0.0124,
"step": 8910
},
{
"epoch": 5.77720207253886,
"grad_norm": 0.13770775496959686,
"learning_rate": 3.155140772315773e-06,
"loss": 0.0125,
"step": 8920
},
{
"epoch": 5.783678756476684,
"grad_norm": 0.1589035540819168,
"learning_rate": 3.0975908877938277e-06,
"loss": 0.0093,
"step": 8930
},
{
"epoch": 5.790155440414508,
"grad_norm": 0.22588101029396057,
"learning_rate": 3.040553920503503e-06,
"loss": 0.0124,
"step": 8940
},
{
"epoch": 5.796632124352332,
"grad_norm": 0.15115275979042053,
"learning_rate": 2.9840304941919415e-06,
"loss": 0.015,
"step": 8950
},
{
"epoch": 5.803108808290156,
"grad_norm": 0.2765119969844818,
"learning_rate": 2.928021226990263e-06,
"loss": 0.0138,
"step": 8960
},
{
"epoch": 5.8095854922279795,
"grad_norm": 0.10985146462917328,
"learning_rate": 2.8725267314068495e-06,
"loss": 0.0123,
"step": 8970
},
{
"epoch": 5.816062176165803,
"grad_norm": 0.135583758354187,
"learning_rate": 2.817547614320615e-06,
"loss": 0.0134,
"step": 8980
},
{
"epoch": 5.822538860103627,
"grad_norm": 0.10807247459888458,
"learning_rate": 2.7630844769743757e-06,
"loss": 0.0109,
"step": 8990
},
{
"epoch": 5.829015544041451,
"grad_norm": 0.30291974544525146,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.0145,
"step": 9000
},
{
"epoch": 5.835492227979275,
"grad_norm": 0.21395935118198395,
"learning_rate": 2.6557085182532582e-06,
"loss": 0.0225,
"step": 9010
},
{
"epoch": 5.841968911917099,
"grad_norm": 0.15883110463619232,
"learning_rate": 2.602796871124663e-06,
"loss": 0.0156,
"step": 9020
},
{
"epoch": 5.848445595854923,
"grad_norm": 0.20018735527992249,
"learning_rate": 2.5504035522157854e-06,
"loss": 0.0166,
"step": 9030
},
{
"epoch": 5.8549222797927465,
"grad_norm": 0.17231498658657074,
"learning_rate": 2.4985291344915674e-06,
"loss": 0.0115,
"step": 9040
},
{
"epoch": 5.86139896373057,
"grad_norm": 0.12376800179481506,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.0117,
"step": 9050
},
{
"epoch": 5.867875647668393,
"grad_norm": 0.1932302713394165,
"learning_rate": 2.3963392660775575e-06,
"loss": 0.0169,
"step": 9060
},
{
"epoch": 5.874352331606218,
"grad_norm": 0.2442460060119629,
"learning_rate": 2.3460249329197824e-06,
"loss": 0.0119,
"step": 9070
},
{
"epoch": 5.880829015544041,
"grad_norm": 0.40588802099227905,
"learning_rate": 2.296231735998511e-06,
"loss": 0.0123,
"step": 9080
},
{
"epoch": 5.887305699481866,
"grad_norm": 0.13834908604621887,
"learning_rate": 2.2469602198441573e-06,
"loss": 0.0124,
"step": 9090
},
{
"epoch": 5.893782383419689,
"grad_norm": 0.12029829621315002,
"learning_rate": 2.1982109232821178e-06,
"loss": 0.0125,
"step": 9100
},
{
"epoch": 5.900259067357513,
"grad_norm": 0.23821647465229034,
"learning_rate": 2.149984379426906e-06,
"loss": 0.0104,
"step": 9110
},
{
"epoch": 5.9067357512953365,
"grad_norm": 0.14522571861743927,
"learning_rate": 2.102281115676258e-06,
"loss": 0.0095,
"step": 9120
},
{
"epoch": 5.91321243523316,
"grad_norm": 0.32294297218322754,
"learning_rate": 2.0551016537054493e-06,
"loss": 0.0195,
"step": 9130
},
{
"epoch": 5.919689119170984,
"grad_norm": 0.2826516628265381,
"learning_rate": 2.008446509461498e-06,
"loss": 0.0114,
"step": 9140
},
{
"epoch": 5.926165803108808,
"grad_norm": 0.19465412199497223,
"learning_rate": 1.962316193157593e-06,
"loss": 0.0108,
"step": 9150
},
{
"epoch": 5.932642487046632,
"grad_norm": 0.14356905221939087,
"learning_rate": 1.91671120926748e-06,
"loss": 0.0102,
"step": 9160
},
{
"epoch": 5.939119170984456,
"grad_norm": 0.1268174797296524,
"learning_rate": 1.8716320565199618e-06,
"loss": 0.0116,
"step": 9170
},
{
"epoch": 5.94559585492228,
"grad_norm": 0.4275663197040558,
"learning_rate": 1.8270792278934302e-06,
"loss": 0.0117,
"step": 9180
},
{
"epoch": 5.9520725388601035,
"grad_norm": 0.2674071192741394,
"learning_rate": 1.7830532106104747e-06,
"loss": 0.0129,
"step": 9190
},
{
"epoch": 5.958549222797927,
"grad_norm": 0.1318097561597824,
"learning_rate": 1.7395544861325718e-06,
"loss": 0.0089,
"step": 9200
},
{
"epoch": 5.965025906735751,
"grad_norm": 0.1907251924276352,
"learning_rate": 1.696583530154794e-06,
"loss": 0.0184,
"step": 9210
},
{
"epoch": 5.971502590673575,
"grad_norm": 0.12996065616607666,
"learning_rate": 1.6541408126006463e-06,
"loss": 0.0133,
"step": 9220
},
{
"epoch": 5.977979274611399,
"grad_norm": 0.1298728883266449,
"learning_rate": 1.6122267976168781e-06,
"loss": 0.0139,
"step": 9230
},
{
"epoch": 5.984455958549223,
"grad_norm": 0.23814009130001068,
"learning_rate": 1.5708419435684462e-06,
"loss": 0.0173,
"step": 9240
},
{
"epoch": 5.990932642487047,
"grad_norm": 0.2996975779533386,
"learning_rate": 1.5299867030334814e-06,
"loss": 0.0137,
"step": 9250
},
{
"epoch": 5.9974093264248705,
"grad_norm": 0.13092438876628876,
"learning_rate": 1.4896615227983468e-06,
"loss": 0.0119,
"step": 9260
},
{
"epoch": 6.003886010362694,
"grad_norm": 0.1732238382101059,
"learning_rate": 1.4498668438527597e-06,
"loss": 0.0134,
"step": 9270
},
{
"epoch": 6.010362694300518,
"grad_norm": 0.17689648270606995,
"learning_rate": 1.4106031013849496e-06,
"loss": 0.0073,
"step": 9280
},
{
"epoch": 6.016839378238342,
"grad_norm": 0.13013668358325958,
"learning_rate": 1.3718707247769135e-06,
"loss": 0.0157,
"step": 9290
},
{
"epoch": 6.023316062176166,
"grad_norm": 0.149629145860672,
"learning_rate": 1.333670137599713e-06,
"loss": 0.012,
"step": 9300
},
{
"epoch": 6.02979274611399,
"grad_norm": 0.12040393799543381,
"learning_rate": 1.2960017576088446e-06,
"loss": 0.0175,
"step": 9310
},
{
"epoch": 6.036269430051814,
"grad_norm": 0.28257250785827637,
"learning_rate": 1.2588659967397e-06,
"loss": 0.0281,
"step": 9320
},
{
"epoch": 6.0427461139896375,
"grad_norm": 0.1398344486951828,
"learning_rate": 1.222263261102985e-06,
"loss": 0.0151,
"step": 9330
},
{
"epoch": 6.049222797927461,
"grad_norm": 0.294515460729599,
"learning_rate": 1.1861939509803687e-06,
"loss": 0.0141,
"step": 9340
},
{
"epoch": 6.055699481865285,
"grad_norm": 0.06825648248195648,
"learning_rate": 1.1506584608200367e-06,
"loss": 0.0092,
"step": 9350
},
{
"epoch": 6.062176165803109,
"grad_norm": 0.06912140548229218,
"learning_rate": 1.1156571792324211e-06,
"loss": 0.008,
"step": 9360
},
{
"epoch": 6.068652849740933,
"grad_norm": 0.1832340657711029,
"learning_rate": 1.0811904889859336e-06,
"loss": 0.0195,
"step": 9370
},
{
"epoch": 6.075129533678757,
"grad_norm": 0.15293700993061066,
"learning_rate": 1.0472587670027678e-06,
"loss": 0.0075,
"step": 9380
},
{
"epoch": 6.081606217616581,
"grad_norm": 0.18111123144626617,
"learning_rate": 1.0138623843548078e-06,
"loss": 0.0173,
"step": 9390
},
{
"epoch": 6.0880829015544045,
"grad_norm": 0.1301887333393097,
"learning_rate": 9.810017062595322e-07,
"loss": 0.0125,
"step": 9400
},
{
"epoch": 6.094559585492228,
"grad_norm": 0.12335970997810364,
"learning_rate": 9.486770920760668e-07,
"loss": 0.0102,
"step": 9410
},
{
"epoch": 6.101036269430052,
"grad_norm": 0.1748531013727188,
"learning_rate": 9.168888953011989e-07,
"loss": 0.0129,
"step": 9420
},
{
"epoch": 6.107512953367876,
"grad_norm": 0.21782076358795166,
"learning_rate": 8.856374635655695e-07,
"loss": 0.0125,
"step": 9430
},
{
"epoch": 6.1139896373057,
"grad_norm": 0.269727498292923,
"learning_rate": 8.549231386298151e-07,
"loss": 0.0142,
"step": 9440
},
{
"epoch": 6.120466321243524,
"grad_norm": 0.10577181726694107,
"learning_rate": 8.247462563808817e-07,
"loss": 0.0093,
"step": 9450
},
{
"epoch": 6.126943005181348,
"grad_norm": 0.10951922833919525,
"learning_rate": 7.951071468283167e-07,
"loss": 0.0142,
"step": 9460
},
{
"epoch": 6.133419689119171,
"grad_norm": 0.181460440158844,
"learning_rate": 7.66006134100672e-07,
"loss": 0.0112,
"step": 9470
},
{
"epoch": 6.139896373056994,
"grad_norm": 0.10866732150316238,
"learning_rate": 7.374435364419674e-07,
"loss": 0.0076,
"step": 9480
},
{
"epoch": 6.146373056994818,
"grad_norm": 0.21688801050186157,
"learning_rate": 7.094196662081831e-07,
"loss": 0.0131,
"step": 9490
},
{
"epoch": 6.152849740932642,
"grad_norm": 0.17768456041812897,
"learning_rate": 6.819348298638839e-07,
"loss": 0.0104,
"step": 9500
},
{
"epoch": 6.159326424870466,
"grad_norm": 0.1749340444803238,
"learning_rate": 6.549893279788277e-07,
"loss": 0.0126,
"step": 9510
},
{
"epoch": 6.16580310880829,
"grad_norm": 0.08541977405548096,
"learning_rate": 6.285834552247128e-07,
"loss": 0.0097,
"step": 9520
},
{
"epoch": 6.172279792746114,
"grad_norm": 0.17559358477592468,
"learning_rate": 6.027175003719354e-07,
"loss": 0.0128,
"step": 9530
},
{
"epoch": 6.178756476683938,
"grad_norm": 0.16296590864658356,
"learning_rate": 5.773917462864264e-07,
"loss": 0.0137,
"step": 9540
},
{
"epoch": 6.185233160621761,
"grad_norm": 0.27558839321136475,
"learning_rate": 5.526064699265753e-07,
"loss": 0.0128,
"step": 9550
},
{
"epoch": 6.191709844559585,
"grad_norm": 0.26449841260910034,
"learning_rate": 5.283619423401998e-07,
"loss": 0.0139,
"step": 9560
},
{
"epoch": 6.198186528497409,
"grad_norm": 0.21926769614219666,
"learning_rate": 5.046584286615697e-07,
"loss": 0.0132,
"step": 9570
},
{
"epoch": 6.204663212435233,
"grad_norm": 0.08541421592235565,
"learning_rate": 4.814961881085045e-07,
"loss": 0.0058,
"step": 9580
},
{
"epoch": 6.211139896373057,
"grad_norm": 0.08796455711126328,
"learning_rate": 4.5887547397955864e-07,
"loss": 0.0103,
"step": 9590
},
{
"epoch": 6.217616580310881,
"grad_norm": 0.22742117941379547,
"learning_rate": 4.367965336512403e-07,
"loss": 0.0179,
"step": 9600
},
{
"epoch": 6.224093264248705,
"grad_norm": 0.09479006379842758,
"learning_rate": 4.1525960857530243e-07,
"loss": 0.0117,
"step": 9610
},
{
"epoch": 6.230569948186528,
"grad_norm": 0.24150408804416656,
"learning_rate": 3.9426493427611177e-07,
"loss": 0.0125,
"step": 9620
},
{
"epoch": 6.237046632124352,
"grad_norm": 0.07416193932294846,
"learning_rate": 3.738127403480507e-07,
"loss": 0.0125,
"step": 9630
},
{
"epoch": 6.243523316062176,
"grad_norm": 0.1948796957731247,
"learning_rate": 3.5390325045304706e-07,
"loss": 0.0089,
"step": 9640
},
{
"epoch": 6.25,
"grad_norm": 0.15250228345394135,
"learning_rate": 3.3453668231809286e-07,
"loss": 0.0127,
"step": 9650
},
{
"epoch": 6.256476683937824,
"grad_norm": 0.1568685621023178,
"learning_rate": 3.157132477328628e-07,
"loss": 0.009,
"step": 9660
},
{
"epoch": 6.262953367875648,
"grad_norm": 0.23393931984901428,
"learning_rate": 2.9743315254743833e-07,
"loss": 0.023,
"step": 9670
},
{
"epoch": 6.269430051813472,
"grad_norm": 0.24607960879802704,
"learning_rate": 2.796965966699927e-07,
"loss": 0.0222,
"step": 9680
},
{
"epoch": 6.275906735751295,
"grad_norm": 0.2668175995349884,
"learning_rate": 2.625037740646763e-07,
"loss": 0.007,
"step": 9690
},
{
"epoch": 6.282383419689119,
"grad_norm": 0.1123218759894371,
"learning_rate": 2.458548727494292e-07,
"loss": 0.0178,
"step": 9700
},
{
"epoch": 6.288860103626943,
"grad_norm": 0.08573432266712189,
"learning_rate": 2.2975007479397738e-07,
"loss": 0.0097,
"step": 9710
},
{
"epoch": 6.295336787564767,
"grad_norm": 0.24590806663036346,
"learning_rate": 2.1418955631781202e-07,
"loss": 0.0102,
"step": 9720
},
{
"epoch": 6.301813471502591,
"grad_norm": 0.12596063315868378,
"learning_rate": 1.9917348748826335e-07,
"loss": 0.0142,
"step": 9730
},
{
"epoch": 6.308290155440415,
"grad_norm": 0.2864455282688141,
"learning_rate": 1.847020325186577e-07,
"loss": 0.0143,
"step": 9740
},
{
"epoch": 6.314766839378239,
"grad_norm": 0.31299856305122375,
"learning_rate": 1.7077534966650766e-07,
"loss": 0.0123,
"step": 9750
},
{
"epoch": 6.321243523316062,
"grad_norm": 0.2245355099439621,
"learning_rate": 1.5739359123178587e-07,
"loss": 0.0141,
"step": 9760
},
{
"epoch": 6.327720207253886,
"grad_norm": 0.23238977789878845,
"learning_rate": 1.4455690355525964e-07,
"loss": 0.0085,
"step": 9770
},
{
"epoch": 6.33419689119171,
"grad_norm": 0.1620815098285675,
"learning_rate": 1.3226542701689215e-07,
"loss": 0.0121,
"step": 9780
},
{
"epoch": 6.340673575129534,
"grad_norm": 0.14035151898860931,
"learning_rate": 1.2051929603428825e-07,
"loss": 0.0153,
"step": 9790
},
{
"epoch": 6.347150259067358,
"grad_norm": 0.12476951628923416,
"learning_rate": 1.0931863906127327e-07,
"loss": 0.0094,
"step": 9800
},
{
"epoch": 6.353626943005182,
"grad_norm": 0.1264141947031021,
"learning_rate": 9.866357858642205e-08,
"loss": 0.0108,
"step": 9810
},
{
"epoch": 6.360103626943006,
"grad_norm": 0.19493091106414795,
"learning_rate": 8.855423113177664e-08,
"loss": 0.0137,
"step": 9820
},
{
"epoch": 6.366580310880829,
"grad_norm": 0.2744383215904236,
"learning_rate": 7.899070725153613e-08,
"loss": 0.0088,
"step": 9830
},
{
"epoch": 6.373056994818652,
"grad_norm": 0.1168895810842514,
"learning_rate": 6.997311153086883e-08,
"loss": 0.0094,
"step": 9840
},
{
"epoch": 6.379533678756477,
"grad_norm": 0.15993544459342957,
"learning_rate": 6.150154258476315e-08,
"loss": 0.0146,
"step": 9850
},
{
"epoch": 6.3860103626943,
"grad_norm": 0.13558878004550934,
"learning_rate": 5.3576093056922906e-08,
"loss": 0.0111,
"step": 9860
},
{
"epoch": 6.392487046632124,
"grad_norm": 0.21038438379764557,
"learning_rate": 4.619684961881254e-08,
"loss": 0.0112,
"step": 9870
},
{
"epoch": 6.398963730569948,
"grad_norm": 0.24471516907215118,
"learning_rate": 3.936389296864129e-08,
"loss": 0.0093,
"step": 9880
},
{
"epoch": 6.405440414507772,
"grad_norm": 0.05535868927836418,
"learning_rate": 3.3077297830541584e-08,
"loss": 0.0146,
"step": 9890
},
{
"epoch": 6.4119170984455955,
"grad_norm": 0.34832239151000977,
"learning_rate": 2.7337132953697554e-08,
"loss": 0.0164,
"step": 9900
},
{
"epoch": 6.418393782383419,
"grad_norm": 0.11602523177862167,
"learning_rate": 2.214346111164556e-08,
"loss": 0.013,
"step": 9910
},
{
"epoch": 6.424870466321243,
"grad_norm": 0.12291716039180756,
"learning_rate": 1.749633910153592e-08,
"loss": 0.0098,
"step": 9920
},
{
"epoch": 6.431347150259067,
"grad_norm": 0.1366141438484192,
"learning_rate": 1.3395817743561134e-08,
"loss": 0.0132,
"step": 9930
},
{
"epoch": 6.437823834196891,
"grad_norm": 0.1532369703054428,
"learning_rate": 9.841941880361916e-09,
"loss": 0.0151,
"step": 9940
},
{
"epoch": 6.444300518134715,
"grad_norm": 0.12662509083747864,
"learning_rate": 6.834750376549792e-09,
"loss": 0.014,
"step": 9950
},
{
"epoch": 6.450777202072539,
"grad_norm": 0.07399441301822662,
"learning_rate": 4.3742761183018784e-09,
"loss": 0.0217,
"step": 9960
},
{
"epoch": 6.4572538860103625,
"grad_norm": 0.29164353013038635,
"learning_rate": 2.4605460129556445e-09,
"loss": 0.0128,
"step": 9970
},
{
"epoch": 6.463730569948186,
"grad_norm": 0.27305173873901367,
"learning_rate": 1.0935809887702154e-09,
"loss": 0.0113,
"step": 9980
},
{
"epoch": 6.47020725388601,
"grad_norm": 0.10412518680095673,
"learning_rate": 2.7339599464326627e-10,
"loss": 0.0121,
"step": 9990
},
{
"epoch": 6.476683937823834,
"grad_norm": 0.16107480227947235,
"learning_rate": 0.0,
"loss": 0.0127,
"step": 10000
},
{
"epoch": 6.476683937823834,
"step": 10000,
"total_flos": 3.3046784608923354e+17,
"train_loss": 0.039629857166856526,
"train_runtime": 2421.3782,
"train_samples_per_second": 66.078,
"train_steps_per_second": 4.13
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3046784608923354e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}