{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 513,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005847953216374269,
"grad_norm": 17.388755230326108,
"learning_rate": 3.846153846153847e-07,
"loss": 2.1955,
"step": 1
},
{
"epoch": 0.011695906432748537,
"grad_norm": 17.472331789941457,
"learning_rate": 7.692307692307694e-07,
"loss": 2.3172,
"step": 2
},
{
"epoch": 0.017543859649122806,
"grad_norm": 16.918399664716798,
"learning_rate": 1.153846153846154e-06,
"loss": 2.369,
"step": 3
},
{
"epoch": 0.023391812865497075,
"grad_norm": 18.225976223813337,
"learning_rate": 1.5384615384615387e-06,
"loss": 2.3503,
"step": 4
},
{
"epoch": 0.029239766081871343,
"grad_norm": 17.290350784392576,
"learning_rate": 1.9230769230769234e-06,
"loss": 2.206,
"step": 5
},
{
"epoch": 0.03508771929824561,
"grad_norm": 17.713013786969835,
"learning_rate": 2.307692307692308e-06,
"loss": 2.1757,
"step": 6
},
{
"epoch": 0.04093567251461988,
"grad_norm": 16.957525841766792,
"learning_rate": 2.6923076923076923e-06,
"loss": 2.263,
"step": 7
},
{
"epoch": 0.04678362573099415,
"grad_norm": 15.274517120371355,
"learning_rate": 3.0769230769230774e-06,
"loss": 2.0823,
"step": 8
},
{
"epoch": 0.05263157894736842,
"grad_norm": 16.17819052550626,
"learning_rate": 3.4615384615384617e-06,
"loss": 2.1592,
"step": 9
},
{
"epoch": 0.05847953216374269,
"grad_norm": 14.878611384619472,
"learning_rate": 3.846153846153847e-06,
"loss": 2.0778,
"step": 10
},
{
"epoch": 0.06432748538011696,
"grad_norm": 11.028350456358304,
"learning_rate": 4.230769230769231e-06,
"loss": 1.7525,
"step": 11
},
{
"epoch": 0.07017543859649122,
"grad_norm": 10.604424239205104,
"learning_rate": 4.615384615384616e-06,
"loss": 1.8686,
"step": 12
},
{
"epoch": 0.07602339181286549,
"grad_norm": 9.83197777453571,
"learning_rate": 5e-06,
"loss": 1.6995,
"step": 13
},
{
"epoch": 0.08187134502923976,
"grad_norm": 9.28612318800235,
"learning_rate": 5.384615384615385e-06,
"loss": 1.7511,
"step": 14
},
{
"epoch": 0.08771929824561403,
"grad_norm": 3.6938751947460333,
"learning_rate": 5.769230769230769e-06,
"loss": 1.4354,
"step": 15
},
{
"epoch": 0.0935672514619883,
"grad_norm": 3.64251741494419,
"learning_rate": 6.153846153846155e-06,
"loss": 1.4634,
"step": 16
},
{
"epoch": 0.09941520467836257,
"grad_norm": 3.249845410537068,
"learning_rate": 6.538461538461539e-06,
"loss": 1.4062,
"step": 17
},
{
"epoch": 0.10526315789473684,
"grad_norm": 3.0197933728284476,
"learning_rate": 6.923076923076923e-06,
"loss": 1.4268,
"step": 18
},
{
"epoch": 0.1111111111111111,
"grad_norm": 2.5032405922437087,
"learning_rate": 7.307692307692308e-06,
"loss": 1.3546,
"step": 19
},
{
"epoch": 0.11695906432748537,
"grad_norm": 1.6914458221673982,
"learning_rate": 7.692307692307694e-06,
"loss": 1.2072,
"step": 20
},
{
"epoch": 0.12280701754385964,
"grad_norm": 1.633209041430983,
"learning_rate": 8.076923076923077e-06,
"loss": 1.1706,
"step": 21
},
{
"epoch": 0.1286549707602339,
"grad_norm": 1.62333800604462,
"learning_rate": 8.461538461538462e-06,
"loss": 1.2573,
"step": 22
},
{
"epoch": 0.13450292397660818,
"grad_norm": 1.2572597783261759,
"learning_rate": 8.846153846153847e-06,
"loss": 1.1549,
"step": 23
},
{
"epoch": 0.14035087719298245,
"grad_norm": 1.0892793835477907,
"learning_rate": 9.230769230769232e-06,
"loss": 1.1367,
"step": 24
},
{
"epoch": 0.14619883040935672,
"grad_norm": 0.9726760103698124,
"learning_rate": 9.615384615384616e-06,
"loss": 1.1664,
"step": 25
},
{
"epoch": 0.15204678362573099,
"grad_norm": 0.8399835297943901,
"learning_rate": 1e-05,
"loss": 1.0771,
"step": 26
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.756344388475637,
"learning_rate": 1.0384615384615386e-05,
"loss": 1.0361,
"step": 27
},
{
"epoch": 0.16374269005847952,
"grad_norm": 0.6916203141074345,
"learning_rate": 1.076923076923077e-05,
"loss": 1.0276,
"step": 28
},
{
"epoch": 0.1695906432748538,
"grad_norm": 0.6795075377629257,
"learning_rate": 1.1153846153846154e-05,
"loss": 1.0305,
"step": 29
},
{
"epoch": 0.17543859649122806,
"grad_norm": 0.7397958603300506,
"learning_rate": 1.1538461538461538e-05,
"loss": 1.036,
"step": 30
},
{
"epoch": 0.18128654970760233,
"grad_norm": 0.5914063886870811,
"learning_rate": 1.1923076923076925e-05,
"loss": 1.0643,
"step": 31
},
{
"epoch": 0.1871345029239766,
"grad_norm": 0.558807526586334,
"learning_rate": 1.230769230769231e-05,
"loss": 0.9457,
"step": 32
},
{
"epoch": 0.19298245614035087,
"grad_norm": 0.4962345963320037,
"learning_rate": 1.2692307692307693e-05,
"loss": 0.9556,
"step": 33
},
{
"epoch": 0.19883040935672514,
"grad_norm": 0.5368004540999115,
"learning_rate": 1.3076923076923078e-05,
"loss": 1.0031,
"step": 34
},
{
"epoch": 0.2046783625730994,
"grad_norm": 0.5193693046254093,
"learning_rate": 1.3461538461538463e-05,
"loss": 0.937,
"step": 35
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.42294351955291465,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.8972,
"step": 36
},
{
"epoch": 0.21637426900584794,
"grad_norm": 0.39791430214156615,
"learning_rate": 1.4230769230769232e-05,
"loss": 0.9484,
"step": 37
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.42681451896746464,
"learning_rate": 1.4615384615384615e-05,
"loss": 0.942,
"step": 38
},
{
"epoch": 0.22807017543859648,
"grad_norm": 0.39243989614880825,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.9379,
"step": 39
},
{
"epoch": 0.23391812865497075,
"grad_norm": 0.4195184915021303,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.9327,
"step": 40
},
{
"epoch": 0.23976608187134502,
"grad_norm": 0.3544937192321327,
"learning_rate": 1.576923076923077e-05,
"loss": 0.851,
"step": 41
},
{
"epoch": 0.24561403508771928,
"grad_norm": 0.3416373732580841,
"learning_rate": 1.6153846153846154e-05,
"loss": 0.8644,
"step": 42
},
{
"epoch": 0.25146198830409355,
"grad_norm": 0.4128427286910145,
"learning_rate": 1.653846153846154e-05,
"loss": 0.9002,
"step": 43
},
{
"epoch": 0.2573099415204678,
"grad_norm": 0.4386903858466522,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.8995,
"step": 44
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.3894766430305266,
"learning_rate": 1.730769230769231e-05,
"loss": 0.8796,
"step": 45
},
{
"epoch": 0.26900584795321636,
"grad_norm": 0.33237410703928805,
"learning_rate": 1.7692307692307694e-05,
"loss": 0.887,
"step": 46
},
{
"epoch": 0.27485380116959063,
"grad_norm": 0.3287665841977238,
"learning_rate": 1.807692307692308e-05,
"loss": 0.8444,
"step": 47
},
{
"epoch": 0.2807017543859649,
"grad_norm": 0.3109160417844228,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.8708,
"step": 48
},
{
"epoch": 0.28654970760233917,
"grad_norm": 0.30795401416756046,
"learning_rate": 1.8846153846153846e-05,
"loss": 0.8434,
"step": 49
},
{
"epoch": 0.29239766081871343,
"grad_norm": 0.3549208935855604,
"learning_rate": 1.923076923076923e-05,
"loss": 0.8526,
"step": 50
},
{
"epoch": 0.2982456140350877,
"grad_norm": 0.2755256325053317,
"learning_rate": 1.9615384615384617e-05,
"loss": 0.7736,
"step": 51
},
{
"epoch": 0.30409356725146197,
"grad_norm": 0.43817461634852256,
"learning_rate": 2e-05,
"loss": 0.8534,
"step": 52
},
{
"epoch": 0.30994152046783624,
"grad_norm": 0.3377814673600554,
"learning_rate": 1.995661605206074e-05,
"loss": 0.8019,
"step": 53
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.34154032226288855,
"learning_rate": 1.9913232104121476e-05,
"loss": 0.8458,
"step": 54
},
{
"epoch": 0.3216374269005848,
"grad_norm": 0.33647765891218867,
"learning_rate": 1.9869848156182215e-05,
"loss": 0.8527,
"step": 55
},
{
"epoch": 0.32748538011695905,
"grad_norm": 0.2933887985818341,
"learning_rate": 1.9826464208242954e-05,
"loss": 0.8182,
"step": 56
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.28265837293209917,
"learning_rate": 1.978308026030369e-05,
"loss": 0.8355,
"step": 57
},
{
"epoch": 0.3391812865497076,
"grad_norm": 0.2632405163872988,
"learning_rate": 1.973969631236443e-05,
"loss": 0.7543,
"step": 58
},
{
"epoch": 0.34502923976608185,
"grad_norm": 0.38062672853122476,
"learning_rate": 1.9696312364425164e-05,
"loss": 0.8183,
"step": 59
},
{
"epoch": 0.3508771929824561,
"grad_norm": 0.26245608696685946,
"learning_rate": 1.96529284164859e-05,
"loss": 0.8004,
"step": 60
},
{
"epoch": 0.3567251461988304,
"grad_norm": 0.32799401692804,
"learning_rate": 1.960954446854664e-05,
"loss": 0.8405,
"step": 61
},
{
"epoch": 0.36257309941520466,
"grad_norm": 0.29066553024128605,
"learning_rate": 1.9566160520607378e-05,
"loss": 0.8492,
"step": 62
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.28501467209616904,
"learning_rate": 1.9522776572668113e-05,
"loss": 0.8207,
"step": 63
},
{
"epoch": 0.3742690058479532,
"grad_norm": 0.2525036458551552,
"learning_rate": 1.9479392624728852e-05,
"loss": 0.779,
"step": 64
},
{
"epoch": 0.38011695906432746,
"grad_norm": 0.2920718950928194,
"learning_rate": 1.9436008676789588e-05,
"loss": 0.7937,
"step": 65
},
{
"epoch": 0.38596491228070173,
"grad_norm": 0.27183550316859734,
"learning_rate": 1.9392624728850327e-05,
"loss": 0.8344,
"step": 66
},
{
"epoch": 0.391812865497076,
"grad_norm": 0.272325024687968,
"learning_rate": 1.9349240780911066e-05,
"loss": 0.7577,
"step": 67
},
{
"epoch": 0.39766081871345027,
"grad_norm": 0.2761663772793096,
"learning_rate": 1.93058568329718e-05,
"loss": 0.8253,
"step": 68
},
{
"epoch": 0.40350877192982454,
"grad_norm": 0.3577604398976665,
"learning_rate": 1.926247288503254e-05,
"loss": 0.871,
"step": 69
},
{
"epoch": 0.4093567251461988,
"grad_norm": 0.3054954243342987,
"learning_rate": 1.921908893709328e-05,
"loss": 0.8485,
"step": 70
},
{
"epoch": 0.4152046783625731,
"grad_norm": 0.2295446772431491,
"learning_rate": 1.9175704989154015e-05,
"loss": 0.775,
"step": 71
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.27441930221043814,
"learning_rate": 1.9132321041214754e-05,
"loss": 0.7984,
"step": 72
},
{
"epoch": 0.4269005847953216,
"grad_norm": 0.25560502683198316,
"learning_rate": 1.908893709327549e-05,
"loss": 0.8089,
"step": 73
},
{
"epoch": 0.4327485380116959,
"grad_norm": 0.27391446302846595,
"learning_rate": 1.9045553145336228e-05,
"loss": 0.8194,
"step": 74
},
{
"epoch": 0.43859649122807015,
"grad_norm": 0.25049008602661516,
"learning_rate": 1.9002169197396964e-05,
"loss": 0.7685,
"step": 75
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.32703190034733925,
"learning_rate": 1.8958785249457703e-05,
"loss": 0.8045,
"step": 76
},
{
"epoch": 0.4502923976608187,
"grad_norm": 0.2461722936867296,
"learning_rate": 1.8915401301518438e-05,
"loss": 0.7747,
"step": 77
},
{
"epoch": 0.45614035087719296,
"grad_norm": 0.3049860315464052,
"learning_rate": 1.8872017353579177e-05,
"loss": 0.8265,
"step": 78
},
{
"epoch": 0.4619883040935672,
"grad_norm": 0.2769624138638705,
"learning_rate": 1.8828633405639916e-05,
"loss": 0.8186,
"step": 79
},
{
"epoch": 0.4678362573099415,
"grad_norm": 0.22632052204690653,
"learning_rate": 1.878524945770065e-05,
"loss": 0.7426,
"step": 80
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.2538308819987603,
"learning_rate": 1.874186550976139e-05,
"loss": 0.7849,
"step": 81
},
{
"epoch": 0.47953216374269003,
"grad_norm": 0.3146181235378422,
"learning_rate": 1.869848156182213e-05,
"loss": 0.8087,
"step": 82
},
{
"epoch": 0.4853801169590643,
"grad_norm": 0.22831617588223724,
"learning_rate": 1.8655097613882865e-05,
"loss": 0.7431,
"step": 83
},
{
"epoch": 0.49122807017543857,
"grad_norm": 0.24832072861713958,
"learning_rate": 1.8611713665943604e-05,
"loss": 0.7807,
"step": 84
},
{
"epoch": 0.49707602339181284,
"grad_norm": 0.28945761508471823,
"learning_rate": 1.856832971800434e-05,
"loss": 0.8025,
"step": 85
},
{
"epoch": 0.5029239766081871,
"grad_norm": 0.24882286573309492,
"learning_rate": 1.852494577006508e-05,
"loss": 0.8041,
"step": 86
},
{
"epoch": 0.5087719298245614,
"grad_norm": 0.2569507918826724,
"learning_rate": 1.8481561822125814e-05,
"loss": 0.8097,
"step": 87
},
{
"epoch": 0.5146198830409356,
"grad_norm": 0.2660930480772777,
"learning_rate": 1.8438177874186553e-05,
"loss": 0.7199,
"step": 88
},
{
"epoch": 0.52046783625731,
"grad_norm": 0.26945118834678633,
"learning_rate": 1.839479392624729e-05,
"loss": 0.8035,
"step": 89
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.2748667946921001,
"learning_rate": 1.8351409978308028e-05,
"loss": 0.8062,
"step": 90
},
{
"epoch": 0.5321637426900585,
"grad_norm": 0.2363367636075127,
"learning_rate": 1.8308026030368763e-05,
"loss": 0.7497,
"step": 91
},
{
"epoch": 0.5380116959064327,
"grad_norm": 0.2194408996520716,
"learning_rate": 1.8264642082429502e-05,
"loss": 0.7582,
"step": 92
},
{
"epoch": 0.543859649122807,
"grad_norm": 0.2479217006944137,
"learning_rate": 1.822125813449024e-05,
"loss": 0.7816,
"step": 93
},
{
"epoch": 0.5497076023391813,
"grad_norm": 0.24365954457591307,
"learning_rate": 1.8177874186550977e-05,
"loss": 0.7951,
"step": 94
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.2480572301895391,
"learning_rate": 1.8134490238611715e-05,
"loss": 0.7808,
"step": 95
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.24464048645651124,
"learning_rate": 1.8091106290672454e-05,
"loss": 0.7153,
"step": 96
},
{
"epoch": 0.5672514619883041,
"grad_norm": 0.23776979402481216,
"learning_rate": 1.804772234273319e-05,
"loss": 0.7168,
"step": 97
},
{
"epoch": 0.5730994152046783,
"grad_norm": 0.2779826898090206,
"learning_rate": 1.800433839479393e-05,
"loss": 0.784,
"step": 98
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.2625471662464305,
"learning_rate": 1.7960954446854664e-05,
"loss": 0.7575,
"step": 99
},
{
"epoch": 0.5847953216374269,
"grad_norm": 0.24973722791738373,
"learning_rate": 1.7917570498915403e-05,
"loss": 0.7604,
"step": 100
},
{
"epoch": 0.5906432748538012,
"grad_norm": 0.24882129597326091,
"learning_rate": 1.787418655097614e-05,
"loss": 0.7571,
"step": 101
},
{
"epoch": 0.5964912280701754,
"grad_norm": 0.2490465646513338,
"learning_rate": 1.7830802603036878e-05,
"loss": 0.7728,
"step": 102
},
{
"epoch": 0.6023391812865497,
"grad_norm": 0.2803127473261744,
"learning_rate": 1.7787418655097614e-05,
"loss": 0.7486,
"step": 103
},
{
"epoch": 0.6081871345029239,
"grad_norm": 0.30875931205277196,
"learning_rate": 1.7744034707158352e-05,
"loss": 0.7747,
"step": 104
},
{
"epoch": 0.6140350877192983,
"grad_norm": 0.249801739956383,
"learning_rate": 1.770065075921909e-05,
"loss": 0.7719,
"step": 105
},
{
"epoch": 0.6198830409356725,
"grad_norm": 0.2493900745685089,
"learning_rate": 1.7657266811279827e-05,
"loss": 0.7517,
"step": 106
},
{
"epoch": 0.6257309941520468,
"grad_norm": 0.2217608176730444,
"learning_rate": 1.7613882863340566e-05,
"loss": 0.7385,
"step": 107
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.23151529808146598,
"learning_rate": 1.7570498915401305e-05,
"loss": 0.7092,
"step": 108
},
{
"epoch": 0.6374269005847953,
"grad_norm": 0.2648606357036367,
"learning_rate": 1.752711496746204e-05,
"loss": 0.7748,
"step": 109
},
{
"epoch": 0.6432748538011696,
"grad_norm": 0.22637593754873542,
"learning_rate": 1.748373101952278e-05,
"loss": 0.7594,
"step": 110
},
{
"epoch": 0.6491228070175439,
"grad_norm": 0.24569329004133555,
"learning_rate": 1.7440347071583515e-05,
"loss": 0.7662,
"step": 111
},
{
"epoch": 0.6549707602339181,
"grad_norm": 0.23086082605618571,
"learning_rate": 1.7396963123644254e-05,
"loss": 0.7291,
"step": 112
},
{
"epoch": 0.6608187134502924,
"grad_norm": 0.23164513757355204,
"learning_rate": 1.735357917570499e-05,
"loss": 0.761,
"step": 113
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.2341951309434963,
"learning_rate": 1.731019522776573e-05,
"loss": 0.7707,
"step": 114
},
{
"epoch": 0.672514619883041,
"grad_norm": 0.2294815579241083,
"learning_rate": 1.7266811279826464e-05,
"loss": 0.7307,
"step": 115
},
{
"epoch": 0.6783625730994152,
"grad_norm": 0.2425767445634441,
"learning_rate": 1.7223427331887203e-05,
"loss": 0.7573,
"step": 116
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.22967591410278537,
"learning_rate": 1.718004338394794e-05,
"loss": 0.7513,
"step": 117
},
{
"epoch": 0.6900584795321637,
"grad_norm": 0.26903092877754314,
"learning_rate": 1.7136659436008677e-05,
"loss": 0.7858,
"step": 118
},
{
"epoch": 0.695906432748538,
"grad_norm": 0.2571480378610959,
"learning_rate": 1.7093275488069416e-05,
"loss": 0.7736,
"step": 119
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.23273043019862788,
"learning_rate": 1.7049891540130152e-05,
"loss": 0.7669,
"step": 120
},
{
"epoch": 0.7076023391812866,
"grad_norm": 0.23314091686361454,
"learning_rate": 1.700650759219089e-05,
"loss": 0.7699,
"step": 121
},
{
"epoch": 0.7134502923976608,
"grad_norm": 0.26268224212689045,
"learning_rate": 1.696312364425163e-05,
"loss": 0.7832,
"step": 122
},
{
"epoch": 0.7192982456140351,
"grad_norm": 0.26423904380170976,
"learning_rate": 1.6919739696312365e-05,
"loss": 0.7595,
"step": 123
},
{
"epoch": 0.7251461988304093,
"grad_norm": 0.21495414583172803,
"learning_rate": 1.6876355748373104e-05,
"loss": 0.7106,
"step": 124
},
{
"epoch": 0.7309941520467836,
"grad_norm": 0.2111254963997244,
"learning_rate": 1.6832971800433843e-05,
"loss": 0.7455,
"step": 125
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.2156942153910527,
"learning_rate": 1.678958785249458e-05,
"loss": 0.69,
"step": 126
},
{
"epoch": 0.7426900584795322,
"grad_norm": 0.20057578031019538,
"learning_rate": 1.6746203904555314e-05,
"loss": 0.7253,
"step": 127
},
{
"epoch": 0.7485380116959064,
"grad_norm": 0.258323958272931,
"learning_rate": 1.6702819956616053e-05,
"loss": 0.7156,
"step": 128
},
{
"epoch": 0.7543859649122807,
"grad_norm": 0.23301112011268071,
"learning_rate": 1.665943600867679e-05,
"loss": 0.7562,
"step": 129
},
{
"epoch": 0.7602339181286549,
"grad_norm": 0.27354281471105707,
"learning_rate": 1.6616052060737528e-05,
"loss": 0.7494,
"step": 130
},
{
"epoch": 0.7660818713450293,
"grad_norm": 0.25737706341844985,
"learning_rate": 1.6572668112798267e-05,
"loss": 0.7471,
"step": 131
},
{
"epoch": 0.7719298245614035,
"grad_norm": 0.2112391813708006,
"learning_rate": 1.6529284164859002e-05,
"loss": 0.7296,
"step": 132
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.2066541279425585,
"learning_rate": 1.648590021691974e-05,
"loss": 0.7427,
"step": 133
},
{
"epoch": 0.783625730994152,
"grad_norm": 0.21492978047244818,
"learning_rate": 1.644251626898048e-05,
"loss": 0.6956,
"step": 134
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.22539724372329056,
"learning_rate": 1.6399132321041216e-05,
"loss": 0.7358,
"step": 135
},
{
"epoch": 0.7953216374269005,
"grad_norm": 0.223824231061946,
"learning_rate": 1.6355748373101955e-05,
"loss": 0.747,
"step": 136
},
{
"epoch": 0.8011695906432749,
"grad_norm": 0.22433634692844312,
"learning_rate": 1.631236442516269e-05,
"loss": 0.7478,
"step": 137
},
{
"epoch": 0.8070175438596491,
"grad_norm": 0.2355525364186235,
"learning_rate": 1.626898047722343e-05,
"loss": 0.7539,
"step": 138
},
{
"epoch": 0.8128654970760234,
"grad_norm": 0.22774103617994296,
"learning_rate": 1.6225596529284168e-05,
"loss": 0.7227,
"step": 139
},
{
"epoch": 0.8187134502923976,
"grad_norm": 0.24837995707152566,
"learning_rate": 1.6182212581344904e-05,
"loss": 0.7048,
"step": 140
},
{
"epoch": 0.8245614035087719,
"grad_norm": 0.2165941656087455,
"learning_rate": 1.613882863340564e-05,
"loss": 0.7095,
"step": 141
},
{
"epoch": 0.8304093567251462,
"grad_norm": 0.24496476577766357,
"learning_rate": 1.609544468546638e-05,
"loss": 0.731,
"step": 142
},
{
"epoch": 0.8362573099415205,
"grad_norm": 0.2275760050109454,
"learning_rate": 1.6052060737527114e-05,
"loss": 0.7159,
"step": 143
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.203518916790755,
"learning_rate": 1.6008676789587853e-05,
"loss": 0.6462,
"step": 144
},
{
"epoch": 0.847953216374269,
"grad_norm": 0.24268384602078139,
"learning_rate": 1.5965292841648592e-05,
"loss": 0.7206,
"step": 145
},
{
"epoch": 0.8538011695906432,
"grad_norm": 0.2911481588164572,
"learning_rate": 1.5921908893709327e-05,
"loss": 0.766,
"step": 146
},
{
"epoch": 0.8596491228070176,
"grad_norm": 0.25277324694147335,
"learning_rate": 1.5878524945770066e-05,
"loss": 0.7501,
"step": 147
},
{
"epoch": 0.8654970760233918,
"grad_norm": 0.2372457450088363,
"learning_rate": 1.5835140997830805e-05,
"loss": 0.712,
"step": 148
},
{
"epoch": 0.8713450292397661,
"grad_norm": 0.19877008506291952,
"learning_rate": 1.579175704989154e-05,
"loss": 0.7146,
"step": 149
},
{
"epoch": 0.8771929824561403,
"grad_norm": 0.27732708769815756,
"learning_rate": 1.574837310195228e-05,
"loss": 0.7347,
"step": 150
},
{
"epoch": 0.8830409356725146,
"grad_norm": 0.20303134209612006,
"learning_rate": 1.570498915401302e-05,
"loss": 0.71,
"step": 151
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.23703380426454326,
"learning_rate": 1.5661605206073754e-05,
"loss": 0.6915,
"step": 152
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.23526601753982804,
"learning_rate": 1.5618221258134493e-05,
"loss": 0.72,
"step": 153
},
{
"epoch": 0.9005847953216374,
"grad_norm": 0.2408627140057496,
"learning_rate": 1.557483731019523e-05,
"loss": 0.7206,
"step": 154
},
{
"epoch": 0.9064327485380117,
"grad_norm": 0.22070261442759123,
"learning_rate": 1.5531453362255964e-05,
"loss": 0.7019,
"step": 155
},
{
"epoch": 0.9122807017543859,
"grad_norm": 0.236776997470983,
"learning_rate": 1.5488069414316703e-05,
"loss": 0.7314,
"step": 156
},
{
"epoch": 0.9181286549707602,
"grad_norm": 0.25431877096559957,
"learning_rate": 1.5444685466377442e-05,
"loss": 0.7663,
"step": 157
},
{
"epoch": 0.9239766081871345,
"grad_norm": 0.2934790109300597,
"learning_rate": 1.5401301518438178e-05,
"loss": 0.7388,
"step": 158
},
{
"epoch": 0.9298245614035088,
"grad_norm": 0.2287254855752223,
"learning_rate": 1.5357917570498917e-05,
"loss": 0.703,
"step": 159
},
{
"epoch": 0.935672514619883,
"grad_norm": 0.21116594695108679,
"learning_rate": 1.5314533622559656e-05,
"loss": 0.7228,
"step": 160
},
{
"epoch": 0.9415204678362573,
"grad_norm": 0.25825565901072856,
"learning_rate": 1.527114967462039e-05,
"loss": 0.7791,
"step": 161
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.23103746722781796,
"learning_rate": 1.522776572668113e-05,
"loss": 0.6951,
"step": 162
},
{
"epoch": 0.9532163742690059,
"grad_norm": 0.2580198439201409,
"learning_rate": 1.5184381778741866e-05,
"loss": 0.741,
"step": 163
},
{
"epoch": 0.9590643274853801,
"grad_norm": 0.2974306573786225,
"learning_rate": 1.5140997830802605e-05,
"loss": 0.725,
"step": 164
},
{
"epoch": 0.9649122807017544,
"grad_norm": 0.26570078456731205,
"learning_rate": 1.5097613882863342e-05,
"loss": 0.7467,
"step": 165
},
{
"epoch": 0.9707602339181286,
"grad_norm": 0.4533476269871839,
"learning_rate": 1.5054229934924078e-05,
"loss": 0.6971,
"step": 166
},
{
"epoch": 0.9766081871345029,
"grad_norm": 0.23895703831919585,
"learning_rate": 1.5010845986984816e-05,
"loss": 0.7341,
"step": 167
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.20339364928582415,
"learning_rate": 1.4967462039045555e-05,
"loss": 0.6757,
"step": 168
},
{
"epoch": 0.9883040935672515,
"grad_norm": 0.23049708494261534,
"learning_rate": 1.4924078091106291e-05,
"loss": 0.7236,
"step": 169
},
{
"epoch": 0.9941520467836257,
"grad_norm": 0.2026976413223512,
"learning_rate": 1.488069414316703e-05,
"loss": 0.7006,
"step": 170
},
{
"epoch": 1.0,
"grad_norm": 0.22355532686958146,
"learning_rate": 1.4837310195227767e-05,
"loss": 0.701,
"step": 171
},
{
"epoch": 1.0058479532163742,
"grad_norm": 0.20282760221622007,
"learning_rate": 1.4793926247288504e-05,
"loss": 0.7168,
"step": 172
},
{
"epoch": 1.0116959064327484,
"grad_norm": 0.19949275591479185,
"learning_rate": 1.4750542299349242e-05,
"loss": 0.6984,
"step": 173
},
{
"epoch": 1.0175438596491229,
"grad_norm": 0.21384886986850865,
"learning_rate": 1.470715835140998e-05,
"loss": 0.7272,
"step": 174
},
{
"epoch": 1.023391812865497,
"grad_norm": 0.2085869267067017,
"learning_rate": 1.4663774403470716e-05,
"loss": 0.7044,
"step": 175
},
{
"epoch": 1.0292397660818713,
"grad_norm": 0.20631353790684379,
"learning_rate": 1.4620390455531455e-05,
"loss": 0.7181,
"step": 176
},
{
"epoch": 1.0350877192982457,
"grad_norm": 0.24538509221900098,
"learning_rate": 1.4577006507592192e-05,
"loss": 0.7731,
"step": 177
},
{
"epoch": 1.04093567251462,
"grad_norm": 0.23156823897416134,
"learning_rate": 1.453362255965293e-05,
"loss": 0.7129,
"step": 178
},
{
"epoch": 1.0467836257309941,
"grad_norm": 0.20155082532453575,
"learning_rate": 1.4490238611713667e-05,
"loss": 0.7037,
"step": 179
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.19242380310026896,
"learning_rate": 1.4446854663774406e-05,
"loss": 0.7026,
"step": 180
},
{
"epoch": 1.0584795321637426,
"grad_norm": 0.21376599859201403,
"learning_rate": 1.4403470715835141e-05,
"loss": 0.7021,
"step": 181
},
{
"epoch": 1.064327485380117,
"grad_norm": 0.21321842835439078,
"learning_rate": 1.436008676789588e-05,
"loss": 0.7186,
"step": 182
},
{
"epoch": 1.0701754385964912,
"grad_norm": 0.23152992175479814,
"learning_rate": 1.4316702819956618e-05,
"loss": 0.7262,
"step": 183
},
{
"epoch": 1.0760233918128654,
"grad_norm": 0.20707778685395156,
"learning_rate": 1.4273318872017355e-05,
"loss": 0.742,
"step": 184
},
{
"epoch": 1.0818713450292399,
"grad_norm": 0.21284401184030297,
"learning_rate": 1.4229934924078092e-05,
"loss": 0.683,
"step": 185
},
{
"epoch": 1.087719298245614,
"grad_norm": 0.21105448131636317,
"learning_rate": 1.418655097613883e-05,
"loss": 0.7218,
"step": 186
},
{
"epoch": 1.0935672514619883,
"grad_norm": 0.23854659151648439,
"learning_rate": 1.4143167028199567e-05,
"loss": 0.707,
"step": 187
},
{
"epoch": 1.0994152046783625,
"grad_norm": 0.1979900232322942,
"learning_rate": 1.4099783080260306e-05,
"loss": 0.6793,
"step": 188
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.19940118749588795,
"learning_rate": 1.4056399132321041e-05,
"loss": 0.6793,
"step": 189
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.2216608207413802,
"learning_rate": 1.401301518438178e-05,
"loss": 0.7183,
"step": 190
},
{
"epoch": 1.1169590643274854,
"grad_norm": 0.19705996044476262,
"learning_rate": 1.3969631236442517e-05,
"loss": 0.692,
"step": 191
},
{
"epoch": 1.1228070175438596,
"grad_norm": 0.18840081658391272,
"learning_rate": 1.3926247288503255e-05,
"loss": 0.69,
"step": 192
},
{
"epoch": 1.128654970760234,
"grad_norm": 0.22778993399760028,
"learning_rate": 1.3882863340563992e-05,
"loss": 0.7458,
"step": 193
},
{
"epoch": 1.1345029239766082,
"grad_norm": 0.19922962343898284,
"learning_rate": 1.3839479392624731e-05,
"loss": 0.6935,
"step": 194
},
{
"epoch": 1.1403508771929824,
"grad_norm": 0.17961965737395658,
"learning_rate": 1.3796095444685466e-05,
"loss": 0.6902,
"step": 195
},
{
"epoch": 1.1461988304093567,
"grad_norm": 0.20117480573787744,
"learning_rate": 1.3752711496746205e-05,
"loss": 0.6966,
"step": 196
},
{
"epoch": 1.1520467836257309,
"grad_norm": 0.20576287270564314,
"learning_rate": 1.3709327548806943e-05,
"loss": 0.6626,
"step": 197
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.20954364596102132,
"learning_rate": 1.366594360086768e-05,
"loss": 0.712,
"step": 198
},
{
"epoch": 1.1637426900584795,
"grad_norm": 0.18682996007735939,
"learning_rate": 1.3622559652928417e-05,
"loss": 0.7075,
"step": 199
},
{
"epoch": 1.1695906432748537,
"grad_norm": 0.20043695366127617,
"learning_rate": 1.3579175704989156e-05,
"loss": 0.688,
"step": 200
},
{
"epoch": 1.1754385964912282,
"grad_norm": 0.19280097802899304,
"learning_rate": 1.3535791757049892e-05,
"loss": 0.7177,
"step": 201
},
{
"epoch": 1.1812865497076024,
"grad_norm": 0.1857970119964957,
"learning_rate": 1.349240780911063e-05,
"loss": 0.6463,
"step": 202
},
{
"epoch": 1.1871345029239766,
"grad_norm": 0.1825176976963816,
"learning_rate": 1.3449023861171368e-05,
"loss": 0.6673,
"step": 203
},
{
"epoch": 1.1929824561403508,
"grad_norm": 0.22051713697050027,
"learning_rate": 1.3405639913232105e-05,
"loss": 0.7145,
"step": 204
},
{
"epoch": 1.198830409356725,
"grad_norm": 0.18423219666459137,
"learning_rate": 1.3362255965292842e-05,
"loss": 0.6528,
"step": 205
},
{
"epoch": 1.2046783625730995,
"grad_norm": 0.19618225427002017,
"learning_rate": 1.3318872017353581e-05,
"loss": 0.6668,
"step": 206
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.20587148922859191,
"learning_rate": 1.3275488069414317e-05,
"loss": 0.7067,
"step": 207
},
{
"epoch": 1.2163742690058479,
"grad_norm": 0.2090448851687986,
"learning_rate": 1.3232104121475056e-05,
"loss": 0.7029,
"step": 208
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.19626708957217415,
"learning_rate": 1.3188720173535795e-05,
"loss": 0.6662,
"step": 209
},
{
"epoch": 1.2280701754385965,
"grad_norm": 0.18762036234283117,
"learning_rate": 1.314533622559653e-05,
"loss": 0.6874,
"step": 210
},
{
"epoch": 1.2339181286549707,
"grad_norm": 0.19417670023667025,
"learning_rate": 1.3101952277657268e-05,
"loss": 0.6683,
"step": 211
},
{
"epoch": 1.239766081871345,
"grad_norm": 0.20177458796436643,
"learning_rate": 1.3058568329718005e-05,
"loss": 0.685,
"step": 212
},
{
"epoch": 1.2456140350877192,
"grad_norm": 0.22040877827401095,
"learning_rate": 1.3015184381778742e-05,
"loss": 0.7254,
"step": 213
},
{
"epoch": 1.2514619883040936,
"grad_norm": 0.19637215780432019,
"learning_rate": 1.2971800433839481e-05,
"loss": 0.6897,
"step": 214
},
{
"epoch": 1.2573099415204678,
"grad_norm": 0.199110748095673,
"learning_rate": 1.2928416485900217e-05,
"loss": 0.6854,
"step": 215
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.21819712890299467,
"learning_rate": 1.2885032537960956e-05,
"loss": 0.6986,
"step": 216
},
{
"epoch": 1.2690058479532165,
"grad_norm": 0.21142557814635124,
"learning_rate": 1.2841648590021693e-05,
"loss": 0.7203,
"step": 217
},
{
"epoch": 1.2748538011695907,
"grad_norm": 0.18250187399866635,
"learning_rate": 1.279826464208243e-05,
"loss": 0.6785,
"step": 218
},
{
"epoch": 1.280701754385965,
"grad_norm": 0.19755959536466466,
"learning_rate": 1.2754880694143167e-05,
"loss": 0.6706,
"step": 219
},
{
"epoch": 1.286549707602339,
"grad_norm": 0.19529246308103604,
"learning_rate": 1.2711496746203906e-05,
"loss": 0.6998,
"step": 220
},
{
"epoch": 1.2923976608187133,
"grad_norm": 0.1936160811683211,
"learning_rate": 1.2668112798264642e-05,
"loss": 0.6896,
"step": 221
},
{
"epoch": 1.2982456140350878,
"grad_norm": 0.1845218398315034,
"learning_rate": 1.262472885032538e-05,
"loss": 0.6568,
"step": 222
},
{
"epoch": 1.304093567251462,
"grad_norm": 0.20772884369385505,
"learning_rate": 1.258134490238612e-05,
"loss": 0.6625,
"step": 223
},
{
"epoch": 1.3099415204678362,
"grad_norm": 0.229042568059284,
"learning_rate": 1.2537960954446855e-05,
"loss": 0.6861,
"step": 224
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.20350171741172374,
"learning_rate": 1.2494577006507593e-05,
"loss": 0.6478,
"step": 225
},
{
"epoch": 1.3216374269005848,
"grad_norm": 0.19144221585747292,
"learning_rate": 1.2451193058568331e-05,
"loss": 0.6764,
"step": 226
},
{
"epoch": 1.327485380116959,
"grad_norm": 0.21913738701924326,
"learning_rate": 1.2407809110629067e-05,
"loss": 0.695,
"step": 227
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.2020711158267139,
"learning_rate": 1.2364425162689806e-05,
"loss": 0.7062,
"step": 228
},
{
"epoch": 1.3391812865497075,
"grad_norm": 0.21899620359258645,
"learning_rate": 1.2321041214750545e-05,
"loss": 0.7145,
"step": 229
},
{
"epoch": 1.345029239766082,
"grad_norm": 0.18931923720637447,
"learning_rate": 1.227765726681128e-05,
"loss": 0.6956,
"step": 230
},
{
"epoch": 1.3508771929824561,
"grad_norm": 0.1916810843880607,
"learning_rate": 1.223427331887202e-05,
"loss": 0.668,
"step": 231
},
{
"epoch": 1.3567251461988303,
"grad_norm": 0.19261705533668297,
"learning_rate": 1.2190889370932757e-05,
"loss": 0.653,
"step": 232
},
{
"epoch": 1.3625730994152048,
"grad_norm": 0.20814835626639575,
"learning_rate": 1.2147505422993492e-05,
"loss": 0.7068,
"step": 233
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.2076525513781835,
"learning_rate": 1.2104121475054231e-05,
"loss": 0.6989,
"step": 234
},
{
"epoch": 1.3742690058479532,
"grad_norm": 0.1911948741286201,
"learning_rate": 1.2060737527114967e-05,
"loss": 0.6774,
"step": 235
},
{
"epoch": 1.3801169590643274,
"grad_norm": 0.2100123955547407,
"learning_rate": 1.2017353579175706e-05,
"loss": 0.6997,
"step": 236
},
{
"epoch": 1.3859649122807016,
"grad_norm": 0.31584573390504456,
"learning_rate": 1.1973969631236445e-05,
"loss": 0.7052,
"step": 237
},
{
"epoch": 1.391812865497076,
"grad_norm": 0.18688166233524203,
"learning_rate": 1.193058568329718e-05,
"loss": 0.6526,
"step": 238
},
{
"epoch": 1.3976608187134503,
"grad_norm": 0.22026356851753442,
"learning_rate": 1.1887201735357918e-05,
"loss": 0.6454,
"step": 239
},
{
"epoch": 1.4035087719298245,
"grad_norm": 0.19323076025261185,
"learning_rate": 1.1843817787418656e-05,
"loss": 0.6594,
"step": 240
},
{
"epoch": 1.409356725146199,
"grad_norm": 0.19902277064282112,
"learning_rate": 1.1800433839479392e-05,
"loss": 0.7244,
"step": 241
},
{
"epoch": 1.4152046783625731,
"grad_norm": 0.1908671762046153,
"learning_rate": 1.1757049891540131e-05,
"loss": 0.6681,
"step": 242
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.19560133699568794,
"learning_rate": 1.171366594360087e-05,
"loss": 0.6731,
"step": 243
},
{
"epoch": 1.4269005847953216,
"grad_norm": 0.2094438443568091,
"learning_rate": 1.1670281995661605e-05,
"loss": 0.701,
"step": 244
},
{
"epoch": 1.4327485380116958,
"grad_norm": 0.19053569086952576,
"learning_rate": 1.1626898047722344e-05,
"loss": 0.6104,
"step": 245
},
{
"epoch": 1.4385964912280702,
"grad_norm": 0.19913609339747246,
"learning_rate": 1.1583514099783082e-05,
"loss": 0.6573,
"step": 246
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.20956879358597585,
"learning_rate": 1.1540130151843817e-05,
"loss": 0.662,
"step": 247
},
{
"epoch": 1.4502923976608186,
"grad_norm": 0.20216430625120646,
"learning_rate": 1.1496746203904556e-05,
"loss": 0.6505,
"step": 248
},
{
"epoch": 1.456140350877193,
"grad_norm": 0.2061734262125184,
"learning_rate": 1.1453362255965295e-05,
"loss": 0.6786,
"step": 249
},
{
"epoch": 1.4619883040935673,
"grad_norm": 0.22574876209542377,
"learning_rate": 1.140997830802603e-05,
"loss": 0.7325,
"step": 250
},
{
"epoch": 1.4678362573099415,
"grad_norm": 0.1772735034592302,
"learning_rate": 1.136659436008677e-05,
"loss": 0.636,
"step": 251
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.2073376585966582,
"learning_rate": 1.1323210412147507e-05,
"loss": 0.6791,
"step": 252
},
{
"epoch": 1.47953216374269,
"grad_norm": 0.18918995778508665,
"learning_rate": 1.1279826464208244e-05,
"loss": 0.6406,
"step": 253
},
{
"epoch": 1.4853801169590644,
"grad_norm": 0.20195402902912296,
"learning_rate": 1.1236442516268981e-05,
"loss": 0.6625,
"step": 254
},
{
"epoch": 1.4912280701754386,
"grad_norm": 0.18582829458374092,
"learning_rate": 1.119305856832972e-05,
"loss": 0.6831,
"step": 255
},
{
"epoch": 1.4970760233918128,
"grad_norm": 0.18667034513926425,
"learning_rate": 1.1149674620390456e-05,
"loss": 0.6819,
"step": 256
},
{
"epoch": 1.5029239766081872,
"grad_norm": 0.1884977125227984,
"learning_rate": 1.1106290672451195e-05,
"loss": 0.6515,
"step": 257
},
{
"epoch": 1.5087719298245614,
"grad_norm": 0.19917650464796147,
"learning_rate": 1.1062906724511932e-05,
"loss": 0.672,
"step": 258
},
{
"epoch": 1.5146198830409356,
"grad_norm": 0.20496434407592237,
"learning_rate": 1.101952277657267e-05,
"loss": 0.6538,
"step": 259
},
{
"epoch": 1.52046783625731,
"grad_norm": 0.18169707048812828,
"learning_rate": 1.0976138828633407e-05,
"loss": 0.661,
"step": 260
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.22056891228087572,
"learning_rate": 1.0932754880694142e-05,
"loss": 0.6929,
"step": 261
},
{
"epoch": 1.5321637426900585,
"grad_norm": 0.2085232928793704,
"learning_rate": 1.0889370932754881e-05,
"loss": 0.6954,
"step": 262
},
{
"epoch": 1.5380116959064327,
"grad_norm": 0.20789260798479195,
"learning_rate": 1.084598698481562e-05,
"loss": 0.7011,
"step": 263
},
{
"epoch": 1.543859649122807,
"grad_norm": 0.1849807776906847,
"learning_rate": 1.0802603036876356e-05,
"loss": 0.686,
"step": 264
},
{
"epoch": 1.5497076023391814,
"grad_norm": 0.18518274667657642,
"learning_rate": 1.0759219088937095e-05,
"loss": 0.6636,
"step": 265
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.19333183404204385,
"learning_rate": 1.0715835140997832e-05,
"loss": 0.7133,
"step": 266
},
{
"epoch": 1.5614035087719298,
"grad_norm": 0.19922629243071752,
"learning_rate": 1.0672451193058569e-05,
"loss": 0.6745,
"step": 267
},
{
"epoch": 1.5672514619883042,
"grad_norm": 0.1895519362467185,
"learning_rate": 1.0629067245119306e-05,
"loss": 0.6648,
"step": 268
},
{
"epoch": 1.5730994152046782,
"grad_norm": 0.1871315579144127,
"learning_rate": 1.0585683297180045e-05,
"loss": 0.6721,
"step": 269
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.18380449023430315,
"learning_rate": 1.0542299349240781e-05,
"loss": 0.6697,
"step": 270
},
{
"epoch": 1.5847953216374269,
"grad_norm": 0.2562545867136886,
"learning_rate": 1.049891540130152e-05,
"loss": 0.6969,
"step": 271
},
{
"epoch": 1.590643274853801,
"grad_norm": 0.1740952081571547,
"learning_rate": 1.0455531453362257e-05,
"loss": 0.6282,
"step": 272
},
{
"epoch": 1.5964912280701755,
"grad_norm": 0.1800104491661315,
"learning_rate": 1.0412147505422994e-05,
"loss": 0.666,
"step": 273
},
{
"epoch": 1.6023391812865497,
"grad_norm": 0.21004999295392382,
"learning_rate": 1.0368763557483732e-05,
"loss": 0.6849,
"step": 274
},
{
"epoch": 1.608187134502924,
"grad_norm": 0.1787656466284205,
"learning_rate": 1.032537960954447e-05,
"loss": 0.6723,
"step": 275
},
{
"epoch": 1.6140350877192984,
"grad_norm": 0.21871948943154398,
"learning_rate": 1.0281995661605206e-05,
"loss": 0.6901,
"step": 276
},
{
"epoch": 1.6198830409356724,
"grad_norm": 0.18361595886864504,
"learning_rate": 1.0238611713665945e-05,
"loss": 0.6421,
"step": 277
},
{
"epoch": 1.6257309941520468,
"grad_norm": 0.18916065927378428,
"learning_rate": 1.0195227765726682e-05,
"loss": 0.6511,
"step": 278
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.1979017696376173,
"learning_rate": 1.015184381778742e-05,
"loss": 0.7018,
"step": 279
},
{
"epoch": 1.6374269005847952,
"grad_norm": 0.18969323017848436,
"learning_rate": 1.0108459869848157e-05,
"loss": 0.6677,
"step": 280
},
{
"epoch": 1.6432748538011697,
"grad_norm": 0.18594561560924552,
"learning_rate": 1.0065075921908896e-05,
"loss": 0.6584,
"step": 281
},
{
"epoch": 1.6491228070175439,
"grad_norm": 0.17998449840117228,
"learning_rate": 1.0021691973969631e-05,
"loss": 0.6616,
"step": 282
},
{
"epoch": 1.654970760233918,
"grad_norm": 0.18463090829340062,
"learning_rate": 9.97830802603037e-06,
"loss": 0.6718,
"step": 283
},
{
"epoch": 1.6608187134502925,
"grad_norm": 0.1941128688320993,
"learning_rate": 9.934924078091108e-06,
"loss": 0.7071,
"step": 284
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.19238570224026413,
"learning_rate": 9.891540130151845e-06,
"loss": 0.6732,
"step": 285
},
{
"epoch": 1.672514619883041,
"grad_norm": 0.19518291083148756,
"learning_rate": 9.848156182212582e-06,
"loss": 0.6761,
"step": 286
},
{
"epoch": 1.6783625730994152,
"grad_norm": 0.18202289710684016,
"learning_rate": 9.80477223427332e-06,
"loss": 0.6577,
"step": 287
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.18625725493648193,
"learning_rate": 9.761388286334057e-06,
"loss": 0.7017,
"step": 288
},
{
"epoch": 1.6900584795321638,
"grad_norm": 0.2100099826676321,
"learning_rate": 9.718004338394794e-06,
"loss": 0.6858,
"step": 289
},
{
"epoch": 1.695906432748538,
"grad_norm": 0.2084953033980061,
"learning_rate": 9.674620390455533e-06,
"loss": 0.6674,
"step": 290
},
{
"epoch": 1.7017543859649122,
"grad_norm": 0.18596234796611538,
"learning_rate": 9.63123644251627e-06,
"loss": 0.6496,
"step": 291
},
{
"epoch": 1.7076023391812867,
"grad_norm": 0.1978635671319887,
"learning_rate": 9.587852494577007e-06,
"loss": 0.6642,
"step": 292
},
{
"epoch": 1.7134502923976607,
"grad_norm": 0.1979317376200934,
"learning_rate": 9.544468546637745e-06,
"loss": 0.7282,
"step": 293
},
{
"epoch": 1.719298245614035,
"grad_norm": 0.1792470289825809,
"learning_rate": 9.501084598698482e-06,
"loss": 0.661,
"step": 294
},
{
"epoch": 1.7251461988304093,
"grad_norm": 0.18979635817761115,
"learning_rate": 9.457700650759219e-06,
"loss": 0.6911,
"step": 295
},
{
"epoch": 1.7309941520467835,
"grad_norm": 0.21977929643672667,
"learning_rate": 9.414316702819958e-06,
"loss": 0.6636,
"step": 296
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.19142793950578896,
"learning_rate": 9.370932754880695e-06,
"loss": 0.6652,
"step": 297
},
{
"epoch": 1.7426900584795322,
"grad_norm": 0.1872314527946603,
"learning_rate": 9.327548806941433e-06,
"loss": 0.7241,
"step": 298
},
{
"epoch": 1.7485380116959064,
"grad_norm": 0.21294716763423086,
"learning_rate": 9.28416485900217e-06,
"loss": 0.6505,
"step": 299
},
{
"epoch": 1.7543859649122808,
"grad_norm": 0.20627848491038323,
"learning_rate": 9.240780911062907e-06,
"loss": 0.6839,
"step": 300
},
{
"epoch": 1.7602339181286548,
"grad_norm": 0.1913775128261492,
"learning_rate": 9.197396963123644e-06,
"loss": 0.7072,
"step": 301
},
{
"epoch": 1.7660818713450293,
"grad_norm": 0.18287999729259147,
"learning_rate": 9.154013015184382e-06,
"loss": 0.6571,
"step": 302
},
{
"epoch": 1.7719298245614035,
"grad_norm": 0.1743048128512118,
"learning_rate": 9.11062906724512e-06,
"loss": 0.6404,
"step": 303
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.18832199972802494,
"learning_rate": 9.067245119305858e-06,
"loss": 0.6853,
"step": 304
},
{
"epoch": 1.7836257309941521,
"grad_norm": 0.20655204935711033,
"learning_rate": 9.023861171366595e-06,
"loss": 0.7093,
"step": 305
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.19209280973506734,
"learning_rate": 8.980477223427332e-06,
"loss": 0.6548,
"step": 306
},
{
"epoch": 1.7953216374269005,
"grad_norm": 0.1885931981782652,
"learning_rate": 8.93709327548807e-06,
"loss": 0.6558,
"step": 307
},
{
"epoch": 1.801169590643275,
"grad_norm": 0.1962953890386984,
"learning_rate": 8.893709327548807e-06,
"loss": 0.6586,
"step": 308
},
{
"epoch": 1.807017543859649,
"grad_norm": 0.19945775782899686,
"learning_rate": 8.850325379609546e-06,
"loss": 0.6636,
"step": 309
},
{
"epoch": 1.8128654970760234,
"grad_norm": 0.1941326419111805,
"learning_rate": 8.806941431670283e-06,
"loss": 0.6615,
"step": 310
},
{
"epoch": 1.8187134502923976,
"grad_norm": 0.18927283838641645,
"learning_rate": 8.76355748373102e-06,
"loss": 0.6722,
"step": 311
},
{
"epoch": 1.8245614035087718,
"grad_norm": 0.18432693872655953,
"learning_rate": 8.720173535791757e-06,
"loss": 0.6522,
"step": 312
},
{
"epoch": 1.8304093567251463,
"grad_norm": 0.1971710237782894,
"learning_rate": 8.676789587852495e-06,
"loss": 0.6996,
"step": 313
},
{
"epoch": 1.8362573099415205,
"grad_norm": 0.1809013320142788,
"learning_rate": 8.633405639913232e-06,
"loss": 0.6476,
"step": 314
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.17273470066786814,
"learning_rate": 8.59002169197397e-06,
"loss": 0.6205,
"step": 315
},
{
"epoch": 1.8479532163742691,
"grad_norm": 0.1973244932241699,
"learning_rate": 8.546637744034708e-06,
"loss": 0.7028,
"step": 316
},
{
"epoch": 1.8538011695906431,
"grad_norm": 0.18443943998865936,
"learning_rate": 8.503253796095445e-06,
"loss": 0.6821,
"step": 317
},
{
"epoch": 1.8596491228070176,
"grad_norm": 0.19742863809842442,
"learning_rate": 8.459869848156183e-06,
"loss": 0.696,
"step": 318
},
{
"epoch": 1.8654970760233918,
"grad_norm": 0.19602002536800328,
"learning_rate": 8.416485900216922e-06,
"loss": 0.6643,
"step": 319
},
{
"epoch": 1.871345029239766,
"grad_norm": 0.18322608246185332,
"learning_rate": 8.373101952277657e-06,
"loss": 0.6877,
"step": 320
},
{
"epoch": 1.8771929824561404,
"grad_norm": 0.20312293700355982,
"learning_rate": 8.329718004338394e-06,
"loss": 0.6779,
"step": 321
},
{
"epoch": 1.8830409356725146,
"grad_norm": 0.18955838414122606,
"learning_rate": 8.286334056399133e-06,
"loss": 0.6889,
"step": 322
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.18608685857531174,
"learning_rate": 8.24295010845987e-06,
"loss": 0.7066,
"step": 323
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.19324997721387963,
"learning_rate": 8.199566160520608e-06,
"loss": 0.6721,
"step": 324
},
{
"epoch": 1.9005847953216373,
"grad_norm": 0.18269525520661356,
"learning_rate": 8.156182212581345e-06,
"loss": 0.6606,
"step": 325
},
{
"epoch": 1.9064327485380117,
"grad_norm": 0.17879213689825307,
"learning_rate": 8.112798264642084e-06,
"loss": 0.6195,
"step": 326
},
{
"epoch": 1.912280701754386,
"grad_norm": 0.19572563149944922,
"learning_rate": 8.06941431670282e-06,
"loss": 0.6553,
"step": 327
},
{
"epoch": 1.9181286549707601,
"grad_norm": 0.20455740497972336,
"learning_rate": 8.026030368763557e-06,
"loss": 0.7073,
"step": 328
},
{
"epoch": 1.9239766081871346,
"grad_norm": 0.20379817717927606,
"learning_rate": 7.982646420824296e-06,
"loss": 0.6656,
"step": 329
},
{
"epoch": 1.9298245614035088,
"grad_norm": 0.18816989178876325,
"learning_rate": 7.939262472885033e-06,
"loss": 0.6599,
"step": 330
},
{
"epoch": 1.935672514619883,
"grad_norm": 0.19040798822146188,
"learning_rate": 7.89587852494577e-06,
"loss": 0.6872,
"step": 331
},
{
"epoch": 1.9415204678362574,
"grad_norm": 0.2060421681157549,
"learning_rate": 7.85249457700651e-06,
"loss": 0.6634,
"step": 332
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.1841817001629427,
"learning_rate": 7.809110629067247e-06,
"loss": 0.6249,
"step": 333
},
{
"epoch": 1.9532163742690059,
"grad_norm": 0.19185741242924698,
"learning_rate": 7.765726681127982e-06,
"loss": 0.6603,
"step": 334
},
{
"epoch": 1.95906432748538,
"grad_norm": 0.17490775565813746,
"learning_rate": 7.722342733188721e-06,
"loss": 0.649,
"step": 335
},
{
"epoch": 1.9649122807017543,
"grad_norm": 0.18154097192716664,
"learning_rate": 7.678958785249458e-06,
"loss": 0.6869,
"step": 336
},
{
"epoch": 1.9707602339181287,
"grad_norm": 0.2171151900817146,
"learning_rate": 7.635574837310196e-06,
"loss": 0.6806,
"step": 337
},
{
"epoch": 1.976608187134503,
"grad_norm": 0.20056475561893633,
"learning_rate": 7.592190889370933e-06,
"loss": 0.6143,
"step": 338
},
{
"epoch": 1.9824561403508771,
"grad_norm": 0.1859196448723673,
"learning_rate": 7.548806941431671e-06,
"loss": 0.6565,
"step": 339
},
{
"epoch": 1.9883040935672516,
"grad_norm": 0.18291788926738473,
"learning_rate": 7.505422993492408e-06,
"loss": 0.6656,
"step": 340
},
{
"epoch": 1.9941520467836256,
"grad_norm": 0.1851247551589902,
"learning_rate": 7.4620390455531455e-06,
"loss": 0.658,
"step": 341
},
{
"epoch": 2.0,
"grad_norm": 0.19091041161918318,
"learning_rate": 7.418655097613884e-06,
"loss": 0.675,
"step": 342
},
{
"epoch": 2.0058479532163744,
"grad_norm": 0.18034530461971127,
"learning_rate": 7.375271149674621e-06,
"loss": 0.6371,
"step": 343
},
{
"epoch": 2.0116959064327484,
"grad_norm": 0.17929321132321624,
"learning_rate": 7.331887201735358e-06,
"loss": 0.6238,
"step": 344
},
{
"epoch": 2.017543859649123,
"grad_norm": 0.1805743053336667,
"learning_rate": 7.288503253796096e-06,
"loss": 0.676,
"step": 345
},
{
"epoch": 2.023391812865497,
"grad_norm": 0.18134202268639932,
"learning_rate": 7.2451193058568335e-06,
"loss": 0.6926,
"step": 346
},
{
"epoch": 2.0292397660818713,
"grad_norm": 0.16664489258040083,
"learning_rate": 7.201735357917571e-06,
"loss": 0.635,
"step": 347
},
{
"epoch": 2.0350877192982457,
"grad_norm": 0.17418680651725119,
"learning_rate": 7.158351409978309e-06,
"loss": 0.6625,
"step": 348
},
{
"epoch": 2.0409356725146197,
"grad_norm": 0.16806000135863103,
"learning_rate": 7.114967462039046e-06,
"loss": 0.656,
"step": 349
},
{
"epoch": 2.046783625730994,
"grad_norm": 0.1766385026508446,
"learning_rate": 7.071583514099783e-06,
"loss": 0.644,
"step": 350
},
{
"epoch": 2.0526315789473686,
"grad_norm": 0.18299281472851398,
"learning_rate": 7.028199566160521e-06,
"loss": 0.6609,
"step": 351
},
{
"epoch": 2.0584795321637426,
"grad_norm": 0.20986189876178032,
"learning_rate": 6.984815618221259e-06,
"loss": 0.6113,
"step": 352
},
{
"epoch": 2.064327485380117,
"grad_norm": 0.17241912699938555,
"learning_rate": 6.941431670281996e-06,
"loss": 0.622,
"step": 353
},
{
"epoch": 2.0701754385964914,
"grad_norm": 0.17175110508577335,
"learning_rate": 6.898047722342733e-06,
"loss": 0.6475,
"step": 354
},
{
"epoch": 2.0760233918128654,
"grad_norm": 0.17952837380953865,
"learning_rate": 6.854663774403471e-06,
"loss": 0.624,
"step": 355
},
{
"epoch": 2.08187134502924,
"grad_norm": 0.16440737350129503,
"learning_rate": 6.8112798264642086e-06,
"loss": 0.6216,
"step": 356
},
{
"epoch": 2.087719298245614,
"grad_norm": 0.19647840255348978,
"learning_rate": 6.767895878524946e-06,
"loss": 0.6685,
"step": 357
},
{
"epoch": 2.0935672514619883,
"grad_norm": 0.1696642474859097,
"learning_rate": 6.724511930585684e-06,
"loss": 0.6513,
"step": 358
},
{
"epoch": 2.0994152046783627,
"grad_norm": 0.16781192390446642,
"learning_rate": 6.681127982646421e-06,
"loss": 0.6316,
"step": 359
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.17665396661182975,
"learning_rate": 6.6377440347071584e-06,
"loss": 0.6444,
"step": 360
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.17026024356498806,
"learning_rate": 6.594360086767897e-06,
"loss": 0.6369,
"step": 361
},
{
"epoch": 2.116959064327485,
"grad_norm": 0.1771238959431666,
"learning_rate": 6.550976138828634e-06,
"loss": 0.6363,
"step": 362
},
{
"epoch": 2.1228070175438596,
"grad_norm": 0.18074195829403725,
"learning_rate": 6.507592190889371e-06,
"loss": 0.6295,
"step": 363
},
{
"epoch": 2.128654970760234,
"grad_norm": 0.17590315483807462,
"learning_rate": 6.464208242950108e-06,
"loss": 0.6352,
"step": 364
},
{
"epoch": 2.134502923976608,
"grad_norm": 0.1833679378524948,
"learning_rate": 6.420824295010846e-06,
"loss": 0.668,
"step": 365
},
{
"epoch": 2.1403508771929824,
"grad_norm": 0.17426945543091085,
"learning_rate": 6.377440347071584e-06,
"loss": 0.6309,
"step": 366
},
{
"epoch": 2.146198830409357,
"grad_norm": 0.17558570852982017,
"learning_rate": 6.334056399132321e-06,
"loss": 0.6183,
"step": 367
},
{
"epoch": 2.152046783625731,
"grad_norm": 0.18869020603808476,
"learning_rate": 6.29067245119306e-06,
"loss": 0.6743,
"step": 368
},
{
"epoch": 2.1578947368421053,
"grad_norm": 0.16860328391840887,
"learning_rate": 6.247288503253796e-06,
"loss": 0.6272,
"step": 369
},
{
"epoch": 2.1637426900584797,
"grad_norm": 0.1787201818661304,
"learning_rate": 6.2039045553145335e-06,
"loss": 0.6536,
"step": 370
},
{
"epoch": 2.1695906432748537,
"grad_norm": 0.17123056998213806,
"learning_rate": 6.1605206073752725e-06,
"loss": 0.6382,
"step": 371
},
{
"epoch": 2.175438596491228,
"grad_norm": 0.1687316996284582,
"learning_rate": 6.11713665943601e-06,
"loss": 0.6212,
"step": 372
},
{
"epoch": 2.181286549707602,
"grad_norm": 0.1891269844696612,
"learning_rate": 6.073752711496746e-06,
"loss": 0.6585,
"step": 373
},
{
"epoch": 2.1871345029239766,
"grad_norm": 0.1725455615706422,
"learning_rate": 6.030368763557483e-06,
"loss": 0.6559,
"step": 374
},
{
"epoch": 2.192982456140351,
"grad_norm": 0.16915435536877974,
"learning_rate": 5.986984815618222e-06,
"loss": 0.6324,
"step": 375
},
{
"epoch": 2.198830409356725,
"grad_norm": 0.17215684923648952,
"learning_rate": 5.943600867678959e-06,
"loss": 0.6539,
"step": 376
},
{
"epoch": 2.2046783625730995,
"grad_norm": 0.1954313719903045,
"learning_rate": 5.900216919739696e-06,
"loss": 0.6866,
"step": 377
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.17042598764998235,
"learning_rate": 5.856832971800435e-06,
"loss": 0.6558,
"step": 378
},
{
"epoch": 2.216374269005848,
"grad_norm": 0.17192364297534282,
"learning_rate": 5.813449023861172e-06,
"loss": 0.6378,
"step": 379
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.1739599234963019,
"learning_rate": 5.770065075921909e-06,
"loss": 0.6194,
"step": 380
},
{
"epoch": 2.2280701754385963,
"grad_norm": 0.17013107466272653,
"learning_rate": 5.7266811279826476e-06,
"loss": 0.6071,
"step": 381
},
{
"epoch": 2.2339181286549707,
"grad_norm": 0.1848300211606863,
"learning_rate": 5.683297180043385e-06,
"loss": 0.6859,
"step": 382
},
{
"epoch": 2.239766081871345,
"grad_norm": 0.17752768182741563,
"learning_rate": 5.639913232104122e-06,
"loss": 0.6674,
"step": 383
},
{
"epoch": 2.245614035087719,
"grad_norm": 0.17268014916608854,
"learning_rate": 5.59652928416486e-06,
"loss": 0.6447,
"step": 384
},
{
"epoch": 2.2514619883040936,
"grad_norm": 0.1975248493024482,
"learning_rate": 5.5531453362255974e-06,
"loss": 0.6877,
"step": 385
},
{
"epoch": 2.257309941520468,
"grad_norm": 0.1854455256428647,
"learning_rate": 5.509761388286335e-06,
"loss": 0.6663,
"step": 386
},
{
"epoch": 2.263157894736842,
"grad_norm": 0.18048830972034413,
"learning_rate": 5.466377440347071e-06,
"loss": 0.6515,
"step": 387
},
{
"epoch": 2.2690058479532165,
"grad_norm": 0.18529428469214002,
"learning_rate": 5.42299349240781e-06,
"loss": 0.6742,
"step": 388
},
{
"epoch": 2.2748538011695905,
"grad_norm": 0.1953029253712016,
"learning_rate": 5.379609544468547e-06,
"loss": 0.6715,
"step": 389
},
{
"epoch": 2.280701754385965,
"grad_norm": 0.18506576413704273,
"learning_rate": 5.3362255965292846e-06,
"loss": 0.6441,
"step": 390
},
{
"epoch": 2.2865497076023393,
"grad_norm": 0.20519359995385428,
"learning_rate": 5.292841648590023e-06,
"loss": 0.6324,
"step": 391
},
{
"epoch": 2.2923976608187133,
"grad_norm": 0.1812910105371836,
"learning_rate": 5.24945770065076e-06,
"loss": 0.6151,
"step": 392
},
{
"epoch": 2.2982456140350878,
"grad_norm": 0.16615863932290006,
"learning_rate": 5.206073752711497e-06,
"loss": 0.6178,
"step": 393
},
{
"epoch": 2.3040935672514617,
"grad_norm": 0.1867312948806079,
"learning_rate": 5.162689804772235e-06,
"loss": 0.6844,
"step": 394
},
{
"epoch": 2.309941520467836,
"grad_norm": 0.17482246796590165,
"learning_rate": 5.1193058568329725e-06,
"loss": 0.6316,
"step": 395
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.18919167148846638,
"learning_rate": 5.07592190889371e-06,
"loss": 0.6929,
"step": 396
},
{
"epoch": 2.3216374269005846,
"grad_norm": 0.17135779399399315,
"learning_rate": 5.032537960954448e-06,
"loss": 0.6507,
"step": 397
},
{
"epoch": 2.327485380116959,
"grad_norm": 0.16589752923541318,
"learning_rate": 4.989154013015185e-06,
"loss": 0.6169,
"step": 398
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.17836212191167625,
"learning_rate": 4.945770065075922e-06,
"loss": 0.6534,
"step": 399
},
{
"epoch": 2.3391812865497075,
"grad_norm": 0.17486989043138282,
"learning_rate": 4.90238611713666e-06,
"loss": 0.6229,
"step": 400
},
{
"epoch": 2.345029239766082,
"grad_norm": 0.18358705375708667,
"learning_rate": 4.859002169197397e-06,
"loss": 0.6806,
"step": 401
},
{
"epoch": 2.3508771929824563,
"grad_norm": 0.17755890153992399,
"learning_rate": 4.815618221258135e-06,
"loss": 0.6835,
"step": 402
},
{
"epoch": 2.3567251461988303,
"grad_norm": 0.1796432140151646,
"learning_rate": 4.772234273318872e-06,
"loss": 0.6643,
"step": 403
},
{
"epoch": 2.3625730994152048,
"grad_norm": 0.16924652263187123,
"learning_rate": 4.7288503253796095e-06,
"loss": 0.6157,
"step": 404
},
{
"epoch": 2.3684210526315788,
"grad_norm": 0.1726890776222668,
"learning_rate": 4.685466377440348e-06,
"loss": 0.6557,
"step": 405
},
{
"epoch": 2.374269005847953,
"grad_norm": 0.1780550008345725,
"learning_rate": 4.642082429501085e-06,
"loss": 0.6554,
"step": 406
},
{
"epoch": 2.3801169590643276,
"grad_norm": 0.17128524860089567,
"learning_rate": 4.598698481561822e-06,
"loss": 0.6768,
"step": 407
},
{
"epoch": 2.3859649122807016,
"grad_norm": 0.16558703660041527,
"learning_rate": 4.55531453362256e-06,
"loss": 0.6492,
"step": 408
},
{
"epoch": 2.391812865497076,
"grad_norm": 0.17532697429039085,
"learning_rate": 4.5119305856832975e-06,
"loss": 0.6572,
"step": 409
},
{
"epoch": 2.39766081871345,
"grad_norm": 0.16937166076280238,
"learning_rate": 4.468546637744035e-06,
"loss": 0.6628,
"step": 410
},
{
"epoch": 2.4035087719298245,
"grad_norm": 0.18312431667652093,
"learning_rate": 4.425162689804773e-06,
"loss": 0.6477,
"step": 411
},
{
"epoch": 2.409356725146199,
"grad_norm": 0.1695999967647095,
"learning_rate": 4.38177874186551e-06,
"loss": 0.6603,
"step": 412
},
{
"epoch": 2.415204678362573,
"grad_norm": 0.1720525919637609,
"learning_rate": 4.338394793926247e-06,
"loss": 0.6452,
"step": 413
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.16726491752955858,
"learning_rate": 4.295010845986985e-06,
"loss": 0.6199,
"step": 414
},
{
"epoch": 2.426900584795322,
"grad_norm": 0.2059763044769876,
"learning_rate": 4.251626898047723e-06,
"loss": 0.6783,
"step": 415
},
{
"epoch": 2.4327485380116958,
"grad_norm": 0.1731042493041403,
"learning_rate": 4.208242950108461e-06,
"loss": 0.6289,
"step": 416
},
{
"epoch": 2.43859649122807,
"grad_norm": 0.17595716841396303,
"learning_rate": 4.164859002169197e-06,
"loss": 0.6607,
"step": 417
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.17232699533316642,
"learning_rate": 4.121475054229935e-06,
"loss": 0.6203,
"step": 418
},
{
"epoch": 2.4502923976608186,
"grad_norm": 0.17550156147686838,
"learning_rate": 4.078091106290673e-06,
"loss": 0.6584,
"step": 419
},
{
"epoch": 2.456140350877193,
"grad_norm": 0.18080214333436065,
"learning_rate": 4.03470715835141e-06,
"loss": 0.6031,
"step": 420
},
{
"epoch": 2.461988304093567,
"grad_norm": 0.18048583412947314,
"learning_rate": 3.991323210412148e-06,
"loss": 0.6354,
"step": 421
},
{
"epoch": 2.4678362573099415,
"grad_norm": 0.18253929691844767,
"learning_rate": 3.947939262472885e-06,
"loss": 0.6502,
"step": 422
},
{
"epoch": 2.473684210526316,
"grad_norm": 0.1697304593286738,
"learning_rate": 3.904555314533623e-06,
"loss": 0.6332,
"step": 423
},
{
"epoch": 2.47953216374269,
"grad_norm": 0.17269048510291535,
"learning_rate": 3.8611713665943606e-06,
"loss": 0.6095,
"step": 424
},
{
"epoch": 2.4853801169590644,
"grad_norm": 0.16216619960446743,
"learning_rate": 3.817787418655098e-06,
"loss": 0.6112,
"step": 425
},
{
"epoch": 2.4912280701754383,
"grad_norm": 0.17239216132714047,
"learning_rate": 3.7744034707158355e-06,
"loss": 0.6715,
"step": 426
},
{
"epoch": 2.497076023391813,
"grad_norm": 0.1715509924251108,
"learning_rate": 3.7310195227765728e-06,
"loss": 0.6459,
"step": 427
},
{
"epoch": 2.502923976608187,
"grad_norm": 0.1674736258064931,
"learning_rate": 3.6876355748373104e-06,
"loss": 0.6355,
"step": 428
},
{
"epoch": 2.5087719298245617,
"grad_norm": 0.16465926005700326,
"learning_rate": 3.644251626898048e-06,
"loss": 0.6381,
"step": 429
},
{
"epoch": 2.5146198830409356,
"grad_norm": 0.1766218788353798,
"learning_rate": 3.6008676789587854e-06,
"loss": 0.67,
"step": 430
},
{
"epoch": 2.52046783625731,
"grad_norm": 0.17349720246234343,
"learning_rate": 3.557483731019523e-06,
"loss": 0.6493,
"step": 431
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.167194985421623,
"learning_rate": 3.5140997830802603e-06,
"loss": 0.628,
"step": 432
},
{
"epoch": 2.5321637426900585,
"grad_norm": 0.1704752632069036,
"learning_rate": 3.470715835140998e-06,
"loss": 0.6431,
"step": 433
},
{
"epoch": 2.538011695906433,
"grad_norm": 0.18481707817941734,
"learning_rate": 3.4273318872017357e-06,
"loss": 0.6715,
"step": 434
},
{
"epoch": 2.543859649122807,
"grad_norm": 0.1953699500403843,
"learning_rate": 3.383947939262473e-06,
"loss": 0.6845,
"step": 435
},
{
"epoch": 2.5497076023391814,
"grad_norm": 0.16379216515216974,
"learning_rate": 3.3405639913232106e-06,
"loss": 0.6455,
"step": 436
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.1980120403081147,
"learning_rate": 3.2971800433839487e-06,
"loss": 0.6695,
"step": 437
},
{
"epoch": 2.56140350877193,
"grad_norm": 0.16118979174422027,
"learning_rate": 3.2537960954446855e-06,
"loss": 0.5928,
"step": 438
},
{
"epoch": 2.5672514619883042,
"grad_norm": 0.1657791823109499,
"learning_rate": 3.210412147505423e-06,
"loss": 0.645,
"step": 439
},
{
"epoch": 2.573099415204678,
"grad_norm": 0.17132915146971192,
"learning_rate": 3.1670281995661605e-06,
"loss": 0.6847,
"step": 440
},
{
"epoch": 2.5789473684210527,
"grad_norm": 0.16324444549230824,
"learning_rate": 3.123644251626898e-06,
"loss": 0.6413,
"step": 441
},
{
"epoch": 2.5847953216374266,
"grad_norm": 0.17488238495665867,
"learning_rate": 3.0802603036876362e-06,
"loss": 0.6321,
"step": 442
},
{
"epoch": 2.590643274853801,
"grad_norm": 0.17634328132329954,
"learning_rate": 3.036876355748373e-06,
"loss": 0.6784,
"step": 443
},
{
"epoch": 2.5964912280701755,
"grad_norm": 0.17868073636307982,
"learning_rate": 2.993492407809111e-06,
"loss": 0.678,
"step": 444
},
{
"epoch": 2.60233918128655,
"grad_norm": 0.1632381806494582,
"learning_rate": 2.950108459869848e-06,
"loss": 0.6266,
"step": 445
},
{
"epoch": 2.608187134502924,
"grad_norm": 0.16547418794872898,
"learning_rate": 2.906724511930586e-06,
"loss": 0.6328,
"step": 446
},
{
"epoch": 2.6140350877192984,
"grad_norm": 0.17622874984246908,
"learning_rate": 2.8633405639913238e-06,
"loss": 0.661,
"step": 447
},
{
"epoch": 2.6198830409356724,
"grad_norm": 0.16541694161777937,
"learning_rate": 2.819956616052061e-06,
"loss": 0.6236,
"step": 448
},
{
"epoch": 2.625730994152047,
"grad_norm": 0.1662936609526993,
"learning_rate": 2.7765726681127987e-06,
"loss": 0.6159,
"step": 449
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.16669675160496522,
"learning_rate": 2.7331887201735356e-06,
"loss": 0.6505,
"step": 450
},
{
"epoch": 2.6374269005847952,
"grad_norm": 0.16549634014330256,
"learning_rate": 2.6898047722342737e-06,
"loss": 0.6508,
"step": 451
},
{
"epoch": 2.6432748538011697,
"grad_norm": 0.1810240612515184,
"learning_rate": 2.6464208242950113e-06,
"loss": 0.6705,
"step": 452
},
{
"epoch": 2.6491228070175437,
"grad_norm": 0.17384206262358587,
"learning_rate": 2.6030368763557486e-06,
"loss": 0.6376,
"step": 453
},
{
"epoch": 2.654970760233918,
"grad_norm": 0.17845392301327417,
"learning_rate": 2.5596529284164863e-06,
"loss": 0.6726,
"step": 454
},
{
"epoch": 2.6608187134502925,
"grad_norm": 0.1998313342763234,
"learning_rate": 2.516268980477224e-06,
"loss": 0.7018,
"step": 455
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.16980658137466279,
"learning_rate": 2.472885032537961e-06,
"loss": 0.649,
"step": 456
},
{
"epoch": 2.672514619883041,
"grad_norm": 0.18303799471242801,
"learning_rate": 2.4295010845986985e-06,
"loss": 0.6807,
"step": 457
},
{
"epoch": 2.678362573099415,
"grad_norm": 0.16687064769711984,
"learning_rate": 2.386117136659436e-06,
"loss": 0.625,
"step": 458
},
{
"epoch": 2.6842105263157894,
"grad_norm": 0.17675345144700674,
"learning_rate": 2.342733188720174e-06,
"loss": 0.6877,
"step": 459
},
{
"epoch": 2.690058479532164,
"grad_norm": 0.18401049432140446,
"learning_rate": 2.299349240780911e-06,
"loss": 0.6887,
"step": 460
},
{
"epoch": 2.6959064327485383,
"grad_norm": 0.1744979749607572,
"learning_rate": 2.2559652928416487e-06,
"loss": 0.6589,
"step": 461
},
{
"epoch": 2.7017543859649122,
"grad_norm": 0.1746641852105471,
"learning_rate": 2.2125813449023864e-06,
"loss": 0.6495,
"step": 462
},
{
"epoch": 2.7076023391812867,
"grad_norm": 0.16657675516372344,
"learning_rate": 2.1691973969631237e-06,
"loss": 0.662,
"step": 463
},
{
"epoch": 2.7134502923976607,
"grad_norm": 0.17198446823209654,
"learning_rate": 2.1258134490238614e-06,
"loss": 0.6732,
"step": 464
},
{
"epoch": 2.719298245614035,
"grad_norm": 0.1666041499402243,
"learning_rate": 2.0824295010845986e-06,
"loss": 0.6812,
"step": 465
},
{
"epoch": 2.7251461988304095,
"grad_norm": 0.17396505588176064,
"learning_rate": 2.0390455531453363e-06,
"loss": 0.6591,
"step": 466
},
{
"epoch": 2.7309941520467835,
"grad_norm": 0.17207201652443582,
"learning_rate": 1.995661605206074e-06,
"loss": 0.6278,
"step": 467
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.16767533054287867,
"learning_rate": 1.9522776572668117e-06,
"loss": 0.6508,
"step": 468
},
{
"epoch": 2.742690058479532,
"grad_norm": 0.17199489358502026,
"learning_rate": 1.908893709327549e-06,
"loss": 0.6652,
"step": 469
},
{
"epoch": 2.7485380116959064,
"grad_norm": 0.15742337242113655,
"learning_rate": 1.8655097613882864e-06,
"loss": 0.6281,
"step": 470
},
{
"epoch": 2.754385964912281,
"grad_norm": 0.16549888305173557,
"learning_rate": 1.822125813449024e-06,
"loss": 0.6257,
"step": 471
},
{
"epoch": 2.760233918128655,
"grad_norm": 0.17228844722867567,
"learning_rate": 1.7787418655097615e-06,
"loss": 0.6897,
"step": 472
},
{
"epoch": 2.7660818713450293,
"grad_norm": 0.16656984900009209,
"learning_rate": 1.735357917570499e-06,
"loss": 0.6576,
"step": 473
},
{
"epoch": 2.7719298245614032,
"grad_norm": 0.1617090427960584,
"learning_rate": 1.6919739696312365e-06,
"loss": 0.6375,
"step": 474
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.17066915492008342,
"learning_rate": 1.6485900216919743e-06,
"loss": 0.6434,
"step": 475
},
{
"epoch": 2.783625730994152,
"grad_norm": 0.17283365217712324,
"learning_rate": 1.6052060737527116e-06,
"loss": 0.6404,
"step": 476
},
{
"epoch": 2.7894736842105265,
"grad_norm": 0.16377562920106029,
"learning_rate": 1.561822125813449e-06,
"loss": 0.5996,
"step": 477
},
{
"epoch": 2.7953216374269005,
"grad_norm": 0.16639432488486533,
"learning_rate": 1.5184381778741865e-06,
"loss": 0.5969,
"step": 478
},
{
"epoch": 2.801169590643275,
"grad_norm": 0.16980646505093647,
"learning_rate": 1.475054229934924e-06,
"loss": 0.6769,
"step": 479
},
{
"epoch": 2.807017543859649,
"grad_norm": 0.1628222318868079,
"learning_rate": 1.4316702819956619e-06,
"loss": 0.6508,
"step": 480
},
{
"epoch": 2.8128654970760234,
"grad_norm": 0.18172158119006254,
"learning_rate": 1.3882863340563994e-06,
"loss": 0.6604,
"step": 481
},
{
"epoch": 2.818713450292398,
"grad_norm": 0.16423487898529526,
"learning_rate": 1.3449023861171368e-06,
"loss": 0.6228,
"step": 482
},
{
"epoch": 2.824561403508772,
"grad_norm": 0.17478062902651836,
"learning_rate": 1.3015184381778743e-06,
"loss": 0.6251,
"step": 483
},
{
"epoch": 2.8304093567251463,
"grad_norm": 0.1726032282493946,
"learning_rate": 1.258134490238612e-06,
"loss": 0.6735,
"step": 484
},
{
"epoch": 2.8362573099415203,
"grad_norm": 0.16790264066853555,
"learning_rate": 1.2147505422993492e-06,
"loss": 0.6434,
"step": 485
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.1671571499569638,
"learning_rate": 1.171366594360087e-06,
"loss": 0.6578,
"step": 486
},
{
"epoch": 2.847953216374269,
"grad_norm": 0.16863160149729373,
"learning_rate": 1.1279826464208244e-06,
"loss": 0.6242,
"step": 487
},
{
"epoch": 2.853801169590643,
"grad_norm": 0.161190538585518,
"learning_rate": 1.0845986984815618e-06,
"loss": 0.6342,
"step": 488
},
{
"epoch": 2.8596491228070176,
"grad_norm": 0.16562131765046972,
"learning_rate": 1.0412147505422993e-06,
"loss": 0.6346,
"step": 489
},
{
"epoch": 2.8654970760233915,
"grad_norm": 0.16478891417223968,
"learning_rate": 9.97830802603037e-07,
"loss": 0.629,
"step": 490
},
{
"epoch": 2.871345029239766,
"grad_norm": 0.1652066649082407,
"learning_rate": 9.544468546637745e-07,
"loss": 0.6512,
"step": 491
},
{
"epoch": 2.8771929824561404,
"grad_norm": 0.1808259238987679,
"learning_rate": 9.11062906724512e-07,
"loss": 0.6501,
"step": 492
},
{
"epoch": 2.883040935672515,
"grad_norm": 0.16595306747518687,
"learning_rate": 8.676789587852495e-07,
"loss": 0.6187,
"step": 493
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.16577185891507523,
"learning_rate": 8.242950108459872e-07,
"loss": 0.6608,
"step": 494
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.17578227817996883,
"learning_rate": 7.809110629067245e-07,
"loss": 0.6522,
"step": 495
},
{
"epoch": 2.9005847953216373,
"grad_norm": 0.16712846714191626,
"learning_rate": 7.37527114967462e-07,
"loss": 0.6741,
"step": 496
},
{
"epoch": 2.9064327485380117,
"grad_norm": 0.1695052637928444,
"learning_rate": 6.941431670281997e-07,
"loss": 0.6697,
"step": 497
},
{
"epoch": 2.912280701754386,
"grad_norm": 0.16523586320140343,
"learning_rate": 6.507592190889371e-07,
"loss": 0.6359,
"step": 498
},
{
"epoch": 2.91812865497076,
"grad_norm": 0.17250068186561412,
"learning_rate": 6.073752711496746e-07,
"loss": 0.635,
"step": 499
},
{
"epoch": 2.9239766081871346,
"grad_norm": 0.15377259520433112,
"learning_rate": 5.639913232104122e-07,
"loss": 0.5981,
"step": 500
},
{
"epoch": 2.9298245614035086,
"grad_norm": 0.1658156010520523,
"learning_rate": 5.206073752711497e-07,
"loss": 0.6501,
"step": 501
},
{
"epoch": 2.935672514619883,
"grad_norm": 0.16408478791637582,
"learning_rate": 4.772234273318872e-07,
"loss": 0.6142,
"step": 502
},
{
"epoch": 2.9415204678362574,
"grad_norm": 0.16388275974704705,
"learning_rate": 4.3383947939262475e-07,
"loss": 0.6752,
"step": 503
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.16114934023396965,
"learning_rate": 3.9045553145336227e-07,
"loss": 0.6342,
"step": 504
},
{
"epoch": 2.953216374269006,
"grad_norm": 0.16383179577586124,
"learning_rate": 3.4707158351409984e-07,
"loss": 0.6357,
"step": 505
},
{
"epoch": 2.95906432748538,
"grad_norm": 0.15975403048273223,
"learning_rate": 3.036876355748373e-07,
"loss": 0.6188,
"step": 506
},
{
"epoch": 2.9649122807017543,
"grad_norm": 0.1676357353298206,
"learning_rate": 2.6030368763557483e-07,
"loss": 0.6311,
"step": 507
},
{
"epoch": 2.9707602339181287,
"grad_norm": 0.15959844257029077,
"learning_rate": 2.1691973969631237e-07,
"loss": 0.6397,
"step": 508
},
{
"epoch": 2.976608187134503,
"grad_norm": 0.1748528110908195,
"learning_rate": 1.7353579175704992e-07,
"loss": 0.6354,
"step": 509
},
{
"epoch": 2.982456140350877,
"grad_norm": 0.16899094676604337,
"learning_rate": 1.3015184381778741e-07,
"loss": 0.6553,
"step": 510
},
{
"epoch": 2.9883040935672516,
"grad_norm": 0.18419361328324801,
"learning_rate": 8.676789587852496e-08,
"loss": 0.6327,
"step": 511
},
{
"epoch": 2.9941520467836256,
"grad_norm": 0.1710576834882864,
"learning_rate": 4.338394793926248e-08,
"loss": 0.6288,
"step": 512
},
{
"epoch": 3.0,
"grad_norm": 0.1608724141438565,
"learning_rate": 0.0,
"loss": 0.6311,
"step": 513
},
{
"epoch": 3.0,
"step": 513,
"total_flos": 233827146399744.0,
"train_loss": 0.7522663490349322,
"train_runtime": 28313.6625,
"train_samples_per_second": 0.289,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 513,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 233827146399744.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}