llama3-RL-both-E2-0117-ckpt1624 / trainer_state.json
DongfuJiang's picture
Upload folder using huggingface_hub
da6aa4d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997883760749533,
"eval_steps": 100,
"global_step": 1624,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006156332364993556,
"grad_norm": 26.714405059814453,
"learning_rate": 2.0408163265306121e-07,
"loss": 0.9855,
"step": 1
},
{
"epoch": 0.0012312664729987111,
"grad_norm": 25.45075225830078,
"learning_rate": 4.0816326530612243e-07,
"loss": 0.926,
"step": 2
},
{
"epoch": 0.0018468997094980666,
"grad_norm": 24.12227439880371,
"learning_rate": 6.122448979591837e-07,
"loss": 0.8903,
"step": 3
},
{
"epoch": 0.0024625329459974222,
"grad_norm": 26.573762893676758,
"learning_rate": 8.163265306122449e-07,
"loss": 0.9779,
"step": 4
},
{
"epoch": 0.0030781661824967775,
"grad_norm": 26.632810592651367,
"learning_rate": 1.0204081632653063e-06,
"loss": 0.9337,
"step": 5
},
{
"epoch": 0.003693799418996133,
"grad_norm": 24.73032569885254,
"learning_rate": 1.2244897959183673e-06,
"loss": 0.8677,
"step": 6
},
{
"epoch": 0.004309432655495489,
"grad_norm": 22.517139434814453,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.813,
"step": 7
},
{
"epoch": 0.0049250658919948445,
"grad_norm": 20.523988723754883,
"learning_rate": 1.6326530612244897e-06,
"loss": 0.7831,
"step": 8
},
{
"epoch": 0.005540699128494199,
"grad_norm": 20.76930046081543,
"learning_rate": 1.8367346938775512e-06,
"loss": 0.7559,
"step": 9
},
{
"epoch": 0.006156332364993555,
"grad_norm": 16.376604080200195,
"learning_rate": 2.0408163265306125e-06,
"loss": 0.6649,
"step": 10
},
{
"epoch": 0.006771965601492911,
"grad_norm": 19.152969360351562,
"learning_rate": 2.244897959183674e-06,
"loss": 0.5937,
"step": 11
},
{
"epoch": 0.007387598837992266,
"grad_norm": 14.190017700195312,
"learning_rate": 2.4489795918367347e-06,
"loss": 0.5062,
"step": 12
},
{
"epoch": 0.008003232074491622,
"grad_norm": 7.9161272048950195,
"learning_rate": 2.6530612244897964e-06,
"loss": 0.5011,
"step": 13
},
{
"epoch": 0.008618865310990978,
"grad_norm": 21.18466567993164,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.4779,
"step": 14
},
{
"epoch": 0.009234498547490333,
"grad_norm": 21.010019302368164,
"learning_rate": 3.0612244897959185e-06,
"loss": 0.582,
"step": 15
},
{
"epoch": 0.009850131783989689,
"grad_norm": 11.696112632751465,
"learning_rate": 3.2653061224489794e-06,
"loss": 0.5208,
"step": 16
},
{
"epoch": 0.010465765020489043,
"grad_norm": 6.880239486694336,
"learning_rate": 3.469387755102041e-06,
"loss": 0.4374,
"step": 17
},
{
"epoch": 0.011081398256988399,
"grad_norm": 29.14151382446289,
"learning_rate": 3.6734693877551024e-06,
"loss": 0.5039,
"step": 18
},
{
"epoch": 0.011697031493487754,
"grad_norm": 7.683447360992432,
"learning_rate": 3.877551020408164e-06,
"loss": 0.472,
"step": 19
},
{
"epoch": 0.01231266472998711,
"grad_norm": 5.148148536682129,
"learning_rate": 4.081632653061225e-06,
"loss": 0.48,
"step": 20
},
{
"epoch": 0.012928297966486466,
"grad_norm": 5.763864994049072,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.5208,
"step": 21
},
{
"epoch": 0.013543931202985821,
"grad_norm": 5.345851421356201,
"learning_rate": 4.489795918367348e-06,
"loss": 0.4837,
"step": 22
},
{
"epoch": 0.014159564439485177,
"grad_norm": 4.355923652648926,
"learning_rate": 4.693877551020409e-06,
"loss": 0.4873,
"step": 23
},
{
"epoch": 0.014775197675984533,
"grad_norm": 6.132438659667969,
"learning_rate": 4.897959183673469e-06,
"loss": 0.4696,
"step": 24
},
{
"epoch": 0.015390830912483888,
"grad_norm": 5.307392597198486,
"learning_rate": 5.1020408163265315e-06,
"loss": 0.4756,
"step": 25
},
{
"epoch": 0.016006464148983244,
"grad_norm": 4.319431781768799,
"learning_rate": 5.306122448979593e-06,
"loss": 0.4765,
"step": 26
},
{
"epoch": 0.0166220973854826,
"grad_norm": 4.057931900024414,
"learning_rate": 5.510204081632653e-06,
"loss": 0.5113,
"step": 27
},
{
"epoch": 0.017237730621981955,
"grad_norm": 4.102950572967529,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.4438,
"step": 28
},
{
"epoch": 0.01785336385848131,
"grad_norm": 4.861684322357178,
"learning_rate": 5.918367346938776e-06,
"loss": 0.4829,
"step": 29
},
{
"epoch": 0.018468997094980667,
"grad_norm": 3.862741231918335,
"learning_rate": 6.122448979591837e-06,
"loss": 0.5129,
"step": 30
},
{
"epoch": 0.019084630331480022,
"grad_norm": 3.8774240016937256,
"learning_rate": 6.326530612244899e-06,
"loss": 0.4498,
"step": 31
},
{
"epoch": 0.019700263567979378,
"grad_norm": 3.369861602783203,
"learning_rate": 6.530612244897959e-06,
"loss": 0.4595,
"step": 32
},
{
"epoch": 0.02031589680447873,
"grad_norm": 4.286397457122803,
"learning_rate": 6.734693877551021e-06,
"loss": 0.4822,
"step": 33
},
{
"epoch": 0.020931530040978086,
"grad_norm": 4.149627685546875,
"learning_rate": 6.938775510204082e-06,
"loss": 0.4825,
"step": 34
},
{
"epoch": 0.02154716327747744,
"grad_norm": 3.7248666286468506,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.483,
"step": 35
},
{
"epoch": 0.022162796513976797,
"grad_norm": 3.714015007019043,
"learning_rate": 7.346938775510205e-06,
"loss": 0.4759,
"step": 36
},
{
"epoch": 0.022778429750476153,
"grad_norm": 4.151897430419922,
"learning_rate": 7.551020408163265e-06,
"loss": 0.5299,
"step": 37
},
{
"epoch": 0.02339406298697551,
"grad_norm": 3.7130208015441895,
"learning_rate": 7.755102040816327e-06,
"loss": 0.4994,
"step": 38
},
{
"epoch": 0.024009696223474864,
"grad_norm": 3.6021628379821777,
"learning_rate": 7.959183673469388e-06,
"loss": 0.494,
"step": 39
},
{
"epoch": 0.02462532945997422,
"grad_norm": 4.136284828186035,
"learning_rate": 8.16326530612245e-06,
"loss": 0.4955,
"step": 40
},
{
"epoch": 0.025240962696473575,
"grad_norm": 3.669343948364258,
"learning_rate": 8.36734693877551e-06,
"loss": 0.4999,
"step": 41
},
{
"epoch": 0.02585659593297293,
"grad_norm": 3.592020273208618,
"learning_rate": 8.571428571428571e-06,
"loss": 0.505,
"step": 42
},
{
"epoch": 0.026472229169472287,
"grad_norm": 3.9431192874908447,
"learning_rate": 8.775510204081633e-06,
"loss": 0.4714,
"step": 43
},
{
"epoch": 0.027087862405971642,
"grad_norm": 3.835538864135742,
"learning_rate": 8.979591836734695e-06,
"loss": 0.4833,
"step": 44
},
{
"epoch": 0.027703495642470998,
"grad_norm": 3.6205527782440186,
"learning_rate": 9.183673469387756e-06,
"loss": 0.4946,
"step": 45
},
{
"epoch": 0.028319128878970354,
"grad_norm": 3.6335999965667725,
"learning_rate": 9.387755102040818e-06,
"loss": 0.5319,
"step": 46
},
{
"epoch": 0.02893476211546971,
"grad_norm": 3.832490921020508,
"learning_rate": 9.591836734693878e-06,
"loss": 0.4922,
"step": 47
},
{
"epoch": 0.029550395351969065,
"grad_norm": 3.39542555809021,
"learning_rate": 9.795918367346939e-06,
"loss": 0.5099,
"step": 48
},
{
"epoch": 0.03016602858846842,
"grad_norm": 3.5109446048736572,
"learning_rate": 1e-05,
"loss": 0.5137,
"step": 49
},
{
"epoch": 0.030781661824967776,
"grad_norm": 3.368596076965332,
"learning_rate": 9.99999005331204e-06,
"loss": 0.5375,
"step": 50
},
{
"epoch": 0.03139729506146713,
"grad_norm": 4.006367206573486,
"learning_rate": 9.999960213287734e-06,
"loss": 0.5605,
"step": 51
},
{
"epoch": 0.03201292829796649,
"grad_norm": 3.265024185180664,
"learning_rate": 9.999910480045805e-06,
"loss": 0.5158,
"step": 52
},
{
"epoch": 0.03262856153446584,
"grad_norm": 3.560621976852417,
"learning_rate": 9.999840853784125e-06,
"loss": 0.5671,
"step": 53
},
{
"epoch": 0.0332441947709652,
"grad_norm": 3.2886385917663574,
"learning_rate": 9.999751334779716e-06,
"loss": 0.5654,
"step": 54
},
{
"epoch": 0.03385982800746455,
"grad_norm": 3.0971665382385254,
"learning_rate": 9.999641923388745e-06,
"loss": 0.5049,
"step": 55
},
{
"epoch": 0.03447546124396391,
"grad_norm": 3.312525510787964,
"learning_rate": 9.999512620046523e-06,
"loss": 0.5291,
"step": 56
},
{
"epoch": 0.03509109448046326,
"grad_norm": 3.5942792892456055,
"learning_rate": 9.999363425267506e-06,
"loss": 0.5531,
"step": 57
},
{
"epoch": 0.03570672771696262,
"grad_norm": 2.7662947177886963,
"learning_rate": 9.999194339645292e-06,
"loss": 0.5195,
"step": 58
},
{
"epoch": 0.036322360953461974,
"grad_norm": 3.3403029441833496,
"learning_rate": 9.999005363852619e-06,
"loss": 0.5265,
"step": 59
},
{
"epoch": 0.03693799418996133,
"grad_norm": 3.2366700172424316,
"learning_rate": 9.99879649864136e-06,
"loss": 0.5558,
"step": 60
},
{
"epoch": 0.037553627426460685,
"grad_norm": 2.8445067405700684,
"learning_rate": 9.998567744842518e-06,
"loss": 0.5047,
"step": 61
},
{
"epoch": 0.038169260662960044,
"grad_norm": 3.0779359340667725,
"learning_rate": 9.998319103366233e-06,
"loss": 0.5314,
"step": 62
},
{
"epoch": 0.0387848938994594,
"grad_norm": 3.4407589435577393,
"learning_rate": 9.998050575201772e-06,
"loss": 0.5547,
"step": 63
},
{
"epoch": 0.039400527135958756,
"grad_norm": 3.0764389038085938,
"learning_rate": 9.997762161417517e-06,
"loss": 0.5269,
"step": 64
},
{
"epoch": 0.04001616037245811,
"grad_norm": 2.964404344558716,
"learning_rate": 9.997453863160975e-06,
"loss": 0.5248,
"step": 65
},
{
"epoch": 0.04063179360895746,
"grad_norm": 3.130155086517334,
"learning_rate": 9.997125681658761e-06,
"loss": 0.5261,
"step": 66
},
{
"epoch": 0.04124742684545682,
"grad_norm": 3.1478676795959473,
"learning_rate": 9.996777618216608e-06,
"loss": 0.5227,
"step": 67
},
{
"epoch": 0.04186306008195617,
"grad_norm": 3.343843936920166,
"learning_rate": 9.996409674219343e-06,
"loss": 0.5443,
"step": 68
},
{
"epoch": 0.04247869331845553,
"grad_norm": 3.1439449787139893,
"learning_rate": 9.996021851130897e-06,
"loss": 0.5445,
"step": 69
},
{
"epoch": 0.04309432655495488,
"grad_norm": 3.548717498779297,
"learning_rate": 9.995614150494293e-06,
"loss": 0.5376,
"step": 70
},
{
"epoch": 0.04370995979145424,
"grad_norm": 3.1395504474639893,
"learning_rate": 9.995186573931638e-06,
"loss": 0.5118,
"step": 71
},
{
"epoch": 0.044325593027953594,
"grad_norm": 2.945014476776123,
"learning_rate": 9.994739123144121e-06,
"loss": 0.5225,
"step": 72
},
{
"epoch": 0.04494122626445295,
"grad_norm": 2.8614890575408936,
"learning_rate": 9.994271799912004e-06,
"loss": 0.5346,
"step": 73
},
{
"epoch": 0.045556859500952306,
"grad_norm": 3.3048510551452637,
"learning_rate": 9.993784606094612e-06,
"loss": 0.5635,
"step": 74
},
{
"epoch": 0.046172492737451665,
"grad_norm": 3.2629144191741943,
"learning_rate": 9.993277543630335e-06,
"loss": 0.5126,
"step": 75
},
{
"epoch": 0.04678812597395102,
"grad_norm": 2.8238108158111572,
"learning_rate": 9.992750614536606e-06,
"loss": 0.5215,
"step": 76
},
{
"epoch": 0.047403759210450376,
"grad_norm": 3.0240776538848877,
"learning_rate": 9.992203820909906e-06,
"loss": 0.5342,
"step": 77
},
{
"epoch": 0.04801939244694973,
"grad_norm": 2.6868398189544678,
"learning_rate": 9.99163716492575e-06,
"loss": 0.5389,
"step": 78
},
{
"epoch": 0.04863502568344909,
"grad_norm": 3.591717004776001,
"learning_rate": 9.991050648838676e-06,
"loss": 0.5321,
"step": 79
},
{
"epoch": 0.04925065891994844,
"grad_norm": 3.2408864498138428,
"learning_rate": 9.990444274982245e-06,
"loss": 0.5375,
"step": 80
},
{
"epoch": 0.0498662921564478,
"grad_norm": 3.125800371170044,
"learning_rate": 9.989818045769017e-06,
"loss": 0.5244,
"step": 81
},
{
"epoch": 0.05048192539294715,
"grad_norm": 3.0238773822784424,
"learning_rate": 9.989171963690556e-06,
"loss": 0.5372,
"step": 82
},
{
"epoch": 0.05109755862944651,
"grad_norm": 2.623504877090454,
"learning_rate": 9.988506031317416e-06,
"loss": 0.5165,
"step": 83
},
{
"epoch": 0.05171319186594586,
"grad_norm": 2.9732768535614014,
"learning_rate": 9.987820251299121e-06,
"loss": 0.5287,
"step": 84
},
{
"epoch": 0.05232882510244522,
"grad_norm": 2.9855895042419434,
"learning_rate": 9.987114626364172e-06,
"loss": 0.5379,
"step": 85
},
{
"epoch": 0.052944458338944574,
"grad_norm": 2.6988890171051025,
"learning_rate": 9.986389159320016e-06,
"loss": 0.53,
"step": 86
},
{
"epoch": 0.05356009157544393,
"grad_norm": 3.033294677734375,
"learning_rate": 9.985643853053053e-06,
"loss": 0.5345,
"step": 87
},
{
"epoch": 0.054175724811943285,
"grad_norm": 2.823150873184204,
"learning_rate": 9.984878710528615e-06,
"loss": 0.5482,
"step": 88
},
{
"epoch": 0.05479135804844264,
"grad_norm": 3.044827699661255,
"learning_rate": 9.984093734790955e-06,
"loss": 0.4908,
"step": 89
},
{
"epoch": 0.055406991284941996,
"grad_norm": 2.8079962730407715,
"learning_rate": 9.983288928963238e-06,
"loss": 0.5431,
"step": 90
},
{
"epoch": 0.05602262452144135,
"grad_norm": 2.7283718585968018,
"learning_rate": 9.982464296247523e-06,
"loss": 0.5427,
"step": 91
},
{
"epoch": 0.05663825775794071,
"grad_norm": 3.2571794986724854,
"learning_rate": 9.981619839924757e-06,
"loss": 0.558,
"step": 92
},
{
"epoch": 0.05725389099444006,
"grad_norm": 2.8086328506469727,
"learning_rate": 9.980755563354755e-06,
"loss": 0.5257,
"step": 93
},
{
"epoch": 0.05786952423093942,
"grad_norm": 3.075275421142578,
"learning_rate": 9.979871469976197e-06,
"loss": 0.5065,
"step": 94
},
{
"epoch": 0.05848515746743877,
"grad_norm": 2.7873611450195312,
"learning_rate": 9.978967563306599e-06,
"loss": 0.522,
"step": 95
},
{
"epoch": 0.05910079070393813,
"grad_norm": 2.693174123764038,
"learning_rate": 9.978043846942314e-06,
"loss": 0.5522,
"step": 96
},
{
"epoch": 0.05971642394043748,
"grad_norm": 3.019831657409668,
"learning_rate": 9.97710032455851e-06,
"loss": 0.5408,
"step": 97
},
{
"epoch": 0.06033205717693684,
"grad_norm": 2.5392229557037354,
"learning_rate": 9.976136999909156e-06,
"loss": 0.5677,
"step": 98
},
{
"epoch": 0.060947690413436194,
"grad_norm": 2.809018135070801,
"learning_rate": 9.975153876827008e-06,
"loss": 0.5415,
"step": 99
},
{
"epoch": 0.06156332364993555,
"grad_norm": 3.3943169116973877,
"learning_rate": 9.974150959223591e-06,
"loss": 0.5673,
"step": 100
},
{
"epoch": 0.06156332364993555,
"eval_loss": 0.5280219316482544,
"eval_runtime": 119.7149,
"eval_samples_per_second": 35.092,
"eval_steps_per_second": 4.394,
"step": 100
},
{
"epoch": 0.062178956886434905,
"grad_norm": 3.059950351715088,
"learning_rate": 9.973128251089193e-06,
"loss": 0.577,
"step": 101
},
{
"epoch": 0.06279459012293426,
"grad_norm": 3.0409281253814697,
"learning_rate": 9.972085756492831e-06,
"loss": 0.5375,
"step": 102
},
{
"epoch": 0.06341022335943362,
"grad_norm": 2.890801429748535,
"learning_rate": 9.971023479582258e-06,
"loss": 0.5278,
"step": 103
},
{
"epoch": 0.06402585659593298,
"grad_norm": 2.8441672325134277,
"learning_rate": 9.969941424583926e-06,
"loss": 0.573,
"step": 104
},
{
"epoch": 0.06464148983243233,
"grad_norm": 2.7627878189086914,
"learning_rate": 9.968839595802982e-06,
"loss": 0.5409,
"step": 105
},
{
"epoch": 0.06525712306893168,
"grad_norm": 3.9377424716949463,
"learning_rate": 9.967717997623245e-06,
"loss": 0.5905,
"step": 106
},
{
"epoch": 0.06587275630543105,
"grad_norm": 2.8966352939605713,
"learning_rate": 9.966576634507187e-06,
"loss": 0.5224,
"step": 107
},
{
"epoch": 0.0664883895419304,
"grad_norm": 3.656322956085205,
"learning_rate": 9.965415510995924e-06,
"loss": 0.54,
"step": 108
},
{
"epoch": 0.06710402277842975,
"grad_norm": 2.6917004585266113,
"learning_rate": 9.964234631709188e-06,
"loss": 0.5301,
"step": 109
},
{
"epoch": 0.0677196560149291,
"grad_norm": 2.6347432136535645,
"learning_rate": 9.963034001345313e-06,
"loss": 0.5244,
"step": 110
},
{
"epoch": 0.06833528925142845,
"grad_norm": 2.6941580772399902,
"learning_rate": 9.96181362468122e-06,
"loss": 0.5582,
"step": 111
},
{
"epoch": 0.06895092248792782,
"grad_norm": 2.9986321926116943,
"learning_rate": 9.960573506572391e-06,
"loss": 0.5007,
"step": 112
},
{
"epoch": 0.06956655572442717,
"grad_norm": 2.8413689136505127,
"learning_rate": 9.95931365195285e-06,
"loss": 0.5437,
"step": 113
},
{
"epoch": 0.07018218896092653,
"grad_norm": 2.505411148071289,
"learning_rate": 9.958034065835151e-06,
"loss": 0.4966,
"step": 114
},
{
"epoch": 0.07079782219742588,
"grad_norm": 2.35355544090271,
"learning_rate": 9.956734753310355e-06,
"loss": 0.511,
"step": 115
},
{
"epoch": 0.07141345543392524,
"grad_norm": 2.7929799556732178,
"learning_rate": 9.955415719547998e-06,
"loss": 0.5106,
"step": 116
},
{
"epoch": 0.0720290886704246,
"grad_norm": 2.843961715698242,
"learning_rate": 9.954076969796093e-06,
"loss": 0.559,
"step": 117
},
{
"epoch": 0.07264472190692395,
"grad_norm": 2.5479166507720947,
"learning_rate": 9.952718509381086e-06,
"loss": 0.5593,
"step": 118
},
{
"epoch": 0.0732603551434233,
"grad_norm": 2.5203702449798584,
"learning_rate": 9.951340343707852e-06,
"loss": 0.5323,
"step": 119
},
{
"epoch": 0.07387598837992267,
"grad_norm": 2.7408761978149414,
"learning_rate": 9.949942478259665e-06,
"loss": 0.5451,
"step": 120
},
{
"epoch": 0.07449162161642202,
"grad_norm": 2.840665817260742,
"learning_rate": 9.948524918598175e-06,
"loss": 0.5662,
"step": 121
},
{
"epoch": 0.07510725485292137,
"grad_norm": 2.8258652687072754,
"learning_rate": 9.947087670363395e-06,
"loss": 0.5246,
"step": 122
},
{
"epoch": 0.07572288808942072,
"grad_norm": 2.577761650085449,
"learning_rate": 9.945630739273665e-06,
"loss": 0.5415,
"step": 123
},
{
"epoch": 0.07633852132592009,
"grad_norm": 2.4587228298187256,
"learning_rate": 9.944154131125643e-06,
"loss": 0.5058,
"step": 124
},
{
"epoch": 0.07695415456241944,
"grad_norm": 2.7867796421051025,
"learning_rate": 9.942657851794273e-06,
"loss": 0.5608,
"step": 125
},
{
"epoch": 0.0775697877989188,
"grad_norm": 2.8648955821990967,
"learning_rate": 9.941141907232766e-06,
"loss": 0.5288,
"step": 126
},
{
"epoch": 0.07818542103541815,
"grad_norm": 2.883604049682617,
"learning_rate": 9.93960630347257e-06,
"loss": 0.5214,
"step": 127
},
{
"epoch": 0.07880105427191751,
"grad_norm": 3.0440468788146973,
"learning_rate": 9.938051046623353e-06,
"loss": 0.4979,
"step": 128
},
{
"epoch": 0.07941668750841686,
"grad_norm": 2.5932624340057373,
"learning_rate": 9.936476142872979e-06,
"loss": 0.5342,
"step": 129
},
{
"epoch": 0.08003232074491622,
"grad_norm": 2.6591358184814453,
"learning_rate": 9.934881598487478e-06,
"loss": 0.4935,
"step": 130
},
{
"epoch": 0.08064795398141557,
"grad_norm": 2.621525526046753,
"learning_rate": 9.933267419811026e-06,
"loss": 0.5193,
"step": 131
},
{
"epoch": 0.08126358721791492,
"grad_norm": 2.65376877784729,
"learning_rate": 9.931633613265913e-06,
"loss": 0.5228,
"step": 132
},
{
"epoch": 0.08187922045441429,
"grad_norm": 2.4640400409698486,
"learning_rate": 9.929980185352525e-06,
"loss": 0.5097,
"step": 133
},
{
"epoch": 0.08249485369091364,
"grad_norm": 2.425771951675415,
"learning_rate": 9.928307142649315e-06,
"loss": 0.4832,
"step": 134
},
{
"epoch": 0.08311048692741299,
"grad_norm": 2.5605103969573975,
"learning_rate": 9.926614491812778e-06,
"loss": 0.5164,
"step": 135
},
{
"epoch": 0.08372612016391234,
"grad_norm": 2.524656295776367,
"learning_rate": 9.924902239577419e-06,
"loss": 0.5565,
"step": 136
},
{
"epoch": 0.08434175340041171,
"grad_norm": 2.78824782371521,
"learning_rate": 9.923170392755735e-06,
"loss": 0.5721,
"step": 137
},
{
"epoch": 0.08495738663691106,
"grad_norm": 2.4345874786376953,
"learning_rate": 9.921418958238182e-06,
"loss": 0.5229,
"step": 138
},
{
"epoch": 0.08557301987341041,
"grad_norm": 2.9033138751983643,
"learning_rate": 9.91964794299315e-06,
"loss": 0.537,
"step": 139
},
{
"epoch": 0.08618865310990977,
"grad_norm": 2.644517660140991,
"learning_rate": 9.91785735406693e-06,
"loss": 0.5303,
"step": 140
},
{
"epoch": 0.08680428634640913,
"grad_norm": 2.703230381011963,
"learning_rate": 9.916047198583698e-06,
"loss": 0.5568,
"step": 141
},
{
"epoch": 0.08741991958290848,
"grad_norm": 2.5611250400543213,
"learning_rate": 9.914217483745472e-06,
"loss": 0.525,
"step": 142
},
{
"epoch": 0.08803555281940784,
"grad_norm": 2.7442543506622314,
"learning_rate": 9.912368216832094e-06,
"loss": 0.5422,
"step": 143
},
{
"epoch": 0.08865118605590719,
"grad_norm": 2.5576305389404297,
"learning_rate": 9.910499405201195e-06,
"loss": 0.5488,
"step": 144
},
{
"epoch": 0.08926681929240655,
"grad_norm": 3.037740707397461,
"learning_rate": 9.90861105628817e-06,
"loss": 0.5239,
"step": 145
},
{
"epoch": 0.0898824525289059,
"grad_norm": 2.7151741981506348,
"learning_rate": 9.906703177606149e-06,
"loss": 0.5273,
"step": 146
},
{
"epoch": 0.09049808576540526,
"grad_norm": 2.5526156425476074,
"learning_rate": 9.904775776745959e-06,
"loss": 0.509,
"step": 147
},
{
"epoch": 0.09111371900190461,
"grad_norm": 2.4451942443847656,
"learning_rate": 9.902828861376101e-06,
"loss": 0.5146,
"step": 148
},
{
"epoch": 0.09172935223840398,
"grad_norm": 2.4070897102355957,
"learning_rate": 9.900862439242719e-06,
"loss": 0.5187,
"step": 149
},
{
"epoch": 0.09234498547490333,
"grad_norm": 2.5667154788970947,
"learning_rate": 9.898876518169572e-06,
"loss": 0.5231,
"step": 150
},
{
"epoch": 0.09296061871140268,
"grad_norm": 2.5709125995635986,
"learning_rate": 9.896871106057989e-06,
"loss": 0.5416,
"step": 151
},
{
"epoch": 0.09357625194790203,
"grad_norm": 2.5680558681488037,
"learning_rate": 9.894846210886856e-06,
"loss": 0.5211,
"step": 152
},
{
"epoch": 0.09419188518440139,
"grad_norm": 2.598843574523926,
"learning_rate": 9.892801840712576e-06,
"loss": 0.5311,
"step": 153
},
{
"epoch": 0.09480751842090075,
"grad_norm": 2.8312110900878906,
"learning_rate": 9.890738003669029e-06,
"loss": 0.5565,
"step": 154
},
{
"epoch": 0.0954231516574001,
"grad_norm": 2.64947247505188,
"learning_rate": 9.888654707967556e-06,
"loss": 0.5304,
"step": 155
},
{
"epoch": 0.09603878489389946,
"grad_norm": 2.66739559173584,
"learning_rate": 9.88655196189691e-06,
"loss": 0.5197,
"step": 156
},
{
"epoch": 0.09665441813039881,
"grad_norm": 2.781301259994507,
"learning_rate": 9.884429773823238e-06,
"loss": 0.5929,
"step": 157
},
{
"epoch": 0.09727005136689817,
"grad_norm": 2.493687152862549,
"learning_rate": 9.882288152190039e-06,
"loss": 0.4982,
"step": 158
},
{
"epoch": 0.09788568460339753,
"grad_norm": 2.649986982345581,
"learning_rate": 9.880127105518122e-06,
"loss": 0.5235,
"step": 159
},
{
"epoch": 0.09850131783989688,
"grad_norm": 2.764665365219116,
"learning_rate": 9.877946642405598e-06,
"loss": 0.5313,
"step": 160
},
{
"epoch": 0.09911695107639623,
"grad_norm": 2.6386075019836426,
"learning_rate": 9.875746771527817e-06,
"loss": 0.5328,
"step": 161
},
{
"epoch": 0.0997325843128956,
"grad_norm": 2.810497999191284,
"learning_rate": 9.873527501637352e-06,
"loss": 0.5359,
"step": 162
},
{
"epoch": 0.10034821754939495,
"grad_norm": 2.4889254570007324,
"learning_rate": 9.871288841563956e-06,
"loss": 0.5219,
"step": 163
},
{
"epoch": 0.1009638507858943,
"grad_norm": 2.7173171043395996,
"learning_rate": 9.869030800214531e-06,
"loss": 0.5609,
"step": 164
},
{
"epoch": 0.10157948402239365,
"grad_norm": 2.800018548965454,
"learning_rate": 9.866753386573091e-06,
"loss": 0.5286,
"step": 165
},
{
"epoch": 0.10219511725889302,
"grad_norm": 2.4725608825683594,
"learning_rate": 9.864456609700726e-06,
"loss": 0.5079,
"step": 166
},
{
"epoch": 0.10281075049539237,
"grad_norm": 2.834932327270508,
"learning_rate": 9.86214047873556e-06,
"loss": 0.5567,
"step": 167
},
{
"epoch": 0.10342638373189172,
"grad_norm": 2.691657066345215,
"learning_rate": 9.859805002892733e-06,
"loss": 0.5305,
"step": 168
},
{
"epoch": 0.10404201696839108,
"grad_norm": 2.373129367828369,
"learning_rate": 9.857450191464337e-06,
"loss": 0.5059,
"step": 169
},
{
"epoch": 0.10465765020489044,
"grad_norm": 2.8620853424072266,
"learning_rate": 9.855076053819409e-06,
"loss": 0.5431,
"step": 170
},
{
"epoch": 0.1052732834413898,
"grad_norm": 2.4922218322753906,
"learning_rate": 9.852682599403867e-06,
"loss": 0.5163,
"step": 171
},
{
"epoch": 0.10588891667788915,
"grad_norm": 2.6138806343078613,
"learning_rate": 9.85026983774049e-06,
"loss": 0.5466,
"step": 172
},
{
"epoch": 0.1065045499143885,
"grad_norm": 2.4458065032958984,
"learning_rate": 9.847837778428873e-06,
"loss": 0.5397,
"step": 173
},
{
"epoch": 0.10712018315088787,
"grad_norm": 2.5585734844207764,
"learning_rate": 9.84538643114539e-06,
"loss": 0.5771,
"step": 174
},
{
"epoch": 0.10773581638738722,
"grad_norm": 2.4674112796783447,
"learning_rate": 9.842915805643156e-06,
"loss": 0.5292,
"step": 175
},
{
"epoch": 0.10835144962388657,
"grad_norm": 2.4433512687683105,
"learning_rate": 9.840425911751987e-06,
"loss": 0.5505,
"step": 176
},
{
"epoch": 0.10896708286038592,
"grad_norm": 2.528373956680298,
"learning_rate": 9.837916759378363e-06,
"loss": 0.5325,
"step": 177
},
{
"epoch": 0.10958271609688527,
"grad_norm": 2.3507394790649414,
"learning_rate": 9.835388358505383e-06,
"loss": 0.5224,
"step": 178
},
{
"epoch": 0.11019834933338464,
"grad_norm": 2.5113093852996826,
"learning_rate": 9.832840719192737e-06,
"loss": 0.5798,
"step": 179
},
{
"epoch": 0.11081398256988399,
"grad_norm": 2.4653539657592773,
"learning_rate": 9.830273851576651e-06,
"loss": 0.5304,
"step": 180
},
{
"epoch": 0.11142961580638334,
"grad_norm": 2.6374294757843018,
"learning_rate": 9.827687765869859e-06,
"loss": 0.524,
"step": 181
},
{
"epoch": 0.1120452490428827,
"grad_norm": 2.344590425491333,
"learning_rate": 9.825082472361558e-06,
"loss": 0.5064,
"step": 182
},
{
"epoch": 0.11266088227938206,
"grad_norm": 2.315798759460449,
"learning_rate": 9.822457981417362e-06,
"loss": 0.5046,
"step": 183
},
{
"epoch": 0.11327651551588142,
"grad_norm": 2.5081753730773926,
"learning_rate": 9.819814303479268e-06,
"loss": 0.4975,
"step": 184
},
{
"epoch": 0.11389214875238077,
"grad_norm": 2.3528170585632324,
"learning_rate": 9.817151449065612e-06,
"loss": 0.5297,
"step": 185
},
{
"epoch": 0.11450778198888012,
"grad_norm": 2.668283462524414,
"learning_rate": 9.814469428771028e-06,
"loss": 0.5598,
"step": 186
},
{
"epoch": 0.11512341522537949,
"grad_norm": 2.670201539993286,
"learning_rate": 9.811768253266401e-06,
"loss": 0.4961,
"step": 187
},
{
"epoch": 0.11573904846187884,
"grad_norm": 2.6510801315307617,
"learning_rate": 9.809047933298834e-06,
"loss": 0.5335,
"step": 188
},
{
"epoch": 0.11635468169837819,
"grad_norm": 2.9253435134887695,
"learning_rate": 9.806308479691595e-06,
"loss": 0.5289,
"step": 189
},
{
"epoch": 0.11697031493487754,
"grad_norm": 2.3529560565948486,
"learning_rate": 9.803549903344081e-06,
"loss": 0.5525,
"step": 190
},
{
"epoch": 0.11758594817137691,
"grad_norm": 2.510202407836914,
"learning_rate": 9.80077221523177e-06,
"loss": 0.4958,
"step": 191
},
{
"epoch": 0.11820158140787626,
"grad_norm": 2.914436101913452,
"learning_rate": 9.79797542640618e-06,
"loss": 0.5495,
"step": 192
},
{
"epoch": 0.11881721464437561,
"grad_norm": 2.32578444480896,
"learning_rate": 9.79515954799483e-06,
"loss": 0.5378,
"step": 193
},
{
"epoch": 0.11943284788087496,
"grad_norm": 2.4786176681518555,
"learning_rate": 9.792324591201179e-06,
"loss": 0.5331,
"step": 194
},
{
"epoch": 0.12004848111737433,
"grad_norm": 2.7119698524475098,
"learning_rate": 9.789470567304604e-06,
"loss": 0.5563,
"step": 195
},
{
"epoch": 0.12066411435387368,
"grad_norm": 2.453625202178955,
"learning_rate": 9.786597487660336e-06,
"loss": 0.53,
"step": 196
},
{
"epoch": 0.12127974759037304,
"grad_norm": 2.4620349407196045,
"learning_rate": 9.78370536369943e-06,
"loss": 0.543,
"step": 197
},
{
"epoch": 0.12189538082687239,
"grad_norm": 2.302342414855957,
"learning_rate": 9.780794206928704e-06,
"loss": 0.5526,
"step": 198
},
{
"epoch": 0.12251101406337174,
"grad_norm": 2.308497905731201,
"learning_rate": 9.777864028930705e-06,
"loss": 0.4909,
"step": 199
},
{
"epoch": 0.1231266472998711,
"grad_norm": 2.459630250930786,
"learning_rate": 9.774914841363661e-06,
"loss": 0.495,
"step": 200
},
{
"epoch": 0.1231266472998711,
"eval_loss": 0.51694256067276,
"eval_runtime": 119.7286,
"eval_samples_per_second": 35.088,
"eval_steps_per_second": 4.393,
"step": 200
},
{
"epoch": 0.12374228053637046,
"grad_norm": 2.3562207221984863,
"learning_rate": 9.771946655961431e-06,
"loss": 0.5513,
"step": 201
},
{
"epoch": 0.12435791377286981,
"grad_norm": 2.693406343460083,
"learning_rate": 9.768959484533461e-06,
"loss": 0.5348,
"step": 202
},
{
"epoch": 0.12497354700936916,
"grad_norm": 2.5745203495025635,
"learning_rate": 9.765953338964736e-06,
"loss": 0.5027,
"step": 203
},
{
"epoch": 0.12558918024586851,
"grad_norm": 2.345874786376953,
"learning_rate": 9.762928231215731e-06,
"loss": 0.5171,
"step": 204
},
{
"epoch": 0.12620481348236787,
"grad_norm": 2.9996278285980225,
"learning_rate": 9.75988417332237e-06,
"loss": 0.54,
"step": 205
},
{
"epoch": 0.12682044671886725,
"grad_norm": 2.7544898986816406,
"learning_rate": 9.756821177395969e-06,
"loss": 0.5506,
"step": 206
},
{
"epoch": 0.1274360799553666,
"grad_norm": 2.560554265975952,
"learning_rate": 9.753739255623193e-06,
"loss": 0.537,
"step": 207
},
{
"epoch": 0.12805171319186595,
"grad_norm": 2.4070980548858643,
"learning_rate": 9.750638420266008e-06,
"loss": 0.5257,
"step": 208
},
{
"epoch": 0.1286673464283653,
"grad_norm": 2.6485419273376465,
"learning_rate": 9.747518683661632e-06,
"loss": 0.5227,
"step": 209
},
{
"epoch": 0.12928297966486466,
"grad_norm": 2.5590012073516846,
"learning_rate": 9.744380058222483e-06,
"loss": 0.5281,
"step": 210
},
{
"epoch": 0.129898612901364,
"grad_norm": 2.250981330871582,
"learning_rate": 9.741222556436132e-06,
"loss": 0.5149,
"step": 211
},
{
"epoch": 0.13051424613786336,
"grad_norm": 2.4137957096099854,
"learning_rate": 9.738046190865254e-06,
"loss": 0.5331,
"step": 212
},
{
"epoch": 0.1311298793743627,
"grad_norm": 2.0661706924438477,
"learning_rate": 9.734850974147573e-06,
"loss": 0.497,
"step": 213
},
{
"epoch": 0.1317455126108621,
"grad_norm": 2.2877936363220215,
"learning_rate": 9.731636918995821e-06,
"loss": 0.4819,
"step": 214
},
{
"epoch": 0.13236114584736144,
"grad_norm": 2.485813856124878,
"learning_rate": 9.72840403819768e-06,
"loss": 0.4859,
"step": 215
},
{
"epoch": 0.1329767790838608,
"grad_norm": 2.2839794158935547,
"learning_rate": 9.72515234461573e-06,
"loss": 0.4996,
"step": 216
},
{
"epoch": 0.13359241232036015,
"grad_norm": 2.558027744293213,
"learning_rate": 9.721881851187406e-06,
"loss": 0.5171,
"step": 217
},
{
"epoch": 0.1342080455568595,
"grad_norm": 2.5948326587677,
"learning_rate": 9.718592570924938e-06,
"loss": 0.5091,
"step": 218
},
{
"epoch": 0.13482367879335885,
"grad_norm": 2.0874948501586914,
"learning_rate": 9.715284516915303e-06,
"loss": 0.5083,
"step": 219
},
{
"epoch": 0.1354393120298582,
"grad_norm": 2.195895195007324,
"learning_rate": 9.711957702320176e-06,
"loss": 0.5147,
"step": 220
},
{
"epoch": 0.13605494526635756,
"grad_norm": 2.4264023303985596,
"learning_rate": 9.708612140375867e-06,
"loss": 0.5375,
"step": 221
},
{
"epoch": 0.1366705785028569,
"grad_norm": 2.349743604660034,
"learning_rate": 9.705247844393284e-06,
"loss": 0.5058,
"step": 222
},
{
"epoch": 0.1372862117393563,
"grad_norm": 2.254120111465454,
"learning_rate": 9.701864827757868e-06,
"loss": 0.52,
"step": 223
},
{
"epoch": 0.13790184497585564,
"grad_norm": 2.2042782306671143,
"learning_rate": 9.698463103929542e-06,
"loss": 0.5309,
"step": 224
},
{
"epoch": 0.138517478212355,
"grad_norm": 2.114847183227539,
"learning_rate": 9.695042686442662e-06,
"loss": 0.4971,
"step": 225
},
{
"epoch": 0.13913311144885435,
"grad_norm": 2.7704129219055176,
"learning_rate": 9.691603588905956e-06,
"loss": 0.4993,
"step": 226
},
{
"epoch": 0.1397487446853537,
"grad_norm": 2.32524037361145,
"learning_rate": 9.688145825002475e-06,
"loss": 0.532,
"step": 227
},
{
"epoch": 0.14036437792185305,
"grad_norm": 2.225130319595337,
"learning_rate": 9.684669408489542e-06,
"loss": 0.5176,
"step": 228
},
{
"epoch": 0.1409800111583524,
"grad_norm": 2.4317879676818848,
"learning_rate": 9.681174353198687e-06,
"loss": 0.5321,
"step": 229
},
{
"epoch": 0.14159564439485176,
"grad_norm": 2.2843475341796875,
"learning_rate": 9.6776606730356e-06,
"loss": 0.5326,
"step": 230
},
{
"epoch": 0.14221127763135114,
"grad_norm": 2.269463062286377,
"learning_rate": 9.674128381980073e-06,
"loss": 0.5145,
"step": 231
},
{
"epoch": 0.1428269108678505,
"grad_norm": 2.3606982231140137,
"learning_rate": 9.670577494085945e-06,
"loss": 0.5358,
"step": 232
},
{
"epoch": 0.14344254410434984,
"grad_norm": 2.1334755420684814,
"learning_rate": 9.667008023481045e-06,
"loss": 0.5177,
"step": 233
},
{
"epoch": 0.1440581773408492,
"grad_norm": 2.2030303478240967,
"learning_rate": 9.663419984367139e-06,
"loss": 0.5027,
"step": 234
},
{
"epoch": 0.14467381057734854,
"grad_norm": 2.3622584342956543,
"learning_rate": 9.659813391019867e-06,
"loss": 0.5297,
"step": 235
},
{
"epoch": 0.1452894438138479,
"grad_norm": 2.399989128112793,
"learning_rate": 9.656188257788694e-06,
"loss": 0.5509,
"step": 236
},
{
"epoch": 0.14590507705034725,
"grad_norm": 2.143125057220459,
"learning_rate": 9.652544599096846e-06,
"loss": 0.5156,
"step": 237
},
{
"epoch": 0.1465207102868466,
"grad_norm": 1.8969368934631348,
"learning_rate": 9.648882429441258e-06,
"loss": 0.4808,
"step": 238
},
{
"epoch": 0.14713634352334598,
"grad_norm": 2.2974019050598145,
"learning_rate": 9.645201763392513e-06,
"loss": 0.5358,
"step": 239
},
{
"epoch": 0.14775197675984533,
"grad_norm": 2.315643787384033,
"learning_rate": 9.641502615594789e-06,
"loss": 0.5157,
"step": 240
},
{
"epoch": 0.14836760999634468,
"grad_norm": 2.4144253730773926,
"learning_rate": 9.637785000765789e-06,
"loss": 0.517,
"step": 241
},
{
"epoch": 0.14898324323284404,
"grad_norm": 2.0618128776550293,
"learning_rate": 9.634048933696697e-06,
"loss": 0.4799,
"step": 242
},
{
"epoch": 0.1495988764693434,
"grad_norm": 2.4816224575042725,
"learning_rate": 9.630294429252112e-06,
"loss": 0.5059,
"step": 243
},
{
"epoch": 0.15021450970584274,
"grad_norm": 2.3560738563537598,
"learning_rate": 9.626521502369984e-06,
"loss": 0.5393,
"step": 244
},
{
"epoch": 0.1508301429423421,
"grad_norm": 2.447284698486328,
"learning_rate": 9.622730168061568e-06,
"loss": 0.5179,
"step": 245
},
{
"epoch": 0.15144577617884145,
"grad_norm": 2.3853726387023926,
"learning_rate": 9.618920441411346e-06,
"loss": 0.4957,
"step": 246
},
{
"epoch": 0.1520614094153408,
"grad_norm": 2.0928761959075928,
"learning_rate": 9.615092337576987e-06,
"loss": 0.4782,
"step": 247
},
{
"epoch": 0.15267704265184018,
"grad_norm": 2.4348087310791016,
"learning_rate": 9.611245871789273e-06,
"loss": 0.5522,
"step": 248
},
{
"epoch": 0.15329267588833953,
"grad_norm": 2.5106360912323,
"learning_rate": 9.60738105935204e-06,
"loss": 0.5157,
"step": 249
},
{
"epoch": 0.15390830912483888,
"grad_norm": 2.3466601371765137,
"learning_rate": 9.603497915642122e-06,
"loss": 0.5197,
"step": 250
},
{
"epoch": 0.15452394236133823,
"grad_norm": 2.5545997619628906,
"learning_rate": 9.599596456109286e-06,
"loss": 0.4927,
"step": 251
},
{
"epoch": 0.1551395755978376,
"grad_norm": 2.521439790725708,
"learning_rate": 9.595676696276173e-06,
"loss": 0.5541,
"step": 252
},
{
"epoch": 0.15575520883433694,
"grad_norm": 2.3179068565368652,
"learning_rate": 9.591738651738235e-06,
"loss": 0.4994,
"step": 253
},
{
"epoch": 0.1563708420708363,
"grad_norm": 2.2857041358947754,
"learning_rate": 9.58778233816367e-06,
"loss": 0.537,
"step": 254
},
{
"epoch": 0.15698647530733564,
"grad_norm": 2.6174025535583496,
"learning_rate": 9.583807771293366e-06,
"loss": 0.5298,
"step": 255
},
{
"epoch": 0.15760210854383502,
"grad_norm": 2.400820016860962,
"learning_rate": 9.579814966940833e-06,
"loss": 0.4969,
"step": 256
},
{
"epoch": 0.15821774178033438,
"grad_norm": 2.4716885089874268,
"learning_rate": 9.575803940992143e-06,
"loss": 0.5227,
"step": 257
},
{
"epoch": 0.15883337501683373,
"grad_norm": 2.1826908588409424,
"learning_rate": 9.571774709405866e-06,
"loss": 0.4987,
"step": 258
},
{
"epoch": 0.15944900825333308,
"grad_norm": 2.163158893585205,
"learning_rate": 9.567727288213005e-06,
"loss": 0.5295,
"step": 259
},
{
"epoch": 0.16006464148983243,
"grad_norm": 2.319798469543457,
"learning_rate": 9.563661693516934e-06,
"loss": 0.5388,
"step": 260
},
{
"epoch": 0.16068027472633178,
"grad_norm": 2.828787088394165,
"learning_rate": 9.559577941493334e-06,
"loss": 0.5277,
"step": 261
},
{
"epoch": 0.16129590796283114,
"grad_norm": 2.2098703384399414,
"learning_rate": 9.55547604839013e-06,
"loss": 0.5134,
"step": 262
},
{
"epoch": 0.1619115411993305,
"grad_norm": 2.636500120162964,
"learning_rate": 9.551356030527417e-06,
"loss": 0.5242,
"step": 263
},
{
"epoch": 0.16252717443582984,
"grad_norm": 2.5841751098632812,
"learning_rate": 9.547217904297411e-06,
"loss": 0.4996,
"step": 264
},
{
"epoch": 0.16314280767232922,
"grad_norm": 2.4081389904022217,
"learning_rate": 9.543061686164374e-06,
"loss": 0.5207,
"step": 265
},
{
"epoch": 0.16375844090882857,
"grad_norm": 2.21783447265625,
"learning_rate": 9.538887392664544e-06,
"loss": 0.5077,
"step": 266
},
{
"epoch": 0.16437407414532793,
"grad_norm": 2.636134386062622,
"learning_rate": 9.534695040406082e-06,
"loss": 0.4935,
"step": 267
},
{
"epoch": 0.16498970738182728,
"grad_norm": 2.134760856628418,
"learning_rate": 9.530484646068996e-06,
"loss": 0.5007,
"step": 268
},
{
"epoch": 0.16560534061832663,
"grad_norm": 1.9881904125213623,
"learning_rate": 9.526256226405075e-06,
"loss": 0.5029,
"step": 269
},
{
"epoch": 0.16622097385482598,
"grad_norm": 2.2205843925476074,
"learning_rate": 9.52200979823783e-06,
"loss": 0.4938,
"step": 270
},
{
"epoch": 0.16683660709132533,
"grad_norm": 2.17769455909729,
"learning_rate": 9.517745378462417e-06,
"loss": 0.5203,
"step": 271
},
{
"epoch": 0.1674522403278247,
"grad_norm": 2.097182273864746,
"learning_rate": 9.513462984045577e-06,
"loss": 0.5235,
"step": 272
},
{
"epoch": 0.16806787356432407,
"grad_norm": 2.164889097213745,
"learning_rate": 9.50916263202557e-06,
"loss": 0.5092,
"step": 273
},
{
"epoch": 0.16868350680082342,
"grad_norm": 2.035274028778076,
"learning_rate": 9.504844339512096e-06,
"loss": 0.4962,
"step": 274
},
{
"epoch": 0.16929914003732277,
"grad_norm": 2.267477512359619,
"learning_rate": 9.500508123686241e-06,
"loss": 0.5486,
"step": 275
},
{
"epoch": 0.16991477327382212,
"grad_norm": 2.092531204223633,
"learning_rate": 9.496154001800397e-06,
"loss": 0.487,
"step": 276
},
{
"epoch": 0.17053040651032148,
"grad_norm": 2.2290382385253906,
"learning_rate": 9.491781991178203e-06,
"loss": 0.4949,
"step": 277
},
{
"epoch": 0.17114603974682083,
"grad_norm": 2.1672849655151367,
"learning_rate": 9.487392109214468e-06,
"loss": 0.4772,
"step": 278
},
{
"epoch": 0.17176167298332018,
"grad_norm": 2.4247779846191406,
"learning_rate": 9.482984373375105e-06,
"loss": 0.5293,
"step": 279
},
{
"epoch": 0.17237730621981953,
"grad_norm": 2.375716209411621,
"learning_rate": 9.478558801197065e-06,
"loss": 0.5297,
"step": 280
},
{
"epoch": 0.1729929394563189,
"grad_norm": 2.107602834701538,
"learning_rate": 9.474115410288263e-06,
"loss": 0.464,
"step": 281
},
{
"epoch": 0.17360857269281826,
"grad_norm": 2.328535556793213,
"learning_rate": 9.469654218327503e-06,
"loss": 0.5001,
"step": 282
},
{
"epoch": 0.17422420592931762,
"grad_norm": 2.2215569019317627,
"learning_rate": 9.465175243064428e-06,
"loss": 0.4842,
"step": 283
},
{
"epoch": 0.17483983916581697,
"grad_norm": 2.3649046421051025,
"learning_rate": 9.460678502319419e-06,
"loss": 0.5121,
"step": 284
},
{
"epoch": 0.17545547240231632,
"grad_norm": 2.451415538787842,
"learning_rate": 9.456164013983546e-06,
"loss": 0.5342,
"step": 285
},
{
"epoch": 0.17607110563881567,
"grad_norm": 2.2832226753234863,
"learning_rate": 9.451631796018495e-06,
"loss": 0.5329,
"step": 286
},
{
"epoch": 0.17668673887531502,
"grad_norm": 2.2211861610412598,
"learning_rate": 9.44708186645649e-06,
"loss": 0.54,
"step": 287
},
{
"epoch": 0.17730237211181438,
"grad_norm": 2.234879493713379,
"learning_rate": 9.442514243400218e-06,
"loss": 0.5407,
"step": 288
},
{
"epoch": 0.17791800534831373,
"grad_norm": 2.016080617904663,
"learning_rate": 9.437928945022772e-06,
"loss": 0.5114,
"step": 289
},
{
"epoch": 0.1785336385848131,
"grad_norm": 2.1441290378570557,
"learning_rate": 9.433325989567562e-06,
"loss": 0.512,
"step": 290
},
{
"epoch": 0.17914927182131246,
"grad_norm": 2.112748384475708,
"learning_rate": 9.428705395348254e-06,
"loss": 0.5141,
"step": 291
},
{
"epoch": 0.1797649050578118,
"grad_norm": 1.9823687076568604,
"learning_rate": 9.424067180748692e-06,
"loss": 0.5006,
"step": 292
},
{
"epoch": 0.18038053829431117,
"grad_norm": 2.2916078567504883,
"learning_rate": 9.419411364222826e-06,
"loss": 0.5134,
"step": 293
},
{
"epoch": 0.18099617153081052,
"grad_norm": 2.619990110397339,
"learning_rate": 9.414737964294636e-06,
"loss": 0.4893,
"step": 294
},
{
"epoch": 0.18161180476730987,
"grad_norm": 2.1955621242523193,
"learning_rate": 9.410046999558062e-06,
"loss": 0.4994,
"step": 295
},
{
"epoch": 0.18222743800380922,
"grad_norm": 2.555013656616211,
"learning_rate": 9.40533848867693e-06,
"loss": 0.5372,
"step": 296
},
{
"epoch": 0.18284307124030857,
"grad_norm": 2.11368727684021,
"learning_rate": 9.400612450384874e-06,
"loss": 0.4894,
"step": 297
},
{
"epoch": 0.18345870447680795,
"grad_norm": 2.3000552654266357,
"learning_rate": 9.395868903485269e-06,
"loss": 0.5129,
"step": 298
},
{
"epoch": 0.1840743377133073,
"grad_norm": 2.0731847286224365,
"learning_rate": 9.391107866851143e-06,
"loss": 0.4828,
"step": 299
},
{
"epoch": 0.18468997094980666,
"grad_norm": 2.25274395942688,
"learning_rate": 9.386329359425117e-06,
"loss": 0.5173,
"step": 300
},
{
"epoch": 0.18468997094980666,
"eval_loss": 0.5033955574035645,
"eval_runtime": 119.4386,
"eval_samples_per_second": 35.173,
"eval_steps_per_second": 4.404,
"step": 300
},
{
"epoch": 0.185305604186306,
"grad_norm": 2.0389397144317627,
"learning_rate": 9.381533400219319e-06,
"loss": 0.4855,
"step": 301
},
{
"epoch": 0.18592123742280536,
"grad_norm": 2.3065874576568604,
"learning_rate": 9.376720008315312e-06,
"loss": 0.4963,
"step": 302
},
{
"epoch": 0.18653687065930472,
"grad_norm": 2.1684486865997314,
"learning_rate": 9.37188920286402e-06,
"loss": 0.5139,
"step": 303
},
{
"epoch": 0.18715250389580407,
"grad_norm": 2.176117181777954,
"learning_rate": 9.36704100308565e-06,
"loss": 0.4582,
"step": 304
},
{
"epoch": 0.18776813713230342,
"grad_norm": 2.326688766479492,
"learning_rate": 9.36217542826961e-06,
"loss": 0.5286,
"step": 305
},
{
"epoch": 0.18838377036880277,
"grad_norm": 2.441455125808716,
"learning_rate": 9.357292497774447e-06,
"loss": 0.5235,
"step": 306
},
{
"epoch": 0.18899940360530215,
"grad_norm": 2.0674080848693848,
"learning_rate": 9.352392231027752e-06,
"loss": 0.4739,
"step": 307
},
{
"epoch": 0.1896150368418015,
"grad_norm": 2.186354637145996,
"learning_rate": 9.347474647526095e-06,
"loss": 0.5179,
"step": 308
},
{
"epoch": 0.19023067007830086,
"grad_norm": 2.2529890537261963,
"learning_rate": 9.342539766834945e-06,
"loss": 0.4914,
"step": 309
},
{
"epoch": 0.1908463033148002,
"grad_norm": 2.005094051361084,
"learning_rate": 9.337587608588588e-06,
"loss": 0.4972,
"step": 310
},
{
"epoch": 0.19146193655129956,
"grad_norm": 2.2308027744293213,
"learning_rate": 9.332618192490054e-06,
"loss": 0.4884,
"step": 311
},
{
"epoch": 0.1920775697877989,
"grad_norm": 2.4357378482818604,
"learning_rate": 9.327631538311036e-06,
"loss": 0.5289,
"step": 312
},
{
"epoch": 0.19269320302429827,
"grad_norm": 2.0533759593963623,
"learning_rate": 9.322627665891807e-06,
"loss": 0.4723,
"step": 313
},
{
"epoch": 0.19330883626079762,
"grad_norm": 2.2002153396606445,
"learning_rate": 9.317606595141156e-06,
"loss": 0.5384,
"step": 314
},
{
"epoch": 0.193924469497297,
"grad_norm": 2.239171266555786,
"learning_rate": 9.312568346036288e-06,
"loss": 0.524,
"step": 315
},
{
"epoch": 0.19454010273379635,
"grad_norm": 2.389228343963623,
"learning_rate": 9.307512938622762e-06,
"loss": 0.5396,
"step": 316
},
{
"epoch": 0.1951557359702957,
"grad_norm": 1.962758183479309,
"learning_rate": 9.302440393014402e-06,
"loss": 0.4919,
"step": 317
},
{
"epoch": 0.19577136920679505,
"grad_norm": 2.2056610584259033,
"learning_rate": 9.29735072939322e-06,
"loss": 0.5253,
"step": 318
},
{
"epoch": 0.1963870024432944,
"grad_norm": 2.3771414756774902,
"learning_rate": 9.292243968009332e-06,
"loss": 0.5048,
"step": 319
},
{
"epoch": 0.19700263567979376,
"grad_norm": 2.1659281253814697,
"learning_rate": 9.287120129180884e-06,
"loss": 0.4925,
"step": 320
},
{
"epoch": 0.1976182689162931,
"grad_norm": 2.415493965148926,
"learning_rate": 9.281979233293966e-06,
"loss": 0.5006,
"step": 321
},
{
"epoch": 0.19823390215279246,
"grad_norm": 2.220350980758667,
"learning_rate": 9.276821300802535e-06,
"loss": 0.5117,
"step": 322
},
{
"epoch": 0.19884953538929184,
"grad_norm": 2.1746761798858643,
"learning_rate": 9.271646352228324e-06,
"loss": 0.5036,
"step": 323
},
{
"epoch": 0.1994651686257912,
"grad_norm": 2.0392343997955322,
"learning_rate": 9.266454408160779e-06,
"loss": 0.4946,
"step": 324
},
{
"epoch": 0.20008080186229055,
"grad_norm": 2.0878195762634277,
"learning_rate": 9.261245489256956e-06,
"loss": 0.5064,
"step": 325
},
{
"epoch": 0.2006964350987899,
"grad_norm": 2.1423075199127197,
"learning_rate": 9.25601961624145e-06,
"loss": 0.5088,
"step": 326
},
{
"epoch": 0.20131206833528925,
"grad_norm": 2.3595194816589355,
"learning_rate": 9.250776809906313e-06,
"loss": 0.5523,
"step": 327
},
{
"epoch": 0.2019277015717886,
"grad_norm": 2.1990396976470947,
"learning_rate": 9.24551709111097e-06,
"loss": 0.499,
"step": 328
},
{
"epoch": 0.20254333480828796,
"grad_norm": 2.1120452880859375,
"learning_rate": 9.24024048078213e-06,
"loss": 0.5174,
"step": 329
},
{
"epoch": 0.2031589680447873,
"grad_norm": 2.780289649963379,
"learning_rate": 9.234946999913717e-06,
"loss": 0.5507,
"step": 330
},
{
"epoch": 0.20377460128128666,
"grad_norm": 2.629866123199463,
"learning_rate": 9.229636669566769e-06,
"loss": 0.5358,
"step": 331
},
{
"epoch": 0.20439023451778604,
"grad_norm": 2.216796636581421,
"learning_rate": 9.224309510869364e-06,
"loss": 0.4949,
"step": 332
},
{
"epoch": 0.2050058677542854,
"grad_norm": 2.279585838317871,
"learning_rate": 9.218965545016538e-06,
"loss": 0.4953,
"step": 333
},
{
"epoch": 0.20562150099078474,
"grad_norm": 2.3182451725006104,
"learning_rate": 9.213604793270196e-06,
"loss": 0.5338,
"step": 334
},
{
"epoch": 0.2062371342272841,
"grad_norm": 2.1451542377471924,
"learning_rate": 9.208227276959028e-06,
"loss": 0.5004,
"step": 335
},
{
"epoch": 0.20685276746378345,
"grad_norm": 2.218402147293091,
"learning_rate": 9.202833017478421e-06,
"loss": 0.5257,
"step": 336
},
{
"epoch": 0.2074684007002828,
"grad_norm": 2.235485076904297,
"learning_rate": 9.197422036290386e-06,
"loss": 0.5282,
"step": 337
},
{
"epoch": 0.20808403393678215,
"grad_norm": 2.1318018436431885,
"learning_rate": 9.191994354923459e-06,
"loss": 0.4672,
"step": 338
},
{
"epoch": 0.2086996671732815,
"grad_norm": 2.1314377784729004,
"learning_rate": 9.186549994972618e-06,
"loss": 0.5151,
"step": 339
},
{
"epoch": 0.20931530040978089,
"grad_norm": 2.10489821434021,
"learning_rate": 9.181088978099203e-06,
"loss": 0.4928,
"step": 340
},
{
"epoch": 0.20993093364628024,
"grad_norm": 2.1959035396575928,
"learning_rate": 9.17561132603083e-06,
"loss": 0.4913,
"step": 341
},
{
"epoch": 0.2105465668827796,
"grad_norm": 2.1556055545806885,
"learning_rate": 9.170117060561296e-06,
"loss": 0.4652,
"step": 342
},
{
"epoch": 0.21116220011927894,
"grad_norm": 2.0064280033111572,
"learning_rate": 9.164606203550498e-06,
"loss": 0.4924,
"step": 343
},
{
"epoch": 0.2117778333557783,
"grad_norm": 2.0715909004211426,
"learning_rate": 9.159078776924347e-06,
"loss": 0.4939,
"step": 344
},
{
"epoch": 0.21239346659227765,
"grad_norm": 1.9962373971939087,
"learning_rate": 9.153534802674675e-06,
"loss": 0.5172,
"step": 345
},
{
"epoch": 0.213009099828777,
"grad_norm": 2.0290207862854004,
"learning_rate": 9.147974302859158e-06,
"loss": 0.4953,
"step": 346
},
{
"epoch": 0.21362473306527635,
"grad_norm": 2.3874406814575195,
"learning_rate": 9.142397299601216e-06,
"loss": 0.4644,
"step": 347
},
{
"epoch": 0.21424036630177573,
"grad_norm": 2.1734139919281006,
"learning_rate": 9.136803815089936e-06,
"loss": 0.4563,
"step": 348
},
{
"epoch": 0.21485599953827508,
"grad_norm": 2.2753751277923584,
"learning_rate": 9.131193871579975e-06,
"loss": 0.4988,
"step": 349
},
{
"epoch": 0.21547163277477444,
"grad_norm": 2.179553508758545,
"learning_rate": 9.125567491391476e-06,
"loss": 0.5437,
"step": 350
},
{
"epoch": 0.2160872660112738,
"grad_norm": 1.9942052364349365,
"learning_rate": 9.119924696909979e-06,
"loss": 0.4784,
"step": 351
},
{
"epoch": 0.21670289924777314,
"grad_norm": 2.176542043685913,
"learning_rate": 9.114265510586329e-06,
"loss": 0.491,
"step": 352
},
{
"epoch": 0.2173185324842725,
"grad_norm": 2.19527268409729,
"learning_rate": 9.108589954936592e-06,
"loss": 0.5015,
"step": 353
},
{
"epoch": 0.21793416572077184,
"grad_norm": 2.237499475479126,
"learning_rate": 9.102898052541959e-06,
"loss": 0.5393,
"step": 354
},
{
"epoch": 0.2185497989572712,
"grad_norm": 2.2937676906585693,
"learning_rate": 9.09718982604866e-06,
"loss": 0.5236,
"step": 355
},
{
"epoch": 0.21916543219377055,
"grad_norm": 2.004369020462036,
"learning_rate": 9.091465298167876e-06,
"loss": 0.4828,
"step": 356
},
{
"epoch": 0.21978106543026993,
"grad_norm": 2.072390556335449,
"learning_rate": 9.085724491675642e-06,
"loss": 0.5532,
"step": 357
},
{
"epoch": 0.22039669866676928,
"grad_norm": 2.0293619632720947,
"learning_rate": 9.079967429412766e-06,
"loss": 0.4947,
"step": 358
},
{
"epoch": 0.22101233190326863,
"grad_norm": 2.168522357940674,
"learning_rate": 9.074194134284726e-06,
"loss": 0.5111,
"step": 359
},
{
"epoch": 0.22162796513976799,
"grad_norm": 1.933297872543335,
"learning_rate": 9.068404629261587e-06,
"loss": 0.466,
"step": 360
},
{
"epoch": 0.22224359837626734,
"grad_norm": 1.9404878616333008,
"learning_rate": 9.062598937377911e-06,
"loss": 0.4857,
"step": 361
},
{
"epoch": 0.2228592316127667,
"grad_norm": 2.2384283542633057,
"learning_rate": 9.05677708173266e-06,
"loss": 0.5159,
"step": 362
},
{
"epoch": 0.22347486484926604,
"grad_norm": 2.2054219245910645,
"learning_rate": 9.050939085489104e-06,
"loss": 0.5122,
"step": 363
},
{
"epoch": 0.2240904980857654,
"grad_norm": 2.036163330078125,
"learning_rate": 9.045084971874738e-06,
"loss": 0.4962,
"step": 364
},
{
"epoch": 0.22470613132226477,
"grad_norm": 2.03003191947937,
"learning_rate": 9.039214764181175e-06,
"loss": 0.4877,
"step": 365
},
{
"epoch": 0.22532176455876413,
"grad_norm": 2.1945431232452393,
"learning_rate": 9.033328485764068e-06,
"loss": 0.4999,
"step": 366
},
{
"epoch": 0.22593739779526348,
"grad_norm": 2.0336525440216064,
"learning_rate": 9.027426160043005e-06,
"loss": 0.4955,
"step": 367
},
{
"epoch": 0.22655303103176283,
"grad_norm": 2.09950590133667,
"learning_rate": 9.021507810501422e-06,
"loss": 0.5286,
"step": 368
},
{
"epoch": 0.22716866426826218,
"grad_norm": 2.2560412883758545,
"learning_rate": 9.01557346068651e-06,
"loss": 0.5208,
"step": 369
},
{
"epoch": 0.22778429750476153,
"grad_norm": 2.1105082035064697,
"learning_rate": 9.00962313420912e-06,
"loss": 0.5187,
"step": 370
},
{
"epoch": 0.2283999307412609,
"grad_norm": 2.1909470558166504,
"learning_rate": 9.003656854743667e-06,
"loss": 0.5056,
"step": 371
},
{
"epoch": 0.22901556397776024,
"grad_norm": 2.144836187362671,
"learning_rate": 8.997674646028044e-06,
"loss": 0.5014,
"step": 372
},
{
"epoch": 0.2296311972142596,
"grad_norm": 2.117741823196411,
"learning_rate": 8.991676531863507e-06,
"loss": 0.489,
"step": 373
},
{
"epoch": 0.23024683045075897,
"grad_norm": 2.0648508071899414,
"learning_rate": 8.985662536114614e-06,
"loss": 0.5018,
"step": 374
},
{
"epoch": 0.23086246368725832,
"grad_norm": 2.022077798843384,
"learning_rate": 8.979632682709093e-06,
"loss": 0.5197,
"step": 375
},
{
"epoch": 0.23147809692375768,
"grad_norm": 2.2208433151245117,
"learning_rate": 8.973586995637778e-06,
"loss": 0.5082,
"step": 376
},
{
"epoch": 0.23209373016025703,
"grad_norm": 2.2770118713378906,
"learning_rate": 8.967525498954488e-06,
"loss": 0.5106,
"step": 377
},
{
"epoch": 0.23270936339675638,
"grad_norm": 2.2035584449768066,
"learning_rate": 8.961448216775955e-06,
"loss": 0.5177,
"step": 378
},
{
"epoch": 0.23332499663325573,
"grad_norm": 2.2252767086029053,
"learning_rate": 8.955355173281709e-06,
"loss": 0.4947,
"step": 379
},
{
"epoch": 0.23394062986975508,
"grad_norm": 2.2952847480773926,
"learning_rate": 8.949246392713986e-06,
"loss": 0.4586,
"step": 380
},
{
"epoch": 0.23455626310625444,
"grad_norm": 2.498413324356079,
"learning_rate": 8.943121899377649e-06,
"loss": 0.5331,
"step": 381
},
{
"epoch": 0.23517189634275382,
"grad_norm": 2.176753520965576,
"learning_rate": 8.936981717640061e-06,
"loss": 0.4954,
"step": 382
},
{
"epoch": 0.23578752957925317,
"grad_norm": 2.187340259552002,
"learning_rate": 8.930825871931012e-06,
"loss": 0.52,
"step": 383
},
{
"epoch": 0.23640316281575252,
"grad_norm": 2.0732364654541016,
"learning_rate": 8.924654386742613e-06,
"loss": 0.5076,
"step": 384
},
{
"epoch": 0.23701879605225187,
"grad_norm": 2.4784514904022217,
"learning_rate": 8.9184672866292e-06,
"loss": 0.4996,
"step": 385
},
{
"epoch": 0.23763442928875123,
"grad_norm": 2.0487194061279297,
"learning_rate": 8.912264596207233e-06,
"loss": 0.4942,
"step": 386
},
{
"epoch": 0.23825006252525058,
"grad_norm": 2.120363473892212,
"learning_rate": 8.906046340155203e-06,
"loss": 0.5164,
"step": 387
},
{
"epoch": 0.23886569576174993,
"grad_norm": 2.0596227645874023,
"learning_rate": 8.899812543213532e-06,
"loss": 0.5168,
"step": 388
},
{
"epoch": 0.23948132899824928,
"grad_norm": 2.095977306365967,
"learning_rate": 8.89356323018447e-06,
"loss": 0.4889,
"step": 389
},
{
"epoch": 0.24009696223474866,
"grad_norm": 2.0541493892669678,
"learning_rate": 8.88729842593201e-06,
"loss": 0.5029,
"step": 390
},
{
"epoch": 0.24071259547124801,
"grad_norm": 1.9384243488311768,
"learning_rate": 8.881018155381766e-06,
"loss": 0.5098,
"step": 391
},
{
"epoch": 0.24132822870774737,
"grad_norm": 2.151761054992676,
"learning_rate": 8.874722443520898e-06,
"loss": 0.4938,
"step": 392
},
{
"epoch": 0.24194386194424672,
"grad_norm": 1.8888019323349,
"learning_rate": 8.868411315398e-06,
"loss": 0.4587,
"step": 393
},
{
"epoch": 0.24255949518074607,
"grad_norm": 1.9799542427062988,
"learning_rate": 8.862084796122998e-06,
"loss": 0.4672,
"step": 394
},
{
"epoch": 0.24317512841724542,
"grad_norm": 2.1893832683563232,
"learning_rate": 8.85574291086706e-06,
"loss": 0.5069,
"step": 395
},
{
"epoch": 0.24379076165374478,
"grad_norm": 2.151860237121582,
"learning_rate": 8.849385684862483e-06,
"loss": 0.4905,
"step": 396
},
{
"epoch": 0.24440639489024413,
"grad_norm": 1.8696808815002441,
"learning_rate": 8.84301314340261e-06,
"loss": 0.4912,
"step": 397
},
{
"epoch": 0.24502202812674348,
"grad_norm": 2.1755058765411377,
"learning_rate": 8.836625311841711e-06,
"loss": 0.5006,
"step": 398
},
{
"epoch": 0.24563766136324286,
"grad_norm": 2.073040008544922,
"learning_rate": 8.83022221559489e-06,
"loss": 0.4933,
"step": 399
},
{
"epoch": 0.2462532945997422,
"grad_norm": 2.1384034156799316,
"learning_rate": 8.823803880137993e-06,
"loss": 0.4981,
"step": 400
},
{
"epoch": 0.2462532945997422,
"eval_loss": 0.49532872438430786,
"eval_runtime": 119.469,
"eval_samples_per_second": 35.164,
"eval_steps_per_second": 4.403,
"step": 400
},
{
"epoch": 0.24686892783624156,
"grad_norm": 2.1016058921813965,
"learning_rate": 8.817370331007488e-06,
"loss": 0.4962,
"step": 401
},
{
"epoch": 0.24748456107274092,
"grad_norm": 2.3278987407684326,
"learning_rate": 8.810921593800377e-06,
"loss": 0.5203,
"step": 402
},
{
"epoch": 0.24810019430924027,
"grad_norm": 1.9771771430969238,
"learning_rate": 8.804457694174093e-06,
"loss": 0.4822,
"step": 403
},
{
"epoch": 0.24871582754573962,
"grad_norm": 2.107381582260132,
"learning_rate": 8.797978657846391e-06,
"loss": 0.4842,
"step": 404
},
{
"epoch": 0.24933146078223897,
"grad_norm": 2.2924184799194336,
"learning_rate": 8.791484510595254e-06,
"loss": 0.506,
"step": 405
},
{
"epoch": 0.24994709401873832,
"grad_norm": 2.1443545818328857,
"learning_rate": 8.784975278258783e-06,
"loss": 0.4708,
"step": 406
},
{
"epoch": 0.2505627272552377,
"grad_norm": 2.0638580322265625,
"learning_rate": 8.7784509867351e-06,
"loss": 0.4986,
"step": 407
},
{
"epoch": 0.25117836049173703,
"grad_norm": 2.3423922061920166,
"learning_rate": 8.77191166198224e-06,
"loss": 0.4741,
"step": 408
},
{
"epoch": 0.2517939937282364,
"grad_norm": 2.2074921131134033,
"learning_rate": 8.765357330018056e-06,
"loss": 0.5047,
"step": 409
},
{
"epoch": 0.25240962696473573,
"grad_norm": 1.9105783700942993,
"learning_rate": 8.758788016920102e-06,
"loss": 0.4464,
"step": 410
},
{
"epoch": 0.2530252602012351,
"grad_norm": 2.2252490520477295,
"learning_rate": 8.752203748825542e-06,
"loss": 0.4925,
"step": 411
},
{
"epoch": 0.2536408934377345,
"grad_norm": 2.2548561096191406,
"learning_rate": 8.745604551931042e-06,
"loss": 0.5135,
"step": 412
},
{
"epoch": 0.2542565266742338,
"grad_norm": 1.8392614126205444,
"learning_rate": 8.73899045249266e-06,
"loss": 0.5312,
"step": 413
},
{
"epoch": 0.2548721599107332,
"grad_norm": 2.1313741207122803,
"learning_rate": 8.732361476825752e-06,
"loss": 0.5049,
"step": 414
},
{
"epoch": 0.2554877931472325,
"grad_norm": 2.245140790939331,
"learning_rate": 8.725717651304856e-06,
"loss": 0.5204,
"step": 415
},
{
"epoch": 0.2561034263837319,
"grad_norm": 2.030898332595825,
"learning_rate": 8.719059002363598e-06,
"loss": 0.4646,
"step": 416
},
{
"epoch": 0.2567190596202312,
"grad_norm": 1.904626727104187,
"learning_rate": 8.71238555649458e-06,
"loss": 0.4824,
"step": 417
},
{
"epoch": 0.2573346928567306,
"grad_norm": 2.06482195854187,
"learning_rate": 8.705697340249275e-06,
"loss": 0.4946,
"step": 418
},
{
"epoch": 0.25795032609322993,
"grad_norm": 1.913489818572998,
"learning_rate": 8.698994380237921e-06,
"loss": 0.46,
"step": 419
},
{
"epoch": 0.2585659593297293,
"grad_norm": 2.09264874458313,
"learning_rate": 8.692276703129421e-06,
"loss": 0.4871,
"step": 420
},
{
"epoch": 0.2591815925662287,
"grad_norm": 2.0330116748809814,
"learning_rate": 8.685544335651226e-06,
"loss": 0.4607,
"step": 421
},
{
"epoch": 0.259797225802728,
"grad_norm": 2.0528652667999268,
"learning_rate": 8.678797304589245e-06,
"loss": 0.457,
"step": 422
},
{
"epoch": 0.2604128590392274,
"grad_norm": 2.1522958278656006,
"learning_rate": 8.672035636787721e-06,
"loss": 0.4952,
"step": 423
},
{
"epoch": 0.2610284922757267,
"grad_norm": 1.9825266599655151,
"learning_rate": 8.665259359149132e-06,
"loss": 0.483,
"step": 424
},
{
"epoch": 0.2616441255122261,
"grad_norm": 2.00447678565979,
"learning_rate": 8.658468498634089e-06,
"loss": 0.4992,
"step": 425
},
{
"epoch": 0.2622597587487254,
"grad_norm": 2.057898759841919,
"learning_rate": 8.651663082261217e-06,
"loss": 0.477,
"step": 426
},
{
"epoch": 0.2628753919852248,
"grad_norm": 2.0360608100891113,
"learning_rate": 8.644843137107058e-06,
"loss": 0.4907,
"step": 427
},
{
"epoch": 0.2634910252217242,
"grad_norm": 2.054442882537842,
"learning_rate": 8.638008690305961e-06,
"loss": 0.4841,
"step": 428
},
{
"epoch": 0.2641066584582235,
"grad_norm": 2.14125919342041,
"learning_rate": 8.631159769049965e-06,
"loss": 0.4952,
"step": 429
},
{
"epoch": 0.2647222916947229,
"grad_norm": 1.8915045261383057,
"learning_rate": 8.62429640058871e-06,
"loss": 0.4932,
"step": 430
},
{
"epoch": 0.2653379249312222,
"grad_norm": 2.1041178703308105,
"learning_rate": 8.617418612229303e-06,
"loss": 0.5157,
"step": 431
},
{
"epoch": 0.2659535581677216,
"grad_norm": 2.169093132019043,
"learning_rate": 8.610526431336235e-06,
"loss": 0.4761,
"step": 432
},
{
"epoch": 0.2665691914042209,
"grad_norm": 2.22904896736145,
"learning_rate": 8.603619885331251e-06,
"loss": 0.4891,
"step": 433
},
{
"epoch": 0.2671848246407203,
"grad_norm": 2.2368030548095703,
"learning_rate": 8.596699001693257e-06,
"loss": 0.4992,
"step": 434
},
{
"epoch": 0.2678004578772196,
"grad_norm": 2.204688310623169,
"learning_rate": 8.589763807958198e-06,
"loss": 0.5025,
"step": 435
},
{
"epoch": 0.268416091113719,
"grad_norm": 2.0749876499176025,
"learning_rate": 8.582814331718961e-06,
"loss": 0.4672,
"step": 436
},
{
"epoch": 0.2690317243502184,
"grad_norm": 2.1938445568084717,
"learning_rate": 8.575850600625252e-06,
"loss": 0.4928,
"step": 437
},
{
"epoch": 0.2696473575867177,
"grad_norm": 2.2102932929992676,
"learning_rate": 8.568872642383497e-06,
"loss": 0.5046,
"step": 438
},
{
"epoch": 0.2702629908232171,
"grad_norm": 2.0184459686279297,
"learning_rate": 8.561880484756726e-06,
"loss": 0.488,
"step": 439
},
{
"epoch": 0.2708786240597164,
"grad_norm": 2.0338387489318848,
"learning_rate": 8.554874155564459e-06,
"loss": 0.5106,
"step": 440
},
{
"epoch": 0.2714942572962158,
"grad_norm": 1.887638807296753,
"learning_rate": 8.547853682682605e-06,
"loss": 0.4739,
"step": 441
},
{
"epoch": 0.2721098905327151,
"grad_norm": 1.763267993927002,
"learning_rate": 8.540819094043349e-06,
"loss": 0.4643,
"step": 442
},
{
"epoch": 0.2727255237692145,
"grad_norm": 1.9619516134262085,
"learning_rate": 8.53377041763503e-06,
"loss": 0.4846,
"step": 443
},
{
"epoch": 0.2733411570057138,
"grad_norm": 2.0067808628082275,
"learning_rate": 8.526707681502045e-06,
"loss": 0.4675,
"step": 444
},
{
"epoch": 0.2739567902422132,
"grad_norm": 2.1719822883605957,
"learning_rate": 8.519630913744726e-06,
"loss": 0.5093,
"step": 445
},
{
"epoch": 0.2745724234787126,
"grad_norm": 2.05495285987854,
"learning_rate": 8.512540142519232e-06,
"loss": 0.462,
"step": 446
},
{
"epoch": 0.2751880567152119,
"grad_norm": 2.103501081466675,
"learning_rate": 8.50543539603744e-06,
"loss": 0.4913,
"step": 447
},
{
"epoch": 0.2758036899517113,
"grad_norm": 2.015477418899536,
"learning_rate": 8.498316702566828e-06,
"loss": 0.5072,
"step": 448
},
{
"epoch": 0.2764193231882106,
"grad_norm": 2.2199337482452393,
"learning_rate": 8.491184090430365e-06,
"loss": 0.4917,
"step": 449
},
{
"epoch": 0.27703495642471,
"grad_norm": 2.044389009475708,
"learning_rate": 8.484037588006398e-06,
"loss": 0.4719,
"step": 450
},
{
"epoch": 0.2776505896612093,
"grad_norm": 2.093029260635376,
"learning_rate": 8.476877223728539e-06,
"loss": 0.4918,
"step": 451
},
{
"epoch": 0.2782662228977087,
"grad_norm": 2.2878201007843018,
"learning_rate": 8.469703026085551e-06,
"loss": 0.481,
"step": 452
},
{
"epoch": 0.2788818561342081,
"grad_norm": 2.4797415733337402,
"learning_rate": 8.462515023621237e-06,
"loss": 0.4938,
"step": 453
},
{
"epoch": 0.2794974893707074,
"grad_norm": 2.1992409229278564,
"learning_rate": 8.455313244934324e-06,
"loss": 0.5019,
"step": 454
},
{
"epoch": 0.2801131226072068,
"grad_norm": 2.093852996826172,
"learning_rate": 8.44809771867835e-06,
"loss": 0.4769,
"step": 455
},
{
"epoch": 0.2807287558437061,
"grad_norm": 1.909757375717163,
"learning_rate": 8.44086847356155e-06,
"loss": 0.4817,
"step": 456
},
{
"epoch": 0.2813443890802055,
"grad_norm": 2.1621382236480713,
"learning_rate": 8.433625538346742e-06,
"loss": 0.5072,
"step": 457
},
{
"epoch": 0.2819600223167048,
"grad_norm": 2.0320560932159424,
"learning_rate": 8.426368941851212e-06,
"loss": 0.4586,
"step": 458
},
{
"epoch": 0.2825756555532042,
"grad_norm": 2.267420768737793,
"learning_rate": 8.4190987129466e-06,
"loss": 0.4661,
"step": 459
},
{
"epoch": 0.2831912887897035,
"grad_norm": 2.3038079738616943,
"learning_rate": 8.41181488055879e-06,
"loss": 0.5029,
"step": 460
},
{
"epoch": 0.2838069220262029,
"grad_norm": 2.2598018646240234,
"learning_rate": 8.404517473667779e-06,
"loss": 0.4913,
"step": 461
},
{
"epoch": 0.28442255526270227,
"grad_norm": 2.0566563606262207,
"learning_rate": 8.397206521307584e-06,
"loss": 0.4903,
"step": 462
},
{
"epoch": 0.2850381884992016,
"grad_norm": 2.1668128967285156,
"learning_rate": 8.389882052566106e-06,
"loss": 0.5088,
"step": 463
},
{
"epoch": 0.285653821735701,
"grad_norm": 2.189140796661377,
"learning_rate": 8.382544096585028e-06,
"loss": 0.4873,
"step": 464
},
{
"epoch": 0.2862694549722003,
"grad_norm": 1.9882960319519043,
"learning_rate": 8.375192682559692e-06,
"loss": 0.4853,
"step": 465
},
{
"epoch": 0.2868850882086997,
"grad_norm": 1.9169676303863525,
"learning_rate": 8.36782783973899e-06,
"loss": 0.4884,
"step": 466
},
{
"epoch": 0.287500721445199,
"grad_norm": 1.9822300672531128,
"learning_rate": 8.360449597425236e-06,
"loss": 0.4691,
"step": 467
},
{
"epoch": 0.2881163546816984,
"grad_norm": 2.0725347995758057,
"learning_rate": 8.353057984974062e-06,
"loss": 0.4669,
"step": 468
},
{
"epoch": 0.2887319879181977,
"grad_norm": 2.0785980224609375,
"learning_rate": 8.345653031794292e-06,
"loss": 0.4813,
"step": 469
},
{
"epoch": 0.2893476211546971,
"grad_norm": 2.356861114501953,
"learning_rate": 8.338234767347829e-06,
"loss": 0.5462,
"step": 470
},
{
"epoch": 0.28996325439119647,
"grad_norm": 2.0012290477752686,
"learning_rate": 8.33080322114954e-06,
"loss": 0.4935,
"step": 471
},
{
"epoch": 0.2905788876276958,
"grad_norm": 2.2095272541046143,
"learning_rate": 8.32335842276713e-06,
"loss": 0.4841,
"step": 472
},
{
"epoch": 0.29119452086419517,
"grad_norm": 2.300325632095337,
"learning_rate": 8.315900401821034e-06,
"loss": 0.508,
"step": 473
},
{
"epoch": 0.2918101541006945,
"grad_norm": 1.9145146608352661,
"learning_rate": 8.308429187984298e-06,
"loss": 0.4741,
"step": 474
},
{
"epoch": 0.2924257873371939,
"grad_norm": 2.039344549179077,
"learning_rate": 8.300944810982452e-06,
"loss": 0.5042,
"step": 475
},
{
"epoch": 0.2930414205736932,
"grad_norm": 2.0209226608276367,
"learning_rate": 8.293447300593402e-06,
"loss": 0.5022,
"step": 476
},
{
"epoch": 0.2936570538101926,
"grad_norm": 2.110853910446167,
"learning_rate": 8.28593668664731e-06,
"loss": 0.5069,
"step": 477
},
{
"epoch": 0.29427268704669196,
"grad_norm": 2.0190882682800293,
"learning_rate": 8.278412999026462e-06,
"loss": 0.4713,
"step": 478
},
{
"epoch": 0.2948883202831913,
"grad_norm": 1.9714604616165161,
"learning_rate": 8.270876267665173e-06,
"loss": 0.4787,
"step": 479
},
{
"epoch": 0.29550395351969067,
"grad_norm": 2.083486795425415,
"learning_rate": 8.263326522549647e-06,
"loss": 0.5078,
"step": 480
},
{
"epoch": 0.29611958675619,
"grad_norm": 2.099017381668091,
"learning_rate": 8.255763793717868e-06,
"loss": 0.5128,
"step": 481
},
{
"epoch": 0.29673521999268937,
"grad_norm": 1.9817054271697998,
"learning_rate": 8.248188111259479e-06,
"loss": 0.5162,
"step": 482
},
{
"epoch": 0.2973508532291887,
"grad_norm": 1.9473239183425903,
"learning_rate": 8.240599505315656e-06,
"loss": 0.4715,
"step": 483
},
{
"epoch": 0.2979664864656881,
"grad_norm": 1.8737359046936035,
"learning_rate": 8.232998006078998e-06,
"loss": 0.4758,
"step": 484
},
{
"epoch": 0.2985821197021874,
"grad_norm": 2.0523834228515625,
"learning_rate": 8.225383643793405e-06,
"loss": 0.4855,
"step": 485
},
{
"epoch": 0.2991977529386868,
"grad_norm": 2.096587657928467,
"learning_rate": 8.217756448753948e-06,
"loss": 0.4902,
"step": 486
},
{
"epoch": 0.29981338617518616,
"grad_norm": 1.9058270454406738,
"learning_rate": 8.210116451306762e-06,
"loss": 0.4588,
"step": 487
},
{
"epoch": 0.3004290194116855,
"grad_norm": 1.8534164428710938,
"learning_rate": 8.20246368184891e-06,
"loss": 0.463,
"step": 488
},
{
"epoch": 0.30104465264818486,
"grad_norm": 2.1333110332489014,
"learning_rate": 8.19479817082828e-06,
"loss": 0.4698,
"step": 489
},
{
"epoch": 0.3016602858846842,
"grad_norm": 1.8725422620773315,
"learning_rate": 8.18711994874345e-06,
"loss": 0.4917,
"step": 490
},
{
"epoch": 0.30227591912118357,
"grad_norm": 1.8624604940414429,
"learning_rate": 8.17942904614357e-06,
"loss": 0.4325,
"step": 491
},
{
"epoch": 0.3028915523576829,
"grad_norm": 1.9307676553726196,
"learning_rate": 8.171725493628244e-06,
"loss": 0.5097,
"step": 492
},
{
"epoch": 0.30350718559418227,
"grad_norm": 1.9485713243484497,
"learning_rate": 8.164009321847405e-06,
"loss": 0.4537,
"step": 493
},
{
"epoch": 0.3041228188306816,
"grad_norm": 1.9698349237442017,
"learning_rate": 8.156280561501196e-06,
"loss": 0.4916,
"step": 494
},
{
"epoch": 0.304738452067181,
"grad_norm": 1.8933942317962646,
"learning_rate": 8.148539243339842e-06,
"loss": 0.4615,
"step": 495
},
{
"epoch": 0.30535408530368036,
"grad_norm": 1.9967283010482788,
"learning_rate": 8.140785398163535e-06,
"loss": 0.5266,
"step": 496
},
{
"epoch": 0.3059697185401797,
"grad_norm": 1.8503018617630005,
"learning_rate": 8.133019056822303e-06,
"loss": 0.4874,
"step": 497
},
{
"epoch": 0.30658535177667906,
"grad_norm": 2.033705949783325,
"learning_rate": 8.1252402502159e-06,
"loss": 0.4927,
"step": 498
},
{
"epoch": 0.3072009850131784,
"grad_norm": 1.9999321699142456,
"learning_rate": 8.117449009293668e-06,
"loss": 0.4997,
"step": 499
},
{
"epoch": 0.30781661824967776,
"grad_norm": 2.0142297744750977,
"learning_rate": 8.109645365054426e-06,
"loss": 0.4879,
"step": 500
},
{
"epoch": 0.30781661824967776,
"eval_loss": 0.4839506447315216,
"eval_runtime": 119.0711,
"eval_samples_per_second": 35.281,
"eval_steps_per_second": 4.418,
"step": 500
},
{
"epoch": 0.3084322514861771,
"grad_norm": 2.0126116275787354,
"learning_rate": 8.101829348546336e-06,
"loss": 0.4833,
"step": 501
},
{
"epoch": 0.30904788472267647,
"grad_norm": 1.8968122005462646,
"learning_rate": 8.094000990866795e-06,
"loss": 0.497,
"step": 502
},
{
"epoch": 0.3096635179591758,
"grad_norm": 2.234069585800171,
"learning_rate": 8.086160323162288e-06,
"loss": 0.5034,
"step": 503
},
{
"epoch": 0.3102791511956752,
"grad_norm": 2.1748671531677246,
"learning_rate": 8.078307376628292e-06,
"loss": 0.5222,
"step": 504
},
{
"epoch": 0.31089478443217455,
"grad_norm": 1.9957149028778076,
"learning_rate": 8.070442182509127e-06,
"loss": 0.4856,
"step": 505
},
{
"epoch": 0.3115104176686739,
"grad_norm": 2.1460072994232178,
"learning_rate": 8.062564772097844e-06,
"loss": 0.4864,
"step": 506
},
{
"epoch": 0.31212605090517326,
"grad_norm": 1.902100920677185,
"learning_rate": 8.054675176736104e-06,
"loss": 0.4761,
"step": 507
},
{
"epoch": 0.3127416841416726,
"grad_norm": 2.132812023162842,
"learning_rate": 8.046773427814043e-06,
"loss": 0.4882,
"step": 508
},
{
"epoch": 0.31335731737817196,
"grad_norm": 1.9943275451660156,
"learning_rate": 8.038859556770152e-06,
"loss": 0.4956,
"step": 509
},
{
"epoch": 0.3139729506146713,
"grad_norm": 2.189013957977295,
"learning_rate": 8.030933595091152e-06,
"loss": 0.5044,
"step": 510
},
{
"epoch": 0.31458858385117067,
"grad_norm": 2.0293703079223633,
"learning_rate": 8.022995574311876e-06,
"loss": 0.4922,
"step": 511
},
{
"epoch": 0.31520421708767005,
"grad_norm": 1.961392879486084,
"learning_rate": 8.015045526015124e-06,
"loss": 0.4703,
"step": 512
},
{
"epoch": 0.31581985032416937,
"grad_norm": 1.8382376432418823,
"learning_rate": 8.00708348183156e-06,
"loss": 0.4599,
"step": 513
},
{
"epoch": 0.31643548356066875,
"grad_norm": 2.000192165374756,
"learning_rate": 7.99910947343957e-06,
"loss": 0.4659,
"step": 514
},
{
"epoch": 0.3170511167971681,
"grad_norm": 2.1506423950195312,
"learning_rate": 7.991123532565142e-06,
"loss": 0.4923,
"step": 515
},
{
"epoch": 0.31766675003366746,
"grad_norm": 2.114412307739258,
"learning_rate": 7.983125690981743e-06,
"loss": 0.4632,
"step": 516
},
{
"epoch": 0.3182823832701668,
"grad_norm": 2.0266101360321045,
"learning_rate": 7.975115980510187e-06,
"loss": 0.4797,
"step": 517
},
{
"epoch": 0.31889801650666616,
"grad_norm": 2.1428349018096924,
"learning_rate": 7.967094433018508e-06,
"loss": 0.5035,
"step": 518
},
{
"epoch": 0.3195136497431655,
"grad_norm": 2.085111141204834,
"learning_rate": 7.95906108042184e-06,
"loss": 0.5129,
"step": 519
},
{
"epoch": 0.32012928297966486,
"grad_norm": 2.149914026260376,
"learning_rate": 7.951015954682281e-06,
"loss": 0.4803,
"step": 520
},
{
"epoch": 0.32074491621616424,
"grad_norm": 1.8125468492507935,
"learning_rate": 7.942959087808776e-06,
"loss": 0.4819,
"step": 521
},
{
"epoch": 0.32136054945266357,
"grad_norm": 2.132568597793579,
"learning_rate": 7.934890511856982e-06,
"loss": 0.5151,
"step": 522
},
{
"epoch": 0.32197618268916295,
"grad_norm": 1.9607751369476318,
"learning_rate": 7.926810258929138e-06,
"loss": 0.453,
"step": 523
},
{
"epoch": 0.3225918159256623,
"grad_norm": 1.891742467880249,
"learning_rate": 7.918718361173951e-06,
"loss": 0.4499,
"step": 524
},
{
"epoch": 0.32320744916216165,
"grad_norm": 2.0719101428985596,
"learning_rate": 7.910614850786448e-06,
"loss": 0.4726,
"step": 525
},
{
"epoch": 0.323823082398661,
"grad_norm": 1.9925060272216797,
"learning_rate": 7.902499760007867e-06,
"loss": 0.4841,
"step": 526
},
{
"epoch": 0.32443871563516036,
"grad_norm": 1.9080984592437744,
"learning_rate": 7.89437312112552e-06,
"loss": 0.4618,
"step": 527
},
{
"epoch": 0.3250543488716597,
"grad_norm": 2.142263412475586,
"learning_rate": 7.886234966472664e-06,
"loss": 0.5141,
"step": 528
},
{
"epoch": 0.32566998210815906,
"grad_norm": 2.0402634143829346,
"learning_rate": 7.87808532842837e-06,
"loss": 0.4661,
"step": 529
},
{
"epoch": 0.32628561534465844,
"grad_norm": 1.8866894245147705,
"learning_rate": 7.8699242394174e-06,
"loss": 0.471,
"step": 530
},
{
"epoch": 0.32690124858115777,
"grad_norm": 2.106036901473999,
"learning_rate": 7.86175173191008e-06,
"loss": 0.4684,
"step": 531
},
{
"epoch": 0.32751688181765715,
"grad_norm": 1.9650591611862183,
"learning_rate": 7.85356783842216e-06,
"loss": 0.4984,
"step": 532
},
{
"epoch": 0.32813251505415647,
"grad_norm": 1.8672831058502197,
"learning_rate": 7.845372591514694e-06,
"loss": 0.4811,
"step": 533
},
{
"epoch": 0.32874814829065585,
"grad_norm": 1.993276834487915,
"learning_rate": 7.83716602379391e-06,
"loss": 0.4646,
"step": 534
},
{
"epoch": 0.3293637815271552,
"grad_norm": 2.1027157306671143,
"learning_rate": 7.828948167911073e-06,
"loss": 0.5203,
"step": 535
},
{
"epoch": 0.32997941476365455,
"grad_norm": 2.2700319290161133,
"learning_rate": 7.820719056562363e-06,
"loss": 0.5072,
"step": 536
},
{
"epoch": 0.33059504800015393,
"grad_norm": 1.9552809000015259,
"learning_rate": 7.812478722488741e-06,
"loss": 0.4929,
"step": 537
},
{
"epoch": 0.33121068123665326,
"grad_norm": 2.141432523727417,
"learning_rate": 7.804227198475823e-06,
"loss": 0.493,
"step": 538
},
{
"epoch": 0.33182631447315264,
"grad_norm": 1.8569056987762451,
"learning_rate": 7.795964517353734e-06,
"loss": 0.4676,
"step": 539
},
{
"epoch": 0.33244194770965196,
"grad_norm": 1.9998515844345093,
"learning_rate": 7.787690711997008e-06,
"loss": 0.4817,
"step": 540
},
{
"epoch": 0.33305758094615134,
"grad_norm": 2.048243522644043,
"learning_rate": 7.779405815324424e-06,
"loss": 0.4783,
"step": 541
},
{
"epoch": 0.33367321418265067,
"grad_norm": 2.198432683944702,
"learning_rate": 7.771109860298895e-06,
"loss": 0.4748,
"step": 542
},
{
"epoch": 0.33428884741915005,
"grad_norm": 2.0038063526153564,
"learning_rate": 7.762802879927333e-06,
"loss": 0.4365,
"step": 543
},
{
"epoch": 0.3349044806556494,
"grad_norm": 1.9695496559143066,
"learning_rate": 7.754484907260513e-06,
"loss": 0.4768,
"step": 544
},
{
"epoch": 0.33552011389214875,
"grad_norm": 2.031867504119873,
"learning_rate": 7.746155975392948e-06,
"loss": 0.4615,
"step": 545
},
{
"epoch": 0.33613574712864813,
"grad_norm": 2.1375014781951904,
"learning_rate": 7.737816117462752e-06,
"loss": 0.4713,
"step": 546
},
{
"epoch": 0.33675138036514746,
"grad_norm": 1.867982268333435,
"learning_rate": 7.72946536665151e-06,
"loss": 0.4907,
"step": 547
},
{
"epoch": 0.33736701360164684,
"grad_norm": 1.8443107604980469,
"learning_rate": 7.721103756184147e-06,
"loss": 0.4717,
"step": 548
},
{
"epoch": 0.33798264683814616,
"grad_norm": 2.0463571548461914,
"learning_rate": 7.712731319328798e-06,
"loss": 0.4587,
"step": 549
},
{
"epoch": 0.33859828007464554,
"grad_norm": 2.1210920810699463,
"learning_rate": 7.704348089396667e-06,
"loss": 0.4736,
"step": 550
},
{
"epoch": 0.33921391331114487,
"grad_norm": 2.04471492767334,
"learning_rate": 7.695954099741902e-06,
"loss": 0.5262,
"step": 551
},
{
"epoch": 0.33982954654764425,
"grad_norm": 1.95708429813385,
"learning_rate": 7.687549383761463e-06,
"loss": 0.4963,
"step": 552
},
{
"epoch": 0.34044517978414357,
"grad_norm": 2.132002115249634,
"learning_rate": 7.679133974894984e-06,
"loss": 0.5178,
"step": 553
},
{
"epoch": 0.34106081302064295,
"grad_norm": 2.2385482788085938,
"learning_rate": 7.670707906624644e-06,
"loss": 0.4837,
"step": 554
},
{
"epoch": 0.34167644625714233,
"grad_norm": 2.0102343559265137,
"learning_rate": 7.662271212475034e-06,
"loss": 0.4738,
"step": 555
},
{
"epoch": 0.34229207949364165,
"grad_norm": 1.831471562385559,
"learning_rate": 7.653823926013016e-06,
"loss": 0.4881,
"step": 556
},
{
"epoch": 0.34290771273014103,
"grad_norm": 1.9880772829055786,
"learning_rate": 7.645366080847599e-06,
"loss": 0.4866,
"step": 557
},
{
"epoch": 0.34352334596664036,
"grad_norm": 2.1533780097961426,
"learning_rate": 7.636897710629804e-06,
"loss": 0.4939,
"step": 558
},
{
"epoch": 0.34413897920313974,
"grad_norm": 1.9375075101852417,
"learning_rate": 7.628418849052523e-06,
"loss": 0.485,
"step": 559
},
{
"epoch": 0.34475461243963906,
"grad_norm": 2.021385431289673,
"learning_rate": 7.619929529850397e-06,
"loss": 0.4933,
"step": 560
},
{
"epoch": 0.34537024567613844,
"grad_norm": 1.887455940246582,
"learning_rate": 7.611429786799664e-06,
"loss": 0.5031,
"step": 561
},
{
"epoch": 0.3459858789126378,
"grad_norm": 1.961814284324646,
"learning_rate": 7.602919653718044e-06,
"loss": 0.4686,
"step": 562
},
{
"epoch": 0.34660151214913715,
"grad_norm": 2.0938830375671387,
"learning_rate": 7.5943991644645895e-06,
"loss": 0.4776,
"step": 563
},
{
"epoch": 0.3472171453856365,
"grad_norm": 1.9017668962478638,
"learning_rate": 7.585868352939564e-06,
"loss": 0.446,
"step": 564
},
{
"epoch": 0.34783277862213585,
"grad_norm": 2.077495813369751,
"learning_rate": 7.577327253084292e-06,
"loss": 0.4819,
"step": 565
},
{
"epoch": 0.34844841185863523,
"grad_norm": 1.9357627630233765,
"learning_rate": 7.568775898881038e-06,
"loss": 0.4896,
"step": 566
},
{
"epoch": 0.34906404509513456,
"grad_norm": 2.004091501235962,
"learning_rate": 7.560214324352858e-06,
"loss": 0.5051,
"step": 567
},
{
"epoch": 0.34967967833163394,
"grad_norm": 2.0148847103118896,
"learning_rate": 7.551642563563481e-06,
"loss": 0.5176,
"step": 568
},
{
"epoch": 0.35029531156813326,
"grad_norm": 2.167863130569458,
"learning_rate": 7.543060650617159e-06,
"loss": 0.4722,
"step": 569
},
{
"epoch": 0.35091094480463264,
"grad_norm": 2.0419163703918457,
"learning_rate": 7.534468619658534e-06,
"loss": 0.486,
"step": 570
},
{
"epoch": 0.351526578041132,
"grad_norm": 2.1185479164123535,
"learning_rate": 7.5258665048725065e-06,
"loss": 0.497,
"step": 571
},
{
"epoch": 0.35214221127763135,
"grad_norm": 1.860520601272583,
"learning_rate": 7.517254340484097e-06,
"loss": 0.4484,
"step": 572
},
{
"epoch": 0.3527578445141307,
"grad_norm": 2.0788426399230957,
"learning_rate": 7.50863216075831e-06,
"loss": 0.4737,
"step": 573
},
{
"epoch": 0.35337347775063005,
"grad_norm": 2.152902603149414,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4998,
"step": 574
},
{
"epoch": 0.35398911098712943,
"grad_norm": 1.9528014659881592,
"learning_rate": 7.49135789255373e-06,
"loss": 0.4687,
"step": 575
},
{
"epoch": 0.35460474422362875,
"grad_norm": 1.8286877870559692,
"learning_rate": 7.482705872803637e-06,
"loss": 0.4693,
"step": 576
},
{
"epoch": 0.35522037746012813,
"grad_norm": 2.0849242210388184,
"learning_rate": 7.4740439751732994e-06,
"loss": 0.4599,
"step": 577
},
{
"epoch": 0.35583601069662746,
"grad_norm": 1.9514186382293701,
"learning_rate": 7.465372234125592e-06,
"loss": 0.4909,
"step": 578
},
{
"epoch": 0.35645164393312684,
"grad_norm": 1.7071397304534912,
"learning_rate": 7.456690684162557e-06,
"loss": 0.4237,
"step": 579
},
{
"epoch": 0.3570672771696262,
"grad_norm": 1.916062831878662,
"learning_rate": 7.447999359825263e-06,
"loss": 0.4683,
"step": 580
},
{
"epoch": 0.35768291040612554,
"grad_norm": 1.9211770296096802,
"learning_rate": 7.4392982956936644e-06,
"loss": 0.4664,
"step": 581
},
{
"epoch": 0.3582985436426249,
"grad_norm": 1.7628663778305054,
"learning_rate": 7.43058752638647e-06,
"loss": 0.483,
"step": 582
},
{
"epoch": 0.35891417687912425,
"grad_norm": 1.9189079999923706,
"learning_rate": 7.421867086561001e-06,
"loss": 0.4593,
"step": 583
},
{
"epoch": 0.3595298101156236,
"grad_norm": 1.860856294631958,
"learning_rate": 7.413137010913055e-06,
"loss": 0.4742,
"step": 584
},
{
"epoch": 0.36014544335212295,
"grad_norm": 1.8644829988479614,
"learning_rate": 7.4043973341767695e-06,
"loss": 0.4894,
"step": 585
},
{
"epoch": 0.36076107658862233,
"grad_norm": 1.9820992946624756,
"learning_rate": 7.395648091124476e-06,
"loss": 0.4886,
"step": 586
},
{
"epoch": 0.3613767098251217,
"grad_norm": 1.976830244064331,
"learning_rate": 7.386889316566571e-06,
"loss": 0.4399,
"step": 587
},
{
"epoch": 0.36199234306162104,
"grad_norm": 1.9326996803283691,
"learning_rate": 7.378121045351378e-06,
"loss": 0.5073,
"step": 588
},
{
"epoch": 0.3626079762981204,
"grad_norm": 1.817628026008606,
"learning_rate": 7.369343312364994e-06,
"loss": 0.4573,
"step": 589
},
{
"epoch": 0.36322360953461974,
"grad_norm": 2.304370880126953,
"learning_rate": 7.360556152531171e-06,
"loss": 0.4976,
"step": 590
},
{
"epoch": 0.3638392427711191,
"grad_norm": 1.789089322090149,
"learning_rate": 7.351759600811163e-06,
"loss": 0.4474,
"step": 591
},
{
"epoch": 0.36445487600761844,
"grad_norm": 1.9560165405273438,
"learning_rate": 7.342953692203594e-06,
"loss": 0.5014,
"step": 592
},
{
"epoch": 0.3650705092441178,
"grad_norm": 1.8216361999511719,
"learning_rate": 7.33413846174431e-06,
"loss": 0.4896,
"step": 593
},
{
"epoch": 0.36568614248061715,
"grad_norm": 2.0918242931365967,
"learning_rate": 7.3253139445062535e-06,
"loss": 0.5184,
"step": 594
},
{
"epoch": 0.36630177571711653,
"grad_norm": 2.250962018966675,
"learning_rate": 7.31648017559931e-06,
"loss": 0.4779,
"step": 595
},
{
"epoch": 0.3669174089536159,
"grad_norm": 2.12443208694458,
"learning_rate": 7.307637190170176e-06,
"loss": 0.4761,
"step": 596
},
{
"epoch": 0.36753304219011523,
"grad_norm": 2.055332660675049,
"learning_rate": 7.29878502340222e-06,
"loss": 0.4926,
"step": 597
},
{
"epoch": 0.3681486754266146,
"grad_norm": 1.9562499523162842,
"learning_rate": 7.289923710515338e-06,
"loss": 0.4574,
"step": 598
},
{
"epoch": 0.36876430866311394,
"grad_norm": 1.8528696298599243,
"learning_rate": 7.281053286765816e-06,
"loss": 0.4715,
"step": 599
},
{
"epoch": 0.3693799418996133,
"grad_norm": 2.025754928588867,
"learning_rate": 7.272173787446188e-06,
"loss": 0.5057,
"step": 600
},
{
"epoch": 0.3693799418996133,
"eval_loss": 0.46705466508865356,
"eval_runtime": 119.3636,
"eval_samples_per_second": 35.195,
"eval_steps_per_second": 4.407,
"step": 600
},
{
"epoch": 0.36999557513611264,
"grad_norm": 1.8789739608764648,
"learning_rate": 7.263285247885097e-06,
"loss": 0.4827,
"step": 601
},
{
"epoch": 0.370611208372612,
"grad_norm": 1.8459640741348267,
"learning_rate": 7.254387703447154e-06,
"loss": 0.4929,
"step": 602
},
{
"epoch": 0.37122684160911135,
"grad_norm": 1.799880027770996,
"learning_rate": 7.245481189532801e-06,
"loss": 0.4768,
"step": 603
},
{
"epoch": 0.3718424748456107,
"grad_norm": 1.8112698793411255,
"learning_rate": 7.236565741578163e-06,
"loss": 0.4871,
"step": 604
},
{
"epoch": 0.3724581080821101,
"grad_norm": 1.7186943292617798,
"learning_rate": 7.227641395054913e-06,
"loss": 0.4718,
"step": 605
},
{
"epoch": 0.37307374131860943,
"grad_norm": 1.945148229598999,
"learning_rate": 7.218708185470122e-06,
"loss": 0.53,
"step": 606
},
{
"epoch": 0.3736893745551088,
"grad_norm": 1.7540737390518188,
"learning_rate": 7.2097661483661355e-06,
"loss": 0.4647,
"step": 607
},
{
"epoch": 0.37430500779160814,
"grad_norm": 1.795328974723816,
"learning_rate": 7.200815319320409e-06,
"loss": 0.4348,
"step": 608
},
{
"epoch": 0.3749206410281075,
"grad_norm": 1.8775458335876465,
"learning_rate": 7.191855733945388e-06,
"loss": 0.4866,
"step": 609
},
{
"epoch": 0.37553627426460684,
"grad_norm": 1.7888774871826172,
"learning_rate": 7.182887427888351e-06,
"loss": 0.4586,
"step": 610
},
{
"epoch": 0.3761519075011062,
"grad_norm": 1.954525351524353,
"learning_rate": 7.173910436831274e-06,
"loss": 0.4297,
"step": 611
},
{
"epoch": 0.37676754073760554,
"grad_norm": 1.9484002590179443,
"learning_rate": 7.164924796490689e-06,
"loss": 0.4793,
"step": 612
},
{
"epoch": 0.3773831739741049,
"grad_norm": 1.8956586122512817,
"learning_rate": 7.155930542617543e-06,
"loss": 0.4552,
"step": 613
},
{
"epoch": 0.3779988072106043,
"grad_norm": 1.8784023523330688,
"learning_rate": 7.146927710997047e-06,
"loss": 0.4494,
"step": 614
},
{
"epoch": 0.37861444044710363,
"grad_norm": 2.010004997253418,
"learning_rate": 7.137916337448544e-06,
"loss": 0.4853,
"step": 615
},
{
"epoch": 0.379230073683603,
"grad_norm": 1.7376317977905273,
"learning_rate": 7.128896457825364e-06,
"loss": 0.4687,
"step": 616
},
{
"epoch": 0.37984570692010233,
"grad_norm": 1.83708918094635,
"learning_rate": 7.119868108014677e-06,
"loss": 0.4779,
"step": 617
},
{
"epoch": 0.3804613401566017,
"grad_norm": 2.276491165161133,
"learning_rate": 7.110831323937356e-06,
"loss": 0.4661,
"step": 618
},
{
"epoch": 0.38107697339310104,
"grad_norm": 2.030910015106201,
"learning_rate": 7.101786141547829e-06,
"loss": 0.4846,
"step": 619
},
{
"epoch": 0.3816926066296004,
"grad_norm": 1.8752516508102417,
"learning_rate": 7.092732596833937e-06,
"loss": 0.4876,
"step": 620
},
{
"epoch": 0.3823082398660998,
"grad_norm": 1.798912525177002,
"learning_rate": 7.083670725816795e-06,
"loss": 0.4161,
"step": 621
},
{
"epoch": 0.3829238731025991,
"grad_norm": 1.8931162357330322,
"learning_rate": 7.074600564550643e-06,
"loss": 0.4608,
"step": 622
},
{
"epoch": 0.3835395063390985,
"grad_norm": 2.037741184234619,
"learning_rate": 7.06552214912271e-06,
"loss": 0.4546,
"step": 623
},
{
"epoch": 0.3841551395755978,
"grad_norm": 2.005077600479126,
"learning_rate": 7.056435515653059e-06,
"loss": 0.4921,
"step": 624
},
{
"epoch": 0.3847707728120972,
"grad_norm": 1.9674670696258545,
"learning_rate": 7.047340700294454e-06,
"loss": 0.4609,
"step": 625
},
{
"epoch": 0.38538640604859653,
"grad_norm": 1.9863380193710327,
"learning_rate": 7.03823773923221e-06,
"loss": 0.4902,
"step": 626
},
{
"epoch": 0.3860020392850959,
"grad_norm": 2.1521503925323486,
"learning_rate": 7.029126668684055e-06,
"loss": 0.4818,
"step": 627
},
{
"epoch": 0.38661767252159523,
"grad_norm": 2.2088866233825684,
"learning_rate": 7.020007524899976e-06,
"loss": 0.5094,
"step": 628
},
{
"epoch": 0.3872333057580946,
"grad_norm": 1.8303382396697998,
"learning_rate": 7.010880344162087e-06,
"loss": 0.4662,
"step": 629
},
{
"epoch": 0.387848938994594,
"grad_norm": 1.7346774339675903,
"learning_rate": 7.0017451627844765e-06,
"loss": 0.4348,
"step": 630
},
{
"epoch": 0.3884645722310933,
"grad_norm": 2.2596514225006104,
"learning_rate": 6.992602017113058e-06,
"loss": 0.4936,
"step": 631
},
{
"epoch": 0.3890802054675927,
"grad_norm": 2.0665206909179688,
"learning_rate": 6.983450943525445e-06,
"loss": 0.4967,
"step": 632
},
{
"epoch": 0.389695838704092,
"grad_norm": 1.771286964416504,
"learning_rate": 6.974291978430783e-06,
"loss": 0.4906,
"step": 633
},
{
"epoch": 0.3903114719405914,
"grad_norm": 1.8066425323486328,
"learning_rate": 6.965125158269619e-06,
"loss": 0.4635,
"step": 634
},
{
"epoch": 0.39092710517709073,
"grad_norm": 1.8853797912597656,
"learning_rate": 6.955950519513754e-06,
"loss": 0.442,
"step": 635
},
{
"epoch": 0.3915427384135901,
"grad_norm": 1.959373950958252,
"learning_rate": 6.946768098666097e-06,
"loss": 0.476,
"step": 636
},
{
"epoch": 0.39215837165008943,
"grad_norm": 1.789989948272705,
"learning_rate": 6.9375779322605154e-06,
"loss": 0.4787,
"step": 637
},
{
"epoch": 0.3927740048865888,
"grad_norm": 1.832987904548645,
"learning_rate": 6.9283800568616986e-06,
"loss": 0.4614,
"step": 638
},
{
"epoch": 0.3933896381230882,
"grad_norm": 1.9762518405914307,
"learning_rate": 6.919174509065003e-06,
"loss": 0.4557,
"step": 639
},
{
"epoch": 0.3940052713595875,
"grad_norm": 1.903957724571228,
"learning_rate": 6.909961325496312e-06,
"loss": 0.4806,
"step": 640
},
{
"epoch": 0.3946209045960869,
"grad_norm": 2.0859627723693848,
"learning_rate": 6.900740542811896e-06,
"loss": 0.4778,
"step": 641
},
{
"epoch": 0.3952365378325862,
"grad_norm": 2.363494873046875,
"learning_rate": 6.891512197698249e-06,
"loss": 0.4642,
"step": 642
},
{
"epoch": 0.3958521710690856,
"grad_norm": 2.01015305519104,
"learning_rate": 6.88227632687196e-06,
"loss": 0.4997,
"step": 643
},
{
"epoch": 0.3964678043055849,
"grad_norm": 1.8750239610671997,
"learning_rate": 6.873032967079562e-06,
"loss": 0.4757,
"step": 644
},
{
"epoch": 0.3970834375420843,
"grad_norm": 1.866356372833252,
"learning_rate": 6.863782155097376e-06,
"loss": 0.4753,
"step": 645
},
{
"epoch": 0.3976990707785837,
"grad_norm": 1.8274710178375244,
"learning_rate": 6.854523927731383e-06,
"loss": 0.474,
"step": 646
},
{
"epoch": 0.398314704015083,
"grad_norm": 1.9891657829284668,
"learning_rate": 6.8452583218170575e-06,
"loss": 0.4676,
"step": 647
},
{
"epoch": 0.3989303372515824,
"grad_norm": 1.8337827920913696,
"learning_rate": 6.835985374219241e-06,
"loss": 0.4589,
"step": 648
},
{
"epoch": 0.3995459704880817,
"grad_norm": 1.8421549797058105,
"learning_rate": 6.8267051218319766e-06,
"loss": 0.4423,
"step": 649
},
{
"epoch": 0.4001616037245811,
"grad_norm": 1.809378743171692,
"learning_rate": 6.817417601578375e-06,
"loss": 0.4619,
"step": 650
},
{
"epoch": 0.4007772369610804,
"grad_norm": 1.912400722503662,
"learning_rate": 6.808122850410461e-06,
"loss": 0.4618,
"step": 651
},
{
"epoch": 0.4013928701975798,
"grad_norm": 2.02554988861084,
"learning_rate": 6.798820905309036e-06,
"loss": 0.4733,
"step": 652
},
{
"epoch": 0.4020085034340791,
"grad_norm": 1.967797040939331,
"learning_rate": 6.789511803283512e-06,
"loss": 0.4557,
"step": 653
},
{
"epoch": 0.4026241366705785,
"grad_norm": 1.9477007389068604,
"learning_rate": 6.780195581371785e-06,
"loss": 0.5078,
"step": 654
},
{
"epoch": 0.4032397699070779,
"grad_norm": 2.080044746398926,
"learning_rate": 6.7708722766400745e-06,
"loss": 0.4631,
"step": 655
},
{
"epoch": 0.4038554031435772,
"grad_norm": 1.8657619953155518,
"learning_rate": 6.761541926182783e-06,
"loss": 0.4748,
"step": 656
},
{
"epoch": 0.4044710363800766,
"grad_norm": 2.008187770843506,
"learning_rate": 6.752204567122343e-06,
"loss": 0.4681,
"step": 657
},
{
"epoch": 0.4050866696165759,
"grad_norm": 1.8819204568862915,
"learning_rate": 6.7428602366090764e-06,
"loss": 0.47,
"step": 658
},
{
"epoch": 0.4057023028530753,
"grad_norm": 2.0878162384033203,
"learning_rate": 6.733508971821037e-06,
"loss": 0.4637,
"step": 659
},
{
"epoch": 0.4063179360895746,
"grad_norm": 2.172257900238037,
"learning_rate": 6.724150809963867e-06,
"loss": 0.4755,
"step": 660
},
{
"epoch": 0.406933569326074,
"grad_norm": 1.7706599235534668,
"learning_rate": 6.714785788270658e-06,
"loss": 0.4294,
"step": 661
},
{
"epoch": 0.4075492025625733,
"grad_norm": 1.698087215423584,
"learning_rate": 6.705413944001786e-06,
"loss": 0.405,
"step": 662
},
{
"epoch": 0.4081648357990727,
"grad_norm": 2.301974296569824,
"learning_rate": 6.696035314444778e-06,
"loss": 0.4452,
"step": 663
},
{
"epoch": 0.4087804690355721,
"grad_norm": 1.8717014789581299,
"learning_rate": 6.686649936914151e-06,
"loss": 0.44,
"step": 664
},
{
"epoch": 0.4093961022720714,
"grad_norm": 1.8877750635147095,
"learning_rate": 6.677257848751276e-06,
"loss": 0.4753,
"step": 665
},
{
"epoch": 0.4100117355085708,
"grad_norm": 1.8425832986831665,
"learning_rate": 6.667859087324221e-06,
"loss": 0.452,
"step": 666
},
{
"epoch": 0.4106273687450701,
"grad_norm": 1.7998721599578857,
"learning_rate": 6.658453690027604e-06,
"loss": 0.4302,
"step": 667
},
{
"epoch": 0.4112430019815695,
"grad_norm": 1.9948549270629883,
"learning_rate": 6.6490416942824466e-06,
"loss": 0.4542,
"step": 668
},
{
"epoch": 0.4118586352180688,
"grad_norm": 1.861610770225525,
"learning_rate": 6.639623137536023e-06,
"loss": 0.4816,
"step": 669
},
{
"epoch": 0.4124742684545682,
"grad_norm": 1.737973928451538,
"learning_rate": 6.63019805726171e-06,
"loss": 0.4309,
"step": 670
},
{
"epoch": 0.4130899016910676,
"grad_norm": 1.927831768989563,
"learning_rate": 6.620766490958842e-06,
"loss": 0.4732,
"step": 671
},
{
"epoch": 0.4137055349275669,
"grad_norm": 1.807926058769226,
"learning_rate": 6.611328476152557e-06,
"loss": 0.4507,
"step": 672
},
{
"epoch": 0.4143211681640663,
"grad_norm": 1.7335807085037231,
"learning_rate": 6.601884050393649e-06,
"loss": 0.4366,
"step": 673
},
{
"epoch": 0.4149368014005656,
"grad_norm": 1.676943302154541,
"learning_rate": 6.592433251258423e-06,
"loss": 0.4158,
"step": 674
},
{
"epoch": 0.415552434637065,
"grad_norm": 2.2752737998962402,
"learning_rate": 6.582976116348538e-06,
"loss": 0.4686,
"step": 675
},
{
"epoch": 0.4161680678735643,
"grad_norm": 2.1173386573791504,
"learning_rate": 6.57351268329086e-06,
"loss": 0.4662,
"step": 676
},
{
"epoch": 0.4167837011100637,
"grad_norm": 1.8290038108825684,
"learning_rate": 6.5640429897373195e-06,
"loss": 0.4415,
"step": 677
},
{
"epoch": 0.417399334346563,
"grad_norm": 1.850178599357605,
"learning_rate": 6.554567073364747e-06,
"loss": 0.4806,
"step": 678
},
{
"epoch": 0.4180149675830624,
"grad_norm": 1.913724422454834,
"learning_rate": 6.545084971874738e-06,
"loss": 0.4714,
"step": 679
},
{
"epoch": 0.41863060081956177,
"grad_norm": 2.002952814102173,
"learning_rate": 6.535596722993494e-06,
"loss": 0.4673,
"step": 680
},
{
"epoch": 0.4192462340560611,
"grad_norm": 1.8530998229980469,
"learning_rate": 6.526102364471675e-06,
"loss": 0.4492,
"step": 681
},
{
"epoch": 0.4198618672925605,
"grad_norm": 1.7990001440048218,
"learning_rate": 6.51660193408425e-06,
"loss": 0.46,
"step": 682
},
{
"epoch": 0.4204775005290598,
"grad_norm": 1.7169803380966187,
"learning_rate": 6.507095469630347e-06,
"loss": 0.4629,
"step": 683
},
{
"epoch": 0.4210931337655592,
"grad_norm": 2.5550501346588135,
"learning_rate": 6.497583008933097e-06,
"loss": 0.4674,
"step": 684
},
{
"epoch": 0.4217087670020585,
"grad_norm": 1.9652258157730103,
"learning_rate": 6.4880645898394935e-06,
"loss": 0.4587,
"step": 685
},
{
"epoch": 0.4223244002385579,
"grad_norm": 2.055323362350464,
"learning_rate": 6.4785402502202345e-06,
"loss": 0.472,
"step": 686
},
{
"epoch": 0.4229400334750572,
"grad_norm": 1.860081434249878,
"learning_rate": 6.469010027969573e-06,
"loss": 0.4676,
"step": 687
},
{
"epoch": 0.4235556667115566,
"grad_norm": 1.8490623235702515,
"learning_rate": 6.459473961005168e-06,
"loss": 0.4637,
"step": 688
},
{
"epoch": 0.42417129994805597,
"grad_norm": 1.8875542879104614,
"learning_rate": 6.449932087267932e-06,
"loss": 0.5051,
"step": 689
},
{
"epoch": 0.4247869331845553,
"grad_norm": 1.7553805112838745,
"learning_rate": 6.440384444721881e-06,
"loss": 0.4544,
"step": 690
},
{
"epoch": 0.4254025664210547,
"grad_norm": 1.723633050918579,
"learning_rate": 6.4308310713539845e-06,
"loss": 0.469,
"step": 691
},
{
"epoch": 0.426018199657554,
"grad_norm": 1.8384066820144653,
"learning_rate": 6.4212720051740126e-06,
"loss": 0.4348,
"step": 692
},
{
"epoch": 0.4266338328940534,
"grad_norm": 1.7237180471420288,
"learning_rate": 6.411707284214384e-06,
"loss": 0.4188,
"step": 693
},
{
"epoch": 0.4272494661305527,
"grad_norm": 1.7882165908813477,
"learning_rate": 6.402136946530014e-06,
"loss": 0.4481,
"step": 694
},
{
"epoch": 0.4278650993670521,
"grad_norm": 1.8759676218032837,
"learning_rate": 6.3925610301981726e-06,
"loss": 0.4826,
"step": 695
},
{
"epoch": 0.42848073260355146,
"grad_norm": 1.849109411239624,
"learning_rate": 6.382979573318317e-06,
"loss": 0.4745,
"step": 696
},
{
"epoch": 0.4290963658400508,
"grad_norm": 1.6711199283599854,
"learning_rate": 6.373392614011952e-06,
"loss": 0.4435,
"step": 697
},
{
"epoch": 0.42971199907655017,
"grad_norm": 2.019181251525879,
"learning_rate": 6.3638001904224755e-06,
"loss": 0.4867,
"step": 698
},
{
"epoch": 0.4303276323130495,
"grad_norm": 1.7364513874053955,
"learning_rate": 6.354202340715027e-06,
"loss": 0.4524,
"step": 699
},
{
"epoch": 0.43094326554954887,
"grad_norm": 1.7672978639602661,
"learning_rate": 6.344599103076329e-06,
"loss": 0.4707,
"step": 700
},
{
"epoch": 0.43094326554954887,
"eval_loss": 0.45585593581199646,
"eval_runtime": 119.2986,
"eval_samples_per_second": 35.214,
"eval_steps_per_second": 4.409,
"step": 700
},
{
"epoch": 0.4315588987860482,
"grad_norm": 2.1754207611083984,
"learning_rate": 6.334990515714548e-06,
"loss": 0.4533,
"step": 701
},
{
"epoch": 0.4321745320225476,
"grad_norm": 2.133251190185547,
"learning_rate": 6.3253766168591315e-06,
"loss": 0.4589,
"step": 702
},
{
"epoch": 0.4327901652590469,
"grad_norm": 1.9236661195755005,
"learning_rate": 6.315757444760659e-06,
"loss": 0.4455,
"step": 703
},
{
"epoch": 0.4334057984955463,
"grad_norm": 1.8091168403625488,
"learning_rate": 6.306133037690693e-06,
"loss": 0.4474,
"step": 704
},
{
"epoch": 0.43402143173204566,
"grad_norm": 1.7465119361877441,
"learning_rate": 6.296503433941622e-06,
"loss": 0.4715,
"step": 705
},
{
"epoch": 0.434637064968545,
"grad_norm": 2.0189712047576904,
"learning_rate": 6.286868671826513e-06,
"loss": 0.4935,
"step": 706
},
{
"epoch": 0.43525269820504436,
"grad_norm": 1.897495985031128,
"learning_rate": 6.277228789678953e-06,
"loss": 0.4634,
"step": 707
},
{
"epoch": 0.4358683314415437,
"grad_norm": 1.7837578058242798,
"learning_rate": 6.2675838258529054e-06,
"loss": 0.438,
"step": 708
},
{
"epoch": 0.43648396467804307,
"grad_norm": 1.9282805919647217,
"learning_rate": 6.257933818722544e-06,
"loss": 0.4495,
"step": 709
},
{
"epoch": 0.4370995979145424,
"grad_norm": 1.834639072418213,
"learning_rate": 6.248278806682114e-06,
"loss": 0.4832,
"step": 710
},
{
"epoch": 0.4377152311510418,
"grad_norm": 1.8320832252502441,
"learning_rate": 6.238618828145775e-06,
"loss": 0.4619,
"step": 711
},
{
"epoch": 0.4383308643875411,
"grad_norm": 1.6084938049316406,
"learning_rate": 6.228953921547441e-06,
"loss": 0.4203,
"step": 712
},
{
"epoch": 0.4389464976240405,
"grad_norm": 1.7636457681655884,
"learning_rate": 6.219284125340637e-06,
"loss": 0.4569,
"step": 713
},
{
"epoch": 0.43956213086053986,
"grad_norm": 1.8105461597442627,
"learning_rate": 6.209609477998339e-06,
"loss": 0.4535,
"step": 714
},
{
"epoch": 0.4401777640970392,
"grad_norm": 1.876543641090393,
"learning_rate": 6.19993001801283e-06,
"loss": 0.4581,
"step": 715
},
{
"epoch": 0.44079339733353856,
"grad_norm": 1.8101094961166382,
"learning_rate": 6.190245783895537e-06,
"loss": 0.4716,
"step": 716
},
{
"epoch": 0.4414090305700379,
"grad_norm": 1.9081860780715942,
"learning_rate": 6.180556814176878e-06,
"loss": 0.4828,
"step": 717
},
{
"epoch": 0.44202466380653727,
"grad_norm": 1.996167540550232,
"learning_rate": 6.17086314740612e-06,
"loss": 0.4573,
"step": 718
},
{
"epoch": 0.4426402970430366,
"grad_norm": 2.116783618927002,
"learning_rate": 6.161164822151213e-06,
"loss": 0.4308,
"step": 719
},
{
"epoch": 0.44325593027953597,
"grad_norm": 1.809017300605774,
"learning_rate": 6.151461876998643e-06,
"loss": 0.4475,
"step": 720
},
{
"epoch": 0.4438715635160353,
"grad_norm": 1.7881202697753906,
"learning_rate": 6.141754350553279e-06,
"loss": 0.4379,
"step": 721
},
{
"epoch": 0.4444871967525347,
"grad_norm": 1.7520116567611694,
"learning_rate": 6.1320422814382145e-06,
"loss": 0.424,
"step": 722
},
{
"epoch": 0.44510282998903405,
"grad_norm": 1.9185141324996948,
"learning_rate": 6.122325708294615e-06,
"loss": 0.4646,
"step": 723
},
{
"epoch": 0.4457184632255334,
"grad_norm": 1.9544090032577515,
"learning_rate": 6.112604669781572e-06,
"loss": 0.4444,
"step": 724
},
{
"epoch": 0.44633409646203276,
"grad_norm": 1.8842326402664185,
"learning_rate": 6.102879204575941e-06,
"loss": 0.4264,
"step": 725
},
{
"epoch": 0.4469497296985321,
"grad_norm": 1.85416841506958,
"learning_rate": 6.093149351372186e-06,
"loss": 0.4688,
"step": 726
},
{
"epoch": 0.44756536293503146,
"grad_norm": 1.716825008392334,
"learning_rate": 6.083415148882236e-06,
"loss": 0.4453,
"step": 727
},
{
"epoch": 0.4481809961715308,
"grad_norm": 1.9674599170684814,
"learning_rate": 6.073676635835317e-06,
"loss": 0.4903,
"step": 728
},
{
"epoch": 0.44879662940803017,
"grad_norm": 1.8208335638046265,
"learning_rate": 6.063933850977811e-06,
"loss": 0.4369,
"step": 729
},
{
"epoch": 0.44941226264452955,
"grad_norm": 1.7994003295898438,
"learning_rate": 6.054186833073096e-06,
"loss": 0.4595,
"step": 730
},
{
"epoch": 0.45002789588102887,
"grad_norm": 1.9228452444076538,
"learning_rate": 6.044435620901388e-06,
"loss": 0.4366,
"step": 731
},
{
"epoch": 0.45064352911752825,
"grad_norm": 1.9610226154327393,
"learning_rate": 6.034680253259594e-06,
"loss": 0.4463,
"step": 732
},
{
"epoch": 0.4512591623540276,
"grad_norm": 1.9020127058029175,
"learning_rate": 6.024920768961153e-06,
"loss": 0.4667,
"step": 733
},
{
"epoch": 0.45187479559052696,
"grad_norm": 1.8535819053649902,
"learning_rate": 6.015157206835881e-06,
"loss": 0.4452,
"step": 734
},
{
"epoch": 0.4524904288270263,
"grad_norm": 1.96487557888031,
"learning_rate": 6.005389605729824e-06,
"loss": 0.469,
"step": 735
},
{
"epoch": 0.45310606206352566,
"grad_norm": 1.8634556531906128,
"learning_rate": 5.995618004505091e-06,
"loss": 0.4395,
"step": 736
},
{
"epoch": 0.453721695300025,
"grad_norm": 1.9039191007614136,
"learning_rate": 5.985842442039712e-06,
"loss": 0.4955,
"step": 737
},
{
"epoch": 0.45433732853652437,
"grad_norm": 1.8303287029266357,
"learning_rate": 5.976062957227472e-06,
"loss": 0.459,
"step": 738
},
{
"epoch": 0.45495296177302375,
"grad_norm": 2.0153088569641113,
"learning_rate": 5.9662795889777666e-06,
"loss": 0.4635,
"step": 739
},
{
"epoch": 0.45556859500952307,
"grad_norm": 1.7339246273040771,
"learning_rate": 5.956492376215439e-06,
"loss": 0.4648,
"step": 740
},
{
"epoch": 0.45618422824602245,
"grad_norm": 1.8080629110336304,
"learning_rate": 5.946701357880632e-06,
"loss": 0.448,
"step": 741
},
{
"epoch": 0.4567998614825218,
"grad_norm": 1.7795019149780273,
"learning_rate": 5.936906572928625e-06,
"loss": 0.4518,
"step": 742
},
{
"epoch": 0.45741549471902115,
"grad_norm": 1.7698966264724731,
"learning_rate": 5.927108060329685e-06,
"loss": 0.4544,
"step": 743
},
{
"epoch": 0.4580311279555205,
"grad_norm": 1.7339845895767212,
"learning_rate": 5.917305859068912e-06,
"loss": 0.4603,
"step": 744
},
{
"epoch": 0.45864676119201986,
"grad_norm": 1.6085306406021118,
"learning_rate": 5.907500008146082e-06,
"loss": 0.4236,
"step": 745
},
{
"epoch": 0.4592623944285192,
"grad_norm": 1.7907172441482544,
"learning_rate": 5.897690546575491e-06,
"loss": 0.4557,
"step": 746
},
{
"epoch": 0.45987802766501856,
"grad_norm": 1.759875774383545,
"learning_rate": 5.887877513385799e-06,
"loss": 0.4632,
"step": 747
},
{
"epoch": 0.46049366090151794,
"grad_norm": 2.0250210762023926,
"learning_rate": 5.878060947619877e-06,
"loss": 0.4826,
"step": 748
},
{
"epoch": 0.46110929413801727,
"grad_norm": 1.8454856872558594,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.4475,
"step": 749
},
{
"epoch": 0.46172492737451665,
"grad_norm": 1.6851238012313843,
"learning_rate": 5.858417374600952e-06,
"loss": 0.4461,
"step": 750
},
{
"epoch": 0.46234056061101597,
"grad_norm": 1.6859235763549805,
"learning_rate": 5.848590445503345e-06,
"loss": 0.4569,
"step": 751
},
{
"epoch": 0.46295619384751535,
"grad_norm": 1.661718726158142,
"learning_rate": 5.838760140139993e-06,
"loss": 0.4588,
"step": 752
},
{
"epoch": 0.4635718270840147,
"grad_norm": 1.8251081705093384,
"learning_rate": 5.828926497622484e-06,
"loss": 0.454,
"step": 753
},
{
"epoch": 0.46418746032051406,
"grad_norm": 1.5905681848526,
"learning_rate": 5.819089557075689e-06,
"loss": 0.4269,
"step": 754
},
{
"epoch": 0.46480309355701344,
"grad_norm": 1.7265760898590088,
"learning_rate": 5.809249357637601e-06,
"loss": 0.4491,
"step": 755
},
{
"epoch": 0.46541872679351276,
"grad_norm": 1.8297325372695923,
"learning_rate": 5.799405938459175e-06,
"loss": 0.4729,
"step": 756
},
{
"epoch": 0.46603436003001214,
"grad_norm": 1.8845032453536987,
"learning_rate": 5.78955933870418e-06,
"loss": 0.4238,
"step": 757
},
{
"epoch": 0.46664999326651146,
"grad_norm": 2.0128591060638428,
"learning_rate": 5.779709597549037e-06,
"loss": 0.4643,
"step": 758
},
{
"epoch": 0.46726562650301084,
"grad_norm": 1.8747127056121826,
"learning_rate": 5.769856754182668e-06,
"loss": 0.4703,
"step": 759
},
{
"epoch": 0.46788125973951017,
"grad_norm": 1.7039685249328613,
"learning_rate": 5.760000847806337e-06,
"loss": 0.4492,
"step": 760
},
{
"epoch": 0.46849689297600955,
"grad_norm": 1.7922077178955078,
"learning_rate": 5.750141917633491e-06,
"loss": 0.4566,
"step": 761
},
{
"epoch": 0.4691125262125089,
"grad_norm": 1.7171673774719238,
"learning_rate": 5.740280002889613e-06,
"loss": 0.4379,
"step": 762
},
{
"epoch": 0.46972815944900825,
"grad_norm": 1.6933897733688354,
"learning_rate": 5.730415142812059e-06,
"loss": 0.4393,
"step": 763
},
{
"epoch": 0.47034379268550763,
"grad_norm": 1.9281322956085205,
"learning_rate": 5.720547376649901e-06,
"loss": 0.4508,
"step": 764
},
{
"epoch": 0.47095942592200696,
"grad_norm": 1.9108821153640747,
"learning_rate": 5.710676743663777e-06,
"loss": 0.471,
"step": 765
},
{
"epoch": 0.47157505915850634,
"grad_norm": 1.8311848640441895,
"learning_rate": 5.70080328312573e-06,
"loss": 0.4702,
"step": 766
},
{
"epoch": 0.47219069239500566,
"grad_norm": 1.7278153896331787,
"learning_rate": 5.690927034319051e-06,
"loss": 0.4357,
"step": 767
},
{
"epoch": 0.47280632563150504,
"grad_norm": 1.8962657451629639,
"learning_rate": 5.681048036538126e-06,
"loss": 0.4561,
"step": 768
},
{
"epoch": 0.47342195886800437,
"grad_norm": 1.8439857959747314,
"learning_rate": 5.671166329088278e-06,
"loss": 0.4298,
"step": 769
},
{
"epoch": 0.47403759210450375,
"grad_norm": 1.9815741777420044,
"learning_rate": 5.661281951285613e-06,
"loss": 0.4562,
"step": 770
},
{
"epoch": 0.47465322534100307,
"grad_norm": 1.8740676641464233,
"learning_rate": 5.6513949424568585e-06,
"loss": 0.4719,
"step": 771
},
{
"epoch": 0.47526885857750245,
"grad_norm": 1.8455761671066284,
"learning_rate": 5.641505341939212e-06,
"loss": 0.4527,
"step": 772
},
{
"epoch": 0.47588449181400183,
"grad_norm": 1.7912147045135498,
"learning_rate": 5.631613189080178e-06,
"loss": 0.4582,
"step": 773
},
{
"epoch": 0.47650012505050116,
"grad_norm": 1.8444029092788696,
"learning_rate": 5.621718523237427e-06,
"loss": 0.4389,
"step": 774
},
{
"epoch": 0.47711575828700054,
"grad_norm": 1.8730937242507935,
"learning_rate": 5.611821383778614e-06,
"loss": 0.4286,
"step": 775
},
{
"epoch": 0.47773139152349986,
"grad_norm": 1.6841018199920654,
"learning_rate": 5.601921810081243e-06,
"loss": 0.4222,
"step": 776
},
{
"epoch": 0.47834702475999924,
"grad_norm": 1.817315936088562,
"learning_rate": 5.592019841532507e-06,
"loss": 0.4576,
"step": 777
},
{
"epoch": 0.47896265799649856,
"grad_norm": 1.967796802520752,
"learning_rate": 5.582115517529114e-06,
"loss": 0.441,
"step": 778
},
{
"epoch": 0.47957829123299794,
"grad_norm": 1.9327470064163208,
"learning_rate": 5.57220887747716e-06,
"loss": 0.4482,
"step": 779
},
{
"epoch": 0.4801939244694973,
"grad_norm": 2.1096456050872803,
"learning_rate": 5.562299960791946e-06,
"loss": 0.4721,
"step": 780
},
{
"epoch": 0.48080955770599665,
"grad_norm": 1.8861747980117798,
"learning_rate": 5.55238880689783e-06,
"loss": 0.4171,
"step": 781
},
{
"epoch": 0.48142519094249603,
"grad_norm": 1.9005745649337769,
"learning_rate": 5.542475455228077e-06,
"loss": 0.4449,
"step": 782
},
{
"epoch": 0.48204082417899535,
"grad_norm": 1.7999687194824219,
"learning_rate": 5.532559945224692e-06,
"loss": 0.429,
"step": 783
},
{
"epoch": 0.48265645741549473,
"grad_norm": 2.0499937534332275,
"learning_rate": 5.522642316338268e-06,
"loss": 0.4536,
"step": 784
},
{
"epoch": 0.48327209065199406,
"grad_norm": 1.913791537284851,
"learning_rate": 5.51272260802783e-06,
"loss": 0.4458,
"step": 785
},
{
"epoch": 0.48388772388849344,
"grad_norm": 1.9510321617126465,
"learning_rate": 5.502800859760676e-06,
"loss": 0.4638,
"step": 786
},
{
"epoch": 0.48450335712499276,
"grad_norm": 1.9328547716140747,
"learning_rate": 5.4928771110122185e-06,
"loss": 0.4792,
"step": 787
},
{
"epoch": 0.48511899036149214,
"grad_norm": 1.892099380493164,
"learning_rate": 5.48295140126583e-06,
"loss": 0.4676,
"step": 788
},
{
"epoch": 0.4857346235979915,
"grad_norm": 1.8072856664657593,
"learning_rate": 5.473023770012686e-06,
"loss": 0.4944,
"step": 789
},
{
"epoch": 0.48635025683449085,
"grad_norm": 1.7051112651824951,
"learning_rate": 5.463094256751608e-06,
"loss": 0.4385,
"step": 790
},
{
"epoch": 0.4869658900709902,
"grad_norm": 1.9088255167007446,
"learning_rate": 5.453162900988902e-06,
"loss": 0.4404,
"step": 791
},
{
"epoch": 0.48758152330748955,
"grad_norm": 1.7755197286605835,
"learning_rate": 5.443229742238207e-06,
"loss": 0.4596,
"step": 792
},
{
"epoch": 0.48819715654398893,
"grad_norm": 2.002506732940674,
"learning_rate": 5.433294820020335e-06,
"loss": 0.4381,
"step": 793
},
{
"epoch": 0.48881278978048825,
"grad_norm": 1.7047615051269531,
"learning_rate": 5.423358173863117e-06,
"loss": 0.4504,
"step": 794
},
{
"epoch": 0.48942842301698763,
"grad_norm": 1.7884583473205566,
"learning_rate": 5.413419843301238e-06,
"loss": 0.4247,
"step": 795
},
{
"epoch": 0.49004405625348696,
"grad_norm": 1.8708890676498413,
"learning_rate": 5.403479867876087e-06,
"loss": 0.4512,
"step": 796
},
{
"epoch": 0.49065968948998634,
"grad_norm": 1.8814146518707275,
"learning_rate": 5.3935382871356004e-06,
"loss": 0.4624,
"step": 797
},
{
"epoch": 0.4912753227264857,
"grad_norm": 1.8110054731369019,
"learning_rate": 5.383595140634093e-06,
"loss": 0.428,
"step": 798
},
{
"epoch": 0.49189095596298504,
"grad_norm": 1.8073616027832031,
"learning_rate": 5.373650467932122e-06,
"loss": 0.4319,
"step": 799
},
{
"epoch": 0.4925065891994844,
"grad_norm": 2.0299525260925293,
"learning_rate": 5.363704308596306e-06,
"loss": 0.4161,
"step": 800
},
{
"epoch": 0.4925065891994844,
"eval_loss": 0.4422464072704315,
"eval_runtime": 118.6207,
"eval_samples_per_second": 35.415,
"eval_steps_per_second": 4.434,
"step": 800
},
{
"epoch": 0.49312222243598375,
"grad_norm": 1.728995442390442,
"learning_rate": 5.3537567021991825e-06,
"loss": 0.4256,
"step": 801
},
{
"epoch": 0.49373785567248313,
"grad_norm": 1.9535176753997803,
"learning_rate": 5.343807688319047e-06,
"loss": 0.4604,
"step": 802
},
{
"epoch": 0.49435348890898245,
"grad_norm": 1.6464016437530518,
"learning_rate": 5.3338573065397936e-06,
"loss": 0.4201,
"step": 803
},
{
"epoch": 0.49496912214548183,
"grad_norm": 1.7924641370773315,
"learning_rate": 5.323905596450759e-06,
"loss": 0.4414,
"step": 804
},
{
"epoch": 0.4955847553819812,
"grad_norm": 1.6692326068878174,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.4621,
"step": 805
},
{
"epoch": 0.49620038861848054,
"grad_norm": 1.7795913219451904,
"learning_rate": 5.303998349726966e-06,
"loss": 0.4365,
"step": 806
},
{
"epoch": 0.4968160218549799,
"grad_norm": 1.6701775789260864,
"learning_rate": 5.294042892296675e-06,
"loss": 0.4358,
"step": 807
},
{
"epoch": 0.49743165509147924,
"grad_norm": 1.8857059478759766,
"learning_rate": 5.284086264965224e-06,
"loss": 0.4596,
"step": 808
},
{
"epoch": 0.4980472883279786,
"grad_norm": 1.9470977783203125,
"learning_rate": 5.274128507346801e-06,
"loss": 0.4549,
"step": 809
},
{
"epoch": 0.49866292156447795,
"grad_norm": 1.7852569818496704,
"learning_rate": 5.264169659060087e-06,
"loss": 0.4562,
"step": 810
},
{
"epoch": 0.4992785548009773,
"grad_norm": 2.10684871673584,
"learning_rate": 5.2542097597281095e-06,
"loss": 0.4646,
"step": 811
},
{
"epoch": 0.49989418803747665,
"grad_norm": 1.738823413848877,
"learning_rate": 5.244248848978067e-06,
"loss": 0.4538,
"step": 812
},
{
"epoch": 0.500509821273976,
"grad_norm": 1.7574955224990845,
"learning_rate": 5.234286966441191e-06,
"loss": 0.4268,
"step": 813
},
{
"epoch": 0.5011254545104754,
"grad_norm": 1.9219170808792114,
"learning_rate": 5.224324151752575e-06,
"loss": 0.4719,
"step": 814
},
{
"epoch": 0.5017410877469748,
"grad_norm": 1.7875138521194458,
"learning_rate": 5.214360444551024e-06,
"loss": 0.4516,
"step": 815
},
{
"epoch": 0.5023567209834741,
"grad_norm": 1.7680673599243164,
"learning_rate": 5.2043958844788925e-06,
"loss": 0.4378,
"step": 816
},
{
"epoch": 0.5029723542199734,
"grad_norm": 1.9988130331039429,
"learning_rate": 5.194430511181925e-06,
"loss": 0.4332,
"step": 817
},
{
"epoch": 0.5035879874564728,
"grad_norm": 1.7063559293746948,
"learning_rate": 5.184464364309109e-06,
"loss": 0.4367,
"step": 818
},
{
"epoch": 0.5042036206929722,
"grad_norm": 1.628308653831482,
"learning_rate": 5.174497483512506e-06,
"loss": 0.4326,
"step": 819
},
{
"epoch": 0.5048192539294715,
"grad_norm": 1.7696188688278198,
"learning_rate": 5.1645299084470936e-06,
"loss": 0.4391,
"step": 820
},
{
"epoch": 0.5054348871659708,
"grad_norm": 1.741668462753296,
"learning_rate": 5.1545616787706186e-06,
"loss": 0.4462,
"step": 821
},
{
"epoch": 0.5060505204024702,
"grad_norm": 1.745593786239624,
"learning_rate": 5.144592834143427e-06,
"loss": 0.426,
"step": 822
},
{
"epoch": 0.5066661536389696,
"grad_norm": 1.8672295808792114,
"learning_rate": 5.134623414228315e-06,
"loss": 0.4332,
"step": 823
},
{
"epoch": 0.507281786875469,
"grad_norm": 1.8388646841049194,
"learning_rate": 5.1246534586903655e-06,
"loss": 0.4686,
"step": 824
},
{
"epoch": 0.5078974201119683,
"grad_norm": 1.799299955368042,
"learning_rate": 5.114683007196793e-06,
"loss": 0.4461,
"step": 825
},
{
"epoch": 0.5085130533484676,
"grad_norm": 1.7791926860809326,
"learning_rate": 5.1047120994167855e-06,
"loss": 0.4314,
"step": 826
},
{
"epoch": 0.509128686584967,
"grad_norm": 1.9311397075653076,
"learning_rate": 5.094740775021348e-06,
"loss": 0.4418,
"step": 827
},
{
"epoch": 0.5097443198214664,
"grad_norm": 1.6415823698043823,
"learning_rate": 5.084769073683138e-06,
"loss": 0.4164,
"step": 828
},
{
"epoch": 0.5103599530579657,
"grad_norm": 1.9113596677780151,
"learning_rate": 5.074797035076319e-06,
"loss": 0.4284,
"step": 829
},
{
"epoch": 0.510975586294465,
"grad_norm": 1.8482359647750854,
"learning_rate": 5.064824698876393e-06,
"loss": 0.4498,
"step": 830
},
{
"epoch": 0.5115912195309644,
"grad_norm": 1.7767953872680664,
"learning_rate": 5.0548521047600465e-06,
"loss": 0.4228,
"step": 831
},
{
"epoch": 0.5122068527674638,
"grad_norm": 1.8928037881851196,
"learning_rate": 5.04487929240499e-06,
"loss": 0.4414,
"step": 832
},
{
"epoch": 0.5128224860039632,
"grad_norm": 1.6631807088851929,
"learning_rate": 5.034906301489808e-06,
"loss": 0.4069,
"step": 833
},
{
"epoch": 0.5134381192404625,
"grad_norm": 1.8700741529464722,
"learning_rate": 5.024933171693791e-06,
"loss": 0.4274,
"step": 834
},
{
"epoch": 0.5140537524769618,
"grad_norm": 1.8563249111175537,
"learning_rate": 5.014959942696782e-06,
"loss": 0.4591,
"step": 835
},
{
"epoch": 0.5146693857134612,
"grad_norm": 1.7255792617797852,
"learning_rate": 5.00498665417902e-06,
"loss": 0.4255,
"step": 836
},
{
"epoch": 0.5152850189499606,
"grad_norm": 1.871659517288208,
"learning_rate": 4.995013345820982e-06,
"loss": 0.495,
"step": 837
},
{
"epoch": 0.5159006521864599,
"grad_norm": 1.9164929389953613,
"learning_rate": 4.98504005730322e-06,
"loss": 0.4417,
"step": 838
},
{
"epoch": 0.5165162854229592,
"grad_norm": 1.8884233236312866,
"learning_rate": 4.9750668283062104e-06,
"loss": 0.4669,
"step": 839
},
{
"epoch": 0.5171319186594586,
"grad_norm": 1.838681697845459,
"learning_rate": 4.965093698510192e-06,
"loss": 0.4591,
"step": 840
},
{
"epoch": 0.517747551895958,
"grad_norm": 1.8403676748275757,
"learning_rate": 4.955120707595011e-06,
"loss": 0.4791,
"step": 841
},
{
"epoch": 0.5183631851324574,
"grad_norm": 1.7576521635055542,
"learning_rate": 4.945147895239956e-06,
"loss": 0.4303,
"step": 842
},
{
"epoch": 0.5189788183689567,
"grad_norm": 1.7351022958755493,
"learning_rate": 4.935175301123609e-06,
"loss": 0.4444,
"step": 843
},
{
"epoch": 0.519594451605456,
"grad_norm": 1.892082929611206,
"learning_rate": 4.9252029649236835e-06,
"loss": 0.4557,
"step": 844
},
{
"epoch": 0.5202100848419554,
"grad_norm": 1.8198317289352417,
"learning_rate": 4.915230926316864e-06,
"loss": 0.4411,
"step": 845
},
{
"epoch": 0.5208257180784548,
"grad_norm": 1.693459153175354,
"learning_rate": 4.905259224978655e-06,
"loss": 0.4461,
"step": 846
},
{
"epoch": 0.5214413513149542,
"grad_norm": 1.8052204847335815,
"learning_rate": 4.895287900583216e-06,
"loss": 0.4581,
"step": 847
},
{
"epoch": 0.5220569845514534,
"grad_norm": 1.957545280456543,
"learning_rate": 4.8853169928032094e-06,
"loss": 0.4593,
"step": 848
},
{
"epoch": 0.5226726177879528,
"grad_norm": 1.9859813451766968,
"learning_rate": 4.875346541309637e-06,
"loss": 0.4198,
"step": 849
},
{
"epoch": 0.5232882510244522,
"grad_norm": 1.8037396669387817,
"learning_rate": 4.865376585771687e-06,
"loss": 0.4379,
"step": 850
},
{
"epoch": 0.5239038842609516,
"grad_norm": 1.760682463645935,
"learning_rate": 4.8554071658565745e-06,
"loss": 0.4436,
"step": 851
},
{
"epoch": 0.5245195174974508,
"grad_norm": 1.7362818717956543,
"learning_rate": 4.845438321229382e-06,
"loss": 0.4445,
"step": 852
},
{
"epoch": 0.5251351507339502,
"grad_norm": 1.8464460372924805,
"learning_rate": 4.835470091552906e-06,
"loss": 0.4302,
"step": 853
},
{
"epoch": 0.5257507839704496,
"grad_norm": 1.7991427183151245,
"learning_rate": 4.825502516487497e-06,
"loss": 0.4433,
"step": 854
},
{
"epoch": 0.526366417206949,
"grad_norm": 1.824872374534607,
"learning_rate": 4.815535635690892e-06,
"loss": 0.4563,
"step": 855
},
{
"epoch": 0.5269820504434484,
"grad_norm": 1.8947278261184692,
"learning_rate": 4.805569488818077e-06,
"loss": 0.4206,
"step": 856
},
{
"epoch": 0.5275976836799476,
"grad_norm": 1.9948461055755615,
"learning_rate": 4.795604115521109e-06,
"loss": 0.45,
"step": 857
},
{
"epoch": 0.528213316916447,
"grad_norm": 1.8243073225021362,
"learning_rate": 4.785639555448977e-06,
"loss": 0.4614,
"step": 858
},
{
"epoch": 0.5288289501529464,
"grad_norm": 1.8149497509002686,
"learning_rate": 4.775675848247427e-06,
"loss": 0.4322,
"step": 859
},
{
"epoch": 0.5294445833894458,
"grad_norm": 1.8010841608047485,
"learning_rate": 4.7657130335588115e-06,
"loss": 0.4637,
"step": 860
},
{
"epoch": 0.530060216625945,
"grad_norm": 1.6604598760604858,
"learning_rate": 4.755751151021934e-06,
"loss": 0.4391,
"step": 861
},
{
"epoch": 0.5306758498624444,
"grad_norm": 1.7927626371383667,
"learning_rate": 4.745790240271892e-06,
"loss": 0.4417,
"step": 862
},
{
"epoch": 0.5312914830989438,
"grad_norm": 1.9684895277023315,
"learning_rate": 4.735830340939913e-06,
"loss": 0.4599,
"step": 863
},
{
"epoch": 0.5319071163354432,
"grad_norm": 1.9318605661392212,
"learning_rate": 4.7258714926532e-06,
"loss": 0.4308,
"step": 864
},
{
"epoch": 0.5325227495719426,
"grad_norm": 1.8322875499725342,
"learning_rate": 4.715913735034779e-06,
"loss": 0.4454,
"step": 865
},
{
"epoch": 0.5331383828084418,
"grad_norm": 1.7331942319869995,
"learning_rate": 4.705957107703327e-06,
"loss": 0.4114,
"step": 866
},
{
"epoch": 0.5337540160449412,
"grad_norm": 1.7381324768066406,
"learning_rate": 4.6960016502730354e-06,
"loss": 0.4206,
"step": 867
},
{
"epoch": 0.5343696492814406,
"grad_norm": 1.6991071701049805,
"learning_rate": 4.686047402353433e-06,
"loss": 0.4413,
"step": 868
},
{
"epoch": 0.53498528251794,
"grad_norm": 1.9641220569610596,
"learning_rate": 4.676094403549241e-06,
"loss": 0.4381,
"step": 869
},
{
"epoch": 0.5356009157544392,
"grad_norm": 1.9879289865493774,
"learning_rate": 4.666142693460208e-06,
"loss": 0.4182,
"step": 870
},
{
"epoch": 0.5362165489909386,
"grad_norm": 1.7053484916687012,
"learning_rate": 4.6561923116809545e-06,
"loss": 0.413,
"step": 871
},
{
"epoch": 0.536832182227438,
"grad_norm": 1.6974514722824097,
"learning_rate": 4.646243297800818e-06,
"loss": 0.4231,
"step": 872
},
{
"epoch": 0.5374478154639374,
"grad_norm": 1.9661206007003784,
"learning_rate": 4.636295691403696e-06,
"loss": 0.4395,
"step": 873
},
{
"epoch": 0.5380634487004368,
"grad_norm": 2.0033390522003174,
"learning_rate": 4.626349532067879e-06,
"loss": 0.4263,
"step": 874
},
{
"epoch": 0.538679081936936,
"grad_norm": 1.7665894031524658,
"learning_rate": 4.6164048593659076e-06,
"loss": 0.4375,
"step": 875
},
{
"epoch": 0.5392947151734354,
"grad_norm": 1.7947988510131836,
"learning_rate": 4.606461712864403e-06,
"loss": 0.4257,
"step": 876
},
{
"epoch": 0.5399103484099348,
"grad_norm": 1.890939712524414,
"learning_rate": 4.596520132123915e-06,
"loss": 0.4112,
"step": 877
},
{
"epoch": 0.5405259816464342,
"grad_norm": 2.065849781036377,
"learning_rate": 4.586580156698764e-06,
"loss": 0.4441,
"step": 878
},
{
"epoch": 0.5411416148829334,
"grad_norm": 1.8077067136764526,
"learning_rate": 4.576641826136884e-06,
"loss": 0.3964,
"step": 879
},
{
"epoch": 0.5417572481194328,
"grad_norm": 1.7111256122589111,
"learning_rate": 4.566705179979665e-06,
"loss": 0.4366,
"step": 880
},
{
"epoch": 0.5423728813559322,
"grad_norm": 2.0317494869232178,
"learning_rate": 4.556770257761794e-06,
"loss": 0.4585,
"step": 881
},
{
"epoch": 0.5429885145924316,
"grad_norm": 1.7759976387023926,
"learning_rate": 4.546837099011101e-06,
"loss": 0.4066,
"step": 882
},
{
"epoch": 0.543604147828931,
"grad_norm": 1.768035888671875,
"learning_rate": 4.536905743248394e-06,
"loss": 0.4295,
"step": 883
},
{
"epoch": 0.5442197810654302,
"grad_norm": 1.805371880531311,
"learning_rate": 4.526976229987315e-06,
"loss": 0.4489,
"step": 884
},
{
"epoch": 0.5448354143019296,
"grad_norm": 1.6467334032058716,
"learning_rate": 4.517048598734171e-06,
"loss": 0.4053,
"step": 885
},
{
"epoch": 0.545451047538429,
"grad_norm": 1.7442387342453003,
"learning_rate": 4.507122888987782e-06,
"loss": 0.4275,
"step": 886
},
{
"epoch": 0.5460666807749284,
"grad_norm": 1.9833625555038452,
"learning_rate": 4.497199140239326e-06,
"loss": 0.4501,
"step": 887
},
{
"epoch": 0.5466823140114276,
"grad_norm": 1.791882872581482,
"learning_rate": 4.487277391972171e-06,
"loss": 0.4346,
"step": 888
},
{
"epoch": 0.547297947247927,
"grad_norm": 1.6458256244659424,
"learning_rate": 4.477357683661734e-06,
"loss": 0.3962,
"step": 889
},
{
"epoch": 0.5479135804844264,
"grad_norm": 1.7302979230880737,
"learning_rate": 4.467440054775311e-06,
"loss": 0.4273,
"step": 890
},
{
"epoch": 0.5485292137209258,
"grad_norm": 1.7547414302825928,
"learning_rate": 4.457524544771925e-06,
"loss": 0.4467,
"step": 891
},
{
"epoch": 0.5491448469574252,
"grad_norm": 1.7946034669876099,
"learning_rate": 4.447611193102171e-06,
"loss": 0.4531,
"step": 892
},
{
"epoch": 0.5497604801939244,
"grad_norm": 1.863038420677185,
"learning_rate": 4.437700039208056e-06,
"loss": 0.4395,
"step": 893
},
{
"epoch": 0.5503761134304238,
"grad_norm": 1.7706995010375977,
"learning_rate": 4.427791122522841e-06,
"loss": 0.4301,
"step": 894
},
{
"epoch": 0.5509917466669232,
"grad_norm": 3.401134967803955,
"learning_rate": 4.417884482470887e-06,
"loss": 0.4546,
"step": 895
},
{
"epoch": 0.5516073799034226,
"grad_norm": 1.8848762512207031,
"learning_rate": 4.4079801584674955e-06,
"loss": 0.4771,
"step": 896
},
{
"epoch": 0.5522230131399218,
"grad_norm": 1.8655364513397217,
"learning_rate": 4.398078189918756e-06,
"loss": 0.4374,
"step": 897
},
{
"epoch": 0.5528386463764212,
"grad_norm": 1.812591791152954,
"learning_rate": 4.388178616221389e-06,
"loss": 0.4679,
"step": 898
},
{
"epoch": 0.5534542796129206,
"grad_norm": 1.71664559841156,
"learning_rate": 4.3782814767625755e-06,
"loss": 0.4305,
"step": 899
},
{
"epoch": 0.55406991284942,
"grad_norm": 2.093832015991211,
"learning_rate": 4.3683868109198225e-06,
"loss": 0.4307,
"step": 900
},
{
"epoch": 0.55406991284942,
"eval_loss": 0.430372953414917,
"eval_runtime": 118.7087,
"eval_samples_per_second": 35.389,
"eval_steps_per_second": 4.431,
"step": 900
},
{
"epoch": 0.5546855460859194,
"grad_norm": 1.8110665082931519,
"learning_rate": 4.35849465806079e-06,
"loss": 0.4191,
"step": 901
},
{
"epoch": 0.5553011793224186,
"grad_norm": 1.8144805431365967,
"learning_rate": 4.348605057543142e-06,
"loss": 0.4629,
"step": 902
},
{
"epoch": 0.555916812558918,
"grad_norm": 1.7565912008285522,
"learning_rate": 4.3387180487143875e-06,
"loss": 0.4358,
"step": 903
},
{
"epoch": 0.5565324457954174,
"grad_norm": 1.7798113822937012,
"learning_rate": 4.3288336709117246e-06,
"loss": 0.441,
"step": 904
},
{
"epoch": 0.5571480790319168,
"grad_norm": 1.789359211921692,
"learning_rate": 4.318951963461876e-06,
"loss": 0.4234,
"step": 905
},
{
"epoch": 0.5577637122684161,
"grad_norm": 1.7896876335144043,
"learning_rate": 4.309072965680951e-06,
"loss": 0.4281,
"step": 906
},
{
"epoch": 0.5583793455049154,
"grad_norm": 1.8061071634292603,
"learning_rate": 4.299196716874271e-06,
"loss": 0.4444,
"step": 907
},
{
"epoch": 0.5589949787414148,
"grad_norm": 1.637271523475647,
"learning_rate": 4.289323256336223e-06,
"loss": 0.443,
"step": 908
},
{
"epoch": 0.5596106119779142,
"grad_norm": 1.7649685144424438,
"learning_rate": 4.279452623350101e-06,
"loss": 0.4477,
"step": 909
},
{
"epoch": 0.5602262452144136,
"grad_norm": 1.7707200050354004,
"learning_rate": 4.269584857187942e-06,
"loss": 0.4373,
"step": 910
},
{
"epoch": 0.5608418784509128,
"grad_norm": 1.8373018503189087,
"learning_rate": 4.259719997110388e-06,
"loss": 0.4122,
"step": 911
},
{
"epoch": 0.5614575116874122,
"grad_norm": 1.7104779481887817,
"learning_rate": 4.24985808236651e-06,
"loss": 0.4227,
"step": 912
},
{
"epoch": 0.5620731449239116,
"grad_norm": 1.7874691486358643,
"learning_rate": 4.239999152193664e-06,
"loss": 0.4341,
"step": 913
},
{
"epoch": 0.562688778160411,
"grad_norm": 2.1757819652557373,
"learning_rate": 4.230143245817332e-06,
"loss": 0.4018,
"step": 914
},
{
"epoch": 0.5633044113969103,
"grad_norm": 1.6143423318862915,
"learning_rate": 4.2202904024509635e-06,
"loss": 0.409,
"step": 915
},
{
"epoch": 0.5639200446334096,
"grad_norm": 1.7682405710220337,
"learning_rate": 4.2104406612958216e-06,
"loss": 0.4158,
"step": 916
},
{
"epoch": 0.564535677869909,
"grad_norm": 1.7131983041763306,
"learning_rate": 4.200594061540827e-06,
"loss": 0.4337,
"step": 917
},
{
"epoch": 0.5651513111064084,
"grad_norm": 1.7463741302490234,
"learning_rate": 4.1907506423624006e-06,
"loss": 0.4275,
"step": 918
},
{
"epoch": 0.5657669443429078,
"grad_norm": 1.743363857269287,
"learning_rate": 4.180910442924312e-06,
"loss": 0.4138,
"step": 919
},
{
"epoch": 0.566382577579407,
"grad_norm": 1.5896950960159302,
"learning_rate": 4.171073502377519e-06,
"loss": 0.39,
"step": 920
},
{
"epoch": 0.5669982108159064,
"grad_norm": 1.815305471420288,
"learning_rate": 4.16123985986001e-06,
"loss": 0.4447,
"step": 921
},
{
"epoch": 0.5676138440524058,
"grad_norm": 1.758428931236267,
"learning_rate": 4.1514095544966556e-06,
"loss": 0.4344,
"step": 922
},
{
"epoch": 0.5682294772889052,
"grad_norm": 1.9296027421951294,
"learning_rate": 4.141582625399049e-06,
"loss": 0.4423,
"step": 923
},
{
"epoch": 0.5688451105254045,
"grad_norm": 2.148974895477295,
"learning_rate": 4.131759111665349e-06,
"loss": 0.4594,
"step": 924
},
{
"epoch": 0.5694607437619038,
"grad_norm": 1.6873085498809814,
"learning_rate": 4.121939052380125e-06,
"loss": 0.4355,
"step": 925
},
{
"epoch": 0.5700763769984032,
"grad_norm": 1.6905049085617065,
"learning_rate": 4.112122486614204e-06,
"loss": 0.4036,
"step": 926
},
{
"epoch": 0.5706920102349026,
"grad_norm": 1.883623719215393,
"learning_rate": 4.102309453424511e-06,
"loss": 0.4391,
"step": 927
},
{
"epoch": 0.571307643471402,
"grad_norm": 1.7885838747024536,
"learning_rate": 4.092499991853919e-06,
"loss": 0.435,
"step": 928
},
{
"epoch": 0.5719232767079012,
"grad_norm": 1.7965201139450073,
"learning_rate": 4.0826941409310885e-06,
"loss": 0.4319,
"step": 929
},
{
"epoch": 0.5725389099444006,
"grad_norm": 1.7711795568466187,
"learning_rate": 4.072891939670317e-06,
"loss": 0.4279,
"step": 930
},
{
"epoch": 0.5731545431809,
"grad_norm": 1.7697465419769287,
"learning_rate": 4.063093427071376e-06,
"loss": 0.4278,
"step": 931
},
{
"epoch": 0.5737701764173994,
"grad_norm": 1.721031904220581,
"learning_rate": 4.05329864211937e-06,
"loss": 0.4315,
"step": 932
},
{
"epoch": 0.5743858096538987,
"grad_norm": 1.8830902576446533,
"learning_rate": 4.043507623784562e-06,
"loss": 0.4151,
"step": 933
},
{
"epoch": 0.575001442890398,
"grad_norm": 1.7482315301895142,
"learning_rate": 4.033720411022235e-06,
"loss": 0.3916,
"step": 934
},
{
"epoch": 0.5756170761268974,
"grad_norm": 1.5488115549087524,
"learning_rate": 4.023937042772529e-06,
"loss": 0.415,
"step": 935
},
{
"epoch": 0.5762327093633968,
"grad_norm": 1.8000409603118896,
"learning_rate": 4.014157557960289e-06,
"loss": 0.4446,
"step": 936
},
{
"epoch": 0.5768483425998961,
"grad_norm": 1.7415274381637573,
"learning_rate": 4.0043819954949105e-06,
"loss": 0.4254,
"step": 937
},
{
"epoch": 0.5774639758363954,
"grad_norm": 1.8254972696304321,
"learning_rate": 3.994610394270178e-06,
"loss": 0.4395,
"step": 938
},
{
"epoch": 0.5780796090728948,
"grad_norm": 1.7846405506134033,
"learning_rate": 3.98484279316412e-06,
"loss": 0.432,
"step": 939
},
{
"epoch": 0.5786952423093942,
"grad_norm": 1.882558822631836,
"learning_rate": 3.975079231038848e-06,
"loss": 0.4496,
"step": 940
},
{
"epoch": 0.5793108755458936,
"grad_norm": 1.6487857103347778,
"learning_rate": 3.965319746740407e-06,
"loss": 0.4255,
"step": 941
},
{
"epoch": 0.5799265087823929,
"grad_norm": 1.9286268949508667,
"learning_rate": 3.955564379098613e-06,
"loss": 0.4313,
"step": 942
},
{
"epoch": 0.5805421420188922,
"grad_norm": 1.7310795783996582,
"learning_rate": 3.9458131669269066e-06,
"loss": 0.4536,
"step": 943
},
{
"epoch": 0.5811577752553916,
"grad_norm": 1.6216704845428467,
"learning_rate": 3.936066149022191e-06,
"loss": 0.4125,
"step": 944
},
{
"epoch": 0.581773408491891,
"grad_norm": 1.9235811233520508,
"learning_rate": 3.926323364164684e-06,
"loss": 0.4337,
"step": 945
},
{
"epoch": 0.5823890417283903,
"grad_norm": 1.5660535097122192,
"learning_rate": 3.916584851117766e-06,
"loss": 0.4164,
"step": 946
},
{
"epoch": 0.5830046749648896,
"grad_norm": 1.82460355758667,
"learning_rate": 3.906850648627814e-06,
"loss": 0.4192,
"step": 947
},
{
"epoch": 0.583620308201389,
"grad_norm": 1.855008840560913,
"learning_rate": 3.897120795424062e-06,
"loss": 0.4453,
"step": 948
},
{
"epoch": 0.5842359414378884,
"grad_norm": 1.7206461429595947,
"learning_rate": 3.887395330218429e-06,
"loss": 0.4161,
"step": 949
},
{
"epoch": 0.5848515746743878,
"grad_norm": 1.8171783685684204,
"learning_rate": 3.877674291705386e-06,
"loss": 0.4232,
"step": 950
},
{
"epoch": 0.5854672079108871,
"grad_norm": 1.8228973150253296,
"learning_rate": 3.867957718561787e-06,
"loss": 0.425,
"step": 951
},
{
"epoch": 0.5860828411473864,
"grad_norm": 1.7684314250946045,
"learning_rate": 3.8582456494467214e-06,
"loss": 0.4122,
"step": 952
},
{
"epoch": 0.5866984743838858,
"grad_norm": 1.9155278205871582,
"learning_rate": 3.848538123001356e-06,
"loss": 0.4661,
"step": 953
},
{
"epoch": 0.5873141076203852,
"grad_norm": 1.8282909393310547,
"learning_rate": 3.8388351778487884e-06,
"loss": 0.4492,
"step": 954
},
{
"epoch": 0.5879297408568845,
"grad_norm": 1.8336620330810547,
"learning_rate": 3.829136852593881e-06,
"loss": 0.4211,
"step": 955
},
{
"epoch": 0.5885453740933839,
"grad_norm": 1.7195727825164795,
"learning_rate": 3.8194431858231226e-06,
"loss": 0.4237,
"step": 956
},
{
"epoch": 0.5891610073298832,
"grad_norm": 1.6625844240188599,
"learning_rate": 3.8097542161044653e-06,
"loss": 0.4292,
"step": 957
},
{
"epoch": 0.5897766405663826,
"grad_norm": 1.9300004243850708,
"learning_rate": 3.8000699819871704e-06,
"loss": 0.4464,
"step": 958
},
{
"epoch": 0.590392273802882,
"grad_norm": 1.8018397092819214,
"learning_rate": 3.790390522001662e-06,
"loss": 0.4212,
"step": 959
},
{
"epoch": 0.5910079070393813,
"grad_norm": 1.8984296321868896,
"learning_rate": 3.780715874659366e-06,
"loss": 0.3802,
"step": 960
},
{
"epoch": 0.5916235402758806,
"grad_norm": 1.6990512609481812,
"learning_rate": 3.7710460784525617e-06,
"loss": 0.4089,
"step": 961
},
{
"epoch": 0.59223917351238,
"grad_norm": 1.6612354516983032,
"learning_rate": 3.761381171854227e-06,
"loss": 0.4378,
"step": 962
},
{
"epoch": 0.5928548067488794,
"grad_norm": 1.6652956008911133,
"learning_rate": 3.751721193317887e-06,
"loss": 0.4194,
"step": 963
},
{
"epoch": 0.5934704399853787,
"grad_norm": 1.7413960695266724,
"learning_rate": 3.7420661812774577e-06,
"loss": 0.4403,
"step": 964
},
{
"epoch": 0.5940860732218781,
"grad_norm": 1.7049193382263184,
"learning_rate": 3.7324161741470975e-06,
"loss": 0.4331,
"step": 965
},
{
"epoch": 0.5947017064583774,
"grad_norm": 1.7444761991500854,
"learning_rate": 3.7227712103210485e-06,
"loss": 0.4205,
"step": 966
},
{
"epoch": 0.5953173396948768,
"grad_norm": 1.8145601749420166,
"learning_rate": 3.7131313281734895e-06,
"loss": 0.4234,
"step": 967
},
{
"epoch": 0.5959329729313761,
"grad_norm": 1.758312702178955,
"learning_rate": 3.7034965660583794e-06,
"loss": 0.4213,
"step": 968
},
{
"epoch": 0.5965486061678755,
"grad_norm": 1.9307377338409424,
"learning_rate": 3.6938669623093086e-06,
"loss": 0.4348,
"step": 969
},
{
"epoch": 0.5971642394043748,
"grad_norm": 1.7513355016708374,
"learning_rate": 3.6842425552393424e-06,
"loss": 0.4503,
"step": 970
},
{
"epoch": 0.5977798726408742,
"grad_norm": 1.624816656112671,
"learning_rate": 3.6746233831408706e-06,
"loss": 0.4049,
"step": 971
},
{
"epoch": 0.5983955058773736,
"grad_norm": 1.6518126726150513,
"learning_rate": 3.6650094842854532e-06,
"loss": 0.4209,
"step": 972
},
{
"epoch": 0.5990111391138729,
"grad_norm": 1.6552377939224243,
"learning_rate": 3.655400896923672e-06,
"loss": 0.4019,
"step": 973
},
{
"epoch": 0.5996267723503723,
"grad_norm": 1.7360631227493286,
"learning_rate": 3.6457976592849753e-06,
"loss": 0.4301,
"step": 974
},
{
"epoch": 0.6002424055868716,
"grad_norm": 1.5934597253799438,
"learning_rate": 3.636199809577524e-06,
"loss": 0.4172,
"step": 975
},
{
"epoch": 0.600858038823371,
"grad_norm": 1.6730518341064453,
"learning_rate": 3.62660738598805e-06,
"loss": 0.4075,
"step": 976
},
{
"epoch": 0.6014736720598703,
"grad_norm": 1.6485260725021362,
"learning_rate": 3.6170204266816854e-06,
"loss": 0.3908,
"step": 977
},
{
"epoch": 0.6020893052963697,
"grad_norm": 1.7933323383331299,
"learning_rate": 3.6074389698018295e-06,
"loss": 0.4214,
"step": 978
},
{
"epoch": 0.602704938532869,
"grad_norm": 1.7532880306243896,
"learning_rate": 3.5978630534699873e-06,
"loss": 0.4064,
"step": 979
},
{
"epoch": 0.6033205717693684,
"grad_norm": 1.749861478805542,
"learning_rate": 3.5882927157856175e-06,
"loss": 0.3909,
"step": 980
},
{
"epoch": 0.6039362050058678,
"grad_norm": 1.7551836967468262,
"learning_rate": 3.578727994825988e-06,
"loss": 0.421,
"step": 981
},
{
"epoch": 0.6045518382423671,
"grad_norm": 1.8642972707748413,
"learning_rate": 3.5691689286460172e-06,
"loss": 0.4359,
"step": 982
},
{
"epoch": 0.6051674714788665,
"grad_norm": 1.601855993270874,
"learning_rate": 3.5596155552781207e-06,
"loss": 0.4009,
"step": 983
},
{
"epoch": 0.6057831047153658,
"grad_norm": 1.6237114667892456,
"learning_rate": 3.550067912732069e-06,
"loss": 0.4126,
"step": 984
},
{
"epoch": 0.6063987379518652,
"grad_norm": 1.7239121198654175,
"learning_rate": 3.540526038994834e-06,
"loss": 0.414,
"step": 985
},
{
"epoch": 0.6070143711883645,
"grad_norm": 1.7628268003463745,
"learning_rate": 3.530989972030428e-06,
"loss": 0.4464,
"step": 986
},
{
"epoch": 0.6076300044248639,
"grad_norm": 1.675179362297058,
"learning_rate": 3.521459749779769e-06,
"loss": 0.3922,
"step": 987
},
{
"epoch": 0.6082456376613632,
"grad_norm": 1.7558987140655518,
"learning_rate": 3.5119354101605086e-06,
"loss": 0.4214,
"step": 988
},
{
"epoch": 0.6088612708978626,
"grad_norm": 1.877814531326294,
"learning_rate": 3.502416991066904e-06,
"loss": 0.4259,
"step": 989
},
{
"epoch": 0.609476904134362,
"grad_norm": 1.7786641120910645,
"learning_rate": 3.492904530369655e-06,
"loss": 0.4338,
"step": 990
},
{
"epoch": 0.6100925373708613,
"grad_norm": 1.812157154083252,
"learning_rate": 3.4833980659157507e-06,
"loss": 0.4233,
"step": 991
},
{
"epoch": 0.6107081706073607,
"grad_norm": 1.7805922031402588,
"learning_rate": 3.4738976355283257e-06,
"loss": 0.4639,
"step": 992
},
{
"epoch": 0.61132380384386,
"grad_norm": 1.6990723609924316,
"learning_rate": 3.464403277006508e-06,
"loss": 0.4129,
"step": 993
},
{
"epoch": 0.6119394370803594,
"grad_norm": 1.7403292655944824,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.429,
"step": 994
},
{
"epoch": 0.6125550703168587,
"grad_norm": 1.8647472858428955,
"learning_rate": 3.4454329266352543e-06,
"loss": 0.4163,
"step": 995
},
{
"epoch": 0.6131707035533581,
"grad_norm": 1.6785216331481934,
"learning_rate": 3.435957010262682e-06,
"loss": 0.4122,
"step": 996
},
{
"epoch": 0.6137863367898574,
"grad_norm": 1.7292088270187378,
"learning_rate": 3.4264873167091405e-06,
"loss": 0.4378,
"step": 997
},
{
"epoch": 0.6144019700263568,
"grad_norm": 1.7312982082366943,
"learning_rate": 3.4170238836514645e-06,
"loss": 0.3968,
"step": 998
},
{
"epoch": 0.6150176032628561,
"grad_norm": 1.7593203783035278,
"learning_rate": 3.4075667487415785e-06,
"loss": 0.4182,
"step": 999
},
{
"epoch": 0.6156332364993555,
"grad_norm": 1.7004156112670898,
"learning_rate": 3.398115949606352e-06,
"loss": 0.3914,
"step": 1000
},
{
"epoch": 0.6156332364993555,
"eval_loss": 0.41942450404167175,
"eval_runtime": 118.6467,
"eval_samples_per_second": 35.408,
"eval_steps_per_second": 4.433,
"step": 1000
},
{
"epoch": 0.6162488697358549,
"grad_norm": 1.7762300968170166,
"learning_rate": 3.3886715238474454e-06,
"loss": 0.4441,
"step": 1001
},
{
"epoch": 0.6168645029723542,
"grad_norm": 1.6758389472961426,
"learning_rate": 3.37923350904116e-06,
"loss": 0.4116,
"step": 1002
},
{
"epoch": 0.6174801362088536,
"grad_norm": 1.741321325302124,
"learning_rate": 3.3698019427382912e-06,
"loss": 0.4383,
"step": 1003
},
{
"epoch": 0.6180957694453529,
"grad_norm": 1.7947500944137573,
"learning_rate": 3.3603768624639786e-06,
"loss": 0.4261,
"step": 1004
},
{
"epoch": 0.6187114026818523,
"grad_norm": 1.74982750415802,
"learning_rate": 3.3509583057175547e-06,
"loss": 0.4293,
"step": 1005
},
{
"epoch": 0.6193270359183516,
"grad_norm": 1.919725775718689,
"learning_rate": 3.341546309972398e-06,
"loss": 0.4156,
"step": 1006
},
{
"epoch": 0.619942669154851,
"grad_norm": 2.013151168823242,
"learning_rate": 3.3321409126757807e-06,
"loss": 0.4306,
"step": 1007
},
{
"epoch": 0.6205583023913503,
"grad_norm": 1.8035351037979126,
"learning_rate": 3.322742151248726e-06,
"loss": 0.4317,
"step": 1008
},
{
"epoch": 0.6211739356278497,
"grad_norm": 1.722594141960144,
"learning_rate": 3.3133500630858507e-06,
"loss": 0.3966,
"step": 1009
},
{
"epoch": 0.6217895688643491,
"grad_norm": 1.719792366027832,
"learning_rate": 3.3039646855552243e-06,
"loss": 0.391,
"step": 1010
},
{
"epoch": 0.6224052021008484,
"grad_norm": 1.7098640203475952,
"learning_rate": 3.2945860559982153e-06,
"loss": 0.3938,
"step": 1011
},
{
"epoch": 0.6230208353373478,
"grad_norm": 1.6516406536102295,
"learning_rate": 3.2852142117293435e-06,
"loss": 0.4288,
"step": 1012
},
{
"epoch": 0.6236364685738471,
"grad_norm": 1.7048057317733765,
"learning_rate": 3.275849190036133e-06,
"loss": 0.4214,
"step": 1013
},
{
"epoch": 0.6242521018103465,
"grad_norm": 1.7411932945251465,
"learning_rate": 3.266491028178964e-06,
"loss": 0.4102,
"step": 1014
},
{
"epoch": 0.6248677350468459,
"grad_norm": 1.5722557306289673,
"learning_rate": 3.2571397633909252e-06,
"loss": 0.4009,
"step": 1015
},
{
"epoch": 0.6254833682833452,
"grad_norm": 1.5852289199829102,
"learning_rate": 3.2477954328776574e-06,
"loss": 0.3832,
"step": 1016
},
{
"epoch": 0.6260990015198445,
"grad_norm": 1.6573041677474976,
"learning_rate": 3.2384580738172185e-06,
"loss": 0.4145,
"step": 1017
},
{
"epoch": 0.6267146347563439,
"grad_norm": 1.798135757446289,
"learning_rate": 3.229127723359927e-06,
"loss": 0.4606,
"step": 1018
},
{
"epoch": 0.6273302679928433,
"grad_norm": 1.8369131088256836,
"learning_rate": 3.219804418628216e-06,
"loss": 0.4171,
"step": 1019
},
{
"epoch": 0.6279459012293426,
"grad_norm": 1.6728371381759644,
"learning_rate": 3.2104881967164886e-06,
"loss": 0.3942,
"step": 1020
},
{
"epoch": 0.628561534465842,
"grad_norm": 1.8083055019378662,
"learning_rate": 3.2011790946909673e-06,
"loss": 0.448,
"step": 1021
},
{
"epoch": 0.6291771677023413,
"grad_norm": 1.6701570749282837,
"learning_rate": 3.1918771495895395e-06,
"loss": 0.4179,
"step": 1022
},
{
"epoch": 0.6297928009388407,
"grad_norm": 1.777587652206421,
"learning_rate": 3.1825823984216264e-06,
"loss": 0.4296,
"step": 1023
},
{
"epoch": 0.6304084341753401,
"grad_norm": 1.6975241899490356,
"learning_rate": 3.173294878168025e-06,
"loss": 0.3934,
"step": 1024
},
{
"epoch": 0.6310240674118394,
"grad_norm": 1.6508057117462158,
"learning_rate": 3.1640146257807604e-06,
"loss": 0.3989,
"step": 1025
},
{
"epoch": 0.6316397006483387,
"grad_norm": 1.7512353658676147,
"learning_rate": 3.154741678182945e-06,
"loss": 0.4084,
"step": 1026
},
{
"epoch": 0.6322553338848381,
"grad_norm": 1.6738899946212769,
"learning_rate": 3.1454760722686206e-06,
"loss": 0.4062,
"step": 1027
},
{
"epoch": 0.6328709671213375,
"grad_norm": 1.9070992469787598,
"learning_rate": 3.1362178449026246e-06,
"loss": 0.4404,
"step": 1028
},
{
"epoch": 0.6334866003578368,
"grad_norm": 1.5783123970031738,
"learning_rate": 3.12696703292044e-06,
"loss": 0.4073,
"step": 1029
},
{
"epoch": 0.6341022335943362,
"grad_norm": 1.8654526472091675,
"learning_rate": 3.11772367312804e-06,
"loss": 0.4228,
"step": 1030
},
{
"epoch": 0.6347178668308355,
"grad_norm": 1.5824333429336548,
"learning_rate": 3.1084878023017517e-06,
"loss": 0.3902,
"step": 1031
},
{
"epoch": 0.6353335000673349,
"grad_norm": 1.7520995140075684,
"learning_rate": 3.0992594571881056e-06,
"loss": 0.4149,
"step": 1032
},
{
"epoch": 0.6359491333038343,
"grad_norm": 1.7872405052185059,
"learning_rate": 3.090038674503688e-06,
"loss": 0.4332,
"step": 1033
},
{
"epoch": 0.6365647665403336,
"grad_norm": 1.7681427001953125,
"learning_rate": 3.0808254909349987e-06,
"loss": 0.404,
"step": 1034
},
{
"epoch": 0.6371803997768329,
"grad_norm": 1.7980319261550903,
"learning_rate": 3.071619943138303e-06,
"loss": 0.4751,
"step": 1035
},
{
"epoch": 0.6377960330133323,
"grad_norm": 1.615838646888733,
"learning_rate": 3.0624220677394854e-06,
"loss": 0.3881,
"step": 1036
},
{
"epoch": 0.6384116662498317,
"grad_norm": 1.9394729137420654,
"learning_rate": 3.0532319013339053e-06,
"loss": 0.3998,
"step": 1037
},
{
"epoch": 0.639027299486331,
"grad_norm": 1.7381932735443115,
"learning_rate": 3.044049480486247e-06,
"loss": 0.4133,
"step": 1038
},
{
"epoch": 0.6396429327228303,
"grad_norm": 2.063239097595215,
"learning_rate": 3.0348748417303826e-06,
"loss": 0.4368,
"step": 1039
},
{
"epoch": 0.6402585659593297,
"grad_norm": 1.744970679283142,
"learning_rate": 3.025708021569219e-06,
"loss": 0.4323,
"step": 1040
},
{
"epoch": 0.6408741991958291,
"grad_norm": 1.686367154121399,
"learning_rate": 3.016549056474557e-06,
"loss": 0.4231,
"step": 1041
},
{
"epoch": 0.6414898324323285,
"grad_norm": 1.6732068061828613,
"learning_rate": 3.007397982886942e-06,
"loss": 0.3948,
"step": 1042
},
{
"epoch": 0.6421054656688278,
"grad_norm": 1.747349500656128,
"learning_rate": 2.9982548372155264e-06,
"loss": 0.447,
"step": 1043
},
{
"epoch": 0.6427210989053271,
"grad_norm": 1.8045332431793213,
"learning_rate": 2.989119655837913e-06,
"loss": 0.4287,
"step": 1044
},
{
"epoch": 0.6433367321418265,
"grad_norm": 1.688730001449585,
"learning_rate": 2.979992475100024e-06,
"loss": 0.4378,
"step": 1045
},
{
"epoch": 0.6439523653783259,
"grad_norm": 1.8829971551895142,
"learning_rate": 2.9708733313159464e-06,
"loss": 0.416,
"step": 1046
},
{
"epoch": 0.6445679986148252,
"grad_norm": 1.7018674612045288,
"learning_rate": 2.961762260767791e-06,
"loss": 0.4234,
"step": 1047
},
{
"epoch": 0.6451836318513245,
"grad_norm": 1.718910813331604,
"learning_rate": 2.9526592997055488e-06,
"loss": 0.3954,
"step": 1048
},
{
"epoch": 0.6457992650878239,
"grad_norm": 1.749856948852539,
"learning_rate": 2.9435644843469434e-06,
"loss": 0.4294,
"step": 1049
},
{
"epoch": 0.6464148983243233,
"grad_norm": 1.732338786125183,
"learning_rate": 2.934477850877292e-06,
"loss": 0.4071,
"step": 1050
},
{
"epoch": 0.6470305315608227,
"grad_norm": 1.71662175655365,
"learning_rate": 2.9253994354493575e-06,
"loss": 0.3991,
"step": 1051
},
{
"epoch": 0.647646164797322,
"grad_norm": 1.7738405466079712,
"learning_rate": 2.916329274183206e-06,
"loss": 0.4385,
"step": 1052
},
{
"epoch": 0.6482617980338213,
"grad_norm": 1.7057359218597412,
"learning_rate": 2.9072674031660647e-06,
"loss": 0.4068,
"step": 1053
},
{
"epoch": 0.6488774312703207,
"grad_norm": 1.8498355150222778,
"learning_rate": 2.8982138584521734e-06,
"loss": 0.4146,
"step": 1054
},
{
"epoch": 0.6494930645068201,
"grad_norm": 1.7028414011001587,
"learning_rate": 2.8891686760626445e-06,
"loss": 0.4142,
"step": 1055
},
{
"epoch": 0.6501086977433194,
"grad_norm": 1.7531042098999023,
"learning_rate": 2.8801318919853237e-06,
"loss": 0.3989,
"step": 1056
},
{
"epoch": 0.6507243309798187,
"grad_norm": 1.7394646406173706,
"learning_rate": 2.871103542174637e-06,
"loss": 0.4068,
"step": 1057
},
{
"epoch": 0.6513399642163181,
"grad_norm": 1.8246972560882568,
"learning_rate": 2.8620836625514577e-06,
"loss": 0.4311,
"step": 1058
},
{
"epoch": 0.6519555974528175,
"grad_norm": 1.6580798625946045,
"learning_rate": 2.853072289002954e-06,
"loss": 0.4046,
"step": 1059
},
{
"epoch": 0.6525712306893169,
"grad_norm": 1.8089441061019897,
"learning_rate": 2.844069457382459e-06,
"loss": 0.4322,
"step": 1060
},
{
"epoch": 0.6531868639258162,
"grad_norm": 1.7561008930206299,
"learning_rate": 2.835075203509312e-06,
"loss": 0.407,
"step": 1061
},
{
"epoch": 0.6538024971623155,
"grad_norm": 1.74833345413208,
"learning_rate": 2.8260895631687267e-06,
"loss": 0.4334,
"step": 1062
},
{
"epoch": 0.6544181303988149,
"grad_norm": 1.6814000606536865,
"learning_rate": 2.817112572111651e-06,
"loss": 0.4253,
"step": 1063
},
{
"epoch": 0.6550337636353143,
"grad_norm": 1.8488044738769531,
"learning_rate": 2.8081442660546126e-06,
"loss": 0.4284,
"step": 1064
},
{
"epoch": 0.6556493968718137,
"grad_norm": 1.7568581104278564,
"learning_rate": 2.799184680679592e-06,
"loss": 0.3943,
"step": 1065
},
{
"epoch": 0.6562650301083129,
"grad_norm": 1.7765746116638184,
"learning_rate": 2.790233851633868e-06,
"loss": 0.4156,
"step": 1066
},
{
"epoch": 0.6568806633448123,
"grad_norm": 1.9872164726257324,
"learning_rate": 2.7812918145298785e-06,
"loss": 0.4239,
"step": 1067
},
{
"epoch": 0.6574962965813117,
"grad_norm": 1.6706302165985107,
"learning_rate": 2.7723586049450902e-06,
"loss": 0.3858,
"step": 1068
},
{
"epoch": 0.6581119298178111,
"grad_norm": 1.7720205783843994,
"learning_rate": 2.7634342584218364e-06,
"loss": 0.4312,
"step": 1069
},
{
"epoch": 0.6587275630543103,
"grad_norm": 1.5793696641921997,
"learning_rate": 2.7545188104671995e-06,
"loss": 0.3945,
"step": 1070
},
{
"epoch": 0.6593431962908097,
"grad_norm": 1.6533647775650024,
"learning_rate": 2.7456122965528475e-06,
"loss": 0.4268,
"step": 1071
},
{
"epoch": 0.6599588295273091,
"grad_norm": 1.676277756690979,
"learning_rate": 2.7367147521149052e-06,
"loss": 0.4068,
"step": 1072
},
{
"epoch": 0.6605744627638085,
"grad_norm": 1.6618707180023193,
"learning_rate": 2.7278262125538153e-06,
"loss": 0.4057,
"step": 1073
},
{
"epoch": 0.6611900960003079,
"grad_norm": 1.6752638816833496,
"learning_rate": 2.718946713234185e-06,
"loss": 0.404,
"step": 1074
},
{
"epoch": 0.6618057292368071,
"grad_norm": 1.6323987245559692,
"learning_rate": 2.7100762894846633e-06,
"loss": 0.3856,
"step": 1075
},
{
"epoch": 0.6624213624733065,
"grad_norm": 1.6033183336257935,
"learning_rate": 2.7012149765977823e-06,
"loss": 0.3717,
"step": 1076
},
{
"epoch": 0.6630369957098059,
"grad_norm": 1.7118803262710571,
"learning_rate": 2.692362809829825e-06,
"loss": 0.3968,
"step": 1077
},
{
"epoch": 0.6636526289463053,
"grad_norm": 1.763137936592102,
"learning_rate": 2.683519824400693e-06,
"loss": 0.4166,
"step": 1078
},
{
"epoch": 0.6642682621828045,
"grad_norm": 1.8000075817108154,
"learning_rate": 2.674686055493748e-06,
"loss": 0.4143,
"step": 1079
},
{
"epoch": 0.6648838954193039,
"grad_norm": 1.8939229249954224,
"learning_rate": 2.66586153825569e-06,
"loss": 0.4026,
"step": 1080
},
{
"epoch": 0.6654995286558033,
"grad_norm": 1.8988964557647705,
"learning_rate": 2.657046307796407e-06,
"loss": 0.4102,
"step": 1081
},
{
"epoch": 0.6661151618923027,
"grad_norm": 1.603047490119934,
"learning_rate": 2.648240399188837e-06,
"loss": 0.4035,
"step": 1082
},
{
"epoch": 0.6667307951288021,
"grad_norm": 1.6859813928604126,
"learning_rate": 2.639443847468831e-06,
"loss": 0.4147,
"step": 1083
},
{
"epoch": 0.6673464283653013,
"grad_norm": 1.7707592248916626,
"learning_rate": 2.6306566876350072e-06,
"loss": 0.3923,
"step": 1084
},
{
"epoch": 0.6679620616018007,
"grad_norm": 1.7900476455688477,
"learning_rate": 2.6218789546486235e-06,
"loss": 0.4022,
"step": 1085
},
{
"epoch": 0.6685776948383001,
"grad_norm": 1.7816258668899536,
"learning_rate": 2.6131106834334296e-06,
"loss": 0.419,
"step": 1086
},
{
"epoch": 0.6691933280747995,
"grad_norm": 1.775720477104187,
"learning_rate": 2.6043519088755263e-06,
"loss": 0.415,
"step": 1087
},
{
"epoch": 0.6698089613112987,
"grad_norm": 1.8259687423706055,
"learning_rate": 2.5956026658232347e-06,
"loss": 0.4257,
"step": 1088
},
{
"epoch": 0.6704245945477981,
"grad_norm": 1.7508234977722168,
"learning_rate": 2.5868629890869467e-06,
"loss": 0.4181,
"step": 1089
},
{
"epoch": 0.6710402277842975,
"grad_norm": 1.6744227409362793,
"learning_rate": 2.578132913439e-06,
"loss": 0.3982,
"step": 1090
},
{
"epoch": 0.6716558610207969,
"grad_norm": 1.828457236289978,
"learning_rate": 2.5694124736135315e-06,
"loss": 0.4321,
"step": 1091
},
{
"epoch": 0.6722714942572963,
"grad_norm": 1.7128944396972656,
"learning_rate": 2.560701704306336e-06,
"loss": 0.4008,
"step": 1092
},
{
"epoch": 0.6728871274937955,
"grad_norm": 1.7066082954406738,
"learning_rate": 2.55200064017474e-06,
"loss": 0.4134,
"step": 1093
},
{
"epoch": 0.6735027607302949,
"grad_norm": 1.647888422012329,
"learning_rate": 2.543309315837444e-06,
"loss": 0.3792,
"step": 1094
},
{
"epoch": 0.6741183939667943,
"grad_norm": 1.7262251377105713,
"learning_rate": 2.5346277658744083e-06,
"loss": 0.4137,
"step": 1095
},
{
"epoch": 0.6747340272032937,
"grad_norm": 1.782806158065796,
"learning_rate": 2.5259560248267022e-06,
"loss": 0.4341,
"step": 1096
},
{
"epoch": 0.6753496604397929,
"grad_norm": 1.7016490697860718,
"learning_rate": 2.5172941271963626e-06,
"loss": 0.4176,
"step": 1097
},
{
"epoch": 0.6759652936762923,
"grad_norm": 1.7586191892623901,
"learning_rate": 2.5086421074462707e-06,
"loss": 0.4121,
"step": 1098
},
{
"epoch": 0.6765809269127917,
"grad_norm": 1.7193186283111572,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.4088,
"step": 1099
},
{
"epoch": 0.6771965601492911,
"grad_norm": 1.7770181894302368,
"learning_rate": 2.49136783924169e-06,
"loss": 0.4464,
"step": 1100
},
{
"epoch": 0.6771965601492911,
"eval_loss": 0.40752142667770386,
"eval_runtime": 118.9248,
"eval_samples_per_second": 35.325,
"eval_steps_per_second": 4.423,
"step": 1100
},
{
"epoch": 0.6778121933857905,
"grad_norm": 1.7708814144134521,
"learning_rate": 2.482745659515905e-06,
"loss": 0.3943,
"step": 1101
},
{
"epoch": 0.6784278266222897,
"grad_norm": 1.7782667875289917,
"learning_rate": 2.4741334951274948e-06,
"loss": 0.411,
"step": 1102
},
{
"epoch": 0.6790434598587891,
"grad_norm": 1.8407803773880005,
"learning_rate": 2.4655313803414676e-06,
"loss": 0.4085,
"step": 1103
},
{
"epoch": 0.6796590930952885,
"grad_norm": 1.6154685020446777,
"learning_rate": 2.4569393493828433e-06,
"loss": 0.4043,
"step": 1104
},
{
"epoch": 0.6802747263317879,
"grad_norm": 1.7624921798706055,
"learning_rate": 2.448357436436519e-06,
"loss": 0.4309,
"step": 1105
},
{
"epoch": 0.6808903595682871,
"grad_norm": 1.7305024862289429,
"learning_rate": 2.4397856756471435e-06,
"loss": 0.3809,
"step": 1106
},
{
"epoch": 0.6815059928047865,
"grad_norm": 1.7794702053070068,
"learning_rate": 2.4312241011189643e-06,
"loss": 0.4138,
"step": 1107
},
{
"epoch": 0.6821216260412859,
"grad_norm": 1.7242809534072876,
"learning_rate": 2.4226727469157097e-06,
"loss": 0.4045,
"step": 1108
},
{
"epoch": 0.6827372592777853,
"grad_norm": 1.867807149887085,
"learning_rate": 2.4141316470604362e-06,
"loss": 0.4258,
"step": 1109
},
{
"epoch": 0.6833528925142847,
"grad_norm": 1.7269940376281738,
"learning_rate": 2.405600835535411e-06,
"loss": 0.4283,
"step": 1110
},
{
"epoch": 0.6839685257507839,
"grad_norm": 1.7357679605484009,
"learning_rate": 2.3970803462819586e-06,
"loss": 0.4161,
"step": 1111
},
{
"epoch": 0.6845841589872833,
"grad_norm": 1.7404741048812866,
"learning_rate": 2.388570213200337e-06,
"loss": 0.3982,
"step": 1112
},
{
"epoch": 0.6851997922237827,
"grad_norm": 1.899558186531067,
"learning_rate": 2.380070470149605e-06,
"loss": 0.4103,
"step": 1113
},
{
"epoch": 0.6858154254602821,
"grad_norm": 1.8454153537750244,
"learning_rate": 2.371581150947476e-06,
"loss": 0.3985,
"step": 1114
},
{
"epoch": 0.6864310586967813,
"grad_norm": 1.6742509603500366,
"learning_rate": 2.363102289370198e-06,
"loss": 0.3938,
"step": 1115
},
{
"epoch": 0.6870466919332807,
"grad_norm": 1.6200331449508667,
"learning_rate": 2.354633919152404e-06,
"loss": 0.3638,
"step": 1116
},
{
"epoch": 0.6876623251697801,
"grad_norm": 1.6609104871749878,
"learning_rate": 2.3461760739869865e-06,
"loss": 0.4015,
"step": 1117
},
{
"epoch": 0.6882779584062795,
"grad_norm": 1.658146619796753,
"learning_rate": 2.3377287875249694e-06,
"loss": 0.3951,
"step": 1118
},
{
"epoch": 0.6888935916427789,
"grad_norm": 1.9421802759170532,
"learning_rate": 2.3292920933753566e-06,
"loss": 0.4385,
"step": 1119
},
{
"epoch": 0.6895092248792781,
"grad_norm": 1.8629512786865234,
"learning_rate": 2.320866025105016e-06,
"loss": 0.4117,
"step": 1120
},
{
"epoch": 0.6901248581157775,
"grad_norm": 1.7256838083267212,
"learning_rate": 2.31245061623854e-06,
"loss": 0.4164,
"step": 1121
},
{
"epoch": 0.6907404913522769,
"grad_norm": 2.03247332572937,
"learning_rate": 2.3040459002581e-06,
"loss": 0.4423,
"step": 1122
},
{
"epoch": 0.6913561245887763,
"grad_norm": 1.765477180480957,
"learning_rate": 2.2956519106033366e-06,
"loss": 0.4036,
"step": 1123
},
{
"epoch": 0.6919717578252756,
"grad_norm": 1.7879197597503662,
"learning_rate": 2.2872686806712037e-06,
"loss": 0.4357,
"step": 1124
},
{
"epoch": 0.6925873910617749,
"grad_norm": 1.898209571838379,
"learning_rate": 2.278896243815852e-06,
"loss": 0.4032,
"step": 1125
},
{
"epoch": 0.6932030242982743,
"grad_norm": 1.812808871269226,
"learning_rate": 2.2705346333484925e-06,
"loss": 0.459,
"step": 1126
},
{
"epoch": 0.6938186575347737,
"grad_norm": 1.8547736406326294,
"learning_rate": 2.2621838825372496e-06,
"loss": 0.4036,
"step": 1127
},
{
"epoch": 0.694434290771273,
"grad_norm": 1.6805411577224731,
"learning_rate": 2.253844024607054e-06,
"loss": 0.3908,
"step": 1128
},
{
"epoch": 0.6950499240077723,
"grad_norm": 1.666832685470581,
"learning_rate": 2.245515092739488e-06,
"loss": 0.4003,
"step": 1129
},
{
"epoch": 0.6956655572442717,
"grad_norm": 1.7172799110412598,
"learning_rate": 2.237197120072667e-06,
"loss": 0.4229,
"step": 1130
},
{
"epoch": 0.6962811904807711,
"grad_norm": 1.6013420820236206,
"learning_rate": 2.228890139701106e-06,
"loss": 0.3989,
"step": 1131
},
{
"epoch": 0.6968968237172705,
"grad_norm": 1.7002962827682495,
"learning_rate": 2.2205941846755787e-06,
"loss": 0.3897,
"step": 1132
},
{
"epoch": 0.6975124569537698,
"grad_norm": 1.7835053205490112,
"learning_rate": 2.2123092880029928e-06,
"loss": 0.3967,
"step": 1133
},
{
"epoch": 0.6981280901902691,
"grad_norm": 1.798115849494934,
"learning_rate": 2.204035482646267e-06,
"loss": 0.4067,
"step": 1134
},
{
"epoch": 0.6987437234267685,
"grad_norm": 1.706468939781189,
"learning_rate": 2.1957728015241793e-06,
"loss": 0.4123,
"step": 1135
},
{
"epoch": 0.6993593566632679,
"grad_norm": 1.8424924612045288,
"learning_rate": 2.187521277511259e-06,
"loss": 0.4145,
"step": 1136
},
{
"epoch": 0.6999749898997673,
"grad_norm": 1.9657038450241089,
"learning_rate": 2.1792809434376366e-06,
"loss": 0.4245,
"step": 1137
},
{
"epoch": 0.7005906231362665,
"grad_norm": 1.906627893447876,
"learning_rate": 2.171051832088928e-06,
"loss": 0.4311,
"step": 1138
},
{
"epoch": 0.7012062563727659,
"grad_norm": 2.0402626991271973,
"learning_rate": 2.162833976206092e-06,
"loss": 0.3795,
"step": 1139
},
{
"epoch": 0.7018218896092653,
"grad_norm": 1.7367284297943115,
"learning_rate": 2.1546274084853062e-06,
"loss": 0.4367,
"step": 1140
},
{
"epoch": 0.7024375228457647,
"grad_norm": 1.8020879030227661,
"learning_rate": 2.146432161577842e-06,
"loss": 0.4153,
"step": 1141
},
{
"epoch": 0.703053156082264,
"grad_norm": 1.6975538730621338,
"learning_rate": 2.1382482680899213e-06,
"loss": 0.4141,
"step": 1142
},
{
"epoch": 0.7036687893187633,
"grad_norm": 1.5046709775924683,
"learning_rate": 2.130075760582602e-06,
"loss": 0.3734,
"step": 1143
},
{
"epoch": 0.7042844225552627,
"grad_norm": 1.6403920650482178,
"learning_rate": 2.1219146715716332e-06,
"loss": 0.3891,
"step": 1144
},
{
"epoch": 0.7049000557917621,
"grad_norm": 1.7096176147460938,
"learning_rate": 2.113765033527338e-06,
"loss": 0.4007,
"step": 1145
},
{
"epoch": 0.7055156890282615,
"grad_norm": 1.775561809539795,
"learning_rate": 2.1056268788744803e-06,
"loss": 0.3982,
"step": 1146
},
{
"epoch": 0.7061313222647607,
"grad_norm": 1.9706265926361084,
"learning_rate": 2.097500239992132e-06,
"loss": 0.4374,
"step": 1147
},
{
"epoch": 0.7067469555012601,
"grad_norm": 1.670867681503296,
"learning_rate": 2.0893851492135536e-06,
"loss": 0.3741,
"step": 1148
},
{
"epoch": 0.7073625887377595,
"grad_norm": 2.0540754795074463,
"learning_rate": 2.081281638826052e-06,
"loss": 0.4138,
"step": 1149
},
{
"epoch": 0.7079782219742589,
"grad_norm": 1.6522064208984375,
"learning_rate": 2.0731897410708618e-06,
"loss": 0.4081,
"step": 1150
},
{
"epoch": 0.7085938552107582,
"grad_norm": 1.7104498147964478,
"learning_rate": 2.0651094881430194e-06,
"loss": 0.3942,
"step": 1151
},
{
"epoch": 0.7092094884472575,
"grad_norm": 1.8993293046951294,
"learning_rate": 2.0570409121912233e-06,
"loss": 0.3972,
"step": 1152
},
{
"epoch": 0.7098251216837569,
"grad_norm": 1.670290470123291,
"learning_rate": 2.0489840453177198e-06,
"loss": 0.3816,
"step": 1153
},
{
"epoch": 0.7104407549202563,
"grad_norm": 1.8099826574325562,
"learning_rate": 2.0409389195781627e-06,
"loss": 0.375,
"step": 1154
},
{
"epoch": 0.7110563881567556,
"grad_norm": 1.8723803758621216,
"learning_rate": 2.0329055669814936e-06,
"loss": 0.4192,
"step": 1155
},
{
"epoch": 0.7116720213932549,
"grad_norm": 1.5666253566741943,
"learning_rate": 2.0248840194898155e-06,
"loss": 0.3741,
"step": 1156
},
{
"epoch": 0.7122876546297543,
"grad_norm": 1.806522011756897,
"learning_rate": 2.0168743090182574e-06,
"loss": 0.3924,
"step": 1157
},
{
"epoch": 0.7129032878662537,
"grad_norm": 1.791751742362976,
"learning_rate": 2.0088764674348593e-06,
"loss": 0.4237,
"step": 1158
},
{
"epoch": 0.7135189211027531,
"grad_norm": 1.632794737815857,
"learning_rate": 2.0008905265604316e-06,
"loss": 0.4081,
"step": 1159
},
{
"epoch": 0.7141345543392524,
"grad_norm": 1.7562607526779175,
"learning_rate": 1.992916518168442e-06,
"loss": 0.3939,
"step": 1160
},
{
"epoch": 0.7147501875757517,
"grad_norm": 1.7803467512130737,
"learning_rate": 1.9849544739848782e-06,
"loss": 0.4032,
"step": 1161
},
{
"epoch": 0.7153658208122511,
"grad_norm": 1.7394850254058838,
"learning_rate": 1.977004425688126e-06,
"loss": 0.4083,
"step": 1162
},
{
"epoch": 0.7159814540487505,
"grad_norm": 1.5431199073791504,
"learning_rate": 1.9690664049088494e-06,
"loss": 0.366,
"step": 1163
},
{
"epoch": 0.7165970872852498,
"grad_norm": 1.7097880840301514,
"learning_rate": 1.9611404432298505e-06,
"loss": 0.4058,
"step": 1164
},
{
"epoch": 0.7172127205217491,
"grad_norm": 1.768742561340332,
"learning_rate": 1.95322657218596e-06,
"loss": 0.4349,
"step": 1165
},
{
"epoch": 0.7178283537582485,
"grad_norm": 1.8069496154785156,
"learning_rate": 1.945324823263899e-06,
"loss": 0.4023,
"step": 1166
},
{
"epoch": 0.7184439869947479,
"grad_norm": 1.7423423528671265,
"learning_rate": 1.937435227902157e-06,
"loss": 0.3918,
"step": 1167
},
{
"epoch": 0.7190596202312473,
"grad_norm": 1.7018494606018066,
"learning_rate": 1.929557817490874e-06,
"loss": 0.4077,
"step": 1168
},
{
"epoch": 0.7196752534677466,
"grad_norm": 1.7701889276504517,
"learning_rate": 1.9216926233717087e-06,
"loss": 0.4062,
"step": 1169
},
{
"epoch": 0.7202908867042459,
"grad_norm": 1.7747608423233032,
"learning_rate": 1.9138396768377106e-06,
"loss": 0.4047,
"step": 1170
},
{
"epoch": 0.7209065199407453,
"grad_norm": 1.7562189102172852,
"learning_rate": 1.9059990091332082e-06,
"loss": 0.3998,
"step": 1171
},
{
"epoch": 0.7215221531772447,
"grad_norm": 1.805248498916626,
"learning_rate": 1.8981706514536641e-06,
"loss": 0.4238,
"step": 1172
},
{
"epoch": 0.722137786413744,
"grad_norm": 1.671569585800171,
"learning_rate": 1.8903546349455748e-06,
"loss": 0.3735,
"step": 1173
},
{
"epoch": 0.7227534196502434,
"grad_norm": 1.866377830505371,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.4411,
"step": 1174
},
{
"epoch": 0.7233690528867427,
"grad_norm": 1.6782642602920532,
"learning_rate": 1.8747597497841003e-06,
"loss": 0.3815,
"step": 1175
},
{
"epoch": 0.7239846861232421,
"grad_norm": 1.7450333833694458,
"learning_rate": 1.8669809431776991e-06,
"loss": 0.3848,
"step": 1176
},
{
"epoch": 0.7246003193597415,
"grad_norm": 1.8355008363723755,
"learning_rate": 1.8592146018364682e-06,
"loss": 0.4183,
"step": 1177
},
{
"epoch": 0.7252159525962408,
"grad_norm": 1.8200833797454834,
"learning_rate": 1.851460756660159e-06,
"loss": 0.3915,
"step": 1178
},
{
"epoch": 0.7258315858327401,
"grad_norm": 1.7371735572814941,
"learning_rate": 1.843719438498806e-06,
"loss": 0.4138,
"step": 1179
},
{
"epoch": 0.7264472190692395,
"grad_norm": 1.702866554260254,
"learning_rate": 1.8359906781525955e-06,
"loss": 0.3959,
"step": 1180
},
{
"epoch": 0.7270628523057389,
"grad_norm": 1.6576327085494995,
"learning_rate": 1.8282745063717577e-06,
"loss": 0.3961,
"step": 1181
},
{
"epoch": 0.7276784855422382,
"grad_norm": 1.7897869348526,
"learning_rate": 1.8205709538564326e-06,
"loss": 0.4128,
"step": 1182
},
{
"epoch": 0.7282941187787376,
"grad_norm": 1.6971567869186401,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.391,
"step": 1183
},
{
"epoch": 0.7289097520152369,
"grad_norm": 1.7428371906280518,
"learning_rate": 1.8052018291717216e-06,
"loss": 0.3992,
"step": 1184
},
{
"epoch": 0.7295253852517363,
"grad_norm": 1.6395870447158813,
"learning_rate": 1.7975363181510901e-06,
"loss": 0.384,
"step": 1185
},
{
"epoch": 0.7301410184882356,
"grad_norm": 1.7006916999816895,
"learning_rate": 1.7898835486932398e-06,
"loss": 0.4237,
"step": 1186
},
{
"epoch": 0.730756651724735,
"grad_norm": 1.6358023881912231,
"learning_rate": 1.7822435512460512e-06,
"loss": 0.372,
"step": 1187
},
{
"epoch": 0.7313722849612343,
"grad_norm": 1.7687275409698486,
"learning_rate": 1.7746163562065955e-06,
"loss": 0.4198,
"step": 1188
},
{
"epoch": 0.7319879181977337,
"grad_norm": 1.8274449110031128,
"learning_rate": 1.7670019939210025e-06,
"loss": 0.4086,
"step": 1189
},
{
"epoch": 0.7326035514342331,
"grad_norm": 1.8345245122909546,
"learning_rate": 1.7594004946843458e-06,
"loss": 0.3979,
"step": 1190
},
{
"epoch": 0.7332191846707324,
"grad_norm": 1.7566828727722168,
"learning_rate": 1.7518118887405239e-06,
"loss": 0.4244,
"step": 1191
},
{
"epoch": 0.7338348179072318,
"grad_norm": 1.7326489686965942,
"learning_rate": 1.7442362062821323e-06,
"loss": 0.4081,
"step": 1192
},
{
"epoch": 0.7344504511437311,
"grad_norm": 1.7257455587387085,
"learning_rate": 1.7366734774503541e-06,
"loss": 0.3989,
"step": 1193
},
{
"epoch": 0.7350660843802305,
"grad_norm": 1.5269023180007935,
"learning_rate": 1.7291237323348287e-06,
"loss": 0.3905,
"step": 1194
},
{
"epoch": 0.7356817176167298,
"grad_norm": 1.63347589969635,
"learning_rate": 1.7215870009735386e-06,
"loss": 0.3853,
"step": 1195
},
{
"epoch": 0.7362973508532292,
"grad_norm": 1.6561665534973145,
"learning_rate": 1.714063313352693e-06,
"loss": 0.3871,
"step": 1196
},
{
"epoch": 0.7369129840897285,
"grad_norm": 1.7448550462722778,
"learning_rate": 1.7065526994065973e-06,
"loss": 0.4134,
"step": 1197
},
{
"epoch": 0.7375286173262279,
"grad_norm": 1.7241095304489136,
"learning_rate": 1.6990551890175488e-06,
"loss": 0.3957,
"step": 1198
},
{
"epoch": 0.7381442505627273,
"grad_norm": 1.7178313732147217,
"learning_rate": 1.6915708120157042e-06,
"loss": 0.4008,
"step": 1199
},
{
"epoch": 0.7387598837992266,
"grad_norm": 1.663047432899475,
"learning_rate": 1.684099598178967e-06,
"loss": 0.3922,
"step": 1200
},
{
"epoch": 0.7387598837992266,
"eval_loss": 0.39609086513519287,
"eval_runtime": 118.1732,
"eval_samples_per_second": 35.55,
"eval_steps_per_second": 4.451,
"step": 1200
},
{
"epoch": 0.739375517035726,
"grad_norm": 1.841245412826538,
"learning_rate": 1.6766415772328732e-06,
"loss": 0.4101,
"step": 1201
},
{
"epoch": 0.7399911502722253,
"grad_norm": 1.8247209787368774,
"learning_rate": 1.669196778850462e-06,
"loss": 0.3869,
"step": 1202
},
{
"epoch": 0.7406067835087247,
"grad_norm": 1.6148180961608887,
"learning_rate": 1.6617652326521705e-06,
"loss": 0.3857,
"step": 1203
},
{
"epoch": 0.741222416745224,
"grad_norm": 1.8191235065460205,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.4058,
"step": 1204
},
{
"epoch": 0.7418380499817234,
"grad_norm": 1.7104837894439697,
"learning_rate": 1.6469420150259396e-06,
"loss": 0.3987,
"step": 1205
},
{
"epoch": 0.7424536832182227,
"grad_norm": 1.7302885055541992,
"learning_rate": 1.639550402574766e-06,
"loss": 0.3967,
"step": 1206
},
{
"epoch": 0.7430693164547221,
"grad_norm": 1.7323321104049683,
"learning_rate": 1.632172160261012e-06,
"loss": 0.4132,
"step": 1207
},
{
"epoch": 0.7436849496912215,
"grad_norm": 1.627414345741272,
"learning_rate": 1.6248073174403083e-06,
"loss": 0.4317,
"step": 1208
},
{
"epoch": 0.7443005829277208,
"grad_norm": 1.7372418642044067,
"learning_rate": 1.617455903414974e-06,
"loss": 0.4288,
"step": 1209
},
{
"epoch": 0.7449162161642202,
"grad_norm": 1.571689248085022,
"learning_rate": 1.610117947433897e-06,
"loss": 0.3923,
"step": 1210
},
{
"epoch": 0.7455318494007195,
"grad_norm": 1.6416131258010864,
"learning_rate": 1.6027934786924187e-06,
"loss": 0.3731,
"step": 1211
},
{
"epoch": 0.7461474826372189,
"grad_norm": 1.8294082880020142,
"learning_rate": 1.5954825263322215e-06,
"loss": 0.416,
"step": 1212
},
{
"epoch": 0.7467631158737182,
"grad_norm": 1.5999701023101807,
"learning_rate": 1.5881851194412106e-06,
"loss": 0.3864,
"step": 1213
},
{
"epoch": 0.7473787491102176,
"grad_norm": 1.582100749015808,
"learning_rate": 1.5809012870533996e-06,
"loss": 0.4004,
"step": 1214
},
{
"epoch": 0.7479943823467169,
"grad_norm": 1.7439690828323364,
"learning_rate": 1.57363105814879e-06,
"loss": 0.3662,
"step": 1215
},
{
"epoch": 0.7486100155832163,
"grad_norm": 1.7250139713287354,
"learning_rate": 1.5663744616532612e-06,
"loss": 0.3711,
"step": 1216
},
{
"epoch": 0.7492256488197157,
"grad_norm": 1.5566151142120361,
"learning_rate": 1.559131526438452e-06,
"loss": 0.3768,
"step": 1217
},
{
"epoch": 0.749841282056215,
"grad_norm": 1.6487715244293213,
"learning_rate": 1.551902281321651e-06,
"loss": 0.4079,
"step": 1218
},
{
"epoch": 0.7504569152927144,
"grad_norm": 1.6890857219696045,
"learning_rate": 1.544686755065677e-06,
"loss": 0.4214,
"step": 1219
},
{
"epoch": 0.7510725485292137,
"grad_norm": 1.7031358480453491,
"learning_rate": 1.537484976378763e-06,
"loss": 0.3975,
"step": 1220
},
{
"epoch": 0.7516881817657131,
"grad_norm": 1.5813488960266113,
"learning_rate": 1.5302969739144497e-06,
"loss": 0.3636,
"step": 1221
},
{
"epoch": 0.7523038150022124,
"grad_norm": 1.6897863149642944,
"learning_rate": 1.523122776271463e-06,
"loss": 0.4046,
"step": 1222
},
{
"epoch": 0.7529194482387118,
"grad_norm": 1.6395469903945923,
"learning_rate": 1.5159624119936028e-06,
"loss": 0.401,
"step": 1223
},
{
"epoch": 0.7535350814752111,
"grad_norm": 1.9050360918045044,
"learning_rate": 1.5088159095696365e-06,
"loss": 0.4201,
"step": 1224
},
{
"epoch": 0.7541507147117105,
"grad_norm": 1.8043394088745117,
"learning_rate": 1.5016832974331725e-06,
"loss": 0.3948,
"step": 1225
},
{
"epoch": 0.7547663479482098,
"grad_norm": 1.6688209772109985,
"learning_rate": 1.4945646039625611e-06,
"loss": 0.3852,
"step": 1226
},
{
"epoch": 0.7553819811847092,
"grad_norm": 1.732325553894043,
"learning_rate": 1.4874598574807697e-06,
"loss": 0.3934,
"step": 1227
},
{
"epoch": 0.7559976144212086,
"grad_norm": 1.6879240274429321,
"learning_rate": 1.4803690862552755e-06,
"loss": 0.4096,
"step": 1228
},
{
"epoch": 0.7566132476577079,
"grad_norm": 1.6550501585006714,
"learning_rate": 1.4732923184979563e-06,
"loss": 0.3892,
"step": 1229
},
{
"epoch": 0.7572288808942073,
"grad_norm": 1.8862385749816895,
"learning_rate": 1.4662295823649702e-06,
"loss": 0.4022,
"step": 1230
},
{
"epoch": 0.7578445141307066,
"grad_norm": 2.008464813232422,
"learning_rate": 1.459180905956653e-06,
"loss": 0.4218,
"step": 1231
},
{
"epoch": 0.758460147367206,
"grad_norm": 1.6959370374679565,
"learning_rate": 1.4521463173173966e-06,
"loss": 0.3896,
"step": 1232
},
{
"epoch": 0.7590757806037054,
"grad_norm": 1.7890487909317017,
"learning_rate": 1.4451258444355432e-06,
"loss": 0.3469,
"step": 1233
},
{
"epoch": 0.7596914138402047,
"grad_norm": 1.7809585332870483,
"learning_rate": 1.438119515243277e-06,
"loss": 0.3953,
"step": 1234
},
{
"epoch": 0.760307047076704,
"grad_norm": 1.6403446197509766,
"learning_rate": 1.431127357616503e-06,
"loss": 0.3852,
"step": 1235
},
{
"epoch": 0.7609226803132034,
"grad_norm": 1.7148042917251587,
"learning_rate": 1.424149399374748e-06,
"loss": 0.3885,
"step": 1236
},
{
"epoch": 0.7615383135497028,
"grad_norm": 1.762613296508789,
"learning_rate": 1.4171856682810386e-06,
"loss": 0.4068,
"step": 1237
},
{
"epoch": 0.7621539467862021,
"grad_norm": 1.8621405363082886,
"learning_rate": 1.4102361920418022e-06,
"loss": 0.3884,
"step": 1238
},
{
"epoch": 0.7627695800227015,
"grad_norm": 1.9017481803894043,
"learning_rate": 1.4033009983067454e-06,
"loss": 0.3999,
"step": 1239
},
{
"epoch": 0.7633852132592008,
"grad_norm": 1.7529352903366089,
"learning_rate": 1.39638011466875e-06,
"loss": 0.3954,
"step": 1240
},
{
"epoch": 0.7640008464957002,
"grad_norm": 1.744629144668579,
"learning_rate": 1.3894735686637672e-06,
"loss": 0.3868,
"step": 1241
},
{
"epoch": 0.7646164797321996,
"grad_norm": 1.6554515361785889,
"learning_rate": 1.3825813877706973e-06,
"loss": 0.3851,
"step": 1242
},
{
"epoch": 0.7652321129686989,
"grad_norm": 1.637499451637268,
"learning_rate": 1.3757035994112915e-06,
"loss": 0.4003,
"step": 1243
},
{
"epoch": 0.7658477462051982,
"grad_norm": 1.6990104913711548,
"learning_rate": 1.3688402309500353e-06,
"loss": 0.3871,
"step": 1244
},
{
"epoch": 0.7664633794416976,
"grad_norm": 1.6374468803405762,
"learning_rate": 1.3619913096940408e-06,
"loss": 0.3956,
"step": 1245
},
{
"epoch": 0.767079012678197,
"grad_norm": 1.8268284797668457,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.4302,
"step": 1246
},
{
"epoch": 0.7676946459146963,
"grad_norm": 1.6786978244781494,
"learning_rate": 1.3483369177387845e-06,
"loss": 0.3645,
"step": 1247
},
{
"epoch": 0.7683102791511957,
"grad_norm": 1.6917263269424438,
"learning_rate": 1.341531501365912e-06,
"loss": 0.4048,
"step": 1248
},
{
"epoch": 0.768925912387695,
"grad_norm": 1.892749547958374,
"learning_rate": 1.3347406408508695e-06,
"loss": 0.4066,
"step": 1249
},
{
"epoch": 0.7695415456241944,
"grad_norm": 1.8319532871246338,
"learning_rate": 1.3279643632122807e-06,
"loss": 0.4067,
"step": 1250
},
{
"epoch": 0.7701571788606938,
"grad_norm": 1.621427297592163,
"learning_rate": 1.3212026954107564e-06,
"loss": 0.3982,
"step": 1251
},
{
"epoch": 0.7707728120971931,
"grad_norm": 1.7341312170028687,
"learning_rate": 1.3144556643487743e-06,
"loss": 0.4222,
"step": 1252
},
{
"epoch": 0.7713884453336924,
"grad_norm": 1.6124165058135986,
"learning_rate": 1.3077232968705805e-06,
"loss": 0.371,
"step": 1253
},
{
"epoch": 0.7720040785701918,
"grad_norm": 1.6049258708953857,
"learning_rate": 1.3010056197620813e-06,
"loss": 0.3728,
"step": 1254
},
{
"epoch": 0.7726197118066912,
"grad_norm": 1.7597967386245728,
"learning_rate": 1.2943026597507268e-06,
"loss": 0.4174,
"step": 1255
},
{
"epoch": 0.7732353450431905,
"grad_norm": 1.5948874950408936,
"learning_rate": 1.2876144435054194e-06,
"loss": 0.3633,
"step": 1256
},
{
"epoch": 0.7738509782796898,
"grad_norm": 1.8157576322555542,
"learning_rate": 1.2809409976364017e-06,
"loss": 0.4034,
"step": 1257
},
{
"epoch": 0.7744666115161892,
"grad_norm": 1.7797636985778809,
"learning_rate": 1.2742823486951434e-06,
"loss": 0.3928,
"step": 1258
},
{
"epoch": 0.7750822447526886,
"grad_norm": 1.8237395286560059,
"learning_rate": 1.2676385231742493e-06,
"loss": 0.4087,
"step": 1259
},
{
"epoch": 0.775697877989188,
"grad_norm": 1.768092393875122,
"learning_rate": 1.2610095475073415e-06,
"loss": 0.3719,
"step": 1260
},
{
"epoch": 0.7763135112256873,
"grad_norm": 1.7261672019958496,
"learning_rate": 1.254395448068959e-06,
"loss": 0.3819,
"step": 1261
},
{
"epoch": 0.7769291444621866,
"grad_norm": 1.8468579053878784,
"learning_rate": 1.247796251174459e-06,
"loss": 0.4232,
"step": 1262
},
{
"epoch": 0.777544777698686,
"grad_norm": 1.6958353519439697,
"learning_rate": 1.2412119830798992e-06,
"loss": 0.3827,
"step": 1263
},
{
"epoch": 0.7781604109351854,
"grad_norm": 1.7623149156570435,
"learning_rate": 1.234642669981946e-06,
"loss": 0.4027,
"step": 1264
},
{
"epoch": 0.7787760441716847,
"grad_norm": 1.7944426536560059,
"learning_rate": 1.2280883380177593e-06,
"loss": 0.3931,
"step": 1265
},
{
"epoch": 0.779391677408184,
"grad_norm": 1.720826268196106,
"learning_rate": 1.2215490132649016e-06,
"loss": 0.4437,
"step": 1266
},
{
"epoch": 0.7800073106446834,
"grad_norm": 1.7686612606048584,
"learning_rate": 1.2150247217412186e-06,
"loss": 0.381,
"step": 1267
},
{
"epoch": 0.7806229438811828,
"grad_norm": 1.734942078590393,
"learning_rate": 1.2085154894047468e-06,
"loss": 0.413,
"step": 1268
},
{
"epoch": 0.7812385771176822,
"grad_norm": 1.618408203125,
"learning_rate": 1.2020213421536103e-06,
"loss": 0.3706,
"step": 1269
},
{
"epoch": 0.7818542103541815,
"grad_norm": 1.7088454961776733,
"learning_rate": 1.195542305825908e-06,
"loss": 0.3694,
"step": 1270
},
{
"epoch": 0.7824698435906808,
"grad_norm": 1.6714153289794922,
"learning_rate": 1.189078406199624e-06,
"loss": 0.4032,
"step": 1271
},
{
"epoch": 0.7830854768271802,
"grad_norm": 1.786453127861023,
"learning_rate": 1.1826296689925142e-06,
"loss": 0.3906,
"step": 1272
},
{
"epoch": 0.7837011100636796,
"grad_norm": 1.66269850730896,
"learning_rate": 1.1761961198620081e-06,
"loss": 0.3647,
"step": 1273
},
{
"epoch": 0.7843167433001789,
"grad_norm": 1.7173188924789429,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.3923,
"step": 1274
},
{
"epoch": 0.7849323765366782,
"grad_norm": 1.6236497163772583,
"learning_rate": 1.1633746881582902e-06,
"loss": 0.3736,
"step": 1275
},
{
"epoch": 0.7855480097731776,
"grad_norm": 1.8804194927215576,
"learning_rate": 1.1569868565973912e-06,
"loss": 0.4031,
"step": 1276
},
{
"epoch": 0.786163643009677,
"grad_norm": 1.5298935174942017,
"learning_rate": 1.1506143151375177e-06,
"loss": 0.3728,
"step": 1277
},
{
"epoch": 0.7867792762461764,
"grad_norm": 1.8781102895736694,
"learning_rate": 1.144257089132942e-06,
"loss": 0.393,
"step": 1278
},
{
"epoch": 0.7873949094826757,
"grad_norm": 1.7233707904815674,
"learning_rate": 1.137915203877003e-06,
"loss": 0.3832,
"step": 1279
},
{
"epoch": 0.788010542719175,
"grad_norm": 1.775540828704834,
"learning_rate": 1.1315886846020008e-06,
"loss": 0.3932,
"step": 1280
},
{
"epoch": 0.7886261759556744,
"grad_norm": 1.8409913778305054,
"learning_rate": 1.1252775564791023e-06,
"loss": 0.4121,
"step": 1281
},
{
"epoch": 0.7892418091921738,
"grad_norm": 1.746534824371338,
"learning_rate": 1.118981844618236e-06,
"loss": 0.387,
"step": 1282
},
{
"epoch": 0.7898574424286732,
"grad_norm": 1.7044163942337036,
"learning_rate": 1.1127015740679925e-06,
"loss": 0.3775,
"step": 1283
},
{
"epoch": 0.7904730756651724,
"grad_norm": 1.6011128425598145,
"learning_rate": 1.1064367698155303e-06,
"loss": 0.3737,
"step": 1284
},
{
"epoch": 0.7910887089016718,
"grad_norm": 1.5867910385131836,
"learning_rate": 1.1001874567864696e-06,
"loss": 0.3783,
"step": 1285
},
{
"epoch": 0.7917043421381712,
"grad_norm": 1.633167028427124,
"learning_rate": 1.0939536598447986e-06,
"loss": 0.3673,
"step": 1286
},
{
"epoch": 0.7923199753746706,
"grad_norm": 1.6005995273590088,
"learning_rate": 1.087735403792768e-06,
"loss": 0.3728,
"step": 1287
},
{
"epoch": 0.7929356086111699,
"grad_norm": 1.7536711692810059,
"learning_rate": 1.0815327133708015e-06,
"loss": 0.3928,
"step": 1288
},
{
"epoch": 0.7935512418476692,
"grad_norm": 1.7381296157836914,
"learning_rate": 1.0753456132573886e-06,
"loss": 0.3931,
"step": 1289
},
{
"epoch": 0.7941668750841686,
"grad_norm": 1.7327688932418823,
"learning_rate": 1.0691741280689894e-06,
"loss": 0.3898,
"step": 1290
},
{
"epoch": 0.794782508320668,
"grad_norm": 1.7023015022277832,
"learning_rate": 1.06301828235994e-06,
"loss": 0.4062,
"step": 1291
},
{
"epoch": 0.7953981415571674,
"grad_norm": 1.8886052370071411,
"learning_rate": 1.0568781006223528e-06,
"loss": 0.4037,
"step": 1292
},
{
"epoch": 0.7960137747936666,
"grad_norm": 1.603112816810608,
"learning_rate": 1.0507536072860141e-06,
"loss": 0.3525,
"step": 1293
},
{
"epoch": 0.796629408030166,
"grad_norm": 1.7494615316390991,
"learning_rate": 1.044644826718295e-06,
"loss": 0.4175,
"step": 1294
},
{
"epoch": 0.7972450412666654,
"grad_norm": 1.6311700344085693,
"learning_rate": 1.0385517832240472e-06,
"loss": 0.3842,
"step": 1295
},
{
"epoch": 0.7978606745031648,
"grad_norm": 1.7615406513214111,
"learning_rate": 1.0324745010455124e-06,
"loss": 0.3754,
"step": 1296
},
{
"epoch": 0.798476307739664,
"grad_norm": 1.6909494400024414,
"learning_rate": 1.0264130043622245e-06,
"loss": 0.3909,
"step": 1297
},
{
"epoch": 0.7990919409761634,
"grad_norm": 1.6405009031295776,
"learning_rate": 1.0203673172909068e-06,
"loss": 0.3877,
"step": 1298
},
{
"epoch": 0.7997075742126628,
"grad_norm": 1.6983451843261719,
"learning_rate": 1.0143374638853892e-06,
"loss": 0.3881,
"step": 1299
},
{
"epoch": 0.8003232074491622,
"grad_norm": 1.7770990133285522,
"learning_rate": 1.0083234681364934e-06,
"loss": 0.4142,
"step": 1300
},
{
"epoch": 0.8003232074491622,
"eval_loss": 0.3866761028766632,
"eval_runtime": 118.3992,
"eval_samples_per_second": 35.482,
"eval_steps_per_second": 4.443,
"step": 1300
},
{
"epoch": 0.8009388406856616,
"grad_norm": 1.518217921257019,
"learning_rate": 1.002325353971958e-06,
"loss": 0.3526,
"step": 1301
},
{
"epoch": 0.8015544739221608,
"grad_norm": 1.729003667831421,
"learning_rate": 9.963431452563331e-07,
"loss": 0.4206,
"step": 1302
},
{
"epoch": 0.8021701071586602,
"grad_norm": 1.719579815864563,
"learning_rate": 9.903768657908803e-07,
"loss": 0.3843,
"step": 1303
},
{
"epoch": 0.8027857403951596,
"grad_norm": 1.7568597793579102,
"learning_rate": 9.844265393134927e-07,
"loss": 0.3944,
"step": 1304
},
{
"epoch": 0.803401373631659,
"grad_norm": 1.6757253408432007,
"learning_rate": 9.784921894985799e-07,
"loss": 0.3816,
"step": 1305
},
{
"epoch": 0.8040170068681582,
"grad_norm": 1.6806457042694092,
"learning_rate": 9.725738399569968e-07,
"loss": 0.3851,
"step": 1306
},
{
"epoch": 0.8046326401046576,
"grad_norm": 1.6344943046569824,
"learning_rate": 9.666715142359334e-07,
"loss": 0.371,
"step": 1307
},
{
"epoch": 0.805248273341157,
"grad_norm": 1.7062585353851318,
"learning_rate": 9.607852358188247e-07,
"loss": 0.4154,
"step": 1308
},
{
"epoch": 0.8058639065776564,
"grad_norm": 1.660595417022705,
"learning_rate": 9.549150281252633e-07,
"loss": 0.388,
"step": 1309
},
{
"epoch": 0.8064795398141558,
"grad_norm": 1.7375619411468506,
"learning_rate": 9.490609145108976e-07,
"loss": 0.3945,
"step": 1310
},
{
"epoch": 0.807095173050655,
"grad_norm": 1.7544565200805664,
"learning_rate": 9.43222918267342e-07,
"loss": 0.4139,
"step": 1311
},
{
"epoch": 0.8077108062871544,
"grad_norm": 1.699823021888733,
"learning_rate": 9.374010626220908e-07,
"loss": 0.3667,
"step": 1312
},
{
"epoch": 0.8083264395236538,
"grad_norm": 1.675127387046814,
"learning_rate": 9.31595370738414e-07,
"loss": 0.3797,
"step": 1313
},
{
"epoch": 0.8089420727601532,
"grad_norm": 1.7964922189712524,
"learning_rate": 9.258058657152763e-07,
"loss": 0.4127,
"step": 1314
},
{
"epoch": 0.8095577059966524,
"grad_norm": 1.7282572984695435,
"learning_rate": 9.200325705872342e-07,
"loss": 0.3814,
"step": 1315
},
{
"epoch": 0.8101733392331518,
"grad_norm": 1.6478873491287231,
"learning_rate": 9.142755083243577e-07,
"loss": 0.3766,
"step": 1316
},
{
"epoch": 0.8107889724696512,
"grad_norm": 1.785675287246704,
"learning_rate": 9.085347018321255e-07,
"loss": 0.4374,
"step": 1317
},
{
"epoch": 0.8114046057061506,
"grad_norm": 1.7040303945541382,
"learning_rate": 9.028101739513406e-07,
"loss": 0.3802,
"step": 1318
},
{
"epoch": 0.81202023894265,
"grad_norm": 1.6498903036117554,
"learning_rate": 8.971019474580428e-07,
"loss": 0.3946,
"step": 1319
},
{
"epoch": 0.8126358721791492,
"grad_norm": 1.7775410413742065,
"learning_rate": 8.914100450634089e-07,
"loss": 0.3782,
"step": 1320
},
{
"epoch": 0.8132515054156486,
"grad_norm": 1.778093695640564,
"learning_rate": 8.857344894136715e-07,
"loss": 0.4015,
"step": 1321
},
{
"epoch": 0.813867138652148,
"grad_norm": 1.5908254384994507,
"learning_rate": 8.800753030900228e-07,
"loss": 0.3668,
"step": 1322
},
{
"epoch": 0.8144827718886474,
"grad_norm": 1.6788358688354492,
"learning_rate": 8.744325086085248e-07,
"loss": 0.3819,
"step": 1323
},
{
"epoch": 0.8150984051251466,
"grad_norm": 1.5763933658599854,
"learning_rate": 8.688061284200266e-07,
"loss": 0.3579,
"step": 1324
},
{
"epoch": 0.815714038361646,
"grad_norm": 1.725440263748169,
"learning_rate": 8.631961849100651e-07,
"loss": 0.371,
"step": 1325
},
{
"epoch": 0.8163296715981454,
"grad_norm": 1.601870059967041,
"learning_rate": 8.576027003987842e-07,
"loss": 0.3488,
"step": 1326
},
{
"epoch": 0.8169453048346448,
"grad_norm": 1.610034704208374,
"learning_rate": 8.520256971408453e-07,
"loss": 0.3793,
"step": 1327
},
{
"epoch": 0.8175609380711442,
"grad_norm": 1.7768540382385254,
"learning_rate": 8.464651973253269e-07,
"loss": 0.3988,
"step": 1328
},
{
"epoch": 0.8181765713076434,
"grad_norm": 1.7675001621246338,
"learning_rate": 8.409212230756564e-07,
"loss": 0.4025,
"step": 1329
},
{
"epoch": 0.8187922045441428,
"grad_norm": 1.7898603677749634,
"learning_rate": 8.353937964495029e-07,
"loss": 0.3879,
"step": 1330
},
{
"epoch": 0.8194078377806422,
"grad_norm": 1.6345689296722412,
"learning_rate": 8.298829394387032e-07,
"loss": 0.3738,
"step": 1331
},
{
"epoch": 0.8200234710171416,
"grad_norm": 1.5670359134674072,
"learning_rate": 8.243886739691703e-07,
"loss": 0.373,
"step": 1332
},
{
"epoch": 0.8206391042536408,
"grad_norm": 1.689761757850647,
"learning_rate": 8.189110219007967e-07,
"loss": 0.3823,
"step": 1333
},
{
"epoch": 0.8212547374901402,
"grad_norm": 1.6478036642074585,
"learning_rate": 8.134500050273841e-07,
"loss": 0.3727,
"step": 1334
},
{
"epoch": 0.8218703707266396,
"grad_norm": 1.914725422859192,
"learning_rate": 8.080056450765427e-07,
"loss": 0.4018,
"step": 1335
},
{
"epoch": 0.822486003963139,
"grad_norm": 1.8748279809951782,
"learning_rate": 8.025779637096138e-07,
"loss": 0.4253,
"step": 1336
},
{
"epoch": 0.8231016371996384,
"grad_norm": 1.621164321899414,
"learning_rate": 7.971669825215789e-07,
"loss": 0.3821,
"step": 1337
},
{
"epoch": 0.8237172704361376,
"grad_norm": 1.6918919086456299,
"learning_rate": 7.917727230409739e-07,
"loss": 0.3825,
"step": 1338
},
{
"epoch": 0.824332903672637,
"grad_norm": 1.6017346382141113,
"learning_rate": 7.863952067298042e-07,
"loss": 0.3639,
"step": 1339
},
{
"epoch": 0.8249485369091364,
"grad_norm": 1.6089705228805542,
"learning_rate": 7.810344549834625e-07,
"loss": 0.3662,
"step": 1340
},
{
"epoch": 0.8255641701456358,
"grad_norm": 1.766654372215271,
"learning_rate": 7.756904891306366e-07,
"loss": 0.3862,
"step": 1341
},
{
"epoch": 0.8261798033821351,
"grad_norm": 1.6718283891677856,
"learning_rate": 7.70363330433233e-07,
"loss": 0.3908,
"step": 1342
},
{
"epoch": 0.8267954366186344,
"grad_norm": 1.629204511642456,
"learning_rate": 7.650530000862849e-07,
"loss": 0.3617,
"step": 1343
},
{
"epoch": 0.8274110698551338,
"grad_norm": 1.6336185932159424,
"learning_rate": 7.597595192178702e-07,
"loss": 0.3991,
"step": 1344
},
{
"epoch": 0.8280267030916332,
"grad_norm": 1.7665934562683105,
"learning_rate": 7.544829088890326e-07,
"loss": 0.3823,
"step": 1345
},
{
"epoch": 0.8286423363281326,
"grad_norm": 1.7317899465560913,
"learning_rate": 7.492231900936886e-07,
"loss": 0.3805,
"step": 1346
},
{
"epoch": 0.8292579695646318,
"grad_norm": 1.8492108583450317,
"learning_rate": 7.439803837585524e-07,
"loss": 0.3662,
"step": 1347
},
{
"epoch": 0.8298736028011312,
"grad_norm": 1.671828269958496,
"learning_rate": 7.387545107430455e-07,
"loss": 0.3933,
"step": 1348
},
{
"epoch": 0.8304892360376306,
"grad_norm": 1.697264552116394,
"learning_rate": 7.33545591839222e-07,
"loss": 0.3553,
"step": 1349
},
{
"epoch": 0.83110486927413,
"grad_norm": 1.8597784042358398,
"learning_rate": 7.283536477716763e-07,
"loss": 0.402,
"step": 1350
},
{
"epoch": 0.8317205025106293,
"grad_norm": 1.7416291236877441,
"learning_rate": 7.23178699197467e-07,
"loss": 0.3691,
"step": 1351
},
{
"epoch": 0.8323361357471286,
"grad_norm": 1.6027207374572754,
"learning_rate": 7.180207667060352e-07,
"loss": 0.3671,
"step": 1352
},
{
"epoch": 0.832951768983628,
"grad_norm": 1.605884075164795,
"learning_rate": 7.12879870819117e-07,
"loss": 0.3578,
"step": 1353
},
{
"epoch": 0.8335674022201274,
"grad_norm": 1.5832176208496094,
"learning_rate": 7.077560319906696e-07,
"loss": 0.3545,
"step": 1354
},
{
"epoch": 0.8341830354566268,
"grad_norm": 1.7259970903396606,
"learning_rate": 7.026492706067823e-07,
"loss": 0.3544,
"step": 1355
},
{
"epoch": 0.834798668693126,
"grad_norm": 1.8300843238830566,
"learning_rate": 6.975596069855983e-07,
"loss": 0.3984,
"step": 1356
},
{
"epoch": 0.8354143019296254,
"grad_norm": 1.8081567287445068,
"learning_rate": 6.924870613772388e-07,
"loss": 0.395,
"step": 1357
},
{
"epoch": 0.8360299351661248,
"grad_norm": 1.7496634721755981,
"learning_rate": 6.874316539637127e-07,
"loss": 0.3751,
"step": 1358
},
{
"epoch": 0.8366455684026242,
"grad_norm": 1.7504856586456299,
"learning_rate": 6.82393404858846e-07,
"loss": 0.4058,
"step": 1359
},
{
"epoch": 0.8372612016391235,
"grad_norm": 1.6938621997833252,
"learning_rate": 6.773723341081945e-07,
"loss": 0.3958,
"step": 1360
},
{
"epoch": 0.8378768348756228,
"grad_norm": 1.6554718017578125,
"learning_rate": 6.723684616889664e-07,
"loss": 0.3718,
"step": 1361
},
{
"epoch": 0.8384924681121222,
"grad_norm": 1.7190313339233398,
"learning_rate": 6.673818075099475e-07,
"loss": 0.3915,
"step": 1362
},
{
"epoch": 0.8391081013486216,
"grad_norm": 1.721051812171936,
"learning_rate": 6.624123914114122e-07,
"loss": 0.3565,
"step": 1363
},
{
"epoch": 0.839723734585121,
"grad_norm": 1.6990845203399658,
"learning_rate": 6.574602331650559e-07,
"loss": 0.3871,
"step": 1364
},
{
"epoch": 0.8403393678216202,
"grad_norm": 1.6603196859359741,
"learning_rate": 6.52525352473905e-07,
"loss": 0.3791,
"step": 1365
},
{
"epoch": 0.8409550010581196,
"grad_norm": 1.9718493223190308,
"learning_rate": 6.476077689722487e-07,
"loss": 0.3968,
"step": 1366
},
{
"epoch": 0.841570634294619,
"grad_norm": 1.744043231010437,
"learning_rate": 6.427075022255547e-07,
"loss": 0.3897,
"step": 1367
},
{
"epoch": 0.8421862675311184,
"grad_norm": 1.7016851902008057,
"learning_rate": 6.378245717303899e-07,
"loss": 0.3967,
"step": 1368
},
{
"epoch": 0.8428019007676177,
"grad_norm": 1.7479567527770996,
"learning_rate": 6.329589969143518e-07,
"loss": 0.3832,
"step": 1369
},
{
"epoch": 0.843417534004117,
"grad_norm": 1.6406941413879395,
"learning_rate": 6.281107971359801e-07,
"loss": 0.3965,
"step": 1370
},
{
"epoch": 0.8440331672406164,
"grad_norm": 1.6212124824523926,
"learning_rate": 6.232799916846888e-07,
"loss": 0.3874,
"step": 1371
},
{
"epoch": 0.8446488004771158,
"grad_norm": 1.7557140588760376,
"learning_rate": 6.184665997806832e-07,
"loss": 0.3894,
"step": 1372
},
{
"epoch": 0.8452644337136151,
"grad_norm": 1.7088721990585327,
"learning_rate": 6.136706405748838e-07,
"loss": 0.3955,
"step": 1373
},
{
"epoch": 0.8458800669501144,
"grad_norm": 1.8180969953536987,
"learning_rate": 6.088921331488568e-07,
"loss": 0.3946,
"step": 1374
},
{
"epoch": 0.8464957001866138,
"grad_norm": 1.5999853610992432,
"learning_rate": 6.041310965147318e-07,
"loss": 0.3843,
"step": 1375
},
{
"epoch": 0.8471113334231132,
"grad_norm": 1.669258952140808,
"learning_rate": 5.993875496151253e-07,
"loss": 0.3811,
"step": 1376
},
{
"epoch": 0.8477269666596126,
"grad_norm": 1.7172578573226929,
"learning_rate": 5.94661511323072e-07,
"loss": 0.4119,
"step": 1377
},
{
"epoch": 0.8483425998961119,
"grad_norm": 1.7204616069793701,
"learning_rate": 5.899530004419396e-07,
"loss": 0.4112,
"step": 1378
},
{
"epoch": 0.8489582331326112,
"grad_norm": 1.6229082345962524,
"learning_rate": 5.852620357053651e-07,
"loss": 0.3582,
"step": 1379
},
{
"epoch": 0.8495738663691106,
"grad_norm": 1.6449095010757446,
"learning_rate": 5.80588635777175e-07,
"loss": 0.3719,
"step": 1380
},
{
"epoch": 0.85018949960561,
"grad_norm": 1.5927188396453857,
"learning_rate": 5.759328192513075e-07,
"loss": 0.3786,
"step": 1381
},
{
"epoch": 0.8508051328421093,
"grad_norm": 1.69962477684021,
"learning_rate": 5.71294604651747e-07,
"loss": 0.3991,
"step": 1382
},
{
"epoch": 0.8514207660786086,
"grad_norm": 1.7231343984603882,
"learning_rate": 5.666740104324392e-07,
"loss": 0.4029,
"step": 1383
},
{
"epoch": 0.852036399315108,
"grad_norm": 1.570091724395752,
"learning_rate": 5.620710549772295e-07,
"loss": 0.3688,
"step": 1384
},
{
"epoch": 0.8526520325516074,
"grad_norm": 1.7892401218414307,
"learning_rate": 5.574857565997838e-07,
"loss": 0.4016,
"step": 1385
},
{
"epoch": 0.8532676657881068,
"grad_norm": 1.714543342590332,
"learning_rate": 5.529181335435124e-07,
"loss": 0.3883,
"step": 1386
},
{
"epoch": 0.8538832990246061,
"grad_norm": 1.5173468589782715,
"learning_rate": 5.483682039815059e-07,
"loss": 0.3706,
"step": 1387
},
{
"epoch": 0.8544989322611054,
"grad_norm": 1.5355753898620605,
"learning_rate": 5.438359860164555e-07,
"loss": 0.3557,
"step": 1388
},
{
"epoch": 0.8551145654976048,
"grad_norm": 1.6694539785385132,
"learning_rate": 5.393214976805833e-07,
"loss": 0.3878,
"step": 1389
},
{
"epoch": 0.8557301987341042,
"grad_norm": 1.6121852397918701,
"learning_rate": 5.348247569355736e-07,
"loss": 0.3804,
"step": 1390
},
{
"epoch": 0.8563458319706035,
"grad_norm": 1.6395108699798584,
"learning_rate": 5.303457816724955e-07,
"loss": 0.3602,
"step": 1391
},
{
"epoch": 0.8569614652071029,
"grad_norm": 1.6895431280136108,
"learning_rate": 5.258845897117387e-07,
"loss": 0.3834,
"step": 1392
},
{
"epoch": 0.8575770984436022,
"grad_norm": 1.9071327447891235,
"learning_rate": 5.214411988029355e-07,
"loss": 0.3926,
"step": 1393
},
{
"epoch": 0.8581927316801016,
"grad_norm": 1.610316514968872,
"learning_rate": 5.17015626624896e-07,
"loss": 0.3709,
"step": 1394
},
{
"epoch": 0.858808364916601,
"grad_norm": 1.5735969543457031,
"learning_rate": 5.126078907855342e-07,
"loss": 0.3613,
"step": 1395
},
{
"epoch": 0.8594239981531003,
"grad_norm": 1.9769666194915771,
"learning_rate": 5.082180088217981e-07,
"loss": 0.4104,
"step": 1396
},
{
"epoch": 0.8600396313895996,
"grad_norm": 1.7611583471298218,
"learning_rate": 5.038459981996036e-07,
"loss": 0.3717,
"step": 1397
},
{
"epoch": 0.860655264626099,
"grad_norm": 1.6172531843185425,
"learning_rate": 4.994918763137596e-07,
"loss": 0.3812,
"step": 1398
},
{
"epoch": 0.8612708978625984,
"grad_norm": 1.7320444583892822,
"learning_rate": 4.951556604879049e-07,
"loss": 0.3781,
"step": 1399
},
{
"epoch": 0.8618865310990977,
"grad_norm": 1.697505235671997,
"learning_rate": 4.908373679744316e-07,
"loss": 0.3847,
"step": 1400
},
{
"epoch": 0.8618865310990977,
"eval_loss": 0.3799481987953186,
"eval_runtime": 117.871,
"eval_samples_per_second": 35.641,
"eval_steps_per_second": 4.463,
"step": 1400
},
{
"epoch": 0.8625021643355971,
"grad_norm": 1.6632241010665894,
"learning_rate": 4.865370159544236e-07,
"loss": 0.384,
"step": 1401
},
{
"epoch": 0.8631177975720964,
"grad_norm": 1.6136724948883057,
"learning_rate": 4.822546215375851e-07,
"loss": 0.3654,
"step": 1402
},
{
"epoch": 0.8637334308085958,
"grad_norm": 1.6867445707321167,
"learning_rate": 4.779902017621718e-07,
"loss": 0.3938,
"step": 1403
},
{
"epoch": 0.8643490640450952,
"grad_norm": 1.618415117263794,
"learning_rate": 4.737437735949263e-07,
"loss": 0.3642,
"step": 1404
},
{
"epoch": 0.8649646972815945,
"grad_norm": 1.6127417087554932,
"learning_rate": 4.6951535393100654e-07,
"loss": 0.3645,
"step": 1405
},
{
"epoch": 0.8655803305180938,
"grad_norm": 1.56674063205719,
"learning_rate": 4.653049595939191e-07,
"loss": 0.3645,
"step": 1406
},
{
"epoch": 0.8661959637545932,
"grad_norm": 1.5660558938980103,
"learning_rate": 4.6111260733545714e-07,
"loss": 0.3577,
"step": 1407
},
{
"epoch": 0.8668115969910926,
"grad_norm": 1.7765249013900757,
"learning_rate": 4.569383138356276e-07,
"loss": 0.3888,
"step": 1408
},
{
"epoch": 0.8674272302275919,
"grad_norm": 1.646727442741394,
"learning_rate": 4.5278209570258914e-07,
"loss": 0.3965,
"step": 1409
},
{
"epoch": 0.8680428634640913,
"grad_norm": 1.8428785800933838,
"learning_rate": 4.486439694725858e-07,
"loss": 0.4066,
"step": 1410
},
{
"epoch": 0.8686584967005906,
"grad_norm": 1.605111837387085,
"learning_rate": 4.4452395160987314e-07,
"loss": 0.3915,
"step": 1411
},
{
"epoch": 0.86927412993709,
"grad_norm": 1.6525288820266724,
"learning_rate": 4.404220585066671e-07,
"loss": 0.3705,
"step": 1412
},
{
"epoch": 0.8698897631735893,
"grad_norm": 1.6098047494888306,
"learning_rate": 4.3633830648306675e-07,
"loss": 0.3834,
"step": 1413
},
{
"epoch": 0.8705053964100887,
"grad_norm": 1.6825004816055298,
"learning_rate": 4.322727117869951e-07,
"loss": 0.3743,
"step": 1414
},
{
"epoch": 0.871121029646588,
"grad_norm": 1.6571918725967407,
"learning_rate": 4.282252905941342e-07,
"loss": 0.3528,
"step": 1415
},
{
"epoch": 0.8717366628830874,
"grad_norm": 1.560193419456482,
"learning_rate": 4.2419605900785755e-07,
"loss": 0.3714,
"step": 1416
},
{
"epoch": 0.8723522961195868,
"grad_norm": 1.694968819618225,
"learning_rate": 4.201850330591678e-07,
"loss": 0.4058,
"step": 1417
},
{
"epoch": 0.8729679293560861,
"grad_norm": 1.742849588394165,
"learning_rate": 4.16192228706635e-07,
"loss": 0.4102,
"step": 1418
},
{
"epoch": 0.8735835625925855,
"grad_norm": 1.7649222612380981,
"learning_rate": 4.122176618363305e-07,
"loss": 0.3816,
"step": 1419
},
{
"epoch": 0.8741991958290848,
"grad_norm": 1.692421555519104,
"learning_rate": 4.082613482617664e-07,
"loss": 0.3759,
"step": 1420
},
{
"epoch": 0.8748148290655842,
"grad_norm": 1.821478247642517,
"learning_rate": 4.043233037238281e-07,
"loss": 0.3886,
"step": 1421
},
{
"epoch": 0.8754304623020835,
"grad_norm": 1.5918692350387573,
"learning_rate": 4.0040354389071613e-07,
"loss": 0.3695,
"step": 1422
},
{
"epoch": 0.8760460955385829,
"grad_norm": 1.6844005584716797,
"learning_rate": 3.965020843578804e-07,
"loss": 0.3911,
"step": 1423
},
{
"epoch": 0.8766617287750822,
"grad_norm": 1.5743554830551147,
"learning_rate": 3.9261894064796136e-07,
"loss": 0.3598,
"step": 1424
},
{
"epoch": 0.8772773620115816,
"grad_norm": 1.627099633216858,
"learning_rate": 3.8875412821072875e-07,
"loss": 0.3787,
"step": 1425
},
{
"epoch": 0.877892995248081,
"grad_norm": 1.6978716850280762,
"learning_rate": 3.8490766242301356e-07,
"loss": 0.3838,
"step": 1426
},
{
"epoch": 0.8785086284845803,
"grad_norm": 1.640041470527649,
"learning_rate": 3.810795585886551e-07,
"loss": 0.3756,
"step": 1427
},
{
"epoch": 0.8791242617210797,
"grad_norm": 1.7754591703414917,
"learning_rate": 3.772698319384349e-07,
"loss": 0.4062,
"step": 1428
},
{
"epoch": 0.879739894957579,
"grad_norm": 1.700578212738037,
"learning_rate": 3.734784976300165e-07,
"loss": 0.3834,
"step": 1429
},
{
"epoch": 0.8803555281940784,
"grad_norm": 1.6692880392074585,
"learning_rate": 3.6970557074788913e-07,
"loss": 0.3648,
"step": 1430
},
{
"epoch": 0.8809711614305777,
"grad_norm": 1.7037373781204224,
"learning_rate": 3.6595106630330277e-07,
"loss": 0.3955,
"step": 1431
},
{
"epoch": 0.8815867946670771,
"grad_norm": 1.6975781917572021,
"learning_rate": 3.6221499923421164e-07,
"loss": 0.3978,
"step": 1432
},
{
"epoch": 0.8822024279035764,
"grad_norm": 1.6766349077224731,
"learning_rate": 3.5849738440521254e-07,
"loss": 0.3781,
"step": 1433
},
{
"epoch": 0.8828180611400758,
"grad_norm": 1.649646282196045,
"learning_rate": 3.5479823660748703e-07,
"loss": 0.385,
"step": 1434
},
{
"epoch": 0.8834336943765752,
"grad_norm": 1.8312703371047974,
"learning_rate": 3.511175705587433e-07,
"loss": 0.3806,
"step": 1435
},
{
"epoch": 0.8840493276130745,
"grad_norm": 1.7184886932373047,
"learning_rate": 3.4745540090315556e-07,
"loss": 0.4092,
"step": 1436
},
{
"epoch": 0.8846649608495739,
"grad_norm": 1.6493487358093262,
"learning_rate": 3.4381174221130796e-07,
"loss": 0.3742,
"step": 1437
},
{
"epoch": 0.8852805940860732,
"grad_norm": 1.7439310550689697,
"learning_rate": 3.4018660898013423e-07,
"loss": 0.4147,
"step": 1438
},
{
"epoch": 0.8858962273225726,
"grad_norm": 1.649951696395874,
"learning_rate": 3.365800156328619e-07,
"loss": 0.3849,
"step": 1439
},
{
"epoch": 0.8865118605590719,
"grad_norm": 1.6006723642349243,
"learning_rate": 3.329919765189554e-07,
"loss": 0.3673,
"step": 1440
},
{
"epoch": 0.8871274937955713,
"grad_norm": 1.630724549293518,
"learning_rate": 3.2942250591405546e-07,
"loss": 0.3806,
"step": 1441
},
{
"epoch": 0.8877431270320706,
"grad_norm": 1.649955153465271,
"learning_rate": 3.258716180199278e-07,
"loss": 0.3725,
"step": 1442
},
{
"epoch": 0.88835876026857,
"grad_norm": 1.5823496580123901,
"learning_rate": 3.2233932696440096e-07,
"loss": 0.3574,
"step": 1443
},
{
"epoch": 0.8889743935050693,
"grad_norm": 1.649905800819397,
"learning_rate": 3.18825646801314e-07,
"loss": 0.3625,
"step": 1444
},
{
"epoch": 0.8895900267415687,
"grad_norm": 1.7206919193267822,
"learning_rate": 3.153305915104593e-07,
"loss": 0.3922,
"step": 1445
},
{
"epoch": 0.8902056599780681,
"grad_norm": 1.6427253484725952,
"learning_rate": 3.118541749975257e-07,
"loss": 0.3815,
"step": 1446
},
{
"epoch": 0.8908212932145674,
"grad_norm": 1.593934416770935,
"learning_rate": 3.0839641109404627e-07,
"loss": 0.37,
"step": 1447
},
{
"epoch": 0.8914369264510668,
"grad_norm": 1.6241251230239868,
"learning_rate": 3.0495731355733915e-07,
"loss": 0.4025,
"step": 1448
},
{
"epoch": 0.8920525596875661,
"grad_norm": 1.6705073118209839,
"learning_rate": 3.015368960704584e-07,
"loss": 0.374,
"step": 1449
},
{
"epoch": 0.8926681929240655,
"grad_norm": 1.6619371175765991,
"learning_rate": 2.9813517224213274e-07,
"loss": 0.3668,
"step": 1450
},
{
"epoch": 0.8932838261605649,
"grad_norm": 1.7148122787475586,
"learning_rate": 2.947521556067162e-07,
"loss": 0.3972,
"step": 1451
},
{
"epoch": 0.8938994593970642,
"grad_norm": 1.7702488899230957,
"learning_rate": 2.913878596241343e-07,
"loss": 0.3788,
"step": 1452
},
{
"epoch": 0.8945150926335635,
"grad_norm": 1.6259140968322754,
"learning_rate": 2.8804229767982637e-07,
"loss": 0.3687,
"step": 1453
},
{
"epoch": 0.8951307258700629,
"grad_norm": 1.8274110555648804,
"learning_rate": 2.847154830846971e-07,
"loss": 0.417,
"step": 1454
},
{
"epoch": 0.8957463591065623,
"grad_norm": 1.8136776685714722,
"learning_rate": 2.8140742907506403e-07,
"loss": 0.3772,
"step": 1455
},
{
"epoch": 0.8963619923430616,
"grad_norm": 1.7630428075790405,
"learning_rate": 2.7811814881259503e-07,
"loss": 0.3994,
"step": 1456
},
{
"epoch": 0.896977625579561,
"grad_norm": 1.6697845458984375,
"learning_rate": 2.748476553842711e-07,
"loss": 0.3783,
"step": 1457
},
{
"epoch": 0.8975932588160603,
"grad_norm": 1.7866156101226807,
"learning_rate": 2.715959618023212e-07,
"loss": 0.3926,
"step": 1458
},
{
"epoch": 0.8982088920525597,
"grad_norm": 1.6477011442184448,
"learning_rate": 2.6836308100417874e-07,
"loss": 0.3921,
"step": 1459
},
{
"epoch": 0.8988245252890591,
"grad_norm": 1.6541386842727661,
"learning_rate": 2.651490258524281e-07,
"loss": 0.3612,
"step": 1460
},
{
"epoch": 0.8994401585255584,
"grad_norm": 1.728593349456787,
"learning_rate": 2.619538091347473e-07,
"loss": 0.3688,
"step": 1461
},
{
"epoch": 0.9000557917620577,
"grad_norm": 1.6868138313293457,
"learning_rate": 2.587774435638679e-07,
"loss": 0.3673,
"step": 1462
},
{
"epoch": 0.9006714249985571,
"grad_norm": 1.7073603868484497,
"learning_rate": 2.556199417775174e-07,
"loss": 0.3852,
"step": 1463
},
{
"epoch": 0.9012870582350565,
"grad_norm": 1.7586218118667603,
"learning_rate": 2.524813163383683e-07,
"loss": 0.4101,
"step": 1464
},
{
"epoch": 0.9019026914715558,
"grad_norm": 1.718984603881836,
"learning_rate": 2.4936157973399266e-07,
"loss": 0.3901,
"step": 1465
},
{
"epoch": 0.9025183247080552,
"grad_norm": 1.692847728729248,
"learning_rate": 2.4626074437680836e-07,
"loss": 0.3694,
"step": 1466
},
{
"epoch": 0.9031339579445545,
"grad_norm": 1.7062618732452393,
"learning_rate": 2.431788226040327e-07,
"loss": 0.3863,
"step": 1467
},
{
"epoch": 0.9037495911810539,
"grad_norm": 1.7544633150100708,
"learning_rate": 2.40115826677631e-07,
"loss": 0.389,
"step": 1468
},
{
"epoch": 0.9043652244175533,
"grad_norm": 1.732604742050171,
"learning_rate": 2.3707176878426886e-07,
"loss": 0.4039,
"step": 1469
},
{
"epoch": 0.9049808576540526,
"grad_norm": 1.642321228981018,
"learning_rate": 2.3404666103526542e-07,
"loss": 0.3708,
"step": 1470
},
{
"epoch": 0.9055964908905519,
"grad_norm": 1.707271933555603,
"learning_rate": 2.3104051546654016e-07,
"loss": 0.383,
"step": 1471
},
{
"epoch": 0.9062121241270513,
"grad_norm": 1.650363802909851,
"learning_rate": 2.280533440385696e-07,
"loss": 0.3985,
"step": 1472
},
{
"epoch": 0.9068277573635507,
"grad_norm": 1.6128193140029907,
"learning_rate": 2.2508515863634062e-07,
"loss": 0.3699,
"step": 1473
},
{
"epoch": 0.90744339060005,
"grad_norm": 1.687888503074646,
"learning_rate": 2.2213597106929608e-07,
"loss": 0.3824,
"step": 1474
},
{
"epoch": 0.9080590238365494,
"grad_norm": 1.7968963384628296,
"learning_rate": 2.1920579307129818e-07,
"loss": 0.3833,
"step": 1475
},
{
"epoch": 0.9086746570730487,
"grad_norm": 1.6406185626983643,
"learning_rate": 2.1629463630057136e-07,
"loss": 0.3858,
"step": 1476
},
{
"epoch": 0.9092902903095481,
"grad_norm": 1.5874775648117065,
"learning_rate": 2.134025123396638e-07,
"loss": 0.3473,
"step": 1477
},
{
"epoch": 0.9099059235460475,
"grad_norm": 1.7503160238265991,
"learning_rate": 2.1052943269539716e-07,
"loss": 0.4037,
"step": 1478
},
{
"epoch": 0.9105215567825468,
"grad_norm": 1.679175615310669,
"learning_rate": 2.0767540879882143e-07,
"loss": 0.3891,
"step": 1479
},
{
"epoch": 0.9111371900190461,
"grad_norm": 1.7528096437454224,
"learning_rate": 2.0484045200517222e-07,
"loss": 0.382,
"step": 1480
},
{
"epoch": 0.9117528232555455,
"grad_norm": 1.7357664108276367,
"learning_rate": 2.0202457359381978e-07,
"loss": 0.3854,
"step": 1481
},
{
"epoch": 0.9123684564920449,
"grad_norm": 1.7275396585464478,
"learning_rate": 1.9922778476823167e-07,
"loss": 0.3966,
"step": 1482
},
{
"epoch": 0.9129840897285442,
"grad_norm": 1.6157152652740479,
"learning_rate": 1.9645009665592073e-07,
"loss": 0.3657,
"step": 1483
},
{
"epoch": 0.9135997229650435,
"grad_norm": 1.7594859600067139,
"learning_rate": 1.9369152030840553e-07,
"loss": 0.4016,
"step": 1484
},
{
"epoch": 0.9142153562015429,
"grad_norm": 1.7511507272720337,
"learning_rate": 1.9095206670116718e-07,
"loss": 0.4145,
"step": 1485
},
{
"epoch": 0.9148309894380423,
"grad_norm": 1.7000651359558105,
"learning_rate": 1.882317467335998e-07,
"loss": 0.3958,
"step": 1486
},
{
"epoch": 0.9154466226745417,
"grad_norm": 1.7108042240142822,
"learning_rate": 1.85530571228974e-07,
"loss": 0.3891,
"step": 1487
},
{
"epoch": 0.916062255911041,
"grad_norm": 1.7302545309066772,
"learning_rate": 1.8284855093438969e-07,
"loss": 0.3691,
"step": 1488
},
{
"epoch": 0.9166778891475403,
"grad_norm": 1.855621576309204,
"learning_rate": 1.801856965207338e-07,
"loss": 0.3857,
"step": 1489
},
{
"epoch": 0.9172935223840397,
"grad_norm": 1.662436842918396,
"learning_rate": 1.7754201858263987e-07,
"loss": 0.368,
"step": 1490
},
{
"epoch": 0.9179091556205391,
"grad_norm": 1.6870900392532349,
"learning_rate": 1.7491752763844294e-07,
"loss": 0.3869,
"step": 1491
},
{
"epoch": 0.9185247888570384,
"grad_norm": 1.6385629177093506,
"learning_rate": 1.7231223413014086e-07,
"loss": 0.372,
"step": 1492
},
{
"epoch": 0.9191404220935377,
"grad_norm": 1.694305419921875,
"learning_rate": 1.697261484233492e-07,
"loss": 0.389,
"step": 1493
},
{
"epoch": 0.9197560553300371,
"grad_norm": 1.7155510187149048,
"learning_rate": 1.6715928080726417e-07,
"loss": 0.3931,
"step": 1494
},
{
"epoch": 0.9203716885665365,
"grad_norm": 1.7123414278030396,
"learning_rate": 1.6461164149461805e-07,
"loss": 0.3886,
"step": 1495
},
{
"epoch": 0.9209873218030359,
"grad_norm": 1.6566325426101685,
"learning_rate": 1.6208324062163884e-07,
"loss": 0.3807,
"step": 1496
},
{
"epoch": 0.9216029550395352,
"grad_norm": 1.7428280115127563,
"learning_rate": 1.5957408824801347e-07,
"loss": 0.3841,
"step": 1497
},
{
"epoch": 0.9222185882760345,
"grad_norm": 1.6075456142425537,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.3639,
"step": 1498
},
{
"epoch": 0.9228342215125339,
"grad_norm": 1.6050225496292114,
"learning_rate": 1.5461356885461077e-07,
"loss": 0.3555,
"step": 1499
},
{
"epoch": 0.9234498547490333,
"grad_norm": 1.565177083015442,
"learning_rate": 1.5216222157112826e-07,
"loss": 0.3785,
"step": 1500
},
{
"epoch": 0.9234498547490333,
"eval_loss": 0.37673336267471313,
"eval_runtime": 118.7596,
"eval_samples_per_second": 35.374,
"eval_steps_per_second": 4.429,
"step": 1500
},
{
"epoch": 0.9240654879855327,
"grad_norm": 1.634409785270691,
"learning_rate": 1.4973016225951097e-07,
"loss": 0.3773,
"step": 1501
},
{
"epoch": 0.9246811212220319,
"grad_norm": 1.7125306129455566,
"learning_rate": 1.4731740059613365e-07,
"loss": 0.3803,
"step": 1502
},
{
"epoch": 0.9252967544585313,
"grad_norm": 1.8356815576553345,
"learning_rate": 1.4492394618059234e-07,
"loss": 0.3966,
"step": 1503
},
{
"epoch": 0.9259123876950307,
"grad_norm": 1.6698647737503052,
"learning_rate": 1.4254980853566248e-07,
"loss": 0.362,
"step": 1504
},
{
"epoch": 0.9265280209315301,
"grad_norm": 1.796517014503479,
"learning_rate": 1.4019499710726913e-07,
"loss": 0.4043,
"step": 1505
},
{
"epoch": 0.9271436541680294,
"grad_norm": 1.8080421686172485,
"learning_rate": 1.3785952126444014e-07,
"loss": 0.3847,
"step": 1506
},
{
"epoch": 0.9277592874045287,
"grad_norm": 1.6595298051834106,
"learning_rate": 1.3554339029927532e-07,
"loss": 0.3827,
"step": 1507
},
{
"epoch": 0.9283749206410281,
"grad_norm": 1.7067153453826904,
"learning_rate": 1.3324661342690892e-07,
"loss": 0.3979,
"step": 1508
},
{
"epoch": 0.9289905538775275,
"grad_norm": 1.6506260633468628,
"learning_rate": 1.3096919978546842e-07,
"loss": 0.355,
"step": 1509
},
{
"epoch": 0.9296061871140269,
"grad_norm": 1.6815375089645386,
"learning_rate": 1.2871115843604508e-07,
"loss": 0.3635,
"step": 1510
},
{
"epoch": 0.9302218203505261,
"grad_norm": 1.6872223615646362,
"learning_rate": 1.264724983626492e-07,
"loss": 0.3799,
"step": 1511
},
{
"epoch": 0.9308374535870255,
"grad_norm": 1.5445810556411743,
"learning_rate": 1.2425322847218368e-07,
"loss": 0.3644,
"step": 1512
},
{
"epoch": 0.9314530868235249,
"grad_norm": 1.8322566747665405,
"learning_rate": 1.220533575944033e-07,
"loss": 0.4074,
"step": 1513
},
{
"epoch": 0.9320687200600243,
"grad_norm": 1.8397573232650757,
"learning_rate": 1.1987289448187777e-07,
"loss": 0.4239,
"step": 1514
},
{
"epoch": 0.9326843532965235,
"grad_norm": 1.6941214799880981,
"learning_rate": 1.1771184780996315e-07,
"loss": 0.3807,
"step": 1515
},
{
"epoch": 0.9332999865330229,
"grad_norm": 1.8412351608276367,
"learning_rate": 1.1557022617676217e-07,
"loss": 0.3952,
"step": 1516
},
{
"epoch": 0.9339156197695223,
"grad_norm": 1.6536684036254883,
"learning_rate": 1.1344803810309001e-07,
"loss": 0.3542,
"step": 1517
},
{
"epoch": 0.9345312530060217,
"grad_norm": 1.670566439628601,
"learning_rate": 1.1134529203244592e-07,
"loss": 0.3879,
"step": 1518
},
{
"epoch": 0.9351468862425211,
"grad_norm": 1.7458523511886597,
"learning_rate": 1.0926199633097156e-07,
"loss": 0.3742,
"step": 1519
},
{
"epoch": 0.9357625194790203,
"grad_norm": 1.7485136985778809,
"learning_rate": 1.071981592874255e-07,
"loss": 0.3796,
"step": 1520
},
{
"epoch": 0.9363781527155197,
"grad_norm": 1.7157516479492188,
"learning_rate": 1.0515378911314378e-07,
"loss": 0.3915,
"step": 1521
},
{
"epoch": 0.9369937859520191,
"grad_norm": 1.6509552001953125,
"learning_rate": 1.031288939420122e-07,
"loss": 0.3931,
"step": 1522
},
{
"epoch": 0.9376094191885185,
"grad_norm": 1.7180267572402954,
"learning_rate": 1.011234818304302e-07,
"loss": 0.403,
"step": 1523
},
{
"epoch": 0.9382250524250177,
"grad_norm": 1.630218744277954,
"learning_rate": 9.913756075728088e-08,
"loss": 0.3504,
"step": 1524
},
{
"epoch": 0.9388406856615171,
"grad_norm": 1.607527732849121,
"learning_rate": 9.717113862389993e-08,
"loss": 0.3572,
"step": 1525
},
{
"epoch": 0.9394563188980165,
"grad_norm": 1.72373628616333,
"learning_rate": 9.522422325404234e-08,
"loss": 0.3881,
"step": 1526
},
{
"epoch": 0.9400719521345159,
"grad_norm": 1.6644386053085327,
"learning_rate": 9.32968223938513e-08,
"loss": 0.3824,
"step": 1527
},
{
"epoch": 0.9406875853710153,
"grad_norm": 1.7010245323181152,
"learning_rate": 9.138894371182983e-08,
"loss": 0.3697,
"step": 1528
},
{
"epoch": 0.9413032186075145,
"grad_norm": 1.7236442565917969,
"learning_rate": 8.950059479880591e-08,
"loss": 0.398,
"step": 1529
},
{
"epoch": 0.9419188518440139,
"grad_norm": 1.6048997640609741,
"learning_rate": 8.7631783167908e-08,
"loss": 0.3533,
"step": 1530
},
{
"epoch": 0.9425344850805133,
"grad_norm": 1.529986023902893,
"learning_rate": 8.57825162545295e-08,
"loss": 0.3575,
"step": 1531
},
{
"epoch": 0.9431501183170127,
"grad_norm": 1.6632636785507202,
"learning_rate": 8.395280141630324e-08,
"loss": 0.3639,
"step": 1532
},
{
"epoch": 0.943765751553512,
"grad_norm": 1.7666611671447754,
"learning_rate": 8.214264593307097e-08,
"loss": 0.4111,
"step": 1533
},
{
"epoch": 0.9443813847900113,
"grad_norm": 1.7613328695297241,
"learning_rate": 8.035205700685167e-08,
"loss": 0.3867,
"step": 1534
},
{
"epoch": 0.9449970180265107,
"grad_norm": 1.7610583305358887,
"learning_rate": 7.85810417618188e-08,
"loss": 0.3847,
"step": 1535
},
{
"epoch": 0.9456126512630101,
"grad_norm": 1.7215214967727661,
"learning_rate": 7.682960724426592e-08,
"loss": 0.3978,
"step": 1536
},
{
"epoch": 0.9462282844995095,
"grad_norm": 1.7222514152526855,
"learning_rate": 7.509776042258166e-08,
"loss": 0.3871,
"step": 1537
},
{
"epoch": 0.9468439177360087,
"grad_norm": 1.7724417448043823,
"learning_rate": 7.338550818722367e-08,
"loss": 0.3539,
"step": 1538
},
{
"epoch": 0.9474595509725081,
"grad_norm": 1.6608505249023438,
"learning_rate": 7.169285735068531e-08,
"loss": 0.3768,
"step": 1539
},
{
"epoch": 0.9480751842090075,
"grad_norm": 1.6848286390304565,
"learning_rate": 7.001981464747565e-08,
"loss": 0.3879,
"step": 1540
},
{
"epoch": 0.9486908174455069,
"grad_norm": 1.7346253395080566,
"learning_rate": 6.83663867340878e-08,
"loss": 0.3887,
"step": 1541
},
{
"epoch": 0.9493064506820061,
"grad_norm": 1.8061364889144897,
"learning_rate": 6.673258018897455e-08,
"loss": 0.3684,
"step": 1542
},
{
"epoch": 0.9499220839185055,
"grad_norm": 1.6204763650894165,
"learning_rate": 6.511840151252169e-08,
"loss": 0.3895,
"step": 1543
},
{
"epoch": 0.9505377171550049,
"grad_norm": 1.61834716796875,
"learning_rate": 6.352385712702191e-08,
"loss": 0.3594,
"step": 1544
},
{
"epoch": 0.9511533503915043,
"grad_norm": 1.6177845001220703,
"learning_rate": 6.194895337664875e-08,
"loss": 0.3703,
"step": 1545
},
{
"epoch": 0.9517689836280037,
"grad_norm": 1.6411689519882202,
"learning_rate": 6.039369652743266e-08,
"loss": 0.3706,
"step": 1546
},
{
"epoch": 0.9523846168645029,
"grad_norm": 1.7305967807769775,
"learning_rate": 5.8858092767236084e-08,
"loss": 0.3757,
"step": 1547
},
{
"epoch": 0.9530002501010023,
"grad_norm": 1.7725684642791748,
"learning_rate": 5.734214820572737e-08,
"loss": 0.401,
"step": 1548
},
{
"epoch": 0.9536158833375017,
"grad_norm": 1.592667818069458,
"learning_rate": 5.584586887435739e-08,
"loss": 0.3622,
"step": 1549
},
{
"epoch": 0.9542315165740011,
"grad_norm": 1.591291904449463,
"learning_rate": 5.436926072633575e-08,
"loss": 0.367,
"step": 1550
},
{
"epoch": 0.9548471498105003,
"grad_norm": 1.700524091720581,
"learning_rate": 5.291232963660686e-08,
"loss": 0.3915,
"step": 1551
},
{
"epoch": 0.9554627830469997,
"grad_norm": 1.600724220275879,
"learning_rate": 5.1475081401825553e-08,
"loss": 0.3611,
"step": 1552
},
{
"epoch": 0.9560784162834991,
"grad_norm": 1.7572377920150757,
"learning_rate": 5.0057521740336515e-08,
"loss": 0.378,
"step": 1553
},
{
"epoch": 0.9566940495199985,
"grad_norm": 1.6119253635406494,
"learning_rate": 4.865965629214819e-08,
"loss": 0.3671,
"step": 1554
},
{
"epoch": 0.9573096827564979,
"grad_norm": 1.6394660472869873,
"learning_rate": 4.7281490618914516e-08,
"loss": 0.3738,
"step": 1555
},
{
"epoch": 0.9579253159929971,
"grad_norm": 1.7939733266830444,
"learning_rate": 4.5923030203908203e-08,
"loss": 0.3778,
"step": 1556
},
{
"epoch": 0.9585409492294965,
"grad_norm": 1.6165034770965576,
"learning_rate": 4.4584280452001914e-08,
"loss": 0.3462,
"step": 1557
},
{
"epoch": 0.9591565824659959,
"grad_norm": 1.738077998161316,
"learning_rate": 4.32652466896466e-08,
"loss": 0.4012,
"step": 1558
},
{
"epoch": 0.9597722157024953,
"grad_norm": 1.7063754796981812,
"learning_rate": 4.196593416484873e-08,
"loss": 0.3752,
"step": 1559
},
{
"epoch": 0.9603878489389946,
"grad_norm": 1.73617684841156,
"learning_rate": 4.068634804715088e-08,
"loss": 0.3808,
"step": 1560
},
{
"epoch": 0.9610034821754939,
"grad_norm": 1.6154147386550903,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.3569,
"step": 1561
},
{
"epoch": 0.9616191154119933,
"grad_norm": 1.5781503915786743,
"learning_rate": 3.818637531878056e-08,
"loss": 0.3652,
"step": 1562
},
{
"epoch": 0.9622347486484927,
"grad_norm": 1.6501221656799316,
"learning_rate": 3.69659986546872e-08,
"loss": 0.3717,
"step": 1563
},
{
"epoch": 0.9628503818849921,
"grad_norm": 1.5871952772140503,
"learning_rate": 3.576536829081323e-08,
"loss": 0.3757,
"step": 1564
},
{
"epoch": 0.9634660151214913,
"grad_norm": 1.6805063486099243,
"learning_rate": 3.458448900407752e-08,
"loss": 0.3818,
"step": 1565
},
{
"epoch": 0.9640816483579907,
"grad_norm": 1.7517391443252563,
"learning_rate": 3.3423365492813994e-08,
"loss": 0.3799,
"step": 1566
},
{
"epoch": 0.9646972815944901,
"grad_norm": 1.4884332418441772,
"learning_rate": 3.2282002376756163e-08,
"loss": 0.3387,
"step": 1567
},
{
"epoch": 0.9653129148309895,
"grad_norm": 1.7127952575683594,
"learning_rate": 3.1160404197018155e-08,
"loss": 0.3873,
"step": 1568
},
{
"epoch": 0.9659285480674888,
"grad_norm": 1.7407442331314087,
"learning_rate": 3.005857541607371e-08,
"loss": 0.3916,
"step": 1569
},
{
"epoch": 0.9665441813039881,
"grad_norm": 1.687168002128601,
"learning_rate": 2.8976520417742794e-08,
"loss": 0.3894,
"step": 1570
},
{
"epoch": 0.9671598145404875,
"grad_norm": 1.6315783262252808,
"learning_rate": 2.7914243507169427e-08,
"loss": 0.3603,
"step": 1571
},
{
"epoch": 0.9677754477769869,
"grad_norm": 1.6698793172836304,
"learning_rate": 2.6871748910808903e-08,
"loss": 0.3667,
"step": 1572
},
{
"epoch": 0.9683910810134863,
"grad_norm": 1.6630492210388184,
"learning_rate": 2.584904077640893e-08,
"loss": 0.3811,
"step": 1573
},
{
"epoch": 0.9690067142499855,
"grad_norm": 1.758796215057373,
"learning_rate": 2.4846123172992953e-08,
"loss": 0.404,
"step": 1574
},
{
"epoch": 0.9696223474864849,
"grad_norm": 1.6306681632995605,
"learning_rate": 2.386300009084408e-08,
"loss": 0.3869,
"step": 1575
},
{
"epoch": 0.9702379807229843,
"grad_norm": 1.671326994895935,
"learning_rate": 2.2899675441490078e-08,
"loss": 0.3626,
"step": 1576
},
{
"epoch": 0.9708536139594837,
"grad_norm": 1.7411401271820068,
"learning_rate": 2.195615305768617e-08,
"loss": 0.3879,
"step": 1577
},
{
"epoch": 0.971469247195983,
"grad_norm": 1.7686033248901367,
"learning_rate": 2.103243669340227e-08,
"loss": 0.408,
"step": 1578
},
{
"epoch": 0.9720848804324823,
"grad_norm": 1.715114712715149,
"learning_rate": 2.012853002380466e-08,
"loss": 0.3809,
"step": 1579
},
{
"epoch": 0.9727005136689817,
"grad_norm": 1.6715350151062012,
"learning_rate": 1.9244436645246002e-08,
"loss": 0.3737,
"step": 1580
},
{
"epoch": 0.9733161469054811,
"grad_norm": 1.6584084033966064,
"learning_rate": 1.838016007524479e-08,
"loss": 0.3678,
"step": 1581
},
{
"epoch": 0.9739317801419805,
"grad_norm": 1.616811990737915,
"learning_rate": 1.753570375247815e-08,
"loss": 0.3659,
"step": 1582
},
{
"epoch": 0.9745474133784797,
"grad_norm": 1.5945422649383545,
"learning_rate": 1.6711071036763506e-08,
"loss": 0.3612,
"step": 1583
},
{
"epoch": 0.9751630466149791,
"grad_norm": 1.6576652526855469,
"learning_rate": 1.590626520904526e-08,
"loss": 0.3926,
"step": 1584
},
{
"epoch": 0.9757786798514785,
"grad_norm": 1.6075187921524048,
"learning_rate": 1.5121289471385915e-08,
"loss": 0.3596,
"step": 1585
},
{
"epoch": 0.9763943130879779,
"grad_norm": 1.673209547996521,
"learning_rate": 1.4356146946948313e-08,
"loss": 0.3785,
"step": 1586
},
{
"epoch": 0.9770099463244772,
"grad_norm": 1.7732353210449219,
"learning_rate": 1.3610840679985638e-08,
"loss": 0.3842,
"step": 1587
},
{
"epoch": 0.9776255795609765,
"grad_norm": 1.6880178451538086,
"learning_rate": 1.2885373635829756e-08,
"loss": 0.3914,
"step": 1588
},
{
"epoch": 0.9782412127974759,
"grad_norm": 1.6833375692367554,
"learning_rate": 1.2179748700879013e-08,
"loss": 0.3878,
"step": 1589
},
{
"epoch": 0.9788568460339753,
"grad_norm": 1.6276341676712036,
"learning_rate": 1.14939686825849e-08,
"loss": 0.3786,
"step": 1590
},
{
"epoch": 0.9794724792704746,
"grad_norm": 1.8428518772125244,
"learning_rate": 1.0828036309443735e-08,
"loss": 0.4015,
"step": 1591
},
{
"epoch": 0.9800881125069739,
"grad_norm": 1.7889384031295776,
"learning_rate": 1.0181954230983893e-08,
"loss": 0.3867,
"step": 1592
},
{
"epoch": 0.9807037457434733,
"grad_norm": 1.6265109777450562,
"learning_rate": 9.555725017756922e-09,
"loss": 0.372,
"step": 1593
},
{
"epoch": 0.9813193789799727,
"grad_norm": 1.63323974609375,
"learning_rate": 8.949351161324227e-09,
"loss": 0.3725,
"step": 1594
},
{
"epoch": 0.9819350122164721,
"grad_norm": 1.5772345066070557,
"learning_rate": 8.362835074251508e-09,
"loss": 0.3765,
"step": 1595
},
{
"epoch": 0.9825506454529714,
"grad_norm": 1.7093181610107422,
"learning_rate": 7.796179090094891e-09,
"loss": 0.3648,
"step": 1596
},
{
"epoch": 0.9831662786894707,
"grad_norm": 1.6425986289978027,
"learning_rate": 7.249385463395375e-09,
"loss": 0.3802,
"step": 1597
},
{
"epoch": 0.9837819119259701,
"grad_norm": 1.6807538270950317,
"learning_rate": 6.722456369666619e-09,
"loss": 0.3741,
"step": 1598
},
{
"epoch": 0.9843975451624695,
"grad_norm": 1.6659401655197144,
"learning_rate": 6.215393905388278e-09,
"loss": 0.3929,
"step": 1599
},
{
"epoch": 0.9850131783989688,
"grad_norm": 1.6375937461853027,
"learning_rate": 5.728200087997126e-09,
"loss": 0.3676,
"step": 1600
},
{
"epoch": 0.9850131783989688,
"eval_loss": 0.37590205669403076,
"eval_runtime": 118.0408,
"eval_samples_per_second": 35.589,
"eval_steps_per_second": 4.456,
"step": 1600
},
{
"epoch": 0.9856288116354681,
"grad_norm": 1.6175541877746582,
"learning_rate": 5.2608768558798376e-09,
"loss": 0.3793,
"step": 1601
},
{
"epoch": 0.9862444448719675,
"grad_norm": 1.5827006101608276,
"learning_rate": 4.813426068362992e-09,
"loss": 0.3577,
"step": 1602
},
{
"epoch": 0.9868600781084669,
"grad_norm": 1.6540141105651855,
"learning_rate": 4.385849505708084e-09,
"loss": 0.3754,
"step": 1603
},
{
"epoch": 0.9874757113449663,
"grad_norm": 1.7661128044128418,
"learning_rate": 3.978148869103748e-09,
"loss": 0.4099,
"step": 1604
},
{
"epoch": 0.9880913445814656,
"grad_norm": 1.6497414112091064,
"learning_rate": 3.5903257806579884e-09,
"loss": 0.3911,
"step": 1605
},
{
"epoch": 0.9887069778179649,
"grad_norm": 1.6843533515930176,
"learning_rate": 3.2223817833931803e-09,
"loss": 0.36,
"step": 1606
},
{
"epoch": 0.9893226110544643,
"grad_norm": 1.7438652515411377,
"learning_rate": 2.8743183412388578e-09,
"loss": 0.385,
"step": 1607
},
{
"epoch": 0.9899382442909637,
"grad_norm": 1.6070297956466675,
"learning_rate": 2.5461368390261587e-09,
"loss": 0.3675,
"step": 1608
},
{
"epoch": 0.990553877527463,
"grad_norm": 1.569028377532959,
"learning_rate": 2.237838582483387e-09,
"loss": 0.3662,
"step": 1609
},
{
"epoch": 0.9911695107639624,
"grad_norm": 1.6607086658477783,
"learning_rate": 1.9494247982282386e-09,
"loss": 0.3952,
"step": 1610
},
{
"epoch": 0.9917851440004617,
"grad_norm": 1.6013164520263672,
"learning_rate": 1.6808966337661382e-09,
"loss": 0.3746,
"step": 1611
},
{
"epoch": 0.9924007772369611,
"grad_norm": 1.690483808517456,
"learning_rate": 1.4322551574830202e-09,
"loss": 0.3731,
"step": 1612
},
{
"epoch": 0.9930164104734605,
"grad_norm": 1.6636048555374146,
"learning_rate": 1.203501358642556e-09,
"loss": 0.375,
"step": 1613
},
{
"epoch": 0.9936320437099598,
"grad_norm": 1.710705041885376,
"learning_rate": 9.946361473822664e-10,
"loss": 0.3713,
"step": 1614
},
{
"epoch": 0.9942476769464591,
"grad_norm": 1.6652582883834839,
"learning_rate": 8.056603547090813e-10,
"loss": 0.4017,
"step": 1615
},
{
"epoch": 0.9948633101829585,
"grad_norm": 1.7423772811889648,
"learning_rate": 6.365747324954541e-10,
"loss": 0.3843,
"step": 1616
},
{
"epoch": 0.9954789434194579,
"grad_norm": 1.6425862312316895,
"learning_rate": 4.87379953478806e-10,
"loss": 0.3755,
"step": 1617
},
{
"epoch": 0.9960945766559572,
"grad_norm": 1.648992657661438,
"learning_rate": 3.580766112565304e-10,
"loss": 0.3831,
"step": 1618
},
{
"epoch": 0.9967102098924566,
"grad_norm": 1.7648104429244995,
"learning_rate": 2.486652202848827e-10,
"loss": 0.3979,
"step": 1619
},
{
"epoch": 0.9973258431289559,
"grad_norm": 1.735405445098877,
"learning_rate": 1.591462158756496e-10,
"loss": 0.3663,
"step": 1620
},
{
"epoch": 0.9979414763654553,
"grad_norm": 1.6370799541473389,
"learning_rate": 8.951995419614889e-11,
"loss": 0.3798,
"step": 1621
},
{
"epoch": 0.9985571096019547,
"grad_norm": 1.6457384824752808,
"learning_rate": 3.9786712267009256e-11,
"loss": 0.3724,
"step": 1622
},
{
"epoch": 0.999172742838454,
"grad_norm": 1.5603142976760864,
"learning_rate": 9.946687960504797e-12,
"loss": 0.3682,
"step": 1623
},
{
"epoch": 0.9997883760749533,
"grad_norm": 1.7614634037017822,
"learning_rate": 0.0,
"loss": 0.3983,
"step": 1624
}
],
"logging_steps": 1.0,
"max_steps": 1624,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.8093512752785e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}