{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996983408748115,
"eval_steps": 208,
"global_step": 1657,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006033182503770739,
"grad_norm": 2.140331984013226,
"learning_rate": 0.0,
"loss": 2.4164,
"step": 1
},
{
"epoch": 0.0006033182503770739,
"eval_loss": 2.440932273864746,
"eval_runtime": 21.8578,
"eval_samples_per_second": 4.026,
"eval_steps_per_second": 0.503,
"step": 1
},
{
"epoch": 0.0012066365007541479,
"grad_norm": 2.131881596246372,
"learning_rate": 5.000000000000001e-07,
"loss": 2.3859,
"step": 2
},
{
"epoch": 0.0018099547511312218,
"grad_norm": 2.0329130314081363,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.4842,
"step": 3
},
{
"epoch": 0.0024132730015082957,
"grad_norm": 2.0136673779808563,
"learning_rate": 1.5e-06,
"loss": 2.2674,
"step": 4
},
{
"epoch": 0.0030165912518853697,
"grad_norm": 2.301421361800185,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.4779,
"step": 5
},
{
"epoch": 0.0036199095022624436,
"grad_norm": 2.1048944577756146,
"learning_rate": 2.5e-06,
"loss": 2.4403,
"step": 6
},
{
"epoch": 0.004223227752639517,
"grad_norm": 2.2857994912312045,
"learning_rate": 3e-06,
"loss": 2.3985,
"step": 7
},
{
"epoch": 0.0048265460030165915,
"grad_norm": 1.9373222563858792,
"learning_rate": 3.5e-06,
"loss": 2.4469,
"step": 8
},
{
"epoch": 0.005429864253393665,
"grad_norm": 1.8828732336223377,
"learning_rate": 4.000000000000001e-06,
"loss": 2.4803,
"step": 9
},
{
"epoch": 0.006033182503770739,
"grad_norm": 1.770391428824802,
"learning_rate": 4.5e-06,
"loss": 2.4667,
"step": 10
},
{
"epoch": 0.006636500754147813,
"grad_norm": 1.7040163364876169,
"learning_rate": 5e-06,
"loss": 2.3847,
"step": 11
},
{
"epoch": 0.007239819004524887,
"grad_norm": 1.6348390447246268,
"learning_rate": 5.500000000000001e-06,
"loss": 2.4091,
"step": 12
},
{
"epoch": 0.00784313725490196,
"grad_norm": 1.8132153740408123,
"learning_rate": 6e-06,
"loss": 2.3761,
"step": 13
},
{
"epoch": 0.008446455505279034,
"grad_norm": 1.7688677203718561,
"learning_rate": 6.5000000000000004e-06,
"loss": 2.3668,
"step": 14
},
{
"epoch": 0.00904977375565611,
"grad_norm": 2.012566995500888,
"learning_rate": 7e-06,
"loss": 2.541,
"step": 15
},
{
"epoch": 0.009653092006033183,
"grad_norm": 2.344552551150623,
"learning_rate": 7.500000000000001e-06,
"loss": 2.4033,
"step": 16
},
{
"epoch": 0.010256410256410256,
"grad_norm": 2.239204038709957,
"learning_rate": 8.000000000000001e-06,
"loss": 2.4575,
"step": 17
},
{
"epoch": 0.01085972850678733,
"grad_norm": 1.7688865688548743,
"learning_rate": 8.5e-06,
"loss": 2.3797,
"step": 18
},
{
"epoch": 0.011463046757164403,
"grad_norm": 1.6318435610254944,
"learning_rate": 9e-06,
"loss": 2.4044,
"step": 19
},
{
"epoch": 0.012066365007541479,
"grad_norm": 1.8510626760551105,
"learning_rate": 9.5e-06,
"loss": 2.4719,
"step": 20
},
{
"epoch": 0.012669683257918552,
"grad_norm": 1.7111956301378386,
"learning_rate": 1e-05,
"loss": 2.3634,
"step": 21
},
{
"epoch": 0.013273001508295626,
"grad_norm": 1.9210730110130552,
"learning_rate": 1.0500000000000001e-05,
"loss": 2.4445,
"step": 22
},
{
"epoch": 0.013876319758672699,
"grad_norm": 1.817913869449993,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.4591,
"step": 23
},
{
"epoch": 0.014479638009049774,
"grad_norm": 1.870987381889623,
"learning_rate": 1.15e-05,
"loss": 2.4188,
"step": 24
},
{
"epoch": 0.015082956259426848,
"grad_norm": 1.7931111751340785,
"learning_rate": 1.2e-05,
"loss": 2.3321,
"step": 25
},
{
"epoch": 0.01568627450980392,
"grad_norm": 1.8300547920846246,
"learning_rate": 1.25e-05,
"loss": 2.5083,
"step": 26
},
{
"epoch": 0.016289592760180997,
"grad_norm": 1.886339441644327,
"learning_rate": 1.3000000000000001e-05,
"loss": 2.4205,
"step": 27
},
{
"epoch": 0.01689291101055807,
"grad_norm": 1.672397817180637,
"learning_rate": 1.3500000000000001e-05,
"loss": 2.3449,
"step": 28
},
{
"epoch": 0.017496229260935144,
"grad_norm": 1.701493800333666,
"learning_rate": 1.4e-05,
"loss": 2.3802,
"step": 29
},
{
"epoch": 0.01809954751131222,
"grad_norm": 1.7101969275215947,
"learning_rate": 1.45e-05,
"loss": 2.4433,
"step": 30
},
{
"epoch": 0.01870286576168929,
"grad_norm": 1.622468433014236,
"learning_rate": 1.5000000000000002e-05,
"loss": 2.3849,
"step": 31
},
{
"epoch": 0.019306184012066366,
"grad_norm": 1.7244402516216317,
"learning_rate": 1.55e-05,
"loss": 2.3494,
"step": 32
},
{
"epoch": 0.019909502262443438,
"grad_norm": 1.6968966331071,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.3825,
"step": 33
},
{
"epoch": 0.020512820512820513,
"grad_norm": 1.7580609504607645,
"learning_rate": 1.65e-05,
"loss": 2.4379,
"step": 34
},
{
"epoch": 0.021116138763197588,
"grad_norm": 1.764259902209323,
"learning_rate": 1.7e-05,
"loss": 2.4232,
"step": 35
},
{
"epoch": 0.02171945701357466,
"grad_norm": 1.8164208533771704,
"learning_rate": 1.7500000000000002e-05,
"loss": 2.364,
"step": 36
},
{
"epoch": 0.022322775263951735,
"grad_norm": 1.689659414180739,
"learning_rate": 1.8e-05,
"loss": 2.5181,
"step": 37
},
{
"epoch": 0.022926093514328807,
"grad_norm": 1.7385790082200887,
"learning_rate": 1.8500000000000002e-05,
"loss": 2.4695,
"step": 38
},
{
"epoch": 0.023529411764705882,
"grad_norm": 1.635856548315033,
"learning_rate": 1.9e-05,
"loss": 2.3484,
"step": 39
},
{
"epoch": 0.024132730015082957,
"grad_norm": 1.7607010990183498,
"learning_rate": 1.95e-05,
"loss": 2.4922,
"step": 40
},
{
"epoch": 0.02473604826546003,
"grad_norm": 1.6736592370180061,
"learning_rate": 2e-05,
"loss": 2.3143,
"step": 41
},
{
"epoch": 0.025339366515837104,
"grad_norm": 1.7437431992559804,
"learning_rate": 1.999998112662482e-05,
"loss": 2.36,
"step": 42
},
{
"epoch": 0.02594268476621418,
"grad_norm": 1.7788761951573173,
"learning_rate": 1.999992450657051e-05,
"loss": 2.3651,
"step": 43
},
{
"epoch": 0.02654600301659125,
"grad_norm": 1.74387066132697,
"learning_rate": 1.9999830140050802e-05,
"loss": 2.3465,
"step": 44
},
{
"epoch": 0.027149321266968326,
"grad_norm": 1.7474556357496291,
"learning_rate": 1.9999698027421894e-05,
"loss": 2.3534,
"step": 45
},
{
"epoch": 0.027752639517345398,
"grad_norm": 1.7289399191537893,
"learning_rate": 1.9999528169182472e-05,
"loss": 2.3961,
"step": 46
},
{
"epoch": 0.028355957767722473,
"grad_norm": 1.7709476810067486,
"learning_rate": 1.999932056597369e-05,
"loss": 2.364,
"step": 47
},
{
"epoch": 0.02895927601809955,
"grad_norm": 1.7002848185604087,
"learning_rate": 1.9999075218579184e-05,
"loss": 2.4127,
"step": 48
},
{
"epoch": 0.02956259426847662,
"grad_norm": 1.6668830763317524,
"learning_rate": 1.9998792127925066e-05,
"loss": 2.4382,
"step": 49
},
{
"epoch": 0.030165912518853696,
"grad_norm": 1.8216191873465934,
"learning_rate": 1.9998471295079908e-05,
"loss": 2.4013,
"step": 50
},
{
"epoch": 0.03076923076923077,
"grad_norm": 1.7185319302782023,
"learning_rate": 1.999811272125474e-05,
"loss": 2.4387,
"step": 51
},
{
"epoch": 0.03137254901960784,
"grad_norm": 1.7446573922589135,
"learning_rate": 1.999771640780308e-05,
"loss": 2.4049,
"step": 52
},
{
"epoch": 0.031975867269984914,
"grad_norm": 1.7533551196856423,
"learning_rate": 1.999728235622087e-05,
"loss": 2.383,
"step": 53
},
{
"epoch": 0.03257918552036199,
"grad_norm": 2.4428720219194875,
"learning_rate": 1.999681056814652e-05,
"loss": 2.3984,
"step": 54
},
{
"epoch": 0.033182503770739065,
"grad_norm": 1.9117067623356128,
"learning_rate": 1.9996301045360874e-05,
"loss": 2.3987,
"step": 55
},
{
"epoch": 0.03378582202111614,
"grad_norm": 1.6871395703400287,
"learning_rate": 1.9995753789787212e-05,
"loss": 2.4025,
"step": 56
},
{
"epoch": 0.034389140271493215,
"grad_norm": 1.806353695603544,
"learning_rate": 1.9995168803491246e-05,
"loss": 2.285,
"step": 57
},
{
"epoch": 0.03499245852187029,
"grad_norm": 1.7210953438365815,
"learning_rate": 1.9994546088681116e-05,
"loss": 2.4058,
"step": 58
},
{
"epoch": 0.03559577677224736,
"grad_norm": 1.6710134197503792,
"learning_rate": 1.9993885647707363e-05,
"loss": 2.4324,
"step": 59
},
{
"epoch": 0.03619909502262444,
"grad_norm": 1.6682273145402082,
"learning_rate": 1.9993187483062935e-05,
"loss": 2.4344,
"step": 60
},
{
"epoch": 0.03680241327300151,
"grad_norm": 1.9225767969813456,
"learning_rate": 1.999245159738318e-05,
"loss": 2.4027,
"step": 61
},
{
"epoch": 0.03740573152337858,
"grad_norm": 1.7589628857228554,
"learning_rate": 1.9991677993445832e-05,
"loss": 2.4609,
"step": 62
},
{
"epoch": 0.03800904977375565,
"grad_norm": 1.9221108803705138,
"learning_rate": 1.9990866674170984e-05,
"loss": 2.3392,
"step": 63
},
{
"epoch": 0.03861236802413273,
"grad_norm": 1.9187569364342647,
"learning_rate": 1.999001764262111e-05,
"loss": 2.3234,
"step": 64
},
{
"epoch": 0.0392156862745098,
"grad_norm": 1.731376317705545,
"learning_rate": 1.9989130902001025e-05,
"loss": 2.344,
"step": 65
},
{
"epoch": 0.039819004524886875,
"grad_norm": 1.6530803542103498,
"learning_rate": 1.9988206455657887e-05,
"loss": 2.4121,
"step": 66
},
{
"epoch": 0.040422322775263954,
"grad_norm": 1.8599525275677855,
"learning_rate": 1.9987244307081184e-05,
"loss": 2.4073,
"step": 67
},
{
"epoch": 0.041025641025641026,
"grad_norm": 1.871690372257642,
"learning_rate": 1.998624445990271e-05,
"loss": 2.3929,
"step": 68
},
{
"epoch": 0.0416289592760181,
"grad_norm": 1.9038530342428834,
"learning_rate": 1.9985206917896563e-05,
"loss": 2.4434,
"step": 69
},
{
"epoch": 0.042232277526395176,
"grad_norm": 1.7143239850083882,
"learning_rate": 1.9984131684979134e-05,
"loss": 2.4668,
"step": 70
},
{
"epoch": 0.04283559577677225,
"grad_norm": 1.8822886351415713,
"learning_rate": 1.9983018765209067e-05,
"loss": 2.4079,
"step": 71
},
{
"epoch": 0.04343891402714932,
"grad_norm": 1.8419494425726253,
"learning_rate": 1.9981868162787283e-05,
"loss": 2.3937,
"step": 72
},
{
"epoch": 0.0440422322775264,
"grad_norm": 1.710334016900471,
"learning_rate": 1.9980679882056925e-05,
"loss": 2.3719,
"step": 73
},
{
"epoch": 0.04464555052790347,
"grad_norm": 1.7409130841301985,
"learning_rate": 1.9979453927503366e-05,
"loss": 2.5024,
"step": 74
},
{
"epoch": 0.04524886877828054,
"grad_norm": 1.9118707931356833,
"learning_rate": 1.997819030375419e-05,
"loss": 2.3466,
"step": 75
},
{
"epoch": 0.045852187028657614,
"grad_norm": 1.9263333015328319,
"learning_rate": 1.9976889015579167e-05,
"loss": 2.397,
"step": 76
},
{
"epoch": 0.04645550527903469,
"grad_norm": 1.9416725485562814,
"learning_rate": 1.997555006789023e-05,
"loss": 2.4406,
"step": 77
},
{
"epoch": 0.047058823529411764,
"grad_norm": 1.8059449377845525,
"learning_rate": 1.997417346574148e-05,
"loss": 2.3462,
"step": 78
},
{
"epoch": 0.047662141779788836,
"grad_norm": 2.182819038622641,
"learning_rate": 1.9972759214329142e-05,
"loss": 2.3722,
"step": 79
},
{
"epoch": 0.048265460030165915,
"grad_norm": 2.134570927089691,
"learning_rate": 1.9971307318991546e-05,
"loss": 2.3632,
"step": 80
},
{
"epoch": 0.048868778280542986,
"grad_norm": 2.081827313081015,
"learning_rate": 1.9969817785209137e-05,
"loss": 2.3737,
"step": 81
},
{
"epoch": 0.04947209653092006,
"grad_norm": 1.7992613052147577,
"learning_rate": 1.9968290618604413e-05,
"loss": 2.4025,
"step": 82
},
{
"epoch": 0.05007541478129714,
"grad_norm": 1.7171671224778233,
"learning_rate": 1.9966725824941933e-05,
"loss": 2.4723,
"step": 83
},
{
"epoch": 0.05067873303167421,
"grad_norm": 1.705969280625204,
"learning_rate": 1.9965123410128287e-05,
"loss": 2.3856,
"step": 84
},
{
"epoch": 0.05128205128205128,
"grad_norm": 1.9104369520615112,
"learning_rate": 1.996348338021207e-05,
"loss": 2.4186,
"step": 85
},
{
"epoch": 0.05188536953242836,
"grad_norm": 2.042807057903928,
"learning_rate": 1.9961805741383862e-05,
"loss": 2.4081,
"step": 86
},
{
"epoch": 0.05248868778280543,
"grad_norm": 2.07582997320187,
"learning_rate": 1.99600904999762e-05,
"loss": 2.3573,
"step": 87
},
{
"epoch": 0.0530920060331825,
"grad_norm": 1.7304234841275201,
"learning_rate": 1.995833766246357e-05,
"loss": 2.3617,
"step": 88
},
{
"epoch": 0.053695324283559574,
"grad_norm": 1.827279827289296,
"learning_rate": 1.995654723546236e-05,
"loss": 2.3151,
"step": 89
},
{
"epoch": 0.05429864253393665,
"grad_norm": 1.795039145615716,
"learning_rate": 1.9954719225730847e-05,
"loss": 2.3447,
"step": 90
},
{
"epoch": 0.054901960784313725,
"grad_norm": 1.9487853189804405,
"learning_rate": 1.995285364016918e-05,
"loss": 2.3791,
"step": 91
},
{
"epoch": 0.055505279034690796,
"grad_norm": 1.8727113692711528,
"learning_rate": 1.9950950485819334e-05,
"loss": 2.3648,
"step": 92
},
{
"epoch": 0.056108597285067875,
"grad_norm": 1.796921337605627,
"learning_rate": 1.99490097698651e-05,
"loss": 2.377,
"step": 93
},
{
"epoch": 0.05671191553544495,
"grad_norm": 1.8406515054937658,
"learning_rate": 1.994703149963205e-05,
"loss": 2.3786,
"step": 94
},
{
"epoch": 0.05731523378582202,
"grad_norm": 1.9102332156136683,
"learning_rate": 1.9945015682587512e-05,
"loss": 2.4431,
"step": 95
},
{
"epoch": 0.0579185520361991,
"grad_norm": 1.74427627997967,
"learning_rate": 1.994296232634054e-05,
"loss": 2.3747,
"step": 96
},
{
"epoch": 0.05852187028657617,
"grad_norm": 1.916801794748654,
"learning_rate": 1.994087143864188e-05,
"loss": 2.3677,
"step": 97
},
{
"epoch": 0.05912518853695324,
"grad_norm": 1.9361095372662218,
"learning_rate": 1.9938743027383966e-05,
"loss": 2.3377,
"step": 98
},
{
"epoch": 0.05972850678733032,
"grad_norm": 1.6654143015011817,
"learning_rate": 1.9936577100600848e-05,
"loss": 2.3843,
"step": 99
},
{
"epoch": 0.06033182503770739,
"grad_norm": 1.739958437002719,
"learning_rate": 1.9934373666468203e-05,
"loss": 2.4005,
"step": 100
},
{
"epoch": 0.06093514328808446,
"grad_norm": 1.903097711755015,
"learning_rate": 1.9932132733303273e-05,
"loss": 2.3827,
"step": 101
},
{
"epoch": 0.06153846153846154,
"grad_norm": 1.8795398203703997,
"learning_rate": 1.9929854309564858e-05,
"loss": 2.3317,
"step": 102
},
{
"epoch": 0.062141779788838614,
"grad_norm": 1.7431133865940558,
"learning_rate": 1.992753840385326e-05,
"loss": 2.4836,
"step": 103
},
{
"epoch": 0.06274509803921569,
"grad_norm": 1.9216277777624893,
"learning_rate": 1.992518502491028e-05,
"loss": 2.4088,
"step": 104
},
{
"epoch": 0.06334841628959276,
"grad_norm": 1.7436250240847813,
"learning_rate": 1.992279418161915e-05,
"loss": 2.309,
"step": 105
},
{
"epoch": 0.06395173453996983,
"grad_norm": 1.80540945976844,
"learning_rate": 1.992036588300453e-05,
"loss": 2.3845,
"step": 106
},
{
"epoch": 0.06455505279034691,
"grad_norm": 1.8598471706172288,
"learning_rate": 1.991790013823246e-05,
"loss": 2.3712,
"step": 107
},
{
"epoch": 0.06515837104072399,
"grad_norm": 1.7251014111069356,
"learning_rate": 1.9915396956610328e-05,
"loss": 2.3913,
"step": 108
},
{
"epoch": 0.06576168929110106,
"grad_norm": 1.8148853902478392,
"learning_rate": 1.991285634758682e-05,
"loss": 2.3982,
"step": 109
},
{
"epoch": 0.06636500754147813,
"grad_norm": 1.7498689126618199,
"learning_rate": 1.991027832075192e-05,
"loss": 2.4465,
"step": 110
},
{
"epoch": 0.0669683257918552,
"grad_norm": 1.7457643777959446,
"learning_rate": 1.9907662885836836e-05,
"loss": 2.4567,
"step": 111
},
{
"epoch": 0.06757164404223227,
"grad_norm": 1.7733087443387425,
"learning_rate": 1.9905010052713988e-05,
"loss": 2.3656,
"step": 112
},
{
"epoch": 0.06817496229260935,
"grad_norm": 1.78400076670591,
"learning_rate": 1.9902319831396956e-05,
"loss": 2.4547,
"step": 113
},
{
"epoch": 0.06877828054298643,
"grad_norm": 1.8501073003356572,
"learning_rate": 1.9899592232040454e-05,
"loss": 2.3982,
"step": 114
},
{
"epoch": 0.0693815987933635,
"grad_norm": 1.7939030174137776,
"learning_rate": 1.989682726494028e-05,
"loss": 2.4242,
"step": 115
},
{
"epoch": 0.06998491704374057,
"grad_norm": 1.739031017675765,
"learning_rate": 1.989402494053329e-05,
"loss": 2.309,
"step": 116
},
{
"epoch": 0.07058823529411765,
"grad_norm": 1.726265958686578,
"learning_rate": 1.9891185269397347e-05,
"loss": 2.3979,
"step": 117
},
{
"epoch": 0.07119155354449472,
"grad_norm": 1.8241387788082768,
"learning_rate": 1.9888308262251286e-05,
"loss": 2.4293,
"step": 118
},
{
"epoch": 0.07179487179487179,
"grad_norm": 1.796999661875299,
"learning_rate": 1.9885393929954876e-05,
"loss": 2.4281,
"step": 119
},
{
"epoch": 0.07239819004524888,
"grad_norm": 1.8359034488503743,
"learning_rate": 1.988244228350877e-05,
"loss": 2.3915,
"step": 120
},
{
"epoch": 0.07300150829562595,
"grad_norm": 1.8354333144775479,
"learning_rate": 1.9879453334054476e-05,
"loss": 2.3626,
"step": 121
},
{
"epoch": 0.07360482654600302,
"grad_norm": 1.7464715815370597,
"learning_rate": 1.987642709287431e-05,
"loss": 2.3856,
"step": 122
},
{
"epoch": 0.07420814479638009,
"grad_norm": 1.7365107395117005,
"learning_rate": 1.9873363571391344e-05,
"loss": 2.3996,
"step": 123
},
{
"epoch": 0.07481146304675716,
"grad_norm": 1.7674652876441117,
"learning_rate": 1.9870262781169378e-05,
"loss": 2.4401,
"step": 124
},
{
"epoch": 0.07541478129713423,
"grad_norm": 1.731649233262622,
"learning_rate": 1.986712473391289e-05,
"loss": 2.4364,
"step": 125
},
{
"epoch": 0.0760180995475113,
"grad_norm": 1.7269243496040354,
"learning_rate": 1.9863949441466988e-05,
"loss": 2.3828,
"step": 126
},
{
"epoch": 0.07662141779788839,
"grad_norm": 1.732468058395144,
"learning_rate": 1.9860736915817365e-05,
"loss": 2.3844,
"step": 127
},
{
"epoch": 0.07722473604826546,
"grad_norm": 1.7808885852323488,
"learning_rate": 1.9857487169090265e-05,
"loss": 2.4133,
"step": 128
},
{
"epoch": 0.07782805429864253,
"grad_norm": 1.9023595705344876,
"learning_rate": 1.9854200213552426e-05,
"loss": 2.4064,
"step": 129
},
{
"epoch": 0.0784313725490196,
"grad_norm": 1.7783293299159864,
"learning_rate": 1.9850876061611036e-05,
"loss": 2.4363,
"step": 130
},
{
"epoch": 0.07903469079939668,
"grad_norm": 1.874231784614395,
"learning_rate": 1.984751472581369e-05,
"loss": 2.3655,
"step": 131
},
{
"epoch": 0.07963800904977375,
"grad_norm": 1.7499368726095257,
"learning_rate": 1.9844116218848335e-05,
"loss": 2.335,
"step": 132
},
{
"epoch": 0.08024132730015084,
"grad_norm": 2.6022340689179075,
"learning_rate": 1.984068055354323e-05,
"loss": 2.347,
"step": 133
},
{
"epoch": 0.08084464555052791,
"grad_norm": 1.9347849143168379,
"learning_rate": 1.98372077428669e-05,
"loss": 2.4458,
"step": 134
},
{
"epoch": 0.08144796380090498,
"grad_norm": 1.8916873969542793,
"learning_rate": 1.9833697799928074e-05,
"loss": 2.419,
"step": 135
},
{
"epoch": 0.08205128205128205,
"grad_norm": 1.980019343205032,
"learning_rate": 1.9830150737975648e-05,
"loss": 2.3749,
"step": 136
},
{
"epoch": 0.08265460030165912,
"grad_norm": 1.7724669285888033,
"learning_rate": 1.9826566570398622e-05,
"loss": 2.4208,
"step": 137
},
{
"epoch": 0.0832579185520362,
"grad_norm": 1.749167988541911,
"learning_rate": 1.982294531072607e-05,
"loss": 2.429,
"step": 138
},
{
"epoch": 0.08386123680241327,
"grad_norm": 1.6677676621185875,
"learning_rate": 1.9819286972627066e-05,
"loss": 2.3928,
"step": 139
},
{
"epoch": 0.08446455505279035,
"grad_norm": 1.9850127221560177,
"learning_rate": 1.9815591569910654e-05,
"loss": 2.3877,
"step": 140
},
{
"epoch": 0.08506787330316742,
"grad_norm": 1.7617725554904797,
"learning_rate": 1.9811859116525774e-05,
"loss": 2.3771,
"step": 141
},
{
"epoch": 0.0856711915535445,
"grad_norm": 1.91372872316542,
"learning_rate": 1.9808089626561226e-05,
"loss": 2.3356,
"step": 142
},
{
"epoch": 0.08627450980392157,
"grad_norm": 1.7024663523887813,
"learning_rate": 1.9804283114245605e-05,
"loss": 2.4353,
"step": 143
},
{
"epoch": 0.08687782805429864,
"grad_norm": 1.9337653881090566,
"learning_rate": 1.9800439593947262e-05,
"loss": 2.3856,
"step": 144
},
{
"epoch": 0.08748114630467571,
"grad_norm": 1.9735612724311176,
"learning_rate": 1.979655908017424e-05,
"loss": 2.4385,
"step": 145
},
{
"epoch": 0.0880844645550528,
"grad_norm": 1.7880500112263782,
"learning_rate": 1.9792641587574212e-05,
"loss": 2.4161,
"step": 146
},
{
"epoch": 0.08868778280542987,
"grad_norm": 1.9465027868919351,
"learning_rate": 1.9788687130934445e-05,
"loss": 2.3453,
"step": 147
},
{
"epoch": 0.08929110105580694,
"grad_norm": 1.7593991336221384,
"learning_rate": 1.9784695725181722e-05,
"loss": 2.339,
"step": 148
},
{
"epoch": 0.08989441930618401,
"grad_norm": 1.738025895590932,
"learning_rate": 1.9780667385382303e-05,
"loss": 2.3519,
"step": 149
},
{
"epoch": 0.09049773755656108,
"grad_norm": 2.024610431953343,
"learning_rate": 1.9776602126741867e-05,
"loss": 2.3997,
"step": 150
},
{
"epoch": 0.09110105580693816,
"grad_norm": 1.7286397557211362,
"learning_rate": 1.977249996460544e-05,
"loss": 2.4231,
"step": 151
},
{
"epoch": 0.09170437405731523,
"grad_norm": 1.9340147315431204,
"learning_rate": 1.9768360914457355e-05,
"loss": 2.3326,
"step": 152
},
{
"epoch": 0.09230769230769231,
"grad_norm": 1.7528407442090708,
"learning_rate": 1.9764184991921178e-05,
"loss": 2.4433,
"step": 153
},
{
"epoch": 0.09291101055806938,
"grad_norm": 1.7779252807636488,
"learning_rate": 1.9759972212759657e-05,
"loss": 2.3569,
"step": 154
},
{
"epoch": 0.09351432880844646,
"grad_norm": 1.8010667696730225,
"learning_rate": 1.975572259287467e-05,
"loss": 2.3723,
"step": 155
},
{
"epoch": 0.09411764705882353,
"grad_norm": 1.9083856537037562,
"learning_rate": 1.9751436148307145e-05,
"loss": 2.4458,
"step": 156
},
{
"epoch": 0.0947209653092006,
"grad_norm": 1.760952758078511,
"learning_rate": 1.9747112895237025e-05,
"loss": 2.3695,
"step": 157
},
{
"epoch": 0.09532428355957767,
"grad_norm": 1.7712302777595437,
"learning_rate": 1.974275284998318e-05,
"loss": 2.3216,
"step": 158
},
{
"epoch": 0.09592760180995476,
"grad_norm": 1.8823126275701794,
"learning_rate": 1.9738356029003367e-05,
"loss": 2.4338,
"step": 159
},
{
"epoch": 0.09653092006033183,
"grad_norm": 1.6853694604703957,
"learning_rate": 1.973392244889415e-05,
"loss": 2.3782,
"step": 160
},
{
"epoch": 0.0971342383107089,
"grad_norm": 1.9145605199325024,
"learning_rate": 1.972945212639086e-05,
"loss": 2.3911,
"step": 161
},
{
"epoch": 0.09773755656108597,
"grad_norm": 1.8510508481654304,
"learning_rate": 1.9724945078367513e-05,
"loss": 2.4365,
"step": 162
},
{
"epoch": 0.09834087481146304,
"grad_norm": 1.7055487785395052,
"learning_rate": 1.9720401321836742e-05,
"loss": 2.3917,
"step": 163
},
{
"epoch": 0.09894419306184012,
"grad_norm": 1.8561955372145713,
"learning_rate": 1.971582087394976e-05,
"loss": 2.4272,
"step": 164
},
{
"epoch": 0.09954751131221719,
"grad_norm": 1.828842762360334,
"learning_rate": 1.9711203751996267e-05,
"loss": 2.3263,
"step": 165
},
{
"epoch": 0.10015082956259427,
"grad_norm": 2.1464668211309377,
"learning_rate": 1.9706549973404394e-05,
"loss": 2.4101,
"step": 166
},
{
"epoch": 0.10075414781297135,
"grad_norm": 1.7924458294359513,
"learning_rate": 1.9701859555740647e-05,
"loss": 2.3067,
"step": 167
},
{
"epoch": 0.10135746606334842,
"grad_norm": 1.9914703905505418,
"learning_rate": 1.9697132516709826e-05,
"loss": 2.4465,
"step": 168
},
{
"epoch": 0.10196078431372549,
"grad_norm": 2.1516167037282234,
"learning_rate": 1.9692368874154966e-05,
"loss": 2.3477,
"step": 169
},
{
"epoch": 0.10256410256410256,
"grad_norm": 2.2682594453717195,
"learning_rate": 1.9687568646057277e-05,
"loss": 2.4296,
"step": 170
},
{
"epoch": 0.10316742081447963,
"grad_norm": 1.7325214885723506,
"learning_rate": 1.9682731850536054e-05,
"loss": 2.3712,
"step": 171
},
{
"epoch": 0.10377073906485672,
"grad_norm": 1.8188841894294345,
"learning_rate": 1.9677858505848627e-05,
"loss": 2.3909,
"step": 172
},
{
"epoch": 0.10437405731523379,
"grad_norm": 1.8475101959381366,
"learning_rate": 1.9672948630390296e-05,
"loss": 2.3942,
"step": 173
},
{
"epoch": 0.10497737556561086,
"grad_norm": 2.0699573667060194,
"learning_rate": 1.966800224269424e-05,
"loss": 2.4203,
"step": 174
},
{
"epoch": 0.10558069381598793,
"grad_norm": 1.8970948583101985,
"learning_rate": 1.966301936143146e-05,
"loss": 2.378,
"step": 175
},
{
"epoch": 0.106184012066365,
"grad_norm": 1.918771810482122,
"learning_rate": 1.965800000541072e-05,
"loss": 2.3355,
"step": 176
},
{
"epoch": 0.10678733031674208,
"grad_norm": 1.9104732128969586,
"learning_rate": 1.965294419357846e-05,
"loss": 2.3714,
"step": 177
},
{
"epoch": 0.10739064856711915,
"grad_norm": 1.9623062607289763,
"learning_rate": 1.9647851945018723e-05,
"loss": 2.3975,
"step": 178
},
{
"epoch": 0.10799396681749623,
"grad_norm": 1.7978825611011124,
"learning_rate": 1.9642723278953097e-05,
"loss": 2.3132,
"step": 179
},
{
"epoch": 0.1085972850678733,
"grad_norm": 1.8053266937593022,
"learning_rate": 1.9637558214740618e-05,
"loss": 2.465,
"step": 180
},
{
"epoch": 0.10920060331825038,
"grad_norm": 1.889779034267551,
"learning_rate": 1.9632356771877735e-05,
"loss": 2.3872,
"step": 181
},
{
"epoch": 0.10980392156862745,
"grad_norm": 1.8803842729323514,
"learning_rate": 1.9627118969998204e-05,
"loss": 2.3778,
"step": 182
},
{
"epoch": 0.11040723981900452,
"grad_norm": 1.9505864821578314,
"learning_rate": 1.9621844828873024e-05,
"loss": 2.4465,
"step": 183
},
{
"epoch": 0.11101055806938159,
"grad_norm": 1.815874396814083,
"learning_rate": 1.9616534368410364e-05,
"loss": 2.376,
"step": 184
},
{
"epoch": 0.11161387631975868,
"grad_norm": 2.0291364738085362,
"learning_rate": 1.9611187608655484e-05,
"loss": 2.4539,
"step": 185
},
{
"epoch": 0.11221719457013575,
"grad_norm": 1.7293291522046839,
"learning_rate": 1.9605804569790667e-05,
"loss": 2.3774,
"step": 186
},
{
"epoch": 0.11282051282051282,
"grad_norm": 1.8271832077043337,
"learning_rate": 1.9600385272135133e-05,
"loss": 2.3419,
"step": 187
},
{
"epoch": 0.1134238310708899,
"grad_norm": 1.9073281870355994,
"learning_rate": 1.9594929736144978e-05,
"loss": 2.4058,
"step": 188
},
{
"epoch": 0.11402714932126697,
"grad_norm": 1.8920093693507751,
"learning_rate": 1.958943798241306e-05,
"loss": 2.3835,
"step": 189
},
{
"epoch": 0.11463046757164404,
"grad_norm": 1.9721387572671636,
"learning_rate": 1.9583910031668984e-05,
"loss": 2.4108,
"step": 190
},
{
"epoch": 0.11523378582202111,
"grad_norm": 1.8457198854743908,
"learning_rate": 1.9578345904778956e-05,
"loss": 2.4417,
"step": 191
},
{
"epoch": 0.1158371040723982,
"grad_norm": 2.111648646737359,
"learning_rate": 1.957274562274575e-05,
"loss": 2.4391,
"step": 192
},
{
"epoch": 0.11644042232277527,
"grad_norm": 2.3730141778195293,
"learning_rate": 1.9567109206708615e-05,
"loss": 2.3304,
"step": 193
},
{
"epoch": 0.11704374057315234,
"grad_norm": 2.0315724564258373,
"learning_rate": 1.9561436677943183e-05,
"loss": 2.3952,
"step": 194
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.681575439138192,
"learning_rate": 1.955572805786141e-05,
"loss": 2.3506,
"step": 195
},
{
"epoch": 0.11825037707390648,
"grad_norm": 1.788871747181257,
"learning_rate": 1.954998336801148e-05,
"loss": 2.3203,
"step": 196
},
{
"epoch": 0.11885369532428355,
"grad_norm": 1.8503621082397483,
"learning_rate": 1.9544202630077733e-05,
"loss": 2.5021,
"step": 197
},
{
"epoch": 0.11945701357466064,
"grad_norm": 1.9772710060608363,
"learning_rate": 1.9538385865880574e-05,
"loss": 2.3863,
"step": 198
},
{
"epoch": 0.12006033182503771,
"grad_norm": 1.7920396633477924,
"learning_rate": 1.95325330973764e-05,
"loss": 2.3542,
"step": 199
},
{
"epoch": 0.12066365007541478,
"grad_norm": 1.9252370879367382,
"learning_rate": 1.9526644346657508e-05,
"loss": 2.388,
"step": 200
},
{
"epoch": 0.12126696832579185,
"grad_norm": 1.7917196698282978,
"learning_rate": 1.9520719635952015e-05,
"loss": 2.4271,
"step": 201
},
{
"epoch": 0.12187028657616893,
"grad_norm": 1.7496514390158402,
"learning_rate": 1.9514758987623784e-05,
"loss": 2.443,
"step": 202
},
{
"epoch": 0.122473604826546,
"grad_norm": 1.9458516540089497,
"learning_rate": 1.9508762424172326e-05,
"loss": 2.4568,
"step": 203
},
{
"epoch": 0.12307692307692308,
"grad_norm": 1.8063666366956526,
"learning_rate": 1.9502729968232718e-05,
"loss": 2.3891,
"step": 204
},
{
"epoch": 0.12368024132730016,
"grad_norm": 1.7536255425649965,
"learning_rate": 1.9496661642575517e-05,
"loss": 2.3578,
"step": 205
},
{
"epoch": 0.12428355957767723,
"grad_norm": 1.8450234221910247,
"learning_rate": 1.949055747010669e-05,
"loss": 2.3632,
"step": 206
},
{
"epoch": 0.1248868778280543,
"grad_norm": 1.8760594417203018,
"learning_rate": 1.9484417473867493e-05,
"loss": 2.3541,
"step": 207
},
{
"epoch": 0.12549019607843137,
"grad_norm": 1.7663666525781663,
"learning_rate": 1.9478241677034422e-05,
"loss": 2.3477,
"step": 208
},
{
"epoch": 0.12549019607843137,
"eval_loss": 2.3970625400543213,
"eval_runtime": 22.1187,
"eval_samples_per_second": 3.979,
"eval_steps_per_second": 0.497,
"step": 208
},
{
"epoch": 0.12609351432880844,
"grad_norm": 1.592698799935792,
"learning_rate": 1.9472030102919102e-05,
"loss": 2.3408,
"step": 209
},
{
"epoch": 0.12669683257918551,
"grad_norm": 1.7453187285581997,
"learning_rate": 1.946578277496821e-05,
"loss": 2.2977,
"step": 210
},
{
"epoch": 0.12730015082956259,
"grad_norm": 1.7030576182449984,
"learning_rate": 1.9459499716763376e-05,
"loss": 2.3966,
"step": 211
},
{
"epoch": 0.12790346907993966,
"grad_norm": 1.9418163249698999,
"learning_rate": 1.94531809520211e-05,
"loss": 2.3932,
"step": 212
},
{
"epoch": 0.12850678733031673,
"grad_norm": 1.9115180621965062,
"learning_rate": 1.944682650459267e-05,
"loss": 2.4014,
"step": 213
},
{
"epoch": 0.12911010558069383,
"grad_norm": 1.8587614833043946,
"learning_rate": 1.944043639846406e-05,
"loss": 2.4471,
"step": 214
},
{
"epoch": 0.1297134238310709,
"grad_norm": 2.015076007850878,
"learning_rate": 1.943401065775584e-05,
"loss": 2.4381,
"step": 215
},
{
"epoch": 0.13031674208144797,
"grad_norm": 1.915288111900487,
"learning_rate": 1.94275493067231e-05,
"loss": 2.2686,
"step": 216
},
{
"epoch": 0.13092006033182504,
"grad_norm": 1.9180430163339584,
"learning_rate": 1.9421052369755335e-05,
"loss": 2.3931,
"step": 217
},
{
"epoch": 0.13152337858220212,
"grad_norm": 1.9604487273843694,
"learning_rate": 1.9414519871376373e-05,
"loss": 2.4332,
"step": 218
},
{
"epoch": 0.1321266968325792,
"grad_norm": 1.9174581601925882,
"learning_rate": 1.940795183624427e-05,
"loss": 2.4211,
"step": 219
},
{
"epoch": 0.13273001508295626,
"grad_norm": 1.943376592580462,
"learning_rate": 1.940134828915123e-05,
"loss": 2.3691,
"step": 220
},
{
"epoch": 0.13333333333333333,
"grad_norm": 1.8170672134115446,
"learning_rate": 1.9394709255023488e-05,
"loss": 2.3705,
"step": 221
},
{
"epoch": 0.1339366515837104,
"grad_norm": 1.8996487790159449,
"learning_rate": 1.9388034758921247e-05,
"loss": 2.3874,
"step": 222
},
{
"epoch": 0.13453996983408748,
"grad_norm": 2.0789020898005166,
"learning_rate": 1.938132482603856e-05,
"loss": 2.4122,
"step": 223
},
{
"epoch": 0.13514328808446455,
"grad_norm": 2.3124371595134243,
"learning_rate": 1.9374579481703244e-05,
"loss": 2.3854,
"step": 224
},
{
"epoch": 0.13574660633484162,
"grad_norm": 2.2679685622803123,
"learning_rate": 1.936779875137678e-05,
"loss": 2.4774,
"step": 225
},
{
"epoch": 0.1363499245852187,
"grad_norm": 1.9820900351023627,
"learning_rate": 1.936098266065422e-05,
"loss": 2.4126,
"step": 226
},
{
"epoch": 0.1369532428355958,
"grad_norm": 1.7859284260510715,
"learning_rate": 1.93541312352641e-05,
"loss": 2.367,
"step": 227
},
{
"epoch": 0.13755656108597286,
"grad_norm": 1.7574332004410753,
"learning_rate": 1.934724450106831e-05,
"loss": 2.3353,
"step": 228
},
{
"epoch": 0.13815987933634993,
"grad_norm": 1.8454300169336622,
"learning_rate": 1.934032248406205e-05,
"loss": 2.4519,
"step": 229
},
{
"epoch": 0.138763197586727,
"grad_norm": 2.1927204914704,
"learning_rate": 1.9333365210373668e-05,
"loss": 2.5473,
"step": 230
},
{
"epoch": 0.13936651583710408,
"grad_norm": 2.0054427145982707,
"learning_rate": 1.9326372706264625e-05,
"loss": 2.3596,
"step": 231
},
{
"epoch": 0.13996983408748115,
"grad_norm": 1.8243562565754317,
"learning_rate": 1.9319344998129344e-05,
"loss": 2.3503,
"step": 232
},
{
"epoch": 0.14057315233785822,
"grad_norm": 2.002408622083468,
"learning_rate": 1.9312282112495146e-05,
"loss": 2.3835,
"step": 233
},
{
"epoch": 0.1411764705882353,
"grad_norm": 2.067448917796081,
"learning_rate": 1.9305184076022117e-05,
"loss": 2.4941,
"step": 234
},
{
"epoch": 0.14177978883861236,
"grad_norm": 2.071059748009206,
"learning_rate": 1.9298050915503053e-05,
"loss": 2.3011,
"step": 235
},
{
"epoch": 0.14238310708898944,
"grad_norm": 1.8223690330691251,
"learning_rate": 1.929088265786331e-05,
"loss": 2.3448,
"step": 236
},
{
"epoch": 0.1429864253393665,
"grad_norm": 1.9649289609379725,
"learning_rate": 1.9283679330160726e-05,
"loss": 2.404,
"step": 237
},
{
"epoch": 0.14358974358974358,
"grad_norm": 1.8402195523194098,
"learning_rate": 1.9276440959585533e-05,
"loss": 2.2817,
"step": 238
},
{
"epoch": 0.14419306184012065,
"grad_norm": 1.8343107897406508,
"learning_rate": 1.926916757346022e-05,
"loss": 2.3819,
"step": 239
},
{
"epoch": 0.14479638009049775,
"grad_norm": 1.990742200302091,
"learning_rate": 1.926185919923946e-05,
"loss": 2.3836,
"step": 240
},
{
"epoch": 0.14539969834087482,
"grad_norm": 2.18088582410677,
"learning_rate": 1.9254515864509982e-05,
"loss": 2.4103,
"step": 241
},
{
"epoch": 0.1460030165912519,
"grad_norm": 1.87907229653626,
"learning_rate": 1.92471375969905e-05,
"loss": 2.3973,
"step": 242
},
{
"epoch": 0.14660633484162897,
"grad_norm": 1.8475576105015994,
"learning_rate": 1.9239724424531575e-05,
"loss": 2.3917,
"step": 243
},
{
"epoch": 0.14720965309200604,
"grad_norm": 2.055658433766295,
"learning_rate": 1.9232276375115517e-05,
"loss": 2.3383,
"step": 244
},
{
"epoch": 0.1478129713423831,
"grad_norm": 2.1041231379062935,
"learning_rate": 1.9224793476856293e-05,
"loss": 2.3808,
"step": 245
},
{
"epoch": 0.14841628959276018,
"grad_norm": 2.072538470281817,
"learning_rate": 1.9217275757999418e-05,
"loss": 2.3157,
"step": 246
},
{
"epoch": 0.14901960784313725,
"grad_norm": 1.8558398330207053,
"learning_rate": 1.9209723246921837e-05,
"loss": 2.4278,
"step": 247
},
{
"epoch": 0.14962292609351432,
"grad_norm": 1.8255106731337276,
"learning_rate": 1.920213597213182e-05,
"loss": 2.396,
"step": 248
},
{
"epoch": 0.1502262443438914,
"grad_norm": 2.29854070332297,
"learning_rate": 1.9194513962268865e-05,
"loss": 2.3833,
"step": 249
},
{
"epoch": 0.15082956259426847,
"grad_norm": 1.997730060781554,
"learning_rate": 1.9186857246103586e-05,
"loss": 2.4327,
"step": 250
},
{
"epoch": 0.15143288084464554,
"grad_norm": 1.8277428582585862,
"learning_rate": 1.9179165852537596e-05,
"loss": 2.4328,
"step": 251
},
{
"epoch": 0.1520361990950226,
"grad_norm": 1.8068053318064188,
"learning_rate": 1.9171439810603406e-05,
"loss": 2.3604,
"step": 252
},
{
"epoch": 0.1526395173453997,
"grad_norm": 1.9030580310166223,
"learning_rate": 1.9163679149464313e-05,
"loss": 2.4525,
"step": 253
},
{
"epoch": 0.15324283559577678,
"grad_norm": 2.148689163299343,
"learning_rate": 1.9155883898414292e-05,
"loss": 2.4487,
"step": 254
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.636326374028854,
"learning_rate": 1.9148054086877884e-05,
"loss": 2.4019,
"step": 255
},
{
"epoch": 0.15444947209653093,
"grad_norm": 2.2875442599864138,
"learning_rate": 1.914018974441008e-05,
"loss": 2.3296,
"step": 256
},
{
"epoch": 0.155052790346908,
"grad_norm": 1.9038065532836297,
"learning_rate": 1.913229090069622e-05,
"loss": 2.3768,
"step": 257
},
{
"epoch": 0.15565610859728507,
"grad_norm": 1.799523347575496,
"learning_rate": 1.9124357585551872e-05,
"loss": 2.3565,
"step": 258
},
{
"epoch": 0.15625942684766214,
"grad_norm": 2.554360566381463,
"learning_rate": 1.9116389828922717e-05,
"loss": 2.501,
"step": 259
},
{
"epoch": 0.1568627450980392,
"grad_norm": 2.1864975862465603,
"learning_rate": 1.9108387660884456e-05,
"loss": 2.3677,
"step": 260
},
{
"epoch": 0.15746606334841629,
"grad_norm": 1.9958565133131163,
"learning_rate": 1.9100351111642666e-05,
"loss": 2.41,
"step": 261
},
{
"epoch": 0.15806938159879336,
"grad_norm": 1.7747256573037626,
"learning_rate": 1.9092280211532715e-05,
"loss": 2.3605,
"step": 262
},
{
"epoch": 0.15867269984917043,
"grad_norm": 1.6053666596618383,
"learning_rate": 1.9084174991019622e-05,
"loss": 2.3649,
"step": 263
},
{
"epoch": 0.1592760180995475,
"grad_norm": 1.7529886672743595,
"learning_rate": 1.9076035480697964e-05,
"loss": 2.3464,
"step": 264
},
{
"epoch": 0.15987933634992457,
"grad_norm": 1.8311834709458998,
"learning_rate": 1.9067861711291744e-05,
"loss": 2.4533,
"step": 265
},
{
"epoch": 0.16048265460030167,
"grad_norm": 1.7726486534434671,
"learning_rate": 1.905965371365429e-05,
"loss": 2.4056,
"step": 266
},
{
"epoch": 0.16108597285067874,
"grad_norm": 2.0550030984404213,
"learning_rate": 1.9051411518768126e-05,
"loss": 2.394,
"step": 267
},
{
"epoch": 0.16168929110105582,
"grad_norm": 1.7669710679260708,
"learning_rate": 1.9043135157744853e-05,
"loss": 2.3723,
"step": 268
},
{
"epoch": 0.1622926093514329,
"grad_norm": 1.8777975727584983,
"learning_rate": 1.9034824661825048e-05,
"loss": 2.3603,
"step": 269
},
{
"epoch": 0.16289592760180996,
"grad_norm": 1.935796347721567,
"learning_rate": 1.9026480062378136e-05,
"loss": 2.3601,
"step": 270
},
{
"epoch": 0.16349924585218703,
"grad_norm": 2.010363683111041,
"learning_rate": 1.9018101390902262e-05,
"loss": 2.4271,
"step": 271
},
{
"epoch": 0.1641025641025641,
"grad_norm": 1.8516623115874755,
"learning_rate": 1.900968867902419e-05,
"loss": 2.3637,
"step": 272
},
{
"epoch": 0.16470588235294117,
"grad_norm": 1.788273551595415,
"learning_rate": 1.900124195849918e-05,
"loss": 2.3673,
"step": 273
},
{
"epoch": 0.16530920060331825,
"grad_norm": 1.811773390862476,
"learning_rate": 1.8992761261210848e-05,
"loss": 2.4597,
"step": 274
},
{
"epoch": 0.16591251885369532,
"grad_norm": 2.0646022057034723,
"learning_rate": 1.8984246619171075e-05,
"loss": 2.3945,
"step": 275
},
{
"epoch": 0.1665158371040724,
"grad_norm": 1.8228637291719356,
"learning_rate": 1.8975698064519865e-05,
"loss": 2.3747,
"step": 276
},
{
"epoch": 0.16711915535444946,
"grad_norm": 2.0357715504851033,
"learning_rate": 1.8967115629525238e-05,
"loss": 2.4305,
"step": 277
},
{
"epoch": 0.16772247360482653,
"grad_norm": 1.8806892592912532,
"learning_rate": 1.8958499346583092e-05,
"loss": 2.4659,
"step": 278
},
{
"epoch": 0.16832579185520363,
"grad_norm": 1.8299663755931939,
"learning_rate": 1.89498492482171e-05,
"loss": 2.2796,
"step": 279
},
{
"epoch": 0.1689291101055807,
"grad_norm": 1.8549052690023438,
"learning_rate": 1.894116536707857e-05,
"loss": 2.4846,
"step": 280
},
{
"epoch": 0.16953242835595778,
"grad_norm": 2.01979326739457,
"learning_rate": 1.8932447735946332e-05,
"loss": 2.3653,
"step": 281
},
{
"epoch": 0.17013574660633485,
"grad_norm": 1.921195088621264,
"learning_rate": 1.892369638772661e-05,
"loss": 2.401,
"step": 282
},
{
"epoch": 0.17073906485671192,
"grad_norm": 2.1371043076225087,
"learning_rate": 1.8914911355452895e-05,
"loss": 2.3625,
"step": 283
},
{
"epoch": 0.171342383107089,
"grad_norm": 1.7344453161696207,
"learning_rate": 1.8906092672285842e-05,
"loss": 2.3967,
"step": 284
},
{
"epoch": 0.17194570135746606,
"grad_norm": 1.8690649602037965,
"learning_rate": 1.8897240371513098e-05,
"loss": 2.4613,
"step": 285
},
{
"epoch": 0.17254901960784313,
"grad_norm": 1.7407275349017972,
"learning_rate": 1.8888354486549238e-05,
"loss": 2.3812,
"step": 286
},
{
"epoch": 0.1731523378582202,
"grad_norm": 2.04205458370752,
"learning_rate": 1.8879435050935577e-05,
"loss": 2.3417,
"step": 287
},
{
"epoch": 0.17375565610859728,
"grad_norm": 1.7278652356135291,
"learning_rate": 1.887048209834009e-05,
"loss": 2.3437,
"step": 288
},
{
"epoch": 0.17435897435897435,
"grad_norm": 1.7583874021257075,
"learning_rate": 1.8861495662557264e-05,
"loss": 2.4033,
"step": 289
},
{
"epoch": 0.17496229260935142,
"grad_norm": 1.863241146572609,
"learning_rate": 1.8852475777507983e-05,
"loss": 2.3677,
"step": 290
},
{
"epoch": 0.1755656108597285,
"grad_norm": 1.8653641337953326,
"learning_rate": 1.8843422477239362e-05,
"loss": 2.456,
"step": 291
},
{
"epoch": 0.1761689291101056,
"grad_norm": 1.7776126571499877,
"learning_rate": 1.8834335795924686e-05,
"loss": 2.4091,
"step": 292
},
{
"epoch": 0.17677224736048266,
"grad_norm": 1.7570559673454416,
"learning_rate": 1.8825215767863215e-05,
"loss": 2.4307,
"step": 293
},
{
"epoch": 0.17737556561085974,
"grad_norm": 1.9052460328013197,
"learning_rate": 1.881606242748009e-05,
"loss": 2.407,
"step": 294
},
{
"epoch": 0.1779788838612368,
"grad_norm": 1.972445114283892,
"learning_rate": 1.8806875809326204e-05,
"loss": 2.3553,
"step": 295
},
{
"epoch": 0.17858220211161388,
"grad_norm": 1.7997390190238367,
"learning_rate": 1.879765594807805e-05,
"loss": 2.4215,
"step": 296
},
{
"epoch": 0.17918552036199095,
"grad_norm": 2.058979279045622,
"learning_rate": 1.878840287853761e-05,
"loss": 2.3919,
"step": 297
},
{
"epoch": 0.17978883861236802,
"grad_norm": 1.7717249063838723,
"learning_rate": 1.877911663563221e-05,
"loss": 2.4244,
"step": 298
},
{
"epoch": 0.1803921568627451,
"grad_norm": 2.0093236997931463,
"learning_rate": 1.8769797254414406e-05,
"loss": 2.4132,
"step": 299
},
{
"epoch": 0.18099547511312217,
"grad_norm": 1.8173301125856807,
"learning_rate": 1.876044477006183e-05,
"loss": 2.4322,
"step": 300
},
{
"epoch": 0.18159879336349924,
"grad_norm": 2.152436375631988,
"learning_rate": 1.875105921787707e-05,
"loss": 2.3923,
"step": 301
},
{
"epoch": 0.1822021116138763,
"grad_norm": 1.8521342794831184,
"learning_rate": 1.874164063328754e-05,
"loss": 2.346,
"step": 302
},
{
"epoch": 0.18280542986425338,
"grad_norm": 1.8949500548076719,
"learning_rate": 1.8732189051845328e-05,
"loss": 2.4535,
"step": 303
},
{
"epoch": 0.18340874811463045,
"grad_norm": 1.8548204739343872,
"learning_rate": 1.8722704509227094e-05,
"loss": 2.3634,
"step": 304
},
{
"epoch": 0.18401206636500755,
"grad_norm": 2.0044853401637748,
"learning_rate": 1.8713187041233896e-05,
"loss": 2.353,
"step": 305
},
{
"epoch": 0.18461538461538463,
"grad_norm": 1.8962192812988332,
"learning_rate": 1.8703636683791084e-05,
"loss": 2.347,
"step": 306
},
{
"epoch": 0.1852187028657617,
"grad_norm": 1.8762931478739766,
"learning_rate": 1.8694053472948154e-05,
"loss": 2.4613,
"step": 307
},
{
"epoch": 0.18582202111613877,
"grad_norm": 1.775862031858868,
"learning_rate": 1.868443744487862e-05,
"loss": 2.36,
"step": 308
},
{
"epoch": 0.18642533936651584,
"grad_norm": 1.7944201096764627,
"learning_rate": 1.8674788635879848e-05,
"loss": 2.449,
"step": 309
},
{
"epoch": 0.1870286576168929,
"grad_norm": 1.8128871531412751,
"learning_rate": 1.866510708237297e-05,
"loss": 2.4367,
"step": 310
},
{
"epoch": 0.18763197586726998,
"grad_norm": 1.7802612520365133,
"learning_rate": 1.8655392820902695e-05,
"loss": 2.3369,
"step": 311
},
{
"epoch": 0.18823529411764706,
"grad_norm": 2.2309703895171364,
"learning_rate": 1.8645645888137213e-05,
"loss": 2.4264,
"step": 312
},
{
"epoch": 0.18883861236802413,
"grad_norm": 1.7727549439698453,
"learning_rate": 1.8635866320868023e-05,
"loss": 2.3847,
"step": 313
},
{
"epoch": 0.1894419306184012,
"grad_norm": 1.8039382080690232,
"learning_rate": 1.8626054156009807e-05,
"loss": 2.325,
"step": 314
},
{
"epoch": 0.19004524886877827,
"grad_norm": 1.877849837240779,
"learning_rate": 1.861620943060031e-05,
"loss": 2.4105,
"step": 315
},
{
"epoch": 0.19064856711915534,
"grad_norm": 1.8964244614031414,
"learning_rate": 1.8606332181800165e-05,
"loss": 2.3869,
"step": 316
},
{
"epoch": 0.19125188536953242,
"grad_norm": 1.7787737145042568,
"learning_rate": 1.8596422446892774e-05,
"loss": 2.3289,
"step": 317
},
{
"epoch": 0.19185520361990951,
"grad_norm": 1.8520073770815342,
"learning_rate": 1.8586480263284174e-05,
"loss": 2.3654,
"step": 318
},
{
"epoch": 0.1924585218702866,
"grad_norm": 2.0189483743742196,
"learning_rate": 1.8576505668502872e-05,
"loss": 2.4469,
"step": 319
},
{
"epoch": 0.19306184012066366,
"grad_norm": 1.739253673937665,
"learning_rate": 1.856649870019972e-05,
"loss": 2.3625,
"step": 320
},
{
"epoch": 0.19366515837104073,
"grad_norm": 1.773893768635032,
"learning_rate": 1.8556459396147777e-05,
"loss": 2.4035,
"step": 321
},
{
"epoch": 0.1942684766214178,
"grad_norm": 1.7946696459095224,
"learning_rate": 1.8546387794242148e-05,
"loss": 2.376,
"step": 322
},
{
"epoch": 0.19487179487179487,
"grad_norm": 1.8538835498926256,
"learning_rate": 1.853628393249986e-05,
"loss": 2.3625,
"step": 323
},
{
"epoch": 0.19547511312217195,
"grad_norm": 1.797967680017188,
"learning_rate": 1.8526147849059705e-05,
"loss": 2.4747,
"step": 324
},
{
"epoch": 0.19607843137254902,
"grad_norm": 1.9224740564061376,
"learning_rate": 1.8515979582182112e-05,
"loss": 2.3916,
"step": 325
},
{
"epoch": 0.1966817496229261,
"grad_norm": 2.0564977386941985,
"learning_rate": 1.8505779170248978e-05,
"loss": 2.3166,
"step": 326
},
{
"epoch": 0.19728506787330316,
"grad_norm": 1.808289138294294,
"learning_rate": 1.849554665176354e-05,
"loss": 2.4381,
"step": 327
},
{
"epoch": 0.19788838612368023,
"grad_norm": 1.8842853314907695,
"learning_rate": 1.8485282065350237e-05,
"loss": 2.3487,
"step": 328
},
{
"epoch": 0.1984917043740573,
"grad_norm": 1.8142033636317016,
"learning_rate": 1.8474985449754543e-05,
"loss": 2.4145,
"step": 329
},
{
"epoch": 0.19909502262443438,
"grad_norm": 2.034938580816979,
"learning_rate": 1.8464656843842837e-05,
"loss": 2.3543,
"step": 330
},
{
"epoch": 0.19969834087481148,
"grad_norm": 2.2995536408204083,
"learning_rate": 1.845429628660225e-05,
"loss": 2.4052,
"step": 331
},
{
"epoch": 0.20030165912518855,
"grad_norm": 1.9107922335599887,
"learning_rate": 1.8443903817140517e-05,
"loss": 2.331,
"step": 332
},
{
"epoch": 0.20090497737556562,
"grad_norm": 1.8530530610478275,
"learning_rate": 1.8433479474685837e-05,
"loss": 2.391,
"step": 333
},
{
"epoch": 0.2015082956259427,
"grad_norm": 1.7772675731075585,
"learning_rate": 1.8423023298586716e-05,
"loss": 2.3647,
"step": 334
},
{
"epoch": 0.20211161387631976,
"grad_norm": 1.7438120884628647,
"learning_rate": 1.8412535328311813e-05,
"loss": 2.4638,
"step": 335
},
{
"epoch": 0.20271493212669683,
"grad_norm": 1.939231045048208,
"learning_rate": 1.8402015603449814e-05,
"loss": 2.3401,
"step": 336
},
{
"epoch": 0.2033182503770739,
"grad_norm": 2.1156637551867075,
"learning_rate": 1.839146416370926e-05,
"loss": 2.3731,
"step": 337
},
{
"epoch": 0.20392156862745098,
"grad_norm": 1.721959556886391,
"learning_rate": 1.8380881048918406e-05,
"loss": 2.3552,
"step": 338
},
{
"epoch": 0.20452488687782805,
"grad_norm": 1.7740805853861341,
"learning_rate": 1.8370266299025076e-05,
"loss": 2.3339,
"step": 339
},
{
"epoch": 0.20512820512820512,
"grad_norm": 1.8222550385292156,
"learning_rate": 1.8359619954096497e-05,
"loss": 2.3466,
"step": 340
},
{
"epoch": 0.2057315233785822,
"grad_norm": 1.9260210550895165,
"learning_rate": 1.8348942054319164e-05,
"loss": 2.3875,
"step": 341
},
{
"epoch": 0.20633484162895926,
"grad_norm": 2.050552910276077,
"learning_rate": 1.8338232639998672e-05,
"loss": 2.3296,
"step": 342
},
{
"epoch": 0.20693815987933634,
"grad_norm": 1.9365149897495877,
"learning_rate": 1.832749175155959e-05,
"loss": 2.3629,
"step": 343
},
{
"epoch": 0.20754147812971344,
"grad_norm": 1.7559234344283723,
"learning_rate": 1.8316719429545277e-05,
"loss": 2.4042,
"step": 344
},
{
"epoch": 0.2081447963800905,
"grad_norm": 1.9011593642730806,
"learning_rate": 1.8305915714617745e-05,
"loss": 2.4089,
"step": 345
},
{
"epoch": 0.20874811463046758,
"grad_norm": 1.8581574541544932,
"learning_rate": 1.8295080647557507e-05,
"loss": 2.4676,
"step": 346
},
{
"epoch": 0.20935143288084465,
"grad_norm": 1.669412385871575,
"learning_rate": 1.828421426926343e-05,
"loss": 2.4181,
"step": 347
},
{
"epoch": 0.20995475113122172,
"grad_norm": 1.7111028859089021,
"learning_rate": 1.8273316620752548e-05,
"loss": 2.3814,
"step": 348
},
{
"epoch": 0.2105580693815988,
"grad_norm": 1.8094190923284823,
"learning_rate": 1.826238774315995e-05,
"loss": 2.368,
"step": 349
},
{
"epoch": 0.21116138763197587,
"grad_norm": 1.9349580280278726,
"learning_rate": 1.8251427677738596e-05,
"loss": 2.4115,
"step": 350
},
{
"epoch": 0.21176470588235294,
"grad_norm": 1.9056928493906655,
"learning_rate": 1.824043646585917e-05,
"loss": 2.3545,
"step": 351
},
{
"epoch": 0.21236802413273,
"grad_norm": 1.6630940301503085,
"learning_rate": 1.822941414900993e-05,
"loss": 2.3402,
"step": 352
},
{
"epoch": 0.21297134238310708,
"grad_norm": 1.7549145614861923,
"learning_rate": 1.8218360768796534e-05,
"loss": 2.4039,
"step": 353
},
{
"epoch": 0.21357466063348415,
"grad_norm": 1.8427236861415082,
"learning_rate": 1.8207276366941905e-05,
"loss": 2.3381,
"step": 354
},
{
"epoch": 0.21417797888386123,
"grad_norm": 2.235373185504055,
"learning_rate": 1.8196160985286052e-05,
"loss": 2.3765,
"step": 355
},
{
"epoch": 0.2147812971342383,
"grad_norm": 1.770871765959977,
"learning_rate": 1.8185014665785936e-05,
"loss": 2.3681,
"step": 356
},
{
"epoch": 0.2153846153846154,
"grad_norm": 1.8385630389005978,
"learning_rate": 1.8173837450515286e-05,
"loss": 2.4128,
"step": 357
},
{
"epoch": 0.21598793363499247,
"grad_norm": 1.7678911062682983,
"learning_rate": 1.816262938166446e-05,
"loss": 2.3674,
"step": 358
},
{
"epoch": 0.21659125188536954,
"grad_norm": 1.9123835544861427,
"learning_rate": 1.8151390501540276e-05,
"loss": 2.3439,
"step": 359
},
{
"epoch": 0.2171945701357466,
"grad_norm": 2.0786843839491533,
"learning_rate": 1.814012085256585e-05,
"loss": 2.4165,
"step": 360
},
{
"epoch": 0.21779788838612368,
"grad_norm": 1.8542968655837673,
"learning_rate": 1.812882047728045e-05,
"loss": 2.3766,
"step": 361
},
{
"epoch": 0.21840120663650076,
"grad_norm": 1.9816967730585244,
"learning_rate": 1.8117489418339317e-05,
"loss": 2.4764,
"step": 362
},
{
"epoch": 0.21900452488687783,
"grad_norm": 1.7957823857988076,
"learning_rate": 1.810612771851352e-05,
"loss": 2.3854,
"step": 363
},
{
"epoch": 0.2196078431372549,
"grad_norm": 1.6610433514447112,
"learning_rate": 1.8094735420689776e-05,
"loss": 2.3259,
"step": 364
},
{
"epoch": 0.22021116138763197,
"grad_norm": 1.8308638860416302,
"learning_rate": 1.8083312567870315e-05,
"loss": 2.3721,
"step": 365
},
{
"epoch": 0.22081447963800904,
"grad_norm": 1.8325626958437702,
"learning_rate": 1.8071859203172694e-05,
"loss": 2.377,
"step": 366
},
{
"epoch": 0.22141779788838611,
"grad_norm": 1.9547508634110777,
"learning_rate": 1.8060375369829634e-05,
"loss": 2.4247,
"step": 367
},
{
"epoch": 0.22202111613876319,
"grad_norm": 1.8649123365121487,
"learning_rate": 1.8048861111188886e-05,
"loss": 2.3844,
"step": 368
},
{
"epoch": 0.22262443438914026,
"grad_norm": 1.74624962518136,
"learning_rate": 1.803731647071303e-05,
"loss": 2.4366,
"step": 369
},
{
"epoch": 0.22322775263951736,
"grad_norm": 1.8219871819075906,
"learning_rate": 1.8025741491979326e-05,
"loss": 2.3105,
"step": 370
},
{
"epoch": 0.22383107088989443,
"grad_norm": 2.1182021323134954,
"learning_rate": 1.8014136218679566e-05,
"loss": 2.318,
"step": 371
},
{
"epoch": 0.2244343891402715,
"grad_norm": 1.7494056659050516,
"learning_rate": 1.8002500694619884e-05,
"loss": 2.3206,
"step": 372
},
{
"epoch": 0.22503770739064857,
"grad_norm": 2.2164869617170266,
"learning_rate": 1.79908349637206e-05,
"loss": 2.4067,
"step": 373
},
{
"epoch": 0.22564102564102564,
"grad_norm": 1.9630568714796657,
"learning_rate": 1.7979139070016054e-05,
"loss": 2.3345,
"step": 374
},
{
"epoch": 0.22624434389140272,
"grad_norm": 1.994047154202555,
"learning_rate": 1.7967413057654452e-05,
"loss": 2.4362,
"step": 375
},
{
"epoch": 0.2268476621417798,
"grad_norm": 1.9394175558405136,
"learning_rate": 1.7955656970897673e-05,
"loss": 2.427,
"step": 376
},
{
"epoch": 0.22745098039215686,
"grad_norm": 2.143445611041672,
"learning_rate": 1.7943870854121126e-05,
"loss": 2.4238,
"step": 377
},
{
"epoch": 0.22805429864253393,
"grad_norm": 2.3004226826510754,
"learning_rate": 1.7932054751813574e-05,
"loss": 2.3988,
"step": 378
},
{
"epoch": 0.228657616892911,
"grad_norm": 1.7127745874017524,
"learning_rate": 1.7920208708576962e-05,
"loss": 2.3783,
"step": 379
},
{
"epoch": 0.22926093514328807,
"grad_norm": 1.8221494141382277,
"learning_rate": 1.7908332769126255e-05,
"loss": 2.4106,
"step": 380
},
{
"epoch": 0.22986425339366515,
"grad_norm": 2.0184574556961485,
"learning_rate": 1.7896426978289266e-05,
"loss": 2.3633,
"step": 381
},
{
"epoch": 0.23046757164404222,
"grad_norm": 1.8606731026968055,
"learning_rate": 1.788449138100648e-05,
"loss": 2.4626,
"step": 382
},
{
"epoch": 0.23107088989441932,
"grad_norm": 1.882291435136788,
"learning_rate": 1.7872526022330902e-05,
"loss": 2.4771,
"step": 383
},
{
"epoch": 0.2316742081447964,
"grad_norm": 1.899729423493797,
"learning_rate": 1.7860530947427878e-05,
"loss": 2.4091,
"step": 384
},
{
"epoch": 0.23227752639517346,
"grad_norm": 1.706959359533271,
"learning_rate": 1.784850620157491e-05,
"loss": 2.3219,
"step": 385
},
{
"epoch": 0.23288084464555053,
"grad_norm": 1.9187456706408401,
"learning_rate": 1.7836451830161508e-05,
"loss": 2.3767,
"step": 386
},
{
"epoch": 0.2334841628959276,
"grad_norm": 1.9626088909394706,
"learning_rate": 1.782436787868901e-05,
"loss": 2.3429,
"step": 387
},
{
"epoch": 0.23408748114630468,
"grad_norm": 1.9942190856904858,
"learning_rate": 1.7812254392770404e-05,
"loss": 2.4339,
"step": 388
},
{
"epoch": 0.23469079939668175,
"grad_norm": 1.7936311787007881,
"learning_rate": 1.7800111418130157e-05,
"loss": 2.4009,
"step": 389
},
{
"epoch": 0.23529411764705882,
"grad_norm": 1.7321179132501958,
"learning_rate": 1.7787939000604063e-05,
"loss": 2.3799,
"step": 390
},
{
"epoch": 0.2358974358974359,
"grad_norm": 1.862467975513408,
"learning_rate": 1.777573718613904e-05,
"loss": 2.5027,
"step": 391
},
{
"epoch": 0.23650075414781296,
"grad_norm": 1.9080097798672786,
"learning_rate": 1.7763506020792968e-05,
"loss": 2.3809,
"step": 392
},
{
"epoch": 0.23710407239819004,
"grad_norm": 1.8141940878625136,
"learning_rate": 1.775124555073452e-05,
"loss": 2.3633,
"step": 393
},
{
"epoch": 0.2377073906485671,
"grad_norm": 1.906573233645706,
"learning_rate": 1.773895582224299e-05,
"loss": 2.4482,
"step": 394
},
{
"epoch": 0.2383107088989442,
"grad_norm": 1.957367683643951,
"learning_rate": 1.7726636881708114e-05,
"loss": 2.2908,
"step": 395
},
{
"epoch": 0.23891402714932128,
"grad_norm": 1.7675271917750348,
"learning_rate": 1.771428877562988e-05,
"loss": 2.3769,
"step": 396
},
{
"epoch": 0.23951734539969835,
"grad_norm": 2.042449439254776,
"learning_rate": 1.7701911550618383e-05,
"loss": 2.3632,
"step": 397
},
{
"epoch": 0.24012066365007542,
"grad_norm": 2.3068254018217065,
"learning_rate": 1.768950525339362e-05,
"loss": 2.4363,
"step": 398
},
{
"epoch": 0.2407239819004525,
"grad_norm": 2.0732986884998224,
"learning_rate": 1.7677069930785338e-05,
"loss": 2.3961,
"step": 399
},
{
"epoch": 0.24132730015082957,
"grad_norm": 1.7517108830779222,
"learning_rate": 1.7664605629732832e-05,
"loss": 2.3616,
"step": 400
},
{
"epoch": 0.24193061840120664,
"grad_norm": 1.76566025562354,
"learning_rate": 1.765211239728479e-05,
"loss": 2.3751,
"step": 401
},
{
"epoch": 0.2425339366515837,
"grad_norm": 1.9790147630767292,
"learning_rate": 1.7639590280599107e-05,
"loss": 2.2897,
"step": 402
},
{
"epoch": 0.24313725490196078,
"grad_norm": 1.9354803283654394,
"learning_rate": 1.7627039326942702e-05,
"loss": 2.3675,
"step": 403
},
{
"epoch": 0.24374057315233785,
"grad_norm": 2.0127286733426613,
"learning_rate": 1.7614459583691346e-05,
"loss": 2.3644,
"step": 404
},
{
"epoch": 0.24434389140271492,
"grad_norm": 2.0183875068444004,
"learning_rate": 1.7601851098329484e-05,
"loss": 2.3722,
"step": 405
},
{
"epoch": 0.244947209653092,
"grad_norm": 1.7880232703698695,
"learning_rate": 1.758921391845005e-05,
"loss": 2.3411,
"step": 406
},
{
"epoch": 0.24555052790346907,
"grad_norm": 1.9587130275464881,
"learning_rate": 1.757654809175429e-05,
"loss": 2.4508,
"step": 407
},
{
"epoch": 0.24615384615384617,
"grad_norm": 1.62012638567783,
"learning_rate": 1.7563853666051586e-05,
"loss": 2.3503,
"step": 408
},
{
"epoch": 0.24675716440422324,
"grad_norm": 1.883736203148245,
"learning_rate": 1.7551130689259272e-05,
"loss": 2.3349,
"step": 409
},
{
"epoch": 0.2473604826546003,
"grad_norm": 2.0210348824514885,
"learning_rate": 1.7538379209402442e-05,
"loss": 2.4396,
"step": 410
},
{
"epoch": 0.24796380090497738,
"grad_norm": 1.922814654639588,
"learning_rate": 1.7525599274613798e-05,
"loss": 2.3656,
"step": 411
},
{
"epoch": 0.24856711915535445,
"grad_norm": 1.8260728794891592,
"learning_rate": 1.7512790933133435e-05,
"loss": 2.3812,
"step": 412
},
{
"epoch": 0.24917043740573153,
"grad_norm": 2.2119732212101426,
"learning_rate": 1.7499954233308686e-05,
"loss": 2.3182,
"step": 413
},
{
"epoch": 0.2497737556561086,
"grad_norm": 1.7730977493628404,
"learning_rate": 1.7487089223593913e-05,
"loss": 2.3445,
"step": 414
},
{
"epoch": 0.25037707390648567,
"grad_norm": 1.9928063308228516,
"learning_rate": 1.7474195952550355e-05,
"loss": 2.3702,
"step": 415
},
{
"epoch": 0.25098039215686274,
"grad_norm": 1.9034844033014646,
"learning_rate": 1.7461274468845917e-05,
"loss": 2.4197,
"step": 416
},
{
"epoch": 0.25098039215686274,
"eval_loss": 2.3942031860351562,
"eval_runtime": 21.4634,
"eval_samples_per_second": 4.1,
"eval_steps_per_second": 0.513,
"step": 416
},
{
"epoch": 0.2515837104072398,
"grad_norm": 1.8947314580332886,
"learning_rate": 1.7448324821255e-05,
"loss": 2.3497,
"step": 417
},
{
"epoch": 0.2521870286576169,
"grad_norm": 1.709832121306641,
"learning_rate": 1.7435347058658317e-05,
"loss": 2.3237,
"step": 418
},
{
"epoch": 0.25279034690799396,
"grad_norm": 1.9912783646331091,
"learning_rate": 1.74223412300427e-05,
"loss": 2.4166,
"step": 419
},
{
"epoch": 0.25339366515837103,
"grad_norm": 1.7460156364263784,
"learning_rate": 1.7409307384500932e-05,
"loss": 2.3867,
"step": 420
},
{
"epoch": 0.2539969834087481,
"grad_norm": 1.8538623245046475,
"learning_rate": 1.7396245571231546e-05,
"loss": 2.3734,
"step": 421
},
{
"epoch": 0.25460030165912517,
"grad_norm": 1.846509710074018,
"learning_rate": 1.7383155839538634e-05,
"loss": 2.4071,
"step": 422
},
{
"epoch": 0.25520361990950224,
"grad_norm": 1.7825358596959273,
"learning_rate": 1.7370038238831682e-05,
"loss": 2.3877,
"step": 423
},
{
"epoch": 0.2558069381598793,
"grad_norm": 1.678243810735703,
"learning_rate": 1.7356892818625374e-05,
"loss": 2.3365,
"step": 424
},
{
"epoch": 0.2564102564102564,
"grad_norm": 1.748549957580436,
"learning_rate": 1.7343719628539396e-05,
"loss": 2.4936,
"step": 425
},
{
"epoch": 0.25701357466063346,
"grad_norm": 1.8214141054998045,
"learning_rate": 1.7330518718298263e-05,
"loss": 2.4665,
"step": 426
},
{
"epoch": 0.25761689291101053,
"grad_norm": 1.8466245931683942,
"learning_rate": 1.7317290137731122e-05,
"loss": 2.4653,
"step": 427
},
{
"epoch": 0.25822021116138766,
"grad_norm": 1.9016215729218684,
"learning_rate": 1.7304033936771557e-05,
"loss": 2.3672,
"step": 428
},
{
"epoch": 0.25882352941176473,
"grad_norm": 1.853880516402256,
"learning_rate": 1.729075016545743e-05,
"loss": 2.3238,
"step": 429
},
{
"epoch": 0.2594268476621418,
"grad_norm": 1.734071118881534,
"learning_rate": 1.7277438873930654e-05,
"loss": 2.431,
"step": 430
},
{
"epoch": 0.2600301659125189,
"grad_norm": 1.9892858435111747,
"learning_rate": 1.726410011243703e-05,
"loss": 2.3572,
"step": 431
},
{
"epoch": 0.26063348416289595,
"grad_norm": 1.7704805245487907,
"learning_rate": 1.725073393132605e-05,
"loss": 2.4709,
"step": 432
},
{
"epoch": 0.261236802413273,
"grad_norm": 1.7240494225737328,
"learning_rate": 1.72373403810507e-05,
"loss": 2.4048,
"step": 433
},
{
"epoch": 0.2618401206636501,
"grad_norm": 1.781399562041848,
"learning_rate": 1.7223919512167292e-05,
"loss": 2.3077,
"step": 434
},
{
"epoch": 0.26244343891402716,
"grad_norm": 1.944568269606338,
"learning_rate": 1.7210471375335225e-05,
"loss": 2.3961,
"step": 435
},
{
"epoch": 0.26304675716440423,
"grad_norm": 1.9648161722720134,
"learning_rate": 1.7196996021316862e-05,
"loss": 2.3704,
"step": 436
},
{
"epoch": 0.2636500754147813,
"grad_norm": 1.7752870309421316,
"learning_rate": 1.7183493500977277e-05,
"loss": 2.3925,
"step": 437
},
{
"epoch": 0.2642533936651584,
"grad_norm": 1.9406708220706959,
"learning_rate": 1.71699638652841e-05,
"loss": 2.4408,
"step": 438
},
{
"epoch": 0.26485671191553545,
"grad_norm": 1.8139653351292817,
"learning_rate": 1.715640716530731e-05,
"loss": 2.2928,
"step": 439
},
{
"epoch": 0.2654600301659125,
"grad_norm": 1.82683639815915,
"learning_rate": 1.7142823452219036e-05,
"loss": 2.3599,
"step": 440
},
{
"epoch": 0.2660633484162896,
"grad_norm": 1.7707719003945261,
"learning_rate": 1.7129212777293392e-05,
"loss": 2.4122,
"step": 441
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.7857253124157222,
"learning_rate": 1.7115575191906245e-05,
"loss": 2.3878,
"step": 442
},
{
"epoch": 0.26726998491704373,
"grad_norm": 1.9293969953243704,
"learning_rate": 1.7101910747535054e-05,
"loss": 2.4097,
"step": 443
},
{
"epoch": 0.2678733031674208,
"grad_norm": 2.066827352214268,
"learning_rate": 1.7088219495758652e-05,
"loss": 2.3335,
"step": 444
},
{
"epoch": 0.2684766214177979,
"grad_norm": 2.044784403214522,
"learning_rate": 1.7074501488257062e-05,
"loss": 2.3789,
"step": 445
},
{
"epoch": 0.26907993966817495,
"grad_norm": 1.8907127176156884,
"learning_rate": 1.706075677681131e-05,
"loss": 2.3815,
"step": 446
},
{
"epoch": 0.269683257918552,
"grad_norm": 1.852455283235919,
"learning_rate": 1.7046985413303215e-05,
"loss": 2.3542,
"step": 447
},
{
"epoch": 0.2702865761689291,
"grad_norm": 1.7754703332492303,
"learning_rate": 1.7033187449715195e-05,
"loss": 2.3499,
"step": 448
},
{
"epoch": 0.27088989441930617,
"grad_norm": 2.06022512348094,
"learning_rate": 1.7019362938130085e-05,
"loss": 2.324,
"step": 449
},
{
"epoch": 0.27149321266968324,
"grad_norm": 1.9527487591276138,
"learning_rate": 1.700551193073092e-05,
"loss": 2.4478,
"step": 450
},
{
"epoch": 0.2720965309200603,
"grad_norm": 2.043185174930958,
"learning_rate": 1.699163447980075e-05,
"loss": 2.4106,
"step": 451
},
{
"epoch": 0.2726998491704374,
"grad_norm": 1.86188494052839,
"learning_rate": 1.6977730637722446e-05,
"loss": 2.4203,
"step": 452
},
{
"epoch": 0.27330316742081445,
"grad_norm": 1.8173005419840083,
"learning_rate": 1.6963800456978495e-05,
"loss": 2.4529,
"step": 453
},
{
"epoch": 0.2739064856711916,
"grad_norm": 1.926288288051369,
"learning_rate": 1.6949843990150798e-05,
"loss": 2.3295,
"step": 454
},
{
"epoch": 0.27450980392156865,
"grad_norm": 1.9443063715328432,
"learning_rate": 1.693586128992048e-05,
"loss": 2.3609,
"step": 455
},
{
"epoch": 0.2751131221719457,
"grad_norm": 2.328188386516258,
"learning_rate": 1.6921852409067698e-05,
"loss": 2.3975,
"step": 456
},
{
"epoch": 0.2757164404223228,
"grad_norm": 2.1483376873907245,
"learning_rate": 1.6907817400471422e-05,
"loss": 2.3472,
"step": 457
},
{
"epoch": 0.27631975867269987,
"grad_norm": 2.0212484018314205,
"learning_rate": 1.689375631710924e-05,
"loss": 2.3998,
"step": 458
},
{
"epoch": 0.27692307692307694,
"grad_norm": 1.7768919412269368,
"learning_rate": 1.6879669212057187e-05,
"loss": 2.3576,
"step": 459
},
{
"epoch": 0.277526395173454,
"grad_norm": 1.940782585527019,
"learning_rate": 1.6865556138489497e-05,
"loss": 2.4266,
"step": 460
},
{
"epoch": 0.2781297134238311,
"grad_norm": 1.825130165036442,
"learning_rate": 1.6851417149678442e-05,
"loss": 2.4597,
"step": 461
},
{
"epoch": 0.27873303167420815,
"grad_norm": 1.8767052262788935,
"learning_rate": 1.6837252298994107e-05,
"loss": 2.4077,
"step": 462
},
{
"epoch": 0.2793363499245852,
"grad_norm": 2.093256354258747,
"learning_rate": 1.68230616399042e-05,
"loss": 2.3917,
"step": 463
},
{
"epoch": 0.2799396681749623,
"grad_norm": 1.8064129694457856,
"learning_rate": 1.680884522597385e-05,
"loss": 2.4043,
"step": 464
},
{
"epoch": 0.28054298642533937,
"grad_norm": 1.7734169912562705,
"learning_rate": 1.6794603110865396e-05,
"loss": 2.3579,
"step": 465
},
{
"epoch": 0.28114630467571644,
"grad_norm": 1.7758372258527604,
"learning_rate": 1.6780335348338195e-05,
"loss": 2.453,
"step": 466
},
{
"epoch": 0.2817496229260935,
"grad_norm": 1.7745279291014808,
"learning_rate": 1.6766041992248415e-05,
"loss": 2.3205,
"step": 467
},
{
"epoch": 0.2823529411764706,
"grad_norm": 2.0349417614230694,
"learning_rate": 1.6751723096548834e-05,
"loss": 2.3715,
"step": 468
},
{
"epoch": 0.28295625942684766,
"grad_norm": 1.8709834820124767,
"learning_rate": 1.6737378715288627e-05,
"loss": 2.3246,
"step": 469
},
{
"epoch": 0.28355957767722473,
"grad_norm": 1.754382359490896,
"learning_rate": 1.672300890261317e-05,
"loss": 2.364,
"step": 470
},
{
"epoch": 0.2841628959276018,
"grad_norm": 1.8716249102630769,
"learning_rate": 1.670861371276384e-05,
"loss": 2.3305,
"step": 471
},
{
"epoch": 0.28476621417797887,
"grad_norm": 1.8767204933617139,
"learning_rate": 1.6694193200077796e-05,
"loss": 2.4049,
"step": 472
},
{
"epoch": 0.28536953242835594,
"grad_norm": 1.8614514890660463,
"learning_rate": 1.667974741898779e-05,
"loss": 2.3556,
"step": 473
},
{
"epoch": 0.285972850678733,
"grad_norm": 1.7008268918437945,
"learning_rate": 1.6665276424021955e-05,
"loss": 2.4237,
"step": 474
},
{
"epoch": 0.2865761689291101,
"grad_norm": 1.913791072383469,
"learning_rate": 1.6650780269803587e-05,
"loss": 2.3659,
"step": 475
},
{
"epoch": 0.28717948717948716,
"grad_norm": 1.6601117504241991,
"learning_rate": 1.663625901105096e-05,
"loss": 2.4241,
"step": 476
},
{
"epoch": 0.28778280542986423,
"grad_norm": 1.837686989315102,
"learning_rate": 1.6621712702577116e-05,
"loss": 2.4188,
"step": 477
},
{
"epoch": 0.2883861236802413,
"grad_norm": 1.8421165580221608,
"learning_rate": 1.6607141399289628e-05,
"loss": 2.3939,
"step": 478
},
{
"epoch": 0.2889894419306184,
"grad_norm": 1.8192642232304175,
"learning_rate": 1.6592545156190437e-05,
"loss": 2.2918,
"step": 479
},
{
"epoch": 0.2895927601809955,
"grad_norm": 1.6952460022357256,
"learning_rate": 1.6577924028375622e-05,
"loss": 2.3662,
"step": 480
},
{
"epoch": 0.2901960784313726,
"grad_norm": 1.9577770501552076,
"learning_rate": 1.6563278071035182e-05,
"loss": 2.3789,
"step": 481
},
{
"epoch": 0.29079939668174964,
"grad_norm": 1.9639779173712133,
"learning_rate": 1.6548607339452853e-05,
"loss": 2.4703,
"step": 482
},
{
"epoch": 0.2914027149321267,
"grad_norm": 2.0419999930141826,
"learning_rate": 1.6533911889005874e-05,
"loss": 2.4667,
"step": 483
},
{
"epoch": 0.2920060331825038,
"grad_norm": 1.8963556387959155,
"learning_rate": 1.6519191775164795e-05,
"loss": 2.2996,
"step": 484
},
{
"epoch": 0.29260935143288086,
"grad_norm": 1.934074324015012,
"learning_rate": 1.6504447053493264e-05,
"loss": 2.4242,
"step": 485
},
{
"epoch": 0.29321266968325793,
"grad_norm": 1.8545213632058337,
"learning_rate": 1.6489677779647813e-05,
"loss": 2.3063,
"step": 486
},
{
"epoch": 0.293815987933635,
"grad_norm": 3.973702407940097,
"learning_rate": 1.6474884009377658e-05,
"loss": 2.3574,
"step": 487
},
{
"epoch": 0.2944193061840121,
"grad_norm": 1.9537109220075501,
"learning_rate": 1.6460065798524464e-05,
"loss": 2.3463,
"step": 488
},
{
"epoch": 0.29502262443438915,
"grad_norm": 1.9389244940154626,
"learning_rate": 1.644522320302217e-05,
"loss": 2.3892,
"step": 489
},
{
"epoch": 0.2956259426847662,
"grad_norm": 1.927684899495583,
"learning_rate": 1.643035627889674e-05,
"loss": 2.3459,
"step": 490
},
{
"epoch": 0.2962292609351433,
"grad_norm": 1.8034588471166737,
"learning_rate": 1.641546508226599e-05,
"loss": 2.4316,
"step": 491
},
{
"epoch": 0.29683257918552036,
"grad_norm": 1.8514486122667233,
"learning_rate": 1.640054966933935e-05,
"loss": 2.3838,
"step": 492
},
{
"epoch": 0.29743589743589743,
"grad_norm": 1.827240704511863,
"learning_rate": 1.6385610096417654e-05,
"loss": 2.448,
"step": 493
},
{
"epoch": 0.2980392156862745,
"grad_norm": 1.9560055670404755,
"learning_rate": 1.637064641989293e-05,
"loss": 2.3373,
"step": 494
},
{
"epoch": 0.2986425339366516,
"grad_norm": 1.8968484574591564,
"learning_rate": 1.63556586962482e-05,
"loss": 2.4617,
"step": 495
},
{
"epoch": 0.29924585218702865,
"grad_norm": 1.8874111567645753,
"learning_rate": 1.634064698205725e-05,
"loss": 2.4268,
"step": 496
},
{
"epoch": 0.2998491704374057,
"grad_norm": 1.8988827502210233,
"learning_rate": 1.632561133398442e-05,
"loss": 2.3495,
"step": 497
},
{
"epoch": 0.3004524886877828,
"grad_norm": 1.7580087811549536,
"learning_rate": 1.6310551808784394e-05,
"loss": 2.3844,
"step": 498
},
{
"epoch": 0.30105580693815986,
"grad_norm": 1.8050290057379497,
"learning_rate": 1.6295468463301993e-05,
"loss": 2.3541,
"step": 499
},
{
"epoch": 0.30165912518853694,
"grad_norm": 1.673799931237698,
"learning_rate": 1.628036135447194e-05,
"loss": 2.3608,
"step": 500
},
{
"epoch": 0.302262443438914,
"grad_norm": 1.8311291235848792,
"learning_rate": 1.6265230539318658e-05,
"loss": 2.3797,
"step": 501
},
{
"epoch": 0.3028657616892911,
"grad_norm": 1.9367398724614582,
"learning_rate": 1.6250076074956066e-05,
"loss": 2.4683,
"step": 502
},
{
"epoch": 0.30346907993966815,
"grad_norm": 1.7572907260857697,
"learning_rate": 1.6234898018587336e-05,
"loss": 2.3569,
"step": 503
},
{
"epoch": 0.3040723981900452,
"grad_norm": 2.1156012270415445,
"learning_rate": 1.6219696427504703e-05,
"loss": 2.3643,
"step": 504
},
{
"epoch": 0.3046757164404223,
"grad_norm": 1.6954259066781059,
"learning_rate": 1.6204471359089224e-05,
"loss": 2.4835,
"step": 505
},
{
"epoch": 0.3052790346907994,
"grad_norm": 1.880357726030102,
"learning_rate": 1.6189222870810596e-05,
"loss": 2.4242,
"step": 506
},
{
"epoch": 0.3058823529411765,
"grad_norm": 1.743051972913388,
"learning_rate": 1.61739510202269e-05,
"loss": 2.4604,
"step": 507
},
{
"epoch": 0.30648567119155357,
"grad_norm": 1.7096306081262191,
"learning_rate": 1.6158655864984413e-05,
"loss": 2.3306,
"step": 508
},
{
"epoch": 0.30708898944193064,
"grad_norm": 1.8969099344419835,
"learning_rate": 1.6143337462817372e-05,
"loss": 2.403,
"step": 509
},
{
"epoch": 0.3076923076923077,
"grad_norm": 1.8973160763468697,
"learning_rate": 1.612799587154777e-05,
"loss": 2.4471,
"step": 510
},
{
"epoch": 0.3082956259426848,
"grad_norm": 1.8915447769270686,
"learning_rate": 1.6112631149085128e-05,
"loss": 2.3624,
"step": 511
},
{
"epoch": 0.30889894419306185,
"grad_norm": 1.6567739781051283,
"learning_rate": 1.609724335342628e-05,
"loss": 2.3836,
"step": 512
},
{
"epoch": 0.3095022624434389,
"grad_norm": 2.0548142027771186,
"learning_rate": 1.6081832542655154e-05,
"loss": 2.4051,
"step": 513
},
{
"epoch": 0.310105580693816,
"grad_norm": 2.0154305540780637,
"learning_rate": 1.6066398774942556e-05,
"loss": 2.3679,
"step": 514
},
{
"epoch": 0.31070889894419307,
"grad_norm": 1.6683133670779435,
"learning_rate": 1.6050942108545938e-05,
"loss": 2.3651,
"step": 515
},
{
"epoch": 0.31131221719457014,
"grad_norm": 1.9261114209064358,
"learning_rate": 1.6035462601809193e-05,
"loss": 2.3811,
"step": 516
},
{
"epoch": 0.3119155354449472,
"grad_norm": 1.9055368841091447,
"learning_rate": 1.6019960313162436e-05,
"loss": 2.3471,
"step": 517
},
{
"epoch": 0.3125188536953243,
"grad_norm": 1.7619898676546586,
"learning_rate": 1.6004435301121762e-05,
"loss": 2.3532,
"step": 518
},
{
"epoch": 0.31312217194570136,
"grad_norm": 1.7379302950765867,
"learning_rate": 1.5988887624289045e-05,
"loss": 2.4402,
"step": 519
},
{
"epoch": 0.3137254901960784,
"grad_norm": 1.7861658019735307,
"learning_rate": 1.5973317341351725e-05,
"loss": 2.3356,
"step": 520
},
{
"epoch": 0.3143288084464555,
"grad_norm": 1.7406388213681667,
"learning_rate": 1.595772451108254e-05,
"loss": 2.3595,
"step": 521
},
{
"epoch": 0.31493212669683257,
"grad_norm": 1.8740480929030214,
"learning_rate": 1.5942109192339375e-05,
"loss": 2.4009,
"step": 522
},
{
"epoch": 0.31553544494720964,
"grad_norm": 1.6669644613043895,
"learning_rate": 1.592647144406498e-05,
"loss": 2.383,
"step": 523
},
{
"epoch": 0.3161387631975867,
"grad_norm": 1.7563204578628122,
"learning_rate": 1.5910811325286768e-05,
"loss": 2.3741,
"step": 524
},
{
"epoch": 0.3167420814479638,
"grad_norm": 1.7027904743606184,
"learning_rate": 1.58951288951166e-05,
"loss": 2.3938,
"step": 525
},
{
"epoch": 0.31734539969834086,
"grad_norm": 1.7906099932440336,
"learning_rate": 1.5879424212750554e-05,
"loss": 2.3705,
"step": 526
},
{
"epoch": 0.31794871794871793,
"grad_norm": 1.7898710643832698,
"learning_rate": 1.5863697337468704e-05,
"loss": 2.3321,
"step": 527
},
{
"epoch": 0.318552036199095,
"grad_norm": 3.435785300962325,
"learning_rate": 1.5847948328634895e-05,
"loss": 2.3694,
"step": 528
},
{
"epoch": 0.3191553544494721,
"grad_norm": 1.858360689105569,
"learning_rate": 1.583217724569651e-05,
"loss": 2.4358,
"step": 529
},
{
"epoch": 0.31975867269984914,
"grad_norm": 1.7287161321640319,
"learning_rate": 1.5816384148184273e-05,
"loss": 2.313,
"step": 530
},
{
"epoch": 0.32036199095022627,
"grad_norm": 1.970600858577929,
"learning_rate": 1.5800569095711983e-05,
"loss": 2.35,
"step": 531
},
{
"epoch": 0.32096530920060334,
"grad_norm": 1.9064390487389395,
"learning_rate": 1.5784732147976333e-05,
"loss": 2.4124,
"step": 532
},
{
"epoch": 0.3215686274509804,
"grad_norm": 1.7527577771779475,
"learning_rate": 1.5768873364756653e-05,
"loss": 2.3539,
"step": 533
},
{
"epoch": 0.3221719457013575,
"grad_norm": 2.0432674010861214,
"learning_rate": 1.575299280591469e-05,
"loss": 2.3665,
"step": 534
},
{
"epoch": 0.32277526395173456,
"grad_norm": 1.7149420503457244,
"learning_rate": 1.57370905313944e-05,
"loss": 2.3832,
"step": 535
},
{
"epoch": 0.32337858220211163,
"grad_norm": 1.6986453672998818,
"learning_rate": 1.5721166601221697e-05,
"loss": 2.3081,
"step": 536
},
{
"epoch": 0.3239819004524887,
"grad_norm": 1.6812118408175234,
"learning_rate": 1.5705221075504247e-05,
"loss": 2.4513,
"step": 537
},
{
"epoch": 0.3245852187028658,
"grad_norm": 1.910283619404697,
"learning_rate": 1.5689254014431225e-05,
"loss": 2.3518,
"step": 538
},
{
"epoch": 0.32518853695324285,
"grad_norm": 1.8424382759411888,
"learning_rate": 1.56732654782731e-05,
"loss": 2.2766,
"step": 539
},
{
"epoch": 0.3257918552036199,
"grad_norm": 1.8701280171641446,
"learning_rate": 1.5657255527381395e-05,
"loss": 2.3568,
"step": 540
},
{
"epoch": 0.326395173453997,
"grad_norm": 1.7703149577050399,
"learning_rate": 1.5641224222188476e-05,
"loss": 2.4348,
"step": 541
},
{
"epoch": 0.32699849170437406,
"grad_norm": 1.699314651054947,
"learning_rate": 1.562517162320731e-05,
"loss": 2.4006,
"step": 542
},
{
"epoch": 0.32760180995475113,
"grad_norm": 1.992395374798314,
"learning_rate": 1.5609097791031243e-05,
"loss": 2.4042,
"step": 543
},
{
"epoch": 0.3282051282051282,
"grad_norm": 1.8182004021843592,
"learning_rate": 1.559300278633377e-05,
"loss": 2.4339,
"step": 544
},
{
"epoch": 0.3288084464555053,
"grad_norm": 1.9273546760508904,
"learning_rate": 1.5576886669868297e-05,
"loss": 2.3665,
"step": 545
},
{
"epoch": 0.32941176470588235,
"grad_norm": 1.9311811042716147,
"learning_rate": 1.556074950246793e-05,
"loss": 2.387,
"step": 546
},
{
"epoch": 0.3300150829562594,
"grad_norm": 2.3722660221407588,
"learning_rate": 1.554459134504523e-05,
"loss": 2.3884,
"step": 547
},
{
"epoch": 0.3306184012066365,
"grad_norm": 1.797532989222507,
"learning_rate": 1.5528412258591994e-05,
"loss": 2.3019,
"step": 548
},
{
"epoch": 0.33122171945701356,
"grad_norm": 1.7199496012123132,
"learning_rate": 1.5512212304179015e-05,
"loss": 2.4037,
"step": 549
},
{
"epoch": 0.33182503770739064,
"grad_norm": 1.7484659894079446,
"learning_rate": 1.5495991542955855e-05,
"loss": 2.4882,
"step": 550
},
{
"epoch": 0.3324283559577677,
"grad_norm": 11.886778043932564,
"learning_rate": 1.5479750036150614e-05,
"loss": 2.3168,
"step": 551
},
{
"epoch": 0.3330316742081448,
"grad_norm": 2.0257322511655422,
"learning_rate": 1.5463487845069708e-05,
"loss": 2.4386,
"step": 552
},
{
"epoch": 0.33363499245852185,
"grad_norm": 2.104527806500846,
"learning_rate": 1.544720503109762e-05,
"loss": 2.3966,
"step": 553
},
{
"epoch": 0.3342383107088989,
"grad_norm": 1.8881704290067898,
"learning_rate": 1.5430901655696683e-05,
"loss": 2.4835,
"step": 554
},
{
"epoch": 0.334841628959276,
"grad_norm": 1.7313204083862717,
"learning_rate": 1.541457778040684e-05,
"loss": 2.3059,
"step": 555
},
{
"epoch": 0.33544494720965307,
"grad_norm": 1.7885236223630756,
"learning_rate": 1.539823346684542e-05,
"loss": 2.3921,
"step": 556
},
{
"epoch": 0.3360482654600302,
"grad_norm": 1.7461855308005563,
"learning_rate": 1.5381868776706883e-05,
"loss": 2.3616,
"step": 557
},
{
"epoch": 0.33665158371040727,
"grad_norm": 1.9361450007910532,
"learning_rate": 1.536548377176263e-05,
"loss": 2.3722,
"step": 558
},
{
"epoch": 0.33725490196078434,
"grad_norm": 1.9336429106855157,
"learning_rate": 1.5349078513860728e-05,
"loss": 2.4162,
"step": 559
},
{
"epoch": 0.3378582202111614,
"grad_norm": 1.761755498515305,
"learning_rate": 1.5332653064925683e-05,
"loss": 2.4606,
"step": 560
},
{
"epoch": 0.3384615384615385,
"grad_norm": 1.7505505528301577,
"learning_rate": 1.5316207486958242e-05,
"loss": 2.4586,
"step": 561
},
{
"epoch": 0.33906485671191555,
"grad_norm": 1.779820283923665,
"learning_rate": 1.5299741842035108e-05,
"loss": 2.3328,
"step": 562
},
{
"epoch": 0.3396681749622926,
"grad_norm": 1.8162808883880843,
"learning_rate": 1.5283256192308744e-05,
"loss": 2.3907,
"step": 563
},
{
"epoch": 0.3402714932126697,
"grad_norm": 1.8002288685975787,
"learning_rate": 1.5266750600007122e-05,
"loss": 2.3733,
"step": 564
},
{
"epoch": 0.34087481146304677,
"grad_norm": 1.7074592980093009,
"learning_rate": 1.5250225127433485e-05,
"loss": 2.3787,
"step": 565
},
{
"epoch": 0.34147812971342384,
"grad_norm": 1.727791102751961,
"learning_rate": 1.5233679836966122e-05,
"loss": 2.4609,
"step": 566
},
{
"epoch": 0.3420814479638009,
"grad_norm": 1.7409151277373238,
"learning_rate": 1.5217114791058129e-05,
"loss": 2.3501,
"step": 567
},
{
"epoch": 0.342684766214178,
"grad_norm": 1.8502483538705539,
"learning_rate": 1.5200530052237174e-05,
"loss": 2.4225,
"step": 568
},
{
"epoch": 0.34328808446455505,
"grad_norm": 1.9323986008652658,
"learning_rate": 1.5183925683105254e-05,
"loss": 2.4579,
"step": 569
},
{
"epoch": 0.3438914027149321,
"grad_norm": 1.6959428286232177,
"learning_rate": 1.5167301746338466e-05,
"loss": 2.3903,
"step": 570
},
{
"epoch": 0.3444947209653092,
"grad_norm": 1.785549938873077,
"learning_rate": 1.5150658304686766e-05,
"loss": 2.3733,
"step": 571
},
{
"epoch": 0.34509803921568627,
"grad_norm": 1.8119824004389333,
"learning_rate": 1.5133995420973746e-05,
"loss": 2.3818,
"step": 572
},
{
"epoch": 0.34570135746606334,
"grad_norm": 1.8260773713603422,
"learning_rate": 1.5117313158096371e-05,
"loss": 2.3361,
"step": 573
},
{
"epoch": 0.3463046757164404,
"grad_norm": 2.06836980707018,
"learning_rate": 1.510061157902477e-05,
"loss": 2.4415,
"step": 574
},
{
"epoch": 0.3469079939668175,
"grad_norm": 2.086356686499412,
"learning_rate": 1.5083890746801962e-05,
"loss": 2.3938,
"step": 575
},
{
"epoch": 0.34751131221719456,
"grad_norm": 1.8906423761627238,
"learning_rate": 1.5067150724543669e-05,
"loss": 2.3683,
"step": 576
},
{
"epoch": 0.34811463046757163,
"grad_norm": 1.871834489314333,
"learning_rate": 1.5050391575438026e-05,
"loss": 2.3848,
"step": 577
},
{
"epoch": 0.3487179487179487,
"grad_norm": 1.8101661578969148,
"learning_rate": 1.503361336274538e-05,
"loss": 2.4243,
"step": 578
},
{
"epoch": 0.34932126696832577,
"grad_norm": 1.7689845298062439,
"learning_rate": 1.5016816149798033e-05,
"loss": 2.3667,
"step": 579
},
{
"epoch": 0.34992458521870284,
"grad_norm": 1.7995447114490488,
"learning_rate": 1.5000000000000002e-05,
"loss": 2.2974,
"step": 580
},
{
"epoch": 0.3505279034690799,
"grad_norm": 2.160922337798451,
"learning_rate": 1.4983164976826788e-05,
"loss": 2.3526,
"step": 581
},
{
"epoch": 0.351131221719457,
"grad_norm": 2.0689488964902902,
"learning_rate": 1.4966311143825132e-05,
"loss": 2.4311,
"step": 582
},
{
"epoch": 0.3517345399698341,
"grad_norm": 1.7389091824888516,
"learning_rate": 1.4949438564612778e-05,
"loss": 2.4222,
"step": 583
},
{
"epoch": 0.3523378582202112,
"grad_norm": 1.8173600434790154,
"learning_rate": 1.4932547302878228e-05,
"loss": 2.3584,
"step": 584
},
{
"epoch": 0.35294117647058826,
"grad_norm": 1.8292488414493129,
"learning_rate": 1.491563742238051e-05,
"loss": 2.3919,
"step": 585
},
{
"epoch": 0.35354449472096533,
"grad_norm": 1.6564271315606274,
"learning_rate": 1.4898708986948925e-05,
"loss": 2.4269,
"step": 586
},
{
"epoch": 0.3541478129713424,
"grad_norm": 1.8221323736787778,
"learning_rate": 1.4881762060482814e-05,
"loss": 2.291,
"step": 587
},
{
"epoch": 0.3547511312217195,
"grad_norm": 1.7436753903680235,
"learning_rate": 1.486479670695132e-05,
"loss": 2.3669,
"step": 588
},
{
"epoch": 0.35535444947209655,
"grad_norm": 1.7523543866066271,
"learning_rate": 1.4847812990393138e-05,
"loss": 2.4397,
"step": 589
},
{
"epoch": 0.3559577677224736,
"grad_norm": 1.8682538834622033,
"learning_rate": 1.483081097491628e-05,
"loss": 2.409,
"step": 590
},
{
"epoch": 0.3565610859728507,
"grad_norm": 1.745032073795883,
"learning_rate": 1.4813790724697832e-05,
"loss": 2.3007,
"step": 591
},
{
"epoch": 0.35716440422322776,
"grad_norm": 1.7186408434763945,
"learning_rate": 1.47967523039837e-05,
"loss": 2.4034,
"step": 592
},
{
"epoch": 0.35776772247360483,
"grad_norm": 1.7483375800449275,
"learning_rate": 1.4779695777088392e-05,
"loss": 2.3844,
"step": 593
},
{
"epoch": 0.3583710407239819,
"grad_norm": 1.7819426598449446,
"learning_rate": 1.476262120839475e-05,
"loss": 2.4795,
"step": 594
},
{
"epoch": 0.358974358974359,
"grad_norm": 1.7139242136095565,
"learning_rate": 1.4745528662353728e-05,
"loss": 2.3507,
"step": 595
},
{
"epoch": 0.35957767722473605,
"grad_norm": 1.8436276797329483,
"learning_rate": 1.4728418203484125e-05,
"loss": 2.4585,
"step": 596
},
{
"epoch": 0.3601809954751131,
"grad_norm": 1.8598700014175478,
"learning_rate": 1.471128989637237e-05,
"loss": 2.4102,
"step": 597
},
{
"epoch": 0.3607843137254902,
"grad_norm": 1.6533400176564714,
"learning_rate": 1.4694143805672254e-05,
"loss": 2.3843,
"step": 598
},
{
"epoch": 0.36138763197586726,
"grad_norm": 2.1098729029957215,
"learning_rate": 1.4676979996104694e-05,
"loss": 2.3878,
"step": 599
},
{
"epoch": 0.36199095022624433,
"grad_norm": 2.4340141500482684,
"learning_rate": 1.4659798532457497e-05,
"loss": 2.4534,
"step": 600
},
{
"epoch": 0.3625942684766214,
"grad_norm": 1.9471998072388452,
"learning_rate": 1.4642599479585106e-05,
"loss": 2.3896,
"step": 601
},
{
"epoch": 0.3631975867269985,
"grad_norm": 2.0019981913767744,
"learning_rate": 1.4625382902408356e-05,
"loss": 2.33,
"step": 602
},
{
"epoch": 0.36380090497737555,
"grad_norm": 1.6948730617309857,
"learning_rate": 1.4608148865914226e-05,
"loss": 2.3983,
"step": 603
},
{
"epoch": 0.3644042232277526,
"grad_norm": 1.8123228593449345,
"learning_rate": 1.4590897435155609e-05,
"loss": 2.3558,
"step": 604
},
{
"epoch": 0.3650075414781297,
"grad_norm": 1.8752512293969732,
"learning_rate": 1.4573628675251051e-05,
"loss": 2.4097,
"step": 605
},
{
"epoch": 0.36561085972850677,
"grad_norm": 1.7681963283069855,
"learning_rate": 1.4556342651384503e-05,
"loss": 2.4081,
"step": 606
},
{
"epoch": 0.36621417797888384,
"grad_norm": 1.742690929021266,
"learning_rate": 1.453903942880509e-05,
"loss": 2.3479,
"step": 607
},
{
"epoch": 0.3668174962292609,
"grad_norm": 1.99882414897086,
"learning_rate": 1.4521719072826858e-05,
"loss": 2.3713,
"step": 608
},
{
"epoch": 0.36742081447963804,
"grad_norm": 1.9029689038645006,
"learning_rate": 1.4504381648828518e-05,
"loss": 2.4505,
"step": 609
},
{
"epoch": 0.3680241327300151,
"grad_norm": 2.0476458036895218,
"learning_rate": 1.4487027222253216e-05,
"loss": 2.4079,
"step": 610
},
{
"epoch": 0.3686274509803922,
"grad_norm": 1.776067769797596,
"learning_rate": 1.4469655858608267e-05,
"loss": 2.3776,
"step": 611
},
{
"epoch": 0.36923076923076925,
"grad_norm": 1.9672318792751764,
"learning_rate": 1.445226762346493e-05,
"loss": 2.3368,
"step": 612
},
{
"epoch": 0.3698340874811463,
"grad_norm": 1.855562519423061,
"learning_rate": 1.4434862582458136e-05,
"loss": 2.3117,
"step": 613
},
{
"epoch": 0.3704374057315234,
"grad_norm": 1.8894861586454959,
"learning_rate": 1.4417440801286263e-05,
"loss": 2.3204,
"step": 614
},
{
"epoch": 0.37104072398190047,
"grad_norm": 1.6694812676346151,
"learning_rate": 1.4400002345710871e-05,
"loss": 2.3575,
"step": 615
},
{
"epoch": 0.37164404223227754,
"grad_norm": 1.7479003447288977,
"learning_rate": 1.4382547281556464e-05,
"loss": 2.3689,
"step": 616
},
{
"epoch": 0.3722473604826546,
"grad_norm": 2.3948984633240036,
"learning_rate": 1.4365075674710238e-05,
"loss": 2.3109,
"step": 617
},
{
"epoch": 0.3728506787330317,
"grad_norm": 1.887014513355491,
"learning_rate": 1.434758759112183e-05,
"loss": 2.4637,
"step": 618
},
{
"epoch": 0.37345399698340875,
"grad_norm": 1.7882548264987235,
"learning_rate": 1.4330083096803073e-05,
"loss": 2.3452,
"step": 619
},
{
"epoch": 0.3740573152337858,
"grad_norm": 1.848141313490385,
"learning_rate": 1.4312562257827742e-05,
"loss": 2.4144,
"step": 620
},
{
"epoch": 0.3746606334841629,
"grad_norm": 1.9944819383375783,
"learning_rate": 1.4295025140331317e-05,
"loss": 2.3772,
"step": 621
},
{
"epoch": 0.37526395173453997,
"grad_norm": 1.8430443308071278,
"learning_rate": 1.427747181051071e-05,
"loss": 2.3471,
"step": 622
},
{
"epoch": 0.37586726998491704,
"grad_norm": 1.7064478636418337,
"learning_rate": 1.4259902334624043e-05,
"loss": 2.3765,
"step": 623
},
{
"epoch": 0.3764705882352941,
"grad_norm": 1.8467921519847166,
"learning_rate": 1.4242316778990373e-05,
"loss": 2.4057,
"step": 624
},
{
"epoch": 0.3764705882352941,
"eval_loss": 2.3918943405151367,
"eval_runtime": 21.7913,
"eval_samples_per_second": 4.038,
"eval_steps_per_second": 0.505,
"step": 624
},
{
"epoch": 0.3770739064856712,
"grad_norm": 2.0029681453109047,
"learning_rate": 1.4224715209989463e-05,
"loss": 2.3662,
"step": 625
},
{
"epoch": 0.37767722473604826,
"grad_norm": 1.7665174192031903,
"learning_rate": 1.4207097694061514e-05,
"loss": 2.3706,
"step": 626
},
{
"epoch": 0.3782805429864253,
"grad_norm": 1.7876219359582222,
"learning_rate": 1.418946429770692e-05,
"loss": 2.3525,
"step": 627
},
{
"epoch": 0.3788838612368024,
"grad_norm": 1.7700044933464636,
"learning_rate": 1.4171815087486026e-05,
"loss": 2.4189,
"step": 628
},
{
"epoch": 0.37948717948717947,
"grad_norm": 1.86529649557736,
"learning_rate": 1.4154150130018867e-05,
"loss": 2.3815,
"step": 629
},
{
"epoch": 0.38009049773755654,
"grad_norm": 2.006317051920272,
"learning_rate": 1.4136469491984913e-05,
"loss": 2.4124,
"step": 630
},
{
"epoch": 0.3806938159879336,
"grad_norm": 1.772114382724889,
"learning_rate": 1.4118773240122825e-05,
"loss": 2.3476,
"step": 631
},
{
"epoch": 0.3812971342383107,
"grad_norm": 1.7056526828735088,
"learning_rate": 1.4101061441230209e-05,
"loss": 2.4368,
"step": 632
},
{
"epoch": 0.38190045248868776,
"grad_norm": 1.8707670004402084,
"learning_rate": 1.4083334162163347e-05,
"loss": 2.4275,
"step": 633
},
{
"epoch": 0.38250377073906483,
"grad_norm": 1.7861302749515335,
"learning_rate": 1.4065591469836958e-05,
"loss": 2.3732,
"step": 634
},
{
"epoch": 0.38310708898944196,
"grad_norm": 1.818683066931359,
"learning_rate": 1.4047833431223938e-05,
"loss": 2.3422,
"step": 635
},
{
"epoch": 0.38371040723981903,
"grad_norm": 1.6885761613630272,
"learning_rate": 1.4030060113355118e-05,
"loss": 2.3495,
"step": 636
},
{
"epoch": 0.3843137254901961,
"grad_norm": 7.180336365886139,
"learning_rate": 1.4012271583318989e-05,
"loss": 2.353,
"step": 637
},
{
"epoch": 0.3849170437405732,
"grad_norm": 1.7340067613190102,
"learning_rate": 1.3994467908261474e-05,
"loss": 2.2981,
"step": 638
},
{
"epoch": 0.38552036199095024,
"grad_norm": 1.7030954630468456,
"learning_rate": 1.397664915538566e-05,
"loss": 2.3274,
"step": 639
},
{
"epoch": 0.3861236802413273,
"grad_norm": 1.800533489062573,
"learning_rate": 1.3958815391951552e-05,
"loss": 2.5047,
"step": 640
},
{
"epoch": 0.3867269984917044,
"grad_norm": 1.7827540232861019,
"learning_rate": 1.3940966685275812e-05,
"loss": 2.2918,
"step": 641
},
{
"epoch": 0.38733031674208146,
"grad_norm": 1.7797398302982137,
"learning_rate": 1.3923103102731504e-05,
"loss": 2.38,
"step": 642
},
{
"epoch": 0.38793363499245853,
"grad_norm": 1.6684389473922723,
"learning_rate": 1.3905224711747844e-05,
"loss": 2.3335,
"step": 643
},
{
"epoch": 0.3885369532428356,
"grad_norm": 1.8443665423108562,
"learning_rate": 1.3887331579809958e-05,
"loss": 2.3327,
"step": 644
},
{
"epoch": 0.3891402714932127,
"grad_norm": 1.9000405287193112,
"learning_rate": 1.3869423774458594e-05,
"loss": 2.4339,
"step": 645
},
{
"epoch": 0.38974358974358975,
"grad_norm": 1.7335394014606145,
"learning_rate": 1.3851501363289907e-05,
"loss": 2.3429,
"step": 646
},
{
"epoch": 0.3903469079939668,
"grad_norm": 1.939117624820807,
"learning_rate": 1.3833564413955171e-05,
"loss": 2.4046,
"step": 647
},
{
"epoch": 0.3909502262443439,
"grad_norm": 1.8243037308209973,
"learning_rate": 1.3815612994160544e-05,
"loss": 2.4171,
"step": 648
},
{
"epoch": 0.39155354449472096,
"grad_norm": 1.6139052960785931,
"learning_rate": 1.3797647171666792e-05,
"loss": 2.3557,
"step": 649
},
{
"epoch": 0.39215686274509803,
"grad_norm": 1.8121247464252912,
"learning_rate": 1.3779667014289067e-05,
"loss": 2.3947,
"step": 650
},
{
"epoch": 0.3927601809954751,
"grad_norm": 1.7575043092638487,
"learning_rate": 1.3761672589896615e-05,
"loss": 2.4148,
"step": 651
},
{
"epoch": 0.3933634992458522,
"grad_norm": 1.8471659624255694,
"learning_rate": 1.3743663966412547e-05,
"loss": 2.3923,
"step": 652
},
{
"epoch": 0.39396681749622925,
"grad_norm": 1.8481281201046837,
"learning_rate": 1.3725641211813557e-05,
"loss": 2.4083,
"step": 653
},
{
"epoch": 0.3945701357466063,
"grad_norm": 1.783889419462132,
"learning_rate": 1.3707604394129687e-05,
"loss": 2.3017,
"step": 654
},
{
"epoch": 0.3951734539969834,
"grad_norm": 1.8916835677064565,
"learning_rate": 1.3689553581444069e-05,
"loss": 2.35,
"step": 655
},
{
"epoch": 0.39577677224736046,
"grad_norm": 1.8370602445589161,
"learning_rate": 1.3671488841892648e-05,
"loss": 2.3432,
"step": 656
},
{
"epoch": 0.39638009049773754,
"grad_norm": 1.8327493008799256,
"learning_rate": 1.3653410243663953e-05,
"loss": 2.3983,
"step": 657
},
{
"epoch": 0.3969834087481146,
"grad_norm": 1.7620328988966256,
"learning_rate": 1.3635317854998809e-05,
"loss": 2.3972,
"step": 658
},
{
"epoch": 0.3975867269984917,
"grad_norm": 1.8446532445383867,
"learning_rate": 1.361721174419011e-05,
"loss": 2.4332,
"step": 659
},
{
"epoch": 0.39819004524886875,
"grad_norm": 1.7192647983726854,
"learning_rate": 1.3599091979582537e-05,
"loss": 2.4035,
"step": 660
},
{
"epoch": 0.3987933634992459,
"grad_norm": 1.8612813664426646,
"learning_rate": 1.3580958629572316e-05,
"loss": 2.3399,
"step": 661
},
{
"epoch": 0.39939668174962295,
"grad_norm": 1.6726700821557248,
"learning_rate": 1.356281176260695e-05,
"loss": 2.3465,
"step": 662
},
{
"epoch": 0.4,
"grad_norm": 1.7445257737899353,
"learning_rate": 1.3544651447184961e-05,
"loss": 2.3113,
"step": 663
},
{
"epoch": 0.4006033182503771,
"grad_norm": 1.8653411246265055,
"learning_rate": 1.3526477751855645e-05,
"loss": 2.4037,
"step": 664
},
{
"epoch": 0.40120663650075417,
"grad_norm": 1.8347872489948513,
"learning_rate": 1.3508290745218789e-05,
"loss": 2.318,
"step": 665
},
{
"epoch": 0.40180995475113124,
"grad_norm": 1.8918225555087078,
"learning_rate": 1.3490090495924437e-05,
"loss": 2.3722,
"step": 666
},
{
"epoch": 0.4024132730015083,
"grad_norm": 2.010943861471511,
"learning_rate": 1.3471877072672617e-05,
"loss": 2.3639,
"step": 667
},
{
"epoch": 0.4030165912518854,
"grad_norm": 1.8980686640668742,
"learning_rate": 1.3453650544213078e-05,
"loss": 2.3517,
"step": 668
},
{
"epoch": 0.40361990950226245,
"grad_norm": 1.840946209795983,
"learning_rate": 1.3435410979345048e-05,
"loss": 2.4486,
"step": 669
},
{
"epoch": 0.4042232277526395,
"grad_norm": 1.977220556338354,
"learning_rate": 1.341715844691695e-05,
"loss": 2.4036,
"step": 670
},
{
"epoch": 0.4048265460030166,
"grad_norm": 1.835226836663754,
"learning_rate": 1.3398893015826166e-05,
"loss": 2.3811,
"step": 671
},
{
"epoch": 0.40542986425339367,
"grad_norm": 1.8478422310537772,
"learning_rate": 1.338061475501877e-05,
"loss": 2.4005,
"step": 672
},
{
"epoch": 0.40603318250377074,
"grad_norm": 1.7154610218519464,
"learning_rate": 1.3362323733489247e-05,
"loss": 2.3651,
"step": 673
},
{
"epoch": 0.4066365007541478,
"grad_norm": 1.9555464431886362,
"learning_rate": 1.3344020020280262e-05,
"loss": 2.3635,
"step": 674
},
{
"epoch": 0.4072398190045249,
"grad_norm": 1.9659968540183852,
"learning_rate": 1.3325703684482383e-05,
"loss": 2.5214,
"step": 675
},
{
"epoch": 0.40784313725490196,
"grad_norm": 1.7901769953025661,
"learning_rate": 1.330737479523383e-05,
"loss": 2.4147,
"step": 676
},
{
"epoch": 0.408446455505279,
"grad_norm": 1.925446038371978,
"learning_rate": 1.32890334217202e-05,
"loss": 2.3693,
"step": 677
},
{
"epoch": 0.4090497737556561,
"grad_norm": 1.887938807896764,
"learning_rate": 1.3270679633174219e-05,
"loss": 2.396,
"step": 678
},
{
"epoch": 0.40965309200603317,
"grad_norm": 1.9909687777389626,
"learning_rate": 1.3252313498875473e-05,
"loss": 2.4342,
"step": 679
},
{
"epoch": 0.41025641025641024,
"grad_norm": 1.8345463519611278,
"learning_rate": 1.3233935088150154e-05,
"loss": 2.3493,
"step": 680
},
{
"epoch": 0.4108597285067873,
"grad_norm": 1.8258441712601658,
"learning_rate": 1.3215544470370785e-05,
"loss": 2.3791,
"step": 681
},
{
"epoch": 0.4114630467571644,
"grad_norm": 1.779542065553901,
"learning_rate": 1.3197141714955977e-05,
"loss": 2.3066,
"step": 682
},
{
"epoch": 0.41206636500754146,
"grad_norm": 1.9061971700529505,
"learning_rate": 1.317872689137015e-05,
"loss": 2.4178,
"step": 683
},
{
"epoch": 0.41266968325791853,
"grad_norm": 2.151947023896646,
"learning_rate": 1.3160300069123277e-05,
"loss": 2.3833,
"step": 684
},
{
"epoch": 0.4132730015082956,
"grad_norm": 2.1725967166655233,
"learning_rate": 1.3141861317770628e-05,
"loss": 2.4076,
"step": 685
},
{
"epoch": 0.4138763197586727,
"grad_norm": 1.7110768486865224,
"learning_rate": 1.312341070691249e-05,
"loss": 2.4303,
"step": 686
},
{
"epoch": 0.4144796380090498,
"grad_norm": 1.8876018712519784,
"learning_rate": 1.3104948306193932e-05,
"loss": 2.4082,
"step": 687
},
{
"epoch": 0.41508295625942687,
"grad_norm": 1.7138291223841546,
"learning_rate": 1.308647418530451e-05,
"loss": 2.4395,
"step": 688
},
{
"epoch": 0.41568627450980394,
"grad_norm": 1.6819798162077328,
"learning_rate": 1.3067988413978032e-05,
"loss": 2.398,
"step": 689
},
{
"epoch": 0.416289592760181,
"grad_norm": 1.9776754872059599,
"learning_rate": 1.3049491061992274e-05,
"loss": 2.3758,
"step": 690
},
{
"epoch": 0.4168929110105581,
"grad_norm": 2.4185263540111133,
"learning_rate": 1.3030982199168732e-05,
"loss": 2.342,
"step": 691
},
{
"epoch": 0.41749622926093516,
"grad_norm": 1.9732286019963168,
"learning_rate": 1.3012461895372343e-05,
"loss": 2.4523,
"step": 692
},
{
"epoch": 0.41809954751131223,
"grad_norm": 1.866052215173324,
"learning_rate": 1.2993930220511245e-05,
"loss": 2.32,
"step": 693
},
{
"epoch": 0.4187028657616893,
"grad_norm": 1.7385777573811818,
"learning_rate": 1.2975387244536478e-05,
"loss": 2.3619,
"step": 694
},
{
"epoch": 0.4193061840120664,
"grad_norm": 1.7561905475842445,
"learning_rate": 1.2956833037441756e-05,
"loss": 2.3834,
"step": 695
},
{
"epoch": 0.41990950226244345,
"grad_norm": 1.9463356269336127,
"learning_rate": 1.2938267669263179e-05,
"loss": 2.4704,
"step": 696
},
{
"epoch": 0.4205128205128205,
"grad_norm": 1.801091105836512,
"learning_rate": 1.2919691210078982e-05,
"loss": 2.3265,
"step": 697
},
{
"epoch": 0.4211161387631976,
"grad_norm": 1.943220111113144,
"learning_rate": 1.2901103730009261e-05,
"loss": 2.3718,
"step": 698
},
{
"epoch": 0.42171945701357466,
"grad_norm": 1.836242175707625,
"learning_rate": 1.2882505299215711e-05,
"loss": 2.33,
"step": 699
},
{
"epoch": 0.42232277526395173,
"grad_norm": 1.6868877589339009,
"learning_rate": 1.2863895987901364e-05,
"loss": 2.3869,
"step": 700
},
{
"epoch": 0.4229260935143288,
"grad_norm": 1.7970188659789745,
"learning_rate": 1.2845275866310325e-05,
"loss": 2.4001,
"step": 701
},
{
"epoch": 0.4235294117647059,
"grad_norm": 1.95275973209963,
"learning_rate": 1.2826645004727503e-05,
"loss": 2.3717,
"step": 702
},
{
"epoch": 0.42413273001508295,
"grad_norm": 1.7680186752612674,
"learning_rate": 1.2808003473478343e-05,
"loss": 2.3729,
"step": 703
},
{
"epoch": 0.42473604826546,
"grad_norm": 1.8149329130672966,
"learning_rate": 1.278935134292857e-05,
"loss": 2.3976,
"step": 704
},
{
"epoch": 0.4253393665158371,
"grad_norm": 2.088358936591777,
"learning_rate": 1.2770688683483914e-05,
"loss": 2.3096,
"step": 705
},
{
"epoch": 0.42594268476621416,
"grad_norm": 2.111711796208285,
"learning_rate": 1.2752015565589852e-05,
"loss": 2.4244,
"step": 706
},
{
"epoch": 0.42654600301659124,
"grad_norm": 1.7477535636939239,
"learning_rate": 1.2733332059731333e-05,
"loss": 2.3133,
"step": 707
},
{
"epoch": 0.4271493212669683,
"grad_norm": 1.8511085641604008,
"learning_rate": 1.2714638236432526e-05,
"loss": 2.4276,
"step": 708
},
{
"epoch": 0.4277526395173454,
"grad_norm": 1.9150387848439585,
"learning_rate": 1.2695934166256528e-05,
"loss": 2.5002,
"step": 709
},
{
"epoch": 0.42835595776772245,
"grad_norm": 1.74427260027451,
"learning_rate": 1.2677219919805137e-05,
"loss": 2.3458,
"step": 710
},
{
"epoch": 0.4289592760180995,
"grad_norm": 1.805483491725746,
"learning_rate": 1.2658495567718543e-05,
"loss": 2.3962,
"step": 711
},
{
"epoch": 0.4295625942684766,
"grad_norm": 1.7916901194265042,
"learning_rate": 1.2639761180675098e-05,
"loss": 2.4087,
"step": 712
},
{
"epoch": 0.4301659125188537,
"grad_norm": 1.8014131588828444,
"learning_rate": 1.2621016829391022e-05,
"loss": 2.402,
"step": 713
},
{
"epoch": 0.4307692307692308,
"grad_norm": 1.9360536162458533,
"learning_rate": 1.2602262584620154e-05,
"loss": 2.36,
"step": 714
},
{
"epoch": 0.43137254901960786,
"grad_norm": 1.8462103917561457,
"learning_rate": 1.2583498517153662e-05,
"loss": 2.3282,
"step": 715
},
{
"epoch": 0.43197586726998494,
"grad_norm": 1.9550039911148358,
"learning_rate": 1.2564724697819814e-05,
"loss": 2.3885,
"step": 716
},
{
"epoch": 0.432579185520362,
"grad_norm": 1.7547962323016189,
"learning_rate": 1.254594119748367e-05,
"loss": 2.3607,
"step": 717
},
{
"epoch": 0.4331825037707391,
"grad_norm": 1.7133242570223588,
"learning_rate": 1.2527148087046847e-05,
"loss": 2.3203,
"step": 718
},
{
"epoch": 0.43378582202111615,
"grad_norm": 1.7841657659406196,
"learning_rate": 1.2508345437447226e-05,
"loss": 2.4431,
"step": 719
},
{
"epoch": 0.4343891402714932,
"grad_norm": 1.6718263361727208,
"learning_rate": 1.2489533319658703e-05,
"loss": 2.372,
"step": 720
},
{
"epoch": 0.4349924585218703,
"grad_norm": 1.747207962107338,
"learning_rate": 1.2470711804690901e-05,
"loss": 2.4386,
"step": 721
},
{
"epoch": 0.43559577677224737,
"grad_norm": 1.7786322597485218,
"learning_rate": 1.2451880963588927e-05,
"loss": 2.3613,
"step": 722
},
{
"epoch": 0.43619909502262444,
"grad_norm": 1.7709982000618676,
"learning_rate": 1.2433040867433087e-05,
"loss": 2.3374,
"step": 723
},
{
"epoch": 0.4368024132730015,
"grad_norm": 1.8274123875511648,
"learning_rate": 1.2414191587338627e-05,
"loss": 2.4121,
"step": 724
},
{
"epoch": 0.4374057315233786,
"grad_norm": 1.8239929149081342,
"learning_rate": 1.2395333194455444e-05,
"loss": 2.4182,
"step": 725
},
{
"epoch": 0.43800904977375565,
"grad_norm": 1.8398198214129082,
"learning_rate": 1.2376465759967849e-05,
"loss": 2.3402,
"step": 726
},
{
"epoch": 0.4386123680241327,
"grad_norm": 1.9183852343383927,
"learning_rate": 1.2357589355094275e-05,
"loss": 2.3468,
"step": 727
},
{
"epoch": 0.4392156862745098,
"grad_norm": 1.965693182428336,
"learning_rate": 1.2338704051087014e-05,
"loss": 2.3226,
"step": 728
},
{
"epoch": 0.43981900452488687,
"grad_norm": 1.649936113840551,
"learning_rate": 1.2319809919231957e-05,
"loss": 2.4072,
"step": 729
},
{
"epoch": 0.44042232277526394,
"grad_norm": 1.8341340410648517,
"learning_rate": 1.2300907030848307e-05,
"loss": 2.2409,
"step": 730
},
{
"epoch": 0.441025641025641,
"grad_norm": 1.6910528946384642,
"learning_rate": 1.2281995457288324e-05,
"loss": 2.4764,
"step": 731
},
{
"epoch": 0.4416289592760181,
"grad_norm": 1.8929128640012716,
"learning_rate": 1.2263075269937057e-05,
"loss": 2.374,
"step": 732
},
{
"epoch": 0.44223227752639516,
"grad_norm": 1.9914643440790407,
"learning_rate": 1.2244146540212063e-05,
"loss": 2.3981,
"step": 733
},
{
"epoch": 0.44283559577677223,
"grad_norm": 1.8208263878089117,
"learning_rate": 1.2225209339563144e-05,
"loss": 2.3715,
"step": 734
},
{
"epoch": 0.4434389140271493,
"grad_norm": 1.7264998580660669,
"learning_rate": 1.2206263739472085e-05,
"loss": 2.3765,
"step": 735
},
{
"epoch": 0.44404223227752637,
"grad_norm": 1.7059386365866958,
"learning_rate": 1.2187309811452357e-05,
"loss": 2.4202,
"step": 736
},
{
"epoch": 0.44464555052790344,
"grad_norm": 1.9473821855556346,
"learning_rate": 1.2168347627048891e-05,
"loss": 2.3991,
"step": 737
},
{
"epoch": 0.4452488687782805,
"grad_norm": 1.752167824857159,
"learning_rate": 1.2149377257837767e-05,
"loss": 2.4284,
"step": 738
},
{
"epoch": 0.44585218702865764,
"grad_norm": 1.8153328418105037,
"learning_rate": 1.2130398775425964e-05,
"loss": 2.3456,
"step": 739
},
{
"epoch": 0.4464555052790347,
"grad_norm": 2.073143137177712,
"learning_rate": 1.2111412251451085e-05,
"loss": 2.3302,
"step": 740
},
{
"epoch": 0.4470588235294118,
"grad_norm": 1.8523051441295324,
"learning_rate": 1.2092417757581085e-05,
"loss": 2.4034,
"step": 741
},
{
"epoch": 0.44766214177978886,
"grad_norm": 1.8264910950212179,
"learning_rate": 1.2073415365514014e-05,
"loss": 2.3729,
"step": 742
},
{
"epoch": 0.44826546003016593,
"grad_norm": 1.886363437229635,
"learning_rate": 1.2054405146977719e-05,
"loss": 2.2983,
"step": 743
},
{
"epoch": 0.448868778280543,
"grad_norm": 1.914473698777555,
"learning_rate": 1.2035387173729606e-05,
"loss": 2.3985,
"step": 744
},
{
"epoch": 0.4494720965309201,
"grad_norm": 1.7371950120576183,
"learning_rate": 1.2016361517556334e-05,
"loss": 2.3949,
"step": 745
},
{
"epoch": 0.45007541478129715,
"grad_norm": 1.8051498258025656,
"learning_rate": 1.1997328250273582e-05,
"loss": 2.3474,
"step": 746
},
{
"epoch": 0.4506787330316742,
"grad_norm": 1.8453416553348072,
"learning_rate": 1.1978287443725737e-05,
"loss": 2.3895,
"step": 747
},
{
"epoch": 0.4512820512820513,
"grad_norm": 1.871938613961509,
"learning_rate": 1.1959239169785668e-05,
"loss": 2.4942,
"step": 748
},
{
"epoch": 0.45188536953242836,
"grad_norm": 1.9069542302070057,
"learning_rate": 1.194018350035441e-05,
"loss": 2.3946,
"step": 749
},
{
"epoch": 0.45248868778280543,
"grad_norm": 1.8230490419481198,
"learning_rate": 1.1921120507360934e-05,
"loss": 2.3828,
"step": 750
},
{
"epoch": 0.4530920060331825,
"grad_norm": 1.7100130596372602,
"learning_rate": 1.190205026276183e-05,
"loss": 2.4444,
"step": 751
},
{
"epoch": 0.4536953242835596,
"grad_norm": 2.037831829444688,
"learning_rate": 1.1882972838541084e-05,
"loss": 2.3874,
"step": 752
},
{
"epoch": 0.45429864253393665,
"grad_norm": 1.8278872762113902,
"learning_rate": 1.1863888306709772e-05,
"loss": 2.4134,
"step": 753
},
{
"epoch": 0.4549019607843137,
"grad_norm": 1.7853622553097093,
"learning_rate": 1.1844796739305792e-05,
"loss": 2.4366,
"step": 754
},
{
"epoch": 0.4555052790346908,
"grad_norm": 1.7502372984758952,
"learning_rate": 1.182569820839362e-05,
"loss": 2.3502,
"step": 755
},
{
"epoch": 0.45610859728506786,
"grad_norm": 1.8953013191810872,
"learning_rate": 1.1806592786063991e-05,
"loss": 2.3261,
"step": 756
},
{
"epoch": 0.45671191553544493,
"grad_norm": 1.8474649604357705,
"learning_rate": 1.1787480544433673e-05,
"loss": 2.4107,
"step": 757
},
{
"epoch": 0.457315233785822,
"grad_norm": 1.9813399740850826,
"learning_rate": 1.1768361555645164e-05,
"loss": 2.3633,
"step": 758
},
{
"epoch": 0.4579185520361991,
"grad_norm": 1.8305808245169044,
"learning_rate": 1.1749235891866437e-05,
"loss": 2.4417,
"step": 759
},
{
"epoch": 0.45852187028657615,
"grad_norm": 1.835264821449839,
"learning_rate": 1.1730103625290658e-05,
"loss": 2.3976,
"step": 760
},
{
"epoch": 0.4591251885369532,
"grad_norm": 1.7498847639131172,
"learning_rate": 1.1710964828135913e-05,
"loss": 2.4323,
"step": 761
},
{
"epoch": 0.4597285067873303,
"grad_norm": 1.7819698121609244,
"learning_rate": 1.1691819572644941e-05,
"loss": 2.3824,
"step": 762
},
{
"epoch": 0.46033182503770737,
"grad_norm": 1.6605141840526678,
"learning_rate": 1.1672667931084862e-05,
"loss": 2.3819,
"step": 763
},
{
"epoch": 0.46093514328808444,
"grad_norm": 1.7855281123097961,
"learning_rate": 1.1653509975746899e-05,
"loss": 2.3018,
"step": 764
},
{
"epoch": 0.46153846153846156,
"grad_norm": 1.8601026398729645,
"learning_rate": 1.1634345778946112e-05,
"loss": 2.3745,
"step": 765
},
{
"epoch": 0.46214177978883864,
"grad_norm": 1.7547970617468687,
"learning_rate": 1.1615175413021107e-05,
"loss": 2.3493,
"step": 766
},
{
"epoch": 0.4627450980392157,
"grad_norm": 1.9548991514666338,
"learning_rate": 1.1595998950333794e-05,
"loss": 2.3649,
"step": 767
},
{
"epoch": 0.4633484162895928,
"grad_norm": 1.8357021323354088,
"learning_rate": 1.1576816463269083e-05,
"loss": 2.4205,
"step": 768
},
{
"epoch": 0.46395173453996985,
"grad_norm": 1.7260048621415132,
"learning_rate": 1.155762802423463e-05,
"loss": 2.3972,
"step": 769
},
{
"epoch": 0.4645550527903469,
"grad_norm": 1.8773223271613868,
"learning_rate": 1.1538433705660561e-05,
"loss": 2.4437,
"step": 770
},
{
"epoch": 0.465158371040724,
"grad_norm": 1.8937934512459285,
"learning_rate": 1.1519233579999187e-05,
"loss": 2.3078,
"step": 771
},
{
"epoch": 0.46576168929110107,
"grad_norm": 1.8347615261731738,
"learning_rate": 1.1500027719724745e-05,
"loss": 2.3686,
"step": 772
},
{
"epoch": 0.46636500754147814,
"grad_norm": 1.8991250931263675,
"learning_rate": 1.148081619733311e-05,
"loss": 2.4149,
"step": 773
},
{
"epoch": 0.4669683257918552,
"grad_norm": 1.7068569233280024,
"learning_rate": 1.1461599085341549e-05,
"loss": 2.417,
"step": 774
},
{
"epoch": 0.4675716440422323,
"grad_norm": 1.8546938193616271,
"learning_rate": 1.1442376456288402e-05,
"loss": 2.496,
"step": 775
},
{
"epoch": 0.46817496229260935,
"grad_norm": 1.7282335446656245,
"learning_rate": 1.1423148382732854e-05,
"loss": 2.3496,
"step": 776
},
{
"epoch": 0.4687782805429864,
"grad_norm": 1.8993396930857818,
"learning_rate": 1.140391493725463e-05,
"loss": 2.3287,
"step": 777
},
{
"epoch": 0.4693815987933635,
"grad_norm": 1.7742417433579973,
"learning_rate": 1.138467619245374e-05,
"loss": 2.4362,
"step": 778
},
{
"epoch": 0.46998491704374057,
"grad_norm": 1.6863824684148747,
"learning_rate": 1.1365432220950195e-05,
"loss": 2.3075,
"step": 779
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.9755312265214005,
"learning_rate": 1.1346183095383731e-05,
"loss": 2.4258,
"step": 780
},
{
"epoch": 0.4711915535444947,
"grad_norm": 1.7659437015031778,
"learning_rate": 1.1326928888413539e-05,
"loss": 2.3789,
"step": 781
},
{
"epoch": 0.4717948717948718,
"grad_norm": 1.9953934429422493,
"learning_rate": 1.1307669672718e-05,
"loss": 2.4341,
"step": 782
},
{
"epoch": 0.47239819004524886,
"grad_norm": 1.6759600117800293,
"learning_rate": 1.128840552099439e-05,
"loss": 2.38,
"step": 783
},
{
"epoch": 0.4730015082956259,
"grad_norm": 1.854728287765269,
"learning_rate": 1.1269136505958623e-05,
"loss": 2.3623,
"step": 784
},
{
"epoch": 0.473604826546003,
"grad_norm": 1.892879776282185,
"learning_rate": 1.1249862700344969e-05,
"loss": 2.4379,
"step": 785
},
{
"epoch": 0.47420814479638007,
"grad_norm": 1.842873393468782,
"learning_rate": 1.1230584176905784e-05,
"loss": 2.3336,
"step": 786
},
{
"epoch": 0.47481146304675714,
"grad_norm": 1.7678588758287328,
"learning_rate": 1.1211301008411222e-05,
"loss": 2.3832,
"step": 787
},
{
"epoch": 0.4754147812971342,
"grad_norm": 1.7957652718067842,
"learning_rate": 1.1192013267648982e-05,
"loss": 2.478,
"step": 788
},
{
"epoch": 0.4760180995475113,
"grad_norm": 2.0496571931310164,
"learning_rate": 1.1172721027424021e-05,
"loss": 2.3345,
"step": 789
},
{
"epoch": 0.4766214177978884,
"grad_norm": 1.8206515205346219,
"learning_rate": 1.1153424360558268e-05,
"loss": 2.3796,
"step": 790
},
{
"epoch": 0.4772247360482655,
"grad_norm": 1.9139906768147443,
"learning_rate": 1.1134123339890376e-05,
"loss": 2.3923,
"step": 791
},
{
"epoch": 0.47782805429864256,
"grad_norm": 1.749531594471372,
"learning_rate": 1.1114818038275428e-05,
"loss": 2.5011,
"step": 792
},
{
"epoch": 0.47843137254901963,
"grad_norm": 3.724545765526481,
"learning_rate": 1.109550852858466e-05,
"loss": 2.451,
"step": 793
},
{
"epoch": 0.4790346907993967,
"grad_norm": 1.7207858054874157,
"learning_rate": 1.1076194883705194e-05,
"loss": 2.4097,
"step": 794
},
{
"epoch": 0.4796380090497738,
"grad_norm": 1.9401644882234703,
"learning_rate": 1.1056877176539767e-05,
"loss": 2.4028,
"step": 795
},
{
"epoch": 0.48024132730015084,
"grad_norm": 1.8070053767404293,
"learning_rate": 1.1037555480006445e-05,
"loss": 2.3567,
"step": 796
},
{
"epoch": 0.4808446455505279,
"grad_norm": 1.8091050025141635,
"learning_rate": 1.1018229867038358e-05,
"loss": 2.4448,
"step": 797
},
{
"epoch": 0.481447963800905,
"grad_norm": 1.78424566269542,
"learning_rate": 1.0998900410583404e-05,
"loss": 2.3909,
"step": 798
},
{
"epoch": 0.48205128205128206,
"grad_norm": 1.7452068371419385,
"learning_rate": 1.0979567183604009e-05,
"loss": 2.3607,
"step": 799
},
{
"epoch": 0.48265460030165913,
"grad_norm": 1.8685395323481204,
"learning_rate": 1.0960230259076819e-05,
"loss": 2.3596,
"step": 800
},
{
"epoch": 0.4832579185520362,
"grad_norm": 1.7681779054376958,
"learning_rate": 1.0940889709992441e-05,
"loss": 2.4577,
"step": 801
},
{
"epoch": 0.4838612368024133,
"grad_norm": 1.7900733532064816,
"learning_rate": 1.0921545609355162e-05,
"loss": 2.3578,
"step": 802
},
{
"epoch": 0.48446455505279035,
"grad_norm": 1.8719430790020382,
"learning_rate": 1.0902198030182677e-05,
"loss": 2.4568,
"step": 803
},
{
"epoch": 0.4850678733031674,
"grad_norm": 1.7120170734056872,
"learning_rate": 1.0882847045505809e-05,
"loss": 2.3494,
"step": 804
},
{
"epoch": 0.4856711915535445,
"grad_norm": 1.6197324052113928,
"learning_rate": 1.086349272836824e-05,
"loss": 2.3106,
"step": 805
},
{
"epoch": 0.48627450980392156,
"grad_norm": 1.8174715253639449,
"learning_rate": 1.084413515182622e-05,
"loss": 2.3496,
"step": 806
},
{
"epoch": 0.48687782805429863,
"grad_norm": 2.0172167263611653,
"learning_rate": 1.0824774388948321e-05,
"loss": 2.4097,
"step": 807
},
{
"epoch": 0.4874811463046757,
"grad_norm": 1.7907222804659544,
"learning_rate": 1.0805410512815123e-05,
"loss": 2.2761,
"step": 808
},
{
"epoch": 0.4880844645550528,
"grad_norm": 1.7182248007974297,
"learning_rate": 1.0786043596518964e-05,
"loss": 2.3949,
"step": 809
},
{
"epoch": 0.48868778280542985,
"grad_norm": 1.761572833600802,
"learning_rate": 1.0766673713163667e-05,
"loss": 2.3268,
"step": 810
},
{
"epoch": 0.4892911010558069,
"grad_norm": 1.978086578514091,
"learning_rate": 1.0747300935864245e-05,
"loss": 2.4644,
"step": 811
},
{
"epoch": 0.489894419306184,
"grad_norm": 2.179344097420728,
"learning_rate": 1.0727925337746633e-05,
"loss": 2.3845,
"step": 812
},
{
"epoch": 0.49049773755656106,
"grad_norm": 1.8078731760776887,
"learning_rate": 1.0708546991947422e-05,
"loss": 2.3472,
"step": 813
},
{
"epoch": 0.49110105580693814,
"grad_norm": 1.7449102909226242,
"learning_rate": 1.0689165971613566e-05,
"loss": 2.3945,
"step": 814
},
{
"epoch": 0.4917043740573152,
"grad_norm": 1.739318963481491,
"learning_rate": 1.0669782349902122e-05,
"loss": 2.3701,
"step": 815
},
{
"epoch": 0.49230769230769234,
"grad_norm": 1.6488397607877079,
"learning_rate": 1.0650396199979964e-05,
"loss": 2.2992,
"step": 816
},
{
"epoch": 0.4929110105580694,
"grad_norm": 1.7709600436471968,
"learning_rate": 1.0631007595023504e-05,
"loss": 2.425,
"step": 817
},
{
"epoch": 0.4935143288084465,
"grad_norm": 1.7076343907698315,
"learning_rate": 1.0611616608218429e-05,
"loss": 2.426,
"step": 818
},
{
"epoch": 0.49411764705882355,
"grad_norm": 1.8481580796609003,
"learning_rate": 1.0592223312759409e-05,
"loss": 2.365,
"step": 819
},
{
"epoch": 0.4947209653092006,
"grad_norm": 1.8715006232035116,
"learning_rate": 1.0572827781849835e-05,
"loss": 2.5006,
"step": 820
},
{
"epoch": 0.4953242835595777,
"grad_norm": 1.6925435107980902,
"learning_rate": 1.0553430088701533e-05,
"loss": 2.3887,
"step": 821
},
{
"epoch": 0.49592760180995477,
"grad_norm": 13.753810679352204,
"learning_rate": 1.0534030306534491e-05,
"loss": 2.3408,
"step": 822
},
{
"epoch": 0.49653092006033184,
"grad_norm": 1.7063797659828734,
"learning_rate": 1.051462850857658e-05,
"loss": 2.3331,
"step": 823
},
{
"epoch": 0.4971342383107089,
"grad_norm": 1.7178057305930887,
"learning_rate": 1.0495224768063288e-05,
"loss": 2.3707,
"step": 824
},
{
"epoch": 0.497737556561086,
"grad_norm": 1.7992237998948943,
"learning_rate": 1.0475819158237426e-05,
"loss": 2.5099,
"step": 825
},
{
"epoch": 0.49834087481146305,
"grad_norm": 1.7416788690847198,
"learning_rate": 1.045641175234886e-05,
"loss": 2.3624,
"step": 826
},
{
"epoch": 0.4989441930618401,
"grad_norm": 1.8216770572714687,
"learning_rate": 1.0437002623654256e-05,
"loss": 2.3162,
"step": 827
},
{
"epoch": 0.4995475113122172,
"grad_norm": 1.7366014386825241,
"learning_rate": 1.0417591845416748e-05,
"loss": 2.3976,
"step": 828
},
{
"epoch": 0.5001508295625943,
"grad_norm": 1.7235867220953933,
"learning_rate": 1.0398179490905731e-05,
"loss": 2.2961,
"step": 829
},
{
"epoch": 0.5007541478129713,
"grad_norm": 1.6477357985522785,
"learning_rate": 1.0378765633396526e-05,
"loss": 2.4396,
"step": 830
},
{
"epoch": 0.5013574660633484,
"grad_norm": 1.6818154245812602,
"learning_rate": 1.0359350346170142e-05,
"loss": 2.3604,
"step": 831
},
{
"epoch": 0.5019607843137255,
"grad_norm": 1.843044729927334,
"learning_rate": 1.0339933702512978e-05,
"loss": 2.3084,
"step": 832
},
{
"epoch": 0.5019607843137255,
"eval_loss": 2.387592077255249,
"eval_runtime": 22.7208,
"eval_samples_per_second": 3.873,
"eval_steps_per_second": 0.484,
"step": 832
},
{
"epoch": 0.5025641025641026,
"grad_norm": 1.7184520976508428,
"learning_rate": 1.0320515775716556e-05,
"loss": 2.4301,
"step": 833
},
{
"epoch": 0.5031674208144796,
"grad_norm": 1.6890268292680732,
"learning_rate": 1.0301096639077229e-05,
"loss": 2.4126,
"step": 834
},
{
"epoch": 0.5037707390648567,
"grad_norm": 1.7252703313236202,
"learning_rate": 1.0281676365895939e-05,
"loss": 2.2905,
"step": 835
},
{
"epoch": 0.5043740573152338,
"grad_norm": 1.7486803185975979,
"learning_rate": 1.02622550294779e-05,
"loss": 2.4323,
"step": 836
},
{
"epoch": 0.5049773755656108,
"grad_norm": 1.8603836336573905,
"learning_rate": 1.0242832703132353e-05,
"loss": 2.3632,
"step": 837
},
{
"epoch": 0.5055806938159879,
"grad_norm": 1.9392259883391494,
"learning_rate": 1.022340946017226e-05,
"loss": 2.3691,
"step": 838
},
{
"epoch": 0.506184012066365,
"grad_norm": 1.806362355351131,
"learning_rate": 1.0203985373914056e-05,
"loss": 2.4253,
"step": 839
},
{
"epoch": 0.5067873303167421,
"grad_norm": 1.753984778463969,
"learning_rate": 1.0184560517677353e-05,
"loss": 2.3383,
"step": 840
},
{
"epoch": 0.5073906485671191,
"grad_norm": 1.743749696683883,
"learning_rate": 1.0165134964784669e-05,
"loss": 2.3721,
"step": 841
},
{
"epoch": 0.5079939668174962,
"grad_norm": 1.8626358872488646,
"learning_rate": 1.0145708788561157e-05,
"loss": 2.4944,
"step": 842
},
{
"epoch": 0.5085972850678733,
"grad_norm": 1.7659918339275127,
"learning_rate": 1.012628206233432e-05,
"loss": 2.3927,
"step": 843
},
{
"epoch": 0.5092006033182503,
"grad_norm": 1.768537603828139,
"learning_rate": 1.0106854859433734e-05,
"loss": 2.3068,
"step": 844
},
{
"epoch": 0.5098039215686274,
"grad_norm": 1.6921895602019934,
"learning_rate": 1.0087427253190775e-05,
"loss": 2.3904,
"step": 845
},
{
"epoch": 0.5104072398190045,
"grad_norm": 1.8514273394892835,
"learning_rate": 1.0067999316938348e-05,
"loss": 2.3424,
"step": 846
},
{
"epoch": 0.5110105580693816,
"grad_norm": 1.793772258787467,
"learning_rate": 1.0048571124010597e-05,
"loss": 2.398,
"step": 847
},
{
"epoch": 0.5116138763197586,
"grad_norm": 1.6918053539167304,
"learning_rate": 1.0029142747742637e-05,
"loss": 2.3746,
"step": 848
},
{
"epoch": 0.5122171945701357,
"grad_norm": 1.6101656310591637,
"learning_rate": 1.0009714261470274e-05,
"loss": 2.3165,
"step": 849
},
{
"epoch": 0.5128205128205128,
"grad_norm": 1.700912056066946,
"learning_rate": 9.990285738529733e-06,
"loss": 2.4027,
"step": 850
},
{
"epoch": 0.5134238310708898,
"grad_norm": 1.7795095656194206,
"learning_rate": 9.970857252257368e-06,
"loss": 2.4301,
"step": 851
},
{
"epoch": 0.5140271493212669,
"grad_norm": 1.8166978303180596,
"learning_rate": 9.951428875989408e-06,
"loss": 2.3943,
"step": 852
},
{
"epoch": 0.514630467571644,
"grad_norm": 1.7906994439446355,
"learning_rate": 9.932000683061654e-06,
"loss": 2.3666,
"step": 853
},
{
"epoch": 0.5152337858220211,
"grad_norm": 1.8507567275198866,
"learning_rate": 9.912572746809228e-06,
"loss": 2.4303,
"step": 854
},
{
"epoch": 0.5158371040723982,
"grad_norm": 1.7029190223017787,
"learning_rate": 9.89314514056627e-06,
"loss": 2.3407,
"step": 855
},
{
"epoch": 0.5164404223227753,
"grad_norm": 1.8029171374885462,
"learning_rate": 9.873717937665683e-06,
"loss": 2.3434,
"step": 856
},
{
"epoch": 0.5170437405731524,
"grad_norm": 1.8949933952050677,
"learning_rate": 9.854291211438846e-06,
"loss": 2.451,
"step": 857
},
{
"epoch": 0.5176470588235295,
"grad_norm": 1.85164457739374,
"learning_rate": 9.834865035215333e-06,
"loss": 2.3654,
"step": 858
},
{
"epoch": 0.5182503770739065,
"grad_norm": 1.7585113182468484,
"learning_rate": 9.81543948232265e-06,
"loss": 2.4026,
"step": 859
},
{
"epoch": 0.5188536953242836,
"grad_norm": 1.8861048330861192,
"learning_rate": 9.79601462608595e-06,
"loss": 2.3372,
"step": 860
},
{
"epoch": 0.5194570135746607,
"grad_norm": 1.7958907530874708,
"learning_rate": 9.776590539827745e-06,
"loss": 2.4609,
"step": 861
},
{
"epoch": 0.5200603318250377,
"grad_norm": 1.7290587905372026,
"learning_rate": 9.757167296867652e-06,
"loss": 2.331,
"step": 862
},
{
"epoch": 0.5206636500754148,
"grad_norm": 1.7389613818950007,
"learning_rate": 9.737744970522101e-06,
"loss": 2.3763,
"step": 863
},
{
"epoch": 0.5212669683257919,
"grad_norm": 1.8264438820594222,
"learning_rate": 9.718323634104063e-06,
"loss": 2.3895,
"step": 864
},
{
"epoch": 0.521870286576169,
"grad_norm": 1.786930557626034,
"learning_rate": 9.698903360922773e-06,
"loss": 2.3234,
"step": 865
},
{
"epoch": 0.522473604826546,
"grad_norm": 2.0432045735745468,
"learning_rate": 9.67948422428345e-06,
"loss": 2.4544,
"step": 866
},
{
"epoch": 0.5230769230769231,
"grad_norm": 1.7232243886550707,
"learning_rate": 9.660066297487024e-06,
"loss": 2.4616,
"step": 867
},
{
"epoch": 0.5236802413273002,
"grad_norm": 1.7777195779321493,
"learning_rate": 9.640649653829856e-06,
"loss": 2.3366,
"step": 868
},
{
"epoch": 0.5242835595776772,
"grad_norm": 1.7329928051061616,
"learning_rate": 9.621234366603474e-06,
"loss": 2.3406,
"step": 869
},
{
"epoch": 0.5248868778280543,
"grad_norm": 1.6947835637273732,
"learning_rate": 9.601820509094272e-06,
"loss": 2.399,
"step": 870
},
{
"epoch": 0.5254901960784314,
"grad_norm": 1.7646228167407467,
"learning_rate": 9.582408154583256e-06,
"loss": 2.4245,
"step": 871
},
{
"epoch": 0.5260935143288085,
"grad_norm": 1.7818475873068609,
"learning_rate": 9.56299737634575e-06,
"loss": 2.4695,
"step": 872
},
{
"epoch": 0.5266968325791855,
"grad_norm": 1.6020743028303097,
"learning_rate": 9.543588247651141e-06,
"loss": 2.3758,
"step": 873
},
{
"epoch": 0.5273001508295626,
"grad_norm": 2.0059468394494795,
"learning_rate": 9.524180841762577e-06,
"loss": 2.3622,
"step": 874
},
{
"epoch": 0.5279034690799397,
"grad_norm": 1.8138883017044576,
"learning_rate": 9.504775231936716e-06,
"loss": 2.439,
"step": 875
},
{
"epoch": 0.5285067873303168,
"grad_norm": 1.6606861902316774,
"learning_rate": 9.485371491423421e-06,
"loss": 2.3756,
"step": 876
},
{
"epoch": 0.5291101055806938,
"grad_norm": 1.9247118890514288,
"learning_rate": 9.46596969346551e-06,
"loss": 2.4956,
"step": 877
},
{
"epoch": 0.5297134238310709,
"grad_norm": 1.6729495810971284,
"learning_rate": 9.446569911298469e-06,
"loss": 2.4496,
"step": 878
},
{
"epoch": 0.530316742081448,
"grad_norm": 1.721608451090351,
"learning_rate": 9.427172218150166e-06,
"loss": 2.4336,
"step": 879
},
{
"epoch": 0.530920060331825,
"grad_norm": 1.7440028090249518,
"learning_rate": 9.407776687240591e-06,
"loss": 2.399,
"step": 880
},
{
"epoch": 0.5315233785822021,
"grad_norm": 1.852395281031802,
"learning_rate": 9.388383391781576e-06,
"loss": 2.3752,
"step": 881
},
{
"epoch": 0.5321266968325792,
"grad_norm": 2.063001040819312,
"learning_rate": 9.3689924049765e-06,
"loss": 2.4234,
"step": 882
},
{
"epoch": 0.5327300150829563,
"grad_norm": 2.1374286448897055,
"learning_rate": 9.34960380002004e-06,
"loss": 2.4208,
"step": 883
},
{
"epoch": 0.5333333333333333,
"grad_norm": 1.7187678468330105,
"learning_rate": 9.330217650097881e-06,
"loss": 2.3219,
"step": 884
},
{
"epoch": 0.5339366515837104,
"grad_norm": 1.7650084385659235,
"learning_rate": 9.310834028386436e-06,
"loss": 2.3753,
"step": 885
},
{
"epoch": 0.5345399698340875,
"grad_norm": 1.7280929880181042,
"learning_rate": 9.291453008052582e-06,
"loss": 2.3902,
"step": 886
},
{
"epoch": 0.5351432880844645,
"grad_norm": 1.9002978359745568,
"learning_rate": 9.272074662253368e-06,
"loss": 2.3117,
"step": 887
},
{
"epoch": 0.5357466063348416,
"grad_norm": 1.95189657556873,
"learning_rate": 9.252699064135759e-06,
"loss": 2.4177,
"step": 888
},
{
"epoch": 0.5363499245852187,
"grad_norm": 2.014163602151102,
"learning_rate": 9.233326286836335e-06,
"loss": 2.4068,
"step": 889
},
{
"epoch": 0.5369532428355958,
"grad_norm": 1.8364320842962563,
"learning_rate": 9.213956403481037e-06,
"loss": 2.3912,
"step": 890
},
{
"epoch": 0.5375565610859728,
"grad_norm": 1.7499298359418458,
"learning_rate": 9.194589487184884e-06,
"loss": 2.3843,
"step": 891
},
{
"epoch": 0.5381598793363499,
"grad_norm": 1.686625665723383,
"learning_rate": 9.175225611051684e-06,
"loss": 2.3425,
"step": 892
},
{
"epoch": 0.538763197586727,
"grad_norm": 1.7193751821408507,
"learning_rate": 9.155864848173782e-06,
"loss": 2.3955,
"step": 893
},
{
"epoch": 0.539366515837104,
"grad_norm": 1.7894758836039961,
"learning_rate": 9.136507271631764e-06,
"loss": 2.3973,
"step": 894
},
{
"epoch": 0.5399698340874811,
"grad_norm": 1.8264782691225658,
"learning_rate": 9.117152954494195e-06,
"loss": 2.3688,
"step": 895
},
{
"epoch": 0.5405731523378582,
"grad_norm": 1.7326254631321008,
"learning_rate": 9.097801969817324e-06,
"loss": 2.328,
"step": 896
},
{
"epoch": 0.5411764705882353,
"grad_norm": 1.6970752681437251,
"learning_rate": 9.078454390644841e-06,
"loss": 2.3562,
"step": 897
},
{
"epoch": 0.5417797888386123,
"grad_norm": 1.874581417655975,
"learning_rate": 9.05911029000756e-06,
"loss": 2.3387,
"step": 898
},
{
"epoch": 0.5423831070889894,
"grad_norm": 1.7409066719499708,
"learning_rate": 9.039769740923183e-06,
"loss": 2.3901,
"step": 899
},
{
"epoch": 0.5429864253393665,
"grad_norm": 1.9842969144165807,
"learning_rate": 9.020432816395993e-06,
"loss": 2.4248,
"step": 900
},
{
"epoch": 0.5435897435897435,
"grad_norm": 1.9292319261176405,
"learning_rate": 9.001099589416602e-06,
"loss": 2.3535,
"step": 901
},
{
"epoch": 0.5441930618401206,
"grad_norm": 1.7634105099688444,
"learning_rate": 8.981770132961649e-06,
"loss": 2.3952,
"step": 902
},
{
"epoch": 0.5447963800904977,
"grad_norm": 1.7210160546146538,
"learning_rate": 8.962444519993558e-06,
"loss": 2.4644,
"step": 903
},
{
"epoch": 0.5453996983408748,
"grad_norm": 2.0164293222578227,
"learning_rate": 8.943122823460235e-06,
"loss": 2.4605,
"step": 904
},
{
"epoch": 0.5460030165912518,
"grad_norm": 1.640145013684435,
"learning_rate": 8.92380511629481e-06,
"loss": 2.3656,
"step": 905
},
{
"epoch": 0.5466063348416289,
"grad_norm": 1.7608736859619678,
"learning_rate": 8.904491471415343e-06,
"loss": 2.4277,
"step": 906
},
{
"epoch": 0.5472096530920061,
"grad_norm": 1.689731655791089,
"learning_rate": 8.885181961724575e-06,
"loss": 2.3845,
"step": 907
},
{
"epoch": 0.5478129713423832,
"grad_norm": 2.006248612420757,
"learning_rate": 8.865876660109625e-06,
"loss": 2.3648,
"step": 908
},
{
"epoch": 0.5484162895927602,
"grad_norm": 2.0601382092003004,
"learning_rate": 8.846575639441732e-06,
"loss": 2.3658,
"step": 909
},
{
"epoch": 0.5490196078431373,
"grad_norm": 1.758520194938707,
"learning_rate": 8.827278972575984e-06,
"loss": 2.4306,
"step": 910
},
{
"epoch": 0.5496229260935144,
"grad_norm": 1.8296491728786637,
"learning_rate": 8.807986732351018e-06,
"loss": 2.3984,
"step": 911
},
{
"epoch": 0.5502262443438914,
"grad_norm": 1.8016195740772143,
"learning_rate": 8.788698991588782e-06,
"loss": 2.3319,
"step": 912
},
{
"epoch": 0.5508295625942685,
"grad_norm": 1.9273005935291878,
"learning_rate": 8.769415823094221e-06,
"loss": 2.373,
"step": 913
},
{
"epoch": 0.5514328808446456,
"grad_norm": 1.7813908784365993,
"learning_rate": 8.750137299655034e-06,
"loss": 2.3778,
"step": 914
},
{
"epoch": 0.5520361990950227,
"grad_norm": 1.9254300444919479,
"learning_rate": 8.730863494041379e-06,
"loss": 2.3988,
"step": 915
},
{
"epoch": 0.5526395173453997,
"grad_norm": 1.934896760614601,
"learning_rate": 8.711594479005614e-06,
"loss": 2.3483,
"step": 916
},
{
"epoch": 0.5532428355957768,
"grad_norm": 1.8095850637153221,
"learning_rate": 8.692330327282003e-06,
"loss": 2.3025,
"step": 917
},
{
"epoch": 0.5538461538461539,
"grad_norm": 1.7458152707487324,
"learning_rate": 8.673071111586463e-06,
"loss": 2.3381,
"step": 918
},
{
"epoch": 0.554449472096531,
"grad_norm": 1.72088810393769,
"learning_rate": 8.653816904616272e-06,
"loss": 2.3856,
"step": 919
},
{
"epoch": 0.555052790346908,
"grad_norm": 1.7299264791576274,
"learning_rate": 8.634567779049807e-06,
"loss": 2.3233,
"step": 920
},
{
"epoch": 0.5556561085972851,
"grad_norm": 1.8697572352042986,
"learning_rate": 8.615323807546258e-06,
"loss": 2.4076,
"step": 921
},
{
"epoch": 0.5562594268476622,
"grad_norm": 1.65992529283014,
"learning_rate": 8.596085062745375e-06,
"loss": 2.3409,
"step": 922
},
{
"epoch": 0.5568627450980392,
"grad_norm": 1.842335383881053,
"learning_rate": 8.576851617267151e-06,
"loss": 2.3512,
"step": 923
},
{
"epoch": 0.5574660633484163,
"grad_norm": 1.8043016583386247,
"learning_rate": 8.557623543711603e-06,
"loss": 2.4132,
"step": 924
},
{
"epoch": 0.5580693815987934,
"grad_norm": 1.8017666828160193,
"learning_rate": 8.538400914658456e-06,
"loss": 2.4248,
"step": 925
},
{
"epoch": 0.5586726998491705,
"grad_norm": 1.8679972521400925,
"learning_rate": 8.519183802666891e-06,
"loss": 2.411,
"step": 926
},
{
"epoch": 0.5592760180995475,
"grad_norm": 1.7033984464574257,
"learning_rate": 8.499972280275259e-06,
"loss": 2.3492,
"step": 927
},
{
"epoch": 0.5598793363499246,
"grad_norm": 1.8740497521935295,
"learning_rate": 8.480766420000815e-06,
"loss": 2.3457,
"step": 928
},
{
"epoch": 0.5604826546003017,
"grad_norm": 1.6508049361676014,
"learning_rate": 8.46156629433944e-06,
"loss": 2.3904,
"step": 929
},
{
"epoch": 0.5610859728506787,
"grad_norm": 1.916157350258544,
"learning_rate": 8.442371975765368e-06,
"loss": 2.506,
"step": 930
},
{
"epoch": 0.5616892911010558,
"grad_norm": 1.6300920878061467,
"learning_rate": 8.423183536730919e-06,
"loss": 2.3701,
"step": 931
},
{
"epoch": 0.5622926093514329,
"grad_norm": 1.7723202663180058,
"learning_rate": 8.404001049666211e-06,
"loss": 2.3965,
"step": 932
},
{
"epoch": 0.56289592760181,
"grad_norm": 1.8619013776832338,
"learning_rate": 8.384824586978896e-06,
"loss": 2.4014,
"step": 933
},
{
"epoch": 0.563499245852187,
"grad_norm": 1.9071332908477077,
"learning_rate": 8.365654221053894e-06,
"loss": 2.3833,
"step": 934
},
{
"epoch": 0.5641025641025641,
"grad_norm": 1.691778794010645,
"learning_rate": 8.346490024253103e-06,
"loss": 2.3487,
"step": 935
},
{
"epoch": 0.5647058823529412,
"grad_norm": 1.787146332821833,
"learning_rate": 8.327332068915141e-06,
"loss": 2.2985,
"step": 936
},
{
"epoch": 0.5653092006033182,
"grad_norm": 2.1950823456779345,
"learning_rate": 8.308180427355062e-06,
"loss": 2.3126,
"step": 937
},
{
"epoch": 0.5659125188536953,
"grad_norm": 1.775970367583555,
"learning_rate": 8.28903517186409e-06,
"loss": 2.3708,
"step": 938
},
{
"epoch": 0.5665158371040724,
"grad_norm": 1.8089771718898564,
"learning_rate": 8.269896374709345e-06,
"loss": 2.3944,
"step": 939
},
{
"epoch": 0.5671191553544495,
"grad_norm": 1.78584822525857,
"learning_rate": 8.250764108133562e-06,
"loss": 2.3171,
"step": 940
},
{
"epoch": 0.5677224736048265,
"grad_norm": 1.9530454117081384,
"learning_rate": 8.231638444354836e-06,
"loss": 2.4186,
"step": 941
},
{
"epoch": 0.5683257918552036,
"grad_norm": 1.768100357370704,
"learning_rate": 8.212519455566328e-06,
"loss": 2.3517,
"step": 942
},
{
"epoch": 0.5689291101055807,
"grad_norm": 1.7791100202829597,
"learning_rate": 8.193407213936014e-06,
"loss": 2.3895,
"step": 943
},
{
"epoch": 0.5695324283559577,
"grad_norm": 1.8015643530446919,
"learning_rate": 8.174301791606384e-06,
"loss": 2.4025,
"step": 944
},
{
"epoch": 0.5701357466063348,
"grad_norm": 1.731486155885199,
"learning_rate": 8.15520326069421e-06,
"loss": 2.4665,
"step": 945
},
{
"epoch": 0.5707390648567119,
"grad_norm": 1.8081292377643385,
"learning_rate": 8.136111693290231e-06,
"loss": 2.2807,
"step": 946
},
{
"epoch": 0.571342383107089,
"grad_norm": 1.712750921195952,
"learning_rate": 8.117027161458917e-06,
"loss": 2.3717,
"step": 947
},
{
"epoch": 0.571945701357466,
"grad_norm": 1.8003483361496844,
"learning_rate": 8.097949737238172e-06,
"loss": 2.2899,
"step": 948
},
{
"epoch": 0.5725490196078431,
"grad_norm": 1.7573744920295975,
"learning_rate": 8.078879492639069e-06,
"loss": 2.274,
"step": 949
},
{
"epoch": 0.5731523378582202,
"grad_norm": 1.983102632284402,
"learning_rate": 8.05981649964559e-06,
"loss": 2.4041,
"step": 950
},
{
"epoch": 0.5737556561085972,
"grad_norm": 1.7185303365501152,
"learning_rate": 8.040760830214334e-06,
"loss": 2.4131,
"step": 951
},
{
"epoch": 0.5743589743589743,
"grad_norm": 1.8744380753797991,
"learning_rate": 8.021712556274264e-06,
"loss": 2.3407,
"step": 952
},
{
"epoch": 0.5749622926093514,
"grad_norm": 1.9017253226675055,
"learning_rate": 8.002671749726425e-06,
"loss": 2.3663,
"step": 953
},
{
"epoch": 0.5755656108597285,
"grad_norm": 1.6585789142483918,
"learning_rate": 7.983638482443671e-06,
"loss": 2.325,
"step": 954
},
{
"epoch": 0.5761689291101055,
"grad_norm": 1.7262944652057712,
"learning_rate": 7.964612826270399e-06,
"loss": 2.3591,
"step": 955
},
{
"epoch": 0.5767722473604826,
"grad_norm": 1.760968270313303,
"learning_rate": 7.945594853022283e-06,
"loss": 2.4155,
"step": 956
},
{
"epoch": 0.5773755656108597,
"grad_norm": 1.6969107057603918,
"learning_rate": 7.926584634485988e-06,
"loss": 2.4495,
"step": 957
},
{
"epoch": 0.5779788838612367,
"grad_norm": 1.7402844415180172,
"learning_rate": 7.907582242418916e-06,
"loss": 2.3239,
"step": 958
},
{
"epoch": 0.5785822021116139,
"grad_norm": 1.7038508656877012,
"learning_rate": 7.888587748548918e-06,
"loss": 2.415,
"step": 959
},
{
"epoch": 0.579185520361991,
"grad_norm": 1.7929534294396436,
"learning_rate": 7.86960122457404e-06,
"loss": 2.4077,
"step": 960
},
{
"epoch": 0.5797888386123681,
"grad_norm": 1.7460741900975971,
"learning_rate": 7.850622742162236e-06,
"loss": 2.3493,
"step": 961
},
{
"epoch": 0.5803921568627451,
"grad_norm": 1.703203253822099,
"learning_rate": 7.831652372951109e-06,
"loss": 2.3821,
"step": 962
},
{
"epoch": 0.5809954751131222,
"grad_norm": 1.7364171065552723,
"learning_rate": 7.812690188547645e-06,
"loss": 2.3724,
"step": 963
},
{
"epoch": 0.5815987933634993,
"grad_norm": 1.9274640570228776,
"learning_rate": 7.793736260527922e-06,
"loss": 2.4338,
"step": 964
},
{
"epoch": 0.5822021116138764,
"grad_norm": 1.706481470208011,
"learning_rate": 7.774790660436857e-06,
"loss": 2.4833,
"step": 965
},
{
"epoch": 0.5828054298642534,
"grad_norm": 1.8296308204177585,
"learning_rate": 7.75585345978794e-06,
"loss": 2.4156,
"step": 966
},
{
"epoch": 0.5834087481146305,
"grad_norm": 1.8143140751021931,
"learning_rate": 7.736924730062947e-06,
"loss": 2.3107,
"step": 967
},
{
"epoch": 0.5840120663650076,
"grad_norm": 2.1068370693786953,
"learning_rate": 7.718004542711677e-06,
"loss": 2.4055,
"step": 968
},
{
"epoch": 0.5846153846153846,
"grad_norm": 1.6564875004624282,
"learning_rate": 7.699092969151698e-06,
"loss": 2.3797,
"step": 969
},
{
"epoch": 0.5852187028657617,
"grad_norm": 1.712883405875796,
"learning_rate": 7.680190080768046e-06,
"loss": 2.4207,
"step": 970
},
{
"epoch": 0.5858220211161388,
"grad_norm": 1.7983081675568953,
"learning_rate": 7.661295948912988e-06,
"loss": 2.38,
"step": 971
},
{
"epoch": 0.5864253393665159,
"grad_norm": 1.669670969733644,
"learning_rate": 7.642410644905726e-06,
"loss": 2.3756,
"step": 972
},
{
"epoch": 0.5870286576168929,
"grad_norm": 1.7493034103928633,
"learning_rate": 7.623534240032153e-06,
"loss": 2.364,
"step": 973
},
{
"epoch": 0.58763197586727,
"grad_norm": 1.7740033814156886,
"learning_rate": 7.604666805544561e-06,
"loss": 2.3382,
"step": 974
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.9391123215492103,
"learning_rate": 7.585808412661379e-06,
"loss": 2.3416,
"step": 975
},
{
"epoch": 0.5888386123680242,
"grad_norm": 1.8938327666813242,
"learning_rate": 7.566959132566914e-06,
"loss": 2.4275,
"step": 976
},
{
"epoch": 0.5894419306184012,
"grad_norm": 1.6461158841876589,
"learning_rate": 7.548119036411077e-06,
"loss": 2.3116,
"step": 977
},
{
"epoch": 0.5900452488687783,
"grad_norm": 1.7377089925600602,
"learning_rate": 7.529288195309102e-06,
"loss": 2.4299,
"step": 978
},
{
"epoch": 0.5906485671191554,
"grad_norm": 1.7021528327172468,
"learning_rate": 7.5104666803413015e-06,
"loss": 2.4079,
"step": 979
},
{
"epoch": 0.5912518853695324,
"grad_norm": 2.549582267427392,
"learning_rate": 7.4916545625527745e-06,
"loss": 2.4804,
"step": 980
},
{
"epoch": 0.5918552036199095,
"grad_norm": 1.731330521143831,
"learning_rate": 7.472851912953152e-06,
"loss": 2.3502,
"step": 981
},
{
"epoch": 0.5924585218702866,
"grad_norm": 1.750250274307276,
"learning_rate": 7.45405880251633e-06,
"loss": 2.3992,
"step": 982
},
{
"epoch": 0.5930618401206637,
"grad_norm": 1.722284719736197,
"learning_rate": 7.435275302180187e-06,
"loss": 2.3836,
"step": 983
},
{
"epoch": 0.5936651583710407,
"grad_norm": 1.761282630994155,
"learning_rate": 7.416501482846341e-06,
"loss": 2.4412,
"step": 984
},
{
"epoch": 0.5942684766214178,
"grad_norm": 1.8234928253520493,
"learning_rate": 7.397737415379853e-06,
"loss": 2.4086,
"step": 985
},
{
"epoch": 0.5948717948717949,
"grad_norm": 1.933085444592592,
"learning_rate": 7.378983170608982e-06,
"loss": 2.3915,
"step": 986
},
{
"epoch": 0.5954751131221719,
"grad_norm": 1.6863684524777538,
"learning_rate": 7.360238819324903e-06,
"loss": 2.3106,
"step": 987
},
{
"epoch": 0.596078431372549,
"grad_norm": 1.654594116317452,
"learning_rate": 7.341504432281459e-06,
"loss": 2.4465,
"step": 988
},
{
"epoch": 0.5966817496229261,
"grad_norm": 1.7758673763312907,
"learning_rate": 7.322780080194867e-06,
"loss": 2.4278,
"step": 989
},
{
"epoch": 0.5972850678733032,
"grad_norm": 1.6980250876911802,
"learning_rate": 7.304065833743475e-06,
"loss": 2.3902,
"step": 990
},
{
"epoch": 0.5978883861236802,
"grad_norm": 1.7745567427843472,
"learning_rate": 7.285361763567477e-06,
"loss": 2.4236,
"step": 991
},
{
"epoch": 0.5984917043740573,
"grad_norm": 1.652280491444713,
"learning_rate": 7.266667940268668e-06,
"loss": 2.3634,
"step": 992
},
{
"epoch": 0.5990950226244344,
"grad_norm": 1.8981724529225397,
"learning_rate": 7.24798443441015e-06,
"loss": 2.3727,
"step": 993
},
{
"epoch": 0.5996983408748114,
"grad_norm": 1.7079496543700798,
"learning_rate": 7.22931131651609e-06,
"loss": 2.3931,
"step": 994
},
{
"epoch": 0.6003016591251885,
"grad_norm": 1.9997843698547484,
"learning_rate": 7.210648657071433e-06,
"loss": 2.4107,
"step": 995
},
{
"epoch": 0.6009049773755656,
"grad_norm": 1.8744026362961528,
"learning_rate": 7.191996526521661e-06,
"loss": 2.3737,
"step": 996
},
{
"epoch": 0.6015082956259427,
"grad_norm": 1.7267988037294835,
"learning_rate": 7.173354995272499e-06,
"loss": 2.4609,
"step": 997
},
{
"epoch": 0.6021116138763197,
"grad_norm": 1.7375446041528633,
"learning_rate": 7.154724133689677e-06,
"loss": 2.3567,
"step": 998
},
{
"epoch": 0.6027149321266968,
"grad_norm": 1.789874796279508,
"learning_rate": 7.1361040120986394e-06,
"loss": 2.3626,
"step": 999
},
{
"epoch": 0.6033182503770739,
"grad_norm": 1.8360650580584557,
"learning_rate": 7.117494700784292e-06,
"loss": 2.3746,
"step": 1000
},
{
"epoch": 0.6039215686274509,
"grad_norm": 1.8647510973629962,
"learning_rate": 7.098896269990743e-06,
"loss": 2.4365,
"step": 1001
},
{
"epoch": 0.604524886877828,
"grad_norm": 1.6999258224571074,
"learning_rate": 7.080308789921019e-06,
"loss": 2.385,
"step": 1002
},
{
"epoch": 0.6051282051282051,
"grad_norm": 1.7305006652986321,
"learning_rate": 7.061732330736823e-06,
"loss": 2.4122,
"step": 1003
},
{
"epoch": 0.6057315233785822,
"grad_norm": 1.8268769233660138,
"learning_rate": 7.04316696255825e-06,
"loss": 2.3716,
"step": 1004
},
{
"epoch": 0.6063348416289592,
"grad_norm": 1.680563312051675,
"learning_rate": 7.024612755463529e-06,
"loss": 2.453,
"step": 1005
},
{
"epoch": 0.6069381598793363,
"grad_norm": 1.647602536203344,
"learning_rate": 7.006069779488761e-06,
"loss": 2.3768,
"step": 1006
},
{
"epoch": 0.6075414781297134,
"grad_norm": 1.8306490002639166,
"learning_rate": 6.9875381046276605e-06,
"loss": 2.3631,
"step": 1007
},
{
"epoch": 0.6081447963800904,
"grad_norm": 1.713462266890918,
"learning_rate": 6.969017800831273e-06,
"loss": 2.3453,
"step": 1008
},
{
"epoch": 0.6087481146304675,
"grad_norm": 2.08603045309057,
"learning_rate": 6.95050893800773e-06,
"loss": 2.3204,
"step": 1009
},
{
"epoch": 0.6093514328808446,
"grad_norm": 1.9474114701647294,
"learning_rate": 6.9320115860219705e-06,
"loss": 2.3946,
"step": 1010
},
{
"epoch": 0.6099547511312218,
"grad_norm": 1.8838133566846818,
"learning_rate": 6.913525814695492e-06,
"loss": 2.4846,
"step": 1011
},
{
"epoch": 0.6105580693815988,
"grad_norm": 1.837197570579041,
"learning_rate": 6.8950516938060716e-06,
"loss": 2.4237,
"step": 1012
},
{
"epoch": 0.6111613876319759,
"grad_norm": 1.800008780512233,
"learning_rate": 6.87658929308751e-06,
"loss": 2.3863,
"step": 1013
},
{
"epoch": 0.611764705882353,
"grad_norm": 1.8475263728724154,
"learning_rate": 6.8581386822293765e-06,
"loss": 2.3558,
"step": 1014
},
{
"epoch": 0.6123680241327301,
"grad_norm": 1.8487028902348472,
"learning_rate": 6.839699930876727e-06,
"loss": 2.2994,
"step": 1015
},
{
"epoch": 0.6129713423831071,
"grad_norm": 1.8095359265876,
"learning_rate": 6.821273108629853e-06,
"loss": 2.3966,
"step": 1016
},
{
"epoch": 0.6135746606334842,
"grad_norm": 2.1655842902381837,
"learning_rate": 6.802858285044025e-06,
"loss": 2.3741,
"step": 1017
},
{
"epoch": 0.6141779788838613,
"grad_norm": 1.9282607392510218,
"learning_rate": 6.784455529629218e-06,
"loss": 2.411,
"step": 1018
},
{
"epoch": 0.6147812971342383,
"grad_norm": 2.0907716606886937,
"learning_rate": 6.76606491184985e-06,
"loss": 2.4071,
"step": 1019
},
{
"epoch": 0.6153846153846154,
"grad_norm": 1.8869508977324307,
"learning_rate": 6.747686501124531e-06,
"loss": 2.421,
"step": 1020
},
{
"epoch": 0.6159879336349925,
"grad_norm": 1.7052193424579607,
"learning_rate": 6.729320366825785e-06,
"loss": 2.3511,
"step": 1021
},
{
"epoch": 0.6165912518853696,
"grad_norm": 1.7106004296023014,
"learning_rate": 6.710966578279802e-06,
"loss": 2.3493,
"step": 1022
},
{
"epoch": 0.6171945701357466,
"grad_norm": 1.6618114403839686,
"learning_rate": 6.692625204766172e-06,
"loss": 2.3097,
"step": 1023
},
{
"epoch": 0.6177978883861237,
"grad_norm": 1.7345051493958623,
"learning_rate": 6.6742963155176185e-06,
"loss": 2.426,
"step": 1024
},
{
"epoch": 0.6184012066365008,
"grad_norm": 1.9925628567257796,
"learning_rate": 6.655979979719744e-06,
"loss": 2.3454,
"step": 1025
},
{
"epoch": 0.6190045248868778,
"grad_norm": 1.8951801846690113,
"learning_rate": 6.63767626651076e-06,
"loss": 2.4046,
"step": 1026
},
{
"epoch": 0.6196078431372549,
"grad_norm": 1.8958330452775232,
"learning_rate": 6.619385244981233e-06,
"loss": 2.3169,
"step": 1027
},
{
"epoch": 0.620211161387632,
"grad_norm": 1.850986974478954,
"learning_rate": 6.601106984173835e-06,
"loss": 2.3384,
"step": 1028
},
{
"epoch": 0.6208144796380091,
"grad_norm": 1.830337165876901,
"learning_rate": 6.582841553083053e-06,
"loss": 2.3646,
"step": 1029
},
{
"epoch": 0.6214177978883861,
"grad_norm": 1.7035092444406967,
"learning_rate": 6.5645890206549566e-06,
"loss": 2.4757,
"step": 1030
},
{
"epoch": 0.6220211161387632,
"grad_norm": 1.7086779285604496,
"learning_rate": 6.546349455786926e-06,
"loss": 2.3828,
"step": 1031
},
{
"epoch": 0.6226244343891403,
"grad_norm": 1.8135018764370858,
"learning_rate": 6.528122927327386e-06,
"loss": 2.4015,
"step": 1032
},
{
"epoch": 0.6232277526395174,
"grad_norm": 1.7536779084006537,
"learning_rate": 6.5099095040755645e-06,
"loss": 2.423,
"step": 1033
},
{
"epoch": 0.6238310708898944,
"grad_norm": 1.8292406773895364,
"learning_rate": 6.491709254781211e-06,
"loss": 2.3724,
"step": 1034
},
{
"epoch": 0.6244343891402715,
"grad_norm": 1.7869792886207858,
"learning_rate": 6.473522248144359e-06,
"loss": 2.3563,
"step": 1035
},
{
"epoch": 0.6250377073906486,
"grad_norm": 1.8002600022182031,
"learning_rate": 6.455348552815042e-06,
"loss": 2.4471,
"step": 1036
},
{
"epoch": 0.6256410256410256,
"grad_norm": 1.8555130033856473,
"learning_rate": 6.437188237393055e-06,
"loss": 2.3658,
"step": 1037
},
{
"epoch": 0.6262443438914027,
"grad_norm": 1.7962582816756645,
"learning_rate": 6.419041370427686e-06,
"loss": 2.3816,
"step": 1038
},
{
"epoch": 0.6268476621417798,
"grad_norm": 1.6546276512104834,
"learning_rate": 6.400908020417466e-06,
"loss": 2.4006,
"step": 1039
},
{
"epoch": 0.6274509803921569,
"grad_norm": 1.8099892370074893,
"learning_rate": 6.382788255809893e-06,
"loss": 2.3307,
"step": 1040
},
{
"epoch": 0.6274509803921569,
"eval_loss": 2.383073091506958,
"eval_runtime": 22.1846,
"eval_samples_per_second": 3.967,
"eval_steps_per_second": 0.496,
"step": 1040
},
{
"epoch": 0.6280542986425339,
"grad_norm": 1.7763186085366514,
"learning_rate": 6.364682145001193e-06,
"loss": 2.4669,
"step": 1041
},
{
"epoch": 0.628657616892911,
"grad_norm": 1.7270436347923916,
"learning_rate": 6.34658975633605e-06,
"loss": 2.4226,
"step": 1042
},
{
"epoch": 0.6292609351432881,
"grad_norm": 1.7457663835716066,
"learning_rate": 6.3285111581073535e-06,
"loss": 2.4119,
"step": 1043
},
{
"epoch": 0.6298642533936651,
"grad_norm": 1.871543890238202,
"learning_rate": 6.310446418555934e-06,
"loss": 2.3105,
"step": 1044
},
{
"epoch": 0.6304675716440422,
"grad_norm": 1.6143421614477536,
"learning_rate": 6.292395605870314e-06,
"loss": 2.3267,
"step": 1045
},
{
"epoch": 0.6310708898944193,
"grad_norm": 1.7964895997705217,
"learning_rate": 6.2743587881864485e-06,
"loss": 2.4736,
"step": 1046
},
{
"epoch": 0.6316742081447964,
"grad_norm": 1.7328805184139204,
"learning_rate": 6.256336033587459e-06,
"loss": 2.3039,
"step": 1047
},
{
"epoch": 0.6322775263951734,
"grad_norm": 1.7477853637357237,
"learning_rate": 6.2383274101033865e-06,
"loss": 2.3596,
"step": 1048
},
{
"epoch": 0.6328808446455505,
"grad_norm": 1.7330713096361468,
"learning_rate": 6.220332985710936e-06,
"loss": 2.4127,
"step": 1049
},
{
"epoch": 0.6334841628959276,
"grad_norm": 1.9186686697353552,
"learning_rate": 6.202352828333211e-06,
"loss": 2.3919,
"step": 1050
},
{
"epoch": 0.6340874811463046,
"grad_norm": 1.8655237304210366,
"learning_rate": 6.18438700583946e-06,
"loss": 2.3999,
"step": 1051
},
{
"epoch": 0.6346907993966817,
"grad_norm": 1.820718197399878,
"learning_rate": 6.16643558604483e-06,
"loss": 2.3537,
"step": 1052
},
{
"epoch": 0.6352941176470588,
"grad_norm": 1.8837447292622485,
"learning_rate": 6.148498636710092e-06,
"loss": 2.4198,
"step": 1053
},
{
"epoch": 0.6358974358974359,
"grad_norm": 1.6822630207864002,
"learning_rate": 6.130576225541405e-06,
"loss": 2.3893,
"step": 1054
},
{
"epoch": 0.6365007541478129,
"grad_norm": 1.7791618232922595,
"learning_rate": 6.112668420190042e-06,
"loss": 2.371,
"step": 1055
},
{
"epoch": 0.63710407239819,
"grad_norm": 1.6994048296111355,
"learning_rate": 6.094775288252157e-06,
"loss": 2.3775,
"step": 1056
},
{
"epoch": 0.6377073906485671,
"grad_norm": 1.7134892091362488,
"learning_rate": 6.076896897268503e-06,
"loss": 2.3201,
"step": 1057
},
{
"epoch": 0.6383107088989441,
"grad_norm": 1.9344148280860245,
"learning_rate": 6.059033314724194e-06,
"loss": 2.3657,
"step": 1058
},
{
"epoch": 0.6389140271493212,
"grad_norm": 1.7314061386065116,
"learning_rate": 6.041184608048452e-06,
"loss": 2.4049,
"step": 1059
},
{
"epoch": 0.6395173453996983,
"grad_norm": 1.7161590279229737,
"learning_rate": 6.023350844614344e-06,
"loss": 2.3644,
"step": 1060
},
{
"epoch": 0.6401206636500754,
"grad_norm": 1.62133335141988,
"learning_rate": 6.0055320917385305e-06,
"loss": 2.3621,
"step": 1061
},
{
"epoch": 0.6407239819004525,
"grad_norm": 1.7234099751611929,
"learning_rate": 5.987728416681015e-06,
"loss": 2.3857,
"step": 1062
},
{
"epoch": 0.6413273001508296,
"grad_norm": 1.975399590971045,
"learning_rate": 5.9699398866448846e-06,
"loss": 2.379,
"step": 1063
},
{
"epoch": 0.6419306184012067,
"grad_norm": 1.7347993415134486,
"learning_rate": 5.952166568776062e-06,
"loss": 2.4556,
"step": 1064
},
{
"epoch": 0.6425339366515838,
"grad_norm": 1.7369197590791692,
"learning_rate": 5.9344085301630425e-06,
"loss": 2.3327,
"step": 1065
},
{
"epoch": 0.6431372549019608,
"grad_norm": 1.7195622807621243,
"learning_rate": 5.916665837836657e-06,
"loss": 2.361,
"step": 1066
},
{
"epoch": 0.6437405731523379,
"grad_norm": 1.9257307776473533,
"learning_rate": 5.8989385587697936e-06,
"loss": 2.3611,
"step": 1067
},
{
"epoch": 0.644343891402715,
"grad_norm": 1.7840387108031497,
"learning_rate": 5.881226759877179e-06,
"loss": 2.3426,
"step": 1068
},
{
"epoch": 0.644947209653092,
"grad_norm": 1.7327317147015306,
"learning_rate": 5.8635305080150916e-06,
"loss": 2.4682,
"step": 1069
},
{
"epoch": 0.6455505279034691,
"grad_norm": 1.8944105633833825,
"learning_rate": 5.845849869981137e-06,
"loss": 2.3418,
"step": 1070
},
{
"epoch": 0.6461538461538462,
"grad_norm": 1.8040897971811702,
"learning_rate": 5.828184912513974e-06,
"loss": 2.3958,
"step": 1071
},
{
"epoch": 0.6467571644042233,
"grad_norm": 1.9119298329179213,
"learning_rate": 5.810535702293081e-06,
"loss": 2.3984,
"step": 1072
},
{
"epoch": 0.6473604826546003,
"grad_norm": 1.6761960978767056,
"learning_rate": 5.792902305938491e-06,
"loss": 2.4212,
"step": 1073
},
{
"epoch": 0.6479638009049774,
"grad_norm": 1.883720966722848,
"learning_rate": 5.77528479001054e-06,
"loss": 2.3655,
"step": 1074
},
{
"epoch": 0.6485671191553545,
"grad_norm": 1.924326515284571,
"learning_rate": 5.757683221009625e-06,
"loss": 2.3697,
"step": 1075
},
{
"epoch": 0.6491704374057315,
"grad_norm": 1.7604122143051846,
"learning_rate": 5.740097665375956e-06,
"loss": 2.339,
"step": 1076
},
{
"epoch": 0.6497737556561086,
"grad_norm": 1.7198858746018695,
"learning_rate": 5.722528189489294e-06,
"loss": 2.3814,
"step": 1077
},
{
"epoch": 0.6503770739064857,
"grad_norm": 1.705416286375821,
"learning_rate": 5.7049748596686884e-06,
"loss": 2.4044,
"step": 1078
},
{
"epoch": 0.6509803921568628,
"grad_norm": 1.7533272673200686,
"learning_rate": 5.687437742172258e-06,
"loss": 2.3606,
"step": 1079
},
{
"epoch": 0.6515837104072398,
"grad_norm": 1.9879675721501724,
"learning_rate": 5.669916903196931e-06,
"loss": 2.2996,
"step": 1080
},
{
"epoch": 0.6521870286576169,
"grad_norm": 1.7554867269724397,
"learning_rate": 5.652412408878173e-06,
"loss": 2.397,
"step": 1081
},
{
"epoch": 0.652790346907994,
"grad_norm": 1.6659668606646618,
"learning_rate": 5.634924325289766e-06,
"loss": 2.4034,
"step": 1082
},
{
"epoch": 0.653393665158371,
"grad_norm": 1.7416386848131735,
"learning_rate": 5.617452718443539e-06,
"loss": 2.3319,
"step": 1083
},
{
"epoch": 0.6539969834087481,
"grad_norm": 1.7245860454063235,
"learning_rate": 5.599997654289129e-06,
"loss": 2.3469,
"step": 1084
},
{
"epoch": 0.6546003016591252,
"grad_norm": 1.7431292906249471,
"learning_rate": 5.58255919871374e-06,
"loss": 2.4416,
"step": 1085
},
{
"epoch": 0.6552036199095023,
"grad_norm": 1.6656660538319628,
"learning_rate": 5.565137417541866e-06,
"loss": 2.4012,
"step": 1086
},
{
"epoch": 0.6558069381598793,
"grad_norm": 1.652853454891257,
"learning_rate": 5.547732376535073e-06,
"loss": 2.2344,
"step": 1087
},
{
"epoch": 0.6564102564102564,
"grad_norm": 1.8033291285488744,
"learning_rate": 5.530344141391735e-06,
"loss": 2.319,
"step": 1088
},
{
"epoch": 0.6570135746606335,
"grad_norm": 1.5871742155343276,
"learning_rate": 5.512972777746788e-06,
"loss": 2.3877,
"step": 1089
},
{
"epoch": 0.6576168929110106,
"grad_norm": 1.8718927560904637,
"learning_rate": 5.495618351171484e-06,
"loss": 2.4203,
"step": 1090
},
{
"epoch": 0.6582202111613876,
"grad_norm": 1.6655319360040397,
"learning_rate": 5.478280927173145e-06,
"loss": 2.4034,
"step": 1091
},
{
"epoch": 0.6588235294117647,
"grad_norm": 1.7438510013325328,
"learning_rate": 5.46096057119491e-06,
"loss": 2.4229,
"step": 1092
},
{
"epoch": 0.6594268476621418,
"grad_norm": 1.891566092957791,
"learning_rate": 5.443657348615499e-06,
"loss": 2.4016,
"step": 1093
},
{
"epoch": 0.6600301659125188,
"grad_norm": 1.794552188228555,
"learning_rate": 5.4263713247489525e-06,
"loss": 2.4229,
"step": 1094
},
{
"epoch": 0.6606334841628959,
"grad_norm": 1.871493745282659,
"learning_rate": 5.409102564844393e-06,
"loss": 2.3732,
"step": 1095
},
{
"epoch": 0.661236802413273,
"grad_norm": 1.687434162032868,
"learning_rate": 5.391851134085777e-06,
"loss": 2.383,
"step": 1096
},
{
"epoch": 0.6618401206636501,
"grad_norm": 1.7303740770734184,
"learning_rate": 5.37461709759165e-06,
"loss": 2.4028,
"step": 1097
},
{
"epoch": 0.6624434389140271,
"grad_norm": 1.7189510745367969,
"learning_rate": 5.357400520414898e-06,
"loss": 2.3981,
"step": 1098
},
{
"epoch": 0.6630467571644042,
"grad_norm": 1.8096301053467294,
"learning_rate": 5.340201467542507e-06,
"loss": 2.3628,
"step": 1099
},
{
"epoch": 0.6636500754147813,
"grad_norm": 1.7165713960764049,
"learning_rate": 5.323020003895307e-06,
"loss": 2.3966,
"step": 1100
},
{
"epoch": 0.6642533936651583,
"grad_norm": 2.137276625245083,
"learning_rate": 5.30585619432775e-06,
"loss": 2.3458,
"step": 1101
},
{
"epoch": 0.6648567119155354,
"grad_norm": 1.9805084995758804,
"learning_rate": 5.2887101036276326e-06,
"loss": 2.3472,
"step": 1102
},
{
"epoch": 0.6654600301659125,
"grad_norm": 1.7370499167243267,
"learning_rate": 5.271581796515877e-06,
"loss": 2.3599,
"step": 1103
},
{
"epoch": 0.6660633484162896,
"grad_norm": 1.990509263132044,
"learning_rate": 5.254471337646277e-06,
"loss": 2.3297,
"step": 1104
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.817469601955063,
"learning_rate": 5.237378791605249e-06,
"loss": 2.3826,
"step": 1105
},
{
"epoch": 0.6672699849170437,
"grad_norm": 1.7081728667896796,
"learning_rate": 5.22030422291161e-06,
"loss": 2.3599,
"step": 1106
},
{
"epoch": 0.6678733031674208,
"grad_norm": 1.6649837042112172,
"learning_rate": 5.203247696016304e-06,
"loss": 2.2998,
"step": 1107
},
{
"epoch": 0.6684766214177978,
"grad_norm": 2.0879657423509252,
"learning_rate": 5.186209275302175e-06,
"loss": 2.3596,
"step": 1108
},
{
"epoch": 0.6690799396681749,
"grad_norm": 1.8790127017717781,
"learning_rate": 5.169189025083721e-06,
"loss": 2.3778,
"step": 1109
},
{
"epoch": 0.669683257918552,
"grad_norm": 1.9364344405317626,
"learning_rate": 5.152187009606864e-06,
"loss": 2.496,
"step": 1110
},
{
"epoch": 0.6702865761689291,
"grad_norm": 1.8162698780455198,
"learning_rate": 5.135203293048683e-06,
"loss": 2.3594,
"step": 1111
},
{
"epoch": 0.6708898944193061,
"grad_norm": 1.7365875388281704,
"learning_rate": 5.11823793951719e-06,
"loss": 2.4206,
"step": 1112
},
{
"epoch": 0.6714932126696832,
"grad_norm": 1.7389301743219594,
"learning_rate": 5.101291013051076e-06,
"loss": 2.4577,
"step": 1113
},
{
"epoch": 0.6720965309200604,
"grad_norm": 1.801725591550719,
"learning_rate": 5.08436257761949e-06,
"loss": 2.36,
"step": 1114
},
{
"epoch": 0.6726998491704375,
"grad_norm": 1.6246554556969792,
"learning_rate": 5.067452697121773e-06,
"loss": 2.4463,
"step": 1115
},
{
"epoch": 0.6733031674208145,
"grad_norm": 1.6391920742953727,
"learning_rate": 5.050561435387225e-06,
"loss": 2.3824,
"step": 1116
},
{
"epoch": 0.6739064856711916,
"grad_norm": 1.7957501126838844,
"learning_rate": 5.033688856174872e-06,
"loss": 2.3333,
"step": 1117
},
{
"epoch": 0.6745098039215687,
"grad_norm": 1.900643070460856,
"learning_rate": 5.016835023173216e-06,
"loss": 2.393,
"step": 1118
},
{
"epoch": 0.6751131221719457,
"grad_norm": 1.743212484710362,
"learning_rate": 5.000000000000003e-06,
"loss": 2.368,
"step": 1119
},
{
"epoch": 0.6757164404223228,
"grad_norm": 1.7167273166225816,
"learning_rate": 4.98318385020197e-06,
"loss": 2.3213,
"step": 1120
},
{
"epoch": 0.6763197586726999,
"grad_norm": 1.7430619282647748,
"learning_rate": 4.966386637254619e-06,
"loss": 2.4359,
"step": 1121
},
{
"epoch": 0.676923076923077,
"grad_norm": 1.7339856018167896,
"learning_rate": 4.949608424561974e-06,
"loss": 2.3696,
"step": 1122
},
{
"epoch": 0.677526395173454,
"grad_norm": 1.6797421152789709,
"learning_rate": 4.932849275456334e-06,
"loss": 2.3274,
"step": 1123
},
{
"epoch": 0.6781297134238311,
"grad_norm": 1.700903017111767,
"learning_rate": 4.91610925319804e-06,
"loss": 2.4187,
"step": 1124
},
{
"epoch": 0.6787330316742082,
"grad_norm": 1.8361798532146734,
"learning_rate": 4.8993884209752364e-06,
"loss": 2.3666,
"step": 1125
},
{
"epoch": 0.6793363499245852,
"grad_norm": 1.816892302835707,
"learning_rate": 4.882686841903627e-06,
"loss": 2.3812,
"step": 1126
},
{
"epoch": 0.6799396681749623,
"grad_norm": 1.7175271186337646,
"learning_rate": 4.866004579026254e-06,
"loss": 2.3617,
"step": 1127
},
{
"epoch": 0.6805429864253394,
"grad_norm": 1.6205294077884607,
"learning_rate": 4.8493416953132375e-06,
"loss": 2.3892,
"step": 1128
},
{
"epoch": 0.6811463046757165,
"grad_norm": 1.6716762176030333,
"learning_rate": 4.832698253661542e-06,
"loss": 2.4494,
"step": 1129
},
{
"epoch": 0.6817496229260935,
"grad_norm": 1.6898145097930017,
"learning_rate": 4.81607431689475e-06,
"loss": 2.3703,
"step": 1130
},
{
"epoch": 0.6823529411764706,
"grad_norm": 1.6545729812453533,
"learning_rate": 4.799469947762829e-06,
"loss": 2.3776,
"step": 1131
},
{
"epoch": 0.6829562594268477,
"grad_norm": 1.627605596598923,
"learning_rate": 4.782885208941873e-06,
"loss": 2.3966,
"step": 1132
},
{
"epoch": 0.6835595776772248,
"grad_norm": 1.7911359160166918,
"learning_rate": 4.766320163033882e-06,
"loss": 2.3174,
"step": 1133
},
{
"epoch": 0.6841628959276018,
"grad_norm": 1.7161660139511359,
"learning_rate": 4.749774872566516e-06,
"loss": 2.3946,
"step": 1134
},
{
"epoch": 0.6847662141779789,
"grad_norm": 1.7354626205952686,
"learning_rate": 4.7332493999928785e-06,
"loss": 2.4666,
"step": 1135
},
{
"epoch": 0.685369532428356,
"grad_norm": 1.688627517329885,
"learning_rate": 4.716743807691255e-06,
"loss": 2.4739,
"step": 1136
},
{
"epoch": 0.685972850678733,
"grad_norm": 1.730630827281108,
"learning_rate": 4.700258157964892e-06,
"loss": 2.3249,
"step": 1137
},
{
"epoch": 0.6865761689291101,
"grad_norm": 1.6111061207035091,
"learning_rate": 4.68379251304176e-06,
"loss": 2.3399,
"step": 1138
},
{
"epoch": 0.6871794871794872,
"grad_norm": 1.724226418639754,
"learning_rate": 4.667346935074317e-06,
"loss": 2.3608,
"step": 1139
},
{
"epoch": 0.6877828054298643,
"grad_norm": 1.6986068675165844,
"learning_rate": 4.6509214861392785e-06,
"loss": 2.4214,
"step": 1140
},
{
"epoch": 0.6883861236802413,
"grad_norm": 1.6645298951693541,
"learning_rate": 4.634516228237372e-06,
"loss": 2.3376,
"step": 1141
},
{
"epoch": 0.6889894419306184,
"grad_norm": 1.6189114356863668,
"learning_rate": 4.618131223293119e-06,
"loss": 2.5135,
"step": 1142
},
{
"epoch": 0.6895927601809955,
"grad_norm": 1.7083736473449533,
"learning_rate": 4.6017665331545845e-06,
"loss": 2.3635,
"step": 1143
},
{
"epoch": 0.6901960784313725,
"grad_norm": 1.877655085744349,
"learning_rate": 4.585422219593161e-06,
"loss": 2.3677,
"step": 1144
},
{
"epoch": 0.6907993966817496,
"grad_norm": 1.6987263814462694,
"learning_rate": 4.569098344303319e-06,
"loss": 2.3695,
"step": 1145
},
{
"epoch": 0.6914027149321267,
"grad_norm": 1.825635519300492,
"learning_rate": 4.552794968902382e-06,
"loss": 2.4652,
"step": 1146
},
{
"epoch": 0.6920060331825038,
"grad_norm": 1.6935317131155125,
"learning_rate": 4.5365121549302916e-06,
"loss": 2.4444,
"step": 1147
},
{
"epoch": 0.6926093514328808,
"grad_norm": 1.6512716687547069,
"learning_rate": 4.520249963849386e-06,
"loss": 2.374,
"step": 1148
},
{
"epoch": 0.6932126696832579,
"grad_norm": 1.702035225234864,
"learning_rate": 4.504008457044151e-06,
"loss": 2.3455,
"step": 1149
},
{
"epoch": 0.693815987933635,
"grad_norm": 1.8068570190141913,
"learning_rate": 4.487787695820991e-06,
"loss": 2.4141,
"step": 1150
},
{
"epoch": 0.694419306184012,
"grad_norm": 1.7162440248028858,
"learning_rate": 4.471587741408008e-06,
"loss": 2.4136,
"step": 1151
},
{
"epoch": 0.6950226244343891,
"grad_norm": 1.8312298352488556,
"learning_rate": 4.455408654954771e-06,
"loss": 2.3802,
"step": 1152
},
{
"epoch": 0.6956259426847662,
"grad_norm": 1.6692722099624584,
"learning_rate": 4.439250497532074e-06,
"loss": 2.3668,
"step": 1153
},
{
"epoch": 0.6962292609351433,
"grad_norm": 1.7023852481076565,
"learning_rate": 4.423113330131708e-06,
"loss": 2.3927,
"step": 1154
},
{
"epoch": 0.6968325791855203,
"grad_norm": 1.8669640411601005,
"learning_rate": 4.406997213666236e-06,
"loss": 2.4098,
"step": 1155
},
{
"epoch": 0.6974358974358974,
"grad_norm": 1.7473073763109117,
"learning_rate": 4.390902208968756e-06,
"loss": 2.3939,
"step": 1156
},
{
"epoch": 0.6980392156862745,
"grad_norm": 1.7800610206406344,
"learning_rate": 4.3748283767926895e-06,
"loss": 2.4372,
"step": 1157
},
{
"epoch": 0.6986425339366515,
"grad_norm": 1.634490779211142,
"learning_rate": 4.3587757778115255e-06,
"loss": 2.3264,
"step": 1158
},
{
"epoch": 0.6992458521870286,
"grad_norm": 1.77926534676703,
"learning_rate": 4.342744472618609e-06,
"loss": 2.37,
"step": 1159
},
{
"epoch": 0.6998491704374057,
"grad_norm": 1.7462074976007822,
"learning_rate": 4.326734521726905e-06,
"loss": 2.416,
"step": 1160
},
{
"epoch": 0.7004524886877828,
"grad_norm": 1.6554836021696273,
"learning_rate": 4.310745985568779e-06,
"loss": 2.3602,
"step": 1161
},
{
"epoch": 0.7010558069381598,
"grad_norm": 1.7946895000231815,
"learning_rate": 4.294778924495756e-06,
"loss": 2.2787,
"step": 1162
},
{
"epoch": 0.7016591251885369,
"grad_norm": 1.7454537713750038,
"learning_rate": 4.278833398778306e-06,
"loss": 2.3994,
"step": 1163
},
{
"epoch": 0.702262443438914,
"grad_norm": 1.7980490954890451,
"learning_rate": 4.262909468605602e-06,
"loss": 2.3779,
"step": 1164
},
{
"epoch": 0.702865761689291,
"grad_norm": 1.7044519866326602,
"learning_rate": 4.24700719408531e-06,
"loss": 2.3362,
"step": 1165
},
{
"epoch": 0.7034690799396682,
"grad_norm": 1.7347177996543848,
"learning_rate": 4.231126635243351e-06,
"loss": 2.3837,
"step": 1166
},
{
"epoch": 0.7040723981900453,
"grad_norm": 1.6524797749017155,
"learning_rate": 4.215267852023669e-06,
"loss": 2.36,
"step": 1167
},
{
"epoch": 0.7046757164404224,
"grad_norm": 1.537259068333786,
"learning_rate": 4.19943090428802e-06,
"loss": 2.2915,
"step": 1168
},
{
"epoch": 0.7052790346907994,
"grad_norm": 1.6467961805341684,
"learning_rate": 4.1836158518157335e-06,
"loss": 2.3372,
"step": 1169
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.6557291928485045,
"learning_rate": 4.167822754303493e-06,
"loss": 2.308,
"step": 1170
},
{
"epoch": 0.7064856711915536,
"grad_norm": 1.6836785725181491,
"learning_rate": 4.152051671365111e-06,
"loss": 2.4054,
"step": 1171
},
{
"epoch": 0.7070889894419307,
"grad_norm": 1.6787180661459031,
"learning_rate": 4.136302662531297e-06,
"loss": 2.335,
"step": 1172
},
{
"epoch": 0.7076923076923077,
"grad_norm": 1.643848874676066,
"learning_rate": 4.120575787249448e-06,
"loss": 2.4118,
"step": 1173
},
{
"epoch": 0.7082956259426848,
"grad_norm": 1.7616396709398916,
"learning_rate": 4.104871104883403e-06,
"loss": 2.3463,
"step": 1174
},
{
"epoch": 0.7088989441930619,
"grad_norm": 1.7903610252942221,
"learning_rate": 4.0891886747132356e-06,
"loss": 2.3359,
"step": 1175
},
{
"epoch": 0.709502262443439,
"grad_norm": 1.7098349377567204,
"learning_rate": 4.073528555935023e-06,
"loss": 2.3234,
"step": 1176
},
{
"epoch": 0.710105580693816,
"grad_norm": 1.7698002503180297,
"learning_rate": 4.057890807660623e-06,
"loss": 2.3478,
"step": 1177
},
{
"epoch": 0.7107088989441931,
"grad_norm": 1.7223602812887717,
"learning_rate": 4.042275488917457e-06,
"loss": 2.3879,
"step": 1178
},
{
"epoch": 0.7113122171945702,
"grad_norm": 1.7463132678483144,
"learning_rate": 4.026682658648279e-06,
"loss": 2.3691,
"step": 1179
},
{
"epoch": 0.7119155354449472,
"grad_norm": 1.7311429231481532,
"learning_rate": 4.011112375710958e-06,
"loss": 2.4077,
"step": 1180
},
{
"epoch": 0.7125188536953243,
"grad_norm": 1.8325302347981212,
"learning_rate": 3.995564698878242e-06,
"loss": 2.3488,
"step": 1181
},
{
"epoch": 0.7131221719457014,
"grad_norm": 1.68718592965624,
"learning_rate": 3.9800396868375675e-06,
"loss": 2.3314,
"step": 1182
},
{
"epoch": 0.7137254901960784,
"grad_norm": 1.6227069192861296,
"learning_rate": 3.964537398190809e-06,
"loss": 2.4016,
"step": 1183
},
{
"epoch": 0.7143288084464555,
"grad_norm": 1.6767707875017006,
"learning_rate": 3.949057891454067e-06,
"loss": 2.3296,
"step": 1184
},
{
"epoch": 0.7149321266968326,
"grad_norm": 1.7535276237480546,
"learning_rate": 3.933601225057446e-06,
"loss": 2.4307,
"step": 1185
},
{
"epoch": 0.7155354449472097,
"grad_norm": 1.793638115657222,
"learning_rate": 3.918167457344846e-06,
"loss": 2.3967,
"step": 1186
},
{
"epoch": 0.7161387631975867,
"grad_norm": 1.699285645702112,
"learning_rate": 3.902756646573721e-06,
"loss": 2.4461,
"step": 1187
},
{
"epoch": 0.7167420814479638,
"grad_norm": 1.6733211026986439,
"learning_rate": 3.887368850914873e-06,
"loss": 2.3596,
"step": 1188
},
{
"epoch": 0.7173453996983409,
"grad_norm": 1.7635125169609782,
"learning_rate": 3.872004128452231e-06,
"loss": 2.2959,
"step": 1189
},
{
"epoch": 0.717948717948718,
"grad_norm": 1.706941005183369,
"learning_rate": 3.85666253718263e-06,
"loss": 2.4371,
"step": 1190
},
{
"epoch": 0.718552036199095,
"grad_norm": 1.7770281662637257,
"learning_rate": 3.841344135015591e-06,
"loss": 2.3395,
"step": 1191
},
{
"epoch": 0.7191553544494721,
"grad_norm": 1.5833608069906788,
"learning_rate": 3.826048979773104e-06,
"loss": 2.4091,
"step": 1192
},
{
"epoch": 0.7197586726998492,
"grad_norm": 1.6958121990199075,
"learning_rate": 3.8107771291894092e-06,
"loss": 2.362,
"step": 1193
},
{
"epoch": 0.7203619909502262,
"grad_norm": 1.8301505796048145,
"learning_rate": 3.795528640910776e-06,
"loss": 2.39,
"step": 1194
},
{
"epoch": 0.7209653092006033,
"grad_norm": 1.6750258822027213,
"learning_rate": 3.7803035724953007e-06,
"loss": 2.4088,
"step": 1195
},
{
"epoch": 0.7215686274509804,
"grad_norm": 1.8697843559056373,
"learning_rate": 3.7651019814126656e-06,
"loss": 2.3559,
"step": 1196
},
{
"epoch": 0.7221719457013575,
"grad_norm": 1.7345691144446933,
"learning_rate": 3.7499239250439358e-06,
"loss": 2.4212,
"step": 1197
},
{
"epoch": 0.7227752639517345,
"grad_norm": 1.7015198165790977,
"learning_rate": 3.73476946068134e-06,
"loss": 2.3812,
"step": 1198
},
{
"epoch": 0.7233785822021116,
"grad_norm": 1.8583945749056379,
"learning_rate": 3.719638645528061e-06,
"loss": 2.3947,
"step": 1199
},
{
"epoch": 0.7239819004524887,
"grad_norm": 1.8719157111099178,
"learning_rate": 3.704531536698012e-06,
"loss": 2.3613,
"step": 1200
},
{
"epoch": 0.7245852187028657,
"grad_norm": 1.6366629687175498,
"learning_rate": 3.68944819121561e-06,
"loss": 2.4264,
"step": 1201
},
{
"epoch": 0.7251885369532428,
"grad_norm": 1.6694031295876548,
"learning_rate": 3.674388666015584e-06,
"loss": 2.3218,
"step": 1202
},
{
"epoch": 0.7257918552036199,
"grad_norm": 1.6815494848334889,
"learning_rate": 3.659353017942754e-06,
"loss": 2.3919,
"step": 1203
},
{
"epoch": 0.726395173453997,
"grad_norm": 1.7510552907903105,
"learning_rate": 3.644341303751804e-06,
"loss": 2.3933,
"step": 1204
},
{
"epoch": 0.726998491704374,
"grad_norm": 1.6587047957707022,
"learning_rate": 3.6293535801070735e-06,
"loss": 2.4262,
"step": 1205
},
{
"epoch": 0.7276018099547511,
"grad_norm": 1.7383291104545608,
"learning_rate": 3.6143899035823516e-06,
"loss": 2.2937,
"step": 1206
},
{
"epoch": 0.7282051282051282,
"grad_norm": 1.613937185194106,
"learning_rate": 3.5994503306606497e-06,
"loss": 2.4775,
"step": 1207
},
{
"epoch": 0.7288084464555052,
"grad_norm": 1.7978965594601073,
"learning_rate": 3.5845349177340083e-06,
"loss": 2.3361,
"step": 1208
},
{
"epoch": 0.7294117647058823,
"grad_norm": 1.945082423342847,
"learning_rate": 3.5696437211032607e-06,
"loss": 2.4762,
"step": 1209
},
{
"epoch": 0.7300150829562594,
"grad_norm": 1.6728640229319411,
"learning_rate": 3.5547767969778355e-06,
"loss": 2.3713,
"step": 1210
},
{
"epoch": 0.7306184012066365,
"grad_norm": 1.813104500880649,
"learning_rate": 3.5399342014755388e-06,
"loss": 2.3755,
"step": 1211
},
{
"epoch": 0.7312217194570135,
"grad_norm": 1.6597104880097602,
"learning_rate": 3.5251159906223453e-06,
"loss": 2.3724,
"step": 1212
},
{
"epoch": 0.7318250377073906,
"grad_norm": 1.65223185764697,
"learning_rate": 3.510322220352188e-06,
"loss": 2.4131,
"step": 1213
},
{
"epoch": 0.7324283559577677,
"grad_norm": 1.576921560711547,
"learning_rate": 3.4955529465067394e-06,
"loss": 2.3482,
"step": 1214
},
{
"epoch": 0.7330316742081447,
"grad_norm": 1.779960713070471,
"learning_rate": 3.4808082248352058e-06,
"loss": 2.364,
"step": 1215
},
{
"epoch": 0.7336349924585218,
"grad_norm": 1.6919838533539924,
"learning_rate": 3.466088110994129e-06,
"loss": 2.3319,
"step": 1216
},
{
"epoch": 0.7342383107088989,
"grad_norm": 1.7997004290448038,
"learning_rate": 3.4513926605471504e-06,
"loss": 2.3316,
"step": 1217
},
{
"epoch": 0.7348416289592761,
"grad_norm": 1.6733906358624668,
"learning_rate": 3.4367219289648192e-06,
"loss": 2.4196,
"step": 1218
},
{
"epoch": 0.7354449472096531,
"grad_norm": 1.6985536840122628,
"learning_rate": 3.42207597162438e-06,
"loss": 2.5372,
"step": 1219
},
{
"epoch": 0.7360482654600302,
"grad_norm": 1.7863639968736682,
"learning_rate": 3.40745484380956e-06,
"loss": 2.4425,
"step": 1220
},
{
"epoch": 0.7366515837104073,
"grad_norm": 1.61404911543304,
"learning_rate": 3.392858600710376e-06,
"loss": 2.3413,
"step": 1221
},
{
"epoch": 0.7372549019607844,
"grad_norm": 1.7141058073808544,
"learning_rate": 3.3782872974228896e-06,
"loss": 2.3926,
"step": 1222
},
{
"epoch": 0.7378582202111614,
"grad_norm": 1.6705147616730136,
"learning_rate": 3.363740988949038e-06,
"loss": 2.3701,
"step": 1223
},
{
"epoch": 0.7384615384615385,
"grad_norm": 1.709270311327098,
"learning_rate": 3.3492197301964145e-06,
"loss": 2.4238,
"step": 1224
},
{
"epoch": 0.7390648567119156,
"grad_norm": 1.645818471528142,
"learning_rate": 3.3347235759780483e-06,
"loss": 2.2983,
"step": 1225
},
{
"epoch": 0.7396681749622926,
"grad_norm": 1.70357964847191,
"learning_rate": 3.320252581012212e-06,
"loss": 2.3661,
"step": 1226
},
{
"epoch": 0.7402714932126697,
"grad_norm": 1.7149434102428434,
"learning_rate": 3.3058067999222075e-06,
"loss": 2.4988,
"step": 1227
},
{
"epoch": 0.7408748114630468,
"grad_norm": 1.7463338130667014,
"learning_rate": 3.2913862872361624e-06,
"loss": 2.4385,
"step": 1228
},
{
"epoch": 0.7414781297134239,
"grad_norm": 1.6395449989057074,
"learning_rate": 3.2769910973868314e-06,
"loss": 2.3395,
"step": 1229
},
{
"epoch": 0.7420814479638009,
"grad_norm": 1.8064334638366089,
"learning_rate": 3.262621284711376e-06,
"loss": 2.38,
"step": 1230
},
{
"epoch": 0.742684766214178,
"grad_norm": 1.6510309870602204,
"learning_rate": 3.248276903451171e-06,
"loss": 2.422,
"step": 1231
},
{
"epoch": 0.7432880844645551,
"grad_norm": 1.8146336246679287,
"learning_rate": 3.2339580077515864e-06,
"loss": 2.3916,
"step": 1232
},
{
"epoch": 0.7438914027149321,
"grad_norm": 1.703832113781218,
"learning_rate": 3.219664651661808e-06,
"loss": 2.3852,
"step": 1233
},
{
"epoch": 0.7444947209653092,
"grad_norm": 1.6845182669842365,
"learning_rate": 3.2053968891346087e-06,
"loss": 2.261,
"step": 1234
},
{
"epoch": 0.7450980392156863,
"grad_norm": 1.753880542406519,
"learning_rate": 3.191154774026156e-06,
"loss": 2.2682,
"step": 1235
},
{
"epoch": 0.7457013574660634,
"grad_norm": 1.639422373413466,
"learning_rate": 3.1769383600958005e-06,
"loss": 2.3552,
"step": 1236
},
{
"epoch": 0.7463046757164404,
"grad_norm": 1.7878659580421807,
"learning_rate": 3.1627477010058936e-06,
"loss": 2.3888,
"step": 1237
},
{
"epoch": 0.7469079939668175,
"grad_norm": 1.707074502643292,
"learning_rate": 3.1485828503215588e-06,
"loss": 2.3579,
"step": 1238
},
{
"epoch": 0.7475113122171946,
"grad_norm": 1.6154045829514472,
"learning_rate": 3.1344438615105023e-06,
"loss": 2.3499,
"step": 1239
},
{
"epoch": 0.7481146304675717,
"grad_norm": 1.9133709112277255,
"learning_rate": 3.1203307879428146e-06,
"loss": 2.369,
"step": 1240
},
{
"epoch": 0.7487179487179487,
"grad_norm": 1.6141320071125547,
"learning_rate": 3.1062436828907605e-06,
"loss": 2.3186,
"step": 1241
},
{
"epoch": 0.7493212669683258,
"grad_norm": 1.7956916560691374,
"learning_rate": 3.092182599528585e-06,
"loss": 2.3688,
"step": 1242
},
{
"epoch": 0.7499245852187029,
"grad_norm": 1.7711534803830473,
"learning_rate": 3.0781475909323066e-06,
"loss": 2.3732,
"step": 1243
},
{
"epoch": 0.7505279034690799,
"grad_norm": 1.7181406266567723,
"learning_rate": 3.0641387100795237e-06,
"loss": 2.4178,
"step": 1244
},
{
"epoch": 0.751131221719457,
"grad_norm": 2.2656899500092478,
"learning_rate": 3.0501560098492056e-06,
"loss": 2.4398,
"step": 1245
},
{
"epoch": 0.7517345399698341,
"grad_norm": 1.541135482751282,
"learning_rate": 3.0361995430215087e-06,
"loss": 2.3834,
"step": 1246
},
{
"epoch": 0.7523378582202112,
"grad_norm": 1.7382581186222361,
"learning_rate": 3.0222693622775544e-06,
"loss": 2.4601,
"step": 1247
},
{
"epoch": 0.7529411764705882,
"grad_norm": 1.9075571974715178,
"learning_rate": 3.008365520199251e-06,
"loss": 2.3921,
"step": 1248
},
{
"epoch": 0.7529411764705882,
"eval_loss": 2.38016414642334,
"eval_runtime": 22.1293,
"eval_samples_per_second": 3.977,
"eval_steps_per_second": 0.497,
"step": 1248
},
{
"epoch": 0.7535444947209653,
"grad_norm": 1.7905575217759446,
"learning_rate": 2.994488069269079e-06,
"loss": 2.3063,
"step": 1249
},
{
"epoch": 0.7541478129713424,
"grad_norm": 1.7748462108982017,
"learning_rate": 2.9806370618699142e-06,
"loss": 2.3667,
"step": 1250
},
{
"epoch": 0.7547511312217194,
"grad_norm": 1.6917719110640708,
"learning_rate": 2.9668125502848035e-06,
"loss": 2.3628,
"step": 1251
},
{
"epoch": 0.7553544494720965,
"grad_norm": 1.5787368628352945,
"learning_rate": 2.9530145866967897e-06,
"loss": 2.3794,
"step": 1252
},
{
"epoch": 0.7559577677224736,
"grad_norm": 1.619923189656748,
"learning_rate": 2.9392432231886914e-06,
"loss": 2.3134,
"step": 1253
},
{
"epoch": 0.7565610859728507,
"grad_norm": 1.7237975265243972,
"learning_rate": 2.9254985117429415e-06,
"loss": 2.3619,
"step": 1254
},
{
"epoch": 0.7571644042232277,
"grad_norm": 1.886525917985081,
"learning_rate": 2.911780504241354e-06,
"loss": 2.3792,
"step": 1255
},
{
"epoch": 0.7577677224736048,
"grad_norm": 1.6579917254662444,
"learning_rate": 2.8980892524649506e-06,
"loss": 2.3537,
"step": 1256
},
{
"epoch": 0.7583710407239819,
"grad_norm": 1.7615250599662575,
"learning_rate": 2.8844248080937543e-06,
"loss": 2.4131,
"step": 1257
},
{
"epoch": 0.7589743589743589,
"grad_norm": 1.5993153277369443,
"learning_rate": 2.870787222706609e-06,
"loss": 2.3332,
"step": 1258
},
{
"epoch": 0.759577677224736,
"grad_norm": 1.7633809449855375,
"learning_rate": 2.8571765477809645e-06,
"loss": 2.4275,
"step": 1259
},
{
"epoch": 0.7601809954751131,
"grad_norm": 1.7752019583671115,
"learning_rate": 2.8435928346926945e-06,
"loss": 2.3932,
"step": 1260
},
{
"epoch": 0.7607843137254902,
"grad_norm": 1.6600684023130048,
"learning_rate": 2.830036134715902e-06,
"loss": 2.3767,
"step": 1261
},
{
"epoch": 0.7613876319758672,
"grad_norm": 1.6177485230843274,
"learning_rate": 2.8165064990227255e-06,
"loss": 2.3334,
"step": 1262
},
{
"epoch": 0.7619909502262443,
"grad_norm": 1.718986673782489,
"learning_rate": 2.803003978683142e-06,
"loss": 2.3863,
"step": 1263
},
{
"epoch": 0.7625942684766214,
"grad_norm": 1.697123205087847,
"learning_rate": 2.789528624664778e-06,
"loss": 2.3717,
"step": 1264
},
{
"epoch": 0.7631975867269984,
"grad_norm": 1.8768955126798301,
"learning_rate": 2.776080487832715e-06,
"loss": 2.3315,
"step": 1265
},
{
"epoch": 0.7638009049773755,
"grad_norm": 1.7028416533958965,
"learning_rate": 2.7626596189492983e-06,
"loss": 2.3685,
"step": 1266
},
{
"epoch": 0.7644042232277526,
"grad_norm": 1.6899932080641498,
"learning_rate": 2.7492660686739513e-06,
"loss": 2.3124,
"step": 1267
},
{
"epoch": 0.7650075414781297,
"grad_norm": 1.7265124545392858,
"learning_rate": 2.7358998875629716e-06,
"loss": 2.304,
"step": 1268
},
{
"epoch": 0.7656108597285067,
"grad_norm": 1.6878514865004561,
"learning_rate": 2.7225611260693485e-06,
"loss": 2.3384,
"step": 1269
},
{
"epoch": 0.7662141779788839,
"grad_norm": 1.8156324278376936,
"learning_rate": 2.70924983454257e-06,
"loss": 2.3312,
"step": 1270
},
{
"epoch": 0.766817496229261,
"grad_norm": 1.6906563413337043,
"learning_rate": 2.695966063228442e-06,
"loss": 2.3785,
"step": 1271
},
{
"epoch": 0.7674208144796381,
"grad_norm": 1.6438022037492663,
"learning_rate": 2.682709862268883e-06,
"loss": 2.3599,
"step": 1272
},
{
"epoch": 0.7680241327300151,
"grad_norm": 1.6535431923407702,
"learning_rate": 2.669481281701739e-06,
"loss": 2.2519,
"step": 1273
},
{
"epoch": 0.7686274509803922,
"grad_norm": 1.6926489077110243,
"learning_rate": 2.6562803714606033e-06,
"loss": 2.3282,
"step": 1274
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.72212806620652,
"learning_rate": 2.6431071813746277e-06,
"loss": 2.3748,
"step": 1275
},
{
"epoch": 0.7698340874811463,
"grad_norm": 1.6096452402637356,
"learning_rate": 2.62996176116832e-06,
"loss": 2.3074,
"step": 1276
},
{
"epoch": 0.7704374057315234,
"grad_norm": 1.6456536712636463,
"learning_rate": 2.6168441604613706e-06,
"loss": 2.3434,
"step": 1277
},
{
"epoch": 0.7710407239819005,
"grad_norm": 1.6644054476758603,
"learning_rate": 2.6037544287684603e-06,
"loss": 2.4145,
"step": 1278
},
{
"epoch": 0.7716440422322776,
"grad_norm": 1.7176809008066278,
"learning_rate": 2.5906926154990676e-06,
"loss": 2.4399,
"step": 1279
},
{
"epoch": 0.7722473604826546,
"grad_norm": 1.6886873855562976,
"learning_rate": 2.5776587699573007e-06,
"loss": 2.3975,
"step": 1280
},
{
"epoch": 0.7728506787330317,
"grad_norm": 1.6825199363623684,
"learning_rate": 2.5646529413416864e-06,
"loss": 2.2743,
"step": 1281
},
{
"epoch": 0.7734539969834088,
"grad_norm": 1.726936978113641,
"learning_rate": 2.551675178745003e-06,
"loss": 2.3887,
"step": 1282
},
{
"epoch": 0.7740573152337858,
"grad_norm": 1.7424095587164001,
"learning_rate": 2.538725531154087e-06,
"loss": 2.3657,
"step": 1283
},
{
"epoch": 0.7746606334841629,
"grad_norm": 1.6742037882262482,
"learning_rate": 2.5258040474496483e-06,
"loss": 2.3799,
"step": 1284
},
{
"epoch": 0.77526395173454,
"grad_norm": 1.8169398307903717,
"learning_rate": 2.512910776406089e-06,
"loss": 2.3991,
"step": 1285
},
{
"epoch": 0.7758672699849171,
"grad_norm": 1.6743349016789828,
"learning_rate": 2.500045766691319e-06,
"loss": 2.3413,
"step": 1286
},
{
"epoch": 0.7764705882352941,
"grad_norm": 1.7015287344998682,
"learning_rate": 2.487209066866565e-06,
"loss": 2.4439,
"step": 1287
},
{
"epoch": 0.7770739064856712,
"grad_norm": 1.6878482036002078,
"learning_rate": 2.4744007253862046e-06,
"loss": 2.4143,
"step": 1288
},
{
"epoch": 0.7776772247360483,
"grad_norm": 1.629791277018855,
"learning_rate": 2.46162079059756e-06,
"loss": 2.3622,
"step": 1289
},
{
"epoch": 0.7782805429864253,
"grad_norm": 1.64498956245565,
"learning_rate": 2.4488693107407335e-06,
"loss": 2.2916,
"step": 1290
},
{
"epoch": 0.7788838612368024,
"grad_norm": 1.587955075191154,
"learning_rate": 2.436146333948416e-06,
"loss": 2.3719,
"step": 1291
},
{
"epoch": 0.7794871794871795,
"grad_norm": 1.7976619934058362,
"learning_rate": 2.4234519082457096e-06,
"loss": 2.3873,
"step": 1292
},
{
"epoch": 0.7800904977375566,
"grad_norm": 1.6815612433609544,
"learning_rate": 2.410786081549954e-06,
"loss": 2.3841,
"step": 1293
},
{
"epoch": 0.7806938159879336,
"grad_norm": 1.7102442917631429,
"learning_rate": 2.398148901670521e-06,
"loss": 2.2898,
"step": 1294
},
{
"epoch": 0.7812971342383107,
"grad_norm": 1.5996194988101127,
"learning_rate": 2.3855404163086558e-06,
"loss": 2.3665,
"step": 1295
},
{
"epoch": 0.7819004524886878,
"grad_norm": 1.690930485646474,
"learning_rate": 2.372960673057301e-06,
"loss": 2.421,
"step": 1296
},
{
"epoch": 0.7825037707390649,
"grad_norm": 1.6395486346485142,
"learning_rate": 2.3604097194008957e-06,
"loss": 2.4242,
"step": 1297
},
{
"epoch": 0.7831070889894419,
"grad_norm": 1.7291499453127324,
"learning_rate": 2.347887602715213e-06,
"loss": 2.3369,
"step": 1298
},
{
"epoch": 0.783710407239819,
"grad_norm": 1.7533383876291355,
"learning_rate": 2.3353943702671722e-06,
"loss": 2.3686,
"step": 1299
},
{
"epoch": 0.7843137254901961,
"grad_norm": 1.7910834697851878,
"learning_rate": 2.322930069214664e-06,
"loss": 2.3576,
"step": 1300
},
{
"epoch": 0.7849170437405731,
"grad_norm": 1.5708555637587793,
"learning_rate": 2.3104947466063785e-06,
"loss": 2.3296,
"step": 1301
},
{
"epoch": 0.7855203619909502,
"grad_norm": 1.729646722845939,
"learning_rate": 2.298088449381618e-06,
"loss": 2.4129,
"step": 1302
},
{
"epoch": 0.7861236802413273,
"grad_norm": 1.7080334843001908,
"learning_rate": 2.285711224370123e-06,
"loss": 2.4113,
"step": 1303
},
{
"epoch": 0.7867269984917044,
"grad_norm": 1.7346094079358705,
"learning_rate": 2.273363118291889e-06,
"loss": 2.4141,
"step": 1304
},
{
"epoch": 0.7873303167420814,
"grad_norm": 1.671715470945606,
"learning_rate": 2.2610441777570104e-06,
"loss": 2.4082,
"step": 1305
},
{
"epoch": 0.7879336349924585,
"grad_norm": 1.6161721547944843,
"learning_rate": 2.2487544492654832e-06,
"loss": 2.3392,
"step": 1306
},
{
"epoch": 0.7885369532428356,
"grad_norm": 1.6230809291834614,
"learning_rate": 2.2364939792070385e-06,
"loss": 2.3649,
"step": 1307
},
{
"epoch": 0.7891402714932126,
"grad_norm": 1.7871076592596147,
"learning_rate": 2.224262813860962e-06,
"loss": 2.274,
"step": 1308
},
{
"epoch": 0.7897435897435897,
"grad_norm": 1.7193363412008316,
"learning_rate": 2.2120609993959376e-06,
"loss": 2.3066,
"step": 1309
},
{
"epoch": 0.7903469079939668,
"grad_norm": 1.641785651926145,
"learning_rate": 2.1998885818698434e-06,
"loss": 2.3099,
"step": 1310
},
{
"epoch": 0.7909502262443439,
"grad_norm": 1.734997074775059,
"learning_rate": 2.187745607229601e-06,
"loss": 2.4392,
"step": 1311
},
{
"epoch": 0.7915535444947209,
"grad_norm": 1.6745699343545057,
"learning_rate": 2.1756321213109944e-06,
"loss": 2.3645,
"step": 1312
},
{
"epoch": 0.792156862745098,
"grad_norm": 1.745294391010054,
"learning_rate": 2.163548169838495e-06,
"loss": 2.3745,
"step": 1313
},
{
"epoch": 0.7927601809954751,
"grad_norm": 1.6435958547913674,
"learning_rate": 2.151493798425095e-06,
"loss": 2.3365,
"step": 1314
},
{
"epoch": 0.7933634992458521,
"grad_norm": 1.7601330578805139,
"learning_rate": 2.1394690525721275e-06,
"loss": 2.4231,
"step": 1315
},
{
"epoch": 0.7939668174962292,
"grad_norm": 1.7385461679888394,
"learning_rate": 2.1274739776691013e-06,
"loss": 2.3454,
"step": 1316
},
{
"epoch": 0.7945701357466063,
"grad_norm": 1.577322158373456,
"learning_rate": 2.1155086189935227e-06,
"loss": 2.3731,
"step": 1317
},
{
"epoch": 0.7951734539969834,
"grad_norm": 1.5972905038230132,
"learning_rate": 2.1035730217107385e-06,
"loss": 2.3499,
"step": 1318
},
{
"epoch": 0.7957767722473604,
"grad_norm": 1.595018031186073,
"learning_rate": 2.0916672308737464e-06,
"loss": 2.4249,
"step": 1319
},
{
"epoch": 0.7963800904977375,
"grad_norm": 1.7621017326525585,
"learning_rate": 2.079791291423039e-06,
"loss": 2.3145,
"step": 1320
},
{
"epoch": 0.7969834087481147,
"grad_norm": 1.6264135452299817,
"learning_rate": 2.0679452481864247e-06,
"loss": 2.2997,
"step": 1321
},
{
"epoch": 0.7975867269984918,
"grad_norm": 1.6580584494866133,
"learning_rate": 2.0561291458788736e-06,
"loss": 2.3622,
"step": 1322
},
{
"epoch": 0.7981900452488688,
"grad_norm": 1.5566118261930884,
"learning_rate": 2.044343029102328e-06,
"loss": 2.3974,
"step": 1323
},
{
"epoch": 0.7987933634992459,
"grad_norm": 1.7250474143269023,
"learning_rate": 2.0325869423455523e-06,
"loss": 2.3415,
"step": 1324
},
{
"epoch": 0.799396681749623,
"grad_norm": 1.626325032827294,
"learning_rate": 2.0208609299839465e-06,
"loss": 2.3513,
"step": 1325
},
{
"epoch": 0.8,
"grad_norm": 1.774403154637252,
"learning_rate": 2.0091650362794035e-06,
"loss": 2.3231,
"step": 1326
},
{
"epoch": 0.8006033182503771,
"grad_norm": 1.7206279247637468,
"learning_rate": 1.9974993053801186e-06,
"loss": 2.3612,
"step": 1327
},
{
"epoch": 0.8012066365007542,
"grad_norm": 1.8044368171035148,
"learning_rate": 1.9858637813204352e-06,
"loss": 2.4704,
"step": 1328
},
{
"epoch": 0.8018099547511313,
"grad_norm": 1.7163985106653774,
"learning_rate": 1.9742585080206754e-06,
"loss": 2.3197,
"step": 1329
},
{
"epoch": 0.8024132730015083,
"grad_norm": 1.661643572576915,
"learning_rate": 1.962683529286973e-06,
"loss": 2.4173,
"step": 1330
},
{
"epoch": 0.8030165912518854,
"grad_norm": 1.729300657247654,
"learning_rate": 1.951138888811115e-06,
"loss": 2.3567,
"step": 1331
},
{
"epoch": 0.8036199095022625,
"grad_norm": 1.5633312280127363,
"learning_rate": 1.939624630170367e-06,
"loss": 2.3826,
"step": 1332
},
{
"epoch": 0.8042232277526395,
"grad_norm": 1.775527155985604,
"learning_rate": 1.9281407968273115e-06,
"loss": 2.3891,
"step": 1333
},
{
"epoch": 0.8048265460030166,
"grad_norm": 1.540034243183988,
"learning_rate": 1.916687432129688e-06,
"loss": 2.3909,
"step": 1334
},
{
"epoch": 0.8054298642533937,
"grad_norm": 1.651033910298305,
"learning_rate": 1.9052645793102277e-06,
"loss": 2.4138,
"step": 1335
},
{
"epoch": 0.8060331825037708,
"grad_norm": 1.6787312602876567,
"learning_rate": 1.8938722814864863e-06,
"loss": 2.4045,
"step": 1336
},
{
"epoch": 0.8066365007541478,
"grad_norm": 1.6738748640963028,
"learning_rate": 1.882510581660687e-06,
"loss": 2.3668,
"step": 1337
},
{
"epoch": 0.8072398190045249,
"grad_norm": 1.8596276668931182,
"learning_rate": 1.8711795227195528e-06,
"loss": 2.4065,
"step": 1338
},
{
"epoch": 0.807843137254902,
"grad_norm": 1.6277462587992833,
"learning_rate": 1.8598791474341516e-06,
"loss": 2.3414,
"step": 1339
},
{
"epoch": 0.808446455505279,
"grad_norm": 1.7241134762025054,
"learning_rate": 1.8486094984597268e-06,
"loss": 2.3492,
"step": 1340
},
{
"epoch": 0.8090497737556561,
"grad_norm": 1.6923648374789768,
"learning_rate": 1.8373706183355423e-06,
"loss": 2.4431,
"step": 1341
},
{
"epoch": 0.8096530920060332,
"grad_norm": 1.7463635386431937,
"learning_rate": 1.8261625494847156e-06,
"loss": 2.3816,
"step": 1342
},
{
"epoch": 0.8102564102564103,
"grad_norm": 1.699154495364708,
"learning_rate": 1.8149853342140644e-06,
"loss": 2.3481,
"step": 1343
},
{
"epoch": 0.8108597285067873,
"grad_norm": 1.692016876163523,
"learning_rate": 1.8038390147139506e-06,
"loss": 2.367,
"step": 1344
},
{
"epoch": 0.8114630467571644,
"grad_norm": 1.7018531425762653,
"learning_rate": 1.7927236330581e-06,
"loss": 2.3712,
"step": 1345
},
{
"epoch": 0.8120663650075415,
"grad_norm": 1.7025540938243773,
"learning_rate": 1.781639231203467e-06,
"loss": 2.4019,
"step": 1346
},
{
"epoch": 0.8126696832579186,
"grad_norm": 1.6716228928422712,
"learning_rate": 1.770585850990072e-06,
"loss": 2.4047,
"step": 1347
},
{
"epoch": 0.8132730015082956,
"grad_norm": 1.655058366571219,
"learning_rate": 1.7595635341408302e-06,
"loss": 2.3402,
"step": 1348
},
{
"epoch": 0.8138763197586727,
"grad_norm": 1.6490662154522266,
"learning_rate": 1.7485723222614059e-06,
"loss": 2.3871,
"step": 1349
},
{
"epoch": 0.8144796380090498,
"grad_norm": 1.6062153254680276,
"learning_rate": 1.7376122568400533e-06,
"loss": 2.4266,
"step": 1350
},
{
"epoch": 0.8150829562594268,
"grad_norm": 1.663003025297214,
"learning_rate": 1.7266833792474536e-06,
"loss": 2.3385,
"step": 1351
},
{
"epoch": 0.8156862745098039,
"grad_norm": 1.5846299172862364,
"learning_rate": 1.7157857307365733e-06,
"loss": 2.3776,
"step": 1352
},
{
"epoch": 0.816289592760181,
"grad_norm": 1.7929078747126366,
"learning_rate": 1.7049193524424922e-06,
"loss": 2.4375,
"step": 1353
},
{
"epoch": 0.816892911010558,
"grad_norm": 1.6503047291415613,
"learning_rate": 1.6940842853822582e-06,
"loss": 2.3582,
"step": 1354
},
{
"epoch": 0.8174962292609351,
"grad_norm": 1.7197416263684178,
"learning_rate": 1.6832805704547272e-06,
"loss": 2.3355,
"step": 1355
},
{
"epoch": 0.8180995475113122,
"grad_norm": 1.761539196901066,
"learning_rate": 1.6725082484404132e-06,
"loss": 2.3332,
"step": 1356
},
{
"epoch": 0.8187028657616893,
"grad_norm": 1.729763987929638,
"learning_rate": 1.6617673600013295e-06,
"loss": 2.4059,
"step": 1357
},
{
"epoch": 0.8193061840120663,
"grad_norm": 1.6870548202068116,
"learning_rate": 1.6510579456808417e-06,
"loss": 2.3768,
"step": 1358
},
{
"epoch": 0.8199095022624434,
"grad_norm": 1.6953755837989941,
"learning_rate": 1.6403800459035046e-06,
"loss": 2.3077,
"step": 1359
},
{
"epoch": 0.8205128205128205,
"grad_norm": 1.6342968081775047,
"learning_rate": 1.6297337009749249e-06,
"loss": 2.4096,
"step": 1360
},
{
"epoch": 0.8211161387631976,
"grad_norm": 1.80156853216978,
"learning_rate": 1.6191189510815942e-06,
"loss": 2.2616,
"step": 1361
},
{
"epoch": 0.8217194570135746,
"grad_norm": 1.6421938563723806,
"learning_rate": 1.6085358362907423e-06,
"loss": 2.3517,
"step": 1362
},
{
"epoch": 0.8223227752639517,
"grad_norm": 1.6856507493913957,
"learning_rate": 1.5979843965501885e-06,
"loss": 2.4184,
"step": 1363
},
{
"epoch": 0.8229260935143288,
"grad_norm": 1.6662976991524578,
"learning_rate": 1.587464671688187e-06,
"loss": 2.365,
"step": 1364
},
{
"epoch": 0.8235294117647058,
"grad_norm": 1.801209558500375,
"learning_rate": 1.5769767014132885e-06,
"loss": 2.3553,
"step": 1365
},
{
"epoch": 0.8241327300150829,
"grad_norm": 1.7660705340536775,
"learning_rate": 1.5665205253141647e-06,
"loss": 2.4239,
"step": 1366
},
{
"epoch": 0.82473604826546,
"grad_norm": 1.8381496275503855,
"learning_rate": 1.5560961828594845e-06,
"loss": 2.4207,
"step": 1367
},
{
"epoch": 0.8253393665158371,
"grad_norm": 1.7129726266530854,
"learning_rate": 1.5457037133977515e-06,
"loss": 2.4011,
"step": 1368
},
{
"epoch": 0.8259426847662141,
"grad_norm": 1.6162587223613567,
"learning_rate": 1.5353431561571653e-06,
"loss": 2.4168,
"step": 1369
},
{
"epoch": 0.8265460030165912,
"grad_norm": 1.8119562213217464,
"learning_rate": 1.5250145502454594e-06,
"loss": 2.4459,
"step": 1370
},
{
"epoch": 0.8271493212669683,
"grad_norm": 1.7252802585436375,
"learning_rate": 1.5147179346497665e-06,
"loss": 2.4109,
"step": 1371
},
{
"epoch": 0.8277526395173453,
"grad_norm": 1.6087075158634467,
"learning_rate": 1.504453348236461e-06,
"loss": 2.4332,
"step": 1372
},
{
"epoch": 0.8283559577677225,
"grad_norm": 1.7356484029383643,
"learning_rate": 1.4942208297510252e-06,
"loss": 2.271,
"step": 1373
},
{
"epoch": 0.8289592760180996,
"grad_norm": 5.655572111782827,
"learning_rate": 1.4840204178178897e-06,
"loss": 2.4529,
"step": 1374
},
{
"epoch": 0.8295625942684767,
"grad_norm": 1.6446113205280009,
"learning_rate": 1.473852150940297e-06,
"loss": 2.3765,
"step": 1375
},
{
"epoch": 0.8301659125188537,
"grad_norm": 1.7789716974806704,
"learning_rate": 1.4637160675001427e-06,
"loss": 2.4082,
"step": 1376
},
{
"epoch": 0.8307692307692308,
"grad_norm": 1.755320567961607,
"learning_rate": 1.453612205757855e-06,
"loss": 2.3835,
"step": 1377
},
{
"epoch": 0.8313725490196079,
"grad_norm": 1.741950402553732,
"learning_rate": 1.443540603852227e-06,
"loss": 2.4143,
"step": 1378
},
{
"epoch": 0.831975867269985,
"grad_norm": 1.638206384727529,
"learning_rate": 1.433501299800283e-06,
"loss": 2.3397,
"step": 1379
},
{
"epoch": 0.832579185520362,
"grad_norm": 1.6525001416131908,
"learning_rate": 1.4234943314971328e-06,
"loss": 2.4176,
"step": 1380
},
{
"epoch": 0.8331825037707391,
"grad_norm": 1.636977385908322,
"learning_rate": 1.413519736715827e-06,
"loss": 2.3485,
"step": 1381
},
{
"epoch": 0.8337858220211162,
"grad_norm": 1.5618943455158865,
"learning_rate": 1.4035775531072259e-06,
"loss": 2.4065,
"step": 1382
},
{
"epoch": 0.8343891402714932,
"grad_norm": 1.5734395328774438,
"learning_rate": 1.3936678181998376e-06,
"loss": 2.2659,
"step": 1383
},
{
"epoch": 0.8349924585218703,
"grad_norm": 1.8179891942394608,
"learning_rate": 1.3837905693996922e-06,
"loss": 2.3585,
"step": 1384
},
{
"epoch": 0.8355957767722474,
"grad_norm": 1.6722431204654182,
"learning_rate": 1.373945843990192e-06,
"loss": 2.3467,
"step": 1385
},
{
"epoch": 0.8361990950226245,
"grad_norm": 1.6316885953843028,
"learning_rate": 1.3641336791319814e-06,
"loss": 2.3139,
"step": 1386
},
{
"epoch": 0.8368024132730015,
"grad_norm": 1.6498294540673093,
"learning_rate": 1.35435411186279e-06,
"loss": 2.4035,
"step": 1387
},
{
"epoch": 0.8374057315233786,
"grad_norm": 1.663069087036581,
"learning_rate": 1.3446071790973058e-06,
"loss": 2.268,
"step": 1388
},
{
"epoch": 0.8380090497737557,
"grad_norm": 1.5896047232548594,
"learning_rate": 1.334892917627033e-06,
"loss": 2.4154,
"step": 1389
},
{
"epoch": 0.8386123680241327,
"grad_norm": 1.5609131592393046,
"learning_rate": 1.3252113641201537e-06,
"loss": 2.3732,
"step": 1390
},
{
"epoch": 0.8392156862745098,
"grad_norm": 1.661564956740201,
"learning_rate": 1.3155625551213857e-06,
"loss": 2.2994,
"step": 1391
},
{
"epoch": 0.8398190045248869,
"grad_norm": 1.6410595605624814,
"learning_rate": 1.3059465270518469e-06,
"loss": 2.4214,
"step": 1392
},
{
"epoch": 0.840422322775264,
"grad_norm": 1.602523309837128,
"learning_rate": 1.2963633162089174e-06,
"loss": 2.3991,
"step": 1393
},
{
"epoch": 0.841025641025641,
"grad_norm": 1.6334270239786723,
"learning_rate": 1.286812958766106e-06,
"loss": 2.3418,
"step": 1394
},
{
"epoch": 0.8416289592760181,
"grad_norm": 1.7860198422911693,
"learning_rate": 1.2772954907729074e-06,
"loss": 2.3447,
"step": 1395
},
{
"epoch": 0.8422322775263952,
"grad_norm": 1.6682612826514833,
"learning_rate": 1.267810948154674e-06,
"loss": 2.4333,
"step": 1396
},
{
"epoch": 0.8428355957767723,
"grad_norm": 1.6290333910999475,
"learning_rate": 1.2583593667124638e-06,
"loss": 2.3509,
"step": 1397
},
{
"epoch": 0.8434389140271493,
"grad_norm": 1.6691854260224455,
"learning_rate": 1.2489407821229326e-06,
"loss": 2.3983,
"step": 1398
},
{
"epoch": 0.8440422322775264,
"grad_norm": 1.741145842002938,
"learning_rate": 1.2395552299381742e-06,
"loss": 2.3382,
"step": 1399
},
{
"epoch": 0.8446455505279035,
"grad_norm": 1.6289594974977173,
"learning_rate": 1.2302027455855969e-06,
"loss": 2.3712,
"step": 1400
},
{
"epoch": 0.8452488687782805,
"grad_norm": 1.6425484443639187,
"learning_rate": 1.220883364367792e-06,
"loss": 2.4272,
"step": 1401
},
{
"epoch": 0.8458521870286576,
"grad_norm": 1.8073588625784758,
"learning_rate": 1.2115971214623923e-06,
"loss": 2.3826,
"step": 1402
},
{
"epoch": 0.8464555052790347,
"grad_norm": 1.6222889871033748,
"learning_rate": 1.2023440519219508e-06,
"loss": 2.3432,
"step": 1403
},
{
"epoch": 0.8470588235294118,
"grad_norm": 1.7205116329534476,
"learning_rate": 1.1931241906737966e-06,
"loss": 2.3555,
"step": 1404
},
{
"epoch": 0.8476621417797888,
"grad_norm": 1.5973272142066768,
"learning_rate": 1.1839375725199098e-06,
"loss": 2.4325,
"step": 1405
},
{
"epoch": 0.8482654600301659,
"grad_norm": 1.6364231863924403,
"learning_rate": 1.1747842321367886e-06,
"loss": 2.4447,
"step": 1406
},
{
"epoch": 0.848868778280543,
"grad_norm": 1.7216509921725922,
"learning_rate": 1.1656642040753174e-06,
"loss": 2.4644,
"step": 1407
},
{
"epoch": 0.84947209653092,
"grad_norm": 1.882302558304212,
"learning_rate": 1.156577522760639e-06,
"loss": 2.3529,
"step": 1408
},
{
"epoch": 0.8500754147812971,
"grad_norm": 1.67946358471103,
"learning_rate": 1.1475242224920234e-06,
"loss": 2.3677,
"step": 1409
},
{
"epoch": 0.8506787330316742,
"grad_norm": 1.7014475741055364,
"learning_rate": 1.1385043374427341e-06,
"loss": 2.3374,
"step": 1410
},
{
"epoch": 0.8512820512820513,
"grad_norm": 1.6288323501069348,
"learning_rate": 1.129517901659911e-06,
"loss": 2.3707,
"step": 1411
},
{
"epoch": 0.8518853695324283,
"grad_norm": 1.715269118664735,
"learning_rate": 1.1205649490644255e-06,
"loss": 2.351,
"step": 1412
},
{
"epoch": 0.8524886877828054,
"grad_norm": 1.7131818295241272,
"learning_rate": 1.1116455134507665e-06,
"loss": 2.3191,
"step": 1413
},
{
"epoch": 0.8530920060331825,
"grad_norm": 1.6242702822484834,
"learning_rate": 1.1027596284869024e-06,
"loss": 2.4025,
"step": 1414
},
{
"epoch": 0.8536953242835595,
"grad_norm": 1.6766455089052343,
"learning_rate": 1.0939073277141598e-06,
"loss": 2.3731,
"step": 1415
},
{
"epoch": 0.8542986425339366,
"grad_norm": 1.673925183108547,
"learning_rate": 1.0850886445471055e-06,
"loss": 2.4059,
"step": 1416
},
{
"epoch": 0.8549019607843137,
"grad_norm": 1.8068773405643075,
"learning_rate": 1.076303612273395e-06,
"loss": 2.4252,
"step": 1417
},
{
"epoch": 0.8555052790346908,
"grad_norm": 1.6512291093394074,
"learning_rate": 1.0675522640536706e-06,
"loss": 2.3158,
"step": 1418
},
{
"epoch": 0.8561085972850678,
"grad_norm": 1.7090234334662167,
"learning_rate": 1.0588346329214316e-06,
"loss": 2.3748,
"step": 1419
},
{
"epoch": 0.8567119155354449,
"grad_norm": 1.6948548741381664,
"learning_rate": 1.0501507517829012e-06,
"loss": 2.3942,
"step": 1420
},
{
"epoch": 0.857315233785822,
"grad_norm": 1.7826231466250537,
"learning_rate": 1.0415006534169092e-06,
"loss": 2.3419,
"step": 1421
},
{
"epoch": 0.857918552036199,
"grad_norm": 1.7591224301384984,
"learning_rate": 1.0328843704747649e-06,
"loss": 2.3298,
"step": 1422
},
{
"epoch": 0.8585218702865761,
"grad_norm": 1.679495270548289,
"learning_rate": 1.0243019354801353e-06,
"loss": 2.3149,
"step": 1423
},
{
"epoch": 0.8591251885369532,
"grad_norm": 1.7069415551851421,
"learning_rate": 1.0157533808289265e-06,
"loss": 2.3253,
"step": 1424
},
{
"epoch": 0.8597285067873304,
"grad_norm": 1.8331634766626557,
"learning_rate": 1.0072387387891535e-06,
"loss": 2.438,
"step": 1425
},
{
"epoch": 0.8603318250377074,
"grad_norm": 1.791237994709679,
"learning_rate": 9.987580415008224e-07,
"loss": 2.4065,
"step": 1426
},
{
"epoch": 0.8609351432880845,
"grad_norm": 1.6788734645769163,
"learning_rate": 9.903113209758098e-07,
"loss": 2.3797,
"step": 1427
},
{
"epoch": 0.8615384615384616,
"grad_norm": 1.65803920979439,
"learning_rate": 9.8189860909774e-07,
"loss": 2.3822,
"step": 1428
},
{
"epoch": 0.8621417797888387,
"grad_norm": 1.6380863804172392,
"learning_rate": 9.735199376218673e-07,
"loss": 2.3546,
"step": 1429
},
{
"epoch": 0.8627450980392157,
"grad_norm": 1.6992202984146336,
"learning_rate": 9.65175338174954e-07,
"loss": 2.4007,
"step": 1430
},
{
"epoch": 0.8633484162895928,
"grad_norm": 1.746741845341246,
"learning_rate": 9.568648422551486e-07,
"loss": 2.371,
"step": 1431
},
{
"epoch": 0.8639517345399699,
"grad_norm": 1.7520256764048072,
"learning_rate": 9.485884812318769e-07,
"loss": 2.3613,
"step": 1432
},
{
"epoch": 0.864555052790347,
"grad_norm": 1.709025973421567,
"learning_rate": 9.403462863457113e-07,
"loss": 2.4001,
"step": 1433
},
{
"epoch": 0.865158371040724,
"grad_norm": 1.5659271803992887,
"learning_rate": 9.321382887082564e-07,
"loss": 2.4279,
"step": 1434
},
{
"epoch": 0.8657616892911011,
"grad_norm": 1.6574233931237368,
"learning_rate": 9.239645193020386e-07,
"loss": 2.4045,
"step": 1435
},
{
"epoch": 0.8663650075414782,
"grad_norm": 1.7567684546745939,
"learning_rate": 9.158250089803789e-07,
"loss": 2.3661,
"step": 1436
},
{
"epoch": 0.8669683257918552,
"grad_norm": 1.6579040791726805,
"learning_rate": 9.077197884672884e-07,
"loss": 2.3595,
"step": 1437
},
{
"epoch": 0.8675716440422323,
"grad_norm": 1.672146465058933,
"learning_rate": 8.996488883573351e-07,
"loss": 2.3272,
"step": 1438
},
{
"epoch": 0.8681749622926094,
"grad_norm": 1.7982516402454005,
"learning_rate": 8.916123391155473e-07,
"loss": 2.3342,
"step": 1439
},
{
"epoch": 0.8687782805429864,
"grad_norm": 1.6513082922110383,
"learning_rate": 8.836101710772826e-07,
"loss": 2.4187,
"step": 1440
},
{
"epoch": 0.8693815987933635,
"grad_norm": 1.5801424653470257,
"learning_rate": 8.756424144481313e-07,
"loss": 2.3546,
"step": 1441
},
{
"epoch": 0.8699849170437406,
"grad_norm": 1.6204238318617161,
"learning_rate": 8.677090993037817e-07,
"loss": 2.3551,
"step": 1442
},
{
"epoch": 0.8705882352941177,
"grad_norm": 1.819858713479699,
"learning_rate": 8.598102555899224e-07,
"loss": 2.4483,
"step": 1443
},
{
"epoch": 0.8711915535444947,
"grad_norm": 1.8786719832489334,
"learning_rate": 8.519459131221175e-07,
"loss": 2.3604,
"step": 1444
},
{
"epoch": 0.8717948717948718,
"grad_norm": 1.667217198114092,
"learning_rate": 8.441161015857092e-07,
"loss": 2.3518,
"step": 1445
},
{
"epoch": 0.8723981900452489,
"grad_norm": 1.732586131005076,
"learning_rate": 8.36320850535689e-07,
"loss": 2.3817,
"step": 1446
},
{
"epoch": 0.873001508295626,
"grad_norm": 1.632569270479025,
"learning_rate": 8.285601893965989e-07,
"loss": 2.2941,
"step": 1447
},
{
"epoch": 0.873604826546003,
"grad_norm": 1.7481093246691317,
"learning_rate": 8.208341474624071e-07,
"loss": 2.3644,
"step": 1448
},
{
"epoch": 0.8742081447963801,
"grad_norm": 1.592396538771664,
"learning_rate": 8.131427538964165e-07,
"loss": 2.3092,
"step": 1449
},
{
"epoch": 0.8748114630467572,
"grad_norm": 1.7076427444522029,
"learning_rate": 8.054860377311368e-07,
"loss": 2.3829,
"step": 1450
},
{
"epoch": 0.8754147812971342,
"grad_norm": 1.624933821901664,
"learning_rate": 7.978640278681838e-07,
"loss": 2.3219,
"step": 1451
},
{
"epoch": 0.8760180995475113,
"grad_norm": 1.5590949529060047,
"learning_rate": 7.902767530781664e-07,
"loss": 2.3748,
"step": 1452
},
{
"epoch": 0.8766214177978884,
"grad_norm": 1.6724019664102798,
"learning_rate": 7.82724242000581e-07,
"loss": 2.4011,
"step": 1453
},
{
"epoch": 0.8772247360482655,
"grad_norm": 1.7230129361136048,
"learning_rate": 7.752065231437067e-07,
"loss": 2.2988,
"step": 1454
},
{
"epoch": 0.8778280542986425,
"grad_norm": 1.7025219674521455,
"learning_rate": 7.677236248844855e-07,
"loss": 2.3598,
"step": 1455
},
{
"epoch": 0.8784313725490196,
"grad_norm": 1.762003177975733,
"learning_rate": 7.602755754684277e-07,
"loss": 2.3904,
"step": 1456
},
{
"epoch": 0.8784313725490196,
"eval_loss": 2.378657341003418,
"eval_runtime": 21.8258,
"eval_samples_per_second": 4.032,
"eval_steps_per_second": 0.504,
"step": 1456
},
{
"epoch": 0.8790346907993967,
"grad_norm": 1.6970670634162761,
"learning_rate": 7.528624030094978e-07,
"loss": 2.3864,
"step": 1457
},
{
"epoch": 0.8796380090497737,
"grad_norm": 1.7350810480457661,
"learning_rate": 7.454841354900177e-07,
"loss": 2.2923,
"step": 1458
},
{
"epoch": 0.8802413273001508,
"grad_norm": 1.6909883458124622,
"learning_rate": 7.38140800760545e-07,
"loss": 2.3932,
"step": 1459
},
{
"epoch": 0.8808446455505279,
"grad_norm": 1.7254478671591273,
"learning_rate": 7.308324265397837e-07,
"loss": 2.4299,
"step": 1460
},
{
"epoch": 0.881447963800905,
"grad_norm": 1.6323170982575828,
"learning_rate": 7.235590404144688e-07,
"loss": 2.4528,
"step": 1461
},
{
"epoch": 0.882051282051282,
"grad_norm": 1.725888739611964,
"learning_rate": 7.163206698392744e-07,
"loss": 2.4173,
"step": 1462
},
{
"epoch": 0.8826546003016591,
"grad_norm": 1.5964294056930346,
"learning_rate": 7.091173421366937e-07,
"loss": 2.3595,
"step": 1463
},
{
"epoch": 0.8832579185520362,
"grad_norm": 1.6073598119187729,
"learning_rate": 7.01949084496949e-07,
"loss": 2.3126,
"step": 1464
},
{
"epoch": 0.8838612368024132,
"grad_norm": 1.649407476234432,
"learning_rate": 6.948159239778829e-07,
"loss": 2.2887,
"step": 1465
},
{
"epoch": 0.8844645550527903,
"grad_norm": 1.7340350529598598,
"learning_rate": 6.877178875048573e-07,
"loss": 2.3362,
"step": 1466
},
{
"epoch": 0.8850678733031674,
"grad_norm": 1.6839092408802154,
"learning_rate": 6.80655001870657e-07,
"loss": 2.3465,
"step": 1467
},
{
"epoch": 0.8856711915535445,
"grad_norm": 1.6218961606812634,
"learning_rate": 6.736272937353782e-07,
"loss": 2.3734,
"step": 1468
},
{
"epoch": 0.8862745098039215,
"grad_norm": 1.6983984598193318,
"learning_rate": 6.666347896263326e-07,
"loss": 2.4695,
"step": 1469
},
{
"epoch": 0.8868778280542986,
"grad_norm": 1.6624539260672893,
"learning_rate": 6.596775159379543e-07,
"loss": 2.4142,
"step": 1470
},
{
"epoch": 0.8874811463046757,
"grad_norm": 1.8405004468613892,
"learning_rate": 6.527554989316898e-07,
"loss": 2.3249,
"step": 1471
},
{
"epoch": 0.8880844645550527,
"grad_norm": 1.653966697394111,
"learning_rate": 6.458687647359041e-07,
"loss": 2.3264,
"step": 1472
},
{
"epoch": 0.8886877828054298,
"grad_norm": 1.57539011035201,
"learning_rate": 6.3901733934578e-07,
"loss": 2.3421,
"step": 1473
},
{
"epoch": 0.8892911010558069,
"grad_norm": 1.5954240002596058,
"learning_rate": 6.322012486232209e-07,
"loss": 2.401,
"step": 1474
},
{
"epoch": 0.889894419306184,
"grad_norm": 1.6355649776928247,
"learning_rate": 6.254205182967566e-07,
"loss": 2.3746,
"step": 1475
},
{
"epoch": 0.890497737556561,
"grad_norm": 1.6844563379748747,
"learning_rate": 6.186751739614405e-07,
"loss": 2.3419,
"step": 1476
},
{
"epoch": 0.8911010558069382,
"grad_norm": 1.631699028891681,
"learning_rate": 6.119652410787546e-07,
"loss": 2.4116,
"step": 1477
},
{
"epoch": 0.8917043740573153,
"grad_norm": 1.7577623661059945,
"learning_rate": 6.052907449765144e-07,
"loss": 2.3307,
"step": 1478
},
{
"epoch": 0.8923076923076924,
"grad_norm": 1.9272993274326495,
"learning_rate": 5.986517108487754e-07,
"loss": 2.4229,
"step": 1479
},
{
"epoch": 0.8929110105580694,
"grad_norm": 1.731338520723134,
"learning_rate": 5.920481637557318e-07,
"loss": 2.4008,
"step": 1480
},
{
"epoch": 0.8935143288084465,
"grad_norm": 1.7062510402870246,
"learning_rate": 5.8548012862363e-07,
"loss": 2.3974,
"step": 1481
},
{
"epoch": 0.8941176470588236,
"grad_norm": 1.5686806047724653,
"learning_rate": 5.789476302446662e-07,
"loss": 2.3423,
"step": 1482
},
{
"epoch": 0.8947209653092006,
"grad_norm": 1.709116996155703,
"learning_rate": 5.724506932769014e-07,
"loss": 2.4025,
"step": 1483
},
{
"epoch": 0.8953242835595777,
"grad_norm": 1.712797698360303,
"learning_rate": 5.659893422441598e-07,
"loss": 2.3405,
"step": 1484
},
{
"epoch": 0.8959276018099548,
"grad_norm": 1.760912337787795,
"learning_rate": 5.59563601535943e-07,
"loss": 2.3576,
"step": 1485
},
{
"epoch": 0.8965309200603319,
"grad_norm": 1.655187155662018,
"learning_rate": 5.53173495407332e-07,
"loss": 2.3894,
"step": 1486
},
{
"epoch": 0.8971342383107089,
"grad_norm": 1.6356081181884141,
"learning_rate": 5.468190479789015e-07,
"loss": 2.3583,
"step": 1487
},
{
"epoch": 0.897737556561086,
"grad_norm": 1.5988218833944716,
"learning_rate": 5.40500283236628e-07,
"loss": 2.3008,
"step": 1488
},
{
"epoch": 0.8983408748114631,
"grad_norm": 1.669649706786489,
"learning_rate": 5.342172250317946e-07,
"loss": 2.3583,
"step": 1489
},
{
"epoch": 0.8989441930618401,
"grad_norm": 1.6851067875014922,
"learning_rate": 5.279698970809011e-07,
"loss": 2.341,
"step": 1490
},
{
"epoch": 0.8995475113122172,
"grad_norm": 1.714939456296494,
"learning_rate": 5.21758322965581e-07,
"loss": 2.3949,
"step": 1491
},
{
"epoch": 0.9001508295625943,
"grad_norm": 1.6680686230304222,
"learning_rate": 5.155825261325099e-07,
"loss": 2.352,
"step": 1492
},
{
"epoch": 0.9007541478129714,
"grad_norm": 1.694172379851801,
"learning_rate": 5.094425298933136e-07,
"loss": 2.3659,
"step": 1493
},
{
"epoch": 0.9013574660633484,
"grad_norm": 1.5458043855268861,
"learning_rate": 5.033383574244832e-07,
"loss": 2.3697,
"step": 1494
},
{
"epoch": 0.9019607843137255,
"grad_norm": 1.6316763489085158,
"learning_rate": 4.972700317672829e-07,
"loss": 2.3894,
"step": 1495
},
{
"epoch": 0.9025641025641026,
"grad_norm": 1.7347252339740011,
"learning_rate": 4.912375758276744e-07,
"loss": 2.4098,
"step": 1496
},
{
"epoch": 0.9031674208144796,
"grad_norm": 1.6488961343325694,
"learning_rate": 4.852410123762164e-07,
"loss": 2.4116,
"step": 1497
},
{
"epoch": 0.9037707390648567,
"grad_norm": 1.7413362686027694,
"learning_rate": 4.792803640479871e-07,
"loss": 2.4963,
"step": 1498
},
{
"epoch": 0.9043740573152338,
"grad_norm": 1.5131014161067307,
"learning_rate": 4.7335565334249767e-07,
"loss": 2.3327,
"step": 1499
},
{
"epoch": 0.9049773755656109,
"grad_norm": 1.6485978846587896,
"learning_rate": 4.674669026236045e-07,
"loss": 2.3153,
"step": 1500
},
{
"epoch": 0.9055806938159879,
"grad_norm": 1.648121715180723,
"learning_rate": 4.6161413411942913e-07,
"loss": 2.3907,
"step": 1501
},
{
"epoch": 0.906184012066365,
"grad_norm": 1.5954459345960825,
"learning_rate": 4.557973699222706e-07,
"loss": 2.4178,
"step": 1502
},
{
"epoch": 0.9067873303167421,
"grad_norm": 1.7890764532981966,
"learning_rate": 4.500166319885235e-07,
"loss": 2.3997,
"step": 1503
},
{
"epoch": 0.9073906485671192,
"grad_norm": 1.6434691831960049,
"learning_rate": 4.4427194213859216e-07,
"loss": 2.3621,
"step": 1504
},
{
"epoch": 0.9079939668174962,
"grad_norm": 1.6893382016076397,
"learning_rate": 4.385633220568186e-07,
"loss": 2.4687,
"step": 1505
},
{
"epoch": 0.9085972850678733,
"grad_norm": 1.6875837245052094,
"learning_rate": 4.328907932913873e-07,
"loss": 2.4183,
"step": 1506
},
{
"epoch": 0.9092006033182504,
"grad_norm": 1.7217829130925384,
"learning_rate": 4.2725437725424923e-07,
"loss": 2.3603,
"step": 1507
},
{
"epoch": 0.9098039215686274,
"grad_norm": 1.6624234725771576,
"learning_rate": 4.216540952210435e-07,
"loss": 2.3669,
"step": 1508
},
{
"epoch": 0.9104072398190045,
"grad_norm": 1.8787811617905237,
"learning_rate": 4.160899683310171e-07,
"loss": 2.4136,
"step": 1509
},
{
"epoch": 0.9110105580693816,
"grad_norm": 1.8583772875085336,
"learning_rate": 4.1056201758693957e-07,
"loss": 2.2831,
"step": 1510
},
{
"epoch": 0.9116138763197587,
"grad_norm": 1.6368612883465103,
"learning_rate": 4.0507026385502747e-07,
"loss": 2.3401,
"step": 1511
},
{
"epoch": 0.9122171945701357,
"grad_norm": 1.6299438372055655,
"learning_rate": 3.9961472786486655e-07,
"loss": 2.3577,
"step": 1512
},
{
"epoch": 0.9128205128205128,
"grad_norm": 1.7111536524933928,
"learning_rate": 3.9419543020933426e-07,
"loss": 2.4232,
"step": 1513
},
{
"epoch": 0.9134238310708899,
"grad_norm": 1.7191711135202754,
"learning_rate": 3.888123913445174e-07,
"loss": 2.5138,
"step": 1514
},
{
"epoch": 0.9140271493212669,
"grad_norm": 1.6802486623040969,
"learning_rate": 3.834656315896379e-07,
"loss": 2.3612,
"step": 1515
},
{
"epoch": 0.914630467571644,
"grad_norm": 1.5822887932572551,
"learning_rate": 3.7815517112697707e-07,
"loss": 2.4312,
"step": 1516
},
{
"epoch": 0.9152337858220211,
"grad_norm": 1.6833890185021834,
"learning_rate": 3.728810300017949e-07,
"loss": 2.433,
"step": 1517
},
{
"epoch": 0.9158371040723982,
"grad_norm": 1.750558917564341,
"learning_rate": 3.6764322812226416e-07,
"loss": 2.3951,
"step": 1518
},
{
"epoch": 0.9164404223227752,
"grad_norm": 1.566804602225607,
"learning_rate": 3.624417852593842e-07,
"loss": 2.414,
"step": 1519
},
{
"epoch": 0.9170437405731523,
"grad_norm": 1.7643675825250036,
"learning_rate": 3.572767210469086e-07,
"loss": 2.3622,
"step": 1520
},
{
"epoch": 0.9176470588235294,
"grad_norm": 1.6153326061930553,
"learning_rate": 3.521480549812784e-07,
"loss": 2.4519,
"step": 1521
},
{
"epoch": 0.9182503770739064,
"grad_norm": 1.6247385009617024,
"learning_rate": 3.4705580642154126e-07,
"loss": 2.4414,
"step": 1522
},
{
"epoch": 0.9188536953242835,
"grad_norm": 1.6397317973016956,
"learning_rate": 3.4199999458928045e-07,
"loss": 2.3467,
"step": 1523
},
{
"epoch": 0.9194570135746606,
"grad_norm": 1.608345000582214,
"learning_rate": 3.3698063856854257e-07,
"loss": 2.3482,
"step": 1524
},
{
"epoch": 0.9200603318250377,
"grad_norm": 1.6313694914216899,
"learning_rate": 3.319977573057642e-07,
"loss": 2.3654,
"step": 1525
},
{
"epoch": 0.9206636500754147,
"grad_norm": 1.6034585133542865,
"learning_rate": 3.2705136960970554e-07,
"loss": 2.4205,
"step": 1526
},
{
"epoch": 0.9212669683257918,
"grad_norm": 1.7012160770662008,
"learning_rate": 3.221414941513723e-07,
"loss": 2.4297,
"step": 1527
},
{
"epoch": 0.9218702865761689,
"grad_norm": 1.6314068122413963,
"learning_rate": 3.1726814946394736e-07,
"loss": 2.3526,
"step": 1528
},
{
"epoch": 0.9224736048265461,
"grad_norm": 1.7604298285793514,
"learning_rate": 3.124313539427226e-07,
"loss": 2.3841,
"step": 1529
},
{
"epoch": 0.9230769230769231,
"grad_norm": 1.6897111047811944,
"learning_rate": 3.0763112584503264e-07,
"loss": 2.3596,
"step": 1530
},
{
"epoch": 0.9236802413273002,
"grad_norm": 1.901856272346252,
"learning_rate": 3.028674832901757e-07,
"loss": 2.3238,
"step": 1531
},
{
"epoch": 0.9242835595776773,
"grad_norm": 1.6867131472505243,
"learning_rate": 2.9814044425935605e-07,
"loss": 2.419,
"step": 1532
},
{
"epoch": 0.9248868778280543,
"grad_norm": 1.83379043500035,
"learning_rate": 2.934500265956075e-07,
"loss": 2.3716,
"step": 1533
},
{
"epoch": 0.9254901960784314,
"grad_norm": 1.8719709347158549,
"learning_rate": 2.887962480037354e-07,
"loss": 2.4137,
"step": 1534
},
{
"epoch": 0.9260935143288085,
"grad_norm": 1.6300473777324955,
"learning_rate": 2.841791260502402e-07,
"loss": 2.3371,
"step": 1535
},
{
"epoch": 0.9266968325791856,
"grad_norm": 1.6432377193732035,
"learning_rate": 2.7959867816325756e-07,
"loss": 2.4116,
"step": 1536
},
{
"epoch": 0.9273001508295626,
"grad_norm": 1.6374974156620303,
"learning_rate": 2.750549216324894e-07,
"loss": 2.3411,
"step": 1537
},
{
"epoch": 0.9279034690799397,
"grad_norm": 1.6801637904838937,
"learning_rate": 2.7054787360913825e-07,
"loss": 2.414,
"step": 1538
},
{
"epoch": 0.9285067873303168,
"grad_norm": 1.6856307351719522,
"learning_rate": 2.6607755110584886e-07,
"loss": 2.3434,
"step": 1539
},
{
"epoch": 0.9291101055806938,
"grad_norm": 1.603414777890947,
"learning_rate": 2.6164397099663676e-07,
"loss": 2.4188,
"step": 1540
},
{
"epoch": 0.9297134238310709,
"grad_norm": 1.5895687457039656,
"learning_rate": 2.5724715001682053e-07,
"loss": 2.4484,
"step": 1541
},
{
"epoch": 0.930316742081448,
"grad_norm": 1.685584255381596,
"learning_rate": 2.5288710476297553e-07,
"loss": 2.3443,
"step": 1542
},
{
"epoch": 0.9309200603318251,
"grad_norm": 1.6565581708347004,
"learning_rate": 2.4856385169285457e-07,
"loss": 2.3593,
"step": 1543
},
{
"epoch": 0.9315233785822021,
"grad_norm": 1.817733041298196,
"learning_rate": 2.442774071253329e-07,
"loss": 2.4111,
"step": 1544
},
{
"epoch": 0.9321266968325792,
"grad_norm": 1.6109618863028958,
"learning_rate": 2.4002778724034447e-07,
"loss": 2.3405,
"step": 1545
},
{
"epoch": 0.9327300150829563,
"grad_norm": 1.6820631065521003,
"learning_rate": 2.3581500807882462e-07,
"loss": 2.4116,
"step": 1546
},
{
"epoch": 0.9333333333333333,
"grad_norm": 1.6802462851437703,
"learning_rate": 2.3163908554264646e-07,
"loss": 2.4147,
"step": 1547
},
{
"epoch": 0.9339366515837104,
"grad_norm": 1.5529306934293425,
"learning_rate": 2.2750003539456e-07,
"loss": 2.368,
"step": 1548
},
{
"epoch": 0.9345399698340875,
"grad_norm": 1.7540477373608447,
"learning_rate": 2.2339787325813323e-07,
"loss": 2.3551,
"step": 1549
},
{
"epoch": 0.9351432880844646,
"grad_norm": 1.5684347320472798,
"learning_rate": 2.1933261461769772e-07,
"loss": 2.3646,
"step": 1550
},
{
"epoch": 0.9357466063348416,
"grad_norm": 1.6975612580460133,
"learning_rate": 2.15304274818281e-07,
"loss": 2.424,
"step": 1551
},
{
"epoch": 0.9363499245852187,
"grad_norm": 1.6439050320927016,
"learning_rate": 2.1131286906555859e-07,
"loss": 2.4373,
"step": 1552
},
{
"epoch": 0.9369532428355958,
"grad_norm": 1.6463439573689524,
"learning_rate": 2.0735841242578992e-07,
"loss": 2.3877,
"step": 1553
},
{
"epoch": 0.9375565610859729,
"grad_norm": 1.596036485060441,
"learning_rate": 2.034409198257614e-07,
"loss": 2.3444,
"step": 1554
},
{
"epoch": 0.9381598793363499,
"grad_norm": 1.60735589310518,
"learning_rate": 1.9956040605273784e-07,
"loss": 2.382,
"step": 1555
},
{
"epoch": 0.938763197586727,
"grad_norm": 1.7502524053387238,
"learning_rate": 1.9571688575439672e-07,
"loss": 2.3612,
"step": 1556
},
{
"epoch": 0.9393665158371041,
"grad_norm": 1.695777678767089,
"learning_rate": 1.9191037343877729e-07,
"loss": 2.4561,
"step": 1557
},
{
"epoch": 0.9399698340874811,
"grad_norm": 1.7535784624939774,
"learning_rate": 1.8814088347422822e-07,
"loss": 2.4411,
"step": 1558
},
{
"epoch": 0.9405731523378582,
"grad_norm": 1.7687315605771063,
"learning_rate": 1.844084300893456e-07,
"loss": 2.3016,
"step": 1559
},
{
"epoch": 0.9411764705882353,
"grad_norm": 1.6191879781315448,
"learning_rate": 1.8071302737293294e-07,
"loss": 2.4139,
"step": 1560
},
{
"epoch": 0.9417797888386124,
"grad_norm": 1.7873729255837811,
"learning_rate": 1.770546892739322e-07,
"loss": 2.413,
"step": 1561
},
{
"epoch": 0.9423831070889894,
"grad_norm": 1.619912308221573,
"learning_rate": 1.7343342960138064e-07,
"loss": 2.3733,
"step": 1562
},
{
"epoch": 0.9429864253393665,
"grad_norm": 1.6108954711011358,
"learning_rate": 1.6984926202435527e-07,
"loss": 2.3484,
"step": 1563
},
{
"epoch": 0.9435897435897436,
"grad_norm": 1.7174024797225025,
"learning_rate": 1.6630220007192722e-07,
"loss": 2.4189,
"step": 1564
},
{
"epoch": 0.9441930618401206,
"grad_norm": 1.6005639732300854,
"learning_rate": 1.6279225713310088e-07,
"loss": 2.3632,
"step": 1565
},
{
"epoch": 0.9447963800904977,
"grad_norm": 1.7260576555936502,
"learning_rate": 1.5931944645677043e-07,
"loss": 2.2744,
"step": 1566
},
{
"epoch": 0.9453996983408748,
"grad_norm": 1.6490451332752685,
"learning_rate": 1.558837811516667e-07,
"loss": 2.4731,
"step": 1567
},
{
"epoch": 0.9460030165912519,
"grad_norm": 1.6988860268737969,
"learning_rate": 1.5248527418631254e-07,
"loss": 2.3633,
"step": 1568
},
{
"epoch": 0.9466063348416289,
"grad_norm": 1.7173844921390578,
"learning_rate": 1.4912393838896422e-07,
"loss": 2.3386,
"step": 1569
},
{
"epoch": 0.947209653092006,
"grad_norm": 1.7545899522448907,
"learning_rate": 1.4579978644757463e-07,
"loss": 2.3882,
"step": 1570
},
{
"epoch": 0.9478129713423831,
"grad_norm": 1.72621481584512,
"learning_rate": 1.4251283090973567e-07,
"loss": 2.3771,
"step": 1571
},
{
"epoch": 0.9484162895927601,
"grad_norm": 1.642167082784339,
"learning_rate": 1.392630841826359e-07,
"loss": 2.3513,
"step": 1572
},
{
"epoch": 0.9490196078431372,
"grad_norm": 1.6632894815030224,
"learning_rate": 1.360505585330152e-07,
"loss": 2.4374,
"step": 1573
},
{
"epoch": 0.9496229260935143,
"grad_norm": 1.7481181022414323,
"learning_rate": 1.3287526608711132e-07,
"loss": 2.411,
"step": 1574
},
{
"epoch": 0.9502262443438914,
"grad_norm": 1.6299990975986602,
"learning_rate": 1.297372188306234e-07,
"loss": 2.3651,
"step": 1575
},
{
"epoch": 0.9508295625942684,
"grad_norm": 1.632744134449333,
"learning_rate": 1.2663642860865854e-07,
"loss": 2.3385,
"step": 1576
},
{
"epoch": 0.9514328808446455,
"grad_norm": 1.669311029172285,
"learning_rate": 1.2357290712569304e-07,
"loss": 2.371,
"step": 1577
},
{
"epoch": 0.9520361990950226,
"grad_norm": 1.6777638968211144,
"learning_rate": 1.2054666594552568e-07,
"loss": 2.3964,
"step": 1578
},
{
"epoch": 0.9526395173453996,
"grad_norm": 1.8704079528291009,
"learning_rate": 1.1755771649123337e-07,
"loss": 2.331,
"step": 1579
},
{
"epoch": 0.9532428355957768,
"grad_norm": 1.6167520099218842,
"learning_rate": 1.1460607004512681e-07,
"loss": 2.3648,
"step": 1580
},
{
"epoch": 0.9538461538461539,
"grad_norm": 1.5402727403200858,
"learning_rate": 1.1169173774871478e-07,
"loss": 2.3065,
"step": 1581
},
{
"epoch": 0.954449472096531,
"grad_norm": 1.7323323663200998,
"learning_rate": 1.0881473060265325e-07,
"loss": 2.4157,
"step": 1582
},
{
"epoch": 0.955052790346908,
"grad_norm": 1.6250278954640458,
"learning_rate": 1.0597505946670972e-07,
"loss": 2.4076,
"step": 1583
},
{
"epoch": 0.9556561085972851,
"grad_norm": 1.7877603632530947,
"learning_rate": 1.0317273505972003e-07,
"loss": 2.4501,
"step": 1584
},
{
"epoch": 0.9562594268476622,
"grad_norm": 1.690873762437158,
"learning_rate": 1.004077679595472e-07,
"loss": 2.3116,
"step": 1585
},
{
"epoch": 0.9568627450980393,
"grad_norm": 1.7188774311515,
"learning_rate": 9.768016860304485e-08,
"loss": 2.3585,
"step": 1586
},
{
"epoch": 0.9574660633484163,
"grad_norm": 1.612172189966338,
"learning_rate": 9.498994728601386e-08,
"loss": 2.3611,
"step": 1587
},
{
"epoch": 0.9580693815987934,
"grad_norm": 1.7076734282514445,
"learning_rate": 9.233711416316571e-08,
"loss": 2.3298,
"step": 1588
},
{
"epoch": 0.9586726998491705,
"grad_norm": 1.6271830112713341,
"learning_rate": 8.972167924808151e-08,
"loss": 2.4282,
"step": 1589
},
{
"epoch": 0.9592760180995475,
"grad_norm": 1.6452718258981232,
"learning_rate": 8.714365241318079e-08,
"loss": 2.2814,
"step": 1590
},
{
"epoch": 0.9598793363499246,
"grad_norm": 1.736858370561717,
"learning_rate": 8.460304338967496e-08,
"loss": 2.2471,
"step": 1591
},
{
"epoch": 0.9604826546003017,
"grad_norm": 1.698600906604979,
"learning_rate": 8.209986176753947e-08,
"loss": 2.3503,
"step": 1592
},
{
"epoch": 0.9610859728506788,
"grad_norm": 1.7115650921235999,
"learning_rate": 7.963411699546952e-08,
"loss": 2.3511,
"step": 1593
},
{
"epoch": 0.9616892911010558,
"grad_norm": 1.676613995847439,
"learning_rate": 7.720581838085106e-08,
"loss": 2.4299,
"step": 1594
},
{
"epoch": 0.9622926093514329,
"grad_norm": 1.5996939948140017,
"learning_rate": 7.481497508972313e-08,
"loss": 2.2491,
"step": 1595
},
{
"epoch": 0.96289592760181,
"grad_norm": 1.6337951191726159,
"learning_rate": 7.24615961467412e-08,
"loss": 2.4536,
"step": 1596
},
{
"epoch": 0.963499245852187,
"grad_norm": 1.7162528403403121,
"learning_rate": 7.014569043514496e-08,
"loss": 2.4588,
"step": 1597
},
{
"epoch": 0.9641025641025641,
"grad_norm": 1.701535912798598,
"learning_rate": 6.78672666967295e-08,
"loss": 2.339,
"step": 1598
},
{
"epoch": 0.9647058823529412,
"grad_norm": 1.7375073029608583,
"learning_rate": 6.562633353180081e-08,
"loss": 2.3682,
"step": 1599
},
{
"epoch": 0.9653092006033183,
"grad_norm": 1.6157005142344734,
"learning_rate": 6.342289939915369e-08,
"loss": 2.3839,
"step": 1600
},
{
"epoch": 0.9659125188536953,
"grad_norm": 1.7080860462253225,
"learning_rate": 6.125697261603725e-08,
"loss": 2.4579,
"step": 1601
},
{
"epoch": 0.9665158371040724,
"grad_norm": 1.6596429838292508,
"learning_rate": 5.912856135812051e-08,
"loss": 2.4166,
"step": 1602
},
{
"epoch": 0.9671191553544495,
"grad_norm": 1.6281663411559284,
"learning_rate": 5.7037673659464664e-08,
"loss": 2.3515,
"step": 1603
},
{
"epoch": 0.9677224736048265,
"grad_norm": 1.6615869318672554,
"learning_rate": 5.498431741249089e-08,
"loss": 2.4207,
"step": 1604
},
{
"epoch": 0.9683257918552036,
"grad_norm": 1.6928639425812697,
"learning_rate": 5.2968500367951425e-08,
"loss": 2.3342,
"step": 1605
},
{
"epoch": 0.9689291101055807,
"grad_norm": 1.6711375673234121,
"learning_rate": 5.0990230134900786e-08,
"loss": 2.3565,
"step": 1606
},
{
"epoch": 0.9695324283559578,
"grad_norm": 1.6097613979206937,
"learning_rate": 4.904951418066684e-08,
"loss": 2.3441,
"step": 1607
},
{
"epoch": 0.9701357466063348,
"grad_norm": 1.8307288771308332,
"learning_rate": 4.7146359830821944e-08,
"loss": 2.3499,
"step": 1608
},
{
"epoch": 0.9707390648567119,
"grad_norm": 1.6421462675997172,
"learning_rate": 4.528077426915412e-08,
"loss": 2.4347,
"step": 1609
},
{
"epoch": 0.971342383107089,
"grad_norm": 1.5882185826913309,
"learning_rate": 4.345276453764258e-08,
"loss": 2.3434,
"step": 1610
},
{
"epoch": 0.971945701357466,
"grad_norm": 1.6558426293124853,
"learning_rate": 4.166233753643112e-08,
"loss": 2.3193,
"step": 1611
},
{
"epoch": 0.9725490196078431,
"grad_norm": 1.7560871448672373,
"learning_rate": 3.990950002380034e-08,
"loss": 2.3671,
"step": 1612
},
{
"epoch": 0.9731523378582202,
"grad_norm": 1.6400996276397237,
"learning_rate": 3.81942586161399e-08,
"loss": 2.3296,
"step": 1613
},
{
"epoch": 0.9737556561085973,
"grad_norm": 1.5216875785599606,
"learning_rate": 3.651661978793075e-08,
"loss": 2.4094,
"step": 1614
},
{
"epoch": 0.9743589743589743,
"grad_norm": 1.5635756439509034,
"learning_rate": 3.487658987171294e-08,
"loss": 2.4104,
"step": 1615
},
{
"epoch": 0.9749622926093514,
"grad_norm": 1.641632266579705,
"learning_rate": 3.327417505806785e-08,
"loss": 2.4169,
"step": 1616
},
{
"epoch": 0.9755656108597285,
"grad_norm": 1.7298559985631585,
"learning_rate": 3.170938139558932e-08,
"loss": 2.3705,
"step": 1617
},
{
"epoch": 0.9761689291101056,
"grad_norm": 1.6921930674125782,
"learning_rate": 3.0182214790865915e-08,
"loss": 2.3795,
"step": 1618
},
{
"epoch": 0.9767722473604826,
"grad_norm": 1.771755697713286,
"learning_rate": 2.8692681008454238e-08,
"loss": 2.3752,
"step": 1619
},
{
"epoch": 0.9773755656108597,
"grad_norm": 1.6033324873008887,
"learning_rate": 2.724078567086119e-08,
"loss": 2.3251,
"step": 1620
},
{
"epoch": 0.9779788838612368,
"grad_norm": 1.803718606228007,
"learning_rate": 2.5826534258520663e-08,
"loss": 2.3912,
"step": 1621
},
{
"epoch": 0.9785822021116138,
"grad_norm": 1.655323213505876,
"learning_rate": 2.44499321097702e-08,
"loss": 2.3988,
"step": 1622
},
{
"epoch": 0.9791855203619909,
"grad_norm": 1.6389264444134706,
"learning_rate": 2.311098442083659e-08,
"loss": 2.3493,
"step": 1623
},
{
"epoch": 0.979788838612368,
"grad_norm": 1.9126115252359497,
"learning_rate": 2.180969624581253e-08,
"loss": 2.4097,
"step": 1624
},
{
"epoch": 0.9803921568627451,
"grad_norm": 1.6775223298076922,
"learning_rate": 2.054607249663665e-08,
"loss": 2.4115,
"step": 1625
},
{
"epoch": 0.9809954751131221,
"grad_norm": 1.6034357462290654,
"learning_rate": 1.9320117943080198e-08,
"loss": 2.418,
"step": 1626
},
{
"epoch": 0.9815987933634992,
"grad_norm": 1.614030224153856,
"learning_rate": 1.813183721272038e-08,
"loss": 2.3146,
"step": 1627
},
{
"epoch": 0.9822021116138763,
"grad_norm": 1.577279995047101,
"learning_rate": 1.698123479093372e-08,
"loss": 2.3794,
"step": 1628
},
{
"epoch": 0.9828054298642533,
"grad_norm": 1.6247769579619815,
"learning_rate": 1.5868315020868276e-08,
"loss": 2.3719,
"step": 1629
},
{
"epoch": 0.9834087481146304,
"grad_norm": 1.7166433701178447,
"learning_rate": 1.4793082103435885e-08,
"loss": 2.4619,
"step": 1630
},
{
"epoch": 0.9840120663650075,
"grad_norm": 1.7219302287966303,
"learning_rate": 1.3755540097291076e-08,
"loss": 2.3699,
"step": 1631
},
{
"epoch": 0.9846153846153847,
"grad_norm": 1.6258356906532427,
"learning_rate": 1.275569291881662e-08,
"loss": 2.3593,
"step": 1632
},
{
"epoch": 0.9852187028657617,
"grad_norm": 1.6647030705810246,
"learning_rate": 1.179354434211355e-08,
"loss": 2.4539,
"step": 1633
},
{
"epoch": 0.9858220211161388,
"grad_norm": 1.8941466485318852,
"learning_rate": 1.0869097998976729e-08,
"loss": 2.372,
"step": 1634
},
{
"epoch": 0.9864253393665159,
"grad_norm": 1.694320915923539,
"learning_rate": 9.982357378891528e-09,
"loss": 2.3551,
"step": 1635
},
{
"epoch": 0.987028657616893,
"grad_norm": 1.5883863869845254,
"learning_rate": 9.13332582901716e-09,
"loss": 2.3607,
"step": 1636
},
{
"epoch": 0.98763197586727,
"grad_norm": 1.657927151109981,
"learning_rate": 8.322006554171147e-09,
"loss": 2.4434,
"step": 1637
},
{
"epoch": 0.9882352941176471,
"grad_norm": 1.5879679104330986,
"learning_rate": 7.548402616819328e-09,
"loss": 2.4031,
"step": 1638
},
{
"epoch": 0.9888386123680242,
"grad_norm": 1.6918171007969323,
"learning_rate": 6.812516937065861e-09,
"loss": 2.4065,
"step": 1639
},
{
"epoch": 0.9894419306184012,
"grad_norm": 1.7232747761996445,
"learning_rate": 6.114352292639902e-09,
"loss": 2.3715,
"step": 1640
},
{
"epoch": 0.9900452488687783,
"grad_norm": 1.7276585553136063,
"learning_rate": 5.453911318886729e-09,
"loss": 2.385,
"step": 1641
},
{
"epoch": 0.9906485671191554,
"grad_norm": 1.7987368522109572,
"learning_rate": 4.83119650875552e-09,
"loss": 2.3348,
"step": 1642
},
{
"epoch": 0.9912518853695325,
"grad_norm": 1.6681326236785403,
"learning_rate": 4.246210212791591e-09,
"loss": 2.3628,
"step": 1643
},
{
"epoch": 0.9918552036199095,
"grad_norm": 1.6883734713107637,
"learning_rate": 3.698954639129726e-09,
"loss": 2.3348,
"step": 1644
},
{
"epoch": 0.9924585218702866,
"grad_norm": 1.6356203453610036,
"learning_rate": 3.1894318534819725e-09,
"loss": 2.3339,
"step": 1645
},
{
"epoch": 0.9930618401206637,
"grad_norm": 1.6389219683246756,
"learning_rate": 2.717643779129864e-09,
"loss": 2.3864,
"step": 1646
},
{
"epoch": 0.9936651583710407,
"grad_norm": 1.7109810853278875,
"learning_rate": 2.2835921969210917e-09,
"loss": 2.3851,
"step": 1647
},
{
"epoch": 0.9942684766214178,
"grad_norm": 1.6983534471695187,
"learning_rate": 1.8872787452584028e-09,
"loss": 2.4063,
"step": 1648
},
{
"epoch": 0.9948717948717949,
"grad_norm": 1.8209800366157713,
"learning_rate": 1.5287049200962688e-09,
"loss": 2.4157,
"step": 1649
},
{
"epoch": 0.995475113122172,
"grad_norm": 1.711823043494049,
"learning_rate": 1.2078720749364447e-09,
"loss": 2.3617,
"step": 1650
},
{
"epoch": 0.996078431372549,
"grad_norm": 1.6751671226929132,
"learning_rate": 9.24781420816867e-10,
"loss": 2.384,
"step": 1651
},
{
"epoch": 0.9966817496229261,
"grad_norm": 1.6900768872777132,
"learning_rate": 6.794340263127641e-10,
"loss": 2.4084,
"step": 1652
},
{
"epoch": 0.9972850678733032,
"grad_norm": 1.6710422234638216,
"learning_rate": 4.718308175311049e-10,
"loss": 2.4462,
"step": 1653
},
{
"epoch": 0.9978883861236802,
"grad_norm": 1.6582342788101545,
"learning_rate": 3.0197257810615774e-10,
"loss": 2.392,
"step": 1654
},
{
"epoch": 0.9984917043740573,
"grad_norm": 1.7272268608762122,
"learning_rate": 1.69859949198381e-10,
"loss": 2.3052,
"step": 1655
},
{
"epoch": 0.9990950226244344,
"grad_norm": 1.770451843913821,
"learning_rate": 7.549342948887095e-11,
"loss": 2.4259,
"step": 1656
},
{
"epoch": 0.9996983408748115,
"grad_norm": 1.6154000753848161,
"learning_rate": 1.8873375182693054e-11,
"loss": 2.3743,
"step": 1657
}
],
"logging_steps": 1,
"max_steps": 1657,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 829,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 540265949429760.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}