{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1212,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008250825082508251,
"grad_norm": 1.7177605628967285,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.0661,
"step": 1
},
{
"epoch": 0.0016501650165016502,
"grad_norm": 1.5985264778137207,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.0534,
"step": 2
},
{
"epoch": 0.0024752475247524753,
"grad_norm": 1.708420753479004,
"learning_rate": 1.5000000000000002e-07,
"loss": 1.0598,
"step": 3
},
{
"epoch": 0.0033003300330033004,
"grad_norm": 1.608269453048706,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.0497,
"step": 4
},
{
"epoch": 0.004125412541254125,
"grad_norm": 1.68818998336792,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.0521,
"step": 5
},
{
"epoch": 0.0049504950495049506,
"grad_norm": 1.5812345743179321,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.0328,
"step": 6
},
{
"epoch": 0.005775577557755775,
"grad_norm": 1.5700342655181885,
"learning_rate": 3.5000000000000004e-07,
"loss": 1.0412,
"step": 7
},
{
"epoch": 0.006600660066006601,
"grad_norm": 1.6265473365783691,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.0552,
"step": 8
},
{
"epoch": 0.007425742574257425,
"grad_norm": 1.623134970664978,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.0526,
"step": 9
},
{
"epoch": 0.00825082508250825,
"grad_norm": 1.6078376770019531,
"learning_rate": 5.000000000000001e-07,
"loss": 1.04,
"step": 10
},
{
"epoch": 0.009075907590759076,
"grad_norm": 1.6052888631820679,
"learning_rate": 5.5e-07,
"loss": 1.0722,
"step": 11
},
{
"epoch": 0.009900990099009901,
"grad_norm": 1.5547384023666382,
"learning_rate": 6.000000000000001e-07,
"loss": 1.0436,
"step": 12
},
{
"epoch": 0.010726072607260726,
"grad_norm": 1.5156539678573608,
"learning_rate": 6.5e-07,
"loss": 1.0181,
"step": 13
},
{
"epoch": 0.01155115511551155,
"grad_norm": 1.4860498905181885,
"learning_rate": 7.000000000000001e-07,
"loss": 1.0169,
"step": 14
},
{
"epoch": 0.012376237623762377,
"grad_norm": 1.4493576288223267,
"learning_rate": 7.5e-07,
"loss": 1.0193,
"step": 15
},
{
"epoch": 0.013201320132013201,
"grad_norm": 1.4495404958724976,
"learning_rate": 8.000000000000001e-07,
"loss": 1.0234,
"step": 16
},
{
"epoch": 0.014026402640264026,
"grad_norm": 1.4204658269882202,
"learning_rate": 8.500000000000001e-07,
"loss": 1.0524,
"step": 17
},
{
"epoch": 0.01485148514851485,
"grad_norm": 1.24644136428833,
"learning_rate": 9.000000000000001e-07,
"loss": 1.0206,
"step": 18
},
{
"epoch": 0.015676567656765675,
"grad_norm": 1.1867165565490723,
"learning_rate": 9.500000000000001e-07,
"loss": 1.0333,
"step": 19
},
{
"epoch": 0.0165016501650165,
"grad_norm": 1.0698730945587158,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0293,
"step": 20
},
{
"epoch": 0.017326732673267328,
"grad_norm": 0.9976843595504761,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.012,
"step": 21
},
{
"epoch": 0.018151815181518153,
"grad_norm": 0.9542626738548279,
"learning_rate": 1.1e-06,
"loss": 0.9987,
"step": 22
},
{
"epoch": 0.018976897689768978,
"grad_norm": 0.9308854341506958,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.0111,
"step": 23
},
{
"epoch": 0.019801980198019802,
"grad_norm": 0.8705531358718872,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.9736,
"step": 24
},
{
"epoch": 0.020627062706270627,
"grad_norm": 0.8885819315910339,
"learning_rate": 1.25e-06,
"loss": 0.9616,
"step": 25
},
{
"epoch": 0.02145214521452145,
"grad_norm": 0.8245412111282349,
"learning_rate": 1.3e-06,
"loss": 0.9697,
"step": 26
},
{
"epoch": 0.022277227722772276,
"grad_norm": 0.7995723485946655,
"learning_rate": 1.3500000000000002e-06,
"loss": 0.9678,
"step": 27
},
{
"epoch": 0.0231023102310231,
"grad_norm": 0.7252822518348694,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.9638,
"step": 28
},
{
"epoch": 0.02392739273927393,
"grad_norm": 0.6858912706375122,
"learning_rate": 1.45e-06,
"loss": 0.9485,
"step": 29
},
{
"epoch": 0.024752475247524754,
"grad_norm": 0.6455612778663635,
"learning_rate": 1.5e-06,
"loss": 0.9397,
"step": 30
},
{
"epoch": 0.02557755775577558,
"grad_norm": 0.6329395771026611,
"learning_rate": 1.5500000000000002e-06,
"loss": 0.9771,
"step": 31
},
{
"epoch": 0.026402640264026403,
"grad_norm": 0.6316455602645874,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.9789,
"step": 32
},
{
"epoch": 0.027227722772277228,
"grad_norm": 0.6126256585121155,
"learning_rate": 1.6500000000000003e-06,
"loss": 0.9506,
"step": 33
},
{
"epoch": 0.028052805280528052,
"grad_norm": 0.5972760319709778,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.964,
"step": 34
},
{
"epoch": 0.028877887788778877,
"grad_norm": 0.5646793246269226,
"learning_rate": 1.75e-06,
"loss": 0.9211,
"step": 35
},
{
"epoch": 0.0297029702970297,
"grad_norm": 0.5675886869430542,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.946,
"step": 36
},
{
"epoch": 0.03052805280528053,
"grad_norm": 0.540591299533844,
"learning_rate": 1.85e-06,
"loss": 0.9402,
"step": 37
},
{
"epoch": 0.03135313531353135,
"grad_norm": 0.5284631848335266,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.918,
"step": 38
},
{
"epoch": 0.03217821782178218,
"grad_norm": 0.5251491665840149,
"learning_rate": 1.9500000000000004e-06,
"loss": 0.9419,
"step": 39
},
{
"epoch": 0.033003300330033,
"grad_norm": 0.5078873038291931,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9256,
"step": 40
},
{
"epoch": 0.03382838283828383,
"grad_norm": 0.5054848194122314,
"learning_rate": 2.05e-06,
"loss": 0.9344,
"step": 41
},
{
"epoch": 0.034653465346534656,
"grad_norm": 0.49411484599113464,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.9143,
"step": 42
},
{
"epoch": 0.03547854785478548,
"grad_norm": 0.4762454330921173,
"learning_rate": 2.15e-06,
"loss": 0.9235,
"step": 43
},
{
"epoch": 0.036303630363036306,
"grad_norm": 0.46506863832473755,
"learning_rate": 2.2e-06,
"loss": 0.9018,
"step": 44
},
{
"epoch": 0.03712871287128713,
"grad_norm": 0.4539809226989746,
"learning_rate": 2.25e-06,
"loss": 0.8876,
"step": 45
},
{
"epoch": 0.037953795379537955,
"grad_norm": 0.4611811935901642,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.9106,
"step": 46
},
{
"epoch": 0.038778877887788776,
"grad_norm": 0.4345838129520416,
"learning_rate": 2.35e-06,
"loss": 0.8899,
"step": 47
},
{
"epoch": 0.039603960396039604,
"grad_norm": 0.4400959014892578,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.8978,
"step": 48
},
{
"epoch": 0.040429042904290426,
"grad_norm": 0.42290085554122925,
"learning_rate": 2.4500000000000003e-06,
"loss": 0.8991,
"step": 49
},
{
"epoch": 0.041254125412541254,
"grad_norm": 0.4143967032432556,
"learning_rate": 2.5e-06,
"loss": 0.869,
"step": 50
},
{
"epoch": 0.04207920792079208,
"grad_norm": 0.39597901701927185,
"learning_rate": 2.55e-06,
"loss": 0.8457,
"step": 51
},
{
"epoch": 0.0429042904290429,
"grad_norm": 0.3814972937107086,
"learning_rate": 2.6e-06,
"loss": 0.8418,
"step": 52
},
{
"epoch": 0.04372937293729373,
"grad_norm": 0.398303359746933,
"learning_rate": 2.6500000000000005e-06,
"loss": 0.8684,
"step": 53
},
{
"epoch": 0.04455445544554455,
"grad_norm": 0.3740525543689728,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.8673,
"step": 54
},
{
"epoch": 0.04537953795379538,
"grad_norm": 0.4020557701587677,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.8882,
"step": 55
},
{
"epoch": 0.0462046204620462,
"grad_norm": 0.38221463561058044,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.8663,
"step": 56
},
{
"epoch": 0.04702970297029703,
"grad_norm": 0.3905200958251953,
"learning_rate": 2.85e-06,
"loss": 0.8683,
"step": 57
},
{
"epoch": 0.04785478547854786,
"grad_norm": 0.3818514347076416,
"learning_rate": 2.9e-06,
"loss": 0.8721,
"step": 58
},
{
"epoch": 0.04867986798679868,
"grad_norm": 0.35962340235710144,
"learning_rate": 2.95e-06,
"loss": 0.8523,
"step": 59
},
{
"epoch": 0.04950495049504951,
"grad_norm": 0.3732520341873169,
"learning_rate": 3e-06,
"loss": 0.8376,
"step": 60
},
{
"epoch": 0.05033003300330033,
"grad_norm": 0.3615162670612335,
"learning_rate": 3.05e-06,
"loss": 0.8523,
"step": 61
},
{
"epoch": 0.05115511551155116,
"grad_norm": 0.37727200984954834,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.8296,
"step": 62
},
{
"epoch": 0.05198019801980198,
"grad_norm": 0.35481664538383484,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.8513,
"step": 63
},
{
"epoch": 0.052805280528052806,
"grad_norm": 0.35886090993881226,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.8584,
"step": 64
},
{
"epoch": 0.05363036303630363,
"grad_norm": 0.3518712818622589,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.8583,
"step": 65
},
{
"epoch": 0.054455445544554455,
"grad_norm": 0.33794984221458435,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.8414,
"step": 66
},
{
"epoch": 0.05528052805280528,
"grad_norm": 0.336191862821579,
"learning_rate": 3.3500000000000005e-06,
"loss": 0.8674,
"step": 67
},
{
"epoch": 0.056105610561056105,
"grad_norm": 0.3314290940761566,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.8182,
"step": 68
},
{
"epoch": 0.05693069306930693,
"grad_norm": 0.3418997526168823,
"learning_rate": 3.45e-06,
"loss": 0.8405,
"step": 69
},
{
"epoch": 0.057755775577557754,
"grad_norm": 0.3234967589378357,
"learning_rate": 3.5e-06,
"loss": 0.8417,
"step": 70
},
{
"epoch": 0.05858085808580858,
"grad_norm": 0.33048129081726074,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.8213,
"step": 71
},
{
"epoch": 0.0594059405940594,
"grad_norm": 0.3219156563282013,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.8266,
"step": 72
},
{
"epoch": 0.06023102310231023,
"grad_norm": 0.31983497738838196,
"learning_rate": 3.65e-06,
"loss": 0.8298,
"step": 73
},
{
"epoch": 0.06105610561056106,
"grad_norm": 0.32737359404563904,
"learning_rate": 3.7e-06,
"loss": 0.8048,
"step": 74
},
{
"epoch": 0.06188118811881188,
"grad_norm": 0.325057715177536,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.8299,
"step": 75
},
{
"epoch": 0.0627062706270627,
"grad_norm": 0.32514944672584534,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.8279,
"step": 76
},
{
"epoch": 0.06353135313531354,
"grad_norm": 0.33182644844055176,
"learning_rate": 3.85e-06,
"loss": 0.8295,
"step": 77
},
{
"epoch": 0.06435643564356436,
"grad_norm": 0.3327374756336212,
"learning_rate": 3.900000000000001e-06,
"loss": 0.8365,
"step": 78
},
{
"epoch": 0.06518151815181518,
"grad_norm": 0.31190282106399536,
"learning_rate": 3.95e-06,
"loss": 0.8437,
"step": 79
},
{
"epoch": 0.066006600660066,
"grad_norm": 0.3261486887931824,
"learning_rate": 4.000000000000001e-06,
"loss": 0.8258,
"step": 80
},
{
"epoch": 0.06683168316831684,
"grad_norm": 0.3433217704296112,
"learning_rate": 4.05e-06,
"loss": 0.8379,
"step": 81
},
{
"epoch": 0.06765676567656766,
"grad_norm": 0.32538896799087524,
"learning_rate": 4.1e-06,
"loss": 0.8037,
"step": 82
},
{
"epoch": 0.06848184818481848,
"grad_norm": 0.3292044401168823,
"learning_rate": 4.15e-06,
"loss": 0.8385,
"step": 83
},
{
"epoch": 0.06930693069306931,
"grad_norm": 0.32659634947776794,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.8648,
"step": 84
},
{
"epoch": 0.07013201320132013,
"grad_norm": 0.32226109504699707,
"learning_rate": 4.25e-06,
"loss": 0.8544,
"step": 85
},
{
"epoch": 0.07095709570957096,
"grad_norm": 0.3303010165691376,
"learning_rate": 4.3e-06,
"loss": 0.8162,
"step": 86
},
{
"epoch": 0.07178217821782178,
"grad_norm": 0.3263317346572876,
"learning_rate": 4.350000000000001e-06,
"loss": 0.8326,
"step": 87
},
{
"epoch": 0.07260726072607261,
"grad_norm": 0.31490302085876465,
"learning_rate": 4.4e-06,
"loss": 0.821,
"step": 88
},
{
"epoch": 0.07343234323432343,
"grad_norm": 0.32537841796875,
"learning_rate": 4.450000000000001e-06,
"loss": 0.8262,
"step": 89
},
{
"epoch": 0.07425742574257425,
"grad_norm": 0.32452619075775146,
"learning_rate": 4.5e-06,
"loss": 0.7995,
"step": 90
},
{
"epoch": 0.07508250825082509,
"grad_norm": 0.3285425901412964,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.8243,
"step": 91
},
{
"epoch": 0.07590759075907591,
"grad_norm": 0.32374563813209534,
"learning_rate": 4.600000000000001e-06,
"loss": 0.8089,
"step": 92
},
{
"epoch": 0.07673267326732673,
"grad_norm": 0.32995370030403137,
"learning_rate": 4.65e-06,
"loss": 0.8281,
"step": 93
},
{
"epoch": 0.07755775577557755,
"grad_norm": 0.31566327810287476,
"learning_rate": 4.7e-06,
"loss": 0.8248,
"step": 94
},
{
"epoch": 0.07838283828382839,
"grad_norm": 0.32131826877593994,
"learning_rate": 4.75e-06,
"loss": 0.8183,
"step": 95
},
{
"epoch": 0.07920792079207921,
"grad_norm": 0.3297450840473175,
"learning_rate": 4.800000000000001e-06,
"loss": 0.8159,
"step": 96
},
{
"epoch": 0.08003300330033003,
"grad_norm": 0.33907413482666016,
"learning_rate": 4.85e-06,
"loss": 0.8167,
"step": 97
},
{
"epoch": 0.08085808580858085,
"grad_norm": 0.33130621910095215,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.8293,
"step": 98
},
{
"epoch": 0.08168316831683169,
"grad_norm": 0.3269996643066406,
"learning_rate": 4.95e-06,
"loss": 0.7956,
"step": 99
},
{
"epoch": 0.08250825082508251,
"grad_norm": 0.31901946663856506,
"learning_rate": 5e-06,
"loss": 0.7986,
"step": 100
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.3291037082672119,
"learning_rate": 4.999999760155817e-06,
"loss": 0.8103,
"step": 101
},
{
"epoch": 0.08415841584158416,
"grad_norm": 0.32120469212532043,
"learning_rate": 4.999999040623315e-06,
"loss": 0.8099,
"step": 102
},
{
"epoch": 0.08498349834983498,
"grad_norm": 0.3390505909919739,
"learning_rate": 4.999997841402631e-06,
"loss": 0.8195,
"step": 103
},
{
"epoch": 0.0858085808580858,
"grad_norm": 0.33092647790908813,
"learning_rate": 4.9999961624939945e-06,
"loss": 0.816,
"step": 104
},
{
"epoch": 0.08663366336633663,
"grad_norm": 0.32288476824760437,
"learning_rate": 4.999994003897729e-06,
"loss": 0.8034,
"step": 105
},
{
"epoch": 0.08745874587458746,
"grad_norm": 0.33893638849258423,
"learning_rate": 4.999991365614248e-06,
"loss": 0.8255,
"step": 106
},
{
"epoch": 0.08828382838283828,
"grad_norm": 0.3212301731109619,
"learning_rate": 4.999988247644058e-06,
"loss": 0.8053,
"step": 107
},
{
"epoch": 0.0891089108910891,
"grad_norm": 0.3236828148365021,
"learning_rate": 4.999984649987758e-06,
"loss": 0.7893,
"step": 108
},
{
"epoch": 0.08993399339933994,
"grad_norm": 0.3349727988243103,
"learning_rate": 4.999980572646038e-06,
"loss": 0.8239,
"step": 109
},
{
"epoch": 0.09075907590759076,
"grad_norm": 0.31924861669540405,
"learning_rate": 4.999976015619679e-06,
"loss": 0.792,
"step": 110
},
{
"epoch": 0.09158415841584158,
"grad_norm": 0.3242986798286438,
"learning_rate": 4.999970978909556e-06,
"loss": 0.7889,
"step": 111
},
{
"epoch": 0.0924092409240924,
"grad_norm": 0.32954731583595276,
"learning_rate": 4.999965462516636e-06,
"loss": 0.7983,
"step": 112
},
{
"epoch": 0.09323432343234324,
"grad_norm": 0.43535739183425903,
"learning_rate": 4.999959466441976e-06,
"loss": 0.7884,
"step": 113
},
{
"epoch": 0.09405940594059406,
"grad_norm": 0.3296893537044525,
"learning_rate": 4.999952990686729e-06,
"loss": 0.8129,
"step": 114
},
{
"epoch": 0.09488448844884488,
"grad_norm": 0.33009904623031616,
"learning_rate": 4.999946035252136e-06,
"loss": 0.8134,
"step": 115
},
{
"epoch": 0.09570957095709572,
"grad_norm": 0.33187994360923767,
"learning_rate": 4.999938600139531e-06,
"loss": 0.7787,
"step": 116
},
{
"epoch": 0.09653465346534654,
"grad_norm": 0.33838731050491333,
"learning_rate": 4.999930685350342e-06,
"loss": 0.8065,
"step": 117
},
{
"epoch": 0.09735973597359736,
"grad_norm": 0.3391963541507721,
"learning_rate": 4.999922290886087e-06,
"loss": 0.7982,
"step": 118
},
{
"epoch": 0.09818481848184818,
"grad_norm": 0.33598625659942627,
"learning_rate": 4.999913416748376e-06,
"loss": 0.8136,
"step": 119
},
{
"epoch": 0.09900990099009901,
"grad_norm": 0.33401018381118774,
"learning_rate": 4.999904062938913e-06,
"loss": 0.7953,
"step": 120
},
{
"epoch": 0.09983498349834984,
"grad_norm": 0.3376487195491791,
"learning_rate": 4.999894229459492e-06,
"loss": 0.8085,
"step": 121
},
{
"epoch": 0.10066006600660066,
"grad_norm": 0.33256658911705017,
"learning_rate": 4.999883916312e-06,
"loss": 0.8121,
"step": 122
},
{
"epoch": 0.10148514851485149,
"grad_norm": 0.3282712399959564,
"learning_rate": 4.999873123498416e-06,
"loss": 0.8101,
"step": 123
},
{
"epoch": 0.10231023102310231,
"grad_norm": 0.3370251953601837,
"learning_rate": 4.999861851020811e-06,
"loss": 0.8091,
"step": 124
},
{
"epoch": 0.10313531353135313,
"grad_norm": 0.34541329741477966,
"learning_rate": 4.999850098881347e-06,
"loss": 0.8045,
"step": 125
},
{
"epoch": 0.10396039603960396,
"grad_norm": 0.3403375446796417,
"learning_rate": 4.99983786708228e-06,
"loss": 0.8307,
"step": 126
},
{
"epoch": 0.10478547854785479,
"grad_norm": 0.3415558338165283,
"learning_rate": 4.9998251556259555e-06,
"loss": 0.8012,
"step": 127
},
{
"epoch": 0.10561056105610561,
"grad_norm": 0.33992183208465576,
"learning_rate": 4.9998119645148145e-06,
"loss": 0.8036,
"step": 128
},
{
"epoch": 0.10643564356435643,
"grad_norm": 0.3449820280075073,
"learning_rate": 4.999798293751387e-06,
"loss": 0.7627,
"step": 129
},
{
"epoch": 0.10726072607260725,
"grad_norm": 0.3471784293651581,
"learning_rate": 4.999784143338296e-06,
"loss": 0.7936,
"step": 130
},
{
"epoch": 0.10808580858085809,
"grad_norm": 0.37417036294937134,
"learning_rate": 4.999769513278258e-06,
"loss": 0.767,
"step": 131
},
{
"epoch": 0.10891089108910891,
"grad_norm": 0.3380601108074188,
"learning_rate": 4.999754403574077e-06,
"loss": 0.7926,
"step": 132
},
{
"epoch": 0.10973597359735973,
"grad_norm": 0.36881914734840393,
"learning_rate": 4.999738814228655e-06,
"loss": 0.7982,
"step": 133
},
{
"epoch": 0.11056105610561057,
"grad_norm": 0.3481132388114929,
"learning_rate": 4.999722745244983e-06,
"loss": 0.8036,
"step": 134
},
{
"epoch": 0.11138613861386139,
"grad_norm": 0.3401270806789398,
"learning_rate": 4.999706196626143e-06,
"loss": 0.7879,
"step": 135
},
{
"epoch": 0.11221122112211221,
"grad_norm": 0.3421446979045868,
"learning_rate": 4.99968916837531e-06,
"loss": 0.7994,
"step": 136
},
{
"epoch": 0.11303630363036303,
"grad_norm": 0.3525069057941437,
"learning_rate": 4.999671660495754e-06,
"loss": 0.799,
"step": 137
},
{
"epoch": 0.11386138613861387,
"grad_norm": 0.33705243468284607,
"learning_rate": 4.999653672990831e-06,
"loss": 0.797,
"step": 138
},
{
"epoch": 0.11468646864686469,
"grad_norm": 0.34886419773101807,
"learning_rate": 4.999635205863994e-06,
"loss": 0.7911,
"step": 139
},
{
"epoch": 0.11551155115511551,
"grad_norm": 0.3489723205566406,
"learning_rate": 4.999616259118787e-06,
"loss": 0.785,
"step": 140
},
{
"epoch": 0.11633663366336634,
"grad_norm": 0.3380016088485718,
"learning_rate": 4.999596832758844e-06,
"loss": 0.7946,
"step": 141
},
{
"epoch": 0.11716171617161716,
"grad_norm": 0.3427796959877014,
"learning_rate": 4.999576926787893e-06,
"loss": 0.8117,
"step": 142
},
{
"epoch": 0.11798679867986799,
"grad_norm": 0.35554543137550354,
"learning_rate": 4.9995565412097535e-06,
"loss": 0.7546,
"step": 143
},
{
"epoch": 0.1188118811881188,
"grad_norm": 0.3470841646194458,
"learning_rate": 4.999535676028338e-06,
"loss": 0.7796,
"step": 144
},
{
"epoch": 0.11963696369636964,
"grad_norm": 0.3392399251461029,
"learning_rate": 4.999514331247648e-06,
"loss": 0.7639,
"step": 145
},
{
"epoch": 0.12046204620462046,
"grad_norm": 0.33807244896888733,
"learning_rate": 4.999492506871781e-06,
"loss": 0.7789,
"step": 146
},
{
"epoch": 0.12128712871287128,
"grad_norm": 0.3525167405605316,
"learning_rate": 4.999470202904923e-06,
"loss": 0.7929,
"step": 147
},
{
"epoch": 0.12211221122112212,
"grad_norm": 0.35303160548210144,
"learning_rate": 4.9994474193513545e-06,
"loss": 0.8019,
"step": 148
},
{
"epoch": 0.12293729372937294,
"grad_norm": 0.3498503267765045,
"learning_rate": 4.999424156215446e-06,
"loss": 0.7801,
"step": 149
},
{
"epoch": 0.12376237623762376,
"grad_norm": 0.348417192697525,
"learning_rate": 4.9994004135016625e-06,
"loss": 0.7744,
"step": 150
},
{
"epoch": 0.12458745874587458,
"grad_norm": 0.35064205527305603,
"learning_rate": 4.999376191214559e-06,
"loss": 0.756,
"step": 151
},
{
"epoch": 0.1254125412541254,
"grad_norm": 0.3553641736507416,
"learning_rate": 4.999351489358783e-06,
"loss": 0.7864,
"step": 152
},
{
"epoch": 0.12623762376237624,
"grad_norm": 0.35194501280784607,
"learning_rate": 4.999326307939076e-06,
"loss": 0.7616,
"step": 153
},
{
"epoch": 0.12706270627062707,
"grad_norm": 0.34095126390457153,
"learning_rate": 4.999300646960267e-06,
"loss": 0.7951,
"step": 154
},
{
"epoch": 0.12788778877887788,
"grad_norm": 0.33836010098457336,
"learning_rate": 4.999274506427281e-06,
"loss": 0.8008,
"step": 155
},
{
"epoch": 0.12871287128712872,
"grad_norm": 0.343062162399292,
"learning_rate": 4.9992478863451335e-06,
"loss": 0.7906,
"step": 156
},
{
"epoch": 0.12953795379537955,
"grad_norm": 0.3474920988082886,
"learning_rate": 4.999220786718932e-06,
"loss": 0.7654,
"step": 157
},
{
"epoch": 0.13036303630363036,
"grad_norm": 0.36109790205955505,
"learning_rate": 4.9991932075538765e-06,
"loss": 0.7615,
"step": 158
},
{
"epoch": 0.1311881188118812,
"grad_norm": 0.3488277792930603,
"learning_rate": 4.99916514885526e-06,
"loss": 0.7799,
"step": 159
},
{
"epoch": 0.132013201320132,
"grad_norm": 0.36712321639060974,
"learning_rate": 4.9991366106284635e-06,
"loss": 0.8018,
"step": 160
},
{
"epoch": 0.13283828382838284,
"grad_norm": 0.34038668870925903,
"learning_rate": 4.999107592878964e-06,
"loss": 0.7605,
"step": 161
},
{
"epoch": 0.13366336633663367,
"grad_norm": 0.35164085030555725,
"learning_rate": 4.999078095612332e-06,
"loss": 0.7894,
"step": 162
},
{
"epoch": 0.13448844884488448,
"grad_norm": 0.35475778579711914,
"learning_rate": 4.9990481188342234e-06,
"loss": 0.7915,
"step": 163
},
{
"epoch": 0.1353135313531353,
"grad_norm": 0.3551616072654724,
"learning_rate": 4.999017662550392e-06,
"loss": 0.7929,
"step": 164
},
{
"epoch": 0.13613861386138615,
"grad_norm": 0.34733644127845764,
"learning_rate": 4.99898672676668e-06,
"loss": 0.7892,
"step": 165
},
{
"epoch": 0.13696369636963696,
"grad_norm": 0.35961049795150757,
"learning_rate": 4.998955311489025e-06,
"loss": 0.7567,
"step": 166
},
{
"epoch": 0.1377887788778878,
"grad_norm": 0.3517046570777893,
"learning_rate": 4.998923416723456e-06,
"loss": 0.755,
"step": 167
},
{
"epoch": 0.13861386138613863,
"grad_norm": 0.35234466195106506,
"learning_rate": 4.998891042476089e-06,
"loss": 0.7761,
"step": 168
},
{
"epoch": 0.13943894389438943,
"grad_norm": 0.3564783036708832,
"learning_rate": 4.9988581887531386e-06,
"loss": 0.7763,
"step": 169
},
{
"epoch": 0.14026402640264027,
"grad_norm": 0.3533042371273041,
"learning_rate": 4.998824855560907e-06,
"loss": 0.7615,
"step": 170
},
{
"epoch": 0.14108910891089108,
"grad_norm": 0.3700357973575592,
"learning_rate": 4.998791042905791e-06,
"loss": 0.7607,
"step": 171
},
{
"epoch": 0.1419141914191419,
"grad_norm": 0.35690873861312866,
"learning_rate": 4.99875675079428e-06,
"loss": 0.7713,
"step": 172
},
{
"epoch": 0.14273927392739275,
"grad_norm": 0.36469247937202454,
"learning_rate": 4.9987219792329505e-06,
"loss": 0.7696,
"step": 173
},
{
"epoch": 0.14356435643564355,
"grad_norm": 0.37469345331192017,
"learning_rate": 4.998686728228476e-06,
"loss": 0.7739,
"step": 174
},
{
"epoch": 0.1443894389438944,
"grad_norm": 0.36179906129837036,
"learning_rate": 4.9986509977876205e-06,
"loss": 0.7708,
"step": 175
},
{
"epoch": 0.14521452145214522,
"grad_norm": 0.37290266156196594,
"learning_rate": 4.9986147879172395e-06,
"loss": 0.7663,
"step": 176
},
{
"epoch": 0.14603960396039603,
"grad_norm": 0.36196717619895935,
"learning_rate": 4.998578098624282e-06,
"loss": 0.7517,
"step": 177
},
{
"epoch": 0.14686468646864687,
"grad_norm": 0.38069623708724976,
"learning_rate": 4.998540929915784e-06,
"loss": 0.7966,
"step": 178
},
{
"epoch": 0.1476897689768977,
"grad_norm": 0.3567394018173218,
"learning_rate": 4.998503281798882e-06,
"loss": 0.7781,
"step": 179
},
{
"epoch": 0.1485148514851485,
"grad_norm": 0.3728371560573578,
"learning_rate": 4.998465154280796e-06,
"loss": 0.7537,
"step": 180
},
{
"epoch": 0.14933993399339934,
"grad_norm": 0.3626164197921753,
"learning_rate": 4.998426547368844e-06,
"loss": 0.7811,
"step": 181
},
{
"epoch": 0.15016501650165018,
"grad_norm": 0.3611028492450714,
"learning_rate": 4.998387461070433e-06,
"loss": 0.7635,
"step": 182
},
{
"epoch": 0.15099009900990099,
"grad_norm": 0.37311699986457825,
"learning_rate": 4.998347895393063e-06,
"loss": 0.7513,
"step": 183
},
{
"epoch": 0.15181518151815182,
"grad_norm": 0.3596293032169342,
"learning_rate": 4.998307850344325e-06,
"loss": 0.7806,
"step": 184
},
{
"epoch": 0.15264026402640263,
"grad_norm": 0.3608078956604004,
"learning_rate": 4.998267325931903e-06,
"loss": 0.7766,
"step": 185
},
{
"epoch": 0.15346534653465346,
"grad_norm": 0.3593875467777252,
"learning_rate": 4.998226322163573e-06,
"loss": 0.7557,
"step": 186
},
{
"epoch": 0.1542904290429043,
"grad_norm": 0.3720031976699829,
"learning_rate": 4.998184839047202e-06,
"loss": 0.7779,
"step": 187
},
{
"epoch": 0.1551155115511551,
"grad_norm": 0.36318108439445496,
"learning_rate": 4.998142876590749e-06,
"loss": 0.7707,
"step": 188
},
{
"epoch": 0.15594059405940594,
"grad_norm": 0.3631824851036072,
"learning_rate": 4.998100434802267e-06,
"loss": 0.7808,
"step": 189
},
{
"epoch": 0.15676567656765678,
"grad_norm": 0.37768399715423584,
"learning_rate": 4.9980575136899e-06,
"loss": 0.7674,
"step": 190
},
{
"epoch": 0.15759075907590758,
"grad_norm": 0.3613969385623932,
"learning_rate": 4.998014113261882e-06,
"loss": 0.7673,
"step": 191
},
{
"epoch": 0.15841584158415842,
"grad_norm": 0.3773675262928009,
"learning_rate": 4.99797023352654e-06,
"loss": 0.7719,
"step": 192
},
{
"epoch": 0.15924092409240925,
"grad_norm": 0.36111781001091003,
"learning_rate": 4.997925874492295e-06,
"loss": 0.7558,
"step": 193
},
{
"epoch": 0.16006600660066006,
"grad_norm": 0.3750019967556,
"learning_rate": 4.997881036167659e-06,
"loss": 0.778,
"step": 194
},
{
"epoch": 0.1608910891089109,
"grad_norm": 0.37131670117378235,
"learning_rate": 4.997835718561232e-06,
"loss": 0.7325,
"step": 195
},
{
"epoch": 0.1617161716171617,
"grad_norm": 0.37602895498275757,
"learning_rate": 4.9977899216817124e-06,
"loss": 0.7573,
"step": 196
},
{
"epoch": 0.16254125412541254,
"grad_norm": 0.37271130084991455,
"learning_rate": 4.9977436455378865e-06,
"loss": 0.8003,
"step": 197
},
{
"epoch": 0.16336633663366337,
"grad_norm": 0.37385690212249756,
"learning_rate": 4.997696890138635e-06,
"loss": 0.757,
"step": 198
},
{
"epoch": 0.16419141914191418,
"grad_norm": 0.3819064199924469,
"learning_rate": 4.997649655492925e-06,
"loss": 0.7425,
"step": 199
},
{
"epoch": 0.16501650165016502,
"grad_norm": 0.3783811032772064,
"learning_rate": 4.997601941609824e-06,
"loss": 0.7637,
"step": 200
},
{
"epoch": 0.16584158415841585,
"grad_norm": 0.353215754032135,
"learning_rate": 4.997553748498486e-06,
"loss": 0.7579,
"step": 201
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.36633336544036865,
"learning_rate": 4.9975050761681574e-06,
"loss": 0.7514,
"step": 202
},
{
"epoch": 0.1674917491749175,
"grad_norm": 0.36640000343322754,
"learning_rate": 4.997455924628176e-06,
"loss": 0.7634,
"step": 203
},
{
"epoch": 0.16831683168316833,
"grad_norm": 0.36854660511016846,
"learning_rate": 4.997406293887976e-06,
"loss": 0.7585,
"step": 204
},
{
"epoch": 0.16914191419141913,
"grad_norm": 0.36816444993019104,
"learning_rate": 4.9973561839570775e-06,
"loss": 0.7268,
"step": 205
},
{
"epoch": 0.16996699669966997,
"grad_norm": 0.37374815344810486,
"learning_rate": 4.997305594845097e-06,
"loss": 0.7654,
"step": 206
},
{
"epoch": 0.1707920792079208,
"grad_norm": 0.37258732318878174,
"learning_rate": 4.997254526561739e-06,
"loss": 0.7441,
"step": 207
},
{
"epoch": 0.1716171617161716,
"grad_norm": 0.3762890696525574,
"learning_rate": 4.997202979116805e-06,
"loss": 0.7766,
"step": 208
},
{
"epoch": 0.17244224422442245,
"grad_norm": 0.3801199197769165,
"learning_rate": 4.997150952520185e-06,
"loss": 0.7552,
"step": 209
},
{
"epoch": 0.17326732673267325,
"grad_norm": 0.38364118337631226,
"learning_rate": 4.997098446781861e-06,
"loss": 0.76,
"step": 210
},
{
"epoch": 0.1740924092409241,
"grad_norm": 0.36506178975105286,
"learning_rate": 4.997045461911907e-06,
"loss": 0.755,
"step": 211
},
{
"epoch": 0.17491749174917492,
"grad_norm": 0.3866812586784363,
"learning_rate": 4.996991997920491e-06,
"loss": 0.757,
"step": 212
},
{
"epoch": 0.17574257425742573,
"grad_norm": 0.37725409865379333,
"learning_rate": 4.99693805481787e-06,
"loss": 0.7598,
"step": 213
},
{
"epoch": 0.17656765676567657,
"grad_norm": 0.3766557276248932,
"learning_rate": 4.996883632614396e-06,
"loss": 0.7658,
"step": 214
},
{
"epoch": 0.1773927392739274,
"grad_norm": 0.3847043812274933,
"learning_rate": 4.99682873132051e-06,
"loss": 0.7284,
"step": 215
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.3873996138572693,
"learning_rate": 4.996773350946747e-06,
"loss": 0.7336,
"step": 216
},
{
"epoch": 0.17904290429042904,
"grad_norm": 0.3642526865005493,
"learning_rate": 4.9967174915037305e-06,
"loss": 0.7498,
"step": 217
},
{
"epoch": 0.17986798679867988,
"grad_norm": 0.3840898871421814,
"learning_rate": 4.996661153002183e-06,
"loss": 0.7708,
"step": 218
},
{
"epoch": 0.1806930693069307,
"grad_norm": 0.3865383267402649,
"learning_rate": 4.996604335452911e-06,
"loss": 0.7667,
"step": 219
},
{
"epoch": 0.18151815181518152,
"grad_norm": 0.367243230342865,
"learning_rate": 4.996547038866817e-06,
"loss": 0.7565,
"step": 220
},
{
"epoch": 0.18234323432343233,
"grad_norm": 0.3867991864681244,
"learning_rate": 4.996489263254897e-06,
"loss": 0.7634,
"step": 221
},
{
"epoch": 0.18316831683168316,
"grad_norm": 0.3795239329338074,
"learning_rate": 4.996431008628234e-06,
"loss": 0.7624,
"step": 222
},
{
"epoch": 0.183993399339934,
"grad_norm": 0.3892177641391754,
"learning_rate": 4.996372274998007e-06,
"loss": 0.7672,
"step": 223
},
{
"epoch": 0.1848184818481848,
"grad_norm": 0.3779730498790741,
"learning_rate": 4.9963130623754855e-06,
"loss": 0.7367,
"step": 224
},
{
"epoch": 0.18564356435643564,
"grad_norm": 0.36853930354118347,
"learning_rate": 4.99625337077203e-06,
"loss": 0.7609,
"step": 225
},
{
"epoch": 0.18646864686468648,
"grad_norm": 0.37936636805534363,
"learning_rate": 4.996193200199094e-06,
"loss": 0.7647,
"step": 226
},
{
"epoch": 0.18729372937293728,
"grad_norm": 0.3741963505744934,
"learning_rate": 4.996132550668224e-06,
"loss": 0.7413,
"step": 227
},
{
"epoch": 0.18811881188118812,
"grad_norm": 0.3922561705112457,
"learning_rate": 4.996071422191057e-06,
"loss": 0.7381,
"step": 228
},
{
"epoch": 0.18894389438943895,
"grad_norm": 0.388773113489151,
"learning_rate": 4.996009814779321e-06,
"loss": 0.7642,
"step": 229
},
{
"epoch": 0.18976897689768976,
"grad_norm": 0.37353891134262085,
"learning_rate": 4.995947728444837e-06,
"loss": 0.7481,
"step": 230
},
{
"epoch": 0.1905940594059406,
"grad_norm": 0.3833358585834503,
"learning_rate": 4.995885163199519e-06,
"loss": 0.7479,
"step": 231
},
{
"epoch": 0.19141914191419143,
"grad_norm": 0.3960364758968353,
"learning_rate": 4.9958221190553705e-06,
"loss": 0.7751,
"step": 232
},
{
"epoch": 0.19224422442244224,
"grad_norm": 0.3736567795276642,
"learning_rate": 4.995758596024488e-06,
"loss": 0.7623,
"step": 233
},
{
"epoch": 0.19306930693069307,
"grad_norm": 0.37923943996429443,
"learning_rate": 4.9956945941190614e-06,
"loss": 0.765,
"step": 234
},
{
"epoch": 0.19389438943894388,
"grad_norm": 0.37835410237312317,
"learning_rate": 4.99563011335137e-06,
"loss": 0.7569,
"step": 235
},
{
"epoch": 0.19471947194719472,
"grad_norm": 0.37825924158096313,
"learning_rate": 4.9955651537337865e-06,
"loss": 0.7551,
"step": 236
},
{
"epoch": 0.19554455445544555,
"grad_norm": 0.38854920864105225,
"learning_rate": 4.995499715278774e-06,
"loss": 0.7527,
"step": 237
},
{
"epoch": 0.19636963696369636,
"grad_norm": 0.38994744420051575,
"learning_rate": 4.995433797998891e-06,
"loss": 0.7664,
"step": 238
},
{
"epoch": 0.1971947194719472,
"grad_norm": 0.3996884524822235,
"learning_rate": 4.995367401906783e-06,
"loss": 0.7932,
"step": 239
},
{
"epoch": 0.19801980198019803,
"grad_norm": 0.3820636570453644,
"learning_rate": 4.99530052701519e-06,
"loss": 0.7575,
"step": 240
},
{
"epoch": 0.19884488448844884,
"grad_norm": 0.3814580738544464,
"learning_rate": 4.9952331733369455e-06,
"loss": 0.7476,
"step": 241
},
{
"epoch": 0.19966996699669967,
"grad_norm": 0.4007291793823242,
"learning_rate": 4.995165340884971e-06,
"loss": 0.7671,
"step": 242
},
{
"epoch": 0.2004950495049505,
"grad_norm": 0.38576772809028625,
"learning_rate": 4.995097029672282e-06,
"loss": 0.7705,
"step": 243
},
{
"epoch": 0.20132013201320131,
"grad_norm": 0.3890525698661804,
"learning_rate": 4.995028239711987e-06,
"loss": 0.739,
"step": 244
},
{
"epoch": 0.20214521452145215,
"grad_norm": 0.3844315707683563,
"learning_rate": 4.994958971017285e-06,
"loss": 0.7464,
"step": 245
},
{
"epoch": 0.20297029702970298,
"grad_norm": 0.3911251425743103,
"learning_rate": 4.994889223601466e-06,
"loss": 0.7628,
"step": 246
},
{
"epoch": 0.2037953795379538,
"grad_norm": 0.40388089418411255,
"learning_rate": 4.994818997477912e-06,
"loss": 0.7437,
"step": 247
},
{
"epoch": 0.20462046204620463,
"grad_norm": 0.3855500817298889,
"learning_rate": 4.994748292660101e-06,
"loss": 0.7687,
"step": 248
},
{
"epoch": 0.20544554455445543,
"grad_norm": 0.3879557251930237,
"learning_rate": 4.994677109161597e-06,
"loss": 0.7712,
"step": 249
},
{
"epoch": 0.20627062706270627,
"grad_norm": 0.40615251660346985,
"learning_rate": 4.9946054469960574e-06,
"loss": 0.7495,
"step": 250
},
{
"epoch": 0.2070957095709571,
"grad_norm": 0.3867475986480713,
"learning_rate": 4.9945333061772346e-06,
"loss": 0.7467,
"step": 251
},
{
"epoch": 0.2079207920792079,
"grad_norm": 0.37825512886047363,
"learning_rate": 4.99446068671897e-06,
"loss": 0.7413,
"step": 252
},
{
"epoch": 0.20874587458745875,
"grad_norm": 0.3996836543083191,
"learning_rate": 4.994387588635197e-06,
"loss": 0.7635,
"step": 253
},
{
"epoch": 0.20957095709570958,
"grad_norm": 0.38707849383354187,
"learning_rate": 4.994314011939941e-06,
"loss": 0.7418,
"step": 254
},
{
"epoch": 0.2103960396039604,
"grad_norm": 0.3831859827041626,
"learning_rate": 4.994239956647321e-06,
"loss": 0.7593,
"step": 255
},
{
"epoch": 0.21122112211221122,
"grad_norm": 0.3887327015399933,
"learning_rate": 4.994165422771545e-06,
"loss": 0.777,
"step": 256
},
{
"epoch": 0.21204620462046206,
"grad_norm": 0.39478468894958496,
"learning_rate": 4.994090410326916e-06,
"loss": 0.7535,
"step": 257
},
{
"epoch": 0.21287128712871287,
"grad_norm": 0.3960399627685547,
"learning_rate": 4.994014919327824e-06,
"loss": 0.7519,
"step": 258
},
{
"epoch": 0.2136963696369637,
"grad_norm": 0.40766602754592896,
"learning_rate": 4.9939389497887565e-06,
"loss": 0.7339,
"step": 259
},
{
"epoch": 0.2145214521452145,
"grad_norm": 0.397185742855072,
"learning_rate": 4.993862501724289e-06,
"loss": 0.7823,
"step": 260
},
{
"epoch": 0.21534653465346534,
"grad_norm": 0.41641128063201904,
"learning_rate": 4.993785575149092e-06,
"loss": 0.7289,
"step": 261
},
{
"epoch": 0.21617161716171618,
"grad_norm": 0.39782091975212097,
"learning_rate": 4.993708170077922e-06,
"loss": 0.7472,
"step": 262
},
{
"epoch": 0.21699669966996699,
"grad_norm": 0.39305055141448975,
"learning_rate": 4.993630286525634e-06,
"loss": 0.7659,
"step": 263
},
{
"epoch": 0.21782178217821782,
"grad_norm": 0.39734795689582825,
"learning_rate": 4.993551924507172e-06,
"loss": 0.7543,
"step": 264
},
{
"epoch": 0.21864686468646866,
"grad_norm": 0.43676817417144775,
"learning_rate": 4.99347308403757e-06,
"loss": 0.75,
"step": 265
},
{
"epoch": 0.21947194719471946,
"grad_norm": 0.3790392279624939,
"learning_rate": 4.993393765131956e-06,
"loss": 0.7481,
"step": 266
},
{
"epoch": 0.2202970297029703,
"grad_norm": 0.3992585241794586,
"learning_rate": 4.993313967805551e-06,
"loss": 0.7467,
"step": 267
},
{
"epoch": 0.22112211221122113,
"grad_norm": 0.40704596042633057,
"learning_rate": 4.9932336920736645e-06,
"loss": 0.761,
"step": 268
},
{
"epoch": 0.22194719471947194,
"grad_norm": 0.394362211227417,
"learning_rate": 4.9931529379517006e-06,
"loss": 0.7424,
"step": 269
},
{
"epoch": 0.22277227722772278,
"grad_norm": 0.4009395241737366,
"learning_rate": 4.993071705455152e-06,
"loss": 0.7471,
"step": 270
},
{
"epoch": 0.2235973597359736,
"grad_norm": 0.4041494131088257,
"learning_rate": 4.992989994599607e-06,
"loss": 0.7309,
"step": 271
},
{
"epoch": 0.22442244224422442,
"grad_norm": 0.39666467905044556,
"learning_rate": 4.992907805400744e-06,
"loss": 0.7448,
"step": 272
},
{
"epoch": 0.22524752475247525,
"grad_norm": 0.39431697130203247,
"learning_rate": 4.992825137874332e-06,
"loss": 0.7679,
"step": 273
},
{
"epoch": 0.22607260726072606,
"grad_norm": 0.39886102080345154,
"learning_rate": 4.992741992036234e-06,
"loss": 0.7564,
"step": 274
},
{
"epoch": 0.2268976897689769,
"grad_norm": 0.38774392008781433,
"learning_rate": 4.992658367902402e-06,
"loss": 0.7545,
"step": 275
},
{
"epoch": 0.22772277227722773,
"grad_norm": 0.40633663535118103,
"learning_rate": 4.992574265488883e-06,
"loss": 0.7531,
"step": 276
},
{
"epoch": 0.22854785478547854,
"grad_norm": 0.4006880819797516,
"learning_rate": 4.9924896848118145e-06,
"loss": 0.7205,
"step": 277
},
{
"epoch": 0.22937293729372937,
"grad_norm": 0.40613582730293274,
"learning_rate": 4.992404625887423e-06,
"loss": 0.756,
"step": 278
},
{
"epoch": 0.2301980198019802,
"grad_norm": 0.41099947690963745,
"learning_rate": 4.9923190887320315e-06,
"loss": 0.7728,
"step": 279
},
{
"epoch": 0.23102310231023102,
"grad_norm": 0.41351890563964844,
"learning_rate": 4.992233073362052e-06,
"loss": 0.7472,
"step": 280
},
{
"epoch": 0.23184818481848185,
"grad_norm": 0.3932720124721527,
"learning_rate": 4.992146579793988e-06,
"loss": 0.7465,
"step": 281
},
{
"epoch": 0.23267326732673269,
"grad_norm": 0.4054405987262726,
"learning_rate": 4.992059608044436e-06,
"loss": 0.7425,
"step": 282
},
{
"epoch": 0.2334983498349835,
"grad_norm": 0.4057449400424957,
"learning_rate": 4.991972158130084e-06,
"loss": 0.7665,
"step": 283
},
{
"epoch": 0.23432343234323433,
"grad_norm": 0.4057076573371887,
"learning_rate": 4.99188423006771e-06,
"loss": 0.7388,
"step": 284
},
{
"epoch": 0.23514851485148514,
"grad_norm": 0.4172205328941345,
"learning_rate": 4.991795823874188e-06,
"loss": 0.7421,
"step": 285
},
{
"epoch": 0.23597359735973597,
"grad_norm": 0.42152348160743713,
"learning_rate": 4.9917069395664786e-06,
"loss": 0.7247,
"step": 286
},
{
"epoch": 0.2367986798679868,
"grad_norm": 0.3970670998096466,
"learning_rate": 4.991617577161638e-06,
"loss": 0.736,
"step": 287
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.41120800375938416,
"learning_rate": 4.991527736676811e-06,
"loss": 0.7373,
"step": 288
},
{
"epoch": 0.23844884488448845,
"grad_norm": 0.4015986919403076,
"learning_rate": 4.991437418129237e-06,
"loss": 0.7249,
"step": 289
},
{
"epoch": 0.23927392739273928,
"grad_norm": 0.4287201762199402,
"learning_rate": 4.991346621536245e-06,
"loss": 0.7792,
"step": 290
},
{
"epoch": 0.2400990099009901,
"grad_norm": 0.40472856163978577,
"learning_rate": 4.991255346915258e-06,
"loss": 0.7365,
"step": 291
},
{
"epoch": 0.24092409240924093,
"grad_norm": 0.4001949727535248,
"learning_rate": 4.991163594283789e-06,
"loss": 0.7265,
"step": 292
},
{
"epoch": 0.24174917491749176,
"grad_norm": 0.3954242467880249,
"learning_rate": 4.991071363659442e-06,
"loss": 0.7257,
"step": 293
},
{
"epoch": 0.24257425742574257,
"grad_norm": 0.3959173560142517,
"learning_rate": 4.990978655059914e-06,
"loss": 0.749,
"step": 294
},
{
"epoch": 0.2433993399339934,
"grad_norm": 0.4046264588832855,
"learning_rate": 4.990885468502995e-06,
"loss": 0.739,
"step": 295
},
{
"epoch": 0.24422442244224424,
"grad_norm": 0.3978911340236664,
"learning_rate": 4.990791804006563e-06,
"loss": 0.7098,
"step": 296
},
{
"epoch": 0.24504950495049505,
"grad_norm": 0.3915862441062927,
"learning_rate": 4.9906976615885916e-06,
"loss": 0.7323,
"step": 297
},
{
"epoch": 0.24587458745874588,
"grad_norm": 0.39746829867362976,
"learning_rate": 4.990603041267144e-06,
"loss": 0.741,
"step": 298
},
{
"epoch": 0.2466996699669967,
"grad_norm": 0.4023366868495941,
"learning_rate": 4.990507943060374e-06,
"loss": 0.7712,
"step": 299
},
{
"epoch": 0.24752475247524752,
"grad_norm": 0.4111311137676239,
"learning_rate": 4.9904123669865315e-06,
"loss": 0.7451,
"step": 300
},
{
"epoch": 0.24834983498349836,
"grad_norm": 0.4031809866428375,
"learning_rate": 4.990316313063953e-06,
"loss": 0.732,
"step": 301
},
{
"epoch": 0.24917491749174916,
"grad_norm": 0.4134192168712616,
"learning_rate": 4.99021978131107e-06,
"loss": 0.7404,
"step": 302
},
{
"epoch": 0.25,
"grad_norm": 0.39537039399147034,
"learning_rate": 4.990122771746403e-06,
"loss": 0.7175,
"step": 303
},
{
"epoch": 0.2508250825082508,
"grad_norm": 0.42326098680496216,
"learning_rate": 4.990025284388567e-06,
"loss": 0.7663,
"step": 304
},
{
"epoch": 0.25165016501650167,
"grad_norm": 0.42979490756988525,
"learning_rate": 4.989927319256269e-06,
"loss": 0.7227,
"step": 305
},
{
"epoch": 0.2524752475247525,
"grad_norm": 0.420631468296051,
"learning_rate": 4.989828876368303e-06,
"loss": 0.7506,
"step": 306
},
{
"epoch": 0.2533003300330033,
"grad_norm": 0.4130260944366455,
"learning_rate": 4.989729955743559e-06,
"loss": 0.7324,
"step": 307
},
{
"epoch": 0.25412541254125415,
"grad_norm": 0.39602625370025635,
"learning_rate": 4.989630557401018e-06,
"loss": 0.7482,
"step": 308
},
{
"epoch": 0.25495049504950495,
"grad_norm": 0.41538646817207336,
"learning_rate": 4.989530681359751e-06,
"loss": 0.7193,
"step": 309
},
{
"epoch": 0.25577557755775576,
"grad_norm": 0.40971875190734863,
"learning_rate": 4.989430327638923e-06,
"loss": 0.7497,
"step": 310
},
{
"epoch": 0.2566006600660066,
"grad_norm": 0.4176967442035675,
"learning_rate": 4.989329496257789e-06,
"loss": 0.7371,
"step": 311
},
{
"epoch": 0.25742574257425743,
"grad_norm": 0.4094579517841339,
"learning_rate": 4.989228187235695e-06,
"loss": 0.7436,
"step": 312
},
{
"epoch": 0.25825082508250824,
"grad_norm": 0.4166909158229828,
"learning_rate": 4.9891264005920805e-06,
"loss": 0.7224,
"step": 313
},
{
"epoch": 0.2590759075907591,
"grad_norm": 0.4074993431568146,
"learning_rate": 4.989024136346477e-06,
"loss": 0.7467,
"step": 314
},
{
"epoch": 0.2599009900990099,
"grad_norm": 0.4112595319747925,
"learning_rate": 4.988921394518504e-06,
"loss": 0.761,
"step": 315
},
{
"epoch": 0.2607260726072607,
"grad_norm": 0.4261530339717865,
"learning_rate": 4.9888181751278765e-06,
"loss": 0.7578,
"step": 316
},
{
"epoch": 0.2615511551155115,
"grad_norm": 0.4100978672504425,
"learning_rate": 4.9887144781944e-06,
"loss": 0.7663,
"step": 317
},
{
"epoch": 0.2623762376237624,
"grad_norm": 0.42287471890449524,
"learning_rate": 4.988610303737972e-06,
"loss": 0.7569,
"step": 318
},
{
"epoch": 0.2632013201320132,
"grad_norm": 0.41293367743492126,
"learning_rate": 4.98850565177858e-06,
"loss": 0.7413,
"step": 319
},
{
"epoch": 0.264026402640264,
"grad_norm": 0.40984824299812317,
"learning_rate": 4.988400522336304e-06,
"loss": 0.7402,
"step": 320
},
{
"epoch": 0.26485148514851486,
"grad_norm": 0.41604360938072205,
"learning_rate": 4.9882949154313156e-06,
"loss": 0.7368,
"step": 321
},
{
"epoch": 0.26567656765676567,
"grad_norm": 0.4161511957645416,
"learning_rate": 4.988188831083879e-06,
"loss": 0.7365,
"step": 322
},
{
"epoch": 0.2665016501650165,
"grad_norm": 0.4176185131072998,
"learning_rate": 4.988082269314348e-06,
"loss": 0.7454,
"step": 323
},
{
"epoch": 0.26732673267326734,
"grad_norm": 0.40819647908210754,
"learning_rate": 4.987975230143171e-06,
"loss": 0.7155,
"step": 324
},
{
"epoch": 0.26815181518151815,
"grad_norm": 0.41623732447624207,
"learning_rate": 4.9878677135908845e-06,
"loss": 0.7423,
"step": 325
},
{
"epoch": 0.26897689768976896,
"grad_norm": 0.4013398587703705,
"learning_rate": 4.987759719678119e-06,
"loss": 0.7312,
"step": 326
},
{
"epoch": 0.2698019801980198,
"grad_norm": 0.41542699933052063,
"learning_rate": 4.987651248425596e-06,
"loss": 0.7202,
"step": 327
},
{
"epoch": 0.2706270627062706,
"grad_norm": 0.4313049018383026,
"learning_rate": 4.987542299854128e-06,
"loss": 0.753,
"step": 328
},
{
"epoch": 0.27145214521452143,
"grad_norm": 0.41977638006210327,
"learning_rate": 4.98743287398462e-06,
"loss": 0.7394,
"step": 329
},
{
"epoch": 0.2722772277227723,
"grad_norm": 0.4171437919139862,
"learning_rate": 4.987322970838068e-06,
"loss": 0.7304,
"step": 330
},
{
"epoch": 0.2731023102310231,
"grad_norm": 0.4311671853065491,
"learning_rate": 4.987212590435559e-06,
"loss": 0.732,
"step": 331
},
{
"epoch": 0.2739273927392739,
"grad_norm": 0.4147193729877472,
"learning_rate": 4.987101732798273e-06,
"loss": 0.7396,
"step": 332
},
{
"epoch": 0.2747524752475248,
"grad_norm": 0.4111470878124237,
"learning_rate": 4.986990397947481e-06,
"loss": 0.7449,
"step": 333
},
{
"epoch": 0.2755775577557756,
"grad_norm": 0.42444244027137756,
"learning_rate": 4.986878585904546e-06,
"loss": 0.7163,
"step": 334
},
{
"epoch": 0.2764026402640264,
"grad_norm": 0.41067928075790405,
"learning_rate": 4.986766296690919e-06,
"loss": 0.7316,
"step": 335
},
{
"epoch": 0.27722772277227725,
"grad_norm": 0.42234039306640625,
"learning_rate": 4.986653530328149e-06,
"loss": 0.7039,
"step": 336
},
{
"epoch": 0.27805280528052806,
"grad_norm": 0.4271155595779419,
"learning_rate": 4.986540286837871e-06,
"loss": 0.7461,
"step": 337
},
{
"epoch": 0.27887788778877887,
"grad_norm": 0.41107016801834106,
"learning_rate": 4.9864265662418155e-06,
"loss": 0.7162,
"step": 338
},
{
"epoch": 0.27970297029702973,
"grad_norm": 0.4205438792705536,
"learning_rate": 4.9863123685618005e-06,
"loss": 0.7067,
"step": 339
},
{
"epoch": 0.28052805280528054,
"grad_norm": 0.408677339553833,
"learning_rate": 4.986197693819739e-06,
"loss": 0.7201,
"step": 340
},
{
"epoch": 0.28135313531353134,
"grad_norm": 0.44216471910476685,
"learning_rate": 4.9860825420376345e-06,
"loss": 0.7355,
"step": 341
},
{
"epoch": 0.28217821782178215,
"grad_norm": 0.4323996305465698,
"learning_rate": 4.985966913237581e-06,
"loss": 0.6978,
"step": 342
},
{
"epoch": 0.283003300330033,
"grad_norm": 0.43284088373184204,
"learning_rate": 4.985850807441764e-06,
"loss": 0.7424,
"step": 343
},
{
"epoch": 0.2838283828382838,
"grad_norm": 0.4161660075187683,
"learning_rate": 4.985734224672464e-06,
"loss": 0.746,
"step": 344
},
{
"epoch": 0.28465346534653463,
"grad_norm": 0.40742814540863037,
"learning_rate": 4.985617164952048e-06,
"loss": 0.7378,
"step": 345
},
{
"epoch": 0.2854785478547855,
"grad_norm": 0.4200558662414551,
"learning_rate": 4.985499628302978e-06,
"loss": 0.7339,
"step": 346
},
{
"epoch": 0.2863036303630363,
"grad_norm": 0.42096641659736633,
"learning_rate": 4.985381614747807e-06,
"loss": 0.7441,
"step": 347
},
{
"epoch": 0.2871287128712871,
"grad_norm": 0.42450881004333496,
"learning_rate": 4.9852631243091755e-06,
"loss": 0.7372,
"step": 348
},
{
"epoch": 0.28795379537953797,
"grad_norm": 0.4473407566547394,
"learning_rate": 4.985144157009824e-06,
"loss": 0.7134,
"step": 349
},
{
"epoch": 0.2887788778877888,
"grad_norm": 0.4309209883213043,
"learning_rate": 4.985024712872575e-06,
"loss": 0.7484,
"step": 350
},
{
"epoch": 0.2896039603960396,
"grad_norm": 0.43831008672714233,
"learning_rate": 4.984904791920349e-06,
"loss": 0.7577,
"step": 351
},
{
"epoch": 0.29042904290429045,
"grad_norm": 0.4128909111022949,
"learning_rate": 4.984784394176155e-06,
"loss": 0.742,
"step": 352
},
{
"epoch": 0.29125412541254125,
"grad_norm": 0.4246348440647125,
"learning_rate": 4.984663519663097e-06,
"loss": 0.7224,
"step": 353
},
{
"epoch": 0.29207920792079206,
"grad_norm": 0.4162266254425049,
"learning_rate": 4.984542168404364e-06,
"loss": 0.7193,
"step": 354
},
{
"epoch": 0.2929042904290429,
"grad_norm": 0.42818185687065125,
"learning_rate": 4.984420340423242e-06,
"loss": 0.7171,
"step": 355
},
{
"epoch": 0.29372937293729373,
"grad_norm": 0.43851590156555176,
"learning_rate": 4.984298035743107e-06,
"loss": 0.734,
"step": 356
},
{
"epoch": 0.29455445544554454,
"grad_norm": 0.4138251841068268,
"learning_rate": 4.984175254387426e-06,
"loss": 0.7258,
"step": 357
},
{
"epoch": 0.2953795379537954,
"grad_norm": 0.42890465259552,
"learning_rate": 4.984051996379758e-06,
"loss": 0.725,
"step": 358
},
{
"epoch": 0.2962046204620462,
"grad_norm": 0.43275824189186096,
"learning_rate": 4.983928261743753e-06,
"loss": 0.7337,
"step": 359
},
{
"epoch": 0.297029702970297,
"grad_norm": 0.44491758942604065,
"learning_rate": 4.9838040505031525e-06,
"loss": 0.7519,
"step": 360
},
{
"epoch": 0.2978547854785479,
"grad_norm": 0.4430970251560211,
"learning_rate": 4.983679362681789e-06,
"loss": 0.7417,
"step": 361
},
{
"epoch": 0.2986798679867987,
"grad_norm": 0.43559929728507996,
"learning_rate": 4.9835541983035886e-06,
"loss": 0.7444,
"step": 362
},
{
"epoch": 0.2995049504950495,
"grad_norm": 0.43662071228027344,
"learning_rate": 4.9834285573925665e-06,
"loss": 0.7648,
"step": 363
},
{
"epoch": 0.30033003300330036,
"grad_norm": 0.43434590101242065,
"learning_rate": 4.9833024399728295e-06,
"loss": 0.7179,
"step": 364
},
{
"epoch": 0.30115511551155116,
"grad_norm": 0.41816094517707825,
"learning_rate": 4.9831758460685765e-06,
"loss": 0.7328,
"step": 365
},
{
"epoch": 0.30198019801980197,
"grad_norm": 0.4434167146682739,
"learning_rate": 4.983048775704098e-06,
"loss": 0.7213,
"step": 366
},
{
"epoch": 0.3028052805280528,
"grad_norm": 0.4452936351299286,
"learning_rate": 4.982921228903776e-06,
"loss": 0.7552,
"step": 367
},
{
"epoch": 0.30363036303630364,
"grad_norm": 0.4163966774940491,
"learning_rate": 4.982793205692083e-06,
"loss": 0.7167,
"step": 368
},
{
"epoch": 0.30445544554455445,
"grad_norm": 0.42152345180511475,
"learning_rate": 4.982664706093585e-06,
"loss": 0.7176,
"step": 369
},
{
"epoch": 0.30528052805280526,
"grad_norm": 0.44663283228874207,
"learning_rate": 4.982535730132936e-06,
"loss": 0.7474,
"step": 370
},
{
"epoch": 0.3061056105610561,
"grad_norm": 0.42691031098365784,
"learning_rate": 4.982406277834884e-06,
"loss": 0.7148,
"step": 371
},
{
"epoch": 0.3069306930693069,
"grad_norm": 0.437341570854187,
"learning_rate": 4.9822763492242674e-06,
"loss": 0.7374,
"step": 372
},
{
"epoch": 0.30775577557755773,
"grad_norm": 0.42526715993881226,
"learning_rate": 4.982145944326018e-06,
"loss": 0.7161,
"step": 373
},
{
"epoch": 0.3085808580858086,
"grad_norm": 0.4362766444683075,
"learning_rate": 4.9820150631651545e-06,
"loss": 0.7356,
"step": 374
},
{
"epoch": 0.3094059405940594,
"grad_norm": 0.42550137639045715,
"learning_rate": 4.981883705766792e-06,
"loss": 0.7474,
"step": 375
},
{
"epoch": 0.3102310231023102,
"grad_norm": 0.45245617628097534,
"learning_rate": 4.981751872156134e-06,
"loss": 0.6949,
"step": 376
},
{
"epoch": 0.3110561056105611,
"grad_norm": 0.433254599571228,
"learning_rate": 4.981619562358475e-06,
"loss": 0.7263,
"step": 377
},
{
"epoch": 0.3118811881188119,
"grad_norm": 0.4480639398097992,
"learning_rate": 4.981486776399204e-06,
"loss": 0.7289,
"step": 378
},
{
"epoch": 0.3127062706270627,
"grad_norm": 0.44228753447532654,
"learning_rate": 4.9813535143037985e-06,
"loss": 0.7316,
"step": 379
},
{
"epoch": 0.31353135313531355,
"grad_norm": 0.4304775893688202,
"learning_rate": 4.981219776097828e-06,
"loss": 0.7453,
"step": 380
},
{
"epoch": 0.31435643564356436,
"grad_norm": 0.42373213171958923,
"learning_rate": 4.981085561806953e-06,
"loss": 0.7361,
"step": 381
},
{
"epoch": 0.31518151815181517,
"grad_norm": 0.4551517963409424,
"learning_rate": 4.980950871456927e-06,
"loss": 0.7465,
"step": 382
},
{
"epoch": 0.31600660066006603,
"grad_norm": 0.4127417206764221,
"learning_rate": 4.980815705073594e-06,
"loss": 0.7183,
"step": 383
},
{
"epoch": 0.31683168316831684,
"grad_norm": 0.42258021235466003,
"learning_rate": 4.9806800626828885e-06,
"loss": 0.7249,
"step": 384
},
{
"epoch": 0.31765676567656764,
"grad_norm": 0.44680890440940857,
"learning_rate": 4.980543944310836e-06,
"loss": 0.7355,
"step": 385
},
{
"epoch": 0.3184818481848185,
"grad_norm": 0.45091512799263,
"learning_rate": 4.980407349983556e-06,
"loss": 0.7173,
"step": 386
},
{
"epoch": 0.3193069306930693,
"grad_norm": 0.43471530079841614,
"learning_rate": 4.980270279727256e-06,
"loss": 0.7381,
"step": 387
},
{
"epoch": 0.3201320132013201,
"grad_norm": 0.42503878474235535,
"learning_rate": 4.980132733568237e-06,
"loss": 0.7378,
"step": 388
},
{
"epoch": 0.320957095709571,
"grad_norm": 0.44271692633628845,
"learning_rate": 4.979994711532892e-06,
"loss": 0.7208,
"step": 389
},
{
"epoch": 0.3217821782178218,
"grad_norm": 0.4279749393463135,
"learning_rate": 4.979856213647702e-06,
"loss": 0.7319,
"step": 390
},
{
"epoch": 0.3226072607260726,
"grad_norm": 0.4179086983203888,
"learning_rate": 4.979717239939242e-06,
"loss": 0.7349,
"step": 391
},
{
"epoch": 0.3234323432343234,
"grad_norm": 0.4250434935092926,
"learning_rate": 4.979577790434179e-06,
"loss": 0.713,
"step": 392
},
{
"epoch": 0.32425742574257427,
"grad_norm": 0.43326812982559204,
"learning_rate": 4.979437865159268e-06,
"loss": 0.7149,
"step": 393
},
{
"epoch": 0.3250825082508251,
"grad_norm": 0.44481202960014343,
"learning_rate": 4.979297464141358e-06,
"loss": 0.7421,
"step": 394
},
{
"epoch": 0.3259075907590759,
"grad_norm": 0.436131089925766,
"learning_rate": 4.979156587407388e-06,
"loss": 0.7519,
"step": 395
},
{
"epoch": 0.32673267326732675,
"grad_norm": 0.42413946986198425,
"learning_rate": 4.9790152349843904e-06,
"loss": 0.7376,
"step": 396
},
{
"epoch": 0.32755775577557755,
"grad_norm": 0.4464262127876282,
"learning_rate": 4.978873406899485e-06,
"loss": 0.7322,
"step": 397
},
{
"epoch": 0.32838283828382836,
"grad_norm": 0.43812641501426697,
"learning_rate": 4.978731103179887e-06,
"loss": 0.7336,
"step": 398
},
{
"epoch": 0.3292079207920792,
"grad_norm": 0.4382777810096741,
"learning_rate": 4.9785883238529e-06,
"loss": 0.7248,
"step": 399
},
{
"epoch": 0.33003300330033003,
"grad_norm": 0.43636494874954224,
"learning_rate": 4.978445068945918e-06,
"loss": 0.7357,
"step": 400
},
{
"epoch": 0.33085808580858084,
"grad_norm": 0.4354000687599182,
"learning_rate": 4.978301338486432e-06,
"loss": 0.7376,
"step": 401
},
{
"epoch": 0.3316831683168317,
"grad_norm": 0.44714194536209106,
"learning_rate": 4.978157132502019e-06,
"loss": 0.7002,
"step": 402
},
{
"epoch": 0.3325082508250825,
"grad_norm": 0.43645936250686646,
"learning_rate": 4.978012451020347e-06,
"loss": 0.7332,
"step": 403
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.4704788625240326,
"learning_rate": 4.977867294069178e-06,
"loss": 0.7192,
"step": 404
},
{
"epoch": 0.3341584158415842,
"grad_norm": 0.4357188642024994,
"learning_rate": 4.977721661676364e-06,
"loss": 0.721,
"step": 405
},
{
"epoch": 0.334983498349835,
"grad_norm": 0.4423423111438751,
"learning_rate": 4.977575553869848e-06,
"loss": 0.7302,
"step": 406
},
{
"epoch": 0.3358085808580858,
"grad_norm": 0.453336626291275,
"learning_rate": 4.977428970677664e-06,
"loss": 0.7213,
"step": 407
},
{
"epoch": 0.33663366336633666,
"grad_norm": 0.45698311924934387,
"learning_rate": 4.9772819121279395e-06,
"loss": 0.7483,
"step": 408
},
{
"epoch": 0.33745874587458746,
"grad_norm": 0.4569329619407654,
"learning_rate": 4.97713437824889e-06,
"loss": 0.7171,
"step": 409
},
{
"epoch": 0.33828382838283827,
"grad_norm": 0.4383382201194763,
"learning_rate": 4.976986369068823e-06,
"loss": 0.7348,
"step": 410
},
{
"epoch": 0.33910891089108913,
"grad_norm": 0.4543409049510956,
"learning_rate": 4.9768378846161395e-06,
"loss": 0.7117,
"step": 411
},
{
"epoch": 0.33993399339933994,
"grad_norm": 0.43206459283828735,
"learning_rate": 4.976688924919328e-06,
"loss": 0.7225,
"step": 412
},
{
"epoch": 0.34075907590759075,
"grad_norm": 0.4427265226840973,
"learning_rate": 4.976539490006972e-06,
"loss": 0.7373,
"step": 413
},
{
"epoch": 0.3415841584158416,
"grad_norm": 0.44355079531669617,
"learning_rate": 4.976389579907745e-06,
"loss": 0.7211,
"step": 414
},
{
"epoch": 0.3424092409240924,
"grad_norm": 0.4588415324687958,
"learning_rate": 4.976239194650407e-06,
"loss": 0.7244,
"step": 415
},
{
"epoch": 0.3432343234323432,
"grad_norm": 0.4601019620895386,
"learning_rate": 4.976088334263818e-06,
"loss": 0.7356,
"step": 416
},
{
"epoch": 0.34405940594059403,
"grad_norm": 0.4477859139442444,
"learning_rate": 4.975936998776922e-06,
"loss": 0.7194,
"step": 417
},
{
"epoch": 0.3448844884488449,
"grad_norm": 0.4489600360393524,
"learning_rate": 4.975785188218757e-06,
"loss": 0.7178,
"step": 418
},
{
"epoch": 0.3457095709570957,
"grad_norm": 0.45358383655548096,
"learning_rate": 4.975632902618451e-06,
"loss": 0.7521,
"step": 419
},
{
"epoch": 0.3465346534653465,
"grad_norm": 0.43763983249664307,
"learning_rate": 4.975480142005225e-06,
"loss": 0.7251,
"step": 420
},
{
"epoch": 0.34735973597359737,
"grad_norm": 0.4395206570625305,
"learning_rate": 4.975326906408389e-06,
"loss": 0.738,
"step": 421
},
{
"epoch": 0.3481848184818482,
"grad_norm": 0.45151060819625854,
"learning_rate": 4.975173195857346e-06,
"loss": 0.7455,
"step": 422
},
{
"epoch": 0.349009900990099,
"grad_norm": 0.45000430941581726,
"learning_rate": 4.975019010381589e-06,
"loss": 0.7263,
"step": 423
},
{
"epoch": 0.34983498349834985,
"grad_norm": 0.45460546016693115,
"learning_rate": 4.9748643500107015e-06,
"loss": 0.7155,
"step": 424
},
{
"epoch": 0.35066006600660066,
"grad_norm": 0.4484122097492218,
"learning_rate": 4.974709214774361e-06,
"loss": 0.7278,
"step": 425
},
{
"epoch": 0.35148514851485146,
"grad_norm": 0.4488012492656708,
"learning_rate": 4.974553604702332e-06,
"loss": 0.725,
"step": 426
},
{
"epoch": 0.3523102310231023,
"grad_norm": 0.45438095927238464,
"learning_rate": 4.974397519824474e-06,
"loss": 0.7087,
"step": 427
},
{
"epoch": 0.35313531353135313,
"grad_norm": 0.4460967481136322,
"learning_rate": 4.974240960170734e-06,
"loss": 0.7219,
"step": 428
},
{
"epoch": 0.35396039603960394,
"grad_norm": 0.4628778100013733,
"learning_rate": 4.974083925771154e-06,
"loss": 0.7032,
"step": 429
},
{
"epoch": 0.3547854785478548,
"grad_norm": 0.4375389516353607,
"learning_rate": 4.973926416655863e-06,
"loss": 0.6971,
"step": 430
},
{
"epoch": 0.3556105610561056,
"grad_norm": 0.47083836793899536,
"learning_rate": 4.9737684328550835e-06,
"loss": 0.7262,
"step": 431
},
{
"epoch": 0.3564356435643564,
"grad_norm": 0.4437631666660309,
"learning_rate": 4.9736099743991305e-06,
"loss": 0.7158,
"step": 432
},
{
"epoch": 0.3572607260726073,
"grad_norm": 0.44406312704086304,
"learning_rate": 4.973451041318407e-06,
"loss": 0.7324,
"step": 433
},
{
"epoch": 0.3580858085808581,
"grad_norm": 0.4787375032901764,
"learning_rate": 4.973291633643408e-06,
"loss": 0.7336,
"step": 434
},
{
"epoch": 0.3589108910891089,
"grad_norm": 0.45829135179519653,
"learning_rate": 4.9731317514047195e-06,
"loss": 0.7057,
"step": 435
},
{
"epoch": 0.35973597359735976,
"grad_norm": 0.4471598267555237,
"learning_rate": 4.972971394633021e-06,
"loss": 0.7004,
"step": 436
},
{
"epoch": 0.36056105610561057,
"grad_norm": 0.45264288783073425,
"learning_rate": 4.972810563359079e-06,
"loss": 0.7318,
"step": 437
},
{
"epoch": 0.3613861386138614,
"grad_norm": 0.4582453668117523,
"learning_rate": 4.972649257613754e-06,
"loss": 0.7188,
"step": 438
},
{
"epoch": 0.36221122112211224,
"grad_norm": 0.4455133080482483,
"learning_rate": 4.972487477427996e-06,
"loss": 0.7228,
"step": 439
},
{
"epoch": 0.36303630363036304,
"grad_norm": 0.47273391485214233,
"learning_rate": 4.972325222832848e-06,
"loss": 0.728,
"step": 440
},
{
"epoch": 0.36386138613861385,
"grad_norm": 0.4556146562099457,
"learning_rate": 4.97216249385944e-06,
"loss": 0.7391,
"step": 441
},
{
"epoch": 0.36468646864686466,
"grad_norm": 0.43739357590675354,
"learning_rate": 4.971999290538999e-06,
"loss": 0.7249,
"step": 442
},
{
"epoch": 0.3655115511551155,
"grad_norm": 0.4616910517215729,
"learning_rate": 4.971835612902838e-06,
"loss": 0.7307,
"step": 443
},
{
"epoch": 0.36633663366336633,
"grad_norm": 0.45879244804382324,
"learning_rate": 4.971671460982362e-06,
"loss": 0.7089,
"step": 444
},
{
"epoch": 0.36716171617161714,
"grad_norm": 0.4455641806125641,
"learning_rate": 4.971506834809069e-06,
"loss": 0.6983,
"step": 445
},
{
"epoch": 0.367986798679868,
"grad_norm": 0.4431065320968628,
"learning_rate": 4.971341734414546e-06,
"loss": 0.7155,
"step": 446
},
{
"epoch": 0.3688118811881188,
"grad_norm": 0.4592761993408203,
"learning_rate": 4.971176159830471e-06,
"loss": 0.7128,
"step": 447
},
{
"epoch": 0.3696369636963696,
"grad_norm": 0.4585060179233551,
"learning_rate": 4.971010111088615e-06,
"loss": 0.721,
"step": 448
},
{
"epoch": 0.3704620462046205,
"grad_norm": 0.4845431447029114,
"learning_rate": 4.970843588220839e-06,
"loss": 0.7197,
"step": 449
},
{
"epoch": 0.3712871287128713,
"grad_norm": 0.452482670545578,
"learning_rate": 4.970676591259094e-06,
"loss": 0.7445,
"step": 450
},
{
"epoch": 0.3721122112211221,
"grad_norm": 0.4393105208873749,
"learning_rate": 4.970509120235422e-06,
"loss": 0.7188,
"step": 451
},
{
"epoch": 0.37293729372937295,
"grad_norm": 0.4629392921924591,
"learning_rate": 4.970341175181957e-06,
"loss": 0.7369,
"step": 452
},
{
"epoch": 0.37376237623762376,
"grad_norm": 0.4740729033946991,
"learning_rate": 4.970172756130922e-06,
"loss": 0.7187,
"step": 453
},
{
"epoch": 0.37458745874587457,
"grad_norm": 0.45829933881759644,
"learning_rate": 4.970003863114636e-06,
"loss": 0.7142,
"step": 454
},
{
"epoch": 0.37541254125412543,
"grad_norm": 0.46147412061691284,
"learning_rate": 4.969834496165502e-06,
"loss": 0.7088,
"step": 455
},
{
"epoch": 0.37623762376237624,
"grad_norm": 0.4474492371082306,
"learning_rate": 4.96966465531602e-06,
"loss": 0.7322,
"step": 456
},
{
"epoch": 0.37706270627062705,
"grad_norm": 0.4511535167694092,
"learning_rate": 4.969494340598776e-06,
"loss": 0.7258,
"step": 457
},
{
"epoch": 0.3778877887788779,
"grad_norm": 0.46486011147499084,
"learning_rate": 4.96932355204645e-06,
"loss": 0.698,
"step": 458
},
{
"epoch": 0.3787128712871287,
"grad_norm": 0.45214101672172546,
"learning_rate": 4.969152289691813e-06,
"loss": 0.7265,
"step": 459
},
{
"epoch": 0.3795379537953795,
"grad_norm": 0.475598007440567,
"learning_rate": 4.968980553567726e-06,
"loss": 0.7156,
"step": 460
},
{
"epoch": 0.3803630363036304,
"grad_norm": 0.4582163691520691,
"learning_rate": 4.968808343707139e-06,
"loss": 0.7333,
"step": 461
},
{
"epoch": 0.3811881188118812,
"grad_norm": 0.4644453227519989,
"learning_rate": 4.968635660143096e-06,
"loss": 0.7415,
"step": 462
},
{
"epoch": 0.382013201320132,
"grad_norm": 0.45985257625579834,
"learning_rate": 4.968462502908732e-06,
"loss": 0.7594,
"step": 463
},
{
"epoch": 0.38283828382838286,
"grad_norm": 0.455812931060791,
"learning_rate": 4.968288872037269e-06,
"loss": 0.7357,
"step": 464
},
{
"epoch": 0.38366336633663367,
"grad_norm": 0.47641173005104065,
"learning_rate": 4.968114767562026e-06,
"loss": 0.7339,
"step": 465
},
{
"epoch": 0.3844884488448845,
"grad_norm": 0.4478452503681183,
"learning_rate": 4.967940189516405e-06,
"loss": 0.6818,
"step": 466
},
{
"epoch": 0.38531353135313534,
"grad_norm": 0.4571129381656647,
"learning_rate": 4.9677651379339065e-06,
"loss": 0.6977,
"step": 467
},
{
"epoch": 0.38613861386138615,
"grad_norm": 0.46259671449661255,
"learning_rate": 4.967589612848117e-06,
"loss": 0.7378,
"step": 468
},
{
"epoch": 0.38696369636963696,
"grad_norm": 0.4669695496559143,
"learning_rate": 4.9674136142927165e-06,
"loss": 0.6989,
"step": 469
},
{
"epoch": 0.38778877887788776,
"grad_norm": 0.45127734541893005,
"learning_rate": 4.967237142301474e-06,
"loss": 0.7299,
"step": 470
},
{
"epoch": 0.3886138613861386,
"grad_norm": 0.4583680033683777,
"learning_rate": 4.967060196908251e-06,
"loss": 0.7245,
"step": 471
},
{
"epoch": 0.38943894389438943,
"grad_norm": 0.46106797456741333,
"learning_rate": 4.966882778146997e-06,
"loss": 0.718,
"step": 472
},
{
"epoch": 0.39026402640264024,
"grad_norm": 0.4659496247768402,
"learning_rate": 4.9667048860517575e-06,
"loss": 0.7357,
"step": 473
},
{
"epoch": 0.3910891089108911,
"grad_norm": 0.4740099012851715,
"learning_rate": 4.966526520656663e-06,
"loss": 0.7338,
"step": 474
},
{
"epoch": 0.3919141914191419,
"grad_norm": 0.46527743339538574,
"learning_rate": 4.966347681995938e-06,
"loss": 0.6972,
"step": 475
},
{
"epoch": 0.3927392739273927,
"grad_norm": 0.4390571713447571,
"learning_rate": 4.966168370103897e-06,
"loss": 0.711,
"step": 476
},
{
"epoch": 0.3935643564356436,
"grad_norm": 0.46139904856681824,
"learning_rate": 4.965988585014946e-06,
"loss": 0.7396,
"step": 477
},
{
"epoch": 0.3943894389438944,
"grad_norm": 0.4442991614341736,
"learning_rate": 4.9658083267635814e-06,
"loss": 0.7077,
"step": 478
},
{
"epoch": 0.3952145214521452,
"grad_norm": 0.4543309509754181,
"learning_rate": 4.965627595384391e-06,
"loss": 0.7017,
"step": 479
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.45788052678108215,
"learning_rate": 4.965446390912051e-06,
"loss": 0.7096,
"step": 480
},
{
"epoch": 0.39686468646864687,
"grad_norm": 0.4616749584674835,
"learning_rate": 4.965264713381331e-06,
"loss": 0.7261,
"step": 481
},
{
"epoch": 0.3976897689768977,
"grad_norm": 0.4634535014629364,
"learning_rate": 4.965082562827091e-06,
"loss": 0.7193,
"step": 482
},
{
"epoch": 0.39851485148514854,
"grad_norm": 0.4743461608886719,
"learning_rate": 4.96489993928428e-06,
"loss": 0.7076,
"step": 483
},
{
"epoch": 0.39933993399339934,
"grad_norm": 0.4673152565956116,
"learning_rate": 4.964716842787939e-06,
"loss": 0.7153,
"step": 484
},
{
"epoch": 0.40016501650165015,
"grad_norm": 0.456820547580719,
"learning_rate": 4.964533273373201e-06,
"loss": 0.7348,
"step": 485
},
{
"epoch": 0.400990099009901,
"grad_norm": 0.4581908881664276,
"learning_rate": 4.964349231075287e-06,
"loss": 0.7185,
"step": 486
},
{
"epoch": 0.4018151815181518,
"grad_norm": 0.4570706784725189,
"learning_rate": 4.964164715929512e-06,
"loss": 0.7033,
"step": 487
},
{
"epoch": 0.40264026402640263,
"grad_norm": 0.4756205677986145,
"learning_rate": 4.9639797279712775e-06,
"loss": 0.7375,
"step": 488
},
{
"epoch": 0.4034653465346535,
"grad_norm": 0.4577876925468445,
"learning_rate": 4.96379426723608e-06,
"loss": 0.736,
"step": 489
},
{
"epoch": 0.4042904290429043,
"grad_norm": 0.4596739411354065,
"learning_rate": 4.963608333759505e-06,
"loss": 0.7389,
"step": 490
},
{
"epoch": 0.4051155115511551,
"grad_norm": 0.4691055715084076,
"learning_rate": 4.963421927577227e-06,
"loss": 0.7018,
"step": 491
},
{
"epoch": 0.40594059405940597,
"grad_norm": 0.4546271860599518,
"learning_rate": 4.963235048725014e-06,
"loss": 0.7242,
"step": 492
},
{
"epoch": 0.4067656765676568,
"grad_norm": 0.465183287858963,
"learning_rate": 4.963047697238722e-06,
"loss": 0.7073,
"step": 493
},
{
"epoch": 0.4075907590759076,
"grad_norm": 0.456122487783432,
"learning_rate": 4.962859873154301e-06,
"loss": 0.7329,
"step": 494
},
{
"epoch": 0.4084158415841584,
"grad_norm": 0.4514836370944977,
"learning_rate": 4.962671576507788e-06,
"loss": 0.7001,
"step": 495
},
{
"epoch": 0.40924092409240925,
"grad_norm": 0.45486271381378174,
"learning_rate": 4.9624828073353144e-06,
"loss": 0.742,
"step": 496
},
{
"epoch": 0.41006600660066006,
"grad_norm": 0.48726174235343933,
"learning_rate": 4.962293565673099e-06,
"loss": 0.7257,
"step": 497
},
{
"epoch": 0.41089108910891087,
"grad_norm": 0.4707367420196533,
"learning_rate": 4.9621038515574535e-06,
"loss": 0.7586,
"step": 498
},
{
"epoch": 0.41171617161716173,
"grad_norm": 0.4668041467666626,
"learning_rate": 4.961913665024778e-06,
"loss": 0.7141,
"step": 499
},
{
"epoch": 0.41254125412541254,
"grad_norm": 0.47152554988861084,
"learning_rate": 4.961723006111566e-06,
"loss": 0.7227,
"step": 500
},
{
"epoch": 0.41336633663366334,
"grad_norm": 0.449969083070755,
"learning_rate": 4.9615318748544e-06,
"loss": 0.7256,
"step": 501
},
{
"epoch": 0.4141914191419142,
"grad_norm": 0.45015445351600647,
"learning_rate": 4.9613402712899516e-06,
"loss": 0.7176,
"step": 502
},
{
"epoch": 0.415016501650165,
"grad_norm": 0.4499993920326233,
"learning_rate": 4.961148195454988e-06,
"loss": 0.7228,
"step": 503
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.4471442401409149,
"learning_rate": 4.960955647386361e-06,
"loss": 0.7019,
"step": 504
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.46286097168922424,
"learning_rate": 4.9607626271210165e-06,
"loss": 0.6826,
"step": 505
},
{
"epoch": 0.4174917491749175,
"grad_norm": 0.46081873774528503,
"learning_rate": 4.960569134695991e-06,
"loss": 0.7256,
"step": 506
},
{
"epoch": 0.4183168316831683,
"grad_norm": 0.46333253383636475,
"learning_rate": 4.9603751701484115e-06,
"loss": 0.7328,
"step": 507
},
{
"epoch": 0.41914191419141916,
"grad_norm": 0.45679524540901184,
"learning_rate": 4.960180733515494e-06,
"loss": 0.7332,
"step": 508
},
{
"epoch": 0.41996699669966997,
"grad_norm": 0.45151451230049133,
"learning_rate": 4.959985824834546e-06,
"loss": 0.7418,
"step": 509
},
{
"epoch": 0.4207920792079208,
"grad_norm": 0.47151753306388855,
"learning_rate": 4.9597904441429664e-06,
"loss": 0.7294,
"step": 510
},
{
"epoch": 0.42161716171617164,
"grad_norm": 0.4531096816062927,
"learning_rate": 4.959594591478243e-06,
"loss": 0.707,
"step": 511
},
{
"epoch": 0.42244224422442245,
"grad_norm": 0.4700416028499603,
"learning_rate": 4.959398266877955e-06,
"loss": 0.728,
"step": 512
},
{
"epoch": 0.42326732673267325,
"grad_norm": 0.4657609760761261,
"learning_rate": 4.959201470379774e-06,
"loss": 0.7175,
"step": 513
},
{
"epoch": 0.4240924092409241,
"grad_norm": 0.4651918113231659,
"learning_rate": 4.959004202021459e-06,
"loss": 0.7181,
"step": 514
},
{
"epoch": 0.4249174917491749,
"grad_norm": 0.451061487197876,
"learning_rate": 4.95880646184086e-06,
"loss": 0.7576,
"step": 515
},
{
"epoch": 0.42574257425742573,
"grad_norm": 0.4648871421813965,
"learning_rate": 4.958608249875921e-06,
"loss": 0.7155,
"step": 516
},
{
"epoch": 0.4265676567656766,
"grad_norm": 0.45435768365859985,
"learning_rate": 4.9584095661646725e-06,
"loss": 0.7244,
"step": 517
},
{
"epoch": 0.4273927392739274,
"grad_norm": 0.4550236463546753,
"learning_rate": 4.958210410745237e-06,
"loss": 0.729,
"step": 518
},
{
"epoch": 0.4282178217821782,
"grad_norm": 0.4895179271697998,
"learning_rate": 4.958010783655827e-06,
"loss": 0.7091,
"step": 519
},
{
"epoch": 0.429042904290429,
"grad_norm": 0.4642874002456665,
"learning_rate": 4.957810684934747e-06,
"loss": 0.7122,
"step": 520
},
{
"epoch": 0.4298679867986799,
"grad_norm": 0.4746834933757782,
"learning_rate": 4.9576101146203905e-06,
"loss": 0.6977,
"step": 521
},
{
"epoch": 0.4306930693069307,
"grad_norm": 0.4938143491744995,
"learning_rate": 4.957409072751243e-06,
"loss": 0.7213,
"step": 522
},
{
"epoch": 0.4315181518151815,
"grad_norm": 0.4942130744457245,
"learning_rate": 4.957207559365877e-06,
"loss": 0.6968,
"step": 523
},
{
"epoch": 0.43234323432343236,
"grad_norm": 0.47856637835502625,
"learning_rate": 4.957005574502961e-06,
"loss": 0.7075,
"step": 524
},
{
"epoch": 0.43316831683168316,
"grad_norm": 0.4804016351699829,
"learning_rate": 4.9568031182012485e-06,
"loss": 0.7298,
"step": 525
},
{
"epoch": 0.43399339933993397,
"grad_norm": 0.46422865986824036,
"learning_rate": 4.956600190499588e-06,
"loss": 0.696,
"step": 526
},
{
"epoch": 0.43481848184818483,
"grad_norm": 0.4641623795032501,
"learning_rate": 4.956396791436915e-06,
"loss": 0.7217,
"step": 527
},
{
"epoch": 0.43564356435643564,
"grad_norm": 0.4901314377784729,
"learning_rate": 4.956192921052256e-06,
"loss": 0.7075,
"step": 528
},
{
"epoch": 0.43646864686468645,
"grad_norm": 0.46495771408081055,
"learning_rate": 4.955988579384731e-06,
"loss": 0.7046,
"step": 529
},
{
"epoch": 0.4372937293729373,
"grad_norm": 0.4624289870262146,
"learning_rate": 4.955783766473546e-06,
"loss": 0.7527,
"step": 530
},
{
"epoch": 0.4381188118811881,
"grad_norm": 0.4648906886577606,
"learning_rate": 4.955578482358e-06,
"loss": 0.7188,
"step": 531
},
{
"epoch": 0.4389438943894389,
"grad_norm": 0.46676790714263916,
"learning_rate": 4.955372727077483e-06,
"loss": 0.6907,
"step": 532
},
{
"epoch": 0.4397689768976898,
"grad_norm": 0.46131378412246704,
"learning_rate": 4.955166500671474e-06,
"loss": 0.7272,
"step": 533
},
{
"epoch": 0.4405940594059406,
"grad_norm": 0.4766186773777008,
"learning_rate": 4.954959803179542e-06,
"loss": 0.6933,
"step": 534
},
{
"epoch": 0.4414191419141914,
"grad_norm": 0.47350916266441345,
"learning_rate": 4.954752634641347e-06,
"loss": 0.7001,
"step": 535
},
{
"epoch": 0.44224422442244227,
"grad_norm": 0.4756217300891876,
"learning_rate": 4.954544995096641e-06,
"loss": 0.7256,
"step": 536
},
{
"epoch": 0.4430693069306931,
"grad_norm": 0.48165464401245117,
"learning_rate": 4.954336884585264e-06,
"loss": 0.7212,
"step": 537
},
{
"epoch": 0.4438943894389439,
"grad_norm": 0.4626830816268921,
"learning_rate": 4.954128303147146e-06,
"loss": 0.7183,
"step": 538
},
{
"epoch": 0.44471947194719474,
"grad_norm": 0.47849732637405396,
"learning_rate": 4.953919250822312e-06,
"loss": 0.7089,
"step": 539
},
{
"epoch": 0.44554455445544555,
"grad_norm": 0.46145811676979065,
"learning_rate": 4.95370972765087e-06,
"loss": 0.7144,
"step": 540
},
{
"epoch": 0.44636963696369636,
"grad_norm": 0.4692698121070862,
"learning_rate": 4.953499733673026e-06,
"loss": 0.7307,
"step": 541
},
{
"epoch": 0.4471947194719472,
"grad_norm": 0.4780563414096832,
"learning_rate": 4.95328926892907e-06,
"loss": 0.6988,
"step": 542
},
{
"epoch": 0.44801980198019803,
"grad_norm": 0.45988762378692627,
"learning_rate": 4.953078333459386e-06,
"loss": 0.7342,
"step": 543
},
{
"epoch": 0.44884488448844884,
"grad_norm": 0.47155487537384033,
"learning_rate": 4.952866927304447e-06,
"loss": 0.6989,
"step": 544
},
{
"epoch": 0.44966996699669964,
"grad_norm": 0.48000568151474,
"learning_rate": 4.952655050504817e-06,
"loss": 0.7206,
"step": 545
},
{
"epoch": 0.4504950495049505,
"grad_norm": 0.47779932618141174,
"learning_rate": 4.95244270310115e-06,
"loss": 0.7062,
"step": 546
},
{
"epoch": 0.4513201320132013,
"grad_norm": 0.48263299465179443,
"learning_rate": 4.95222988513419e-06,
"loss": 0.6971,
"step": 547
},
{
"epoch": 0.4521452145214521,
"grad_norm": 0.4724646210670471,
"learning_rate": 4.9520165966447715e-06,
"loss": 0.7213,
"step": 548
},
{
"epoch": 0.452970297029703,
"grad_norm": 0.4655725657939911,
"learning_rate": 4.9518028376738196e-06,
"loss": 0.7234,
"step": 549
},
{
"epoch": 0.4537953795379538,
"grad_norm": 0.4778476357460022,
"learning_rate": 4.9515886082623485e-06,
"loss": 0.7148,
"step": 550
},
{
"epoch": 0.4546204620462046,
"grad_norm": 0.47832944989204407,
"learning_rate": 4.951373908451465e-06,
"loss": 0.705,
"step": 551
},
{
"epoch": 0.45544554455445546,
"grad_norm": 0.4635158181190491,
"learning_rate": 4.951158738282364e-06,
"loss": 0.7112,
"step": 552
},
{
"epoch": 0.45627062706270627,
"grad_norm": 0.48013851046562195,
"learning_rate": 4.95094309779633e-06,
"loss": 0.7216,
"step": 553
},
{
"epoch": 0.4570957095709571,
"grad_norm": 0.4669191241264343,
"learning_rate": 4.950726987034741e-06,
"loss": 0.7045,
"step": 554
},
{
"epoch": 0.45792079207920794,
"grad_norm": 0.4785819947719574,
"learning_rate": 4.950510406039063e-06,
"loss": 0.6867,
"step": 555
},
{
"epoch": 0.45874587458745875,
"grad_norm": 0.46676164865493774,
"learning_rate": 4.9502933548508515e-06,
"loss": 0.7278,
"step": 556
},
{
"epoch": 0.45957095709570955,
"grad_norm": 0.4710772931575775,
"learning_rate": 4.950075833511755e-06,
"loss": 0.7155,
"step": 557
},
{
"epoch": 0.4603960396039604,
"grad_norm": 0.48126041889190674,
"learning_rate": 4.949857842063509e-06,
"loss": 0.7033,
"step": 558
},
{
"epoch": 0.4612211221122112,
"grad_norm": 0.4683317542076111,
"learning_rate": 4.949639380547941e-06,
"loss": 0.7294,
"step": 559
},
{
"epoch": 0.46204620462046203,
"grad_norm": 0.47730982303619385,
"learning_rate": 4.949420449006968e-06,
"loss": 0.7065,
"step": 560
},
{
"epoch": 0.4628712871287129,
"grad_norm": 0.4867958426475525,
"learning_rate": 4.949201047482599e-06,
"loss": 0.72,
"step": 561
},
{
"epoch": 0.4636963696369637,
"grad_norm": 0.4763893485069275,
"learning_rate": 4.94898117601693e-06,
"loss": 0.7158,
"step": 562
},
{
"epoch": 0.4645214521452145,
"grad_norm": 0.49231722950935364,
"learning_rate": 4.94876083465215e-06,
"loss": 0.729,
"step": 563
},
{
"epoch": 0.46534653465346537,
"grad_norm": 0.4658971130847931,
"learning_rate": 4.948540023430538e-06,
"loss": 0.6907,
"step": 564
},
{
"epoch": 0.4661716171617162,
"grad_norm": 0.4774307906627655,
"learning_rate": 4.948318742394459e-06,
"loss": 0.6925,
"step": 565
},
{
"epoch": 0.466996699669967,
"grad_norm": 0.47116512060165405,
"learning_rate": 4.948096991586375e-06,
"loss": 0.7229,
"step": 566
},
{
"epoch": 0.46782178217821785,
"grad_norm": 0.47676825523376465,
"learning_rate": 4.947874771048833e-06,
"loss": 0.7248,
"step": 567
},
{
"epoch": 0.46864686468646866,
"grad_norm": 0.4748082756996155,
"learning_rate": 4.94765208082447e-06,
"loss": 0.7335,
"step": 568
},
{
"epoch": 0.46947194719471946,
"grad_norm": 0.4675842821598053,
"learning_rate": 4.9474289209560174e-06,
"loss": 0.7169,
"step": 569
},
{
"epoch": 0.47029702970297027,
"grad_norm": 0.46060022711753845,
"learning_rate": 4.947205291486293e-06,
"loss": 0.7246,
"step": 570
},
{
"epoch": 0.47112211221122113,
"grad_norm": 0.4750185012817383,
"learning_rate": 4.9469811924582065e-06,
"loss": 0.6989,
"step": 571
},
{
"epoch": 0.47194719471947194,
"grad_norm": 0.46624764800071716,
"learning_rate": 4.9467566239147555e-06,
"loss": 0.7053,
"step": 572
},
{
"epoch": 0.47277227722772275,
"grad_norm": 0.4750285744667053,
"learning_rate": 4.94653158589903e-06,
"loss": 0.7189,
"step": 573
},
{
"epoch": 0.4735973597359736,
"grad_norm": 0.48672083020210266,
"learning_rate": 4.946306078454209e-06,
"loss": 0.7255,
"step": 574
},
{
"epoch": 0.4744224422442244,
"grad_norm": 0.4581200182437897,
"learning_rate": 4.9460801016235625e-06,
"loss": 0.6919,
"step": 575
},
{
"epoch": 0.4752475247524752,
"grad_norm": 0.4761126637458801,
"learning_rate": 4.945853655450449e-06,
"loss": 0.7125,
"step": 576
},
{
"epoch": 0.4760726072607261,
"grad_norm": 0.4857783913612366,
"learning_rate": 4.945626739978319e-06,
"loss": 0.7098,
"step": 577
},
{
"epoch": 0.4768976897689769,
"grad_norm": 0.5091535449028015,
"learning_rate": 4.94539935525071e-06,
"loss": 0.723,
"step": 578
},
{
"epoch": 0.4777227722772277,
"grad_norm": 0.5000525712966919,
"learning_rate": 4.9451715013112545e-06,
"loss": 0.7165,
"step": 579
},
{
"epoch": 0.47854785478547857,
"grad_norm": 0.4900214970111847,
"learning_rate": 4.9449431782036695e-06,
"loss": 0.7106,
"step": 580
},
{
"epoch": 0.4793729372937294,
"grad_norm": 0.4770774841308594,
"learning_rate": 4.9447143859717664e-06,
"loss": 0.7395,
"step": 581
},
{
"epoch": 0.4801980198019802,
"grad_norm": 0.49591994285583496,
"learning_rate": 4.944485124659443e-06,
"loss": 0.6861,
"step": 582
},
{
"epoch": 0.48102310231023104,
"grad_norm": 0.4733443856239319,
"learning_rate": 4.944255394310689e-06,
"loss": 0.7321,
"step": 583
},
{
"epoch": 0.48184818481848185,
"grad_norm": 0.4814870357513428,
"learning_rate": 4.944025194969586e-06,
"loss": 0.7042,
"step": 584
},
{
"epoch": 0.48267326732673266,
"grad_norm": 0.4886230528354645,
"learning_rate": 4.943794526680302e-06,
"loss": 0.7151,
"step": 585
},
{
"epoch": 0.4834983498349835,
"grad_norm": 0.47293350100517273,
"learning_rate": 4.943563389487097e-06,
"loss": 0.7109,
"step": 586
},
{
"epoch": 0.48432343234323433,
"grad_norm": 0.4692576229572296,
"learning_rate": 4.94333178343432e-06,
"loss": 0.6982,
"step": 587
},
{
"epoch": 0.48514851485148514,
"grad_norm": 0.479910671710968,
"learning_rate": 4.9430997085664105e-06,
"loss": 0.728,
"step": 588
},
{
"epoch": 0.485973597359736,
"grad_norm": 0.48567378520965576,
"learning_rate": 4.942867164927899e-06,
"loss": 0.6874,
"step": 589
},
{
"epoch": 0.4867986798679868,
"grad_norm": 0.4664131700992584,
"learning_rate": 4.942634152563405e-06,
"loss": 0.7371,
"step": 590
},
{
"epoch": 0.4876237623762376,
"grad_norm": 0.4846361577510834,
"learning_rate": 4.942400671517635e-06,
"loss": 0.7066,
"step": 591
},
{
"epoch": 0.4884488448844885,
"grad_norm": 0.48338061571121216,
"learning_rate": 4.942166721835392e-06,
"loss": 0.7282,
"step": 592
},
{
"epoch": 0.4892739273927393,
"grad_norm": 0.4792849123477936,
"learning_rate": 4.941932303561563e-06,
"loss": 0.7033,
"step": 593
},
{
"epoch": 0.4900990099009901,
"grad_norm": 0.47274795174598694,
"learning_rate": 4.941697416741128e-06,
"loss": 0.7016,
"step": 594
},
{
"epoch": 0.4909240924092409,
"grad_norm": 0.4659174680709839,
"learning_rate": 4.9414620614191555e-06,
"loss": 0.7162,
"step": 595
},
{
"epoch": 0.49174917491749176,
"grad_norm": 0.486172080039978,
"learning_rate": 4.941226237640804e-06,
"loss": 0.6966,
"step": 596
},
{
"epoch": 0.49257425742574257,
"grad_norm": 0.4819536507129669,
"learning_rate": 4.940989945451323e-06,
"loss": 0.7138,
"step": 597
},
{
"epoch": 0.4933993399339934,
"grad_norm": 0.4761784076690674,
"learning_rate": 4.940753184896051e-06,
"loss": 0.6963,
"step": 598
},
{
"epoch": 0.49422442244224424,
"grad_norm": 0.48324984312057495,
"learning_rate": 4.940515956020416e-06,
"loss": 0.7029,
"step": 599
},
{
"epoch": 0.49504950495049505,
"grad_norm": 0.49788162112236023,
"learning_rate": 4.940278258869937e-06,
"loss": 0.7065,
"step": 600
},
{
"epoch": 0.49587458745874585,
"grad_norm": 0.5038782954216003,
"learning_rate": 4.940040093490223e-06,
"loss": 0.6954,
"step": 601
},
{
"epoch": 0.4966996699669967,
"grad_norm": 0.4979074001312256,
"learning_rate": 4.939801459926969e-06,
"loss": 0.7374,
"step": 602
},
{
"epoch": 0.4975247524752475,
"grad_norm": 0.468302458524704,
"learning_rate": 4.9395623582259665e-06,
"loss": 0.7201,
"step": 603
},
{
"epoch": 0.49834983498349833,
"grad_norm": 0.484279990196228,
"learning_rate": 4.939322788433091e-06,
"loss": 0.7053,
"step": 604
},
{
"epoch": 0.4991749174917492,
"grad_norm": 0.4879089295864105,
"learning_rate": 4.939082750594311e-06,
"loss": 0.721,
"step": 605
},
{
"epoch": 0.5,
"grad_norm": 0.48636701703071594,
"learning_rate": 4.938842244755683e-06,
"loss": 0.694,
"step": 606
},
{
"epoch": 0.5008250825082509,
"grad_norm": 0.48307308554649353,
"learning_rate": 4.938601270963355e-06,
"loss": 0.7183,
"step": 607
},
{
"epoch": 0.5016501650165016,
"grad_norm": 0.476773738861084,
"learning_rate": 4.938359829263564e-06,
"loss": 0.7125,
"step": 608
},
{
"epoch": 0.5024752475247525,
"grad_norm": 0.4792875051498413,
"learning_rate": 4.938117919702636e-06,
"loss": 0.71,
"step": 609
},
{
"epoch": 0.5033003300330033,
"grad_norm": 0.486017644405365,
"learning_rate": 4.937875542326989e-06,
"loss": 0.7208,
"step": 610
},
{
"epoch": 0.5041254125412541,
"grad_norm": 0.49490052461624146,
"learning_rate": 4.937632697183126e-06,
"loss": 0.7107,
"step": 611
},
{
"epoch": 0.504950495049505,
"grad_norm": 0.4896621108055115,
"learning_rate": 4.937389384317647e-06,
"loss": 0.731,
"step": 612
},
{
"epoch": 0.5057755775577558,
"grad_norm": 0.4702865779399872,
"learning_rate": 4.937145603777234e-06,
"loss": 0.6861,
"step": 613
},
{
"epoch": 0.5066006600660066,
"grad_norm": 0.49433633685112,
"learning_rate": 4.936901355608665e-06,
"loss": 0.7039,
"step": 614
},
{
"epoch": 0.5074257425742574,
"grad_norm": 0.46913057565689087,
"learning_rate": 4.936656639858805e-06,
"loss": 0.7173,
"step": 615
},
{
"epoch": 0.5082508250825083,
"grad_norm": 0.4838907718658447,
"learning_rate": 4.936411456574608e-06,
"loss": 0.7291,
"step": 616
},
{
"epoch": 0.509075907590759,
"grad_norm": 0.48521754145622253,
"learning_rate": 4.936165805803119e-06,
"loss": 0.7169,
"step": 617
},
{
"epoch": 0.5099009900990099,
"grad_norm": 0.48856601119041443,
"learning_rate": 4.9359196875914725e-06,
"loss": 0.7279,
"step": 618
},
{
"epoch": 0.5107260726072608,
"grad_norm": 0.4963640868663788,
"learning_rate": 4.935673101986892e-06,
"loss": 0.737,
"step": 619
},
{
"epoch": 0.5115511551155115,
"grad_norm": 0.48461318016052246,
"learning_rate": 4.935426049036692e-06,
"loss": 0.7006,
"step": 620
},
{
"epoch": 0.5123762376237624,
"grad_norm": 0.477611780166626,
"learning_rate": 4.935178528788275e-06,
"loss": 0.6816,
"step": 621
},
{
"epoch": 0.5132013201320133,
"grad_norm": 0.4781244397163391,
"learning_rate": 4.934930541289134e-06,
"loss": 0.6946,
"step": 622
},
{
"epoch": 0.514026402640264,
"grad_norm": 0.47109952569007874,
"learning_rate": 4.934682086586853e-06,
"loss": 0.6967,
"step": 623
},
{
"epoch": 0.5148514851485149,
"grad_norm": 0.4875166416168213,
"learning_rate": 4.934433164729103e-06,
"loss": 0.7215,
"step": 624
},
{
"epoch": 0.5156765676567657,
"grad_norm": 0.4789406955242157,
"learning_rate": 4.934183775763647e-06,
"loss": 0.7005,
"step": 625
},
{
"epoch": 0.5165016501650165,
"grad_norm": 0.4995609223842621,
"learning_rate": 4.933933919738336e-06,
"loss": 0.7326,
"step": 626
},
{
"epoch": 0.5173267326732673,
"grad_norm": 0.4869863986968994,
"learning_rate": 4.933683596701111e-06,
"loss": 0.7073,
"step": 627
},
{
"epoch": 0.5181518151815182,
"grad_norm": 0.4834015369415283,
"learning_rate": 4.933432806700004e-06,
"loss": 0.7098,
"step": 628
},
{
"epoch": 0.518976897689769,
"grad_norm": 0.483574241399765,
"learning_rate": 4.933181549783132e-06,
"loss": 0.6863,
"step": 629
},
{
"epoch": 0.5198019801980198,
"grad_norm": 0.48267966508865356,
"learning_rate": 4.93292982599871e-06,
"loss": 0.7008,
"step": 630
},
{
"epoch": 0.5206270627062707,
"grad_norm": 0.471221387386322,
"learning_rate": 4.932677635395035e-06,
"loss": 0.7033,
"step": 631
},
{
"epoch": 0.5214521452145214,
"grad_norm": 0.4946180284023285,
"learning_rate": 4.932424978020495e-06,
"loss": 0.7133,
"step": 632
},
{
"epoch": 0.5222772277227723,
"grad_norm": 0.48916375637054443,
"learning_rate": 4.93217185392357e-06,
"loss": 0.7147,
"step": 633
},
{
"epoch": 0.523102310231023,
"grad_norm": 0.49226388335227966,
"learning_rate": 4.931918263152829e-06,
"loss": 0.7093,
"step": 634
},
{
"epoch": 0.5239273927392739,
"grad_norm": 0.5069959759712219,
"learning_rate": 4.931664205756928e-06,
"loss": 0.7247,
"step": 635
},
{
"epoch": 0.5247524752475248,
"grad_norm": 0.5008878707885742,
"learning_rate": 4.9314096817846166e-06,
"loss": 0.7134,
"step": 636
},
{
"epoch": 0.5255775577557755,
"grad_norm": 0.4857085645198822,
"learning_rate": 4.9311546912847305e-06,
"loss": 0.7214,
"step": 637
},
{
"epoch": 0.5264026402640264,
"grad_norm": 0.4922746419906616,
"learning_rate": 4.930899234306196e-06,
"loss": 0.7348,
"step": 638
},
{
"epoch": 0.5272277227722773,
"grad_norm": 0.522466242313385,
"learning_rate": 4.930643310898028e-06,
"loss": 0.7035,
"step": 639
},
{
"epoch": 0.528052805280528,
"grad_norm": 0.4872569441795349,
"learning_rate": 4.930386921109334e-06,
"loss": 0.7033,
"step": 640
},
{
"epoch": 0.5288778877887789,
"grad_norm": 0.5073298215866089,
"learning_rate": 4.930130064989308e-06,
"loss": 0.6994,
"step": 641
},
{
"epoch": 0.5297029702970297,
"grad_norm": 0.48736149072647095,
"learning_rate": 4.929872742587233e-06,
"loss": 0.7089,
"step": 642
},
{
"epoch": 0.5305280528052805,
"grad_norm": 0.4810430109500885,
"learning_rate": 4.929614953952485e-06,
"loss": 0.7034,
"step": 643
},
{
"epoch": 0.5313531353135313,
"grad_norm": 0.5020985007286072,
"learning_rate": 4.929356699134526e-06,
"loss": 0.7086,
"step": 644
},
{
"epoch": 0.5321782178217822,
"grad_norm": 0.48785045742988586,
"learning_rate": 4.929097978182909e-06,
"loss": 0.6918,
"step": 645
},
{
"epoch": 0.533003300330033,
"grad_norm": 0.5045816898345947,
"learning_rate": 4.928838791147277e-06,
"loss": 0.7361,
"step": 646
},
{
"epoch": 0.5338283828382838,
"grad_norm": 0.5134187936782837,
"learning_rate": 4.9285791380773596e-06,
"loss": 0.6857,
"step": 647
},
{
"epoch": 0.5346534653465347,
"grad_norm": 0.4964018762111664,
"learning_rate": 4.9283190190229795e-06,
"loss": 0.7001,
"step": 648
},
{
"epoch": 0.5354785478547854,
"grad_norm": 0.5150806307792664,
"learning_rate": 4.928058434034047e-06,
"loss": 0.7254,
"step": 649
},
{
"epoch": 0.5363036303630363,
"grad_norm": 0.5018999576568604,
"learning_rate": 4.927797383160561e-06,
"loss": 0.7308,
"step": 650
},
{
"epoch": 0.5371287128712872,
"grad_norm": 0.47672152519226074,
"learning_rate": 4.927535866452612e-06,
"loss": 0.7032,
"step": 651
},
{
"epoch": 0.5379537953795379,
"grad_norm": 0.5027835965156555,
"learning_rate": 4.927273883960378e-06,
"loss": 0.7258,
"step": 652
},
{
"epoch": 0.5387788778877888,
"grad_norm": 0.5115182399749756,
"learning_rate": 4.9270114357341265e-06,
"loss": 0.7054,
"step": 653
},
{
"epoch": 0.5396039603960396,
"grad_norm": 0.49437984824180603,
"learning_rate": 4.926748521824215e-06,
"loss": 0.698,
"step": 654
},
{
"epoch": 0.5404290429042904,
"grad_norm": 0.49635687470436096,
"learning_rate": 4.926485142281091e-06,
"loss": 0.6807,
"step": 655
},
{
"epoch": 0.5412541254125413,
"grad_norm": 0.4810151755809784,
"learning_rate": 4.92622129715529e-06,
"loss": 0.6833,
"step": 656
},
{
"epoch": 0.5420792079207921,
"grad_norm": 0.48570844531059265,
"learning_rate": 4.9259569864974374e-06,
"loss": 0.7319,
"step": 657
},
{
"epoch": 0.5429042904290429,
"grad_norm": 0.48881736397743225,
"learning_rate": 4.925692210358248e-06,
"loss": 0.6801,
"step": 658
},
{
"epoch": 0.5437293729372937,
"grad_norm": 0.4965687096118927,
"learning_rate": 4.925426968788525e-06,
"loss": 0.6843,
"step": 659
},
{
"epoch": 0.5445544554455446,
"grad_norm": 0.5146209597587585,
"learning_rate": 4.925161261839163e-06,
"loss": 0.71,
"step": 660
},
{
"epoch": 0.5453795379537953,
"grad_norm": 0.5105807781219482,
"learning_rate": 4.924895089561144e-06,
"loss": 0.7021,
"step": 661
},
{
"epoch": 0.5462046204620462,
"grad_norm": 0.5018863081932068,
"learning_rate": 4.92462845200554e-06,
"loss": 0.678,
"step": 662
},
{
"epoch": 0.5470297029702971,
"grad_norm": 0.4923059940338135,
"learning_rate": 4.924361349223512e-06,
"loss": 0.6943,
"step": 663
},
{
"epoch": 0.5478547854785478,
"grad_norm": 0.5007607936859131,
"learning_rate": 4.92409378126631e-06,
"loss": 0.6811,
"step": 664
},
{
"epoch": 0.5486798679867987,
"grad_norm": 0.481728732585907,
"learning_rate": 4.923825748185275e-06,
"loss": 0.7166,
"step": 665
},
{
"epoch": 0.5495049504950495,
"grad_norm": 0.4846872091293335,
"learning_rate": 4.923557250031834e-06,
"loss": 0.6815,
"step": 666
},
{
"epoch": 0.5503300330033003,
"grad_norm": 0.4947971999645233,
"learning_rate": 4.923288286857508e-06,
"loss": 0.7153,
"step": 667
},
{
"epoch": 0.5511551155115512,
"grad_norm": 0.5153589248657227,
"learning_rate": 4.923018858713902e-06,
"loss": 0.7191,
"step": 668
},
{
"epoch": 0.551980198019802,
"grad_norm": 0.515349805355072,
"learning_rate": 4.922748965652713e-06,
"loss": 0.7223,
"step": 669
},
{
"epoch": 0.5528052805280528,
"grad_norm": 0.4987703263759613,
"learning_rate": 4.922478607725728e-06,
"loss": 0.6917,
"step": 670
},
{
"epoch": 0.5536303630363036,
"grad_norm": 0.4923804700374603,
"learning_rate": 4.92220778498482e-06,
"loss": 0.7272,
"step": 671
},
{
"epoch": 0.5544554455445545,
"grad_norm": 0.495511531829834,
"learning_rate": 4.921936497481956e-06,
"loss": 0.7183,
"step": 672
},
{
"epoch": 0.5552805280528053,
"grad_norm": 0.5124722719192505,
"learning_rate": 4.921664745269187e-06,
"loss": 0.7023,
"step": 673
},
{
"epoch": 0.5561056105610561,
"grad_norm": 0.49752897024154663,
"learning_rate": 4.921392528398656e-06,
"loss": 0.7228,
"step": 674
},
{
"epoch": 0.556930693069307,
"grad_norm": 0.5025122761726379,
"learning_rate": 4.9211198469225955e-06,
"loss": 0.697,
"step": 675
},
{
"epoch": 0.5577557755775577,
"grad_norm": 0.5009769201278687,
"learning_rate": 4.920846700893326e-06,
"loss": 0.7079,
"step": 676
},
{
"epoch": 0.5585808580858086,
"grad_norm": 0.5044685006141663,
"learning_rate": 4.920573090363257e-06,
"loss": 0.7345,
"step": 677
},
{
"epoch": 0.5594059405940595,
"grad_norm": 0.4943746328353882,
"learning_rate": 4.920299015384888e-06,
"loss": 0.692,
"step": 678
},
{
"epoch": 0.5602310231023102,
"grad_norm": 0.4920106828212738,
"learning_rate": 4.920024476010808e-06,
"loss": 0.6911,
"step": 679
},
{
"epoch": 0.5610561056105611,
"grad_norm": 0.48839354515075684,
"learning_rate": 4.919749472293693e-06,
"loss": 0.709,
"step": 680
},
{
"epoch": 0.5618811881188119,
"grad_norm": 0.4939538538455963,
"learning_rate": 4.91947400428631e-06,
"loss": 0.7341,
"step": 681
},
{
"epoch": 0.5627062706270627,
"grad_norm": 0.5017114877700806,
"learning_rate": 4.919198072041515e-06,
"loss": 0.7234,
"step": 682
},
{
"epoch": 0.5635313531353136,
"grad_norm": 0.5011210441589355,
"learning_rate": 4.918921675612252e-06,
"loss": 0.7105,
"step": 683
},
{
"epoch": 0.5643564356435643,
"grad_norm": 0.5012781620025635,
"learning_rate": 4.918644815051554e-06,
"loss": 0.7018,
"step": 684
},
{
"epoch": 0.5651815181518152,
"grad_norm": 0.5037400126457214,
"learning_rate": 4.9183674904125455e-06,
"loss": 0.6873,
"step": 685
},
{
"epoch": 0.566006600660066,
"grad_norm": 0.4981100857257843,
"learning_rate": 4.918089701748436e-06,
"loss": 0.7274,
"step": 686
},
{
"epoch": 0.5668316831683168,
"grad_norm": 0.5186753273010254,
"learning_rate": 4.917811449112529e-06,
"loss": 0.719,
"step": 687
},
{
"epoch": 0.5676567656765676,
"grad_norm": 0.4970262944698334,
"learning_rate": 4.917532732558212e-06,
"loss": 0.6961,
"step": 688
},
{
"epoch": 0.5684818481848185,
"grad_norm": 0.5173365473747253,
"learning_rate": 4.9172535521389655e-06,
"loss": 0.743,
"step": 689
},
{
"epoch": 0.5693069306930693,
"grad_norm": 0.5220822691917419,
"learning_rate": 4.9169739079083564e-06,
"loss": 0.6913,
"step": 690
},
{
"epoch": 0.5701320132013201,
"grad_norm": 0.5165892839431763,
"learning_rate": 4.916693799920041e-06,
"loss": 0.7194,
"step": 691
},
{
"epoch": 0.570957095709571,
"grad_norm": 0.5441265106201172,
"learning_rate": 4.9164132282277665e-06,
"loss": 0.7358,
"step": 692
},
{
"epoch": 0.5717821782178217,
"grad_norm": 0.5081223845481873,
"learning_rate": 4.916132192885366e-06,
"loss": 0.7314,
"step": 693
},
{
"epoch": 0.5726072607260726,
"grad_norm": 0.48418229818344116,
"learning_rate": 4.915850693946766e-06,
"loss": 0.685,
"step": 694
},
{
"epoch": 0.5734323432343235,
"grad_norm": 0.5044229626655579,
"learning_rate": 4.915568731465977e-06,
"loss": 0.6891,
"step": 695
},
{
"epoch": 0.5742574257425742,
"grad_norm": 0.5088455677032471,
"learning_rate": 4.9152863054971e-06,
"loss": 0.7153,
"step": 696
},
{
"epoch": 0.5750825082508251,
"grad_norm": 0.5048734545707703,
"learning_rate": 4.915003416094327e-06,
"loss": 0.7097,
"step": 697
},
{
"epoch": 0.5759075907590759,
"grad_norm": 0.5223882794380188,
"learning_rate": 4.914720063311939e-06,
"loss": 0.6743,
"step": 698
},
{
"epoch": 0.5767326732673267,
"grad_norm": 0.49376392364501953,
"learning_rate": 4.914436247204301e-06,
"loss": 0.7214,
"step": 699
},
{
"epoch": 0.5775577557755776,
"grad_norm": 0.49322131276130676,
"learning_rate": 4.914151967825872e-06,
"loss": 0.7095,
"step": 700
},
{
"epoch": 0.5783828382838284,
"grad_norm": 0.5010339021682739,
"learning_rate": 4.913867225231197e-06,
"loss": 0.7048,
"step": 701
},
{
"epoch": 0.5792079207920792,
"grad_norm": 0.5402106642723083,
"learning_rate": 4.913582019474914e-06,
"loss": 0.7228,
"step": 702
},
{
"epoch": 0.58003300330033,
"grad_norm": 0.5094380378723145,
"learning_rate": 4.913296350611745e-06,
"loss": 0.7018,
"step": 703
},
{
"epoch": 0.5808580858085809,
"grad_norm": 0.48760926723480225,
"learning_rate": 4.913010218696502e-06,
"loss": 0.6736,
"step": 704
},
{
"epoch": 0.5816831683168316,
"grad_norm": 0.5021440386772156,
"learning_rate": 4.9127236237840885e-06,
"loss": 0.7187,
"step": 705
},
{
"epoch": 0.5825082508250825,
"grad_norm": 0.5223121047019958,
"learning_rate": 4.9124365659294935e-06,
"loss": 0.702,
"step": 706
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.5153369307518005,
"learning_rate": 4.912149045187797e-06,
"loss": 0.6944,
"step": 707
},
{
"epoch": 0.5841584158415841,
"grad_norm": 0.5183912515640259,
"learning_rate": 4.911861061614168e-06,
"loss": 0.6992,
"step": 708
},
{
"epoch": 0.584983498349835,
"grad_norm": 0.5021354556083679,
"learning_rate": 4.911572615263862e-06,
"loss": 0.6783,
"step": 709
},
{
"epoch": 0.5858085808580858,
"grad_norm": 0.49843043088912964,
"learning_rate": 4.9112837061922255e-06,
"loss": 0.6867,
"step": 710
},
{
"epoch": 0.5866336633663366,
"grad_norm": 0.5146470665931702,
"learning_rate": 4.9109943344546924e-06,
"loss": 0.6862,
"step": 711
},
{
"epoch": 0.5874587458745875,
"grad_norm": 0.49383649230003357,
"learning_rate": 4.910704500106786e-06,
"loss": 0.7119,
"step": 712
},
{
"epoch": 0.5882838283828383,
"grad_norm": 0.497063010931015,
"learning_rate": 4.91041420320412e-06,
"loss": 0.7091,
"step": 713
},
{
"epoch": 0.5891089108910891,
"grad_norm": 0.4999721944332123,
"learning_rate": 4.910123443802394e-06,
"loss": 0.7166,
"step": 714
},
{
"epoch": 0.5899339933993399,
"grad_norm": 0.49842125177383423,
"learning_rate": 4.909832221957397e-06,
"loss": 0.7051,
"step": 715
},
{
"epoch": 0.5907590759075908,
"grad_norm": 0.5021651387214661,
"learning_rate": 4.909540537725007e-06,
"loss": 0.7108,
"step": 716
},
{
"epoch": 0.5915841584158416,
"grad_norm": 0.5190720558166504,
"learning_rate": 4.909248391161193e-06,
"loss": 0.6969,
"step": 717
},
{
"epoch": 0.5924092409240924,
"grad_norm": 0.5224063992500305,
"learning_rate": 4.9089557823220096e-06,
"loss": 0.7128,
"step": 718
},
{
"epoch": 0.5932343234323433,
"grad_norm": 0.5178462266921997,
"learning_rate": 4.908662711263601e-06,
"loss": 0.7036,
"step": 719
},
{
"epoch": 0.594059405940594,
"grad_norm": 0.5114635825157166,
"learning_rate": 4.9083691780422e-06,
"loss": 0.7143,
"step": 720
},
{
"epoch": 0.5948844884488449,
"grad_norm": 0.5062857866287231,
"learning_rate": 4.90807518271413e-06,
"loss": 0.6911,
"step": 721
},
{
"epoch": 0.5957095709570958,
"grad_norm": 0.5059993267059326,
"learning_rate": 4.9077807253358e-06,
"loss": 0.6971,
"step": 722
},
{
"epoch": 0.5965346534653465,
"grad_norm": 0.5015937685966492,
"learning_rate": 4.9074858059637084e-06,
"loss": 0.699,
"step": 723
},
{
"epoch": 0.5973597359735974,
"grad_norm": 0.5120682716369629,
"learning_rate": 4.907190424654446e-06,
"loss": 0.6871,
"step": 724
},
{
"epoch": 0.5981848184818482,
"grad_norm": 0.5022520422935486,
"learning_rate": 4.906894581464687e-06,
"loss": 0.6961,
"step": 725
},
{
"epoch": 0.599009900990099,
"grad_norm": 0.481477290391922,
"learning_rate": 4.906598276451194e-06,
"loss": 0.6732,
"step": 726
},
{
"epoch": 0.5998349834983498,
"grad_norm": 0.5167291164398193,
"learning_rate": 4.906301509670826e-06,
"loss": 0.713,
"step": 727
},
{
"epoch": 0.6006600660066007,
"grad_norm": 0.5114957690238953,
"learning_rate": 4.906004281180521e-06,
"loss": 0.7362,
"step": 728
},
{
"epoch": 0.6014851485148515,
"grad_norm": 0.504904568195343,
"learning_rate": 4.905706591037313e-06,
"loss": 0.7097,
"step": 729
},
{
"epoch": 0.6023102310231023,
"grad_norm": 0.5066644549369812,
"learning_rate": 4.9054084392983185e-06,
"loss": 0.7078,
"step": 730
},
{
"epoch": 0.6031353135313532,
"grad_norm": 0.5130860209465027,
"learning_rate": 4.905109826020746e-06,
"loss": 0.7124,
"step": 731
},
{
"epoch": 0.6039603960396039,
"grad_norm": 0.5110951662063599,
"learning_rate": 4.904810751261894e-06,
"loss": 0.6983,
"step": 732
},
{
"epoch": 0.6047854785478548,
"grad_norm": 0.5051254630088806,
"learning_rate": 4.904511215079147e-06,
"loss": 0.7374,
"step": 733
},
{
"epoch": 0.6056105610561056,
"grad_norm": 0.4949505031108856,
"learning_rate": 4.904211217529976e-06,
"loss": 0.707,
"step": 734
},
{
"epoch": 0.6064356435643564,
"grad_norm": 0.5078654885292053,
"learning_rate": 4.903910758671946e-06,
"loss": 0.7112,
"step": 735
},
{
"epoch": 0.6072607260726073,
"grad_norm": 0.5021851062774658,
"learning_rate": 4.903609838562706e-06,
"loss": 0.6755,
"step": 736
},
{
"epoch": 0.608085808580858,
"grad_norm": 0.5080071687698364,
"learning_rate": 4.9033084572599966e-06,
"loss": 0.7008,
"step": 737
},
{
"epoch": 0.6089108910891089,
"grad_norm": 0.5117108225822449,
"learning_rate": 4.903006614821645e-06,
"loss": 0.6919,
"step": 738
},
{
"epoch": 0.6097359735973598,
"grad_norm": 0.5092887282371521,
"learning_rate": 4.902704311305566e-06,
"loss": 0.6827,
"step": 739
},
{
"epoch": 0.6105610561056105,
"grad_norm": 0.5054975152015686,
"learning_rate": 4.902401546769766e-06,
"loss": 0.6716,
"step": 740
},
{
"epoch": 0.6113861386138614,
"grad_norm": 0.4999292492866516,
"learning_rate": 4.9020983212723365e-06,
"loss": 0.6656,
"step": 741
},
{
"epoch": 0.6122112211221122,
"grad_norm": 0.5277410745620728,
"learning_rate": 4.90179463487146e-06,
"loss": 0.702,
"step": 742
},
{
"epoch": 0.613036303630363,
"grad_norm": 0.5241521596908569,
"learning_rate": 4.901490487625406e-06,
"loss": 0.6835,
"step": 743
},
{
"epoch": 0.6138613861386139,
"grad_norm": 0.5295579433441162,
"learning_rate": 4.901185879592534e-06,
"loss": 0.6853,
"step": 744
},
{
"epoch": 0.6146864686468647,
"grad_norm": 0.49168312549591064,
"learning_rate": 4.900880810831289e-06,
"loss": 0.7004,
"step": 745
},
{
"epoch": 0.6155115511551155,
"grad_norm": 0.496028333902359,
"learning_rate": 4.9005752814002076e-06,
"loss": 0.7039,
"step": 746
},
{
"epoch": 0.6163366336633663,
"grad_norm": 0.5242146849632263,
"learning_rate": 4.900269291357912e-06,
"loss": 0.7277,
"step": 747
},
{
"epoch": 0.6171617161716172,
"grad_norm": 0.5047608613967896,
"learning_rate": 4.899962840763115e-06,
"loss": 0.7096,
"step": 748
},
{
"epoch": 0.6179867986798679,
"grad_norm": 0.51358562707901,
"learning_rate": 4.899655929674617e-06,
"loss": 0.6923,
"step": 749
},
{
"epoch": 0.6188118811881188,
"grad_norm": 0.5006871819496155,
"learning_rate": 4.899348558151306e-06,
"loss": 0.7124,
"step": 750
},
{
"epoch": 0.6196369636963697,
"grad_norm": 0.5047678351402283,
"learning_rate": 4.89904072625216e-06,
"loss": 0.7228,
"step": 751
},
{
"epoch": 0.6204620462046204,
"grad_norm": 0.5035737156867981,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.6694,
"step": 752
},
{
"epoch": 0.6212871287128713,
"grad_norm": 0.5005902647972107,
"learning_rate": 4.898423681562711e-06,
"loss": 0.7096,
"step": 753
},
{
"epoch": 0.6221122112211221,
"grad_norm": 0.5169086456298828,
"learning_rate": 4.8981144688908035e-06,
"loss": 0.677,
"step": 754
},
{
"epoch": 0.6229372937293729,
"grad_norm": 0.5119456052780151,
"learning_rate": 4.897804796079852e-06,
"loss": 0.7085,
"step": 755
},
{
"epoch": 0.6237623762376238,
"grad_norm": 0.5147557854652405,
"learning_rate": 4.897494663189275e-06,
"loss": 0.7082,
"step": 756
},
{
"epoch": 0.6245874587458746,
"grad_norm": 0.5276811718940735,
"learning_rate": 4.897184070278579e-06,
"loss": 0.6951,
"step": 757
},
{
"epoch": 0.6254125412541254,
"grad_norm": 0.5133523344993591,
"learning_rate": 4.89687301740736e-06,
"loss": 0.7086,
"step": 758
},
{
"epoch": 0.6262376237623762,
"grad_norm": 0.5005844831466675,
"learning_rate": 4.8965615046353e-06,
"loss": 0.6754,
"step": 759
},
{
"epoch": 0.6270627062706271,
"grad_norm": 0.529815137386322,
"learning_rate": 4.8962495320221714e-06,
"loss": 0.6916,
"step": 760
},
{
"epoch": 0.6278877887788779,
"grad_norm": 0.49290475249290466,
"learning_rate": 4.895937099627834e-06,
"loss": 0.6809,
"step": 761
},
{
"epoch": 0.6287128712871287,
"grad_norm": 0.5003436207771301,
"learning_rate": 4.895624207512237e-06,
"loss": 0.7153,
"step": 762
},
{
"epoch": 0.6295379537953796,
"grad_norm": 0.5121909379959106,
"learning_rate": 4.895310855735415e-06,
"loss": 0.6851,
"step": 763
},
{
"epoch": 0.6303630363036303,
"grad_norm": 0.5076280832290649,
"learning_rate": 4.894997044357492e-06,
"loss": 0.6824,
"step": 764
},
{
"epoch": 0.6311881188118812,
"grad_norm": 0.49783045053482056,
"learning_rate": 4.894682773438683e-06,
"loss": 0.6987,
"step": 765
},
{
"epoch": 0.6320132013201321,
"grad_norm": 0.5217746496200562,
"learning_rate": 4.894368043039286e-06,
"loss": 0.6937,
"step": 766
},
{
"epoch": 0.6328382838283828,
"grad_norm": 0.529611349105835,
"learning_rate": 4.894052853219693e-06,
"loss": 0.7008,
"step": 767
},
{
"epoch": 0.6336633663366337,
"grad_norm": 0.5516501069068909,
"learning_rate": 4.893737204040378e-06,
"loss": 0.6885,
"step": 768
},
{
"epoch": 0.6344884488448845,
"grad_norm": 0.5219238996505737,
"learning_rate": 4.89342109556191e-06,
"loss": 0.7011,
"step": 769
},
{
"epoch": 0.6353135313531353,
"grad_norm": 0.5126083493232727,
"learning_rate": 4.89310452784494e-06,
"loss": 0.7126,
"step": 770
},
{
"epoch": 0.6361386138613861,
"grad_norm": 0.49892565608024597,
"learning_rate": 4.892787500950209e-06,
"loss": 0.6844,
"step": 771
},
{
"epoch": 0.636963696369637,
"grad_norm": 0.5378970503807068,
"learning_rate": 4.892470014938548e-06,
"loss": 0.7181,
"step": 772
},
{
"epoch": 0.6377887788778878,
"grad_norm": 0.5072008371353149,
"learning_rate": 4.892152069870874e-06,
"loss": 0.7237,
"step": 773
},
{
"epoch": 0.6386138613861386,
"grad_norm": 0.5335053205490112,
"learning_rate": 4.891833665808195e-06,
"loss": 0.7044,
"step": 774
},
{
"epoch": 0.6394389438943895,
"grad_norm": 0.503537118434906,
"learning_rate": 4.891514802811601e-06,
"loss": 0.6794,
"step": 775
},
{
"epoch": 0.6402640264026402,
"grad_norm": 0.520697295665741,
"learning_rate": 4.891195480942277e-06,
"loss": 0.699,
"step": 776
},
{
"epoch": 0.6410891089108911,
"grad_norm": 0.5216426849365234,
"learning_rate": 4.890875700261492e-06,
"loss": 0.7019,
"step": 777
},
{
"epoch": 0.641914191419142,
"grad_norm": 0.5011724829673767,
"learning_rate": 4.890555460830604e-06,
"loss": 0.7019,
"step": 778
},
{
"epoch": 0.6427392739273927,
"grad_norm": 0.507106363773346,
"learning_rate": 4.890234762711059e-06,
"loss": 0.6956,
"step": 779
},
{
"epoch": 0.6435643564356436,
"grad_norm": 0.5091033577919006,
"learning_rate": 4.889913605964391e-06,
"loss": 0.6891,
"step": 780
},
{
"epoch": 0.6443894389438944,
"grad_norm": 0.516913115978241,
"learning_rate": 4.889591990652222e-06,
"loss": 0.692,
"step": 781
},
{
"epoch": 0.6452145214521452,
"grad_norm": 0.5138155817985535,
"learning_rate": 4.8892699168362626e-06,
"loss": 0.7113,
"step": 782
},
{
"epoch": 0.6460396039603961,
"grad_norm": 0.5041294693946838,
"learning_rate": 4.88894738457831e-06,
"loss": 0.6814,
"step": 783
},
{
"epoch": 0.6468646864686468,
"grad_norm": 0.5104554891586304,
"learning_rate": 4.888624393940251e-06,
"loss": 0.6742,
"step": 784
},
{
"epoch": 0.6476897689768977,
"grad_norm": 0.5303329229354858,
"learning_rate": 4.888300944984059e-06,
"loss": 0.6837,
"step": 785
},
{
"epoch": 0.6485148514851485,
"grad_norm": 0.5213814377784729,
"learning_rate": 4.887977037771797e-06,
"loss": 0.6708,
"step": 786
},
{
"epoch": 0.6493399339933993,
"grad_norm": 0.5052602291107178,
"learning_rate": 4.887652672365613e-06,
"loss": 0.7113,
"step": 787
},
{
"epoch": 0.6501650165016502,
"grad_norm": 0.5011994242668152,
"learning_rate": 4.887327848827746e-06,
"loss": 0.6981,
"step": 788
},
{
"epoch": 0.650990099009901,
"grad_norm": 0.5150925517082214,
"learning_rate": 4.887002567220521e-06,
"loss": 0.7011,
"step": 789
},
{
"epoch": 0.6518151815181518,
"grad_norm": 0.5412828326225281,
"learning_rate": 4.886676827606352e-06,
"loss": 0.6775,
"step": 790
},
{
"epoch": 0.6526402640264026,
"grad_norm": 0.49732375144958496,
"learning_rate": 4.886350630047741e-06,
"loss": 0.6911,
"step": 791
},
{
"epoch": 0.6534653465346535,
"grad_norm": 0.531814455986023,
"learning_rate": 4.886023974607275e-06,
"loss": 0.6921,
"step": 792
},
{
"epoch": 0.6542904290429042,
"grad_norm": 0.5216978192329407,
"learning_rate": 4.885696861347633e-06,
"loss": 0.7006,
"step": 793
},
{
"epoch": 0.6551155115511551,
"grad_norm": 0.5057194232940674,
"learning_rate": 4.8853692903315796e-06,
"loss": 0.688,
"step": 794
},
{
"epoch": 0.655940594059406,
"grad_norm": 0.5147150754928589,
"learning_rate": 4.885041261621967e-06,
"loss": 0.7002,
"step": 795
},
{
"epoch": 0.6567656765676567,
"grad_norm": 0.5100370645523071,
"learning_rate": 4.884712775281737e-06,
"loss": 0.6832,
"step": 796
},
{
"epoch": 0.6575907590759076,
"grad_norm": 0.5073105096817017,
"learning_rate": 4.884383831373918e-06,
"loss": 0.7098,
"step": 797
},
{
"epoch": 0.6584158415841584,
"grad_norm": 0.529285728931427,
"learning_rate": 4.884054429961625e-06,
"loss": 0.6986,
"step": 798
},
{
"epoch": 0.6592409240924092,
"grad_norm": 0.5070759654045105,
"learning_rate": 4.8837245711080626e-06,
"loss": 0.7007,
"step": 799
},
{
"epoch": 0.6600660066006601,
"grad_norm": 0.5227763652801514,
"learning_rate": 4.883394254876523e-06,
"loss": 0.6642,
"step": 800
},
{
"epoch": 0.6608910891089109,
"grad_norm": 0.5032903552055359,
"learning_rate": 4.883063481330384e-06,
"loss": 0.7151,
"step": 801
},
{
"epoch": 0.6617161716171617,
"grad_norm": 0.5112184882164001,
"learning_rate": 4.8827322505331155e-06,
"loss": 0.6944,
"step": 802
},
{
"epoch": 0.6625412541254125,
"grad_norm": 0.49932217597961426,
"learning_rate": 4.882400562548271e-06,
"loss": 0.7018,
"step": 803
},
{
"epoch": 0.6633663366336634,
"grad_norm": 0.5168468952178955,
"learning_rate": 4.8820684174394935e-06,
"loss": 0.7097,
"step": 804
},
{
"epoch": 0.6641914191419142,
"grad_norm": 0.5187894105911255,
"learning_rate": 4.881735815270513e-06,
"loss": 0.7187,
"step": 805
},
{
"epoch": 0.665016501650165,
"grad_norm": 0.5122649073600769,
"learning_rate": 4.881402756105149e-06,
"loss": 0.6636,
"step": 806
},
{
"epoch": 0.6658415841584159,
"grad_norm": 0.5010491013526917,
"learning_rate": 4.8810692400073065e-06,
"loss": 0.696,
"step": 807
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.5105947852134705,
"learning_rate": 4.880735267040978e-06,
"loss": 0.6849,
"step": 808
},
{
"epoch": 0.6674917491749175,
"grad_norm": 0.5258045196533203,
"learning_rate": 4.880400837270246e-06,
"loss": 0.6923,
"step": 809
},
{
"epoch": 0.6683168316831684,
"grad_norm": 0.5446014404296875,
"learning_rate": 4.88006595075928e-06,
"loss": 0.7248,
"step": 810
},
{
"epoch": 0.6691419141914191,
"grad_norm": 0.5149955749511719,
"learning_rate": 4.879730607572334e-06,
"loss": 0.7088,
"step": 811
},
{
"epoch": 0.66996699669967,
"grad_norm": 0.5132419466972351,
"learning_rate": 4.879394807773755e-06,
"loss": 0.7184,
"step": 812
},
{
"epoch": 0.6707920792079208,
"grad_norm": 0.5261136293411255,
"learning_rate": 4.879058551427972e-06,
"loss": 0.7055,
"step": 813
},
{
"epoch": 0.6716171617161716,
"grad_norm": 0.5141221880912781,
"learning_rate": 4.878721838599506e-06,
"loss": 0.6954,
"step": 814
},
{
"epoch": 0.6724422442244224,
"grad_norm": 0.5086462497711182,
"learning_rate": 4.878384669352964e-06,
"loss": 0.6901,
"step": 815
},
{
"epoch": 0.6732673267326733,
"grad_norm": 0.5050605535507202,
"learning_rate": 4.878047043753039e-06,
"loss": 0.7118,
"step": 816
},
{
"epoch": 0.6740924092409241,
"grad_norm": 0.52677321434021,
"learning_rate": 4.8777089618645146e-06,
"loss": 0.678,
"step": 817
},
{
"epoch": 0.6749174917491749,
"grad_norm": 0.5086498260498047,
"learning_rate": 4.877370423752259e-06,
"loss": 0.6868,
"step": 818
},
{
"epoch": 0.6757425742574258,
"grad_norm": 0.5587587952613831,
"learning_rate": 4.87703142948123e-06,
"loss": 0.7176,
"step": 819
},
{
"epoch": 0.6765676567656765,
"grad_norm": 0.5186298489570618,
"learning_rate": 4.876691979116474e-06,
"loss": 0.7037,
"step": 820
},
{
"epoch": 0.6773927392739274,
"grad_norm": 0.5174757242202759,
"learning_rate": 4.87635207272312e-06,
"loss": 0.7068,
"step": 821
},
{
"epoch": 0.6782178217821783,
"grad_norm": 0.5191114544868469,
"learning_rate": 4.876011710366389e-06,
"loss": 0.6974,
"step": 822
},
{
"epoch": 0.679042904290429,
"grad_norm": 0.506994903087616,
"learning_rate": 4.875670892111589e-06,
"loss": 0.697,
"step": 823
},
{
"epoch": 0.6798679867986799,
"grad_norm": 0.5108266472816467,
"learning_rate": 4.875329618024113e-06,
"loss": 0.693,
"step": 824
},
{
"epoch": 0.6806930693069307,
"grad_norm": 0.5139864087104797,
"learning_rate": 4.874987888169445e-06,
"loss": 0.6934,
"step": 825
},
{
"epoch": 0.6815181518151815,
"grad_norm": 0.5133042931556702,
"learning_rate": 4.874645702613152e-06,
"loss": 0.7016,
"step": 826
},
{
"epoch": 0.6823432343234324,
"grad_norm": 0.5109182000160217,
"learning_rate": 4.874303061420893e-06,
"loss": 0.71,
"step": 827
},
{
"epoch": 0.6831683168316832,
"grad_norm": 0.49460044503211975,
"learning_rate": 4.8739599646584126e-06,
"loss": 0.7166,
"step": 828
},
{
"epoch": 0.683993399339934,
"grad_norm": 0.507247269153595,
"learning_rate": 4.873616412391541e-06,
"loss": 0.6971,
"step": 829
},
{
"epoch": 0.6848184818481848,
"grad_norm": 0.5000693798065186,
"learning_rate": 4.873272404686199e-06,
"loss": 0.6985,
"step": 830
},
{
"epoch": 0.6856435643564357,
"grad_norm": 0.5239745378494263,
"learning_rate": 4.872927941608392e-06,
"loss": 0.7111,
"step": 831
},
{
"epoch": 0.6864686468646864,
"grad_norm": 0.5349238514900208,
"learning_rate": 4.872583023224215e-06,
"loss": 0.6929,
"step": 832
},
{
"epoch": 0.6872937293729373,
"grad_norm": 0.5302279591560364,
"learning_rate": 4.872237649599848e-06,
"loss": 0.6902,
"step": 833
},
{
"epoch": 0.6881188118811881,
"grad_norm": 0.4991590082645416,
"learning_rate": 4.871891820801561e-06,
"loss": 0.7048,
"step": 834
},
{
"epoch": 0.6889438943894389,
"grad_norm": 0.5134434700012207,
"learning_rate": 4.871545536895709e-06,
"loss": 0.7008,
"step": 835
},
{
"epoch": 0.6897689768976898,
"grad_norm": 0.5120382905006409,
"learning_rate": 4.871198797948736e-06,
"loss": 0.6798,
"step": 836
},
{
"epoch": 0.6905940594059405,
"grad_norm": 0.5111243724822998,
"learning_rate": 4.870851604027173e-06,
"loss": 0.7356,
"step": 837
},
{
"epoch": 0.6914191419141914,
"grad_norm": 0.5056900978088379,
"learning_rate": 4.870503955197638e-06,
"loss": 0.6962,
"step": 838
},
{
"epoch": 0.6922442244224423,
"grad_norm": 0.5044076442718506,
"learning_rate": 4.870155851526834e-06,
"loss": 0.6862,
"step": 839
},
{
"epoch": 0.693069306930693,
"grad_norm": 0.5099409222602844,
"learning_rate": 4.869807293081555e-06,
"loss": 0.7154,
"step": 840
},
{
"epoch": 0.6938943894389439,
"grad_norm": 0.5157263278961182,
"learning_rate": 4.869458279928682e-06,
"loss": 0.6567,
"step": 841
},
{
"epoch": 0.6947194719471947,
"grad_norm": 0.5214593410491943,
"learning_rate": 4.869108812135181e-06,
"loss": 0.7135,
"step": 842
},
{
"epoch": 0.6955445544554455,
"grad_norm": 0.5076237916946411,
"learning_rate": 4.868758889768106e-06,
"loss": 0.703,
"step": 843
},
{
"epoch": 0.6963696369636964,
"grad_norm": 0.5151360630989075,
"learning_rate": 4.868408512894599e-06,
"loss": 0.6772,
"step": 844
},
{
"epoch": 0.6971947194719472,
"grad_norm": 0.5417853593826294,
"learning_rate": 4.868057681581888e-06,
"loss": 0.6825,
"step": 845
},
{
"epoch": 0.698019801980198,
"grad_norm": 0.5326299667358398,
"learning_rate": 4.8677063958972895e-06,
"loss": 0.6777,
"step": 846
},
{
"epoch": 0.6988448844884488,
"grad_norm": 0.5056005716323853,
"learning_rate": 4.867354655908206e-06,
"loss": 0.737,
"step": 847
},
{
"epoch": 0.6996699669966997,
"grad_norm": 0.5118707418441772,
"learning_rate": 4.867002461682129e-06,
"loss": 0.6844,
"step": 848
},
{
"epoch": 0.7004950495049505,
"grad_norm": 0.5164377093315125,
"learning_rate": 4.866649813286634e-06,
"loss": 0.6765,
"step": 849
},
{
"epoch": 0.7013201320132013,
"grad_norm": 0.5244525074958801,
"learning_rate": 4.866296710789387e-06,
"loss": 0.6837,
"step": 850
},
{
"epoch": 0.7021452145214522,
"grad_norm": 0.5185418128967285,
"learning_rate": 4.865943154258138e-06,
"loss": 0.7103,
"step": 851
},
{
"epoch": 0.7029702970297029,
"grad_norm": 0.5295969843864441,
"learning_rate": 4.8655891437607285e-06,
"loss": 0.7153,
"step": 852
},
{
"epoch": 0.7037953795379538,
"grad_norm": 0.5205761194229126,
"learning_rate": 4.865234679365082e-06,
"loss": 0.69,
"step": 853
},
{
"epoch": 0.7046204620462047,
"grad_norm": 0.5231923460960388,
"learning_rate": 4.864879761139212e-06,
"loss": 0.7192,
"step": 854
},
{
"epoch": 0.7054455445544554,
"grad_norm": 0.5015358328819275,
"learning_rate": 4.864524389151219e-06,
"loss": 0.7005,
"step": 855
},
{
"epoch": 0.7062706270627063,
"grad_norm": 0.538537323474884,
"learning_rate": 4.8641685634692905e-06,
"loss": 0.7083,
"step": 856
},
{
"epoch": 0.7070957095709571,
"grad_norm": 0.5275318622589111,
"learning_rate": 4.8638122841616994e-06,
"loss": 0.6984,
"step": 857
},
{
"epoch": 0.7079207920792079,
"grad_norm": 0.5181031823158264,
"learning_rate": 4.863455551296808e-06,
"loss": 0.6884,
"step": 858
},
{
"epoch": 0.7087458745874587,
"grad_norm": 0.5242864489555359,
"learning_rate": 4.863098364943065e-06,
"loss": 0.6954,
"step": 859
},
{
"epoch": 0.7095709570957096,
"grad_norm": 0.5044652819633484,
"learning_rate": 4.862740725169004e-06,
"loss": 0.6902,
"step": 860
},
{
"epoch": 0.7103960396039604,
"grad_norm": 0.5207858085632324,
"learning_rate": 4.8623826320432486e-06,
"loss": 0.6509,
"step": 861
},
{
"epoch": 0.7112211221122112,
"grad_norm": 0.5226578712463379,
"learning_rate": 4.8620240856345075e-06,
"loss": 0.695,
"step": 862
},
{
"epoch": 0.7120462046204621,
"grad_norm": 0.5265222191810608,
"learning_rate": 4.8616650860115766e-06,
"loss": 0.6764,
"step": 863
},
{
"epoch": 0.7128712871287128,
"grad_norm": 0.5239368677139282,
"learning_rate": 4.86130563324334e-06,
"loss": 0.6903,
"step": 864
},
{
"epoch": 0.7136963696369637,
"grad_norm": 0.5724884271621704,
"learning_rate": 4.860945727398767e-06,
"loss": 0.6824,
"step": 865
},
{
"epoch": 0.7145214521452146,
"grad_norm": 0.5248749256134033,
"learning_rate": 4.860585368546915e-06,
"loss": 0.6789,
"step": 866
},
{
"epoch": 0.7153465346534653,
"grad_norm": 0.5303323864936829,
"learning_rate": 4.8602245567569275e-06,
"loss": 0.6902,
"step": 867
},
{
"epoch": 0.7161716171617162,
"grad_norm": 0.5075418949127197,
"learning_rate": 4.859863292098036e-06,
"loss": 0.6688,
"step": 868
},
{
"epoch": 0.716996699669967,
"grad_norm": 0.5122492909431458,
"learning_rate": 4.859501574639558e-06,
"loss": 0.6973,
"step": 869
},
{
"epoch": 0.7178217821782178,
"grad_norm": 0.5190475583076477,
"learning_rate": 4.8591394044508985e-06,
"loss": 0.6814,
"step": 870
},
{
"epoch": 0.7186468646864687,
"grad_norm": 0.5296965837478638,
"learning_rate": 4.858776781601549e-06,
"loss": 0.7114,
"step": 871
},
{
"epoch": 0.7194719471947195,
"grad_norm": 0.5140368938446045,
"learning_rate": 4.858413706161087e-06,
"loss": 0.6939,
"step": 872
},
{
"epoch": 0.7202970297029703,
"grad_norm": 0.5177188515663147,
"learning_rate": 4.858050178199179e-06,
"loss": 0.6884,
"step": 873
},
{
"epoch": 0.7211221122112211,
"grad_norm": 0.5166962146759033,
"learning_rate": 4.857686197785576e-06,
"loss": 0.6941,
"step": 874
},
{
"epoch": 0.721947194719472,
"grad_norm": 0.5186828970909119,
"learning_rate": 4.857321764990118e-06,
"loss": 0.6668,
"step": 875
},
{
"epoch": 0.7227722772277227,
"grad_norm": 0.5125436186790466,
"learning_rate": 4.85695687988273e-06,
"loss": 0.6797,
"step": 876
},
{
"epoch": 0.7235973597359736,
"grad_norm": 0.52864670753479,
"learning_rate": 4.8565915425334235e-06,
"loss": 0.6886,
"step": 877
},
{
"epoch": 0.7244224422442245,
"grad_norm": 0.5181275606155396,
"learning_rate": 4.856225753012299e-06,
"loss": 0.6788,
"step": 878
},
{
"epoch": 0.7252475247524752,
"grad_norm": 0.5550081133842468,
"learning_rate": 4.8558595113895426e-06,
"loss": 0.7227,
"step": 879
},
{
"epoch": 0.7260726072607261,
"grad_norm": 0.5321540832519531,
"learning_rate": 4.855492817735425e-06,
"loss": 0.6892,
"step": 880
},
{
"epoch": 0.726897689768977,
"grad_norm": 0.527124285697937,
"learning_rate": 4.8551256721203094e-06,
"loss": 0.6916,
"step": 881
},
{
"epoch": 0.7277227722772277,
"grad_norm": 0.5251095294952393,
"learning_rate": 4.854758074614639e-06,
"loss": 0.719,
"step": 882
},
{
"epoch": 0.7285478547854786,
"grad_norm": 0.5159496068954468,
"learning_rate": 4.854390025288948e-06,
"loss": 0.7029,
"step": 883
},
{
"epoch": 0.7293729372937293,
"grad_norm": 0.5572521090507507,
"learning_rate": 4.854021524213855e-06,
"loss": 0.6813,
"step": 884
},
{
"epoch": 0.7301980198019802,
"grad_norm": 0.5227091908454895,
"learning_rate": 4.853652571460067e-06,
"loss": 0.6545,
"step": 885
},
{
"epoch": 0.731023102310231,
"grad_norm": 0.5276998281478882,
"learning_rate": 4.853283167098376e-06,
"loss": 0.6736,
"step": 886
},
{
"epoch": 0.7318481848184818,
"grad_norm": 0.5453947186470032,
"learning_rate": 4.852913311199663e-06,
"loss": 0.7141,
"step": 887
},
{
"epoch": 0.7326732673267327,
"grad_norm": 0.5115426778793335,
"learning_rate": 4.852543003834894e-06,
"loss": 0.6854,
"step": 888
},
{
"epoch": 0.7334983498349835,
"grad_norm": 0.530997633934021,
"learning_rate": 4.852172245075121e-06,
"loss": 0.6976,
"step": 889
},
{
"epoch": 0.7343234323432343,
"grad_norm": 0.5112131834030151,
"learning_rate": 4.851801034991484e-06,
"loss": 0.7007,
"step": 890
},
{
"epoch": 0.7351485148514851,
"grad_norm": 0.509002685546875,
"learning_rate": 4.851429373655208e-06,
"loss": 0.6736,
"step": 891
},
{
"epoch": 0.735973597359736,
"grad_norm": 0.532152533531189,
"learning_rate": 4.851057261137608e-06,
"loss": 0.6868,
"step": 892
},
{
"epoch": 0.7367986798679867,
"grad_norm": 0.5529546141624451,
"learning_rate": 4.850684697510082e-06,
"loss": 0.7098,
"step": 893
},
{
"epoch": 0.7376237623762376,
"grad_norm": 0.5317026972770691,
"learning_rate": 4.850311682844115e-06,
"loss": 0.6951,
"step": 894
},
{
"epoch": 0.7384488448844885,
"grad_norm": 0.5340789556503296,
"learning_rate": 4.84993821721128e-06,
"loss": 0.7039,
"step": 895
},
{
"epoch": 0.7392739273927392,
"grad_norm": 0.5217326283454895,
"learning_rate": 4.849564300683235e-06,
"loss": 0.6869,
"step": 896
},
{
"epoch": 0.7400990099009901,
"grad_norm": 0.5279721021652222,
"learning_rate": 4.849189933331727e-06,
"loss": 0.7007,
"step": 897
},
{
"epoch": 0.740924092409241,
"grad_norm": 0.5272139310836792,
"learning_rate": 4.848815115228587e-06,
"loss": 0.6999,
"step": 898
},
{
"epoch": 0.7417491749174917,
"grad_norm": 0.5041061043739319,
"learning_rate": 4.848439846445732e-06,
"loss": 0.6941,
"step": 899
},
{
"epoch": 0.7425742574257426,
"grad_norm": 0.5143195986747742,
"learning_rate": 4.84806412705517e-06,
"loss": 0.6728,
"step": 900
},
{
"epoch": 0.7433993399339934,
"grad_norm": 0.5403009653091431,
"learning_rate": 4.84768795712899e-06,
"loss": 0.6737,
"step": 901
},
{
"epoch": 0.7442244224422442,
"grad_norm": 0.5111673474311829,
"learning_rate": 4.84731133673937e-06,
"loss": 0.6987,
"step": 902
},
{
"epoch": 0.745049504950495,
"grad_norm": 0.5134066343307495,
"learning_rate": 4.846934265958575e-06,
"loss": 0.7023,
"step": 903
},
{
"epoch": 0.7458745874587459,
"grad_norm": 0.5278534889221191,
"learning_rate": 4.846556744858953e-06,
"loss": 0.6627,
"step": 904
},
{
"epoch": 0.7466996699669967,
"grad_norm": 0.5452783107757568,
"learning_rate": 4.846178773512945e-06,
"loss": 0.7039,
"step": 905
},
{
"epoch": 0.7475247524752475,
"grad_norm": 0.5415001511573792,
"learning_rate": 4.845800351993072e-06,
"loss": 0.7216,
"step": 906
},
{
"epoch": 0.7483498349834984,
"grad_norm": 0.5375738143920898,
"learning_rate": 4.845421480371943e-06,
"loss": 0.6907,
"step": 907
},
{
"epoch": 0.7491749174917491,
"grad_norm": 0.5274370312690735,
"learning_rate": 4.8450421587222565e-06,
"loss": 0.7073,
"step": 908
},
{
"epoch": 0.75,
"grad_norm": 0.5074566006660461,
"learning_rate": 4.844662387116793e-06,
"loss": 0.7051,
"step": 909
},
{
"epoch": 0.7508250825082509,
"grad_norm": 0.5361227989196777,
"learning_rate": 4.844282165628422e-06,
"loss": 0.6943,
"step": 910
},
{
"epoch": 0.7516501650165016,
"grad_norm": 0.5223832130432129,
"learning_rate": 4.843901494330099e-06,
"loss": 0.6883,
"step": 911
},
{
"epoch": 0.7524752475247525,
"grad_norm": 0.5316516757011414,
"learning_rate": 4.8435203732948644e-06,
"loss": 0.6857,
"step": 912
},
{
"epoch": 0.7533003300330033,
"grad_norm": 0.5260957479476929,
"learning_rate": 4.843138802595847e-06,
"loss": 0.7223,
"step": 913
},
{
"epoch": 0.7541254125412541,
"grad_norm": 0.5405908226966858,
"learning_rate": 4.842756782306261e-06,
"loss": 0.7023,
"step": 914
},
{
"epoch": 0.754950495049505,
"grad_norm": 0.5285031795501709,
"learning_rate": 4.842374312499405e-06,
"loss": 0.6847,
"step": 915
},
{
"epoch": 0.7557755775577558,
"grad_norm": 0.5381676554679871,
"learning_rate": 4.841991393248667e-06,
"loss": 0.6667,
"step": 916
},
{
"epoch": 0.7566006600660066,
"grad_norm": 0.5372406840324402,
"learning_rate": 4.841608024627519e-06,
"loss": 0.7108,
"step": 917
},
{
"epoch": 0.7574257425742574,
"grad_norm": 0.5164246559143066,
"learning_rate": 4.841224206709521e-06,
"loss": 0.6913,
"step": 918
},
{
"epoch": 0.7582508250825083,
"grad_norm": 0.5240136384963989,
"learning_rate": 4.840839939568317e-06,
"loss": 0.6998,
"step": 919
},
{
"epoch": 0.759075907590759,
"grad_norm": 0.5204344987869263,
"learning_rate": 4.840455223277639e-06,
"loss": 0.6873,
"step": 920
},
{
"epoch": 0.7599009900990099,
"grad_norm": 0.5269798636436462,
"learning_rate": 4.8400700579113055e-06,
"loss": 0.6638,
"step": 921
},
{
"epoch": 0.7607260726072608,
"grad_norm": 0.5244765877723694,
"learning_rate": 4.839684443543218e-06,
"loss": 0.6875,
"step": 922
},
{
"epoch": 0.7615511551155115,
"grad_norm": 0.5180036425590515,
"learning_rate": 4.839298380247368e-06,
"loss": 0.6912,
"step": 923
},
{
"epoch": 0.7623762376237624,
"grad_norm": 0.5370272994041443,
"learning_rate": 4.838911868097832e-06,
"loss": 0.6948,
"step": 924
},
{
"epoch": 0.7632013201320133,
"grad_norm": 0.5375697016716003,
"learning_rate": 4.83852490716877e-06,
"loss": 0.6939,
"step": 925
},
{
"epoch": 0.764026402640264,
"grad_norm": 0.5340301394462585,
"learning_rate": 4.838137497534433e-06,
"loss": 0.6722,
"step": 926
},
{
"epoch": 0.7648514851485149,
"grad_norm": 0.5227602124214172,
"learning_rate": 4.837749639269153e-06,
"loss": 0.6788,
"step": 927
},
{
"epoch": 0.7656765676567657,
"grad_norm": 0.5335901975631714,
"learning_rate": 4.8373613324473515e-06,
"loss": 0.6758,
"step": 928
},
{
"epoch": 0.7665016501650165,
"grad_norm": 0.519488513469696,
"learning_rate": 4.836972577143535e-06,
"loss": 0.6944,
"step": 929
},
{
"epoch": 0.7673267326732673,
"grad_norm": 0.5218436121940613,
"learning_rate": 4.836583373432296e-06,
"loss": 0.6949,
"step": 930
},
{
"epoch": 0.7681518151815182,
"grad_norm": 0.5284325480461121,
"learning_rate": 4.836193721388313e-06,
"loss": 0.6852,
"step": 931
},
{
"epoch": 0.768976897689769,
"grad_norm": 0.5303220152854919,
"learning_rate": 4.83580362108635e-06,
"loss": 0.6743,
"step": 932
},
{
"epoch": 0.7698019801980198,
"grad_norm": 0.5296517610549927,
"learning_rate": 4.835413072601259e-06,
"loss": 0.6946,
"step": 933
},
{
"epoch": 0.7706270627062707,
"grad_norm": 0.5141308307647705,
"learning_rate": 4.835022076007976e-06,
"loss": 0.6888,
"step": 934
},
{
"epoch": 0.7714521452145214,
"grad_norm": 0.5189119577407837,
"learning_rate": 4.834630631381524e-06,
"loss": 0.6854,
"step": 935
},
{
"epoch": 0.7722772277227723,
"grad_norm": 0.5247241854667664,
"learning_rate": 4.8342387387970105e-06,
"loss": 0.7161,
"step": 936
},
{
"epoch": 0.773102310231023,
"grad_norm": 0.5307490229606628,
"learning_rate": 4.83384639832963e-06,
"loss": 0.6736,
"step": 937
},
{
"epoch": 0.7739273927392739,
"grad_norm": 0.5190678834915161,
"learning_rate": 4.833453610054665e-06,
"loss": 0.6782,
"step": 938
},
{
"epoch": 0.7747524752475248,
"grad_norm": 0.5487401485443115,
"learning_rate": 4.833060374047479e-06,
"loss": 0.7075,
"step": 939
},
{
"epoch": 0.7755775577557755,
"grad_norm": 0.5287332534790039,
"learning_rate": 4.832666690383526e-06,
"loss": 0.6936,
"step": 940
},
{
"epoch": 0.7764026402640264,
"grad_norm": 0.5404759049415588,
"learning_rate": 4.832272559138345e-06,
"loss": 0.6818,
"step": 941
},
{
"epoch": 0.7772277227722773,
"grad_norm": 0.5293354988098145,
"learning_rate": 4.831877980387558e-06,
"loss": 0.6965,
"step": 942
},
{
"epoch": 0.778052805280528,
"grad_norm": 0.5324503183364868,
"learning_rate": 4.831482954206877e-06,
"loss": 0.7106,
"step": 943
},
{
"epoch": 0.7788778877887789,
"grad_norm": 0.5342603325843811,
"learning_rate": 4.831087480672095e-06,
"loss": 0.6779,
"step": 944
},
{
"epoch": 0.7797029702970297,
"grad_norm": 0.5596241354942322,
"learning_rate": 4.830691559859098e-06,
"loss": 0.6838,
"step": 945
},
{
"epoch": 0.7805280528052805,
"grad_norm": 0.5506138801574707,
"learning_rate": 4.830295191843848e-06,
"loss": 0.6828,
"step": 946
},
{
"epoch": 0.7813531353135313,
"grad_norm": 0.532753586769104,
"learning_rate": 4.829898376702403e-06,
"loss": 0.6913,
"step": 947
},
{
"epoch": 0.7821782178217822,
"grad_norm": 0.5353578329086304,
"learning_rate": 4.8295011145108995e-06,
"loss": 0.7041,
"step": 948
},
{
"epoch": 0.783003300330033,
"grad_norm": 0.5337108373641968,
"learning_rate": 4.829103405345563e-06,
"loss": 0.6887,
"step": 949
},
{
"epoch": 0.7838283828382838,
"grad_norm": 0.5403887033462524,
"learning_rate": 4.828705249282704e-06,
"loss": 0.6977,
"step": 950
},
{
"epoch": 0.7846534653465347,
"grad_norm": 0.518827497959137,
"learning_rate": 4.8283066463987185e-06,
"loss": 0.6974,
"step": 951
},
{
"epoch": 0.7854785478547854,
"grad_norm": 0.5307276248931885,
"learning_rate": 4.827907596770089e-06,
"loss": 0.6946,
"step": 952
},
{
"epoch": 0.7863036303630363,
"grad_norm": 0.5107436180114746,
"learning_rate": 4.827508100473384e-06,
"loss": 0.6929,
"step": 953
},
{
"epoch": 0.7871287128712872,
"grad_norm": 0.5285372138023376,
"learning_rate": 4.8271081575852555e-06,
"loss": 0.7141,
"step": 954
},
{
"epoch": 0.7879537953795379,
"grad_norm": 0.5414252281188965,
"learning_rate": 4.8267077681824425e-06,
"loss": 0.6888,
"step": 955
},
{
"epoch": 0.7887788778877888,
"grad_norm": 0.5231629014015198,
"learning_rate": 4.826306932341772e-06,
"loss": 0.6945,
"step": 956
},
{
"epoch": 0.7896039603960396,
"grad_norm": 0.5293689370155334,
"learning_rate": 4.825905650140153e-06,
"loss": 0.6726,
"step": 957
},
{
"epoch": 0.7904290429042904,
"grad_norm": 0.5288978815078735,
"learning_rate": 4.825503921654582e-06,
"loss": 0.6847,
"step": 958
},
{
"epoch": 0.7912541254125413,
"grad_norm": 0.5331034064292908,
"learning_rate": 4.8251017469621404e-06,
"loss": 0.7272,
"step": 959
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.5253425240516663,
"learning_rate": 4.824699126139995e-06,
"loss": 0.6815,
"step": 960
},
{
"epoch": 0.7929042904290429,
"grad_norm": 0.521532416343689,
"learning_rate": 4.824296059265402e-06,
"loss": 0.6879,
"step": 961
},
{
"epoch": 0.7937293729372937,
"grad_norm": 0.5396357774734497,
"learning_rate": 4.823892546415696e-06,
"loss": 0.6855,
"step": 962
},
{
"epoch": 0.7945544554455446,
"grad_norm": 0.5377248525619507,
"learning_rate": 4.823488587668303e-06,
"loss": 0.6673,
"step": 963
},
{
"epoch": 0.7953795379537953,
"grad_norm": 0.5296643376350403,
"learning_rate": 4.823084183100732e-06,
"loss": 0.7033,
"step": 964
},
{
"epoch": 0.7962046204620462,
"grad_norm": 0.5253770351409912,
"learning_rate": 4.822679332790581e-06,
"loss": 0.671,
"step": 965
},
{
"epoch": 0.7970297029702971,
"grad_norm": 0.5348721742630005,
"learning_rate": 4.8222740368155265e-06,
"loss": 0.7095,
"step": 966
},
{
"epoch": 0.7978547854785478,
"grad_norm": 0.5385889410972595,
"learning_rate": 4.821868295253338e-06,
"loss": 0.6846,
"step": 967
},
{
"epoch": 0.7986798679867987,
"grad_norm": 0.5366014838218689,
"learning_rate": 4.821462108181866e-06,
"loss": 0.6879,
"step": 968
},
{
"epoch": 0.7995049504950495,
"grad_norm": 0.5168699026107788,
"learning_rate": 4.821055475679048e-06,
"loss": 0.6778,
"step": 969
},
{
"epoch": 0.8003300330033003,
"grad_norm": 0.5198276042938232,
"learning_rate": 4.820648397822907e-06,
"loss": 0.6992,
"step": 970
},
{
"epoch": 0.8011551155115512,
"grad_norm": 0.5328154563903809,
"learning_rate": 4.8202408746915514e-06,
"loss": 0.6701,
"step": 971
},
{
"epoch": 0.801980198019802,
"grad_norm": 0.5303798317909241,
"learning_rate": 4.819832906363174e-06,
"loss": 0.6732,
"step": 972
},
{
"epoch": 0.8028052805280528,
"grad_norm": 0.5164825916290283,
"learning_rate": 4.8194244929160546e-06,
"loss": 0.7238,
"step": 973
},
{
"epoch": 0.8036303630363036,
"grad_norm": 0.5376397371292114,
"learning_rate": 4.819015634428557e-06,
"loss": 0.7045,
"step": 974
},
{
"epoch": 0.8044554455445545,
"grad_norm": 0.5353125929832458,
"learning_rate": 4.818606330979132e-06,
"loss": 0.683,
"step": 975
},
{
"epoch": 0.8052805280528053,
"grad_norm": 0.53989177942276,
"learning_rate": 4.818196582646313e-06,
"loss": 0.6934,
"step": 976
},
{
"epoch": 0.8061056105610561,
"grad_norm": 0.5275681018829346,
"learning_rate": 4.817786389508723e-06,
"loss": 0.7117,
"step": 977
},
{
"epoch": 0.806930693069307,
"grad_norm": 0.5146031379699707,
"learning_rate": 4.817375751645066e-06,
"loss": 0.697,
"step": 978
},
{
"epoch": 0.8077557755775577,
"grad_norm": 0.5338720083236694,
"learning_rate": 4.8169646691341356e-06,
"loss": 0.6825,
"step": 979
},
{
"epoch": 0.8085808580858086,
"grad_norm": 0.5359170436859131,
"learning_rate": 4.816553142054806e-06,
"loss": 0.6914,
"step": 980
},
{
"epoch": 0.8094059405940595,
"grad_norm": 0.5431671142578125,
"learning_rate": 4.81614117048604e-06,
"loss": 0.6957,
"step": 981
},
{
"epoch": 0.8102310231023102,
"grad_norm": 0.5225863456726074,
"learning_rate": 4.815728754506884e-06,
"loss": 0.7052,
"step": 982
},
{
"epoch": 0.8110561056105611,
"grad_norm": 0.5262391567230225,
"learning_rate": 4.815315894196473e-06,
"loss": 0.6849,
"step": 983
},
{
"epoch": 0.8118811881188119,
"grad_norm": 0.5163729190826416,
"learning_rate": 4.814902589634022e-06,
"loss": 0.7071,
"step": 984
},
{
"epoch": 0.8127062706270627,
"grad_norm": 0.5278923511505127,
"learning_rate": 4.814488840898835e-06,
"loss": 0.6958,
"step": 985
},
{
"epoch": 0.8135313531353136,
"grad_norm": 0.5270055532455444,
"learning_rate": 4.8140746480703e-06,
"loss": 0.657,
"step": 986
},
{
"epoch": 0.8143564356435643,
"grad_norm": 0.5140476226806641,
"learning_rate": 4.813660011227891e-06,
"loss": 0.6889,
"step": 987
},
{
"epoch": 0.8151815181518152,
"grad_norm": 0.5366138219833374,
"learning_rate": 4.813244930451165e-06,
"loss": 0.7055,
"step": 988
},
{
"epoch": 0.816006600660066,
"grad_norm": 0.5412192940711975,
"learning_rate": 4.812829405819768e-06,
"loss": 0.6629,
"step": 989
},
{
"epoch": 0.8168316831683168,
"grad_norm": 0.5427959561347961,
"learning_rate": 4.812413437413428e-06,
"loss": 0.7067,
"step": 990
},
{
"epoch": 0.8176567656765676,
"grad_norm": 0.5499939322471619,
"learning_rate": 4.811997025311958e-06,
"loss": 0.6934,
"step": 991
},
{
"epoch": 0.8184818481848185,
"grad_norm": 0.5278768539428711,
"learning_rate": 4.8115801695952585e-06,
"loss": 0.6473,
"step": 992
},
{
"epoch": 0.8193069306930693,
"grad_norm": 0.5308309197425842,
"learning_rate": 4.8111628703433134e-06,
"loss": 0.6929,
"step": 993
},
{
"epoch": 0.8201320132013201,
"grad_norm": 0.5305526852607727,
"learning_rate": 4.810745127636192e-06,
"loss": 0.6882,
"step": 994
},
{
"epoch": 0.820957095709571,
"grad_norm": 0.5366460084915161,
"learning_rate": 4.81032694155405e-06,
"loss": 0.6863,
"step": 995
},
{
"epoch": 0.8217821782178217,
"grad_norm": 0.550890326499939,
"learning_rate": 4.809908312177125e-06,
"loss": 0.6772,
"step": 996
},
{
"epoch": 0.8226072607260726,
"grad_norm": 0.5378448963165283,
"learning_rate": 4.809489239585743e-06,
"loss": 0.689,
"step": 997
},
{
"epoch": 0.8234323432343235,
"grad_norm": 0.5434515476226807,
"learning_rate": 4.8090697238603125e-06,
"loss": 0.6868,
"step": 998
},
{
"epoch": 0.8242574257425742,
"grad_norm": 0.5296987295150757,
"learning_rate": 4.80864976508133e-06,
"loss": 0.684,
"step": 999
},
{
"epoch": 0.8250825082508251,
"grad_norm": 0.5145077705383301,
"learning_rate": 4.8082293633293746e-06,
"loss": 0.6633,
"step": 1000
},
{
"epoch": 0.8259075907590759,
"grad_norm": 0.5234752297401428,
"learning_rate": 4.80780851868511e-06,
"loss": 0.7047,
"step": 1001
},
{
"epoch": 0.8267326732673267,
"grad_norm": 0.5214663743972778,
"learning_rate": 4.807387231229287e-06,
"loss": 0.711,
"step": 1002
},
{
"epoch": 0.8275577557755776,
"grad_norm": 0.5485701560974121,
"learning_rate": 4.80696550104274e-06,
"loss": 0.6849,
"step": 1003
},
{
"epoch": 0.8283828382838284,
"grad_norm": 0.5370991826057434,
"learning_rate": 4.806543328206388e-06,
"loss": 0.6935,
"step": 1004
},
{
"epoch": 0.8292079207920792,
"grad_norm": 0.5193862318992615,
"learning_rate": 4.806120712801237e-06,
"loss": 0.6995,
"step": 1005
},
{
"epoch": 0.83003300330033,
"grad_norm": 0.5568088889122009,
"learning_rate": 4.805697654908375e-06,
"loss": 0.7021,
"step": 1006
},
{
"epoch": 0.8308580858085809,
"grad_norm": 0.5302184820175171,
"learning_rate": 4.805274154608977e-06,
"loss": 0.659,
"step": 1007
},
{
"epoch": 0.8316831683168316,
"grad_norm": 0.5468152165412903,
"learning_rate": 4.8048502119843025e-06,
"loss": 0.7086,
"step": 1008
},
{
"epoch": 0.8325082508250825,
"grad_norm": 0.5607840418815613,
"learning_rate": 4.804425827115695e-06,
"loss": 0.6795,
"step": 1009
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.5256121754646301,
"learning_rate": 4.804001000084585e-06,
"loss": 0.6816,
"step": 1010
},
{
"epoch": 0.8341584158415841,
"grad_norm": 0.5150307416915894,
"learning_rate": 4.803575730972484e-06,
"loss": 0.6607,
"step": 1011
},
{
"epoch": 0.834983498349835,
"grad_norm": 0.5352784395217896,
"learning_rate": 4.803150019860993e-06,
"loss": 0.6728,
"step": 1012
},
{
"epoch": 0.8358085808580858,
"grad_norm": 0.5387719869613647,
"learning_rate": 4.802723866831793e-06,
"loss": 0.6761,
"step": 1013
},
{
"epoch": 0.8366336633663366,
"grad_norm": 0.5325175523757935,
"learning_rate": 4.802297271966654e-06,
"loss": 0.6962,
"step": 1014
},
{
"epoch": 0.8374587458745875,
"grad_norm": 0.5446581840515137,
"learning_rate": 4.801870235347429e-06,
"loss": 0.6853,
"step": 1015
},
{
"epoch": 0.8382838283828383,
"grad_norm": 0.5339834094047546,
"learning_rate": 4.801442757056055e-06,
"loss": 0.6919,
"step": 1016
},
{
"epoch": 0.8391089108910891,
"grad_norm": 0.5403542518615723,
"learning_rate": 4.8010148371745555e-06,
"loss": 0.6876,
"step": 1017
},
{
"epoch": 0.8399339933993399,
"grad_norm": 0.5399143695831299,
"learning_rate": 4.8005864757850365e-06,
"loss": 0.6815,
"step": 1018
},
{
"epoch": 0.8407590759075908,
"grad_norm": 0.5371220707893372,
"learning_rate": 4.800157672969692e-06,
"loss": 0.6983,
"step": 1019
},
{
"epoch": 0.8415841584158416,
"grad_norm": 0.5120360851287842,
"learning_rate": 4.799728428810796e-06,
"loss": 0.6885,
"step": 1020
},
{
"epoch": 0.8424092409240924,
"grad_norm": 0.5353764295578003,
"learning_rate": 4.799298743390713e-06,
"loss": 0.663,
"step": 1021
},
{
"epoch": 0.8432343234323433,
"grad_norm": 0.5427811145782471,
"learning_rate": 4.798868616791886e-06,
"loss": 0.6836,
"step": 1022
},
{
"epoch": 0.844059405940594,
"grad_norm": 0.5221230387687683,
"learning_rate": 4.798438049096847e-06,
"loss": 0.7194,
"step": 1023
},
{
"epoch": 0.8448844884488449,
"grad_norm": 0.524575412273407,
"learning_rate": 4.798007040388212e-06,
"loss": 0.6602,
"step": 1024
},
{
"epoch": 0.8457095709570958,
"grad_norm": 0.5420461893081665,
"learning_rate": 4.79757559074868e-06,
"loss": 0.6931,
"step": 1025
},
{
"epoch": 0.8465346534653465,
"grad_norm": 0.5208996534347534,
"learning_rate": 4.797143700261035e-06,
"loss": 0.6835,
"step": 1026
},
{
"epoch": 0.8473597359735974,
"grad_norm": 0.5422525405883789,
"learning_rate": 4.796711369008149e-06,
"loss": 0.6942,
"step": 1027
},
{
"epoch": 0.8481848184818482,
"grad_norm": 0.5329200029373169,
"learning_rate": 4.796278597072972e-06,
"loss": 0.6494,
"step": 1028
},
{
"epoch": 0.849009900990099,
"grad_norm": 0.540357232093811,
"learning_rate": 4.795845384538545e-06,
"loss": 0.6849,
"step": 1029
},
{
"epoch": 0.8498349834983498,
"grad_norm": 0.5337477922439575,
"learning_rate": 4.7954117314879886e-06,
"loss": 0.6903,
"step": 1030
},
{
"epoch": 0.8506600660066007,
"grad_norm": 0.5205554962158203,
"learning_rate": 4.794977638004512e-06,
"loss": 0.6593,
"step": 1031
},
{
"epoch": 0.8514851485148515,
"grad_norm": 0.5238302946090698,
"learning_rate": 4.7945431041714065e-06,
"loss": 0.7004,
"step": 1032
},
{
"epoch": 0.8523102310231023,
"grad_norm": 0.5418237447738647,
"learning_rate": 4.794108130072048e-06,
"loss": 0.6659,
"step": 1033
},
{
"epoch": 0.8531353135313532,
"grad_norm": 0.5398669838905334,
"learning_rate": 4.793672715789899e-06,
"loss": 0.6813,
"step": 1034
},
{
"epoch": 0.8539603960396039,
"grad_norm": 0.5366935133934021,
"learning_rate": 4.793236861408501e-06,
"loss": 0.6994,
"step": 1035
},
{
"epoch": 0.8547854785478548,
"grad_norm": 0.5426564812660217,
"learning_rate": 4.792800567011488e-06,
"loss": 0.6837,
"step": 1036
},
{
"epoch": 0.8556105610561056,
"grad_norm": 0.5445976257324219,
"learning_rate": 4.792363832682571e-06,
"loss": 0.6925,
"step": 1037
},
{
"epoch": 0.8564356435643564,
"grad_norm": 0.5414583086967468,
"learning_rate": 4.79192665850555e-06,
"loss": 0.7103,
"step": 1038
},
{
"epoch": 0.8572607260726073,
"grad_norm": 0.5352795720100403,
"learning_rate": 4.791489044564307e-06,
"loss": 0.6754,
"step": 1039
},
{
"epoch": 0.858085808580858,
"grad_norm": 0.5245605707168579,
"learning_rate": 4.791050990942811e-06,
"loss": 0.7076,
"step": 1040
},
{
"epoch": 0.8589108910891089,
"grad_norm": 0.5312214493751526,
"learning_rate": 4.790612497725112e-06,
"loss": 0.6879,
"step": 1041
},
{
"epoch": 0.8597359735973598,
"grad_norm": 0.5388454794883728,
"learning_rate": 4.790173564995347e-06,
"loss": 0.6727,
"step": 1042
},
{
"epoch": 0.8605610561056105,
"grad_norm": 0.5453711748123169,
"learning_rate": 4.789734192837736e-06,
"loss": 0.703,
"step": 1043
},
{
"epoch": 0.8613861386138614,
"grad_norm": 0.5142508149147034,
"learning_rate": 4.789294381336585e-06,
"loss": 0.7039,
"step": 1044
},
{
"epoch": 0.8622112211221122,
"grad_norm": 0.5323778390884399,
"learning_rate": 4.78885413057628e-06,
"loss": 0.6765,
"step": 1045
},
{
"epoch": 0.863036303630363,
"grad_norm": 0.5433087348937988,
"learning_rate": 4.788413440641297e-06,
"loss": 0.671,
"step": 1046
},
{
"epoch": 0.8638613861386139,
"grad_norm": 0.5310676693916321,
"learning_rate": 4.787972311616193e-06,
"loss": 0.7094,
"step": 1047
},
{
"epoch": 0.8646864686468647,
"grad_norm": 0.517675518989563,
"learning_rate": 4.787530743585609e-06,
"loss": 0.6508,
"step": 1048
},
{
"epoch": 0.8655115511551155,
"grad_norm": 0.5481309294700623,
"learning_rate": 4.787088736634271e-06,
"loss": 0.6996,
"step": 1049
},
{
"epoch": 0.8663366336633663,
"grad_norm": 0.5559577345848083,
"learning_rate": 4.78664629084699e-06,
"loss": 0.7064,
"step": 1050
},
{
"epoch": 0.8671617161716172,
"grad_norm": 0.5332601070404053,
"learning_rate": 4.7862034063086595e-06,
"loss": 0.6724,
"step": 1051
},
{
"epoch": 0.8679867986798679,
"grad_norm": 0.5553277134895325,
"learning_rate": 4.78576008310426e-06,
"loss": 0.6721,
"step": 1052
},
{
"epoch": 0.8688118811881188,
"grad_norm": 0.5259321928024292,
"learning_rate": 4.785316321318851e-06,
"loss": 0.6826,
"step": 1053
},
{
"epoch": 0.8696369636963697,
"grad_norm": 0.5209128856658936,
"learning_rate": 4.7848721210375825e-06,
"loss": 0.7025,
"step": 1054
},
{
"epoch": 0.8704620462046204,
"grad_norm": 0.5183115601539612,
"learning_rate": 4.784427482345685e-06,
"loss": 0.6801,
"step": 1055
},
{
"epoch": 0.8712871287128713,
"grad_norm": 0.5196675658226013,
"learning_rate": 4.7839824053284725e-06,
"loss": 0.6795,
"step": 1056
},
{
"epoch": 0.8721122112211221,
"grad_norm": 0.5218325853347778,
"learning_rate": 4.783536890071345e-06,
"loss": 0.6981,
"step": 1057
},
{
"epoch": 0.8729372937293729,
"grad_norm": 0.5484752655029297,
"learning_rate": 4.783090936659786e-06,
"loss": 0.7011,
"step": 1058
},
{
"epoch": 0.8737623762376238,
"grad_norm": 0.5428327918052673,
"learning_rate": 4.782644545179363e-06,
"loss": 0.6846,
"step": 1059
},
{
"epoch": 0.8745874587458746,
"grad_norm": 0.5338976979255676,
"learning_rate": 4.782197715715728e-06,
"loss": 0.6863,
"step": 1060
},
{
"epoch": 0.8754125412541254,
"grad_norm": 0.518388032913208,
"learning_rate": 4.781750448354615e-06,
"loss": 0.6712,
"step": 1061
},
{
"epoch": 0.8762376237623762,
"grad_norm": 0.5285707116127014,
"learning_rate": 4.781302743181845e-06,
"loss": 0.6974,
"step": 1062
},
{
"epoch": 0.8770627062706271,
"grad_norm": 0.5459383130073547,
"learning_rate": 4.780854600283321e-06,
"loss": 0.6724,
"step": 1063
},
{
"epoch": 0.8778877887788779,
"grad_norm": 0.5176935791969299,
"learning_rate": 4.780406019745031e-06,
"loss": 0.6815,
"step": 1064
},
{
"epoch": 0.8787128712871287,
"grad_norm": 0.5563409924507141,
"learning_rate": 4.779957001653045e-06,
"loss": 0.7005,
"step": 1065
},
{
"epoch": 0.8795379537953796,
"grad_norm": 0.564302384853363,
"learning_rate": 4.7795075460935215e-06,
"loss": 0.678,
"step": 1066
},
{
"epoch": 0.8803630363036303,
"grad_norm": 0.5702047348022461,
"learning_rate": 4.7790576531526965e-06,
"loss": 0.6726,
"step": 1067
},
{
"epoch": 0.8811881188118812,
"grad_norm": 0.5401086211204529,
"learning_rate": 4.778607322916896e-06,
"loss": 0.6825,
"step": 1068
},
{
"epoch": 0.8820132013201321,
"grad_norm": 0.5245928764343262,
"learning_rate": 4.778156555472526e-06,
"loss": 0.6922,
"step": 1069
},
{
"epoch": 0.8828382838283828,
"grad_norm": 0.537380039691925,
"learning_rate": 4.777705350906079e-06,
"loss": 0.6899,
"step": 1070
},
{
"epoch": 0.8836633663366337,
"grad_norm": 0.534204363822937,
"learning_rate": 4.777253709304128e-06,
"loss": 0.6639,
"step": 1071
},
{
"epoch": 0.8844884488448845,
"grad_norm": 0.5462052822113037,
"learning_rate": 4.776801630753332e-06,
"loss": 0.6976,
"step": 1072
},
{
"epoch": 0.8853135313531353,
"grad_norm": 0.5318037867546082,
"learning_rate": 4.776349115340436e-06,
"loss": 0.6753,
"step": 1073
},
{
"epoch": 0.8861386138613861,
"grad_norm": 0.5356978178024292,
"learning_rate": 4.775896163152265e-06,
"loss": 0.7035,
"step": 1074
},
{
"epoch": 0.886963696369637,
"grad_norm": 0.529208242893219,
"learning_rate": 4.77544277427573e-06,
"loss": 0.6842,
"step": 1075
},
{
"epoch": 0.8877887788778878,
"grad_norm": 0.5417758226394653,
"learning_rate": 4.774988948797824e-06,
"loss": 0.6752,
"step": 1076
},
{
"epoch": 0.8886138613861386,
"grad_norm": 0.513358473777771,
"learning_rate": 4.774534686805625e-06,
"loss": 0.7093,
"step": 1077
},
{
"epoch": 0.8894389438943895,
"grad_norm": 0.5584703087806702,
"learning_rate": 4.7740799883862966e-06,
"loss": 0.6931,
"step": 1078
},
{
"epoch": 0.8902640264026402,
"grad_norm": 0.5548607707023621,
"learning_rate": 4.773624853627083e-06,
"loss": 0.6807,
"step": 1079
},
{
"epoch": 0.8910891089108911,
"grad_norm": 0.5403873920440674,
"learning_rate": 4.7731692826153115e-06,
"loss": 0.6467,
"step": 1080
},
{
"epoch": 0.891914191419142,
"grad_norm": 0.5343154668807983,
"learning_rate": 4.772713275438397e-06,
"loss": 0.6882,
"step": 1081
},
{
"epoch": 0.8927392739273927,
"grad_norm": 0.5312783718109131,
"learning_rate": 4.772256832183837e-06,
"loss": 0.699,
"step": 1082
},
{
"epoch": 0.8935643564356436,
"grad_norm": 0.5227769017219543,
"learning_rate": 4.77179995293921e-06,
"loss": 0.6801,
"step": 1083
},
{
"epoch": 0.8943894389438944,
"grad_norm": 0.5363834500312805,
"learning_rate": 4.77134263779218e-06,
"loss": 0.6731,
"step": 1084
},
{
"epoch": 0.8952145214521452,
"grad_norm": 0.5440719127655029,
"learning_rate": 4.7708848868304946e-06,
"loss": 0.6865,
"step": 1085
},
{
"epoch": 0.8960396039603961,
"grad_norm": 0.5516568422317505,
"learning_rate": 4.7704267001419856e-06,
"loss": 0.6729,
"step": 1086
},
{
"epoch": 0.8968646864686468,
"grad_norm": 0.538250207901001,
"learning_rate": 4.769968077814567e-06,
"loss": 0.6934,
"step": 1087
},
{
"epoch": 0.8976897689768977,
"grad_norm": 0.5338899493217468,
"learning_rate": 4.769509019936237e-06,
"loss": 0.6719,
"step": 1088
},
{
"epoch": 0.8985148514851485,
"grad_norm": 0.5299314856529236,
"learning_rate": 4.769049526595079e-06,
"loss": 0.6693,
"step": 1089
},
{
"epoch": 0.8993399339933993,
"grad_norm": 0.5240779519081116,
"learning_rate": 4.7685895978792564e-06,
"loss": 0.6822,
"step": 1090
},
{
"epoch": 0.9001650165016502,
"grad_norm": 0.5221059322357178,
"learning_rate": 4.768129233877019e-06,
"loss": 0.6918,
"step": 1091
},
{
"epoch": 0.900990099009901,
"grad_norm": 0.5516021251678467,
"learning_rate": 4.7676684346766994e-06,
"loss": 0.6783,
"step": 1092
},
{
"epoch": 0.9018151815181518,
"grad_norm": 0.5486177802085876,
"learning_rate": 4.767207200366713e-06,
"loss": 0.6908,
"step": 1093
},
{
"epoch": 0.9026402640264026,
"grad_norm": 0.5327460765838623,
"learning_rate": 4.7667455310355615e-06,
"loss": 0.6942,
"step": 1094
},
{
"epoch": 0.9034653465346535,
"grad_norm": 0.5445359349250793,
"learning_rate": 4.766283426771825e-06,
"loss": 0.6819,
"step": 1095
},
{
"epoch": 0.9042904290429042,
"grad_norm": 0.5357023477554321,
"learning_rate": 4.765820887664172e-06,
"loss": 0.7024,
"step": 1096
},
{
"epoch": 0.9051155115511551,
"grad_norm": 0.5279941558837891,
"learning_rate": 4.76535791380135e-06,
"loss": 0.6669,
"step": 1097
},
{
"epoch": 0.905940594059406,
"grad_norm": 0.5274310111999512,
"learning_rate": 4.7648945052721955e-06,
"loss": 0.6944,
"step": 1098
},
{
"epoch": 0.9067656765676567,
"grad_norm": 0.5562543272972107,
"learning_rate": 4.764430662165623e-06,
"loss": 0.713,
"step": 1099
},
{
"epoch": 0.9075907590759076,
"grad_norm": 0.566295862197876,
"learning_rate": 4.763966384570633e-06,
"loss": 0.6813,
"step": 1100
},
{
"epoch": 0.9084158415841584,
"grad_norm": 0.5282127857208252,
"learning_rate": 4.763501672576308e-06,
"loss": 0.6782,
"step": 1101
},
{
"epoch": 0.9092409240924092,
"grad_norm": 0.5187849998474121,
"learning_rate": 4.763036526271817e-06,
"loss": 0.6759,
"step": 1102
},
{
"epoch": 0.9100660066006601,
"grad_norm": 0.5376170873641968,
"learning_rate": 4.762570945746408e-06,
"loss": 0.6775,
"step": 1103
},
{
"epoch": 0.9108910891089109,
"grad_norm": 0.5497514605522156,
"learning_rate": 4.762104931089415e-06,
"loss": 0.7128,
"step": 1104
},
{
"epoch": 0.9117161716171617,
"grad_norm": 0.5497869253158569,
"learning_rate": 4.761638482390256e-06,
"loss": 0.6651,
"step": 1105
},
{
"epoch": 0.9125412541254125,
"grad_norm": 0.5260938405990601,
"learning_rate": 4.761171599738429e-06,
"loss": 0.674,
"step": 1106
},
{
"epoch": 0.9133663366336634,
"grad_norm": 0.5394425988197327,
"learning_rate": 4.760704283223518e-06,
"loss": 0.6801,
"step": 1107
},
{
"epoch": 0.9141914191419142,
"grad_norm": 0.5538128018379211,
"learning_rate": 4.760236532935191e-06,
"loss": 0.7046,
"step": 1108
},
{
"epoch": 0.915016501650165,
"grad_norm": 0.5472164154052734,
"learning_rate": 4.759768348963196e-06,
"loss": 0.6729,
"step": 1109
},
{
"epoch": 0.9158415841584159,
"grad_norm": 0.5471000075340271,
"learning_rate": 4.759299731397366e-06,
"loss": 0.663,
"step": 1110
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.5478131771087646,
"learning_rate": 4.758830680327618e-06,
"loss": 0.6781,
"step": 1111
},
{
"epoch": 0.9174917491749175,
"grad_norm": 0.5534459352493286,
"learning_rate": 4.7583611958439514e-06,
"loss": 0.6947,
"step": 1112
},
{
"epoch": 0.9183168316831684,
"grad_norm": 0.5353650450706482,
"learning_rate": 4.7578912780364475e-06,
"loss": 0.703,
"step": 1113
},
{
"epoch": 0.9191419141914191,
"grad_norm": 0.542109489440918,
"learning_rate": 4.757420926995273e-06,
"loss": 0.6543,
"step": 1114
},
{
"epoch": 0.91996699669967,
"grad_norm": 0.5209757685661316,
"learning_rate": 4.756950142810677e-06,
"loss": 0.6426,
"step": 1115
},
{
"epoch": 0.9207920792079208,
"grad_norm": 0.528536319732666,
"learning_rate": 4.75647892557299e-06,
"loss": 0.6917,
"step": 1116
},
{
"epoch": 0.9216171617161716,
"grad_norm": 0.5359807014465332,
"learning_rate": 4.756007275372627e-06,
"loss": 0.6741,
"step": 1117
},
{
"epoch": 0.9224422442244224,
"grad_norm": 0.5328729748725891,
"learning_rate": 4.755535192300088e-06,
"loss": 0.6721,
"step": 1118
},
{
"epoch": 0.9232673267326733,
"grad_norm": 0.538567841053009,
"learning_rate": 4.755062676445952e-06,
"loss": 0.6773,
"step": 1119
},
{
"epoch": 0.9240924092409241,
"grad_norm": 0.5369272828102112,
"learning_rate": 4.754589727900885e-06,
"loss": 0.6917,
"step": 1120
},
{
"epoch": 0.9249174917491749,
"grad_norm": 0.5393624305725098,
"learning_rate": 4.754116346755632e-06,
"loss": 0.7045,
"step": 1121
},
{
"epoch": 0.9257425742574258,
"grad_norm": 0.5320336222648621,
"learning_rate": 4.753642533101025e-06,
"loss": 0.6799,
"step": 1122
},
{
"epoch": 0.9265676567656765,
"grad_norm": 0.5338489413261414,
"learning_rate": 4.753168287027977e-06,
"loss": 0.6868,
"step": 1123
},
{
"epoch": 0.9273927392739274,
"grad_norm": 0.539506196975708,
"learning_rate": 4.752693608627484e-06,
"loss": 0.6798,
"step": 1124
},
{
"epoch": 0.9282178217821783,
"grad_norm": 0.5377005338668823,
"learning_rate": 4.7522184979906225e-06,
"loss": 0.6777,
"step": 1125
},
{
"epoch": 0.929042904290429,
"grad_norm": 0.5573501586914062,
"learning_rate": 4.751742955208558e-06,
"loss": 0.6861,
"step": 1126
},
{
"epoch": 0.9298679867986799,
"grad_norm": 0.5515655875205994,
"learning_rate": 4.751266980372534e-06,
"loss": 0.6908,
"step": 1127
},
{
"epoch": 0.9306930693069307,
"grad_norm": 0.5649194717407227,
"learning_rate": 4.750790573573879e-06,
"loss": 0.67,
"step": 1128
},
{
"epoch": 0.9315181518151815,
"grad_norm": 0.5307170748710632,
"learning_rate": 4.750313734904003e-06,
"loss": 0.6783,
"step": 1129
},
{
"epoch": 0.9323432343234324,
"grad_norm": 0.5290861129760742,
"learning_rate": 4.7498364644544e-06,
"loss": 0.6725,
"step": 1130
},
{
"epoch": 0.9331683168316832,
"grad_norm": 0.5453407764434814,
"learning_rate": 4.749358762316646e-06,
"loss": 0.7064,
"step": 1131
},
{
"epoch": 0.933993399339934,
"grad_norm": 0.5274932980537415,
"learning_rate": 4.7488806285824e-06,
"loss": 0.6802,
"step": 1132
},
{
"epoch": 0.9348184818481848,
"grad_norm": 0.527488648891449,
"learning_rate": 4.7484020633434055e-06,
"loss": 0.6571,
"step": 1133
},
{
"epoch": 0.9356435643564357,
"grad_norm": 0.5450428128242493,
"learning_rate": 4.747923066691487e-06,
"loss": 0.6708,
"step": 1134
},
{
"epoch": 0.9364686468646864,
"grad_norm": 0.5410321354866028,
"learning_rate": 4.74744363871855e-06,
"loss": 0.6869,
"step": 1135
},
{
"epoch": 0.9372937293729373,
"grad_norm": 0.5409252643585205,
"learning_rate": 4.746963779516587e-06,
"loss": 0.6935,
"step": 1136
},
{
"epoch": 0.9381188118811881,
"grad_norm": 0.5387977361679077,
"learning_rate": 4.746483489177671e-06,
"loss": 0.6679,
"step": 1137
},
{
"epoch": 0.9389438943894389,
"grad_norm": 0.5344523787498474,
"learning_rate": 4.746002767793957e-06,
"loss": 0.6861,
"step": 1138
},
{
"epoch": 0.9397689768976898,
"grad_norm": 0.5451862812042236,
"learning_rate": 4.745521615457685e-06,
"loss": 0.6688,
"step": 1139
},
{
"epoch": 0.9405940594059405,
"grad_norm": 0.5420905947685242,
"learning_rate": 4.745040032261175e-06,
"loss": 0.7016,
"step": 1140
},
{
"epoch": 0.9414191419141914,
"grad_norm": 0.5483806729316711,
"learning_rate": 4.744558018296831e-06,
"loss": 0.7091,
"step": 1141
},
{
"epoch": 0.9422442244224423,
"grad_norm": 0.5476734638214111,
"learning_rate": 4.74407557365714e-06,
"loss": 0.6928,
"step": 1142
},
{
"epoch": 0.943069306930693,
"grad_norm": 0.539954662322998,
"learning_rate": 4.743592698434671e-06,
"loss": 0.6617,
"step": 1143
},
{
"epoch": 0.9438943894389439,
"grad_norm": 0.5607476234436035,
"learning_rate": 4.7431093927220775e-06,
"loss": 0.6649,
"step": 1144
},
{
"epoch": 0.9447194719471947,
"grad_norm": 0.5419979691505432,
"learning_rate": 4.742625656612091e-06,
"loss": 0.6745,
"step": 1145
},
{
"epoch": 0.9455445544554455,
"grad_norm": 0.5784286260604858,
"learning_rate": 4.74214149019753e-06,
"loss": 0.6946,
"step": 1146
},
{
"epoch": 0.9463696369636964,
"grad_norm": 0.5449656248092651,
"learning_rate": 4.741656893571295e-06,
"loss": 0.685,
"step": 1147
},
{
"epoch": 0.9471947194719472,
"grad_norm": 0.5477748513221741,
"learning_rate": 4.741171866826366e-06,
"loss": 0.6714,
"step": 1148
},
{
"epoch": 0.948019801980198,
"grad_norm": 0.5583825707435608,
"learning_rate": 4.74068641005581e-06,
"loss": 0.6839,
"step": 1149
},
{
"epoch": 0.9488448844884488,
"grad_norm": 0.5398768186569214,
"learning_rate": 4.7402005233527725e-06,
"loss": 0.6763,
"step": 1150
},
{
"epoch": 0.9496699669966997,
"grad_norm": 0.5726080536842346,
"learning_rate": 4.739714206810484e-06,
"loss": 0.6851,
"step": 1151
},
{
"epoch": 0.9504950495049505,
"grad_norm": 0.5650553703308105,
"learning_rate": 4.739227460522256e-06,
"loss": 0.6828,
"step": 1152
},
{
"epoch": 0.9513201320132013,
"grad_norm": 0.558864414691925,
"learning_rate": 4.738740284581484e-06,
"loss": 0.6811,
"step": 1153
},
{
"epoch": 0.9521452145214522,
"grad_norm": 0.5432051420211792,
"learning_rate": 4.738252679081644e-06,
"loss": 0.6819,
"step": 1154
},
{
"epoch": 0.9529702970297029,
"grad_norm": 0.5568391680717468,
"learning_rate": 4.7377646441162975e-06,
"loss": 0.7298,
"step": 1155
},
{
"epoch": 0.9537953795379538,
"grad_norm": 0.569330632686615,
"learning_rate": 4.7372761797790836e-06,
"loss": 0.6487,
"step": 1156
},
{
"epoch": 0.9546204620462047,
"grad_norm": 0.5513132810592651,
"learning_rate": 4.736787286163728e-06,
"loss": 0.6921,
"step": 1157
},
{
"epoch": 0.9554455445544554,
"grad_norm": 0.561714768409729,
"learning_rate": 4.736297963364038e-06,
"loss": 0.6669,
"step": 1158
},
{
"epoch": 0.9562706270627063,
"grad_norm": 0.5341464877128601,
"learning_rate": 4.735808211473901e-06,
"loss": 0.6884,
"step": 1159
},
{
"epoch": 0.9570957095709571,
"grad_norm": 0.5575276613235474,
"learning_rate": 4.73531803058729e-06,
"loss": 0.6735,
"step": 1160
},
{
"epoch": 0.9579207920792079,
"grad_norm": 0.5622995495796204,
"learning_rate": 4.734827420798257e-06,
"loss": 0.6815,
"step": 1161
},
{
"epoch": 0.9587458745874587,
"grad_norm": 0.5288589596748352,
"learning_rate": 4.734336382200939e-06,
"loss": 0.6947,
"step": 1162
},
{
"epoch": 0.9595709570957096,
"grad_norm": 0.5552820563316345,
"learning_rate": 4.733844914889554e-06,
"loss": 0.6736,
"step": 1163
},
{
"epoch": 0.9603960396039604,
"grad_norm": 0.5323270559310913,
"learning_rate": 4.7333530189584024e-06,
"loss": 0.6516,
"step": 1164
},
{
"epoch": 0.9612211221122112,
"grad_norm": 0.5653538703918457,
"learning_rate": 4.732860694501867e-06,
"loss": 0.6808,
"step": 1165
},
{
"epoch": 0.9620462046204621,
"grad_norm": 0.5389164090156555,
"learning_rate": 4.732367941614412e-06,
"loss": 0.683,
"step": 1166
},
{
"epoch": 0.9628712871287128,
"grad_norm": 0.5325968861579895,
"learning_rate": 4.731874760390586e-06,
"loss": 0.6952,
"step": 1167
},
{
"epoch": 0.9636963696369637,
"grad_norm": 0.5450141429901123,
"learning_rate": 4.7313811509250165e-06,
"loss": 0.6852,
"step": 1168
},
{
"epoch": 0.9645214521452146,
"grad_norm": 0.5620688796043396,
"learning_rate": 4.730887113312417e-06,
"loss": 0.6882,
"step": 1169
},
{
"epoch": 0.9653465346534653,
"grad_norm": 0.5411251783370972,
"learning_rate": 4.730392647647579e-06,
"loss": 0.6699,
"step": 1170
},
{
"epoch": 0.9661716171617162,
"grad_norm": 0.557712197303772,
"learning_rate": 4.72989775402538e-06,
"loss": 0.6922,
"step": 1171
},
{
"epoch": 0.966996699669967,
"grad_norm": 0.5456312298774719,
"learning_rate": 4.729402432540776e-06,
"loss": 0.6858,
"step": 1172
},
{
"epoch": 0.9678217821782178,
"grad_norm": 0.5637298822402954,
"learning_rate": 4.72890668328881e-06,
"loss": 0.7151,
"step": 1173
},
{
"epoch": 0.9686468646864687,
"grad_norm": 0.5458623170852661,
"learning_rate": 4.728410506364601e-06,
"loss": 0.6895,
"step": 1174
},
{
"epoch": 0.9694719471947195,
"grad_norm": 0.5525950193405151,
"learning_rate": 4.727913901863355e-06,
"loss": 0.6662,
"step": 1175
},
{
"epoch": 0.9702970297029703,
"grad_norm": 0.5622225999832153,
"learning_rate": 4.727416869880357e-06,
"loss": 0.6842,
"step": 1176
},
{
"epoch": 0.9711221122112211,
"grad_norm": 0.5422683358192444,
"learning_rate": 4.726919410510976e-06,
"loss": 0.668,
"step": 1177
},
{
"epoch": 0.971947194719472,
"grad_norm": 0.5314889550209045,
"learning_rate": 4.726421523850662e-06,
"loss": 0.7056,
"step": 1178
},
{
"epoch": 0.9727722772277227,
"grad_norm": 0.5366458892822266,
"learning_rate": 4.725923209994947e-06,
"loss": 0.6655,
"step": 1179
},
{
"epoch": 0.9735973597359736,
"grad_norm": 0.5452573895454407,
"learning_rate": 4.725424469039445e-06,
"loss": 0.6691,
"step": 1180
},
{
"epoch": 0.9744224422442245,
"grad_norm": 0.5398051142692566,
"learning_rate": 4.724925301079852e-06,
"loss": 0.6985,
"step": 1181
},
{
"epoch": 0.9752475247524752,
"grad_norm": 0.5489977598190308,
"learning_rate": 4.724425706211947e-06,
"loss": 0.7002,
"step": 1182
},
{
"epoch": 0.9760726072607261,
"grad_norm": 0.5429955720901489,
"learning_rate": 4.72392568453159e-06,
"loss": 0.6927,
"step": 1183
},
{
"epoch": 0.976897689768977,
"grad_norm": 0.5515938401222229,
"learning_rate": 4.7234252361347215e-06,
"loss": 0.678,
"step": 1184
},
{
"epoch": 0.9777227722772277,
"grad_norm": 0.5300772190093994,
"learning_rate": 4.722924361117365e-06,
"loss": 0.6836,
"step": 1185
},
{
"epoch": 0.9785478547854786,
"grad_norm": 0.5720294713973999,
"learning_rate": 4.722423059575627e-06,
"loss": 0.6688,
"step": 1186
},
{
"epoch": 0.9793729372937293,
"grad_norm": 0.5381259918212891,
"learning_rate": 4.7219213316056955e-06,
"loss": 0.7059,
"step": 1187
},
{
"epoch": 0.9801980198019802,
"grad_norm": 0.545156717300415,
"learning_rate": 4.721419177303839e-06,
"loss": 0.6747,
"step": 1188
},
{
"epoch": 0.981023102310231,
"grad_norm": 0.55919349193573,
"learning_rate": 4.720916596766409e-06,
"loss": 0.6804,
"step": 1189
},
{
"epoch": 0.9818481848184818,
"grad_norm": 0.5549430251121521,
"learning_rate": 4.7204135900898364e-06,
"loss": 0.6678,
"step": 1190
},
{
"epoch": 0.9826732673267327,
"grad_norm": 0.5881220698356628,
"learning_rate": 4.719910157370638e-06,
"loss": 0.66,
"step": 1191
},
{
"epoch": 0.9834983498349835,
"grad_norm": 0.5495649576187134,
"learning_rate": 4.71940629870541e-06,
"loss": 0.6778,
"step": 1192
},
{
"epoch": 0.9843234323432343,
"grad_norm": 0.5426628589630127,
"learning_rate": 4.7189020141908295e-06,
"loss": 0.6767,
"step": 1193
},
{
"epoch": 0.9851485148514851,
"grad_norm": 0.5726175904273987,
"learning_rate": 4.718397303923656e-06,
"loss": 0.7013,
"step": 1194
},
{
"epoch": 0.985973597359736,
"grad_norm": 0.5587770342826843,
"learning_rate": 4.7178921680007316e-06,
"loss": 0.701,
"step": 1195
},
{
"epoch": 0.9867986798679867,
"grad_norm": 0.5477480292320251,
"learning_rate": 4.71738660651898e-06,
"loss": 0.6908,
"step": 1196
},
{
"epoch": 0.9876237623762376,
"grad_norm": 0.5668710470199585,
"learning_rate": 4.7168806195754045e-06,
"loss": 0.6655,
"step": 1197
},
{
"epoch": 0.9884488448844885,
"grad_norm": 0.5600260496139526,
"learning_rate": 4.716374207267094e-06,
"loss": 0.6763,
"step": 1198
},
{
"epoch": 0.9892739273927392,
"grad_norm": 0.5428857803344727,
"learning_rate": 4.715867369691214e-06,
"loss": 0.6983,
"step": 1199
},
{
"epoch": 0.9900990099009901,
"grad_norm": 0.5704364776611328,
"learning_rate": 4.715360106945015e-06,
"loss": 0.6835,
"step": 1200
},
{
"epoch": 0.990924092409241,
"grad_norm": 0.5516597032546997,
"learning_rate": 4.714852419125828e-06,
"loss": 0.6907,
"step": 1201
},
{
"epoch": 0.9917491749174917,
"grad_norm": 0.5384538769721985,
"learning_rate": 4.7143443063310665e-06,
"loss": 0.6746,
"step": 1202
},
{
"epoch": 0.9925742574257426,
"grad_norm": 0.5800072550773621,
"learning_rate": 4.713835768658224e-06,
"loss": 0.6915,
"step": 1203
},
{
"epoch": 0.9933993399339934,
"grad_norm": 0.5570341348648071,
"learning_rate": 4.713326806204877e-06,
"loss": 0.7025,
"step": 1204
},
{
"epoch": 0.9942244224422442,
"grad_norm": 0.5560991168022156,
"learning_rate": 4.712817419068682e-06,
"loss": 0.6893,
"step": 1205
},
{
"epoch": 0.995049504950495,
"grad_norm": 0.5329129695892334,
"learning_rate": 4.712307607347379e-06,
"loss": 0.704,
"step": 1206
},
{
"epoch": 0.9958745874587459,
"grad_norm": 0.5526347160339355,
"learning_rate": 4.7117973711387874e-06,
"loss": 0.7007,
"step": 1207
},
{
"epoch": 0.9966996699669967,
"grad_norm": 0.5711978077888489,
"learning_rate": 4.71128671054081e-06,
"loss": 0.6582,
"step": 1208
},
{
"epoch": 0.9975247524752475,
"grad_norm": 0.5508738160133362,
"learning_rate": 4.710775625651429e-06,
"loss": 0.6575,
"step": 1209
},
{
"epoch": 0.9983498349834984,
"grad_norm": 0.538335919380188,
"learning_rate": 4.710264116568709e-06,
"loss": 0.6864,
"step": 1210
},
{
"epoch": 0.9991749174917491,
"grad_norm": 0.5674409866333008,
"learning_rate": 4.709752183390796e-06,
"loss": 0.6696,
"step": 1211
},
{
"epoch": 1.0,
"grad_norm": 0.5489563941955566,
"learning_rate": 4.709239826215918e-06,
"loss": 0.6512,
"step": 1212
}
],
"logging_steps": 1,
"max_steps": 7272,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1212,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.715053320858239e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}