lemonhat's picture
Add files using upload-large-folder tool
5c41267 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3110419906687403,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00015552099533437013,
"grad_norm": 4.662841910859059,
"learning_rate": 9.999999403215137e-06,
"loss": 0.5705,
"step": 1
},
{
"epoch": 0.00031104199066874026,
"grad_norm": 3.2101601568771754,
"learning_rate": 9.999997612860688e-06,
"loss": 0.5483,
"step": 2
},
{
"epoch": 0.00046656298600311044,
"grad_norm": 4.156927485367742,
"learning_rate": 9.999994628937082e-06,
"loss": 0.5274,
"step": 3
},
{
"epoch": 0.0006220839813374805,
"grad_norm": 3.3680995992825533,
"learning_rate": 9.99999045144503e-06,
"loss": 0.5289,
"step": 4
},
{
"epoch": 0.0007776049766718507,
"grad_norm": 3.3625564866317608,
"learning_rate": 9.99998508038553e-06,
"loss": 0.379,
"step": 5
},
{
"epoch": 0.0009331259720062209,
"grad_norm": 2.7876669390361055,
"learning_rate": 9.999978515759865e-06,
"loss": 0.4019,
"step": 6
},
{
"epoch": 0.001088646967340591,
"grad_norm": 2.8774460632164955,
"learning_rate": 9.999970757569602e-06,
"loss": 0.4947,
"step": 7
},
{
"epoch": 0.001244167962674961,
"grad_norm": 2.126555417939935,
"learning_rate": 9.999961805816589e-06,
"loss": 0.3555,
"step": 8
},
{
"epoch": 0.0013996889580093312,
"grad_norm": 1.9764001175562893,
"learning_rate": 9.999951660502969e-06,
"loss": 0.3102,
"step": 9
},
{
"epoch": 0.0015552099533437014,
"grad_norm": 1.122915497849003,
"learning_rate": 9.999940321631158e-06,
"loss": 0.2802,
"step": 10
},
{
"epoch": 0.0017107309486780716,
"grad_norm": 2.6849893357934436,
"learning_rate": 9.99992778920387e-06,
"loss": 0.3883,
"step": 11
},
{
"epoch": 0.0018662519440124418,
"grad_norm": 1.5146624591903277,
"learning_rate": 9.999914063224088e-06,
"loss": 0.2749,
"step": 12
},
{
"epoch": 0.002021772939346812,
"grad_norm": 1.4042567809467454,
"learning_rate": 9.999899143695095e-06,
"loss": 0.296,
"step": 13
},
{
"epoch": 0.002177293934681182,
"grad_norm": 1.817010561320059,
"learning_rate": 9.99988303062045e-06,
"loss": 0.3278,
"step": 14
},
{
"epoch": 0.0023328149300155523,
"grad_norm": 1.795433870197738,
"learning_rate": 9.999865724003998e-06,
"loss": 0.3146,
"step": 15
},
{
"epoch": 0.002488335925349922,
"grad_norm": 1.7826807436565577,
"learning_rate": 9.999847223849875e-06,
"loss": 0.3233,
"step": 16
},
{
"epoch": 0.0026438569206842922,
"grad_norm": 1.7206284351475924,
"learning_rate": 9.999827530162493e-06,
"loss": 0.3246,
"step": 17
},
{
"epoch": 0.0027993779160186624,
"grad_norm": 3.8760819362380867,
"learning_rate": 9.999806642946554e-06,
"loss": 0.2648,
"step": 18
},
{
"epoch": 0.0029548989113530326,
"grad_norm": 1.5644293516985985,
"learning_rate": 9.999784562207046e-06,
"loss": 0.3096,
"step": 19
},
{
"epoch": 0.003110419906687403,
"grad_norm": 2.0190853150190877,
"learning_rate": 9.999761287949237e-06,
"loss": 0.307,
"step": 20
},
{
"epoch": 0.003265940902021773,
"grad_norm": 2.763319388592032,
"learning_rate": 9.999736820178686e-06,
"loss": 0.4327,
"step": 21
},
{
"epoch": 0.003421461897356143,
"grad_norm": 1.6605096033172442,
"learning_rate": 9.999711158901231e-06,
"loss": 0.3918,
"step": 22
},
{
"epoch": 0.0035769828926905133,
"grad_norm": 1.7508401571856476,
"learning_rate": 9.999684304123e-06,
"loss": 0.3852,
"step": 23
},
{
"epoch": 0.0037325038880248835,
"grad_norm": 2.0163360210179335,
"learning_rate": 9.999656255850401e-06,
"loss": 0.3567,
"step": 24
},
{
"epoch": 0.0038880248833592537,
"grad_norm": 1.5224484473221345,
"learning_rate": 9.999627014090133e-06,
"loss": 0.3185,
"step": 25
},
{
"epoch": 0.004043545878693624,
"grad_norm": 1.5651644136387708,
"learning_rate": 9.999596578849173e-06,
"loss": 0.2548,
"step": 26
},
{
"epoch": 0.004199066874027994,
"grad_norm": 1.506984699015577,
"learning_rate": 9.999564950134788e-06,
"loss": 0.2719,
"step": 27
},
{
"epoch": 0.004354587869362364,
"grad_norm": 1.5707101400798584,
"learning_rate": 9.99953212795453e-06,
"loss": 0.2585,
"step": 28
},
{
"epoch": 0.004510108864696734,
"grad_norm": 1.6678601949561362,
"learning_rate": 9.999498112316231e-06,
"loss": 0.2642,
"step": 29
},
{
"epoch": 0.004665629860031105,
"grad_norm": 1.1937228959267376,
"learning_rate": 9.99946290322801e-06,
"loss": 0.3348,
"step": 30
},
{
"epoch": 0.004821150855365474,
"grad_norm": 1.474398491556367,
"learning_rate": 9.999426500698277e-06,
"loss": 0.2936,
"step": 31
},
{
"epoch": 0.004976671850699844,
"grad_norm": 1.4230321858584387,
"learning_rate": 9.999388904735718e-06,
"loss": 0.316,
"step": 32
},
{
"epoch": 0.005132192846034215,
"grad_norm": 2.5118600752998645,
"learning_rate": 9.999350115349309e-06,
"loss": 0.3135,
"step": 33
},
{
"epoch": 0.0052877138413685845,
"grad_norm": 1.7910755988881728,
"learning_rate": 9.999310132548308e-06,
"loss": 0.249,
"step": 34
},
{
"epoch": 0.005443234836702955,
"grad_norm": 1.4981333944055653,
"learning_rate": 9.999268956342261e-06,
"loss": 0.2594,
"step": 35
},
{
"epoch": 0.005598755832037325,
"grad_norm": 0.9261919071743852,
"learning_rate": 9.999226586740995e-06,
"loss": 0.2333,
"step": 36
},
{
"epoch": 0.0057542768273716955,
"grad_norm": 1.26246346078558,
"learning_rate": 9.999183023754628e-06,
"loss": 0.1787,
"step": 37
},
{
"epoch": 0.005909797822706065,
"grad_norm": 1.9545697787374448,
"learning_rate": 9.999138267393557e-06,
"loss": 0.3246,
"step": 38
},
{
"epoch": 0.006065318818040436,
"grad_norm": 1.4285410822616305,
"learning_rate": 9.999092317668467e-06,
"loss": 0.223,
"step": 39
},
{
"epoch": 0.006220839813374806,
"grad_norm": 1.4526856529113084,
"learning_rate": 9.999045174590324e-06,
"loss": 0.182,
"step": 40
},
{
"epoch": 0.006376360808709175,
"grad_norm": 2.4846217662340995,
"learning_rate": 9.998996838170387e-06,
"loss": 0.36,
"step": 41
},
{
"epoch": 0.006531881804043546,
"grad_norm": 1.2772759621800358,
"learning_rate": 9.998947308420189e-06,
"loss": 0.241,
"step": 42
},
{
"epoch": 0.006687402799377916,
"grad_norm": 2.7720889102611945,
"learning_rate": 9.998896585351557e-06,
"loss": 0.3213,
"step": 43
},
{
"epoch": 0.006842923794712286,
"grad_norm": 1.7490095603308047,
"learning_rate": 9.998844668976595e-06,
"loss": 0.3155,
"step": 44
},
{
"epoch": 0.006998444790046656,
"grad_norm": 1.3823301922226903,
"learning_rate": 9.998791559307702e-06,
"loss": 0.2149,
"step": 45
},
{
"epoch": 0.007153965785381027,
"grad_norm": 1.288871141891326,
"learning_rate": 9.998737256357551e-06,
"loss": 0.2887,
"step": 46
},
{
"epoch": 0.007309486780715396,
"grad_norm": 3.483009451782568,
"learning_rate": 9.99868176013911e-06,
"loss": 0.263,
"step": 47
},
{
"epoch": 0.007465007776049767,
"grad_norm": 1.652490483156804,
"learning_rate": 9.998625070665622e-06,
"loss": 0.2664,
"step": 48
},
{
"epoch": 0.007620528771384137,
"grad_norm": 1.8206039592741312,
"learning_rate": 9.99856718795062e-06,
"loss": 0.224,
"step": 49
},
{
"epoch": 0.007776049766718507,
"grad_norm": 3.2471818448644743,
"learning_rate": 9.998508112007925e-06,
"loss": 0.293,
"step": 50
},
{
"epoch": 0.007931570762052876,
"grad_norm": 2.4630640416023,
"learning_rate": 9.998447842851638e-06,
"loss": 0.2958,
"step": 51
},
{
"epoch": 0.008087091757387248,
"grad_norm": 2.1952255314920817,
"learning_rate": 9.998386380496144e-06,
"loss": 0.2841,
"step": 52
},
{
"epoch": 0.008242612752721618,
"grad_norm": 1.7440998263562653,
"learning_rate": 9.998323724956114e-06,
"loss": 0.2392,
"step": 53
},
{
"epoch": 0.008398133748055987,
"grad_norm": 1.7713538170023606,
"learning_rate": 9.998259876246509e-06,
"loss": 0.2148,
"step": 54
},
{
"epoch": 0.008553654743390357,
"grad_norm": 2.196248357816803,
"learning_rate": 9.998194834382567e-06,
"loss": 0.2314,
"step": 55
},
{
"epoch": 0.008709175738724729,
"grad_norm": 1.5241920091736059,
"learning_rate": 9.998128599379817e-06,
"loss": 0.3538,
"step": 56
},
{
"epoch": 0.008864696734059098,
"grad_norm": 1.084932443566165,
"learning_rate": 9.998061171254068e-06,
"loss": 0.2061,
"step": 57
},
{
"epoch": 0.009020217729393468,
"grad_norm": 1.7028355052947441,
"learning_rate": 9.997992550021418e-06,
"loss": 0.2286,
"step": 58
},
{
"epoch": 0.009175738724727838,
"grad_norm": 1.7850241306158636,
"learning_rate": 9.997922735698247e-06,
"loss": 0.1935,
"step": 59
},
{
"epoch": 0.00933125972006221,
"grad_norm": 2.7780720350287287,
"learning_rate": 9.997851728301219e-06,
"loss": 0.2658,
"step": 60
},
{
"epoch": 0.009486780715396579,
"grad_norm": 1.8811033125325856,
"learning_rate": 9.997779527847287e-06,
"loss": 0.1963,
"step": 61
},
{
"epoch": 0.009642301710730949,
"grad_norm": 1.3758579938738247,
"learning_rate": 9.997706134353687e-06,
"loss": 0.2529,
"step": 62
},
{
"epoch": 0.009797822706065318,
"grad_norm": 1.9634000227706385,
"learning_rate": 9.997631547837934e-06,
"loss": 0.2544,
"step": 63
},
{
"epoch": 0.009953343701399688,
"grad_norm": 1.594710372018227,
"learning_rate": 9.997555768317838e-06,
"loss": 0.3528,
"step": 64
},
{
"epoch": 0.01010886469673406,
"grad_norm": 1.8005547220704254,
"learning_rate": 9.997478795811486e-06,
"loss": 0.2165,
"step": 65
},
{
"epoch": 0.01026438569206843,
"grad_norm": 2.290269323202059,
"learning_rate": 9.997400630337254e-06,
"loss": 0.2786,
"step": 66
},
{
"epoch": 0.0104199066874028,
"grad_norm": 1.5486051696063095,
"learning_rate": 9.997321271913801e-06,
"loss": 0.2188,
"step": 67
},
{
"epoch": 0.010575427682737169,
"grad_norm": 0.9684733219759649,
"learning_rate": 9.997240720560068e-06,
"loss": 0.2043,
"step": 68
},
{
"epoch": 0.01073094867807154,
"grad_norm": 2.1081587478577437,
"learning_rate": 9.997158976295288e-06,
"loss": 0.2908,
"step": 69
},
{
"epoch": 0.01088646967340591,
"grad_norm": 3.6233628477076736,
"learning_rate": 9.99707603913897e-06,
"loss": 0.2579,
"step": 70
},
{
"epoch": 0.01104199066874028,
"grad_norm": 1.090209411261846,
"learning_rate": 9.996991909110918e-06,
"loss": 0.2864,
"step": 71
},
{
"epoch": 0.01119751166407465,
"grad_norm": 1.3430452010098815,
"learning_rate": 9.99690658623121e-06,
"loss": 0.2217,
"step": 72
},
{
"epoch": 0.01135303265940902,
"grad_norm": 2.3549515267664005,
"learning_rate": 9.996820070520216e-06,
"loss": 0.2822,
"step": 73
},
{
"epoch": 0.011508553654743391,
"grad_norm": 1.5602820881890913,
"learning_rate": 9.996732361998588e-06,
"loss": 0.2456,
"step": 74
},
{
"epoch": 0.01166407465007776,
"grad_norm": 1.5856862134183374,
"learning_rate": 9.996643460687264e-06,
"loss": 0.3056,
"step": 75
},
{
"epoch": 0.01181959564541213,
"grad_norm": 1.6134033436501471,
"learning_rate": 9.996553366607464e-06,
"loss": 0.2141,
"step": 76
},
{
"epoch": 0.0119751166407465,
"grad_norm": 1.3597955630988308,
"learning_rate": 9.996462079780696e-06,
"loss": 0.2295,
"step": 77
},
{
"epoch": 0.012130637636080872,
"grad_norm": 1.1374281802105086,
"learning_rate": 9.996369600228753e-06,
"loss": 0.2487,
"step": 78
},
{
"epoch": 0.012286158631415241,
"grad_norm": 1.4298077500438133,
"learning_rate": 9.99627592797371e-06,
"loss": 0.2446,
"step": 79
},
{
"epoch": 0.012441679626749611,
"grad_norm": 1.3975983522660094,
"learning_rate": 9.996181063037924e-06,
"loss": 0.2611,
"step": 80
},
{
"epoch": 0.012597200622083981,
"grad_norm": 1.5544782250742402,
"learning_rate": 9.996085005444046e-06,
"loss": 0.2311,
"step": 81
},
{
"epoch": 0.01275272161741835,
"grad_norm": 1.3603452791878323,
"learning_rate": 9.995987755215006e-06,
"loss": 0.2003,
"step": 82
},
{
"epoch": 0.012908242612752722,
"grad_norm": 1.3071118505273163,
"learning_rate": 9.995889312374016e-06,
"loss": 0.2338,
"step": 83
},
{
"epoch": 0.013063763608087092,
"grad_norm": 1.7380116089178919,
"learning_rate": 9.995789676944576e-06,
"loss": 0.2645,
"step": 84
},
{
"epoch": 0.013219284603421462,
"grad_norm": 1.263086313797395,
"learning_rate": 9.995688848950473e-06,
"loss": 0.2215,
"step": 85
},
{
"epoch": 0.013374805598755831,
"grad_norm": 1.5100086739523095,
"learning_rate": 9.995586828415774e-06,
"loss": 0.2444,
"step": 86
},
{
"epoch": 0.013530326594090203,
"grad_norm": 1.0847005275250092,
"learning_rate": 9.995483615364833e-06,
"loss": 0.2129,
"step": 87
},
{
"epoch": 0.013685847589424573,
"grad_norm": 1.3155329082198164,
"learning_rate": 9.995379209822289e-06,
"loss": 0.2788,
"step": 88
},
{
"epoch": 0.013841368584758942,
"grad_norm": 1.8214452427995387,
"learning_rate": 9.995273611813065e-06,
"loss": 0.3027,
"step": 89
},
{
"epoch": 0.013996889580093312,
"grad_norm": 0.8312908694387112,
"learning_rate": 9.995166821362368e-06,
"loss": 0.226,
"step": 90
},
{
"epoch": 0.014152410575427682,
"grad_norm": 1.6627232520442479,
"learning_rate": 9.995058838495689e-06,
"loss": 0.2742,
"step": 91
},
{
"epoch": 0.014307931570762053,
"grad_norm": 0.9378761990044046,
"learning_rate": 9.994949663238809e-06,
"loss": 0.267,
"step": 92
},
{
"epoch": 0.014463452566096423,
"grad_norm": 2.122534441012584,
"learning_rate": 9.994839295617786e-06,
"loss": 0.2438,
"step": 93
},
{
"epoch": 0.014618973561430793,
"grad_norm": 1.577456726662404,
"learning_rate": 9.994727735658968e-06,
"loss": 0.2659,
"step": 94
},
{
"epoch": 0.014774494556765163,
"grad_norm": 1.6054087269070507,
"learning_rate": 9.994614983388986e-06,
"loss": 0.2404,
"step": 95
},
{
"epoch": 0.014930015552099534,
"grad_norm": 1.5558443339273214,
"learning_rate": 9.994501038834755e-06,
"loss": 0.2703,
"step": 96
},
{
"epoch": 0.015085536547433904,
"grad_norm": 1.7101494645663162,
"learning_rate": 9.994385902023474e-06,
"loss": 0.2148,
"step": 97
},
{
"epoch": 0.015241057542768274,
"grad_norm": 1.5422168422798725,
"learning_rate": 9.99426957298263e-06,
"loss": 0.2045,
"step": 98
},
{
"epoch": 0.015396578538102643,
"grad_norm": 1.3177834617215995,
"learning_rate": 9.994152051739991e-06,
"loss": 0.2097,
"step": 99
},
{
"epoch": 0.015552099533437015,
"grad_norm": 1.7250496500116883,
"learning_rate": 9.994033338323612e-06,
"loss": 0.2309,
"step": 100
},
{
"epoch": 0.015552099533437015,
"eval_loss": 0.255514532327652,
"eval_runtime": 9.4404,
"eval_samples_per_second": 2.754,
"eval_steps_per_second": 0.741,
"step": 100
},
{
"epoch": 0.015707620528771383,
"grad_norm": 1.2242797466034228,
"learning_rate": 9.993913432761831e-06,
"loss": 0.2309,
"step": 101
},
{
"epoch": 0.015863141524105753,
"grad_norm": 1.2091727169379591,
"learning_rate": 9.993792335083272e-06,
"loss": 0.215,
"step": 102
},
{
"epoch": 0.016018662519440126,
"grad_norm": 1.6991288183534923,
"learning_rate": 9.99367004531684e-06,
"loss": 0.2716,
"step": 103
},
{
"epoch": 0.016174183514774496,
"grad_norm": 1.8626540300463013,
"learning_rate": 9.99354656349173e-06,
"loss": 0.287,
"step": 104
},
{
"epoch": 0.016329704510108865,
"grad_norm": 1.2626220604624867,
"learning_rate": 9.993421889637418e-06,
"loss": 0.1737,
"step": 105
},
{
"epoch": 0.016485225505443235,
"grad_norm": 0.8807151838598477,
"learning_rate": 9.993296023783664e-06,
"loss": 0.227,
"step": 106
},
{
"epoch": 0.016640746500777605,
"grad_norm": 1.4662006360846318,
"learning_rate": 9.993168965960515e-06,
"loss": 0.2698,
"step": 107
},
{
"epoch": 0.016796267496111975,
"grad_norm": 2.5676508719496383,
"learning_rate": 9.993040716198304e-06,
"loss": 0.2231,
"step": 108
},
{
"epoch": 0.016951788491446344,
"grad_norm": 2.144008184181988,
"learning_rate": 9.992911274527641e-06,
"loss": 0.2729,
"step": 109
},
{
"epoch": 0.017107309486780714,
"grad_norm": 1.3871826576036752,
"learning_rate": 9.99278064097943e-06,
"loss": 0.2078,
"step": 110
},
{
"epoch": 0.017262830482115084,
"grad_norm": 1.9299054218636398,
"learning_rate": 9.992648815584853e-06,
"loss": 0.2543,
"step": 111
},
{
"epoch": 0.017418351477449457,
"grad_norm": 6.182669074382352,
"learning_rate": 9.992515798375379e-06,
"loss": 0.2442,
"step": 112
},
{
"epoch": 0.017573872472783827,
"grad_norm": 1.9218049477099652,
"learning_rate": 9.992381589382761e-06,
"loss": 0.2909,
"step": 113
},
{
"epoch": 0.017729393468118197,
"grad_norm": 1.7558505152868706,
"learning_rate": 9.992246188639035e-06,
"loss": 0.2182,
"step": 114
},
{
"epoch": 0.017884914463452566,
"grad_norm": 1.3145893008937046,
"learning_rate": 9.992109596176525e-06,
"loss": 0.2445,
"step": 115
},
{
"epoch": 0.018040435458786936,
"grad_norm": 2.3756692802265094,
"learning_rate": 9.991971812027836e-06,
"loss": 0.2961,
"step": 116
},
{
"epoch": 0.018195956454121306,
"grad_norm": 2.027933705938777,
"learning_rate": 9.991832836225863e-06,
"loss": 0.2459,
"step": 117
},
{
"epoch": 0.018351477449455676,
"grad_norm": 1.9997556478308784,
"learning_rate": 9.991692668803775e-06,
"loss": 0.2108,
"step": 118
},
{
"epoch": 0.018506998444790045,
"grad_norm": 1.39831187226532,
"learning_rate": 9.991551309795038e-06,
"loss": 0.1902,
"step": 119
},
{
"epoch": 0.01866251944012442,
"grad_norm": 1.6377700259822823,
"learning_rate": 9.991408759233394e-06,
"loss": 0.2491,
"step": 120
},
{
"epoch": 0.018818040435458788,
"grad_norm": 2.09576564356888,
"learning_rate": 9.991265017152869e-06,
"loss": 0.2526,
"step": 121
},
{
"epoch": 0.018973561430793158,
"grad_norm": 2.031216743667695,
"learning_rate": 9.991120083587779e-06,
"loss": 0.2418,
"step": 122
},
{
"epoch": 0.019129082426127528,
"grad_norm": 1.9897151692182136,
"learning_rate": 9.990973958572723e-06,
"loss": 0.2786,
"step": 123
},
{
"epoch": 0.019284603421461897,
"grad_norm": 1.7503968375792016,
"learning_rate": 9.990826642142581e-06,
"loss": 0.3231,
"step": 124
},
{
"epoch": 0.019440124416796267,
"grad_norm": 0.8307156104752434,
"learning_rate": 9.990678134332521e-06,
"loss": 0.2058,
"step": 125
},
{
"epoch": 0.019595645412130637,
"grad_norm": 2.105265419067902,
"learning_rate": 9.990528435177992e-06,
"loss": 0.2665,
"step": 126
},
{
"epoch": 0.019751166407465007,
"grad_norm": 0.845573052530141,
"learning_rate": 9.99037754471473e-06,
"loss": 0.1706,
"step": 127
},
{
"epoch": 0.019906687402799376,
"grad_norm": 1.3561288374051286,
"learning_rate": 9.990225462978756e-06,
"loss": 0.2834,
"step": 128
},
{
"epoch": 0.02006220839813375,
"grad_norm": 1.4639985615099256,
"learning_rate": 9.990072190006371e-06,
"loss": 0.2775,
"step": 129
},
{
"epoch": 0.02021772939346812,
"grad_norm": 1.424715750901468,
"learning_rate": 9.989917725834166e-06,
"loss": 0.2331,
"step": 130
},
{
"epoch": 0.02037325038880249,
"grad_norm": 1.4908712495988423,
"learning_rate": 9.989762070499015e-06,
"loss": 0.2326,
"step": 131
},
{
"epoch": 0.02052877138413686,
"grad_norm": 1.9371986234951772,
"learning_rate": 9.98960522403807e-06,
"loss": 0.248,
"step": 132
},
{
"epoch": 0.02068429237947123,
"grad_norm": 1.7802271420639102,
"learning_rate": 9.989447186488777e-06,
"loss": 0.2881,
"step": 133
},
{
"epoch": 0.0208398133748056,
"grad_norm": 1.1250396512690675,
"learning_rate": 9.98928795788886e-06,
"loss": 0.2309,
"step": 134
},
{
"epoch": 0.020995334370139968,
"grad_norm": 1.6801724252117862,
"learning_rate": 9.989127538276329e-06,
"loss": 0.2292,
"step": 135
},
{
"epoch": 0.021150855365474338,
"grad_norm": 1.1771299351260398,
"learning_rate": 9.98896592768948e-06,
"loss": 0.1553,
"step": 136
},
{
"epoch": 0.021306376360808708,
"grad_norm": 2.1842202518230645,
"learning_rate": 9.988803126166889e-06,
"loss": 0.3029,
"step": 137
},
{
"epoch": 0.02146189735614308,
"grad_norm": 1.3745547142156036,
"learning_rate": 9.988639133747422e-06,
"loss": 0.1702,
"step": 138
},
{
"epoch": 0.02161741835147745,
"grad_norm": 1.8504088238591443,
"learning_rate": 9.988473950470223e-06,
"loss": 0.2318,
"step": 139
},
{
"epoch": 0.02177293934681182,
"grad_norm": 1.7870069125473158,
"learning_rate": 9.988307576374727e-06,
"loss": 0.2008,
"step": 140
},
{
"epoch": 0.02192846034214619,
"grad_norm": 2.3953898044564883,
"learning_rate": 9.988140011500647e-06,
"loss": 0.2007,
"step": 141
},
{
"epoch": 0.02208398133748056,
"grad_norm": 1.1845465157973594,
"learning_rate": 9.987971255887985e-06,
"loss": 0.2334,
"step": 142
},
{
"epoch": 0.02223950233281493,
"grad_norm": 1.747163885973197,
"learning_rate": 9.987801309577026e-06,
"loss": 0.2559,
"step": 143
},
{
"epoch": 0.0223950233281493,
"grad_norm": 1.6909380164686145,
"learning_rate": 9.987630172608333e-06,
"loss": 0.2819,
"step": 144
},
{
"epoch": 0.02255054432348367,
"grad_norm": 1.6459040836915735,
"learning_rate": 9.987457845022767e-06,
"loss": 0.2283,
"step": 145
},
{
"epoch": 0.02270606531881804,
"grad_norm": 1.0639213494130906,
"learning_rate": 9.987284326861459e-06,
"loss": 0.2947,
"step": 146
},
{
"epoch": 0.022861586314152412,
"grad_norm": 1.423659630662775,
"learning_rate": 9.987109618165832e-06,
"loss": 0.1895,
"step": 147
},
{
"epoch": 0.023017107309486782,
"grad_norm": 2.1171729246911966,
"learning_rate": 9.986933718977591e-06,
"loss": 0.1967,
"step": 148
},
{
"epoch": 0.02317262830482115,
"grad_norm": 1.4659656443481106,
"learning_rate": 9.986756629338728e-06,
"loss": 0.1553,
"step": 149
},
{
"epoch": 0.02332814930015552,
"grad_norm": 3.3524464413937762,
"learning_rate": 9.986578349291514e-06,
"loss": 0.2472,
"step": 150
},
{
"epoch": 0.02348367029548989,
"grad_norm": 1.4421209559287633,
"learning_rate": 9.986398878878507e-06,
"loss": 0.1791,
"step": 151
},
{
"epoch": 0.02363919129082426,
"grad_norm": 1.7313564339261944,
"learning_rate": 9.98621821814255e-06,
"loss": 0.2238,
"step": 152
},
{
"epoch": 0.02379471228615863,
"grad_norm": 1.7017996756379121,
"learning_rate": 9.986036367126769e-06,
"loss": 0.2007,
"step": 153
},
{
"epoch": 0.023950233281493,
"grad_norm": 1.515471002124247,
"learning_rate": 9.985853325874575e-06,
"loss": 0.2688,
"step": 154
},
{
"epoch": 0.02410575427682737,
"grad_norm": 0.8049651881516254,
"learning_rate": 9.985669094429662e-06,
"loss": 0.1865,
"step": 155
},
{
"epoch": 0.024261275272161743,
"grad_norm": 1.2861650933813724,
"learning_rate": 9.985483672836007e-06,
"loss": 0.2403,
"step": 156
},
{
"epoch": 0.024416796267496113,
"grad_norm": 2.173379700965189,
"learning_rate": 9.985297061137877e-06,
"loss": 0.2045,
"step": 157
},
{
"epoch": 0.024572317262830483,
"grad_norm": 1.5915407935889336,
"learning_rate": 9.985109259379813e-06,
"loss": 0.2063,
"step": 158
},
{
"epoch": 0.024727838258164853,
"grad_norm": 1.877271886192633,
"learning_rate": 9.98492026760665e-06,
"loss": 0.226,
"step": 159
},
{
"epoch": 0.024883359253499222,
"grad_norm": 1.590999803444347,
"learning_rate": 9.984730085863504e-06,
"loss": 0.2243,
"step": 160
},
{
"epoch": 0.025038880248833592,
"grad_norm": 2.2602490405621016,
"learning_rate": 9.98453871419577e-06,
"loss": 0.2599,
"step": 161
},
{
"epoch": 0.025194401244167962,
"grad_norm": 1.8247502790432317,
"learning_rate": 9.984346152649135e-06,
"loss": 0.2575,
"step": 162
},
{
"epoch": 0.02534992223950233,
"grad_norm": 1.6317702406563646,
"learning_rate": 9.984152401269562e-06,
"loss": 0.2513,
"step": 163
},
{
"epoch": 0.0255054432348367,
"grad_norm": 1.479820350518653,
"learning_rate": 9.983957460103307e-06,
"loss": 0.2134,
"step": 164
},
{
"epoch": 0.025660964230171075,
"grad_norm": 2.2204278110409716,
"learning_rate": 9.9837613291969e-06,
"loss": 0.2288,
"step": 165
},
{
"epoch": 0.025816485225505444,
"grad_norm": 1.8249773963334357,
"learning_rate": 9.983564008597164e-06,
"loss": 0.2342,
"step": 166
},
{
"epoch": 0.025972006220839814,
"grad_norm": 1.892476010698033,
"learning_rate": 9.9833654983512e-06,
"loss": 0.2263,
"step": 167
},
{
"epoch": 0.026127527216174184,
"grad_norm": 1.593847254715758,
"learning_rate": 9.983165798506398e-06,
"loss": 0.2163,
"step": 168
},
{
"epoch": 0.026283048211508554,
"grad_norm": 1.7653992228114257,
"learning_rate": 9.982964909110426e-06,
"loss": 0.2938,
"step": 169
},
{
"epoch": 0.026438569206842923,
"grad_norm": 1.3352350617943483,
"learning_rate": 9.982762830211239e-06,
"loss": 0.2069,
"step": 170
},
{
"epoch": 0.026594090202177293,
"grad_norm": 1.6623662216358996,
"learning_rate": 9.982559561857079e-06,
"loss": 0.213,
"step": 171
},
{
"epoch": 0.026749611197511663,
"grad_norm": 1.1923151136153478,
"learning_rate": 9.982355104096468e-06,
"loss": 0.2068,
"step": 172
},
{
"epoch": 0.026905132192846033,
"grad_norm": 1.5009321240819553,
"learning_rate": 9.98214945697821e-06,
"loss": 0.3292,
"step": 173
},
{
"epoch": 0.027060653188180406,
"grad_norm": 1.6168504596283289,
"learning_rate": 9.981942620551399e-06,
"loss": 0.2001,
"step": 174
},
{
"epoch": 0.027216174183514776,
"grad_norm": 1.0410735731938325,
"learning_rate": 9.98173459486541e-06,
"loss": 0.2697,
"step": 175
},
{
"epoch": 0.027371695178849145,
"grad_norm": 1.477725722611291,
"learning_rate": 9.9815253799699e-06,
"loss": 0.1796,
"step": 176
},
{
"epoch": 0.027527216174183515,
"grad_norm": 1.5159741098115525,
"learning_rate": 9.981314975914811e-06,
"loss": 0.2203,
"step": 177
},
{
"epoch": 0.027682737169517885,
"grad_norm": 0.8954975243967727,
"learning_rate": 9.981103382750372e-06,
"loss": 0.2662,
"step": 178
},
{
"epoch": 0.027838258164852255,
"grad_norm": 1.418625218985406,
"learning_rate": 9.980890600527092e-06,
"loss": 0.2484,
"step": 179
},
{
"epoch": 0.027993779160186624,
"grad_norm": 1.4411516373436362,
"learning_rate": 9.980676629295763e-06,
"loss": 0.302,
"step": 180
},
{
"epoch": 0.028149300155520994,
"grad_norm": 0.9480510792156464,
"learning_rate": 9.980461469107463e-06,
"loss": 0.2075,
"step": 181
},
{
"epoch": 0.028304821150855364,
"grad_norm": 2.081864475923441,
"learning_rate": 9.980245120013558e-06,
"loss": 0.2942,
"step": 182
},
{
"epoch": 0.028460342146189737,
"grad_norm": 1.2615838373847896,
"learning_rate": 9.980027582065691e-06,
"loss": 0.2018,
"step": 183
},
{
"epoch": 0.028615863141524107,
"grad_norm": 1.2086223544731691,
"learning_rate": 9.979808855315792e-06,
"loss": 0.2743,
"step": 184
},
{
"epoch": 0.028771384136858476,
"grad_norm": 0.9412206342605678,
"learning_rate": 9.979588939816071e-06,
"loss": 0.2318,
"step": 185
},
{
"epoch": 0.028926905132192846,
"grad_norm": 1.365479987499767,
"learning_rate": 9.979367835619029e-06,
"loss": 0.2813,
"step": 186
},
{
"epoch": 0.029082426127527216,
"grad_norm": 1.1385427599520912,
"learning_rate": 9.979145542777444e-06,
"loss": 0.2627,
"step": 187
},
{
"epoch": 0.029237947122861586,
"grad_norm": 1.560448582637042,
"learning_rate": 9.97892206134438e-06,
"loss": 0.2042,
"step": 188
},
{
"epoch": 0.029393468118195955,
"grad_norm": 1.9585068672638826,
"learning_rate": 9.97869739137319e-06,
"loss": 0.2647,
"step": 189
},
{
"epoch": 0.029548989113530325,
"grad_norm": 1.612253014357388,
"learning_rate": 9.9784715329175e-06,
"loss": 0.2573,
"step": 190
},
{
"epoch": 0.0297045101088647,
"grad_norm": 1.2782552177366555,
"learning_rate": 9.978244486031228e-06,
"loss": 0.1914,
"step": 191
},
{
"epoch": 0.029860031104199068,
"grad_norm": 2.1188620010348163,
"learning_rate": 9.978016250768573e-06,
"loss": 0.245,
"step": 192
},
{
"epoch": 0.030015552099533438,
"grad_norm": 1.9777647488169638,
"learning_rate": 9.977786827184019e-06,
"loss": 0.2774,
"step": 193
},
{
"epoch": 0.030171073094867808,
"grad_norm": 2.0157801185629407,
"learning_rate": 9.977556215332332e-06,
"loss": 0.297,
"step": 194
},
{
"epoch": 0.030326594090202177,
"grad_norm": 0.8845310906810993,
"learning_rate": 9.97732441526856e-06,
"loss": 0.1756,
"step": 195
},
{
"epoch": 0.030482115085536547,
"grad_norm": 1.2647941053184737,
"learning_rate": 9.97709142704804e-06,
"loss": 0.1773,
"step": 196
},
{
"epoch": 0.030637636080870917,
"grad_norm": 1.1823797462719756,
"learning_rate": 9.976857250726389e-06,
"loss": 0.2501,
"step": 197
},
{
"epoch": 0.030793157076205287,
"grad_norm": 1.643272741263538,
"learning_rate": 9.976621886359506e-06,
"loss": 0.2794,
"step": 198
},
{
"epoch": 0.030948678071539656,
"grad_norm": 1.6415813649465196,
"learning_rate": 9.976385334003577e-06,
"loss": 0.2562,
"step": 199
},
{
"epoch": 0.03110419906687403,
"grad_norm": 1.4019518238717095,
"learning_rate": 9.976147593715074e-06,
"loss": 0.2066,
"step": 200
},
{
"epoch": 0.03110419906687403,
"eval_loss": 0.2431146204471588,
"eval_runtime": 9.4441,
"eval_samples_per_second": 2.753,
"eval_steps_per_second": 0.741,
"step": 200
},
{
"epoch": 0.031259720062208396,
"grad_norm": 1.5261705881041825,
"learning_rate": 9.975908665550742e-06,
"loss": 0.168,
"step": 201
},
{
"epoch": 0.031415241057542766,
"grad_norm": 1.3552305394454693,
"learning_rate": 9.975668549567623e-06,
"loss": 0.2513,
"step": 202
},
{
"epoch": 0.031570762052877135,
"grad_norm": 1.09704983539552,
"learning_rate": 9.97542724582303e-06,
"loss": 0.1877,
"step": 203
},
{
"epoch": 0.031726283048211505,
"grad_norm": 1.8452203060592092,
"learning_rate": 9.975184754374572e-06,
"loss": 0.3442,
"step": 204
},
{
"epoch": 0.03188180404354588,
"grad_norm": 1.4512649025391702,
"learning_rate": 9.974941075280128e-06,
"loss": 0.2172,
"step": 205
},
{
"epoch": 0.03203732503888025,
"grad_norm": 1.5376722263850107,
"learning_rate": 9.974696208597874e-06,
"loss": 0.2206,
"step": 206
},
{
"epoch": 0.03219284603421462,
"grad_norm": 1.6097488768932668,
"learning_rate": 9.97445015438626e-06,
"loss": 0.2134,
"step": 207
},
{
"epoch": 0.03234836702954899,
"grad_norm": 1.2381378734127797,
"learning_rate": 9.974202912704022e-06,
"loss": 0.2026,
"step": 208
},
{
"epoch": 0.03250388802488336,
"grad_norm": 2.0110329862327663,
"learning_rate": 9.973954483610184e-06,
"loss": 0.2117,
"step": 209
},
{
"epoch": 0.03265940902021773,
"grad_norm": 4.938465463538487,
"learning_rate": 9.973704867164044e-06,
"loss": 0.2787,
"step": 210
},
{
"epoch": 0.0328149300155521,
"grad_norm": 1.9318587506840115,
"learning_rate": 9.973454063425191e-06,
"loss": 0.2901,
"step": 211
},
{
"epoch": 0.03297045101088647,
"grad_norm": 1.5730776773238022,
"learning_rate": 9.973202072453498e-06,
"loss": 0.3557,
"step": 212
},
{
"epoch": 0.03312597200622084,
"grad_norm": 2.333406801079277,
"learning_rate": 9.972948894309116e-06,
"loss": 0.2553,
"step": 213
},
{
"epoch": 0.03328149300155521,
"grad_norm": 1.2613725609366824,
"learning_rate": 9.972694529052482e-06,
"loss": 0.2721,
"step": 214
},
{
"epoch": 0.03343701399688958,
"grad_norm": 1.233807021429561,
"learning_rate": 9.972438976744317e-06,
"loss": 0.194,
"step": 215
},
{
"epoch": 0.03359253499222395,
"grad_norm": 1.0922019141763,
"learning_rate": 9.972182237445624e-06,
"loss": 0.2625,
"step": 216
},
{
"epoch": 0.03374805598755832,
"grad_norm": 1.5332376003824164,
"learning_rate": 9.971924311217693e-06,
"loss": 0.2369,
"step": 217
},
{
"epoch": 0.03390357698289269,
"grad_norm": 2.1386234582292856,
"learning_rate": 9.971665198122093e-06,
"loss": 0.2691,
"step": 218
},
{
"epoch": 0.03405909797822706,
"grad_norm": 1.4374027394103162,
"learning_rate": 9.97140489822068e-06,
"loss": 0.2217,
"step": 219
},
{
"epoch": 0.03421461897356143,
"grad_norm": 1.7261766376116665,
"learning_rate": 9.971143411575585e-06,
"loss": 0.3063,
"step": 220
},
{
"epoch": 0.0343701399688958,
"grad_norm": 1.5632670578977363,
"learning_rate": 9.970880738249236e-06,
"loss": 0.2333,
"step": 221
},
{
"epoch": 0.03452566096423017,
"grad_norm": 1.6709935682257062,
"learning_rate": 9.97061687830433e-06,
"loss": 0.2808,
"step": 222
},
{
"epoch": 0.034681181959564544,
"grad_norm": 1.7747486994884278,
"learning_rate": 9.970351831803862e-06,
"loss": 0.3182,
"step": 223
},
{
"epoch": 0.034836702954898914,
"grad_norm": 1.2079739996818415,
"learning_rate": 9.970085598811094e-06,
"loss": 0.2426,
"step": 224
},
{
"epoch": 0.034992223950233284,
"grad_norm": 2.269795435480081,
"learning_rate": 9.969818179389586e-06,
"loss": 0.1933,
"step": 225
},
{
"epoch": 0.035147744945567654,
"grad_norm": 1.28324330975912,
"learning_rate": 9.96954957360317e-06,
"loss": 0.2078,
"step": 226
},
{
"epoch": 0.03530326594090202,
"grad_norm": 3.0240429569891147,
"learning_rate": 9.969279781515967e-06,
"loss": 0.2865,
"step": 227
},
{
"epoch": 0.03545878693623639,
"grad_norm": 1.4022531253860526,
"learning_rate": 9.969008803192385e-06,
"loss": 0.189,
"step": 228
},
{
"epoch": 0.03561430793157076,
"grad_norm": 1.4481645110880101,
"learning_rate": 9.968736638697105e-06,
"loss": 0.2038,
"step": 229
},
{
"epoch": 0.03576982892690513,
"grad_norm": 1.2439638440320844,
"learning_rate": 9.968463288095096e-06,
"loss": 0.1962,
"step": 230
},
{
"epoch": 0.0359253499222395,
"grad_norm": 1.550618674775446,
"learning_rate": 9.968188751451613e-06,
"loss": 0.2461,
"step": 231
},
{
"epoch": 0.03608087091757387,
"grad_norm": 1.2590441656933422,
"learning_rate": 9.967913028832192e-06,
"loss": 0.28,
"step": 232
},
{
"epoch": 0.03623639191290824,
"grad_norm": 15.743047596573488,
"learning_rate": 9.96763612030265e-06,
"loss": 0.2272,
"step": 233
},
{
"epoch": 0.03639191290824261,
"grad_norm": 1.0832646660805165,
"learning_rate": 9.967358025929092e-06,
"loss": 0.2766,
"step": 234
},
{
"epoch": 0.03654743390357698,
"grad_norm": 1.496152606461021,
"learning_rate": 9.9670787457779e-06,
"loss": 0.1928,
"step": 235
},
{
"epoch": 0.03670295489891135,
"grad_norm": 1.5049076518304147,
"learning_rate": 9.966798279915744e-06,
"loss": 0.2023,
"step": 236
},
{
"epoch": 0.03685847589424572,
"grad_norm": 0.9377725167524534,
"learning_rate": 9.966516628409573e-06,
"loss": 0.1657,
"step": 237
},
{
"epoch": 0.03701399688958009,
"grad_norm": 1.5646202349920761,
"learning_rate": 9.96623379132662e-06,
"loss": 0.2157,
"step": 238
},
{
"epoch": 0.03716951788491446,
"grad_norm": 1.14277577769819,
"learning_rate": 9.965949768734409e-06,
"loss": 0.2163,
"step": 239
},
{
"epoch": 0.03732503888024884,
"grad_norm": 2.158716016882222,
"learning_rate": 9.965664560700734e-06,
"loss": 0.2041,
"step": 240
},
{
"epoch": 0.03748055987558321,
"grad_norm": 1.8568349342429766,
"learning_rate": 9.965378167293679e-06,
"loss": 0.2266,
"step": 241
},
{
"epoch": 0.037636080870917576,
"grad_norm": 2.035673543431871,
"learning_rate": 9.965090588581609e-06,
"loss": 0.2893,
"step": 242
},
{
"epoch": 0.037791601866251946,
"grad_norm": 1.2421527558787024,
"learning_rate": 9.964801824633177e-06,
"loss": 0.166,
"step": 243
},
{
"epoch": 0.037947122861586316,
"grad_norm": 1.7368625294642988,
"learning_rate": 9.964511875517313e-06,
"loss": 0.2593,
"step": 244
},
{
"epoch": 0.038102643856920686,
"grad_norm": 1.274064232837515,
"learning_rate": 9.964220741303232e-06,
"loss": 0.1676,
"step": 245
},
{
"epoch": 0.038258164852255055,
"grad_norm": 1.3271094523398685,
"learning_rate": 9.963928422060432e-06,
"loss": 0.2048,
"step": 246
},
{
"epoch": 0.038413685847589425,
"grad_norm": 1.441894820882409,
"learning_rate": 9.963634917858692e-06,
"loss": 0.2102,
"step": 247
},
{
"epoch": 0.038569206842923795,
"grad_norm": 1.3882607946902543,
"learning_rate": 9.963340228768077e-06,
"loss": 0.1862,
"step": 248
},
{
"epoch": 0.038724727838258165,
"grad_norm": 1.1529068772443192,
"learning_rate": 9.963044354858934e-06,
"loss": 0.2519,
"step": 249
},
{
"epoch": 0.038880248833592534,
"grad_norm": 2.236043321099024,
"learning_rate": 9.962747296201891e-06,
"loss": 0.1635,
"step": 250
},
{
"epoch": 0.039035769828926904,
"grad_norm": 1.8503487939836718,
"learning_rate": 9.96244905286786e-06,
"loss": 0.181,
"step": 251
},
{
"epoch": 0.039191290824261274,
"grad_norm": 1.4083157880171735,
"learning_rate": 9.962149624928037e-06,
"loss": 0.1781,
"step": 252
},
{
"epoch": 0.039346811819595644,
"grad_norm": 1.6536407646222175,
"learning_rate": 9.961849012453899e-06,
"loss": 0.2699,
"step": 253
},
{
"epoch": 0.039502332814930013,
"grad_norm": 1.3154495432198843,
"learning_rate": 9.961547215517206e-06,
"loss": 0.2096,
"step": 254
},
{
"epoch": 0.03965785381026438,
"grad_norm": 1.222944730470649,
"learning_rate": 9.961244234190001e-06,
"loss": 0.209,
"step": 255
},
{
"epoch": 0.03981337480559875,
"grad_norm": 1.3903861430735245,
"learning_rate": 9.96094006854461e-06,
"loss": 0.177,
"step": 256
},
{
"epoch": 0.03996889580093312,
"grad_norm": 1.8733569984170189,
"learning_rate": 9.960634718653644e-06,
"loss": 0.4051,
"step": 257
},
{
"epoch": 0.0401244167962675,
"grad_norm": 1.3013086938531622,
"learning_rate": 9.96032818458999e-06,
"loss": 0.2215,
"step": 258
},
{
"epoch": 0.04027993779160187,
"grad_norm": 1.9062067810307814,
"learning_rate": 9.960020466426825e-06,
"loss": 0.2131,
"step": 259
},
{
"epoch": 0.04043545878693624,
"grad_norm": 1.240725461727028,
"learning_rate": 9.959711564237603e-06,
"loss": 0.2376,
"step": 260
},
{
"epoch": 0.04059097978227061,
"grad_norm": 1.504578258989953,
"learning_rate": 9.95940147809607e-06,
"loss": 0.2238,
"step": 261
},
{
"epoch": 0.04074650077760498,
"grad_norm": 1.112441665378311,
"learning_rate": 9.959090208076239e-06,
"loss": 0.175,
"step": 262
},
{
"epoch": 0.04090202177293935,
"grad_norm": 1.492328645699945,
"learning_rate": 9.958777754252418e-06,
"loss": 0.2332,
"step": 263
},
{
"epoch": 0.04105754276827372,
"grad_norm": 1.4626777112927891,
"learning_rate": 9.958464116699196e-06,
"loss": 0.2093,
"step": 264
},
{
"epoch": 0.04121306376360809,
"grad_norm": 2.4304182018626266,
"learning_rate": 9.958149295491441e-06,
"loss": 0.2495,
"step": 265
},
{
"epoch": 0.04136858475894246,
"grad_norm": 2.1830670676642256,
"learning_rate": 9.957833290704305e-06,
"loss": 0.2151,
"step": 266
},
{
"epoch": 0.04152410575427683,
"grad_norm": 0.9776646131405466,
"learning_rate": 9.957516102413223e-06,
"loss": 0.2215,
"step": 267
},
{
"epoch": 0.0416796267496112,
"grad_norm": 0.9811824757237497,
"learning_rate": 9.957197730693912e-06,
"loss": 0.2671,
"step": 268
},
{
"epoch": 0.04183514774494557,
"grad_norm": 1.025030756788744,
"learning_rate": 9.956878175622372e-06,
"loss": 0.1935,
"step": 269
},
{
"epoch": 0.041990668740279936,
"grad_norm": 1.715248799705313,
"learning_rate": 9.956557437274887e-06,
"loss": 0.2639,
"step": 270
},
{
"epoch": 0.042146189735614306,
"grad_norm": 1.4715136542514509,
"learning_rate": 9.95623551572802e-06,
"loss": 0.1863,
"step": 271
},
{
"epoch": 0.042301710730948676,
"grad_norm": 2.0941396313348766,
"learning_rate": 9.955912411058616e-06,
"loss": 0.1764,
"step": 272
},
{
"epoch": 0.042457231726283046,
"grad_norm": 1.4113410003708207,
"learning_rate": 9.955588123343808e-06,
"loss": 0.2635,
"step": 273
},
{
"epoch": 0.042612752721617415,
"grad_norm": 1.0999635349018924,
"learning_rate": 9.955262652661009e-06,
"loss": 0.2424,
"step": 274
},
{
"epoch": 0.042768273716951785,
"grad_norm": 1.0847541480257452,
"learning_rate": 9.954935999087908e-06,
"loss": 0.276,
"step": 275
},
{
"epoch": 0.04292379471228616,
"grad_norm": 1.695906274664277,
"learning_rate": 9.954608162702488e-06,
"loss": 0.2316,
"step": 276
},
{
"epoch": 0.04307931570762053,
"grad_norm": 1.428650374776818,
"learning_rate": 9.954279143583003e-06,
"loss": 0.234,
"step": 277
},
{
"epoch": 0.0432348367029549,
"grad_norm": 1.261831528775643,
"learning_rate": 9.953948941807998e-06,
"loss": 0.2331,
"step": 278
},
{
"epoch": 0.04339035769828927,
"grad_norm": 1.1389240235405695,
"learning_rate": 9.953617557456295e-06,
"loss": 0.1813,
"step": 279
},
{
"epoch": 0.04354587869362364,
"grad_norm": 2.1356821017337264,
"learning_rate": 9.953284990607e-06,
"loss": 0.2716,
"step": 280
},
{
"epoch": 0.04370139968895801,
"grad_norm": 1.256196669200449,
"learning_rate": 9.952951241339501e-06,
"loss": 0.2586,
"step": 281
},
{
"epoch": 0.04385692068429238,
"grad_norm": 1.6264279435141102,
"learning_rate": 9.952616309733471e-06,
"loss": 0.2138,
"step": 282
},
{
"epoch": 0.04401244167962675,
"grad_norm": 1.0771562874552736,
"learning_rate": 9.952280195868859e-06,
"loss": 0.2798,
"step": 283
},
{
"epoch": 0.04416796267496112,
"grad_norm": 1.6634031368562676,
"learning_rate": 9.951942899825906e-06,
"loss": 0.3159,
"step": 284
},
{
"epoch": 0.04432348367029549,
"grad_norm": 1.5379741925800816,
"learning_rate": 9.951604421685121e-06,
"loss": 0.3275,
"step": 285
},
{
"epoch": 0.04447900466562986,
"grad_norm": 1.4489954817264272,
"learning_rate": 9.951264761527311e-06,
"loss": 0.1989,
"step": 286
},
{
"epoch": 0.04463452566096423,
"grad_norm": 1.6369744606712289,
"learning_rate": 9.950923919433555e-06,
"loss": 0.2068,
"step": 287
},
{
"epoch": 0.0447900466562986,
"grad_norm": 1.8400125131547473,
"learning_rate": 9.950581895485214e-06,
"loss": 0.1977,
"step": 288
},
{
"epoch": 0.04494556765163297,
"grad_norm": 2.1448208174547743,
"learning_rate": 9.950238689763937e-06,
"loss": 0.1882,
"step": 289
},
{
"epoch": 0.04510108864696734,
"grad_norm": 1.1002755110550755,
"learning_rate": 9.949894302351653e-06,
"loss": 0.2422,
"step": 290
},
{
"epoch": 0.04525660964230171,
"grad_norm": 0.8557887132764603,
"learning_rate": 9.94954873333057e-06,
"loss": 0.2249,
"step": 291
},
{
"epoch": 0.04541213063763608,
"grad_norm": 1.800548229871832,
"learning_rate": 9.94920198278318e-06,
"loss": 0.2462,
"step": 292
},
{
"epoch": 0.04556765163297045,
"grad_norm": 1.077848623865367,
"learning_rate": 9.948854050792256e-06,
"loss": 0.1693,
"step": 293
},
{
"epoch": 0.045723172628304824,
"grad_norm": 1.3420617788641933,
"learning_rate": 9.948504937440857e-06,
"loss": 0.2632,
"step": 294
},
{
"epoch": 0.045878693623639194,
"grad_norm": 1.786889545891979,
"learning_rate": 9.948154642812321e-06,
"loss": 0.1812,
"step": 295
},
{
"epoch": 0.046034214618973564,
"grad_norm": 1.6608331504976344,
"learning_rate": 9.947803166990267e-06,
"loss": 0.2781,
"step": 296
},
{
"epoch": 0.046189735614307934,
"grad_norm": 1.479079510539959,
"learning_rate": 9.947450510058596e-06,
"loss": 0.2176,
"step": 297
},
{
"epoch": 0.0463452566096423,
"grad_norm": 1.1205653962227666,
"learning_rate": 9.947096672101496e-06,
"loss": 0.2189,
"step": 298
},
{
"epoch": 0.04650077760497667,
"grad_norm": 1.6903970393534788,
"learning_rate": 9.94674165320343e-06,
"loss": 0.1715,
"step": 299
},
{
"epoch": 0.04665629860031104,
"grad_norm": 3.020535469766265,
"learning_rate": 9.946385453449145e-06,
"loss": 0.2334,
"step": 300
},
{
"epoch": 0.04665629860031104,
"eval_loss": 0.23520340025424957,
"eval_runtime": 9.4655,
"eval_samples_per_second": 2.747,
"eval_steps_per_second": 0.74,
"step": 300
},
{
"epoch": 0.04681181959564541,
"grad_norm": 1.2625213750296742,
"learning_rate": 9.946028072923675e-06,
"loss": 0.2153,
"step": 301
},
{
"epoch": 0.04696734059097978,
"grad_norm": 1.326552639234392,
"learning_rate": 9.945669511712328e-06,
"loss": 0.1378,
"step": 302
},
{
"epoch": 0.04712286158631415,
"grad_norm": 1.1353660480206176,
"learning_rate": 9.945309769900698e-06,
"loss": 0.2505,
"step": 303
},
{
"epoch": 0.04727838258164852,
"grad_norm": 1.2591178630665596,
"learning_rate": 9.944948847574662e-06,
"loss": 0.1704,
"step": 304
},
{
"epoch": 0.04743390357698289,
"grad_norm": 1.3520689396483014,
"learning_rate": 9.944586744820377e-06,
"loss": 0.2324,
"step": 305
},
{
"epoch": 0.04758942457231726,
"grad_norm": 1.0116417439713241,
"learning_rate": 9.94422346172428e-06,
"loss": 0.1512,
"step": 306
},
{
"epoch": 0.04774494556765163,
"grad_norm": 1.479626380132595,
"learning_rate": 9.943858998373093e-06,
"loss": 0.2121,
"step": 307
},
{
"epoch": 0.047900466562986,
"grad_norm": 1.4227055232441543,
"learning_rate": 9.94349335485382e-06,
"loss": 0.2667,
"step": 308
},
{
"epoch": 0.04805598755832037,
"grad_norm": 1.583200032514501,
"learning_rate": 9.943126531253744e-06,
"loss": 0.289,
"step": 309
},
{
"epoch": 0.04821150855365474,
"grad_norm": 1.8189938486203978,
"learning_rate": 9.942758527660429e-06,
"loss": 0.3084,
"step": 310
},
{
"epoch": 0.04836702954898912,
"grad_norm": 1.146189412882889,
"learning_rate": 9.942389344161724e-06,
"loss": 0.1669,
"step": 311
},
{
"epoch": 0.04852255054432349,
"grad_norm": 1.547896984860253,
"learning_rate": 9.94201898084576e-06,
"loss": 0.2064,
"step": 312
},
{
"epoch": 0.048678071539657856,
"grad_norm": 1.5949794296702688,
"learning_rate": 9.941647437800946e-06,
"loss": 0.1929,
"step": 313
},
{
"epoch": 0.048833592534992226,
"grad_norm": 1.803377063241175,
"learning_rate": 9.941274715115976e-06,
"loss": 0.2791,
"step": 314
},
{
"epoch": 0.048989113530326596,
"grad_norm": 1.3837921692775779,
"learning_rate": 9.940900812879822e-06,
"loss": 0.1767,
"step": 315
},
{
"epoch": 0.049144634525660966,
"grad_norm": 1.3433932609509933,
"learning_rate": 9.940525731181741e-06,
"loss": 0.2084,
"step": 316
},
{
"epoch": 0.049300155520995335,
"grad_norm": 1.357062528683942,
"learning_rate": 9.940149470111269e-06,
"loss": 0.2047,
"step": 317
},
{
"epoch": 0.049455676516329705,
"grad_norm": 1.6539883727473814,
"learning_rate": 9.939772029758225e-06,
"loss": 0.2925,
"step": 318
},
{
"epoch": 0.049611197511664075,
"grad_norm": 1.2278880982790155,
"learning_rate": 9.939393410212713e-06,
"loss": 0.2649,
"step": 319
},
{
"epoch": 0.049766718506998445,
"grad_norm": 1.6247947056783312,
"learning_rate": 9.93901361156511e-06,
"loss": 0.3355,
"step": 320
},
{
"epoch": 0.049922239502332814,
"grad_norm": 1.1732603342184649,
"learning_rate": 9.93863263390608e-06,
"loss": 0.2603,
"step": 321
},
{
"epoch": 0.050077760497667184,
"grad_norm": 1.4022468720638315,
"learning_rate": 9.93825047732657e-06,
"loss": 0.3171,
"step": 322
},
{
"epoch": 0.050233281493001554,
"grad_norm": 1.3668475608164796,
"learning_rate": 9.937867141917804e-06,
"loss": 0.2952,
"step": 323
},
{
"epoch": 0.050388802488335924,
"grad_norm": 1.4553813573539522,
"learning_rate": 9.93748262777129e-06,
"loss": 0.1581,
"step": 324
},
{
"epoch": 0.05054432348367029,
"grad_norm": 1.9871080316775154,
"learning_rate": 9.937096934978819e-06,
"loss": 0.2368,
"step": 325
},
{
"epoch": 0.05069984447900466,
"grad_norm": 1.2900065629907207,
"learning_rate": 9.936710063632457e-06,
"loss": 0.2831,
"step": 326
},
{
"epoch": 0.05085536547433903,
"grad_norm": 0.9263549089146618,
"learning_rate": 9.93632201382456e-06,
"loss": 0.2086,
"step": 327
},
{
"epoch": 0.0510108864696734,
"grad_norm": 1.9892589335821493,
"learning_rate": 9.935932785647756e-06,
"loss": 0.2717,
"step": 328
},
{
"epoch": 0.05116640746500778,
"grad_norm": 1.1155547773179386,
"learning_rate": 9.935542379194965e-06,
"loss": 0.2731,
"step": 329
},
{
"epoch": 0.05132192846034215,
"grad_norm": 1.0330106857849222,
"learning_rate": 9.935150794559379e-06,
"loss": 0.1841,
"step": 330
},
{
"epoch": 0.05147744945567652,
"grad_norm": 1.52093348670823,
"learning_rate": 9.934758031834475e-06,
"loss": 0.2061,
"step": 331
},
{
"epoch": 0.05163297045101089,
"grad_norm": 1.1824055834479263,
"learning_rate": 9.93436409111401e-06,
"loss": 0.2613,
"step": 332
},
{
"epoch": 0.05178849144634526,
"grad_norm": 1.5329142188470473,
"learning_rate": 9.933968972492026e-06,
"loss": 0.2541,
"step": 333
},
{
"epoch": 0.05194401244167963,
"grad_norm": 1.0304282737168275,
"learning_rate": 9.933572676062841e-06,
"loss": 0.2024,
"step": 334
},
{
"epoch": 0.052099533437014,
"grad_norm": 1.1252175849664872,
"learning_rate": 9.933175201921057e-06,
"loss": 0.201,
"step": 335
},
{
"epoch": 0.05225505443234837,
"grad_norm": 1.6828294804696526,
"learning_rate": 9.932776550161559e-06,
"loss": 0.2298,
"step": 336
},
{
"epoch": 0.05241057542768274,
"grad_norm": 1.2831001226274117,
"learning_rate": 9.932376720879503e-06,
"loss": 0.2352,
"step": 337
},
{
"epoch": 0.05256609642301711,
"grad_norm": 2.152789286567263,
"learning_rate": 9.931975714170345e-06,
"loss": 0.3382,
"step": 338
},
{
"epoch": 0.05272161741835148,
"grad_norm": 1.702657664273862,
"learning_rate": 9.931573530129803e-06,
"loss": 0.2368,
"step": 339
},
{
"epoch": 0.05287713841368585,
"grad_norm": 2.05056832602719,
"learning_rate": 9.931170168853886e-06,
"loss": 0.2992,
"step": 340
},
{
"epoch": 0.053032659409020216,
"grad_norm": 1.5775290622934088,
"learning_rate": 9.930765630438882e-06,
"loss": 0.212,
"step": 341
},
{
"epoch": 0.053188180404354586,
"grad_norm": 1.166034186090071,
"learning_rate": 9.93035991498136e-06,
"loss": 0.2081,
"step": 342
},
{
"epoch": 0.053343701399688956,
"grad_norm": 1.4555896083998001,
"learning_rate": 9.929953022578171e-06,
"loss": 0.1857,
"step": 343
},
{
"epoch": 0.053499222395023326,
"grad_norm": 1.343927833342108,
"learning_rate": 9.929544953326445e-06,
"loss": 0.2691,
"step": 344
},
{
"epoch": 0.053654743390357695,
"grad_norm": 1.8890642830307378,
"learning_rate": 9.929135707323592e-06,
"loss": 0.1967,
"step": 345
},
{
"epoch": 0.053810264385692065,
"grad_norm": 1.4990308791372666,
"learning_rate": 9.928725284667308e-06,
"loss": 0.1774,
"step": 346
},
{
"epoch": 0.05396578538102644,
"grad_norm": 1.615806257387967,
"learning_rate": 9.928313685455565e-06,
"loss": 0.2234,
"step": 347
},
{
"epoch": 0.05412130637636081,
"grad_norm": 1.3758078431089233,
"learning_rate": 9.927900909786617e-06,
"loss": 0.259,
"step": 348
},
{
"epoch": 0.05427682737169518,
"grad_norm": 0.855435278326685,
"learning_rate": 9.927486957759001e-06,
"loss": 0.2068,
"step": 349
},
{
"epoch": 0.05443234836702955,
"grad_norm": 1.5217482862634222,
"learning_rate": 9.927071829471531e-06,
"loss": 0.1551,
"step": 350
},
{
"epoch": 0.05458786936236392,
"grad_norm": 1.5111503264835533,
"learning_rate": 9.926655525023304e-06,
"loss": 0.2599,
"step": 351
},
{
"epoch": 0.05474339035769829,
"grad_norm": 0.8967930843733002,
"learning_rate": 9.9262380445137e-06,
"loss": 0.169,
"step": 352
},
{
"epoch": 0.05489891135303266,
"grad_norm": 1.9464375941159884,
"learning_rate": 9.925819388042374e-06,
"loss": 0.2983,
"step": 353
},
{
"epoch": 0.05505443234836703,
"grad_norm": 1.574189824318599,
"learning_rate": 9.925399555709269e-06,
"loss": 0.1937,
"step": 354
},
{
"epoch": 0.0552099533437014,
"grad_norm": 3.1438752373638232,
"learning_rate": 9.924978547614604e-06,
"loss": 0.2181,
"step": 355
},
{
"epoch": 0.05536547433903577,
"grad_norm": 1.6348127637741856,
"learning_rate": 9.924556363858877e-06,
"loss": 0.1847,
"step": 356
},
{
"epoch": 0.05552099533437014,
"grad_norm": 1.724455721347507,
"learning_rate": 9.92413300454287e-06,
"loss": 0.1924,
"step": 357
},
{
"epoch": 0.05567651632970451,
"grad_norm": 0.9215074637606898,
"learning_rate": 9.923708469767645e-06,
"loss": 0.1484,
"step": 358
},
{
"epoch": 0.05583203732503888,
"grad_norm": 1.0048144642733263,
"learning_rate": 9.923282759634547e-06,
"loss": 0.139,
"step": 359
},
{
"epoch": 0.05598755832037325,
"grad_norm": 1.6563473574979655,
"learning_rate": 9.922855874245197e-06,
"loss": 0.2462,
"step": 360
},
{
"epoch": 0.05614307931570762,
"grad_norm": 1.0753481257964308,
"learning_rate": 9.922427813701495e-06,
"loss": 0.2543,
"step": 361
},
{
"epoch": 0.05629860031104199,
"grad_norm": 1.1607722120362791,
"learning_rate": 9.92199857810563e-06,
"loss": 0.1919,
"step": 362
},
{
"epoch": 0.05645412130637636,
"grad_norm": 1.0235707105593828,
"learning_rate": 9.921568167560065e-06,
"loss": 0.1851,
"step": 363
},
{
"epoch": 0.05660964230171073,
"grad_norm": 1.443489161948352,
"learning_rate": 9.921136582167545e-06,
"loss": 0.2566,
"step": 364
},
{
"epoch": 0.056765163297045104,
"grad_norm": 1.1047251832726421,
"learning_rate": 9.920703822031094e-06,
"loss": 0.2268,
"step": 365
},
{
"epoch": 0.056920684292379474,
"grad_norm": 1.8071891113724519,
"learning_rate": 9.92026988725402e-06,
"loss": 0.286,
"step": 366
},
{
"epoch": 0.057076205287713844,
"grad_norm": 1.127534519608966,
"learning_rate": 9.919834777939908e-06,
"loss": 0.2078,
"step": 367
},
{
"epoch": 0.05723172628304821,
"grad_norm": 1.3537981754957027,
"learning_rate": 9.919398494192625e-06,
"loss": 0.2574,
"step": 368
},
{
"epoch": 0.05738724727838258,
"grad_norm": 1.5740289483284484,
"learning_rate": 9.918961036116317e-06,
"loss": 0.2168,
"step": 369
},
{
"epoch": 0.05754276827371695,
"grad_norm": 2.1521943324617854,
"learning_rate": 9.918522403815414e-06,
"loss": 0.5388,
"step": 370
},
{
"epoch": 0.05769828926905132,
"grad_norm": 0.9621156840694527,
"learning_rate": 9.918082597394621e-06,
"loss": 0.2206,
"step": 371
},
{
"epoch": 0.05785381026438569,
"grad_norm": 0.8374473543740336,
"learning_rate": 9.91764161695893e-06,
"loss": 0.1931,
"step": 372
},
{
"epoch": 0.05800933125972006,
"grad_norm": 1.594565893913882,
"learning_rate": 9.917199462613601e-06,
"loss": 0.2664,
"step": 373
},
{
"epoch": 0.05816485225505443,
"grad_norm": 2.539276249800021,
"learning_rate": 9.916756134464191e-06,
"loss": 0.3158,
"step": 374
},
{
"epoch": 0.0583203732503888,
"grad_norm": 1.0461962066836652,
"learning_rate": 9.916311632616525e-06,
"loss": 0.2489,
"step": 375
},
{
"epoch": 0.05847589424572317,
"grad_norm": 1.1340444520472663,
"learning_rate": 9.915865957176709e-06,
"loss": 0.2718,
"step": 376
},
{
"epoch": 0.05863141524105754,
"grad_norm": 1.467480205738983,
"learning_rate": 9.915419108251138e-06,
"loss": 0.1753,
"step": 377
},
{
"epoch": 0.05878693623639191,
"grad_norm": 1.4394725259816188,
"learning_rate": 9.914971085946476e-06,
"loss": 0.1973,
"step": 378
},
{
"epoch": 0.05894245723172628,
"grad_norm": 1.2534669496284443,
"learning_rate": 9.914521890369676e-06,
"loss": 0.2127,
"step": 379
},
{
"epoch": 0.05909797822706065,
"grad_norm": 1.282361137311585,
"learning_rate": 9.914071521627964e-06,
"loss": 0.1881,
"step": 380
},
{
"epoch": 0.05925349922239502,
"grad_norm": 1.7744186005576332,
"learning_rate": 9.913619979828851e-06,
"loss": 0.1875,
"step": 381
},
{
"epoch": 0.0594090202177294,
"grad_norm": 1.5020250209663002,
"learning_rate": 9.913167265080126e-06,
"loss": 0.1684,
"step": 382
},
{
"epoch": 0.05956454121306377,
"grad_norm": 1.259074929221576,
"learning_rate": 9.912713377489858e-06,
"loss": 0.2268,
"step": 383
},
{
"epoch": 0.059720062208398136,
"grad_norm": 1.7761373693512776,
"learning_rate": 9.912258317166398e-06,
"loss": 0.223,
"step": 384
},
{
"epoch": 0.059875583203732506,
"grad_norm": 2.38865888975245,
"learning_rate": 9.911802084218374e-06,
"loss": 0.2401,
"step": 385
},
{
"epoch": 0.060031104199066876,
"grad_norm": 0.8949382740792282,
"learning_rate": 9.911344678754694e-06,
"loss": 0.1922,
"step": 386
},
{
"epoch": 0.060186625194401246,
"grad_norm": 1.5889982876131292,
"learning_rate": 9.910886100884547e-06,
"loss": 0.1943,
"step": 387
},
{
"epoch": 0.060342146189735615,
"grad_norm": 1.4147870380604834,
"learning_rate": 9.910426350717404e-06,
"loss": 0.1812,
"step": 388
},
{
"epoch": 0.060497667185069985,
"grad_norm": 1.8231195124047115,
"learning_rate": 9.909965428363012e-06,
"loss": 0.2312,
"step": 389
},
{
"epoch": 0.060653188180404355,
"grad_norm": 1.8874621933930384,
"learning_rate": 9.909503333931402e-06,
"loss": 0.287,
"step": 390
},
{
"epoch": 0.060808709175738725,
"grad_norm": 1.7665216636429069,
"learning_rate": 9.90904006753288e-06,
"loss": 0.2185,
"step": 391
},
{
"epoch": 0.060964230171073094,
"grad_norm": 1.256357590139898,
"learning_rate": 9.908575629278034e-06,
"loss": 0.1919,
"step": 392
},
{
"epoch": 0.061119751166407464,
"grad_norm": 4.375967721306914,
"learning_rate": 9.908110019277735e-06,
"loss": 0.1781,
"step": 393
},
{
"epoch": 0.061275272161741834,
"grad_norm": 1.4286735960699084,
"learning_rate": 9.907643237643127e-06,
"loss": 0.253,
"step": 394
},
{
"epoch": 0.061430793157076204,
"grad_norm": 1.6229980414007696,
"learning_rate": 9.90717528448564e-06,
"loss": 0.2598,
"step": 395
},
{
"epoch": 0.06158631415241057,
"grad_norm": 1.654127403226531,
"learning_rate": 9.906706159916977e-06,
"loss": 0.2677,
"step": 396
},
{
"epoch": 0.06174183514774494,
"grad_norm": 0.7489317566220969,
"learning_rate": 9.90623586404913e-06,
"loss": 0.1595,
"step": 397
},
{
"epoch": 0.06189735614307931,
"grad_norm": 1.0243584995437751,
"learning_rate": 9.90576439699436e-06,
"loss": 0.2089,
"step": 398
},
{
"epoch": 0.06205287713841368,
"grad_norm": 1.2843274122650117,
"learning_rate": 9.905291758865217e-06,
"loss": 0.2458,
"step": 399
},
{
"epoch": 0.06220839813374806,
"grad_norm": 1.482986812845832,
"learning_rate": 9.904817949774524e-06,
"loss": 0.2611,
"step": 400
},
{
"epoch": 0.06220839813374806,
"eval_loss": 0.23184187710285187,
"eval_runtime": 9.4466,
"eval_samples_per_second": 2.752,
"eval_steps_per_second": 0.741,
"step": 400
},
{
"epoch": 0.06236391912908243,
"grad_norm": 2.01899839511783,
"learning_rate": 9.904342969835385e-06,
"loss": 0.2178,
"step": 401
},
{
"epoch": 0.06251944012441679,
"grad_norm": 1.4244669635257896,
"learning_rate": 9.903866819161188e-06,
"loss": 0.2321,
"step": 402
},
{
"epoch": 0.06267496111975117,
"grad_norm": 1.7090867256976423,
"learning_rate": 9.903389497865593e-06,
"loss": 0.2071,
"step": 403
},
{
"epoch": 0.06283048211508553,
"grad_norm": 1.305136754505658,
"learning_rate": 9.902911006062543e-06,
"loss": 0.1899,
"step": 404
},
{
"epoch": 0.06298600311041991,
"grad_norm": 1.0188677304744835,
"learning_rate": 9.902431343866266e-06,
"loss": 0.2457,
"step": 405
},
{
"epoch": 0.06314152410575427,
"grad_norm": 1.6042710170666996,
"learning_rate": 9.901950511391259e-06,
"loss": 0.1894,
"step": 406
},
{
"epoch": 0.06329704510108865,
"grad_norm": 1.3017493690494788,
"learning_rate": 9.901468508752304e-06,
"loss": 0.2908,
"step": 407
},
{
"epoch": 0.06345256609642301,
"grad_norm": 1.3230633029674432,
"learning_rate": 9.900985336064463e-06,
"loss": 0.2786,
"step": 408
},
{
"epoch": 0.06360808709175739,
"grad_norm": 1.5120257860737862,
"learning_rate": 9.900500993443076e-06,
"loss": 0.2516,
"step": 409
},
{
"epoch": 0.06376360808709176,
"grad_norm": 1.004582433223966,
"learning_rate": 9.900015481003762e-06,
"loss": 0.2232,
"step": 410
},
{
"epoch": 0.06391912908242613,
"grad_norm": 1.399115724283105,
"learning_rate": 9.89952879886242e-06,
"loss": 0.2763,
"step": 411
},
{
"epoch": 0.0640746500777605,
"grad_norm": 1.816764777159624,
"learning_rate": 9.899040947135225e-06,
"loss": 0.2913,
"step": 412
},
{
"epoch": 0.06423017107309487,
"grad_norm": 1.1949304261760583,
"learning_rate": 9.898551925938638e-06,
"loss": 0.191,
"step": 413
},
{
"epoch": 0.06438569206842924,
"grad_norm": 1.6899096837752585,
"learning_rate": 9.898061735389395e-06,
"loss": 0.2314,
"step": 414
},
{
"epoch": 0.0645412130637636,
"grad_norm": 1.6400875402483213,
"learning_rate": 9.897570375604508e-06,
"loss": 0.1985,
"step": 415
},
{
"epoch": 0.06469673405909798,
"grad_norm": 1.1700291435704913,
"learning_rate": 9.897077846701274e-06,
"loss": 0.2178,
"step": 416
},
{
"epoch": 0.06485225505443235,
"grad_norm": 1.6396026705753728,
"learning_rate": 9.896584148797265e-06,
"loss": 0.2443,
"step": 417
},
{
"epoch": 0.06500777604976672,
"grad_norm": 0.8511496035113331,
"learning_rate": 9.896089282010338e-06,
"loss": 0.1619,
"step": 418
},
{
"epoch": 0.06516329704510108,
"grad_norm": 1.3924064844406538,
"learning_rate": 9.895593246458617e-06,
"loss": 0.2021,
"step": 419
},
{
"epoch": 0.06531881804043546,
"grad_norm": 0.8605197503722029,
"learning_rate": 9.895096042260517e-06,
"loss": 0.1628,
"step": 420
},
{
"epoch": 0.06547433903576982,
"grad_norm": 1.3908417494412908,
"learning_rate": 9.894597669534729e-06,
"loss": 0.2054,
"step": 421
},
{
"epoch": 0.0656298600311042,
"grad_norm": 1.445540354985538,
"learning_rate": 9.894098128400219e-06,
"loss": 0.2197,
"step": 422
},
{
"epoch": 0.06578538102643856,
"grad_norm": 1.3103752658839474,
"learning_rate": 9.893597418976234e-06,
"loss": 0.2297,
"step": 423
},
{
"epoch": 0.06594090202177294,
"grad_norm": 1.0497805770986521,
"learning_rate": 9.893095541382304e-06,
"loss": 0.1747,
"step": 424
},
{
"epoch": 0.0660964230171073,
"grad_norm": 1.513640843523071,
"learning_rate": 9.892592495738229e-06,
"loss": 0.1754,
"step": 425
},
{
"epoch": 0.06625194401244168,
"grad_norm": 1.0493517604475748,
"learning_rate": 9.892088282164098e-06,
"loss": 0.2586,
"step": 426
},
{
"epoch": 0.06640746500777606,
"grad_norm": 1.4678962231044086,
"learning_rate": 9.89158290078027e-06,
"loss": 0.2932,
"step": 427
},
{
"epoch": 0.06656298600311042,
"grad_norm": 1.6765991678498569,
"learning_rate": 9.891076351707389e-06,
"loss": 0.2116,
"step": 428
},
{
"epoch": 0.0667185069984448,
"grad_norm": 1.4655721822686016,
"learning_rate": 9.890568635066373e-06,
"loss": 0.1543,
"step": 429
},
{
"epoch": 0.06687402799377916,
"grad_norm": 1.6313534003780414,
"learning_rate": 9.890059750978425e-06,
"loss": 0.1571,
"step": 430
},
{
"epoch": 0.06702954898911354,
"grad_norm": 1.0261848775525118,
"learning_rate": 9.889549699565017e-06,
"loss": 0.2865,
"step": 431
},
{
"epoch": 0.0671850699844479,
"grad_norm": 1.5225780156038968,
"learning_rate": 9.88903848094791e-06,
"loss": 0.1914,
"step": 432
},
{
"epoch": 0.06734059097978227,
"grad_norm": 1.3350387169313882,
"learning_rate": 9.888526095249138e-06,
"loss": 0.2754,
"step": 433
},
{
"epoch": 0.06749611197511664,
"grad_norm": 1.192180411270206,
"learning_rate": 9.888012542591014e-06,
"loss": 0.1974,
"step": 434
},
{
"epoch": 0.06765163297045101,
"grad_norm": 1.3005497242232493,
"learning_rate": 9.88749782309613e-06,
"loss": 0.1903,
"step": 435
},
{
"epoch": 0.06780715396578538,
"grad_norm": 1.1288456938448086,
"learning_rate": 9.88698193688736e-06,
"loss": 0.2333,
"step": 436
},
{
"epoch": 0.06796267496111975,
"grad_norm": 1.130396483559975,
"learning_rate": 9.886464884087846e-06,
"loss": 0.2674,
"step": 437
},
{
"epoch": 0.06811819595645412,
"grad_norm": 0.9035948769600225,
"learning_rate": 9.885946664821021e-06,
"loss": 0.1864,
"step": 438
},
{
"epoch": 0.0682737169517885,
"grad_norm": 1.1233476167867031,
"learning_rate": 9.885427279210592e-06,
"loss": 0.1787,
"step": 439
},
{
"epoch": 0.06842923794712286,
"grad_norm": 1.2410015017602511,
"learning_rate": 9.88490672738054e-06,
"loss": 0.2509,
"step": 440
},
{
"epoch": 0.06858475894245723,
"grad_norm": 1.3429869818046247,
"learning_rate": 9.884385009455131e-06,
"loss": 0.2811,
"step": 441
},
{
"epoch": 0.0687402799377916,
"grad_norm": 0.7587532198438675,
"learning_rate": 9.883862125558904e-06,
"loss": 0.1781,
"step": 442
},
{
"epoch": 0.06889580093312597,
"grad_norm": 0.9782244567957874,
"learning_rate": 9.88333807581668e-06,
"loss": 0.1891,
"step": 443
},
{
"epoch": 0.06905132192846034,
"grad_norm": 1.8354472673215871,
"learning_rate": 9.882812860353558e-06,
"loss": 0.2372,
"step": 444
},
{
"epoch": 0.06920684292379471,
"grad_norm": 1.0210293095436775,
"learning_rate": 9.882286479294911e-06,
"loss": 0.1988,
"step": 445
},
{
"epoch": 0.06936236391912909,
"grad_norm": 2.117567357062213,
"learning_rate": 9.881758932766398e-06,
"loss": 0.1992,
"step": 446
},
{
"epoch": 0.06951788491446345,
"grad_norm": 1.1644685693150085,
"learning_rate": 9.881230220893948e-06,
"loss": 0.18,
"step": 447
},
{
"epoch": 0.06967340590979783,
"grad_norm": 1.1209275418337545,
"learning_rate": 9.880700343803773e-06,
"loss": 0.3069,
"step": 448
},
{
"epoch": 0.06982892690513219,
"grad_norm": 1.155686416296927,
"learning_rate": 9.880169301622362e-06,
"loss": 0.1744,
"step": 449
},
{
"epoch": 0.06998444790046657,
"grad_norm": 0.9709514091501408,
"learning_rate": 9.879637094476482e-06,
"loss": 0.1871,
"step": 450
},
{
"epoch": 0.07013996889580093,
"grad_norm": 1.1219093494884402,
"learning_rate": 9.87910372249318e-06,
"loss": 0.1932,
"step": 451
},
{
"epoch": 0.07029548989113531,
"grad_norm": 1.9094748023939434,
"learning_rate": 9.878569185799778e-06,
"loss": 0.2339,
"step": 452
},
{
"epoch": 0.07045101088646967,
"grad_norm": 1.3264334862739553,
"learning_rate": 9.878033484523876e-06,
"loss": 0.1407,
"step": 453
},
{
"epoch": 0.07060653188180405,
"grad_norm": 1.667180383137504,
"learning_rate": 9.877496618793356e-06,
"loss": 0.1867,
"step": 454
},
{
"epoch": 0.07076205287713841,
"grad_norm": 1.0486860196671894,
"learning_rate": 9.876958588736371e-06,
"loss": 0.1683,
"step": 455
},
{
"epoch": 0.07091757387247279,
"grad_norm": 1.2507603637095628,
"learning_rate": 9.876419394481363e-06,
"loss": 0.1958,
"step": 456
},
{
"epoch": 0.07107309486780715,
"grad_norm": 1.7806763122908775,
"learning_rate": 9.87587903615704e-06,
"loss": 0.2466,
"step": 457
},
{
"epoch": 0.07122861586314153,
"grad_norm": 1.0570385231053188,
"learning_rate": 9.875337513892395e-06,
"loss": 0.1336,
"step": 458
},
{
"epoch": 0.07138413685847589,
"grad_norm": 1.8093621923009064,
"learning_rate": 9.874794827816696e-06,
"loss": 0.245,
"step": 459
},
{
"epoch": 0.07153965785381027,
"grad_norm": 1.6343174119313473,
"learning_rate": 9.874250978059489e-06,
"loss": 0.1878,
"step": 460
},
{
"epoch": 0.07169517884914463,
"grad_norm": 1.2474757406216732,
"learning_rate": 9.873705964750603e-06,
"loss": 0.201,
"step": 461
},
{
"epoch": 0.071850699844479,
"grad_norm": 0.9854370189019162,
"learning_rate": 9.873159788020135e-06,
"loss": 0.1572,
"step": 462
},
{
"epoch": 0.07200622083981338,
"grad_norm": 1.2046716423202313,
"learning_rate": 9.872612447998466e-06,
"loss": 0.1644,
"step": 463
},
{
"epoch": 0.07216174183514774,
"grad_norm": 1.6657683984708445,
"learning_rate": 9.872063944816257e-06,
"loss": 0.2026,
"step": 464
},
{
"epoch": 0.07231726283048212,
"grad_norm": 1.6319780353610651,
"learning_rate": 9.871514278604439e-06,
"loss": 0.2361,
"step": 465
},
{
"epoch": 0.07247278382581648,
"grad_norm": 0.930626270347552,
"learning_rate": 9.870963449494228e-06,
"loss": 0.2334,
"step": 466
},
{
"epoch": 0.07262830482115086,
"grad_norm": 1.7347785771237878,
"learning_rate": 9.870411457617115e-06,
"loss": 0.3121,
"step": 467
},
{
"epoch": 0.07278382581648522,
"grad_norm": 1.6861297399111428,
"learning_rate": 9.869858303104864e-06,
"loss": 0.2234,
"step": 468
},
{
"epoch": 0.0729393468118196,
"grad_norm": 2.2175613812233856,
"learning_rate": 9.869303986089525e-06,
"loss": 0.215,
"step": 469
},
{
"epoch": 0.07309486780715396,
"grad_norm": 1.2151103786584494,
"learning_rate": 9.86874850670342e-06,
"loss": 0.143,
"step": 470
},
{
"epoch": 0.07325038880248834,
"grad_norm": 1.8347498082665927,
"learning_rate": 9.868191865079149e-06,
"loss": 0.1847,
"step": 471
},
{
"epoch": 0.0734059097978227,
"grad_norm": 0.7662001443118179,
"learning_rate": 9.867634061349592e-06,
"loss": 0.2132,
"step": 472
},
{
"epoch": 0.07356143079315708,
"grad_norm": 1.127229878211817,
"learning_rate": 9.8670750956479e-06,
"loss": 0.2405,
"step": 473
},
{
"epoch": 0.07371695178849144,
"grad_norm": 0.8919765028163983,
"learning_rate": 9.866514968107511e-06,
"loss": 0.2187,
"step": 474
},
{
"epoch": 0.07387247278382582,
"grad_norm": 0.8318099634868261,
"learning_rate": 9.865953678862133e-06,
"loss": 0.149,
"step": 475
},
{
"epoch": 0.07402799377916018,
"grad_norm": 1.577340616348031,
"learning_rate": 9.865391228045753e-06,
"loss": 0.2319,
"step": 476
},
{
"epoch": 0.07418351477449456,
"grad_norm": 1.116181816359047,
"learning_rate": 9.864827615792637e-06,
"loss": 0.1901,
"step": 477
},
{
"epoch": 0.07433903576982892,
"grad_norm": 1.105109643192386,
"learning_rate": 9.864262842237327e-06,
"loss": 0.2011,
"step": 478
},
{
"epoch": 0.0744945567651633,
"grad_norm": 1.9701207318396636,
"learning_rate": 9.863696907514641e-06,
"loss": 0.2409,
"step": 479
},
{
"epoch": 0.07465007776049767,
"grad_norm": 2.2498632028053507,
"learning_rate": 9.863129811759678e-06,
"loss": 0.3829,
"step": 480
},
{
"epoch": 0.07480559875583204,
"grad_norm": 1.1224194434111838,
"learning_rate": 9.86256155510781e-06,
"loss": 0.2114,
"step": 481
},
{
"epoch": 0.07496111975116641,
"grad_norm": 1.5539407325523458,
"learning_rate": 9.861992137694687e-06,
"loss": 0.1976,
"step": 482
},
{
"epoch": 0.07511664074650078,
"grad_norm": 1.962092802549792,
"learning_rate": 9.86142155965624e-06,
"loss": 0.2725,
"step": 483
},
{
"epoch": 0.07527216174183515,
"grad_norm": 0.8983695148666645,
"learning_rate": 9.860849821128668e-06,
"loss": 0.154,
"step": 484
},
{
"epoch": 0.07542768273716952,
"grad_norm": 1.398592267234838,
"learning_rate": 9.86027692224846e-06,
"loss": 0.1497,
"step": 485
},
{
"epoch": 0.07558320373250389,
"grad_norm": 1.0403186420901969,
"learning_rate": 9.859702863152372e-06,
"loss": 0.1936,
"step": 486
},
{
"epoch": 0.07573872472783826,
"grad_norm": 0.7470818354767621,
"learning_rate": 9.859127643977438e-06,
"loss": 0.1523,
"step": 487
},
{
"epoch": 0.07589424572317263,
"grad_norm": 1.2067693893481815,
"learning_rate": 9.858551264860972e-06,
"loss": 0.3168,
"step": 488
},
{
"epoch": 0.076049766718507,
"grad_norm": 1.5295551443098423,
"learning_rate": 9.857973725940565e-06,
"loss": 0.2194,
"step": 489
},
{
"epoch": 0.07620528771384137,
"grad_norm": 1.618418958541224,
"learning_rate": 9.857395027354085e-06,
"loss": 0.2209,
"step": 490
},
{
"epoch": 0.07636080870917573,
"grad_norm": 1.1696631104347366,
"learning_rate": 9.856815169239671e-06,
"loss": 0.1993,
"step": 491
},
{
"epoch": 0.07651632970451011,
"grad_norm": 1.4918786793556023,
"learning_rate": 9.856234151735744e-06,
"loss": 0.2657,
"step": 492
},
{
"epoch": 0.07667185069984447,
"grad_norm": 1.3100404095494855,
"learning_rate": 9.855651974981005e-06,
"loss": 0.2832,
"step": 493
},
{
"epoch": 0.07682737169517885,
"grad_norm": 13.98784357990924,
"learning_rate": 9.855068639114425e-06,
"loss": 0.2488,
"step": 494
},
{
"epoch": 0.07698289269051321,
"grad_norm": 1.3956332181045448,
"learning_rate": 9.854484144275254e-06,
"loss": 0.225,
"step": 495
},
{
"epoch": 0.07713841368584759,
"grad_norm": 1.1858198275947147,
"learning_rate": 9.853898490603018e-06,
"loss": 0.2041,
"step": 496
},
{
"epoch": 0.07729393468118195,
"grad_norm": 0.765411378364051,
"learning_rate": 9.853311678237524e-06,
"loss": 0.1492,
"step": 497
},
{
"epoch": 0.07744945567651633,
"grad_norm": 1.2288325537770441,
"learning_rate": 9.85272370731885e-06,
"loss": 0.1773,
"step": 498
},
{
"epoch": 0.0776049766718507,
"grad_norm": 1.3901203640607709,
"learning_rate": 9.852134577987353e-06,
"loss": 0.2091,
"step": 499
},
{
"epoch": 0.07776049766718507,
"grad_norm": 1.5991626946866644,
"learning_rate": 9.85154429038367e-06,
"loss": 0.2485,
"step": 500
},
{
"epoch": 0.07776049766718507,
"eval_loss": 0.22800126671791077,
"eval_runtime": 9.4446,
"eval_samples_per_second": 2.753,
"eval_steps_per_second": 0.741,
"step": 500
},
{
"epoch": 0.07791601866251945,
"grad_norm": 0.9946822389547595,
"learning_rate": 9.850952844648705e-06,
"loss": 0.2324,
"step": 501
},
{
"epoch": 0.07807153965785381,
"grad_norm": 1.088817573789371,
"learning_rate": 9.850360240923647e-06,
"loss": 0.1813,
"step": 502
},
{
"epoch": 0.07822706065318819,
"grad_norm": 5.945777339639669,
"learning_rate": 9.849766479349959e-06,
"loss": 0.1976,
"step": 503
},
{
"epoch": 0.07838258164852255,
"grad_norm": 0.8593394406117729,
"learning_rate": 9.84917156006938e-06,
"loss": 0.2474,
"step": 504
},
{
"epoch": 0.07853810264385692,
"grad_norm": 1.3930666133589364,
"learning_rate": 9.848575483223925e-06,
"loss": 0.215,
"step": 505
},
{
"epoch": 0.07869362363919129,
"grad_norm": 1.6493288101835173,
"learning_rate": 9.84797824895589e-06,
"loss": 0.303,
"step": 506
},
{
"epoch": 0.07884914463452566,
"grad_norm": 1.1106903817577367,
"learning_rate": 9.847379857407835e-06,
"loss": 0.1654,
"step": 507
},
{
"epoch": 0.07900466562986003,
"grad_norm": 1.166896696404847,
"learning_rate": 9.846780308722612e-06,
"loss": 0.2046,
"step": 508
},
{
"epoch": 0.0791601866251944,
"grad_norm": 1.7221901123414272,
"learning_rate": 9.846179603043338e-06,
"loss": 0.2543,
"step": 509
},
{
"epoch": 0.07931570762052877,
"grad_norm": 1.0398664154595585,
"learning_rate": 9.845577740513409e-06,
"loss": 0.2616,
"step": 510
},
{
"epoch": 0.07947122861586314,
"grad_norm": 1.2062182369254026,
"learning_rate": 9.8449747212765e-06,
"loss": 0.1641,
"step": 511
},
{
"epoch": 0.0796267496111975,
"grad_norm": 1.3859318575453086,
"learning_rate": 9.84437054547656e-06,
"loss": 0.193,
"step": 512
},
{
"epoch": 0.07978227060653188,
"grad_norm": 3.5056235741823523,
"learning_rate": 9.843765213257814e-06,
"loss": 0.2399,
"step": 513
},
{
"epoch": 0.07993779160186625,
"grad_norm": 1.2578551416521373,
"learning_rate": 9.843158724764762e-06,
"loss": 0.2177,
"step": 514
},
{
"epoch": 0.08009331259720062,
"grad_norm": 1.4118043035204642,
"learning_rate": 9.842551080142182e-06,
"loss": 0.21,
"step": 515
},
{
"epoch": 0.080248833592535,
"grad_norm": 1.1155160124053434,
"learning_rate": 9.841942279535128e-06,
"loss": 0.2128,
"step": 516
},
{
"epoch": 0.08040435458786936,
"grad_norm": 1.0287833439256027,
"learning_rate": 9.84133232308893e-06,
"loss": 0.1846,
"step": 517
},
{
"epoch": 0.08055987558320374,
"grad_norm": 2.2894965228305377,
"learning_rate": 9.84072121094919e-06,
"loss": 0.1814,
"step": 518
},
{
"epoch": 0.0807153965785381,
"grad_norm": 1.345886098139959,
"learning_rate": 9.84010894326179e-06,
"loss": 0.1912,
"step": 519
},
{
"epoch": 0.08087091757387248,
"grad_norm": 1.9234609876851483,
"learning_rate": 9.83949552017289e-06,
"loss": 0.2982,
"step": 520
},
{
"epoch": 0.08102643856920684,
"grad_norm": 1.2452886345823744,
"learning_rate": 9.83888094182892e-06,
"loss": 0.2144,
"step": 521
},
{
"epoch": 0.08118195956454122,
"grad_norm": 1.2711995935698062,
"learning_rate": 9.838265208376584e-06,
"loss": 0.1799,
"step": 522
},
{
"epoch": 0.08133748055987558,
"grad_norm": 1.0755729955519457,
"learning_rate": 9.837648319962876e-06,
"loss": 0.3311,
"step": 523
},
{
"epoch": 0.08149300155520996,
"grad_norm": 1.5025152130217085,
"learning_rate": 9.837030276735049e-06,
"loss": 0.203,
"step": 524
},
{
"epoch": 0.08164852255054432,
"grad_norm": 1.4271542860149822,
"learning_rate": 9.83641107884064e-06,
"loss": 0.2055,
"step": 525
},
{
"epoch": 0.0818040435458787,
"grad_norm": 1.1896665999932865,
"learning_rate": 9.83579072642746e-06,
"loss": 0.2191,
"step": 526
},
{
"epoch": 0.08195956454121306,
"grad_norm": 1.6391797544527267,
"learning_rate": 9.835169219643597e-06,
"loss": 0.2164,
"step": 527
},
{
"epoch": 0.08211508553654744,
"grad_norm": 1.3905263994766632,
"learning_rate": 9.834546558637412e-06,
"loss": 0.2188,
"step": 528
},
{
"epoch": 0.0822706065318818,
"grad_norm": 1.1325886941547982,
"learning_rate": 9.833922743557545e-06,
"loss": 0.3596,
"step": 529
},
{
"epoch": 0.08242612752721618,
"grad_norm": 1.58458236573862,
"learning_rate": 9.833297774552905e-06,
"loss": 0.2725,
"step": 530
},
{
"epoch": 0.08258164852255054,
"grad_norm": 1.2630288499628133,
"learning_rate": 9.832671651772685e-06,
"loss": 0.3327,
"step": 531
},
{
"epoch": 0.08273716951788491,
"grad_norm": 1.1472998381036559,
"learning_rate": 9.832044375366347e-06,
"loss": 0.1758,
"step": 532
},
{
"epoch": 0.08289269051321929,
"grad_norm": 1.148175513948538,
"learning_rate": 9.831415945483634e-06,
"loss": 0.189,
"step": 533
},
{
"epoch": 0.08304821150855365,
"grad_norm": 1.2010115460022994,
"learning_rate": 9.830786362274556e-06,
"loss": 0.2065,
"step": 534
},
{
"epoch": 0.08320373250388803,
"grad_norm": 1.357353814240526,
"learning_rate": 9.830155625889406e-06,
"loss": 0.1505,
"step": 535
},
{
"epoch": 0.0833592534992224,
"grad_norm": 1.2541527971168078,
"learning_rate": 9.829523736478748e-06,
"loss": 0.2309,
"step": 536
},
{
"epoch": 0.08351477449455677,
"grad_norm": 1.0453169347517781,
"learning_rate": 9.828890694193425e-06,
"loss": 0.1593,
"step": 537
},
{
"epoch": 0.08367029548989113,
"grad_norm": 1.256435896986176,
"learning_rate": 9.828256499184553e-06,
"loss": 0.2081,
"step": 538
},
{
"epoch": 0.08382581648522551,
"grad_norm": 1.4617851677608784,
"learning_rate": 9.827621151603522e-06,
"loss": 0.2181,
"step": 539
},
{
"epoch": 0.08398133748055987,
"grad_norm": 2.512069587946666,
"learning_rate": 9.826984651601998e-06,
"loss": 0.4003,
"step": 540
},
{
"epoch": 0.08413685847589425,
"grad_norm": 1.1168842922612399,
"learning_rate": 9.826346999331923e-06,
"loss": 0.2823,
"step": 541
},
{
"epoch": 0.08429237947122861,
"grad_norm": 1.0417831453973136,
"learning_rate": 9.825708194945514e-06,
"loss": 0.1889,
"step": 542
},
{
"epoch": 0.08444790046656299,
"grad_norm": 0.9096481087872343,
"learning_rate": 9.82506823859526e-06,
"loss": 0.2351,
"step": 543
},
{
"epoch": 0.08460342146189735,
"grad_norm": 1.0430082881953087,
"learning_rate": 9.824427130433932e-06,
"loss": 0.1953,
"step": 544
},
{
"epoch": 0.08475894245723173,
"grad_norm": 0.6608850743713712,
"learning_rate": 9.823784870614568e-06,
"loss": 0.1854,
"step": 545
},
{
"epoch": 0.08491446345256609,
"grad_norm": 0.9535990803944258,
"learning_rate": 9.823141459290486e-06,
"loss": 0.3623,
"step": 546
},
{
"epoch": 0.08506998444790047,
"grad_norm": 1.2084813471627978,
"learning_rate": 9.822496896615276e-06,
"loss": 0.2088,
"step": 547
},
{
"epoch": 0.08522550544323483,
"grad_norm": 1.751880921507202,
"learning_rate": 9.821851182742806e-06,
"loss": 0.2367,
"step": 548
},
{
"epoch": 0.08538102643856921,
"grad_norm": 0.859776642879622,
"learning_rate": 9.821204317827214e-06,
"loss": 0.249,
"step": 549
},
{
"epoch": 0.08553654743390357,
"grad_norm": 1.127529266910784,
"learning_rate": 9.820556302022916e-06,
"loss": 0.2038,
"step": 550
},
{
"epoch": 0.08569206842923795,
"grad_norm": 1.1762380712487397,
"learning_rate": 9.819907135484607e-06,
"loss": 0.1408,
"step": 551
},
{
"epoch": 0.08584758942457232,
"grad_norm": 1.1841316789710945,
"learning_rate": 9.819256818367247e-06,
"loss": 0.1971,
"step": 552
},
{
"epoch": 0.08600311041990669,
"grad_norm": 0.9978225930526609,
"learning_rate": 9.818605350826078e-06,
"loss": 0.2221,
"step": 553
},
{
"epoch": 0.08615863141524106,
"grad_norm": 1.6694424755142652,
"learning_rate": 9.817952733016614e-06,
"loss": 0.1549,
"step": 554
},
{
"epoch": 0.08631415241057543,
"grad_norm": 0.9346983450738274,
"learning_rate": 9.817298965094644e-06,
"loss": 0.1579,
"step": 555
},
{
"epoch": 0.0864696734059098,
"grad_norm": 1.147526345911482,
"learning_rate": 9.816644047216231e-06,
"loss": 0.1873,
"step": 556
},
{
"epoch": 0.08662519440124417,
"grad_norm": 1.1886850012764587,
"learning_rate": 9.815987979537713e-06,
"loss": 0.2347,
"step": 557
},
{
"epoch": 0.08678071539657854,
"grad_norm": 1.6793433753087175,
"learning_rate": 9.815330762215704e-06,
"loss": 0.2773,
"step": 558
},
{
"epoch": 0.0869362363919129,
"grad_norm": 0.7389091927152867,
"learning_rate": 9.81467239540709e-06,
"loss": 0.2376,
"step": 559
},
{
"epoch": 0.08709175738724728,
"grad_norm": 1.5501383478894555,
"learning_rate": 9.814012879269031e-06,
"loss": 0.249,
"step": 560
},
{
"epoch": 0.08724727838258164,
"grad_norm": 1.985092307546573,
"learning_rate": 9.813352213958966e-06,
"loss": 0.2293,
"step": 561
},
{
"epoch": 0.08740279937791602,
"grad_norm": 1.1408911673993625,
"learning_rate": 9.812690399634601e-06,
"loss": 0.29,
"step": 562
},
{
"epoch": 0.08755832037325038,
"grad_norm": 1.2461126532920535,
"learning_rate": 9.812027436453924e-06,
"loss": 0.2783,
"step": 563
},
{
"epoch": 0.08771384136858476,
"grad_norm": 1.764223151926025,
"learning_rate": 9.81136332457519e-06,
"loss": 0.2528,
"step": 564
},
{
"epoch": 0.08786936236391912,
"grad_norm": 1.0618642840366128,
"learning_rate": 9.810698064156935e-06,
"loss": 0.1723,
"step": 565
},
{
"epoch": 0.0880248833592535,
"grad_norm": 0.8569330765683667,
"learning_rate": 9.810031655357964e-06,
"loss": 0.2241,
"step": 566
},
{
"epoch": 0.08818040435458786,
"grad_norm": 1.0553303848822568,
"learning_rate": 9.80936409833736e-06,
"loss": 0.2312,
"step": 567
},
{
"epoch": 0.08833592534992224,
"grad_norm": 1.8702866312005988,
"learning_rate": 9.808695393254474e-06,
"loss": 0.1949,
"step": 568
},
{
"epoch": 0.08849144634525662,
"grad_norm": 0.9476538253542002,
"learning_rate": 9.808025540268939e-06,
"loss": 0.1783,
"step": 569
},
{
"epoch": 0.08864696734059098,
"grad_norm": 1.4661601306937122,
"learning_rate": 9.80735453954066e-06,
"loss": 0.2941,
"step": 570
},
{
"epoch": 0.08880248833592536,
"grad_norm": 1.1865752816456114,
"learning_rate": 9.80668239122981e-06,
"loss": 0.2196,
"step": 571
},
{
"epoch": 0.08895800933125972,
"grad_norm": 0.9682721759722641,
"learning_rate": 9.80600909549684e-06,
"loss": 0.2453,
"step": 572
},
{
"epoch": 0.0891135303265941,
"grad_norm": 1.0402552655035497,
"learning_rate": 9.805334652502478e-06,
"loss": 0.2528,
"step": 573
},
{
"epoch": 0.08926905132192846,
"grad_norm": 1.1058208608284787,
"learning_rate": 9.804659062407721e-06,
"loss": 0.1704,
"step": 574
},
{
"epoch": 0.08942457231726283,
"grad_norm": 0.9300562072054855,
"learning_rate": 9.803982325373843e-06,
"loss": 0.241,
"step": 575
},
{
"epoch": 0.0895800933125972,
"grad_norm": 1.3452145435832572,
"learning_rate": 9.803304441562391e-06,
"loss": 0.179,
"step": 576
},
{
"epoch": 0.08973561430793157,
"grad_norm": 0.934714522466104,
"learning_rate": 9.802625411135183e-06,
"loss": 0.2131,
"step": 577
},
{
"epoch": 0.08989113530326594,
"grad_norm": 1.2723518042915498,
"learning_rate": 9.801945234254315e-06,
"loss": 0.2342,
"step": 578
},
{
"epoch": 0.09004665629860031,
"grad_norm": 2.11692073632197,
"learning_rate": 9.801263911082154e-06,
"loss": 0.2148,
"step": 579
},
{
"epoch": 0.09020217729393468,
"grad_norm": 2.6365326907523396,
"learning_rate": 9.800581441781342e-06,
"loss": 0.2787,
"step": 580
},
{
"epoch": 0.09035769828926905,
"grad_norm": 1.3369047254369875,
"learning_rate": 9.799897826514793e-06,
"loss": 0.2365,
"step": 581
},
{
"epoch": 0.09051321928460342,
"grad_norm": 0.9493060685693816,
"learning_rate": 9.799213065445696e-06,
"loss": 0.1656,
"step": 582
},
{
"epoch": 0.09066874027993779,
"grad_norm": 1.0470819909783555,
"learning_rate": 9.798527158737512e-06,
"loss": 0.1578,
"step": 583
},
{
"epoch": 0.09082426127527216,
"grad_norm": 1.0969444747176942,
"learning_rate": 9.797840106553977e-06,
"loss": 0.2095,
"step": 584
},
{
"epoch": 0.09097978227060653,
"grad_norm": 1.6035875172395766,
"learning_rate": 9.797151909059102e-06,
"loss": 0.2682,
"step": 585
},
{
"epoch": 0.0911353032659409,
"grad_norm": 1.3049640527657593,
"learning_rate": 9.796462566417169e-06,
"loss": 0.2537,
"step": 586
},
{
"epoch": 0.09129082426127527,
"grad_norm": 1.365745492042764,
"learning_rate": 9.79577207879273e-06,
"loss": 0.2065,
"step": 587
},
{
"epoch": 0.09144634525660965,
"grad_norm": 0.9500261347653985,
"learning_rate": 9.795080446350616e-06,
"loss": 0.1885,
"step": 588
},
{
"epoch": 0.09160186625194401,
"grad_norm": 1.5405453397493063,
"learning_rate": 9.79438766925593e-06,
"loss": 0.2507,
"step": 589
},
{
"epoch": 0.09175738724727839,
"grad_norm": 0.9919977440587929,
"learning_rate": 9.79369374767405e-06,
"loss": 0.1607,
"step": 590
},
{
"epoch": 0.09191290824261275,
"grad_norm": 1.2052697190695243,
"learning_rate": 9.79299868177062e-06,
"loss": 0.2247,
"step": 591
},
{
"epoch": 0.09206842923794713,
"grad_norm": 1.5911347684916193,
"learning_rate": 9.792302471711564e-06,
"loss": 0.1812,
"step": 592
},
{
"epoch": 0.09222395023328149,
"grad_norm": 1.3772469912987155,
"learning_rate": 9.791605117663076e-06,
"loss": 0.1567,
"step": 593
},
{
"epoch": 0.09237947122861587,
"grad_norm": 1.456752513640415,
"learning_rate": 9.790906619791627e-06,
"loss": 0.2009,
"step": 594
},
{
"epoch": 0.09253499222395023,
"grad_norm": 0.9824754188966437,
"learning_rate": 9.790206978263955e-06,
"loss": 0.2041,
"step": 595
},
{
"epoch": 0.0926905132192846,
"grad_norm": 1.1576177882724517,
"learning_rate": 9.789506193247075e-06,
"loss": 0.2304,
"step": 596
},
{
"epoch": 0.09284603421461897,
"grad_norm": 1.3814578099918997,
"learning_rate": 9.788804264908276e-06,
"loss": 0.1935,
"step": 597
},
{
"epoch": 0.09300155520995335,
"grad_norm": 0.8483069008778095,
"learning_rate": 9.788101193415116e-06,
"loss": 0.2148,
"step": 598
},
{
"epoch": 0.09315707620528771,
"grad_norm": 1.3477202886979611,
"learning_rate": 9.787396978935431e-06,
"loss": 0.23,
"step": 599
},
{
"epoch": 0.09331259720062209,
"grad_norm": 1.4372703771133322,
"learning_rate": 9.786691621637322e-06,
"loss": 0.2496,
"step": 600
},
{
"epoch": 0.09331259720062209,
"eval_loss": 0.22426502406597137,
"eval_runtime": 9.4405,
"eval_samples_per_second": 2.754,
"eval_steps_per_second": 0.741,
"step": 600
},
{
"epoch": 0.09346811819595645,
"grad_norm": 1.9416761367047068,
"learning_rate": 9.785985121689171e-06,
"loss": 0.6927,
"step": 601
},
{
"epoch": 0.09362363919129083,
"grad_norm": 1.268764907148312,
"learning_rate": 9.785277479259629e-06,
"loss": 0.2501,
"step": 602
},
{
"epoch": 0.09377916018662519,
"grad_norm": 2.3273439454641514,
"learning_rate": 9.784568694517618e-06,
"loss": 0.2469,
"step": 603
},
{
"epoch": 0.09393468118195956,
"grad_norm": 1.8747313801721335,
"learning_rate": 9.783858767632338e-06,
"loss": 0.2289,
"step": 604
},
{
"epoch": 0.09409020217729394,
"grad_norm": 1.2586569667037595,
"learning_rate": 9.783147698773257e-06,
"loss": 0.1962,
"step": 605
},
{
"epoch": 0.0942457231726283,
"grad_norm": 1.496316694651238,
"learning_rate": 9.782435488110116e-06,
"loss": 0.298,
"step": 606
},
{
"epoch": 0.09440124416796268,
"grad_norm": 1.1801510466185432,
"learning_rate": 9.781722135812932e-06,
"loss": 0.2189,
"step": 607
},
{
"epoch": 0.09455676516329704,
"grad_norm": 1.2565748248573585,
"learning_rate": 9.78100764205199e-06,
"loss": 0.2186,
"step": 608
},
{
"epoch": 0.09471228615863142,
"grad_norm": 0.9391168040034623,
"learning_rate": 9.780292006997849e-06,
"loss": 0.2144,
"step": 609
},
{
"epoch": 0.09486780715396578,
"grad_norm": 1.1387381134081225,
"learning_rate": 9.779575230821344e-06,
"loss": 0.1718,
"step": 610
},
{
"epoch": 0.09502332814930016,
"grad_norm": 1.0177855034745955,
"learning_rate": 9.778857313693578e-06,
"loss": 0.1586,
"step": 611
},
{
"epoch": 0.09517884914463452,
"grad_norm": 1.4624255805438011,
"learning_rate": 9.778138255785928e-06,
"loss": 0.2697,
"step": 612
},
{
"epoch": 0.0953343701399689,
"grad_norm": 1.0796167649791846,
"learning_rate": 9.77741805727004e-06,
"loss": 0.2668,
"step": 613
},
{
"epoch": 0.09548989113530326,
"grad_norm": 2.1747859377128806,
"learning_rate": 9.776696718317842e-06,
"loss": 0.2117,
"step": 614
},
{
"epoch": 0.09564541213063764,
"grad_norm": 1.6173977205310859,
"learning_rate": 9.775974239101522e-06,
"loss": 0.2048,
"step": 615
},
{
"epoch": 0.095800933125972,
"grad_norm": 1.281075534048029,
"learning_rate": 9.775250619793548e-06,
"loss": 0.2218,
"step": 616
},
{
"epoch": 0.09595645412130638,
"grad_norm": 1.5623409338338163,
"learning_rate": 9.77452586056666e-06,
"loss": 0.2843,
"step": 617
},
{
"epoch": 0.09611197511664074,
"grad_norm": 0.920135780872905,
"learning_rate": 9.773799961593862e-06,
"loss": 0.218,
"step": 618
},
{
"epoch": 0.09626749611197512,
"grad_norm": 1.6644765009913491,
"learning_rate": 9.773072923048443e-06,
"loss": 0.277,
"step": 619
},
{
"epoch": 0.09642301710730948,
"grad_norm": 1.0758387537045102,
"learning_rate": 9.772344745103955e-06,
"loss": 0.2405,
"step": 620
},
{
"epoch": 0.09657853810264386,
"grad_norm": 1.1751354263981124,
"learning_rate": 9.77161542793422e-06,
"loss": 0.2362,
"step": 621
},
{
"epoch": 0.09673405909797823,
"grad_norm": 2.7957127911749655,
"learning_rate": 9.770884971713344e-06,
"loss": 0.178,
"step": 622
},
{
"epoch": 0.0968895800933126,
"grad_norm": 5.021758252286217,
"learning_rate": 9.770153376615692e-06,
"loss": 0.2095,
"step": 623
},
{
"epoch": 0.09704510108864697,
"grad_norm": 0.8518883317455118,
"learning_rate": 9.769420642815905e-06,
"loss": 0.2174,
"step": 624
},
{
"epoch": 0.09720062208398134,
"grad_norm": 1.0603512343033086,
"learning_rate": 9.7686867704889e-06,
"loss": 0.2437,
"step": 625
},
{
"epoch": 0.09735614307931571,
"grad_norm": 2.7767054670419067,
"learning_rate": 9.767951759809861e-06,
"loss": 0.3072,
"step": 626
},
{
"epoch": 0.09751166407465008,
"grad_norm": 0.875830402681162,
"learning_rate": 9.767215610954246e-06,
"loss": 0.1865,
"step": 627
},
{
"epoch": 0.09766718506998445,
"grad_norm": 1.1746324049289305,
"learning_rate": 9.766478324097784e-06,
"loss": 0.1775,
"step": 628
},
{
"epoch": 0.09782270606531882,
"grad_norm": 1.3198405804921558,
"learning_rate": 9.765739899416474e-06,
"loss": 0.2202,
"step": 629
},
{
"epoch": 0.09797822706065319,
"grad_norm": 0.9040537149469751,
"learning_rate": 9.76500033708659e-06,
"loss": 0.134,
"step": 630
},
{
"epoch": 0.09813374805598755,
"grad_norm": 1.1116680855923542,
"learning_rate": 9.764259637284674e-06,
"loss": 0.2413,
"step": 631
},
{
"epoch": 0.09828926905132193,
"grad_norm": 1.816511140625042,
"learning_rate": 9.763517800187543e-06,
"loss": 0.1881,
"step": 632
},
{
"epoch": 0.0984447900466563,
"grad_norm": 1.1808179637924803,
"learning_rate": 9.762774825972284e-06,
"loss": 0.1797,
"step": 633
},
{
"epoch": 0.09860031104199067,
"grad_norm": 0.9260180174403776,
"learning_rate": 9.762030714816255e-06,
"loss": 0.1692,
"step": 634
},
{
"epoch": 0.09875583203732503,
"grad_norm": 0.9809663827224766,
"learning_rate": 9.761285466897086e-06,
"loss": 0.1971,
"step": 635
},
{
"epoch": 0.09891135303265941,
"grad_norm": 1.1818951833176021,
"learning_rate": 9.760539082392678e-06,
"loss": 0.3061,
"step": 636
},
{
"epoch": 0.09906687402799377,
"grad_norm": 1.5126562950843534,
"learning_rate": 9.759791561481201e-06,
"loss": 0.2214,
"step": 637
},
{
"epoch": 0.09922239502332815,
"grad_norm": 1.1563368410762391,
"learning_rate": 9.759042904341103e-06,
"loss": 0.1879,
"step": 638
},
{
"epoch": 0.09937791601866251,
"grad_norm": 1.7465834025848672,
"learning_rate": 9.758293111151094e-06,
"loss": 0.2936,
"step": 639
},
{
"epoch": 0.09953343701399689,
"grad_norm": 1.4420901394687415,
"learning_rate": 9.757542182090165e-06,
"loss": 0.1977,
"step": 640
},
{
"epoch": 0.09968895800933127,
"grad_norm": 1.4320029014579423,
"learning_rate": 9.756790117337569e-06,
"loss": 0.235,
"step": 641
},
{
"epoch": 0.09984447900466563,
"grad_norm": 1.0178157213981396,
"learning_rate": 9.756036917072837e-06,
"loss": 0.228,
"step": 642
},
{
"epoch": 0.1,
"grad_norm": 1.634337451034447,
"learning_rate": 9.755282581475769e-06,
"loss": 0.174,
"step": 643
},
{
"epoch": 0.10015552099533437,
"grad_norm": 1.3123622467109133,
"learning_rate": 9.754527110726432e-06,
"loss": 0.1854,
"step": 644
},
{
"epoch": 0.10031104199066875,
"grad_norm": 1.3700959071130703,
"learning_rate": 9.753770505005171e-06,
"loss": 0.271,
"step": 645
},
{
"epoch": 0.10046656298600311,
"grad_norm": 1.5589446061903662,
"learning_rate": 9.753012764492596e-06,
"loss": 0.1669,
"step": 646
},
{
"epoch": 0.10062208398133748,
"grad_norm": 1.3813884723817376,
"learning_rate": 9.752253889369592e-06,
"loss": 0.1525,
"step": 647
},
{
"epoch": 0.10077760497667185,
"grad_norm": 1.3858844961504873,
"learning_rate": 9.75149387981731e-06,
"loss": 0.2673,
"step": 648
},
{
"epoch": 0.10093312597200622,
"grad_norm": 0.9436000404569762,
"learning_rate": 9.75073273601718e-06,
"loss": 0.2058,
"step": 649
},
{
"epoch": 0.10108864696734059,
"grad_norm": 1.4599521330072638,
"learning_rate": 9.749970458150893e-06,
"loss": 0.2145,
"step": 650
},
{
"epoch": 0.10124416796267496,
"grad_norm": 1.3455835009343615,
"learning_rate": 9.749207046400415e-06,
"loss": 0.2353,
"step": 651
},
{
"epoch": 0.10139968895800933,
"grad_norm": 1.6299219848605395,
"learning_rate": 9.748442500947988e-06,
"loss": 0.2582,
"step": 652
},
{
"epoch": 0.1015552099533437,
"grad_norm": 2.1538893724554966,
"learning_rate": 9.747676821976116e-06,
"loss": 0.2128,
"step": 653
},
{
"epoch": 0.10171073094867807,
"grad_norm": 1.1642628978054306,
"learning_rate": 9.746910009667577e-06,
"loss": 0.2092,
"step": 654
},
{
"epoch": 0.10186625194401244,
"grad_norm": 0.9776673463806724,
"learning_rate": 9.746142064205422e-06,
"loss": 0.176,
"step": 655
},
{
"epoch": 0.1020217729393468,
"grad_norm": 1.350687490540933,
"learning_rate": 9.745372985772968e-06,
"loss": 0.2426,
"step": 656
},
{
"epoch": 0.10217729393468118,
"grad_norm": 1.7681295289484116,
"learning_rate": 9.744602774553807e-06,
"loss": 0.2204,
"step": 657
},
{
"epoch": 0.10233281493001556,
"grad_norm": 0.9199423619051535,
"learning_rate": 9.743831430731796e-06,
"loss": 0.1647,
"step": 658
},
{
"epoch": 0.10248833592534992,
"grad_norm": 5.138426947168042,
"learning_rate": 9.743058954491067e-06,
"loss": 0.2107,
"step": 659
},
{
"epoch": 0.1026438569206843,
"grad_norm": 1.446510693113484,
"learning_rate": 9.742285346016024e-06,
"loss": 0.2379,
"step": 660
},
{
"epoch": 0.10279937791601866,
"grad_norm": 1.4833539837619547,
"learning_rate": 9.741510605491335e-06,
"loss": 0.1714,
"step": 661
},
{
"epoch": 0.10295489891135304,
"grad_norm": 1.3228899574182327,
"learning_rate": 9.74073473310194e-06,
"loss": 0.2388,
"step": 662
},
{
"epoch": 0.1031104199066874,
"grad_norm": 1.0712502633957945,
"learning_rate": 9.739957729033054e-06,
"loss": 0.2289,
"step": 663
},
{
"epoch": 0.10326594090202178,
"grad_norm": 1.1587775220461487,
"learning_rate": 9.739179593470156e-06,
"loss": 0.1741,
"step": 664
},
{
"epoch": 0.10342146189735614,
"grad_norm": 1.0260279383302884,
"learning_rate": 9.738400326599e-06,
"loss": 0.2412,
"step": 665
},
{
"epoch": 0.10357698289269052,
"grad_norm": 1.491042707966078,
"learning_rate": 9.737619928605605e-06,
"loss": 0.1833,
"step": 666
},
{
"epoch": 0.10373250388802488,
"grad_norm": 1.6710832262506907,
"learning_rate": 9.736838399676266e-06,
"loss": 0.1712,
"step": 667
},
{
"epoch": 0.10388802488335926,
"grad_norm": 1.4001413138925893,
"learning_rate": 9.736055739997543e-06,
"loss": 0.2739,
"step": 668
},
{
"epoch": 0.10404354587869362,
"grad_norm": 1.0413982567358797,
"learning_rate": 9.735271949756269e-06,
"loss": 0.1655,
"step": 669
},
{
"epoch": 0.104199066874028,
"grad_norm": 2.062452927969995,
"learning_rate": 9.734487029139544e-06,
"loss": 0.2384,
"step": 670
},
{
"epoch": 0.10435458786936236,
"grad_norm": 1.1419346714711909,
"learning_rate": 9.733700978334741e-06,
"loss": 0.2176,
"step": 671
},
{
"epoch": 0.10451010886469674,
"grad_norm": 1.4704145498498906,
"learning_rate": 9.7329137975295e-06,
"loss": 0.2281,
"step": 672
},
{
"epoch": 0.1046656298600311,
"grad_norm": 1.7257595787120843,
"learning_rate": 9.732125486911733e-06,
"loss": 0.1964,
"step": 673
},
{
"epoch": 0.10482115085536547,
"grad_norm": 1.596182048450316,
"learning_rate": 9.731336046669621e-06,
"loss": 0.1863,
"step": 674
},
{
"epoch": 0.10497667185069985,
"grad_norm": 1.741565962255971,
"learning_rate": 9.730545476991613e-06,
"loss": 0.1358,
"step": 675
},
{
"epoch": 0.10513219284603421,
"grad_norm": 1.2105023861624677,
"learning_rate": 9.729753778066431e-06,
"loss": 0.2757,
"step": 676
},
{
"epoch": 0.10528771384136859,
"grad_norm": 1.1483441998296096,
"learning_rate": 9.728960950083062e-06,
"loss": 0.2327,
"step": 677
},
{
"epoch": 0.10544323483670295,
"grad_norm": 2.6827889453865255,
"learning_rate": 9.728166993230768e-06,
"loss": 0.2841,
"step": 678
},
{
"epoch": 0.10559875583203733,
"grad_norm": 1.3531013447523792,
"learning_rate": 9.727371907699075e-06,
"loss": 0.2742,
"step": 679
},
{
"epoch": 0.1057542768273717,
"grad_norm": 1.4165422039945663,
"learning_rate": 9.726575693677782e-06,
"loss": 0.1733,
"step": 680
},
{
"epoch": 0.10590979782270607,
"grad_norm": 1.1633994693280907,
"learning_rate": 9.725778351356958e-06,
"loss": 0.1752,
"step": 681
},
{
"epoch": 0.10606531881804043,
"grad_norm": 1.4801298044861129,
"learning_rate": 9.724979880926937e-06,
"loss": 0.1654,
"step": 682
},
{
"epoch": 0.10622083981337481,
"grad_norm": 1.038476254792903,
"learning_rate": 9.724180282578327e-06,
"loss": 0.1796,
"step": 683
},
{
"epoch": 0.10637636080870917,
"grad_norm": 1.1715546692057253,
"learning_rate": 9.723379556502002e-06,
"loss": 0.2615,
"step": 684
},
{
"epoch": 0.10653188180404355,
"grad_norm": 0.9669903775949065,
"learning_rate": 9.722577702889106e-06,
"loss": 0.2217,
"step": 685
},
{
"epoch": 0.10668740279937791,
"grad_norm": 0.9554324370526551,
"learning_rate": 9.721774721931056e-06,
"loss": 0.2067,
"step": 686
},
{
"epoch": 0.10684292379471229,
"grad_norm": 1.5055382554521828,
"learning_rate": 9.720970613819532e-06,
"loss": 0.2886,
"step": 687
},
{
"epoch": 0.10699844479004665,
"grad_norm": 1.4701983051316598,
"learning_rate": 9.720165378746486e-06,
"loss": 0.2461,
"step": 688
},
{
"epoch": 0.10715396578538103,
"grad_norm": 0.8955915121278603,
"learning_rate": 9.719359016904137e-06,
"loss": 0.1296,
"step": 689
},
{
"epoch": 0.10730948678071539,
"grad_norm": 1.1365940197104127,
"learning_rate": 9.718551528484979e-06,
"loss": 0.1756,
"step": 690
},
{
"epoch": 0.10746500777604977,
"grad_norm": 1.1309854500820393,
"learning_rate": 9.717742913681769e-06,
"loss": 0.1685,
"step": 691
},
{
"epoch": 0.10762052877138413,
"grad_norm": 1.228647590848163,
"learning_rate": 9.716933172687533e-06,
"loss": 0.1988,
"step": 692
},
{
"epoch": 0.1077760497667185,
"grad_norm": 1.8437087557242553,
"learning_rate": 9.71612230569557e-06,
"loss": 0.2259,
"step": 693
},
{
"epoch": 0.10793157076205288,
"grad_norm": 2.190128145243616,
"learning_rate": 9.715310312899445e-06,
"loss": 0.1593,
"step": 694
},
{
"epoch": 0.10808709175738725,
"grad_norm": 1.9542747095305757,
"learning_rate": 9.714497194492988e-06,
"loss": 0.1942,
"step": 695
},
{
"epoch": 0.10824261275272162,
"grad_norm": 1.190017072453523,
"learning_rate": 9.713682950670305e-06,
"loss": 0.184,
"step": 696
},
{
"epoch": 0.10839813374805599,
"grad_norm": 1.3702585397170965,
"learning_rate": 9.712867581625769e-06,
"loss": 0.2747,
"step": 697
},
{
"epoch": 0.10855365474339036,
"grad_norm": 1.1224607857205071,
"learning_rate": 9.712051087554017e-06,
"loss": 0.1851,
"step": 698
},
{
"epoch": 0.10870917573872473,
"grad_norm": 1.1610995749820388,
"learning_rate": 9.711233468649958e-06,
"loss": 0.1651,
"step": 699
},
{
"epoch": 0.1088646967340591,
"grad_norm": 1.0713548580433974,
"learning_rate": 9.710414725108771e-06,
"loss": 0.2798,
"step": 700
},
{
"epoch": 0.1088646967340591,
"eval_loss": 0.2192843109369278,
"eval_runtime": 9.4454,
"eval_samples_per_second": 2.753,
"eval_steps_per_second": 0.741,
"step": 700
},
{
"epoch": 0.10902021772939346,
"grad_norm": 1.086974338576193,
"learning_rate": 9.709594857125898e-06,
"loss": 0.3235,
"step": 701
},
{
"epoch": 0.10917573872472784,
"grad_norm": 3.455927159294357,
"learning_rate": 9.708773864897059e-06,
"loss": 0.1502,
"step": 702
},
{
"epoch": 0.1093312597200622,
"grad_norm": 1.6070730415734276,
"learning_rate": 9.707951748618229e-06,
"loss": 0.2652,
"step": 703
},
{
"epoch": 0.10948678071539658,
"grad_norm": 1.0297377958380671,
"learning_rate": 9.707128508485663e-06,
"loss": 0.2352,
"step": 704
},
{
"epoch": 0.10964230171073094,
"grad_norm": 1.07292209906991,
"learning_rate": 9.706304144695877e-06,
"loss": 0.1471,
"step": 705
},
{
"epoch": 0.10979782270606532,
"grad_norm": 1.2095547877752455,
"learning_rate": 9.705478657445661e-06,
"loss": 0.2107,
"step": 706
},
{
"epoch": 0.10995334370139968,
"grad_norm": 1.307669146215221,
"learning_rate": 9.70465204693207e-06,
"loss": 0.2337,
"step": 707
},
{
"epoch": 0.11010886469673406,
"grad_norm": 0.8004125116368356,
"learning_rate": 9.703824313352428e-06,
"loss": 0.2042,
"step": 708
},
{
"epoch": 0.11026438569206842,
"grad_norm": 1.5202724190274493,
"learning_rate": 9.702995456904323e-06,
"loss": 0.2446,
"step": 709
},
{
"epoch": 0.1104199066874028,
"grad_norm": 1.3109419274601464,
"learning_rate": 9.702165477785618e-06,
"loss": 0.2791,
"step": 710
},
{
"epoch": 0.11057542768273718,
"grad_norm": 1.2175779348655416,
"learning_rate": 9.70133437619444e-06,
"loss": 0.2787,
"step": 711
},
{
"epoch": 0.11073094867807154,
"grad_norm": 2.4619987863193824,
"learning_rate": 9.700502152329182e-06,
"loss": 0.2184,
"step": 712
},
{
"epoch": 0.11088646967340592,
"grad_norm": 1.1204962171981678,
"learning_rate": 9.69966880638851e-06,
"loss": 0.1796,
"step": 713
},
{
"epoch": 0.11104199066874028,
"grad_norm": 1.3460375672771012,
"learning_rate": 9.698834338571355e-06,
"loss": 0.1536,
"step": 714
},
{
"epoch": 0.11119751166407466,
"grad_norm": 1.4551247859245915,
"learning_rate": 9.697998749076916e-06,
"loss": 0.1775,
"step": 715
},
{
"epoch": 0.11135303265940902,
"grad_norm": 1.64865769787968,
"learning_rate": 9.69716203810466e-06,
"loss": 0.2341,
"step": 716
},
{
"epoch": 0.1115085536547434,
"grad_norm": 1.8250018792840808,
"learning_rate": 9.696324205854322e-06,
"loss": 0.2058,
"step": 717
},
{
"epoch": 0.11166407465007776,
"grad_norm": 1.067050937242904,
"learning_rate": 9.695485252525902e-06,
"loss": 0.1463,
"step": 718
},
{
"epoch": 0.11181959564541213,
"grad_norm": 2.2821274396758127,
"learning_rate": 9.694645178319673e-06,
"loss": 0.2508,
"step": 719
},
{
"epoch": 0.1119751166407465,
"grad_norm": 1.388014808020173,
"learning_rate": 9.69380398343617e-06,
"loss": 0.1977,
"step": 720
},
{
"epoch": 0.11213063763608087,
"grad_norm": 1.5658859493501038,
"learning_rate": 9.692961668076197e-06,
"loss": 0.2291,
"step": 721
},
{
"epoch": 0.11228615863141524,
"grad_norm": 1.0853791710998715,
"learning_rate": 9.69211823244083e-06,
"loss": 0.2763,
"step": 722
},
{
"epoch": 0.11244167962674961,
"grad_norm": 1.27256020581809,
"learning_rate": 9.691273676731408e-06,
"loss": 0.195,
"step": 723
},
{
"epoch": 0.11259720062208398,
"grad_norm": 0.6768405188507002,
"learning_rate": 9.690428001149537e-06,
"loss": 0.1839,
"step": 724
},
{
"epoch": 0.11275272161741835,
"grad_norm": 3.309861478677342,
"learning_rate": 9.68958120589709e-06,
"loss": 0.1446,
"step": 725
},
{
"epoch": 0.11290824261275272,
"grad_norm": 1.3577561463931358,
"learning_rate": 9.688733291176211e-06,
"loss": 0.174,
"step": 726
},
{
"epoch": 0.11306376360808709,
"grad_norm": 0.7899130738957459,
"learning_rate": 9.68788425718931e-06,
"loss": 0.1819,
"step": 727
},
{
"epoch": 0.11321928460342146,
"grad_norm": 1.9374468863177388,
"learning_rate": 9.68703410413906e-06,
"loss": 0.2148,
"step": 728
},
{
"epoch": 0.11337480559875583,
"grad_norm": 0.9790173123360771,
"learning_rate": 9.686182832228408e-06,
"loss": 0.1842,
"step": 729
},
{
"epoch": 0.11353032659409021,
"grad_norm": 1.8838507925348544,
"learning_rate": 9.685330441660564e-06,
"loss": 0.2482,
"step": 730
},
{
"epoch": 0.11368584758942457,
"grad_norm": 1.7209011423931209,
"learning_rate": 9.684476932639002e-06,
"loss": 0.1938,
"step": 731
},
{
"epoch": 0.11384136858475895,
"grad_norm": 1.3133247484457822,
"learning_rate": 9.68362230536747e-06,
"loss": 0.1629,
"step": 732
},
{
"epoch": 0.11399688958009331,
"grad_norm": 1.4346328630835792,
"learning_rate": 9.682766560049979e-06,
"loss": 0.2393,
"step": 733
},
{
"epoch": 0.11415241057542769,
"grad_norm": 1.416880965769396,
"learning_rate": 9.681909696890805e-06,
"loss": 0.2149,
"step": 734
},
{
"epoch": 0.11430793157076205,
"grad_norm": 1.3604331981225013,
"learning_rate": 9.681051716094497e-06,
"loss": 0.2116,
"step": 735
},
{
"epoch": 0.11446345256609643,
"grad_norm": 1.370682231566524,
"learning_rate": 9.680192617865862e-06,
"loss": 0.1574,
"step": 736
},
{
"epoch": 0.11461897356143079,
"grad_norm": 3.11697026931608,
"learning_rate": 9.679332402409983e-06,
"loss": 0.1659,
"step": 737
},
{
"epoch": 0.11477449455676517,
"grad_norm": 1.0795485204091093,
"learning_rate": 9.678471069932205e-06,
"loss": 0.1843,
"step": 738
},
{
"epoch": 0.11493001555209953,
"grad_norm": 1.089003737321956,
"learning_rate": 9.677608620638138e-06,
"loss": 0.1289,
"step": 739
},
{
"epoch": 0.1150855365474339,
"grad_norm": 1.9816825572482675,
"learning_rate": 9.676745054733661e-06,
"loss": 0.183,
"step": 740
},
{
"epoch": 0.11524105754276827,
"grad_norm": 4.608323882578619,
"learning_rate": 9.675880372424922e-06,
"loss": 0.1797,
"step": 741
},
{
"epoch": 0.11539657853810265,
"grad_norm": 0.9751878331403108,
"learning_rate": 9.675014573918328e-06,
"loss": 0.2649,
"step": 742
},
{
"epoch": 0.11555209953343701,
"grad_norm": 0.913137520804308,
"learning_rate": 9.67414765942056e-06,
"loss": 0.1229,
"step": 743
},
{
"epoch": 0.11570762052877138,
"grad_norm": 1.1182409613228717,
"learning_rate": 9.673279629138565e-06,
"loss": 0.1554,
"step": 744
},
{
"epoch": 0.11586314152410575,
"grad_norm": 2.425925853364065,
"learning_rate": 9.67241048327955e-06,
"loss": 0.2414,
"step": 745
},
{
"epoch": 0.11601866251944012,
"grad_norm": 2.1643434151507024,
"learning_rate": 9.671540222050995e-06,
"loss": 0.2402,
"step": 746
},
{
"epoch": 0.1161741835147745,
"grad_norm": 1.1869224601016288,
"learning_rate": 9.67066884566064e-06,
"loss": 0.225,
"step": 747
},
{
"epoch": 0.11632970451010886,
"grad_norm": 1.1850496858694712,
"learning_rate": 9.669796354316497e-06,
"loss": 0.1732,
"step": 748
},
{
"epoch": 0.11648522550544324,
"grad_norm": 1.083880428656249,
"learning_rate": 9.668922748226842e-06,
"loss": 0.2256,
"step": 749
},
{
"epoch": 0.1166407465007776,
"grad_norm": 0.9290306352610638,
"learning_rate": 9.668048027600217e-06,
"loss": 0.1814,
"step": 750
},
{
"epoch": 0.11679626749611198,
"grad_norm": 1.1985316233321583,
"learning_rate": 9.66717219264543e-06,
"loss": 0.2646,
"step": 751
},
{
"epoch": 0.11695178849144634,
"grad_norm": 1.5752976014862634,
"learning_rate": 9.666295243571553e-06,
"loss": 0.2212,
"step": 752
},
{
"epoch": 0.11710730948678072,
"grad_norm": 1.554593030529623,
"learning_rate": 9.665417180587928e-06,
"loss": 0.2008,
"step": 753
},
{
"epoch": 0.11726283048211508,
"grad_norm": 1.802147426905897,
"learning_rate": 9.664538003904162e-06,
"loss": 0.1694,
"step": 754
},
{
"epoch": 0.11741835147744946,
"grad_norm": 1.117253074112765,
"learning_rate": 9.663657713730123e-06,
"loss": 0.1769,
"step": 755
},
{
"epoch": 0.11757387247278382,
"grad_norm": 1.2713208371120763,
"learning_rate": 9.662776310275954e-06,
"loss": 0.3356,
"step": 756
},
{
"epoch": 0.1177293934681182,
"grad_norm": 1.5049877808240208,
"learning_rate": 9.661893793752053e-06,
"loss": 0.2156,
"step": 757
},
{
"epoch": 0.11788491446345256,
"grad_norm": 1.3646831264890733,
"learning_rate": 9.661010164369092e-06,
"loss": 0.2077,
"step": 758
},
{
"epoch": 0.11804043545878694,
"grad_norm": 1.2057674637964264,
"learning_rate": 9.660125422338003e-06,
"loss": 0.234,
"step": 759
},
{
"epoch": 0.1181959564541213,
"grad_norm": 1.7059599899477969,
"learning_rate": 9.659239567869989e-06,
"loss": 0.2019,
"step": 760
},
{
"epoch": 0.11835147744945568,
"grad_norm": 1.359054263386884,
"learning_rate": 9.658352601176514e-06,
"loss": 0.2263,
"step": 761
},
{
"epoch": 0.11850699844479004,
"grad_norm": 1.4779502971821263,
"learning_rate": 9.65746452246931e-06,
"loss": 0.229,
"step": 762
},
{
"epoch": 0.11866251944012442,
"grad_norm": 1.2106031530437371,
"learning_rate": 9.656575331960376e-06,
"loss": 0.2075,
"step": 763
},
{
"epoch": 0.1188180404354588,
"grad_norm": 1.5750869920441555,
"learning_rate": 9.655685029861969e-06,
"loss": 0.2103,
"step": 764
},
{
"epoch": 0.11897356143079316,
"grad_norm": 1.328300416339256,
"learning_rate": 9.654793616386621e-06,
"loss": 0.1822,
"step": 765
},
{
"epoch": 0.11912908242612753,
"grad_norm": 2.218866258760128,
"learning_rate": 9.653901091747124e-06,
"loss": 0.1909,
"step": 766
},
{
"epoch": 0.1192846034214619,
"grad_norm": 1.8622051312400103,
"learning_rate": 9.653007456156536e-06,
"loss": 0.2241,
"step": 767
},
{
"epoch": 0.11944012441679627,
"grad_norm": 1.3832228672336278,
"learning_rate": 9.652112709828179e-06,
"loss": 0.2256,
"step": 768
},
{
"epoch": 0.11959564541213064,
"grad_norm": 1.0673171707909481,
"learning_rate": 9.651216852975643e-06,
"loss": 0.1959,
"step": 769
},
{
"epoch": 0.11975116640746501,
"grad_norm": 1.3393619429463375,
"learning_rate": 9.650319885812777e-06,
"loss": 0.2727,
"step": 770
},
{
"epoch": 0.11990668740279938,
"grad_norm": 1.0882111784771522,
"learning_rate": 9.649421808553708e-06,
"loss": 0.2259,
"step": 771
},
{
"epoch": 0.12006220839813375,
"grad_norm": 4.447919742603164,
"learning_rate": 9.648522621412812e-06,
"loss": 0.231,
"step": 772
},
{
"epoch": 0.12021772939346811,
"grad_norm": 1.5176403638597071,
"learning_rate": 9.647622324604742e-06,
"loss": 0.2824,
"step": 773
},
{
"epoch": 0.12037325038880249,
"grad_norm": 1.7576074795768224,
"learning_rate": 9.646720918344409e-06,
"loss": 0.2034,
"step": 774
},
{
"epoch": 0.12052877138413685,
"grad_norm": 1.5792838723378395,
"learning_rate": 9.645818402846992e-06,
"loss": 0.1677,
"step": 775
},
{
"epoch": 0.12068429237947123,
"grad_norm": 1.0405000433648128,
"learning_rate": 9.644914778327935e-06,
"loss": 0.1742,
"step": 776
},
{
"epoch": 0.1208398133748056,
"grad_norm": 1.545200668981177,
"learning_rate": 9.644010045002942e-06,
"loss": 0.215,
"step": 777
},
{
"epoch": 0.12099533437013997,
"grad_norm": 1.203039484308954,
"learning_rate": 9.64310420308799e-06,
"loss": 0.1997,
"step": 778
},
{
"epoch": 0.12115085536547433,
"grad_norm": 1.038062251460105,
"learning_rate": 9.642197252799315e-06,
"loss": 0.2001,
"step": 779
},
{
"epoch": 0.12130637636080871,
"grad_norm": 1.3963430783849184,
"learning_rate": 9.641289194353418e-06,
"loss": 0.2034,
"step": 780
},
{
"epoch": 0.12146189735614307,
"grad_norm": 1.7069918759015217,
"learning_rate": 9.640380027967065e-06,
"loss": 0.1763,
"step": 781
},
{
"epoch": 0.12161741835147745,
"grad_norm": 1.1485309219449071,
"learning_rate": 9.639469753857287e-06,
"loss": 0.1946,
"step": 782
},
{
"epoch": 0.12177293934681183,
"grad_norm": 0.9976269624811838,
"learning_rate": 9.63855837224138e-06,
"loss": 0.1797,
"step": 783
},
{
"epoch": 0.12192846034214619,
"grad_norm": 1.413148682632424,
"learning_rate": 9.6376458833369e-06,
"loss": 0.1873,
"step": 784
},
{
"epoch": 0.12208398133748057,
"grad_norm": 1.287068701523726,
"learning_rate": 9.636732287361675e-06,
"loss": 0.1964,
"step": 785
},
{
"epoch": 0.12223950233281493,
"grad_norm": 1.338092957612231,
"learning_rate": 9.635817584533791e-06,
"loss": 0.2353,
"step": 786
},
{
"epoch": 0.1223950233281493,
"grad_norm": 1.018985176065171,
"learning_rate": 9.6349017750716e-06,
"loss": 0.243,
"step": 787
},
{
"epoch": 0.12255054432348367,
"grad_norm": 1.434405666961768,
"learning_rate": 9.633984859193722e-06,
"loss": 0.1622,
"step": 788
},
{
"epoch": 0.12270606531881804,
"grad_norm": 1.2392900109261706,
"learning_rate": 9.633066837119034e-06,
"loss": 0.2223,
"step": 789
},
{
"epoch": 0.12286158631415241,
"grad_norm": 0.9045673894396051,
"learning_rate": 9.632147709066682e-06,
"loss": 0.2079,
"step": 790
},
{
"epoch": 0.12301710730948678,
"grad_norm": 1.14443309047443,
"learning_rate": 9.631227475256072e-06,
"loss": 0.1611,
"step": 791
},
{
"epoch": 0.12317262830482115,
"grad_norm": 1.1564291271253233,
"learning_rate": 9.630306135906882e-06,
"loss": 0.1918,
"step": 792
},
{
"epoch": 0.12332814930015552,
"grad_norm": 2.1831582412646138,
"learning_rate": 9.629383691239043e-06,
"loss": 0.3687,
"step": 793
},
{
"epoch": 0.12348367029548989,
"grad_norm": 1.0115623861000755,
"learning_rate": 9.628460141472759e-06,
"loss": 0.1589,
"step": 794
},
{
"epoch": 0.12363919129082426,
"grad_norm": 0.8936049036056027,
"learning_rate": 9.627535486828491e-06,
"loss": 0.1775,
"step": 795
},
{
"epoch": 0.12379471228615863,
"grad_norm": 1.3757750926899586,
"learning_rate": 9.626609727526973e-06,
"loss": 0.2,
"step": 796
},
{
"epoch": 0.123950233281493,
"grad_norm": 1.3462049704057701,
"learning_rate": 9.62568286378919e-06,
"loss": 0.2079,
"step": 797
},
{
"epoch": 0.12410575427682737,
"grad_norm": 2.793319589376331,
"learning_rate": 9.624754895836401e-06,
"loss": 0.2297,
"step": 798
},
{
"epoch": 0.12426127527216174,
"grad_norm": 2.1016347336310357,
"learning_rate": 9.623825823890123e-06,
"loss": 0.3106,
"step": 799
},
{
"epoch": 0.12441679626749612,
"grad_norm": 1.003756031018623,
"learning_rate": 9.622895648172141e-06,
"loss": 0.2143,
"step": 800
},
{
"epoch": 0.12441679626749612,
"eval_loss": 0.2170763909816742,
"eval_runtime": 9.4305,
"eval_samples_per_second": 2.757,
"eval_steps_per_second": 0.742,
"step": 800
},
{
"epoch": 0.12457231726283048,
"grad_norm": 0.897563337381756,
"learning_rate": 9.621964368904497e-06,
"loss": 0.1512,
"step": 801
},
{
"epoch": 0.12472783825816486,
"grad_norm": 1.4190659163727315,
"learning_rate": 9.621031986309504e-06,
"loss": 0.1372,
"step": 802
},
{
"epoch": 0.12488335925349922,
"grad_norm": 1.4031206175030444,
"learning_rate": 9.620098500609734e-06,
"loss": 0.1871,
"step": 803
},
{
"epoch": 0.12503888024883358,
"grad_norm": 1.387547575925909,
"learning_rate": 9.61916391202802e-06,
"loss": 0.2899,
"step": 804
},
{
"epoch": 0.12519440124416797,
"grad_norm": 1.3476031364192975,
"learning_rate": 9.618228220787466e-06,
"loss": 0.1693,
"step": 805
},
{
"epoch": 0.12534992223950234,
"grad_norm": 2.5401419561208787,
"learning_rate": 9.617291427111431e-06,
"loss": 0.141,
"step": 806
},
{
"epoch": 0.1255054432348367,
"grad_norm": 1.918003643731122,
"learning_rate": 9.616353531223543e-06,
"loss": 0.2531,
"step": 807
},
{
"epoch": 0.12566096423017106,
"grad_norm": 0.8824574964250353,
"learning_rate": 9.61541453334769e-06,
"loss": 0.2257,
"step": 808
},
{
"epoch": 0.12581648522550545,
"grad_norm": 1.2069677012195894,
"learning_rate": 9.614474433708021e-06,
"loss": 0.2012,
"step": 809
},
{
"epoch": 0.12597200622083982,
"grad_norm": 0.8806254573901449,
"learning_rate": 9.613533232528956e-06,
"loss": 0.2312,
"step": 810
},
{
"epoch": 0.12612752721617418,
"grad_norm": 0.9758926813848963,
"learning_rate": 9.61259093003517e-06,
"loss": 0.1623,
"step": 811
},
{
"epoch": 0.12628304821150854,
"grad_norm": 1.601541464183247,
"learning_rate": 9.611647526451603e-06,
"loss": 0.2448,
"step": 812
},
{
"epoch": 0.12643856920684293,
"grad_norm": 0.987236561765066,
"learning_rate": 9.610703022003462e-06,
"loss": 0.1833,
"step": 813
},
{
"epoch": 0.1265940902021773,
"grad_norm": 1.1685078861500846,
"learning_rate": 9.60975741691621e-06,
"loss": 0.2708,
"step": 814
},
{
"epoch": 0.12674961119751166,
"grad_norm": 1.2818789908746795,
"learning_rate": 9.608810711415577e-06,
"loss": 0.2132,
"step": 815
},
{
"epoch": 0.12690513219284602,
"grad_norm": 1.7355503765107922,
"learning_rate": 9.607862905727556e-06,
"loss": 0.2316,
"step": 816
},
{
"epoch": 0.1270606531881804,
"grad_norm": 2.4291900998321614,
"learning_rate": 9.6069140000784e-06,
"loss": 0.2607,
"step": 817
},
{
"epoch": 0.12721617418351477,
"grad_norm": 1.2126882446943306,
"learning_rate": 9.605963994694625e-06,
"loss": 0.2374,
"step": 818
},
{
"epoch": 0.12737169517884914,
"grad_norm": 1.402793253608196,
"learning_rate": 9.605012889803013e-06,
"loss": 0.1854,
"step": 819
},
{
"epoch": 0.12752721617418353,
"grad_norm": 1.1350096409875572,
"learning_rate": 9.604060685630608e-06,
"loss": 0.2353,
"step": 820
},
{
"epoch": 0.1276827371695179,
"grad_norm": 0.8605397955086846,
"learning_rate": 9.603107382404708e-06,
"loss": 0.1725,
"step": 821
},
{
"epoch": 0.12783825816485225,
"grad_norm": 1.8193213761501528,
"learning_rate": 9.602152980352884e-06,
"loss": 0.191,
"step": 822
},
{
"epoch": 0.12799377916018662,
"grad_norm": 1.0560479092155457,
"learning_rate": 9.601197479702963e-06,
"loss": 0.2129,
"step": 823
},
{
"epoch": 0.128149300155521,
"grad_norm": 0.9886146739779551,
"learning_rate": 9.60024088068304e-06,
"loss": 0.1349,
"step": 824
},
{
"epoch": 0.12830482115085537,
"grad_norm": 1.044208330213169,
"learning_rate": 9.599283183521467e-06,
"loss": 0.1611,
"step": 825
},
{
"epoch": 0.12846034214618973,
"grad_norm": 1.105951942629371,
"learning_rate": 9.598324388446856e-06,
"loss": 0.25,
"step": 826
},
{
"epoch": 0.1286158631415241,
"grad_norm": 1.2794645483672162,
"learning_rate": 9.59736449568809e-06,
"loss": 0.2132,
"step": 827
},
{
"epoch": 0.12877138413685849,
"grad_norm": 1.3758053785309152,
"learning_rate": 9.596403505474304e-06,
"loss": 0.2149,
"step": 828
},
{
"epoch": 0.12892690513219285,
"grad_norm": 3.355818230170184,
"learning_rate": 9.595441418034903e-06,
"loss": 0.3682,
"step": 829
},
{
"epoch": 0.1290824261275272,
"grad_norm": 1.837073128336488,
"learning_rate": 9.594478233599551e-06,
"loss": 0.2032,
"step": 830
},
{
"epoch": 0.12923794712286157,
"grad_norm": 1.5066969144898332,
"learning_rate": 9.593513952398172e-06,
"loss": 0.2378,
"step": 831
},
{
"epoch": 0.12939346811819596,
"grad_norm": 2.2384679831338614,
"learning_rate": 9.592548574660954e-06,
"loss": 0.3073,
"step": 832
},
{
"epoch": 0.12954898911353033,
"grad_norm": 0.9921790422628257,
"learning_rate": 9.591582100618345e-06,
"loss": 0.1937,
"step": 833
},
{
"epoch": 0.1297045101088647,
"grad_norm": 1.198440591432804,
"learning_rate": 9.590614530501057e-06,
"loss": 0.1925,
"step": 834
},
{
"epoch": 0.12986003110419908,
"grad_norm": 1.3748463927035848,
"learning_rate": 9.589645864540061e-06,
"loss": 0.1941,
"step": 835
},
{
"epoch": 0.13001555209953344,
"grad_norm": 1.3610943196332044,
"learning_rate": 9.588676102966593e-06,
"loss": 0.166,
"step": 836
},
{
"epoch": 0.1301710730948678,
"grad_norm": 0.8955532583487235,
"learning_rate": 9.58770524601215e-06,
"loss": 0.1495,
"step": 837
},
{
"epoch": 0.13032659409020217,
"grad_norm": 1.285038495994977,
"learning_rate": 9.586733293908486e-06,
"loss": 0.2182,
"step": 838
},
{
"epoch": 0.13048211508553656,
"grad_norm": 1.3128144306673817,
"learning_rate": 9.585760246887618e-06,
"loss": 0.2371,
"step": 839
},
{
"epoch": 0.13063763608087092,
"grad_norm": 0.9827038114137296,
"learning_rate": 9.584786105181831e-06,
"loss": 0.2151,
"step": 840
},
{
"epoch": 0.13079315707620529,
"grad_norm": 1.0846767572687748,
"learning_rate": 9.583810869023663e-06,
"loss": 0.2757,
"step": 841
},
{
"epoch": 0.13094867807153965,
"grad_norm": 2.4064875629004265,
"learning_rate": 9.582834538645917e-06,
"loss": 0.2357,
"step": 842
},
{
"epoch": 0.13110419906687404,
"grad_norm": 1.2894114641673238,
"learning_rate": 9.581857114281656e-06,
"loss": 0.1877,
"step": 843
},
{
"epoch": 0.1312597200622084,
"grad_norm": 1.5574730662344252,
"learning_rate": 9.580878596164207e-06,
"loss": 0.1623,
"step": 844
},
{
"epoch": 0.13141524105754276,
"grad_norm": 1.2710340775794473,
"learning_rate": 9.579898984527154e-06,
"loss": 0.187,
"step": 845
},
{
"epoch": 0.13157076205287713,
"grad_norm": 1.4508001676942102,
"learning_rate": 9.578918279604346e-06,
"loss": 0.1372,
"step": 846
},
{
"epoch": 0.13172628304821152,
"grad_norm": 1.0012618056091263,
"learning_rate": 9.577936481629887e-06,
"loss": 0.2201,
"step": 847
},
{
"epoch": 0.13188180404354588,
"grad_norm": 0.8624233281967797,
"learning_rate": 9.576953590838149e-06,
"loss": 0.1979,
"step": 848
},
{
"epoch": 0.13203732503888024,
"grad_norm": 1.083965076436999,
"learning_rate": 9.57596960746376e-06,
"loss": 0.2404,
"step": 849
},
{
"epoch": 0.1321928460342146,
"grad_norm": 2.717551231092263,
"learning_rate": 9.574984531741613e-06,
"loss": 0.2745,
"step": 850
},
{
"epoch": 0.132348367029549,
"grad_norm": 1.1408480485083061,
"learning_rate": 9.573998363906858e-06,
"loss": 0.207,
"step": 851
},
{
"epoch": 0.13250388802488336,
"grad_norm": 1.6013848917828304,
"learning_rate": 9.573011104194907e-06,
"loss": 0.1826,
"step": 852
},
{
"epoch": 0.13265940902021772,
"grad_norm": 1.1178529036140945,
"learning_rate": 9.572022752841433e-06,
"loss": 0.1676,
"step": 853
},
{
"epoch": 0.1328149300155521,
"grad_norm": 1.4964605327939924,
"learning_rate": 9.571033310082367e-06,
"loss": 0.1929,
"step": 854
},
{
"epoch": 0.13297045101088648,
"grad_norm": 1.1404147062516024,
"learning_rate": 9.570042776153904e-06,
"loss": 0.2274,
"step": 855
},
{
"epoch": 0.13312597200622084,
"grad_norm": 1.037410347500119,
"learning_rate": 9.5690511512925e-06,
"loss": 0.1577,
"step": 856
},
{
"epoch": 0.1332814930015552,
"grad_norm": 0.8366673014473697,
"learning_rate": 9.56805843573487e-06,
"loss": 0.1689,
"step": 857
},
{
"epoch": 0.1334370139968896,
"grad_norm": 1.1452085152848681,
"learning_rate": 9.567064629717986e-06,
"loss": 0.1882,
"step": 858
},
{
"epoch": 0.13359253499222395,
"grad_norm": 1.574854487100182,
"learning_rate": 9.566069733479087e-06,
"loss": 0.31,
"step": 859
},
{
"epoch": 0.13374805598755832,
"grad_norm": 1.593208427145828,
"learning_rate": 9.565073747255665e-06,
"loss": 0.2198,
"step": 860
},
{
"epoch": 0.13390357698289268,
"grad_norm": 1.1177720055491567,
"learning_rate": 9.564076671285477e-06,
"loss": 0.2164,
"step": 861
},
{
"epoch": 0.13405909797822707,
"grad_norm": 0.8165999821951461,
"learning_rate": 9.56307850580654e-06,
"loss": 0.1506,
"step": 862
},
{
"epoch": 0.13421461897356143,
"grad_norm": 1.6750367279986849,
"learning_rate": 9.562079251057129e-06,
"loss": 0.1732,
"step": 863
},
{
"epoch": 0.1343701399688958,
"grad_norm": 0.8044448243559967,
"learning_rate": 9.561078907275781e-06,
"loss": 0.1922,
"step": 864
},
{
"epoch": 0.13452566096423016,
"grad_norm": 1.271960150991974,
"learning_rate": 9.56007747470129e-06,
"loss": 0.2229,
"step": 865
},
{
"epoch": 0.13468118195956455,
"grad_norm": 1.0004490456147865,
"learning_rate": 9.559074953572713e-06,
"loss": 0.171,
"step": 866
},
{
"epoch": 0.1348367029548989,
"grad_norm": 1.312217862895249,
"learning_rate": 9.558071344129368e-06,
"loss": 0.1783,
"step": 867
},
{
"epoch": 0.13499222395023328,
"grad_norm": 0.9356844106701133,
"learning_rate": 9.557066646610826e-06,
"loss": 0.1279,
"step": 868
},
{
"epoch": 0.13514774494556764,
"grad_norm": 1.4966712904656105,
"learning_rate": 9.556060861256928e-06,
"loss": 0.1971,
"step": 869
},
{
"epoch": 0.13530326594090203,
"grad_norm": 0.9157016732315058,
"learning_rate": 9.555053988307764e-06,
"loss": 0.1739,
"step": 870
},
{
"epoch": 0.1354587869362364,
"grad_norm": 1.6187813697357434,
"learning_rate": 9.554046028003691e-06,
"loss": 0.2326,
"step": 871
},
{
"epoch": 0.13561430793157075,
"grad_norm": 1.649258041134042,
"learning_rate": 9.553036980585323e-06,
"loss": 0.2775,
"step": 872
},
{
"epoch": 0.13576982892690515,
"grad_norm": 0.8386386166459481,
"learning_rate": 9.552026846293532e-06,
"loss": 0.2225,
"step": 873
},
{
"epoch": 0.1359253499222395,
"grad_norm": 0.96771492040488,
"learning_rate": 9.551015625369455e-06,
"loss": 0.1999,
"step": 874
},
{
"epoch": 0.13608087091757387,
"grad_norm": 1.4939182411934322,
"learning_rate": 9.550003318054482e-06,
"loss": 0.2427,
"step": 875
},
{
"epoch": 0.13623639191290823,
"grad_norm": 1.1599555983572944,
"learning_rate": 9.548989924590263e-06,
"loss": 0.2038,
"step": 876
},
{
"epoch": 0.13639191290824262,
"grad_norm": 1.094972018927162,
"learning_rate": 9.547975445218712e-06,
"loss": 0.1477,
"step": 877
},
{
"epoch": 0.136547433903577,
"grad_norm": 1.5378516224601575,
"learning_rate": 9.546959880181998e-06,
"loss": 0.2411,
"step": 878
},
{
"epoch": 0.13670295489891135,
"grad_norm": 0.8702765312556789,
"learning_rate": 9.545943229722553e-06,
"loss": 0.1646,
"step": 879
},
{
"epoch": 0.1368584758942457,
"grad_norm": 1.3664019719395564,
"learning_rate": 9.544925494083062e-06,
"loss": 0.1688,
"step": 880
},
{
"epoch": 0.1370139968895801,
"grad_norm": 1.3206104649159593,
"learning_rate": 9.543906673506474e-06,
"loss": 0.1623,
"step": 881
},
{
"epoch": 0.13716951788491447,
"grad_norm": 1.3156230503659714,
"learning_rate": 9.542886768235996e-06,
"loss": 0.2297,
"step": 882
},
{
"epoch": 0.13732503888024883,
"grad_norm": 1.727680640232904,
"learning_rate": 9.541865778515094e-06,
"loss": 0.2824,
"step": 883
},
{
"epoch": 0.1374805598755832,
"grad_norm": 1.3346266664784416,
"learning_rate": 9.540843704587492e-06,
"loss": 0.2533,
"step": 884
},
{
"epoch": 0.13763608087091758,
"grad_norm": 1.663603312691407,
"learning_rate": 9.539820546697175e-06,
"loss": 0.1889,
"step": 885
},
{
"epoch": 0.13779160186625194,
"grad_norm": 1.3931002570801638,
"learning_rate": 9.53879630508838e-06,
"loss": 0.2125,
"step": 886
},
{
"epoch": 0.1379471228615863,
"grad_norm": 1.0312695868953268,
"learning_rate": 9.537770980005616e-06,
"loss": 0.157,
"step": 887
},
{
"epoch": 0.13810264385692067,
"grad_norm": 1.291055270497525,
"learning_rate": 9.536744571693634e-06,
"loss": 0.1542,
"step": 888
},
{
"epoch": 0.13825816485225506,
"grad_norm": 1.0586309772197517,
"learning_rate": 9.535717080397458e-06,
"loss": 0.1413,
"step": 889
},
{
"epoch": 0.13841368584758942,
"grad_norm": 1.9142459890481243,
"learning_rate": 9.53468850636236e-06,
"loss": 0.2132,
"step": 890
},
{
"epoch": 0.1385692068429238,
"grad_norm": 1.57785159694773,
"learning_rate": 9.533658849833879e-06,
"loss": 0.2704,
"step": 891
},
{
"epoch": 0.13872472783825818,
"grad_norm": 0.6767899331815482,
"learning_rate": 9.532628111057804e-06,
"loss": 0.1994,
"step": 892
},
{
"epoch": 0.13888024883359254,
"grad_norm": 0.7786068585931847,
"learning_rate": 9.531596290280191e-06,
"loss": 0.2215,
"step": 893
},
{
"epoch": 0.1390357698289269,
"grad_norm": 1.1907351307303637,
"learning_rate": 9.530563387747348e-06,
"loss": 0.1597,
"step": 894
},
{
"epoch": 0.13919129082426127,
"grad_norm": 0.994862972128769,
"learning_rate": 9.529529403705844e-06,
"loss": 0.2586,
"step": 895
},
{
"epoch": 0.13934681181959566,
"grad_norm": 0.9549652766512168,
"learning_rate": 9.528494338402502e-06,
"loss": 0.1332,
"step": 896
},
{
"epoch": 0.13950233281493002,
"grad_norm": 1.1799329518454007,
"learning_rate": 9.527458192084413e-06,
"loss": 0.1884,
"step": 897
},
{
"epoch": 0.13965785381026438,
"grad_norm": 0.7863314952979764,
"learning_rate": 9.526420964998915e-06,
"loss": 0.1679,
"step": 898
},
{
"epoch": 0.13981337480559874,
"grad_norm": 0.937917950726602,
"learning_rate": 9.52538265739361e-06,
"loss": 0.2024,
"step": 899
},
{
"epoch": 0.13996889580093314,
"grad_norm": 1.7160775693106616,
"learning_rate": 9.524343269516354e-06,
"loss": 0.2127,
"step": 900
},
{
"epoch": 0.13996889580093314,
"eval_loss": 0.21867091953754425,
"eval_runtime": 9.4128,
"eval_samples_per_second": 2.762,
"eval_steps_per_second": 0.744,
"step": 900
},
{
"epoch": 0.1401244167962675,
"grad_norm": 1.4496209630087886,
"learning_rate": 9.523302801615266e-06,
"loss": 0.2026,
"step": 901
},
{
"epoch": 0.14027993779160186,
"grad_norm": 0.9035504049737524,
"learning_rate": 9.522261253938721e-06,
"loss": 0.237,
"step": 902
},
{
"epoch": 0.14043545878693622,
"grad_norm": 1.0344016899215176,
"learning_rate": 9.521218626735347e-06,
"loss": 0.2079,
"step": 903
},
{
"epoch": 0.14059097978227061,
"grad_norm": 0.8764502702407341,
"learning_rate": 9.52017492025404e-06,
"loss": 0.1512,
"step": 904
},
{
"epoch": 0.14074650077760498,
"grad_norm": 0.78362955023232,
"learning_rate": 9.519130134743938e-06,
"loss": 0.1544,
"step": 905
},
{
"epoch": 0.14090202177293934,
"grad_norm": 1.331879071297993,
"learning_rate": 9.518084270454456e-06,
"loss": 0.208,
"step": 906
},
{
"epoch": 0.14105754276827373,
"grad_norm": 1.0576721252655992,
"learning_rate": 9.51703732763525e-06,
"loss": 0.1777,
"step": 907
},
{
"epoch": 0.1412130637636081,
"grad_norm": 0.9777650095779323,
"learning_rate": 9.515989306536241e-06,
"loss": 0.2431,
"step": 908
},
{
"epoch": 0.14136858475894246,
"grad_norm": 1.2351460184737522,
"learning_rate": 9.514940207407608e-06,
"loss": 0.164,
"step": 909
},
{
"epoch": 0.14152410575427682,
"grad_norm": 1.0466682687606328,
"learning_rate": 9.513890030499786e-06,
"loss": 0.1862,
"step": 910
},
{
"epoch": 0.1416796267496112,
"grad_norm": 1.667573553968496,
"learning_rate": 9.512838776063464e-06,
"loss": 0.1881,
"step": 911
},
{
"epoch": 0.14183514774494557,
"grad_norm": 1.0309274313381354,
"learning_rate": 9.51178644434959e-06,
"loss": 0.1894,
"step": 912
},
{
"epoch": 0.14199066874027994,
"grad_norm": 1.1516030880613233,
"learning_rate": 9.510733035609376e-06,
"loss": 0.1906,
"step": 913
},
{
"epoch": 0.1421461897356143,
"grad_norm": 1.1964374362259393,
"learning_rate": 9.509678550094282e-06,
"loss": 0.2193,
"step": 914
},
{
"epoch": 0.1423017107309487,
"grad_norm": 1.018131456622998,
"learning_rate": 9.508622988056026e-06,
"loss": 0.18,
"step": 915
},
{
"epoch": 0.14245723172628305,
"grad_norm": 0.9878879365994556,
"learning_rate": 9.50756634974659e-06,
"loss": 0.2303,
"step": 916
},
{
"epoch": 0.14261275272161741,
"grad_norm": 0.9092163587106824,
"learning_rate": 9.506508635418203e-06,
"loss": 0.1565,
"step": 917
},
{
"epoch": 0.14276827371695178,
"grad_norm": 1.2600000274625656,
"learning_rate": 9.505449845323362e-06,
"loss": 0.2203,
"step": 918
},
{
"epoch": 0.14292379471228617,
"grad_norm": 1.0177653430547444,
"learning_rate": 9.504389979714812e-06,
"loss": 0.1708,
"step": 919
},
{
"epoch": 0.14307931570762053,
"grad_norm": 1.323141251202386,
"learning_rate": 9.503329038845556e-06,
"loss": 0.2041,
"step": 920
},
{
"epoch": 0.1432348367029549,
"grad_norm": 0.8666613786933973,
"learning_rate": 9.50226702296886e-06,
"loss": 0.1709,
"step": 921
},
{
"epoch": 0.14339035769828926,
"grad_norm": 1.4717207003269144,
"learning_rate": 9.501203932338238e-06,
"loss": 0.1531,
"step": 922
},
{
"epoch": 0.14354587869362365,
"grad_norm": 0.9850527774643847,
"learning_rate": 9.500139767207465e-06,
"loss": 0.2673,
"step": 923
},
{
"epoch": 0.143701399688958,
"grad_norm": 0.795383661376322,
"learning_rate": 9.499074527830576e-06,
"loss": 0.1514,
"step": 924
},
{
"epoch": 0.14385692068429237,
"grad_norm": 1.5926732733378721,
"learning_rate": 9.498008214461854e-06,
"loss": 0.1919,
"step": 925
},
{
"epoch": 0.14401244167962676,
"grad_norm": 1.0577956165619293,
"learning_rate": 9.496940827355843e-06,
"loss": 0.2541,
"step": 926
},
{
"epoch": 0.14416796267496113,
"grad_norm": 1.0853608193427453,
"learning_rate": 9.495872366767345e-06,
"loss": 0.3026,
"step": 927
},
{
"epoch": 0.1443234836702955,
"grad_norm": 1.5841584604687593,
"learning_rate": 9.494802832951416e-06,
"loss": 0.237,
"step": 928
},
{
"epoch": 0.14447900466562985,
"grad_norm": 1.2668912692543315,
"learning_rate": 9.493732226163368e-06,
"loss": 0.1962,
"step": 929
},
{
"epoch": 0.14463452566096424,
"grad_norm": 1.1865934879383473,
"learning_rate": 9.492660546658771e-06,
"loss": 0.205,
"step": 930
},
{
"epoch": 0.1447900466562986,
"grad_norm": 1.16907334182334,
"learning_rate": 9.491587794693448e-06,
"loss": 0.1649,
"step": 931
},
{
"epoch": 0.14494556765163297,
"grad_norm": 2.6694118671679035,
"learning_rate": 9.490513970523482e-06,
"loss": 0.1716,
"step": 932
},
{
"epoch": 0.14510108864696733,
"grad_norm": 1.2693916754547256,
"learning_rate": 9.489439074405211e-06,
"loss": 0.2102,
"step": 933
},
{
"epoch": 0.14525660964230172,
"grad_norm": 1.4815910522621762,
"learning_rate": 9.488363106595223e-06,
"loss": 0.2146,
"step": 934
},
{
"epoch": 0.14541213063763608,
"grad_norm": 1.5330200808441012,
"learning_rate": 9.48728606735037e-06,
"loss": 0.1767,
"step": 935
},
{
"epoch": 0.14556765163297045,
"grad_norm": 1.123567228978502,
"learning_rate": 9.486207956927756e-06,
"loss": 0.1864,
"step": 936
},
{
"epoch": 0.1457231726283048,
"grad_norm": 0.9960966752159592,
"learning_rate": 9.485128775584737e-06,
"loss": 0.2118,
"step": 937
},
{
"epoch": 0.1458786936236392,
"grad_norm": 1.2303193618017887,
"learning_rate": 9.484048523578934e-06,
"loss": 0.2106,
"step": 938
},
{
"epoch": 0.14603421461897356,
"grad_norm": 1.2867421133114936,
"learning_rate": 9.482967201168218e-06,
"loss": 0.2252,
"step": 939
},
{
"epoch": 0.14618973561430793,
"grad_norm": 1.3372951799730566,
"learning_rate": 9.481884808610712e-06,
"loss": 0.2662,
"step": 940
},
{
"epoch": 0.1463452566096423,
"grad_norm": 0.6808561025624517,
"learning_rate": 9.4808013461648e-06,
"loss": 0.1613,
"step": 941
},
{
"epoch": 0.14650077760497668,
"grad_norm": 1.0617639952793092,
"learning_rate": 9.479716814089119e-06,
"loss": 0.22,
"step": 942
},
{
"epoch": 0.14665629860031104,
"grad_norm": 1.2088515247514138,
"learning_rate": 9.478631212642565e-06,
"loss": 0.2027,
"step": 943
},
{
"epoch": 0.1468118195956454,
"grad_norm": 0.9673478217504623,
"learning_rate": 9.477544542084283e-06,
"loss": 0.2291,
"step": 944
},
{
"epoch": 0.1469673405909798,
"grad_norm": 1.3295783157520016,
"learning_rate": 9.476456802673677e-06,
"loss": 0.2153,
"step": 945
},
{
"epoch": 0.14712286158631416,
"grad_norm": 1.1001160858062626,
"learning_rate": 9.475367994670406e-06,
"loss": 0.2195,
"step": 946
},
{
"epoch": 0.14727838258164852,
"grad_norm": 1.291866801296516,
"learning_rate": 9.474278118334382e-06,
"loss": 0.2213,
"step": 947
},
{
"epoch": 0.14743390357698288,
"grad_norm": 1.2674302718543788,
"learning_rate": 9.473187173925777e-06,
"loss": 0.1371,
"step": 948
},
{
"epoch": 0.14758942457231727,
"grad_norm": 1.4168689609608738,
"learning_rate": 9.472095161705014e-06,
"loss": 0.1902,
"step": 949
},
{
"epoch": 0.14774494556765164,
"grad_norm": 1.0439332293475743,
"learning_rate": 9.471002081932767e-06,
"loss": 0.2069,
"step": 950
},
{
"epoch": 0.147900466562986,
"grad_norm": 1.346490441102045,
"learning_rate": 9.469907934869974e-06,
"loss": 0.1982,
"step": 951
},
{
"epoch": 0.14805598755832036,
"grad_norm": 1.1817129831636979,
"learning_rate": 9.468812720777822e-06,
"loss": 0.1626,
"step": 952
},
{
"epoch": 0.14821150855365475,
"grad_norm": 0.846186520557803,
"learning_rate": 9.467716439917753e-06,
"loss": 0.1659,
"step": 953
},
{
"epoch": 0.14836702954898912,
"grad_norm": 1.77057726290962,
"learning_rate": 9.466619092551467e-06,
"loss": 0.1571,
"step": 954
},
{
"epoch": 0.14852255054432348,
"grad_norm": 1.503606666530362,
"learning_rate": 9.465520678940913e-06,
"loss": 0.2317,
"step": 955
},
{
"epoch": 0.14867807153965784,
"grad_norm": 1.2988561500793663,
"learning_rate": 9.4644211993483e-06,
"loss": 0.184,
"step": 956
},
{
"epoch": 0.14883359253499223,
"grad_norm": 0.9494708116205622,
"learning_rate": 9.463320654036088e-06,
"loss": 0.2061,
"step": 957
},
{
"epoch": 0.1489891135303266,
"grad_norm": 1.1960711999747602,
"learning_rate": 9.462219043266993e-06,
"loss": 0.1595,
"step": 958
},
{
"epoch": 0.14914463452566096,
"grad_norm": 1.456286481771,
"learning_rate": 9.461116367303985e-06,
"loss": 0.1803,
"step": 959
},
{
"epoch": 0.14930015552099535,
"grad_norm": 2.193608162058263,
"learning_rate": 9.460012626410286e-06,
"loss": 0.2372,
"step": 960
},
{
"epoch": 0.1494556765163297,
"grad_norm": 1.1257027932111565,
"learning_rate": 9.458907820849378e-06,
"loss": 0.2183,
"step": 961
},
{
"epoch": 0.14961119751166407,
"grad_norm": 1.2699403552308035,
"learning_rate": 9.457801950884991e-06,
"loss": 0.2112,
"step": 962
},
{
"epoch": 0.14976671850699844,
"grad_norm": 2.0211225561288986,
"learning_rate": 9.456695016781112e-06,
"loss": 0.3771,
"step": 963
},
{
"epoch": 0.14992223950233283,
"grad_norm": 1.6233952494139523,
"learning_rate": 9.455587018801979e-06,
"loss": 0.1654,
"step": 964
},
{
"epoch": 0.1500777604976672,
"grad_norm": 0.9536635356305013,
"learning_rate": 9.454477957212092e-06,
"loss": 0.1971,
"step": 965
},
{
"epoch": 0.15023328149300155,
"grad_norm": 1.2024688455270478,
"learning_rate": 9.453367832276196e-06,
"loss": 0.2073,
"step": 966
},
{
"epoch": 0.15038880248833592,
"grad_norm": 1.0163258023024337,
"learning_rate": 9.452256644259296e-06,
"loss": 0.1622,
"step": 967
},
{
"epoch": 0.1505443234836703,
"grad_norm": 1.4838973791587633,
"learning_rate": 9.451144393426643e-06,
"loss": 0.2058,
"step": 968
},
{
"epoch": 0.15069984447900467,
"grad_norm": 1.0443777554962437,
"learning_rate": 9.450031080043752e-06,
"loss": 0.165,
"step": 969
},
{
"epoch": 0.15085536547433903,
"grad_norm": 1.1175170370729908,
"learning_rate": 9.448916704376384e-06,
"loss": 0.1419,
"step": 970
},
{
"epoch": 0.1510108864696734,
"grad_norm": 1.2857861611804626,
"learning_rate": 9.447801266690557e-06,
"loss": 0.2171,
"step": 971
},
{
"epoch": 0.15116640746500778,
"grad_norm": 0.7407729973632995,
"learning_rate": 9.446684767252539e-06,
"loss": 0.1714,
"step": 972
},
{
"epoch": 0.15132192846034215,
"grad_norm": 2.195989894115042,
"learning_rate": 9.445567206328857e-06,
"loss": 0.1989,
"step": 973
},
{
"epoch": 0.1514774494556765,
"grad_norm": 0.989971668490221,
"learning_rate": 9.444448584186288e-06,
"loss": 0.1664,
"step": 974
},
{
"epoch": 0.15163297045101087,
"grad_norm": 1.081538706581427,
"learning_rate": 9.44332890109186e-06,
"loss": 0.2066,
"step": 975
},
{
"epoch": 0.15178849144634526,
"grad_norm": 1.4377035491264887,
"learning_rate": 9.442208157312859e-06,
"loss": 0.2057,
"step": 976
},
{
"epoch": 0.15194401244167963,
"grad_norm": 1.5898783963503191,
"learning_rate": 9.441086353116825e-06,
"loss": 0.1665,
"step": 977
},
{
"epoch": 0.152099533437014,
"grad_norm": 0.899579074969373,
"learning_rate": 9.439963488771543e-06,
"loss": 0.2091,
"step": 978
},
{
"epoch": 0.15225505443234838,
"grad_norm": 1.4218933674345213,
"learning_rate": 9.438839564545059e-06,
"loss": 0.2344,
"step": 979
},
{
"epoch": 0.15241057542768274,
"grad_norm": 1.2490316562718224,
"learning_rate": 9.437714580705671e-06,
"loss": 0.1771,
"step": 980
},
{
"epoch": 0.1525660964230171,
"grad_norm": 1.3535600594171835,
"learning_rate": 9.436588537521925e-06,
"loss": 0.2402,
"step": 981
},
{
"epoch": 0.15272161741835147,
"grad_norm": 1.2653882449622933,
"learning_rate": 9.435461435262623e-06,
"loss": 0.2368,
"step": 982
},
{
"epoch": 0.15287713841368586,
"grad_norm": 1.4171554003791706,
"learning_rate": 9.434333274196822e-06,
"loss": 0.16,
"step": 983
},
{
"epoch": 0.15303265940902022,
"grad_norm": 0.9372171947174371,
"learning_rate": 9.433204054593832e-06,
"loss": 0.1464,
"step": 984
},
{
"epoch": 0.15318818040435458,
"grad_norm": 0.9807519101904891,
"learning_rate": 9.43207377672321e-06,
"loss": 0.1743,
"step": 985
},
{
"epoch": 0.15334370139968895,
"grad_norm": 1.9830197584350164,
"learning_rate": 9.430942440854772e-06,
"loss": 0.2979,
"step": 986
},
{
"epoch": 0.15349922239502334,
"grad_norm": 1.013327149062581,
"learning_rate": 9.429810047258578e-06,
"loss": 0.2257,
"step": 987
},
{
"epoch": 0.1536547433903577,
"grad_norm": 1.3644569563063227,
"learning_rate": 9.428676596204953e-06,
"loss": 0.227,
"step": 988
},
{
"epoch": 0.15381026438569206,
"grad_norm": 1.2971192291816034,
"learning_rate": 9.427542087964462e-06,
"loss": 0.2012,
"step": 989
},
{
"epoch": 0.15396578538102643,
"grad_norm": 1.063681975107411,
"learning_rate": 9.426406522807932e-06,
"loss": 0.2299,
"step": 990
},
{
"epoch": 0.15412130637636082,
"grad_norm": 1.0390353297783406,
"learning_rate": 9.425269901006435e-06,
"loss": 0.1438,
"step": 991
},
{
"epoch": 0.15427682737169518,
"grad_norm": 1.821321152512482,
"learning_rate": 9.424132222831301e-06,
"loss": 0.1797,
"step": 992
},
{
"epoch": 0.15443234836702954,
"grad_norm": 1.0266940584964872,
"learning_rate": 9.422993488554108e-06,
"loss": 0.1524,
"step": 993
},
{
"epoch": 0.1545878693623639,
"grad_norm": 1.2357982408354415,
"learning_rate": 9.42185369844669e-06,
"loss": 0.1765,
"step": 994
},
{
"epoch": 0.1547433903576983,
"grad_norm": 1.3007180654461126,
"learning_rate": 9.420712852781129e-06,
"loss": 0.2278,
"step": 995
},
{
"epoch": 0.15489891135303266,
"grad_norm": 1.3519816843089092,
"learning_rate": 9.419570951829761e-06,
"loss": 0.2261,
"step": 996
},
{
"epoch": 0.15505443234836702,
"grad_norm": 0.814621189176537,
"learning_rate": 9.418427995865174e-06,
"loss": 0.2172,
"step": 997
},
{
"epoch": 0.1552099533437014,
"grad_norm": 1.7543842879443927,
"learning_rate": 9.417283985160206e-06,
"loss": 0.2164,
"step": 998
},
{
"epoch": 0.15536547433903578,
"grad_norm": 0.8276231350286671,
"learning_rate": 9.41613891998795e-06,
"loss": 0.1975,
"step": 999
},
{
"epoch": 0.15552099533437014,
"grad_norm": 1.1550898822511304,
"learning_rate": 9.414992800621749e-06,
"loss": 0.1501,
"step": 1000
},
{
"epoch": 0.15552099533437014,
"eval_loss": 0.21367190778255463,
"eval_runtime": 9.4284,
"eval_samples_per_second": 2.758,
"eval_steps_per_second": 0.742,
"step": 1000
},
{
"epoch": 0.1556765163297045,
"grad_norm": 1.6764153048318766,
"learning_rate": 9.413845627335197e-06,
"loss": 0.2071,
"step": 1001
},
{
"epoch": 0.1558320373250389,
"grad_norm": 1.1886246410449919,
"learning_rate": 9.41269740040214e-06,
"loss": 0.1956,
"step": 1002
},
{
"epoch": 0.15598755832037325,
"grad_norm": 1.0793500722611682,
"learning_rate": 9.411548120096676e-06,
"loss": 0.144,
"step": 1003
},
{
"epoch": 0.15614307931570762,
"grad_norm": 1.2449924636096124,
"learning_rate": 9.410397786693157e-06,
"loss": 0.2734,
"step": 1004
},
{
"epoch": 0.15629860031104198,
"grad_norm": 0.8611732851449306,
"learning_rate": 9.409246400466178e-06,
"loss": 0.1923,
"step": 1005
},
{
"epoch": 0.15645412130637637,
"grad_norm": 6.74577569453225,
"learning_rate": 9.408093961690596e-06,
"loss": 0.1956,
"step": 1006
},
{
"epoch": 0.15660964230171073,
"grad_norm": 1.2060004741533563,
"learning_rate": 9.406940470641512e-06,
"loss": 0.2739,
"step": 1007
},
{
"epoch": 0.1567651632970451,
"grad_norm": 1.6202727992084955,
"learning_rate": 9.405785927594281e-06,
"loss": 0.3171,
"step": 1008
},
{
"epoch": 0.15692068429237946,
"grad_norm": 2.0124632761977534,
"learning_rate": 9.404630332824509e-06,
"loss": 0.2104,
"step": 1009
},
{
"epoch": 0.15707620528771385,
"grad_norm": 2.0142886633624286,
"learning_rate": 9.40347368660805e-06,
"loss": 0.2548,
"step": 1010
},
{
"epoch": 0.1572317262830482,
"grad_norm": 1.3434989581281018,
"learning_rate": 9.402315989221013e-06,
"loss": 0.2411,
"step": 1011
},
{
"epoch": 0.15738724727838257,
"grad_norm": 1.3315974814677487,
"learning_rate": 9.40115724093976e-06,
"loss": 0.2839,
"step": 1012
},
{
"epoch": 0.15754276827371697,
"grad_norm": 1.1186058721777734,
"learning_rate": 9.399997442040894e-06,
"loss": 0.167,
"step": 1013
},
{
"epoch": 0.15769828926905133,
"grad_norm": 1.4492217703231243,
"learning_rate": 9.39883659280128e-06,
"loss": 0.1268,
"step": 1014
},
{
"epoch": 0.1578538102643857,
"grad_norm": 1.257425749091041,
"learning_rate": 9.39767469349803e-06,
"loss": 0.1433,
"step": 1015
},
{
"epoch": 0.15800933125972005,
"grad_norm": 1.7996939549666984,
"learning_rate": 9.396511744408498e-06,
"loss": 0.2012,
"step": 1016
},
{
"epoch": 0.15816485225505444,
"grad_norm": 0.8429015986655448,
"learning_rate": 9.395347745810304e-06,
"loss": 0.1935,
"step": 1017
},
{
"epoch": 0.1583203732503888,
"grad_norm": 1.370521795316769,
"learning_rate": 9.394182697981306e-06,
"loss": 0.2183,
"step": 1018
},
{
"epoch": 0.15847589424572317,
"grad_norm": 1.622770939923456,
"learning_rate": 9.393016601199622e-06,
"loss": 0.1593,
"step": 1019
},
{
"epoch": 0.15863141524105753,
"grad_norm": 1.011909638401176,
"learning_rate": 9.39184945574361e-06,
"loss": 0.2053,
"step": 1020
},
{
"epoch": 0.15878693623639192,
"grad_norm": 1.6110438711648936,
"learning_rate": 9.390681261891887e-06,
"loss": 0.222,
"step": 1021
},
{
"epoch": 0.1589424572317263,
"grad_norm": 1.4859951673056488,
"learning_rate": 9.389512019923318e-06,
"loss": 0.231,
"step": 1022
},
{
"epoch": 0.15909797822706065,
"grad_norm": 1.166598629738374,
"learning_rate": 9.388341730117015e-06,
"loss": 0.1917,
"step": 1023
},
{
"epoch": 0.159253499222395,
"grad_norm": 1.0987845208229972,
"learning_rate": 9.387170392752342e-06,
"loss": 0.184,
"step": 1024
},
{
"epoch": 0.1594090202177294,
"grad_norm": 1.5795930559063704,
"learning_rate": 9.385998008108917e-06,
"loss": 0.2097,
"step": 1025
},
{
"epoch": 0.15956454121306377,
"grad_norm": 1.4302193933514027,
"learning_rate": 9.384824576466601e-06,
"loss": 0.2194,
"step": 1026
},
{
"epoch": 0.15972006220839813,
"grad_norm": 0.9372034033824603,
"learning_rate": 9.383650098105512e-06,
"loss": 0.243,
"step": 1027
},
{
"epoch": 0.1598755832037325,
"grad_norm": 1.0038945695499553,
"learning_rate": 9.382474573306011e-06,
"loss": 0.1861,
"step": 1028
},
{
"epoch": 0.16003110419906688,
"grad_norm": 0.9989868346004813,
"learning_rate": 9.381298002348713e-06,
"loss": 0.2324,
"step": 1029
},
{
"epoch": 0.16018662519440124,
"grad_norm": 1.4240189031581216,
"learning_rate": 9.380120385514484e-06,
"loss": 0.1974,
"step": 1030
},
{
"epoch": 0.1603421461897356,
"grad_norm": 1.378754367931683,
"learning_rate": 9.378941723084436e-06,
"loss": 0.245,
"step": 1031
},
{
"epoch": 0.16049766718507,
"grad_norm": 1.8715129600892846,
"learning_rate": 9.37776201533993e-06,
"loss": 0.3174,
"step": 1032
},
{
"epoch": 0.16065318818040436,
"grad_norm": 1.1921962243878195,
"learning_rate": 9.376581262562584e-06,
"loss": 0.1917,
"step": 1033
},
{
"epoch": 0.16080870917573872,
"grad_norm": 1.2635206395103649,
"learning_rate": 9.375399465034257e-06,
"loss": 0.1878,
"step": 1034
},
{
"epoch": 0.16096423017107309,
"grad_norm": 1.2398545424205532,
"learning_rate": 9.374216623037057e-06,
"loss": 0.2344,
"step": 1035
},
{
"epoch": 0.16111975116640748,
"grad_norm": 0.9462934166321078,
"learning_rate": 9.373032736853352e-06,
"loss": 0.187,
"step": 1036
},
{
"epoch": 0.16127527216174184,
"grad_norm": 1.5590735847268282,
"learning_rate": 9.371847806765749e-06,
"loss": 0.2097,
"step": 1037
},
{
"epoch": 0.1614307931570762,
"grad_norm": 1.160888284446341,
"learning_rate": 9.370661833057103e-06,
"loss": 0.1506,
"step": 1038
},
{
"epoch": 0.16158631415241057,
"grad_norm": 1.1778543046473768,
"learning_rate": 9.36947481601053e-06,
"loss": 0.1716,
"step": 1039
},
{
"epoch": 0.16174183514774496,
"grad_norm": 1.4532605779910739,
"learning_rate": 9.368286755909383e-06,
"loss": 0.182,
"step": 1040
},
{
"epoch": 0.16189735614307932,
"grad_norm": 0.9502972420425978,
"learning_rate": 9.36709765303727e-06,
"loss": 0.2161,
"step": 1041
},
{
"epoch": 0.16205287713841368,
"grad_norm": 1.4588748874097772,
"learning_rate": 9.365907507678045e-06,
"loss": 0.2338,
"step": 1042
},
{
"epoch": 0.16220839813374804,
"grad_norm": 1.4225573142040282,
"learning_rate": 9.364716320115813e-06,
"loss": 0.1781,
"step": 1043
},
{
"epoch": 0.16236391912908243,
"grad_norm": 1.029996429205044,
"learning_rate": 9.363524090634928e-06,
"loss": 0.2257,
"step": 1044
},
{
"epoch": 0.1625194401244168,
"grad_norm": 1.379085736135871,
"learning_rate": 9.362330819519991e-06,
"loss": 0.2186,
"step": 1045
},
{
"epoch": 0.16267496111975116,
"grad_norm": 1.2962827183429935,
"learning_rate": 9.361136507055853e-06,
"loss": 0.1916,
"step": 1046
},
{
"epoch": 0.16283048211508552,
"grad_norm": 0.9451500150098339,
"learning_rate": 9.359941153527612e-06,
"loss": 0.1859,
"step": 1047
},
{
"epoch": 0.1629860031104199,
"grad_norm": 1.0944328685975881,
"learning_rate": 9.358744759220614e-06,
"loss": 0.2225,
"step": 1048
},
{
"epoch": 0.16314152410575428,
"grad_norm": 1.1266179070522002,
"learning_rate": 9.357547324420461e-06,
"loss": 0.2039,
"step": 1049
},
{
"epoch": 0.16329704510108864,
"grad_norm": 1.26823288307141,
"learning_rate": 9.356348849412991e-06,
"loss": 0.2686,
"step": 1050
},
{
"epoch": 0.16345256609642303,
"grad_norm": 1.3783372129870655,
"learning_rate": 9.355149334484302e-06,
"loss": 0.2715,
"step": 1051
},
{
"epoch": 0.1636080870917574,
"grad_norm": 0.950454440753535,
"learning_rate": 9.35394877992073e-06,
"loss": 0.1697,
"step": 1052
},
{
"epoch": 0.16376360808709176,
"grad_norm": 2.4437577046740895,
"learning_rate": 9.352747186008865e-06,
"loss": 0.2087,
"step": 1053
},
{
"epoch": 0.16391912908242612,
"grad_norm": 1.4140943006046114,
"learning_rate": 9.351544553035547e-06,
"loss": 0.2063,
"step": 1054
},
{
"epoch": 0.1640746500777605,
"grad_norm": 0.967217619359645,
"learning_rate": 9.350340881287861e-06,
"loss": 0.2008,
"step": 1055
},
{
"epoch": 0.16423017107309487,
"grad_norm": 1.4590565286071695,
"learning_rate": 9.349136171053139e-06,
"loss": 0.1897,
"step": 1056
},
{
"epoch": 0.16438569206842923,
"grad_norm": 1.0794053199949247,
"learning_rate": 9.34793042261896e-06,
"loss": 0.1037,
"step": 1057
},
{
"epoch": 0.1645412130637636,
"grad_norm": 1.15272662266887,
"learning_rate": 9.346723636273157e-06,
"loss": 0.239,
"step": 1058
},
{
"epoch": 0.164696734059098,
"grad_norm": 1.3755496055051248,
"learning_rate": 9.345515812303802e-06,
"loss": 0.2655,
"step": 1059
},
{
"epoch": 0.16485225505443235,
"grad_norm": 1.1623669619389423,
"learning_rate": 9.344306950999226e-06,
"loss": 0.2254,
"step": 1060
},
{
"epoch": 0.1650077760497667,
"grad_norm": 1.1373510201117636,
"learning_rate": 9.343097052647996e-06,
"loss": 0.2515,
"step": 1061
},
{
"epoch": 0.16516329704510108,
"grad_norm": 1.349812652007435,
"learning_rate": 9.341886117538931e-06,
"loss": 0.2367,
"step": 1062
},
{
"epoch": 0.16531881804043547,
"grad_norm": 1.0436524504014346,
"learning_rate": 9.340674145961101e-06,
"loss": 0.1552,
"step": 1063
},
{
"epoch": 0.16547433903576983,
"grad_norm": 1.3297059840324263,
"learning_rate": 9.339461138203821e-06,
"loss": 0.2201,
"step": 1064
},
{
"epoch": 0.1656298600311042,
"grad_norm": 1.7541537167845238,
"learning_rate": 9.338247094556651e-06,
"loss": 0.2076,
"step": 1065
},
{
"epoch": 0.16578538102643858,
"grad_norm": 1.442252163275357,
"learning_rate": 9.3370320153094e-06,
"loss": 0.1753,
"step": 1066
},
{
"epoch": 0.16594090202177295,
"grad_norm": 1.143025605577321,
"learning_rate": 9.335815900752125e-06,
"loss": 0.2217,
"step": 1067
},
{
"epoch": 0.1660964230171073,
"grad_norm": 1.178025675869792,
"learning_rate": 9.33459875117513e-06,
"loss": 0.1621,
"step": 1068
},
{
"epoch": 0.16625194401244167,
"grad_norm": 0.8859479026343935,
"learning_rate": 9.333380566868963e-06,
"loss": 0.2214,
"step": 1069
},
{
"epoch": 0.16640746500777606,
"grad_norm": 1.1580516447127225,
"learning_rate": 9.332161348124426e-06,
"loss": 0.2104,
"step": 1070
},
{
"epoch": 0.16656298600311042,
"grad_norm": 0.9322363288405592,
"learning_rate": 9.33094109523256e-06,
"loss": 0.1524,
"step": 1071
},
{
"epoch": 0.1667185069984448,
"grad_norm": 1.2071920671355123,
"learning_rate": 9.32971980848466e-06,
"loss": 0.2204,
"step": 1072
},
{
"epoch": 0.16687402799377915,
"grad_norm": 1.4321090820471434,
"learning_rate": 9.328497488172256e-06,
"loss": 0.2185,
"step": 1073
},
{
"epoch": 0.16702954898911354,
"grad_norm": 1.5323210185604608,
"learning_rate": 9.327274134587144e-06,
"loss": 0.1967,
"step": 1074
},
{
"epoch": 0.1671850699844479,
"grad_norm": 1.2827697157454871,
"learning_rate": 9.326049748021348e-06,
"loss": 0.1835,
"step": 1075
},
{
"epoch": 0.16734059097978227,
"grad_norm": 0.9598851088099357,
"learning_rate": 9.324824328767148e-06,
"loss": 0.1524,
"step": 1076
},
{
"epoch": 0.16749611197511663,
"grad_norm": 1.1012363230038584,
"learning_rate": 9.323597877117069e-06,
"loss": 0.1934,
"step": 1077
},
{
"epoch": 0.16765163297045102,
"grad_norm": 1.7979943018863753,
"learning_rate": 9.322370393363881e-06,
"loss": 0.2809,
"step": 1078
},
{
"epoch": 0.16780715396578538,
"grad_norm": 0.9525483556320685,
"learning_rate": 9.321141877800604e-06,
"loss": 0.1544,
"step": 1079
},
{
"epoch": 0.16796267496111975,
"grad_norm": 1.1079754408286966,
"learning_rate": 9.319912330720502e-06,
"loss": 0.1939,
"step": 1080
},
{
"epoch": 0.1681181959564541,
"grad_norm": 1.4615045454023567,
"learning_rate": 9.31868175241708e-06,
"loss": 0.1879,
"step": 1081
},
{
"epoch": 0.1682737169517885,
"grad_norm": 0.9677318917431114,
"learning_rate": 9.3174501431841e-06,
"loss": 0.1572,
"step": 1082
},
{
"epoch": 0.16842923794712286,
"grad_norm": 1.1156223371393144,
"learning_rate": 9.316217503315562e-06,
"loss": 0.2477,
"step": 1083
},
{
"epoch": 0.16858475894245722,
"grad_norm": 0.9283556985369971,
"learning_rate": 9.314983833105713e-06,
"loss": 0.1855,
"step": 1084
},
{
"epoch": 0.16874027993779162,
"grad_norm": 0.9107625137180413,
"learning_rate": 9.313749132849048e-06,
"loss": 0.1941,
"step": 1085
},
{
"epoch": 0.16889580093312598,
"grad_norm": 1.1200752990922627,
"learning_rate": 9.312513402840308e-06,
"loss": 0.1714,
"step": 1086
},
{
"epoch": 0.16905132192846034,
"grad_norm": 1.5919484746453285,
"learning_rate": 9.311276643374478e-06,
"loss": 0.1907,
"step": 1087
},
{
"epoch": 0.1692068429237947,
"grad_norm": 1.6737891841333687,
"learning_rate": 9.310038854746793e-06,
"loss": 0.3096,
"step": 1088
},
{
"epoch": 0.1693623639191291,
"grad_norm": 0.9356610939198378,
"learning_rate": 9.308800037252726e-06,
"loss": 0.215,
"step": 1089
},
{
"epoch": 0.16951788491446346,
"grad_norm": 0.9978911792591384,
"learning_rate": 9.307560191188e-06,
"loss": 0.2023,
"step": 1090
},
{
"epoch": 0.16967340590979782,
"grad_norm": 0.8618605808228078,
"learning_rate": 9.30631931684859e-06,
"loss": 0.1835,
"step": 1091
},
{
"epoch": 0.16982892690513218,
"grad_norm": 1.073899023320524,
"learning_rate": 9.305077414530701e-06,
"loss": 0.2856,
"step": 1092
},
{
"epoch": 0.16998444790046657,
"grad_norm": 1.390799646940327,
"learning_rate": 9.303834484530798e-06,
"loss": 0.1768,
"step": 1093
},
{
"epoch": 0.17013996889580094,
"grad_norm": 1.1517992631531213,
"learning_rate": 9.302590527145585e-06,
"loss": 0.1661,
"step": 1094
},
{
"epoch": 0.1702954898911353,
"grad_norm": 1.0942354595322217,
"learning_rate": 9.301345542672012e-06,
"loss": 0.2161,
"step": 1095
},
{
"epoch": 0.17045101088646966,
"grad_norm": 0.8079291053355052,
"learning_rate": 9.300099531407273e-06,
"loss": 0.1768,
"step": 1096
},
{
"epoch": 0.17060653188180405,
"grad_norm": 0.8090971826904667,
"learning_rate": 9.298852493648808e-06,
"loss": 0.1761,
"step": 1097
},
{
"epoch": 0.17076205287713841,
"grad_norm": 1.2570428694136606,
"learning_rate": 9.297604429694305e-06,
"loss": 0.1742,
"step": 1098
},
{
"epoch": 0.17091757387247278,
"grad_norm": 1.4714283316352859,
"learning_rate": 9.296355339841692e-06,
"loss": 0.2716,
"step": 1099
},
{
"epoch": 0.17107309486780714,
"grad_norm": 1.07865700806752,
"learning_rate": 9.295105224389144e-06,
"loss": 0.1507,
"step": 1100
},
{
"epoch": 0.17107309486780714,
"eval_loss": 0.21004652976989746,
"eval_runtime": 9.4236,
"eval_samples_per_second": 2.759,
"eval_steps_per_second": 0.743,
"step": 1100
},
{
"epoch": 0.17122861586314153,
"grad_norm": 0.8784655316390252,
"learning_rate": 9.293854083635081e-06,
"loss": 0.1673,
"step": 1101
},
{
"epoch": 0.1713841368584759,
"grad_norm": 1.025281186756548,
"learning_rate": 9.292601917878169e-06,
"loss": 0.1715,
"step": 1102
},
{
"epoch": 0.17153965785381026,
"grad_norm": 1.409333718683306,
"learning_rate": 9.291348727417318e-06,
"loss": 0.2155,
"step": 1103
},
{
"epoch": 0.17169517884914465,
"grad_norm": 1.0469534251307742,
"learning_rate": 9.290094512551679e-06,
"loss": 0.1918,
"step": 1104
},
{
"epoch": 0.171850699844479,
"grad_norm": 1.275008024365504,
"learning_rate": 9.288839273580652e-06,
"loss": 0.1264,
"step": 1105
},
{
"epoch": 0.17200622083981337,
"grad_norm": 1.2168876399929267,
"learning_rate": 9.287583010803882e-06,
"loss": 0.2855,
"step": 1106
},
{
"epoch": 0.17216174183514774,
"grad_norm": 1.2066762279123466,
"learning_rate": 9.286325724521254e-06,
"loss": 0.2242,
"step": 1107
},
{
"epoch": 0.17231726283048213,
"grad_norm": 2.4948253959447144,
"learning_rate": 9.285067415032902e-06,
"loss": 0.2875,
"step": 1108
},
{
"epoch": 0.1724727838258165,
"grad_norm": 1.8284540511597713,
"learning_rate": 9.283808082639198e-06,
"loss": 0.2049,
"step": 1109
},
{
"epoch": 0.17262830482115085,
"grad_norm": 1.3355119525104016,
"learning_rate": 9.282547727640767e-06,
"loss": 0.1717,
"step": 1110
},
{
"epoch": 0.17278382581648521,
"grad_norm": 1.0266534905254066,
"learning_rate": 9.281286350338472e-06,
"loss": 0.2066,
"step": 1111
},
{
"epoch": 0.1729393468118196,
"grad_norm": 1.2099083780797275,
"learning_rate": 9.280023951033418e-06,
"loss": 0.2807,
"step": 1112
},
{
"epoch": 0.17309486780715397,
"grad_norm": 0.949550488293792,
"learning_rate": 9.278760530026963e-06,
"loss": 0.1992,
"step": 1113
},
{
"epoch": 0.17325038880248833,
"grad_norm": 1.0598653084819885,
"learning_rate": 9.277496087620696e-06,
"loss": 0.2358,
"step": 1114
},
{
"epoch": 0.1734059097978227,
"grad_norm": 1.4050304182051088,
"learning_rate": 9.276230624116464e-06,
"loss": 0.2222,
"step": 1115
},
{
"epoch": 0.17356143079315708,
"grad_norm": 0.9817712530234229,
"learning_rate": 9.274964139816347e-06,
"loss": 0.1931,
"step": 1116
},
{
"epoch": 0.17371695178849145,
"grad_norm": 1.7060543693066812,
"learning_rate": 9.273696635022674e-06,
"loss": 0.2343,
"step": 1117
},
{
"epoch": 0.1738724727838258,
"grad_norm": 1.2527360379181598,
"learning_rate": 9.272428110038016e-06,
"loss": 0.1717,
"step": 1118
},
{
"epoch": 0.17402799377916017,
"grad_norm": 1.0592648467758805,
"learning_rate": 9.271158565165186e-06,
"loss": 0.1338,
"step": 1119
},
{
"epoch": 0.17418351477449456,
"grad_norm": 1.1697431614729739,
"learning_rate": 9.269888000707243e-06,
"loss": 0.0937,
"step": 1120
},
{
"epoch": 0.17433903576982893,
"grad_norm": 1.3666630215902802,
"learning_rate": 9.26861641696749e-06,
"loss": 0.195,
"step": 1121
},
{
"epoch": 0.1744945567651633,
"grad_norm": 0.9618565647030869,
"learning_rate": 9.267343814249468e-06,
"loss": 0.175,
"step": 1122
},
{
"epoch": 0.17465007776049768,
"grad_norm": 1.4220832361635052,
"learning_rate": 9.266070192856968e-06,
"loss": 0.1593,
"step": 1123
},
{
"epoch": 0.17480559875583204,
"grad_norm": 0.776257033559064,
"learning_rate": 9.264795553094022e-06,
"loss": 0.2249,
"step": 1124
},
{
"epoch": 0.1749611197511664,
"grad_norm": 1.2113799530837854,
"learning_rate": 9.263519895264901e-06,
"loss": 0.1907,
"step": 1125
},
{
"epoch": 0.17511664074650077,
"grad_norm": 1.3082437362032786,
"learning_rate": 9.262243219674126e-06,
"loss": 0.2666,
"step": 1126
},
{
"epoch": 0.17527216174183516,
"grad_norm": 1.872862944531211,
"learning_rate": 9.260965526626452e-06,
"loss": 0.1784,
"step": 1127
},
{
"epoch": 0.17542768273716952,
"grad_norm": 1.3432522813757912,
"learning_rate": 9.25968681642689e-06,
"loss": 0.1451,
"step": 1128
},
{
"epoch": 0.17558320373250388,
"grad_norm": 0.9703679937198076,
"learning_rate": 9.258407089380679e-06,
"loss": 0.1297,
"step": 1129
},
{
"epoch": 0.17573872472783825,
"grad_norm": 1.0365436632456377,
"learning_rate": 9.25712634579331e-06,
"loss": 0.1761,
"step": 1130
},
{
"epoch": 0.17589424572317264,
"grad_norm": 2.1522303469420994,
"learning_rate": 9.255844585970516e-06,
"loss": 0.1296,
"step": 1131
},
{
"epoch": 0.176049766718507,
"grad_norm": 1.291217930882477,
"learning_rate": 9.254561810218269e-06,
"loss": 0.2044,
"step": 1132
},
{
"epoch": 0.17620528771384136,
"grad_norm": 0.9937462574500329,
"learning_rate": 9.253278018842786e-06,
"loss": 0.1997,
"step": 1133
},
{
"epoch": 0.17636080870917573,
"grad_norm": 0.9450489875743622,
"learning_rate": 9.251993212150525e-06,
"loss": 0.1747,
"step": 1134
},
{
"epoch": 0.17651632970451012,
"grad_norm": 1.4735357191672043,
"learning_rate": 9.250707390448187e-06,
"loss": 0.2377,
"step": 1135
},
{
"epoch": 0.17667185069984448,
"grad_norm": 0.957023692443933,
"learning_rate": 9.24942055404272e-06,
"loss": 0.1319,
"step": 1136
},
{
"epoch": 0.17682737169517884,
"grad_norm": 0.9533362941250507,
"learning_rate": 9.248132703241306e-06,
"loss": 0.142,
"step": 1137
},
{
"epoch": 0.17698289269051323,
"grad_norm": 1.1321821260027138,
"learning_rate": 9.246843838351371e-06,
"loss": 0.185,
"step": 1138
},
{
"epoch": 0.1771384136858476,
"grad_norm": 0.6564569809439412,
"learning_rate": 9.24555395968059e-06,
"loss": 0.1511,
"step": 1139
},
{
"epoch": 0.17729393468118196,
"grad_norm": 0.8235534803965409,
"learning_rate": 9.244263067536872e-06,
"loss": 0.1851,
"step": 1140
},
{
"epoch": 0.17744945567651632,
"grad_norm": 0.97851675810554,
"learning_rate": 9.24297116222837e-06,
"loss": 0.2184,
"step": 1141
},
{
"epoch": 0.1776049766718507,
"grad_norm": 1.1485004351012151,
"learning_rate": 9.241678244063482e-06,
"loss": 0.2106,
"step": 1142
},
{
"epoch": 0.17776049766718507,
"grad_norm": 1.081146125371241,
"learning_rate": 9.240384313350845e-06,
"loss": 0.1844,
"step": 1143
},
{
"epoch": 0.17791601866251944,
"grad_norm": 1.4013409835542678,
"learning_rate": 9.239089370399338e-06,
"loss": 0.2538,
"step": 1144
},
{
"epoch": 0.1780715396578538,
"grad_norm": 6.587281038828778,
"learning_rate": 9.237793415518083e-06,
"loss": 0.2319,
"step": 1145
},
{
"epoch": 0.1782270606531882,
"grad_norm": 1.463087775034242,
"learning_rate": 9.23649644901644e-06,
"loss": 0.1833,
"step": 1146
},
{
"epoch": 0.17838258164852255,
"grad_norm": 0.8603221586452274,
"learning_rate": 9.235198471204017e-06,
"loss": 0.1652,
"step": 1147
},
{
"epoch": 0.17853810264385692,
"grad_norm": 1.243900965186844,
"learning_rate": 9.233899482390654e-06,
"loss": 0.1688,
"step": 1148
},
{
"epoch": 0.17869362363919128,
"grad_norm": 2.2219504182745964,
"learning_rate": 9.232599482886444e-06,
"loss": 0.2472,
"step": 1149
},
{
"epoch": 0.17884914463452567,
"grad_norm": 0.8152250444616337,
"learning_rate": 9.23129847300171e-06,
"loss": 0.1542,
"step": 1150
},
{
"epoch": 0.17900466562986003,
"grad_norm": 0.8972000242254355,
"learning_rate": 9.229996453047022e-06,
"loss": 0.1914,
"step": 1151
},
{
"epoch": 0.1791601866251944,
"grad_norm": 1.3946215944007783,
"learning_rate": 9.228693423333192e-06,
"loss": 0.2517,
"step": 1152
},
{
"epoch": 0.17931570762052876,
"grad_norm": 1.7211813642698215,
"learning_rate": 9.227389384171272e-06,
"loss": 0.1639,
"step": 1153
},
{
"epoch": 0.17947122861586315,
"grad_norm": 1.045567391255685,
"learning_rate": 9.22608433587255e-06,
"loss": 0.1269,
"step": 1154
},
{
"epoch": 0.1796267496111975,
"grad_norm": 1.6046875031988923,
"learning_rate": 9.224778278748567e-06,
"loss": 0.279,
"step": 1155
},
{
"epoch": 0.17978227060653187,
"grad_norm": 1.204453994991899,
"learning_rate": 9.223471213111089e-06,
"loss": 0.1925,
"step": 1156
},
{
"epoch": 0.17993779160186626,
"grad_norm": 1.3023690662744187,
"learning_rate": 9.222163139272134e-06,
"loss": 0.1788,
"step": 1157
},
{
"epoch": 0.18009331259720063,
"grad_norm": 1.1433449264456945,
"learning_rate": 9.220854057543958e-06,
"loss": 0.2228,
"step": 1158
},
{
"epoch": 0.180248833592535,
"grad_norm": 1.2657407961939997,
"learning_rate": 9.219543968239057e-06,
"loss": 0.1985,
"step": 1159
},
{
"epoch": 0.18040435458786935,
"grad_norm": 1.0010295228905417,
"learning_rate": 9.218232871670168e-06,
"loss": 0.1976,
"step": 1160
},
{
"epoch": 0.18055987558320374,
"grad_norm": 1.0125003198196167,
"learning_rate": 9.216920768150266e-06,
"loss": 0.1886,
"step": 1161
},
{
"epoch": 0.1807153965785381,
"grad_norm": 1.2238980097949077,
"learning_rate": 9.215607657992569e-06,
"loss": 0.2848,
"step": 1162
},
{
"epoch": 0.18087091757387247,
"grad_norm": 1.6192762802858285,
"learning_rate": 9.214293541510537e-06,
"loss": 0.1714,
"step": 1163
},
{
"epoch": 0.18102643856920683,
"grad_norm": 1.0273533259054548,
"learning_rate": 9.212978419017864e-06,
"loss": 0.2001,
"step": 1164
},
{
"epoch": 0.18118195956454122,
"grad_norm": 1.1061300881511378,
"learning_rate": 9.211662290828493e-06,
"loss": 0.2214,
"step": 1165
},
{
"epoch": 0.18133748055987559,
"grad_norm": 1.061080909625091,
"learning_rate": 9.210345157256597e-06,
"loss": 0.1914,
"step": 1166
},
{
"epoch": 0.18149300155520995,
"grad_norm": 0.6997117059310394,
"learning_rate": 9.209027018616598e-06,
"loss": 0.1434,
"step": 1167
},
{
"epoch": 0.1816485225505443,
"grad_norm": 2.4894170703666125,
"learning_rate": 9.207707875223153e-06,
"loss": 0.154,
"step": 1168
},
{
"epoch": 0.1818040435458787,
"grad_norm": 1.294402841120763,
"learning_rate": 9.20638772739116e-06,
"loss": 0.1398,
"step": 1169
},
{
"epoch": 0.18195956454121306,
"grad_norm": 1.4691556974020672,
"learning_rate": 9.205066575435754e-06,
"loss": 0.2599,
"step": 1170
},
{
"epoch": 0.18211508553654743,
"grad_norm": 1.5109698106731952,
"learning_rate": 9.203744419672318e-06,
"loss": 0.2715,
"step": 1171
},
{
"epoch": 0.1822706065318818,
"grad_norm": 0.7824851605920647,
"learning_rate": 9.202421260416464e-06,
"loss": 0.155,
"step": 1172
},
{
"epoch": 0.18242612752721618,
"grad_norm": 1.229104135640711,
"learning_rate": 9.20109709798405e-06,
"loss": 0.173,
"step": 1173
},
{
"epoch": 0.18258164852255054,
"grad_norm": 1.493187696337834,
"learning_rate": 9.199771932691172e-06,
"loss": 0.1874,
"step": 1174
},
{
"epoch": 0.1827371695178849,
"grad_norm": 1.3355865457774434,
"learning_rate": 9.198445764854166e-06,
"loss": 0.1868,
"step": 1175
},
{
"epoch": 0.1828926905132193,
"grad_norm": 1.4822915990950787,
"learning_rate": 9.19711859478961e-06,
"loss": 0.1936,
"step": 1176
},
{
"epoch": 0.18304821150855366,
"grad_norm": 1.3568523945836255,
"learning_rate": 9.19579042281431e-06,
"loss": 0.2351,
"step": 1177
},
{
"epoch": 0.18320373250388802,
"grad_norm": 1.1221237537622042,
"learning_rate": 9.194461249245326e-06,
"loss": 0.1651,
"step": 1178
},
{
"epoch": 0.18335925349922239,
"grad_norm": 1.0427220049147299,
"learning_rate": 9.193131074399949e-06,
"loss": 0.2095,
"step": 1179
},
{
"epoch": 0.18351477449455678,
"grad_norm": 1.1443234808493088,
"learning_rate": 9.191799898595706e-06,
"loss": 0.1987,
"step": 1180
},
{
"epoch": 0.18367029548989114,
"grad_norm": 0.8812799774315752,
"learning_rate": 9.190467722150373e-06,
"loss": 0.2529,
"step": 1181
},
{
"epoch": 0.1838258164852255,
"grad_norm": 0.9190808713383141,
"learning_rate": 9.189134545381954e-06,
"loss": 0.2043,
"step": 1182
},
{
"epoch": 0.18398133748055986,
"grad_norm": 1.1496814316391453,
"learning_rate": 9.187800368608703e-06,
"loss": 0.2166,
"step": 1183
},
{
"epoch": 0.18413685847589426,
"grad_norm": 1.3800541644049227,
"learning_rate": 9.1864651921491e-06,
"loss": 0.2258,
"step": 1184
},
{
"epoch": 0.18429237947122862,
"grad_norm": 0.91743359427612,
"learning_rate": 9.185129016321877e-06,
"loss": 0.1383,
"step": 1185
},
{
"epoch": 0.18444790046656298,
"grad_norm": 1.4610869068656602,
"learning_rate": 9.18379184144599e-06,
"loss": 0.1508,
"step": 1186
},
{
"epoch": 0.18460342146189734,
"grad_norm": 1.675711445184492,
"learning_rate": 9.18245366784065e-06,
"loss": 0.303,
"step": 1187
},
{
"epoch": 0.18475894245723173,
"grad_norm": 0.7182617914658281,
"learning_rate": 9.18111449582529e-06,
"loss": 0.1663,
"step": 1188
},
{
"epoch": 0.1849144634525661,
"grad_norm": 2.5919566299762105,
"learning_rate": 9.179774325719593e-06,
"loss": 0.1913,
"step": 1189
},
{
"epoch": 0.18506998444790046,
"grad_norm": 1.5246187638405735,
"learning_rate": 9.178433157843474e-06,
"loss": 0.1974,
"step": 1190
},
{
"epoch": 0.18522550544323485,
"grad_norm": 1.1870049850604156,
"learning_rate": 9.17709099251709e-06,
"loss": 0.1889,
"step": 1191
},
{
"epoch": 0.1853810264385692,
"grad_norm": 1.2780528349437963,
"learning_rate": 9.175747830060837e-06,
"loss": 0.1682,
"step": 1192
},
{
"epoch": 0.18553654743390358,
"grad_norm": 0.852064776201917,
"learning_rate": 9.174403670795342e-06,
"loss": 0.1786,
"step": 1193
},
{
"epoch": 0.18569206842923794,
"grad_norm": 0.982736851978155,
"learning_rate": 9.173058515041477e-06,
"loss": 0.1759,
"step": 1194
},
{
"epoch": 0.18584758942457233,
"grad_norm": 5.383045313258924,
"learning_rate": 9.171712363120351e-06,
"loss": 0.3862,
"step": 1195
},
{
"epoch": 0.1860031104199067,
"grad_norm": 0.9849374073337689,
"learning_rate": 9.170365215353306e-06,
"loss": 0.1981,
"step": 1196
},
{
"epoch": 0.18615863141524105,
"grad_norm": 1.1001803535527055,
"learning_rate": 9.169017072061926e-06,
"loss": 0.1989,
"step": 1197
},
{
"epoch": 0.18631415241057542,
"grad_norm": 1.1570335250140034,
"learning_rate": 9.167667933568032e-06,
"loss": 0.1822,
"step": 1198
},
{
"epoch": 0.1864696734059098,
"grad_norm": 1.6984581879530103,
"learning_rate": 9.166317800193683e-06,
"loss": 0.2171,
"step": 1199
},
{
"epoch": 0.18662519440124417,
"grad_norm": 1.650860536979747,
"learning_rate": 9.164966672261171e-06,
"loss": 0.3055,
"step": 1200
},
{
"epoch": 0.18662519440124417,
"eval_loss": 0.210090771317482,
"eval_runtime": 9.4293,
"eval_samples_per_second": 2.757,
"eval_steps_per_second": 0.742,
"step": 1200
},
{
"epoch": 0.18678071539657853,
"grad_norm": 1.416615143797259,
"learning_rate": 9.163614550093035e-06,
"loss": 0.1347,
"step": 1201
},
{
"epoch": 0.1869362363919129,
"grad_norm": 1.3794733777830905,
"learning_rate": 9.16226143401204e-06,
"loss": 0.2041,
"step": 1202
},
{
"epoch": 0.1870917573872473,
"grad_norm": 0.9282701621282511,
"learning_rate": 9.160907324341199e-06,
"loss": 0.1589,
"step": 1203
},
{
"epoch": 0.18724727838258165,
"grad_norm": 1.4894253244171338,
"learning_rate": 9.159552221403752e-06,
"loss": 0.174,
"step": 1204
},
{
"epoch": 0.187402799377916,
"grad_norm": 1.1504157025776975,
"learning_rate": 9.158196125523182e-06,
"loss": 0.1942,
"step": 1205
},
{
"epoch": 0.18755832037325038,
"grad_norm": 0.7255523870962133,
"learning_rate": 9.156839037023209e-06,
"loss": 0.1925,
"step": 1206
},
{
"epoch": 0.18771384136858477,
"grad_norm": 1.3297160614851913,
"learning_rate": 9.155480956227789e-06,
"loss": 0.2448,
"step": 1207
},
{
"epoch": 0.18786936236391913,
"grad_norm": 1.2394203928257357,
"learning_rate": 9.154121883461115e-06,
"loss": 0.1644,
"step": 1208
},
{
"epoch": 0.1880248833592535,
"grad_norm": 1.110942304313815,
"learning_rate": 9.152761819047617e-06,
"loss": 0.158,
"step": 1209
},
{
"epoch": 0.18818040435458788,
"grad_norm": 0.8597754146450871,
"learning_rate": 9.151400763311958e-06,
"loss": 0.1765,
"step": 1210
},
{
"epoch": 0.18833592534992225,
"grad_norm": 1.1244255534137637,
"learning_rate": 9.150038716579046e-06,
"loss": 0.14,
"step": 1211
},
{
"epoch": 0.1884914463452566,
"grad_norm": 0.9441808017939254,
"learning_rate": 9.148675679174017e-06,
"loss": 0.1685,
"step": 1212
},
{
"epoch": 0.18864696734059097,
"grad_norm": 1.49569762403274,
"learning_rate": 9.147311651422248e-06,
"loss": 0.1637,
"step": 1213
},
{
"epoch": 0.18880248833592536,
"grad_norm": 1.0568658204953814,
"learning_rate": 9.145946633649352e-06,
"loss": 0.1713,
"step": 1214
},
{
"epoch": 0.18895800933125972,
"grad_norm": 1.2127109888393217,
"learning_rate": 9.144580626181176e-06,
"loss": 0.161,
"step": 1215
},
{
"epoch": 0.1891135303265941,
"grad_norm": 0.8503234486008238,
"learning_rate": 9.143213629343807e-06,
"loss": 0.1489,
"step": 1216
},
{
"epoch": 0.18926905132192845,
"grad_norm": 0.9120088478974758,
"learning_rate": 9.141845643463565e-06,
"loss": 0.1939,
"step": 1217
},
{
"epoch": 0.18942457231726284,
"grad_norm": 1.0121267789823751,
"learning_rate": 9.140476668867008e-06,
"loss": 0.15,
"step": 1218
},
{
"epoch": 0.1895800933125972,
"grad_norm": 1.3638566134338714,
"learning_rate": 9.13910670588093e-06,
"loss": 0.2105,
"step": 1219
},
{
"epoch": 0.18973561430793157,
"grad_norm": 1.6276021550806605,
"learning_rate": 9.13773575483236e-06,
"loss": 0.2869,
"step": 1220
},
{
"epoch": 0.18989113530326593,
"grad_norm": 1.6764188720931026,
"learning_rate": 9.136363816048562e-06,
"loss": 0.1458,
"step": 1221
},
{
"epoch": 0.19004665629860032,
"grad_norm": 0.6701780576831128,
"learning_rate": 9.134990889857036e-06,
"loss": 0.1842,
"step": 1222
},
{
"epoch": 0.19020217729393468,
"grad_norm": 1.1322931167082202,
"learning_rate": 9.133616976585522e-06,
"loss": 0.2556,
"step": 1223
},
{
"epoch": 0.19035769828926905,
"grad_norm": 1.2524154763717683,
"learning_rate": 9.13224207656199e-06,
"loss": 0.2104,
"step": 1224
},
{
"epoch": 0.1905132192846034,
"grad_norm": 0.9592897430767787,
"learning_rate": 9.130866190114649e-06,
"loss": 0.2833,
"step": 1225
},
{
"epoch": 0.1906687402799378,
"grad_norm": 1.7651472837705433,
"learning_rate": 9.12948931757194e-06,
"loss": 0.2524,
"step": 1226
},
{
"epoch": 0.19082426127527216,
"grad_norm": 0.9879072001537496,
"learning_rate": 9.128111459262543e-06,
"loss": 0.1624,
"step": 1227
},
{
"epoch": 0.19097978227060652,
"grad_norm": 1.320308534660155,
"learning_rate": 9.126732615515373e-06,
"loss": 0.2937,
"step": 1228
},
{
"epoch": 0.19113530326594091,
"grad_norm": 1.6528470759003213,
"learning_rate": 9.125352786659577e-06,
"loss": 0.1824,
"step": 1229
},
{
"epoch": 0.19129082426127528,
"grad_norm": 1.099113810582022,
"learning_rate": 9.123971973024543e-06,
"loss": 0.2282,
"step": 1230
},
{
"epoch": 0.19144634525660964,
"grad_norm": 0.9906932002367946,
"learning_rate": 9.122590174939887e-06,
"loss": 0.1908,
"step": 1231
},
{
"epoch": 0.191601866251944,
"grad_norm": 1.3700619269813867,
"learning_rate": 9.121207392735465e-06,
"loss": 0.1736,
"step": 1232
},
{
"epoch": 0.1917573872472784,
"grad_norm": 0.9132669255091096,
"learning_rate": 9.119823626741367e-06,
"loss": 0.2559,
"step": 1233
},
{
"epoch": 0.19191290824261276,
"grad_norm": 1.0158832597362466,
"learning_rate": 9.118438877287913e-06,
"loss": 0.218,
"step": 1234
},
{
"epoch": 0.19206842923794712,
"grad_norm": 0.9172450560816615,
"learning_rate": 9.11705314470567e-06,
"loss": 0.2038,
"step": 1235
},
{
"epoch": 0.19222395023328148,
"grad_norm": 1.0457809289045787,
"learning_rate": 9.115666429325424e-06,
"loss": 0.2383,
"step": 1236
},
{
"epoch": 0.19237947122861587,
"grad_norm": 0.6123808194220389,
"learning_rate": 9.114278731478207e-06,
"loss": 0.1059,
"step": 1237
},
{
"epoch": 0.19253499222395024,
"grad_norm": 0.8957445923668392,
"learning_rate": 9.112890051495281e-06,
"loss": 0.1753,
"step": 1238
},
{
"epoch": 0.1926905132192846,
"grad_norm": 1.010302756648279,
"learning_rate": 9.111500389708144e-06,
"loss": 0.2162,
"step": 1239
},
{
"epoch": 0.19284603421461896,
"grad_norm": 1.26307408847368,
"learning_rate": 9.110109746448527e-06,
"loss": 0.1901,
"step": 1240
},
{
"epoch": 0.19300155520995335,
"grad_norm": 1.237621554432501,
"learning_rate": 9.108718122048395e-06,
"loss": 0.1746,
"step": 1241
},
{
"epoch": 0.19315707620528771,
"grad_norm": 0.9172927280641415,
"learning_rate": 9.107325516839952e-06,
"loss": 0.1556,
"step": 1242
},
{
"epoch": 0.19331259720062208,
"grad_norm": 1.7268710214147918,
"learning_rate": 9.105931931155626e-06,
"loss": 0.2808,
"step": 1243
},
{
"epoch": 0.19346811819595647,
"grad_norm": 0.8932022562830918,
"learning_rate": 9.10453736532809e-06,
"loss": 0.1527,
"step": 1244
},
{
"epoch": 0.19362363919129083,
"grad_norm": 1.2202712676463288,
"learning_rate": 9.103141819690246e-06,
"loss": 0.1376,
"step": 1245
},
{
"epoch": 0.1937791601866252,
"grad_norm": 1.0826681500025592,
"learning_rate": 9.101745294575227e-06,
"loss": 0.1449,
"step": 1246
},
{
"epoch": 0.19393468118195956,
"grad_norm": 1.1807575757930213,
"learning_rate": 9.100347790316409e-06,
"loss": 0.2126,
"step": 1247
},
{
"epoch": 0.19409020217729395,
"grad_norm": 0.941763687751761,
"learning_rate": 9.098949307247391e-06,
"loss": 0.1632,
"step": 1248
},
{
"epoch": 0.1942457231726283,
"grad_norm": 1.378441641768549,
"learning_rate": 9.097549845702009e-06,
"loss": 0.1906,
"step": 1249
},
{
"epoch": 0.19440124416796267,
"grad_norm": 1.2339116886059447,
"learning_rate": 9.09614940601434e-06,
"loss": 0.2006,
"step": 1250
},
{
"epoch": 0.19455676516329704,
"grad_norm": 1.1239344680494445,
"learning_rate": 9.094747988518683e-06,
"loss": 0.2336,
"step": 1251
},
{
"epoch": 0.19471228615863143,
"grad_norm": 0.927588276459713,
"learning_rate": 9.093345593549579e-06,
"loss": 0.1449,
"step": 1252
},
{
"epoch": 0.1948678071539658,
"grad_norm": 1.13724282637735,
"learning_rate": 9.091942221441797e-06,
"loss": 0.2126,
"step": 1253
},
{
"epoch": 0.19502332814930015,
"grad_norm": 1.0365698182525573,
"learning_rate": 9.090537872530343e-06,
"loss": 0.1867,
"step": 1254
},
{
"epoch": 0.19517884914463451,
"grad_norm": 0.9371814591941575,
"learning_rate": 9.089132547150453e-06,
"loss": 0.1618,
"step": 1255
},
{
"epoch": 0.1953343701399689,
"grad_norm": 1.0697225550230685,
"learning_rate": 9.0877262456376e-06,
"loss": 0.1849,
"step": 1256
},
{
"epoch": 0.19548989113530327,
"grad_norm": 1.5583498729530745,
"learning_rate": 9.086318968327488e-06,
"loss": 0.2014,
"step": 1257
},
{
"epoch": 0.19564541213063763,
"grad_norm": 1.2271229677253923,
"learning_rate": 9.084910715556052e-06,
"loss": 0.2017,
"step": 1258
},
{
"epoch": 0.195800933125972,
"grad_norm": 1.0026506309270833,
"learning_rate": 9.083501487659461e-06,
"loss": 0.1646,
"step": 1259
},
{
"epoch": 0.19595645412130638,
"grad_norm": 1.2598951391108157,
"learning_rate": 9.08209128497412e-06,
"loss": 0.1851,
"step": 1260
},
{
"epoch": 0.19611197511664075,
"grad_norm": 1.5838356552966606,
"learning_rate": 9.080680107836662e-06,
"loss": 0.1948,
"step": 1261
},
{
"epoch": 0.1962674961119751,
"grad_norm": 1.1087104243969894,
"learning_rate": 9.079267956583953e-06,
"loss": 0.1687,
"step": 1262
},
{
"epoch": 0.1964230171073095,
"grad_norm": 1.6020412697904411,
"learning_rate": 9.077854831553097e-06,
"loss": 0.1854,
"step": 1263
},
{
"epoch": 0.19657853810264386,
"grad_norm": 1.0315547992066338,
"learning_rate": 9.076440733081426e-06,
"loss": 0.2211,
"step": 1264
},
{
"epoch": 0.19673405909797823,
"grad_norm": 1.0349194289967332,
"learning_rate": 9.075025661506505e-06,
"loss": 0.182,
"step": 1265
},
{
"epoch": 0.1968895800933126,
"grad_norm": 0.8148640872234216,
"learning_rate": 9.073609617166129e-06,
"loss": 0.2319,
"step": 1266
},
{
"epoch": 0.19704510108864698,
"grad_norm": 0.8956967698145264,
"learning_rate": 9.072192600398328e-06,
"loss": 0.2318,
"step": 1267
},
{
"epoch": 0.19720062208398134,
"grad_norm": 1.512397062737358,
"learning_rate": 9.070774611541366e-06,
"loss": 0.1279,
"step": 1268
},
{
"epoch": 0.1973561430793157,
"grad_norm": 1.089155641459757,
"learning_rate": 9.069355650933732e-06,
"loss": 0.132,
"step": 1269
},
{
"epoch": 0.19751166407465007,
"grad_norm": 1.15341700389814,
"learning_rate": 9.06793571891416e-06,
"loss": 0.1416,
"step": 1270
},
{
"epoch": 0.19766718506998446,
"grad_norm": 1.2188604321419376,
"learning_rate": 9.0665148158216e-06,
"loss": 0.1635,
"step": 1271
},
{
"epoch": 0.19782270606531882,
"grad_norm": 1.6133883720632236,
"learning_rate": 9.065092941995245e-06,
"loss": 0.185,
"step": 1272
},
{
"epoch": 0.19797822706065318,
"grad_norm": 1.4486872766212289,
"learning_rate": 9.063670097774513e-06,
"loss": 0.2325,
"step": 1273
},
{
"epoch": 0.19813374805598755,
"grad_norm": 1.557263365124596,
"learning_rate": 9.062246283499058e-06,
"loss": 0.1712,
"step": 1274
},
{
"epoch": 0.19828926905132194,
"grad_norm": 1.9875754585690109,
"learning_rate": 9.060821499508769e-06,
"loss": 0.1843,
"step": 1275
},
{
"epoch": 0.1984447900466563,
"grad_norm": 1.1418131416263584,
"learning_rate": 9.059395746143756e-06,
"loss": 0.1777,
"step": 1276
},
{
"epoch": 0.19860031104199066,
"grad_norm": 1.0395361627239141,
"learning_rate": 9.057969023744367e-06,
"loss": 0.2194,
"step": 1277
},
{
"epoch": 0.19875583203732503,
"grad_norm": 1.305159234748547,
"learning_rate": 9.056541332651183e-06,
"loss": 0.2141,
"step": 1278
},
{
"epoch": 0.19891135303265942,
"grad_norm": 1.0849932011185046,
"learning_rate": 9.055112673205014e-06,
"loss": 0.1821,
"step": 1279
},
{
"epoch": 0.19906687402799378,
"grad_norm": 0.979089764226756,
"learning_rate": 9.053683045746897e-06,
"loss": 0.269,
"step": 1280
},
{
"epoch": 0.19922239502332814,
"grad_norm": 1.078405593629792,
"learning_rate": 9.052252450618106e-06,
"loss": 0.1413,
"step": 1281
},
{
"epoch": 0.19937791601866253,
"grad_norm": 1.2031448135959215,
"learning_rate": 9.050820888160145e-06,
"loss": 0.2268,
"step": 1282
},
{
"epoch": 0.1995334370139969,
"grad_norm": 0.9432997632179643,
"learning_rate": 9.049388358714747e-06,
"loss": 0.0856,
"step": 1283
},
{
"epoch": 0.19968895800933126,
"grad_norm": 1.1798467376681538,
"learning_rate": 9.04795486262388e-06,
"loss": 0.1487,
"step": 1284
},
{
"epoch": 0.19984447900466562,
"grad_norm": 0.9959594825238516,
"learning_rate": 9.046520400229734e-06,
"loss": 0.1363,
"step": 1285
},
{
"epoch": 0.2,
"grad_norm": 1.2777597650080654,
"learning_rate": 9.045084971874738e-06,
"loss": 0.2053,
"step": 1286
},
{
"epoch": 0.20015552099533437,
"grad_norm": 1.3807813898572032,
"learning_rate": 9.04364857790155e-06,
"loss": 0.1608,
"step": 1287
},
{
"epoch": 0.20031104199066874,
"grad_norm": 1.213101350130223,
"learning_rate": 9.042211218653054e-06,
"loss": 0.1783,
"step": 1288
},
{
"epoch": 0.2004665629860031,
"grad_norm": 1.270497799974636,
"learning_rate": 9.040772894472369e-06,
"loss": 0.1335,
"step": 1289
},
{
"epoch": 0.2006220839813375,
"grad_norm": 1.143678584624158,
"learning_rate": 9.039333605702844e-06,
"loss": 0.2566,
"step": 1290
},
{
"epoch": 0.20077760497667185,
"grad_norm": 0.9321591383595857,
"learning_rate": 9.03789335268806e-06,
"loss": 0.1517,
"step": 1291
},
{
"epoch": 0.20093312597200622,
"grad_norm": 0.8482625172580437,
"learning_rate": 9.036452135771818e-06,
"loss": 0.2284,
"step": 1292
},
{
"epoch": 0.20108864696734058,
"grad_norm": 1.5799008472731184,
"learning_rate": 9.035009955298163e-06,
"loss": 0.2491,
"step": 1293
},
{
"epoch": 0.20124416796267497,
"grad_norm": 1.5021594414320747,
"learning_rate": 9.03356681161136e-06,
"loss": 0.1623,
"step": 1294
},
{
"epoch": 0.20139968895800933,
"grad_norm": 1.1207507593154515,
"learning_rate": 9.032122705055912e-06,
"loss": 0.1996,
"step": 1295
},
{
"epoch": 0.2015552099533437,
"grad_norm": 1.1753346897113919,
"learning_rate": 9.030677635976542e-06,
"loss": 0.156,
"step": 1296
},
{
"epoch": 0.20171073094867809,
"grad_norm": 1.582912014985177,
"learning_rate": 9.02923160471821e-06,
"loss": 0.2852,
"step": 1297
},
{
"epoch": 0.20186625194401245,
"grad_norm": 4.24419003235004,
"learning_rate": 9.027784611626108e-06,
"loss": 0.1857,
"step": 1298
},
{
"epoch": 0.2020217729393468,
"grad_norm": 3.465507316165179,
"learning_rate": 9.026336657045646e-06,
"loss": 0.1331,
"step": 1299
},
{
"epoch": 0.20217729393468117,
"grad_norm": 0.8992554022243577,
"learning_rate": 9.024887741322475e-06,
"loss": 0.1649,
"step": 1300
},
{
"epoch": 0.20217729393468117,
"eval_loss": 0.20873166620731354,
"eval_runtime": 9.4107,
"eval_samples_per_second": 2.763,
"eval_steps_per_second": 0.744,
"step": 1300
},
{
"epoch": 0.20233281493001556,
"grad_norm": 1.2089278079623347,
"learning_rate": 9.023437864802472e-06,
"loss": 0.2705,
"step": 1301
},
{
"epoch": 0.20248833592534993,
"grad_norm": 1.2901991665649666,
"learning_rate": 9.021987027831743e-06,
"loss": 0.1672,
"step": 1302
},
{
"epoch": 0.2026438569206843,
"grad_norm": 1.5354719963652408,
"learning_rate": 9.02053523075662e-06,
"loss": 0.239,
"step": 1303
},
{
"epoch": 0.20279937791601865,
"grad_norm": 1.214882523492219,
"learning_rate": 9.01908247392367e-06,
"loss": 0.1566,
"step": 1304
},
{
"epoch": 0.20295489891135304,
"grad_norm": 1.473765899129253,
"learning_rate": 9.017628757679685e-06,
"loss": 0.1931,
"step": 1305
},
{
"epoch": 0.2031104199066874,
"grad_norm": 2.6517165969707683,
"learning_rate": 9.01617408237169e-06,
"loss": 0.1307,
"step": 1306
},
{
"epoch": 0.20326594090202177,
"grad_norm": 1.4993932954062734,
"learning_rate": 9.01471844834693e-06,
"loss": 0.2079,
"step": 1307
},
{
"epoch": 0.20342146189735613,
"grad_norm": 1.0866992812991043,
"learning_rate": 9.013261855952893e-06,
"loss": 0.2361,
"step": 1308
},
{
"epoch": 0.20357698289269052,
"grad_norm": 1.4691858213747517,
"learning_rate": 9.011804305537281e-06,
"loss": 0.2062,
"step": 1309
},
{
"epoch": 0.20373250388802489,
"grad_norm": 1.218397331201916,
"learning_rate": 9.010345797448037e-06,
"loss": 0.1295,
"step": 1310
},
{
"epoch": 0.20388802488335925,
"grad_norm": 1.317910015288317,
"learning_rate": 9.008886332033323e-06,
"loss": 0.221,
"step": 1311
},
{
"epoch": 0.2040435458786936,
"grad_norm": 1.4368413534493716,
"learning_rate": 9.007425909641538e-06,
"loss": 0.3292,
"step": 1312
},
{
"epoch": 0.204199066874028,
"grad_norm": 1.24467623609956,
"learning_rate": 9.005964530621301e-06,
"loss": 0.2276,
"step": 1313
},
{
"epoch": 0.20435458786936236,
"grad_norm": 0.9849662601801316,
"learning_rate": 9.004502195321468e-06,
"loss": 0.1825,
"step": 1314
},
{
"epoch": 0.20451010886469673,
"grad_norm": 3.783152250453029,
"learning_rate": 9.003038904091113e-06,
"loss": 0.1834,
"step": 1315
},
{
"epoch": 0.20466562986003112,
"grad_norm": 1.0234608190416166,
"learning_rate": 9.001574657279548e-06,
"loss": 0.2172,
"step": 1316
},
{
"epoch": 0.20482115085536548,
"grad_norm": 1.3240981295825394,
"learning_rate": 9.00010945523631e-06,
"loss": 0.1857,
"step": 1317
},
{
"epoch": 0.20497667185069984,
"grad_norm": 1.1823107793426477,
"learning_rate": 8.99864329831116e-06,
"loss": 0.2747,
"step": 1318
},
{
"epoch": 0.2051321928460342,
"grad_norm": 1.183188676477308,
"learning_rate": 8.997176186854091e-06,
"loss": 0.2091,
"step": 1319
},
{
"epoch": 0.2052877138413686,
"grad_norm": 1.1306812200844953,
"learning_rate": 8.995708121215325e-06,
"loss": 0.1789,
"step": 1320
},
{
"epoch": 0.20544323483670296,
"grad_norm": 1.3914844708441778,
"learning_rate": 8.994239101745309e-06,
"loss": 0.1626,
"step": 1321
},
{
"epoch": 0.20559875583203732,
"grad_norm": 1.3328736681097808,
"learning_rate": 8.992769128794717e-06,
"loss": 0.1699,
"step": 1322
},
{
"epoch": 0.20575427682737168,
"grad_norm": 1.3262550452320387,
"learning_rate": 8.991298202714453e-06,
"loss": 0.1985,
"step": 1323
},
{
"epoch": 0.20590979782270608,
"grad_norm": 1.5863201904107513,
"learning_rate": 8.989826323855647e-06,
"loss": 0.2729,
"step": 1324
},
{
"epoch": 0.20606531881804044,
"grad_norm": 1.0484153422588192,
"learning_rate": 8.988353492569657e-06,
"loss": 0.2243,
"step": 1325
},
{
"epoch": 0.2062208398133748,
"grad_norm": 0.9724310873787251,
"learning_rate": 8.986879709208069e-06,
"loss": 0.2349,
"step": 1326
},
{
"epoch": 0.20637636080870916,
"grad_norm": 1.319839764006134,
"learning_rate": 8.985404974122699e-06,
"loss": 0.1796,
"step": 1327
},
{
"epoch": 0.20653188180404355,
"grad_norm": 1.7134943634197457,
"learning_rate": 8.983929287665579e-06,
"loss": 0.2289,
"step": 1328
},
{
"epoch": 0.20668740279937792,
"grad_norm": 1.1812406274342315,
"learning_rate": 8.98245265018898e-06,
"loss": 0.2123,
"step": 1329
},
{
"epoch": 0.20684292379471228,
"grad_norm": 1.4771839041530355,
"learning_rate": 8.980975062045398e-06,
"loss": 0.2228,
"step": 1330
},
{
"epoch": 0.20699844479004664,
"grad_norm": 1.0073337669892177,
"learning_rate": 8.979496523587552e-06,
"loss": 0.1455,
"step": 1331
},
{
"epoch": 0.20715396578538103,
"grad_norm": 0.8665969448850475,
"learning_rate": 8.978017035168389e-06,
"loss": 0.1689,
"step": 1332
},
{
"epoch": 0.2073094867807154,
"grad_norm": 1.0555827692971853,
"learning_rate": 8.976536597141085e-06,
"loss": 0.1708,
"step": 1333
},
{
"epoch": 0.20746500777604976,
"grad_norm": 0.8842215270037568,
"learning_rate": 8.97505520985904e-06,
"loss": 0.1751,
"step": 1334
},
{
"epoch": 0.20762052877138415,
"grad_norm": 1.6924145041248846,
"learning_rate": 8.973572873675882e-06,
"loss": 0.1697,
"step": 1335
},
{
"epoch": 0.2077760497667185,
"grad_norm": 1.1225115788471978,
"learning_rate": 8.972089588945467e-06,
"loss": 0.22,
"step": 1336
},
{
"epoch": 0.20793157076205288,
"grad_norm": 1.1964311921620439,
"learning_rate": 8.970605356021873e-06,
"loss": 0.1953,
"step": 1337
},
{
"epoch": 0.20808709175738724,
"grad_norm": 1.1874827397504135,
"learning_rate": 8.96912017525941e-06,
"loss": 0.1541,
"step": 1338
},
{
"epoch": 0.20824261275272163,
"grad_norm": 1.2996586003784654,
"learning_rate": 8.967634047012607e-06,
"loss": 0.2543,
"step": 1339
},
{
"epoch": 0.208398133748056,
"grad_norm": 1.9568915465615424,
"learning_rate": 8.96614697163623e-06,
"loss": 0.1742,
"step": 1340
},
{
"epoch": 0.20855365474339035,
"grad_norm": 1.327702070183964,
"learning_rate": 8.96465894948526e-06,
"loss": 0.1688,
"step": 1341
},
{
"epoch": 0.20870917573872472,
"grad_norm": 0.998729186682604,
"learning_rate": 8.963169980914908e-06,
"loss": 0.2165,
"step": 1342
},
{
"epoch": 0.2088646967340591,
"grad_norm": 0.9250328323650552,
"learning_rate": 8.961680066280614e-06,
"loss": 0.1978,
"step": 1343
},
{
"epoch": 0.20902021772939347,
"grad_norm": 0.685484375204563,
"learning_rate": 8.96018920593804e-06,
"loss": 0.1521,
"step": 1344
},
{
"epoch": 0.20917573872472783,
"grad_norm": 1.2194077898180222,
"learning_rate": 8.958697400243077e-06,
"loss": 0.129,
"step": 1345
},
{
"epoch": 0.2093312597200622,
"grad_norm": 1.3390006867631312,
"learning_rate": 8.957204649551838e-06,
"loss": 0.2295,
"step": 1346
},
{
"epoch": 0.2094867807153966,
"grad_norm": 1.0791715779616644,
"learning_rate": 8.955710954220664e-06,
"loss": 0.1922,
"step": 1347
},
{
"epoch": 0.20964230171073095,
"grad_norm": 1.0448818497216468,
"learning_rate": 8.954216314606123e-06,
"loss": 0.2074,
"step": 1348
},
{
"epoch": 0.2097978227060653,
"grad_norm": 1.0968024521734823,
"learning_rate": 8.952720731065e-06,
"loss": 0.1956,
"step": 1349
},
{
"epoch": 0.2099533437013997,
"grad_norm": 1.1729159260054676,
"learning_rate": 8.95122420395432e-06,
"loss": 0.1032,
"step": 1350
},
{
"epoch": 0.21010886469673407,
"grad_norm": 0.7605452577854958,
"learning_rate": 8.949726733631319e-06,
"loss": 0.2173,
"step": 1351
},
{
"epoch": 0.21026438569206843,
"grad_norm": 0.7896405561018206,
"learning_rate": 8.948228320453465e-06,
"loss": 0.1411,
"step": 1352
},
{
"epoch": 0.2104199066874028,
"grad_norm": 1.3664851820052848,
"learning_rate": 8.946728964778452e-06,
"loss": 0.2043,
"step": 1353
},
{
"epoch": 0.21057542768273718,
"grad_norm": 1.0930532560076165,
"learning_rate": 8.945228666964197e-06,
"loss": 0.2112,
"step": 1354
},
{
"epoch": 0.21073094867807154,
"grad_norm": 1.3370376996193614,
"learning_rate": 8.94372742736884e-06,
"loss": 0.2763,
"step": 1355
},
{
"epoch": 0.2108864696734059,
"grad_norm": 1.1733695403983486,
"learning_rate": 8.942225246350748e-06,
"loss": 0.1383,
"step": 1356
},
{
"epoch": 0.21104199066874027,
"grad_norm": 1.518123240050466,
"learning_rate": 8.940722124268515e-06,
"loss": 0.2035,
"step": 1357
},
{
"epoch": 0.21119751166407466,
"grad_norm": 0.7154774393150748,
"learning_rate": 8.939218061480955e-06,
"loss": 0.1513,
"step": 1358
},
{
"epoch": 0.21135303265940902,
"grad_norm": 1.7277749667928948,
"learning_rate": 8.937713058347109e-06,
"loss": 0.1852,
"step": 1359
},
{
"epoch": 0.2115085536547434,
"grad_norm": 0.8101754008908368,
"learning_rate": 8.936207115226242e-06,
"loss": 0.1755,
"step": 1360
},
{
"epoch": 0.21166407465007775,
"grad_norm": 2.154263107894285,
"learning_rate": 8.934700232477845e-06,
"loss": 0.2284,
"step": 1361
},
{
"epoch": 0.21181959564541214,
"grad_norm": 2.9946702775104552,
"learning_rate": 8.933192410461632e-06,
"loss": 0.1571,
"step": 1362
},
{
"epoch": 0.2119751166407465,
"grad_norm": 1.3293853025848206,
"learning_rate": 8.931683649537539e-06,
"loss": 0.1818,
"step": 1363
},
{
"epoch": 0.21213063763608087,
"grad_norm": 1.069623910831374,
"learning_rate": 8.93017395006573e-06,
"loss": 0.2389,
"step": 1364
},
{
"epoch": 0.21228615863141523,
"grad_norm": 1.2692486168753456,
"learning_rate": 8.928663312406593e-06,
"loss": 0.1725,
"step": 1365
},
{
"epoch": 0.21244167962674962,
"grad_norm": 2.31269662319102,
"learning_rate": 8.927151736920733e-06,
"loss": 0.3472,
"step": 1366
},
{
"epoch": 0.21259720062208398,
"grad_norm": 1.3024374295612378,
"learning_rate": 8.925639223968989e-06,
"loss": 0.1601,
"step": 1367
},
{
"epoch": 0.21275272161741834,
"grad_norm": 1.475662600105692,
"learning_rate": 8.924125773912418e-06,
"loss": 0.1652,
"step": 1368
},
{
"epoch": 0.21290824261275273,
"grad_norm": 0.8719883727219597,
"learning_rate": 8.9226113871123e-06,
"loss": 0.2406,
"step": 1369
},
{
"epoch": 0.2130637636080871,
"grad_norm": 1.355947295843189,
"learning_rate": 8.921096063930141e-06,
"loss": 0.2387,
"step": 1370
},
{
"epoch": 0.21321928460342146,
"grad_norm": 1.462171782992857,
"learning_rate": 8.919579804727671e-06,
"loss": 0.2075,
"step": 1371
},
{
"epoch": 0.21337480559875582,
"grad_norm": 1.4186556891621878,
"learning_rate": 8.91806260986684e-06,
"loss": 0.1906,
"step": 1372
},
{
"epoch": 0.21353032659409021,
"grad_norm": 1.0297515081183366,
"learning_rate": 8.916544479709826e-06,
"loss": 0.1813,
"step": 1373
},
{
"epoch": 0.21368584758942458,
"grad_norm": 0.8517207332254344,
"learning_rate": 8.915025414619025e-06,
"loss": 0.2314,
"step": 1374
},
{
"epoch": 0.21384136858475894,
"grad_norm": 1.4500725099182117,
"learning_rate": 8.91350541495706e-06,
"loss": 0.2702,
"step": 1375
},
{
"epoch": 0.2139968895800933,
"grad_norm": 1.4840249529134437,
"learning_rate": 8.911984481086779e-06,
"loss": 0.1957,
"step": 1376
},
{
"epoch": 0.2141524105754277,
"grad_norm": 1.0812621557572404,
"learning_rate": 8.910462613371246e-06,
"loss": 0.1773,
"step": 1377
},
{
"epoch": 0.21430793157076206,
"grad_norm": 0.8285771638848516,
"learning_rate": 8.908939812173756e-06,
"loss": 0.1879,
"step": 1378
},
{
"epoch": 0.21446345256609642,
"grad_norm": 1.5413069191948623,
"learning_rate": 8.907416077857818e-06,
"loss": 0.2024,
"step": 1379
},
{
"epoch": 0.21461897356143078,
"grad_norm": 1.5546998088262725,
"learning_rate": 8.905891410787174e-06,
"loss": 0.1297,
"step": 1380
},
{
"epoch": 0.21477449455676517,
"grad_norm": 1.0276705986435684,
"learning_rate": 8.904365811325779e-06,
"loss": 0.1777,
"step": 1381
},
{
"epoch": 0.21493001555209953,
"grad_norm": 2.186178551364591,
"learning_rate": 8.902839279837818e-06,
"loss": 0.1936,
"step": 1382
},
{
"epoch": 0.2150855365474339,
"grad_norm": 1.409142378067793,
"learning_rate": 8.901311816687693e-06,
"loss": 0.2347,
"step": 1383
},
{
"epoch": 0.21524105754276826,
"grad_norm": 0.909249039104448,
"learning_rate": 8.899783422240031e-06,
"loss": 0.1858,
"step": 1384
},
{
"epoch": 0.21539657853810265,
"grad_norm": 1.389710830109919,
"learning_rate": 8.898254096859681e-06,
"loss": 0.2546,
"step": 1385
},
{
"epoch": 0.215552099533437,
"grad_norm": 1.1722812780197163,
"learning_rate": 8.896723840911718e-06,
"loss": 0.2451,
"step": 1386
},
{
"epoch": 0.21570762052877138,
"grad_norm": 1.0186256750739588,
"learning_rate": 8.89519265476143e-06,
"loss": 0.1423,
"step": 1387
},
{
"epoch": 0.21586314152410577,
"grad_norm": 1.463755060922718,
"learning_rate": 8.893660538774335e-06,
"loss": 0.678,
"step": 1388
},
{
"epoch": 0.21601866251944013,
"grad_norm": 1.2144290461428764,
"learning_rate": 8.892127493316172e-06,
"loss": 0.1289,
"step": 1389
},
{
"epoch": 0.2161741835147745,
"grad_norm": 1.2754281076641276,
"learning_rate": 8.8905935187529e-06,
"loss": 0.1775,
"step": 1390
},
{
"epoch": 0.21632970451010886,
"grad_norm": 0.8239843617970345,
"learning_rate": 8.889058615450695e-06,
"loss": 0.1379,
"step": 1391
},
{
"epoch": 0.21648522550544325,
"grad_norm": 0.8183516543340216,
"learning_rate": 8.887522783775965e-06,
"loss": 0.4396,
"step": 1392
},
{
"epoch": 0.2166407465007776,
"grad_norm": 1.163898200737944,
"learning_rate": 8.885986024095334e-06,
"loss": 0.1788,
"step": 1393
},
{
"epoch": 0.21679626749611197,
"grad_norm": 1.0398663598746642,
"learning_rate": 8.884448336775647e-06,
"loss": 0.2058,
"step": 1394
},
{
"epoch": 0.21695178849144633,
"grad_norm": 1.1038527572141106,
"learning_rate": 8.882909722183973e-06,
"loss": 0.1603,
"step": 1395
},
{
"epoch": 0.21710730948678073,
"grad_norm": 1.1407224011212185,
"learning_rate": 8.881370180687597e-06,
"loss": 0.212,
"step": 1396
},
{
"epoch": 0.2172628304821151,
"grad_norm": 1.171491183176733,
"learning_rate": 8.879829712654032e-06,
"loss": 0.156,
"step": 1397
},
{
"epoch": 0.21741835147744945,
"grad_norm": 1.0914587320494888,
"learning_rate": 8.878288318451006e-06,
"loss": 0.0999,
"step": 1398
},
{
"epoch": 0.2175738724727838,
"grad_norm": 1.0719935831541472,
"learning_rate": 8.876745998446477e-06,
"loss": 0.2026,
"step": 1399
},
{
"epoch": 0.2177293934681182,
"grad_norm": 0.8893812774700685,
"learning_rate": 8.875202753008614e-06,
"loss": 0.1152,
"step": 1400
},
{
"epoch": 0.2177293934681182,
"eval_loss": 0.20550738275051117,
"eval_runtime": 9.4165,
"eval_samples_per_second": 2.761,
"eval_steps_per_second": 0.743,
"step": 1400
},
{
"epoch": 0.21788491446345257,
"grad_norm": 1.1732595194107243,
"learning_rate": 8.873658582505813e-06,
"loss": 0.184,
"step": 1401
},
{
"epoch": 0.21804043545878693,
"grad_norm": 1.3681566501491238,
"learning_rate": 8.872113487306686e-06,
"loss": 0.1787,
"step": 1402
},
{
"epoch": 0.2181959564541213,
"grad_norm": 0.9384518321736989,
"learning_rate": 8.870567467780073e-06,
"loss": 0.1907,
"step": 1403
},
{
"epoch": 0.21835147744945568,
"grad_norm": 1.6918321800893066,
"learning_rate": 8.86902052429503e-06,
"loss": 0.1814,
"step": 1404
},
{
"epoch": 0.21850699844479005,
"grad_norm": 1.0615675392544648,
"learning_rate": 8.867472657220829e-06,
"loss": 0.1807,
"step": 1405
},
{
"epoch": 0.2186625194401244,
"grad_norm": 1.2104557155019795,
"learning_rate": 8.865923866926973e-06,
"loss": 0.2046,
"step": 1406
},
{
"epoch": 0.2188180404354588,
"grad_norm": 1.409015102478802,
"learning_rate": 8.864374153783177e-06,
"loss": 0.2415,
"step": 1407
},
{
"epoch": 0.21897356143079316,
"grad_norm": 1.2088161026937052,
"learning_rate": 8.86282351815938e-06,
"loss": 0.1573,
"step": 1408
},
{
"epoch": 0.21912908242612752,
"grad_norm": 1.0288225427805875,
"learning_rate": 8.861271960425741e-06,
"loss": 0.1812,
"step": 1409
},
{
"epoch": 0.2192846034214619,
"grad_norm": 1.1067487085965078,
"learning_rate": 8.859719480952637e-06,
"loss": 0.1955,
"step": 1410
},
{
"epoch": 0.21944012441679628,
"grad_norm": 1.331527983707418,
"learning_rate": 8.858166080110666e-06,
"loss": 0.2153,
"step": 1411
},
{
"epoch": 0.21959564541213064,
"grad_norm": 1.3966673201995545,
"learning_rate": 8.85661175827065e-06,
"loss": 0.1861,
"step": 1412
},
{
"epoch": 0.219751166407465,
"grad_norm": 1.7346922539447693,
"learning_rate": 8.855056515803624e-06,
"loss": 0.2217,
"step": 1413
},
{
"epoch": 0.21990668740279937,
"grad_norm": 1.0429561703393233,
"learning_rate": 8.853500353080848e-06,
"loss": 0.137,
"step": 1414
},
{
"epoch": 0.22006220839813376,
"grad_norm": 1.099146007367247,
"learning_rate": 8.851943270473797e-06,
"loss": 0.1888,
"step": 1415
},
{
"epoch": 0.22021772939346812,
"grad_norm": 1.0622173162674204,
"learning_rate": 8.850385268354171e-06,
"loss": 0.2054,
"step": 1416
},
{
"epoch": 0.22037325038880248,
"grad_norm": 1.7275165681110787,
"learning_rate": 8.848826347093887e-06,
"loss": 0.1839,
"step": 1417
},
{
"epoch": 0.22052877138413685,
"grad_norm": 1.4049206778214125,
"learning_rate": 8.84726650706508e-06,
"loss": 0.2719,
"step": 1418
},
{
"epoch": 0.22068429237947124,
"grad_norm": 0.984134518775913,
"learning_rate": 8.845705748640104e-06,
"loss": 0.2118,
"step": 1419
},
{
"epoch": 0.2208398133748056,
"grad_norm": 0.8575267757080008,
"learning_rate": 8.844144072191537e-06,
"loss": 0.1633,
"step": 1420
},
{
"epoch": 0.22099533437013996,
"grad_norm": 1.2572159208716647,
"learning_rate": 8.842581478092172e-06,
"loss": 0.2397,
"step": 1421
},
{
"epoch": 0.22115085536547435,
"grad_norm": 1.2016606507273602,
"learning_rate": 8.841017966715019e-06,
"loss": 0.2033,
"step": 1422
},
{
"epoch": 0.22130637636080872,
"grad_norm": 1.3276461025791215,
"learning_rate": 8.839453538433314e-06,
"loss": 0.1925,
"step": 1423
},
{
"epoch": 0.22146189735614308,
"grad_norm": 0.8224092915871075,
"learning_rate": 8.837888193620506e-06,
"loss": 0.1358,
"step": 1424
},
{
"epoch": 0.22161741835147744,
"grad_norm": 1.4495835386689406,
"learning_rate": 8.836321932650266e-06,
"loss": 0.2432,
"step": 1425
},
{
"epoch": 0.22177293934681183,
"grad_norm": 1.4755944744177818,
"learning_rate": 8.83475475589648e-06,
"loss": 0.1231,
"step": 1426
},
{
"epoch": 0.2219284603421462,
"grad_norm": 0.8119316049057401,
"learning_rate": 8.833186663733258e-06,
"loss": 0.2097,
"step": 1427
},
{
"epoch": 0.22208398133748056,
"grad_norm": 1.0060829041279713,
"learning_rate": 8.83161765653492e-06,
"loss": 0.1738,
"step": 1428
},
{
"epoch": 0.22223950233281492,
"grad_norm": 2.4145754711073733,
"learning_rate": 8.830047734676018e-06,
"loss": 0.2858,
"step": 1429
},
{
"epoch": 0.2223950233281493,
"grad_norm": 1.1242173153619541,
"learning_rate": 8.828476898531308e-06,
"loss": 0.2166,
"step": 1430
},
{
"epoch": 0.22255054432348367,
"grad_norm": 0.9324040289076934,
"learning_rate": 8.826905148475772e-06,
"loss": 0.1157,
"step": 1431
},
{
"epoch": 0.22270606531881804,
"grad_norm": 1.1091005510043248,
"learning_rate": 8.82533248488461e-06,
"loss": 0.2387,
"step": 1432
},
{
"epoch": 0.2228615863141524,
"grad_norm": 1.5660091935097067,
"learning_rate": 8.823758908133237e-06,
"loss": 0.1783,
"step": 1433
},
{
"epoch": 0.2230171073094868,
"grad_norm": 1.7595194847301099,
"learning_rate": 8.822184418597289e-06,
"loss": 0.1971,
"step": 1434
},
{
"epoch": 0.22317262830482115,
"grad_norm": 1.1991294408769844,
"learning_rate": 8.820609016652616e-06,
"loss": 0.1993,
"step": 1435
},
{
"epoch": 0.22332814930015552,
"grad_norm": 1.16155323748872,
"learning_rate": 8.819032702675293e-06,
"loss": 0.1663,
"step": 1436
},
{
"epoch": 0.22348367029548988,
"grad_norm": 1.144471577400653,
"learning_rate": 8.817455477041605e-06,
"loss": 0.1553,
"step": 1437
},
{
"epoch": 0.22363919129082427,
"grad_norm": 1.1758756635872867,
"learning_rate": 8.815877340128059e-06,
"loss": 0.1997,
"step": 1438
},
{
"epoch": 0.22379471228615863,
"grad_norm": 1.0774573442962538,
"learning_rate": 8.814298292311376e-06,
"loss": 0.224,
"step": 1439
},
{
"epoch": 0.223950233281493,
"grad_norm": 1.018897824496983,
"learning_rate": 8.812718333968498e-06,
"loss": 0.1969,
"step": 1440
},
{
"epoch": 0.22410575427682738,
"grad_norm": 0.7464671714955523,
"learning_rate": 8.811137465476584e-06,
"loss": 0.1704,
"step": 1441
},
{
"epoch": 0.22426127527216175,
"grad_norm": 1.120267062163412,
"learning_rate": 8.80955568721301e-06,
"loss": 0.1941,
"step": 1442
},
{
"epoch": 0.2244167962674961,
"grad_norm": 1.57559360058438,
"learning_rate": 8.807972999555368e-06,
"loss": 0.2603,
"step": 1443
},
{
"epoch": 0.22457231726283047,
"grad_norm": 0.939702806688543,
"learning_rate": 8.806389402881466e-06,
"loss": 0.2116,
"step": 1444
},
{
"epoch": 0.22472783825816486,
"grad_norm": 1.2188628504615986,
"learning_rate": 8.80480489756933e-06,
"loss": 0.2424,
"step": 1445
},
{
"epoch": 0.22488335925349923,
"grad_norm": 1.0944670304974327,
"learning_rate": 8.803219483997205e-06,
"loss": 0.1828,
"step": 1446
},
{
"epoch": 0.2250388802488336,
"grad_norm": 1.6182633129569433,
"learning_rate": 8.801633162543555e-06,
"loss": 0.1964,
"step": 1447
},
{
"epoch": 0.22519440124416795,
"grad_norm": 0.8387542620173406,
"learning_rate": 8.800045933587052e-06,
"loss": 0.1585,
"step": 1448
},
{
"epoch": 0.22534992223950234,
"grad_norm": 1.2464384825217707,
"learning_rate": 8.798457797506588e-06,
"loss": 0.1134,
"step": 1449
},
{
"epoch": 0.2255054432348367,
"grad_norm": 0.9893884401535724,
"learning_rate": 8.79686875468128e-06,
"loss": 0.2421,
"step": 1450
},
{
"epoch": 0.22566096423017107,
"grad_norm": 1.0480572189617101,
"learning_rate": 8.79527880549045e-06,
"loss": 0.1921,
"step": 1451
},
{
"epoch": 0.22581648522550543,
"grad_norm": 1.3152458887234093,
"learning_rate": 8.793687950313643e-06,
"loss": 0.1597,
"step": 1452
},
{
"epoch": 0.22597200622083982,
"grad_norm": 1.0970403207876425,
"learning_rate": 8.792096189530614e-06,
"loss": 0.1854,
"step": 1453
},
{
"epoch": 0.22612752721617418,
"grad_norm": 1.0705518033654797,
"learning_rate": 8.790503523521346e-06,
"loss": 0.1839,
"step": 1454
},
{
"epoch": 0.22628304821150855,
"grad_norm": 1.132932961220967,
"learning_rate": 8.788909952666024e-06,
"loss": 0.1871,
"step": 1455
},
{
"epoch": 0.2264385692068429,
"grad_norm": 1.4797221000535143,
"learning_rate": 8.787315477345059e-06,
"loss": 0.2295,
"step": 1456
},
{
"epoch": 0.2265940902021773,
"grad_norm": 1.0944162670416104,
"learning_rate": 8.785720097939075e-06,
"loss": 0.1745,
"step": 1457
},
{
"epoch": 0.22674961119751166,
"grad_norm": 1.6430830107526284,
"learning_rate": 8.784123814828908e-06,
"loss": 0.3592,
"step": 1458
},
{
"epoch": 0.22690513219284603,
"grad_norm": 1.1438907408683774,
"learning_rate": 8.782526628395616e-06,
"loss": 0.1613,
"step": 1459
},
{
"epoch": 0.22706065318818042,
"grad_norm": 2.655708868757693,
"learning_rate": 8.780928539020467e-06,
"loss": 0.1821,
"step": 1460
},
{
"epoch": 0.22721617418351478,
"grad_norm": 0.9605535718803637,
"learning_rate": 8.779329547084949e-06,
"loss": 0.1707,
"step": 1461
},
{
"epoch": 0.22737169517884914,
"grad_norm": 2.2075086894366036,
"learning_rate": 8.777729652970765e-06,
"loss": 0.1383,
"step": 1462
},
{
"epoch": 0.2275272161741835,
"grad_norm": 1.1974721511606266,
"learning_rate": 8.77612885705983e-06,
"loss": 0.2615,
"step": 1463
},
{
"epoch": 0.2276827371695179,
"grad_norm": 1.076273551290465,
"learning_rate": 8.774527159734277e-06,
"loss": 0.2094,
"step": 1464
},
{
"epoch": 0.22783825816485226,
"grad_norm": 1.3601919661341624,
"learning_rate": 8.772924561376454e-06,
"loss": 0.2324,
"step": 1465
},
{
"epoch": 0.22799377916018662,
"grad_norm": 1.4328079078867457,
"learning_rate": 8.771321062368922e-06,
"loss": 0.1763,
"step": 1466
},
{
"epoch": 0.22814930015552098,
"grad_norm": 1.1869126356200645,
"learning_rate": 8.76971666309446e-06,
"loss": 0.1093,
"step": 1467
},
{
"epoch": 0.22830482115085537,
"grad_norm": 0.8016043523305539,
"learning_rate": 8.768111363936058e-06,
"loss": 0.1716,
"step": 1468
},
{
"epoch": 0.22846034214618974,
"grad_norm": 1.1279000832737547,
"learning_rate": 8.766505165276928e-06,
"loss": 0.1415,
"step": 1469
},
{
"epoch": 0.2286158631415241,
"grad_norm": 1.4632653437041683,
"learning_rate": 8.764898067500488e-06,
"loss": 0.1682,
"step": 1470
},
{
"epoch": 0.22877138413685846,
"grad_norm": 1.427331448842405,
"learning_rate": 8.763290070990377e-06,
"loss": 0.261,
"step": 1471
},
{
"epoch": 0.22892690513219285,
"grad_norm": 0.9926126679211552,
"learning_rate": 8.761681176130443e-06,
"loss": 0.1625,
"step": 1472
},
{
"epoch": 0.22908242612752722,
"grad_norm": 1.690385156533882,
"learning_rate": 8.760071383304755e-06,
"loss": 0.2803,
"step": 1473
},
{
"epoch": 0.22923794712286158,
"grad_norm": 1.0976612977720204,
"learning_rate": 8.758460692897593e-06,
"loss": 0.1802,
"step": 1474
},
{
"epoch": 0.22939346811819597,
"grad_norm": 1.2314757179900722,
"learning_rate": 8.756849105293447e-06,
"loss": 0.1768,
"step": 1475
},
{
"epoch": 0.22954898911353033,
"grad_norm": 1.1327643054428198,
"learning_rate": 8.755236620877033e-06,
"loss": 0.1865,
"step": 1476
},
{
"epoch": 0.2297045101088647,
"grad_norm": 1.1639229615649782,
"learning_rate": 8.753623240033265e-06,
"loss": 0.1524,
"step": 1477
},
{
"epoch": 0.22986003110419906,
"grad_norm": 0.9603164098229106,
"learning_rate": 8.752008963147285e-06,
"loss": 0.1721,
"step": 1478
},
{
"epoch": 0.23001555209953345,
"grad_norm": 1.38792631561096,
"learning_rate": 8.750393790604442e-06,
"loss": 0.2342,
"step": 1479
},
{
"epoch": 0.2301710730948678,
"grad_norm": 1.2479053691859538,
"learning_rate": 8.7487777227903e-06,
"loss": 0.1938,
"step": 1480
},
{
"epoch": 0.23032659409020217,
"grad_norm": 1.2509939431760002,
"learning_rate": 8.747160760090637e-06,
"loss": 0.1844,
"step": 1481
},
{
"epoch": 0.23048211508553654,
"grad_norm": 1.465934150389407,
"learning_rate": 8.745542902891444e-06,
"loss": 0.205,
"step": 1482
},
{
"epoch": 0.23063763608087093,
"grad_norm": 1.0510694170069674,
"learning_rate": 8.743924151578928e-06,
"loss": 0.1759,
"step": 1483
},
{
"epoch": 0.2307931570762053,
"grad_norm": 1.2869382169156265,
"learning_rate": 8.742304506539506e-06,
"loss": 0.1634,
"step": 1484
},
{
"epoch": 0.23094867807153965,
"grad_norm": 2.0849533877813067,
"learning_rate": 8.740683968159808e-06,
"loss": 0.1834,
"step": 1485
},
{
"epoch": 0.23110419906687402,
"grad_norm": 0.5391088701503829,
"learning_rate": 8.739062536826683e-06,
"loss": 0.1062,
"step": 1486
},
{
"epoch": 0.2312597200622084,
"grad_norm": 1.339043790882886,
"learning_rate": 8.737440212927188e-06,
"loss": 0.154,
"step": 1487
},
{
"epoch": 0.23141524105754277,
"grad_norm": 1.2239049109865379,
"learning_rate": 8.735816996848592e-06,
"loss": 0.1694,
"step": 1488
},
{
"epoch": 0.23157076205287713,
"grad_norm": 0.8785721668205927,
"learning_rate": 8.734192888978381e-06,
"loss": 0.1501,
"step": 1489
},
{
"epoch": 0.2317262830482115,
"grad_norm": 1.1018359589714184,
"learning_rate": 8.732567889704253e-06,
"loss": 0.2004,
"step": 1490
},
{
"epoch": 0.23188180404354589,
"grad_norm": 1.2782960384351885,
"learning_rate": 8.730941999414117e-06,
"loss": 0.1514,
"step": 1491
},
{
"epoch": 0.23203732503888025,
"grad_norm": 0.7470536578634075,
"learning_rate": 8.729315218496097e-06,
"loss": 0.1828,
"step": 1492
},
{
"epoch": 0.2321928460342146,
"grad_norm": 1.0314729949458916,
"learning_rate": 8.727687547338527e-06,
"loss": 0.1766,
"step": 1493
},
{
"epoch": 0.232348367029549,
"grad_norm": 1.435780946058732,
"learning_rate": 8.726058986329954e-06,
"loss": 0.2574,
"step": 1494
},
{
"epoch": 0.23250388802488337,
"grad_norm": 1.3013711909380183,
"learning_rate": 8.72442953585914e-06,
"loss": 0.2304,
"step": 1495
},
{
"epoch": 0.23265940902021773,
"grad_norm": 1.3258835525000316,
"learning_rate": 8.722799196315057e-06,
"loss": 0.1649,
"step": 1496
},
{
"epoch": 0.2328149300155521,
"grad_norm": 1.4810824648278473,
"learning_rate": 8.721167968086888e-06,
"loss": 0.2786,
"step": 1497
},
{
"epoch": 0.23297045101088648,
"grad_norm": 0.8879588001193606,
"learning_rate": 8.719535851564034e-06,
"loss": 0.1662,
"step": 1498
},
{
"epoch": 0.23312597200622084,
"grad_norm": 1.0006636128134747,
"learning_rate": 8.7179028471361e-06,
"loss": 0.144,
"step": 1499
},
{
"epoch": 0.2332814930015552,
"grad_norm": 1.0732426035660707,
"learning_rate": 8.716268955192908e-06,
"loss": 0.1799,
"step": 1500
},
{
"epoch": 0.2332814930015552,
"eval_loss": 0.20381511747837067,
"eval_runtime": 9.4315,
"eval_samples_per_second": 2.757,
"eval_steps_per_second": 0.742,
"step": 1500
},
{
"epoch": 0.23343701399688957,
"grad_norm": 1.1848798776210054,
"learning_rate": 8.714634176124492e-06,
"loss": 0.2192,
"step": 1501
},
{
"epoch": 0.23359253499222396,
"grad_norm": 1.1734962627193575,
"learning_rate": 8.712998510321095e-06,
"loss": 0.2218,
"step": 1502
},
{
"epoch": 0.23374805598755832,
"grad_norm": 1.0346380522248477,
"learning_rate": 8.711361958173175e-06,
"loss": 0.1561,
"step": 1503
},
{
"epoch": 0.23390357698289269,
"grad_norm": 0.8380236750022618,
"learning_rate": 8.709724520071399e-06,
"loss": 0.1238,
"step": 1504
},
{
"epoch": 0.23405909797822705,
"grad_norm": 0.8234400155679666,
"learning_rate": 8.708086196406646e-06,
"loss": 0.1887,
"step": 1505
},
{
"epoch": 0.23421461897356144,
"grad_norm": 1.3627952832885772,
"learning_rate": 8.706446987570005e-06,
"loss": 0.1739,
"step": 1506
},
{
"epoch": 0.2343701399688958,
"grad_norm": 2.486707766460104,
"learning_rate": 8.704806893952782e-06,
"loss": 0.1462,
"step": 1507
},
{
"epoch": 0.23452566096423016,
"grad_norm": 1.041812062354574,
"learning_rate": 8.703165915946488e-06,
"loss": 0.2247,
"step": 1508
},
{
"epoch": 0.23468118195956453,
"grad_norm": 1.2090827115985525,
"learning_rate": 8.701524053942846e-06,
"loss": 0.1931,
"step": 1509
},
{
"epoch": 0.23483670295489892,
"grad_norm": 0.7956311279751848,
"learning_rate": 8.699881308333794e-06,
"loss": 0.1801,
"step": 1510
},
{
"epoch": 0.23499222395023328,
"grad_norm": 2.3005427634248017,
"learning_rate": 8.698237679511476e-06,
"loss": 0.2116,
"step": 1511
},
{
"epoch": 0.23514774494556764,
"grad_norm": 1.1297158899245439,
"learning_rate": 8.696593167868252e-06,
"loss": 0.2319,
"step": 1512
},
{
"epoch": 0.23530326594090203,
"grad_norm": 0.960775125545338,
"learning_rate": 8.694947773796685e-06,
"loss": 0.1543,
"step": 1513
},
{
"epoch": 0.2354587869362364,
"grad_norm": 1.213893040863673,
"learning_rate": 8.69330149768956e-06,
"loss": 0.2041,
"step": 1514
},
{
"epoch": 0.23561430793157076,
"grad_norm": 0.8074468351762752,
"learning_rate": 8.69165433993986e-06,
"loss": 0.1965,
"step": 1515
},
{
"epoch": 0.23576982892690512,
"grad_norm": 1.1267774919804718,
"learning_rate": 8.690006300940789e-06,
"loss": 0.1823,
"step": 1516
},
{
"epoch": 0.2359253499222395,
"grad_norm": 1.4711843699980223,
"learning_rate": 8.688357381085753e-06,
"loss": 0.1753,
"step": 1517
},
{
"epoch": 0.23608087091757388,
"grad_norm": 1.0215570051060534,
"learning_rate": 8.686707580768376e-06,
"loss": 0.214,
"step": 1518
},
{
"epoch": 0.23623639191290824,
"grad_norm": 1.4485746749390973,
"learning_rate": 8.685056900382486e-06,
"loss": 0.1742,
"step": 1519
},
{
"epoch": 0.2363919129082426,
"grad_norm": 1.6525523323599767,
"learning_rate": 8.683405340322123e-06,
"loss": 0.3261,
"step": 1520
},
{
"epoch": 0.236547433903577,
"grad_norm": 1.411135121552525,
"learning_rate": 8.681752900981539e-06,
"loss": 0.1753,
"step": 1521
},
{
"epoch": 0.23670295489891136,
"grad_norm": 1.4707330597490842,
"learning_rate": 8.680099582755196e-06,
"loss": 0.1668,
"step": 1522
},
{
"epoch": 0.23685847589424572,
"grad_norm": 1.0942391175538886,
"learning_rate": 8.678445386037759e-06,
"loss": 0.1601,
"step": 1523
},
{
"epoch": 0.23701399688958008,
"grad_norm": 1.470588177448403,
"learning_rate": 8.67679031122411e-06,
"loss": 0.246,
"step": 1524
},
{
"epoch": 0.23716951788491447,
"grad_norm": 0.9581346042453303,
"learning_rate": 8.675134358709341e-06,
"loss": 0.1574,
"step": 1525
},
{
"epoch": 0.23732503888024883,
"grad_norm": 1.4763786660245666,
"learning_rate": 8.67347752888875e-06,
"loss": 0.1907,
"step": 1526
},
{
"epoch": 0.2374805598755832,
"grad_norm": 1.0363167034974192,
"learning_rate": 8.671819822157842e-06,
"loss": 0.1531,
"step": 1527
},
{
"epoch": 0.2376360808709176,
"grad_norm": 1.1924345869848432,
"learning_rate": 8.670161238912338e-06,
"loss": 0.1347,
"step": 1528
},
{
"epoch": 0.23779160186625195,
"grad_norm": 1.3358065512422586,
"learning_rate": 8.668501779548165e-06,
"loss": 0.1827,
"step": 1529
},
{
"epoch": 0.2379471228615863,
"grad_norm": 2.021234266844145,
"learning_rate": 8.666841444461456e-06,
"loss": 0.1368,
"step": 1530
},
{
"epoch": 0.23810264385692068,
"grad_norm": 1.4808660901110622,
"learning_rate": 8.665180234048561e-06,
"loss": 0.2527,
"step": 1531
},
{
"epoch": 0.23825816485225507,
"grad_norm": 1.065494309629267,
"learning_rate": 8.66351814870603e-06,
"loss": 0.1645,
"step": 1532
},
{
"epoch": 0.23841368584758943,
"grad_norm": 1.154174016882306,
"learning_rate": 8.661855188830626e-06,
"loss": 0.2328,
"step": 1533
},
{
"epoch": 0.2385692068429238,
"grad_norm": 1.1447203609781391,
"learning_rate": 8.660191354819324e-06,
"loss": 0.1794,
"step": 1534
},
{
"epoch": 0.23872472783825816,
"grad_norm": 0.9991428522588004,
"learning_rate": 8.658526647069303e-06,
"loss": 0.1233,
"step": 1535
},
{
"epoch": 0.23888024883359255,
"grad_norm": 0.7670014014044277,
"learning_rate": 8.65686106597795e-06,
"loss": 0.1834,
"step": 1536
},
{
"epoch": 0.2390357698289269,
"grad_norm": 1.5945089662017708,
"learning_rate": 8.655194611942863e-06,
"loss": 0.1921,
"step": 1537
},
{
"epoch": 0.23919129082426127,
"grad_norm": 1.2997434550841578,
"learning_rate": 8.65352728536185e-06,
"loss": 0.1873,
"step": 1538
},
{
"epoch": 0.23934681181959563,
"grad_norm": 0.7625665208100638,
"learning_rate": 8.651859086632924e-06,
"loss": 0.1049,
"step": 1539
},
{
"epoch": 0.23950233281493002,
"grad_norm": 2.315830524891549,
"learning_rate": 8.650190016154307e-06,
"loss": 0.2199,
"step": 1540
},
{
"epoch": 0.2396578538102644,
"grad_norm": 0.893513036921711,
"learning_rate": 8.648520074324429e-06,
"loss": 0.1486,
"step": 1541
},
{
"epoch": 0.23981337480559875,
"grad_norm": 1.0954057776977126,
"learning_rate": 8.64684926154193e-06,
"loss": 0.143,
"step": 1542
},
{
"epoch": 0.2399688958009331,
"grad_norm": 1.1636396222045602,
"learning_rate": 8.645177578205654e-06,
"loss": 0.1386,
"step": 1543
},
{
"epoch": 0.2401244167962675,
"grad_norm": 1.6636278556595083,
"learning_rate": 8.643505024714656e-06,
"loss": 0.2057,
"step": 1544
},
{
"epoch": 0.24027993779160187,
"grad_norm": 0.847583750776468,
"learning_rate": 8.641831601468198e-06,
"loss": 0.1272,
"step": 1545
},
{
"epoch": 0.24043545878693623,
"grad_norm": 1.1676164916999088,
"learning_rate": 8.640157308865751e-06,
"loss": 0.2057,
"step": 1546
},
{
"epoch": 0.24059097978227062,
"grad_norm": 1.1944835161358125,
"learning_rate": 8.63848214730699e-06,
"loss": 0.2237,
"step": 1547
},
{
"epoch": 0.24074650077760498,
"grad_norm": 1.3051952058816747,
"learning_rate": 8.6368061171918e-06,
"loss": 0.1398,
"step": 1548
},
{
"epoch": 0.24090202177293935,
"grad_norm": 1.2433159998532273,
"learning_rate": 8.635129218920272e-06,
"loss": 0.1514,
"step": 1549
},
{
"epoch": 0.2410575427682737,
"grad_norm": 1.6469350149721569,
"learning_rate": 8.633451452892707e-06,
"loss": 0.2141,
"step": 1550
},
{
"epoch": 0.2412130637636081,
"grad_norm": 1.0473985194623197,
"learning_rate": 8.631772819509609e-06,
"loss": 0.1629,
"step": 1551
},
{
"epoch": 0.24136858475894246,
"grad_norm": 1.081030634052537,
"learning_rate": 8.630093319171692e-06,
"loss": 0.1647,
"step": 1552
},
{
"epoch": 0.24152410575427682,
"grad_norm": 1.0002048515938975,
"learning_rate": 8.628412952279879e-06,
"loss": 0.1636,
"step": 1553
},
{
"epoch": 0.2416796267496112,
"grad_norm": 1.2635804994332953,
"learning_rate": 8.62673171923529e-06,
"loss": 0.1922,
"step": 1554
},
{
"epoch": 0.24183514774494558,
"grad_norm": 1.0841589283406547,
"learning_rate": 8.625049620439266e-06,
"loss": 0.1796,
"step": 1555
},
{
"epoch": 0.24199066874027994,
"grad_norm": 1.2588626615586416,
"learning_rate": 8.623366656293345e-06,
"loss": 0.2045,
"step": 1556
},
{
"epoch": 0.2421461897356143,
"grad_norm": 1.114070429674418,
"learning_rate": 8.621682827199271e-06,
"loss": 0.2155,
"step": 1557
},
{
"epoch": 0.24230171073094867,
"grad_norm": 1.122877032526039,
"learning_rate": 8.619998133559001e-06,
"loss": 0.1647,
"step": 1558
},
{
"epoch": 0.24245723172628306,
"grad_norm": 2.039494379737774,
"learning_rate": 8.618312575774696e-06,
"loss": 0.2327,
"step": 1559
},
{
"epoch": 0.24261275272161742,
"grad_norm": 1.1450723191422727,
"learning_rate": 8.616626154248717e-06,
"loss": 0.1879,
"step": 1560
},
{
"epoch": 0.24276827371695178,
"grad_norm": 1.1035439479736404,
"learning_rate": 8.614938869383643e-06,
"loss": 0.1987,
"step": 1561
},
{
"epoch": 0.24292379471228615,
"grad_norm": 9.183796995970361,
"learning_rate": 8.613250721582244e-06,
"loss": 0.1657,
"step": 1562
},
{
"epoch": 0.24307931570762054,
"grad_norm": 10.346790090579951,
"learning_rate": 8.611561711247512e-06,
"loss": 0.1277,
"step": 1563
},
{
"epoch": 0.2432348367029549,
"grad_norm": 1.0950378522648088,
"learning_rate": 8.609871838782636e-06,
"loss": 0.1792,
"step": 1564
},
{
"epoch": 0.24339035769828926,
"grad_norm": 1.2442899837619454,
"learning_rate": 8.608181104591008e-06,
"loss": 0.2481,
"step": 1565
},
{
"epoch": 0.24354587869362365,
"grad_norm": 0.9579587283389649,
"learning_rate": 8.606489509076232e-06,
"loss": 0.1464,
"step": 1566
},
{
"epoch": 0.24370139968895801,
"grad_norm": 1.3434609920952423,
"learning_rate": 8.604797052642118e-06,
"loss": 0.167,
"step": 1567
},
{
"epoch": 0.24385692068429238,
"grad_norm": 1.3932778191886934,
"learning_rate": 8.603103735692678e-06,
"loss": 0.222,
"step": 1568
},
{
"epoch": 0.24401244167962674,
"grad_norm": 1.2606515150004263,
"learning_rate": 8.601409558632125e-06,
"loss": 0.1734,
"step": 1569
},
{
"epoch": 0.24416796267496113,
"grad_norm": 0.7524170445152542,
"learning_rate": 8.59971452186489e-06,
"loss": 0.1377,
"step": 1570
},
{
"epoch": 0.2443234836702955,
"grad_norm": 1.8039225543958133,
"learning_rate": 8.5980186257956e-06,
"loss": 0.1645,
"step": 1571
},
{
"epoch": 0.24447900466562986,
"grad_norm": 1.2660119379119157,
"learning_rate": 8.596321870829084e-06,
"loss": 0.1297,
"step": 1572
},
{
"epoch": 0.24463452566096422,
"grad_norm": 0.9837487875887194,
"learning_rate": 8.594624257370388e-06,
"loss": 0.2292,
"step": 1573
},
{
"epoch": 0.2447900466562986,
"grad_norm": 1.4946436207685003,
"learning_rate": 8.592925785824753e-06,
"loss": 0.171,
"step": 1574
},
{
"epoch": 0.24494556765163297,
"grad_norm": 1.0654266730537136,
"learning_rate": 8.591226456597626e-06,
"loss": 0.1375,
"step": 1575
},
{
"epoch": 0.24510108864696734,
"grad_norm": 0.971876018180366,
"learning_rate": 8.589526270094664e-06,
"loss": 0.1924,
"step": 1576
},
{
"epoch": 0.2452566096423017,
"grad_norm": 1.0087644300116139,
"learning_rate": 8.587825226721722e-06,
"loss": 0.1687,
"step": 1577
},
{
"epoch": 0.2454121306376361,
"grad_norm": 1.1652659496533695,
"learning_rate": 8.586123326884865e-06,
"loss": 0.186,
"step": 1578
},
{
"epoch": 0.24556765163297045,
"grad_norm": 1.4775732365533967,
"learning_rate": 8.584420570990361e-06,
"loss": 0.1889,
"step": 1579
},
{
"epoch": 0.24572317262830481,
"grad_norm": 1.0459439420285532,
"learning_rate": 8.582716959444679e-06,
"loss": 0.1928,
"step": 1580
},
{
"epoch": 0.2458786936236392,
"grad_norm": 1.5372117734449058,
"learning_rate": 8.581012492654495e-06,
"loss": 0.1877,
"step": 1581
},
{
"epoch": 0.24603421461897357,
"grad_norm": 1.9347395817267816,
"learning_rate": 8.579307171026693e-06,
"loss": 0.2777,
"step": 1582
},
{
"epoch": 0.24618973561430793,
"grad_norm": 0.9029125279631515,
"learning_rate": 8.577600994968352e-06,
"loss": 0.1297,
"step": 1583
},
{
"epoch": 0.2463452566096423,
"grad_norm": 0.8355029037365392,
"learning_rate": 8.575893964886763e-06,
"loss": 0.2099,
"step": 1584
},
{
"epoch": 0.24650077760497668,
"grad_norm": 1.6899413873191795,
"learning_rate": 8.574186081189416e-06,
"loss": 0.2022,
"step": 1585
},
{
"epoch": 0.24665629860031105,
"grad_norm": 1.087509710593699,
"learning_rate": 8.572477344284009e-06,
"loss": 0.1751,
"step": 1586
},
{
"epoch": 0.2468118195956454,
"grad_norm": 1.0292806428751466,
"learning_rate": 8.570767754578438e-06,
"loss": 0.1593,
"step": 1587
},
{
"epoch": 0.24696734059097977,
"grad_norm": 1.188609591991913,
"learning_rate": 8.56905731248081e-06,
"loss": 0.1491,
"step": 1588
},
{
"epoch": 0.24712286158631416,
"grad_norm": 1.2300883239133906,
"learning_rate": 8.567346018399427e-06,
"loss": 0.165,
"step": 1589
},
{
"epoch": 0.24727838258164853,
"grad_norm": 1.2064414577216789,
"learning_rate": 8.565633872742803e-06,
"loss": 0.2524,
"step": 1590
},
{
"epoch": 0.2474339035769829,
"grad_norm": 0.8406003864640567,
"learning_rate": 8.56392087591965e-06,
"loss": 0.1658,
"step": 1591
},
{
"epoch": 0.24758942457231725,
"grad_norm": 2.634699334807654,
"learning_rate": 8.56220702833888e-06,
"loss": 0.1692,
"step": 1592
},
{
"epoch": 0.24774494556765164,
"grad_norm": 0.9815581638651881,
"learning_rate": 8.560492330409618e-06,
"loss": 0.1678,
"step": 1593
},
{
"epoch": 0.247900466562986,
"grad_norm": 1.3909573488426212,
"learning_rate": 8.558776782541183e-06,
"loss": 0.2397,
"step": 1594
},
{
"epoch": 0.24805598755832037,
"grad_norm": 1.2613818557792364,
"learning_rate": 8.557060385143102e-06,
"loss": 0.2273,
"step": 1595
},
{
"epoch": 0.24821150855365473,
"grad_norm": 0.9777010646149178,
"learning_rate": 8.5553431386251e-06,
"loss": 0.1713,
"step": 1596
},
{
"epoch": 0.24836702954898912,
"grad_norm": 1.2012423072130696,
"learning_rate": 8.553625043397112e-06,
"loss": 0.2192,
"step": 1597
},
{
"epoch": 0.24852255054432348,
"grad_norm": 1.0747389022970961,
"learning_rate": 8.551906099869269e-06,
"loss": 0.1555,
"step": 1598
},
{
"epoch": 0.24867807153965785,
"grad_norm": 0.9987345212261577,
"learning_rate": 8.550186308451906e-06,
"loss": 0.2117,
"step": 1599
},
{
"epoch": 0.24883359253499224,
"grad_norm": 1.1743809541983374,
"learning_rate": 8.548465669555564e-06,
"loss": 0.1547,
"step": 1600
},
{
"epoch": 0.24883359253499224,
"eval_loss": 0.2037108987569809,
"eval_runtime": 9.4238,
"eval_samples_per_second": 2.759,
"eval_steps_per_second": 0.743,
"step": 1600
},
{
"epoch": 0.2489891135303266,
"grad_norm": 1.0755504197866683,
"learning_rate": 8.546744183590979e-06,
"loss": 0.1448,
"step": 1601
},
{
"epoch": 0.24914463452566096,
"grad_norm": 1.293645268455303,
"learning_rate": 8.545021850969097e-06,
"loss": 0.2045,
"step": 1602
},
{
"epoch": 0.24930015552099533,
"grad_norm": 1.644496498579518,
"learning_rate": 8.543298672101063e-06,
"loss": 0.1745,
"step": 1603
},
{
"epoch": 0.24945567651632972,
"grad_norm": 1.8853737644375217,
"learning_rate": 8.541574647398224e-06,
"loss": 0.1785,
"step": 1604
},
{
"epoch": 0.24961119751166408,
"grad_norm": 0.8348472318309339,
"learning_rate": 8.539849777272125e-06,
"loss": 0.1976,
"step": 1605
},
{
"epoch": 0.24976671850699844,
"grad_norm": 1.6007239985640846,
"learning_rate": 8.538124062134521e-06,
"loss": 0.1766,
"step": 1606
},
{
"epoch": 0.2499222395023328,
"grad_norm": 2.1944156006209194,
"learning_rate": 8.53639750239736e-06,
"loss": 0.2715,
"step": 1607
},
{
"epoch": 0.25007776049766717,
"grad_norm": 1.105749977206952,
"learning_rate": 8.534670098472802e-06,
"loss": 0.1564,
"step": 1608
},
{
"epoch": 0.25023328149300156,
"grad_norm": 0.8083237797522677,
"learning_rate": 8.532941850773195e-06,
"loss": 0.1668,
"step": 1609
},
{
"epoch": 0.25038880248833595,
"grad_norm": 1.172486307255137,
"learning_rate": 8.531212759711103e-06,
"loss": 0.2302,
"step": 1610
},
{
"epoch": 0.2505443234836703,
"grad_norm": 1.268322758173216,
"learning_rate": 8.52948282569928e-06,
"loss": 0.1789,
"step": 1611
},
{
"epoch": 0.2506998444790047,
"grad_norm": 0.9091823227567202,
"learning_rate": 8.527752049150685e-06,
"loss": 0.0784,
"step": 1612
},
{
"epoch": 0.250855365474339,
"grad_norm": 1.3902158634610304,
"learning_rate": 8.52602043047848e-06,
"loss": 0.1681,
"step": 1613
},
{
"epoch": 0.2510108864696734,
"grad_norm": 1.4942303280111533,
"learning_rate": 8.524287970096026e-06,
"loss": 0.217,
"step": 1614
},
{
"epoch": 0.2511664074650078,
"grad_norm": 0.8627158934582907,
"learning_rate": 8.522554668416887e-06,
"loss": 0.2181,
"step": 1615
},
{
"epoch": 0.2513219284603421,
"grad_norm": 1.0390290867530942,
"learning_rate": 8.520820525854824e-06,
"loss": 0.1764,
"step": 1616
},
{
"epoch": 0.2514774494556765,
"grad_norm": 1.4108685539031005,
"learning_rate": 8.519085542823802e-06,
"loss": 0.2164,
"step": 1617
},
{
"epoch": 0.2516329704510109,
"grad_norm": 1.371077345528009,
"learning_rate": 8.517349719737984e-06,
"loss": 0.1561,
"step": 1618
},
{
"epoch": 0.25178849144634524,
"grad_norm": 1.2763042021188964,
"learning_rate": 8.51561305701174e-06,
"loss": 0.1526,
"step": 1619
},
{
"epoch": 0.25194401244167963,
"grad_norm": 1.077695325158449,
"learning_rate": 8.51387555505963e-06,
"loss": 0.1876,
"step": 1620
},
{
"epoch": 0.252099533437014,
"grad_norm": 1.3164226998591637,
"learning_rate": 8.512137214296422e-06,
"loss": 0.2131,
"step": 1621
},
{
"epoch": 0.25225505443234836,
"grad_norm": 1.7522341912294899,
"learning_rate": 8.510398035137083e-06,
"loss": 0.133,
"step": 1622
},
{
"epoch": 0.25241057542768275,
"grad_norm": 4.615604310333582,
"learning_rate": 8.50865801799678e-06,
"loss": 0.1955,
"step": 1623
},
{
"epoch": 0.2525660964230171,
"grad_norm": 2.3506074867763536,
"learning_rate": 8.506917163290877e-06,
"loss": 0.3199,
"step": 1624
},
{
"epoch": 0.2527216174183515,
"grad_norm": 0.7483739763165084,
"learning_rate": 8.505175471434943e-06,
"loss": 0.2213,
"step": 1625
},
{
"epoch": 0.25287713841368586,
"grad_norm": 2.0095572169442333,
"learning_rate": 8.50343294284474e-06,
"loss": 0.2356,
"step": 1626
},
{
"epoch": 0.2530326594090202,
"grad_norm": 0.9367298995041891,
"learning_rate": 8.501689577936238e-06,
"loss": 0.1567,
"step": 1627
},
{
"epoch": 0.2531881804043546,
"grad_norm": 1.2746896918156698,
"learning_rate": 8.499945377125602e-06,
"loss": 0.1465,
"step": 1628
},
{
"epoch": 0.253343701399689,
"grad_norm": 0.7971645300115215,
"learning_rate": 8.498200340829195e-06,
"loss": 0.1419,
"step": 1629
},
{
"epoch": 0.2534992223950233,
"grad_norm": 1.7131432725110083,
"learning_rate": 8.496454469463583e-06,
"loss": 0.1437,
"step": 1630
},
{
"epoch": 0.2536547433903577,
"grad_norm": 1.3945635968284718,
"learning_rate": 8.494707763445526e-06,
"loss": 0.2116,
"step": 1631
},
{
"epoch": 0.25381026438569204,
"grad_norm": 1.130700720901677,
"learning_rate": 8.492960223191994e-06,
"loss": 0.1783,
"step": 1632
},
{
"epoch": 0.25396578538102643,
"grad_norm": 0.9910207975897489,
"learning_rate": 8.491211849120146e-06,
"loss": 0.1275,
"step": 1633
},
{
"epoch": 0.2541213063763608,
"grad_norm": 1.6819299813099522,
"learning_rate": 8.48946264164734e-06,
"loss": 0.2092,
"step": 1634
},
{
"epoch": 0.25427682737169516,
"grad_norm": 0.8070165110990363,
"learning_rate": 8.487712601191143e-06,
"loss": 0.2104,
"step": 1635
},
{
"epoch": 0.25443234836702955,
"grad_norm": 0.7832453865024183,
"learning_rate": 8.485961728169308e-06,
"loss": 0.1491,
"step": 1636
},
{
"epoch": 0.25458786936236394,
"grad_norm": 1.570863259158348,
"learning_rate": 8.484210022999795e-06,
"loss": 0.1337,
"step": 1637
},
{
"epoch": 0.2547433903576983,
"grad_norm": 2.094162070797788,
"learning_rate": 8.482457486100761e-06,
"loss": 0.1732,
"step": 1638
},
{
"epoch": 0.25489891135303266,
"grad_norm": 1.3293274255208316,
"learning_rate": 8.48070411789056e-06,
"loss": 0.1587,
"step": 1639
},
{
"epoch": 0.25505443234836706,
"grad_norm": 0.9704592907631973,
"learning_rate": 8.478949918787746e-06,
"loss": 0.167,
"step": 1640
},
{
"epoch": 0.2552099533437014,
"grad_norm": 2.2927511192581935,
"learning_rate": 8.47719488921107e-06,
"loss": 0.1731,
"step": 1641
},
{
"epoch": 0.2553654743390358,
"grad_norm": 1.2113969832398468,
"learning_rate": 8.475439029579487e-06,
"loss": 0.1636,
"step": 1642
},
{
"epoch": 0.2555209953343701,
"grad_norm": 1.2700840486141427,
"learning_rate": 8.473682340312136e-06,
"loss": 0.2251,
"step": 1643
},
{
"epoch": 0.2556765163297045,
"grad_norm": 0.8692629936958125,
"learning_rate": 8.47192482182837e-06,
"loss": 0.1944,
"step": 1644
},
{
"epoch": 0.2558320373250389,
"grad_norm": 1.0546031026829716,
"learning_rate": 8.470166474547731e-06,
"loss": 0.1963,
"step": 1645
},
{
"epoch": 0.25598755832037323,
"grad_norm": 1.8035421603246344,
"learning_rate": 8.468407298889962e-06,
"loss": 0.1678,
"step": 1646
},
{
"epoch": 0.2561430793157076,
"grad_norm": 0.8593243264529278,
"learning_rate": 8.466647295275002e-06,
"loss": 0.1272,
"step": 1647
},
{
"epoch": 0.256298600311042,
"grad_norm": 1.5174530612382813,
"learning_rate": 8.464886464122988e-06,
"loss": 0.2685,
"step": 1648
},
{
"epoch": 0.25645412130637635,
"grad_norm": 1.5250972376290421,
"learning_rate": 8.463124805854257e-06,
"loss": 0.1674,
"step": 1649
},
{
"epoch": 0.25660964230171074,
"grad_norm": 1.1663575092987046,
"learning_rate": 8.461362320889338e-06,
"loss": 0.1577,
"step": 1650
},
{
"epoch": 0.2567651632970451,
"grad_norm": 1.474673013106268,
"learning_rate": 8.459599009648964e-06,
"loss": 0.1769,
"step": 1651
},
{
"epoch": 0.25692068429237946,
"grad_norm": 1.1672631965692757,
"learning_rate": 8.45783487255406e-06,
"loss": 0.2249,
"step": 1652
},
{
"epoch": 0.25707620528771385,
"grad_norm": 1.1953181883355133,
"learning_rate": 8.456069910025751e-06,
"loss": 0.2018,
"step": 1653
},
{
"epoch": 0.2572317262830482,
"grad_norm": 1.1089828464331577,
"learning_rate": 8.454304122485358e-06,
"loss": 0.1419,
"step": 1654
},
{
"epoch": 0.2573872472783826,
"grad_norm": 1.2716710060074294,
"learning_rate": 8.452537510354397e-06,
"loss": 0.1966,
"step": 1655
},
{
"epoch": 0.25754276827371697,
"grad_norm": 1.952579937166782,
"learning_rate": 8.450770074054586e-06,
"loss": 0.2699,
"step": 1656
},
{
"epoch": 0.2576982892690513,
"grad_norm": 0.7319931402583304,
"learning_rate": 8.449001814007838e-06,
"loss": 0.1401,
"step": 1657
},
{
"epoch": 0.2578538102643857,
"grad_norm": 1.627013708512288,
"learning_rate": 8.447232730636257e-06,
"loss": 0.2617,
"step": 1658
},
{
"epoch": 0.2580093312597201,
"grad_norm": 1.0492953509552387,
"learning_rate": 8.44546282436215e-06,
"loss": 0.1922,
"step": 1659
},
{
"epoch": 0.2581648522550544,
"grad_norm": 0.9166534435780459,
"learning_rate": 8.443692095608019e-06,
"loss": 0.2099,
"step": 1660
},
{
"epoch": 0.2583203732503888,
"grad_norm": 1.1458120209760718,
"learning_rate": 8.441920544796558e-06,
"loss": 0.1724,
"step": 1661
},
{
"epoch": 0.25847589424572315,
"grad_norm": 1.071395804244241,
"learning_rate": 8.440148172350666e-06,
"loss": 0.1728,
"step": 1662
},
{
"epoch": 0.25863141524105754,
"grad_norm": 1.2413704662622753,
"learning_rate": 8.43837497869343e-06,
"loss": 0.2031,
"step": 1663
},
{
"epoch": 0.25878693623639193,
"grad_norm": 1.1068242296182698,
"learning_rate": 8.436600964248138e-06,
"loss": 0.1951,
"step": 1664
},
{
"epoch": 0.25894245723172626,
"grad_norm": 0.8699381605693407,
"learning_rate": 8.43482612943827e-06,
"loss": 0.1764,
"step": 1665
},
{
"epoch": 0.25909797822706065,
"grad_norm": 1.2048052321069596,
"learning_rate": 8.433050474687505e-06,
"loss": 0.2311,
"step": 1666
},
{
"epoch": 0.25925349922239505,
"grad_norm": 1.315498269766704,
"learning_rate": 8.431274000419716e-06,
"loss": 0.2412,
"step": 1667
},
{
"epoch": 0.2594090202177294,
"grad_norm": 0.6128855898398873,
"learning_rate": 8.42949670705897e-06,
"loss": 0.1068,
"step": 1668
},
{
"epoch": 0.25956454121306377,
"grad_norm": 0.9552988172621262,
"learning_rate": 8.427718595029537e-06,
"loss": 0.1458,
"step": 1669
},
{
"epoch": 0.25972006220839816,
"grad_norm": 1.411892967173632,
"learning_rate": 8.425939664755874e-06,
"loss": 0.2327,
"step": 1670
},
{
"epoch": 0.2598755832037325,
"grad_norm": 1.066036249369497,
"learning_rate": 8.424159916662636e-06,
"loss": 0.1845,
"step": 1671
},
{
"epoch": 0.2600311041990669,
"grad_norm": 1.0078601069914832,
"learning_rate": 8.422379351174673e-06,
"loss": 0.129,
"step": 1672
},
{
"epoch": 0.2601866251944012,
"grad_norm": 0.9627418389301211,
"learning_rate": 8.420597968717033e-06,
"loss": 0.2346,
"step": 1673
},
{
"epoch": 0.2603421461897356,
"grad_norm": 1.0190302705099263,
"learning_rate": 8.418815769714956e-06,
"loss": 0.1291,
"step": 1674
},
{
"epoch": 0.26049766718507,
"grad_norm": 0.8536213147159897,
"learning_rate": 8.417032754593879e-06,
"loss": 0.1759,
"step": 1675
},
{
"epoch": 0.26065318818040434,
"grad_norm": 0.9477728405361937,
"learning_rate": 8.415248923779431e-06,
"loss": 0.1708,
"step": 1676
},
{
"epoch": 0.26080870917573873,
"grad_norm": 1.0305276755799404,
"learning_rate": 8.413464277697436e-06,
"loss": 0.3205,
"step": 1677
},
{
"epoch": 0.2609642301710731,
"grad_norm": 1.324545893865915,
"learning_rate": 8.411678816773916e-06,
"loss": 0.2936,
"step": 1678
},
{
"epoch": 0.26111975116640745,
"grad_norm": 1.3383489149505705,
"learning_rate": 8.409892541435085e-06,
"loss": 0.2406,
"step": 1679
},
{
"epoch": 0.26127527216174184,
"grad_norm": 0.9651270377598534,
"learning_rate": 8.408105452107353e-06,
"loss": 0.1511,
"step": 1680
},
{
"epoch": 0.2614307931570762,
"grad_norm": 0.6783781205233194,
"learning_rate": 8.40631754921732e-06,
"loss": 0.1567,
"step": 1681
},
{
"epoch": 0.26158631415241057,
"grad_norm": 1.198981860000486,
"learning_rate": 8.404528833191786e-06,
"loss": 0.2125,
"step": 1682
},
{
"epoch": 0.26174183514774496,
"grad_norm": 0.7449630196962097,
"learning_rate": 8.402739304457743e-06,
"loss": 0.179,
"step": 1683
},
{
"epoch": 0.2618973561430793,
"grad_norm": 1.3499907032342544,
"learning_rate": 8.400948963442373e-06,
"loss": 0.1492,
"step": 1684
},
{
"epoch": 0.2620528771384137,
"grad_norm": 1.2324653573954145,
"learning_rate": 8.39915781057306e-06,
"loss": 0.1442,
"step": 1685
},
{
"epoch": 0.2622083981337481,
"grad_norm": 1.5240761421711815,
"learning_rate": 8.397365846277371e-06,
"loss": 0.3141,
"step": 1686
},
{
"epoch": 0.2623639191290824,
"grad_norm": 0.9242701212113029,
"learning_rate": 8.39557307098308e-06,
"loss": 0.175,
"step": 1687
},
{
"epoch": 0.2625194401244168,
"grad_norm": 1.0215723172112428,
"learning_rate": 8.393779485118142e-06,
"loss": 0.1572,
"step": 1688
},
{
"epoch": 0.2626749611197512,
"grad_norm": 1.4272441271545482,
"learning_rate": 8.391985089110715e-06,
"loss": 0.2086,
"step": 1689
},
{
"epoch": 0.26283048211508553,
"grad_norm": 0.98493015131112,
"learning_rate": 8.390189883389143e-06,
"loss": 0.1758,
"step": 1690
},
{
"epoch": 0.2629860031104199,
"grad_norm": 1.412962012368002,
"learning_rate": 8.388393868381967e-06,
"loss": 0.137,
"step": 1691
},
{
"epoch": 0.26314152410575425,
"grad_norm": 0.8439849086089997,
"learning_rate": 8.386597044517923e-06,
"loss": 0.1794,
"step": 1692
},
{
"epoch": 0.26329704510108864,
"grad_norm": 0.9027272166442722,
"learning_rate": 8.384799412225936e-06,
"loss": 0.1827,
"step": 1693
},
{
"epoch": 0.26345256609642304,
"grad_norm": 1.0861962602589315,
"learning_rate": 8.383000971935129e-06,
"loss": 0.1736,
"step": 1694
},
{
"epoch": 0.26360808709175737,
"grad_norm": 1.4467531133479765,
"learning_rate": 8.38120172407481e-06,
"loss": 0.2872,
"step": 1695
},
{
"epoch": 0.26376360808709176,
"grad_norm": 0.7243899321635017,
"learning_rate": 8.379401669074489e-06,
"loss": 0.1568,
"step": 1696
},
{
"epoch": 0.26391912908242615,
"grad_norm": 0.8947544881090379,
"learning_rate": 8.37760080736386e-06,
"loss": 0.1516,
"step": 1697
},
{
"epoch": 0.2640746500777605,
"grad_norm": 1.1759725115418023,
"learning_rate": 8.375799139372818e-06,
"loss": 0.1384,
"step": 1698
},
{
"epoch": 0.2642301710730949,
"grad_norm": 0.8519187195565056,
"learning_rate": 8.373996665531443e-06,
"loss": 0.2027,
"step": 1699
},
{
"epoch": 0.2643856920684292,
"grad_norm": 1.4756118825078526,
"learning_rate": 8.37219338627001e-06,
"loss": 0.2323,
"step": 1700
},
{
"epoch": 0.2643856920684292,
"eval_loss": 0.19943906366825104,
"eval_runtime": 9.4244,
"eval_samples_per_second": 2.759,
"eval_steps_per_second": 0.743,
"step": 1700
},
{
"epoch": 0.2645412130637636,
"grad_norm": 1.1415194682343677,
"learning_rate": 8.370389302018993e-06,
"loss": 0.1627,
"step": 1701
},
{
"epoch": 0.264696734059098,
"grad_norm": 0.9887030475180681,
"learning_rate": 8.368584413209044e-06,
"loss": 0.1913,
"step": 1702
},
{
"epoch": 0.26485225505443233,
"grad_norm": 1.579433234849522,
"learning_rate": 8.366778720271022e-06,
"loss": 0.2494,
"step": 1703
},
{
"epoch": 0.2650077760497667,
"grad_norm": 1.1581416599961576,
"learning_rate": 8.364972223635967e-06,
"loss": 0.1984,
"step": 1704
},
{
"epoch": 0.2651632970451011,
"grad_norm": 1.4481396852315895,
"learning_rate": 8.363164923735116e-06,
"loss": 0.1772,
"step": 1705
},
{
"epoch": 0.26531881804043544,
"grad_norm": 2.2248131911902918,
"learning_rate": 8.361356820999897e-06,
"loss": 0.2035,
"step": 1706
},
{
"epoch": 0.26547433903576984,
"grad_norm": 1.296906679431483,
"learning_rate": 8.359547915861927e-06,
"loss": 0.1906,
"step": 1707
},
{
"epoch": 0.2656298600311042,
"grad_norm": 1.4510599043288837,
"learning_rate": 8.357738208753022e-06,
"loss": 0.215,
"step": 1708
},
{
"epoch": 0.26578538102643856,
"grad_norm": 1.3812180344156422,
"learning_rate": 8.35592770010518e-06,
"loss": 0.2366,
"step": 1709
},
{
"epoch": 0.26594090202177295,
"grad_norm": 0.7624028953564842,
"learning_rate": 8.354116390350594e-06,
"loss": 0.1337,
"step": 1710
},
{
"epoch": 0.2660964230171073,
"grad_norm": 1.0938571817018024,
"learning_rate": 8.352304279921655e-06,
"loss": 0.1739,
"step": 1711
},
{
"epoch": 0.2662519440124417,
"grad_norm": 1.3112579396126312,
"learning_rate": 8.350491369250933e-06,
"loss": 0.2866,
"step": 1712
},
{
"epoch": 0.26640746500777607,
"grad_norm": 1.4175431035953647,
"learning_rate": 8.348677658771197e-06,
"loss": 0.1308,
"step": 1713
},
{
"epoch": 0.2665629860031104,
"grad_norm": 2.1014926949253327,
"learning_rate": 8.346863148915402e-06,
"loss": 0.1549,
"step": 1714
},
{
"epoch": 0.2667185069984448,
"grad_norm": 1.132911689146343,
"learning_rate": 8.345047840116704e-06,
"loss": 0.2182,
"step": 1715
},
{
"epoch": 0.2668740279937792,
"grad_norm": 0.6535130581015213,
"learning_rate": 8.343231732808435e-06,
"loss": 0.1748,
"step": 1716
},
{
"epoch": 0.2670295489891135,
"grad_norm": 0.9808104365320156,
"learning_rate": 8.34141482742413e-06,
"loss": 0.1512,
"step": 1717
},
{
"epoch": 0.2671850699844479,
"grad_norm": 1.2630125658621263,
"learning_rate": 8.339597124397509e-06,
"loss": 0.1698,
"step": 1718
},
{
"epoch": 0.26734059097978224,
"grad_norm": 1.279259047820582,
"learning_rate": 8.33777862416248e-06,
"loss": 0.1769,
"step": 1719
},
{
"epoch": 0.26749611197511663,
"grad_norm": 1.1242790219258612,
"learning_rate": 8.335959327153148e-06,
"loss": 0.2224,
"step": 1720
},
{
"epoch": 0.267651632970451,
"grad_norm": 1.0035835372337707,
"learning_rate": 8.334139233803801e-06,
"loss": 0.1697,
"step": 1721
},
{
"epoch": 0.26780715396578536,
"grad_norm": 1.9776796243145607,
"learning_rate": 8.332318344548926e-06,
"loss": 0.2033,
"step": 1722
},
{
"epoch": 0.26796267496111975,
"grad_norm": 1.1521258085682824,
"learning_rate": 8.330496659823189e-06,
"loss": 0.1729,
"step": 1723
},
{
"epoch": 0.26811819595645414,
"grad_norm": 1.0253842887133877,
"learning_rate": 8.328674180061453e-06,
"loss": 0.2185,
"step": 1724
},
{
"epoch": 0.2682737169517885,
"grad_norm": 0.871091469827773,
"learning_rate": 8.326850905698774e-06,
"loss": 0.1359,
"step": 1725
},
{
"epoch": 0.26842923794712287,
"grad_norm": 1.7009594103702224,
"learning_rate": 8.325026837170386e-06,
"loss": 0.2348,
"step": 1726
},
{
"epoch": 0.26858475894245726,
"grad_norm": 1.367926551681483,
"learning_rate": 8.323201974911723e-06,
"loss": 0.1842,
"step": 1727
},
{
"epoch": 0.2687402799377916,
"grad_norm": 1.148927442910907,
"learning_rate": 8.321376319358407e-06,
"loss": 0.1096,
"step": 1728
},
{
"epoch": 0.268895800933126,
"grad_norm": 1.3075658909675654,
"learning_rate": 8.319549870946244e-06,
"loss": 0.1543,
"step": 1729
},
{
"epoch": 0.2690513219284603,
"grad_norm": 0.8291270774545968,
"learning_rate": 8.317722630111233e-06,
"loss": 0.1093,
"step": 1730
},
{
"epoch": 0.2692068429237947,
"grad_norm": 2.2622896049282706,
"learning_rate": 8.315894597289565e-06,
"loss": 0.2042,
"step": 1731
},
{
"epoch": 0.2693623639191291,
"grad_norm": 0.7046996138148661,
"learning_rate": 8.314065772917612e-06,
"loss": 0.1303,
"step": 1732
},
{
"epoch": 0.26951788491446343,
"grad_norm": 0.9333196367153322,
"learning_rate": 8.312236157431946e-06,
"loss": 0.169,
"step": 1733
},
{
"epoch": 0.2696734059097978,
"grad_norm": 1.1869718333049797,
"learning_rate": 8.310405751269318e-06,
"loss": 0.2494,
"step": 1734
},
{
"epoch": 0.2698289269051322,
"grad_norm": 0.9186255111712875,
"learning_rate": 8.30857455486667e-06,
"loss": 0.1449,
"step": 1735
},
{
"epoch": 0.26998444790046655,
"grad_norm": 1.7158457711756847,
"learning_rate": 8.306742568661137e-06,
"loss": 0.2472,
"step": 1736
},
{
"epoch": 0.27013996889580094,
"grad_norm": 0.9091734067747751,
"learning_rate": 8.304909793090039e-06,
"loss": 0.1517,
"step": 1737
},
{
"epoch": 0.2702954898911353,
"grad_norm": 0.9472038650945157,
"learning_rate": 8.303076228590885e-06,
"loss": 0.1293,
"step": 1738
},
{
"epoch": 0.27045101088646967,
"grad_norm": 1.359961162735269,
"learning_rate": 8.301241875601371e-06,
"loss": 0.1687,
"step": 1739
},
{
"epoch": 0.27060653188180406,
"grad_norm": 1.3706412614563859,
"learning_rate": 8.299406734559385e-06,
"loss": 0.1151,
"step": 1740
},
{
"epoch": 0.2707620528771384,
"grad_norm": 1.4633698039358347,
"learning_rate": 8.297570805903e-06,
"loss": 0.1834,
"step": 1741
},
{
"epoch": 0.2709175738724728,
"grad_norm": 1.2706325476878815,
"learning_rate": 8.295734090070477e-06,
"loss": 0.1889,
"step": 1742
},
{
"epoch": 0.2710730948678072,
"grad_norm": 1.40063937560449,
"learning_rate": 8.293896587500266e-06,
"loss": 0.1644,
"step": 1743
},
{
"epoch": 0.2712286158631415,
"grad_norm": 1.756399176307069,
"learning_rate": 8.292058298631003e-06,
"loss": 0.2121,
"step": 1744
},
{
"epoch": 0.2713841368584759,
"grad_norm": 1.3118943702099763,
"learning_rate": 8.290219223901517e-06,
"loss": 0.1657,
"step": 1745
},
{
"epoch": 0.2715396578538103,
"grad_norm": 1.221070247479925,
"learning_rate": 8.288379363750818e-06,
"loss": 0.1799,
"step": 1746
},
{
"epoch": 0.2716951788491446,
"grad_norm": 1.30049039400021,
"learning_rate": 8.286538718618107e-06,
"loss": 0.1659,
"step": 1747
},
{
"epoch": 0.271850699844479,
"grad_norm": 0.8218052779463395,
"learning_rate": 8.28469728894277e-06,
"loss": 0.1417,
"step": 1748
},
{
"epoch": 0.27200622083981335,
"grad_norm": 1.318881683721639,
"learning_rate": 8.282855075164386e-06,
"loss": 0.2086,
"step": 1749
},
{
"epoch": 0.27216174183514774,
"grad_norm": 1.168225071909074,
"learning_rate": 8.281012077722712e-06,
"loss": 0.1481,
"step": 1750
},
{
"epoch": 0.27231726283048213,
"grad_norm": 1.387527553498744,
"learning_rate": 8.2791682970577e-06,
"loss": 0.224,
"step": 1751
},
{
"epoch": 0.27247278382581647,
"grad_norm": 0.9455523699522945,
"learning_rate": 8.277323733609488e-06,
"loss": 0.1689,
"step": 1752
},
{
"epoch": 0.27262830482115086,
"grad_norm": 1.301993231412919,
"learning_rate": 8.275478387818394e-06,
"loss": 0.17,
"step": 1753
},
{
"epoch": 0.27278382581648525,
"grad_norm": 1.1753804485169133,
"learning_rate": 8.273632260124934e-06,
"loss": 0.2231,
"step": 1754
},
{
"epoch": 0.2729393468118196,
"grad_norm": 1.080698611275427,
"learning_rate": 8.271785350969799e-06,
"loss": 0.1796,
"step": 1755
},
{
"epoch": 0.273094867807154,
"grad_norm": 1.290015540604507,
"learning_rate": 8.269937660793875e-06,
"loss": 0.1941,
"step": 1756
},
{
"epoch": 0.2732503888024883,
"grad_norm": 1.070538218943679,
"learning_rate": 8.268089190038228e-06,
"loss": 0.1909,
"step": 1757
},
{
"epoch": 0.2734059097978227,
"grad_norm": 1.2252798699112468,
"learning_rate": 8.266239939144118e-06,
"loss": 0.1569,
"step": 1758
},
{
"epoch": 0.2735614307931571,
"grad_norm": 1.2346475130597931,
"learning_rate": 8.264389908552987e-06,
"loss": 0.1881,
"step": 1759
},
{
"epoch": 0.2737169517884914,
"grad_norm": 0.8909529676508143,
"learning_rate": 8.26253909870646e-06,
"loss": 0.1635,
"step": 1760
},
{
"epoch": 0.2738724727838258,
"grad_norm": 1.3801819199807877,
"learning_rate": 8.260687510046352e-06,
"loss": 0.1957,
"step": 1761
},
{
"epoch": 0.2740279937791602,
"grad_norm": 0.9098604615543268,
"learning_rate": 8.258835143014663e-06,
"loss": 0.1556,
"step": 1762
},
{
"epoch": 0.27418351477449454,
"grad_norm": 1.479953181946323,
"learning_rate": 8.25698199805358e-06,
"loss": 0.1673,
"step": 1763
},
{
"epoch": 0.27433903576982893,
"grad_norm": 1.0391961011580078,
"learning_rate": 8.255128075605475e-06,
"loss": 0.1678,
"step": 1764
},
{
"epoch": 0.2744945567651633,
"grad_norm": 1.1674213515628957,
"learning_rate": 8.253273376112902e-06,
"loss": 0.1575,
"step": 1765
},
{
"epoch": 0.27465007776049766,
"grad_norm": 0.776827674790433,
"learning_rate": 8.251417900018606e-06,
"loss": 0.2087,
"step": 1766
},
{
"epoch": 0.27480559875583205,
"grad_norm": 1.0737505366105782,
"learning_rate": 8.249561647765515e-06,
"loss": 0.202,
"step": 1767
},
{
"epoch": 0.2749611197511664,
"grad_norm": 1.0278179070478979,
"learning_rate": 8.247704619796743e-06,
"loss": 0.2246,
"step": 1768
},
{
"epoch": 0.2751166407465008,
"grad_norm": 1.3308057309065462,
"learning_rate": 8.245846816555588e-06,
"loss": 0.1781,
"step": 1769
},
{
"epoch": 0.27527216174183516,
"grad_norm": 1.171891225152092,
"learning_rate": 8.24398823848553e-06,
"loss": 0.2838,
"step": 1770
},
{
"epoch": 0.2754276827371695,
"grad_norm": 0.9162549134019579,
"learning_rate": 8.242128886030243e-06,
"loss": 0.153,
"step": 1771
},
{
"epoch": 0.2755832037325039,
"grad_norm": 1.7094368421056838,
"learning_rate": 8.240268759633576e-06,
"loss": 0.1769,
"step": 1772
},
{
"epoch": 0.2757387247278383,
"grad_norm": 1.088761334959302,
"learning_rate": 8.23840785973957e-06,
"loss": 0.1872,
"step": 1773
},
{
"epoch": 0.2758942457231726,
"grad_norm": 1.0467068106039534,
"learning_rate": 8.236546186792446e-06,
"loss": 0.1941,
"step": 1774
},
{
"epoch": 0.276049766718507,
"grad_norm": 1.469925204114295,
"learning_rate": 8.234683741236612e-06,
"loss": 0.2439,
"step": 1775
},
{
"epoch": 0.27620528771384134,
"grad_norm": 1.286843667284798,
"learning_rate": 8.23282052351666e-06,
"loss": 0.1825,
"step": 1776
},
{
"epoch": 0.27636080870917573,
"grad_norm": 1.5684518100667084,
"learning_rate": 8.230956534077366e-06,
"loss": 0.2088,
"step": 1777
},
{
"epoch": 0.2765163297045101,
"grad_norm": 1.3158757876867857,
"learning_rate": 8.22909177336369e-06,
"loss": 0.1965,
"step": 1778
},
{
"epoch": 0.27667185069984446,
"grad_norm": 0.7862541895693009,
"learning_rate": 8.227226241820779e-06,
"loss": 0.1388,
"step": 1779
},
{
"epoch": 0.27682737169517885,
"grad_norm": 0.9288123715441376,
"learning_rate": 8.225359939893954e-06,
"loss": 0.243,
"step": 1780
},
{
"epoch": 0.27698289269051324,
"grad_norm": 1.491008802108701,
"learning_rate": 8.223492868028736e-06,
"loss": 0.2521,
"step": 1781
},
{
"epoch": 0.2771384136858476,
"grad_norm": 1.1202886550853388,
"learning_rate": 8.221625026670814e-06,
"loss": 0.1688,
"step": 1782
},
{
"epoch": 0.27729393468118196,
"grad_norm": 1.1962734383960754,
"learning_rate": 8.219756416266073e-06,
"loss": 0.1294,
"step": 1783
},
{
"epoch": 0.27744945567651635,
"grad_norm": 0.6740427840476089,
"learning_rate": 8.217887037260575e-06,
"loss": 0.1501,
"step": 1784
},
{
"epoch": 0.2776049766718507,
"grad_norm": 1.8752578154372959,
"learning_rate": 8.216016890100564e-06,
"loss": 0.2524,
"step": 1785
},
{
"epoch": 0.2777604976671851,
"grad_norm": 1.3276982202120067,
"learning_rate": 8.214145975232474e-06,
"loss": 0.1611,
"step": 1786
},
{
"epoch": 0.2779160186625194,
"grad_norm": 0.9180331686214024,
"learning_rate": 8.212274293102917e-06,
"loss": 0.2069,
"step": 1787
},
{
"epoch": 0.2780715396578538,
"grad_norm": 1.1644000920434754,
"learning_rate": 8.210401844158688e-06,
"loss": 0.2113,
"step": 1788
},
{
"epoch": 0.2782270606531882,
"grad_norm": 1.6247680870264813,
"learning_rate": 8.20852862884677e-06,
"loss": 0.2167,
"step": 1789
},
{
"epoch": 0.27838258164852253,
"grad_norm": 2.465352962757943,
"learning_rate": 8.206654647614323e-06,
"loss": 0.2917,
"step": 1790
},
{
"epoch": 0.2785381026438569,
"grad_norm": 0.9826147561106185,
"learning_rate": 8.204779900908694e-06,
"loss": 0.1513,
"step": 1791
},
{
"epoch": 0.2786936236391913,
"grad_norm": 1.1924827625995933,
"learning_rate": 8.202904389177409e-06,
"loss": 0.2069,
"step": 1792
},
{
"epoch": 0.27884914463452565,
"grad_norm": 1.2507233550051102,
"learning_rate": 8.201028112868182e-06,
"loss": 0.1713,
"step": 1793
},
{
"epoch": 0.27900466562986004,
"grad_norm": 1.056564405898492,
"learning_rate": 8.199151072428903e-06,
"loss": 0.152,
"step": 1794
},
{
"epoch": 0.27916018662519443,
"grad_norm": 1.0582767182694146,
"learning_rate": 8.19727326830765e-06,
"loss": 0.1313,
"step": 1795
},
{
"epoch": 0.27931570762052876,
"grad_norm": 0.9960646193169612,
"learning_rate": 8.195394700952681e-06,
"loss": 0.1663,
"step": 1796
},
{
"epoch": 0.27947122861586315,
"grad_norm": 0.8580536351756373,
"learning_rate": 8.193515370812433e-06,
"loss": 0.1595,
"step": 1797
},
{
"epoch": 0.2796267496111975,
"grad_norm": 1.0831765474333348,
"learning_rate": 8.191635278335533e-06,
"loss": 0.1646,
"step": 1798
},
{
"epoch": 0.2797822706065319,
"grad_norm": 1.0292790758688968,
"learning_rate": 8.189754423970783e-06,
"loss": 0.1294,
"step": 1799
},
{
"epoch": 0.27993779160186627,
"grad_norm": 0.6900206697382273,
"learning_rate": 8.18787280816717e-06,
"loss": 0.1962,
"step": 1800
},
{
"epoch": 0.27993779160186627,
"eval_loss": 0.1942623406648636,
"eval_runtime": 9.4402,
"eval_samples_per_second": 2.754,
"eval_steps_per_second": 0.742,
"step": 1800
},
{
"epoch": 0.2800933125972006,
"grad_norm": 1.225359903420926,
"learning_rate": 8.18599043137386e-06,
"loss": 0.1613,
"step": 1801
},
{
"epoch": 0.280248833592535,
"grad_norm": 1.6844618005986576,
"learning_rate": 8.184107294040204e-06,
"loss": 0.2253,
"step": 1802
},
{
"epoch": 0.2804043545878694,
"grad_norm": 1.0175001190789204,
"learning_rate": 8.182223396615733e-06,
"loss": 0.1912,
"step": 1803
},
{
"epoch": 0.2805598755832037,
"grad_norm": 1.050408004024866,
"learning_rate": 8.18033873955016e-06,
"loss": 0.2061,
"step": 1804
},
{
"epoch": 0.2807153965785381,
"grad_norm": 1.4763046668239692,
"learning_rate": 8.178453323293378e-06,
"loss": 0.2781,
"step": 1805
},
{
"epoch": 0.28087091757387245,
"grad_norm": 0.8219546561822222,
"learning_rate": 8.176567148295462e-06,
"loss": 0.2129,
"step": 1806
},
{
"epoch": 0.28102643856920684,
"grad_norm": 0.9534941567105831,
"learning_rate": 8.174680215006671e-06,
"loss": 0.1653,
"step": 1807
},
{
"epoch": 0.28118195956454123,
"grad_norm": 1.0531235123680651,
"learning_rate": 8.172792523877439e-06,
"loss": 0.1384,
"step": 1808
},
{
"epoch": 0.28133748055987556,
"grad_norm": 1.3227244850484494,
"learning_rate": 8.170904075358386e-06,
"loss": 0.1878,
"step": 1809
},
{
"epoch": 0.28149300155520995,
"grad_norm": 0.8199812475506189,
"learning_rate": 8.169014869900308e-06,
"loss": 0.1583,
"step": 1810
},
{
"epoch": 0.28164852255054434,
"grad_norm": 1.1873233496157647,
"learning_rate": 8.167124907954188e-06,
"loss": 0.1689,
"step": 1811
},
{
"epoch": 0.2818040435458787,
"grad_norm": 1.332689389458692,
"learning_rate": 8.165234189971188e-06,
"loss": 0.1509,
"step": 1812
},
{
"epoch": 0.28195956454121307,
"grad_norm": 1.4288659016319332,
"learning_rate": 8.163342716402645e-06,
"loss": 0.1862,
"step": 1813
},
{
"epoch": 0.28211508553654746,
"grad_norm": 1.314918590717926,
"learning_rate": 8.16145048770008e-06,
"loss": 0.226,
"step": 1814
},
{
"epoch": 0.2822706065318818,
"grad_norm": 0.9155638179955898,
"learning_rate": 8.159557504315197e-06,
"loss": 0.1929,
"step": 1815
},
{
"epoch": 0.2824261275272162,
"grad_norm": 1.0431139463881003,
"learning_rate": 8.157663766699875e-06,
"loss": 0.1443,
"step": 1816
},
{
"epoch": 0.2825816485225505,
"grad_norm": 1.3294250069242237,
"learning_rate": 8.155769275306178e-06,
"loss": 0.193,
"step": 1817
},
{
"epoch": 0.2827371695178849,
"grad_norm": 0.9943106694297035,
"learning_rate": 8.153874030586343e-06,
"loss": 0.1421,
"step": 1818
},
{
"epoch": 0.2828926905132193,
"grad_norm": 1.165982265832558,
"learning_rate": 8.151978032992798e-06,
"loss": 0.1739,
"step": 1819
},
{
"epoch": 0.28304821150855364,
"grad_norm": 0.7428727580266941,
"learning_rate": 8.150081282978139e-06,
"loss": 0.1572,
"step": 1820
},
{
"epoch": 0.28320373250388803,
"grad_norm": 1.3026564844558632,
"learning_rate": 8.14818378099515e-06,
"loss": 0.1805,
"step": 1821
},
{
"epoch": 0.2833592534992224,
"grad_norm": 1.2645554368075294,
"learning_rate": 8.146285527496789e-06,
"loss": 0.1798,
"step": 1822
},
{
"epoch": 0.28351477449455675,
"grad_norm": 1.456481044065325,
"learning_rate": 8.144386522936195e-06,
"loss": 0.1598,
"step": 1823
},
{
"epoch": 0.28367029548989114,
"grad_norm": 1.1300906728090474,
"learning_rate": 8.142486767766688e-06,
"loss": 0.1648,
"step": 1824
},
{
"epoch": 0.2838258164852255,
"grad_norm": 1.1388001178297193,
"learning_rate": 8.140586262441767e-06,
"loss": 0.2733,
"step": 1825
},
{
"epoch": 0.28398133748055987,
"grad_norm": 0.7532300919484063,
"learning_rate": 8.138685007415109e-06,
"loss": 0.1213,
"step": 1826
},
{
"epoch": 0.28413685847589426,
"grad_norm": 1.0796067807349936,
"learning_rate": 8.136783003140568e-06,
"loss": 0.2189,
"step": 1827
},
{
"epoch": 0.2842923794712286,
"grad_norm": 1.331438012696905,
"learning_rate": 8.134880250072179e-06,
"loss": 0.1804,
"step": 1828
},
{
"epoch": 0.284447900466563,
"grad_norm": 1.2091930191659346,
"learning_rate": 8.13297674866416e-06,
"loss": 0.2194,
"step": 1829
},
{
"epoch": 0.2846034214618974,
"grad_norm": 1.0049073814467957,
"learning_rate": 8.131072499370897e-06,
"loss": 0.1333,
"step": 1830
},
{
"epoch": 0.2847589424572317,
"grad_norm": 1.0223717163539678,
"learning_rate": 8.129167502646966e-06,
"loss": 0.1988,
"step": 1831
},
{
"epoch": 0.2849144634525661,
"grad_norm": 1.4867747212307119,
"learning_rate": 8.127261758947114e-06,
"loss": 0.1467,
"step": 1832
},
{
"epoch": 0.2850699844479005,
"grad_norm": 0.8173079980321136,
"learning_rate": 8.125355268726266e-06,
"loss": 0.1058,
"step": 1833
},
{
"epoch": 0.28522550544323483,
"grad_norm": 1.570586484505542,
"learning_rate": 8.123448032439534e-06,
"loss": 0.2065,
"step": 1834
},
{
"epoch": 0.2853810264385692,
"grad_norm": 1.5595299992669573,
"learning_rate": 8.121540050542198e-06,
"loss": 0.2193,
"step": 1835
},
{
"epoch": 0.28553654743390355,
"grad_norm": 1.007755342730857,
"learning_rate": 8.119631323489722e-06,
"loss": 0.1371,
"step": 1836
},
{
"epoch": 0.28569206842923794,
"grad_norm": 1.301433540358406,
"learning_rate": 8.117721851737744e-06,
"loss": 0.176,
"step": 1837
},
{
"epoch": 0.28584758942457233,
"grad_norm": 0.8910831403501445,
"learning_rate": 8.115811635742079e-06,
"loss": 0.1626,
"step": 1838
},
{
"epoch": 0.28600311041990667,
"grad_norm": 0.7467945070581918,
"learning_rate": 8.113900675958728e-06,
"loss": 0.1821,
"step": 1839
},
{
"epoch": 0.28615863141524106,
"grad_norm": 1.1530448700016815,
"learning_rate": 8.111988972843859e-06,
"loss": 0.1923,
"step": 1840
},
{
"epoch": 0.28631415241057545,
"grad_norm": 2.088923036862537,
"learning_rate": 8.110076526853824e-06,
"loss": 0.1206,
"step": 1841
},
{
"epoch": 0.2864696734059098,
"grad_norm": 1.2835755029352423,
"learning_rate": 8.108163338445152e-06,
"loss": 0.2546,
"step": 1842
},
{
"epoch": 0.2866251944012442,
"grad_norm": 0.8829913186503389,
"learning_rate": 8.106249408074544e-06,
"loss": 0.1445,
"step": 1843
},
{
"epoch": 0.2867807153965785,
"grad_norm": 1.7629923458811358,
"learning_rate": 8.104334736198887e-06,
"loss": 0.1544,
"step": 1844
},
{
"epoch": 0.2869362363919129,
"grad_norm": 0.9657174681831697,
"learning_rate": 8.102419323275234e-06,
"loss": 0.2351,
"step": 1845
},
{
"epoch": 0.2870917573872473,
"grad_norm": 1.492589192357126,
"learning_rate": 8.100503169760827e-06,
"loss": 0.186,
"step": 1846
},
{
"epoch": 0.28724727838258163,
"grad_norm": 1.1233971084394738,
"learning_rate": 8.098586276113073e-06,
"loss": 0.1946,
"step": 1847
},
{
"epoch": 0.287402799377916,
"grad_norm": 0.8577653456028049,
"learning_rate": 8.096668642789565e-06,
"loss": 0.1633,
"step": 1848
},
{
"epoch": 0.2875583203732504,
"grad_norm": 1.1536545920544707,
"learning_rate": 8.094750270248065e-06,
"loss": 0.1603,
"step": 1849
},
{
"epoch": 0.28771384136858474,
"grad_norm": 0.9838814306399297,
"learning_rate": 8.09283115894652e-06,
"loss": 0.1623,
"step": 1850
},
{
"epoch": 0.28786936236391913,
"grad_norm": 0.9601384951644616,
"learning_rate": 8.090911309343045e-06,
"loss": 0.1252,
"step": 1851
},
{
"epoch": 0.2880248833592535,
"grad_norm": 0.9976176427201153,
"learning_rate": 8.088990721895938e-06,
"loss": 0.1815,
"step": 1852
},
{
"epoch": 0.28818040435458786,
"grad_norm": 0.7583399424827217,
"learning_rate": 8.087069397063666e-06,
"loss": 0.141,
"step": 1853
},
{
"epoch": 0.28833592534992225,
"grad_norm": 1.5185081715928586,
"learning_rate": 8.085147335304879e-06,
"loss": 0.1887,
"step": 1854
},
{
"epoch": 0.2884914463452566,
"grad_norm": 1.3061310770751247,
"learning_rate": 8.083224537078401e-06,
"loss": 0.1451,
"step": 1855
},
{
"epoch": 0.288646967340591,
"grad_norm": 1.7351614485797129,
"learning_rate": 8.081301002843226e-06,
"loss": 0.1264,
"step": 1856
},
{
"epoch": 0.28880248833592537,
"grad_norm": 0.9943204442132273,
"learning_rate": 8.079376733058532e-06,
"loss": 0.1743,
"step": 1857
},
{
"epoch": 0.2889580093312597,
"grad_norm": 1.1594102001358773,
"learning_rate": 8.07745172818367e-06,
"loss": 0.1607,
"step": 1858
},
{
"epoch": 0.2891135303265941,
"grad_norm": 1.6350163238448654,
"learning_rate": 8.075525988678163e-06,
"loss": 0.1813,
"step": 1859
},
{
"epoch": 0.2892690513219285,
"grad_norm": 1.083878957563236,
"learning_rate": 8.073599515001713e-06,
"loss": 0.1194,
"step": 1860
},
{
"epoch": 0.2894245723172628,
"grad_norm": 0.8178116527073355,
"learning_rate": 8.071672307614195e-06,
"loss": 0.228,
"step": 1861
},
{
"epoch": 0.2895800933125972,
"grad_norm": 1.1118324651261078,
"learning_rate": 8.069744366975664e-06,
"loss": 0.197,
"step": 1862
},
{
"epoch": 0.28973561430793154,
"grad_norm": 1.149751561349185,
"learning_rate": 8.06781569354634e-06,
"loss": 0.269,
"step": 1863
},
{
"epoch": 0.28989113530326593,
"grad_norm": 1.1618468357632399,
"learning_rate": 8.06588628778663e-06,
"loss": 0.1846,
"step": 1864
},
{
"epoch": 0.2900466562986003,
"grad_norm": 1.3277875865938236,
"learning_rate": 8.063956150157107e-06,
"loss": 0.1273,
"step": 1865
},
{
"epoch": 0.29020217729393466,
"grad_norm": 2.436613602568436,
"learning_rate": 8.062025281118524e-06,
"loss": 0.2442,
"step": 1866
},
{
"epoch": 0.29035769828926905,
"grad_norm": 1.1958552059286012,
"learning_rate": 8.060093681131804e-06,
"loss": 0.1874,
"step": 1867
},
{
"epoch": 0.29051321928460344,
"grad_norm": 0.8401669076143116,
"learning_rate": 8.058161350658047e-06,
"loss": 0.1901,
"step": 1868
},
{
"epoch": 0.2906687402799378,
"grad_norm": 0.9487811357677395,
"learning_rate": 8.056228290158528e-06,
"loss": 0.1346,
"step": 1869
},
{
"epoch": 0.29082426127527217,
"grad_norm": 0.957901732333534,
"learning_rate": 8.054294500094697e-06,
"loss": 0.1411,
"step": 1870
},
{
"epoch": 0.29097978227060656,
"grad_norm": 1.292131532239805,
"learning_rate": 8.052359980928172e-06,
"loss": 0.1827,
"step": 1871
},
{
"epoch": 0.2911353032659409,
"grad_norm": 0.9675272253773427,
"learning_rate": 8.050424733120757e-06,
"loss": 0.1738,
"step": 1872
},
{
"epoch": 0.2912908242612753,
"grad_norm": 1.367184606419033,
"learning_rate": 8.048488757134416e-06,
"loss": 0.1787,
"step": 1873
},
{
"epoch": 0.2914463452566096,
"grad_norm": 1.2673549853684765,
"learning_rate": 8.046552053431298e-06,
"loss": 0.2333,
"step": 1874
},
{
"epoch": 0.291601866251944,
"grad_norm": 1.9351495105907597,
"learning_rate": 8.044614622473717e-06,
"loss": 0.1987,
"step": 1875
},
{
"epoch": 0.2917573872472784,
"grad_norm": 0.8606527150680897,
"learning_rate": 8.042676464724169e-06,
"loss": 0.172,
"step": 1876
},
{
"epoch": 0.29191290824261273,
"grad_norm": 1.4901933699318817,
"learning_rate": 8.040737580645316e-06,
"loss": 0.1735,
"step": 1877
},
{
"epoch": 0.2920684292379471,
"grad_norm": 1.1691647071434712,
"learning_rate": 8.038797970699998e-06,
"loss": 0.2316,
"step": 1878
},
{
"epoch": 0.2922239502332815,
"grad_norm": 1.240770117738676,
"learning_rate": 8.036857635351226e-06,
"loss": 0.1667,
"step": 1879
},
{
"epoch": 0.29237947122861585,
"grad_norm": 1.0351622011955766,
"learning_rate": 8.034916575062188e-06,
"loss": 0.1405,
"step": 1880
},
{
"epoch": 0.29253499222395024,
"grad_norm": 0.9153491401389935,
"learning_rate": 8.032974790296239e-06,
"loss": 0.1726,
"step": 1881
},
{
"epoch": 0.2926905132192846,
"grad_norm": 1.4362209764019978,
"learning_rate": 8.031032281516913e-06,
"loss": 0.1827,
"step": 1882
},
{
"epoch": 0.29284603421461897,
"grad_norm": 1.183605507206791,
"learning_rate": 8.029089049187909e-06,
"loss": 0.1883,
"step": 1883
},
{
"epoch": 0.29300155520995336,
"grad_norm": 1.0539400115284923,
"learning_rate": 8.02714509377311e-06,
"loss": 0.1208,
"step": 1884
},
{
"epoch": 0.2931570762052877,
"grad_norm": 1.0217425114195149,
"learning_rate": 8.02520041573656e-06,
"loss": 0.174,
"step": 1885
},
{
"epoch": 0.2933125972006221,
"grad_norm": 1.0405110359742253,
"learning_rate": 8.023255015542482e-06,
"loss": 0.249,
"step": 1886
},
{
"epoch": 0.2934681181959565,
"grad_norm": 0.9949747841829932,
"learning_rate": 8.021308893655273e-06,
"loss": 0.1861,
"step": 1887
},
{
"epoch": 0.2936236391912908,
"grad_norm": 0.9631918396707634,
"learning_rate": 8.019362050539497e-06,
"loss": 0.22,
"step": 1888
},
{
"epoch": 0.2937791601866252,
"grad_norm": 1.471400212660711,
"learning_rate": 8.017414486659894e-06,
"loss": 0.2831,
"step": 1889
},
{
"epoch": 0.2939346811819596,
"grad_norm": 1.6502542476240603,
"learning_rate": 8.015466202481371e-06,
"loss": 0.1856,
"step": 1890
},
{
"epoch": 0.2940902021772939,
"grad_norm": 1.0678255046461738,
"learning_rate": 8.013517198469017e-06,
"loss": 0.2714,
"step": 1891
},
{
"epoch": 0.2942457231726283,
"grad_norm": 1.5419672646129527,
"learning_rate": 8.01156747508808e-06,
"loss": 0.2432,
"step": 1892
},
{
"epoch": 0.29440124416796265,
"grad_norm": 1.691620262630438,
"learning_rate": 8.009617032803989e-06,
"loss": 0.2494,
"step": 1893
},
{
"epoch": 0.29455676516329704,
"grad_norm": 1.0149866152436102,
"learning_rate": 8.007665872082343e-06,
"loss": 0.1446,
"step": 1894
},
{
"epoch": 0.29471228615863143,
"grad_norm": 1.2593397067130077,
"learning_rate": 8.005713993388908e-06,
"loss": 0.1813,
"step": 1895
},
{
"epoch": 0.29486780715396577,
"grad_norm": 1.751259190433369,
"learning_rate": 8.003761397189629e-06,
"loss": 0.3067,
"step": 1896
},
{
"epoch": 0.29502332814930016,
"grad_norm": 1.0592944557403567,
"learning_rate": 8.001808083950615e-06,
"loss": 0.1774,
"step": 1897
},
{
"epoch": 0.29517884914463455,
"grad_norm": 0.7601316574689209,
"learning_rate": 7.999854054138148e-06,
"loss": 0.1986,
"step": 1898
},
{
"epoch": 0.2953343701399689,
"grad_norm": 1.0763633141744329,
"learning_rate": 7.997899308218687e-06,
"loss": 0.1693,
"step": 1899
},
{
"epoch": 0.2954898911353033,
"grad_norm": 0.848192935949934,
"learning_rate": 7.995943846658852e-06,
"loss": 0.1785,
"step": 1900
},
{
"epoch": 0.2954898911353033,
"eval_loss": 0.19579939544200897,
"eval_runtime": 9.4258,
"eval_samples_per_second": 2.758,
"eval_steps_per_second": 0.743,
"step": 1900
},
{
"epoch": 0.29564541213063766,
"grad_norm": 1.1366949640186217,
"learning_rate": 7.99398766992544e-06,
"loss": 0.3427,
"step": 1901
},
{
"epoch": 0.295800933125972,
"grad_norm": 1.3011369731626548,
"learning_rate": 7.99203077848542e-06,
"loss": 0.128,
"step": 1902
},
{
"epoch": 0.2959564541213064,
"grad_norm": 1.6239083693901217,
"learning_rate": 7.990073172805927e-06,
"loss": 0.2033,
"step": 1903
},
{
"epoch": 0.2961119751166407,
"grad_norm": 2.136757506768007,
"learning_rate": 7.98811485335427e-06,
"loss": 0.8244,
"step": 1904
},
{
"epoch": 0.2962674961119751,
"grad_norm": 1.4156103108687226,
"learning_rate": 7.986155820597927e-06,
"loss": 0.2266,
"step": 1905
},
{
"epoch": 0.2964230171073095,
"grad_norm": 1.3059948518525273,
"learning_rate": 7.984196075004547e-06,
"loss": 0.1772,
"step": 1906
},
{
"epoch": 0.29657853810264384,
"grad_norm": 1.1897397554067446,
"learning_rate": 7.982235617041947e-06,
"loss": 0.2153,
"step": 1907
},
{
"epoch": 0.29673405909797823,
"grad_norm": 1.8814984942898336,
"learning_rate": 7.980274447178116e-06,
"loss": 0.163,
"step": 1908
},
{
"epoch": 0.2968895800933126,
"grad_norm": 0.8490191091642275,
"learning_rate": 7.978312565881212e-06,
"loss": 0.1929,
"step": 1909
},
{
"epoch": 0.29704510108864696,
"grad_norm": 1.0730207253151238,
"learning_rate": 7.976349973619567e-06,
"loss": 0.152,
"step": 1910
},
{
"epoch": 0.29720062208398135,
"grad_norm": 1.0988494794101311,
"learning_rate": 7.974386670861676e-06,
"loss": 0.1796,
"step": 1911
},
{
"epoch": 0.2973561430793157,
"grad_norm": 0.8890702707468837,
"learning_rate": 7.972422658076206e-06,
"loss": 0.1658,
"step": 1912
},
{
"epoch": 0.2975116640746501,
"grad_norm": 1.5485447290305507,
"learning_rate": 7.970457935731996e-06,
"loss": 0.219,
"step": 1913
},
{
"epoch": 0.29766718506998446,
"grad_norm": 1.1870158533528972,
"learning_rate": 7.968492504298053e-06,
"loss": 0.1678,
"step": 1914
},
{
"epoch": 0.2978227060653188,
"grad_norm": 0.8791513734953905,
"learning_rate": 7.966526364243553e-06,
"loss": 0.1379,
"step": 1915
},
{
"epoch": 0.2979782270606532,
"grad_norm": 1.1547532699065137,
"learning_rate": 7.96455951603784e-06,
"loss": 0.1578,
"step": 1916
},
{
"epoch": 0.2981337480559876,
"grad_norm": 1.2343036137247707,
"learning_rate": 7.962591960150426e-06,
"loss": 0.167,
"step": 1917
},
{
"epoch": 0.2982892690513219,
"grad_norm": 1.199679900214714,
"learning_rate": 7.960623697051e-06,
"loss": 0.2216,
"step": 1918
},
{
"epoch": 0.2984447900466563,
"grad_norm": 0.8701547919093023,
"learning_rate": 7.958654727209406e-06,
"loss": 0.1334,
"step": 1919
},
{
"epoch": 0.2986003110419907,
"grad_norm": 1.0186941275746395,
"learning_rate": 7.956685051095672e-06,
"loss": 0.1992,
"step": 1920
},
{
"epoch": 0.29875583203732503,
"grad_norm": 1.677907044209659,
"learning_rate": 7.954714669179981e-06,
"loss": 0.2557,
"step": 1921
},
{
"epoch": 0.2989113530326594,
"grad_norm": 1.0741276350489937,
"learning_rate": 7.952743581932696e-06,
"loss": 0.2228,
"step": 1922
},
{
"epoch": 0.29906687402799376,
"grad_norm": 1.1370483443720154,
"learning_rate": 7.950771789824341e-06,
"loss": 0.1822,
"step": 1923
},
{
"epoch": 0.29922239502332815,
"grad_norm": 1.4805485099895457,
"learning_rate": 7.948799293325607e-06,
"loss": 0.2066,
"step": 1924
},
{
"epoch": 0.29937791601866254,
"grad_norm": 1.0841471732459598,
"learning_rate": 7.946826092907362e-06,
"loss": 0.2086,
"step": 1925
},
{
"epoch": 0.2995334370139969,
"grad_norm": 0.9923801848699839,
"learning_rate": 7.944852189040633e-06,
"loss": 0.1457,
"step": 1926
},
{
"epoch": 0.29968895800933126,
"grad_norm": 1.1826489754247185,
"learning_rate": 7.942877582196618e-06,
"loss": 0.1335,
"step": 1927
},
{
"epoch": 0.29984447900466565,
"grad_norm": 1.0374422374980892,
"learning_rate": 7.940902272846684e-06,
"loss": 0.1747,
"step": 1928
},
{
"epoch": 0.3,
"grad_norm": 0.9355242713051211,
"learning_rate": 7.938926261462366e-06,
"loss": 0.2035,
"step": 1929
},
{
"epoch": 0.3001555209953344,
"grad_norm": 1.3452996964657524,
"learning_rate": 7.936949548515364e-06,
"loss": 0.2284,
"step": 1930
},
{
"epoch": 0.3003110419906687,
"grad_norm": 0.7948433007606517,
"learning_rate": 7.93497213447755e-06,
"loss": 0.2051,
"step": 1931
},
{
"epoch": 0.3004665629860031,
"grad_norm": 1.130699049423352,
"learning_rate": 7.932994019820956e-06,
"loss": 0.174,
"step": 1932
},
{
"epoch": 0.3006220839813375,
"grad_norm": 4.331642107991714,
"learning_rate": 7.931015205017788e-06,
"loss": 0.2259,
"step": 1933
},
{
"epoch": 0.30077760497667183,
"grad_norm": 1.5306684316210843,
"learning_rate": 7.929035690540414e-06,
"loss": 0.1917,
"step": 1934
},
{
"epoch": 0.3009331259720062,
"grad_norm": 0.8871970028065491,
"learning_rate": 7.927055476861376e-06,
"loss": 0.1765,
"step": 1935
},
{
"epoch": 0.3010886469673406,
"grad_norm": 0.9400711133682595,
"learning_rate": 7.925074564453376e-06,
"loss": 0.1824,
"step": 1936
},
{
"epoch": 0.30124416796267495,
"grad_norm": 0.9734009328190283,
"learning_rate": 7.923092953789287e-06,
"loss": 0.1575,
"step": 1937
},
{
"epoch": 0.30139968895800934,
"grad_norm": 1.1309704131602631,
"learning_rate": 7.921110645342144e-06,
"loss": 0.2438,
"step": 1938
},
{
"epoch": 0.30155520995334373,
"grad_norm": 1.2491112218817273,
"learning_rate": 7.919127639585153e-06,
"loss": 0.2252,
"step": 1939
},
{
"epoch": 0.30171073094867806,
"grad_norm": 0.9626959898568382,
"learning_rate": 7.917143936991688e-06,
"loss": 0.1416,
"step": 1940
},
{
"epoch": 0.30186625194401245,
"grad_norm": 0.933932349728071,
"learning_rate": 7.915159538035284e-06,
"loss": 0.1924,
"step": 1941
},
{
"epoch": 0.3020217729393468,
"grad_norm": 1.198922066826054,
"learning_rate": 7.913174443189645e-06,
"loss": 0.1918,
"step": 1942
},
{
"epoch": 0.3021772939346812,
"grad_norm": 0.711619672728743,
"learning_rate": 7.911188652928639e-06,
"loss": 0.1322,
"step": 1943
},
{
"epoch": 0.30233281493001557,
"grad_norm": 0.9224372120486194,
"learning_rate": 7.909202167726306e-06,
"loss": 0.1775,
"step": 1944
},
{
"epoch": 0.3024883359253499,
"grad_norm": 1.3276511094955517,
"learning_rate": 7.907214988056844e-06,
"loss": 0.2187,
"step": 1945
},
{
"epoch": 0.3026438569206843,
"grad_norm": 0.8655219464600901,
"learning_rate": 7.905227114394623e-06,
"loss": 0.1465,
"step": 1946
},
{
"epoch": 0.3027993779160187,
"grad_norm": 0.995295145761775,
"learning_rate": 7.903238547214173e-06,
"loss": 0.2004,
"step": 1947
},
{
"epoch": 0.302954898911353,
"grad_norm": 1.1948454763354273,
"learning_rate": 7.901249286990196e-06,
"loss": 0.1755,
"step": 1948
},
{
"epoch": 0.3031104199066874,
"grad_norm": 0.8682961110627464,
"learning_rate": 7.899259334197554e-06,
"loss": 0.1999,
"step": 1949
},
{
"epoch": 0.30326594090202175,
"grad_norm": 1.0906703142458485,
"learning_rate": 7.897268689311278e-06,
"loss": 0.1014,
"step": 1950
},
{
"epoch": 0.30342146189735614,
"grad_norm": 1.2664526681944839,
"learning_rate": 7.895277352806562e-06,
"loss": 0.2251,
"step": 1951
},
{
"epoch": 0.30357698289269053,
"grad_norm": 0.9627019771781781,
"learning_rate": 7.893285325158766e-06,
"loss": 0.1591,
"step": 1952
},
{
"epoch": 0.30373250388802486,
"grad_norm": 1.9216322578695895,
"learning_rate": 7.891292606843414e-06,
"loss": 0.2066,
"step": 1953
},
{
"epoch": 0.30388802488335925,
"grad_norm": 0.9086586156841527,
"learning_rate": 7.889299198336197e-06,
"loss": 0.2196,
"step": 1954
},
{
"epoch": 0.30404354587869364,
"grad_norm": 1.4203649142405548,
"learning_rate": 7.887305100112967e-06,
"loss": 0.1804,
"step": 1955
},
{
"epoch": 0.304199066874028,
"grad_norm": 1.2381428600296667,
"learning_rate": 7.885310312649747e-06,
"loss": 0.1434,
"step": 1956
},
{
"epoch": 0.30435458786936237,
"grad_norm": 0.7952770821447226,
"learning_rate": 7.883314836422717e-06,
"loss": 0.1955,
"step": 1957
},
{
"epoch": 0.30451010886469676,
"grad_norm": 0.998488522800322,
"learning_rate": 7.881318671908228e-06,
"loss": 0.2239,
"step": 1958
},
{
"epoch": 0.3046656298600311,
"grad_norm": 1.0829580403987296,
"learning_rate": 7.879321819582788e-06,
"loss": 0.2401,
"step": 1959
},
{
"epoch": 0.3048211508553655,
"grad_norm": 1.043363355464928,
"learning_rate": 7.877324279923078e-06,
"loss": 0.1821,
"step": 1960
},
{
"epoch": 0.3049766718506998,
"grad_norm": 1.7533270649933215,
"learning_rate": 7.875326053405936e-06,
"loss": 0.2513,
"step": 1961
},
{
"epoch": 0.3051321928460342,
"grad_norm": 1.3436274607263432,
"learning_rate": 7.873327140508367e-06,
"loss": 0.2352,
"step": 1962
},
{
"epoch": 0.3052877138413686,
"grad_norm": 2.0633364771352274,
"learning_rate": 7.87132754170754e-06,
"loss": 0.2125,
"step": 1963
},
{
"epoch": 0.30544323483670294,
"grad_norm": 0.9097966158633792,
"learning_rate": 7.869327257480787e-06,
"loss": 0.1627,
"step": 1964
},
{
"epoch": 0.3055987558320373,
"grad_norm": 1.8317277834761483,
"learning_rate": 7.867326288305603e-06,
"loss": 0.211,
"step": 1965
},
{
"epoch": 0.3057542768273717,
"grad_norm": 1.1448361049962872,
"learning_rate": 7.865324634659647e-06,
"loss": 0.1683,
"step": 1966
},
{
"epoch": 0.30590979782270605,
"grad_norm": 1.14865744697956,
"learning_rate": 7.863322297020743e-06,
"loss": 0.2238,
"step": 1967
},
{
"epoch": 0.30606531881804044,
"grad_norm": 1.0967845312311937,
"learning_rate": 7.861319275866877e-06,
"loss": 0.1889,
"step": 1968
},
{
"epoch": 0.3062208398133748,
"grad_norm": 1.2461473684464468,
"learning_rate": 7.859315571676198e-06,
"loss": 0.2138,
"step": 1969
},
{
"epoch": 0.30637636080870917,
"grad_norm": 1.1992165952645324,
"learning_rate": 7.857311184927015e-06,
"loss": 0.2289,
"step": 1970
},
{
"epoch": 0.30653188180404356,
"grad_norm": 0.9734656478980178,
"learning_rate": 7.855306116097807e-06,
"loss": 0.1798,
"step": 1971
},
{
"epoch": 0.3066874027993779,
"grad_norm": 0.8576094794110676,
"learning_rate": 7.853300365667211e-06,
"loss": 0.1849,
"step": 1972
},
{
"epoch": 0.3068429237947123,
"grad_norm": 0.9320489557446329,
"learning_rate": 7.851293934114026e-06,
"loss": 0.1663,
"step": 1973
},
{
"epoch": 0.3069984447900467,
"grad_norm": 1.5628965027384294,
"learning_rate": 7.849286821917217e-06,
"loss": 0.2741,
"step": 1974
},
{
"epoch": 0.307153965785381,
"grad_norm": 1.1064029390023975,
"learning_rate": 7.847279029555908e-06,
"loss": 0.1655,
"step": 1975
},
{
"epoch": 0.3073094867807154,
"grad_norm": 1.1272492512254035,
"learning_rate": 7.845270557509389e-06,
"loss": 0.1473,
"step": 1976
},
{
"epoch": 0.3074650077760498,
"grad_norm": 0.8321910160414181,
"learning_rate": 7.843261406257108e-06,
"loss": 0.1571,
"step": 1977
},
{
"epoch": 0.3076205287713841,
"grad_norm": 0.9606345664210296,
"learning_rate": 7.841251576278681e-06,
"loss": 0.227,
"step": 1978
},
{
"epoch": 0.3077760497667185,
"grad_norm": 1.0695344914586096,
"learning_rate": 7.839241068053878e-06,
"loss": 0.1616,
"step": 1979
},
{
"epoch": 0.30793157076205285,
"grad_norm": 2.415757365845339,
"learning_rate": 7.837229882062638e-06,
"loss": 0.2091,
"step": 1980
},
{
"epoch": 0.30808709175738724,
"grad_norm": 0.8945861519999385,
"learning_rate": 7.83521801878506e-06,
"loss": 0.1769,
"step": 1981
},
{
"epoch": 0.30824261275272163,
"grad_norm": 1.1779736396630833,
"learning_rate": 7.8332054787014e-06,
"loss": 0.2311,
"step": 1982
},
{
"epoch": 0.30839813374805597,
"grad_norm": 1.372493149836755,
"learning_rate": 7.831192262292082e-06,
"loss": 0.172,
"step": 1983
},
{
"epoch": 0.30855365474339036,
"grad_norm": 2.4487535069237407,
"learning_rate": 7.82917837003769e-06,
"loss": 0.1395,
"step": 1984
},
{
"epoch": 0.30870917573872475,
"grad_norm": 0.871516091971395,
"learning_rate": 7.827163802418967e-06,
"loss": 0.1437,
"step": 1985
},
{
"epoch": 0.3088646967340591,
"grad_norm": 1.2731269701284036,
"learning_rate": 7.825148559916817e-06,
"loss": 0.1857,
"step": 1986
},
{
"epoch": 0.3090202177293935,
"grad_norm": 0.9768725926218434,
"learning_rate": 7.823132643012308e-06,
"loss": 0.195,
"step": 1987
},
{
"epoch": 0.3091757387247278,
"grad_norm": 0.9369720572131188,
"learning_rate": 7.821116052186668e-06,
"loss": 0.2034,
"step": 1988
},
{
"epoch": 0.3093312597200622,
"grad_norm": 1.1315495162839369,
"learning_rate": 7.819098787921283e-06,
"loss": 0.1755,
"step": 1989
},
{
"epoch": 0.3094867807153966,
"grad_norm": 1.0958800584806985,
"learning_rate": 7.817080850697705e-06,
"loss": 0.2575,
"step": 1990
},
{
"epoch": 0.3096423017107309,
"grad_norm": 0.9336006846477578,
"learning_rate": 7.815062240997642e-06,
"loss": 0.1376,
"step": 1991
},
{
"epoch": 0.3097978227060653,
"grad_norm": 0.9121065280126879,
"learning_rate": 7.813042959302963e-06,
"loss": 0.1212,
"step": 1992
},
{
"epoch": 0.3099533437013997,
"grad_norm": 0.6936258475052076,
"learning_rate": 7.811023006095703e-06,
"loss": 0.13,
"step": 1993
},
{
"epoch": 0.31010886469673404,
"grad_norm": 1.278051470184625,
"learning_rate": 7.809002381858048e-06,
"loss": 0.1686,
"step": 1994
},
{
"epoch": 0.31026438569206843,
"grad_norm": 1.2807241898257353,
"learning_rate": 7.806981087072354e-06,
"loss": 0.2569,
"step": 1995
},
{
"epoch": 0.3104199066874028,
"grad_norm": 1.6449581085415006,
"learning_rate": 7.804959122221127e-06,
"loss": 0.3075,
"step": 1996
},
{
"epoch": 0.31057542768273716,
"grad_norm": 0.9051580549498448,
"learning_rate": 7.802936487787045e-06,
"loss": 0.1603,
"step": 1997
},
{
"epoch": 0.31073094867807155,
"grad_norm": 1.5451818475345835,
"learning_rate": 7.800913184252931e-06,
"loss": 0.2057,
"step": 1998
},
{
"epoch": 0.3108864696734059,
"grad_norm": 1.0935081143315897,
"learning_rate": 7.79888921210178e-06,
"loss": 0.2238,
"step": 1999
},
{
"epoch": 0.3110419906687403,
"grad_norm": 1.262020237993972,
"learning_rate": 7.796864571816745e-06,
"loss": 0.1977,
"step": 2000
},
{
"epoch": 0.3110419906687403,
"eval_loss": 0.19129334390163422,
"eval_runtime": 9.4405,
"eval_samples_per_second": 2.754,
"eval_steps_per_second": 0.741,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 6430,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 191987712000000.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}