random_DSzHdZtZEVFTCdpO / trainer_state.json
cutelemonlili's picture
Add files using upload-large-folder tool
4810440 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2144,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009328358208955224,
"grad_norm": 3.504828120771814,
"learning_rate": 9.999994632276776e-06,
"loss": 0.3261,
"step": 1
},
{
"epoch": 0.0018656716417910447,
"grad_norm": 2.3615723232353814,
"learning_rate": 9.999978529118625e-06,
"loss": 0.2777,
"step": 2
},
{
"epoch": 0.002798507462686567,
"grad_norm": 5.916454919666803,
"learning_rate": 9.999951690560122e-06,
"loss": 0.2836,
"step": 3
},
{
"epoch": 0.0037313432835820895,
"grad_norm": 1.8873931288294354,
"learning_rate": 9.999914116658896e-06,
"loss": 0.2506,
"step": 4
},
{
"epoch": 0.0046641791044776115,
"grad_norm": 2.2131303334001577,
"learning_rate": 9.999865807495616e-06,
"loss": 0.3082,
"step": 5
},
{
"epoch": 0.005597014925373134,
"grad_norm": 1.9071229748203362,
"learning_rate": 9.999806763174009e-06,
"loss": 0.2879,
"step": 6
},
{
"epoch": 0.0065298507462686565,
"grad_norm": 1.705352427822773,
"learning_rate": 9.999736983820849e-06,
"loss": 0.292,
"step": 7
},
{
"epoch": 0.007462686567164179,
"grad_norm": 1.713988508392442,
"learning_rate": 9.999656469585957e-06,
"loss": 0.27,
"step": 8
},
{
"epoch": 0.008395522388059701,
"grad_norm": 1.7146113640225777,
"learning_rate": 9.999565220642204e-06,
"loss": 0.273,
"step": 9
},
{
"epoch": 0.009328358208955223,
"grad_norm": 1.9717667837041324,
"learning_rate": 9.999463237185512e-06,
"loss": 0.2698,
"step": 10
},
{
"epoch": 0.010261194029850746,
"grad_norm": 1.6944388445330036,
"learning_rate": 9.999350519434845e-06,
"loss": 0.2598,
"step": 11
},
{
"epoch": 0.011194029850746268,
"grad_norm": 1.5056746742653715,
"learning_rate": 9.999227067632223e-06,
"loss": 0.2145,
"step": 12
},
{
"epoch": 0.012126865671641791,
"grad_norm": 1.6049599181993224,
"learning_rate": 9.999092882042703e-06,
"loss": 0.2638,
"step": 13
},
{
"epoch": 0.013059701492537313,
"grad_norm": 1.598791504158011,
"learning_rate": 9.998947962954395e-06,
"loss": 0.296,
"step": 14
},
{
"epoch": 0.013992537313432836,
"grad_norm": 1.4737350024393174,
"learning_rate": 9.998792310678454e-06,
"loss": 0.2599,
"step": 15
},
{
"epoch": 0.014925373134328358,
"grad_norm": 1.6209751246114812,
"learning_rate": 9.99862592554908e-06,
"loss": 0.2262,
"step": 16
},
{
"epoch": 0.01585820895522388,
"grad_norm": 1.5381445660374602,
"learning_rate": 9.998448807923517e-06,
"loss": 0.291,
"step": 17
},
{
"epoch": 0.016791044776119403,
"grad_norm": 1.5243144687560854,
"learning_rate": 9.998260958182048e-06,
"loss": 0.2357,
"step": 18
},
{
"epoch": 0.017723880597014924,
"grad_norm": 1.6139229688930536,
"learning_rate": 9.99806237672801e-06,
"loss": 0.2643,
"step": 19
},
{
"epoch": 0.018656716417910446,
"grad_norm": 1.6217246024693246,
"learning_rate": 9.997853063987768e-06,
"loss": 0.2965,
"step": 20
},
{
"epoch": 0.01958955223880597,
"grad_norm": 1.6080924641938708,
"learning_rate": 9.997633020410742e-06,
"loss": 0.2757,
"step": 21
},
{
"epoch": 0.020522388059701493,
"grad_norm": 1.5781983576612648,
"learning_rate": 9.997402246469382e-06,
"loss": 0.2598,
"step": 22
},
{
"epoch": 0.021455223880597014,
"grad_norm": 1.5222547775408568,
"learning_rate": 9.997160742659176e-06,
"loss": 0.2608,
"step": 23
},
{
"epoch": 0.022388059701492536,
"grad_norm": 1.4640943343714499,
"learning_rate": 9.996908509498662e-06,
"loss": 0.2431,
"step": 24
},
{
"epoch": 0.02332089552238806,
"grad_norm": 1.5871771632413834,
"learning_rate": 9.996645547529402e-06,
"loss": 0.3371,
"step": 25
},
{
"epoch": 0.024253731343283583,
"grad_norm": 1.5463201473320933,
"learning_rate": 9.996371857316e-06,
"loss": 0.234,
"step": 26
},
{
"epoch": 0.025186567164179104,
"grad_norm": 1.4718472647196803,
"learning_rate": 9.996087439446094e-06,
"loss": 0.2645,
"step": 27
},
{
"epoch": 0.026119402985074626,
"grad_norm": 1.4683539374199128,
"learning_rate": 9.995792294530356e-06,
"loss": 0.2496,
"step": 28
},
{
"epoch": 0.027052238805970148,
"grad_norm": 1.5295384301190804,
"learning_rate": 9.995486423202485e-06,
"loss": 0.2658,
"step": 29
},
{
"epoch": 0.027985074626865673,
"grad_norm": 1.5622767261246613,
"learning_rate": 9.995169826119215e-06,
"loss": 0.2305,
"step": 30
},
{
"epoch": 0.028917910447761194,
"grad_norm": 1.4856190072614097,
"learning_rate": 9.994842503960308e-06,
"loss": 0.2223,
"step": 31
},
{
"epoch": 0.029850746268656716,
"grad_norm": 1.5394365920327815,
"learning_rate": 9.994504457428557e-06,
"loss": 0.2478,
"step": 32
},
{
"epoch": 0.030783582089552237,
"grad_norm": 1.462578205444393,
"learning_rate": 9.994155687249775e-06,
"loss": 0.2205,
"step": 33
},
{
"epoch": 0.03171641791044776,
"grad_norm": 1.52001491082532,
"learning_rate": 9.993796194172806e-06,
"loss": 0.2576,
"step": 34
},
{
"epoch": 0.03264925373134328,
"grad_norm": 1.5000699624498415,
"learning_rate": 9.993425978969508e-06,
"loss": 0.2344,
"step": 35
},
{
"epoch": 0.033582089552238806,
"grad_norm": 1.4030942155270072,
"learning_rate": 9.993045042434772e-06,
"loss": 0.2137,
"step": 36
},
{
"epoch": 0.03451492537313433,
"grad_norm": 1.4902543532941115,
"learning_rate": 9.9926533853865e-06,
"loss": 0.2625,
"step": 37
},
{
"epoch": 0.03544776119402985,
"grad_norm": 1.469358131239095,
"learning_rate": 9.992251008665613e-06,
"loss": 0.2191,
"step": 38
},
{
"epoch": 0.036380597014925374,
"grad_norm": 1.5244188812937278,
"learning_rate": 9.991837913136053e-06,
"loss": 0.2598,
"step": 39
},
{
"epoch": 0.03731343283582089,
"grad_norm": 1.6528109851134296,
"learning_rate": 9.99141409968477e-06,
"loss": 0.3381,
"step": 40
},
{
"epoch": 0.03824626865671642,
"grad_norm": 1.6045526459467032,
"learning_rate": 9.990979569221733e-06,
"loss": 0.2637,
"step": 41
},
{
"epoch": 0.03917910447761194,
"grad_norm": 1.4764228467945528,
"learning_rate": 9.990534322679915e-06,
"loss": 0.2359,
"step": 42
},
{
"epoch": 0.04011194029850746,
"grad_norm": 1.471552235019458,
"learning_rate": 9.9900783610153e-06,
"loss": 0.2603,
"step": 43
},
{
"epoch": 0.041044776119402986,
"grad_norm": 1.4939545950094606,
"learning_rate": 9.989611685206881e-06,
"loss": 0.2442,
"step": 44
},
{
"epoch": 0.04197761194029851,
"grad_norm": 1.5330545678463738,
"learning_rate": 9.989134296256648e-06,
"loss": 0.2168,
"step": 45
},
{
"epoch": 0.04291044776119403,
"grad_norm": 1.5722324426590966,
"learning_rate": 9.988646195189604e-06,
"loss": 0.2524,
"step": 46
},
{
"epoch": 0.043843283582089554,
"grad_norm": 1.6673648587933672,
"learning_rate": 9.988147383053739e-06,
"loss": 0.279,
"step": 47
},
{
"epoch": 0.04477611940298507,
"grad_norm": 1.6316110614165935,
"learning_rate": 9.987637860920053e-06,
"loss": 0.277,
"step": 48
},
{
"epoch": 0.0457089552238806,
"grad_norm": 1.3017219161555913,
"learning_rate": 9.98711762988253e-06,
"loss": 0.2087,
"step": 49
},
{
"epoch": 0.04664179104477612,
"grad_norm": 1.6585007240946634,
"learning_rate": 9.986586691058156e-06,
"loss": 0.2402,
"step": 50
},
{
"epoch": 0.04757462686567164,
"grad_norm": 1.435129267578802,
"learning_rate": 9.986045045586904e-06,
"loss": 0.2258,
"step": 51
},
{
"epoch": 0.048507462686567165,
"grad_norm": 1.3641626482919518,
"learning_rate": 9.985492694631733e-06,
"loss": 0.2103,
"step": 52
},
{
"epoch": 0.049440298507462684,
"grad_norm": 1.5007160160488142,
"learning_rate": 9.984929639378593e-06,
"loss": 0.2694,
"step": 53
},
{
"epoch": 0.05037313432835821,
"grad_norm": 1.509449093302392,
"learning_rate": 9.98435588103641e-06,
"loss": 0.2931,
"step": 54
},
{
"epoch": 0.051305970149253734,
"grad_norm": 1.4629541603512073,
"learning_rate": 9.983771420837098e-06,
"loss": 0.2485,
"step": 55
},
{
"epoch": 0.05223880597014925,
"grad_norm": 1.3311013739415256,
"learning_rate": 9.983176260035546e-06,
"loss": 0.2031,
"step": 56
},
{
"epoch": 0.05317164179104478,
"grad_norm": 1.5115451257421706,
"learning_rate": 9.982570399909612e-06,
"loss": 0.2504,
"step": 57
},
{
"epoch": 0.054104477611940295,
"grad_norm": 1.3579306876167139,
"learning_rate": 9.981953841760137e-06,
"loss": 0.2289,
"step": 58
},
{
"epoch": 0.05503731343283582,
"grad_norm": 1.436056964181841,
"learning_rate": 9.981326586910921e-06,
"loss": 0.2212,
"step": 59
},
{
"epoch": 0.055970149253731345,
"grad_norm": 1.4863669227109872,
"learning_rate": 9.980688636708744e-06,
"loss": 0.2077,
"step": 60
},
{
"epoch": 0.05690298507462686,
"grad_norm": 1.6159740773767797,
"learning_rate": 9.980039992523335e-06,
"loss": 0.2353,
"step": 61
},
{
"epoch": 0.05783582089552239,
"grad_norm": 1.458582213485752,
"learning_rate": 9.979380655747395e-06,
"loss": 0.2133,
"step": 62
},
{
"epoch": 0.058768656716417914,
"grad_norm": 1.4937196703685367,
"learning_rate": 9.978710627796577e-06,
"loss": 0.2609,
"step": 63
},
{
"epoch": 0.05970149253731343,
"grad_norm": 1.5665181980933172,
"learning_rate": 9.978029910109491e-06,
"loss": 0.3306,
"step": 64
},
{
"epoch": 0.06063432835820896,
"grad_norm": 1.4545115623875786,
"learning_rate": 9.9773385041477e-06,
"loss": 0.2248,
"step": 65
},
{
"epoch": 0.061567164179104475,
"grad_norm": 1.367323597428911,
"learning_rate": 9.976636411395712e-06,
"loss": 0.2631,
"step": 66
},
{
"epoch": 0.0625,
"grad_norm": 1.4079528542577002,
"learning_rate": 9.975923633360985e-06,
"loss": 0.2141,
"step": 67
},
{
"epoch": 0.06343283582089553,
"grad_norm": 1.4239669532338677,
"learning_rate": 9.975200171573917e-06,
"loss": 0.2628,
"step": 68
},
{
"epoch": 0.06436567164179105,
"grad_norm": 1.456254873499018,
"learning_rate": 9.974466027587844e-06,
"loss": 0.2258,
"step": 69
},
{
"epoch": 0.06529850746268656,
"grad_norm": 1.4502780060390226,
"learning_rate": 9.973721202979038e-06,
"loss": 0.2491,
"step": 70
},
{
"epoch": 0.06623134328358209,
"grad_norm": 1.7170577208006956,
"learning_rate": 9.972965699346705e-06,
"loss": 0.238,
"step": 71
},
{
"epoch": 0.06716417910447761,
"grad_norm": 1.4596891076306684,
"learning_rate": 9.972199518312979e-06,
"loss": 0.2127,
"step": 72
},
{
"epoch": 0.06809701492537314,
"grad_norm": 1.4933040545922878,
"learning_rate": 9.971422661522919e-06,
"loss": 0.2424,
"step": 73
},
{
"epoch": 0.06902985074626866,
"grad_norm": 1.5726209337479455,
"learning_rate": 9.970635130644507e-06,
"loss": 0.2149,
"step": 74
},
{
"epoch": 0.06996268656716417,
"grad_norm": 1.4668363536094942,
"learning_rate": 9.96983692736864e-06,
"loss": 0.239,
"step": 75
},
{
"epoch": 0.0708955223880597,
"grad_norm": 1.4357573184764414,
"learning_rate": 9.969028053409131e-06,
"loss": 0.2146,
"step": 76
},
{
"epoch": 0.07182835820895522,
"grad_norm": 1.5436317261849903,
"learning_rate": 9.968208510502708e-06,
"loss": 0.2794,
"step": 77
},
{
"epoch": 0.07276119402985075,
"grad_norm": 1.4616769566530838,
"learning_rate": 9.967378300408998e-06,
"loss": 0.1884,
"step": 78
},
{
"epoch": 0.07369402985074627,
"grad_norm": 1.3693601049840698,
"learning_rate": 9.966537424910542e-06,
"loss": 0.2411,
"step": 79
},
{
"epoch": 0.07462686567164178,
"grad_norm": 1.4594105908413364,
"learning_rate": 9.965685885812773e-06,
"loss": 0.2815,
"step": 80
},
{
"epoch": 0.07555970149253731,
"grad_norm": 1.4610387019396303,
"learning_rate": 9.964823684944017e-06,
"loss": 0.2263,
"step": 81
},
{
"epoch": 0.07649253731343283,
"grad_norm": 1.6171224207013994,
"learning_rate": 9.963950824155502e-06,
"loss": 0.2349,
"step": 82
},
{
"epoch": 0.07742537313432836,
"grad_norm": 1.669653253270167,
"learning_rate": 9.963067305321334e-06,
"loss": 0.2529,
"step": 83
},
{
"epoch": 0.07835820895522388,
"grad_norm": 1.4542612622001394,
"learning_rate": 9.96217313033851e-06,
"loss": 0.2857,
"step": 84
},
{
"epoch": 0.07929104477611941,
"grad_norm": 1.5163247384336487,
"learning_rate": 9.961268301126902e-06,
"loss": 0.2822,
"step": 85
},
{
"epoch": 0.08022388059701492,
"grad_norm": 1.373641468055401,
"learning_rate": 9.960352819629259e-06,
"loss": 0.2355,
"step": 86
},
{
"epoch": 0.08115671641791045,
"grad_norm": 1.517536486659926,
"learning_rate": 9.959426687811202e-06,
"loss": 0.2159,
"step": 87
},
{
"epoch": 0.08208955223880597,
"grad_norm": 1.4125395258698592,
"learning_rate": 9.958489907661217e-06,
"loss": 0.2388,
"step": 88
},
{
"epoch": 0.0830223880597015,
"grad_norm": 1.3566639028343168,
"learning_rate": 9.957542481190656e-06,
"loss": 0.2173,
"step": 89
},
{
"epoch": 0.08395522388059702,
"grad_norm": 1.49748689852031,
"learning_rate": 9.95658441043373e-06,
"loss": 0.2716,
"step": 90
},
{
"epoch": 0.08488805970149253,
"grad_norm": 1.3945447151040264,
"learning_rate": 9.955615697447499e-06,
"loss": 0.2559,
"step": 91
},
{
"epoch": 0.08582089552238806,
"grad_norm": 1.3652676888583273,
"learning_rate": 9.95463634431188e-06,
"loss": 0.2426,
"step": 92
},
{
"epoch": 0.08675373134328358,
"grad_norm": 1.2906675900129567,
"learning_rate": 9.953646353129626e-06,
"loss": 0.2015,
"step": 93
},
{
"epoch": 0.08768656716417911,
"grad_norm": 1.4046299397203328,
"learning_rate": 9.952645726026344e-06,
"loss": 0.2339,
"step": 94
},
{
"epoch": 0.08861940298507463,
"grad_norm": 1.5899253571402825,
"learning_rate": 9.951634465150463e-06,
"loss": 0.2938,
"step": 95
},
{
"epoch": 0.08955223880597014,
"grad_norm": 1.3529056182643173,
"learning_rate": 9.950612572673255e-06,
"loss": 0.2065,
"step": 96
},
{
"epoch": 0.09048507462686567,
"grad_norm": 1.4152494711718966,
"learning_rate": 9.949580050788813e-06,
"loss": 0.2309,
"step": 97
},
{
"epoch": 0.0914179104477612,
"grad_norm": 1.330761603292533,
"learning_rate": 9.948536901714052e-06,
"loss": 0.2039,
"step": 98
},
{
"epoch": 0.09235074626865672,
"grad_norm": 1.4554400226887259,
"learning_rate": 9.947483127688708e-06,
"loss": 0.2206,
"step": 99
},
{
"epoch": 0.09328358208955224,
"grad_norm": 1.5989141026459632,
"learning_rate": 9.946418730975326e-06,
"loss": 0.3154,
"step": 100
},
{
"epoch": 0.09421641791044776,
"grad_norm": 1.4173678693730305,
"learning_rate": 9.945343713859265e-06,
"loss": 0.2588,
"step": 101
},
{
"epoch": 0.09514925373134328,
"grad_norm": 1.2482382413232498,
"learning_rate": 9.944258078648679e-06,
"loss": 0.1844,
"step": 102
},
{
"epoch": 0.0960820895522388,
"grad_norm": 1.5034039526413285,
"learning_rate": 9.943161827674524e-06,
"loss": 0.2794,
"step": 103
},
{
"epoch": 0.09701492537313433,
"grad_norm": 1.4654084731129182,
"learning_rate": 9.942054963290549e-06,
"loss": 0.2893,
"step": 104
},
{
"epoch": 0.09794776119402986,
"grad_norm": 1.3112663438975303,
"learning_rate": 9.940937487873291e-06,
"loss": 0.2416,
"step": 105
},
{
"epoch": 0.09888059701492537,
"grad_norm": 1.4583196765720625,
"learning_rate": 9.939809403822069e-06,
"loss": 0.2563,
"step": 106
},
{
"epoch": 0.09981343283582089,
"grad_norm": 1.5160302662676066,
"learning_rate": 9.938670713558983e-06,
"loss": 0.2758,
"step": 107
},
{
"epoch": 0.10074626865671642,
"grad_norm": 1.3769691563103652,
"learning_rate": 9.9375214195289e-06,
"loss": 0.2187,
"step": 108
},
{
"epoch": 0.10167910447761194,
"grad_norm": 1.3836367725537049,
"learning_rate": 9.936361524199457e-06,
"loss": 0.2439,
"step": 109
},
{
"epoch": 0.10261194029850747,
"grad_norm": 1.3438591129480295,
"learning_rate": 9.935191030061052e-06,
"loss": 0.2104,
"step": 110
},
{
"epoch": 0.10354477611940298,
"grad_norm": 1.3272511661454085,
"learning_rate": 9.934009939626841e-06,
"loss": 0.2139,
"step": 111
},
{
"epoch": 0.1044776119402985,
"grad_norm": 1.3915937126780076,
"learning_rate": 9.932818255432733e-06,
"loss": 0.242,
"step": 112
},
{
"epoch": 0.10541044776119403,
"grad_norm": 1.3760119807889453,
"learning_rate": 9.931615980037379e-06,
"loss": 0.2244,
"step": 113
},
{
"epoch": 0.10634328358208955,
"grad_norm": 1.429382679968247,
"learning_rate": 9.930403116022169e-06,
"loss": 0.257,
"step": 114
},
{
"epoch": 0.10727611940298508,
"grad_norm": 1.4820454678416672,
"learning_rate": 9.929179665991234e-06,
"loss": 0.2634,
"step": 115
},
{
"epoch": 0.10820895522388059,
"grad_norm": 1.3717549449001434,
"learning_rate": 9.92794563257143e-06,
"loss": 0.2269,
"step": 116
},
{
"epoch": 0.10914179104477612,
"grad_norm": 1.4152419184213607,
"learning_rate": 9.926701018412335e-06,
"loss": 0.2371,
"step": 117
},
{
"epoch": 0.11007462686567164,
"grad_norm": 1.4937791162173693,
"learning_rate": 9.925445826186246e-06,
"loss": 0.2784,
"step": 118
},
{
"epoch": 0.11100746268656717,
"grad_norm": 1.4668852652230355,
"learning_rate": 9.924180058588177e-06,
"loss": 0.251,
"step": 119
},
{
"epoch": 0.11194029850746269,
"grad_norm": 1.4588624579256642,
"learning_rate": 9.92290371833584e-06,
"loss": 0.2427,
"step": 120
},
{
"epoch": 0.11287313432835822,
"grad_norm": 1.4192268281373919,
"learning_rate": 9.921616808169655e-06,
"loss": 0.2535,
"step": 121
},
{
"epoch": 0.11380597014925373,
"grad_norm": 1.3360201557699796,
"learning_rate": 9.920319330852729e-06,
"loss": 0.2069,
"step": 122
},
{
"epoch": 0.11473880597014925,
"grad_norm": 1.442691611821781,
"learning_rate": 9.919011289170863e-06,
"loss": 0.262,
"step": 123
},
{
"epoch": 0.11567164179104478,
"grad_norm": 1.5573463553214775,
"learning_rate": 9.91769268593254e-06,
"loss": 0.2973,
"step": 124
},
{
"epoch": 0.1166044776119403,
"grad_norm": 1.338534413462635,
"learning_rate": 9.91636352396892e-06,
"loss": 0.2108,
"step": 125
},
{
"epoch": 0.11753731343283583,
"grad_norm": 1.4018913055329987,
"learning_rate": 9.915023806133833e-06,
"loss": 0.2473,
"step": 126
},
{
"epoch": 0.11847014925373134,
"grad_norm": 1.418439965059982,
"learning_rate": 9.913673535303768e-06,
"loss": 0.2702,
"step": 127
},
{
"epoch": 0.11940298507462686,
"grad_norm": 1.453592069106935,
"learning_rate": 9.91231271437788e-06,
"loss": 0.2167,
"step": 128
},
{
"epoch": 0.12033582089552239,
"grad_norm": 1.3895229333541956,
"learning_rate": 9.910941346277976e-06,
"loss": 0.2384,
"step": 129
},
{
"epoch": 0.12126865671641791,
"grad_norm": 1.364367296904366,
"learning_rate": 9.909559433948501e-06,
"loss": 0.2114,
"step": 130
},
{
"epoch": 0.12220149253731344,
"grad_norm": 1.3482844635327793,
"learning_rate": 9.908166980356548e-06,
"loss": 0.1961,
"step": 131
},
{
"epoch": 0.12313432835820895,
"grad_norm": 1.3630742951053016,
"learning_rate": 9.906763988491834e-06,
"loss": 0.2126,
"step": 132
},
{
"epoch": 0.12406716417910447,
"grad_norm": 1.3064487916152947,
"learning_rate": 9.905350461366713e-06,
"loss": 0.2072,
"step": 133
},
{
"epoch": 0.125,
"grad_norm": 1.3972019735824797,
"learning_rate": 9.903926402016153e-06,
"loss": 0.2486,
"step": 134
},
{
"epoch": 0.1259328358208955,
"grad_norm": 1.3602872226033067,
"learning_rate": 9.902491813497735e-06,
"loss": 0.2286,
"step": 135
},
{
"epoch": 0.12686567164179105,
"grad_norm": 1.2596145835806758,
"learning_rate": 9.901046698891648e-06,
"loss": 0.2085,
"step": 136
},
{
"epoch": 0.12779850746268656,
"grad_norm": 1.3636680187102763,
"learning_rate": 9.899591061300684e-06,
"loss": 0.2283,
"step": 137
},
{
"epoch": 0.1287313432835821,
"grad_norm": 1.3820368798328602,
"learning_rate": 9.898124903850228e-06,
"loss": 0.2515,
"step": 138
},
{
"epoch": 0.1296641791044776,
"grad_norm": 1.4127756119119061,
"learning_rate": 9.896648229688248e-06,
"loss": 0.1884,
"step": 139
},
{
"epoch": 0.13059701492537312,
"grad_norm": 1.3516953099470277,
"learning_rate": 9.895161041985295e-06,
"loss": 0.2493,
"step": 140
},
{
"epoch": 0.13152985074626866,
"grad_norm": 1.4736087776119398,
"learning_rate": 9.893663343934496e-06,
"loss": 0.2884,
"step": 141
},
{
"epoch": 0.13246268656716417,
"grad_norm": 1.4800592264693608,
"learning_rate": 9.892155138751542e-06,
"loss": 0.2488,
"step": 142
},
{
"epoch": 0.1333955223880597,
"grad_norm": 1.5725783650380547,
"learning_rate": 9.890636429674684e-06,
"loss": 0.2206,
"step": 143
},
{
"epoch": 0.13432835820895522,
"grad_norm": 1.4414825402474298,
"learning_rate": 9.889107219964726e-06,
"loss": 0.2911,
"step": 144
},
{
"epoch": 0.13526119402985073,
"grad_norm": 1.4198450748776796,
"learning_rate": 9.887567512905019e-06,
"loss": 0.2328,
"step": 145
},
{
"epoch": 0.13619402985074627,
"grad_norm": 1.4477631005726908,
"learning_rate": 9.886017311801449e-06,
"loss": 0.2357,
"step": 146
},
{
"epoch": 0.13712686567164178,
"grad_norm": 1.2948928799390829,
"learning_rate": 9.884456619982437e-06,
"loss": 0.2041,
"step": 147
},
{
"epoch": 0.13805970149253732,
"grad_norm": 1.3172494063741267,
"learning_rate": 9.882885440798928e-06,
"loss": 0.2666,
"step": 148
},
{
"epoch": 0.13899253731343283,
"grad_norm": 1.4117329418770321,
"learning_rate": 9.881303777624385e-06,
"loss": 0.1783,
"step": 149
},
{
"epoch": 0.13992537313432835,
"grad_norm": 1.5119339771702494,
"learning_rate": 9.879711633854778e-06,
"loss": 0.2387,
"step": 150
},
{
"epoch": 0.14085820895522388,
"grad_norm": 1.4705530827214168,
"learning_rate": 9.878109012908583e-06,
"loss": 0.2498,
"step": 151
},
{
"epoch": 0.1417910447761194,
"grad_norm": 1.4229149575244904,
"learning_rate": 9.876495918226772e-06,
"loss": 0.2582,
"step": 152
},
{
"epoch": 0.14272388059701493,
"grad_norm": 1.4502287985677522,
"learning_rate": 9.8748723532728e-06,
"loss": 0.226,
"step": 153
},
{
"epoch": 0.14365671641791045,
"grad_norm": 1.3114945077716187,
"learning_rate": 9.873238321532609e-06,
"loss": 0.1934,
"step": 154
},
{
"epoch": 0.14458955223880596,
"grad_norm": 1.3654237571951762,
"learning_rate": 9.871593826514607e-06,
"loss": 0.2284,
"step": 155
},
{
"epoch": 0.1455223880597015,
"grad_norm": 1.3814896512674422,
"learning_rate": 9.869938871749676e-06,
"loss": 0.2393,
"step": 156
},
{
"epoch": 0.146455223880597,
"grad_norm": 1.3384575997158925,
"learning_rate": 9.86827346079115e-06,
"loss": 0.2453,
"step": 157
},
{
"epoch": 0.14738805970149255,
"grad_norm": 1.4269277163538632,
"learning_rate": 9.866597597214815e-06,
"loss": 0.2434,
"step": 158
},
{
"epoch": 0.14832089552238806,
"grad_norm": 1.2466769585933348,
"learning_rate": 9.864911284618899e-06,
"loss": 0.1927,
"step": 159
},
{
"epoch": 0.14925373134328357,
"grad_norm": 1.3782896759760206,
"learning_rate": 9.863214526624065e-06,
"loss": 0.2159,
"step": 160
},
{
"epoch": 0.1501865671641791,
"grad_norm": 1.391702740422766,
"learning_rate": 9.861507326873407e-06,
"loss": 0.2506,
"step": 161
},
{
"epoch": 0.15111940298507462,
"grad_norm": 1.3717322250887456,
"learning_rate": 9.859789689032434e-06,
"loss": 0.2532,
"step": 162
},
{
"epoch": 0.15205223880597016,
"grad_norm": 1.286489768706081,
"learning_rate": 9.858061616789068e-06,
"loss": 0.1956,
"step": 163
},
{
"epoch": 0.15298507462686567,
"grad_norm": 1.4077796091428876,
"learning_rate": 9.856323113853632e-06,
"loss": 0.2275,
"step": 164
},
{
"epoch": 0.15391791044776118,
"grad_norm": 1.3326459434111209,
"learning_rate": 9.854574183958849e-06,
"loss": 0.2273,
"step": 165
},
{
"epoch": 0.15485074626865672,
"grad_norm": 1.3422225620535737,
"learning_rate": 9.852814830859827e-06,
"loss": 0.2599,
"step": 166
},
{
"epoch": 0.15578358208955223,
"grad_norm": 1.3195632256694576,
"learning_rate": 9.851045058334055e-06,
"loss": 0.1826,
"step": 167
},
{
"epoch": 0.15671641791044777,
"grad_norm": 1.266822639136575,
"learning_rate": 9.849264870181393e-06,
"loss": 0.2173,
"step": 168
},
{
"epoch": 0.15764925373134328,
"grad_norm": 1.3815334501848933,
"learning_rate": 9.847474270224062e-06,
"loss": 0.2077,
"step": 169
},
{
"epoch": 0.15858208955223882,
"grad_norm": 1.400081571893218,
"learning_rate": 9.845673262306643e-06,
"loss": 0.2722,
"step": 170
},
{
"epoch": 0.15951492537313433,
"grad_norm": 1.4215899008723631,
"learning_rate": 9.843861850296058e-06,
"loss": 0.2297,
"step": 171
},
{
"epoch": 0.16044776119402984,
"grad_norm": 1.3378155864059595,
"learning_rate": 9.842040038081572e-06,
"loss": 0.2473,
"step": 172
},
{
"epoch": 0.16138059701492538,
"grad_norm": 1.3874487537624232,
"learning_rate": 9.840207829574777e-06,
"loss": 0.2578,
"step": 173
},
{
"epoch": 0.1623134328358209,
"grad_norm": 1.2370882678550728,
"learning_rate": 9.838365228709588e-06,
"loss": 0.1954,
"step": 174
},
{
"epoch": 0.16324626865671643,
"grad_norm": 1.3358518993309685,
"learning_rate": 9.836512239442237e-06,
"loss": 0.2406,
"step": 175
},
{
"epoch": 0.16417910447761194,
"grad_norm": 1.4150781522345088,
"learning_rate": 9.834648865751254e-06,
"loss": 0.2219,
"step": 176
},
{
"epoch": 0.16511194029850745,
"grad_norm": 1.5355994840163563,
"learning_rate": 9.832775111637469e-06,
"loss": 0.2353,
"step": 177
},
{
"epoch": 0.166044776119403,
"grad_norm": 1.58448880113716,
"learning_rate": 9.830890981124001e-06,
"loss": 0.2773,
"step": 178
},
{
"epoch": 0.1669776119402985,
"grad_norm": 1.4064170913348892,
"learning_rate": 9.828996478256246e-06,
"loss": 0.2411,
"step": 179
},
{
"epoch": 0.16791044776119404,
"grad_norm": 1.343334533340573,
"learning_rate": 9.827091607101871e-06,
"loss": 0.2452,
"step": 180
},
{
"epoch": 0.16884328358208955,
"grad_norm": 1.3862492550886658,
"learning_rate": 9.825176371750802e-06,
"loss": 0.2269,
"step": 181
},
{
"epoch": 0.16977611940298507,
"grad_norm": 1.4314862429416477,
"learning_rate": 9.823250776315223e-06,
"loss": 0.289,
"step": 182
},
{
"epoch": 0.1707089552238806,
"grad_norm": 1.3897064722948196,
"learning_rate": 9.82131482492956e-06,
"loss": 0.2456,
"step": 183
},
{
"epoch": 0.17164179104477612,
"grad_norm": 1.4425674269218023,
"learning_rate": 9.81936852175047e-06,
"loss": 0.2426,
"step": 184
},
{
"epoch": 0.17257462686567165,
"grad_norm": 1.3179382745429815,
"learning_rate": 9.817411870956843e-06,
"loss": 0.2202,
"step": 185
},
{
"epoch": 0.17350746268656717,
"grad_norm": 1.4657681026021574,
"learning_rate": 9.81544487674978e-06,
"loss": 0.278,
"step": 186
},
{
"epoch": 0.17444029850746268,
"grad_norm": 1.39854851518916,
"learning_rate": 9.813467543352598e-06,
"loss": 0.2452,
"step": 187
},
{
"epoch": 0.17537313432835822,
"grad_norm": 1.4897917428136873,
"learning_rate": 9.811479875010801e-06,
"loss": 0.2795,
"step": 188
},
{
"epoch": 0.17630597014925373,
"grad_norm": 1.45635595333176,
"learning_rate": 9.809481875992097e-06,
"loss": 0.2349,
"step": 189
},
{
"epoch": 0.17723880597014927,
"grad_norm": 1.4518420147728965,
"learning_rate": 9.807473550586368e-06,
"loss": 0.2513,
"step": 190
},
{
"epoch": 0.17817164179104478,
"grad_norm": 1.4205330284183837,
"learning_rate": 9.805454903105663e-06,
"loss": 0.2035,
"step": 191
},
{
"epoch": 0.1791044776119403,
"grad_norm": 1.4932540354115929,
"learning_rate": 9.803425937884202e-06,
"loss": 0.2407,
"step": 192
},
{
"epoch": 0.18003731343283583,
"grad_norm": 1.4332138465693849,
"learning_rate": 9.801386659278354e-06,
"loss": 0.2444,
"step": 193
},
{
"epoch": 0.18097014925373134,
"grad_norm": 1.432375521439532,
"learning_rate": 9.799337071666632e-06,
"loss": 0.2618,
"step": 194
},
{
"epoch": 0.18190298507462688,
"grad_norm": 1.3471819631859887,
"learning_rate": 9.797277179449684e-06,
"loss": 0.2668,
"step": 195
},
{
"epoch": 0.1828358208955224,
"grad_norm": 1.284197489128854,
"learning_rate": 9.79520698705028e-06,
"loss": 0.2015,
"step": 196
},
{
"epoch": 0.1837686567164179,
"grad_norm": 1.3569140878727413,
"learning_rate": 9.793126498913313e-06,
"loss": 0.2431,
"step": 197
},
{
"epoch": 0.18470149253731344,
"grad_norm": 1.38127819209215,
"learning_rate": 9.791035719505773e-06,
"loss": 0.2279,
"step": 198
},
{
"epoch": 0.18563432835820895,
"grad_norm": 1.3106409372023844,
"learning_rate": 9.788934653316751e-06,
"loss": 0.2225,
"step": 199
},
{
"epoch": 0.1865671641791045,
"grad_norm": 1.385041781717496,
"learning_rate": 9.786823304857424e-06,
"loss": 0.2577,
"step": 200
},
{
"epoch": 0.1875,
"grad_norm": 1.3374693150632666,
"learning_rate": 9.784701678661045e-06,
"loss": 0.2055,
"step": 201
},
{
"epoch": 0.1884328358208955,
"grad_norm": 1.2943022183043107,
"learning_rate": 9.782569779282936e-06,
"loss": 0.1775,
"step": 202
},
{
"epoch": 0.18936567164179105,
"grad_norm": 1.4905925652544376,
"learning_rate": 9.780427611300474e-06,
"loss": 0.2704,
"step": 203
},
{
"epoch": 0.19029850746268656,
"grad_norm": 1.3498812547956893,
"learning_rate": 9.778275179313084e-06,
"loss": 0.2123,
"step": 204
},
{
"epoch": 0.1912313432835821,
"grad_norm": 1.4192006060768911,
"learning_rate": 9.776112487942234e-06,
"loss": 0.2311,
"step": 205
},
{
"epoch": 0.1921641791044776,
"grad_norm": 1.318443539824902,
"learning_rate": 9.77393954183141e-06,
"loss": 0.212,
"step": 206
},
{
"epoch": 0.19309701492537312,
"grad_norm": 1.4033355905689706,
"learning_rate": 9.771756345646126e-06,
"loss": 0.2445,
"step": 207
},
{
"epoch": 0.19402985074626866,
"grad_norm": 1.4135479045174189,
"learning_rate": 9.769562904073896e-06,
"loss": 0.2259,
"step": 208
},
{
"epoch": 0.19496268656716417,
"grad_norm": 1.5189371630436521,
"learning_rate": 9.767359221824236e-06,
"loss": 0.3221,
"step": 209
},
{
"epoch": 0.1958955223880597,
"grad_norm": 1.501654221671077,
"learning_rate": 9.765145303628649e-06,
"loss": 0.2155,
"step": 210
},
{
"epoch": 0.19682835820895522,
"grad_norm": 1.3422511623677083,
"learning_rate": 9.762921154240614e-06,
"loss": 0.213,
"step": 211
},
{
"epoch": 0.19776119402985073,
"grad_norm": 1.417574830290805,
"learning_rate": 9.76068677843558e-06,
"loss": 0.2494,
"step": 212
},
{
"epoch": 0.19869402985074627,
"grad_norm": 1.4303838899263415,
"learning_rate": 9.75844218101095e-06,
"loss": 0.229,
"step": 213
},
{
"epoch": 0.19962686567164178,
"grad_norm": 1.3210532389813487,
"learning_rate": 9.756187366786077e-06,
"loss": 0.2287,
"step": 214
},
{
"epoch": 0.20055970149253732,
"grad_norm": 1.4301375913667724,
"learning_rate": 9.753922340602245e-06,
"loss": 0.2262,
"step": 215
},
{
"epoch": 0.20149253731343283,
"grad_norm": 1.464427337181014,
"learning_rate": 9.751647107322668e-06,
"loss": 0.2387,
"step": 216
},
{
"epoch": 0.20242537313432835,
"grad_norm": 1.283762633828094,
"learning_rate": 9.749361671832478e-06,
"loss": 0.2051,
"step": 217
},
{
"epoch": 0.20335820895522388,
"grad_norm": 1.4046271563330237,
"learning_rate": 9.747066039038707e-06,
"loss": 0.2809,
"step": 218
},
{
"epoch": 0.2042910447761194,
"grad_norm": 1.3858875789586913,
"learning_rate": 9.744760213870286e-06,
"loss": 0.2559,
"step": 219
},
{
"epoch": 0.20522388059701493,
"grad_norm": 1.4633410789476218,
"learning_rate": 9.742444201278022e-06,
"loss": 0.1991,
"step": 220
},
{
"epoch": 0.20615671641791045,
"grad_norm": 1.35952836186008,
"learning_rate": 9.740118006234607e-06,
"loss": 0.2196,
"step": 221
},
{
"epoch": 0.20708955223880596,
"grad_norm": 1.3170051893053112,
"learning_rate": 9.737781633734586e-06,
"loss": 0.1885,
"step": 222
},
{
"epoch": 0.2080223880597015,
"grad_norm": 1.3364564968573158,
"learning_rate": 9.735435088794361e-06,
"loss": 0.2092,
"step": 223
},
{
"epoch": 0.208955223880597,
"grad_norm": 1.310621521925476,
"learning_rate": 9.733078376452172e-06,
"loss": 0.1858,
"step": 224
},
{
"epoch": 0.20988805970149255,
"grad_norm": 1.3305475178644435,
"learning_rate": 9.730711501768091e-06,
"loss": 0.2396,
"step": 225
},
{
"epoch": 0.21082089552238806,
"grad_norm": 1.3635865870378607,
"learning_rate": 9.72833446982401e-06,
"loss": 0.2343,
"step": 226
},
{
"epoch": 0.21175373134328357,
"grad_norm": 1.3892908120924656,
"learning_rate": 9.725947285723629e-06,
"loss": 0.2494,
"step": 227
},
{
"epoch": 0.2126865671641791,
"grad_norm": 1.4923858063155087,
"learning_rate": 9.723549954592447e-06,
"loss": 0.2684,
"step": 228
},
{
"epoch": 0.21361940298507462,
"grad_norm": 1.4794880593929247,
"learning_rate": 9.721142481577744e-06,
"loss": 0.2433,
"step": 229
},
{
"epoch": 0.21455223880597016,
"grad_norm": 1.319052535367511,
"learning_rate": 9.718724871848581e-06,
"loss": 0.1957,
"step": 230
},
{
"epoch": 0.21548507462686567,
"grad_norm": 1.4725370664727746,
"learning_rate": 9.716297130595784e-06,
"loss": 0.2691,
"step": 231
},
{
"epoch": 0.21641791044776118,
"grad_norm": 1.450324974590314,
"learning_rate": 9.713859263031928e-06,
"loss": 0.2075,
"step": 232
},
{
"epoch": 0.21735074626865672,
"grad_norm": 1.4029613700148216,
"learning_rate": 9.711411274391334e-06,
"loss": 0.2636,
"step": 233
},
{
"epoch": 0.21828358208955223,
"grad_norm": 1.3218009630051155,
"learning_rate": 9.70895316993005e-06,
"loss": 0.1856,
"step": 234
},
{
"epoch": 0.21921641791044777,
"grad_norm": 1.3772203562395018,
"learning_rate": 9.706484954925848e-06,
"loss": 0.2221,
"step": 235
},
{
"epoch": 0.22014925373134328,
"grad_norm": 1.338495075466863,
"learning_rate": 9.704006634678205e-06,
"loss": 0.2422,
"step": 236
},
{
"epoch": 0.22108208955223882,
"grad_norm": 1.3505260994538177,
"learning_rate": 9.701518214508295e-06,
"loss": 0.2219,
"step": 237
},
{
"epoch": 0.22201492537313433,
"grad_norm": 1.3796800954684891,
"learning_rate": 9.69901969975898e-06,
"loss": 0.2599,
"step": 238
},
{
"epoch": 0.22294776119402984,
"grad_norm": 1.3801269945094994,
"learning_rate": 9.696511095794794e-06,
"loss": 0.2658,
"step": 239
},
{
"epoch": 0.22388059701492538,
"grad_norm": 1.373830630137388,
"learning_rate": 9.693992408001934e-06,
"loss": 0.2366,
"step": 240
},
{
"epoch": 0.2248134328358209,
"grad_norm": 1.3099841410165376,
"learning_rate": 9.691463641788244e-06,
"loss": 0.2444,
"step": 241
},
{
"epoch": 0.22574626865671643,
"grad_norm": 1.2221561716646219,
"learning_rate": 9.688924802583215e-06,
"loss": 0.1762,
"step": 242
},
{
"epoch": 0.22667910447761194,
"grad_norm": 1.4100739023363644,
"learning_rate": 9.68637589583796e-06,
"loss": 0.27,
"step": 243
},
{
"epoch": 0.22761194029850745,
"grad_norm": 1.3049181672879322,
"learning_rate": 9.683816927025212e-06,
"loss": 0.2358,
"step": 244
},
{
"epoch": 0.228544776119403,
"grad_norm": 1.3168040345621081,
"learning_rate": 9.6812479016393e-06,
"loss": 0.2132,
"step": 245
},
{
"epoch": 0.2294776119402985,
"grad_norm": 1.3491278430037894,
"learning_rate": 9.678668825196155e-06,
"loss": 0.225,
"step": 246
},
{
"epoch": 0.23041044776119404,
"grad_norm": 1.3564580773647767,
"learning_rate": 9.676079703233283e-06,
"loss": 0.2174,
"step": 247
},
{
"epoch": 0.23134328358208955,
"grad_norm": 1.3908729062228546,
"learning_rate": 9.673480541309761e-06,
"loss": 0.2413,
"step": 248
},
{
"epoch": 0.23227611940298507,
"grad_norm": 1.269500269440636,
"learning_rate": 9.670871345006221e-06,
"loss": 0.1922,
"step": 249
},
{
"epoch": 0.2332089552238806,
"grad_norm": 1.370700147884663,
"learning_rate": 9.66825211992484e-06,
"loss": 0.2288,
"step": 250
},
{
"epoch": 0.23414179104477612,
"grad_norm": 1.267163085637987,
"learning_rate": 9.665622871689329e-06,
"loss": 0.2046,
"step": 251
},
{
"epoch": 0.23507462686567165,
"grad_norm": 1.298510817211434,
"learning_rate": 9.662983605944918e-06,
"loss": 0.2151,
"step": 252
},
{
"epoch": 0.23600746268656717,
"grad_norm": 1.311525030838766,
"learning_rate": 9.660334328358345e-06,
"loss": 0.2041,
"step": 253
},
{
"epoch": 0.23694029850746268,
"grad_norm": 1.3768838650896877,
"learning_rate": 9.65767504461785e-06,
"loss": 0.2488,
"step": 254
},
{
"epoch": 0.23787313432835822,
"grad_norm": 1.3453744719741791,
"learning_rate": 9.65500576043315e-06,
"loss": 0.2079,
"step": 255
},
{
"epoch": 0.23880597014925373,
"grad_norm": 1.2776829624435933,
"learning_rate": 9.652326481535434e-06,
"loss": 0.1966,
"step": 256
},
{
"epoch": 0.23973880597014927,
"grad_norm": 1.3767181262573056,
"learning_rate": 9.649637213677357e-06,
"loss": 0.2377,
"step": 257
},
{
"epoch": 0.24067164179104478,
"grad_norm": 1.510423982807279,
"learning_rate": 9.646937962633014e-06,
"loss": 0.2917,
"step": 258
},
{
"epoch": 0.2416044776119403,
"grad_norm": 1.387266789016929,
"learning_rate": 9.64422873419794e-06,
"loss": 0.2163,
"step": 259
},
{
"epoch": 0.24253731343283583,
"grad_norm": 1.3166417891082647,
"learning_rate": 9.64150953418909e-06,
"loss": 0.2357,
"step": 260
},
{
"epoch": 0.24347014925373134,
"grad_norm": 1.2797218591347883,
"learning_rate": 9.63878036844483e-06,
"loss": 0.2081,
"step": 261
},
{
"epoch": 0.24440298507462688,
"grad_norm": 1.311554046142437,
"learning_rate": 9.636041242824921e-06,
"loss": 0.1977,
"step": 262
},
{
"epoch": 0.2453358208955224,
"grad_norm": 1.3999553328296679,
"learning_rate": 9.63329216321051e-06,
"loss": 0.2363,
"step": 263
},
{
"epoch": 0.2462686567164179,
"grad_norm": 1.3554101059225228,
"learning_rate": 9.630533135504118e-06,
"loss": 0.2508,
"step": 264
},
{
"epoch": 0.24720149253731344,
"grad_norm": 1.3676131273735823,
"learning_rate": 9.627764165629623e-06,
"loss": 0.2192,
"step": 265
},
{
"epoch": 0.24813432835820895,
"grad_norm": 1.3976149276187737,
"learning_rate": 9.624985259532251e-06,
"loss": 0.2292,
"step": 266
},
{
"epoch": 0.2490671641791045,
"grad_norm": 1.5796116347031655,
"learning_rate": 9.622196423178562e-06,
"loss": 0.2759,
"step": 267
},
{
"epoch": 0.25,
"grad_norm": 1.5270801308560245,
"learning_rate": 9.619397662556434e-06,
"loss": 0.2455,
"step": 268
},
{
"epoch": 0.25093283582089554,
"grad_norm": 1.414754787779726,
"learning_rate": 9.61658898367506e-06,
"loss": 0.1956,
"step": 269
},
{
"epoch": 0.251865671641791,
"grad_norm": 1.313800531793292,
"learning_rate": 9.613770392564921e-06,
"loss": 0.2041,
"step": 270
},
{
"epoch": 0.25279850746268656,
"grad_norm": 1.4419235484624284,
"learning_rate": 9.610941895277784e-06,
"loss": 0.2756,
"step": 271
},
{
"epoch": 0.2537313432835821,
"grad_norm": 1.449298486490396,
"learning_rate": 9.608103497886687e-06,
"loss": 0.2678,
"step": 272
},
{
"epoch": 0.25466417910447764,
"grad_norm": 1.413245202772997,
"learning_rate": 9.605255206485922e-06,
"loss": 0.2551,
"step": 273
},
{
"epoch": 0.2555970149253731,
"grad_norm": 1.2714103244027373,
"learning_rate": 9.602397027191026e-06,
"loss": 0.1897,
"step": 274
},
{
"epoch": 0.25652985074626866,
"grad_norm": 1.3662586560658077,
"learning_rate": 9.599528966138763e-06,
"loss": 0.2685,
"step": 275
},
{
"epoch": 0.2574626865671642,
"grad_norm": 1.3062222445535137,
"learning_rate": 9.596651029487116e-06,
"loss": 0.2123,
"step": 276
},
{
"epoch": 0.2583955223880597,
"grad_norm": 1.2910765824769777,
"learning_rate": 9.593763223415275e-06,
"loss": 0.2362,
"step": 277
},
{
"epoch": 0.2593283582089552,
"grad_norm": 1.2822070852083836,
"learning_rate": 9.590865554123614e-06,
"loss": 0.2068,
"step": 278
},
{
"epoch": 0.26026119402985076,
"grad_norm": 1.392777607498332,
"learning_rate": 9.587958027833691e-06,
"loss": 0.2229,
"step": 279
},
{
"epoch": 0.26119402985074625,
"grad_norm": 1.4255656729973265,
"learning_rate": 9.585040650788222e-06,
"loss": 0.2477,
"step": 280
},
{
"epoch": 0.2621268656716418,
"grad_norm": 1.4169296716396083,
"learning_rate": 9.582113429251076e-06,
"loss": 0.1878,
"step": 281
},
{
"epoch": 0.2630597014925373,
"grad_norm": 1.3513572522860997,
"learning_rate": 9.579176369507262e-06,
"loss": 0.1998,
"step": 282
},
{
"epoch": 0.26399253731343286,
"grad_norm": 1.3667340131720151,
"learning_rate": 9.576229477862905e-06,
"loss": 0.2251,
"step": 283
},
{
"epoch": 0.26492537313432835,
"grad_norm": 1.2853412344374613,
"learning_rate": 9.573272760645248e-06,
"loss": 0.236,
"step": 284
},
{
"epoch": 0.2658582089552239,
"grad_norm": 1.4691562800080749,
"learning_rate": 9.570306224202625e-06,
"loss": 0.3,
"step": 285
},
{
"epoch": 0.2667910447761194,
"grad_norm": 1.3925927328135574,
"learning_rate": 9.567329874904456e-06,
"loss": 0.2249,
"step": 286
},
{
"epoch": 0.2677238805970149,
"grad_norm": 1.439250276065806,
"learning_rate": 9.56434371914123e-06,
"loss": 0.2476,
"step": 287
},
{
"epoch": 0.26865671641791045,
"grad_norm": 1.38564396436711,
"learning_rate": 9.561347763324484e-06,
"loss": 0.2584,
"step": 288
},
{
"epoch": 0.269589552238806,
"grad_norm": 1.399725121507279,
"learning_rate": 9.55834201388681e-06,
"loss": 0.2033,
"step": 289
},
{
"epoch": 0.27052238805970147,
"grad_norm": 1.4448814363169957,
"learning_rate": 9.555326477281816e-06,
"loss": 0.2303,
"step": 290
},
{
"epoch": 0.271455223880597,
"grad_norm": 1.3032836699591415,
"learning_rate": 9.55230115998413e-06,
"loss": 0.1913,
"step": 291
},
{
"epoch": 0.27238805970149255,
"grad_norm": 1.3592667166710317,
"learning_rate": 9.549266068489377e-06,
"loss": 0.1912,
"step": 292
},
{
"epoch": 0.2733208955223881,
"grad_norm": 1.3387689709683435,
"learning_rate": 9.546221209314172e-06,
"loss": 0.2436,
"step": 293
},
{
"epoch": 0.27425373134328357,
"grad_norm": 1.3735637776687901,
"learning_rate": 9.543166588996095e-06,
"loss": 0.2401,
"step": 294
},
{
"epoch": 0.2751865671641791,
"grad_norm": 1.5670786625519744,
"learning_rate": 9.540102214093696e-06,
"loss": 0.3073,
"step": 295
},
{
"epoch": 0.27611940298507465,
"grad_norm": 1.2913637891950454,
"learning_rate": 9.537028091186453e-06,
"loss": 0.205,
"step": 296
},
{
"epoch": 0.27705223880597013,
"grad_norm": 1.419222985662341,
"learning_rate": 9.533944226874787e-06,
"loss": 0.2355,
"step": 297
},
{
"epoch": 0.27798507462686567,
"grad_norm": 1.3369499615171254,
"learning_rate": 9.530850627780031e-06,
"loss": 0.2227,
"step": 298
},
{
"epoch": 0.2789179104477612,
"grad_norm": 1.302876915331245,
"learning_rate": 9.527747300544417e-06,
"loss": 0.2379,
"step": 299
},
{
"epoch": 0.2798507462686567,
"grad_norm": 1.3275069238503028,
"learning_rate": 9.524634251831064e-06,
"loss": 0.2041,
"step": 300
},
{
"epoch": 0.28078358208955223,
"grad_norm": 1.3029244538742029,
"learning_rate": 9.521511488323968e-06,
"loss": 0.2101,
"step": 301
},
{
"epoch": 0.28171641791044777,
"grad_norm": 1.4213044150670375,
"learning_rate": 9.518379016727979e-06,
"loss": 0.2401,
"step": 302
},
{
"epoch": 0.2826492537313433,
"grad_norm": 1.3307001580231097,
"learning_rate": 9.515236843768796e-06,
"loss": 0.2359,
"step": 303
},
{
"epoch": 0.2835820895522388,
"grad_norm": 1.3336151644523149,
"learning_rate": 9.512084976192944e-06,
"loss": 0.1733,
"step": 304
},
{
"epoch": 0.28451492537313433,
"grad_norm": 1.2890758576612307,
"learning_rate": 9.508923420767761e-06,
"loss": 0.2354,
"step": 305
},
{
"epoch": 0.28544776119402987,
"grad_norm": 1.449676011395037,
"learning_rate": 9.505752184281391e-06,
"loss": 0.2595,
"step": 306
},
{
"epoch": 0.28638059701492535,
"grad_norm": 1.3083299671213158,
"learning_rate": 9.502571273542765e-06,
"loss": 0.1848,
"step": 307
},
{
"epoch": 0.2873134328358209,
"grad_norm": 1.347052858071002,
"learning_rate": 9.499380695381577e-06,
"loss": 0.2129,
"step": 308
},
{
"epoch": 0.28824626865671643,
"grad_norm": 1.3624392783860741,
"learning_rate": 9.496180456648287e-06,
"loss": 0.1911,
"step": 309
},
{
"epoch": 0.2891791044776119,
"grad_norm": 1.384030158008443,
"learning_rate": 9.492970564214093e-06,
"loss": 0.2636,
"step": 310
},
{
"epoch": 0.29011194029850745,
"grad_norm": 1.2657375947456138,
"learning_rate": 9.489751024970917e-06,
"loss": 0.2169,
"step": 311
},
{
"epoch": 0.291044776119403,
"grad_norm": 1.336905624247156,
"learning_rate": 9.486521845831403e-06,
"loss": 0.2014,
"step": 312
},
{
"epoch": 0.29197761194029853,
"grad_norm": 1.3592589856849937,
"learning_rate": 9.48328303372888e-06,
"loss": 0.2385,
"step": 313
},
{
"epoch": 0.292910447761194,
"grad_norm": 1.4208885596358325,
"learning_rate": 9.480034595617374e-06,
"loss": 0.2375,
"step": 314
},
{
"epoch": 0.29384328358208955,
"grad_norm": 1.439265171883172,
"learning_rate": 9.476776538471567e-06,
"loss": 0.2033,
"step": 315
},
{
"epoch": 0.2947761194029851,
"grad_norm": 1.4281095450068615,
"learning_rate": 9.4735088692868e-06,
"loss": 0.2845,
"step": 316
},
{
"epoch": 0.2957089552238806,
"grad_norm": 1.4346746831675432,
"learning_rate": 9.470231595079051e-06,
"loss": 0.2325,
"step": 317
},
{
"epoch": 0.2966417910447761,
"grad_norm": 1.4014402224696572,
"learning_rate": 9.466944722884918e-06,
"loss": 0.1886,
"step": 318
},
{
"epoch": 0.29757462686567165,
"grad_norm": 1.317389277924677,
"learning_rate": 9.463648259761613e-06,
"loss": 0.21,
"step": 319
},
{
"epoch": 0.29850746268656714,
"grad_norm": 1.274308126049011,
"learning_rate": 9.460342212786933e-06,
"loss": 0.1835,
"step": 320
},
{
"epoch": 0.2994402985074627,
"grad_norm": 1.3212156231617627,
"learning_rate": 9.45702658905926e-06,
"loss": 0.226,
"step": 321
},
{
"epoch": 0.3003731343283582,
"grad_norm": 1.3164665027162752,
"learning_rate": 9.453701395697528e-06,
"loss": 0.2405,
"step": 322
},
{
"epoch": 0.30130597014925375,
"grad_norm": 1.3515078744929137,
"learning_rate": 9.450366639841232e-06,
"loss": 0.2784,
"step": 323
},
{
"epoch": 0.30223880597014924,
"grad_norm": 1.3701250089759658,
"learning_rate": 9.447022328650382e-06,
"loss": 0.2393,
"step": 324
},
{
"epoch": 0.3031716417910448,
"grad_norm": 1.4467880233772275,
"learning_rate": 9.44366846930552e-06,
"loss": 0.2465,
"step": 325
},
{
"epoch": 0.3041044776119403,
"grad_norm": 1.3771540876067927,
"learning_rate": 9.440305069007678e-06,
"loss": 0.2346,
"step": 326
},
{
"epoch": 0.3050373134328358,
"grad_norm": 1.3565472388783033,
"learning_rate": 9.436932134978378e-06,
"loss": 0.2531,
"step": 327
},
{
"epoch": 0.30597014925373134,
"grad_norm": 1.4340547477308785,
"learning_rate": 9.43354967445961e-06,
"loss": 0.252,
"step": 328
},
{
"epoch": 0.3069029850746269,
"grad_norm": 1.332979343322144,
"learning_rate": 9.430157694713817e-06,
"loss": 0.2229,
"step": 329
},
{
"epoch": 0.30783582089552236,
"grad_norm": 1.265965935369589,
"learning_rate": 9.426756203023886e-06,
"loss": 0.1874,
"step": 330
},
{
"epoch": 0.3087686567164179,
"grad_norm": 1.3230512165885997,
"learning_rate": 9.42334520669312e-06,
"loss": 0.1967,
"step": 331
},
{
"epoch": 0.30970149253731344,
"grad_norm": 1.3394167852727439,
"learning_rate": 9.419924713045234e-06,
"loss": 0.2153,
"step": 332
},
{
"epoch": 0.310634328358209,
"grad_norm": 1.394569996114462,
"learning_rate": 9.416494729424334e-06,
"loss": 0.2437,
"step": 333
},
{
"epoch": 0.31156716417910446,
"grad_norm": 1.3675625019234572,
"learning_rate": 9.413055263194902e-06,
"loss": 0.1944,
"step": 334
},
{
"epoch": 0.3125,
"grad_norm": 1.4436671719611258,
"learning_rate": 9.409606321741776e-06,
"loss": 0.2441,
"step": 335
},
{
"epoch": 0.31343283582089554,
"grad_norm": 1.3395252962767008,
"learning_rate": 9.406147912470142e-06,
"loss": 0.217,
"step": 336
},
{
"epoch": 0.314365671641791,
"grad_norm": 1.2362809735457574,
"learning_rate": 9.402680042805517e-06,
"loss": 0.1843,
"step": 337
},
{
"epoch": 0.31529850746268656,
"grad_norm": 1.4528930102963502,
"learning_rate": 9.399202720193723e-06,
"loss": 0.2532,
"step": 338
},
{
"epoch": 0.3162313432835821,
"grad_norm": 1.4476764879536321,
"learning_rate": 9.395715952100882e-06,
"loss": 0.2267,
"step": 339
},
{
"epoch": 0.31716417910447764,
"grad_norm": 1.3827760190955474,
"learning_rate": 9.392219746013399e-06,
"loss": 0.1961,
"step": 340
},
{
"epoch": 0.3180970149253731,
"grad_norm": 1.3224792417888853,
"learning_rate": 9.38871410943794e-06,
"loss": 0.229,
"step": 341
},
{
"epoch": 0.31902985074626866,
"grad_norm": 1.3670929236147178,
"learning_rate": 9.385199049901418e-06,
"loss": 0.2092,
"step": 342
},
{
"epoch": 0.3199626865671642,
"grad_norm": 1.5700881422687811,
"learning_rate": 9.381674574950981e-06,
"loss": 0.2812,
"step": 343
},
{
"epoch": 0.3208955223880597,
"grad_norm": 1.2819951775763399,
"learning_rate": 9.378140692153991e-06,
"loss": 0.2057,
"step": 344
},
{
"epoch": 0.3218283582089552,
"grad_norm": 1.4025000620916979,
"learning_rate": 9.374597409098011e-06,
"loss": 0.2824,
"step": 345
},
{
"epoch": 0.32276119402985076,
"grad_norm": 1.386974140069185,
"learning_rate": 9.371044733390786e-06,
"loss": 0.2108,
"step": 346
},
{
"epoch": 0.32369402985074625,
"grad_norm": 1.2954081638008537,
"learning_rate": 9.367482672660226e-06,
"loss": 0.2173,
"step": 347
},
{
"epoch": 0.3246268656716418,
"grad_norm": 1.3479486636153222,
"learning_rate": 9.363911234554394e-06,
"loss": 0.2013,
"step": 348
},
{
"epoch": 0.3255597014925373,
"grad_norm": 1.3024944216710168,
"learning_rate": 9.360330426741488e-06,
"loss": 0.2264,
"step": 349
},
{
"epoch": 0.32649253731343286,
"grad_norm": 1.336369716253106,
"learning_rate": 9.356740256909822e-06,
"loss": 0.1936,
"step": 350
},
{
"epoch": 0.32742537313432835,
"grad_norm": 1.4184100938638582,
"learning_rate": 9.353140732767811e-06,
"loss": 0.2476,
"step": 351
},
{
"epoch": 0.3283582089552239,
"grad_norm": 1.398208351719416,
"learning_rate": 9.349531862043952e-06,
"loss": 0.231,
"step": 352
},
{
"epoch": 0.3292910447761194,
"grad_norm": 1.4263392740126681,
"learning_rate": 9.345913652486815e-06,
"loss": 0.2102,
"step": 353
},
{
"epoch": 0.3302238805970149,
"grad_norm": 1.333836826585129,
"learning_rate": 9.342286111865023e-06,
"loss": 0.2261,
"step": 354
},
{
"epoch": 0.33115671641791045,
"grad_norm": 1.2767788258210142,
"learning_rate": 9.338649247967221e-06,
"loss": 0.2374,
"step": 355
},
{
"epoch": 0.332089552238806,
"grad_norm": 1.5017433864293555,
"learning_rate": 9.335003068602087e-06,
"loss": 0.2536,
"step": 356
},
{
"epoch": 0.33302238805970147,
"grad_norm": 1.3395453592365296,
"learning_rate": 9.33134758159829e-06,
"loss": 0.2158,
"step": 357
},
{
"epoch": 0.333955223880597,
"grad_norm": 1.1527112023803379,
"learning_rate": 9.32768279480449e-06,
"loss": 0.1606,
"step": 358
},
{
"epoch": 0.33488805970149255,
"grad_norm": 1.4608914907807746,
"learning_rate": 9.32400871608931e-06,
"loss": 0.2198,
"step": 359
},
{
"epoch": 0.3358208955223881,
"grad_norm": 1.4120297778863582,
"learning_rate": 9.320325353341325e-06,
"loss": 0.2271,
"step": 360
},
{
"epoch": 0.33675373134328357,
"grad_norm": 1.3454385285721666,
"learning_rate": 9.316632714469044e-06,
"loss": 0.2207,
"step": 361
},
{
"epoch": 0.3376865671641791,
"grad_norm": 1.4474892479226766,
"learning_rate": 9.312930807400893e-06,
"loss": 0.2536,
"step": 362
},
{
"epoch": 0.33861940298507465,
"grad_norm": 1.3463722181697488,
"learning_rate": 9.309219640085196e-06,
"loss": 0.1933,
"step": 363
},
{
"epoch": 0.33955223880597013,
"grad_norm": 1.28823123640731,
"learning_rate": 9.305499220490162e-06,
"loss": 0.2178,
"step": 364
},
{
"epoch": 0.34048507462686567,
"grad_norm": 1.4294791094282566,
"learning_rate": 9.30176955660386e-06,
"loss": 0.2698,
"step": 365
},
{
"epoch": 0.3414179104477612,
"grad_norm": 1.493909659198492,
"learning_rate": 9.298030656434215e-06,
"loss": 0.2423,
"step": 366
},
{
"epoch": 0.3423507462686567,
"grad_norm": 1.503880145512656,
"learning_rate": 9.294282528008981e-06,
"loss": 0.2257,
"step": 367
},
{
"epoch": 0.34328358208955223,
"grad_norm": 1.333114077695562,
"learning_rate": 9.290525179375722e-06,
"loss": 0.1707,
"step": 368
},
{
"epoch": 0.34421641791044777,
"grad_norm": 1.2349069876312035,
"learning_rate": 9.286758618601801e-06,
"loss": 0.1934,
"step": 369
},
{
"epoch": 0.3451492537313433,
"grad_norm": 1.4718018200541485,
"learning_rate": 9.28298285377436e-06,
"loss": 0.2482,
"step": 370
},
{
"epoch": 0.3460820895522388,
"grad_norm": 1.5413975233614134,
"learning_rate": 9.279197893000305e-06,
"loss": 0.2086,
"step": 371
},
{
"epoch": 0.34701492537313433,
"grad_norm": 1.4107305952629499,
"learning_rate": 9.275403744406282e-06,
"loss": 0.2305,
"step": 372
},
{
"epoch": 0.34794776119402987,
"grad_norm": 1.2780210723834629,
"learning_rate": 9.271600416138669e-06,
"loss": 0.2093,
"step": 373
},
{
"epoch": 0.34888059701492535,
"grad_norm": 1.3697674107174014,
"learning_rate": 9.26778791636355e-06,
"loss": 0.223,
"step": 374
},
{
"epoch": 0.3498134328358209,
"grad_norm": 1.4273089435605406,
"learning_rate": 9.263966253266705e-06,
"loss": 0.2368,
"step": 375
},
{
"epoch": 0.35074626865671643,
"grad_norm": 1.3805965144164551,
"learning_rate": 9.260135435053583e-06,
"loss": 0.2178,
"step": 376
},
{
"epoch": 0.3516791044776119,
"grad_norm": 1.3842697410395384,
"learning_rate": 9.256295469949295e-06,
"loss": 0.2333,
"step": 377
},
{
"epoch": 0.35261194029850745,
"grad_norm": 1.302286084633613,
"learning_rate": 9.252446366198586e-06,
"loss": 0.1893,
"step": 378
},
{
"epoch": 0.353544776119403,
"grad_norm": 1.361784401424521,
"learning_rate": 9.248588132065828e-06,
"loss": 0.2469,
"step": 379
},
{
"epoch": 0.35447761194029853,
"grad_norm": 1.356482260144131,
"learning_rate": 9.244720775834993e-06,
"loss": 0.2284,
"step": 380
},
{
"epoch": 0.355410447761194,
"grad_norm": 1.3825317630910496,
"learning_rate": 9.240844305809641e-06,
"loss": 0.2309,
"step": 381
},
{
"epoch": 0.35634328358208955,
"grad_norm": 1.3391371234122509,
"learning_rate": 9.2369587303129e-06,
"loss": 0.2131,
"step": 382
},
{
"epoch": 0.3572761194029851,
"grad_norm": 1.4748961962979945,
"learning_rate": 9.233064057687444e-06,
"loss": 0.2088,
"step": 383
},
{
"epoch": 0.3582089552238806,
"grad_norm": 1.4605973374859904,
"learning_rate": 9.229160296295488e-06,
"loss": 0.2232,
"step": 384
},
{
"epoch": 0.3591417910447761,
"grad_norm": 1.3009340090036816,
"learning_rate": 9.225247454518752e-06,
"loss": 0.1865,
"step": 385
},
{
"epoch": 0.36007462686567165,
"grad_norm": 1.337221511833832,
"learning_rate": 9.221325540758459e-06,
"loss": 0.1986,
"step": 386
},
{
"epoch": 0.36100746268656714,
"grad_norm": 1.4010446178744635,
"learning_rate": 9.217394563435306e-06,
"loss": 0.2581,
"step": 387
},
{
"epoch": 0.3619402985074627,
"grad_norm": 1.2816959963616372,
"learning_rate": 9.213454530989454e-06,
"loss": 0.1839,
"step": 388
},
{
"epoch": 0.3628731343283582,
"grad_norm": 1.3669145076068119,
"learning_rate": 9.209505451880504e-06,
"loss": 0.1982,
"step": 389
},
{
"epoch": 0.36380597014925375,
"grad_norm": 1.3905705737339125,
"learning_rate": 9.205547334587483e-06,
"loss": 0.2397,
"step": 390
},
{
"epoch": 0.36473880597014924,
"grad_norm": 1.2603360041662903,
"learning_rate": 9.201580187608818e-06,
"loss": 0.2043,
"step": 391
},
{
"epoch": 0.3656716417910448,
"grad_norm": 1.5586670110559833,
"learning_rate": 9.197604019462334e-06,
"loss": 0.2125,
"step": 392
},
{
"epoch": 0.3666044776119403,
"grad_norm": 1.2690662617236015,
"learning_rate": 9.193618838685213e-06,
"loss": 0.1993,
"step": 393
},
{
"epoch": 0.3675373134328358,
"grad_norm": 1.341769910680454,
"learning_rate": 9.189624653833997e-06,
"loss": 0.2118,
"step": 394
},
{
"epoch": 0.36847014925373134,
"grad_norm": 1.3367513943714662,
"learning_rate": 9.185621473484558e-06,
"loss": 0.2473,
"step": 395
},
{
"epoch": 0.3694029850746269,
"grad_norm": 1.3820435433945204,
"learning_rate": 9.18160930623208e-06,
"loss": 0.2119,
"step": 396
},
{
"epoch": 0.37033582089552236,
"grad_norm": 1.2943072011184742,
"learning_rate": 9.177588160691044e-06,
"loss": 0.2035,
"step": 397
},
{
"epoch": 0.3712686567164179,
"grad_norm": 1.3473164568630493,
"learning_rate": 9.173558045495212e-06,
"loss": 0.1908,
"step": 398
},
{
"epoch": 0.37220149253731344,
"grad_norm": 1.4333605080144285,
"learning_rate": 9.169518969297598e-06,
"loss": 0.2524,
"step": 399
},
{
"epoch": 0.373134328358209,
"grad_norm": 1.3166066985246667,
"learning_rate": 9.165470940770458e-06,
"loss": 0.2094,
"step": 400
},
{
"epoch": 0.37406716417910446,
"grad_norm": 1.4239397473404334,
"learning_rate": 9.161413968605274e-06,
"loss": 0.2357,
"step": 401
},
{
"epoch": 0.375,
"grad_norm": 1.3411122706114262,
"learning_rate": 9.157348061512728e-06,
"loss": 0.2254,
"step": 402
},
{
"epoch": 0.37593283582089554,
"grad_norm": 1.2846258510307025,
"learning_rate": 9.15327322822268e-06,
"loss": 0.1828,
"step": 403
},
{
"epoch": 0.376865671641791,
"grad_norm": 1.2409415483459634,
"learning_rate": 9.149189477484169e-06,
"loss": 0.2003,
"step": 404
},
{
"epoch": 0.37779850746268656,
"grad_norm": 1.2227527914049607,
"learning_rate": 9.145096818065365e-06,
"loss": 0.2259,
"step": 405
},
{
"epoch": 0.3787313432835821,
"grad_norm": 1.4138015334010903,
"learning_rate": 9.140995258753577e-06,
"loss": 0.2522,
"step": 406
},
{
"epoch": 0.37966417910447764,
"grad_norm": 1.4737839013873402,
"learning_rate": 9.136884808355219e-06,
"loss": 0.2929,
"step": 407
},
{
"epoch": 0.3805970149253731,
"grad_norm": 1.4308442254453198,
"learning_rate": 9.132765475695795e-06,
"loss": 0.2169,
"step": 408
},
{
"epoch": 0.38152985074626866,
"grad_norm": 1.290712133189965,
"learning_rate": 9.128637269619878e-06,
"loss": 0.2192,
"step": 409
},
{
"epoch": 0.3824626865671642,
"grad_norm": 1.3116978703979065,
"learning_rate": 9.124500198991098e-06,
"loss": 0.2167,
"step": 410
},
{
"epoch": 0.3833955223880597,
"grad_norm": 1.2736502633453959,
"learning_rate": 9.12035427269211e-06,
"loss": 0.2052,
"step": 411
},
{
"epoch": 0.3843283582089552,
"grad_norm": 1.3231801576512818,
"learning_rate": 9.116199499624596e-06,
"loss": 0.2396,
"step": 412
},
{
"epoch": 0.38526119402985076,
"grad_norm": 1.2853453099268355,
"learning_rate": 9.112035888709219e-06,
"loss": 0.2299,
"step": 413
},
{
"epoch": 0.38619402985074625,
"grad_norm": 1.2938537527377314,
"learning_rate": 9.107863448885624e-06,
"loss": 0.1803,
"step": 414
},
{
"epoch": 0.3871268656716418,
"grad_norm": 1.4049132483353646,
"learning_rate": 9.103682189112413e-06,
"loss": 0.2356,
"step": 415
},
{
"epoch": 0.3880597014925373,
"grad_norm": 1.2872197040412747,
"learning_rate": 9.099492118367123e-06,
"loss": 0.1891,
"step": 416
},
{
"epoch": 0.38899253731343286,
"grad_norm": 1.3013097402971163,
"learning_rate": 9.095293245646212e-06,
"loss": 0.2198,
"step": 417
},
{
"epoch": 0.38992537313432835,
"grad_norm": 1.3448050767428346,
"learning_rate": 9.091085579965034e-06,
"loss": 0.2813,
"step": 418
},
{
"epoch": 0.3908582089552239,
"grad_norm": 1.3765739745948038,
"learning_rate": 9.08686913035782e-06,
"loss": 0.2607,
"step": 419
},
{
"epoch": 0.3917910447761194,
"grad_norm": 1.2736711574783002,
"learning_rate": 9.082643905877665e-06,
"loss": 0.1773,
"step": 420
},
{
"epoch": 0.3927238805970149,
"grad_norm": 1.2635354964139183,
"learning_rate": 9.078409915596506e-06,
"loss": 0.2189,
"step": 421
},
{
"epoch": 0.39365671641791045,
"grad_norm": 1.3303045131479903,
"learning_rate": 9.074167168605096e-06,
"loss": 0.1938,
"step": 422
},
{
"epoch": 0.394589552238806,
"grad_norm": 1.2925868456600165,
"learning_rate": 9.069915674012995e-06,
"loss": 0.1588,
"step": 423
},
{
"epoch": 0.39552238805970147,
"grad_norm": 1.3442352091237408,
"learning_rate": 9.065655440948536e-06,
"loss": 0.2465,
"step": 424
},
{
"epoch": 0.396455223880597,
"grad_norm": 1.3209591762725579,
"learning_rate": 9.061386478558822e-06,
"loss": 0.2311,
"step": 425
},
{
"epoch": 0.39738805970149255,
"grad_norm": 1.3202035679156854,
"learning_rate": 9.057108796009697e-06,
"loss": 0.2542,
"step": 426
},
{
"epoch": 0.3983208955223881,
"grad_norm": 1.2533498431687355,
"learning_rate": 9.052822402485727e-06,
"loss": 0.1865,
"step": 427
},
{
"epoch": 0.39925373134328357,
"grad_norm": 1.2299852771240687,
"learning_rate": 9.048527307190182e-06,
"loss": 0.2163,
"step": 428
},
{
"epoch": 0.4001865671641791,
"grad_norm": 1.2503929440804913,
"learning_rate": 9.044223519345016e-06,
"loss": 0.1874,
"step": 429
},
{
"epoch": 0.40111940298507465,
"grad_norm": 1.3216934001751737,
"learning_rate": 9.039911048190843e-06,
"loss": 0.2123,
"step": 430
},
{
"epoch": 0.40205223880597013,
"grad_norm": 1.3239969525003676,
"learning_rate": 9.035589902986928e-06,
"loss": 0.2043,
"step": 431
},
{
"epoch": 0.40298507462686567,
"grad_norm": 1.412007085946633,
"learning_rate": 9.03126009301115e-06,
"loss": 0.2275,
"step": 432
},
{
"epoch": 0.4039179104477612,
"grad_norm": 1.3728799593814967,
"learning_rate": 9.026921627560001e-06,
"loss": 0.2259,
"step": 433
},
{
"epoch": 0.4048507462686567,
"grad_norm": 1.358199171602574,
"learning_rate": 9.022574515948554e-06,
"loss": 0.2502,
"step": 434
},
{
"epoch": 0.40578358208955223,
"grad_norm": 1.2294059495823382,
"learning_rate": 9.018218767510445e-06,
"loss": 0.2121,
"step": 435
},
{
"epoch": 0.40671641791044777,
"grad_norm": 1.4141686440372585,
"learning_rate": 9.013854391597856e-06,
"loss": 0.2779,
"step": 436
},
{
"epoch": 0.4076492537313433,
"grad_norm": 1.2946191221322243,
"learning_rate": 9.009481397581489e-06,
"loss": 0.1833,
"step": 437
},
{
"epoch": 0.4085820895522388,
"grad_norm": 1.351996529947262,
"learning_rate": 9.005099794850554e-06,
"loss": 0.2359,
"step": 438
},
{
"epoch": 0.40951492537313433,
"grad_norm": 1.301343400669271,
"learning_rate": 9.000709592812743e-06,
"loss": 0.2247,
"step": 439
},
{
"epoch": 0.41044776119402987,
"grad_norm": 1.352989620123602,
"learning_rate": 8.996310800894215e-06,
"loss": 0.2334,
"step": 440
},
{
"epoch": 0.41138059701492535,
"grad_norm": 1.3175223253148902,
"learning_rate": 8.991903428539566e-06,
"loss": 0.244,
"step": 441
},
{
"epoch": 0.4123134328358209,
"grad_norm": 1.2973696190684876,
"learning_rate": 8.987487485211817e-06,
"loss": 0.2017,
"step": 442
},
{
"epoch": 0.41324626865671643,
"grad_norm": 1.3589491444232285,
"learning_rate": 8.983062980392394e-06,
"loss": 0.2234,
"step": 443
},
{
"epoch": 0.4141791044776119,
"grad_norm": 1.3714705942529328,
"learning_rate": 8.978629923581104e-06,
"loss": 0.2104,
"step": 444
},
{
"epoch": 0.41511194029850745,
"grad_norm": 1.2643963582167521,
"learning_rate": 8.974188324296115e-06,
"loss": 0.2061,
"step": 445
},
{
"epoch": 0.416044776119403,
"grad_norm": 1.374417969556956,
"learning_rate": 8.969738192073939e-06,
"loss": 0.2379,
"step": 446
},
{
"epoch": 0.41697761194029853,
"grad_norm": 1.2795266726469274,
"learning_rate": 8.965279536469406e-06,
"loss": 0.1792,
"step": 447
},
{
"epoch": 0.417910447761194,
"grad_norm": 1.539527566481072,
"learning_rate": 8.960812367055646e-06,
"loss": 0.2795,
"step": 448
},
{
"epoch": 0.41884328358208955,
"grad_norm": 1.3966125864951573,
"learning_rate": 8.956336693424076e-06,
"loss": 0.2622,
"step": 449
},
{
"epoch": 0.4197761194029851,
"grad_norm": 1.3281178679930612,
"learning_rate": 8.951852525184361e-06,
"loss": 0.2047,
"step": 450
},
{
"epoch": 0.4207089552238806,
"grad_norm": 1.343025406228925,
"learning_rate": 8.947359871964415e-06,
"loss": 0.2581,
"step": 451
},
{
"epoch": 0.4216417910447761,
"grad_norm": 1.2642884902145128,
"learning_rate": 8.94285874341036e-06,
"loss": 0.2315,
"step": 452
},
{
"epoch": 0.42257462686567165,
"grad_norm": 1.2777621737129141,
"learning_rate": 8.938349149186527e-06,
"loss": 0.2081,
"step": 453
},
{
"epoch": 0.42350746268656714,
"grad_norm": 1.3748536858035563,
"learning_rate": 8.933831098975416e-06,
"loss": 0.2615,
"step": 454
},
{
"epoch": 0.4244402985074627,
"grad_norm": 1.331415513725724,
"learning_rate": 8.929304602477681e-06,
"loss": 0.2468,
"step": 455
},
{
"epoch": 0.4253731343283582,
"grad_norm": 1.2761378826109053,
"learning_rate": 8.924769669412117e-06,
"loss": 0.2239,
"step": 456
},
{
"epoch": 0.42630597014925375,
"grad_norm": 1.284142601434654,
"learning_rate": 8.92022630951563e-06,
"loss": 0.2071,
"step": 457
},
{
"epoch": 0.42723880597014924,
"grad_norm": 1.3209951417631995,
"learning_rate": 8.915674532543218e-06,
"loss": 0.2172,
"step": 458
},
{
"epoch": 0.4281716417910448,
"grad_norm": 1.271616268742621,
"learning_rate": 8.911114348267954e-06,
"loss": 0.22,
"step": 459
},
{
"epoch": 0.4291044776119403,
"grad_norm": 1.257564050424032,
"learning_rate": 8.906545766480961e-06,
"loss": 0.2041,
"step": 460
},
{
"epoch": 0.4300373134328358,
"grad_norm": 1.2645376908104489,
"learning_rate": 8.90196879699139e-06,
"loss": 0.1872,
"step": 461
},
{
"epoch": 0.43097014925373134,
"grad_norm": 1.4142300040480527,
"learning_rate": 8.897383449626407e-06,
"loss": 0.2538,
"step": 462
},
{
"epoch": 0.4319029850746269,
"grad_norm": 1.3758724410443472,
"learning_rate": 8.892789734231158e-06,
"loss": 0.2524,
"step": 463
},
{
"epoch": 0.43283582089552236,
"grad_norm": 1.3530802717750985,
"learning_rate": 8.888187660668762e-06,
"loss": 0.2343,
"step": 464
},
{
"epoch": 0.4337686567164179,
"grad_norm": 1.410062910966012,
"learning_rate": 8.88357723882028e-06,
"loss": 0.2316,
"step": 465
},
{
"epoch": 0.43470149253731344,
"grad_norm": 1.2537537090855193,
"learning_rate": 8.878958478584702e-06,
"loss": 0.2089,
"step": 466
},
{
"epoch": 0.435634328358209,
"grad_norm": 1.2951276863764574,
"learning_rate": 8.87433138987892e-06,
"loss": 0.2362,
"step": 467
},
{
"epoch": 0.43656716417910446,
"grad_norm": 1.2881126860233525,
"learning_rate": 8.869695982637703e-06,
"loss": 0.2003,
"step": 468
},
{
"epoch": 0.4375,
"grad_norm": 1.277822481224725,
"learning_rate": 8.865052266813686e-06,
"loss": 0.2026,
"step": 469
},
{
"epoch": 0.43843283582089554,
"grad_norm": 1.4012002589366537,
"learning_rate": 8.86040025237734e-06,
"loss": 0.2559,
"step": 470
},
{
"epoch": 0.439365671641791,
"grad_norm": 1.4071481786290667,
"learning_rate": 8.855739949316957e-06,
"loss": 0.2396,
"step": 471
},
{
"epoch": 0.44029850746268656,
"grad_norm": 1.2757247732597385,
"learning_rate": 8.851071367638625e-06,
"loss": 0.2242,
"step": 472
},
{
"epoch": 0.4412313432835821,
"grad_norm": 1.394162739130695,
"learning_rate": 8.846394517366202e-06,
"loss": 0.2304,
"step": 473
},
{
"epoch": 0.44216417910447764,
"grad_norm": 1.2545998211941618,
"learning_rate": 8.841709408541304e-06,
"loss": 0.2248,
"step": 474
},
{
"epoch": 0.4430970149253731,
"grad_norm": 1.3407244194077155,
"learning_rate": 8.837016051223281e-06,
"loss": 0.2599,
"step": 475
},
{
"epoch": 0.44402985074626866,
"grad_norm": 1.3149582884940112,
"learning_rate": 8.832314455489188e-06,
"loss": 0.2129,
"step": 476
},
{
"epoch": 0.4449626865671642,
"grad_norm": 1.332806260986635,
"learning_rate": 8.827604631433771e-06,
"loss": 0.2353,
"step": 477
},
{
"epoch": 0.4458955223880597,
"grad_norm": 1.219532308210816,
"learning_rate": 8.822886589169443e-06,
"loss": 0.2008,
"step": 478
},
{
"epoch": 0.4468283582089552,
"grad_norm": 1.2571868893463725,
"learning_rate": 8.818160338826262e-06,
"loss": 0.1888,
"step": 479
},
{
"epoch": 0.44776119402985076,
"grad_norm": 1.250447624595975,
"learning_rate": 8.81342589055191e-06,
"loss": 0.2034,
"step": 480
},
{
"epoch": 0.44869402985074625,
"grad_norm": 1.432874296705188,
"learning_rate": 8.80868325451167e-06,
"loss": 0.2098,
"step": 481
},
{
"epoch": 0.4496268656716418,
"grad_norm": 1.1940303384477422,
"learning_rate": 8.803932440888404e-06,
"loss": 0.1792,
"step": 482
},
{
"epoch": 0.4505597014925373,
"grad_norm": 1.3281007803218612,
"learning_rate": 8.799173459882534e-06,
"loss": 0.2309,
"step": 483
},
{
"epoch": 0.45149253731343286,
"grad_norm": 1.4531661248443652,
"learning_rate": 8.794406321712017e-06,
"loss": 0.2171,
"step": 484
},
{
"epoch": 0.45242537313432835,
"grad_norm": 1.2485056199303581,
"learning_rate": 8.789631036612324e-06,
"loss": 0.1974,
"step": 485
},
{
"epoch": 0.4533582089552239,
"grad_norm": 1.3147902796079114,
"learning_rate": 8.784847614836418e-06,
"loss": 0.2238,
"step": 486
},
{
"epoch": 0.4542910447761194,
"grad_norm": 1.2516311910221467,
"learning_rate": 8.780056066654734e-06,
"loss": 0.188,
"step": 487
},
{
"epoch": 0.4552238805970149,
"grad_norm": 1.418005163198149,
"learning_rate": 8.775256402355155e-06,
"loss": 0.1961,
"step": 488
},
{
"epoch": 0.45615671641791045,
"grad_norm": 1.4460731180502704,
"learning_rate": 8.770448632242984e-06,
"loss": 0.2675,
"step": 489
},
{
"epoch": 0.457089552238806,
"grad_norm": 1.418299887927644,
"learning_rate": 8.765632766640937e-06,
"loss": 0.2119,
"step": 490
},
{
"epoch": 0.45802238805970147,
"grad_norm": 1.4354119801995215,
"learning_rate": 8.760808815889105e-06,
"loss": 0.2301,
"step": 491
},
{
"epoch": 0.458955223880597,
"grad_norm": 1.2713842322726168,
"learning_rate": 8.755976790344945e-06,
"loss": 0.2129,
"step": 492
},
{
"epoch": 0.45988805970149255,
"grad_norm": 1.3143191873476805,
"learning_rate": 8.751136700383243e-06,
"loss": 0.2182,
"step": 493
},
{
"epoch": 0.4608208955223881,
"grad_norm": 1.348721547945487,
"learning_rate": 8.746288556396104e-06,
"loss": 0.2323,
"step": 494
},
{
"epoch": 0.46175373134328357,
"grad_norm": 1.3836440530622345,
"learning_rate": 8.74143236879293e-06,
"loss": 0.2389,
"step": 495
},
{
"epoch": 0.4626865671641791,
"grad_norm": 1.2376424574935958,
"learning_rate": 8.736568148000386e-06,
"loss": 0.19,
"step": 496
},
{
"epoch": 0.46361940298507465,
"grad_norm": 1.2986566312073538,
"learning_rate": 8.731695904462389e-06,
"loss": 0.2358,
"step": 497
},
{
"epoch": 0.46455223880597013,
"grad_norm": 1.2552562207036049,
"learning_rate": 8.726815648640084e-06,
"loss": 0.1946,
"step": 498
},
{
"epoch": 0.46548507462686567,
"grad_norm": 1.2972481020683562,
"learning_rate": 8.721927391011812e-06,
"loss": 0.2122,
"step": 499
},
{
"epoch": 0.4664179104477612,
"grad_norm": 1.383067739478344,
"learning_rate": 8.7170311420731e-06,
"loss": 0.2672,
"step": 500
},
{
"epoch": 0.4664179104477612,
"eval_loss": 0.23051877319812775,
"eval_runtime": 3.439,
"eval_samples_per_second": 25.298,
"eval_steps_per_second": 6.397,
"step": 500
},
{
"epoch": 0.4673507462686567,
"grad_norm": 1.2632786993125102,
"learning_rate": 8.712126912336631e-06,
"loss": 0.207,
"step": 501
},
{
"epoch": 0.46828358208955223,
"grad_norm": 1.2435472593530392,
"learning_rate": 8.707214712332227e-06,
"loss": 0.1969,
"step": 502
},
{
"epoch": 0.46921641791044777,
"grad_norm": 1.371389841592427,
"learning_rate": 8.702294552606815e-06,
"loss": 0.2569,
"step": 503
},
{
"epoch": 0.4701492537313433,
"grad_norm": 1.1936395590557582,
"learning_rate": 8.697366443724424e-06,
"loss": 0.1828,
"step": 504
},
{
"epoch": 0.4710820895522388,
"grad_norm": 1.1722012737846563,
"learning_rate": 8.692430396266138e-06,
"loss": 0.1936,
"step": 505
},
{
"epoch": 0.47201492537313433,
"grad_norm": 1.403394574957527,
"learning_rate": 8.687486420830093e-06,
"loss": 0.2647,
"step": 506
},
{
"epoch": 0.47294776119402987,
"grad_norm": 1.2870699600150621,
"learning_rate": 8.682534528031447e-06,
"loss": 0.212,
"step": 507
},
{
"epoch": 0.47388059701492535,
"grad_norm": 1.3177979647832148,
"learning_rate": 8.677574728502355e-06,
"loss": 0.2261,
"step": 508
},
{
"epoch": 0.4748134328358209,
"grad_norm": 1.3767619394332882,
"learning_rate": 8.67260703289195e-06,
"loss": 0.2443,
"step": 509
},
{
"epoch": 0.47574626865671643,
"grad_norm": 1.274531083090423,
"learning_rate": 8.667631451866317e-06,
"loss": 0.1821,
"step": 510
},
{
"epoch": 0.4766791044776119,
"grad_norm": 1.4471466002963596,
"learning_rate": 8.662647996108475e-06,
"loss": 0.2135,
"step": 511
},
{
"epoch": 0.47761194029850745,
"grad_norm": 1.3413363690939575,
"learning_rate": 8.657656676318346e-06,
"loss": 0.2019,
"step": 512
},
{
"epoch": 0.478544776119403,
"grad_norm": 1.3457177629223023,
"learning_rate": 8.65265750321274e-06,
"loss": 0.2698,
"step": 513
},
{
"epoch": 0.47947761194029853,
"grad_norm": 1.2975394462316192,
"learning_rate": 8.64765048752533e-06,
"loss": 0.2537,
"step": 514
},
{
"epoch": 0.480410447761194,
"grad_norm": 1.380115000712049,
"learning_rate": 8.642635640006623e-06,
"loss": 0.2151,
"step": 515
},
{
"epoch": 0.48134328358208955,
"grad_norm": 1.2294150732600446,
"learning_rate": 8.637612971423945e-06,
"loss": 0.1742,
"step": 516
},
{
"epoch": 0.4822761194029851,
"grad_norm": 1.3408677173066745,
"learning_rate": 8.632582492561414e-06,
"loss": 0.2137,
"step": 517
},
{
"epoch": 0.4832089552238806,
"grad_norm": 1.2584110827285158,
"learning_rate": 8.627544214219918e-06,
"loss": 0.2281,
"step": 518
},
{
"epoch": 0.4841417910447761,
"grad_norm": 1.4625077729613687,
"learning_rate": 8.622498147217091e-06,
"loss": 0.2527,
"step": 519
},
{
"epoch": 0.48507462686567165,
"grad_norm": 1.333638345288828,
"learning_rate": 8.617444302387288e-06,
"loss": 0.1941,
"step": 520
},
{
"epoch": 0.48600746268656714,
"grad_norm": 1.2537139376845916,
"learning_rate": 8.612382690581567e-06,
"loss": 0.2236,
"step": 521
},
{
"epoch": 0.4869402985074627,
"grad_norm": 1.2840159146700987,
"learning_rate": 8.607313322667657e-06,
"loss": 0.2346,
"step": 522
},
{
"epoch": 0.4878731343283582,
"grad_norm": 1.391601937565303,
"learning_rate": 8.602236209529948e-06,
"loss": 0.224,
"step": 523
},
{
"epoch": 0.48880597014925375,
"grad_norm": 1.3247869152095644,
"learning_rate": 8.597151362069452e-06,
"loss": 0.2217,
"step": 524
},
{
"epoch": 0.48973880597014924,
"grad_norm": 1.4021254940584056,
"learning_rate": 8.59205879120379e-06,
"loss": 0.201,
"step": 525
},
{
"epoch": 0.4906716417910448,
"grad_norm": 1.3157615398452691,
"learning_rate": 8.58695850786717e-06,
"loss": 0.2187,
"step": 526
},
{
"epoch": 0.4916044776119403,
"grad_norm": 1.3532240240654663,
"learning_rate": 8.581850523010353e-06,
"loss": 0.2518,
"step": 527
},
{
"epoch": 0.4925373134328358,
"grad_norm": 1.3134374105006694,
"learning_rate": 8.576734847600639e-06,
"loss": 0.2012,
"step": 528
},
{
"epoch": 0.49347014925373134,
"grad_norm": 1.3550733246495734,
"learning_rate": 8.571611492621839e-06,
"loss": 0.1959,
"step": 529
},
{
"epoch": 0.4944029850746269,
"grad_norm": 1.3414829685575018,
"learning_rate": 8.566480469074256e-06,
"loss": 0.2169,
"step": 530
},
{
"epoch": 0.49533582089552236,
"grad_norm": 1.390360326736497,
"learning_rate": 8.561341787974653e-06,
"loss": 0.2779,
"step": 531
},
{
"epoch": 0.4962686567164179,
"grad_norm": 1.2680890882771152,
"learning_rate": 8.55619546035624e-06,
"loss": 0.2351,
"step": 532
},
{
"epoch": 0.49720149253731344,
"grad_norm": 1.262419636891151,
"learning_rate": 8.55104149726864e-06,
"loss": 0.2307,
"step": 533
},
{
"epoch": 0.498134328358209,
"grad_norm": 1.2957731983042295,
"learning_rate": 8.545879909777872e-06,
"loss": 0.2002,
"step": 534
},
{
"epoch": 0.49906716417910446,
"grad_norm": 1.2538569059905391,
"learning_rate": 8.540710708966326e-06,
"loss": 0.214,
"step": 535
},
{
"epoch": 0.5,
"grad_norm": 1.2924302879204521,
"learning_rate": 8.535533905932739e-06,
"loss": 0.2009,
"step": 536
},
{
"epoch": 0.5009328358208955,
"grad_norm": 1.2711717751935465,
"learning_rate": 8.530349511792165e-06,
"loss": 0.214,
"step": 537
},
{
"epoch": 0.5018656716417911,
"grad_norm": 1.3427986027184053,
"learning_rate": 8.525157537675966e-06,
"loss": 0.2289,
"step": 538
},
{
"epoch": 0.5027985074626866,
"grad_norm": 1.3073171392549023,
"learning_rate": 8.519957994731768e-06,
"loss": 0.2105,
"step": 539
},
{
"epoch": 0.503731343283582,
"grad_norm": 1.2722660142973075,
"learning_rate": 8.514750894123463e-06,
"loss": 0.1889,
"step": 540
},
{
"epoch": 0.5046641791044776,
"grad_norm": 1.2452516022238156,
"learning_rate": 8.509536247031152e-06,
"loss": 0.2055,
"step": 541
},
{
"epoch": 0.5055970149253731,
"grad_norm": 1.2974718233932705,
"learning_rate": 8.504314064651154e-06,
"loss": 0.1962,
"step": 542
},
{
"epoch": 0.5065298507462687,
"grad_norm": 1.389455409648838,
"learning_rate": 8.499084358195957e-06,
"loss": 0.2399,
"step": 543
},
{
"epoch": 0.5074626865671642,
"grad_norm": 1.3616712771765143,
"learning_rate": 8.49384713889421e-06,
"loss": 0.2282,
"step": 544
},
{
"epoch": 0.5083955223880597,
"grad_norm": 1.234256347358437,
"learning_rate": 8.488602417990687e-06,
"loss": 0.1911,
"step": 545
},
{
"epoch": 0.5093283582089553,
"grad_norm": 1.5012470347965328,
"learning_rate": 8.483350206746277e-06,
"loss": 0.2228,
"step": 546
},
{
"epoch": 0.5102611940298507,
"grad_norm": 1.3895212595531514,
"learning_rate": 8.478090516437947e-06,
"loss": 0.2267,
"step": 547
},
{
"epoch": 0.5111940298507462,
"grad_norm": 1.3944450308287069,
"learning_rate": 8.472823358358716e-06,
"loss": 0.2124,
"step": 548
},
{
"epoch": 0.5121268656716418,
"grad_norm": 1.241137196118042,
"learning_rate": 8.467548743817645e-06,
"loss": 0.2131,
"step": 549
},
{
"epoch": 0.5130597014925373,
"grad_norm": 1.1941039288456845,
"learning_rate": 8.462266684139805e-06,
"loss": 0.1896,
"step": 550
},
{
"epoch": 0.5139925373134329,
"grad_norm": 1.37141699422829,
"learning_rate": 8.456977190666247e-06,
"loss": 0.203,
"step": 551
},
{
"epoch": 0.5149253731343284,
"grad_norm": 1.264274514723379,
"learning_rate": 8.451680274753986e-06,
"loss": 0.1992,
"step": 552
},
{
"epoch": 0.5158582089552238,
"grad_norm": 1.2475581664048347,
"learning_rate": 8.446375947775976e-06,
"loss": 0.21,
"step": 553
},
{
"epoch": 0.5167910447761194,
"grad_norm": 1.1750175255906279,
"learning_rate": 8.441064221121078e-06,
"loss": 0.1775,
"step": 554
},
{
"epoch": 0.5177238805970149,
"grad_norm": 1.250151778052525,
"learning_rate": 8.435745106194043e-06,
"loss": 0.2067,
"step": 555
},
{
"epoch": 0.5186567164179104,
"grad_norm": 1.3309654266729336,
"learning_rate": 8.430418614415488e-06,
"loss": 0.2342,
"step": 556
},
{
"epoch": 0.519589552238806,
"grad_norm": 1.318453627016584,
"learning_rate": 8.425084757221864e-06,
"loss": 0.2055,
"step": 557
},
{
"epoch": 0.5205223880597015,
"grad_norm": 1.418140023992704,
"learning_rate": 8.419743546065442e-06,
"loss": 0.2184,
"step": 558
},
{
"epoch": 0.5214552238805971,
"grad_norm": 1.3959518735990981,
"learning_rate": 8.414394992414276e-06,
"loss": 0.2429,
"step": 559
},
{
"epoch": 0.5223880597014925,
"grad_norm": 1.2598734786346006,
"learning_rate": 8.40903910775219e-06,
"loss": 0.1856,
"step": 560
},
{
"epoch": 0.523320895522388,
"grad_norm": 1.3414907853633748,
"learning_rate": 8.403675903578745e-06,
"loss": 0.2222,
"step": 561
},
{
"epoch": 0.5242537313432836,
"grad_norm": 1.3347668559036316,
"learning_rate": 8.398305391409221e-06,
"loss": 0.2392,
"step": 562
},
{
"epoch": 0.5251865671641791,
"grad_norm": 1.3978351879329773,
"learning_rate": 8.392927582774586e-06,
"loss": 0.2579,
"step": 563
},
{
"epoch": 0.5261194029850746,
"grad_norm": 1.2856902817689613,
"learning_rate": 8.387542489221477e-06,
"loss": 0.2355,
"step": 564
},
{
"epoch": 0.5270522388059702,
"grad_norm": 1.3482815664102457,
"learning_rate": 8.38215012231217e-06,
"loss": 0.2072,
"step": 565
},
{
"epoch": 0.5279850746268657,
"grad_norm": 1.2983598827313911,
"learning_rate": 8.376750493624556e-06,
"loss": 0.2077,
"step": 566
},
{
"epoch": 0.5289179104477612,
"grad_norm": 1.3895329159962166,
"learning_rate": 8.371343614752124e-06,
"loss": 0.2021,
"step": 567
},
{
"epoch": 0.5298507462686567,
"grad_norm": 1.3295405138308913,
"learning_rate": 8.36592949730392e-06,
"loss": 0.199,
"step": 568
},
{
"epoch": 0.5307835820895522,
"grad_norm": 1.283862155323016,
"learning_rate": 8.360508152904544e-06,
"loss": 0.2216,
"step": 569
},
{
"epoch": 0.5317164179104478,
"grad_norm": 1.24320649554638,
"learning_rate": 8.355079593194102e-06,
"loss": 0.1912,
"step": 570
},
{
"epoch": 0.5326492537313433,
"grad_norm": 1.4117502387459255,
"learning_rate": 8.349643829828198e-06,
"loss": 0.2304,
"step": 571
},
{
"epoch": 0.5335820895522388,
"grad_norm": 1.3931937360094073,
"learning_rate": 8.344200874477901e-06,
"loss": 0.2254,
"step": 572
},
{
"epoch": 0.5345149253731343,
"grad_norm": 1.3523007674514218,
"learning_rate": 8.338750738829723e-06,
"loss": 0.2344,
"step": 573
},
{
"epoch": 0.5354477611940298,
"grad_norm": 1.2450871950282771,
"learning_rate": 8.33329343458559e-06,
"loss": 0.2088,
"step": 574
},
{
"epoch": 0.5363805970149254,
"grad_norm": 1.3441841407397146,
"learning_rate": 8.327828973462823e-06,
"loss": 0.2562,
"step": 575
},
{
"epoch": 0.5373134328358209,
"grad_norm": 1.301540588732784,
"learning_rate": 8.32235736719411e-06,
"loss": 0.2059,
"step": 576
},
{
"epoch": 0.5382462686567164,
"grad_norm": 1.2306114909644439,
"learning_rate": 8.316878627527474e-06,
"loss": 0.2082,
"step": 577
},
{
"epoch": 0.539179104477612,
"grad_norm": 1.26349335480942,
"learning_rate": 8.311392766226261e-06,
"loss": 0.2258,
"step": 578
},
{
"epoch": 0.5401119402985075,
"grad_norm": 1.2190459793055979,
"learning_rate": 8.305899795069102e-06,
"loss": 0.2055,
"step": 579
},
{
"epoch": 0.5410447761194029,
"grad_norm": 1.234466722835294,
"learning_rate": 8.300399725849902e-06,
"loss": 0.2085,
"step": 580
},
{
"epoch": 0.5419776119402985,
"grad_norm": 1.3094795394905954,
"learning_rate": 8.294892570377794e-06,
"loss": 0.2352,
"step": 581
},
{
"epoch": 0.542910447761194,
"grad_norm": 1.1820382478545008,
"learning_rate": 8.289378340477138e-06,
"loss": 0.1761,
"step": 582
},
{
"epoch": 0.5438432835820896,
"grad_norm": 1.3309794913007937,
"learning_rate": 8.283857047987475e-06,
"loss": 0.1963,
"step": 583
},
{
"epoch": 0.5447761194029851,
"grad_norm": 1.3019079556605686,
"learning_rate": 8.278328704763516e-06,
"loss": 0.1934,
"step": 584
},
{
"epoch": 0.5457089552238806,
"grad_norm": 1.2802340210839334,
"learning_rate": 8.272793322675103e-06,
"loss": 0.2048,
"step": 585
},
{
"epoch": 0.5466417910447762,
"grad_norm": 1.2763571802306888,
"learning_rate": 8.2672509136072e-06,
"loss": 0.2,
"step": 586
},
{
"epoch": 0.5475746268656716,
"grad_norm": 1.3104917380211667,
"learning_rate": 8.261701489459852e-06,
"loss": 0.2043,
"step": 587
},
{
"epoch": 0.5485074626865671,
"grad_norm": 1.403312055285428,
"learning_rate": 8.256145062148168e-06,
"loss": 0.2278,
"step": 588
},
{
"epoch": 0.5494402985074627,
"grad_norm": 1.4440532908602695,
"learning_rate": 8.250581643602293e-06,
"loss": 0.2663,
"step": 589
},
{
"epoch": 0.5503731343283582,
"grad_norm": 1.2954811986308785,
"learning_rate": 8.245011245767385e-06,
"loss": 0.2108,
"step": 590
},
{
"epoch": 0.5513059701492538,
"grad_norm": 1.428979335450204,
"learning_rate": 8.239433880603585e-06,
"loss": 0.2753,
"step": 591
},
{
"epoch": 0.5522388059701493,
"grad_norm": 1.2834945741130208,
"learning_rate": 8.233849560085994e-06,
"loss": 0.2308,
"step": 592
},
{
"epoch": 0.5531716417910447,
"grad_norm": 1.310861000017192,
"learning_rate": 8.228258296204647e-06,
"loss": 0.2199,
"step": 593
},
{
"epoch": 0.5541044776119403,
"grad_norm": 1.414164947125919,
"learning_rate": 8.222660100964487e-06,
"loss": 0.225,
"step": 594
},
{
"epoch": 0.5550373134328358,
"grad_norm": 1.4371239993652112,
"learning_rate": 8.217054986385336e-06,
"loss": 0.2486,
"step": 595
},
{
"epoch": 0.5559701492537313,
"grad_norm": 1.2923123326954635,
"learning_rate": 8.211442964501879e-06,
"loss": 0.2069,
"step": 596
},
{
"epoch": 0.5569029850746269,
"grad_norm": 1.2877468481074716,
"learning_rate": 8.205824047363627e-06,
"loss": 0.1978,
"step": 597
},
{
"epoch": 0.5578358208955224,
"grad_norm": 1.3463440860422735,
"learning_rate": 8.200198247034897e-06,
"loss": 0.2268,
"step": 598
},
{
"epoch": 0.558768656716418,
"grad_norm": 1.333488528093814,
"learning_rate": 8.194565575594784e-06,
"loss": 0.2128,
"step": 599
},
{
"epoch": 0.5597014925373134,
"grad_norm": 1.3246372107891478,
"learning_rate": 8.188926045137139e-06,
"loss": 0.2312,
"step": 600
},
{
"epoch": 0.5606343283582089,
"grad_norm": 1.2673792193477058,
"learning_rate": 8.183279667770534e-06,
"loss": 0.1904,
"step": 601
},
{
"epoch": 0.5615671641791045,
"grad_norm": 1.4226097193723977,
"learning_rate": 8.177626455618245e-06,
"loss": 0.2411,
"step": 602
},
{
"epoch": 0.5625,
"grad_norm": 1.3246126171335773,
"learning_rate": 8.171966420818227e-06,
"loss": 0.2621,
"step": 603
},
{
"epoch": 0.5634328358208955,
"grad_norm": 1.3533798674718698,
"learning_rate": 8.166299575523081e-06,
"loss": 0.209,
"step": 604
},
{
"epoch": 0.5643656716417911,
"grad_norm": 1.435325938380449,
"learning_rate": 8.160625931900022e-06,
"loss": 0.2188,
"step": 605
},
{
"epoch": 0.5652985074626866,
"grad_norm": 1.4480142027219483,
"learning_rate": 8.154945502130877e-06,
"loss": 0.2267,
"step": 606
},
{
"epoch": 0.566231343283582,
"grad_norm": 1.2876510191778578,
"learning_rate": 8.149258298412033e-06,
"loss": 0.1969,
"step": 607
},
{
"epoch": 0.5671641791044776,
"grad_norm": 1.3790311508015853,
"learning_rate": 8.143564332954426e-06,
"loss": 0.2527,
"step": 608
},
{
"epoch": 0.5680970149253731,
"grad_norm": 1.2745691146097093,
"learning_rate": 8.137863617983506e-06,
"loss": 0.1928,
"step": 609
},
{
"epoch": 0.5690298507462687,
"grad_norm": 1.4184536736628595,
"learning_rate": 8.132156165739216e-06,
"loss": 0.2549,
"step": 610
},
{
"epoch": 0.5699626865671642,
"grad_norm": 1.3236046692752557,
"learning_rate": 8.12644198847597e-06,
"loss": 0.1754,
"step": 611
},
{
"epoch": 0.5708955223880597,
"grad_norm": 1.1976066020851874,
"learning_rate": 8.120721098462612e-06,
"loss": 0.1859,
"step": 612
},
{
"epoch": 0.5718283582089553,
"grad_norm": 1.2801823115311637,
"learning_rate": 8.114993507982408e-06,
"loss": 0.1937,
"step": 613
},
{
"epoch": 0.5727611940298507,
"grad_norm": 1.397910990799343,
"learning_rate": 8.109259229333005e-06,
"loss": 0.2362,
"step": 614
},
{
"epoch": 0.5736940298507462,
"grad_norm": 1.2380118843638828,
"learning_rate": 8.103518274826408e-06,
"loss": 0.1957,
"step": 615
},
{
"epoch": 0.5746268656716418,
"grad_norm": 1.2923134364968993,
"learning_rate": 8.097770656788961e-06,
"loss": 0.2036,
"step": 616
},
{
"epoch": 0.5755597014925373,
"grad_norm": 1.3088551068005592,
"learning_rate": 8.092016387561316e-06,
"loss": 0.2183,
"step": 617
},
{
"epoch": 0.5764925373134329,
"grad_norm": 1.3350688818343555,
"learning_rate": 8.086255479498398e-06,
"loss": 0.2612,
"step": 618
},
{
"epoch": 0.5774253731343284,
"grad_norm": 1.3275422480188492,
"learning_rate": 8.080487944969395e-06,
"loss": 0.2446,
"step": 619
},
{
"epoch": 0.5783582089552238,
"grad_norm": 1.3420496546097096,
"learning_rate": 8.074713796357717e-06,
"loss": 0.2252,
"step": 620
},
{
"epoch": 0.5792910447761194,
"grad_norm": 1.2681186056512634,
"learning_rate": 8.068933046060976e-06,
"loss": 0.2278,
"step": 621
},
{
"epoch": 0.5802238805970149,
"grad_norm": 1.4269201636687978,
"learning_rate": 8.063145706490961e-06,
"loss": 0.2738,
"step": 622
},
{
"epoch": 0.5811567164179104,
"grad_norm": 1.3211328025917792,
"learning_rate": 8.057351790073601e-06,
"loss": 0.1956,
"step": 623
},
{
"epoch": 0.582089552238806,
"grad_norm": 1.2945232584986655,
"learning_rate": 8.051551309248961e-06,
"loss": 0.2237,
"step": 624
},
{
"epoch": 0.5830223880597015,
"grad_norm": 1.421072978242554,
"learning_rate": 8.045744276471185e-06,
"loss": 0.2451,
"step": 625
},
{
"epoch": 0.5839552238805971,
"grad_norm": 1.4533180953467522,
"learning_rate": 8.039930704208492e-06,
"loss": 0.27,
"step": 626
},
{
"epoch": 0.5848880597014925,
"grad_norm": 1.3155482973906332,
"learning_rate": 8.034110604943144e-06,
"loss": 0.2066,
"step": 627
},
{
"epoch": 0.585820895522388,
"grad_norm": 1.4332765411034296,
"learning_rate": 8.028283991171408e-06,
"loss": 0.219,
"step": 628
},
{
"epoch": 0.5867537313432836,
"grad_norm": 1.335468080964928,
"learning_rate": 8.02245087540355e-06,
"loss": 0.1863,
"step": 629
},
{
"epoch": 0.5876865671641791,
"grad_norm": 1.3115973526953748,
"learning_rate": 8.016611270163783e-06,
"loss": 0.2144,
"step": 630
},
{
"epoch": 0.5886194029850746,
"grad_norm": 1.351986420193054,
"learning_rate": 8.010765187990268e-06,
"loss": 0.2183,
"step": 631
},
{
"epoch": 0.5895522388059702,
"grad_norm": 1.4929872029838387,
"learning_rate": 8.004912641435064e-06,
"loss": 0.2862,
"step": 632
},
{
"epoch": 0.5904850746268657,
"grad_norm": 1.2731526175510945,
"learning_rate": 7.999053643064108e-06,
"loss": 0.193,
"step": 633
},
{
"epoch": 0.5914179104477612,
"grad_norm": 1.427846234487328,
"learning_rate": 7.993188205457195e-06,
"loss": 0.2389,
"step": 634
},
{
"epoch": 0.5923507462686567,
"grad_norm": 1.2785953885496837,
"learning_rate": 7.987316341207942e-06,
"loss": 0.2026,
"step": 635
},
{
"epoch": 0.5932835820895522,
"grad_norm": 1.3353601466548186,
"learning_rate": 7.981438062923767e-06,
"loss": 0.212,
"step": 636
},
{
"epoch": 0.5942164179104478,
"grad_norm": 1.3874063096107416,
"learning_rate": 7.975553383225857e-06,
"loss": 0.2738,
"step": 637
},
{
"epoch": 0.5951492537313433,
"grad_norm": 1.3925772765894244,
"learning_rate": 7.969662314749148e-06,
"loss": 0.2551,
"step": 638
},
{
"epoch": 0.5960820895522388,
"grad_norm": 1.3499900031324361,
"learning_rate": 7.963764870142286e-06,
"loss": 0.2459,
"step": 639
},
{
"epoch": 0.5970149253731343,
"grad_norm": 1.3034783407482293,
"learning_rate": 7.957861062067614e-06,
"loss": 0.2571,
"step": 640
},
{
"epoch": 0.5979477611940298,
"grad_norm": 1.3012226651014325,
"learning_rate": 7.951950903201133e-06,
"loss": 0.1911,
"step": 641
},
{
"epoch": 0.5988805970149254,
"grad_norm": 1.352696538850524,
"learning_rate": 7.946034406232481e-06,
"loss": 0.2157,
"step": 642
},
{
"epoch": 0.5998134328358209,
"grad_norm": 1.2598648626979547,
"learning_rate": 7.940111583864909e-06,
"loss": 0.1902,
"step": 643
},
{
"epoch": 0.6007462686567164,
"grad_norm": 1.2568549391070882,
"learning_rate": 7.934182448815244e-06,
"loss": 0.1963,
"step": 644
},
{
"epoch": 0.601679104477612,
"grad_norm": 1.3016294510176711,
"learning_rate": 7.928247013813867e-06,
"loss": 0.2098,
"step": 645
},
{
"epoch": 0.6026119402985075,
"grad_norm": 1.4082924172412312,
"learning_rate": 7.922305291604688e-06,
"loss": 0.2653,
"step": 646
},
{
"epoch": 0.6035447761194029,
"grad_norm": 1.3726324485129864,
"learning_rate": 7.916357294945116e-06,
"loss": 0.2194,
"step": 647
},
{
"epoch": 0.6044776119402985,
"grad_norm": 1.2451400940034223,
"learning_rate": 7.910403036606028e-06,
"loss": 0.1846,
"step": 648
},
{
"epoch": 0.605410447761194,
"grad_norm": 1.2728449461164848,
"learning_rate": 7.90444252937175e-06,
"loss": 0.2083,
"step": 649
},
{
"epoch": 0.6063432835820896,
"grad_norm": 1.3773934769808678,
"learning_rate": 7.898475786040025e-06,
"loss": 0.209,
"step": 650
},
{
"epoch": 0.6072761194029851,
"grad_norm": 1.405684014418204,
"learning_rate": 7.892502819421979e-06,
"loss": 0.2408,
"step": 651
},
{
"epoch": 0.6082089552238806,
"grad_norm": 1.3138210727359434,
"learning_rate": 7.88652364234211e-06,
"loss": 0.216,
"step": 652
},
{
"epoch": 0.6091417910447762,
"grad_norm": 1.272247116045296,
"learning_rate": 7.880538267638243e-06,
"loss": 0.2261,
"step": 653
},
{
"epoch": 0.6100746268656716,
"grad_norm": 1.2742313717737872,
"learning_rate": 7.874546708161512e-06,
"loss": 0.193,
"step": 654
},
{
"epoch": 0.6110074626865671,
"grad_norm": 1.3353009311794783,
"learning_rate": 7.868548976776328e-06,
"loss": 0.2395,
"step": 655
},
{
"epoch": 0.6119402985074627,
"grad_norm": 1.261662018670456,
"learning_rate": 7.86254508636036e-06,
"loss": 0.2475,
"step": 656
},
{
"epoch": 0.6128731343283582,
"grad_norm": 1.2686834317416418,
"learning_rate": 7.856535049804495e-06,
"loss": 0.1734,
"step": 657
},
{
"epoch": 0.6138059701492538,
"grad_norm": 1.2987561305634387,
"learning_rate": 7.850518880012815e-06,
"loss": 0.2272,
"step": 658
},
{
"epoch": 0.6147388059701493,
"grad_norm": 1.2645798199797904,
"learning_rate": 7.844496589902577e-06,
"loss": 0.2086,
"step": 659
},
{
"epoch": 0.6156716417910447,
"grad_norm": 1.2881560804345098,
"learning_rate": 7.838468192404176e-06,
"loss": 0.2402,
"step": 660
},
{
"epoch": 0.6166044776119403,
"grad_norm": 1.3617560232793342,
"learning_rate": 7.83243370046112e-06,
"loss": 0.2563,
"step": 661
},
{
"epoch": 0.6175373134328358,
"grad_norm": 1.242645518937026,
"learning_rate": 7.826393127029998e-06,
"loss": 0.1715,
"step": 662
},
{
"epoch": 0.6184701492537313,
"grad_norm": 1.2222827975734027,
"learning_rate": 7.820346485080466e-06,
"loss": 0.2152,
"step": 663
},
{
"epoch": 0.6194029850746269,
"grad_norm": 1.501581792730611,
"learning_rate": 7.814293787595197e-06,
"loss": 0.2325,
"step": 664
},
{
"epoch": 0.6203358208955224,
"grad_norm": 1.3996008994629061,
"learning_rate": 7.80823504756988e-06,
"loss": 0.2174,
"step": 665
},
{
"epoch": 0.621268656716418,
"grad_norm": 1.2085367755829877,
"learning_rate": 7.80217027801317e-06,
"loss": 0.1789,
"step": 666
},
{
"epoch": 0.6222014925373134,
"grad_norm": 1.268280098224564,
"learning_rate": 7.796099491946665e-06,
"loss": 0.1904,
"step": 667
},
{
"epoch": 0.6231343283582089,
"grad_norm": 1.2755297915497015,
"learning_rate": 7.790022702404887e-06,
"loss": 0.1982,
"step": 668
},
{
"epoch": 0.6240671641791045,
"grad_norm": 1.3752880079713161,
"learning_rate": 7.783939922435244e-06,
"loss": 0.2617,
"step": 669
},
{
"epoch": 0.625,
"grad_norm": 1.3760610529149837,
"learning_rate": 7.777851165098012e-06,
"loss": 0.2268,
"step": 670
},
{
"epoch": 0.6259328358208955,
"grad_norm": 1.2876371621650369,
"learning_rate": 7.771756443466292e-06,
"loss": 0.2373,
"step": 671
},
{
"epoch": 0.6268656716417911,
"grad_norm": 1.3413023124396006,
"learning_rate": 7.765655770625997e-06,
"loss": 0.2423,
"step": 672
},
{
"epoch": 0.6277985074626866,
"grad_norm": 1.3067002081116716,
"learning_rate": 7.759549159675819e-06,
"loss": 0.1845,
"step": 673
},
{
"epoch": 0.628731343283582,
"grad_norm": 1.4016448515552087,
"learning_rate": 7.753436623727193e-06,
"loss": 0.1792,
"step": 674
},
{
"epoch": 0.6296641791044776,
"grad_norm": 1.27456610650169,
"learning_rate": 7.747318175904281e-06,
"loss": 0.2139,
"step": 675
},
{
"epoch": 0.6305970149253731,
"grad_norm": 1.3121464973056274,
"learning_rate": 7.741193829343937e-06,
"loss": 0.2139,
"step": 676
},
{
"epoch": 0.6315298507462687,
"grad_norm": 1.3607854375467525,
"learning_rate": 7.73506359719568e-06,
"loss": 0.2303,
"step": 677
},
{
"epoch": 0.6324626865671642,
"grad_norm": 1.2158147491065376,
"learning_rate": 7.728927492621665e-06,
"loss": 0.1887,
"step": 678
},
{
"epoch": 0.6333955223880597,
"grad_norm": 1.3402425409255352,
"learning_rate": 7.722785528796657e-06,
"loss": 0.2171,
"step": 679
},
{
"epoch": 0.6343283582089553,
"grad_norm": 1.3178852845138038,
"learning_rate": 7.716637718908002e-06,
"loss": 0.205,
"step": 680
},
{
"epoch": 0.6352611940298507,
"grad_norm": 1.2253743856627322,
"learning_rate": 7.710484076155595e-06,
"loss": 0.1834,
"step": 681
},
{
"epoch": 0.6361940298507462,
"grad_norm": 1.2606137619057538,
"learning_rate": 7.704324613751856e-06,
"loss": 0.2106,
"step": 682
},
{
"epoch": 0.6371268656716418,
"grad_norm": 1.2691977152639984,
"learning_rate": 7.698159344921704e-06,
"loss": 0.1913,
"step": 683
},
{
"epoch": 0.6380597014925373,
"grad_norm": 1.4050053172099695,
"learning_rate": 7.691988282902519e-06,
"loss": 0.2649,
"step": 684
},
{
"epoch": 0.6389925373134329,
"grad_norm": 1.2987797625819433,
"learning_rate": 7.685811440944121e-06,
"loss": 0.2174,
"step": 685
},
{
"epoch": 0.6399253731343284,
"grad_norm": 1.5415194185556067,
"learning_rate": 7.679628832308743e-06,
"loss": 0.267,
"step": 686
},
{
"epoch": 0.6408582089552238,
"grad_norm": 1.214096524298427,
"learning_rate": 7.673440470270998e-06,
"loss": 0.1743,
"step": 687
},
{
"epoch": 0.6417910447761194,
"grad_norm": 1.4037990172322976,
"learning_rate": 7.667246368117852e-06,
"loss": 0.2573,
"step": 688
},
{
"epoch": 0.6427238805970149,
"grad_norm": 1.2985067455372177,
"learning_rate": 7.661046539148596e-06,
"loss": 0.2067,
"step": 689
},
{
"epoch": 0.6436567164179104,
"grad_norm": 1.3698413507795268,
"learning_rate": 7.654840996674813e-06,
"loss": 0.2305,
"step": 690
},
{
"epoch": 0.644589552238806,
"grad_norm": 1.2335877423016568,
"learning_rate": 7.648629754020359e-06,
"loss": 0.1892,
"step": 691
},
{
"epoch": 0.6455223880597015,
"grad_norm": 1.2599139522935205,
"learning_rate": 7.642412824521328e-06,
"loss": 0.186,
"step": 692
},
{
"epoch": 0.6464552238805971,
"grad_norm": 1.4056428024323913,
"learning_rate": 7.636190221526022e-06,
"loss": 0.2824,
"step": 693
},
{
"epoch": 0.6473880597014925,
"grad_norm": 1.3733426492933554,
"learning_rate": 7.629961958394923e-06,
"loss": 0.2392,
"step": 694
},
{
"epoch": 0.648320895522388,
"grad_norm": 1.280382940927538,
"learning_rate": 7.623728048500669e-06,
"loss": 0.2425,
"step": 695
},
{
"epoch": 0.6492537313432836,
"grad_norm": 1.3553528855964356,
"learning_rate": 7.617488505228023e-06,
"loss": 0.2442,
"step": 696
},
{
"epoch": 0.6501865671641791,
"grad_norm": 1.2788028409563263,
"learning_rate": 7.611243341973839e-06,
"loss": 0.2052,
"step": 697
},
{
"epoch": 0.6511194029850746,
"grad_norm": 1.1777433569265974,
"learning_rate": 7.6049925721470455e-06,
"loss": 0.1718,
"step": 698
},
{
"epoch": 0.6520522388059702,
"grad_norm": 1.3055979038996244,
"learning_rate": 7.598736209168595e-06,
"loss": 0.2287,
"step": 699
},
{
"epoch": 0.6529850746268657,
"grad_norm": 1.274827166623728,
"learning_rate": 7.592474266471464e-06,
"loss": 0.2234,
"step": 700
},
{
"epoch": 0.6539179104477612,
"grad_norm": 1.260025562760807,
"learning_rate": 7.5862067575006e-06,
"loss": 0.1654,
"step": 701
},
{
"epoch": 0.6548507462686567,
"grad_norm": 1.2899093555470775,
"learning_rate": 7.579933695712905e-06,
"loss": 0.2328,
"step": 702
},
{
"epoch": 0.6557835820895522,
"grad_norm": 1.3355248187559228,
"learning_rate": 7.573655094577204e-06,
"loss": 0.2558,
"step": 703
},
{
"epoch": 0.6567164179104478,
"grad_norm": 1.2496645745418022,
"learning_rate": 7.56737096757421e-06,
"loss": 0.1839,
"step": 704
},
{
"epoch": 0.6576492537313433,
"grad_norm": 1.3061822327647394,
"learning_rate": 7.56108132819651e-06,
"loss": 0.2056,
"step": 705
},
{
"epoch": 0.6585820895522388,
"grad_norm": 1.361324005957055,
"learning_rate": 7.5547861899485175e-06,
"loss": 0.2374,
"step": 706
},
{
"epoch": 0.6595149253731343,
"grad_norm": 1.329252555227055,
"learning_rate": 7.5484855663464595e-06,
"loss": 0.237,
"step": 707
},
{
"epoch": 0.6604477611940298,
"grad_norm": 1.2593760566247931,
"learning_rate": 7.542179470918336e-06,
"loss": 0.1985,
"step": 708
},
{
"epoch": 0.6613805970149254,
"grad_norm": 1.3178489992256595,
"learning_rate": 7.535867917203897e-06,
"loss": 0.2046,
"step": 709
},
{
"epoch": 0.6623134328358209,
"grad_norm": 1.3733355088588963,
"learning_rate": 7.529550918754609e-06,
"loss": 0.2579,
"step": 710
},
{
"epoch": 0.6632462686567164,
"grad_norm": 1.3287402740753909,
"learning_rate": 7.523228489133639e-06,
"loss": 0.2385,
"step": 711
},
{
"epoch": 0.664179104477612,
"grad_norm": 1.336829144755182,
"learning_rate": 7.5169006419157985e-06,
"loss": 0.2519,
"step": 712
},
{
"epoch": 0.6651119402985075,
"grad_norm": 1.2822083018178134,
"learning_rate": 7.510567390687549e-06,
"loss": 0.2429,
"step": 713
},
{
"epoch": 0.6660447761194029,
"grad_norm": 1.3227489712739566,
"learning_rate": 7.504228749046941e-06,
"loss": 0.2532,
"step": 714
},
{
"epoch": 0.6669776119402985,
"grad_norm": 1.2158384388400283,
"learning_rate": 7.497884730603608e-06,
"loss": 0.1889,
"step": 715
},
{
"epoch": 0.667910447761194,
"grad_norm": 1.2300611255968865,
"learning_rate": 7.491535348978719e-06,
"loss": 0.2031,
"step": 716
},
{
"epoch": 0.6688432835820896,
"grad_norm": 1.3917027799740254,
"learning_rate": 7.485180617804968e-06,
"loss": 0.2087,
"step": 717
},
{
"epoch": 0.6697761194029851,
"grad_norm": 1.2984111895743784,
"learning_rate": 7.478820550726528e-06,
"loss": 0.2404,
"step": 718
},
{
"epoch": 0.6707089552238806,
"grad_norm": 1.3555569274235573,
"learning_rate": 7.472455161399031e-06,
"loss": 0.247,
"step": 719
},
{
"epoch": 0.6716417910447762,
"grad_norm": 1.3816030387793872,
"learning_rate": 7.466084463489537e-06,
"loss": 0.2182,
"step": 720
},
{
"epoch": 0.6725746268656716,
"grad_norm": 1.2663395556813302,
"learning_rate": 7.459708470676504e-06,
"loss": 0.2221,
"step": 721
},
{
"epoch": 0.6735074626865671,
"grad_norm": 1.3067189909365724,
"learning_rate": 7.453327196649756e-06,
"loss": 0.1994,
"step": 722
},
{
"epoch": 0.6744402985074627,
"grad_norm": 1.3354233334197128,
"learning_rate": 7.446940655110457e-06,
"loss": 0.1899,
"step": 723
},
{
"epoch": 0.6753731343283582,
"grad_norm": 1.3794949675774215,
"learning_rate": 7.440548859771086e-06,
"loss": 0.2159,
"step": 724
},
{
"epoch": 0.6763059701492538,
"grad_norm": 1.3100708012693134,
"learning_rate": 7.434151824355396e-06,
"loss": 0.2233,
"step": 725
},
{
"epoch": 0.6772388059701493,
"grad_norm": 1.271288846350891,
"learning_rate": 7.4277495625983916e-06,
"loss": 0.2234,
"step": 726
},
{
"epoch": 0.6781716417910447,
"grad_norm": 1.2975245411849095,
"learning_rate": 7.421342088246304e-06,
"loss": 0.2407,
"step": 727
},
{
"epoch": 0.6791044776119403,
"grad_norm": 1.2435497512296882,
"learning_rate": 7.414929415056551e-06,
"loss": 0.1997,
"step": 728
},
{
"epoch": 0.6800373134328358,
"grad_norm": 1.3107202575780283,
"learning_rate": 7.408511556797714e-06,
"loss": 0.2396,
"step": 729
},
{
"epoch": 0.6809701492537313,
"grad_norm": 1.3303686258359244,
"learning_rate": 7.402088527249508e-06,
"loss": 0.2735,
"step": 730
},
{
"epoch": 0.6819029850746269,
"grad_norm": 1.2785847204338623,
"learning_rate": 7.395660340202752e-06,
"loss": 0.1935,
"step": 731
},
{
"epoch": 0.6828358208955224,
"grad_norm": 1.2944858838779594,
"learning_rate": 7.389227009459335e-06,
"loss": 0.2036,
"step": 732
},
{
"epoch": 0.683768656716418,
"grad_norm": 1.3189703714874468,
"learning_rate": 7.382788548832196e-06,
"loss": 0.2557,
"step": 733
},
{
"epoch": 0.6847014925373134,
"grad_norm": 1.2868391513140633,
"learning_rate": 7.3763449721452815e-06,
"loss": 0.1901,
"step": 734
},
{
"epoch": 0.6856343283582089,
"grad_norm": 1.357808247604318,
"learning_rate": 7.369896293233531e-06,
"loss": 0.2247,
"step": 735
},
{
"epoch": 0.6865671641791045,
"grad_norm": 1.3124644418425873,
"learning_rate": 7.363442525942827e-06,
"loss": 0.1975,
"step": 736
},
{
"epoch": 0.6875,
"grad_norm": 1.2731450065419383,
"learning_rate": 7.3569836841299905e-06,
"loss": 0.1819,
"step": 737
},
{
"epoch": 0.6884328358208955,
"grad_norm": 1.3480325337789343,
"learning_rate": 7.350519781662726e-06,
"loss": 0.2502,
"step": 738
},
{
"epoch": 0.6893656716417911,
"grad_norm": 1.4153540585327467,
"learning_rate": 7.3440508324196126e-06,
"loss": 0.2524,
"step": 739
},
{
"epoch": 0.6902985074626866,
"grad_norm": 1.3182119358350495,
"learning_rate": 7.3375768502900626e-06,
"loss": 0.2217,
"step": 740
},
{
"epoch": 0.691231343283582,
"grad_norm": 1.32600345807212,
"learning_rate": 7.331097849174292e-06,
"loss": 0.2132,
"step": 741
},
{
"epoch": 0.6921641791044776,
"grad_norm": 1.2238434834138427,
"learning_rate": 7.3246138429832945e-06,
"loss": 0.2025,
"step": 742
},
{
"epoch": 0.6930970149253731,
"grad_norm": 1.280187761697935,
"learning_rate": 7.3181248456388124e-06,
"loss": 0.1997,
"step": 743
},
{
"epoch": 0.6940298507462687,
"grad_norm": 1.2115465450484972,
"learning_rate": 7.311630871073301e-06,
"loss": 0.1858,
"step": 744
},
{
"epoch": 0.6949626865671642,
"grad_norm": 1.3194935216161052,
"learning_rate": 7.305131933229902e-06,
"loss": 0.2629,
"step": 745
},
{
"epoch": 0.6958955223880597,
"grad_norm": 1.3474367087215364,
"learning_rate": 7.298628046062417e-06,
"loss": 0.2106,
"step": 746
},
{
"epoch": 0.6968283582089553,
"grad_norm": 1.2772038481835335,
"learning_rate": 7.292119223535273e-06,
"loss": 0.1831,
"step": 747
},
{
"epoch": 0.6977611940298507,
"grad_norm": 1.3913533127594813,
"learning_rate": 7.2856054796234944e-06,
"loss": 0.2819,
"step": 748
},
{
"epoch": 0.6986940298507462,
"grad_norm": 1.2343619133538144,
"learning_rate": 7.279086828312666e-06,
"loss": 0.195,
"step": 749
},
{
"epoch": 0.6996268656716418,
"grad_norm": 1.1736678408101116,
"learning_rate": 7.272563283598918e-06,
"loss": 0.1897,
"step": 750
},
{
"epoch": 0.7005597014925373,
"grad_norm": 1.2037655185681564,
"learning_rate": 7.266034859488883e-06,
"loss": 0.1949,
"step": 751
},
{
"epoch": 0.7014925373134329,
"grad_norm": 1.3734571083959768,
"learning_rate": 7.25950156999967e-06,
"loss": 0.2412,
"step": 752
},
{
"epoch": 0.7024253731343284,
"grad_norm": 1.255827542301762,
"learning_rate": 7.252963429158835e-06,
"loss": 0.1766,
"step": 753
},
{
"epoch": 0.7033582089552238,
"grad_norm": 1.3750180737088755,
"learning_rate": 7.246420451004352e-06,
"loss": 0.2157,
"step": 754
},
{
"epoch": 0.7042910447761194,
"grad_norm": 1.3432725001900598,
"learning_rate": 7.239872649584574e-06,
"loss": 0.222,
"step": 755
},
{
"epoch": 0.7052238805970149,
"grad_norm": 1.3717844129040624,
"learning_rate": 7.23332003895822e-06,
"loss": 0.2127,
"step": 756
},
{
"epoch": 0.7061567164179104,
"grad_norm": 1.2185242804017098,
"learning_rate": 7.226762633194331e-06,
"loss": 0.172,
"step": 757
},
{
"epoch": 0.707089552238806,
"grad_norm": 1.2951740775174005,
"learning_rate": 7.220200446372239e-06,
"loss": 0.237,
"step": 758
},
{
"epoch": 0.7080223880597015,
"grad_norm": 1.4291494271201155,
"learning_rate": 7.2136334925815455e-06,
"loss": 0.303,
"step": 759
},
{
"epoch": 0.7089552238805971,
"grad_norm": 1.2767443600841293,
"learning_rate": 7.207061785922089e-06,
"loss": 0.2645,
"step": 760
},
{
"epoch": 0.7098880597014925,
"grad_norm": 1.2558172096850615,
"learning_rate": 7.20048534050391e-06,
"loss": 0.1923,
"step": 761
},
{
"epoch": 0.710820895522388,
"grad_norm": 1.2682779571350995,
"learning_rate": 7.193904170447223e-06,
"loss": 0.1779,
"step": 762
},
{
"epoch": 0.7117537313432836,
"grad_norm": 1.4248910177875456,
"learning_rate": 7.187318289882387e-06,
"loss": 0.2371,
"step": 763
},
{
"epoch": 0.7126865671641791,
"grad_norm": 1.2194662674040737,
"learning_rate": 7.1807277129498774e-06,
"loss": 0.1942,
"step": 764
},
{
"epoch": 0.7136194029850746,
"grad_norm": 1.24249804380326,
"learning_rate": 7.17413245380025e-06,
"loss": 0.2263,
"step": 765
},
{
"epoch": 0.7145522388059702,
"grad_norm": 1.3870816837211544,
"learning_rate": 7.167532526594116e-06,
"loss": 0.2344,
"step": 766
},
{
"epoch": 0.7154850746268657,
"grad_norm": 1.3983268621959495,
"learning_rate": 7.160927945502109e-06,
"loss": 0.2187,
"step": 767
},
{
"epoch": 0.7164179104477612,
"grad_norm": 1.2771984166842354,
"learning_rate": 7.1543187247048525e-06,
"loss": 0.2029,
"step": 768
},
{
"epoch": 0.7173507462686567,
"grad_norm": 1.3874043953579143,
"learning_rate": 7.147704878392935e-06,
"loss": 0.2774,
"step": 769
},
{
"epoch": 0.7182835820895522,
"grad_norm": 1.1553166176292557,
"learning_rate": 7.141086420766875e-06,
"loss": 0.1844,
"step": 770
},
{
"epoch": 0.7192164179104478,
"grad_norm": 1.4378856847783543,
"learning_rate": 7.134463366037091e-06,
"loss": 0.2848,
"step": 771
},
{
"epoch": 0.7201492537313433,
"grad_norm": 1.2503624007594234,
"learning_rate": 7.1278357284238745e-06,
"loss": 0.1789,
"step": 772
},
{
"epoch": 0.7210820895522388,
"grad_norm": 1.2534733748456004,
"learning_rate": 7.121203522157354e-06,
"loss": 0.1838,
"step": 773
},
{
"epoch": 0.7220149253731343,
"grad_norm": 1.2915708752816295,
"learning_rate": 7.114566761477468e-06,
"loss": 0.249,
"step": 774
},
{
"epoch": 0.7229477611940298,
"grad_norm": 1.279147688945239,
"learning_rate": 7.107925460633936e-06,
"loss": 0.2184,
"step": 775
},
{
"epoch": 0.7238805970149254,
"grad_norm": 1.310867616394776,
"learning_rate": 7.101279633886222e-06,
"loss": 0.2431,
"step": 776
},
{
"epoch": 0.7248134328358209,
"grad_norm": 1.3011380699356498,
"learning_rate": 7.094629295503513e-06,
"loss": 0.2408,
"step": 777
},
{
"epoch": 0.7257462686567164,
"grad_norm": 1.3435867669486072,
"learning_rate": 7.087974459764675e-06,
"loss": 0.2097,
"step": 778
},
{
"epoch": 0.726679104477612,
"grad_norm": 1.2481127962379166,
"learning_rate": 7.081315140958236e-06,
"loss": 0.1873,
"step": 779
},
{
"epoch": 0.7276119402985075,
"grad_norm": 1.2347418383894004,
"learning_rate": 7.074651353382349e-06,
"loss": 0.1863,
"step": 780
},
{
"epoch": 0.7285447761194029,
"grad_norm": 1.1862014967353693,
"learning_rate": 7.067983111344762e-06,
"loss": 0.1799,
"step": 781
},
{
"epoch": 0.7294776119402985,
"grad_norm": 1.288195756530044,
"learning_rate": 7.061310429162782e-06,
"loss": 0.2364,
"step": 782
},
{
"epoch": 0.730410447761194,
"grad_norm": 1.3419414029305805,
"learning_rate": 7.054633321163258e-06,
"loss": 0.2427,
"step": 783
},
{
"epoch": 0.7313432835820896,
"grad_norm": 1.264073939413233,
"learning_rate": 7.047951801682533e-06,
"loss": 0.1923,
"step": 784
},
{
"epoch": 0.7322761194029851,
"grad_norm": 1.2705408941546572,
"learning_rate": 7.041265885066428e-06,
"loss": 0.2326,
"step": 785
},
{
"epoch": 0.7332089552238806,
"grad_norm": 1.373108720756827,
"learning_rate": 7.034575585670205e-06,
"loss": 0.2455,
"step": 786
},
{
"epoch": 0.7341417910447762,
"grad_norm": 1.3545548371493887,
"learning_rate": 7.027880917858529e-06,
"loss": 0.261,
"step": 787
},
{
"epoch": 0.7350746268656716,
"grad_norm": 1.366716670408608,
"learning_rate": 7.021181896005456e-06,
"loss": 0.2319,
"step": 788
},
{
"epoch": 0.7360074626865671,
"grad_norm": 1.2549936147832534,
"learning_rate": 7.014478534494378e-06,
"loss": 0.2337,
"step": 789
},
{
"epoch": 0.7369402985074627,
"grad_norm": 1.336996910856189,
"learning_rate": 7.007770847718014e-06,
"loss": 0.1859,
"step": 790
},
{
"epoch": 0.7378731343283582,
"grad_norm": 1.2310220030547625,
"learning_rate": 7.001058850078366e-06,
"loss": 0.1885,
"step": 791
},
{
"epoch": 0.7388059701492538,
"grad_norm": 1.2740115144273731,
"learning_rate": 6.994342555986692e-06,
"loss": 0.1969,
"step": 792
},
{
"epoch": 0.7397388059701493,
"grad_norm": 1.1889466328158036,
"learning_rate": 6.987621979863475e-06,
"loss": 0.1618,
"step": 793
},
{
"epoch": 0.7406716417910447,
"grad_norm": 1.2248802281858773,
"learning_rate": 6.9808971361383935e-06,
"loss": 0.2103,
"step": 794
},
{
"epoch": 0.7416044776119403,
"grad_norm": 1.4300940894297904,
"learning_rate": 6.9741680392502845e-06,
"loss": 0.2702,
"step": 795
},
{
"epoch": 0.7425373134328358,
"grad_norm": 1.2116830325525012,
"learning_rate": 6.967434703647123e-06,
"loss": 0.1764,
"step": 796
},
{
"epoch": 0.7434701492537313,
"grad_norm": 1.3112261692065397,
"learning_rate": 6.960697143785979e-06,
"loss": 0.2494,
"step": 797
},
{
"epoch": 0.7444029850746269,
"grad_norm": 1.4169735590782278,
"learning_rate": 6.953955374132996e-06,
"loss": 0.2954,
"step": 798
},
{
"epoch": 0.7453358208955224,
"grad_norm": 1.2798391276899126,
"learning_rate": 6.947209409163357e-06,
"loss": 0.2087,
"step": 799
},
{
"epoch": 0.746268656716418,
"grad_norm": 1.3846133582561395,
"learning_rate": 6.9404592633612486e-06,
"loss": 0.243,
"step": 800
},
{
"epoch": 0.7472014925373134,
"grad_norm": 1.3262645304233895,
"learning_rate": 6.93370495121984e-06,
"loss": 0.2253,
"step": 801
},
{
"epoch": 0.7481343283582089,
"grad_norm": 1.246757162976938,
"learning_rate": 6.926946487241239e-06,
"loss": 0.2214,
"step": 802
},
{
"epoch": 0.7490671641791045,
"grad_norm": 1.350005845462039,
"learning_rate": 6.920183885936473e-06,
"loss": 0.2375,
"step": 803
},
{
"epoch": 0.75,
"grad_norm": 1.2209331776937584,
"learning_rate": 6.913417161825449e-06,
"loss": 0.2199,
"step": 804
},
{
"epoch": 0.7509328358208955,
"grad_norm": 1.2267624877565906,
"learning_rate": 6.90664632943693e-06,
"loss": 0.1936,
"step": 805
},
{
"epoch": 0.7518656716417911,
"grad_norm": 1.172302982511542,
"learning_rate": 6.899871403308498e-06,
"loss": 0.211,
"step": 806
},
{
"epoch": 0.7527985074626866,
"grad_norm": 1.2676625196680436,
"learning_rate": 6.893092397986523e-06,
"loss": 0.2283,
"step": 807
},
{
"epoch": 0.753731343283582,
"grad_norm": 1.3374734891623064,
"learning_rate": 6.886309328026135e-06,
"loss": 0.225,
"step": 808
},
{
"epoch": 0.7546641791044776,
"grad_norm": 1.3195162787198706,
"learning_rate": 6.879522207991191e-06,
"loss": 0.249,
"step": 809
},
{
"epoch": 0.7555970149253731,
"grad_norm": 1.2275779555398447,
"learning_rate": 6.872731052454243e-06,
"loss": 0.2211,
"step": 810
},
{
"epoch": 0.7565298507462687,
"grad_norm": 1.289029763845464,
"learning_rate": 6.865935875996509e-06,
"loss": 0.2281,
"step": 811
},
{
"epoch": 0.7574626865671642,
"grad_norm": 1.3541277141371464,
"learning_rate": 6.85913669320784e-06,
"loss": 0.1995,
"step": 812
},
{
"epoch": 0.7583955223880597,
"grad_norm": 1.321909779991229,
"learning_rate": 6.852333518686688e-06,
"loss": 0.2344,
"step": 813
},
{
"epoch": 0.7593283582089553,
"grad_norm": 1.224871553289291,
"learning_rate": 6.845526367040076e-06,
"loss": 0.1985,
"step": 814
},
{
"epoch": 0.7602611940298507,
"grad_norm": 1.3775854589181042,
"learning_rate": 6.838715252883567e-06,
"loss": 0.2605,
"step": 815
},
{
"epoch": 0.7611940298507462,
"grad_norm": 1.4821817128027184,
"learning_rate": 6.831900190841232e-06,
"loss": 0.2936,
"step": 816
},
{
"epoch": 0.7621268656716418,
"grad_norm": 1.2397684942862677,
"learning_rate": 6.825081195545615e-06,
"loss": 0.2292,
"step": 817
},
{
"epoch": 0.7630597014925373,
"grad_norm": 1.345221673931461,
"learning_rate": 6.818258281637709e-06,
"loss": 0.2335,
"step": 818
},
{
"epoch": 0.7639925373134329,
"grad_norm": 1.426682576710431,
"learning_rate": 6.811431463766922e-06,
"loss": 0.2553,
"step": 819
},
{
"epoch": 0.7649253731343284,
"grad_norm": 1.2602342447821695,
"learning_rate": 6.804600756591037e-06,
"loss": 0.1955,
"step": 820
},
{
"epoch": 0.7658582089552238,
"grad_norm": 1.3593906222216048,
"learning_rate": 6.797766174776197e-06,
"loss": 0.2566,
"step": 821
},
{
"epoch": 0.7667910447761194,
"grad_norm": 1.1752129115708971,
"learning_rate": 6.790927732996855e-06,
"loss": 0.2007,
"step": 822
},
{
"epoch": 0.7677238805970149,
"grad_norm": 1.2985625823041023,
"learning_rate": 6.78408544593576e-06,
"loss": 0.2312,
"step": 823
},
{
"epoch": 0.7686567164179104,
"grad_norm": 1.3215582389199496,
"learning_rate": 6.777239328283909e-06,
"loss": 0.2264,
"step": 824
},
{
"epoch": 0.769589552238806,
"grad_norm": 1.2861815312659521,
"learning_rate": 6.770389394740531e-06,
"loss": 0.224,
"step": 825
},
{
"epoch": 0.7705223880597015,
"grad_norm": 1.2062615319767154,
"learning_rate": 6.763535660013044e-06,
"loss": 0.2001,
"step": 826
},
{
"epoch": 0.7714552238805971,
"grad_norm": 1.192048831923032,
"learning_rate": 6.756678138817029e-06,
"loss": 0.1804,
"step": 827
},
{
"epoch": 0.7723880597014925,
"grad_norm": 1.3169223961426715,
"learning_rate": 6.749816845876196e-06,
"loss": 0.2107,
"step": 828
},
{
"epoch": 0.773320895522388,
"grad_norm": 1.2199842237106207,
"learning_rate": 6.742951795922355e-06,
"loss": 0.1968,
"step": 829
},
{
"epoch": 0.7742537313432836,
"grad_norm": 1.3463678809767026,
"learning_rate": 6.736083003695378e-06,
"loss": 0.2309,
"step": 830
},
{
"epoch": 0.7751865671641791,
"grad_norm": 1.38996426766889,
"learning_rate": 6.729210483943176e-06,
"loss": 0.2584,
"step": 831
},
{
"epoch": 0.7761194029850746,
"grad_norm": 1.228356172288229,
"learning_rate": 6.722334251421665e-06,
"loss": 0.1962,
"step": 832
},
{
"epoch": 0.7770522388059702,
"grad_norm": 1.3002701079919168,
"learning_rate": 6.715454320894728e-06,
"loss": 0.2167,
"step": 833
},
{
"epoch": 0.7779850746268657,
"grad_norm": 1.2962121956490298,
"learning_rate": 6.708570707134192e-06,
"loss": 0.2013,
"step": 834
},
{
"epoch": 0.7789179104477612,
"grad_norm": 1.29750596117597,
"learning_rate": 6.701683424919789e-06,
"loss": 0.2245,
"step": 835
},
{
"epoch": 0.7798507462686567,
"grad_norm": 1.3104874159288715,
"learning_rate": 6.6947924890391295e-06,
"loss": 0.2265,
"step": 836
},
{
"epoch": 0.7807835820895522,
"grad_norm": 1.33957381485937,
"learning_rate": 6.687897914287667e-06,
"loss": 0.2432,
"step": 837
},
{
"epoch": 0.7817164179104478,
"grad_norm": 1.2654860183364611,
"learning_rate": 6.680999715468669e-06,
"loss": 0.2329,
"step": 838
},
{
"epoch": 0.7826492537313433,
"grad_norm": 1.3809743084115988,
"learning_rate": 6.674097907393186e-06,
"loss": 0.2678,
"step": 839
},
{
"epoch": 0.7835820895522388,
"grad_norm": 1.3419028339922356,
"learning_rate": 6.667192504880016e-06,
"loss": 0.2387,
"step": 840
},
{
"epoch": 0.7845149253731343,
"grad_norm": 1.270127692627935,
"learning_rate": 6.660283522755674e-06,
"loss": 0.231,
"step": 841
},
{
"epoch": 0.7854477611940298,
"grad_norm": 1.323400044376785,
"learning_rate": 6.653370975854362e-06,
"loss": 0.2551,
"step": 842
},
{
"epoch": 0.7863805970149254,
"grad_norm": 1.306885444890462,
"learning_rate": 6.646454879017934e-06,
"loss": 0.2402,
"step": 843
},
{
"epoch": 0.7873134328358209,
"grad_norm": 1.279516309492333,
"learning_rate": 6.639535247095868e-06,
"loss": 0.2151,
"step": 844
},
{
"epoch": 0.7882462686567164,
"grad_norm": 1.2841870522741121,
"learning_rate": 6.632612094945234e-06,
"loss": 0.2191,
"step": 845
},
{
"epoch": 0.789179104477612,
"grad_norm": 1.384854633009525,
"learning_rate": 6.625685437430656e-06,
"loss": 0.2904,
"step": 846
},
{
"epoch": 0.7901119402985075,
"grad_norm": 1.325501174260801,
"learning_rate": 6.618755289424285e-06,
"loss": 0.2128,
"step": 847
},
{
"epoch": 0.7910447761194029,
"grad_norm": 1.2696793987185822,
"learning_rate": 6.611821665805769e-06,
"loss": 0.2056,
"step": 848
},
{
"epoch": 0.7919776119402985,
"grad_norm": 1.3494734406081708,
"learning_rate": 6.604884581462219e-06,
"loss": 0.2479,
"step": 849
},
{
"epoch": 0.792910447761194,
"grad_norm": 1.3238649985819988,
"learning_rate": 6.597944051288169e-06,
"loss": 0.2196,
"step": 850
},
{
"epoch": 0.7938432835820896,
"grad_norm": 1.2737431492446287,
"learning_rate": 6.5910000901855606e-06,
"loss": 0.185,
"step": 851
},
{
"epoch": 0.7947761194029851,
"grad_norm": 1.261459403427656,
"learning_rate": 6.5840527130637e-06,
"loss": 0.1649,
"step": 852
},
{
"epoch": 0.7957089552238806,
"grad_norm": 1.3334323935868955,
"learning_rate": 6.577101934839222e-06,
"loss": 0.2042,
"step": 853
},
{
"epoch": 0.7966417910447762,
"grad_norm": 1.326118422926226,
"learning_rate": 6.570147770436071e-06,
"loss": 0.2118,
"step": 854
},
{
"epoch": 0.7975746268656716,
"grad_norm": 1.2735300323417673,
"learning_rate": 6.56319023478546e-06,
"loss": 0.2433,
"step": 855
},
{
"epoch": 0.7985074626865671,
"grad_norm": 1.1516244040283352,
"learning_rate": 6.556229342825835e-06,
"loss": 0.196,
"step": 856
},
{
"epoch": 0.7994402985074627,
"grad_norm": 1.2798641246720104,
"learning_rate": 6.549265109502856e-06,
"loss": 0.1941,
"step": 857
},
{
"epoch": 0.8003731343283582,
"grad_norm": 1.2451080478624204,
"learning_rate": 6.542297549769353e-06,
"loss": 0.2012,
"step": 858
},
{
"epoch": 0.8013059701492538,
"grad_norm": 1.2256859001807432,
"learning_rate": 6.5353266785852976e-06,
"loss": 0.21,
"step": 859
},
{
"epoch": 0.8022388059701493,
"grad_norm": 1.2375250508685598,
"learning_rate": 6.528352510917774e-06,
"loss": 0.202,
"step": 860
},
{
"epoch": 0.8031716417910447,
"grad_norm": 1.267433173640717,
"learning_rate": 6.521375061740945e-06,
"loss": 0.2315,
"step": 861
},
{
"epoch": 0.8041044776119403,
"grad_norm": 1.2414436142743404,
"learning_rate": 6.514394346036013e-06,
"loss": 0.2269,
"step": 862
},
{
"epoch": 0.8050373134328358,
"grad_norm": 1.3305170506619883,
"learning_rate": 6.507410378791198e-06,
"loss": 0.2104,
"step": 863
},
{
"epoch": 0.8059701492537313,
"grad_norm": 1.26434323921976,
"learning_rate": 6.500423175001705e-06,
"loss": 0.2242,
"step": 864
},
{
"epoch": 0.8069029850746269,
"grad_norm": 1.2803166868672846,
"learning_rate": 6.493432749669682e-06,
"loss": 0.1832,
"step": 865
},
{
"epoch": 0.8078358208955224,
"grad_norm": 1.3081597715295281,
"learning_rate": 6.486439117804195e-06,
"loss": 0.184,
"step": 866
},
{
"epoch": 0.808768656716418,
"grad_norm": 1.2160280841149596,
"learning_rate": 6.479442294421199e-06,
"loss": 0.1772,
"step": 867
},
{
"epoch": 0.8097014925373134,
"grad_norm": 1.187633961625421,
"learning_rate": 6.472442294543497e-06,
"loss": 0.1885,
"step": 868
},
{
"epoch": 0.8106343283582089,
"grad_norm": 1.4011122461263708,
"learning_rate": 6.465439133200715e-06,
"loss": 0.2081,
"step": 869
},
{
"epoch": 0.8115671641791045,
"grad_norm": 1.4590773110743023,
"learning_rate": 6.458432825429264e-06,
"loss": 0.2345,
"step": 870
},
{
"epoch": 0.8125,
"grad_norm": 1.3977394231703384,
"learning_rate": 6.451423386272312e-06,
"loss": 0.2305,
"step": 871
},
{
"epoch": 0.8134328358208955,
"grad_norm": 1.3240392180046858,
"learning_rate": 6.444410830779753e-06,
"loss": 0.2039,
"step": 872
},
{
"epoch": 0.8143656716417911,
"grad_norm": 1.3010915438246045,
"learning_rate": 6.437395174008169e-06,
"loss": 0.204,
"step": 873
},
{
"epoch": 0.8152985074626866,
"grad_norm": 1.1697900871038651,
"learning_rate": 6.4303764310208015e-06,
"loss": 0.1722,
"step": 874
},
{
"epoch": 0.816231343283582,
"grad_norm": 1.2738694790905063,
"learning_rate": 6.4233546168875185e-06,
"loss": 0.242,
"step": 875
},
{
"epoch": 0.8171641791044776,
"grad_norm": 1.298362565686738,
"learning_rate": 6.4163297466847795e-06,
"loss": 0.2254,
"step": 876
},
{
"epoch": 0.8180970149253731,
"grad_norm": 1.3150297463061384,
"learning_rate": 6.409301835495611e-06,
"loss": 0.247,
"step": 877
},
{
"epoch": 0.8190298507462687,
"grad_norm": 1.28947125272434,
"learning_rate": 6.402270898409565e-06,
"loss": 0.2055,
"step": 878
},
{
"epoch": 0.8199626865671642,
"grad_norm": 1.1479792741170483,
"learning_rate": 6.395236950522691e-06,
"loss": 0.1885,
"step": 879
},
{
"epoch": 0.8208955223880597,
"grad_norm": 1.2901505058336102,
"learning_rate": 6.388200006937503e-06,
"loss": 0.2275,
"step": 880
},
{
"epoch": 0.8218283582089553,
"grad_norm": 1.3612667025324792,
"learning_rate": 6.381160082762949e-06,
"loss": 0.2483,
"step": 881
},
{
"epoch": 0.8227611940298507,
"grad_norm": 1.3057407524788593,
"learning_rate": 6.374117193114373e-06,
"loss": 0.2208,
"step": 882
},
{
"epoch": 0.8236940298507462,
"grad_norm": 1.2683413858234487,
"learning_rate": 6.3670713531134865e-06,
"loss": 0.2391,
"step": 883
},
{
"epoch": 0.8246268656716418,
"grad_norm": 1.2679383991747715,
"learning_rate": 6.3600225778883395e-06,
"loss": 0.2304,
"step": 884
},
{
"epoch": 0.8255597014925373,
"grad_norm": 1.291847617097924,
"learning_rate": 6.352970882573283e-06,
"loss": 0.2398,
"step": 885
},
{
"epoch": 0.8264925373134329,
"grad_norm": 1.2834163636252958,
"learning_rate": 6.3459162823089325e-06,
"loss": 0.2115,
"step": 886
},
{
"epoch": 0.8274253731343284,
"grad_norm": 1.2881959492626784,
"learning_rate": 6.338858792242147e-06,
"loss": 0.2399,
"step": 887
},
{
"epoch": 0.8283582089552238,
"grad_norm": 1.220315589698992,
"learning_rate": 6.33179842752599e-06,
"loss": 0.1731,
"step": 888
},
{
"epoch": 0.8292910447761194,
"grad_norm": 1.2527952925756376,
"learning_rate": 6.324735203319691e-06,
"loss": 0.2183,
"step": 889
},
{
"epoch": 0.8302238805970149,
"grad_norm": 1.3134848726122574,
"learning_rate": 6.317669134788625e-06,
"loss": 0.2124,
"step": 890
},
{
"epoch": 0.8311567164179104,
"grad_norm": 1.3821896713692334,
"learning_rate": 6.3106002371042716e-06,
"loss": 0.2542,
"step": 891
},
{
"epoch": 0.832089552238806,
"grad_norm": 1.3281677908441818,
"learning_rate": 6.303528525444185e-06,
"loss": 0.206,
"step": 892
},
{
"epoch": 0.8330223880597015,
"grad_norm": 1.2195697801878131,
"learning_rate": 6.296454014991962e-06,
"loss": 0.209,
"step": 893
},
{
"epoch": 0.8339552238805971,
"grad_norm": 1.281174429543326,
"learning_rate": 6.289376720937208e-06,
"loss": 0.2063,
"step": 894
},
{
"epoch": 0.8348880597014925,
"grad_norm": 1.2628179124634378,
"learning_rate": 6.282296658475508e-06,
"loss": 0.2221,
"step": 895
},
{
"epoch": 0.835820895522388,
"grad_norm": 1.3001412968878898,
"learning_rate": 6.275213842808383e-06,
"loss": 0.2347,
"step": 896
},
{
"epoch": 0.8367537313432836,
"grad_norm": 1.332802953329869,
"learning_rate": 6.268128289143274e-06,
"loss": 0.257,
"step": 897
},
{
"epoch": 0.8376865671641791,
"grad_norm": 1.4211822881217815,
"learning_rate": 6.261040012693498e-06,
"loss": 0.2491,
"step": 898
},
{
"epoch": 0.8386194029850746,
"grad_norm": 1.1835916153283808,
"learning_rate": 6.253949028678214e-06,
"loss": 0.1787,
"step": 899
},
{
"epoch": 0.8395522388059702,
"grad_norm": 1.30040911243428,
"learning_rate": 6.246855352322403e-06,
"loss": 0.2218,
"step": 900
},
{
"epoch": 0.8404850746268657,
"grad_norm": 1.2751227090200317,
"learning_rate": 6.2397589988568175e-06,
"loss": 0.1832,
"step": 901
},
{
"epoch": 0.8414179104477612,
"grad_norm": 1.336123053932801,
"learning_rate": 6.232659983517964e-06,
"loss": 0.2538,
"step": 902
},
{
"epoch": 0.8423507462686567,
"grad_norm": 1.2973986362379657,
"learning_rate": 6.22555832154806e-06,
"loss": 0.1842,
"step": 903
},
{
"epoch": 0.8432835820895522,
"grad_norm": 1.293187865349667,
"learning_rate": 6.21845402819501e-06,
"loss": 0.2144,
"step": 904
},
{
"epoch": 0.8442164179104478,
"grad_norm": 1.4216499384977432,
"learning_rate": 6.211347118712365e-06,
"loss": 0.2583,
"step": 905
},
{
"epoch": 0.8451492537313433,
"grad_norm": 1.289953059628164,
"learning_rate": 6.204237608359296e-06,
"loss": 0.2042,
"step": 906
},
{
"epoch": 0.8460820895522388,
"grad_norm": 1.2412676307218935,
"learning_rate": 6.197125512400555e-06,
"loss": 0.195,
"step": 907
},
{
"epoch": 0.8470149253731343,
"grad_norm": 1.216660198794667,
"learning_rate": 6.190010846106446e-06,
"loss": 0.2033,
"step": 908
},
{
"epoch": 0.8479477611940298,
"grad_norm": 1.346208844228071,
"learning_rate": 6.182893624752796e-06,
"loss": 0.2585,
"step": 909
},
{
"epoch": 0.8488805970149254,
"grad_norm": 1.2971232768563918,
"learning_rate": 6.1757738636209115e-06,
"loss": 0.2304,
"step": 910
},
{
"epoch": 0.8498134328358209,
"grad_norm": 1.2139712586213292,
"learning_rate": 6.168651577997558e-06,
"loss": 0.1774,
"step": 911
},
{
"epoch": 0.8507462686567164,
"grad_norm": 1.312466911856846,
"learning_rate": 6.161526783174917e-06,
"loss": 0.2283,
"step": 912
},
{
"epoch": 0.851679104477612,
"grad_norm": 1.304590817855252,
"learning_rate": 6.154399494450559e-06,
"loss": 0.2139,
"step": 913
},
{
"epoch": 0.8526119402985075,
"grad_norm": 1.2634075009111383,
"learning_rate": 6.14726972712741e-06,
"loss": 0.2252,
"step": 914
},
{
"epoch": 0.8535447761194029,
"grad_norm": 1.2470179967350044,
"learning_rate": 6.140137496513718e-06,
"loss": 0.1914,
"step": 915
},
{
"epoch": 0.8544776119402985,
"grad_norm": 1.2646976296163095,
"learning_rate": 6.1330028179230185e-06,
"loss": 0.2066,
"step": 916
},
{
"epoch": 0.855410447761194,
"grad_norm": 1.1803232608347158,
"learning_rate": 6.125865706674103e-06,
"loss": 0.1708,
"step": 917
},
{
"epoch": 0.8563432835820896,
"grad_norm": 1.2689825513910395,
"learning_rate": 6.1187261780909835e-06,
"loss": 0.2114,
"step": 918
},
{
"epoch": 0.8572761194029851,
"grad_norm": 1.2346460755363526,
"learning_rate": 6.111584247502871e-06,
"loss": 0.2333,
"step": 919
},
{
"epoch": 0.8582089552238806,
"grad_norm": 1.260220733261907,
"learning_rate": 6.104439930244125e-06,
"loss": 0.18,
"step": 920
},
{
"epoch": 0.8591417910447762,
"grad_norm": 1.285111466705012,
"learning_rate": 6.0972932416542326e-06,
"loss": 0.2324,
"step": 921
},
{
"epoch": 0.8600746268656716,
"grad_norm": 1.2460248488612586,
"learning_rate": 6.090144197077774e-06,
"loss": 0.2397,
"step": 922
},
{
"epoch": 0.8610074626865671,
"grad_norm": 1.3234609423589674,
"learning_rate": 6.082992811864385e-06,
"loss": 0.2126,
"step": 923
},
{
"epoch": 0.8619402985074627,
"grad_norm": 1.2509638402578742,
"learning_rate": 6.075839101368728e-06,
"loss": 0.2047,
"step": 924
},
{
"epoch": 0.8628731343283582,
"grad_norm": 1.360932430546119,
"learning_rate": 6.068683080950458e-06,
"loss": 0.2705,
"step": 925
},
{
"epoch": 0.8638059701492538,
"grad_norm": 1.224176428108218,
"learning_rate": 6.061524765974191e-06,
"loss": 0.2087,
"step": 926
},
{
"epoch": 0.8647388059701493,
"grad_norm": 1.2150273756251704,
"learning_rate": 6.054364171809467e-06,
"loss": 0.1813,
"step": 927
},
{
"epoch": 0.8656716417910447,
"grad_norm": 1.3381501042482085,
"learning_rate": 6.047201313830724e-06,
"loss": 0.2353,
"step": 928
},
{
"epoch": 0.8666044776119403,
"grad_norm": 1.4149342164499332,
"learning_rate": 6.040036207417252e-06,
"loss": 0.2405,
"step": 929
},
{
"epoch": 0.8675373134328358,
"grad_norm": 1.2226530757677798,
"learning_rate": 6.032868867953181e-06,
"loss": 0.2066,
"step": 930
},
{
"epoch": 0.8684701492537313,
"grad_norm": 1.3101998121678258,
"learning_rate": 6.025699310827423e-06,
"loss": 0.2337,
"step": 931
},
{
"epoch": 0.8694029850746269,
"grad_norm": 1.257761679103726,
"learning_rate": 6.01852755143366e-06,
"loss": 0.1732,
"step": 932
},
{
"epoch": 0.8703358208955224,
"grad_norm": 1.2844011334095315,
"learning_rate": 6.011353605170303e-06,
"loss": 0.2174,
"step": 933
},
{
"epoch": 0.871268656716418,
"grad_norm": 1.2817045648391725,
"learning_rate": 6.004177487440448e-06,
"loss": 0.2065,
"step": 934
},
{
"epoch": 0.8722014925373134,
"grad_norm": 1.253138318239424,
"learning_rate": 5.996999213651866e-06,
"loss": 0.2031,
"step": 935
},
{
"epoch": 0.8731343283582089,
"grad_norm": 1.3346763040111453,
"learning_rate": 5.98981879921695e-06,
"loss": 0.2113,
"step": 936
},
{
"epoch": 0.8740671641791045,
"grad_norm": 1.23116222222353,
"learning_rate": 5.982636259552691e-06,
"loss": 0.2099,
"step": 937
},
{
"epoch": 0.875,
"grad_norm": 1.204120206628589,
"learning_rate": 5.975451610080643e-06,
"loss": 0.1834,
"step": 938
},
{
"epoch": 0.8759328358208955,
"grad_norm": 1.2206632469272407,
"learning_rate": 5.968264866226888e-06,
"loss": 0.201,
"step": 939
},
{
"epoch": 0.8768656716417911,
"grad_norm": 1.374354159020404,
"learning_rate": 5.961076043422011e-06,
"loss": 0.2338,
"step": 940
},
{
"epoch": 0.8777985074626866,
"grad_norm": 1.3883837341962375,
"learning_rate": 5.953885157101054e-06,
"loss": 0.287,
"step": 941
},
{
"epoch": 0.878731343283582,
"grad_norm": 1.3503256511984503,
"learning_rate": 5.9466922227034915e-06,
"loss": 0.224,
"step": 942
},
{
"epoch": 0.8796641791044776,
"grad_norm": 1.4151723372695262,
"learning_rate": 5.939497255673197e-06,
"loss": 0.258,
"step": 943
},
{
"epoch": 0.8805970149253731,
"grad_norm": 1.3110628245283942,
"learning_rate": 5.932300271458406e-06,
"loss": 0.2009,
"step": 944
},
{
"epoch": 0.8815298507462687,
"grad_norm": 1.2922411186960756,
"learning_rate": 5.925101285511687e-06,
"loss": 0.1875,
"step": 945
},
{
"epoch": 0.8824626865671642,
"grad_norm": 1.2244793038262138,
"learning_rate": 5.9179003132899075e-06,
"loss": 0.214,
"step": 946
},
{
"epoch": 0.8833955223880597,
"grad_norm": 1.276674565595725,
"learning_rate": 5.910697370254195e-06,
"loss": 0.2237,
"step": 947
},
{
"epoch": 0.8843283582089553,
"grad_norm": 1.2332094258137027,
"learning_rate": 5.90349247186991e-06,
"loss": 0.1976,
"step": 948
},
{
"epoch": 0.8852611940298507,
"grad_norm": 1.1889954764792152,
"learning_rate": 5.8962856336066175e-06,
"loss": 0.1948,
"step": 949
},
{
"epoch": 0.8861940298507462,
"grad_norm": 1.3664765584935996,
"learning_rate": 5.889076870938041e-06,
"loss": 0.2504,
"step": 950
},
{
"epoch": 0.8871268656716418,
"grad_norm": 1.3032321848741621,
"learning_rate": 5.881866199342035e-06,
"loss": 0.2608,
"step": 951
},
{
"epoch": 0.8880597014925373,
"grad_norm": 1.3377092395995163,
"learning_rate": 5.874653634300555e-06,
"loss": 0.2043,
"step": 952
},
{
"epoch": 0.8889925373134329,
"grad_norm": 1.3320395505868632,
"learning_rate": 5.867439191299629e-06,
"loss": 0.2566,
"step": 953
},
{
"epoch": 0.8899253731343284,
"grad_norm": 1.3630820848349416,
"learning_rate": 5.860222885829302e-06,
"loss": 0.2373,
"step": 954
},
{
"epoch": 0.8908582089552238,
"grad_norm": 1.3345807317360974,
"learning_rate": 5.853004733383631e-06,
"loss": 0.2438,
"step": 955
},
{
"epoch": 0.8917910447761194,
"grad_norm": 1.278615606825059,
"learning_rate": 5.845784749460632e-06,
"loss": 0.2104,
"step": 956
},
{
"epoch": 0.8927238805970149,
"grad_norm": 1.2799506965670524,
"learning_rate": 5.838562949562257e-06,
"loss": 0.2288,
"step": 957
},
{
"epoch": 0.8936567164179104,
"grad_norm": 1.2913790288284412,
"learning_rate": 5.831339349194352e-06,
"loss": 0.2114,
"step": 958
},
{
"epoch": 0.894589552238806,
"grad_norm": 1.2194240810958283,
"learning_rate": 5.824113963866635e-06,
"loss": 0.1746,
"step": 959
},
{
"epoch": 0.8955223880597015,
"grad_norm": 1.2703407850964368,
"learning_rate": 5.816886809092651e-06,
"loss": 0.2049,
"step": 960
},
{
"epoch": 0.8964552238805971,
"grad_norm": 1.3099986058972541,
"learning_rate": 5.809657900389749e-06,
"loss": 0.197,
"step": 961
},
{
"epoch": 0.8973880597014925,
"grad_norm": 1.376627956546677,
"learning_rate": 5.802427253279042e-06,
"loss": 0.1987,
"step": 962
},
{
"epoch": 0.898320895522388,
"grad_norm": 1.4401252624901832,
"learning_rate": 5.795194883285371e-06,
"loss": 0.2391,
"step": 963
},
{
"epoch": 0.8992537313432836,
"grad_norm": 1.2451440182815314,
"learning_rate": 5.787960805937283e-06,
"loss": 0.2011,
"step": 964
},
{
"epoch": 0.9001865671641791,
"grad_norm": 1.2173276185663062,
"learning_rate": 5.780725036766988e-06,
"loss": 0.2017,
"step": 965
},
{
"epoch": 0.9011194029850746,
"grad_norm": 1.2605780909868989,
"learning_rate": 5.773487591310329e-06,
"loss": 0.2029,
"step": 966
},
{
"epoch": 0.9020522388059702,
"grad_norm": 1.1729539366805513,
"learning_rate": 5.7662484851067435e-06,
"loss": 0.1764,
"step": 967
},
{
"epoch": 0.9029850746268657,
"grad_norm": 1.2941189951902425,
"learning_rate": 5.759007733699245e-06,
"loss": 0.2276,
"step": 968
},
{
"epoch": 0.9039179104477612,
"grad_norm": 1.4279132641665215,
"learning_rate": 5.751765352634369e-06,
"loss": 0.2741,
"step": 969
},
{
"epoch": 0.9048507462686567,
"grad_norm": 1.1819532323786706,
"learning_rate": 5.7445213574621565e-06,
"loss": 0.1939,
"step": 970
},
{
"epoch": 0.9057835820895522,
"grad_norm": 1.2426120494149469,
"learning_rate": 5.73727576373611e-06,
"loss": 0.1947,
"step": 971
},
{
"epoch": 0.9067164179104478,
"grad_norm": 1.337603891990572,
"learning_rate": 5.730028587013168e-06,
"loss": 0.2336,
"step": 972
},
{
"epoch": 0.9076492537313433,
"grad_norm": 1.1832053184869655,
"learning_rate": 5.722779842853665e-06,
"loss": 0.2092,
"step": 973
},
{
"epoch": 0.9085820895522388,
"grad_norm": 1.2401664664738221,
"learning_rate": 5.715529546821303e-06,
"loss": 0.1889,
"step": 974
},
{
"epoch": 0.9095149253731343,
"grad_norm": 1.2853976947091192,
"learning_rate": 5.708277714483114e-06,
"loss": 0.244,
"step": 975
},
{
"epoch": 0.9104477611940298,
"grad_norm": 1.0995739573370429,
"learning_rate": 5.701024361409431e-06,
"loss": 0.1633,
"step": 976
},
{
"epoch": 0.9113805970149254,
"grad_norm": 1.3409416136518981,
"learning_rate": 5.693769503173847e-06,
"loss": 0.2369,
"step": 977
},
{
"epoch": 0.9123134328358209,
"grad_norm": 1.2983356696907475,
"learning_rate": 5.6865131553531925e-06,
"loss": 0.2696,
"step": 978
},
{
"epoch": 0.9132462686567164,
"grad_norm": 1.2323698861651475,
"learning_rate": 5.679255333527498e-06,
"loss": 0.1971,
"step": 979
},
{
"epoch": 0.914179104477612,
"grad_norm": 1.314131614199844,
"learning_rate": 5.671996053279949e-06,
"loss": 0.2168,
"step": 980
},
{
"epoch": 0.9151119402985075,
"grad_norm": 1.2957854875866097,
"learning_rate": 5.664735330196871e-06,
"loss": 0.2177,
"step": 981
},
{
"epoch": 0.9160447761194029,
"grad_norm": 1.3538437754250945,
"learning_rate": 5.657473179867686e-06,
"loss": 0.2457,
"step": 982
},
{
"epoch": 0.9169776119402985,
"grad_norm": 1.2207256127091906,
"learning_rate": 5.6502096178848786e-06,
"loss": 0.1802,
"step": 983
},
{
"epoch": 0.917910447761194,
"grad_norm": 1.228820602876557,
"learning_rate": 5.642944659843962e-06,
"loss": 0.2141,
"step": 984
},
{
"epoch": 0.9188432835820896,
"grad_norm": 1.2793579747049237,
"learning_rate": 5.635678321343453e-06,
"loss": 0.2037,
"step": 985
},
{
"epoch": 0.9197761194029851,
"grad_norm": 1.3437393655992524,
"learning_rate": 5.628410617984828e-06,
"loss": 0.2008,
"step": 986
},
{
"epoch": 0.9207089552238806,
"grad_norm": 1.2489129719017027,
"learning_rate": 5.6211415653724965e-06,
"loss": 0.2046,
"step": 987
},
{
"epoch": 0.9216417910447762,
"grad_norm": 1.2894068857021934,
"learning_rate": 5.613871179113761e-06,
"loss": 0.2137,
"step": 988
},
{
"epoch": 0.9225746268656716,
"grad_norm": 1.319224823579118,
"learning_rate": 5.606599474818793e-06,
"loss": 0.2477,
"step": 989
},
{
"epoch": 0.9235074626865671,
"grad_norm": 1.2386689277056102,
"learning_rate": 5.5993264681005875e-06,
"loss": 0.2158,
"step": 990
},
{
"epoch": 0.9244402985074627,
"grad_norm": 1.3325603148451477,
"learning_rate": 5.592052174574942e-06,
"loss": 0.2087,
"step": 991
},
{
"epoch": 0.9253731343283582,
"grad_norm": 1.1456735516877268,
"learning_rate": 5.584776609860414e-06,
"loss": 0.1569,
"step": 992
},
{
"epoch": 0.9263059701492538,
"grad_norm": 1.2619437102530076,
"learning_rate": 5.5774997895782875e-06,
"loss": 0.2053,
"step": 993
},
{
"epoch": 0.9272388059701493,
"grad_norm": 1.2446625743320796,
"learning_rate": 5.570221729352549e-06,
"loss": 0.2323,
"step": 994
},
{
"epoch": 0.9281716417910447,
"grad_norm": 1.3816218116328933,
"learning_rate": 5.562942444809842e-06,
"loss": 0.2549,
"step": 995
},
{
"epoch": 0.9291044776119403,
"grad_norm": 1.3210090999184592,
"learning_rate": 5.555661951579443e-06,
"loss": 0.2193,
"step": 996
},
{
"epoch": 0.9300373134328358,
"grad_norm": 1.3003885784496516,
"learning_rate": 5.5483802652932165e-06,
"loss": 0.2111,
"step": 997
},
{
"epoch": 0.9309701492537313,
"grad_norm": 1.2050797563542268,
"learning_rate": 5.541097401585596e-06,
"loss": 0.1778,
"step": 998
},
{
"epoch": 0.9319029850746269,
"grad_norm": 1.177432522858866,
"learning_rate": 5.53381337609354e-06,
"loss": 0.1784,
"step": 999
},
{
"epoch": 0.9328358208955224,
"grad_norm": 1.1743424389077388,
"learning_rate": 5.5265282044565005e-06,
"loss": 0.1791,
"step": 1000
},
{
"epoch": 0.9328358208955224,
"eval_loss": 0.21907268464565277,
"eval_runtime": 3.4377,
"eval_samples_per_second": 25.308,
"eval_steps_per_second": 6.4,
"step": 1000
},
{
"epoch": 0.933768656716418,
"grad_norm": 1.1635684511765956,
"learning_rate": 5.519241902316392e-06,
"loss": 0.1816,
"step": 1001
},
{
"epoch": 0.9347014925373134,
"grad_norm": 1.3832271905864557,
"learning_rate": 5.511954485317558e-06,
"loss": 0.2517,
"step": 1002
},
{
"epoch": 0.9356343283582089,
"grad_norm": 1.2598044090680305,
"learning_rate": 5.504665969106731e-06,
"loss": 0.1921,
"step": 1003
},
{
"epoch": 0.9365671641791045,
"grad_norm": 1.2368589807753354,
"learning_rate": 5.497376369333005e-06,
"loss": 0.1973,
"step": 1004
},
{
"epoch": 0.9375,
"grad_norm": 1.3209236079778428,
"learning_rate": 5.490085701647805e-06,
"loss": 0.1921,
"step": 1005
},
{
"epoch": 0.9384328358208955,
"grad_norm": 1.253108089681956,
"learning_rate": 5.482793981704842e-06,
"loss": 0.1515,
"step": 1006
},
{
"epoch": 0.9393656716417911,
"grad_norm": 1.278153011620517,
"learning_rate": 5.475501225160092e-06,
"loss": 0.2183,
"step": 1007
},
{
"epoch": 0.9402985074626866,
"grad_norm": 1.3035057923325821,
"learning_rate": 5.468207447671755e-06,
"loss": 0.2398,
"step": 1008
},
{
"epoch": 0.941231343283582,
"grad_norm": 1.2568883295893882,
"learning_rate": 5.4609126649002206e-06,
"loss": 0.197,
"step": 1009
},
{
"epoch": 0.9421641791044776,
"grad_norm": 1.3164928013180202,
"learning_rate": 5.45361689250804e-06,
"loss": 0.2031,
"step": 1010
},
{
"epoch": 0.9430970149253731,
"grad_norm": 1.2609431407237786,
"learning_rate": 5.446320146159888e-06,
"loss": 0.2138,
"step": 1011
},
{
"epoch": 0.9440298507462687,
"grad_norm": 1.287828094306322,
"learning_rate": 5.43902244152253e-06,
"loss": 0.2007,
"step": 1012
},
{
"epoch": 0.9449626865671642,
"grad_norm": 1.2224301172959635,
"learning_rate": 5.431723794264789e-06,
"loss": 0.191,
"step": 1013
},
{
"epoch": 0.9458955223880597,
"grad_norm": 1.2296770029919053,
"learning_rate": 5.424424220057514e-06,
"loss": 0.2002,
"step": 1014
},
{
"epoch": 0.9468283582089553,
"grad_norm": 1.375993340109819,
"learning_rate": 5.417123734573541e-06,
"loss": 0.2306,
"step": 1015
},
{
"epoch": 0.9477611940298507,
"grad_norm": 1.335234251332967,
"learning_rate": 5.409822353487666e-06,
"loss": 0.2509,
"step": 1016
},
{
"epoch": 0.9486940298507462,
"grad_norm": 1.2393282472383353,
"learning_rate": 5.402520092476604e-06,
"loss": 0.2009,
"step": 1017
},
{
"epoch": 0.9496268656716418,
"grad_norm": 1.2356611423329757,
"learning_rate": 5.395216967218961e-06,
"loss": 0.2063,
"step": 1018
},
{
"epoch": 0.9505597014925373,
"grad_norm": 1.307932934613099,
"learning_rate": 5.387912993395203e-06,
"loss": 0.2324,
"step": 1019
},
{
"epoch": 0.9514925373134329,
"grad_norm": 1.3243498570571994,
"learning_rate": 5.38060818668761e-06,
"loss": 0.2388,
"step": 1020
},
{
"epoch": 0.9524253731343284,
"grad_norm": 1.1645371806053033,
"learning_rate": 5.373302562780256e-06,
"loss": 0.1886,
"step": 1021
},
{
"epoch": 0.9533582089552238,
"grad_norm": 1.297730287399217,
"learning_rate": 5.365996137358969e-06,
"loss": 0.2073,
"step": 1022
},
{
"epoch": 0.9542910447761194,
"grad_norm": 1.250457649249079,
"learning_rate": 5.358688926111293e-06,
"loss": 0.189,
"step": 1023
},
{
"epoch": 0.9552238805970149,
"grad_norm": 1.2705310178726523,
"learning_rate": 5.351380944726465e-06,
"loss": 0.2266,
"step": 1024
},
{
"epoch": 0.9561567164179104,
"grad_norm": 1.383325173135325,
"learning_rate": 5.344072208895376e-06,
"loss": 0.2612,
"step": 1025
},
{
"epoch": 0.957089552238806,
"grad_norm": 1.2801631073608817,
"learning_rate": 5.33676273431053e-06,
"loss": 0.2197,
"step": 1026
},
{
"epoch": 0.9580223880597015,
"grad_norm": 1.313531018381848,
"learning_rate": 5.329452536666025e-06,
"loss": 0.2202,
"step": 1027
},
{
"epoch": 0.9589552238805971,
"grad_norm": 1.3086085921045902,
"learning_rate": 5.322141631657507e-06,
"loss": 0.2456,
"step": 1028
},
{
"epoch": 0.9598880597014925,
"grad_norm": 1.257199461783946,
"learning_rate": 5.314830034982142e-06,
"loss": 0.2229,
"step": 1029
},
{
"epoch": 0.960820895522388,
"grad_norm": 1.2749870374862313,
"learning_rate": 5.30751776233858e-06,
"loss": 0.2209,
"step": 1030
},
{
"epoch": 0.9617537313432836,
"grad_norm": 1.2394504236431108,
"learning_rate": 5.300204829426923e-06,
"loss": 0.2157,
"step": 1031
},
{
"epoch": 0.9626865671641791,
"grad_norm": 1.2209777484060933,
"learning_rate": 5.292891251948694e-06,
"loss": 0.2167,
"step": 1032
},
{
"epoch": 0.9636194029850746,
"grad_norm": 1.2368359051230373,
"learning_rate": 5.2855770456067936e-06,
"loss": 0.1912,
"step": 1033
},
{
"epoch": 0.9645522388059702,
"grad_norm": 1.3016803387713154,
"learning_rate": 5.278262226105476e-06,
"loss": 0.201,
"step": 1034
},
{
"epoch": 0.9654850746268657,
"grad_norm": 1.2524173615322032,
"learning_rate": 5.270946809150315e-06,
"loss": 0.2159,
"step": 1035
},
{
"epoch": 0.9664179104477612,
"grad_norm": 1.2683630285568621,
"learning_rate": 5.263630810448161e-06,
"loss": 0.2101,
"step": 1036
},
{
"epoch": 0.9673507462686567,
"grad_norm": 1.3024423342612523,
"learning_rate": 5.256314245707118e-06,
"loss": 0.2346,
"step": 1037
},
{
"epoch": 0.9682835820895522,
"grad_norm": 1.2352735531904246,
"learning_rate": 5.2489971306365025e-06,
"loss": 0.1927,
"step": 1038
},
{
"epoch": 0.9692164179104478,
"grad_norm": 1.2910473126613706,
"learning_rate": 5.2416794809468145e-06,
"loss": 0.1807,
"step": 1039
},
{
"epoch": 0.9701492537313433,
"grad_norm": 1.2631389614599406,
"learning_rate": 5.234361312349701e-06,
"loss": 0.2146,
"step": 1040
},
{
"epoch": 0.9710820895522388,
"grad_norm": 1.2549022169376725,
"learning_rate": 5.227042640557924e-06,
"loss": 0.2111,
"step": 1041
},
{
"epoch": 0.9720149253731343,
"grad_norm": 1.2247144243410715,
"learning_rate": 5.219723481285326e-06,
"loss": 0.1699,
"step": 1042
},
{
"epoch": 0.9729477611940298,
"grad_norm": 1.257160905866937,
"learning_rate": 5.212403850246794e-06,
"loss": 0.2209,
"step": 1043
},
{
"epoch": 0.9738805970149254,
"grad_norm": 1.2629034927846967,
"learning_rate": 5.205083763158228e-06,
"loss": 0.2412,
"step": 1044
},
{
"epoch": 0.9748134328358209,
"grad_norm": 1.2990007211575794,
"learning_rate": 5.197763235736512e-06,
"loss": 0.1919,
"step": 1045
},
{
"epoch": 0.9757462686567164,
"grad_norm": 1.311144641480511,
"learning_rate": 5.190442283699472e-06,
"loss": 0.2012,
"step": 1046
},
{
"epoch": 0.976679104477612,
"grad_norm": 1.3260780293625873,
"learning_rate": 5.183120922765842e-06,
"loss": 0.2002,
"step": 1047
},
{
"epoch": 0.9776119402985075,
"grad_norm": 1.188604340910306,
"learning_rate": 5.175799168655241e-06,
"loss": 0.1973,
"step": 1048
},
{
"epoch": 0.9785447761194029,
"grad_norm": 1.315357343596737,
"learning_rate": 5.168477037088129e-06,
"loss": 0.252,
"step": 1049
},
{
"epoch": 0.9794776119402985,
"grad_norm": 1.2147122512747308,
"learning_rate": 5.161154543785773e-06,
"loss": 0.1833,
"step": 1050
},
{
"epoch": 0.980410447761194,
"grad_norm": 1.3545401003703426,
"learning_rate": 5.153831704470224e-06,
"loss": 0.222,
"step": 1051
},
{
"epoch": 0.9813432835820896,
"grad_norm": 1.34778657278548,
"learning_rate": 5.146508534864267e-06,
"loss": 0.1957,
"step": 1052
},
{
"epoch": 0.9822761194029851,
"grad_norm": 1.186757360343082,
"learning_rate": 5.1391850506914055e-06,
"loss": 0.2058,
"step": 1053
},
{
"epoch": 0.9832089552238806,
"grad_norm": 1.2544761756808898,
"learning_rate": 5.131861267675813e-06,
"loss": 0.203,
"step": 1054
},
{
"epoch": 0.9841417910447762,
"grad_norm": 1.310346335595727,
"learning_rate": 5.124537201542303e-06,
"loss": 0.2254,
"step": 1055
},
{
"epoch": 0.9850746268656716,
"grad_norm": 1.3016888883046014,
"learning_rate": 5.117212868016303e-06,
"loss": 0.1994,
"step": 1056
},
{
"epoch": 0.9860074626865671,
"grad_norm": 1.2638298711530747,
"learning_rate": 5.109888282823809e-06,
"loss": 0.2293,
"step": 1057
},
{
"epoch": 0.9869402985074627,
"grad_norm": 1.2883471738550145,
"learning_rate": 5.10256346169136e-06,
"loss": 0.2185,
"step": 1058
},
{
"epoch": 0.9878731343283582,
"grad_norm": 1.4541116567456558,
"learning_rate": 5.095238420346e-06,
"loss": 0.2787,
"step": 1059
},
{
"epoch": 0.9888059701492538,
"grad_norm": 1.2579334018092971,
"learning_rate": 5.087913174515247e-06,
"loss": 0.1723,
"step": 1060
},
{
"epoch": 0.9897388059701493,
"grad_norm": 1.2457324155617162,
"learning_rate": 5.080587739927061e-06,
"loss": 0.1943,
"step": 1061
},
{
"epoch": 0.9906716417910447,
"grad_norm": 1.1872127809722754,
"learning_rate": 5.073262132309801e-06,
"loss": 0.1711,
"step": 1062
},
{
"epoch": 0.9916044776119403,
"grad_norm": 1.1823669677918365,
"learning_rate": 5.0659363673922e-06,
"loss": 0.1954,
"step": 1063
},
{
"epoch": 0.9925373134328358,
"grad_norm": 1.338060737656068,
"learning_rate": 5.058610460903332e-06,
"loss": 0.2344,
"step": 1064
},
{
"epoch": 0.9934701492537313,
"grad_norm": 1.2439993576220725,
"learning_rate": 5.0512844285725715e-06,
"loss": 0.2049,
"step": 1065
},
{
"epoch": 0.9944029850746269,
"grad_norm": 1.2745912155788846,
"learning_rate": 5.043958286129562e-06,
"loss": 0.2073,
"step": 1066
},
{
"epoch": 0.9953358208955224,
"grad_norm": 1.2268753263397718,
"learning_rate": 5.036632049304189e-06,
"loss": 0.1862,
"step": 1067
},
{
"epoch": 0.996268656716418,
"grad_norm": 1.3112166103200893,
"learning_rate": 5.029305733826533e-06,
"loss": 0.2043,
"step": 1068
},
{
"epoch": 0.9972014925373134,
"grad_norm": 1.2980556932588276,
"learning_rate": 5.021979355426851e-06,
"loss": 0.1931,
"step": 1069
},
{
"epoch": 0.9981343283582089,
"grad_norm": 1.3165921241201037,
"learning_rate": 5.0146529298355305e-06,
"loss": 0.2276,
"step": 1070
},
{
"epoch": 0.9990671641791045,
"grad_norm": 1.1963395543514876,
"learning_rate": 5.007326472783061e-06,
"loss": 0.1893,
"step": 1071
},
{
"epoch": 1.0,
"grad_norm": 1.236538164498631,
"learning_rate": 5e-06,
"loss": 0.1633,
"step": 1072
},
{
"epoch": 1.0009328358208955,
"grad_norm": 1.2479547546792191,
"learning_rate": 4.992673527216939e-06,
"loss": 0.1578,
"step": 1073
},
{
"epoch": 1.001865671641791,
"grad_norm": 1.3305876874607747,
"learning_rate": 4.985347070164471e-06,
"loss": 0.1739,
"step": 1074
},
{
"epoch": 1.0027985074626866,
"grad_norm": 1.237995595567211,
"learning_rate": 4.97802064457315e-06,
"loss": 0.1947,
"step": 1075
},
{
"epoch": 1.0037313432835822,
"grad_norm": 1.204780028041673,
"learning_rate": 4.970694266173467e-06,
"loss": 0.1933,
"step": 1076
},
{
"epoch": 1.0046641791044777,
"grad_norm": 1.276059263832681,
"learning_rate": 4.963367950695814e-06,
"loss": 0.1783,
"step": 1077
},
{
"epoch": 1.0055970149253732,
"grad_norm": 1.309820410447539,
"learning_rate": 4.956041713870439e-06,
"loss": 0.2121,
"step": 1078
},
{
"epoch": 1.0065298507462686,
"grad_norm": 1.4483451181535176,
"learning_rate": 4.948715571427432e-06,
"loss": 0.1996,
"step": 1079
},
{
"epoch": 1.007462686567164,
"grad_norm": 1.3271595457277217,
"learning_rate": 4.94138953909667e-06,
"loss": 0.1788,
"step": 1080
},
{
"epoch": 1.0083955223880596,
"grad_norm": 1.3870634078200816,
"learning_rate": 4.934063632607802e-06,
"loss": 0.1837,
"step": 1081
},
{
"epoch": 1.0093283582089552,
"grad_norm": 1.2307753356456927,
"learning_rate": 4.9267378676902014e-06,
"loss": 0.1558,
"step": 1082
},
{
"epoch": 1.0102611940298507,
"grad_norm": 1.38292309795395,
"learning_rate": 4.9194122600729396e-06,
"loss": 0.1834,
"step": 1083
},
{
"epoch": 1.0111940298507462,
"grad_norm": 1.2878361217427439,
"learning_rate": 4.9120868254847535e-06,
"loss": 0.1501,
"step": 1084
},
{
"epoch": 1.0121268656716418,
"grad_norm": 1.3923743816580727,
"learning_rate": 4.9047615796540014e-06,
"loss": 0.1943,
"step": 1085
},
{
"epoch": 1.0130597014925373,
"grad_norm": 1.3496982501402985,
"learning_rate": 4.897436538308641e-06,
"loss": 0.1889,
"step": 1086
},
{
"epoch": 1.0139925373134329,
"grad_norm": 1.2551367521987897,
"learning_rate": 4.890111717176193e-06,
"loss": 0.179,
"step": 1087
},
{
"epoch": 1.0149253731343284,
"grad_norm": 1.2732948826314543,
"learning_rate": 4.882787131983698e-06,
"loss": 0.1594,
"step": 1088
},
{
"epoch": 1.015858208955224,
"grad_norm": 1.2430616813440283,
"learning_rate": 4.875462798457698e-06,
"loss": 0.1569,
"step": 1089
},
{
"epoch": 1.0167910447761195,
"grad_norm": 1.2440828869258598,
"learning_rate": 4.8681387323241895e-06,
"loss": 0.1618,
"step": 1090
},
{
"epoch": 1.017723880597015,
"grad_norm": 1.24177612792635,
"learning_rate": 4.860814949308595e-06,
"loss": 0.1529,
"step": 1091
},
{
"epoch": 1.0186567164179103,
"grad_norm": 1.2695148146798123,
"learning_rate": 4.853491465135733e-06,
"loss": 0.1898,
"step": 1092
},
{
"epoch": 1.0195895522388059,
"grad_norm": 1.3003959474708364,
"learning_rate": 4.8461682955297795e-06,
"loss": 0.1832,
"step": 1093
},
{
"epoch": 1.0205223880597014,
"grad_norm": 1.2117092616641418,
"learning_rate": 4.83884545621423e-06,
"loss": 0.1651,
"step": 1094
},
{
"epoch": 1.021455223880597,
"grad_norm": 1.2739216395954265,
"learning_rate": 4.831522962911874e-06,
"loss": 0.1517,
"step": 1095
},
{
"epoch": 1.0223880597014925,
"grad_norm": 1.355545800661616,
"learning_rate": 4.82420083134476e-06,
"loss": 0.2124,
"step": 1096
},
{
"epoch": 1.023320895522388,
"grad_norm": 1.3120936898639997,
"learning_rate": 4.8168790772341595e-06,
"loss": 0.2069,
"step": 1097
},
{
"epoch": 1.0242537313432836,
"grad_norm": 1.4257530106077783,
"learning_rate": 4.80955771630053e-06,
"loss": 0.221,
"step": 1098
},
{
"epoch": 1.025186567164179,
"grad_norm": 1.2501902868432049,
"learning_rate": 4.8022367642634886e-06,
"loss": 0.1711,
"step": 1099
},
{
"epoch": 1.0261194029850746,
"grad_norm": 1.3281722597905856,
"learning_rate": 4.794916236841773e-06,
"loss": 0.1585,
"step": 1100
},
{
"epoch": 1.0270522388059702,
"grad_norm": 1.3188983055502095,
"learning_rate": 4.787596149753208e-06,
"loss": 0.1633,
"step": 1101
},
{
"epoch": 1.0279850746268657,
"grad_norm": 1.3378048102111348,
"learning_rate": 4.780276518714675e-06,
"loss": 0.1648,
"step": 1102
},
{
"epoch": 1.0289179104477613,
"grad_norm": 1.2067917710652079,
"learning_rate": 4.7729573594420765e-06,
"loss": 0.1482,
"step": 1103
},
{
"epoch": 1.0298507462686568,
"grad_norm": 1.1149191069835047,
"learning_rate": 4.765638687650299e-06,
"loss": 0.11,
"step": 1104
},
{
"epoch": 1.0307835820895523,
"grad_norm": 1.2203251235917674,
"learning_rate": 4.758320519053186e-06,
"loss": 0.1484,
"step": 1105
},
{
"epoch": 1.0317164179104477,
"grad_norm": 1.2189178116466146,
"learning_rate": 4.7510028693635e-06,
"loss": 0.1427,
"step": 1106
},
{
"epoch": 1.0326492537313432,
"grad_norm": 1.372735938568047,
"learning_rate": 4.743685754292885e-06,
"loss": 0.2167,
"step": 1107
},
{
"epoch": 1.0335820895522387,
"grad_norm": 1.3282242998736706,
"learning_rate": 4.736369189551841e-06,
"loss": 0.1985,
"step": 1108
},
{
"epoch": 1.0345149253731343,
"grad_norm": 1.4369705125880448,
"learning_rate": 4.729053190849686e-06,
"loss": 0.2087,
"step": 1109
},
{
"epoch": 1.0354477611940298,
"grad_norm": 1.1942738930700767,
"learning_rate": 4.721737773894525e-06,
"loss": 0.1347,
"step": 1110
},
{
"epoch": 1.0363805970149254,
"grad_norm": 1.1913762459997832,
"learning_rate": 4.714422954393208e-06,
"loss": 0.1777,
"step": 1111
},
{
"epoch": 1.037313432835821,
"grad_norm": 1.2267283573790213,
"learning_rate": 4.7071087480513075e-06,
"loss": 0.1501,
"step": 1112
},
{
"epoch": 1.0382462686567164,
"grad_norm": 1.3046537812115568,
"learning_rate": 4.699795170573078e-06,
"loss": 0.1829,
"step": 1113
},
{
"epoch": 1.039179104477612,
"grad_norm": 1.198889462535983,
"learning_rate": 4.692482237661421e-06,
"loss": 0.1508,
"step": 1114
},
{
"epoch": 1.0401119402985075,
"grad_norm": 1.259168735576952,
"learning_rate": 4.6851699650178595e-06,
"loss": 0.1642,
"step": 1115
},
{
"epoch": 1.041044776119403,
"grad_norm": 1.336907239087494,
"learning_rate": 4.677858368342495e-06,
"loss": 0.1912,
"step": 1116
},
{
"epoch": 1.0419776119402986,
"grad_norm": 1.2619832711180206,
"learning_rate": 4.670547463333976e-06,
"loss": 0.1455,
"step": 1117
},
{
"epoch": 1.0429104477611941,
"grad_norm": 1.220034852354567,
"learning_rate": 4.66323726568947e-06,
"loss": 0.1547,
"step": 1118
},
{
"epoch": 1.0438432835820897,
"grad_norm": 1.248237699247018,
"learning_rate": 4.655927791104627e-06,
"loss": 0.1445,
"step": 1119
},
{
"epoch": 1.044776119402985,
"grad_norm": 1.2788401711193869,
"learning_rate": 4.6486190552735375e-06,
"loss": 0.1766,
"step": 1120
},
{
"epoch": 1.0457089552238805,
"grad_norm": 1.370368413770226,
"learning_rate": 4.641311073888709e-06,
"loss": 0.194,
"step": 1121
},
{
"epoch": 1.046641791044776,
"grad_norm": 1.2631891199390823,
"learning_rate": 4.6340038626410335e-06,
"loss": 0.1752,
"step": 1122
},
{
"epoch": 1.0475746268656716,
"grad_norm": 1.1920491518946674,
"learning_rate": 4.626697437219746e-06,
"loss": 0.1248,
"step": 1123
},
{
"epoch": 1.0485074626865671,
"grad_norm": 1.2240491739822534,
"learning_rate": 4.619391813312391e-06,
"loss": 0.1625,
"step": 1124
},
{
"epoch": 1.0494402985074627,
"grad_norm": 1.319460412241081,
"learning_rate": 4.6120870066047976e-06,
"loss": 0.1971,
"step": 1125
},
{
"epoch": 1.0503731343283582,
"grad_norm": 1.264580192904606,
"learning_rate": 4.6047830327810396e-06,
"loss": 0.186,
"step": 1126
},
{
"epoch": 1.0513059701492538,
"grad_norm": 1.1887375529482427,
"learning_rate": 4.597479907523397e-06,
"loss": 0.142,
"step": 1127
},
{
"epoch": 1.0522388059701493,
"grad_norm": 1.3019193689654893,
"learning_rate": 4.590177646512335e-06,
"loss": 0.1574,
"step": 1128
},
{
"epoch": 1.0531716417910448,
"grad_norm": 1.3520770477490758,
"learning_rate": 4.5828762654264595e-06,
"loss": 0.2208,
"step": 1129
},
{
"epoch": 1.0541044776119404,
"grad_norm": 1.3539523658619874,
"learning_rate": 4.575575779942487e-06,
"loss": 0.2068,
"step": 1130
},
{
"epoch": 1.055037313432836,
"grad_norm": 1.2367294883717355,
"learning_rate": 4.568276205735211e-06,
"loss": 0.1532,
"step": 1131
},
{
"epoch": 1.0559701492537314,
"grad_norm": 1.2491884318330926,
"learning_rate": 4.560977558477471e-06,
"loss": 0.1519,
"step": 1132
},
{
"epoch": 1.0569029850746268,
"grad_norm": 1.19498379683984,
"learning_rate": 4.553679853840114e-06,
"loss": 0.1364,
"step": 1133
},
{
"epoch": 1.0578358208955223,
"grad_norm": 1.2111964940176165,
"learning_rate": 4.546383107491963e-06,
"loss": 0.1803,
"step": 1134
},
{
"epoch": 1.0587686567164178,
"grad_norm": 1.2421854687745804,
"learning_rate": 4.539087335099781e-06,
"loss": 0.1393,
"step": 1135
},
{
"epoch": 1.0597014925373134,
"grad_norm": 1.2448365427323471,
"learning_rate": 4.531792552328247e-06,
"loss": 0.1829,
"step": 1136
},
{
"epoch": 1.060634328358209,
"grad_norm": 1.3333692826121972,
"learning_rate": 4.52449877483991e-06,
"loss": 0.187,
"step": 1137
},
{
"epoch": 1.0615671641791045,
"grad_norm": 1.2354906525606948,
"learning_rate": 4.51720601829516e-06,
"loss": 0.1899,
"step": 1138
},
{
"epoch": 1.0625,
"grad_norm": 1.3201744820123211,
"learning_rate": 4.509914298352197e-06,
"loss": 0.2219,
"step": 1139
},
{
"epoch": 1.0634328358208955,
"grad_norm": 1.278633468940202,
"learning_rate": 4.502623630666997e-06,
"loss": 0.1682,
"step": 1140
},
{
"epoch": 1.064365671641791,
"grad_norm": 1.3164919074298664,
"learning_rate": 4.495334030893272e-06,
"loss": 0.1887,
"step": 1141
},
{
"epoch": 1.0652985074626866,
"grad_norm": 1.3431779551114107,
"learning_rate": 4.488045514682444e-06,
"loss": 0.2015,
"step": 1142
},
{
"epoch": 1.0662313432835822,
"grad_norm": 1.2656504973462495,
"learning_rate": 4.480758097683608e-06,
"loss": 0.1716,
"step": 1143
},
{
"epoch": 1.0671641791044777,
"grad_norm": 1.3709142303612718,
"learning_rate": 4.4734717955435e-06,
"loss": 0.1854,
"step": 1144
},
{
"epoch": 1.0680970149253732,
"grad_norm": 1.2494186095135305,
"learning_rate": 4.466186623906462e-06,
"loss": 0.1487,
"step": 1145
},
{
"epoch": 1.0690298507462686,
"grad_norm": 1.330911651469678,
"learning_rate": 4.458902598414407e-06,
"loss": 0.1765,
"step": 1146
},
{
"epoch": 1.069962686567164,
"grad_norm": 1.2014798451429747,
"learning_rate": 4.451619734706786e-06,
"loss": 0.135,
"step": 1147
},
{
"epoch": 1.0708955223880596,
"grad_norm": 1.335995551880163,
"learning_rate": 4.44433804842056e-06,
"loss": 0.1923,
"step": 1148
},
{
"epoch": 1.0718283582089552,
"grad_norm": 1.2401414131378654,
"learning_rate": 4.437057555190159e-06,
"loss": 0.1883,
"step": 1149
},
{
"epoch": 1.0727611940298507,
"grad_norm": 1.1528790815227568,
"learning_rate": 4.429778270647452e-06,
"loss": 0.1441,
"step": 1150
},
{
"epoch": 1.0736940298507462,
"grad_norm": 1.2902145418144173,
"learning_rate": 4.422500210421713e-06,
"loss": 0.1828,
"step": 1151
},
{
"epoch": 1.0746268656716418,
"grad_norm": 1.25051022835927,
"learning_rate": 4.415223390139588e-06,
"loss": 0.1621,
"step": 1152
},
{
"epoch": 1.0755597014925373,
"grad_norm": 1.2532151202153479,
"learning_rate": 4.40794782542506e-06,
"loss": 0.1615,
"step": 1153
},
{
"epoch": 1.0764925373134329,
"grad_norm": 1.483668421157478,
"learning_rate": 4.400673531899413e-06,
"loss": 0.2112,
"step": 1154
},
{
"epoch": 1.0774253731343284,
"grad_norm": 1.3062093686578884,
"learning_rate": 4.393400525181208e-06,
"loss": 0.164,
"step": 1155
},
{
"epoch": 1.078358208955224,
"grad_norm": 1.2299161462076307,
"learning_rate": 4.38612882088624e-06,
"loss": 0.1434,
"step": 1156
},
{
"epoch": 1.0792910447761195,
"grad_norm": 1.238818335888987,
"learning_rate": 4.378858434627504e-06,
"loss": 0.1559,
"step": 1157
},
{
"epoch": 1.080223880597015,
"grad_norm": 1.2915690997598344,
"learning_rate": 4.371589382015171e-06,
"loss": 0.1784,
"step": 1158
},
{
"epoch": 1.0811567164179103,
"grad_norm": 1.1839866337995881,
"learning_rate": 4.364321678656548e-06,
"loss": 0.1178,
"step": 1159
},
{
"epoch": 1.0820895522388059,
"grad_norm": 1.3599177349094425,
"learning_rate": 4.357055340156041e-06,
"loss": 0.192,
"step": 1160
},
{
"epoch": 1.0830223880597014,
"grad_norm": 1.223492563759451,
"learning_rate": 4.349790382115125e-06,
"loss": 0.1774,
"step": 1161
},
{
"epoch": 1.083955223880597,
"grad_norm": 1.3573643375419246,
"learning_rate": 4.342526820132316e-06,
"loss": 0.1902,
"step": 1162
},
{
"epoch": 1.0848880597014925,
"grad_norm": 1.2752241604652728,
"learning_rate": 4.335264669803131e-06,
"loss": 0.1893,
"step": 1163
},
{
"epoch": 1.085820895522388,
"grad_norm": 1.3819680352974013,
"learning_rate": 4.328003946720053e-06,
"loss": 0.2114,
"step": 1164
},
{
"epoch": 1.0867537313432836,
"grad_norm": 1.1909614809262572,
"learning_rate": 4.320744666472504e-06,
"loss": 0.1443,
"step": 1165
},
{
"epoch": 1.087686567164179,
"grad_norm": 1.2475137107433707,
"learning_rate": 4.313486844646808e-06,
"loss": 0.1764,
"step": 1166
},
{
"epoch": 1.0886194029850746,
"grad_norm": 1.2181237630025556,
"learning_rate": 4.3062304968261545e-06,
"loss": 0.1663,
"step": 1167
},
{
"epoch": 1.0895522388059702,
"grad_norm": 1.2425063726533858,
"learning_rate": 4.2989756385905715e-06,
"loss": 0.1384,
"step": 1168
},
{
"epoch": 1.0904850746268657,
"grad_norm": 1.3832152612220123,
"learning_rate": 4.291722285516887e-06,
"loss": 0.2491,
"step": 1169
},
{
"epoch": 1.0914179104477613,
"grad_norm": 1.3080535208233788,
"learning_rate": 4.284470453178698e-06,
"loss": 0.1993,
"step": 1170
},
{
"epoch": 1.0923507462686568,
"grad_norm": 1.3291999749207284,
"learning_rate": 4.277220157146335e-06,
"loss": 0.1574,
"step": 1171
},
{
"epoch": 1.0932835820895523,
"grad_norm": 1.3149288003758082,
"learning_rate": 4.269971412986833e-06,
"loss": 0.1649,
"step": 1172
},
{
"epoch": 1.0942164179104477,
"grad_norm": 1.3969699379485367,
"learning_rate": 4.262724236263892e-06,
"loss": 0.2198,
"step": 1173
},
{
"epoch": 1.0951492537313432,
"grad_norm": 1.2513590469723619,
"learning_rate": 4.255478642537846e-06,
"loss": 0.1558,
"step": 1174
},
{
"epoch": 1.0960820895522387,
"grad_norm": 1.2738684649625196,
"learning_rate": 4.248234647365632e-06,
"loss": 0.1623,
"step": 1175
},
{
"epoch": 1.0970149253731343,
"grad_norm": 1.3008995242875747,
"learning_rate": 4.240992266300757e-06,
"loss": 0.1873,
"step": 1176
},
{
"epoch": 1.0979477611940298,
"grad_norm": 1.3647443465246403,
"learning_rate": 4.233751514893257e-06,
"loss": 0.1809,
"step": 1177
},
{
"epoch": 1.0988805970149254,
"grad_norm": 1.2902770059265867,
"learning_rate": 4.226512408689674e-06,
"loss": 0.1627,
"step": 1178
},
{
"epoch": 1.099813432835821,
"grad_norm": 1.2552774636523052,
"learning_rate": 4.219274963233014e-06,
"loss": 0.1742,
"step": 1179
},
{
"epoch": 1.1007462686567164,
"grad_norm": 1.2994851316208804,
"learning_rate": 4.212039194062718e-06,
"loss": 0.1818,
"step": 1180
},
{
"epoch": 1.101679104477612,
"grad_norm": 1.2955747010389893,
"learning_rate": 4.20480511671463e-06,
"loss": 0.1775,
"step": 1181
},
{
"epoch": 1.1026119402985075,
"grad_norm": 1.3932391378184827,
"learning_rate": 4.19757274672096e-06,
"loss": 0.2347,
"step": 1182
},
{
"epoch": 1.103544776119403,
"grad_norm": 1.2655556196719828,
"learning_rate": 4.1903420996102515e-06,
"loss": 0.1898,
"step": 1183
},
{
"epoch": 1.1044776119402986,
"grad_norm": 1.3020632302296102,
"learning_rate": 4.183113190907349e-06,
"loss": 0.1584,
"step": 1184
},
{
"epoch": 1.1054104477611941,
"grad_norm": 1.3265349936085298,
"learning_rate": 4.175886036133366e-06,
"loss": 0.1927,
"step": 1185
},
{
"epoch": 1.1063432835820897,
"grad_norm": 1.31504432408727,
"learning_rate": 4.16866065080565e-06,
"loss": 0.1751,
"step": 1186
},
{
"epoch": 1.107276119402985,
"grad_norm": 1.200509069590439,
"learning_rate": 4.161437050437746e-06,
"loss": 0.1581,
"step": 1187
},
{
"epoch": 1.1082089552238805,
"grad_norm": 1.2910674844500896,
"learning_rate": 4.1542152505393694e-06,
"loss": 0.1575,
"step": 1188
},
{
"epoch": 1.109141791044776,
"grad_norm": 1.3637483518464766,
"learning_rate": 4.146995266616371e-06,
"loss": 0.2139,
"step": 1189
},
{
"epoch": 1.1100746268656716,
"grad_norm": 1.4447566279813335,
"learning_rate": 4.1397771141706995e-06,
"loss": 0.1985,
"step": 1190
},
{
"epoch": 1.1110074626865671,
"grad_norm": 1.328516864124165,
"learning_rate": 4.132560808700374e-06,
"loss": 0.1696,
"step": 1191
},
{
"epoch": 1.1119402985074627,
"grad_norm": 1.27769217641431,
"learning_rate": 4.125346365699446e-06,
"loss": 0.1906,
"step": 1192
},
{
"epoch": 1.1128731343283582,
"grad_norm": 1.1918835238011203,
"learning_rate": 4.118133800657968e-06,
"loss": 0.1583,
"step": 1193
},
{
"epoch": 1.1138059701492538,
"grad_norm": 1.3366450390085258,
"learning_rate": 4.110923129061961e-06,
"loss": 0.1839,
"step": 1194
},
{
"epoch": 1.1147388059701493,
"grad_norm": 1.326326008877487,
"learning_rate": 4.103714366393383e-06,
"loss": 0.1636,
"step": 1195
},
{
"epoch": 1.1156716417910448,
"grad_norm": 1.2393685258161005,
"learning_rate": 4.09650752813009e-06,
"loss": 0.1452,
"step": 1196
},
{
"epoch": 1.1166044776119404,
"grad_norm": 1.1971809037071681,
"learning_rate": 4.089302629745806e-06,
"loss": 0.145,
"step": 1197
},
{
"epoch": 1.117537313432836,
"grad_norm": 1.2609209237949275,
"learning_rate": 4.082099686710093e-06,
"loss": 0.1666,
"step": 1198
},
{
"epoch": 1.1184701492537314,
"grad_norm": 1.2398171855960034,
"learning_rate": 4.074898714488313e-06,
"loss": 0.1686,
"step": 1199
},
{
"epoch": 1.1194029850746268,
"grad_norm": 1.2571581832718375,
"learning_rate": 4.067699728541595e-06,
"loss": 0.152,
"step": 1200
},
{
"epoch": 1.1203358208955223,
"grad_norm": 1.1265454199988725,
"learning_rate": 4.060502744326805e-06,
"loss": 0.1379,
"step": 1201
},
{
"epoch": 1.1212686567164178,
"grad_norm": 1.2099178791599692,
"learning_rate": 4.053307777296511e-06,
"loss": 0.1537,
"step": 1202
},
{
"epoch": 1.1222014925373134,
"grad_norm": 1.364960057577564,
"learning_rate": 4.046114842898948e-06,
"loss": 0.1836,
"step": 1203
},
{
"epoch": 1.123134328358209,
"grad_norm": 1.3518523823356796,
"learning_rate": 4.03892395657799e-06,
"loss": 0.1727,
"step": 1204
},
{
"epoch": 1.1240671641791045,
"grad_norm": 1.3239439887244806,
"learning_rate": 4.031735133773113e-06,
"loss": 0.1539,
"step": 1205
},
{
"epoch": 1.125,
"grad_norm": 1.2751889630504203,
"learning_rate": 4.02454838991936e-06,
"loss": 0.1852,
"step": 1206
},
{
"epoch": 1.1259328358208955,
"grad_norm": 1.326689919361112,
"learning_rate": 4.0173637404473105e-06,
"loss": 0.1811,
"step": 1207
},
{
"epoch": 1.126865671641791,
"grad_norm": 1.2127272826068796,
"learning_rate": 4.010181200783052e-06,
"loss": 0.1454,
"step": 1208
},
{
"epoch": 1.1277985074626866,
"grad_norm": 1.322726203190604,
"learning_rate": 4.003000786348135e-06,
"loss": 0.1787,
"step": 1209
},
{
"epoch": 1.1287313432835822,
"grad_norm": 1.2457042711147817,
"learning_rate": 3.995822512559552e-06,
"loss": 0.164,
"step": 1210
},
{
"epoch": 1.1296641791044777,
"grad_norm": 1.1615734518718064,
"learning_rate": 3.988646394829699e-06,
"loss": 0.1384,
"step": 1211
},
{
"epoch": 1.1305970149253732,
"grad_norm": 1.309796738470515,
"learning_rate": 3.981472448566339e-06,
"loss": 0.1789,
"step": 1212
},
{
"epoch": 1.1315298507462686,
"grad_norm": 1.2902712317148286,
"learning_rate": 3.974300689172579e-06,
"loss": 0.1658,
"step": 1213
},
{
"epoch": 1.132462686567164,
"grad_norm": 1.3037701154014294,
"learning_rate": 3.967131132046822e-06,
"loss": 0.2038,
"step": 1214
},
{
"epoch": 1.1333955223880596,
"grad_norm": 1.3434142092920884,
"learning_rate": 3.9599637925827495e-06,
"loss": 0.1484,
"step": 1215
},
{
"epoch": 1.1343283582089552,
"grad_norm": 1.401664096227999,
"learning_rate": 3.952798686169279e-06,
"loss": 0.1981,
"step": 1216
},
{
"epoch": 1.1352611940298507,
"grad_norm": 1.3177773174229326,
"learning_rate": 3.945635828190534e-06,
"loss": 0.1758,
"step": 1217
},
{
"epoch": 1.1361940298507462,
"grad_norm": 1.2030919853710615,
"learning_rate": 3.938475234025812e-06,
"loss": 0.1589,
"step": 1218
},
{
"epoch": 1.1371268656716418,
"grad_norm": 1.3322750694007146,
"learning_rate": 3.931316919049544e-06,
"loss": 0.2114,
"step": 1219
},
{
"epoch": 1.1380597014925373,
"grad_norm": 1.3513292327679234,
"learning_rate": 3.924160898631274e-06,
"loss": 0.2007,
"step": 1220
},
{
"epoch": 1.1389925373134329,
"grad_norm": 1.2715856870031297,
"learning_rate": 3.917007188135618e-06,
"loss": 0.1502,
"step": 1221
},
{
"epoch": 1.1399253731343284,
"grad_norm": 1.360952524579768,
"learning_rate": 3.9098558029222275e-06,
"loss": 0.2022,
"step": 1222
},
{
"epoch": 1.140858208955224,
"grad_norm": 1.394234335235106,
"learning_rate": 3.902706758345768e-06,
"loss": 0.2066,
"step": 1223
},
{
"epoch": 1.1417910447761195,
"grad_norm": 1.3657367560532043,
"learning_rate": 3.8955600697558764e-06,
"loss": 0.2092,
"step": 1224
},
{
"epoch": 1.142723880597015,
"grad_norm": 1.2934667493963083,
"learning_rate": 3.88841575249713e-06,
"loss": 0.2095,
"step": 1225
},
{
"epoch": 1.1436567164179103,
"grad_norm": 1.2407088587717017,
"learning_rate": 3.8812738219090165e-06,
"loss": 0.1546,
"step": 1226
},
{
"epoch": 1.1445895522388059,
"grad_norm": 1.3462235378169791,
"learning_rate": 3.874134293325901e-06,
"loss": 0.1789,
"step": 1227
},
{
"epoch": 1.1455223880597014,
"grad_norm": 1.4067365696444236,
"learning_rate": 3.866997182076985e-06,
"loss": 0.1827,
"step": 1228
},
{
"epoch": 1.146455223880597,
"grad_norm": 1.377478579563355,
"learning_rate": 3.8598625034862834e-06,
"loss": 0.1988,
"step": 1229
},
{
"epoch": 1.1473880597014925,
"grad_norm": 1.279473628493321,
"learning_rate": 3.8527302728725906e-06,
"loss": 0.1888,
"step": 1230
},
{
"epoch": 1.148320895522388,
"grad_norm": 1.2102178107485024,
"learning_rate": 3.845600505549443e-06,
"loss": 0.1501,
"step": 1231
},
{
"epoch": 1.1492537313432836,
"grad_norm": 1.3131128698812524,
"learning_rate": 3.838473216825085e-06,
"loss": 0.1687,
"step": 1232
},
{
"epoch": 1.150186567164179,
"grad_norm": 1.2627359714544923,
"learning_rate": 3.8313484220024434e-06,
"loss": 0.1535,
"step": 1233
},
{
"epoch": 1.1511194029850746,
"grad_norm": 1.288099194043848,
"learning_rate": 3.82422613637909e-06,
"loss": 0.1996,
"step": 1234
},
{
"epoch": 1.1520522388059702,
"grad_norm": 1.4105348381096752,
"learning_rate": 3.817106375247205e-06,
"loss": 0.1734,
"step": 1235
},
{
"epoch": 1.1529850746268657,
"grad_norm": 1.2964203374983818,
"learning_rate": 3.809989153893554e-06,
"loss": 0.1919,
"step": 1236
},
{
"epoch": 1.1539179104477613,
"grad_norm": 1.2822926379983786,
"learning_rate": 3.802874487599447e-06,
"loss": 0.1644,
"step": 1237
},
{
"epoch": 1.1548507462686568,
"grad_norm": 1.2879062945860258,
"learning_rate": 3.795762391640705e-06,
"loss": 0.1916,
"step": 1238
},
{
"epoch": 1.1557835820895521,
"grad_norm": 1.42619892292563,
"learning_rate": 3.788652881287635e-06,
"loss": 0.1916,
"step": 1239
},
{
"epoch": 1.1567164179104479,
"grad_norm": 1.3160995155313582,
"learning_rate": 3.781545971804992e-06,
"loss": 0.1921,
"step": 1240
},
{
"epoch": 1.1576492537313432,
"grad_norm": 1.2513175064563837,
"learning_rate": 3.774441678451943e-06,
"loss": 0.158,
"step": 1241
},
{
"epoch": 1.1585820895522387,
"grad_norm": 1.2976814682761826,
"learning_rate": 3.767340016482039e-06,
"loss": 0.1858,
"step": 1242
},
{
"epoch": 1.1595149253731343,
"grad_norm": 1.2631809769370392,
"learning_rate": 3.7602410011431837e-06,
"loss": 0.1929,
"step": 1243
},
{
"epoch": 1.1604477611940298,
"grad_norm": 1.301688507353941,
"learning_rate": 3.753144647677599e-06,
"loss": 0.1759,
"step": 1244
},
{
"epoch": 1.1613805970149254,
"grad_norm": 1.3028654984103567,
"learning_rate": 3.7460509713217863e-06,
"loss": 0.1478,
"step": 1245
},
{
"epoch": 1.162313432835821,
"grad_norm": 1.4170654304734462,
"learning_rate": 3.7389599873065034e-06,
"loss": 0.2231,
"step": 1246
},
{
"epoch": 1.1632462686567164,
"grad_norm": 1.4136925560947238,
"learning_rate": 3.731871710856727e-06,
"loss": 0.23,
"step": 1247
},
{
"epoch": 1.164179104477612,
"grad_norm": 1.3030305034565517,
"learning_rate": 3.7247861571916183e-06,
"loss": 0.1958,
"step": 1248
},
{
"epoch": 1.1651119402985075,
"grad_norm": 1.2163348158439937,
"learning_rate": 3.717703341524494e-06,
"loss": 0.1557,
"step": 1249
},
{
"epoch": 1.166044776119403,
"grad_norm": 1.2282391935385022,
"learning_rate": 3.7106232790627926e-06,
"loss": 0.1717,
"step": 1250
},
{
"epoch": 1.1669776119402986,
"grad_norm": 1.2831488659200352,
"learning_rate": 3.7035459850080392e-06,
"loss": 0.2062,
"step": 1251
},
{
"epoch": 1.1679104477611941,
"grad_norm": 1.2737349888267746,
"learning_rate": 3.696471474555816e-06,
"loss": 0.1576,
"step": 1252
},
{
"epoch": 1.1688432835820897,
"grad_norm": 1.3585086927572092,
"learning_rate": 3.6893997628957314e-06,
"loss": 0.1652,
"step": 1253
},
{
"epoch": 1.169776119402985,
"grad_norm": 1.2336076065127748,
"learning_rate": 3.6823308652113783e-06,
"loss": 0.1434,
"step": 1254
},
{
"epoch": 1.1707089552238805,
"grad_norm": 1.275887082452847,
"learning_rate": 3.6752647966803114e-06,
"loss": 0.1681,
"step": 1255
},
{
"epoch": 1.171641791044776,
"grad_norm": 1.265690020819207,
"learning_rate": 3.6682015724740116e-06,
"loss": 0.1566,
"step": 1256
},
{
"epoch": 1.1725746268656716,
"grad_norm": 1.3186785189168748,
"learning_rate": 3.661141207757854e-06,
"loss": 0.1895,
"step": 1257
},
{
"epoch": 1.1735074626865671,
"grad_norm": 1.3013458028814893,
"learning_rate": 3.6540837176910688e-06,
"loss": 0.1633,
"step": 1258
},
{
"epoch": 1.1744402985074627,
"grad_norm": 1.2924717838350035,
"learning_rate": 3.6470291174267187e-06,
"loss": 0.1551,
"step": 1259
},
{
"epoch": 1.1753731343283582,
"grad_norm": 1.3345403188122533,
"learning_rate": 3.6399774221116613e-06,
"loss": 0.1741,
"step": 1260
},
{
"epoch": 1.1763059701492538,
"grad_norm": 1.3254228160809514,
"learning_rate": 3.6329286468865143e-06,
"loss": 0.1822,
"step": 1261
},
{
"epoch": 1.1772388059701493,
"grad_norm": 1.3240960212231794,
"learning_rate": 3.625882806885629e-06,
"loss": 0.1716,
"step": 1262
},
{
"epoch": 1.1781716417910448,
"grad_norm": 1.277988066208133,
"learning_rate": 3.6188399172370526e-06,
"loss": 0.1998,
"step": 1263
},
{
"epoch": 1.1791044776119404,
"grad_norm": 1.2745092796536548,
"learning_rate": 3.611799993062497e-06,
"loss": 0.176,
"step": 1264
},
{
"epoch": 1.180037313432836,
"grad_norm": 1.3100020490753832,
"learning_rate": 3.6047630494773093e-06,
"loss": 0.1968,
"step": 1265
},
{
"epoch": 1.1809701492537314,
"grad_norm": 1.2350577675277863,
"learning_rate": 3.597729101590436e-06,
"loss": 0.1448,
"step": 1266
},
{
"epoch": 1.1819029850746268,
"grad_norm": 1.3403726848574598,
"learning_rate": 3.590698164504391e-06,
"loss": 0.183,
"step": 1267
},
{
"epoch": 1.1828358208955223,
"grad_norm": 1.2471727326244055,
"learning_rate": 3.583670253315223e-06,
"loss": 0.1597,
"step": 1268
},
{
"epoch": 1.1837686567164178,
"grad_norm": 1.300483980907673,
"learning_rate": 3.576645383112485e-06,
"loss": 0.1732,
"step": 1269
},
{
"epoch": 1.1847014925373134,
"grad_norm": 1.3056221253356903,
"learning_rate": 3.5696235689792e-06,
"loss": 0.1694,
"step": 1270
},
{
"epoch": 1.185634328358209,
"grad_norm": 1.2615843794808572,
"learning_rate": 3.5626048259918324e-06,
"loss": 0.1458,
"step": 1271
},
{
"epoch": 1.1865671641791045,
"grad_norm": 1.298997730458422,
"learning_rate": 3.5555891692202475e-06,
"loss": 0.1776,
"step": 1272
},
{
"epoch": 1.1875,
"grad_norm": 1.3886945575760463,
"learning_rate": 3.5485766137276894e-06,
"loss": 0.2076,
"step": 1273
},
{
"epoch": 1.1884328358208955,
"grad_norm": 1.2982711284559796,
"learning_rate": 3.5415671745707383e-06,
"loss": 0.1725,
"step": 1274
},
{
"epoch": 1.189365671641791,
"grad_norm": 1.3367857992818377,
"learning_rate": 3.5345608667992863e-06,
"loss": 0.1864,
"step": 1275
},
{
"epoch": 1.1902985074626866,
"grad_norm": 1.1634085446327156,
"learning_rate": 3.5275577054565047e-06,
"loss": 0.1258,
"step": 1276
},
{
"epoch": 1.1912313432835822,
"grad_norm": 1.14606876472478,
"learning_rate": 3.520557705578802e-06,
"loss": 0.1327,
"step": 1277
},
{
"epoch": 1.1921641791044777,
"grad_norm": 1.3260981006324937,
"learning_rate": 3.5135608821958055e-06,
"loss": 0.1932,
"step": 1278
},
{
"epoch": 1.1930970149253732,
"grad_norm": 1.3807183926628526,
"learning_rate": 3.5065672503303204e-06,
"loss": 0.1613,
"step": 1279
},
{
"epoch": 1.1940298507462686,
"grad_norm": 1.2651107865599973,
"learning_rate": 3.4995768249982975e-06,
"loss": 0.1606,
"step": 1280
},
{
"epoch": 1.194962686567164,
"grad_norm": 1.3830161336719449,
"learning_rate": 3.492589621208804e-06,
"loss": 0.1948,
"step": 1281
},
{
"epoch": 1.1958955223880596,
"grad_norm": 1.2676978832814751,
"learning_rate": 3.4856056539639906e-06,
"loss": 0.16,
"step": 1282
},
{
"epoch": 1.1968283582089552,
"grad_norm": 1.1400567919289024,
"learning_rate": 3.4786249382590575e-06,
"loss": 0.1215,
"step": 1283
},
{
"epoch": 1.1977611940298507,
"grad_norm": 1.4562871039890137,
"learning_rate": 3.471647489082227e-06,
"loss": 0.1727,
"step": 1284
},
{
"epoch": 1.1986940298507462,
"grad_norm": 1.2985859104756226,
"learning_rate": 3.4646733214147037e-06,
"loss": 0.1647,
"step": 1285
},
{
"epoch": 1.1996268656716418,
"grad_norm": 1.297692863890328,
"learning_rate": 3.457702450230649e-06,
"loss": 0.1519,
"step": 1286
},
{
"epoch": 1.2005597014925373,
"grad_norm": 1.2724731186295677,
"learning_rate": 3.450734890497146e-06,
"loss": 0.1501,
"step": 1287
},
{
"epoch": 1.2014925373134329,
"grad_norm": 1.2681638095698602,
"learning_rate": 3.443770657174166e-06,
"loss": 0.1722,
"step": 1288
},
{
"epoch": 1.2024253731343284,
"grad_norm": 1.1648570392932733,
"learning_rate": 3.4368097652145416e-06,
"loss": 0.1546,
"step": 1289
},
{
"epoch": 1.203358208955224,
"grad_norm": 1.2635339184109227,
"learning_rate": 3.4298522295639298e-06,
"loss": 0.1797,
"step": 1290
},
{
"epoch": 1.2042910447761195,
"grad_norm": 1.1618912447363592,
"learning_rate": 3.4228980651607787e-06,
"loss": 0.125,
"step": 1291
},
{
"epoch": 1.205223880597015,
"grad_norm": 1.3480410563587142,
"learning_rate": 3.415947286936301e-06,
"loss": 0.1904,
"step": 1292
},
{
"epoch": 1.2061567164179103,
"grad_norm": 1.2629731956681998,
"learning_rate": 3.40899990981444e-06,
"loss": 0.1636,
"step": 1293
},
{
"epoch": 1.2070895522388059,
"grad_norm": 1.1682543725678185,
"learning_rate": 3.4020559487118337e-06,
"loss": 0.1463,
"step": 1294
},
{
"epoch": 1.2080223880597014,
"grad_norm": 1.345852417828716,
"learning_rate": 3.3951154185377843e-06,
"loss": 0.1711,
"step": 1295
},
{
"epoch": 1.208955223880597,
"grad_norm": 1.2318372470877859,
"learning_rate": 3.388178334194232e-06,
"loss": 0.1459,
"step": 1296
},
{
"epoch": 1.2098880597014925,
"grad_norm": 1.324131635189433,
"learning_rate": 3.381244710575717e-06,
"loss": 0.212,
"step": 1297
},
{
"epoch": 1.210820895522388,
"grad_norm": 1.2369374528389712,
"learning_rate": 3.3743145625693456e-06,
"loss": 0.1639,
"step": 1298
},
{
"epoch": 1.2117537313432836,
"grad_norm": 1.2577013709675007,
"learning_rate": 3.3673879050547664e-06,
"loss": 0.1598,
"step": 1299
},
{
"epoch": 1.212686567164179,
"grad_norm": 1.3798021554722362,
"learning_rate": 3.360464752904132e-06,
"loss": 0.2157,
"step": 1300
},
{
"epoch": 1.2136194029850746,
"grad_norm": 1.3073475578732112,
"learning_rate": 3.353545120982067e-06,
"loss": 0.2055,
"step": 1301
},
{
"epoch": 1.2145522388059702,
"grad_norm": 1.2530740286530024,
"learning_rate": 3.346629024145639e-06,
"loss": 0.1594,
"step": 1302
},
{
"epoch": 1.2154850746268657,
"grad_norm": 1.2552812505931332,
"learning_rate": 3.3397164772443274e-06,
"loss": 0.1845,
"step": 1303
},
{
"epoch": 1.2164179104477613,
"grad_norm": 1.2529584561220057,
"learning_rate": 3.3328074951199846e-06,
"loss": 0.1577,
"step": 1304
},
{
"epoch": 1.2173507462686568,
"grad_norm": 1.257916536805769,
"learning_rate": 3.325902092606814e-06,
"loss": 0.1402,
"step": 1305
},
{
"epoch": 1.2182835820895521,
"grad_norm": 1.2205205794235958,
"learning_rate": 3.319000284531332e-06,
"loss": 0.1657,
"step": 1306
},
{
"epoch": 1.2192164179104479,
"grad_norm": 1.3301229261087273,
"learning_rate": 3.3121020857123364e-06,
"loss": 0.1882,
"step": 1307
},
{
"epoch": 1.2201492537313432,
"grad_norm": 1.309878309023618,
"learning_rate": 3.3052075109608734e-06,
"loss": 0.2019,
"step": 1308
},
{
"epoch": 1.2210820895522387,
"grad_norm": 1.2342859184120942,
"learning_rate": 3.2983165750802127e-06,
"loss": 0.1611,
"step": 1309
},
{
"epoch": 1.2220149253731343,
"grad_norm": 1.3024638759100047,
"learning_rate": 3.29142929286581e-06,
"loss": 0.1734,
"step": 1310
},
{
"epoch": 1.2229477611940298,
"grad_norm": 1.2777502310594075,
"learning_rate": 3.2845456791052733e-06,
"loss": 0.1944,
"step": 1311
},
{
"epoch": 1.2238805970149254,
"grad_norm": 1.2324858537536083,
"learning_rate": 3.2776657485783357e-06,
"loss": 0.1481,
"step": 1312
},
{
"epoch": 1.224813432835821,
"grad_norm": 1.313727487629851,
"learning_rate": 3.2707895160568255e-06,
"loss": 0.1932,
"step": 1313
},
{
"epoch": 1.2257462686567164,
"grad_norm": 1.2805182543437854,
"learning_rate": 3.263916996304624e-06,
"loss": 0.1579,
"step": 1314
},
{
"epoch": 1.226679104477612,
"grad_norm": 1.25660958657868,
"learning_rate": 3.257048204077647e-06,
"loss": 0.1615,
"step": 1315
},
{
"epoch": 1.2276119402985075,
"grad_norm": 1.232309569771475,
"learning_rate": 3.2501831541238048e-06,
"loss": 0.1497,
"step": 1316
},
{
"epoch": 1.228544776119403,
"grad_norm": 1.23013841269123,
"learning_rate": 3.2433218611829713e-06,
"loss": 0.1777,
"step": 1317
},
{
"epoch": 1.2294776119402986,
"grad_norm": 1.3214107220165558,
"learning_rate": 3.236464339986956e-06,
"loss": 0.1767,
"step": 1318
},
{
"epoch": 1.2304104477611941,
"grad_norm": 1.2928726430457418,
"learning_rate": 3.22961060525947e-06,
"loss": 0.179,
"step": 1319
},
{
"epoch": 1.2313432835820897,
"grad_norm": 1.3137887003967543,
"learning_rate": 3.2227606717160944e-06,
"loss": 0.1893,
"step": 1320
},
{
"epoch": 1.232276119402985,
"grad_norm": 1.2507248269607059,
"learning_rate": 3.2159145540642433e-06,
"loss": 0.1543,
"step": 1321
},
{
"epoch": 1.2332089552238805,
"grad_norm": 1.2140463898107692,
"learning_rate": 3.2090722670031465e-06,
"loss": 0.1462,
"step": 1322
},
{
"epoch": 1.234141791044776,
"grad_norm": 1.2122563405449025,
"learning_rate": 3.2022338252238062e-06,
"loss": 0.143,
"step": 1323
},
{
"epoch": 1.2350746268656716,
"grad_norm": 1.2406985130643,
"learning_rate": 3.1953992434089643e-06,
"loss": 0.1575,
"step": 1324
},
{
"epoch": 1.2360074626865671,
"grad_norm": 1.216622584482215,
"learning_rate": 3.18856853623308e-06,
"loss": 0.1665,
"step": 1325
},
{
"epoch": 1.2369402985074627,
"grad_norm": 1.3313794936617174,
"learning_rate": 3.1817417183622915e-06,
"loss": 0.1924,
"step": 1326
},
{
"epoch": 1.2378731343283582,
"grad_norm": 1.2094255732356214,
"learning_rate": 3.1749188044543865e-06,
"loss": 0.186,
"step": 1327
},
{
"epoch": 1.2388059701492538,
"grad_norm": 1.200400622111638,
"learning_rate": 3.168099809158769e-06,
"loss": 0.1269,
"step": 1328
},
{
"epoch": 1.2397388059701493,
"grad_norm": 1.218016115664713,
"learning_rate": 3.1612847471164335e-06,
"loss": 0.1805,
"step": 1329
},
{
"epoch": 1.2406716417910448,
"grad_norm": 1.2447106314830831,
"learning_rate": 3.1544736329599248e-06,
"loss": 0.1586,
"step": 1330
},
{
"epoch": 1.2416044776119404,
"grad_norm": 1.2604336406422085,
"learning_rate": 3.1476664813133118e-06,
"loss": 0.1614,
"step": 1331
},
{
"epoch": 1.242537313432836,
"grad_norm": 1.3542640752949107,
"learning_rate": 3.140863306792161e-06,
"loss": 0.1868,
"step": 1332
},
{
"epoch": 1.2434701492537314,
"grad_norm": 1.2358962119870105,
"learning_rate": 3.1340641240034907e-06,
"loss": 0.1545,
"step": 1333
},
{
"epoch": 1.2444029850746268,
"grad_norm": 1.2383222636503617,
"learning_rate": 3.1272689475457592e-06,
"loss": 0.1815,
"step": 1334
},
{
"epoch": 1.2453358208955223,
"grad_norm": 1.2508922934834654,
"learning_rate": 3.1204777920088108e-06,
"loss": 0.1668,
"step": 1335
},
{
"epoch": 1.2462686567164178,
"grad_norm": 1.2445475954545528,
"learning_rate": 3.113690671973867e-06,
"loss": 0.1444,
"step": 1336
},
{
"epoch": 1.2472014925373134,
"grad_norm": 1.3335118104124264,
"learning_rate": 3.1069076020134785e-06,
"loss": 0.1607,
"step": 1337
},
{
"epoch": 1.248134328358209,
"grad_norm": 1.3301678211762822,
"learning_rate": 3.100128596691503e-06,
"loss": 0.2012,
"step": 1338
},
{
"epoch": 1.2490671641791045,
"grad_norm": 1.151237341115816,
"learning_rate": 3.093353670563071e-06,
"loss": 0.1308,
"step": 1339
},
{
"epoch": 1.25,
"grad_norm": 1.2201549467316923,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.1528,
"step": 1340
},
{
"epoch": 1.2509328358208955,
"grad_norm": 1.432176751420658,
"learning_rate": 3.0798161140635287e-06,
"loss": 0.1726,
"step": 1341
},
{
"epoch": 1.251865671641791,
"grad_norm": 1.2928222987662619,
"learning_rate": 3.0730535127587626e-06,
"loss": 0.1812,
"step": 1342
},
{
"epoch": 1.2527985074626866,
"grad_norm": 1.258937024396766,
"learning_rate": 3.0662950487801614e-06,
"loss": 0.1787,
"step": 1343
},
{
"epoch": 1.2537313432835822,
"grad_norm": 1.338580616178489,
"learning_rate": 3.059540736638751e-06,
"loss": 0.1484,
"step": 1344
},
{
"epoch": 1.2546641791044777,
"grad_norm": 1.3522120640921935,
"learning_rate": 3.052790590836644e-06,
"loss": 0.2266,
"step": 1345
},
{
"epoch": 1.2555970149253732,
"grad_norm": 1.297997790858102,
"learning_rate": 3.046044625867004e-06,
"loss": 0.2178,
"step": 1346
},
{
"epoch": 1.2565298507462686,
"grad_norm": 1.2977406734476347,
"learning_rate": 3.0393028562140237e-06,
"loss": 0.1922,
"step": 1347
},
{
"epoch": 1.2574626865671643,
"grad_norm": 1.176994177656682,
"learning_rate": 3.0325652963528797e-06,
"loss": 0.1904,
"step": 1348
},
{
"epoch": 1.2583955223880596,
"grad_norm": 1.3220076328359516,
"learning_rate": 3.0258319607497175e-06,
"loss": 0.1625,
"step": 1349
},
{
"epoch": 1.2593283582089552,
"grad_norm": 1.21578082266844,
"learning_rate": 3.0191028638616095e-06,
"loss": 0.1596,
"step": 1350
},
{
"epoch": 1.2602611940298507,
"grad_norm": 1.3360585999784078,
"learning_rate": 3.012378020136526e-06,
"loss": 0.1946,
"step": 1351
},
{
"epoch": 1.2611940298507462,
"grad_norm": 1.2339184197363335,
"learning_rate": 3.0056574440133104e-06,
"loss": 0.1624,
"step": 1352
},
{
"epoch": 1.2621268656716418,
"grad_norm": 1.2675456114784982,
"learning_rate": 2.9989411499216357e-06,
"loss": 0.1867,
"step": 1353
},
{
"epoch": 1.2630597014925373,
"grad_norm": 1.3160190054004581,
"learning_rate": 2.992229152281987e-06,
"loss": 0.2223,
"step": 1354
},
{
"epoch": 1.2639925373134329,
"grad_norm": 1.4460157844316839,
"learning_rate": 2.9855214655056243e-06,
"loss": 0.2286,
"step": 1355
},
{
"epoch": 1.2649253731343284,
"grad_norm": 1.333474546319058,
"learning_rate": 2.978818103994546e-06,
"loss": 0.1869,
"step": 1356
},
{
"epoch": 1.265858208955224,
"grad_norm": 1.282300154157047,
"learning_rate": 2.9721190821414713e-06,
"loss": 0.1843,
"step": 1357
},
{
"epoch": 1.2667910447761195,
"grad_norm": 1.2344193005337352,
"learning_rate": 2.9654244143297972e-06,
"loss": 0.1404,
"step": 1358
},
{
"epoch": 1.267723880597015,
"grad_norm": 1.362266068343807,
"learning_rate": 2.9587341149335726e-06,
"loss": 0.2044,
"step": 1359
},
{
"epoch": 1.2686567164179103,
"grad_norm": 1.2125996731121518,
"learning_rate": 2.9520481983174675e-06,
"loss": 0.1451,
"step": 1360
},
{
"epoch": 1.269589552238806,
"grad_norm": 1.3249164851880222,
"learning_rate": 2.945366678836745e-06,
"loss": 0.2002,
"step": 1361
},
{
"epoch": 1.2705223880597014,
"grad_norm": 1.20519470883145,
"learning_rate": 2.9386895708372205e-06,
"loss": 0.144,
"step": 1362
},
{
"epoch": 1.271455223880597,
"grad_norm": 1.2425903867255321,
"learning_rate": 2.932016888655241e-06,
"loss": 0.1759,
"step": 1363
},
{
"epoch": 1.2723880597014925,
"grad_norm": 1.3903366090601932,
"learning_rate": 2.9253486466176516e-06,
"loss": 0.2202,
"step": 1364
},
{
"epoch": 1.273320895522388,
"grad_norm": 1.2880579280720423,
"learning_rate": 2.9186848590417654e-06,
"loss": 0.1944,
"step": 1365
},
{
"epoch": 1.2742537313432836,
"grad_norm": 1.279645812179017,
"learning_rate": 2.912025540235327e-06,
"loss": 0.159,
"step": 1366
},
{
"epoch": 1.275186567164179,
"grad_norm": 1.304388046501594,
"learning_rate": 2.9053707044964886e-06,
"loss": 0.1818,
"step": 1367
},
{
"epoch": 1.2761194029850746,
"grad_norm": 1.3372290020118323,
"learning_rate": 2.8987203661137776e-06,
"loss": 0.179,
"step": 1368
},
{
"epoch": 1.2770522388059702,
"grad_norm": 1.3370082063445698,
"learning_rate": 2.8920745393660642e-06,
"loss": 0.2107,
"step": 1369
},
{
"epoch": 1.2779850746268657,
"grad_norm": 1.1694998297858563,
"learning_rate": 2.885433238522534e-06,
"loss": 0.1416,
"step": 1370
},
{
"epoch": 1.2789179104477613,
"grad_norm": 1.2882792517258348,
"learning_rate": 2.878796477842648e-06,
"loss": 0.1822,
"step": 1371
},
{
"epoch": 1.2798507462686568,
"grad_norm": 1.287059786494495,
"learning_rate": 2.8721642715761267e-06,
"loss": 0.1449,
"step": 1372
},
{
"epoch": 1.2807835820895521,
"grad_norm": 1.2673872593371118,
"learning_rate": 2.8655366339629093e-06,
"loss": 0.1703,
"step": 1373
},
{
"epoch": 1.2817164179104479,
"grad_norm": 1.3397105330121668,
"learning_rate": 2.858913579233127e-06,
"loss": 0.1595,
"step": 1374
},
{
"epoch": 1.2826492537313432,
"grad_norm": 1.3257315751315077,
"learning_rate": 2.852295121607066e-06,
"loss": 0.1816,
"step": 1375
},
{
"epoch": 1.2835820895522387,
"grad_norm": 1.3063699146894663,
"learning_rate": 2.8456812752951483e-06,
"loss": 0.1892,
"step": 1376
},
{
"epoch": 1.2845149253731343,
"grad_norm": 1.1803856021399208,
"learning_rate": 2.8390720544978933e-06,
"loss": 0.1451,
"step": 1377
},
{
"epoch": 1.2854477611940298,
"grad_norm": 1.2306047671883888,
"learning_rate": 2.8324674734058855e-06,
"loss": 0.1547,
"step": 1378
},
{
"epoch": 1.2863805970149254,
"grad_norm": 1.1824152613413623,
"learning_rate": 2.8258675461997513e-06,
"loss": 0.1466,
"step": 1379
},
{
"epoch": 1.287313432835821,
"grad_norm": 1.180770138321359,
"learning_rate": 2.8192722870501242e-06,
"loss": 0.1567,
"step": 1380
},
{
"epoch": 1.2882462686567164,
"grad_norm": 1.21308308779698,
"learning_rate": 2.812681710117614e-06,
"loss": 0.1476,
"step": 1381
},
{
"epoch": 1.289179104477612,
"grad_norm": 1.359483464677566,
"learning_rate": 2.8060958295527785e-06,
"loss": 0.2032,
"step": 1382
},
{
"epoch": 1.2901119402985075,
"grad_norm": 1.3840206529031518,
"learning_rate": 2.799514659496092e-06,
"loss": 0.154,
"step": 1383
},
{
"epoch": 1.291044776119403,
"grad_norm": 1.2453245620296973,
"learning_rate": 2.792938214077912e-06,
"loss": 0.1439,
"step": 1384
},
{
"epoch": 1.2919776119402986,
"grad_norm": 1.2518021387625964,
"learning_rate": 2.7863665074184553e-06,
"loss": 0.1748,
"step": 1385
},
{
"epoch": 1.292910447761194,
"grad_norm": 1.2698294847640776,
"learning_rate": 2.7797995536277624e-06,
"loss": 0.1641,
"step": 1386
},
{
"epoch": 1.2938432835820897,
"grad_norm": 1.2800728502720369,
"learning_rate": 2.773237366805672e-06,
"loss": 0.1558,
"step": 1387
},
{
"epoch": 1.294776119402985,
"grad_norm": 1.274571716079416,
"learning_rate": 2.766679961041781e-06,
"loss": 0.1857,
"step": 1388
},
{
"epoch": 1.2957089552238805,
"grad_norm": 1.320652195693462,
"learning_rate": 2.760127350415427e-06,
"loss": 0.1684,
"step": 1389
},
{
"epoch": 1.296641791044776,
"grad_norm": 1.3388822639892841,
"learning_rate": 2.753579548995652e-06,
"loss": 0.1651,
"step": 1390
},
{
"epoch": 1.2975746268656716,
"grad_norm": 1.1868289977432622,
"learning_rate": 2.7470365708411673e-06,
"loss": 0.1439,
"step": 1391
},
{
"epoch": 1.2985074626865671,
"grad_norm": 1.4092140368954786,
"learning_rate": 2.740498430000332e-06,
"loss": 0.1907,
"step": 1392
},
{
"epoch": 1.2994402985074627,
"grad_norm": 1.2461926006085717,
"learning_rate": 2.7339651405111176e-06,
"loss": 0.1608,
"step": 1393
},
{
"epoch": 1.3003731343283582,
"grad_norm": 1.2956733449383946,
"learning_rate": 2.727436716401083e-06,
"loss": 0.1857,
"step": 1394
},
{
"epoch": 1.3013059701492538,
"grad_norm": 1.30421727417671,
"learning_rate": 2.7209131716873347e-06,
"loss": 0.1943,
"step": 1395
},
{
"epoch": 1.3022388059701493,
"grad_norm": 1.2063263793325392,
"learning_rate": 2.714394520376509e-06,
"loss": 0.1539,
"step": 1396
},
{
"epoch": 1.3031716417910448,
"grad_norm": 1.280761928790951,
"learning_rate": 2.7078807764647277e-06,
"loss": 0.1676,
"step": 1397
},
{
"epoch": 1.3041044776119404,
"grad_norm": 1.3247312247362115,
"learning_rate": 2.701371953937583e-06,
"loss": 0.2055,
"step": 1398
},
{
"epoch": 1.3050373134328357,
"grad_norm": 1.257135213905167,
"learning_rate": 2.694868066770099e-06,
"loss": 0.1759,
"step": 1399
},
{
"epoch": 1.3059701492537314,
"grad_norm": 1.4301480995500815,
"learning_rate": 2.6883691289267e-06,
"loss": 0.1661,
"step": 1400
},
{
"epoch": 1.3069029850746268,
"grad_norm": 1.1818771406706536,
"learning_rate": 2.6818751543611892e-06,
"loss": 0.1491,
"step": 1401
},
{
"epoch": 1.3078358208955223,
"grad_norm": 1.240059855888409,
"learning_rate": 2.675386157016706e-06,
"loss": 0.16,
"step": 1402
},
{
"epoch": 1.3087686567164178,
"grad_norm": 1.2964425916034028,
"learning_rate": 2.6689021508257105e-06,
"loss": 0.205,
"step": 1403
},
{
"epoch": 1.3097014925373134,
"grad_norm": 1.2421368200692056,
"learning_rate": 2.6624231497099395e-06,
"loss": 0.1722,
"step": 1404
},
{
"epoch": 1.310634328358209,
"grad_norm": 1.3169176322250298,
"learning_rate": 2.6559491675803883e-06,
"loss": 0.1924,
"step": 1405
},
{
"epoch": 1.3115671641791045,
"grad_norm": 1.1951875951034385,
"learning_rate": 2.649480218337276e-06,
"loss": 0.1317,
"step": 1406
},
{
"epoch": 1.3125,
"grad_norm": 1.1963353766896352,
"learning_rate": 2.6430163158700116e-06,
"loss": 0.1393,
"step": 1407
},
{
"epoch": 1.3134328358208955,
"grad_norm": 1.351728946093592,
"learning_rate": 2.636557474057173e-06,
"loss": 0.183,
"step": 1408
},
{
"epoch": 1.314365671641791,
"grad_norm": 1.369279930285477,
"learning_rate": 2.6301037067664726e-06,
"loss": 0.2294,
"step": 1409
},
{
"epoch": 1.3152985074626866,
"grad_norm": 1.4547602099783485,
"learning_rate": 2.623655027854719e-06,
"loss": 0.1615,
"step": 1410
},
{
"epoch": 1.3162313432835822,
"grad_norm": 1.2297794308793724,
"learning_rate": 2.6172114511678047e-06,
"loss": 0.175,
"step": 1411
},
{
"epoch": 1.3171641791044777,
"grad_norm": 1.251638883776497,
"learning_rate": 2.6107729905406655e-06,
"loss": 0.1768,
"step": 1412
},
{
"epoch": 1.3180970149253732,
"grad_norm": 1.256896553193628,
"learning_rate": 2.6043396597972488e-06,
"loss": 0.1602,
"step": 1413
},
{
"epoch": 1.3190298507462686,
"grad_norm": 1.324806723241369,
"learning_rate": 2.597911472750494e-06,
"loss": 0.181,
"step": 1414
},
{
"epoch": 1.3199626865671643,
"grad_norm": 1.3490247296765157,
"learning_rate": 2.5914884432022873e-06,
"loss": 0.1896,
"step": 1415
},
{
"epoch": 1.3208955223880596,
"grad_norm": 1.2752042814670184,
"learning_rate": 2.585070584943452e-06,
"loss": 0.1584,
"step": 1416
},
{
"epoch": 1.3218283582089552,
"grad_norm": 1.2645878576273337,
"learning_rate": 2.5786579117536983e-06,
"loss": 0.1605,
"step": 1417
},
{
"epoch": 1.3227611940298507,
"grad_norm": 1.2539941415618645,
"learning_rate": 2.5722504374016093e-06,
"loss": 0.1606,
"step": 1418
},
{
"epoch": 1.3236940298507462,
"grad_norm": 1.2283847069486205,
"learning_rate": 2.5658481756446056e-06,
"loss": 0.1733,
"step": 1419
},
{
"epoch": 1.3246268656716418,
"grad_norm": 1.3228664331345847,
"learning_rate": 2.5594511402289145e-06,
"loss": 0.1941,
"step": 1420
},
{
"epoch": 1.3255597014925373,
"grad_norm": 1.2693422740719202,
"learning_rate": 2.553059344889543e-06,
"loss": 0.1592,
"step": 1421
},
{
"epoch": 1.3264925373134329,
"grad_norm": 1.268832561869623,
"learning_rate": 2.546672803350247e-06,
"loss": 0.1488,
"step": 1422
},
{
"epoch": 1.3274253731343284,
"grad_norm": 1.2715434470082407,
"learning_rate": 2.5402915293234985e-06,
"loss": 0.1787,
"step": 1423
},
{
"epoch": 1.328358208955224,
"grad_norm": 1.199882042843623,
"learning_rate": 2.533915536510464e-06,
"loss": 0.1383,
"step": 1424
},
{
"epoch": 1.3292910447761195,
"grad_norm": 1.2940512076143134,
"learning_rate": 2.527544838600969e-06,
"loss": 0.1668,
"step": 1425
},
{
"epoch": 1.330223880597015,
"grad_norm": 1.2859342036370367,
"learning_rate": 2.521179449273472e-06,
"loss": 0.1673,
"step": 1426
},
{
"epoch": 1.3311567164179103,
"grad_norm": 1.368465223818523,
"learning_rate": 2.5148193821950317e-06,
"loss": 0.1931,
"step": 1427
},
{
"epoch": 1.332089552238806,
"grad_norm": 1.2412914337957712,
"learning_rate": 2.5084646510212817e-06,
"loss": 0.1668,
"step": 1428
},
{
"epoch": 1.3330223880597014,
"grad_norm": 1.3694410540778188,
"learning_rate": 2.5021152693963957e-06,
"loss": 0.1668,
"step": 1429
},
{
"epoch": 1.333955223880597,
"grad_norm": 1.3108468106665578,
"learning_rate": 2.495771250953061e-06,
"loss": 0.1558,
"step": 1430
},
{
"epoch": 1.3348880597014925,
"grad_norm": 1.2678417230006749,
"learning_rate": 2.4894326093124534e-06,
"loss": 0.1824,
"step": 1431
},
{
"epoch": 1.335820895522388,
"grad_norm": 1.234900070038742,
"learning_rate": 2.4830993580842023e-06,
"loss": 0.1608,
"step": 1432
},
{
"epoch": 1.3367537313432836,
"grad_norm": 1.2791584100851827,
"learning_rate": 2.476771510866364e-06,
"loss": 0.1687,
"step": 1433
},
{
"epoch": 1.337686567164179,
"grad_norm": 1.3008277496389138,
"learning_rate": 2.4704490812453907e-06,
"loss": 0.1828,
"step": 1434
},
{
"epoch": 1.3386194029850746,
"grad_norm": 1.2603709807496224,
"learning_rate": 2.4641320827961063e-06,
"loss": 0.1811,
"step": 1435
},
{
"epoch": 1.3395522388059702,
"grad_norm": 1.3262286033891795,
"learning_rate": 2.457820529081666e-06,
"loss": 0.2049,
"step": 1436
},
{
"epoch": 1.3404850746268657,
"grad_norm": 1.2793186685327578,
"learning_rate": 2.4515144336535413e-06,
"loss": 0.176,
"step": 1437
},
{
"epoch": 1.3414179104477613,
"grad_norm": 1.3417857413110468,
"learning_rate": 2.445213810051482e-06,
"loss": 0.2049,
"step": 1438
},
{
"epoch": 1.3423507462686568,
"grad_norm": 1.2402106645566335,
"learning_rate": 2.43891867180349e-06,
"loss": 0.1617,
"step": 1439
},
{
"epoch": 1.3432835820895521,
"grad_norm": 1.270642739818944,
"learning_rate": 2.4326290324257896e-06,
"loss": 0.166,
"step": 1440
},
{
"epoch": 1.3442164179104479,
"grad_norm": 1.1747953276893874,
"learning_rate": 2.4263449054227983e-06,
"loss": 0.1229,
"step": 1441
},
{
"epoch": 1.3451492537313432,
"grad_norm": 1.312921174633494,
"learning_rate": 2.4200663042870977e-06,
"loss": 0.2017,
"step": 1442
},
{
"epoch": 1.3460820895522387,
"grad_norm": 1.2446358820836307,
"learning_rate": 2.413793242499402e-06,
"loss": 0.1706,
"step": 1443
},
{
"epoch": 1.3470149253731343,
"grad_norm": 1.327826951397786,
"learning_rate": 2.407525733528538e-06,
"loss": 0.192,
"step": 1444
},
{
"epoch": 1.3479477611940298,
"grad_norm": 1.3012234815142307,
"learning_rate": 2.4012637908314064e-06,
"loss": 0.1689,
"step": 1445
},
{
"epoch": 1.3488805970149254,
"grad_norm": 1.2513612057630938,
"learning_rate": 2.3950074278529566e-06,
"loss": 0.1736,
"step": 1446
},
{
"epoch": 1.349813432835821,
"grad_norm": 1.1442958163589816,
"learning_rate": 2.38875665802616e-06,
"loss": 0.1614,
"step": 1447
},
{
"epoch": 1.3507462686567164,
"grad_norm": 1.246027691312265,
"learning_rate": 2.382511494771979e-06,
"loss": 0.1756,
"step": 1448
},
{
"epoch": 1.351679104477612,
"grad_norm": 1.1708781886882182,
"learning_rate": 2.3762719514993327e-06,
"loss": 0.149,
"step": 1449
},
{
"epoch": 1.3526119402985075,
"grad_norm": 1.2270642508980645,
"learning_rate": 2.370038041605079e-06,
"loss": 0.1879,
"step": 1450
},
{
"epoch": 1.353544776119403,
"grad_norm": 1.2447935359458226,
"learning_rate": 2.36380977847398e-06,
"loss": 0.1375,
"step": 1451
},
{
"epoch": 1.3544776119402986,
"grad_norm": 1.2858647314786538,
"learning_rate": 2.357587175478672e-06,
"loss": 0.1907,
"step": 1452
},
{
"epoch": 1.355410447761194,
"grad_norm": 1.1806857980871994,
"learning_rate": 2.3513702459796406e-06,
"loss": 0.1344,
"step": 1453
},
{
"epoch": 1.3563432835820897,
"grad_norm": 1.1918140230068817,
"learning_rate": 2.3451590033251887e-06,
"loss": 0.1525,
"step": 1454
},
{
"epoch": 1.357276119402985,
"grad_norm": 1.2130814375230647,
"learning_rate": 2.338953460851408e-06,
"loss": 0.1469,
"step": 1455
},
{
"epoch": 1.3582089552238805,
"grad_norm": 1.3714970513970581,
"learning_rate": 2.3327536318821496e-06,
"loss": 0.1723,
"step": 1456
},
{
"epoch": 1.359141791044776,
"grad_norm": 1.1811319514596708,
"learning_rate": 2.3265595297290035e-06,
"loss": 0.1451,
"step": 1457
},
{
"epoch": 1.3600746268656716,
"grad_norm": 1.1782788935254018,
"learning_rate": 2.320371167691258e-06,
"loss": 0.136,
"step": 1458
},
{
"epoch": 1.3610074626865671,
"grad_norm": 1.2409821702879118,
"learning_rate": 2.31418855905588e-06,
"loss": 0.1773,
"step": 1459
},
{
"epoch": 1.3619402985074627,
"grad_norm": 1.2830342959648524,
"learning_rate": 2.3080117170974827e-06,
"loss": 0.1867,
"step": 1460
},
{
"epoch": 1.3628731343283582,
"grad_norm": 1.2132120797541985,
"learning_rate": 2.301840655078298e-06,
"loss": 0.1475,
"step": 1461
},
{
"epoch": 1.3638059701492538,
"grad_norm": 1.3156738879306071,
"learning_rate": 2.2956753862481444e-06,
"loss": 0.1851,
"step": 1462
},
{
"epoch": 1.3647388059701493,
"grad_norm": 1.3398257271092922,
"learning_rate": 2.289515923844406e-06,
"loss": 0.177,
"step": 1463
},
{
"epoch": 1.3656716417910448,
"grad_norm": 1.259885121964175,
"learning_rate": 2.2833622810919987e-06,
"loss": 0.183,
"step": 1464
},
{
"epoch": 1.3666044776119404,
"grad_norm": 1.135200212068689,
"learning_rate": 2.277214471203342e-06,
"loss": 0.1333,
"step": 1465
},
{
"epoch": 1.3675373134328357,
"grad_norm": 1.29786431462598,
"learning_rate": 2.2710725073783345e-06,
"loss": 0.1912,
"step": 1466
},
{
"epoch": 1.3684701492537314,
"grad_norm": 1.2025422661506906,
"learning_rate": 2.264936402804322e-06,
"loss": 0.1435,
"step": 1467
},
{
"epoch": 1.3694029850746268,
"grad_norm": 1.2695588694838684,
"learning_rate": 2.2588061706560643e-06,
"loss": 0.1621,
"step": 1468
},
{
"epoch": 1.3703358208955223,
"grad_norm": 1.189376190696932,
"learning_rate": 2.2526818240957217e-06,
"loss": 0.1478,
"step": 1469
},
{
"epoch": 1.3712686567164178,
"grad_norm": 1.285370928942321,
"learning_rate": 2.2465633762728093e-06,
"loss": 0.1552,
"step": 1470
},
{
"epoch": 1.3722014925373134,
"grad_norm": 1.254381199655615,
"learning_rate": 2.240450840324183e-06,
"loss": 0.1704,
"step": 1471
},
{
"epoch": 1.373134328358209,
"grad_norm": 1.300268568192004,
"learning_rate": 2.234344229374003e-06,
"loss": 0.1495,
"step": 1472
},
{
"epoch": 1.3740671641791045,
"grad_norm": 1.2923264402806232,
"learning_rate": 2.2282435565337084e-06,
"loss": 0.1695,
"step": 1473
},
{
"epoch": 1.375,
"grad_norm": 1.2156952202999838,
"learning_rate": 2.2221488349019903e-06,
"loss": 0.1648,
"step": 1474
},
{
"epoch": 1.3759328358208955,
"grad_norm": 1.2625527545126876,
"learning_rate": 2.216060077564757e-06,
"loss": 0.1832,
"step": 1475
},
{
"epoch": 1.376865671641791,
"grad_norm": 1.207405401122152,
"learning_rate": 2.2099772975951145e-06,
"loss": 0.146,
"step": 1476
},
{
"epoch": 1.3777985074626866,
"grad_norm": 1.2427419323477014,
"learning_rate": 2.203900508053336e-06,
"loss": 0.18,
"step": 1477
},
{
"epoch": 1.3787313432835822,
"grad_norm": 1.2046157819367844,
"learning_rate": 2.1978297219868307e-06,
"loss": 0.1665,
"step": 1478
},
{
"epoch": 1.3796641791044777,
"grad_norm": 1.2987998527053135,
"learning_rate": 2.191764952430119e-06,
"loss": 0.1981,
"step": 1479
},
{
"epoch": 1.3805970149253732,
"grad_norm": 1.4449869844658465,
"learning_rate": 2.1857062124048036e-06,
"loss": 0.1755,
"step": 1480
},
{
"epoch": 1.3815298507462686,
"grad_norm": 1.2788873598790043,
"learning_rate": 2.1796535149195362e-06,
"loss": 0.1709,
"step": 1481
},
{
"epoch": 1.3824626865671643,
"grad_norm": 1.3383933421237963,
"learning_rate": 2.1736068729700045e-06,
"loss": 0.2013,
"step": 1482
},
{
"epoch": 1.3833955223880596,
"grad_norm": 1.1532079090653113,
"learning_rate": 2.167566299538883e-06,
"loss": 0.1491,
"step": 1483
},
{
"epoch": 1.3843283582089552,
"grad_norm": 1.2545583313203545,
"learning_rate": 2.161531807595825e-06,
"loss": 0.1535,
"step": 1484
},
{
"epoch": 1.3852611940298507,
"grad_norm": 1.3383178306960863,
"learning_rate": 2.155503410097423e-06,
"loss": 0.1992,
"step": 1485
},
{
"epoch": 1.3861940298507462,
"grad_norm": 1.2521413725439199,
"learning_rate": 2.1494811199871857e-06,
"loss": 0.1563,
"step": 1486
},
{
"epoch": 1.3871268656716418,
"grad_norm": 1.3317947331297295,
"learning_rate": 2.1434649501955062e-06,
"loss": 0.2041,
"step": 1487
},
{
"epoch": 1.3880597014925373,
"grad_norm": 1.2507994216524636,
"learning_rate": 2.1374549136396417e-06,
"loss": 0.1589,
"step": 1488
},
{
"epoch": 1.3889925373134329,
"grad_norm": 1.392732959414099,
"learning_rate": 2.1314510232236723e-06,
"loss": 0.1922,
"step": 1489
},
{
"epoch": 1.3899253731343284,
"grad_norm": 1.33597802710792,
"learning_rate": 2.1254532918384892e-06,
"loss": 0.1754,
"step": 1490
},
{
"epoch": 1.390858208955224,
"grad_norm": 1.3917612946207263,
"learning_rate": 2.119461732361757e-06,
"loss": 0.2253,
"step": 1491
},
{
"epoch": 1.3917910447761195,
"grad_norm": 1.313878661367244,
"learning_rate": 2.113476357657889e-06,
"loss": 0.1885,
"step": 1492
},
{
"epoch": 1.392723880597015,
"grad_norm": 1.222705362849111,
"learning_rate": 2.1074971805780196e-06,
"loss": 0.1418,
"step": 1493
},
{
"epoch": 1.3936567164179103,
"grad_norm": 1.318990905960625,
"learning_rate": 2.1015242139599773e-06,
"loss": 0.1941,
"step": 1494
},
{
"epoch": 1.394589552238806,
"grad_norm": 1.216340100659452,
"learning_rate": 2.095557470628253e-06,
"loss": 0.1515,
"step": 1495
},
{
"epoch": 1.3955223880597014,
"grad_norm": 1.3156787670542847,
"learning_rate": 2.089596963393975e-06,
"loss": 0.1511,
"step": 1496
},
{
"epoch": 1.396455223880597,
"grad_norm": 1.2276298610177279,
"learning_rate": 2.0836427050548874e-06,
"loss": 0.1581,
"step": 1497
},
{
"epoch": 1.3973880597014925,
"grad_norm": 1.3040288892618115,
"learning_rate": 2.0776947083953136e-06,
"loss": 0.1696,
"step": 1498
},
{
"epoch": 1.398320895522388,
"grad_norm": 1.3052154898802157,
"learning_rate": 2.071752986186134e-06,
"loss": 0.1868,
"step": 1499
},
{
"epoch": 1.3992537313432836,
"grad_norm": 1.2960385420690526,
"learning_rate": 2.0658175511847565e-06,
"loss": 0.203,
"step": 1500
},
{
"epoch": 1.3992537313432836,
"eval_loss": 0.2172926515340805,
"eval_runtime": 3.4362,
"eval_samples_per_second": 25.318,
"eval_steps_per_second": 6.402,
"step": 1500
},
{
"epoch": 1.400186567164179,
"grad_norm": 1.191067930404344,
"learning_rate": 2.0598884161350923e-06,
"loss": 0.1297,
"step": 1501
},
{
"epoch": 1.4011194029850746,
"grad_norm": 1.2983856064443016,
"learning_rate": 2.05396559376752e-06,
"loss": 0.2005,
"step": 1502
},
{
"epoch": 1.4020522388059702,
"grad_norm": 1.273195108135393,
"learning_rate": 2.0480490967988693e-06,
"loss": 0.1653,
"step": 1503
},
{
"epoch": 1.4029850746268657,
"grad_norm": 1.2477168832906964,
"learning_rate": 2.042138937932388e-06,
"loss": 0.1725,
"step": 1504
},
{
"epoch": 1.4039179104477613,
"grad_norm": 1.2654990217842248,
"learning_rate": 2.036235129857715e-06,
"loss": 0.1967,
"step": 1505
},
{
"epoch": 1.4048507462686568,
"grad_norm": 1.300973276034434,
"learning_rate": 2.0303376852508527e-06,
"loss": 0.1654,
"step": 1506
},
{
"epoch": 1.4057835820895521,
"grad_norm": 1.2239235226334035,
"learning_rate": 2.0244466167741434e-06,
"loss": 0.1543,
"step": 1507
},
{
"epoch": 1.4067164179104479,
"grad_norm": 1.3621770901905592,
"learning_rate": 2.018561937076236e-06,
"loss": 0.2214,
"step": 1508
},
{
"epoch": 1.4076492537313432,
"grad_norm": 1.2314178688880044,
"learning_rate": 2.0126836587920605e-06,
"loss": 0.1505,
"step": 1509
},
{
"epoch": 1.4085820895522387,
"grad_norm": 1.2375838754524546,
"learning_rate": 2.0068117945428077e-06,
"loss": 0.1488,
"step": 1510
},
{
"epoch": 1.4095149253731343,
"grad_norm": 1.2772042963424348,
"learning_rate": 2.0009463569358937e-06,
"loss": 0.1578,
"step": 1511
},
{
"epoch": 1.4104477611940298,
"grad_norm": 1.2704695628333311,
"learning_rate": 1.995087358564938e-06,
"loss": 0.1484,
"step": 1512
},
{
"epoch": 1.4113805970149254,
"grad_norm": 1.2676512489978526,
"learning_rate": 1.989234812009732e-06,
"loss": 0.1716,
"step": 1513
},
{
"epoch": 1.412313432835821,
"grad_norm": 1.3893814832466052,
"learning_rate": 1.9833887298362185e-06,
"loss": 0.1924,
"step": 1514
},
{
"epoch": 1.4132462686567164,
"grad_norm": 1.1972367918528255,
"learning_rate": 1.9775491245964535e-06,
"loss": 0.1559,
"step": 1515
},
{
"epoch": 1.414179104477612,
"grad_norm": 1.2690046609761634,
"learning_rate": 1.971716008828593e-06,
"loss": 0.1703,
"step": 1516
},
{
"epoch": 1.4151119402985075,
"grad_norm": 1.4415095142128806,
"learning_rate": 1.9658893950568574e-06,
"loss": 0.1807,
"step": 1517
},
{
"epoch": 1.416044776119403,
"grad_norm": 1.231941180448534,
"learning_rate": 1.9600692957915076e-06,
"loss": 0.1511,
"step": 1518
},
{
"epoch": 1.4169776119402986,
"grad_norm": 1.2454516869261922,
"learning_rate": 1.9542557235288146e-06,
"loss": 0.1552,
"step": 1519
},
{
"epoch": 1.417910447761194,
"grad_norm": 1.3500668677203271,
"learning_rate": 1.9484486907510405e-06,
"loss": 0.2019,
"step": 1520
},
{
"epoch": 1.4188432835820897,
"grad_norm": 1.3246104342962923,
"learning_rate": 1.9426482099264e-06,
"loss": 0.1708,
"step": 1521
},
{
"epoch": 1.419776119402985,
"grad_norm": 1.1917074288234641,
"learning_rate": 1.936854293509043e-06,
"loss": 0.1211,
"step": 1522
},
{
"epoch": 1.4207089552238805,
"grad_norm": 1.216178650600443,
"learning_rate": 1.9310669539390266e-06,
"loss": 0.1496,
"step": 1523
},
{
"epoch": 1.421641791044776,
"grad_norm": 1.3395376084981512,
"learning_rate": 1.925286203642285e-06,
"loss": 0.1734,
"step": 1524
},
{
"epoch": 1.4225746268656716,
"grad_norm": 1.344682183658041,
"learning_rate": 1.919512055030606e-06,
"loss": 0.2162,
"step": 1525
},
{
"epoch": 1.4235074626865671,
"grad_norm": 1.2739248529598715,
"learning_rate": 1.913744520501602e-06,
"loss": 0.1712,
"step": 1526
},
{
"epoch": 1.4244402985074627,
"grad_norm": 1.2620710786687312,
"learning_rate": 1.9079836124386865e-06,
"loss": 0.1853,
"step": 1527
},
{
"epoch": 1.4253731343283582,
"grad_norm": 1.2358754004543349,
"learning_rate": 1.90222934321104e-06,
"loss": 0.1611,
"step": 1528
},
{
"epoch": 1.4263059701492538,
"grad_norm": 1.2593400350557469,
"learning_rate": 1.896481725173594e-06,
"loss": 0.1871,
"step": 1529
},
{
"epoch": 1.4272388059701493,
"grad_norm": 1.2844520802387187,
"learning_rate": 1.8907407706669972e-06,
"loss": 0.179,
"step": 1530
},
{
"epoch": 1.4281716417910448,
"grad_norm": 1.2746170483788477,
"learning_rate": 1.8850064920175927e-06,
"loss": 0.1772,
"step": 1531
},
{
"epoch": 1.4291044776119404,
"grad_norm": 1.2940878198602277,
"learning_rate": 1.8792789015373875e-06,
"loss": 0.1862,
"step": 1532
},
{
"epoch": 1.4300373134328357,
"grad_norm": 1.195426677196978,
"learning_rate": 1.873558011524032e-06,
"loss": 0.1494,
"step": 1533
},
{
"epoch": 1.4309701492537314,
"grad_norm": 1.2615153545374882,
"learning_rate": 1.8678438342607846e-06,
"loss": 0.1908,
"step": 1534
},
{
"epoch": 1.4319029850746268,
"grad_norm": 1.3150310494175041,
"learning_rate": 1.8621363820164978e-06,
"loss": 0.1738,
"step": 1535
},
{
"epoch": 1.4328358208955223,
"grad_norm": 1.1992716911898411,
"learning_rate": 1.856435667045577e-06,
"loss": 0.1807,
"step": 1536
},
{
"epoch": 1.4337686567164178,
"grad_norm": 1.2442012956833255,
"learning_rate": 1.850741701587968e-06,
"loss": 0.1563,
"step": 1537
},
{
"epoch": 1.4347014925373134,
"grad_norm": 1.2918039875800205,
"learning_rate": 1.8450544978691237e-06,
"loss": 0.1675,
"step": 1538
},
{
"epoch": 1.435634328358209,
"grad_norm": 1.1774630640190071,
"learning_rate": 1.8393740680999783e-06,
"loss": 0.1195,
"step": 1539
},
{
"epoch": 1.4365671641791045,
"grad_norm": 1.2423005244049108,
"learning_rate": 1.8337004244769225e-06,
"loss": 0.1659,
"step": 1540
},
{
"epoch": 1.4375,
"grad_norm": 1.236103986246663,
"learning_rate": 1.8280335791817733e-06,
"loss": 0.172,
"step": 1541
},
{
"epoch": 1.4384328358208955,
"grad_norm": 1.2663025654525684,
"learning_rate": 1.8223735443817546e-06,
"loss": 0.2009,
"step": 1542
},
{
"epoch": 1.439365671641791,
"grad_norm": 1.1945304888279726,
"learning_rate": 1.8167203322294673e-06,
"loss": 0.1493,
"step": 1543
},
{
"epoch": 1.4402985074626866,
"grad_norm": 1.3246209552257189,
"learning_rate": 1.811073954862862e-06,
"loss": 0.1647,
"step": 1544
},
{
"epoch": 1.4412313432835822,
"grad_norm": 1.2286615732904398,
"learning_rate": 1.8054344244052153e-06,
"loss": 0.1477,
"step": 1545
},
{
"epoch": 1.4421641791044777,
"grad_norm": 1.3606390787574698,
"learning_rate": 1.7998017529651042e-06,
"loss": 0.181,
"step": 1546
},
{
"epoch": 1.4430970149253732,
"grad_norm": 1.3220909797733937,
"learning_rate": 1.7941759526363739e-06,
"loss": 0.2089,
"step": 1547
},
{
"epoch": 1.4440298507462686,
"grad_norm": 1.23294866264412,
"learning_rate": 1.7885570354981236e-06,
"loss": 0.16,
"step": 1548
},
{
"epoch": 1.4449626865671643,
"grad_norm": 1.1456697988885036,
"learning_rate": 1.7829450136146664e-06,
"loss": 0.127,
"step": 1549
},
{
"epoch": 1.4458955223880596,
"grad_norm": 1.242510469154095,
"learning_rate": 1.7773398990355162e-06,
"loss": 0.1422,
"step": 1550
},
{
"epoch": 1.4468283582089552,
"grad_norm": 1.289842263551184,
"learning_rate": 1.771741703795355e-06,
"loss": 0.2084,
"step": 1551
},
{
"epoch": 1.4477611940298507,
"grad_norm": 1.2230492782672477,
"learning_rate": 1.7661504399140066e-06,
"loss": 0.133,
"step": 1552
},
{
"epoch": 1.4486940298507462,
"grad_norm": 1.2448285718824177,
"learning_rate": 1.7605661193964169e-06,
"loss": 0.1822,
"step": 1553
},
{
"epoch": 1.4496268656716418,
"grad_norm": 1.3230576198153048,
"learning_rate": 1.754988754232616e-06,
"loss": 0.1895,
"step": 1554
},
{
"epoch": 1.4505597014925373,
"grad_norm": 1.2449432948513492,
"learning_rate": 1.749418356397708e-06,
"loss": 0.1658,
"step": 1555
},
{
"epoch": 1.4514925373134329,
"grad_norm": 1.2492631381392154,
"learning_rate": 1.7438549378518331e-06,
"loss": 0.189,
"step": 1556
},
{
"epoch": 1.4524253731343284,
"grad_norm": 1.323522251818834,
"learning_rate": 1.7382985105401485e-06,
"loss": 0.2106,
"step": 1557
},
{
"epoch": 1.453358208955224,
"grad_norm": 1.2370037278441117,
"learning_rate": 1.7327490863927998e-06,
"loss": 0.1698,
"step": 1558
},
{
"epoch": 1.4542910447761195,
"grad_norm": 1.265942036133397,
"learning_rate": 1.7272066773248974e-06,
"loss": 0.1688,
"step": 1559
},
{
"epoch": 1.455223880597015,
"grad_norm": 1.2009005216690825,
"learning_rate": 1.721671295236485e-06,
"loss": 0.1437,
"step": 1560
},
{
"epoch": 1.4561567164179103,
"grad_norm": 1.2681764317380788,
"learning_rate": 1.7161429520125244e-06,
"loss": 0.1499,
"step": 1561
},
{
"epoch": 1.457089552238806,
"grad_norm": 1.2226810045311758,
"learning_rate": 1.7106216595228636e-06,
"loss": 0.1614,
"step": 1562
},
{
"epoch": 1.4580223880597014,
"grad_norm": 1.2775985160587016,
"learning_rate": 1.705107429622207e-06,
"loss": 0.158,
"step": 1563
},
{
"epoch": 1.458955223880597,
"grad_norm": 1.3008246431123165,
"learning_rate": 1.6996002741500999e-06,
"loss": 0.1935,
"step": 1564
},
{
"epoch": 1.4598880597014925,
"grad_norm": 1.14452594932925,
"learning_rate": 1.694100204930898e-06,
"loss": 0.1491,
"step": 1565
},
{
"epoch": 1.460820895522388,
"grad_norm": 1.4381105859802414,
"learning_rate": 1.6886072337737418e-06,
"loss": 0.2209,
"step": 1566
},
{
"epoch": 1.4617537313432836,
"grad_norm": 1.2009555335447437,
"learning_rate": 1.6831213724725282e-06,
"loss": 0.1649,
"step": 1567
},
{
"epoch": 1.462686567164179,
"grad_norm": 1.1044201612968545,
"learning_rate": 1.677642632805892e-06,
"loss": 0.1149,
"step": 1568
},
{
"epoch": 1.4636194029850746,
"grad_norm": 1.201292892365621,
"learning_rate": 1.672171026537177e-06,
"loss": 0.1563,
"step": 1569
},
{
"epoch": 1.4645522388059702,
"grad_norm": 1.4098607205577598,
"learning_rate": 1.6667065654144105e-06,
"loss": 0.2303,
"step": 1570
},
{
"epoch": 1.4654850746268657,
"grad_norm": 1.2324389236814668,
"learning_rate": 1.661249261170278e-06,
"loss": 0.1742,
"step": 1571
},
{
"epoch": 1.4664179104477613,
"grad_norm": 1.2922406342420865,
"learning_rate": 1.6557991255221007e-06,
"loss": 0.2045,
"step": 1572
},
{
"epoch": 1.4673507462686568,
"grad_norm": 1.3009557247528194,
"learning_rate": 1.650356170171804e-06,
"loss": 0.181,
"step": 1573
},
{
"epoch": 1.4682835820895521,
"grad_norm": 1.4405772195986282,
"learning_rate": 1.6449204068058994e-06,
"loss": 0.1519,
"step": 1574
},
{
"epoch": 1.4692164179104479,
"grad_norm": 1.1432566344922312,
"learning_rate": 1.639491847095459e-06,
"loss": 0.1384,
"step": 1575
},
{
"epoch": 1.4701492537313432,
"grad_norm": 1.249487726677484,
"learning_rate": 1.6340705026960818e-06,
"loss": 0.159,
"step": 1576
},
{
"epoch": 1.4710820895522387,
"grad_norm": 1.2956733493021935,
"learning_rate": 1.6286563852478787e-06,
"loss": 0.1749,
"step": 1577
},
{
"epoch": 1.4720149253731343,
"grad_norm": 1.351965315446389,
"learning_rate": 1.623249506375445e-06,
"loss": 0.1927,
"step": 1578
},
{
"epoch": 1.4729477611940298,
"grad_norm": 1.2584895365550355,
"learning_rate": 1.6178498776878333e-06,
"loss": 0.1658,
"step": 1579
},
{
"epoch": 1.4738805970149254,
"grad_norm": 1.2828618472382831,
"learning_rate": 1.6124575107785245e-06,
"loss": 0.1892,
"step": 1580
},
{
"epoch": 1.474813432835821,
"grad_norm": 1.2323978562690985,
"learning_rate": 1.6070724172254148e-06,
"loss": 0.1595,
"step": 1581
},
{
"epoch": 1.4757462686567164,
"grad_norm": 1.2644417526045404,
"learning_rate": 1.6016946085907798e-06,
"loss": 0.1774,
"step": 1582
},
{
"epoch": 1.476679104477612,
"grad_norm": 1.2342045537498876,
"learning_rate": 1.5963240964212556e-06,
"loss": 0.1543,
"step": 1583
},
{
"epoch": 1.4776119402985075,
"grad_norm": 1.2125459502856037,
"learning_rate": 1.5909608922478108e-06,
"loss": 0.1572,
"step": 1584
},
{
"epoch": 1.478544776119403,
"grad_norm": 1.2609621232962978,
"learning_rate": 1.585605007585726e-06,
"loss": 0.1478,
"step": 1585
},
{
"epoch": 1.4794776119402986,
"grad_norm": 1.2867353665845063,
"learning_rate": 1.5802564539345599e-06,
"loss": 0.1917,
"step": 1586
},
{
"epoch": 1.480410447761194,
"grad_norm": 1.2669560096494146,
"learning_rate": 1.5749152427781367e-06,
"loss": 0.1758,
"step": 1587
},
{
"epoch": 1.4813432835820897,
"grad_norm": 1.2656220053772782,
"learning_rate": 1.5695813855845149e-06,
"loss": 0.1842,
"step": 1588
},
{
"epoch": 1.482276119402985,
"grad_norm": 1.3153542193728112,
"learning_rate": 1.5642548938059588e-06,
"loss": 0.1454,
"step": 1589
},
{
"epoch": 1.4832089552238805,
"grad_norm": 1.2452876594670799,
"learning_rate": 1.5589357788789244e-06,
"loss": 0.135,
"step": 1590
},
{
"epoch": 1.484141791044776,
"grad_norm": 1.2717468140148107,
"learning_rate": 1.5536240522240259e-06,
"loss": 0.1937,
"step": 1591
},
{
"epoch": 1.4850746268656716,
"grad_norm": 1.3275570249611397,
"learning_rate": 1.5483197252460158e-06,
"loss": 0.2052,
"step": 1592
},
{
"epoch": 1.4860074626865671,
"grad_norm": 1.2883023956548525,
"learning_rate": 1.543022809333755e-06,
"loss": 0.1888,
"step": 1593
},
{
"epoch": 1.4869402985074627,
"grad_norm": 1.285850183604376,
"learning_rate": 1.537733315860197e-06,
"loss": 0.1362,
"step": 1594
},
{
"epoch": 1.4878731343283582,
"grad_norm": 1.3564622211084076,
"learning_rate": 1.5324512561823562e-06,
"loss": 0.2011,
"step": 1595
},
{
"epoch": 1.4888059701492538,
"grad_norm": 1.1780864154364084,
"learning_rate": 1.527176641641286e-06,
"loss": 0.148,
"step": 1596
},
{
"epoch": 1.4897388059701493,
"grad_norm": 1.3062816174924374,
"learning_rate": 1.5219094835620546e-06,
"loss": 0.2116,
"step": 1597
},
{
"epoch": 1.4906716417910448,
"grad_norm": 1.2861264449635466,
"learning_rate": 1.5166497932537233e-06,
"loss": 0.1807,
"step": 1598
},
{
"epoch": 1.4916044776119404,
"grad_norm": 1.3040316994439474,
"learning_rate": 1.5113975820093129e-06,
"loss": 0.1908,
"step": 1599
},
{
"epoch": 1.4925373134328357,
"grad_norm": 1.2287037658204552,
"learning_rate": 1.5061528611057917e-06,
"loss": 0.1513,
"step": 1600
},
{
"epoch": 1.4934701492537314,
"grad_norm": 1.2792449236973227,
"learning_rate": 1.5009156418040443e-06,
"loss": 0.1961,
"step": 1601
},
{
"epoch": 1.4944029850746268,
"grad_norm": 1.3629466625435644,
"learning_rate": 1.4956859353488484e-06,
"loss": 0.1457,
"step": 1602
},
{
"epoch": 1.4953358208955223,
"grad_norm": 1.151598855059448,
"learning_rate": 1.4904637529688492e-06,
"loss": 0.1234,
"step": 1603
},
{
"epoch": 1.4962686567164178,
"grad_norm": 1.2603516755627053,
"learning_rate": 1.4852491058765388e-06,
"loss": 0.1969,
"step": 1604
},
{
"epoch": 1.4972014925373134,
"grad_norm": 1.3029607032037984,
"learning_rate": 1.4800420052682308e-06,
"loss": 0.1732,
"step": 1605
},
{
"epoch": 1.498134328358209,
"grad_norm": 1.2718701457765713,
"learning_rate": 1.4748424623240364e-06,
"loss": 0.1748,
"step": 1606
},
{
"epoch": 1.4990671641791045,
"grad_norm": 1.2583215619223413,
"learning_rate": 1.4696504882078361e-06,
"loss": 0.1488,
"step": 1607
},
{
"epoch": 1.5,
"grad_norm": 1.3343211399094133,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.154,
"step": 1608
},
{
"epoch": 1.5009328358208955,
"grad_norm": 1.370324950789676,
"learning_rate": 1.4592892910336738e-06,
"loss": 0.1641,
"step": 1609
},
{
"epoch": 1.501865671641791,
"grad_norm": 1.263752243517082,
"learning_rate": 1.4541200902221276e-06,
"loss": 0.1589,
"step": 1610
},
{
"epoch": 1.5027985074626866,
"grad_norm": 1.2805016526218522,
"learning_rate": 1.4489585027313613e-06,
"loss": 0.1869,
"step": 1611
},
{
"epoch": 1.5037313432835822,
"grad_norm": 1.289232569056423,
"learning_rate": 1.4438045396437606e-06,
"loss": 0.1483,
"step": 1612
},
{
"epoch": 1.5046641791044775,
"grad_norm": 1.1645787470541484,
"learning_rate": 1.4386582120253467e-06,
"loss": 0.1412,
"step": 1613
},
{
"epoch": 1.5055970149253732,
"grad_norm": 1.203746273397564,
"learning_rate": 1.433519530925745e-06,
"loss": 0.136,
"step": 1614
},
{
"epoch": 1.5065298507462686,
"grad_norm": 1.2037002849351364,
"learning_rate": 1.4283885073781628e-06,
"loss": 0.1473,
"step": 1615
},
{
"epoch": 1.5074626865671643,
"grad_norm": 1.2417425890659581,
"learning_rate": 1.4232651523993635e-06,
"loss": 0.1541,
"step": 1616
},
{
"epoch": 1.5083955223880596,
"grad_norm": 1.223691899770722,
"learning_rate": 1.4181494769896487e-06,
"loss": 0.1522,
"step": 1617
},
{
"epoch": 1.5093283582089554,
"grad_norm": 1.1957898706574794,
"learning_rate": 1.413041492132831e-06,
"loss": 0.1471,
"step": 1618
},
{
"epoch": 1.5102611940298507,
"grad_norm": 1.2163473521299886,
"learning_rate": 1.4079412087962113e-06,
"loss": 0.1753,
"step": 1619
},
{
"epoch": 1.5111940298507462,
"grad_norm": 1.2661586102986833,
"learning_rate": 1.4028486379305507e-06,
"loss": 0.1903,
"step": 1620
},
{
"epoch": 1.5121268656716418,
"grad_norm": 1.3129586395056405,
"learning_rate": 1.397763790470054e-06,
"loss": 0.1941,
"step": 1621
},
{
"epoch": 1.5130597014925373,
"grad_norm": 1.2377248133069354,
"learning_rate": 1.3926866773323434e-06,
"loss": 0.1758,
"step": 1622
},
{
"epoch": 1.5139925373134329,
"grad_norm": 1.2697894927159725,
"learning_rate": 1.3876173094184341e-06,
"loss": 0.1851,
"step": 1623
},
{
"epoch": 1.5149253731343284,
"grad_norm": 1.2664467073712171,
"learning_rate": 1.3825556976127119e-06,
"loss": 0.1798,
"step": 1624
},
{
"epoch": 1.515858208955224,
"grad_norm": 1.2561239218852494,
"learning_rate": 1.3775018527829103e-06,
"loss": 0.1708,
"step": 1625
},
{
"epoch": 1.5167910447761193,
"grad_norm": 1.2767075134930894,
"learning_rate": 1.3724557857800824e-06,
"loss": 0.1443,
"step": 1626
},
{
"epoch": 1.517723880597015,
"grad_norm": 1.202873843485902,
"learning_rate": 1.3674175074385866e-06,
"loss": 0.1548,
"step": 1627
},
{
"epoch": 1.5186567164179103,
"grad_norm": 1.2304513532013928,
"learning_rate": 1.362387028576056e-06,
"loss": 0.1731,
"step": 1628
},
{
"epoch": 1.519589552238806,
"grad_norm": 1.3149193632354779,
"learning_rate": 1.3573643599933794e-06,
"loss": 0.1781,
"step": 1629
},
{
"epoch": 1.5205223880597014,
"grad_norm": 1.2283146253340484,
"learning_rate": 1.3523495124746722e-06,
"loss": 0.1478,
"step": 1630
},
{
"epoch": 1.5214552238805972,
"grad_norm": 1.2370105468976618,
"learning_rate": 1.3473424967872606e-06,
"loss": 0.1539,
"step": 1631
},
{
"epoch": 1.5223880597014925,
"grad_norm": 1.1375086929232519,
"learning_rate": 1.3423433236816563e-06,
"loss": 0.129,
"step": 1632
},
{
"epoch": 1.523320895522388,
"grad_norm": 1.2856837840697544,
"learning_rate": 1.3373520038915271e-06,
"loss": 0.1773,
"step": 1633
},
{
"epoch": 1.5242537313432836,
"grad_norm": 1.2384146311184505,
"learning_rate": 1.332368548133684e-06,
"loss": 0.1603,
"step": 1634
},
{
"epoch": 1.525186567164179,
"grad_norm": 1.379242483405248,
"learning_rate": 1.3273929671080515e-06,
"loss": 0.1768,
"step": 1635
},
{
"epoch": 1.5261194029850746,
"grad_norm": 1.274654451134043,
"learning_rate": 1.322425271497646e-06,
"loss": 0.147,
"step": 1636
},
{
"epoch": 1.5270522388059702,
"grad_norm": 1.215985264316688,
"learning_rate": 1.3174654719685537e-06,
"loss": 0.136,
"step": 1637
},
{
"epoch": 1.5279850746268657,
"grad_norm": 1.2344772185357755,
"learning_rate": 1.3125135791699084e-06,
"loss": 0.1855,
"step": 1638
},
{
"epoch": 1.528917910447761,
"grad_norm": 1.255018239260394,
"learning_rate": 1.3075696037338636e-06,
"loss": 0.1624,
"step": 1639
},
{
"epoch": 1.5298507462686568,
"grad_norm": 1.3004318503788717,
"learning_rate": 1.302633556275577e-06,
"loss": 0.181,
"step": 1640
},
{
"epoch": 1.5307835820895521,
"grad_norm": 1.316168592475357,
"learning_rate": 1.2977054473931838e-06,
"loss": 0.1794,
"step": 1641
},
{
"epoch": 1.5317164179104479,
"grad_norm": 1.217753124143506,
"learning_rate": 1.292785287667775e-06,
"loss": 0.1454,
"step": 1642
},
{
"epoch": 1.5326492537313432,
"grad_norm": 1.2802301204083537,
"learning_rate": 1.2878730876633694e-06,
"loss": 0.1672,
"step": 1643
},
{
"epoch": 1.533582089552239,
"grad_norm": 1.4173466323508679,
"learning_rate": 1.2829688579269006e-06,
"loss": 0.223,
"step": 1644
},
{
"epoch": 1.5345149253731343,
"grad_norm": 1.2209656067143468,
"learning_rate": 1.27807260898819e-06,
"loss": 0.1507,
"step": 1645
},
{
"epoch": 1.5354477611940298,
"grad_norm": 1.4096385483613454,
"learning_rate": 1.2731843513599179e-06,
"loss": 0.2567,
"step": 1646
},
{
"epoch": 1.5363805970149254,
"grad_norm": 1.367550102899949,
"learning_rate": 1.2683040955376109e-06,
"loss": 0.2145,
"step": 1647
},
{
"epoch": 1.537313432835821,
"grad_norm": 1.4046543061081387,
"learning_rate": 1.2634318519996148e-06,
"loss": 0.208,
"step": 1648
},
{
"epoch": 1.5382462686567164,
"grad_norm": 1.216717515432744,
"learning_rate": 1.258567631207071e-06,
"loss": 0.156,
"step": 1649
},
{
"epoch": 1.539179104477612,
"grad_norm": 1.1279229884919437,
"learning_rate": 1.253711443603896e-06,
"loss": 0.1321,
"step": 1650
},
{
"epoch": 1.5401119402985075,
"grad_norm": 1.1956783677025602,
"learning_rate": 1.2488632996167594e-06,
"loss": 0.1412,
"step": 1651
},
{
"epoch": 1.5410447761194028,
"grad_norm": 1.191733998318685,
"learning_rate": 1.244023209655057e-06,
"loss": 0.1537,
"step": 1652
},
{
"epoch": 1.5419776119402986,
"grad_norm": 1.254861547050445,
"learning_rate": 1.239191184110895e-06,
"loss": 0.1684,
"step": 1653
},
{
"epoch": 1.542910447761194,
"grad_norm": 1.4186506209173417,
"learning_rate": 1.2343672333590639e-06,
"loss": 0.2478,
"step": 1654
},
{
"epoch": 1.5438432835820897,
"grad_norm": 1.2645629635280344,
"learning_rate": 1.2295513677570176e-06,
"loss": 0.1641,
"step": 1655
},
{
"epoch": 1.544776119402985,
"grad_norm": 1.1434600478478376,
"learning_rate": 1.2247435976448474e-06,
"loss": 0.1491,
"step": 1656
},
{
"epoch": 1.5457089552238807,
"grad_norm": 1.3072744229757376,
"learning_rate": 1.2199439333452667e-06,
"loss": 0.2054,
"step": 1657
},
{
"epoch": 1.546641791044776,
"grad_norm": 1.3410789883038186,
"learning_rate": 1.2151523851635839e-06,
"loss": 0.1912,
"step": 1658
},
{
"epoch": 1.5475746268656716,
"grad_norm": 1.2553138926818888,
"learning_rate": 1.2103689633876781e-06,
"loss": 0.1724,
"step": 1659
},
{
"epoch": 1.5485074626865671,
"grad_norm": 1.266417108898093,
"learning_rate": 1.2055936782879845e-06,
"loss": 0.1606,
"step": 1660
},
{
"epoch": 1.5494402985074627,
"grad_norm": 1.2492407069458422,
"learning_rate": 1.2008265401174673e-06,
"loss": 0.1741,
"step": 1661
},
{
"epoch": 1.5503731343283582,
"grad_norm": 1.2268918917008558,
"learning_rate": 1.1960675591115966e-06,
"loss": 0.1828,
"step": 1662
},
{
"epoch": 1.5513059701492538,
"grad_norm": 1.2622548581146535,
"learning_rate": 1.1913167454883306e-06,
"loss": 0.17,
"step": 1663
},
{
"epoch": 1.5522388059701493,
"grad_norm": 1.414252888193506,
"learning_rate": 1.186574109448091e-06,
"loss": 0.2519,
"step": 1664
},
{
"epoch": 1.5531716417910446,
"grad_norm": 1.165184752660399,
"learning_rate": 1.1818396611737381e-06,
"loss": 0.1337,
"step": 1665
},
{
"epoch": 1.5541044776119404,
"grad_norm": 1.2575282833611288,
"learning_rate": 1.1771134108305572e-06,
"loss": 0.1833,
"step": 1666
},
{
"epoch": 1.5550373134328357,
"grad_norm": 1.277913286175671,
"learning_rate": 1.1723953685662287e-06,
"loss": 0.1923,
"step": 1667
},
{
"epoch": 1.5559701492537314,
"grad_norm": 1.2216895135729264,
"learning_rate": 1.1676855445108114e-06,
"loss": 0.1484,
"step": 1668
},
{
"epoch": 1.5569029850746268,
"grad_norm": 1.22394919388589,
"learning_rate": 1.1629839487767198e-06,
"loss": 0.1856,
"step": 1669
},
{
"epoch": 1.5578358208955225,
"grad_norm": 1.1992828862006262,
"learning_rate": 1.1582905914586961e-06,
"loss": 0.1554,
"step": 1670
},
{
"epoch": 1.5587686567164178,
"grad_norm": 1.246475284992138,
"learning_rate": 1.1536054826338005e-06,
"loss": 0.1577,
"step": 1671
},
{
"epoch": 1.5597014925373134,
"grad_norm": 1.2456042827508538,
"learning_rate": 1.148928632361378e-06,
"loss": 0.16,
"step": 1672
},
{
"epoch": 1.560634328358209,
"grad_norm": 1.2153660310681467,
"learning_rate": 1.1442600506830443e-06,
"loss": 0.1692,
"step": 1673
},
{
"epoch": 1.5615671641791045,
"grad_norm": 1.3206114927759005,
"learning_rate": 1.1395997476226612e-06,
"loss": 0.1837,
"step": 1674
},
{
"epoch": 1.5625,
"grad_norm": 1.189916838030876,
"learning_rate": 1.134947733186315e-06,
"loss": 0.1443,
"step": 1675
},
{
"epoch": 1.5634328358208955,
"grad_norm": 1.2105885080130891,
"learning_rate": 1.1303040173622977e-06,
"loss": 0.1503,
"step": 1676
},
{
"epoch": 1.564365671641791,
"grad_norm": 1.2501407525049524,
"learning_rate": 1.1256686101210818e-06,
"loss": 0.1786,
"step": 1677
},
{
"epoch": 1.5652985074626866,
"grad_norm": 1.215174069621574,
"learning_rate": 1.1210415214152976e-06,
"loss": 0.1643,
"step": 1678
},
{
"epoch": 1.5662313432835822,
"grad_norm": 1.2836568443928325,
"learning_rate": 1.1164227611797202e-06,
"loss": 0.1698,
"step": 1679
},
{
"epoch": 1.5671641791044775,
"grad_norm": 1.2894993667961356,
"learning_rate": 1.1118123393312397e-06,
"loss": 0.195,
"step": 1680
},
{
"epoch": 1.5680970149253732,
"grad_norm": 1.2395608868763646,
"learning_rate": 1.1072102657688434e-06,
"loss": 0.1448,
"step": 1681
},
{
"epoch": 1.5690298507462686,
"grad_norm": 1.2571854764243646,
"learning_rate": 1.1026165503735959e-06,
"loss": 0.1682,
"step": 1682
},
{
"epoch": 1.5699626865671643,
"grad_norm": 1.241220089205576,
"learning_rate": 1.0980312030086104e-06,
"loss": 0.1608,
"step": 1683
},
{
"epoch": 1.5708955223880596,
"grad_norm": 1.2317979456686032,
"learning_rate": 1.0934542335190418e-06,
"loss": 0.1421,
"step": 1684
},
{
"epoch": 1.5718283582089554,
"grad_norm": 1.2762123047527187,
"learning_rate": 1.0888856517320478e-06,
"loss": 0.1711,
"step": 1685
},
{
"epoch": 1.5727611940298507,
"grad_norm": 1.274556971173205,
"learning_rate": 1.0843254674567832e-06,
"loss": 0.1498,
"step": 1686
},
{
"epoch": 1.5736940298507462,
"grad_norm": 1.360555726010378,
"learning_rate": 1.079773690484372e-06,
"loss": 0.1883,
"step": 1687
},
{
"epoch": 1.5746268656716418,
"grad_norm": 1.2144506087038018,
"learning_rate": 1.075230330587884e-06,
"loss": 0.1566,
"step": 1688
},
{
"epoch": 1.5755597014925373,
"grad_norm": 1.2797794908909506,
"learning_rate": 1.07069539752232e-06,
"loss": 0.1693,
"step": 1689
},
{
"epoch": 1.5764925373134329,
"grad_norm": 1.1695369348750864,
"learning_rate": 1.0661689010245868e-06,
"loss": 0.1335,
"step": 1690
},
{
"epoch": 1.5774253731343284,
"grad_norm": 1.278683912569651,
"learning_rate": 1.0616508508134737e-06,
"loss": 0.2141,
"step": 1691
},
{
"epoch": 1.578358208955224,
"grad_norm": 1.23092493269237,
"learning_rate": 1.0571412565896406e-06,
"loss": 0.1394,
"step": 1692
},
{
"epoch": 1.5792910447761193,
"grad_norm": 1.3214140489535262,
"learning_rate": 1.052640128035587e-06,
"loss": 0.1938,
"step": 1693
},
{
"epoch": 1.580223880597015,
"grad_norm": 1.1396889903545657,
"learning_rate": 1.048147474815639e-06,
"loss": 0.139,
"step": 1694
},
{
"epoch": 1.5811567164179103,
"grad_norm": 1.2733826197436557,
"learning_rate": 1.0436633065759243e-06,
"loss": 0.1387,
"step": 1695
},
{
"epoch": 1.582089552238806,
"grad_norm": 1.348534129553173,
"learning_rate": 1.0391876329443534e-06,
"loss": 0.2277,
"step": 1696
},
{
"epoch": 1.5830223880597014,
"grad_norm": 1.3244185013193703,
"learning_rate": 1.0347204635305963e-06,
"loss": 0.1666,
"step": 1697
},
{
"epoch": 1.5839552238805972,
"grad_norm": 1.1133707065001888,
"learning_rate": 1.030261807926063e-06,
"loss": 0.1185,
"step": 1698
},
{
"epoch": 1.5848880597014925,
"grad_norm": 1.296948122560028,
"learning_rate": 1.0258116757038862e-06,
"loss": 0.1699,
"step": 1699
},
{
"epoch": 1.585820895522388,
"grad_norm": 1.2146289607505514,
"learning_rate": 1.0213700764188978e-06,
"loss": 0.174,
"step": 1700
},
{
"epoch": 1.5867537313432836,
"grad_norm": 1.265118708956542,
"learning_rate": 1.0169370196076073e-06,
"loss": 0.1499,
"step": 1701
},
{
"epoch": 1.587686567164179,
"grad_norm": 1.1766160463133417,
"learning_rate": 1.0125125147881842e-06,
"loss": 0.134,
"step": 1702
},
{
"epoch": 1.5886194029850746,
"grad_norm": 1.2061441688887677,
"learning_rate": 1.0080965714604368e-06,
"loss": 0.1517,
"step": 1703
},
{
"epoch": 1.5895522388059702,
"grad_norm": 1.2419522468754731,
"learning_rate": 1.0036891991057863e-06,
"loss": 0.1785,
"step": 1704
},
{
"epoch": 1.5904850746268657,
"grad_norm": 1.233505174658301,
"learning_rate": 9.992904071872567e-07,
"loss": 0.1678,
"step": 1705
},
{
"epoch": 1.591417910447761,
"grad_norm": 1.2433709953213712,
"learning_rate": 9.949002051494467e-07,
"loss": 0.1655,
"step": 1706
},
{
"epoch": 1.5923507462686568,
"grad_norm": 1.3337135755983585,
"learning_rate": 9.90518602418512e-07,
"loss": 0.2046,
"step": 1707
},
{
"epoch": 1.5932835820895521,
"grad_norm": 1.1762132160436038,
"learning_rate": 9.861456084021448e-07,
"loss": 0.1386,
"step": 1708
},
{
"epoch": 1.5942164179104479,
"grad_norm": 1.139423349315348,
"learning_rate": 9.81781232489556e-07,
"loss": 0.1309,
"step": 1709
},
{
"epoch": 1.5951492537313432,
"grad_norm": 1.211881847706772,
"learning_rate": 9.774254840514474e-07,
"loss": 0.1451,
"step": 1710
},
{
"epoch": 1.596082089552239,
"grad_norm": 1.2929233087170993,
"learning_rate": 9.730783724400005e-07,
"loss": 0.1864,
"step": 1711
},
{
"epoch": 1.5970149253731343,
"grad_norm": 1.2599521105020628,
"learning_rate": 9.687399069888515e-07,
"loss": 0.1807,
"step": 1712
},
{
"epoch": 1.5979477611940298,
"grad_norm": 1.2254395494763746,
"learning_rate": 9.644100970130743e-07,
"loss": 0.1383,
"step": 1713
},
{
"epoch": 1.5988805970149254,
"grad_norm": 1.266780274181055,
"learning_rate": 9.600889518091572e-07,
"loss": 0.1634,
"step": 1714
},
{
"epoch": 1.599813432835821,
"grad_norm": 1.1862088701024427,
"learning_rate": 9.557764806549852e-07,
"loss": 0.1498,
"step": 1715
},
{
"epoch": 1.6007462686567164,
"grad_norm": 1.6649201506362419,
"learning_rate": 9.514726928098189e-07,
"loss": 0.1837,
"step": 1716
},
{
"epoch": 1.601679104477612,
"grad_norm": 1.234264729313629,
"learning_rate": 9.471775975142739e-07,
"loss": 0.1555,
"step": 1717
},
{
"epoch": 1.6026119402985075,
"grad_norm": 1.3098969463186163,
"learning_rate": 9.428912039903043e-07,
"loss": 0.1954,
"step": 1718
},
{
"epoch": 1.6035447761194028,
"grad_norm": 1.2922380419469448,
"learning_rate": 9.38613521441179e-07,
"loss": 0.1599,
"step": 1719
},
{
"epoch": 1.6044776119402986,
"grad_norm": 1.2490251437386872,
"learning_rate": 9.343445590514655e-07,
"loss": 0.1579,
"step": 1720
},
{
"epoch": 1.605410447761194,
"grad_norm": 1.2901488698253734,
"learning_rate": 9.300843259870063e-07,
"loss": 0.1648,
"step": 1721
},
{
"epoch": 1.6063432835820897,
"grad_norm": 1.2596444353650946,
"learning_rate": 9.258328313949039e-07,
"loss": 0.1771,
"step": 1722
},
{
"epoch": 1.607276119402985,
"grad_norm": 1.3134693635948327,
"learning_rate": 9.215900844034953e-07,
"loss": 0.2183,
"step": 1723
},
{
"epoch": 1.6082089552238807,
"grad_norm": 1.2504068109657847,
"learning_rate": 9.173560941223359e-07,
"loss": 0.1531,
"step": 1724
},
{
"epoch": 1.609141791044776,
"grad_norm": 1.2966735825237254,
"learning_rate": 9.131308696421825e-07,
"loss": 0.1723,
"step": 1725
},
{
"epoch": 1.6100746268656716,
"grad_norm": 1.2438087325878,
"learning_rate": 9.089144200349687e-07,
"loss": 0.1724,
"step": 1726
},
{
"epoch": 1.6110074626865671,
"grad_norm": 1.2142565513804016,
"learning_rate": 9.047067543537891e-07,
"loss": 0.1526,
"step": 1727
},
{
"epoch": 1.6119402985074627,
"grad_norm": 1.2410024286686867,
"learning_rate": 9.005078816328772e-07,
"loss": 0.1394,
"step": 1728
},
{
"epoch": 1.6128731343283582,
"grad_norm": 1.232244694202904,
"learning_rate": 8.963178108875886e-07,
"loss": 0.1573,
"step": 1729
},
{
"epoch": 1.6138059701492538,
"grad_norm": 1.2366886540270394,
"learning_rate": 8.92136551114377e-07,
"loss": 0.1699,
"step": 1730
},
{
"epoch": 1.6147388059701493,
"grad_norm": 1.195896184801038,
"learning_rate": 8.879641112907822e-07,
"loss": 0.1436,
"step": 1731
},
{
"epoch": 1.6156716417910446,
"grad_norm": 1.2275267806631487,
"learning_rate": 8.838005003754046e-07,
"loss": 0.1609,
"step": 1732
},
{
"epoch": 1.6166044776119404,
"grad_norm": 1.2746610225972304,
"learning_rate": 8.796457273078884e-07,
"loss": 0.1945,
"step": 1733
},
{
"epoch": 1.6175373134328357,
"grad_norm": 1.289578111770472,
"learning_rate": 8.754998010089033e-07,
"loss": 0.1872,
"step": 1734
},
{
"epoch": 1.6184701492537314,
"grad_norm": 1.2391204387890444,
"learning_rate": 8.713627303801237e-07,
"loss": 0.1821,
"step": 1735
},
{
"epoch": 1.6194029850746268,
"grad_norm": 1.247763928063891,
"learning_rate": 8.672345243042068e-07,
"loss": 0.1621,
"step": 1736
},
{
"epoch": 1.6203358208955225,
"grad_norm": 1.3225057066087762,
"learning_rate": 8.631151916447833e-07,
"loss": 0.2302,
"step": 1737
},
{
"epoch": 1.6212686567164178,
"grad_norm": 1.2664538479356695,
"learning_rate": 8.590047412464247e-07,
"loss": 0.1691,
"step": 1738
},
{
"epoch": 1.6222014925373134,
"grad_norm": 1.2197793802096346,
"learning_rate": 8.549031819346365e-07,
"loss": 0.1491,
"step": 1739
},
{
"epoch": 1.623134328358209,
"grad_norm": 1.1659712448306752,
"learning_rate": 8.50810522515833e-07,
"loss": 0.1362,
"step": 1740
},
{
"epoch": 1.6240671641791045,
"grad_norm": 1.2826109436963649,
"learning_rate": 8.467267717773198e-07,
"loss": 0.197,
"step": 1741
},
{
"epoch": 1.625,
"grad_norm": 1.1964370900129955,
"learning_rate": 8.426519384872733e-07,
"loss": 0.147,
"step": 1742
},
{
"epoch": 1.6259328358208955,
"grad_norm": 1.2219216768473495,
"learning_rate": 8.385860313947269e-07,
"loss": 0.1452,
"step": 1743
},
{
"epoch": 1.626865671641791,
"grad_norm": 1.215418686931946,
"learning_rate": 8.345290592295429e-07,
"loss": 0.1388,
"step": 1744
},
{
"epoch": 1.6277985074626866,
"grad_norm": 1.2916191415428755,
"learning_rate": 8.304810307024041e-07,
"loss": 0.213,
"step": 1745
},
{
"epoch": 1.6287313432835822,
"grad_norm": 1.3474400193843503,
"learning_rate": 8.264419545047892e-07,
"loss": 0.2257,
"step": 1746
},
{
"epoch": 1.6296641791044775,
"grad_norm": 1.417866910801035,
"learning_rate": 8.224118393089553e-07,
"loss": 0.1822,
"step": 1747
},
{
"epoch": 1.6305970149253732,
"grad_norm": 1.2701405693061993,
"learning_rate": 8.183906937679214e-07,
"loss": 0.162,
"step": 1748
},
{
"epoch": 1.6315298507462686,
"grad_norm": 1.171354349701999,
"learning_rate": 8.143785265154436e-07,
"loss": 0.1493,
"step": 1749
},
{
"epoch": 1.6324626865671643,
"grad_norm": 1.245853361402978,
"learning_rate": 8.103753461660046e-07,
"loss": 0.137,
"step": 1750
},
{
"epoch": 1.6333955223880596,
"grad_norm": 1.2401681768660737,
"learning_rate": 8.063811613147888e-07,
"loss": 0.1545,
"step": 1751
},
{
"epoch": 1.6343283582089554,
"grad_norm": 1.3543115825906518,
"learning_rate": 8.02395980537668e-07,
"loss": 0.2262,
"step": 1752
},
{
"epoch": 1.6352611940298507,
"grad_norm": 1.2622941439543376,
"learning_rate": 7.984198123911819e-07,
"loss": 0.1854,
"step": 1753
},
{
"epoch": 1.6361940298507462,
"grad_norm": 1.2200368244742743,
"learning_rate": 7.944526654125184e-07,
"loss": 0.1715,
"step": 1754
},
{
"epoch": 1.6371268656716418,
"grad_norm": 1.306003439685902,
"learning_rate": 7.904945481194959e-07,
"loss": 0.1799,
"step": 1755
},
{
"epoch": 1.6380597014925373,
"grad_norm": 1.1903945099779447,
"learning_rate": 7.865454690105472e-07,
"loss": 0.1441,
"step": 1756
},
{
"epoch": 1.6389925373134329,
"grad_norm": 1.1424787110504189,
"learning_rate": 7.826054365646951e-07,
"loss": 0.1264,
"step": 1757
},
{
"epoch": 1.6399253731343284,
"grad_norm": 1.25777704993289,
"learning_rate": 7.786744592415429e-07,
"loss": 0.1925,
"step": 1758
},
{
"epoch": 1.640858208955224,
"grad_norm": 1.2658783455065246,
"learning_rate": 7.747525454812488e-07,
"loss": 0.1521,
"step": 1759
},
{
"epoch": 1.6417910447761193,
"grad_norm": 1.2307245795846298,
"learning_rate": 7.708397037045129e-07,
"loss": 0.1411,
"step": 1760
},
{
"epoch": 1.642723880597015,
"grad_norm": 1.2428854455488574,
"learning_rate": 7.669359423125555e-07,
"loss": 0.1727,
"step": 1761
},
{
"epoch": 1.6436567164179103,
"grad_norm": 1.257958809857627,
"learning_rate": 7.630412696871015e-07,
"loss": 0.1585,
"step": 1762
},
{
"epoch": 1.644589552238806,
"grad_norm": 1.2947143695235483,
"learning_rate": 7.591556941903605e-07,
"loss": 0.1912,
"step": 1763
},
{
"epoch": 1.6455223880597014,
"grad_norm": 1.259074792643218,
"learning_rate": 7.552792241650081e-07,
"loss": 0.182,
"step": 1764
},
{
"epoch": 1.6464552238805972,
"grad_norm": 1.214636998645944,
"learning_rate": 7.514118679341737e-07,
"loss": 0.1533,
"step": 1765
},
{
"epoch": 1.6473880597014925,
"grad_norm": 1.1789528360329278,
"learning_rate": 7.475536338014156e-07,
"loss": 0.1592,
"step": 1766
},
{
"epoch": 1.648320895522388,
"grad_norm": 1.2385211364305988,
"learning_rate": 7.437045300507068e-07,
"loss": 0.1878,
"step": 1767
},
{
"epoch": 1.6492537313432836,
"grad_norm": 1.2464420700757242,
"learning_rate": 7.398645649464175e-07,
"loss": 0.1724,
"step": 1768
},
{
"epoch": 1.650186567164179,
"grad_norm": 1.2615694365379468,
"learning_rate": 7.360337467332968e-07,
"loss": 0.1751,
"step": 1769
},
{
"epoch": 1.6511194029850746,
"grad_norm": 1.2024654335089155,
"learning_rate": 7.322120836364504e-07,
"loss": 0.1368,
"step": 1770
},
{
"epoch": 1.6520522388059702,
"grad_norm": 1.2616960564996835,
"learning_rate": 7.283995838613323e-07,
"loss": 0.1651,
"step": 1771
},
{
"epoch": 1.6529850746268657,
"grad_norm": 1.2711287773276967,
"learning_rate": 7.245962555937192e-07,
"loss": 0.1482,
"step": 1772
},
{
"epoch": 1.653917910447761,
"grad_norm": 1.1648789458296942,
"learning_rate": 7.208021069996962e-07,
"loss": 0.1429,
"step": 1773
},
{
"epoch": 1.6548507462686568,
"grad_norm": 1.331743388691754,
"learning_rate": 7.170171462256404e-07,
"loss": 0.1576,
"step": 1774
},
{
"epoch": 1.6557835820895521,
"grad_norm": 1.2220059997930557,
"learning_rate": 7.132413813982003e-07,
"loss": 0.1585,
"step": 1775
},
{
"epoch": 1.6567164179104479,
"grad_norm": 1.277416037893367,
"learning_rate": 7.094748206242797e-07,
"loss": 0.1613,
"step": 1776
},
{
"epoch": 1.6576492537313432,
"grad_norm": 1.2930735979309478,
"learning_rate": 7.057174719910198e-07,
"loss": 0.1347,
"step": 1777
},
{
"epoch": 1.658582089552239,
"grad_norm": 1.3009220963894073,
"learning_rate": 7.019693435657848e-07,
"loss": 0.1604,
"step": 1778
},
{
"epoch": 1.6595149253731343,
"grad_norm": 1.2534234521523646,
"learning_rate": 6.982304433961406e-07,
"loss": 0.1725,
"step": 1779
},
{
"epoch": 1.6604477611940298,
"grad_norm": 1.1749339898084066,
"learning_rate": 6.945007795098402e-07,
"loss": 0.1303,
"step": 1780
},
{
"epoch": 1.6613805970149254,
"grad_norm": 1.1501782104693064,
"learning_rate": 6.907803599148049e-07,
"loss": 0.1569,
"step": 1781
},
{
"epoch": 1.662313432835821,
"grad_norm": 1.3411460520884892,
"learning_rate": 6.870691925991085e-07,
"loss": 0.201,
"step": 1782
},
{
"epoch": 1.6632462686567164,
"grad_norm": 1.2992658970414581,
"learning_rate": 6.833672855309565e-07,
"loss": 0.1711,
"step": 1783
},
{
"epoch": 1.664179104477612,
"grad_norm": 1.3023872730004322,
"learning_rate": 6.796746466586757e-07,
"loss": 0.1797,
"step": 1784
},
{
"epoch": 1.6651119402985075,
"grad_norm": 1.3168878099360892,
"learning_rate": 6.759912839106908e-07,
"loss": 0.2034,
"step": 1785
},
{
"epoch": 1.6660447761194028,
"grad_norm": 1.2482529228619816,
"learning_rate": 6.723172051955102e-07,
"loss": 0.1287,
"step": 1786
},
{
"epoch": 1.6669776119402986,
"grad_norm": 1.1644284096985102,
"learning_rate": 6.686524184017102e-07,
"loss": 0.1452,
"step": 1787
},
{
"epoch": 1.667910447761194,
"grad_norm": 1.2217954755057625,
"learning_rate": 6.649969313979149e-07,
"loss": 0.1605,
"step": 1788
},
{
"epoch": 1.6688432835820897,
"grad_norm": 1.3257197552048248,
"learning_rate": 6.613507520327811e-07,
"loss": 0.1746,
"step": 1789
},
{
"epoch": 1.669776119402985,
"grad_norm": 1.267615190897561,
"learning_rate": 6.577138881349804e-07,
"loss": 0.1812,
"step": 1790
},
{
"epoch": 1.6707089552238807,
"grad_norm": 1.262028394959287,
"learning_rate": 6.540863475131853e-07,
"loss": 0.1584,
"step": 1791
},
{
"epoch": 1.671641791044776,
"grad_norm": 1.3174514462210885,
"learning_rate": 6.50468137956049e-07,
"loss": 0.1853,
"step": 1792
},
{
"epoch": 1.6725746268656716,
"grad_norm": 1.2558940023330483,
"learning_rate": 6.468592672321905e-07,
"loss": 0.1731,
"step": 1793
},
{
"epoch": 1.6735074626865671,
"grad_norm": 1.4215053238505322,
"learning_rate": 6.432597430901782e-07,
"loss": 0.13,
"step": 1794
},
{
"epoch": 1.6744402985074627,
"grad_norm": 1.3440130764969576,
"learning_rate": 6.396695732585123e-07,
"loss": 0.2222,
"step": 1795
},
{
"epoch": 1.6753731343283582,
"grad_norm": 1.2110671300633844,
"learning_rate": 6.360887654456066e-07,
"loss": 0.144,
"step": 1796
},
{
"epoch": 1.6763059701492538,
"grad_norm": 1.2272021955175507,
"learning_rate": 6.32517327339775e-07,
"loss": 0.1471,
"step": 1797
},
{
"epoch": 1.6772388059701493,
"grad_norm": 1.2403806067786642,
"learning_rate": 6.289552666092153e-07,
"loss": 0.2007,
"step": 1798
},
{
"epoch": 1.6781716417910446,
"grad_norm": 1.263065524151067,
"learning_rate": 6.254025909019889e-07,
"loss": 0.1721,
"step": 1799
},
{
"epoch": 1.6791044776119404,
"grad_norm": 1.2887727320735014,
"learning_rate": 6.218593078460084e-07,
"loss": 0.1627,
"step": 1800
},
{
"epoch": 1.6800373134328357,
"grad_norm": 1.2164111533573003,
"learning_rate": 6.183254250490195e-07,
"loss": 0.1498,
"step": 1801
},
{
"epoch": 1.6809701492537314,
"grad_norm": 1.318078297981106,
"learning_rate": 6.14800950098583e-07,
"loss": 0.2064,
"step": 1802
},
{
"epoch": 1.6819029850746268,
"grad_norm": 1.2598847937787256,
"learning_rate": 6.112858905620622e-07,
"loss": 0.1608,
"step": 1803
},
{
"epoch": 1.6828358208955225,
"grad_norm": 1.2677604291651299,
"learning_rate": 6.077802539866023e-07,
"loss": 0.1731,
"step": 1804
},
{
"epoch": 1.6837686567164178,
"grad_norm": 1.2687851660876244,
"learning_rate": 6.042840478991185e-07,
"loss": 0.1781,
"step": 1805
},
{
"epoch": 1.6847014925373134,
"grad_norm": 1.2394459442409977,
"learning_rate": 6.007972798062783e-07,
"loss": 0.1636,
"step": 1806
},
{
"epoch": 1.685634328358209,
"grad_norm": 1.2469576957445645,
"learning_rate": 5.973199571944843e-07,
"loss": 0.1775,
"step": 1807
},
{
"epoch": 1.6865671641791045,
"grad_norm": 1.2306891342872057,
"learning_rate": 5.938520875298587e-07,
"loss": 0.1813,
"step": 1808
},
{
"epoch": 1.6875,
"grad_norm": 1.177667304477984,
"learning_rate": 5.903936782582253e-07,
"loss": 0.1528,
"step": 1809
},
{
"epoch": 1.6884328358208955,
"grad_norm": 1.2786596288229926,
"learning_rate": 5.869447368050995e-07,
"loss": 0.1772,
"step": 1810
},
{
"epoch": 1.689365671641791,
"grad_norm": 1.2230138861852573,
"learning_rate": 5.835052705756661e-07,
"loss": 0.1283,
"step": 1811
},
{
"epoch": 1.6902985074626866,
"grad_norm": 1.193870531656724,
"learning_rate": 5.80075286954766e-07,
"loss": 0.1447,
"step": 1812
},
{
"epoch": 1.6912313432835822,
"grad_norm": 1.2821363975078859,
"learning_rate": 5.766547933068806e-07,
"loss": 0.1789,
"step": 1813
},
{
"epoch": 1.6921641791044775,
"grad_norm": 1.3215922151226887,
"learning_rate": 5.732437969761156e-07,
"loss": 0.1793,
"step": 1814
},
{
"epoch": 1.6930970149253732,
"grad_norm": 1.2320951636990367,
"learning_rate": 5.698423052861835e-07,
"loss": 0.1639,
"step": 1815
},
{
"epoch": 1.6940298507462686,
"grad_norm": 1.2797531268301714,
"learning_rate": 5.664503255403925e-07,
"loss": 0.1727,
"step": 1816
},
{
"epoch": 1.6949626865671643,
"grad_norm": 1.252803455484566,
"learning_rate": 5.630678650216236e-07,
"loss": 0.1457,
"step": 1817
},
{
"epoch": 1.6958955223880596,
"grad_norm": 1.2726809083765898,
"learning_rate": 5.596949309923233e-07,
"loss": 0.1768,
"step": 1818
},
{
"epoch": 1.6968283582089554,
"grad_norm": 1.2854257510532858,
"learning_rate": 5.56331530694481e-07,
"loss": 0.1713,
"step": 1819
},
{
"epoch": 1.6977611940298507,
"grad_norm": 1.155844470522867,
"learning_rate": 5.529776713496182e-07,
"loss": 0.1391,
"step": 1820
},
{
"epoch": 1.6986940298507462,
"grad_norm": 1.2116496183532584,
"learning_rate": 5.496333601587711e-07,
"loss": 0.1637,
"step": 1821
},
{
"epoch": 1.6996268656716418,
"grad_norm": 1.3656652490874133,
"learning_rate": 5.462986043024726e-07,
"loss": 0.2023,
"step": 1822
},
{
"epoch": 1.7005597014925373,
"grad_norm": 1.2681746668842202,
"learning_rate": 5.429734109407426e-07,
"loss": 0.1873,
"step": 1823
},
{
"epoch": 1.7014925373134329,
"grad_norm": 1.2277116009701676,
"learning_rate": 5.396577872130676e-07,
"loss": 0.164,
"step": 1824
},
{
"epoch": 1.7024253731343284,
"grad_norm": 1.2089056006264882,
"learning_rate": 5.363517402383878e-07,
"loss": 0.1709,
"step": 1825
},
{
"epoch": 1.703358208955224,
"grad_norm": 1.3297359993443203,
"learning_rate": 5.330552771150821e-07,
"loss": 0.1505,
"step": 1826
},
{
"epoch": 1.7042910447761193,
"grad_norm": 1.1615752359721518,
"learning_rate": 5.297684049209511e-07,
"loss": 0.148,
"step": 1827
},
{
"epoch": 1.705223880597015,
"grad_norm": 1.2464968950420923,
"learning_rate": 5.264911307132009e-07,
"loss": 0.1622,
"step": 1828
},
{
"epoch": 1.7061567164179103,
"grad_norm": 1.2751631812112247,
"learning_rate": 5.232234615284337e-07,
"loss": 0.1937,
"step": 1829
},
{
"epoch": 1.707089552238806,
"grad_norm": 1.2576047652349769,
"learning_rate": 5.19965404382628e-07,
"loss": 0.1591,
"step": 1830
},
{
"epoch": 1.7080223880597014,
"grad_norm": 1.311492145110782,
"learning_rate": 5.167169662711202e-07,
"loss": 0.2102,
"step": 1831
},
{
"epoch": 1.7089552238805972,
"grad_norm": 1.188329714322775,
"learning_rate": 5.134781541685996e-07,
"loss": 0.1432,
"step": 1832
},
{
"epoch": 1.7098880597014925,
"grad_norm": 1.3440732304941392,
"learning_rate": 5.102489750290834e-07,
"loss": 0.1802,
"step": 1833
},
{
"epoch": 1.710820895522388,
"grad_norm": 1.1861906294896165,
"learning_rate": 5.070294357859096e-07,
"loss": 0.1459,
"step": 1834
},
{
"epoch": 1.7117537313432836,
"grad_norm": 1.2223465113629186,
"learning_rate": 5.03819543351714e-07,
"loss": 0.1553,
"step": 1835
},
{
"epoch": 1.712686567164179,
"grad_norm": 1.2770412401865483,
"learning_rate": 5.006193046184238e-07,
"loss": 0.1642,
"step": 1836
},
{
"epoch": 1.7136194029850746,
"grad_norm": 1.3033610076093305,
"learning_rate": 4.974287264572363e-07,
"loss": 0.1764,
"step": 1837
},
{
"epoch": 1.7145522388059702,
"grad_norm": 1.2497589447053674,
"learning_rate": 4.942478157186087e-07,
"loss": 0.1516,
"step": 1838
},
{
"epoch": 1.7154850746268657,
"grad_norm": 1.2455585184268392,
"learning_rate": 4.910765792322397e-07,
"loss": 0.1584,
"step": 1839
},
{
"epoch": 1.716417910447761,
"grad_norm": 1.2460420142287991,
"learning_rate": 4.879150238070585e-07,
"loss": 0.1731,
"step": 1840
},
{
"epoch": 1.7173507462686568,
"grad_norm": 1.3555355069639567,
"learning_rate": 4.847631562312049e-07,
"loss": 0.2119,
"step": 1841
},
{
"epoch": 1.7182835820895521,
"grad_norm": 1.231681728249583,
"learning_rate": 4.816209832720214e-07,
"loss": 0.1495,
"step": 1842
},
{
"epoch": 1.7192164179104479,
"grad_norm": 1.2514926193302185,
"learning_rate": 4.78488511676034e-07,
"loss": 0.1538,
"step": 1843
},
{
"epoch": 1.7201492537313432,
"grad_norm": 1.198768294078837,
"learning_rate": 4.753657481689372e-07,
"loss": 0.1411,
"step": 1844
},
{
"epoch": 1.721082089552239,
"grad_norm": 1.242098430941121,
"learning_rate": 4.7225269945558483e-07,
"loss": 0.1454,
"step": 1845
},
{
"epoch": 1.7220149253731343,
"grad_norm": 1.2184578344056238,
"learning_rate": 4.691493722199697e-07,
"loss": 0.1602,
"step": 1846
},
{
"epoch": 1.7229477611940298,
"grad_norm": 1.2127849041273684,
"learning_rate": 4.6605577312521354e-07,
"loss": 0.1701,
"step": 1847
},
{
"epoch": 1.7238805970149254,
"grad_norm": 1.1627994701266557,
"learning_rate": 4.6297190881354816e-07,
"loss": 0.1255,
"step": 1848
},
{
"epoch": 1.724813432835821,
"grad_norm": 1.1910213399847986,
"learning_rate": 4.598977859063064e-07,
"loss": 0.1268,
"step": 1849
},
{
"epoch": 1.7257462686567164,
"grad_norm": 1.3187420615246166,
"learning_rate": 4.5683341100390464e-07,
"loss": 0.2062,
"step": 1850
},
{
"epoch": 1.726679104477612,
"grad_norm": 1.2321784422916793,
"learning_rate": 4.537787906858293e-07,
"loss": 0.1689,
"step": 1851
},
{
"epoch": 1.7276119402985075,
"grad_norm": 1.1635538050550778,
"learning_rate": 4.507339315106235e-07,
"loss": 0.1239,
"step": 1852
},
{
"epoch": 1.7285447761194028,
"grad_norm": 1.257652215638962,
"learning_rate": 4.476988400158716e-07,
"loss": 0.1611,
"step": 1853
},
{
"epoch": 1.7294776119402986,
"grad_norm": 1.245129913597949,
"learning_rate": 4.446735227181853e-07,
"loss": 0.1574,
"step": 1854
},
{
"epoch": 1.730410447761194,
"grad_norm": 1.1901118742002041,
"learning_rate": 4.4165798611319145e-07,
"loss": 0.1337,
"step": 1855
},
{
"epoch": 1.7313432835820897,
"grad_norm": 1.2406972967645573,
"learning_rate": 4.386522366755169e-07,
"loss": 0.1676,
"step": 1856
},
{
"epoch": 1.732276119402985,
"grad_norm": 1.2215382055435324,
"learning_rate": 4.3565628085877275e-07,
"loss": 0.1536,
"step": 1857
},
{
"epoch": 1.7332089552238807,
"grad_norm": 1.2560948722737746,
"learning_rate": 4.326701250955445e-07,
"loss": 0.1743,
"step": 1858
},
{
"epoch": 1.734141791044776,
"grad_norm": 1.2253387185011415,
"learning_rate": 4.296937757973757e-07,
"loss": 0.1426,
"step": 1859
},
{
"epoch": 1.7350746268656716,
"grad_norm": 1.2013816474289636,
"learning_rate": 4.267272393547539e-07,
"loss": 0.1829,
"step": 1860
},
{
"epoch": 1.7360074626865671,
"grad_norm": 1.2876379806867537,
"learning_rate": 4.2377052213709634e-07,
"loss": 0.1818,
"step": 1861
},
{
"epoch": 1.7369402985074627,
"grad_norm": 1.233354531426832,
"learning_rate": 4.208236304927404e-07,
"loss": 0.175,
"step": 1862
},
{
"epoch": 1.7378731343283582,
"grad_norm": 1.1418723680307974,
"learning_rate": 4.178865707489249e-07,
"loss": 0.1326,
"step": 1863
},
{
"epoch": 1.7388059701492538,
"grad_norm": 1.267791274500478,
"learning_rate": 4.149593492117793e-07,
"loss": 0.1702,
"step": 1864
},
{
"epoch": 1.7397388059701493,
"grad_norm": 1.2692321911248439,
"learning_rate": 4.120419721663099e-07,
"loss": 0.1496,
"step": 1865
},
{
"epoch": 1.7406716417910446,
"grad_norm": 1.2588803625643528,
"learning_rate": 4.091344458763863e-07,
"loss": 0.1832,
"step": 1866
},
{
"epoch": 1.7416044776119404,
"grad_norm": 1.1759139757933372,
"learning_rate": 4.062367765847258e-07,
"loss": 0.1531,
"step": 1867
},
{
"epoch": 1.7425373134328357,
"grad_norm": 1.2610912446849163,
"learning_rate": 4.03348970512884e-07,
"loss": 0.1975,
"step": 1868
},
{
"epoch": 1.7434701492537314,
"grad_norm": 1.3085651313701954,
"learning_rate": 4.0047103386123777e-07,
"loss": 0.1918,
"step": 1869
},
{
"epoch": 1.7444029850746268,
"grad_norm": 1.2771664493168298,
"learning_rate": 3.9760297280897533e-07,
"loss": 0.1661,
"step": 1870
},
{
"epoch": 1.7453358208955225,
"grad_norm": 1.2370508412489076,
"learning_rate": 3.9474479351407803e-07,
"loss": 0.1897,
"step": 1871
},
{
"epoch": 1.7462686567164178,
"grad_norm": 1.237915742041455,
"learning_rate": 3.918965021133131e-07,
"loss": 0.1662,
"step": 1872
},
{
"epoch": 1.7472014925373134,
"grad_norm": 1.1964733189715644,
"learning_rate": 3.8905810472221636e-07,
"loss": 0.1272,
"step": 1873
},
{
"epoch": 1.748134328358209,
"grad_norm": 1.2441657107867947,
"learning_rate": 3.8622960743508074e-07,
"loss": 0.1636,
"step": 1874
},
{
"epoch": 1.7490671641791045,
"grad_norm": 1.2898967656039175,
"learning_rate": 3.834110163249416e-07,
"loss": 0.1683,
"step": 1875
},
{
"epoch": 1.75,
"grad_norm": 1.1520914532051287,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.1205,
"step": 1876
},
{
"epoch": 1.7509328358208955,
"grad_norm": 1.2564727530163406,
"learning_rate": 3.7780357682143943e-07,
"loss": 0.1795,
"step": 1877
},
{
"epoch": 1.751865671641791,
"grad_norm": 1.2881573951266523,
"learning_rate": 3.75014740467749e-07,
"loss": 0.1693,
"step": 1878
},
{
"epoch": 1.7527985074626866,
"grad_norm": 1.2480429343512909,
"learning_rate": 3.72235834370378e-07,
"loss": 0.1742,
"step": 1879
},
{
"epoch": 1.7537313432835822,
"grad_norm": 1.2839924896207722,
"learning_rate": 3.6946686449588267e-07,
"loss": 0.2061,
"step": 1880
},
{
"epoch": 1.7546641791044775,
"grad_norm": 1.2061279903283946,
"learning_rate": 3.667078367894905e-07,
"loss": 0.1544,
"step": 1881
},
{
"epoch": 1.7555970149253732,
"grad_norm": 1.2372284741245994,
"learning_rate": 3.639587571750802e-07,
"loss": 0.1809,
"step": 1882
},
{
"epoch": 1.7565298507462686,
"grad_norm": 1.3036907322224398,
"learning_rate": 3.612196315551719e-07,
"loss": 0.1795,
"step": 1883
},
{
"epoch": 1.7574626865671643,
"grad_norm": 1.2616668191135232,
"learning_rate": 3.584904658109106e-07,
"loss": 0.1465,
"step": 1884
},
{
"epoch": 1.7583955223880596,
"grad_norm": 1.2097261595981943,
"learning_rate": 3.557712658020607e-07,
"loss": 0.1241,
"step": 1885
},
{
"epoch": 1.7593283582089554,
"grad_norm": 1.1598713960365468,
"learning_rate": 3.5306203736698686e-07,
"loss": 0.1155,
"step": 1886
},
{
"epoch": 1.7602611940298507,
"grad_norm": 1.1581715369378212,
"learning_rate": 3.503627863226455e-07,
"loss": 0.1302,
"step": 1887
},
{
"epoch": 1.7611940298507462,
"grad_norm": 1.3721765172969267,
"learning_rate": 3.4767351846456744e-07,
"loss": 0.1587,
"step": 1888
},
{
"epoch": 1.7621268656716418,
"grad_norm": 1.3747591450318046,
"learning_rate": 3.4499423956685207e-07,
"loss": 0.1997,
"step": 1889
},
{
"epoch": 1.7630597014925373,
"grad_norm": 1.3283176899587907,
"learning_rate": 3.423249553821506e-07,
"loss": 0.1867,
"step": 1890
},
{
"epoch": 1.7639925373134329,
"grad_norm": 1.2148518852704886,
"learning_rate": 3.3966567164165466e-07,
"loss": 0.1451,
"step": 1891
},
{
"epoch": 1.7649253731343284,
"grad_norm": 1.2376362058041783,
"learning_rate": 3.37016394055083e-07,
"loss": 0.1547,
"step": 1892
},
{
"epoch": 1.765858208955224,
"grad_norm": 1.2196305296400547,
"learning_rate": 3.343771283106728e-07,
"loss": 0.165,
"step": 1893
},
{
"epoch": 1.7667910447761193,
"grad_norm": 1.2217109193774467,
"learning_rate": 3.3174788007516166e-07,
"loss": 0.133,
"step": 1894
},
{
"epoch": 1.767723880597015,
"grad_norm": 1.2313212809615608,
"learning_rate": 3.2912865499378053e-07,
"loss": 0.1593,
"step": 1895
},
{
"epoch": 1.7686567164179103,
"grad_norm": 1.3465981871950774,
"learning_rate": 3.2651945869024035e-07,
"loss": 0.2268,
"step": 1896
},
{
"epoch": 1.769589552238806,
"grad_norm": 1.369769762821453,
"learning_rate": 3.239202967667182e-07,
"loss": 0.1979,
"step": 1897
},
{
"epoch": 1.7705223880597014,
"grad_norm": 1.2266535787952872,
"learning_rate": 3.2133117480384613e-07,
"loss": 0.1525,
"step": 1898
},
{
"epoch": 1.7714552238805972,
"grad_norm": 1.2029055250352059,
"learning_rate": 3.187520983607012e-07,
"loss": 0.1475,
"step": 1899
},
{
"epoch": 1.7723880597014925,
"grad_norm": 1.234808266249341,
"learning_rate": 3.1618307297479055e-07,
"loss": 0.1599,
"step": 1900
},
{
"epoch": 1.773320895522388,
"grad_norm": 1.2473184516361606,
"learning_rate": 3.1362410416204024e-07,
"loss": 0.1469,
"step": 1901
},
{
"epoch": 1.7742537313432836,
"grad_norm": 1.2725038653967082,
"learning_rate": 3.1107519741678526e-07,
"loss": 0.1736,
"step": 1902
},
{
"epoch": 1.775186567164179,
"grad_norm": 1.209842686248433,
"learning_rate": 3.0853635821175676e-07,
"loss": 0.1636,
"step": 1903
},
{
"epoch": 1.7761194029850746,
"grad_norm": 1.2386320385449814,
"learning_rate": 3.0600759199806815e-07,
"loss": 0.1648,
"step": 1904
},
{
"epoch": 1.7770522388059702,
"grad_norm": 1.2365815677283971,
"learning_rate": 3.0348890420520693e-07,
"loss": 0.1828,
"step": 1905
},
{
"epoch": 1.7779850746268657,
"grad_norm": 1.2932917260184533,
"learning_rate": 3.0098030024102107e-07,
"loss": 0.1664,
"step": 1906
},
{
"epoch": 1.778917910447761,
"grad_norm": 1.2884677497634525,
"learning_rate": 2.9848178549170604e-07,
"loss": 0.1549,
"step": 1907
},
{
"epoch": 1.7798507462686568,
"grad_norm": 1.1655230428805,
"learning_rate": 2.959933653217967e-07,
"loss": 0.1309,
"step": 1908
},
{
"epoch": 1.7807835820895521,
"grad_norm": 1.2142649182178054,
"learning_rate": 2.9351504507415305e-07,
"loss": 0.1602,
"step": 1909
},
{
"epoch": 1.7817164179104479,
"grad_norm": 1.2328295938780174,
"learning_rate": 2.9104683006995147e-07,
"loss": 0.1764,
"step": 1910
},
{
"epoch": 1.7826492537313432,
"grad_norm": 1.2276281692637927,
"learning_rate": 2.885887256086678e-07,
"loss": 0.157,
"step": 1911
},
{
"epoch": 1.783582089552239,
"grad_norm": 1.2275484894679003,
"learning_rate": 2.8614073696807297e-07,
"loss": 0.157,
"step": 1912
},
{
"epoch": 1.7845149253731343,
"grad_norm": 1.2993174273770114,
"learning_rate": 2.837028694042182e-07,
"loss": 0.1674,
"step": 1913
},
{
"epoch": 1.7854477611940298,
"grad_norm": 1.1840625313782969,
"learning_rate": 2.812751281514203e-07,
"loss": 0.1391,
"step": 1914
},
{
"epoch": 1.7863805970149254,
"grad_norm": 1.255504543240045,
"learning_rate": 2.7885751842225804e-07,
"loss": 0.157,
"step": 1915
},
{
"epoch": 1.787313432835821,
"grad_norm": 1.1684515816146364,
"learning_rate": 2.7645004540755527e-07,
"loss": 0.1556,
"step": 1916
},
{
"epoch": 1.7882462686567164,
"grad_norm": 1.3063847045432126,
"learning_rate": 2.74052714276371e-07,
"loss": 0.1968,
"step": 1917
},
{
"epoch": 1.789179104477612,
"grad_norm": 1.209985499231558,
"learning_rate": 2.716655301759902e-07,
"loss": 0.127,
"step": 1918
},
{
"epoch": 1.7901119402985075,
"grad_norm": 1.4224348466493586,
"learning_rate": 2.6928849823190995e-07,
"loss": 0.2183,
"step": 1919
},
{
"epoch": 1.7910447761194028,
"grad_norm": 1.2695172844316536,
"learning_rate": 2.669216235478295e-07,
"loss": 0.1668,
"step": 1920
},
{
"epoch": 1.7919776119402986,
"grad_norm": 1.2524223778087493,
"learning_rate": 2.6456491120564034e-07,
"loss": 0.1598,
"step": 1921
},
{
"epoch": 1.792910447761194,
"grad_norm": 1.2473601964151018,
"learning_rate": 2.622183662654143e-07,
"loss": 0.1733,
"step": 1922
},
{
"epoch": 1.7938432835820897,
"grad_norm": 1.222898030106351,
"learning_rate": 2.59881993765394e-07,
"loss": 0.1536,
"step": 1923
},
{
"epoch": 1.794776119402985,
"grad_norm": 1.3084287010523616,
"learning_rate": 2.575557987219784e-07,
"loss": 0.1912,
"step": 1924
},
{
"epoch": 1.7957089552238807,
"grad_norm": 1.3342414967971448,
"learning_rate": 2.5523978612971623e-07,
"loss": 0.1827,
"step": 1925
},
{
"epoch": 1.796641791044776,
"grad_norm": 1.2968683363611444,
"learning_rate": 2.529339609612941e-07,
"loss": 0.1587,
"step": 1926
},
{
"epoch": 1.7975746268656716,
"grad_norm": 1.2929590002031308,
"learning_rate": 2.506383281675229e-07,
"loss": 0.1542,
"step": 1927
},
{
"epoch": 1.7985074626865671,
"grad_norm": 1.2679707784255936,
"learning_rate": 2.4835289267733263e-07,
"loss": 0.166,
"step": 1928
},
{
"epoch": 1.7994402985074627,
"grad_norm": 1.260213173889151,
"learning_rate": 2.4607765939775706e-07,
"loss": 0.1841,
"step": 1929
},
{
"epoch": 1.8003731343283582,
"grad_norm": 1.20471851282945,
"learning_rate": 2.4381263321392514e-07,
"loss": 0.1426,
"step": 1930
},
{
"epoch": 1.8013059701492538,
"grad_norm": 1.2303747247753642,
"learning_rate": 2.415578189890505e-07,
"loss": 0.1621,
"step": 1931
},
{
"epoch": 1.8022388059701493,
"grad_norm": 1.2649296325203112,
"learning_rate": 2.3931322156442117e-07,
"loss": 0.1662,
"step": 1932
},
{
"epoch": 1.8031716417910446,
"grad_norm": 1.163644370815951,
"learning_rate": 2.3707884575938645e-07,
"loss": 0.1435,
"step": 1933
},
{
"epoch": 1.8041044776119404,
"grad_norm": 1.3676345128723275,
"learning_rate": 2.348546963713516e-07,
"loss": 0.244,
"step": 1934
},
{
"epoch": 1.8050373134328357,
"grad_norm": 1.365023911240722,
"learning_rate": 2.3264077817576446e-07,
"loss": 0.2213,
"step": 1935
},
{
"epoch": 1.8059701492537314,
"grad_norm": 1.2918235332781236,
"learning_rate": 2.3043709592610486e-07,
"loss": 0.1521,
"step": 1936
},
{
"epoch": 1.8069029850746268,
"grad_norm": 1.30356262574336,
"learning_rate": 2.2824365435387573e-07,
"loss": 0.1681,
"step": 1937
},
{
"epoch": 1.8078358208955225,
"grad_norm": 1.254062316141817,
"learning_rate": 2.2606045816859047e-07,
"loss": 0.1786,
"step": 1938
},
{
"epoch": 1.8087686567164178,
"grad_norm": 1.0726060509074782,
"learning_rate": 2.2388751205776826e-07,
"loss": 0.1126,
"step": 1939
},
{
"epoch": 1.8097014925373134,
"grad_norm": 1.1087652730174822,
"learning_rate": 2.2172482068691658e-07,
"loss": 0.1472,
"step": 1940
},
{
"epoch": 1.810634328358209,
"grad_norm": 1.2963637098299603,
"learning_rate": 2.1957238869952767e-07,
"loss": 0.2098,
"step": 1941
},
{
"epoch": 1.8115671641791045,
"grad_norm": 1.2253863654446437,
"learning_rate": 2.174302207170653e-07,
"loss": 0.1477,
"step": 1942
},
{
"epoch": 1.8125,
"grad_norm": 1.2721837097524327,
"learning_rate": 2.152983213389559e-07,
"loss": 0.1817,
"step": 1943
},
{
"epoch": 1.8134328358208955,
"grad_norm": 1.2653379590253169,
"learning_rate": 2.1317669514257678e-07,
"loss": 0.1601,
"step": 1944
},
{
"epoch": 1.814365671641791,
"grad_norm": 1.2482190972474847,
"learning_rate": 2.1106534668324963e-07,
"loss": 0.1583,
"step": 1945
},
{
"epoch": 1.8152985074626866,
"grad_norm": 1.219174084940544,
"learning_rate": 2.0896428049422768e-07,
"loss": 0.1655,
"step": 1946
},
{
"epoch": 1.8162313432835822,
"grad_norm": 1.2610747580499384,
"learning_rate": 2.0687350108668736e-07,
"loss": 0.1968,
"step": 1947
},
{
"epoch": 1.8171641791044775,
"grad_norm": 1.201167381487851,
"learning_rate": 2.0479301294971943e-07,
"loss": 0.1701,
"step": 1948
},
{
"epoch": 1.8180970149253732,
"grad_norm": 1.291343405885453,
"learning_rate": 2.0272282055031677e-07,
"loss": 0.1707,
"step": 1949
},
{
"epoch": 1.8190298507462686,
"grad_norm": 1.266129430543222,
"learning_rate": 2.006629283333694e-07,
"loss": 0.1795,
"step": 1950
},
{
"epoch": 1.8199626865671643,
"grad_norm": 1.195201708186078,
"learning_rate": 1.986133407216473e-07,
"loss": 0.137,
"step": 1951
},
{
"epoch": 1.8208955223880596,
"grad_norm": 1.286182662110936,
"learning_rate": 1.9657406211579966e-07,
"loss": 0.1586,
"step": 1952
},
{
"epoch": 1.8218283582089554,
"grad_norm": 1.2380020660793205,
"learning_rate": 1.9454509689433855e-07,
"loss": 0.1711,
"step": 1953
},
{
"epoch": 1.8227611940298507,
"grad_norm": 1.1637093894903545,
"learning_rate": 1.925264494136342e-07,
"loss": 0.1575,
"step": 1954
},
{
"epoch": 1.8236940298507462,
"grad_norm": 1.2883483299940546,
"learning_rate": 1.9051812400790294e-07,
"loss": 0.181,
"step": 1955
},
{
"epoch": 1.8246268656716418,
"grad_norm": 1.3248486280708933,
"learning_rate": 1.885201249891988e-07,
"loss": 0.1752,
"step": 1956
},
{
"epoch": 1.8255597014925373,
"grad_norm": 1.1699957820679514,
"learning_rate": 1.8653245664740415e-07,
"loss": 0.1336,
"step": 1957
},
{
"epoch": 1.8264925373134329,
"grad_norm": 1.217975885085646,
"learning_rate": 1.8455512325022073e-07,
"loss": 0.153,
"step": 1958
},
{
"epoch": 1.8274253731343284,
"grad_norm": 1.3227505872894074,
"learning_rate": 1.825881290431586e-07,
"loss": 0.2105,
"step": 1959
},
{
"epoch": 1.828358208955224,
"grad_norm": 1.3954681031280969,
"learning_rate": 1.806314782495311e-07,
"loss": 0.2267,
"step": 1960
},
{
"epoch": 1.8292910447761193,
"grad_norm": 1.1897675857272247,
"learning_rate": 1.7868517507044158e-07,
"loss": 0.1472,
"step": 1961
},
{
"epoch": 1.830223880597015,
"grad_norm": 1.251569228039328,
"learning_rate": 1.7674922368477675e-07,
"loss": 0.1474,
"step": 1962
},
{
"epoch": 1.8311567164179103,
"grad_norm": 1.2872655044907908,
"learning_rate": 1.7482362824919773e-07,
"loss": 0.1792,
"step": 1963
},
{
"epoch": 1.832089552238806,
"grad_norm": 1.293037384768504,
"learning_rate": 1.7290839289813065e-07,
"loss": 0.1752,
"step": 1964
},
{
"epoch": 1.8330223880597014,
"grad_norm": 1.2881640964829002,
"learning_rate": 1.71003521743755e-07,
"loss": 0.1867,
"step": 1965
},
{
"epoch": 1.8339552238805972,
"grad_norm": 1.3204633484140118,
"learning_rate": 1.6910901887599917e-07,
"loss": 0.1995,
"step": 1966
},
{
"epoch": 1.8348880597014925,
"grad_norm": 1.2679285220183314,
"learning_rate": 1.6722488836253104e-07,
"loss": 0.1661,
"step": 1967
},
{
"epoch": 1.835820895522388,
"grad_norm": 1.2423532391293808,
"learning_rate": 1.6535113424874683e-07,
"loss": 0.1732,
"step": 1968
},
{
"epoch": 1.8367537313432836,
"grad_norm": 1.3796806156601502,
"learning_rate": 1.6348776055776393e-07,
"loss": 0.1944,
"step": 1969
},
{
"epoch": 1.837686567164179,
"grad_norm": 1.2902379816793106,
"learning_rate": 1.6163477129041204e-07,
"loss": 0.175,
"step": 1970
},
{
"epoch": 1.8386194029850746,
"grad_norm": 1.2232784850810066,
"learning_rate": 1.5979217042522477e-07,
"loss": 0.129,
"step": 1971
},
{
"epoch": 1.8395522388059702,
"grad_norm": 1.2805197763508278,
"learning_rate": 1.5795996191842966e-07,
"loss": 0.1466,
"step": 1972
},
{
"epoch": 1.8404850746268657,
"grad_norm": 1.2795435182988995,
"learning_rate": 1.561381497039427e-07,
"loss": 0.1676,
"step": 1973
},
{
"epoch": 1.841417910447761,
"grad_norm": 1.1479715654839133,
"learning_rate": 1.5432673769335772e-07,
"loss": 0.1318,
"step": 1974
},
{
"epoch": 1.8423507462686568,
"grad_norm": 1.2255765389368125,
"learning_rate": 1.525257297759375e-07,
"loss": 0.1718,
"step": 1975
},
{
"epoch": 1.8432835820895521,
"grad_norm": 1.232347780038549,
"learning_rate": 1.5073512981860715e-07,
"loss": 0.1555,
"step": 1976
},
{
"epoch": 1.8442164179104479,
"grad_norm": 1.186929078251592,
"learning_rate": 1.4895494166594527e-07,
"loss": 0.1489,
"step": 1977
},
{
"epoch": 1.8451492537313432,
"grad_norm": 1.3533328231403003,
"learning_rate": 1.4718516914017433e-07,
"loss": 0.1932,
"step": 1978
},
{
"epoch": 1.846082089552239,
"grad_norm": 1.2033823427053365,
"learning_rate": 1.4542581604115258e-07,
"loss": 0.1388,
"step": 1979
},
{
"epoch": 1.8470149253731343,
"grad_norm": 1.2084325834974623,
"learning_rate": 1.4367688614637e-07,
"loss": 0.1705,
"step": 1980
},
{
"epoch": 1.8479477611940298,
"grad_norm": 1.2639200006890643,
"learning_rate": 1.4193838321093444e-07,
"loss": 0.1916,
"step": 1981
},
{
"epoch": 1.8488805970149254,
"grad_norm": 1.249684340310944,
"learning_rate": 1.4021031096756676e-07,
"loss": 0.1602,
"step": 1982
},
{
"epoch": 1.849813432835821,
"grad_norm": 1.2532245207528903,
"learning_rate": 1.3849267312659286e-07,
"loss": 0.1522,
"step": 1983
},
{
"epoch": 1.8507462686567164,
"grad_norm": 1.2884244986962023,
"learning_rate": 1.3678547337593494e-07,
"loss": 0.1602,
"step": 1984
},
{
"epoch": 1.851679104477612,
"grad_norm": 1.2943988387183982,
"learning_rate": 1.3508871538110257e-07,
"loss": 0.2043,
"step": 1985
},
{
"epoch": 1.8526119402985075,
"grad_norm": 1.1569657792604953,
"learning_rate": 1.3340240278518657e-07,
"loss": 0.1275,
"step": 1986
},
{
"epoch": 1.8535447761194028,
"grad_norm": 1.1921267259347368,
"learning_rate": 1.317265392088507e-07,
"loss": 0.137,
"step": 1987
},
{
"epoch": 1.8544776119402986,
"grad_norm": 1.301171410793594,
"learning_rate": 1.3006112825032447e-07,
"loss": 0.191,
"step": 1988
},
{
"epoch": 1.855410447761194,
"grad_norm": 1.2573536199515079,
"learning_rate": 1.284061734853931e-07,
"loss": 0.1457,
"step": 1989
},
{
"epoch": 1.8563432835820897,
"grad_norm": 1.1872268450628818,
"learning_rate": 1.2676167846739308e-07,
"loss": 0.1421,
"step": 1990
},
{
"epoch": 1.857276119402985,
"grad_norm": 1.2675468112267114,
"learning_rate": 1.2512764672720168e-07,
"loss": 0.1833,
"step": 1991
},
{
"epoch": 1.8582089552238807,
"grad_norm": 1.3549732318972694,
"learning_rate": 1.235040817732297e-07,
"loss": 0.1938,
"step": 1992
},
{
"epoch": 1.859141791044776,
"grad_norm": 1.2085891700977642,
"learning_rate": 1.2189098709141756e-07,
"loss": 0.1489,
"step": 1993
},
{
"epoch": 1.8600746268656716,
"grad_norm": 1.2385528532688561,
"learning_rate": 1.202883661452231e-07,
"loss": 0.1648,
"step": 1994
},
{
"epoch": 1.8610074626865671,
"grad_norm": 1.3342227258355062,
"learning_rate": 1.1869622237561662e-07,
"loss": 0.2074,
"step": 1995
},
{
"epoch": 1.8619402985074627,
"grad_norm": 1.2625600599310918,
"learning_rate": 1.1711455920107306e-07,
"loss": 0.1887,
"step": 1996
},
{
"epoch": 1.8628731343283582,
"grad_norm": 1.3176357088862165,
"learning_rate": 1.1554338001756482e-07,
"loss": 0.2104,
"step": 1997
},
{
"epoch": 1.8638059701492538,
"grad_norm": 1.3343195299943285,
"learning_rate": 1.1398268819855285e-07,
"loss": 0.2143,
"step": 1998
},
{
"epoch": 1.8647388059701493,
"grad_norm": 1.3588439215016899,
"learning_rate": 1.1243248709498278e-07,
"loss": 0.1616,
"step": 1999
},
{
"epoch": 1.8656716417910446,
"grad_norm": 1.2045136303338808,
"learning_rate": 1.1089278003527438e-07,
"loss": 0.1441,
"step": 2000
},
{
"epoch": 1.8656716417910446,
"eval_loss": 0.21490994095802307,
"eval_runtime": 3.411,
"eval_samples_per_second": 25.505,
"eval_steps_per_second": 6.45,
"step": 2000
},
{
"epoch": 1.8666044776119404,
"grad_norm": 1.2647703652398299,
"learning_rate": 1.0936357032531597e-07,
"loss": 0.1778,
"step": 2001
},
{
"epoch": 1.8675373134328357,
"grad_norm": 1.3578980897172048,
"learning_rate": 1.0784486124845783e-07,
"loss": 0.2182,
"step": 2002
},
{
"epoch": 1.8684701492537314,
"grad_norm": 1.2809410331706492,
"learning_rate": 1.0633665606550436e-07,
"loss": 0.2063,
"step": 2003
},
{
"epoch": 1.8694029850746268,
"grad_norm": 1.2287311481388954,
"learning_rate": 1.0483895801470579e-07,
"loss": 0.1484,
"step": 2004
},
{
"epoch": 1.8703358208955225,
"grad_norm": 1.1789462345833952,
"learning_rate": 1.0335177031175425e-07,
"loss": 0.1762,
"step": 2005
},
{
"epoch": 1.8712686567164178,
"grad_norm": 1.2786010454645145,
"learning_rate": 1.0187509614977387e-07,
"loss": 0.1437,
"step": 2006
},
{
"epoch": 1.8722014925373134,
"grad_norm": 1.2715109077455946,
"learning_rate": 1.0040893869931623e-07,
"loss": 0.1528,
"step": 2007
},
{
"epoch": 1.873134328358209,
"grad_norm": 1.269783047564372,
"learning_rate": 9.89533011083521e-08,
"loss": 0.1748,
"step": 2008
},
{
"epoch": 1.8740671641791045,
"grad_norm": 1.156925043114288,
"learning_rate": 9.75081865022659e-08,
"loss": 0.1273,
"step": 2009
},
{
"epoch": 1.875,
"grad_norm": 1.269385652751176,
"learning_rate": 9.607359798384785e-08,
"loss": 0.1548,
"step": 2010
},
{
"epoch": 1.8759328358208955,
"grad_norm": 1.228564799752932,
"learning_rate": 9.464953863328685e-08,
"loss": 0.1781,
"step": 2011
},
{
"epoch": 1.876865671641791,
"grad_norm": 1.2315874613017999,
"learning_rate": 9.323601150816597e-08,
"loss": 0.1504,
"step": 2012
},
{
"epoch": 1.8777985074626866,
"grad_norm": 1.2946326738241574,
"learning_rate": 9.18330196434536e-08,
"loss": 0.1665,
"step": 2013
},
{
"epoch": 1.8787313432835822,
"grad_norm": 1.2632594514798259,
"learning_rate": 9.044056605149898e-08,
"loss": 0.1639,
"step": 2014
},
{
"epoch": 1.8796641791044775,
"grad_norm": 1.2220859866764355,
"learning_rate": 8.905865372202449e-08,
"loss": 0.153,
"step": 2015
},
{
"epoch": 1.8805970149253732,
"grad_norm": 1.2154225699873356,
"learning_rate": 8.768728562211948e-08,
"loss": 0.1676,
"step": 2016
},
{
"epoch": 1.8815298507462686,
"grad_norm": 1.3222162104234343,
"learning_rate": 8.632646469623251e-08,
"loss": 0.1924,
"step": 2017
},
{
"epoch": 1.8824626865671643,
"grad_norm": 1.2498467042920565,
"learning_rate": 8.497619386616917e-08,
"loss": 0.1525,
"step": 2018
},
{
"epoch": 1.8833955223880596,
"grad_norm": 1.2593630693482811,
"learning_rate": 8.363647603108038e-08,
"loss": 0.1683,
"step": 2019
},
{
"epoch": 1.8843283582089554,
"grad_norm": 1.2586530048367546,
"learning_rate": 8.230731406746018e-08,
"loss": 0.1827,
"step": 2020
},
{
"epoch": 1.8852611940298507,
"grad_norm": 1.222002038407417,
"learning_rate": 8.098871082913795e-08,
"loss": 0.1328,
"step": 2021
},
{
"epoch": 1.8861940298507462,
"grad_norm": 1.3251949889002166,
"learning_rate": 7.968066914727346e-08,
"loss": 0.1915,
"step": 2022
},
{
"epoch": 1.8871268656716418,
"grad_norm": 1.2308479588962644,
"learning_rate": 7.838319183034738e-08,
"loss": 0.1398,
"step": 2023
},
{
"epoch": 1.8880597014925373,
"grad_norm": 1.3339175227843414,
"learning_rate": 7.709628166416128e-08,
"loss": 0.1998,
"step": 2024
},
{
"epoch": 1.8889925373134329,
"grad_norm": 1.1317450810318777,
"learning_rate": 7.581994141182436e-08,
"loss": 0.1335,
"step": 2025
},
{
"epoch": 1.8899253731343284,
"grad_norm": 1.209849710848305,
"learning_rate": 7.455417381375452e-08,
"loss": 0.1675,
"step": 2026
},
{
"epoch": 1.890858208955224,
"grad_norm": 1.3439662204687448,
"learning_rate": 7.329898158766668e-08,
"loss": 0.1983,
"step": 2027
},
{
"epoch": 1.8917910447761193,
"grad_norm": 1.2251396088707578,
"learning_rate": 7.20543674285712e-08,
"loss": 0.1488,
"step": 2028
},
{
"epoch": 1.892723880597015,
"grad_norm": 1.2755583940998025,
"learning_rate": 7.082033400876597e-08,
"loss": 0.1581,
"step": 2029
},
{
"epoch": 1.8936567164179103,
"grad_norm": 1.3121422656304376,
"learning_rate": 6.959688397783104e-08,
"loss": 0.1893,
"step": 2030
},
{
"epoch": 1.894589552238806,
"grad_norm": 1.2011563608659308,
"learning_rate": 6.838401996262289e-08,
"loss": 0.1181,
"step": 2031
},
{
"epoch": 1.8955223880597014,
"grad_norm": 1.2263127899449682,
"learning_rate": 6.718174456726789e-08,
"loss": 0.1782,
"step": 2032
},
{
"epoch": 1.8964552238805972,
"grad_norm": 1.2668253310217705,
"learning_rate": 6.599006037315891e-08,
"loss": 0.145,
"step": 2033
},
{
"epoch": 1.8973880597014925,
"grad_norm": 1.2872258234895284,
"learning_rate": 6.480896993894925e-08,
"loss": 0.1984,
"step": 2034
},
{
"epoch": 1.898320895522388,
"grad_norm": 1.2223492712874333,
"learning_rate": 6.363847580054483e-08,
"loss": 0.1534,
"step": 2035
},
{
"epoch": 1.8992537313432836,
"grad_norm": 1.303409477099441,
"learning_rate": 6.247858047110145e-08,
"loss": 0.1629,
"step": 2036
},
{
"epoch": 1.900186567164179,
"grad_norm": 1.3053581424951257,
"learning_rate": 6.13292864410181e-08,
"loss": 0.1755,
"step": 2037
},
{
"epoch": 1.9011194029850746,
"grad_norm": 1.2366118845923963,
"learning_rate": 6.019059617793088e-08,
"loss": 0.1631,
"step": 2038
},
{
"epoch": 1.9020522388059702,
"grad_norm": 1.2141993390108499,
"learning_rate": 5.906251212670966e-08,
"loss": 0.1533,
"step": 2039
},
{
"epoch": 1.9029850746268657,
"grad_norm": 1.2575829570687929,
"learning_rate": 5.794503670945195e-08,
"loss": 0.1674,
"step": 2040
},
{
"epoch": 1.903917910447761,
"grad_norm": 1.264400341181521,
"learning_rate": 5.683817232547739e-08,
"loss": 0.1751,
"step": 2041
},
{
"epoch": 1.9048507462686568,
"grad_norm": 1.3228707706783953,
"learning_rate": 5.5741921351322726e-08,
"loss": 0.2016,
"step": 2042
},
{
"epoch": 1.9057835820895521,
"grad_norm": 1.2226813604358504,
"learning_rate": 5.465628614073626e-08,
"loss": 0.1537,
"step": 2043
},
{
"epoch": 1.9067164179104479,
"grad_norm": 1.1625816999071248,
"learning_rate": 5.3581269024673975e-08,
"loss": 0.1548,
"step": 2044
},
{
"epoch": 1.9076492537313432,
"grad_norm": 1.262325483429432,
"learning_rate": 5.251687231129288e-08,
"loss": 0.1619,
"step": 2045
},
{
"epoch": 1.908582089552239,
"grad_norm": 1.2324344765240605,
"learning_rate": 5.1463098285948755e-08,
"loss": 0.1633,
"step": 2046
},
{
"epoch": 1.9095149253731343,
"grad_norm": 1.3424158979490182,
"learning_rate": 5.0419949211188426e-08,
"loss": 0.2337,
"step": 2047
},
{
"epoch": 1.9104477611940298,
"grad_norm": 1.2312577225133483,
"learning_rate": 4.9387427326745287e-08,
"loss": 0.172,
"step": 2048
},
{
"epoch": 1.9113805970149254,
"grad_norm": 1.24843958071994,
"learning_rate": 4.8365534849536546e-08,
"loss": 0.1413,
"step": 2049
},
{
"epoch": 1.912313432835821,
"grad_norm": 1.1973236844635842,
"learning_rate": 4.7354273973657106e-08,
"loss": 0.1492,
"step": 2050
},
{
"epoch": 1.9132462686567164,
"grad_norm": 1.222774081659431,
"learning_rate": 4.635364687037347e-08,
"loss": 0.1532,
"step": 2051
},
{
"epoch": 1.914179104477612,
"grad_norm": 1.2354400863685973,
"learning_rate": 4.536365568812206e-08,
"loss": 0.1796,
"step": 2052
},
{
"epoch": 1.9151119402985075,
"grad_norm": 1.2537534257119465,
"learning_rate": 4.438430255250148e-08,
"loss": 0.16,
"step": 2053
},
{
"epoch": 1.9160447761194028,
"grad_norm": 1.25376146885341,
"learning_rate": 4.3415589566271345e-08,
"loss": 0.156,
"step": 2054
},
{
"epoch": 1.9169776119402986,
"grad_norm": 1.1554648927829305,
"learning_rate": 4.245751880934401e-08,
"loss": 0.1269,
"step": 2055
},
{
"epoch": 1.917910447761194,
"grad_norm": 1.2777046286903468,
"learning_rate": 4.1510092338784005e-08,
"loss": 0.1644,
"step": 2056
},
{
"epoch": 1.9188432835820897,
"grad_norm": 1.3672542752617738,
"learning_rate": 4.057331218880023e-08,
"loss": 0.1966,
"step": 2057
},
{
"epoch": 1.919776119402985,
"grad_norm": 1.2151857535759114,
"learning_rate": 3.9647180370742664e-08,
"loss": 0.145,
"step": 2058
},
{
"epoch": 1.9207089552238807,
"grad_norm": 1.241760663584323,
"learning_rate": 3.8731698873099025e-08,
"loss": 0.1594,
"step": 2059
},
{
"epoch": 1.921641791044776,
"grad_norm": 1.2075163609084734,
"learning_rate": 3.782686966149085e-08,
"loss": 0.1637,
"step": 2060
},
{
"epoch": 1.9225746268656716,
"grad_norm": 1.2686508486559018,
"learning_rate": 3.6932694678666335e-08,
"loss": 0.1743,
"step": 2061
},
{
"epoch": 1.9235074626865671,
"grad_norm": 1.2796694294012751,
"learning_rate": 3.604917584449919e-08,
"loss": 0.155,
"step": 2062
},
{
"epoch": 1.9244402985074627,
"grad_norm": 1.1685408020475783,
"learning_rate": 3.5176315055983625e-08,
"loss": 0.1558,
"step": 2063
},
{
"epoch": 1.9253731343283582,
"grad_norm": 1.209641119343688,
"learning_rate": 3.431411418722941e-08,
"loss": 0.1348,
"step": 2064
},
{
"epoch": 1.9263059701492538,
"grad_norm": 1.308251030820361,
"learning_rate": 3.346257508945849e-08,
"loss": 0.1886,
"step": 2065
},
{
"epoch": 1.9272388059701493,
"grad_norm": 1.212712386031227,
"learning_rate": 3.26216995910017e-08,
"loss": 0.1652,
"step": 2066
},
{
"epoch": 1.9281716417910446,
"grad_norm": 1.247431350146223,
"learning_rate": 3.1791489497293715e-08,
"loss": 0.1616,
"step": 2067
},
{
"epoch": 1.9291044776119404,
"grad_norm": 1.389925951787403,
"learning_rate": 3.097194659086977e-08,
"loss": 0.2654,
"step": 2068
},
{
"epoch": 1.9300373134328357,
"grad_norm": 1.2490713519993188,
"learning_rate": 3.016307263136231e-08,
"loss": 0.1754,
"step": 2069
},
{
"epoch": 1.9309701492537314,
"grad_norm": 1.266116357622932,
"learning_rate": 2.9364869355494874e-08,
"loss": 0.1664,
"step": 2070
},
{
"epoch": 1.9319029850746268,
"grad_norm": 1.2389673660954839,
"learning_rate": 2.857733847708155e-08,
"loss": 0.162,
"step": 2071
},
{
"epoch": 1.9328358208955225,
"grad_norm": 1.266035053802742,
"learning_rate": 2.7800481687021987e-08,
"loss": 0.1844,
"step": 2072
},
{
"epoch": 1.9337686567164178,
"grad_norm": 1.2372099071432225,
"learning_rate": 2.7034300653295818e-08,
"loss": 0.1761,
"step": 2073
},
{
"epoch": 1.9347014925373134,
"grad_norm": 1.264218418799199,
"learning_rate": 2.6278797020963253e-08,
"loss": 0.1417,
"step": 2074
},
{
"epoch": 1.935634328358209,
"grad_norm": 1.2718568361161524,
"learning_rate": 2.5533972412157825e-08,
"loss": 0.2047,
"step": 2075
},
{
"epoch": 1.9365671641791045,
"grad_norm": 1.2231602912323578,
"learning_rate": 2.479982842608475e-08,
"loss": 0.1609,
"step": 2076
},
{
"epoch": 1.9375,
"grad_norm": 1.2898602762124696,
"learning_rate": 2.4076366639015914e-08,
"loss": 0.1717,
"step": 2077
},
{
"epoch": 1.9384328358208955,
"grad_norm": 1.2700166079913484,
"learning_rate": 2.3363588604288777e-08,
"loss": 0.1536,
"step": 2078
},
{
"epoch": 1.939365671641791,
"grad_norm": 1.3149483654485066,
"learning_rate": 2.2661495852301376e-08,
"loss": 0.184,
"step": 2079
},
{
"epoch": 1.9402985074626866,
"grad_norm": 1.2239797410288278,
"learning_rate": 2.1970089890509527e-08,
"loss": 0.1442,
"step": 2080
},
{
"epoch": 1.9412313432835822,
"grad_norm": 1.2532824903580881,
"learning_rate": 2.128937220342353e-08,
"loss": 0.1785,
"step": 2081
},
{
"epoch": 1.9421641791044775,
"grad_norm": 1.1849411459371284,
"learning_rate": 2.0619344252605922e-08,
"loss": 0.1478,
"step": 2082
},
{
"epoch": 1.9430970149253732,
"grad_norm": 1.2705288944933275,
"learning_rate": 1.9960007476665376e-08,
"loss": 0.1928,
"step": 2083
},
{
"epoch": 1.9440298507462686,
"grad_norm": 1.2633498041710418,
"learning_rate": 1.931136329125727e-08,
"loss": 0.1883,
"step": 2084
},
{
"epoch": 1.9449626865671643,
"grad_norm": 1.2150222998047435,
"learning_rate": 1.8673413089078108e-08,
"loss": 0.1446,
"step": 2085
},
{
"epoch": 1.9458955223880596,
"grad_norm": 1.2652170430429417,
"learning_rate": 1.8046158239864996e-08,
"loss": 0.1748,
"step": 2086
},
{
"epoch": 1.9468283582089554,
"grad_norm": 1.3157094882210538,
"learning_rate": 1.7429600090388966e-08,
"loss": 0.2167,
"step": 2087
},
{
"epoch": 1.9477611940298507,
"grad_norm": 1.258322651998683,
"learning_rate": 1.6823739964456078e-08,
"loss": 0.1558,
"step": 2088
},
{
"epoch": 1.9486940298507462,
"grad_norm": 1.2276650774757258,
"learning_rate": 1.622857916290188e-08,
"loss": 0.1878,
"step": 2089
},
{
"epoch": 1.9496268656716418,
"grad_norm": 1.2725861869530462,
"learning_rate": 1.5644118963590305e-08,
"loss": 0.1589,
"step": 2090
},
{
"epoch": 1.9505597014925373,
"grad_norm": 1.258881749534401,
"learning_rate": 1.5070360621408653e-08,
"loss": 0.1747,
"step": 2091
},
{
"epoch": 1.9514925373134329,
"grad_norm": 1.2733567362619342,
"learning_rate": 1.4507305368268166e-08,
"loss": 0.1696,
"step": 2092
},
{
"epoch": 1.9524253731343284,
"grad_norm": 1.1802931314348926,
"learning_rate": 1.395495441309791e-08,
"loss": 0.1562,
"step": 2093
},
{
"epoch": 1.953358208955224,
"grad_norm": 1.1632999193280913,
"learning_rate": 1.3413308941845338e-08,
"loss": 0.1242,
"step": 2094
},
{
"epoch": 1.9542910447761193,
"grad_norm": 1.23255407194608,
"learning_rate": 1.2882370117471843e-08,
"loss": 0.1384,
"step": 2095
},
{
"epoch": 1.955223880597015,
"grad_norm": 1.3452475601728535,
"learning_rate": 1.2362139079949431e-08,
"loss": 0.2157,
"step": 2096
},
{
"epoch": 1.9561567164179103,
"grad_norm": 1.2450531168181902,
"learning_rate": 1.185261694626183e-08,
"loss": 0.1842,
"step": 2097
},
{
"epoch": 1.957089552238806,
"grad_norm": 1.4275173568352066,
"learning_rate": 1.1353804810397828e-08,
"loss": 0.2392,
"step": 2098
},
{
"epoch": 1.9580223880597014,
"grad_norm": 1.3143714071167805,
"learning_rate": 1.086570374335183e-08,
"loss": 0.1991,
"step": 2099
},
{
"epoch": 1.9589552238805972,
"grad_norm": 1.1500674993387472,
"learning_rate": 1.038831479311997e-08,
"loss": 0.1344,
"step": 2100
},
{
"epoch": 1.9598880597014925,
"grad_norm": 1.2672492478782138,
"learning_rate": 9.92163898470011e-09,
"loss": 0.1771,
"step": 2101
},
{
"epoch": 1.960820895522388,
"grad_norm": 1.4494577443854482,
"learning_rate": 9.465677320085742e-09,
"loss": 0.2224,
"step": 2102
},
{
"epoch": 1.9617537313432836,
"grad_norm": 1.219893918861546,
"learning_rate": 9.020430778267642e-09,
"loss": 0.1558,
"step": 2103
},
{
"epoch": 1.962686567164179,
"grad_norm": 1.240858328811536,
"learning_rate": 8.585900315229434e-09,
"loss": 0.1657,
"step": 2104
},
{
"epoch": 1.9636194029850746,
"grad_norm": 1.297384654876598,
"learning_rate": 8.162086863948149e-09,
"loss": 0.1803,
"step": 2105
},
{
"epoch": 1.9645522388059702,
"grad_norm": 1.268935326338251,
"learning_rate": 7.748991334387557e-09,
"loss": 0.1682,
"step": 2106
},
{
"epoch": 1.9654850746268657,
"grad_norm": 1.285120399000862,
"learning_rate": 7.346614613501502e-09,
"loss": 0.1683,
"step": 2107
},
{
"epoch": 1.966417910447761,
"grad_norm": 1.176238300774971,
"learning_rate": 6.9549575652289036e-09,
"loss": 0.1421,
"step": 2108
},
{
"epoch": 1.9673507462686568,
"grad_norm": 1.174234396013993,
"learning_rate": 6.57402103049154e-09,
"loss": 0.1541,
"step": 2109
},
{
"epoch": 1.9682835820895521,
"grad_norm": 1.2516232436617056,
"learning_rate": 6.203805827195153e-09,
"loss": 0.1972,
"step": 2110
},
{
"epoch": 1.9692164179104479,
"grad_norm": 1.2194039452614363,
"learning_rate": 5.844312750224457e-09,
"loss": 0.1427,
"step": 2111
},
{
"epoch": 1.9701492537313432,
"grad_norm": 1.2231155094240758,
"learning_rate": 5.495542571443135e-09,
"loss": 0.1569,
"step": 2112
},
{
"epoch": 1.971082089552239,
"grad_norm": 1.2499932139754333,
"learning_rate": 5.157496039691623e-09,
"loss": 0.16,
"step": 2113
},
{
"epoch": 1.9720149253731343,
"grad_norm": 1.330027159799502,
"learning_rate": 4.830173880785993e-09,
"loss": 0.1973,
"step": 2114
},
{
"epoch": 1.9729477611940298,
"grad_norm": 1.2327867569444704,
"learning_rate": 4.51357679751685e-09,
"loss": 0.127,
"step": 2115
},
{
"epoch": 1.9738805970149254,
"grad_norm": 1.2352162496382828,
"learning_rate": 4.207705469645995e-09,
"loss": 0.1329,
"step": 2116
},
{
"epoch": 1.974813432835821,
"grad_norm": 1.2655767875127586,
"learning_rate": 3.9125605539064305e-09,
"loss": 0.1552,
"step": 2117
},
{
"epoch": 1.9757462686567164,
"grad_norm": 1.2532336431861906,
"learning_rate": 3.6281426840006907e-09,
"loss": 0.1485,
"step": 2118
},
{
"epoch": 1.976679104477612,
"grad_norm": 1.2832730129321155,
"learning_rate": 3.354452470599179e-09,
"loss": 0.1679,
"step": 2119
},
{
"epoch": 1.9776119402985075,
"grad_norm": 1.2499818530768971,
"learning_rate": 3.0914905013396113e-09,
"loss": 0.1743,
"step": 2120
},
{
"epoch": 1.9785447761194028,
"grad_norm": 1.3021117386738132,
"learning_rate": 2.8392573408242418e-09,
"loss": 0.2287,
"step": 2121
},
{
"epoch": 1.9794776119402986,
"grad_norm": 1.2416303010434144,
"learning_rate": 2.597753530620417e-09,
"loss": 0.1703,
"step": 2122
},
{
"epoch": 1.980410447761194,
"grad_norm": 1.2533231907204014,
"learning_rate": 2.3669795892589108e-09,
"loss": 0.1675,
"step": 2123
},
{
"epoch": 1.9813432835820897,
"grad_norm": 1.1854416515724393,
"learning_rate": 2.146936012231704e-09,
"loss": 0.1354,
"step": 2124
},
{
"epoch": 1.982276119402985,
"grad_norm": 1.20978357068862,
"learning_rate": 1.937623271991429e-09,
"loss": 0.1187,
"step": 2125
},
{
"epoch": 1.9832089552238807,
"grad_norm": 1.2998527302124716,
"learning_rate": 1.7390418179519253e-09,
"loss": 0.2289,
"step": 2126
},
{
"epoch": 1.984141791044776,
"grad_norm": 1.2918207881836592,
"learning_rate": 1.5511920764849087e-09,
"loss": 0.1355,
"step": 2127
},
{
"epoch": 1.9850746268656716,
"grad_norm": 1.3023076128240987,
"learning_rate": 1.3740744509205263e-09,
"loss": 0.1271,
"step": 2128
},
{
"epoch": 1.9860074626865671,
"grad_norm": 1.3462881653805205,
"learning_rate": 1.2076893215462459e-09,
"loss": 0.1962,
"step": 2129
},
{
"epoch": 1.9869402985074627,
"grad_norm": 1.356991102865659,
"learning_rate": 1.0520370456063023e-09,
"loss": 0.1986,
"step": 2130
},
{
"epoch": 1.9878731343283582,
"grad_norm": 1.275323689551574,
"learning_rate": 9.071179572989198e-10,
"loss": 0.1687,
"step": 2131
},
{
"epoch": 1.9888059701492538,
"grad_norm": 1.3226690040861337,
"learning_rate": 7.72932367779089e-10,
"loss": 0.1542,
"step": 2132
},
{
"epoch": 1.9897388059701493,
"grad_norm": 1.3005321836804855,
"learning_rate": 6.494805651557911e-10,
"loss": 0.1672,
"step": 2133
},
{
"epoch": 1.9906716417910446,
"grad_norm": 1.186978116813406,
"learning_rate": 5.367628144897774e-10,
"loss": 0.1429,
"step": 2134
},
{
"epoch": 1.9916044776119404,
"grad_norm": 1.369297450226836,
"learning_rate": 4.3477935779689953e-10,
"loss": 0.2124,
"step": 2135
},
{
"epoch": 1.9925373134328357,
"grad_norm": 1.2885014260387289,
"learning_rate": 3.4353041404477926e-10,
"loss": 0.2039,
"step": 2136
},
{
"epoch": 1.9934701492537314,
"grad_norm": 1.2446402200227755,
"learning_rate": 2.630161791528085e-10,
"loss": 0.1603,
"step": 2137
},
{
"epoch": 1.9944029850746268,
"grad_norm": 1.1952225706546216,
"learning_rate": 1.932368259921491e-10,
"loss": 0.1773,
"step": 2138
},
{
"epoch": 1.9953358208955225,
"grad_norm": 1.1708117724538196,
"learning_rate": 1.3419250438517771e-10,
"loss": 0.161,
"step": 2139
},
{
"epoch": 1.9962686567164178,
"grad_norm": 1.2760179119329083,
"learning_rate": 8.588334110604113e-11,
"loss": 0.1879,
"step": 2140
},
{
"epoch": 1.9972014925373134,
"grad_norm": 1.311338484988188,
"learning_rate": 4.830943987843562e-11,
"loss": 0.1501,
"step": 2141
},
{
"epoch": 1.998134328358209,
"grad_norm": 1.1724830744992465,
"learning_rate": 2.1470881376162157e-11,
"loss": 0.1318,
"step": 2142
},
{
"epoch": 1.9990671641791045,
"grad_norm": 1.2412413185361992,
"learning_rate": 5.367723225346844e-12,
"loss": 0.1641,
"step": 2143
},
{
"epoch": 2.0,
"grad_norm": 1.156053260474211,
"learning_rate": 0.0,
"loss": 0.1385,
"step": 2144
},
{
"epoch": 2.0,
"step": 2144,
"total_flos": 30161139400704.0,
"train_loss": 0.19700502045800103,
"train_runtime": 1537.8264,
"train_samples_per_second": 11.146,
"train_steps_per_second": 1.394
}
],
"logging_steps": 1,
"max_steps": 2144,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 30161139400704.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}