{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0008469791078487,
"eval_steps": 500,
"global_step": 886,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001129305477131564,
"grad_norm": 0.20765775442123413,
"learning_rate": 2e-05,
"loss": 1.9066,
"step": 1
},
{
"epoch": 0.002258610954263128,
"grad_norm": 0.06511678546667099,
"learning_rate": 4e-05,
"loss": 1.6166,
"step": 2
},
{
"epoch": 0.0033879164313946925,
"grad_norm": 0.1536235362291336,
"learning_rate": 6e-05,
"loss": 1.3065,
"step": 3
},
{
"epoch": 0.004517221908526256,
"grad_norm": 0.1566154658794403,
"learning_rate": 8e-05,
"loss": 1.7229,
"step": 4
},
{
"epoch": 0.00564652738565782,
"grad_norm": 0.10513211041688919,
"learning_rate": 0.0001,
"loss": 2.1128,
"step": 5
},
{
"epoch": 0.006775832862789385,
"grad_norm": 0.10510624945163727,
"learning_rate": 9.999968210223322e-05,
"loss": 1.9685,
"step": 6
},
{
"epoch": 0.007905138339920948,
"grad_norm": 0.0922314003109932,
"learning_rate": 9.999872841297521e-05,
"loss": 1.672,
"step": 7
},
{
"epoch": 0.009034443817052512,
"grad_norm": 0.20644737780094147,
"learning_rate": 9.999713894435302e-05,
"loss": 1.7507,
"step": 8
},
{
"epoch": 0.010163749294184076,
"grad_norm": 0.1501447558403015,
"learning_rate": 9.999491371657821e-05,
"loss": 2.3546,
"step": 9
},
{
"epoch": 0.01129305477131564,
"grad_norm": 0.1670556217432022,
"learning_rate": 9.999205275794653e-05,
"loss": 1.4289,
"step": 10
},
{
"epoch": 0.012422360248447204,
"grad_norm": 0.1365075558423996,
"learning_rate": 9.998855610483771e-05,
"loss": 1.7987,
"step": 11
},
{
"epoch": 0.01355166572557877,
"grad_norm": 0.2672828137874603,
"learning_rate": 9.998442380171484e-05,
"loss": 1.6974,
"step": 12
},
{
"epoch": 0.014680971202710334,
"grad_norm": 0.37785211205482483,
"learning_rate": 9.997965590112397e-05,
"loss": 2.1214,
"step": 13
},
{
"epoch": 0.015810276679841896,
"grad_norm": 0.1777479499578476,
"learning_rate": 9.997425246369325e-05,
"loss": 2.0929,
"step": 14
},
{
"epoch": 0.01693958215697346,
"grad_norm": 0.3635629713535309,
"learning_rate": 9.996821355813235e-05,
"loss": 1.7696,
"step": 15
},
{
"epoch": 0.018068887634105024,
"grad_norm": 0.148004412651062,
"learning_rate": 9.996153926123141e-05,
"loss": 1.7952,
"step": 16
},
{
"epoch": 0.019198193111236588,
"grad_norm": 0.14363674819469452,
"learning_rate": 9.995422965786025e-05,
"loss": 1.9941,
"step": 17
},
{
"epoch": 0.020327498588368152,
"grad_norm": 0.17726008594036102,
"learning_rate": 9.994628484096706e-05,
"loss": 2.1081,
"step": 18
},
{
"epoch": 0.021456804065499716,
"grad_norm": 0.1837041974067688,
"learning_rate": 9.993770491157749e-05,
"loss": 2.2376,
"step": 19
},
{
"epoch": 0.02258610954263128,
"grad_norm": 0.28806236386299133,
"learning_rate": 9.992848997879312e-05,
"loss": 1.8928,
"step": 20
},
{
"epoch": 0.023715415019762844,
"grad_norm": 0.16167967021465302,
"learning_rate": 9.991864015979021e-05,
"loss": 1.7153,
"step": 21
},
{
"epoch": 0.024844720496894408,
"grad_norm": 0.25105348229408264,
"learning_rate": 9.99081555798182e-05,
"loss": 2.1614,
"step": 22
},
{
"epoch": 0.025974025974025976,
"grad_norm": 0.1812150478363037,
"learning_rate": 9.989703637219806e-05,
"loss": 2.2657,
"step": 23
},
{
"epoch": 0.02710333145115754,
"grad_norm": 0.1782921701669693,
"learning_rate": 9.988528267832062e-05,
"loss": 2.2051,
"step": 24
},
{
"epoch": 0.028232636928289104,
"grad_norm": 0.18120209872722626,
"learning_rate": 9.987289464764485e-05,
"loss": 2.0319,
"step": 25
},
{
"epoch": 0.029361942405420668,
"grad_norm": 0.2875784635543823,
"learning_rate": 9.985987243769578e-05,
"loss": 1.9888,
"step": 26
},
{
"epoch": 0.030491247882552232,
"grad_norm": 0.18351072072982788,
"learning_rate": 9.984621621406273e-05,
"loss": 1.8919,
"step": 27
},
{
"epoch": 0.03162055335968379,
"grad_norm": 0.199467733502388,
"learning_rate": 9.983192615039699e-05,
"loss": 1.9984,
"step": 28
},
{
"epoch": 0.03274985883681536,
"grad_norm": 0.25572243332862854,
"learning_rate": 9.981700242840974e-05,
"loss": 2.2746,
"step": 29
},
{
"epoch": 0.03387916431394692,
"grad_norm": 0.5004045367240906,
"learning_rate": 9.980144523786967e-05,
"loss": 1.8283,
"step": 30
},
{
"epoch": 0.03500846979107849,
"grad_norm": 0.7726016640663147,
"learning_rate": 9.978525477660066e-05,
"loss": 1.889,
"step": 31
},
{
"epoch": 0.03613777526821005,
"grad_norm": 0.2675948441028595,
"learning_rate": 9.976843125047916e-05,
"loss": 2.2772,
"step": 32
},
{
"epoch": 0.037267080745341616,
"grad_norm": 0.27251410484313965,
"learning_rate": 9.975097487343161e-05,
"loss": 1.5502,
"step": 33
},
{
"epoch": 0.038396386222473176,
"grad_norm": 0.3356260061264038,
"learning_rate": 9.973288586743175e-05,
"loss": 1.8092,
"step": 34
},
{
"epoch": 0.039525691699604744,
"grad_norm": 0.2943379878997803,
"learning_rate": 9.971416446249777e-05,
"loss": 1.6588,
"step": 35
},
{
"epoch": 0.040654997176736304,
"grad_norm": 0.3058868646621704,
"learning_rate": 9.969481089668938e-05,
"loss": 1.8885,
"step": 36
},
{
"epoch": 0.04178430265386787,
"grad_norm": 0.4374743103981018,
"learning_rate": 9.96748254161048e-05,
"loss": 1.4605,
"step": 37
},
{
"epoch": 0.04291360813099943,
"grad_norm": 0.4457000195980072,
"learning_rate": 9.965420827487759e-05,
"loss": 2.223,
"step": 38
},
{
"epoch": 0.044042913608131,
"grad_norm": 0.4432419240474701,
"learning_rate": 9.963295973517352e-05,
"loss": 2.1732,
"step": 39
},
{
"epoch": 0.04517221908526256,
"grad_norm": 0.7509264945983887,
"learning_rate": 9.961108006718708e-05,
"loss": 2.3801,
"step": 40
},
{
"epoch": 0.04630152456239413,
"grad_norm": 0.4913429319858551,
"learning_rate": 9.95885695491382e-05,
"loss": 2.2028,
"step": 41
},
{
"epoch": 0.04743083003952569,
"grad_norm": 0.7258439660072327,
"learning_rate": 9.95654284672686e-05,
"loss": 2.2493,
"step": 42
},
{
"epoch": 0.048560135516657256,
"grad_norm": 1.0519062280654907,
"learning_rate": 9.954165711583821e-05,
"loss": 1.8228,
"step": 43
},
{
"epoch": 0.049689440993788817,
"grad_norm": 0.9518550634384155,
"learning_rate": 9.951725579712143e-05,
"loss": 1.9532,
"step": 44
},
{
"epoch": 0.050818746470920384,
"grad_norm": 2.408336639404297,
"learning_rate": 9.949222482140325e-05,
"loss": 1.943,
"step": 45
},
{
"epoch": 0.05194805194805195,
"grad_norm": 1.485158085823059,
"learning_rate": 9.946656450697529e-05,
"loss": 1.9683,
"step": 46
},
{
"epoch": 0.05307735742518351,
"grad_norm": 1.8976699113845825,
"learning_rate": 9.944027518013187e-05,
"loss": 2.4141,
"step": 47
},
{
"epoch": 0.05420666290231508,
"grad_norm": 2.9642817974090576,
"learning_rate": 9.941335717516564e-05,
"loss": 2.496,
"step": 48
},
{
"epoch": 0.05533596837944664,
"grad_norm": 3.4149694442749023,
"learning_rate": 9.938581083436363e-05,
"loss": 1.9677,
"step": 49
},
{
"epoch": 0.05646527385657821,
"grad_norm": 3.277317523956299,
"learning_rate": 9.935763650800259e-05,
"loss": 2.442,
"step": 50
},
{
"epoch": 0.05759457933370977,
"grad_norm": 0.18346086144447327,
"learning_rate": 9.932883455434476e-05,
"loss": 1.5118,
"step": 51
},
{
"epoch": 0.058723884810841336,
"grad_norm": 0.24609524011611938,
"learning_rate": 9.929940533963322e-05,
"loss": 1.8492,
"step": 52
},
{
"epoch": 0.059853190287972896,
"grad_norm": 0.33370301127433777,
"learning_rate": 9.926934923808722e-05,
"loss": 1.6053,
"step": 53
},
{
"epoch": 0.060982495765104464,
"grad_norm": 0.2675216495990753,
"learning_rate": 9.923866663189748e-05,
"loss": 1.3325,
"step": 54
},
{
"epoch": 0.062111801242236024,
"grad_norm": 0.32292208075523376,
"learning_rate": 9.920735791122126e-05,
"loss": 2.016,
"step": 55
},
{
"epoch": 0.06324110671936758,
"grad_norm": 0.32643455266952515,
"learning_rate": 9.917542347417746e-05,
"loss": 1.9595,
"step": 56
},
{
"epoch": 0.06437041219649915,
"grad_norm": 0.43551144003868103,
"learning_rate": 9.914286372684153e-05,
"loss": 1.6951,
"step": 57
},
{
"epoch": 0.06549971767363072,
"grad_norm": 0.3900545835494995,
"learning_rate": 9.910967908324033e-05,
"loss": 1.9828,
"step": 58
},
{
"epoch": 0.06662902315076229,
"grad_norm": 0.4311246871948242,
"learning_rate": 9.907586996534679e-05,
"loss": 1.4884,
"step": 59
},
{
"epoch": 0.06775832862789384,
"grad_norm": 0.46540212631225586,
"learning_rate": 9.904143680307464e-05,
"loss": 2.019,
"step": 60
},
{
"epoch": 0.06888763410502541,
"grad_norm": 0.3635936379432678,
"learning_rate": 9.900638003427291e-05,
"loss": 1.7411,
"step": 61
},
{
"epoch": 0.07001693958215698,
"grad_norm": 0.4469711482524872,
"learning_rate": 9.897070010472033e-05,
"loss": 2.1121,
"step": 62
},
{
"epoch": 0.07114624505928854,
"grad_norm": 0.3956710398197174,
"learning_rate": 9.89343974681197e-05,
"loss": 1.8976,
"step": 63
},
{
"epoch": 0.0722755505364201,
"grad_norm": 0.3273422122001648,
"learning_rate": 9.88974725860921e-05,
"loss": 2.4247,
"step": 64
},
{
"epoch": 0.07340485601355166,
"grad_norm": 0.40820926427841187,
"learning_rate": 9.885992592817103e-05,
"loss": 1.8519,
"step": 65
},
{
"epoch": 0.07453416149068323,
"grad_norm": 0.41497233510017395,
"learning_rate": 9.882175797179647e-05,
"loss": 2.0412,
"step": 66
},
{
"epoch": 0.0756634669678148,
"grad_norm": 0.26064857840538025,
"learning_rate": 9.878296920230869e-05,
"loss": 1.6159,
"step": 67
},
{
"epoch": 0.07679277244494635,
"grad_norm": 0.25881126523017883,
"learning_rate": 9.874356011294226e-05,
"loss": 1.7785,
"step": 68
},
{
"epoch": 0.07792207792207792,
"grad_norm": 0.31029218435287476,
"learning_rate": 9.870353120481961e-05,
"loss": 1.8466,
"step": 69
},
{
"epoch": 0.07905138339920949,
"grad_norm": 0.24932654201984406,
"learning_rate": 9.866288298694479e-05,
"loss": 2.0475,
"step": 70
},
{
"epoch": 0.08018068887634106,
"grad_norm": 0.24546779692173004,
"learning_rate": 9.862161597619689e-05,
"loss": 1.7603,
"step": 71
},
{
"epoch": 0.08130999435347261,
"grad_norm": 0.2344997376203537,
"learning_rate": 9.857973069732354e-05,
"loss": 1.8594,
"step": 72
},
{
"epoch": 0.08243929983060418,
"grad_norm": 0.27005735039711,
"learning_rate": 9.853722768293419e-05,
"loss": 2.0338,
"step": 73
},
{
"epoch": 0.08356860530773574,
"grad_norm": 0.22687004506587982,
"learning_rate": 9.849410747349338e-05,
"loss": 2.0741,
"step": 74
},
{
"epoch": 0.08469791078486731,
"grad_norm": 0.2578216791152954,
"learning_rate": 9.845037061731386e-05,
"loss": 2.0056,
"step": 75
},
{
"epoch": 0.08582721626199886,
"grad_norm": 0.21841199696063995,
"learning_rate": 9.840601767054957e-05,
"loss": 2.0873,
"step": 76
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.2152131050825119,
"learning_rate": 9.83610491971886e-05,
"loss": 1.6326,
"step": 77
},
{
"epoch": 0.088085827216262,
"grad_norm": 0.2605539560317993,
"learning_rate": 9.831546576904609e-05,
"loss": 1.7596,
"step": 78
},
{
"epoch": 0.08921513269339357,
"grad_norm": 0.2637302279472351,
"learning_rate": 9.826926796575679e-05,
"loss": 2.0953,
"step": 79
},
{
"epoch": 0.09034443817052512,
"grad_norm": 0.27054956555366516,
"learning_rate": 9.822245637476787e-05,
"loss": 1.8897,
"step": 80
},
{
"epoch": 0.09147374364765669,
"grad_norm": 0.25898051261901855,
"learning_rate": 9.817503159133132e-05,
"loss": 1.7747,
"step": 81
},
{
"epoch": 0.09260304912478826,
"grad_norm": 0.2623620331287384,
"learning_rate": 9.812699421849647e-05,
"loss": 1.603,
"step": 82
},
{
"epoch": 0.09373235460191982,
"grad_norm": 0.26288437843322754,
"learning_rate": 9.807834486710226e-05,
"loss": 1.8412,
"step": 83
},
{
"epoch": 0.09486166007905138,
"grad_norm": 0.323650598526001,
"learning_rate": 9.802908415576948e-05,
"loss": 1.3957,
"step": 84
},
{
"epoch": 0.09599096555618294,
"grad_norm": 0.3127897381782532,
"learning_rate": 9.797921271089294e-05,
"loss": 1.6553,
"step": 85
},
{
"epoch": 0.09712027103331451,
"grad_norm": 0.3926398456096649,
"learning_rate": 9.792873116663348e-05,
"loss": 1.4887,
"step": 86
},
{
"epoch": 0.09824957651044608,
"grad_norm": 0.6420180797576904,
"learning_rate": 9.787764016490992e-05,
"loss": 2.0315,
"step": 87
},
{
"epoch": 0.09937888198757763,
"grad_norm": 0.4471886157989502,
"learning_rate": 9.782594035539085e-05,
"loss": 1.5553,
"step": 88
},
{
"epoch": 0.1005081874647092,
"grad_norm": 0.4515712559223175,
"learning_rate": 9.777363239548644e-05,
"loss": 1.7503,
"step": 89
},
{
"epoch": 0.10163749294184077,
"grad_norm": 0.41610848903656006,
"learning_rate": 9.772071695034006e-05,
"loss": 1.7643,
"step": 90
},
{
"epoch": 0.10276679841897234,
"grad_norm": 0.41934433579444885,
"learning_rate": 9.766719469281974e-05,
"loss": 1.8208,
"step": 91
},
{
"epoch": 0.1038961038961039,
"grad_norm": 0.42789050936698914,
"learning_rate": 9.761306630350976e-05,
"loss": 1.5227,
"step": 92
},
{
"epoch": 0.10502540937323546,
"grad_norm": 0.47943541407585144,
"learning_rate": 9.755833247070188e-05,
"loss": 1.4097,
"step": 93
},
{
"epoch": 0.10615471485036702,
"grad_norm": 0.7126927971839905,
"learning_rate": 9.750299389038659e-05,
"loss": 1.4319,
"step": 94
},
{
"epoch": 0.10728402032749859,
"grad_norm": 0.7886515855789185,
"learning_rate": 9.744705126624439e-05,
"loss": 1.5488,
"step": 95
},
{
"epoch": 0.10841332580463016,
"grad_norm": 0.9192765355110168,
"learning_rate": 9.739050530963665e-05,
"loss": 1.4749,
"step": 96
},
{
"epoch": 0.10954263128176171,
"grad_norm": 0.9051669239997864,
"learning_rate": 9.733335673959671e-05,
"loss": 1.3721,
"step": 97
},
{
"epoch": 0.11067193675889328,
"grad_norm": 1.4240723848342896,
"learning_rate": 9.727560628282071e-05,
"loss": 1.6698,
"step": 98
},
{
"epoch": 0.11180124223602485,
"grad_norm": 1.5896399021148682,
"learning_rate": 9.721725467365826e-05,
"loss": 1.8088,
"step": 99
},
{
"epoch": 0.11293054771315642,
"grad_norm": 3.1276590824127197,
"learning_rate": 9.715830265410324e-05,
"loss": 3.2245,
"step": 100
},
{
"epoch": 0.11405985319028797,
"grad_norm": 0.10527385026216507,
"learning_rate": 9.709875097378425e-05,
"loss": 0.9533,
"step": 101
},
{
"epoch": 0.11518915866741954,
"grad_norm": 0.11625129729509354,
"learning_rate": 9.703860038995515e-05,
"loss": 1.5122,
"step": 102
},
{
"epoch": 0.1163184641445511,
"grad_norm": 0.14005830883979797,
"learning_rate": 9.697785166748536e-05,
"loss": 1.3598,
"step": 103
},
{
"epoch": 0.11744776962168267,
"grad_norm": 0.20591479539871216,
"learning_rate": 9.691650557885026e-05,
"loss": 1.6923,
"step": 104
},
{
"epoch": 0.11857707509881422,
"grad_norm": 0.1916189342737198,
"learning_rate": 9.685456290412119e-05,
"loss": 1.5945,
"step": 105
},
{
"epoch": 0.11970638057594579,
"grad_norm": 0.18392325937747955,
"learning_rate": 9.679202443095566e-05,
"loss": 1.539,
"step": 106
},
{
"epoch": 0.12083568605307736,
"grad_norm": 0.24299819767475128,
"learning_rate": 9.672889095458734e-05,
"loss": 2.0555,
"step": 107
},
{
"epoch": 0.12196499153020893,
"grad_norm": 0.1931591033935547,
"learning_rate": 9.666516327781588e-05,
"loss": 1.4817,
"step": 108
},
{
"epoch": 0.12309429700734048,
"grad_norm": 0.18632109463214874,
"learning_rate": 9.660084221099671e-05,
"loss": 1.8985,
"step": 109
},
{
"epoch": 0.12422360248447205,
"grad_norm": 0.3496991693973541,
"learning_rate": 9.653592857203076e-05,
"loss": 1.8324,
"step": 110
},
{
"epoch": 0.12535290796160362,
"grad_norm": 0.2754102945327759,
"learning_rate": 9.647042318635407e-05,
"loss": 1.8554,
"step": 111
},
{
"epoch": 0.12648221343873517,
"grad_norm": 0.2471209615468979,
"learning_rate": 9.640432688692728e-05,
"loss": 1.8519,
"step": 112
},
{
"epoch": 0.12761151891586675,
"grad_norm": 0.2555141746997833,
"learning_rate": 9.633764051422504e-05,
"loss": 2.0029,
"step": 113
},
{
"epoch": 0.1287408243929983,
"grad_norm": 0.28241923451423645,
"learning_rate": 9.627036491622529e-05,
"loss": 1.8218,
"step": 114
},
{
"epoch": 0.12987012987012986,
"grad_norm": 0.2457263320684433,
"learning_rate": 9.620250094839852e-05,
"loss": 1.984,
"step": 115
},
{
"epoch": 0.13099943534726144,
"grad_norm": 0.2737242579460144,
"learning_rate": 9.61340494736969e-05,
"loss": 1.6238,
"step": 116
},
{
"epoch": 0.132128740824393,
"grad_norm": 0.3082609176635742,
"learning_rate": 9.606501136254327e-05,
"loss": 1.6898,
"step": 117
},
{
"epoch": 0.13325804630152457,
"grad_norm": 0.2536124289035797,
"learning_rate": 9.599538749282005e-05,
"loss": 1.8739,
"step": 118
},
{
"epoch": 0.13438735177865613,
"grad_norm": 0.31439918279647827,
"learning_rate": 9.592517874985819e-05,
"loss": 1.7295,
"step": 119
},
{
"epoch": 0.13551665725578768,
"grad_norm": 0.23434846103191376,
"learning_rate": 9.585438602642578e-05,
"loss": 1.8987,
"step": 120
},
{
"epoch": 0.13664596273291926,
"grad_norm": 0.2667776048183441,
"learning_rate": 9.578301022271676e-05,
"loss": 1.8776,
"step": 121
},
{
"epoch": 0.13777526821005082,
"grad_norm": 0.26877158880233765,
"learning_rate": 9.571105224633948e-05,
"loss": 1.9248,
"step": 122
},
{
"epoch": 0.13890457368718237,
"grad_norm": 0.28717201948165894,
"learning_rate": 9.563851301230512e-05,
"loss": 1.9187,
"step": 123
},
{
"epoch": 0.14003387916431395,
"grad_norm": 0.2537637948989868,
"learning_rate": 9.556539344301613e-05,
"loss": 2.0498,
"step": 124
},
{
"epoch": 0.1411631846414455,
"grad_norm": 0.2713514566421509,
"learning_rate": 9.549169446825441e-05,
"loss": 1.9569,
"step": 125
},
{
"epoch": 0.1422924901185771,
"grad_norm": 0.27854233980178833,
"learning_rate": 9.541741702516954e-05,
"loss": 1.9205,
"step": 126
},
{
"epoch": 0.14342179559570864,
"grad_norm": 0.27771830558776855,
"learning_rate": 9.534256205826684e-05,
"loss": 1.8535,
"step": 127
},
{
"epoch": 0.1445511010728402,
"grad_norm": 0.2616361975669861,
"learning_rate": 9.52671305193954e-05,
"loss": 1.7369,
"step": 128
},
{
"epoch": 0.14568040654997177,
"grad_norm": 0.2548147141933441,
"learning_rate": 9.519112336773593e-05,
"loss": 2.1447,
"step": 129
},
{
"epoch": 0.14680971202710333,
"grad_norm": 0.43677401542663574,
"learning_rate": 9.511454156978855e-05,
"loss": 1.7814,
"step": 130
},
{
"epoch": 0.14793901750423488,
"grad_norm": 0.32601964473724365,
"learning_rate": 9.50373860993606e-05,
"loss": 1.8351,
"step": 131
},
{
"epoch": 0.14906832298136646,
"grad_norm": 0.3155123293399811,
"learning_rate": 9.495965793755414e-05,
"loss": 1.8318,
"step": 132
},
{
"epoch": 0.15019762845849802,
"grad_norm": 0.29326286911964417,
"learning_rate": 9.488135807275351e-05,
"loss": 1.613,
"step": 133
},
{
"epoch": 0.1513269339356296,
"grad_norm": 0.3525848686695099,
"learning_rate": 9.480248750061283e-05,
"loss": 1.8371,
"step": 134
},
{
"epoch": 0.15245623941276115,
"grad_norm": 0.30814632773399353,
"learning_rate": 9.472304722404323e-05,
"loss": 1.7233,
"step": 135
},
{
"epoch": 0.1535855448898927,
"grad_norm": 0.34674981236457825,
"learning_rate": 9.464303825320018e-05,
"loss": 1.6202,
"step": 136
},
{
"epoch": 0.1547148503670243,
"grad_norm": 0.3566695749759674,
"learning_rate": 9.456246160547057e-05,
"loss": 1.4083,
"step": 137
},
{
"epoch": 0.15584415584415584,
"grad_norm": 0.5000084638595581,
"learning_rate": 9.448131830545992e-05,
"loss": 1.5184,
"step": 138
},
{
"epoch": 0.15697346132128742,
"grad_norm": 0.341934472322464,
"learning_rate": 9.439960938497914e-05,
"loss": 1.7534,
"step": 139
},
{
"epoch": 0.15810276679841898,
"grad_norm": 0.5167602300643921,
"learning_rate": 9.431733588303156e-05,
"loss": 1.5504,
"step": 140
},
{
"epoch": 0.15923207227555053,
"grad_norm": 0.5055645108222961,
"learning_rate": 9.423449884579972e-05,
"loss": 2.1625,
"step": 141
},
{
"epoch": 0.1603613777526821,
"grad_norm": 0.4868288040161133,
"learning_rate": 9.415109932663193e-05,
"loss": 1.6623,
"step": 142
},
{
"epoch": 0.16149068322981366,
"grad_norm": 0.6852800250053406,
"learning_rate": 9.406713838602907e-05,
"loss": 1.7651,
"step": 143
},
{
"epoch": 0.16261998870694522,
"grad_norm": 0.7421604990959167,
"learning_rate": 9.398261709163095e-05,
"loss": 0.8697,
"step": 144
},
{
"epoch": 0.1637492941840768,
"grad_norm": 0.8236505389213562,
"learning_rate": 9.389753651820279e-05,
"loss": 1.2798,
"step": 145
},
{
"epoch": 0.16487859966120835,
"grad_norm": 0.7554454207420349,
"learning_rate": 9.381189774762158e-05,
"loss": 0.9496,
"step": 146
},
{
"epoch": 0.16600790513833993,
"grad_norm": 0.9927592873573303,
"learning_rate": 9.372570186886225e-05,
"loss": 2.1169,
"step": 147
},
{
"epoch": 0.1671372106154715,
"grad_norm": 1.3748329877853394,
"learning_rate": 9.363894997798392e-05,
"loss": 1.5176,
"step": 148
},
{
"epoch": 0.16826651609260304,
"grad_norm": 2.4232335090637207,
"learning_rate": 9.355164317811587e-05,
"loss": 1.9086,
"step": 149
},
{
"epoch": 0.16939582156973462,
"grad_norm": 3.602113962173462,
"learning_rate": 9.346378257944357e-05,
"loss": 2.502,
"step": 150
},
{
"epoch": 0.17052512704686618,
"grad_norm": 0.0982755646109581,
"learning_rate": 9.337536929919454e-05,
"loss": 0.9732,
"step": 151
},
{
"epoch": 0.17165443252399773,
"grad_norm": 0.1385735720396042,
"learning_rate": 9.328640446162416e-05,
"loss": 1.6766,
"step": 152
},
{
"epoch": 0.1727837380011293,
"grad_norm": 0.15490496158599854,
"learning_rate": 9.319688919800137e-05,
"loss": 1.4763,
"step": 153
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.15035314857959747,
"learning_rate": 9.310682464659424e-05,
"loss": 1.7746,
"step": 154
},
{
"epoch": 0.17504234895539245,
"grad_norm": 0.16564689576625824,
"learning_rate": 9.30162119526556e-05,
"loss": 1.7991,
"step": 155
},
{
"epoch": 0.176171654432524,
"grad_norm": 0.24599392712116241,
"learning_rate": 9.292505226840832e-05,
"loss": 1.6264,
"step": 156
},
{
"epoch": 0.17730095990965555,
"grad_norm": 0.17428162693977356,
"learning_rate": 9.283334675303084e-05,
"loss": 1.6458,
"step": 157
},
{
"epoch": 0.17843026538678713,
"grad_norm": 0.16533192992210388,
"learning_rate": 9.274109657264227e-05,
"loss": 1.8436,
"step": 158
},
{
"epoch": 0.1795595708639187,
"grad_norm": 0.2076898068189621,
"learning_rate": 9.264830290028771e-05,
"loss": 1.9228,
"step": 159
},
{
"epoch": 0.18068887634105024,
"grad_norm": 0.2833722233772278,
"learning_rate": 9.255496691592316e-05,
"loss": 1.5502,
"step": 160
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.23064666986465454,
"learning_rate": 9.246108980640069e-05,
"loss": 1.7692,
"step": 161
},
{
"epoch": 0.18294748729531338,
"grad_norm": 0.3194381892681122,
"learning_rate": 9.236667276545323e-05,
"loss": 1.4503,
"step": 162
},
{
"epoch": 0.18407679277244496,
"grad_norm": 0.24672725796699524,
"learning_rate": 9.227171699367943e-05,
"loss": 1.6418,
"step": 163
},
{
"epoch": 0.1852060982495765,
"grad_norm": 0.2164398431777954,
"learning_rate": 9.217622369852842e-05,
"loss": 1.4357,
"step": 164
},
{
"epoch": 0.18633540372670807,
"grad_norm": 0.18639886379241943,
"learning_rate": 9.208019409428439e-05,
"loss": 1.7777,
"step": 165
},
{
"epoch": 0.18746470920383965,
"grad_norm": 0.2551623284816742,
"learning_rate": 9.198362940205123e-05,
"loss": 1.7788,
"step": 166
},
{
"epoch": 0.1885940146809712,
"grad_norm": 0.20203350484371185,
"learning_rate": 9.188653084973692e-05,
"loss": 1.8567,
"step": 167
},
{
"epoch": 0.18972332015810275,
"grad_norm": 0.25348034501075745,
"learning_rate": 9.178889967203798e-05,
"loss": 1.9362,
"step": 168
},
{
"epoch": 0.19085262563523434,
"grad_norm": 0.22443613409996033,
"learning_rate": 9.169073711042378e-05,
"loss": 1.9439,
"step": 169
},
{
"epoch": 0.1919819311123659,
"grad_norm": 0.23565413057804108,
"learning_rate": 9.159204441312064e-05,
"loss": 1.8284,
"step": 170
},
{
"epoch": 0.19311123658949747,
"grad_norm": 0.24979020655155182,
"learning_rate": 9.14928228350961e-05,
"loss": 1.666,
"step": 171
},
{
"epoch": 0.19424054206662902,
"grad_norm": 0.2663923501968384,
"learning_rate": 9.139307363804289e-05,
"loss": 1.822,
"step": 172
},
{
"epoch": 0.19536984754376058,
"grad_norm": 0.23612940311431885,
"learning_rate": 9.129279809036287e-05,
"loss": 1.8881,
"step": 173
},
{
"epoch": 0.19649915302089216,
"grad_norm": 0.22009217739105225,
"learning_rate": 9.119199746715096e-05,
"loss": 1.854,
"step": 174
},
{
"epoch": 0.1976284584980237,
"grad_norm": 0.3293054699897766,
"learning_rate": 9.109067305017889e-05,
"loss": 1.8201,
"step": 175
},
{
"epoch": 0.19875776397515527,
"grad_norm": 0.22837886214256287,
"learning_rate": 9.098882612787886e-05,
"loss": 1.8194,
"step": 176
},
{
"epoch": 0.19988706945228685,
"grad_norm": 0.2876310646533966,
"learning_rate": 9.088645799532729e-05,
"loss": 1.7314,
"step": 177
},
{
"epoch": 0.2010163749294184,
"grad_norm": 0.21625038981437683,
"learning_rate": 9.078356995422817e-05,
"loss": 2.0749,
"step": 178
},
{
"epoch": 0.20214568040654998,
"grad_norm": 0.28515487909317017,
"learning_rate": 9.068016331289663e-05,
"loss": 1.8572,
"step": 179
},
{
"epoch": 0.20327498588368154,
"grad_norm": 0.2739616632461548,
"learning_rate": 9.057623938624234e-05,
"loss": 1.8118,
"step": 180
},
{
"epoch": 0.2044042913608131,
"grad_norm": 0.2997945547103882,
"learning_rate": 9.047179949575261e-05,
"loss": 1.713,
"step": 181
},
{
"epoch": 0.20553359683794467,
"grad_norm": 0.2930619418621063,
"learning_rate": 9.036684496947577e-05,
"loss": 1.7162,
"step": 182
},
{
"epoch": 0.20666290231507622,
"grad_norm": 0.2975890338420868,
"learning_rate": 9.026137714200423e-05,
"loss": 1.6059,
"step": 183
},
{
"epoch": 0.2077922077922078,
"grad_norm": 0.3093002736568451,
"learning_rate": 9.015539735445742e-05,
"loss": 1.6468,
"step": 184
},
{
"epoch": 0.20892151326933936,
"grad_norm": 0.3571140170097351,
"learning_rate": 9.004890695446489e-05,
"loss": 1.849,
"step": 185
},
{
"epoch": 0.2100508187464709,
"grad_norm": 0.34700098633766174,
"learning_rate": 8.994190729614903e-05,
"loss": 1.8542,
"step": 186
},
{
"epoch": 0.2111801242236025,
"grad_norm": 0.38373562693595886,
"learning_rate": 8.983439974010794e-05,
"loss": 1.4129,
"step": 187
},
{
"epoch": 0.21230942970073405,
"grad_norm": 0.3819943368434906,
"learning_rate": 8.972638565339812e-05,
"loss": 1.6305,
"step": 188
},
{
"epoch": 0.2134387351778656,
"grad_norm": 0.4611022472381592,
"learning_rate": 8.961786640951701e-05,
"loss": 1.9939,
"step": 189
},
{
"epoch": 0.21456804065499718,
"grad_norm": 0.478437602519989,
"learning_rate": 8.950884338838567e-05,
"loss": 1.9253,
"step": 190
},
{
"epoch": 0.21569734613212874,
"grad_norm": 0.5726389288902283,
"learning_rate": 8.939931797633106e-05,
"loss": 1.8656,
"step": 191
},
{
"epoch": 0.21682665160926032,
"grad_norm": 0.7200562357902527,
"learning_rate": 8.928929156606854e-05,
"loss": 1.4691,
"step": 192
},
{
"epoch": 0.21795595708639187,
"grad_norm": 0.6185989379882812,
"learning_rate": 8.917876555668412e-05,
"loss": 1.392,
"step": 193
},
{
"epoch": 0.21908526256352343,
"grad_norm": 0.6441951394081116,
"learning_rate": 8.906774135361667e-05,
"loss": 0.839,
"step": 194
},
{
"epoch": 0.220214568040655,
"grad_norm": 0.717701256275177,
"learning_rate": 8.895622036864004e-05,
"loss": 1.1955,
"step": 195
},
{
"epoch": 0.22134387351778656,
"grad_norm": 0.7623339295387268,
"learning_rate": 8.884420401984509e-05,
"loss": 1.2057,
"step": 196
},
{
"epoch": 0.2224731789949181,
"grad_norm": 1.0133534669876099,
"learning_rate": 8.873169373162174e-05,
"loss": 1.6524,
"step": 197
},
{
"epoch": 0.2236024844720497,
"grad_norm": 2.122885227203369,
"learning_rate": 8.861869093464073e-05,
"loss": 1.6981,
"step": 198
},
{
"epoch": 0.22473178994918125,
"grad_norm": 1.8476300239562988,
"learning_rate": 8.850519706583553e-05,
"loss": 1.4381,
"step": 199
},
{
"epoch": 0.22586109542631283,
"grad_norm": 3.508267879486084,
"learning_rate": 8.839121356838406e-05,
"loss": 2.286,
"step": 200
},
{
"epoch": 0.22699040090344438,
"grad_norm": 0.0983705148100853,
"learning_rate": 8.827674189169031e-05,
"loss": 1.8119,
"step": 201
},
{
"epoch": 0.22811970638057594,
"grad_norm": 0.11233831197023392,
"learning_rate": 8.816178349136586e-05,
"loss": 1.5454,
"step": 202
},
{
"epoch": 0.22924901185770752,
"grad_norm": 0.14189288020133972,
"learning_rate": 8.804633982921146e-05,
"loss": 1.9528,
"step": 203
},
{
"epoch": 0.23037831733483907,
"grad_norm": 0.17341268062591553,
"learning_rate": 8.793041237319843e-05,
"loss": 1.3116,
"step": 204
},
{
"epoch": 0.23150762281197063,
"grad_norm": 0.22769945859909058,
"learning_rate": 8.781400259744993e-05,
"loss": 1.5896,
"step": 205
},
{
"epoch": 0.2326369282891022,
"grad_norm": 0.1609055995941162,
"learning_rate": 8.769711198222225e-05,
"loss": 1.188,
"step": 206
},
{
"epoch": 0.23376623376623376,
"grad_norm": 0.15893089771270752,
"learning_rate": 8.757974201388605e-05,
"loss": 2.0351,
"step": 207
},
{
"epoch": 0.23489553924336534,
"grad_norm": 0.21750855445861816,
"learning_rate": 8.746189418490736e-05,
"loss": 1.7586,
"step": 208
},
{
"epoch": 0.2360248447204969,
"grad_norm": 0.20967909693717957,
"learning_rate": 8.73435699938286e-05,
"loss": 1.8875,
"step": 209
},
{
"epoch": 0.23715415019762845,
"grad_norm": 0.2173241674900055,
"learning_rate": 8.722477094524967e-05,
"loss": 1.8267,
"step": 210
},
{
"epoch": 0.23828345567476003,
"grad_norm": 0.23391224443912506,
"learning_rate": 8.710549854980863e-05,
"loss": 1.5153,
"step": 211
},
{
"epoch": 0.23941276115189158,
"grad_norm": 0.25549450516700745,
"learning_rate": 8.69857543241626e-05,
"loss": 1.6052,
"step": 212
},
{
"epoch": 0.24054206662902314,
"grad_norm": 0.18461880087852478,
"learning_rate": 8.686553979096848e-05,
"loss": 1.8639,
"step": 213
},
{
"epoch": 0.24167137210615472,
"grad_norm": 0.2688834071159363,
"learning_rate": 8.674485647886351e-05,
"loss": 2.0688,
"step": 214
},
{
"epoch": 0.24280067758328627,
"grad_norm": 0.28456270694732666,
"learning_rate": 8.662370592244593e-05,
"loss": 1.8468,
"step": 215
},
{
"epoch": 0.24392998306041785,
"grad_norm": 0.2307300567626953,
"learning_rate": 8.650208966225537e-05,
"loss": 1.7642,
"step": 216
},
{
"epoch": 0.2450592885375494,
"grad_norm": 0.2560746669769287,
"learning_rate": 8.638000924475336e-05,
"loss": 1.6622,
"step": 217
},
{
"epoch": 0.24618859401468096,
"grad_norm": 0.20194853842258453,
"learning_rate": 8.625746622230355e-05,
"loss": 1.9663,
"step": 218
},
{
"epoch": 0.24731789949181254,
"grad_norm": 0.33031973242759705,
"learning_rate": 8.61344621531521e-05,
"loss": 1.9537,
"step": 219
},
{
"epoch": 0.2484472049689441,
"grad_norm": 0.26182833313941956,
"learning_rate": 8.601099860140774e-05,
"loss": 1.9116,
"step": 220
},
{
"epoch": 0.24957651044607565,
"grad_norm": 0.23049500584602356,
"learning_rate": 8.588707713702198e-05,
"loss": 1.7189,
"step": 221
},
{
"epoch": 0.25070581592320723,
"grad_norm": 0.21192899346351624,
"learning_rate": 8.576269933576909e-05,
"loss": 1.9939,
"step": 222
},
{
"epoch": 0.2518351214003388,
"grad_norm": 0.27587172389030457,
"learning_rate": 8.563786677922608e-05,
"loss": 1.9758,
"step": 223
},
{
"epoch": 0.25296442687747034,
"grad_norm": 0.29069381952285767,
"learning_rate": 8.551258105475256e-05,
"loss": 1.7336,
"step": 224
},
{
"epoch": 0.2540937323546019,
"grad_norm": 0.2707800567150116,
"learning_rate": 8.538684375547064e-05,
"loss": 1.8063,
"step": 225
},
{
"epoch": 0.2552230378317335,
"grad_norm": 0.2815471887588501,
"learning_rate": 8.526065648024459e-05,
"loss": 1.8871,
"step": 226
},
{
"epoch": 0.256352343308865,
"grad_norm": 0.22785307466983795,
"learning_rate": 8.513402083366054e-05,
"loss": 1.6936,
"step": 227
},
{
"epoch": 0.2574816487859966,
"grad_norm": 0.22646154463291168,
"learning_rate": 8.5006938426006e-05,
"loss": 1.9189,
"step": 228
},
{
"epoch": 0.2586109542631282,
"grad_norm": 0.2953996956348419,
"learning_rate": 8.487941087324958e-05,
"loss": 1.8463,
"step": 229
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.2842676043510437,
"learning_rate": 8.475143979702022e-05,
"loss": 1.4269,
"step": 230
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.2800816297531128,
"learning_rate": 8.46230268245867e-05,
"loss": 1.8405,
"step": 231
},
{
"epoch": 0.2619988706945229,
"grad_norm": 0.30798566341400146,
"learning_rate": 8.449417358883689e-05,
"loss": 1.706,
"step": 232
},
{
"epoch": 0.26312817617165446,
"grad_norm": 0.32843559980392456,
"learning_rate": 8.436488172825705e-05,
"loss": 1.7581,
"step": 233
},
{
"epoch": 0.264257481648786,
"grad_norm": 0.3350053131580353,
"learning_rate": 8.42351528869109e-05,
"loss": 1.4183,
"step": 234
},
{
"epoch": 0.26538678712591757,
"grad_norm": 0.32234182953834534,
"learning_rate": 8.410498871441886e-05,
"loss": 1.6521,
"step": 235
},
{
"epoch": 0.26651609260304915,
"grad_norm": 0.3101066052913666,
"learning_rate": 8.397439086593683e-05,
"loss": 1.7472,
"step": 236
},
{
"epoch": 0.2676453980801807,
"grad_norm": 0.4354749917984009,
"learning_rate": 8.384336100213546e-05,
"loss": 1.5799,
"step": 237
},
{
"epoch": 0.26877470355731226,
"grad_norm": 0.3919726014137268,
"learning_rate": 8.371190078917875e-05,
"loss": 1.2785,
"step": 238
},
{
"epoch": 0.26990400903444384,
"grad_norm": 0.38805148005485535,
"learning_rate": 8.358001189870303e-05,
"loss": 1.5701,
"step": 239
},
{
"epoch": 0.27103331451157536,
"grad_norm": 0.4695982336997986,
"learning_rate": 8.344769600779568e-05,
"loss": 1.7407,
"step": 240
},
{
"epoch": 0.27216261998870694,
"grad_norm": 0.6635199189186096,
"learning_rate": 8.331495479897373e-05,
"loss": 1.5664,
"step": 241
},
{
"epoch": 0.2732919254658385,
"grad_norm": 0.4444817900657654,
"learning_rate": 8.318178996016253e-05,
"loss": 1.6331,
"step": 242
},
{
"epoch": 0.27442123094297005,
"grad_norm": 0.6371070146560669,
"learning_rate": 8.304820318467427e-05,
"loss": 1.6983,
"step": 243
},
{
"epoch": 0.27555053642010163,
"grad_norm": 0.5904344916343689,
"learning_rate": 8.291419617118646e-05,
"loss": 1.4706,
"step": 244
},
{
"epoch": 0.2766798418972332,
"grad_norm": 0.7313349843025208,
"learning_rate": 8.277977062372031e-05,
"loss": 1.1339,
"step": 245
},
{
"epoch": 0.27780914737436474,
"grad_norm": 0.9247500896453857,
"learning_rate": 8.264492825161909e-05,
"loss": 1.1903,
"step": 246
},
{
"epoch": 0.2789384528514963,
"grad_norm": 0.9339075088500977,
"learning_rate": 8.250967076952635e-05,
"loss": 1.6087,
"step": 247
},
{
"epoch": 0.2800677583286279,
"grad_norm": 1.1781666278839111,
"learning_rate": 8.237399989736414e-05,
"loss": 1.8521,
"step": 248
},
{
"epoch": 0.2811970638057595,
"grad_norm": 1.551308512687683,
"learning_rate": 8.223791736031117e-05,
"loss": 1.3684,
"step": 249
},
{
"epoch": 0.282326369282891,
"grad_norm": 2.004934787750244,
"learning_rate": 8.210142488878078e-05,
"loss": 1.7356,
"step": 250
},
{
"epoch": 0.2834556747600226,
"grad_norm": 0.11847388744354248,
"learning_rate": 8.196452421839911e-05,
"loss": 1.3472,
"step": 251
},
{
"epoch": 0.2845849802371542,
"grad_norm": 0.114841990172863,
"learning_rate": 8.18272170899828e-05,
"loss": 1.5753,
"step": 252
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.16253335773944855,
"learning_rate": 8.168950524951708e-05,
"loss": 1.7908,
"step": 253
},
{
"epoch": 0.2868435911914173,
"grad_norm": 0.15040843188762665,
"learning_rate": 8.155139044813336e-05,
"loss": 1.3829,
"step": 254
},
{
"epoch": 0.28797289666854886,
"grad_norm": 0.19803692400455475,
"learning_rate": 8.141287444208717e-05,
"loss": 1.3891,
"step": 255
},
{
"epoch": 0.2891022021456804,
"grad_norm": 0.1951441764831543,
"learning_rate": 8.127395899273561e-05,
"loss": 1.7411,
"step": 256
},
{
"epoch": 0.29023150762281197,
"grad_norm": 0.1544586420059204,
"learning_rate": 8.113464586651516e-05,
"loss": 1.7334,
"step": 257
},
{
"epoch": 0.29136081309994355,
"grad_norm": 0.1889820396900177,
"learning_rate": 8.099493683491909e-05,
"loss": 1.603,
"step": 258
},
{
"epoch": 0.2924901185770751,
"grad_norm": 0.2836708724498749,
"learning_rate": 8.085483367447498e-05,
"loss": 1.5592,
"step": 259
},
{
"epoch": 0.29361942405420666,
"grad_norm": 0.20887868106365204,
"learning_rate": 8.071433816672204e-05,
"loss": 1.9915,
"step": 260
},
{
"epoch": 0.29474872953133824,
"grad_norm": 0.19964289665222168,
"learning_rate": 8.057345209818868e-05,
"loss": 1.4532,
"step": 261
},
{
"epoch": 0.29587803500846976,
"grad_norm": 0.23576165735721588,
"learning_rate": 8.04321772603695e-05,
"loss": 1.6616,
"step": 262
},
{
"epoch": 0.29700734048560135,
"grad_norm": 0.1948481649160385,
"learning_rate": 8.029051544970274e-05,
"loss": 1.6309,
"step": 263
},
{
"epoch": 0.2981366459627329,
"grad_norm": 0.21681681275367737,
"learning_rate": 8.014846846754734e-05,
"loss": 1.8207,
"step": 264
},
{
"epoch": 0.2992659514398645,
"grad_norm": 0.1701822280883789,
"learning_rate": 8.000603812016002e-05,
"loss": 2.0191,
"step": 265
},
{
"epoch": 0.30039525691699603,
"grad_norm": 0.2877520024776459,
"learning_rate": 7.986322621867237e-05,
"loss": 1.7949,
"step": 266
},
{
"epoch": 0.3015245623941276,
"grad_norm": 0.22249895334243774,
"learning_rate": 7.972003457906773e-05,
"loss": 1.7619,
"step": 267
},
{
"epoch": 0.3026538678712592,
"grad_norm": 0.20995785295963287,
"learning_rate": 7.957646502215826e-05,
"loss": 1.919,
"step": 268
},
{
"epoch": 0.3037831733483907,
"grad_norm": 0.20118004083633423,
"learning_rate": 7.943251937356158e-05,
"loss": 1.9674,
"step": 269
},
{
"epoch": 0.3049124788255223,
"grad_norm": 0.24078606069087982,
"learning_rate": 7.928819946367772e-05,
"loss": 1.8938,
"step": 270
},
{
"epoch": 0.3060417843026539,
"grad_norm": 0.29334747791290283,
"learning_rate": 7.914350712766575e-05,
"loss": 1.8251,
"step": 271
},
{
"epoch": 0.3071710897797854,
"grad_norm": 0.2302616685628891,
"learning_rate": 7.899844420542047e-05,
"loss": 1.9416,
"step": 272
},
{
"epoch": 0.308300395256917,
"grad_norm": 0.3773344159126282,
"learning_rate": 7.885301254154908e-05,
"loss": 1.9136,
"step": 273
},
{
"epoch": 0.3094297007340486,
"grad_norm": 0.2522296607494354,
"learning_rate": 7.870721398534762e-05,
"loss": 2.0552,
"step": 274
},
{
"epoch": 0.3105590062111801,
"grad_norm": 0.2509544789791107,
"learning_rate": 7.856105039077748e-05,
"loss": 2.0108,
"step": 275
},
{
"epoch": 0.3116883116883117,
"grad_norm": 0.2968475818634033,
"learning_rate": 7.841452361644188e-05,
"loss": 2.0004,
"step": 276
},
{
"epoch": 0.31281761716544326,
"grad_norm": 0.2396615594625473,
"learning_rate": 7.826763552556222e-05,
"loss": 1.8284,
"step": 277
},
{
"epoch": 0.31394692264257484,
"grad_norm": 0.22894443571567535,
"learning_rate": 7.812038798595431e-05,
"loss": 2.1324,
"step": 278
},
{
"epoch": 0.31507622811970637,
"grad_norm": 0.27567097544670105,
"learning_rate": 7.797278287000475e-05,
"loss": 1.6882,
"step": 279
},
{
"epoch": 0.31620553359683795,
"grad_norm": 0.28118517994880676,
"learning_rate": 7.782482205464696e-05,
"loss": 1.8576,
"step": 280
},
{
"epoch": 0.31733483907396953,
"grad_norm": 0.2652731239795685,
"learning_rate": 7.767650742133747e-05,
"loss": 1.9145,
"step": 281
},
{
"epoch": 0.31846414455110106,
"grad_norm": 0.29614031314849854,
"learning_rate": 7.752784085603193e-05,
"loss": 1.6665,
"step": 282
},
{
"epoch": 0.31959345002823264,
"grad_norm": 0.3341023921966553,
"learning_rate": 7.737882424916107e-05,
"loss": 1.4237,
"step": 283
},
{
"epoch": 0.3207227555053642,
"grad_norm": 0.3023217022418976,
"learning_rate": 7.722945949560678e-05,
"loss": 1.7299,
"step": 284
},
{
"epoch": 0.32185206098249575,
"grad_norm": 0.30460652709007263,
"learning_rate": 7.707974849467791e-05,
"loss": 1.4939,
"step": 285
},
{
"epoch": 0.32298136645962733,
"grad_norm": 0.3721717894077301,
"learning_rate": 7.692969315008616e-05,
"loss": 1.2991,
"step": 286
},
{
"epoch": 0.3241106719367589,
"grad_norm": 0.31576859951019287,
"learning_rate": 7.677929536992194e-05,
"loss": 1.7151,
"step": 287
},
{
"epoch": 0.32523997741389044,
"grad_norm": 0.3870582580566406,
"learning_rate": 7.662855706662992e-05,
"loss": 1.6205,
"step": 288
},
{
"epoch": 0.326369282891022,
"grad_norm": 0.45515769720077515,
"learning_rate": 7.647748015698495e-05,
"loss": 1.8497,
"step": 289
},
{
"epoch": 0.3274985883681536,
"grad_norm": 0.4135870337486267,
"learning_rate": 7.632606656206748e-05,
"loss": 1.669,
"step": 290
},
{
"epoch": 0.3286278938452851,
"grad_norm": 0.5970892906188965,
"learning_rate": 7.617431820723928e-05,
"loss": 2.045,
"step": 291
},
{
"epoch": 0.3297571993224167,
"grad_norm": 0.560570478439331,
"learning_rate": 7.602223702211888e-05,
"loss": 1.739,
"step": 292
},
{
"epoch": 0.3308865047995483,
"grad_norm": 0.7267481088638306,
"learning_rate": 7.586982494055703e-05,
"loss": 1.2534,
"step": 293
},
{
"epoch": 0.33201581027667987,
"grad_norm": 0.6260027289390564,
"learning_rate": 7.571708390061215e-05,
"loss": 1.326,
"step": 294
},
{
"epoch": 0.3331451157538114,
"grad_norm": 0.637332558631897,
"learning_rate": 7.556401584452565e-05,
"loss": 1.209,
"step": 295
},
{
"epoch": 0.334274421230943,
"grad_norm": 0.6579151153564453,
"learning_rate": 7.541062271869727e-05,
"loss": 1.3,
"step": 296
},
{
"epoch": 0.33540372670807456,
"grad_norm": 0.9693841934204102,
"learning_rate": 7.525690647366032e-05,
"loss": 1.4362,
"step": 297
},
{
"epoch": 0.3365330321852061,
"grad_norm": 1.3491055965423584,
"learning_rate": 7.510286906405679e-05,
"loss": 1.8567,
"step": 298
},
{
"epoch": 0.33766233766233766,
"grad_norm": 1.6843390464782715,
"learning_rate": 7.494851244861265e-05,
"loss": 2.1742,
"step": 299
},
{
"epoch": 0.33879164313946925,
"grad_norm": 2.8176753520965576,
"learning_rate": 7.479383859011282e-05,
"loss": 2.1492,
"step": 300
},
{
"epoch": 0.33992094861660077,
"grad_norm": 0.11592619866132736,
"learning_rate": 7.463884945537629e-05,
"loss": 1.9557,
"step": 301
},
{
"epoch": 0.34105025409373235,
"grad_norm": 0.09616155922412872,
"learning_rate": 7.448354701523103e-05,
"loss": 1.5362,
"step": 302
},
{
"epoch": 0.34217955957086393,
"grad_norm": 0.1662580966949463,
"learning_rate": 7.4327933244489e-05,
"loss": 1.714,
"step": 303
},
{
"epoch": 0.34330886504799546,
"grad_norm": 0.1190900206565857,
"learning_rate": 7.417201012192102e-05,
"loss": 1.7049,
"step": 304
},
{
"epoch": 0.34443817052512704,
"grad_norm": 0.17841818928718567,
"learning_rate": 7.401577963023159e-05,
"loss": 1.4065,
"step": 305
},
{
"epoch": 0.3455674760022586,
"grad_norm": 0.14446666836738586,
"learning_rate": 7.385924375603365e-05,
"loss": 1.9471,
"step": 306
},
{
"epoch": 0.34669678147939015,
"grad_norm": 0.1485661119222641,
"learning_rate": 7.370240448982344e-05,
"loss": 1.536,
"step": 307
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.14800912141799927,
"learning_rate": 7.354526382595502e-05,
"loss": 1.9858,
"step": 308
},
{
"epoch": 0.3489553924336533,
"grad_norm": 0.16698437929153442,
"learning_rate": 7.338782376261508e-05,
"loss": 1.7016,
"step": 309
},
{
"epoch": 0.3500846979107849,
"grad_norm": 0.2107677012681961,
"learning_rate": 7.323008630179735e-05,
"loss": 1.9076,
"step": 310
},
{
"epoch": 0.3512140033879164,
"grad_norm": 0.17815802991390228,
"learning_rate": 7.307205344927733e-05,
"loss": 1.9451,
"step": 311
},
{
"epoch": 0.352343308865048,
"grad_norm": 0.22763967514038086,
"learning_rate": 7.291372721458663e-05,
"loss": 1.7138,
"step": 312
},
{
"epoch": 0.3534726143421796,
"grad_norm": 0.2137778401374817,
"learning_rate": 7.275510961098754e-05,
"loss": 1.7655,
"step": 313
},
{
"epoch": 0.3546019198193111,
"grad_norm": 0.3158370852470398,
"learning_rate": 7.25962026554473e-05,
"loss": 1.7698,
"step": 314
},
{
"epoch": 0.3557312252964427,
"grad_norm": 0.2486497312784195,
"learning_rate": 7.243700836861259e-05,
"loss": 1.4603,
"step": 315
},
{
"epoch": 0.35686053077357427,
"grad_norm": 0.2606787383556366,
"learning_rate": 7.227752877478372e-05,
"loss": 1.8183,
"step": 316
},
{
"epoch": 0.3579898362507058,
"grad_norm": 0.21474474668502808,
"learning_rate": 7.211776590188898e-05,
"loss": 1.8927,
"step": 317
},
{
"epoch": 0.3591191417278374,
"grad_norm": 0.21336178481578827,
"learning_rate": 7.195772178145877e-05,
"loss": 1.7893,
"step": 318
},
{
"epoch": 0.36024844720496896,
"grad_norm": 0.22856034338474274,
"learning_rate": 7.179739844859986e-05,
"loss": 1.8327,
"step": 319
},
{
"epoch": 0.3613777526821005,
"grad_norm": 0.19083748757839203,
"learning_rate": 7.163679794196937e-05,
"loss": 1.4431,
"step": 320
},
{
"epoch": 0.36250705815923207,
"grad_norm": 0.21726277470588684,
"learning_rate": 7.147592230374907e-05,
"loss": 1.8487,
"step": 321
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.2296500951051712,
"learning_rate": 7.131477357961913e-05,
"loss": 1.8114,
"step": 322
},
{
"epoch": 0.36476566911349523,
"grad_norm": 0.21099169552326202,
"learning_rate": 7.115335381873241e-05,
"loss": 1.7014,
"step": 323
},
{
"epoch": 0.36589497459062675,
"grad_norm": 0.2625615894794464,
"learning_rate": 7.09916650736881e-05,
"loss": 1.816,
"step": 324
},
{
"epoch": 0.36702428006775834,
"grad_norm": 0.2657228112220764,
"learning_rate": 7.082970940050589e-05,
"loss": 1.6727,
"step": 325
},
{
"epoch": 0.3681535855448899,
"grad_norm": 0.2254035770893097,
"learning_rate": 7.06674888585996e-05,
"loss": 1.9412,
"step": 326
},
{
"epoch": 0.36928289102202144,
"grad_norm": 0.3340676426887512,
"learning_rate": 7.050500551075121e-05,
"loss": 1.8683,
"step": 327
},
{
"epoch": 0.370412196499153,
"grad_norm": 0.2350630760192871,
"learning_rate": 7.03422614230844e-05,
"loss": 1.7303,
"step": 328
},
{
"epoch": 0.3715415019762846,
"grad_norm": 0.27818697690963745,
"learning_rate": 7.017925866503852e-05,
"loss": 1.642,
"step": 329
},
{
"epoch": 0.37267080745341613,
"grad_norm": 0.2882806062698364,
"learning_rate": 7.001599930934201e-05,
"loss": 1.8531,
"step": 330
},
{
"epoch": 0.3738001129305477,
"grad_norm": 0.2653593122959137,
"learning_rate": 6.985248543198628e-05,
"loss": 1.7041,
"step": 331
},
{
"epoch": 0.3749294184076793,
"grad_norm": 0.34544265270233154,
"learning_rate": 6.96887191121992e-05,
"loss": 1.4506,
"step": 332
},
{
"epoch": 0.3760587238848108,
"grad_norm": 0.33744436502456665,
"learning_rate": 6.952470243241865e-05,
"loss": 1.9651,
"step": 333
},
{
"epoch": 0.3771880293619424,
"grad_norm": 0.3108724057674408,
"learning_rate": 6.936043747826608e-05,
"loss": 1.6384,
"step": 334
},
{
"epoch": 0.378317334839074,
"grad_norm": 0.3719252347946167,
"learning_rate": 6.919592633851999e-05,
"loss": 1.7721,
"step": 335
},
{
"epoch": 0.3794466403162055,
"grad_norm": 0.39266571402549744,
"learning_rate": 6.903117110508931e-05,
"loss": 1.4012,
"step": 336
},
{
"epoch": 0.3805759457933371,
"grad_norm": 0.4938894808292389,
"learning_rate": 6.886617387298689e-05,
"loss": 1.3363,
"step": 337
},
{
"epoch": 0.38170525127046867,
"grad_norm": 0.4643020033836365,
"learning_rate": 6.870093674030277e-05,
"loss": 1.8535,
"step": 338
},
{
"epoch": 0.38283455674760025,
"grad_norm": 0.4194999039173126,
"learning_rate": 6.853546180817763e-05,
"loss": 1.3464,
"step": 339
},
{
"epoch": 0.3839638622247318,
"grad_norm": 0.4396350383758545,
"learning_rate": 6.836975118077585e-05,
"loss": 1.5401,
"step": 340
},
{
"epoch": 0.38509316770186336,
"grad_norm": 0.5123130679130554,
"learning_rate": 6.8203806965259e-05,
"loss": 1.4395,
"step": 341
},
{
"epoch": 0.38622247317899494,
"grad_norm": 0.5199635624885559,
"learning_rate": 6.803763127175892e-05,
"loss": 1.3563,
"step": 342
},
{
"epoch": 0.38735177865612647,
"grad_norm": 0.5113270282745361,
"learning_rate": 6.787122621335084e-05,
"loss": 1.2656,
"step": 343
},
{
"epoch": 0.38848108413325805,
"grad_norm": 0.8492355942726135,
"learning_rate": 6.770459390602665e-05,
"loss": 1.1771,
"step": 344
},
{
"epoch": 0.38961038961038963,
"grad_norm": 0.6425052881240845,
"learning_rate": 6.75377364686679e-05,
"loss": 1.0433,
"step": 345
},
{
"epoch": 0.39073969508752115,
"grad_norm": 0.7178412675857544,
"learning_rate": 6.73706560230188e-05,
"loss": 1.3795,
"step": 346
},
{
"epoch": 0.39186900056465274,
"grad_norm": 0.8413724899291992,
"learning_rate": 6.720335469365943e-05,
"loss": 1.3675,
"step": 347
},
{
"epoch": 0.3929983060417843,
"grad_norm": 1.199455738067627,
"learning_rate": 6.703583460797851e-05,
"loss": 1.1964,
"step": 348
},
{
"epoch": 0.39412761151891584,
"grad_norm": 1.3447051048278809,
"learning_rate": 6.686809789614652e-05,
"loss": 1.7217,
"step": 349
},
{
"epoch": 0.3952569169960474,
"grad_norm": 3.1556925773620605,
"learning_rate": 6.670014669108846e-05,
"loss": 2.2129,
"step": 350
},
{
"epoch": 0.396386222473179,
"grad_norm": 0.12518011033535004,
"learning_rate": 6.65319831284569e-05,
"loss": 1.803,
"step": 351
},
{
"epoch": 0.39751552795031053,
"grad_norm": 0.11627189069986343,
"learning_rate": 6.636360934660464e-05,
"loss": 1.5064,
"step": 352
},
{
"epoch": 0.3986448334274421,
"grad_norm": 0.2026718109846115,
"learning_rate": 6.619502748655768e-05,
"loss": 1.2533,
"step": 353
},
{
"epoch": 0.3997741389045737,
"grad_norm": 0.1420208066701889,
"learning_rate": 6.602623969198786e-05,
"loss": 1.7044,
"step": 354
},
{
"epoch": 0.4009034443817053,
"grad_norm": 0.2128676176071167,
"learning_rate": 6.585724810918575e-05,
"loss": 1.4417,
"step": 355
},
{
"epoch": 0.4020327498588368,
"grad_norm": 0.16009800136089325,
"learning_rate": 6.568805488703316e-05,
"loss": 1.7954,
"step": 356
},
{
"epoch": 0.4031620553359684,
"grad_norm": 0.22958825528621674,
"learning_rate": 6.551866217697602e-05,
"loss": 1.4241,
"step": 357
},
{
"epoch": 0.40429136081309996,
"grad_norm": 0.18111076951026917,
"learning_rate": 6.53490721329969e-05,
"loss": 2.1858,
"step": 358
},
{
"epoch": 0.4054206662902315,
"grad_norm": 0.20647546648979187,
"learning_rate": 6.517928691158766e-05,
"loss": 1.6176,
"step": 359
},
{
"epoch": 0.40654997176736307,
"grad_norm": 0.22477790713310242,
"learning_rate": 6.500930867172197e-05,
"loss": 2.0384,
"step": 360
},
{
"epoch": 0.40767927724449465,
"grad_norm": 0.19441178441047668,
"learning_rate": 6.4839139574828e-05,
"loss": 1.8078,
"step": 361
},
{
"epoch": 0.4088085827216262,
"grad_norm": 0.1936100721359253,
"learning_rate": 6.466878178476072e-05,
"loss": 2.1223,
"step": 362
},
{
"epoch": 0.40993788819875776,
"grad_norm": 0.16981804370880127,
"learning_rate": 6.449823746777463e-05,
"loss": 1.7857,
"step": 363
},
{
"epoch": 0.41106719367588934,
"grad_norm": 0.18102099001407623,
"learning_rate": 6.4327508792496e-05,
"loss": 1.9586,
"step": 364
},
{
"epoch": 0.41219649915302087,
"grad_norm": 0.2525807023048401,
"learning_rate": 6.415659792989543e-05,
"loss": 1.3721,
"step": 365
},
{
"epoch": 0.41332580463015245,
"grad_norm": 0.2487485557794571,
"learning_rate": 6.398550705326017e-05,
"loss": 1.5905,
"step": 366
},
{
"epoch": 0.41445511010728403,
"grad_norm": 0.2146550863981247,
"learning_rate": 6.381423833816653e-05,
"loss": 2.0711,
"step": 367
},
{
"epoch": 0.4155844155844156,
"grad_norm": 0.2551126778125763,
"learning_rate": 6.364279396245216e-05,
"loss": 1.9295,
"step": 368
},
{
"epoch": 0.41671372106154714,
"grad_norm": 0.21161001920700073,
"learning_rate": 6.347117610618847e-05,
"loss": 1.8385,
"step": 369
},
{
"epoch": 0.4178430265386787,
"grad_norm": 0.2629604935646057,
"learning_rate": 6.329938695165279e-05,
"loss": 1.6472,
"step": 370
},
{
"epoch": 0.4189723320158103,
"grad_norm": 0.265480101108551,
"learning_rate": 6.312742868330063e-05,
"loss": 1.9359,
"step": 371
},
{
"epoch": 0.4201016374929418,
"grad_norm": 0.21452881395816803,
"learning_rate": 6.295530348773799e-05,
"loss": 1.9496,
"step": 372
},
{
"epoch": 0.4212309429700734,
"grad_norm": 0.25752097368240356,
"learning_rate": 6.278301355369347e-05,
"loss": 1.7407,
"step": 373
},
{
"epoch": 0.422360248447205,
"grad_norm": 0.2401757687330246,
"learning_rate": 6.26105610719905e-05,
"loss": 1.9668,
"step": 374
},
{
"epoch": 0.4234895539243365,
"grad_norm": 0.2806456983089447,
"learning_rate": 6.243794823551943e-05,
"loss": 1.8072,
"step": 375
},
{
"epoch": 0.4246188594014681,
"grad_norm": 0.22247202694416046,
"learning_rate": 6.226517723920965e-05,
"loss": 1.9812,
"step": 376
},
{
"epoch": 0.4257481648785997,
"grad_norm": 0.28262099623680115,
"learning_rate": 6.209225028000173e-05,
"loss": 1.8815,
"step": 377
},
{
"epoch": 0.4268774703557312,
"grad_norm": 0.26785987615585327,
"learning_rate": 6.191916955681942e-05,
"loss": 1.7281,
"step": 378
},
{
"epoch": 0.4280067758328628,
"grad_norm": 0.26913347840309143,
"learning_rate": 6.174593727054176e-05,
"loss": 1.9988,
"step": 379
},
{
"epoch": 0.42913608130999437,
"grad_norm": 0.2614772617816925,
"learning_rate": 6.157255562397501e-05,
"loss": 2.0497,
"step": 380
},
{
"epoch": 0.4302653867871259,
"grad_norm": 0.28345417976379395,
"learning_rate": 6.139902682182472e-05,
"loss": 1.4859,
"step": 381
},
{
"epoch": 0.4313946922642575,
"grad_norm": 0.38483351469039917,
"learning_rate": 6.122535307066762e-05,
"loss": 1.8345,
"step": 382
},
{
"epoch": 0.43252399774138905,
"grad_norm": 0.3730812966823578,
"learning_rate": 6.105153657892361e-05,
"loss": 1.9294,
"step": 383
},
{
"epoch": 0.43365330321852064,
"grad_norm": 0.300212562084198,
"learning_rate": 6.0877579556827666e-05,
"loss": 1.8527,
"step": 384
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.3289623260498047,
"learning_rate": 6.0703484216401775e-05,
"loss": 1.4542,
"step": 385
},
{
"epoch": 0.43591191417278374,
"grad_norm": 0.33357787132263184,
"learning_rate": 6.0529252771426704e-05,
"loss": 1.6104,
"step": 386
},
{
"epoch": 0.4370412196499153,
"grad_norm": 0.3852611184120178,
"learning_rate": 6.0354887437413965e-05,
"loss": 1.7609,
"step": 387
},
{
"epoch": 0.43817052512704685,
"grad_norm": 0.4012943506240845,
"learning_rate": 6.018039043157755e-05,
"loss": 1.5484,
"step": 388
},
{
"epoch": 0.43929983060417843,
"grad_norm": 0.48133382201194763,
"learning_rate": 6.000576397280582e-05,
"loss": 1.5734,
"step": 389
},
{
"epoch": 0.44042913608131,
"grad_norm": 0.5372048020362854,
"learning_rate": 5.9831010281633194e-05,
"loss": 1.4444,
"step": 390
},
{
"epoch": 0.44155844155844154,
"grad_norm": 0.4447720944881439,
"learning_rate": 5.965613158021204e-05,
"loss": 1.7868,
"step": 391
},
{
"epoch": 0.4426877470355731,
"grad_norm": 0.48003891110420227,
"learning_rate": 5.948113009228426e-05,
"loss": 1.861,
"step": 392
},
{
"epoch": 0.4438170525127047,
"grad_norm": 0.5163267850875854,
"learning_rate": 5.9306008043153164e-05,
"loss": 1.5805,
"step": 393
},
{
"epoch": 0.4449463579898362,
"grad_norm": 0.9821428060531616,
"learning_rate": 5.9130767659655086e-05,
"loss": 1.3601,
"step": 394
},
{
"epoch": 0.4460756634669678,
"grad_norm": 0.8409574627876282,
"learning_rate": 5.895541117013109e-05,
"loss": 1.5384,
"step": 395
},
{
"epoch": 0.4472049689440994,
"grad_norm": 0.7223172187805176,
"learning_rate": 5.877994080439861e-05,
"loss": 1.5718,
"step": 396
},
{
"epoch": 0.4483342744212309,
"grad_norm": 0.7617491483688354,
"learning_rate": 5.860435879372319e-05,
"loss": 0.8487,
"step": 397
},
{
"epoch": 0.4494635798983625,
"grad_norm": 1.3099548816680908,
"learning_rate": 5.842866737078995e-05,
"loss": 1.9787,
"step": 398
},
{
"epoch": 0.4505928853754941,
"grad_norm": 1.2898166179656982,
"learning_rate": 5.825286876967534e-05,
"loss": 1.2648,
"step": 399
},
{
"epoch": 0.45172219085262566,
"grad_norm": 2.04656982421875,
"learning_rate": 5.807696522581867e-05,
"loss": 1.3147,
"step": 400
},
{
"epoch": 0.4528514963297572,
"grad_norm": 0.14516253769397736,
"learning_rate": 5.7900958975993705e-05,
"loss": 1.6677,
"step": 401
},
{
"epoch": 0.45398080180688877,
"grad_norm": 0.11851727962493896,
"learning_rate": 5.772485225828017e-05,
"loss": 1.3764,
"step": 402
},
{
"epoch": 0.45511010728402035,
"grad_norm": 0.1660085767507553,
"learning_rate": 5.754864731203537e-05,
"loss": 0.8836,
"step": 403
},
{
"epoch": 0.4562394127611519,
"grad_norm": 0.1634090393781662,
"learning_rate": 5.737234637786567e-05,
"loss": 1.7064,
"step": 404
},
{
"epoch": 0.45736871823828346,
"grad_norm": 0.1739731878042221,
"learning_rate": 5.7195951697597984e-05,
"loss": 1.4277,
"step": 405
},
{
"epoch": 0.45849802371541504,
"grad_norm": 0.15163052082061768,
"learning_rate": 5.7019465514251317e-05,
"loss": 1.7148,
"step": 406
},
{
"epoch": 0.45962732919254656,
"grad_norm": 0.18431293964385986,
"learning_rate": 5.684289007200819e-05,
"loss": 1.5384,
"step": 407
},
{
"epoch": 0.46075663466967814,
"grad_norm": 0.194001704454422,
"learning_rate": 5.6666227616186196e-05,
"loss": 1.5875,
"step": 408
},
{
"epoch": 0.4618859401468097,
"grad_norm": 0.19858571887016296,
"learning_rate": 5.648948039320932e-05,
"loss": 1.6302,
"step": 409
},
{
"epoch": 0.46301524562394125,
"grad_norm": 0.20067928731441498,
"learning_rate": 5.6312650650579434e-05,
"loss": 1.6761,
"step": 410
},
{
"epoch": 0.46414455110107283,
"grad_norm": 0.20594027638435364,
"learning_rate": 5.61357406368478e-05,
"loss": 2.0831,
"step": 411
},
{
"epoch": 0.4652738565782044,
"grad_norm": 0.20570452511310577,
"learning_rate": 5.595875260158632e-05,
"loss": 1.5535,
"step": 412
},
{
"epoch": 0.466403162055336,
"grad_norm": 0.21128208935260773,
"learning_rate": 5.578168879535905e-05,
"loss": 1.5396,
"step": 413
},
{
"epoch": 0.4675324675324675,
"grad_norm": 0.22810891270637512,
"learning_rate": 5.560455146969351e-05,
"loss": 2.0007,
"step": 414
},
{
"epoch": 0.4686617730095991,
"grad_norm": 0.23436829447746277,
"learning_rate": 5.5427342877052146e-05,
"loss": 1.7637,
"step": 415
},
{
"epoch": 0.4697910784867307,
"grad_norm": 0.20258279144763947,
"learning_rate": 5.525006527080356e-05,
"loss": 1.9177,
"step": 416
},
{
"epoch": 0.4709203839638622,
"grad_norm": 0.3795338571071625,
"learning_rate": 5.5072720905193984e-05,
"loss": 1.933,
"step": 417
},
{
"epoch": 0.4720496894409938,
"grad_norm": 0.2246110886335373,
"learning_rate": 5.4895312035318505e-05,
"loss": 1.8509,
"step": 418
},
{
"epoch": 0.4731789949181254,
"grad_norm": 0.20534281432628632,
"learning_rate": 5.471784091709249e-05,
"loss": 1.7642,
"step": 419
},
{
"epoch": 0.4743083003952569,
"grad_norm": 0.21959885954856873,
"learning_rate": 5.4540309807222787e-05,
"loss": 1.8869,
"step": 420
},
{
"epoch": 0.4754376058723885,
"grad_norm": 0.23442259430885315,
"learning_rate": 5.4362720963179184e-05,
"loss": 1.7901,
"step": 421
},
{
"epoch": 0.47656691134952006,
"grad_norm": 0.2303483933210373,
"learning_rate": 5.418507664316551e-05,
"loss": 1.8651,
"step": 422
},
{
"epoch": 0.4776962168266516,
"grad_norm": 0.23268766701221466,
"learning_rate": 5.40073791060911e-05,
"loss": 1.8254,
"step": 423
},
{
"epoch": 0.47882552230378317,
"grad_norm": 0.5629847049713135,
"learning_rate": 5.382963061154194e-05,
"loss": 1.9735,
"step": 424
},
{
"epoch": 0.47995482778091475,
"grad_norm": 0.3409874141216278,
"learning_rate": 5.3651833419752026e-05,
"loss": 1.9609,
"step": 425
},
{
"epoch": 0.4810841332580463,
"grad_norm": 0.24846181273460388,
"learning_rate": 5.347398979157455e-05,
"loss": 1.5292,
"step": 426
},
{
"epoch": 0.48221343873517786,
"grad_norm": 0.32662272453308105,
"learning_rate": 5.329610198845322e-05,
"loss": 1.5789,
"step": 427
},
{
"epoch": 0.48334274421230944,
"grad_norm": 0.2438475340604782,
"learning_rate": 5.311817227239343e-05,
"loss": 1.5546,
"step": 428
},
{
"epoch": 0.484472049689441,
"grad_norm": 0.3485732972621918,
"learning_rate": 5.2940202905933576e-05,
"loss": 1.9349,
"step": 429
},
{
"epoch": 0.48560135516657255,
"grad_norm": 0.30109038949012756,
"learning_rate": 5.276219615211622e-05,
"loss": 1.7651,
"step": 430
},
{
"epoch": 0.4867306606437041,
"grad_norm": 0.29728347063064575,
"learning_rate": 5.258415427445933e-05,
"loss": 1.7196,
"step": 431
},
{
"epoch": 0.4878599661208357,
"grad_norm": 0.3463418483734131,
"learning_rate": 5.240607953692751e-05,
"loss": 1.4324,
"step": 432
},
{
"epoch": 0.48898927159796723,
"grad_norm": 0.28663697838783264,
"learning_rate": 5.222797420390325e-05,
"loss": 1.6675,
"step": 433
},
{
"epoch": 0.4901185770750988,
"grad_norm": 0.3074226379394531,
"learning_rate": 5.204984054015803e-05,
"loss": 1.503,
"step": 434
},
{
"epoch": 0.4912478825522304,
"grad_norm": 0.3598564863204956,
"learning_rate": 5.187168081082361e-05,
"loss": 1.5335,
"step": 435
},
{
"epoch": 0.4923771880293619,
"grad_norm": 0.3530828654766083,
"learning_rate": 5.169349728136319e-05,
"loss": 1.3829,
"step": 436
},
{
"epoch": 0.4935064935064935,
"grad_norm": 0.42433059215545654,
"learning_rate": 5.151529221754262e-05,
"loss": 1.5466,
"step": 437
},
{
"epoch": 0.4946357989836251,
"grad_norm": 0.37672513723373413,
"learning_rate": 5.133706788540157e-05,
"loss": 1.5813,
"step": 438
},
{
"epoch": 0.4957651044607566,
"grad_norm": 0.5108361840248108,
"learning_rate": 5.1158826551224736e-05,
"loss": 1.8655,
"step": 439
},
{
"epoch": 0.4968944099378882,
"grad_norm": 0.43977028131484985,
"learning_rate": 5.098057048151298e-05,
"loss": 1.6398,
"step": 440
},
{
"epoch": 0.4980237154150198,
"grad_norm": 0.5553740859031677,
"learning_rate": 5.0802301942954586e-05,
"loss": 1.6456,
"step": 441
},
{
"epoch": 0.4991530208921513,
"grad_norm": 0.4294775724411011,
"learning_rate": 5.0624023202396346e-05,
"loss": 1.5727,
"step": 442
},
{
"epoch": 0.5002823263692829,
"grad_norm": 0.5539741516113281,
"learning_rate": 5.0445736526814814e-05,
"loss": 1.0095,
"step": 443
},
{
"epoch": 0.5014116318464145,
"grad_norm": 0.7881059646606445,
"learning_rate": 5.026744418328741e-05,
"loss": 1.175,
"step": 444
},
{
"epoch": 0.502540937323546,
"grad_norm": 0.5578503608703613,
"learning_rate": 5.0089148438963664e-05,
"loss": 0.7371,
"step": 445
},
{
"epoch": 0.5036702428006776,
"grad_norm": 0.6809929609298706,
"learning_rate": 4.991085156103635e-05,
"loss": 1.0037,
"step": 446
},
{
"epoch": 0.5047995482778092,
"grad_norm": 0.6220012307167053,
"learning_rate": 4.97325558167126e-05,
"loss": 1.1134,
"step": 447
},
{
"epoch": 0.5059288537549407,
"grad_norm": 1.3821215629577637,
"learning_rate": 4.955426347318521e-05,
"loss": 1.3825,
"step": 448
},
{
"epoch": 0.5070581592320723,
"grad_norm": 1.268893837928772,
"learning_rate": 4.9375976797603666e-05,
"loss": 1.0811,
"step": 449
},
{
"epoch": 0.5081874647092038,
"grad_norm": 3.948934555053711,
"learning_rate": 4.9197698057045426e-05,
"loss": 1.8798,
"step": 450
},
{
"epoch": 0.5093167701863354,
"grad_norm": 0.11119662970304489,
"learning_rate": 4.9019429518487034e-05,
"loss": 1.6768,
"step": 451
},
{
"epoch": 0.510446075663467,
"grad_norm": 0.12353526800870895,
"learning_rate": 4.884117344877528e-05,
"loss": 1.3001,
"step": 452
},
{
"epoch": 0.5115753811405985,
"grad_norm": 0.17551442980766296,
"learning_rate": 4.866293211459844e-05,
"loss": 1.1936,
"step": 453
},
{
"epoch": 0.51270468661773,
"grad_norm": 0.16263023018836975,
"learning_rate": 4.8484707782457384e-05,
"loss": 1.7866,
"step": 454
},
{
"epoch": 0.5138339920948617,
"grad_norm": 0.16919438540935516,
"learning_rate": 4.830650271863681e-05,
"loss": 1.479,
"step": 455
},
{
"epoch": 0.5149632975719932,
"grad_norm": 0.1805877834558487,
"learning_rate": 4.812831918917641e-05,
"loss": 1.4336,
"step": 456
},
{
"epoch": 0.5160926030491247,
"grad_norm": 0.2215701937675476,
"learning_rate": 4.795015945984198e-05,
"loss": 1.7715,
"step": 457
},
{
"epoch": 0.5172219085262564,
"grad_norm": 0.2349424511194229,
"learning_rate": 4.777202579609676e-05,
"loss": 2.181,
"step": 458
},
{
"epoch": 0.5183512140033879,
"grad_norm": 0.21069027483463287,
"learning_rate": 4.7593920463072485e-05,
"loss": 1.8312,
"step": 459
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.23758326470851898,
"learning_rate": 4.74158457255407e-05,
"loss": 1.7325,
"step": 460
},
{
"epoch": 0.5206098249576511,
"grad_norm": 0.19673678278923035,
"learning_rate": 4.72378038478838e-05,
"loss": 1.6742,
"step": 461
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.22907458245754242,
"learning_rate": 4.705979709406643e-05,
"loss": 1.6255,
"step": 462
},
{
"epoch": 0.5228684359119141,
"grad_norm": 0.2475663274526596,
"learning_rate": 4.688182772760656e-05,
"loss": 1.5928,
"step": 463
},
{
"epoch": 0.5239977413890458,
"grad_norm": 0.18993178009986877,
"learning_rate": 4.67038980115468e-05,
"loss": 2.0439,
"step": 464
},
{
"epoch": 0.5251270468661773,
"grad_norm": 0.19921617209911346,
"learning_rate": 4.652601020842546e-05,
"loss": 1.8901,
"step": 465
},
{
"epoch": 0.5262563523433089,
"grad_norm": 0.1859847903251648,
"learning_rate": 4.6348166580247986e-05,
"loss": 1.7906,
"step": 466
},
{
"epoch": 0.5273856578204404,
"grad_norm": 0.2471705973148346,
"learning_rate": 4.617036938845806e-05,
"loss": 1.541,
"step": 467
},
{
"epoch": 0.528514963297572,
"grad_norm": 0.18537718057632446,
"learning_rate": 4.599262089390892e-05,
"loss": 1.7591,
"step": 468
},
{
"epoch": 0.5296442687747036,
"grad_norm": 0.20029859244823456,
"learning_rate": 4.5814923356834507e-05,
"loss": 1.8927,
"step": 469
},
{
"epoch": 0.5307735742518351,
"grad_norm": 0.19296778738498688,
"learning_rate": 4.563727903682083e-05,
"loss": 1.724,
"step": 470
},
{
"epoch": 0.5319028797289667,
"grad_norm": 0.22295331954956055,
"learning_rate": 4.5459690192777205e-05,
"loss": 1.837,
"step": 471
},
{
"epoch": 0.5330321852060983,
"grad_norm": 0.22168877720832825,
"learning_rate": 4.528215908290753e-05,
"loss": 1.6778,
"step": 472
},
{
"epoch": 0.5341614906832298,
"grad_norm": 0.20707228779792786,
"learning_rate": 4.510468796468151e-05,
"loss": 1.9187,
"step": 473
},
{
"epoch": 0.5352907961603613,
"grad_norm": 0.2480616569519043,
"learning_rate": 4.492727909480603e-05,
"loss": 1.7309,
"step": 474
},
{
"epoch": 0.536420101637493,
"grad_norm": 0.2842020094394684,
"learning_rate": 4.4749934729196444e-05,
"loss": 1.8762,
"step": 475
},
{
"epoch": 0.5375494071146245,
"grad_norm": 0.2506377696990967,
"learning_rate": 4.457265712294787e-05,
"loss": 1.7255,
"step": 476
},
{
"epoch": 0.538678712591756,
"grad_norm": 0.3282622992992401,
"learning_rate": 4.43954485303065e-05,
"loss": 1.8469,
"step": 477
},
{
"epoch": 0.5398080180688877,
"grad_norm": 0.22683817148208618,
"learning_rate": 4.4218311204640964e-05,
"loss": 1.8733,
"step": 478
},
{
"epoch": 0.5409373235460192,
"grad_norm": 0.3191218972206116,
"learning_rate": 4.404124739841368e-05,
"loss": 1.98,
"step": 479
},
{
"epoch": 0.5420666290231507,
"grad_norm": 0.2846674621105194,
"learning_rate": 4.386425936315221e-05,
"loss": 1.38,
"step": 480
},
{
"epoch": 0.5431959345002824,
"grad_norm": 0.31951338052749634,
"learning_rate": 4.368734934942057e-05,
"loss": 1.6798,
"step": 481
},
{
"epoch": 0.5443252399774139,
"grad_norm": 0.27403882145881653,
"learning_rate": 4.35105196067907e-05,
"loss": 1.3672,
"step": 482
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.36696842312812805,
"learning_rate": 4.333377238381381e-05,
"loss": 1.7081,
"step": 483
},
{
"epoch": 0.546583850931677,
"grad_norm": 0.3341814875602722,
"learning_rate": 4.315710992799182e-05,
"loss": 1.6637,
"step": 484
},
{
"epoch": 0.5477131564088086,
"grad_norm": 0.381578654050827,
"learning_rate": 4.29805344857487e-05,
"loss": 1.82,
"step": 485
},
{
"epoch": 0.5488424618859401,
"grad_norm": 0.47679901123046875,
"learning_rate": 4.280404830240202e-05,
"loss": 1.5453,
"step": 486
},
{
"epoch": 0.5499717673630717,
"grad_norm": 0.2999393045902252,
"learning_rate": 4.2627653622134346e-05,
"loss": 1.4535,
"step": 487
},
{
"epoch": 0.5511010728402033,
"grad_norm": 0.39911600947380066,
"learning_rate": 4.245135268796464e-05,
"loss": 1.526,
"step": 488
},
{
"epoch": 0.5522303783173348,
"grad_norm": 0.4650627672672272,
"learning_rate": 4.2275147741719836e-05,
"loss": 1.8196,
"step": 489
},
{
"epoch": 0.5533596837944664,
"grad_norm": 0.4915354549884796,
"learning_rate": 4.20990410240063e-05,
"loss": 1.5382,
"step": 490
},
{
"epoch": 0.554488989271598,
"grad_norm": 0.5710887312889099,
"learning_rate": 4.192303477418132e-05,
"loss": 1.8352,
"step": 491
},
{
"epoch": 0.5556182947487295,
"grad_norm": 0.7017544507980347,
"learning_rate": 4.1747131230324674e-05,
"loss": 1.5447,
"step": 492
},
{
"epoch": 0.5567476002258611,
"grad_norm": 0.5755261182785034,
"learning_rate": 4.157133262921007e-05,
"loss": 1.0984,
"step": 493
},
{
"epoch": 0.5578769057029926,
"grad_norm": 0.5910277366638184,
"learning_rate": 4.139564120627682e-05,
"loss": 1.2644,
"step": 494
},
{
"epoch": 0.5590062111801242,
"grad_norm": 0.7593830227851868,
"learning_rate": 4.122005919560138e-05,
"loss": 1.0888,
"step": 495
},
{
"epoch": 0.5601355166572558,
"grad_norm": 0.9940705895423889,
"learning_rate": 4.104458882986893e-05,
"loss": 1.3263,
"step": 496
},
{
"epoch": 0.5612648221343873,
"grad_norm": 0.9789013862609863,
"learning_rate": 4.086923234034493e-05,
"loss": 2.0872,
"step": 497
},
{
"epoch": 0.562394127611519,
"grad_norm": 1.05492103099823,
"learning_rate": 4.069399195684684e-05,
"loss": 1.5978,
"step": 498
},
{
"epoch": 0.5635234330886505,
"grad_norm": 1.5677920579910278,
"learning_rate": 4.051886990771575e-05,
"loss": 1.4674,
"step": 499
},
{
"epoch": 0.564652738565782,
"grad_norm": 2.827597141265869,
"learning_rate": 4.034386841978799e-05,
"loss": 1.6893,
"step": 500
},
{
"epoch": 0.5657820440429137,
"grad_norm": 0.11306215822696686,
"learning_rate": 4.016898971836682e-05,
"loss": 1.8998,
"step": 501
},
{
"epoch": 0.5669113495200452,
"grad_norm": 0.12952418625354767,
"learning_rate": 3.999423602719419e-05,
"loss": 1.4633,
"step": 502
},
{
"epoch": 0.5680406549971767,
"grad_norm": 0.1617014855146408,
"learning_rate": 3.9819609568422444e-05,
"loss": 1.4874,
"step": 503
},
{
"epoch": 0.5691699604743083,
"grad_norm": 0.24738018214702606,
"learning_rate": 3.964511256258605e-05,
"loss": 1.2834,
"step": 504
},
{
"epoch": 0.5702992659514399,
"grad_norm": 0.15966880321502686,
"learning_rate": 3.94707472285733e-05,
"loss": 1.913,
"step": 505
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.2285860925912857,
"learning_rate": 3.929651578359823e-05,
"loss": 1.1719,
"step": 506
},
{
"epoch": 0.572557876905703,
"grad_norm": 0.149735227227211,
"learning_rate": 3.912242044317233e-05,
"loss": 1.6384,
"step": 507
},
{
"epoch": 0.5736871823828346,
"grad_norm": 0.1617288887500763,
"learning_rate": 3.8948463421076416e-05,
"loss": 1.7541,
"step": 508
},
{
"epoch": 0.5748164878599661,
"grad_norm": 0.20702283084392548,
"learning_rate": 3.87746469293324e-05,
"loss": 1.3209,
"step": 509
},
{
"epoch": 0.5759457933370977,
"grad_norm": 0.21300503611564636,
"learning_rate": 3.860097317817529e-05,
"loss": 1.2512,
"step": 510
},
{
"epoch": 0.5770750988142292,
"grad_norm": 0.18188899755477905,
"learning_rate": 3.842744437602498e-05,
"loss": 2.1871,
"step": 511
},
{
"epoch": 0.5782044042913608,
"grad_norm": 0.21721036732196808,
"learning_rate": 3.825406272945825e-05,
"loss": 1.8422,
"step": 512
},
{
"epoch": 0.5793337097684924,
"grad_norm": 0.19083920121192932,
"learning_rate": 3.8080830443180586e-05,
"loss": 1.6101,
"step": 513
},
{
"epoch": 0.5804630152456239,
"grad_norm": 0.2515491545200348,
"learning_rate": 3.7907749719998283e-05,
"loss": 1.2861,
"step": 514
},
{
"epoch": 0.5815923207227555,
"grad_norm": 0.16136078536510468,
"learning_rate": 3.773482276079035e-05,
"loss": 1.9461,
"step": 515
},
{
"epoch": 0.5827216261998871,
"grad_norm": 0.212098628282547,
"learning_rate": 3.7562051764480584e-05,
"loss": 2.0323,
"step": 516
},
{
"epoch": 0.5838509316770186,
"grad_norm": 0.28270667791366577,
"learning_rate": 3.738943892800951e-05,
"loss": 1.5369,
"step": 517
},
{
"epoch": 0.5849802371541502,
"grad_norm": 0.22667361795902252,
"learning_rate": 3.721698644630653e-05,
"loss": 1.7268,
"step": 518
},
{
"epoch": 0.5861095426312818,
"grad_norm": 0.23249778151512146,
"learning_rate": 3.704469651226202e-05,
"loss": 2.135,
"step": 519
},
{
"epoch": 0.5872388481084133,
"grad_norm": 0.2777628004550934,
"learning_rate": 3.687257131669939e-05,
"loss": 1.4989,
"step": 520
},
{
"epoch": 0.5883681535855448,
"grad_norm": 0.2231837958097458,
"learning_rate": 3.6700613048347226e-05,
"loss": 1.761,
"step": 521
},
{
"epoch": 0.5894974590626765,
"grad_norm": 0.2245432585477829,
"learning_rate": 3.652882389381154e-05,
"loss": 1.5851,
"step": 522
},
{
"epoch": 0.590626764539808,
"grad_norm": 0.2458360344171524,
"learning_rate": 3.635720603754785e-05,
"loss": 1.8387,
"step": 523
},
{
"epoch": 0.5917560700169395,
"grad_norm": 0.2425604611635208,
"learning_rate": 3.61857616618335e-05,
"loss": 1.984,
"step": 524
},
{
"epoch": 0.5928853754940712,
"grad_norm": 0.2515004575252533,
"learning_rate": 3.601449294673984e-05,
"loss": 1.5634,
"step": 525
},
{
"epoch": 0.5940146809712027,
"grad_norm": 0.29173892736434937,
"learning_rate": 3.5843402070104575e-05,
"loss": 2.1866,
"step": 526
},
{
"epoch": 0.5951439864483343,
"grad_norm": 0.2624877691268921,
"learning_rate": 3.5672491207504e-05,
"loss": 1.9706,
"step": 527
},
{
"epoch": 0.5962732919254659,
"grad_norm": 0.2421773225069046,
"learning_rate": 3.550176253222538e-05,
"loss": 1.8329,
"step": 528
},
{
"epoch": 0.5974025974025974,
"grad_norm": 0.24929580092430115,
"learning_rate": 3.533121821523928e-05,
"loss": 1.9352,
"step": 529
},
{
"epoch": 0.598531902879729,
"grad_norm": 0.28088921308517456,
"learning_rate": 3.516086042517202e-05,
"loss": 2.0107,
"step": 530
},
{
"epoch": 0.5996612083568605,
"grad_norm": 0.28900331258773804,
"learning_rate": 3.4990691328278026e-05,
"loss": 1.6626,
"step": 531
},
{
"epoch": 0.6007905138339921,
"grad_norm": 0.3050013780593872,
"learning_rate": 3.482071308841237e-05,
"loss": 1.3869,
"step": 532
},
{
"epoch": 0.6019198193111237,
"grad_norm": 0.2838362157344818,
"learning_rate": 3.4650927867003116e-05,
"loss": 1.7871,
"step": 533
},
{
"epoch": 0.6030491247882552,
"grad_norm": 0.3072238564491272,
"learning_rate": 3.448133782302399e-05,
"loss": 1.7517,
"step": 534
},
{
"epoch": 0.6041784302653868,
"grad_norm": 0.37723538279533386,
"learning_rate": 3.431194511296685e-05,
"loss": 1.6763,
"step": 535
},
{
"epoch": 0.6053077357425184,
"grad_norm": 0.35179805755615234,
"learning_rate": 3.4142751890814285e-05,
"loss": 1.4846,
"step": 536
},
{
"epoch": 0.6064370412196499,
"grad_norm": 0.3520725667476654,
"learning_rate": 3.397376030801215e-05,
"loss": 1.4679,
"step": 537
},
{
"epoch": 0.6075663466967814,
"grad_norm": 0.4937370717525482,
"learning_rate": 3.380497251344233e-05,
"loss": 1.369,
"step": 538
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.5829998254776001,
"learning_rate": 3.363639065339536e-05,
"loss": 1.4504,
"step": 539
},
{
"epoch": 0.6098249576510446,
"grad_norm": 0.5117580890655518,
"learning_rate": 3.346801687154312e-05,
"loss": 1.6758,
"step": 540
},
{
"epoch": 0.6109542631281761,
"grad_norm": 0.5459957122802734,
"learning_rate": 3.329985330891154e-05,
"loss": 1.4033,
"step": 541
},
{
"epoch": 0.6120835686053078,
"grad_norm": 0.6639454364776611,
"learning_rate": 3.3131902103853496e-05,
"loss": 1.2722,
"step": 542
},
{
"epoch": 0.6132128740824393,
"grad_norm": 0.6231001019477844,
"learning_rate": 3.296416539202149e-05,
"loss": 1.5103,
"step": 543
},
{
"epoch": 0.6143421795595708,
"grad_norm": 0.6634608507156372,
"learning_rate": 3.279664530634059e-05,
"loss": 1.4552,
"step": 544
},
{
"epoch": 0.6154714850367025,
"grad_norm": 0.7582663893699646,
"learning_rate": 3.2629343976981205e-05,
"loss": 1.6744,
"step": 545
},
{
"epoch": 0.616600790513834,
"grad_norm": 0.8784703016281128,
"learning_rate": 3.2462263531332114e-05,
"loss": 1.0153,
"step": 546
},
{
"epoch": 0.6177300959909655,
"grad_norm": 0.9141214489936829,
"learning_rate": 3.229540609397334e-05,
"loss": 1.8154,
"step": 547
},
{
"epoch": 0.6188594014680971,
"grad_norm": 1.0494526624679565,
"learning_rate": 3.212877378664917e-05,
"loss": 1.2725,
"step": 548
},
{
"epoch": 0.6199887069452287,
"grad_norm": 1.2754578590393066,
"learning_rate": 3.19623687282411e-05,
"loss": 1.3594,
"step": 549
},
{
"epoch": 0.6211180124223602,
"grad_norm": 3.0668442249298096,
"learning_rate": 3.1796193034740995e-05,
"loss": 2.2957,
"step": 550
},
{
"epoch": 0.6222473178994918,
"grad_norm": 0.14963117241859436,
"learning_rate": 3.163024881922415e-05,
"loss": 1.7012,
"step": 551
},
{
"epoch": 0.6233766233766234,
"grad_norm": 0.19053514301776886,
"learning_rate": 3.1464538191822395e-05,
"loss": 1.471,
"step": 552
},
{
"epoch": 0.6245059288537549,
"grad_norm": 0.14380377531051636,
"learning_rate": 3.1299063259697224e-05,
"loss": 1.3501,
"step": 553
},
{
"epoch": 0.6256352343308865,
"grad_norm": 0.13697926700115204,
"learning_rate": 3.113382612701312e-05,
"loss": 1.822,
"step": 554
},
{
"epoch": 0.626764539808018,
"grad_norm": 0.1762678027153015,
"learning_rate": 3.0968828894910696e-05,
"loss": 1.4053,
"step": 555
},
{
"epoch": 0.6278938452851497,
"grad_norm": 0.19729723036289215,
"learning_rate": 3.0804073661480024e-05,
"loss": 1.2143,
"step": 556
},
{
"epoch": 0.6290231507622812,
"grad_norm": 0.1666150987148285,
"learning_rate": 3.0639562521733935e-05,
"loss": 2.0227,
"step": 557
},
{
"epoch": 0.6301524562394127,
"grad_norm": 0.19616784155368805,
"learning_rate": 3.0475297567581363e-05,
"loss": 1.8752,
"step": 558
},
{
"epoch": 0.6312817617165444,
"grad_norm": 0.17856550216674805,
"learning_rate": 3.0311280887800807e-05,
"loss": 1.8782,
"step": 559
},
{
"epoch": 0.6324110671936759,
"grad_norm": 0.2001560926437378,
"learning_rate": 3.0147514568013736e-05,
"loss": 1.6296,
"step": 560
},
{
"epoch": 0.6335403726708074,
"grad_norm": 0.18400637805461884,
"learning_rate": 2.9984000690658003e-05,
"loss": 1.9136,
"step": 561
},
{
"epoch": 0.6346696781479391,
"grad_norm": 0.3142743408679962,
"learning_rate": 2.98207413349615e-05,
"loss": 1.7852,
"step": 562
},
{
"epoch": 0.6357989836250706,
"grad_norm": 0.21863651275634766,
"learning_rate": 2.9657738576915593e-05,
"loss": 2.0031,
"step": 563
},
{
"epoch": 0.6369282891022021,
"grad_norm": 0.20468103885650635,
"learning_rate": 2.9494994489248807e-05,
"loss": 1.7731,
"step": 564
},
{
"epoch": 0.6380575945793338,
"grad_norm": 0.17776687443256378,
"learning_rate": 2.9332511141400405e-05,
"loss": 1.6393,
"step": 565
},
{
"epoch": 0.6391869000564653,
"grad_norm": 0.200006902217865,
"learning_rate": 2.917029059949413e-05,
"loss": 1.7989,
"step": 566
},
{
"epoch": 0.6403162055335968,
"grad_norm": 0.20339208841323853,
"learning_rate": 2.900833492631191e-05,
"loss": 1.7774,
"step": 567
},
{
"epoch": 0.6414455110107284,
"grad_norm": 0.21164321899414062,
"learning_rate": 2.8846646181267617e-05,
"loss": 2.0764,
"step": 568
},
{
"epoch": 0.64257481648786,
"grad_norm": 0.2051491141319275,
"learning_rate": 2.8685226420380885e-05,
"loss": 1.985,
"step": 569
},
{
"epoch": 0.6437041219649915,
"grad_norm": 0.27175769209861755,
"learning_rate": 2.852407769625095e-05,
"loss": 1.9218,
"step": 570
},
{
"epoch": 0.6448334274421231,
"grad_norm": 0.23221158981323242,
"learning_rate": 2.8363202058030636e-05,
"loss": 1.8568,
"step": 571
},
{
"epoch": 0.6459627329192547,
"grad_norm": 0.2595285177230835,
"learning_rate": 2.8202601551400176e-05,
"loss": 1.609,
"step": 572
},
{
"epoch": 0.6470920383963862,
"grad_norm": 0.20131221413612366,
"learning_rate": 2.804227821854125e-05,
"loss": 1.7826,
"step": 573
},
{
"epoch": 0.6482213438735178,
"grad_norm": 0.20102736353874207,
"learning_rate": 2.7882234098111025e-05,
"loss": 1.6074,
"step": 574
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.3271494209766388,
"learning_rate": 2.7722471225216284e-05,
"loss": 2.046,
"step": 575
},
{
"epoch": 0.6504799548277809,
"grad_norm": 0.2570958137512207,
"learning_rate": 2.7562991631387424e-05,
"loss": 1.8585,
"step": 576
},
{
"epoch": 0.6516092603049125,
"grad_norm": 0.2624165117740631,
"learning_rate": 2.7403797344552696e-05,
"loss": 1.9614,
"step": 577
},
{
"epoch": 0.652738565782044,
"grad_norm": 0.2565469741821289,
"learning_rate": 2.7244890389012468e-05,
"loss": 2.0746,
"step": 578
},
{
"epoch": 0.6538678712591756,
"grad_norm": 0.2802393138408661,
"learning_rate": 2.708627278541337e-05,
"loss": 1.8132,
"step": 579
},
{
"epoch": 0.6549971767363072,
"grad_norm": 0.31447547674179077,
"learning_rate": 2.692794655072268e-05,
"loss": 1.9015,
"step": 580
},
{
"epoch": 0.6561264822134387,
"grad_norm": 0.3330397605895996,
"learning_rate": 2.6769913698202646e-05,
"loss": 1.5298,
"step": 581
},
{
"epoch": 0.6572557876905702,
"grad_norm": 0.2743810713291168,
"learning_rate": 2.6612176237384934e-05,
"loss": 1.7217,
"step": 582
},
{
"epoch": 0.6583850931677019,
"grad_norm": 0.3404594659805298,
"learning_rate": 2.645473617404498e-05,
"loss": 1.8576,
"step": 583
},
{
"epoch": 0.6595143986448334,
"grad_norm": 0.2993510365486145,
"learning_rate": 2.629759551017658e-05,
"loss": 1.5187,
"step": 584
},
{
"epoch": 0.6606437041219649,
"grad_norm": 0.28750112652778625,
"learning_rate": 2.6140756243966348e-05,
"loss": 1.9287,
"step": 585
},
{
"epoch": 0.6617730095990966,
"grad_norm": 0.39892521500587463,
"learning_rate": 2.598422036976843e-05,
"loss": 1.5573,
"step": 586
},
{
"epoch": 0.6629023150762281,
"grad_norm": 0.40390318632125854,
"learning_rate": 2.5827989878078972e-05,
"loss": 1.6552,
"step": 587
},
{
"epoch": 0.6640316205533597,
"grad_norm": 0.49753913283348083,
"learning_rate": 2.5672066755511015e-05,
"loss": 1.5754,
"step": 588
},
{
"epoch": 0.6651609260304913,
"grad_norm": 0.4602469205856323,
"learning_rate": 2.5516452984768972e-05,
"loss": 1.7824,
"step": 589
},
{
"epoch": 0.6662902315076228,
"grad_norm": 0.46448126435279846,
"learning_rate": 2.536115054462372e-05,
"loss": 1.2975,
"step": 590
},
{
"epoch": 0.6674195369847544,
"grad_norm": 0.5113171339035034,
"learning_rate": 2.5206161409887164e-05,
"loss": 1.3596,
"step": 591
},
{
"epoch": 0.668548842461886,
"grad_norm": 0.6399116516113281,
"learning_rate": 2.5051487551387366e-05,
"loss": 1.5185,
"step": 592
},
{
"epoch": 0.6696781479390175,
"grad_norm": 0.530022919178009,
"learning_rate": 2.4897130935943215e-05,
"loss": 1.6079,
"step": 593
},
{
"epoch": 0.6708074534161491,
"grad_norm": 0.7399399280548096,
"learning_rate": 2.4743093526339695e-05,
"loss": 1.6957,
"step": 594
},
{
"epoch": 0.6719367588932806,
"grad_norm": 0.9889739155769348,
"learning_rate": 2.458937728130271e-05,
"loss": 1.6572,
"step": 595
},
{
"epoch": 0.6730660643704122,
"grad_norm": 0.7296352982521057,
"learning_rate": 2.4435984155474362e-05,
"loss": 1.4702,
"step": 596
},
{
"epoch": 0.6741953698475438,
"grad_norm": 0.6716480851173401,
"learning_rate": 2.428291609938786e-05,
"loss": 1.6475,
"step": 597
},
{
"epoch": 0.6753246753246753,
"grad_norm": 0.9753092527389526,
"learning_rate": 2.4130175059442983e-05,
"loss": 1.6509,
"step": 598
},
{
"epoch": 0.6764539808018069,
"grad_norm": 1.042049765586853,
"learning_rate": 2.397776297788112e-05,
"loss": 1.9047,
"step": 599
},
{
"epoch": 0.6775832862789385,
"grad_norm": 1.636167287826538,
"learning_rate": 2.382568179276074e-05,
"loss": 1.5812,
"step": 600
},
{
"epoch": 0.67871259175607,
"grad_norm": 0.10443209111690521,
"learning_rate": 2.367393343793253e-05,
"loss": 1.6011,
"step": 601
},
{
"epoch": 0.6798418972332015,
"grad_norm": 0.11894982308149338,
"learning_rate": 2.352251984301508e-05,
"loss": 1.1336,
"step": 602
},
{
"epoch": 0.6809712027103332,
"grad_norm": 0.15143415331840515,
"learning_rate": 2.337144293337008e-05,
"loss": 1.4442,
"step": 603
},
{
"epoch": 0.6821005081874647,
"grad_norm": 0.17936274409294128,
"learning_rate": 2.3220704630078093e-05,
"loss": 1.5153,
"step": 604
},
{
"epoch": 0.6832298136645962,
"grad_norm": 0.19904710352420807,
"learning_rate": 2.3070306849913843e-05,
"loss": 1.4951,
"step": 605
},
{
"epoch": 0.6843591191417279,
"grad_norm": 0.20843562483787537,
"learning_rate": 2.292025150532211e-05,
"loss": 1.4662,
"step": 606
},
{
"epoch": 0.6854884246188594,
"grad_norm": 0.1950257271528244,
"learning_rate": 2.2770540504393224e-05,
"loss": 1.9538,
"step": 607
},
{
"epoch": 0.6866177300959909,
"grad_norm": 0.15244010090827942,
"learning_rate": 2.2621175750838954e-05,
"loss": 1.801,
"step": 608
},
{
"epoch": 0.6877470355731226,
"grad_norm": 0.2256423383951187,
"learning_rate": 2.2472159143968085e-05,
"loss": 1.2293,
"step": 609
},
{
"epoch": 0.6888763410502541,
"grad_norm": 0.24459119141101837,
"learning_rate": 2.232349257866254e-05,
"loss": 1.7217,
"step": 610
},
{
"epoch": 0.6900056465273856,
"grad_norm": 0.19310763478279114,
"learning_rate": 2.217517794535305e-05,
"loss": 1.9453,
"step": 611
},
{
"epoch": 0.6911349520045172,
"grad_norm": 0.19308945536613464,
"learning_rate": 2.2027217129995266e-05,
"loss": 1.6961,
"step": 612
},
{
"epoch": 0.6922642574816488,
"grad_norm": 0.1873122751712799,
"learning_rate": 2.1879612014045693e-05,
"loss": 1.6995,
"step": 613
},
{
"epoch": 0.6933935629587803,
"grad_norm": 0.18931369483470917,
"learning_rate": 2.1732364474437794e-05,
"loss": 1.9309,
"step": 614
},
{
"epoch": 0.6945228684359119,
"grad_norm": 0.25303930044174194,
"learning_rate": 2.158547638355811e-05,
"loss": 1.7571,
"step": 615
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.22161240875720978,
"learning_rate": 2.143894960922253e-05,
"loss": 2.0939,
"step": 616
},
{
"epoch": 0.6967814793901751,
"grad_norm": 0.23233333230018616,
"learning_rate": 2.1292786014652398e-05,
"loss": 1.803,
"step": 617
},
{
"epoch": 0.6979107848673066,
"grad_norm": 0.29414427280426025,
"learning_rate": 2.1146987458450935e-05,
"loss": 2.0023,
"step": 618
},
{
"epoch": 0.6990400903444381,
"grad_norm": 0.21665748953819275,
"learning_rate": 2.100155579457953e-05,
"loss": 1.8504,
"step": 619
},
{
"epoch": 0.7001693958215698,
"grad_norm": 0.2273651361465454,
"learning_rate": 2.0856492872334273e-05,
"loss": 1.401,
"step": 620
},
{
"epoch": 0.7012987012987013,
"grad_norm": 0.25878530740737915,
"learning_rate": 2.0711800536322296e-05,
"loss": 2.0893,
"step": 621
},
{
"epoch": 0.7024280067758328,
"grad_norm": 0.28381770849227905,
"learning_rate": 2.0567480626438416e-05,
"loss": 1.8416,
"step": 622
},
{
"epoch": 0.7035573122529645,
"grad_norm": 1.1040054559707642,
"learning_rate": 2.042353497784174e-05,
"loss": 1.871,
"step": 623
},
{
"epoch": 0.704686617730096,
"grad_norm": 0.27611055970191956,
"learning_rate": 2.0279965420932267e-05,
"loss": 2.0418,
"step": 624
},
{
"epoch": 0.7058159232072275,
"grad_norm": 0.21912366151809692,
"learning_rate": 2.0136773781327656e-05,
"loss": 1.6895,
"step": 625
},
{
"epoch": 0.7069452286843592,
"grad_norm": 0.22892306745052338,
"learning_rate": 1.999396187983998e-05,
"loss": 1.7621,
"step": 626
},
{
"epoch": 0.7080745341614907,
"grad_norm": 0.24957579374313354,
"learning_rate": 1.9851531532452665e-05,
"loss": 1.8752,
"step": 627
},
{
"epoch": 0.7092038396386222,
"grad_norm": 0.27737995982170105,
"learning_rate": 1.9709484550297263e-05,
"loss": 1.8712,
"step": 628
},
{
"epoch": 0.7103331451157539,
"grad_norm": 0.6962355375289917,
"learning_rate": 1.956782273963051e-05,
"loss": 1.8764,
"step": 629
},
{
"epoch": 0.7114624505928854,
"grad_norm": 0.27960485219955444,
"learning_rate": 1.942654790181132e-05,
"loss": 1.6422,
"step": 630
},
{
"epoch": 0.7125917560700169,
"grad_norm": 0.27620258927345276,
"learning_rate": 1.9285661833277953e-05,
"loss": 1.7739,
"step": 631
},
{
"epoch": 0.7137210615471485,
"grad_norm": 0.28414490818977356,
"learning_rate": 1.914516632552504e-05,
"loss": 1.9286,
"step": 632
},
{
"epoch": 0.7148503670242801,
"grad_norm": 0.32783564925193787,
"learning_rate": 1.9005063165080915e-05,
"loss": 1.6393,
"step": 633
},
{
"epoch": 0.7159796725014116,
"grad_norm": 0.5030328631401062,
"learning_rate": 1.8865354133484835e-05,
"loss": 1.9269,
"step": 634
},
{
"epoch": 0.7171089779785432,
"grad_norm": 0.3257984519004822,
"learning_rate": 1.8726041007264394e-05,
"loss": 1.6148,
"step": 635
},
{
"epoch": 0.7182382834556748,
"grad_norm": 0.32803335785865784,
"learning_rate": 1.8587125557912856e-05,
"loss": 1.9035,
"step": 636
},
{
"epoch": 0.7193675889328063,
"grad_norm": 0.4144839644432068,
"learning_rate": 1.8448609551866647e-05,
"loss": 1.6207,
"step": 637
},
{
"epoch": 0.7204968944099379,
"grad_norm": 0.5742985010147095,
"learning_rate": 1.8310494750482925e-05,
"loss": 1.5247,
"step": 638
},
{
"epoch": 0.7216261998870694,
"grad_norm": 0.45039287209510803,
"learning_rate": 1.8172782910017193e-05,
"loss": 1.8702,
"step": 639
},
{
"epoch": 0.722755505364201,
"grad_norm": 0.4651569128036499,
"learning_rate": 1.80354757816009e-05,
"loss": 1.6114,
"step": 640
},
{
"epoch": 0.7238848108413326,
"grad_norm": 0.5533832311630249,
"learning_rate": 1.7898575111219224e-05,
"loss": 1.724,
"step": 641
},
{
"epoch": 0.7250141163184641,
"grad_norm": 0.5084080696105957,
"learning_rate": 1.7762082639688844e-05,
"loss": 1.6029,
"step": 642
},
{
"epoch": 0.7261434217955957,
"grad_norm": 0.593112587928772,
"learning_rate": 1.7626000102635863e-05,
"loss": 1.7722,
"step": 643
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.7219982743263245,
"learning_rate": 1.7490329230473664e-05,
"loss": 1.1574,
"step": 644
},
{
"epoch": 0.7284020327498588,
"grad_norm": 0.8209093809127808,
"learning_rate": 1.735507174838092e-05,
"loss": 1.4226,
"step": 645
},
{
"epoch": 0.7295313382269905,
"grad_norm": 0.817489504814148,
"learning_rate": 1.7220229376279683e-05,
"loss": 0.6924,
"step": 646
},
{
"epoch": 0.730660643704122,
"grad_norm": 2.4483489990234375,
"learning_rate": 1.7085803828813546e-05,
"loss": 1.541,
"step": 647
},
{
"epoch": 0.7317899491812535,
"grad_norm": 1.2162669897079468,
"learning_rate": 1.6951796815325748e-05,
"loss": 1.9561,
"step": 648
},
{
"epoch": 0.7329192546583851,
"grad_norm": 1.5169554948806763,
"learning_rate": 1.6818210039837496e-05,
"loss": 1.9823,
"step": 649
},
{
"epoch": 0.7340485601355167,
"grad_norm": 1.8656399250030518,
"learning_rate": 1.668504520102628e-05,
"loss": 1.8061,
"step": 650
},
{
"epoch": 0.7351778656126482,
"grad_norm": 0.12022317200899124,
"learning_rate": 1.6552303992204327e-05,
"loss": 1.383,
"step": 651
},
{
"epoch": 0.7363071710897798,
"grad_norm": 0.13317741453647614,
"learning_rate": 1.6419988101296974e-05,
"loss": 1.6712,
"step": 652
},
{
"epoch": 0.7374364765669114,
"grad_norm": 0.13640844821929932,
"learning_rate": 1.6288099210821274e-05,
"loss": 1.5652,
"step": 653
},
{
"epoch": 0.7385657820440429,
"grad_norm": 0.20749111473560333,
"learning_rate": 1.615663899786456e-05,
"loss": 1.2273,
"step": 654
},
{
"epoch": 0.7396950875211745,
"grad_norm": 0.18163056671619415,
"learning_rate": 1.602560913406318e-05,
"loss": 1.3978,
"step": 655
},
{
"epoch": 0.740824392998306,
"grad_norm": 0.1996774822473526,
"learning_rate": 1.5895011285581173e-05,
"loss": 1.9716,
"step": 656
},
{
"epoch": 0.7419536984754376,
"grad_norm": 0.18158242106437683,
"learning_rate": 1.5764847113089094e-05,
"loss": 1.5364,
"step": 657
},
{
"epoch": 0.7430830039525692,
"grad_norm": 0.18996961414813995,
"learning_rate": 1.563511827174296e-05,
"loss": 1.5873,
"step": 658
},
{
"epoch": 0.7442123094297007,
"grad_norm": 0.22422468662261963,
"learning_rate": 1.5505826411163122e-05,
"loss": 1.6887,
"step": 659
},
{
"epoch": 0.7453416149068323,
"grad_norm": 0.19467850029468536,
"learning_rate": 1.5376973175413322e-05,
"loss": 1.8702,
"step": 660
},
{
"epoch": 0.7464709203839639,
"grad_norm": 0.27403318881988525,
"learning_rate": 1.5248560202979784e-05,
"loss": 1.479,
"step": 661
},
{
"epoch": 0.7476002258610954,
"grad_norm": 0.20241166651248932,
"learning_rate": 1.5120589126750429e-05,
"loss": 1.78,
"step": 662
},
{
"epoch": 0.748729531338227,
"grad_norm": 0.20999853312969208,
"learning_rate": 1.4993061573993988e-05,
"loss": 1.6491,
"step": 663
},
{
"epoch": 0.7498588368153586,
"grad_norm": 0.2149931937456131,
"learning_rate": 1.4865979166339489e-05,
"loss": 1.5793,
"step": 664
},
{
"epoch": 0.7509881422924901,
"grad_norm": 0.318891316652298,
"learning_rate": 1.473934351975541e-05,
"loss": 1.3242,
"step": 665
},
{
"epoch": 0.7521174477696216,
"grad_norm": 0.23082397878170013,
"learning_rate": 1.4613156244529363e-05,
"loss": 1.9251,
"step": 666
},
{
"epoch": 0.7532467532467533,
"grad_norm": 0.18606293201446533,
"learning_rate": 1.4487418945247438e-05,
"loss": 2.0595,
"step": 667
},
{
"epoch": 0.7543760587238848,
"grad_norm": 0.19954818487167358,
"learning_rate": 1.4362133220773955e-05,
"loss": 1.9616,
"step": 668
},
{
"epoch": 0.7555053642010163,
"grad_norm": 0.22097231447696686,
"learning_rate": 1.4237300664230923e-05,
"loss": 2.1689,
"step": 669
},
{
"epoch": 0.756634669678148,
"grad_norm": 0.23413076996803284,
"learning_rate": 1.411292286297803e-05,
"loss": 1.7825,
"step": 670
},
{
"epoch": 0.7577639751552795,
"grad_norm": 0.20669344067573547,
"learning_rate": 1.3989001398592255e-05,
"loss": 2.2408,
"step": 671
},
{
"epoch": 0.758893280632411,
"grad_norm": 0.23911398649215698,
"learning_rate": 1.386553784684792e-05,
"loss": 1.9885,
"step": 672
},
{
"epoch": 0.7600225861095427,
"grad_norm": 0.23120582103729248,
"learning_rate": 1.3742533777696454e-05,
"loss": 1.7861,
"step": 673
},
{
"epoch": 0.7611518915866742,
"grad_norm": 0.30187371373176575,
"learning_rate": 1.3619990755246654e-05,
"loss": 1.7233,
"step": 674
},
{
"epoch": 0.7622811970638057,
"grad_norm": 0.2317027598619461,
"learning_rate": 1.3497910337744624e-05,
"loss": 1.9729,
"step": 675
},
{
"epoch": 0.7634105025409373,
"grad_norm": 0.24414820969104767,
"learning_rate": 1.337629407755409e-05,
"loss": 1.5769,
"step": 676
},
{
"epoch": 0.7645398080180689,
"grad_norm": 0.3238002061843872,
"learning_rate": 1.3255143521136498e-05,
"loss": 2.1359,
"step": 677
},
{
"epoch": 0.7656691134952005,
"grad_norm": 0.2557482123374939,
"learning_rate": 1.3134460209031541e-05,
"loss": 1.7676,
"step": 678
},
{
"epoch": 0.766798418972332,
"grad_norm": 0.360801637172699,
"learning_rate": 1.30142456758374e-05,
"loss": 1.5168,
"step": 679
},
{
"epoch": 0.7679277244494636,
"grad_norm": 0.24271051585674286,
"learning_rate": 1.2894501450191399e-05,
"loss": 1.9566,
"step": 680
},
{
"epoch": 0.7690570299265952,
"grad_norm": 0.32158005237579346,
"learning_rate": 1.2775229054750343e-05,
"loss": 1.4323,
"step": 681
},
{
"epoch": 0.7701863354037267,
"grad_norm": 0.29279786348342896,
"learning_rate": 1.2656430006171404e-05,
"loss": 1.9157,
"step": 682
},
{
"epoch": 0.7713156408808582,
"grad_norm": 0.37682029604911804,
"learning_rate": 1.253810581509265e-05,
"loss": 1.5932,
"step": 683
},
{
"epoch": 0.7724449463579899,
"grad_norm": 0.3008486032485962,
"learning_rate": 1.2420257986113959e-05,
"loss": 1.5601,
"step": 684
},
{
"epoch": 0.7735742518351214,
"grad_norm": 0.3406381905078888,
"learning_rate": 1.2302888017777747e-05,
"loss": 1.7338,
"step": 685
},
{
"epoch": 0.7747035573122529,
"grad_norm": 0.3354251980781555,
"learning_rate": 1.2185997402550087e-05,
"loss": 1.1953,
"step": 686
},
{
"epoch": 0.7758328627893846,
"grad_norm": 0.45363056659698486,
"learning_rate": 1.206958762680157e-05,
"loss": 1.6571,
"step": 687
},
{
"epoch": 0.7769621682665161,
"grad_norm": 0.3696065843105316,
"learning_rate": 1.1953660170788538e-05,
"loss": 1.6085,
"step": 688
},
{
"epoch": 0.7780914737436476,
"grad_norm": 0.5141958594322205,
"learning_rate": 1.1838216508634154e-05,
"loss": 1.7784,
"step": 689
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.42403480410575867,
"learning_rate": 1.1723258108309703e-05,
"loss": 1.7745,
"step": 690
},
{
"epoch": 0.7803500846979108,
"grad_norm": 0.5388137698173523,
"learning_rate": 1.1608786431615931e-05,
"loss": 1.5025,
"step": 691
},
{
"epoch": 0.7814793901750423,
"grad_norm": 0.5538594126701355,
"learning_rate": 1.1494802934164473e-05,
"loss": 1.6429,
"step": 692
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.6132928729057312,
"learning_rate": 1.1381309065359297e-05,
"loss": 1.0125,
"step": 693
},
{
"epoch": 0.7837380011293055,
"grad_norm": 0.6523614525794983,
"learning_rate": 1.1268306268378286e-05,
"loss": 1.2307,
"step": 694
},
{
"epoch": 0.784867306606437,
"grad_norm": 0.6368614435195923,
"learning_rate": 1.1155795980154916e-05,
"loss": 1.3637,
"step": 695
},
{
"epoch": 0.7859966120835686,
"grad_norm": 0.620228111743927,
"learning_rate": 1.1043779631359973e-05,
"loss": 0.9165,
"step": 696
},
{
"epoch": 0.7871259175607002,
"grad_norm": 0.7397240400314331,
"learning_rate": 1.0932258646383336e-05,
"loss": 1.1481,
"step": 697
},
{
"epoch": 0.7882552230378317,
"grad_norm": 0.9581983685493469,
"learning_rate": 1.0821234443315876e-05,
"loss": 2.1559,
"step": 698
},
{
"epoch": 0.7893845285149633,
"grad_norm": 1.2329559326171875,
"learning_rate": 1.0710708433931466e-05,
"loss": 1.61,
"step": 699
},
{
"epoch": 0.7905138339920948,
"grad_norm": 1.888492226600647,
"learning_rate": 1.060068202366895e-05,
"loss": 1.8341,
"step": 700
},
{
"epoch": 0.7916431394692264,
"grad_norm": 0.16424258053302765,
"learning_rate": 1.0491156611614344e-05,
"loss": 1.6505,
"step": 701
},
{
"epoch": 0.792772444946358,
"grad_norm": 0.16047315299510956,
"learning_rate": 1.0382133590482978e-05,
"loss": 1.276,
"step": 702
},
{
"epoch": 0.7939017504234895,
"grad_norm": 0.21792340278625488,
"learning_rate": 1.0273614346601883e-05,
"loss": 1.4329,
"step": 703
},
{
"epoch": 0.7950310559006211,
"grad_norm": 0.173606276512146,
"learning_rate": 1.0165600259892061e-05,
"loss": 2.0379,
"step": 704
},
{
"epoch": 0.7961603613777527,
"grad_norm": 0.13415886461734772,
"learning_rate": 1.0058092703850985e-05,
"loss": 1.9901,
"step": 705
},
{
"epoch": 0.7972896668548842,
"grad_norm": 0.15839488804340363,
"learning_rate": 9.951093045535115e-06,
"loss": 1.811,
"step": 706
},
{
"epoch": 0.7984189723320159,
"grad_norm": 0.20288027822971344,
"learning_rate": 9.844602645542584e-06,
"loss": 2.0712,
"step": 707
},
{
"epoch": 0.7995482778091474,
"grad_norm": 0.2803378999233246,
"learning_rate": 9.738622857995788e-06,
"loss": 1.8512,
"step": 708
},
{
"epoch": 0.8006775832862789,
"grad_norm": 0.21160300076007843,
"learning_rate": 9.633155030524243e-06,
"loss": 1.5314,
"step": 709
},
{
"epoch": 0.8018068887634106,
"grad_norm": 0.16740640997886658,
"learning_rate": 9.528200504247404e-06,
"loss": 1.7808,
"step": 710
},
{
"epoch": 0.8029361942405421,
"grad_norm": 0.22359423339366913,
"learning_rate": 9.423760613757677e-06,
"loss": 1.7337,
"step": 711
},
{
"epoch": 0.8040654997176736,
"grad_norm": 0.19431254267692566,
"learning_rate": 9.319836687103368e-06,
"loss": 2.122,
"step": 712
},
{
"epoch": 0.8051948051948052,
"grad_norm": 0.2232765257358551,
"learning_rate": 9.216430045771845e-06,
"loss": 1.7579,
"step": 713
},
{
"epoch": 0.8063241106719368,
"grad_norm": 0.203886941075325,
"learning_rate": 9.11354200467271e-06,
"loss": 1.9332,
"step": 714
},
{
"epoch": 0.8074534161490683,
"grad_norm": 0.2308446764945984,
"learning_rate": 9.011173872121132e-06,
"loss": 1.3203,
"step": 715
},
{
"epoch": 0.8085827216261999,
"grad_norm": 0.182839035987854,
"learning_rate": 8.909326949821123e-06,
"loss": 2.0008,
"step": 716
},
{
"epoch": 0.8097120271033315,
"grad_norm": 0.22202719748020172,
"learning_rate": 8.808002532849047e-06,
"loss": 2.0819,
"step": 717
},
{
"epoch": 0.810841332580463,
"grad_norm": 0.1916837841272354,
"learning_rate": 8.707201909637137e-06,
"loss": 1.9595,
"step": 718
},
{
"epoch": 0.8119706380575946,
"grad_norm": 0.26201632618904114,
"learning_rate": 8.606926361957124e-06,
"loss": 1.894,
"step": 719
},
{
"epoch": 0.8130999435347261,
"grad_norm": 0.5175297856330872,
"learning_rate": 8.507177164903907e-06,
"loss": 1.9983,
"step": 720
},
{
"epoch": 0.8142292490118577,
"grad_norm": 0.21798285841941833,
"learning_rate": 8.407955586879373e-06,
"loss": 1.9481,
"step": 721
},
{
"epoch": 0.8153585544889893,
"grad_norm": 0.22814205288887024,
"learning_rate": 8.309262889576225e-06,
"loss": 1.9928,
"step": 722
},
{
"epoch": 0.8164878599661208,
"grad_norm": 0.25377923250198364,
"learning_rate": 8.211100327962013e-06,
"loss": 1.9169,
"step": 723
},
{
"epoch": 0.8176171654432524,
"grad_norm": 0.23791451752185822,
"learning_rate": 8.113469150263087e-06,
"loss": 1.8194,
"step": 724
},
{
"epoch": 0.818746470920384,
"grad_norm": 0.2693537473678589,
"learning_rate": 8.016370597948785e-06,
"loss": 1.6758,
"step": 725
},
{
"epoch": 0.8198757763975155,
"grad_norm": 0.20643536746501923,
"learning_rate": 7.91980590571561e-06,
"loss": 1.5467,
"step": 726
},
{
"epoch": 0.821005081874647,
"grad_norm": 0.2922951877117157,
"learning_rate": 7.823776301471591e-06,
"loss": 1.7632,
"step": 727
},
{
"epoch": 0.8221343873517787,
"grad_norm": 0.33710768818855286,
"learning_rate": 7.72828300632058e-06,
"loss": 1.532,
"step": 728
},
{
"epoch": 0.8232636928289102,
"grad_norm": 0.33453261852264404,
"learning_rate": 7.633327234546788e-06,
"loss": 1.727,
"step": 729
},
{
"epoch": 0.8243929983060417,
"grad_norm": 0.2797680199146271,
"learning_rate": 7.538910193599313e-06,
"loss": 1.723,
"step": 730
},
{
"epoch": 0.8255223037831734,
"grad_norm": 0.3717952370643616,
"learning_rate": 7.445033084076847e-06,
"loss": 1.5709,
"step": 731
},
{
"epoch": 0.8266516092603049,
"grad_norm": 0.3003871738910675,
"learning_rate": 7.351697099712307e-06,
"loss": 1.7581,
"step": 732
},
{
"epoch": 0.8277809147374364,
"grad_norm": 0.36279451847076416,
"learning_rate": 7.258903427357727e-06,
"loss": 1.5827,
"step": 733
},
{
"epoch": 0.8289102202145681,
"grad_norm": 0.313398152589798,
"learning_rate": 7.166653246969174e-06,
"loss": 1.6037,
"step": 734
},
{
"epoch": 0.8300395256916996,
"grad_norm": 0.28683504462242126,
"learning_rate": 7.074947731591691e-06,
"loss": 1.331,
"step": 735
},
{
"epoch": 0.8311688311688312,
"grad_norm": 0.367031991481781,
"learning_rate": 6.983788047344419e-06,
"loss": 1.6444,
"step": 736
},
{
"epoch": 0.8322981366459627,
"grad_norm": 0.41479209065437317,
"learning_rate": 6.893175353405756e-06,
"loss": 1.4512,
"step": 737
},
{
"epoch": 0.8334274421230943,
"grad_norm": 0.346797913312912,
"learning_rate": 6.8031108019986356e-06,
"loss": 1.8215,
"step": 738
},
{
"epoch": 0.8345567476002259,
"grad_norm": 0.4141133427619934,
"learning_rate": 6.713595538375833e-06,
"loss": 1.73,
"step": 739
},
{
"epoch": 0.8356860530773574,
"grad_norm": 0.532598078250885,
"learning_rate": 6.624630700805473e-06,
"loss": 1.6717,
"step": 740
},
{
"epoch": 0.836815358554489,
"grad_norm": 0.6603345274925232,
"learning_rate": 6.53621742055644e-06,
"loss": 1.8964,
"step": 741
},
{
"epoch": 0.8379446640316206,
"grad_norm": 0.4613952040672302,
"learning_rate": 6.448356821884144e-06,
"loss": 1.4167,
"step": 742
},
{
"epoch": 0.8390739695087521,
"grad_norm": 0.5672091841697693,
"learning_rate": 6.361050022016085e-06,
"loss": 1.547,
"step": 743
},
{
"epoch": 0.8402032749858837,
"grad_norm": 0.47616711258888245,
"learning_rate": 6.274298131137763e-06,
"loss": 1.1175,
"step": 744
},
{
"epoch": 0.8413325804630153,
"grad_norm": 1.295542597770691,
"learning_rate": 6.188102252378431e-06,
"loss": 1.2317,
"step": 745
},
{
"epoch": 0.8424618859401468,
"grad_norm": 0.7239735126495361,
"learning_rate": 6.102463481797216e-06,
"loss": 0.9637,
"step": 746
},
{
"epoch": 0.8435911914172783,
"grad_norm": 0.9294856190681458,
"learning_rate": 6.017382908369051e-06,
"loss": 1.5054,
"step": 747
},
{
"epoch": 0.84472049689441,
"grad_norm": 0.9071400165557861,
"learning_rate": 5.932861613970941e-06,
"loss": 1.1075,
"step": 748
},
{
"epoch": 0.8458498023715415,
"grad_norm": 1.2108523845672607,
"learning_rate": 5.848900673368074e-06,
"loss": 1.5638,
"step": 749
},
{
"epoch": 0.846979107848673,
"grad_norm": 2.483159065246582,
"learning_rate": 5.765501154200298e-06,
"loss": 1.4443,
"step": 750
},
{
"epoch": 0.8481084133258047,
"grad_norm": 0.17873716354370117,
"learning_rate": 5.682664116968434e-06,
"loss": 1.1442,
"step": 751
},
{
"epoch": 0.8492377188029362,
"grad_norm": 0.13179203867912292,
"learning_rate": 5.600390615020879e-06,
"loss": 2.232,
"step": 752
},
{
"epoch": 0.8503670242800677,
"grad_norm": 0.16131728887557983,
"learning_rate": 5.518681694540084e-06,
"loss": 1.5456,
"step": 753
},
{
"epoch": 0.8514963297571994,
"grad_norm": 0.4394928514957428,
"learning_rate": 5.437538394529429e-06,
"loss": 1.8573,
"step": 754
},
{
"epoch": 0.8526256352343309,
"grad_norm": 0.19867239892482758,
"learning_rate": 5.3569617467998325e-06,
"loss": 1.6727,
"step": 755
},
{
"epoch": 0.8537549407114624,
"grad_norm": 0.18569554388523102,
"learning_rate": 5.276952775956784e-06,
"loss": 1.6288,
"step": 756
},
{
"epoch": 0.854884246188594,
"grad_norm": 0.20019683241844177,
"learning_rate": 5.197512499387175e-06,
"loss": 1.6844,
"step": 757
},
{
"epoch": 0.8560135516657256,
"grad_norm": 0.16318097710609436,
"learning_rate": 5.118641927246492e-06,
"loss": 1.7957,
"step": 758
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.15840142965316772,
"learning_rate": 5.040342062445869e-06,
"loss": 1.5726,
"step": 759
},
{
"epoch": 0.8582721626199887,
"grad_norm": 0.20747163891792297,
"learning_rate": 4.962613900639412e-06,
"loss": 1.2894,
"step": 760
},
{
"epoch": 0.8594014680971203,
"grad_norm": 0.17959041893482208,
"learning_rate": 4.8854584302114536e-06,
"loss": 1.9222,
"step": 761
},
{
"epoch": 0.8605307735742518,
"grad_norm": 0.253172904253006,
"learning_rate": 4.808876632264092e-06,
"loss": 1.7508,
"step": 762
},
{
"epoch": 0.8616600790513834,
"grad_norm": 0.1783914417028427,
"learning_rate": 4.732869480604607e-06,
"loss": 1.9951,
"step": 763
},
{
"epoch": 0.862789384528515,
"grad_norm": 0.1955009400844574,
"learning_rate": 4.657437941733167e-06,
"loss": 1.4675,
"step": 764
},
{
"epoch": 0.8639186900056465,
"grad_norm": 0.2847943603992462,
"learning_rate": 4.58258297483048e-06,
"loss": 1.621,
"step": 765
},
{
"epoch": 0.8650479954827781,
"grad_norm": 0.20446226000785828,
"learning_rate": 4.5083055317456045e-06,
"loss": 1.7117,
"step": 766
},
{
"epoch": 0.8661773009599096,
"grad_norm": 0.1737479567527771,
"learning_rate": 4.434606556983878e-06,
"loss": 1.9038,
"step": 767
},
{
"epoch": 0.8673066064370413,
"grad_norm": 0.22281497716903687,
"learning_rate": 4.361486987694891e-06,
"loss": 2.2309,
"step": 768
},
{
"epoch": 0.8684359119141728,
"grad_norm": 0.2025764435529709,
"learning_rate": 4.288947753660544e-06,
"loss": 2.1066,
"step": 769
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.2314663976430893,
"learning_rate": 4.216989777283259e-06,
"loss": 1.7796,
"step": 770
},
{
"epoch": 0.870694522868436,
"grad_norm": 0.21215543150901794,
"learning_rate": 4.14561397357423e-06,
"loss": 1.909,
"step": 771
},
{
"epoch": 0.8718238283455675,
"grad_norm": 0.2464420199394226,
"learning_rate": 4.074821250141814e-06,
"loss": 1.5897,
"step": 772
},
{
"epoch": 0.872953133822699,
"grad_norm": 0.2283257395029068,
"learning_rate": 4.00461250717995e-06,
"loss": 1.617,
"step": 773
},
{
"epoch": 0.8740824392998306,
"grad_norm": 0.29658201336860657,
"learning_rate": 3.934988637456738e-06,
"loss": 1.7849,
"step": 774
},
{
"epoch": 0.8752117447769622,
"grad_norm": 0.25188517570495605,
"learning_rate": 3.8659505263031025e-06,
"loss": 1.7917,
"step": 775
},
{
"epoch": 0.8763410502540937,
"grad_norm": 0.27738890051841736,
"learning_rate": 3.797499051601483e-06,
"loss": 1.6018,
"step": 776
},
{
"epoch": 0.8774703557312253,
"grad_norm": 0.2526523172855377,
"learning_rate": 3.729635083774724e-06,
"loss": 1.7928,
"step": 777
},
{
"epoch": 0.8785996612083569,
"grad_norm": 0.22987103462219238,
"learning_rate": 3.6623594857749633e-06,
"loss": 1.4585,
"step": 778
},
{
"epoch": 0.8797289666854884,
"grad_norm": 0.2625342607498169,
"learning_rate": 3.5956731130727173e-06,
"loss": 1.9541,
"step": 779
},
{
"epoch": 0.88085827216262,
"grad_norm": 0.34188714623451233,
"learning_rate": 3.529576813645935e-06,
"loss": 1.5689,
"step": 780
},
{
"epoch": 0.8819875776397516,
"grad_norm": 0.2936117649078369,
"learning_rate": 3.4640714279692566e-06,
"loss": 1.5456,
"step": 781
},
{
"epoch": 0.8831168831168831,
"grad_norm": 0.26569730043411255,
"learning_rate": 3.399157789003299e-06,
"loss": 1.7835,
"step": 782
},
{
"epoch": 0.8842461885940147,
"grad_norm": 0.3196849524974823,
"learning_rate": 3.3348367221841257e-06,
"loss": 1.3151,
"step": 783
},
{
"epoch": 0.8853754940711462,
"grad_norm": 0.32176318764686584,
"learning_rate": 3.271109045412657e-06,
"loss": 1.3831,
"step": 784
},
{
"epoch": 0.8865047995482778,
"grad_norm": 0.33749616146087646,
"learning_rate": 3.207975569044347e-06,
"loss": 1.2601,
"step": 785
},
{
"epoch": 0.8876341050254094,
"grad_norm": 0.40785613656044006,
"learning_rate": 3.145437095878828e-06,
"loss": 1.9139,
"step": 786
},
{
"epoch": 0.8887634105025409,
"grad_norm": 0.33301037549972534,
"learning_rate": 3.083494421149752e-06,
"loss": 1.4349,
"step": 787
},
{
"epoch": 0.8898927159796725,
"grad_norm": 0.3668874502182007,
"learning_rate": 3.022148332514635e-06,
"loss": 1.6247,
"step": 788
},
{
"epoch": 0.8910220214568041,
"grad_norm": 0.42873692512512207,
"learning_rate": 2.9613996100448625e-06,
"loss": 1.4897,
"step": 789
},
{
"epoch": 0.8921513269339356,
"grad_norm": 0.49289876222610474,
"learning_rate": 2.9012490262157465e-06,
"loss": 1.4293,
"step": 790
},
{
"epoch": 0.8932806324110671,
"grad_norm": 0.535629391670227,
"learning_rate": 2.84169734589676e-06,
"loss": 1.7038,
"step": 791
},
{
"epoch": 0.8944099378881988,
"grad_norm": 0.830600917339325,
"learning_rate": 2.78274532634174e-06,
"loss": 1.4014,
"step": 792
},
{
"epoch": 0.8955392433653303,
"grad_norm": 0.5958054661750793,
"learning_rate": 2.724393717179302e-06,
"loss": 1.5625,
"step": 793
},
{
"epoch": 0.8966685488424618,
"grad_norm": 0.7064595818519592,
"learning_rate": 2.66664326040329e-06,
"loss": 1.2575,
"step": 794
},
{
"epoch": 0.8977978543195935,
"grad_norm": 0.7008121013641357,
"learning_rate": 2.609494690363362e-06,
"loss": 1.0175,
"step": 795
},
{
"epoch": 0.898927159796725,
"grad_norm": 0.8058467507362366,
"learning_rate": 2.5529487337556277e-06,
"loss": 1.0846,
"step": 796
},
{
"epoch": 0.9000564652738566,
"grad_norm": 0.8895508050918579,
"learning_rate": 2.49700610961342e-06,
"loss": 1.4598,
"step": 797
},
{
"epoch": 0.9011857707509882,
"grad_norm": 0.9604859948158264,
"learning_rate": 2.4416675292981417e-06,
"loss": 1.2935,
"step": 798
},
{
"epoch": 0.9023150762281197,
"grad_norm": 1.620484709739685,
"learning_rate": 2.3869336964902455e-06,
"loss": 1.5513,
"step": 799
},
{
"epoch": 0.9034443817052513,
"grad_norm": 2.574310541152954,
"learning_rate": 2.3328053071802637e-06,
"loss": 2.1875,
"step": 800
},
{
"epoch": 0.9045736871823828,
"grad_norm": 0.14299297332763672,
"learning_rate": 2.2792830496599583e-06,
"loss": 1.1229,
"step": 801
},
{
"epoch": 0.9057029926595144,
"grad_norm": 0.12972106039524078,
"learning_rate": 2.226367604513557e-06,
"loss": 1.8229,
"step": 802
},
{
"epoch": 0.906832298136646,
"grad_norm": 0.15199488401412964,
"learning_rate": 2.174059644609161e-06,
"loss": 1.5613,
"step": 803
},
{
"epoch": 0.9079616036137775,
"grad_norm": 0.16729173064231873,
"learning_rate": 2.1223598350900988e-06,
"loss": 1.9501,
"step": 804
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.15182441473007202,
"learning_rate": 2.0712688333665297e-06,
"loss": 1.7393,
"step": 805
},
{
"epoch": 0.9102202145680407,
"grad_norm": 0.1326162964105606,
"learning_rate": 2.0207872891070736e-06,
"loss": 1.861,
"step": 806
},
{
"epoch": 0.9113495200451722,
"grad_norm": 0.18286187946796417,
"learning_rate": 1.9709158442305365e-06,
"loss": 1.7116,
"step": 807
},
{
"epoch": 0.9124788255223037,
"grad_norm": 0.1932005137205124,
"learning_rate": 1.9216551328977537e-06,
"loss": 1.8832,
"step": 808
},
{
"epoch": 0.9136081309994354,
"grad_norm": 0.1856583058834076,
"learning_rate": 1.8730057815035285e-06,
"loss": 1.8687,
"step": 809
},
{
"epoch": 0.9147374364765669,
"grad_norm": 0.19752384722232819,
"learning_rate": 1.8249684086686813e-06,
"loss": 1.9856,
"step": 810
},
{
"epoch": 0.9158667419536984,
"grad_norm": 0.2770135700702667,
"learning_rate": 1.777543625232142e-06,
"loss": 0.9592,
"step": 811
},
{
"epoch": 0.9169960474308301,
"grad_norm": 0.19897840917110443,
"learning_rate": 1.730732034243221e-06,
"loss": 1.7433,
"step": 812
},
{
"epoch": 0.9181253529079616,
"grad_norm": 0.17861053347587585,
"learning_rate": 1.6845342309539213e-06,
"loss": 1.4184,
"step": 813
},
{
"epoch": 0.9192546583850931,
"grad_norm": 0.6110484004020691,
"learning_rate": 1.638950802811401e-06,
"loss": 1.5792,
"step": 814
},
{
"epoch": 0.9203839638622248,
"grad_norm": 0.25728702545166016,
"learning_rate": 1.5939823294504386e-06,
"loss": 1.6094,
"step": 815
},
{
"epoch": 0.9215132693393563,
"grad_norm": 0.24172663688659668,
"learning_rate": 1.549629382686152e-06,
"loss": 1.7054,
"step": 816
},
{
"epoch": 0.9226425748164878,
"grad_norm": 0.24173271656036377,
"learning_rate": 1.5058925265066194e-06,
"loss": 1.7461,
"step": 817
},
{
"epoch": 0.9237718802936195,
"grad_norm": 0.30865177512168884,
"learning_rate": 1.4627723170658192e-06,
"loss": 1.6537,
"step": 818
},
{
"epoch": 0.924901185770751,
"grad_norm": 0.18484273552894592,
"learning_rate": 1.4202693026764636e-06,
"loss": 1.866,
"step": 819
},
{
"epoch": 0.9260304912478825,
"grad_norm": 0.2225736677646637,
"learning_rate": 1.3783840238031143e-06,
"loss": 1.7052,
"step": 820
},
{
"epoch": 0.9271597967250141,
"grad_norm": 0.21656034886837006,
"learning_rate": 1.3371170130552114e-06,
"loss": 1.6997,
"step": 821
},
{
"epoch": 0.9282891022021457,
"grad_norm": 0.218043714761734,
"learning_rate": 1.2964687951803888e-06,
"loss": 1.7289,
"step": 822
},
{
"epoch": 0.9294184076792772,
"grad_norm": 0.21153298020362854,
"learning_rate": 1.2564398870577476e-06,
"loss": 1.9596,
"step": 823
},
{
"epoch": 0.9305477131564088,
"grad_norm": 0.20686852931976318,
"learning_rate": 1.2170307976913154e-06,
"loss": 2.0872,
"step": 824
},
{
"epoch": 0.9316770186335404,
"grad_norm": 0.2282426804304123,
"learning_rate": 1.1782420282035467e-06,
"loss": 1.8915,
"step": 825
},
{
"epoch": 0.932806324110672,
"grad_norm": 0.31399673223495483,
"learning_rate": 1.1400740718289672e-06,
"loss": 1.8344,
"step": 826
},
{
"epoch": 0.9339356295878035,
"grad_norm": 0.24079066514968872,
"learning_rate": 1.1025274139079057e-06,
"loss": 2.0116,
"step": 827
},
{
"epoch": 0.935064935064935,
"grad_norm": 0.2544754147529602,
"learning_rate": 1.0656025318803165e-06,
"loss": 1.7656,
"step": 828
},
{
"epoch": 0.9361942405420667,
"grad_norm": 0.32379066944122314,
"learning_rate": 1.029299895279684e-06,
"loss": 1.8436,
"step": 829
},
{
"epoch": 0.9373235460191982,
"grad_norm": 0.33525246381759644,
"learning_rate": 9.93619965727105e-07,
"loss": 1.5118,
"step": 830
},
{
"epoch": 0.9384528514963297,
"grad_norm": 0.29879966378211975,
"learning_rate": 9.58563196925366e-07,
"loss": 1.8298,
"step": 831
},
{
"epoch": 0.9395821569734614,
"grad_norm": 0.2798129916191101,
"learning_rate": 9.241300346532255e-07,
"loss": 1.9266,
"step": 832
},
{
"epoch": 0.9407114624505929,
"grad_norm": 0.3247831463813782,
"learning_rate": 8.903209167596848e-07,
"loss": 1.6367,
"step": 833
},
{
"epoch": 0.9418407679277244,
"grad_norm": 0.6924816966056824,
"learning_rate": 8.571362731584653e-07,
"loss": 1.73,
"step": 834
},
{
"epoch": 0.9429700734048561,
"grad_norm": 0.33074456453323364,
"learning_rate": 8.245765258225402e-07,
"loss": 1.3488,
"step": 835
},
{
"epoch": 0.9440993788819876,
"grad_norm": 0.31922397017478943,
"learning_rate": 7.926420887787444e-07,
"loss": 1.8534,
"step": 836
},
{
"epoch": 0.9452286843591191,
"grad_norm": 0.48577114939689636,
"learning_rate": 7.613333681025236e-07,
"loss": 1.5371,
"step": 837
},
{
"epoch": 0.9463579898362507,
"grad_norm": 0.3482813239097595,
"learning_rate": 7.306507619127767e-07,
"loss": 1.777,
"step": 838
},
{
"epoch": 0.9474872953133823,
"grad_norm": 0.40237051248550415,
"learning_rate": 7.005946603667768e-07,
"loss": 1.9003,
"step": 839
},
{
"epoch": 0.9486166007905138,
"grad_norm": 0.3908424377441406,
"learning_rate": 6.711654456552364e-07,
"loss": 1.3752,
"step": 840
},
{
"epoch": 0.9497459062676454,
"grad_norm": 0.5101377964019775,
"learning_rate": 6.423634919974164e-07,
"loss": 1.7104,
"step": 841
},
{
"epoch": 0.950875211744777,
"grad_norm": 0.45392563939094543,
"learning_rate": 6.141891656363863e-07,
"loss": 1.6423,
"step": 842
},
{
"epoch": 0.9520045172219085,
"grad_norm": 0.4994739592075348,
"learning_rate": 5.866428248343603e-07,
"loss": 0.9681,
"step": 843
},
{
"epoch": 0.9531338226990401,
"grad_norm": 0.7889621257781982,
"learning_rate": 5.59724819868157e-07,
"loss": 1.4515,
"step": 844
},
{
"epoch": 0.9542631281761716,
"grad_norm": 0.7431004643440247,
"learning_rate": 5.334354930247087e-07,
"loss": 0.5782,
"step": 845
},
{
"epoch": 0.9553924336533032,
"grad_norm": 0.8190014362335205,
"learning_rate": 5.077751785967588e-07,
"loss": 1.1897,
"step": 846
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.7467794418334961,
"learning_rate": 4.827442028785655e-07,
"loss": 1.0159,
"step": 847
},
{
"epoch": 0.9576510446075663,
"grad_norm": 1.3388633728027344,
"learning_rate": 4.5834288416178874e-07,
"loss": 1.6963,
"step": 848
},
{
"epoch": 0.9587803500846979,
"grad_norm": 1.1506050825119019,
"learning_rate": 4.3457153273140925e-07,
"loss": 1.1707,
"step": 849
},
{
"epoch": 0.9599096555618295,
"grad_norm": 2.7924423217773438,
"learning_rate": 4.114304508618105e-07,
"loss": 2.2042,
"step": 850
},
{
"epoch": 0.961038961038961,
"grad_norm": 0.20497886836528778,
"learning_rate": 3.8891993281293117e-07,
"loss": 0.7544,
"step": 851
},
{
"epoch": 0.9621682665160926,
"grad_norm": 0.17141655087471008,
"learning_rate": 3.670402648264959e-07,
"loss": 1.5099,
"step": 852
},
{
"epoch": 0.9632975719932242,
"grad_norm": 0.19558130204677582,
"learning_rate": 3.45791725122413e-07,
"loss": 1.5086,
"step": 853
},
{
"epoch": 0.9644268774703557,
"grad_norm": 0.2082233726978302,
"learning_rate": 3.2517458389521027e-07,
"loss": 1.5123,
"step": 854
},
{
"epoch": 0.9655561829474872,
"grad_norm": 0.20731768012046814,
"learning_rate": 3.0518910331062135e-07,
"loss": 1.4448,
"step": 855
},
{
"epoch": 0.9666854884246189,
"grad_norm": 0.18717971444129944,
"learning_rate": 2.8583553750223255e-07,
"loss": 1.8793,
"step": 856
},
{
"epoch": 0.9678147939017504,
"grad_norm": 0.17500977218151093,
"learning_rate": 2.67114132568258e-07,
"loss": 1.9831,
"step": 857
},
{
"epoch": 0.968944099378882,
"grad_norm": 0.19684267044067383,
"learning_rate": 2.490251265683974e-07,
"loss": 1.741,
"step": 858
},
{
"epoch": 0.9700734048560136,
"grad_norm": 0.19779165089130402,
"learning_rate": 2.3156874952085516e-07,
"loss": 1.6896,
"step": 859
},
{
"epoch": 0.9712027103331451,
"grad_norm": 0.2639060616493225,
"learning_rate": 2.14745223399343e-07,
"loss": 1.3446,
"step": 860
},
{
"epoch": 0.9723320158102767,
"grad_norm": 0.18881270289421082,
"learning_rate": 1.9855476213033185e-07,
"loss": 1.4884,
"step": 861
},
{
"epoch": 0.9734613212874083,
"grad_norm": 0.23728030920028687,
"learning_rate": 1.82997571590271e-07,
"loss": 1.6801,
"step": 862
},
{
"epoch": 0.9745906267645398,
"grad_norm": 0.3492504358291626,
"learning_rate": 1.6807384960301208e-07,
"loss": 1.8541,
"step": 863
},
{
"epoch": 0.9757199322416714,
"grad_norm": 0.20422044396400452,
"learning_rate": 1.5378378593726706e-07,
"loss": 1.6448,
"step": 864
},
{
"epoch": 0.9768492377188029,
"grad_norm": 0.21626874804496765,
"learning_rate": 1.4012756230421532e-07,
"loss": 1.8438,
"step": 865
},
{
"epoch": 0.9779785431959345,
"grad_norm": 0.2580612301826477,
"learning_rate": 1.271053523551613e-07,
"loss": 1.9015,
"step": 866
},
{
"epoch": 0.9791078486730661,
"grad_norm": 0.23967643082141876,
"learning_rate": 1.1471732167938065e-07,
"loss": 1.7772,
"step": 867
},
{
"epoch": 0.9802371541501976,
"grad_norm": 0.2442159354686737,
"learning_rate": 1.0296362780195524e-07,
"loss": 1.6739,
"step": 868
},
{
"epoch": 0.9813664596273292,
"grad_norm": 0.4127430021762848,
"learning_rate": 9.184442018180805e-08,
"loss": 2.2228,
"step": 869
},
{
"epoch": 0.9824957651044608,
"grad_norm": 0.2699223756790161,
"learning_rate": 8.13598402097937e-08,
"loss": 1.9218,
"step": 870
},
{
"epoch": 0.9836250705815923,
"grad_norm": 0.24486035108566284,
"learning_rate": 7.151002120688865e-08,
"loss": 1.7938,
"step": 871
},
{
"epoch": 0.9847543760587238,
"grad_norm": 0.3202971816062927,
"learning_rate": 6.229508842251486e-08,
"loss": 1.57,
"step": 872
},
{
"epoch": 0.9858836815358555,
"grad_norm": 0.3089115619659424,
"learning_rate": 5.371515903293545e-08,
"loss": 1.5306,
"step": 873
},
{
"epoch": 0.987012987012987,
"grad_norm": 0.48857423663139343,
"learning_rate": 4.5770342139761504e-08,
"loss": 1.3286,
"step": 874
},
{
"epoch": 0.9881422924901185,
"grad_norm": 0.33920764923095703,
"learning_rate": 3.8460738768586465e-08,
"loss": 1.7378,
"step": 875
},
{
"epoch": 0.9892715979672502,
"grad_norm": 0.45418545603752136,
"learning_rate": 3.1786441867659446e-08,
"loss": 1.5101,
"step": 876
},
{
"epoch": 0.9904009034443817,
"grad_norm": 0.7004565000534058,
"learning_rate": 2.57475363067472e-08,
"loss": 1.2948,
"step": 877
},
{
"epoch": 0.9915302089215132,
"grad_norm": 0.5156087875366211,
"learning_rate": 2.0344098876040608e-08,
"loss": 1.9831,
"step": 878
},
{
"epoch": 0.9926595143986449,
"grad_norm": 0.5740581750869751,
"learning_rate": 1.557619828516099e-08,
"loss": 1.2267,
"step": 879
},
{
"epoch": 0.9937888198757764,
"grad_norm": 0.675317645072937,
"learning_rate": 1.1443895162305263e-08,
"loss": 1.1948,
"step": 880
},
{
"epoch": 0.9949181253529079,
"grad_norm": 0.5960811972618103,
"learning_rate": 7.947242053479853e-09,
"loss": 1.5265,
"step": 881
},
{
"epoch": 0.9960474308300395,
"grad_norm": 0.5866406559944153,
"learning_rate": 5.086283421801286e-09,
"loss": 0.8661,
"step": 882
},
{
"epoch": 0.9971767363071711,
"grad_norm": 0.910371720790863,
"learning_rate": 2.861055646968813e-09,
"loss": 1.4349,
"step": 883
},
{
"epoch": 0.9983060417843026,
"grad_norm": 0.9026908278465271,
"learning_rate": 1.2715870247870244e-09,
"loss": 1.5981,
"step": 884
},
{
"epoch": 0.9994353472614342,
"grad_norm": 1.4996330738067627,
"learning_rate": 3.178977667883665e-10,
"loss": 1.5431,
"step": 885
},
{
"epoch": 0.9994353472614342,
"eval_loss": 1.513724684715271,
"eval_runtime": 17.4374,
"eval_samples_per_second": 42.782,
"eval_steps_per_second": 10.724,
"step": 885
},
{
"epoch": 1.0008469791078487,
"grad_norm": 3.4714646339416504,
"learning_rate": 0.0,
"loss": 2.7846,
"step": 886
}
],
"logging_steps": 1,
"max_steps": 886,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 222,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4350477361676288e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}