error577's picture
Training in progress, step 600, checkpoint
3ce2c13 verified
{
"best_metric": 1.5770864486694336,
"best_model_checkpoint": "miner_id_24/checkpoint-600",
"epoch": 0.3236573278041873,
"eval_steps": 200,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005394288796736456,
"grad_norm": 18.71552085876465,
"learning_rate": 2.0000000000000003e-06,
"loss": 57.3329,
"step": 1
},
{
"epoch": 0.0005394288796736456,
"eval_loss": 4.576467514038086,
"eval_runtime": 141.0154,
"eval_samples_per_second": 2.12,
"eval_steps_per_second": 2.12,
"step": 1
},
{
"epoch": 0.0010788577593472911,
"grad_norm": 40.17718505859375,
"learning_rate": 4.000000000000001e-06,
"loss": 111.721,
"step": 2
},
{
"epoch": 0.0016182866390209367,
"grad_norm": 55.62163162231445,
"learning_rate": 6e-06,
"loss": 145.8098,
"step": 3
},
{
"epoch": 0.0021577155186945822,
"grad_norm": 70.09906005859375,
"learning_rate": 8.000000000000001e-06,
"loss": 176.5399,
"step": 4
},
{
"epoch": 0.0026971443983682276,
"grad_norm": 96.45822143554688,
"learning_rate": 1e-05,
"loss": 205.6804,
"step": 5
},
{
"epoch": 0.0032365732780418733,
"grad_norm": 96.46897888183594,
"learning_rate": 1.2e-05,
"loss": 191.1242,
"step": 6
},
{
"epoch": 0.0037760021577155187,
"grad_norm": 123.18101501464844,
"learning_rate": 1.4000000000000001e-05,
"loss": 200.1216,
"step": 7
},
{
"epoch": 0.0043154310373891645,
"grad_norm": 112.75751495361328,
"learning_rate": 1.6000000000000003e-05,
"loss": 199.3468,
"step": 8
},
{
"epoch": 0.00485485991706281,
"grad_norm": 105.84030151367188,
"learning_rate": 1.8e-05,
"loss": 197.7578,
"step": 9
},
{
"epoch": 0.005394288796736455,
"grad_norm": 152.0435333251953,
"learning_rate": 2e-05,
"loss": 221.8745,
"step": 10
},
{
"epoch": 0.0059337176764101005,
"grad_norm": 140.9628143310547,
"learning_rate": 2.2000000000000003e-05,
"loss": 202.1205,
"step": 11
},
{
"epoch": 0.006473146556083747,
"grad_norm": 136.8531036376953,
"learning_rate": 2.4e-05,
"loss": 192.207,
"step": 12
},
{
"epoch": 0.007012575435757392,
"grad_norm": 135.1580352783203,
"learning_rate": 2.6000000000000002e-05,
"loss": 188.5981,
"step": 13
},
{
"epoch": 0.007552004315431037,
"grad_norm": 135.94815063476562,
"learning_rate": 2.8000000000000003e-05,
"loss": 182.9973,
"step": 14
},
{
"epoch": 0.008091433195104683,
"grad_norm": 130.7935333251953,
"learning_rate": 3e-05,
"loss": 181.9996,
"step": 15
},
{
"epoch": 0.008630862074778329,
"grad_norm": 135.71165466308594,
"learning_rate": 3.2000000000000005e-05,
"loss": 156.7745,
"step": 16
},
{
"epoch": 0.009170290954451973,
"grad_norm": 80.55735778808594,
"learning_rate": 3.4000000000000007e-05,
"loss": 105.2249,
"step": 17
},
{
"epoch": 0.00970971983412562,
"grad_norm": 78.56623840332031,
"learning_rate": 3.6e-05,
"loss": 93.8699,
"step": 18
},
{
"epoch": 0.010249148713799266,
"grad_norm": 73.5405502319336,
"learning_rate": 3.8e-05,
"loss": 96.5256,
"step": 19
},
{
"epoch": 0.01078857759347291,
"grad_norm": 66.16717529296875,
"learning_rate": 4e-05,
"loss": 79.6901,
"step": 20
},
{
"epoch": 0.011328006473146556,
"grad_norm": 65.6923599243164,
"learning_rate": 4.2e-05,
"loss": 90.1881,
"step": 21
},
{
"epoch": 0.011867435352820201,
"grad_norm": 77.53053283691406,
"learning_rate": 4.4000000000000006e-05,
"loss": 85.8049,
"step": 22
},
{
"epoch": 0.012406864232493847,
"grad_norm": 71.17222595214844,
"learning_rate": 4.600000000000001e-05,
"loss": 64.6935,
"step": 23
},
{
"epoch": 0.012946293112167493,
"grad_norm": 46.50193786621094,
"learning_rate": 4.8e-05,
"loss": 72.2138,
"step": 24
},
{
"epoch": 0.013485721991841138,
"grad_norm": 45.66022491455078,
"learning_rate": 5e-05,
"loss": 71.2709,
"step": 25
},
{
"epoch": 0.014025150871514784,
"grad_norm": 46.14365768432617,
"learning_rate": 5.2000000000000004e-05,
"loss": 59.7781,
"step": 26
},
{
"epoch": 0.014564579751188429,
"grad_norm": 54.284664154052734,
"learning_rate": 5.4000000000000005e-05,
"loss": 64.8576,
"step": 27
},
{
"epoch": 0.015104008630862075,
"grad_norm": 43.3782958984375,
"learning_rate": 5.6000000000000006e-05,
"loss": 68.6312,
"step": 28
},
{
"epoch": 0.01564343751053572,
"grad_norm": 35.549217224121094,
"learning_rate": 5.8e-05,
"loss": 62.3088,
"step": 29
},
{
"epoch": 0.016182866390209365,
"grad_norm": 42.21353530883789,
"learning_rate": 6e-05,
"loss": 65.5001,
"step": 30
},
{
"epoch": 0.01672229526988301,
"grad_norm": 46.08031463623047,
"learning_rate": 6.2e-05,
"loss": 57.0952,
"step": 31
},
{
"epoch": 0.017261724149556658,
"grad_norm": 38.45962905883789,
"learning_rate": 6.400000000000001e-05,
"loss": 65.1331,
"step": 32
},
{
"epoch": 0.017801153029230302,
"grad_norm": 34.330406188964844,
"learning_rate": 6.6e-05,
"loss": 56.026,
"step": 33
},
{
"epoch": 0.018340581908903947,
"grad_norm": 35.08675003051758,
"learning_rate": 6.800000000000001e-05,
"loss": 55.9277,
"step": 34
},
{
"epoch": 0.018880010788577595,
"grad_norm": 37.337825775146484,
"learning_rate": 7e-05,
"loss": 53.9387,
"step": 35
},
{
"epoch": 0.01941943966825124,
"grad_norm": 36.146873474121094,
"learning_rate": 7.2e-05,
"loss": 61.2999,
"step": 36
},
{
"epoch": 0.019958868547924884,
"grad_norm": 41.229610443115234,
"learning_rate": 7.4e-05,
"loss": 70.8618,
"step": 37
},
{
"epoch": 0.02049829742759853,
"grad_norm": 42.86275863647461,
"learning_rate": 7.6e-05,
"loss": 60.1886,
"step": 38
},
{
"epoch": 0.021037726307272176,
"grad_norm": 36.5433235168457,
"learning_rate": 7.800000000000001e-05,
"loss": 61.5439,
"step": 39
},
{
"epoch": 0.02157715518694582,
"grad_norm": 39.95774841308594,
"learning_rate": 8e-05,
"loss": 57.8462,
"step": 40
},
{
"epoch": 0.022116584066619465,
"grad_norm": 38.86470413208008,
"learning_rate": 8.2e-05,
"loss": 55.4324,
"step": 41
},
{
"epoch": 0.022656012946293113,
"grad_norm": 30.977352142333984,
"learning_rate": 8.4e-05,
"loss": 57.6402,
"step": 42
},
{
"epoch": 0.023195441825966757,
"grad_norm": 38.25783157348633,
"learning_rate": 8.6e-05,
"loss": 50.586,
"step": 43
},
{
"epoch": 0.023734870705640402,
"grad_norm": 37.11707305908203,
"learning_rate": 8.800000000000001e-05,
"loss": 36.7947,
"step": 44
},
{
"epoch": 0.02427429958531405,
"grad_norm": 40.3302116394043,
"learning_rate": 9e-05,
"loss": 40.2388,
"step": 45
},
{
"epoch": 0.024813728464987694,
"grad_norm": 42.60755920410156,
"learning_rate": 9.200000000000001e-05,
"loss": 58.9665,
"step": 46
},
{
"epoch": 0.02535315734466134,
"grad_norm": 44.4195442199707,
"learning_rate": 9.4e-05,
"loss": 50.2349,
"step": 47
},
{
"epoch": 0.025892586224334987,
"grad_norm": 37.404727935791016,
"learning_rate": 9.6e-05,
"loss": 48.7437,
"step": 48
},
{
"epoch": 0.02643201510400863,
"grad_norm": 48.31377410888672,
"learning_rate": 9.8e-05,
"loss": 54.4895,
"step": 49
},
{
"epoch": 0.026971443983682276,
"grad_norm": 51.360191345214844,
"learning_rate": 0.0001,
"loss": 57.4654,
"step": 50
},
{
"epoch": 0.02751087286335592,
"grad_norm": 23.211647033691406,
"learning_rate": 0.00010200000000000001,
"loss": 36.3068,
"step": 51
},
{
"epoch": 0.028050301743029568,
"grad_norm": 34.541805267333984,
"learning_rate": 0.00010400000000000001,
"loss": 75.6809,
"step": 52
},
{
"epoch": 0.028589730622703213,
"grad_norm": 42.0761833190918,
"learning_rate": 0.00010600000000000002,
"loss": 96.818,
"step": 53
},
{
"epoch": 0.029129159502376857,
"grad_norm": 43.26933670043945,
"learning_rate": 0.00010800000000000001,
"loss": 101.1451,
"step": 54
},
{
"epoch": 0.029668588382050505,
"grad_norm": 51.45765686035156,
"learning_rate": 0.00011000000000000002,
"loss": 115.0704,
"step": 55
},
{
"epoch": 0.03020801726172415,
"grad_norm": 43.3838005065918,
"learning_rate": 0.00011200000000000001,
"loss": 112.4747,
"step": 56
},
{
"epoch": 0.030747446141397794,
"grad_norm": 59.83226013183594,
"learning_rate": 0.00011399999999999999,
"loss": 114.741,
"step": 57
},
{
"epoch": 0.03128687502107144,
"grad_norm": 38.54649353027344,
"learning_rate": 0.000116,
"loss": 100.9508,
"step": 58
},
{
"epoch": 0.03182630390074508,
"grad_norm": 34.7606086730957,
"learning_rate": 0.000118,
"loss": 87.4097,
"step": 59
},
{
"epoch": 0.03236573278041873,
"grad_norm": 34.808265686035156,
"learning_rate": 0.00012,
"loss": 94.2411,
"step": 60
},
{
"epoch": 0.03290516166009238,
"grad_norm": 33.40951156616211,
"learning_rate": 0.000122,
"loss": 85.6042,
"step": 61
},
{
"epoch": 0.03344459053976602,
"grad_norm": 25.83111572265625,
"learning_rate": 0.000124,
"loss": 79.6747,
"step": 62
},
{
"epoch": 0.03398401941943967,
"grad_norm": 51.73832321166992,
"learning_rate": 0.000126,
"loss": 65.953,
"step": 63
},
{
"epoch": 0.034523448299113316,
"grad_norm": 38.63320541381836,
"learning_rate": 0.00012800000000000002,
"loss": 72.4182,
"step": 64
},
{
"epoch": 0.03506287717878696,
"grad_norm": 20.10302734375,
"learning_rate": 0.00013000000000000002,
"loss": 61.2385,
"step": 65
},
{
"epoch": 0.035602306058460605,
"grad_norm": 27.804248809814453,
"learning_rate": 0.000132,
"loss": 59.6029,
"step": 66
},
{
"epoch": 0.03614173493813425,
"grad_norm": 30.542932510375977,
"learning_rate": 0.000134,
"loss": 51.2451,
"step": 67
},
{
"epoch": 0.036681163817807894,
"grad_norm": 70.11331176757812,
"learning_rate": 0.00013600000000000003,
"loss": 51.116,
"step": 68
},
{
"epoch": 0.03722059269748154,
"grad_norm": 155.8134307861328,
"learning_rate": 0.000138,
"loss": 76.3231,
"step": 69
},
{
"epoch": 0.03776002157715519,
"grad_norm": 146.5844268798828,
"learning_rate": 0.00014,
"loss": 68.6173,
"step": 70
},
{
"epoch": 0.03829945045682883,
"grad_norm": 102.16127014160156,
"learning_rate": 0.000142,
"loss": 72.0777,
"step": 71
},
{
"epoch": 0.03883887933650248,
"grad_norm": 40.04204559326172,
"learning_rate": 0.000144,
"loss": 52.744,
"step": 72
},
{
"epoch": 0.039378308216176126,
"grad_norm": 75.35163116455078,
"learning_rate": 0.000146,
"loss": 59.1683,
"step": 73
},
{
"epoch": 0.03991773709584977,
"grad_norm": 77.30841827392578,
"learning_rate": 0.000148,
"loss": 51.3059,
"step": 74
},
{
"epoch": 0.040457165975523415,
"grad_norm": 52.49984359741211,
"learning_rate": 0.00015000000000000001,
"loss": 52.407,
"step": 75
},
{
"epoch": 0.04099659485519706,
"grad_norm": 35.61119842529297,
"learning_rate": 0.000152,
"loss": 50.8863,
"step": 76
},
{
"epoch": 0.041536023734870704,
"grad_norm": 34.10403060913086,
"learning_rate": 0.000154,
"loss": 53.6901,
"step": 77
},
{
"epoch": 0.04207545261454435,
"grad_norm": 39.79935836791992,
"learning_rate": 0.00015600000000000002,
"loss": 49.8857,
"step": 78
},
{
"epoch": 0.042614881494218,
"grad_norm": 35.74922561645508,
"learning_rate": 0.00015800000000000002,
"loss": 62.577,
"step": 79
},
{
"epoch": 0.04315431037389164,
"grad_norm": 31.491291046142578,
"learning_rate": 0.00016,
"loss": 52.0815,
"step": 80
},
{
"epoch": 0.04369373925356529,
"grad_norm": 23.866592407226562,
"learning_rate": 0.000162,
"loss": 64.6077,
"step": 81
},
{
"epoch": 0.04423316813323893,
"grad_norm": 28.5296688079834,
"learning_rate": 0.000164,
"loss": 59.5244,
"step": 82
},
{
"epoch": 0.04477259701291258,
"grad_norm": 33.92407989501953,
"learning_rate": 0.000166,
"loss": 62.9956,
"step": 83
},
{
"epoch": 0.045312025892586226,
"grad_norm": 27.05453109741211,
"learning_rate": 0.000168,
"loss": 52.9777,
"step": 84
},
{
"epoch": 0.04585145477225987,
"grad_norm": 23.927709579467773,
"learning_rate": 0.00017,
"loss": 56.0232,
"step": 85
},
{
"epoch": 0.046390883651933515,
"grad_norm": 31.250370025634766,
"learning_rate": 0.000172,
"loss": 55.1487,
"step": 86
},
{
"epoch": 0.04693031253160716,
"grad_norm": 32.98558044433594,
"learning_rate": 0.000174,
"loss": 54.3132,
"step": 87
},
{
"epoch": 0.047469741411280804,
"grad_norm": 39.15415954589844,
"learning_rate": 0.00017600000000000002,
"loss": 56.2989,
"step": 88
},
{
"epoch": 0.04800917029095445,
"grad_norm": 32.42843246459961,
"learning_rate": 0.00017800000000000002,
"loss": 41.7672,
"step": 89
},
{
"epoch": 0.0485485991706281,
"grad_norm": 42.03153610229492,
"learning_rate": 0.00018,
"loss": 50.3046,
"step": 90
},
{
"epoch": 0.04908802805030174,
"grad_norm": 38.14472961425781,
"learning_rate": 0.000182,
"loss": 50.2817,
"step": 91
},
{
"epoch": 0.04962745692997539,
"grad_norm": 32.74757385253906,
"learning_rate": 0.00018400000000000003,
"loss": 47.9721,
"step": 92
},
{
"epoch": 0.05016688580964904,
"grad_norm": 41.20277404785156,
"learning_rate": 0.00018600000000000002,
"loss": 48.2985,
"step": 93
},
{
"epoch": 0.05070631468932268,
"grad_norm": 42.31992721557617,
"learning_rate": 0.000188,
"loss": 58.7386,
"step": 94
},
{
"epoch": 0.051245743568996326,
"grad_norm": 28.106618881225586,
"learning_rate": 0.00019,
"loss": 46.3057,
"step": 95
},
{
"epoch": 0.051785172448669974,
"grad_norm": 37.70038604736328,
"learning_rate": 0.000192,
"loss": 35.5874,
"step": 96
},
{
"epoch": 0.052324601328343615,
"grad_norm": 36.007530212402344,
"learning_rate": 0.000194,
"loss": 47.9065,
"step": 97
},
{
"epoch": 0.05286403020801726,
"grad_norm": 29.738492965698242,
"learning_rate": 0.000196,
"loss": 49.6222,
"step": 98
},
{
"epoch": 0.05340345908769091,
"grad_norm": 42.806785583496094,
"learning_rate": 0.00019800000000000002,
"loss": 44.6868,
"step": 99
},
{
"epoch": 0.05394288796736455,
"grad_norm": 31.359643936157227,
"learning_rate": 0.0002,
"loss": 30.5743,
"step": 100
},
{
"epoch": 0.0544823168470382,
"grad_norm": 24.176820755004883,
"learning_rate": 0.00019999998344063995,
"loss": 41.8829,
"step": 101
},
{
"epoch": 0.05502174572671184,
"grad_norm": 43.5556755065918,
"learning_rate": 0.00019999993376256528,
"loss": 64.5931,
"step": 102
},
{
"epoch": 0.05556117460638549,
"grad_norm": 35.98505401611328,
"learning_rate": 0.00019999985096579245,
"loss": 94.4231,
"step": 103
},
{
"epoch": 0.056100603486059136,
"grad_norm": 35.83631134033203,
"learning_rate": 0.00019999973505034887,
"loss": 113.3877,
"step": 104
},
{
"epoch": 0.05664003236573278,
"grad_norm": 30.29425621032715,
"learning_rate": 0.00019999958601627296,
"loss": 113.0325,
"step": 105
},
{
"epoch": 0.057179461245406425,
"grad_norm": 27.389789581298828,
"learning_rate": 0.000199999403863614,
"loss": 111.3191,
"step": 106
},
{
"epoch": 0.05771889012508007,
"grad_norm": 27.400251388549805,
"learning_rate": 0.00019999918859243244,
"loss": 97.0415,
"step": 107
},
{
"epoch": 0.058258319004753714,
"grad_norm": 20.399946212768555,
"learning_rate": 0.0001999989402027995,
"loss": 90.2641,
"step": 108
},
{
"epoch": 0.05879774788442736,
"grad_norm": 25.029308319091797,
"learning_rate": 0.0001999986586947974,
"loss": 94.4251,
"step": 109
},
{
"epoch": 0.05933717676410101,
"grad_norm": 29.495418548583984,
"learning_rate": 0.00019999834406851945,
"loss": 94.9159,
"step": 110
},
{
"epoch": 0.05987660564377465,
"grad_norm": 19.77571678161621,
"learning_rate": 0.0001999979963240698,
"loss": 75.4925,
"step": 111
},
{
"epoch": 0.0604160345234483,
"grad_norm": 25.004566192626953,
"learning_rate": 0.00019999761546156365,
"loss": 71.3454,
"step": 112
},
{
"epoch": 0.06095546340312195,
"grad_norm": 34.21379852294922,
"learning_rate": 0.00019999720148112715,
"loss": 66.511,
"step": 113
},
{
"epoch": 0.06149489228279559,
"grad_norm": 22.71439552307129,
"learning_rate": 0.00019999675438289738,
"loss": 52.0498,
"step": 114
},
{
"epoch": 0.062034321162469236,
"grad_norm": 24.381750106811523,
"learning_rate": 0.0001999962741670224,
"loss": 55.2827,
"step": 115
},
{
"epoch": 0.06257375004214288,
"grad_norm": 37.246803283691406,
"learning_rate": 0.00019999576083366125,
"loss": 54.9355,
"step": 116
},
{
"epoch": 0.06311317892181653,
"grad_norm": 81.53564453125,
"learning_rate": 0.00019999521438298398,
"loss": 59.4422,
"step": 117
},
{
"epoch": 0.06365260780149017,
"grad_norm": 129.4823760986328,
"learning_rate": 0.00019999463481517156,
"loss": 67.393,
"step": 118
},
{
"epoch": 0.06419203668116381,
"grad_norm": 77.96698760986328,
"learning_rate": 0.00019999402213041588,
"loss": 67.9443,
"step": 119
},
{
"epoch": 0.06473146556083746,
"grad_norm": 53.094512939453125,
"learning_rate": 0.0001999933763289199,
"loss": 61.054,
"step": 120
},
{
"epoch": 0.06527089444051111,
"grad_norm": 52.896366119384766,
"learning_rate": 0.00019999269741089752,
"loss": 62.3436,
"step": 121
},
{
"epoch": 0.06581032332018476,
"grad_norm": 57.282318115234375,
"learning_rate": 0.00019999198537657353,
"loss": 56.6129,
"step": 122
},
{
"epoch": 0.0663497521998584,
"grad_norm": 46.553062438964844,
"learning_rate": 0.0001999912402261838,
"loss": 55.701,
"step": 123
},
{
"epoch": 0.06688918107953204,
"grad_norm": 28.822669982910156,
"learning_rate": 0.00019999046195997512,
"loss": 54.2102,
"step": 124
},
{
"epoch": 0.06742860995920569,
"grad_norm": 28.726089477539062,
"learning_rate": 0.00019998965057820516,
"loss": 56.0332,
"step": 125
},
{
"epoch": 0.06796803883887934,
"grad_norm": 26.886003494262695,
"learning_rate": 0.0001999888060811427,
"loss": 43.4516,
"step": 126
},
{
"epoch": 0.06850746771855298,
"grad_norm": 31.9282169342041,
"learning_rate": 0.00019998792846906747,
"loss": 52.2149,
"step": 127
},
{
"epoch": 0.06904689659822663,
"grad_norm": 38.317962646484375,
"learning_rate": 0.00019998701774227005,
"loss": 54.0044,
"step": 128
},
{
"epoch": 0.06958632547790028,
"grad_norm": 31.158544540405273,
"learning_rate": 0.00019998607390105209,
"loss": 55.2255,
"step": 129
},
{
"epoch": 0.07012575435757391,
"grad_norm": 33.239166259765625,
"learning_rate": 0.00019998509694572615,
"loss": 56.3811,
"step": 130
},
{
"epoch": 0.07066518323724756,
"grad_norm": 30.34086799621582,
"learning_rate": 0.00019998408687661582,
"loss": 52.0529,
"step": 131
},
{
"epoch": 0.07120461211692121,
"grad_norm": 24.05341911315918,
"learning_rate": 0.00019998304369405563,
"loss": 60.5602,
"step": 132
},
{
"epoch": 0.07174404099659486,
"grad_norm": 26.90273094177246,
"learning_rate": 0.00019998196739839103,
"loss": 57.3375,
"step": 133
},
{
"epoch": 0.0722834698762685,
"grad_norm": 24.157773971557617,
"learning_rate": 0.0001999808579899785,
"loss": 47.7251,
"step": 134
},
{
"epoch": 0.07282289875594215,
"grad_norm": 28.088014602661133,
"learning_rate": 0.00019997971546918545,
"loss": 56.1037,
"step": 135
},
{
"epoch": 0.07336232763561579,
"grad_norm": 32.39021682739258,
"learning_rate": 0.00019997853983639029,
"loss": 52.0922,
"step": 136
},
{
"epoch": 0.07390175651528944,
"grad_norm": 29.597578048706055,
"learning_rate": 0.0001999773310919824,
"loss": 46.3537,
"step": 137
},
{
"epoch": 0.07444118539496308,
"grad_norm": 38.31181335449219,
"learning_rate": 0.000199976089236362,
"loss": 46.8711,
"step": 138
},
{
"epoch": 0.07498061427463673,
"grad_norm": 39.67713165283203,
"learning_rate": 0.00019997481426994044,
"loss": 45.0961,
"step": 139
},
{
"epoch": 0.07552004315431038,
"grad_norm": 48.8436164855957,
"learning_rate": 0.00019997350619314,
"loss": 48.7547,
"step": 140
},
{
"epoch": 0.07605947203398401,
"grad_norm": 88.95709991455078,
"learning_rate": 0.00019997216500639383,
"loss": 50.3681,
"step": 141
},
{
"epoch": 0.07659890091365766,
"grad_norm": 34.2819938659668,
"learning_rate": 0.0001999707907101462,
"loss": 44.3903,
"step": 142
},
{
"epoch": 0.07713832979333131,
"grad_norm": 42.79631042480469,
"learning_rate": 0.00019996938330485217,
"loss": 31.0566,
"step": 143
},
{
"epoch": 0.07767775867300496,
"grad_norm": 37.28693389892578,
"learning_rate": 0.00019996794279097791,
"loss": 34.0999,
"step": 144
},
{
"epoch": 0.0782171875526786,
"grad_norm": 43.65718460083008,
"learning_rate": 0.00019996646916900051,
"loss": 48.7369,
"step": 145
},
{
"epoch": 0.07875661643235225,
"grad_norm": 39.86713409423828,
"learning_rate": 0.00019996496243940794,
"loss": 36.6841,
"step": 146
},
{
"epoch": 0.07929604531202589,
"grad_norm": 32.35002899169922,
"learning_rate": 0.0001999634226026993,
"loss": 43.1344,
"step": 147
},
{
"epoch": 0.07983547419169953,
"grad_norm": 36.14616775512695,
"learning_rate": 0.0001999618496593845,
"loss": 51.9779,
"step": 148
},
{
"epoch": 0.08037490307137318,
"grad_norm": 31.071197509765625,
"learning_rate": 0.00019996024360998456,
"loss": 39.5621,
"step": 149
},
{
"epoch": 0.08091433195104683,
"grad_norm": 33.61774444580078,
"learning_rate": 0.00019995860445503127,
"loss": 37.7614,
"step": 150
},
{
"epoch": 0.08145376083072048,
"grad_norm": 22.93950653076172,
"learning_rate": 0.00019995693219506758,
"loss": 59.2331,
"step": 151
},
{
"epoch": 0.08199318971039413,
"grad_norm": 31.307132720947266,
"learning_rate": 0.00019995522683064726,
"loss": 70.8054,
"step": 152
},
{
"epoch": 0.08253261859006776,
"grad_norm": 28.894466400146484,
"learning_rate": 0.00019995348836233516,
"loss": 84.8097,
"step": 153
},
{
"epoch": 0.08307204746974141,
"grad_norm": 26.76435661315918,
"learning_rate": 0.000199951716790707,
"loss": 101.4707,
"step": 154
},
{
"epoch": 0.08361147634941506,
"grad_norm": 26.842918395996094,
"learning_rate": 0.00019994991211634954,
"loss": 107.518,
"step": 155
},
{
"epoch": 0.0841509052290887,
"grad_norm": 25.251588821411133,
"learning_rate": 0.00019994807433986047,
"loss": 106.076,
"step": 156
},
{
"epoch": 0.08469033410876235,
"grad_norm": 28.60271453857422,
"learning_rate": 0.0001999462034618484,
"loss": 96.3093,
"step": 157
},
{
"epoch": 0.085229762988436,
"grad_norm": 22.537473678588867,
"learning_rate": 0.00019994429948293291,
"loss": 88.6475,
"step": 158
},
{
"epoch": 0.08576919186810963,
"grad_norm": 18.868396759033203,
"learning_rate": 0.00019994236240374465,
"loss": 92.4222,
"step": 159
},
{
"epoch": 0.08630862074778328,
"grad_norm": 21.84971046447754,
"learning_rate": 0.00019994039222492513,
"loss": 88.0079,
"step": 160
},
{
"epoch": 0.08684804962745693,
"grad_norm": 23.634244918823242,
"learning_rate": 0.00019993838894712682,
"loss": 77.0574,
"step": 161
},
{
"epoch": 0.08738747850713058,
"grad_norm": 18.22877311706543,
"learning_rate": 0.00019993635257101322,
"loss": 67.3958,
"step": 162
},
{
"epoch": 0.08792690738680423,
"grad_norm": 21.62260627746582,
"learning_rate": 0.00019993428309725872,
"loss": 65.1832,
"step": 163
},
{
"epoch": 0.08846633626647786,
"grad_norm": 18.148618698120117,
"learning_rate": 0.0001999321805265487,
"loss": 63.1231,
"step": 164
},
{
"epoch": 0.08900576514615151,
"grad_norm": 20.20022201538086,
"learning_rate": 0.00019993004485957956,
"loss": 59.0852,
"step": 165
},
{
"epoch": 0.08954519402582516,
"grad_norm": 28.2082576751709,
"learning_rate": 0.00019992787609705853,
"loss": 55.8505,
"step": 166
},
{
"epoch": 0.0900846229054988,
"grad_norm": 43.48365020751953,
"learning_rate": 0.00019992567423970394,
"loss": 40.495,
"step": 167
},
{
"epoch": 0.09062405178517245,
"grad_norm": 149.13955688476562,
"learning_rate": 0.00019992343928824498,
"loss": 91.8388,
"step": 168
},
{
"epoch": 0.0911634806648461,
"grad_norm": 91.07251739501953,
"learning_rate": 0.00019992117124342183,
"loss": 61.9425,
"step": 169
},
{
"epoch": 0.09170290954451973,
"grad_norm": 65.70806121826172,
"learning_rate": 0.00019991887010598565,
"loss": 59.7979,
"step": 170
},
{
"epoch": 0.09224233842419338,
"grad_norm": 45.109580993652344,
"learning_rate": 0.00019991653587669855,
"loss": 63.235,
"step": 171
},
{
"epoch": 0.09278176730386703,
"grad_norm": 49.24695587158203,
"learning_rate": 0.00019991416855633364,
"loss": 55.8371,
"step": 172
},
{
"epoch": 0.09332119618354068,
"grad_norm": 44.50947952270508,
"learning_rate": 0.0001999117681456749,
"loss": 45.3712,
"step": 173
},
{
"epoch": 0.09386062506321433,
"grad_norm": 45.105506896972656,
"learning_rate": 0.00019990933464551728,
"loss": 59.354,
"step": 174
},
{
"epoch": 0.09440005394288797,
"grad_norm": 31.862106323242188,
"learning_rate": 0.0001999068680566668,
"loss": 49.2883,
"step": 175
},
{
"epoch": 0.09493948282256161,
"grad_norm": 34.86188507080078,
"learning_rate": 0.00019990436837994028,
"loss": 40.9445,
"step": 176
},
{
"epoch": 0.09547891170223526,
"grad_norm": 52.34774398803711,
"learning_rate": 0.00019990183561616567,
"loss": 54.3114,
"step": 177
},
{
"epoch": 0.0960183405819089,
"grad_norm": 30.12732696533203,
"learning_rate": 0.00019989926976618172,
"loss": 44.8966,
"step": 178
},
{
"epoch": 0.09655776946158255,
"grad_norm": 29.296287536621094,
"learning_rate": 0.00019989667083083825,
"loss": 47.5101,
"step": 179
},
{
"epoch": 0.0970971983412562,
"grad_norm": 42.42873764038086,
"learning_rate": 0.00019989403881099597,
"loss": 48.2378,
"step": 180
},
{
"epoch": 0.09763662722092983,
"grad_norm": 31.62274742126465,
"learning_rate": 0.00019989137370752657,
"loss": 42.1564,
"step": 181
},
{
"epoch": 0.09817605610060348,
"grad_norm": 30.754499435424805,
"learning_rate": 0.00019988867552131275,
"loss": 52.2929,
"step": 182
},
{
"epoch": 0.09871548498027713,
"grad_norm": 31.932157516479492,
"learning_rate": 0.000199885944253248,
"loss": 45.6226,
"step": 183
},
{
"epoch": 0.09925491385995078,
"grad_norm": 33.754722595214844,
"learning_rate": 0.00019988317990423703,
"loss": 39.9572,
"step": 184
},
{
"epoch": 0.09979434273962443,
"grad_norm": 33.33165740966797,
"learning_rate": 0.00019988038247519522,
"loss": 52.7357,
"step": 185
},
{
"epoch": 0.10033377161929807,
"grad_norm": 28.355619430541992,
"learning_rate": 0.0001998775519670491,
"loss": 39.8865,
"step": 186
},
{
"epoch": 0.10087320049897171,
"grad_norm": 60.16803741455078,
"learning_rate": 0.00019987468838073613,
"loss": 48.3595,
"step": 187
},
{
"epoch": 0.10141262937864536,
"grad_norm": 33.5135498046875,
"learning_rate": 0.00019987179171720464,
"loss": 34.3803,
"step": 188
},
{
"epoch": 0.101952058258319,
"grad_norm": 33.8374137878418,
"learning_rate": 0.00019986886197741403,
"loss": 46.4517,
"step": 189
},
{
"epoch": 0.10249148713799265,
"grad_norm": 26.143709182739258,
"learning_rate": 0.0001998658991623345,
"loss": 30.6351,
"step": 190
},
{
"epoch": 0.1030309160176663,
"grad_norm": 28.791723251342773,
"learning_rate": 0.0001998629032729474,
"loss": 44.2275,
"step": 191
},
{
"epoch": 0.10357034489733995,
"grad_norm": 33.818931579589844,
"learning_rate": 0.00019985987431024485,
"loss": 43.5677,
"step": 192
},
{
"epoch": 0.10410977377701358,
"grad_norm": 40.07392883300781,
"learning_rate": 0.00019985681227523006,
"loss": 34.5844,
"step": 193
},
{
"epoch": 0.10464920265668723,
"grad_norm": 30.963062286376953,
"learning_rate": 0.00019985371716891708,
"loss": 44.1099,
"step": 194
},
{
"epoch": 0.10518863153636088,
"grad_norm": 31.774293899536133,
"learning_rate": 0.000199850588992331,
"loss": 36.4496,
"step": 195
},
{
"epoch": 0.10572806041603452,
"grad_norm": 47.396575927734375,
"learning_rate": 0.00019984742774650785,
"loss": 50.9736,
"step": 196
},
{
"epoch": 0.10626748929570817,
"grad_norm": 58.573341369628906,
"learning_rate": 0.00019984423343249457,
"loss": 44.6643,
"step": 197
},
{
"epoch": 0.10680691817538182,
"grad_norm": 33.57207107543945,
"learning_rate": 0.00019984100605134906,
"loss": 36.4154,
"step": 198
},
{
"epoch": 0.10734634705505545,
"grad_norm": 33.817752838134766,
"learning_rate": 0.00019983774560414027,
"loss": 38.8474,
"step": 199
},
{
"epoch": 0.1078857759347291,
"grad_norm": 34.572608947753906,
"learning_rate": 0.00019983445209194791,
"loss": 30.1009,
"step": 200
},
{
"epoch": 0.1078857759347291,
"eval_loss": 1.836081624031067,
"eval_runtime": 141.0356,
"eval_samples_per_second": 2.12,
"eval_steps_per_second": 2.12,
"step": 200
},
{
"epoch": 0.10842520481440275,
"grad_norm": 23.590002059936523,
"learning_rate": 0.0001998311255158628,
"loss": 53.9458,
"step": 201
},
{
"epoch": 0.1089646336940764,
"grad_norm": 39.737159729003906,
"learning_rate": 0.00019982776587698666,
"loss": 85.7514,
"step": 202
},
{
"epoch": 0.10950406257375005,
"grad_norm": 35.41561508178711,
"learning_rate": 0.00019982437317643217,
"loss": 84.9662,
"step": 203
},
{
"epoch": 0.11004349145342368,
"grad_norm": 31.39605140686035,
"learning_rate": 0.0001998209474153229,
"loss": 110.0561,
"step": 204
},
{
"epoch": 0.11058292033309733,
"grad_norm": 30.160261154174805,
"learning_rate": 0.00019981748859479348,
"loss": 101.1574,
"step": 205
},
{
"epoch": 0.11112234921277098,
"grad_norm": 33.4417724609375,
"learning_rate": 0.00019981399671598939,
"loss": 116.0456,
"step": 206
},
{
"epoch": 0.11166177809244462,
"grad_norm": 34.16884994506836,
"learning_rate": 0.0001998104717800671,
"loss": 103.0287,
"step": 207
},
{
"epoch": 0.11220120697211827,
"grad_norm": 33.58393859863281,
"learning_rate": 0.00019980691378819406,
"loss": 95.5024,
"step": 208
},
{
"epoch": 0.11274063585179192,
"grad_norm": 29.785871505737305,
"learning_rate": 0.00019980332274154857,
"loss": 91.5854,
"step": 209
},
{
"epoch": 0.11328006473146555,
"grad_norm": 29.184667587280273,
"learning_rate": 0.00019979969864131997,
"loss": 86.9138,
"step": 210
},
{
"epoch": 0.1138194936111392,
"grad_norm": 25.164024353027344,
"learning_rate": 0.00019979604148870854,
"loss": 72.7827,
"step": 211
},
{
"epoch": 0.11435892249081285,
"grad_norm": 18.179292678833008,
"learning_rate": 0.00019979235128492545,
"loss": 67.364,
"step": 212
},
{
"epoch": 0.1148983513704865,
"grad_norm": 20.353260040283203,
"learning_rate": 0.00019978862803119284,
"loss": 60.0141,
"step": 213
},
{
"epoch": 0.11543778025016015,
"grad_norm": 27.25603485107422,
"learning_rate": 0.00019978487172874382,
"loss": 61.8063,
"step": 214
},
{
"epoch": 0.1159772091298338,
"grad_norm": 40.56468963623047,
"learning_rate": 0.00019978108237882244,
"loss": 51.2483,
"step": 215
},
{
"epoch": 0.11651663800950743,
"grad_norm": 64.65696716308594,
"learning_rate": 0.00019977725998268365,
"loss": 37.8312,
"step": 216
},
{
"epoch": 0.11705606688918108,
"grad_norm": 80.94468688964844,
"learning_rate": 0.00019977340454159343,
"loss": 55.2775,
"step": 217
},
{
"epoch": 0.11759549576885472,
"grad_norm": 100.61930084228516,
"learning_rate": 0.00019976951605682862,
"loss": 65.5767,
"step": 218
},
{
"epoch": 0.11813492464852837,
"grad_norm": 71.5768051147461,
"learning_rate": 0.00019976559452967703,
"loss": 57.5296,
"step": 219
},
{
"epoch": 0.11867435352820202,
"grad_norm": 37.10725021362305,
"learning_rate": 0.00019976163996143745,
"loss": 48.8497,
"step": 220
},
{
"epoch": 0.11921378240787567,
"grad_norm": 40.85627746582031,
"learning_rate": 0.00019975765235341955,
"loss": 47.6466,
"step": 221
},
{
"epoch": 0.1197532112875493,
"grad_norm": 55.1395263671875,
"learning_rate": 0.000199753631706944,
"loss": 60.2519,
"step": 222
},
{
"epoch": 0.12029264016722295,
"grad_norm": 42.060585021972656,
"learning_rate": 0.00019974957802334234,
"loss": 48.1031,
"step": 223
},
{
"epoch": 0.1208320690468966,
"grad_norm": 36.57340621948242,
"learning_rate": 0.00019974549130395713,
"loss": 43.3995,
"step": 224
},
{
"epoch": 0.12137149792657025,
"grad_norm": 31.497970581054688,
"learning_rate": 0.0001997413715501419,
"loss": 41.1591,
"step": 225
},
{
"epoch": 0.1219109268062439,
"grad_norm": 30.481502532958984,
"learning_rate": 0.00019973721876326094,
"loss": 38.0712,
"step": 226
},
{
"epoch": 0.12245035568591753,
"grad_norm": 38.2381477355957,
"learning_rate": 0.00019973303294468968,
"loss": 46.3861,
"step": 227
},
{
"epoch": 0.12298978456559118,
"grad_norm": 37.4508171081543,
"learning_rate": 0.0001997288140958144,
"loss": 49.3107,
"step": 228
},
{
"epoch": 0.12352921344526482,
"grad_norm": 37.3139533996582,
"learning_rate": 0.0001997245622180323,
"loss": 43.1914,
"step": 229
},
{
"epoch": 0.12406864232493847,
"grad_norm": 35.13384246826172,
"learning_rate": 0.0001997202773127516,
"loss": 45.7228,
"step": 230
},
{
"epoch": 0.12460807120461212,
"grad_norm": 37.45779037475586,
"learning_rate": 0.00019971595938139135,
"loss": 45.0848,
"step": 231
},
{
"epoch": 0.12514750008428577,
"grad_norm": 37.03962707519531,
"learning_rate": 0.00019971160842538162,
"loss": 46.3705,
"step": 232
},
{
"epoch": 0.12568692896395942,
"grad_norm": 30.98250389099121,
"learning_rate": 0.0001997072244461634,
"loss": 41.1065,
"step": 233
},
{
"epoch": 0.12622635784363306,
"grad_norm": 33.62482833862305,
"learning_rate": 0.00019970280744518854,
"loss": 41.8594,
"step": 234
},
{
"epoch": 0.1267657867233067,
"grad_norm": 45.488739013671875,
"learning_rate": 0.00019969835742392,
"loss": 38.6525,
"step": 235
},
{
"epoch": 0.12730521560298033,
"grad_norm": 43.84321594238281,
"learning_rate": 0.0001996938743838315,
"loss": 53.2114,
"step": 236
},
{
"epoch": 0.12784464448265398,
"grad_norm": 40.51958084106445,
"learning_rate": 0.00019968935832640782,
"loss": 50.4725,
"step": 237
},
{
"epoch": 0.12838407336232763,
"grad_norm": 35.1596794128418,
"learning_rate": 0.00019968480925314458,
"loss": 45.1618,
"step": 238
},
{
"epoch": 0.12892350224200128,
"grad_norm": 32.27614974975586,
"learning_rate": 0.00019968022716554832,
"loss": 38.2164,
"step": 239
},
{
"epoch": 0.12946293112167492,
"grad_norm": 33.67794418334961,
"learning_rate": 0.00019967561206513668,
"loss": 43.3203,
"step": 240
},
{
"epoch": 0.13000236000134857,
"grad_norm": 26.34979820251465,
"learning_rate": 0.00019967096395343806,
"loss": 32.1165,
"step": 241
},
{
"epoch": 0.13054178888102222,
"grad_norm": 33.10830307006836,
"learning_rate": 0.00019966628283199186,
"loss": 45.5207,
"step": 242
},
{
"epoch": 0.13108121776069587,
"grad_norm": 47.04872131347656,
"learning_rate": 0.00019966156870234844,
"loss": 44.7497,
"step": 243
},
{
"epoch": 0.13162064664036952,
"grad_norm": 38.99346160888672,
"learning_rate": 0.000199656821566069,
"loss": 43.9255,
"step": 244
},
{
"epoch": 0.13216007552004316,
"grad_norm": 29.892854690551758,
"learning_rate": 0.00019965204142472574,
"loss": 48.4896,
"step": 245
},
{
"epoch": 0.1326995043997168,
"grad_norm": 37.65726089477539,
"learning_rate": 0.00019964722827990185,
"loss": 37.7987,
"step": 246
},
{
"epoch": 0.13323893327939046,
"grad_norm": 41.673274993896484,
"learning_rate": 0.00019964238213319134,
"loss": 48.4095,
"step": 247
},
{
"epoch": 0.13377836215906408,
"grad_norm": 37.152793884277344,
"learning_rate": 0.00019963750298619917,
"loss": 33.8212,
"step": 248
},
{
"epoch": 0.13431779103873773,
"grad_norm": 43.92071533203125,
"learning_rate": 0.00019963259084054128,
"loss": 35.554,
"step": 249
},
{
"epoch": 0.13485721991841138,
"grad_norm": 39.161903381347656,
"learning_rate": 0.0001996276456978445,
"loss": 33.8096,
"step": 250
},
{
"epoch": 0.13539664879808502,
"grad_norm": 24.633363723754883,
"learning_rate": 0.00019962266755974657,
"loss": 46.0338,
"step": 251
},
{
"epoch": 0.13593607767775867,
"grad_norm": 54.83051300048828,
"learning_rate": 0.00019961765642789625,
"loss": 80.4599,
"step": 252
},
{
"epoch": 0.13647550655743232,
"grad_norm": 43.1768684387207,
"learning_rate": 0.0001996126123039531,
"loss": 84.3379,
"step": 253
},
{
"epoch": 0.13701493543710597,
"grad_norm": 24.49346160888672,
"learning_rate": 0.00019960753518958772,
"loss": 100.9898,
"step": 254
},
{
"epoch": 0.13755436431677961,
"grad_norm": 38.09309768676758,
"learning_rate": 0.00019960242508648154,
"loss": 101.0717,
"step": 255
},
{
"epoch": 0.13809379319645326,
"grad_norm": 40.072296142578125,
"learning_rate": 0.00019959728199632699,
"loss": 108.2131,
"step": 256
},
{
"epoch": 0.1386332220761269,
"grad_norm": 43.77210235595703,
"learning_rate": 0.0001995921059208274,
"loss": 111.636,
"step": 257
},
{
"epoch": 0.13917265095580056,
"grad_norm": 42.023155212402344,
"learning_rate": 0.00019958689686169697,
"loss": 90.4911,
"step": 258
},
{
"epoch": 0.13971207983547418,
"grad_norm": 27.917343139648438,
"learning_rate": 0.00019958165482066094,
"loss": 92.3676,
"step": 259
},
{
"epoch": 0.14025150871514783,
"grad_norm": 19.174135208129883,
"learning_rate": 0.00019957637979945537,
"loss": 88.4276,
"step": 260
},
{
"epoch": 0.14079093759482147,
"grad_norm": 22.779672622680664,
"learning_rate": 0.0001995710717998273,
"loss": 88.3991,
"step": 261
},
{
"epoch": 0.14133036647449512,
"grad_norm": 17.607568740844727,
"learning_rate": 0.00019956573082353463,
"loss": 77.4426,
"step": 262
},
{
"epoch": 0.14186979535416877,
"grad_norm": 22.228328704833984,
"learning_rate": 0.00019956035687234626,
"loss": 68.3415,
"step": 263
},
{
"epoch": 0.14240922423384242,
"grad_norm": 21.00279998779297,
"learning_rate": 0.00019955494994804198,
"loss": 70.7203,
"step": 264
},
{
"epoch": 0.14294865311351607,
"grad_norm": 27.789443969726562,
"learning_rate": 0.00019954951005241248,
"loss": 62.4471,
"step": 265
},
{
"epoch": 0.14348808199318971,
"grad_norm": 21.813310623168945,
"learning_rate": 0.0001995440371872594,
"loss": 65.5364,
"step": 266
},
{
"epoch": 0.14402751087286336,
"grad_norm": 22.338788986206055,
"learning_rate": 0.00019953853135439522,
"loss": 53.7872,
"step": 267
},
{
"epoch": 0.144566939752537,
"grad_norm": 17.053470611572266,
"learning_rate": 0.00019953299255564346,
"loss": 46.6823,
"step": 268
},
{
"epoch": 0.14510636863221066,
"grad_norm": 34.75794219970703,
"learning_rate": 0.0001995274207928385,
"loss": 32.208,
"step": 269
},
{
"epoch": 0.1456457975118843,
"grad_norm": 76.52667236328125,
"learning_rate": 0.00019952181606782565,
"loss": 52.4054,
"step": 270
},
{
"epoch": 0.14618522639155793,
"grad_norm": 71.48796844482422,
"learning_rate": 0.00019951617838246107,
"loss": 48.9668,
"step": 271
},
{
"epoch": 0.14672465527123157,
"grad_norm": 79.96577453613281,
"learning_rate": 0.00019951050773861192,
"loss": 61.6082,
"step": 272
},
{
"epoch": 0.14726408415090522,
"grad_norm": 42.05474090576172,
"learning_rate": 0.0001995048041381562,
"loss": 50.8627,
"step": 273
},
{
"epoch": 0.14780351303057887,
"grad_norm": 43.19125747680664,
"learning_rate": 0.00019949906758298295,
"loss": 45.519,
"step": 274
},
{
"epoch": 0.14834294191025252,
"grad_norm": 47.39426040649414,
"learning_rate": 0.00019949329807499198,
"loss": 51.654,
"step": 275
},
{
"epoch": 0.14888237078992617,
"grad_norm": 36.0722770690918,
"learning_rate": 0.00019948749561609415,
"loss": 46.8854,
"step": 276
},
{
"epoch": 0.14942179966959981,
"grad_norm": 33.252742767333984,
"learning_rate": 0.00019948166020821107,
"loss": 46.7532,
"step": 277
},
{
"epoch": 0.14996122854927346,
"grad_norm": 33.89019012451172,
"learning_rate": 0.0001994757918532754,
"loss": 49.6403,
"step": 278
},
{
"epoch": 0.1505006574289471,
"grad_norm": 37.914676666259766,
"learning_rate": 0.00019946989055323066,
"loss": 54.5018,
"step": 279
},
{
"epoch": 0.15104008630862076,
"grad_norm": 37.611061096191406,
"learning_rate": 0.00019946395631003128,
"loss": 50.6423,
"step": 280
},
{
"epoch": 0.1515795151882944,
"grad_norm": 36.489723205566406,
"learning_rate": 0.00019945798912564264,
"loss": 45.9299,
"step": 281
},
{
"epoch": 0.15211894406796803,
"grad_norm": 31.33220100402832,
"learning_rate": 0.00019945198900204095,
"loss": 47.4519,
"step": 282
},
{
"epoch": 0.15265837294764167,
"grad_norm": 32.4266242980957,
"learning_rate": 0.00019944595594121337,
"loss": 40.0806,
"step": 283
},
{
"epoch": 0.15319780182731532,
"grad_norm": 38.17313003540039,
"learning_rate": 0.00019943988994515797,
"loss": 39.9765,
"step": 284
},
{
"epoch": 0.15373723070698897,
"grad_norm": 40.299354553222656,
"learning_rate": 0.00019943379101588376,
"loss": 40.7812,
"step": 285
},
{
"epoch": 0.15427665958666262,
"grad_norm": 42.34661102294922,
"learning_rate": 0.00019942765915541063,
"loss": 31.2513,
"step": 286
},
{
"epoch": 0.15481608846633627,
"grad_norm": 46.61203384399414,
"learning_rate": 0.00019942149436576938,
"loss": 41.5619,
"step": 287
},
{
"epoch": 0.1553555173460099,
"grad_norm": 39.79526901245117,
"learning_rate": 0.00019941529664900168,
"loss": 38.13,
"step": 288
},
{
"epoch": 0.15589494622568356,
"grad_norm": 42.995567321777344,
"learning_rate": 0.0001994090660071601,
"loss": 41.3515,
"step": 289
},
{
"epoch": 0.1564343751053572,
"grad_norm": 34.27892303466797,
"learning_rate": 0.00019940280244230824,
"loss": 41.1277,
"step": 290
},
{
"epoch": 0.15697380398503086,
"grad_norm": 29.622488021850586,
"learning_rate": 0.00019939650595652045,
"loss": 49.2284,
"step": 291
},
{
"epoch": 0.1575132328647045,
"grad_norm": 36.693119049072266,
"learning_rate": 0.00019939017655188206,
"loss": 35.5444,
"step": 292
},
{
"epoch": 0.15805266174437815,
"grad_norm": 30.75679588317871,
"learning_rate": 0.00019938381423048932,
"loss": 34.9666,
"step": 293
},
{
"epoch": 0.15859209062405177,
"grad_norm": 35.84019088745117,
"learning_rate": 0.00019937741899444928,
"loss": 39.4625,
"step": 294
},
{
"epoch": 0.15913151950372542,
"grad_norm": 35.854496002197266,
"learning_rate": 0.00019937099084588002,
"loss": 37.2887,
"step": 295
},
{
"epoch": 0.15967094838339907,
"grad_norm": 33.07613754272461,
"learning_rate": 0.00019936452978691044,
"loss": 34.5375,
"step": 296
},
{
"epoch": 0.16021037726307272,
"grad_norm": 43.46371078491211,
"learning_rate": 0.00019935803581968035,
"loss": 30.3173,
"step": 297
},
{
"epoch": 0.16074980614274637,
"grad_norm": 52.03241729736328,
"learning_rate": 0.00019935150894634046,
"loss": 42.4725,
"step": 298
},
{
"epoch": 0.16128923502242,
"grad_norm": 50.36249542236328,
"learning_rate": 0.00019934494916905245,
"loss": 37.3647,
"step": 299
},
{
"epoch": 0.16182866390209366,
"grad_norm": 41.50126647949219,
"learning_rate": 0.00019933835648998875,
"loss": 24.2931,
"step": 300
},
{
"epoch": 0.1623680927817673,
"grad_norm": 31.253141403198242,
"learning_rate": 0.00019933173091133286,
"loss": 44.7853,
"step": 301
},
{
"epoch": 0.16290752166144096,
"grad_norm": 96.83972930908203,
"learning_rate": 0.000199325072435279,
"loss": 84.9808,
"step": 302
},
{
"epoch": 0.1634469505411146,
"grad_norm": 91.9966049194336,
"learning_rate": 0.0001993183810640324,
"loss": 99.5531,
"step": 303
},
{
"epoch": 0.16398637942078825,
"grad_norm": 66.43877410888672,
"learning_rate": 0.00019931165679980918,
"loss": 105.7665,
"step": 304
},
{
"epoch": 0.16452580830046187,
"grad_norm": 35.26411056518555,
"learning_rate": 0.00019930489964483633,
"loss": 109.6819,
"step": 305
},
{
"epoch": 0.16506523718013552,
"grad_norm": 47.18457794189453,
"learning_rate": 0.00019929810960135172,
"loss": 113.4221,
"step": 306
},
{
"epoch": 0.16560466605980917,
"grad_norm": 49.24475860595703,
"learning_rate": 0.00019929128667160408,
"loss": 108.0158,
"step": 307
},
{
"epoch": 0.16614409493948282,
"grad_norm": 45.63924026489258,
"learning_rate": 0.00019928443085785318,
"loss": 94.1414,
"step": 308
},
{
"epoch": 0.16668352381915646,
"grad_norm": 46.688350677490234,
"learning_rate": 0.00019927754216236948,
"loss": 87.8688,
"step": 309
},
{
"epoch": 0.1672229526988301,
"grad_norm": 39.54045486450195,
"learning_rate": 0.00019927062058743448,
"loss": 92.6019,
"step": 310
},
{
"epoch": 0.16776238157850376,
"grad_norm": 29.866121292114258,
"learning_rate": 0.0001992636661353405,
"loss": 81.9024,
"step": 311
},
{
"epoch": 0.1683018104581774,
"grad_norm": 22.350112915039062,
"learning_rate": 0.0001992566788083908,
"loss": 68.4321,
"step": 312
},
{
"epoch": 0.16884123933785106,
"grad_norm": 21.657258987426758,
"learning_rate": 0.00019924965860889944,
"loss": 65.7434,
"step": 313
},
{
"epoch": 0.1693806682175247,
"grad_norm": 18.347572326660156,
"learning_rate": 0.00019924260553919146,
"loss": 62.485,
"step": 314
},
{
"epoch": 0.16992009709719835,
"grad_norm": 28.368114471435547,
"learning_rate": 0.00019923551960160268,
"loss": 53.7759,
"step": 315
},
{
"epoch": 0.170459525976872,
"grad_norm": 35.214988708496094,
"learning_rate": 0.00019922840079848,
"loss": 45.4414,
"step": 316
},
{
"epoch": 0.17099895485654562,
"grad_norm": 38.698760986328125,
"learning_rate": 0.00019922124913218094,
"loss": 37.665,
"step": 317
},
{
"epoch": 0.17153838373621927,
"grad_norm": 43.39471435546875,
"learning_rate": 0.0001992140646050741,
"loss": 51.4899,
"step": 318
},
{
"epoch": 0.17207781261589292,
"grad_norm": 43.52251434326172,
"learning_rate": 0.00019920684721953894,
"loss": 48.5712,
"step": 319
},
{
"epoch": 0.17261724149556656,
"grad_norm": 60.897579193115234,
"learning_rate": 0.00019919959697796568,
"loss": 59.9231,
"step": 320
},
{
"epoch": 0.1731566703752402,
"grad_norm": 37.93972396850586,
"learning_rate": 0.0001991923138827556,
"loss": 47.906,
"step": 321
},
{
"epoch": 0.17369609925491386,
"grad_norm": 44.32222366333008,
"learning_rate": 0.0001991849979363207,
"loss": 54.5404,
"step": 322
},
{
"epoch": 0.1742355281345875,
"grad_norm": 37.367671966552734,
"learning_rate": 0.00019917764914108394,
"loss": 49.3113,
"step": 323
},
{
"epoch": 0.17477495701426116,
"grad_norm": 43.20479965209961,
"learning_rate": 0.00019917026749947917,
"loss": 41.9015,
"step": 324
},
{
"epoch": 0.1753143858939348,
"grad_norm": 36.7598991394043,
"learning_rate": 0.0001991628530139511,
"loss": 43.7222,
"step": 325
},
{
"epoch": 0.17585381477360845,
"grad_norm": 33.30655288696289,
"learning_rate": 0.0001991554056869553,
"loss": 48.4387,
"step": 326
},
{
"epoch": 0.1763932436532821,
"grad_norm": 32.89339828491211,
"learning_rate": 0.00019914792552095818,
"loss": 51.108,
"step": 327
},
{
"epoch": 0.17693267253295572,
"grad_norm": 31.422489166259766,
"learning_rate": 0.00019914041251843716,
"loss": 42.9287,
"step": 328
},
{
"epoch": 0.17747210141262937,
"grad_norm": 33.38264465332031,
"learning_rate": 0.00019913286668188037,
"loss": 47.0867,
"step": 329
},
{
"epoch": 0.17801153029230302,
"grad_norm": 37.976837158203125,
"learning_rate": 0.00019912528801378698,
"loss": 38.2593,
"step": 330
},
{
"epoch": 0.17855095917197666,
"grad_norm": 35.707054138183594,
"learning_rate": 0.0001991176765166669,
"loss": 44.5348,
"step": 331
},
{
"epoch": 0.1790903880516503,
"grad_norm": 43.86237335205078,
"learning_rate": 0.00019911003219304094,
"loss": 40.4868,
"step": 332
},
{
"epoch": 0.17962981693132396,
"grad_norm": 54.88194274902344,
"learning_rate": 0.00019910235504544082,
"loss": 38.935,
"step": 333
},
{
"epoch": 0.1801692458109976,
"grad_norm": 43.87349319458008,
"learning_rate": 0.00019909464507640915,
"loss": 43.0978,
"step": 334
},
{
"epoch": 0.18070867469067126,
"grad_norm": 43.421932220458984,
"learning_rate": 0.0001990869022884993,
"loss": 39.2888,
"step": 335
},
{
"epoch": 0.1812481035703449,
"grad_norm": 41.14269256591797,
"learning_rate": 0.00019907912668427566,
"loss": 42.6139,
"step": 336
},
{
"epoch": 0.18178753245001855,
"grad_norm": 38.619380950927734,
"learning_rate": 0.00019907131826631336,
"loss": 40.0248,
"step": 337
},
{
"epoch": 0.1823269613296922,
"grad_norm": 33.65724563598633,
"learning_rate": 0.00019906347703719845,
"loss": 38.7406,
"step": 338
},
{
"epoch": 0.18286639020936585,
"grad_norm": 35.25956344604492,
"learning_rate": 0.0001990556029995279,
"loss": 39.2734,
"step": 339
},
{
"epoch": 0.18340581908903947,
"grad_norm": 36.87468719482422,
"learning_rate": 0.00019904769615590942,
"loss": 40.6619,
"step": 340
},
{
"epoch": 0.18394524796871312,
"grad_norm": 32.0380973815918,
"learning_rate": 0.00019903975650896168,
"loss": 39.8376,
"step": 341
},
{
"epoch": 0.18448467684838676,
"grad_norm": 33.44660949707031,
"learning_rate": 0.0001990317840613142,
"loss": 33.338,
"step": 342
},
{
"epoch": 0.1850241057280604,
"grad_norm": 36.242523193359375,
"learning_rate": 0.00019902377881560735,
"loss": 35.0493,
"step": 343
},
{
"epoch": 0.18556353460773406,
"grad_norm": 37.39813232421875,
"learning_rate": 0.00019901574077449232,
"loss": 26.9563,
"step": 344
},
{
"epoch": 0.1861029634874077,
"grad_norm": 35.84196472167969,
"learning_rate": 0.0001990076699406313,
"loss": 33.7825,
"step": 345
},
{
"epoch": 0.18664239236708136,
"grad_norm": 38.69563293457031,
"learning_rate": 0.00019899956631669717,
"loss": 29.9582,
"step": 346
},
{
"epoch": 0.187181821246755,
"grad_norm": 47.82805633544922,
"learning_rate": 0.00019899142990537376,
"loss": 33.9471,
"step": 347
},
{
"epoch": 0.18772125012642865,
"grad_norm": 36.29233169555664,
"learning_rate": 0.00019898326070935579,
"loss": 28.1711,
"step": 348
},
{
"epoch": 0.1882606790061023,
"grad_norm": 45.26416015625,
"learning_rate": 0.00019897505873134872,
"loss": 33.76,
"step": 349
},
{
"epoch": 0.18880010788577595,
"grad_norm": 39.766441345214844,
"learning_rate": 0.000198966823974069,
"loss": 25.629,
"step": 350
},
{
"epoch": 0.18933953676544957,
"grad_norm": 30.092906951904297,
"learning_rate": 0.00019895855644024387,
"loss": 45.1687,
"step": 351
},
{
"epoch": 0.18987896564512322,
"grad_norm": 61.02379608154297,
"learning_rate": 0.00019895025613261136,
"loss": 77.4727,
"step": 352
},
{
"epoch": 0.19041839452479686,
"grad_norm": 51.788063049316406,
"learning_rate": 0.00019894192305392055,
"loss": 82.3816,
"step": 353
},
{
"epoch": 0.1909578234044705,
"grad_norm": 72.1239242553711,
"learning_rate": 0.0001989335572069311,
"loss": 103.2545,
"step": 354
},
{
"epoch": 0.19149725228414416,
"grad_norm": 29.279748916625977,
"learning_rate": 0.00019892515859441383,
"loss": 113.7908,
"step": 355
},
{
"epoch": 0.1920366811638178,
"grad_norm": 43.08776092529297,
"learning_rate": 0.00019891672721915015,
"loss": 107.6541,
"step": 356
},
{
"epoch": 0.19257611004349146,
"grad_norm": 54.121192932128906,
"learning_rate": 0.00019890826308393243,
"loss": 102.3774,
"step": 357
},
{
"epoch": 0.1931155389231651,
"grad_norm": 52.771793365478516,
"learning_rate": 0.0001988997661915639,
"loss": 87.3872,
"step": 358
},
{
"epoch": 0.19365496780283875,
"grad_norm": 58.10847854614258,
"learning_rate": 0.00019889123654485866,
"loss": 97.106,
"step": 359
},
{
"epoch": 0.1941943966825124,
"grad_norm": 52.38351058959961,
"learning_rate": 0.00019888267414664156,
"loss": 91.256,
"step": 360
},
{
"epoch": 0.19473382556218605,
"grad_norm": 48.153804779052734,
"learning_rate": 0.0001988740789997484,
"loss": 81.894,
"step": 361
},
{
"epoch": 0.19527325444185967,
"grad_norm": 25.811304092407227,
"learning_rate": 0.00019886545110702576,
"loss": 69.6325,
"step": 362
},
{
"epoch": 0.19581268332153332,
"grad_norm": 22.911964416503906,
"learning_rate": 0.00019885679047133107,
"loss": 65.5302,
"step": 363
},
{
"epoch": 0.19635211220120696,
"grad_norm": 37.54278564453125,
"learning_rate": 0.00019884809709553265,
"loss": 60.65,
"step": 364
},
{
"epoch": 0.1968915410808806,
"grad_norm": 20.303857803344727,
"learning_rate": 0.00019883937098250963,
"loss": 44.1299,
"step": 365
},
{
"epoch": 0.19743096996055426,
"grad_norm": 31.87704849243164,
"learning_rate": 0.00019883061213515197,
"loss": 34.1489,
"step": 366
},
{
"epoch": 0.1979703988402279,
"grad_norm": 39.10615539550781,
"learning_rate": 0.00019882182055636053,
"loss": 37.5989,
"step": 367
},
{
"epoch": 0.19850982771990155,
"grad_norm": 41.10018539428711,
"learning_rate": 0.00019881299624904692,
"loss": 48.6169,
"step": 368
},
{
"epoch": 0.1990492565995752,
"grad_norm": 34.8628044128418,
"learning_rate": 0.00019880413921613367,
"loss": 51.3889,
"step": 369
},
{
"epoch": 0.19958868547924885,
"grad_norm": 41.81850051879883,
"learning_rate": 0.0001987952494605541,
"loss": 46.2857,
"step": 370
},
{
"epoch": 0.2001281143589225,
"grad_norm": 46.00803756713867,
"learning_rate": 0.00019878632698525238,
"loss": 42.1201,
"step": 371
},
{
"epoch": 0.20066754323859615,
"grad_norm": 37.3172492980957,
"learning_rate": 0.00019877737179318353,
"loss": 44.8517,
"step": 372
},
{
"epoch": 0.2012069721182698,
"grad_norm": 30.38181495666504,
"learning_rate": 0.0001987683838873134,
"loss": 30.3321,
"step": 373
},
{
"epoch": 0.20174640099794341,
"grad_norm": 36.00757598876953,
"learning_rate": 0.00019875936327061865,
"loss": 41.3805,
"step": 374
},
{
"epoch": 0.20228582987761706,
"grad_norm": 36.742733001708984,
"learning_rate": 0.00019875030994608684,
"loss": 48.6651,
"step": 375
},
{
"epoch": 0.2028252587572907,
"grad_norm": 42.53518295288086,
"learning_rate": 0.00019874122391671622,
"loss": 32.5649,
"step": 376
},
{
"epoch": 0.20336468763696436,
"grad_norm": 35.77900314331055,
"learning_rate": 0.00019873210518551608,
"loss": 46.6955,
"step": 377
},
{
"epoch": 0.203904116516638,
"grad_norm": 44.95616149902344,
"learning_rate": 0.00019872295375550635,
"loss": 41.271,
"step": 378
},
{
"epoch": 0.20444354539631165,
"grad_norm": 34.28546142578125,
"learning_rate": 0.00019871376962971789,
"loss": 41.4059,
"step": 379
},
{
"epoch": 0.2049829742759853,
"grad_norm": 35.807682037353516,
"learning_rate": 0.00019870455281119237,
"loss": 45.8892,
"step": 380
},
{
"epoch": 0.20552240315565895,
"grad_norm": 30.27015495300293,
"learning_rate": 0.00019869530330298227,
"loss": 34.013,
"step": 381
},
{
"epoch": 0.2060618320353326,
"grad_norm": 38.26789093017578,
"learning_rate": 0.00019868602110815093,
"loss": 42.6953,
"step": 382
},
{
"epoch": 0.20660126091500625,
"grad_norm": 39.61716079711914,
"learning_rate": 0.00019867670622977248,
"loss": 40.4979,
"step": 383
},
{
"epoch": 0.2071406897946799,
"grad_norm": 35.717227935791016,
"learning_rate": 0.00019866735867093188,
"loss": 31.5146,
"step": 384
},
{
"epoch": 0.20768011867435351,
"grad_norm": 43.41541290283203,
"learning_rate": 0.0001986579784347249,
"loss": 37.5416,
"step": 385
},
{
"epoch": 0.20821954755402716,
"grad_norm": 40.18928146362305,
"learning_rate": 0.0001986485655242582,
"loss": 39.0367,
"step": 386
},
{
"epoch": 0.2087589764337008,
"grad_norm": 35.295291900634766,
"learning_rate": 0.00019863911994264926,
"loss": 36.8243,
"step": 387
},
{
"epoch": 0.20929840531337446,
"grad_norm": 52.24161148071289,
"learning_rate": 0.00019862964169302621,
"loss": 41.7241,
"step": 388
},
{
"epoch": 0.2098378341930481,
"grad_norm": 53.32133483886719,
"learning_rate": 0.00019862013077852822,
"loss": 38.7999,
"step": 389
},
{
"epoch": 0.21037726307272175,
"grad_norm": 42.945804595947266,
"learning_rate": 0.00019861058720230514,
"loss": 34.0199,
"step": 390
},
{
"epoch": 0.2109166919523954,
"grad_norm": 38.77582931518555,
"learning_rate": 0.00019860101096751768,
"loss": 33.4203,
"step": 391
},
{
"epoch": 0.21145612083206905,
"grad_norm": 30.80617332458496,
"learning_rate": 0.0001985914020773374,
"loss": 27.0483,
"step": 392
},
{
"epoch": 0.2119955497117427,
"grad_norm": 43.676090240478516,
"learning_rate": 0.00019858176053494663,
"loss": 33.954,
"step": 393
},
{
"epoch": 0.21253497859141635,
"grad_norm": 38.32650375366211,
"learning_rate": 0.00019857208634353852,
"loss": 29.378,
"step": 394
},
{
"epoch": 0.21307440747109,
"grad_norm": 39.12830352783203,
"learning_rate": 0.000198562379506317,
"loss": 27.9634,
"step": 395
},
{
"epoch": 0.21361383635076364,
"grad_norm": 47.39609909057617,
"learning_rate": 0.00019855264002649692,
"loss": 34.1847,
"step": 396
},
{
"epoch": 0.21415326523043726,
"grad_norm": 38.62258529663086,
"learning_rate": 0.00019854286790730384,
"loss": 26.0765,
"step": 397
},
{
"epoch": 0.2146926941101109,
"grad_norm": 42.81424331665039,
"learning_rate": 0.00019853306315197413,
"loss": 34.1509,
"step": 398
},
{
"epoch": 0.21523212298978456,
"grad_norm": 45.57196807861328,
"learning_rate": 0.00019852322576375503,
"loss": 32.0371,
"step": 399
},
{
"epoch": 0.2157715518694582,
"grad_norm": 35.20758819580078,
"learning_rate": 0.0001985133557459046,
"loss": 20.3634,
"step": 400
},
{
"epoch": 0.2157715518694582,
"eval_loss": 1.6627388000488281,
"eval_runtime": 141.0153,
"eval_samples_per_second": 2.12,
"eval_steps_per_second": 2.12,
"step": 400
},
{
"epoch": 0.21631098074913185,
"grad_norm": 24.1074161529541,
"learning_rate": 0.00019850345310169155,
"loss": 37.3797,
"step": 401
},
{
"epoch": 0.2168504096288055,
"grad_norm": 62.604949951171875,
"learning_rate": 0.00019849351783439561,
"loss": 78.7953,
"step": 402
},
{
"epoch": 0.21738983850847915,
"grad_norm": 43.36476135253906,
"learning_rate": 0.0001984835499473072,
"loss": 82.645,
"step": 403
},
{
"epoch": 0.2179292673881528,
"grad_norm": 52.12046432495117,
"learning_rate": 0.0001984735494437275,
"loss": 87.0839,
"step": 404
},
{
"epoch": 0.21846869626782645,
"grad_norm": 34.333431243896484,
"learning_rate": 0.00019846351632696863,
"loss": 105.6289,
"step": 405
},
{
"epoch": 0.2190081251475001,
"grad_norm": 41.665771484375,
"learning_rate": 0.00019845345060035335,
"loss": 112.3874,
"step": 406
},
{
"epoch": 0.21954755402717374,
"grad_norm": 58.79914093017578,
"learning_rate": 0.00019844335226721537,
"loss": 114.2657,
"step": 407
},
{
"epoch": 0.22008698290684736,
"grad_norm": 52.85742950439453,
"learning_rate": 0.00019843322133089906,
"loss": 98.4778,
"step": 408
},
{
"epoch": 0.220626411786521,
"grad_norm": 53.792476654052734,
"learning_rate": 0.00019842305779475968,
"loss": 94.7811,
"step": 409
},
{
"epoch": 0.22116584066619466,
"grad_norm": 49.56667709350586,
"learning_rate": 0.0001984128616621633,
"loss": 92.4516,
"step": 410
},
{
"epoch": 0.2217052695458683,
"grad_norm": 38.96401596069336,
"learning_rate": 0.0001984026329364867,
"loss": 78.0561,
"step": 411
},
{
"epoch": 0.22224469842554195,
"grad_norm": 35.649200439453125,
"learning_rate": 0.00019839237162111757,
"loss": 66.0612,
"step": 412
},
{
"epoch": 0.2227841273052156,
"grad_norm": 22.54837989807129,
"learning_rate": 0.00019838207771945426,
"loss": 59.3091,
"step": 413
},
{
"epoch": 0.22332355618488925,
"grad_norm": 16.843589782714844,
"learning_rate": 0.00019837175123490596,
"loss": 62.8711,
"step": 414
},
{
"epoch": 0.2238629850645629,
"grad_norm": 18.909435272216797,
"learning_rate": 0.00019836139217089275,
"loss": 55.3784,
"step": 415
},
{
"epoch": 0.22440241394423655,
"grad_norm": 25.120887756347656,
"learning_rate": 0.0001983510005308454,
"loss": 51.9063,
"step": 416
},
{
"epoch": 0.2249418428239102,
"grad_norm": 30.78650665283203,
"learning_rate": 0.00019834057631820543,
"loss": 32.4726,
"step": 417
},
{
"epoch": 0.22548127170358384,
"grad_norm": 72.46208953857422,
"learning_rate": 0.00019833011953642525,
"loss": 44.1452,
"step": 418
},
{
"epoch": 0.2260207005832575,
"grad_norm": 45.94267654418945,
"learning_rate": 0.000198319630188968,
"loss": 50.9596,
"step": 419
},
{
"epoch": 0.2265601294629311,
"grad_norm": 47.52016067504883,
"learning_rate": 0.00019830910827930764,
"loss": 44.8286,
"step": 420
},
{
"epoch": 0.22709955834260476,
"grad_norm": 40.93891525268555,
"learning_rate": 0.00019829855381092886,
"loss": 56.7985,
"step": 421
},
{
"epoch": 0.2276389872222784,
"grad_norm": 36.567108154296875,
"learning_rate": 0.0001982879667873272,
"loss": 35.7161,
"step": 422
},
{
"epoch": 0.22817841610195205,
"grad_norm": 31.908977508544922,
"learning_rate": 0.0001982773472120089,
"loss": 42.8407,
"step": 423
},
{
"epoch": 0.2287178449816257,
"grad_norm": 37.47427749633789,
"learning_rate": 0.00019826669508849108,
"loss": 39.5264,
"step": 424
},
{
"epoch": 0.22925727386129935,
"grad_norm": 43.83090591430664,
"learning_rate": 0.00019825601042030156,
"loss": 48.5415,
"step": 425
},
{
"epoch": 0.229796702740973,
"grad_norm": 42.004425048828125,
"learning_rate": 0.00019824529321097893,
"loss": 39.4127,
"step": 426
},
{
"epoch": 0.23033613162064664,
"grad_norm": 38.282066345214844,
"learning_rate": 0.00019823454346407267,
"loss": 40.8499,
"step": 427
},
{
"epoch": 0.2308755605003203,
"grad_norm": 33.92627716064453,
"learning_rate": 0.0001982237611831429,
"loss": 35.4472,
"step": 428
},
{
"epoch": 0.23141498937999394,
"grad_norm": 53.361106872558594,
"learning_rate": 0.00019821294637176057,
"loss": 43.1921,
"step": 429
},
{
"epoch": 0.2319544182596676,
"grad_norm": 40.92842102050781,
"learning_rate": 0.00019820209903350744,
"loss": 36.5019,
"step": 430
},
{
"epoch": 0.2324938471393412,
"grad_norm": 35.71042251586914,
"learning_rate": 0.00019819121917197602,
"loss": 36.598,
"step": 431
},
{
"epoch": 0.23303327601901486,
"grad_norm": 35.10508728027344,
"learning_rate": 0.00019818030679076952,
"loss": 31.6675,
"step": 432
},
{
"epoch": 0.2335727048986885,
"grad_norm": 31.885364532470703,
"learning_rate": 0.00019816936189350206,
"loss": 34.3554,
"step": 433
},
{
"epoch": 0.23411213377836215,
"grad_norm": 42.998878479003906,
"learning_rate": 0.0001981583844837984,
"loss": 28.1099,
"step": 434
},
{
"epoch": 0.2346515626580358,
"grad_norm": 38.70567321777344,
"learning_rate": 0.00019814737456529412,
"loss": 42.3567,
"step": 435
},
{
"epoch": 0.23519099153770945,
"grad_norm": 34.43855285644531,
"learning_rate": 0.00019813633214163555,
"loss": 22.8285,
"step": 436
},
{
"epoch": 0.2357304204173831,
"grad_norm": 33.38055419921875,
"learning_rate": 0.00019812525721647986,
"loss": 36.1465,
"step": 437
},
{
"epoch": 0.23626984929705674,
"grad_norm": 42.98970413208008,
"learning_rate": 0.00019811414979349485,
"loss": 34.8416,
"step": 438
},
{
"epoch": 0.2368092781767304,
"grad_norm": 37.12187957763672,
"learning_rate": 0.0001981030098763592,
"loss": 34.276,
"step": 439
},
{
"epoch": 0.23734870705640404,
"grad_norm": 44.36403274536133,
"learning_rate": 0.00019809183746876232,
"loss": 30.3544,
"step": 440
},
{
"epoch": 0.2378881359360777,
"grad_norm": 46.281654357910156,
"learning_rate": 0.00019808063257440432,
"loss": 27.8803,
"step": 441
},
{
"epoch": 0.23842756481575134,
"grad_norm": 49.94664001464844,
"learning_rate": 0.00019806939519699613,
"loss": 31.0358,
"step": 442
},
{
"epoch": 0.23896699369542496,
"grad_norm": 42.308616638183594,
"learning_rate": 0.0001980581253402595,
"loss": 29.4053,
"step": 443
},
{
"epoch": 0.2395064225750986,
"grad_norm": 51.36742401123047,
"learning_rate": 0.00019804682300792674,
"loss": 31.0947,
"step": 444
},
{
"epoch": 0.24004585145477225,
"grad_norm": 40.25013732910156,
"learning_rate": 0.00019803548820374113,
"loss": 26.6703,
"step": 445
},
{
"epoch": 0.2405852803344459,
"grad_norm": 53.013710021972656,
"learning_rate": 0.00019802412093145657,
"loss": 35.5286,
"step": 446
},
{
"epoch": 0.24112470921411955,
"grad_norm": 41.21833038330078,
"learning_rate": 0.00019801272119483775,
"loss": 25.3315,
"step": 447
},
{
"epoch": 0.2416641380937932,
"grad_norm": 61.56970977783203,
"learning_rate": 0.00019800128899766017,
"loss": 27.589,
"step": 448
},
{
"epoch": 0.24220356697346684,
"grad_norm": 58.22453308105469,
"learning_rate": 0.00019798982434371,
"loss": 37.2235,
"step": 449
},
{
"epoch": 0.2427429958531405,
"grad_norm": 36.04716110229492,
"learning_rate": 0.00019797832723678413,
"loss": 28.1485,
"step": 450
},
{
"epoch": 0.24328242473281414,
"grad_norm": 50.804813385009766,
"learning_rate": 0.00019796679768069032,
"loss": 49.1471,
"step": 451
},
{
"epoch": 0.2438218536124878,
"grad_norm": 91.2785873413086,
"learning_rate": 0.00019795523567924702,
"loss": 72.8998,
"step": 452
},
{
"epoch": 0.24436128249216144,
"grad_norm": 110.37539672851562,
"learning_rate": 0.00019794364123628335,
"loss": 98.2308,
"step": 453
},
{
"epoch": 0.24490071137183506,
"grad_norm": 79.3825912475586,
"learning_rate": 0.00019793201435563932,
"loss": 109.7274,
"step": 454
},
{
"epoch": 0.2454401402515087,
"grad_norm": 36.62171173095703,
"learning_rate": 0.00019792035504116555,
"loss": 107.5116,
"step": 455
},
{
"epoch": 0.24597956913118235,
"grad_norm": 57.664146423339844,
"learning_rate": 0.00019790866329672346,
"loss": 113.5622,
"step": 456
},
{
"epoch": 0.246518998010856,
"grad_norm": 57.12027359008789,
"learning_rate": 0.00019789693912618524,
"loss": 102.4627,
"step": 457
},
{
"epoch": 0.24705842689052965,
"grad_norm": 67.92241668701172,
"learning_rate": 0.00019788518253343376,
"loss": 90.2483,
"step": 458
},
{
"epoch": 0.2475978557702033,
"grad_norm": 63.95331573486328,
"learning_rate": 0.00019787339352236264,
"loss": 94.7671,
"step": 459
},
{
"epoch": 0.24813728464987694,
"grad_norm": 55.70960235595703,
"learning_rate": 0.00019786157209687627,
"loss": 92.1523,
"step": 460
},
{
"epoch": 0.2486767135295506,
"grad_norm": 44.270233154296875,
"learning_rate": 0.00019784971826088973,
"loss": 82.3084,
"step": 461
},
{
"epoch": 0.24921614240922424,
"grad_norm": 35.74955749511719,
"learning_rate": 0.0001978378320183289,
"loss": 71.401,
"step": 462
},
{
"epoch": 0.2497555712888979,
"grad_norm": 26.20838165283203,
"learning_rate": 0.00019782591337313035,
"loss": 68.6018,
"step": 463
},
{
"epoch": 0.25029500016857154,
"grad_norm": 20.70208740234375,
"learning_rate": 0.00019781396232924133,
"loss": 62.6257,
"step": 464
},
{
"epoch": 0.25083442904824516,
"grad_norm": 17.804771423339844,
"learning_rate": 0.00019780197889061993,
"loss": 54.6564,
"step": 465
},
{
"epoch": 0.25137385792791883,
"grad_norm": 24.327360153198242,
"learning_rate": 0.0001977899630612349,
"loss": 50.7451,
"step": 466
},
{
"epoch": 0.25191328680759245,
"grad_norm": 29.580142974853516,
"learning_rate": 0.00019777791484506567,
"loss": 34.4045,
"step": 467
},
{
"epoch": 0.2524527156872661,
"grad_norm": 30.99888801574707,
"learning_rate": 0.00019776583424610254,
"loss": 41.2975,
"step": 468
},
{
"epoch": 0.25299214456693975,
"grad_norm": 40.59465408325195,
"learning_rate": 0.0001977537212683464,
"loss": 56.0607,
"step": 469
},
{
"epoch": 0.2535315734466134,
"grad_norm": 42.85790252685547,
"learning_rate": 0.00019774157591580894,
"loss": 40.9168,
"step": 470
},
{
"epoch": 0.25407100232628704,
"grad_norm": 38.090885162353516,
"learning_rate": 0.0001977293981925125,
"loss": 49.6262,
"step": 471
},
{
"epoch": 0.25461043120596066,
"grad_norm": 33.007991790771484,
"learning_rate": 0.0001977171881024902,
"loss": 44.5241,
"step": 472
},
{
"epoch": 0.25514986008563434,
"grad_norm": 39.41592025756836,
"learning_rate": 0.00019770494564978595,
"loss": 38.185,
"step": 473
},
{
"epoch": 0.25568928896530796,
"grad_norm": 33.008148193359375,
"learning_rate": 0.00019769267083845417,
"loss": 42.3843,
"step": 474
},
{
"epoch": 0.25622871784498163,
"grad_norm": 27.917991638183594,
"learning_rate": 0.0001976803636725602,
"loss": 33.7216,
"step": 475
},
{
"epoch": 0.25676814672465526,
"grad_norm": 29.870256423950195,
"learning_rate": 0.00019766802415617998,
"loss": 35.7963,
"step": 476
},
{
"epoch": 0.25730757560432893,
"grad_norm": 44.98633575439453,
"learning_rate": 0.0001976556522934002,
"loss": 35.8127,
"step": 477
},
{
"epoch": 0.25784700448400255,
"grad_norm": 43.03909683227539,
"learning_rate": 0.0001976432480883183,
"loss": 35.4111,
"step": 478
},
{
"epoch": 0.2583864333636762,
"grad_norm": 47.32424545288086,
"learning_rate": 0.00019763081154504234,
"loss": 41.8895,
"step": 479
},
{
"epoch": 0.25892586224334985,
"grad_norm": 49.7735595703125,
"learning_rate": 0.0001976183426676912,
"loss": 32.9801,
"step": 480
},
{
"epoch": 0.2594652911230235,
"grad_norm": 44.57673645019531,
"learning_rate": 0.0001976058414603944,
"loss": 36.089,
"step": 481
},
{
"epoch": 0.26000472000269714,
"grad_norm": 36.22349548339844,
"learning_rate": 0.00019759330792729212,
"loss": 47.0487,
"step": 482
},
{
"epoch": 0.26054414888237076,
"grad_norm": 38.58706283569336,
"learning_rate": 0.00019758074207253535,
"loss": 34.3672,
"step": 483
},
{
"epoch": 0.26108357776204444,
"grad_norm": 40.61176300048828,
"learning_rate": 0.00019756814390028575,
"loss": 39.7468,
"step": 484
},
{
"epoch": 0.26162300664171806,
"grad_norm": 29.439836502075195,
"learning_rate": 0.00019755551341471566,
"loss": 34.1449,
"step": 485
},
{
"epoch": 0.26216243552139173,
"grad_norm": 35.68241882324219,
"learning_rate": 0.00019754285062000815,
"loss": 31.6102,
"step": 486
},
{
"epoch": 0.26270186440106535,
"grad_norm": 44.2021598815918,
"learning_rate": 0.000197530155520357,
"loss": 31.8889,
"step": 487
},
{
"epoch": 0.26324129328073903,
"grad_norm": 53.82715606689453,
"learning_rate": 0.00019751742811996656,
"loss": 31.6853,
"step": 488
},
{
"epoch": 0.26378072216041265,
"grad_norm": 41.77256774902344,
"learning_rate": 0.00019750466842305208,
"loss": 39.1939,
"step": 489
},
{
"epoch": 0.2643201510400863,
"grad_norm": 36.42414093017578,
"learning_rate": 0.00019749187643383937,
"loss": 26.3978,
"step": 490
},
{
"epoch": 0.26485957991975995,
"grad_norm": 49.238014221191406,
"learning_rate": 0.00019747905215656498,
"loss": 33.8181,
"step": 491
},
{
"epoch": 0.2653990087994336,
"grad_norm": 37.46484375,
"learning_rate": 0.00019746619559547619,
"loss": 32.0879,
"step": 492
},
{
"epoch": 0.26593843767910724,
"grad_norm": 29.428075790405273,
"learning_rate": 0.00019745330675483084,
"loss": 22.5194,
"step": 493
},
{
"epoch": 0.2664778665587809,
"grad_norm": 42.24260330200195,
"learning_rate": 0.00019744038563889764,
"loss": 34.5577,
"step": 494
},
{
"epoch": 0.26701729543845454,
"grad_norm": 43.271976470947266,
"learning_rate": 0.00019742743225195582,
"loss": 25.107,
"step": 495
},
{
"epoch": 0.26755672431812816,
"grad_norm": 41.1341667175293,
"learning_rate": 0.00019741444659829543,
"loss": 24.4596,
"step": 496
},
{
"epoch": 0.26809615319780183,
"grad_norm": 35.3587760925293,
"learning_rate": 0.00019740142868221713,
"loss": 21.1434,
"step": 497
},
{
"epoch": 0.26863558207747545,
"grad_norm": 47.48214340209961,
"learning_rate": 0.00019738837850803226,
"loss": 23.4752,
"step": 498
},
{
"epoch": 0.26917501095714913,
"grad_norm": 44.637882232666016,
"learning_rate": 0.00019737529608006293,
"loss": 21.9525,
"step": 499
},
{
"epoch": 0.26971443983682275,
"grad_norm": 31.005287170410156,
"learning_rate": 0.00019736218140264185,
"loss": 19.1622,
"step": 500
},
{
"epoch": 0.2702538687164964,
"grad_norm": 32.10681915283203,
"learning_rate": 0.0001973490344801124,
"loss": 44.8021,
"step": 501
},
{
"epoch": 0.27079329759617005,
"grad_norm": 67.818603515625,
"learning_rate": 0.0001973358553168287,
"loss": 90.5945,
"step": 502
},
{
"epoch": 0.2713327264758437,
"grad_norm": 78.30387115478516,
"learning_rate": 0.00019732264391715556,
"loss": 101.037,
"step": 503
},
{
"epoch": 0.27187215535551734,
"grad_norm": 92.50519561767578,
"learning_rate": 0.00019730940028546835,
"loss": 124.3723,
"step": 504
},
{
"epoch": 0.272411584235191,
"grad_norm": 38.794246673583984,
"learning_rate": 0.0001972961244261532,
"loss": 105.1317,
"step": 505
},
{
"epoch": 0.27295101311486464,
"grad_norm": 34.56374740600586,
"learning_rate": 0.00019728281634360698,
"loss": 101.3536,
"step": 506
},
{
"epoch": 0.27349044199453826,
"grad_norm": 33.79701614379883,
"learning_rate": 0.00019726947604223712,
"loss": 105.4946,
"step": 507
},
{
"epoch": 0.27402987087421193,
"grad_norm": 39.242740631103516,
"learning_rate": 0.00019725610352646172,
"loss": 82.6645,
"step": 508
},
{
"epoch": 0.27456929975388555,
"grad_norm": 41.144683837890625,
"learning_rate": 0.0001972426988007096,
"loss": 99.5104,
"step": 509
},
{
"epoch": 0.27510872863355923,
"grad_norm": 43.32292175292969,
"learning_rate": 0.00019722926186942026,
"loss": 90.6068,
"step": 510
},
{
"epoch": 0.27564815751323285,
"grad_norm": 40.97383117675781,
"learning_rate": 0.0001972157927370438,
"loss": 71.8933,
"step": 511
},
{
"epoch": 0.2761875863929065,
"grad_norm": 27.89875602722168,
"learning_rate": 0.0001972022914080411,
"loss": 66.0499,
"step": 512
},
{
"epoch": 0.27672701527258015,
"grad_norm": 23.75403594970703,
"learning_rate": 0.00019718875788688354,
"loss": 59.9798,
"step": 513
},
{
"epoch": 0.2772664441522538,
"grad_norm": 18.101530075073242,
"learning_rate": 0.0001971751921780533,
"loss": 55.1379,
"step": 514
},
{
"epoch": 0.27780587303192744,
"grad_norm": 24.123146057128906,
"learning_rate": 0.00019716159428604315,
"loss": 51.0036,
"step": 515
},
{
"epoch": 0.2783453019116011,
"grad_norm": 29.12915802001953,
"learning_rate": 0.00019714796421535654,
"loss": 35.74,
"step": 516
},
{
"epoch": 0.27888473079127474,
"grad_norm": 41.40327072143555,
"learning_rate": 0.00019713430197050756,
"loss": 34.8342,
"step": 517
},
{
"epoch": 0.27942415967094836,
"grad_norm": 65.70941162109375,
"learning_rate": 0.00019712060755602102,
"loss": 45.6267,
"step": 518
},
{
"epoch": 0.27996358855062203,
"grad_norm": 37.733158111572266,
"learning_rate": 0.00019710688097643227,
"loss": 40.7,
"step": 519
},
{
"epoch": 0.28050301743029565,
"grad_norm": 39.90540313720703,
"learning_rate": 0.0001970931222362874,
"loss": 52.105,
"step": 520
},
{
"epoch": 0.28104244630996933,
"grad_norm": 41.023155212402344,
"learning_rate": 0.0001970793313401432,
"loss": 47.4019,
"step": 521
},
{
"epoch": 0.28158187518964295,
"grad_norm": 39.340972900390625,
"learning_rate": 0.00019706550829256693,
"loss": 36.3784,
"step": 522
},
{
"epoch": 0.2821213040693166,
"grad_norm": 31.36964988708496,
"learning_rate": 0.0001970516530981367,
"loss": 32.5883,
"step": 523
},
{
"epoch": 0.28266073294899025,
"grad_norm": 31.426342010498047,
"learning_rate": 0.00019703776576144105,
"loss": 37.0281,
"step": 524
},
{
"epoch": 0.2832001618286639,
"grad_norm": 48.170589447021484,
"learning_rate": 0.00019702384628707945,
"loss": 50.0541,
"step": 525
},
{
"epoch": 0.28373959070833754,
"grad_norm": 58.017845153808594,
"learning_rate": 0.0001970098946796617,
"loss": 35.1185,
"step": 526
},
{
"epoch": 0.2842790195880112,
"grad_norm": 44.51712417602539,
"learning_rate": 0.0001969959109438085,
"loss": 30.6861,
"step": 527
},
{
"epoch": 0.28481844846768484,
"grad_norm": 38.26441955566406,
"learning_rate": 0.00019698189508415102,
"loss": 42.7979,
"step": 528
},
{
"epoch": 0.28535787734735846,
"grad_norm": 33.41388702392578,
"learning_rate": 0.00019696784710533115,
"loss": 31.6934,
"step": 529
},
{
"epoch": 0.28589730622703213,
"grad_norm": 39.14249038696289,
"learning_rate": 0.00019695376701200145,
"loss": 31.4034,
"step": 530
},
{
"epoch": 0.28643673510670575,
"grad_norm": 38.64737319946289,
"learning_rate": 0.000196939654808825,
"loss": 35.3318,
"step": 531
},
{
"epoch": 0.28697616398637943,
"grad_norm": 32.65852355957031,
"learning_rate": 0.0001969255105004756,
"loss": 33.1427,
"step": 532
},
{
"epoch": 0.28751559286605305,
"grad_norm": 33.65852355957031,
"learning_rate": 0.0001969113340916377,
"loss": 31.0407,
"step": 533
},
{
"epoch": 0.2880550217457267,
"grad_norm": 31.496322631835938,
"learning_rate": 0.00019689712558700628,
"loss": 32.1776,
"step": 534
},
{
"epoch": 0.28859445062540034,
"grad_norm": 37.255680084228516,
"learning_rate": 0.00019688288499128707,
"loss": 32.4352,
"step": 535
},
{
"epoch": 0.289133879505074,
"grad_norm": 35.74131774902344,
"learning_rate": 0.00019686861230919635,
"loss": 39.0239,
"step": 536
},
{
"epoch": 0.28967330838474764,
"grad_norm": 62.805694580078125,
"learning_rate": 0.00019685430754546107,
"loss": 39.168,
"step": 537
},
{
"epoch": 0.2902127372644213,
"grad_norm": 32.74406814575195,
"learning_rate": 0.00019683997070481875,
"loss": 27.3064,
"step": 538
},
{
"epoch": 0.29075216614409494,
"grad_norm": 60.63595199584961,
"learning_rate": 0.00019682560179201759,
"loss": 37.3217,
"step": 539
},
{
"epoch": 0.2912915950237686,
"grad_norm": 49.350975036621094,
"learning_rate": 0.00019681120081181636,
"loss": 32.6254,
"step": 540
},
{
"epoch": 0.29183102390344223,
"grad_norm": 33.03507614135742,
"learning_rate": 0.00019679676776898454,
"loss": 23.6142,
"step": 541
},
{
"epoch": 0.29237045278311585,
"grad_norm": 46.380985260009766,
"learning_rate": 0.00019678230266830212,
"loss": 26.1048,
"step": 542
},
{
"epoch": 0.29290988166278953,
"grad_norm": 44.384132385253906,
"learning_rate": 0.00019676780551455977,
"loss": 19.0745,
"step": 543
},
{
"epoch": 0.29344931054246315,
"grad_norm": 32.757320404052734,
"learning_rate": 0.0001967532763125588,
"loss": 33.5921,
"step": 544
},
{
"epoch": 0.2939887394221368,
"grad_norm": 40.512939453125,
"learning_rate": 0.000196738715067111,
"loss": 23.9648,
"step": 545
},
{
"epoch": 0.29452816830181044,
"grad_norm": 36.085330963134766,
"learning_rate": 0.00019672412178303898,
"loss": 25.8736,
"step": 546
},
{
"epoch": 0.2950675971814841,
"grad_norm": 39.4991340637207,
"learning_rate": 0.00019670949646517576,
"loss": 35.8085,
"step": 547
},
{
"epoch": 0.29560702606115774,
"grad_norm": 56.80205535888672,
"learning_rate": 0.0001966948391183651,
"loss": 21.2566,
"step": 548
},
{
"epoch": 0.2961464549408314,
"grad_norm": 51.80792999267578,
"learning_rate": 0.00019668014974746133,
"loss": 19.3891,
"step": 549
},
{
"epoch": 0.29668588382050504,
"grad_norm": 40.740726470947266,
"learning_rate": 0.00019666542835732937,
"loss": 17.442,
"step": 550
},
{
"epoch": 0.2972253127001787,
"grad_norm": 43.78228759765625,
"learning_rate": 0.00019665067495284476,
"loss": 53.1444,
"step": 551
},
{
"epoch": 0.29776474157985233,
"grad_norm": 68.15139770507812,
"learning_rate": 0.00019663588953889363,
"loss": 83.8455,
"step": 552
},
{
"epoch": 0.29830417045952595,
"grad_norm": 57.72416305541992,
"learning_rate": 0.00019662107212037273,
"loss": 91.3314,
"step": 553
},
{
"epoch": 0.29884359933919963,
"grad_norm": 70.40361785888672,
"learning_rate": 0.0001966062227021894,
"loss": 115.1381,
"step": 554
},
{
"epoch": 0.29938302821887325,
"grad_norm": 33.6906623840332,
"learning_rate": 0.00019659134128926156,
"loss": 96.5649,
"step": 555
},
{
"epoch": 0.2999224570985469,
"grad_norm": 41.24090576171875,
"learning_rate": 0.00019657642788651776,
"loss": 104.8012,
"step": 556
},
{
"epoch": 0.30046188597822054,
"grad_norm": 62.62508773803711,
"learning_rate": 0.00019656148249889714,
"loss": 89.1584,
"step": 557
},
{
"epoch": 0.3010013148578942,
"grad_norm": 54.20726013183594,
"learning_rate": 0.00019654650513134937,
"loss": 102.4601,
"step": 558
},
{
"epoch": 0.30154074373756784,
"grad_norm": 51.19554138183594,
"learning_rate": 0.00019653149578883482,
"loss": 94.7273,
"step": 559
},
{
"epoch": 0.3020801726172415,
"grad_norm": 50.297447204589844,
"learning_rate": 0.00019651645447632437,
"loss": 85.4999,
"step": 560
},
{
"epoch": 0.30261960149691514,
"grad_norm": 43.541648864746094,
"learning_rate": 0.00019650138119879952,
"loss": 84.9936,
"step": 561
},
{
"epoch": 0.3031590303765888,
"grad_norm": 30.611860275268555,
"learning_rate": 0.00019648627596125233,
"loss": 68.3871,
"step": 562
},
{
"epoch": 0.30369845925626243,
"grad_norm": 18.373859405517578,
"learning_rate": 0.00019647113876868546,
"loss": 64.1806,
"step": 563
},
{
"epoch": 0.30423788813593605,
"grad_norm": 17.967041015625,
"learning_rate": 0.00019645596962611218,
"loss": 58.1967,
"step": 564
},
{
"epoch": 0.30477731701560973,
"grad_norm": 17.57683563232422,
"learning_rate": 0.00019644076853855626,
"loss": 48.7426,
"step": 565
},
{
"epoch": 0.30531674589528335,
"grad_norm": 24.4635066986084,
"learning_rate": 0.00019642553551105219,
"loss": 45.5702,
"step": 566
},
{
"epoch": 0.305856174774957,
"grad_norm": 44.31038284301758,
"learning_rate": 0.0001964102705486449,
"loss": 36.4538,
"step": 567
},
{
"epoch": 0.30639560365463064,
"grad_norm": 45.66762924194336,
"learning_rate": 0.00019639497365638993,
"loss": 37.6228,
"step": 568
},
{
"epoch": 0.3069350325343043,
"grad_norm": 45.2806282043457,
"learning_rate": 0.00019637964483935346,
"loss": 47.7514,
"step": 569
},
{
"epoch": 0.30747446141397794,
"grad_norm": 44.627296447753906,
"learning_rate": 0.00019636428410261218,
"loss": 50.5934,
"step": 570
},
{
"epoch": 0.3080138902936516,
"grad_norm": 39.8631706237793,
"learning_rate": 0.00019634889145125336,
"loss": 33.2035,
"step": 571
},
{
"epoch": 0.30855331917332524,
"grad_norm": 43.88326644897461,
"learning_rate": 0.00019633346689037486,
"loss": 44.4418,
"step": 572
},
{
"epoch": 0.3090927480529989,
"grad_norm": 31.599515914916992,
"learning_rate": 0.0001963180104250851,
"loss": 29.8656,
"step": 573
},
{
"epoch": 0.30963217693267253,
"grad_norm": 29.062061309814453,
"learning_rate": 0.00019630252206050307,
"loss": 29.4416,
"step": 574
},
{
"epoch": 0.31017160581234615,
"grad_norm": 35.07856750488281,
"learning_rate": 0.00019628700180175833,
"loss": 33.663,
"step": 575
},
{
"epoch": 0.3107110346920198,
"grad_norm": 38.65933609008789,
"learning_rate": 0.00019627144965399094,
"loss": 43.6982,
"step": 576
},
{
"epoch": 0.31125046357169345,
"grad_norm": 36.53346252441406,
"learning_rate": 0.0001962558656223516,
"loss": 41.9741,
"step": 577
},
{
"epoch": 0.3117898924513671,
"grad_norm": 50.61214065551758,
"learning_rate": 0.00019624024971200154,
"loss": 31.3103,
"step": 578
},
{
"epoch": 0.31232932133104074,
"grad_norm": 39.70477294921875,
"learning_rate": 0.00019622460192811255,
"loss": 40.1001,
"step": 579
},
{
"epoch": 0.3128687502107144,
"grad_norm": 43.24115753173828,
"learning_rate": 0.000196208922275867,
"loss": 38.9648,
"step": 580
},
{
"epoch": 0.31340817909038804,
"grad_norm": 49.614410400390625,
"learning_rate": 0.00019619321076045778,
"loss": 38.396,
"step": 581
},
{
"epoch": 0.3139476079700617,
"grad_norm": 38.65335464477539,
"learning_rate": 0.0001961774673870883,
"loss": 33.8401,
"step": 582
},
{
"epoch": 0.31448703684973534,
"grad_norm": 36.919837951660156,
"learning_rate": 0.00019616169216097262,
"loss": 40.8598,
"step": 583
},
{
"epoch": 0.315026465729409,
"grad_norm": 34.90658187866211,
"learning_rate": 0.00019614588508733524,
"loss": 26.7875,
"step": 584
},
{
"epoch": 0.31556589460908263,
"grad_norm": 36.6773796081543,
"learning_rate": 0.00019613004617141132,
"loss": 38.7512,
"step": 585
},
{
"epoch": 0.3161053234887563,
"grad_norm": 38.80603790283203,
"learning_rate": 0.00019611417541844645,
"loss": 22.4567,
"step": 586
},
{
"epoch": 0.3166447523684299,
"grad_norm": 39.85905838012695,
"learning_rate": 0.00019609827283369687,
"loss": 34.7722,
"step": 587
},
{
"epoch": 0.31718418124810355,
"grad_norm": 42.714210510253906,
"learning_rate": 0.00019608233842242925,
"loss": 29.6514,
"step": 588
},
{
"epoch": 0.3177236101277772,
"grad_norm": 28.49331283569336,
"learning_rate": 0.00019606637218992092,
"loss": 32.2811,
"step": 589
},
{
"epoch": 0.31826303900745084,
"grad_norm": 38.48284912109375,
"learning_rate": 0.0001960503741414597,
"loss": 19.4347,
"step": 590
},
{
"epoch": 0.3188024678871245,
"grad_norm": 40.46686553955078,
"learning_rate": 0.00019603434428234389,
"loss": 36.0755,
"step": 591
},
{
"epoch": 0.31934189676679814,
"grad_norm": 33.52849578857422,
"learning_rate": 0.00019601828261788236,
"loss": 23.4967,
"step": 592
},
{
"epoch": 0.3198813256464718,
"grad_norm": 36.89003372192383,
"learning_rate": 0.0001960021891533946,
"loss": 17.4822,
"step": 593
},
{
"epoch": 0.32042075452614543,
"grad_norm": 47.023624420166016,
"learning_rate": 0.00019598606389421055,
"loss": 26.3533,
"step": 594
},
{
"epoch": 0.3209601834058191,
"grad_norm": 53.969627380371094,
"learning_rate": 0.00019596990684567063,
"loss": 36.3338,
"step": 595
},
{
"epoch": 0.32149961228549273,
"grad_norm": 31.71206283569336,
"learning_rate": 0.00019595371801312588,
"loss": 23.1099,
"step": 596
},
{
"epoch": 0.3220390411651664,
"grad_norm": 34.602901458740234,
"learning_rate": 0.00019593749740193784,
"loss": 20.7281,
"step": 597
},
{
"epoch": 0.32257847004484,
"grad_norm": 32.23836135864258,
"learning_rate": 0.00019592124501747855,
"loss": 19.1565,
"step": 598
},
{
"epoch": 0.32311789892451365,
"grad_norm": 31.762807846069336,
"learning_rate": 0.00019590496086513063,
"loss": 20.822,
"step": 599
},
{
"epoch": 0.3236573278041873,
"grad_norm": 38.77958297729492,
"learning_rate": 0.00019588864495028712,
"loss": 20.7172,
"step": 600
},
{
"epoch": 0.3236573278041873,
"eval_loss": 1.5770864486694336,
"eval_runtime": 140.3936,
"eval_samples_per_second": 2.13,
"eval_steps_per_second": 2.13,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 5559,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0696873835715625e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}