ABSA-Multi_Label_Classification / trainer_state.json
ToBeWithYou's picture
Upload 10 files
66cc80d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.984819734345351,
"eval_steps": 500,
"global_step": 10500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003795066413662239,
"grad_norm": 1.7131669521331787,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.6804,
"step": 10
},
{
"epoch": 0.007590132827324478,
"grad_norm": 1.7053213119506836,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6854,
"step": 20
},
{
"epoch": 0.011385199240986717,
"grad_norm": 1.8932372331619263,
"learning_rate": 5e-06,
"loss": 0.6597,
"step": 30
},
{
"epoch": 0.015180265654648957,
"grad_norm": 1.516872763633728,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6243,
"step": 40
},
{
"epoch": 0.018975332068311195,
"grad_norm": 1.5149081945419312,
"learning_rate": 8.333333333333334e-06,
"loss": 0.5632,
"step": 50
},
{
"epoch": 0.022770398481973434,
"grad_norm": 1.0706552267074585,
"learning_rate": 1e-05,
"loss": 0.5289,
"step": 60
},
{
"epoch": 0.026565464895635674,
"grad_norm": 1.102160930633545,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.5263,
"step": 70
},
{
"epoch": 0.030360531309297913,
"grad_norm": 1.2059059143066406,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.5466,
"step": 80
},
{
"epoch": 0.03415559772296015,
"grad_norm": 1.0622307062149048,
"learning_rate": 1.5e-05,
"loss": 0.4918,
"step": 90
},
{
"epoch": 0.03795066413662239,
"grad_norm": 1.5696407556533813,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.516,
"step": 100
},
{
"epoch": 0.04174573055028463,
"grad_norm": 1.49858820438385,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.5024,
"step": 110
},
{
"epoch": 0.04554079696394687,
"grad_norm": 1.5996527671813965,
"learning_rate": 2e-05,
"loss": 0.4775,
"step": 120
},
{
"epoch": 0.04933586337760911,
"grad_norm": 1.6391699314117432,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.5028,
"step": 130
},
{
"epoch": 0.05313092979127135,
"grad_norm": 1.5045441389083862,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.472,
"step": 140
},
{
"epoch": 0.056925996204933584,
"grad_norm": 1.1791646480560303,
"learning_rate": 2.5e-05,
"loss": 0.4606,
"step": 150
},
{
"epoch": 0.06072106261859583,
"grad_norm": 1.3659300804138184,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.527,
"step": 160
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.9830155968666077,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.458,
"step": 170
},
{
"epoch": 0.0683111954459203,
"grad_norm": 1.6211776733398438,
"learning_rate": 3e-05,
"loss": 0.4613,
"step": 180
},
{
"epoch": 0.07210626185958255,
"grad_norm": 1.9507710933685303,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.4531,
"step": 190
},
{
"epoch": 0.07590132827324478,
"grad_norm": 1.312615156173706,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.4384,
"step": 200
},
{
"epoch": 0.07969639468690702,
"grad_norm": 2.034919261932373,
"learning_rate": 3.5e-05,
"loss": 0.4747,
"step": 210
},
{
"epoch": 0.08349146110056926,
"grad_norm": 2.045759677886963,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.4153,
"step": 220
},
{
"epoch": 0.0872865275142315,
"grad_norm": 2.0934813022613525,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.3829,
"step": 230
},
{
"epoch": 0.09108159392789374,
"grad_norm": 2.4255552291870117,
"learning_rate": 4e-05,
"loss": 0.3816,
"step": 240
},
{
"epoch": 0.09487666034155598,
"grad_norm": 1.42184579372406,
"learning_rate": 4.166666666666667e-05,
"loss": 0.3948,
"step": 250
},
{
"epoch": 0.09867172675521822,
"grad_norm": 1.6787000894546509,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.3877,
"step": 260
},
{
"epoch": 0.10246679316888045,
"grad_norm": 2.121290445327759,
"learning_rate": 4.5e-05,
"loss": 0.3732,
"step": 270
},
{
"epoch": 0.1062618595825427,
"grad_norm": 1.5,
"learning_rate": 4.666666666666667e-05,
"loss": 0.3567,
"step": 280
},
{
"epoch": 0.11005692599620494,
"grad_norm": 3.0193252563476562,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.3916,
"step": 290
},
{
"epoch": 0.11385199240986717,
"grad_norm": 2.7301666736602783,
"learning_rate": 5e-05,
"loss": 0.3723,
"step": 300
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.8423070907592773,
"learning_rate": 4.9951171875e-05,
"loss": 0.3214,
"step": 310
},
{
"epoch": 0.12144212523719165,
"grad_norm": 1.204102873802185,
"learning_rate": 4.990234375e-05,
"loss": 0.3251,
"step": 320
},
{
"epoch": 0.1252371916508539,
"grad_norm": 1.803913950920105,
"learning_rate": 4.9853515625000005e-05,
"loss": 0.3942,
"step": 330
},
{
"epoch": 0.12903225806451613,
"grad_norm": 3.175114154815674,
"learning_rate": 4.9804687500000004e-05,
"loss": 0.39,
"step": 340
},
{
"epoch": 0.13282732447817835,
"grad_norm": 2.4476590156555176,
"learning_rate": 4.9755859375e-05,
"loss": 0.349,
"step": 350
},
{
"epoch": 0.1366223908918406,
"grad_norm": 1.2592339515686035,
"learning_rate": 4.970703125e-05,
"loss": 0.3315,
"step": 360
},
{
"epoch": 0.14041745730550284,
"grad_norm": 1.6238622665405273,
"learning_rate": 4.9658203125e-05,
"loss": 0.3307,
"step": 370
},
{
"epoch": 0.1442125237191651,
"grad_norm": 1.3984373807907104,
"learning_rate": 4.9609375000000005e-05,
"loss": 0.294,
"step": 380
},
{
"epoch": 0.14800759013282733,
"grad_norm": 3.1960623264312744,
"learning_rate": 4.9560546875e-05,
"loss": 0.3314,
"step": 390
},
{
"epoch": 0.15180265654648956,
"grad_norm": 1.5345971584320068,
"learning_rate": 4.951171875e-05,
"loss": 0.3438,
"step": 400
},
{
"epoch": 0.1555977229601518,
"grad_norm": 3.1037323474884033,
"learning_rate": 4.9462890625e-05,
"loss": 0.3246,
"step": 410
},
{
"epoch": 0.15939278937381404,
"grad_norm": 3.519519805908203,
"learning_rate": 4.94140625e-05,
"loss": 0.3087,
"step": 420
},
{
"epoch": 0.16318785578747627,
"grad_norm": 1.347273826599121,
"learning_rate": 4.9365234375000005e-05,
"loss": 0.3303,
"step": 430
},
{
"epoch": 0.16698292220113853,
"grad_norm": 1.2372374534606934,
"learning_rate": 4.931640625e-05,
"loss": 0.3225,
"step": 440
},
{
"epoch": 0.17077798861480076,
"grad_norm": 0.9122889637947083,
"learning_rate": 4.9267578125e-05,
"loss": 0.3081,
"step": 450
},
{
"epoch": 0.174573055028463,
"grad_norm": 3.7750535011291504,
"learning_rate": 4.921875e-05,
"loss": 0.2785,
"step": 460
},
{
"epoch": 0.17836812144212524,
"grad_norm": 1.0529924631118774,
"learning_rate": 4.9169921875000006e-05,
"loss": 0.283,
"step": 470
},
{
"epoch": 0.18216318785578747,
"grad_norm": 1.5323132276535034,
"learning_rate": 4.9121093750000004e-05,
"loss": 0.2982,
"step": 480
},
{
"epoch": 0.1859582542694497,
"grad_norm": 1.1751055717468262,
"learning_rate": 4.9072265625e-05,
"loss": 0.2639,
"step": 490
},
{
"epoch": 0.18975332068311196,
"grad_norm": 1.0208653211593628,
"learning_rate": 4.90234375e-05,
"loss": 0.2651,
"step": 500
},
{
"epoch": 0.1935483870967742,
"grad_norm": 1.7089987993240356,
"learning_rate": 4.8974609375e-05,
"loss": 0.2572,
"step": 510
},
{
"epoch": 0.19734345351043645,
"grad_norm": 4.918070316314697,
"learning_rate": 4.8925781250000006e-05,
"loss": 0.299,
"step": 520
},
{
"epoch": 0.20113851992409867,
"grad_norm": 1.117162823677063,
"learning_rate": 4.8876953125000004e-05,
"loss": 0.2699,
"step": 530
},
{
"epoch": 0.2049335863377609,
"grad_norm": 1.813411831855774,
"learning_rate": 4.8828125e-05,
"loss": 0.2391,
"step": 540
},
{
"epoch": 0.20872865275142316,
"grad_norm": 3.368643283843994,
"learning_rate": 4.8779296875e-05,
"loss": 0.3022,
"step": 550
},
{
"epoch": 0.2125237191650854,
"grad_norm": 16.486289978027344,
"learning_rate": 4.873046875e-05,
"loss": 0.2837,
"step": 560
},
{
"epoch": 0.21631878557874762,
"grad_norm": 1.3590037822723389,
"learning_rate": 4.8681640625000005e-05,
"loss": 0.2182,
"step": 570
},
{
"epoch": 0.22011385199240988,
"grad_norm": 1.8672986030578613,
"learning_rate": 4.8632812500000004e-05,
"loss": 0.2925,
"step": 580
},
{
"epoch": 0.2239089184060721,
"grad_norm": 2.350752592086792,
"learning_rate": 4.8583984375e-05,
"loss": 0.2585,
"step": 590
},
{
"epoch": 0.22770398481973433,
"grad_norm": 2.4918649196624756,
"learning_rate": 4.853515625e-05,
"loss": 0.2824,
"step": 600
},
{
"epoch": 0.2314990512333966,
"grad_norm": 2.4856553077697754,
"learning_rate": 4.8486328125e-05,
"loss": 0.2444,
"step": 610
},
{
"epoch": 0.23529411764705882,
"grad_norm": 1.87199866771698,
"learning_rate": 4.8437500000000005e-05,
"loss": 0.256,
"step": 620
},
{
"epoch": 0.23908918406072105,
"grad_norm": 1.0694291591644287,
"learning_rate": 4.8388671875000004e-05,
"loss": 0.245,
"step": 630
},
{
"epoch": 0.2428842504743833,
"grad_norm": 0.7904035449028015,
"learning_rate": 4.833984375e-05,
"loss": 0.2588,
"step": 640
},
{
"epoch": 0.24667931688804554,
"grad_norm": 2.714871883392334,
"learning_rate": 4.8291015625e-05,
"loss": 0.2741,
"step": 650
},
{
"epoch": 0.2504743833017078,
"grad_norm": 3.948547124862671,
"learning_rate": 4.82421875e-05,
"loss": 0.2335,
"step": 660
},
{
"epoch": 0.25426944971537,
"grad_norm": 1.6354694366455078,
"learning_rate": 4.8193359375000005e-05,
"loss": 0.2298,
"step": 670
},
{
"epoch": 0.25806451612903225,
"grad_norm": 1.1305994987487793,
"learning_rate": 4.8144531250000003e-05,
"loss": 0.2279,
"step": 680
},
{
"epoch": 0.2618595825426945,
"grad_norm": 1.804825782775879,
"learning_rate": 4.8095703125e-05,
"loss": 0.2401,
"step": 690
},
{
"epoch": 0.2656546489563567,
"grad_norm": 1.0778950452804565,
"learning_rate": 4.8046875e-05,
"loss": 0.2498,
"step": 700
},
{
"epoch": 0.269449715370019,
"grad_norm": 2.672403335571289,
"learning_rate": 4.7998046875e-05,
"loss": 0.2521,
"step": 710
},
{
"epoch": 0.2732447817836812,
"grad_norm": 1.0559144020080566,
"learning_rate": 4.7949218750000005e-05,
"loss": 0.1855,
"step": 720
},
{
"epoch": 0.27703984819734345,
"grad_norm": 1.3226491212844849,
"learning_rate": 4.7900390625e-05,
"loss": 0.21,
"step": 730
},
{
"epoch": 0.2808349146110057,
"grad_norm": 2.1266074180603027,
"learning_rate": 4.78515625e-05,
"loss": 0.2232,
"step": 740
},
{
"epoch": 0.2846299810246679,
"grad_norm": 2.9967539310455322,
"learning_rate": 4.7802734375e-05,
"loss": 0.2554,
"step": 750
},
{
"epoch": 0.2884250474383302,
"grad_norm": 2.6614627838134766,
"learning_rate": 4.775390625e-05,
"loss": 0.2811,
"step": 760
},
{
"epoch": 0.2922201138519924,
"grad_norm": 1.64667546749115,
"learning_rate": 4.7705078125000004e-05,
"loss": 0.2102,
"step": 770
},
{
"epoch": 0.29601518026565465,
"grad_norm": 2.339608669281006,
"learning_rate": 4.765625e-05,
"loss": 0.2125,
"step": 780
},
{
"epoch": 0.2998102466793169,
"grad_norm": 1.6804083585739136,
"learning_rate": 4.7607421875e-05,
"loss": 0.2722,
"step": 790
},
{
"epoch": 0.3036053130929791,
"grad_norm": 2.6005263328552246,
"learning_rate": 4.755859375e-05,
"loss": 0.2067,
"step": 800
},
{
"epoch": 0.30740037950664134,
"grad_norm": 5.113396167755127,
"learning_rate": 4.7509765625000006e-05,
"loss": 0.1988,
"step": 810
},
{
"epoch": 0.3111954459203036,
"grad_norm": 1.9176031351089478,
"learning_rate": 4.7460937500000004e-05,
"loss": 0.2416,
"step": 820
},
{
"epoch": 0.31499051233396586,
"grad_norm": 1.5946362018585205,
"learning_rate": 4.7412109375e-05,
"loss": 0.2416,
"step": 830
},
{
"epoch": 0.3187855787476281,
"grad_norm": 1.6692804098129272,
"learning_rate": 4.736328125e-05,
"loss": 0.2139,
"step": 840
},
{
"epoch": 0.3225806451612903,
"grad_norm": 4.5298285484313965,
"learning_rate": 4.7314453125e-05,
"loss": 0.2285,
"step": 850
},
{
"epoch": 0.32637571157495254,
"grad_norm": 1.9948817491531372,
"learning_rate": 4.7265625000000005e-05,
"loss": 0.2453,
"step": 860
},
{
"epoch": 0.3301707779886148,
"grad_norm": 2.5353565216064453,
"learning_rate": 4.7216796875000004e-05,
"loss": 0.2259,
"step": 870
},
{
"epoch": 0.33396584440227706,
"grad_norm": 5.23643684387207,
"learning_rate": 4.716796875e-05,
"loss": 0.2318,
"step": 880
},
{
"epoch": 0.3377609108159393,
"grad_norm": 3.062701463699341,
"learning_rate": 4.7119140625e-05,
"loss": 0.1835,
"step": 890
},
{
"epoch": 0.3415559772296015,
"grad_norm": 1.5771597623825073,
"learning_rate": 4.70703125e-05,
"loss": 0.2195,
"step": 900
},
{
"epoch": 0.34535104364326374,
"grad_norm": 0.9039077162742615,
"learning_rate": 4.7021484375000005e-05,
"loss": 0.1545,
"step": 910
},
{
"epoch": 0.349146110056926,
"grad_norm": 2.7035298347473145,
"learning_rate": 4.6972656250000004e-05,
"loss": 0.2221,
"step": 920
},
{
"epoch": 0.35294117647058826,
"grad_norm": 2.3225386142730713,
"learning_rate": 4.6923828125e-05,
"loss": 0.1912,
"step": 930
},
{
"epoch": 0.3567362428842505,
"grad_norm": 1.1066793203353882,
"learning_rate": 4.6875e-05,
"loss": 0.2003,
"step": 940
},
{
"epoch": 0.3605313092979127,
"grad_norm": 1.2358715534210205,
"learning_rate": 4.6826171875e-05,
"loss": 0.1944,
"step": 950
},
{
"epoch": 0.36432637571157495,
"grad_norm": 0.5866732001304626,
"learning_rate": 4.6777343750000005e-05,
"loss": 0.1885,
"step": 960
},
{
"epoch": 0.3681214421252372,
"grad_norm": 1.436168909072876,
"learning_rate": 4.6728515625000004e-05,
"loss": 0.182,
"step": 970
},
{
"epoch": 0.3719165085388994,
"grad_norm": 1.5037955045700073,
"learning_rate": 4.66796875e-05,
"loss": 0.2024,
"step": 980
},
{
"epoch": 0.3757115749525617,
"grad_norm": 1.4837393760681152,
"learning_rate": 4.6630859375e-05,
"loss": 0.2249,
"step": 990
},
{
"epoch": 0.3795066413662239,
"grad_norm": 12.082221031188965,
"learning_rate": 4.658203125e-05,
"loss": 0.2191,
"step": 1000
},
{
"epoch": 0.38330170777988615,
"grad_norm": 0.7743262648582458,
"learning_rate": 4.6533203125000005e-05,
"loss": 0.1654,
"step": 1010
},
{
"epoch": 0.3870967741935484,
"grad_norm": 3.7393670082092285,
"learning_rate": 4.6484375e-05,
"loss": 0.1595,
"step": 1020
},
{
"epoch": 0.3908918406072106,
"grad_norm": 1.2153229713439941,
"learning_rate": 4.6435546875e-05,
"loss": 0.2276,
"step": 1030
},
{
"epoch": 0.3946869070208729,
"grad_norm": 0.9271629452705383,
"learning_rate": 4.638671875e-05,
"loss": 0.2039,
"step": 1040
},
{
"epoch": 0.3984819734345351,
"grad_norm": 1.0829685926437378,
"learning_rate": 4.6337890625e-05,
"loss": 0.1731,
"step": 1050
},
{
"epoch": 0.40227703984819735,
"grad_norm": 1.2705596685409546,
"learning_rate": 4.6289062500000005e-05,
"loss": 0.1359,
"step": 1060
},
{
"epoch": 0.4060721062618596,
"grad_norm": 4.376911163330078,
"learning_rate": 4.6240234375e-05,
"loss": 0.2095,
"step": 1070
},
{
"epoch": 0.4098671726755218,
"grad_norm": 2.1292335987091064,
"learning_rate": 4.619140625e-05,
"loss": 0.1916,
"step": 1080
},
{
"epoch": 0.41366223908918404,
"grad_norm": 1.6525979042053223,
"learning_rate": 4.6142578125e-05,
"loss": 0.173,
"step": 1090
},
{
"epoch": 0.4174573055028463,
"grad_norm": 4.228000164031982,
"learning_rate": 4.609375e-05,
"loss": 0.2117,
"step": 1100
},
{
"epoch": 0.42125237191650855,
"grad_norm": 5.334222316741943,
"learning_rate": 4.6044921875000004e-05,
"loss": 0.185,
"step": 1110
},
{
"epoch": 0.4250474383301708,
"grad_norm": 1.7326403856277466,
"learning_rate": 4.599609375e-05,
"loss": 0.1875,
"step": 1120
},
{
"epoch": 0.428842504743833,
"grad_norm": 2.4292402267456055,
"learning_rate": 4.5947265625e-05,
"loss": 0.1747,
"step": 1130
},
{
"epoch": 0.43263757115749524,
"grad_norm": 1.6561298370361328,
"learning_rate": 4.58984375e-05,
"loss": 0.2017,
"step": 1140
},
{
"epoch": 0.4364326375711575,
"grad_norm": 2.659874439239502,
"learning_rate": 4.5849609375000005e-05,
"loss": 0.2415,
"step": 1150
},
{
"epoch": 0.44022770398481975,
"grad_norm": 2.743425130844116,
"learning_rate": 4.5800781250000004e-05,
"loss": 0.2332,
"step": 1160
},
{
"epoch": 0.444022770398482,
"grad_norm": 2.3197848796844482,
"learning_rate": 4.5751953125e-05,
"loss": 0.1946,
"step": 1170
},
{
"epoch": 0.4478178368121442,
"grad_norm": 2.110534191131592,
"learning_rate": 4.5703125e-05,
"loss": 0.1948,
"step": 1180
},
{
"epoch": 0.45161290322580644,
"grad_norm": 1.3609685897827148,
"learning_rate": 4.5654296875e-05,
"loss": 0.1801,
"step": 1190
},
{
"epoch": 0.45540796963946867,
"grad_norm": 3.159426689147949,
"learning_rate": 4.5605468750000005e-05,
"loss": 0.2184,
"step": 1200
},
{
"epoch": 0.45920303605313095,
"grad_norm": 1.7927987575531006,
"learning_rate": 4.5556640625000004e-05,
"loss": 0.1604,
"step": 1210
},
{
"epoch": 0.4629981024667932,
"grad_norm": 1.5928328037261963,
"learning_rate": 4.55078125e-05,
"loss": 0.1693,
"step": 1220
},
{
"epoch": 0.4667931688804554,
"grad_norm": 0.8145284056663513,
"learning_rate": 4.5458984375e-05,
"loss": 0.1761,
"step": 1230
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.7765156030654907,
"learning_rate": 4.541015625e-05,
"loss": 0.1799,
"step": 1240
},
{
"epoch": 0.47438330170777987,
"grad_norm": 1.8456169366836548,
"learning_rate": 4.5361328125000005e-05,
"loss": 0.168,
"step": 1250
},
{
"epoch": 0.4781783681214421,
"grad_norm": 1.6953251361846924,
"learning_rate": 4.5312500000000004e-05,
"loss": 0.1945,
"step": 1260
},
{
"epoch": 0.4819734345351044,
"grad_norm": 1.5285083055496216,
"learning_rate": 4.5263671875e-05,
"loss": 0.2075,
"step": 1270
},
{
"epoch": 0.4857685009487666,
"grad_norm": 2.95650577545166,
"learning_rate": 4.521484375e-05,
"loss": 0.1601,
"step": 1280
},
{
"epoch": 0.48956356736242884,
"grad_norm": 0.7677034735679626,
"learning_rate": 4.5166015625e-05,
"loss": 0.1695,
"step": 1290
},
{
"epoch": 0.49335863377609107,
"grad_norm": 1.9959975481033325,
"learning_rate": 4.5117187500000005e-05,
"loss": 0.2183,
"step": 1300
},
{
"epoch": 0.4971537001897533,
"grad_norm": 1.8000417947769165,
"learning_rate": 4.5068359375000003e-05,
"loss": 0.175,
"step": 1310
},
{
"epoch": 0.5009487666034156,
"grad_norm": 1.400612473487854,
"learning_rate": 4.501953125e-05,
"loss": 0.2085,
"step": 1320
},
{
"epoch": 0.5047438330170778,
"grad_norm": 1.6406989097595215,
"learning_rate": 4.4970703125e-05,
"loss": 0.1537,
"step": 1330
},
{
"epoch": 0.50853889943074,
"grad_norm": 2.0849852561950684,
"learning_rate": 4.4921875e-05,
"loss": 0.1579,
"step": 1340
},
{
"epoch": 0.5123339658444023,
"grad_norm": 2.6497225761413574,
"learning_rate": 4.4873046875000005e-05,
"loss": 0.1888,
"step": 1350
},
{
"epoch": 0.5161290322580645,
"grad_norm": 2.2594399452209473,
"learning_rate": 4.482421875e-05,
"loss": 0.1645,
"step": 1360
},
{
"epoch": 0.5199240986717267,
"grad_norm": 1.3591111898422241,
"learning_rate": 4.4775390625e-05,
"loss": 0.1876,
"step": 1370
},
{
"epoch": 0.523719165085389,
"grad_norm": 5.060487747192383,
"learning_rate": 4.47265625e-05,
"loss": 0.1946,
"step": 1380
},
{
"epoch": 0.5275142314990512,
"grad_norm": 1.7694716453552246,
"learning_rate": 4.4677734375e-05,
"loss": 0.0966,
"step": 1390
},
{
"epoch": 0.5313092979127134,
"grad_norm": 2.8661625385284424,
"learning_rate": 4.4628906250000004e-05,
"loss": 0.1614,
"step": 1400
},
{
"epoch": 0.5351043643263758,
"grad_norm": 2.2955727577209473,
"learning_rate": 4.4580078125e-05,
"loss": 0.193,
"step": 1410
},
{
"epoch": 0.538899430740038,
"grad_norm": 1.4596924781799316,
"learning_rate": 4.453125e-05,
"loss": 0.1971,
"step": 1420
},
{
"epoch": 0.5426944971537002,
"grad_norm": 1.039890170097351,
"learning_rate": 4.4482421875e-05,
"loss": 0.1909,
"step": 1430
},
{
"epoch": 0.5464895635673624,
"grad_norm": 1.433979868888855,
"learning_rate": 4.443359375e-05,
"loss": 0.1832,
"step": 1440
},
{
"epoch": 0.5502846299810247,
"grad_norm": 1.306391954421997,
"learning_rate": 4.4384765625000004e-05,
"loss": 0.1867,
"step": 1450
},
{
"epoch": 0.5540796963946869,
"grad_norm": 1.2681069374084473,
"learning_rate": 4.43359375e-05,
"loss": 0.1506,
"step": 1460
},
{
"epoch": 0.5578747628083491,
"grad_norm": 3.947502613067627,
"learning_rate": 4.4287109375e-05,
"loss": 0.1343,
"step": 1470
},
{
"epoch": 0.5616698292220114,
"grad_norm": 4.928821563720703,
"learning_rate": 4.423828125e-05,
"loss": 0.2057,
"step": 1480
},
{
"epoch": 0.5654648956356736,
"grad_norm": 2.162473201751709,
"learning_rate": 4.4189453125000005e-05,
"loss": 0.1942,
"step": 1490
},
{
"epoch": 0.5692599620493358,
"grad_norm": 5.402246475219727,
"learning_rate": 4.4140625000000004e-05,
"loss": 0.1727,
"step": 1500
},
{
"epoch": 0.573055028462998,
"grad_norm": 0.2728889286518097,
"learning_rate": 4.4091796875e-05,
"loss": 0.1345,
"step": 1510
},
{
"epoch": 0.5768500948766604,
"grad_norm": 2.027841567993164,
"learning_rate": 4.404296875e-05,
"loss": 0.213,
"step": 1520
},
{
"epoch": 0.5806451612903226,
"grad_norm": 1.3224737644195557,
"learning_rate": 4.3994140625e-05,
"loss": 0.1735,
"step": 1530
},
{
"epoch": 0.5844402277039848,
"grad_norm": 2.3124992847442627,
"learning_rate": 4.3945312500000005e-05,
"loss": 0.2177,
"step": 1540
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.2521787881851196,
"learning_rate": 4.3896484375000004e-05,
"loss": 0.1332,
"step": 1550
},
{
"epoch": 0.5920303605313093,
"grad_norm": 2.5216283798217773,
"learning_rate": 4.384765625e-05,
"loss": 0.1318,
"step": 1560
},
{
"epoch": 0.5958254269449715,
"grad_norm": 1.8268439769744873,
"learning_rate": 4.3798828125e-05,
"loss": 0.1269,
"step": 1570
},
{
"epoch": 0.5996204933586338,
"grad_norm": 0.6268766522407532,
"learning_rate": 4.375e-05,
"loss": 0.1381,
"step": 1580
},
{
"epoch": 0.603415559772296,
"grad_norm": 1.979546308517456,
"learning_rate": 4.3701171875000005e-05,
"loss": 0.1351,
"step": 1590
},
{
"epoch": 0.6072106261859582,
"grad_norm": 1.5526436567306519,
"learning_rate": 4.3652343750000004e-05,
"loss": 0.2163,
"step": 1600
},
{
"epoch": 0.6110056925996205,
"grad_norm": 0.9428083896636963,
"learning_rate": 4.3603515625e-05,
"loss": 0.1398,
"step": 1610
},
{
"epoch": 0.6148007590132827,
"grad_norm": 2.1224870681762695,
"learning_rate": 4.35546875e-05,
"loss": 0.1891,
"step": 1620
},
{
"epoch": 0.618595825426945,
"grad_norm": 0.3401525914669037,
"learning_rate": 4.3505859375e-05,
"loss": 0.1068,
"step": 1630
},
{
"epoch": 0.6223908918406073,
"grad_norm": 1.1070092916488647,
"learning_rate": 4.3457031250000005e-05,
"loss": 0.1407,
"step": 1640
},
{
"epoch": 0.6261859582542695,
"grad_norm": 1.1588579416275024,
"learning_rate": 4.3408203125e-05,
"loss": 0.2238,
"step": 1650
},
{
"epoch": 0.6299810246679317,
"grad_norm": 1.3201090097427368,
"learning_rate": 4.3359375e-05,
"loss": 0.2135,
"step": 1660
},
{
"epoch": 0.6337760910815939,
"grad_norm": 1.2257441282272339,
"learning_rate": 4.3310546875e-05,
"loss": 0.1261,
"step": 1670
},
{
"epoch": 0.6375711574952562,
"grad_norm": 1.4213567972183228,
"learning_rate": 4.326171875e-05,
"loss": 0.1439,
"step": 1680
},
{
"epoch": 0.6413662239089184,
"grad_norm": 1.0983916521072388,
"learning_rate": 4.3212890625000004e-05,
"loss": 0.1356,
"step": 1690
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.6485854387283325,
"learning_rate": 4.31640625e-05,
"loss": 0.1549,
"step": 1700
},
{
"epoch": 0.6489563567362429,
"grad_norm": 5.49334716796875,
"learning_rate": 4.3115234375e-05,
"loss": 0.1519,
"step": 1710
},
{
"epoch": 0.6527514231499051,
"grad_norm": 0.26703280210494995,
"learning_rate": 4.306640625e-05,
"loss": 0.1499,
"step": 1720
},
{
"epoch": 0.6565464895635673,
"grad_norm": 1.5822151899337769,
"learning_rate": 4.3017578125e-05,
"loss": 0.1733,
"step": 1730
},
{
"epoch": 0.6603415559772297,
"grad_norm": 1.1510590314865112,
"learning_rate": 4.2968750000000004e-05,
"loss": 0.1665,
"step": 1740
},
{
"epoch": 0.6641366223908919,
"grad_norm": 2.48427152633667,
"learning_rate": 4.2919921875e-05,
"loss": 0.1598,
"step": 1750
},
{
"epoch": 0.6679316888045541,
"grad_norm": 2.0076019763946533,
"learning_rate": 4.287109375e-05,
"loss": 0.1642,
"step": 1760
},
{
"epoch": 0.6717267552182163,
"grad_norm": 2.1611413955688477,
"learning_rate": 4.2822265625e-05,
"loss": 0.1538,
"step": 1770
},
{
"epoch": 0.6755218216318786,
"grad_norm": 2.476008415222168,
"learning_rate": 4.27734375e-05,
"loss": 0.1193,
"step": 1780
},
{
"epoch": 0.6793168880455408,
"grad_norm": 2.426025867462158,
"learning_rate": 4.2724609375000004e-05,
"loss": 0.161,
"step": 1790
},
{
"epoch": 0.683111954459203,
"grad_norm": 2.2168385982513428,
"learning_rate": 4.267578125e-05,
"loss": 0.1429,
"step": 1800
},
{
"epoch": 0.6869070208728653,
"grad_norm": 1.63054358959198,
"learning_rate": 4.2626953125e-05,
"loss": 0.1561,
"step": 1810
},
{
"epoch": 0.6907020872865275,
"grad_norm": 5.170077323913574,
"learning_rate": 4.2578125e-05,
"loss": 0.1685,
"step": 1820
},
{
"epoch": 0.6944971537001897,
"grad_norm": 2.700263023376465,
"learning_rate": 4.2529296875000005e-05,
"loss": 0.1601,
"step": 1830
},
{
"epoch": 0.698292220113852,
"grad_norm": 1.6965094804763794,
"learning_rate": 4.2480468750000004e-05,
"loss": 0.1046,
"step": 1840
},
{
"epoch": 0.7020872865275142,
"grad_norm": 5.461817264556885,
"learning_rate": 4.2431640625e-05,
"loss": 0.1421,
"step": 1850
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.584050178527832,
"learning_rate": 4.23828125e-05,
"loss": 0.1781,
"step": 1860
},
{
"epoch": 0.7096774193548387,
"grad_norm": 2.42586088180542,
"learning_rate": 4.2333984375e-05,
"loss": 0.1274,
"step": 1870
},
{
"epoch": 0.713472485768501,
"grad_norm": 3.151433229446411,
"learning_rate": 4.2285156250000005e-05,
"loss": 0.1825,
"step": 1880
},
{
"epoch": 0.7172675521821632,
"grad_norm": 1.1808427572250366,
"learning_rate": 4.2236328125000004e-05,
"loss": 0.2085,
"step": 1890
},
{
"epoch": 0.7210626185958254,
"grad_norm": 1.981814980506897,
"learning_rate": 4.21875e-05,
"loss": 0.1718,
"step": 1900
},
{
"epoch": 0.7248576850094877,
"grad_norm": 0.9719598293304443,
"learning_rate": 4.2138671875e-05,
"loss": 0.1461,
"step": 1910
},
{
"epoch": 0.7286527514231499,
"grad_norm": 1.493422031402588,
"learning_rate": 4.208984375e-05,
"loss": 0.1902,
"step": 1920
},
{
"epoch": 0.7324478178368121,
"grad_norm": 1.4552210569381714,
"learning_rate": 4.2041015625000005e-05,
"loss": 0.1253,
"step": 1930
},
{
"epoch": 0.7362428842504743,
"grad_norm": 2.0822556018829346,
"learning_rate": 4.1992187500000003e-05,
"loss": 0.144,
"step": 1940
},
{
"epoch": 0.7400379506641366,
"grad_norm": 2.461090326309204,
"learning_rate": 4.1943359375e-05,
"loss": 0.2084,
"step": 1950
},
{
"epoch": 0.7438330170777988,
"grad_norm": 1.8043471574783325,
"learning_rate": 4.189453125e-05,
"loss": 0.1904,
"step": 1960
},
{
"epoch": 0.7476280834914611,
"grad_norm": 1.6388760805130005,
"learning_rate": 4.1845703125e-05,
"loss": 0.2071,
"step": 1970
},
{
"epoch": 0.7514231499051234,
"grad_norm": 2.5029492378234863,
"learning_rate": 4.1796875000000005e-05,
"loss": 0.1881,
"step": 1980
},
{
"epoch": 0.7552182163187856,
"grad_norm": 1.3092814683914185,
"learning_rate": 4.1748046875e-05,
"loss": 0.1356,
"step": 1990
},
{
"epoch": 0.7590132827324478,
"grad_norm": 1.2208425998687744,
"learning_rate": 4.169921875e-05,
"loss": 0.1378,
"step": 2000
},
{
"epoch": 0.7628083491461101,
"grad_norm": 3.214336633682251,
"learning_rate": 4.1650390625e-05,
"loss": 0.1954,
"step": 2010
},
{
"epoch": 0.7666034155597723,
"grad_norm": 4.104292392730713,
"learning_rate": 4.16015625e-05,
"loss": 0.1886,
"step": 2020
},
{
"epoch": 0.7703984819734345,
"grad_norm": 2.170186996459961,
"learning_rate": 4.1552734375000004e-05,
"loss": 0.1705,
"step": 2030
},
{
"epoch": 0.7741935483870968,
"grad_norm": 2.6494083404541016,
"learning_rate": 4.150390625e-05,
"loss": 0.1986,
"step": 2040
},
{
"epoch": 0.777988614800759,
"grad_norm": 0.7542719841003418,
"learning_rate": 4.1455078125e-05,
"loss": 0.1255,
"step": 2050
},
{
"epoch": 0.7817836812144212,
"grad_norm": 3.126569986343384,
"learning_rate": 4.140625e-05,
"loss": 0.1576,
"step": 2060
},
{
"epoch": 0.7855787476280834,
"grad_norm": 1.0665310621261597,
"learning_rate": 4.1357421875e-05,
"loss": 0.174,
"step": 2070
},
{
"epoch": 0.7893738140417458,
"grad_norm": 1.3480401039123535,
"learning_rate": 4.1308593750000004e-05,
"loss": 0.1203,
"step": 2080
},
{
"epoch": 0.793168880455408,
"grad_norm": 2.358405113220215,
"learning_rate": 4.1259765625e-05,
"loss": 0.1394,
"step": 2090
},
{
"epoch": 0.7969639468690702,
"grad_norm": 3.2337498664855957,
"learning_rate": 4.12109375e-05,
"loss": 0.1711,
"step": 2100
},
{
"epoch": 0.8007590132827325,
"grad_norm": 2.7708380222320557,
"learning_rate": 4.1162109375e-05,
"loss": 0.1265,
"step": 2110
},
{
"epoch": 0.8045540796963947,
"grad_norm": 3.3023488521575928,
"learning_rate": 4.1113281250000005e-05,
"loss": 0.1706,
"step": 2120
},
{
"epoch": 0.8083491461100569,
"grad_norm": 1.758325219154358,
"learning_rate": 4.1064453125000004e-05,
"loss": 0.1371,
"step": 2130
},
{
"epoch": 0.8121442125237192,
"grad_norm": 1.5623672008514404,
"learning_rate": 4.1015625e-05,
"loss": 0.1756,
"step": 2140
},
{
"epoch": 0.8159392789373814,
"grad_norm": 1.3145450353622437,
"learning_rate": 4.0966796875e-05,
"loss": 0.1328,
"step": 2150
},
{
"epoch": 0.8197343453510436,
"grad_norm": 2.432619094848633,
"learning_rate": 4.091796875e-05,
"loss": 0.1286,
"step": 2160
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.4147840142250061,
"learning_rate": 4.0869140625000005e-05,
"loss": 0.1509,
"step": 2170
},
{
"epoch": 0.8273244781783681,
"grad_norm": 1.6098836660385132,
"learning_rate": 4.0820312500000004e-05,
"loss": 0.1746,
"step": 2180
},
{
"epoch": 0.8311195445920304,
"grad_norm": 2.5355212688446045,
"learning_rate": 4.0771484375e-05,
"loss": 0.1238,
"step": 2190
},
{
"epoch": 0.8349146110056926,
"grad_norm": 1.5544086694717407,
"learning_rate": 4.072265625e-05,
"loss": 0.2168,
"step": 2200
},
{
"epoch": 0.8387096774193549,
"grad_norm": 2.1792962551116943,
"learning_rate": 4.0673828125e-05,
"loss": 0.1338,
"step": 2210
},
{
"epoch": 0.8425047438330171,
"grad_norm": 2.667340040206909,
"learning_rate": 4.0625000000000005e-05,
"loss": 0.1505,
"step": 2220
},
{
"epoch": 0.8462998102466793,
"grad_norm": 0.8551260232925415,
"learning_rate": 4.0576171875000004e-05,
"loss": 0.1081,
"step": 2230
},
{
"epoch": 0.8500948766603416,
"grad_norm": 2.8773763179779053,
"learning_rate": 4.052734375e-05,
"loss": 0.1089,
"step": 2240
},
{
"epoch": 0.8538899430740038,
"grad_norm": 2.12497878074646,
"learning_rate": 4.0478515625e-05,
"loss": 0.1268,
"step": 2250
},
{
"epoch": 0.857685009487666,
"grad_norm": 1.8039929866790771,
"learning_rate": 4.04296875e-05,
"loss": 0.1544,
"step": 2260
},
{
"epoch": 0.8614800759013282,
"grad_norm": 0.4839627742767334,
"learning_rate": 4.0380859375000005e-05,
"loss": 0.1421,
"step": 2270
},
{
"epoch": 0.8652751423149905,
"grad_norm": 3.672240734100342,
"learning_rate": 4.033203125e-05,
"loss": 0.134,
"step": 2280
},
{
"epoch": 0.8690702087286527,
"grad_norm": 2.4371728897094727,
"learning_rate": 4.0283203125e-05,
"loss": 0.1419,
"step": 2290
},
{
"epoch": 0.872865275142315,
"grad_norm": 1.8469904661178589,
"learning_rate": 4.0234375e-05,
"loss": 0.1846,
"step": 2300
},
{
"epoch": 0.8766603415559773,
"grad_norm": 0.7639700174331665,
"learning_rate": 4.0185546875e-05,
"loss": 0.106,
"step": 2310
},
{
"epoch": 0.8804554079696395,
"grad_norm": 1.4450427293777466,
"learning_rate": 4.0136718750000004e-05,
"loss": 0.1408,
"step": 2320
},
{
"epoch": 0.8842504743833017,
"grad_norm": 1.3033993244171143,
"learning_rate": 4.0087890625e-05,
"loss": 0.1456,
"step": 2330
},
{
"epoch": 0.888045540796964,
"grad_norm": 1.3045791387557983,
"learning_rate": 4.00390625e-05,
"loss": 0.1531,
"step": 2340
},
{
"epoch": 0.8918406072106262,
"grad_norm": 3.4357423782348633,
"learning_rate": 3.9990234375e-05,
"loss": 0.1417,
"step": 2350
},
{
"epoch": 0.8956356736242884,
"grad_norm": 3.5311038494110107,
"learning_rate": 3.994140625e-05,
"loss": 0.1317,
"step": 2360
},
{
"epoch": 0.8994307400379506,
"grad_norm": 4.028538227081299,
"learning_rate": 3.9892578125000004e-05,
"loss": 0.1644,
"step": 2370
},
{
"epoch": 0.9032258064516129,
"grad_norm": 1.4089256525039673,
"learning_rate": 3.984375e-05,
"loss": 0.1087,
"step": 2380
},
{
"epoch": 0.9070208728652751,
"grad_norm": 0.2230881005525589,
"learning_rate": 3.9794921875e-05,
"loss": 0.1387,
"step": 2390
},
{
"epoch": 0.9108159392789373,
"grad_norm": 2.5647592544555664,
"learning_rate": 3.974609375e-05,
"loss": 0.1475,
"step": 2400
},
{
"epoch": 0.9146110056925996,
"grad_norm": 1.2803542613983154,
"learning_rate": 3.9697265625e-05,
"loss": 0.126,
"step": 2410
},
{
"epoch": 0.9184060721062619,
"grad_norm": 3.2023112773895264,
"learning_rate": 3.9648437500000004e-05,
"loss": 0.1458,
"step": 2420
},
{
"epoch": 0.9222011385199241,
"grad_norm": 3.615530252456665,
"learning_rate": 3.9599609375e-05,
"loss": 0.1297,
"step": 2430
},
{
"epoch": 0.9259962049335864,
"grad_norm": 3.396568536758423,
"learning_rate": 3.955078125e-05,
"loss": 0.1486,
"step": 2440
},
{
"epoch": 0.9297912713472486,
"grad_norm": 1.7030583620071411,
"learning_rate": 3.9501953125e-05,
"loss": 0.1464,
"step": 2450
},
{
"epoch": 0.9335863377609108,
"grad_norm": 1.0317497253417969,
"learning_rate": 3.9453125000000005e-05,
"loss": 0.1658,
"step": 2460
},
{
"epoch": 0.937381404174573,
"grad_norm": 1.1268532276153564,
"learning_rate": 3.9404296875000004e-05,
"loss": 0.1425,
"step": 2470
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.9238561391830444,
"learning_rate": 3.935546875e-05,
"loss": 0.1565,
"step": 2480
},
{
"epoch": 0.9449715370018975,
"grad_norm": 1.4960806369781494,
"learning_rate": 3.9306640625e-05,
"loss": 0.1681,
"step": 2490
},
{
"epoch": 0.9487666034155597,
"grad_norm": 1.306814193725586,
"learning_rate": 3.92578125e-05,
"loss": 0.1719,
"step": 2500
},
{
"epoch": 0.952561669829222,
"grad_norm": 0.391342431306839,
"learning_rate": 3.9208984375000005e-05,
"loss": 0.1497,
"step": 2510
},
{
"epoch": 0.9563567362428842,
"grad_norm": 1.9634449481964111,
"learning_rate": 3.9160156250000004e-05,
"loss": 0.124,
"step": 2520
},
{
"epoch": 0.9601518026565465,
"grad_norm": 2.7319021224975586,
"learning_rate": 3.9111328125e-05,
"loss": 0.1029,
"step": 2530
},
{
"epoch": 0.9639468690702088,
"grad_norm": 1.062157392501831,
"learning_rate": 3.90625e-05,
"loss": 0.1612,
"step": 2540
},
{
"epoch": 0.967741935483871,
"grad_norm": 2.737459182739258,
"learning_rate": 3.9013671875e-05,
"loss": 0.1817,
"step": 2550
},
{
"epoch": 0.9715370018975332,
"grad_norm": 1.4106887578964233,
"learning_rate": 3.8964843750000005e-05,
"loss": 0.1875,
"step": 2560
},
{
"epoch": 0.9753320683111955,
"grad_norm": 7.118113040924072,
"learning_rate": 3.8916015625000003e-05,
"loss": 0.2243,
"step": 2570
},
{
"epoch": 0.9791271347248577,
"grad_norm": 2.956235647201538,
"learning_rate": 3.88671875e-05,
"loss": 0.1059,
"step": 2580
},
{
"epoch": 0.9829222011385199,
"grad_norm": 1.2888784408569336,
"learning_rate": 3.8818359375e-05,
"loss": 0.1546,
"step": 2590
},
{
"epoch": 0.9867172675521821,
"grad_norm": 2.5757930278778076,
"learning_rate": 3.876953125e-05,
"loss": 0.115,
"step": 2600
},
{
"epoch": 0.9905123339658444,
"grad_norm": 0.7105236053466797,
"learning_rate": 3.8720703125000005e-05,
"loss": 0.1218,
"step": 2610
},
{
"epoch": 0.9943074003795066,
"grad_norm": 2.5876383781433105,
"learning_rate": 3.8671875e-05,
"loss": 0.1487,
"step": 2620
},
{
"epoch": 0.9981024667931688,
"grad_norm": 0.2208087146282196,
"learning_rate": 3.8623046875e-05,
"loss": 0.1429,
"step": 2630
},
{
"epoch": 1.0018975332068312,
"grad_norm": 0.6170036196708679,
"learning_rate": 3.857421875e-05,
"loss": 0.128,
"step": 2640
},
{
"epoch": 1.0056925996204933,
"grad_norm": 1.1868369579315186,
"learning_rate": 3.8525390625e-05,
"loss": 0.0923,
"step": 2650
},
{
"epoch": 1.0094876660341556,
"grad_norm": 3.0359079837799072,
"learning_rate": 3.8476562500000004e-05,
"loss": 0.1104,
"step": 2660
},
{
"epoch": 1.0132827324478177,
"grad_norm": 0.6559151411056519,
"learning_rate": 3.8427734375e-05,
"loss": 0.1089,
"step": 2670
},
{
"epoch": 1.01707779886148,
"grad_norm": 10.784985542297363,
"learning_rate": 3.837890625e-05,
"loss": 0.1408,
"step": 2680
},
{
"epoch": 1.0208728652751424,
"grad_norm": 1.7095699310302734,
"learning_rate": 3.8330078125e-05,
"loss": 0.1358,
"step": 2690
},
{
"epoch": 1.0246679316888045,
"grad_norm": 1.3584043979644775,
"learning_rate": 3.828125e-05,
"loss": 0.1248,
"step": 2700
},
{
"epoch": 1.0284629981024669,
"grad_norm": 5.567887783050537,
"learning_rate": 3.8232421875000004e-05,
"loss": 0.0992,
"step": 2710
},
{
"epoch": 1.032258064516129,
"grad_norm": 1.6698075532913208,
"learning_rate": 3.818359375e-05,
"loss": 0.1503,
"step": 2720
},
{
"epoch": 1.0360531309297913,
"grad_norm": 0.29519161581993103,
"learning_rate": 3.8134765625e-05,
"loss": 0.1247,
"step": 2730
},
{
"epoch": 1.0398481973434535,
"grad_norm": 2.3616697788238525,
"learning_rate": 3.80859375e-05,
"loss": 0.1459,
"step": 2740
},
{
"epoch": 1.0436432637571158,
"grad_norm": 1.219618320465088,
"learning_rate": 3.8037109375e-05,
"loss": 0.1036,
"step": 2750
},
{
"epoch": 1.047438330170778,
"grad_norm": 1.3592404127120972,
"learning_rate": 3.7988281250000004e-05,
"loss": 0.1399,
"step": 2760
},
{
"epoch": 1.0512333965844403,
"grad_norm": 1.2837351560592651,
"learning_rate": 3.7939453125e-05,
"loss": 0.1581,
"step": 2770
},
{
"epoch": 1.0550284629981024,
"grad_norm": 1.3627588748931885,
"learning_rate": 3.7890625e-05,
"loss": 0.1093,
"step": 2780
},
{
"epoch": 1.0588235294117647,
"grad_norm": 4.571230888366699,
"learning_rate": 3.7841796875e-05,
"loss": 0.1693,
"step": 2790
},
{
"epoch": 1.0626185958254268,
"grad_norm": 1.575040578842163,
"learning_rate": 3.7792968750000005e-05,
"loss": 0.1646,
"step": 2800
},
{
"epoch": 1.0664136622390892,
"grad_norm": 2.594174861907959,
"learning_rate": 3.7744140625000004e-05,
"loss": 0.0976,
"step": 2810
},
{
"epoch": 1.0702087286527515,
"grad_norm": 4.076402187347412,
"learning_rate": 3.76953125e-05,
"loss": 0.1301,
"step": 2820
},
{
"epoch": 1.0740037950664136,
"grad_norm": 2.7510082721710205,
"learning_rate": 3.7646484375e-05,
"loss": 0.1337,
"step": 2830
},
{
"epoch": 1.077798861480076,
"grad_norm": 0.8219005465507507,
"learning_rate": 3.759765625e-05,
"loss": 0.1122,
"step": 2840
},
{
"epoch": 1.081593927893738,
"grad_norm": 1.9153568744659424,
"learning_rate": 3.7548828125000005e-05,
"loss": 0.1428,
"step": 2850
},
{
"epoch": 1.0853889943074004,
"grad_norm": 2.93013858795166,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.1872,
"step": 2860
},
{
"epoch": 1.0891840607210626,
"grad_norm": 0.7126034498214722,
"learning_rate": 3.7451171875e-05,
"loss": 0.1106,
"step": 2870
},
{
"epoch": 1.092979127134725,
"grad_norm": 1.8968008756637573,
"learning_rate": 3.740234375e-05,
"loss": 0.1131,
"step": 2880
},
{
"epoch": 1.096774193548387,
"grad_norm": 5.133113861083984,
"learning_rate": 3.7353515625e-05,
"loss": 0.0884,
"step": 2890
},
{
"epoch": 1.1005692599620494,
"grad_norm": 3.756060838699341,
"learning_rate": 3.7304687500000005e-05,
"loss": 0.1373,
"step": 2900
},
{
"epoch": 1.1043643263757117,
"grad_norm": 7.563070297241211,
"learning_rate": 3.7255859375e-05,
"loss": 0.1353,
"step": 2910
},
{
"epoch": 1.1081593927893738,
"grad_norm": 4.473198413848877,
"learning_rate": 3.720703125e-05,
"loss": 0.1639,
"step": 2920
},
{
"epoch": 1.1119544592030361,
"grad_norm": 2.689405679702759,
"learning_rate": 3.7158203125e-05,
"loss": 0.1117,
"step": 2930
},
{
"epoch": 1.1157495256166983,
"grad_norm": 0.2793045938014984,
"learning_rate": 3.7109375e-05,
"loss": 0.1073,
"step": 2940
},
{
"epoch": 1.1195445920303606,
"grad_norm": 1.4892089366912842,
"learning_rate": 3.7060546875000004e-05,
"loss": 0.1541,
"step": 2950
},
{
"epoch": 1.1233396584440227,
"grad_norm": 1.1303538084030151,
"learning_rate": 3.701171875e-05,
"loss": 0.0961,
"step": 2960
},
{
"epoch": 1.127134724857685,
"grad_norm": 0.6085264682769775,
"learning_rate": 3.6962890625e-05,
"loss": 0.111,
"step": 2970
},
{
"epoch": 1.1309297912713472,
"grad_norm": 0.44500744342803955,
"learning_rate": 3.69140625e-05,
"loss": 0.0939,
"step": 2980
},
{
"epoch": 1.1347248576850095,
"grad_norm": 1.8215651512145996,
"learning_rate": 3.6865234375e-05,
"loss": 0.1112,
"step": 2990
},
{
"epoch": 1.1385199240986716,
"grad_norm": 0.7494792938232422,
"learning_rate": 3.6816406250000004e-05,
"loss": 0.1407,
"step": 3000
},
{
"epoch": 1.142314990512334,
"grad_norm": 1.2958310842514038,
"learning_rate": 3.6767578125e-05,
"loss": 0.086,
"step": 3010
},
{
"epoch": 1.146110056925996,
"grad_norm": 1.223376989364624,
"learning_rate": 3.671875e-05,
"loss": 0.1152,
"step": 3020
},
{
"epoch": 1.1499051233396584,
"grad_norm": 5.232940196990967,
"learning_rate": 3.6669921875e-05,
"loss": 0.1308,
"step": 3030
},
{
"epoch": 1.1537001897533208,
"grad_norm": 1.4690934419631958,
"learning_rate": 3.662109375e-05,
"loss": 0.1275,
"step": 3040
},
{
"epoch": 1.157495256166983,
"grad_norm": 0.8882303833961487,
"learning_rate": 3.6572265625000004e-05,
"loss": 0.0709,
"step": 3050
},
{
"epoch": 1.1612903225806452,
"grad_norm": 7.125335216522217,
"learning_rate": 3.65234375e-05,
"loss": 0.0991,
"step": 3060
},
{
"epoch": 1.1650853889943074,
"grad_norm": 2.321225881576538,
"learning_rate": 3.6474609375e-05,
"loss": 0.1986,
"step": 3070
},
{
"epoch": 1.1688804554079697,
"grad_norm": 2.8146891593933105,
"learning_rate": 3.642578125e-05,
"loss": 0.1497,
"step": 3080
},
{
"epoch": 1.1726755218216318,
"grad_norm": 2.781428575515747,
"learning_rate": 3.6376953125e-05,
"loss": 0.1075,
"step": 3090
},
{
"epoch": 1.1764705882352942,
"grad_norm": 7.027383327484131,
"learning_rate": 3.6328125000000004e-05,
"loss": 0.0921,
"step": 3100
},
{
"epoch": 1.1802656546489563,
"grad_norm": 2.3189167976379395,
"learning_rate": 3.6279296875e-05,
"loss": 0.0784,
"step": 3110
},
{
"epoch": 1.1840607210626186,
"grad_norm": 3.060039758682251,
"learning_rate": 3.623046875e-05,
"loss": 0.1262,
"step": 3120
},
{
"epoch": 1.187855787476281,
"grad_norm": 6.099356174468994,
"learning_rate": 3.6181640625e-05,
"loss": 0.1506,
"step": 3130
},
{
"epoch": 1.191650853889943,
"grad_norm": 3.1299543380737305,
"learning_rate": 3.6132812500000005e-05,
"loss": 0.1431,
"step": 3140
},
{
"epoch": 1.1954459203036052,
"grad_norm": 1.5676418542861938,
"learning_rate": 3.6083984375000004e-05,
"loss": 0.1018,
"step": 3150
},
{
"epoch": 1.1992409867172675,
"grad_norm": 0.786465585231781,
"learning_rate": 3.603515625e-05,
"loss": 0.1471,
"step": 3160
},
{
"epoch": 1.2030360531309299,
"grad_norm": 0.6863810420036316,
"learning_rate": 3.5986328125e-05,
"loss": 0.1144,
"step": 3170
},
{
"epoch": 1.206831119544592,
"grad_norm": 6.13245964050293,
"learning_rate": 3.59375e-05,
"loss": 0.1378,
"step": 3180
},
{
"epoch": 1.2106261859582543,
"grad_norm": 0.9144377112388611,
"learning_rate": 3.5888671875000005e-05,
"loss": 0.1024,
"step": 3190
},
{
"epoch": 1.2144212523719164,
"grad_norm": 13.092443466186523,
"learning_rate": 3.583984375e-05,
"loss": 0.1241,
"step": 3200
},
{
"epoch": 1.2182163187855788,
"grad_norm": 5.453747272491455,
"learning_rate": 3.5791015625e-05,
"loss": 0.1307,
"step": 3210
},
{
"epoch": 1.222011385199241,
"grad_norm": 5.696516036987305,
"learning_rate": 3.57421875e-05,
"loss": 0.1661,
"step": 3220
},
{
"epoch": 1.2258064516129032,
"grad_norm": 1.4154207706451416,
"learning_rate": 3.5693359375e-05,
"loss": 0.1017,
"step": 3230
},
{
"epoch": 1.2296015180265654,
"grad_norm": 3.1260204315185547,
"learning_rate": 3.5644531250000005e-05,
"loss": 0.1224,
"step": 3240
},
{
"epoch": 1.2333965844402277,
"grad_norm": 1.4753592014312744,
"learning_rate": 3.5595703125e-05,
"loss": 0.1,
"step": 3250
},
{
"epoch": 1.23719165085389,
"grad_norm": 2.7512917518615723,
"learning_rate": 3.5546875e-05,
"loss": 0.152,
"step": 3260
},
{
"epoch": 1.2409867172675522,
"grad_norm": 0.1835506409406662,
"learning_rate": 3.5498046875e-05,
"loss": 0.0897,
"step": 3270
},
{
"epoch": 1.2447817836812145,
"grad_norm": 2.484245777130127,
"learning_rate": 3.544921875e-05,
"loss": 0.1284,
"step": 3280
},
{
"epoch": 1.2485768500948766,
"grad_norm": 2.778939962387085,
"learning_rate": 3.5400390625000004e-05,
"loss": 0.1225,
"step": 3290
},
{
"epoch": 1.252371916508539,
"grad_norm": 4.067395210266113,
"learning_rate": 3.53515625e-05,
"loss": 0.1687,
"step": 3300
},
{
"epoch": 1.256166982922201,
"grad_norm": 0.2922412157058716,
"learning_rate": 3.5302734375e-05,
"loss": 0.066,
"step": 3310
},
{
"epoch": 1.2599620493358634,
"grad_norm": 2.992678165435791,
"learning_rate": 3.525390625e-05,
"loss": 0.1016,
"step": 3320
},
{
"epoch": 1.2637571157495255,
"grad_norm": 0.5019288063049316,
"learning_rate": 3.5205078125e-05,
"loss": 0.0877,
"step": 3330
},
{
"epoch": 1.2675521821631879,
"grad_norm": 5.55689811706543,
"learning_rate": 3.5156250000000004e-05,
"loss": 0.1191,
"step": 3340
},
{
"epoch": 1.2713472485768502,
"grad_norm": 3.2791213989257812,
"learning_rate": 3.5107421875e-05,
"loss": 0.1086,
"step": 3350
},
{
"epoch": 1.2751423149905123,
"grad_norm": 7.413064956665039,
"learning_rate": 3.505859375e-05,
"loss": 0.1063,
"step": 3360
},
{
"epoch": 1.2789373814041745,
"grad_norm": 4.541271686553955,
"learning_rate": 3.5009765625e-05,
"loss": 0.0959,
"step": 3370
},
{
"epoch": 1.2827324478178368,
"grad_norm": 2.8879811763763428,
"learning_rate": 3.49609375e-05,
"loss": 0.1178,
"step": 3380
},
{
"epoch": 1.2865275142314991,
"grad_norm": 3.210865020751953,
"learning_rate": 3.4912109375000004e-05,
"loss": 0.1464,
"step": 3390
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.654231071472168,
"learning_rate": 3.486328125e-05,
"loss": 0.1404,
"step": 3400
},
{
"epoch": 1.2941176470588236,
"grad_norm": 2.9404890537261963,
"learning_rate": 3.4814453125e-05,
"loss": 0.1213,
"step": 3410
},
{
"epoch": 1.2979127134724857,
"grad_norm": 2.2991085052490234,
"learning_rate": 3.4765625e-05,
"loss": 0.1131,
"step": 3420
},
{
"epoch": 1.301707779886148,
"grad_norm": 0.30925440788269043,
"learning_rate": 3.4716796875e-05,
"loss": 0.1166,
"step": 3430
},
{
"epoch": 1.3055028462998102,
"grad_norm": 1.3804266452789307,
"learning_rate": 3.4667968750000004e-05,
"loss": 0.0634,
"step": 3440
},
{
"epoch": 1.3092979127134725,
"grad_norm": 3.1803112030029297,
"learning_rate": 3.4619140625e-05,
"loss": 0.1916,
"step": 3450
},
{
"epoch": 1.3130929791271346,
"grad_norm": 2.8847222328186035,
"learning_rate": 3.45703125e-05,
"loss": 0.1856,
"step": 3460
},
{
"epoch": 1.316888045540797,
"grad_norm": 7.0924973487854,
"learning_rate": 3.4521484375e-05,
"loss": 0.1292,
"step": 3470
},
{
"epoch": 1.3206831119544593,
"grad_norm": 4.695943355560303,
"learning_rate": 3.4472656250000005e-05,
"loss": 0.1518,
"step": 3480
},
{
"epoch": 1.3244781783681214,
"grad_norm": 4.995908260345459,
"learning_rate": 3.4423828125000003e-05,
"loss": 0.12,
"step": 3490
},
{
"epoch": 1.3282732447817835,
"grad_norm": 4.585287570953369,
"learning_rate": 3.4375e-05,
"loss": 0.0933,
"step": 3500
},
{
"epoch": 1.3320683111954459,
"grad_norm": 1.5841524600982666,
"learning_rate": 3.4326171875e-05,
"loss": 0.1172,
"step": 3510
},
{
"epoch": 1.3358633776091082,
"grad_norm": 3.6837852001190186,
"learning_rate": 3.427734375e-05,
"loss": 0.1164,
"step": 3520
},
{
"epoch": 1.3396584440227703,
"grad_norm": 2.470222234725952,
"learning_rate": 3.4228515625000005e-05,
"loss": 0.1258,
"step": 3530
},
{
"epoch": 1.3434535104364327,
"grad_norm": 1.8782237768173218,
"learning_rate": 3.41796875e-05,
"loss": 0.1078,
"step": 3540
},
{
"epoch": 1.3472485768500948,
"grad_norm": 0.29535171389579773,
"learning_rate": 3.4130859375e-05,
"loss": 0.1658,
"step": 3550
},
{
"epoch": 1.3510436432637571,
"grad_norm": 3.8535208702087402,
"learning_rate": 3.408203125e-05,
"loss": 0.1632,
"step": 3560
},
{
"epoch": 1.3548387096774195,
"grad_norm": 2.0340235233306885,
"learning_rate": 3.4033203125e-05,
"loss": 0.1498,
"step": 3570
},
{
"epoch": 1.3586337760910816,
"grad_norm": 3.015774726867676,
"learning_rate": 3.3984375000000004e-05,
"loss": 0.1099,
"step": 3580
},
{
"epoch": 1.3624288425047437,
"grad_norm": 5.396883487701416,
"learning_rate": 3.3935546875e-05,
"loss": 0.1308,
"step": 3590
},
{
"epoch": 1.366223908918406,
"grad_norm": 4.15665864944458,
"learning_rate": 3.388671875e-05,
"loss": 0.0893,
"step": 3600
},
{
"epoch": 1.3700189753320684,
"grad_norm": 2.0461652278900146,
"learning_rate": 3.3837890625e-05,
"loss": 0.1157,
"step": 3610
},
{
"epoch": 1.3738140417457305,
"grad_norm": 1.5953052043914795,
"learning_rate": 3.37890625e-05,
"loss": 0.1611,
"step": 3620
},
{
"epoch": 1.3776091081593929,
"grad_norm": 3.8149826526641846,
"learning_rate": 3.3740234375000004e-05,
"loss": 0.1582,
"step": 3630
},
{
"epoch": 1.381404174573055,
"grad_norm": 5.658437252044678,
"learning_rate": 3.369140625e-05,
"loss": 0.1481,
"step": 3640
},
{
"epoch": 1.3851992409867173,
"grad_norm": 0.47566506266593933,
"learning_rate": 3.3642578125e-05,
"loss": 0.1336,
"step": 3650
},
{
"epoch": 1.3889943074003794,
"grad_norm": 2.9851224422454834,
"learning_rate": 3.359375e-05,
"loss": 0.1274,
"step": 3660
},
{
"epoch": 1.3927893738140418,
"grad_norm": 2.3793752193450928,
"learning_rate": 3.3544921875e-05,
"loss": 0.1189,
"step": 3670
},
{
"epoch": 1.396584440227704,
"grad_norm": 0.35333120822906494,
"learning_rate": 3.3496093750000004e-05,
"loss": 0.1021,
"step": 3680
},
{
"epoch": 1.4003795066413662,
"grad_norm": 2.170039653778076,
"learning_rate": 3.3447265625e-05,
"loss": 0.1016,
"step": 3690
},
{
"epoch": 1.4041745730550286,
"grad_norm": 3.225989818572998,
"learning_rate": 3.33984375e-05,
"loss": 0.1559,
"step": 3700
},
{
"epoch": 1.4079696394686907,
"grad_norm": 5.81306266784668,
"learning_rate": 3.3349609375e-05,
"loss": 0.1378,
"step": 3710
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.839579701423645,
"learning_rate": 3.330078125e-05,
"loss": 0.0981,
"step": 3720
},
{
"epoch": 1.4155597722960152,
"grad_norm": 2.421964645385742,
"learning_rate": 3.3251953125000004e-05,
"loss": 0.1267,
"step": 3730
},
{
"epoch": 1.4193548387096775,
"grad_norm": 0.298155814409256,
"learning_rate": 3.3203125e-05,
"loss": 0.1619,
"step": 3740
},
{
"epoch": 1.4231499051233396,
"grad_norm": 5.643527030944824,
"learning_rate": 3.3154296875e-05,
"loss": 0.0844,
"step": 3750
},
{
"epoch": 1.426944971537002,
"grad_norm": 1.7513082027435303,
"learning_rate": 3.310546875e-05,
"loss": 0.133,
"step": 3760
},
{
"epoch": 1.430740037950664,
"grad_norm": 1.2837634086608887,
"learning_rate": 3.3056640625000005e-05,
"loss": 0.1241,
"step": 3770
},
{
"epoch": 1.4345351043643264,
"grad_norm": 0.7017351984977722,
"learning_rate": 3.3007812500000004e-05,
"loss": 0.1123,
"step": 3780
},
{
"epoch": 1.4383301707779887,
"grad_norm": 6.043475151062012,
"learning_rate": 3.2958984375e-05,
"loss": 0.1249,
"step": 3790
},
{
"epoch": 1.4421252371916509,
"grad_norm": 4.449422359466553,
"learning_rate": 3.291015625e-05,
"loss": 0.173,
"step": 3800
},
{
"epoch": 1.445920303605313,
"grad_norm": 1.7111449241638184,
"learning_rate": 3.2861328125e-05,
"loss": 0.1473,
"step": 3810
},
{
"epoch": 1.4497153700189753,
"grad_norm": 1.3379569053649902,
"learning_rate": 3.2812500000000005e-05,
"loss": 0.1119,
"step": 3820
},
{
"epoch": 1.4535104364326377,
"grad_norm": 7.154158115386963,
"learning_rate": 3.2763671875e-05,
"loss": 0.1273,
"step": 3830
},
{
"epoch": 1.4573055028462998,
"grad_norm": 1.2248731851577759,
"learning_rate": 3.271484375e-05,
"loss": 0.1081,
"step": 3840
},
{
"epoch": 1.4611005692599621,
"grad_norm": 1.219230055809021,
"learning_rate": 3.2666015625e-05,
"loss": 0.0945,
"step": 3850
},
{
"epoch": 1.4648956356736242,
"grad_norm": 4.3124189376831055,
"learning_rate": 3.26171875e-05,
"loss": 0.1039,
"step": 3860
},
{
"epoch": 1.4686907020872866,
"grad_norm": 2.915302038192749,
"learning_rate": 3.2568359375000005e-05,
"loss": 0.1236,
"step": 3870
},
{
"epoch": 1.4724857685009487,
"grad_norm": 0.3403218984603882,
"learning_rate": 3.251953125e-05,
"loss": 0.146,
"step": 3880
},
{
"epoch": 1.476280834914611,
"grad_norm": 1.74779212474823,
"learning_rate": 3.2470703125e-05,
"loss": 0.1096,
"step": 3890
},
{
"epoch": 1.4800759013282732,
"grad_norm": 2.724412202835083,
"learning_rate": 3.2421875e-05,
"loss": 0.1147,
"step": 3900
},
{
"epoch": 1.4838709677419355,
"grad_norm": 3.6029605865478516,
"learning_rate": 3.2373046875e-05,
"loss": 0.1293,
"step": 3910
},
{
"epoch": 1.4876660341555978,
"grad_norm": 1.7680699825286865,
"learning_rate": 3.2324218750000004e-05,
"loss": 0.0891,
"step": 3920
},
{
"epoch": 1.49146110056926,
"grad_norm": 0.7916316390037537,
"learning_rate": 3.2275390625e-05,
"loss": 0.1223,
"step": 3930
},
{
"epoch": 1.495256166982922,
"grad_norm": 0.9054811596870422,
"learning_rate": 3.22265625e-05,
"loss": 0.0934,
"step": 3940
},
{
"epoch": 1.4990512333965844,
"grad_norm": 0.14054611325263977,
"learning_rate": 3.2177734375e-05,
"loss": 0.0494,
"step": 3950
},
{
"epoch": 1.5028462998102468,
"grad_norm": 3.1943421363830566,
"learning_rate": 3.212890625e-05,
"loss": 0.1156,
"step": 3960
},
{
"epoch": 1.5066413662239089,
"grad_norm": 1.0965791940689087,
"learning_rate": 3.2080078125000004e-05,
"loss": 0.1016,
"step": 3970
},
{
"epoch": 1.510436432637571,
"grad_norm": 1.3087248802185059,
"learning_rate": 3.203125e-05,
"loss": 0.0764,
"step": 3980
},
{
"epoch": 1.5142314990512333,
"grad_norm": 2.760798692703247,
"learning_rate": 3.1982421875e-05,
"loss": 0.114,
"step": 3990
},
{
"epoch": 1.5180265654648957,
"grad_norm": 0.1450069397687912,
"learning_rate": 3.193359375e-05,
"loss": 0.1192,
"step": 4000
},
{
"epoch": 1.521821631878558,
"grad_norm": 4.504504680633545,
"learning_rate": 3.1884765625e-05,
"loss": 0.1046,
"step": 4010
},
{
"epoch": 1.5256166982922201,
"grad_norm": 0.7182434797286987,
"learning_rate": 3.1835937500000004e-05,
"loss": 0.0932,
"step": 4020
},
{
"epoch": 1.5294117647058822,
"grad_norm": 4.370609283447266,
"learning_rate": 3.1787109375e-05,
"loss": 0.144,
"step": 4030
},
{
"epoch": 1.5332068311195446,
"grad_norm": 3.8300323486328125,
"learning_rate": 3.173828125e-05,
"loss": 0.0982,
"step": 4040
},
{
"epoch": 1.537001897533207,
"grad_norm": 0.25771814584732056,
"learning_rate": 3.1689453125e-05,
"loss": 0.0691,
"step": 4050
},
{
"epoch": 1.540796963946869,
"grad_norm": 2.758225917816162,
"learning_rate": 3.1640625e-05,
"loss": 0.1308,
"step": 4060
},
{
"epoch": 1.5445920303605312,
"grad_norm": 2.7619638442993164,
"learning_rate": 3.1591796875000004e-05,
"loss": 0.094,
"step": 4070
},
{
"epoch": 1.5483870967741935,
"grad_norm": 0.9765902757644653,
"learning_rate": 3.154296875e-05,
"loss": 0.0811,
"step": 4080
},
{
"epoch": 1.5521821631878558,
"grad_norm": 4.361360549926758,
"learning_rate": 3.1494140625e-05,
"loss": 0.1742,
"step": 4090
},
{
"epoch": 1.5559772296015182,
"grad_norm": 2.249197244644165,
"learning_rate": 3.14453125e-05,
"loss": 0.0807,
"step": 4100
},
{
"epoch": 1.5597722960151803,
"grad_norm": 3.4518532752990723,
"learning_rate": 3.1396484375000005e-05,
"loss": 0.1422,
"step": 4110
},
{
"epoch": 1.5635673624288424,
"grad_norm": 0.6679037809371948,
"learning_rate": 3.1347656250000003e-05,
"loss": 0.1214,
"step": 4120
},
{
"epoch": 1.5673624288425048,
"grad_norm": 3.879596710205078,
"learning_rate": 3.1298828125e-05,
"loss": 0.1084,
"step": 4130
},
{
"epoch": 1.571157495256167,
"grad_norm": 5.232009410858154,
"learning_rate": 3.125e-05,
"loss": 0.1192,
"step": 4140
},
{
"epoch": 1.5749525616698292,
"grad_norm": 3.875843048095703,
"learning_rate": 3.1201171875e-05,
"loss": 0.1099,
"step": 4150
},
{
"epoch": 1.5787476280834913,
"grad_norm": 0.17772170901298523,
"learning_rate": 3.1152343750000005e-05,
"loss": 0.1001,
"step": 4160
},
{
"epoch": 1.5825426944971537,
"grad_norm": 0.6866888403892517,
"learning_rate": 3.1103515625e-05,
"loss": 0.1598,
"step": 4170
},
{
"epoch": 1.586337760910816,
"grad_norm": 2.2445452213287354,
"learning_rate": 3.10546875e-05,
"loss": 0.1532,
"step": 4180
},
{
"epoch": 1.5901328273244781,
"grad_norm": 1.2135056257247925,
"learning_rate": 3.1005859375e-05,
"loss": 0.1337,
"step": 4190
},
{
"epoch": 1.5939278937381403,
"grad_norm": 0.8548033833503723,
"learning_rate": 3.095703125e-05,
"loss": 0.1142,
"step": 4200
},
{
"epoch": 1.5977229601518026,
"grad_norm": 1.7404321432113647,
"learning_rate": 3.0908203125000004e-05,
"loss": 0.1195,
"step": 4210
},
{
"epoch": 1.601518026565465,
"grad_norm": 1.4047428369522095,
"learning_rate": 3.0859375e-05,
"loss": 0.1853,
"step": 4220
},
{
"epoch": 1.6053130929791273,
"grad_norm": 2.793487071990967,
"learning_rate": 3.0810546875e-05,
"loss": 0.1231,
"step": 4230
},
{
"epoch": 1.6091081593927894,
"grad_norm": 0.928959310054779,
"learning_rate": 3.076171875e-05,
"loss": 0.0891,
"step": 4240
},
{
"epoch": 1.6129032258064515,
"grad_norm": 1.1571967601776123,
"learning_rate": 3.0712890625e-05,
"loss": 0.1119,
"step": 4250
},
{
"epoch": 1.6166982922201139,
"grad_norm": 3.0740041732788086,
"learning_rate": 3.0664062500000004e-05,
"loss": 0.1518,
"step": 4260
},
{
"epoch": 1.6204933586337762,
"grad_norm": 5.726138114929199,
"learning_rate": 3.0615234375e-05,
"loss": 0.1121,
"step": 4270
},
{
"epoch": 1.6242884250474383,
"grad_norm": 3.900777816772461,
"learning_rate": 3.056640625e-05,
"loss": 0.1513,
"step": 4280
},
{
"epoch": 1.6280834914611004,
"grad_norm": 3.43808913230896,
"learning_rate": 3.0517578125e-05,
"loss": 0.1259,
"step": 4290
},
{
"epoch": 1.6318785578747628,
"grad_norm": 1.2054848670959473,
"learning_rate": 3.0468750000000002e-05,
"loss": 0.1446,
"step": 4300
},
{
"epoch": 1.635673624288425,
"grad_norm": 3.756579875946045,
"learning_rate": 3.0419921875e-05,
"loss": 0.1348,
"step": 4310
},
{
"epoch": 1.6394686907020875,
"grad_norm": 1.4033925533294678,
"learning_rate": 3.0371093750000003e-05,
"loss": 0.1053,
"step": 4320
},
{
"epoch": 1.6432637571157496,
"grad_norm": 1.6513621807098389,
"learning_rate": 3.0322265625e-05,
"loss": 0.1217,
"step": 4330
},
{
"epoch": 1.6470588235294117,
"grad_norm": 1.9821256399154663,
"learning_rate": 3.02734375e-05,
"loss": 0.0959,
"step": 4340
},
{
"epoch": 1.650853889943074,
"grad_norm": 7.50634241104126,
"learning_rate": 3.0224609375000002e-05,
"loss": 0.1487,
"step": 4350
},
{
"epoch": 1.6546489563567364,
"grad_norm": 1.1505802869796753,
"learning_rate": 3.017578125e-05,
"loss": 0.1246,
"step": 4360
},
{
"epoch": 1.6584440227703985,
"grad_norm": 1.774200677871704,
"learning_rate": 3.0126953125000002e-05,
"loss": 0.086,
"step": 4370
},
{
"epoch": 1.6622390891840606,
"grad_norm": 1.566748023033142,
"learning_rate": 3.0078125e-05,
"loss": 0.1088,
"step": 4380
},
{
"epoch": 1.666034155597723,
"grad_norm": 2.8167648315429688,
"learning_rate": 3.0029296875000003e-05,
"loss": 0.122,
"step": 4390
},
{
"epoch": 1.6698292220113853,
"grad_norm": 1.7637346982955933,
"learning_rate": 2.998046875e-05,
"loss": 0.1036,
"step": 4400
},
{
"epoch": 1.6736242884250474,
"grad_norm": 0.3347111642360687,
"learning_rate": 2.9931640625e-05,
"loss": 0.1259,
"step": 4410
},
{
"epoch": 1.6774193548387095,
"grad_norm": 4.920076370239258,
"learning_rate": 2.9882812500000002e-05,
"loss": 0.1594,
"step": 4420
},
{
"epoch": 1.6812144212523719,
"grad_norm": 3.4409444332122803,
"learning_rate": 2.9833984375e-05,
"loss": 0.1541,
"step": 4430
},
{
"epoch": 1.6850094876660342,
"grad_norm": 0.639980673789978,
"learning_rate": 2.9785156250000003e-05,
"loss": 0.0826,
"step": 4440
},
{
"epoch": 1.6888045540796965,
"grad_norm": 3.240345001220703,
"learning_rate": 2.9736328125e-05,
"loss": 0.1473,
"step": 4450
},
{
"epoch": 1.6925996204933587,
"grad_norm": 2.2682647705078125,
"learning_rate": 2.96875e-05,
"loss": 0.0959,
"step": 4460
},
{
"epoch": 1.6963946869070208,
"grad_norm": 2.3791496753692627,
"learning_rate": 2.9638671875000002e-05,
"loss": 0.0953,
"step": 4470
},
{
"epoch": 1.7001897533206831,
"grad_norm": 1.5654246807098389,
"learning_rate": 2.958984375e-05,
"loss": 0.113,
"step": 4480
},
{
"epoch": 1.7039848197343455,
"grad_norm": 5.17665958404541,
"learning_rate": 2.9541015625000003e-05,
"loss": 0.1164,
"step": 4490
},
{
"epoch": 1.7077798861480076,
"grad_norm": 18.226165771484375,
"learning_rate": 2.94921875e-05,
"loss": 0.1293,
"step": 4500
},
{
"epoch": 1.7115749525616697,
"grad_norm": 3.5760374069213867,
"learning_rate": 2.9443359375e-05,
"loss": 0.0931,
"step": 4510
},
{
"epoch": 1.715370018975332,
"grad_norm": 2.9964776039123535,
"learning_rate": 2.9394531250000002e-05,
"loss": 0.0932,
"step": 4520
},
{
"epoch": 1.7191650853889944,
"grad_norm": 10.505178451538086,
"learning_rate": 2.9345703125e-05,
"loss": 0.139,
"step": 4530
},
{
"epoch": 1.7229601518026565,
"grad_norm": 0.9944730997085571,
"learning_rate": 2.9296875000000002e-05,
"loss": 0.159,
"step": 4540
},
{
"epoch": 1.7267552182163188,
"grad_norm": 1.2323939800262451,
"learning_rate": 2.9248046875e-05,
"loss": 0.118,
"step": 4550
},
{
"epoch": 1.730550284629981,
"grad_norm": 0.8581392765045166,
"learning_rate": 2.9199218750000003e-05,
"loss": 0.1165,
"step": 4560
},
{
"epoch": 1.7343453510436433,
"grad_norm": 2.196648120880127,
"learning_rate": 2.9150390625e-05,
"loss": 0.0803,
"step": 4570
},
{
"epoch": 1.7381404174573056,
"grad_norm": 3.5112388134002686,
"learning_rate": 2.91015625e-05,
"loss": 0.1348,
"step": 4580
},
{
"epoch": 1.7419354838709677,
"grad_norm": 1.1738495826721191,
"learning_rate": 2.9052734375000002e-05,
"loss": 0.1114,
"step": 4590
},
{
"epoch": 1.7457305502846299,
"grad_norm": 1.6850240230560303,
"learning_rate": 2.900390625e-05,
"loss": 0.1457,
"step": 4600
},
{
"epoch": 1.7495256166982922,
"grad_norm": 1.4865467548370361,
"learning_rate": 2.8955078125000003e-05,
"loss": 0.1078,
"step": 4610
},
{
"epoch": 1.7533206831119545,
"grad_norm": 1.445610523223877,
"learning_rate": 2.890625e-05,
"loss": 0.0839,
"step": 4620
},
{
"epoch": 1.7571157495256167,
"grad_norm": 1.649983525276184,
"learning_rate": 2.8857421875e-05,
"loss": 0.1028,
"step": 4630
},
{
"epoch": 1.7609108159392788,
"grad_norm": 2.717585802078247,
"learning_rate": 2.8808593750000002e-05,
"loss": 0.1127,
"step": 4640
},
{
"epoch": 1.7647058823529411,
"grad_norm": 2.902244806289673,
"learning_rate": 2.8759765625e-05,
"loss": 0.0743,
"step": 4650
},
{
"epoch": 1.7685009487666035,
"grad_norm": 1.8880512714385986,
"learning_rate": 2.8710937500000002e-05,
"loss": 0.0875,
"step": 4660
},
{
"epoch": 1.7722960151802658,
"grad_norm": 1.119419813156128,
"learning_rate": 2.8662109375e-05,
"loss": 0.1028,
"step": 4670
},
{
"epoch": 1.776091081593928,
"grad_norm": 2.3372507095336914,
"learning_rate": 2.8613281250000003e-05,
"loss": 0.161,
"step": 4680
},
{
"epoch": 1.77988614800759,
"grad_norm": 0.6809380054473877,
"learning_rate": 2.8564453125e-05,
"loss": 0.091,
"step": 4690
},
{
"epoch": 1.7836812144212524,
"grad_norm": 4.871325969696045,
"learning_rate": 2.8515625e-05,
"loss": 0.1495,
"step": 4700
},
{
"epoch": 1.7874762808349147,
"grad_norm": 10.103543281555176,
"learning_rate": 2.8466796875000002e-05,
"loss": 0.0847,
"step": 4710
},
{
"epoch": 1.7912713472485768,
"grad_norm": 0.719699501991272,
"learning_rate": 2.841796875e-05,
"loss": 0.0991,
"step": 4720
},
{
"epoch": 1.795066413662239,
"grad_norm": 2.012406826019287,
"learning_rate": 2.8369140625000003e-05,
"loss": 0.069,
"step": 4730
},
{
"epoch": 1.7988614800759013,
"grad_norm": 2.038810968399048,
"learning_rate": 2.83203125e-05,
"loss": 0.0946,
"step": 4740
},
{
"epoch": 1.8026565464895636,
"grad_norm": 1.991003394126892,
"learning_rate": 2.8271484375e-05,
"loss": 0.1033,
"step": 4750
},
{
"epoch": 1.8064516129032258,
"grad_norm": 1.9379823207855225,
"learning_rate": 2.8222656250000002e-05,
"loss": 0.0738,
"step": 4760
},
{
"epoch": 1.810246679316888,
"grad_norm": 0.9378390312194824,
"learning_rate": 2.8173828125e-05,
"loss": 0.0907,
"step": 4770
},
{
"epoch": 1.8140417457305502,
"grad_norm": 2.5683369636535645,
"learning_rate": 2.8125000000000003e-05,
"loss": 0.1156,
"step": 4780
},
{
"epoch": 1.8178368121442126,
"grad_norm": 2.95536470413208,
"learning_rate": 2.8076171875e-05,
"loss": 0.0959,
"step": 4790
},
{
"epoch": 1.821631878557875,
"grad_norm": 11.215580940246582,
"learning_rate": 2.802734375e-05,
"loss": 0.0812,
"step": 4800
},
{
"epoch": 1.825426944971537,
"grad_norm": 0.4500042498111725,
"learning_rate": 2.7978515625000002e-05,
"loss": 0.1114,
"step": 4810
},
{
"epoch": 1.8292220113851991,
"grad_norm": 0.5829250812530518,
"learning_rate": 2.79296875e-05,
"loss": 0.1284,
"step": 4820
},
{
"epoch": 1.8330170777988615,
"grad_norm": 3.114776134490967,
"learning_rate": 2.7880859375000002e-05,
"loss": 0.1283,
"step": 4830
},
{
"epoch": 1.8368121442125238,
"grad_norm": 0.47552067041397095,
"learning_rate": 2.783203125e-05,
"loss": 0.0752,
"step": 4840
},
{
"epoch": 1.840607210626186,
"grad_norm": 4.794514179229736,
"learning_rate": 2.7783203125000003e-05,
"loss": 0.1012,
"step": 4850
},
{
"epoch": 1.844402277039848,
"grad_norm": 5.392133712768555,
"learning_rate": 2.7734375e-05,
"loss": 0.178,
"step": 4860
},
{
"epoch": 1.8481973434535104,
"grad_norm": 1.1505749225616455,
"learning_rate": 2.7685546875e-05,
"loss": 0.126,
"step": 4870
},
{
"epoch": 1.8519924098671727,
"grad_norm": 1.1924586296081543,
"learning_rate": 2.7636718750000002e-05,
"loss": 0.1109,
"step": 4880
},
{
"epoch": 1.855787476280835,
"grad_norm": 0.12782755494117737,
"learning_rate": 2.7587890625e-05,
"loss": 0.0732,
"step": 4890
},
{
"epoch": 1.8595825426944972,
"grad_norm": 1.1095064878463745,
"learning_rate": 2.7539062500000003e-05,
"loss": 0.0802,
"step": 4900
},
{
"epoch": 1.8633776091081593,
"grad_norm": 8.920310020446777,
"learning_rate": 2.7490234375e-05,
"loss": 0.0964,
"step": 4910
},
{
"epoch": 1.8671726755218216,
"grad_norm": 1.8678808212280273,
"learning_rate": 2.744140625e-05,
"loss": 0.1072,
"step": 4920
},
{
"epoch": 1.870967741935484,
"grad_norm": 1.8633017539978027,
"learning_rate": 2.7392578125000002e-05,
"loss": 0.0835,
"step": 4930
},
{
"epoch": 1.874762808349146,
"grad_norm": 1.7576115131378174,
"learning_rate": 2.734375e-05,
"loss": 0.1327,
"step": 4940
},
{
"epoch": 1.8785578747628082,
"grad_norm": 3.504157304763794,
"learning_rate": 2.7294921875000003e-05,
"loss": 0.1609,
"step": 4950
},
{
"epoch": 1.8823529411764706,
"grad_norm": 1.7668483257293701,
"learning_rate": 2.724609375e-05,
"loss": 0.1316,
"step": 4960
},
{
"epoch": 1.886148007590133,
"grad_norm": 0.659870982170105,
"learning_rate": 2.7197265625e-05,
"loss": 0.0913,
"step": 4970
},
{
"epoch": 1.889943074003795,
"grad_norm": 1.428725004196167,
"learning_rate": 2.7148437500000002e-05,
"loss": 0.118,
"step": 4980
},
{
"epoch": 1.8937381404174574,
"grad_norm": 1.8446964025497437,
"learning_rate": 2.7099609375e-05,
"loss": 0.1203,
"step": 4990
},
{
"epoch": 1.8975332068311195,
"grad_norm": 2.9335217475891113,
"learning_rate": 2.7050781250000002e-05,
"loss": 0.1301,
"step": 5000
},
{
"epoch": 1.9013282732447818,
"grad_norm": 0.8534810543060303,
"learning_rate": 2.7001953125e-05,
"loss": 0.0555,
"step": 5010
},
{
"epoch": 1.9051233396584442,
"grad_norm": 0.5556221604347229,
"learning_rate": 2.6953125000000003e-05,
"loss": 0.1036,
"step": 5020
},
{
"epoch": 1.9089184060721063,
"grad_norm": 1.7097387313842773,
"learning_rate": 2.6904296875e-05,
"loss": 0.0869,
"step": 5030
},
{
"epoch": 1.9127134724857684,
"grad_norm": 2.324669122695923,
"learning_rate": 2.685546875e-05,
"loss": 0.1233,
"step": 5040
},
{
"epoch": 1.9165085388994307,
"grad_norm": 2.4764981269836426,
"learning_rate": 2.6806640625000002e-05,
"loss": 0.1379,
"step": 5050
},
{
"epoch": 1.920303605313093,
"grad_norm": 4.731557846069336,
"learning_rate": 2.67578125e-05,
"loss": 0.189,
"step": 5060
},
{
"epoch": 1.9240986717267552,
"grad_norm": 0.4868462383747101,
"learning_rate": 2.6708984375000003e-05,
"loss": 0.0765,
"step": 5070
},
{
"epoch": 1.9278937381404173,
"grad_norm": 1.3497892618179321,
"learning_rate": 2.666015625e-05,
"loss": 0.1039,
"step": 5080
},
{
"epoch": 1.9316888045540797,
"grad_norm": 15.007429122924805,
"learning_rate": 2.6611328125e-05,
"loss": 0.0996,
"step": 5090
},
{
"epoch": 1.935483870967742,
"grad_norm": 8.113617897033691,
"learning_rate": 2.6562500000000002e-05,
"loss": 0.1316,
"step": 5100
},
{
"epoch": 1.9392789373814043,
"grad_norm": 0.4574742913246155,
"learning_rate": 2.6513671875e-05,
"loss": 0.1044,
"step": 5110
},
{
"epoch": 1.9430740037950665,
"grad_norm": 2.1475601196289062,
"learning_rate": 2.6464843750000002e-05,
"loss": 0.1236,
"step": 5120
},
{
"epoch": 1.9468690702087286,
"grad_norm": 2.370619058609009,
"learning_rate": 2.6416015625e-05,
"loss": 0.1358,
"step": 5130
},
{
"epoch": 1.950664136622391,
"grad_norm": 0.7283152937889099,
"learning_rate": 2.63671875e-05,
"loss": 0.1348,
"step": 5140
},
{
"epoch": 1.9544592030360532,
"grad_norm": 2.8883001804351807,
"learning_rate": 2.6318359375e-05,
"loss": 0.083,
"step": 5150
},
{
"epoch": 1.9582542694497154,
"grad_norm": 0.26794353127479553,
"learning_rate": 2.626953125e-05,
"loss": 0.1229,
"step": 5160
},
{
"epoch": 1.9620493358633775,
"grad_norm": 0.10836785286664963,
"learning_rate": 2.6220703125000002e-05,
"loss": 0.0731,
"step": 5170
},
{
"epoch": 1.9658444022770398,
"grad_norm": 1.5825821161270142,
"learning_rate": 2.6171875e-05,
"loss": 0.1394,
"step": 5180
},
{
"epoch": 1.9696394686907022,
"grad_norm": 2.9467551708221436,
"learning_rate": 2.6123046875000003e-05,
"loss": 0.0986,
"step": 5190
},
{
"epoch": 1.9734345351043643,
"grad_norm": 0.14293566346168518,
"learning_rate": 2.607421875e-05,
"loss": 0.0824,
"step": 5200
},
{
"epoch": 1.9772296015180264,
"grad_norm": 0.4912210702896118,
"learning_rate": 2.6025390625e-05,
"loss": 0.0863,
"step": 5210
},
{
"epoch": 1.9810246679316887,
"grad_norm": 0.2447841614484787,
"learning_rate": 2.5976562500000002e-05,
"loss": 0.0877,
"step": 5220
},
{
"epoch": 1.984819734345351,
"grad_norm": 0.13301405310630798,
"learning_rate": 2.5927734375e-05,
"loss": 0.104,
"step": 5230
},
{
"epoch": 1.9886148007590134,
"grad_norm": 3.25866961479187,
"learning_rate": 2.5878906250000003e-05,
"loss": 0.0806,
"step": 5240
},
{
"epoch": 1.9924098671726755,
"grad_norm": 3.9567527770996094,
"learning_rate": 2.5830078125e-05,
"loss": 0.1226,
"step": 5250
},
{
"epoch": 1.9962049335863377,
"grad_norm": 3.6540729999542236,
"learning_rate": 2.578125e-05,
"loss": 0.0628,
"step": 5260
},
{
"epoch": 2.0,
"grad_norm": 2.9958958625793457,
"learning_rate": 2.5732421875000002e-05,
"loss": 0.1229,
"step": 5270
},
{
"epoch": 2.0037950664136623,
"grad_norm": 4.634014129638672,
"learning_rate": 2.568359375e-05,
"loss": 0.1,
"step": 5280
},
{
"epoch": 2.0075901328273247,
"grad_norm": 1.0794429779052734,
"learning_rate": 2.5634765625000002e-05,
"loss": 0.1,
"step": 5290
},
{
"epoch": 2.0113851992409866,
"grad_norm": 2.6222951412200928,
"learning_rate": 2.55859375e-05,
"loss": 0.057,
"step": 5300
},
{
"epoch": 2.015180265654649,
"grad_norm": 1.499935507774353,
"learning_rate": 2.5537109375e-05,
"loss": 0.0766,
"step": 5310
},
{
"epoch": 2.0189753320683113,
"grad_norm": 2.614969491958618,
"learning_rate": 2.548828125e-05,
"loss": 0.1003,
"step": 5320
},
{
"epoch": 2.0227703984819736,
"grad_norm": 1.4524706602096558,
"learning_rate": 2.5439453125e-05,
"loss": 0.1681,
"step": 5330
},
{
"epoch": 2.0265654648956355,
"grad_norm": 1.5427693128585815,
"learning_rate": 2.5390625000000002e-05,
"loss": 0.0745,
"step": 5340
},
{
"epoch": 2.030360531309298,
"grad_norm": 0.6060462594032288,
"learning_rate": 2.5341796875e-05,
"loss": 0.0557,
"step": 5350
},
{
"epoch": 2.03415559772296,
"grad_norm": 2.1763222217559814,
"learning_rate": 2.5292968750000003e-05,
"loss": 0.0962,
"step": 5360
},
{
"epoch": 2.0379506641366225,
"grad_norm": 0.9857283234596252,
"learning_rate": 2.5244140625e-05,
"loss": 0.0646,
"step": 5370
},
{
"epoch": 2.041745730550285,
"grad_norm": 0.14561018347740173,
"learning_rate": 2.51953125e-05,
"loss": 0.0686,
"step": 5380
},
{
"epoch": 2.0455407969639468,
"grad_norm": 5.825016498565674,
"learning_rate": 2.5146484375000002e-05,
"loss": 0.1106,
"step": 5390
},
{
"epoch": 2.049335863377609,
"grad_norm": 0.4656510353088379,
"learning_rate": 2.509765625e-05,
"loss": 0.0793,
"step": 5400
},
{
"epoch": 2.0531309297912714,
"grad_norm": 5.336658954620361,
"learning_rate": 2.5048828125000003e-05,
"loss": 0.1136,
"step": 5410
},
{
"epoch": 2.0569259962049338,
"grad_norm": 1.3186858892440796,
"learning_rate": 2.5e-05,
"loss": 0.0908,
"step": 5420
},
{
"epoch": 2.0607210626185957,
"grad_norm": 2.3468871116638184,
"learning_rate": 2.4951171875e-05,
"loss": 0.1127,
"step": 5430
},
{
"epoch": 2.064516129032258,
"grad_norm": 1.6484739780426025,
"learning_rate": 2.4902343750000002e-05,
"loss": 0.0921,
"step": 5440
},
{
"epoch": 2.0683111954459203,
"grad_norm": 1.97286856174469,
"learning_rate": 2.4853515625e-05,
"loss": 0.064,
"step": 5450
},
{
"epoch": 2.0721062618595827,
"grad_norm": 0.7309706211090088,
"learning_rate": 2.4804687500000002e-05,
"loss": 0.1256,
"step": 5460
},
{
"epoch": 2.0759013282732446,
"grad_norm": 3.2271645069122314,
"learning_rate": 2.4755859375e-05,
"loss": 0.0889,
"step": 5470
},
{
"epoch": 2.079696394686907,
"grad_norm": 18.506216049194336,
"learning_rate": 2.470703125e-05,
"loss": 0.1328,
"step": 5480
},
{
"epoch": 2.0834914611005693,
"grad_norm": 1.2257277965545654,
"learning_rate": 2.4658203125e-05,
"loss": 0.0673,
"step": 5490
},
{
"epoch": 2.0872865275142316,
"grad_norm": 0.1906469613313675,
"learning_rate": 2.4609375e-05,
"loss": 0.0808,
"step": 5500
},
{
"epoch": 2.091081593927894,
"grad_norm": 0.9694260954856873,
"learning_rate": 2.4560546875000002e-05,
"loss": 0.0558,
"step": 5510
},
{
"epoch": 2.094876660341556,
"grad_norm": 5.630046844482422,
"learning_rate": 2.451171875e-05,
"loss": 0.1262,
"step": 5520
},
{
"epoch": 2.098671726755218,
"grad_norm": 0.13950304687023163,
"learning_rate": 2.4462890625000003e-05,
"loss": 0.0711,
"step": 5530
},
{
"epoch": 2.1024667931688805,
"grad_norm": 0.424904465675354,
"learning_rate": 2.44140625e-05,
"loss": 0.0841,
"step": 5540
},
{
"epoch": 2.106261859582543,
"grad_norm": 7.330411434173584,
"learning_rate": 2.4365234375e-05,
"loss": 0.1482,
"step": 5550
},
{
"epoch": 2.1100569259962048,
"grad_norm": 0.2741791009902954,
"learning_rate": 2.4316406250000002e-05,
"loss": 0.0945,
"step": 5560
},
{
"epoch": 2.113851992409867,
"grad_norm": 1.025099277496338,
"learning_rate": 2.4267578125e-05,
"loss": 0.0981,
"step": 5570
},
{
"epoch": 2.1176470588235294,
"grad_norm": 2.723508596420288,
"learning_rate": 2.4218750000000003e-05,
"loss": 0.067,
"step": 5580
},
{
"epoch": 2.121442125237192,
"grad_norm": 0.18666787445545197,
"learning_rate": 2.4169921875e-05,
"loss": 0.077,
"step": 5590
},
{
"epoch": 2.1252371916508537,
"grad_norm": 2.304980754852295,
"learning_rate": 2.412109375e-05,
"loss": 0.1016,
"step": 5600
},
{
"epoch": 2.129032258064516,
"grad_norm": 1.6174981594085693,
"learning_rate": 2.4072265625000002e-05,
"loss": 0.0735,
"step": 5610
},
{
"epoch": 2.1328273244781784,
"grad_norm": 5.401015758514404,
"learning_rate": 2.40234375e-05,
"loss": 0.087,
"step": 5620
},
{
"epoch": 2.1366223908918407,
"grad_norm": 2.5387024879455566,
"learning_rate": 2.3974609375000002e-05,
"loss": 0.1006,
"step": 5630
},
{
"epoch": 2.140417457305503,
"grad_norm": 4.753091812133789,
"learning_rate": 2.392578125e-05,
"loss": 0.1013,
"step": 5640
},
{
"epoch": 2.144212523719165,
"grad_norm": 3.540262460708618,
"learning_rate": 2.3876953125e-05,
"loss": 0.0697,
"step": 5650
},
{
"epoch": 2.1480075901328273,
"grad_norm": 1.53217613697052,
"learning_rate": 2.3828125e-05,
"loss": 0.0812,
"step": 5660
},
{
"epoch": 2.1518026565464896,
"grad_norm": 2.652308940887451,
"learning_rate": 2.3779296875e-05,
"loss": 0.092,
"step": 5670
},
{
"epoch": 2.155597722960152,
"grad_norm": 2.7964372634887695,
"learning_rate": 2.3730468750000002e-05,
"loss": 0.0658,
"step": 5680
},
{
"epoch": 2.159392789373814,
"grad_norm": 0.11225280165672302,
"learning_rate": 2.3681640625e-05,
"loss": 0.0939,
"step": 5690
},
{
"epoch": 2.163187855787476,
"grad_norm": 1.5736573934555054,
"learning_rate": 2.3632812500000003e-05,
"loss": 0.0727,
"step": 5700
},
{
"epoch": 2.1669829222011385,
"grad_norm": 2.087057113647461,
"learning_rate": 2.3583984375e-05,
"loss": 0.0654,
"step": 5710
},
{
"epoch": 2.170777988614801,
"grad_norm": 1.598823070526123,
"learning_rate": 2.353515625e-05,
"loss": 0.0874,
"step": 5720
},
{
"epoch": 2.174573055028463,
"grad_norm": 1.7258918285369873,
"learning_rate": 2.3486328125000002e-05,
"loss": 0.0703,
"step": 5730
},
{
"epoch": 2.178368121442125,
"grad_norm": 12.662415504455566,
"learning_rate": 2.34375e-05,
"loss": 0.0998,
"step": 5740
},
{
"epoch": 2.1821631878557874,
"grad_norm": 5.9703803062438965,
"learning_rate": 2.3388671875000002e-05,
"loss": 0.1021,
"step": 5750
},
{
"epoch": 2.18595825426945,
"grad_norm": 1.9118971824645996,
"learning_rate": 2.333984375e-05,
"loss": 0.0574,
"step": 5760
},
{
"epoch": 2.189753320683112,
"grad_norm": 2.8925118446350098,
"learning_rate": 2.3291015625e-05,
"loss": 0.0804,
"step": 5770
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.9911293387413025,
"learning_rate": 2.32421875e-05,
"loss": 0.0673,
"step": 5780
},
{
"epoch": 2.1973434535104364,
"grad_norm": 3.4294886589050293,
"learning_rate": 2.3193359375e-05,
"loss": 0.0729,
"step": 5790
},
{
"epoch": 2.2011385199240987,
"grad_norm": 5.382150650024414,
"learning_rate": 2.3144531250000002e-05,
"loss": 0.1117,
"step": 5800
},
{
"epoch": 2.204933586337761,
"grad_norm": 3.5237820148468018,
"learning_rate": 2.3095703125e-05,
"loss": 0.0674,
"step": 5810
},
{
"epoch": 2.2087286527514234,
"grad_norm": 5.6236772537231445,
"learning_rate": 2.3046875e-05,
"loss": 0.0279,
"step": 5820
},
{
"epoch": 2.2125237191650853,
"grad_norm": 1.1168630123138428,
"learning_rate": 2.2998046875e-05,
"loss": 0.0773,
"step": 5830
},
{
"epoch": 2.2163187855787476,
"grad_norm": 1.0353121757507324,
"learning_rate": 2.294921875e-05,
"loss": 0.062,
"step": 5840
},
{
"epoch": 2.22011385199241,
"grad_norm": 1.4820594787597656,
"learning_rate": 2.2900390625000002e-05,
"loss": 0.0778,
"step": 5850
},
{
"epoch": 2.2239089184060723,
"grad_norm": 8.295422554016113,
"learning_rate": 2.28515625e-05,
"loss": 0.1192,
"step": 5860
},
{
"epoch": 2.227703984819734,
"grad_norm": 1.5980597734451294,
"learning_rate": 2.2802734375000003e-05,
"loss": 0.0648,
"step": 5870
},
{
"epoch": 2.2314990512333965,
"grad_norm": 0.2760424315929413,
"learning_rate": 2.275390625e-05,
"loss": 0.0722,
"step": 5880
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.8219416737556458,
"learning_rate": 2.2705078125e-05,
"loss": 0.0935,
"step": 5890
},
{
"epoch": 2.239089184060721,
"grad_norm": 0.16338910162448883,
"learning_rate": 2.2656250000000002e-05,
"loss": 0.0876,
"step": 5900
},
{
"epoch": 2.242884250474383,
"grad_norm": 0.5857824683189392,
"learning_rate": 2.2607421875e-05,
"loss": 0.117,
"step": 5910
},
{
"epoch": 2.2466793168880455,
"grad_norm": 0.1616586148738861,
"learning_rate": 2.2558593750000002e-05,
"loss": 0.072,
"step": 5920
},
{
"epoch": 2.250474383301708,
"grad_norm": 0.26469337940216064,
"learning_rate": 2.2509765625e-05,
"loss": 0.0902,
"step": 5930
},
{
"epoch": 2.25426944971537,
"grad_norm": 3.576016426086426,
"learning_rate": 2.24609375e-05,
"loss": 0.1647,
"step": 5940
},
{
"epoch": 2.258064516129032,
"grad_norm": 6.523315906524658,
"learning_rate": 2.2412109375e-05,
"loss": 0.0705,
"step": 5950
},
{
"epoch": 2.2618595825426944,
"grad_norm": 4.0901689529418945,
"learning_rate": 2.236328125e-05,
"loss": 0.0786,
"step": 5960
},
{
"epoch": 2.2656546489563567,
"grad_norm": 0.5081945061683655,
"learning_rate": 2.2314453125000002e-05,
"loss": 0.1158,
"step": 5970
},
{
"epoch": 2.269449715370019,
"grad_norm": 0.10847347974777222,
"learning_rate": 2.2265625e-05,
"loss": 0.0825,
"step": 5980
},
{
"epoch": 2.2732447817836814,
"grad_norm": 9.521303176879883,
"learning_rate": 2.2216796875e-05,
"loss": 0.0875,
"step": 5990
},
{
"epoch": 2.2770398481973433,
"grad_norm": 6.0424580574035645,
"learning_rate": 2.216796875e-05,
"loss": 0.0994,
"step": 6000
},
{
"epoch": 2.2808349146110056,
"grad_norm": 0.3634886145591736,
"learning_rate": 2.2119140625e-05,
"loss": 0.0813,
"step": 6010
},
{
"epoch": 2.284629981024668,
"grad_norm": 1.929626703262329,
"learning_rate": 2.2070312500000002e-05,
"loss": 0.0705,
"step": 6020
},
{
"epoch": 2.2884250474383303,
"grad_norm": 4.993653297424316,
"learning_rate": 2.2021484375e-05,
"loss": 0.0731,
"step": 6030
},
{
"epoch": 2.292220113851992,
"grad_norm": 0.4869803190231323,
"learning_rate": 2.1972656250000003e-05,
"loss": 0.1123,
"step": 6040
},
{
"epoch": 2.2960151802656545,
"grad_norm": 1.1776117086410522,
"learning_rate": 2.1923828125e-05,
"loss": 0.0643,
"step": 6050
},
{
"epoch": 2.299810246679317,
"grad_norm": 1.7794570922851562,
"learning_rate": 2.1875e-05,
"loss": 0.0852,
"step": 6060
},
{
"epoch": 2.3036053130929792,
"grad_norm": 2.7579660415649414,
"learning_rate": 2.1826171875000002e-05,
"loss": 0.0975,
"step": 6070
},
{
"epoch": 2.3074003795066416,
"grad_norm": 2.9852662086486816,
"learning_rate": 2.177734375e-05,
"loss": 0.0724,
"step": 6080
},
{
"epoch": 2.3111954459203035,
"grad_norm": 3.543381452560425,
"learning_rate": 2.1728515625000002e-05,
"loss": 0.1108,
"step": 6090
},
{
"epoch": 2.314990512333966,
"grad_norm": 6.476046085357666,
"learning_rate": 2.16796875e-05,
"loss": 0.1231,
"step": 6100
},
{
"epoch": 2.318785578747628,
"grad_norm": 3.2935097217559814,
"learning_rate": 2.1630859375e-05,
"loss": 0.1052,
"step": 6110
},
{
"epoch": 2.3225806451612905,
"grad_norm": 1.1247642040252686,
"learning_rate": 2.158203125e-05,
"loss": 0.0817,
"step": 6120
},
{
"epoch": 2.3263757115749524,
"grad_norm": 6.793920993804932,
"learning_rate": 2.1533203125e-05,
"loss": 0.0623,
"step": 6130
},
{
"epoch": 2.3301707779886147,
"grad_norm": 0.12885475158691406,
"learning_rate": 2.1484375000000002e-05,
"loss": 0.0942,
"step": 6140
},
{
"epoch": 2.333965844402277,
"grad_norm": 1.4963340759277344,
"learning_rate": 2.1435546875e-05,
"loss": 0.0549,
"step": 6150
},
{
"epoch": 2.3377609108159394,
"grad_norm": 1.460093379020691,
"learning_rate": 2.138671875e-05,
"loss": 0.094,
"step": 6160
},
{
"epoch": 2.3415559772296017,
"grad_norm": 4.440692901611328,
"learning_rate": 2.1337890625e-05,
"loss": 0.1673,
"step": 6170
},
{
"epoch": 2.3453510436432636,
"grad_norm": 2.9689061641693115,
"learning_rate": 2.12890625e-05,
"loss": 0.0772,
"step": 6180
},
{
"epoch": 2.349146110056926,
"grad_norm": 8.890856742858887,
"learning_rate": 2.1240234375000002e-05,
"loss": 0.0588,
"step": 6190
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.12126415222883224,
"learning_rate": 2.119140625e-05,
"loss": 0.0624,
"step": 6200
},
{
"epoch": 2.3567362428842507,
"grad_norm": 0.5167102217674255,
"learning_rate": 2.1142578125000003e-05,
"loss": 0.0732,
"step": 6210
},
{
"epoch": 2.3605313092979125,
"grad_norm": 0.18846435844898224,
"learning_rate": 2.109375e-05,
"loss": 0.1007,
"step": 6220
},
{
"epoch": 2.364326375711575,
"grad_norm": 1.9389616250991821,
"learning_rate": 2.1044921875e-05,
"loss": 0.0912,
"step": 6230
},
{
"epoch": 2.3681214421252372,
"grad_norm": 5.2946457862854,
"learning_rate": 2.0996093750000002e-05,
"loss": 0.057,
"step": 6240
},
{
"epoch": 2.3719165085388996,
"grad_norm": 0.13522082567214966,
"learning_rate": 2.0947265625e-05,
"loss": 0.0877,
"step": 6250
},
{
"epoch": 2.375711574952562,
"grad_norm": 0.43759119510650635,
"learning_rate": 2.0898437500000002e-05,
"loss": 0.0791,
"step": 6260
},
{
"epoch": 2.379506641366224,
"grad_norm": 4.369633197784424,
"learning_rate": 2.0849609375e-05,
"loss": 0.0793,
"step": 6270
},
{
"epoch": 2.383301707779886,
"grad_norm": 3.1445748805999756,
"learning_rate": 2.080078125e-05,
"loss": 0.0994,
"step": 6280
},
{
"epoch": 2.3870967741935485,
"grad_norm": 0.5459542274475098,
"learning_rate": 2.0751953125e-05,
"loss": 0.0493,
"step": 6290
},
{
"epoch": 2.3908918406072104,
"grad_norm": 0.8807210326194763,
"learning_rate": 2.0703125e-05,
"loss": 0.0669,
"step": 6300
},
{
"epoch": 2.3946869070208727,
"grad_norm": 2.931506872177124,
"learning_rate": 2.0654296875000002e-05,
"loss": 0.1014,
"step": 6310
},
{
"epoch": 2.398481973434535,
"grad_norm": 1.1972861289978027,
"learning_rate": 2.060546875e-05,
"loss": 0.0643,
"step": 6320
},
{
"epoch": 2.4022770398481974,
"grad_norm": 2.670483112335205,
"learning_rate": 2.0556640625000003e-05,
"loss": 0.0651,
"step": 6330
},
{
"epoch": 2.4060721062618597,
"grad_norm": 2.790907382965088,
"learning_rate": 2.05078125e-05,
"loss": 0.0979,
"step": 6340
},
{
"epoch": 2.4098671726755216,
"grad_norm": 1.7010408639907837,
"learning_rate": 2.0458984375e-05,
"loss": 0.0616,
"step": 6350
},
{
"epoch": 2.413662239089184,
"grad_norm": 2.3590617179870605,
"learning_rate": 2.0410156250000002e-05,
"loss": 0.0877,
"step": 6360
},
{
"epoch": 2.4174573055028463,
"grad_norm": 0.7550681829452515,
"learning_rate": 2.0361328125e-05,
"loss": 0.0351,
"step": 6370
},
{
"epoch": 2.4212523719165087,
"grad_norm": 2.2927632331848145,
"learning_rate": 2.0312500000000002e-05,
"loss": 0.102,
"step": 6380
},
{
"epoch": 2.4250474383301706,
"grad_norm": 8.239547729492188,
"learning_rate": 2.0263671875e-05,
"loss": 0.1315,
"step": 6390
},
{
"epoch": 2.428842504743833,
"grad_norm": 0.12305755913257599,
"learning_rate": 2.021484375e-05,
"loss": 0.0508,
"step": 6400
},
{
"epoch": 2.4326375711574952,
"grad_norm": 0.24204160273075104,
"learning_rate": 2.0166015625e-05,
"loss": 0.1154,
"step": 6410
},
{
"epoch": 2.4364326375711576,
"grad_norm": 1.9680283069610596,
"learning_rate": 2.01171875e-05,
"loss": 0.0576,
"step": 6420
},
{
"epoch": 2.44022770398482,
"grad_norm": 2.9172940254211426,
"learning_rate": 2.0068359375000002e-05,
"loss": 0.0457,
"step": 6430
},
{
"epoch": 2.444022770398482,
"grad_norm": 4.63267707824707,
"learning_rate": 2.001953125e-05,
"loss": 0.0544,
"step": 6440
},
{
"epoch": 2.447817836812144,
"grad_norm": 1.447266936302185,
"learning_rate": 1.9970703125e-05,
"loss": 0.0885,
"step": 6450
},
{
"epoch": 2.4516129032258065,
"grad_norm": 2.839066505432129,
"learning_rate": 1.9921875e-05,
"loss": 0.1266,
"step": 6460
},
{
"epoch": 2.455407969639469,
"grad_norm": 2.1036999225616455,
"learning_rate": 1.9873046875e-05,
"loss": 0.1107,
"step": 6470
},
{
"epoch": 2.4592030360531307,
"grad_norm": 2.6435329914093018,
"learning_rate": 1.9824218750000002e-05,
"loss": 0.0539,
"step": 6480
},
{
"epoch": 2.462998102466793,
"grad_norm": 0.2627769112586975,
"learning_rate": 1.9775390625e-05,
"loss": 0.0713,
"step": 6490
},
{
"epoch": 2.4667931688804554,
"grad_norm": 3.5408475399017334,
"learning_rate": 1.9726562500000003e-05,
"loss": 0.1061,
"step": 6500
},
{
"epoch": 2.4705882352941178,
"grad_norm": 2.456315279006958,
"learning_rate": 1.9677734375e-05,
"loss": 0.0782,
"step": 6510
},
{
"epoch": 2.47438330170778,
"grad_norm": 5.217021942138672,
"learning_rate": 1.962890625e-05,
"loss": 0.1009,
"step": 6520
},
{
"epoch": 2.478178368121442,
"grad_norm": 4.218019962310791,
"learning_rate": 1.9580078125000002e-05,
"loss": 0.0663,
"step": 6530
},
{
"epoch": 2.4819734345351043,
"grad_norm": 2.7066123485565186,
"learning_rate": 1.953125e-05,
"loss": 0.0891,
"step": 6540
},
{
"epoch": 2.4857685009487667,
"grad_norm": 0.1062941625714302,
"learning_rate": 1.9482421875000002e-05,
"loss": 0.1085,
"step": 6550
},
{
"epoch": 2.489563567362429,
"grad_norm": 5.984579086303711,
"learning_rate": 1.943359375e-05,
"loss": 0.092,
"step": 6560
},
{
"epoch": 2.493358633776091,
"grad_norm": 0.7308592796325684,
"learning_rate": 1.9384765625e-05,
"loss": 0.072,
"step": 6570
},
{
"epoch": 2.4971537001897532,
"grad_norm": 0.8086015582084656,
"learning_rate": 1.93359375e-05,
"loss": 0.1052,
"step": 6580
},
{
"epoch": 2.5009487666034156,
"grad_norm": 1.8991528749465942,
"learning_rate": 1.9287109375e-05,
"loss": 0.0737,
"step": 6590
},
{
"epoch": 2.504743833017078,
"grad_norm": 6.63985013961792,
"learning_rate": 1.9238281250000002e-05,
"loss": 0.1096,
"step": 6600
},
{
"epoch": 2.5085388994307403,
"grad_norm": 0.17855627834796906,
"learning_rate": 1.9189453125e-05,
"loss": 0.0624,
"step": 6610
},
{
"epoch": 2.512333965844402,
"grad_norm": 4.877336502075195,
"learning_rate": 1.9140625e-05,
"loss": 0.1211,
"step": 6620
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.27590852975845337,
"learning_rate": 1.9091796875e-05,
"loss": 0.0521,
"step": 6630
},
{
"epoch": 2.519924098671727,
"grad_norm": 0.45393088459968567,
"learning_rate": 1.904296875e-05,
"loss": 0.0707,
"step": 6640
},
{
"epoch": 2.5237191650853887,
"grad_norm": 2.1049611568450928,
"learning_rate": 1.8994140625000002e-05,
"loss": 0.1105,
"step": 6650
},
{
"epoch": 2.527514231499051,
"grad_norm": 1.805330753326416,
"learning_rate": 1.89453125e-05,
"loss": 0.068,
"step": 6660
},
{
"epoch": 2.5313092979127134,
"grad_norm": 1.1227184534072876,
"learning_rate": 1.8896484375000003e-05,
"loss": 0.0572,
"step": 6670
},
{
"epoch": 2.5351043643263758,
"grad_norm": 2.483306646347046,
"learning_rate": 1.884765625e-05,
"loss": 0.1095,
"step": 6680
},
{
"epoch": 2.538899430740038,
"grad_norm": 0.1452198177576065,
"learning_rate": 1.8798828125e-05,
"loss": 0.0401,
"step": 6690
},
{
"epoch": 2.5426944971537004,
"grad_norm": 0.14945687353610992,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.0796,
"step": 6700
},
{
"epoch": 2.5464895635673623,
"grad_norm": 1.3936477899551392,
"learning_rate": 1.8701171875e-05,
"loss": 0.0688,
"step": 6710
},
{
"epoch": 2.5502846299810247,
"grad_norm": 0.16819104552268982,
"learning_rate": 1.8652343750000002e-05,
"loss": 0.0454,
"step": 6720
},
{
"epoch": 2.554079696394687,
"grad_norm": 1.2239612340927124,
"learning_rate": 1.8603515625e-05,
"loss": 0.0588,
"step": 6730
},
{
"epoch": 2.557874762808349,
"grad_norm": 7.471010684967041,
"learning_rate": 1.85546875e-05,
"loss": 0.0528,
"step": 6740
},
{
"epoch": 2.5616698292220113,
"grad_norm": 4.900544166564941,
"learning_rate": 1.8505859375e-05,
"loss": 0.0858,
"step": 6750
},
{
"epoch": 2.5654648956356736,
"grad_norm": 3.8821702003479004,
"learning_rate": 1.845703125e-05,
"loss": 0.046,
"step": 6760
},
{
"epoch": 2.569259962049336,
"grad_norm": 0.17730577290058136,
"learning_rate": 1.8408203125000002e-05,
"loss": 0.0673,
"step": 6770
},
{
"epoch": 2.5730550284629983,
"grad_norm": 3.4757065773010254,
"learning_rate": 1.8359375e-05,
"loss": 0.094,
"step": 6780
},
{
"epoch": 2.5768500948766606,
"grad_norm": 3.2091782093048096,
"learning_rate": 1.8310546875e-05,
"loss": 0.08,
"step": 6790
},
{
"epoch": 2.5806451612903225,
"grad_norm": 5.548855304718018,
"learning_rate": 1.826171875e-05,
"loss": 0.0996,
"step": 6800
},
{
"epoch": 2.584440227703985,
"grad_norm": 0.17017248272895813,
"learning_rate": 1.8212890625e-05,
"loss": 0.0828,
"step": 6810
},
{
"epoch": 2.588235294117647,
"grad_norm": 9.512433052062988,
"learning_rate": 1.8164062500000002e-05,
"loss": 0.0696,
"step": 6820
},
{
"epoch": 2.592030360531309,
"grad_norm": 0.9737806916236877,
"learning_rate": 1.8115234375e-05,
"loss": 0.0881,
"step": 6830
},
{
"epoch": 2.5958254269449714,
"grad_norm": 7.027744293212891,
"learning_rate": 1.8066406250000002e-05,
"loss": 0.06,
"step": 6840
},
{
"epoch": 2.5996204933586338,
"grad_norm": 2.162301778793335,
"learning_rate": 1.8017578125e-05,
"loss": 0.0833,
"step": 6850
},
{
"epoch": 2.603415559772296,
"grad_norm": 0.30585893988609314,
"learning_rate": 1.796875e-05,
"loss": 0.0794,
"step": 6860
},
{
"epoch": 2.6072106261859584,
"grad_norm": 0.22574108839035034,
"learning_rate": 1.7919921875e-05,
"loss": 0.0965,
"step": 6870
},
{
"epoch": 2.6110056925996203,
"grad_norm": 0.6627634763717651,
"learning_rate": 1.787109375e-05,
"loss": 0.0622,
"step": 6880
},
{
"epoch": 2.6148007590132827,
"grad_norm": 0.17045138776302338,
"learning_rate": 1.7822265625000002e-05,
"loss": 0.0471,
"step": 6890
},
{
"epoch": 2.618595825426945,
"grad_norm": 0.31901392340660095,
"learning_rate": 1.77734375e-05,
"loss": 0.0607,
"step": 6900
},
{
"epoch": 2.6223908918406074,
"grad_norm": 0.21171316504478455,
"learning_rate": 1.7724609375e-05,
"loss": 0.0789,
"step": 6910
},
{
"epoch": 2.6261859582542693,
"grad_norm": 0.8109591007232666,
"learning_rate": 1.767578125e-05,
"loss": 0.0973,
"step": 6920
},
{
"epoch": 2.6299810246679316,
"grad_norm": 2.583545446395874,
"learning_rate": 1.7626953125e-05,
"loss": 0.0512,
"step": 6930
},
{
"epoch": 2.633776091081594,
"grad_norm": 1.5937598943710327,
"learning_rate": 1.7578125000000002e-05,
"loss": 0.0861,
"step": 6940
},
{
"epoch": 2.6375711574952563,
"grad_norm": 1.3143688440322876,
"learning_rate": 1.7529296875e-05,
"loss": 0.098,
"step": 6950
},
{
"epoch": 2.6413662239089186,
"grad_norm": 2.390667676925659,
"learning_rate": 1.748046875e-05,
"loss": 0.0621,
"step": 6960
},
{
"epoch": 2.6451612903225805,
"grad_norm": 0.30924805998802185,
"learning_rate": 1.7431640625e-05,
"loss": 0.0807,
"step": 6970
},
{
"epoch": 2.648956356736243,
"grad_norm": 1.6821314096450806,
"learning_rate": 1.73828125e-05,
"loss": 0.0598,
"step": 6980
},
{
"epoch": 2.652751423149905,
"grad_norm": 1.8624871969223022,
"learning_rate": 1.7333984375000002e-05,
"loss": 0.0841,
"step": 6990
},
{
"epoch": 2.656546489563567,
"grad_norm": 1.0055333375930786,
"learning_rate": 1.728515625e-05,
"loss": 0.0853,
"step": 7000
},
{
"epoch": 2.6603415559772294,
"grad_norm": 0.11686267703771591,
"learning_rate": 1.7236328125000002e-05,
"loss": 0.0455,
"step": 7010
},
{
"epoch": 2.6641366223908918,
"grad_norm": 5.000795841217041,
"learning_rate": 1.71875e-05,
"loss": 0.1102,
"step": 7020
},
{
"epoch": 2.667931688804554,
"grad_norm": 5.362839221954346,
"learning_rate": 1.7138671875e-05,
"loss": 0.0864,
"step": 7030
},
{
"epoch": 2.6717267552182165,
"grad_norm": 4.031505584716797,
"learning_rate": 1.708984375e-05,
"loss": 0.0753,
"step": 7040
},
{
"epoch": 2.675521821631879,
"grad_norm": 3.553187608718872,
"learning_rate": 1.7041015625e-05,
"loss": 0.0802,
"step": 7050
},
{
"epoch": 2.6793168880455407,
"grad_norm": 2.1504125595092773,
"learning_rate": 1.6992187500000002e-05,
"loss": 0.0798,
"step": 7060
},
{
"epoch": 2.683111954459203,
"grad_norm": 0.17360809445381165,
"learning_rate": 1.6943359375e-05,
"loss": 0.1064,
"step": 7070
},
{
"epoch": 2.6869070208728654,
"grad_norm": 0.16311465203762054,
"learning_rate": 1.689453125e-05,
"loss": 0.1194,
"step": 7080
},
{
"epoch": 2.6907020872865273,
"grad_norm": 3.6088805198669434,
"learning_rate": 1.6845703125e-05,
"loss": 0.0586,
"step": 7090
},
{
"epoch": 2.6944971537001896,
"grad_norm": 5.143406867980957,
"learning_rate": 1.6796875e-05,
"loss": 0.0892,
"step": 7100
},
{
"epoch": 2.698292220113852,
"grad_norm": 27.002168655395508,
"learning_rate": 1.6748046875000002e-05,
"loss": 0.089,
"step": 7110
},
{
"epoch": 2.7020872865275143,
"grad_norm": 1.443231225013733,
"learning_rate": 1.669921875e-05,
"loss": 0.1328,
"step": 7120
},
{
"epoch": 2.7058823529411766,
"grad_norm": 7.007279396057129,
"learning_rate": 1.6650390625e-05,
"loss": 0.0652,
"step": 7130
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.25469958782196045,
"learning_rate": 1.66015625e-05,
"loss": 0.045,
"step": 7140
},
{
"epoch": 2.713472485768501,
"grad_norm": 4.693950653076172,
"learning_rate": 1.6552734375e-05,
"loss": 0.1245,
"step": 7150
},
{
"epoch": 2.717267552182163,
"grad_norm": 0.3287486732006073,
"learning_rate": 1.6503906250000002e-05,
"loss": 0.068,
"step": 7160
},
{
"epoch": 2.7210626185958255,
"grad_norm": 9.82812786102295,
"learning_rate": 1.6455078125e-05,
"loss": 0.0909,
"step": 7170
},
{
"epoch": 2.7248576850094874,
"grad_norm": 14.501320838928223,
"learning_rate": 1.6406250000000002e-05,
"loss": 0.0972,
"step": 7180
},
{
"epoch": 2.72865275142315,
"grad_norm": 5.130281448364258,
"learning_rate": 1.6357421875e-05,
"loss": 0.1253,
"step": 7190
},
{
"epoch": 2.732447817836812,
"grad_norm": 3.5541763305664062,
"learning_rate": 1.630859375e-05,
"loss": 0.0822,
"step": 7200
},
{
"epoch": 2.7362428842504745,
"grad_norm": 0.9670690894126892,
"learning_rate": 1.6259765625e-05,
"loss": 0.0231,
"step": 7210
},
{
"epoch": 2.740037950664137,
"grad_norm": 0.676513135433197,
"learning_rate": 1.62109375e-05,
"loss": 0.0972,
"step": 7220
},
{
"epoch": 2.7438330170777987,
"grad_norm": 7.5943217277526855,
"learning_rate": 1.6162109375000002e-05,
"loss": 0.0989,
"step": 7230
},
{
"epoch": 2.747628083491461,
"grad_norm": 0.20399871468544006,
"learning_rate": 1.611328125e-05,
"loss": 0.1036,
"step": 7240
},
{
"epoch": 2.7514231499051234,
"grad_norm": 0.43629199266433716,
"learning_rate": 1.6064453125e-05,
"loss": 0.0311,
"step": 7250
},
{
"epoch": 2.7552182163187857,
"grad_norm": 1.144394040107727,
"learning_rate": 1.6015625e-05,
"loss": 0.0815,
"step": 7260
},
{
"epoch": 2.7590132827324476,
"grad_norm": 0.06812827289104462,
"learning_rate": 1.5966796875e-05,
"loss": 0.0539,
"step": 7270
},
{
"epoch": 2.76280834914611,
"grad_norm": 2.913031578063965,
"learning_rate": 1.5917968750000002e-05,
"loss": 0.0443,
"step": 7280
},
{
"epoch": 2.7666034155597723,
"grad_norm": 2.4026944637298584,
"learning_rate": 1.5869140625e-05,
"loss": 0.0957,
"step": 7290
},
{
"epoch": 2.7703984819734346,
"grad_norm": 3.89658784866333,
"learning_rate": 1.58203125e-05,
"loss": 0.1125,
"step": 7300
},
{
"epoch": 2.774193548387097,
"grad_norm": 0.4522351920604706,
"learning_rate": 1.5771484375e-05,
"loss": 0.0889,
"step": 7310
},
{
"epoch": 2.777988614800759,
"grad_norm": 5.769268989562988,
"learning_rate": 1.572265625e-05,
"loss": 0.0631,
"step": 7320
},
{
"epoch": 2.781783681214421,
"grad_norm": 1.7276089191436768,
"learning_rate": 1.5673828125000002e-05,
"loss": 0.091,
"step": 7330
},
{
"epoch": 2.7855787476280836,
"grad_norm": 2.0759644508361816,
"learning_rate": 1.5625e-05,
"loss": 0.0655,
"step": 7340
},
{
"epoch": 2.789373814041746,
"grad_norm": 0.7582204937934875,
"learning_rate": 1.5576171875000002e-05,
"loss": 0.0541,
"step": 7350
},
{
"epoch": 2.793168880455408,
"grad_norm": 16.55638885498047,
"learning_rate": 1.552734375e-05,
"loss": 0.1178,
"step": 7360
},
{
"epoch": 2.79696394686907,
"grad_norm": 0.7026536464691162,
"learning_rate": 1.5478515625e-05,
"loss": 0.0459,
"step": 7370
},
{
"epoch": 2.8007590132827325,
"grad_norm": 4.089038372039795,
"learning_rate": 1.54296875e-05,
"loss": 0.0663,
"step": 7380
},
{
"epoch": 2.804554079696395,
"grad_norm": 3.8286547660827637,
"learning_rate": 1.5380859375e-05,
"loss": 0.1096,
"step": 7390
},
{
"epoch": 2.808349146110057,
"grad_norm": 2.5993642807006836,
"learning_rate": 1.5332031250000002e-05,
"loss": 0.0685,
"step": 7400
},
{
"epoch": 2.812144212523719,
"grad_norm": 1.0880334377288818,
"learning_rate": 1.5283203125e-05,
"loss": 0.0631,
"step": 7410
},
{
"epoch": 2.8159392789373814,
"grad_norm": 1.036834478378296,
"learning_rate": 1.5234375000000001e-05,
"loss": 0.086,
"step": 7420
},
{
"epoch": 2.8197343453510437,
"grad_norm": 5.436180114746094,
"learning_rate": 1.5185546875000001e-05,
"loss": 0.1121,
"step": 7430
},
{
"epoch": 2.8235294117647056,
"grad_norm": 3.7009427547454834,
"learning_rate": 1.513671875e-05,
"loss": 0.0764,
"step": 7440
},
{
"epoch": 2.827324478178368,
"grad_norm": 2.5197298526763916,
"learning_rate": 1.5087890625e-05,
"loss": 0.082,
"step": 7450
},
{
"epoch": 2.8311195445920303,
"grad_norm": 3.15004563331604,
"learning_rate": 1.50390625e-05,
"loss": 0.112,
"step": 7460
},
{
"epoch": 2.8349146110056926,
"grad_norm": 2.9666614532470703,
"learning_rate": 1.4990234375e-05,
"loss": 0.0872,
"step": 7470
},
{
"epoch": 2.838709677419355,
"grad_norm": 6.0326385498046875,
"learning_rate": 1.4941406250000001e-05,
"loss": 0.0817,
"step": 7480
},
{
"epoch": 2.8425047438330173,
"grad_norm": 1.699873685836792,
"learning_rate": 1.4892578125000001e-05,
"loss": 0.0816,
"step": 7490
},
{
"epoch": 2.846299810246679,
"grad_norm": 0.14119946956634521,
"learning_rate": 1.484375e-05,
"loss": 0.0725,
"step": 7500
},
{
"epoch": 2.8500948766603416,
"grad_norm": 6.737262725830078,
"learning_rate": 1.4794921875e-05,
"loss": 0.1205,
"step": 7510
},
{
"epoch": 2.853889943074004,
"grad_norm": 4.460575103759766,
"learning_rate": 1.474609375e-05,
"loss": 0.123,
"step": 7520
},
{
"epoch": 2.857685009487666,
"grad_norm": 0.09714975953102112,
"learning_rate": 1.4697265625000001e-05,
"loss": 0.0687,
"step": 7530
},
{
"epoch": 2.861480075901328,
"grad_norm": 3.972470760345459,
"learning_rate": 1.4648437500000001e-05,
"loss": 0.1089,
"step": 7540
},
{
"epoch": 2.8652751423149905,
"grad_norm": 2.0776712894439697,
"learning_rate": 1.4599609375000001e-05,
"loss": 0.1318,
"step": 7550
},
{
"epoch": 2.869070208728653,
"grad_norm": 0.21448436379432678,
"learning_rate": 1.455078125e-05,
"loss": 0.0639,
"step": 7560
},
{
"epoch": 2.872865275142315,
"grad_norm": 0.19727276265621185,
"learning_rate": 1.4501953125e-05,
"loss": 0.0464,
"step": 7570
},
{
"epoch": 2.8766603415559775,
"grad_norm": 2.9958267211914062,
"learning_rate": 1.4453125e-05,
"loss": 0.0715,
"step": 7580
},
{
"epoch": 2.8804554079696394,
"grad_norm": 1.823538064956665,
"learning_rate": 1.4404296875000001e-05,
"loss": 0.0781,
"step": 7590
},
{
"epoch": 2.8842504743833017,
"grad_norm": 2.5351407527923584,
"learning_rate": 1.4355468750000001e-05,
"loss": 0.0888,
"step": 7600
},
{
"epoch": 2.888045540796964,
"grad_norm": 4.274851322174072,
"learning_rate": 1.4306640625000002e-05,
"loss": 0.0228,
"step": 7610
},
{
"epoch": 2.891840607210626,
"grad_norm": 4.665604591369629,
"learning_rate": 1.42578125e-05,
"loss": 0.083,
"step": 7620
},
{
"epoch": 2.8956356736242883,
"grad_norm": 4.373048782348633,
"learning_rate": 1.4208984375e-05,
"loss": 0.0936,
"step": 7630
},
{
"epoch": 2.8994307400379506,
"grad_norm": 1.5743074417114258,
"learning_rate": 1.416015625e-05,
"loss": 0.0414,
"step": 7640
},
{
"epoch": 2.903225806451613,
"grad_norm": 2.3043341636657715,
"learning_rate": 1.4111328125000001e-05,
"loss": 0.0739,
"step": 7650
},
{
"epoch": 2.9070208728652753,
"grad_norm": 2.980686902999878,
"learning_rate": 1.4062500000000001e-05,
"loss": 0.0755,
"step": 7660
},
{
"epoch": 2.9108159392789372,
"grad_norm": 0.5928072929382324,
"learning_rate": 1.4013671875e-05,
"loss": 0.116,
"step": 7670
},
{
"epoch": 2.9146110056925996,
"grad_norm": 0.14647921919822693,
"learning_rate": 1.396484375e-05,
"loss": 0.0367,
"step": 7680
},
{
"epoch": 2.918406072106262,
"grad_norm": 6.466022968292236,
"learning_rate": 1.3916015625e-05,
"loss": 0.0365,
"step": 7690
},
{
"epoch": 2.9222011385199242,
"grad_norm": 13.139077186584473,
"learning_rate": 1.38671875e-05,
"loss": 0.1295,
"step": 7700
},
{
"epoch": 2.925996204933586,
"grad_norm": 0.3945586383342743,
"learning_rate": 1.3818359375000001e-05,
"loss": 0.0559,
"step": 7710
},
{
"epoch": 2.9297912713472485,
"grad_norm": 0.04980861395597458,
"learning_rate": 1.3769531250000001e-05,
"loss": 0.0485,
"step": 7720
},
{
"epoch": 2.933586337760911,
"grad_norm": 2.388545513153076,
"learning_rate": 1.3720703125e-05,
"loss": 0.0542,
"step": 7730
},
{
"epoch": 2.937381404174573,
"grad_norm": 2.4082882404327393,
"learning_rate": 1.3671875e-05,
"loss": 0.0939,
"step": 7740
},
{
"epoch": 2.9411764705882355,
"grad_norm": 4.933741569519043,
"learning_rate": 1.3623046875e-05,
"loss": 0.1409,
"step": 7750
},
{
"epoch": 2.9449715370018974,
"grad_norm": 5.57550573348999,
"learning_rate": 1.3574218750000001e-05,
"loss": 0.0646,
"step": 7760
},
{
"epoch": 2.9487666034155597,
"grad_norm": 1.8403911590576172,
"learning_rate": 1.3525390625000001e-05,
"loss": 0.0694,
"step": 7770
},
{
"epoch": 2.952561669829222,
"grad_norm": 6.1294331550598145,
"learning_rate": 1.3476562500000001e-05,
"loss": 0.0476,
"step": 7780
},
{
"epoch": 2.956356736242884,
"grad_norm": 0.0652192234992981,
"learning_rate": 1.3427734375e-05,
"loss": 0.0634,
"step": 7790
},
{
"epoch": 2.9601518026565463,
"grad_norm": 2.2705845832824707,
"learning_rate": 1.337890625e-05,
"loss": 0.0577,
"step": 7800
},
{
"epoch": 2.9639468690702087,
"grad_norm": 0.12686532735824585,
"learning_rate": 1.3330078125e-05,
"loss": 0.0948,
"step": 7810
},
{
"epoch": 2.967741935483871,
"grad_norm": 3.2810075283050537,
"learning_rate": 1.3281250000000001e-05,
"loss": 0.0813,
"step": 7820
},
{
"epoch": 2.9715370018975333,
"grad_norm": 2.2181339263916016,
"learning_rate": 1.3232421875000001e-05,
"loss": 0.1022,
"step": 7830
},
{
"epoch": 2.9753320683111957,
"grad_norm": 1.6737946271896362,
"learning_rate": 1.318359375e-05,
"loss": 0.0557,
"step": 7840
},
{
"epoch": 2.9791271347248576,
"grad_norm": 7.780960559844971,
"learning_rate": 1.3134765625e-05,
"loss": 0.0978,
"step": 7850
},
{
"epoch": 2.98292220113852,
"grad_norm": 8.983189582824707,
"learning_rate": 1.30859375e-05,
"loss": 0.0601,
"step": 7860
},
{
"epoch": 2.9867172675521823,
"grad_norm": 4.744899272918701,
"learning_rate": 1.3037109375e-05,
"loss": 0.0418,
"step": 7870
},
{
"epoch": 2.990512333965844,
"grad_norm": 2.1875483989715576,
"learning_rate": 1.2988281250000001e-05,
"loss": 0.0746,
"step": 7880
},
{
"epoch": 2.9943074003795065,
"grad_norm": 1.506842017173767,
"learning_rate": 1.2939453125000001e-05,
"loss": 0.0868,
"step": 7890
},
{
"epoch": 2.998102466793169,
"grad_norm": 2.1302731037139893,
"learning_rate": 1.2890625e-05,
"loss": 0.0687,
"step": 7900
},
{
"epoch": 3.001897533206831,
"grad_norm": 2.632828950881958,
"learning_rate": 1.2841796875e-05,
"loss": 0.0705,
"step": 7910
},
{
"epoch": 3.0056925996204935,
"grad_norm": 0.15800461173057556,
"learning_rate": 1.279296875e-05,
"loss": 0.0522,
"step": 7920
},
{
"epoch": 3.0094876660341554,
"grad_norm": 0.13846412301063538,
"learning_rate": 1.2744140625e-05,
"loss": 0.0363,
"step": 7930
},
{
"epoch": 3.0132827324478177,
"grad_norm": 4.117944717407227,
"learning_rate": 1.2695312500000001e-05,
"loss": 0.0605,
"step": 7940
},
{
"epoch": 3.01707779886148,
"grad_norm": 1.4927798509597778,
"learning_rate": 1.2646484375000001e-05,
"loss": 0.0346,
"step": 7950
},
{
"epoch": 3.0208728652751424,
"grad_norm": 4.367966175079346,
"learning_rate": 1.259765625e-05,
"loss": 0.0458,
"step": 7960
},
{
"epoch": 3.0246679316888048,
"grad_norm": 2.0026087760925293,
"learning_rate": 1.2548828125e-05,
"loss": 0.0749,
"step": 7970
},
{
"epoch": 3.0284629981024667,
"grad_norm": 2.106546640396118,
"learning_rate": 1.25e-05,
"loss": 0.065,
"step": 7980
},
{
"epoch": 3.032258064516129,
"grad_norm": 4.122467994689941,
"learning_rate": 1.2451171875000001e-05,
"loss": 0.0475,
"step": 7990
},
{
"epoch": 3.0360531309297913,
"grad_norm": 0.08205808699131012,
"learning_rate": 1.2402343750000001e-05,
"loss": 0.0692,
"step": 8000
},
{
"epoch": 3.0398481973434537,
"grad_norm": 1.0389831066131592,
"learning_rate": 1.2353515625e-05,
"loss": 0.0514,
"step": 8010
},
{
"epoch": 3.0436432637571156,
"grad_norm": 0.1080293357372284,
"learning_rate": 1.23046875e-05,
"loss": 0.0385,
"step": 8020
},
{
"epoch": 3.047438330170778,
"grad_norm": 0.2515338361263275,
"learning_rate": 1.2255859375e-05,
"loss": 0.0835,
"step": 8030
},
{
"epoch": 3.0512333965844403,
"grad_norm": 1.1087881326675415,
"learning_rate": 1.220703125e-05,
"loss": 0.0559,
"step": 8040
},
{
"epoch": 3.0550284629981026,
"grad_norm": 1.1088217496871948,
"learning_rate": 1.2158203125000001e-05,
"loss": 0.075,
"step": 8050
},
{
"epoch": 3.0588235294117645,
"grad_norm": 3.310959577560425,
"learning_rate": 1.2109375000000001e-05,
"loss": 0.0596,
"step": 8060
},
{
"epoch": 3.062618595825427,
"grad_norm": 1.186274766921997,
"learning_rate": 1.2060546875e-05,
"loss": 0.0399,
"step": 8070
},
{
"epoch": 3.066413662239089,
"grad_norm": 3.054225444793701,
"learning_rate": 1.201171875e-05,
"loss": 0.0352,
"step": 8080
},
{
"epoch": 3.0702087286527515,
"grad_norm": 0.3610187768936157,
"learning_rate": 1.1962890625e-05,
"loss": 0.0519,
"step": 8090
},
{
"epoch": 3.074003795066414,
"grad_norm": 1.7858855724334717,
"learning_rate": 1.19140625e-05,
"loss": 0.0712,
"step": 8100
},
{
"epoch": 3.0777988614800758,
"grad_norm": 3.144697666168213,
"learning_rate": 1.1865234375000001e-05,
"loss": 0.0343,
"step": 8110
},
{
"epoch": 3.081593927893738,
"grad_norm": 1.743668556213379,
"learning_rate": 1.1816406250000001e-05,
"loss": 0.0611,
"step": 8120
},
{
"epoch": 3.0853889943074004,
"grad_norm": 0.6149533987045288,
"learning_rate": 1.1767578125e-05,
"loss": 0.0512,
"step": 8130
},
{
"epoch": 3.0891840607210628,
"grad_norm": 6.247795581817627,
"learning_rate": 1.171875e-05,
"loss": 0.0741,
"step": 8140
},
{
"epoch": 3.0929791271347247,
"grad_norm": 0.8566815853118896,
"learning_rate": 1.1669921875e-05,
"loss": 0.0699,
"step": 8150
},
{
"epoch": 3.096774193548387,
"grad_norm": 3.2794229984283447,
"learning_rate": 1.162109375e-05,
"loss": 0.0296,
"step": 8160
},
{
"epoch": 3.1005692599620494,
"grad_norm": 0.10005365312099457,
"learning_rate": 1.1572265625000001e-05,
"loss": 0.0645,
"step": 8170
},
{
"epoch": 3.1043643263757117,
"grad_norm": 2.8992691040039062,
"learning_rate": 1.15234375e-05,
"loss": 0.0456,
"step": 8180
},
{
"epoch": 3.108159392789374,
"grad_norm": 3.6778674125671387,
"learning_rate": 1.1474609375e-05,
"loss": 0.0351,
"step": 8190
},
{
"epoch": 3.111954459203036,
"grad_norm": 1.5398664474487305,
"learning_rate": 1.142578125e-05,
"loss": 0.042,
"step": 8200
},
{
"epoch": 3.1157495256166983,
"grad_norm": 0.05135444924235344,
"learning_rate": 1.1376953125e-05,
"loss": 0.0478,
"step": 8210
},
{
"epoch": 3.1195445920303606,
"grad_norm": 0.6804483532905579,
"learning_rate": 1.1328125000000001e-05,
"loss": 0.058,
"step": 8220
},
{
"epoch": 3.123339658444023,
"grad_norm": 0.10011663287878036,
"learning_rate": 1.1279296875000001e-05,
"loss": 0.0456,
"step": 8230
},
{
"epoch": 3.127134724857685,
"grad_norm": 0.466981440782547,
"learning_rate": 1.123046875e-05,
"loss": 0.0449,
"step": 8240
},
{
"epoch": 3.130929791271347,
"grad_norm": 2.163849353790283,
"learning_rate": 1.1181640625e-05,
"loss": 0.0595,
"step": 8250
},
{
"epoch": 3.1347248576850095,
"grad_norm": 1.1013680696487427,
"learning_rate": 1.11328125e-05,
"loss": 0.0708,
"step": 8260
},
{
"epoch": 3.138519924098672,
"grad_norm": 8.969820022583008,
"learning_rate": 1.1083984375e-05,
"loss": 0.064,
"step": 8270
},
{
"epoch": 3.1423149905123338,
"grad_norm": 1.1106621026992798,
"learning_rate": 1.1035156250000001e-05,
"loss": 0.1007,
"step": 8280
},
{
"epoch": 3.146110056925996,
"grad_norm": 0.1508377343416214,
"learning_rate": 1.0986328125000001e-05,
"loss": 0.0464,
"step": 8290
},
{
"epoch": 3.1499051233396584,
"grad_norm": 0.07330877333879471,
"learning_rate": 1.09375e-05,
"loss": 0.0797,
"step": 8300
},
{
"epoch": 3.153700189753321,
"grad_norm": 1.6159915924072266,
"learning_rate": 1.0888671875e-05,
"loss": 0.0527,
"step": 8310
},
{
"epoch": 3.157495256166983,
"grad_norm": 0.5196408629417419,
"learning_rate": 1.083984375e-05,
"loss": 0.0433,
"step": 8320
},
{
"epoch": 3.161290322580645,
"grad_norm": 2.486041307449341,
"learning_rate": 1.0791015625e-05,
"loss": 0.0651,
"step": 8330
},
{
"epoch": 3.1650853889943074,
"grad_norm": 1.0713788270950317,
"learning_rate": 1.0742187500000001e-05,
"loss": 0.0695,
"step": 8340
},
{
"epoch": 3.1688804554079697,
"grad_norm": 0.19154168665409088,
"learning_rate": 1.0693359375e-05,
"loss": 0.0364,
"step": 8350
},
{
"epoch": 3.172675521821632,
"grad_norm": 0.31223466992378235,
"learning_rate": 1.064453125e-05,
"loss": 0.0267,
"step": 8360
},
{
"epoch": 3.176470588235294,
"grad_norm": 0.7767817378044128,
"learning_rate": 1.0595703125e-05,
"loss": 0.0635,
"step": 8370
},
{
"epoch": 3.1802656546489563,
"grad_norm": 2.4257445335388184,
"learning_rate": 1.0546875e-05,
"loss": 0.0588,
"step": 8380
},
{
"epoch": 3.1840607210626186,
"grad_norm": 1.2349954843521118,
"learning_rate": 1.0498046875000001e-05,
"loss": 0.0557,
"step": 8390
},
{
"epoch": 3.187855787476281,
"grad_norm": 3.209284543991089,
"learning_rate": 1.0449218750000001e-05,
"loss": 0.0459,
"step": 8400
},
{
"epoch": 3.191650853889943,
"grad_norm": 0.16265904903411865,
"learning_rate": 1.0400390625e-05,
"loss": 0.0525,
"step": 8410
},
{
"epoch": 3.195445920303605,
"grad_norm": 0.6664568781852722,
"learning_rate": 1.03515625e-05,
"loss": 0.0727,
"step": 8420
},
{
"epoch": 3.1992409867172675,
"grad_norm": 0.9481377005577087,
"learning_rate": 1.0302734375e-05,
"loss": 0.0215,
"step": 8430
},
{
"epoch": 3.20303605313093,
"grad_norm": 5.600297451019287,
"learning_rate": 1.025390625e-05,
"loss": 0.0385,
"step": 8440
},
{
"epoch": 3.206831119544592,
"grad_norm": 0.15000663697719574,
"learning_rate": 1.0205078125000001e-05,
"loss": 0.0659,
"step": 8450
},
{
"epoch": 3.210626185958254,
"grad_norm": 0.6691407561302185,
"learning_rate": 1.0156250000000001e-05,
"loss": 0.0666,
"step": 8460
},
{
"epoch": 3.2144212523719164,
"grad_norm": 1.3882899284362793,
"learning_rate": 1.0107421875e-05,
"loss": 0.0815,
"step": 8470
},
{
"epoch": 3.218216318785579,
"grad_norm": 1.0314580202102661,
"learning_rate": 1.005859375e-05,
"loss": 0.0178,
"step": 8480
},
{
"epoch": 3.222011385199241,
"grad_norm": 3.9537134170532227,
"learning_rate": 1.0009765625e-05,
"loss": 0.0631,
"step": 8490
},
{
"epoch": 3.225806451612903,
"grad_norm": 5.446588039398193,
"learning_rate": 9.9609375e-06,
"loss": 0.0548,
"step": 8500
},
{
"epoch": 3.2296015180265654,
"grad_norm": 8.026607513427734,
"learning_rate": 9.912109375000001e-06,
"loss": 0.0353,
"step": 8510
},
{
"epoch": 3.2333965844402277,
"grad_norm": 0.1389143019914627,
"learning_rate": 9.863281250000001e-06,
"loss": 0.0419,
"step": 8520
},
{
"epoch": 3.23719165085389,
"grad_norm": 1.255216121673584,
"learning_rate": 9.814453125e-06,
"loss": 0.0697,
"step": 8530
},
{
"epoch": 3.2409867172675524,
"grad_norm": 4.600146770477295,
"learning_rate": 9.765625e-06,
"loss": 0.0879,
"step": 8540
},
{
"epoch": 3.2447817836812143,
"grad_norm": 0.09613824635744095,
"learning_rate": 9.716796875e-06,
"loss": 0.0122,
"step": 8550
},
{
"epoch": 3.2485768500948766,
"grad_norm": 1.0265446901321411,
"learning_rate": 9.66796875e-06,
"loss": 0.0227,
"step": 8560
},
{
"epoch": 3.252371916508539,
"grad_norm": 2.185931444168091,
"learning_rate": 9.619140625000001e-06,
"loss": 0.1162,
"step": 8570
},
{
"epoch": 3.2561669829222013,
"grad_norm": 0.1482323259115219,
"learning_rate": 9.5703125e-06,
"loss": 0.0581,
"step": 8580
},
{
"epoch": 3.259962049335863,
"grad_norm": 0.17460452020168304,
"learning_rate": 9.521484375e-06,
"loss": 0.0399,
"step": 8590
},
{
"epoch": 3.2637571157495255,
"grad_norm": 1.6274187564849854,
"learning_rate": 9.47265625e-06,
"loss": 0.0537,
"step": 8600
},
{
"epoch": 3.267552182163188,
"grad_norm": 8.227033615112305,
"learning_rate": 9.423828125e-06,
"loss": 0.0646,
"step": 8610
},
{
"epoch": 3.27134724857685,
"grad_norm": 0.08734069019556046,
"learning_rate": 9.375000000000001e-06,
"loss": 0.0675,
"step": 8620
},
{
"epoch": 3.2751423149905126,
"grad_norm": 0.5700662732124329,
"learning_rate": 9.326171875000001e-06,
"loss": 0.0744,
"step": 8630
},
{
"epoch": 3.2789373814041745,
"grad_norm": 2.089008092880249,
"learning_rate": 9.27734375e-06,
"loss": 0.0812,
"step": 8640
},
{
"epoch": 3.282732447817837,
"grad_norm": 0.11990799009799957,
"learning_rate": 9.228515625e-06,
"loss": 0.071,
"step": 8650
},
{
"epoch": 3.286527514231499,
"grad_norm": 0.5663464665412903,
"learning_rate": 9.1796875e-06,
"loss": 0.0279,
"step": 8660
},
{
"epoch": 3.2903225806451615,
"grad_norm": 0.8847103118896484,
"learning_rate": 9.130859375e-06,
"loss": 0.0473,
"step": 8670
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.08891147375106812,
"learning_rate": 9.082031250000001e-06,
"loss": 0.041,
"step": 8680
},
{
"epoch": 3.2979127134724857,
"grad_norm": 0.0875004231929779,
"learning_rate": 9.033203125000001e-06,
"loss": 0.0284,
"step": 8690
},
{
"epoch": 3.301707779886148,
"grad_norm": 0.353773832321167,
"learning_rate": 8.984375e-06,
"loss": 0.0451,
"step": 8700
},
{
"epoch": 3.3055028462998104,
"grad_norm": 0.03987530991435051,
"learning_rate": 8.935546875e-06,
"loss": 0.0803,
"step": 8710
},
{
"epoch": 3.3092979127134727,
"grad_norm": 2.087677001953125,
"learning_rate": 8.88671875e-06,
"loss": 0.0257,
"step": 8720
},
{
"epoch": 3.3130929791271346,
"grad_norm": 4.051992893218994,
"learning_rate": 8.837890625e-06,
"loss": 0.0345,
"step": 8730
},
{
"epoch": 3.316888045540797,
"grad_norm": 3.694368362426758,
"learning_rate": 8.789062500000001e-06,
"loss": 0.0824,
"step": 8740
},
{
"epoch": 3.3206831119544593,
"grad_norm": 0.09131748974323273,
"learning_rate": 8.740234375e-06,
"loss": 0.0295,
"step": 8750
},
{
"epoch": 3.324478178368121,
"grad_norm": 0.05908443033695221,
"learning_rate": 8.69140625e-06,
"loss": 0.0282,
"step": 8760
},
{
"epoch": 3.3282732447817835,
"grad_norm": 1.863980770111084,
"learning_rate": 8.642578125e-06,
"loss": 0.0442,
"step": 8770
},
{
"epoch": 3.332068311195446,
"grad_norm": 1.2207703590393066,
"learning_rate": 8.59375e-06,
"loss": 0.0316,
"step": 8780
},
{
"epoch": 3.3358633776091082,
"grad_norm": 2.562156915664673,
"learning_rate": 8.544921875e-06,
"loss": 0.0598,
"step": 8790
},
{
"epoch": 3.3396584440227706,
"grad_norm": 5.533409595489502,
"learning_rate": 8.496093750000001e-06,
"loss": 0.0432,
"step": 8800
},
{
"epoch": 3.3434535104364325,
"grad_norm": 0.47492659091949463,
"learning_rate": 8.447265625e-06,
"loss": 0.0528,
"step": 8810
},
{
"epoch": 3.347248576850095,
"grad_norm": 1.0108855962753296,
"learning_rate": 8.3984375e-06,
"loss": 0.0552,
"step": 8820
},
{
"epoch": 3.351043643263757,
"grad_norm": 1.780705451965332,
"learning_rate": 8.349609375e-06,
"loss": 0.0252,
"step": 8830
},
{
"epoch": 3.3548387096774195,
"grad_norm": 0.3152208924293518,
"learning_rate": 8.30078125e-06,
"loss": 0.0915,
"step": 8840
},
{
"epoch": 3.3586337760910814,
"grad_norm": 1.9720813035964966,
"learning_rate": 8.251953125000001e-06,
"loss": 0.0571,
"step": 8850
},
{
"epoch": 3.3624288425047437,
"grad_norm": 0.5636972784996033,
"learning_rate": 8.203125000000001e-06,
"loss": 0.0716,
"step": 8860
},
{
"epoch": 3.366223908918406,
"grad_norm": 9.523944854736328,
"learning_rate": 8.154296875e-06,
"loss": 0.0649,
"step": 8870
},
{
"epoch": 3.3700189753320684,
"grad_norm": 1.868201732635498,
"learning_rate": 8.10546875e-06,
"loss": 0.1055,
"step": 8880
},
{
"epoch": 3.3738140417457307,
"grad_norm": 4.064790725708008,
"learning_rate": 8.056640625e-06,
"loss": 0.0681,
"step": 8890
},
{
"epoch": 3.3776091081593926,
"grad_norm": 5.854636192321777,
"learning_rate": 8.0078125e-06,
"loss": 0.0755,
"step": 8900
},
{
"epoch": 3.381404174573055,
"grad_norm": 0.47955596446990967,
"learning_rate": 7.958984375000001e-06,
"loss": 0.0832,
"step": 8910
},
{
"epoch": 3.3851992409867173,
"grad_norm": 0.48627012968063354,
"learning_rate": 7.91015625e-06,
"loss": 0.0487,
"step": 8920
},
{
"epoch": 3.3889943074003797,
"grad_norm": 1.4986870288848877,
"learning_rate": 7.861328125e-06,
"loss": 0.0769,
"step": 8930
},
{
"epoch": 3.3927893738140416,
"grad_norm": 1.139615774154663,
"learning_rate": 7.8125e-06,
"loss": 0.0238,
"step": 8940
},
{
"epoch": 3.396584440227704,
"grad_norm": 0.17134952545166016,
"learning_rate": 7.763671875e-06,
"loss": 0.072,
"step": 8950
},
{
"epoch": 3.4003795066413662,
"grad_norm": 0.15060165524482727,
"learning_rate": 7.71484375e-06,
"loss": 0.0607,
"step": 8960
},
{
"epoch": 3.4041745730550286,
"grad_norm": 1.0973819494247437,
"learning_rate": 7.666015625000001e-06,
"loss": 0.0914,
"step": 8970
},
{
"epoch": 3.407969639468691,
"grad_norm": 4.7881951332092285,
"learning_rate": 7.6171875000000005e-06,
"loss": 0.0515,
"step": 8980
},
{
"epoch": 3.411764705882353,
"grad_norm": 2.9025986194610596,
"learning_rate": 7.568359375e-06,
"loss": 0.0576,
"step": 8990
},
{
"epoch": 3.415559772296015,
"grad_norm": 0.07781478762626648,
"learning_rate": 7.51953125e-06,
"loss": 0.0318,
"step": 9000
},
{
"epoch": 3.4193548387096775,
"grad_norm": 2.8141448497772217,
"learning_rate": 7.4707031250000005e-06,
"loss": 0.0598,
"step": 9010
},
{
"epoch": 3.42314990512334,
"grad_norm": 1.2371045351028442,
"learning_rate": 7.421875e-06,
"loss": 0.1014,
"step": 9020
},
{
"epoch": 3.4269449715370017,
"grad_norm": 0.11280115693807602,
"learning_rate": 7.373046875e-06,
"loss": 0.0571,
"step": 9030
},
{
"epoch": 3.430740037950664,
"grad_norm": 0.07071410119533539,
"learning_rate": 7.3242187500000006e-06,
"loss": 0.0289,
"step": 9040
},
{
"epoch": 3.4345351043643264,
"grad_norm": 0.07948953658342361,
"learning_rate": 7.275390625e-06,
"loss": 0.0328,
"step": 9050
},
{
"epoch": 3.4383301707779887,
"grad_norm": 6.166849613189697,
"learning_rate": 7.2265625e-06,
"loss": 0.0501,
"step": 9060
},
{
"epoch": 3.442125237191651,
"grad_norm": 0.3815774619579315,
"learning_rate": 7.177734375000001e-06,
"loss": 0.0449,
"step": 9070
},
{
"epoch": 3.445920303605313,
"grad_norm": 0.21274378895759583,
"learning_rate": 7.12890625e-06,
"loss": 0.0871,
"step": 9080
},
{
"epoch": 3.4497153700189753,
"grad_norm": 0.5041061043739319,
"learning_rate": 7.080078125e-06,
"loss": 0.0451,
"step": 9090
},
{
"epoch": 3.4535104364326377,
"grad_norm": 2.4566073417663574,
"learning_rate": 7.031250000000001e-06,
"loss": 0.0622,
"step": 9100
},
{
"epoch": 3.4573055028462996,
"grad_norm": 5.31998872756958,
"learning_rate": 6.982421875e-06,
"loss": 0.0545,
"step": 9110
},
{
"epoch": 3.461100569259962,
"grad_norm": 0.2531034052371979,
"learning_rate": 6.93359375e-06,
"loss": 0.0449,
"step": 9120
},
{
"epoch": 3.4648956356736242,
"grad_norm": 0.03640067204833031,
"learning_rate": 6.884765625000001e-06,
"loss": 0.0944,
"step": 9130
},
{
"epoch": 3.4686907020872866,
"grad_norm": 0.9717852473258972,
"learning_rate": 6.8359375e-06,
"loss": 0.0165,
"step": 9140
},
{
"epoch": 3.472485768500949,
"grad_norm": 1.4924548864364624,
"learning_rate": 6.7871093750000004e-06,
"loss": 0.069,
"step": 9150
},
{
"epoch": 3.476280834914611,
"grad_norm": 2.620271682739258,
"learning_rate": 6.738281250000001e-06,
"loss": 0.0967,
"step": 9160
},
{
"epoch": 3.480075901328273,
"grad_norm": 2.279548406600952,
"learning_rate": 6.689453125e-06,
"loss": 0.0257,
"step": 9170
},
{
"epoch": 3.4838709677419355,
"grad_norm": 0.08608423173427582,
"learning_rate": 6.6406250000000005e-06,
"loss": 0.0359,
"step": 9180
},
{
"epoch": 3.487666034155598,
"grad_norm": 5.201995849609375,
"learning_rate": 6.591796875e-06,
"loss": 0.0349,
"step": 9190
},
{
"epoch": 3.4914611005692597,
"grad_norm": 0.6848796606063843,
"learning_rate": 6.54296875e-06,
"loss": 0.0473,
"step": 9200
},
{
"epoch": 3.495256166982922,
"grad_norm": 1.0673704147338867,
"learning_rate": 6.4941406250000005e-06,
"loss": 0.0751,
"step": 9210
},
{
"epoch": 3.4990512333965844,
"grad_norm": 6.374655723571777,
"learning_rate": 6.4453125e-06,
"loss": 0.0672,
"step": 9220
},
{
"epoch": 3.5028462998102468,
"grad_norm": 3.0670387744903564,
"learning_rate": 6.396484375e-06,
"loss": 0.1047,
"step": 9230
},
{
"epoch": 3.506641366223909,
"grad_norm": 2.0058538913726807,
"learning_rate": 6.3476562500000006e-06,
"loss": 0.0571,
"step": 9240
},
{
"epoch": 3.510436432637571,
"grad_norm": 0.8808121681213379,
"learning_rate": 6.298828125e-06,
"loss": 0.0742,
"step": 9250
},
{
"epoch": 3.5142314990512333,
"grad_norm": 0.1013035699725151,
"learning_rate": 6.25e-06,
"loss": 0.0506,
"step": 9260
},
{
"epoch": 3.5180265654648957,
"grad_norm": 1.1379400491714478,
"learning_rate": 6.201171875000001e-06,
"loss": 0.0466,
"step": 9270
},
{
"epoch": 3.521821631878558,
"grad_norm": 0.44777366518974304,
"learning_rate": 6.15234375e-06,
"loss": 0.0425,
"step": 9280
},
{
"epoch": 3.52561669829222,
"grad_norm": 0.6099011301994324,
"learning_rate": 6.103515625e-06,
"loss": 0.0368,
"step": 9290
},
{
"epoch": 3.5294117647058822,
"grad_norm": 10.134333610534668,
"learning_rate": 6.054687500000001e-06,
"loss": 0.0459,
"step": 9300
},
{
"epoch": 3.5332068311195446,
"grad_norm": 10.301962852478027,
"learning_rate": 6.005859375e-06,
"loss": 0.0712,
"step": 9310
},
{
"epoch": 3.537001897533207,
"grad_norm": 2.240419864654541,
"learning_rate": 5.95703125e-06,
"loss": 0.0496,
"step": 9320
},
{
"epoch": 3.5407969639468693,
"grad_norm": 9.403803825378418,
"learning_rate": 5.908203125000001e-06,
"loss": 0.0551,
"step": 9330
},
{
"epoch": 3.544592030360531,
"grad_norm": 0.0765363797545433,
"learning_rate": 5.859375e-06,
"loss": 0.0382,
"step": 9340
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.6216185688972473,
"learning_rate": 5.810546875e-06,
"loss": 0.0723,
"step": 9350
},
{
"epoch": 3.552182163187856,
"grad_norm": 6.577167987823486,
"learning_rate": 5.76171875e-06,
"loss": 0.0626,
"step": 9360
},
{
"epoch": 3.555977229601518,
"grad_norm": 0.15332098305225372,
"learning_rate": 5.712890625e-06,
"loss": 0.0419,
"step": 9370
},
{
"epoch": 3.55977229601518,
"grad_norm": 3.2923789024353027,
"learning_rate": 5.6640625000000005e-06,
"loss": 0.0894,
"step": 9380
},
{
"epoch": 3.5635673624288424,
"grad_norm": 1.0206191539764404,
"learning_rate": 5.615234375e-06,
"loss": 0.0477,
"step": 9390
},
{
"epoch": 3.5673624288425048,
"grad_norm": 5.454959869384766,
"learning_rate": 5.56640625e-06,
"loss": 0.0315,
"step": 9400
},
{
"epoch": 3.571157495256167,
"grad_norm": 0.3191007673740387,
"learning_rate": 5.5175781250000005e-06,
"loss": 0.068,
"step": 9410
},
{
"epoch": 3.5749525616698294,
"grad_norm": 12.383304595947266,
"learning_rate": 5.46875e-06,
"loss": 0.0444,
"step": 9420
},
{
"epoch": 3.5787476280834913,
"grad_norm": 1.9023758172988892,
"learning_rate": 5.419921875e-06,
"loss": 0.0942,
"step": 9430
},
{
"epoch": 3.5825426944971537,
"grad_norm": 0.06706677377223969,
"learning_rate": 5.3710937500000005e-06,
"loss": 0.0512,
"step": 9440
},
{
"epoch": 3.586337760910816,
"grad_norm": 0.32390040159225464,
"learning_rate": 5.322265625e-06,
"loss": 0.0603,
"step": 9450
},
{
"epoch": 3.590132827324478,
"grad_norm": 1.5318775177001953,
"learning_rate": 5.2734375e-06,
"loss": 0.0491,
"step": 9460
},
{
"epoch": 3.5939278937381403,
"grad_norm": 0.5909900665283203,
"learning_rate": 5.2246093750000006e-06,
"loss": 0.0294,
"step": 9470
},
{
"epoch": 3.5977229601518026,
"grad_norm": 1.5226948261260986,
"learning_rate": 5.17578125e-06,
"loss": 0.0621,
"step": 9480
},
{
"epoch": 3.601518026565465,
"grad_norm": 0.24643893539905548,
"learning_rate": 5.126953125e-06,
"loss": 0.0293,
"step": 9490
},
{
"epoch": 3.6053130929791273,
"grad_norm": 7.143110752105713,
"learning_rate": 5.078125000000001e-06,
"loss": 0.0592,
"step": 9500
},
{
"epoch": 3.6091081593927896,
"grad_norm": 3.5135350227355957,
"learning_rate": 5.029296875e-06,
"loss": 0.0705,
"step": 9510
},
{
"epoch": 3.6129032258064515,
"grad_norm": 4.653140544891357,
"learning_rate": 4.98046875e-06,
"loss": 0.0624,
"step": 9520
},
{
"epoch": 3.616698292220114,
"grad_norm": 0.044525645673274994,
"learning_rate": 4.931640625000001e-06,
"loss": 0.0449,
"step": 9530
},
{
"epoch": 3.620493358633776,
"grad_norm": 7.338439464569092,
"learning_rate": 4.8828125e-06,
"loss": 0.0536,
"step": 9540
},
{
"epoch": 3.624288425047438,
"grad_norm": 0.4086396396160126,
"learning_rate": 4.833984375e-06,
"loss": 0.038,
"step": 9550
},
{
"epoch": 3.6280834914611004,
"grad_norm": 0.05038388445973396,
"learning_rate": 4.78515625e-06,
"loss": 0.0458,
"step": 9560
},
{
"epoch": 3.6318785578747628,
"grad_norm": 0.09961717575788498,
"learning_rate": 4.736328125e-06,
"loss": 0.0468,
"step": 9570
},
{
"epoch": 3.635673624288425,
"grad_norm": 0.27485185861587524,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.0675,
"step": 9580
},
{
"epoch": 3.6394686907020875,
"grad_norm": 4.295794486999512,
"learning_rate": 4.638671875e-06,
"loss": 0.0519,
"step": 9590
},
{
"epoch": 3.64326375711575,
"grad_norm": 1.9907684326171875,
"learning_rate": 4.58984375e-06,
"loss": 0.0422,
"step": 9600
},
{
"epoch": 3.6470588235294117,
"grad_norm": 0.12039614468812943,
"learning_rate": 4.5410156250000005e-06,
"loss": 0.044,
"step": 9610
},
{
"epoch": 3.650853889943074,
"grad_norm": 0.4942443072795868,
"learning_rate": 4.4921875e-06,
"loss": 0.0828,
"step": 9620
},
{
"epoch": 3.6546489563567364,
"grad_norm": 0.8744149804115295,
"learning_rate": 4.443359375e-06,
"loss": 0.0514,
"step": 9630
},
{
"epoch": 3.6584440227703983,
"grad_norm": 1.8012325763702393,
"learning_rate": 4.3945312500000005e-06,
"loss": 0.0389,
"step": 9640
},
{
"epoch": 3.6622390891840606,
"grad_norm": 0.09957607835531235,
"learning_rate": 4.345703125e-06,
"loss": 0.0512,
"step": 9650
},
{
"epoch": 3.666034155597723,
"grad_norm": 0.0749269425868988,
"learning_rate": 4.296875e-06,
"loss": 0.0278,
"step": 9660
},
{
"epoch": 3.6698292220113853,
"grad_norm": 0.04859253391623497,
"learning_rate": 4.2480468750000006e-06,
"loss": 0.0813,
"step": 9670
},
{
"epoch": 3.6736242884250476,
"grad_norm": 3.236546277999878,
"learning_rate": 4.19921875e-06,
"loss": 0.0408,
"step": 9680
},
{
"epoch": 3.6774193548387095,
"grad_norm": 2.782500743865967,
"learning_rate": 4.150390625e-06,
"loss": 0.0365,
"step": 9690
},
{
"epoch": 3.681214421252372,
"grad_norm": 0.2516065835952759,
"learning_rate": 4.101562500000001e-06,
"loss": 0.0541,
"step": 9700
},
{
"epoch": 3.685009487666034,
"grad_norm": 0.0802445337176323,
"learning_rate": 4.052734375e-06,
"loss": 0.0296,
"step": 9710
},
{
"epoch": 3.6888045540796965,
"grad_norm": 0.7485657930374146,
"learning_rate": 4.00390625e-06,
"loss": 0.0194,
"step": 9720
},
{
"epoch": 3.6925996204933584,
"grad_norm": 0.05877687409520149,
"learning_rate": 3.955078125e-06,
"loss": 0.0547,
"step": 9730
},
{
"epoch": 3.6963946869070208,
"grad_norm": 3.6818785667419434,
"learning_rate": 3.90625e-06,
"loss": 0.0801,
"step": 9740
},
{
"epoch": 3.700189753320683,
"grad_norm": 0.22303463518619537,
"learning_rate": 3.857421875e-06,
"loss": 0.0326,
"step": 9750
},
{
"epoch": 3.7039848197343455,
"grad_norm": 0.16665808856487274,
"learning_rate": 3.8085937500000002e-06,
"loss": 0.0664,
"step": 9760
},
{
"epoch": 3.707779886148008,
"grad_norm": 0.2113623172044754,
"learning_rate": 3.759765625e-06,
"loss": 0.0495,
"step": 9770
},
{
"epoch": 3.7115749525616697,
"grad_norm": 1.9400161504745483,
"learning_rate": 3.7109375e-06,
"loss": 0.062,
"step": 9780
},
{
"epoch": 3.715370018975332,
"grad_norm": 2.147211790084839,
"learning_rate": 3.6621093750000003e-06,
"loss": 0.0408,
"step": 9790
},
{
"epoch": 3.7191650853889944,
"grad_norm": 0.17818136513233185,
"learning_rate": 3.61328125e-06,
"loss": 0.0376,
"step": 9800
},
{
"epoch": 3.7229601518026563,
"grad_norm": 0.2646294832229614,
"learning_rate": 3.564453125e-06,
"loss": 0.0488,
"step": 9810
},
{
"epoch": 3.7267552182163186,
"grad_norm": 0.07648167759180069,
"learning_rate": 3.5156250000000003e-06,
"loss": 0.0618,
"step": 9820
},
{
"epoch": 3.730550284629981,
"grad_norm": 4.988431930541992,
"learning_rate": 3.466796875e-06,
"loss": 0.0438,
"step": 9830
},
{
"epoch": 3.7343453510436433,
"grad_norm": 4.025431156158447,
"learning_rate": 3.41796875e-06,
"loss": 0.0663,
"step": 9840
},
{
"epoch": 3.7381404174573056,
"grad_norm": 0.7877894043922424,
"learning_rate": 3.3691406250000004e-06,
"loss": 0.0261,
"step": 9850
},
{
"epoch": 3.741935483870968,
"grad_norm": 1.7883660793304443,
"learning_rate": 3.3203125000000002e-06,
"loss": 0.0481,
"step": 9860
},
{
"epoch": 3.74573055028463,
"grad_norm": 2.136960029602051,
"learning_rate": 3.271484375e-06,
"loss": 0.052,
"step": 9870
},
{
"epoch": 3.749525616698292,
"grad_norm": 0.9067153930664062,
"learning_rate": 3.22265625e-06,
"loss": 0.0567,
"step": 9880
},
{
"epoch": 3.7533206831119545,
"grad_norm": 1.2437059879302979,
"learning_rate": 3.1738281250000003e-06,
"loss": 0.053,
"step": 9890
},
{
"epoch": 3.7571157495256164,
"grad_norm": 2.1223294734954834,
"learning_rate": 3.125e-06,
"loss": 0.0484,
"step": 9900
},
{
"epoch": 3.760910815939279,
"grad_norm": 8.40434455871582,
"learning_rate": 3.076171875e-06,
"loss": 0.0451,
"step": 9910
},
{
"epoch": 3.764705882352941,
"grad_norm": 2.565584421157837,
"learning_rate": 3.0273437500000003e-06,
"loss": 0.0589,
"step": 9920
},
{
"epoch": 3.7685009487666035,
"grad_norm": 5.559597492218018,
"learning_rate": 2.978515625e-06,
"loss": 0.0396,
"step": 9930
},
{
"epoch": 3.772296015180266,
"grad_norm": 0.5843867659568787,
"learning_rate": 2.9296875e-06,
"loss": 0.0682,
"step": 9940
},
{
"epoch": 3.776091081593928,
"grad_norm": 1.6344566345214844,
"learning_rate": 2.880859375e-06,
"loss": 0.0892,
"step": 9950
},
{
"epoch": 3.77988614800759,
"grad_norm": 5.6130051612854,
"learning_rate": 2.8320312500000002e-06,
"loss": 0.0439,
"step": 9960
},
{
"epoch": 3.7836812144212524,
"grad_norm": 3.700528144836426,
"learning_rate": 2.783203125e-06,
"loss": 0.0228,
"step": 9970
},
{
"epoch": 3.7874762808349147,
"grad_norm": 2.797687530517578,
"learning_rate": 2.734375e-06,
"loss": 0.0247,
"step": 9980
},
{
"epoch": 3.7912713472485766,
"grad_norm": 1.7192658185958862,
"learning_rate": 2.6855468750000003e-06,
"loss": 0.0792,
"step": 9990
},
{
"epoch": 3.795066413662239,
"grad_norm": 0.0573776513338089,
"learning_rate": 2.63671875e-06,
"loss": 0.0136,
"step": 10000
},
{
"epoch": 3.7988614800759013,
"grad_norm": 0.07321004569530487,
"learning_rate": 2.587890625e-06,
"loss": 0.0461,
"step": 10010
},
{
"epoch": 3.8026565464895636,
"grad_norm": 0.045114945620298386,
"learning_rate": 2.5390625000000003e-06,
"loss": 0.0658,
"step": 10020
},
{
"epoch": 3.806451612903226,
"grad_norm": 0.3899228870868683,
"learning_rate": 2.490234375e-06,
"loss": 0.0389,
"step": 10030
},
{
"epoch": 3.8102466793168883,
"grad_norm": 0.6319021582603455,
"learning_rate": 2.44140625e-06,
"loss": 0.0247,
"step": 10040
},
{
"epoch": 3.81404174573055,
"grad_norm": 1.4026541709899902,
"learning_rate": 2.392578125e-06,
"loss": 0.0161,
"step": 10050
},
{
"epoch": 3.8178368121442126,
"grad_norm": 4.106344699859619,
"learning_rate": 2.3437500000000002e-06,
"loss": 0.042,
"step": 10060
},
{
"epoch": 3.821631878557875,
"grad_norm": 0.5673054456710815,
"learning_rate": 2.294921875e-06,
"loss": 0.0589,
"step": 10070
},
{
"epoch": 3.825426944971537,
"grad_norm": 0.057744644582271576,
"learning_rate": 2.24609375e-06,
"loss": 0.0305,
"step": 10080
},
{
"epoch": 3.829222011385199,
"grad_norm": 3.3453450202941895,
"learning_rate": 2.1972656250000003e-06,
"loss": 0.0317,
"step": 10090
},
{
"epoch": 3.8330170777988615,
"grad_norm": 0.08820886164903641,
"learning_rate": 2.1484375e-06,
"loss": 0.0355,
"step": 10100
},
{
"epoch": 3.836812144212524,
"grad_norm": 1.522764801979065,
"learning_rate": 2.099609375e-06,
"loss": 0.0496,
"step": 10110
},
{
"epoch": 3.840607210626186,
"grad_norm": 0.9732184410095215,
"learning_rate": 2.0507812500000003e-06,
"loss": 0.0303,
"step": 10120
},
{
"epoch": 3.844402277039848,
"grad_norm": 0.1131846010684967,
"learning_rate": 2.001953125e-06,
"loss": 0.0359,
"step": 10130
},
{
"epoch": 3.8481973434535104,
"grad_norm": 3.4666688442230225,
"learning_rate": 1.953125e-06,
"loss": 0.0542,
"step": 10140
},
{
"epoch": 3.8519924098671727,
"grad_norm": 3.6389381885528564,
"learning_rate": 1.9042968750000001e-06,
"loss": 0.0328,
"step": 10150
},
{
"epoch": 3.855787476280835,
"grad_norm": 0.7695565819740295,
"learning_rate": 1.85546875e-06,
"loss": 0.0294,
"step": 10160
},
{
"epoch": 3.859582542694497,
"grad_norm": 5.1775593757629395,
"learning_rate": 1.806640625e-06,
"loss": 0.055,
"step": 10170
},
{
"epoch": 3.8633776091081593,
"grad_norm": 0.46061795949935913,
"learning_rate": 1.7578125000000002e-06,
"loss": 0.0376,
"step": 10180
},
{
"epoch": 3.8671726755218216,
"grad_norm": 0.16866852343082428,
"learning_rate": 1.708984375e-06,
"loss": 0.092,
"step": 10190
},
{
"epoch": 3.870967741935484,
"grad_norm": 2.495349168777466,
"learning_rate": 1.6601562500000001e-06,
"loss": 0.0205,
"step": 10200
},
{
"epoch": 3.8747628083491463,
"grad_norm": 4.127594470977783,
"learning_rate": 1.611328125e-06,
"loss": 0.0376,
"step": 10210
},
{
"epoch": 3.878557874762808,
"grad_norm": 0.0868837833404541,
"learning_rate": 1.5625e-06,
"loss": 0.0715,
"step": 10220
},
{
"epoch": 3.8823529411764706,
"grad_norm": 2.7866268157958984,
"learning_rate": 1.5136718750000002e-06,
"loss": 0.0623,
"step": 10230
},
{
"epoch": 3.886148007590133,
"grad_norm": 0.5652477741241455,
"learning_rate": 1.46484375e-06,
"loss": 0.0521,
"step": 10240
},
{
"epoch": 3.889943074003795,
"grad_norm": 0.13568060100078583,
"learning_rate": 1.4160156250000001e-06,
"loss": 0.0373,
"step": 10250
},
{
"epoch": 3.893738140417457,
"grad_norm": 7.213637828826904,
"learning_rate": 1.3671875e-06,
"loss": 0.1189,
"step": 10260
},
{
"epoch": 3.8975332068311195,
"grad_norm": 4.795431613922119,
"learning_rate": 1.318359375e-06,
"loss": 0.0368,
"step": 10270
},
{
"epoch": 3.901328273244782,
"grad_norm": 4.8751220703125,
"learning_rate": 1.2695312500000002e-06,
"loss": 0.0972,
"step": 10280
},
{
"epoch": 3.905123339658444,
"grad_norm": 0.5513148307800293,
"learning_rate": 1.220703125e-06,
"loss": 0.0287,
"step": 10290
},
{
"epoch": 3.9089184060721065,
"grad_norm": 0.16232678294181824,
"learning_rate": 1.1718750000000001e-06,
"loss": 0.0651,
"step": 10300
},
{
"epoch": 3.9127134724857684,
"grad_norm": 3.053624391555786,
"learning_rate": 1.123046875e-06,
"loss": 0.0358,
"step": 10310
},
{
"epoch": 3.9165085388994307,
"grad_norm": 0.1307297945022583,
"learning_rate": 1.07421875e-06,
"loss": 0.0171,
"step": 10320
},
{
"epoch": 3.920303605313093,
"grad_norm": 5.61918306350708,
"learning_rate": 1.0253906250000001e-06,
"loss": 0.0383,
"step": 10330
},
{
"epoch": 3.924098671726755,
"grad_norm": 4.017998695373535,
"learning_rate": 9.765625e-07,
"loss": 0.0547,
"step": 10340
},
{
"epoch": 3.9278937381404173,
"grad_norm": 8.339895248413086,
"learning_rate": 9.27734375e-07,
"loss": 0.059,
"step": 10350
},
{
"epoch": 3.9316888045540797,
"grad_norm": 0.5986772179603577,
"learning_rate": 8.789062500000001e-07,
"loss": 0.0773,
"step": 10360
},
{
"epoch": 3.935483870967742,
"grad_norm": 0.0516970194876194,
"learning_rate": 8.300781250000001e-07,
"loss": 0.0697,
"step": 10370
},
{
"epoch": 3.9392789373814043,
"grad_norm": 1.0691931247711182,
"learning_rate": 7.8125e-07,
"loss": 0.0382,
"step": 10380
},
{
"epoch": 3.9430740037950667,
"grad_norm": 1.0503530502319336,
"learning_rate": 7.32421875e-07,
"loss": 0.0781,
"step": 10390
},
{
"epoch": 3.9468690702087286,
"grad_norm": 4.003793239593506,
"learning_rate": 6.8359375e-07,
"loss": 0.1007,
"step": 10400
},
{
"epoch": 3.950664136622391,
"grad_norm": 0.04315977543592453,
"learning_rate": 6.347656250000001e-07,
"loss": 0.0553,
"step": 10410
},
{
"epoch": 3.9544592030360532,
"grad_norm": 4.378900051116943,
"learning_rate": 5.859375000000001e-07,
"loss": 0.0239,
"step": 10420
},
{
"epoch": 3.958254269449715,
"grad_norm": 0.17604303359985352,
"learning_rate": 5.37109375e-07,
"loss": 0.0338,
"step": 10430
},
{
"epoch": 3.9620493358633775,
"grad_norm": 0.040019456297159195,
"learning_rate": 4.8828125e-07,
"loss": 0.0088,
"step": 10440
},
{
"epoch": 3.96584440227704,
"grad_norm": 4.001920700073242,
"learning_rate": 4.3945312500000004e-07,
"loss": 0.0395,
"step": 10450
},
{
"epoch": 3.969639468690702,
"grad_norm": 4.805160999298096,
"learning_rate": 3.90625e-07,
"loss": 0.0713,
"step": 10460
},
{
"epoch": 3.9734345351043645,
"grad_norm": 0.0865137130022049,
"learning_rate": 3.41796875e-07,
"loss": 0.0394,
"step": 10470
},
{
"epoch": 3.9772296015180264,
"grad_norm": 2.695357322692871,
"learning_rate": 2.9296875000000003e-07,
"loss": 0.0582,
"step": 10480
},
{
"epoch": 3.9810246679316887,
"grad_norm": 0.9629122018814087,
"learning_rate": 2.44140625e-07,
"loss": 0.0201,
"step": 10490
},
{
"epoch": 3.984819734345351,
"grad_norm": 0.8045425415039062,
"learning_rate": 1.953125e-07,
"loss": 0.0653,
"step": 10500
}
],
"logging_steps": 10,
"max_steps": 10540,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2762272477794816.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}