{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9972020145495244,
"eval_steps": 500,
"global_step": 891,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003357582540570789,
"grad_norm": 15.05952844044008,
"learning_rate": 0.0,
"loss": 1.1094,
"step": 1
},
{
"epoch": 0.006715165081141578,
"grad_norm": 15.55795952285201,
"learning_rate": 1.1111111111111112e-07,
"loss": 1.2127,
"step": 2
},
{
"epoch": 0.010072747621712367,
"grad_norm": 16.539001809523764,
"learning_rate": 2.2222222222222224e-07,
"loss": 1.1671,
"step": 3
},
{
"epoch": 0.013430330162283156,
"grad_norm": 16.368302779349722,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.1785,
"step": 4
},
{
"epoch": 0.016787912702853944,
"grad_norm": 15.588296216474147,
"learning_rate": 4.444444444444445e-07,
"loss": 1.1263,
"step": 5
},
{
"epoch": 0.020145495243424735,
"grad_norm": 16.499164600464685,
"learning_rate": 5.555555555555555e-07,
"loss": 1.1748,
"step": 6
},
{
"epoch": 0.023503077783995522,
"grad_norm": 14.083316686941275,
"learning_rate": 6.666666666666667e-07,
"loss": 1.1657,
"step": 7
},
{
"epoch": 0.026860660324566313,
"grad_norm": 16.604129854168686,
"learning_rate": 7.777777777777779e-07,
"loss": 1.2313,
"step": 8
},
{
"epoch": 0.0302182428651371,
"grad_norm": 14.603261078745698,
"learning_rate": 8.88888888888889e-07,
"loss": 1.1465,
"step": 9
},
{
"epoch": 0.03357582540570789,
"grad_norm": 12.7832318620063,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.1103,
"step": 10
},
{
"epoch": 0.03693340794627868,
"grad_norm": 11.676821024645601,
"learning_rate": 1.111111111111111e-06,
"loss": 1.0456,
"step": 11
},
{
"epoch": 0.04029099048684947,
"grad_norm": 10.276715553394455,
"learning_rate": 1.2222222222222223e-06,
"loss": 1.0994,
"step": 12
},
{
"epoch": 0.04364857302742026,
"grad_norm": 7.580128095019628,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.9192,
"step": 13
},
{
"epoch": 0.047006155567991044,
"grad_norm": 4.579086500989691,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.9019,
"step": 14
},
{
"epoch": 0.05036373810856184,
"grad_norm": 4.462238466517284,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.932,
"step": 15
},
{
"epoch": 0.053721320649132626,
"grad_norm": 4.49504437288462,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.9504,
"step": 16
},
{
"epoch": 0.05707890318970341,
"grad_norm": 4.856137118720444,
"learning_rate": 1.777777777777778e-06,
"loss": 0.8997,
"step": 17
},
{
"epoch": 0.0604364857302742,
"grad_norm": 5.458807952193987,
"learning_rate": 1.888888888888889e-06,
"loss": 0.9135,
"step": 18
},
{
"epoch": 0.063794068270845,
"grad_norm": 4.914187499366455,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.895,
"step": 19
},
{
"epoch": 0.06715165081141578,
"grad_norm": 5.8484928291138685,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.9178,
"step": 20
},
{
"epoch": 0.07050923335198657,
"grad_norm": 6.405365379448204,
"learning_rate": 2.222222222222222e-06,
"loss": 0.9436,
"step": 21
},
{
"epoch": 0.07386681589255736,
"grad_norm": 5.140279358058298,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.8577,
"step": 22
},
{
"epoch": 0.07722439843312814,
"grad_norm": 4.573855490751355,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.849,
"step": 23
},
{
"epoch": 0.08058198097369894,
"grad_norm": 3.4903819904679914,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.8684,
"step": 24
},
{
"epoch": 0.08393956351426973,
"grad_norm": 3.2866185409697413,
"learning_rate": 2.666666666666667e-06,
"loss": 0.9139,
"step": 25
},
{
"epoch": 0.08729714605484051,
"grad_norm": 2.9343312861580553,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.825,
"step": 26
},
{
"epoch": 0.09065472859541131,
"grad_norm": 2.7441074948207635,
"learning_rate": 2.888888888888889e-06,
"loss": 0.7743,
"step": 27
},
{
"epoch": 0.09401231113598209,
"grad_norm": 3.0852450829413036,
"learning_rate": 3e-06,
"loss": 0.8325,
"step": 28
},
{
"epoch": 0.09736989367655288,
"grad_norm": 2.5969750970962107,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.8181,
"step": 29
},
{
"epoch": 0.10072747621712368,
"grad_norm": 3.0607284743078367,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.8839,
"step": 30
},
{
"epoch": 0.10408505875769446,
"grad_norm": 2.6971303460280813,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.803,
"step": 31
},
{
"epoch": 0.10744264129826525,
"grad_norm": 2.7795448200981054,
"learning_rate": 3.444444444444445e-06,
"loss": 0.7857,
"step": 32
},
{
"epoch": 0.11080022383883603,
"grad_norm": 2.672977099956262,
"learning_rate": 3.555555555555556e-06,
"loss": 0.8166,
"step": 33
},
{
"epoch": 0.11415780637940683,
"grad_norm": 2.433972508138389,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.7488,
"step": 34
},
{
"epoch": 0.11751538891997762,
"grad_norm": 2.4637559196857812,
"learning_rate": 3.777777777777778e-06,
"loss": 0.7965,
"step": 35
},
{
"epoch": 0.1208729714605484,
"grad_norm": 2.456980144441013,
"learning_rate": 3.88888888888889e-06,
"loss": 0.8154,
"step": 36
},
{
"epoch": 0.1242305540011192,
"grad_norm": 2.4869279368019273,
"learning_rate": 4.000000000000001e-06,
"loss": 0.8126,
"step": 37
},
{
"epoch": 0.12758813654169,
"grad_norm": 2.467906619619702,
"learning_rate": 4.111111111111111e-06,
"loss": 0.8001,
"step": 38
},
{
"epoch": 0.13094571908226077,
"grad_norm": 2.581215311622158,
"learning_rate": 4.222222222222223e-06,
"loss": 0.8286,
"step": 39
},
{
"epoch": 0.13430330162283155,
"grad_norm": 2.501542706637013,
"learning_rate": 4.333333333333334e-06,
"loss": 0.8294,
"step": 40
},
{
"epoch": 0.13766088416340236,
"grad_norm": 2.334359062766939,
"learning_rate": 4.444444444444444e-06,
"loss": 0.8057,
"step": 41
},
{
"epoch": 0.14101846670397314,
"grad_norm": 2.512161617115715,
"learning_rate": 4.555555555555556e-06,
"loss": 0.7816,
"step": 42
},
{
"epoch": 0.14437604924454392,
"grad_norm": 2.3512830431401186,
"learning_rate": 4.666666666666667e-06,
"loss": 0.7406,
"step": 43
},
{
"epoch": 0.14773363178511473,
"grad_norm": 2.353319121545363,
"learning_rate": 4.777777777777778e-06,
"loss": 0.7836,
"step": 44
},
{
"epoch": 0.1510912143256855,
"grad_norm": 2.3785414037245065,
"learning_rate": 4.888888888888889e-06,
"loss": 0.7905,
"step": 45
},
{
"epoch": 0.1544487968662563,
"grad_norm": 2.489874176687805,
"learning_rate": 5e-06,
"loss": 0.77,
"step": 46
},
{
"epoch": 0.1578063794068271,
"grad_norm": 2.640302288366859,
"learning_rate": 5.1111111111111115e-06,
"loss": 0.8096,
"step": 47
},
{
"epoch": 0.16116396194739788,
"grad_norm": 2.6168372492389875,
"learning_rate": 5.2222222222222226e-06,
"loss": 0.8085,
"step": 48
},
{
"epoch": 0.16452154448796866,
"grad_norm": 2.355298598185473,
"learning_rate": 5.333333333333334e-06,
"loss": 0.7634,
"step": 49
},
{
"epoch": 0.16787912702853947,
"grad_norm": 2.4159132229794564,
"learning_rate": 5.444444444444445e-06,
"loss": 0.7764,
"step": 50
},
{
"epoch": 0.17123670956911025,
"grad_norm": 2.4612658382916344,
"learning_rate": 5.555555555555557e-06,
"loss": 0.7637,
"step": 51
},
{
"epoch": 0.17459429210968103,
"grad_norm": 2.454872028492967,
"learning_rate": 5.666666666666667e-06,
"loss": 0.8304,
"step": 52
},
{
"epoch": 0.1779518746502518,
"grad_norm": 2.231797525957087,
"learning_rate": 5.777777777777778e-06,
"loss": 0.8054,
"step": 53
},
{
"epoch": 0.18130945719082261,
"grad_norm": 2.5145359657110573,
"learning_rate": 5.88888888888889e-06,
"loss": 0.7767,
"step": 54
},
{
"epoch": 0.1846670397313934,
"grad_norm": 2.2680418205268817,
"learning_rate": 6e-06,
"loss": 0.7446,
"step": 55
},
{
"epoch": 0.18802462227196418,
"grad_norm": 2.5397886944135095,
"learning_rate": 6.111111111111112e-06,
"loss": 0.8216,
"step": 56
},
{
"epoch": 0.19138220481253498,
"grad_norm": 2.5677000160418615,
"learning_rate": 6.222222222222223e-06,
"loss": 0.7516,
"step": 57
},
{
"epoch": 0.19473978735310576,
"grad_norm": 2.464277933743688,
"learning_rate": 6.333333333333333e-06,
"loss": 0.7809,
"step": 58
},
{
"epoch": 0.19809736989367654,
"grad_norm": 2.4742771459078763,
"learning_rate": 6.444444444444445e-06,
"loss": 0.7734,
"step": 59
},
{
"epoch": 0.20145495243424735,
"grad_norm": 2.4284525343661794,
"learning_rate": 6.555555555555556e-06,
"loss": 0.7763,
"step": 60
},
{
"epoch": 0.20481253497481813,
"grad_norm": 2.4953358610535985,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7742,
"step": 61
},
{
"epoch": 0.2081701175153889,
"grad_norm": 2.221862854493057,
"learning_rate": 6.777777777777779e-06,
"loss": 0.7388,
"step": 62
},
{
"epoch": 0.21152770005595972,
"grad_norm": 2.4115958840359135,
"learning_rate": 6.88888888888889e-06,
"loss": 0.8117,
"step": 63
},
{
"epoch": 0.2148852825965305,
"grad_norm": 2.247325502758182,
"learning_rate": 7e-06,
"loss": 0.874,
"step": 64
},
{
"epoch": 0.21824286513710128,
"grad_norm": 2.459302207580601,
"learning_rate": 7.111111111111112e-06,
"loss": 0.7753,
"step": 65
},
{
"epoch": 0.22160044767767206,
"grad_norm": 2.3615999287857194,
"learning_rate": 7.222222222222223e-06,
"loss": 0.7364,
"step": 66
},
{
"epoch": 0.22495803021824287,
"grad_norm": 2.345803430121652,
"learning_rate": 7.333333333333333e-06,
"loss": 0.7828,
"step": 67
},
{
"epoch": 0.22831561275881365,
"grad_norm": 2.3409970367083095,
"learning_rate": 7.444444444444445e-06,
"loss": 0.7061,
"step": 68
},
{
"epoch": 0.23167319529938443,
"grad_norm": 2.4774854380802624,
"learning_rate": 7.555555555555556e-06,
"loss": 0.7717,
"step": 69
},
{
"epoch": 0.23503077783995524,
"grad_norm": 2.5208849967610782,
"learning_rate": 7.666666666666667e-06,
"loss": 0.7771,
"step": 70
},
{
"epoch": 0.23838836038052602,
"grad_norm": 2.6549693177981055,
"learning_rate": 7.77777777777778e-06,
"loss": 0.8112,
"step": 71
},
{
"epoch": 0.2417459429210968,
"grad_norm": 2.832698323256436,
"learning_rate": 7.88888888888889e-06,
"loss": 0.7739,
"step": 72
},
{
"epoch": 0.2451035254616676,
"grad_norm": 2.5304942003986453,
"learning_rate": 8.000000000000001e-06,
"loss": 0.8008,
"step": 73
},
{
"epoch": 0.2484611080022384,
"grad_norm": 2.431052677971174,
"learning_rate": 8.111111111111112e-06,
"loss": 0.8011,
"step": 74
},
{
"epoch": 0.2518186905428092,
"grad_norm": 2.2473360155050286,
"learning_rate": 8.222222222222222e-06,
"loss": 0.7438,
"step": 75
},
{
"epoch": 0.25517627308338,
"grad_norm": 2.5408424868361017,
"learning_rate": 8.333333333333334e-06,
"loss": 0.8015,
"step": 76
},
{
"epoch": 0.25853385562395076,
"grad_norm": 2.3659875362077996,
"learning_rate": 8.444444444444446e-06,
"loss": 0.7927,
"step": 77
},
{
"epoch": 0.26189143816452154,
"grad_norm": 2.3184771169883636,
"learning_rate": 8.555555555555556e-06,
"loss": 0.7832,
"step": 78
},
{
"epoch": 0.2652490207050923,
"grad_norm": 2.426200561656744,
"learning_rate": 8.666666666666668e-06,
"loss": 0.7475,
"step": 79
},
{
"epoch": 0.2686066032456631,
"grad_norm": 2.39729048396846,
"learning_rate": 8.777777777777778e-06,
"loss": 0.8269,
"step": 80
},
{
"epoch": 0.27196418578623394,
"grad_norm": 2.375526510228167,
"learning_rate": 8.888888888888888e-06,
"loss": 0.7924,
"step": 81
},
{
"epoch": 0.2753217683268047,
"grad_norm": 2.2496572603077833,
"learning_rate": 9e-06,
"loss": 0.8096,
"step": 82
},
{
"epoch": 0.2786793508673755,
"grad_norm": 2.3057931599473913,
"learning_rate": 9.111111111111112e-06,
"loss": 0.7586,
"step": 83
},
{
"epoch": 0.2820369334079463,
"grad_norm": 2.3828592253623784,
"learning_rate": 9.222222222222224e-06,
"loss": 0.7741,
"step": 84
},
{
"epoch": 0.28539451594851706,
"grad_norm": 2.479750143175691,
"learning_rate": 9.333333333333334e-06,
"loss": 0.7718,
"step": 85
},
{
"epoch": 0.28875209848908784,
"grad_norm": 2.2810057071437466,
"learning_rate": 9.444444444444445e-06,
"loss": 0.7687,
"step": 86
},
{
"epoch": 0.2921096810296586,
"grad_norm": 2.3537687144315655,
"learning_rate": 9.555555555555556e-06,
"loss": 0.7694,
"step": 87
},
{
"epoch": 0.29546726357022945,
"grad_norm": 2.480085659080849,
"learning_rate": 9.666666666666667e-06,
"loss": 0.8077,
"step": 88
},
{
"epoch": 0.29882484611080024,
"grad_norm": 2.3071631718134733,
"learning_rate": 9.777777777777779e-06,
"loss": 0.8218,
"step": 89
},
{
"epoch": 0.302182428651371,
"grad_norm": 2.42645063403485,
"learning_rate": 9.88888888888889e-06,
"loss": 0.7613,
"step": 90
},
{
"epoch": 0.3055400111919418,
"grad_norm": 2.2871396355674958,
"learning_rate": 1e-05,
"loss": 0.7597,
"step": 91
},
{
"epoch": 0.3088975937325126,
"grad_norm": 2.550927280073565,
"learning_rate": 9.999961543109546e-06,
"loss": 0.7757,
"step": 92
},
{
"epoch": 0.31225517627308336,
"grad_norm": 2.27888182793667,
"learning_rate": 9.999846173029752e-06,
"loss": 0.7865,
"step": 93
},
{
"epoch": 0.3156127588136542,
"grad_norm": 2.1772873128455807,
"learning_rate": 9.99965389153533e-06,
"loss": 0.7405,
"step": 94
},
{
"epoch": 0.318970341354225,
"grad_norm": 2.313731791988704,
"learning_rate": 9.999384701584098e-06,
"loss": 0.7605,
"step": 95
},
{
"epoch": 0.32232792389479575,
"grad_norm": 2.66647271515439,
"learning_rate": 9.999038607316942e-06,
"loss": 0.8139,
"step": 96
},
{
"epoch": 0.32568550643536653,
"grad_norm": 2.345938824538182,
"learning_rate": 9.998615614057743e-06,
"loss": 0.7782,
"step": 97
},
{
"epoch": 0.3290430889759373,
"grad_norm": 2.445303562264931,
"learning_rate": 9.998115728313305e-06,
"loss": 0.7628,
"step": 98
},
{
"epoch": 0.3324006715165081,
"grad_norm": 2.3792304151925974,
"learning_rate": 9.997538957773248e-06,
"loss": 0.773,
"step": 99
},
{
"epoch": 0.33575825405707893,
"grad_norm": 2.3052323825950816,
"learning_rate": 9.996885311309892e-06,
"loss": 0.8015,
"step": 100
},
{
"epoch": 0.3391158365976497,
"grad_norm": 2.2653536708074338,
"learning_rate": 9.996154798978122e-06,
"loss": 0.759,
"step": 101
},
{
"epoch": 0.3424734191382205,
"grad_norm": 2.445188297437759,
"learning_rate": 9.99534743201523e-06,
"loss": 0.7828,
"step": 102
},
{
"epoch": 0.34583100167879127,
"grad_norm": 2.1926261320850555,
"learning_rate": 9.994463222840748e-06,
"loss": 0.7518,
"step": 103
},
{
"epoch": 0.34918858421936205,
"grad_norm": 2.3228023822506154,
"learning_rate": 9.993502185056244e-06,
"loss": 0.7541,
"step": 104
},
{
"epoch": 0.35254616675993283,
"grad_norm": 2.361098770321157,
"learning_rate": 9.992464333445134e-06,
"loss": 0.6949,
"step": 105
},
{
"epoch": 0.3559037493005036,
"grad_norm": 2.2629507834451146,
"learning_rate": 9.991349683972435e-06,
"loss": 0.7731,
"step": 106
},
{
"epoch": 0.35926133184107445,
"grad_norm": 2.264400946281939,
"learning_rate": 9.990158253784525e-06,
"loss": 0.7746,
"step": 107
},
{
"epoch": 0.36261891438164523,
"grad_norm": 2.536408740504836,
"learning_rate": 9.988890061208889e-06,
"loss": 0.757,
"step": 108
},
{
"epoch": 0.365976496922216,
"grad_norm": 2.1481392669710657,
"learning_rate": 9.987545125753818e-06,
"loss": 0.8101,
"step": 109
},
{
"epoch": 0.3693340794627868,
"grad_norm": 2.271583484189747,
"learning_rate": 9.986123468108134e-06,
"loss": 0.7716,
"step": 110
},
{
"epoch": 0.37269166200335757,
"grad_norm": 2.301407093107629,
"learning_rate": 9.984625110140844e-06,
"loss": 0.7842,
"step": 111
},
{
"epoch": 0.37604924454392835,
"grad_norm": 2.3089880928671045,
"learning_rate": 9.983050074900824e-06,
"loss": 0.7452,
"step": 112
},
{
"epoch": 0.3794068270844992,
"grad_norm": 2.413625325654982,
"learning_rate": 9.98139838661646e-06,
"loss": 0.7502,
"step": 113
},
{
"epoch": 0.38276440962506997,
"grad_norm": 2.289871304585377,
"learning_rate": 9.979670070695265e-06,
"loss": 0.7708,
"step": 114
},
{
"epoch": 0.38612199216564075,
"grad_norm": 2.290612460056919,
"learning_rate": 9.977865153723508e-06,
"loss": 0.784,
"step": 115
},
{
"epoch": 0.38947957470621153,
"grad_norm": 2.3027870660016725,
"learning_rate": 9.97598366346578e-06,
"loss": 0.7534,
"step": 116
},
{
"epoch": 0.3928371572467823,
"grad_norm": 2.2479898856154015,
"learning_rate": 9.974025628864592e-06,
"loss": 0.7388,
"step": 117
},
{
"epoch": 0.3961947397873531,
"grad_norm": 2.20503493895051,
"learning_rate": 9.971991080039912e-06,
"loss": 0.763,
"step": 118
},
{
"epoch": 0.39955232232792387,
"grad_norm": 2.2879831426439807,
"learning_rate": 9.969880048288704e-06,
"loss": 0.8042,
"step": 119
},
{
"epoch": 0.4029099048684947,
"grad_norm": 2.0895550497991815,
"learning_rate": 9.96769256608446e-06,
"loss": 0.7267,
"step": 120
},
{
"epoch": 0.4062674874090655,
"grad_norm": 2.2118550621037047,
"learning_rate": 9.965428667076687e-06,
"loss": 0.7642,
"step": 121
},
{
"epoch": 0.40962506994963627,
"grad_norm": 2.3589353890246416,
"learning_rate": 9.963088386090386e-06,
"loss": 0.7688,
"step": 122
},
{
"epoch": 0.41298265249020705,
"grad_norm": 2.3245448693859805,
"learning_rate": 9.960671759125529e-06,
"loss": 0.7909,
"step": 123
},
{
"epoch": 0.4163402350307778,
"grad_norm": 2.2267480190116893,
"learning_rate": 9.958178823356503e-06,
"loss": 0.7525,
"step": 124
},
{
"epoch": 0.4196978175713486,
"grad_norm": 2.4295908603398364,
"learning_rate": 9.95560961713153e-06,
"loss": 0.8311,
"step": 125
},
{
"epoch": 0.42305540011191944,
"grad_norm": 2.2939978579098987,
"learning_rate": 9.95296417997208e-06,
"loss": 0.7679,
"step": 126
},
{
"epoch": 0.4264129826524902,
"grad_norm": 2.438507683294902,
"learning_rate": 9.950242552572272e-06,
"loss": 0.783,
"step": 127
},
{
"epoch": 0.429770565193061,
"grad_norm": 2.121254822380545,
"learning_rate": 9.947444776798235e-06,
"loss": 0.7213,
"step": 128
},
{
"epoch": 0.4331281477336318,
"grad_norm": 2.393441549390287,
"learning_rate": 9.944570895687471e-06,
"loss": 0.7832,
"step": 129
},
{
"epoch": 0.43648573027420257,
"grad_norm": 2.354252110767644,
"learning_rate": 9.941620953448195e-06,
"loss": 0.7984,
"step": 130
},
{
"epoch": 0.43984331281477335,
"grad_norm": 2.3944424356758143,
"learning_rate": 9.938594995458644e-06,
"loss": 0.7794,
"step": 131
},
{
"epoch": 0.4432008953553441,
"grad_norm": 2.3095894964107093,
"learning_rate": 9.935493068266396e-06,
"loss": 0.7876,
"step": 132
},
{
"epoch": 0.44655847789591496,
"grad_norm": 2.3120517699348273,
"learning_rate": 9.932315219587641e-06,
"loss": 0.7665,
"step": 133
},
{
"epoch": 0.44991606043648574,
"grad_norm": 2.4246945894908554,
"learning_rate": 9.929061498306448e-06,
"loss": 0.7985,
"step": 134
},
{
"epoch": 0.4532736429770565,
"grad_norm": 2.351327159028273,
"learning_rate": 9.92573195447402e-06,
"loss": 0.8119,
"step": 135
},
{
"epoch": 0.4566312255176273,
"grad_norm": 2.1900583413629042,
"learning_rate": 9.922326639307918e-06,
"loss": 0.753,
"step": 136
},
{
"epoch": 0.4599888080581981,
"grad_norm": 2.39557007686719,
"learning_rate": 9.918845605191274e-06,
"loss": 0.792,
"step": 137
},
{
"epoch": 0.46334639059876886,
"grad_norm": 2.245343140864623,
"learning_rate": 9.915288905671986e-06,
"loss": 0.7924,
"step": 138
},
{
"epoch": 0.4667039731393397,
"grad_norm": 2.095454810542127,
"learning_rate": 9.911656595461899e-06,
"loss": 0.7287,
"step": 139
},
{
"epoch": 0.4700615556799105,
"grad_norm": 2.3661531662229445,
"learning_rate": 9.90794873043595e-06,
"loss": 0.783,
"step": 140
},
{
"epoch": 0.47341913822048126,
"grad_norm": 2.218576799584266,
"learning_rate": 9.904165367631329e-06,
"loss": 0.7682,
"step": 141
},
{
"epoch": 0.47677672076105204,
"grad_norm": 2.2752695851092777,
"learning_rate": 9.900306565246579e-06,
"loss": 0.757,
"step": 142
},
{
"epoch": 0.4801343033016228,
"grad_norm": 2.201550744224962,
"learning_rate": 9.896372382640718e-06,
"loss": 0.7691,
"step": 143
},
{
"epoch": 0.4834918858421936,
"grad_norm": 2.1043983734924185,
"learning_rate": 9.892362880332316e-06,
"loss": 0.7383,
"step": 144
},
{
"epoch": 0.4868494683827644,
"grad_norm": 2.3833966355515153,
"learning_rate": 9.888278119998573e-06,
"loss": 0.807,
"step": 145
},
{
"epoch": 0.4902070509233352,
"grad_norm": 2.0943822596303674,
"learning_rate": 9.884118164474359e-06,
"loss": 0.7899,
"step": 146
},
{
"epoch": 0.493564633463906,
"grad_norm": 2.0793762196537524,
"learning_rate": 9.879883077751255e-06,
"loss": 0.7425,
"step": 147
},
{
"epoch": 0.4969222160044768,
"grad_norm": 2.2384449240578377,
"learning_rate": 9.875572924976568e-06,
"loss": 0.7857,
"step": 148
},
{
"epoch": 0.5002797985450476,
"grad_norm": 2.0645202722027642,
"learning_rate": 9.871187772452327e-06,
"loss": 0.7932,
"step": 149
},
{
"epoch": 0.5036373810856184,
"grad_norm": 2.2655225270221377,
"learning_rate": 9.866727687634266e-06,
"loss": 0.7613,
"step": 150
},
{
"epoch": 0.5069949636261891,
"grad_norm": 2.139175314801808,
"learning_rate": 9.86219273913078e-06,
"loss": 0.7728,
"step": 151
},
{
"epoch": 0.51035254616676,
"grad_norm": 2.2148970746168932,
"learning_rate": 9.857582996701878e-06,
"loss": 0.7613,
"step": 152
},
{
"epoch": 0.5137101287073307,
"grad_norm": 2.3638044057711065,
"learning_rate": 9.852898531258102e-06,
"loss": 0.7538,
"step": 153
},
{
"epoch": 0.5170677112479015,
"grad_norm": 2.16926286809674,
"learning_rate": 9.848139414859441e-06,
"loss": 0.7518,
"step": 154
},
{
"epoch": 0.5204252937884724,
"grad_norm": 2.264174308627129,
"learning_rate": 9.843305720714227e-06,
"loss": 0.758,
"step": 155
},
{
"epoch": 0.5237828763290431,
"grad_norm": 2.1658356317455403,
"learning_rate": 9.838397523177993e-06,
"loss": 0.7508,
"step": 156
},
{
"epoch": 0.5271404588696139,
"grad_norm": 2.2969058345093116,
"learning_rate": 9.833414897752346e-06,
"loss": 0.7595,
"step": 157
},
{
"epoch": 0.5304980414101846,
"grad_norm": 2.180966154918318,
"learning_rate": 9.828357921083803e-06,
"loss": 0.7734,
"step": 158
},
{
"epoch": 0.5338556239507555,
"grad_norm": 2.236205762743028,
"learning_rate": 9.823226670962598e-06,
"loss": 0.821,
"step": 159
},
{
"epoch": 0.5372132064913262,
"grad_norm": 2.4457753187441837,
"learning_rate": 9.818021226321502e-06,
"loss": 0.8161,
"step": 160
},
{
"epoch": 0.540570789031897,
"grad_norm": 2.1706687105611313,
"learning_rate": 9.812741667234599e-06,
"loss": 0.7693,
"step": 161
},
{
"epoch": 0.5439283715724679,
"grad_norm": 2.1712893591002045,
"learning_rate": 9.807388074916064e-06,
"loss": 0.759,
"step": 162
},
{
"epoch": 0.5472859541130386,
"grad_norm": 2.1454942490466675,
"learning_rate": 9.801960531718898e-06,
"loss": 0.7605,
"step": 163
},
{
"epoch": 0.5506435366536094,
"grad_norm": 2.22853836765068,
"learning_rate": 9.796459121133675e-06,
"loss": 0.8167,
"step": 164
},
{
"epoch": 0.5540011191941802,
"grad_norm": 2.147190276259434,
"learning_rate": 9.790883927787254e-06,
"loss": 0.7771,
"step": 165
},
{
"epoch": 0.557358701734751,
"grad_norm": 2.177712396336002,
"learning_rate": 9.785235037441473e-06,
"loss": 0.7749,
"step": 166
},
{
"epoch": 0.5607162842753217,
"grad_norm": 2.2764867484419353,
"learning_rate": 9.779512536991839e-06,
"loss": 0.7186,
"step": 167
},
{
"epoch": 0.5640738668158926,
"grad_norm": 2.0316602958146177,
"learning_rate": 9.773716514466179e-06,
"loss": 0.7092,
"step": 168
},
{
"epoch": 0.5674314493564634,
"grad_norm": 2.335399900268128,
"learning_rate": 9.767847059023292e-06,
"loss": 0.7561,
"step": 169
},
{
"epoch": 0.5707890318970341,
"grad_norm": 2.1709764537143945,
"learning_rate": 9.761904260951583e-06,
"loss": 0.7802,
"step": 170
},
{
"epoch": 0.574146614437605,
"grad_norm": 2.0593129146431512,
"learning_rate": 9.755888211667663e-06,
"loss": 0.7301,
"step": 171
},
{
"epoch": 0.5775041969781757,
"grad_norm": 2.441441500782324,
"learning_rate": 9.749799003714954e-06,
"loss": 0.7799,
"step": 172
},
{
"epoch": 0.5808617795187465,
"grad_norm": 2.2660299178551773,
"learning_rate": 9.743636730762259e-06,
"loss": 0.7827,
"step": 173
},
{
"epoch": 0.5842193620593172,
"grad_norm": 2.1765013932906396,
"learning_rate": 9.737401487602314e-06,
"loss": 0.7267,
"step": 174
},
{
"epoch": 0.5875769445998881,
"grad_norm": 2.3327142301922956,
"learning_rate": 9.731093370150349e-06,
"loss": 0.7456,
"step": 175
},
{
"epoch": 0.5909345271404589,
"grad_norm": 2.1506465746934973,
"learning_rate": 9.724712475442597e-06,
"loss": 0.7703,
"step": 176
},
{
"epoch": 0.5942921096810296,
"grad_norm": 2.356569475164915,
"learning_rate": 9.718258901634802e-06,
"loss": 0.7102,
"step": 177
},
{
"epoch": 0.5976496922216005,
"grad_norm": 2.232872599999044,
"learning_rate": 9.71173274800072e-06,
"loss": 0.7432,
"step": 178
},
{
"epoch": 0.6010072747621712,
"grad_norm": 2.2094954984314996,
"learning_rate": 9.70513411493058e-06,
"loss": 0.7298,
"step": 179
},
{
"epoch": 0.604364857302742,
"grad_norm": 2.247936712152706,
"learning_rate": 9.698463103929542e-06,
"loss": 0.7627,
"step": 180
},
{
"epoch": 0.6077224398433129,
"grad_norm": 2.178442031708872,
"learning_rate": 9.691719817616148e-06,
"loss": 0.747,
"step": 181
},
{
"epoch": 0.6110800223838836,
"grad_norm": 1.963354472091314,
"learning_rate": 9.684904359720724e-06,
"loss": 0.7338,
"step": 182
},
{
"epoch": 0.6144376049244544,
"grad_norm": 2.353759222193367,
"learning_rate": 9.678016835083798e-06,
"loss": 0.7535,
"step": 183
},
{
"epoch": 0.6177951874650252,
"grad_norm": 2.1917086328236373,
"learning_rate": 9.671057349654481e-06,
"loss": 0.8249,
"step": 184
},
{
"epoch": 0.621152770005596,
"grad_norm": 2.068590516399619,
"learning_rate": 9.66402601048884e-06,
"loss": 0.7565,
"step": 185
},
{
"epoch": 0.6245103525461667,
"grad_norm": 2.233193447193882,
"learning_rate": 9.656922925748254e-06,
"loss": 0.779,
"step": 186
},
{
"epoch": 0.6278679350867375,
"grad_norm": 1.9913300969947432,
"learning_rate": 9.649748204697741e-06,
"loss": 0.7111,
"step": 187
},
{
"epoch": 0.6312255176273084,
"grad_norm": 2.0770537048721205,
"learning_rate": 9.642501957704287e-06,
"loss": 0.7737,
"step": 188
},
{
"epoch": 0.6345831001678791,
"grad_norm": 2.026052626547883,
"learning_rate": 9.63518429623514e-06,
"loss": 0.7678,
"step": 189
},
{
"epoch": 0.63794068270845,
"grad_norm": 2.004420216986479,
"learning_rate": 9.627795332856107e-06,
"loss": 0.7765,
"step": 190
},
{
"epoch": 0.6412982652490207,
"grad_norm": 2.0936509457969374,
"learning_rate": 9.620335181229805e-06,
"loss": 0.7583,
"step": 191
},
{
"epoch": 0.6446558477895915,
"grad_norm": 2.038743312113399,
"learning_rate": 9.612803956113932e-06,
"loss": 0.7755,
"step": 192
},
{
"epoch": 0.6480134303301622,
"grad_norm": 2.1648346294989294,
"learning_rate": 9.605201773359485e-06,
"loss": 0.7125,
"step": 193
},
{
"epoch": 0.6513710128707331,
"grad_norm": 2.023535905337207,
"learning_rate": 9.59752874990899e-06,
"loss": 0.72,
"step": 194
},
{
"epoch": 0.6547285954113039,
"grad_norm": 2.0647928726363776,
"learning_rate": 9.589785003794692e-06,
"loss": 0.741,
"step": 195
},
{
"epoch": 0.6580861779518746,
"grad_norm": 2.657611602927363,
"learning_rate": 9.581970654136752e-06,
"loss": 0.7723,
"step": 196
},
{
"epoch": 0.6614437604924455,
"grad_norm": 2.211743855012671,
"learning_rate": 9.574085821141406e-06,
"loss": 0.754,
"step": 197
},
{
"epoch": 0.6648013430330162,
"grad_norm": 2.131488016325981,
"learning_rate": 9.566130626099118e-06,
"loss": 0.7738,
"step": 198
},
{
"epoch": 0.668158925573587,
"grad_norm": 2.1420712267681195,
"learning_rate": 9.55810519138271e-06,
"loss": 0.781,
"step": 199
},
{
"epoch": 0.6715165081141579,
"grad_norm": 2.245825883364256,
"learning_rate": 9.550009640445492e-06,
"loss": 0.7606,
"step": 200
},
{
"epoch": 0.6748740906547286,
"grad_norm": 2.027082764830745,
"learning_rate": 9.541844097819347e-06,
"loss": 0.7535,
"step": 201
},
{
"epoch": 0.6782316731952994,
"grad_norm": 1.9877374251586413,
"learning_rate": 9.533608689112827e-06,
"loss": 0.7559,
"step": 202
},
{
"epoch": 0.6815892557358701,
"grad_norm": 1.9927267340906514,
"learning_rate": 9.525303541009218e-06,
"loss": 0.6754,
"step": 203
},
{
"epoch": 0.684946838276441,
"grad_norm": 2.092882170995953,
"learning_rate": 9.516928781264588e-06,
"loss": 0.7431,
"step": 204
},
{
"epoch": 0.6883044208170117,
"grad_norm": 2.0525414950537972,
"learning_rate": 9.508484538705823e-06,
"loss": 0.7649,
"step": 205
},
{
"epoch": 0.6916620033575825,
"grad_norm": 1.9668368596759938,
"learning_rate": 9.499970943228646e-06,
"loss": 0.7218,
"step": 206
},
{
"epoch": 0.6950195858981534,
"grad_norm": 1.9745189748654561,
"learning_rate": 9.491388125795623e-06,
"loss": 0.7104,
"step": 207
},
{
"epoch": 0.6983771684387241,
"grad_norm": 2.201042855482161,
"learning_rate": 9.482736218434144e-06,
"loss": 0.7477,
"step": 208
},
{
"epoch": 0.7017347509792949,
"grad_norm": 2.0154595111164895,
"learning_rate": 9.474015354234385e-06,
"loss": 0.7587,
"step": 209
},
{
"epoch": 0.7050923335198657,
"grad_norm": 2.313239702363474,
"learning_rate": 9.465225667347275e-06,
"loss": 0.7292,
"step": 210
},
{
"epoch": 0.7084499160604365,
"grad_norm": 2.025272477121896,
"learning_rate": 9.45636729298243e-06,
"loss": 0.7241,
"step": 211
},
{
"epoch": 0.7118074986010072,
"grad_norm": 2.0795424496465573,
"learning_rate": 9.447440367406053e-06,
"loss": 0.7458,
"step": 212
},
{
"epoch": 0.7151650811415781,
"grad_norm": 2.320432869755114,
"learning_rate": 9.438445027938873e-06,
"loss": 0.76,
"step": 213
},
{
"epoch": 0.7185226636821489,
"grad_norm": 2.1418660928160875,
"learning_rate": 9.429381412954e-06,
"loss": 0.7463,
"step": 214
},
{
"epoch": 0.7218802462227196,
"grad_norm": 2.037263699160382,
"learning_rate": 9.420249661874812e-06,
"loss": 0.7562,
"step": 215
},
{
"epoch": 0.7252378287632905,
"grad_norm": 2.0664873609895844,
"learning_rate": 9.41104991517281e-06,
"loss": 0.719,
"step": 216
},
{
"epoch": 0.7285954113038612,
"grad_norm": 2.13157533259705,
"learning_rate": 9.401782314365458e-06,
"loss": 0.7611,
"step": 217
},
{
"epoch": 0.731952993844432,
"grad_norm": 1.9677915863746518,
"learning_rate": 9.392447002013996e-06,
"loss": 0.7241,
"step": 218
},
{
"epoch": 0.7353105763850027,
"grad_norm": 2.0599615713607466,
"learning_rate": 9.383044121721257e-06,
"loss": 0.7413,
"step": 219
},
{
"epoch": 0.7386681589255736,
"grad_norm": 2.0298587251271107,
"learning_rate": 9.37357381812946e-06,
"loss": 0.7479,
"step": 220
},
{
"epoch": 0.7420257414661444,
"grad_norm": 1.9944542234012115,
"learning_rate": 9.364036236917972e-06,
"loss": 0.6834,
"step": 221
},
{
"epoch": 0.7453833240067151,
"grad_norm": 2.0288217614081265,
"learning_rate": 9.354431524801082e-06,
"loss": 0.7512,
"step": 222
},
{
"epoch": 0.748740906547286,
"grad_norm": 2.0374680319858514,
"learning_rate": 9.344759829525734e-06,
"loss": 0.7138,
"step": 223
},
{
"epoch": 0.7520984890878567,
"grad_norm": 1.911969221157047,
"learning_rate": 9.335021299869256e-06,
"loss": 0.7382,
"step": 224
},
{
"epoch": 0.7554560716284275,
"grad_norm": 1.9835399597591075,
"learning_rate": 9.32521608563708e-06,
"loss": 0.7436,
"step": 225
},
{
"epoch": 0.7588136541689984,
"grad_norm": 2.0847726685364045,
"learning_rate": 9.315344337660422e-06,
"loss": 0.7364,
"step": 226
},
{
"epoch": 0.7621712367095691,
"grad_norm": 2.0415670551670426,
"learning_rate": 9.305406207793974e-06,
"loss": 0.7225,
"step": 227
},
{
"epoch": 0.7655288192501399,
"grad_norm": 1.987105167453389,
"learning_rate": 9.295401848913569e-06,
"loss": 0.7458,
"step": 228
},
{
"epoch": 0.7688864017907107,
"grad_norm": 1.9316452047409314,
"learning_rate": 9.285331414913816e-06,
"loss": 0.6967,
"step": 229
},
{
"epoch": 0.7722439843312815,
"grad_norm": 1.9874288288055189,
"learning_rate": 9.275195060705749e-06,
"loss": 0.7501,
"step": 230
},
{
"epoch": 0.7756015668718522,
"grad_norm": 2.0963531264104533,
"learning_rate": 9.264992942214427e-06,
"loss": 0.7236,
"step": 231
},
{
"epoch": 0.7789591494124231,
"grad_norm": 2.0961978727374397,
"learning_rate": 9.254725216376562e-06,
"loss": 0.7666,
"step": 232
},
{
"epoch": 0.7823167319529939,
"grad_norm": 1.981574993630262,
"learning_rate": 9.244392041138068e-06,
"loss": 0.7449,
"step": 233
},
{
"epoch": 0.7856743144935646,
"grad_norm": 1.907742433400848,
"learning_rate": 9.233993575451663e-06,
"loss": 0.7052,
"step": 234
},
{
"epoch": 0.7890318970341355,
"grad_norm": 2.0188171238038426,
"learning_rate": 9.223529979274411e-06,
"loss": 0.7166,
"step": 235
},
{
"epoch": 0.7923894795747062,
"grad_norm": 2.047444273717583,
"learning_rate": 9.213001413565259e-06,
"loss": 0.7614,
"step": 236
},
{
"epoch": 0.795747062115277,
"grad_norm": 2.0237204195979603,
"learning_rate": 9.202408040282567e-06,
"loss": 0.7407,
"step": 237
},
{
"epoch": 0.7991046446558477,
"grad_norm": 2.132189489227427,
"learning_rate": 9.191750022381613e-06,
"loss": 0.76,
"step": 238
},
{
"epoch": 0.8024622271964186,
"grad_norm": 1.9452576363735405,
"learning_rate": 9.181027523812088e-06,
"loss": 0.6906,
"step": 239
},
{
"epoch": 0.8058198097369894,
"grad_norm": 2.0526086557604204,
"learning_rate": 9.170240709515573e-06,
"loss": 0.7492,
"step": 240
},
{
"epoch": 0.8091773922775601,
"grad_norm": 2.127542984513071,
"learning_rate": 9.159389745423003e-06,
"loss": 0.753,
"step": 241
},
{
"epoch": 0.812534974818131,
"grad_norm": 2.190600637823083,
"learning_rate": 9.14847479845211e-06,
"loss": 0.7687,
"step": 242
},
{
"epoch": 0.8158925573587017,
"grad_norm": 1.976664413924339,
"learning_rate": 9.137496036504868e-06,
"loss": 0.7236,
"step": 243
},
{
"epoch": 0.8192501398992725,
"grad_norm": 2.0073069621610813,
"learning_rate": 9.126453628464889e-06,
"loss": 0.7513,
"step": 244
},
{
"epoch": 0.8226077224398433,
"grad_norm": 2.0275714232847366,
"learning_rate": 9.115347744194844e-06,
"loss": 0.7117,
"step": 245
},
{
"epoch": 0.8259653049804141,
"grad_norm": 1.950051870826525,
"learning_rate": 9.10417855453385e-06,
"loss": 0.7421,
"step": 246
},
{
"epoch": 0.8293228875209849,
"grad_norm": 1.9776365232195794,
"learning_rate": 9.09294623129482e-06,
"loss": 0.7683,
"step": 247
},
{
"epoch": 0.8326804700615557,
"grad_norm": 1.9425183877927914,
"learning_rate": 9.081650947261847e-06,
"loss": 0.7454,
"step": 248
},
{
"epoch": 0.8360380526021265,
"grad_norm": 2.0588781735343926,
"learning_rate": 9.070292876187532e-06,
"loss": 0.7511,
"step": 249
},
{
"epoch": 0.8393956351426972,
"grad_norm": 2.1208079935750734,
"learning_rate": 9.058872192790314e-06,
"loss": 0.7594,
"step": 250
},
{
"epoch": 0.842753217683268,
"grad_norm": 1.960725561854021,
"learning_rate": 9.047389072751777e-06,
"loss": 0.7164,
"step": 251
},
{
"epoch": 0.8461108002238389,
"grad_norm": 2.073474220676331,
"learning_rate": 9.035843692713961e-06,
"loss": 0.7256,
"step": 252
},
{
"epoch": 0.8494683827644096,
"grad_norm": 2.0957908157579137,
"learning_rate": 9.02423623027663e-06,
"loss": 0.7307,
"step": 253
},
{
"epoch": 0.8528259653049804,
"grad_norm": 2.152126139365395,
"learning_rate": 9.012566863994548e-06,
"loss": 0.7434,
"step": 254
},
{
"epoch": 0.8561835478455512,
"grad_norm": 2.024800247472397,
"learning_rate": 9.000835773374733e-06,
"loss": 0.7454,
"step": 255
},
{
"epoch": 0.859541130386122,
"grad_norm": 1.9021347801188042,
"learning_rate": 8.98904313887369e-06,
"loss": 0.7057,
"step": 256
},
{
"epoch": 0.8628987129266927,
"grad_norm": 2.1498786642675607,
"learning_rate": 8.977189141894645e-06,
"loss": 0.7711,
"step": 257
},
{
"epoch": 0.8662562954672636,
"grad_norm": 1.92161399371383,
"learning_rate": 8.965273964784735e-06,
"loss": 0.6948,
"step": 258
},
{
"epoch": 0.8696138780078344,
"grad_norm": 2.064988715065064,
"learning_rate": 8.953297790832231e-06,
"loss": 0.7545,
"step": 259
},
{
"epoch": 0.8729714605484051,
"grad_norm": 2.209448102356102,
"learning_rate": 8.941260804263697e-06,
"loss": 0.7427,
"step": 260
},
{
"epoch": 0.876329043088976,
"grad_norm": 2.0663786056657165,
"learning_rate": 8.929163190241157e-06,
"loss": 0.7129,
"step": 261
},
{
"epoch": 0.8796866256295467,
"grad_norm": 1.9398344638285674,
"learning_rate": 8.917005134859263e-06,
"loss": 0.6766,
"step": 262
},
{
"epoch": 0.8830442081701175,
"grad_norm": 2.1841210939318914,
"learning_rate": 8.904786825142416e-06,
"loss": 0.7312,
"step": 263
},
{
"epoch": 0.8864017907106883,
"grad_norm": 2.077339438871376,
"learning_rate": 8.892508449041893e-06,
"loss": 0.752,
"step": 264
},
{
"epoch": 0.8897593732512591,
"grad_norm": 2.0959324603771123,
"learning_rate": 8.88017019543296e-06,
"loss": 0.741,
"step": 265
},
{
"epoch": 0.8931169557918299,
"grad_norm": 2.0026594953656427,
"learning_rate": 8.867772254111966e-06,
"loss": 0.7121,
"step": 266
},
{
"epoch": 0.8964745383324007,
"grad_norm": 1.9765771238302214,
"learning_rate": 8.85531481579342e-06,
"loss": 0.7259,
"step": 267
},
{
"epoch": 0.8998321208729715,
"grad_norm": 2.081056505352585,
"learning_rate": 8.842798072107055e-06,
"loss": 0.8211,
"step": 268
},
{
"epoch": 0.9031897034135422,
"grad_norm": 2.0635888708323313,
"learning_rate": 8.83022221559489e-06,
"loss": 0.7545,
"step": 269
},
{
"epoch": 0.906547285954113,
"grad_norm": 1.9645226085630179,
"learning_rate": 8.81758743970826e-06,
"loss": 0.7097,
"step": 270
},
{
"epoch": 0.9099048684946839,
"grad_norm": 2.090055548313761,
"learning_rate": 8.804893938804839e-06,
"loss": 0.7085,
"step": 271
},
{
"epoch": 0.9132624510352546,
"grad_norm": 1.9631224971590835,
"learning_rate": 8.79214190814566e-06,
"loss": 0.749,
"step": 272
},
{
"epoch": 0.9166200335758254,
"grad_norm": 2.151696624831223,
"learning_rate": 8.779331543892097e-06,
"loss": 0.7437,
"step": 273
},
{
"epoch": 0.9199776161163962,
"grad_norm": 2.0673655977509533,
"learning_rate": 8.766463043102864e-06,
"loss": 0.7405,
"step": 274
},
{
"epoch": 0.923335198656967,
"grad_norm": 1.915638224161175,
"learning_rate": 8.75353660373097e-06,
"loss": 0.7099,
"step": 275
},
{
"epoch": 0.9266927811975377,
"grad_norm": 2.143796983074377,
"learning_rate": 8.740552424620679e-06,
"loss": 0.6971,
"step": 276
},
{
"epoch": 0.9300503637381086,
"grad_norm": 2.057152351652913,
"learning_rate": 8.727510705504453e-06,
"loss": 0.7293,
"step": 277
},
{
"epoch": 0.9334079462786794,
"grad_norm": 2.1240051292960116,
"learning_rate": 8.714411646999878e-06,
"loss": 0.7741,
"step": 278
},
{
"epoch": 0.9367655288192501,
"grad_norm": 2.0326991553451945,
"learning_rate": 8.701255450606579e-06,
"loss": 0.7643,
"step": 279
},
{
"epoch": 0.940123111359821,
"grad_norm": 2.088415602611133,
"learning_rate": 8.688042318703111e-06,
"loss": 0.7464,
"step": 280
},
{
"epoch": 0.9434806939003917,
"grad_norm": 2.1711465158973393,
"learning_rate": 8.674772454543869e-06,
"loss": 0.7103,
"step": 281
},
{
"epoch": 0.9468382764409625,
"grad_norm": 1.9500315142474602,
"learning_rate": 8.661446062255931e-06,
"loss": 0.6947,
"step": 282
},
{
"epoch": 0.9501958589815332,
"grad_norm": 1.846901032667399,
"learning_rate": 8.648063346835943e-06,
"loss": 0.726,
"step": 283
},
{
"epoch": 0.9535534415221041,
"grad_norm": 2.078490317740966,
"learning_rate": 8.634624514146954e-06,
"loss": 0.7353,
"step": 284
},
{
"epoch": 0.9569110240626749,
"grad_norm": 2.02753516538753,
"learning_rate": 8.621129770915248e-06,
"loss": 0.7712,
"step": 285
},
{
"epoch": 0.9602686066032456,
"grad_norm": 1.9463320169189116,
"learning_rate": 8.607579324727175e-06,
"loss": 0.7472,
"step": 286
},
{
"epoch": 0.9636261891438165,
"grad_norm": 1.9444246710199546,
"learning_rate": 8.59397338402594e-06,
"loss": 0.7502,
"step": 287
},
{
"epoch": 0.9669837716843872,
"grad_norm": 1.9590062622910647,
"learning_rate": 8.580312158108413e-06,
"loss": 0.7464,
"step": 288
},
{
"epoch": 0.970341354224958,
"grad_norm": 1.956148304515578,
"learning_rate": 8.566595857121902e-06,
"loss": 0.7099,
"step": 289
},
{
"epoch": 0.9736989367655288,
"grad_norm": 1.8926398800628126,
"learning_rate": 8.55282469206092e-06,
"loss": 0.7299,
"step": 290
},
{
"epoch": 0.9770565193060996,
"grad_norm": 2.0917641075340123,
"learning_rate": 8.538998874763942e-06,
"loss": 0.7639,
"step": 291
},
{
"epoch": 0.9804141018466704,
"grad_norm": 1.9718732131481036,
"learning_rate": 8.525118617910144e-06,
"loss": 0.7547,
"step": 292
},
{
"epoch": 0.9837716843872412,
"grad_norm": 1.9069388930319855,
"learning_rate": 8.511184135016134e-06,
"loss": 0.7309,
"step": 293
},
{
"epoch": 0.987129266927812,
"grad_norm": 2.014949861976609,
"learning_rate": 8.497195640432664e-06,
"loss": 0.7261,
"step": 294
},
{
"epoch": 0.9904868494683827,
"grad_norm": 1.9120513950060911,
"learning_rate": 8.483153349341336e-06,
"loss": 0.7166,
"step": 295
},
{
"epoch": 0.9938444320089536,
"grad_norm": 1.8053701743199972,
"learning_rate": 8.46905747775129e-06,
"loss": 0.6816,
"step": 296
},
{
"epoch": 0.9972020145495244,
"grad_norm": 2.003732103494112,
"learning_rate": 8.45490824249588e-06,
"loss": 0.7147,
"step": 297
},
{
"epoch": 1.0033575825405707,
"grad_norm": 8.303350987198142,
"learning_rate": 8.440705861229344e-06,
"loss": 1.2836,
"step": 298
},
{
"epoch": 1.0067151650811417,
"grad_norm": 2.2471820576545714,
"learning_rate": 8.426450552423451e-06,
"loss": 0.5247,
"step": 299
},
{
"epoch": 1.0100727476217124,
"grad_norm": 2.151639519325533,
"learning_rate": 8.412142535364139e-06,
"loss": 0.5023,
"step": 300
},
{
"epoch": 1.0134303301622831,
"grad_norm": 2.045570754833388,
"learning_rate": 8.397782030148147e-06,
"loss": 0.5212,
"step": 301
},
{
"epoch": 1.0167879127028538,
"grad_norm": 2.157217329180836,
"learning_rate": 8.383369257679625e-06,
"loss": 0.5258,
"step": 302
},
{
"epoch": 1.0201454952434248,
"grad_norm": 2.1393611084312547,
"learning_rate": 8.368904439666739e-06,
"loss": 0.4882,
"step": 303
},
{
"epoch": 1.0235030777839955,
"grad_norm": 2.4819644900313738,
"learning_rate": 8.354387798618254e-06,
"loss": 0.5222,
"step": 304
},
{
"epoch": 1.0268606603245662,
"grad_norm": 2.5039988079708904,
"learning_rate": 8.339819557840124e-06,
"loss": 0.4725,
"step": 305
},
{
"epoch": 1.0302182428651372,
"grad_norm": 2.4034784239124063,
"learning_rate": 8.32519994143204e-06,
"loss": 0.5223,
"step": 306
},
{
"epoch": 1.033575825405708,
"grad_norm": 2.388530410162058,
"learning_rate": 8.310529174284004e-06,
"loss": 0.5291,
"step": 307
},
{
"epoch": 1.0369334079462786,
"grad_norm": 2.3271329489823613,
"learning_rate": 8.295807482072842e-06,
"loss": 0.5197,
"step": 308
},
{
"epoch": 1.0402909904868494,
"grad_norm": 2.1361835792169432,
"learning_rate": 8.281035091258762e-06,
"loss": 0.4758,
"step": 309
},
{
"epoch": 1.0436485730274203,
"grad_norm": 2.2920827998361784,
"learning_rate": 8.266212229081846e-06,
"loss": 0.4927,
"step": 310
},
{
"epoch": 1.047006155567991,
"grad_norm": 2.296117198784697,
"learning_rate": 8.251339123558573e-06,
"loss": 0.4897,
"step": 311
},
{
"epoch": 1.0503637381085618,
"grad_norm": 2.4100622360712793,
"learning_rate": 8.236416003478295e-06,
"loss": 0.4794,
"step": 312
},
{
"epoch": 1.0537213206491327,
"grad_norm": 2.3131605280324865,
"learning_rate": 8.221443098399733e-06,
"loss": 0.4872,
"step": 313
},
{
"epoch": 1.0570789031897034,
"grad_norm": 2.236522744276815,
"learning_rate": 8.206420638647433e-06,
"loss": 0.4945,
"step": 314
},
{
"epoch": 1.0604364857302742,
"grad_norm": 2.2019268672438286,
"learning_rate": 8.191348855308229e-06,
"loss": 0.4766,
"step": 315
},
{
"epoch": 1.063794068270845,
"grad_norm": 2.1588616911267002,
"learning_rate": 8.176227980227693e-06,
"loss": 0.4723,
"step": 316
},
{
"epoch": 1.0671516508114158,
"grad_norm": 2.3900841792146577,
"learning_rate": 8.161058246006558e-06,
"loss": 0.5207,
"step": 317
},
{
"epoch": 1.0705092333519866,
"grad_norm": 2.3583948513012394,
"learning_rate": 8.145839885997146e-06,
"loss": 0.4906,
"step": 318
},
{
"epoch": 1.0738668158925573,
"grad_norm": 2.3746826909440064,
"learning_rate": 8.130573134299782e-06,
"loss": 0.4918,
"step": 319
},
{
"epoch": 1.0772243984331282,
"grad_norm": 2.002312382387202,
"learning_rate": 8.11525822575918e-06,
"loss": 0.4515,
"step": 320
},
{
"epoch": 1.080581980973699,
"grad_norm": 2.2773099709125497,
"learning_rate": 8.099895395960847e-06,
"loss": 0.5124,
"step": 321
},
{
"epoch": 1.0839395635142697,
"grad_norm": 1.9626004596776638,
"learning_rate": 8.084484881227449e-06,
"loss": 0.4867,
"step": 322
},
{
"epoch": 1.0872971460548406,
"grad_norm": 2.2075102999748317,
"learning_rate": 8.069026918615173e-06,
"loss": 0.4901,
"step": 323
},
{
"epoch": 1.0906547285954113,
"grad_norm": 2.1111018515847793,
"learning_rate": 8.05352174591009e-06,
"loss": 0.5072,
"step": 324
},
{
"epoch": 1.094012311135982,
"grad_norm": 2.343955652192936,
"learning_rate": 8.037969601624495e-06,
"loss": 0.5104,
"step": 325
},
{
"epoch": 1.0973698936765528,
"grad_norm": 2.0222492524048734,
"learning_rate": 8.022370724993229e-06,
"loss": 0.4585,
"step": 326
},
{
"epoch": 1.1007274762171237,
"grad_norm": 2.3170486820891316,
"learning_rate": 8.006725355970008e-06,
"loss": 0.4979,
"step": 327
},
{
"epoch": 1.1040850587576945,
"grad_norm": 2.2109613576414646,
"learning_rate": 7.99103373522373e-06,
"loss": 0.4929,
"step": 328
},
{
"epoch": 1.1074426412982652,
"grad_norm": 2.052769278396322,
"learning_rate": 7.975296104134768e-06,
"loss": 0.4891,
"step": 329
},
{
"epoch": 1.1108002238388361,
"grad_norm": 2.197495831185824,
"learning_rate": 7.959512704791269e-06,
"loss": 0.4957,
"step": 330
},
{
"epoch": 1.1141578063794069,
"grad_norm": 2.113288752249262,
"learning_rate": 7.943683779985412e-06,
"loss": 0.4891,
"step": 331
},
{
"epoch": 1.1175153889199776,
"grad_norm": 2.3150223677191604,
"learning_rate": 7.927809573209691e-06,
"loss": 0.4667,
"step": 332
},
{
"epoch": 1.1208729714605483,
"grad_norm": 2.066863441298375,
"learning_rate": 7.911890328653156e-06,
"loss": 0.4485,
"step": 333
},
{
"epoch": 1.1242305540011193,
"grad_norm": 2.1565376459504977,
"learning_rate": 7.895926291197667e-06,
"loss": 0.4817,
"step": 334
},
{
"epoch": 1.12758813654169,
"grad_norm": 2.3475799553693038,
"learning_rate": 7.87991770641412e-06,
"loss": 0.5305,
"step": 335
},
{
"epoch": 1.1309457190822607,
"grad_norm": 2.1203359960845116,
"learning_rate": 7.863864820558669e-06,
"loss": 0.5083,
"step": 336
},
{
"epoch": 1.1343033016228317,
"grad_norm": 2.081087965565155,
"learning_rate": 7.847767880568944e-06,
"loss": 0.4588,
"step": 337
},
{
"epoch": 1.1376608841634024,
"grad_norm": 2.081318262101045,
"learning_rate": 7.831627134060249e-06,
"loss": 0.4846,
"step": 338
},
{
"epoch": 1.141018466703973,
"grad_norm": 2.209767183684003,
"learning_rate": 7.815442829321754e-06,
"loss": 0.483,
"step": 339
},
{
"epoch": 1.1443760492445438,
"grad_norm": 2.011880925223341,
"learning_rate": 7.799215215312667e-06,
"loss": 0.4579,
"step": 340
},
{
"epoch": 1.1477336317851148,
"grad_norm": 2.0797725312115203,
"learning_rate": 7.782944541658423e-06,
"loss": 0.5117,
"step": 341
},
{
"epoch": 1.1510912143256855,
"grad_norm": 1.9959817767175392,
"learning_rate": 7.766631058646826e-06,
"loss": 0.4622,
"step": 342
},
{
"epoch": 1.1544487968662562,
"grad_norm": 2.2735006851582202,
"learning_rate": 7.750275017224208e-06,
"loss": 0.4724,
"step": 343
},
{
"epoch": 1.1578063794068272,
"grad_norm": 2.0970076738982044,
"learning_rate": 7.733876668991565e-06,
"loss": 0.4924,
"step": 344
},
{
"epoch": 1.161163961947398,
"grad_norm": 2.2352119607145307,
"learning_rate": 7.71743626620069e-06,
"loss": 0.517,
"step": 345
},
{
"epoch": 1.1645215444879686,
"grad_norm": 2.302631355787852,
"learning_rate": 7.700954061750295e-06,
"loss": 0.487,
"step": 346
},
{
"epoch": 1.1678791270285394,
"grad_norm": 2.1221219186208966,
"learning_rate": 7.684430309182106e-06,
"loss": 0.4709,
"step": 347
},
{
"epoch": 1.1712367095691103,
"grad_norm": 2.101482815450197,
"learning_rate": 7.667865262676981e-06,
"loss": 0.489,
"step": 348
},
{
"epoch": 1.174594292109681,
"grad_norm": 2.105161148484614,
"learning_rate": 7.651259177050996e-06,
"loss": 0.5033,
"step": 349
},
{
"epoch": 1.1779518746502518,
"grad_norm": 1.957600802485906,
"learning_rate": 7.634612307751513e-06,
"loss": 0.45,
"step": 350
},
{
"epoch": 1.1813094571908227,
"grad_norm": 2.2812310534697775,
"learning_rate": 7.617924910853266e-06,
"loss": 0.5108,
"step": 351
},
{
"epoch": 1.1846670397313934,
"grad_norm": 2.2141528970819246,
"learning_rate": 7.601197243054411e-06,
"loss": 0.4998,
"step": 352
},
{
"epoch": 1.1880246222719641,
"grad_norm": 2.1186842901832033,
"learning_rate": 7.584429561672586e-06,
"loss": 0.4822,
"step": 353
},
{
"epoch": 1.1913822048125349,
"grad_norm": 2.2241188415508857,
"learning_rate": 7.567622124640942e-06,
"loss": 0.4824,
"step": 354
},
{
"epoch": 1.1947397873531058,
"grad_norm": 2.2126573138667402,
"learning_rate": 7.5507751905041885e-06,
"loss": 0.5051,
"step": 355
},
{
"epoch": 1.1980973698936765,
"grad_norm": 2.175213998538769,
"learning_rate": 7.533889018414602e-06,
"loss": 0.4909,
"step": 356
},
{
"epoch": 1.2014549524342473,
"grad_norm": 2.1682912210779732,
"learning_rate": 7.516963868128054e-06,
"loss": 0.4975,
"step": 357
},
{
"epoch": 1.2048125349748182,
"grad_norm": 2.183195934735365,
"learning_rate": 7.500000000000001e-06,
"loss": 0.5057,
"step": 358
},
{
"epoch": 1.208170117515389,
"grad_norm": 2.1348501168594525,
"learning_rate": 7.4829976749814935e-06,
"loss": 0.4958,
"step": 359
},
{
"epoch": 1.2115277000559597,
"grad_norm": 2.45122587491375,
"learning_rate": 7.46595715461515e-06,
"loss": 0.51,
"step": 360
},
{
"epoch": 1.2148852825965304,
"grad_norm": 2.273341521941601,
"learning_rate": 7.4488787010311425e-06,
"loss": 0.4986,
"step": 361
},
{
"epoch": 1.2182428651371013,
"grad_norm": 2.1265214648425808,
"learning_rate": 7.431762576943157e-06,
"loss": 0.5224,
"step": 362
},
{
"epoch": 1.221600447677672,
"grad_norm": 2.207039996610266,
"learning_rate": 7.414609045644356e-06,
"loss": 0.5036,
"step": 363
},
{
"epoch": 1.2249580302182428,
"grad_norm": 2.2511356341290045,
"learning_rate": 7.3974183710033334e-06,
"loss": 0.4985,
"step": 364
},
{
"epoch": 1.2283156127588137,
"grad_norm": 2.2414220008243655,
"learning_rate": 7.38019081746004e-06,
"loss": 0.4826,
"step": 365
},
{
"epoch": 1.2316731952993845,
"grad_norm": 2.2345724571038583,
"learning_rate": 7.362926650021736e-06,
"loss": 0.4734,
"step": 366
},
{
"epoch": 1.2350307778399552,
"grad_norm": 2.020267975703017,
"learning_rate": 7.345626134258897e-06,
"loss": 0.4707,
"step": 367
},
{
"epoch": 1.238388360380526,
"grad_norm": 2.017700711508486,
"learning_rate": 7.3282895363011405e-06,
"loss": 0.4429,
"step": 368
},
{
"epoch": 1.2417459429210969,
"grad_norm": 2.4075032884901484,
"learning_rate": 7.310917122833127e-06,
"loss": 0.5123,
"step": 369
},
{
"epoch": 1.2451035254616676,
"grad_norm": 2.0744065650842924,
"learning_rate": 7.293509161090453e-06,
"loss": 0.4868,
"step": 370
},
{
"epoch": 1.2484611080022383,
"grad_norm": 2.1398898292409654,
"learning_rate": 7.276065918855554e-06,
"loss": 0.4917,
"step": 371
},
{
"epoch": 1.2518186905428093,
"grad_norm": 2.1639714949391915,
"learning_rate": 7.2585876644535705e-06,
"loss": 0.4957,
"step": 372
},
{
"epoch": 1.25517627308338,
"grad_norm": 2.3088349879154446,
"learning_rate": 7.241074666748228e-06,
"loss": 0.5311,
"step": 373
},
{
"epoch": 1.2585338556239507,
"grad_norm": 2.20877128750944,
"learning_rate": 7.2235271951377005e-06,
"loss": 0.5217,
"step": 374
},
{
"epoch": 1.2618914381645214,
"grad_norm": 2.079999331915844,
"learning_rate": 7.205945519550467e-06,
"loss": 0.4972,
"step": 375
},
{
"epoch": 1.2652490207050924,
"grad_norm": 2.1653370386720643,
"learning_rate": 7.188329910441154e-06,
"loss": 0.4715,
"step": 376
},
{
"epoch": 1.268606603245663,
"grad_norm": 2.2020260358728083,
"learning_rate": 7.170680638786383e-06,
"loss": 0.4841,
"step": 377
},
{
"epoch": 1.271964185786234,
"grad_norm": 2.313938671583983,
"learning_rate": 7.1529979760805946e-06,
"loss": 0.5132,
"step": 378
},
{
"epoch": 1.2753217683268048,
"grad_norm": 2.3187218424786433,
"learning_rate": 7.135282194331881e-06,
"loss": 0.4916,
"step": 379
},
{
"epoch": 1.2786793508673755,
"grad_norm": 2.381365131626571,
"learning_rate": 7.1175335660577906e-06,
"loss": 0.4985,
"step": 380
},
{
"epoch": 1.2820369334079462,
"grad_norm": 2.188895335954739,
"learning_rate": 7.099752364281147e-06,
"loss": 0.4985,
"step": 381
},
{
"epoch": 1.285394515948517,
"grad_norm": 2.2491929362704863,
"learning_rate": 7.0819388625258385e-06,
"loss": 0.4648,
"step": 382
},
{
"epoch": 1.288752098489088,
"grad_norm": 2.302063139422305,
"learning_rate": 7.0640933348126235e-06,
"loss": 0.5151,
"step": 383
},
{
"epoch": 1.2921096810296586,
"grad_norm": 2.2033193942118423,
"learning_rate": 7.046216055654902e-06,
"loss": 0.4853,
"step": 384
},
{
"epoch": 1.2954672635702296,
"grad_norm": 2.2781079428025364,
"learning_rate": 7.028307300054499e-06,
"loss": 0.5407,
"step": 385
},
{
"epoch": 1.2988248461108003,
"grad_norm": 2.4113225121524366,
"learning_rate": 7.0103673434974375e-06,
"loss": 0.504,
"step": 386
},
{
"epoch": 1.302182428651371,
"grad_norm": 2.1395395477301933,
"learning_rate": 6.992396461949693e-06,
"loss": 0.4724,
"step": 387
},
{
"epoch": 1.3055400111919417,
"grad_norm": 2.1628168470264364,
"learning_rate": 6.974394931852957e-06,
"loss": 0.4901,
"step": 388
},
{
"epoch": 1.3088975937325125,
"grad_norm": 2.0779952047374244,
"learning_rate": 6.956363030120377e-06,
"loss": 0.4779,
"step": 389
},
{
"epoch": 1.3122551762730834,
"grad_norm": 2.026126139321993,
"learning_rate": 6.9383010341323e-06,
"loss": 0.4661,
"step": 390
},
{
"epoch": 1.3156127588136541,
"grad_norm": 2.290741048224957,
"learning_rate": 6.920209221732007e-06,
"loss": 0.4814,
"step": 391
},
{
"epoch": 1.318970341354225,
"grad_norm": 2.1365050290585423,
"learning_rate": 6.902087871221439e-06,
"loss": 0.498,
"step": 392
},
{
"epoch": 1.3223279238947958,
"grad_norm": 2.2937163334382817,
"learning_rate": 6.88393726135691e-06,
"loss": 0.5079,
"step": 393
},
{
"epoch": 1.3256855064353665,
"grad_norm": 2.072458351219792,
"learning_rate": 6.865757671344827e-06,
"loss": 0.4769,
"step": 394
},
{
"epoch": 1.3290430889759373,
"grad_norm": 2.0989809200956544,
"learning_rate": 6.8475493808373895e-06,
"loss": 0.4766,
"step": 395
},
{
"epoch": 1.332400671516508,
"grad_norm": 2.064563966968899,
"learning_rate": 6.829312669928293e-06,
"loss": 0.456,
"step": 396
},
{
"epoch": 1.335758254057079,
"grad_norm": 2.1069716399584197,
"learning_rate": 6.811047819148413e-06,
"loss": 0.4984,
"step": 397
},
{
"epoch": 1.3391158365976497,
"grad_norm": 2.213794419117928,
"learning_rate": 6.792755109461498e-06,
"loss": 0.4866,
"step": 398
},
{
"epoch": 1.3424734191382206,
"grad_norm": 2.2174112182270522,
"learning_rate": 6.7744348222598386e-06,
"loss": 0.499,
"step": 399
},
{
"epoch": 1.3458310016787913,
"grad_norm": 2.079145185573435,
"learning_rate": 6.756087239359948e-06,
"loss": 0.493,
"step": 400
},
{
"epoch": 1.349188584219362,
"grad_norm": 2.1392680497000085,
"learning_rate": 6.737712642998219e-06,
"loss": 0.5378,
"step": 401
},
{
"epoch": 1.3525461667599328,
"grad_norm": 2.0513971936886675,
"learning_rate": 6.719311315826589e-06,
"loss": 0.4714,
"step": 402
},
{
"epoch": 1.3559037493005035,
"grad_norm": 2.2552876351142284,
"learning_rate": 6.700883540908185e-06,
"loss": 0.4872,
"step": 403
},
{
"epoch": 1.3592613318410744,
"grad_norm": 2.0728376046951777,
"learning_rate": 6.682429601712976e-06,
"loss": 0.4799,
"step": 404
},
{
"epoch": 1.3626189143816452,
"grad_norm": 2.0220373122992674,
"learning_rate": 6.663949782113413e-06,
"loss": 0.5166,
"step": 405
},
{
"epoch": 1.3659764969222161,
"grad_norm": 2.220177493037904,
"learning_rate": 6.64544436638005e-06,
"loss": 0.4781,
"step": 406
},
{
"epoch": 1.3693340794627868,
"grad_norm": 1.9793083289019477,
"learning_rate": 6.626913639177189e-06,
"loss": 0.4867,
"step": 407
},
{
"epoch": 1.3726916620033576,
"grad_norm": 2.0274582078000822,
"learning_rate": 6.608357885558485e-06,
"loss": 0.4443,
"step": 408
},
{
"epoch": 1.3760492445439283,
"grad_norm": 2.1847030624625883,
"learning_rate": 6.589777390962575e-06,
"loss": 0.5259,
"step": 409
},
{
"epoch": 1.3794068270844992,
"grad_norm": 2.2214921474975395,
"learning_rate": 6.571172441208678e-06,
"loss": 0.4816,
"step": 410
},
{
"epoch": 1.38276440962507,
"grad_norm": 2.349546029810803,
"learning_rate": 6.552543322492195e-06,
"loss": 0.5083,
"step": 411
},
{
"epoch": 1.3861219921656407,
"grad_norm": 2.2544220841659315,
"learning_rate": 6.53389032138032e-06,
"loss": 0.5073,
"step": 412
},
{
"epoch": 1.3894795747062116,
"grad_norm": 2.1899206248487135,
"learning_rate": 6.515213724807621e-06,
"loss": 0.473,
"step": 413
},
{
"epoch": 1.3928371572467824,
"grad_norm": 2.144097777561319,
"learning_rate": 6.49651382007163e-06,
"loss": 0.4745,
"step": 414
},
{
"epoch": 1.396194739787353,
"grad_norm": 2.249837918425837,
"learning_rate": 6.477790894828422e-06,
"loss": 0.5074,
"step": 415
},
{
"epoch": 1.3995523223279238,
"grad_norm": 2.203568377218348,
"learning_rate": 6.459045237088189e-06,
"loss": 0.5182,
"step": 416
},
{
"epoch": 1.4029099048684948,
"grad_norm": 2.207808012115493,
"learning_rate": 6.440277135210815e-06,
"loss": 0.4861,
"step": 417
},
{
"epoch": 1.4062674874090655,
"grad_norm": 2.1441864682397886,
"learning_rate": 6.421486877901436e-06,
"loss": 0.4886,
"step": 418
},
{
"epoch": 1.4096250699496362,
"grad_norm": 2.228627269764285,
"learning_rate": 6.402674754205998e-06,
"loss": 0.4773,
"step": 419
},
{
"epoch": 1.4129826524902072,
"grad_norm": 2.1994987489882893,
"learning_rate": 6.383841053506813e-06,
"loss": 0.5075,
"step": 420
},
{
"epoch": 1.4163402350307779,
"grad_norm": 2.183688476148547,
"learning_rate": 6.364986065518106e-06,
"loss": 0.4917,
"step": 421
},
{
"epoch": 1.4196978175713486,
"grad_norm": 2.269619040413762,
"learning_rate": 6.3461100802815625e-06,
"loss": 0.4967,
"step": 422
},
{
"epoch": 1.4230554001119193,
"grad_norm": 2.103981888475452,
"learning_rate": 6.3272133881618596e-06,
"loss": 0.4431,
"step": 423
},
{
"epoch": 1.4264129826524903,
"grad_norm": 2.1258273496585156,
"learning_rate": 6.308296279842204e-06,
"loss": 0.5031,
"step": 424
},
{
"epoch": 1.429770565193061,
"grad_norm": 2.196489174906513,
"learning_rate": 6.289359046319862e-06,
"loss": 0.4924,
"step": 425
},
{
"epoch": 1.4331281477336317,
"grad_norm": 2.1269856468419963,
"learning_rate": 6.270401978901678e-06,
"loss": 0.4895,
"step": 426
},
{
"epoch": 1.4364857302742027,
"grad_norm": 2.2126446207865587,
"learning_rate": 6.2514253691996e-06,
"loss": 0.5122,
"step": 427
},
{
"epoch": 1.4398433128147734,
"grad_norm": 2.1913325147414726,
"learning_rate": 6.2324295091261885e-06,
"loss": 0.5283,
"step": 428
},
{
"epoch": 1.4432008953553441,
"grad_norm": 2.091169249602068,
"learning_rate": 6.213414690890125e-06,
"loss": 0.4879,
"step": 429
},
{
"epoch": 1.4465584778959149,
"grad_norm": 2.210503348980095,
"learning_rate": 6.194381206991723e-06,
"loss": 0.4887,
"step": 430
},
{
"epoch": 1.4499160604364858,
"grad_norm": 2.164926411078867,
"learning_rate": 6.175329350218426e-06,
"loss": 0.4711,
"step": 431
},
{
"epoch": 1.4532736429770565,
"grad_norm": 2.1482110012584203,
"learning_rate": 6.156259413640302e-06,
"loss": 0.462,
"step": 432
},
{
"epoch": 1.4566312255176272,
"grad_norm": 2.294567867253737,
"learning_rate": 6.1371716906055336e-06,
"loss": 0.5164,
"step": 433
},
{
"epoch": 1.4599888080581982,
"grad_norm": 2.1428555316811346,
"learning_rate": 6.11806647473591e-06,
"loss": 0.4756,
"step": 434
},
{
"epoch": 1.463346390598769,
"grad_norm": 2.0267786414519313,
"learning_rate": 6.098944059922311e-06,
"loss": 0.477,
"step": 435
},
{
"epoch": 1.4667039731393396,
"grad_norm": 2.14676631545224,
"learning_rate": 6.079804740320181e-06,
"loss": 0.4668,
"step": 436
},
{
"epoch": 1.4700615556799104,
"grad_norm": 2.2538467654250267,
"learning_rate": 6.060648810345006e-06,
"loss": 0.495,
"step": 437
},
{
"epoch": 1.4734191382204813,
"grad_norm": 2.1469963540401067,
"learning_rate": 6.041476564667785e-06,
"loss": 0.4824,
"step": 438
},
{
"epoch": 1.476776720761052,
"grad_norm": 2.222380545278691,
"learning_rate": 6.022288298210502e-06,
"loss": 0.4753,
"step": 439
},
{
"epoch": 1.4801343033016228,
"grad_norm": 2.1083124763573746,
"learning_rate": 6.003084306141579e-06,
"loss": 0.5052,
"step": 440
},
{
"epoch": 1.4834918858421937,
"grad_norm": 2.1370436926979353,
"learning_rate": 5.983864883871344e-06,
"loss": 0.4789,
"step": 441
},
{
"epoch": 1.4868494683827644,
"grad_norm": 2.3178640332647316,
"learning_rate": 5.964630327047485e-06,
"loss": 0.5193,
"step": 442
},
{
"epoch": 1.4902070509233352,
"grad_norm": 2.1956778031643496,
"learning_rate": 5.945380931550497e-06,
"loss": 0.4849,
"step": 443
},
{
"epoch": 1.4935646334639059,
"grad_norm": 2.1954225526435063,
"learning_rate": 5.926116993489143e-06,
"loss": 0.4852,
"step": 444
},
{
"epoch": 1.4969222160044768,
"grad_norm": 2.0679488397891896,
"learning_rate": 5.906838809195879e-06,
"loss": 0.477,
"step": 445
},
{
"epoch": 1.5002797985450476,
"grad_norm": 2.1914965672856375,
"learning_rate": 5.887546675222319e-06,
"loss": 0.4897,
"step": 446
},
{
"epoch": 1.5036373810856185,
"grad_norm": 2.2169157191153603,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.481,
"step": 447
},
{
"epoch": 1.5069949636261892,
"grad_norm": 2.2332788961047636,
"learning_rate": 5.848921745509094e-06,
"loss": 0.5045,
"step": 448
},
{
"epoch": 1.51035254616676,
"grad_norm": 2.116170949280568,
"learning_rate": 5.829589543927305e-06,
"loss": 0.4674,
"step": 449
},
{
"epoch": 1.5137101287073307,
"grad_norm": 2.043440626400537,
"learning_rate": 5.8102445809718325e-06,
"loss": 0.4964,
"step": 450
},
{
"epoch": 1.5170677112479014,
"grad_norm": 2.0944801885939204,
"learning_rate": 5.790887154221521e-06,
"loss": 0.4948,
"step": 451
},
{
"epoch": 1.5204252937884724,
"grad_norm": 2.3084091670034623,
"learning_rate": 5.771517561446949e-06,
"loss": 0.5108,
"step": 452
},
{
"epoch": 1.523782876329043,
"grad_norm": 2.3208214523309683,
"learning_rate": 5.75213610060584e-06,
"loss": 0.4914,
"step": 453
},
{
"epoch": 1.527140458869614,
"grad_norm": 2.024408080024364,
"learning_rate": 5.7327430698384775e-06,
"loss": 0.4919,
"step": 454
},
{
"epoch": 1.5304980414101848,
"grad_norm": 2.2191159501747073,
"learning_rate": 5.713338767463129e-06,
"loss": 0.5085,
"step": 455
},
{
"epoch": 1.5338556239507555,
"grad_norm": 2.192013145259984,
"learning_rate": 5.693923491971445e-06,
"loss": 0.4936,
"step": 456
},
{
"epoch": 1.5372132064913262,
"grad_norm": 2.137744041707972,
"learning_rate": 5.674497542023875e-06,
"loss": 0.5004,
"step": 457
},
{
"epoch": 1.540570789031897,
"grad_norm": 2.032793978959609,
"learning_rate": 5.65506121644507e-06,
"loss": 0.491,
"step": 458
},
{
"epoch": 1.5439283715724679,
"grad_norm": 2.2242785868303194,
"learning_rate": 5.635614814219289e-06,
"loss": 0.4974,
"step": 459
},
{
"epoch": 1.5472859541130386,
"grad_norm": 2.2872129720575662,
"learning_rate": 5.616158634485793e-06,
"loss": 0.5045,
"step": 460
},
{
"epoch": 1.5506435366536095,
"grad_norm": 2.5161657966811197,
"learning_rate": 5.596692976534256e-06,
"loss": 0.4776,
"step": 461
},
{
"epoch": 1.5540011191941803,
"grad_norm": 2.1830947757285566,
"learning_rate": 5.577218139800143e-06,
"loss": 0.4779,
"step": 462
},
{
"epoch": 1.557358701734751,
"grad_norm": 2.2723393111932286,
"learning_rate": 5.557734423860122e-06,
"loss": 0.4559,
"step": 463
},
{
"epoch": 1.5607162842753217,
"grad_norm": 2.039582763815814,
"learning_rate": 5.538242128427444e-06,
"loss": 0.4967,
"step": 464
},
{
"epoch": 1.5640738668158924,
"grad_norm": 2.1867791207126572,
"learning_rate": 5.518741553347341e-06,
"loss": 0.4793,
"step": 465
},
{
"epoch": 1.5674314493564634,
"grad_norm": 2.184162576004028,
"learning_rate": 5.499232998592399e-06,
"loss": 0.4563,
"step": 466
},
{
"epoch": 1.5707890318970341,
"grad_norm": 2.2279834676183077,
"learning_rate": 5.479716764257961e-06,
"loss": 0.4726,
"step": 467
},
{
"epoch": 1.574146614437605,
"grad_norm": 2.146603101538437,
"learning_rate": 5.4601931505575e-06,
"loss": 0.4761,
"step": 468
},
{
"epoch": 1.5775041969781758,
"grad_norm": 2.2103167022630386,
"learning_rate": 5.44066245781801e-06,
"loss": 0.4955,
"step": 469
},
{
"epoch": 1.5808617795187465,
"grad_norm": 2.3722558539750156,
"learning_rate": 5.421124986475371e-06,
"loss": 0.5089,
"step": 470
},
{
"epoch": 1.5842193620593172,
"grad_norm": 2.3496747654346697,
"learning_rate": 5.4015810370697445e-06,
"loss": 0.4878,
"step": 471
},
{
"epoch": 1.587576944599888,
"grad_norm": 2.185007470426342,
"learning_rate": 5.382030910240936e-06,
"loss": 0.4713,
"step": 472
},
{
"epoch": 1.590934527140459,
"grad_norm": 2.2103514110080593,
"learning_rate": 5.362474906723781e-06,
"loss": 0.5096,
"step": 473
},
{
"epoch": 1.5942921096810296,
"grad_norm": 2.141248032889711,
"learning_rate": 5.342913327343515e-06,
"loss": 0.4891,
"step": 474
},
{
"epoch": 1.5976496922216006,
"grad_norm": 2.172202613793392,
"learning_rate": 5.3233464730111426e-06,
"loss": 0.4929,
"step": 475
},
{
"epoch": 1.6010072747621713,
"grad_norm": 2.238037583616825,
"learning_rate": 5.303774644718813e-06,
"loss": 0.4849,
"step": 476
},
{
"epoch": 1.604364857302742,
"grad_norm": 2.0572504308978954,
"learning_rate": 5.284198143535188e-06,
"loss": 0.4946,
"step": 477
},
{
"epoch": 1.6077224398433128,
"grad_norm": 2.0917913417158016,
"learning_rate": 5.2646172706008154e-06,
"loss": 0.4834,
"step": 478
},
{
"epoch": 1.6110800223838835,
"grad_norm": 2.013696077036751,
"learning_rate": 5.245032327123488e-06,
"loss": 0.4564,
"step": 479
},
{
"epoch": 1.6144376049244544,
"grad_norm": 2.073234984497162,
"learning_rate": 5.225443614373614e-06,
"loss": 0.4479,
"step": 480
},
{
"epoch": 1.6177951874650252,
"grad_norm": 2.208574587163591,
"learning_rate": 5.20585143367959e-06,
"loss": 0.4761,
"step": 481
},
{
"epoch": 1.621152770005596,
"grad_norm": 2.1530116735583316,
"learning_rate": 5.186256086423148e-06,
"loss": 0.4702,
"step": 482
},
{
"epoch": 1.6245103525461668,
"grad_norm": 2.200115990394491,
"learning_rate": 5.166657874034745e-06,
"loss": 0.5088,
"step": 483
},
{
"epoch": 1.6278679350867375,
"grad_norm": 2.1407757545469424,
"learning_rate": 5.147057097988898e-06,
"loss": 0.5084,
"step": 484
},
{
"epoch": 1.6312255176273083,
"grad_norm": 2.124229197200729,
"learning_rate": 5.127454059799567e-06,
"loss": 0.4623,
"step": 485
},
{
"epoch": 1.634583100167879,
"grad_norm": 2.217735463671646,
"learning_rate": 5.1078490610155105e-06,
"loss": 0.4946,
"step": 486
},
{
"epoch": 1.63794068270845,
"grad_norm": 2.0764480657249758,
"learning_rate": 5.088242403215644e-06,
"loss": 0.5089,
"step": 487
},
{
"epoch": 1.6412982652490207,
"grad_norm": 2.331029267583452,
"learning_rate": 5.0686343880044044e-06,
"loss": 0.473,
"step": 488
},
{
"epoch": 1.6446558477895916,
"grad_norm": 2.1804145039026546,
"learning_rate": 5.049025317007108e-06,
"loss": 0.4934,
"step": 489
},
{
"epoch": 1.6480134303301623,
"grad_norm": 1.9682449318933604,
"learning_rate": 5.029415491865311e-06,
"loss": 0.4616,
"step": 490
},
{
"epoch": 1.651371012870733,
"grad_norm": 2.3938687293871133,
"learning_rate": 5.009805214232177e-06,
"loss": 0.5293,
"step": 491
},
{
"epoch": 1.6547285954113038,
"grad_norm": 2.1248954294843094,
"learning_rate": 4.990194785767824e-06,
"loss": 0.4815,
"step": 492
},
{
"epoch": 1.6580861779518745,
"grad_norm": 2.066522021485902,
"learning_rate": 4.97058450813469e-06,
"loss": 0.4782,
"step": 493
},
{
"epoch": 1.6614437604924455,
"grad_norm": 2.132367969004758,
"learning_rate": 4.950974682992894e-06,
"loss": 0.4493,
"step": 494
},
{
"epoch": 1.6648013430330162,
"grad_norm": 2.195382323978713,
"learning_rate": 4.931365611995598e-06,
"loss": 0.5095,
"step": 495
},
{
"epoch": 1.6681589255735871,
"grad_norm": 2.0731108125500346,
"learning_rate": 4.911757596784358e-06,
"loss": 0.5098,
"step": 496
},
{
"epoch": 1.6715165081141579,
"grad_norm": 2.2045936873389618,
"learning_rate": 4.892150938984491e-06,
"loss": 0.5034,
"step": 497
},
{
"epoch": 1.6748740906547286,
"grad_norm": 2.119499431019322,
"learning_rate": 4.872545940200435e-06,
"loss": 0.4621,
"step": 498
},
{
"epoch": 1.6782316731952993,
"grad_norm": 2.0571530774922717,
"learning_rate": 4.8529429020111035e-06,
"loss": 0.4324,
"step": 499
},
{
"epoch": 1.68158925573587,
"grad_norm": 2.080333899217976,
"learning_rate": 4.833342125965257e-06,
"loss": 0.4786,
"step": 500
},
{
"epoch": 1.684946838276441,
"grad_norm": 2.198178104289353,
"learning_rate": 4.813743913576852e-06,
"loss": 0.476,
"step": 501
},
{
"epoch": 1.6883044208170117,
"grad_norm": 2.1587194322010514,
"learning_rate": 4.794148566320412e-06,
"loss": 0.463,
"step": 502
},
{
"epoch": 1.6916620033575827,
"grad_norm": 2.3918082310854296,
"learning_rate": 4.774556385626386e-06,
"loss": 0.502,
"step": 503
},
{
"epoch": 1.6950195858981534,
"grad_norm": 2.267854796695586,
"learning_rate": 4.754967672876513e-06,
"loss": 0.5066,
"step": 504
},
{
"epoch": 1.698377168438724,
"grad_norm": 2.172897001954314,
"learning_rate": 4.7353827293991845e-06,
"loss": 0.4865,
"step": 505
},
{
"epoch": 1.7017347509792948,
"grad_norm": 2.249989307898594,
"learning_rate": 4.715801856464812e-06,
"loss": 0.5135,
"step": 506
},
{
"epoch": 1.7050923335198656,
"grad_norm": 2.182167380702455,
"learning_rate": 4.6962253552811885e-06,
"loss": 0.52,
"step": 507
},
{
"epoch": 1.7084499160604365,
"grad_norm": 2.3182206016361384,
"learning_rate": 4.676653526988858e-06,
"loss": 0.4623,
"step": 508
},
{
"epoch": 1.7118074986010072,
"grad_norm": 2.298712891082637,
"learning_rate": 4.657086672656486e-06,
"loss": 0.4734,
"step": 509
},
{
"epoch": 1.7151650811415782,
"grad_norm": 2.1611212242763194,
"learning_rate": 4.63752509327622e-06,
"loss": 0.4484,
"step": 510
},
{
"epoch": 1.718522663682149,
"grad_norm": 2.2744157038160497,
"learning_rate": 4.617969089759066e-06,
"loss": 0.5041,
"step": 511
},
{
"epoch": 1.7218802462227196,
"grad_norm": 2.001332944054495,
"learning_rate": 4.598418962930258e-06,
"loss": 0.494,
"step": 512
},
{
"epoch": 1.7252378287632903,
"grad_norm": 2.15178549984152,
"learning_rate": 4.57887501352463e-06,
"loss": 0.4906,
"step": 513
},
{
"epoch": 1.728595411303861,
"grad_norm": 2.0567126022881403,
"learning_rate": 4.559337542181993e-06,
"loss": 0.4654,
"step": 514
},
{
"epoch": 1.731952993844432,
"grad_norm": 2.3440444984090214,
"learning_rate": 4.539806849442501e-06,
"loss": 0.4806,
"step": 515
},
{
"epoch": 1.7353105763850027,
"grad_norm": 2.2415234653831475,
"learning_rate": 4.520283235742042e-06,
"loss": 0.4623,
"step": 516
},
{
"epoch": 1.7386681589255737,
"grad_norm": 2.1332817601414997,
"learning_rate": 4.500767001407604e-06,
"loss": 0.4522,
"step": 517
},
{
"epoch": 1.7420257414661444,
"grad_norm": 2.116507838045294,
"learning_rate": 4.481258446652662e-06,
"loss": 0.4842,
"step": 518
},
{
"epoch": 1.7453833240067151,
"grad_norm": 2.215105939137212,
"learning_rate": 4.4617578715725565e-06,
"loss": 0.4649,
"step": 519
},
{
"epoch": 1.7487409065472859,
"grad_norm": 2.225499325958965,
"learning_rate": 4.4422655761398785e-06,
"loss": 0.4853,
"step": 520
},
{
"epoch": 1.7520984890878566,
"grad_norm": 2.254229574580799,
"learning_rate": 4.4227818601998575e-06,
"loss": 0.4883,
"step": 521
},
{
"epoch": 1.7554560716284275,
"grad_norm": 2.1468138229729443,
"learning_rate": 4.403307023465746e-06,
"loss": 0.4786,
"step": 522
},
{
"epoch": 1.7588136541689985,
"grad_norm": 2.169367653787952,
"learning_rate": 4.383841365514208e-06,
"loss": 0.4933,
"step": 523
},
{
"epoch": 1.7621712367095692,
"grad_norm": 2.2540586767928916,
"learning_rate": 4.364385185780712e-06,
"loss": 0.4423,
"step": 524
},
{
"epoch": 1.76552881925014,
"grad_norm": 2.046616711801834,
"learning_rate": 4.3449387835549305e-06,
"loss": 0.4517,
"step": 525
},
{
"epoch": 1.7688864017907107,
"grad_norm": 2.054096374050294,
"learning_rate": 4.325502457976126e-06,
"loss": 0.4562,
"step": 526
},
{
"epoch": 1.7722439843312814,
"grad_norm": 2.1897562491663023,
"learning_rate": 4.306076508028557e-06,
"loss": 0.4872,
"step": 527
},
{
"epoch": 1.775601566871852,
"grad_norm": 2.1685603692137985,
"learning_rate": 4.286661232536873e-06,
"loss": 0.4847,
"step": 528
},
{
"epoch": 1.778959149412423,
"grad_norm": 2.1915605427774034,
"learning_rate": 4.267256930161523e-06,
"loss": 0.5192,
"step": 529
},
{
"epoch": 1.782316731952994,
"grad_norm": 2.254879047746067,
"learning_rate": 4.247863899394162e-06,
"loss": 0.4687,
"step": 530
},
{
"epoch": 1.7856743144935647,
"grad_norm": 2.1134520274196125,
"learning_rate": 4.228482438553052e-06,
"loss": 0.5262,
"step": 531
},
{
"epoch": 1.7890318970341355,
"grad_norm": 2.068253922507518,
"learning_rate": 4.209112845778481e-06,
"loss": 0.4839,
"step": 532
},
{
"epoch": 1.7923894795747062,
"grad_norm": 2.0887709442403346,
"learning_rate": 4.189755419028169e-06,
"loss": 0.4623,
"step": 533
},
{
"epoch": 1.795747062115277,
"grad_norm": 2.300959929057977,
"learning_rate": 4.1704104560726955e-06,
"loss": 0.5047,
"step": 534
},
{
"epoch": 1.7991046446558476,
"grad_norm": 2.156784226239325,
"learning_rate": 4.151078254490908e-06,
"loss": 0.4553,
"step": 535
},
{
"epoch": 1.8024622271964186,
"grad_norm": 2.0472342846000893,
"learning_rate": 4.131759111665349e-06,
"loss": 0.444,
"step": 536
},
{
"epoch": 1.8058198097369895,
"grad_norm": 2.15308812602405,
"learning_rate": 4.112453324777683e-06,
"loss": 0.4504,
"step": 537
},
{
"epoch": 1.8091773922775602,
"grad_norm": 2.305556846294406,
"learning_rate": 4.09316119080412e-06,
"loss": 0.4561,
"step": 538
},
{
"epoch": 1.812534974818131,
"grad_norm": 2.0629496902341304,
"learning_rate": 4.073883006510858e-06,
"loss": 0.4639,
"step": 539
},
{
"epoch": 1.8158925573587017,
"grad_norm": 2.621774490091145,
"learning_rate": 4.054619068449502e-06,
"loss": 0.4988,
"step": 540
},
{
"epoch": 1.8192501398992724,
"grad_norm": 2.249906503732158,
"learning_rate": 4.035369672952516e-06,
"loss": 0.4665,
"step": 541
},
{
"epoch": 1.8226077224398431,
"grad_norm": 2.124308282019638,
"learning_rate": 4.016135116128656e-06,
"loss": 0.4837,
"step": 542
},
{
"epoch": 1.825965304980414,
"grad_norm": 2.216203736606476,
"learning_rate": 3.996915693858422e-06,
"loss": 0.4599,
"step": 543
},
{
"epoch": 1.829322887520985,
"grad_norm": 2.199055435229539,
"learning_rate": 3.977711701789499e-06,
"loss": 0.4996,
"step": 544
},
{
"epoch": 1.8326804700615558,
"grad_norm": 2.1456836995965123,
"learning_rate": 3.9585234353322155e-06,
"loss": 0.474,
"step": 545
},
{
"epoch": 1.8360380526021265,
"grad_norm": 2.088890168115453,
"learning_rate": 3.939351189654996e-06,
"loss": 0.4551,
"step": 546
},
{
"epoch": 1.8393956351426972,
"grad_norm": 2.059736510246409,
"learning_rate": 3.920195259679822e-06,
"loss": 0.484,
"step": 547
},
{
"epoch": 1.842753217683268,
"grad_norm": 2.4289464926505504,
"learning_rate": 3.901055940077691e-06,
"loss": 0.5043,
"step": 548
},
{
"epoch": 1.8461108002238389,
"grad_norm": 2.0962090325586606,
"learning_rate": 3.881933525264092e-06,
"loss": 0.4398,
"step": 549
},
{
"epoch": 1.8494683827644096,
"grad_norm": 2.1707115890324165,
"learning_rate": 3.862828309394469e-06,
"loss": 0.4925,
"step": 550
},
{
"epoch": 1.8528259653049806,
"grad_norm": 2.015982277160347,
"learning_rate": 3.843740586359701e-06,
"loss": 0.4757,
"step": 551
},
{
"epoch": 1.8561835478455513,
"grad_norm": 2.2304879540207203,
"learning_rate": 3.824670649781576e-06,
"loss": 0.4614,
"step": 552
},
{
"epoch": 1.859541130386122,
"grad_norm": 2.091915679725523,
"learning_rate": 3.805618793008279e-06,
"loss": 0.4448,
"step": 553
},
{
"epoch": 1.8628987129266927,
"grad_norm": 2.16946840752013,
"learning_rate": 3.786585309109877e-06,
"loss": 0.4649,
"step": 554
},
{
"epoch": 1.8662562954672635,
"grad_norm": 2.128475934593608,
"learning_rate": 3.7675704908738136e-06,
"loss": 0.4802,
"step": 555
},
{
"epoch": 1.8696138780078344,
"grad_norm": 2.165472295510831,
"learning_rate": 3.7485746308004013e-06,
"loss": 0.4977,
"step": 556
},
{
"epoch": 1.8729714605484051,
"grad_norm": 2.212252433928935,
"learning_rate": 3.7295980210983233e-06,
"loss": 0.4935,
"step": 557
},
{
"epoch": 1.876329043088976,
"grad_norm": 2.1928900923013956,
"learning_rate": 3.71064095368014e-06,
"loss": 0.4627,
"step": 558
},
{
"epoch": 1.8796866256295468,
"grad_norm": 2.085149102799712,
"learning_rate": 3.6917037201577977e-06,
"loss": 0.4278,
"step": 559
},
{
"epoch": 1.8830442081701175,
"grad_norm": 2.0816980346294307,
"learning_rate": 3.672786611838142e-06,
"loss": 0.4631,
"step": 560
},
{
"epoch": 1.8864017907106883,
"grad_norm": 2.103474181111992,
"learning_rate": 3.653889919718439e-06,
"loss": 0.4511,
"step": 561
},
{
"epoch": 1.889759373251259,
"grad_norm": 2.2744496879859275,
"learning_rate": 3.635013934481895e-06,
"loss": 0.4974,
"step": 562
},
{
"epoch": 1.89311695579183,
"grad_norm": 2.2227249595968934,
"learning_rate": 3.616158946493188e-06,
"loss": 0.4769,
"step": 563
},
{
"epoch": 1.8964745383324007,
"grad_norm": 2.152621448218455,
"learning_rate": 3.5973252457940034e-06,
"loss": 0.4994,
"step": 564
},
{
"epoch": 1.8998321208729716,
"grad_norm": 2.070223009955467,
"learning_rate": 3.578513122098566e-06,
"loss": 0.5039,
"step": 565
},
{
"epoch": 1.9031897034135423,
"grad_norm": 2.121366546607162,
"learning_rate": 3.559722864789187e-06,
"loss": 0.4789,
"step": 566
},
{
"epoch": 1.906547285954113,
"grad_norm": 2.1017767501093823,
"learning_rate": 3.5409547629118124e-06,
"loss": 0.4562,
"step": 567
},
{
"epoch": 1.9099048684946838,
"grad_norm": 2.274326927273865,
"learning_rate": 3.5222091051715803e-06,
"loss": 0.4623,
"step": 568
},
{
"epoch": 1.9132624510352545,
"grad_norm": 1.9186891449000945,
"learning_rate": 3.5034861799283713e-06,
"loss": 0.5144,
"step": 569
},
{
"epoch": 1.9166200335758254,
"grad_norm": 2.3556578562236252,
"learning_rate": 3.48478627519238e-06,
"loss": 0.4503,
"step": 570
},
{
"epoch": 1.9199776161163962,
"grad_norm": 1.9892297433331403,
"learning_rate": 3.466109678619681e-06,
"loss": 0.4934,
"step": 571
},
{
"epoch": 1.9233351986569671,
"grad_norm": 2.2314638850872144,
"learning_rate": 3.4474566775078055e-06,
"loss": 0.4934,
"step": 572
},
{
"epoch": 1.9266927811975378,
"grad_norm": 2.2782528645464173,
"learning_rate": 3.4288275587913235e-06,
"loss": 0.4948,
"step": 573
},
{
"epoch": 1.9300503637381086,
"grad_norm": 2.36739181255683,
"learning_rate": 3.4102226090374246e-06,
"loss": 0.4741,
"step": 574
},
{
"epoch": 1.9334079462786793,
"grad_norm": 2.1683249260482444,
"learning_rate": 3.3916421144415146e-06,
"loss": 0.4732,
"step": 575
},
{
"epoch": 1.93676552881925,
"grad_norm": 2.1221739672342497,
"learning_rate": 3.3730863608228125e-06,
"loss": 0.4274,
"step": 576
},
{
"epoch": 1.940123111359821,
"grad_norm": 2.095387968090082,
"learning_rate": 3.35455563361995e-06,
"loss": 0.4649,
"step": 577
},
{
"epoch": 1.9434806939003917,
"grad_norm": 2.1689072561371154,
"learning_rate": 3.336050217886588e-06,
"loss": 0.4986,
"step": 578
},
{
"epoch": 1.9468382764409626,
"grad_norm": 2.2504194821367345,
"learning_rate": 3.3175703982870232e-06,
"loss": 0.4716,
"step": 579
},
{
"epoch": 1.9501958589815334,
"grad_norm": 2.084871129563975,
"learning_rate": 3.2991164590918162e-06,
"loss": 0.4403,
"step": 580
},
{
"epoch": 1.953553441522104,
"grad_norm": 2.0143154450944123,
"learning_rate": 3.280688684173412e-06,
"loss": 0.4452,
"step": 581
},
{
"epoch": 1.9569110240626748,
"grad_norm": 2.1804050757472977,
"learning_rate": 3.262287357001781e-06,
"loss": 0.516,
"step": 582
},
{
"epoch": 1.9602686066032455,
"grad_norm": 2.0645062573328232,
"learning_rate": 3.2439127606400546e-06,
"loss": 0.461,
"step": 583
},
{
"epoch": 1.9636261891438165,
"grad_norm": 2.070842126513229,
"learning_rate": 3.225565177740163e-06,
"loss": 0.466,
"step": 584
},
{
"epoch": 1.9669837716843872,
"grad_norm": 2.0600131128468595,
"learning_rate": 3.2072448905385046e-06,
"loss": 0.433,
"step": 585
},
{
"epoch": 1.9703413542249582,
"grad_norm": 2.0764551843814107,
"learning_rate": 3.1889521808515888e-06,
"loss": 0.45,
"step": 586
},
{
"epoch": 1.9736989367655289,
"grad_norm": 2.1493865400194103,
"learning_rate": 3.1706873300717094e-06,
"loss": 0.4903,
"step": 587
},
{
"epoch": 1.9770565193060996,
"grad_norm": 2.1217797993065988,
"learning_rate": 3.152450619162612e-06,
"loss": 0.456,
"step": 588
},
{
"epoch": 1.9804141018466703,
"grad_norm": 2.2674307611908273,
"learning_rate": 3.1342423286551756e-06,
"loss": 0.4758,
"step": 589
},
{
"epoch": 1.983771684387241,
"grad_norm": 2.016774239865244,
"learning_rate": 3.116062738643092e-06,
"loss": 0.4871,
"step": 590
},
{
"epoch": 1.987129266927812,
"grad_norm": 2.167349361097923,
"learning_rate": 3.097912128778563e-06,
"loss": 0.4621,
"step": 591
},
{
"epoch": 1.9904868494683827,
"grad_norm": 2.1086622374082644,
"learning_rate": 3.0797907782679944e-06,
"loss": 0.462,
"step": 592
},
{
"epoch": 1.9938444320089537,
"grad_norm": 2.142067661122772,
"learning_rate": 3.061698965867701e-06,
"loss": 0.4403,
"step": 593
},
{
"epoch": 1.9972020145495244,
"grad_norm": 2.211038574058772,
"learning_rate": 3.043636969879625e-06,
"loss": 0.4748,
"step": 594
},
{
"epoch": 2.0033575825405707,
"grad_norm": 8.113958823135038,
"learning_rate": 3.0256050681470446e-06,
"loss": 0.7156,
"step": 595
},
{
"epoch": 2.0067151650811414,
"grad_norm": 2.568864765515772,
"learning_rate": 3.007603538050309e-06,
"loss": 0.2897,
"step": 596
},
{
"epoch": 2.010072747621712,
"grad_norm": 2.313285944394538,
"learning_rate": 2.989632656502564e-06,
"loss": 0.2573,
"step": 597
},
{
"epoch": 2.0134303301622833,
"grad_norm": 2.2688120461445687,
"learning_rate": 2.971692699945502e-06,
"loss": 0.2617,
"step": 598
},
{
"epoch": 2.016787912702854,
"grad_norm": 2.2390498633994875,
"learning_rate": 2.9537839443451e-06,
"loss": 0.2628,
"step": 599
},
{
"epoch": 2.020145495243425,
"grad_norm": 2.018211337998392,
"learning_rate": 2.935906665187378e-06,
"loss": 0.2577,
"step": 600
},
{
"epoch": 2.0235030777839955,
"grad_norm": 2.0822307985642268,
"learning_rate": 2.9180611374741623e-06,
"loss": 0.2481,
"step": 601
},
{
"epoch": 2.0268606603245662,
"grad_norm": 2.651846463170353,
"learning_rate": 2.900247635718856e-06,
"loss": 0.2961,
"step": 602
},
{
"epoch": 2.030218242865137,
"grad_norm": 3.6928643005110513,
"learning_rate": 2.8824664339422115e-06,
"loss": 0.281,
"step": 603
},
{
"epoch": 2.0335758254057077,
"grad_norm": 3.075668633649421,
"learning_rate": 2.8647178056681197e-06,
"loss": 0.2588,
"step": 604
},
{
"epoch": 2.036933407946279,
"grad_norm": 2.6467867984028577,
"learning_rate": 2.847002023919406e-06,
"loss": 0.2678,
"step": 605
},
{
"epoch": 2.0402909904868496,
"grad_norm": 2.724532376339797,
"learning_rate": 2.8293193612136183e-06,
"loss": 0.2405,
"step": 606
},
{
"epoch": 2.0436485730274203,
"grad_norm": 2.4483147811459975,
"learning_rate": 2.8116700895588473e-06,
"loss": 0.241,
"step": 607
},
{
"epoch": 2.047006155567991,
"grad_norm": 2.2477666437496566,
"learning_rate": 2.7940544804495345e-06,
"loss": 0.2513,
"step": 608
},
{
"epoch": 2.0503637381085618,
"grad_norm": 2.154565484035418,
"learning_rate": 2.7764728048623003e-06,
"loss": 0.2506,
"step": 609
},
{
"epoch": 2.0537213206491325,
"grad_norm": 2.17650555134875,
"learning_rate": 2.7589253332517736e-06,
"loss": 0.2387,
"step": 610
},
{
"epoch": 2.057078903189703,
"grad_norm": 2.253001429434042,
"learning_rate": 2.741412335546431e-06,
"loss": 0.2491,
"step": 611
},
{
"epoch": 2.0604364857302744,
"grad_norm": 2.0324493606743146,
"learning_rate": 2.7239340811444476e-06,
"loss": 0.2402,
"step": 612
},
{
"epoch": 2.063794068270845,
"grad_norm": 2.2299955712377666,
"learning_rate": 2.706490838909547e-06,
"loss": 0.255,
"step": 613
},
{
"epoch": 2.067151650811416,
"grad_norm": 2.0377107617066965,
"learning_rate": 2.6890828771668742e-06,
"loss": 0.2576,
"step": 614
},
{
"epoch": 2.0705092333519866,
"grad_norm": 2.1876913026406037,
"learning_rate": 2.671710463698859e-06,
"loss": 0.2427,
"step": 615
},
{
"epoch": 2.0738668158925573,
"grad_norm": 2.0004569183512233,
"learning_rate": 2.6543738657411033e-06,
"loss": 0.2305,
"step": 616
},
{
"epoch": 2.077224398433128,
"grad_norm": 2.2750688222972695,
"learning_rate": 2.6370733499782654e-06,
"loss": 0.2398,
"step": 617
},
{
"epoch": 2.0805819809736987,
"grad_norm": 2.196557061021631,
"learning_rate": 2.6198091825399606e-06,
"loss": 0.2659,
"step": 618
},
{
"epoch": 2.08393956351427,
"grad_norm": 2.4966706499173306,
"learning_rate": 2.6025816289966703e-06,
"loss": 0.2528,
"step": 619
},
{
"epoch": 2.0872971460548406,
"grad_norm": 1.9987480291512625,
"learning_rate": 2.5853909543556444e-06,
"loss": 0.2381,
"step": 620
},
{
"epoch": 2.0906547285954113,
"grad_norm": 2.3473080858318793,
"learning_rate": 2.568237423056844e-06,
"loss": 0.2185,
"step": 621
},
{
"epoch": 2.094012311135982,
"grad_norm": 2.2351448577994,
"learning_rate": 2.5511212989688587e-06,
"loss": 0.2492,
"step": 622
},
{
"epoch": 2.097369893676553,
"grad_norm": 2.334819375193785,
"learning_rate": 2.534042845384851e-06,
"loss": 0.2264,
"step": 623
},
{
"epoch": 2.1007274762171235,
"grad_norm": 2.1645622435514578,
"learning_rate": 2.517002325018508e-06,
"loss": 0.2433,
"step": 624
},
{
"epoch": 2.1040850587576942,
"grad_norm": 2.4245838623271645,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.2685,
"step": 625
},
{
"epoch": 2.1074426412982654,
"grad_norm": 2.187347569512869,
"learning_rate": 2.4830361318719493e-06,
"loss": 0.2314,
"step": 626
},
{
"epoch": 2.110800223838836,
"grad_norm": 2.125826756884641,
"learning_rate": 2.4661109815854005e-06,
"loss": 0.2601,
"step": 627
},
{
"epoch": 2.114157806379407,
"grad_norm": 2.175643578704326,
"learning_rate": 2.449224809495815e-06,
"loss": 0.248,
"step": 628
},
{
"epoch": 2.1175153889199776,
"grad_norm": 2.3549155345423842,
"learning_rate": 2.4323778753590582e-06,
"loss": 0.2289,
"step": 629
},
{
"epoch": 2.1208729714605483,
"grad_norm": 2.2111119256937877,
"learning_rate": 2.4155704383274154e-06,
"loss": 0.2437,
"step": 630
},
{
"epoch": 2.124230554001119,
"grad_norm": 2.3478963695246273,
"learning_rate": 2.3988027569455895e-06,
"loss": 0.2517,
"step": 631
},
{
"epoch": 2.12758813654169,
"grad_norm": 2.2461086864800106,
"learning_rate": 2.3820750891467355e-06,
"loss": 0.2333,
"step": 632
},
{
"epoch": 2.130945719082261,
"grad_norm": 2.2329675778706926,
"learning_rate": 2.365387692248488e-06,
"loss": 0.2359,
"step": 633
},
{
"epoch": 2.1343033016228317,
"grad_norm": 2.3145727443697934,
"learning_rate": 2.348740822949006e-06,
"loss": 0.2477,
"step": 634
},
{
"epoch": 2.1376608841634024,
"grad_norm": 2.1666894881929895,
"learning_rate": 2.33213473732302e-06,
"loss": 0.2442,
"step": 635
},
{
"epoch": 2.141018466703973,
"grad_norm": 2.215179935351511,
"learning_rate": 2.3155696908178974e-06,
"loss": 0.2492,
"step": 636
},
{
"epoch": 2.144376049244544,
"grad_norm": 2.0303810141253344,
"learning_rate": 2.2990459382497086e-06,
"loss": 0.2414,
"step": 637
},
{
"epoch": 2.1477336317851146,
"grad_norm": 2.2599318995246636,
"learning_rate": 2.2825637337993094e-06,
"loss": 0.2542,
"step": 638
},
{
"epoch": 2.1510912143256853,
"grad_norm": 2.229319114603247,
"learning_rate": 2.266123331008436e-06,
"loss": 0.2763,
"step": 639
},
{
"epoch": 2.1544487968662565,
"grad_norm": 2.018769817603059,
"learning_rate": 2.2497249827757933e-06,
"loss": 0.2279,
"step": 640
},
{
"epoch": 2.157806379406827,
"grad_norm": 2.1991246705651317,
"learning_rate": 2.233368941353175e-06,
"loss": 0.2415,
"step": 641
},
{
"epoch": 2.161163961947398,
"grad_norm": 2.2319026864631875,
"learning_rate": 2.2170554583415782e-06,
"loss": 0.2207,
"step": 642
},
{
"epoch": 2.1645215444879686,
"grad_norm": 2.1965612835784936,
"learning_rate": 2.2007847846873342e-06,
"loss": 0.2425,
"step": 643
},
{
"epoch": 2.1678791270285394,
"grad_norm": 2.1567182556620774,
"learning_rate": 2.1845571706782486e-06,
"loss": 0.2303,
"step": 644
},
{
"epoch": 2.17123670956911,
"grad_norm": 2.2622288541045683,
"learning_rate": 2.1683728659397517e-06,
"loss": 0.2429,
"step": 645
},
{
"epoch": 2.1745942921096812,
"grad_norm": 2.2558926541854176,
"learning_rate": 2.1522321194310577e-06,
"loss": 0.2541,
"step": 646
},
{
"epoch": 2.177951874650252,
"grad_norm": 2.2770654917423765,
"learning_rate": 2.1361351794413334e-06,
"loss": 0.2446,
"step": 647
},
{
"epoch": 2.1813094571908227,
"grad_norm": 2.173609001184362,
"learning_rate": 2.1200822935858807e-06,
"loss": 0.251,
"step": 648
},
{
"epoch": 2.1846670397313934,
"grad_norm": 2.2522011504738577,
"learning_rate": 2.1040737088023323e-06,
"loss": 0.2481,
"step": 649
},
{
"epoch": 2.188024622271964,
"grad_norm": 2.1688069367996596,
"learning_rate": 2.0881096713468435e-06,
"loss": 0.2486,
"step": 650
},
{
"epoch": 2.191382204812535,
"grad_norm": 2.3838615198983706,
"learning_rate": 2.0721904267903097e-06,
"loss": 0.2457,
"step": 651
},
{
"epoch": 2.1947397873531056,
"grad_norm": 2.1625681906768346,
"learning_rate": 2.056316220014588e-06,
"loss": 0.2271,
"step": 652
},
{
"epoch": 2.1980973698936763,
"grad_norm": 2.2745686739163014,
"learning_rate": 2.040487295208732e-06,
"loss": 0.2238,
"step": 653
},
{
"epoch": 2.2014549524342475,
"grad_norm": 2.0740837502881235,
"learning_rate": 2.024703895865232e-06,
"loss": 0.2633,
"step": 654
},
{
"epoch": 2.204812534974818,
"grad_norm": 2.348484545437271,
"learning_rate": 2.0089662647762716e-06,
"loss": 0.2502,
"step": 655
},
{
"epoch": 2.208170117515389,
"grad_norm": 2.1411611952162346,
"learning_rate": 1.9932746440299926e-06,
"loss": 0.2352,
"step": 656
},
{
"epoch": 2.2115277000559597,
"grad_norm": 2.1602600116386514,
"learning_rate": 1.977629275006772e-06,
"loss": 0.2214,
"step": 657
},
{
"epoch": 2.2148852825965304,
"grad_norm": 2.1825649500433104,
"learning_rate": 1.962030398375506e-06,
"loss": 0.2217,
"step": 658
},
{
"epoch": 2.218242865137101,
"grad_norm": 2.076444790094385,
"learning_rate": 1.946478254089911e-06,
"loss": 0.2327,
"step": 659
},
{
"epoch": 2.2216004476776723,
"grad_norm": 2.242188296768225,
"learning_rate": 1.9309730813848302e-06,
"loss": 0.2341,
"step": 660
},
{
"epoch": 2.224958030218243,
"grad_norm": 2.324463919600608,
"learning_rate": 1.915515118772555e-06,
"loss": 0.2367,
"step": 661
},
{
"epoch": 2.2283156127588137,
"grad_norm": 2.0513327269909487,
"learning_rate": 1.9001046040391558e-06,
"loss": 0.242,
"step": 662
},
{
"epoch": 2.2316731952993845,
"grad_norm": 2.344026468770851,
"learning_rate": 1.884741774240823e-06,
"loss": 0.2665,
"step": 663
},
{
"epoch": 2.235030777839955,
"grad_norm": 2.2216964878287735,
"learning_rate": 1.8694268657002197e-06,
"loss": 0.2433,
"step": 664
},
{
"epoch": 2.238388360380526,
"grad_norm": 2.1555814735411976,
"learning_rate": 1.8541601140028542e-06,
"loss": 0.2397,
"step": 665
},
{
"epoch": 2.2417459429210966,
"grad_norm": 2.1482906635963253,
"learning_rate": 1.8389417539934428e-06,
"loss": 0.2216,
"step": 666
},
{
"epoch": 2.245103525461668,
"grad_norm": 2.096021593139733,
"learning_rate": 1.8237720197723075e-06,
"loss": 0.2262,
"step": 667
},
{
"epoch": 2.2484611080022385,
"grad_norm": 2.266711561199349,
"learning_rate": 1.8086511446917715e-06,
"loss": 0.2343,
"step": 668
},
{
"epoch": 2.2518186905428093,
"grad_norm": 2.232410418505839,
"learning_rate": 1.7935793613525693e-06,
"loss": 0.2593,
"step": 669
},
{
"epoch": 2.25517627308338,
"grad_norm": 2.3428925980088264,
"learning_rate": 1.7785569016002686e-06,
"loss": 0.2743,
"step": 670
},
{
"epoch": 2.2585338556239507,
"grad_norm": 2.236303024740682,
"learning_rate": 1.7635839965217055e-06,
"loss": 0.2301,
"step": 671
},
{
"epoch": 2.2618914381645214,
"grad_norm": 2.0637903925635177,
"learning_rate": 1.748660876441428e-06,
"loss": 0.2643,
"step": 672
},
{
"epoch": 2.265249020705092,
"grad_norm": 2.2271012154627994,
"learning_rate": 1.7337877709181527e-06,
"loss": 0.2309,
"step": 673
},
{
"epoch": 2.2686066032456633,
"grad_norm": 2.1077879509424005,
"learning_rate": 1.7189649087412385e-06,
"loss": 0.261,
"step": 674
},
{
"epoch": 2.271964185786234,
"grad_norm": 2.2008818744280263,
"learning_rate": 1.7041925179271584e-06,
"loss": 0.2453,
"step": 675
},
{
"epoch": 2.2753217683268048,
"grad_norm": 2.2427383093212394,
"learning_rate": 1.689470825715998e-06,
"loss": 0.2349,
"step": 676
},
{
"epoch": 2.2786793508673755,
"grad_norm": 2.325006154923223,
"learning_rate": 1.6748000585679602e-06,
"loss": 0.2529,
"step": 677
},
{
"epoch": 2.282036933407946,
"grad_norm": 2.30699822949776,
"learning_rate": 1.6601804421598787e-06,
"loss": 0.2558,
"step": 678
},
{
"epoch": 2.285394515948517,
"grad_norm": 2.131117963004742,
"learning_rate": 1.6456122013817477e-06,
"loss": 0.2334,
"step": 679
},
{
"epoch": 2.2887520984890877,
"grad_norm": 2.1171412775183582,
"learning_rate": 1.631095560333264e-06,
"loss": 0.2431,
"step": 680
},
{
"epoch": 2.2921096810296584,
"grad_norm": 2.2433228629531774,
"learning_rate": 1.6166307423203765e-06,
"loss": 0.214,
"step": 681
},
{
"epoch": 2.2954672635702296,
"grad_norm": 2.268872598922477,
"learning_rate": 1.6022179698518525e-06,
"loss": 0.2401,
"step": 682
},
{
"epoch": 2.2988248461108003,
"grad_norm": 2.191932766219746,
"learning_rate": 1.5878574646358608e-06,
"loss": 0.2178,
"step": 683
},
{
"epoch": 2.302182428651371,
"grad_norm": 2.2800614694305144,
"learning_rate": 1.573549447576549e-06,
"loss": 0.2335,
"step": 684
},
{
"epoch": 2.3055400111919417,
"grad_norm": 2.3217546136753273,
"learning_rate": 1.5592941387706562e-06,
"loss": 0.2349,
"step": 685
},
{
"epoch": 2.3088975937325125,
"grad_norm": 2.2055299968173,
"learning_rate": 1.5450917575041209e-06,
"loss": 0.2461,
"step": 686
},
{
"epoch": 2.312255176273083,
"grad_norm": 2.1265013617268256,
"learning_rate": 1.5309425222487119e-06,
"loss": 0.2166,
"step": 687
},
{
"epoch": 2.3156127588136544,
"grad_norm": 2.1979179058845695,
"learning_rate": 1.5168466506586654e-06,
"loss": 0.2196,
"step": 688
},
{
"epoch": 2.318970341354225,
"grad_norm": 2.167123534236895,
"learning_rate": 1.502804359567337e-06,
"loss": 0.2427,
"step": 689
},
{
"epoch": 2.322327923894796,
"grad_norm": 2.3539399012866418,
"learning_rate": 1.4888158649838675e-06,
"loss": 0.2386,
"step": 690
},
{
"epoch": 2.3256855064353665,
"grad_norm": 2.1796719146281345,
"learning_rate": 1.4748813820898554e-06,
"loss": 0.236,
"step": 691
},
{
"epoch": 2.3290430889759373,
"grad_norm": 2.2205243451241,
"learning_rate": 1.4610011252360594e-06,
"loss": 0.2229,
"step": 692
},
{
"epoch": 2.332400671516508,
"grad_norm": 2.3094055557029494,
"learning_rate": 1.4471753079390815e-06,
"loss": 0.2396,
"step": 693
},
{
"epoch": 2.3357582540570787,
"grad_norm": 2.1939883736480157,
"learning_rate": 1.4334041428781003e-06,
"loss": 0.231,
"step": 694
},
{
"epoch": 2.33911583659765,
"grad_norm": 2.149486473333343,
"learning_rate": 1.4196878418915894e-06,
"loss": 0.2365,
"step": 695
},
{
"epoch": 2.3424734191382206,
"grad_norm": 2.2453237673213255,
"learning_rate": 1.4060266159740627e-06,
"loss": 0.2388,
"step": 696
},
{
"epoch": 2.3458310016787913,
"grad_norm": 2.423577584509045,
"learning_rate": 1.3924206752728282e-06,
"loss": 0.2401,
"step": 697
},
{
"epoch": 2.349188584219362,
"grad_norm": 2.4806635634108187,
"learning_rate": 1.3788702290847517e-06,
"loss": 0.2429,
"step": 698
},
{
"epoch": 2.3525461667599328,
"grad_norm": 2.380640333144661,
"learning_rate": 1.3653754858530477e-06,
"loss": 0.2258,
"step": 699
},
{
"epoch": 2.3559037493005035,
"grad_norm": 2.2491745778254066,
"learning_rate": 1.3519366531640589e-06,
"loss": 0.2622,
"step": 700
},
{
"epoch": 2.3592613318410747,
"grad_norm": 2.316018513747914,
"learning_rate": 1.3385539377440709e-06,
"loss": 0.248,
"step": 701
},
{
"epoch": 2.3626189143816454,
"grad_norm": 2.314430211194231,
"learning_rate": 1.3252275454561337e-06,
"loss": 0.2536,
"step": 702
},
{
"epoch": 2.365976496922216,
"grad_norm": 2.1763408291528674,
"learning_rate": 1.3119576812968893e-06,
"loss": 0.2403,
"step": 703
},
{
"epoch": 2.369334079462787,
"grad_norm": 2.1416964805672283,
"learning_rate": 1.2987445493934236e-06,
"loss": 0.2273,
"step": 704
},
{
"epoch": 2.3726916620033576,
"grad_norm": 2.1683455218648358,
"learning_rate": 1.2855883530001228e-06,
"loss": 0.2423,
"step": 705
},
{
"epoch": 2.3760492445439283,
"grad_norm": 2.2242131906759597,
"learning_rate": 1.272489294495548e-06,
"loss": 0.2404,
"step": 706
},
{
"epoch": 2.379406827084499,
"grad_norm": 2.1865211099221553,
"learning_rate": 1.2594475753793211e-06,
"loss": 0.2483,
"step": 707
},
{
"epoch": 2.3827644096250697,
"grad_norm": 2.0946660616224815,
"learning_rate": 1.2464633962690304e-06,
"loss": 0.255,
"step": 708
},
{
"epoch": 2.386121992165641,
"grad_norm": 2.2231751389825463,
"learning_rate": 1.2335369568971362e-06,
"loss": 0.2343,
"step": 709
},
{
"epoch": 2.3894795747062116,
"grad_norm": 2.312671362781463,
"learning_rate": 1.2206684561079035e-06,
"loss": 0.2408,
"step": 710
},
{
"epoch": 2.3928371572467824,
"grad_norm": 2.179660173524886,
"learning_rate": 1.207858091854342e-06,
"loss": 0.2383,
"step": 711
},
{
"epoch": 2.396194739787353,
"grad_norm": 2.1293264857856555,
"learning_rate": 1.1951060611951615e-06,
"loss": 0.23,
"step": 712
},
{
"epoch": 2.399552322327924,
"grad_norm": 2.1755461849480446,
"learning_rate": 1.1824125602917414e-06,
"loss": 0.2354,
"step": 713
},
{
"epoch": 2.4029099048684945,
"grad_norm": 1.9840827894505013,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.2284,
"step": 714
},
{
"epoch": 2.4062674874090657,
"grad_norm": 2.0944031352778643,
"learning_rate": 1.1572019278929457e-06,
"loss": 0.2357,
"step": 715
},
{
"epoch": 2.4096250699496364,
"grad_norm": 2.286049493049029,
"learning_rate": 1.1446851842065804e-06,
"loss": 0.2219,
"step": 716
},
{
"epoch": 2.412982652490207,
"grad_norm": 2.195180056991891,
"learning_rate": 1.1322277458880337e-06,
"loss": 0.2443,
"step": 717
},
{
"epoch": 2.416340235030778,
"grad_norm": 2.360744350880101,
"learning_rate": 1.1198298045670402e-06,
"loss": 0.2307,
"step": 718
},
{
"epoch": 2.4196978175713486,
"grad_norm": 2.27588589395575,
"learning_rate": 1.1074915509581086e-06,
"loss": 0.2218,
"step": 719
},
{
"epoch": 2.4230554001119193,
"grad_norm": 2.2915120318076894,
"learning_rate": 1.0952131748575855e-06,
"loss": 0.2348,
"step": 720
},
{
"epoch": 2.42641298265249,
"grad_norm": 2.3740865208478428,
"learning_rate": 1.0829948651407374e-06,
"loss": 0.233,
"step": 721
},
{
"epoch": 2.429770565193061,
"grad_norm": 2.309847767416398,
"learning_rate": 1.0708368097588435e-06,
"loss": 0.2411,
"step": 722
},
{
"epoch": 2.433128147733632,
"grad_norm": 2.1585332492732703,
"learning_rate": 1.0587391957363053e-06,
"loss": 0.2689,
"step": 723
},
{
"epoch": 2.4364857302742027,
"grad_norm": 2.2620198419443645,
"learning_rate": 1.0467022091677692e-06,
"loss": 0.2386,
"step": 724
},
{
"epoch": 2.4398433128147734,
"grad_norm": 2.4789714742421998,
"learning_rate": 1.0347260352152644e-06,
"loss": 0.2542,
"step": 725
},
{
"epoch": 2.443200895355344,
"grad_norm": 2.233580671739803,
"learning_rate": 1.0228108581053565e-06,
"loss": 0.2342,
"step": 726
},
{
"epoch": 2.446558477895915,
"grad_norm": 2.1024662676881314,
"learning_rate": 1.0109568611263094e-06,
"loss": 0.222,
"step": 727
},
{
"epoch": 2.4499160604364856,
"grad_norm": 2.276521915661851,
"learning_rate": 9.991642266252672e-07,
"loss": 0.2099,
"step": 728
},
{
"epoch": 2.4532736429770567,
"grad_norm": 2.260563206399162,
"learning_rate": 9.87433136005454e-07,
"loss": 0.2548,
"step": 729
},
{
"epoch": 2.4566312255176275,
"grad_norm": 2.156550439442849,
"learning_rate": 9.757637697233723e-07,
"loss": 0.2211,
"step": 730
},
{
"epoch": 2.459988808058198,
"grad_norm": 2.2236291372113866,
"learning_rate": 9.641563072860416e-07,
"loss": 0.2258,
"step": 731
},
{
"epoch": 2.463346390598769,
"grad_norm": 2.1205062426646437,
"learning_rate": 9.526109272482237e-07,
"loss": 0.2201,
"step": 732
},
{
"epoch": 2.4667039731393396,
"grad_norm": 2.3294849171122567,
"learning_rate": 9.41127807209688e-07,
"loss": 0.2303,
"step": 733
},
{
"epoch": 2.4700615556799104,
"grad_norm": 2.127623518916985,
"learning_rate": 9.297071238124683e-07,
"loss": 0.2374,
"step": 734
},
{
"epoch": 2.473419138220481,
"grad_norm": 2.358371434685423,
"learning_rate": 9.183490527381539e-07,
"loss": 0.2415,
"step": 735
},
{
"epoch": 2.476776720761052,
"grad_norm": 2.3348118958807014,
"learning_rate": 9.070537687051817e-07,
"loss": 0.2253,
"step": 736
},
{
"epoch": 2.480134303301623,
"grad_norm": 2.254796435173114,
"learning_rate": 8.958214454661529e-07,
"loss": 0.2474,
"step": 737
},
{
"epoch": 2.4834918858421937,
"grad_norm": 2.135376686047964,
"learning_rate": 8.846522558051563e-07,
"loss": 0.2193,
"step": 738
},
{
"epoch": 2.4868494683827644,
"grad_norm": 2.4255834301642745,
"learning_rate": 8.735463715351139e-07,
"loss": 0.2569,
"step": 739
},
{
"epoch": 2.490207050923335,
"grad_norm": 2.1786279330493694,
"learning_rate": 8.625039634951354e-07,
"loss": 0.2388,
"step": 740
},
{
"epoch": 2.493564633463906,
"grad_norm": 2.3684247953727144,
"learning_rate": 8.515252015478915e-07,
"loss": 0.2432,
"step": 741
},
{
"epoch": 2.4969222160044766,
"grad_norm": 2.1452983004533706,
"learning_rate": 8.406102545769989e-07,
"loss": 0.2361,
"step": 742
},
{
"epoch": 2.500279798545048,
"grad_norm": 2.239393813510702,
"learning_rate": 8.297592904844282e-07,
"loss": 0.2169,
"step": 743
},
{
"epoch": 2.5036373810856185,
"grad_norm": 2.1080838250603233,
"learning_rate": 8.189724761879131e-07,
"loss": 0.2402,
"step": 744
},
{
"epoch": 2.5069949636261892,
"grad_norm": 2.205253345024292,
"learning_rate": 8.082499776183883e-07,
"loss": 0.2345,
"step": 745
},
{
"epoch": 2.51035254616676,
"grad_norm": 2.236843913141464,
"learning_rate": 7.975919597174342e-07,
"loss": 0.2272,
"step": 746
},
{
"epoch": 2.5137101287073307,
"grad_norm": 2.1687504666441257,
"learning_rate": 7.869985864347424e-07,
"loss": 0.2304,
"step": 747
},
{
"epoch": 2.5170677112479014,
"grad_norm": 2.32345363923919,
"learning_rate": 7.764700207255904e-07,
"loss": 0.2409,
"step": 748
},
{
"epoch": 2.520425293788472,
"grad_norm": 2.2137378690316605,
"learning_rate": 7.660064245483384e-07,
"loss": 0.2273,
"step": 749
},
{
"epoch": 2.523782876329043,
"grad_norm": 2.261904827600831,
"learning_rate": 7.556079588619341e-07,
"loss": 0.2219,
"step": 750
},
{
"epoch": 2.527140458869614,
"grad_norm": 2.2075096631540245,
"learning_rate": 7.452747836234392e-07,
"loss": 0.2234,
"step": 751
},
{
"epoch": 2.5304980414101848,
"grad_norm": 2.2106003656998707,
"learning_rate": 7.350070577855716e-07,
"loss": 0.2485,
"step": 752
},
{
"epoch": 2.5338556239507555,
"grad_norm": 2.262010781586963,
"learning_rate": 7.24804939294253e-07,
"loss": 0.2405,
"step": 753
},
{
"epoch": 2.537213206491326,
"grad_norm": 2.1267348431476387,
"learning_rate": 7.146685850861851e-07,
"loss": 0.2394,
"step": 754
},
{
"epoch": 2.540570789031897,
"grad_norm": 2.305882442336494,
"learning_rate": 7.045981510864319e-07,
"loss": 0.2528,
"step": 755
},
{
"epoch": 2.543928371572468,
"grad_norm": 2.018265078425514,
"learning_rate": 6.945937922060259e-07,
"loss": 0.233,
"step": 756
},
{
"epoch": 2.547285954113039,
"grad_norm": 2.2614071614786666,
"learning_rate": 6.846556623395795e-07,
"loss": 0.222,
"step": 757
},
{
"epoch": 2.5506435366536095,
"grad_norm": 2.1351635366635158,
"learning_rate": 6.74783914362922e-07,
"loss": 0.2273,
"step": 758
},
{
"epoch": 2.5540011191941803,
"grad_norm": 2.100876831365208,
"learning_rate": 6.649787001307451e-07,
"loss": 0.2072,
"step": 759
},
{
"epoch": 2.557358701734751,
"grad_norm": 2.1077270882398524,
"learning_rate": 6.552401704742678e-07,
"loss": 0.2147,
"step": 760
},
{
"epoch": 2.5607162842753217,
"grad_norm": 2.185148590555281,
"learning_rate": 6.455684751989194e-07,
"loss": 0.2387,
"step": 761
},
{
"epoch": 2.5640738668158924,
"grad_norm": 2.4273332489758714,
"learning_rate": 6.359637630820292e-07,
"loss": 0.2187,
"step": 762
},
{
"epoch": 2.567431449356463,
"grad_norm": 2.2410879666266945,
"learning_rate": 6.26426181870542e-07,
"loss": 0.2356,
"step": 763
},
{
"epoch": 2.570789031897034,
"grad_norm": 2.0546535173663236,
"learning_rate": 6.169558782787438e-07,
"loss": 0.2134,
"step": 764
},
{
"epoch": 2.574146614437605,
"grad_norm": 2.2712662103864667,
"learning_rate": 6.075529979860068e-07,
"loss": 0.2434,
"step": 765
},
{
"epoch": 2.577504196978176,
"grad_norm": 2.495990931424611,
"learning_rate": 5.982176856345445e-07,
"loss": 0.2572,
"step": 766
},
{
"epoch": 2.5808617795187465,
"grad_norm": 2.272797069007875,
"learning_rate": 5.889500848271901e-07,
"loss": 0.2365,
"step": 767
},
{
"epoch": 2.5842193620593172,
"grad_norm": 2.2261054362951573,
"learning_rate": 5.797503381251896e-07,
"loss": 0.2345,
"step": 768
},
{
"epoch": 2.587576944599888,
"grad_norm": 2.2214663136739072,
"learning_rate": 5.706185870460018e-07,
"loss": 0.2582,
"step": 769
},
{
"epoch": 2.590934527140459,
"grad_norm": 2.2509221328745666,
"learning_rate": 5.61554972061128e-07,
"loss": 0.2405,
"step": 770
},
{
"epoch": 2.59429210968103,
"grad_norm": 1.9882931292487553,
"learning_rate": 5.525596325939469e-07,
"loss": 0.2074,
"step": 771
},
{
"epoch": 2.5976496922216006,
"grad_norm": 2.2641730214593667,
"learning_rate": 5.436327070175729e-07,
"loss": 0.2264,
"step": 772
},
{
"epoch": 2.6010072747621713,
"grad_norm": 2.1593141034556607,
"learning_rate": 5.347743326527255e-07,
"loss": 0.2334,
"step": 773
},
{
"epoch": 2.604364857302742,
"grad_norm": 2.160652045504084,
"learning_rate": 5.25984645765617e-07,
"loss": 0.2348,
"step": 774
},
{
"epoch": 2.6077224398433128,
"grad_norm": 2.1624773063108873,
"learning_rate": 5.172637815658583e-07,
"loss": 0.2046,
"step": 775
},
{
"epoch": 2.6110800223838835,
"grad_norm": 2.2467642799151624,
"learning_rate": 5.086118742043761e-07,
"loss": 0.2521,
"step": 776
},
{
"epoch": 2.614437604924454,
"grad_norm": 2.1254445389657435,
"learning_rate": 5.000290567713533e-07,
"loss": 0.2209,
"step": 777
},
{
"epoch": 2.617795187465025,
"grad_norm": 2.331029167817777,
"learning_rate": 4.915154612941781e-07,
"loss": 0.2461,
"step": 778
},
{
"epoch": 2.621152770005596,
"grad_norm": 2.225940066081278,
"learning_rate": 4.830712187354125e-07,
"loss": 0.2521,
"step": 779
},
{
"epoch": 2.624510352546167,
"grad_norm": 2.1960281391785768,
"learning_rate": 4.7469645899078153e-07,
"loss": 0.2081,
"step": 780
},
{
"epoch": 2.6278679350867375,
"grad_norm": 2.202588232752528,
"learning_rate": 4.663913108871726e-07,
"loss": 0.2217,
"step": 781
},
{
"epoch": 2.6312255176273083,
"grad_norm": 2.1413814255877788,
"learning_rate": 4.581559021806542e-07,
"loss": 0.2279,
"step": 782
},
{
"epoch": 2.634583100167879,
"grad_norm": 2.2575040003842877,
"learning_rate": 4.4999035955450964e-07,
"loss": 0.2507,
"step": 783
},
{
"epoch": 2.63794068270845,
"grad_norm": 2.29100443191833,
"learning_rate": 4.4189480861729137e-07,
"loss": 0.247,
"step": 784
},
{
"epoch": 2.641298265249021,
"grad_norm": 2.0949207855222625,
"learning_rate": 4.3386937390088366e-07,
"loss": 0.205,
"step": 785
},
{
"epoch": 2.6446558477895916,
"grad_norm": 2.3016984109745064,
"learning_rate": 4.259141788585947e-07,
"loss": 0.2436,
"step": 786
},
{
"epoch": 2.6480134303301623,
"grad_norm": 2.2773955253769276,
"learning_rate": 4.1802934586324897e-07,
"loss": 0.2329,
"step": 787
},
{
"epoch": 2.651371012870733,
"grad_norm": 2.2679994509516543,
"learning_rate": 4.102149962053098e-07,
"loss": 0.2416,
"step": 788
},
{
"epoch": 2.654728595411304,
"grad_norm": 2.134297523257456,
"learning_rate": 4.0247125009101275e-07,
"loss": 0.2384,
"step": 789
},
{
"epoch": 2.6580861779518745,
"grad_norm": 2.3727877129494774,
"learning_rate": 3.947982266405159e-07,
"loss": 0.2313,
"step": 790
},
{
"epoch": 2.6614437604924452,
"grad_norm": 2.1765597243225288,
"learning_rate": 3.871960438860689e-07,
"loss": 0.2257,
"step": 791
},
{
"epoch": 2.664801343033016,
"grad_norm": 2.313798954233314,
"learning_rate": 3.796648187701957e-07,
"loss": 0.2436,
"step": 792
},
{
"epoch": 2.668158925573587,
"grad_norm": 2.3444816164677595,
"learning_rate": 3.72204667143895e-07,
"loss": 0.2534,
"step": 793
},
{
"epoch": 2.671516508114158,
"grad_norm": 2.29388898580906,
"learning_rate": 3.648157037648598e-07,
"loss": 0.2159,
"step": 794
},
{
"epoch": 2.6748740906547286,
"grad_norm": 2.184396762194593,
"learning_rate": 3.574980422957147e-07,
"loss": 0.2151,
"step": 795
},
{
"epoch": 2.6782316731952993,
"grad_norm": 2.3518739979849683,
"learning_rate": 3.5025179530225995e-07,
"loss": 0.2236,
"step": 796
},
{
"epoch": 2.68158925573587,
"grad_norm": 2.2064933728915213,
"learning_rate": 3.43077074251747e-07,
"loss": 0.2305,
"step": 797
},
{
"epoch": 2.684946838276441,
"grad_norm": 2.340839131571328,
"learning_rate": 3.359739895111602e-07,
"loss": 0.2451,
"step": 798
},
{
"epoch": 2.688304420817012,
"grad_norm": 2.2538444477416055,
"learning_rate": 3.289426503455201e-07,
"loss": 0.2234,
"step": 799
},
{
"epoch": 2.6916620033575827,
"grad_norm": 2.244758740164714,
"learning_rate": 3.2198316491620305e-07,
"loss": 0.2294,
"step": 800
},
{
"epoch": 2.6950195858981534,
"grad_norm": 2.149355042894359,
"learning_rate": 3.150956402792765e-07,
"loss": 0.2216,
"step": 801
},
{
"epoch": 2.698377168438724,
"grad_norm": 2.259174848777998,
"learning_rate": 3.082801823838527e-07,
"loss": 0.2268,
"step": 802
},
{
"epoch": 2.701734750979295,
"grad_norm": 2.2477921891179204,
"learning_rate": 3.015368960704584e-07,
"loss": 0.242,
"step": 803
},
{
"epoch": 2.7050923335198656,
"grad_norm": 2.1272415575213643,
"learning_rate": 2.9486588506942303e-07,
"loss": 0.2342,
"step": 804
},
{
"epoch": 2.7084499160604363,
"grad_norm": 2.334010741930345,
"learning_rate": 2.882672519992824e-07,
"loss": 0.2285,
"step": 805
},
{
"epoch": 2.711807498601007,
"grad_norm": 2.427045189554706,
"learning_rate": 2.817410983651997e-07,
"loss": 0.2562,
"step": 806
},
{
"epoch": 2.715165081141578,
"grad_norm": 2.062462864366435,
"learning_rate": 2.7528752455740606e-07,
"loss": 0.1984,
"step": 807
},
{
"epoch": 2.718522663682149,
"grad_norm": 2.117107968846607,
"learning_rate": 2.6890662984965234e-07,
"loss": 0.2167,
"step": 808
},
{
"epoch": 2.7218802462227196,
"grad_norm": 2.172976071364664,
"learning_rate": 2.625985123976876e-07,
"loss": 0.2312,
"step": 809
},
{
"epoch": 2.7252378287632903,
"grad_norm": 2.1650501300436438,
"learning_rate": 2.5636326923774325e-07,
"loss": 0.2423,
"step": 810
},
{
"epoch": 2.728595411303861,
"grad_norm": 2.212412244165142,
"learning_rate": 2.5020099628504603e-07,
"loss": 0.2185,
"step": 811
},
{
"epoch": 2.7319529938444322,
"grad_norm": 2.1280889488629287,
"learning_rate": 2.441117883323374e-07,
"loss": 0.2413,
"step": 812
},
{
"epoch": 2.735310576385003,
"grad_norm": 2.18787418811109,
"learning_rate": 2.3809573904841844e-07,
"loss": 0.233,
"step": 813
},
{
"epoch": 2.7386681589255737,
"grad_norm": 2.2553778724383737,
"learning_rate": 2.3215294097670927e-07,
"loss": 0.2236,
"step": 814
},
{
"epoch": 2.7420257414661444,
"grad_norm": 2.080459053836358,
"learning_rate": 2.262834855338225e-07,
"loss": 0.2376,
"step": 815
},
{
"epoch": 2.745383324006715,
"grad_norm": 2.165809723126224,
"learning_rate": 2.204874630081616e-07,
"loss": 0.2225,
"step": 816
},
{
"epoch": 2.748740906547286,
"grad_norm": 1.956393897313748,
"learning_rate": 2.1476496255852685e-07,
"loss": 0.233,
"step": 817
},
{
"epoch": 2.7520984890878566,
"grad_norm": 2.101187708321008,
"learning_rate": 2.091160722127472e-07,
"loss": 0.2233,
"step": 818
},
{
"epoch": 2.7554560716284273,
"grad_norm": 2.328065045528626,
"learning_rate": 2.0354087886632623e-07,
"loss": 0.2371,
"step": 819
},
{
"epoch": 2.7588136541689985,
"grad_norm": 2.1295011524399086,
"learning_rate": 1.9803946828110376e-07,
"loss": 0.2408,
"step": 820
},
{
"epoch": 2.762171236709569,
"grad_norm": 2.0812004019092822,
"learning_rate": 1.9261192508393755e-07,
"loss": 0.2211,
"step": 821
},
{
"epoch": 2.76552881925014,
"grad_norm": 2.110225954008512,
"learning_rate": 1.8725833276540095e-07,
"loss": 0.2328,
"step": 822
},
{
"epoch": 2.7688864017907107,
"grad_norm": 2.209831203361078,
"learning_rate": 1.8197877367849948e-07,
"loss": 0.2424,
"step": 823
},
{
"epoch": 2.7722439843312814,
"grad_norm": 2.1406558326813103,
"learning_rate": 1.7677332903740296e-07,
"loss": 0.2293,
"step": 824
},
{
"epoch": 2.775601566871852,
"grad_norm": 2.160818736774281,
"learning_rate": 1.7164207891619823e-07,
"loss": 0.2265,
"step": 825
},
{
"epoch": 2.7789591494124233,
"grad_norm": 2.1902094381897177,
"learning_rate": 1.6658510224765333e-07,
"loss": 0.2253,
"step": 826
},
{
"epoch": 2.782316731952994,
"grad_norm": 2.311103825756955,
"learning_rate": 1.6160247682200813e-07,
"loss": 0.2455,
"step": 827
},
{
"epoch": 2.7856743144935647,
"grad_norm": 2.218083782097005,
"learning_rate": 1.566942792857745e-07,
"loss": 0.2233,
"step": 828
},
{
"epoch": 2.7890318970341355,
"grad_norm": 2.2738455358176277,
"learning_rate": 1.5186058514055912e-07,
"loss": 0.2399,
"step": 829
},
{
"epoch": 2.792389479574706,
"grad_norm": 2.193046078475537,
"learning_rate": 1.471014687418998e-07,
"loss": 0.219,
"step": 830
},
{
"epoch": 2.795747062115277,
"grad_norm": 2.1748698706908773,
"learning_rate": 1.4241700329812368e-07,
"loss": 0.2208,
"step": 831
},
{
"epoch": 2.7991046446558476,
"grad_norm": 2.25948599781455,
"learning_rate": 1.3780726086922103e-07,
"loss": 0.2205,
"step": 832
},
{
"epoch": 2.8024622271964184,
"grad_norm": 2.1627090816106906,
"learning_rate": 1.332723123657348e-07,
"loss": 0.2155,
"step": 833
},
{
"epoch": 2.8058198097369895,
"grad_norm": 2.1556941222951918,
"learning_rate": 1.288122275476733e-07,
"loss": 0.2209,
"step": 834
},
{
"epoch": 2.8091773922775602,
"grad_norm": 2.199934634740392,
"learning_rate": 1.244270750234333e-07,
"loss": 0.2362,
"step": 835
},
{
"epoch": 2.812534974818131,
"grad_norm": 2.265457178349347,
"learning_rate": 1.201169222487464e-07,
"loss": 0.2395,
"step": 836
},
{
"epoch": 2.8158925573587017,
"grad_norm": 2.243639772812836,
"learning_rate": 1.1588183552564247e-07,
"loss": 0.2251,
"step": 837
},
{
"epoch": 2.8192501398992724,
"grad_norm": 2.1527813152932174,
"learning_rate": 1.1172188000142803e-07,
"loss": 0.2434,
"step": 838
},
{
"epoch": 2.822607722439843,
"grad_norm": 2.290823128697244,
"learning_rate": 1.0763711966768453e-07,
"loss": 0.2078,
"step": 839
},
{
"epoch": 2.8259653049804143,
"grad_norm": 2.108924814493277,
"learning_rate": 1.0362761735928372e-07,
"loss": 0.2209,
"step": 840
},
{
"epoch": 2.829322887520985,
"grad_norm": 2.2692353365060574,
"learning_rate": 9.969343475342285e-08,
"loss": 0.2413,
"step": 841
},
{
"epoch": 2.8326804700615558,
"grad_norm": 2.320890260637866,
"learning_rate": 9.583463236867318e-08,
"loss": 0.2405,
"step": 842
},
{
"epoch": 2.8360380526021265,
"grad_norm": 2.114878278170637,
"learning_rate": 9.205126956405075e-08,
"loss": 0.2122,
"step": 843
},
{
"epoch": 2.839395635142697,
"grad_norm": 2.1364581985268356,
"learning_rate": 8.834340453810375e-08,
"loss": 0.2173,
"step": 844
},
{
"epoch": 2.842753217683268,
"grad_norm": 2.327798293544225,
"learning_rate": 8.471109432801494e-08,
"loss": 0.2305,
"step": 845
},
{
"epoch": 2.8461108002238387,
"grad_norm": 2.1784789640309614,
"learning_rate": 8.11543948087279e-08,
"loss": 0.2238,
"step": 846
},
{
"epoch": 2.8494683827644094,
"grad_norm": 2.3102169432150563,
"learning_rate": 7.76733606920832e-08,
"loss": 0.2158,
"step": 847
},
{
"epoch": 2.8528259653049806,
"grad_norm": 2.0676705717179877,
"learning_rate": 7.426804552598088e-08,
"loss": 0.2276,
"step": 848
},
{
"epoch": 2.8561835478455513,
"grad_norm": 2.321852646027188,
"learning_rate": 7.093850169355266e-08,
"loss": 0.2412,
"step": 849
},
{
"epoch": 2.859541130386122,
"grad_norm": 2.129502189630901,
"learning_rate": 6.768478041236037e-08,
"loss": 0.2102,
"step": 850
},
{
"epoch": 2.8628987129266927,
"grad_norm": 2.432479560424763,
"learning_rate": 6.450693173360445e-08,
"loss": 0.219,
"step": 851
},
{
"epoch": 2.8662562954672635,
"grad_norm": 2.246241784383759,
"learning_rate": 6.140500454135668e-08,
"loss": 0.2172,
"step": 852
},
{
"epoch": 2.8696138780078346,
"grad_norm": 2.2178086164936777,
"learning_rate": 5.8379046551807486e-08,
"loss": 0.2355,
"step": 853
},
{
"epoch": 2.8729714605484054,
"grad_norm": 2.079524766097902,
"learning_rate": 5.542910431252935e-08,
"loss": 0.2208,
"step": 854
},
{
"epoch": 2.876329043088976,
"grad_norm": 2.230864146180223,
"learning_rate": 5.255522320176565e-08,
"loss": 0.2268,
"step": 855
},
{
"epoch": 2.879686625629547,
"grad_norm": 2.1326498447596496,
"learning_rate": 4.975744742772848e-08,
"loss": 0.2182,
"step": 856
},
{
"epoch": 2.8830442081701175,
"grad_norm": 2.1859329833247467,
"learning_rate": 4.7035820027920284e-08,
"loss": 0.2364,
"step": 857
},
{
"epoch": 2.8864017907106883,
"grad_norm": 2.2460754412578505,
"learning_rate": 4.439038286847164e-08,
"loss": 0.2222,
"step": 858
},
{
"epoch": 2.889759373251259,
"grad_norm": 2.0025699472964504,
"learning_rate": 4.182117664349783e-08,
"loss": 0.2378,
"step": 859
},
{
"epoch": 2.8931169557918297,
"grad_norm": 2.3077570677713948,
"learning_rate": 3.9328240874471624e-08,
"loss": 0.2254,
"step": 860
},
{
"epoch": 2.8964745383324004,
"grad_norm": 2.0838649182430946,
"learning_rate": 3.6911613909616505e-08,
"loss": 0.2131,
"step": 861
},
{
"epoch": 2.8998321208729716,
"grad_norm": 2.2289951432899024,
"learning_rate": 3.457133292331494e-08,
"loss": 0.2288,
"step": 862
},
{
"epoch": 2.9031897034135423,
"grad_norm": 2.1558465137002654,
"learning_rate": 3.230743391553881e-08,
"loss": 0.2181,
"step": 863
},
{
"epoch": 2.906547285954113,
"grad_norm": 2.1450777488580774,
"learning_rate": 3.011995171129545e-08,
"loss": 0.2055,
"step": 864
},
{
"epoch": 2.9099048684946838,
"grad_norm": 2.1709969058929484,
"learning_rate": 2.8008919960090253e-08,
"loss": 0.2475,
"step": 865
},
{
"epoch": 2.9132624510352545,
"grad_norm": 1.9687943814735813,
"learning_rate": 2.5974371135408792e-08,
"loss": 0.2006,
"step": 866
},
{
"epoch": 2.9166200335758257,
"grad_norm": 2.2277357096196204,
"learning_rate": 2.401633653422053e-08,
"loss": 0.2245,
"step": 867
},
{
"epoch": 2.9199776161163964,
"grad_norm": 2.225179238729893,
"learning_rate": 2.2134846276494205e-08,
"loss": 0.2628,
"step": 868
},
{
"epoch": 2.923335198656967,
"grad_norm": 2.233813263586234,
"learning_rate": 2.032992930473543e-08,
"loss": 0.2367,
"step": 869
},
{
"epoch": 2.926692781197538,
"grad_norm": 2.192104481598423,
"learning_rate": 1.860161338354205e-08,
"loss": 0.221,
"step": 870
},
{
"epoch": 2.9300503637381086,
"grad_norm": 2.2721116115550117,
"learning_rate": 1.69499250991767e-08,
"loss": 0.2229,
"step": 871
},
{
"epoch": 2.9334079462786793,
"grad_norm": 2.069015317512474,
"learning_rate": 1.5374889859157137e-08,
"loss": 0.2026,
"step": 872
},
{
"epoch": 2.93676552881925,
"grad_norm": 2.226727947199419,
"learning_rate": 1.3876531891867106e-08,
"loss": 0.2329,
"step": 873
},
{
"epoch": 2.9401231113598207,
"grad_norm": 2.3274573956235862,
"learning_rate": 1.2454874246181081e-08,
"loss": 0.2307,
"step": 874
},
{
"epoch": 2.9434806939003915,
"grad_norm": 2.2826046277606267,
"learning_rate": 1.1109938791112328e-08,
"loss": 0.2381,
"step": 875
},
{
"epoch": 2.9468382764409626,
"grad_norm": 2.1682098280901574,
"learning_rate": 9.841746215474845e-09,
"loss": 0.2414,
"step": 876
},
{
"epoch": 2.9501958589815334,
"grad_norm": 2.023824966326867,
"learning_rate": 8.650316027566386e-09,
"loss": 0.2139,
"step": 877
},
{
"epoch": 2.953553441522104,
"grad_norm": 2.1379663757918106,
"learning_rate": 7.535666554866483e-09,
"loss": 0.2369,
"step": 878
},
{
"epoch": 2.956911024062675,
"grad_norm": 2.338410010088983,
"learning_rate": 6.497814943756675e-09,
"loss": 0.2422,
"step": 879
},
{
"epoch": 2.9602686066032455,
"grad_norm": 2.069543683427084,
"learning_rate": 5.536777159254603e-09,
"loss": 0.2214,
"step": 880
},
{
"epoch": 2.9636261891438167,
"grad_norm": 2.3978229945182914,
"learning_rate": 4.652567984770873e-09,
"loss": 0.2285,
"step": 881
},
{
"epoch": 2.9669837716843874,
"grad_norm": 2.177819105580462,
"learning_rate": 3.845201021879241e-09,
"loss": 0.2206,
"step": 882
},
{
"epoch": 2.970341354224958,
"grad_norm": 2.232082224924113,
"learning_rate": 3.1146886901090024e-09,
"loss": 0.2267,
"step": 883
},
{
"epoch": 2.973698936765529,
"grad_norm": 2.3050073009713854,
"learning_rate": 2.461042226752919e-09,
"loss": 0.2457,
"step": 884
},
{
"epoch": 2.9770565193060996,
"grad_norm": 2.1979166006413515,
"learning_rate": 1.8842716866956935e-09,
"loss": 0.2412,
"step": 885
},
{
"epoch": 2.9804141018466703,
"grad_norm": 2.255837191828035,
"learning_rate": 1.3843859422574269e-09,
"loss": 0.2331,
"step": 886
},
{
"epoch": 2.983771684387241,
"grad_norm": 2.342133934859939,
"learning_rate": 9.613926830587262e-10,
"loss": 0.2382,
"step": 887
},
{
"epoch": 2.9871292669278118,
"grad_norm": 2.280659517009034,
"learning_rate": 6.152984159024655e-10,
"loss": 0.2454,
"step": 888
},
{
"epoch": 2.9904868494683825,
"grad_norm": 2.3892948295661443,
"learning_rate": 3.4610846467109106e-10,
"loss": 0.2257,
"step": 889
},
{
"epoch": 2.9938444320089537,
"grad_norm": 2.2858765844038023,
"learning_rate": 1.538269702494599e-10,
"loss": 0.2244,
"step": 890
},
{
"epoch": 2.9972020145495244,
"grad_norm": 2.298089033956775,
"learning_rate": 3.8456890455451646e-11,
"loss": 0.2323,
"step": 891
},
{
"epoch": 2.9972020145495244,
"step": 891,
"total_flos": 1.9585051653255987e+17,
"train_loss": 0.5026469534026787,
"train_runtime": 9007.4522,
"train_samples_per_second": 4.76,
"train_steps_per_second": 0.099
}
],
"logging_steps": 1,
"max_steps": 891,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9585051653255987e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}