LRAT-e5-instruct / trainer_state.json
Yuqi-Zhou's picture
Upload folder using huggingface_hub
79e29c4 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 752,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026595744680851063,
"grad_norm": 7.823265044222166,
"learning_rate": 0.0,
"loss": 3.0134,
"step": 1
},
{
"epoch": 0.005319148936170213,
"grad_norm": 7.642957709635029,
"learning_rate": 1.6005307325482135e-07,
"loss": 3.1765,
"step": 2
},
{
"epoch": 0.007978723404255319,
"grad_norm": 7.334880781186477,
"learning_rate": 2.5367811923406806e-07,
"loss": 3.015,
"step": 3
},
{
"epoch": 0.010638297872340425,
"grad_norm": 7.622164840160959,
"learning_rate": 3.201061465096427e-07,
"loss": 3.0191,
"step": 4
},
{
"epoch": 0.013297872340425532,
"grad_norm": 7.559561166288389,
"learning_rate": 3.716317274634347e-07,
"loss": 3.0604,
"step": 5
},
{
"epoch": 0.015957446808510637,
"grad_norm": 7.537486932594524,
"learning_rate": 4.137311924888894e-07,
"loss": 3.0974,
"step": 6
},
{
"epoch": 0.018617021276595744,
"grad_norm": 7.852202340999875,
"learning_rate": 4.4932578299236894e-07,
"loss": 3.0015,
"step": 7
},
{
"epoch": 0.02127659574468085,
"grad_norm": 7.556325866974648,
"learning_rate": 4.80159219764464e-07,
"loss": 3.0507,
"step": 8
},
{
"epoch": 0.023936170212765957,
"grad_norm": 7.332175625826142,
"learning_rate": 5.073562384681361e-07,
"loss": 3.127,
"step": 9
},
{
"epoch": 0.026595744680851064,
"grad_norm": 7.725255533578292,
"learning_rate": 5.316848007182561e-07,
"loss": 3.0381,
"step": 10
},
{
"epoch": 0.02925531914893617,
"grad_norm": 7.457326924140823,
"learning_rate": 5.536926622778005e-07,
"loss": 3.0634,
"step": 11
},
{
"epoch": 0.031914893617021274,
"grad_norm": 7.693199711944396,
"learning_rate": 5.737842657437107e-07,
"loss": 3.0101,
"step": 12
},
{
"epoch": 0.034574468085106384,
"grad_norm": 7.313122323445423,
"learning_rate": 5.922667492826867e-07,
"loss": 3.0967,
"step": 13
},
{
"epoch": 0.03723404255319149,
"grad_norm": 7.381687042192129,
"learning_rate": 6.093788562471904e-07,
"loss": 3.0606,
"step": 14
},
{
"epoch": 0.0398936170212766,
"grad_norm": 7.501689466289098,
"learning_rate": 6.253098466975028e-07,
"loss": 3.0923,
"step": 15
},
{
"epoch": 0.0425531914893617,
"grad_norm": 7.764166896561166,
"learning_rate": 6.402122930192854e-07,
"loss": 3.0133,
"step": 16
},
{
"epoch": 0.04521276595744681,
"grad_norm": 7.632126776388045,
"learning_rate": 6.542109895570008e-07,
"loss": 3.0261,
"step": 17
},
{
"epoch": 0.047872340425531915,
"grad_norm": 7.5260472985128875,
"learning_rate": 6.674093117229574e-07,
"loss": 3.0122,
"step": 18
},
{
"epoch": 0.05053191489361702,
"grad_norm": 7.760501268851623,
"learning_rate": 6.798938534903572e-07,
"loss": 2.8592,
"step": 19
},
{
"epoch": 0.05319148936170213,
"grad_norm": 7.498060611474783,
"learning_rate": 6.917378739730775e-07,
"loss": 2.8595,
"step": 20
},
{
"epoch": 0.05585106382978723,
"grad_norm": 7.527553046681602,
"learning_rate": 7.030039022264371e-07,
"loss": 2.7323,
"step": 21
},
{
"epoch": 0.05851063829787234,
"grad_norm": 7.139599749462118,
"learning_rate": 7.13745735532622e-07,
"loss": 3.0506,
"step": 22
},
{
"epoch": 0.061170212765957445,
"grad_norm": 7.6904167286597165,
"learning_rate": 7.24009993125516e-07,
"loss": 2.8334,
"step": 23
},
{
"epoch": 0.06382978723404255,
"grad_norm": 7.13831891441032,
"learning_rate": 7.338373389985321e-07,
"loss": 3.0714,
"step": 24
},
{
"epoch": 0.06648936170212766,
"grad_norm": 7.277657469838149,
"learning_rate": 7.432634549268694e-07,
"loss": 3.0034,
"step": 25
},
{
"epoch": 0.06914893617021277,
"grad_norm": 7.601860581578553,
"learning_rate": 7.52319822537508e-07,
"loss": 2.8405,
"step": 26
},
{
"epoch": 0.07180851063829788,
"grad_norm": 7.406160378215662,
"learning_rate": 7.610343577022042e-07,
"loss": 2.8776,
"step": 27
},
{
"epoch": 0.07446808510638298,
"grad_norm": 7.501158815505739,
"learning_rate": 7.694319295020116e-07,
"loss": 2.8623,
"step": 28
},
{
"epoch": 0.07712765957446809,
"grad_norm": 7.475182990956878,
"learning_rate": 7.775347880836832e-07,
"loss": 2.8176,
"step": 29
},
{
"epoch": 0.0797872340425532,
"grad_norm": 6.90076598186568,
"learning_rate": 7.853629199523242e-07,
"loss": 2.6601,
"step": 30
},
{
"epoch": 0.08244680851063829,
"grad_norm": 7.317267534729387,
"learning_rate": 7.929343449851162e-07,
"loss": 2.5921,
"step": 31
},
{
"epoch": 0.0851063829787234,
"grad_norm": 6.540702440994457,
"learning_rate": 8.002653662741068e-07,
"loss": 2.7996,
"step": 32
},
{
"epoch": 0.08776595744680851,
"grad_norm": 7.135880043595326,
"learning_rate": 8.073707815118686e-07,
"loss": 2.5778,
"step": 33
},
{
"epoch": 0.09042553191489362,
"grad_norm": 6.507937939592382,
"learning_rate": 8.142640628118222e-07,
"loss": 2.7356,
"step": 34
},
{
"epoch": 0.09308510638297872,
"grad_norm": 6.445617298664252,
"learning_rate": 8.209575104558038e-07,
"loss": 2.7163,
"step": 35
},
{
"epoch": 0.09574468085106383,
"grad_norm": 6.62462859642164,
"learning_rate": 8.274623849777788e-07,
"loss": 2.6283,
"step": 36
},
{
"epoch": 0.09840425531914894,
"grad_norm": 6.7066040969467995,
"learning_rate": 8.337890211465859e-07,
"loss": 2.5485,
"step": 37
},
{
"epoch": 0.10106382978723404,
"grad_norm": 6.6019115107446815,
"learning_rate": 8.399469267451787e-07,
"loss": 2.5864,
"step": 38
},
{
"epoch": 0.10372340425531915,
"grad_norm": 6.526338473606032,
"learning_rate": 8.459448685167547e-07,
"loss": 2.6546,
"step": 39
},
{
"epoch": 0.10638297872340426,
"grad_norm": 6.677090538207485,
"learning_rate": 8.517909472278988e-07,
"loss": 2.6135,
"step": 40
},
{
"epoch": 0.10904255319148937,
"grad_norm": 6.627853354895093,
"learning_rate": 8.574926634616532e-07,
"loss": 2.6117,
"step": 41
},
{
"epoch": 0.11170212765957446,
"grad_norm": 6.210420944517809,
"learning_rate": 8.630569754812584e-07,
"loss": 2.6221,
"step": 42
},
{
"epoch": 0.11436170212765957,
"grad_norm": 6.490293150689325,
"learning_rate": 8.684903502843901e-07,
"loss": 2.6191,
"step": 43
},
{
"epoch": 0.11702127659574468,
"grad_norm": 6.5133887096434036,
"learning_rate": 8.737988087874431e-07,
"loss": 2.6479,
"step": 44
},
{
"epoch": 0.1196808510638298,
"grad_norm": 6.5629458321401275,
"learning_rate": 8.789879659315709e-07,
"loss": 2.5919,
"step": 45
},
{
"epoch": 0.12234042553191489,
"grad_norm": 6.263965110944729,
"learning_rate": 8.840630663803374e-07,
"loss": 2.564,
"step": 46
},
{
"epoch": 0.125,
"grad_norm": 6.2925368290438355,
"learning_rate": 8.890290163779749e-07,
"loss": 2.4828,
"step": 47
},
{
"epoch": 0.1276595744680851,
"grad_norm": 5.562972488052771,
"learning_rate": 8.938904122533535e-07,
"loss": 2.3536,
"step": 48
},
{
"epoch": 0.13031914893617022,
"grad_norm": 5.051596119252882,
"learning_rate": 8.986515659847379e-07,
"loss": 2.2057,
"step": 49
},
{
"epoch": 0.13297872340425532,
"grad_norm": 4.783115795828969,
"learning_rate": 9.033165281816909e-07,
"loss": 2.1078,
"step": 50
},
{
"epoch": 0.1356382978723404,
"grad_norm": 4.0379873353666,
"learning_rate": 9.078891087910689e-07,
"loss": 2.1146,
"step": 51
},
{
"epoch": 0.13829787234042554,
"grad_norm": 4.425461679286027,
"learning_rate": 9.123728957923294e-07,
"loss": 2.106,
"step": 52
},
{
"epoch": 0.14095744680851063,
"grad_norm": 4.219063302168513,
"learning_rate": 9.167712721119934e-07,
"loss": 2.0448,
"step": 53
},
{
"epoch": 0.14361702127659576,
"grad_norm": 4.177230312333208,
"learning_rate": 9.210874309570255e-07,
"loss": 2.106,
"step": 54
},
{
"epoch": 0.14627659574468085,
"grad_norm": 3.58662530955563,
"learning_rate": 9.253243897412354e-07,
"loss": 2.1577,
"step": 55
},
{
"epoch": 0.14893617021276595,
"grad_norm": 3.8076903225375607,
"learning_rate": 9.294850027568331e-07,
"loss": 2.044,
"step": 56
},
{
"epoch": 0.15159574468085107,
"grad_norm": 3.9757823965828445,
"learning_rate": 9.335719727244254e-07,
"loss": 2.1354,
"step": 57
},
{
"epoch": 0.15425531914893617,
"grad_norm": 3.8984355429604305,
"learning_rate": 9.375878613385046e-07,
"loss": 2.0297,
"step": 58
},
{
"epoch": 0.15691489361702127,
"grad_norm": 4.006300970220442,
"learning_rate": 9.415350989114764e-07,
"loss": 1.8268,
"step": 59
},
{
"epoch": 0.1595744680851064,
"grad_norm": 3.7231660155630126,
"learning_rate": 9.454159932071455e-07,
"loss": 1.8824,
"step": 60
},
{
"epoch": 0.1622340425531915,
"grad_norm": 3.896921356096762,
"learning_rate": 9.492327375440568e-07,
"loss": 1.9475,
"step": 61
},
{
"epoch": 0.16489361702127658,
"grad_norm": 3.1704796037774394,
"learning_rate": 9.529874182399376e-07,
"loss": 1.9461,
"step": 62
},
{
"epoch": 0.1675531914893617,
"grad_norm": 3.624185273266048,
"learning_rate": 9.566820214605051e-07,
"loss": 2.0426,
"step": 63
},
{
"epoch": 0.1702127659574468,
"grad_norm": 3.438777616799716,
"learning_rate": 9.60318439528928e-07,
"loss": 1.9094,
"step": 64
},
{
"epoch": 0.17287234042553193,
"grad_norm": 4.546206080990496,
"learning_rate": 9.638984767461214e-07,
"loss": 1.9037,
"step": 65
},
{
"epoch": 0.17553191489361702,
"grad_norm": 3.092553572071205,
"learning_rate": 9.6742385476669e-07,
"loss": 1.9928,
"step": 66
},
{
"epoch": 0.17819148936170212,
"grad_norm": 3.3574221590495807,
"learning_rate": 9.708962175706178e-07,
"loss": 1.9752,
"step": 67
},
{
"epoch": 0.18085106382978725,
"grad_norm": 3.0865121040891714,
"learning_rate": 9.743171360666435e-07,
"loss": 1.9853,
"step": 68
},
{
"epoch": 0.18351063829787234,
"grad_norm": 3.25288028731065,
"learning_rate": 9.776881123595842e-07,
"loss": 1.8024,
"step": 69
},
{
"epoch": 0.18617021276595744,
"grad_norm": 2.8739512645435865,
"learning_rate": 9.810105837106252e-07,
"loss": 2.0918,
"step": 70
},
{
"epoch": 0.18882978723404256,
"grad_norm": 2.8379601865829414,
"learning_rate": 9.842859262167094e-07,
"loss": 1.801,
"step": 71
},
{
"epoch": 0.19148936170212766,
"grad_norm": 2.634217473181439,
"learning_rate": 9.875154582326002e-07,
"loss": 1.9093,
"step": 72
},
{
"epoch": 0.19414893617021275,
"grad_norm": 2.674409166489119,
"learning_rate": 9.907004435569156e-07,
"loss": 1.8468,
"step": 73
},
{
"epoch": 0.19680851063829788,
"grad_norm": 2.5418462429291178,
"learning_rate": 9.938420944014074e-07,
"loss": 2.0187,
"step": 74
},
{
"epoch": 0.19946808510638298,
"grad_norm": 2.249520992577069,
"learning_rate": 9.969415741609375e-07,
"loss": 1.8433,
"step": 75
},
{
"epoch": 0.20212765957446807,
"grad_norm": 3.4070896898561567,
"learning_rate": 1e-06,
"loss": 1.6904,
"step": 76
},
{
"epoch": 0.2047872340425532,
"grad_norm": 2.405587439537431,
"learning_rate": 1e-06,
"loss": 1.9263,
"step": 77
},
{
"epoch": 0.2074468085106383,
"grad_norm": 2.1351508990882686,
"learning_rate": 9.985207100591716e-07,
"loss": 1.9915,
"step": 78
},
{
"epoch": 0.21010638297872342,
"grad_norm": 2.253674050573154,
"learning_rate": 9.97041420118343e-07,
"loss": 1.8581,
"step": 79
},
{
"epoch": 0.2127659574468085,
"grad_norm": 1.9816102720192281,
"learning_rate": 9.955621301775147e-07,
"loss": 1.8838,
"step": 80
},
{
"epoch": 0.2154255319148936,
"grad_norm": 2.1585154035600502,
"learning_rate": 9.940828402366864e-07,
"loss": 1.7891,
"step": 81
},
{
"epoch": 0.21808510638297873,
"grad_norm": 2.1927798177222466,
"learning_rate": 9.92603550295858e-07,
"loss": 1.6338,
"step": 82
},
{
"epoch": 0.22074468085106383,
"grad_norm": 2.1195681092445606,
"learning_rate": 9.911242603550295e-07,
"loss": 1.8233,
"step": 83
},
{
"epoch": 0.22340425531914893,
"grad_norm": 1.79894840198714,
"learning_rate": 9.896449704142011e-07,
"loss": 1.6456,
"step": 84
},
{
"epoch": 0.22606382978723405,
"grad_norm": 2.1334819930653004,
"learning_rate": 9.881656804733728e-07,
"loss": 1.7721,
"step": 85
},
{
"epoch": 0.22872340425531915,
"grad_norm": 2.2522144017174988,
"learning_rate": 9.866863905325444e-07,
"loss": 1.8225,
"step": 86
},
{
"epoch": 0.23138297872340424,
"grad_norm": 2.1109395194449885,
"learning_rate": 9.852071005917159e-07,
"loss": 1.8103,
"step": 87
},
{
"epoch": 0.23404255319148937,
"grad_norm": 1.889220101950186,
"learning_rate": 9.837278106508875e-07,
"loss": 1.7531,
"step": 88
},
{
"epoch": 0.23670212765957446,
"grad_norm": 2.0103297503036797,
"learning_rate": 9.822485207100592e-07,
"loss": 1.4589,
"step": 89
},
{
"epoch": 0.2393617021276596,
"grad_norm": 2.0087468092299012,
"learning_rate": 9.807692307692306e-07,
"loss": 1.7291,
"step": 90
},
{
"epoch": 0.24202127659574468,
"grad_norm": 2.1537427838116603,
"learning_rate": 9.792899408284023e-07,
"loss": 1.8277,
"step": 91
},
{
"epoch": 0.24468085106382978,
"grad_norm": 1.811327456337824,
"learning_rate": 9.77810650887574e-07,
"loss": 1.6806,
"step": 92
},
{
"epoch": 0.2473404255319149,
"grad_norm": 1.8193216533171808,
"learning_rate": 9.763313609467456e-07,
"loss": 1.5615,
"step": 93
},
{
"epoch": 0.25,
"grad_norm": 1.7880991378455267,
"learning_rate": 9.748520710059172e-07,
"loss": 1.6315,
"step": 94
},
{
"epoch": 0.2526595744680851,
"grad_norm": 1.9732905510612142,
"learning_rate": 9.733727810650887e-07,
"loss": 1.6118,
"step": 95
},
{
"epoch": 0.2553191489361702,
"grad_norm": 1.8507510310387487,
"learning_rate": 9.718934911242603e-07,
"loss": 1.562,
"step": 96
},
{
"epoch": 0.2579787234042553,
"grad_norm": 1.937704198597928,
"learning_rate": 9.704142011834318e-07,
"loss": 1.5891,
"step": 97
},
{
"epoch": 0.26063829787234044,
"grad_norm": 1.8949539814094551,
"learning_rate": 9.689349112426034e-07,
"loss": 1.55,
"step": 98
},
{
"epoch": 0.2632978723404255,
"grad_norm": 1.7741323445830024,
"learning_rate": 9.67455621301775e-07,
"loss": 1.734,
"step": 99
},
{
"epoch": 0.26595744680851063,
"grad_norm": 1.7021314190064671,
"learning_rate": 9.659763313609467e-07,
"loss": 1.4889,
"step": 100
},
{
"epoch": 0.26861702127659576,
"grad_norm": 1.9644914618403917,
"learning_rate": 9.644970414201184e-07,
"loss": 1.7278,
"step": 101
},
{
"epoch": 0.2712765957446808,
"grad_norm": 1.8634125925152643,
"learning_rate": 9.630177514792898e-07,
"loss": 1.5682,
"step": 102
},
{
"epoch": 0.27393617021276595,
"grad_norm": 1.8401952841001055,
"learning_rate": 9.615384615384615e-07,
"loss": 1.565,
"step": 103
},
{
"epoch": 0.2765957446808511,
"grad_norm": 1.804040900318666,
"learning_rate": 9.600591715976331e-07,
"loss": 1.5869,
"step": 104
},
{
"epoch": 0.27925531914893614,
"grad_norm": 1.706090462740245,
"learning_rate": 9.585798816568048e-07,
"loss": 1.5148,
"step": 105
},
{
"epoch": 0.28191489361702127,
"grad_norm": 1.728525487149655,
"learning_rate": 9.571005917159762e-07,
"loss": 1.5603,
"step": 106
},
{
"epoch": 0.2845744680851064,
"grad_norm": 1.7524632420405768,
"learning_rate": 9.556213017751479e-07,
"loss": 1.6348,
"step": 107
},
{
"epoch": 0.2872340425531915,
"grad_norm": 1.7478581672975904,
"learning_rate": 9.541420118343195e-07,
"loss": 1.6436,
"step": 108
},
{
"epoch": 0.2898936170212766,
"grad_norm": 1.9985633100646443,
"learning_rate": 9.526627218934911e-07,
"loss": 1.7681,
"step": 109
},
{
"epoch": 0.2925531914893617,
"grad_norm": 2.079481786216591,
"learning_rate": 9.511834319526627e-07,
"loss": 1.419,
"step": 110
},
{
"epoch": 0.29521276595744683,
"grad_norm": 1.6892443930394687,
"learning_rate": 9.497041420118342e-07,
"loss": 1.6604,
"step": 111
},
{
"epoch": 0.2978723404255319,
"grad_norm": 1.610960466459826,
"learning_rate": 9.482248520710058e-07,
"loss": 1.6354,
"step": 112
},
{
"epoch": 0.300531914893617,
"grad_norm": 1.8544084912738468,
"learning_rate": 9.467455621301774e-07,
"loss": 1.596,
"step": 113
},
{
"epoch": 0.30319148936170215,
"grad_norm": 1.9170248346565737,
"learning_rate": 9.45266272189349e-07,
"loss": 1.6547,
"step": 114
},
{
"epoch": 0.3058510638297872,
"grad_norm": 1.7881140714522759,
"learning_rate": 9.437869822485207e-07,
"loss": 1.6039,
"step": 115
},
{
"epoch": 0.30851063829787234,
"grad_norm": 1.9088889444538937,
"learning_rate": 9.423076923076923e-07,
"loss": 1.7795,
"step": 116
},
{
"epoch": 0.31117021276595747,
"grad_norm": 2.2553186073976383,
"learning_rate": 9.408284023668639e-07,
"loss": 1.4582,
"step": 117
},
{
"epoch": 0.31382978723404253,
"grad_norm": 1.843300845507743,
"learning_rate": 9.393491124260355e-07,
"loss": 1.5459,
"step": 118
},
{
"epoch": 0.31648936170212766,
"grad_norm": 1.7169781655337961,
"learning_rate": 9.378698224852071e-07,
"loss": 1.5621,
"step": 119
},
{
"epoch": 0.3191489361702128,
"grad_norm": 1.690585106680432,
"learning_rate": 9.363905325443787e-07,
"loss": 1.5449,
"step": 120
},
{
"epoch": 0.32180851063829785,
"grad_norm": 1.634720340224596,
"learning_rate": 9.349112426035502e-07,
"loss": 1.5937,
"step": 121
},
{
"epoch": 0.324468085106383,
"grad_norm": 1.905851103523696,
"learning_rate": 9.334319526627219e-07,
"loss": 1.6611,
"step": 122
},
{
"epoch": 0.3271276595744681,
"grad_norm": 1.5977114642239374,
"learning_rate": 9.319526627218934e-07,
"loss": 1.3517,
"step": 123
},
{
"epoch": 0.32978723404255317,
"grad_norm": 1.7739378101582597,
"learning_rate": 9.304733727810651e-07,
"loss": 1.6623,
"step": 124
},
{
"epoch": 0.3324468085106383,
"grad_norm": 1.7433956729666737,
"learning_rate": 9.289940828402366e-07,
"loss": 1.565,
"step": 125
},
{
"epoch": 0.3351063829787234,
"grad_norm": 1.8292660534852752,
"learning_rate": 9.275147928994083e-07,
"loss": 1.6095,
"step": 126
},
{
"epoch": 0.3377659574468085,
"grad_norm": 1.8582148418654536,
"learning_rate": 9.260355029585798e-07,
"loss": 1.6689,
"step": 127
},
{
"epoch": 0.3404255319148936,
"grad_norm": 1.5903483721719576,
"learning_rate": 9.245562130177515e-07,
"loss": 1.4741,
"step": 128
},
{
"epoch": 0.34308510638297873,
"grad_norm": 1.5937293556222676,
"learning_rate": 9.230769230769231e-07,
"loss": 1.4603,
"step": 129
},
{
"epoch": 0.34574468085106386,
"grad_norm": 1.8563999128333846,
"learning_rate": 9.215976331360947e-07,
"loss": 1.6907,
"step": 130
},
{
"epoch": 0.3484042553191489,
"grad_norm": 1.7139518347374663,
"learning_rate": 9.201183431952662e-07,
"loss": 1.6474,
"step": 131
},
{
"epoch": 0.35106382978723405,
"grad_norm": 1.7465122753927422,
"learning_rate": 9.186390532544378e-07,
"loss": 1.6324,
"step": 132
},
{
"epoch": 0.3537234042553192,
"grad_norm": 1.6610443253134841,
"learning_rate": 9.171597633136094e-07,
"loss": 1.5045,
"step": 133
},
{
"epoch": 0.35638297872340424,
"grad_norm": 1.655396079412198,
"learning_rate": 9.15680473372781e-07,
"loss": 1.5022,
"step": 134
},
{
"epoch": 0.35904255319148937,
"grad_norm": 1.6336073697442315,
"learning_rate": 9.142011834319526e-07,
"loss": 1.5522,
"step": 135
},
{
"epoch": 0.3617021276595745,
"grad_norm": 1.707143679480118,
"learning_rate": 9.127218934911243e-07,
"loss": 1.551,
"step": 136
},
{
"epoch": 0.36436170212765956,
"grad_norm": 1.685651756020523,
"learning_rate": 9.112426035502958e-07,
"loss": 1.6122,
"step": 137
},
{
"epoch": 0.3670212765957447,
"grad_norm": 2.358800653945757,
"learning_rate": 9.097633136094675e-07,
"loss": 1.5604,
"step": 138
},
{
"epoch": 0.3696808510638298,
"grad_norm": 1.9426261400386715,
"learning_rate": 9.08284023668639e-07,
"loss": 1.4685,
"step": 139
},
{
"epoch": 0.3723404255319149,
"grad_norm": 1.7028161565048658,
"learning_rate": 9.068047337278106e-07,
"loss": 1.4377,
"step": 140
},
{
"epoch": 0.375,
"grad_norm": 1.6566345919926695,
"learning_rate": 9.053254437869821e-07,
"loss": 1.4231,
"step": 141
},
{
"epoch": 0.3776595744680851,
"grad_norm": 1.750830521373255,
"learning_rate": 9.038461538461538e-07,
"loss": 1.5528,
"step": 142
},
{
"epoch": 0.3803191489361702,
"grad_norm": 1.673773427490192,
"learning_rate": 9.023668639053253e-07,
"loss": 1.5206,
"step": 143
},
{
"epoch": 0.3829787234042553,
"grad_norm": 1.9158107325263525,
"learning_rate": 9.00887573964497e-07,
"loss": 1.5624,
"step": 144
},
{
"epoch": 0.38563829787234044,
"grad_norm": 1.863606934304487,
"learning_rate": 8.994082840236686e-07,
"loss": 1.7617,
"step": 145
},
{
"epoch": 0.3882978723404255,
"grad_norm": 1.743890793561562,
"learning_rate": 8.979289940828402e-07,
"loss": 1.6736,
"step": 146
},
{
"epoch": 0.39095744680851063,
"grad_norm": 1.7765187119696408,
"learning_rate": 8.964497041420118e-07,
"loss": 1.401,
"step": 147
},
{
"epoch": 0.39361702127659576,
"grad_norm": 1.9492055399414594,
"learning_rate": 8.949704142011834e-07,
"loss": 1.7687,
"step": 148
},
{
"epoch": 0.3962765957446808,
"grad_norm": 1.9491511698228168,
"learning_rate": 8.93491124260355e-07,
"loss": 1.5873,
"step": 149
},
{
"epoch": 0.39893617021276595,
"grad_norm": 1.7732893423967535,
"learning_rate": 8.920118343195265e-07,
"loss": 1.4666,
"step": 150
},
{
"epoch": 0.4015957446808511,
"grad_norm": 1.628295930467344,
"learning_rate": 8.905325443786981e-07,
"loss": 1.4253,
"step": 151
},
{
"epoch": 0.40425531914893614,
"grad_norm": 1.9501879161375453,
"learning_rate": 8.890532544378698e-07,
"loss": 1.5748,
"step": 152
},
{
"epoch": 0.40691489361702127,
"grad_norm": 1.6883577356837587,
"learning_rate": 8.875739644970413e-07,
"loss": 1.3546,
"step": 153
},
{
"epoch": 0.4095744680851064,
"grad_norm": 1.7177779483356421,
"learning_rate": 8.86094674556213e-07,
"loss": 1.6715,
"step": 154
},
{
"epoch": 0.4122340425531915,
"grad_norm": 1.806803334100437,
"learning_rate": 8.846153846153846e-07,
"loss": 1.5485,
"step": 155
},
{
"epoch": 0.4148936170212766,
"grad_norm": 1.7522522193654075,
"learning_rate": 8.831360946745562e-07,
"loss": 1.5091,
"step": 156
},
{
"epoch": 0.4175531914893617,
"grad_norm": 1.8508800423865754,
"learning_rate": 8.816568047337278e-07,
"loss": 1.7112,
"step": 157
},
{
"epoch": 0.42021276595744683,
"grad_norm": 1.752024433296569,
"learning_rate": 8.801775147928994e-07,
"loss": 1.3702,
"step": 158
},
{
"epoch": 0.4228723404255319,
"grad_norm": 2.0875697232783246,
"learning_rate": 8.786982248520711e-07,
"loss": 1.5972,
"step": 159
},
{
"epoch": 0.425531914893617,
"grad_norm": 1.7852623572002673,
"learning_rate": 8.772189349112425e-07,
"loss": 1.5496,
"step": 160
},
{
"epoch": 0.42819148936170215,
"grad_norm": 2.0049459574733968,
"learning_rate": 8.757396449704142e-07,
"loss": 1.5256,
"step": 161
},
{
"epoch": 0.4308510638297872,
"grad_norm": 1.8394745863340762,
"learning_rate": 8.742603550295857e-07,
"loss": 1.5466,
"step": 162
},
{
"epoch": 0.43351063829787234,
"grad_norm": 1.890821588557376,
"learning_rate": 8.727810650887574e-07,
"loss": 1.4839,
"step": 163
},
{
"epoch": 0.43617021276595747,
"grad_norm": 1.6481011214712673,
"learning_rate": 8.713017751479289e-07,
"loss": 1.6322,
"step": 164
},
{
"epoch": 0.43882978723404253,
"grad_norm": 1.6910215297075097,
"learning_rate": 8.698224852071006e-07,
"loss": 1.4294,
"step": 165
},
{
"epoch": 0.44148936170212766,
"grad_norm": 2.2849926490581978,
"learning_rate": 8.683431952662722e-07,
"loss": 1.5214,
"step": 166
},
{
"epoch": 0.4441489361702128,
"grad_norm": 1.6530282697158378,
"learning_rate": 8.668639053254438e-07,
"loss": 1.5387,
"step": 167
},
{
"epoch": 0.44680851063829785,
"grad_norm": 1.8612064349812791,
"learning_rate": 8.653846153846154e-07,
"loss": 1.4698,
"step": 168
},
{
"epoch": 0.449468085106383,
"grad_norm": 1.844773154127249,
"learning_rate": 8.639053254437869e-07,
"loss": 1.6155,
"step": 169
},
{
"epoch": 0.4521276595744681,
"grad_norm": 1.7920892424117567,
"learning_rate": 8.624260355029585e-07,
"loss": 1.4435,
"step": 170
},
{
"epoch": 0.45478723404255317,
"grad_norm": 1.7631889349519279,
"learning_rate": 8.609467455621301e-07,
"loss": 1.5657,
"step": 171
},
{
"epoch": 0.4574468085106383,
"grad_norm": 1.850684750618834,
"learning_rate": 8.594674556213017e-07,
"loss": 1.4599,
"step": 172
},
{
"epoch": 0.4601063829787234,
"grad_norm": 1.8936077580680233,
"learning_rate": 8.579881656804734e-07,
"loss": 1.4487,
"step": 173
},
{
"epoch": 0.4627659574468085,
"grad_norm": 1.8302300226282981,
"learning_rate": 8.565088757396449e-07,
"loss": 1.3982,
"step": 174
},
{
"epoch": 0.4654255319148936,
"grad_norm": 1.77826681795055,
"learning_rate": 8.550295857988166e-07,
"loss": 1.4513,
"step": 175
},
{
"epoch": 0.46808510638297873,
"grad_norm": 1.687204688334926,
"learning_rate": 8.535502958579881e-07,
"loss": 1.4119,
"step": 176
},
{
"epoch": 0.47074468085106386,
"grad_norm": 1.8120179028458203,
"learning_rate": 8.520710059171598e-07,
"loss": 1.6192,
"step": 177
},
{
"epoch": 0.4734042553191489,
"grad_norm": 1.8795981293168291,
"learning_rate": 8.505917159763313e-07,
"loss": 1.4954,
"step": 178
},
{
"epoch": 0.47606382978723405,
"grad_norm": 1.7065716786077503,
"learning_rate": 8.491124260355029e-07,
"loss": 1.5966,
"step": 179
},
{
"epoch": 0.4787234042553192,
"grad_norm": 1.7627718668452295,
"learning_rate": 8.476331360946745e-07,
"loss": 1.4327,
"step": 180
},
{
"epoch": 0.48138297872340424,
"grad_norm": 1.8665938451163775,
"learning_rate": 8.461538461538461e-07,
"loss": 1.4918,
"step": 181
},
{
"epoch": 0.48404255319148937,
"grad_norm": 1.819110850294668,
"learning_rate": 8.446745562130177e-07,
"loss": 1.5539,
"step": 182
},
{
"epoch": 0.4867021276595745,
"grad_norm": 1.8453397847354074,
"learning_rate": 8.431952662721893e-07,
"loss": 1.5331,
"step": 183
},
{
"epoch": 0.48936170212765956,
"grad_norm": 2.622110865899153,
"learning_rate": 8.417159763313609e-07,
"loss": 1.3705,
"step": 184
},
{
"epoch": 0.4920212765957447,
"grad_norm": 2.0496831369913378,
"learning_rate": 8.402366863905325e-07,
"loss": 1.434,
"step": 185
},
{
"epoch": 0.4946808510638298,
"grad_norm": 1.7777460669960974,
"learning_rate": 8.387573964497041e-07,
"loss": 1.507,
"step": 186
},
{
"epoch": 0.4973404255319149,
"grad_norm": 1.7648525067264564,
"learning_rate": 8.372781065088757e-07,
"loss": 1.5419,
"step": 187
},
{
"epoch": 0.5,
"grad_norm": 1.7346580205717035,
"learning_rate": 8.357988165680473e-07,
"loss": 1.4474,
"step": 188
},
{
"epoch": 0.5026595744680851,
"grad_norm": 1.8941186829293386,
"learning_rate": 8.343195266272189e-07,
"loss": 1.4239,
"step": 189
},
{
"epoch": 0.5053191489361702,
"grad_norm": 1.793062893259623,
"learning_rate": 8.328402366863904e-07,
"loss": 1.5339,
"step": 190
},
{
"epoch": 0.5079787234042553,
"grad_norm": 1.867427730668421,
"learning_rate": 8.313609467455621e-07,
"loss": 1.3395,
"step": 191
},
{
"epoch": 0.5106382978723404,
"grad_norm": 1.836942681632619,
"learning_rate": 8.298816568047336e-07,
"loss": 1.6135,
"step": 192
},
{
"epoch": 0.5132978723404256,
"grad_norm": 1.6942657192312134,
"learning_rate": 8.284023668639053e-07,
"loss": 1.5308,
"step": 193
},
{
"epoch": 0.5159574468085106,
"grad_norm": 1.841411465300408,
"learning_rate": 8.269230769230768e-07,
"loss": 1.5345,
"step": 194
},
{
"epoch": 0.5186170212765957,
"grad_norm": 1.8794098811564628,
"learning_rate": 8.254437869822485e-07,
"loss": 1.6901,
"step": 195
},
{
"epoch": 0.5212765957446809,
"grad_norm": 1.7012388651957833,
"learning_rate": 8.239644970414202e-07,
"loss": 1.474,
"step": 196
},
{
"epoch": 0.523936170212766,
"grad_norm": 1.7944418314011599,
"learning_rate": 8.224852071005917e-07,
"loss": 1.32,
"step": 197
},
{
"epoch": 0.526595744680851,
"grad_norm": 1.7425046897179257,
"learning_rate": 8.210059171597633e-07,
"loss": 1.368,
"step": 198
},
{
"epoch": 0.5292553191489362,
"grad_norm": 1.7880936710475852,
"learning_rate": 8.195266272189348e-07,
"loss": 1.645,
"step": 199
},
{
"epoch": 0.5319148936170213,
"grad_norm": 1.6715457627732746,
"learning_rate": 8.180473372781065e-07,
"loss": 1.4261,
"step": 200
},
{
"epoch": 0.5345744680851063,
"grad_norm": 1.822736509594185,
"learning_rate": 8.16568047337278e-07,
"loss": 1.624,
"step": 201
},
{
"epoch": 0.5372340425531915,
"grad_norm": 1.6809290356200428,
"learning_rate": 8.150887573964497e-07,
"loss": 1.3937,
"step": 202
},
{
"epoch": 0.5398936170212766,
"grad_norm": 1.7496410410443377,
"learning_rate": 8.136094674556213e-07,
"loss": 1.5429,
"step": 203
},
{
"epoch": 0.5425531914893617,
"grad_norm": 1.9199038686131074,
"learning_rate": 8.121301775147929e-07,
"loss": 1.5023,
"step": 204
},
{
"epoch": 0.5452127659574468,
"grad_norm": 1.6850758554154257,
"learning_rate": 8.106508875739645e-07,
"loss": 1.4991,
"step": 205
},
{
"epoch": 0.5478723404255319,
"grad_norm": 1.9329660957508767,
"learning_rate": 8.091715976331361e-07,
"loss": 1.5398,
"step": 206
},
{
"epoch": 0.550531914893617,
"grad_norm": 1.666476222919606,
"learning_rate": 8.076923076923077e-07,
"loss": 1.5637,
"step": 207
},
{
"epoch": 0.5531914893617021,
"grad_norm": 2.224757697809998,
"learning_rate": 8.062130177514792e-07,
"loss": 1.6452,
"step": 208
},
{
"epoch": 0.5558510638297872,
"grad_norm": 1.861249667872802,
"learning_rate": 8.047337278106508e-07,
"loss": 1.541,
"step": 209
},
{
"epoch": 0.5585106382978723,
"grad_norm": 1.7749036870266581,
"learning_rate": 8.032544378698225e-07,
"loss": 1.4811,
"step": 210
},
{
"epoch": 0.5611702127659575,
"grad_norm": 1.9725453562773687,
"learning_rate": 8.01775147928994e-07,
"loss": 1.4289,
"step": 211
},
{
"epoch": 0.5638297872340425,
"grad_norm": 1.5771745142242444,
"learning_rate": 8.002958579881657e-07,
"loss": 1.1851,
"step": 212
},
{
"epoch": 0.5664893617021277,
"grad_norm": 1.7671763938332208,
"learning_rate": 7.988165680473372e-07,
"loss": 1.4632,
"step": 213
},
{
"epoch": 0.5691489361702128,
"grad_norm": 2.3539254619170147,
"learning_rate": 7.973372781065089e-07,
"loss": 1.4399,
"step": 214
},
{
"epoch": 0.5718085106382979,
"grad_norm": 1.6811659582751803,
"learning_rate": 7.958579881656804e-07,
"loss": 1.3874,
"step": 215
},
{
"epoch": 0.574468085106383,
"grad_norm": 1.816100982937805,
"learning_rate": 7.943786982248521e-07,
"loss": 1.3507,
"step": 216
},
{
"epoch": 0.5771276595744681,
"grad_norm": 1.6054386033989114,
"learning_rate": 7.928994082840237e-07,
"loss": 1.3523,
"step": 217
},
{
"epoch": 0.5797872340425532,
"grad_norm": 1.747109205347203,
"learning_rate": 7.914201183431952e-07,
"loss": 1.4471,
"step": 218
},
{
"epoch": 0.5824468085106383,
"grad_norm": 2.544095072667201,
"learning_rate": 7.899408284023668e-07,
"loss": 1.4659,
"step": 219
},
{
"epoch": 0.5851063829787234,
"grad_norm": 1.9052355208698295,
"learning_rate": 7.884615384615384e-07,
"loss": 1.6808,
"step": 220
},
{
"epoch": 0.5877659574468085,
"grad_norm": 1.7475037482225553,
"learning_rate": 7.8698224852071e-07,
"loss": 1.4223,
"step": 221
},
{
"epoch": 0.5904255319148937,
"grad_norm": 1.7030078252678653,
"learning_rate": 7.855029585798816e-07,
"loss": 1.558,
"step": 222
},
{
"epoch": 0.5930851063829787,
"grad_norm": 1.6935707673119045,
"learning_rate": 7.840236686390532e-07,
"loss": 1.3466,
"step": 223
},
{
"epoch": 0.5957446808510638,
"grad_norm": 1.9730090137474936,
"learning_rate": 7.825443786982249e-07,
"loss": 1.6373,
"step": 224
},
{
"epoch": 0.598404255319149,
"grad_norm": 1.7329005942211182,
"learning_rate": 7.810650887573964e-07,
"loss": 1.3348,
"step": 225
},
{
"epoch": 0.601063829787234,
"grad_norm": 2.033731441401403,
"learning_rate": 7.795857988165681e-07,
"loss": 1.5524,
"step": 226
},
{
"epoch": 0.6037234042553191,
"grad_norm": 1.7207604873916247,
"learning_rate": 7.781065088757395e-07,
"loss": 1.3862,
"step": 227
},
{
"epoch": 0.6063829787234043,
"grad_norm": 1.8075753757910789,
"learning_rate": 7.766272189349112e-07,
"loss": 1.5981,
"step": 228
},
{
"epoch": 0.6090425531914894,
"grad_norm": 2.291978352476086,
"learning_rate": 7.751479289940827e-07,
"loss": 1.4514,
"step": 229
},
{
"epoch": 0.6117021276595744,
"grad_norm": 1.8644568615293915,
"learning_rate": 7.736686390532544e-07,
"loss": 1.6587,
"step": 230
},
{
"epoch": 0.6143617021276596,
"grad_norm": 2.594171053250292,
"learning_rate": 7.721893491124259e-07,
"loss": 1.6336,
"step": 231
},
{
"epoch": 0.6170212765957447,
"grad_norm": 1.5011539788709316,
"learning_rate": 7.707100591715976e-07,
"loss": 1.2387,
"step": 232
},
{
"epoch": 0.6196808510638298,
"grad_norm": 1.6819405282763624,
"learning_rate": 7.692307692307693e-07,
"loss": 1.5038,
"step": 233
},
{
"epoch": 0.6223404255319149,
"grad_norm": 1.7251235005494032,
"learning_rate": 7.677514792899408e-07,
"loss": 1.5774,
"step": 234
},
{
"epoch": 0.625,
"grad_norm": 1.864499827243002,
"learning_rate": 7.662721893491125e-07,
"loss": 1.5276,
"step": 235
},
{
"epoch": 0.6276595744680851,
"grad_norm": 1.7781078666304035,
"learning_rate": 7.64792899408284e-07,
"loss": 1.5232,
"step": 236
},
{
"epoch": 0.6303191489361702,
"grad_norm": 1.6599021088795032,
"learning_rate": 7.633136094674556e-07,
"loss": 1.4473,
"step": 237
},
{
"epoch": 0.6329787234042553,
"grad_norm": 1.6721336663765791,
"learning_rate": 7.618343195266271e-07,
"loss": 1.3851,
"step": 238
},
{
"epoch": 0.6356382978723404,
"grad_norm": 1.797473310291003,
"learning_rate": 7.603550295857988e-07,
"loss": 1.4871,
"step": 239
},
{
"epoch": 0.6382978723404256,
"grad_norm": 1.68684289642348,
"learning_rate": 7.588757396449704e-07,
"loss": 1.3971,
"step": 240
},
{
"epoch": 0.6409574468085106,
"grad_norm": 1.6548030218587813,
"learning_rate": 7.57396449704142e-07,
"loss": 1.4413,
"step": 241
},
{
"epoch": 0.6436170212765957,
"grad_norm": 1.7764920048747164,
"learning_rate": 7.559171597633136e-07,
"loss": 1.5327,
"step": 242
},
{
"epoch": 0.6462765957446809,
"grad_norm": 2.3776019048662627,
"learning_rate": 7.544378698224852e-07,
"loss": 1.3973,
"step": 243
},
{
"epoch": 0.648936170212766,
"grad_norm": 2.180898241246454,
"learning_rate": 7.529585798816568e-07,
"loss": 1.4108,
"step": 244
},
{
"epoch": 0.651595744680851,
"grad_norm": 1.7308120559219609,
"learning_rate": 7.514792899408284e-07,
"loss": 1.437,
"step": 245
},
{
"epoch": 0.6542553191489362,
"grad_norm": 1.6797613083347633,
"learning_rate": 7.5e-07,
"loss": 1.4266,
"step": 246
},
{
"epoch": 0.6569148936170213,
"grad_norm": 1.7244677372074293,
"learning_rate": 7.485207100591716e-07,
"loss": 1.4562,
"step": 247
},
{
"epoch": 0.6595744680851063,
"grad_norm": 1.831008658275623,
"learning_rate": 7.470414201183431e-07,
"loss": 1.625,
"step": 248
},
{
"epoch": 0.6622340425531915,
"grad_norm": 1.5987807515924746,
"learning_rate": 7.455621301775148e-07,
"loss": 1.351,
"step": 249
},
{
"epoch": 0.6648936170212766,
"grad_norm": 1.657627324756177,
"learning_rate": 7.440828402366863e-07,
"loss": 1.3021,
"step": 250
},
{
"epoch": 0.6675531914893617,
"grad_norm": 1.6806656229564951,
"learning_rate": 7.42603550295858e-07,
"loss": 1.4708,
"step": 251
},
{
"epoch": 0.6702127659574468,
"grad_norm": 1.6469208307421896,
"learning_rate": 7.411242603550295e-07,
"loss": 1.4309,
"step": 252
},
{
"epoch": 0.6728723404255319,
"grad_norm": 1.6396856616158755,
"learning_rate": 7.396449704142012e-07,
"loss": 1.501,
"step": 253
},
{
"epoch": 0.675531914893617,
"grad_norm": 1.6377964159170837,
"learning_rate": 7.381656804733728e-07,
"loss": 1.5208,
"step": 254
},
{
"epoch": 0.6781914893617021,
"grad_norm": 1.6580558864253538,
"learning_rate": 7.366863905325444e-07,
"loss": 1.4638,
"step": 255
},
{
"epoch": 0.6808510638297872,
"grad_norm": 1.837851772242258,
"learning_rate": 7.352071005917159e-07,
"loss": 1.3164,
"step": 256
},
{
"epoch": 0.6835106382978723,
"grad_norm": 2.224825104258165,
"learning_rate": 7.337278106508875e-07,
"loss": 1.6295,
"step": 257
},
{
"epoch": 0.6861702127659575,
"grad_norm": 1.6131790535172048,
"learning_rate": 7.322485207100591e-07,
"loss": 1.4414,
"step": 258
},
{
"epoch": 0.6888297872340425,
"grad_norm": 1.549489595607848,
"learning_rate": 7.307692307692307e-07,
"loss": 1.4455,
"step": 259
},
{
"epoch": 0.6914893617021277,
"grad_norm": 1.761687284810298,
"learning_rate": 7.292899408284023e-07,
"loss": 1.4913,
"step": 260
},
{
"epoch": 0.6941489361702128,
"grad_norm": 1.6593936380320258,
"learning_rate": 7.27810650887574e-07,
"loss": 1.6427,
"step": 261
},
{
"epoch": 0.6968085106382979,
"grad_norm": 1.7879593292364175,
"learning_rate": 7.263313609467455e-07,
"loss": 1.6127,
"step": 262
},
{
"epoch": 0.699468085106383,
"grad_norm": 1.559119726167982,
"learning_rate": 7.248520710059172e-07,
"loss": 1.3617,
"step": 263
},
{
"epoch": 0.7021276595744681,
"grad_norm": 1.5376887507996986,
"learning_rate": 7.233727810650887e-07,
"loss": 1.3915,
"step": 264
},
{
"epoch": 0.7047872340425532,
"grad_norm": 1.892877482230423,
"learning_rate": 7.218934911242604e-07,
"loss": 1.3938,
"step": 265
},
{
"epoch": 0.7074468085106383,
"grad_norm": 2.1615047832844647,
"learning_rate": 7.204142011834318e-07,
"loss": 1.433,
"step": 266
},
{
"epoch": 0.7101063829787234,
"grad_norm": 1.5754637988987956,
"learning_rate": 7.189349112426035e-07,
"loss": 1.3913,
"step": 267
},
{
"epoch": 0.7127659574468085,
"grad_norm": 1.4917666655680848,
"learning_rate": 7.17455621301775e-07,
"loss": 1.4024,
"step": 268
},
{
"epoch": 0.7154255319148937,
"grad_norm": 1.7371252437936426,
"learning_rate": 7.159763313609467e-07,
"loss": 1.5104,
"step": 269
},
{
"epoch": 0.7180851063829787,
"grad_norm": 1.479255763133087,
"learning_rate": 7.144970414201183e-07,
"loss": 1.3533,
"step": 270
},
{
"epoch": 0.7207446808510638,
"grad_norm": 1.6094715867178733,
"learning_rate": 7.130177514792899e-07,
"loss": 1.3532,
"step": 271
},
{
"epoch": 0.723404255319149,
"grad_norm": 1.565198399335246,
"learning_rate": 7.115384615384616e-07,
"loss": 1.3988,
"step": 272
},
{
"epoch": 0.726063829787234,
"grad_norm": 1.5067122007483011,
"learning_rate": 7.100591715976331e-07,
"loss": 1.3825,
"step": 273
},
{
"epoch": 0.7287234042553191,
"grad_norm": 1.7140633929936213,
"learning_rate": 7.085798816568048e-07,
"loss": 1.4082,
"step": 274
},
{
"epoch": 0.7313829787234043,
"grad_norm": 1.540948863934289,
"learning_rate": 7.071005917159762e-07,
"loss": 1.5153,
"step": 275
},
{
"epoch": 0.7340425531914894,
"grad_norm": 1.7664241501358,
"learning_rate": 7.056213017751479e-07,
"loss": 1.2721,
"step": 276
},
{
"epoch": 0.7367021276595744,
"grad_norm": 1.5709026992552224,
"learning_rate": 7.041420118343195e-07,
"loss": 1.3492,
"step": 277
},
{
"epoch": 0.7393617021276596,
"grad_norm": 1.5068566647857482,
"learning_rate": 7.026627218934911e-07,
"loss": 1.362,
"step": 278
},
{
"epoch": 0.7420212765957447,
"grad_norm": 1.9554416192824882,
"learning_rate": 7.011834319526627e-07,
"loss": 1.6618,
"step": 279
},
{
"epoch": 0.7446808510638298,
"grad_norm": 1.6405976792740071,
"learning_rate": 6.997041420118343e-07,
"loss": 1.5917,
"step": 280
},
{
"epoch": 0.7473404255319149,
"grad_norm": 1.7066156854813295,
"learning_rate": 6.982248520710059e-07,
"loss": 1.2984,
"step": 281
},
{
"epoch": 0.75,
"grad_norm": 1.616654607721298,
"learning_rate": 6.967455621301775e-07,
"loss": 1.4085,
"step": 282
},
{
"epoch": 0.7526595744680851,
"grad_norm": 1.6119917549130687,
"learning_rate": 6.952662721893491e-07,
"loss": 1.4059,
"step": 283
},
{
"epoch": 0.7553191489361702,
"grad_norm": 1.4894224582399371,
"learning_rate": 6.937869822485208e-07,
"loss": 1.4205,
"step": 284
},
{
"epoch": 0.7579787234042553,
"grad_norm": 1.7561130701083838,
"learning_rate": 6.923076923076922e-07,
"loss": 1.5931,
"step": 285
},
{
"epoch": 0.7606382978723404,
"grad_norm": 1.5507572662266917,
"learning_rate": 6.908284023668639e-07,
"loss": 1.3968,
"step": 286
},
{
"epoch": 0.7632978723404256,
"grad_norm": 1.4671913155048064,
"learning_rate": 6.893491124260354e-07,
"loss": 1.2951,
"step": 287
},
{
"epoch": 0.7659574468085106,
"grad_norm": 1.5498863732312698,
"learning_rate": 6.878698224852071e-07,
"loss": 1.2232,
"step": 288
},
{
"epoch": 0.7686170212765957,
"grad_norm": 1.680206076834721,
"learning_rate": 6.863905325443786e-07,
"loss": 1.4992,
"step": 289
},
{
"epoch": 0.7712765957446809,
"grad_norm": 1.718088751084764,
"learning_rate": 6.849112426035503e-07,
"loss": 1.4422,
"step": 290
},
{
"epoch": 0.773936170212766,
"grad_norm": 1.5282347438855142,
"learning_rate": 6.834319526627219e-07,
"loss": 1.4063,
"step": 291
},
{
"epoch": 0.776595744680851,
"grad_norm": 1.9525999050003993,
"learning_rate": 6.819526627218935e-07,
"loss": 1.5957,
"step": 292
},
{
"epoch": 0.7792553191489362,
"grad_norm": 2.258813866966866,
"learning_rate": 6.804733727810651e-07,
"loss": 1.4431,
"step": 293
},
{
"epoch": 0.7819148936170213,
"grad_norm": 1.5364750834268603,
"learning_rate": 6.789940828402367e-07,
"loss": 1.3558,
"step": 294
},
{
"epoch": 0.7845744680851063,
"grad_norm": 1.5393402313754123,
"learning_rate": 6.775147928994082e-07,
"loss": 1.439,
"step": 295
},
{
"epoch": 0.7872340425531915,
"grad_norm": 1.6455162885770198,
"learning_rate": 6.760355029585798e-07,
"loss": 1.5158,
"step": 296
},
{
"epoch": 0.7898936170212766,
"grad_norm": 1.6475778661453933,
"learning_rate": 6.745562130177514e-07,
"loss": 1.4278,
"step": 297
},
{
"epoch": 0.7925531914893617,
"grad_norm": 1.502594611161215,
"learning_rate": 6.730769230769231e-07,
"loss": 1.3064,
"step": 298
},
{
"epoch": 0.7952127659574468,
"grad_norm": 1.4819306978451936,
"learning_rate": 6.715976331360946e-07,
"loss": 1.4,
"step": 299
},
{
"epoch": 0.7978723404255319,
"grad_norm": 1.6911681538448085,
"learning_rate": 6.701183431952663e-07,
"loss": 1.3364,
"step": 300
},
{
"epoch": 0.800531914893617,
"grad_norm": 1.4712764033020207,
"learning_rate": 6.686390532544378e-07,
"loss": 1.3514,
"step": 301
},
{
"epoch": 0.8031914893617021,
"grad_norm": 1.5453820007555663,
"learning_rate": 6.671597633136095e-07,
"loss": 1.252,
"step": 302
},
{
"epoch": 0.8058510638297872,
"grad_norm": 1.6870546106387143,
"learning_rate": 6.65680473372781e-07,
"loss": 1.4819,
"step": 303
},
{
"epoch": 0.8085106382978723,
"grad_norm": 1.539899104888,
"learning_rate": 6.642011834319526e-07,
"loss": 1.4248,
"step": 304
},
{
"epoch": 0.8111702127659575,
"grad_norm": 1.8570540873303243,
"learning_rate": 6.627218934911242e-07,
"loss": 1.398,
"step": 305
},
{
"epoch": 0.8138297872340425,
"grad_norm": 1.6462980732890118,
"learning_rate": 6.612426035502958e-07,
"loss": 1.472,
"step": 306
},
{
"epoch": 0.8164893617021277,
"grad_norm": 5.047207753458083,
"learning_rate": 6.597633136094674e-07,
"loss": 1.4934,
"step": 307
},
{
"epoch": 0.8191489361702128,
"grad_norm": 1.6578320558708661,
"learning_rate": 6.58284023668639e-07,
"loss": 1.4467,
"step": 308
},
{
"epoch": 0.8218085106382979,
"grad_norm": 1.650877101009254,
"learning_rate": 6.568047337278106e-07,
"loss": 1.3491,
"step": 309
},
{
"epoch": 0.824468085106383,
"grad_norm": 1.7139451577038085,
"learning_rate": 6.553254437869822e-07,
"loss": 1.4975,
"step": 310
},
{
"epoch": 0.8271276595744681,
"grad_norm": 1.6275656326818695,
"learning_rate": 6.538461538461538e-07,
"loss": 1.4493,
"step": 311
},
{
"epoch": 0.8297872340425532,
"grad_norm": 1.693438289435893,
"learning_rate": 6.523668639053254e-07,
"loss": 1.3593,
"step": 312
},
{
"epoch": 0.8324468085106383,
"grad_norm": 1.5252049292780119,
"learning_rate": 6.50887573964497e-07,
"loss": 1.4798,
"step": 313
},
{
"epoch": 0.8351063829787234,
"grad_norm": 1.7006952995622482,
"learning_rate": 6.494082840236686e-07,
"loss": 1.5054,
"step": 314
},
{
"epoch": 0.8377659574468085,
"grad_norm": 1.7203889834996966,
"learning_rate": 6.479289940828401e-07,
"loss": 1.599,
"step": 315
},
{
"epoch": 0.8404255319148937,
"grad_norm": 1.665289055188048,
"learning_rate": 6.464497041420118e-07,
"loss": 1.47,
"step": 316
},
{
"epoch": 0.8430851063829787,
"grad_norm": 1.783255201333473,
"learning_rate": 6.449704142011834e-07,
"loss": 1.3293,
"step": 317
},
{
"epoch": 0.8457446808510638,
"grad_norm": 1.5474686687545494,
"learning_rate": 6.43491124260355e-07,
"loss": 1.5827,
"step": 318
},
{
"epoch": 0.848404255319149,
"grad_norm": 1.7096057045749924,
"learning_rate": 6.420118343195266e-07,
"loss": 1.4208,
"step": 319
},
{
"epoch": 0.851063829787234,
"grad_norm": 1.660091264238197,
"learning_rate": 6.405325443786982e-07,
"loss": 1.3729,
"step": 320
},
{
"epoch": 0.8537234042553191,
"grad_norm": 2.1515758550003663,
"learning_rate": 6.390532544378699e-07,
"loss": 1.6061,
"step": 321
},
{
"epoch": 0.8563829787234043,
"grad_norm": 1.6705826372283528,
"learning_rate": 6.375739644970414e-07,
"loss": 1.3534,
"step": 322
},
{
"epoch": 0.8590425531914894,
"grad_norm": 1.6232024300738965,
"learning_rate": 6.360946745562131e-07,
"loss": 1.425,
"step": 323
},
{
"epoch": 0.8617021276595744,
"grad_norm": 1.7044169574045285,
"learning_rate": 6.346153846153845e-07,
"loss": 1.5695,
"step": 324
},
{
"epoch": 0.8643617021276596,
"grad_norm": 1.7606258681853417,
"learning_rate": 6.331360946745562e-07,
"loss": 1.418,
"step": 325
},
{
"epoch": 0.8670212765957447,
"grad_norm": 1.5280589114761016,
"learning_rate": 6.316568047337277e-07,
"loss": 1.4349,
"step": 326
},
{
"epoch": 0.8696808510638298,
"grad_norm": 1.7139434884413298,
"learning_rate": 6.301775147928994e-07,
"loss": 1.4371,
"step": 327
},
{
"epoch": 0.8723404255319149,
"grad_norm": 1.5926203744807812,
"learning_rate": 6.28698224852071e-07,
"loss": 1.4015,
"step": 328
},
{
"epoch": 0.875,
"grad_norm": 1.6264161761425606,
"learning_rate": 6.272189349112426e-07,
"loss": 1.4729,
"step": 329
},
{
"epoch": 0.8776595744680851,
"grad_norm": 1.5831228752137032,
"learning_rate": 6.257396449704142e-07,
"loss": 1.421,
"step": 330
},
{
"epoch": 0.8803191489361702,
"grad_norm": 1.5811866295220025,
"learning_rate": 6.242603550295858e-07,
"loss": 1.3628,
"step": 331
},
{
"epoch": 0.8829787234042553,
"grad_norm": 1.5679708453260865,
"learning_rate": 6.227810650887574e-07,
"loss": 1.2859,
"step": 332
},
{
"epoch": 0.8856382978723404,
"grad_norm": 1.6766225130373726,
"learning_rate": 6.213017751479289e-07,
"loss": 1.4369,
"step": 333
},
{
"epoch": 0.8882978723404256,
"grad_norm": 1.8047128650814857,
"learning_rate": 6.198224852071005e-07,
"loss": 1.5913,
"step": 334
},
{
"epoch": 0.8909574468085106,
"grad_norm": 1.6456822515106042,
"learning_rate": 6.183431952662722e-07,
"loss": 1.4972,
"step": 335
},
{
"epoch": 0.8936170212765957,
"grad_norm": 1.552523155961138,
"learning_rate": 6.168639053254437e-07,
"loss": 1.3171,
"step": 336
},
{
"epoch": 0.8962765957446809,
"grad_norm": 1.788183804411441,
"learning_rate": 6.153846153846154e-07,
"loss": 1.5059,
"step": 337
},
{
"epoch": 0.898936170212766,
"grad_norm": 1.5907686060024624,
"learning_rate": 6.139053254437869e-07,
"loss": 1.1485,
"step": 338
},
{
"epoch": 0.901595744680851,
"grad_norm": 1.7254040314022046,
"learning_rate": 6.124260355029586e-07,
"loss": 1.5628,
"step": 339
},
{
"epoch": 0.9042553191489362,
"grad_norm": 1.6347353623664331,
"learning_rate": 6.109467455621301e-07,
"loss": 1.3704,
"step": 340
},
{
"epoch": 0.9069148936170213,
"grad_norm": 2.194464251540189,
"learning_rate": 6.094674556213018e-07,
"loss": 1.4758,
"step": 341
},
{
"epoch": 0.9095744680851063,
"grad_norm": 1.5698776022464798,
"learning_rate": 6.079881656804734e-07,
"loss": 1.3871,
"step": 342
},
{
"epoch": 0.9122340425531915,
"grad_norm": 1.8859732282362605,
"learning_rate": 6.065088757396449e-07,
"loss": 1.4136,
"step": 343
},
{
"epoch": 0.9148936170212766,
"grad_norm": 1.7373147056080605,
"learning_rate": 6.050295857988165e-07,
"loss": 1.5494,
"step": 344
},
{
"epoch": 0.9175531914893617,
"grad_norm": 1.6179407549268443,
"learning_rate": 6.035502958579881e-07,
"loss": 1.3776,
"step": 345
},
{
"epoch": 0.9202127659574468,
"grad_norm": 1.77670135626407,
"learning_rate": 6.020710059171597e-07,
"loss": 1.3275,
"step": 346
},
{
"epoch": 0.9228723404255319,
"grad_norm": 1.7482955670467306,
"learning_rate": 6.005917159763313e-07,
"loss": 1.4015,
"step": 347
},
{
"epoch": 0.925531914893617,
"grad_norm": 1.6887523807534266,
"learning_rate": 5.991124260355029e-07,
"loss": 1.5069,
"step": 348
},
{
"epoch": 0.9281914893617021,
"grad_norm": 1.514381055516736,
"learning_rate": 5.976331360946746e-07,
"loss": 1.3818,
"step": 349
},
{
"epoch": 0.9308510638297872,
"grad_norm": 1.4907168186147164,
"learning_rate": 5.961538461538461e-07,
"loss": 1.4495,
"step": 350
},
{
"epoch": 0.9335106382978723,
"grad_norm": 2.265910373999388,
"learning_rate": 5.946745562130178e-07,
"loss": 1.2853,
"step": 351
},
{
"epoch": 0.9361702127659575,
"grad_norm": 1.7992082788491501,
"learning_rate": 5.931952662721894e-07,
"loss": 1.539,
"step": 352
},
{
"epoch": 0.9388297872340425,
"grad_norm": 1.443028062263383,
"learning_rate": 5.917159763313609e-07,
"loss": 1.324,
"step": 353
},
{
"epoch": 0.9414893617021277,
"grad_norm": 1.6139434859203183,
"learning_rate": 5.902366863905324e-07,
"loss": 1.3336,
"step": 354
},
{
"epoch": 0.9441489361702128,
"grad_norm": 2.252829785523421,
"learning_rate": 5.887573964497041e-07,
"loss": 1.2986,
"step": 355
},
{
"epoch": 0.9468085106382979,
"grad_norm": 1.7284412087838827,
"learning_rate": 5.872781065088757e-07,
"loss": 1.4817,
"step": 356
},
{
"epoch": 0.949468085106383,
"grad_norm": 1.7787571244355151,
"learning_rate": 5.857988165680473e-07,
"loss": 1.5187,
"step": 357
},
{
"epoch": 0.9521276595744681,
"grad_norm": 2.181835688354598,
"learning_rate": 5.84319526627219e-07,
"loss": 1.5578,
"step": 358
},
{
"epoch": 0.9547872340425532,
"grad_norm": 1.4634212657053263,
"learning_rate": 5.828402366863905e-07,
"loss": 1.2286,
"step": 359
},
{
"epoch": 0.9574468085106383,
"grad_norm": 1.687131629579792,
"learning_rate": 5.813609467455622e-07,
"loss": 1.3256,
"step": 360
},
{
"epoch": 0.9601063829787234,
"grad_norm": 1.629444719409858,
"learning_rate": 5.798816568047337e-07,
"loss": 1.5522,
"step": 361
},
{
"epoch": 0.9627659574468085,
"grad_norm": 1.6487449612370586,
"learning_rate": 5.784023668639053e-07,
"loss": 1.5252,
"step": 362
},
{
"epoch": 0.9654255319148937,
"grad_norm": 1.5119623190054727,
"learning_rate": 5.769230769230768e-07,
"loss": 1.4479,
"step": 363
},
{
"epoch": 0.9680851063829787,
"grad_norm": 1.529900871256959,
"learning_rate": 5.754437869822485e-07,
"loss": 1.4081,
"step": 364
},
{
"epoch": 0.9707446808510638,
"grad_norm": 1.679158185017686,
"learning_rate": 5.739644970414201e-07,
"loss": 1.3219,
"step": 365
},
{
"epoch": 0.973404255319149,
"grad_norm": 1.5743852626682602,
"learning_rate": 5.724852071005917e-07,
"loss": 1.4408,
"step": 366
},
{
"epoch": 0.976063829787234,
"grad_norm": 1.4327135424204693,
"learning_rate": 5.710059171597633e-07,
"loss": 1.4267,
"step": 367
},
{
"epoch": 0.9787234042553191,
"grad_norm": 1.693248001536766,
"learning_rate": 5.695266272189349e-07,
"loss": 1.459,
"step": 368
},
{
"epoch": 0.9813829787234043,
"grad_norm": 1.6118417002332202,
"learning_rate": 5.680473372781065e-07,
"loss": 1.3239,
"step": 369
},
{
"epoch": 0.9840425531914894,
"grad_norm": 1.5994817848229685,
"learning_rate": 5.665680473372781e-07,
"loss": 1.3316,
"step": 370
},
{
"epoch": 0.9867021276595744,
"grad_norm": 1.734698428678095,
"learning_rate": 5.650887573964497e-07,
"loss": 1.4394,
"step": 371
},
{
"epoch": 0.9893617021276596,
"grad_norm": 2.8750724783344626,
"learning_rate": 5.636094674556213e-07,
"loss": 1.3439,
"step": 372
},
{
"epoch": 0.9920212765957447,
"grad_norm": 1.5483975094463054,
"learning_rate": 5.621301775147928e-07,
"loss": 1.3684,
"step": 373
},
{
"epoch": 0.9946808510638298,
"grad_norm": 1.5202730618700395,
"learning_rate": 5.606508875739645e-07,
"loss": 1.3361,
"step": 374
},
{
"epoch": 0.9973404255319149,
"grad_norm": 1.6144865765856777,
"learning_rate": 5.59171597633136e-07,
"loss": 1.3195,
"step": 375
},
{
"epoch": 1.0,
"grad_norm": 1.6792801473937533,
"learning_rate": 5.576923076923077e-07,
"loss": 1.4384,
"step": 376
},
{
"epoch": 1.002659574468085,
"grad_norm": 1.9175762077814629,
"learning_rate": 5.562130177514792e-07,
"loss": 1.4758,
"step": 377
},
{
"epoch": 1.0053191489361701,
"grad_norm": 1.8048610851481421,
"learning_rate": 5.547337278106509e-07,
"loss": 1.4803,
"step": 378
},
{
"epoch": 1.0079787234042554,
"grad_norm": 1.606071563190404,
"learning_rate": 5.532544378698225e-07,
"loss": 1.485,
"step": 379
},
{
"epoch": 1.0106382978723405,
"grad_norm": 1.5572569044777356,
"learning_rate": 5.517751479289941e-07,
"loss": 1.2355,
"step": 380
},
{
"epoch": 1.0132978723404256,
"grad_norm": 1.5959684601920348,
"learning_rate": 5.502958579881657e-07,
"loss": 1.2246,
"step": 381
},
{
"epoch": 1.0159574468085106,
"grad_norm": 1.9674075560318893,
"learning_rate": 5.488165680473372e-07,
"loss": 1.5334,
"step": 382
},
{
"epoch": 1.0186170212765957,
"grad_norm": 1.6680206362227628,
"learning_rate": 5.473372781065088e-07,
"loss": 1.4226,
"step": 383
},
{
"epoch": 1.0212765957446808,
"grad_norm": 1.5700791218738284,
"learning_rate": 5.458579881656804e-07,
"loss": 1.3727,
"step": 384
},
{
"epoch": 1.023936170212766,
"grad_norm": 1.5969942768737249,
"learning_rate": 5.44378698224852e-07,
"loss": 1.4911,
"step": 385
},
{
"epoch": 1.0265957446808511,
"grad_norm": 1.5398360114287806,
"learning_rate": 5.428994082840237e-07,
"loss": 1.3769,
"step": 386
},
{
"epoch": 1.0292553191489362,
"grad_norm": 1.5805625597294484,
"learning_rate": 5.414201183431952e-07,
"loss": 1.4166,
"step": 387
},
{
"epoch": 1.0319148936170213,
"grad_norm": 1.5312252431931253,
"learning_rate": 5.399408284023669e-07,
"loss": 1.2332,
"step": 388
},
{
"epoch": 1.0345744680851063,
"grad_norm": 2.185966499141712,
"learning_rate": 5.384615384615384e-07,
"loss": 1.3489,
"step": 389
},
{
"epoch": 1.0372340425531914,
"grad_norm": 1.5033859343676257,
"learning_rate": 5.369822485207101e-07,
"loss": 1.4487,
"step": 390
},
{
"epoch": 1.0398936170212767,
"grad_norm": 1.6054054860368354,
"learning_rate": 5.355029585798815e-07,
"loss": 1.4788,
"step": 391
},
{
"epoch": 1.0425531914893618,
"grad_norm": 1.6494604615754016,
"learning_rate": 5.340236686390532e-07,
"loss": 1.479,
"step": 392
},
{
"epoch": 1.0452127659574468,
"grad_norm": 1.7222866777780232,
"learning_rate": 5.325443786982249e-07,
"loss": 1.3891,
"step": 393
},
{
"epoch": 1.047872340425532,
"grad_norm": 1.7350078493539867,
"learning_rate": 5.310650887573964e-07,
"loss": 1.5214,
"step": 394
},
{
"epoch": 1.050531914893617,
"grad_norm": 1.677699700420203,
"learning_rate": 5.295857988165681e-07,
"loss": 1.4027,
"step": 395
},
{
"epoch": 1.053191489361702,
"grad_norm": 1.7218061845324277,
"learning_rate": 5.281065088757396e-07,
"loss": 1.5612,
"step": 396
},
{
"epoch": 1.0558510638297873,
"grad_norm": 2.0460338465780015,
"learning_rate": 5.266272189349113e-07,
"loss": 1.7095,
"step": 397
},
{
"epoch": 1.0585106382978724,
"grad_norm": 1.8707733198479073,
"learning_rate": 5.251479289940828e-07,
"loss": 1.3582,
"step": 398
},
{
"epoch": 1.0611702127659575,
"grad_norm": 1.6674094055135629,
"learning_rate": 5.236686390532545e-07,
"loss": 1.4667,
"step": 399
},
{
"epoch": 1.0638297872340425,
"grad_norm": 1.9223542274996348,
"learning_rate": 5.22189349112426e-07,
"loss": 1.3237,
"step": 400
},
{
"epoch": 1.0664893617021276,
"grad_norm": 1.442702870639783,
"learning_rate": 5.207100591715976e-07,
"loss": 1.3436,
"step": 401
},
{
"epoch": 1.0691489361702127,
"grad_norm": 1.459623592531859,
"learning_rate": 5.192307692307692e-07,
"loss": 1.3075,
"step": 402
},
{
"epoch": 1.071808510638298,
"grad_norm": 1.7736298040913328,
"learning_rate": 5.177514792899408e-07,
"loss": 1.55,
"step": 403
},
{
"epoch": 1.074468085106383,
"grad_norm": 1.492584255658168,
"learning_rate": 5.162721893491124e-07,
"loss": 1.3287,
"step": 404
},
{
"epoch": 1.077127659574468,
"grad_norm": 1.5311371897968131,
"learning_rate": 5.14792899408284e-07,
"loss": 1.2852,
"step": 405
},
{
"epoch": 1.0797872340425532,
"grad_norm": 1.7056998990486645,
"learning_rate": 5.133136094674556e-07,
"loss": 1.3844,
"step": 406
},
{
"epoch": 1.0824468085106382,
"grad_norm": 1.5754295217572547,
"learning_rate": 5.118343195266272e-07,
"loss": 1.4362,
"step": 407
},
{
"epoch": 1.0851063829787233,
"grad_norm": 1.7118767376849466,
"learning_rate": 5.103550295857988e-07,
"loss": 1.4678,
"step": 408
},
{
"epoch": 1.0877659574468086,
"grad_norm": 1.7720130880057632,
"learning_rate": 5.088757396449705e-07,
"loss": 1.407,
"step": 409
},
{
"epoch": 1.0904255319148937,
"grad_norm": 1.6779654968724649,
"learning_rate": 5.07396449704142e-07,
"loss": 1.4306,
"step": 410
},
{
"epoch": 1.0930851063829787,
"grad_norm": 1.6236129122592553,
"learning_rate": 5.059171597633136e-07,
"loss": 1.3498,
"step": 411
},
{
"epoch": 1.0957446808510638,
"grad_norm": 1.6329048532167492,
"learning_rate": 5.044378698224851e-07,
"loss": 1.4461,
"step": 412
},
{
"epoch": 1.0984042553191489,
"grad_norm": 1.6207024159387382,
"learning_rate": 5.029585798816568e-07,
"loss": 1.3772,
"step": 413
},
{
"epoch": 1.101063829787234,
"grad_norm": 1.5324741841766363,
"learning_rate": 5.014792899408283e-07,
"loss": 1.1312,
"step": 414
},
{
"epoch": 1.1037234042553192,
"grad_norm": 1.7401441557132455,
"learning_rate": 5e-07,
"loss": 1.1982,
"step": 415
},
{
"epoch": 1.1063829787234043,
"grad_norm": 1.7504453773507886,
"learning_rate": 4.985207100591715e-07,
"loss": 1.4541,
"step": 416
},
{
"epoch": 1.1090425531914894,
"grad_norm": 1.699882851098421,
"learning_rate": 4.970414201183432e-07,
"loss": 1.2368,
"step": 417
},
{
"epoch": 1.1117021276595744,
"grad_norm": 1.6218516588828402,
"learning_rate": 4.955621301775147e-07,
"loss": 1.2906,
"step": 418
},
{
"epoch": 1.1143617021276595,
"grad_norm": 1.6649091116123456,
"learning_rate": 4.940828402366864e-07,
"loss": 1.4454,
"step": 419
},
{
"epoch": 1.1170212765957448,
"grad_norm": 1.728282227356823,
"learning_rate": 4.926035502958579e-07,
"loss": 1.4663,
"step": 420
},
{
"epoch": 1.1196808510638299,
"grad_norm": 1.6435295189184387,
"learning_rate": 4.911242603550296e-07,
"loss": 1.4789,
"step": 421
},
{
"epoch": 1.122340425531915,
"grad_norm": 1.8191659615562332,
"learning_rate": 4.896449704142011e-07,
"loss": 1.3986,
"step": 422
},
{
"epoch": 1.125,
"grad_norm": 1.5470082389400086,
"learning_rate": 4.881656804733728e-07,
"loss": 1.4072,
"step": 423
},
{
"epoch": 1.127659574468085,
"grad_norm": 1.581839768866324,
"learning_rate": 4.866863905325443e-07,
"loss": 1.3122,
"step": 424
},
{
"epoch": 1.1303191489361701,
"grad_norm": 1.4620677635311095,
"learning_rate": 4.852071005917159e-07,
"loss": 1.2643,
"step": 425
},
{
"epoch": 1.1329787234042552,
"grad_norm": 1.6707102916564711,
"learning_rate": 4.837278106508875e-07,
"loss": 1.3747,
"step": 426
},
{
"epoch": 1.1356382978723405,
"grad_norm": 1.5396285202284683,
"learning_rate": 4.822485207100592e-07,
"loss": 1.3101,
"step": 427
},
{
"epoch": 1.1382978723404256,
"grad_norm": 1.8606687901078265,
"learning_rate": 4.807692307692307e-07,
"loss": 1.3172,
"step": 428
},
{
"epoch": 1.1409574468085106,
"grad_norm": 1.6119139560046312,
"learning_rate": 4.792899408284024e-07,
"loss": 1.3865,
"step": 429
},
{
"epoch": 1.1436170212765957,
"grad_norm": 1.715672112601465,
"learning_rate": 4.778106508875739e-07,
"loss": 1.4168,
"step": 430
},
{
"epoch": 1.1462765957446808,
"grad_norm": 1.6367162736314051,
"learning_rate": 4.7633136094674555e-07,
"loss": 1.6202,
"step": 431
},
{
"epoch": 1.148936170212766,
"grad_norm": 1.6173047746530647,
"learning_rate": 4.748520710059171e-07,
"loss": 1.4345,
"step": 432
},
{
"epoch": 1.1515957446808511,
"grad_norm": 1.591852292459417,
"learning_rate": 4.733727810650887e-07,
"loss": 1.3504,
"step": 433
},
{
"epoch": 1.1542553191489362,
"grad_norm": 1.704091419091409,
"learning_rate": 4.7189349112426035e-07,
"loss": 1.3978,
"step": 434
},
{
"epoch": 1.1569148936170213,
"grad_norm": 1.6750388468322808,
"learning_rate": 4.7041420118343195e-07,
"loss": 1.5323,
"step": 435
},
{
"epoch": 1.1595744680851063,
"grad_norm": 1.550611356946591,
"learning_rate": 4.6893491124260356e-07,
"loss": 1.3516,
"step": 436
},
{
"epoch": 1.1622340425531914,
"grad_norm": 1.6666235759250934,
"learning_rate": 4.674556213017751e-07,
"loss": 1.3193,
"step": 437
},
{
"epoch": 1.1648936170212765,
"grad_norm": 1.6060648830034072,
"learning_rate": 4.659763313609467e-07,
"loss": 1.4802,
"step": 438
},
{
"epoch": 1.1675531914893618,
"grad_norm": 2.7759623465499113,
"learning_rate": 4.644970414201183e-07,
"loss": 1.3673,
"step": 439
},
{
"epoch": 1.1702127659574468,
"grad_norm": 1.6142500584687862,
"learning_rate": 4.630177514792899e-07,
"loss": 1.2367,
"step": 440
},
{
"epoch": 1.172872340425532,
"grad_norm": 1.6293255382971552,
"learning_rate": 4.6153846153846156e-07,
"loss": 1.4771,
"step": 441
},
{
"epoch": 1.175531914893617,
"grad_norm": 1.6166636037633662,
"learning_rate": 4.600591715976331e-07,
"loss": 1.3891,
"step": 442
},
{
"epoch": 1.178191489361702,
"grad_norm": 1.6156668770120142,
"learning_rate": 4.585798816568047e-07,
"loss": 1.3015,
"step": 443
},
{
"epoch": 1.1808510638297873,
"grad_norm": 1.541456190983287,
"learning_rate": 4.571005917159763e-07,
"loss": 1.325,
"step": 444
},
{
"epoch": 1.1835106382978724,
"grad_norm": 1.5371528822910774,
"learning_rate": 4.556213017751479e-07,
"loss": 1.391,
"step": 445
},
{
"epoch": 1.1861702127659575,
"grad_norm": 1.8047509120352834,
"learning_rate": 4.541420118343195e-07,
"loss": 1.3802,
"step": 446
},
{
"epoch": 1.1888297872340425,
"grad_norm": 1.4772002442457595,
"learning_rate": 4.5266272189349107e-07,
"loss": 1.2972,
"step": 447
},
{
"epoch": 1.1914893617021276,
"grad_norm": 1.4833680602448407,
"learning_rate": 4.5118343195266267e-07,
"loss": 1.3515,
"step": 448
},
{
"epoch": 1.1941489361702127,
"grad_norm": 1.557530779220624,
"learning_rate": 4.497041420118343e-07,
"loss": 1.367,
"step": 449
},
{
"epoch": 1.196808510638298,
"grad_norm": 1.8027220753490893,
"learning_rate": 4.482248520710059e-07,
"loss": 1.5443,
"step": 450
},
{
"epoch": 1.199468085106383,
"grad_norm": 1.5684441226470547,
"learning_rate": 4.467455621301775e-07,
"loss": 1.3059,
"step": 451
},
{
"epoch": 1.202127659574468,
"grad_norm": 1.593970040483734,
"learning_rate": 4.4526627218934907e-07,
"loss": 1.2474,
"step": 452
},
{
"epoch": 1.2047872340425532,
"grad_norm": 1.7048839620218588,
"learning_rate": 4.437869822485207e-07,
"loss": 1.4689,
"step": 453
},
{
"epoch": 1.2074468085106382,
"grad_norm": 1.6500745120708162,
"learning_rate": 4.423076923076923e-07,
"loss": 1.3768,
"step": 454
},
{
"epoch": 1.2101063829787235,
"grad_norm": 1.6649022378945304,
"learning_rate": 4.408284023668639e-07,
"loss": 1.6992,
"step": 455
},
{
"epoch": 1.2127659574468086,
"grad_norm": 2.150475218838757,
"learning_rate": 4.3934911242603553e-07,
"loss": 1.4338,
"step": 456
},
{
"epoch": 1.2154255319148937,
"grad_norm": 1.4810681098612493,
"learning_rate": 4.378698224852071e-07,
"loss": 1.2523,
"step": 457
},
{
"epoch": 1.2180851063829787,
"grad_norm": 1.5941194592252996,
"learning_rate": 4.363905325443787e-07,
"loss": 1.5144,
"step": 458
},
{
"epoch": 1.2207446808510638,
"grad_norm": 2.9846606692055855,
"learning_rate": 4.349112426035503e-07,
"loss": 1.4394,
"step": 459
},
{
"epoch": 1.2234042553191489,
"grad_norm": 1.5758645515570575,
"learning_rate": 4.334319526627219e-07,
"loss": 1.314,
"step": 460
},
{
"epoch": 1.226063829787234,
"grad_norm": 2.0348791713600374,
"learning_rate": 4.3195266272189343e-07,
"loss": 1.3581,
"step": 461
},
{
"epoch": 1.2287234042553192,
"grad_norm": 1.65492749945659,
"learning_rate": 4.3047337278106503e-07,
"loss": 1.5053,
"step": 462
},
{
"epoch": 1.2313829787234043,
"grad_norm": 1.6722641251398465,
"learning_rate": 4.289940828402367e-07,
"loss": 1.4641,
"step": 463
},
{
"epoch": 1.2340425531914894,
"grad_norm": 1.5474460973272384,
"learning_rate": 4.275147928994083e-07,
"loss": 1.4182,
"step": 464
},
{
"epoch": 1.2367021276595744,
"grad_norm": 1.7345506046508428,
"learning_rate": 4.260355029585799e-07,
"loss": 1.3139,
"step": 465
},
{
"epoch": 1.2393617021276595,
"grad_norm": 1.7713814803315784,
"learning_rate": 4.2455621301775144e-07,
"loss": 1.4832,
"step": 466
},
{
"epoch": 1.2420212765957448,
"grad_norm": 1.5498103025703653,
"learning_rate": 4.2307692307692304e-07,
"loss": 1.4115,
"step": 467
},
{
"epoch": 1.2446808510638299,
"grad_norm": 1.5577840972729278,
"learning_rate": 4.2159763313609464e-07,
"loss": 1.3256,
"step": 468
},
{
"epoch": 1.247340425531915,
"grad_norm": 1.578861933203747,
"learning_rate": 4.2011834319526624e-07,
"loss": 1.2007,
"step": 469
},
{
"epoch": 1.25,
"grad_norm": 1.6507686385229483,
"learning_rate": 4.1863905325443785e-07,
"loss": 1.3944,
"step": 470
},
{
"epoch": 1.252659574468085,
"grad_norm": 1.7990714539210155,
"learning_rate": 4.1715976331360945e-07,
"loss": 1.4632,
"step": 471
},
{
"epoch": 1.2553191489361701,
"grad_norm": 1.7618234269198014,
"learning_rate": 4.1568047337278105e-07,
"loss": 1.3313,
"step": 472
},
{
"epoch": 1.2579787234042552,
"grad_norm": 1.5213599490802718,
"learning_rate": 4.1420118343195265e-07,
"loss": 1.5047,
"step": 473
},
{
"epoch": 1.2606382978723405,
"grad_norm": 1.6052633557883167,
"learning_rate": 4.1272189349112425e-07,
"loss": 1.4177,
"step": 474
},
{
"epoch": 1.2632978723404256,
"grad_norm": 1.9773267391803975,
"learning_rate": 4.1124260355029585e-07,
"loss": 1.2606,
"step": 475
},
{
"epoch": 1.2659574468085106,
"grad_norm": 1.7023545368582853,
"learning_rate": 4.097633136094674e-07,
"loss": 1.3522,
"step": 476
},
{
"epoch": 1.2686170212765957,
"grad_norm": 1.657218002450086,
"learning_rate": 4.08284023668639e-07,
"loss": 1.307,
"step": 477
},
{
"epoch": 1.2712765957446808,
"grad_norm": 1.6560677482089055,
"learning_rate": 4.0680473372781066e-07,
"loss": 1.5599,
"step": 478
},
{
"epoch": 1.273936170212766,
"grad_norm": 1.5827603390864668,
"learning_rate": 4.0532544378698226e-07,
"loss": 1.3864,
"step": 479
},
{
"epoch": 1.2765957446808511,
"grad_norm": 1.490492812079521,
"learning_rate": 4.0384615384615386e-07,
"loss": 1.3238,
"step": 480
},
{
"epoch": 1.2792553191489362,
"grad_norm": 1.4427306337618429,
"learning_rate": 4.023668639053254e-07,
"loss": 1.3381,
"step": 481
},
{
"epoch": 1.2819148936170213,
"grad_norm": 1.8739427128710302,
"learning_rate": 4.00887573964497e-07,
"loss": 1.5195,
"step": 482
},
{
"epoch": 1.2845744680851063,
"grad_norm": 1.4205586135195478,
"learning_rate": 3.994082840236686e-07,
"loss": 1.3342,
"step": 483
},
{
"epoch": 1.2872340425531914,
"grad_norm": 1.4978308888768397,
"learning_rate": 3.979289940828402e-07,
"loss": 1.3198,
"step": 484
},
{
"epoch": 1.2898936170212765,
"grad_norm": 1.453096779169849,
"learning_rate": 3.9644970414201187e-07,
"loss": 1.0572,
"step": 485
},
{
"epoch": 1.2925531914893618,
"grad_norm": 1.9700050592115472,
"learning_rate": 3.949704142011834e-07,
"loss": 1.5476,
"step": 486
},
{
"epoch": 1.2952127659574468,
"grad_norm": 1.52650807341244,
"learning_rate": 3.93491124260355e-07,
"loss": 1.3027,
"step": 487
},
{
"epoch": 1.297872340425532,
"grad_norm": 1.6797022619264115,
"learning_rate": 3.920118343195266e-07,
"loss": 1.4014,
"step": 488
},
{
"epoch": 1.300531914893617,
"grad_norm": 1.4684740172475148,
"learning_rate": 3.905325443786982e-07,
"loss": 1.2891,
"step": 489
},
{
"epoch": 1.3031914893617023,
"grad_norm": 1.7009794386978352,
"learning_rate": 3.8905325443786977e-07,
"loss": 1.498,
"step": 490
},
{
"epoch": 1.3058510638297873,
"grad_norm": 1.8679273089411261,
"learning_rate": 3.8757396449704137e-07,
"loss": 1.5135,
"step": 491
},
{
"epoch": 1.3085106382978724,
"grad_norm": 2.6124670473785723,
"learning_rate": 3.8609467455621297e-07,
"loss": 1.4419,
"step": 492
},
{
"epoch": 1.3111702127659575,
"grad_norm": 1.531497234704401,
"learning_rate": 3.8461538461538463e-07,
"loss": 1.441,
"step": 493
},
{
"epoch": 1.3138297872340425,
"grad_norm": 1.6983808183380165,
"learning_rate": 3.8313609467455623e-07,
"loss": 1.3176,
"step": 494
},
{
"epoch": 1.3164893617021276,
"grad_norm": 1.7106971746124235,
"learning_rate": 3.816568047337278e-07,
"loss": 1.2673,
"step": 495
},
{
"epoch": 1.3191489361702127,
"grad_norm": 1.7661676163840787,
"learning_rate": 3.801775147928994e-07,
"loss": 1.6258,
"step": 496
},
{
"epoch": 1.3218085106382977,
"grad_norm": 1.6248132891862335,
"learning_rate": 3.78698224852071e-07,
"loss": 1.3813,
"step": 497
},
{
"epoch": 1.324468085106383,
"grad_norm": 1.5079876101311178,
"learning_rate": 3.772189349112426e-07,
"loss": 1.3491,
"step": 498
},
{
"epoch": 1.327127659574468,
"grad_norm": 1.9080784267885529,
"learning_rate": 3.757396449704142e-07,
"loss": 1.4263,
"step": 499
},
{
"epoch": 1.3297872340425532,
"grad_norm": 1.7134136936747053,
"learning_rate": 3.742603550295858e-07,
"loss": 1.4804,
"step": 500
},
{
"epoch": 1.3324468085106382,
"grad_norm": 2.6890220767611934,
"learning_rate": 3.727810650887574e-07,
"loss": 1.4301,
"step": 501
},
{
"epoch": 1.3351063829787235,
"grad_norm": 1.4491614153026324,
"learning_rate": 3.71301775147929e-07,
"loss": 1.2226,
"step": 502
},
{
"epoch": 1.3377659574468086,
"grad_norm": 1.4673050610910694,
"learning_rate": 3.698224852071006e-07,
"loss": 1.2824,
"step": 503
},
{
"epoch": 1.3404255319148937,
"grad_norm": 1.5811077672143066,
"learning_rate": 3.683431952662722e-07,
"loss": 1.4056,
"step": 504
},
{
"epoch": 1.3430851063829787,
"grad_norm": 1.784207214911482,
"learning_rate": 3.6686390532544374e-07,
"loss": 1.4456,
"step": 505
},
{
"epoch": 1.3457446808510638,
"grad_norm": 1.7545013437687231,
"learning_rate": 3.6538461538461534e-07,
"loss": 1.4255,
"step": 506
},
{
"epoch": 1.3484042553191489,
"grad_norm": 1.5378814658235478,
"learning_rate": 3.63905325443787e-07,
"loss": 1.4752,
"step": 507
},
{
"epoch": 1.351063829787234,
"grad_norm": 1.5663338737224375,
"learning_rate": 3.624260355029586e-07,
"loss": 1.4324,
"step": 508
},
{
"epoch": 1.3537234042553192,
"grad_norm": 1.5530747526395428,
"learning_rate": 3.609467455621302e-07,
"loss": 1.3294,
"step": 509
},
{
"epoch": 1.3563829787234043,
"grad_norm": 1.5555181536643647,
"learning_rate": 3.5946745562130175e-07,
"loss": 1.2615,
"step": 510
},
{
"epoch": 1.3590425531914894,
"grad_norm": 1.561907923100703,
"learning_rate": 3.5798816568047335e-07,
"loss": 1.4247,
"step": 511
},
{
"epoch": 1.3617021276595744,
"grad_norm": 1.561727561754077,
"learning_rate": 3.5650887573964495e-07,
"loss": 1.442,
"step": 512
},
{
"epoch": 1.3643617021276595,
"grad_norm": 1.571729864924405,
"learning_rate": 3.5502958579881655e-07,
"loss": 1.3471,
"step": 513
},
{
"epoch": 1.3670212765957448,
"grad_norm": 1.6501651767936791,
"learning_rate": 3.535502958579881e-07,
"loss": 1.4957,
"step": 514
},
{
"epoch": 1.3696808510638299,
"grad_norm": 1.7712985007484374,
"learning_rate": 3.5207100591715975e-07,
"loss": 1.3116,
"step": 515
},
{
"epoch": 1.372340425531915,
"grad_norm": 1.6021754882790804,
"learning_rate": 3.5059171597633135e-07,
"loss": 1.3507,
"step": 516
},
{
"epoch": 1.375,
"grad_norm": 1.9744682223829157,
"learning_rate": 3.4911242603550296e-07,
"loss": 1.3187,
"step": 517
},
{
"epoch": 1.377659574468085,
"grad_norm": 1.437548678030046,
"learning_rate": 3.4763313609467456e-07,
"loss": 1.3055,
"step": 518
},
{
"epoch": 1.3803191489361701,
"grad_norm": 1.7376163882785898,
"learning_rate": 3.461538461538461e-07,
"loss": 1.3712,
"step": 519
},
{
"epoch": 1.3829787234042552,
"grad_norm": 1.709895613646418,
"learning_rate": 3.446745562130177e-07,
"loss": 1.4941,
"step": 520
},
{
"epoch": 1.3856382978723405,
"grad_norm": 1.5064773577923485,
"learning_rate": 3.431952662721893e-07,
"loss": 1.3598,
"step": 521
},
{
"epoch": 1.3882978723404256,
"grad_norm": 1.6991123209979573,
"learning_rate": 3.4171597633136096e-07,
"loss": 1.3859,
"step": 522
},
{
"epoch": 1.3909574468085106,
"grad_norm": 1.611358975201833,
"learning_rate": 3.4023668639053256e-07,
"loss": 1.3624,
"step": 523
},
{
"epoch": 1.3936170212765957,
"grad_norm": 1.5235030722566782,
"learning_rate": 3.387573964497041e-07,
"loss": 1.306,
"step": 524
},
{
"epoch": 1.3962765957446808,
"grad_norm": 1.5097567026286727,
"learning_rate": 3.372781065088757e-07,
"loss": 1.3098,
"step": 525
},
{
"epoch": 1.398936170212766,
"grad_norm": 1.5501867735527708,
"learning_rate": 3.357988165680473e-07,
"loss": 1.2582,
"step": 526
},
{
"epoch": 1.4015957446808511,
"grad_norm": 1.5737400889065642,
"learning_rate": 3.343195266272189e-07,
"loss": 1.4226,
"step": 527
},
{
"epoch": 1.4042553191489362,
"grad_norm": 1.8163702192116935,
"learning_rate": 3.328402366863905e-07,
"loss": 1.45,
"step": 528
},
{
"epoch": 1.4069148936170213,
"grad_norm": 1.6761526127572786,
"learning_rate": 3.313609467455621e-07,
"loss": 1.4133,
"step": 529
},
{
"epoch": 1.4095744680851063,
"grad_norm": 1.7300976770863319,
"learning_rate": 3.298816568047337e-07,
"loss": 1.5036,
"step": 530
},
{
"epoch": 1.4122340425531914,
"grad_norm": 1.7219520565452116,
"learning_rate": 3.284023668639053e-07,
"loss": 1.4172,
"step": 531
},
{
"epoch": 1.4148936170212765,
"grad_norm": 1.8137826656078981,
"learning_rate": 3.269230769230769e-07,
"loss": 1.5673,
"step": 532
},
{
"epoch": 1.4175531914893618,
"grad_norm": 1.9605494871424245,
"learning_rate": 3.254437869822485e-07,
"loss": 1.4421,
"step": 533
},
{
"epoch": 1.4202127659574468,
"grad_norm": 1.5063443324517625,
"learning_rate": 3.239644970414201e-07,
"loss": 1.3858,
"step": 534
},
{
"epoch": 1.422872340425532,
"grad_norm": 1.5929428001187216,
"learning_rate": 3.224852071005917e-07,
"loss": 1.4245,
"step": 535
},
{
"epoch": 1.425531914893617,
"grad_norm": 1.5090052181328104,
"learning_rate": 3.210059171597633e-07,
"loss": 1.185,
"step": 536
},
{
"epoch": 1.4281914893617023,
"grad_norm": 1.7599894966549008,
"learning_rate": 3.1952662721893493e-07,
"loss": 1.5936,
"step": 537
},
{
"epoch": 1.4308510638297873,
"grad_norm": 1.8274682976599146,
"learning_rate": 3.1804733727810653e-07,
"loss": 1.5133,
"step": 538
},
{
"epoch": 1.4335106382978724,
"grad_norm": 1.6304863965807304,
"learning_rate": 3.165680473372781e-07,
"loss": 1.4513,
"step": 539
},
{
"epoch": 1.4361702127659575,
"grad_norm": 1.865748149954226,
"learning_rate": 3.150887573964497e-07,
"loss": 1.579,
"step": 540
},
{
"epoch": 1.4388297872340425,
"grad_norm": 1.497890260310679,
"learning_rate": 3.136094674556213e-07,
"loss": 1.3996,
"step": 541
},
{
"epoch": 1.4414893617021276,
"grad_norm": 1.5505684579290944,
"learning_rate": 3.121301775147929e-07,
"loss": 1.4765,
"step": 542
},
{
"epoch": 1.4441489361702127,
"grad_norm": 1.5934674629645669,
"learning_rate": 3.1065088757396443e-07,
"loss": 1.2206,
"step": 543
},
{
"epoch": 1.4468085106382977,
"grad_norm": 2.5003698075483776,
"learning_rate": 3.091715976331361e-07,
"loss": 1.4785,
"step": 544
},
{
"epoch": 1.449468085106383,
"grad_norm": 1.5430363507491573,
"learning_rate": 3.076923076923077e-07,
"loss": 1.3596,
"step": 545
},
{
"epoch": 1.452127659574468,
"grad_norm": 1.6114525579321486,
"learning_rate": 3.062130177514793e-07,
"loss": 1.3768,
"step": 546
},
{
"epoch": 1.4547872340425532,
"grad_norm": 1.51705181171149,
"learning_rate": 3.047337278106509e-07,
"loss": 1.3161,
"step": 547
},
{
"epoch": 1.4574468085106382,
"grad_norm": 1.659706683154854,
"learning_rate": 3.0325443786982244e-07,
"loss": 1.4808,
"step": 548
},
{
"epoch": 1.4601063829787235,
"grad_norm": 1.6484483474446856,
"learning_rate": 3.0177514792899404e-07,
"loss": 1.398,
"step": 549
},
{
"epoch": 1.4627659574468086,
"grad_norm": 1.6054531570011474,
"learning_rate": 3.0029585798816564e-07,
"loss": 1.1421,
"step": 550
},
{
"epoch": 1.4654255319148937,
"grad_norm": 1.5260699880356663,
"learning_rate": 2.988165680473373e-07,
"loss": 1.4223,
"step": 551
},
{
"epoch": 1.4680851063829787,
"grad_norm": 1.5022650148070196,
"learning_rate": 2.973372781065089e-07,
"loss": 1.3579,
"step": 552
},
{
"epoch": 1.4707446808510638,
"grad_norm": 1.696210632092268,
"learning_rate": 2.9585798816568045e-07,
"loss": 1.4437,
"step": 553
},
{
"epoch": 1.4734042553191489,
"grad_norm": 1.50505509525979,
"learning_rate": 2.9437869822485205e-07,
"loss": 1.3666,
"step": 554
},
{
"epoch": 1.476063829787234,
"grad_norm": 1.6283581586889138,
"learning_rate": 2.9289940828402365e-07,
"loss": 1.3807,
"step": 555
},
{
"epoch": 1.4787234042553192,
"grad_norm": 1.57845733466985,
"learning_rate": 2.9142011834319525e-07,
"loss": 1.4947,
"step": 556
},
{
"epoch": 1.4813829787234043,
"grad_norm": 1.6269594263364617,
"learning_rate": 2.8994082840236686e-07,
"loss": 1.5315,
"step": 557
},
{
"epoch": 1.4840425531914894,
"grad_norm": 1.4901674188093539,
"learning_rate": 2.884615384615384e-07,
"loss": 1.2067,
"step": 558
},
{
"epoch": 1.4867021276595744,
"grad_norm": 1.608926803251607,
"learning_rate": 2.8698224852071006e-07,
"loss": 1.4501,
"step": 559
},
{
"epoch": 1.4893617021276595,
"grad_norm": 1.5736272188001768,
"learning_rate": 2.8550295857988166e-07,
"loss": 1.4938,
"step": 560
},
{
"epoch": 1.4920212765957448,
"grad_norm": 1.6178988306695008,
"learning_rate": 2.8402366863905326e-07,
"loss": 1.2858,
"step": 561
},
{
"epoch": 1.4946808510638299,
"grad_norm": 1.612098241628475,
"learning_rate": 2.8254437869822486e-07,
"loss": 1.3793,
"step": 562
},
{
"epoch": 1.497340425531915,
"grad_norm": 1.521850228548639,
"learning_rate": 2.810650887573964e-07,
"loss": 1.3616,
"step": 563
},
{
"epoch": 1.5,
"grad_norm": 1.4283693834886921,
"learning_rate": 2.79585798816568e-07,
"loss": 1.2373,
"step": 564
},
{
"epoch": 1.502659574468085,
"grad_norm": 1.4614575118454327,
"learning_rate": 2.781065088757396e-07,
"loss": 1.3506,
"step": 565
},
{
"epoch": 1.5053191489361701,
"grad_norm": 4.833934856122629,
"learning_rate": 2.7662721893491127e-07,
"loss": 1.3368,
"step": 566
},
{
"epoch": 1.5079787234042552,
"grad_norm": 1.5417407593664367,
"learning_rate": 2.7514792899408287e-07,
"loss": 1.3806,
"step": 567
},
{
"epoch": 1.5106382978723403,
"grad_norm": 1.3942611390001125,
"learning_rate": 2.736686390532544e-07,
"loss": 1.2778,
"step": 568
},
{
"epoch": 1.5132978723404256,
"grad_norm": 1.5232973474443783,
"learning_rate": 2.72189349112426e-07,
"loss": 1.5106,
"step": 569
},
{
"epoch": 1.5159574468085106,
"grad_norm": 1.6181295111494955,
"learning_rate": 2.707100591715976e-07,
"loss": 1.3182,
"step": 570
},
{
"epoch": 1.5186170212765957,
"grad_norm": 1.4905875051329172,
"learning_rate": 2.692307692307692e-07,
"loss": 1.359,
"step": 571
},
{
"epoch": 1.521276595744681,
"grad_norm": 1.5438422326091557,
"learning_rate": 2.6775147928994077e-07,
"loss": 1.4581,
"step": 572
},
{
"epoch": 1.523936170212766,
"grad_norm": 1.6689444553647594,
"learning_rate": 2.662721893491124e-07,
"loss": 1.4416,
"step": 573
},
{
"epoch": 1.5265957446808511,
"grad_norm": 1.732092721800618,
"learning_rate": 2.6479289940828403e-07,
"loss": 1.4653,
"step": 574
},
{
"epoch": 1.5292553191489362,
"grad_norm": 1.5939357125781168,
"learning_rate": 2.6331360946745563e-07,
"loss": 1.3659,
"step": 575
},
{
"epoch": 1.5319148936170213,
"grad_norm": 1.619819379203523,
"learning_rate": 2.6183431952662723e-07,
"loss": 1.4057,
"step": 576
},
{
"epoch": 1.5345744680851063,
"grad_norm": 1.5228031500567076,
"learning_rate": 2.603550295857988e-07,
"loss": 1.3322,
"step": 577
},
{
"epoch": 1.5372340425531914,
"grad_norm": 1.6403075138073668,
"learning_rate": 2.588757396449704e-07,
"loss": 1.3243,
"step": 578
},
{
"epoch": 1.5398936170212765,
"grad_norm": 1.6158463818930031,
"learning_rate": 2.57396449704142e-07,
"loss": 1.3743,
"step": 579
},
{
"epoch": 1.5425531914893615,
"grad_norm": 1.4401607766731626,
"learning_rate": 2.559171597633136e-07,
"loss": 1.3209,
"step": 580
},
{
"epoch": 1.5452127659574468,
"grad_norm": 1.610458527778034,
"learning_rate": 2.5443786982248524e-07,
"loss": 1.437,
"step": 581
},
{
"epoch": 1.547872340425532,
"grad_norm": 1.4720391313596763,
"learning_rate": 2.529585798816568e-07,
"loss": 1.2406,
"step": 582
},
{
"epoch": 1.550531914893617,
"grad_norm": 1.4693642812943966,
"learning_rate": 2.514792899408284e-07,
"loss": 1.3345,
"step": 583
},
{
"epoch": 1.5531914893617023,
"grad_norm": 1.6024699547818029,
"learning_rate": 2.5e-07,
"loss": 1.4164,
"step": 584
},
{
"epoch": 1.5558510638297873,
"grad_norm": 1.602502091357314,
"learning_rate": 2.485207100591716e-07,
"loss": 1.4412,
"step": 585
},
{
"epoch": 1.5585106382978724,
"grad_norm": 1.7241679714315328,
"learning_rate": 2.470414201183432e-07,
"loss": 1.331,
"step": 586
},
{
"epoch": 1.5611702127659575,
"grad_norm": 1.7371187244572857,
"learning_rate": 2.455621301775148e-07,
"loss": 1.4532,
"step": 587
},
{
"epoch": 1.5638297872340425,
"grad_norm": 1.4995956670676633,
"learning_rate": 2.440828402366864e-07,
"loss": 1.2702,
"step": 588
},
{
"epoch": 1.5664893617021276,
"grad_norm": 1.4659221291046236,
"learning_rate": 2.4260355029585794e-07,
"loss": 1.4754,
"step": 589
},
{
"epoch": 1.5691489361702127,
"grad_norm": 1.5385805721266792,
"learning_rate": 2.411242603550296e-07,
"loss": 1.5509,
"step": 590
},
{
"epoch": 1.5718085106382977,
"grad_norm": 1.5161262548508925,
"learning_rate": 2.396449704142012e-07,
"loss": 1.3936,
"step": 591
},
{
"epoch": 1.574468085106383,
"grad_norm": 1.4666556990097799,
"learning_rate": 2.3816568047337277e-07,
"loss": 1.3143,
"step": 592
},
{
"epoch": 1.577127659574468,
"grad_norm": 1.6300523417207398,
"learning_rate": 2.3668639053254435e-07,
"loss": 1.3385,
"step": 593
},
{
"epoch": 1.5797872340425532,
"grad_norm": 1.5121985962743036,
"learning_rate": 2.3520710059171598e-07,
"loss": 1.4693,
"step": 594
},
{
"epoch": 1.5824468085106385,
"grad_norm": 1.6977627534281994,
"learning_rate": 2.3372781065088755e-07,
"loss": 1.5827,
"step": 595
},
{
"epoch": 1.5851063829787235,
"grad_norm": 1.4675438168952388,
"learning_rate": 2.3224852071005915e-07,
"loss": 1.4037,
"step": 596
},
{
"epoch": 1.5877659574468086,
"grad_norm": 1.5087620408684652,
"learning_rate": 2.3076923076923078e-07,
"loss": 1.2713,
"step": 597
},
{
"epoch": 1.5904255319148937,
"grad_norm": 1.853798719037303,
"learning_rate": 2.2928994082840236e-07,
"loss": 1.4517,
"step": 598
},
{
"epoch": 1.5930851063829787,
"grad_norm": 1.5624391162454545,
"learning_rate": 2.2781065088757396e-07,
"loss": 1.5716,
"step": 599
},
{
"epoch": 1.5957446808510638,
"grad_norm": 1.5647362537380562,
"learning_rate": 2.2633136094674553e-07,
"loss": 1.2679,
"step": 600
},
{
"epoch": 1.5984042553191489,
"grad_norm": 1.5028293469540326,
"learning_rate": 2.2485207100591716e-07,
"loss": 1.3477,
"step": 601
},
{
"epoch": 1.601063829787234,
"grad_norm": 1.5616178692766567,
"learning_rate": 2.2337278106508876e-07,
"loss": 1.349,
"step": 602
},
{
"epoch": 1.603723404255319,
"grad_norm": 1.5652068533404448,
"learning_rate": 2.2189349112426034e-07,
"loss": 1.3408,
"step": 603
},
{
"epoch": 1.6063829787234043,
"grad_norm": 1.8312281167867779,
"learning_rate": 2.2041420118343194e-07,
"loss": 1.5744,
"step": 604
},
{
"epoch": 1.6090425531914894,
"grad_norm": 1.5113532834536092,
"learning_rate": 2.1893491124260354e-07,
"loss": 1.5116,
"step": 605
},
{
"epoch": 1.6117021276595744,
"grad_norm": 1.4148002933798485,
"learning_rate": 2.1745562130177514e-07,
"loss": 1.3254,
"step": 606
},
{
"epoch": 1.6143617021276597,
"grad_norm": 1.4128390757612144,
"learning_rate": 2.1597633136094672e-07,
"loss": 1.3424,
"step": 607
},
{
"epoch": 1.6170212765957448,
"grad_norm": 1.664151543039297,
"learning_rate": 2.1449704142011834e-07,
"loss": 1.4507,
"step": 608
},
{
"epoch": 1.6196808510638299,
"grad_norm": 1.5001892924079347,
"learning_rate": 2.1301775147928995e-07,
"loss": 1.3598,
"step": 609
},
{
"epoch": 1.622340425531915,
"grad_norm": 1.7189011247258703,
"learning_rate": 2.1153846153846152e-07,
"loss": 1.4798,
"step": 610
},
{
"epoch": 1.625,
"grad_norm": 1.4495039913652832,
"learning_rate": 2.1005917159763312e-07,
"loss": 1.1879,
"step": 611
},
{
"epoch": 1.627659574468085,
"grad_norm": 1.4863964571390131,
"learning_rate": 2.0857988165680472e-07,
"loss": 1.4149,
"step": 612
},
{
"epoch": 1.6303191489361701,
"grad_norm": 1.470842696782351,
"learning_rate": 2.0710059171597633e-07,
"loss": 1.5213,
"step": 613
},
{
"epoch": 1.6329787234042552,
"grad_norm": 1.5332931589309218,
"learning_rate": 2.0562130177514793e-07,
"loss": 1.3847,
"step": 614
},
{
"epoch": 1.6356382978723403,
"grad_norm": 1.5012230655181953,
"learning_rate": 2.041420118343195e-07,
"loss": 1.2194,
"step": 615
},
{
"epoch": 1.6382978723404256,
"grad_norm": 1.4592244922211661,
"learning_rate": 2.0266272189349113e-07,
"loss": 1.2863,
"step": 616
},
{
"epoch": 1.6409574468085106,
"grad_norm": 1.6194968573694928,
"learning_rate": 2.011834319526627e-07,
"loss": 1.563,
"step": 617
},
{
"epoch": 1.6436170212765957,
"grad_norm": 1.5398995693701385,
"learning_rate": 1.997041420118343e-07,
"loss": 1.5,
"step": 618
},
{
"epoch": 1.646276595744681,
"grad_norm": 1.803830954994613,
"learning_rate": 1.9822485207100593e-07,
"loss": 1.3459,
"step": 619
},
{
"epoch": 1.648936170212766,
"grad_norm": 1.5731270083148248,
"learning_rate": 1.967455621301775e-07,
"loss": 1.3277,
"step": 620
},
{
"epoch": 1.6515957446808511,
"grad_norm": 1.6370008858204694,
"learning_rate": 1.952662721893491e-07,
"loss": 1.4752,
"step": 621
},
{
"epoch": 1.6542553191489362,
"grad_norm": 1.3905339157621093,
"learning_rate": 1.9378698224852069e-07,
"loss": 1.1591,
"step": 622
},
{
"epoch": 1.6569148936170213,
"grad_norm": 1.521784820078054,
"learning_rate": 1.9230769230769231e-07,
"loss": 1.348,
"step": 623
},
{
"epoch": 1.6595744680851063,
"grad_norm": 2.480779673395715,
"learning_rate": 1.908284023668639e-07,
"loss": 1.3468,
"step": 624
},
{
"epoch": 1.6622340425531914,
"grad_norm": 1.5047866424190777,
"learning_rate": 1.893491124260355e-07,
"loss": 1.3808,
"step": 625
},
{
"epoch": 1.6648936170212765,
"grad_norm": 1.5186127777273435,
"learning_rate": 1.878698224852071e-07,
"loss": 1.4201,
"step": 626
},
{
"epoch": 1.6675531914893615,
"grad_norm": 1.4407427328000266,
"learning_rate": 1.863905325443787e-07,
"loss": 1.273,
"step": 627
},
{
"epoch": 1.6702127659574468,
"grad_norm": 1.5224116074533014,
"learning_rate": 1.849112426035503e-07,
"loss": 1.2098,
"step": 628
},
{
"epoch": 1.672872340425532,
"grad_norm": 1.527239003211648,
"learning_rate": 1.8343195266272187e-07,
"loss": 1.3724,
"step": 629
},
{
"epoch": 1.675531914893617,
"grad_norm": 1.6525871512419401,
"learning_rate": 1.819526627218935e-07,
"loss": 1.3946,
"step": 630
},
{
"epoch": 1.6781914893617023,
"grad_norm": 3.0200043340992933,
"learning_rate": 1.804733727810651e-07,
"loss": 1.4742,
"step": 631
},
{
"epoch": 1.6808510638297873,
"grad_norm": 1.5029965510376364,
"learning_rate": 1.7899408284023667e-07,
"loss": 1.3623,
"step": 632
},
{
"epoch": 1.6835106382978724,
"grad_norm": 1.5389625013367383,
"learning_rate": 1.7751479289940827e-07,
"loss": 1.5043,
"step": 633
},
{
"epoch": 1.6861702127659575,
"grad_norm": 1.5608661501656413,
"learning_rate": 1.7603550295857988e-07,
"loss": 1.2883,
"step": 634
},
{
"epoch": 1.6888297872340425,
"grad_norm": 1.6847845057440693,
"learning_rate": 1.7455621301775148e-07,
"loss": 1.4244,
"step": 635
},
{
"epoch": 1.6914893617021276,
"grad_norm": 1.5793904433648327,
"learning_rate": 1.7307692307692305e-07,
"loss": 1.4062,
"step": 636
},
{
"epoch": 1.6941489361702127,
"grad_norm": 1.4350293530642095,
"learning_rate": 1.7159763313609465e-07,
"loss": 1.2754,
"step": 637
},
{
"epoch": 1.6968085106382977,
"grad_norm": 1.902506858522582,
"learning_rate": 1.7011834319526628e-07,
"loss": 1.4541,
"step": 638
},
{
"epoch": 1.699468085106383,
"grad_norm": 1.478754263683889,
"learning_rate": 1.6863905325443786e-07,
"loss": 1.3463,
"step": 639
},
{
"epoch": 1.702127659574468,
"grad_norm": 1.6464724285737642,
"learning_rate": 1.6715976331360946e-07,
"loss": 1.3807,
"step": 640
},
{
"epoch": 1.7047872340425532,
"grad_norm": 1.6125752749357112,
"learning_rate": 1.6568047337278106e-07,
"loss": 1.2933,
"step": 641
},
{
"epoch": 1.7074468085106385,
"grad_norm": 1.5928623495071816,
"learning_rate": 1.6420118343195266e-07,
"loss": 1.4326,
"step": 642
},
{
"epoch": 1.7101063829787235,
"grad_norm": 1.5193190242572798,
"learning_rate": 1.6272189349112426e-07,
"loss": 1.3588,
"step": 643
},
{
"epoch": 1.7127659574468086,
"grad_norm": 1.5482920311769846,
"learning_rate": 1.6124260355029584e-07,
"loss": 1.3839,
"step": 644
},
{
"epoch": 1.7154255319148937,
"grad_norm": 1.8407335336806905,
"learning_rate": 1.5976331360946747e-07,
"loss": 1.3248,
"step": 645
},
{
"epoch": 1.7180851063829787,
"grad_norm": 1.6055785649743377,
"learning_rate": 1.5828402366863904e-07,
"loss": 1.3872,
"step": 646
},
{
"epoch": 1.7207446808510638,
"grad_norm": 1.6297496194969232,
"learning_rate": 1.5680473372781064e-07,
"loss": 1.389,
"step": 647
},
{
"epoch": 1.7234042553191489,
"grad_norm": 1.577321745146047,
"learning_rate": 1.5532544378698222e-07,
"loss": 1.1947,
"step": 648
},
{
"epoch": 1.726063829787234,
"grad_norm": 1.6447713137577962,
"learning_rate": 1.5384615384615385e-07,
"loss": 1.2652,
"step": 649
},
{
"epoch": 1.728723404255319,
"grad_norm": 1.6234194331407543,
"learning_rate": 1.5236686390532545e-07,
"loss": 1.4239,
"step": 650
},
{
"epoch": 1.7313829787234043,
"grad_norm": 1.532776130454777,
"learning_rate": 1.5088757396449702e-07,
"loss": 1.3875,
"step": 651
},
{
"epoch": 1.7340425531914894,
"grad_norm": 1.4837535962878305,
"learning_rate": 1.4940828402366865e-07,
"loss": 1.2059,
"step": 652
},
{
"epoch": 1.7367021276595744,
"grad_norm": 1.5395205053467318,
"learning_rate": 1.4792899408284022e-07,
"loss": 1.3513,
"step": 653
},
{
"epoch": 1.7393617021276597,
"grad_norm": 1.4112077844892696,
"learning_rate": 1.4644970414201183e-07,
"loss": 1.3336,
"step": 654
},
{
"epoch": 1.7420212765957448,
"grad_norm": 1.481010800777514,
"learning_rate": 1.4497041420118343e-07,
"loss": 1.4028,
"step": 655
},
{
"epoch": 1.7446808510638299,
"grad_norm": 1.4564408238676725,
"learning_rate": 1.4349112426035503e-07,
"loss": 1.3502,
"step": 656
},
{
"epoch": 1.747340425531915,
"grad_norm": 1.6956227102239596,
"learning_rate": 1.4201183431952663e-07,
"loss": 1.5672,
"step": 657
},
{
"epoch": 1.75,
"grad_norm": 1.5705454639314052,
"learning_rate": 1.405325443786982e-07,
"loss": 1.4109,
"step": 658
},
{
"epoch": 1.752659574468085,
"grad_norm": 1.5656622358755812,
"learning_rate": 1.390532544378698e-07,
"loss": 1.557,
"step": 659
},
{
"epoch": 1.7553191489361701,
"grad_norm": 1.8848625197729474,
"learning_rate": 1.3757396449704143e-07,
"loss": 1.4017,
"step": 660
},
{
"epoch": 1.7579787234042552,
"grad_norm": 1.4196764538431994,
"learning_rate": 1.36094674556213e-07,
"loss": 1.2331,
"step": 661
},
{
"epoch": 1.7606382978723403,
"grad_norm": 1.4675927168298655,
"learning_rate": 1.346153846153846e-07,
"loss": 1.4689,
"step": 662
},
{
"epoch": 1.7632978723404256,
"grad_norm": 1.6895719453339277,
"learning_rate": 1.331360946745562e-07,
"loss": 1.6055,
"step": 663
},
{
"epoch": 1.7659574468085106,
"grad_norm": 1.6565509018980442,
"learning_rate": 1.3165680473372781e-07,
"loss": 1.346,
"step": 664
},
{
"epoch": 1.7686170212765957,
"grad_norm": 1.6111421234975374,
"learning_rate": 1.301775147928994e-07,
"loss": 1.3318,
"step": 665
},
{
"epoch": 1.771276595744681,
"grad_norm": 1.5477525938145107,
"learning_rate": 1.28698224852071e-07,
"loss": 1.4311,
"step": 666
},
{
"epoch": 1.773936170212766,
"grad_norm": 1.4344548853484294,
"learning_rate": 1.2721893491124262e-07,
"loss": 1.4168,
"step": 667
},
{
"epoch": 1.7765957446808511,
"grad_norm": 2.002400150167084,
"learning_rate": 1.257396449704142e-07,
"loss": 1.5304,
"step": 668
},
{
"epoch": 1.7792553191489362,
"grad_norm": 1.6203137830914942,
"learning_rate": 1.242603550295858e-07,
"loss": 1.4902,
"step": 669
},
{
"epoch": 1.7819148936170213,
"grad_norm": 1.653101321305009,
"learning_rate": 1.227810650887574e-07,
"loss": 1.523,
"step": 670
},
{
"epoch": 1.7845744680851063,
"grad_norm": 1.4583067028702263,
"learning_rate": 1.2130177514792897e-07,
"loss": 1.3307,
"step": 671
},
{
"epoch": 1.7872340425531914,
"grad_norm": 1.4416958484378999,
"learning_rate": 1.198224852071006e-07,
"loss": 1.2879,
"step": 672
},
{
"epoch": 1.7898936170212765,
"grad_norm": 1.5342015216907867,
"learning_rate": 1.1834319526627217e-07,
"loss": 1.3491,
"step": 673
},
{
"epoch": 1.7925531914893615,
"grad_norm": 1.5120417917571398,
"learning_rate": 1.1686390532544378e-07,
"loss": 1.5533,
"step": 674
},
{
"epoch": 1.7952127659574468,
"grad_norm": 1.6448669091043147,
"learning_rate": 1.1538461538461539e-07,
"loss": 1.4507,
"step": 675
},
{
"epoch": 1.797872340425532,
"grad_norm": 1.5744246355313867,
"learning_rate": 1.1390532544378698e-07,
"loss": 1.4762,
"step": 676
},
{
"epoch": 1.800531914893617,
"grad_norm": 1.407351126310039,
"learning_rate": 1.1242603550295858e-07,
"loss": 1.2665,
"step": 677
},
{
"epoch": 1.8031914893617023,
"grad_norm": 1.4428356495487928,
"learning_rate": 1.1094674556213017e-07,
"loss": 1.4222,
"step": 678
},
{
"epoch": 1.8058510638297873,
"grad_norm": 1.4978022369408812,
"learning_rate": 1.0946745562130177e-07,
"loss": 1.3571,
"step": 679
},
{
"epoch": 1.8085106382978724,
"grad_norm": 1.608694580830846,
"learning_rate": 1.0798816568047336e-07,
"loss": 1.2468,
"step": 680
},
{
"epoch": 1.8111702127659575,
"grad_norm": 1.3671652219864612,
"learning_rate": 1.0650887573964497e-07,
"loss": 1.1918,
"step": 681
},
{
"epoch": 1.8138297872340425,
"grad_norm": 1.5436563625586248,
"learning_rate": 1.0502958579881656e-07,
"loss": 1.3447,
"step": 682
},
{
"epoch": 1.8164893617021276,
"grad_norm": 2.0668175329496448,
"learning_rate": 1.0355029585798816e-07,
"loss": 1.2851,
"step": 683
},
{
"epoch": 1.8191489361702127,
"grad_norm": 1.4711737418040087,
"learning_rate": 1.0207100591715975e-07,
"loss": 1.4054,
"step": 684
},
{
"epoch": 1.8218085106382977,
"grad_norm": 1.628475068104997,
"learning_rate": 1.0059171597633135e-07,
"loss": 1.2297,
"step": 685
},
{
"epoch": 1.824468085106383,
"grad_norm": 1.6652537635356375,
"learning_rate": 9.911242603550297e-08,
"loss": 1.4249,
"step": 686
},
{
"epoch": 1.827127659574468,
"grad_norm": 1.4549454801379844,
"learning_rate": 9.763313609467456e-08,
"loss": 1.4738,
"step": 687
},
{
"epoch": 1.8297872340425532,
"grad_norm": 1.4571125733944477,
"learning_rate": 9.615384615384616e-08,
"loss": 1.2531,
"step": 688
},
{
"epoch": 1.8324468085106385,
"grad_norm": 1.4934710030590315,
"learning_rate": 9.467455621301774e-08,
"loss": 1.4224,
"step": 689
},
{
"epoch": 1.8351063829787235,
"grad_norm": 1.5068998589001918,
"learning_rate": 9.319526627218935e-08,
"loss": 1.4137,
"step": 690
},
{
"epoch": 1.8377659574468086,
"grad_norm": 1.5592030646382606,
"learning_rate": 9.171597633136093e-08,
"loss": 1.4923,
"step": 691
},
{
"epoch": 1.8404255319148937,
"grad_norm": 1.5420672523438603,
"learning_rate": 9.023668639053255e-08,
"loss": 1.3542,
"step": 692
},
{
"epoch": 1.8430851063829787,
"grad_norm": 1.4933658760362354,
"learning_rate": 8.875739644970414e-08,
"loss": 1.4062,
"step": 693
},
{
"epoch": 1.8457446808510638,
"grad_norm": 2.1197107348039648,
"learning_rate": 8.727810650887574e-08,
"loss": 1.3514,
"step": 694
},
{
"epoch": 1.8484042553191489,
"grad_norm": 1.420310868366173,
"learning_rate": 8.579881656804733e-08,
"loss": 1.3865,
"step": 695
},
{
"epoch": 1.851063829787234,
"grad_norm": 2.1476526664851083,
"learning_rate": 8.431952662721893e-08,
"loss": 1.1886,
"step": 696
},
{
"epoch": 1.853723404255319,
"grad_norm": 1.3847908910859454,
"learning_rate": 8.284023668639053e-08,
"loss": 1.4107,
"step": 697
},
{
"epoch": 1.8563829787234043,
"grad_norm": 1.6527903429011437,
"learning_rate": 8.136094674556213e-08,
"loss": 1.2876,
"step": 698
},
{
"epoch": 1.8590425531914894,
"grad_norm": 1.5745014854949893,
"learning_rate": 7.988165680473373e-08,
"loss": 1.4558,
"step": 699
},
{
"epoch": 1.8617021276595744,
"grad_norm": 1.5350363492855568,
"learning_rate": 7.840236686390532e-08,
"loss": 1.4523,
"step": 700
},
{
"epoch": 1.8643617021276597,
"grad_norm": 1.4853786087332579,
"learning_rate": 7.692307692307692e-08,
"loss": 1.3292,
"step": 701
},
{
"epoch": 1.8670212765957448,
"grad_norm": 1.4473821719214552,
"learning_rate": 7.544378698224851e-08,
"loss": 1.2034,
"step": 702
},
{
"epoch": 1.8696808510638299,
"grad_norm": 1.4659266830367277,
"learning_rate": 7.396449704142011e-08,
"loss": 1.2584,
"step": 703
},
{
"epoch": 1.872340425531915,
"grad_norm": 1.4759466915583441,
"learning_rate": 7.248520710059171e-08,
"loss": 1.3187,
"step": 704
},
{
"epoch": 1.875,
"grad_norm": 2.111257320773056,
"learning_rate": 7.100591715976332e-08,
"loss": 1.3323,
"step": 705
},
{
"epoch": 1.877659574468085,
"grad_norm": 1.5831480252428458,
"learning_rate": 6.95266272189349e-08,
"loss": 1.4302,
"step": 706
},
{
"epoch": 1.8803191489361701,
"grad_norm": 1.6086043948043176,
"learning_rate": 6.80473372781065e-08,
"loss": 1.4017,
"step": 707
},
{
"epoch": 1.8829787234042552,
"grad_norm": 2.0849492736061332,
"learning_rate": 6.65680473372781e-08,
"loss": 1.5211,
"step": 708
},
{
"epoch": 1.8856382978723403,
"grad_norm": 1.5043217865201886,
"learning_rate": 6.50887573964497e-08,
"loss": 1.2166,
"step": 709
},
{
"epoch": 1.8882978723404256,
"grad_norm": 1.5612635488662876,
"learning_rate": 6.360946745562131e-08,
"loss": 1.3481,
"step": 710
},
{
"epoch": 1.8909574468085106,
"grad_norm": 1.4947402449036076,
"learning_rate": 6.21301775147929e-08,
"loss": 1.1988,
"step": 711
},
{
"epoch": 1.8936170212765957,
"grad_norm": 1.7123431001612024,
"learning_rate": 6.065088757396449e-08,
"loss": 1.6215,
"step": 712
},
{
"epoch": 1.896276595744681,
"grad_norm": 1.5722027689056413,
"learning_rate": 5.917159763313609e-08,
"loss": 1.435,
"step": 713
},
{
"epoch": 1.898936170212766,
"grad_norm": 1.5736347184744337,
"learning_rate": 5.7692307692307695e-08,
"loss": 1.4243,
"step": 714
},
{
"epoch": 1.9015957446808511,
"grad_norm": 1.4558857769282714,
"learning_rate": 5.621301775147929e-08,
"loss": 1.3732,
"step": 715
},
{
"epoch": 1.9042553191489362,
"grad_norm": 1.430432818475582,
"learning_rate": 5.4733727810650885e-08,
"loss": 1.2582,
"step": 716
},
{
"epoch": 1.9069148936170213,
"grad_norm": 1.4010572562597123,
"learning_rate": 5.3254437869822486e-08,
"loss": 1.2036,
"step": 717
},
{
"epoch": 1.9095744680851063,
"grad_norm": 1.5030183623430164,
"learning_rate": 5.177514792899408e-08,
"loss": 1.3348,
"step": 718
},
{
"epoch": 1.9122340425531914,
"grad_norm": 1.5264425521471463,
"learning_rate": 5.0295857988165676e-08,
"loss": 1.3486,
"step": 719
},
{
"epoch": 1.9148936170212765,
"grad_norm": 1.6568867880777098,
"learning_rate": 4.881656804733728e-08,
"loss": 1.5102,
"step": 720
},
{
"epoch": 1.9175531914893615,
"grad_norm": 1.426877705408139,
"learning_rate": 4.733727810650887e-08,
"loss": 1.2843,
"step": 721
},
{
"epoch": 1.9202127659574468,
"grad_norm": 1.5745176121540452,
"learning_rate": 4.585798816568047e-08,
"loss": 1.5892,
"step": 722
},
{
"epoch": 1.922872340425532,
"grad_norm": 1.5299374151207628,
"learning_rate": 4.437869822485207e-08,
"loss": 1.2833,
"step": 723
},
{
"epoch": 1.925531914893617,
"grad_norm": 1.5485924328569498,
"learning_rate": 4.2899408284023664e-08,
"loss": 1.2528,
"step": 724
},
{
"epoch": 1.9281914893617023,
"grad_norm": 1.5650812571579016,
"learning_rate": 4.1420118343195265e-08,
"loss": 1.4623,
"step": 725
},
{
"epoch": 1.9308510638297873,
"grad_norm": 1.4874887834654986,
"learning_rate": 3.9940828402366866e-08,
"loss": 1.331,
"step": 726
},
{
"epoch": 1.9335106382978724,
"grad_norm": 2.2386953559992606,
"learning_rate": 3.846153846153846e-08,
"loss": 1.3,
"step": 727
},
{
"epoch": 1.9361702127659575,
"grad_norm": 1.6479534866842882,
"learning_rate": 3.6982248520710056e-08,
"loss": 1.5275,
"step": 728
},
{
"epoch": 1.9388297872340425,
"grad_norm": 1.3421164941268597,
"learning_rate": 3.550295857988166e-08,
"loss": 1.2612,
"step": 729
},
{
"epoch": 1.9414893617021276,
"grad_norm": 1.4738714329564195,
"learning_rate": 3.402366863905325e-08,
"loss": 1.4314,
"step": 730
},
{
"epoch": 1.9441489361702127,
"grad_norm": 1.44156180682146,
"learning_rate": 3.254437869822485e-08,
"loss": 1.3338,
"step": 731
},
{
"epoch": 1.9468085106382977,
"grad_norm": 1.5784061294788538,
"learning_rate": 3.106508875739645e-08,
"loss": 1.3386,
"step": 732
},
{
"epoch": 1.949468085106383,
"grad_norm": 1.3371318363907538,
"learning_rate": 2.9585798816568044e-08,
"loss": 1.3028,
"step": 733
},
{
"epoch": 1.952127659574468,
"grad_norm": 1.6852290803170833,
"learning_rate": 2.8106508875739645e-08,
"loss": 1.4942,
"step": 734
},
{
"epoch": 1.9547872340425532,
"grad_norm": 1.4000270483265091,
"learning_rate": 2.6627218934911243e-08,
"loss": 1.2736,
"step": 735
},
{
"epoch": 1.9574468085106385,
"grad_norm": 1.5150814064740574,
"learning_rate": 2.5147928994082838e-08,
"loss": 1.3892,
"step": 736
},
{
"epoch": 1.9601063829787235,
"grad_norm": 1.6902820824629503,
"learning_rate": 2.3668639053254436e-08,
"loss": 1.4473,
"step": 737
},
{
"epoch": 1.9627659574468086,
"grad_norm": 1.5540380607077866,
"learning_rate": 2.2189349112426034e-08,
"loss": 1.4118,
"step": 738
},
{
"epoch": 1.9654255319148937,
"grad_norm": 1.7104646212150858,
"learning_rate": 2.0710059171597633e-08,
"loss": 1.3168,
"step": 739
},
{
"epoch": 1.9680851063829787,
"grad_norm": 2.2605324172049865,
"learning_rate": 1.923076923076923e-08,
"loss": 1.5395,
"step": 740
},
{
"epoch": 1.9707446808510638,
"grad_norm": 1.5974851484011308,
"learning_rate": 1.775147928994083e-08,
"loss": 1.6206,
"step": 741
},
{
"epoch": 1.9734042553191489,
"grad_norm": 1.5065611553522427,
"learning_rate": 1.6272189349112424e-08,
"loss": 1.282,
"step": 742
},
{
"epoch": 1.976063829787234,
"grad_norm": 1.5885436344655675,
"learning_rate": 1.4792899408284022e-08,
"loss": 1.3356,
"step": 743
},
{
"epoch": 1.978723404255319,
"grad_norm": 1.503074753014641,
"learning_rate": 1.3313609467455622e-08,
"loss": 1.4519,
"step": 744
},
{
"epoch": 1.9813829787234043,
"grad_norm": 1.4858939981761545,
"learning_rate": 1.1834319526627218e-08,
"loss": 1.4708,
"step": 745
},
{
"epoch": 1.9840425531914894,
"grad_norm": 1.5483339923710784,
"learning_rate": 1.0355029585798816e-08,
"loss": 1.4574,
"step": 746
},
{
"epoch": 1.9867021276595744,
"grad_norm": 1.4527735951794787,
"learning_rate": 8.875739644970414e-09,
"loss": 1.2918,
"step": 747
},
{
"epoch": 1.9893617021276597,
"grad_norm": 1.6044461968692099,
"learning_rate": 7.396449704142011e-09,
"loss": 1.4377,
"step": 748
},
{
"epoch": 1.9920212765957448,
"grad_norm": 1.530330381812826,
"learning_rate": 5.917159763313609e-09,
"loss": 1.4002,
"step": 749
},
{
"epoch": 1.9946808510638299,
"grad_norm": 1.5188564969623919,
"learning_rate": 4.437869822485207e-09,
"loss": 1.3378,
"step": 750
},
{
"epoch": 1.997340425531915,
"grad_norm": 1.470529168569605,
"learning_rate": 2.9585798816568045e-09,
"loss": 1.3217,
"step": 751
},
{
"epoch": 2.0,
"grad_norm": 1.4650169982184404,
"learning_rate": 1.4792899408284023e-09,
"loss": 1.3323,
"step": 752
}
],
"logging_steps": 1.0,
"max_steps": 752,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}