model_4kai894c / checkpoint-1311 /trainer_state.json
ugaoo's picture
Upload folder using huggingface_hub
63e9359 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.993907083015994,
"eval_steps": 500,
"global_step": 1311,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002284843869002285,
"grad_norm": 3.668196201324463,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.5687,
"step": 1
},
{
"epoch": 0.00456968773800457,
"grad_norm": 3.6277146339416504,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.5714,
"step": 2
},
{
"epoch": 0.006854531607006854,
"grad_norm": 3.813422918319702,
"learning_rate": 1.5000000000000002e-07,
"loss": 1.58,
"step": 3
},
{
"epoch": 0.00913937547600914,
"grad_norm": 3.4566409587860107,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.5604,
"step": 4
},
{
"epoch": 0.011424219345011425,
"grad_norm": 3.287661552429199,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.5425,
"step": 5
},
{
"epoch": 0.013709063214013708,
"grad_norm": 3.318340301513672,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.5477,
"step": 6
},
{
"epoch": 0.015993907083015995,
"grad_norm": 3.407221555709839,
"learning_rate": 3.5000000000000004e-07,
"loss": 1.5848,
"step": 7
},
{
"epoch": 0.01827875095201828,
"grad_norm": 3.732999563217163,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.5884,
"step": 8
},
{
"epoch": 0.020563594821020565,
"grad_norm": 3.532766580581665,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.5892,
"step": 9
},
{
"epoch": 0.02284843869002285,
"grad_norm": 3.5676348209381104,
"learning_rate": 5.000000000000001e-07,
"loss": 1.5619,
"step": 10
},
{
"epoch": 0.02513328255902513,
"grad_norm": 3.1015849113464355,
"learning_rate": 5.5e-07,
"loss": 1.5649,
"step": 11
},
{
"epoch": 0.027418126428027417,
"grad_norm": 3.163240909576416,
"learning_rate": 6.000000000000001e-07,
"loss": 1.5807,
"step": 12
},
{
"epoch": 0.0297029702970297,
"grad_norm": 2.894922971725464,
"learning_rate": 6.5e-07,
"loss": 1.5454,
"step": 13
},
{
"epoch": 0.03198781416603199,
"grad_norm": 2.8211843967437744,
"learning_rate": 7.000000000000001e-07,
"loss": 1.5801,
"step": 14
},
{
"epoch": 0.03427265803503427,
"grad_norm": 2.676609516143799,
"learning_rate": 7.5e-07,
"loss": 1.5446,
"step": 15
},
{
"epoch": 0.03655750190403656,
"grad_norm": 2.6186320781707764,
"learning_rate": 8.000000000000001e-07,
"loss": 1.5443,
"step": 16
},
{
"epoch": 0.03884234577303884,
"grad_norm": 2.460139513015747,
"learning_rate": 8.500000000000001e-07,
"loss": 1.5489,
"step": 17
},
{
"epoch": 0.04112718964204113,
"grad_norm": 2.368126630783081,
"learning_rate": 9.000000000000001e-07,
"loss": 1.5317,
"step": 18
},
{
"epoch": 0.04341203351104341,
"grad_norm": 2.244192123413086,
"learning_rate": 9.500000000000001e-07,
"loss": 1.4805,
"step": 19
},
{
"epoch": 0.0456968773800457,
"grad_norm": 2.242701292037964,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.5478,
"step": 20
},
{
"epoch": 0.04798172124904798,
"grad_norm": 2.13895583152771,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.5194,
"step": 21
},
{
"epoch": 0.05026656511805026,
"grad_norm": 2.0152103900909424,
"learning_rate": 1.1e-06,
"loss": 1.5067,
"step": 22
},
{
"epoch": 0.05255140898705255,
"grad_norm": 1.9156895875930786,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.5145,
"step": 23
},
{
"epoch": 0.05483625285605483,
"grad_norm": 1.7710504531860352,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.5147,
"step": 24
},
{
"epoch": 0.05712109672505712,
"grad_norm": 1.807431936264038,
"learning_rate": 1.25e-06,
"loss": 1.5357,
"step": 25
},
{
"epoch": 0.0594059405940594,
"grad_norm": 1.6638832092285156,
"learning_rate": 1.3e-06,
"loss": 1.489,
"step": 26
},
{
"epoch": 0.06169078446306169,
"grad_norm": 1.5708481073379517,
"learning_rate": 1.3500000000000002e-06,
"loss": 1.4768,
"step": 27
},
{
"epoch": 0.06397562833206398,
"grad_norm": 1.615577220916748,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.5159,
"step": 28
},
{
"epoch": 0.06626047220106626,
"grad_norm": 1.5125129222869873,
"learning_rate": 1.45e-06,
"loss": 1.4972,
"step": 29
},
{
"epoch": 0.06854531607006854,
"grad_norm": 1.479811668395996,
"learning_rate": 1.5e-06,
"loss": 1.4674,
"step": 30
},
{
"epoch": 0.07083015993907082,
"grad_norm": 1.4502017498016357,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.4811,
"step": 31
},
{
"epoch": 0.07311500380807312,
"grad_norm": 1.3617135286331177,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.4872,
"step": 32
},
{
"epoch": 0.0753998476770754,
"grad_norm": 1.367607831954956,
"learning_rate": 1.6500000000000003e-06,
"loss": 1.4699,
"step": 33
},
{
"epoch": 0.07768469154607768,
"grad_norm": 1.3374927043914795,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.4659,
"step": 34
},
{
"epoch": 0.07996953541507996,
"grad_norm": 1.354506254196167,
"learning_rate": 1.75e-06,
"loss": 1.4351,
"step": 35
},
{
"epoch": 0.08225437928408226,
"grad_norm": 1.2532024383544922,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.4358,
"step": 36
},
{
"epoch": 0.08453922315308454,
"grad_norm": 1.2684043645858765,
"learning_rate": 1.85e-06,
"loss": 1.4534,
"step": 37
},
{
"epoch": 0.08682406702208682,
"grad_norm": 1.2418140172958374,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.4624,
"step": 38
},
{
"epoch": 0.0891089108910891,
"grad_norm": 1.2266045808792114,
"learning_rate": 1.9500000000000004e-06,
"loss": 1.4282,
"step": 39
},
{
"epoch": 0.0913937547600914,
"grad_norm": 1.180330753326416,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4107,
"step": 40
},
{
"epoch": 0.09367859862909368,
"grad_norm": 1.1651424169540405,
"learning_rate": 2.05e-06,
"loss": 1.4041,
"step": 41
},
{
"epoch": 0.09596344249809596,
"grad_norm": 1.181652307510376,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.4558,
"step": 42
},
{
"epoch": 0.09824828636709824,
"grad_norm": 1.2221183776855469,
"learning_rate": 2.15e-06,
"loss": 1.4449,
"step": 43
},
{
"epoch": 0.10053313023610053,
"grad_norm": 1.085172414779663,
"learning_rate": 2.2e-06,
"loss": 1.4235,
"step": 44
},
{
"epoch": 0.10281797410510282,
"grad_norm": 1.0497649908065796,
"learning_rate": 2.25e-06,
"loss": 1.3891,
"step": 45
},
{
"epoch": 0.1051028179741051,
"grad_norm": 1.0502350330352783,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.4048,
"step": 46
},
{
"epoch": 0.10738766184310738,
"grad_norm": 1.0798920392990112,
"learning_rate": 2.35e-06,
"loss": 1.4383,
"step": 47
},
{
"epoch": 0.10967250571210967,
"grad_norm": 1.067581057548523,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.4128,
"step": 48
},
{
"epoch": 0.11195734958111196,
"grad_norm": 1.062606930732727,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.4438,
"step": 49
},
{
"epoch": 0.11424219345011424,
"grad_norm": 1.0157577991485596,
"learning_rate": 2.5e-06,
"loss": 1.4257,
"step": 50
},
{
"epoch": 0.11652703731911652,
"grad_norm": 1.0165379047393799,
"learning_rate": 2.55e-06,
"loss": 1.407,
"step": 51
},
{
"epoch": 0.1188118811881188,
"grad_norm": 1.0268282890319824,
"learning_rate": 2.6e-06,
"loss": 1.3942,
"step": 52
},
{
"epoch": 0.1210967250571211,
"grad_norm": 1.0133647918701172,
"learning_rate": 2.6500000000000005e-06,
"loss": 1.3737,
"step": 53
},
{
"epoch": 0.12338156892612338,
"grad_norm": 1.0097134113311768,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.3994,
"step": 54
},
{
"epoch": 0.12566641279512566,
"grad_norm": 1.1268850564956665,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.3676,
"step": 55
},
{
"epoch": 0.12795125666412796,
"grad_norm": 0.981015682220459,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.3819,
"step": 56
},
{
"epoch": 0.13023610053313023,
"grad_norm": 1.0456632375717163,
"learning_rate": 2.85e-06,
"loss": 1.4031,
"step": 57
},
{
"epoch": 0.13252094440213252,
"grad_norm": 1.0366231203079224,
"learning_rate": 2.9e-06,
"loss": 1.4017,
"step": 58
},
{
"epoch": 0.13480578827113482,
"grad_norm": 0.9980257749557495,
"learning_rate": 2.95e-06,
"loss": 1.4261,
"step": 59
},
{
"epoch": 0.1370906321401371,
"grad_norm": 0.990281879901886,
"learning_rate": 3e-06,
"loss": 1.3699,
"step": 60
},
{
"epoch": 0.13937547600913938,
"grad_norm": 1.0530250072479248,
"learning_rate": 3.05e-06,
"loss": 1.3656,
"step": 61
},
{
"epoch": 0.14166031987814165,
"grad_norm": 0.9878147840499878,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.3712,
"step": 62
},
{
"epoch": 0.14394516374714394,
"grad_norm": 0.9554497599601746,
"learning_rate": 3.1500000000000003e-06,
"loss": 1.3507,
"step": 63
},
{
"epoch": 0.14623000761614624,
"grad_norm": 1.0152994394302368,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.3531,
"step": 64
},
{
"epoch": 0.1485148514851485,
"grad_norm": 0.9816209077835083,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.3733,
"step": 65
},
{
"epoch": 0.1507996953541508,
"grad_norm": 1.014113187789917,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.3798,
"step": 66
},
{
"epoch": 0.15308453922315307,
"grad_norm": 1.005303978919983,
"learning_rate": 3.3500000000000005e-06,
"loss": 1.3877,
"step": 67
},
{
"epoch": 0.15536938309215537,
"grad_norm": 1.109976887702942,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.4184,
"step": 68
},
{
"epoch": 0.15765422696115766,
"grad_norm": 1.033060908317566,
"learning_rate": 3.45e-06,
"loss": 1.4043,
"step": 69
},
{
"epoch": 0.15993907083015993,
"grad_norm": 0.9719234108924866,
"learning_rate": 3.5e-06,
"loss": 1.3481,
"step": 70
},
{
"epoch": 0.16222391469916222,
"grad_norm": 1.0430618524551392,
"learning_rate": 3.5500000000000003e-06,
"loss": 1.3227,
"step": 71
},
{
"epoch": 0.16450875856816452,
"grad_norm": 1.0481953620910645,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.3174,
"step": 72
},
{
"epoch": 0.1667936024371668,
"grad_norm": 0.9868738055229187,
"learning_rate": 3.65e-06,
"loss": 1.356,
"step": 73
},
{
"epoch": 0.16907844630616908,
"grad_norm": 1.0015943050384521,
"learning_rate": 3.7e-06,
"loss": 1.3462,
"step": 74
},
{
"epoch": 0.17136329017517135,
"grad_norm": 1.0458308458328247,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3962,
"step": 75
},
{
"epoch": 0.17364813404417365,
"grad_norm": 1.0376830101013184,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.3523,
"step": 76
},
{
"epoch": 0.17593297791317594,
"grad_norm": 0.9821555018424988,
"learning_rate": 3.85e-06,
"loss": 1.3559,
"step": 77
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.9579638838768005,
"learning_rate": 3.900000000000001e-06,
"loss": 1.3073,
"step": 78
},
{
"epoch": 0.1805026656511805,
"grad_norm": 0.9736194014549255,
"learning_rate": 3.95e-06,
"loss": 1.3494,
"step": 79
},
{
"epoch": 0.1827875095201828,
"grad_norm": 1.0055922269821167,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3697,
"step": 80
},
{
"epoch": 0.18507235338918507,
"grad_norm": 0.9767876267433167,
"learning_rate": 4.05e-06,
"loss": 1.3225,
"step": 81
},
{
"epoch": 0.18735719725818736,
"grad_norm": 1.003092885017395,
"learning_rate": 4.1e-06,
"loss": 1.335,
"step": 82
},
{
"epoch": 0.18964204112718963,
"grad_norm": 0.9898741245269775,
"learning_rate": 4.15e-06,
"loss": 1.3103,
"step": 83
},
{
"epoch": 0.19192688499619193,
"grad_norm": 0.9903189539909363,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3741,
"step": 84
},
{
"epoch": 0.19421172886519422,
"grad_norm": 0.9661535620689392,
"learning_rate": 4.25e-06,
"loss": 1.3381,
"step": 85
},
{
"epoch": 0.1964965727341965,
"grad_norm": 0.9668599367141724,
"learning_rate": 4.3e-06,
"loss": 1.3511,
"step": 86
},
{
"epoch": 0.19878141660319879,
"grad_norm": 0.9633579254150391,
"learning_rate": 4.350000000000001e-06,
"loss": 1.3841,
"step": 87
},
{
"epoch": 0.20106626047220105,
"grad_norm": 0.9665766358375549,
"learning_rate": 4.4e-06,
"loss": 1.3211,
"step": 88
},
{
"epoch": 0.20335110434120335,
"grad_norm": 1.0263577699661255,
"learning_rate": 4.450000000000001e-06,
"loss": 1.3398,
"step": 89
},
{
"epoch": 0.20563594821020564,
"grad_norm": 1.0054337978363037,
"learning_rate": 4.5e-06,
"loss": 1.3598,
"step": 90
},
{
"epoch": 0.2079207920792079,
"grad_norm": 0.9768564701080322,
"learning_rate": 4.5500000000000005e-06,
"loss": 1.3386,
"step": 91
},
{
"epoch": 0.2102056359482102,
"grad_norm": 0.9710814356803894,
"learning_rate": 4.600000000000001e-06,
"loss": 1.306,
"step": 92
},
{
"epoch": 0.2124904798172125,
"grad_norm": 0.9943618774414062,
"learning_rate": 4.65e-06,
"loss": 1.3368,
"step": 93
},
{
"epoch": 0.21477532368621477,
"grad_norm": 1.0000272989273071,
"learning_rate": 4.7e-06,
"loss": 1.3561,
"step": 94
},
{
"epoch": 0.21706016755521707,
"grad_norm": 0.9748716950416565,
"learning_rate": 4.75e-06,
"loss": 1.3216,
"step": 95
},
{
"epoch": 0.21934501142421933,
"grad_norm": 0.977959930896759,
"learning_rate": 4.800000000000001e-06,
"loss": 1.3275,
"step": 96
},
{
"epoch": 0.22162985529322163,
"grad_norm": 0.9991240501403809,
"learning_rate": 4.85e-06,
"loss": 1.3143,
"step": 97
},
{
"epoch": 0.22391469916222392,
"grad_norm": 1.0590916872024536,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.3467,
"step": 98
},
{
"epoch": 0.2261995430312262,
"grad_norm": 0.9592604041099548,
"learning_rate": 4.95e-06,
"loss": 1.3568,
"step": 99
},
{
"epoch": 0.2284843869002285,
"grad_norm": 0.9900586605072021,
"learning_rate": 5e-06,
"loss": 1.3162,
"step": 100
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.9882398843765259,
"learning_rate": 4.999998060367119e-06,
"loss": 1.3348,
"step": 101
},
{
"epoch": 0.23305407463823305,
"grad_norm": 0.9522809982299805,
"learning_rate": 4.999992241471486e-06,
"loss": 1.3004,
"step": 102
},
{
"epoch": 0.23533891850723535,
"grad_norm": 0.9822378754615784,
"learning_rate": 4.9999825433221295e-06,
"loss": 1.3326,
"step": 103
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.9944847822189331,
"learning_rate": 4.999968965934098e-06,
"loss": 1.3429,
"step": 104
},
{
"epoch": 0.2399086062452399,
"grad_norm": 1.052456021308899,
"learning_rate": 4.9999515093284605e-06,
"loss": 1.3476,
"step": 105
},
{
"epoch": 0.2421934501142422,
"grad_norm": 0.9862610697746277,
"learning_rate": 4.999930173532304e-06,
"loss": 1.3638,
"step": 106
},
{
"epoch": 0.24447829398324447,
"grad_norm": 0.9718945622444153,
"learning_rate": 4.999904958578735e-06,
"loss": 1.3013,
"step": 107
},
{
"epoch": 0.24676313785224677,
"grad_norm": 0.9535952210426331,
"learning_rate": 4.9998758645068805e-06,
"loss": 1.3317,
"step": 108
},
{
"epoch": 0.24904798172124903,
"grad_norm": 1.1905543804168701,
"learning_rate": 4.999842891361885e-06,
"loss": 1.3325,
"step": 109
},
{
"epoch": 0.25133282559025133,
"grad_norm": 1.0306485891342163,
"learning_rate": 4.9998060391949145e-06,
"loss": 1.3198,
"step": 110
},
{
"epoch": 0.2536176694592536,
"grad_norm": 1.0334984064102173,
"learning_rate": 4.999765308063152e-06,
"loss": 1.3075,
"step": 111
},
{
"epoch": 0.2559025133282559,
"grad_norm": 1.0020740032196045,
"learning_rate": 4.9997206980298e-06,
"loss": 1.3324,
"step": 112
},
{
"epoch": 0.25818735719725816,
"grad_norm": 0.9771923422813416,
"learning_rate": 4.9996722091640805e-06,
"loss": 1.3072,
"step": 113
},
{
"epoch": 0.26047220106626046,
"grad_norm": 0.9955299496650696,
"learning_rate": 4.999619841541234e-06,
"loss": 1.3501,
"step": 114
},
{
"epoch": 0.26275704493526275,
"grad_norm": 1.0125700235366821,
"learning_rate": 4.9995635952425205e-06,
"loss": 1.3387,
"step": 115
},
{
"epoch": 0.26504188880426505,
"grad_norm": 1.005936622619629,
"learning_rate": 4.999503470355215e-06,
"loss": 1.342,
"step": 116
},
{
"epoch": 0.26732673267326734,
"grad_norm": 0.9978262782096863,
"learning_rate": 4.999439466972616e-06,
"loss": 1.2954,
"step": 117
},
{
"epoch": 0.26961157654226964,
"grad_norm": 0.9668537974357605,
"learning_rate": 4.999371585194039e-06,
"loss": 1.3318,
"step": 118
},
{
"epoch": 0.2718964204112719,
"grad_norm": 1.0156077146530151,
"learning_rate": 4.999299825124814e-06,
"loss": 1.2681,
"step": 119
},
{
"epoch": 0.2741812642802742,
"grad_norm": 0.99967360496521,
"learning_rate": 4.999224186876293e-06,
"loss": 1.2666,
"step": 120
},
{
"epoch": 0.27646610814927647,
"grad_norm": 1.0085562467575073,
"learning_rate": 4.999144670565842e-06,
"loss": 1.3261,
"step": 121
},
{
"epoch": 0.27875095201827876,
"grad_norm": 1.0338691473007202,
"learning_rate": 4.999061276316851e-06,
"loss": 1.2943,
"step": 122
},
{
"epoch": 0.28103579588728106,
"grad_norm": 0.9880859851837158,
"learning_rate": 4.99897400425872e-06,
"loss": 1.3035,
"step": 123
},
{
"epoch": 0.2833206397562833,
"grad_norm": 0.9832742810249329,
"learning_rate": 4.998882854526872e-06,
"loss": 1.3015,
"step": 124
},
{
"epoch": 0.2856054836252856,
"grad_norm": 0.976040780544281,
"learning_rate": 4.998787827262743e-06,
"loss": 1.3325,
"step": 125
},
{
"epoch": 0.2878903274942879,
"grad_norm": 1.0309007167816162,
"learning_rate": 4.998688922613788e-06,
"loss": 1.2998,
"step": 126
},
{
"epoch": 0.2901751713632902,
"grad_norm": 1.0828396081924438,
"learning_rate": 4.998586140733477e-06,
"loss": 1.3093,
"step": 127
},
{
"epoch": 0.2924600152322925,
"grad_norm": 0.9725452661514282,
"learning_rate": 4.998479481781299e-06,
"loss": 1.2811,
"step": 128
},
{
"epoch": 0.2947448591012947,
"grad_norm": 0.9891279339790344,
"learning_rate": 4.998368945922757e-06,
"loss": 1.3104,
"step": 129
},
{
"epoch": 0.297029702970297,
"grad_norm": 1.022490382194519,
"learning_rate": 4.998254533329369e-06,
"loss": 1.3425,
"step": 130
},
{
"epoch": 0.2993145468392993,
"grad_norm": 1.00505530834198,
"learning_rate": 4.99813624417867e-06,
"loss": 1.3494,
"step": 131
},
{
"epoch": 0.3015993907083016,
"grad_norm": 1.033308982849121,
"learning_rate": 4.998014078654211e-06,
"loss": 1.278,
"step": 132
},
{
"epoch": 0.3038842345773039,
"grad_norm": 1.0194460153579712,
"learning_rate": 4.997888036945556e-06,
"loss": 1.2963,
"step": 133
},
{
"epoch": 0.30616907844630614,
"grad_norm": 1.005299687385559,
"learning_rate": 4.997758119248286e-06,
"loss": 1.3187,
"step": 134
},
{
"epoch": 0.30845392231530844,
"grad_norm": 1.0271679162979126,
"learning_rate": 4.997624325763994e-06,
"loss": 1.3106,
"step": 135
},
{
"epoch": 0.31073876618431073,
"grad_norm": 1.0343165397644043,
"learning_rate": 4.997486656700289e-06,
"loss": 1.3355,
"step": 136
},
{
"epoch": 0.31302361005331303,
"grad_norm": 1.0498188734054565,
"learning_rate": 4.997345112270792e-06,
"loss": 1.3126,
"step": 137
},
{
"epoch": 0.3153084539223153,
"grad_norm": 0.9742498993873596,
"learning_rate": 4.997199692695138e-06,
"loss": 1.3006,
"step": 138
},
{
"epoch": 0.3175932977913176,
"grad_norm": 1.0044124126434326,
"learning_rate": 4.997050398198977e-06,
"loss": 1.3298,
"step": 139
},
{
"epoch": 0.31987814166031986,
"grad_norm": 1.0173184871673584,
"learning_rate": 4.99689722901397e-06,
"loss": 1.3286,
"step": 140
},
{
"epoch": 0.32216298552932215,
"grad_norm": 0.9835124611854553,
"learning_rate": 4.99674018537779e-06,
"loss": 1.2937,
"step": 141
},
{
"epoch": 0.32444782939832445,
"grad_norm": 1.0389831066131592,
"learning_rate": 4.996579267534122e-06,
"loss": 1.3077,
"step": 142
},
{
"epoch": 0.32673267326732675,
"grad_norm": 1.0412015914916992,
"learning_rate": 4.996414475732664e-06,
"loss": 1.3131,
"step": 143
},
{
"epoch": 0.32901751713632904,
"grad_norm": 1.0527534484863281,
"learning_rate": 4.9962458102291254e-06,
"loss": 1.3075,
"step": 144
},
{
"epoch": 0.3313023610053313,
"grad_norm": 1.036034345626831,
"learning_rate": 4.9960732712852236e-06,
"loss": 1.3198,
"step": 145
},
{
"epoch": 0.3335872048743336,
"grad_norm": 1.0121785402297974,
"learning_rate": 4.99589685916869e-06,
"loss": 1.3346,
"step": 146
},
{
"epoch": 0.33587204874333587,
"grad_norm": 1.0597130060195923,
"learning_rate": 4.9957165741532635e-06,
"loss": 1.3025,
"step": 147
},
{
"epoch": 0.33815689261233817,
"grad_norm": 1.0982815027236938,
"learning_rate": 4.995532416518693e-06,
"loss": 1.3177,
"step": 148
},
{
"epoch": 0.34044173648134046,
"grad_norm": 1.012061357498169,
"learning_rate": 4.995344386550738e-06,
"loss": 1.2905,
"step": 149
},
{
"epoch": 0.3427265803503427,
"grad_norm": 1.0748074054718018,
"learning_rate": 4.995152484541166e-06,
"loss": 1.3191,
"step": 150
},
{
"epoch": 0.345011424219345,
"grad_norm": 1.0346341133117676,
"learning_rate": 4.994956710787752e-06,
"loss": 1.2923,
"step": 151
},
{
"epoch": 0.3472962680883473,
"grad_norm": 1.0333645343780518,
"learning_rate": 4.99475706559428e-06,
"loss": 1.3272,
"step": 152
},
{
"epoch": 0.3495811119573496,
"grad_norm": 1.0411094427108765,
"learning_rate": 4.9945535492705385e-06,
"loss": 1.3102,
"step": 153
},
{
"epoch": 0.3518659558263519,
"grad_norm": 1.0394591093063354,
"learning_rate": 4.994346162132329e-06,
"loss": 1.2912,
"step": 154
},
{
"epoch": 0.3541507996953541,
"grad_norm": 1.1258337497711182,
"learning_rate": 4.994134904501452e-06,
"loss": 1.295,
"step": 155
},
{
"epoch": 0.3564356435643564,
"grad_norm": 1.0196075439453125,
"learning_rate": 4.993919776705718e-06,
"loss": 1.2935,
"step": 156
},
{
"epoch": 0.3587204874333587,
"grad_norm": 1.020180583000183,
"learning_rate": 4.993700779078943e-06,
"loss": 1.3118,
"step": 157
},
{
"epoch": 0.361005331302361,
"grad_norm": 1.1170531511306763,
"learning_rate": 4.993477911960948e-06,
"loss": 1.2924,
"step": 158
},
{
"epoch": 0.3632901751713633,
"grad_norm": 1.0637717247009277,
"learning_rate": 4.993251175697554e-06,
"loss": 1.2797,
"step": 159
},
{
"epoch": 0.3655750190403656,
"grad_norm": 1.046305775642395,
"learning_rate": 4.993020570640592e-06,
"loss": 1.3142,
"step": 160
},
{
"epoch": 0.36785986290936784,
"grad_norm": 1.039476752281189,
"learning_rate": 4.992786097147892e-06,
"loss": 1.2773,
"step": 161
},
{
"epoch": 0.37014470677837014,
"grad_norm": 1.0379183292388916,
"learning_rate": 4.992547755583288e-06,
"loss": 1.3057,
"step": 162
},
{
"epoch": 0.37242955064737243,
"grad_norm": 1.0063403844833374,
"learning_rate": 4.992305546316617e-06,
"loss": 1.3108,
"step": 163
},
{
"epoch": 0.3747143945163747,
"grad_norm": 1.0467029809951782,
"learning_rate": 4.992059469723716e-06,
"loss": 1.2675,
"step": 164
},
{
"epoch": 0.376999238385377,
"grad_norm": 0.9822115898132324,
"learning_rate": 4.991809526186424e-06,
"loss": 1.2987,
"step": 165
},
{
"epoch": 0.37928408225437926,
"grad_norm": 0.9957991242408752,
"learning_rate": 4.9915557160925795e-06,
"loss": 1.2927,
"step": 166
},
{
"epoch": 0.38156892612338156,
"grad_norm": 1.020486831665039,
"learning_rate": 4.991298039836021e-06,
"loss": 1.2891,
"step": 167
},
{
"epoch": 0.38385376999238385,
"grad_norm": 0.9941042065620422,
"learning_rate": 4.991036497816587e-06,
"loss": 1.3279,
"step": 168
},
{
"epoch": 0.38613861386138615,
"grad_norm": 1.030573844909668,
"learning_rate": 4.990771090440114e-06,
"loss": 1.2715,
"step": 169
},
{
"epoch": 0.38842345773038844,
"grad_norm": 0.9810742735862732,
"learning_rate": 4.990501818118436e-06,
"loss": 1.2808,
"step": 170
},
{
"epoch": 0.3907083015993907,
"grad_norm": 1.0300201177597046,
"learning_rate": 4.990228681269383e-06,
"loss": 1.3079,
"step": 171
},
{
"epoch": 0.392993145468393,
"grad_norm": 1.0107353925704956,
"learning_rate": 4.989951680316787e-06,
"loss": 1.2872,
"step": 172
},
{
"epoch": 0.3952779893373953,
"grad_norm": 1.0361515283584595,
"learning_rate": 4.989670815690469e-06,
"loss": 1.2784,
"step": 173
},
{
"epoch": 0.39756283320639757,
"grad_norm": 1.0452970266342163,
"learning_rate": 4.989386087826248e-06,
"loss": 1.2976,
"step": 174
},
{
"epoch": 0.39984767707539987,
"grad_norm": 1.0585196018218994,
"learning_rate": 4.9890974971659405e-06,
"loss": 1.2921,
"step": 175
},
{
"epoch": 0.4021325209444021,
"grad_norm": 1.018211007118225,
"learning_rate": 4.988805044157353e-06,
"loss": 1.3046,
"step": 176
},
{
"epoch": 0.4044173648134044,
"grad_norm": 1.0587507486343384,
"learning_rate": 4.9885087292542865e-06,
"loss": 1.2901,
"step": 177
},
{
"epoch": 0.4067022086824067,
"grad_norm": 1.0261503458023071,
"learning_rate": 4.988208552916535e-06,
"loss": 1.3081,
"step": 178
},
{
"epoch": 0.408987052551409,
"grad_norm": 1.0412943363189697,
"learning_rate": 4.9879045156098846e-06,
"loss": 1.3052,
"step": 179
},
{
"epoch": 0.4112718964204113,
"grad_norm": 1.0323666334152222,
"learning_rate": 4.987596617806111e-06,
"loss": 1.3048,
"step": 180
},
{
"epoch": 0.4135567402894136,
"grad_norm": 1.0095067024230957,
"learning_rate": 4.9872848599829825e-06,
"loss": 1.3292,
"step": 181
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.9761002659797668,
"learning_rate": 4.986969242624254e-06,
"loss": 1.2884,
"step": 182
},
{
"epoch": 0.4181264280274181,
"grad_norm": 1.0436338186264038,
"learning_rate": 4.986649766219671e-06,
"loss": 1.3211,
"step": 183
},
{
"epoch": 0.4204112718964204,
"grad_norm": 1.0505225658416748,
"learning_rate": 4.986326431264969e-06,
"loss": 1.2863,
"step": 184
},
{
"epoch": 0.4226961157654227,
"grad_norm": 1.006611943244934,
"learning_rate": 4.985999238261867e-06,
"loss": 1.2812,
"step": 185
},
{
"epoch": 0.424980959634425,
"grad_norm": 1.0494719743728638,
"learning_rate": 4.985668187718073e-06,
"loss": 1.3105,
"step": 186
},
{
"epoch": 0.42726580350342724,
"grad_norm": 0.9847164750099182,
"learning_rate": 4.985333280147281e-06,
"loss": 1.2811,
"step": 187
},
{
"epoch": 0.42955064737242954,
"grad_norm": 1.0337165594100952,
"learning_rate": 4.984994516069168e-06,
"loss": 1.2876,
"step": 188
},
{
"epoch": 0.43183549124143183,
"grad_norm": 1.0178074836730957,
"learning_rate": 4.984651896009396e-06,
"loss": 1.2597,
"step": 189
},
{
"epoch": 0.43412033511043413,
"grad_norm": 1.0170668363571167,
"learning_rate": 4.984305420499612e-06,
"loss": 1.2916,
"step": 190
},
{
"epoch": 0.4364051789794364,
"grad_norm": 1.0148853063583374,
"learning_rate": 4.983955090077445e-06,
"loss": 1.2785,
"step": 191
},
{
"epoch": 0.43869002284843867,
"grad_norm": 1.0563602447509766,
"learning_rate": 4.983600905286502e-06,
"loss": 1.295,
"step": 192
},
{
"epoch": 0.44097486671744096,
"grad_norm": 0.9817858338356018,
"learning_rate": 4.983242866676376e-06,
"loss": 1.2832,
"step": 193
},
{
"epoch": 0.44325971058644326,
"grad_norm": 1.0299488306045532,
"learning_rate": 4.982880974802638e-06,
"loss": 1.2952,
"step": 194
},
{
"epoch": 0.44554455445544555,
"grad_norm": 0.9951279163360596,
"learning_rate": 4.982515230226837e-06,
"loss": 1.2901,
"step": 195
},
{
"epoch": 0.44782939832444785,
"grad_norm": 1.0001885890960693,
"learning_rate": 4.982145633516501e-06,
"loss": 1.2554,
"step": 196
},
{
"epoch": 0.4501142421934501,
"grad_norm": 1.0821017026901245,
"learning_rate": 4.981772185245135e-06,
"loss": 1.2903,
"step": 197
},
{
"epoch": 0.4523990860624524,
"grad_norm": 1.0269831418991089,
"learning_rate": 4.981394885992223e-06,
"loss": 1.3077,
"step": 198
},
{
"epoch": 0.4546839299314547,
"grad_norm": 1.025965929031372,
"learning_rate": 4.981013736343221e-06,
"loss": 1.2771,
"step": 199
},
{
"epoch": 0.456968773800457,
"grad_norm": 0.9828860759735107,
"learning_rate": 4.980628736889562e-06,
"loss": 1.2788,
"step": 200
},
{
"epoch": 0.45925361766945927,
"grad_norm": 1.077913761138916,
"learning_rate": 4.9802398882286515e-06,
"loss": 1.2815,
"step": 201
},
{
"epoch": 0.46153846153846156,
"grad_norm": 1.1024688482284546,
"learning_rate": 4.97984719096387e-06,
"loss": 1.3135,
"step": 202
},
{
"epoch": 0.4638233054074638,
"grad_norm": 1.0494202375411987,
"learning_rate": 4.979450645704567e-06,
"loss": 1.3027,
"step": 203
},
{
"epoch": 0.4661081492764661,
"grad_norm": 1.0050199031829834,
"learning_rate": 4.979050253066064e-06,
"loss": 1.3016,
"step": 204
},
{
"epoch": 0.4683929931454684,
"grad_norm": 1.0264744758605957,
"learning_rate": 4.978646013669652e-06,
"loss": 1.343,
"step": 205
},
{
"epoch": 0.4706778370144707,
"grad_norm": 1.001989722251892,
"learning_rate": 4.978237928142594e-06,
"loss": 1.3088,
"step": 206
},
{
"epoch": 0.472962680883473,
"grad_norm": 1.0501984357833862,
"learning_rate": 4.977825997118119e-06,
"loss": 1.2875,
"step": 207
},
{
"epoch": 0.4752475247524752,
"grad_norm": 1.0487364530563354,
"learning_rate": 4.977410221235421e-06,
"loss": 1.2917,
"step": 208
},
{
"epoch": 0.4775323686214775,
"grad_norm": 1.0768541097640991,
"learning_rate": 4.976990601139662e-06,
"loss": 1.3,
"step": 209
},
{
"epoch": 0.4798172124904798,
"grad_norm": 0.9696170687675476,
"learning_rate": 4.9765671374819715e-06,
"loss": 1.2822,
"step": 210
},
{
"epoch": 0.4821020563594821,
"grad_norm": 0.9987464547157288,
"learning_rate": 4.9761398309194385e-06,
"loss": 1.3076,
"step": 211
},
{
"epoch": 0.4843869002284844,
"grad_norm": 1.0254422426223755,
"learning_rate": 4.975708682115118e-06,
"loss": 1.281,
"step": 212
},
{
"epoch": 0.48667174409748665,
"grad_norm": 1.0040076971054077,
"learning_rate": 4.9752736917380274e-06,
"loss": 1.2821,
"step": 213
},
{
"epoch": 0.48895658796648894,
"grad_norm": 1.004184365272522,
"learning_rate": 4.9748348604631416e-06,
"loss": 1.2641,
"step": 214
},
{
"epoch": 0.49124143183549124,
"grad_norm": 1.0694694519042969,
"learning_rate": 4.9743921889714005e-06,
"loss": 1.2853,
"step": 215
},
{
"epoch": 0.49352627570449353,
"grad_norm": 1.0564874410629272,
"learning_rate": 4.973945677949699e-06,
"loss": 1.2882,
"step": 216
},
{
"epoch": 0.49581111957349583,
"grad_norm": 1.0076894760131836,
"learning_rate": 4.973495328090891e-06,
"loss": 1.2868,
"step": 217
},
{
"epoch": 0.49809596344249807,
"grad_norm": 1.0476043224334717,
"learning_rate": 4.973041140093786e-06,
"loss": 1.2642,
"step": 218
},
{
"epoch": 0.5003808073115004,
"grad_norm": 1.050991415977478,
"learning_rate": 4.972583114663153e-06,
"loss": 1.2751,
"step": 219
},
{
"epoch": 0.5026656511805027,
"grad_norm": 0.9902971386909485,
"learning_rate": 4.972121252509712e-06,
"loss": 1.2685,
"step": 220
},
{
"epoch": 0.504950495049505,
"grad_norm": 1.0011303424835205,
"learning_rate": 4.971655554350137e-06,
"loss": 1.2829,
"step": 221
},
{
"epoch": 0.5072353389185073,
"grad_norm": 1.010233998298645,
"learning_rate": 4.971186020907054e-06,
"loss": 1.277,
"step": 222
},
{
"epoch": 0.5095201827875095,
"grad_norm": 1.0275652408599854,
"learning_rate": 4.970712652909042e-06,
"loss": 1.2971,
"step": 223
},
{
"epoch": 0.5118050266565118,
"grad_norm": 1.0285537242889404,
"learning_rate": 4.970235451090629e-06,
"loss": 1.231,
"step": 224
},
{
"epoch": 0.5140898705255141,
"grad_norm": 1.0604579448699951,
"learning_rate": 4.969754416192292e-06,
"loss": 1.269,
"step": 225
},
{
"epoch": 0.5163747143945163,
"grad_norm": 1.0375958681106567,
"learning_rate": 4.969269548960456e-06,
"loss": 1.2712,
"step": 226
},
{
"epoch": 0.5186595582635186,
"grad_norm": 1.037304401397705,
"learning_rate": 4.9687808501474925e-06,
"loss": 1.2826,
"step": 227
},
{
"epoch": 0.5209444021325209,
"grad_norm": 1.0280749797821045,
"learning_rate": 4.968288320511718e-06,
"loss": 1.2726,
"step": 228
},
{
"epoch": 0.5232292460015232,
"grad_norm": 1.0595530271530151,
"learning_rate": 4.967791960817395e-06,
"loss": 1.281,
"step": 229
},
{
"epoch": 0.5255140898705255,
"grad_norm": 0.9964226484298706,
"learning_rate": 4.967291771834727e-06,
"loss": 1.3188,
"step": 230
},
{
"epoch": 0.5277989337395278,
"grad_norm": 1.0433804988861084,
"learning_rate": 4.966787754339861e-06,
"loss": 1.274,
"step": 231
},
{
"epoch": 0.5300837776085301,
"grad_norm": 1.079641580581665,
"learning_rate": 4.966279909114883e-06,
"loss": 1.2991,
"step": 232
},
{
"epoch": 0.5323686214775324,
"grad_norm": 1.0351816415786743,
"learning_rate": 4.965768236947821e-06,
"loss": 1.2659,
"step": 233
},
{
"epoch": 0.5346534653465347,
"grad_norm": 1.0495244264602661,
"learning_rate": 4.96525273863264e-06,
"loss": 1.2898,
"step": 234
},
{
"epoch": 0.536938309215537,
"grad_norm": 1.0479910373687744,
"learning_rate": 4.964733414969241e-06,
"loss": 1.2536,
"step": 235
},
{
"epoch": 0.5392231530845393,
"grad_norm": 1.0365879535675049,
"learning_rate": 4.964210266763461e-06,
"loss": 1.2369,
"step": 236
},
{
"epoch": 0.5415079969535415,
"grad_norm": 1.0398730039596558,
"learning_rate": 4.9636832948270745e-06,
"loss": 1.2669,
"step": 237
},
{
"epoch": 0.5437928408225438,
"grad_norm": 1.0146657228469849,
"learning_rate": 4.963152499977786e-06,
"loss": 1.2893,
"step": 238
},
{
"epoch": 0.546077684691546,
"grad_norm": 1.0974043607711792,
"learning_rate": 4.962617883039233e-06,
"loss": 1.2452,
"step": 239
},
{
"epoch": 0.5483625285605483,
"grad_norm": 0.9900649189949036,
"learning_rate": 4.962079444840985e-06,
"loss": 1.2215,
"step": 240
},
{
"epoch": 0.5506473724295506,
"grad_norm": 1.003464937210083,
"learning_rate": 4.9615371862185394e-06,
"loss": 1.2744,
"step": 241
},
{
"epoch": 0.5529322162985529,
"grad_norm": 1.004382848739624,
"learning_rate": 4.960991108013322e-06,
"loss": 1.271,
"step": 242
},
{
"epoch": 0.5552170601675552,
"grad_norm": 1.0129280090332031,
"learning_rate": 4.960441211072686e-06,
"loss": 1.2874,
"step": 243
},
{
"epoch": 0.5575019040365575,
"grad_norm": 1.040189266204834,
"learning_rate": 4.9598874962499096e-06,
"loss": 1.2918,
"step": 244
},
{
"epoch": 0.5597867479055598,
"grad_norm": 1.0145982503890991,
"learning_rate": 4.959329964404197e-06,
"loss": 1.2713,
"step": 245
},
{
"epoch": 0.5620715917745621,
"grad_norm": 1.0469987392425537,
"learning_rate": 4.958768616400672e-06,
"loss": 1.2689,
"step": 246
},
{
"epoch": 0.5643564356435643,
"grad_norm": 1.0191642045974731,
"learning_rate": 4.958203453110384e-06,
"loss": 1.2718,
"step": 247
},
{
"epoch": 0.5666412795125666,
"grad_norm": 1.0718231201171875,
"learning_rate": 4.957634475410298e-06,
"loss": 1.3128,
"step": 248
},
{
"epoch": 0.5689261233815689,
"grad_norm": 1.0109634399414062,
"learning_rate": 4.957061684183301e-06,
"loss": 1.2586,
"step": 249
},
{
"epoch": 0.5712109672505712,
"grad_norm": 0.9942657947540283,
"learning_rate": 4.956485080318198e-06,
"loss": 1.328,
"step": 250
},
{
"epoch": 0.5734958111195735,
"grad_norm": 1.0184757709503174,
"learning_rate": 4.955904664709707e-06,
"loss": 1.2815,
"step": 251
},
{
"epoch": 0.5757806549885758,
"grad_norm": 1.015625,
"learning_rate": 4.955320438258465e-06,
"loss": 1.2585,
"step": 252
},
{
"epoch": 0.5780654988575781,
"grad_norm": 0.9848981499671936,
"learning_rate": 4.954732401871018e-06,
"loss": 1.2866,
"step": 253
},
{
"epoch": 0.5803503427265804,
"grad_norm": 1.0482749938964844,
"learning_rate": 4.954140556459826e-06,
"loss": 1.2732,
"step": 254
},
{
"epoch": 0.5826351865955827,
"grad_norm": 1.0250680446624756,
"learning_rate": 4.95354490294326e-06,
"loss": 1.3053,
"step": 255
},
{
"epoch": 0.584920030464585,
"grad_norm": 1.0545597076416016,
"learning_rate": 4.952945442245598e-06,
"loss": 1.2638,
"step": 256
},
{
"epoch": 0.5872048743335873,
"grad_norm": 1.044873833656311,
"learning_rate": 4.952342175297028e-06,
"loss": 1.2683,
"step": 257
},
{
"epoch": 0.5894897182025894,
"grad_norm": 1.0361744165420532,
"learning_rate": 4.951735103033644e-06,
"loss": 1.2887,
"step": 258
},
{
"epoch": 0.5917745620715917,
"grad_norm": 1.0238685607910156,
"learning_rate": 4.951124226397441e-06,
"loss": 1.2736,
"step": 259
},
{
"epoch": 0.594059405940594,
"grad_norm": 1.0217833518981934,
"learning_rate": 4.950509546336323e-06,
"loss": 1.2681,
"step": 260
},
{
"epoch": 0.5963442498095963,
"grad_norm": 1.0546188354492188,
"learning_rate": 4.949891063804091e-06,
"loss": 1.2582,
"step": 261
},
{
"epoch": 0.5986290936785986,
"grad_norm": 1.0834907293319702,
"learning_rate": 4.94926877976045e-06,
"loss": 1.2487,
"step": 262
},
{
"epoch": 0.6009139375476009,
"grad_norm": 1.062184453010559,
"learning_rate": 4.948642695171e-06,
"loss": 1.3188,
"step": 263
},
{
"epoch": 0.6031987814166032,
"grad_norm": 1.0373252630233765,
"learning_rate": 4.948012811007242e-06,
"loss": 1.277,
"step": 264
},
{
"epoch": 0.6054836252856055,
"grad_norm": 1.0140316486358643,
"learning_rate": 4.947379128246571e-06,
"loss": 1.2617,
"step": 265
},
{
"epoch": 0.6077684691546078,
"grad_norm": 1.054410696029663,
"learning_rate": 4.946741647872277e-06,
"loss": 1.238,
"step": 266
},
{
"epoch": 0.6100533130236101,
"grad_norm": 1.0967663526535034,
"learning_rate": 4.94610037087354e-06,
"loss": 1.2682,
"step": 267
},
{
"epoch": 0.6123381568926123,
"grad_norm": 1.043338656425476,
"learning_rate": 4.945455298245436e-06,
"loss": 1.2572,
"step": 268
},
{
"epoch": 0.6146230007616146,
"grad_norm": 1.0187970399856567,
"learning_rate": 4.944806430988927e-06,
"loss": 1.2613,
"step": 269
},
{
"epoch": 0.6169078446306169,
"grad_norm": 1.0666472911834717,
"learning_rate": 4.9441537701108654e-06,
"loss": 1.2611,
"step": 270
},
{
"epoch": 0.6191926884996192,
"grad_norm": 1.0025635957717896,
"learning_rate": 4.943497316623988e-06,
"loss": 1.2519,
"step": 271
},
{
"epoch": 0.6214775323686215,
"grad_norm": 1.0135135650634766,
"learning_rate": 4.942837071546919e-06,
"loss": 1.2759,
"step": 272
},
{
"epoch": 0.6237623762376238,
"grad_norm": 0.9985151886940002,
"learning_rate": 4.942173035904164e-06,
"loss": 1.2844,
"step": 273
},
{
"epoch": 0.6260472201066261,
"grad_norm": 0.9952817559242249,
"learning_rate": 4.941505210726112e-06,
"loss": 1.2356,
"step": 274
},
{
"epoch": 0.6283320639756284,
"grad_norm": 1.0448962450027466,
"learning_rate": 4.9408335970490305e-06,
"loss": 1.2587,
"step": 275
},
{
"epoch": 0.6306169078446306,
"grad_norm": 1.011099100112915,
"learning_rate": 4.940158195915067e-06,
"loss": 1.2729,
"step": 276
},
{
"epoch": 0.6329017517136329,
"grad_norm": 1.052904725074768,
"learning_rate": 4.939479008372247e-06,
"loss": 1.2536,
"step": 277
},
{
"epoch": 0.6351865955826352,
"grad_norm": 1.058173418045044,
"learning_rate": 4.938796035474469e-06,
"loss": 1.2807,
"step": 278
},
{
"epoch": 0.6374714394516374,
"grad_norm": 1.022147536277771,
"learning_rate": 4.938109278281506e-06,
"loss": 1.2887,
"step": 279
},
{
"epoch": 0.6397562833206397,
"grad_norm": 1.0064011812210083,
"learning_rate": 4.937418737859004e-06,
"loss": 1.2192,
"step": 280
},
{
"epoch": 0.642041127189642,
"grad_norm": 1.0092360973358154,
"learning_rate": 4.936724415278479e-06,
"loss": 1.3159,
"step": 281
},
{
"epoch": 0.6443259710586443,
"grad_norm": 1.076401710510254,
"learning_rate": 4.936026311617316e-06,
"loss": 1.2872,
"step": 282
},
{
"epoch": 0.6466108149276466,
"grad_norm": 1.057209849357605,
"learning_rate": 4.935324427958766e-06,
"loss": 1.257,
"step": 283
},
{
"epoch": 0.6488956587966489,
"grad_norm": 1.1738762855529785,
"learning_rate": 4.934618765391946e-06,
"loss": 1.2547,
"step": 284
},
{
"epoch": 0.6511805026656512,
"grad_norm": 1.0405137538909912,
"learning_rate": 4.933909325011838e-06,
"loss": 1.2766,
"step": 285
},
{
"epoch": 0.6534653465346535,
"grad_norm": 1.0377894639968872,
"learning_rate": 4.933196107919286e-06,
"loss": 1.2624,
"step": 286
},
{
"epoch": 0.6557501904036558,
"grad_norm": 1.032714605331421,
"learning_rate": 4.932479115220991e-06,
"loss": 1.2527,
"step": 287
},
{
"epoch": 0.6580350342726581,
"grad_norm": 1.0755581855773926,
"learning_rate": 4.9317583480295175e-06,
"loss": 1.2966,
"step": 288
},
{
"epoch": 0.6603198781416603,
"grad_norm": 1.0262556076049805,
"learning_rate": 4.931033807463283e-06,
"loss": 1.2585,
"step": 289
},
{
"epoch": 0.6626047220106626,
"grad_norm": 1.0510430335998535,
"learning_rate": 4.930305494646562e-06,
"loss": 1.2662,
"step": 290
},
{
"epoch": 0.6648895658796649,
"grad_norm": 1.035854458808899,
"learning_rate": 4.9295734107094825e-06,
"loss": 1.2346,
"step": 291
},
{
"epoch": 0.6671744097486672,
"grad_norm": 1.0485846996307373,
"learning_rate": 4.928837556788023e-06,
"loss": 1.2978,
"step": 292
},
{
"epoch": 0.6694592536176694,
"grad_norm": 1.02550208568573,
"learning_rate": 4.928097934024013e-06,
"loss": 1.2478,
"step": 293
},
{
"epoch": 0.6717440974866717,
"grad_norm": 1.0328837633132935,
"learning_rate": 4.927354543565131e-06,
"loss": 1.2788,
"step": 294
},
{
"epoch": 0.674028941355674,
"grad_norm": 0.9913997054100037,
"learning_rate": 4.926607386564898e-06,
"loss": 1.2423,
"step": 295
},
{
"epoch": 0.6763137852246763,
"grad_norm": 1.0034306049346924,
"learning_rate": 4.925856464182685e-06,
"loss": 1.2562,
"step": 296
},
{
"epoch": 0.6785986290936786,
"grad_norm": 1.0546495914459229,
"learning_rate": 4.925101777583701e-06,
"loss": 1.2598,
"step": 297
},
{
"epoch": 0.6808834729626809,
"grad_norm": 1.0412935018539429,
"learning_rate": 4.924343327938999e-06,
"loss": 1.2744,
"step": 298
},
{
"epoch": 0.6831683168316832,
"grad_norm": 1.0731669664382935,
"learning_rate": 4.923581116425471e-06,
"loss": 1.2912,
"step": 299
},
{
"epoch": 0.6854531607006854,
"grad_norm": 1.0394880771636963,
"learning_rate": 4.922815144225843e-06,
"loss": 1.276,
"step": 300
},
{
"epoch": 0.6877380045696877,
"grad_norm": 1.0383579730987549,
"learning_rate": 4.92204541252868e-06,
"loss": 1.255,
"step": 301
},
{
"epoch": 0.69002284843869,
"grad_norm": 1.0251744985580444,
"learning_rate": 4.92127192252838e-06,
"loss": 1.2688,
"step": 302
},
{
"epoch": 0.6923076923076923,
"grad_norm": 1.017650842666626,
"learning_rate": 4.9204946754251724e-06,
"loss": 1.2818,
"step": 303
},
{
"epoch": 0.6945925361766946,
"grad_norm": 1.0219080448150635,
"learning_rate": 4.919713672425116e-06,
"loss": 1.2828,
"step": 304
},
{
"epoch": 0.6968773800456969,
"grad_norm": 1.0862151384353638,
"learning_rate": 4.918928914740098e-06,
"loss": 1.2514,
"step": 305
},
{
"epoch": 0.6991622239146992,
"grad_norm": 1.0639281272888184,
"learning_rate": 4.918140403587831e-06,
"loss": 1.2739,
"step": 306
},
{
"epoch": 0.7014470677837015,
"grad_norm": 1.0512444972991943,
"learning_rate": 4.9173481401918556e-06,
"loss": 1.2576,
"step": 307
},
{
"epoch": 0.7037319116527038,
"grad_norm": 1.0291866064071655,
"learning_rate": 4.916552125781529e-06,
"loss": 1.2934,
"step": 308
},
{
"epoch": 0.7060167555217061,
"grad_norm": 1.0338629484176636,
"learning_rate": 4.915752361592032e-06,
"loss": 1.263,
"step": 309
},
{
"epoch": 0.7083015993907082,
"grad_norm": 1.0358542203903198,
"learning_rate": 4.914948848864365e-06,
"loss": 1.2453,
"step": 310
},
{
"epoch": 0.7105864432597105,
"grad_norm": 1.1184923648834229,
"learning_rate": 4.914141588845344e-06,
"loss": 1.2653,
"step": 311
},
{
"epoch": 0.7128712871287128,
"grad_norm": 1.0791000127792358,
"learning_rate": 4.913330582787598e-06,
"loss": 1.2659,
"step": 312
},
{
"epoch": 0.7151561309977151,
"grad_norm": 1.0901819467544556,
"learning_rate": 4.912515831949571e-06,
"loss": 1.2208,
"step": 313
},
{
"epoch": 0.7174409748667174,
"grad_norm": 1.0219902992248535,
"learning_rate": 4.9116973375955166e-06,
"loss": 1.2711,
"step": 314
},
{
"epoch": 0.7197258187357197,
"grad_norm": 1.014364242553711,
"learning_rate": 4.910875100995499e-06,
"loss": 1.2877,
"step": 315
},
{
"epoch": 0.722010662604722,
"grad_norm": 1.0699234008789062,
"learning_rate": 4.910049123425386e-06,
"loss": 1.2425,
"step": 316
},
{
"epoch": 0.7242955064737243,
"grad_norm": 1.0614267587661743,
"learning_rate": 4.9092194061668535e-06,
"loss": 1.2475,
"step": 317
},
{
"epoch": 0.7265803503427266,
"grad_norm": 1.0620336532592773,
"learning_rate": 4.908385950507378e-06,
"loss": 1.2618,
"step": 318
},
{
"epoch": 0.7288651942117289,
"grad_norm": 1.0389032363891602,
"learning_rate": 4.90754875774024e-06,
"loss": 1.2742,
"step": 319
},
{
"epoch": 0.7311500380807312,
"grad_norm": 0.9754124879837036,
"learning_rate": 4.9067078291645144e-06,
"loss": 1.25,
"step": 320
},
{
"epoch": 0.7334348819497334,
"grad_norm": 1.056058406829834,
"learning_rate": 4.905863166085076e-06,
"loss": 1.2451,
"step": 321
},
{
"epoch": 0.7357197258187357,
"grad_norm": 1.0641580820083618,
"learning_rate": 4.9050147698125944e-06,
"loss": 1.2532,
"step": 322
},
{
"epoch": 0.738004569687738,
"grad_norm": 1.0407251119613647,
"learning_rate": 4.904162641663532e-06,
"loss": 1.3103,
"step": 323
},
{
"epoch": 0.7402894135567403,
"grad_norm": 1.0477187633514404,
"learning_rate": 4.9033067829601385e-06,
"loss": 1.2658,
"step": 324
},
{
"epoch": 0.7425742574257426,
"grad_norm": 1.0202401876449585,
"learning_rate": 4.902447195030459e-06,
"loss": 1.2569,
"step": 325
},
{
"epoch": 0.7448591012947449,
"grad_norm": 1.0629253387451172,
"learning_rate": 4.9015838792083196e-06,
"loss": 1.247,
"step": 326
},
{
"epoch": 0.7471439451637472,
"grad_norm": 1.0284748077392578,
"learning_rate": 4.900716836833333e-06,
"loss": 1.2659,
"step": 327
},
{
"epoch": 0.7494287890327495,
"grad_norm": 1.0653586387634277,
"learning_rate": 4.899846069250894e-06,
"loss": 1.2673,
"step": 328
},
{
"epoch": 0.7517136329017517,
"grad_norm": 1.0795682668685913,
"learning_rate": 4.898971577812179e-06,
"loss": 1.2778,
"step": 329
},
{
"epoch": 0.753998476770754,
"grad_norm": 1.0359232425689697,
"learning_rate": 4.8980933638741426e-06,
"loss": 1.2732,
"step": 330
},
{
"epoch": 0.7562833206397562,
"grad_norm": 1.0286237001419067,
"learning_rate": 4.897211428799512e-06,
"loss": 1.2455,
"step": 331
},
{
"epoch": 0.7585681645087585,
"grad_norm": 1.0179105997085571,
"learning_rate": 4.896325773956793e-06,
"loss": 1.2413,
"step": 332
},
{
"epoch": 0.7608530083777608,
"grad_norm": 1.0381865501403809,
"learning_rate": 4.895436400720264e-06,
"loss": 1.2409,
"step": 333
},
{
"epoch": 0.7631378522467631,
"grad_norm": 0.9918906688690186,
"learning_rate": 4.894543310469968e-06,
"loss": 1.2556,
"step": 334
},
{
"epoch": 0.7654226961157654,
"grad_norm": 1.0300416946411133,
"learning_rate": 4.8936465045917204e-06,
"loss": 1.2325,
"step": 335
},
{
"epoch": 0.7677075399847677,
"grad_norm": 1.052534580230713,
"learning_rate": 4.8927459844770995e-06,
"loss": 1.2561,
"step": 336
},
{
"epoch": 0.76999238385377,
"grad_norm": 1.0454604625701904,
"learning_rate": 4.891841751523448e-06,
"loss": 1.2845,
"step": 337
},
{
"epoch": 0.7722772277227723,
"grad_norm": 1.0518709421157837,
"learning_rate": 4.8909338071338706e-06,
"loss": 1.2485,
"step": 338
},
{
"epoch": 0.7745620715917746,
"grad_norm": 1.0326422452926636,
"learning_rate": 4.890022152717231e-06,
"loss": 1.2757,
"step": 339
},
{
"epoch": 0.7768469154607769,
"grad_norm": 1.2617943286895752,
"learning_rate": 4.889106789688148e-06,
"loss": 1.2656,
"step": 340
},
{
"epoch": 0.7791317593297792,
"grad_norm": 1.0038459300994873,
"learning_rate": 4.888187719466996e-06,
"loss": 1.2636,
"step": 341
},
{
"epoch": 0.7814166031987814,
"grad_norm": 1.1393420696258545,
"learning_rate": 4.887264943479903e-06,
"loss": 1.2621,
"step": 342
},
{
"epoch": 0.7837014470677837,
"grad_norm": 1.0969446897506714,
"learning_rate": 4.8863384631587446e-06,
"loss": 1.2208,
"step": 343
},
{
"epoch": 0.785986290936786,
"grad_norm": 1.034393310546875,
"learning_rate": 4.885408279941148e-06,
"loss": 1.2101,
"step": 344
},
{
"epoch": 0.7882711348057883,
"grad_norm": 1.1397764682769775,
"learning_rate": 4.884474395270484e-06,
"loss": 1.2823,
"step": 345
},
{
"epoch": 0.7905559786747905,
"grad_norm": 1.1488789319992065,
"learning_rate": 4.883536810595867e-06,
"loss": 1.2615,
"step": 346
},
{
"epoch": 0.7928408225437928,
"grad_norm": 1.0274580717086792,
"learning_rate": 4.8825955273721524e-06,
"loss": 1.2334,
"step": 347
},
{
"epoch": 0.7951256664127951,
"grad_norm": 1.0355713367462158,
"learning_rate": 4.8816505470599365e-06,
"loss": 1.2224,
"step": 348
},
{
"epoch": 0.7974105102817974,
"grad_norm": 1.0540703535079956,
"learning_rate": 4.880701871125551e-06,
"loss": 1.262,
"step": 349
},
{
"epoch": 0.7996953541507997,
"grad_norm": 1.0765819549560547,
"learning_rate": 4.879749501041062e-06,
"loss": 1.2731,
"step": 350
},
{
"epoch": 0.801980198019802,
"grad_norm": 1.0639638900756836,
"learning_rate": 4.878793438284268e-06,
"loss": 1.2673,
"step": 351
},
{
"epoch": 0.8042650418888042,
"grad_norm": 1.0149368047714233,
"learning_rate": 4.877833684338698e-06,
"loss": 1.2479,
"step": 352
},
{
"epoch": 0.8065498857578065,
"grad_norm": 1.1710435152053833,
"learning_rate": 4.876870240693608e-06,
"loss": 1.2775,
"step": 353
},
{
"epoch": 0.8088347296268088,
"grad_norm": 1.1317570209503174,
"learning_rate": 4.875903108843979e-06,
"loss": 1.2732,
"step": 354
},
{
"epoch": 0.8111195734958111,
"grad_norm": 1.0417158603668213,
"learning_rate": 4.874932290290517e-06,
"loss": 1.2647,
"step": 355
},
{
"epoch": 0.8134044173648134,
"grad_norm": 1.073765516281128,
"learning_rate": 4.873957786539646e-06,
"loss": 1.2738,
"step": 356
},
{
"epoch": 0.8156892612338157,
"grad_norm": 1.018481731414795,
"learning_rate": 4.872979599103511e-06,
"loss": 1.2509,
"step": 357
},
{
"epoch": 0.817974105102818,
"grad_norm": 1.0737470388412476,
"learning_rate": 4.8719977294999695e-06,
"loss": 1.232,
"step": 358
},
{
"epoch": 0.8202589489718203,
"grad_norm": 1.0921229124069214,
"learning_rate": 4.871012179252597e-06,
"loss": 1.2342,
"step": 359
},
{
"epoch": 0.8225437928408226,
"grad_norm": 1.0502641201019287,
"learning_rate": 4.870022949890676e-06,
"loss": 1.2463,
"step": 360
},
{
"epoch": 0.8248286367098249,
"grad_norm": 1.1755155324935913,
"learning_rate": 4.869030042949202e-06,
"loss": 1.2625,
"step": 361
},
{
"epoch": 0.8271134805788272,
"grad_norm": 1.0167341232299805,
"learning_rate": 4.868033459968874e-06,
"loss": 1.2563,
"step": 362
},
{
"epoch": 0.8293983244478293,
"grad_norm": 1.0481575727462769,
"learning_rate": 4.8670332024960954e-06,
"loss": 1.2541,
"step": 363
},
{
"epoch": 0.8316831683168316,
"grad_norm": 1.0657804012298584,
"learning_rate": 4.866029272082973e-06,
"loss": 1.2444,
"step": 364
},
{
"epoch": 0.8339680121858339,
"grad_norm": 1.0473397970199585,
"learning_rate": 4.865021670287311e-06,
"loss": 1.2356,
"step": 365
},
{
"epoch": 0.8362528560548362,
"grad_norm": 1.011077880859375,
"learning_rate": 4.864010398672612e-06,
"loss": 1.2417,
"step": 366
},
{
"epoch": 0.8385376999238385,
"grad_norm": 1.0485464334487915,
"learning_rate": 4.862995458808073e-06,
"loss": 1.2728,
"step": 367
},
{
"epoch": 0.8408225437928408,
"grad_norm": 1.0683908462524414,
"learning_rate": 4.861976852268582e-06,
"loss": 1.2354,
"step": 368
},
{
"epoch": 0.8431073876618431,
"grad_norm": 1.0323604345321655,
"learning_rate": 4.860954580634718e-06,
"loss": 1.2665,
"step": 369
},
{
"epoch": 0.8453922315308454,
"grad_norm": 1.024782419204712,
"learning_rate": 4.859928645492746e-06,
"loss": 1.2515,
"step": 370
},
{
"epoch": 0.8476770753998477,
"grad_norm": 1.02902090549469,
"learning_rate": 4.858899048434614e-06,
"loss": 1.2274,
"step": 371
},
{
"epoch": 0.84996191926885,
"grad_norm": 1.0355148315429688,
"learning_rate": 4.857865791057957e-06,
"loss": 1.2289,
"step": 372
},
{
"epoch": 0.8522467631378522,
"grad_norm": 1.0638132095336914,
"learning_rate": 4.856828874966086e-06,
"loss": 1.2245,
"step": 373
},
{
"epoch": 0.8545316070068545,
"grad_norm": 1.0459909439086914,
"learning_rate": 4.8557883017679895e-06,
"loss": 1.2347,
"step": 374
},
{
"epoch": 0.8568164508758568,
"grad_norm": 1.0818232297897339,
"learning_rate": 4.854744073078333e-06,
"loss": 1.2564,
"step": 375
},
{
"epoch": 0.8591012947448591,
"grad_norm": 1.0551162958145142,
"learning_rate": 4.853696190517452e-06,
"loss": 1.2809,
"step": 376
},
{
"epoch": 0.8613861386138614,
"grad_norm": 1.0419256687164307,
"learning_rate": 4.8526446557113525e-06,
"loss": 1.2532,
"step": 377
},
{
"epoch": 0.8636709824828637,
"grad_norm": 1.058478832244873,
"learning_rate": 4.851589470291707e-06,
"loss": 1.229,
"step": 378
},
{
"epoch": 0.865955826351866,
"grad_norm": 1.0275694131851196,
"learning_rate": 4.850530635895854e-06,
"loss": 1.2555,
"step": 379
},
{
"epoch": 0.8682406702208683,
"grad_norm": 1.0653144121170044,
"learning_rate": 4.849468154166794e-06,
"loss": 1.2397,
"step": 380
},
{
"epoch": 0.8705255140898706,
"grad_norm": 1.0227371454238892,
"learning_rate": 4.8484020267531855e-06,
"loss": 1.2568,
"step": 381
},
{
"epoch": 0.8728103579588729,
"grad_norm": 1.0583505630493164,
"learning_rate": 4.847332255309346e-06,
"loss": 1.2489,
"step": 382
},
{
"epoch": 0.8750952018278751,
"grad_norm": 1.0397239923477173,
"learning_rate": 4.846258841495246e-06,
"loss": 1.273,
"step": 383
},
{
"epoch": 0.8773800456968773,
"grad_norm": 1.020776391029358,
"learning_rate": 4.845181786976509e-06,
"loss": 1.2257,
"step": 384
},
{
"epoch": 0.8796648895658796,
"grad_norm": 1.0420705080032349,
"learning_rate": 4.844101093424407e-06,
"loss": 1.296,
"step": 385
},
{
"epoch": 0.8819497334348819,
"grad_norm": 1.0465624332427979,
"learning_rate": 4.84301676251586e-06,
"loss": 1.2514,
"step": 386
},
{
"epoch": 0.8842345773038842,
"grad_norm": 1.0915330648422241,
"learning_rate": 4.841928795933429e-06,
"loss": 1.2664,
"step": 387
},
{
"epoch": 0.8865194211728865,
"grad_norm": 1.0246195793151855,
"learning_rate": 4.84083719536532e-06,
"loss": 1.2499,
"step": 388
},
{
"epoch": 0.8888042650418888,
"grad_norm": 1.0145692825317383,
"learning_rate": 4.839741962505376e-06,
"loss": 1.2638,
"step": 389
},
{
"epoch": 0.8910891089108911,
"grad_norm": 1.05404531955719,
"learning_rate": 4.838643099053077e-06,
"loss": 1.1875,
"step": 390
},
{
"epoch": 0.8933739527798934,
"grad_norm": 1.1422752141952515,
"learning_rate": 4.837540606713538e-06,
"loss": 1.2496,
"step": 391
},
{
"epoch": 0.8956587966488957,
"grad_norm": 1.0648959875106812,
"learning_rate": 4.8364344871975e-06,
"loss": 1.2375,
"step": 392
},
{
"epoch": 0.897943640517898,
"grad_norm": 1.0459322929382324,
"learning_rate": 4.835324742221338e-06,
"loss": 1.2419,
"step": 393
},
{
"epoch": 0.9002284843869002,
"grad_norm": 1.0693044662475586,
"learning_rate": 4.834211373507048e-06,
"loss": 1.2485,
"step": 394
},
{
"epoch": 0.9025133282559025,
"grad_norm": 1.0930724143981934,
"learning_rate": 4.833094382782255e-06,
"loss": 1.2389,
"step": 395
},
{
"epoch": 0.9047981721249048,
"grad_norm": 1.1270296573638916,
"learning_rate": 4.831973771780197e-06,
"loss": 1.2033,
"step": 396
},
{
"epoch": 0.9070830159939071,
"grad_norm": 1.044074535369873,
"learning_rate": 4.830849542239735e-06,
"loss": 1.2464,
"step": 397
},
{
"epoch": 0.9093678598629094,
"grad_norm": 1.0138458013534546,
"learning_rate": 4.829721695905343e-06,
"loss": 1.2473,
"step": 398
},
{
"epoch": 0.9116527037319117,
"grad_norm": 1.1201279163360596,
"learning_rate": 4.828590234527107e-06,
"loss": 1.2729,
"step": 399
},
{
"epoch": 0.913937547600914,
"grad_norm": 1.0771571397781372,
"learning_rate": 4.8274551598607214e-06,
"loss": 1.2665,
"step": 400
},
{
"epoch": 0.9162223914699162,
"grad_norm": 1.0691912174224854,
"learning_rate": 4.8263164736674905e-06,
"loss": 1.2094,
"step": 401
},
{
"epoch": 0.9185072353389185,
"grad_norm": 1.0740418434143066,
"learning_rate": 4.8251741777143205e-06,
"loss": 1.2879,
"step": 402
},
{
"epoch": 0.9207920792079208,
"grad_norm": 1.0185081958770752,
"learning_rate": 4.824028273773719e-06,
"loss": 1.2459,
"step": 403
},
{
"epoch": 0.9230769230769231,
"grad_norm": 1.0672869682312012,
"learning_rate": 4.822878763623792e-06,
"loss": 1.2394,
"step": 404
},
{
"epoch": 0.9253617669459253,
"grad_norm": 1.08120858669281,
"learning_rate": 4.821725649048242e-06,
"loss": 1.2918,
"step": 405
},
{
"epoch": 0.9276466108149276,
"grad_norm": 1.0407681465148926,
"learning_rate": 4.820568931836364e-06,
"loss": 1.2443,
"step": 406
},
{
"epoch": 0.9299314546839299,
"grad_norm": 1.0847117900848389,
"learning_rate": 4.8194086137830445e-06,
"loss": 1.2505,
"step": 407
},
{
"epoch": 0.9322162985529322,
"grad_norm": 1.0484883785247803,
"learning_rate": 4.818244696688754e-06,
"loss": 1.2469,
"step": 408
},
{
"epoch": 0.9345011424219345,
"grad_norm": 1.0654011964797974,
"learning_rate": 4.817077182359553e-06,
"loss": 1.2544,
"step": 409
},
{
"epoch": 0.9367859862909368,
"grad_norm": 1.108176589012146,
"learning_rate": 4.815906072607079e-06,
"loss": 1.2387,
"step": 410
},
{
"epoch": 0.9390708301599391,
"grad_norm": 1.0624432563781738,
"learning_rate": 4.8147313692485495e-06,
"loss": 1.2488,
"step": 411
},
{
"epoch": 0.9413556740289414,
"grad_norm": 1.0391454696655273,
"learning_rate": 4.813553074106761e-06,
"loss": 1.2514,
"step": 412
},
{
"epoch": 0.9436405178979437,
"grad_norm": 1.1086232662200928,
"learning_rate": 4.812371189010081e-06,
"loss": 1.2694,
"step": 413
},
{
"epoch": 0.945925361766946,
"grad_norm": 1.0448237657546997,
"learning_rate": 4.8111857157924465e-06,
"loss": 1.2366,
"step": 414
},
{
"epoch": 0.9482102056359482,
"grad_norm": 1.0393203496932983,
"learning_rate": 4.809996656293367e-06,
"loss": 1.2747,
"step": 415
},
{
"epoch": 0.9504950495049505,
"grad_norm": 1.083590030670166,
"learning_rate": 4.8088040123579106e-06,
"loss": 1.2167,
"step": 416
},
{
"epoch": 0.9527798933739527,
"grad_norm": 1.071567177772522,
"learning_rate": 4.807607785836711e-06,
"loss": 1.2108,
"step": 417
},
{
"epoch": 0.955064737242955,
"grad_norm": 1.0953818559646606,
"learning_rate": 4.8064079785859615e-06,
"loss": 1.2381,
"step": 418
},
{
"epoch": 0.9573495811119573,
"grad_norm": 1.0628875494003296,
"learning_rate": 4.8052045924674105e-06,
"loss": 1.232,
"step": 419
},
{
"epoch": 0.9596344249809596,
"grad_norm": 1.0838161706924438,
"learning_rate": 4.803997629348359e-06,
"loss": 1.2699,
"step": 420
},
{
"epoch": 0.9619192688499619,
"grad_norm": 0.9980992078781128,
"learning_rate": 4.802787091101659e-06,
"loss": 1.2473,
"step": 421
},
{
"epoch": 0.9642041127189642,
"grad_norm": 1.094283103942871,
"learning_rate": 4.801572979605712e-06,
"loss": 1.2656,
"step": 422
},
{
"epoch": 0.9664889565879665,
"grad_norm": 1.0554611682891846,
"learning_rate": 4.800355296744461e-06,
"loss": 1.2584,
"step": 423
},
{
"epoch": 0.9687738004569688,
"grad_norm": 1.1019188165664673,
"learning_rate": 4.799134044407392e-06,
"loss": 1.2877,
"step": 424
},
{
"epoch": 0.9710586443259711,
"grad_norm": 1.087965726852417,
"learning_rate": 4.797909224489531e-06,
"loss": 1.2662,
"step": 425
},
{
"epoch": 0.9733434881949733,
"grad_norm": 1.08269202709198,
"learning_rate": 4.796680838891438e-06,
"loss": 1.2419,
"step": 426
},
{
"epoch": 0.9756283320639756,
"grad_norm": 1.071199893951416,
"learning_rate": 4.795448889519207e-06,
"loss": 1.2489,
"step": 427
},
{
"epoch": 0.9779131759329779,
"grad_norm": 1.0306544303894043,
"learning_rate": 4.794213378284462e-06,
"loss": 1.2467,
"step": 428
},
{
"epoch": 0.9801980198019802,
"grad_norm": 1.0567327737808228,
"learning_rate": 4.792974307104353e-06,
"loss": 1.2637,
"step": 429
},
{
"epoch": 0.9824828636709825,
"grad_norm": 1.0448797941207886,
"learning_rate": 4.7917316779015554e-06,
"loss": 1.2244,
"step": 430
},
{
"epoch": 0.9847677075399848,
"grad_norm": 1.0123138427734375,
"learning_rate": 4.790485492604264e-06,
"loss": 1.2326,
"step": 431
},
{
"epoch": 0.9870525514089871,
"grad_norm": 1.0484559535980225,
"learning_rate": 4.789235753146192e-06,
"loss": 1.2436,
"step": 432
},
{
"epoch": 0.9893373952779894,
"grad_norm": 1.0161617994308472,
"learning_rate": 4.787982461466568e-06,
"loss": 1.2185,
"step": 433
},
{
"epoch": 0.9916222391469917,
"grad_norm": 1.0779787302017212,
"learning_rate": 4.786725619510134e-06,
"loss": 1.2256,
"step": 434
},
{
"epoch": 0.993907083015994,
"grad_norm": 1.061590552330017,
"learning_rate": 4.785465229227139e-06,
"loss": 1.2747,
"step": 435
},
{
"epoch": 0.9961919268849961,
"grad_norm": 1.102403163909912,
"learning_rate": 4.784201292573337e-06,
"loss": 1.2561,
"step": 436
},
{
"epoch": 0.9984767707539984,
"grad_norm": 0.9936567544937134,
"learning_rate": 4.782933811509988e-06,
"loss": 1.2409,
"step": 437
},
{
"epoch": 1.0,
"grad_norm": 0.9936567544937134,
"learning_rate": 4.781662788003851e-06,
"loss": 1.2271,
"step": 438
},
{
"epoch": 1.0022848438690022,
"grad_norm": 1.4983975887298584,
"learning_rate": 4.780388224027179e-06,
"loss": 1.2312,
"step": 439
},
{
"epoch": 1.0045696877380046,
"grad_norm": 1.0163486003875732,
"learning_rate": 4.779110121557723e-06,
"loss": 1.1992,
"step": 440
},
{
"epoch": 1.0068545316070068,
"grad_norm": 1.0127511024475098,
"learning_rate": 4.777828482578722e-06,
"loss": 1.2135,
"step": 441
},
{
"epoch": 1.0091393754760092,
"grad_norm": 1.0328449010849,
"learning_rate": 4.776543309078903e-06,
"loss": 1.2143,
"step": 442
},
{
"epoch": 1.0114242193450114,
"grad_norm": 1.1579132080078125,
"learning_rate": 4.7752546030524775e-06,
"loss": 1.2051,
"step": 443
},
{
"epoch": 1.0137090632140138,
"grad_norm": 1.0556671619415283,
"learning_rate": 4.77396236649914e-06,
"loss": 1.2136,
"step": 444
},
{
"epoch": 1.015993907083016,
"grad_norm": 1.0315356254577637,
"learning_rate": 4.772666601424061e-06,
"loss": 1.2444,
"step": 445
},
{
"epoch": 1.0182787509520184,
"grad_norm": 1.0836431980133057,
"learning_rate": 4.771367309837888e-06,
"loss": 1.1967,
"step": 446
},
{
"epoch": 1.0205635948210205,
"grad_norm": 1.0516763925552368,
"learning_rate": 4.7700644937567385e-06,
"loss": 1.2012,
"step": 447
},
{
"epoch": 1.022848438690023,
"grad_norm": 1.0447187423706055,
"learning_rate": 4.768758155202202e-06,
"loss": 1.2281,
"step": 448
},
{
"epoch": 1.0251332825590251,
"grad_norm": 1.0647971630096436,
"learning_rate": 4.767448296201332e-06,
"loss": 1.1907,
"step": 449
},
{
"epoch": 1.0274181264280273,
"grad_norm": 1.0792133808135986,
"learning_rate": 4.766134918786646e-06,
"loss": 1.2346,
"step": 450
},
{
"epoch": 1.0297029702970297,
"grad_norm": 1.1311832666397095,
"learning_rate": 4.764818024996117e-06,
"loss": 1.2652,
"step": 451
},
{
"epoch": 1.031987814166032,
"grad_norm": 1.090455174446106,
"learning_rate": 4.763497616873181e-06,
"loss": 1.2258,
"step": 452
},
{
"epoch": 1.0342726580350343,
"grad_norm": 1.0815131664276123,
"learning_rate": 4.7621736964667204e-06,
"loss": 1.2233,
"step": 453
},
{
"epoch": 1.0365575019040365,
"grad_norm": 1.112673044204712,
"learning_rate": 4.760846265831073e-06,
"loss": 1.2136,
"step": 454
},
{
"epoch": 1.038842345773039,
"grad_norm": 1.068178653717041,
"learning_rate": 4.759515327026019e-06,
"loss": 1.214,
"step": 455
},
{
"epoch": 1.041127189642041,
"grad_norm": 1.079059362411499,
"learning_rate": 4.758180882116788e-06,
"loss": 1.2024,
"step": 456
},
{
"epoch": 1.0434120335110435,
"grad_norm": 1.1010431051254272,
"learning_rate": 4.756842933174044e-06,
"loss": 1.2239,
"step": 457
},
{
"epoch": 1.0456968773800457,
"grad_norm": 1.1039162874221802,
"learning_rate": 4.755501482273892e-06,
"loss": 1.2212,
"step": 458
},
{
"epoch": 1.047981721249048,
"grad_norm": 1.0536257028579712,
"learning_rate": 4.754156531497869e-06,
"loss": 1.1672,
"step": 459
},
{
"epoch": 1.0502665651180503,
"grad_norm": 1.085410475730896,
"learning_rate": 4.752808082932943e-06,
"loss": 1.2471,
"step": 460
},
{
"epoch": 1.0525514089870525,
"grad_norm": 1.0432335138320923,
"learning_rate": 4.751456138671512e-06,
"loss": 1.2345,
"step": 461
},
{
"epoch": 1.0548362528560549,
"grad_norm": 1.072708249092102,
"learning_rate": 4.750100700811395e-06,
"loss": 1.2328,
"step": 462
},
{
"epoch": 1.057121096725057,
"grad_norm": 1.0852991342544556,
"learning_rate": 4.748741771455835e-06,
"loss": 1.19,
"step": 463
},
{
"epoch": 1.0594059405940595,
"grad_norm": 1.0906398296356201,
"learning_rate": 4.747379352713489e-06,
"loss": 1.229,
"step": 464
},
{
"epoch": 1.0616907844630616,
"grad_norm": 1.055309772491455,
"learning_rate": 4.746013446698432e-06,
"loss": 1.2419,
"step": 465
},
{
"epoch": 1.063975628332064,
"grad_norm": 1.0667710304260254,
"learning_rate": 4.744644055530149e-06,
"loss": 1.1943,
"step": 466
},
{
"epoch": 1.0662604722010662,
"grad_norm": 1.091098427772522,
"learning_rate": 4.743271181333533e-06,
"loss": 1.171,
"step": 467
},
{
"epoch": 1.0685453160700686,
"grad_norm": 1.0701195001602173,
"learning_rate": 4.741894826238882e-06,
"loss": 1.2163,
"step": 468
},
{
"epoch": 1.0708301599390708,
"grad_norm": 1.0526652336120605,
"learning_rate": 4.740514992381893e-06,
"loss": 1.2329,
"step": 469
},
{
"epoch": 1.073115003808073,
"grad_norm": 1.0725401639938354,
"learning_rate": 4.739131681903666e-06,
"loss": 1.1793,
"step": 470
},
{
"epoch": 1.0753998476770754,
"grad_norm": 1.126091480255127,
"learning_rate": 4.737744896950689e-06,
"loss": 1.1769,
"step": 471
},
{
"epoch": 1.0776846915460776,
"grad_norm": 1.039844036102295,
"learning_rate": 4.736354639674847e-06,
"loss": 1.205,
"step": 472
},
{
"epoch": 1.07996953541508,
"grad_norm": 1.0432695150375366,
"learning_rate": 4.734960912233411e-06,
"loss": 1.1909,
"step": 473
},
{
"epoch": 1.0822543792840822,
"grad_norm": 1.0890403985977173,
"learning_rate": 4.7335637167890366e-06,
"loss": 1.1928,
"step": 474
},
{
"epoch": 1.0845392231530846,
"grad_norm": 1.0710458755493164,
"learning_rate": 4.732163055509759e-06,
"loss": 1.2402,
"step": 475
},
{
"epoch": 1.0868240670220868,
"grad_norm": 1.0940525531768799,
"learning_rate": 4.730758930568997e-06,
"loss": 1.181,
"step": 476
},
{
"epoch": 1.0891089108910892,
"grad_norm": 1.177641749382019,
"learning_rate": 4.729351344145536e-06,
"loss": 1.2232,
"step": 477
},
{
"epoch": 1.0913937547600914,
"grad_norm": 1.1401522159576416,
"learning_rate": 4.72794029842354e-06,
"loss": 1.1878,
"step": 478
},
{
"epoch": 1.0936785986290938,
"grad_norm": 1.0874228477478027,
"learning_rate": 4.726525795592535e-06,
"loss": 1.1658,
"step": 479
},
{
"epoch": 1.095963442498096,
"grad_norm": 1.0325064659118652,
"learning_rate": 4.725107837847414e-06,
"loss": 1.2084,
"step": 480
},
{
"epoch": 1.0982482863670981,
"grad_norm": 1.0492074489593506,
"learning_rate": 4.723686427388434e-06,
"loss": 1.2208,
"step": 481
},
{
"epoch": 1.1005331302361006,
"grad_norm": 1.1399495601654053,
"learning_rate": 4.722261566421204e-06,
"loss": 1.2158,
"step": 482
},
{
"epoch": 1.1028179741051027,
"grad_norm": 1.1156668663024902,
"learning_rate": 4.72083325715669e-06,
"loss": 1.2252,
"step": 483
},
{
"epoch": 1.1051028179741051,
"grad_norm": 1.072943091392517,
"learning_rate": 4.719401501811209e-06,
"loss": 1.2381,
"step": 484
},
{
"epoch": 1.1073876618431073,
"grad_norm": 1.0337257385253906,
"learning_rate": 4.717966302606424e-06,
"loss": 1.1782,
"step": 485
},
{
"epoch": 1.1096725057121097,
"grad_norm": 1.0744901895523071,
"learning_rate": 4.716527661769344e-06,
"loss": 1.2412,
"step": 486
},
{
"epoch": 1.111957349581112,
"grad_norm": 1.0331535339355469,
"learning_rate": 4.715085581532316e-06,
"loss": 1.1869,
"step": 487
},
{
"epoch": 1.1142421934501143,
"grad_norm": 1.0795518159866333,
"learning_rate": 4.7136400641330245e-06,
"loss": 1.214,
"step": 488
},
{
"epoch": 1.1165270373191165,
"grad_norm": 1.084125280380249,
"learning_rate": 4.71219111181449e-06,
"loss": 1.2049,
"step": 489
},
{
"epoch": 1.118811881188119,
"grad_norm": 1.1166882514953613,
"learning_rate": 4.710738726825059e-06,
"loss": 1.2143,
"step": 490
},
{
"epoch": 1.121096725057121,
"grad_norm": 1.0764187574386597,
"learning_rate": 4.709282911418408e-06,
"loss": 1.2301,
"step": 491
},
{
"epoch": 1.1233815689261233,
"grad_norm": 1.069149374961853,
"learning_rate": 4.7078236678535335e-06,
"loss": 1.2094,
"step": 492
},
{
"epoch": 1.1256664127951257,
"grad_norm": 1.0744988918304443,
"learning_rate": 4.7063609983947535e-06,
"loss": 1.1893,
"step": 493
},
{
"epoch": 1.1279512566641279,
"grad_norm": 1.090267300605774,
"learning_rate": 4.704894905311701e-06,
"loss": 1.1575,
"step": 494
},
{
"epoch": 1.1302361005331303,
"grad_norm": 1.067543625831604,
"learning_rate": 4.703425390879323e-06,
"loss": 1.1801,
"step": 495
},
{
"epoch": 1.1325209444021325,
"grad_norm": 1.0365897417068481,
"learning_rate": 4.701952457377874e-06,
"loss": 1.2197,
"step": 496
},
{
"epoch": 1.1348057882711349,
"grad_norm": 1.066163420677185,
"learning_rate": 4.700476107092913e-06,
"loss": 1.2156,
"step": 497
},
{
"epoch": 1.137090632140137,
"grad_norm": 1.1297317743301392,
"learning_rate": 4.698996342315303e-06,
"loss": 1.2064,
"step": 498
},
{
"epoch": 1.1393754760091395,
"grad_norm": 1.069610834121704,
"learning_rate": 4.697513165341204e-06,
"loss": 1.1986,
"step": 499
},
{
"epoch": 1.1416603198781416,
"grad_norm": 1.0844234228134155,
"learning_rate": 4.696026578472073e-06,
"loss": 1.1892,
"step": 500
},
{
"epoch": 1.1439451637471438,
"grad_norm": 1.079389214515686,
"learning_rate": 4.694536584014653e-06,
"loss": 1.201,
"step": 501
},
{
"epoch": 1.1462300076161462,
"grad_norm": 1.075682520866394,
"learning_rate": 4.693043184280978e-06,
"loss": 1.2261,
"step": 502
},
{
"epoch": 1.1485148514851484,
"grad_norm": 1.0244231224060059,
"learning_rate": 4.69154638158837e-06,
"loss": 1.2048,
"step": 503
},
{
"epoch": 1.1507996953541508,
"grad_norm": 1.0907280445098877,
"learning_rate": 4.690046178259423e-06,
"loss": 1.2202,
"step": 504
},
{
"epoch": 1.153084539223153,
"grad_norm": 1.097701907157898,
"learning_rate": 4.688542576622013e-06,
"loss": 1.1781,
"step": 505
},
{
"epoch": 1.1553693830921554,
"grad_norm": 1.0993037223815918,
"learning_rate": 4.687035579009288e-06,
"loss": 1.2113,
"step": 506
},
{
"epoch": 1.1576542269611576,
"grad_norm": 1.085300087928772,
"learning_rate": 4.685525187759666e-06,
"loss": 1.1996,
"step": 507
},
{
"epoch": 1.15993907083016,
"grad_norm": 1.0483977794647217,
"learning_rate": 4.684011405216832e-06,
"loss": 1.2343,
"step": 508
},
{
"epoch": 1.1622239146991622,
"grad_norm": 1.064441442489624,
"learning_rate": 4.682494233729729e-06,
"loss": 1.2405,
"step": 509
},
{
"epoch": 1.1645087585681646,
"grad_norm": 1.05643630027771,
"learning_rate": 4.680973675652564e-06,
"loss": 1.2112,
"step": 510
},
{
"epoch": 1.1667936024371668,
"grad_norm": 1.0908793210983276,
"learning_rate": 4.679449733344796e-06,
"loss": 1.2077,
"step": 511
},
{
"epoch": 1.1690784463061692,
"grad_norm": 1.1248105764389038,
"learning_rate": 4.677922409171136e-06,
"loss": 1.1987,
"step": 512
},
{
"epoch": 1.1713632901751714,
"grad_norm": 1.0605143308639526,
"learning_rate": 4.6763917055015414e-06,
"loss": 1.2027,
"step": 513
},
{
"epoch": 1.1736481340441736,
"grad_norm": 1.0496442317962646,
"learning_rate": 4.674857624711216e-06,
"loss": 1.2259,
"step": 514
},
{
"epoch": 1.175932977913176,
"grad_norm": 1.1550832986831665,
"learning_rate": 4.673320169180601e-06,
"loss": 1.2418,
"step": 515
},
{
"epoch": 1.1782178217821782,
"grad_norm": 1.1532083749771118,
"learning_rate": 4.671779341295378e-06,
"loss": 1.2265,
"step": 516
},
{
"epoch": 1.1805026656511806,
"grad_norm": 1.081101417541504,
"learning_rate": 4.670235143446457e-06,
"loss": 1.2078,
"step": 517
},
{
"epoch": 1.1827875095201827,
"grad_norm": 1.0701441764831543,
"learning_rate": 4.668687578029983e-06,
"loss": 1.2252,
"step": 518
},
{
"epoch": 1.1850723533891852,
"grad_norm": 1.0859651565551758,
"learning_rate": 4.667136647447319e-06,
"loss": 1.2131,
"step": 519
},
{
"epoch": 1.1873571972581873,
"grad_norm": 1.122533917427063,
"learning_rate": 4.6655823541050575e-06,
"loss": 1.1608,
"step": 520
},
{
"epoch": 1.1896420411271897,
"grad_norm": 1.0594309568405151,
"learning_rate": 4.664024700415002e-06,
"loss": 1.2151,
"step": 521
},
{
"epoch": 1.191926884996192,
"grad_norm": 1.1281721591949463,
"learning_rate": 4.662463688794175e-06,
"loss": 1.2197,
"step": 522
},
{
"epoch": 1.194211728865194,
"grad_norm": 1.1049555540084839,
"learning_rate": 4.660899321664808e-06,
"loss": 1.2416,
"step": 523
},
{
"epoch": 1.1964965727341965,
"grad_norm": 1.1022320985794067,
"learning_rate": 4.65933160145434e-06,
"loss": 1.1842,
"step": 524
},
{
"epoch": 1.1987814166031987,
"grad_norm": 1.1182475090026855,
"learning_rate": 4.657760530595411e-06,
"loss": 1.1417,
"step": 525
},
{
"epoch": 1.201066260472201,
"grad_norm": 1.1111788749694824,
"learning_rate": 4.656186111525863e-06,
"loss": 1.2092,
"step": 526
},
{
"epoch": 1.2033511043412033,
"grad_norm": 1.0595399141311646,
"learning_rate": 4.654608346688731e-06,
"loss": 1.1549,
"step": 527
},
{
"epoch": 1.2056359482102057,
"grad_norm": 1.1151765584945679,
"learning_rate": 4.6530272385322426e-06,
"loss": 1.2469,
"step": 528
},
{
"epoch": 1.2079207920792079,
"grad_norm": 1.0334808826446533,
"learning_rate": 4.651442789509813e-06,
"loss": 1.229,
"step": 529
},
{
"epoch": 1.2102056359482103,
"grad_norm": 1.0765459537506104,
"learning_rate": 4.649855002080044e-06,
"loss": 1.2163,
"step": 530
},
{
"epoch": 1.2124904798172125,
"grad_norm": 1.1509029865264893,
"learning_rate": 4.648263878706712e-06,
"loss": 1.2101,
"step": 531
},
{
"epoch": 1.2147753236862147,
"grad_norm": 1.1111629009246826,
"learning_rate": 4.646669421858776e-06,
"loss": 1.2379,
"step": 532
},
{
"epoch": 1.217060167555217,
"grad_norm": 1.1683619022369385,
"learning_rate": 4.645071634010363e-06,
"loss": 1.2011,
"step": 533
},
{
"epoch": 1.2193450114242192,
"grad_norm": 1.2373298406600952,
"learning_rate": 4.643470517640772e-06,
"loss": 1.1502,
"step": 534
},
{
"epoch": 1.2216298552932217,
"grad_norm": 1.080675482749939,
"learning_rate": 4.641866075234463e-06,
"loss": 1.2173,
"step": 535
},
{
"epoch": 1.2239146991622238,
"grad_norm": 1.0971184968948364,
"learning_rate": 4.640258309281062e-06,
"loss": 1.2117,
"step": 536
},
{
"epoch": 1.2261995430312262,
"grad_norm": 1.183856725692749,
"learning_rate": 4.638647222275349e-06,
"loss": 1.2137,
"step": 537
},
{
"epoch": 1.2284843869002284,
"grad_norm": 1.2277085781097412,
"learning_rate": 4.637032816717256e-06,
"loss": 1.1977,
"step": 538
},
{
"epoch": 1.2307692307692308,
"grad_norm": 1.1087970733642578,
"learning_rate": 4.6354150951118676e-06,
"loss": 1.2256,
"step": 539
},
{
"epoch": 1.233054074638233,
"grad_norm": 1.0706229209899902,
"learning_rate": 4.633794059969413e-06,
"loss": 1.2429,
"step": 540
},
{
"epoch": 1.2353389185072354,
"grad_norm": 1.1261042356491089,
"learning_rate": 4.632169713805262e-06,
"loss": 1.219,
"step": 541
},
{
"epoch": 1.2376237623762376,
"grad_norm": 1.1817506551742554,
"learning_rate": 4.630542059139923e-06,
"loss": 1.2367,
"step": 542
},
{
"epoch": 1.23990860624524,
"grad_norm": 1.1145075559616089,
"learning_rate": 4.628911098499039e-06,
"loss": 1.2029,
"step": 543
},
{
"epoch": 1.2421934501142422,
"grad_norm": 1.1309086084365845,
"learning_rate": 4.62727683441338e-06,
"loss": 1.2374,
"step": 544
},
{
"epoch": 1.2444782939832444,
"grad_norm": 1.0949103832244873,
"learning_rate": 4.6256392694188445e-06,
"loss": 1.2204,
"step": 545
},
{
"epoch": 1.2467631378522468,
"grad_norm": 1.2004865407943726,
"learning_rate": 4.6239984060564535e-06,
"loss": 1.2327,
"step": 546
},
{
"epoch": 1.249047981721249,
"grad_norm": 1.286232829093933,
"learning_rate": 4.622354246872344e-06,
"loss": 1.1838,
"step": 547
},
{
"epoch": 1.2513328255902514,
"grad_norm": 1.0974533557891846,
"learning_rate": 4.620706794417769e-06,
"loss": 1.1678,
"step": 548
},
{
"epoch": 1.2536176694592536,
"grad_norm": 1.0960924625396729,
"learning_rate": 4.61905605124909e-06,
"loss": 1.2314,
"step": 549
},
{
"epoch": 1.255902513328256,
"grad_norm": 1.1535454988479614,
"learning_rate": 4.617402019927776e-06,
"loss": 1.1928,
"step": 550
},
{
"epoch": 1.2581873571972582,
"grad_norm": 1.2693071365356445,
"learning_rate": 4.615744703020396e-06,
"loss": 1.1966,
"step": 551
},
{
"epoch": 1.2604722010662606,
"grad_norm": 1.1645997762680054,
"learning_rate": 4.614084103098623e-06,
"loss": 1.2251,
"step": 552
},
{
"epoch": 1.2627570449352628,
"grad_norm": 1.1186461448669434,
"learning_rate": 4.6124202227392175e-06,
"loss": 1.2037,
"step": 553
},
{
"epoch": 1.265041888804265,
"grad_norm": 1.1100102663040161,
"learning_rate": 4.610753064524034e-06,
"loss": 1.2011,
"step": 554
},
{
"epoch": 1.2673267326732673,
"grad_norm": 1.1173806190490723,
"learning_rate": 4.609082631040012e-06,
"loss": 1.1871,
"step": 555
},
{
"epoch": 1.2696115765422697,
"grad_norm": 1.1128157377243042,
"learning_rate": 4.6074089248791735e-06,
"loss": 1.1965,
"step": 556
},
{
"epoch": 1.271896420411272,
"grad_norm": 1.0940717458724976,
"learning_rate": 4.60573194863862e-06,
"loss": 1.193,
"step": 557
},
{
"epoch": 1.2741812642802741,
"grad_norm": 1.0955843925476074,
"learning_rate": 4.604051704920526e-06,
"loss": 1.187,
"step": 558
},
{
"epoch": 1.2764661081492765,
"grad_norm": 1.0802319049835205,
"learning_rate": 4.602368196332134e-06,
"loss": 1.1753,
"step": 559
},
{
"epoch": 1.2787509520182787,
"grad_norm": 1.1677578687667847,
"learning_rate": 4.600681425485757e-06,
"loss": 1.1964,
"step": 560
},
{
"epoch": 1.2810357958872811,
"grad_norm": 1.0803853273391724,
"learning_rate": 4.598991394998768e-06,
"loss": 1.2052,
"step": 561
},
{
"epoch": 1.2833206397562833,
"grad_norm": 1.128667950630188,
"learning_rate": 4.5972981074935975e-06,
"loss": 1.1774,
"step": 562
},
{
"epoch": 1.2856054836252855,
"grad_norm": 1.0685629844665527,
"learning_rate": 4.59560156559773e-06,
"loss": 1.1897,
"step": 563
},
{
"epoch": 1.287890327494288,
"grad_norm": 1.1464303731918335,
"learning_rate": 4.593901771943702e-06,
"loss": 1.1809,
"step": 564
},
{
"epoch": 1.2901751713632903,
"grad_norm": 1.1095281839370728,
"learning_rate": 4.592198729169091e-06,
"loss": 1.2118,
"step": 565
},
{
"epoch": 1.2924600152322925,
"grad_norm": 1.0897274017333984,
"learning_rate": 4.5904924399165215e-06,
"loss": 1.177,
"step": 566
},
{
"epoch": 1.2947448591012947,
"grad_norm": 1.0702495574951172,
"learning_rate": 4.588782906833653e-06,
"loss": 1.1872,
"step": 567
},
{
"epoch": 1.297029702970297,
"grad_norm": 1.0990184545516968,
"learning_rate": 4.587070132573178e-06,
"loss": 1.1903,
"step": 568
},
{
"epoch": 1.2993145468392993,
"grad_norm": 1.121097207069397,
"learning_rate": 4.58535411979282e-06,
"loss": 1.2469,
"step": 569
},
{
"epoch": 1.3015993907083017,
"grad_norm": 1.0787534713745117,
"learning_rate": 4.583634871155326e-06,
"loss": 1.1995,
"step": 570
},
{
"epoch": 1.3038842345773038,
"grad_norm": 1.0721417665481567,
"learning_rate": 4.581912389328466e-06,
"loss": 1.1889,
"step": 571
},
{
"epoch": 1.306169078446306,
"grad_norm": 1.1017696857452393,
"learning_rate": 4.580186676985024e-06,
"loss": 1.2133,
"step": 572
},
{
"epoch": 1.3084539223153084,
"grad_norm": 1.1040468215942383,
"learning_rate": 4.578457736802801e-06,
"loss": 1.1894,
"step": 573
},
{
"epoch": 1.3107387661843108,
"grad_norm": 1.0856465101242065,
"learning_rate": 4.576725571464604e-06,
"loss": 1.2234,
"step": 574
},
{
"epoch": 1.313023610053313,
"grad_norm": 1.0786073207855225,
"learning_rate": 4.574990183658244e-06,
"loss": 1.1989,
"step": 575
},
{
"epoch": 1.3153084539223152,
"grad_norm": 1.0701881647109985,
"learning_rate": 4.573251576076532e-06,
"loss": 1.2095,
"step": 576
},
{
"epoch": 1.3175932977913176,
"grad_norm": 1.0697689056396484,
"learning_rate": 4.5715097514172794e-06,
"loss": 1.2198,
"step": 577
},
{
"epoch": 1.3198781416603198,
"grad_norm": 1.1303515434265137,
"learning_rate": 4.569764712383284e-06,
"loss": 1.2456,
"step": 578
},
{
"epoch": 1.3221629855293222,
"grad_norm": 1.1471296548843384,
"learning_rate": 4.5680164616823355e-06,
"loss": 1.2155,
"step": 579
},
{
"epoch": 1.3244478293983244,
"grad_norm": 1.0679783821105957,
"learning_rate": 4.566265002027204e-06,
"loss": 1.2346,
"step": 580
},
{
"epoch": 1.3267326732673268,
"grad_norm": 1.087063193321228,
"learning_rate": 4.564510336135642e-06,
"loss": 1.1735,
"step": 581
},
{
"epoch": 1.329017517136329,
"grad_norm": 1.190617322921753,
"learning_rate": 4.562752466730374e-06,
"loss": 1.2472,
"step": 582
},
{
"epoch": 1.3313023610053314,
"grad_norm": 1.0759129524230957,
"learning_rate": 4.560991396539099e-06,
"loss": 1.2263,
"step": 583
},
{
"epoch": 1.3335872048743336,
"grad_norm": 1.080640196800232,
"learning_rate": 4.559227128294479e-06,
"loss": 1.1773,
"step": 584
},
{
"epoch": 1.3358720487433358,
"grad_norm": 1.0868074893951416,
"learning_rate": 4.5574596647341414e-06,
"loss": 1.254,
"step": 585
},
{
"epoch": 1.3381568926123382,
"grad_norm": 1.0621445178985596,
"learning_rate": 4.55568900860067e-06,
"loss": 1.2091,
"step": 586
},
{
"epoch": 1.3404417364813406,
"grad_norm": 1.1124675273895264,
"learning_rate": 4.553915162641602e-06,
"loss": 1.2093,
"step": 587
},
{
"epoch": 1.3427265803503428,
"grad_norm": 1.0877987146377563,
"learning_rate": 4.552138129609428e-06,
"loss": 1.2399,
"step": 588
},
{
"epoch": 1.345011424219345,
"grad_norm": 1.1441737413406372,
"learning_rate": 4.550357912261579e-06,
"loss": 1.2274,
"step": 589
},
{
"epoch": 1.3472962680883473,
"grad_norm": 1.131813645362854,
"learning_rate": 4.548574513360431e-06,
"loss": 1.2296,
"step": 590
},
{
"epoch": 1.3495811119573495,
"grad_norm": 1.0901340246200562,
"learning_rate": 4.546787935673294e-06,
"loss": 1.2151,
"step": 591
},
{
"epoch": 1.351865955826352,
"grad_norm": 1.1450812816619873,
"learning_rate": 4.544998181972412e-06,
"loss": 1.2054,
"step": 592
},
{
"epoch": 1.3541507996953541,
"grad_norm": 1.0988374948501587,
"learning_rate": 4.543205255034958e-06,
"loss": 1.2133,
"step": 593
},
{
"epoch": 1.3564356435643563,
"grad_norm": 1.0787577629089355,
"learning_rate": 4.541409157643027e-06,
"loss": 1.1972,
"step": 594
},
{
"epoch": 1.3587204874333587,
"grad_norm": 1.0694243907928467,
"learning_rate": 4.539609892583637e-06,
"loss": 1.2182,
"step": 595
},
{
"epoch": 1.3610053313023611,
"grad_norm": 1.065865397453308,
"learning_rate": 4.537807462648716e-06,
"loss": 1.2057,
"step": 596
},
{
"epoch": 1.3632901751713633,
"grad_norm": 1.0816274881362915,
"learning_rate": 4.5360018706351075e-06,
"loss": 1.1846,
"step": 597
},
{
"epoch": 1.3655750190403655,
"grad_norm": 1.105301022529602,
"learning_rate": 4.5341931193445585e-06,
"loss": 1.2219,
"step": 598
},
{
"epoch": 1.367859862909368,
"grad_norm": 1.1194454431533813,
"learning_rate": 4.5323812115837215e-06,
"loss": 1.2021,
"step": 599
},
{
"epoch": 1.37014470677837,
"grad_norm": 1.0899710655212402,
"learning_rate": 4.530566150164145e-06,
"loss": 1.173,
"step": 600
},
{
"epoch": 1.3724295506473725,
"grad_norm": 1.0824511051177979,
"learning_rate": 4.528747937902271e-06,
"loss": 1.2131,
"step": 601
},
{
"epoch": 1.3747143945163747,
"grad_norm": 1.0796427726745605,
"learning_rate": 4.52692657761943e-06,
"loss": 1.1911,
"step": 602
},
{
"epoch": 1.376999238385377,
"grad_norm": 1.1047371625900269,
"learning_rate": 4.525102072141839e-06,
"loss": 1.1734,
"step": 603
},
{
"epoch": 1.3792840822543793,
"grad_norm": 1.1101821660995483,
"learning_rate": 4.523274424300596e-06,
"loss": 1.2274,
"step": 604
},
{
"epoch": 1.3815689261233817,
"grad_norm": 1.1410820484161377,
"learning_rate": 4.521443636931671e-06,
"loss": 1.2,
"step": 605
},
{
"epoch": 1.3838537699923839,
"grad_norm": 1.0687155723571777,
"learning_rate": 4.5196097128759095e-06,
"loss": 1.2028,
"step": 606
},
{
"epoch": 1.386138613861386,
"grad_norm": 1.0923937559127808,
"learning_rate": 4.517772654979024e-06,
"loss": 1.2522,
"step": 607
},
{
"epoch": 1.3884234577303884,
"grad_norm": 1.1132218837738037,
"learning_rate": 4.515932466091587e-06,
"loss": 1.1797,
"step": 608
},
{
"epoch": 1.3907083015993906,
"grad_norm": 1.182809591293335,
"learning_rate": 4.514089149069033e-06,
"loss": 1.1885,
"step": 609
},
{
"epoch": 1.392993145468393,
"grad_norm": 1.064723253250122,
"learning_rate": 4.512242706771647e-06,
"loss": 1.174,
"step": 610
},
{
"epoch": 1.3952779893373952,
"grad_norm": 1.1065499782562256,
"learning_rate": 4.510393142064567e-06,
"loss": 1.1919,
"step": 611
},
{
"epoch": 1.3975628332063976,
"grad_norm": 1.0989713668823242,
"learning_rate": 4.508540457817772e-06,
"loss": 1.1835,
"step": 612
},
{
"epoch": 1.3998476770753998,
"grad_norm": 1.0850595235824585,
"learning_rate": 4.506684656906085e-06,
"loss": 1.1945,
"step": 613
},
{
"epoch": 1.4021325209444022,
"grad_norm": 1.1323665380477905,
"learning_rate": 4.5048257422091655e-06,
"loss": 1.209,
"step": 614
},
{
"epoch": 1.4044173648134044,
"grad_norm": 1.1112160682678223,
"learning_rate": 4.5029637166115e-06,
"loss": 1.1742,
"step": 615
},
{
"epoch": 1.4067022086824066,
"grad_norm": 1.1052254438400269,
"learning_rate": 4.5010985830024086e-06,
"loss": 1.1916,
"step": 616
},
{
"epoch": 1.408987052551409,
"grad_norm": 1.0695083141326904,
"learning_rate": 4.4992303442760286e-06,
"loss": 1.1829,
"step": 617
},
{
"epoch": 1.4112718964204114,
"grad_norm": 1.0871409177780151,
"learning_rate": 4.497359003331318e-06,
"loss": 1.2053,
"step": 618
},
{
"epoch": 1.4135567402894136,
"grad_norm": 1.093496322631836,
"learning_rate": 4.495484563072049e-06,
"loss": 1.1825,
"step": 619
},
{
"epoch": 1.4158415841584158,
"grad_norm": 1.0716845989227295,
"learning_rate": 4.493607026406802e-06,
"loss": 1.1911,
"step": 620
},
{
"epoch": 1.4181264280274182,
"grad_norm": 1.1274534463882446,
"learning_rate": 4.4917263962489635e-06,
"loss": 1.1737,
"step": 621
},
{
"epoch": 1.4204112718964204,
"grad_norm": 1.082309603691101,
"learning_rate": 4.489842675516718e-06,
"loss": 1.1986,
"step": 622
},
{
"epoch": 1.4226961157654228,
"grad_norm": 1.0890616178512573,
"learning_rate": 4.487955867133047e-06,
"loss": 1.2273,
"step": 623
},
{
"epoch": 1.424980959634425,
"grad_norm": 1.0633172988891602,
"learning_rate": 4.486065974025723e-06,
"loss": 1.1834,
"step": 624
},
{
"epoch": 1.4272658035034271,
"grad_norm": 1.0931994915008545,
"learning_rate": 4.484172999127305e-06,
"loss": 1.1976,
"step": 625
},
{
"epoch": 1.4295506473724295,
"grad_norm": 1.1375906467437744,
"learning_rate": 4.482276945375135e-06,
"loss": 1.2093,
"step": 626
},
{
"epoch": 1.431835491241432,
"grad_norm": 1.3243980407714844,
"learning_rate": 4.480377815711331e-06,
"loss": 1.2102,
"step": 627
},
{
"epoch": 1.4341203351104341,
"grad_norm": 1.0940284729003906,
"learning_rate": 4.478475613082783e-06,
"loss": 1.1888,
"step": 628
},
{
"epoch": 1.4364051789794363,
"grad_norm": 1.1363506317138672,
"learning_rate": 4.4765703404411534e-06,
"loss": 1.1833,
"step": 629
},
{
"epoch": 1.4386900228484387,
"grad_norm": 1.1287343502044678,
"learning_rate": 4.474662000742864e-06,
"loss": 1.2344,
"step": 630
},
{
"epoch": 1.440974866717441,
"grad_norm": 1.3280748128890991,
"learning_rate": 4.472750596949098e-06,
"loss": 1.2025,
"step": 631
},
{
"epoch": 1.4432597105864433,
"grad_norm": 1.1119890213012695,
"learning_rate": 4.470836132025793e-06,
"loss": 1.1586,
"step": 632
},
{
"epoch": 1.4455445544554455,
"grad_norm": 1.066416621208191,
"learning_rate": 4.4689186089436365e-06,
"loss": 1.1717,
"step": 633
},
{
"epoch": 1.447829398324448,
"grad_norm": 1.0481845140457153,
"learning_rate": 4.4669980306780605e-06,
"loss": 1.1949,
"step": 634
},
{
"epoch": 1.45011424219345,
"grad_norm": 1.094254732131958,
"learning_rate": 4.4650744002092384e-06,
"loss": 1.2005,
"step": 635
},
{
"epoch": 1.4523990860624525,
"grad_norm": 1.1029901504516602,
"learning_rate": 4.46314772052208e-06,
"loss": 1.1956,
"step": 636
},
{
"epoch": 1.4546839299314547,
"grad_norm": 1.129492163658142,
"learning_rate": 4.461217994606225e-06,
"loss": 1.2053,
"step": 637
},
{
"epoch": 1.4569687738004569,
"grad_norm": 1.1537097692489624,
"learning_rate": 4.459285225456044e-06,
"loss": 1.1668,
"step": 638
},
{
"epoch": 1.4592536176694593,
"grad_norm": 1.0732276439666748,
"learning_rate": 4.457349416070626e-06,
"loss": 1.2107,
"step": 639
},
{
"epoch": 1.4615384615384617,
"grad_norm": 1.186018943786621,
"learning_rate": 4.455410569453777e-06,
"loss": 1.1789,
"step": 640
},
{
"epoch": 1.4638233054074639,
"grad_norm": 1.1642903089523315,
"learning_rate": 4.453468688614019e-06,
"loss": 1.2169,
"step": 641
},
{
"epoch": 1.466108149276466,
"grad_norm": 1.1415472030639648,
"learning_rate": 4.451523776564581e-06,
"loss": 1.1716,
"step": 642
},
{
"epoch": 1.4683929931454685,
"grad_norm": 1.1644552946090698,
"learning_rate": 4.449575836323394e-06,
"loss": 1.1497,
"step": 643
},
{
"epoch": 1.4706778370144706,
"grad_norm": 1.2200912237167358,
"learning_rate": 4.447624870913091e-06,
"loss": 1.2289,
"step": 644
},
{
"epoch": 1.472962680883473,
"grad_norm": 1.1080158948898315,
"learning_rate": 4.445670883360996e-06,
"loss": 1.1378,
"step": 645
},
{
"epoch": 1.4752475247524752,
"grad_norm": 1.1372804641723633,
"learning_rate": 4.443713876699124e-06,
"loss": 1.1639,
"step": 646
},
{
"epoch": 1.4775323686214774,
"grad_norm": 1.1383754014968872,
"learning_rate": 4.441753853964174e-06,
"loss": 1.1558,
"step": 647
},
{
"epoch": 1.4798172124904798,
"grad_norm": 1.1565297842025757,
"learning_rate": 4.439790818197527e-06,
"loss": 1.242,
"step": 648
},
{
"epoch": 1.4821020563594822,
"grad_norm": 1.156384825706482,
"learning_rate": 4.4378247724452375e-06,
"loss": 1.2241,
"step": 649
},
{
"epoch": 1.4843869002284844,
"grad_norm": 1.2158401012420654,
"learning_rate": 4.43585571975803e-06,
"loss": 1.1977,
"step": 650
},
{
"epoch": 1.4866717440974866,
"grad_norm": 1.2885035276412964,
"learning_rate": 4.433883663191297e-06,
"loss": 1.1916,
"step": 651
},
{
"epoch": 1.488956587966489,
"grad_norm": 1.1530733108520508,
"learning_rate": 4.431908605805092e-06,
"loss": 1.2362,
"step": 652
},
{
"epoch": 1.4912414318354912,
"grad_norm": 1.1524220705032349,
"learning_rate": 4.429930550664121e-06,
"loss": 1.2263,
"step": 653
},
{
"epoch": 1.4935262757044936,
"grad_norm": 1.1547584533691406,
"learning_rate": 4.427949500837749e-06,
"loss": 1.1478,
"step": 654
},
{
"epoch": 1.4958111195734958,
"grad_norm": 1.1083858013153076,
"learning_rate": 4.425965459399979e-06,
"loss": 1.2183,
"step": 655
},
{
"epoch": 1.498095963442498,
"grad_norm": 1.130506992340088,
"learning_rate": 4.423978429429463e-06,
"loss": 1.1923,
"step": 656
},
{
"epoch": 1.5003808073115004,
"grad_norm": 1.106553077697754,
"learning_rate": 4.421988414009488e-06,
"loss": 1.192,
"step": 657
},
{
"epoch": 1.5026656511805028,
"grad_norm": 1.186131477355957,
"learning_rate": 4.419995416227973e-06,
"loss": 1.19,
"step": 658
},
{
"epoch": 1.504950495049505,
"grad_norm": 1.0756375789642334,
"learning_rate": 4.417999439177465e-06,
"loss": 1.1992,
"step": 659
},
{
"epoch": 1.5072353389185071,
"grad_norm": 1.1160579919815063,
"learning_rate": 4.416000485955135e-06,
"loss": 1.1747,
"step": 660
},
{
"epoch": 1.5095201827875095,
"grad_norm": 1.1178874969482422,
"learning_rate": 4.413998559662771e-06,
"loss": 1.1654,
"step": 661
},
{
"epoch": 1.511805026656512,
"grad_norm": 1.0919098854064941,
"learning_rate": 4.411993663406774e-06,
"loss": 1.2013,
"step": 662
},
{
"epoch": 1.5140898705255141,
"grad_norm": 1.058582067489624,
"learning_rate": 4.409985800298155e-06,
"loss": 1.1823,
"step": 663
},
{
"epoch": 1.5163747143945163,
"grad_norm": 1.0792784690856934,
"learning_rate": 4.407974973452527e-06,
"loss": 1.2013,
"step": 664
},
{
"epoch": 1.5186595582635185,
"grad_norm": 1.112774133682251,
"learning_rate": 4.405961185990103e-06,
"loss": 1.2005,
"step": 665
},
{
"epoch": 1.520944402132521,
"grad_norm": 1.1190800666809082,
"learning_rate": 4.403944441035691e-06,
"loss": 1.2146,
"step": 666
},
{
"epoch": 1.5232292460015233,
"grad_norm": 1.1045669317245483,
"learning_rate": 4.401924741718685e-06,
"loss": 1.2217,
"step": 667
},
{
"epoch": 1.5255140898705255,
"grad_norm": 1.1048752069473267,
"learning_rate": 4.399902091173065e-06,
"loss": 1.1944,
"step": 668
},
{
"epoch": 1.5277989337395277,
"grad_norm": 1.0909706354141235,
"learning_rate": 4.397876492537392e-06,
"loss": 1.2058,
"step": 669
},
{
"epoch": 1.53008377760853,
"grad_norm": 1.1354328393936157,
"learning_rate": 4.3958479489548e-06,
"loss": 1.2164,
"step": 670
},
{
"epoch": 1.5323686214775325,
"grad_norm": 1.1490201950073242,
"learning_rate": 4.393816463572993e-06,
"loss": 1.182,
"step": 671
},
{
"epoch": 1.5346534653465347,
"grad_norm": 1.1395319700241089,
"learning_rate": 4.391782039544239e-06,
"loss": 1.2201,
"step": 672
},
{
"epoch": 1.5369383092155369,
"grad_norm": 1.0788644552230835,
"learning_rate": 4.389744680025366e-06,
"loss": 1.2212,
"step": 673
},
{
"epoch": 1.5392231530845393,
"grad_norm": 1.0663102865219116,
"learning_rate": 4.387704388177759e-06,
"loss": 1.1872,
"step": 674
},
{
"epoch": 1.5415079969535415,
"grad_norm": 1.1177600622177124,
"learning_rate": 4.3856611671673505e-06,
"loss": 1.2032,
"step": 675
},
{
"epoch": 1.5437928408225439,
"grad_norm": 1.1109418869018555,
"learning_rate": 4.383615020164621e-06,
"loss": 1.2041,
"step": 676
},
{
"epoch": 1.546077684691546,
"grad_norm": 1.096182107925415,
"learning_rate": 4.3815659503445875e-06,
"loss": 1.1988,
"step": 677
},
{
"epoch": 1.5483625285605482,
"grad_norm": 1.2027829885482788,
"learning_rate": 4.379513960886807e-06,
"loss": 1.1812,
"step": 678
},
{
"epoch": 1.5506473724295506,
"grad_norm": 1.0674421787261963,
"learning_rate": 4.377459054975363e-06,
"loss": 1.1948,
"step": 679
},
{
"epoch": 1.552932216298553,
"grad_norm": 1.0463448762893677,
"learning_rate": 4.375401235798866e-06,
"loss": 1.2174,
"step": 680
},
{
"epoch": 1.5552170601675552,
"grad_norm": 1.1295356750488281,
"learning_rate": 4.373340506550447e-06,
"loss": 1.2013,
"step": 681
},
{
"epoch": 1.5575019040365574,
"grad_norm": 1.116245150566101,
"learning_rate": 4.3712768704277535e-06,
"loss": 1.1983,
"step": 682
},
{
"epoch": 1.5597867479055598,
"grad_norm": 1.1043322086334229,
"learning_rate": 4.369210330632942e-06,
"loss": 1.2042,
"step": 683
},
{
"epoch": 1.5620715917745622,
"grad_norm": 1.1363909244537354,
"learning_rate": 4.367140890372674e-06,
"loss": 1.1793,
"step": 684
},
{
"epoch": 1.5643564356435644,
"grad_norm": 1.099576473236084,
"learning_rate": 4.365068552858116e-06,
"loss": 1.1849,
"step": 685
},
{
"epoch": 1.5666412795125666,
"grad_norm": 1.0956041812896729,
"learning_rate": 4.3629933213049245e-06,
"loss": 1.169,
"step": 686
},
{
"epoch": 1.5689261233815688,
"grad_norm": 1.1022474765777588,
"learning_rate": 4.36091519893325e-06,
"loss": 1.2376,
"step": 687
},
{
"epoch": 1.5712109672505712,
"grad_norm": 1.1013610363006592,
"learning_rate": 4.35883418896773e-06,
"loss": 1.1665,
"step": 688
},
{
"epoch": 1.5734958111195736,
"grad_norm": 1.1273926496505737,
"learning_rate": 4.356750294637478e-06,
"loss": 1.1723,
"step": 689
},
{
"epoch": 1.5757806549885758,
"grad_norm": 1.1341313123703003,
"learning_rate": 4.3546635191760875e-06,
"loss": 1.1813,
"step": 690
},
{
"epoch": 1.578065498857578,
"grad_norm": 1.0935291051864624,
"learning_rate": 4.352573865821621e-06,
"loss": 1.1932,
"step": 691
},
{
"epoch": 1.5803503427265804,
"grad_norm": 1.189144492149353,
"learning_rate": 4.350481337816606e-06,
"loss": 1.1798,
"step": 692
},
{
"epoch": 1.5826351865955828,
"grad_norm": 1.2104555368423462,
"learning_rate": 4.348385938408033e-06,
"loss": 1.1895,
"step": 693
},
{
"epoch": 1.584920030464585,
"grad_norm": 1.1346546411514282,
"learning_rate": 4.346287670847345e-06,
"loss": 1.1896,
"step": 694
},
{
"epoch": 1.5872048743335871,
"grad_norm": 1.131858468055725,
"learning_rate": 4.344186538390438e-06,
"loss": 1.1895,
"step": 695
},
{
"epoch": 1.5894897182025893,
"grad_norm": 1.1175106763839722,
"learning_rate": 4.342082544297652e-06,
"loss": 1.2215,
"step": 696
},
{
"epoch": 1.5917745620715917,
"grad_norm": 1.1056832075119019,
"learning_rate": 4.3399756918337675e-06,
"loss": 1.1617,
"step": 697
},
{
"epoch": 1.5940594059405941,
"grad_norm": 1.0777372121810913,
"learning_rate": 4.337865984268002e-06,
"loss": 1.2716,
"step": 698
},
{
"epoch": 1.5963442498095963,
"grad_norm": 1.1472179889678955,
"learning_rate": 4.335753424874e-06,
"loss": 1.1842,
"step": 699
},
{
"epoch": 1.5986290936785985,
"grad_norm": 1.0825904607772827,
"learning_rate": 4.333638016929835e-06,
"loss": 1.2377,
"step": 700
},
{
"epoch": 1.600913937547601,
"grad_norm": 1.0884586572647095,
"learning_rate": 4.331519763717998e-06,
"loss": 1.2003,
"step": 701
},
{
"epoch": 1.6031987814166033,
"grad_norm": 1.132827639579773,
"learning_rate": 4.329398668525396e-06,
"loss": 1.186,
"step": 702
},
{
"epoch": 1.6054836252856055,
"grad_norm": 1.1931272745132446,
"learning_rate": 4.327274734643346e-06,
"loss": 1.21,
"step": 703
},
{
"epoch": 1.6077684691546077,
"grad_norm": 1.060774803161621,
"learning_rate": 4.3251479653675705e-06,
"loss": 1.1893,
"step": 704
},
{
"epoch": 1.61005331302361,
"grad_norm": 1.0615568161010742,
"learning_rate": 4.323018363998189e-06,
"loss": 1.1814,
"step": 705
},
{
"epoch": 1.6123381568926123,
"grad_norm": 1.0800261497497559,
"learning_rate": 4.320885933839718e-06,
"loss": 1.1995,
"step": 706
},
{
"epoch": 1.6146230007616147,
"grad_norm": 1.1502355337142944,
"learning_rate": 4.318750678201064e-06,
"loss": 1.219,
"step": 707
},
{
"epoch": 1.6169078446306169,
"grad_norm": 1.07515287399292,
"learning_rate": 4.316612600395515e-06,
"loss": 1.1787,
"step": 708
},
{
"epoch": 1.619192688499619,
"grad_norm": 1.075150728225708,
"learning_rate": 4.31447170374074e-06,
"loss": 1.2062,
"step": 709
},
{
"epoch": 1.6214775323686215,
"grad_norm": 1.1458789110183716,
"learning_rate": 4.312327991558782e-06,
"loss": 1.2009,
"step": 710
},
{
"epoch": 1.6237623762376239,
"grad_norm": 1.1655830144882202,
"learning_rate": 4.3101814671760546e-06,
"loss": 1.2004,
"step": 711
},
{
"epoch": 1.626047220106626,
"grad_norm": 1.1394225358963013,
"learning_rate": 4.30803213392333e-06,
"loss": 1.2059,
"step": 712
},
{
"epoch": 1.6283320639756282,
"grad_norm": 1.109095573425293,
"learning_rate": 4.305879995135745e-06,
"loss": 1.1727,
"step": 713
},
{
"epoch": 1.6306169078446306,
"grad_norm": 1.072530746459961,
"learning_rate": 4.303725054152785e-06,
"loss": 1.2059,
"step": 714
},
{
"epoch": 1.632901751713633,
"grad_norm": 1.079399824142456,
"learning_rate": 4.3015673143182864e-06,
"loss": 1.1929,
"step": 715
},
{
"epoch": 1.6351865955826352,
"grad_norm": 1.1387250423431396,
"learning_rate": 4.299406778980428e-06,
"loss": 1.1924,
"step": 716
},
{
"epoch": 1.6374714394516374,
"grad_norm": 1.1101268529891968,
"learning_rate": 4.297243451491724e-06,
"loss": 1.1678,
"step": 717
},
{
"epoch": 1.6397562833206396,
"grad_norm": 1.0745279788970947,
"learning_rate": 4.295077335209027e-06,
"loss": 1.1632,
"step": 718
},
{
"epoch": 1.642041127189642,
"grad_norm": 1.1062053442001343,
"learning_rate": 4.29290843349351e-06,
"loss": 1.183,
"step": 719
},
{
"epoch": 1.6443259710586444,
"grad_norm": 1.1918151378631592,
"learning_rate": 4.290736749710672e-06,
"loss": 1.2,
"step": 720
},
{
"epoch": 1.6466108149276466,
"grad_norm": 1.1596369743347168,
"learning_rate": 4.28856228723033e-06,
"loss": 1.2232,
"step": 721
},
{
"epoch": 1.6488956587966488,
"grad_norm": 1.1261903047561646,
"learning_rate": 4.28638504942661e-06,
"loss": 1.1614,
"step": 722
},
{
"epoch": 1.6511805026656512,
"grad_norm": 1.1628916263580322,
"learning_rate": 4.284205039677946e-06,
"loss": 1.1866,
"step": 723
},
{
"epoch": 1.6534653465346536,
"grad_norm": 1.114009976387024,
"learning_rate": 4.282022261367074e-06,
"loss": 1.2027,
"step": 724
},
{
"epoch": 1.6557501904036558,
"grad_norm": 1.2100892066955566,
"learning_rate": 4.279836717881022e-06,
"loss": 1.1922,
"step": 725
},
{
"epoch": 1.658035034272658,
"grad_norm": 1.122144103050232,
"learning_rate": 4.277648412611114e-06,
"loss": 1.178,
"step": 726
},
{
"epoch": 1.6603198781416602,
"grad_norm": 1.0997899770736694,
"learning_rate": 4.275457348952955e-06,
"loss": 1.2276,
"step": 727
},
{
"epoch": 1.6626047220106626,
"grad_norm": 1.1784963607788086,
"learning_rate": 4.273263530306435e-06,
"loss": 1.1889,
"step": 728
},
{
"epoch": 1.664889565879665,
"grad_norm": 1.1602816581726074,
"learning_rate": 4.271066960075715e-06,
"loss": 1.1671,
"step": 729
},
{
"epoch": 1.6671744097486672,
"grad_norm": 1.132406234741211,
"learning_rate": 4.268867641669225e-06,
"loss": 1.2017,
"step": 730
},
{
"epoch": 1.6694592536176693,
"grad_norm": 1.1267459392547607,
"learning_rate": 4.266665578499664e-06,
"loss": 1.2135,
"step": 731
},
{
"epoch": 1.6717440974866717,
"grad_norm": 1.0951600074768066,
"learning_rate": 4.2644607739839875e-06,
"loss": 1.2463,
"step": 732
},
{
"epoch": 1.6740289413556741,
"grad_norm": 1.073875069618225,
"learning_rate": 4.262253231543401e-06,
"loss": 1.1879,
"step": 733
},
{
"epoch": 1.6763137852246763,
"grad_norm": 1.1010491847991943,
"learning_rate": 4.260042954603366e-06,
"loss": 1.1812,
"step": 734
},
{
"epoch": 1.6785986290936785,
"grad_norm": 1.1129403114318848,
"learning_rate": 4.2578299465935805e-06,
"loss": 1.2281,
"step": 735
},
{
"epoch": 1.680883472962681,
"grad_norm": 1.3570629358291626,
"learning_rate": 4.255614210947985e-06,
"loss": 1.2013,
"step": 736
},
{
"epoch": 1.6831683168316833,
"grad_norm": 1.104535460472107,
"learning_rate": 4.2533957511047485e-06,
"loss": 1.1708,
"step": 737
},
{
"epoch": 1.6854531607006855,
"grad_norm": 1.1040364503860474,
"learning_rate": 4.25117457050627e-06,
"loss": 1.2154,
"step": 738
},
{
"epoch": 1.6877380045696877,
"grad_norm": 1.0932183265686035,
"learning_rate": 4.24895067259917e-06,
"loss": 1.2182,
"step": 739
},
{
"epoch": 1.6900228484386899,
"grad_norm": 1.0946629047393799,
"learning_rate": 4.246724060834284e-06,
"loss": 1.2058,
"step": 740
},
{
"epoch": 1.6923076923076923,
"grad_norm": 1.060051679611206,
"learning_rate": 4.24449473866666e-06,
"loss": 1.1919,
"step": 741
},
{
"epoch": 1.6945925361766947,
"grad_norm": 1.0483872890472412,
"learning_rate": 4.242262709555552e-06,
"loss": 1.1638,
"step": 742
},
{
"epoch": 1.6968773800456969,
"grad_norm": 1.0956077575683594,
"learning_rate": 4.240027976964412e-06,
"loss": 1.1805,
"step": 743
},
{
"epoch": 1.699162223914699,
"grad_norm": 1.1185762882232666,
"learning_rate": 4.237790544360889e-06,
"loss": 1.1923,
"step": 744
},
{
"epoch": 1.7014470677837015,
"grad_norm": 1.0720428228378296,
"learning_rate": 4.2355504152168235e-06,
"loss": 1.1895,
"step": 745
},
{
"epoch": 1.7037319116527039,
"grad_norm": 1.1020833253860474,
"learning_rate": 4.2333075930082345e-06,
"loss": 1.1845,
"step": 746
},
{
"epoch": 1.706016755521706,
"grad_norm": 1.107071876525879,
"learning_rate": 4.231062081215326e-06,
"loss": 1.1751,
"step": 747
},
{
"epoch": 1.7083015993907082,
"grad_norm": 1.1301578283309937,
"learning_rate": 4.228813883322472e-06,
"loss": 1.151,
"step": 748
},
{
"epoch": 1.7105864432597104,
"grad_norm": 1.096433401107788,
"learning_rate": 4.226563002818215e-06,
"loss": 1.1728,
"step": 749
},
{
"epoch": 1.7128712871287128,
"grad_norm": 1.1047178506851196,
"learning_rate": 4.224309443195261e-06,
"loss": 1.1947,
"step": 750
},
{
"epoch": 1.7151561309977152,
"grad_norm": 1.110417127609253,
"learning_rate": 4.222053207950472e-06,
"loss": 1.2186,
"step": 751
},
{
"epoch": 1.7174409748667174,
"grad_norm": 1.1027036905288696,
"learning_rate": 4.219794300584863e-06,
"loss": 1.2164,
"step": 752
},
{
"epoch": 1.7197258187357196,
"grad_norm": 1.109299898147583,
"learning_rate": 4.217532724603595e-06,
"loss": 1.2202,
"step": 753
},
{
"epoch": 1.722010662604722,
"grad_norm": 1.143134593963623,
"learning_rate": 4.2152684835159695e-06,
"loss": 1.1837,
"step": 754
},
{
"epoch": 1.7242955064737244,
"grad_norm": 1.0689709186553955,
"learning_rate": 4.213001580835423e-06,
"loss": 1.1874,
"step": 755
},
{
"epoch": 1.7265803503427266,
"grad_norm": 1.128243327140808,
"learning_rate": 4.2107320200795236e-06,
"loss": 1.1756,
"step": 756
},
{
"epoch": 1.7288651942117288,
"grad_norm": 1.067436933517456,
"learning_rate": 4.208459804769963e-06,
"loss": 1.2212,
"step": 757
},
{
"epoch": 1.7311500380807312,
"grad_norm": 1.1413803100585938,
"learning_rate": 4.206184938432552e-06,
"loss": 1.1491,
"step": 758
},
{
"epoch": 1.7334348819497334,
"grad_norm": 1.141803503036499,
"learning_rate": 4.203907424597214e-06,
"loss": 1.2636,
"step": 759
},
{
"epoch": 1.7357197258187358,
"grad_norm": 1.1099580526351929,
"learning_rate": 4.2016272667979814e-06,
"loss": 1.2192,
"step": 760
},
{
"epoch": 1.738004569687738,
"grad_norm": 1.100486397743225,
"learning_rate": 4.199344468572992e-06,
"loss": 1.2044,
"step": 761
},
{
"epoch": 1.7402894135567402,
"grad_norm": 1.0598186254501343,
"learning_rate": 4.197059033464476e-06,
"loss": 1.1983,
"step": 762
},
{
"epoch": 1.7425742574257426,
"grad_norm": 1.0922818183898926,
"learning_rate": 4.194770965018758e-06,
"loss": 1.2194,
"step": 763
},
{
"epoch": 1.744859101294745,
"grad_norm": 1.0975127220153809,
"learning_rate": 4.1924802667862485e-06,
"loss": 1.1465,
"step": 764
},
{
"epoch": 1.7471439451637472,
"grad_norm": 1.0934858322143555,
"learning_rate": 4.190186942321438e-06,
"loss": 1.1544,
"step": 765
},
{
"epoch": 1.7494287890327493,
"grad_norm": 1.0712271928787231,
"learning_rate": 4.187890995182893e-06,
"loss": 1.1893,
"step": 766
},
{
"epoch": 1.7517136329017517,
"grad_norm": 1.1157736778259277,
"learning_rate": 4.1855924289332485e-06,
"loss": 1.2362,
"step": 767
},
{
"epoch": 1.7539984767707542,
"grad_norm": 1.2225691080093384,
"learning_rate": 4.183291247139204e-06,
"loss": 1.22,
"step": 768
},
{
"epoch": 1.7562833206397563,
"grad_norm": 1.1402082443237305,
"learning_rate": 4.180987453371519e-06,
"loss": 1.2024,
"step": 769
},
{
"epoch": 1.7585681645087585,
"grad_norm": 1.1124638319015503,
"learning_rate": 4.178681051205004e-06,
"loss": 1.17,
"step": 770
},
{
"epoch": 1.7608530083777607,
"grad_norm": 1.1337512731552124,
"learning_rate": 4.176372044218519e-06,
"loss": 1.1862,
"step": 771
},
{
"epoch": 1.7631378522467631,
"grad_norm": 1.0702142715454102,
"learning_rate": 4.174060435994962e-06,
"loss": 1.2038,
"step": 772
},
{
"epoch": 1.7654226961157655,
"grad_norm": 1.112242579460144,
"learning_rate": 4.171746230121273e-06,
"loss": 1.2146,
"step": 773
},
{
"epoch": 1.7677075399847677,
"grad_norm": 1.1164225339889526,
"learning_rate": 4.169429430188418e-06,
"loss": 1.1768,
"step": 774
},
{
"epoch": 1.76999238385377,
"grad_norm": 1.091208577156067,
"learning_rate": 4.16711003979139e-06,
"loss": 1.193,
"step": 775
},
{
"epoch": 1.7722772277227723,
"grad_norm": 1.138411283493042,
"learning_rate": 4.164788062529203e-06,
"loss": 1.203,
"step": 776
},
{
"epoch": 1.7745620715917747,
"grad_norm": 1.168305516242981,
"learning_rate": 4.1624635020048835e-06,
"loss": 1.2154,
"step": 777
},
{
"epoch": 1.7768469154607769,
"grad_norm": 1.0742619037628174,
"learning_rate": 4.160136361825465e-06,
"loss": 1.214,
"step": 778
},
{
"epoch": 1.779131759329779,
"grad_norm": 1.076762080192566,
"learning_rate": 4.1578066456019885e-06,
"loss": 1.1834,
"step": 779
},
{
"epoch": 1.7814166031987813,
"grad_norm": 1.1189744472503662,
"learning_rate": 4.155474356949487e-06,
"loss": 1.191,
"step": 780
},
{
"epoch": 1.7837014470677837,
"grad_norm": 1.0916801691055298,
"learning_rate": 4.153139499486988e-06,
"loss": 1.2104,
"step": 781
},
{
"epoch": 1.785986290936786,
"grad_norm": 1.1265934705734253,
"learning_rate": 4.150802076837506e-06,
"loss": 1.2366,
"step": 782
},
{
"epoch": 1.7882711348057883,
"grad_norm": 1.1008100509643555,
"learning_rate": 4.148462092628032e-06,
"loss": 1.1919,
"step": 783
},
{
"epoch": 1.7905559786747904,
"grad_norm": 1.5858978033065796,
"learning_rate": 4.146119550489536e-06,
"loss": 1.1927,
"step": 784
},
{
"epoch": 1.7928408225437928,
"grad_norm": 1.1155521869659424,
"learning_rate": 4.143774454056954e-06,
"loss": 1.1948,
"step": 785
},
{
"epoch": 1.7951256664127953,
"grad_norm": 1.1289353370666504,
"learning_rate": 4.141426806969189e-06,
"loss": 1.1719,
"step": 786
},
{
"epoch": 1.7974105102817974,
"grad_norm": 1.1492801904678345,
"learning_rate": 4.139076612869098e-06,
"loss": 1.169,
"step": 787
},
{
"epoch": 1.7996953541507996,
"grad_norm": 1.0931838750839233,
"learning_rate": 4.1367238754034935e-06,
"loss": 1.1581,
"step": 788
},
{
"epoch": 1.801980198019802,
"grad_norm": 1.0901176929473877,
"learning_rate": 4.134368598223132e-06,
"loss": 1.2223,
"step": 789
},
{
"epoch": 1.8042650418888042,
"grad_norm": 1.0907678604125977,
"learning_rate": 4.132010784982711e-06,
"loss": 1.1839,
"step": 790
},
{
"epoch": 1.8065498857578066,
"grad_norm": 1.1389234066009521,
"learning_rate": 4.129650439340866e-06,
"loss": 1.1765,
"step": 791
},
{
"epoch": 1.8088347296268088,
"grad_norm": 1.0889054536819458,
"learning_rate": 4.12728756496016e-06,
"loss": 1.1913,
"step": 792
},
{
"epoch": 1.811119573495811,
"grad_norm": 1.090705156326294,
"learning_rate": 4.12492216550708e-06,
"loss": 1.1692,
"step": 793
},
{
"epoch": 1.8134044173648134,
"grad_norm": 1.1290946006774902,
"learning_rate": 4.12255424465203e-06,
"loss": 1.2114,
"step": 794
},
{
"epoch": 1.8156892612338158,
"grad_norm": 1.108325719833374,
"learning_rate": 4.120183806069328e-06,
"loss": 1.1941,
"step": 795
},
{
"epoch": 1.817974105102818,
"grad_norm": 1.0901302099227905,
"learning_rate": 4.1178108534371995e-06,
"loss": 1.1709,
"step": 796
},
{
"epoch": 1.8202589489718202,
"grad_norm": 1.1386867761611938,
"learning_rate": 4.11543539043777e-06,
"loss": 1.2008,
"step": 797
},
{
"epoch": 1.8225437928408226,
"grad_norm": 1.1768696308135986,
"learning_rate": 4.11305742075706e-06,
"loss": 1.1735,
"step": 798
},
{
"epoch": 1.824828636709825,
"grad_norm": 1.093137264251709,
"learning_rate": 4.1106769480849795e-06,
"loss": 1.1952,
"step": 799
},
{
"epoch": 1.8271134805788272,
"grad_norm": 1.12264084815979,
"learning_rate": 4.108293976115325e-06,
"loss": 1.2118,
"step": 800
},
{
"epoch": 1.8293983244478293,
"grad_norm": 1.089824914932251,
"learning_rate": 4.105908508545766e-06,
"loss": 1.1856,
"step": 801
},
{
"epoch": 1.8316831683168315,
"grad_norm": 1.3425248861312866,
"learning_rate": 4.1035205490778505e-06,
"loss": 1.1942,
"step": 802
},
{
"epoch": 1.833968012185834,
"grad_norm": 1.097115159034729,
"learning_rate": 4.101130101416988e-06,
"loss": 1.2083,
"step": 803
},
{
"epoch": 1.8362528560548363,
"grad_norm": 1.0973830223083496,
"learning_rate": 4.098737169272452e-06,
"loss": 1.2033,
"step": 804
},
{
"epoch": 1.8385376999238385,
"grad_norm": 1.127233624458313,
"learning_rate": 4.096341756357371e-06,
"loss": 1.1941,
"step": 805
},
{
"epoch": 1.8408225437928407,
"grad_norm": 1.097070574760437,
"learning_rate": 4.093943866388723e-06,
"loss": 1.1971,
"step": 806
},
{
"epoch": 1.8431073876618431,
"grad_norm": 1.0978144407272339,
"learning_rate": 4.091543503087327e-06,
"loss": 1.2029,
"step": 807
},
{
"epoch": 1.8453922315308455,
"grad_norm": 1.0643872022628784,
"learning_rate": 4.089140670177843e-06,
"loss": 1.1532,
"step": 808
},
{
"epoch": 1.8476770753998477,
"grad_norm": 1.1128400564193726,
"learning_rate": 4.086735371388762e-06,
"loss": 1.1851,
"step": 809
},
{
"epoch": 1.84996191926885,
"grad_norm": 1.1439098119735718,
"learning_rate": 4.0843276104524e-06,
"loss": 1.1816,
"step": 810
},
{
"epoch": 1.852246763137852,
"grad_norm": 1.1020617485046387,
"learning_rate": 4.0819173911048965e-06,
"loss": 1.2081,
"step": 811
},
{
"epoch": 1.8545316070068545,
"grad_norm": 1.0913503170013428,
"learning_rate": 4.079504717086203e-06,
"loss": 1.1892,
"step": 812
},
{
"epoch": 1.856816450875857,
"grad_norm": 1.1332125663757324,
"learning_rate": 4.077089592140082e-06,
"loss": 1.182,
"step": 813
},
{
"epoch": 1.859101294744859,
"grad_norm": 1.0584102869033813,
"learning_rate": 4.074672020014098e-06,
"loss": 1.2169,
"step": 814
},
{
"epoch": 1.8613861386138613,
"grad_norm": 1.1291085481643677,
"learning_rate": 4.072252004459612e-06,
"loss": 1.1796,
"step": 815
},
{
"epoch": 1.8636709824828637,
"grad_norm": 1.0868487358093262,
"learning_rate": 4.069829549231778e-06,
"loss": 1.1832,
"step": 816
},
{
"epoch": 1.865955826351866,
"grad_norm": 1.1105777025222778,
"learning_rate": 4.067404658089535e-06,
"loss": 1.2242,
"step": 817
},
{
"epoch": 1.8682406702208683,
"grad_norm": 1.1203657388687134,
"learning_rate": 4.0649773347956005e-06,
"loss": 1.1755,
"step": 818
},
{
"epoch": 1.8705255140898704,
"grad_norm": 1.1000312566757202,
"learning_rate": 4.062547583116469e-06,
"loss": 1.1829,
"step": 819
},
{
"epoch": 1.8728103579588729,
"grad_norm": 1.1125813722610474,
"learning_rate": 4.060115406822402e-06,
"loss": 1.2013,
"step": 820
},
{
"epoch": 1.8750952018278753,
"grad_norm": 1.0868099927902222,
"learning_rate": 4.057680809687421e-06,
"loss": 1.1749,
"step": 821
},
{
"epoch": 1.8773800456968774,
"grad_norm": 1.1242718696594238,
"learning_rate": 4.055243795489307e-06,
"loss": 1.1601,
"step": 822
},
{
"epoch": 1.8796648895658796,
"grad_norm": 1.1220780611038208,
"learning_rate": 4.052804368009589e-06,
"loss": 1.197,
"step": 823
},
{
"epoch": 1.8819497334348818,
"grad_norm": 1.0715032815933228,
"learning_rate": 4.050362531033545e-06,
"loss": 1.1834,
"step": 824
},
{
"epoch": 1.8842345773038842,
"grad_norm": 1.1281424760818481,
"learning_rate": 4.0479182883501855e-06,
"loss": 1.1653,
"step": 825
},
{
"epoch": 1.8865194211728866,
"grad_norm": 1.0727750062942505,
"learning_rate": 4.045471643752258e-06,
"loss": 1.1907,
"step": 826
},
{
"epoch": 1.8888042650418888,
"grad_norm": 1.122889757156372,
"learning_rate": 4.043022601036238e-06,
"loss": 1.1935,
"step": 827
},
{
"epoch": 1.891089108910891,
"grad_norm": 1.1605268716812134,
"learning_rate": 4.040571164002319e-06,
"loss": 1.211,
"step": 828
},
{
"epoch": 1.8933739527798934,
"grad_norm": 1.0915296077728271,
"learning_rate": 4.038117336454411e-06,
"loss": 1.1614,
"step": 829
},
{
"epoch": 1.8956587966488958,
"grad_norm": 1.1206241846084595,
"learning_rate": 4.035661122200135e-06,
"loss": 1.1592,
"step": 830
},
{
"epoch": 1.897943640517898,
"grad_norm": 1.1132665872573853,
"learning_rate": 4.033202525050813e-06,
"loss": 1.1865,
"step": 831
},
{
"epoch": 1.9002284843869002,
"grad_norm": 1.0694245100021362,
"learning_rate": 4.0307415488214675e-06,
"loss": 1.1767,
"step": 832
},
{
"epoch": 1.9025133282559024,
"grad_norm": 1.1312873363494873,
"learning_rate": 4.028278197330808e-06,
"loss": 1.2344,
"step": 833
},
{
"epoch": 1.9047981721249048,
"grad_norm": 1.0864746570587158,
"learning_rate": 4.025812474401236e-06,
"loss": 1.2146,
"step": 834
},
{
"epoch": 1.9070830159939072,
"grad_norm": 1.0774086713790894,
"learning_rate": 4.023344383858826e-06,
"loss": 1.1496,
"step": 835
},
{
"epoch": 1.9093678598629094,
"grad_norm": 1.0805225372314453,
"learning_rate": 4.0208739295333314e-06,
"loss": 1.2098,
"step": 836
},
{
"epoch": 1.9116527037319115,
"grad_norm": 1.0994813442230225,
"learning_rate": 4.018401115258172e-06,
"loss": 1.1881,
"step": 837
},
{
"epoch": 1.913937547600914,
"grad_norm": 1.0419007539749146,
"learning_rate": 4.015925944870428e-06,
"loss": 1.1935,
"step": 838
},
{
"epoch": 1.9162223914699164,
"grad_norm": 1.1358449459075928,
"learning_rate": 4.013448422210838e-06,
"loss": 1.1989,
"step": 839
},
{
"epoch": 1.9185072353389185,
"grad_norm": 1.121999740600586,
"learning_rate": 4.010968551123788e-06,
"loss": 1.2108,
"step": 840
},
{
"epoch": 1.9207920792079207,
"grad_norm": 1.1504106521606445,
"learning_rate": 4.008486335457312e-06,
"loss": 1.1768,
"step": 841
},
{
"epoch": 1.9230769230769231,
"grad_norm": 1.128138780593872,
"learning_rate": 4.006001779063078e-06,
"loss": 1.1992,
"step": 842
},
{
"epoch": 1.9253617669459253,
"grad_norm": 1.1590732336044312,
"learning_rate": 4.003514885796388e-06,
"loss": 1.181,
"step": 843
},
{
"epoch": 1.9276466108149277,
"grad_norm": 1.0851722955703735,
"learning_rate": 4.001025659516171e-06,
"loss": 1.1711,
"step": 844
},
{
"epoch": 1.92993145468393,
"grad_norm": 1.066331148147583,
"learning_rate": 3.998534104084974e-06,
"loss": 1.1728,
"step": 845
},
{
"epoch": 1.932216298552932,
"grad_norm": 1.110464096069336,
"learning_rate": 3.99604022336896e-06,
"loss": 1.178,
"step": 846
},
{
"epoch": 1.9345011424219345,
"grad_norm": 1.1028679609298706,
"learning_rate": 3.993544021237899e-06,
"loss": 1.2122,
"step": 847
},
{
"epoch": 1.936785986290937,
"grad_norm": 1.1760601997375488,
"learning_rate": 3.991045501565163e-06,
"loss": 1.2103,
"step": 848
},
{
"epoch": 1.939070830159939,
"grad_norm": 1.1260336637496948,
"learning_rate": 3.988544668227721e-06,
"loss": 1.1443,
"step": 849
},
{
"epoch": 1.9413556740289413,
"grad_norm": 1.1055935621261597,
"learning_rate": 3.9860415251061334e-06,
"loss": 1.1795,
"step": 850
},
{
"epoch": 1.9436405178979437,
"grad_norm": 1.1292855739593506,
"learning_rate": 3.983536076084541e-06,
"loss": 1.182,
"step": 851
},
{
"epoch": 1.945925361766946,
"grad_norm": 1.1108464002609253,
"learning_rate": 3.981028325050667e-06,
"loss": 1.1876,
"step": 852
},
{
"epoch": 1.9482102056359483,
"grad_norm": 1.1306401491165161,
"learning_rate": 3.978518275895802e-06,
"loss": 1.1645,
"step": 853
},
{
"epoch": 1.9504950495049505,
"grad_norm": 1.1031887531280518,
"learning_rate": 3.976005932514807e-06,
"loss": 1.2047,
"step": 854
},
{
"epoch": 1.9527798933739526,
"grad_norm": 1.0953725576400757,
"learning_rate": 3.973491298806101e-06,
"loss": 1.1756,
"step": 855
},
{
"epoch": 1.955064737242955,
"grad_norm": 1.109553575515747,
"learning_rate": 3.970974378671656e-06,
"loss": 1.2228,
"step": 856
},
{
"epoch": 1.9573495811119574,
"grad_norm": 1.1159707307815552,
"learning_rate": 3.968455176016993e-06,
"loss": 1.2037,
"step": 857
},
{
"epoch": 1.9596344249809596,
"grad_norm": 1.1045714616775513,
"learning_rate": 3.965933694751175e-06,
"loss": 1.196,
"step": 858
},
{
"epoch": 1.9619192688499618,
"grad_norm": 1.110876202583313,
"learning_rate": 3.963409938786801e-06,
"loss": 1.1772,
"step": 859
},
{
"epoch": 1.9642041127189642,
"grad_norm": 1.1226321458816528,
"learning_rate": 3.9608839120399975e-06,
"loss": 1.1875,
"step": 860
},
{
"epoch": 1.9664889565879666,
"grad_norm": 1.1401004791259766,
"learning_rate": 3.958355618430417e-06,
"loss": 1.2137,
"step": 861
},
{
"epoch": 1.9687738004569688,
"grad_norm": 1.0866281986236572,
"learning_rate": 3.95582506188123e-06,
"loss": 1.2001,
"step": 862
},
{
"epoch": 1.971058644325971,
"grad_norm": 1.1426069736480713,
"learning_rate": 3.9532922463191145e-06,
"loss": 1.1794,
"step": 863
},
{
"epoch": 1.9733434881949732,
"grad_norm": 1.1191396713256836,
"learning_rate": 3.950757175674257e-06,
"loss": 1.2118,
"step": 864
},
{
"epoch": 1.9756283320639756,
"grad_norm": 1.0993397235870361,
"learning_rate": 3.948219853880344e-06,
"loss": 1.2209,
"step": 865
},
{
"epoch": 1.977913175932978,
"grad_norm": 1.0973010063171387,
"learning_rate": 3.945680284874553e-06,
"loss": 1.1738,
"step": 866
},
{
"epoch": 1.9801980198019802,
"grad_norm": 1.2131692171096802,
"learning_rate": 3.943138472597549e-06,
"loss": 1.1833,
"step": 867
},
{
"epoch": 1.9824828636709824,
"grad_norm": 1.1128953695297241,
"learning_rate": 3.940594420993479e-06,
"loss": 1.1925,
"step": 868
},
{
"epoch": 1.9847677075399848,
"grad_norm": 1.0862925052642822,
"learning_rate": 3.938048134009962e-06,
"loss": 1.1965,
"step": 869
},
{
"epoch": 1.9870525514089872,
"grad_norm": 1.1464707851409912,
"learning_rate": 3.935499615598088e-06,
"loss": 1.1579,
"step": 870
},
{
"epoch": 1.9893373952779894,
"grad_norm": 1.1059821844100952,
"learning_rate": 3.932948869712412e-06,
"loss": 1.169,
"step": 871
},
{
"epoch": 1.9916222391469915,
"grad_norm": 1.1403911113739014,
"learning_rate": 3.930395900310939e-06,
"loss": 1.1586,
"step": 872
},
{
"epoch": 1.993907083015994,
"grad_norm": 1.0881669521331787,
"learning_rate": 3.9278407113551295e-06,
"loss": 1.2262,
"step": 873
},
{
"epoch": 1.9961919268849961,
"grad_norm": 1.1039564609527588,
"learning_rate": 3.925283306809885e-06,
"loss": 1.1951,
"step": 874
},
{
"epoch": 1.9984767707539985,
"grad_norm": 1.1270387172698975,
"learning_rate": 3.9227236906435484e-06,
"loss": 1.1808,
"step": 875
},
{
"epoch": 2.0,
"grad_norm": 1.307372808456421,
"learning_rate": 3.92016186682789e-06,
"loss": 1.1959,
"step": 876
},
{
"epoch": 2.002284843869002,
"grad_norm": 1.3203482627868652,
"learning_rate": 3.917597839338108e-06,
"loss": 1.1606,
"step": 877
},
{
"epoch": 2.0045696877380044,
"grad_norm": 1.1091164350509644,
"learning_rate": 3.915031612152823e-06,
"loss": 1.1532,
"step": 878
},
{
"epoch": 2.006854531607007,
"grad_norm": 1.1063039302825928,
"learning_rate": 3.912463189254063e-06,
"loss": 1.1635,
"step": 879
},
{
"epoch": 2.009139375476009,
"grad_norm": 1.1640090942382812,
"learning_rate": 3.909892574627267e-06,
"loss": 1.1174,
"step": 880
},
{
"epoch": 2.0114242193450114,
"grad_norm": 1.147412657737732,
"learning_rate": 3.907319772261273e-06,
"loss": 1.1285,
"step": 881
},
{
"epoch": 2.0137090632140136,
"grad_norm": 1.1952067613601685,
"learning_rate": 3.904744786148316e-06,
"loss": 1.1657,
"step": 882
},
{
"epoch": 2.015993907083016,
"grad_norm": 1.1174241304397583,
"learning_rate": 3.902167620284017e-06,
"loss": 1.1424,
"step": 883
},
{
"epoch": 2.0182787509520184,
"grad_norm": 1.0976516008377075,
"learning_rate": 3.899588278667382e-06,
"loss": 1.1328,
"step": 884
},
{
"epoch": 2.0205635948210205,
"grad_norm": 1.1376157999038696,
"learning_rate": 3.897006765300791e-06,
"loss": 1.155,
"step": 885
},
{
"epoch": 2.0228484386900227,
"grad_norm": 1.175310730934143,
"learning_rate": 3.8944230841899935e-06,
"loss": 1.1799,
"step": 886
},
{
"epoch": 2.025133282559025,
"grad_norm": 1.1553736925125122,
"learning_rate": 3.8918372393441036e-06,
"loss": 1.1656,
"step": 887
},
{
"epoch": 2.0274181264280275,
"grad_norm": 1.0809762477874756,
"learning_rate": 3.889249234775596e-06,
"loss": 1.158,
"step": 888
},
{
"epoch": 2.0297029702970297,
"grad_norm": 1.2475727796554565,
"learning_rate": 3.886659074500291e-06,
"loss": 1.1958,
"step": 889
},
{
"epoch": 2.031987814166032,
"grad_norm": 1.2172832489013672,
"learning_rate": 3.884066762537357e-06,
"loss": 1.1703,
"step": 890
},
{
"epoch": 2.034272658035034,
"grad_norm": 1.1374081373214722,
"learning_rate": 3.8814723029093014e-06,
"loss": 1.1384,
"step": 891
},
{
"epoch": 2.0365575019040367,
"grad_norm": 1.1220778226852417,
"learning_rate": 3.878875699641964e-06,
"loss": 1.1368,
"step": 892
},
{
"epoch": 2.038842345773039,
"grad_norm": 1.0943710803985596,
"learning_rate": 3.876276956764509e-06,
"loss": 1.1345,
"step": 893
},
{
"epoch": 2.041127189642041,
"grad_norm": 1.079895257949829,
"learning_rate": 3.873676078309423e-06,
"loss": 1.1469,
"step": 894
},
{
"epoch": 2.0434120335110433,
"grad_norm": 1.164766550064087,
"learning_rate": 3.871073068312506e-06,
"loss": 1.1458,
"step": 895
},
{
"epoch": 2.045696877380046,
"grad_norm": 1.1422792673110962,
"learning_rate": 3.868467930812864e-06,
"loss": 1.1286,
"step": 896
},
{
"epoch": 2.047981721249048,
"grad_norm": 1.104988694190979,
"learning_rate": 3.865860669852906e-06,
"loss": 1.1316,
"step": 897
},
{
"epoch": 2.0502665651180503,
"grad_norm": 1.157842755317688,
"learning_rate": 3.8632512894783345e-06,
"loss": 1.1515,
"step": 898
},
{
"epoch": 2.0525514089870525,
"grad_norm": 1.255118727684021,
"learning_rate": 3.860639793738143e-06,
"loss": 1.1806,
"step": 899
},
{
"epoch": 2.0548362528560546,
"grad_norm": 1.2516144514083862,
"learning_rate": 3.858026186684604e-06,
"loss": 1.1973,
"step": 900
},
{
"epoch": 2.0571210967250573,
"grad_norm": 1.143211007118225,
"learning_rate": 3.85541047237327e-06,
"loss": 1.1654,
"step": 901
},
{
"epoch": 2.0594059405940595,
"grad_norm": 1.1019172668457031,
"learning_rate": 3.852792654862959e-06,
"loss": 1.1534,
"step": 902
},
{
"epoch": 2.0616907844630616,
"grad_norm": 1.1939572095870972,
"learning_rate": 3.850172738215757e-06,
"loss": 1.1145,
"step": 903
},
{
"epoch": 2.063975628332064,
"grad_norm": 1.2124632596969604,
"learning_rate": 3.847550726497004e-06,
"loss": 1.1037,
"step": 904
},
{
"epoch": 2.0662604722010665,
"grad_norm": 1.144073486328125,
"learning_rate": 3.844926623775293e-06,
"loss": 1.1605,
"step": 905
},
{
"epoch": 2.0685453160700686,
"grad_norm": 1.1347826719284058,
"learning_rate": 3.84230043412246e-06,
"loss": 1.1421,
"step": 906
},
{
"epoch": 2.070830159939071,
"grad_norm": 1.1252721548080444,
"learning_rate": 3.8396721616135805e-06,
"loss": 1.1767,
"step": 907
},
{
"epoch": 2.073115003808073,
"grad_norm": 1.224308729171753,
"learning_rate": 3.837041810326961e-06,
"loss": 1.1325,
"step": 908
},
{
"epoch": 2.075399847677075,
"grad_norm": 1.25150728225708,
"learning_rate": 3.8344093843441345e-06,
"loss": 1.1235,
"step": 909
},
{
"epoch": 2.077684691546078,
"grad_norm": 1.244081974029541,
"learning_rate": 3.831774887749854e-06,
"loss": 1.2037,
"step": 910
},
{
"epoch": 2.07996953541508,
"grad_norm": 1.14827561378479,
"learning_rate": 3.829138324632082e-06,
"loss": 1.1378,
"step": 911
},
{
"epoch": 2.082254379284082,
"grad_norm": 1.1389387845993042,
"learning_rate": 3.826499699081992e-06,
"loss": 1.1785,
"step": 912
},
{
"epoch": 2.0845392231530844,
"grad_norm": 1.2572706937789917,
"learning_rate": 3.823859015193957e-06,
"loss": 1.1474,
"step": 913
},
{
"epoch": 2.086824067022087,
"grad_norm": 1.1964038610458374,
"learning_rate": 3.8212162770655405e-06,
"loss": 1.1508,
"step": 914
},
{
"epoch": 2.089108910891089,
"grad_norm": 1.2023462057113647,
"learning_rate": 3.818571488797496e-06,
"loss": 1.1401,
"step": 915
},
{
"epoch": 2.0913937547600914,
"grad_norm": 1.1288152933120728,
"learning_rate": 3.815924654493759e-06,
"loss": 1.1395,
"step": 916
},
{
"epoch": 2.0936785986290936,
"grad_norm": 1.15272855758667,
"learning_rate": 3.8132757782614405e-06,
"loss": 1.1357,
"step": 917
},
{
"epoch": 2.095963442498096,
"grad_norm": 1.176313877105713,
"learning_rate": 3.810624864210816e-06,
"loss": 1.1529,
"step": 918
},
{
"epoch": 2.0982482863670984,
"grad_norm": 1.2011158466339111,
"learning_rate": 3.807971916455325e-06,
"loss": 1.1301,
"step": 919
},
{
"epoch": 2.1005331302361006,
"grad_norm": 1.135551929473877,
"learning_rate": 3.8053169391115665e-06,
"loss": 1.1545,
"step": 920
},
{
"epoch": 2.1028179741051027,
"grad_norm": 1.1023021936416626,
"learning_rate": 3.802659936299283e-06,
"loss": 1.1619,
"step": 921
},
{
"epoch": 2.105102817974105,
"grad_norm": 1.1410151720046997,
"learning_rate": 3.800000912141363e-06,
"loss": 1.168,
"step": 922
},
{
"epoch": 2.1073876618431076,
"grad_norm": 1.1780641078948975,
"learning_rate": 3.797339870763831e-06,
"loss": 1.1895,
"step": 923
},
{
"epoch": 2.1096725057121097,
"grad_norm": 1.13074791431427,
"learning_rate": 3.7946768162958424e-06,
"loss": 1.1786,
"step": 924
},
{
"epoch": 2.111957349581112,
"grad_norm": 1.1389769315719604,
"learning_rate": 3.792011752869676e-06,
"loss": 1.1401,
"step": 925
},
{
"epoch": 2.114242193450114,
"grad_norm": 1.1424660682678223,
"learning_rate": 3.7893446846207254e-06,
"loss": 1.1269,
"step": 926
},
{
"epoch": 2.1165270373191167,
"grad_norm": 1.2282761335372925,
"learning_rate": 3.7866756156874996e-06,
"loss": 1.1739,
"step": 927
},
{
"epoch": 2.118811881188119,
"grad_norm": 1.1365275382995605,
"learning_rate": 3.7840045502116073e-06,
"loss": 1.1506,
"step": 928
},
{
"epoch": 2.121096725057121,
"grad_norm": 1.1741012334823608,
"learning_rate": 3.7813314923377603e-06,
"loss": 1.1436,
"step": 929
},
{
"epoch": 2.1233815689261233,
"grad_norm": 1.1063398122787476,
"learning_rate": 3.778656446213757e-06,
"loss": 1.1567,
"step": 930
},
{
"epoch": 2.1256664127951255,
"grad_norm": 1.1437269449234009,
"learning_rate": 3.775979415990485e-06,
"loss": 1.1524,
"step": 931
},
{
"epoch": 2.127951256664128,
"grad_norm": 1.180048942565918,
"learning_rate": 3.773300405821908e-06,
"loss": 1.1283,
"step": 932
},
{
"epoch": 2.1302361005331303,
"grad_norm": 1.179042100906372,
"learning_rate": 3.7706194198650635e-06,
"loss": 1.1285,
"step": 933
},
{
"epoch": 2.1325209444021325,
"grad_norm": 1.1634424924850464,
"learning_rate": 3.767936462280054e-06,
"loss": 1.1491,
"step": 934
},
{
"epoch": 2.1348057882711347,
"grad_norm": 1.1401522159576416,
"learning_rate": 3.7652515372300415e-06,
"loss": 1.1256,
"step": 935
},
{
"epoch": 2.1370906321401373,
"grad_norm": 1.240664005279541,
"learning_rate": 3.762564648881242e-06,
"loss": 1.1735,
"step": 936
},
{
"epoch": 2.1393754760091395,
"grad_norm": 1.23284113407135,
"learning_rate": 3.7598758014029158e-06,
"loss": 1.1421,
"step": 937
},
{
"epoch": 2.1416603198781416,
"grad_norm": 1.1896634101867676,
"learning_rate": 3.757184998967366e-06,
"loss": 1.1041,
"step": 938
},
{
"epoch": 2.143945163747144,
"grad_norm": 1.1241672039031982,
"learning_rate": 3.7544922457499256e-06,
"loss": 1.1566,
"step": 939
},
{
"epoch": 2.146230007616146,
"grad_norm": 1.1452815532684326,
"learning_rate": 3.751797545928959e-06,
"loss": 1.1952,
"step": 940
},
{
"epoch": 2.1485148514851486,
"grad_norm": 1.1131870746612549,
"learning_rate": 3.7491009036858483e-06,
"loss": 1.14,
"step": 941
},
{
"epoch": 2.150799695354151,
"grad_norm": 1.16527259349823,
"learning_rate": 3.7464023232049895e-06,
"loss": 1.1648,
"step": 942
},
{
"epoch": 2.153084539223153,
"grad_norm": 1.1289446353912354,
"learning_rate": 3.7437018086737876e-06,
"loss": 1.179,
"step": 943
},
{
"epoch": 2.155369383092155,
"grad_norm": 1.1474318504333496,
"learning_rate": 3.740999364282647e-06,
"loss": 1.2018,
"step": 944
},
{
"epoch": 2.157654226961158,
"grad_norm": 1.177672266960144,
"learning_rate": 3.7382949942249695e-06,
"loss": 1.1756,
"step": 945
},
{
"epoch": 2.15993907083016,
"grad_norm": 1.152292251586914,
"learning_rate": 3.7355887026971417e-06,
"loss": 1.1387,
"step": 946
},
{
"epoch": 2.162223914699162,
"grad_norm": 1.1584752798080444,
"learning_rate": 3.7328804938985335e-06,
"loss": 1.1648,
"step": 947
},
{
"epoch": 2.1645087585681644,
"grad_norm": 1.1106222867965698,
"learning_rate": 3.7301703720314897e-06,
"loss": 1.1875,
"step": 948
},
{
"epoch": 2.166793602437167,
"grad_norm": 1.177903413772583,
"learning_rate": 3.727458341301324e-06,
"loss": 1.1488,
"step": 949
},
{
"epoch": 2.169078446306169,
"grad_norm": 1.149046778678894,
"learning_rate": 3.7247444059163106e-06,
"loss": 1.1601,
"step": 950
},
{
"epoch": 2.1713632901751714,
"grad_norm": 1.1731359958648682,
"learning_rate": 3.7220285700876812e-06,
"loss": 1.1743,
"step": 951
},
{
"epoch": 2.1736481340441736,
"grad_norm": 1.1620594263076782,
"learning_rate": 3.719310838029615e-06,
"loss": 1.2199,
"step": 952
},
{
"epoch": 2.1759329779131757,
"grad_norm": 1.1532787084579468,
"learning_rate": 3.716591213959234e-06,
"loss": 1.1403,
"step": 953
},
{
"epoch": 2.1782178217821784,
"grad_norm": 1.1255860328674316,
"learning_rate": 3.7138697020965945e-06,
"loss": 1.1262,
"step": 954
},
{
"epoch": 2.1805026656511806,
"grad_norm": 1.1715703010559082,
"learning_rate": 3.7111463066646858e-06,
"loss": 1.1371,
"step": 955
},
{
"epoch": 2.1827875095201827,
"grad_norm": 1.121799111366272,
"learning_rate": 3.7084210318894177e-06,
"loss": 1.1942,
"step": 956
},
{
"epoch": 2.185072353389185,
"grad_norm": 1.1630092859268188,
"learning_rate": 3.7056938819996146e-06,
"loss": 1.1928,
"step": 957
},
{
"epoch": 2.1873571972581876,
"grad_norm": 1.132199764251709,
"learning_rate": 3.702964861227013e-06,
"loss": 1.1706,
"step": 958
},
{
"epoch": 2.1896420411271897,
"grad_norm": 1.1396610736846924,
"learning_rate": 3.7002339738062513e-06,
"loss": 1.1628,
"step": 959
},
{
"epoch": 2.191926884996192,
"grad_norm": 1.0978549718856812,
"learning_rate": 3.6975012239748664e-06,
"loss": 1.1966,
"step": 960
},
{
"epoch": 2.194211728865194,
"grad_norm": 1.1913878917694092,
"learning_rate": 3.694766615973281e-06,
"loss": 1.159,
"step": 961
},
{
"epoch": 2.1964965727341963,
"grad_norm": 1.1240078210830688,
"learning_rate": 3.6920301540448054e-06,
"loss": 1.1767,
"step": 962
},
{
"epoch": 2.198781416603199,
"grad_norm": 1.2754088640213013,
"learning_rate": 3.6892918424356238e-06,
"loss": 1.1382,
"step": 963
},
{
"epoch": 2.201066260472201,
"grad_norm": 1.1081241369247437,
"learning_rate": 3.6865516853947923e-06,
"loss": 1.1634,
"step": 964
},
{
"epoch": 2.2033511043412033,
"grad_norm": 1.1287072896957397,
"learning_rate": 3.683809687174229e-06,
"loss": 1.1354,
"step": 965
},
{
"epoch": 2.2056359482102055,
"grad_norm": 1.1273839473724365,
"learning_rate": 3.6810658520287106e-06,
"loss": 1.147,
"step": 966
},
{
"epoch": 2.207920792079208,
"grad_norm": 1.2143324613571167,
"learning_rate": 3.6783201842158633e-06,
"loss": 1.1793,
"step": 967
},
{
"epoch": 2.2102056359482103,
"grad_norm": 1.2204785346984863,
"learning_rate": 3.6755726879961575e-06,
"loss": 1.1915,
"step": 968
},
{
"epoch": 2.2124904798172125,
"grad_norm": 1.168456792831421,
"learning_rate": 3.6728233676328988e-06,
"loss": 1.1502,
"step": 969
},
{
"epoch": 2.2147753236862147,
"grad_norm": 1.1369385719299316,
"learning_rate": 3.670072227392226e-06,
"loss": 1.1787,
"step": 970
},
{
"epoch": 2.217060167555217,
"grad_norm": 1.1360613107681274,
"learning_rate": 3.6673192715431016e-06,
"loss": 1.1279,
"step": 971
},
{
"epoch": 2.2193450114242195,
"grad_norm": 1.2034990787506104,
"learning_rate": 3.6645645043573044e-06,
"loss": 1.1586,
"step": 972
},
{
"epoch": 2.2216298552932217,
"grad_norm": 1.133101224899292,
"learning_rate": 3.661807930109422e-06,
"loss": 1.1521,
"step": 973
},
{
"epoch": 2.223914699162224,
"grad_norm": 1.185314655303955,
"learning_rate": 3.6590495530768493e-06,
"loss": 1.1347,
"step": 974
},
{
"epoch": 2.226199543031226,
"grad_norm": 1.1746361255645752,
"learning_rate": 3.656289377539778e-06,
"loss": 1.1749,
"step": 975
},
{
"epoch": 2.2284843869002287,
"grad_norm": 1.1611051559448242,
"learning_rate": 3.65352740778119e-06,
"loss": 1.1399,
"step": 976
},
{
"epoch": 2.230769230769231,
"grad_norm": 1.1828947067260742,
"learning_rate": 3.650763648086849e-06,
"loss": 1.1815,
"step": 977
},
{
"epoch": 2.233054074638233,
"grad_norm": 1.1289913654327393,
"learning_rate": 3.6479981027453002e-06,
"loss": 1.1524,
"step": 978
},
{
"epoch": 2.235338918507235,
"grad_norm": 1.1266071796417236,
"learning_rate": 3.6452307760478583e-06,
"loss": 1.1502,
"step": 979
},
{
"epoch": 2.237623762376238,
"grad_norm": 1.128389596939087,
"learning_rate": 3.6424616722886004e-06,
"loss": 1.1611,
"step": 980
},
{
"epoch": 2.23990860624524,
"grad_norm": 1.216874122619629,
"learning_rate": 3.6396907957643623e-06,
"loss": 1.1693,
"step": 981
},
{
"epoch": 2.242193450114242,
"grad_norm": 1.1498830318450928,
"learning_rate": 3.6369181507747305e-06,
"loss": 1.1304,
"step": 982
},
{
"epoch": 2.2444782939832444,
"grad_norm": 1.1365691423416138,
"learning_rate": 3.634143741622036e-06,
"loss": 1.1673,
"step": 983
},
{
"epoch": 2.2467631378522466,
"grad_norm": 1.1243813037872314,
"learning_rate": 3.631367572611348e-06,
"loss": 1.1665,
"step": 984
},
{
"epoch": 2.249047981721249,
"grad_norm": 1.1227095127105713,
"learning_rate": 3.6285896480504633e-06,
"loss": 1.1566,
"step": 985
},
{
"epoch": 2.2513328255902514,
"grad_norm": 1.1476339101791382,
"learning_rate": 3.6258099722499063e-06,
"loss": 1.1759,
"step": 986
},
{
"epoch": 2.2536176694592536,
"grad_norm": 1.130340814590454,
"learning_rate": 3.623028549522918e-06,
"loss": 1.1835,
"step": 987
},
{
"epoch": 2.2559025133282558,
"grad_norm": 1.1270296573638916,
"learning_rate": 3.620245384185448e-06,
"loss": 1.177,
"step": 988
},
{
"epoch": 2.258187357197258,
"grad_norm": 1.170242190361023,
"learning_rate": 3.6174604805561524e-06,
"loss": 1.1589,
"step": 989
},
{
"epoch": 2.2604722010662606,
"grad_norm": 1.1653361320495605,
"learning_rate": 3.6146738429563837e-06,
"loss": 1.1349,
"step": 990
},
{
"epoch": 2.2627570449352628,
"grad_norm": 1.195779800415039,
"learning_rate": 3.6118854757101855e-06,
"loss": 1.1427,
"step": 991
},
{
"epoch": 2.265041888804265,
"grad_norm": 1.1496593952178955,
"learning_rate": 3.609095383144284e-06,
"loss": 1.1922,
"step": 992
},
{
"epoch": 2.2673267326732676,
"grad_norm": 1.1693689823150635,
"learning_rate": 3.6063035695880838e-06,
"loss": 1.1701,
"step": 993
},
{
"epoch": 2.2696115765422697,
"grad_norm": 1.1333836317062378,
"learning_rate": 3.60351003937366e-06,
"loss": 1.163,
"step": 994
},
{
"epoch": 2.271896420411272,
"grad_norm": 1.1533620357513428,
"learning_rate": 3.6007147968357505e-06,
"loss": 1.1117,
"step": 995
},
{
"epoch": 2.274181264280274,
"grad_norm": 1.1433489322662354,
"learning_rate": 3.5979178463117505e-06,
"loss": 1.1871,
"step": 996
},
{
"epoch": 2.2764661081492763,
"grad_norm": 1.1923670768737793,
"learning_rate": 3.5951191921417063e-06,
"loss": 1.1502,
"step": 997
},
{
"epoch": 2.278750952018279,
"grad_norm": 1.13682222366333,
"learning_rate": 3.5923188386683067e-06,
"loss": 1.1314,
"step": 998
},
{
"epoch": 2.281035795887281,
"grad_norm": 1.2247021198272705,
"learning_rate": 3.589516790236879e-06,
"loss": 1.1392,
"step": 999
},
{
"epoch": 2.2833206397562833,
"grad_norm": 1.1454575061798096,
"learning_rate": 3.586713051195378e-06,
"loss": 1.1473,
"step": 1000
},
{
"epoch": 2.2856054836252855,
"grad_norm": 1.1719251871109009,
"learning_rate": 3.583907625894384e-06,
"loss": 1.1642,
"step": 1001
},
{
"epoch": 2.2878903274942877,
"grad_norm": 1.171013355255127,
"learning_rate": 3.5811005186870927e-06,
"loss": 1.1811,
"step": 1002
},
{
"epoch": 2.2901751713632903,
"grad_norm": 1.1246569156646729,
"learning_rate": 3.578291733929311e-06,
"loss": 1.1172,
"step": 1003
},
{
"epoch": 2.2924600152322925,
"grad_norm": 1.1219955682754517,
"learning_rate": 3.5754812759794465e-06,
"loss": 1.1369,
"step": 1004
},
{
"epoch": 2.2947448591012947,
"grad_norm": 1.1373074054718018,
"learning_rate": 3.572669149198506e-06,
"loss": 1.1703,
"step": 1005
},
{
"epoch": 2.297029702970297,
"grad_norm": 1.204938530921936,
"learning_rate": 3.569855357950084e-06,
"loss": 1.1089,
"step": 1006
},
{
"epoch": 2.2993145468392995,
"grad_norm": 1.1538746356964111,
"learning_rate": 3.567039906600357e-06,
"loss": 1.1508,
"step": 1007
},
{
"epoch": 2.3015993907083017,
"grad_norm": 1.135554313659668,
"learning_rate": 3.5642227995180787e-06,
"loss": 1.1507,
"step": 1008
},
{
"epoch": 2.303884234577304,
"grad_norm": 1.1385624408721924,
"learning_rate": 3.5614040410745737e-06,
"loss": 1.1578,
"step": 1009
},
{
"epoch": 2.306169078446306,
"grad_norm": 1.144099473953247,
"learning_rate": 3.5585836356437266e-06,
"loss": 1.1549,
"step": 1010
},
{
"epoch": 2.3084539223153087,
"grad_norm": 1.1651406288146973,
"learning_rate": 3.555761587601976e-06,
"loss": 1.1613,
"step": 1011
},
{
"epoch": 2.310738766184311,
"grad_norm": 1.1579418182373047,
"learning_rate": 3.552937901328315e-06,
"loss": 1.1261,
"step": 1012
},
{
"epoch": 2.313023610053313,
"grad_norm": 1.2125619649887085,
"learning_rate": 3.550112581204273e-06,
"loss": 1.1778,
"step": 1013
},
{
"epoch": 2.315308453922315,
"grad_norm": 1.2006595134735107,
"learning_rate": 3.5472856316139193e-06,
"loss": 1.1755,
"step": 1014
},
{
"epoch": 2.3175932977913174,
"grad_norm": 1.1417663097381592,
"learning_rate": 3.5444570569438465e-06,
"loss": 1.1604,
"step": 1015
},
{
"epoch": 2.31987814166032,
"grad_norm": 1.1562373638153076,
"learning_rate": 3.5416268615831737e-06,
"loss": 1.17,
"step": 1016
},
{
"epoch": 2.322162985529322,
"grad_norm": 1.4638527631759644,
"learning_rate": 3.5387950499235323e-06,
"loss": 1.159,
"step": 1017
},
{
"epoch": 2.3244478293983244,
"grad_norm": 1.1422604322433472,
"learning_rate": 3.5359616263590637e-06,
"loss": 1.1391,
"step": 1018
},
{
"epoch": 2.3267326732673266,
"grad_norm": 1.1229259967803955,
"learning_rate": 3.5331265952864065e-06,
"loss": 1.2002,
"step": 1019
},
{
"epoch": 2.329017517136329,
"grad_norm": 1.1740734577178955,
"learning_rate": 3.530289961104698e-06,
"loss": 1.1347,
"step": 1020
},
{
"epoch": 2.3313023610053314,
"grad_norm": 1.1417142152786255,
"learning_rate": 3.527451728215561e-06,
"loss": 1.1979,
"step": 1021
},
{
"epoch": 2.3335872048743336,
"grad_norm": 1.1507670879364014,
"learning_rate": 3.5246119010230994e-06,
"loss": 1.1522,
"step": 1022
},
{
"epoch": 2.3358720487433358,
"grad_norm": 1.1694669723510742,
"learning_rate": 3.521770483933891e-06,
"loss": 1.1215,
"step": 1023
},
{
"epoch": 2.3381568926123384,
"grad_norm": 1.134131669998169,
"learning_rate": 3.5189274813569807e-06,
"loss": 1.1648,
"step": 1024
},
{
"epoch": 2.3404417364813406,
"grad_norm": 1.144411325454712,
"learning_rate": 3.516082897703873e-06,
"loss": 1.1526,
"step": 1025
},
{
"epoch": 2.3427265803503428,
"grad_norm": 1.1161473989486694,
"learning_rate": 3.5132367373885267e-06,
"loss": 1.1495,
"step": 1026
},
{
"epoch": 2.345011424219345,
"grad_norm": 1.143911600112915,
"learning_rate": 3.5103890048273464e-06,
"loss": 1.1724,
"step": 1027
},
{
"epoch": 2.347296268088347,
"grad_norm": 1.2014847993850708,
"learning_rate": 3.507539704439177e-06,
"loss": 1.1622,
"step": 1028
},
{
"epoch": 2.3495811119573498,
"grad_norm": 1.2054967880249023,
"learning_rate": 3.5046888406452966e-06,
"loss": 1.1539,
"step": 1029
},
{
"epoch": 2.351865955826352,
"grad_norm": 1.1581631898880005,
"learning_rate": 3.5018364178694077e-06,
"loss": 1.16,
"step": 1030
},
{
"epoch": 2.354150799695354,
"grad_norm": 1.1760412454605103,
"learning_rate": 3.4989824405376314e-06,
"loss": 1.1736,
"step": 1031
},
{
"epoch": 2.3564356435643563,
"grad_norm": 1.1309703588485718,
"learning_rate": 3.4961269130785047e-06,
"loss": 1.166,
"step": 1032
},
{
"epoch": 2.3587204874333585,
"grad_norm": 1.181168556213379,
"learning_rate": 3.493269839922967e-06,
"loss": 1.1701,
"step": 1033
},
{
"epoch": 2.361005331302361,
"grad_norm": 1.1288000345230103,
"learning_rate": 3.490411225504355e-06,
"loss": 1.1546,
"step": 1034
},
{
"epoch": 2.3632901751713633,
"grad_norm": 1.1293566226959229,
"learning_rate": 3.4875510742584006e-06,
"loss": 1.1389,
"step": 1035
},
{
"epoch": 2.3655750190403655,
"grad_norm": 1.161147952079773,
"learning_rate": 3.484689390623218e-06,
"loss": 1.2149,
"step": 1036
},
{
"epoch": 2.3678598629093677,
"grad_norm": 1.1328603029251099,
"learning_rate": 3.4818261790393e-06,
"loss": 1.1373,
"step": 1037
},
{
"epoch": 2.3701447067783703,
"grad_norm": 1.14400315284729,
"learning_rate": 3.478961443949509e-06,
"loss": 1.1547,
"step": 1038
},
{
"epoch": 2.3724295506473725,
"grad_norm": 1.1662416458129883,
"learning_rate": 3.4760951897990734e-06,
"loss": 1.1395,
"step": 1039
},
{
"epoch": 2.3747143945163747,
"grad_norm": 1.3636192083358765,
"learning_rate": 3.473227421035578e-06,
"loss": 1.1528,
"step": 1040
},
{
"epoch": 2.376999238385377,
"grad_norm": 1.1440818309783936,
"learning_rate": 3.4703581421089566e-06,
"loss": 1.1655,
"step": 1041
},
{
"epoch": 2.3792840822543795,
"grad_norm": 1.2090609073638916,
"learning_rate": 3.4674873574714886e-06,
"loss": 1.0997,
"step": 1042
},
{
"epoch": 2.3815689261233817,
"grad_norm": 1.1551830768585205,
"learning_rate": 3.464615071577788e-06,
"loss": 1.1652,
"step": 1043
},
{
"epoch": 2.383853769992384,
"grad_norm": 1.1676479578018188,
"learning_rate": 3.4617412888847984e-06,
"loss": 1.1596,
"step": 1044
},
{
"epoch": 2.386138613861386,
"grad_norm": 1.1927589178085327,
"learning_rate": 3.458866013851788e-06,
"loss": 1.1462,
"step": 1045
},
{
"epoch": 2.388423457730388,
"grad_norm": 1.2201225757598877,
"learning_rate": 3.455989250940338e-06,
"loss": 1.1703,
"step": 1046
},
{
"epoch": 2.390708301599391,
"grad_norm": 1.1762723922729492,
"learning_rate": 3.45311100461434e-06,
"loss": 1.1706,
"step": 1047
},
{
"epoch": 2.392993145468393,
"grad_norm": 1.163713812828064,
"learning_rate": 3.4502312793399873e-06,
"loss": 1.1208,
"step": 1048
},
{
"epoch": 2.395277989337395,
"grad_norm": 1.3621329069137573,
"learning_rate": 3.4473500795857674e-06,
"loss": 1.1493,
"step": 1049
},
{
"epoch": 2.3975628332063974,
"grad_norm": 1.1427867412567139,
"learning_rate": 3.4444674098224555e-06,
"loss": 1.1454,
"step": 1050
},
{
"epoch": 2.3998476770754,
"grad_norm": 1.1881464719772339,
"learning_rate": 3.4415832745231092e-06,
"loss": 1.1094,
"step": 1051
},
{
"epoch": 2.402132520944402,
"grad_norm": 1.134605050086975,
"learning_rate": 3.4386976781630594e-06,
"loss": 1.1676,
"step": 1052
},
{
"epoch": 2.4044173648134044,
"grad_norm": 1.1633696556091309,
"learning_rate": 3.4358106252199043e-06,
"loss": 1.1258,
"step": 1053
},
{
"epoch": 2.4067022086824066,
"grad_norm": 1.1574831008911133,
"learning_rate": 3.4329221201735015e-06,
"loss": 1.1499,
"step": 1054
},
{
"epoch": 2.408987052551409,
"grad_norm": 1.169659972190857,
"learning_rate": 3.430032167505962e-06,
"loss": 1.1197,
"step": 1055
},
{
"epoch": 2.4112718964204114,
"grad_norm": 1.191874623298645,
"learning_rate": 3.4271407717016456e-06,
"loss": 1.1673,
"step": 1056
},
{
"epoch": 2.4135567402894136,
"grad_norm": 1.1627485752105713,
"learning_rate": 3.424247937247148e-06,
"loss": 1.1634,
"step": 1057
},
{
"epoch": 2.4158415841584158,
"grad_norm": 1.1143652200698853,
"learning_rate": 3.421353668631299e-06,
"loss": 1.1509,
"step": 1058
},
{
"epoch": 2.418126428027418,
"grad_norm": 1.1409255266189575,
"learning_rate": 3.418457970345153e-06,
"loss": 1.1155,
"step": 1059
},
{
"epoch": 2.4204112718964206,
"grad_norm": 1.1744626760482788,
"learning_rate": 3.415560846881984e-06,
"loss": 1.176,
"step": 1060
},
{
"epoch": 2.4226961157654228,
"grad_norm": 1.1737017631530762,
"learning_rate": 3.4126623027372763e-06,
"loss": 1.1526,
"step": 1061
},
{
"epoch": 2.424980959634425,
"grad_norm": 1.1165390014648438,
"learning_rate": 3.4097623424087196e-06,
"loss": 1.1041,
"step": 1062
},
{
"epoch": 2.427265803503427,
"grad_norm": 1.1141180992126465,
"learning_rate": 3.4068609703961997e-06,
"loss": 1.153,
"step": 1063
},
{
"epoch": 2.4295506473724293,
"grad_norm": 1.193962574005127,
"learning_rate": 3.4039581912017946e-06,
"loss": 1.1109,
"step": 1064
},
{
"epoch": 2.431835491241432,
"grad_norm": 1.154571533203125,
"learning_rate": 3.401054009329765e-06,
"loss": 1.1416,
"step": 1065
},
{
"epoch": 2.434120335110434,
"grad_norm": 1.2508164644241333,
"learning_rate": 3.398148429286547e-06,
"loss": 1.122,
"step": 1066
},
{
"epoch": 2.4364051789794363,
"grad_norm": 1.1385433673858643,
"learning_rate": 3.3952414555807493e-06,
"loss": 1.167,
"step": 1067
},
{
"epoch": 2.4386900228484385,
"grad_norm": 1.1891573667526245,
"learning_rate": 3.392333092723141e-06,
"loss": 1.1732,
"step": 1068
},
{
"epoch": 2.440974866717441,
"grad_norm": 1.230272889137268,
"learning_rate": 3.389423345226647e-06,
"loss": 1.1714,
"step": 1069
},
{
"epoch": 2.4432597105864433,
"grad_norm": 1.1934008598327637,
"learning_rate": 3.386512217606339e-06,
"loss": 1.1613,
"step": 1070
},
{
"epoch": 2.4455445544554455,
"grad_norm": 1.1615201234817505,
"learning_rate": 3.383599714379435e-06,
"loss": 1.1551,
"step": 1071
},
{
"epoch": 2.4478293983244477,
"grad_norm": 1.1495729684829712,
"learning_rate": 3.3806858400652825e-06,
"loss": 1.1311,
"step": 1072
},
{
"epoch": 2.4501142421934503,
"grad_norm": 1.1871975660324097,
"learning_rate": 3.37777059918536e-06,
"loss": 1.1294,
"step": 1073
},
{
"epoch": 2.4523990860624525,
"grad_norm": 1.1563098430633545,
"learning_rate": 3.374853996263264e-06,
"loss": 1.1412,
"step": 1074
},
{
"epoch": 2.4546839299314547,
"grad_norm": 1.27406644821167,
"learning_rate": 3.3719360358247054e-06,
"loss": 1.1515,
"step": 1075
},
{
"epoch": 2.456968773800457,
"grad_norm": 1.227537751197815,
"learning_rate": 3.369016722397504e-06,
"loss": 1.1525,
"step": 1076
},
{
"epoch": 2.459253617669459,
"grad_norm": 1.1642208099365234,
"learning_rate": 3.366096060511575e-06,
"loss": 1.1476,
"step": 1077
},
{
"epoch": 2.4615384615384617,
"grad_norm": 1.1597189903259277,
"learning_rate": 3.363174054698928e-06,
"loss": 1.1567,
"step": 1078
},
{
"epoch": 2.463823305407464,
"grad_norm": 1.2041665315628052,
"learning_rate": 3.3602507094936576e-06,
"loss": 1.1265,
"step": 1079
},
{
"epoch": 2.466108149276466,
"grad_norm": 1.2899173498153687,
"learning_rate": 3.357326029431939e-06,
"loss": 1.157,
"step": 1080
},
{
"epoch": 2.4683929931454682,
"grad_norm": 1.2047252655029297,
"learning_rate": 3.3544000190520144e-06,
"loss": 1.1466,
"step": 1081
},
{
"epoch": 2.470677837014471,
"grad_norm": 1.2135494947433472,
"learning_rate": 3.351472682894193e-06,
"loss": 1.1474,
"step": 1082
},
{
"epoch": 2.472962680883473,
"grad_norm": 1.1497875452041626,
"learning_rate": 3.348544025500841e-06,
"loss": 1.0876,
"step": 1083
},
{
"epoch": 2.4752475247524752,
"grad_norm": 1.260158658027649,
"learning_rate": 3.3456140514163756e-06,
"loss": 1.1698,
"step": 1084
},
{
"epoch": 2.4775323686214774,
"grad_norm": 1.2946076393127441,
"learning_rate": 3.342682765187254e-06,
"loss": 1.1504,
"step": 1085
},
{
"epoch": 2.47981721249048,
"grad_norm": 1.2317827939987183,
"learning_rate": 3.3397501713619736e-06,
"loss": 1.121,
"step": 1086
},
{
"epoch": 2.4821020563594822,
"grad_norm": 1.1697484254837036,
"learning_rate": 3.336816274491057e-06,
"loss": 1.185,
"step": 1087
},
{
"epoch": 2.4843869002284844,
"grad_norm": 1.1670335531234741,
"learning_rate": 3.333881079127052e-06,
"loss": 1.1294,
"step": 1088
},
{
"epoch": 2.4866717440974866,
"grad_norm": 1.2509000301361084,
"learning_rate": 3.3309445898245184e-06,
"loss": 1.178,
"step": 1089
},
{
"epoch": 2.4889565879664888,
"grad_norm": 1.178087592124939,
"learning_rate": 3.328006811140026e-06,
"loss": 1.1516,
"step": 1090
},
{
"epoch": 2.4912414318354914,
"grad_norm": 1.2256996631622314,
"learning_rate": 3.3250677476321442e-06,
"loss": 1.1598,
"step": 1091
},
{
"epoch": 2.4935262757044936,
"grad_norm": 1.1501520872116089,
"learning_rate": 3.322127403861437e-06,
"loss": 1.1678,
"step": 1092
},
{
"epoch": 2.4958111195734958,
"grad_norm": 1.2015535831451416,
"learning_rate": 3.319185784390453e-06,
"loss": 1.159,
"step": 1093
},
{
"epoch": 2.498095963442498,
"grad_norm": 1.203966736793518,
"learning_rate": 3.3162428937837233e-06,
"loss": 1.1335,
"step": 1094
},
{
"epoch": 2.5003808073115,
"grad_norm": 1.2212433815002441,
"learning_rate": 3.313298736607748e-06,
"loss": 1.1801,
"step": 1095
},
{
"epoch": 2.5026656511805028,
"grad_norm": 1.222063422203064,
"learning_rate": 3.3103533174309967e-06,
"loss": 1.1536,
"step": 1096
},
{
"epoch": 2.504950495049505,
"grad_norm": 1.1464064121246338,
"learning_rate": 3.3074066408238927e-06,
"loss": 1.0955,
"step": 1097
},
{
"epoch": 2.507235338918507,
"grad_norm": 1.2135506868362427,
"learning_rate": 3.3044587113588134e-06,
"loss": 1.1545,
"step": 1098
},
{
"epoch": 2.5095201827875098,
"grad_norm": 1.208592414855957,
"learning_rate": 3.3015095336100795e-06,
"loss": 1.1504,
"step": 1099
},
{
"epoch": 2.511805026656512,
"grad_norm": 1.119957685470581,
"learning_rate": 3.2985591121539495e-06,
"loss": 1.1504,
"step": 1100
},
{
"epoch": 2.514089870525514,
"grad_norm": 1.1869513988494873,
"learning_rate": 3.2956074515686105e-06,
"loss": 1.1694,
"step": 1101
},
{
"epoch": 2.5163747143945163,
"grad_norm": 1.1720670461654663,
"learning_rate": 3.2926545564341715e-06,
"loss": 1.1518,
"step": 1102
},
{
"epoch": 2.5186595582635185,
"grad_norm": 1.1721378564834595,
"learning_rate": 3.2897004313326608e-06,
"loss": 1.1388,
"step": 1103
},
{
"epoch": 2.520944402132521,
"grad_norm": 1.1609101295471191,
"learning_rate": 3.2867450808480115e-06,
"loss": 1.1309,
"step": 1104
},
{
"epoch": 2.5232292460015233,
"grad_norm": 1.1172316074371338,
"learning_rate": 3.2837885095660598e-06,
"loss": 1.1626,
"step": 1105
},
{
"epoch": 2.5255140898705255,
"grad_norm": 1.2131626605987549,
"learning_rate": 3.280830722074536e-06,
"loss": 1.1458,
"step": 1106
},
{
"epoch": 2.5277989337395277,
"grad_norm": 1.2087211608886719,
"learning_rate": 3.2778717229630584e-06,
"loss": 1.1665,
"step": 1107
},
{
"epoch": 2.53008377760853,
"grad_norm": 1.171183705329895,
"learning_rate": 3.2749115168231238e-06,
"loss": 1.1922,
"step": 1108
},
{
"epoch": 2.5323686214775325,
"grad_norm": 1.1399853229522705,
"learning_rate": 3.271950108248102e-06,
"loss": 1.1292,
"step": 1109
},
{
"epoch": 2.5346534653465347,
"grad_norm": 1.1595377922058105,
"learning_rate": 3.268987501833231e-06,
"loss": 1.1481,
"step": 1110
},
{
"epoch": 2.536938309215537,
"grad_norm": 1.2080212831497192,
"learning_rate": 3.2660237021756047e-06,
"loss": 1.17,
"step": 1111
},
{
"epoch": 2.5392231530845395,
"grad_norm": 1.150943398475647,
"learning_rate": 3.26305871387417e-06,
"loss": 1.1193,
"step": 1112
},
{
"epoch": 2.5415079969535412,
"grad_norm": 1.1977055072784424,
"learning_rate": 3.260092541529718e-06,
"loss": 1.1656,
"step": 1113
},
{
"epoch": 2.543792840822544,
"grad_norm": 1.1943557262420654,
"learning_rate": 3.257125189744877e-06,
"loss": 1.1436,
"step": 1114
},
{
"epoch": 2.546077684691546,
"grad_norm": 1.1852405071258545,
"learning_rate": 3.254156663124106e-06,
"loss": 1.1743,
"step": 1115
},
{
"epoch": 2.5483625285605482,
"grad_norm": 1.1327954530715942,
"learning_rate": 3.2511869662736855e-06,
"loss": 1.1275,
"step": 1116
},
{
"epoch": 2.550647372429551,
"grad_norm": 1.1748192310333252,
"learning_rate": 3.248216103801713e-06,
"loss": 1.1277,
"step": 1117
},
{
"epoch": 2.552932216298553,
"grad_norm": 1.2016124725341797,
"learning_rate": 3.2452440803180953e-06,
"loss": 1.1692,
"step": 1118
},
{
"epoch": 2.5552170601675552,
"grad_norm": 1.1545820236206055,
"learning_rate": 3.24227090043454e-06,
"loss": 1.1335,
"step": 1119
},
{
"epoch": 2.5575019040365574,
"grad_norm": 1.168172836303711,
"learning_rate": 3.239296568764547e-06,
"loss": 1.1515,
"step": 1120
},
{
"epoch": 2.5597867479055596,
"grad_norm": 1.1570290327072144,
"learning_rate": 3.236321089923408e-06,
"loss": 1.1921,
"step": 1121
},
{
"epoch": 2.5620715917745622,
"grad_norm": 1.1722872257232666,
"learning_rate": 3.233344468528192e-06,
"loss": 1.1842,
"step": 1122
},
{
"epoch": 2.5643564356435644,
"grad_norm": 1.2346643209457397,
"learning_rate": 3.2303667091977397e-06,
"loss": 1.1987,
"step": 1123
},
{
"epoch": 2.5666412795125666,
"grad_norm": 1.1846752166748047,
"learning_rate": 3.2273878165526603e-06,
"loss": 1.1672,
"step": 1124
},
{
"epoch": 2.568926123381569,
"grad_norm": 1.1800742149353027,
"learning_rate": 3.224407795215319e-06,
"loss": 1.1405,
"step": 1125
},
{
"epoch": 2.571210967250571,
"grad_norm": 1.2667362689971924,
"learning_rate": 3.2214266498098357e-06,
"loss": 1.097,
"step": 1126
},
{
"epoch": 2.5734958111195736,
"grad_norm": 1.1848291158676147,
"learning_rate": 3.218444384962071e-06,
"loss": 1.1309,
"step": 1127
},
{
"epoch": 2.575780654988576,
"grad_norm": 1.2490592002868652,
"learning_rate": 3.215461005299624e-06,
"loss": 1.1677,
"step": 1128
},
{
"epoch": 2.578065498857578,
"grad_norm": 1.1780728101730347,
"learning_rate": 3.2124765154518245e-06,
"loss": 1.1438,
"step": 1129
},
{
"epoch": 2.5803503427265806,
"grad_norm": 1.1854690313339233,
"learning_rate": 3.209490920049724e-06,
"loss": 1.1854,
"step": 1130
},
{
"epoch": 2.5826351865955828,
"grad_norm": 1.1640013456344604,
"learning_rate": 3.2065042237260897e-06,
"loss": 1.1421,
"step": 1131
},
{
"epoch": 2.584920030464585,
"grad_norm": 1.1875327825546265,
"learning_rate": 3.2035164311153967e-06,
"loss": 1.1617,
"step": 1132
},
{
"epoch": 2.587204874333587,
"grad_norm": 1.188707709312439,
"learning_rate": 3.200527546853822e-06,
"loss": 1.1618,
"step": 1133
},
{
"epoch": 2.5894897182025893,
"grad_norm": 1.1984432935714722,
"learning_rate": 3.1975375755792358e-06,
"loss": 1.1647,
"step": 1134
},
{
"epoch": 2.591774562071592,
"grad_norm": 1.2000738382339478,
"learning_rate": 3.1945465219311964e-06,
"loss": 1.1555,
"step": 1135
},
{
"epoch": 2.594059405940594,
"grad_norm": 1.1691455841064453,
"learning_rate": 3.19155439055094e-06,
"loss": 1.1337,
"step": 1136
},
{
"epoch": 2.5963442498095963,
"grad_norm": 1.3328726291656494,
"learning_rate": 3.1885611860813747e-06,
"loss": 1.1662,
"step": 1137
},
{
"epoch": 2.5986290936785985,
"grad_norm": 1.159832239151001,
"learning_rate": 3.185566913167076e-06,
"loss": 1.1231,
"step": 1138
},
{
"epoch": 2.6009139375476007,
"grad_norm": 1.139963984489441,
"learning_rate": 3.1825715764542765e-06,
"loss": 1.1348,
"step": 1139
},
{
"epoch": 2.6031987814166033,
"grad_norm": 1.1485956907272339,
"learning_rate": 3.1795751805908578e-06,
"loss": 1.1157,
"step": 1140
},
{
"epoch": 2.6054836252856055,
"grad_norm": 1.1575555801391602,
"learning_rate": 3.1765777302263464e-06,
"loss": 1.1738,
"step": 1141
},
{
"epoch": 2.6077684691546077,
"grad_norm": 1.1840925216674805,
"learning_rate": 3.173579230011905e-06,
"loss": 1.1345,
"step": 1142
},
{
"epoch": 2.6100533130236103,
"grad_norm": 1.144516944885254,
"learning_rate": 3.1705796846003267e-06,
"loss": 1.1219,
"step": 1143
},
{
"epoch": 2.612338156892612,
"grad_norm": 1.1362788677215576,
"learning_rate": 3.1675790986460233e-06,
"loss": 1.1382,
"step": 1144
},
{
"epoch": 2.6146230007616147,
"grad_norm": 1.2047488689422607,
"learning_rate": 3.1645774768050224e-06,
"loss": 1.1427,
"step": 1145
},
{
"epoch": 2.616907844630617,
"grad_norm": 1.1238912343978882,
"learning_rate": 3.1615748237349626e-06,
"loss": 1.1298,
"step": 1146
},
{
"epoch": 2.619192688499619,
"grad_norm": 1.1865835189819336,
"learning_rate": 3.158571144095076e-06,
"loss": 1.1537,
"step": 1147
},
{
"epoch": 2.6214775323686217,
"grad_norm": 1.1614654064178467,
"learning_rate": 3.155566442546194e-06,
"loss": 1.1608,
"step": 1148
},
{
"epoch": 2.623762376237624,
"grad_norm": 1.1808875799179077,
"learning_rate": 3.1525607237507296e-06,
"loss": 1.1168,
"step": 1149
},
{
"epoch": 2.626047220106626,
"grad_norm": 1.2047436237335205,
"learning_rate": 3.1495539923726757e-06,
"loss": 1.1402,
"step": 1150
},
{
"epoch": 2.6283320639756282,
"grad_norm": 1.190082311630249,
"learning_rate": 3.146546253077597e-06,
"loss": 1.1257,
"step": 1151
},
{
"epoch": 2.6306169078446304,
"grad_norm": 1.1529537439346313,
"learning_rate": 3.1435375105326198e-06,
"loss": 1.1332,
"step": 1152
},
{
"epoch": 2.632901751713633,
"grad_norm": 1.1505589485168457,
"learning_rate": 3.1405277694064306e-06,
"loss": 1.1654,
"step": 1153
},
{
"epoch": 2.6351865955826352,
"grad_norm": 1.1244786977767944,
"learning_rate": 3.1375170343692642e-06,
"loss": 1.1625,
"step": 1154
},
{
"epoch": 2.6374714394516374,
"grad_norm": 1.2105226516723633,
"learning_rate": 3.134505310092895e-06,
"loss": 1.1743,
"step": 1155
},
{
"epoch": 2.6397562833206396,
"grad_norm": 1.1880476474761963,
"learning_rate": 3.131492601250636e-06,
"loss": 1.1119,
"step": 1156
},
{
"epoch": 2.642041127189642,
"grad_norm": 1.1480690240859985,
"learning_rate": 3.1284789125173257e-06,
"loss": 1.128,
"step": 1157
},
{
"epoch": 2.6443259710586444,
"grad_norm": 1.174415111541748,
"learning_rate": 3.1254642485693255e-06,
"loss": 1.1545,
"step": 1158
},
{
"epoch": 2.6466108149276466,
"grad_norm": 1.1595776081085205,
"learning_rate": 3.1224486140845063e-06,
"loss": 1.1502,
"step": 1159
},
{
"epoch": 2.648895658796649,
"grad_norm": 1.1877551078796387,
"learning_rate": 3.1194320137422483e-06,
"loss": 1.1487,
"step": 1160
},
{
"epoch": 2.6511805026656514,
"grad_norm": 1.1842609643936157,
"learning_rate": 3.116414452223429e-06,
"loss": 1.1329,
"step": 1161
},
{
"epoch": 2.6534653465346536,
"grad_norm": 1.1747876405715942,
"learning_rate": 3.1133959342104186e-06,
"loss": 1.138,
"step": 1162
},
{
"epoch": 2.655750190403656,
"grad_norm": 1.1728745698928833,
"learning_rate": 3.110376464387069e-06,
"loss": 1.1593,
"step": 1163
},
{
"epoch": 2.658035034272658,
"grad_norm": 1.1547205448150635,
"learning_rate": 3.1073560474387114e-06,
"loss": 1.1244,
"step": 1164
},
{
"epoch": 2.66031987814166,
"grad_norm": 1.1600390672683716,
"learning_rate": 3.1043346880521456e-06,
"loss": 1.1664,
"step": 1165
},
{
"epoch": 2.662604722010663,
"grad_norm": 1.1489025354385376,
"learning_rate": 3.1013123909156347e-06,
"loss": 1.1869,
"step": 1166
},
{
"epoch": 2.664889565879665,
"grad_norm": 1.1556684970855713,
"learning_rate": 3.0982891607188948e-06,
"loss": 1.1408,
"step": 1167
},
{
"epoch": 2.667174409748667,
"grad_norm": 1.1727960109710693,
"learning_rate": 3.095265002153092e-06,
"loss": 1.1676,
"step": 1168
},
{
"epoch": 2.6694592536176693,
"grad_norm": 1.1960148811340332,
"learning_rate": 3.0922399199108326e-06,
"loss": 1.138,
"step": 1169
},
{
"epoch": 2.6717440974866715,
"grad_norm": 1.1238528490066528,
"learning_rate": 3.0892139186861563e-06,
"loss": 1.1308,
"step": 1170
},
{
"epoch": 2.674028941355674,
"grad_norm": 1.1751792430877686,
"learning_rate": 3.0861870031745266e-06,
"loss": 1.1518,
"step": 1171
},
{
"epoch": 2.6763137852246763,
"grad_norm": 1.1722700595855713,
"learning_rate": 3.0831591780728282e-06,
"loss": 1.0687,
"step": 1172
},
{
"epoch": 2.6785986290936785,
"grad_norm": 1.1473584175109863,
"learning_rate": 3.0801304480793563e-06,
"loss": 1.1313,
"step": 1173
},
{
"epoch": 2.680883472962681,
"grad_norm": 1.1742531061172485,
"learning_rate": 3.0771008178938112e-06,
"loss": 1.1293,
"step": 1174
},
{
"epoch": 2.6831683168316833,
"grad_norm": 1.1747832298278809,
"learning_rate": 3.074070292217288e-06,
"loss": 1.1334,
"step": 1175
},
{
"epoch": 2.6854531607006855,
"grad_norm": 1.1738357543945312,
"learning_rate": 3.0710388757522724e-06,
"loss": 1.1663,
"step": 1176
},
{
"epoch": 2.6877380045696877,
"grad_norm": 1.1576931476593018,
"learning_rate": 3.068006573202634e-06,
"loss": 1.1508,
"step": 1177
},
{
"epoch": 2.69002284843869,
"grad_norm": 1.1925418376922607,
"learning_rate": 3.0649733892736143e-06,
"loss": 1.135,
"step": 1178
},
{
"epoch": 2.6923076923076925,
"grad_norm": 1.1554484367370605,
"learning_rate": 3.061939328671824e-06,
"loss": 1.1328,
"step": 1179
},
{
"epoch": 2.6945925361766947,
"grad_norm": 1.153615117073059,
"learning_rate": 3.0589043961052344e-06,
"loss": 1.1356,
"step": 1180
},
{
"epoch": 2.696877380045697,
"grad_norm": 1.1935803890228271,
"learning_rate": 3.05586859628317e-06,
"loss": 1.1266,
"step": 1181
},
{
"epoch": 2.699162223914699,
"grad_norm": 1.1908665895462036,
"learning_rate": 3.0528319339163003e-06,
"loss": 1.1393,
"step": 1182
},
{
"epoch": 2.7014470677837013,
"grad_norm": 1.194982647895813,
"learning_rate": 3.0497944137166326e-06,
"loss": 1.1349,
"step": 1183
},
{
"epoch": 2.703731911652704,
"grad_norm": 1.1403220891952515,
"learning_rate": 3.0467560403975066e-06,
"loss": 1.1269,
"step": 1184
},
{
"epoch": 2.706016755521706,
"grad_norm": 1.1594319343566895,
"learning_rate": 3.043716818673586e-06,
"loss": 1.1316,
"step": 1185
},
{
"epoch": 2.7083015993907082,
"grad_norm": 1.1441702842712402,
"learning_rate": 3.0406767532608495e-06,
"loss": 1.1605,
"step": 1186
},
{
"epoch": 2.7105864432597104,
"grad_norm": 1.2065651416778564,
"learning_rate": 3.0376358488765863e-06,
"loss": 1.1708,
"step": 1187
},
{
"epoch": 2.7128712871287126,
"grad_norm": 1.1729774475097656,
"learning_rate": 3.034594110239386e-06,
"loss": 1.1563,
"step": 1188
},
{
"epoch": 2.7151561309977152,
"grad_norm": 1.1797168254852295,
"learning_rate": 3.0315515420691354e-06,
"loss": 1.1732,
"step": 1189
},
{
"epoch": 2.7174409748667174,
"grad_norm": 1.1579010486602783,
"learning_rate": 3.0285081490870057e-06,
"loss": 1.1375,
"step": 1190
},
{
"epoch": 2.7197258187357196,
"grad_norm": 1.1517558097839355,
"learning_rate": 3.0254639360154475e-06,
"loss": 1.1594,
"step": 1191
},
{
"epoch": 2.7220106626047222,
"grad_norm": 1.2115283012390137,
"learning_rate": 3.0224189075781886e-06,
"loss": 1.1334,
"step": 1192
},
{
"epoch": 2.7242955064737244,
"grad_norm": 1.1372671127319336,
"learning_rate": 3.0193730685002153e-06,
"loss": 1.2064,
"step": 1193
},
{
"epoch": 2.7265803503427266,
"grad_norm": 1.2300207614898682,
"learning_rate": 3.0163264235077777e-06,
"loss": 1.1767,
"step": 1194
},
{
"epoch": 2.728865194211729,
"grad_norm": 1.1949130296707153,
"learning_rate": 3.0132789773283734e-06,
"loss": 1.1474,
"step": 1195
},
{
"epoch": 2.731150038080731,
"grad_norm": 1.1411316394805908,
"learning_rate": 3.0102307346907442e-06,
"loss": 1.1565,
"step": 1196
},
{
"epoch": 2.7334348819497336,
"grad_norm": 1.153688669204712,
"learning_rate": 3.0071817003248667e-06,
"loss": 1.1368,
"step": 1197
},
{
"epoch": 2.735719725818736,
"grad_norm": 1.2030675411224365,
"learning_rate": 3.0041318789619465e-06,
"loss": 1.1567,
"step": 1198
},
{
"epoch": 2.738004569687738,
"grad_norm": 1.1593271493911743,
"learning_rate": 3.001081275334412e-06,
"loss": 1.161,
"step": 1199
},
{
"epoch": 2.74028941355674,
"grad_norm": 1.181596279144287,
"learning_rate": 2.9980298941759035e-06,
"loss": 1.122,
"step": 1200
},
{
"epoch": 2.7425742574257423,
"grad_norm": 1.1973023414611816,
"learning_rate": 2.9949777402212677e-06,
"loss": 1.157,
"step": 1201
},
{
"epoch": 2.744859101294745,
"grad_norm": 1.1745750904083252,
"learning_rate": 2.9919248182065512e-06,
"loss": 1.1843,
"step": 1202
},
{
"epoch": 2.747143945163747,
"grad_norm": 1.1595338582992554,
"learning_rate": 2.9888711328689933e-06,
"loss": 1.1741,
"step": 1203
},
{
"epoch": 2.7494287890327493,
"grad_norm": 1.165958046913147,
"learning_rate": 2.985816688947017e-06,
"loss": 1.1217,
"step": 1204
},
{
"epoch": 2.751713632901752,
"grad_norm": 1.1976659297943115,
"learning_rate": 2.9827614911802205e-06,
"loss": 1.1443,
"step": 1205
},
{
"epoch": 2.753998476770754,
"grad_norm": 1.1795979738235474,
"learning_rate": 2.9797055443093744e-06,
"loss": 1.1624,
"step": 1206
},
{
"epoch": 2.7562833206397563,
"grad_norm": 1.1748056411743164,
"learning_rate": 2.9766488530764105e-06,
"loss": 1.1715,
"step": 1207
},
{
"epoch": 2.7585681645087585,
"grad_norm": 1.191728115081787,
"learning_rate": 2.9735914222244165e-06,
"loss": 1.1472,
"step": 1208
},
{
"epoch": 2.7608530083777607,
"grad_norm": 1.168042540550232,
"learning_rate": 2.970533256497627e-06,
"loss": 1.1389,
"step": 1209
},
{
"epoch": 2.7631378522467633,
"grad_norm": 1.2145792245864868,
"learning_rate": 2.9674743606414163e-06,
"loss": 1.1975,
"step": 1210
},
{
"epoch": 2.7654226961157655,
"grad_norm": 1.1970194578170776,
"learning_rate": 2.9644147394022925e-06,
"loss": 1.1911,
"step": 1211
},
{
"epoch": 2.7677075399847677,
"grad_norm": 1.181572675704956,
"learning_rate": 2.96135439752789e-06,
"loss": 1.1384,
"step": 1212
},
{
"epoch": 2.76999238385377,
"grad_norm": 1.1687877178192139,
"learning_rate": 2.95829333976696e-06,
"loss": 1.118,
"step": 1213
},
{
"epoch": 2.772277227722772,
"grad_norm": 1.171147108078003,
"learning_rate": 2.955231570869365e-06,
"loss": 1.1387,
"step": 1214
},
{
"epoch": 2.7745620715917747,
"grad_norm": 1.184873104095459,
"learning_rate": 2.9521690955860715e-06,
"loss": 1.1535,
"step": 1215
},
{
"epoch": 2.776846915460777,
"grad_norm": 1.1838406324386597,
"learning_rate": 2.9491059186691416e-06,
"loss": 1.157,
"step": 1216
},
{
"epoch": 2.779131759329779,
"grad_norm": 1.1575403213500977,
"learning_rate": 2.9460420448717264e-06,
"loss": 1.1179,
"step": 1217
},
{
"epoch": 2.7814166031987813,
"grad_norm": 1.1447091102600098,
"learning_rate": 2.9429774789480576e-06,
"loss": 1.1745,
"step": 1218
},
{
"epoch": 2.7837014470677834,
"grad_norm": 1.1797469854354858,
"learning_rate": 2.9399122256534412e-06,
"loss": 1.1278,
"step": 1219
},
{
"epoch": 2.785986290936786,
"grad_norm": 1.1838300228118896,
"learning_rate": 2.936846289744252e-06,
"loss": 1.1605,
"step": 1220
},
{
"epoch": 2.7882711348057883,
"grad_norm": 1.1908303499221802,
"learning_rate": 2.9337796759779197e-06,
"loss": 1.1133,
"step": 1221
},
{
"epoch": 2.7905559786747904,
"grad_norm": 1.192145824432373,
"learning_rate": 2.930712389112929e-06,
"loss": 1.1247,
"step": 1222
},
{
"epoch": 2.792840822543793,
"grad_norm": 1.1842783689498901,
"learning_rate": 2.9276444339088095e-06,
"loss": 1.0794,
"step": 1223
},
{
"epoch": 2.7951256664127953,
"grad_norm": 1.152113914489746,
"learning_rate": 2.924575815126125e-06,
"loss": 1.1397,
"step": 1224
},
{
"epoch": 2.7974105102817974,
"grad_norm": 1.220105767250061,
"learning_rate": 2.921506537526471e-06,
"loss": 1.1426,
"step": 1225
},
{
"epoch": 2.7996953541507996,
"grad_norm": 1.2473626136779785,
"learning_rate": 2.9184366058724655e-06,
"loss": 1.1645,
"step": 1226
},
{
"epoch": 2.801980198019802,
"grad_norm": 1.2051730155944824,
"learning_rate": 2.915366024927741e-06,
"loss": 1.1498,
"step": 1227
},
{
"epoch": 2.8042650418888044,
"grad_norm": 1.2148674726486206,
"learning_rate": 2.912294799456936e-06,
"loss": 1.1334,
"step": 1228
},
{
"epoch": 2.8065498857578066,
"grad_norm": 1.176007866859436,
"learning_rate": 2.9092229342256915e-06,
"loss": 1.1177,
"step": 1229
},
{
"epoch": 2.808834729626809,
"grad_norm": 1.1746013164520264,
"learning_rate": 2.90615043400064e-06,
"loss": 1.1708,
"step": 1230
},
{
"epoch": 2.811119573495811,
"grad_norm": 1.1968902349472046,
"learning_rate": 2.9030773035493997e-06,
"loss": 1.1497,
"step": 1231
},
{
"epoch": 2.813404417364813,
"grad_norm": 1.1996639966964722,
"learning_rate": 2.9000035476405657e-06,
"loss": 1.1179,
"step": 1232
},
{
"epoch": 2.815689261233816,
"grad_norm": 1.168249487876892,
"learning_rate": 2.8969291710437054e-06,
"loss": 1.1528,
"step": 1233
},
{
"epoch": 2.817974105102818,
"grad_norm": 1.1561291217803955,
"learning_rate": 2.893854178529347e-06,
"loss": 1.1591,
"step": 1234
},
{
"epoch": 2.82025894897182,
"grad_norm": 1.1894656419754028,
"learning_rate": 2.890778574868977e-06,
"loss": 1.1606,
"step": 1235
},
{
"epoch": 2.822543792840823,
"grad_norm": 1.1667678356170654,
"learning_rate": 2.8877023648350284e-06,
"loss": 1.1799,
"step": 1236
},
{
"epoch": 2.824828636709825,
"grad_norm": 1.176679253578186,
"learning_rate": 2.884625553200876e-06,
"loss": 1.1994,
"step": 1237
},
{
"epoch": 2.827113480578827,
"grad_norm": 1.1774452924728394,
"learning_rate": 2.8815481447408273e-06,
"loss": 1.1716,
"step": 1238
},
{
"epoch": 2.8293983244478293,
"grad_norm": 1.2223371267318726,
"learning_rate": 2.878470144230118e-06,
"loss": 1.131,
"step": 1239
},
{
"epoch": 2.8316831683168315,
"grad_norm": 1.1785866022109985,
"learning_rate": 2.875391556444898e-06,
"loss": 1.1411,
"step": 1240
},
{
"epoch": 2.833968012185834,
"grad_norm": 1.1710691452026367,
"learning_rate": 2.8723123861622338e-06,
"loss": 1.1718,
"step": 1241
},
{
"epoch": 2.8362528560548363,
"grad_norm": 1.1571224927902222,
"learning_rate": 2.8692326381600926e-06,
"loss": 1.1529,
"step": 1242
},
{
"epoch": 2.8385376999238385,
"grad_norm": 1.1815292835235596,
"learning_rate": 2.8661523172173392e-06,
"loss": 1.1522,
"step": 1243
},
{
"epoch": 2.8408225437928407,
"grad_norm": 1.1292341947555542,
"learning_rate": 2.8630714281137263e-06,
"loss": 1.1394,
"step": 1244
},
{
"epoch": 2.843107387661843,
"grad_norm": 1.1632921695709229,
"learning_rate": 2.8599899756298887e-06,
"loss": 1.1778,
"step": 1245
},
{
"epoch": 2.8453922315308455,
"grad_norm": 1.2025562524795532,
"learning_rate": 2.856907964547337e-06,
"loss": 1.1442,
"step": 1246
},
{
"epoch": 2.8476770753998477,
"grad_norm": 1.2091012001037598,
"learning_rate": 2.8538253996484465e-06,
"loss": 1.1388,
"step": 1247
},
{
"epoch": 2.84996191926885,
"grad_norm": 1.1544710397720337,
"learning_rate": 2.8507422857164523e-06,
"loss": 1.1402,
"step": 1248
},
{
"epoch": 2.852246763137852,
"grad_norm": 1.1689878702163696,
"learning_rate": 2.847658627535442e-06,
"loss": 1.1477,
"step": 1249
},
{
"epoch": 2.8545316070068543,
"grad_norm": 1.173471212387085,
"learning_rate": 2.844574429890347e-06,
"loss": 1.1661,
"step": 1250
},
{
"epoch": 2.856816450875857,
"grad_norm": 1.1636780500411987,
"learning_rate": 2.8414896975669374e-06,
"loss": 1.1465,
"step": 1251
},
{
"epoch": 2.859101294744859,
"grad_norm": 1.1479815244674683,
"learning_rate": 2.8384044353518104e-06,
"loss": 1.1568,
"step": 1252
},
{
"epoch": 2.8613861386138613,
"grad_norm": 1.2065167427062988,
"learning_rate": 2.835318648032388e-06,
"loss": 1.1585,
"step": 1253
},
{
"epoch": 2.863670982482864,
"grad_norm": 1.1619449853897095,
"learning_rate": 2.832232340396904e-06,
"loss": 1.1238,
"step": 1254
},
{
"epoch": 2.865955826351866,
"grad_norm": 1.17509126663208,
"learning_rate": 2.8291455172344045e-06,
"loss": 1.1614,
"step": 1255
},
{
"epoch": 2.8682406702208683,
"grad_norm": 1.1738674640655518,
"learning_rate": 2.82605818333473e-06,
"loss": 1.1523,
"step": 1256
},
{
"epoch": 2.8705255140898704,
"grad_norm": 1.1643424034118652,
"learning_rate": 2.8229703434885165e-06,
"loss": 1.1565,
"step": 1257
},
{
"epoch": 2.8728103579588726,
"grad_norm": 1.1405576467514038,
"learning_rate": 2.819882002487185e-06,
"loss": 1.1638,
"step": 1258
},
{
"epoch": 2.8750952018278753,
"grad_norm": 1.1445550918579102,
"learning_rate": 2.816793165122933e-06,
"loss": 1.129,
"step": 1259
},
{
"epoch": 2.8773800456968774,
"grad_norm": 1.1620489358901978,
"learning_rate": 2.8137038361887297e-06,
"loss": 1.128,
"step": 1260
},
{
"epoch": 2.8796648895658796,
"grad_norm": 1.2187519073486328,
"learning_rate": 2.8106140204783054e-06,
"loss": 1.1312,
"step": 1261
},
{
"epoch": 2.881949733434882,
"grad_norm": 1.178515076637268,
"learning_rate": 2.8075237227861475e-06,
"loss": 1.1075,
"step": 1262
},
{
"epoch": 2.884234577303884,
"grad_norm": 1.1716185808181763,
"learning_rate": 2.80443294790749e-06,
"loss": 1.1553,
"step": 1263
},
{
"epoch": 2.8865194211728866,
"grad_norm": 1.2317765951156616,
"learning_rate": 2.8013417006383078e-06,
"loss": 1.1377,
"step": 1264
},
{
"epoch": 2.888804265041889,
"grad_norm": 1.1073942184448242,
"learning_rate": 2.798249985775309e-06,
"loss": 1.1507,
"step": 1265
},
{
"epoch": 2.891089108910891,
"grad_norm": 1.149707317352295,
"learning_rate": 2.795157808115927e-06,
"loss": 1.1161,
"step": 1266
},
{
"epoch": 2.8933739527798936,
"grad_norm": 1.2311087846755981,
"learning_rate": 2.7920651724583124e-06,
"loss": 1.1481,
"step": 1267
},
{
"epoch": 2.895658796648896,
"grad_norm": 1.1760973930358887,
"learning_rate": 2.788972083601329e-06,
"loss": 1.16,
"step": 1268
},
{
"epoch": 2.897943640517898,
"grad_norm": 1.149901270866394,
"learning_rate": 2.785878546344541e-06,
"loss": 1.1187,
"step": 1269
},
{
"epoch": 2.9002284843869,
"grad_norm": 1.2037584781646729,
"learning_rate": 2.7827845654882112e-06,
"loss": 1.1508,
"step": 1270
},
{
"epoch": 2.9025133282559024,
"grad_norm": 1.2114837169647217,
"learning_rate": 2.7796901458332877e-06,
"loss": 1.1243,
"step": 1271
},
{
"epoch": 2.904798172124905,
"grad_norm": 1.179283857345581,
"learning_rate": 2.776595292181401e-06,
"loss": 1.1351,
"step": 1272
},
{
"epoch": 2.907083015993907,
"grad_norm": 1.2145215272903442,
"learning_rate": 2.7735000093348556e-06,
"loss": 1.1295,
"step": 1273
},
{
"epoch": 2.9093678598629094,
"grad_norm": 1.1994370222091675,
"learning_rate": 2.7704043020966222e-06,
"loss": 1.1405,
"step": 1274
},
{
"epoch": 2.9116527037319115,
"grad_norm": 1.218436360359192,
"learning_rate": 2.7673081752703275e-06,
"loss": 1.163,
"step": 1275
},
{
"epoch": 2.9139375476009137,
"grad_norm": 1.1954008340835571,
"learning_rate": 2.764211633660252e-06,
"loss": 1.1209,
"step": 1276
},
{
"epoch": 2.9162223914699164,
"grad_norm": 1.1329401731491089,
"learning_rate": 2.7611146820713187e-06,
"loss": 1.1394,
"step": 1277
},
{
"epoch": 2.9185072353389185,
"grad_norm": 1.153887391090393,
"learning_rate": 2.7580173253090876e-06,
"loss": 1.1236,
"step": 1278
},
{
"epoch": 2.9207920792079207,
"grad_norm": 1.2343658208847046,
"learning_rate": 2.754919568179746e-06,
"loss": 1.1582,
"step": 1279
},
{
"epoch": 2.9230769230769234,
"grad_norm": 1.1790735721588135,
"learning_rate": 2.7518214154901025e-06,
"loss": 1.148,
"step": 1280
},
{
"epoch": 2.925361766945925,
"grad_norm": 1.1787333488464355,
"learning_rate": 2.7487228720475812e-06,
"loss": 1.0793,
"step": 1281
},
{
"epoch": 2.9276466108149277,
"grad_norm": 1.1795237064361572,
"learning_rate": 2.745623942660211e-06,
"loss": 1.1811,
"step": 1282
},
{
"epoch": 2.92993145468393,
"grad_norm": 1.1928547620773315,
"learning_rate": 2.7425246321366205e-06,
"loss": 1.1207,
"step": 1283
},
{
"epoch": 2.932216298552932,
"grad_norm": 1.2683132886886597,
"learning_rate": 2.7394249452860296e-06,
"loss": 1.1868,
"step": 1284
},
{
"epoch": 2.9345011424219347,
"grad_norm": 1.2315943241119385,
"learning_rate": 2.7363248869182407e-06,
"loss": 1.1732,
"step": 1285
},
{
"epoch": 2.936785986290937,
"grad_norm": 1.178771734237671,
"learning_rate": 2.7332244618436355e-06,
"loss": 1.1377,
"step": 1286
},
{
"epoch": 2.939070830159939,
"grad_norm": 1.1433113813400269,
"learning_rate": 2.7301236748731623e-06,
"loss": 1.1556,
"step": 1287
},
{
"epoch": 2.9413556740289413,
"grad_norm": 1.1987029314041138,
"learning_rate": 2.7270225308183318e-06,
"loss": 1.16,
"step": 1288
},
{
"epoch": 2.9436405178979435,
"grad_norm": 1.1818667650222778,
"learning_rate": 2.7239210344912085e-06,
"loss": 1.1443,
"step": 1289
},
{
"epoch": 2.945925361766946,
"grad_norm": 1.2523951530456543,
"learning_rate": 2.720819190704405e-06,
"loss": 1.1512,
"step": 1290
},
{
"epoch": 2.9482102056359483,
"grad_norm": 1.186334490776062,
"learning_rate": 2.7177170042710706e-06,
"loss": 1.1347,
"step": 1291
},
{
"epoch": 2.9504950495049505,
"grad_norm": 1.1355229616165161,
"learning_rate": 2.714614480004888e-06,
"loss": 1.1717,
"step": 1292
},
{
"epoch": 2.9527798933739526,
"grad_norm": 1.1879637241363525,
"learning_rate": 2.7115116227200634e-06,
"loss": 1.1737,
"step": 1293
},
{
"epoch": 2.955064737242955,
"grad_norm": 1.160522699356079,
"learning_rate": 2.7084084372313207e-06,
"loss": 1.1603,
"step": 1294
},
{
"epoch": 2.9573495811119574,
"grad_norm": 1.15951406955719,
"learning_rate": 2.705304928353892e-06,
"loss": 1.1809,
"step": 1295
},
{
"epoch": 2.9596344249809596,
"grad_norm": 1.132550835609436,
"learning_rate": 2.702201100903511e-06,
"loss": 1.1537,
"step": 1296
},
{
"epoch": 2.961919268849962,
"grad_norm": 1.1578466892242432,
"learning_rate": 2.6990969596964066e-06,
"loss": 1.1578,
"step": 1297
},
{
"epoch": 2.9642041127189644,
"grad_norm": 1.1534026861190796,
"learning_rate": 2.6959925095492957e-06,
"loss": 1.1392,
"step": 1298
},
{
"epoch": 2.9664889565879666,
"grad_norm": 1.1865522861480713,
"learning_rate": 2.6928877552793716e-06,
"loss": 1.1292,
"step": 1299
},
{
"epoch": 2.968773800456969,
"grad_norm": 1.1986700296401978,
"learning_rate": 2.689782701704301e-06,
"loss": 1.1591,
"step": 1300
},
{
"epoch": 2.971058644325971,
"grad_norm": 1.1916639804840088,
"learning_rate": 2.6866773536422157e-06,
"loss": 1.1757,
"step": 1301
},
{
"epoch": 2.973343488194973,
"grad_norm": 1.1540440320968628,
"learning_rate": 2.6835717159117044e-06,
"loss": 1.132,
"step": 1302
},
{
"epoch": 2.975628332063976,
"grad_norm": 1.175986647605896,
"learning_rate": 2.6804657933318035e-06,
"loss": 1.158,
"step": 1303
},
{
"epoch": 2.977913175932978,
"grad_norm": 1.1727259159088135,
"learning_rate": 2.6773595907219937e-06,
"loss": 1.1454,
"step": 1304
},
{
"epoch": 2.98019801980198,
"grad_norm": 1.1897666454315186,
"learning_rate": 2.674253112902189e-06,
"loss": 1.1385,
"step": 1305
},
{
"epoch": 2.9824828636709824,
"grad_norm": 1.2285549640655518,
"learning_rate": 2.6711463646927296e-06,
"loss": 1.1057,
"step": 1306
},
{
"epoch": 2.9847677075399845,
"grad_norm": 1.1508749723434448,
"learning_rate": 2.668039350914377e-06,
"loss": 1.1537,
"step": 1307
},
{
"epoch": 2.987052551408987,
"grad_norm": 1.1747997999191284,
"learning_rate": 2.6649320763883045e-06,
"loss": 1.1447,
"step": 1308
},
{
"epoch": 2.9893373952779894,
"grad_norm": 1.1757659912109375,
"learning_rate": 2.6618245459360896e-06,
"loss": 1.1312,
"step": 1309
},
{
"epoch": 2.9916222391469915,
"grad_norm": 1.1855767965316772,
"learning_rate": 2.658716764379706e-06,
"loss": 1.1559,
"step": 1310
},
{
"epoch": 2.993907083015994,
"grad_norm": 1.1734507083892822,
"learning_rate": 2.6556087365415183e-06,
"loss": 1.1291,
"step": 1311
}
],
"logging_steps": 1,
"max_steps": 2622,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 437,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.3097395819302093e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}