{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984767707539984,
"eval_steps": 500,
"global_step": 437,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002284843869002285,
"grad_norm": 3.668196201324463,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.5687,
"step": 1
},
{
"epoch": 0.00456968773800457,
"grad_norm": 3.6277146339416504,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.5714,
"step": 2
},
{
"epoch": 0.006854531607006854,
"grad_norm": 3.813422918319702,
"learning_rate": 1.5000000000000002e-07,
"loss": 1.58,
"step": 3
},
{
"epoch": 0.00913937547600914,
"grad_norm": 3.4566409587860107,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.5604,
"step": 4
},
{
"epoch": 0.011424219345011425,
"grad_norm": 3.287661552429199,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.5425,
"step": 5
},
{
"epoch": 0.013709063214013708,
"grad_norm": 3.318340301513672,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.5477,
"step": 6
},
{
"epoch": 0.015993907083015995,
"grad_norm": 3.407221555709839,
"learning_rate": 3.5000000000000004e-07,
"loss": 1.5848,
"step": 7
},
{
"epoch": 0.01827875095201828,
"grad_norm": 3.732999563217163,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.5884,
"step": 8
},
{
"epoch": 0.020563594821020565,
"grad_norm": 3.532766580581665,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.5892,
"step": 9
},
{
"epoch": 0.02284843869002285,
"grad_norm": 3.5676348209381104,
"learning_rate": 5.000000000000001e-07,
"loss": 1.5619,
"step": 10
},
{
"epoch": 0.02513328255902513,
"grad_norm": 3.1015849113464355,
"learning_rate": 5.5e-07,
"loss": 1.5649,
"step": 11
},
{
"epoch": 0.027418126428027417,
"grad_norm": 3.163240909576416,
"learning_rate": 6.000000000000001e-07,
"loss": 1.5807,
"step": 12
},
{
"epoch": 0.0297029702970297,
"grad_norm": 2.894922971725464,
"learning_rate": 6.5e-07,
"loss": 1.5454,
"step": 13
},
{
"epoch": 0.03198781416603199,
"grad_norm": 2.8211843967437744,
"learning_rate": 7.000000000000001e-07,
"loss": 1.5801,
"step": 14
},
{
"epoch": 0.03427265803503427,
"grad_norm": 2.676609516143799,
"learning_rate": 7.5e-07,
"loss": 1.5446,
"step": 15
},
{
"epoch": 0.03655750190403656,
"grad_norm": 2.6186320781707764,
"learning_rate": 8.000000000000001e-07,
"loss": 1.5443,
"step": 16
},
{
"epoch": 0.03884234577303884,
"grad_norm": 2.460139513015747,
"learning_rate": 8.500000000000001e-07,
"loss": 1.5489,
"step": 17
},
{
"epoch": 0.04112718964204113,
"grad_norm": 2.368126630783081,
"learning_rate": 9.000000000000001e-07,
"loss": 1.5317,
"step": 18
},
{
"epoch": 0.04341203351104341,
"grad_norm": 2.244192123413086,
"learning_rate": 9.500000000000001e-07,
"loss": 1.4805,
"step": 19
},
{
"epoch": 0.0456968773800457,
"grad_norm": 2.242701292037964,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.5478,
"step": 20
},
{
"epoch": 0.04798172124904798,
"grad_norm": 2.13895583152771,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.5194,
"step": 21
},
{
"epoch": 0.05026656511805026,
"grad_norm": 2.0152103900909424,
"learning_rate": 1.1e-06,
"loss": 1.5067,
"step": 22
},
{
"epoch": 0.05255140898705255,
"grad_norm": 1.9156895875930786,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.5145,
"step": 23
},
{
"epoch": 0.05483625285605483,
"grad_norm": 1.7710504531860352,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.5147,
"step": 24
},
{
"epoch": 0.05712109672505712,
"grad_norm": 1.807431936264038,
"learning_rate": 1.25e-06,
"loss": 1.5357,
"step": 25
},
{
"epoch": 0.0594059405940594,
"grad_norm": 1.6638832092285156,
"learning_rate": 1.3e-06,
"loss": 1.489,
"step": 26
},
{
"epoch": 0.06169078446306169,
"grad_norm": 1.5708481073379517,
"learning_rate": 1.3500000000000002e-06,
"loss": 1.4768,
"step": 27
},
{
"epoch": 0.06397562833206398,
"grad_norm": 1.615577220916748,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.5159,
"step": 28
},
{
"epoch": 0.06626047220106626,
"grad_norm": 1.5125129222869873,
"learning_rate": 1.45e-06,
"loss": 1.4972,
"step": 29
},
{
"epoch": 0.06854531607006854,
"grad_norm": 1.479811668395996,
"learning_rate": 1.5e-06,
"loss": 1.4674,
"step": 30
},
{
"epoch": 0.07083015993907082,
"grad_norm": 1.4502017498016357,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.4811,
"step": 31
},
{
"epoch": 0.07311500380807312,
"grad_norm": 1.3617135286331177,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.4872,
"step": 32
},
{
"epoch": 0.0753998476770754,
"grad_norm": 1.367607831954956,
"learning_rate": 1.6500000000000003e-06,
"loss": 1.4699,
"step": 33
},
{
"epoch": 0.07768469154607768,
"grad_norm": 1.3374927043914795,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.4659,
"step": 34
},
{
"epoch": 0.07996953541507996,
"grad_norm": 1.354506254196167,
"learning_rate": 1.75e-06,
"loss": 1.4351,
"step": 35
},
{
"epoch": 0.08225437928408226,
"grad_norm": 1.2532024383544922,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.4358,
"step": 36
},
{
"epoch": 0.08453922315308454,
"grad_norm": 1.2684043645858765,
"learning_rate": 1.85e-06,
"loss": 1.4534,
"step": 37
},
{
"epoch": 0.08682406702208682,
"grad_norm": 1.2418140172958374,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.4624,
"step": 38
},
{
"epoch": 0.0891089108910891,
"grad_norm": 1.2266045808792114,
"learning_rate": 1.9500000000000004e-06,
"loss": 1.4282,
"step": 39
},
{
"epoch": 0.0913937547600914,
"grad_norm": 1.180330753326416,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4107,
"step": 40
},
{
"epoch": 0.09367859862909368,
"grad_norm": 1.1651424169540405,
"learning_rate": 2.05e-06,
"loss": 1.4041,
"step": 41
},
{
"epoch": 0.09596344249809596,
"grad_norm": 1.181652307510376,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.4558,
"step": 42
},
{
"epoch": 0.09824828636709824,
"grad_norm": 1.2221183776855469,
"learning_rate": 2.15e-06,
"loss": 1.4449,
"step": 43
},
{
"epoch": 0.10053313023610053,
"grad_norm": 1.085172414779663,
"learning_rate": 2.2e-06,
"loss": 1.4235,
"step": 44
},
{
"epoch": 0.10281797410510282,
"grad_norm": 1.0497649908065796,
"learning_rate": 2.25e-06,
"loss": 1.3891,
"step": 45
},
{
"epoch": 0.1051028179741051,
"grad_norm": 1.0502350330352783,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.4048,
"step": 46
},
{
"epoch": 0.10738766184310738,
"grad_norm": 1.0798920392990112,
"learning_rate": 2.35e-06,
"loss": 1.4383,
"step": 47
},
{
"epoch": 0.10967250571210967,
"grad_norm": 1.067581057548523,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.4128,
"step": 48
},
{
"epoch": 0.11195734958111196,
"grad_norm": 1.062606930732727,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.4438,
"step": 49
},
{
"epoch": 0.11424219345011424,
"grad_norm": 1.0157577991485596,
"learning_rate": 2.5e-06,
"loss": 1.4257,
"step": 50
},
{
"epoch": 0.11652703731911652,
"grad_norm": 1.0165379047393799,
"learning_rate": 2.55e-06,
"loss": 1.407,
"step": 51
},
{
"epoch": 0.1188118811881188,
"grad_norm": 1.0268282890319824,
"learning_rate": 2.6e-06,
"loss": 1.3942,
"step": 52
},
{
"epoch": 0.1210967250571211,
"grad_norm": 1.0133647918701172,
"learning_rate": 2.6500000000000005e-06,
"loss": 1.3737,
"step": 53
},
{
"epoch": 0.12338156892612338,
"grad_norm": 1.0097134113311768,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.3994,
"step": 54
},
{
"epoch": 0.12566641279512566,
"grad_norm": 1.1268850564956665,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.3676,
"step": 55
},
{
"epoch": 0.12795125666412796,
"grad_norm": 0.981015682220459,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.3819,
"step": 56
},
{
"epoch": 0.13023610053313023,
"grad_norm": 1.0456632375717163,
"learning_rate": 2.85e-06,
"loss": 1.4031,
"step": 57
},
{
"epoch": 0.13252094440213252,
"grad_norm": 1.0366231203079224,
"learning_rate": 2.9e-06,
"loss": 1.4017,
"step": 58
},
{
"epoch": 0.13480578827113482,
"grad_norm": 0.9980257749557495,
"learning_rate": 2.95e-06,
"loss": 1.4261,
"step": 59
},
{
"epoch": 0.1370906321401371,
"grad_norm": 0.990281879901886,
"learning_rate": 3e-06,
"loss": 1.3699,
"step": 60
},
{
"epoch": 0.13937547600913938,
"grad_norm": 1.0530250072479248,
"learning_rate": 3.05e-06,
"loss": 1.3656,
"step": 61
},
{
"epoch": 0.14166031987814165,
"grad_norm": 0.9878147840499878,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.3712,
"step": 62
},
{
"epoch": 0.14394516374714394,
"grad_norm": 0.9554497599601746,
"learning_rate": 3.1500000000000003e-06,
"loss": 1.3507,
"step": 63
},
{
"epoch": 0.14623000761614624,
"grad_norm": 1.0152994394302368,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.3531,
"step": 64
},
{
"epoch": 0.1485148514851485,
"grad_norm": 0.9816209077835083,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.3733,
"step": 65
},
{
"epoch": 0.1507996953541508,
"grad_norm": 1.014113187789917,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.3798,
"step": 66
},
{
"epoch": 0.15308453922315307,
"grad_norm": 1.005303978919983,
"learning_rate": 3.3500000000000005e-06,
"loss": 1.3877,
"step": 67
},
{
"epoch": 0.15536938309215537,
"grad_norm": 1.109976887702942,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.4184,
"step": 68
},
{
"epoch": 0.15765422696115766,
"grad_norm": 1.033060908317566,
"learning_rate": 3.45e-06,
"loss": 1.4043,
"step": 69
},
{
"epoch": 0.15993907083015993,
"grad_norm": 0.9719234108924866,
"learning_rate": 3.5e-06,
"loss": 1.3481,
"step": 70
},
{
"epoch": 0.16222391469916222,
"grad_norm": 1.0430618524551392,
"learning_rate": 3.5500000000000003e-06,
"loss": 1.3227,
"step": 71
},
{
"epoch": 0.16450875856816452,
"grad_norm": 1.0481953620910645,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.3174,
"step": 72
},
{
"epoch": 0.1667936024371668,
"grad_norm": 0.9868738055229187,
"learning_rate": 3.65e-06,
"loss": 1.356,
"step": 73
},
{
"epoch": 0.16907844630616908,
"grad_norm": 1.0015943050384521,
"learning_rate": 3.7e-06,
"loss": 1.3462,
"step": 74
},
{
"epoch": 0.17136329017517135,
"grad_norm": 1.0458308458328247,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3962,
"step": 75
},
{
"epoch": 0.17364813404417365,
"grad_norm": 1.0376830101013184,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.3523,
"step": 76
},
{
"epoch": 0.17593297791317594,
"grad_norm": 0.9821555018424988,
"learning_rate": 3.85e-06,
"loss": 1.3559,
"step": 77
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.9579638838768005,
"learning_rate": 3.900000000000001e-06,
"loss": 1.3073,
"step": 78
},
{
"epoch": 0.1805026656511805,
"grad_norm": 0.9736194014549255,
"learning_rate": 3.95e-06,
"loss": 1.3494,
"step": 79
},
{
"epoch": 0.1827875095201828,
"grad_norm": 1.0055922269821167,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3697,
"step": 80
},
{
"epoch": 0.18507235338918507,
"grad_norm": 0.9767876267433167,
"learning_rate": 4.05e-06,
"loss": 1.3225,
"step": 81
},
{
"epoch": 0.18735719725818736,
"grad_norm": 1.003092885017395,
"learning_rate": 4.1e-06,
"loss": 1.335,
"step": 82
},
{
"epoch": 0.18964204112718963,
"grad_norm": 0.9898741245269775,
"learning_rate": 4.15e-06,
"loss": 1.3103,
"step": 83
},
{
"epoch": 0.19192688499619193,
"grad_norm": 0.9903189539909363,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3741,
"step": 84
},
{
"epoch": 0.19421172886519422,
"grad_norm": 0.9661535620689392,
"learning_rate": 4.25e-06,
"loss": 1.3381,
"step": 85
},
{
"epoch": 0.1964965727341965,
"grad_norm": 0.9668599367141724,
"learning_rate": 4.3e-06,
"loss": 1.3511,
"step": 86
},
{
"epoch": 0.19878141660319879,
"grad_norm": 0.9633579254150391,
"learning_rate": 4.350000000000001e-06,
"loss": 1.3841,
"step": 87
},
{
"epoch": 0.20106626047220105,
"grad_norm": 0.9665766358375549,
"learning_rate": 4.4e-06,
"loss": 1.3211,
"step": 88
},
{
"epoch": 0.20335110434120335,
"grad_norm": 1.0263577699661255,
"learning_rate": 4.450000000000001e-06,
"loss": 1.3398,
"step": 89
},
{
"epoch": 0.20563594821020564,
"grad_norm": 1.0054337978363037,
"learning_rate": 4.5e-06,
"loss": 1.3598,
"step": 90
},
{
"epoch": 0.2079207920792079,
"grad_norm": 0.9768564701080322,
"learning_rate": 4.5500000000000005e-06,
"loss": 1.3386,
"step": 91
},
{
"epoch": 0.2102056359482102,
"grad_norm": 0.9710814356803894,
"learning_rate": 4.600000000000001e-06,
"loss": 1.306,
"step": 92
},
{
"epoch": 0.2124904798172125,
"grad_norm": 0.9943618774414062,
"learning_rate": 4.65e-06,
"loss": 1.3368,
"step": 93
},
{
"epoch": 0.21477532368621477,
"grad_norm": 1.0000272989273071,
"learning_rate": 4.7e-06,
"loss": 1.3561,
"step": 94
},
{
"epoch": 0.21706016755521707,
"grad_norm": 0.9748716950416565,
"learning_rate": 4.75e-06,
"loss": 1.3216,
"step": 95
},
{
"epoch": 0.21934501142421933,
"grad_norm": 0.977959930896759,
"learning_rate": 4.800000000000001e-06,
"loss": 1.3275,
"step": 96
},
{
"epoch": 0.22162985529322163,
"grad_norm": 0.9991240501403809,
"learning_rate": 4.85e-06,
"loss": 1.3143,
"step": 97
},
{
"epoch": 0.22391469916222392,
"grad_norm": 1.0590916872024536,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.3467,
"step": 98
},
{
"epoch": 0.2261995430312262,
"grad_norm": 0.9592604041099548,
"learning_rate": 4.95e-06,
"loss": 1.3568,
"step": 99
},
{
"epoch": 0.2284843869002285,
"grad_norm": 0.9900586605072021,
"learning_rate": 5e-06,
"loss": 1.3162,
"step": 100
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.9882398843765259,
"learning_rate": 4.999998060367119e-06,
"loss": 1.3348,
"step": 101
},
{
"epoch": 0.23305407463823305,
"grad_norm": 0.9522809982299805,
"learning_rate": 4.999992241471486e-06,
"loss": 1.3004,
"step": 102
},
{
"epoch": 0.23533891850723535,
"grad_norm": 0.9822378754615784,
"learning_rate": 4.9999825433221295e-06,
"loss": 1.3326,
"step": 103
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.9944847822189331,
"learning_rate": 4.999968965934098e-06,
"loss": 1.3429,
"step": 104
},
{
"epoch": 0.2399086062452399,
"grad_norm": 1.052456021308899,
"learning_rate": 4.9999515093284605e-06,
"loss": 1.3476,
"step": 105
},
{
"epoch": 0.2421934501142422,
"grad_norm": 0.9862610697746277,
"learning_rate": 4.999930173532304e-06,
"loss": 1.3638,
"step": 106
},
{
"epoch": 0.24447829398324447,
"grad_norm": 0.9718945622444153,
"learning_rate": 4.999904958578735e-06,
"loss": 1.3013,
"step": 107
},
{
"epoch": 0.24676313785224677,
"grad_norm": 0.9535952210426331,
"learning_rate": 4.9998758645068805e-06,
"loss": 1.3317,
"step": 108
},
{
"epoch": 0.24904798172124903,
"grad_norm": 1.1905543804168701,
"learning_rate": 4.999842891361885e-06,
"loss": 1.3325,
"step": 109
},
{
"epoch": 0.25133282559025133,
"grad_norm": 1.0306485891342163,
"learning_rate": 4.9998060391949145e-06,
"loss": 1.3198,
"step": 110
},
{
"epoch": 0.2536176694592536,
"grad_norm": 1.0334984064102173,
"learning_rate": 4.999765308063152e-06,
"loss": 1.3075,
"step": 111
},
{
"epoch": 0.2559025133282559,
"grad_norm": 1.0020740032196045,
"learning_rate": 4.9997206980298e-06,
"loss": 1.3324,
"step": 112
},
{
"epoch": 0.25818735719725816,
"grad_norm": 0.9771923422813416,
"learning_rate": 4.9996722091640805e-06,
"loss": 1.3072,
"step": 113
},
{
"epoch": 0.26047220106626046,
"grad_norm": 0.9955299496650696,
"learning_rate": 4.999619841541234e-06,
"loss": 1.3501,
"step": 114
},
{
"epoch": 0.26275704493526275,
"grad_norm": 1.0125700235366821,
"learning_rate": 4.9995635952425205e-06,
"loss": 1.3387,
"step": 115
},
{
"epoch": 0.26504188880426505,
"grad_norm": 1.005936622619629,
"learning_rate": 4.999503470355215e-06,
"loss": 1.342,
"step": 116
},
{
"epoch": 0.26732673267326734,
"grad_norm": 0.9978262782096863,
"learning_rate": 4.999439466972616e-06,
"loss": 1.2954,
"step": 117
},
{
"epoch": 0.26961157654226964,
"grad_norm": 0.9668537974357605,
"learning_rate": 4.999371585194039e-06,
"loss": 1.3318,
"step": 118
},
{
"epoch": 0.2718964204112719,
"grad_norm": 1.0156077146530151,
"learning_rate": 4.999299825124814e-06,
"loss": 1.2681,
"step": 119
},
{
"epoch": 0.2741812642802742,
"grad_norm": 0.99967360496521,
"learning_rate": 4.999224186876293e-06,
"loss": 1.2666,
"step": 120
},
{
"epoch": 0.27646610814927647,
"grad_norm": 1.0085562467575073,
"learning_rate": 4.999144670565842e-06,
"loss": 1.3261,
"step": 121
},
{
"epoch": 0.27875095201827876,
"grad_norm": 1.0338691473007202,
"learning_rate": 4.999061276316851e-06,
"loss": 1.2943,
"step": 122
},
{
"epoch": 0.28103579588728106,
"grad_norm": 0.9880859851837158,
"learning_rate": 4.99897400425872e-06,
"loss": 1.3035,
"step": 123
},
{
"epoch": 0.2833206397562833,
"grad_norm": 0.9832742810249329,
"learning_rate": 4.998882854526872e-06,
"loss": 1.3015,
"step": 124
},
{
"epoch": 0.2856054836252856,
"grad_norm": 0.976040780544281,
"learning_rate": 4.998787827262743e-06,
"loss": 1.3325,
"step": 125
},
{
"epoch": 0.2878903274942879,
"grad_norm": 1.0309007167816162,
"learning_rate": 4.998688922613788e-06,
"loss": 1.2998,
"step": 126
},
{
"epoch": 0.2901751713632902,
"grad_norm": 1.0828396081924438,
"learning_rate": 4.998586140733477e-06,
"loss": 1.3093,
"step": 127
},
{
"epoch": 0.2924600152322925,
"grad_norm": 0.9725452661514282,
"learning_rate": 4.998479481781299e-06,
"loss": 1.2811,
"step": 128
},
{
"epoch": 0.2947448591012947,
"grad_norm": 0.9891279339790344,
"learning_rate": 4.998368945922757e-06,
"loss": 1.3104,
"step": 129
},
{
"epoch": 0.297029702970297,
"grad_norm": 1.022490382194519,
"learning_rate": 4.998254533329369e-06,
"loss": 1.3425,
"step": 130
},
{
"epoch": 0.2993145468392993,
"grad_norm": 1.00505530834198,
"learning_rate": 4.99813624417867e-06,
"loss": 1.3494,
"step": 131
},
{
"epoch": 0.3015993907083016,
"grad_norm": 1.033308982849121,
"learning_rate": 4.998014078654211e-06,
"loss": 1.278,
"step": 132
},
{
"epoch": 0.3038842345773039,
"grad_norm": 1.0194460153579712,
"learning_rate": 4.997888036945556e-06,
"loss": 1.2963,
"step": 133
},
{
"epoch": 0.30616907844630614,
"grad_norm": 1.005299687385559,
"learning_rate": 4.997758119248286e-06,
"loss": 1.3187,
"step": 134
},
{
"epoch": 0.30845392231530844,
"grad_norm": 1.0271679162979126,
"learning_rate": 4.997624325763994e-06,
"loss": 1.3106,
"step": 135
},
{
"epoch": 0.31073876618431073,
"grad_norm": 1.0343165397644043,
"learning_rate": 4.997486656700289e-06,
"loss": 1.3355,
"step": 136
},
{
"epoch": 0.31302361005331303,
"grad_norm": 1.0498188734054565,
"learning_rate": 4.997345112270792e-06,
"loss": 1.3126,
"step": 137
},
{
"epoch": 0.3153084539223153,
"grad_norm": 0.9742498993873596,
"learning_rate": 4.997199692695138e-06,
"loss": 1.3006,
"step": 138
},
{
"epoch": 0.3175932977913176,
"grad_norm": 1.0044124126434326,
"learning_rate": 4.997050398198977e-06,
"loss": 1.3298,
"step": 139
},
{
"epoch": 0.31987814166031986,
"grad_norm": 1.0173184871673584,
"learning_rate": 4.99689722901397e-06,
"loss": 1.3286,
"step": 140
},
{
"epoch": 0.32216298552932215,
"grad_norm": 0.9835124611854553,
"learning_rate": 4.99674018537779e-06,
"loss": 1.2937,
"step": 141
},
{
"epoch": 0.32444782939832445,
"grad_norm": 1.0389831066131592,
"learning_rate": 4.996579267534122e-06,
"loss": 1.3077,
"step": 142
},
{
"epoch": 0.32673267326732675,
"grad_norm": 1.0412015914916992,
"learning_rate": 4.996414475732664e-06,
"loss": 1.3131,
"step": 143
},
{
"epoch": 0.32901751713632904,
"grad_norm": 1.0527534484863281,
"learning_rate": 4.9962458102291254e-06,
"loss": 1.3075,
"step": 144
},
{
"epoch": 0.3313023610053313,
"grad_norm": 1.036034345626831,
"learning_rate": 4.9960732712852236e-06,
"loss": 1.3198,
"step": 145
},
{
"epoch": 0.3335872048743336,
"grad_norm": 1.0121785402297974,
"learning_rate": 4.99589685916869e-06,
"loss": 1.3346,
"step": 146
},
{
"epoch": 0.33587204874333587,
"grad_norm": 1.0597130060195923,
"learning_rate": 4.9957165741532635e-06,
"loss": 1.3025,
"step": 147
},
{
"epoch": 0.33815689261233817,
"grad_norm": 1.0982815027236938,
"learning_rate": 4.995532416518693e-06,
"loss": 1.3177,
"step": 148
},
{
"epoch": 0.34044173648134046,
"grad_norm": 1.012061357498169,
"learning_rate": 4.995344386550738e-06,
"loss": 1.2905,
"step": 149
},
{
"epoch": 0.3427265803503427,
"grad_norm": 1.0748074054718018,
"learning_rate": 4.995152484541166e-06,
"loss": 1.3191,
"step": 150
},
{
"epoch": 0.345011424219345,
"grad_norm": 1.0346341133117676,
"learning_rate": 4.994956710787752e-06,
"loss": 1.2923,
"step": 151
},
{
"epoch": 0.3472962680883473,
"grad_norm": 1.0333645343780518,
"learning_rate": 4.99475706559428e-06,
"loss": 1.3272,
"step": 152
},
{
"epoch": 0.3495811119573496,
"grad_norm": 1.0411094427108765,
"learning_rate": 4.9945535492705385e-06,
"loss": 1.3102,
"step": 153
},
{
"epoch": 0.3518659558263519,
"grad_norm": 1.0394591093063354,
"learning_rate": 4.994346162132329e-06,
"loss": 1.2912,
"step": 154
},
{
"epoch": 0.3541507996953541,
"grad_norm": 1.1258337497711182,
"learning_rate": 4.994134904501452e-06,
"loss": 1.295,
"step": 155
},
{
"epoch": 0.3564356435643564,
"grad_norm": 1.0196075439453125,
"learning_rate": 4.993919776705718e-06,
"loss": 1.2935,
"step": 156
},
{
"epoch": 0.3587204874333587,
"grad_norm": 1.020180583000183,
"learning_rate": 4.993700779078943e-06,
"loss": 1.3118,
"step": 157
},
{
"epoch": 0.361005331302361,
"grad_norm": 1.1170531511306763,
"learning_rate": 4.993477911960948e-06,
"loss": 1.2924,
"step": 158
},
{
"epoch": 0.3632901751713633,
"grad_norm": 1.0637717247009277,
"learning_rate": 4.993251175697554e-06,
"loss": 1.2797,
"step": 159
},
{
"epoch": 0.3655750190403656,
"grad_norm": 1.046305775642395,
"learning_rate": 4.993020570640592e-06,
"loss": 1.3142,
"step": 160
},
{
"epoch": 0.36785986290936784,
"grad_norm": 1.039476752281189,
"learning_rate": 4.992786097147892e-06,
"loss": 1.2773,
"step": 161
},
{
"epoch": 0.37014470677837014,
"grad_norm": 1.0379183292388916,
"learning_rate": 4.992547755583288e-06,
"loss": 1.3057,
"step": 162
},
{
"epoch": 0.37242955064737243,
"grad_norm": 1.0063403844833374,
"learning_rate": 4.992305546316617e-06,
"loss": 1.3108,
"step": 163
},
{
"epoch": 0.3747143945163747,
"grad_norm": 1.0467029809951782,
"learning_rate": 4.992059469723716e-06,
"loss": 1.2675,
"step": 164
},
{
"epoch": 0.376999238385377,
"grad_norm": 0.9822115898132324,
"learning_rate": 4.991809526186424e-06,
"loss": 1.2987,
"step": 165
},
{
"epoch": 0.37928408225437926,
"grad_norm": 0.9957991242408752,
"learning_rate": 4.9915557160925795e-06,
"loss": 1.2927,
"step": 166
},
{
"epoch": 0.38156892612338156,
"grad_norm": 1.020486831665039,
"learning_rate": 4.991298039836021e-06,
"loss": 1.2891,
"step": 167
},
{
"epoch": 0.38385376999238385,
"grad_norm": 0.9941042065620422,
"learning_rate": 4.991036497816587e-06,
"loss": 1.3279,
"step": 168
},
{
"epoch": 0.38613861386138615,
"grad_norm": 1.030573844909668,
"learning_rate": 4.990771090440114e-06,
"loss": 1.2715,
"step": 169
},
{
"epoch": 0.38842345773038844,
"grad_norm": 0.9810742735862732,
"learning_rate": 4.990501818118436e-06,
"loss": 1.2808,
"step": 170
},
{
"epoch": 0.3907083015993907,
"grad_norm": 1.0300201177597046,
"learning_rate": 4.990228681269383e-06,
"loss": 1.3079,
"step": 171
},
{
"epoch": 0.392993145468393,
"grad_norm": 1.0107353925704956,
"learning_rate": 4.989951680316787e-06,
"loss": 1.2872,
"step": 172
},
{
"epoch": 0.3952779893373953,
"grad_norm": 1.0361515283584595,
"learning_rate": 4.989670815690469e-06,
"loss": 1.2784,
"step": 173
},
{
"epoch": 0.39756283320639757,
"grad_norm": 1.0452970266342163,
"learning_rate": 4.989386087826248e-06,
"loss": 1.2976,
"step": 174
},
{
"epoch": 0.39984767707539987,
"grad_norm": 1.0585196018218994,
"learning_rate": 4.9890974971659405e-06,
"loss": 1.2921,
"step": 175
},
{
"epoch": 0.4021325209444021,
"grad_norm": 1.018211007118225,
"learning_rate": 4.988805044157353e-06,
"loss": 1.3046,
"step": 176
},
{
"epoch": 0.4044173648134044,
"grad_norm": 1.0587507486343384,
"learning_rate": 4.9885087292542865e-06,
"loss": 1.2901,
"step": 177
},
{
"epoch": 0.4067022086824067,
"grad_norm": 1.0261503458023071,
"learning_rate": 4.988208552916535e-06,
"loss": 1.3081,
"step": 178
},
{
"epoch": 0.408987052551409,
"grad_norm": 1.0412943363189697,
"learning_rate": 4.9879045156098846e-06,
"loss": 1.3052,
"step": 179
},
{
"epoch": 0.4112718964204113,
"grad_norm": 1.0323666334152222,
"learning_rate": 4.987596617806111e-06,
"loss": 1.3048,
"step": 180
},
{
"epoch": 0.4135567402894136,
"grad_norm": 1.0095067024230957,
"learning_rate": 4.9872848599829825e-06,
"loss": 1.3292,
"step": 181
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.9761002659797668,
"learning_rate": 4.986969242624254e-06,
"loss": 1.2884,
"step": 182
},
{
"epoch": 0.4181264280274181,
"grad_norm": 1.0436338186264038,
"learning_rate": 4.986649766219671e-06,
"loss": 1.3211,
"step": 183
},
{
"epoch": 0.4204112718964204,
"grad_norm": 1.0505225658416748,
"learning_rate": 4.986326431264969e-06,
"loss": 1.2863,
"step": 184
},
{
"epoch": 0.4226961157654227,
"grad_norm": 1.006611943244934,
"learning_rate": 4.985999238261867e-06,
"loss": 1.2812,
"step": 185
},
{
"epoch": 0.424980959634425,
"grad_norm": 1.0494719743728638,
"learning_rate": 4.985668187718073e-06,
"loss": 1.3105,
"step": 186
},
{
"epoch": 0.42726580350342724,
"grad_norm": 0.9847164750099182,
"learning_rate": 4.985333280147281e-06,
"loss": 1.2811,
"step": 187
},
{
"epoch": 0.42955064737242954,
"grad_norm": 1.0337165594100952,
"learning_rate": 4.984994516069168e-06,
"loss": 1.2876,
"step": 188
},
{
"epoch": 0.43183549124143183,
"grad_norm": 1.0178074836730957,
"learning_rate": 4.984651896009396e-06,
"loss": 1.2597,
"step": 189
},
{
"epoch": 0.43412033511043413,
"grad_norm": 1.0170668363571167,
"learning_rate": 4.984305420499612e-06,
"loss": 1.2916,
"step": 190
},
{
"epoch": 0.4364051789794364,
"grad_norm": 1.0148853063583374,
"learning_rate": 4.983955090077445e-06,
"loss": 1.2785,
"step": 191
},
{
"epoch": 0.43869002284843867,
"grad_norm": 1.0563602447509766,
"learning_rate": 4.983600905286502e-06,
"loss": 1.295,
"step": 192
},
{
"epoch": 0.44097486671744096,
"grad_norm": 0.9817858338356018,
"learning_rate": 4.983242866676376e-06,
"loss": 1.2832,
"step": 193
},
{
"epoch": 0.44325971058644326,
"grad_norm": 1.0299488306045532,
"learning_rate": 4.982880974802638e-06,
"loss": 1.2952,
"step": 194
},
{
"epoch": 0.44554455445544555,
"grad_norm": 0.9951279163360596,
"learning_rate": 4.982515230226837e-06,
"loss": 1.2901,
"step": 195
},
{
"epoch": 0.44782939832444785,
"grad_norm": 1.0001885890960693,
"learning_rate": 4.982145633516501e-06,
"loss": 1.2554,
"step": 196
},
{
"epoch": 0.4501142421934501,
"grad_norm": 1.0821017026901245,
"learning_rate": 4.981772185245135e-06,
"loss": 1.2903,
"step": 197
},
{
"epoch": 0.4523990860624524,
"grad_norm": 1.0269831418991089,
"learning_rate": 4.981394885992223e-06,
"loss": 1.3077,
"step": 198
},
{
"epoch": 0.4546839299314547,
"grad_norm": 1.025965929031372,
"learning_rate": 4.981013736343221e-06,
"loss": 1.2771,
"step": 199
},
{
"epoch": 0.456968773800457,
"grad_norm": 0.9828860759735107,
"learning_rate": 4.980628736889562e-06,
"loss": 1.2788,
"step": 200
},
{
"epoch": 0.45925361766945927,
"grad_norm": 1.077913761138916,
"learning_rate": 4.9802398882286515e-06,
"loss": 1.2815,
"step": 201
},
{
"epoch": 0.46153846153846156,
"grad_norm": 1.1024688482284546,
"learning_rate": 4.97984719096387e-06,
"loss": 1.3135,
"step": 202
},
{
"epoch": 0.4638233054074638,
"grad_norm": 1.0494202375411987,
"learning_rate": 4.979450645704567e-06,
"loss": 1.3027,
"step": 203
},
{
"epoch": 0.4661081492764661,
"grad_norm": 1.0050199031829834,
"learning_rate": 4.979050253066064e-06,
"loss": 1.3016,
"step": 204
},
{
"epoch": 0.4683929931454684,
"grad_norm": 1.0264744758605957,
"learning_rate": 4.978646013669652e-06,
"loss": 1.343,
"step": 205
},
{
"epoch": 0.4706778370144707,
"grad_norm": 1.001989722251892,
"learning_rate": 4.978237928142594e-06,
"loss": 1.3088,
"step": 206
},
{
"epoch": 0.472962680883473,
"grad_norm": 1.0501984357833862,
"learning_rate": 4.977825997118119e-06,
"loss": 1.2875,
"step": 207
},
{
"epoch": 0.4752475247524752,
"grad_norm": 1.0487364530563354,
"learning_rate": 4.977410221235421e-06,
"loss": 1.2917,
"step": 208
},
{
"epoch": 0.4775323686214775,
"grad_norm": 1.0768541097640991,
"learning_rate": 4.976990601139662e-06,
"loss": 1.3,
"step": 209
},
{
"epoch": 0.4798172124904798,
"grad_norm": 0.9696170687675476,
"learning_rate": 4.9765671374819715e-06,
"loss": 1.2822,
"step": 210
},
{
"epoch": 0.4821020563594821,
"grad_norm": 0.9987464547157288,
"learning_rate": 4.9761398309194385e-06,
"loss": 1.3076,
"step": 211
},
{
"epoch": 0.4843869002284844,
"grad_norm": 1.0254422426223755,
"learning_rate": 4.975708682115118e-06,
"loss": 1.281,
"step": 212
},
{
"epoch": 0.48667174409748665,
"grad_norm": 1.0040076971054077,
"learning_rate": 4.9752736917380274e-06,
"loss": 1.2821,
"step": 213
},
{
"epoch": 0.48895658796648894,
"grad_norm": 1.004184365272522,
"learning_rate": 4.9748348604631416e-06,
"loss": 1.2641,
"step": 214
},
{
"epoch": 0.49124143183549124,
"grad_norm": 1.0694694519042969,
"learning_rate": 4.9743921889714005e-06,
"loss": 1.2853,
"step": 215
},
{
"epoch": 0.49352627570449353,
"grad_norm": 1.0564874410629272,
"learning_rate": 4.973945677949699e-06,
"loss": 1.2882,
"step": 216
},
{
"epoch": 0.49581111957349583,
"grad_norm": 1.0076894760131836,
"learning_rate": 4.973495328090891e-06,
"loss": 1.2868,
"step": 217
},
{
"epoch": 0.49809596344249807,
"grad_norm": 1.0476043224334717,
"learning_rate": 4.973041140093786e-06,
"loss": 1.2642,
"step": 218
},
{
"epoch": 0.5003808073115004,
"grad_norm": 1.050991415977478,
"learning_rate": 4.972583114663153e-06,
"loss": 1.2751,
"step": 219
},
{
"epoch": 0.5026656511805027,
"grad_norm": 0.9902971386909485,
"learning_rate": 4.972121252509712e-06,
"loss": 1.2685,
"step": 220
},
{
"epoch": 0.504950495049505,
"grad_norm": 1.0011303424835205,
"learning_rate": 4.971655554350137e-06,
"loss": 1.2829,
"step": 221
},
{
"epoch": 0.5072353389185073,
"grad_norm": 1.010233998298645,
"learning_rate": 4.971186020907054e-06,
"loss": 1.277,
"step": 222
},
{
"epoch": 0.5095201827875095,
"grad_norm": 1.0275652408599854,
"learning_rate": 4.970712652909042e-06,
"loss": 1.2971,
"step": 223
},
{
"epoch": 0.5118050266565118,
"grad_norm": 1.0285537242889404,
"learning_rate": 4.970235451090629e-06,
"loss": 1.231,
"step": 224
},
{
"epoch": 0.5140898705255141,
"grad_norm": 1.0604579448699951,
"learning_rate": 4.969754416192292e-06,
"loss": 1.269,
"step": 225
},
{
"epoch": 0.5163747143945163,
"grad_norm": 1.0375958681106567,
"learning_rate": 4.969269548960456e-06,
"loss": 1.2712,
"step": 226
},
{
"epoch": 0.5186595582635186,
"grad_norm": 1.037304401397705,
"learning_rate": 4.9687808501474925e-06,
"loss": 1.2826,
"step": 227
},
{
"epoch": 0.5209444021325209,
"grad_norm": 1.0280749797821045,
"learning_rate": 4.968288320511718e-06,
"loss": 1.2726,
"step": 228
},
{
"epoch": 0.5232292460015232,
"grad_norm": 1.0595530271530151,
"learning_rate": 4.967791960817395e-06,
"loss": 1.281,
"step": 229
},
{
"epoch": 0.5255140898705255,
"grad_norm": 0.9964226484298706,
"learning_rate": 4.967291771834727e-06,
"loss": 1.3188,
"step": 230
},
{
"epoch": 0.5277989337395278,
"grad_norm": 1.0433804988861084,
"learning_rate": 4.966787754339861e-06,
"loss": 1.274,
"step": 231
},
{
"epoch": 0.5300837776085301,
"grad_norm": 1.079641580581665,
"learning_rate": 4.966279909114883e-06,
"loss": 1.2991,
"step": 232
},
{
"epoch": 0.5323686214775324,
"grad_norm": 1.0351816415786743,
"learning_rate": 4.965768236947821e-06,
"loss": 1.2659,
"step": 233
},
{
"epoch": 0.5346534653465347,
"grad_norm": 1.0495244264602661,
"learning_rate": 4.96525273863264e-06,
"loss": 1.2898,
"step": 234
},
{
"epoch": 0.536938309215537,
"grad_norm": 1.0479910373687744,
"learning_rate": 4.964733414969241e-06,
"loss": 1.2536,
"step": 235
},
{
"epoch": 0.5392231530845393,
"grad_norm": 1.0365879535675049,
"learning_rate": 4.964210266763461e-06,
"loss": 1.2369,
"step": 236
},
{
"epoch": 0.5415079969535415,
"grad_norm": 1.0398730039596558,
"learning_rate": 4.9636832948270745e-06,
"loss": 1.2669,
"step": 237
},
{
"epoch": 0.5437928408225438,
"grad_norm": 1.0146657228469849,
"learning_rate": 4.963152499977786e-06,
"loss": 1.2893,
"step": 238
},
{
"epoch": 0.546077684691546,
"grad_norm": 1.0974043607711792,
"learning_rate": 4.962617883039233e-06,
"loss": 1.2452,
"step": 239
},
{
"epoch": 0.5483625285605483,
"grad_norm": 0.9900649189949036,
"learning_rate": 4.962079444840985e-06,
"loss": 1.2215,
"step": 240
},
{
"epoch": 0.5506473724295506,
"grad_norm": 1.003464937210083,
"learning_rate": 4.9615371862185394e-06,
"loss": 1.2744,
"step": 241
},
{
"epoch": 0.5529322162985529,
"grad_norm": 1.004382848739624,
"learning_rate": 4.960991108013322e-06,
"loss": 1.271,
"step": 242
},
{
"epoch": 0.5552170601675552,
"grad_norm": 1.0129280090332031,
"learning_rate": 4.960441211072686e-06,
"loss": 1.2874,
"step": 243
},
{
"epoch": 0.5575019040365575,
"grad_norm": 1.040189266204834,
"learning_rate": 4.9598874962499096e-06,
"loss": 1.2918,
"step": 244
},
{
"epoch": 0.5597867479055598,
"grad_norm": 1.0145982503890991,
"learning_rate": 4.959329964404197e-06,
"loss": 1.2713,
"step": 245
},
{
"epoch": 0.5620715917745621,
"grad_norm": 1.0469987392425537,
"learning_rate": 4.958768616400672e-06,
"loss": 1.2689,
"step": 246
},
{
"epoch": 0.5643564356435643,
"grad_norm": 1.0191642045974731,
"learning_rate": 4.958203453110384e-06,
"loss": 1.2718,
"step": 247
},
{
"epoch": 0.5666412795125666,
"grad_norm": 1.0718231201171875,
"learning_rate": 4.957634475410298e-06,
"loss": 1.3128,
"step": 248
},
{
"epoch": 0.5689261233815689,
"grad_norm": 1.0109634399414062,
"learning_rate": 4.957061684183301e-06,
"loss": 1.2586,
"step": 249
},
{
"epoch": 0.5712109672505712,
"grad_norm": 0.9942657947540283,
"learning_rate": 4.956485080318198e-06,
"loss": 1.328,
"step": 250
},
{
"epoch": 0.5734958111195735,
"grad_norm": 1.0184757709503174,
"learning_rate": 4.955904664709707e-06,
"loss": 1.2815,
"step": 251
},
{
"epoch": 0.5757806549885758,
"grad_norm": 1.015625,
"learning_rate": 4.955320438258465e-06,
"loss": 1.2585,
"step": 252
},
{
"epoch": 0.5780654988575781,
"grad_norm": 0.9848981499671936,
"learning_rate": 4.954732401871018e-06,
"loss": 1.2866,
"step": 253
},
{
"epoch": 0.5803503427265804,
"grad_norm": 1.0482749938964844,
"learning_rate": 4.954140556459826e-06,
"loss": 1.2732,
"step": 254
},
{
"epoch": 0.5826351865955827,
"grad_norm": 1.0250680446624756,
"learning_rate": 4.95354490294326e-06,
"loss": 1.3053,
"step": 255
},
{
"epoch": 0.584920030464585,
"grad_norm": 1.0545597076416016,
"learning_rate": 4.952945442245598e-06,
"loss": 1.2638,
"step": 256
},
{
"epoch": 0.5872048743335873,
"grad_norm": 1.044873833656311,
"learning_rate": 4.952342175297028e-06,
"loss": 1.2683,
"step": 257
},
{
"epoch": 0.5894897182025894,
"grad_norm": 1.0361744165420532,
"learning_rate": 4.951735103033644e-06,
"loss": 1.2887,
"step": 258
},
{
"epoch": 0.5917745620715917,
"grad_norm": 1.0238685607910156,
"learning_rate": 4.951124226397441e-06,
"loss": 1.2736,
"step": 259
},
{
"epoch": 0.594059405940594,
"grad_norm": 1.0217833518981934,
"learning_rate": 4.950509546336323e-06,
"loss": 1.2681,
"step": 260
},
{
"epoch": 0.5963442498095963,
"grad_norm": 1.0546188354492188,
"learning_rate": 4.949891063804091e-06,
"loss": 1.2582,
"step": 261
},
{
"epoch": 0.5986290936785986,
"grad_norm": 1.0834907293319702,
"learning_rate": 4.94926877976045e-06,
"loss": 1.2487,
"step": 262
},
{
"epoch": 0.6009139375476009,
"grad_norm": 1.062184453010559,
"learning_rate": 4.948642695171e-06,
"loss": 1.3188,
"step": 263
},
{
"epoch": 0.6031987814166032,
"grad_norm": 1.0373252630233765,
"learning_rate": 4.948012811007242e-06,
"loss": 1.277,
"step": 264
},
{
"epoch": 0.6054836252856055,
"grad_norm": 1.0140316486358643,
"learning_rate": 4.947379128246571e-06,
"loss": 1.2617,
"step": 265
},
{
"epoch": 0.6077684691546078,
"grad_norm": 1.054410696029663,
"learning_rate": 4.946741647872277e-06,
"loss": 1.238,
"step": 266
},
{
"epoch": 0.6100533130236101,
"grad_norm": 1.0967663526535034,
"learning_rate": 4.94610037087354e-06,
"loss": 1.2682,
"step": 267
},
{
"epoch": 0.6123381568926123,
"grad_norm": 1.043338656425476,
"learning_rate": 4.945455298245436e-06,
"loss": 1.2572,
"step": 268
},
{
"epoch": 0.6146230007616146,
"grad_norm": 1.0187970399856567,
"learning_rate": 4.944806430988927e-06,
"loss": 1.2613,
"step": 269
},
{
"epoch": 0.6169078446306169,
"grad_norm": 1.0666472911834717,
"learning_rate": 4.9441537701108654e-06,
"loss": 1.2611,
"step": 270
},
{
"epoch": 0.6191926884996192,
"grad_norm": 1.0025635957717896,
"learning_rate": 4.943497316623988e-06,
"loss": 1.2519,
"step": 271
},
{
"epoch": 0.6214775323686215,
"grad_norm": 1.0135135650634766,
"learning_rate": 4.942837071546919e-06,
"loss": 1.2759,
"step": 272
},
{
"epoch": 0.6237623762376238,
"grad_norm": 0.9985151886940002,
"learning_rate": 4.942173035904164e-06,
"loss": 1.2844,
"step": 273
},
{
"epoch": 0.6260472201066261,
"grad_norm": 0.9952817559242249,
"learning_rate": 4.941505210726112e-06,
"loss": 1.2356,
"step": 274
},
{
"epoch": 0.6283320639756284,
"grad_norm": 1.0448962450027466,
"learning_rate": 4.9408335970490305e-06,
"loss": 1.2587,
"step": 275
},
{
"epoch": 0.6306169078446306,
"grad_norm": 1.011099100112915,
"learning_rate": 4.940158195915067e-06,
"loss": 1.2729,
"step": 276
},
{
"epoch": 0.6329017517136329,
"grad_norm": 1.052904725074768,
"learning_rate": 4.939479008372247e-06,
"loss": 1.2536,
"step": 277
},
{
"epoch": 0.6351865955826352,
"grad_norm": 1.058173418045044,
"learning_rate": 4.938796035474469e-06,
"loss": 1.2807,
"step": 278
},
{
"epoch": 0.6374714394516374,
"grad_norm": 1.022147536277771,
"learning_rate": 4.938109278281506e-06,
"loss": 1.2887,
"step": 279
},
{
"epoch": 0.6397562833206397,
"grad_norm": 1.0064011812210083,
"learning_rate": 4.937418737859004e-06,
"loss": 1.2192,
"step": 280
},
{
"epoch": 0.642041127189642,
"grad_norm": 1.0092360973358154,
"learning_rate": 4.936724415278479e-06,
"loss": 1.3159,
"step": 281
},
{
"epoch": 0.6443259710586443,
"grad_norm": 1.076401710510254,
"learning_rate": 4.936026311617316e-06,
"loss": 1.2872,
"step": 282
},
{
"epoch": 0.6466108149276466,
"grad_norm": 1.057209849357605,
"learning_rate": 4.935324427958766e-06,
"loss": 1.257,
"step": 283
},
{
"epoch": 0.6488956587966489,
"grad_norm": 1.1738762855529785,
"learning_rate": 4.934618765391946e-06,
"loss": 1.2547,
"step": 284
},
{
"epoch": 0.6511805026656512,
"grad_norm": 1.0405137538909912,
"learning_rate": 4.933909325011838e-06,
"loss": 1.2766,
"step": 285
},
{
"epoch": 0.6534653465346535,
"grad_norm": 1.0377894639968872,
"learning_rate": 4.933196107919286e-06,
"loss": 1.2624,
"step": 286
},
{
"epoch": 0.6557501904036558,
"grad_norm": 1.032714605331421,
"learning_rate": 4.932479115220991e-06,
"loss": 1.2527,
"step": 287
},
{
"epoch": 0.6580350342726581,
"grad_norm": 1.0755581855773926,
"learning_rate": 4.9317583480295175e-06,
"loss": 1.2966,
"step": 288
},
{
"epoch": 0.6603198781416603,
"grad_norm": 1.0262556076049805,
"learning_rate": 4.931033807463283e-06,
"loss": 1.2585,
"step": 289
},
{
"epoch": 0.6626047220106626,
"grad_norm": 1.0510430335998535,
"learning_rate": 4.930305494646562e-06,
"loss": 1.2662,
"step": 290
},
{
"epoch": 0.6648895658796649,
"grad_norm": 1.035854458808899,
"learning_rate": 4.9295734107094825e-06,
"loss": 1.2346,
"step": 291
},
{
"epoch": 0.6671744097486672,
"grad_norm": 1.0485846996307373,
"learning_rate": 4.928837556788023e-06,
"loss": 1.2978,
"step": 292
},
{
"epoch": 0.6694592536176694,
"grad_norm": 1.02550208568573,
"learning_rate": 4.928097934024013e-06,
"loss": 1.2478,
"step": 293
},
{
"epoch": 0.6717440974866717,
"grad_norm": 1.0328837633132935,
"learning_rate": 4.927354543565131e-06,
"loss": 1.2788,
"step": 294
},
{
"epoch": 0.674028941355674,
"grad_norm": 0.9913997054100037,
"learning_rate": 4.926607386564898e-06,
"loss": 1.2423,
"step": 295
},
{
"epoch": 0.6763137852246763,
"grad_norm": 1.0034306049346924,
"learning_rate": 4.925856464182685e-06,
"loss": 1.2562,
"step": 296
},
{
"epoch": 0.6785986290936786,
"grad_norm": 1.0546495914459229,
"learning_rate": 4.925101777583701e-06,
"loss": 1.2598,
"step": 297
},
{
"epoch": 0.6808834729626809,
"grad_norm": 1.0412935018539429,
"learning_rate": 4.924343327938999e-06,
"loss": 1.2744,
"step": 298
},
{
"epoch": 0.6831683168316832,
"grad_norm": 1.0731669664382935,
"learning_rate": 4.923581116425471e-06,
"loss": 1.2912,
"step": 299
},
{
"epoch": 0.6854531607006854,
"grad_norm": 1.0394880771636963,
"learning_rate": 4.922815144225843e-06,
"loss": 1.276,
"step": 300
},
{
"epoch": 0.6877380045696877,
"grad_norm": 1.0383579730987549,
"learning_rate": 4.92204541252868e-06,
"loss": 1.255,
"step": 301
},
{
"epoch": 0.69002284843869,
"grad_norm": 1.0251744985580444,
"learning_rate": 4.92127192252838e-06,
"loss": 1.2688,
"step": 302
},
{
"epoch": 0.6923076923076923,
"grad_norm": 1.017650842666626,
"learning_rate": 4.9204946754251724e-06,
"loss": 1.2818,
"step": 303
},
{
"epoch": 0.6945925361766946,
"grad_norm": 1.0219080448150635,
"learning_rate": 4.919713672425116e-06,
"loss": 1.2828,
"step": 304
},
{
"epoch": 0.6968773800456969,
"grad_norm": 1.0862151384353638,
"learning_rate": 4.918928914740098e-06,
"loss": 1.2514,
"step": 305
},
{
"epoch": 0.6991622239146992,
"grad_norm": 1.0639281272888184,
"learning_rate": 4.918140403587831e-06,
"loss": 1.2739,
"step": 306
},
{
"epoch": 0.7014470677837015,
"grad_norm": 1.0512444972991943,
"learning_rate": 4.9173481401918556e-06,
"loss": 1.2576,
"step": 307
},
{
"epoch": 0.7037319116527038,
"grad_norm": 1.0291866064071655,
"learning_rate": 4.916552125781529e-06,
"loss": 1.2934,
"step": 308
},
{
"epoch": 0.7060167555217061,
"grad_norm": 1.0338629484176636,
"learning_rate": 4.915752361592032e-06,
"loss": 1.263,
"step": 309
},
{
"epoch": 0.7083015993907082,
"grad_norm": 1.0358542203903198,
"learning_rate": 4.914948848864365e-06,
"loss": 1.2453,
"step": 310
},
{
"epoch": 0.7105864432597105,
"grad_norm": 1.1184923648834229,
"learning_rate": 4.914141588845344e-06,
"loss": 1.2653,
"step": 311
},
{
"epoch": 0.7128712871287128,
"grad_norm": 1.0791000127792358,
"learning_rate": 4.913330582787598e-06,
"loss": 1.2659,
"step": 312
},
{
"epoch": 0.7151561309977151,
"grad_norm": 1.0901819467544556,
"learning_rate": 4.912515831949571e-06,
"loss": 1.2208,
"step": 313
},
{
"epoch": 0.7174409748667174,
"grad_norm": 1.0219902992248535,
"learning_rate": 4.9116973375955166e-06,
"loss": 1.2711,
"step": 314
},
{
"epoch": 0.7197258187357197,
"grad_norm": 1.014364242553711,
"learning_rate": 4.910875100995499e-06,
"loss": 1.2877,
"step": 315
},
{
"epoch": 0.722010662604722,
"grad_norm": 1.0699234008789062,
"learning_rate": 4.910049123425386e-06,
"loss": 1.2425,
"step": 316
},
{
"epoch": 0.7242955064737243,
"grad_norm": 1.0614267587661743,
"learning_rate": 4.9092194061668535e-06,
"loss": 1.2475,
"step": 317
},
{
"epoch": 0.7265803503427266,
"grad_norm": 1.0620336532592773,
"learning_rate": 4.908385950507378e-06,
"loss": 1.2618,
"step": 318
},
{
"epoch": 0.7288651942117289,
"grad_norm": 1.0389032363891602,
"learning_rate": 4.90754875774024e-06,
"loss": 1.2742,
"step": 319
},
{
"epoch": 0.7311500380807312,
"grad_norm": 0.9754124879837036,
"learning_rate": 4.9067078291645144e-06,
"loss": 1.25,
"step": 320
},
{
"epoch": 0.7334348819497334,
"grad_norm": 1.056058406829834,
"learning_rate": 4.905863166085076e-06,
"loss": 1.2451,
"step": 321
},
{
"epoch": 0.7357197258187357,
"grad_norm": 1.0641580820083618,
"learning_rate": 4.9050147698125944e-06,
"loss": 1.2532,
"step": 322
},
{
"epoch": 0.738004569687738,
"grad_norm": 1.0407251119613647,
"learning_rate": 4.904162641663532e-06,
"loss": 1.3103,
"step": 323
},
{
"epoch": 0.7402894135567403,
"grad_norm": 1.0477187633514404,
"learning_rate": 4.9033067829601385e-06,
"loss": 1.2658,
"step": 324
},
{
"epoch": 0.7425742574257426,
"grad_norm": 1.0202401876449585,
"learning_rate": 4.902447195030459e-06,
"loss": 1.2569,
"step": 325
},
{
"epoch": 0.7448591012947449,
"grad_norm": 1.0629253387451172,
"learning_rate": 4.9015838792083196e-06,
"loss": 1.247,
"step": 326
},
{
"epoch": 0.7471439451637472,
"grad_norm": 1.0284748077392578,
"learning_rate": 4.900716836833333e-06,
"loss": 1.2659,
"step": 327
},
{
"epoch": 0.7494287890327495,
"grad_norm": 1.0653586387634277,
"learning_rate": 4.899846069250894e-06,
"loss": 1.2673,
"step": 328
},
{
"epoch": 0.7517136329017517,
"grad_norm": 1.0795682668685913,
"learning_rate": 4.898971577812179e-06,
"loss": 1.2778,
"step": 329
},
{
"epoch": 0.753998476770754,
"grad_norm": 1.0359232425689697,
"learning_rate": 4.8980933638741426e-06,
"loss": 1.2732,
"step": 330
},
{
"epoch": 0.7562833206397562,
"grad_norm": 1.0286237001419067,
"learning_rate": 4.897211428799512e-06,
"loss": 1.2455,
"step": 331
},
{
"epoch": 0.7585681645087585,
"grad_norm": 1.0179105997085571,
"learning_rate": 4.896325773956793e-06,
"loss": 1.2413,
"step": 332
},
{
"epoch": 0.7608530083777608,
"grad_norm": 1.0381865501403809,
"learning_rate": 4.895436400720264e-06,
"loss": 1.2409,
"step": 333
},
{
"epoch": 0.7631378522467631,
"grad_norm": 0.9918906688690186,
"learning_rate": 4.894543310469968e-06,
"loss": 1.2556,
"step": 334
},
{
"epoch": 0.7654226961157654,
"grad_norm": 1.0300416946411133,
"learning_rate": 4.8936465045917204e-06,
"loss": 1.2325,
"step": 335
},
{
"epoch": 0.7677075399847677,
"grad_norm": 1.052534580230713,
"learning_rate": 4.8927459844770995e-06,
"loss": 1.2561,
"step": 336
},
{
"epoch": 0.76999238385377,
"grad_norm": 1.0454604625701904,
"learning_rate": 4.891841751523448e-06,
"loss": 1.2845,
"step": 337
},
{
"epoch": 0.7722772277227723,
"grad_norm": 1.0518709421157837,
"learning_rate": 4.8909338071338706e-06,
"loss": 1.2485,
"step": 338
},
{
"epoch": 0.7745620715917746,
"grad_norm": 1.0326422452926636,
"learning_rate": 4.890022152717231e-06,
"loss": 1.2757,
"step": 339
},
{
"epoch": 0.7768469154607769,
"grad_norm": 1.2617943286895752,
"learning_rate": 4.889106789688148e-06,
"loss": 1.2656,
"step": 340
},
{
"epoch": 0.7791317593297792,
"grad_norm": 1.0038459300994873,
"learning_rate": 4.888187719466996e-06,
"loss": 1.2636,
"step": 341
},
{
"epoch": 0.7814166031987814,
"grad_norm": 1.1393420696258545,
"learning_rate": 4.887264943479903e-06,
"loss": 1.2621,
"step": 342
},
{
"epoch": 0.7837014470677837,
"grad_norm": 1.0969446897506714,
"learning_rate": 4.8863384631587446e-06,
"loss": 1.2208,
"step": 343
},
{
"epoch": 0.785986290936786,
"grad_norm": 1.034393310546875,
"learning_rate": 4.885408279941148e-06,
"loss": 1.2101,
"step": 344
},
{
"epoch": 0.7882711348057883,
"grad_norm": 1.1397764682769775,
"learning_rate": 4.884474395270484e-06,
"loss": 1.2823,
"step": 345
},
{
"epoch": 0.7905559786747905,
"grad_norm": 1.1488789319992065,
"learning_rate": 4.883536810595867e-06,
"loss": 1.2615,
"step": 346
},
{
"epoch": 0.7928408225437928,
"grad_norm": 1.0274580717086792,
"learning_rate": 4.8825955273721524e-06,
"loss": 1.2334,
"step": 347
},
{
"epoch": 0.7951256664127951,
"grad_norm": 1.0355713367462158,
"learning_rate": 4.8816505470599365e-06,
"loss": 1.2224,
"step": 348
},
{
"epoch": 0.7974105102817974,
"grad_norm": 1.0540703535079956,
"learning_rate": 4.880701871125551e-06,
"loss": 1.262,
"step": 349
},
{
"epoch": 0.7996953541507997,
"grad_norm": 1.0765819549560547,
"learning_rate": 4.879749501041062e-06,
"loss": 1.2731,
"step": 350
},
{
"epoch": 0.801980198019802,
"grad_norm": 1.0639638900756836,
"learning_rate": 4.878793438284268e-06,
"loss": 1.2673,
"step": 351
},
{
"epoch": 0.8042650418888042,
"grad_norm": 1.0149368047714233,
"learning_rate": 4.877833684338698e-06,
"loss": 1.2479,
"step": 352
},
{
"epoch": 0.8065498857578065,
"grad_norm": 1.1710435152053833,
"learning_rate": 4.876870240693608e-06,
"loss": 1.2775,
"step": 353
},
{
"epoch": 0.8088347296268088,
"grad_norm": 1.1317570209503174,
"learning_rate": 4.875903108843979e-06,
"loss": 1.2732,
"step": 354
},
{
"epoch": 0.8111195734958111,
"grad_norm": 1.0417158603668213,
"learning_rate": 4.874932290290517e-06,
"loss": 1.2647,
"step": 355
},
{
"epoch": 0.8134044173648134,
"grad_norm": 1.073765516281128,
"learning_rate": 4.873957786539646e-06,
"loss": 1.2738,
"step": 356
},
{
"epoch": 0.8156892612338157,
"grad_norm": 1.018481731414795,
"learning_rate": 4.872979599103511e-06,
"loss": 1.2509,
"step": 357
},
{
"epoch": 0.817974105102818,
"grad_norm": 1.0737470388412476,
"learning_rate": 4.8719977294999695e-06,
"loss": 1.232,
"step": 358
},
{
"epoch": 0.8202589489718203,
"grad_norm": 1.0921229124069214,
"learning_rate": 4.871012179252597e-06,
"loss": 1.2342,
"step": 359
},
{
"epoch": 0.8225437928408226,
"grad_norm": 1.0502641201019287,
"learning_rate": 4.870022949890676e-06,
"loss": 1.2463,
"step": 360
},
{
"epoch": 0.8248286367098249,
"grad_norm": 1.1755155324935913,
"learning_rate": 4.869030042949202e-06,
"loss": 1.2625,
"step": 361
},
{
"epoch": 0.8271134805788272,
"grad_norm": 1.0167341232299805,
"learning_rate": 4.868033459968874e-06,
"loss": 1.2563,
"step": 362
},
{
"epoch": 0.8293983244478293,
"grad_norm": 1.0481575727462769,
"learning_rate": 4.8670332024960954e-06,
"loss": 1.2541,
"step": 363
},
{
"epoch": 0.8316831683168316,
"grad_norm": 1.0657804012298584,
"learning_rate": 4.866029272082973e-06,
"loss": 1.2444,
"step": 364
},
{
"epoch": 0.8339680121858339,
"grad_norm": 1.0473397970199585,
"learning_rate": 4.865021670287311e-06,
"loss": 1.2356,
"step": 365
},
{
"epoch": 0.8362528560548362,
"grad_norm": 1.011077880859375,
"learning_rate": 4.864010398672612e-06,
"loss": 1.2417,
"step": 366
},
{
"epoch": 0.8385376999238385,
"grad_norm": 1.0485464334487915,
"learning_rate": 4.862995458808073e-06,
"loss": 1.2728,
"step": 367
},
{
"epoch": 0.8408225437928408,
"grad_norm": 1.0683908462524414,
"learning_rate": 4.861976852268582e-06,
"loss": 1.2354,
"step": 368
},
{
"epoch": 0.8431073876618431,
"grad_norm": 1.0323604345321655,
"learning_rate": 4.860954580634718e-06,
"loss": 1.2665,
"step": 369
},
{
"epoch": 0.8453922315308454,
"grad_norm": 1.024782419204712,
"learning_rate": 4.859928645492746e-06,
"loss": 1.2515,
"step": 370
},
{
"epoch": 0.8476770753998477,
"grad_norm": 1.02902090549469,
"learning_rate": 4.858899048434614e-06,
"loss": 1.2274,
"step": 371
},
{
"epoch": 0.84996191926885,
"grad_norm": 1.0355148315429688,
"learning_rate": 4.857865791057957e-06,
"loss": 1.2289,
"step": 372
},
{
"epoch": 0.8522467631378522,
"grad_norm": 1.0638132095336914,
"learning_rate": 4.856828874966086e-06,
"loss": 1.2245,
"step": 373
},
{
"epoch": 0.8545316070068545,
"grad_norm": 1.0459909439086914,
"learning_rate": 4.8557883017679895e-06,
"loss": 1.2347,
"step": 374
},
{
"epoch": 0.8568164508758568,
"grad_norm": 1.0818232297897339,
"learning_rate": 4.854744073078333e-06,
"loss": 1.2564,
"step": 375
},
{
"epoch": 0.8591012947448591,
"grad_norm": 1.0551162958145142,
"learning_rate": 4.853696190517452e-06,
"loss": 1.2809,
"step": 376
},
{
"epoch": 0.8613861386138614,
"grad_norm": 1.0419256687164307,
"learning_rate": 4.8526446557113525e-06,
"loss": 1.2532,
"step": 377
},
{
"epoch": 0.8636709824828637,
"grad_norm": 1.058478832244873,
"learning_rate": 4.851589470291707e-06,
"loss": 1.229,
"step": 378
},
{
"epoch": 0.865955826351866,
"grad_norm": 1.0275694131851196,
"learning_rate": 4.850530635895854e-06,
"loss": 1.2555,
"step": 379
},
{
"epoch": 0.8682406702208683,
"grad_norm": 1.0653144121170044,
"learning_rate": 4.849468154166794e-06,
"loss": 1.2397,
"step": 380
},
{
"epoch": 0.8705255140898706,
"grad_norm": 1.0227371454238892,
"learning_rate": 4.8484020267531855e-06,
"loss": 1.2568,
"step": 381
},
{
"epoch": 0.8728103579588729,
"grad_norm": 1.0583505630493164,
"learning_rate": 4.847332255309346e-06,
"loss": 1.2489,
"step": 382
},
{
"epoch": 0.8750952018278751,
"grad_norm": 1.0397239923477173,
"learning_rate": 4.846258841495246e-06,
"loss": 1.273,
"step": 383
},
{
"epoch": 0.8773800456968773,
"grad_norm": 1.020776391029358,
"learning_rate": 4.845181786976509e-06,
"loss": 1.2257,
"step": 384
},
{
"epoch": 0.8796648895658796,
"grad_norm": 1.0420705080032349,
"learning_rate": 4.844101093424407e-06,
"loss": 1.296,
"step": 385
},
{
"epoch": 0.8819497334348819,
"grad_norm": 1.0465624332427979,
"learning_rate": 4.84301676251586e-06,
"loss": 1.2514,
"step": 386
},
{
"epoch": 0.8842345773038842,
"grad_norm": 1.0915330648422241,
"learning_rate": 4.841928795933429e-06,
"loss": 1.2664,
"step": 387
},
{
"epoch": 0.8865194211728865,
"grad_norm": 1.0246195793151855,
"learning_rate": 4.84083719536532e-06,
"loss": 1.2499,
"step": 388
},
{
"epoch": 0.8888042650418888,
"grad_norm": 1.0145692825317383,
"learning_rate": 4.839741962505376e-06,
"loss": 1.2638,
"step": 389
},
{
"epoch": 0.8910891089108911,
"grad_norm": 1.05404531955719,
"learning_rate": 4.838643099053077e-06,
"loss": 1.1875,
"step": 390
},
{
"epoch": 0.8933739527798934,
"grad_norm": 1.1422752141952515,
"learning_rate": 4.837540606713538e-06,
"loss": 1.2496,
"step": 391
},
{
"epoch": 0.8956587966488957,
"grad_norm": 1.0648959875106812,
"learning_rate": 4.8364344871975e-06,
"loss": 1.2375,
"step": 392
},
{
"epoch": 0.897943640517898,
"grad_norm": 1.0459322929382324,
"learning_rate": 4.835324742221338e-06,
"loss": 1.2419,
"step": 393
},
{
"epoch": 0.9002284843869002,
"grad_norm": 1.0693044662475586,
"learning_rate": 4.834211373507048e-06,
"loss": 1.2485,
"step": 394
},
{
"epoch": 0.9025133282559025,
"grad_norm": 1.0930724143981934,
"learning_rate": 4.833094382782255e-06,
"loss": 1.2389,
"step": 395
},
{
"epoch": 0.9047981721249048,
"grad_norm": 1.1270296573638916,
"learning_rate": 4.831973771780197e-06,
"loss": 1.2033,
"step": 396
},
{
"epoch": 0.9070830159939071,
"grad_norm": 1.044074535369873,
"learning_rate": 4.830849542239735e-06,
"loss": 1.2464,
"step": 397
},
{
"epoch": 0.9093678598629094,
"grad_norm": 1.0138458013534546,
"learning_rate": 4.829721695905343e-06,
"loss": 1.2473,
"step": 398
},
{
"epoch": 0.9116527037319117,
"grad_norm": 1.1201279163360596,
"learning_rate": 4.828590234527107e-06,
"loss": 1.2729,
"step": 399
},
{
"epoch": 0.913937547600914,
"grad_norm": 1.0771571397781372,
"learning_rate": 4.8274551598607214e-06,
"loss": 1.2665,
"step": 400
},
{
"epoch": 0.9162223914699162,
"grad_norm": 1.0691912174224854,
"learning_rate": 4.8263164736674905e-06,
"loss": 1.2094,
"step": 401
},
{
"epoch": 0.9185072353389185,
"grad_norm": 1.0740418434143066,
"learning_rate": 4.8251741777143205e-06,
"loss": 1.2879,
"step": 402
},
{
"epoch": 0.9207920792079208,
"grad_norm": 1.0185081958770752,
"learning_rate": 4.824028273773719e-06,
"loss": 1.2459,
"step": 403
},
{
"epoch": 0.9230769230769231,
"grad_norm": 1.0672869682312012,
"learning_rate": 4.822878763623792e-06,
"loss": 1.2394,
"step": 404
},
{
"epoch": 0.9253617669459253,
"grad_norm": 1.08120858669281,
"learning_rate": 4.821725649048242e-06,
"loss": 1.2918,
"step": 405
},
{
"epoch": 0.9276466108149276,
"grad_norm": 1.0407681465148926,
"learning_rate": 4.820568931836364e-06,
"loss": 1.2443,
"step": 406
},
{
"epoch": 0.9299314546839299,
"grad_norm": 1.0847117900848389,
"learning_rate": 4.8194086137830445e-06,
"loss": 1.2505,
"step": 407
},
{
"epoch": 0.9322162985529322,
"grad_norm": 1.0484883785247803,
"learning_rate": 4.818244696688754e-06,
"loss": 1.2469,
"step": 408
},
{
"epoch": 0.9345011424219345,
"grad_norm": 1.0654011964797974,
"learning_rate": 4.817077182359553e-06,
"loss": 1.2544,
"step": 409
},
{
"epoch": 0.9367859862909368,
"grad_norm": 1.108176589012146,
"learning_rate": 4.815906072607079e-06,
"loss": 1.2387,
"step": 410
},
{
"epoch": 0.9390708301599391,
"grad_norm": 1.0624432563781738,
"learning_rate": 4.8147313692485495e-06,
"loss": 1.2488,
"step": 411
},
{
"epoch": 0.9413556740289414,
"grad_norm": 1.0391454696655273,
"learning_rate": 4.813553074106761e-06,
"loss": 1.2514,
"step": 412
},
{
"epoch": 0.9436405178979437,
"grad_norm": 1.1086232662200928,
"learning_rate": 4.812371189010081e-06,
"loss": 1.2694,
"step": 413
},
{
"epoch": 0.945925361766946,
"grad_norm": 1.0448237657546997,
"learning_rate": 4.8111857157924465e-06,
"loss": 1.2366,
"step": 414
},
{
"epoch": 0.9482102056359482,
"grad_norm": 1.0393203496932983,
"learning_rate": 4.809996656293367e-06,
"loss": 1.2747,
"step": 415
},
{
"epoch": 0.9504950495049505,
"grad_norm": 1.083590030670166,
"learning_rate": 4.8088040123579106e-06,
"loss": 1.2167,
"step": 416
},
{
"epoch": 0.9527798933739527,
"grad_norm": 1.071567177772522,
"learning_rate": 4.807607785836711e-06,
"loss": 1.2108,
"step": 417
},
{
"epoch": 0.955064737242955,
"grad_norm": 1.0953818559646606,
"learning_rate": 4.8064079785859615e-06,
"loss": 1.2381,
"step": 418
},
{
"epoch": 0.9573495811119573,
"grad_norm": 1.0628875494003296,
"learning_rate": 4.8052045924674105e-06,
"loss": 1.232,
"step": 419
},
{
"epoch": 0.9596344249809596,
"grad_norm": 1.0838161706924438,
"learning_rate": 4.803997629348359e-06,
"loss": 1.2699,
"step": 420
},
{
"epoch": 0.9619192688499619,
"grad_norm": 0.9980992078781128,
"learning_rate": 4.802787091101659e-06,
"loss": 1.2473,
"step": 421
},
{
"epoch": 0.9642041127189642,
"grad_norm": 1.094283103942871,
"learning_rate": 4.801572979605712e-06,
"loss": 1.2656,
"step": 422
},
{
"epoch": 0.9664889565879665,
"grad_norm": 1.0554611682891846,
"learning_rate": 4.800355296744461e-06,
"loss": 1.2584,
"step": 423
},
{
"epoch": 0.9687738004569688,
"grad_norm": 1.1019188165664673,
"learning_rate": 4.799134044407392e-06,
"loss": 1.2877,
"step": 424
},
{
"epoch": 0.9710586443259711,
"grad_norm": 1.087965726852417,
"learning_rate": 4.797909224489531e-06,
"loss": 1.2662,
"step": 425
},
{
"epoch": 0.9733434881949733,
"grad_norm": 1.08269202709198,
"learning_rate": 4.796680838891438e-06,
"loss": 1.2419,
"step": 426
},
{
"epoch": 0.9756283320639756,
"grad_norm": 1.071199893951416,
"learning_rate": 4.795448889519207e-06,
"loss": 1.2489,
"step": 427
},
{
"epoch": 0.9779131759329779,
"grad_norm": 1.0306544303894043,
"learning_rate": 4.794213378284462e-06,
"loss": 1.2467,
"step": 428
},
{
"epoch": 0.9801980198019802,
"grad_norm": 1.0567327737808228,
"learning_rate": 4.792974307104353e-06,
"loss": 1.2637,
"step": 429
},
{
"epoch": 0.9824828636709825,
"grad_norm": 1.0448797941207886,
"learning_rate": 4.7917316779015554e-06,
"loss": 1.2244,
"step": 430
},
{
"epoch": 0.9847677075399848,
"grad_norm": 1.0123138427734375,
"learning_rate": 4.790485492604264e-06,
"loss": 1.2326,
"step": 431
},
{
"epoch": 0.9870525514089871,
"grad_norm": 1.0484559535980225,
"learning_rate": 4.789235753146192e-06,
"loss": 1.2436,
"step": 432
},
{
"epoch": 0.9893373952779894,
"grad_norm": 1.0161617994308472,
"learning_rate": 4.787982461466568e-06,
"loss": 1.2185,
"step": 433
},
{
"epoch": 0.9916222391469917,
"grad_norm": 1.0779787302017212,
"learning_rate": 4.786725619510134e-06,
"loss": 1.2256,
"step": 434
},
{
"epoch": 0.993907083015994,
"grad_norm": 1.061590552330017,
"learning_rate": 4.785465229227139e-06,
"loss": 1.2747,
"step": 435
},
{
"epoch": 0.9961919268849961,
"grad_norm": 1.102403163909912,
"learning_rate": 4.784201292573337e-06,
"loss": 1.2561,
"step": 436
},
{
"epoch": 0.9984767707539984,
"grad_norm": 0.9936567544937134,
"learning_rate": 4.782933811509988e-06,
"loss": 1.2409,
"step": 437
}
],
"logging_steps": 1,
"max_steps": 2622,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 437,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1038780364842598e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}