9b-125-2 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
8f4b8a3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2781,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002157497303128371,
"grad_norm": 2.5762534141540527,
"learning_rate": 1.4285714285714284e-08,
"loss": 0.9756889939308167,
"step": 2
},
{
"epoch": 0.004314994606256742,
"grad_norm": 4.507562637329102,
"learning_rate": 4.285714285714286e-08,
"loss": 0.661911129951477,
"step": 4
},
{
"epoch": 0.006472491909385114,
"grad_norm": 5.074595928192139,
"learning_rate": 7.142857142857142e-08,
"loss": 0.8359099626541138,
"step": 6
},
{
"epoch": 0.008629989212513484,
"grad_norm": 1.9605220556259155,
"learning_rate": 1e-07,
"loss": 0.8231168985366821,
"step": 8
},
{
"epoch": 0.010787486515641856,
"grad_norm": 1.6714932918548584,
"learning_rate": 1.2857142857142855e-07,
"loss": 0.7096143960952759,
"step": 10
},
{
"epoch": 0.012944983818770227,
"grad_norm": 4.321066379547119,
"learning_rate": 1.5714285714285714e-07,
"loss": 0.7912081480026245,
"step": 12
},
{
"epoch": 0.015102481121898598,
"grad_norm": 8.806991577148438,
"learning_rate": 1.8571428571428572e-07,
"loss": 0.856565535068512,
"step": 14
},
{
"epoch": 0.017259978425026967,
"grad_norm": 4.99681282043457,
"learning_rate": 2.1428571428571426e-07,
"loss": 1.1687901020050049,
"step": 16
},
{
"epoch": 0.019417475728155338,
"grad_norm": 2.5645041465759277,
"learning_rate": 2.4285714285714287e-07,
"loss": 1.0152899026870728,
"step": 18
},
{
"epoch": 0.021574973031283712,
"grad_norm": 1.8295631408691406,
"learning_rate": 2.714285714285714e-07,
"loss": 0.6205134987831116,
"step": 20
},
{
"epoch": 0.023732470334412083,
"grad_norm": 4.249131202697754,
"learning_rate": 3e-07,
"loss": 0.9060122966766357,
"step": 22
},
{
"epoch": 0.025889967637540454,
"grad_norm": 3.811307191848755,
"learning_rate": 3.2857142857142857e-07,
"loss": 0.8020558953285217,
"step": 24
},
{
"epoch": 0.028047464940668825,
"grad_norm": 3.780266284942627,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.7707346677780151,
"step": 26
},
{
"epoch": 0.030204962243797196,
"grad_norm": 2.9128313064575195,
"learning_rate": 3.857142857142857e-07,
"loss": 0.6942681670188904,
"step": 28
},
{
"epoch": 0.032362459546925564,
"grad_norm": 3.999141216278076,
"learning_rate": 4.142857142857143e-07,
"loss": 1.0023633241653442,
"step": 30
},
{
"epoch": 0.034519956850053934,
"grad_norm": 1.0977139472961426,
"learning_rate": 4.428571428571428e-07,
"loss": 0.6870981454849243,
"step": 32
},
{
"epoch": 0.036677454153182305,
"grad_norm": 7.128520965576172,
"learning_rate": 4.714285714285714e-07,
"loss": 0.9697052836418152,
"step": 34
},
{
"epoch": 0.038834951456310676,
"grad_norm": 2.706787109375,
"learning_rate": 5e-07,
"loss": 0.8704000115394592,
"step": 36
},
{
"epoch": 0.040992448759439054,
"grad_norm": 28.9028377532959,
"learning_rate": 5.285714285714286e-07,
"loss": 1.3768728971481323,
"step": 38
},
{
"epoch": 0.043149946062567425,
"grad_norm": 5.019951343536377,
"learning_rate": 5.571428571428571e-07,
"loss": 0.5050771832466125,
"step": 40
},
{
"epoch": 0.045307443365695796,
"grad_norm": 1.896100401878357,
"learning_rate": 5.857142857142857e-07,
"loss": 0.7155470848083496,
"step": 42
},
{
"epoch": 0.04746494066882417,
"grad_norm": 1.3924132585525513,
"learning_rate": 6.142857142857143e-07,
"loss": 0.8246269822120667,
"step": 44
},
{
"epoch": 0.04962243797195254,
"grad_norm": 2.244379758834839,
"learning_rate": 6.428571428571429e-07,
"loss": 0.6300725340843201,
"step": 46
},
{
"epoch": 0.05177993527508091,
"grad_norm": 4.4260573387146,
"learning_rate": 6.714285714285714e-07,
"loss": 0.7466659545898438,
"step": 48
},
{
"epoch": 0.05393743257820928,
"grad_norm": 3.2849934101104736,
"learning_rate": 7e-07,
"loss": 0.5939810276031494,
"step": 50
},
{
"epoch": 0.05609492988133765,
"grad_norm": 1.4281387329101562,
"learning_rate": 7.285714285714286e-07,
"loss": 0.36054807901382446,
"step": 52
},
{
"epoch": 0.05825242718446602,
"grad_norm": 6.341257095336914,
"learning_rate": 7.57142857142857e-07,
"loss": 0.8747532367706299,
"step": 54
},
{
"epoch": 0.06040992448759439,
"grad_norm": 14.657182693481445,
"learning_rate": 7.857142857142856e-07,
"loss": 0.991641640663147,
"step": 56
},
{
"epoch": 0.06256742179072276,
"grad_norm": 1.8714779615402222,
"learning_rate": 8.142857142857142e-07,
"loss": 0.6384595632553101,
"step": 58
},
{
"epoch": 0.06472491909385113,
"grad_norm": 6.577902793884277,
"learning_rate": 8.428571428571428e-07,
"loss": 0.7958289384841919,
"step": 60
},
{
"epoch": 0.0668824163969795,
"grad_norm": 6.575786590576172,
"learning_rate": 8.714285714285714e-07,
"loss": 0.9528671503067017,
"step": 62
},
{
"epoch": 0.06903991370010787,
"grad_norm": 2.090458631515503,
"learning_rate": 9e-07,
"loss": 0.7380706071853638,
"step": 64
},
{
"epoch": 0.07119741100323625,
"grad_norm": 2.861210584640503,
"learning_rate": 9.285714285714285e-07,
"loss": 0.8518832921981812,
"step": 66
},
{
"epoch": 0.07335490830636461,
"grad_norm": 2.01114559173584,
"learning_rate": 9.571428571428572e-07,
"loss": 0.7344475388526917,
"step": 68
},
{
"epoch": 0.07551240560949299,
"grad_norm": 9.70598316192627,
"learning_rate": 9.857142857142857e-07,
"loss": 0.6952767968177795,
"step": 70
},
{
"epoch": 0.07766990291262135,
"grad_norm": 3.434774160385132,
"learning_rate": 1.0142857142857142e-06,
"loss": 0.6583250761032104,
"step": 72
},
{
"epoch": 0.07982740021574973,
"grad_norm": 2.2592010498046875,
"learning_rate": 1.0428571428571429e-06,
"loss": 0.6166390180587769,
"step": 74
},
{
"epoch": 0.08198489751887811,
"grad_norm": 2.191253185272217,
"learning_rate": 1.0714285714285714e-06,
"loss": 0.6770592927932739,
"step": 76
},
{
"epoch": 0.08414239482200647,
"grad_norm": 1.9236204624176025,
"learning_rate": 1.1e-06,
"loss": 0.6031355261802673,
"step": 78
},
{
"epoch": 0.08629989212513485,
"grad_norm": 4.434933662414551,
"learning_rate": 1.1285714285714285e-06,
"loss": 0.949522078037262,
"step": 80
},
{
"epoch": 0.08845738942826321,
"grad_norm": 3.6469240188598633,
"learning_rate": 1.1571428571428572e-06,
"loss": 0.5610405206680298,
"step": 82
},
{
"epoch": 0.09061488673139159,
"grad_norm": 2.5983078479766846,
"learning_rate": 1.1857142857142857e-06,
"loss": 0.40821573138237,
"step": 84
},
{
"epoch": 0.09277238403451996,
"grad_norm": 1.9814131259918213,
"learning_rate": 1.2142857142857142e-06,
"loss": 0.6474723815917969,
"step": 86
},
{
"epoch": 0.09492988133764833,
"grad_norm": 1.5465178489685059,
"learning_rate": 1.2428571428571429e-06,
"loss": 0.6871901154518127,
"step": 88
},
{
"epoch": 0.0970873786407767,
"grad_norm": 2.41676664352417,
"learning_rate": 1.2714285714285714e-06,
"loss": 0.6116282343864441,
"step": 90
},
{
"epoch": 0.09924487594390508,
"grad_norm": 1.3548113107681274,
"learning_rate": 1.3e-06,
"loss": 0.6866545677185059,
"step": 92
},
{
"epoch": 0.10140237324703344,
"grad_norm": 3.994168519973755,
"learning_rate": 1.3285714285714285e-06,
"loss": 0.7566230297088623,
"step": 94
},
{
"epoch": 0.10355987055016182,
"grad_norm": 0.6952499151229858,
"learning_rate": 1.3571428571428572e-06,
"loss": 0.5508694648742676,
"step": 96
},
{
"epoch": 0.10571736785329018,
"grad_norm": 2.8167052268981934,
"learning_rate": 1.3857142857142857e-06,
"loss": 0.6589823961257935,
"step": 98
},
{
"epoch": 0.10787486515641856,
"grad_norm": 20.95288848876953,
"learning_rate": 1.4142857142857144e-06,
"loss": 0.558512806892395,
"step": 100
},
{
"epoch": 0.11003236245954692,
"grad_norm": 2.7887301445007324,
"learning_rate": 1.4428571428571429e-06,
"loss": 0.7576460838317871,
"step": 102
},
{
"epoch": 0.1121898597626753,
"grad_norm": 3.9458041191101074,
"learning_rate": 1.4714285714285716e-06,
"loss": 0.7658395171165466,
"step": 104
},
{
"epoch": 0.11434735706580366,
"grad_norm": 3.11449933052063,
"learning_rate": 1.5e-06,
"loss": 0.40781867504119873,
"step": 106
},
{
"epoch": 0.11650485436893204,
"grad_norm": 55.18882369995117,
"learning_rate": 1.5285714285714283e-06,
"loss": 0.7694418430328369,
"step": 108
},
{
"epoch": 0.1186623516720604,
"grad_norm": 2.1952178478240967,
"learning_rate": 1.557142857142857e-06,
"loss": 0.6382092833518982,
"step": 110
},
{
"epoch": 0.12081984897518878,
"grad_norm": 4.497931957244873,
"learning_rate": 1.5857142857142855e-06,
"loss": 0.5454550981521606,
"step": 112
},
{
"epoch": 0.12297734627831715,
"grad_norm": 3.0473287105560303,
"learning_rate": 1.6142857142857142e-06,
"loss": 0.5170645713806152,
"step": 114
},
{
"epoch": 0.12513484358144553,
"grad_norm": 3.3767971992492676,
"learning_rate": 1.6428571428571426e-06,
"loss": 0.5989764332771301,
"step": 116
},
{
"epoch": 0.1272923408845739,
"grad_norm": 2.4410431385040283,
"learning_rate": 1.6714285714285713e-06,
"loss": 0.538360595703125,
"step": 118
},
{
"epoch": 0.12944983818770225,
"grad_norm": 2.4122188091278076,
"learning_rate": 1.6999999999999998e-06,
"loss": 0.5263152122497559,
"step": 120
},
{
"epoch": 0.13160733549083065,
"grad_norm": 1.3902043104171753,
"learning_rate": 1.7285714285714285e-06,
"loss": 0.6690125465393066,
"step": 122
},
{
"epoch": 0.133764832793959,
"grad_norm": 1.8167104721069336,
"learning_rate": 1.757142857142857e-06,
"loss": 0.6804316639900208,
"step": 124
},
{
"epoch": 0.13592233009708737,
"grad_norm": 0.8370219469070435,
"learning_rate": 1.7857142857142857e-06,
"loss": 0.8587678670883179,
"step": 126
},
{
"epoch": 0.13807982740021574,
"grad_norm": 1.6989076137542725,
"learning_rate": 1.8142857142857142e-06,
"loss": 0.5992355942726135,
"step": 128
},
{
"epoch": 0.14023732470334413,
"grad_norm": 2.9481143951416016,
"learning_rate": 1.8428571428571426e-06,
"loss": 0.35928595066070557,
"step": 130
},
{
"epoch": 0.1423948220064725,
"grad_norm": 4.938568592071533,
"learning_rate": 1.8714285714285713e-06,
"loss": 0.6318232417106628,
"step": 132
},
{
"epoch": 0.14455231930960086,
"grad_norm": 6.683016777038574,
"learning_rate": 1.8999999999999998e-06,
"loss": 0.7641289234161377,
"step": 134
},
{
"epoch": 0.14670981661272922,
"grad_norm": 3.555145740509033,
"learning_rate": 1.9285714285714285e-06,
"loss": 0.7520711421966553,
"step": 136
},
{
"epoch": 0.1488673139158576,
"grad_norm": 2.5793240070343018,
"learning_rate": 1.957142857142857e-06,
"loss": 0.7046728730201721,
"step": 138
},
{
"epoch": 0.15102481121898598,
"grad_norm": 4.05105447769165,
"learning_rate": 1.985714285714286e-06,
"loss": 0.4872206449508667,
"step": 140
},
{
"epoch": 0.15318230852211434,
"grad_norm": 3.6239945888519287,
"learning_rate": 1.9999993632405402e-06,
"loss": 0.6896734237670898,
"step": 142
},
{
"epoch": 0.1553398058252427,
"grad_norm": 1.404502272605896,
"learning_rate": 1.999994269170269e-06,
"loss": 0.5899009108543396,
"step": 144
},
{
"epoch": 0.1574973031283711,
"grad_norm": 4.971773147583008,
"learning_rate": 1.9999840810585597e-06,
"loss": 0.7049793601036072,
"step": 146
},
{
"epoch": 0.15965480043149946,
"grad_norm": 1.3405131101608276,
"learning_rate": 1.9999687989630773e-06,
"loss": 0.7568652033805847,
"step": 148
},
{
"epoch": 0.16181229773462782,
"grad_norm": 1.5060237646102905,
"learning_rate": 1.9999484229703205e-06,
"loss": 0.5981850624084473,
"step": 150
},
{
"epoch": 0.16396979503775622,
"grad_norm": 5.329726696014404,
"learning_rate": 1.9999229531956187e-06,
"loss": 0.4102513790130615,
"step": 152
},
{
"epoch": 0.16612729234088458,
"grad_norm": 6.227677822113037,
"learning_rate": 1.9998923897831327e-06,
"loss": 0.6681348085403442,
"step": 154
},
{
"epoch": 0.16828478964401294,
"grad_norm": 12.442171096801758,
"learning_rate": 1.9998567329058537e-06,
"loss": 0.7901923060417175,
"step": 156
},
{
"epoch": 0.1704422869471413,
"grad_norm": 1.463600754737854,
"learning_rate": 1.9998159827656035e-06,
"loss": 0.6713565587997437,
"step": 158
},
{
"epoch": 0.1725997842502697,
"grad_norm": 4.213168621063232,
"learning_rate": 1.9997701395930303e-06,
"loss": 0.8061548471450806,
"step": 160
},
{
"epoch": 0.17475728155339806,
"grad_norm": 1.8637281656265259,
"learning_rate": 1.9997192036476113e-06,
"loss": 0.6461450457572937,
"step": 162
},
{
"epoch": 0.17691477885652643,
"grad_norm": 1.6268417835235596,
"learning_rate": 1.999663175217647e-06,
"loss": 0.5500176548957825,
"step": 164
},
{
"epoch": 0.1790722761596548,
"grad_norm": 1.7090154886245728,
"learning_rate": 1.999602054620263e-06,
"loss": 0.6174845695495605,
"step": 166
},
{
"epoch": 0.18122977346278318,
"grad_norm": 1.6175590753555298,
"learning_rate": 1.9995358422014078e-06,
"loss": 0.5844609141349792,
"step": 168
},
{
"epoch": 0.18338727076591155,
"grad_norm": 1.7991397380828857,
"learning_rate": 1.9994645383358485e-06,
"loss": 0.6766707897186279,
"step": 170
},
{
"epoch": 0.1855447680690399,
"grad_norm": 2.1886162757873535,
"learning_rate": 1.9993881434271707e-06,
"loss": 0.7125424146652222,
"step": 172
},
{
"epoch": 0.18770226537216828,
"grad_norm": 2.0554721355438232,
"learning_rate": 1.9993066579077766e-06,
"loss": 0.6898304224014282,
"step": 174
},
{
"epoch": 0.18985976267529667,
"grad_norm": 1.493452548980713,
"learning_rate": 1.9992200822388794e-06,
"loss": 0.6477411985397339,
"step": 176
},
{
"epoch": 0.19201725997842503,
"grad_norm": 5.064616680145264,
"learning_rate": 1.999128416910507e-06,
"loss": 0.5987610816955566,
"step": 178
},
{
"epoch": 0.1941747572815534,
"grad_norm": 1.9240336418151855,
"learning_rate": 1.9990316624414902e-06,
"loss": 0.5100513100624084,
"step": 180
},
{
"epoch": 0.19633225458468176,
"grad_norm": 2.9364237785339355,
"learning_rate": 1.998929819379468e-06,
"loss": 0.8424034714698792,
"step": 182
},
{
"epoch": 0.19848975188781015,
"grad_norm": 1.8846420049667358,
"learning_rate": 1.998822888300881e-06,
"loss": 0.6540043354034424,
"step": 184
},
{
"epoch": 0.20064724919093851,
"grad_norm": 25.783281326293945,
"learning_rate": 1.9987108698109675e-06,
"loss": 0.7192497253417969,
"step": 186
},
{
"epoch": 0.20280474649406688,
"grad_norm": 3.445650100708008,
"learning_rate": 1.9985937645437617e-06,
"loss": 0.4845433831214905,
"step": 188
},
{
"epoch": 0.20496224379719524,
"grad_norm": 2.80410099029541,
"learning_rate": 1.9984715731620883e-06,
"loss": 0.42129552364349365,
"step": 190
},
{
"epoch": 0.20711974110032363,
"grad_norm": 4.098501682281494,
"learning_rate": 1.9983442963575616e-06,
"loss": 0.5982234477996826,
"step": 192
},
{
"epoch": 0.209277238403452,
"grad_norm": 3.001051664352417,
"learning_rate": 1.998211934850578e-06,
"loss": 0.65160071849823,
"step": 194
},
{
"epoch": 0.21143473570658036,
"grad_norm": 3.8019604682922363,
"learning_rate": 1.998074489390314e-06,
"loss": 0.5729217529296875,
"step": 196
},
{
"epoch": 0.21359223300970873,
"grad_norm": 7.936295986175537,
"learning_rate": 1.997931960754724e-06,
"loss": 0.6380269527435303,
"step": 198
},
{
"epoch": 0.21574973031283712,
"grad_norm": 1.6919151544570923,
"learning_rate": 1.99778434975053e-06,
"loss": 0.692238450050354,
"step": 200
},
{
"epoch": 0.21790722761596548,
"grad_norm": 2.152122974395752,
"learning_rate": 1.997631657213223e-06,
"loss": 0.5761340856552124,
"step": 202
},
{
"epoch": 0.22006472491909385,
"grad_norm": 1.63760244846344,
"learning_rate": 1.9974738840070554e-06,
"loss": 0.6452651619911194,
"step": 204
},
{
"epoch": 0.2222222222222222,
"grad_norm": 2.098172187805176,
"learning_rate": 1.9973110310250364e-06,
"loss": 0.6504206657409668,
"step": 206
},
{
"epoch": 0.2243797195253506,
"grad_norm": 5.233450889587402,
"learning_rate": 1.9971430991889274e-06,
"loss": 0.603036642074585,
"step": 208
},
{
"epoch": 0.22653721682847897,
"grad_norm": 5.004756927490234,
"learning_rate": 1.996970089449236e-06,
"loss": 0.6306214332580566,
"step": 210
},
{
"epoch": 0.22869471413160733,
"grad_norm": 2.422592878341675,
"learning_rate": 1.9967920027852115e-06,
"loss": 0.5596987009048462,
"step": 212
},
{
"epoch": 0.2308522114347357,
"grad_norm": 1.5725358724594116,
"learning_rate": 1.99660884020484e-06,
"loss": 0.5717631578445435,
"step": 214
},
{
"epoch": 0.23300970873786409,
"grad_norm": 1.93349289894104,
"learning_rate": 1.9964206027448355e-06,
"loss": 0.5819022059440613,
"step": 216
},
{
"epoch": 0.23516720604099245,
"grad_norm": 4.832953453063965,
"learning_rate": 1.9962272914706387e-06,
"loss": 0.6450964212417603,
"step": 218
},
{
"epoch": 0.2373247033441208,
"grad_norm": 3.763730525970459,
"learning_rate": 1.996028907476406e-06,
"loss": 0.7145527601242065,
"step": 220
},
{
"epoch": 0.23948220064724918,
"grad_norm": 3.421351909637451,
"learning_rate": 1.995825451885008e-06,
"loss": 0.7275266051292419,
"step": 222
},
{
"epoch": 0.24163969795037757,
"grad_norm": 1.2539039850234985,
"learning_rate": 1.995616925848019e-06,
"loss": 0.6702066659927368,
"step": 224
},
{
"epoch": 0.24379719525350593,
"grad_norm": 3.2891104221343994,
"learning_rate": 1.9954033305457154e-06,
"loss": 0.6769608855247498,
"step": 226
},
{
"epoch": 0.2459546925566343,
"grad_norm": 3.350397825241089,
"learning_rate": 1.995184667187062e-06,
"loss": 0.5695778727531433,
"step": 228
},
{
"epoch": 0.2481121898597627,
"grad_norm": 1.5770859718322754,
"learning_rate": 1.994960937009713e-06,
"loss": 0.690039873123169,
"step": 230
},
{
"epoch": 0.25026968716289105,
"grad_norm": 12.937152862548828,
"learning_rate": 1.9947321412799988e-06,
"loss": 0.883323073387146,
"step": 232
},
{
"epoch": 0.2524271844660194,
"grad_norm": 3.2988932132720947,
"learning_rate": 1.994498281292922e-06,
"loss": 0.798008918762207,
"step": 234
},
{
"epoch": 0.2545846817691478,
"grad_norm": 8.973938941955566,
"learning_rate": 1.9942593583721493e-06,
"loss": 0.6434545516967773,
"step": 236
},
{
"epoch": 0.25674217907227614,
"grad_norm": 1.573055624961853,
"learning_rate": 1.9940153738700045e-06,
"loss": 0.6816240549087524,
"step": 238
},
{
"epoch": 0.2588996763754045,
"grad_norm": 1.305035948753357,
"learning_rate": 1.9937663291674593e-06,
"loss": 0.7506214380264282,
"step": 240
},
{
"epoch": 0.26105717367853293,
"grad_norm": 1.362859845161438,
"learning_rate": 1.993512225674127e-06,
"loss": 0.6811486482620239,
"step": 242
},
{
"epoch": 0.2632146709816613,
"grad_norm": 1.6364890336990356,
"learning_rate": 1.9932530648282555e-06,
"loss": 0.648339569568634,
"step": 244
},
{
"epoch": 0.26537216828478966,
"grad_norm": 1.4568816423416138,
"learning_rate": 1.992988848096715e-06,
"loss": 0.6864685416221619,
"step": 246
},
{
"epoch": 0.267529665587918,
"grad_norm": 37.19289779663086,
"learning_rate": 1.9927195769749953e-06,
"loss": 0.8183077573776245,
"step": 248
},
{
"epoch": 0.2696871628910464,
"grad_norm": 2.8543128967285156,
"learning_rate": 1.9924452529871915e-06,
"loss": 0.6495329141616821,
"step": 250
},
{
"epoch": 0.27184466019417475,
"grad_norm": 2.8163821697235107,
"learning_rate": 1.992165877686001e-06,
"loss": 0.7900782227516174,
"step": 252
},
{
"epoch": 0.2740021574973031,
"grad_norm": 1.5852282047271729,
"learning_rate": 1.9918814526527105e-06,
"loss": 0.47972819209098816,
"step": 254
},
{
"epoch": 0.2761596548004315,
"grad_norm": 1.4288207292556763,
"learning_rate": 1.9915919794971892e-06,
"loss": 0.5876221656799316,
"step": 256
},
{
"epoch": 0.2783171521035599,
"grad_norm": 3.196465015411377,
"learning_rate": 1.9912974598578793e-06,
"loss": 0.46865469217300415,
"step": 258
},
{
"epoch": 0.28047464940668826,
"grad_norm": 1.5016988515853882,
"learning_rate": 1.9909978954017847e-06,
"loss": 0.7941604852676392,
"step": 260
},
{
"epoch": 0.2826321467098166,
"grad_norm": 2.889617919921875,
"learning_rate": 1.9906932878244665e-06,
"loss": 0.5378029942512512,
"step": 262
},
{
"epoch": 0.284789644012945,
"grad_norm": 1.2625280618667603,
"learning_rate": 1.990383638850028e-06,
"loss": 0.7057135701179504,
"step": 264
},
{
"epoch": 0.28694714131607335,
"grad_norm": 2.495908260345459,
"learning_rate": 1.990068950231107e-06,
"loss": 0.6187635660171509,
"step": 266
},
{
"epoch": 0.2891046386192017,
"grad_norm": 3.3619916439056396,
"learning_rate": 1.9897492237488683e-06,
"loss": 0.7007441520690918,
"step": 268
},
{
"epoch": 0.2912621359223301,
"grad_norm": 2.7590885162353516,
"learning_rate": 1.9894244612129886e-06,
"loss": 0.5531818270683289,
"step": 270
},
{
"epoch": 0.29341963322545844,
"grad_norm": 13.184637069702148,
"learning_rate": 1.9890946644616523e-06,
"loss": 0.7034265398979187,
"step": 272
},
{
"epoch": 0.29557713052858686,
"grad_norm": 3.412360191345215,
"learning_rate": 1.9887598353615344e-06,
"loss": 0.7009316086769104,
"step": 274
},
{
"epoch": 0.2977346278317152,
"grad_norm": 4.455333232879639,
"learning_rate": 1.988419975807796e-06,
"loss": 0.662095844745636,
"step": 276
},
{
"epoch": 0.2998921251348436,
"grad_norm": 4.525757789611816,
"learning_rate": 1.988075087724069e-06,
"loss": 0.6825252771377563,
"step": 278
},
{
"epoch": 0.30204962243797195,
"grad_norm": 2.143056869506836,
"learning_rate": 1.98772517306245e-06,
"loss": 0.674209713935852,
"step": 280
},
{
"epoch": 0.3042071197411003,
"grad_norm": 8.63754940032959,
"learning_rate": 1.9873702338034837e-06,
"loss": 0.6299592852592468,
"step": 282
},
{
"epoch": 0.3063646170442287,
"grad_norm": 3.6352531909942627,
"learning_rate": 1.9870102719561552e-06,
"loss": 0.6460418701171875,
"step": 284
},
{
"epoch": 0.30852211434735705,
"grad_norm": 6.43894624710083,
"learning_rate": 1.9866452895578784e-06,
"loss": 0.4357595145702362,
"step": 286
},
{
"epoch": 0.3106796116504854,
"grad_norm": 4.515871047973633,
"learning_rate": 1.986275288674484e-06,
"loss": 0.6767151355743408,
"step": 288
},
{
"epoch": 0.31283710895361383,
"grad_norm": 1.191287636756897,
"learning_rate": 1.9859002714002067e-06,
"loss": 0.6962684392929077,
"step": 290
},
{
"epoch": 0.3149946062567422,
"grad_norm": 6.864166259765625,
"learning_rate": 1.9855202398576756e-06,
"loss": 0.6553777456283569,
"step": 292
},
{
"epoch": 0.31715210355987056,
"grad_norm": 2.185790777206421,
"learning_rate": 1.9851351961979e-06,
"loss": 0.7482725977897644,
"step": 294
},
{
"epoch": 0.3193096008629989,
"grad_norm": 3.45621395111084,
"learning_rate": 1.9847451426002587e-06,
"loss": 0.4616151452064514,
"step": 296
},
{
"epoch": 0.3214670981661273,
"grad_norm": 6.849677085876465,
"learning_rate": 1.9843500812724876e-06,
"loss": 0.6612831354141235,
"step": 298
},
{
"epoch": 0.32362459546925565,
"grad_norm": 5.291024208068848,
"learning_rate": 1.9839500144506657e-06,
"loss": 0.6871935129165649,
"step": 300
},
{
"epoch": 0.325782092772384,
"grad_norm": 4.101494789123535,
"learning_rate": 1.9835449443992042e-06,
"loss": 0.4521007537841797,
"step": 302
},
{
"epoch": 0.32793959007551243,
"grad_norm": 2.2320597171783447,
"learning_rate": 1.9831348734108325e-06,
"loss": 0.6099227666854858,
"step": 304
},
{
"epoch": 0.3300970873786408,
"grad_norm": 2.0806515216827393,
"learning_rate": 1.9827198038065867e-06,
"loss": 0.6959011554718018,
"step": 306
},
{
"epoch": 0.33225458468176916,
"grad_norm": 2.5396556854248047,
"learning_rate": 1.9822997379357946e-06,
"loss": 0.6063118577003479,
"step": 308
},
{
"epoch": 0.3344120819848975,
"grad_norm": 1.9925243854522705,
"learning_rate": 1.9818746781760637e-06,
"loss": 0.7447793483734131,
"step": 310
},
{
"epoch": 0.3365695792880259,
"grad_norm": 1.5124423503875732,
"learning_rate": 1.9814446269332665e-06,
"loss": 0.6756496429443359,
"step": 312
},
{
"epoch": 0.33872707659115425,
"grad_norm": 1.5381604433059692,
"learning_rate": 1.9810095866415288e-06,
"loss": 0.7244548797607422,
"step": 314
},
{
"epoch": 0.3408845738942826,
"grad_norm": 2.3492956161499023,
"learning_rate": 1.980569559763214e-06,
"loss": 0.7345068454742432,
"step": 316
},
{
"epoch": 0.343042071197411,
"grad_norm": 3.104962110519409,
"learning_rate": 1.980124548788911e-06,
"loss": 0.7250902056694031,
"step": 318
},
{
"epoch": 0.3451995685005394,
"grad_norm": 5.452524662017822,
"learning_rate": 1.9796745562374177e-06,
"loss": 0.5213475823402405,
"step": 320
},
{
"epoch": 0.34735706580366776,
"grad_norm": 1.2911655902862549,
"learning_rate": 1.9792195846557292e-06,
"loss": 0.41105973720550537,
"step": 322
},
{
"epoch": 0.34951456310679613,
"grad_norm": 1.4482433795928955,
"learning_rate": 1.9787596366190224e-06,
"loss": 0.6460384130477905,
"step": 324
},
{
"epoch": 0.3516720604099245,
"grad_norm": 7.015091419219971,
"learning_rate": 1.9782947147306403e-06,
"loss": 0.7474948763847351,
"step": 326
},
{
"epoch": 0.35382955771305286,
"grad_norm": 3.078240156173706,
"learning_rate": 1.9778248216220793e-06,
"loss": 0.6818826198577881,
"step": 328
},
{
"epoch": 0.3559870550161812,
"grad_norm": 1.273003339767456,
"learning_rate": 1.977349959952973e-06,
"loss": 0.6558285355567932,
"step": 330
},
{
"epoch": 0.3581445523193096,
"grad_norm": 5.311271667480469,
"learning_rate": 1.976870132411077e-06,
"loss": 0.48177772760391235,
"step": 332
},
{
"epoch": 0.36030204962243795,
"grad_norm": 4.013199329376221,
"learning_rate": 1.976385341712255e-06,
"loss": 0.6249281167984009,
"step": 334
},
{
"epoch": 0.36245954692556637,
"grad_norm": 10.565508842468262,
"learning_rate": 1.9758955906004624e-06,
"loss": 0.5407902002334595,
"step": 336
},
{
"epoch": 0.36461704422869473,
"grad_norm": 3.089301824569702,
"learning_rate": 1.97540088184773e-06,
"loss": 0.5577709674835205,
"step": 338
},
{
"epoch": 0.3667745415318231,
"grad_norm": 9.143464088439941,
"learning_rate": 1.97490121825415e-06,
"loss": 0.7211488485336304,
"step": 340
},
{
"epoch": 0.36893203883495146,
"grad_norm": 1.91471529006958,
"learning_rate": 1.97439660264786e-06,
"loss": 0.6752923727035522,
"step": 342
},
{
"epoch": 0.3710895361380798,
"grad_norm": 1.919268250465393,
"learning_rate": 1.9738870378850255e-06,
"loss": 0.6122534871101379,
"step": 344
},
{
"epoch": 0.3732470334412082,
"grad_norm": 1.600658893585205,
"learning_rate": 1.973372526849825e-06,
"loss": 0.6465229988098145,
"step": 346
},
{
"epoch": 0.37540453074433655,
"grad_norm": 2.6975924968719482,
"learning_rate": 1.9728530724544317e-06,
"loss": 0.7250155806541443,
"step": 348
},
{
"epoch": 0.3775620280474649,
"grad_norm": 1.543426752090454,
"learning_rate": 1.972328677639003e-06,
"loss": 0.6498576998710632,
"step": 350
},
{
"epoch": 0.37971952535059333,
"grad_norm": 2.0587544441223145,
"learning_rate": 1.971799345371654e-06,
"loss": 0.6255270838737488,
"step": 352
},
{
"epoch": 0.3818770226537217,
"grad_norm": 4.156800746917725,
"learning_rate": 1.97126507864845e-06,
"loss": 0.7264662981033325,
"step": 354
},
{
"epoch": 0.38403451995685006,
"grad_norm": 1.7672313451766968,
"learning_rate": 1.9707258804933843e-06,
"loss": 0.6123859882354736,
"step": 356
},
{
"epoch": 0.3861920172599784,
"grad_norm": 2.0073418617248535,
"learning_rate": 1.9701817539583623e-06,
"loss": 0.584026038646698,
"step": 358
},
{
"epoch": 0.3883495145631068,
"grad_norm": 1.1840739250183105,
"learning_rate": 1.9696327021231857e-06,
"loss": 0.7097981572151184,
"step": 360
},
{
"epoch": 0.39050701186623515,
"grad_norm": 1.7339706420898438,
"learning_rate": 1.9690787280955324e-06,
"loss": 0.6338366866111755,
"step": 362
},
{
"epoch": 0.3926645091693635,
"grad_norm": 2.4935598373413086,
"learning_rate": 1.9685198350109406e-06,
"loss": 0.5935678482055664,
"step": 364
},
{
"epoch": 0.3948220064724919,
"grad_norm": 6.941248893737793,
"learning_rate": 1.9679560260327916e-06,
"loss": 0.7488420009613037,
"step": 366
},
{
"epoch": 0.3969795037756203,
"grad_norm": 5.579442024230957,
"learning_rate": 1.9673873043522904e-06,
"loss": 0.6892845630645752,
"step": 368
},
{
"epoch": 0.39913700107874867,
"grad_norm": 1.7267175912857056,
"learning_rate": 1.9668136731884486e-06,
"loss": 0.7125424742698669,
"step": 370
},
{
"epoch": 0.40129449838187703,
"grad_norm": 1.7778825759887695,
"learning_rate": 1.966235135788065e-06,
"loss": 0.6329432129859924,
"step": 372
},
{
"epoch": 0.4034519956850054,
"grad_norm": 4.815840244293213,
"learning_rate": 1.965651695425709e-06,
"loss": 0.6711968183517456,
"step": 374
},
{
"epoch": 0.40560949298813376,
"grad_norm": 1.8194682598114014,
"learning_rate": 1.965063355403701e-06,
"loss": 0.5624091625213623,
"step": 376
},
{
"epoch": 0.4077669902912621,
"grad_norm": 3.847508192062378,
"learning_rate": 1.9644701190520943e-06,
"loss": 0.43470942974090576,
"step": 378
},
{
"epoch": 0.4099244875943905,
"grad_norm": 1.7189173698425293,
"learning_rate": 1.9638719897286545e-06,
"loss": 0.5556265115737915,
"step": 380
},
{
"epoch": 0.4120819848975189,
"grad_norm": 2.6449780464172363,
"learning_rate": 1.9632689708188435e-06,
"loss": 0.5694633722305298,
"step": 382
},
{
"epoch": 0.41423948220064727,
"grad_norm": 6.1240410804748535,
"learning_rate": 1.962661065735797e-06,
"loss": 0.6872696280479431,
"step": 384
},
{
"epoch": 0.41639697950377563,
"grad_norm": 5.022050380706787,
"learning_rate": 1.9620482779203086e-06,
"loss": 0.6486364603042603,
"step": 386
},
{
"epoch": 0.418554476806904,
"grad_norm": 1.9404337406158447,
"learning_rate": 1.961430610840807e-06,
"loss": 0.6287031173706055,
"step": 388
},
{
"epoch": 0.42071197411003236,
"grad_norm": 4.457851886749268,
"learning_rate": 1.9608080679933385e-06,
"loss": 0.40318727493286133,
"step": 390
},
{
"epoch": 0.4228694714131607,
"grad_norm": 8.74242115020752,
"learning_rate": 1.960180652901547e-06,
"loss": 0.6349734663963318,
"step": 392
},
{
"epoch": 0.4250269687162891,
"grad_norm": 4.8024725914001465,
"learning_rate": 1.9595483691166534e-06,
"loss": 0.6840596199035645,
"step": 394
},
{
"epoch": 0.42718446601941745,
"grad_norm": 7.498271465301514,
"learning_rate": 1.958911220217436e-06,
"loss": 0.817265510559082,
"step": 396
},
{
"epoch": 0.42934196332254587,
"grad_norm": 2.826599359512329,
"learning_rate": 1.958269209810209e-06,
"loss": 0.5891008377075195,
"step": 398
},
{
"epoch": 0.43149946062567424,
"grad_norm": 4.398319244384766,
"learning_rate": 1.957622341528805e-06,
"loss": 0.5453633069992065,
"step": 400
},
{
"epoch": 0.4336569579288026,
"grad_norm": 2.75591778755188,
"learning_rate": 1.9569706190345512e-06,
"loss": 0.6217541098594666,
"step": 402
},
{
"epoch": 0.43581445523193096,
"grad_norm": 2.1976888179779053,
"learning_rate": 1.9563140460162505e-06,
"loss": 0.658210813999176,
"step": 404
},
{
"epoch": 0.43797195253505933,
"grad_norm": 4.405237197875977,
"learning_rate": 1.9556526261901602e-06,
"loss": 0.735411524772644,
"step": 406
},
{
"epoch": 0.4401294498381877,
"grad_norm": 3.2632224559783936,
"learning_rate": 1.95498636329997e-06,
"loss": 0.5909388065338135,
"step": 408
},
{
"epoch": 0.44228694714131606,
"grad_norm": 2.1249871253967285,
"learning_rate": 1.9543152611167837e-06,
"loss": 0.7845476865768433,
"step": 410
},
{
"epoch": 0.4444444444444444,
"grad_norm": 4.967487335205078,
"learning_rate": 1.9536393234390937e-06,
"loss": 0.6481199860572815,
"step": 412
},
{
"epoch": 0.44660194174757284,
"grad_norm": 1.769586443901062,
"learning_rate": 1.9529585540927636e-06,
"loss": 0.5764113068580627,
"step": 414
},
{
"epoch": 0.4487594390507012,
"grad_norm": 4.130702972412109,
"learning_rate": 1.9522729569310036e-06,
"loss": 0.7091037034988403,
"step": 416
},
{
"epoch": 0.45091693635382957,
"grad_norm": 2.021075487136841,
"learning_rate": 1.9515825358343494e-06,
"loss": 0.6121603846549988,
"step": 418
},
{
"epoch": 0.45307443365695793,
"grad_norm": 2.6177845001220703,
"learning_rate": 1.9508872947106413e-06,
"loss": 0.6298436522483826,
"step": 420
},
{
"epoch": 0.4552319309600863,
"grad_norm": 2.469846487045288,
"learning_rate": 1.9501872374950016e-06,
"loss": 0.6969653367996216,
"step": 422
},
{
"epoch": 0.45738942826321466,
"grad_norm": 1.5605947971343994,
"learning_rate": 1.949482368149811e-06,
"loss": 0.6391591429710388,
"step": 424
},
{
"epoch": 0.459546925566343,
"grad_norm": 7.972052097320557,
"learning_rate": 1.948772690664688e-06,
"loss": 0.6320364475250244,
"step": 426
},
{
"epoch": 0.4617044228694714,
"grad_norm": 3.05439829826355,
"learning_rate": 1.9480582090564657e-06,
"loss": 0.7582883238792419,
"step": 428
},
{
"epoch": 0.4638619201725998,
"grad_norm": 1.7435839176177979,
"learning_rate": 1.9473389273691686e-06,
"loss": 0.653886616230011,
"step": 430
},
{
"epoch": 0.46601941747572817,
"grad_norm": 1.643883466720581,
"learning_rate": 1.9466148496739893e-06,
"loss": 0.6401156187057495,
"step": 432
},
{
"epoch": 0.46817691477885653,
"grad_norm": 1.892043113708496,
"learning_rate": 1.9458859800692685e-06,
"loss": 0.42768222093582153,
"step": 434
},
{
"epoch": 0.4703344120819849,
"grad_norm": 7.755466938018799,
"learning_rate": 1.9451523226804665e-06,
"loss": 0.7829925417900085,
"step": 436
},
{
"epoch": 0.47249190938511326,
"grad_norm": 25.02216339111328,
"learning_rate": 1.944413881660145e-06,
"loss": 0.8609887361526489,
"step": 438
},
{
"epoch": 0.4746494066882416,
"grad_norm": 1.8407223224639893,
"learning_rate": 1.9436706611879413e-06,
"loss": 0.5991024374961853,
"step": 440
},
{
"epoch": 0.47680690399137,
"grad_norm": 1.7243049144744873,
"learning_rate": 1.9429226654705433e-06,
"loss": 0.5861119627952576,
"step": 442
},
{
"epoch": 0.47896440129449835,
"grad_norm": 3.1721413135528564,
"learning_rate": 1.9421698987416685e-06,
"loss": 0.6749376058578491,
"step": 444
},
{
"epoch": 0.4811218985976268,
"grad_norm": 3.368656873703003,
"learning_rate": 1.941412365262039e-06,
"loss": 0.6256532073020935,
"step": 446
},
{
"epoch": 0.48327939590075514,
"grad_norm": 2.6724510192871094,
"learning_rate": 1.9406500693193555e-06,
"loss": 0.6529517769813538,
"step": 448
},
{
"epoch": 0.4854368932038835,
"grad_norm": 2.0183353424072266,
"learning_rate": 1.939883015228276e-06,
"loss": 0.7027242183685303,
"step": 450
},
{
"epoch": 0.48759439050701187,
"grad_norm": 2.5906269550323486,
"learning_rate": 1.9391112073303897e-06,
"loss": 0.6666867733001709,
"step": 452
},
{
"epoch": 0.48975188781014023,
"grad_norm": 3.279174327850342,
"learning_rate": 1.9383346499941934e-06,
"loss": 0.6068412065505981,
"step": 454
},
{
"epoch": 0.4919093851132686,
"grad_norm": 2.020169496536255,
"learning_rate": 1.937553347615064e-06,
"loss": 0.49952036142349243,
"step": 456
},
{
"epoch": 0.49406688241639696,
"grad_norm": 1.5360465049743652,
"learning_rate": 1.936767304615237e-06,
"loss": 0.6741431951522827,
"step": 458
},
{
"epoch": 0.4962243797195254,
"grad_norm": 2.7520928382873535,
"learning_rate": 1.935976525443782e-06,
"loss": 0.6988986730575562,
"step": 460
},
{
"epoch": 0.49838187702265374,
"grad_norm": 3.1902847290039062,
"learning_rate": 1.935181014576573e-06,
"loss": 0.6338163614273071,
"step": 462
},
{
"epoch": 0.5005393743257821,
"grad_norm": 2.226433515548706,
"learning_rate": 1.934380776516266e-06,
"loss": 0.6862495541572571,
"step": 464
},
{
"epoch": 0.5026968716289104,
"grad_norm": 1.4880640506744385,
"learning_rate": 1.9335758157922757e-06,
"loss": 0.7557521462440491,
"step": 466
},
{
"epoch": 0.5048543689320388,
"grad_norm": 1.4232990741729736,
"learning_rate": 1.932766136960745e-06,
"loss": 0.675652027130127,
"step": 468
},
{
"epoch": 0.5070118662351673,
"grad_norm": 2.1612069606781006,
"learning_rate": 1.931951744604522e-06,
"loss": 0.621537446975708,
"step": 470
},
{
"epoch": 0.5091693635382956,
"grad_norm": 1.3291016817092896,
"learning_rate": 1.9311326433331355e-06,
"loss": 0.6317250728607178,
"step": 472
},
{
"epoch": 0.511326860841424,
"grad_norm": 1.6099094152450562,
"learning_rate": 1.9303088377827653e-06,
"loss": 0.7552534937858582,
"step": 474
},
{
"epoch": 0.5134843581445523,
"grad_norm": 1.411257028579712,
"learning_rate": 1.9294803326162187e-06,
"loss": 0.7963615655899048,
"step": 476
},
{
"epoch": 0.5156418554476807,
"grad_norm": 2.952651262283325,
"learning_rate": 1.9286471325229026e-06,
"loss": 0.7329859733581543,
"step": 478
},
{
"epoch": 0.517799352750809,
"grad_norm": 0.9981465935707092,
"learning_rate": 1.9278092422187978e-06,
"loss": 0.4232223331928253,
"step": 480
},
{
"epoch": 0.5199568500539374,
"grad_norm": 1.3622761964797974,
"learning_rate": 1.926966666446433e-06,
"loss": 0.6612151265144348,
"step": 482
},
{
"epoch": 0.5221143473570659,
"grad_norm": 2.9351611137390137,
"learning_rate": 1.9261194099748554e-06,
"loss": 0.6452651023864746,
"step": 484
},
{
"epoch": 0.5242718446601942,
"grad_norm": 2.3402223587036133,
"learning_rate": 1.9252674775996062e-06,
"loss": 0.7631157040596008,
"step": 486
},
{
"epoch": 0.5264293419633226,
"grad_norm": 2.175255537033081,
"learning_rate": 1.9244108741426933e-06,
"loss": 0.6183757781982422,
"step": 488
},
{
"epoch": 0.5285868392664509,
"grad_norm": 1.3048573732376099,
"learning_rate": 1.923549604452562e-06,
"loss": 0.5466787219047546,
"step": 490
},
{
"epoch": 0.5307443365695793,
"grad_norm": 6.882724761962891,
"learning_rate": 1.9226836734040696e-06,
"loss": 0.6256377696990967,
"step": 492
},
{
"epoch": 0.5329018338727076,
"grad_norm": 5.080470085144043,
"learning_rate": 1.9218130858984566e-06,
"loss": 0.7089909315109253,
"step": 494
},
{
"epoch": 0.535059331175836,
"grad_norm": 5.151968955993652,
"learning_rate": 1.92093784686332e-06,
"loss": 0.5963342785835266,
"step": 496
},
{
"epoch": 0.5372168284789643,
"grad_norm": 2.0868022441864014,
"learning_rate": 1.9200579612525847e-06,
"loss": 0.7230027318000793,
"step": 498
},
{
"epoch": 0.5393743257820928,
"grad_norm": 1.5343974828720093,
"learning_rate": 1.919173434046476e-06,
"loss": 0.5582040548324585,
"step": 500
},
{
"epoch": 0.5415318230852212,
"grad_norm": 2.0801985263824463,
"learning_rate": 1.9182842702514894e-06,
"loss": 0.7240785956382751,
"step": 502
},
{
"epoch": 0.5436893203883495,
"grad_norm": 4.034970760345459,
"learning_rate": 1.917390474900365e-06,
"loss": 0.6458247900009155,
"step": 504
},
{
"epoch": 0.5458468176914779,
"grad_norm": 1.5025601387023926,
"learning_rate": 1.916492053052059e-06,
"loss": 0.7182348370552063,
"step": 506
},
{
"epoch": 0.5480043149946062,
"grad_norm": 1.2147194147109985,
"learning_rate": 1.915589009791712e-06,
"loss": 0.7499125599861145,
"step": 508
},
{
"epoch": 0.5501618122977346,
"grad_norm": 1.183869481086731,
"learning_rate": 1.914681350230623e-06,
"loss": 0.6138162612915039,
"step": 510
},
{
"epoch": 0.552319309600863,
"grad_norm": 1.6860522031784058,
"learning_rate": 1.9137690795062195e-06,
"loss": 0.665122389793396,
"step": 512
},
{
"epoch": 0.5544768069039914,
"grad_norm": 2.0282976627349854,
"learning_rate": 1.9128522027820286e-06,
"loss": 0.6816024780273438,
"step": 514
},
{
"epoch": 0.5566343042071198,
"grad_norm": 4.281038284301758,
"learning_rate": 1.911930725247649e-06,
"loss": 0.5960591435432434,
"step": 516
},
{
"epoch": 0.5587918015102481,
"grad_norm": 1.3044649362564087,
"learning_rate": 1.911004652118718e-06,
"loss": 0.7166500687599182,
"step": 518
},
{
"epoch": 0.5609492988133765,
"grad_norm": 4.467653751373291,
"learning_rate": 1.9100739886368856e-06,
"loss": 0.6787055134773254,
"step": 520
},
{
"epoch": 0.5631067961165048,
"grad_norm": 6.644638538360596,
"learning_rate": 1.9091387400697836e-06,
"loss": 0.6345533728599548,
"step": 522
},
{
"epoch": 0.5652642934196332,
"grad_norm": 2.0383713245391846,
"learning_rate": 1.908198911710996e-06,
"loss": 0.432686984539032,
"step": 524
},
{
"epoch": 0.5674217907227616,
"grad_norm": 1.5780389308929443,
"learning_rate": 1.9072545088800281e-06,
"loss": 0.7076600790023804,
"step": 526
},
{
"epoch": 0.56957928802589,
"grad_norm": 1.6204893589019775,
"learning_rate": 1.9063055369222779e-06,
"loss": 0.6012558341026306,
"step": 528
},
{
"epoch": 0.5717367853290184,
"grad_norm": 6.985592842102051,
"learning_rate": 1.905352001209004e-06,
"loss": 0.6433860063552856,
"step": 530
},
{
"epoch": 0.5738942826321467,
"grad_norm": 1.4386237859725952,
"learning_rate": 1.9043939071372968e-06,
"loss": 0.6871167421340942,
"step": 532
},
{
"epoch": 0.5760517799352751,
"grad_norm": 1.2262943983078003,
"learning_rate": 1.9034312601300479e-06,
"loss": 0.7119494080543518,
"step": 534
},
{
"epoch": 0.5782092772384034,
"grad_norm": 2.725543975830078,
"learning_rate": 1.9024640656359182e-06,
"loss": 0.5970579385757446,
"step": 536
},
{
"epoch": 0.5803667745415318,
"grad_norm": 1.5997346639633179,
"learning_rate": 1.901492329129308e-06,
"loss": 0.6494900584220886,
"step": 538
},
{
"epoch": 0.5825242718446602,
"grad_norm": 3.0856845378875732,
"learning_rate": 1.9005160561103253e-06,
"loss": 0.7896479368209839,
"step": 540
},
{
"epoch": 0.5846817691477886,
"grad_norm": 2.178337574005127,
"learning_rate": 1.8995352521047555e-06,
"loss": 0.7269325256347656,
"step": 542
},
{
"epoch": 0.5868392664509169,
"grad_norm": 2.0112404823303223,
"learning_rate": 1.8985499226640302e-06,
"loss": 0.5430014133453369,
"step": 544
},
{
"epoch": 0.5889967637540453,
"grad_norm": 2.0597128868103027,
"learning_rate": 1.897560073365195e-06,
"loss": 0.7385756969451904,
"step": 546
},
{
"epoch": 0.5911542610571737,
"grad_norm": 0.3906221091747284,
"learning_rate": 1.8965657098108778e-06,
"loss": 0.739960253238678,
"step": 548
},
{
"epoch": 0.593311758360302,
"grad_norm": 6.757683753967285,
"learning_rate": 1.8955668376292584e-06,
"loss": 0.5648355484008789,
"step": 550
},
{
"epoch": 0.5954692556634305,
"grad_norm": 1.4673168659210205,
"learning_rate": 1.8945634624740346e-06,
"loss": 0.6756861209869385,
"step": 552
},
{
"epoch": 0.5976267529665588,
"grad_norm": 2.3454344272613525,
"learning_rate": 1.8935555900243924e-06,
"loss": 0.693338930606842,
"step": 554
},
{
"epoch": 0.5997842502696872,
"grad_norm": 3.2340376377105713,
"learning_rate": 1.8925432259849734e-06,
"loss": 0.6485008001327515,
"step": 556
},
{
"epoch": 0.6019417475728155,
"grad_norm": 3.2170920372009277,
"learning_rate": 1.89152637608584e-06,
"loss": 0.6817625164985657,
"step": 558
},
{
"epoch": 0.6040992448759439,
"grad_norm": 1.9984098672866821,
"learning_rate": 1.8905050460824468e-06,
"loss": 0.7717204093933105,
"step": 560
},
{
"epoch": 0.6062567421790723,
"grad_norm": 2.4421756267547607,
"learning_rate": 1.8894792417556051e-06,
"loss": 0.6852340698242188,
"step": 562
},
{
"epoch": 0.6084142394822006,
"grad_norm": 2.060135841369629,
"learning_rate": 1.888448968911452e-06,
"loss": 0.7176313996315002,
"step": 564
},
{
"epoch": 0.6105717367853291,
"grad_norm": 3.1218700408935547,
"learning_rate": 1.887414233381416e-06,
"loss": 0.6021454334259033,
"step": 566
},
{
"epoch": 0.6127292340884574,
"grad_norm": 1.8716174364089966,
"learning_rate": 1.8863750410221855e-06,
"loss": 0.6650149822235107,
"step": 568
},
{
"epoch": 0.6148867313915858,
"grad_norm": 1.871856689453125,
"learning_rate": 1.8853313977156739e-06,
"loss": 0.6372621655464172,
"step": 570
},
{
"epoch": 0.6170442286947141,
"grad_norm": 2.7764410972595215,
"learning_rate": 1.8842833093689885e-06,
"loss": 0.6875618100166321,
"step": 572
},
{
"epoch": 0.6192017259978425,
"grad_norm": 1.4079262018203735,
"learning_rate": 1.8832307819143953e-06,
"loss": 0.685975968837738,
"step": 574
},
{
"epoch": 0.6213592233009708,
"grad_norm": 5.893849849700928,
"learning_rate": 1.8821738213092862e-06,
"loss": 0.631260871887207,
"step": 576
},
{
"epoch": 0.6235167206040992,
"grad_norm": 2.4246366024017334,
"learning_rate": 1.8811124335361445e-06,
"loss": 0.6432245373725891,
"step": 578
},
{
"epoch": 0.6256742179072277,
"grad_norm": 56.797996520996094,
"learning_rate": 1.8800466246025129e-06,
"loss": 0.6804959177970886,
"step": 580
},
{
"epoch": 0.627831715210356,
"grad_norm": 1.7273633480072021,
"learning_rate": 1.8789764005409568e-06,
"loss": 0.5822848677635193,
"step": 582
},
{
"epoch": 0.6299892125134844,
"grad_norm": 1.6046229600906372,
"learning_rate": 1.8779017674090322e-06,
"loss": 0.7005263566970825,
"step": 584
},
{
"epoch": 0.6321467098166127,
"grad_norm": 1.5924113988876343,
"learning_rate": 1.8768227312892515e-06,
"loss": 0.7687848210334778,
"step": 586
},
{
"epoch": 0.6343042071197411,
"grad_norm": 2.035219430923462,
"learning_rate": 1.875739298289047e-06,
"loss": 0.5710114240646362,
"step": 588
},
{
"epoch": 0.6364617044228694,
"grad_norm": 2.202737808227539,
"learning_rate": 1.8746514745407386e-06,
"loss": 0.7539809346199036,
"step": 590
},
{
"epoch": 0.6386192017259978,
"grad_norm": 5.263622760772705,
"learning_rate": 1.8735592662014985e-06,
"loss": 0.7617581486701965,
"step": 592
},
{
"epoch": 0.6407766990291263,
"grad_norm": 17.145244598388672,
"learning_rate": 1.872462679453315e-06,
"loss": 0.8727496266365051,
"step": 594
},
{
"epoch": 0.6429341963322546,
"grad_norm": 1.9058817625045776,
"learning_rate": 1.871361720502959e-06,
"loss": 0.6560637950897217,
"step": 596
},
{
"epoch": 0.645091693635383,
"grad_norm": 2.8487465381622314,
"learning_rate": 1.8702563955819493e-06,
"loss": 0.5254390835762024,
"step": 598
},
{
"epoch": 0.6472491909385113,
"grad_norm": 2.9062187671661377,
"learning_rate": 1.869146710946515e-06,
"loss": 0.7910107970237732,
"step": 600
},
{
"epoch": 0.6494066882416397,
"grad_norm": 4.04607629776001,
"learning_rate": 1.8680326728775622e-06,
"loss": 0.6240645051002502,
"step": 602
},
{
"epoch": 0.651564185544768,
"grad_norm": 1.921399474143982,
"learning_rate": 1.866914287680638e-06,
"loss": 0.8376886248588562,
"step": 604
},
{
"epoch": 0.6537216828478964,
"grad_norm": 7.570333957672119,
"learning_rate": 1.8657915616858946e-06,
"loss": 0.7127501368522644,
"step": 606
},
{
"epoch": 0.6558791801510249,
"grad_norm": 1.5097516775131226,
"learning_rate": 1.864664501248053e-06,
"loss": 0.5545579195022583,
"step": 608
},
{
"epoch": 0.6580366774541532,
"grad_norm": 3.1739096641540527,
"learning_rate": 1.8635331127463678e-06,
"loss": 0.6854344010353088,
"step": 610
},
{
"epoch": 0.6601941747572816,
"grad_norm": 2.4847121238708496,
"learning_rate": 1.8623974025845913e-06,
"loss": 0.6225752234458923,
"step": 612
},
{
"epoch": 0.6623516720604099,
"grad_norm": 2.919856071472168,
"learning_rate": 1.8612573771909354e-06,
"loss": 0.7242894172668457,
"step": 614
},
{
"epoch": 0.6645091693635383,
"grad_norm": 1.5826700925827026,
"learning_rate": 1.8601130430180384e-06,
"loss": 0.7404430508613586,
"step": 616
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.9459686279296875,
"learning_rate": 1.8589644065429246e-06,
"loss": 0.7019950747489929,
"step": 618
},
{
"epoch": 0.668824163969795,
"grad_norm": 2.677245855331421,
"learning_rate": 1.8578114742669712e-06,
"loss": 0.6545602083206177,
"step": 620
},
{
"epoch": 0.6709816612729234,
"grad_norm": 6.284696578979492,
"learning_rate": 1.85665425271587e-06,
"loss": 0.4951339364051819,
"step": 622
},
{
"epoch": 0.6731391585760518,
"grad_norm": 1.498757243156433,
"learning_rate": 1.8554927484395892e-06,
"loss": 0.7832834720611572,
"step": 624
},
{
"epoch": 0.6752966558791802,
"grad_norm": 2.4137635231018066,
"learning_rate": 1.8543269680123387e-06,
"loss": 0.6441301107406616,
"step": 626
},
{
"epoch": 0.6774541531823085,
"grad_norm": 4.308967590332031,
"learning_rate": 1.853156918032531e-06,
"loss": 0.7098633050918579,
"step": 628
},
{
"epoch": 0.6796116504854369,
"grad_norm": 2.8264269828796387,
"learning_rate": 1.851982605122746e-06,
"loss": 0.610696017742157,
"step": 630
},
{
"epoch": 0.6817691477885652,
"grad_norm": 1.2851277589797974,
"learning_rate": 1.8508040359296903e-06,
"loss": 0.7390373945236206,
"step": 632
},
{
"epoch": 0.6839266450916937,
"grad_norm": 1.6539459228515625,
"learning_rate": 1.8496212171241626e-06,
"loss": 0.5240519046783447,
"step": 634
},
{
"epoch": 0.686084142394822,
"grad_norm": 1.8807573318481445,
"learning_rate": 1.8484341554010143e-06,
"loss": 0.4707701504230499,
"step": 636
},
{
"epoch": 0.6882416396979504,
"grad_norm": 2.358454704284668,
"learning_rate": 1.8472428574791121e-06,
"loss": 0.7253568172454834,
"step": 638
},
{
"epoch": 0.6903991370010788,
"grad_norm": 2.440108299255371,
"learning_rate": 1.8460473301013004e-06,
"loss": 0.7356727123260498,
"step": 640
},
{
"epoch": 0.6925566343042071,
"grad_norm": 2.899152994155884,
"learning_rate": 1.844847580034362e-06,
"loss": 0.6664748191833496,
"step": 642
},
{
"epoch": 0.6947141316073355,
"grad_norm": 5.704761028289795,
"learning_rate": 1.843643614068981e-06,
"loss": 0.7694708108901978,
"step": 644
},
{
"epoch": 0.6968716289104638,
"grad_norm": 1.51004159450531,
"learning_rate": 1.842435439019703e-06,
"loss": 0.6821762323379517,
"step": 646
},
{
"epoch": 0.6990291262135923,
"grad_norm": 5.242131233215332,
"learning_rate": 1.8412230617248988e-06,
"loss": 0.6199461221694946,
"step": 648
},
{
"epoch": 0.7011866235167206,
"grad_norm": 2.5778682231903076,
"learning_rate": 1.8400064890467229e-06,
"loss": 0.6760554313659668,
"step": 650
},
{
"epoch": 0.703344120819849,
"grad_norm": 1.4639006853103638,
"learning_rate": 1.8387857278710763e-06,
"loss": 0.662639856338501,
"step": 652
},
{
"epoch": 0.7055016181229773,
"grad_norm": 2.5555951595306396,
"learning_rate": 1.8375607851075678e-06,
"loss": 0.5903278589248657,
"step": 654
},
{
"epoch": 0.7076591154261057,
"grad_norm": 1.839576244354248,
"learning_rate": 1.8363316676894743e-06,
"loss": 0.659648597240448,
"step": 656
},
{
"epoch": 0.7098166127292341,
"grad_norm": 4.13273811340332,
"learning_rate": 1.8350983825737008e-06,
"loss": 0.5222451090812683,
"step": 658
},
{
"epoch": 0.7119741100323624,
"grad_norm": 1.8703253269195557,
"learning_rate": 1.833860936740742e-06,
"loss": 0.7516009211540222,
"step": 660
},
{
"epoch": 0.7141316073354909,
"grad_norm": 1.5587713718414307,
"learning_rate": 1.8326193371946435e-06,
"loss": 0.6802030801773071,
"step": 662
},
{
"epoch": 0.7162891046386192,
"grad_norm": 1.9971494674682617,
"learning_rate": 1.8313735909629605e-06,
"loss": 0.5823180675506592,
"step": 664
},
{
"epoch": 0.7184466019417476,
"grad_norm": 3.314469575881958,
"learning_rate": 1.8301237050967186e-06,
"loss": 0.6089075207710266,
"step": 666
},
{
"epoch": 0.7206040992448759,
"grad_norm": 2.5151665210723877,
"learning_rate": 1.8288696866703752e-06,
"loss": 0.5487096309661865,
"step": 668
},
{
"epoch": 0.7227615965480043,
"grad_norm": 1.755199909210205,
"learning_rate": 1.827611542781777e-06,
"loss": 0.6520088911056519,
"step": 670
},
{
"epoch": 0.7249190938511327,
"grad_norm": 2.233076333999634,
"learning_rate": 1.826349280552121e-06,
"loss": 0.6878398656845093,
"step": 672
},
{
"epoch": 0.727076591154261,
"grad_norm": 2.0914413928985596,
"learning_rate": 1.8250829071259162e-06,
"loss": 0.6050041317939758,
"step": 674
},
{
"epoch": 0.7292340884573895,
"grad_norm": 3.670649528503418,
"learning_rate": 1.8238124296709396e-06,
"loss": 0.5783309936523438,
"step": 676
},
{
"epoch": 0.7313915857605178,
"grad_norm": 4.5103559494018555,
"learning_rate": 1.8225378553781978e-06,
"loss": 0.5625826120376587,
"step": 678
},
{
"epoch": 0.7335490830636462,
"grad_norm": 2.3067467212677,
"learning_rate": 1.821259191461886e-06,
"loss": 0.6222144365310669,
"step": 680
},
{
"epoch": 0.7357065803667745,
"grad_norm": 4.616910934448242,
"learning_rate": 1.819976445159347e-06,
"loss": 0.6577675938606262,
"step": 682
},
{
"epoch": 0.7378640776699029,
"grad_norm": 2.574132204055786,
"learning_rate": 1.81868962373103e-06,
"loss": 0.5882217884063721,
"step": 684
},
{
"epoch": 0.7400215749730313,
"grad_norm": 1.4304159879684448,
"learning_rate": 1.8173987344604505e-06,
"loss": 0.7386992573738098,
"step": 686
},
{
"epoch": 0.7421790722761596,
"grad_norm": 2.1306235790252686,
"learning_rate": 1.816103784654147e-06,
"loss": 0.586725115776062,
"step": 688
},
{
"epoch": 0.7443365695792881,
"grad_norm": 1.9864001274108887,
"learning_rate": 1.814804781641642e-06,
"loss": 0.5822692513465881,
"step": 690
},
{
"epoch": 0.7464940668824164,
"grad_norm": 1.6799951791763306,
"learning_rate": 1.8135017327753992e-06,
"loss": 0.630893886089325,
"step": 692
},
{
"epoch": 0.7486515641855448,
"grad_norm": 1.0661367177963257,
"learning_rate": 1.8121946454307816e-06,
"loss": 0.682563066482544,
"step": 694
},
{
"epoch": 0.7508090614886731,
"grad_norm": 4.472043514251709,
"learning_rate": 1.8108835270060122e-06,
"loss": 0.6360002756118774,
"step": 696
},
{
"epoch": 0.7529665587918015,
"grad_norm": 1.2949084043502808,
"learning_rate": 1.8095683849221276e-06,
"loss": 0.6381992101669312,
"step": 698
},
{
"epoch": 0.7551240560949298,
"grad_norm": 1.5483993291854858,
"learning_rate": 1.8082492266229404e-06,
"loss": 0.7825127243995667,
"step": 700
},
{
"epoch": 0.7572815533980582,
"grad_norm": 2.104930877685547,
"learning_rate": 1.806926059574995e-06,
"loss": 0.5905802845954895,
"step": 702
},
{
"epoch": 0.7594390507011867,
"grad_norm": 2.691180467605591,
"learning_rate": 1.805598891267525e-06,
"loss": 0.6105803847312927,
"step": 704
},
{
"epoch": 0.761596548004315,
"grad_norm": 2.662587881088257,
"learning_rate": 1.8042677292124127e-06,
"loss": 0.7156485319137573,
"step": 706
},
{
"epoch": 0.7637540453074434,
"grad_norm": 4.052894115447998,
"learning_rate": 1.802932580944144e-06,
"loss": 0.6582145690917969,
"step": 708
},
{
"epoch": 0.7659115426105717,
"grad_norm": 1.772103190422058,
"learning_rate": 1.801593454019768e-06,
"loss": 0.5497456789016724,
"step": 710
},
{
"epoch": 0.7680690399137001,
"grad_norm": 4.38840913772583,
"learning_rate": 1.8002503560188531e-06,
"loss": 0.8528274893760681,
"step": 712
},
{
"epoch": 0.7702265372168284,
"grad_norm": 0.47714903950691223,
"learning_rate": 1.798903294543444e-06,
"loss": 0.6722896695137024,
"step": 714
},
{
"epoch": 0.7723840345199569,
"grad_norm": 1.1433959007263184,
"learning_rate": 1.797552277218019e-06,
"loss": 0.640397310256958,
"step": 716
},
{
"epoch": 0.7745415318230853,
"grad_norm": 2.8816723823547363,
"learning_rate": 1.7961973116894475e-06,
"loss": 0.43922677636146545,
"step": 718
},
{
"epoch": 0.7766990291262136,
"grad_norm": 1.3017030954360962,
"learning_rate": 1.7948384056269452e-06,
"loss": 0.6236469745635986,
"step": 720
},
{
"epoch": 0.778856526429342,
"grad_norm": 5.502106189727783,
"learning_rate": 1.7934755667220324e-06,
"loss": 0.6106448769569397,
"step": 722
},
{
"epoch": 0.7810140237324703,
"grad_norm": 3.15694522857666,
"learning_rate": 1.7921088026884895e-06,
"loss": 0.7106237411499023,
"step": 724
},
{
"epoch": 0.7831715210355987,
"grad_norm": 1.253527283668518,
"learning_rate": 1.7907381212623119e-06,
"loss": 0.6325215101242065,
"step": 726
},
{
"epoch": 0.785329018338727,
"grad_norm": 5.992726802825928,
"learning_rate": 1.7893635302016699e-06,
"loss": 0.698371946811676,
"step": 728
},
{
"epoch": 0.7874865156418555,
"grad_norm": 2.6129043102264404,
"learning_rate": 1.7879850372868614e-06,
"loss": 0.8592634797096252,
"step": 730
},
{
"epoch": 0.7896440129449838,
"grad_norm": 1.9722578525543213,
"learning_rate": 1.7866026503202696e-06,
"loss": 0.7127001881599426,
"step": 732
},
{
"epoch": 0.7918015102481122,
"grad_norm": 2.3035688400268555,
"learning_rate": 1.7852163771263183e-06,
"loss": 0.7264171242713928,
"step": 734
},
{
"epoch": 0.7939590075512406,
"grad_norm": 1.6729274988174438,
"learning_rate": 1.7838262255514273e-06,
"loss": 0.6522683501243591,
"step": 736
},
{
"epoch": 0.7961165048543689,
"grad_norm": 1.2953232526779175,
"learning_rate": 1.7824322034639688e-06,
"loss": 0.7508292198181152,
"step": 738
},
{
"epoch": 0.7982740021574973,
"grad_norm": 1.8854900598526,
"learning_rate": 1.781034318754222e-06,
"loss": 0.8205673098564148,
"step": 740
},
{
"epoch": 0.8004314994606256,
"grad_norm": 2.379824161529541,
"learning_rate": 1.7796325793343296e-06,
"loss": 0.627574622631073,
"step": 742
},
{
"epoch": 0.8025889967637541,
"grad_norm": 1.9717144966125488,
"learning_rate": 1.7782269931382514e-06,
"loss": 0.41914719343185425,
"step": 744
},
{
"epoch": 0.8047464940668824,
"grad_norm": 3.594667911529541,
"learning_rate": 1.7768175681217208e-06,
"loss": 0.40262705087661743,
"step": 746
},
{
"epoch": 0.8069039913700108,
"grad_norm": 1.6693843603134155,
"learning_rate": 1.7754043122621986e-06,
"loss": 0.6387592554092407,
"step": 748
},
{
"epoch": 0.8090614886731392,
"grad_norm": 5.557443141937256,
"learning_rate": 1.7739872335588298e-06,
"loss": 0.6391375064849854,
"step": 750
},
{
"epoch": 0.8112189859762675,
"grad_norm": 1.475829839706421,
"learning_rate": 1.7725663400323957e-06,
"loss": 0.5560780167579651,
"step": 752
},
{
"epoch": 0.8133764832793959,
"grad_norm": 3.5974369049072266,
"learning_rate": 1.77114163972527e-06,
"loss": 0.7343906164169312,
"step": 754
},
{
"epoch": 0.8155339805825242,
"grad_norm": 1.595281720161438,
"learning_rate": 1.769713140701374e-06,
"loss": 0.6695587038993835,
"step": 756
},
{
"epoch": 0.8176914778856527,
"grad_norm": 1.641003131866455,
"learning_rate": 1.7682808510461292e-06,
"loss": 0.7364107370376587,
"step": 758
},
{
"epoch": 0.819848975188781,
"grad_norm": 1.8976866006851196,
"learning_rate": 1.7668447788664126e-06,
"loss": 0.5367798209190369,
"step": 760
},
{
"epoch": 0.8220064724919094,
"grad_norm": 2.350424289703369,
"learning_rate": 1.7654049322905105e-06,
"loss": 0.6110427379608154,
"step": 762
},
{
"epoch": 0.8241639697950378,
"grad_norm": 1.7859790325164795,
"learning_rate": 1.7639613194680727e-06,
"loss": 0.8835413455963135,
"step": 764
},
{
"epoch": 0.8263214670981661,
"grad_norm": 1.9460476636886597,
"learning_rate": 1.7625139485700664e-06,
"loss": 0.5881315469741821,
"step": 766
},
{
"epoch": 0.8284789644012945,
"grad_norm": 17.784387588500977,
"learning_rate": 1.7610628277887297e-06,
"loss": 0.5561118721961975,
"step": 768
},
{
"epoch": 0.8306364617044228,
"grad_norm": 2.5915396213531494,
"learning_rate": 1.7596079653375253e-06,
"loss": 0.6103290319442749,
"step": 770
},
{
"epoch": 0.8327939590075513,
"grad_norm": 7.325887680053711,
"learning_rate": 1.758149369451094e-06,
"loss": 0.52987140417099,
"step": 772
},
{
"epoch": 0.8349514563106796,
"grad_norm": 3.882723093032837,
"learning_rate": 1.7566870483852086e-06,
"loss": 0.7465340495109558,
"step": 774
},
{
"epoch": 0.837108953613808,
"grad_norm": 5.062621593475342,
"learning_rate": 1.7552210104167257e-06,
"loss": 0.6753080487251282,
"step": 776
},
{
"epoch": 0.8392664509169363,
"grad_norm": 6.415841102600098,
"learning_rate": 1.753751263843541e-06,
"loss": 0.693338930606842,
"step": 778
},
{
"epoch": 0.8414239482200647,
"grad_norm": 2.172607898712158,
"learning_rate": 1.7522778169845408e-06,
"loss": 0.7129068374633789,
"step": 780
},
{
"epoch": 0.8435814455231931,
"grad_norm": 2.3066418170928955,
"learning_rate": 1.7508006781795555e-06,
"loss": 0.5250005722045898,
"step": 782
},
{
"epoch": 0.8457389428263214,
"grad_norm": 1.514641523361206,
"learning_rate": 1.7493198557893109e-06,
"loss": 0.5880756378173828,
"step": 784
},
{
"epoch": 0.8478964401294499,
"grad_norm": 2.358647584915161,
"learning_rate": 1.7478353581953846e-06,
"loss": 0.6020887494087219,
"step": 786
},
{
"epoch": 0.8500539374325782,
"grad_norm": 2.5027408599853516,
"learning_rate": 1.746347193800154e-06,
"loss": 0.7379757165908813,
"step": 788
},
{
"epoch": 0.8522114347357066,
"grad_norm": 1.7015382051467896,
"learning_rate": 1.7448553710267519e-06,
"loss": 0.3867076337337494,
"step": 790
},
{
"epoch": 0.8543689320388349,
"grad_norm": 24.41814613342285,
"learning_rate": 1.7433598983190181e-06,
"loss": 0.5596577525138855,
"step": 792
},
{
"epoch": 0.8565264293419633,
"grad_norm": 0.5729015469551086,
"learning_rate": 1.74186078414145e-06,
"loss": 0.37773168087005615,
"step": 794
},
{
"epoch": 0.8586839266450917,
"grad_norm": 1.438088297843933,
"learning_rate": 1.7403580369791577e-06,
"loss": 0.6138755679130554,
"step": 796
},
{
"epoch": 0.86084142394822,
"grad_norm": 1.3894504308700562,
"learning_rate": 1.7388516653378134e-06,
"loss": 0.6411980986595154,
"step": 798
},
{
"epoch": 0.8629989212513485,
"grad_norm": 7.195361137390137,
"learning_rate": 1.7373416777436036e-06,
"loss": 0.5361164808273315,
"step": 800
},
{
"epoch": 0.8651564185544768,
"grad_norm": 3.299745798110962,
"learning_rate": 1.7358280827431829e-06,
"loss": 0.45560529828071594,
"step": 802
},
{
"epoch": 0.8673139158576052,
"grad_norm": 2.5834922790527344,
"learning_rate": 1.7343108889036223e-06,
"loss": 0.5199063420295715,
"step": 804
},
{
"epoch": 0.8694714131607335,
"grad_norm": 2.0384316444396973,
"learning_rate": 1.7327901048123644e-06,
"loss": 0.6027982234954834,
"step": 806
},
{
"epoch": 0.8716289104638619,
"grad_norm": 3.682217597961426,
"learning_rate": 1.7312657390771714e-06,
"loss": 0.6176765561103821,
"step": 808
},
{
"epoch": 0.8737864077669902,
"grad_norm": 3.4343974590301514,
"learning_rate": 1.7297378003260787e-06,
"loss": 0.6307402849197388,
"step": 810
},
{
"epoch": 0.8759439050701187,
"grad_norm": 18.784271240234375,
"learning_rate": 1.728206297207345e-06,
"loss": 0.4677152633666992,
"step": 812
},
{
"epoch": 0.8781014023732471,
"grad_norm": 1.2662842273712158,
"learning_rate": 1.7266712383894037e-06,
"loss": 0.6467829346656799,
"step": 814
},
{
"epoch": 0.8802588996763754,
"grad_norm": 1.4935745000839233,
"learning_rate": 1.7251326325608135e-06,
"loss": 0.6746770143508911,
"step": 816
},
{
"epoch": 0.8824163969795038,
"grad_norm": 3.506131410598755,
"learning_rate": 1.7235904884302098e-06,
"loss": 0.6060282588005066,
"step": 818
},
{
"epoch": 0.8845738942826321,
"grad_norm": 3.4990806579589844,
"learning_rate": 1.7220448147262555e-06,
"loss": 0.5744661688804626,
"step": 820
},
{
"epoch": 0.8867313915857605,
"grad_norm": 1.9610271453857422,
"learning_rate": 1.7204956201975898e-06,
"loss": 0.6914322376251221,
"step": 822
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.512073040008545,
"learning_rate": 1.7189429136127814e-06,
"loss": 0.6700202226638794,
"step": 824
},
{
"epoch": 0.8910463861920173,
"grad_norm": 2.086268663406372,
"learning_rate": 1.7173867037602767e-06,
"loss": 0.7067221403121948,
"step": 826
},
{
"epoch": 0.8932038834951457,
"grad_norm": 3.7312817573547363,
"learning_rate": 1.7158269994483514e-06,
"loss": 0.31625503301620483,
"step": 828
},
{
"epoch": 0.895361380798274,
"grad_norm": 6.126044750213623,
"learning_rate": 1.71426380950506e-06,
"loss": 0.5323830842971802,
"step": 830
},
{
"epoch": 0.8975188781014024,
"grad_norm": 1.7195242643356323,
"learning_rate": 1.712697142778186e-06,
"loss": 0.782951831817627,
"step": 832
},
{
"epoch": 0.8996763754045307,
"grad_norm": 8.366249084472656,
"learning_rate": 1.7111270081351913e-06,
"loss": 0.5681637525558472,
"step": 834
},
{
"epoch": 0.9018338727076591,
"grad_norm": 2.791904926300049,
"learning_rate": 1.7095534144631668e-06,
"loss": 0.7307286858558655,
"step": 836
},
{
"epoch": 0.9039913700107874,
"grad_norm": 1.9204684495925903,
"learning_rate": 1.7079763706687827e-06,
"loss": 0.6743446588516235,
"step": 838
},
{
"epoch": 0.9061488673139159,
"grad_norm": 5.066476821899414,
"learning_rate": 1.706395885678235e-06,
"loss": 0.6655571460723877,
"step": 840
},
{
"epoch": 0.9083063646170443,
"grad_norm": 4.142644882202148,
"learning_rate": 1.7048119684371996e-06,
"loss": 0.6895488500595093,
"step": 842
},
{
"epoch": 0.9104638619201726,
"grad_norm": 7.594639778137207,
"learning_rate": 1.7032246279107776e-06,
"loss": 0.8503600358963013,
"step": 844
},
{
"epoch": 0.912621359223301,
"grad_norm": 8.625396728515625,
"learning_rate": 1.7016338730834468e-06,
"loss": 0.8498875498771667,
"step": 846
},
{
"epoch": 0.9147788565264293,
"grad_norm": 2.5336923599243164,
"learning_rate": 1.7000397129590104e-06,
"loss": 0.49179524183273315,
"step": 848
},
{
"epoch": 0.9169363538295577,
"grad_norm": 12.546621322631836,
"learning_rate": 1.6984421565605447e-06,
"loss": 0.7858133912086487,
"step": 850
},
{
"epoch": 0.919093851132686,
"grad_norm": 1.803154706954956,
"learning_rate": 1.696841212930351e-06,
"loss": 0.42831236124038696,
"step": 852
},
{
"epoch": 0.9212513484358145,
"grad_norm": 1.1497598886489868,
"learning_rate": 1.695236891129901e-06,
"loss": 0.6902183294296265,
"step": 854
},
{
"epoch": 0.9234088457389428,
"grad_norm": 0.7733110785484314,
"learning_rate": 1.6936292002397876e-06,
"loss": 0.7910528182983398,
"step": 856
},
{
"epoch": 0.9255663430420712,
"grad_norm": 4.334436893463135,
"learning_rate": 1.692018149359674e-06,
"loss": 0.6410449743270874,
"step": 858
},
{
"epoch": 0.9277238403451996,
"grad_norm": 3.1473817825317383,
"learning_rate": 1.6904037476082403e-06,
"loss": 0.5418177247047424,
"step": 860
},
{
"epoch": 0.9298813376483279,
"grad_norm": 3.289321184158325,
"learning_rate": 1.6887860041231324e-06,
"loss": 0.8675633072853088,
"step": 862
},
{
"epoch": 0.9320388349514563,
"grad_norm": 0.5947059392929077,
"learning_rate": 1.6871649280609114e-06,
"loss": 0.7250087857246399,
"step": 864
},
{
"epoch": 0.9341963322545846,
"grad_norm": 1.812920331954956,
"learning_rate": 1.6855405285970012e-06,
"loss": 0.3274366855621338,
"step": 866
},
{
"epoch": 0.9363538295577131,
"grad_norm": 1.37776780128479,
"learning_rate": 1.6839128149256357e-06,
"loss": 0.7339057326316833,
"step": 868
},
{
"epoch": 0.9385113268608414,
"grad_norm": 2.6405365467071533,
"learning_rate": 1.6822817962598079e-06,
"loss": 0.4312754273414612,
"step": 870
},
{
"epoch": 0.9406688241639698,
"grad_norm": 32.455711364746094,
"learning_rate": 1.6806474818312178e-06,
"loss": 0.6649459600448608,
"step": 872
},
{
"epoch": 0.9428263214670982,
"grad_norm": 1.1529123783111572,
"learning_rate": 1.6790098808902187e-06,
"loss": 0.694479763507843,
"step": 874
},
{
"epoch": 0.9449838187702265,
"grad_norm": 6.395750045776367,
"learning_rate": 1.6773690027057665e-06,
"loss": 0.5320945978164673,
"step": 876
},
{
"epoch": 0.9471413160733549,
"grad_norm": 12.70807933807373,
"learning_rate": 1.6757248565653666e-06,
"loss": 0.7014382481575012,
"step": 878
},
{
"epoch": 0.9492988133764833,
"grad_norm": 2.392099142074585,
"learning_rate": 1.674077451775021e-06,
"loss": 0.9157409071922302,
"step": 880
},
{
"epoch": 0.9514563106796117,
"grad_norm": 1.741999864578247,
"learning_rate": 1.6724267976591756e-06,
"loss": 0.616689145565033,
"step": 882
},
{
"epoch": 0.95361380798274,
"grad_norm": 1.99687922000885,
"learning_rate": 1.6707729035606691e-06,
"loss": 0.5802426338195801,
"step": 884
},
{
"epoch": 0.9557713052858684,
"grad_norm": 1.6920971870422363,
"learning_rate": 1.6691157788406773e-06,
"loss": 0.42533692717552185,
"step": 886
},
{
"epoch": 0.9579288025889967,
"grad_norm": 3.21420955657959,
"learning_rate": 1.6674554328786616e-06,
"loss": 0.8310537338256836,
"step": 888
},
{
"epoch": 0.9600862998921251,
"grad_norm": 2.7011826038360596,
"learning_rate": 1.6657918750723176e-06,
"loss": 0.8436251282691956,
"step": 890
},
{
"epoch": 0.9622437971952535,
"grad_norm": 1.146492838859558,
"learning_rate": 1.6641251148375184e-06,
"loss": 0.4956342577934265,
"step": 892
},
{
"epoch": 0.9644012944983819,
"grad_norm": 11.642980575561523,
"learning_rate": 1.6624551616082635e-06,
"loss": 0.643322765827179,
"step": 894
},
{
"epoch": 0.9665587918015103,
"grad_norm": 1.810482144355774,
"learning_rate": 1.6607820248366257e-06,
"loss": 0.6843705177307129,
"step": 896
},
{
"epoch": 0.9687162891046386,
"grad_norm": 1.3741674423217773,
"learning_rate": 1.6591057139926966e-06,
"loss": 0.7010579109191895,
"step": 898
},
{
"epoch": 0.970873786407767,
"grad_norm": 3.2511136531829834,
"learning_rate": 1.6574262385645323e-06,
"loss": 0.6527800559997559,
"step": 900
},
{
"epoch": 0.9730312837108953,
"grad_norm": 3.3792011737823486,
"learning_rate": 1.6557436080581027e-06,
"loss": 0.6726928949356079,
"step": 902
},
{
"epoch": 0.9751887810140237,
"grad_norm": 1.9965680837631226,
"learning_rate": 1.6540578319972335e-06,
"loss": 0.7932605147361755,
"step": 904
},
{
"epoch": 0.9773462783171522,
"grad_norm": 14.346431732177734,
"learning_rate": 1.652368919923557e-06,
"loss": 0.6329518556594849,
"step": 906
},
{
"epoch": 0.9795037756202805,
"grad_norm": 1.1356829404830933,
"learning_rate": 1.6506768813964527e-06,
"loss": 0.6013335585594177,
"step": 908
},
{
"epoch": 0.9816612729234089,
"grad_norm": 1.9456955194473267,
"learning_rate": 1.6489817259929978e-06,
"loss": 0.6943175792694092,
"step": 910
},
{
"epoch": 0.9838187702265372,
"grad_norm": 75.11457824707031,
"learning_rate": 1.647283463307912e-06,
"loss": 0.499568372964859,
"step": 912
},
{
"epoch": 0.9859762675296656,
"grad_norm": 2.1002883911132812,
"learning_rate": 1.6455821029535006e-06,
"loss": 0.6039252281188965,
"step": 914
},
{
"epoch": 0.9881337648327939,
"grad_norm": 2.215057849884033,
"learning_rate": 1.6438776545596032e-06,
"loss": 0.6023073196411133,
"step": 916
},
{
"epoch": 0.9902912621359223,
"grad_norm": 5.074563980102539,
"learning_rate": 1.6421701277735377e-06,
"loss": 0.6670839190483093,
"step": 918
},
{
"epoch": 0.9924487594390508,
"grad_norm": 1.9246139526367188,
"learning_rate": 1.6404595322600454e-06,
"loss": 0.45060187578201294,
"step": 920
},
{
"epoch": 0.9946062567421791,
"grad_norm": 2.673752784729004,
"learning_rate": 1.638745877701238e-06,
"loss": 0.5095839500427246,
"step": 922
},
{
"epoch": 0.9967637540453075,
"grad_norm": 1.1209965944290161,
"learning_rate": 1.6370291737965403e-06,
"loss": 0.6856327652931213,
"step": 924
},
{
"epoch": 0.9989212513484358,
"grad_norm": 1.4867414236068726,
"learning_rate": 1.6353094302626375e-06,
"loss": 0.7345451712608337,
"step": 926
},
{
"epoch": 1.0010787486515642,
"grad_norm": 1.4351245164871216,
"learning_rate": 1.6335866568334196e-06,
"loss": 0.4384617805480957,
"step": 928
},
{
"epoch": 1.0032362459546926,
"grad_norm": 1.608482837677002,
"learning_rate": 1.6318608632599252e-06,
"loss": 0.5233771800994873,
"step": 930
},
{
"epoch": 1.0053937432578208,
"grad_norm": 1.6175332069396973,
"learning_rate": 1.6301320593102877e-06,
"loss": 0.5526682734489441,
"step": 932
},
{
"epoch": 1.0075512405609492,
"grad_norm": 1.7772831916809082,
"learning_rate": 1.6284002547696794e-06,
"loss": 0.5304218530654907,
"step": 934
},
{
"epoch": 1.0097087378640777,
"grad_norm": 1.2558060884475708,
"learning_rate": 1.626665459440256e-06,
"loss": 0.3196244239807129,
"step": 936
},
{
"epoch": 1.011866235167206,
"grad_norm": 4.1625471115112305,
"learning_rate": 1.6249276831411015e-06,
"loss": 0.49367865920066833,
"step": 938
},
{
"epoch": 1.0140237324703345,
"grad_norm": 2.80029296875,
"learning_rate": 1.6231869357081726e-06,
"loss": 0.5806005597114563,
"step": 940
},
{
"epoch": 1.0161812297734627,
"grad_norm": 2.1595256328582764,
"learning_rate": 1.6214432269942426e-06,
"loss": 0.558415412902832,
"step": 942
},
{
"epoch": 1.0183387270765911,
"grad_norm": 4.51309061050415,
"learning_rate": 1.6196965668688455e-06,
"loss": 0.4171544909477234,
"step": 944
},
{
"epoch": 1.0204962243797195,
"grad_norm": 9.326614379882812,
"learning_rate": 1.6179469652182215e-06,
"loss": 0.49132904410362244,
"step": 946
},
{
"epoch": 1.022653721682848,
"grad_norm": 2.721449613571167,
"learning_rate": 1.6161944319452599e-06,
"loss": 0.526667058467865,
"step": 948
},
{
"epoch": 1.0248112189859762,
"grad_norm": 8.595200538635254,
"learning_rate": 1.6144389769694418e-06,
"loss": 0.519080400466919,
"step": 950
},
{
"epoch": 1.0269687162891046,
"grad_norm": 1.9319312572479248,
"learning_rate": 1.6126806102267871e-06,
"loss": 0.4982292950153351,
"step": 952
},
{
"epoch": 1.029126213592233,
"grad_norm": 2.2782833576202393,
"learning_rate": 1.6109193416697962e-06,
"loss": 0.47339990735054016,
"step": 954
},
{
"epoch": 1.0312837108953614,
"grad_norm": 1.7561050653457642,
"learning_rate": 1.609155181267393e-06,
"loss": 0.4229566156864166,
"step": 956
},
{
"epoch": 1.0334412081984898,
"grad_norm": 2.3419620990753174,
"learning_rate": 1.6073881390048708e-06,
"loss": 0.5675852298736572,
"step": 958
},
{
"epoch": 1.035598705501618,
"grad_norm": 4.762004852294922,
"learning_rate": 1.6056182248838333e-06,
"loss": 0.47640660405158997,
"step": 960
},
{
"epoch": 1.0377562028047465,
"grad_norm": 1.654563307762146,
"learning_rate": 1.6038454489221401e-06,
"loss": 0.39150771498680115,
"step": 962
},
{
"epoch": 1.0399137001078749,
"grad_norm": 6.535782337188721,
"learning_rate": 1.6020698211538485e-06,
"loss": 0.43942204117774963,
"step": 964
},
{
"epoch": 1.0420711974110033,
"grad_norm": 1.442032814025879,
"learning_rate": 1.6002913516291575e-06,
"loss": 0.3959490954875946,
"step": 966
},
{
"epoch": 1.0442286947141317,
"grad_norm": 2.8204493522644043,
"learning_rate": 1.5985100504143508e-06,
"loss": 0.46986186504364014,
"step": 968
},
{
"epoch": 1.04638619201726,
"grad_norm": 5.788197994232178,
"learning_rate": 1.596725927591739e-06,
"loss": 0.5587306022644043,
"step": 970
},
{
"epoch": 1.0485436893203883,
"grad_norm": 3.22556209564209,
"learning_rate": 1.594938993259604e-06,
"loss": 0.42848098278045654,
"step": 972
},
{
"epoch": 1.0507011866235167,
"grad_norm": 5.465310573577881,
"learning_rate": 1.5931492575321405e-06,
"loss": 0.3230629861354828,
"step": 974
},
{
"epoch": 1.0528586839266452,
"grad_norm": 2.285598039627075,
"learning_rate": 1.5913567305394004e-06,
"loss": 0.4129447937011719,
"step": 976
},
{
"epoch": 1.0550161812297734,
"grad_norm": 2.510387659072876,
"learning_rate": 1.5895614224272329e-06,
"loss": 0.5222740173339844,
"step": 978
},
{
"epoch": 1.0571736785329018,
"grad_norm": 3.7488322257995605,
"learning_rate": 1.5877633433572293e-06,
"loss": 0.47047188878059387,
"step": 980
},
{
"epoch": 1.0593311758360302,
"grad_norm": 1.3235845565795898,
"learning_rate": 1.5859625035066652e-06,
"loss": 0.4286286234855652,
"step": 982
},
{
"epoch": 1.0614886731391586,
"grad_norm": 1.2796275615692139,
"learning_rate": 1.5841589130684417e-06,
"loss": 0.411946564912796,
"step": 984
},
{
"epoch": 1.063646170442287,
"grad_norm": 2.5920588970184326,
"learning_rate": 1.5823525822510282e-06,
"loss": 0.4910277724266052,
"step": 986
},
{
"epoch": 1.0658036677454152,
"grad_norm": 13.590333938598633,
"learning_rate": 1.5805435212784066e-06,
"loss": 0.381788045167923,
"step": 988
},
{
"epoch": 1.0679611650485437,
"grad_norm": 1.8935883045196533,
"learning_rate": 1.5787317403900095e-06,
"loss": 0.4319833517074585,
"step": 990
},
{
"epoch": 1.070118662351672,
"grad_norm": 1.8740428686141968,
"learning_rate": 1.5769172498406657e-06,
"loss": 0.5537865161895752,
"step": 992
},
{
"epoch": 1.0722761596548005,
"grad_norm": 2.8530309200286865,
"learning_rate": 1.5751000599005411e-06,
"loss": 0.45889872312545776,
"step": 994
},
{
"epoch": 1.074433656957929,
"grad_norm": 3.0372843742370605,
"learning_rate": 1.573280180855079e-06,
"loss": 0.4843668043613434,
"step": 996
},
{
"epoch": 1.0765911542610571,
"grad_norm": 1.9461435079574585,
"learning_rate": 1.571457623004945e-06,
"loss": 0.3833789527416229,
"step": 998
},
{
"epoch": 1.0787486515641855,
"grad_norm": 4.167815208435059,
"learning_rate": 1.5696323966659659e-06,
"loss": 0.7622794508934021,
"step": 1000
},
{
"epoch": 1.080906148867314,
"grad_norm": 2.6408567428588867,
"learning_rate": 1.5678045121690723e-06,
"loss": 0.38144806027412415,
"step": 1002
},
{
"epoch": 1.0830636461704424,
"grad_norm": 1.6359201669692993,
"learning_rate": 1.5659739798602412e-06,
"loss": 0.5962096452713013,
"step": 1004
},
{
"epoch": 1.0852211434735706,
"grad_norm": 1.9073861837387085,
"learning_rate": 1.5641408101004348e-06,
"loss": 0.5042172074317932,
"step": 1006
},
{
"epoch": 1.087378640776699,
"grad_norm": 1.1828426122665405,
"learning_rate": 1.5623050132655452e-06,
"loss": 0.4065170884132385,
"step": 1008
},
{
"epoch": 1.0895361380798274,
"grad_norm": 1.109727144241333,
"learning_rate": 1.5604665997463326e-06,
"loss": 0.3995954990386963,
"step": 1010
},
{
"epoch": 1.0916936353829558,
"grad_norm": 2.5301997661590576,
"learning_rate": 1.5586255799483685e-06,
"loss": 0.4737590253353119,
"step": 1012
},
{
"epoch": 1.0938511326860842,
"grad_norm": 3.5179555416107178,
"learning_rate": 1.5567819642919768e-06,
"loss": 0.3755728006362915,
"step": 1014
},
{
"epoch": 1.0960086299892124,
"grad_norm": 2.1552042961120605,
"learning_rate": 1.5549357632121722e-06,
"loss": 0.5351279973983765,
"step": 1016
},
{
"epoch": 1.0981661272923409,
"grad_norm": 1.9615085124969482,
"learning_rate": 1.5530869871586058e-06,
"loss": 0.480570912361145,
"step": 1018
},
{
"epoch": 1.1003236245954693,
"grad_norm": 5.5772552490234375,
"learning_rate": 1.5512356465955008e-06,
"loss": 0.4701279103755951,
"step": 1020
},
{
"epoch": 1.1024811218985977,
"grad_norm": 2.0782828330993652,
"learning_rate": 1.5493817520015969e-06,
"loss": 0.6023370027542114,
"step": 1022
},
{
"epoch": 1.104638619201726,
"grad_norm": 4.706164360046387,
"learning_rate": 1.5475253138700899e-06,
"loss": 0.4403872489929199,
"step": 1024
},
{
"epoch": 1.1067961165048543,
"grad_norm": 2.136815309524536,
"learning_rate": 1.5456663427085716e-06,
"loss": 0.49264582991600037,
"step": 1026
},
{
"epoch": 1.1089536138079827,
"grad_norm": 2.051373243331909,
"learning_rate": 1.543804849038972e-06,
"loss": 0.4840565621852875,
"step": 1028
},
{
"epoch": 1.1111111111111112,
"grad_norm": 2.1924808025360107,
"learning_rate": 1.5419408433974974e-06,
"loss": 0.49226483702659607,
"step": 1030
},
{
"epoch": 1.1132686084142396,
"grad_norm": 3.719738245010376,
"learning_rate": 1.5400743363345733e-06,
"loss": 0.429510235786438,
"step": 1032
},
{
"epoch": 1.1154261057173678,
"grad_norm": 1.6024198532104492,
"learning_rate": 1.5382053384147828e-06,
"loss": 0.5755860805511475,
"step": 1034
},
{
"epoch": 1.1175836030204962,
"grad_norm": 5.685046672821045,
"learning_rate": 1.5363338602168072e-06,
"loss": 0.40157079696655273,
"step": 1036
},
{
"epoch": 1.1197411003236246,
"grad_norm": 1.610744833946228,
"learning_rate": 1.5344599123333671e-06,
"loss": 0.4434182643890381,
"step": 1038
},
{
"epoch": 1.121898597626753,
"grad_norm": 23.230365753173828,
"learning_rate": 1.532583505371161e-06,
"loss": 0.4990198314189911,
"step": 1040
},
{
"epoch": 1.1240560949298812,
"grad_norm": 1.455960988998413,
"learning_rate": 1.5307046499508066e-06,
"loss": 0.4062468409538269,
"step": 1042
},
{
"epoch": 1.1262135922330097,
"grad_norm": 1.460098385810852,
"learning_rate": 1.5288233567067794e-06,
"loss": 0.45499229431152344,
"step": 1044
},
{
"epoch": 1.128371089536138,
"grad_norm": 1.3446215391159058,
"learning_rate": 1.5269396362873542e-06,
"loss": 0.4300175905227661,
"step": 1046
},
{
"epoch": 1.1305285868392665,
"grad_norm": 1.6477187871932983,
"learning_rate": 1.5250534993545426e-06,
"loss": 0.4830603301525116,
"step": 1048
},
{
"epoch": 1.132686084142395,
"grad_norm": 2.070373296737671,
"learning_rate": 1.523164956584035e-06,
"loss": 0.47534123063087463,
"step": 1050
},
{
"epoch": 1.134843581445523,
"grad_norm": 2.0876166820526123,
"learning_rate": 1.5212740186651378e-06,
"loss": 0.4968222975730896,
"step": 1052
},
{
"epoch": 1.1370010787486515,
"grad_norm": 1.7046785354614258,
"learning_rate": 1.5193806963007156e-06,
"loss": 0.4516274034976959,
"step": 1054
},
{
"epoch": 1.13915857605178,
"grad_norm": 6.408827781677246,
"learning_rate": 1.517485000207128e-06,
"loss": 0.45875146985054016,
"step": 1056
},
{
"epoch": 1.1413160733549084,
"grad_norm": 1.6034789085388184,
"learning_rate": 1.5155869411141704e-06,
"loss": 0.5700262188911438,
"step": 1058
},
{
"epoch": 1.1434735706580366,
"grad_norm": 9.753545761108398,
"learning_rate": 1.5136865297650134e-06,
"loss": 0.3870803117752075,
"step": 1060
},
{
"epoch": 1.145631067961165,
"grad_norm": 2.6454174518585205,
"learning_rate": 1.511783776916141e-06,
"loss": 0.1962374895811081,
"step": 1062
},
{
"epoch": 1.1477885652642934,
"grad_norm": 9.807194709777832,
"learning_rate": 1.5098786933372907e-06,
"loss": 0.3792603611946106,
"step": 1064
},
{
"epoch": 1.1499460625674218,
"grad_norm": 1.371470332145691,
"learning_rate": 1.5079712898113916e-06,
"loss": 0.4742359519004822,
"step": 1066
},
{
"epoch": 1.1521035598705502,
"grad_norm": 9.515076637268066,
"learning_rate": 1.5060615771345045e-06,
"loss": 0.49537792801856995,
"step": 1068
},
{
"epoch": 1.1542610571736784,
"grad_norm": 3.214311361312866,
"learning_rate": 1.50414956611576e-06,
"loss": 0.5695366859436035,
"step": 1070
},
{
"epoch": 1.1564185544768069,
"grad_norm": 3.578993797302246,
"learning_rate": 1.5022352675772967e-06,
"loss": 0.4019346535205841,
"step": 1072
},
{
"epoch": 1.1585760517799353,
"grad_norm": 1.514540195465088,
"learning_rate": 1.5003186923542022e-06,
"loss": 0.4417833089828491,
"step": 1074
},
{
"epoch": 1.1607335490830637,
"grad_norm": 1.5279725790023804,
"learning_rate": 1.4983998512944497e-06,
"loss": 0.40684929490089417,
"step": 1076
},
{
"epoch": 1.162891046386192,
"grad_norm": 2.6913864612579346,
"learning_rate": 1.4964787552588364e-06,
"loss": 0.6169437766075134,
"step": 1078
},
{
"epoch": 1.1650485436893203,
"grad_norm": 6.149393558502197,
"learning_rate": 1.4945554151209241e-06,
"loss": 0.4913300573825836,
"step": 1080
},
{
"epoch": 1.1672060409924487,
"grad_norm": 3.6629035472869873,
"learning_rate": 1.4926298417669757e-06,
"loss": 0.4479219615459442,
"step": 1082
},
{
"epoch": 1.1693635382955772,
"grad_norm": 2.302075147628784,
"learning_rate": 1.4907020460958943e-06,
"loss": 0.4335775077342987,
"step": 1084
},
{
"epoch": 1.1715210355987056,
"grad_norm": 1.0914833545684814,
"learning_rate": 1.488772039019162e-06,
"loss": 0.466959148645401,
"step": 1086
},
{
"epoch": 1.173678532901834,
"grad_norm": 5.46653938293457,
"learning_rate": 1.4868398314607765e-06,
"loss": 0.6127966046333313,
"step": 1088
},
{
"epoch": 1.1758360302049622,
"grad_norm": 1.7374179363250732,
"learning_rate": 1.484905434357192e-06,
"loss": 0.5522704124450684,
"step": 1090
},
{
"epoch": 1.1779935275080906,
"grad_norm": 1.311828374862671,
"learning_rate": 1.482968858657255e-06,
"loss": 0.4033716320991516,
"step": 1092
},
{
"epoch": 1.180151024811219,
"grad_norm": 1.440038800239563,
"learning_rate": 1.481030115322142e-06,
"loss": 0.4107467234134674,
"step": 1094
},
{
"epoch": 1.1823085221143474,
"grad_norm": 17.832111358642578,
"learning_rate": 1.4790892153253004e-06,
"loss": 0.26430749893188477,
"step": 1096
},
{
"epoch": 1.1844660194174756,
"grad_norm": 21.0089054107666,
"learning_rate": 1.4771461696523828e-06,
"loss": 0.2329411655664444,
"step": 1098
},
{
"epoch": 1.186623516720604,
"grad_norm": 3.482215166091919,
"learning_rate": 1.4752009893011877e-06,
"loss": 0.33426716923713684,
"step": 1100
},
{
"epoch": 1.1887810140237325,
"grad_norm": 1.4247711896896362,
"learning_rate": 1.4732536852815948e-06,
"loss": 0.3406693637371063,
"step": 1102
},
{
"epoch": 1.190938511326861,
"grad_norm": 2.5058937072753906,
"learning_rate": 1.4713042686155054e-06,
"loss": 0.4682016670703888,
"step": 1104
},
{
"epoch": 1.1930960086299893,
"grad_norm": 3.2917213439941406,
"learning_rate": 1.469352750336778e-06,
"loss": 0.5560429096221924,
"step": 1106
},
{
"epoch": 1.1952535059331175,
"grad_norm": 2.8005712032318115,
"learning_rate": 1.4673991414911653e-06,
"loss": 0.49286743998527527,
"step": 1108
},
{
"epoch": 1.197411003236246,
"grad_norm": 1.9056379795074463,
"learning_rate": 1.465443453136255e-06,
"loss": 0.5415875911712646,
"step": 1110
},
{
"epoch": 1.1995685005393744,
"grad_norm": 3.3546078205108643,
"learning_rate": 1.4634856963414022e-06,
"loss": 0.5321105122566223,
"step": 1112
},
{
"epoch": 1.2017259978425028,
"grad_norm": 1.4719895124435425,
"learning_rate": 1.4615258821876726e-06,
"loss": 0.4267783761024475,
"step": 1114
},
{
"epoch": 1.203883495145631,
"grad_norm": 1.423250436782837,
"learning_rate": 1.459564021767774e-06,
"loss": 0.498091459274292,
"step": 1116
},
{
"epoch": 1.2060409924487594,
"grad_norm": 2.6084094047546387,
"learning_rate": 1.4576001261859981e-06,
"loss": 0.4652736186981201,
"step": 1118
},
{
"epoch": 1.2081984897518878,
"grad_norm": 1.2524727582931519,
"learning_rate": 1.4556342065581548e-06,
"loss": 0.5334936380386353,
"step": 1120
},
{
"epoch": 1.2103559870550162,
"grad_norm": 4.288187026977539,
"learning_rate": 1.453666274011511e-06,
"loss": 0.6997748017311096,
"step": 1122
},
{
"epoch": 1.2125134843581447,
"grad_norm": 2.6082146167755127,
"learning_rate": 1.4516963396847255e-06,
"loss": 0.6567426323890686,
"step": 1124
},
{
"epoch": 1.2146709816612729,
"grad_norm": 1.885820746421814,
"learning_rate": 1.4497244147277895e-06,
"loss": 0.41897153854370117,
"step": 1126
},
{
"epoch": 1.2168284789644013,
"grad_norm": 3.6321957111358643,
"learning_rate": 1.4477505103019587e-06,
"loss": 0.4789751172065735,
"step": 1128
},
{
"epoch": 1.2189859762675297,
"grad_norm": 3.317688226699829,
"learning_rate": 1.4457746375796956e-06,
"loss": 0.551139235496521,
"step": 1130
},
{
"epoch": 1.2211434735706581,
"grad_norm": 1.2082242965698242,
"learning_rate": 1.4437968077446013e-06,
"loss": 0.3944661617279053,
"step": 1132
},
{
"epoch": 1.2233009708737863,
"grad_norm": 1.8302658796310425,
"learning_rate": 1.4418170319913548e-06,
"loss": 0.23596011102199554,
"step": 1134
},
{
"epoch": 1.2254584681769147,
"grad_norm": 3.345332622528076,
"learning_rate": 1.43983532152565e-06,
"loss": 0.20758569240570068,
"step": 1136
},
{
"epoch": 1.2276159654800431,
"grad_norm": 2.142779588699341,
"learning_rate": 1.43785168756413e-06,
"loss": 0.4067525267601013,
"step": 1138
},
{
"epoch": 1.2297734627831716,
"grad_norm": 14.230850219726562,
"learning_rate": 1.4358661413343269e-06,
"loss": 0.5197821855545044,
"step": 1140
},
{
"epoch": 1.2319309600863,
"grad_norm": 1.4702306985855103,
"learning_rate": 1.4338786940745943e-06,
"loss": 0.6153298020362854,
"step": 1142
},
{
"epoch": 1.2340884573894282,
"grad_norm": 3.1709959506988525,
"learning_rate": 1.4318893570340476e-06,
"loss": 0.47198399901390076,
"step": 1144
},
{
"epoch": 1.2362459546925566,
"grad_norm": 2.0311388969421387,
"learning_rate": 1.4298981414724972e-06,
"loss": 0.4431988596916199,
"step": 1146
},
{
"epoch": 1.238403451995685,
"grad_norm": 2.6444904804229736,
"learning_rate": 1.4279050586603865e-06,
"loss": 0.49952733516693115,
"step": 1148
},
{
"epoch": 1.2405609492988134,
"grad_norm": 2.9312846660614014,
"learning_rate": 1.4259101198787284e-06,
"loss": 0.40768349170684814,
"step": 1150
},
{
"epoch": 1.2427184466019416,
"grad_norm": 1.304535150527954,
"learning_rate": 1.4239133364190402e-06,
"loss": 0.32800549268722534,
"step": 1152
},
{
"epoch": 1.24487594390507,
"grad_norm": 6.382114887237549,
"learning_rate": 1.4219147195832796e-06,
"loss": 0.5660591125488281,
"step": 1154
},
{
"epoch": 1.2470334412081985,
"grad_norm": 1.935137152671814,
"learning_rate": 1.4199142806837825e-06,
"loss": 0.46538597345352173,
"step": 1156
},
{
"epoch": 1.249190938511327,
"grad_norm": 1.4178097248077393,
"learning_rate": 1.4179120310431967e-06,
"loss": 0.3020792007446289,
"step": 1158
},
{
"epoch": 1.2513484358144553,
"grad_norm": 6.318742752075195,
"learning_rate": 1.41590798199442e-06,
"loss": 0.5570347309112549,
"step": 1160
},
{
"epoch": 1.2535059331175837,
"grad_norm": 1.248417615890503,
"learning_rate": 1.4139021448805344e-06,
"loss": 0.3992771506309509,
"step": 1162
},
{
"epoch": 1.255663430420712,
"grad_norm": 5.451845645904541,
"learning_rate": 1.4118945310547424e-06,
"loss": 0.5283824801445007,
"step": 1164
},
{
"epoch": 1.2578209277238404,
"grad_norm": 2.265537738800049,
"learning_rate": 1.4098851518803032e-06,
"loss": 0.41607847809791565,
"step": 1166
},
{
"epoch": 1.2599784250269688,
"grad_norm": 0.5451850295066833,
"learning_rate": 1.4078740187304678e-06,
"loss": 0.44866782426834106,
"step": 1168
},
{
"epoch": 1.262135922330097,
"grad_norm": 6.6960835456848145,
"learning_rate": 1.4058611429884153e-06,
"loss": 0.6595394015312195,
"step": 1170
},
{
"epoch": 1.2642934196332254,
"grad_norm": 6.947851657867432,
"learning_rate": 1.4038465360471872e-06,
"loss": 0.6133137345314026,
"step": 1172
},
{
"epoch": 1.2664509169363538,
"grad_norm": 2.5718576908111572,
"learning_rate": 1.401830209309624e-06,
"loss": 0.4383125901222229,
"step": 1174
},
{
"epoch": 1.2686084142394822,
"grad_norm": 2.443553924560547,
"learning_rate": 1.3998121741883012e-06,
"loss": 0.38315558433532715,
"step": 1176
},
{
"epoch": 1.2707659115426106,
"grad_norm": 0.8398682475090027,
"learning_rate": 1.3977924421054623e-06,
"loss": 0.22079361975193024,
"step": 1178
},
{
"epoch": 1.272923408845739,
"grad_norm": 3.0209848880767822,
"learning_rate": 1.3957710244929575e-06,
"loss": 0.4939245581626892,
"step": 1180
},
{
"epoch": 1.2750809061488673,
"grad_norm": 4.289799213409424,
"learning_rate": 1.3937479327921762e-06,
"loss": 0.42832162976264954,
"step": 1182
},
{
"epoch": 1.2772384034519957,
"grad_norm": 2.087005376815796,
"learning_rate": 1.3917231784539831e-06,
"loss": 0.5092071294784546,
"step": 1184
},
{
"epoch": 1.279395900755124,
"grad_norm": 1.6985106468200684,
"learning_rate": 1.3896967729386545e-06,
"loss": 0.6054165363311768,
"step": 1186
},
{
"epoch": 1.2815533980582523,
"grad_norm": 3.5646963119506836,
"learning_rate": 1.3876687277158117e-06,
"loss": 0.47859057784080505,
"step": 1188
},
{
"epoch": 1.2837108953613807,
"grad_norm": 3.154890537261963,
"learning_rate": 1.385639054264357e-06,
"loss": 0.43968018889427185,
"step": 1190
},
{
"epoch": 1.2858683926645091,
"grad_norm": 6.229619026184082,
"learning_rate": 1.383607764072409e-06,
"loss": 0.5543320775032043,
"step": 1192
},
{
"epoch": 1.2880258899676376,
"grad_norm": 12.460729598999023,
"learning_rate": 1.3815748686372368e-06,
"loss": 0.4493723511695862,
"step": 1194
},
{
"epoch": 1.290183387270766,
"grad_norm": 1.6863099336624146,
"learning_rate": 1.3795403794651955e-06,
"loss": 0.3126695156097412,
"step": 1196
},
{
"epoch": 1.2923408845738944,
"grad_norm": 3.3788959980010986,
"learning_rate": 1.3775043080716608e-06,
"loss": 0.46441030502319336,
"step": 1198
},
{
"epoch": 1.2944983818770226,
"grad_norm": 1.3057730197906494,
"learning_rate": 1.3754666659809636e-06,
"loss": 0.4863712191581726,
"step": 1200
},
{
"epoch": 1.296655879180151,
"grad_norm": 1.384608507156372,
"learning_rate": 1.3734274647263258e-06,
"loss": 0.41433578729629517,
"step": 1202
},
{
"epoch": 1.2988133764832794,
"grad_norm": 0.7437410950660706,
"learning_rate": 1.3713867158497935e-06,
"loss": 0.3361971378326416,
"step": 1204
},
{
"epoch": 1.3009708737864076,
"grad_norm": 5.0748090744018555,
"learning_rate": 1.369344430902173e-06,
"loss": 0.582435667514801,
"step": 1206
},
{
"epoch": 1.303128371089536,
"grad_norm": 1.445181131362915,
"learning_rate": 1.3673006214429657e-06,
"loss": 0.49374300241470337,
"step": 1208
},
{
"epoch": 1.3052858683926645,
"grad_norm": 2.7276389598846436,
"learning_rate": 1.3652552990402993e-06,
"loss": 0.49756351113319397,
"step": 1210
},
{
"epoch": 1.307443365695793,
"grad_norm": 3.362050771713257,
"learning_rate": 1.3632084752708672e-06,
"loss": 0.4800053536891937,
"step": 1212
},
{
"epoch": 1.3096008629989213,
"grad_norm": 1.3913723230361938,
"learning_rate": 1.36116016171986e-06,
"loss": 0.569862961769104,
"step": 1214
},
{
"epoch": 1.3117583603020497,
"grad_norm": 0.8088376522064209,
"learning_rate": 1.3591103699809009e-06,
"loss": 0.43602418899536133,
"step": 1216
},
{
"epoch": 1.313915857605178,
"grad_norm": 2.7153704166412354,
"learning_rate": 1.3570591116559786e-06,
"loss": 0.627713680267334,
"step": 1218
},
{
"epoch": 1.3160733549083063,
"grad_norm": 2.235117197036743,
"learning_rate": 1.3550063983553842e-06,
"loss": 0.20072109997272491,
"step": 1220
},
{
"epoch": 1.3182308522114348,
"grad_norm": 2.215144157409668,
"learning_rate": 1.352952241697643e-06,
"loss": 0.45614534616470337,
"step": 1222
},
{
"epoch": 1.3203883495145632,
"grad_norm": 1.2694110870361328,
"learning_rate": 1.3508966533094507e-06,
"loss": 0.4190627932548523,
"step": 1224
},
{
"epoch": 1.3225458468176914,
"grad_norm": 1.3221111297607422,
"learning_rate": 1.3488396448256063e-06,
"loss": 0.41167372465133667,
"step": 1226
},
{
"epoch": 1.3247033441208198,
"grad_norm": 0.3495451509952545,
"learning_rate": 1.3467812278889466e-06,
"loss": 0.2586868107318878,
"step": 1228
},
{
"epoch": 1.3268608414239482,
"grad_norm": 3.7483558654785156,
"learning_rate": 1.3447214141502801e-06,
"loss": 0.42079082131385803,
"step": 1230
},
{
"epoch": 1.3290183387270766,
"grad_norm": 1.2944005727767944,
"learning_rate": 1.3426602152683221e-06,
"loss": 0.4828168451786041,
"step": 1232
},
{
"epoch": 1.331175836030205,
"grad_norm": 2.611660957336426,
"learning_rate": 1.3405976429096268e-06,
"loss": 0.5353527665138245,
"step": 1234
},
{
"epoch": 1.3333333333333333,
"grad_norm": 3.6428263187408447,
"learning_rate": 1.3385337087485237e-06,
"loss": 0.28263047337532043,
"step": 1236
},
{
"epoch": 1.3354908306364617,
"grad_norm": 1.8777357339859009,
"learning_rate": 1.3364684244670498e-06,
"loss": 0.47503718733787537,
"step": 1238
},
{
"epoch": 1.33764832793959,
"grad_norm": 1.553531527519226,
"learning_rate": 1.334401801754883e-06,
"loss": 0.4773551821708679,
"step": 1240
},
{
"epoch": 1.3398058252427185,
"grad_norm": 26.932111740112305,
"learning_rate": 1.3323338523092775e-06,
"loss": 0.5582832098007202,
"step": 1242
},
{
"epoch": 1.3419633225458467,
"grad_norm": 5.682314395904541,
"learning_rate": 1.3302645878349972e-06,
"loss": 0.3482803702354431,
"step": 1244
},
{
"epoch": 1.3441208198489751,
"grad_norm": 1.8704055547714233,
"learning_rate": 1.3281940200442492e-06,
"loss": 0.5859532952308655,
"step": 1246
},
{
"epoch": 1.3462783171521036,
"grad_norm": 2.0849342346191406,
"learning_rate": 1.3261221606566161e-06,
"loss": 0.571201503276825,
"step": 1248
},
{
"epoch": 1.348435814455232,
"grad_norm": 1.3928718566894531,
"learning_rate": 1.324049021398993e-06,
"loss": 0.3548327088356018,
"step": 1250
},
{
"epoch": 1.3505933117583604,
"grad_norm": 1.392311930656433,
"learning_rate": 1.3219746140055185e-06,
"loss": 0.5696713328361511,
"step": 1252
},
{
"epoch": 1.3527508090614886,
"grad_norm": 1.4951963424682617,
"learning_rate": 1.3198989502175077e-06,
"loss": 0.34389352798461914,
"step": 1254
},
{
"epoch": 1.354908306364617,
"grad_norm": 2.442704916000366,
"learning_rate": 1.3178220417833887e-06,
"loss": 0.4191893935203552,
"step": 1256
},
{
"epoch": 1.3570658036677454,
"grad_norm": 2.9892749786376953,
"learning_rate": 1.315743900458634e-06,
"loss": 0.35198745131492615,
"step": 1258
},
{
"epoch": 1.3592233009708738,
"grad_norm": 2.776257038116455,
"learning_rate": 1.313664538005693e-06,
"loss": 0.3809160888195038,
"step": 1260
},
{
"epoch": 1.361380798274002,
"grad_norm": 2.1207423210144043,
"learning_rate": 1.3115839661939288e-06,
"loss": 0.3112916350364685,
"step": 1262
},
{
"epoch": 1.3635382955771305,
"grad_norm": 2.34796404838562,
"learning_rate": 1.3095021967995485e-06,
"loss": 0.3474862575531006,
"step": 1264
},
{
"epoch": 1.3656957928802589,
"grad_norm": 1.681514024734497,
"learning_rate": 1.3074192416055375e-06,
"loss": 0.6013367176055908,
"step": 1266
},
{
"epoch": 1.3678532901833873,
"grad_norm": 1.5219907760620117,
"learning_rate": 1.3053351124015935e-06,
"loss": 0.44022852182388306,
"step": 1268
},
{
"epoch": 1.3700107874865157,
"grad_norm": 10.068926811218262,
"learning_rate": 1.3032498209840583e-06,
"loss": 0.4306741952896118,
"step": 1270
},
{
"epoch": 1.3721682847896441,
"grad_norm": 3.296771287918091,
"learning_rate": 1.3011633791558532e-06,
"loss": 0.5527811050415039,
"step": 1272
},
{
"epoch": 1.3743257820927723,
"grad_norm": 136.3231201171875,
"learning_rate": 1.2990757987264098e-06,
"loss": 0.41877317428588867,
"step": 1274
},
{
"epoch": 1.3764832793959008,
"grad_norm": 2.0969786643981934,
"learning_rate": 1.2969870915116042e-06,
"loss": 0.578849732875824,
"step": 1276
},
{
"epoch": 1.3786407766990292,
"grad_norm": 4.652449131011963,
"learning_rate": 1.2948972693336916e-06,
"loss": 0.33083122968673706,
"step": 1278
},
{
"epoch": 1.3807982740021574,
"grad_norm": 1.5804355144500732,
"learning_rate": 1.292806344021237e-06,
"loss": 0.3789401948451996,
"step": 1280
},
{
"epoch": 1.3829557713052858,
"grad_norm": 1.5308772325515747,
"learning_rate": 1.2907143274090487e-06,
"loss": 0.5875998735427856,
"step": 1282
},
{
"epoch": 1.3851132686084142,
"grad_norm": 1.4146822690963745,
"learning_rate": 1.2886212313381128e-06,
"loss": 0.38486555218696594,
"step": 1284
},
{
"epoch": 1.3872707659115426,
"grad_norm": 4.086416244506836,
"learning_rate": 1.2865270676555249e-06,
"loss": 0.596904456615448,
"step": 1286
},
{
"epoch": 1.389428263214671,
"grad_norm": 3.7820913791656494,
"learning_rate": 1.2844318482144233e-06,
"loss": 0.43893247842788696,
"step": 1288
},
{
"epoch": 1.3915857605177995,
"grad_norm": 2.7569808959960938,
"learning_rate": 1.2823355848739217e-06,
"loss": 0.3261288106441498,
"step": 1290
},
{
"epoch": 1.3937432578209277,
"grad_norm": 10.709510803222656,
"learning_rate": 1.280238289499043e-06,
"loss": 0.5592629909515381,
"step": 1292
},
{
"epoch": 1.395900755124056,
"grad_norm": 3.0105295181274414,
"learning_rate": 1.2781399739606513e-06,
"loss": 0.5706429481506348,
"step": 1294
},
{
"epoch": 1.3980582524271845,
"grad_norm": 0.3642590045928955,
"learning_rate": 1.2760406501353845e-06,
"loss": 0.4913448393344879,
"step": 1296
},
{
"epoch": 1.4002157497303127,
"grad_norm": 1.7649108171463013,
"learning_rate": 1.273940329905588e-06,
"loss": 0.4015069007873535,
"step": 1298
},
{
"epoch": 1.4023732470334411,
"grad_norm": 5.478614807128906,
"learning_rate": 1.2718390251592465e-06,
"loss": 0.3647070527076721,
"step": 1300
},
{
"epoch": 1.4045307443365695,
"grad_norm": 2.2883858680725098,
"learning_rate": 1.2697367477899174e-06,
"loss": 0.5743715763092041,
"step": 1302
},
{
"epoch": 1.406688241639698,
"grad_norm": 1.2849724292755127,
"learning_rate": 1.2676335096966633e-06,
"loss": 0.3841140866279602,
"step": 1304
},
{
"epoch": 1.4088457389428264,
"grad_norm": 1.79099702835083,
"learning_rate": 1.2655293227839841e-06,
"loss": 0.4001426100730896,
"step": 1306
},
{
"epoch": 1.4110032362459548,
"grad_norm": 2.938184976577759,
"learning_rate": 1.2634241989617508e-06,
"loss": 0.5245987176895142,
"step": 1308
},
{
"epoch": 1.413160733549083,
"grad_norm": 1.6925368309020996,
"learning_rate": 1.2613181501451373e-06,
"loss": 0.41294950246810913,
"step": 1310
},
{
"epoch": 1.4153182308522114,
"grad_norm": 1.1948857307434082,
"learning_rate": 1.259211188254552e-06,
"loss": 0.4697638154029846,
"step": 1312
},
{
"epoch": 1.4174757281553398,
"grad_norm": 3.2041354179382324,
"learning_rate": 1.257103325215573e-06,
"loss": 0.47677257657051086,
"step": 1314
},
{
"epoch": 1.419633225458468,
"grad_norm": 4.060916423797607,
"learning_rate": 1.2549945729588771e-06,
"loss": 0.22076305747032166,
"step": 1316
},
{
"epoch": 1.4217907227615965,
"grad_norm": 4.826178073883057,
"learning_rate": 1.2528849434201758e-06,
"loss": 0.4530554711818695,
"step": 1318
},
{
"epoch": 1.4239482200647249,
"grad_norm": 1.5924415588378906,
"learning_rate": 1.2507744485401457e-06,
"loss": 0.4310169517993927,
"step": 1320
},
{
"epoch": 1.4261057173678533,
"grad_norm": 1.6999584436416626,
"learning_rate": 1.2486631002643604e-06,
"loss": 0.32071733474731445,
"step": 1322
},
{
"epoch": 1.4282632146709817,
"grad_norm": 1.5567405223846436,
"learning_rate": 1.2465509105432252e-06,
"loss": 0.2832459509372711,
"step": 1324
},
{
"epoch": 1.4304207119741101,
"grad_norm": 5.614641189575195,
"learning_rate": 1.2444378913319067e-06,
"loss": 0.47128552198410034,
"step": 1326
},
{
"epoch": 1.4325782092772383,
"grad_norm": 1.8778231143951416,
"learning_rate": 1.2423240545902674e-06,
"loss": 0.38101163506507874,
"step": 1328
},
{
"epoch": 1.4347357065803668,
"grad_norm": 3.9056172370910645,
"learning_rate": 1.2402094122827964e-06,
"loss": 0.537193775177002,
"step": 1330
},
{
"epoch": 1.4368932038834952,
"grad_norm": 3.836848735809326,
"learning_rate": 1.2380939763785433e-06,
"loss": 0.4837642014026642,
"step": 1332
},
{
"epoch": 1.4390507011866236,
"grad_norm": 1.4066507816314697,
"learning_rate": 1.2359777588510484e-06,
"loss": 0.5043050646781921,
"step": 1334
},
{
"epoch": 1.4412081984897518,
"grad_norm": 1.7807657718658447,
"learning_rate": 1.233860771678277e-06,
"loss": 0.42978519201278687,
"step": 1336
},
{
"epoch": 1.4433656957928802,
"grad_norm": 2.4499216079711914,
"learning_rate": 1.23174302684255e-06,
"loss": 0.5630735754966736,
"step": 1338
},
{
"epoch": 1.4455231930960086,
"grad_norm": 2.219531297683716,
"learning_rate": 1.2296245363304772e-06,
"loss": 0.6489322185516357,
"step": 1340
},
{
"epoch": 1.447680690399137,
"grad_norm": 3.5208077430725098,
"learning_rate": 1.2275053121328886e-06,
"loss": 0.424197793006897,
"step": 1342
},
{
"epoch": 1.4498381877022655,
"grad_norm": 2.0139458179473877,
"learning_rate": 1.2253853662447673e-06,
"loss": 0.51392662525177,
"step": 1344
},
{
"epoch": 1.4519956850053937,
"grad_norm": 3.8764588832855225,
"learning_rate": 1.223264710665181e-06,
"loss": 0.4180300533771515,
"step": 1346
},
{
"epoch": 1.454153182308522,
"grad_norm": 1.3136292695999146,
"learning_rate": 1.2211433573972145e-06,
"loss": 0.3597021698951721,
"step": 1348
},
{
"epoch": 1.4563106796116505,
"grad_norm": 2.6723670959472656,
"learning_rate": 1.219021318447901e-06,
"loss": 0.4391145408153534,
"step": 1350
},
{
"epoch": 1.458468176914779,
"grad_norm": 2.19071102142334,
"learning_rate": 1.2168986058281552e-06,
"loss": 0.31397783756256104,
"step": 1352
},
{
"epoch": 1.4606256742179071,
"grad_norm": 2.55515718460083,
"learning_rate": 1.2147752315527056e-06,
"loss": 0.49626126885414124,
"step": 1354
},
{
"epoch": 1.4627831715210355,
"grad_norm": 1.1953641176223755,
"learning_rate": 1.2126512076400238e-06,
"loss": 0.36800915002822876,
"step": 1356
},
{
"epoch": 1.464940668824164,
"grad_norm": 1.1821345090866089,
"learning_rate": 1.2105265461122599e-06,
"loss": 0.36970698833465576,
"step": 1358
},
{
"epoch": 1.4670981661272924,
"grad_norm": 1.9997817277908325,
"learning_rate": 1.208401258995173e-06,
"loss": 0.24953503906726837,
"step": 1360
},
{
"epoch": 1.4692556634304208,
"grad_norm": 1.4127711057662964,
"learning_rate": 1.2062753583180617e-06,
"loss": 0.6299887895584106,
"step": 1362
},
{
"epoch": 1.4714131607335492,
"grad_norm": 1.239811897277832,
"learning_rate": 1.2041488561136987e-06,
"loss": 0.2647631764411926,
"step": 1364
},
{
"epoch": 1.4735706580366774,
"grad_norm": 1.8730353116989136,
"learning_rate": 1.2020217644182618e-06,
"loss": 0.47313305735588074,
"step": 1366
},
{
"epoch": 1.4757281553398058,
"grad_norm": 1.2671191692352295,
"learning_rate": 1.1998940952712636e-06,
"loss": 0.4221327602863312,
"step": 1368
},
{
"epoch": 1.4778856526429343,
"grad_norm": 3.7683935165405273,
"learning_rate": 1.1977658607154866e-06,
"loss": 0.3292485773563385,
"step": 1370
},
{
"epoch": 1.4800431499460625,
"grad_norm": 2.1078288555145264,
"learning_rate": 1.1956370727969132e-06,
"loss": 0.4748386740684509,
"step": 1372
},
{
"epoch": 1.4822006472491909,
"grad_norm": 3.0718023777008057,
"learning_rate": 1.1935077435646573e-06,
"loss": 0.41127315163612366,
"step": 1374
},
{
"epoch": 1.4843581445523193,
"grad_norm": 9.03038501739502,
"learning_rate": 1.1913778850708974e-06,
"loss": 0.38048920035362244,
"step": 1376
},
{
"epoch": 1.4865156418554477,
"grad_norm": 2.5646114349365234,
"learning_rate": 1.189247509370807e-06,
"loss": 0.5044585466384888,
"step": 1378
},
{
"epoch": 1.4886731391585761,
"grad_norm": 3.9271023273468018,
"learning_rate": 1.1871166285224885e-06,
"loss": 0.5840790271759033,
"step": 1380
},
{
"epoch": 1.4908306364617046,
"grad_norm": 1.9364007711410522,
"learning_rate": 1.1849852545869013e-06,
"loss": 0.4913451671600342,
"step": 1382
},
{
"epoch": 1.4929881337648327,
"grad_norm": 7.079308986663818,
"learning_rate": 1.182853399627797e-06,
"loss": 0.40108633041381836,
"step": 1384
},
{
"epoch": 1.4951456310679612,
"grad_norm": 1.5303609371185303,
"learning_rate": 1.1807210757116505e-06,
"loss": 0.5875151753425598,
"step": 1386
},
{
"epoch": 1.4973031283710896,
"grad_norm": 1.7939358949661255,
"learning_rate": 1.1785882949075894e-06,
"loss": 0.43406108021736145,
"step": 1388
},
{
"epoch": 1.4994606256742178,
"grad_norm": 1.709847092628479,
"learning_rate": 1.1764550692873282e-06,
"loss": 0.4609090983867645,
"step": 1390
},
{
"epoch": 1.5016181229773462,
"grad_norm": 1.4324554204940796,
"learning_rate": 1.1743214109250992e-06,
"loss": 0.2564505934715271,
"step": 1392
},
{
"epoch": 1.5037756202804746,
"grad_norm": 2.9874749183654785,
"learning_rate": 1.1721873318975835e-06,
"loss": 0.46675199270248413,
"step": 1394
},
{
"epoch": 1.505933117583603,
"grad_norm": 3.750638008117676,
"learning_rate": 1.1700528442838442e-06,
"loss": 0.5055999755859375,
"step": 1396
},
{
"epoch": 1.5080906148867315,
"grad_norm": 2.8105647563934326,
"learning_rate": 1.167917960165256e-06,
"loss": 0.5268608331680298,
"step": 1398
},
{
"epoch": 1.5102481121898599,
"grad_norm": 3.50753116607666,
"learning_rate": 1.1657826916254382e-06,
"loss": 0.5102010369300842,
"step": 1400
},
{
"epoch": 1.512405609492988,
"grad_norm": 10.553208351135254,
"learning_rate": 1.1636470507501863e-06,
"loss": 0.4071239233016968,
"step": 1402
},
{
"epoch": 1.5145631067961165,
"grad_norm": 3.4755797386169434,
"learning_rate": 1.1615110496274028e-06,
"loss": 0.3140917420387268,
"step": 1404
},
{
"epoch": 1.516720604099245,
"grad_norm": 2.3255038261413574,
"learning_rate": 1.1593747003470294e-06,
"loss": 0.49230116605758667,
"step": 1406
},
{
"epoch": 1.5188781014023731,
"grad_norm": 1.2084012031555176,
"learning_rate": 1.1572380150009777e-06,
"loss": 0.39797013998031616,
"step": 1408
},
{
"epoch": 1.5210355987055015,
"grad_norm": 5.355250358581543,
"learning_rate": 1.1551010056830634e-06,
"loss": 0.36559203267097473,
"step": 1410
},
{
"epoch": 1.52319309600863,
"grad_norm": 1.0859466791152954,
"learning_rate": 1.152963684488934e-06,
"loss": 0.20361725986003876,
"step": 1412
},
{
"epoch": 1.5253505933117584,
"grad_norm": 3.301490306854248,
"learning_rate": 1.150826063516003e-06,
"loss": 0.36109161376953125,
"step": 1414
},
{
"epoch": 1.5275080906148868,
"grad_norm": 2.034646511077881,
"learning_rate": 1.1486881548633802e-06,
"loss": 0.4435052275657654,
"step": 1416
},
{
"epoch": 1.5296655879180152,
"grad_norm": 1.7059470415115356,
"learning_rate": 1.1465499706318048e-06,
"loss": 0.4154685437679291,
"step": 1418
},
{
"epoch": 1.5318230852211436,
"grad_norm": 1.5160272121429443,
"learning_rate": 1.1444115229235745e-06,
"loss": 0.37496164441108704,
"step": 1420
},
{
"epoch": 1.5339805825242718,
"grad_norm": 2.6280198097229004,
"learning_rate": 1.1422728238424785e-06,
"loss": 0.48741182684898376,
"step": 1422
},
{
"epoch": 1.5361380798274002,
"grad_norm": 14.306265830993652,
"learning_rate": 1.14013388549373e-06,
"loss": 0.5213165879249573,
"step": 1424
},
{
"epoch": 1.5382955771305284,
"grad_norm": 1.211489200592041,
"learning_rate": 1.1379947199838952e-06,
"loss": 0.345187783241272,
"step": 1426
},
{
"epoch": 1.5404530744336569,
"grad_norm": 3.2337164878845215,
"learning_rate": 1.1358553394208268e-06,
"loss": 0.5196102857589722,
"step": 1428
},
{
"epoch": 1.5426105717367853,
"grad_norm": 1.5404866933822632,
"learning_rate": 1.1337157559135942e-06,
"loss": 0.4148750603199005,
"step": 1430
},
{
"epoch": 1.5447680690399137,
"grad_norm": 1.683718204498291,
"learning_rate": 1.1315759815724152e-06,
"loss": 0.32485026121139526,
"step": 1432
},
{
"epoch": 1.5469255663430421,
"grad_norm": 2.2065541744232178,
"learning_rate": 1.1294360285085888e-06,
"loss": 0.2961767017841339,
"step": 1434
},
{
"epoch": 1.5490830636461705,
"grad_norm": 1.0630570650100708,
"learning_rate": 1.1272959088344253e-06,
"loss": 0.37115591764450073,
"step": 1436
},
{
"epoch": 1.551240560949299,
"grad_norm": 2.599900245666504,
"learning_rate": 1.1251556346631762e-06,
"loss": 0.5358873605728149,
"step": 1438
},
{
"epoch": 1.5533980582524272,
"grad_norm": 1.2480677366256714,
"learning_rate": 1.1230152181089708e-06,
"loss": 0.46197211742401123,
"step": 1440
},
{
"epoch": 1.5555555555555556,
"grad_norm": 2.2794196605682373,
"learning_rate": 1.1208746712867419e-06,
"loss": 0.44740840792655945,
"step": 1442
},
{
"epoch": 1.5577130528586838,
"grad_norm": 1.7489802837371826,
"learning_rate": 1.1187340063121593e-06,
"loss": 0.4339655339717865,
"step": 1444
},
{
"epoch": 1.5598705501618122,
"grad_norm": 3.410910129547119,
"learning_rate": 1.116593235301564e-06,
"loss": 0.3300541639328003,
"step": 1446
},
{
"epoch": 1.5620280474649406,
"grad_norm": 0.5800649523735046,
"learning_rate": 1.1144523703718942e-06,
"loss": 0.5032283663749695,
"step": 1448
},
{
"epoch": 1.564185544768069,
"grad_norm": 1.7073270082473755,
"learning_rate": 1.1123114236406224e-06,
"loss": 0.4437793791294098,
"step": 1450
},
{
"epoch": 1.5663430420711975,
"grad_norm": 1.9129263162612915,
"learning_rate": 1.1101704072256819e-06,
"loss": 0.49655881524086,
"step": 1452
},
{
"epoch": 1.5685005393743259,
"grad_norm": 3.6259055137634277,
"learning_rate": 1.1080293332454016e-06,
"loss": 0.331562340259552,
"step": 1454
},
{
"epoch": 1.5706580366774543,
"grad_norm": 1.8879085779190063,
"learning_rate": 1.1058882138184363e-06,
"loss": 0.5420922040939331,
"step": 1456
},
{
"epoch": 1.5728155339805825,
"grad_norm": 1.6056373119354248,
"learning_rate": 1.103747061063697e-06,
"loss": 0.2305726557970047,
"step": 1458
},
{
"epoch": 1.574973031283711,
"grad_norm": 2.3105075359344482,
"learning_rate": 1.101605887100285e-06,
"loss": 0.4295492470264435,
"step": 1460
},
{
"epoch": 1.577130528586839,
"grad_norm": 3.3066842555999756,
"learning_rate": 1.09946470404742e-06,
"loss": 0.5346636772155762,
"step": 1462
},
{
"epoch": 1.5792880258899675,
"grad_norm": 5.481215476989746,
"learning_rate": 1.097323524024374e-06,
"loss": 0.669352114200592,
"step": 1464
},
{
"epoch": 1.581445523193096,
"grad_norm": 5.5241851806640625,
"learning_rate": 1.095182359150402e-06,
"loss": 0.5989066958427429,
"step": 1466
},
{
"epoch": 1.5836030204962244,
"grad_norm": 1.298604130744934,
"learning_rate": 1.0930412215446723e-06,
"loss": 0.3661651015281677,
"step": 1468
},
{
"epoch": 1.5857605177993528,
"grad_norm": 4.695067405700684,
"learning_rate": 1.0909001233262001e-06,
"loss": 0.449363648891449,
"step": 1470
},
{
"epoch": 1.5879180151024812,
"grad_norm": 2.782097578048706,
"learning_rate": 1.0887590766137766e-06,
"loss": 0.5595487356185913,
"step": 1472
},
{
"epoch": 1.5900755124056096,
"grad_norm": 1.2103036642074585,
"learning_rate": 1.0866180935259022e-06,
"loss": 0.38902321457862854,
"step": 1474
},
{
"epoch": 1.5922330097087378,
"grad_norm": 1.6246592998504639,
"learning_rate": 1.084477186180717e-06,
"loss": 0.5024740099906921,
"step": 1476
},
{
"epoch": 1.5943905070118662,
"grad_norm": 1.3438127040863037,
"learning_rate": 1.0823363666959322e-06,
"loss": 0.47724461555480957,
"step": 1478
},
{
"epoch": 1.5965480043149944,
"grad_norm": 1.5329099893569946,
"learning_rate": 1.0801956471887618e-06,
"loss": 0.43613773584365845,
"step": 1480
},
{
"epoch": 1.5987055016181229,
"grad_norm": 2.6041982173919678,
"learning_rate": 1.078055039775854e-06,
"loss": 0.5445818305015564,
"step": 1482
},
{
"epoch": 1.6008629989212513,
"grad_norm": 3.287353277206421,
"learning_rate": 1.075914556573222e-06,
"loss": 0.35657113790512085,
"step": 1484
},
{
"epoch": 1.6030204962243797,
"grad_norm": 6.16733455657959,
"learning_rate": 1.0737742096961774e-06,
"loss": 0.5397022366523743,
"step": 1486
},
{
"epoch": 1.6051779935275081,
"grad_norm": 1.3404687643051147,
"learning_rate": 1.0716340112592582e-06,
"loss": 0.40695685148239136,
"step": 1488
},
{
"epoch": 1.6073354908306365,
"grad_norm": 4.531323432922363,
"learning_rate": 1.0694939733761635e-06,
"loss": 0.43187639117240906,
"step": 1490
},
{
"epoch": 1.609492988133765,
"grad_norm": 4.229406833648682,
"learning_rate": 1.067354108159684e-06,
"loss": 0.3659261465072632,
"step": 1492
},
{
"epoch": 1.6116504854368932,
"grad_norm": 1.4188188314437866,
"learning_rate": 1.0652144277216315e-06,
"loss": 0.5332222580909729,
"step": 1494
},
{
"epoch": 1.6138079827400216,
"grad_norm": 2.903252363204956,
"learning_rate": 1.063074944172774e-06,
"loss": 0.4275670647621155,
"step": 1496
},
{
"epoch": 1.61596548004315,
"grad_norm": 1.9704622030258179,
"learning_rate": 1.060935669622763e-06,
"loss": 0.5114681720733643,
"step": 1498
},
{
"epoch": 1.6181229773462782,
"grad_norm": 1.4989230632781982,
"learning_rate": 1.0587966161800688e-06,
"loss": 0.4305647909641266,
"step": 1500
},
{
"epoch": 1.6202804746494066,
"grad_norm": 4.043560981750488,
"learning_rate": 1.0566577959519086e-06,
"loss": 0.34898895025253296,
"step": 1502
},
{
"epoch": 1.622437971952535,
"grad_norm": 3.2984836101531982,
"learning_rate": 1.0545192210441814e-06,
"loss": 0.3457680642604828,
"step": 1504
},
{
"epoch": 1.6245954692556634,
"grad_norm": 2.0970866680145264,
"learning_rate": 1.0523809035613964e-06,
"loss": 0.45543625950813293,
"step": 1506
},
{
"epoch": 1.6267529665587919,
"grad_norm": 4.432509422302246,
"learning_rate": 1.0502428556066059e-06,
"loss": 0.33377963304519653,
"step": 1508
},
{
"epoch": 1.6289104638619203,
"grad_norm": 1.2249876260757446,
"learning_rate": 1.0481050892813368e-06,
"loss": 0.3518203794956207,
"step": 1510
},
{
"epoch": 1.6310679611650487,
"grad_norm": 3.6273698806762695,
"learning_rate": 1.0459676166855223e-06,
"loss": 0.47581151127815247,
"step": 1512
},
{
"epoch": 1.633225458468177,
"grad_norm": 1.7668628692626953,
"learning_rate": 1.0438304499174325e-06,
"loss": 0.31876808404922485,
"step": 1514
},
{
"epoch": 1.6353829557713053,
"grad_norm": 4.061316013336182,
"learning_rate": 1.0416936010736064e-06,
"loss": 0.47807684540748596,
"step": 1516
},
{
"epoch": 1.6375404530744335,
"grad_norm": 1.9564175605773926,
"learning_rate": 1.0395570822487845e-06,
"loss": 0.47794413566589355,
"step": 1518
},
{
"epoch": 1.639697950377562,
"grad_norm": 1.4420032501220703,
"learning_rate": 1.0374209055358385e-06,
"loss": 0.6091484427452087,
"step": 1520
},
{
"epoch": 1.6418554476806904,
"grad_norm": 2.4212918281555176,
"learning_rate": 1.0352850830257037e-06,
"loss": 0.3609981834888458,
"step": 1522
},
{
"epoch": 1.6440129449838188,
"grad_norm": 17.685544967651367,
"learning_rate": 1.0331496268073113e-06,
"loss": 0.3519137501716614,
"step": 1524
},
{
"epoch": 1.6461704422869472,
"grad_norm": 2.5702126026153564,
"learning_rate": 1.031014548967518e-06,
"loss": 0.4019058346748352,
"step": 1526
},
{
"epoch": 1.6483279395900756,
"grad_norm": 1.5977301597595215,
"learning_rate": 1.0288798615910409e-06,
"loss": 0.4482097923755646,
"step": 1528
},
{
"epoch": 1.650485436893204,
"grad_norm": 3.8261749744415283,
"learning_rate": 1.0267455767603842e-06,
"loss": 0.5603641867637634,
"step": 1530
},
{
"epoch": 1.6526429341963322,
"grad_norm": 2.4676754474639893,
"learning_rate": 1.0246117065557762e-06,
"loss": 0.6466296315193176,
"step": 1532
},
{
"epoch": 1.6548004314994607,
"grad_norm": 9.348182678222656,
"learning_rate": 1.0224782630550976e-06,
"loss": 0.4512023627758026,
"step": 1534
},
{
"epoch": 1.6569579288025889,
"grad_norm": 1.4680399894714355,
"learning_rate": 1.020345258333813e-06,
"loss": 0.3725220561027527,
"step": 1536
},
{
"epoch": 1.6591154261057173,
"grad_norm": 1.6723597049713135,
"learning_rate": 1.0182127044649052e-06,
"loss": 0.5063510537147522,
"step": 1538
},
{
"epoch": 1.6612729234088457,
"grad_norm": 1.5339092016220093,
"learning_rate": 1.0160806135188028e-06,
"loss": 0.46868813037872314,
"step": 1540
},
{
"epoch": 1.6634304207119741,
"grad_norm": 1.4481370449066162,
"learning_rate": 1.0139489975633166e-06,
"loss": 0.44415712356567383,
"step": 1542
},
{
"epoch": 1.6655879180151025,
"grad_norm": 4.979800701141357,
"learning_rate": 1.0118178686635677e-06,
"loss": 0.3348858952522278,
"step": 1544
},
{
"epoch": 1.667745415318231,
"grad_norm": 2.358186721801758,
"learning_rate": 1.00968723888192e-06,
"loss": 0.42780208587646484,
"step": 1546
},
{
"epoch": 1.6699029126213594,
"grad_norm": 1.201817512512207,
"learning_rate": 1.0075571202779138e-06,
"loss": 0.46995261311531067,
"step": 1548
},
{
"epoch": 1.6720604099244876,
"grad_norm": 3.709390878677368,
"learning_rate": 1.0054275249081947e-06,
"loss": 0.35104840993881226,
"step": 1550
},
{
"epoch": 1.674217907227616,
"grad_norm": 1.4292689561843872,
"learning_rate": 1.0032984648264479e-06,
"loss": 0.4314435124397278,
"step": 1552
},
{
"epoch": 1.6763754045307442,
"grad_norm": 1.3379240036010742,
"learning_rate": 1.0011699520833272e-06,
"loss": 0.4032558798789978,
"step": 1554
},
{
"epoch": 1.6785329018338726,
"grad_norm": 7.688792705535889,
"learning_rate": 9.990419987263904e-07,
"loss": 0.4385361671447754,
"step": 1556
},
{
"epoch": 1.680690399137001,
"grad_norm": 3.1750102043151855,
"learning_rate": 9.969146168000277e-07,
"loss": 0.31719791889190674,
"step": 1558
},
{
"epoch": 1.6828478964401294,
"grad_norm": 1.8544740676879883,
"learning_rate": 9.947878183453955e-07,
"loss": 0.5202147364616394,
"step": 1560
},
{
"epoch": 1.6850053937432579,
"grad_norm": 1.8201504945755005,
"learning_rate": 9.926616154003478e-07,
"loss": 0.34038931131362915,
"step": 1562
},
{
"epoch": 1.6871628910463863,
"grad_norm": 1.8023303747177124,
"learning_rate": 9.905360199993674e-07,
"loss": 0.3473019599914551,
"step": 1564
},
{
"epoch": 1.6893203883495147,
"grad_norm": 2.248263120651245,
"learning_rate": 9.884110441734992e-07,
"loss": 0.49435266852378845,
"step": 1566
},
{
"epoch": 1.691477885652643,
"grad_norm": 2.6698451042175293,
"learning_rate": 9.862866999502805e-07,
"loss": 0.4461665451526642,
"step": 1568
},
{
"epoch": 1.6936353829557713,
"grad_norm": 1.4548275470733643,
"learning_rate": 9.841629993536741e-07,
"loss": 0.5574808120727539,
"step": 1570
},
{
"epoch": 1.6957928802588995,
"grad_norm": 1.4881387948989868,
"learning_rate": 9.820399544039997e-07,
"loss": 0.3747144043445587,
"step": 1572
},
{
"epoch": 1.697950377562028,
"grad_norm": 43.702919006347656,
"learning_rate": 9.799175771178662e-07,
"loss": 0.543049693107605,
"step": 1574
},
{
"epoch": 1.7001078748651564,
"grad_norm": 3.537771463394165,
"learning_rate": 9.777958795081024e-07,
"loss": 0.38331982493400574,
"step": 1576
},
{
"epoch": 1.7022653721682848,
"grad_norm": 1.9265162944793701,
"learning_rate": 9.75674873583692e-07,
"loss": 0.3932670056819916,
"step": 1578
},
{
"epoch": 1.7044228694714132,
"grad_norm": 1.3593825101852417,
"learning_rate": 9.735545713497021e-07,
"loss": 0.4138597249984741,
"step": 1580
},
{
"epoch": 1.7065803667745416,
"grad_norm": 2.415477991104126,
"learning_rate": 9.714349848072175e-07,
"loss": 0.4992269277572632,
"step": 1582
},
{
"epoch": 1.70873786407767,
"grad_norm": 1.0291266441345215,
"learning_rate": 9.693161259532722e-07,
"loss": 0.4245167076587677,
"step": 1584
},
{
"epoch": 1.7108953613807982,
"grad_norm": 1.4725301265716553,
"learning_rate": 9.671980067807806e-07,
"loss": 0.35596776008605957,
"step": 1586
},
{
"epoch": 1.7130528586839266,
"grad_norm": 1.3389267921447754,
"learning_rate": 9.650806392784719e-07,
"loss": 0.3590199947357178,
"step": 1588
},
{
"epoch": 1.715210355987055,
"grad_norm": 1.9211981296539307,
"learning_rate": 9.629640354308188e-07,
"loss": 0.5305579900741577,
"step": 1590
},
{
"epoch": 1.7173678532901833,
"grad_norm": 1.3896666765213013,
"learning_rate": 9.60848207217974e-07,
"loss": 0.3872862458229065,
"step": 1592
},
{
"epoch": 1.7195253505933117,
"grad_norm": 1.2243990898132324,
"learning_rate": 9.587331666156988e-07,
"loss": 0.5288591384887695,
"step": 1594
},
{
"epoch": 1.72168284789644,
"grad_norm": 1.8954887390136719,
"learning_rate": 9.566189255952956e-07,
"loss": 0.43896806240081787,
"step": 1596
},
{
"epoch": 1.7238403451995685,
"grad_norm": 1.1927108764648438,
"learning_rate": 9.545054961235435e-07,
"loss": 0.4235879182815552,
"step": 1598
},
{
"epoch": 1.725997842502697,
"grad_norm": 7.993542194366455,
"learning_rate": 9.523928901626255e-07,
"loss": 0.35887616872787476,
"step": 1600
},
{
"epoch": 1.7281553398058254,
"grad_norm": 2.1313676834106445,
"learning_rate": 9.502811196700656e-07,
"loss": 0.46110397577285767,
"step": 1602
},
{
"epoch": 1.7303128371089536,
"grad_norm": 1.278878092765808,
"learning_rate": 9.481701965986574e-07,
"loss": 0.3147183656692505,
"step": 1604
},
{
"epoch": 1.732470334412082,
"grad_norm": 3.170421838760376,
"learning_rate": 9.460601328963996e-07,
"loss": 0.24724824726581573,
"step": 1606
},
{
"epoch": 1.7346278317152104,
"grad_norm": 1.7401503324508667,
"learning_rate": 9.439509405064254e-07,
"loss": 0.41423508524894714,
"step": 1608
},
{
"epoch": 1.7367853290183386,
"grad_norm": 1.8899052143096924,
"learning_rate": 9.41842631366937e-07,
"loss": 0.5291723608970642,
"step": 1610
},
{
"epoch": 1.738942826321467,
"grad_norm": 2.190075635910034,
"learning_rate": 9.397352174111372e-07,
"loss": 0.49489831924438477,
"step": 1612
},
{
"epoch": 1.7411003236245954,
"grad_norm": 4.175290584564209,
"learning_rate": 9.376287105671621e-07,
"loss": 0.2998746633529663,
"step": 1614
},
{
"epoch": 1.7432578209277239,
"grad_norm": 1.679629921913147,
"learning_rate": 9.355231227580132e-07,
"loss": 0.4566305875778198,
"step": 1616
},
{
"epoch": 1.7454153182308523,
"grad_norm": 1.7757675647735596,
"learning_rate": 9.334184659014901e-07,
"loss": 0.36898234486579895,
"step": 1618
},
{
"epoch": 1.7475728155339807,
"grad_norm": 24.852197647094727,
"learning_rate": 9.313147519101237e-07,
"loss": 0.2811485826969147,
"step": 1620
},
{
"epoch": 1.7497303128371091,
"grad_norm": 1.6118603944778442,
"learning_rate": 9.292119926911078e-07,
"loss": 0.2936355173587799,
"step": 1622
},
{
"epoch": 1.7518878101402373,
"grad_norm": 1.2674829959869385,
"learning_rate": 9.271102001462321e-07,
"loss": 0.3665968179702759,
"step": 1624
},
{
"epoch": 1.7540453074433657,
"grad_norm": 2.609710216522217,
"learning_rate": 9.250093861718151e-07,
"loss": 0.38114845752716064,
"step": 1626
},
{
"epoch": 1.756202804746494,
"grad_norm": 2.0557167530059814,
"learning_rate": 9.229095626586362e-07,
"loss": 0.4779360294342041,
"step": 1628
},
{
"epoch": 1.7583603020496223,
"grad_norm": 2.9698874950408936,
"learning_rate": 9.208107414918691e-07,
"loss": 0.5487996935844421,
"step": 1630
},
{
"epoch": 1.7605177993527508,
"grad_norm": 1.6979955434799194,
"learning_rate": 9.187129345510134e-07,
"loss": 0.5224738121032715,
"step": 1632
},
{
"epoch": 1.7626752966558792,
"grad_norm": 2.131030321121216,
"learning_rate": 9.166161537098287e-07,
"loss": 0.33794957399368286,
"step": 1634
},
{
"epoch": 1.7648327939590076,
"grad_norm": 1.3157271146774292,
"learning_rate": 9.145204108362672e-07,
"loss": 0.49309784173965454,
"step": 1636
},
{
"epoch": 1.766990291262136,
"grad_norm": 1.6136844158172607,
"learning_rate": 9.124257177924049e-07,
"loss": 0.5821846723556519,
"step": 1638
},
{
"epoch": 1.7691477885652644,
"grad_norm": 1.258776068687439,
"learning_rate": 9.10332086434377e-07,
"loss": 0.46728694438934326,
"step": 1640
},
{
"epoch": 1.7713052858683926,
"grad_norm": 1.5475536584854126,
"learning_rate": 9.082395286123081e-07,
"loss": 0.4196864068508148,
"step": 1642
},
{
"epoch": 1.773462783171521,
"grad_norm": 3.12204909324646,
"learning_rate": 9.061480561702482e-07,
"loss": 0.42648231983184814,
"step": 1644
},
{
"epoch": 1.7756202804746493,
"grad_norm": 4.430125713348389,
"learning_rate": 9.040576809461016e-07,
"loss": 0.5809032917022705,
"step": 1646
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.5230090618133545,
"learning_rate": 9.019684147715649e-07,
"loss": 0.4213182330131531,
"step": 1648
},
{
"epoch": 1.779935275080906,
"grad_norm": 2.2308318614959717,
"learning_rate": 8.99880269472056e-07,
"loss": 0.2347421497106552,
"step": 1650
},
{
"epoch": 1.7820927723840345,
"grad_norm": 1.6177752017974854,
"learning_rate": 8.97793256866648e-07,
"loss": 0.4257172644138336,
"step": 1652
},
{
"epoch": 1.784250269687163,
"grad_norm": 2.0257010459899902,
"learning_rate": 8.957073887680046e-07,
"loss": 0.3010298013687134,
"step": 1654
},
{
"epoch": 1.7864077669902914,
"grad_norm": 2.075418472290039,
"learning_rate": 8.936226769823094e-07,
"loss": 0.5388916730880737,
"step": 1656
},
{
"epoch": 1.7885652642934198,
"grad_norm": 1.9110989570617676,
"learning_rate": 8.915391333092028e-07,
"loss": 0.40239423513412476,
"step": 1658
},
{
"epoch": 1.790722761596548,
"grad_norm": 1.4136828184127808,
"learning_rate": 8.894567695417128e-07,
"loss": 0.44491565227508545,
"step": 1660
},
{
"epoch": 1.7928802588996764,
"grad_norm": 1.7407686710357666,
"learning_rate": 8.873755974661894e-07,
"loss": 0.4648374021053314,
"step": 1662
},
{
"epoch": 1.7950377562028046,
"grad_norm": 1.3232940435409546,
"learning_rate": 8.852956288622373e-07,
"loss": 0.4256327450275421,
"step": 1664
},
{
"epoch": 1.797195253505933,
"grad_norm": 2.126704692840576,
"learning_rate": 8.832168755026495e-07,
"loss": 0.1840769350528717,
"step": 1666
},
{
"epoch": 1.7993527508090614,
"grad_norm": 1.6251252889633179,
"learning_rate": 8.81139349153341e-07,
"loss": 0.4822881519794464,
"step": 1668
},
{
"epoch": 1.8015102481121898,
"grad_norm": 1.9323124885559082,
"learning_rate": 8.790630615732808e-07,
"loss": 0.4157404899597168,
"step": 1670
},
{
"epoch": 1.8036677454153183,
"grad_norm": 4.677561283111572,
"learning_rate": 8.769880245144277e-07,
"loss": 0.3802054226398468,
"step": 1672
},
{
"epoch": 1.8058252427184467,
"grad_norm": 0.5432685613632202,
"learning_rate": 8.749142497216613e-07,
"loss": 0.22273704409599304,
"step": 1674
},
{
"epoch": 1.807982740021575,
"grad_norm": 3.5203936100006104,
"learning_rate": 8.728417489327174e-07,
"loss": 0.546721339225769,
"step": 1676
},
{
"epoch": 1.8101402373247033,
"grad_norm": 1.5164800882339478,
"learning_rate": 8.707705338781202e-07,
"loss": 0.5539653897285461,
"step": 1678
},
{
"epoch": 1.8122977346278317,
"grad_norm": 4.041696071624756,
"learning_rate": 8.687006162811175e-07,
"loss": 0.48323866724967957,
"step": 1680
},
{
"epoch": 1.81445523193096,
"grad_norm": 2.8498449325561523,
"learning_rate": 8.666320078576125e-07,
"loss": 0.37030312418937683,
"step": 1682
},
{
"epoch": 1.8166127292340883,
"grad_norm": 2.4165847301483154,
"learning_rate": 8.645647203160988e-07,
"loss": 0.535261869430542,
"step": 1684
},
{
"epoch": 1.8187702265372168,
"grad_norm": 1.3950622081756592,
"learning_rate": 8.624987653575935e-07,
"loss": 0.09442806243896484,
"step": 1686
},
{
"epoch": 1.8209277238403452,
"grad_norm": 3.6123199462890625,
"learning_rate": 8.604341546755711e-07,
"loss": 0.4735386073589325,
"step": 1688
},
{
"epoch": 1.8230852211434736,
"grad_norm": 1.8474417924880981,
"learning_rate": 8.583708999558981e-07,
"loss": 0.42983824014663696,
"step": 1690
},
{
"epoch": 1.825242718446602,
"grad_norm": 4.4611406326293945,
"learning_rate": 8.563090128767643e-07,
"loss": 0.4846471846103668,
"step": 1692
},
{
"epoch": 1.8274002157497304,
"grad_norm": 4.02655553817749,
"learning_rate": 8.54248505108621e-07,
"loss": 0.4285997152328491,
"step": 1694
},
{
"epoch": 1.8295577130528586,
"grad_norm": 1.1664454936981201,
"learning_rate": 8.521893883141114e-07,
"loss": 0.3732617199420929,
"step": 1696
},
{
"epoch": 1.831715210355987,
"grad_norm": 2.430764675140381,
"learning_rate": 8.501316741480044e-07,
"loss": 0.5520771741867065,
"step": 1698
},
{
"epoch": 1.8338727076591155,
"grad_norm": 1.7392953634262085,
"learning_rate": 8.480753742571325e-07,
"loss": 0.4468059241771698,
"step": 1700
},
{
"epoch": 1.8360302049622437,
"grad_norm": 1.8151521682739258,
"learning_rate": 8.460205002803206e-07,
"loss": 0.623181939125061,
"step": 1702
},
{
"epoch": 1.838187702265372,
"grad_norm": 1.6103137731552124,
"learning_rate": 8.439670638483254e-07,
"loss": 0.47068604826927185,
"step": 1704
},
{
"epoch": 1.8403451995685005,
"grad_norm": 1.699935793876648,
"learning_rate": 8.419150765837644e-07,
"loss": 0.461783230304718,
"step": 1706
},
{
"epoch": 1.842502696871629,
"grad_norm": 1.5268728733062744,
"learning_rate": 8.398645501010544e-07,
"loss": 0.4249412715435028,
"step": 1708
},
{
"epoch": 1.8446601941747574,
"grad_norm": 1.973617434501648,
"learning_rate": 8.378154960063439e-07,
"loss": 0.3225463628768921,
"step": 1710
},
{
"epoch": 1.8468176914778858,
"grad_norm": 2.0861403942108154,
"learning_rate": 8.357679258974471e-07,
"loss": 0.41262945532798767,
"step": 1712
},
{
"epoch": 1.8489751887810142,
"grad_norm": 1.1829684972763062,
"learning_rate": 8.33721851363779e-07,
"loss": 0.3057762086391449,
"step": 1714
},
{
"epoch": 1.8511326860841424,
"grad_norm": 2.940964937210083,
"learning_rate": 8.316772839862889e-07,
"loss": 0.49465298652648926,
"step": 1716
},
{
"epoch": 1.8532901833872708,
"grad_norm": 2.7260243892669678,
"learning_rate": 8.296342353373964e-07,
"loss": 0.3695753216743469,
"step": 1718
},
{
"epoch": 1.855447680690399,
"grad_norm": 1.9509867429733276,
"learning_rate": 8.275927169809245e-07,
"loss": 0.33289045095443726,
"step": 1720
},
{
"epoch": 1.8576051779935274,
"grad_norm": 1.32254159450531,
"learning_rate": 8.255527404720346e-07,
"loss": 0.48791223764419556,
"step": 1722
},
{
"epoch": 1.8597626752966558,
"grad_norm": 1.0751956701278687,
"learning_rate": 8.235143173571615e-07,
"loss": 0.4154895544052124,
"step": 1724
},
{
"epoch": 1.8619201725997843,
"grad_norm": 2.691671371459961,
"learning_rate": 8.214774591739469e-07,
"loss": 0.4291550815105438,
"step": 1726
},
{
"epoch": 1.8640776699029127,
"grad_norm": 2.053277015686035,
"learning_rate": 8.194421774511757e-07,
"loss": 0.19994314014911652,
"step": 1728
},
{
"epoch": 1.866235167206041,
"grad_norm": 0.4679964482784271,
"learning_rate": 8.174084837087091e-07,
"loss": 0.25225332379341125,
"step": 1730
},
{
"epoch": 1.8683926645091695,
"grad_norm": 8.203169822692871,
"learning_rate": 8.15376389457421e-07,
"loss": 0.4160417914390564,
"step": 1732
},
{
"epoch": 1.8705501618122977,
"grad_norm": 2.1094777584075928,
"learning_rate": 8.133459061991312e-07,
"loss": 0.3063911199569702,
"step": 1734
},
{
"epoch": 1.8727076591154261,
"grad_norm": 5.972533702850342,
"learning_rate": 8.113170454265421e-07,
"loss": 0.48280882835388184,
"step": 1736
},
{
"epoch": 1.8748651564185543,
"grad_norm": 1.7371788024902344,
"learning_rate": 8.092898186231722e-07,
"loss": 0.5959540605545044,
"step": 1738
},
{
"epoch": 1.8770226537216828,
"grad_norm": 3.3627707958221436,
"learning_rate": 8.072642372632914e-07,
"loss": 0.4884318709373474,
"step": 1740
},
{
"epoch": 1.8791801510248112,
"grad_norm": 2.1984074115753174,
"learning_rate": 8.052403128118564e-07,
"loss": 0.6091974377632141,
"step": 1742
},
{
"epoch": 1.8813376483279396,
"grad_norm": 2.3978047370910645,
"learning_rate": 8.032180567244457e-07,
"loss": 0.44491517543792725,
"step": 1744
},
{
"epoch": 1.883495145631068,
"grad_norm": 1.8564910888671875,
"learning_rate": 8.011974804471953e-07,
"loss": 0.356891930103302,
"step": 1746
},
{
"epoch": 1.8856526429341964,
"grad_norm": 1.3446942567825317,
"learning_rate": 7.991785954167318e-07,
"loss": 0.23551291227340698,
"step": 1748
},
{
"epoch": 1.8878101402373249,
"grad_norm": 0.45997709035873413,
"learning_rate": 7.971614130601109e-07,
"loss": 0.4230949878692627,
"step": 1750
},
{
"epoch": 1.889967637540453,
"grad_norm": 38.918373107910156,
"learning_rate": 7.951459447947506e-07,
"loss": 0.47550415992736816,
"step": 1752
},
{
"epoch": 1.8921251348435815,
"grad_norm": 2.1779584884643555,
"learning_rate": 7.931322020283658e-07,
"loss": 0.34226706624031067,
"step": 1754
},
{
"epoch": 1.8942826321467097,
"grad_norm": 1.3864846229553223,
"learning_rate": 7.911201961589067e-07,
"loss": 0.4829237163066864,
"step": 1756
},
{
"epoch": 1.896440129449838,
"grad_norm": 13.083473205566406,
"learning_rate": 7.89109938574491e-07,
"loss": 0.4935177266597748,
"step": 1758
},
{
"epoch": 1.8985976267529665,
"grad_norm": 1.8217555284500122,
"learning_rate": 7.871014406533422e-07,
"loss": 0.33267736434936523,
"step": 1760
},
{
"epoch": 1.900755124056095,
"grad_norm": 1.1673423051834106,
"learning_rate": 7.850947137637231e-07,
"loss": 0.5361051559448242,
"step": 1762
},
{
"epoch": 1.9029126213592233,
"grad_norm": 2.398650646209717,
"learning_rate": 7.830897692638723e-07,
"loss": 0.45928269624710083,
"step": 1764
},
{
"epoch": 1.9050701186623518,
"grad_norm": 1.6035159826278687,
"learning_rate": 7.810866185019411e-07,
"loss": 0.40345799922943115,
"step": 1766
},
{
"epoch": 1.9072276159654802,
"grad_norm": 3.5140364170074463,
"learning_rate": 7.790852728159263e-07,
"loss": 0.4371829032897949,
"step": 1768
},
{
"epoch": 1.9093851132686084,
"grad_norm": 2.60213041305542,
"learning_rate": 7.770857435336096e-07,
"loss": 0.4061744213104248,
"step": 1770
},
{
"epoch": 1.9115426105717368,
"grad_norm": 1.3775845766067505,
"learning_rate": 7.750880419724901e-07,
"loss": 0.4554259181022644,
"step": 1772
},
{
"epoch": 1.913700107874865,
"grad_norm": 1.0794130563735962,
"learning_rate": 7.730921794397233e-07,
"loss": 0.5084207057952881,
"step": 1774
},
{
"epoch": 1.9158576051779934,
"grad_norm": 2.0664308071136475,
"learning_rate": 7.710981672320547e-07,
"loss": 0.41404515504837036,
"step": 1776
},
{
"epoch": 1.9180151024811218,
"grad_norm": 2.6501731872558594,
"learning_rate": 7.691060166357565e-07,
"loss": 0.43099674582481384,
"step": 1778
},
{
"epoch": 1.9201725997842503,
"grad_norm": 1.3328322172164917,
"learning_rate": 7.671157389265657e-07,
"loss": 0.28375762701034546,
"step": 1780
},
{
"epoch": 1.9223300970873787,
"grad_norm": 1.3328646421432495,
"learning_rate": 7.651273453696166e-07,
"loss": 0.3038649260997772,
"step": 1782
},
{
"epoch": 1.924487594390507,
"grad_norm": 1.6350358724594116,
"learning_rate": 7.631408472193804e-07,
"loss": 0.37957847118377686,
"step": 1784
},
{
"epoch": 1.9266450916936355,
"grad_norm": 2.626065731048584,
"learning_rate": 7.611562557195992e-07,
"loss": 0.5506111979484558,
"step": 1786
},
{
"epoch": 1.9288025889967637,
"grad_norm": 1.2828840017318726,
"learning_rate": 7.591735821032246e-07,
"loss": 0.27725642919540405,
"step": 1788
},
{
"epoch": 1.9309600862998921,
"grad_norm": 2.3256094455718994,
"learning_rate": 7.571928375923513e-07,
"loss": 0.5789600014686584,
"step": 1790
},
{
"epoch": 1.9331175836030206,
"grad_norm": 1.5279923677444458,
"learning_rate": 7.552140333981565e-07,
"loss": 0.3936736583709717,
"step": 1792
},
{
"epoch": 1.9352750809061487,
"grad_norm": 1.2446404695510864,
"learning_rate": 7.532371807208333e-07,
"loss": 0.3211576044559479,
"step": 1794
},
{
"epoch": 1.9374325782092772,
"grad_norm": 4.188495635986328,
"learning_rate": 7.51262290749531e-07,
"loss": 0.6068055629730225,
"step": 1796
},
{
"epoch": 1.9395900755124056,
"grad_norm": 0.5956944227218628,
"learning_rate": 7.49289374662289e-07,
"loss": 0.49566900730133057,
"step": 1798
},
{
"epoch": 1.941747572815534,
"grad_norm": 1.9421483278274536,
"learning_rate": 7.473184436259737e-07,
"loss": 0.6433679461479187,
"step": 1800
},
{
"epoch": 1.9439050701186624,
"grad_norm": 1.5543241500854492,
"learning_rate": 7.453495087962171e-07,
"loss": 0.24959444999694824,
"step": 1802
},
{
"epoch": 1.9460625674217908,
"grad_norm": 1.5686687231063843,
"learning_rate": 7.433825813173513e-07,
"loss": 0.5422605872154236,
"step": 1804
},
{
"epoch": 1.948220064724919,
"grad_norm": 1.1901352405548096,
"learning_rate": 7.414176723223484e-07,
"loss": 0.28422844409942627,
"step": 1806
},
{
"epoch": 1.9503775620280475,
"grad_norm": 1.0920721292495728,
"learning_rate": 7.394547929327533e-07,
"loss": 0.3562416732311249,
"step": 1808
},
{
"epoch": 1.9525350593311759,
"grad_norm": 1.3346633911132812,
"learning_rate": 7.374939542586249e-07,
"loss": 0.43261829018592834,
"step": 1810
},
{
"epoch": 1.954692556634304,
"grad_norm": 0.9234395623207092,
"learning_rate": 7.355351673984718e-07,
"loss": 0.1937822848558426,
"step": 1812
},
{
"epoch": 1.9568500539374325,
"grad_norm": 1.3584299087524414,
"learning_rate": 7.335784434391874e-07,
"loss": 0.48144611716270447,
"step": 1814
},
{
"epoch": 1.959007551240561,
"grad_norm": 3.7962646484375,
"learning_rate": 7.316237934559906e-07,
"loss": 0.5200175642967224,
"step": 1816
},
{
"epoch": 1.9611650485436893,
"grad_norm": 2.161349058151245,
"learning_rate": 7.296712285123603e-07,
"loss": 0.535617470741272,
"step": 1818
},
{
"epoch": 1.9633225458468178,
"grad_norm": 1.4636130332946777,
"learning_rate": 7.277207596599746e-07,
"loss": 0.5756503343582153,
"step": 1820
},
{
"epoch": 1.9654800431499462,
"grad_norm": 1.4813960790634155,
"learning_rate": 7.25772397938647e-07,
"loss": 0.47899457812309265,
"step": 1822
},
{
"epoch": 1.9676375404530746,
"grad_norm": 1.616752028465271,
"learning_rate": 7.238261543762651e-07,
"loss": 0.446144700050354,
"step": 1824
},
{
"epoch": 1.9697950377562028,
"grad_norm": 3.5466485023498535,
"learning_rate": 7.218820399887274e-07,
"loss": 0.6149036884307861,
"step": 1826
},
{
"epoch": 1.9719525350593312,
"grad_norm": 1.1728744506835938,
"learning_rate": 7.199400657798802e-07,
"loss": 0.3574240207672119,
"step": 1828
},
{
"epoch": 1.9741100323624594,
"grad_norm": 1.4667657613754272,
"learning_rate": 7.180002427414584e-07,
"loss": 0.4160582721233368,
"step": 1830
},
{
"epoch": 1.9762675296655878,
"grad_norm": 1.9609451293945312,
"learning_rate": 7.160625818530175e-07,
"loss": 0.4743785560131073,
"step": 1832
},
{
"epoch": 1.9784250269687162,
"grad_norm": 1.3401987552642822,
"learning_rate": 7.141270940818789e-07,
"loss": 0.4877952039241791,
"step": 1834
},
{
"epoch": 1.9805825242718447,
"grad_norm": 2.090475559234619,
"learning_rate": 7.121937903830615e-07,
"loss": 0.4774564206600189,
"step": 1836
},
{
"epoch": 1.982740021574973,
"grad_norm": 1.7114410400390625,
"learning_rate": 7.102626816992228e-07,
"loss": 0.5767732262611389,
"step": 1838
},
{
"epoch": 1.9848975188781015,
"grad_norm": 1.462022304534912,
"learning_rate": 7.08333778960597e-07,
"loss": 0.4107472598552704,
"step": 1840
},
{
"epoch": 1.98705501618123,
"grad_norm": 20.44681739807129,
"learning_rate": 7.064070930849315e-07,
"loss": 0.44799551367759705,
"step": 1842
},
{
"epoch": 1.9892125134843581,
"grad_norm": 1.4650821685791016,
"learning_rate": 7.044826349774271e-07,
"loss": 0.45217186212539673,
"step": 1844
},
{
"epoch": 1.9913700107874865,
"grad_norm": 1.2852694988250732,
"learning_rate": 7.025604155306735e-07,
"loss": 0.5372745394706726,
"step": 1846
},
{
"epoch": 1.9935275080906147,
"grad_norm": 3.7176291942596436,
"learning_rate": 7.006404456245918e-07,
"loss": 0.3280995488166809,
"step": 1848
},
{
"epoch": 1.9956850053937432,
"grad_norm": 2.549321174621582,
"learning_rate": 6.987227361263687e-07,
"loss": 0.419173002243042,
"step": 1850
},
{
"epoch": 1.9978425026968716,
"grad_norm": 0.8345087170600891,
"learning_rate": 6.968072978903971e-07,
"loss": 0.2861520051956177,
"step": 1852
},
{
"epoch": 2.0,
"grad_norm": 1.6546863317489624,
"learning_rate": 6.94894141758215e-07,
"loss": 0.38616907596588135,
"step": 1854
},
{
"epoch": 2.0021574973031284,
"grad_norm": 2.098503351211548,
"learning_rate": 6.929832785584435e-07,
"loss": 0.41417112946510315,
"step": 1856
},
{
"epoch": 2.004314994606257,
"grad_norm": 1.7302627563476562,
"learning_rate": 6.910747191067247e-07,
"loss": 0.2879858613014221,
"step": 1858
},
{
"epoch": 2.0064724919093853,
"grad_norm": 1.0965220928192139,
"learning_rate": 6.891684742056614e-07,
"loss": 0.3841347396373749,
"step": 1860
},
{
"epoch": 2.0086299892125137,
"grad_norm": 1.9664329290390015,
"learning_rate": 6.872645546447569e-07,
"loss": 0.13829857110977173,
"step": 1862
},
{
"epoch": 2.0107874865156417,
"grad_norm": 1.8741497993469238,
"learning_rate": 6.85362971200352e-07,
"loss": 0.3561688959598541,
"step": 1864
},
{
"epoch": 2.01294498381877,
"grad_norm": 1.2023261785507202,
"learning_rate": 6.834637346355648e-07,
"loss": 0.24942456185817719,
"step": 1866
},
{
"epoch": 2.0151024811218985,
"grad_norm": 3.097130298614502,
"learning_rate": 6.815668557002304e-07,
"loss": 0.16739408671855927,
"step": 1868
},
{
"epoch": 2.017259978425027,
"grad_norm": 2.240835189819336,
"learning_rate": 6.796723451308395e-07,
"loss": 0.287383109331131,
"step": 1870
},
{
"epoch": 2.0194174757281553,
"grad_norm": 1.164444923400879,
"learning_rate": 6.777802136504772e-07,
"loss": 0.27731871604919434,
"step": 1872
},
{
"epoch": 2.0215749730312838,
"grad_norm": 3.692326307296753,
"learning_rate": 6.758904719687624e-07,
"loss": 0.42448198795318604,
"step": 1874
},
{
"epoch": 2.023732470334412,
"grad_norm": 12.38591480255127,
"learning_rate": 6.740031307817894e-07,
"loss": 0.46731823682785034,
"step": 1876
},
{
"epoch": 2.0258899676375406,
"grad_norm": 1.388956904411316,
"learning_rate": 6.72118200772063e-07,
"loss": 0.45638781785964966,
"step": 1878
},
{
"epoch": 2.028047464940669,
"grad_norm": 2.0737364292144775,
"learning_rate": 6.702356926084422e-07,
"loss": 0.26328131556510925,
"step": 1880
},
{
"epoch": 2.030204962243797,
"grad_norm": 4.843571662902832,
"learning_rate": 6.683556169460786e-07,
"loss": 0.3340507447719574,
"step": 1882
},
{
"epoch": 2.0323624595469254,
"grad_norm": 0.6562245488166809,
"learning_rate": 6.664779844263533e-07,
"loss": 0.36223921179771423,
"step": 1884
},
{
"epoch": 2.034519956850054,
"grad_norm": 1.3073861598968506,
"learning_rate": 6.646028056768215e-07,
"loss": 0.3697828948497772,
"step": 1886
},
{
"epoch": 2.0366774541531822,
"grad_norm": 0.8905764818191528,
"learning_rate": 6.627300913111484e-07,
"loss": 0.23265878856182098,
"step": 1888
},
{
"epoch": 2.0388349514563107,
"grad_norm": 1.8539386987686157,
"learning_rate": 6.608598519290517e-07,
"loss": 0.2889014780521393,
"step": 1890
},
{
"epoch": 2.040992448759439,
"grad_norm": 1.5576704740524292,
"learning_rate": 6.589920981162384e-07,
"loss": 0.2241078019142151,
"step": 1892
},
{
"epoch": 2.0431499460625675,
"grad_norm": 1.355921983718872,
"learning_rate": 6.5712684044435e-07,
"loss": 0.3171182870864868,
"step": 1894
},
{
"epoch": 2.045307443365696,
"grad_norm": 1.5620806217193604,
"learning_rate": 6.552640894708971e-07,
"loss": 0.2683061361312866,
"step": 1896
},
{
"epoch": 2.0474649406688243,
"grad_norm": 1.372431755065918,
"learning_rate": 6.534038557392031e-07,
"loss": 0.3898204267024994,
"step": 1898
},
{
"epoch": 2.0496224379719523,
"grad_norm": 3.4002630710601807,
"learning_rate": 6.515461497783441e-07,
"loss": 0.18718461692333221,
"step": 1900
},
{
"epoch": 2.0517799352750807,
"grad_norm": 0.8313205242156982,
"learning_rate": 6.49690982103088e-07,
"loss": 0.26798462867736816,
"step": 1902
},
{
"epoch": 2.053937432578209,
"grad_norm": 0.6434590816497803,
"learning_rate": 6.478383632138364e-07,
"loss": 0.20526859164237976,
"step": 1904
},
{
"epoch": 2.0560949298813376,
"grad_norm": 1.930409550666809,
"learning_rate": 6.459883035965637e-07,
"loss": 0.13682284951210022,
"step": 1906
},
{
"epoch": 2.058252427184466,
"grad_norm": 1.254040241241455,
"learning_rate": 6.441408137227597e-07,
"loss": 0.21237482130527496,
"step": 1908
},
{
"epoch": 2.0604099244875944,
"grad_norm": 2.053589105606079,
"learning_rate": 6.422959040493687e-07,
"loss": 0.30055493116378784,
"step": 1910
},
{
"epoch": 2.062567421790723,
"grad_norm": 2.110161304473877,
"learning_rate": 6.404535850187305e-07,
"loss": 0.30984535813331604,
"step": 1912
},
{
"epoch": 2.0647249190938513,
"grad_norm": 1.3705302476882935,
"learning_rate": 6.386138670585226e-07,
"loss": 0.3130619525909424,
"step": 1914
},
{
"epoch": 2.0668824163969797,
"grad_norm": 2.5152392387390137,
"learning_rate": 6.367767605816994e-07,
"loss": 0.30358609557151794,
"step": 1916
},
{
"epoch": 2.0690399137001076,
"grad_norm": 2.5105814933776855,
"learning_rate": 6.349422759864343e-07,
"loss": 0.3234387934207916,
"step": 1918
},
{
"epoch": 2.071197411003236,
"grad_norm": 10.742331504821777,
"learning_rate": 6.331104236560605e-07,
"loss": 0.2621289789676666,
"step": 1920
},
{
"epoch": 2.0733549083063645,
"grad_norm": 5.28401517868042,
"learning_rate": 6.312812139590132e-07,
"loss": 0.3532802164554596,
"step": 1922
},
{
"epoch": 2.075512405609493,
"grad_norm": 1.3391534090042114,
"learning_rate": 6.294546572487688e-07,
"loss": 0.32384493947029114,
"step": 1924
},
{
"epoch": 2.0776699029126213,
"grad_norm": 2.9051060676574707,
"learning_rate": 6.276307638637881e-07,
"loss": 0.38077038526535034,
"step": 1926
},
{
"epoch": 2.0798274002157497,
"grad_norm": 1.0339443683624268,
"learning_rate": 6.258095441274582e-07,
"loss": 0.348030686378479,
"step": 1928
},
{
"epoch": 2.081984897518878,
"grad_norm": 1.0094170570373535,
"learning_rate": 6.239910083480317e-07,
"loss": 0.22280654311180115,
"step": 1930
},
{
"epoch": 2.0841423948220066,
"grad_norm": 6.025458812713623,
"learning_rate": 6.221751668185706e-07,
"loss": 0.2871300280094147,
"step": 1932
},
{
"epoch": 2.086299892125135,
"grad_norm": 1.6982316970825195,
"learning_rate": 6.203620298168865e-07,
"loss": 0.36530792713165283,
"step": 1934
},
{
"epoch": 2.0884573894282634,
"grad_norm": 1.1299806833267212,
"learning_rate": 6.185516076054848e-07,
"loss": 0.291080117225647,
"step": 1936
},
{
"epoch": 2.0906148867313914,
"grad_norm": 2.090158700942993,
"learning_rate": 6.167439104315022e-07,
"loss": 0.28274258971214294,
"step": 1938
},
{
"epoch": 2.09277238403452,
"grad_norm": 1.412570595741272,
"learning_rate": 6.14938948526654e-07,
"loss": 0.14839334785938263,
"step": 1940
},
{
"epoch": 2.0949298813376482,
"grad_norm": 1.48866868019104,
"learning_rate": 6.131367321071736e-07,
"loss": 0.20612022280693054,
"step": 1942
},
{
"epoch": 2.0970873786407767,
"grad_norm": 1.3251118659973145,
"learning_rate": 6.113372713737521e-07,
"loss": 0.2410675585269928,
"step": 1944
},
{
"epoch": 2.099244875943905,
"grad_norm": 1.649924635887146,
"learning_rate": 6.095405765114863e-07,
"loss": 0.4107120931148529,
"step": 1946
},
{
"epoch": 2.1014023732470335,
"grad_norm": 2.8146562576293945,
"learning_rate": 6.077466576898161e-07,
"loss": 0.22407367825508118,
"step": 1948
},
{
"epoch": 2.103559870550162,
"grad_norm": 1.3203238248825073,
"learning_rate": 6.05955525062469e-07,
"loss": 0.3075147867202759,
"step": 1950
},
{
"epoch": 2.1057173678532903,
"grad_norm": 2.156553268432617,
"learning_rate": 6.04167188767403e-07,
"loss": 0.32935836911201477,
"step": 1952
},
{
"epoch": 2.1078748651564188,
"grad_norm": 3.6382105350494385,
"learning_rate": 6.023816589267486e-07,
"loss": 0.3246581554412842,
"step": 1954
},
{
"epoch": 2.1100323624595467,
"grad_norm": 4.373478412628174,
"learning_rate": 6.005989456467511e-07,
"loss": 0.2509233355522156,
"step": 1956
},
{
"epoch": 2.112189859762675,
"grad_norm": 1.778868556022644,
"learning_rate": 5.988190590177132e-07,
"loss": 0.3160122036933899,
"step": 1958
},
{
"epoch": 2.1143473570658036,
"grad_norm": 0.4364719092845917,
"learning_rate": 5.970420091139407e-07,
"loss": 0.04425504431128502,
"step": 1960
},
{
"epoch": 2.116504854368932,
"grad_norm": 1.821292519569397,
"learning_rate": 5.952678059936811e-07,
"loss": 0.18517985939979553,
"step": 1962
},
{
"epoch": 2.1186623516720604,
"grad_norm": 3.6087453365325928,
"learning_rate": 5.934964596990697e-07,
"loss": 0.2705124616622925,
"step": 1964
},
{
"epoch": 2.120819848975189,
"grad_norm": 1.464837670326233,
"learning_rate": 5.917279802560719e-07,
"loss": 0.21107757091522217,
"step": 1966
},
{
"epoch": 2.1229773462783172,
"grad_norm": 4.029551029205322,
"learning_rate": 5.899623776744268e-07,
"loss": 0.23950833082199097,
"step": 1968
},
{
"epoch": 2.1251348435814457,
"grad_norm": 1.4416351318359375,
"learning_rate": 5.881996619475898e-07,
"loss": 0.3448520600795746,
"step": 1970
},
{
"epoch": 2.127292340884574,
"grad_norm": 2.3087425231933594,
"learning_rate": 5.864398430526765e-07,
"loss": 0.18799349665641785,
"step": 1972
},
{
"epoch": 2.129449838187702,
"grad_norm": 0.4913981556892395,
"learning_rate": 5.846829309504064e-07,
"loss": 0.23318088054656982,
"step": 1974
},
{
"epoch": 2.1316073354908305,
"grad_norm": 0.16859030723571777,
"learning_rate": 5.829289355850464e-07,
"loss": 0.22492466866970062,
"step": 1976
},
{
"epoch": 2.133764832793959,
"grad_norm": 1.511398196220398,
"learning_rate": 5.811778668843541e-07,
"loss": 0.20076408982276917,
"step": 1978
},
{
"epoch": 2.1359223300970873,
"grad_norm": 3.2070164680480957,
"learning_rate": 5.794297347595216e-07,
"loss": 0.2566869258880615,
"step": 1980
},
{
"epoch": 2.1380798274002157,
"grad_norm": 1.4940425157546997,
"learning_rate": 5.77684549105121e-07,
"loss": 0.34161150455474854,
"step": 1982
},
{
"epoch": 2.140237324703344,
"grad_norm": 1.6653574705123901,
"learning_rate": 5.75942319799046e-07,
"loss": 0.2889230251312256,
"step": 1984
},
{
"epoch": 2.1423948220064726,
"grad_norm": 1.7484220266342163,
"learning_rate": 5.742030567024571e-07,
"loss": 0.2357415109872818,
"step": 1986
},
{
"epoch": 2.144552319309601,
"grad_norm": 2.1993203163146973,
"learning_rate": 5.724667696597274e-07,
"loss": 0.43604907393455505,
"step": 1988
},
{
"epoch": 2.1467098166127294,
"grad_norm": 3.952744960784912,
"learning_rate": 5.707334684983824e-07,
"loss": 0.1417762041091919,
"step": 1990
},
{
"epoch": 2.148867313915858,
"grad_norm": 1.5836926698684692,
"learning_rate": 5.690031630290504e-07,
"loss": 0.3969094753265381,
"step": 1992
},
{
"epoch": 2.151024811218986,
"grad_norm": 2.5084621906280518,
"learning_rate": 5.672758630454016e-07,
"loss": 0.3280077576637268,
"step": 1994
},
{
"epoch": 2.1531823085221142,
"grad_norm": 2.075791835784912,
"learning_rate": 5.655515783240958e-07,
"loss": 0.23175282776355743,
"step": 1996
},
{
"epoch": 2.1553398058252426,
"grad_norm": 1.316325306892395,
"learning_rate": 5.63830318624726e-07,
"loss": 0.3914681077003479,
"step": 1998
},
{
"epoch": 2.157497303128371,
"grad_norm": 1.3925992250442505,
"learning_rate": 5.621120936897634e-07,
"loss": 0.2599402368068695,
"step": 2000
},
{
"epoch": 2.1596548004314995,
"grad_norm": 0.6926285624504089,
"learning_rate": 5.60396913244503e-07,
"loss": 0.20738252997398376,
"step": 2002
},
{
"epoch": 2.161812297734628,
"grad_norm": 2.7593421936035156,
"learning_rate": 5.586847869970058e-07,
"loss": 0.3029998242855072,
"step": 2004
},
{
"epoch": 2.1639697950377563,
"grad_norm": 1.116198182106018,
"learning_rate": 5.569757246380473e-07,
"loss": 0.3626508414745331,
"step": 2006
},
{
"epoch": 2.1661272923408847,
"grad_norm": 1.4919781684875488,
"learning_rate": 5.552697358410607e-07,
"loss": 0.1747465282678604,
"step": 2008
},
{
"epoch": 2.168284789644013,
"grad_norm": 2.0446255207061768,
"learning_rate": 5.535668302620828e-07,
"loss": 0.3265528082847595,
"step": 2010
},
{
"epoch": 2.170442286947141,
"grad_norm": 1.902366280555725,
"learning_rate": 5.518670175396986e-07,
"loss": 0.20548182725906372,
"step": 2012
},
{
"epoch": 2.1725997842502696,
"grad_norm": 1.5543972253799438,
"learning_rate": 5.50170307294988e-07,
"loss": 0.30606332421302795,
"step": 2014
},
{
"epoch": 2.174757281553398,
"grad_norm": 4.172656059265137,
"learning_rate": 5.484767091314703e-07,
"loss": 0.43032437562942505,
"step": 2016
},
{
"epoch": 2.1769147788565264,
"grad_norm": 1.8856433629989624,
"learning_rate": 5.467862326350495e-07,
"loss": 0.3198752701282501,
"step": 2018
},
{
"epoch": 2.179072276159655,
"grad_norm": 2.642690420150757,
"learning_rate": 5.450988873739622e-07,
"loss": 0.39496558904647827,
"step": 2020
},
{
"epoch": 2.1812297734627832,
"grad_norm": 1.6509486436843872,
"learning_rate": 5.434146828987205e-07,
"loss": 0.3583213686943054,
"step": 2022
},
{
"epoch": 2.1833872707659117,
"grad_norm": 5.141426086425781,
"learning_rate": 5.417336287420602e-07,
"loss": 0.3227855861186981,
"step": 2024
},
{
"epoch": 2.18554476806904,
"grad_norm": 2.045142412185669,
"learning_rate": 5.400557344188854e-07,
"loss": 0.3605496883392334,
"step": 2026
},
{
"epoch": 2.1877022653721685,
"grad_norm": 1.7817591428756714,
"learning_rate": 5.383810094262164e-07,
"loss": 0.2572648525238037,
"step": 2028
},
{
"epoch": 2.1898597626752965,
"grad_norm": 1.3152915239334106,
"learning_rate": 5.367094632431337e-07,
"loss": 0.23091773688793182,
"step": 2030
},
{
"epoch": 2.192017259978425,
"grad_norm": 2.0624170303344727,
"learning_rate": 5.350411053307258e-07,
"loss": 0.32704049348831177,
"step": 2032
},
{
"epoch": 2.1941747572815533,
"grad_norm": 0.41580072045326233,
"learning_rate": 5.33375945132036e-07,
"loss": 0.1712155044078827,
"step": 2034
},
{
"epoch": 2.1963322545846817,
"grad_norm": 6.342857360839844,
"learning_rate": 5.317139920720069e-07,
"loss": 0.11315549165010452,
"step": 2036
},
{
"epoch": 2.19848975188781,
"grad_norm": 2.2447385787963867,
"learning_rate": 5.300552555574296e-07,
"loss": 0.2954585552215576,
"step": 2038
},
{
"epoch": 2.2006472491909386,
"grad_norm": 1.2444943189620972,
"learning_rate": 5.28399744976889e-07,
"loss": 0.3177575170993805,
"step": 2040
},
{
"epoch": 2.202804746494067,
"grad_norm": 1.3165860176086426,
"learning_rate": 5.267474697007111e-07,
"loss": 0.18876095116138458,
"step": 2042
},
{
"epoch": 2.2049622437971954,
"grad_norm": 1.2250696420669556,
"learning_rate": 5.250984390809092e-07,
"loss": 0.36252525448799133,
"step": 2044
},
{
"epoch": 2.207119741100324,
"grad_norm": 2.139589309692383,
"learning_rate": 5.234526624511319e-07,
"loss": 0.30584216117858887,
"step": 2046
},
{
"epoch": 2.209277238403452,
"grad_norm": 1.1470484733581543,
"learning_rate": 5.218101491266108e-07,
"loss": 0.27216002345085144,
"step": 2048
},
{
"epoch": 2.2114347357065802,
"grad_norm": 1.030771017074585,
"learning_rate": 5.201709084041051e-07,
"loss": 0.26957935094833374,
"step": 2050
},
{
"epoch": 2.2135922330097086,
"grad_norm": 3.0721845626831055,
"learning_rate": 5.185349495618523e-07,
"loss": 0.35413840413093567,
"step": 2052
},
{
"epoch": 2.215749730312837,
"grad_norm": 1.2705237865447998,
"learning_rate": 5.169022818595139e-07,
"loss": 0.2741287648677826,
"step": 2054
},
{
"epoch": 2.2179072276159655,
"grad_norm": 1.2934070825576782,
"learning_rate": 5.152729145381226e-07,
"loss": 0.4308694899082184,
"step": 2056
},
{
"epoch": 2.220064724919094,
"grad_norm": 1.0611752271652222,
"learning_rate": 5.136468568200319e-07,
"loss": 0.3000924587249756,
"step": 2058
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.2346817255020142,
"learning_rate": 5.120241179088615e-07,
"loss": 0.24799837172031403,
"step": 2060
},
{
"epoch": 2.2243797195253507,
"grad_norm": 2.2104601860046387,
"learning_rate": 5.10404706989447e-07,
"loss": 0.26346859335899353,
"step": 2062
},
{
"epoch": 2.226537216828479,
"grad_norm": 1.240453839302063,
"learning_rate": 5.087886332277866e-07,
"loss": 0.22416910529136658,
"step": 2064
},
{
"epoch": 2.228694714131607,
"grad_norm": 1.543516755104065,
"learning_rate": 5.071759057709915e-07,
"loss": 0.2962421178817749,
"step": 2066
},
{
"epoch": 2.2308522114347356,
"grad_norm": 1.8429937362670898,
"learning_rate": 5.055665337472306e-07,
"loss": 0.21297654509544373,
"step": 2068
},
{
"epoch": 2.233009708737864,
"grad_norm": 1.4537338018417358,
"learning_rate": 5.039605262656816e-07,
"loss": 0.15425504744052887,
"step": 2070
},
{
"epoch": 2.2351672060409924,
"grad_norm": 2.0566556453704834,
"learning_rate": 5.023578924164795e-07,
"loss": 0.4284798502922058,
"step": 2072
},
{
"epoch": 2.237324703344121,
"grad_norm": 1.6528476476669312,
"learning_rate": 5.007586412706629e-07,
"loss": 0.3135349750518799,
"step": 2074
},
{
"epoch": 2.2394822006472492,
"grad_norm": 2.057396173477173,
"learning_rate": 4.991627818801245e-07,
"loss": 0.3012422025203705,
"step": 2076
},
{
"epoch": 2.2416396979503777,
"grad_norm": 2.47934889793396,
"learning_rate": 4.975703232775593e-07,
"loss": 0.24459701776504517,
"step": 2078
},
{
"epoch": 2.243797195253506,
"grad_norm": 1.943104863166809,
"learning_rate": 4.959812744764143e-07,
"loss": 0.3131766617298126,
"step": 2080
},
{
"epoch": 2.2459546925566345,
"grad_norm": 1.1001262664794922,
"learning_rate": 4.943956444708357e-07,
"loss": 0.21205957233905792,
"step": 2082
},
{
"epoch": 2.2481121898597625,
"grad_norm": 4.736835956573486,
"learning_rate": 4.928134422356194e-07,
"loss": 0.2981138825416565,
"step": 2084
},
{
"epoch": 2.250269687162891,
"grad_norm": 2.2798216342926025,
"learning_rate": 4.912346767261605e-07,
"loss": 0.1790456920862198,
"step": 2086
},
{
"epoch": 2.2524271844660193,
"grad_norm": 1.6988016366958618,
"learning_rate": 4.896593568784008e-07,
"loss": 0.35665163397789,
"step": 2088
},
{
"epoch": 2.2545846817691477,
"grad_norm": 8.441971778869629,
"learning_rate": 4.880874916087802e-07,
"loss": 0.42117640376091003,
"step": 2090
},
{
"epoch": 2.256742179072276,
"grad_norm": 1.6971404552459717,
"learning_rate": 4.865190898141847e-07,
"loss": 0.3492169678211212,
"step": 2092
},
{
"epoch": 2.2588996763754046,
"grad_norm": 0.9748818874359131,
"learning_rate": 4.849541603718984e-07,
"loss": 0.24743738770484924,
"step": 2094
},
{
"epoch": 2.261057173678533,
"grad_norm": 1.4215339422225952,
"learning_rate": 4.833927121395488e-07,
"loss": 0.3036370277404785,
"step": 2096
},
{
"epoch": 2.2632146709816614,
"grad_norm": 1.2809211015701294,
"learning_rate": 4.818347539550621e-07,
"loss": 0.2065061330795288,
"step": 2098
},
{
"epoch": 2.26537216828479,
"grad_norm": 1.3214771747589111,
"learning_rate": 4.802802946366094e-07,
"loss": 0.15301240980625153,
"step": 2100
},
{
"epoch": 2.267529665587918,
"grad_norm": 1.464438557624817,
"learning_rate": 4.787293429825575e-07,
"loss": 0.24719694256782532,
"step": 2102
},
{
"epoch": 2.269687162891046,
"grad_norm": 1.064639925956726,
"learning_rate": 4.771819077714207e-07,
"loss": 0.25772497057914734,
"step": 2104
},
{
"epoch": 2.2718446601941746,
"grad_norm": 1.1112815141677856,
"learning_rate": 4.756379977618093e-07,
"loss": 0.3478604257106781,
"step": 2106
},
{
"epoch": 2.274002157497303,
"grad_norm": 2.554323196411133,
"learning_rate": 4.740976216923803e-07,
"loss": 0.2710329592227936,
"step": 2108
},
{
"epoch": 2.2761596548004315,
"grad_norm": 1.0012887716293335,
"learning_rate": 4.725607882817886e-07,
"loss": 0.17554689943790436,
"step": 2110
},
{
"epoch": 2.27831715210356,
"grad_norm": 7.408605098724365,
"learning_rate": 4.710275062286379e-07,
"loss": 0.32163527607917786,
"step": 2112
},
{
"epoch": 2.2804746494066883,
"grad_norm": 1.3676038980484009,
"learning_rate": 4.694977842114303e-07,
"loss": 0.2676321864128113,
"step": 2114
},
{
"epoch": 2.2826321467098167,
"grad_norm": 3.824159860610962,
"learning_rate": 4.6797163088851777e-07,
"loss": 0.3604358434677124,
"step": 2116
},
{
"epoch": 2.284789644012945,
"grad_norm": 2.4927074909210205,
"learning_rate": 4.6644905489805377e-07,
"loss": 0.19784438610076904,
"step": 2118
},
{
"epoch": 2.286947141316073,
"grad_norm": 2.12221097946167,
"learning_rate": 4.6493006485794325e-07,
"loss": 0.14516694843769073,
"step": 2120
},
{
"epoch": 2.2891046386192015,
"grad_norm": 2.6523265838623047,
"learning_rate": 4.6341466936579445e-07,
"loss": 0.4613579511642456,
"step": 2122
},
{
"epoch": 2.29126213592233,
"grad_norm": 1.4518303871154785,
"learning_rate": 4.6190287699887e-07,
"loss": 0.29724588990211487,
"step": 2124
},
{
"epoch": 2.2934196332254584,
"grad_norm": 1.7566689252853394,
"learning_rate": 4.6039469631403926e-07,
"loss": 0.29199060797691345,
"step": 2126
},
{
"epoch": 2.295577130528587,
"grad_norm": 1.4850506782531738,
"learning_rate": 4.588901358477287e-07,
"loss": 0.34357935190200806,
"step": 2128
},
{
"epoch": 2.2977346278317152,
"grad_norm": 2.186891794204712,
"learning_rate": 4.5738920411587333e-07,
"loss": 0.35363560914993286,
"step": 2130
},
{
"epoch": 2.2998921251348436,
"grad_norm": 1.4798485040664673,
"learning_rate": 4.5589190961387085e-07,
"loss": 0.345289021730423,
"step": 2132
},
{
"epoch": 2.302049622437972,
"grad_norm": 1.1641236543655396,
"learning_rate": 4.543982608165307e-07,
"loss": 0.3444761633872986,
"step": 2134
},
{
"epoch": 2.3042071197411005,
"grad_norm": 5.22445821762085,
"learning_rate": 4.529082661780277e-07,
"loss": 0.4176110625267029,
"step": 2136
},
{
"epoch": 2.3063646170442285,
"grad_norm": 2.9714982509613037,
"learning_rate": 4.514219341318534e-07,
"loss": 0.3582867383956909,
"step": 2138
},
{
"epoch": 2.308522114347357,
"grad_norm": 1.2270290851593018,
"learning_rate": 4.499392730907701e-07,
"loss": 0.19157586991786957,
"step": 2140
},
{
"epoch": 2.3106796116504853,
"grad_norm": 2.424367904663086,
"learning_rate": 4.484602914467599e-07,
"loss": 0.17550167441368103,
"step": 2142
},
{
"epoch": 2.3128371089536137,
"grad_norm": 6.250462532043457,
"learning_rate": 4.4698499757098085e-07,
"loss": 0.2245817482471466,
"step": 2144
},
{
"epoch": 2.314994606256742,
"grad_norm": 1.6150588989257812,
"learning_rate": 4.4551339981371805e-07,
"loss": 0.22309915721416473,
"step": 2146
},
{
"epoch": 2.3171521035598706,
"grad_norm": 2.557511568069458,
"learning_rate": 4.4404550650433423e-07,
"loss": 0.31364479660987854,
"step": 2148
},
{
"epoch": 2.319309600862999,
"grad_norm": 1.9506558179855347,
"learning_rate": 4.4258132595122697e-07,
"loss": 0.3908032774925232,
"step": 2150
},
{
"epoch": 2.3214670981661274,
"grad_norm": 1.9863252639770508,
"learning_rate": 4.411208664417779e-07,
"loss": 0.33292022347450256,
"step": 2152
},
{
"epoch": 2.323624595469256,
"grad_norm": 2.143460512161255,
"learning_rate": 4.3966413624230847e-07,
"loss": 0.1916477084159851,
"step": 2154
},
{
"epoch": 2.325782092772384,
"grad_norm": 3.893472194671631,
"learning_rate": 4.3821114359803016e-07,
"loss": 0.33617085218429565,
"step": 2156
},
{
"epoch": 2.3279395900755127,
"grad_norm": 2.143810272216797,
"learning_rate": 4.367618967330011e-07,
"loss": 0.3440120220184326,
"step": 2158
},
{
"epoch": 2.3300970873786406,
"grad_norm": 1.6093735694885254,
"learning_rate": 4.35316403850078e-07,
"loss": 0.18562518060207367,
"step": 2160
},
{
"epoch": 2.332254584681769,
"grad_norm": 1.975888729095459,
"learning_rate": 4.3387467313086825e-07,
"loss": 0.15831519663333893,
"step": 2162
},
{
"epoch": 2.3344120819848975,
"grad_norm": 1.708641529083252,
"learning_rate": 4.324367127356868e-07,
"loss": 0.2527565360069275,
"step": 2164
},
{
"epoch": 2.336569579288026,
"grad_norm": 1.4729366302490234,
"learning_rate": 4.310025308035073e-07,
"loss": 0.2772301435470581,
"step": 2166
},
{
"epoch": 2.3387270765911543,
"grad_norm": 1.6301394701004028,
"learning_rate": 4.295721354519172e-07,
"loss": 0.3133164644241333,
"step": 2168
},
{
"epoch": 2.3408845738942827,
"grad_norm": 3.0249712467193604,
"learning_rate": 4.281455347770713e-07,
"loss": 0.24287529289722443,
"step": 2170
},
{
"epoch": 2.343042071197411,
"grad_norm": 1.6311941146850586,
"learning_rate": 4.2672273685364703e-07,
"loss": 0.4204927086830139,
"step": 2172
},
{
"epoch": 2.3451995685005396,
"grad_norm": 5.362087249755859,
"learning_rate": 4.253037497347971e-07,
"loss": 0.37392908334732056,
"step": 2174
},
{
"epoch": 2.347357065803668,
"grad_norm": 1.4209610223770142,
"learning_rate": 4.2388858145210506e-07,
"loss": 0.21947862207889557,
"step": 2176
},
{
"epoch": 2.349514563106796,
"grad_norm": 1.5687469244003296,
"learning_rate": 4.224772400155399e-07,
"loss": 0.36956965923309326,
"step": 2178
},
{
"epoch": 2.3516720604099244,
"grad_norm": 1.6838412284851074,
"learning_rate": 4.2106973341340976e-07,
"loss": 0.23953932523727417,
"step": 2180
},
{
"epoch": 2.353829557713053,
"grad_norm": 4.918888568878174,
"learning_rate": 4.1966606961231766e-07,
"loss": 0.3331076502799988,
"step": 2182
},
{
"epoch": 2.355987055016181,
"grad_norm": 1.4841556549072266,
"learning_rate": 4.182662565571154e-07,
"loss": 0.2150951325893402,
"step": 2184
},
{
"epoch": 2.3581445523193096,
"grad_norm": 2.378197193145752,
"learning_rate": 4.168703021708605e-07,
"loss": 0.442268043756485,
"step": 2186
},
{
"epoch": 2.360302049622438,
"grad_norm": 1.1213322877883911,
"learning_rate": 4.154782143547691e-07,
"loss": 0.21381919085979462,
"step": 2188
},
{
"epoch": 2.3624595469255665,
"grad_norm": 1.2793383598327637,
"learning_rate": 4.140900009881722e-07,
"loss": 0.297492116689682,
"step": 2190
},
{
"epoch": 2.364617044228695,
"grad_norm": 1.0152313709259033,
"learning_rate": 4.127056699284719e-07,
"loss": 0.2662775218486786,
"step": 2192
},
{
"epoch": 2.3667745415318233,
"grad_norm": 3.4524388313293457,
"learning_rate": 4.1132522901109547e-07,
"loss": 0.1951354295015335,
"step": 2194
},
{
"epoch": 2.3689320388349513,
"grad_norm": 5.245743751525879,
"learning_rate": 4.099486860494517e-07,
"loss": 0.3872916102409363,
"step": 2196
},
{
"epoch": 2.3710895361380797,
"grad_norm": 1.206447720527649,
"learning_rate": 4.085760488348866e-07,
"loss": 0.24260494112968445,
"step": 2198
},
{
"epoch": 2.373247033441208,
"grad_norm": 1.1352065801620483,
"learning_rate": 4.0720732513663985e-07,
"loss": 0.24157175421714783,
"step": 2200
},
{
"epoch": 2.3754045307443366,
"grad_norm": 1.7284468412399292,
"learning_rate": 4.0584252270179975e-07,
"loss": 0.24914561212062836,
"step": 2202
},
{
"epoch": 2.377562028047465,
"grad_norm": 3.4234535694122314,
"learning_rate": 4.0448164925525987e-07,
"loss": 0.4321536421775818,
"step": 2204
},
{
"epoch": 2.3797195253505934,
"grad_norm": 0.3995194137096405,
"learning_rate": 4.031247124996764e-07,
"loss": 0.3386417329311371,
"step": 2206
},
{
"epoch": 2.381877022653722,
"grad_norm": 1.4748889207839966,
"learning_rate": 4.017717201154217e-07,
"loss": 0.2493990957736969,
"step": 2208
},
{
"epoch": 2.3840345199568502,
"grad_norm": 2.027784824371338,
"learning_rate": 4.004226797605445e-07,
"loss": 0.36100074648857117,
"step": 2210
},
{
"epoch": 2.3861920172599786,
"grad_norm": 1.2897789478302002,
"learning_rate": 3.990775990707237e-07,
"loss": 0.23212602734565735,
"step": 2212
},
{
"epoch": 2.3883495145631066,
"grad_norm": 1.4718191623687744,
"learning_rate": 3.9773648565922634e-07,
"loss": 0.17623895406723022,
"step": 2214
},
{
"epoch": 2.390507011866235,
"grad_norm": 1.6537420749664307,
"learning_rate": 3.963993471168643e-07,
"loss": 0.3103001117706299,
"step": 2216
},
{
"epoch": 2.3926645091693635,
"grad_norm": 3.3019044399261475,
"learning_rate": 3.9506619101195196e-07,
"loss": 0.3791100084781647,
"step": 2218
},
{
"epoch": 2.394822006472492,
"grad_norm": 0.7170013785362244,
"learning_rate": 3.9373702489026184e-07,
"loss": 0.23801524937152863,
"step": 2220
},
{
"epoch": 2.3969795037756203,
"grad_norm": 1.0052597522735596,
"learning_rate": 3.9241185627498333e-07,
"loss": 0.2777608036994934,
"step": 2222
},
{
"epoch": 2.3991370010787487,
"grad_norm": 5.178793907165527,
"learning_rate": 3.9109069266668e-07,
"loss": 0.35337719321250916,
"step": 2224
},
{
"epoch": 2.401294498381877,
"grad_norm": 1.5237584114074707,
"learning_rate": 3.8977354154324586e-07,
"loss": 0.3222746253013611,
"step": 2226
},
{
"epoch": 2.4034519956850056,
"grad_norm": 2.202849864959717,
"learning_rate": 3.884604103598647e-07,
"loss": 0.2847940921783447,
"step": 2228
},
{
"epoch": 2.405609492988134,
"grad_norm": 2.125887870788574,
"learning_rate": 3.8715130654896623e-07,
"loss": 0.3124774992465973,
"step": 2230
},
{
"epoch": 2.407766990291262,
"grad_norm": 2.585139513015747,
"learning_rate": 3.858462375201862e-07,
"loss": 0.3422589898109436,
"step": 2232
},
{
"epoch": 2.4099244875943904,
"grad_norm": 2.663825035095215,
"learning_rate": 3.8454521066032214e-07,
"loss": 0.2664566934108734,
"step": 2234
},
{
"epoch": 2.412081984897519,
"grad_norm": 8.156516075134277,
"learning_rate": 3.8324823333329263e-07,
"loss": 0.2662767171859741,
"step": 2236
},
{
"epoch": 2.414239482200647,
"grad_norm": 1.3628453016281128,
"learning_rate": 3.819553128800962e-07,
"loss": 0.3383438289165497,
"step": 2238
},
{
"epoch": 2.4163969795037756,
"grad_norm": 1.212145447731018,
"learning_rate": 3.806664566187686e-07,
"loss": 0.3051017224788666,
"step": 2240
},
{
"epoch": 2.418554476806904,
"grad_norm": 4.809718608856201,
"learning_rate": 3.7938167184434206e-07,
"loss": 0.2975846230983734,
"step": 2242
},
{
"epoch": 2.4207119741100325,
"grad_norm": 1.992264986038208,
"learning_rate": 3.781009658288036e-07,
"loss": 0.2492162585258484,
"step": 2244
},
{
"epoch": 2.422869471413161,
"grad_norm": 2.0577735900878906,
"learning_rate": 3.768243458210549e-07,
"loss": 0.35954225063323975,
"step": 2246
},
{
"epoch": 2.4250269687162893,
"grad_norm": 1.2022833824157715,
"learning_rate": 3.755518190468697e-07,
"loss": 0.1975034475326538,
"step": 2248
},
{
"epoch": 2.4271844660194173,
"grad_norm": 2.0728843212127686,
"learning_rate": 3.7428339270885367e-07,
"loss": 0.3646237254142761,
"step": 2250
},
{
"epoch": 2.4293419633225457,
"grad_norm": 1.2893973588943481,
"learning_rate": 3.73019073986405e-07,
"loss": 0.22355937957763672,
"step": 2252
},
{
"epoch": 2.431499460625674,
"grad_norm": 2.275219440460205,
"learning_rate": 3.717588700356702e-07,
"loss": 0.1818881779909134,
"step": 2254
},
{
"epoch": 2.4336569579288025,
"grad_norm": 1.8378785848617554,
"learning_rate": 3.7050278798950795e-07,
"loss": 0.22650231420993805,
"step": 2256
},
{
"epoch": 2.435814455231931,
"grad_norm": 2.7440717220306396,
"learning_rate": 3.6925083495744534e-07,
"loss": 0.44353553652763367,
"step": 2258
},
{
"epoch": 2.4379719525350594,
"grad_norm": 1.588118076324463,
"learning_rate": 3.6800301802563927e-07,
"loss": 0.2522018551826477,
"step": 2260
},
{
"epoch": 2.440129449838188,
"grad_norm": 1.7098896503448486,
"learning_rate": 3.667593442568364e-07,
"loss": 0.32094866037368774,
"step": 2262
},
{
"epoch": 2.4422869471413162,
"grad_norm": 1.7896616458892822,
"learning_rate": 3.6551982069033205e-07,
"loss": 0.2640255093574524,
"step": 2264
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.299568235874176,
"learning_rate": 3.6428445434193136e-07,
"loss": 0.14886681735515594,
"step": 2266
},
{
"epoch": 2.4466019417475726,
"grad_norm": 1.7262334823608398,
"learning_rate": 3.6305325220390905e-07,
"loss": 0.26833122968673706,
"step": 2268
},
{
"epoch": 2.448759439050701,
"grad_norm": 1.531208872795105,
"learning_rate": 3.618262212449706e-07,
"loss": 0.4644531011581421,
"step": 2270
},
{
"epoch": 2.4509169363538295,
"grad_norm": 2.4541876316070557,
"learning_rate": 3.606033684102121e-07,
"loss": 0.21291311085224152,
"step": 2272
},
{
"epoch": 2.453074433656958,
"grad_norm": 1.4597750902175903,
"learning_rate": 3.5938470062108043e-07,
"loss": 0.22411037981510162,
"step": 2274
},
{
"epoch": 2.4552319309600863,
"grad_norm": 3.633920431137085,
"learning_rate": 3.5817022477533585e-07,
"loss": 0.23133979737758636,
"step": 2276
},
{
"epoch": 2.4573894282632147,
"grad_norm": 1.5003464221954346,
"learning_rate": 3.569599477470112e-07,
"loss": 0.3434515595436096,
"step": 2278
},
{
"epoch": 2.459546925566343,
"grad_norm": 2.270390033721924,
"learning_rate": 3.5575387638637357e-07,
"loss": 0.3822650611400604,
"step": 2280
},
{
"epoch": 2.4617044228694716,
"grad_norm": 1.8612627983093262,
"learning_rate": 3.545520175198858e-07,
"loss": 0.2585938274860382,
"step": 2282
},
{
"epoch": 2.4638619201726,
"grad_norm": 2.235671281814575,
"learning_rate": 3.5335437795016823e-07,
"loss": 0.28054654598236084,
"step": 2284
},
{
"epoch": 2.466019417475728,
"grad_norm": 4.319032669067383,
"learning_rate": 3.5216096445595884e-07,
"loss": 0.4531608819961548,
"step": 2286
},
{
"epoch": 2.4681769147788564,
"grad_norm": 1.4703388214111328,
"learning_rate": 3.509717837920756e-07,
"loss": 0.3540151119232178,
"step": 2288
},
{
"epoch": 2.470334412081985,
"grad_norm": 2.039675712585449,
"learning_rate": 3.497868426893793e-07,
"loss": 0.13517698645591736,
"step": 2290
},
{
"epoch": 2.472491909385113,
"grad_norm": 0.5410081148147583,
"learning_rate": 3.486061478547337e-07,
"loss": 0.02865418791770935,
"step": 2292
},
{
"epoch": 2.4746494066882416,
"grad_norm": 1.2113001346588135,
"learning_rate": 3.4742970597096834e-07,
"loss": 0.33490967750549316,
"step": 2294
},
{
"epoch": 2.47680690399137,
"grad_norm": 1.3965346813201904,
"learning_rate": 3.462575236968406e-07,
"loss": 0.18190385401248932,
"step": 2296
},
{
"epoch": 2.4789644012944985,
"grad_norm": 3.908724069595337,
"learning_rate": 3.4508960766699914e-07,
"loss": 0.31986406445503235,
"step": 2298
},
{
"epoch": 2.481121898597627,
"grad_norm": 1.823320746421814,
"learning_rate": 3.4392596449194346e-07,
"loss": 0.24571648240089417,
"step": 2300
},
{
"epoch": 2.4832793959007553,
"grad_norm": 9.324934959411621,
"learning_rate": 3.427666007579902e-07,
"loss": 0.2971467077732086,
"step": 2302
},
{
"epoch": 2.4854368932038833,
"grad_norm": 1.0990798473358154,
"learning_rate": 3.416115230272333e-07,
"loss": 0.3316362500190735,
"step": 2304
},
{
"epoch": 2.4875943905070117,
"grad_norm": 0.9614824056625366,
"learning_rate": 3.4046073783750726e-07,
"loss": 0.21634887158870697,
"step": 2306
},
{
"epoch": 2.48975188781014,
"grad_norm": 3.744295120239258,
"learning_rate": 3.3931425170235083e-07,
"loss": 0.3917967677116394,
"step": 2308
},
{
"epoch": 2.4919093851132685,
"grad_norm": 2.0177054405212402,
"learning_rate": 3.381720711109695e-07,
"loss": 0.3639075458049774,
"step": 2310
},
{
"epoch": 2.494066882416397,
"grad_norm": 3.873263359069824,
"learning_rate": 3.3703420252819947e-07,
"loss": 0.28831595182418823,
"step": 2312
},
{
"epoch": 2.4962243797195254,
"grad_norm": 9.90953540802002,
"learning_rate": 3.359006523944697e-07,
"loss": 0.2942795753479004,
"step": 2314
},
{
"epoch": 2.498381877022654,
"grad_norm": 1.6383076906204224,
"learning_rate": 3.347714271257679e-07,
"loss": 0.19500726461410522,
"step": 2316
},
{
"epoch": 2.500539374325782,
"grad_norm": 1.960985779762268,
"learning_rate": 3.3364653311360104e-07,
"loss": 0.30468082427978516,
"step": 2318
},
{
"epoch": 2.5026968716289106,
"grad_norm": 1.2923824787139893,
"learning_rate": 3.325259767249617e-07,
"loss": 0.20314782857894897,
"step": 2320
},
{
"epoch": 2.5048543689320386,
"grad_norm": 1.4000043869018555,
"learning_rate": 3.3140976430229136e-07,
"loss": 0.34522709250450134,
"step": 2322
},
{
"epoch": 2.5070118662351675,
"grad_norm": 1.8520127534866333,
"learning_rate": 3.302979021634438e-07,
"loss": 0.3963944911956787,
"step": 2324
},
{
"epoch": 2.5091693635382954,
"grad_norm": 1.3927682638168335,
"learning_rate": 3.2919039660164973e-07,
"loss": 0.126472607254982,
"step": 2326
},
{
"epoch": 2.511326860841424,
"grad_norm": 1.8297348022460938,
"learning_rate": 3.2808725388548164e-07,
"loss": 0.3507118821144104,
"step": 2328
},
{
"epoch": 2.5134843581445523,
"grad_norm": 0.6373293399810791,
"learning_rate": 3.269884802588181e-07,
"loss": 0.15893447399139404,
"step": 2330
},
{
"epoch": 2.5156418554476807,
"grad_norm": 0.42088356614112854,
"learning_rate": 3.258940819408079e-07,
"loss": 0.14911764860153198,
"step": 2332
},
{
"epoch": 2.517799352750809,
"grad_norm": 2.131605863571167,
"learning_rate": 3.248040651258352e-07,
"loss": 0.2661122679710388,
"step": 2334
},
{
"epoch": 2.5199568500539375,
"grad_norm": 7.058028697967529,
"learning_rate": 3.2371843598348485e-07,
"loss": 0.35104191303253174,
"step": 2336
},
{
"epoch": 2.522114347357066,
"grad_norm": 12.030430793762207,
"learning_rate": 3.2263720065850686e-07,
"loss": 0.28974059224128723,
"step": 2338
},
{
"epoch": 2.524271844660194,
"grad_norm": 2.7837140560150146,
"learning_rate": 3.215603652707819e-07,
"loss": 0.4368301033973694,
"step": 2340
},
{
"epoch": 2.526429341963323,
"grad_norm": 7.616663455963135,
"learning_rate": 3.2048793591528655e-07,
"loss": 0.47825562953948975,
"step": 2342
},
{
"epoch": 2.528586839266451,
"grad_norm": 3.9860897064208984,
"learning_rate": 3.194199186620592e-07,
"loss": 0.31387850642204285,
"step": 2344
},
{
"epoch": 2.530744336569579,
"grad_norm": 1.596435546875,
"learning_rate": 3.1835631955616505e-07,
"loss": 0.3126782178878784,
"step": 2346
},
{
"epoch": 2.5329018338727076,
"grad_norm": 1.315140724182129,
"learning_rate": 3.172971446176621e-07,
"loss": 0.26786503195762634,
"step": 2348
},
{
"epoch": 2.535059331175836,
"grad_norm": 0.4910350441932678,
"learning_rate": 3.16242399841568e-07,
"loss": 0.1304475963115692,
"step": 2350
},
{
"epoch": 2.5372168284789645,
"grad_norm": 1.8181979656219482,
"learning_rate": 3.1519209119782435e-07,
"loss": 0.26876750588417053,
"step": 2352
},
{
"epoch": 2.539374325782093,
"grad_norm": 2.385711193084717,
"learning_rate": 3.141462246312644e-07,
"loss": 0.2904283106327057,
"step": 2354
},
{
"epoch": 2.5415318230852213,
"grad_norm": 1.548781156539917,
"learning_rate": 3.1310480606157864e-07,
"loss": 0.2804209589958191,
"step": 2356
},
{
"epoch": 2.5436893203883493,
"grad_norm": 2.7937028408050537,
"learning_rate": 3.120678413832821e-07,
"loss": 0.36265939474105835,
"step": 2358
},
{
"epoch": 2.545846817691478,
"grad_norm": 1.6474692821502686,
"learning_rate": 3.110353364656792e-07,
"loss": 0.2979966700077057,
"step": 2360
},
{
"epoch": 2.548004314994606,
"grad_norm": 1.5488511323928833,
"learning_rate": 3.1000729715283306e-07,
"loss": 0.37080666422843933,
"step": 2362
},
{
"epoch": 2.5501618122977345,
"grad_norm": 2.0763769149780273,
"learning_rate": 3.089837292635309e-07,
"loss": 0.293621301651001,
"step": 2364
},
{
"epoch": 2.552319309600863,
"grad_norm": 1.1936835050582886,
"learning_rate": 3.079646385912502e-07,
"loss": 0.1373947560787201,
"step": 2366
},
{
"epoch": 2.5544768069039914,
"grad_norm": 1.4387550354003906,
"learning_rate": 3.069500309041283e-07,
"loss": 0.34591472148895264,
"step": 2368
},
{
"epoch": 2.55663430420712,
"grad_norm": 36.577571868896484,
"learning_rate": 3.05939911944928e-07,
"loss": 0.6267740726470947,
"step": 2370
},
{
"epoch": 2.558791801510248,
"grad_norm": 1.3116552829742432,
"learning_rate": 3.049342874310053e-07,
"loss": 0.25853827595710754,
"step": 2372
},
{
"epoch": 2.5609492988133766,
"grad_norm": 1.2759634256362915,
"learning_rate": 3.0393316305427743e-07,
"loss": 0.29107633233070374,
"step": 2374
},
{
"epoch": 2.5631067961165046,
"grad_norm": 2.6377315521240234,
"learning_rate": 3.0293654448119094e-07,
"loss": 0.3561844527721405,
"step": 2376
},
{
"epoch": 2.5652642934196335,
"grad_norm": 1.450838565826416,
"learning_rate": 3.0194443735268855e-07,
"loss": 0.24433653056621552,
"step": 2378
},
{
"epoch": 2.5674217907227614,
"grad_norm": 5.896989822387695,
"learning_rate": 3.009568472841778e-07,
"loss": 0.3096998929977417,
"step": 2380
},
{
"epoch": 2.56957928802589,
"grad_norm": 1.285710096359253,
"learning_rate": 2.999737798654999e-07,
"loss": 0.23295409977436066,
"step": 2382
},
{
"epoch": 2.5717367853290183,
"grad_norm": 1.9540512561798096,
"learning_rate": 2.9899524066089715e-07,
"loss": 0.3955782949924469,
"step": 2384
},
{
"epoch": 2.5738942826321467,
"grad_norm": 1.3187015056610107,
"learning_rate": 2.980212352089816e-07,
"loss": 0.34291237592697144,
"step": 2386
},
{
"epoch": 2.576051779935275,
"grad_norm": 35.879234313964844,
"learning_rate": 2.9705176902270386e-07,
"loss": 0.2686152160167694,
"step": 2388
},
{
"epoch": 2.5782092772384035,
"grad_norm": 1.8663164377212524,
"learning_rate": 2.960868475893224e-07,
"loss": 0.20186705887317657,
"step": 2390
},
{
"epoch": 2.580366774541532,
"grad_norm": 1.9428675174713135,
"learning_rate": 2.951264763703719e-07,
"loss": 0.2464224100112915,
"step": 2392
},
{
"epoch": 2.58252427184466,
"grad_norm": 1.2760838270187378,
"learning_rate": 2.941706608016317e-07,
"loss": 0.2949107885360718,
"step": 2394
},
{
"epoch": 2.584681769147789,
"grad_norm": 1.605370044708252,
"learning_rate": 2.9321940629309705e-07,
"loss": 0.2963062822818756,
"step": 2396
},
{
"epoch": 2.5868392664509168,
"grad_norm": 1.2159507274627686,
"learning_rate": 2.9227271822894615e-07,
"loss": 0.25476306676864624,
"step": 2398
},
{
"epoch": 2.588996763754045,
"grad_norm": 1.5449872016906738,
"learning_rate": 2.913306019675114e-07,
"loss": 0.1481795310974121,
"step": 2400
},
{
"epoch": 2.5911542610571736,
"grad_norm": 1.4837470054626465,
"learning_rate": 2.9039306284124764e-07,
"loss": 0.1671726405620575,
"step": 2402
},
{
"epoch": 2.593311758360302,
"grad_norm": 2.013652801513672,
"learning_rate": 2.8946010615670397e-07,
"loss": 0.3186720609664917,
"step": 2404
},
{
"epoch": 2.5954692556634305,
"grad_norm": 1.3679430484771729,
"learning_rate": 2.8853173719449153e-07,
"loss": 0.19645805656909943,
"step": 2406
},
{
"epoch": 2.597626752966559,
"grad_norm": 4.073277473449707,
"learning_rate": 2.8760796120925455e-07,
"loss": 0.27875351905822754,
"step": 2408
},
{
"epoch": 2.5997842502696873,
"grad_norm": 2.3851447105407715,
"learning_rate": 2.8668878342964165e-07,
"loss": 0.3042440414428711,
"step": 2410
},
{
"epoch": 2.6019417475728153,
"grad_norm": 3.0329368114471436,
"learning_rate": 2.8577420905827356e-07,
"loss": 0.40673866868019104,
"step": 2412
},
{
"epoch": 2.604099244875944,
"grad_norm": 1.5875539779663086,
"learning_rate": 2.848642432717171e-07,
"loss": 0.3267652988433838,
"step": 2414
},
{
"epoch": 2.606256742179072,
"grad_norm": 1.2932881116867065,
"learning_rate": 2.8395889122045293e-07,
"loss": 0.3058151602745056,
"step": 2416
},
{
"epoch": 2.6084142394822005,
"grad_norm": 1.7162359952926636,
"learning_rate": 2.8305815802884807e-07,
"loss": 0.3365314304828644,
"step": 2418
},
{
"epoch": 2.610571736785329,
"grad_norm": 2.6204607486724854,
"learning_rate": 2.8216204879512613e-07,
"loss": 0.26252228021621704,
"step": 2420
},
{
"epoch": 2.6127292340884574,
"grad_norm": 1.4677083492279053,
"learning_rate": 2.8127056859133914e-07,
"loss": 0.22944192588329315,
"step": 2422
},
{
"epoch": 2.614886731391586,
"grad_norm": 1.4243353605270386,
"learning_rate": 2.803837224633385e-07,
"loss": 0.30490928888320923,
"step": 2424
},
{
"epoch": 2.617044228694714,
"grad_norm": 1.6455210447311401,
"learning_rate": 2.795015154307454e-07,
"loss": 0.3725619614124298,
"step": 2426
},
{
"epoch": 2.6192017259978426,
"grad_norm": 1.263656497001648,
"learning_rate": 2.786239524869247e-07,
"loss": 0.38048383593559265,
"step": 2428
},
{
"epoch": 2.6213592233009706,
"grad_norm": 4.27044677734375,
"learning_rate": 2.7775103859895443e-07,
"loss": 0.306596577167511,
"step": 2430
},
{
"epoch": 2.6235167206040995,
"grad_norm": 1.5108482837677002,
"learning_rate": 2.7688277870759877e-07,
"loss": 0.25143009424209595,
"step": 2432
},
{
"epoch": 2.6256742179072274,
"grad_norm": 2.3310492038726807,
"learning_rate": 2.7601917772728e-07,
"loss": 0.38734516501426697,
"step": 2434
},
{
"epoch": 2.627831715210356,
"grad_norm": 3.2000060081481934,
"learning_rate": 2.7516024054605076e-07,
"loss": 0.3311081528663635,
"step": 2436
},
{
"epoch": 2.6299892125134843,
"grad_norm": 1.907240390777588,
"learning_rate": 2.743059720255658e-07,
"loss": 0.1861996203660965,
"step": 2438
},
{
"epoch": 2.6321467098166127,
"grad_norm": 1.7378534078598022,
"learning_rate": 2.73456377001055e-07,
"loss": 0.21612344682216644,
"step": 2440
},
{
"epoch": 2.634304207119741,
"grad_norm": 1.9995208978652954,
"learning_rate": 2.726114602812962e-07,
"loss": 0.20262135565280914,
"step": 2442
},
{
"epoch": 2.6364617044228695,
"grad_norm": 2.0192410945892334,
"learning_rate": 2.7177122664858727e-07,
"loss": 0.3542102575302124,
"step": 2444
},
{
"epoch": 2.638619201725998,
"grad_norm": 1.935210943222046,
"learning_rate": 2.709356808587195e-07,
"loss": 0.39216798543930054,
"step": 2446
},
{
"epoch": 2.6407766990291264,
"grad_norm": 1.6480642557144165,
"learning_rate": 2.7010482764095047e-07,
"loss": 0.12209905683994293,
"step": 2448
},
{
"epoch": 2.642934196332255,
"grad_norm": 7.414170265197754,
"learning_rate": 2.6927867169797805e-07,
"loss": 0.5208877325057983,
"step": 2450
},
{
"epoch": 2.6450916936353828,
"grad_norm": 1.8177531957626343,
"learning_rate": 2.6845721770591236e-07,
"loss": 0.4026768207550049,
"step": 2452
},
{
"epoch": 2.647249190938511,
"grad_norm": 7.202023506164551,
"learning_rate": 2.676404703142503e-07,
"loss": 0.25339025259017944,
"step": 2454
},
{
"epoch": 2.6494066882416396,
"grad_norm": 1.1629081964492798,
"learning_rate": 2.6682843414584954e-07,
"loss": 0.27695736289024353,
"step": 2456
},
{
"epoch": 2.651564185544768,
"grad_norm": 2.3341569900512695,
"learning_rate": 2.660211137969013e-07,
"loss": 0.1916518658399582,
"step": 2458
},
{
"epoch": 2.6537216828478964,
"grad_norm": 3.8498494625091553,
"learning_rate": 2.6521851383690486e-07,
"loss": 0.28546687960624695,
"step": 2460
},
{
"epoch": 2.655879180151025,
"grad_norm": 1.4487419128417969,
"learning_rate": 2.6442063880864183e-07,
"loss": 0.2577356994152069,
"step": 2462
},
{
"epoch": 2.6580366774541533,
"grad_norm": 1.4185736179351807,
"learning_rate": 2.636274932281508e-07,
"loss": 0.335868775844574,
"step": 2464
},
{
"epoch": 2.6601941747572817,
"grad_norm": 0.31347620487213135,
"learning_rate": 2.628390815847005e-07,
"loss": 0.08725874125957489,
"step": 2466
},
{
"epoch": 2.66235167206041,
"grad_norm": 1.61336088180542,
"learning_rate": 2.6205540834076545e-07,
"loss": 0.38296324014663696,
"step": 2468
},
{
"epoch": 2.664509169363538,
"grad_norm": 1.5005100965499878,
"learning_rate": 2.6127647793200105e-07,
"loss": 0.29919686913490295,
"step": 2470
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.2096229791641235,
"learning_rate": 2.6050229476721666e-07,
"loss": 0.20811551809310913,
"step": 2472
},
{
"epoch": 2.668824163969795,
"grad_norm": 1.0160880088806152,
"learning_rate": 2.59732863228353e-07,
"loss": 0.2570361793041229,
"step": 2474
},
{
"epoch": 2.6709816612729234,
"grad_norm": 1.900524377822876,
"learning_rate": 2.589681876704557e-07,
"loss": 0.2715557813644409,
"step": 2476
},
{
"epoch": 2.6731391585760518,
"grad_norm": 4.474449157714844,
"learning_rate": 2.58208272421651e-07,
"loss": 0.3477630913257599,
"step": 2478
},
{
"epoch": 2.67529665587918,
"grad_norm": 1.686800241470337,
"learning_rate": 2.574531217831218e-07,
"loss": 0.3386651277542114,
"step": 2480
},
{
"epoch": 2.6774541531823086,
"grad_norm": 1.3463099002838135,
"learning_rate": 2.567027400290826e-07,
"loss": 0.24490870535373688,
"step": 2482
},
{
"epoch": 2.679611650485437,
"grad_norm": 1.4641351699829102,
"learning_rate": 2.5595713140675575e-07,
"loss": 0.2602543234825134,
"step": 2484
},
{
"epoch": 2.6817691477885655,
"grad_norm": 1.584945797920227,
"learning_rate": 2.55216300136347e-07,
"loss": 0.2942560911178589,
"step": 2486
},
{
"epoch": 2.6839266450916934,
"grad_norm": 1.3279131650924683,
"learning_rate": 2.544802504110226e-07,
"loss": 0.2843012809753418,
"step": 2488
},
{
"epoch": 2.686084142394822,
"grad_norm": 1.5675629377365112,
"learning_rate": 2.537489863968842e-07,
"loss": 0.3618108034133911,
"step": 2490
},
{
"epoch": 2.6882416396979503,
"grad_norm": 1.386384129524231,
"learning_rate": 2.530225122329459e-07,
"loss": 0.18842831254005432,
"step": 2492
},
{
"epoch": 2.6903991370010787,
"grad_norm": 0.44938626885414124,
"learning_rate": 2.5230083203111163e-07,
"loss": 0.010065621696412563,
"step": 2494
},
{
"epoch": 2.692556634304207,
"grad_norm": 1.9967701435089111,
"learning_rate": 2.5158394987615014e-07,
"loss": 0.3003666400909424,
"step": 2496
},
{
"epoch": 2.6947141316073355,
"grad_norm": 0.5517582893371582,
"learning_rate": 2.5087186982567345e-07,
"loss": 0.1282682716846466,
"step": 2498
},
{
"epoch": 2.696871628910464,
"grad_norm": 1.3890687227249146,
"learning_rate": 2.5016459591011287e-07,
"loss": 0.3012073338031769,
"step": 2500
},
{
"epoch": 2.6990291262135924,
"grad_norm": 2.243579387664795,
"learning_rate": 2.494621321326972e-07,
"loss": 0.3187774121761322,
"step": 2502
},
{
"epoch": 2.701186623516721,
"grad_norm": 1.2543731927871704,
"learning_rate": 2.487644824694288e-07,
"loss": 0.2931416630744934,
"step": 2504
},
{
"epoch": 2.7033441208198488,
"grad_norm": 2.056020975112915,
"learning_rate": 2.48071650869062e-07,
"loss": 0.24611467123031616,
"step": 2506
},
{
"epoch": 2.705501618122977,
"grad_norm": 2.098752498626709,
"learning_rate": 2.473836412530809e-07,
"loss": 0.3165457248687744,
"step": 2508
},
{
"epoch": 2.7076591154261056,
"grad_norm": 4.687522888183594,
"learning_rate": 2.46700457515676e-07,
"loss": 0.4136981666088104,
"step": 2510
},
{
"epoch": 2.709816612729234,
"grad_norm": 0.5573480725288391,
"learning_rate": 2.460221035237235e-07,
"loss": 0.15423323214054108,
"step": 2512
},
{
"epoch": 2.7119741100323624,
"grad_norm": 1.1807475090026855,
"learning_rate": 2.453485831167625e-07,
"loss": 0.2989809811115265,
"step": 2514
},
{
"epoch": 2.714131607335491,
"grad_norm": 1.6326533555984497,
"learning_rate": 2.446799001069742e-07,
"loss": 0.3671968877315521,
"step": 2516
},
{
"epoch": 2.7162891046386193,
"grad_norm": 4.470088958740234,
"learning_rate": 2.440160582791589e-07,
"loss": 0.3751377463340759,
"step": 2518
},
{
"epoch": 2.7184466019417477,
"grad_norm": 0.9408198595046997,
"learning_rate": 2.43357061390716e-07,
"loss": 0.3237053155899048,
"step": 2520
},
{
"epoch": 2.720604099244876,
"grad_norm": 1.8276516199111938,
"learning_rate": 2.42702913171622e-07,
"loss": 0.3124433755874634,
"step": 2522
},
{
"epoch": 2.722761596548004,
"grad_norm": 1.5803215503692627,
"learning_rate": 2.420536173244094e-07,
"loss": 0.2791770100593567,
"step": 2524
},
{
"epoch": 2.724919093851133,
"grad_norm": 3.284719705581665,
"learning_rate": 2.414091775241462e-07,
"loss": 0.3442307114601135,
"step": 2526
},
{
"epoch": 2.727076591154261,
"grad_norm": 1.687919020652771,
"learning_rate": 2.4076959741841445e-07,
"loss": 0.3351602852344513,
"step": 2528
},
{
"epoch": 2.7292340884573894,
"grad_norm": 7.2448225021362305,
"learning_rate": 2.4013488062728993e-07,
"loss": 0.3161589205265045,
"step": 2530
},
{
"epoch": 2.7313915857605178,
"grad_norm": 4.134527206420898,
"learning_rate": 2.395050307433219e-07,
"loss": 0.36085984110832214,
"step": 2532
},
{
"epoch": 2.733549083063646,
"grad_norm": 1.2739371061325073,
"learning_rate": 2.3888005133151255e-07,
"loss": 0.22625665366649628,
"step": 2534
},
{
"epoch": 2.7357065803667746,
"grad_norm": 1.8671566247940063,
"learning_rate": 2.3825994592929645e-07,
"loss": 0.2694007158279419,
"step": 2536
},
{
"epoch": 2.737864077669903,
"grad_norm": 7.57783842086792,
"learning_rate": 2.3764471804652095e-07,
"loss": 0.18972235918045044,
"step": 2538
},
{
"epoch": 2.7400215749730314,
"grad_norm": 2.3178181648254395,
"learning_rate": 2.370343711654267e-07,
"loss": 0.277940571308136,
"step": 2540
},
{
"epoch": 2.7421790722761594,
"grad_norm": 1.989964246749878,
"learning_rate": 2.36428908740627e-07,
"loss": 0.22704048454761505,
"step": 2542
},
{
"epoch": 2.7443365695792883,
"grad_norm": 1.57944655418396,
"learning_rate": 2.358283341990889e-07,
"loss": 0.25424429774284363,
"step": 2544
},
{
"epoch": 2.7464940668824163,
"grad_norm": 2.715576410293579,
"learning_rate": 2.352326509401134e-07,
"loss": 0.2091311663389206,
"step": 2546
},
{
"epoch": 2.7486515641855447,
"grad_norm": 3.7704293727874756,
"learning_rate": 2.3464186233531696e-07,
"loss": 0.316684752702713,
"step": 2548
},
{
"epoch": 2.750809061488673,
"grad_norm": 0.6256927847862244,
"learning_rate": 2.3405597172861135e-07,
"loss": 0.2244507223367691,
"step": 2550
},
{
"epoch": 2.7529665587918015,
"grad_norm": 1.8245450258255005,
"learning_rate": 2.3347498243618558e-07,
"loss": 0.21601910889148712,
"step": 2552
},
{
"epoch": 2.75512405609493,
"grad_norm": 2.7671761512756348,
"learning_rate": 2.3289889774648675e-07,
"loss": 0.25035250186920166,
"step": 2554
},
{
"epoch": 2.7572815533980584,
"grad_norm": 1.276296854019165,
"learning_rate": 2.3232772092020148e-07,
"loss": 0.18391655385494232,
"step": 2556
},
{
"epoch": 2.759439050701187,
"grad_norm": 3.1530673503875732,
"learning_rate": 2.3176145519023742e-07,
"loss": 0.2945748567581177,
"step": 2558
},
{
"epoch": 2.7615965480043148,
"grad_norm": 1.5141795873641968,
"learning_rate": 2.312001037617051e-07,
"loss": 0.3175848424434662,
"step": 2560
},
{
"epoch": 2.7637540453074436,
"grad_norm": 1.762587547302246,
"learning_rate": 2.3064366981189995e-07,
"loss": 0.2632935643196106,
"step": 2562
},
{
"epoch": 2.7659115426105716,
"grad_norm": 2.4396347999572754,
"learning_rate": 2.3009215649028332e-07,
"loss": 0.12333346903324127,
"step": 2564
},
{
"epoch": 2.7680690399137,
"grad_norm": 0.8734754323959351,
"learning_rate": 2.295455669184662e-07,
"loss": 0.1719101369380951,
"step": 2566
},
{
"epoch": 2.7702265372168284,
"grad_norm": 2.0594780445098877,
"learning_rate": 2.2900390419019047e-07,
"loss": 0.24180670082569122,
"step": 2568
},
{
"epoch": 2.772384034519957,
"grad_norm": 3.8657302856445312,
"learning_rate": 2.2846717137131139e-07,
"loss": 0.17427459359169006,
"step": 2570
},
{
"epoch": 2.7745415318230853,
"grad_norm": 1.2864545583724976,
"learning_rate": 2.2793537149978097e-07,
"loss": 0.3185139298439026,
"step": 2572
},
{
"epoch": 2.7766990291262137,
"grad_norm": 1.2937431335449219,
"learning_rate": 2.2740850758563e-07,
"loss": 0.25411853194236755,
"step": 2574
},
{
"epoch": 2.778856526429342,
"grad_norm": 2.3176355361938477,
"learning_rate": 2.2688658261095177e-07,
"loss": 0.20836421847343445,
"step": 2576
},
{
"epoch": 2.78101402373247,
"grad_norm": 2.0247695446014404,
"learning_rate": 2.2636959952988402e-07,
"loss": 0.27753064036369324,
"step": 2578
},
{
"epoch": 2.783171521035599,
"grad_norm": 1.718490481376648,
"learning_rate": 2.2585756126859373e-07,
"loss": 0.26367393136024475,
"step": 2580
},
{
"epoch": 2.785329018338727,
"grad_norm": 2.086444854736328,
"learning_rate": 2.2535047072525968e-07,
"loss": 0.2552420198917389,
"step": 2582
},
{
"epoch": 2.7874865156418553,
"grad_norm": 2.691962957382202,
"learning_rate": 2.2484833077005534e-07,
"loss": 0.38996651768684387,
"step": 2584
},
{
"epoch": 2.7896440129449838,
"grad_norm": 3.1365230083465576,
"learning_rate": 2.2435114424513468e-07,
"loss": 0.2881295680999756,
"step": 2586
},
{
"epoch": 2.791801510248112,
"grad_norm": 0.678428053855896,
"learning_rate": 2.23858913964614e-07,
"loss": 0.3442489802837372,
"step": 2588
},
{
"epoch": 2.7939590075512406,
"grad_norm": 1.9768662452697754,
"learning_rate": 2.233716427145571e-07,
"loss": 0.3336244225502014,
"step": 2590
},
{
"epoch": 2.796116504854369,
"grad_norm": 7.05756139755249,
"learning_rate": 2.2288933325295919e-07,
"loss": 0.3653881251811981,
"step": 2592
},
{
"epoch": 2.7982740021574974,
"grad_norm": 1.4996511936187744,
"learning_rate": 2.224119883097315e-07,
"loss": 0.337455153465271,
"step": 2594
},
{
"epoch": 2.8004314994606254,
"grad_norm": 1.5251402854919434,
"learning_rate": 2.2193961058668565e-07,
"loss": 0.24892055988311768,
"step": 2596
},
{
"epoch": 2.8025889967637543,
"grad_norm": 2.474886655807495,
"learning_rate": 2.2147220275751817e-07,
"loss": 0.38033241033554077,
"step": 2598
},
{
"epoch": 2.8047464940668823,
"grad_norm": 2.1626007556915283,
"learning_rate": 2.2100976746779575e-07,
"loss": 0.2779306471347809,
"step": 2600
},
{
"epoch": 2.8069039913700107,
"grad_norm": 1.5766234397888184,
"learning_rate": 2.2055230733494034e-07,
"loss": 0.198373481631279,
"step": 2602
},
{
"epoch": 2.809061488673139,
"grad_norm": 1.3127539157867432,
"learning_rate": 2.2009982494821354e-07,
"loss": 0.2616628110408783,
"step": 2604
},
{
"epoch": 2.8112189859762675,
"grad_norm": 1.8110085725784302,
"learning_rate": 2.1965232286870293e-07,
"loss": 0.34928035736083984,
"step": 2606
},
{
"epoch": 2.813376483279396,
"grad_norm": 0.9856870770454407,
"learning_rate": 2.1920980362930693e-07,
"loss": 0.24830693006515503,
"step": 2608
},
{
"epoch": 2.8155339805825244,
"grad_norm": 1.0019290447235107,
"learning_rate": 2.1877226973472092e-07,
"loss": 0.37566351890563965,
"step": 2610
},
{
"epoch": 2.8176914778856528,
"grad_norm": 1.683439016342163,
"learning_rate": 2.1833972366142252e-07,
"loss": 0.20518970489501953,
"step": 2612
},
{
"epoch": 2.8198489751887807,
"grad_norm": 1.2993359565734863,
"learning_rate": 2.1791216785765812e-07,
"loss": 0.3496171832084656,
"step": 2614
},
{
"epoch": 2.8220064724919096,
"grad_norm": 1.8535475730895996,
"learning_rate": 2.1748960474342858e-07,
"loss": 0.40503692626953125,
"step": 2616
},
{
"epoch": 2.8241639697950376,
"grad_norm": 1.230737328529358,
"learning_rate": 2.1707203671047588e-07,
"loss": 0.3467937111854553,
"step": 2618
},
{
"epoch": 2.826321467098166,
"grad_norm": 2.092404365539551,
"learning_rate": 2.166594661222692e-07,
"loss": 0.45293277502059937,
"step": 2620
},
{
"epoch": 2.8284789644012944,
"grad_norm": 1.2861336469650269,
"learning_rate": 2.162518953139921e-07,
"loss": 0.2429104447364807,
"step": 2622
},
{
"epoch": 2.830636461704423,
"grad_norm": 1.7713249921798706,
"learning_rate": 2.1584932659252883e-07,
"loss": 0.31055137515068054,
"step": 2624
},
{
"epoch": 2.8327939590075513,
"grad_norm": 2.2935972213745117,
"learning_rate": 2.1545176223645118e-07,
"loss": 0.44860854744911194,
"step": 2626
},
{
"epoch": 2.8349514563106797,
"grad_norm": 3.7632486820220947,
"learning_rate": 2.1505920449600637e-07,
"loss": 0.45059871673583984,
"step": 2628
},
{
"epoch": 2.837108953613808,
"grad_norm": 1.9739702939987183,
"learning_rate": 2.146716555931031e-07,
"loss": 0.32334843277931213,
"step": 2630
},
{
"epoch": 2.839266450916936,
"grad_norm": 1.8699138164520264,
"learning_rate": 2.1428911772130022e-07,
"loss": 0.37822097539901733,
"step": 2632
},
{
"epoch": 2.841423948220065,
"grad_norm": 1.3838216066360474,
"learning_rate": 2.1391159304579338e-07,
"loss": 0.2415277361869812,
"step": 2634
},
{
"epoch": 2.843581445523193,
"grad_norm": 2.178896427154541,
"learning_rate": 2.1353908370340319e-07,
"loss": 0.36013925075531006,
"step": 2636
},
{
"epoch": 2.8457389428263213,
"grad_norm": 1.6107887029647827,
"learning_rate": 2.131715918025631e-07,
"loss": 0.28840532898902893,
"step": 2638
},
{
"epoch": 2.8478964401294498,
"grad_norm": 1.2073341608047485,
"learning_rate": 2.1280911942330754e-07,
"loss": 0.24024561047554016,
"step": 2640
},
{
"epoch": 2.850053937432578,
"grad_norm": 1.346824049949646,
"learning_rate": 2.1245166861725987e-07,
"loss": 0.29280197620391846,
"step": 2642
},
{
"epoch": 2.8522114347357066,
"grad_norm": 2.1668944358825684,
"learning_rate": 2.1209924140762103e-07,
"loss": 0.27317503094673157,
"step": 2644
},
{
"epoch": 2.854368932038835,
"grad_norm": 1.5950590372085571,
"learning_rate": 2.1175183978915794e-07,
"loss": 0.16236615180969238,
"step": 2646
},
{
"epoch": 2.8565264293419634,
"grad_norm": 1.4013762474060059,
"learning_rate": 2.1140946572819222e-07,
"loss": 0.23140932619571686,
"step": 2648
},
{
"epoch": 2.858683926645092,
"grad_norm": 1.3808273077011108,
"learning_rate": 2.1107212116258926e-07,
"loss": 0.34648364782333374,
"step": 2650
},
{
"epoch": 2.8608414239482203,
"grad_norm": 1.66391921043396,
"learning_rate": 2.107398080017468e-07,
"loss": 0.3459605872631073,
"step": 2652
},
{
"epoch": 2.8629989212513482,
"grad_norm": 3.6172289848327637,
"learning_rate": 2.1041252812658484e-07,
"loss": 0.25366389751434326,
"step": 2654
},
{
"epoch": 2.8651564185544767,
"grad_norm": 1.0681893825531006,
"learning_rate": 2.100902833895342e-07,
"loss": 0.185197651386261,
"step": 2656
},
{
"epoch": 2.867313915857605,
"grad_norm": 1.549970030784607,
"learning_rate": 2.0977307561452663e-07,
"loss": 0.1989063322544098,
"step": 2658
},
{
"epoch": 2.8694714131607335,
"grad_norm": 1.3811312913894653,
"learning_rate": 2.09460906596984e-07,
"loss": 0.31181615591049194,
"step": 2660
},
{
"epoch": 2.871628910463862,
"grad_norm": 1.6068974733352661,
"learning_rate": 2.091537781038089e-07,
"loss": 0.30436015129089355,
"step": 2662
},
{
"epoch": 2.8737864077669903,
"grad_norm": 2.1774463653564453,
"learning_rate": 2.0885169187337344e-07,
"loss": 0.1961633861064911,
"step": 2664
},
{
"epoch": 2.8759439050701188,
"grad_norm": 1.3629629611968994,
"learning_rate": 2.0855464961551068e-07,
"loss": 0.2554187774658203,
"step": 2666
},
{
"epoch": 2.878101402373247,
"grad_norm": 1.1737473011016846,
"learning_rate": 2.0826265301150424e-07,
"loss": 0.2499612420797348,
"step": 2668
},
{
"epoch": 2.8802588996763756,
"grad_norm": 0.43503567576408386,
"learning_rate": 2.0797570371407868e-07,
"loss": 0.11031116545200348,
"step": 2670
},
{
"epoch": 2.8824163969795036,
"grad_norm": 1.2711269855499268,
"learning_rate": 2.0769380334739064e-07,
"loss": 0.27530673146247864,
"step": 2672
},
{
"epoch": 2.884573894282632,
"grad_norm": 1.6386080980300903,
"learning_rate": 2.0741695350701957e-07,
"loss": 0.21418559551239014,
"step": 2674
},
{
"epoch": 2.8867313915857604,
"grad_norm": 0.6615068912506104,
"learning_rate": 2.0714515575995788e-07,
"loss": 0.3122788667678833,
"step": 2676
},
{
"epoch": 2.888888888888889,
"grad_norm": 1.6554388999938965,
"learning_rate": 2.068784116446034e-07,
"loss": 0.2509201467037201,
"step": 2678
},
{
"epoch": 2.8910463861920173,
"grad_norm": 1.535337209701538,
"learning_rate": 2.0661672267074972e-07,
"loss": 0.2228378802537918,
"step": 2680
},
{
"epoch": 2.8932038834951457,
"grad_norm": 1.8591066598892212,
"learning_rate": 2.0636009031957781e-07,
"loss": 0.4586015045642853,
"step": 2682
},
{
"epoch": 2.895361380798274,
"grad_norm": 1.2504751682281494,
"learning_rate": 2.0610851604364787e-07,
"loss": 0.21972203254699707,
"step": 2684
},
{
"epoch": 2.8975188781014025,
"grad_norm": 1.314433217048645,
"learning_rate": 2.0586200126689092e-07,
"loss": 0.30095550417900085,
"step": 2686
},
{
"epoch": 2.899676375404531,
"grad_norm": 1.6293613910675049,
"learning_rate": 2.0562054738460098e-07,
"loss": 0.12489507347345352,
"step": 2688
},
{
"epoch": 2.901833872707659,
"grad_norm": 2.388120651245117,
"learning_rate": 2.0538415576342665e-07,
"loss": 0.2403588593006134,
"step": 2690
},
{
"epoch": 2.9039913700107873,
"grad_norm": 1.676954984664917,
"learning_rate": 2.0515282774136402e-07,
"loss": 0.24414768815040588,
"step": 2692
},
{
"epoch": 2.9061488673139158,
"grad_norm": 2.731567621231079,
"learning_rate": 2.0492656462774877e-07,
"loss": 0.3925679326057434,
"step": 2694
},
{
"epoch": 2.908306364617044,
"grad_norm": 1.193949818611145,
"learning_rate": 2.047053677032484e-07,
"loss": 0.31919193267822266,
"step": 2696
},
{
"epoch": 2.9104638619201726,
"grad_norm": 1.6734915971755981,
"learning_rate": 2.0448923821985597e-07,
"loss": 0.2700918912887573,
"step": 2698
},
{
"epoch": 2.912621359223301,
"grad_norm": 2.165048837661743,
"learning_rate": 2.0427817740088204e-07,
"loss": 0.09998652338981628,
"step": 2700
},
{
"epoch": 2.9147788565264294,
"grad_norm": 1.4805729389190674,
"learning_rate": 2.0407218644094798e-07,
"loss": 0.26490524411201477,
"step": 2702
},
{
"epoch": 2.916936353829558,
"grad_norm": 1.9566199779510498,
"learning_rate": 2.0387126650597966e-07,
"loss": 0.37730756402015686,
"step": 2704
},
{
"epoch": 2.9190938511326863,
"grad_norm": 1.5146074295043945,
"learning_rate": 2.036754187332004e-07,
"loss": 0.2974068224430084,
"step": 2706
},
{
"epoch": 2.9212513484358142,
"grad_norm": 1.5399764776229858,
"learning_rate": 2.034846442311247e-07,
"loss": 0.2960386276245117,
"step": 2708
},
{
"epoch": 2.9234088457389427,
"grad_norm": 1.4276615381240845,
"learning_rate": 2.0329894407955186e-07,
"loss": 0.24433766305446625,
"step": 2710
},
{
"epoch": 2.925566343042071,
"grad_norm": 1.12831711769104,
"learning_rate": 2.0311831932956003e-07,
"loss": 0.194054514169693,
"step": 2712
},
{
"epoch": 2.9277238403451995,
"grad_norm": 4.204019546508789,
"learning_rate": 2.0294277100350006e-07,
"loss": 0.3664979338645935,
"step": 2714
},
{
"epoch": 2.929881337648328,
"grad_norm": 1.073026180267334,
"learning_rate": 2.0277230009498994e-07,
"loss": 0.2648014426231384,
"step": 2716
},
{
"epoch": 2.9320388349514563,
"grad_norm": 2.1484158039093018,
"learning_rate": 2.026069075689089e-07,
"loss": 0.28026407957077026,
"step": 2718
},
{
"epoch": 2.9341963322545848,
"grad_norm": 2.006321907043457,
"learning_rate": 2.0244659436139232e-07,
"loss": 0.451577752828598,
"step": 2720
},
{
"epoch": 2.936353829557713,
"grad_norm": 0.9612744450569153,
"learning_rate": 2.0229136137982607e-07,
"loss": 0.2079283595085144,
"step": 2722
},
{
"epoch": 2.9385113268608416,
"grad_norm": 1.127065896987915,
"learning_rate": 2.021412095028416e-07,
"loss": 0.29667848348617554,
"step": 2724
},
{
"epoch": 2.9406688241639696,
"grad_norm": 0.9780626893043518,
"learning_rate": 2.019961395803108e-07,
"loss": 0.15733566880226135,
"step": 2726
},
{
"epoch": 2.9428263214670984,
"grad_norm": 1.398445725440979,
"learning_rate": 2.0185615243334142e-07,
"loss": 0.3436535894870758,
"step": 2728
},
{
"epoch": 2.9449838187702264,
"grad_norm": 22.650222778320312,
"learning_rate": 2.0172124885427215e-07,
"loss": 0.23946398496627808,
"step": 2730
},
{
"epoch": 2.947141316073355,
"grad_norm": 1.695454478263855,
"learning_rate": 2.0159142960666828e-07,
"loss": 0.22133874893188477,
"step": 2732
},
{
"epoch": 2.9492988133764833,
"grad_norm": 3.043394088745117,
"learning_rate": 2.0146669542531755e-07,
"loss": 0.23711824417114258,
"step": 2734
},
{
"epoch": 2.9514563106796117,
"grad_norm": 0.8506256937980652,
"learning_rate": 2.0134704701622555e-07,
"loss": 0.15003100037574768,
"step": 2736
},
{
"epoch": 2.95361380798274,
"grad_norm": 1.9315284490585327,
"learning_rate": 2.0123248505661205e-07,
"loss": 0.27814292907714844,
"step": 2738
},
{
"epoch": 2.9557713052858685,
"grad_norm": 1.3882677555084229,
"learning_rate": 2.011230101949073e-07,
"loss": 0.23976953327655792,
"step": 2740
},
{
"epoch": 2.957928802588997,
"grad_norm": 2.189103126525879,
"learning_rate": 2.0101862305074788e-07,
"loss": 0.30090874433517456,
"step": 2742
},
{
"epoch": 2.960086299892125,
"grad_norm": 1.476915955543518,
"learning_rate": 2.0091932421497359e-07,
"loss": 0.2663874328136444,
"step": 2744
},
{
"epoch": 2.9622437971952538,
"grad_norm": 1.6229114532470703,
"learning_rate": 2.0082511424962407e-07,
"loss": 0.2835708558559418,
"step": 2746
},
{
"epoch": 2.9644012944983817,
"grad_norm": 1.7014294862747192,
"learning_rate": 2.0073599368793536e-07,
"loss": 0.3245730698108673,
"step": 2748
},
{
"epoch": 2.96655879180151,
"grad_norm": 1.3853977918624878,
"learning_rate": 2.0065196303433735e-07,
"loss": 0.41964831948280334,
"step": 2750
},
{
"epoch": 2.9687162891046386,
"grad_norm": 1.847307801246643,
"learning_rate": 2.0057302276445018e-07,
"loss": 0.33610066771507263,
"step": 2752
},
{
"epoch": 2.970873786407767,
"grad_norm": 1.3295162916183472,
"learning_rate": 2.0049917332508245e-07,
"loss": 0.32951587438583374,
"step": 2754
},
{
"epoch": 2.9730312837108954,
"grad_norm": 1.515709638595581,
"learning_rate": 2.0043041513422793e-07,
"loss": 0.1392497420310974,
"step": 2756
},
{
"epoch": 2.975188781014024,
"grad_norm": 1.3852156400680542,
"learning_rate": 2.0036674858106364e-07,
"loss": 0.23723219335079193,
"step": 2758
},
{
"epoch": 2.9773462783171523,
"grad_norm": 0.7373172044754028,
"learning_rate": 2.0030817402594758e-07,
"loss": 0.0664176344871521,
"step": 2760
},
{
"epoch": 2.9795037756202802,
"grad_norm": 0.5779815912246704,
"learning_rate": 2.0025469180041652e-07,
"loss": 0.19533909857273102,
"step": 2762
},
{
"epoch": 2.981661272923409,
"grad_norm": 1.1505489349365234,
"learning_rate": 2.0020630220718412e-07,
"loss": 0.40156224370002747,
"step": 2764
},
{
"epoch": 2.983818770226537,
"grad_norm": 1.542429804801941,
"learning_rate": 2.0016300552013962e-07,
"loss": 0.11573772132396698,
"step": 2766
},
{
"epoch": 2.9859762675296655,
"grad_norm": 4.137073040008545,
"learning_rate": 2.0012480198434574e-07,
"loss": 0.4161064624786377,
"step": 2768
},
{
"epoch": 2.988133764832794,
"grad_norm": 3.1401453018188477,
"learning_rate": 2.0009169181603766e-07,
"loss": 0.26901060342788696,
"step": 2770
},
{
"epoch": 2.9902912621359223,
"grad_norm": 1.872072696685791,
"learning_rate": 2.0006367520262163e-07,
"loss": 0.36560726165771484,
"step": 2772
},
{
"epoch": 2.9924487594390508,
"grad_norm": 1.6279696226119995,
"learning_rate": 2.0004075230267392e-07,
"loss": 0.22248563170433044,
"step": 2774
},
{
"epoch": 2.994606256742179,
"grad_norm": 2.087898015975952,
"learning_rate": 2.0002292324594007e-07,
"loss": 0.2563750445842743,
"step": 2776
},
{
"epoch": 2.9967637540453076,
"grad_norm": 2.9592933654785156,
"learning_rate": 2.000101881333341e-07,
"loss": 0.12756453454494476,
"step": 2778
},
{
"epoch": 2.9989212513484356,
"grad_norm": 2.473083734512329,
"learning_rate": 2.0000254703693767e-07,
"loss": 0.1695672571659088,
"step": 2780
},
{
"epoch": 3.0,
"step": 2781,
"total_flos": 3.284111394515778e+18,
"train_loss": 0.46147939157918366,
"train_runtime": 26036.604,
"train_samples_per_second": 1.709,
"train_steps_per_second": 0.107
}
],
"logging_steps": 2,
"max_steps": 2781,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.284111394515778e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}