9b-125-3 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
220681a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2781,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002157497303128371,
"grad_norm": 2.570061445236206,
"learning_rate": 3.571428571428571e-09,
"loss": 0.9756889939308167,
"step": 2
},
{
"epoch": 0.004314994606256742,
"grad_norm": 2.8094983100891113,
"learning_rate": 1.0714285714285715e-08,
"loss": 0.6637275218963623,
"step": 4
},
{
"epoch": 0.006472491909385114,
"grad_norm": 4.699804782867432,
"learning_rate": 1.7857142857142856e-08,
"loss": 0.8357824087142944,
"step": 6
},
{
"epoch": 0.008629989212513484,
"grad_norm": 4.096808433532715,
"learning_rate": 2.5e-08,
"loss": 0.8224667906761169,
"step": 8
},
{
"epoch": 0.010787486515641856,
"grad_norm": 2.105877161026001,
"learning_rate": 3.214285714285714e-08,
"loss": 0.7087568044662476,
"step": 10
},
{
"epoch": 0.012944983818770227,
"grad_norm": 4.518003463745117,
"learning_rate": 3.9285714285714285e-08,
"loss": 0.7910435795783997,
"step": 12
},
{
"epoch": 0.015102481121898598,
"grad_norm": 5.335687160491943,
"learning_rate": 4.642857142857143e-08,
"loss": 0.8569395542144775,
"step": 14
},
{
"epoch": 0.017259978425026967,
"grad_norm": 2.8184239864349365,
"learning_rate": 5.3571428571428564e-08,
"loss": 1.1704014539718628,
"step": 16
},
{
"epoch": 0.019417475728155338,
"grad_norm": 2.6054298877716064,
"learning_rate": 6.071428571428572e-08,
"loss": 1.0189613103866577,
"step": 18
},
{
"epoch": 0.021574973031283712,
"grad_norm": 2.1738038063049316,
"learning_rate": 6.785714285714285e-08,
"loss": 0.6196831464767456,
"step": 20
},
{
"epoch": 0.023732470334412083,
"grad_norm": 3.19342041015625,
"learning_rate": 7.5e-08,
"loss": 0.9074231386184692,
"step": 22
},
{
"epoch": 0.025889967637540454,
"grad_norm": 6.756741046905518,
"learning_rate": 8.214285714285714e-08,
"loss": 0.80443274974823,
"step": 24
},
{
"epoch": 0.028047464940668825,
"grad_norm": 4.307950496673584,
"learning_rate": 8.928571428571429e-08,
"loss": 0.7722480893135071,
"step": 26
},
{
"epoch": 0.030204962243797196,
"grad_norm": 4.798768043518066,
"learning_rate": 9.642857142857142e-08,
"loss": 0.7002389430999756,
"step": 28
},
{
"epoch": 0.032362459546925564,
"grad_norm": 4.575655460357666,
"learning_rate": 1.0357142857142857e-07,
"loss": 1.014316439628601,
"step": 30
},
{
"epoch": 0.034519956850053934,
"grad_norm": 1.0345690250396729,
"learning_rate": 1.107142857142857e-07,
"loss": 0.6911119222640991,
"step": 32
},
{
"epoch": 0.036677454153182305,
"grad_norm": 9.628297805786133,
"learning_rate": 1.1785714285714285e-07,
"loss": 1.0148026943206787,
"step": 34
},
{
"epoch": 0.038834951456310676,
"grad_norm": 2.975783109664917,
"learning_rate": 1.25e-07,
"loss": 0.8927556276321411,
"step": 36
},
{
"epoch": 0.040992448759439054,
"grad_norm": 9.633585929870605,
"learning_rate": 1.3214285714285714e-07,
"loss": 1.5045725107192993,
"step": 38
},
{
"epoch": 0.043149946062567425,
"grad_norm": 6.078215599060059,
"learning_rate": 1.392857142857143e-07,
"loss": 0.510919988155365,
"step": 40
},
{
"epoch": 0.045307443365695796,
"grad_norm": 2.061255693435669,
"learning_rate": 1.4642857142857143e-07,
"loss": 0.7295307517051697,
"step": 42
},
{
"epoch": 0.04746494066882417,
"grad_norm": 1.5910701751708984,
"learning_rate": 1.5357142857142858e-07,
"loss": 0.8399662375450134,
"step": 44
},
{
"epoch": 0.04962243797195254,
"grad_norm": 2.5256459712982178,
"learning_rate": 1.6071428571428573e-07,
"loss": 0.6381903290748596,
"step": 46
},
{
"epoch": 0.05177993527508091,
"grad_norm": 2.43086314201355,
"learning_rate": 1.6785714285714285e-07,
"loss": 0.8072210550308228,
"step": 48
},
{
"epoch": 0.05393743257820928,
"grad_norm": 2.4569172859191895,
"learning_rate": 1.75e-07,
"loss": 0.6129989624023438,
"step": 50
},
{
"epoch": 0.05609492988133765,
"grad_norm": 1.5575639009475708,
"learning_rate": 1.8214285714285714e-07,
"loss": 0.39090248942375183,
"step": 52
},
{
"epoch": 0.05825242718446602,
"grad_norm": 7.692088603973389,
"learning_rate": 1.8928571428571426e-07,
"loss": 0.9982097148895264,
"step": 54
},
{
"epoch": 0.06040992448759439,
"grad_norm": 28.100284576416016,
"learning_rate": 1.964285714285714e-07,
"loss": 1.190761923789978,
"step": 56
},
{
"epoch": 0.06256742179072276,
"grad_norm": 4.549140930175781,
"learning_rate": 2.0357142857142855e-07,
"loss": 0.6789318323135376,
"step": 58
},
{
"epoch": 0.06472491909385113,
"grad_norm": 12.27080249786377,
"learning_rate": 2.107142857142857e-07,
"loss": 0.8964890241622925,
"step": 60
},
{
"epoch": 0.0668824163969795,
"grad_norm": 8.274192810058594,
"learning_rate": 2.1785714285714284e-07,
"loss": 1.1355807781219482,
"step": 62
},
{
"epoch": 0.06903991370010787,
"grad_norm": 2.232541084289551,
"learning_rate": 2.25e-07,
"loss": 0.9159867763519287,
"step": 64
},
{
"epoch": 0.07119741100323625,
"grad_norm": 3.91237735748291,
"learning_rate": 2.3214285714285714e-07,
"loss": 1.0847519636154175,
"step": 66
},
{
"epoch": 0.07335490830636461,
"grad_norm": 3.247027635574341,
"learning_rate": 2.392857142857143e-07,
"loss": 0.8140153884887695,
"step": 68
},
{
"epoch": 0.07551240560949299,
"grad_norm": 13.169454574584961,
"learning_rate": 2.4642857142857143e-07,
"loss": 0.9606142640113831,
"step": 70
},
{
"epoch": 0.07766990291262135,
"grad_norm": 2.072512626647949,
"learning_rate": 2.5357142857142855e-07,
"loss": 0.7430305480957031,
"step": 72
},
{
"epoch": 0.07982740021574973,
"grad_norm": 2.4250895977020264,
"learning_rate": 2.607142857142857e-07,
"loss": 0.6543456315994263,
"step": 74
},
{
"epoch": 0.08198489751887811,
"grad_norm": 6.248291492462158,
"learning_rate": 2.6785714285714284e-07,
"loss": 0.7182219624519348,
"step": 76
},
{
"epoch": 0.08414239482200647,
"grad_norm": 2.01570725440979,
"learning_rate": 2.75e-07,
"loss": 0.6486653685569763,
"step": 78
},
{
"epoch": 0.08629989212513485,
"grad_norm": 4.23671817779541,
"learning_rate": 2.8214285714285713e-07,
"loss": 1.1015154123306274,
"step": 80
},
{
"epoch": 0.08845738942826321,
"grad_norm": 6.89118766784668,
"learning_rate": 2.892857142857143e-07,
"loss": 0.7223177552223206,
"step": 82
},
{
"epoch": 0.09061488673139159,
"grad_norm": 3.134542226791382,
"learning_rate": 2.9642857142857143e-07,
"loss": 0.5234851837158203,
"step": 84
},
{
"epoch": 0.09277238403451996,
"grad_norm": 1.6814379692077637,
"learning_rate": 3.0357142857142855e-07,
"loss": 0.7054818868637085,
"step": 86
},
{
"epoch": 0.09492988133764833,
"grad_norm": 2.541091203689575,
"learning_rate": 3.107142857142857e-07,
"loss": 0.7358066439628601,
"step": 88
},
{
"epoch": 0.0970873786407767,
"grad_norm": 7.0923991203308105,
"learning_rate": 3.1785714285714284e-07,
"loss": 0.7044313549995422,
"step": 90
},
{
"epoch": 0.09924487594390508,
"grad_norm": 1.4156235456466675,
"learning_rate": 3.25e-07,
"loss": 0.7540506720542908,
"step": 92
},
{
"epoch": 0.10140237324703344,
"grad_norm": 2.159705877304077,
"learning_rate": 3.3214285714285713e-07,
"loss": 0.8926774859428406,
"step": 94
},
{
"epoch": 0.10355987055016182,
"grad_norm": 1.109196662902832,
"learning_rate": 3.392857142857143e-07,
"loss": 0.8637357950210571,
"step": 96
},
{
"epoch": 0.10571736785329018,
"grad_norm": 2.337041139602661,
"learning_rate": 3.464285714285714e-07,
"loss": 0.748981237411499,
"step": 98
},
{
"epoch": 0.10787486515641856,
"grad_norm": 11.489009857177734,
"learning_rate": 3.535714285714286e-07,
"loss": 0.6166712641716003,
"step": 100
},
{
"epoch": 0.11003236245954692,
"grad_norm": 2.5563786029815674,
"learning_rate": 3.607142857142857e-07,
"loss": 0.8990151286125183,
"step": 102
},
{
"epoch": 0.1121898597626753,
"grad_norm": 11.782515525817871,
"learning_rate": 3.678571428571429e-07,
"loss": 0.9428755044937134,
"step": 104
},
{
"epoch": 0.11434735706580366,
"grad_norm": 4.793514251708984,
"learning_rate": 3.75e-07,
"loss": 0.515690803527832,
"step": 106
},
{
"epoch": 0.11650485436893204,
"grad_norm": 10.346397399902344,
"learning_rate": 3.821428571428571e-07,
"loss": 1.0367025136947632,
"step": 108
},
{
"epoch": 0.1186623516720604,
"grad_norm": 1.8307247161865234,
"learning_rate": 3.8928571428571425e-07,
"loss": 0.682608962059021,
"step": 110
},
{
"epoch": 0.12081984897518878,
"grad_norm": 2.618833541870117,
"learning_rate": 3.9642857142857137e-07,
"loss": 0.5873494148254395,
"step": 112
},
{
"epoch": 0.12297734627831715,
"grad_norm": 4.1070427894592285,
"learning_rate": 4.0357142857142854e-07,
"loss": 0.5457082390785217,
"step": 114
},
{
"epoch": 0.12513484358144553,
"grad_norm": 3.505392074584961,
"learning_rate": 4.1071428571428566e-07,
"loss": 0.7192925214767456,
"step": 116
},
{
"epoch": 0.1272923408845739,
"grad_norm": 4.676717758178711,
"learning_rate": 4.1785714285714283e-07,
"loss": 0.6860368847846985,
"step": 118
},
{
"epoch": 0.12944983818770225,
"grad_norm": 1.218853235244751,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.5903947353363037,
"step": 120
},
{
"epoch": 0.13160733549083065,
"grad_norm": 1.845142126083374,
"learning_rate": 4.3214285714285713e-07,
"loss": 0.7133704423904419,
"step": 122
},
{
"epoch": 0.133764832793959,
"grad_norm": 6.926656246185303,
"learning_rate": 4.3928571428571425e-07,
"loss": 0.7799294590950012,
"step": 124
},
{
"epoch": 0.13592233009708737,
"grad_norm": 0.8671985268592834,
"learning_rate": 4.464285714285714e-07,
"loss": 0.9356327056884766,
"step": 126
},
{
"epoch": 0.13807982740021574,
"grad_norm": 2.000596761703491,
"learning_rate": 4.5357142857142854e-07,
"loss": 0.65338534116745,
"step": 128
},
{
"epoch": 0.14023732470334413,
"grad_norm": 2.7190310955047607,
"learning_rate": 4.6071428571428566e-07,
"loss": 0.3992256820201874,
"step": 130
},
{
"epoch": 0.1423948220064725,
"grad_norm": 4.049299240112305,
"learning_rate": 4.6785714285714283e-07,
"loss": 0.68809974193573,
"step": 132
},
{
"epoch": 0.14455231930960086,
"grad_norm": 3.6315855979919434,
"learning_rate": 4.7499999999999995e-07,
"loss": 0.8035519123077393,
"step": 134
},
{
"epoch": 0.14670981661272922,
"grad_norm": 1.8729430437088013,
"learning_rate": 4.821428571428571e-07,
"loss": 0.9223624467849731,
"step": 136
},
{
"epoch": 0.1488673139158576,
"grad_norm": 2.9594123363494873,
"learning_rate": 4.892857142857142e-07,
"loss": 0.7450867891311646,
"step": 138
},
{
"epoch": 0.15102481121898598,
"grad_norm": 1.094379186630249,
"learning_rate": 4.964285714285715e-07,
"loss": 0.521569550037384,
"step": 140
},
{
"epoch": 0.15318230852211434,
"grad_norm": 3.2890093326568604,
"learning_rate": 4.999998408101351e-07,
"loss": 0.7194236516952515,
"step": 142
},
{
"epoch": 0.1553398058252427,
"grad_norm": 4.202434062957764,
"learning_rate": 4.999985672925673e-07,
"loss": 0.6269642114639282,
"step": 144
},
{
"epoch": 0.1574973031283711,
"grad_norm": 9.459905624389648,
"learning_rate": 4.999960202646399e-07,
"loss": 0.798893392086029,
"step": 146
},
{
"epoch": 0.15965480043149946,
"grad_norm": 1.1723222732543945,
"learning_rate": 4.999921997407693e-07,
"loss": 0.827728271484375,
"step": 148
},
{
"epoch": 0.16181229773462782,
"grad_norm": 2.3163399696350098,
"learning_rate": 4.999871057425801e-07,
"loss": 0.6281718015670776,
"step": 150
},
{
"epoch": 0.16396979503775622,
"grad_norm": 1.8294389247894287,
"learning_rate": 4.999807382989047e-07,
"loss": 0.4277426302433014,
"step": 152
},
{
"epoch": 0.16612729234088458,
"grad_norm": 7.350142955780029,
"learning_rate": 4.999730974457832e-07,
"loss": 0.9866698980331421,
"step": 154
},
{
"epoch": 0.16828478964401294,
"grad_norm": 13.259355545043945,
"learning_rate": 4.999641832264634e-07,
"loss": 0.8262766599655151,
"step": 156
},
{
"epoch": 0.1704422869471413,
"grad_norm": 1.965742588043213,
"learning_rate": 4.999539956914009e-07,
"loss": 0.6971994042396545,
"step": 158
},
{
"epoch": 0.1725997842502697,
"grad_norm": 5.429571628570557,
"learning_rate": 4.999425348982576e-07,
"loss": 0.8861981630325317,
"step": 160
},
{
"epoch": 0.17475728155339806,
"grad_norm": 2.0696070194244385,
"learning_rate": 4.999298009119028e-07,
"loss": 0.7111194133758545,
"step": 162
},
{
"epoch": 0.17691477885652643,
"grad_norm": 1.6481966972351074,
"learning_rate": 4.999157938044117e-07,
"loss": 0.5676076412200928,
"step": 164
},
{
"epoch": 0.1790722761596548,
"grad_norm": 1.6638914346694946,
"learning_rate": 4.999005136550658e-07,
"loss": 0.6949500441551208,
"step": 166
},
{
"epoch": 0.18122977346278318,
"grad_norm": 1.638273000717163,
"learning_rate": 4.998839605503519e-07,
"loss": 0.5900384783744812,
"step": 168
},
{
"epoch": 0.18338727076591155,
"grad_norm": 4.502748012542725,
"learning_rate": 4.998661345839621e-07,
"loss": 0.6821661591529846,
"step": 170
},
{
"epoch": 0.1855447680690399,
"grad_norm": 7.763872146606445,
"learning_rate": 4.998470358567927e-07,
"loss": 0.942156195640564,
"step": 172
},
{
"epoch": 0.18770226537216828,
"grad_norm": 2.503589630126953,
"learning_rate": 4.998266644769442e-07,
"loss": 0.7288610935211182,
"step": 174
},
{
"epoch": 0.18985976267529667,
"grad_norm": 1.9182608127593994,
"learning_rate": 4.998050205597199e-07,
"loss": 0.6357927918434143,
"step": 176
},
{
"epoch": 0.19201725997842503,
"grad_norm": 6.610482692718506,
"learning_rate": 4.997821042276267e-07,
"loss": 0.6117627024650574,
"step": 178
},
{
"epoch": 0.1941747572815534,
"grad_norm": 1.5938338041305542,
"learning_rate": 4.997579156103726e-07,
"loss": 0.5153307914733887,
"step": 180
},
{
"epoch": 0.19633225458468176,
"grad_norm": 3.5181422233581543,
"learning_rate": 4.99732454844867e-07,
"loss": 0.8765067458152771,
"step": 182
},
{
"epoch": 0.19848975188781015,
"grad_norm": 2.738868236541748,
"learning_rate": 4.997057220752203e-07,
"loss": 0.6484391689300537,
"step": 184
},
{
"epoch": 0.20064724919093851,
"grad_norm": 4.510643005371094,
"learning_rate": 4.996777174527419e-07,
"loss": 0.7230756878852844,
"step": 186
},
{
"epoch": 0.20280474649406688,
"grad_norm": 8.01621150970459,
"learning_rate": 4.996484411359404e-07,
"loss": 0.5043923854827881,
"step": 188
},
{
"epoch": 0.20496224379719524,
"grad_norm": 3.4308345317840576,
"learning_rate": 4.996178932905221e-07,
"loss": 0.43938198685646057,
"step": 190
},
{
"epoch": 0.20711974110032363,
"grad_norm": 1.9731006622314453,
"learning_rate": 4.995860740893904e-07,
"loss": 0.5857755541801453,
"step": 192
},
{
"epoch": 0.209277238403452,
"grad_norm": 5.68419075012207,
"learning_rate": 4.995529837126445e-07,
"loss": 0.6594390273094177,
"step": 194
},
{
"epoch": 0.21143473570658036,
"grad_norm": 2.10448956489563,
"learning_rate": 4.995186223475785e-07,
"loss": 0.5797517895698547,
"step": 196
},
{
"epoch": 0.21359223300970873,
"grad_norm": 3.166001081466675,
"learning_rate": 4.99482990188681e-07,
"loss": 0.6418941617012024,
"step": 198
},
{
"epoch": 0.21574973031283712,
"grad_norm": 4.582778453826904,
"learning_rate": 4.994460874376325e-07,
"loss": 0.6962154507637024,
"step": 200
},
{
"epoch": 0.21790722761596548,
"grad_norm": 2.809333086013794,
"learning_rate": 4.994079143033057e-07,
"loss": 0.5948184132575989,
"step": 202
},
{
"epoch": 0.22006472491909385,
"grad_norm": 1.9424008131027222,
"learning_rate": 4.993684710017639e-07,
"loss": 0.6439061164855957,
"step": 204
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.9433950185775757,
"learning_rate": 4.993277577562591e-07,
"loss": 0.6355498433113098,
"step": 206
},
{
"epoch": 0.2243797195253506,
"grad_norm": 1.5145349502563477,
"learning_rate": 4.992857747972318e-07,
"loss": 0.5916112661361694,
"step": 208
},
{
"epoch": 0.22653721682847897,
"grad_norm": 4.159152507781982,
"learning_rate": 4.99242522362309e-07,
"loss": 0.6694331169128418,
"step": 210
},
{
"epoch": 0.22869471413160733,
"grad_norm": 2.050935983657837,
"learning_rate": 4.991980006963029e-07,
"loss": 0.5539838075637817,
"step": 212
},
{
"epoch": 0.2308522114347357,
"grad_norm": 3.0738911628723145,
"learning_rate": 4.9915221005121e-07,
"loss": 0.5762456655502319,
"step": 214
},
{
"epoch": 0.23300970873786409,
"grad_norm": 5.252814292907715,
"learning_rate": 4.991051506862089e-07,
"loss": 0.593986988067627,
"step": 216
},
{
"epoch": 0.23516720604099245,
"grad_norm": 4.718217372894287,
"learning_rate": 4.990568228676597e-07,
"loss": 0.6690990328788757,
"step": 218
},
{
"epoch": 0.2373247033441208,
"grad_norm": 3.038825035095215,
"learning_rate": 4.990072268691015e-07,
"loss": 0.703670859336853,
"step": 220
},
{
"epoch": 0.23948220064724918,
"grad_norm": 2.9468464851379395,
"learning_rate": 4.98956362971252e-07,
"loss": 0.7173536419868469,
"step": 222
},
{
"epoch": 0.24163969795037757,
"grad_norm": 2.068927526473999,
"learning_rate": 4.989042314620048e-07,
"loss": 0.658054769039154,
"step": 224
},
{
"epoch": 0.24379719525350593,
"grad_norm": 6.555632591247559,
"learning_rate": 4.988508326364288e-07,
"loss": 0.6826736927032471,
"step": 226
},
{
"epoch": 0.2459546925566343,
"grad_norm": 16.916873931884766,
"learning_rate": 4.987961667967655e-07,
"loss": 0.5197000503540039,
"step": 228
},
{
"epoch": 0.2481121898597627,
"grad_norm": 1.6320703029632568,
"learning_rate": 4.987402342524282e-07,
"loss": 0.6915658712387085,
"step": 230
},
{
"epoch": 0.25026968716289105,
"grad_norm": 8.044790267944336,
"learning_rate": 4.986830353199997e-07,
"loss": 0.9126973748207092,
"step": 232
},
{
"epoch": 0.2524271844660194,
"grad_norm": 6.615198135375977,
"learning_rate": 4.986245703232305e-07,
"loss": 0.8132709264755249,
"step": 234
},
{
"epoch": 0.2545846817691478,
"grad_norm": 10.388949394226074,
"learning_rate": 4.985648395930373e-07,
"loss": 0.7186510562896729,
"step": 236
},
{
"epoch": 0.25674217907227614,
"grad_norm": 1.9609507322311401,
"learning_rate": 4.985038434675011e-07,
"loss": 0.6574066877365112,
"step": 238
},
{
"epoch": 0.2588996763754045,
"grad_norm": 1.8315008878707886,
"learning_rate": 4.984415822918648e-07,
"loss": 0.7553728818893433,
"step": 240
},
{
"epoch": 0.26105717367853293,
"grad_norm": 1.6982944011688232,
"learning_rate": 4.983780564185318e-07,
"loss": 0.6477434635162354,
"step": 242
},
{
"epoch": 0.2632146709816613,
"grad_norm": 3.038011312484741,
"learning_rate": 4.983132662070639e-07,
"loss": 0.6487295031547546,
"step": 244
},
{
"epoch": 0.26537216828478966,
"grad_norm": 2.443270683288574,
"learning_rate": 4.982472120241788e-07,
"loss": 0.6732483506202698,
"step": 246
},
{
"epoch": 0.267529665587918,
"grad_norm": 23.088842391967773,
"learning_rate": 4.981798942437488e-07,
"loss": 0.8093491792678833,
"step": 248
},
{
"epoch": 0.2696871628910464,
"grad_norm": 1.265063762664795,
"learning_rate": 4.981113132467979e-07,
"loss": 0.6368775963783264,
"step": 250
},
{
"epoch": 0.27184466019417475,
"grad_norm": 4.324744701385498,
"learning_rate": 4.980414694215002e-07,
"loss": 0.7633779644966125,
"step": 252
},
{
"epoch": 0.2740021574973031,
"grad_norm": 1.693095326423645,
"learning_rate": 4.979703631631776e-07,
"loss": 0.4923373758792877,
"step": 254
},
{
"epoch": 0.2761596548004315,
"grad_norm": 1.543394923210144,
"learning_rate": 4.978979948742973e-07,
"loss": 0.580363392829895,
"step": 256
},
{
"epoch": 0.2783171521035599,
"grad_norm": 4.408818244934082,
"learning_rate": 4.978243649644698e-07,
"loss": 0.45743411779403687,
"step": 258
},
{
"epoch": 0.28047464940668826,
"grad_norm": 1.3202674388885498,
"learning_rate": 4.977494738504462e-07,
"loss": 0.7671762108802795,
"step": 260
},
{
"epoch": 0.2826321467098166,
"grad_norm": 1.4401205778121948,
"learning_rate": 4.976733219561166e-07,
"loss": 0.5078914761543274,
"step": 262
},
{
"epoch": 0.284789644012945,
"grad_norm": 2.1078848838806152,
"learning_rate": 4.97595909712507e-07,
"loss": 0.6809296607971191,
"step": 264
},
{
"epoch": 0.28694714131607335,
"grad_norm": 1.4545615911483765,
"learning_rate": 4.975172375577768e-07,
"loss": 0.5965730547904968,
"step": 266
},
{
"epoch": 0.2891046386192017,
"grad_norm": 3.4635396003723145,
"learning_rate": 4.974373059372171e-07,
"loss": 0.6723196506500244,
"step": 268
},
{
"epoch": 0.2912621359223301,
"grad_norm": 1.4718010425567627,
"learning_rate": 4.973561153032472e-07,
"loss": 0.5375315546989441,
"step": 270
},
{
"epoch": 0.29341963322545844,
"grad_norm": 2.598886489868164,
"learning_rate": 4.972736661154131e-07,
"loss": 0.6744803786277771,
"step": 272
},
{
"epoch": 0.29557713052858686,
"grad_norm": 2.894680976867676,
"learning_rate": 4.971899588403836e-07,
"loss": 0.6589304804801941,
"step": 274
},
{
"epoch": 0.2977346278317152,
"grad_norm": 6.139466762542725,
"learning_rate": 4.97104993951949e-07,
"loss": 0.6724509000778198,
"step": 276
},
{
"epoch": 0.2998921251348436,
"grad_norm": 4.252213954925537,
"learning_rate": 4.970187719310173e-07,
"loss": 0.7082564830780029,
"step": 278
},
{
"epoch": 0.30204962243797195,
"grad_norm": 4.135262489318848,
"learning_rate": 4.969312932656125e-07,
"loss": 0.6953362822532654,
"step": 280
},
{
"epoch": 0.3042071197411003,
"grad_norm": 6.377979755401611,
"learning_rate": 4.968425584508709e-07,
"loss": 0.6467074155807495,
"step": 282
},
{
"epoch": 0.3063646170442287,
"grad_norm": 2.111952066421509,
"learning_rate": 4.967525679890388e-07,
"loss": 0.6030433177947998,
"step": 284
},
{
"epoch": 0.30852211434735705,
"grad_norm": 2.0884768962860107,
"learning_rate": 4.966613223894696e-07,
"loss": 0.4877060651779175,
"step": 286
},
{
"epoch": 0.3106796116504854,
"grad_norm": 8.26274585723877,
"learning_rate": 4.96568822168621e-07,
"loss": 0.6577244400978088,
"step": 288
},
{
"epoch": 0.31283710895361383,
"grad_norm": 1.4854352474212646,
"learning_rate": 4.964750678500517e-07,
"loss": 0.6866559386253357,
"step": 290
},
{
"epoch": 0.3149946062567422,
"grad_norm": 2.8770534992218018,
"learning_rate": 4.963800599644189e-07,
"loss": 0.6283307075500488,
"step": 292
},
{
"epoch": 0.31715210355987056,
"grad_norm": 3.0127577781677246,
"learning_rate": 4.96283799049475e-07,
"loss": 0.7268189191818237,
"step": 294
},
{
"epoch": 0.3193096008629989,
"grad_norm": 1.608444333076477,
"learning_rate": 4.961862856500647e-07,
"loss": 0.4435058832168579,
"step": 296
},
{
"epoch": 0.3214670981661273,
"grad_norm": 4.365738391876221,
"learning_rate": 4.960875203181219e-07,
"loss": 0.6397342085838318,
"step": 298
},
{
"epoch": 0.32362459546925565,
"grad_norm": 17.852121353149414,
"learning_rate": 4.959875036126664e-07,
"loss": 0.7305557131767273,
"step": 300
},
{
"epoch": 0.325782092772384,
"grad_norm": 5.299609661102295,
"learning_rate": 4.958862360998011e-07,
"loss": 0.45774808526039124,
"step": 302
},
{
"epoch": 0.32793959007551243,
"grad_norm": 3.3113415241241455,
"learning_rate": 4.957837183527081e-07,
"loss": 0.577039897441864,
"step": 304
},
{
"epoch": 0.3300970873786408,
"grad_norm": 1.8541628122329712,
"learning_rate": 4.956799509516467e-07,
"loss": 0.6706950664520264,
"step": 306
},
{
"epoch": 0.33225458468176916,
"grad_norm": 2.7589690685272217,
"learning_rate": 4.955749344839487e-07,
"loss": 0.6480457186698914,
"step": 308
},
{
"epoch": 0.3344120819848975,
"grad_norm": 2.4146876335144043,
"learning_rate": 4.954686695440159e-07,
"loss": 0.7360544204711914,
"step": 310
},
{
"epoch": 0.3365695792880259,
"grad_norm": 2.073716878890991,
"learning_rate": 4.953611567333166e-07,
"loss": 0.6692315340042114,
"step": 312
},
{
"epoch": 0.33872707659115425,
"grad_norm": 5.253190994262695,
"learning_rate": 4.952523966603822e-07,
"loss": 0.6974945068359375,
"step": 314
},
{
"epoch": 0.3408845738942826,
"grad_norm": 2.9199769496917725,
"learning_rate": 4.951423899408035e-07,
"loss": 0.7332634329795837,
"step": 316
},
{
"epoch": 0.343042071197411,
"grad_norm": 4.3365254402160645,
"learning_rate": 4.950311371972277e-07,
"loss": 0.7718333005905151,
"step": 318
},
{
"epoch": 0.3451995685005394,
"grad_norm": 3.805959701538086,
"learning_rate": 4.949186390593544e-07,
"loss": 0.5337668657302856,
"step": 320
},
{
"epoch": 0.34735706580366776,
"grad_norm": 0.888100266456604,
"learning_rate": 4.948048961639323e-07,
"loss": 0.40571683645248413,
"step": 322
},
{
"epoch": 0.34951456310679613,
"grad_norm": 1.4906032085418701,
"learning_rate": 4.946899091547556e-07,
"loss": 0.5962201356887817,
"step": 324
},
{
"epoch": 0.3516720604099245,
"grad_norm": 13.393712997436523,
"learning_rate": 4.945736786826601e-07,
"loss": 0.7609850168228149,
"step": 326
},
{
"epoch": 0.35382955771305286,
"grad_norm": 6.983129024505615,
"learning_rate": 4.944562054055198e-07,
"loss": 0.6627951860427856,
"step": 328
},
{
"epoch": 0.3559870550161812,
"grad_norm": 1.3096721172332764,
"learning_rate": 4.943374899882432e-07,
"loss": 0.6522207856178284,
"step": 330
},
{
"epoch": 0.3581445523193096,
"grad_norm": 4.269274711608887,
"learning_rate": 4.942175331027693e-07,
"loss": 0.4938734173774719,
"step": 332
},
{
"epoch": 0.36030204962243795,
"grad_norm": 5.267324924468994,
"learning_rate": 4.940963354280638e-07,
"loss": 0.6275608539581299,
"step": 334
},
{
"epoch": 0.36245954692556637,
"grad_norm": 5.786050796508789,
"learning_rate": 4.939738976501156e-07,
"loss": 0.49768128991127014,
"step": 336
},
{
"epoch": 0.36461704422869473,
"grad_norm": 2.491793155670166,
"learning_rate": 4.938502204619325e-07,
"loss": 0.5501651763916016,
"step": 338
},
{
"epoch": 0.3667745415318231,
"grad_norm": 33.490753173828125,
"learning_rate": 4.937253045635375e-07,
"loss": 0.6647762656211853,
"step": 340
},
{
"epoch": 0.36893203883495146,
"grad_norm": 2.3751609325408936,
"learning_rate": 4.93599150661965e-07,
"loss": 0.6672598719596863,
"step": 342
},
{
"epoch": 0.3710895361380798,
"grad_norm": 2.2334680557250977,
"learning_rate": 4.934717594712564e-07,
"loss": 0.5913785099983215,
"step": 344
},
{
"epoch": 0.3732470334412082,
"grad_norm": 2.6074905395507812,
"learning_rate": 4.933431317124562e-07,
"loss": 0.6229710578918457,
"step": 346
},
{
"epoch": 0.37540453074433655,
"grad_norm": 13.647238731384277,
"learning_rate": 4.932132681136079e-07,
"loss": 0.7039205431938171,
"step": 348
},
{
"epoch": 0.3775620280474649,
"grad_norm": 1.9593160152435303,
"learning_rate": 4.930821694097507e-07,
"loss": 0.6190288662910461,
"step": 350
},
{
"epoch": 0.37971952535059333,
"grad_norm": 5.780766487121582,
"learning_rate": 4.929498363429135e-07,
"loss": 0.6188508868217468,
"step": 352
},
{
"epoch": 0.3818770226537217,
"grad_norm": 13.814835548400879,
"learning_rate": 4.928162696621125e-07,
"loss": 0.6810972094535828,
"step": 354
},
{
"epoch": 0.38403451995685006,
"grad_norm": 2.1199731826782227,
"learning_rate": 4.926814701233461e-07,
"loss": 0.5854727029800415,
"step": 356
},
{
"epoch": 0.3861920172599784,
"grad_norm": 6.030026435852051,
"learning_rate": 4.925454384895906e-07,
"loss": 0.5390771627426147,
"step": 358
},
{
"epoch": 0.3883495145631068,
"grad_norm": 1.6937010288238525,
"learning_rate": 4.924081755307964e-07,
"loss": 0.6610192060470581,
"step": 360
},
{
"epoch": 0.39050701186623515,
"grad_norm": 2.114851236343384,
"learning_rate": 4.922696820238831e-07,
"loss": 0.6132209300994873,
"step": 362
},
{
"epoch": 0.3926645091693635,
"grad_norm": 5.5387749671936035,
"learning_rate": 4.921299587527352e-07,
"loss": 0.5963850021362305,
"step": 364
},
{
"epoch": 0.3948220064724919,
"grad_norm": 6.011134147644043,
"learning_rate": 4.919890065081979e-07,
"loss": 0.7333153486251831,
"step": 366
},
{
"epoch": 0.3969795037756203,
"grad_norm": 6.8634209632873535,
"learning_rate": 4.918468260880726e-07,
"loss": 0.7082594633102417,
"step": 368
},
{
"epoch": 0.39913700107874867,
"grad_norm": 0.9249289631843567,
"learning_rate": 4.917034182971122e-07,
"loss": 0.6958880424499512,
"step": 370
},
{
"epoch": 0.40129449838187703,
"grad_norm": 2.1919021606445312,
"learning_rate": 4.915587839470163e-07,
"loss": 0.6318715214729309,
"step": 372
},
{
"epoch": 0.4034519956850054,
"grad_norm": 1.5956162214279175,
"learning_rate": 4.914129238564272e-07,
"loss": 0.6127752661705017,
"step": 374
},
{
"epoch": 0.40560949298813376,
"grad_norm": 1.8250914812088013,
"learning_rate": 4.912658388509253e-07,
"loss": 0.5225449204444885,
"step": 376
},
{
"epoch": 0.4077669902912621,
"grad_norm": 3.8166756629943848,
"learning_rate": 4.911175297630236e-07,
"loss": 0.3997286558151245,
"step": 378
},
{
"epoch": 0.4099244875943905,
"grad_norm": 2.1530375480651855,
"learning_rate": 4.909679974321636e-07,
"loss": 0.5367651581764221,
"step": 380
},
{
"epoch": 0.4120819848975189,
"grad_norm": 12.938714981079102,
"learning_rate": 4.908172427047109e-07,
"loss": 0.5885851383209229,
"step": 382
},
{
"epoch": 0.41423948220064727,
"grad_norm": 2.9967093467712402,
"learning_rate": 4.906652664339493e-07,
"loss": 0.6215783357620239,
"step": 384
},
{
"epoch": 0.41639697950377563,
"grad_norm": 4.113240718841553,
"learning_rate": 4.905120694800772e-07,
"loss": 0.6612198352813721,
"step": 386
},
{
"epoch": 0.418554476806904,
"grad_norm": 2.52058482170105,
"learning_rate": 4.903576527102018e-07,
"loss": 0.5840904712677002,
"step": 388
},
{
"epoch": 0.42071197411003236,
"grad_norm": 5.358467102050781,
"learning_rate": 4.902020169983346e-07,
"loss": 0.3897056579589844,
"step": 390
},
{
"epoch": 0.4228694714131607,
"grad_norm": 9.863550186157227,
"learning_rate": 4.900451632253868e-07,
"loss": 0.6255682706832886,
"step": 392
},
{
"epoch": 0.4250269687162891,
"grad_norm": 3.4259397983551025,
"learning_rate": 4.898870922791634e-07,
"loss": 0.6414270997047424,
"step": 394
},
{
"epoch": 0.42718446601941745,
"grad_norm": 8.052302360534668,
"learning_rate": 4.89727805054359e-07,
"loss": 0.761306881904602,
"step": 396
},
{
"epoch": 0.42934196332254587,
"grad_norm": 3.4023313522338867,
"learning_rate": 4.895673024525522e-07,
"loss": 0.5789651274681091,
"step": 398
},
{
"epoch": 0.43149946062567424,
"grad_norm": 3.1743762493133545,
"learning_rate": 4.894055853822012e-07,
"loss": 0.5212793946266174,
"step": 400
},
{
"epoch": 0.4336569579288026,
"grad_norm": 2.2208149433135986,
"learning_rate": 4.892426547586378e-07,
"loss": 0.5916732549667358,
"step": 402
},
{
"epoch": 0.43581445523193096,
"grad_norm": 2.620041608810425,
"learning_rate": 4.890785115040626e-07,
"loss": 0.6305989027023315,
"step": 404
},
{
"epoch": 0.43797195253505933,
"grad_norm": 3.6425275802612305,
"learning_rate": 4.889131565475401e-07,
"loss": 0.756013035774231,
"step": 406
},
{
"epoch": 0.4401294498381877,
"grad_norm": 5.899089336395264,
"learning_rate": 4.887465908249925e-07,
"loss": 0.5666382908821106,
"step": 408
},
{
"epoch": 0.44228694714131606,
"grad_norm": 2.683706283569336,
"learning_rate": 4.885788152791959e-07,
"loss": 0.774447500705719,
"step": 410
},
{
"epoch": 0.4444444444444444,
"grad_norm": 2.967109203338623,
"learning_rate": 4.884098308597734e-07,
"loss": 0.616926908493042,
"step": 412
},
{
"epoch": 0.44660194174757284,
"grad_norm": 1.796120047569275,
"learning_rate": 4.882396385231909e-07,
"loss": 0.5476500391960144,
"step": 414
},
{
"epoch": 0.4487594390507012,
"grad_norm": 1.2322068214416504,
"learning_rate": 4.880682392327509e-07,
"loss": 0.6756587028503418,
"step": 416
},
{
"epoch": 0.45091693635382957,
"grad_norm": 2.247699737548828,
"learning_rate": 4.878956339585874e-07,
"loss": 0.6002364754676819,
"step": 418
},
{
"epoch": 0.45307443365695793,
"grad_norm": 1.9976741075515747,
"learning_rate": 4.877218236776603e-07,
"loss": 0.5875912308692932,
"step": 420
},
{
"epoch": 0.4552319309600863,
"grad_norm": 1.5975956916809082,
"learning_rate": 4.875468093737504e-07,
"loss": 0.6899119019508362,
"step": 422
},
{
"epoch": 0.45738942826321466,
"grad_norm": 1.6121068000793457,
"learning_rate": 4.873705920374528e-07,
"loss": 0.5816035270690918,
"step": 424
},
{
"epoch": 0.459546925566343,
"grad_norm": 9.736807823181152,
"learning_rate": 4.87193172666172e-07,
"loss": 0.6405006647109985,
"step": 426
},
{
"epoch": 0.4617044228694714,
"grad_norm": 3.0928378105163574,
"learning_rate": 4.870145522641164e-07,
"loss": 0.8041344881057739,
"step": 428
},
{
"epoch": 0.4638619201725998,
"grad_norm": 1.493276596069336,
"learning_rate": 4.868347318422921e-07,
"loss": 0.6042853593826294,
"step": 430
},
{
"epoch": 0.46601941747572817,
"grad_norm": 5.470495700836182,
"learning_rate": 4.866537124184973e-07,
"loss": 0.6030522584915161,
"step": 432
},
{
"epoch": 0.46817691477885653,
"grad_norm": 2.748800754547119,
"learning_rate": 4.864714950173171e-07,
"loss": 0.449870765209198,
"step": 434
},
{
"epoch": 0.4703344120819849,
"grad_norm": 6.604589939117432,
"learning_rate": 4.862880806701166e-07,
"loss": 0.7801079154014587,
"step": 436
},
{
"epoch": 0.47249190938511326,
"grad_norm": 15.256587028503418,
"learning_rate": 4.861034704150363e-07,
"loss": 0.8985238075256348,
"step": 438
},
{
"epoch": 0.4746494066882416,
"grad_norm": 2.490304946899414,
"learning_rate": 4.859176652969853e-07,
"loss": 0.5823092460632324,
"step": 440
},
{
"epoch": 0.47680690399137,
"grad_norm": 1.5544885396957397,
"learning_rate": 4.857306663676358e-07,
"loss": 0.5551883578300476,
"step": 442
},
{
"epoch": 0.47896440129449835,
"grad_norm": 3.0149269104003906,
"learning_rate": 4.855424746854171e-07,
"loss": 0.6494267582893372,
"step": 444
},
{
"epoch": 0.4811218985976268,
"grad_norm": 6.602444171905518,
"learning_rate": 4.853530913155097e-07,
"loss": 0.6156390905380249,
"step": 446
},
{
"epoch": 0.48327939590075514,
"grad_norm": 2.8398349285125732,
"learning_rate": 4.851625173298389e-07,
"loss": 0.6284780502319336,
"step": 448
},
{
"epoch": 0.4854368932038835,
"grad_norm": 2.238544225692749,
"learning_rate": 4.84970753807069e-07,
"loss": 0.6789742112159729,
"step": 450
},
{
"epoch": 0.48759439050701187,
"grad_norm": 5.265446186065674,
"learning_rate": 4.847778018325974e-07,
"loss": 0.6509313583374023,
"step": 452
},
{
"epoch": 0.48975188781014023,
"grad_norm": 1.3713135719299316,
"learning_rate": 4.845836624985484e-07,
"loss": 0.5825515985488892,
"step": 454
},
{
"epoch": 0.4919093851132686,
"grad_norm": 2.7798221111297607,
"learning_rate": 4.84388336903766e-07,
"loss": 0.46305614709854126,
"step": 456
},
{
"epoch": 0.49406688241639696,
"grad_norm": 5.081538200378418,
"learning_rate": 4.841918261538093e-07,
"loss": 0.6566568613052368,
"step": 458
},
{
"epoch": 0.4962243797195254,
"grad_norm": 8.522509574890137,
"learning_rate": 4.839941313609456e-07,
"loss": 0.7881090641021729,
"step": 460
},
{
"epoch": 0.49838187702265374,
"grad_norm": 3.8689656257629395,
"learning_rate": 4.837952536441432e-07,
"loss": 0.592060387134552,
"step": 462
},
{
"epoch": 0.5005393743257821,
"grad_norm": 3.3025739192962646,
"learning_rate": 4.835951941290665e-07,
"loss": 0.6704021096229553,
"step": 464
},
{
"epoch": 0.5026968716289104,
"grad_norm": 1.400242805480957,
"learning_rate": 4.833939539480689e-07,
"loss": 0.7260591983795166,
"step": 466
},
{
"epoch": 0.5048543689320388,
"grad_norm": 1.9812822341918945,
"learning_rate": 4.831915342401862e-07,
"loss": 0.6312894225120544,
"step": 468
},
{
"epoch": 0.5070118662351673,
"grad_norm": 5.561008930206299,
"learning_rate": 4.829879361511305e-07,
"loss": 0.5982025861740112,
"step": 470
},
{
"epoch": 0.5091693635382956,
"grad_norm": 1.6053967475891113,
"learning_rate": 4.827831608332839e-07,
"loss": 0.6564121246337891,
"step": 472
},
{
"epoch": 0.511326860841424,
"grad_norm": 1.3539955615997314,
"learning_rate": 4.825772094456913e-07,
"loss": 0.7475908994674683,
"step": 474
},
{
"epoch": 0.5134843581445523,
"grad_norm": 1.227378249168396,
"learning_rate": 4.823700831540547e-07,
"loss": 0.8186240792274475,
"step": 476
},
{
"epoch": 0.5156418554476807,
"grad_norm": 1.6025233268737793,
"learning_rate": 4.821617831307256e-07,
"loss": 0.6983221769332886,
"step": 478
},
{
"epoch": 0.517799352750809,
"grad_norm": 1.7689329385757446,
"learning_rate": 4.819523105546994e-07,
"loss": 0.4151468276977539,
"step": 480
},
{
"epoch": 0.5199568500539374,
"grad_norm": 1.0149801969528198,
"learning_rate": 4.817416666116082e-07,
"loss": 0.6161211729049683,
"step": 482
},
{
"epoch": 0.5221143473570659,
"grad_norm": 3.3374340534210205,
"learning_rate": 4.815298524937138e-07,
"loss": 0.6058178544044495,
"step": 484
},
{
"epoch": 0.5242718446601942,
"grad_norm": 1.603325605392456,
"learning_rate": 4.813168693999016e-07,
"loss": 0.7110425233840942,
"step": 486
},
{
"epoch": 0.5264293419633226,
"grad_norm": 8.539067268371582,
"learning_rate": 4.811027185356733e-07,
"loss": 0.6150040626525879,
"step": 488
},
{
"epoch": 0.5285868392664509,
"grad_norm": 1.6875640153884888,
"learning_rate": 4.808874011131405e-07,
"loss": 0.5068952441215515,
"step": 490
},
{
"epoch": 0.5307443365695793,
"grad_norm": 3.954596996307373,
"learning_rate": 4.806709183510174e-07,
"loss": 0.6284441947937012,
"step": 492
},
{
"epoch": 0.5329018338727076,
"grad_norm": 11.474177360534668,
"learning_rate": 4.804532714746142e-07,
"loss": 0.7361968159675598,
"step": 494
},
{
"epoch": 0.535059331175836,
"grad_norm": 5.204519271850586,
"learning_rate": 4.8023446171583e-07,
"loss": 0.6124238967895508,
"step": 496
},
{
"epoch": 0.5372168284789643,
"grad_norm": 1.8010599613189697,
"learning_rate": 4.800144903131462e-07,
"loss": 0.7173029780387878,
"step": 498
},
{
"epoch": 0.5393743257820928,
"grad_norm": 3.062251567840576,
"learning_rate": 4.79793358511619e-07,
"loss": 0.5613416433334351,
"step": 500
},
{
"epoch": 0.5415318230852212,
"grad_norm": 1.9310272932052612,
"learning_rate": 4.795710675628724e-07,
"loss": 0.6559145450592041,
"step": 502
},
{
"epoch": 0.5436893203883495,
"grad_norm": 2.027998924255371,
"learning_rate": 4.793476187250913e-07,
"loss": 0.5871425271034241,
"step": 504
},
{
"epoch": 0.5458468176914779,
"grad_norm": 1.8883675336837769,
"learning_rate": 4.791230132630148e-07,
"loss": 0.6756701469421387,
"step": 506
},
{
"epoch": 0.5480043149946062,
"grad_norm": 1.6534463167190552,
"learning_rate": 4.78897252447928e-07,
"loss": 0.747499942779541,
"step": 508
},
{
"epoch": 0.5501618122977346,
"grad_norm": 1.547232747077942,
"learning_rate": 4.786703375576557e-07,
"loss": 0.5784983038902283,
"step": 510
},
{
"epoch": 0.552319309600863,
"grad_norm": 2.1741976737976074,
"learning_rate": 4.784422698765549e-07,
"loss": 0.6561665534973145,
"step": 512
},
{
"epoch": 0.5544768069039914,
"grad_norm": 7.314396381378174,
"learning_rate": 4.782130506955072e-07,
"loss": 0.6497671008110046,
"step": 514
},
{
"epoch": 0.5566343042071198,
"grad_norm": 4.359201908111572,
"learning_rate": 4.779826813119122e-07,
"loss": 0.5485538840293884,
"step": 516
},
{
"epoch": 0.5587918015102481,
"grad_norm": 1.9365326166152954,
"learning_rate": 4.777511630296795e-07,
"loss": 0.6849959492683411,
"step": 518
},
{
"epoch": 0.5609492988133765,
"grad_norm": 3.751213312149048,
"learning_rate": 4.775184971592214e-07,
"loss": 0.6562321186065674,
"step": 520
},
{
"epoch": 0.5631067961165048,
"grad_norm": 4.876345634460449,
"learning_rate": 4.772846850174459e-07,
"loss": 0.6159510016441345,
"step": 522
},
{
"epoch": 0.5652642934196332,
"grad_norm": 1.8032896518707275,
"learning_rate": 4.77049727927749e-07,
"loss": 0.37065228819847107,
"step": 524
},
{
"epoch": 0.5674217907227616,
"grad_norm": 1.5449891090393066,
"learning_rate": 4.7681362722000703e-07,
"loss": 0.6935975551605225,
"step": 526
},
{
"epoch": 0.56957928802589,
"grad_norm": 1.7247692346572876,
"learning_rate": 4.7657638423056947e-07,
"loss": 0.5516192317008972,
"step": 528
},
{
"epoch": 0.5717367853290184,
"grad_norm": 2.4441468715667725,
"learning_rate": 4.76338000302251e-07,
"loss": 0.6022404432296753,
"step": 530
},
{
"epoch": 0.5738942826321467,
"grad_norm": 2.1737172603607178,
"learning_rate": 4.760984767843242e-07,
"loss": 0.6502859592437744,
"step": 532
},
{
"epoch": 0.5760517799352751,
"grad_norm": 1.620774269104004,
"learning_rate": 4.7585781503251197e-07,
"loss": 0.660953938961029,
"step": 534
},
{
"epoch": 0.5782092772384034,
"grad_norm": 2.282656669616699,
"learning_rate": 4.7561601640897956e-07,
"loss": 0.5781145095825195,
"step": 536
},
{
"epoch": 0.5803667745415318,
"grad_norm": 1.9041107892990112,
"learning_rate": 4.75373082282327e-07,
"loss": 0.6320368647575378,
"step": 538
},
{
"epoch": 0.5825242718446602,
"grad_norm": 5.685824871063232,
"learning_rate": 4.751290140275813e-07,
"loss": 0.7650933265686035,
"step": 540
},
{
"epoch": 0.5846817691477886,
"grad_norm": 39.65275955200195,
"learning_rate": 4.7488381302618887e-07,
"loss": 0.6836517453193665,
"step": 542
},
{
"epoch": 0.5868392664509169,
"grad_norm": 1.5295366048812866,
"learning_rate": 4.7463748066600754e-07,
"loss": 0.5143399834632874,
"step": 544
},
{
"epoch": 0.5889967637540453,
"grad_norm": 1.5446751117706299,
"learning_rate": 4.7439001834129876e-07,
"loss": 0.7021965980529785,
"step": 546
},
{
"epoch": 0.5911542610571737,
"grad_norm": 0.48094552755355835,
"learning_rate": 4.7414142745271944e-07,
"loss": 0.7895625829696655,
"step": 548
},
{
"epoch": 0.593311758360302,
"grad_norm": 1.5283570289611816,
"learning_rate": 4.738917094073146e-07,
"loss": 0.5346397757530212,
"step": 550
},
{
"epoch": 0.5954692556634305,
"grad_norm": 1.5133683681488037,
"learning_rate": 4.7364086561850866e-07,
"loss": 0.6586446762084961,
"step": 552
},
{
"epoch": 0.5976267529665588,
"grad_norm": 1.5767534971237183,
"learning_rate": 4.733888975060981e-07,
"loss": 0.6797043681144714,
"step": 554
},
{
"epoch": 0.5997842502696872,
"grad_norm": 1.8032898902893066,
"learning_rate": 4.7313580649624335e-07,
"loss": 0.5901877284049988,
"step": 556
},
{
"epoch": 0.6019417475728155,
"grad_norm": 12.88221549987793,
"learning_rate": 4.7288159402146e-07,
"loss": 0.7293663024902344,
"step": 558
},
{
"epoch": 0.6040992448759439,
"grad_norm": 3.4143638610839844,
"learning_rate": 4.726262615206117e-07,
"loss": 0.7597730159759521,
"step": 560
},
{
"epoch": 0.6062567421790723,
"grad_norm": 1.426260232925415,
"learning_rate": 4.723698104389013e-07,
"loss": 0.6413673162460327,
"step": 562
},
{
"epoch": 0.6084142394822006,
"grad_norm": 1.8875607252120972,
"learning_rate": 4.72112242227863e-07,
"loss": 0.705469012260437,
"step": 564
},
{
"epoch": 0.6105717367853291,
"grad_norm": 1.8347193002700806,
"learning_rate": 4.71853558345354e-07,
"loss": 0.5616622567176819,
"step": 566
},
{
"epoch": 0.6127292340884574,
"grad_norm": 1.3484468460083008,
"learning_rate": 4.715937602555464e-07,
"loss": 0.615619957447052,
"step": 568
},
{
"epoch": 0.6148867313915858,
"grad_norm": 1.4341267347335815,
"learning_rate": 4.7133284942891846e-07,
"loss": 0.6071439981460571,
"step": 570
},
{
"epoch": 0.6170442286947141,
"grad_norm": 7.157674312591553,
"learning_rate": 4.7107082734224713e-07,
"loss": 0.668420672416687,
"step": 572
},
{
"epoch": 0.6192017259978425,
"grad_norm": 1.7236965894699097,
"learning_rate": 4.7080769547859884e-07,
"loss": 0.6496031284332275,
"step": 574
},
{
"epoch": 0.6213592233009708,
"grad_norm": 3.2555484771728516,
"learning_rate": 4.7054345532732155e-07,
"loss": 0.5818045735359192,
"step": 576
},
{
"epoch": 0.6235167206040992,
"grad_norm": 3.8229000568389893,
"learning_rate": 4.7027810838403613e-07,
"loss": 0.6198642253875732,
"step": 578
},
{
"epoch": 0.6256742179072277,
"grad_norm": 3.169841766357422,
"learning_rate": 4.700116561506282e-07,
"loss": 0.666779637336731,
"step": 580
},
{
"epoch": 0.627831715210356,
"grad_norm": 8.474517822265625,
"learning_rate": 4.697441001352392e-07,
"loss": 0.564462423324585,
"step": 582
},
{
"epoch": 0.6299892125134844,
"grad_norm": 2.2855448722839355,
"learning_rate": 4.6947544185225805e-07,
"loss": 0.6741440296173096,
"step": 584
},
{
"epoch": 0.6321467098166127,
"grad_norm": 3.856265068054199,
"learning_rate": 4.692056828223129e-07,
"loss": 0.7456379532814026,
"step": 586
},
{
"epoch": 0.6343042071197411,
"grad_norm": 2.731037139892578,
"learning_rate": 4.6893482457226174e-07,
"loss": 0.5937924385070801,
"step": 588
},
{
"epoch": 0.6364617044228694,
"grad_norm": 1.8991349935531616,
"learning_rate": 4.6866286863518465e-07,
"loss": 0.7059175372123718,
"step": 590
},
{
"epoch": 0.6386192017259978,
"grad_norm": 11.739773750305176,
"learning_rate": 4.6838981655037463e-07,
"loss": 0.7047215104103088,
"step": 592
},
{
"epoch": 0.6407766990291263,
"grad_norm": 12.105711936950684,
"learning_rate": 4.6811566986332875e-07,
"loss": 0.8416004180908203,
"step": 594
},
{
"epoch": 0.6429341963322546,
"grad_norm": 1.6077960729599,
"learning_rate": 4.678404301257398e-07,
"loss": 0.6131125688552856,
"step": 596
},
{
"epoch": 0.645091693635383,
"grad_norm": 1.12235689163208,
"learning_rate": 4.6756409889548734e-07,
"loss": 0.4833483397960663,
"step": 598
},
{
"epoch": 0.6472491909385113,
"grad_norm": 3.067166328430176,
"learning_rate": 4.6728667773662873e-07,
"loss": 0.7458387017250061,
"step": 600
},
{
"epoch": 0.6494066882416397,
"grad_norm": 4.32982873916626,
"learning_rate": 4.6700816821939056e-07,
"loss": 0.6110833883285522,
"step": 602
},
{
"epoch": 0.651564185544768,
"grad_norm": 1.638934850692749,
"learning_rate": 4.667285719201595e-07,
"loss": 0.8020920753479004,
"step": 604
},
{
"epoch": 0.6537216828478964,
"grad_norm": 4.146369457244873,
"learning_rate": 4.6644789042147366e-07,
"loss": 0.6712560653686523,
"step": 606
},
{
"epoch": 0.6558791801510249,
"grad_norm": 1.4545388221740723,
"learning_rate": 4.6616612531201324e-07,
"loss": 0.4938512444496155,
"step": 608
},
{
"epoch": 0.6580366774541532,
"grad_norm": 2.343878746032715,
"learning_rate": 4.6588327818659195e-07,
"loss": 0.6228682994842529,
"step": 610
},
{
"epoch": 0.6601941747572816,
"grad_norm": 1.3540754318237305,
"learning_rate": 4.655993506461478e-07,
"loss": 0.5692847371101379,
"step": 612
},
{
"epoch": 0.6623516720604099,
"grad_norm": 8.236356735229492,
"learning_rate": 4.6531434429773384e-07,
"loss": 0.7082728147506714,
"step": 614
},
{
"epoch": 0.6645091693635383,
"grad_norm": 2.163950204849243,
"learning_rate": 4.650282607545096e-07,
"loss": 0.7548322081565857,
"step": 616
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.8856990337371826,
"learning_rate": 4.6474110163573114e-07,
"loss": 0.6661372184753418,
"step": 618
},
{
"epoch": 0.668824163969795,
"grad_norm": 3.379833698272705,
"learning_rate": 4.644528685667428e-07,
"loss": 0.6216427087783813,
"step": 620
},
{
"epoch": 0.6709816612729234,
"grad_norm": 1.583329439163208,
"learning_rate": 4.641635631789675e-07,
"loss": 0.4488504230976105,
"step": 622
},
{
"epoch": 0.6731391585760518,
"grad_norm": 1.1011863946914673,
"learning_rate": 4.638731871098973e-07,
"loss": 0.7307155728340149,
"step": 624
},
{
"epoch": 0.6752966558791802,
"grad_norm": 2.518486976623535,
"learning_rate": 4.635817420030847e-07,
"loss": 0.605812668800354,
"step": 626
},
{
"epoch": 0.6774541531823085,
"grad_norm": 3.132564067840576,
"learning_rate": 4.6328922950813276e-07,
"loss": 0.6636737585067749,
"step": 628
},
{
"epoch": 0.6796116504854369,
"grad_norm": 2.8926432132720947,
"learning_rate": 4.629956512806865e-07,
"loss": 0.5622783899307251,
"step": 630
},
{
"epoch": 0.6817691477885652,
"grad_norm": 0.8876265287399292,
"learning_rate": 4.6270100898242257e-07,
"loss": 0.7432127594947815,
"step": 632
},
{
"epoch": 0.6839266450916937,
"grad_norm": 2.402222156524658,
"learning_rate": 4.6240530428104064e-07,
"loss": 0.495096892118454,
"step": 634
},
{
"epoch": 0.686084142394822,
"grad_norm": 1.8342965841293335,
"learning_rate": 4.6210853885025357e-07,
"loss": 0.41085928678512573,
"step": 636
},
{
"epoch": 0.6882416396979504,
"grad_norm": 6.494576930999756,
"learning_rate": 4.6181071436977803e-07,
"loss": 0.7552844285964966,
"step": 638
},
{
"epoch": 0.6903991370010788,
"grad_norm": 1.355686902999878,
"learning_rate": 4.615118325253251e-07,
"loss": 0.7046399116516113,
"step": 640
},
{
"epoch": 0.6925566343042071,
"grad_norm": 1.4658502340316772,
"learning_rate": 4.612118950085905e-07,
"loss": 0.6357789039611816,
"step": 642
},
{
"epoch": 0.6947141316073355,
"grad_norm": 10.526252746582031,
"learning_rate": 4.6091090351724523e-07,
"loss": 0.8061111569404602,
"step": 644
},
{
"epoch": 0.6968716289104638,
"grad_norm": 5.159208297729492,
"learning_rate": 4.606088597549258e-07,
"loss": 0.6721555590629578,
"step": 646
},
{
"epoch": 0.6990291262135923,
"grad_norm": 13.965704917907715,
"learning_rate": 4.603057654312247e-07,
"loss": 0.5817508697509766,
"step": 648
},
{
"epoch": 0.7011866235167206,
"grad_norm": 2.7752487659454346,
"learning_rate": 4.600016222616807e-07,
"loss": 0.6185034513473511,
"step": 650
},
{
"epoch": 0.703344120819849,
"grad_norm": 1.5174920558929443,
"learning_rate": 4.5969643196776907e-07,
"loss": 0.6131242513656616,
"step": 652
},
{
"epoch": 0.7055016181229773,
"grad_norm": 3.6596693992614746,
"learning_rate": 4.5939019627689196e-07,
"loss": 0.6017345190048218,
"step": 654
},
{
"epoch": 0.7076591154261057,
"grad_norm": 1.590834379196167,
"learning_rate": 4.590829169223686e-07,
"loss": 0.6057524681091309,
"step": 656
},
{
"epoch": 0.7098166127292341,
"grad_norm": 3.4560601711273193,
"learning_rate": 4.587745956434252e-07,
"loss": 0.48293402791023254,
"step": 658
},
{
"epoch": 0.7119741100323624,
"grad_norm": 3.8374826908111572,
"learning_rate": 4.584652341851855e-07,
"loss": 0.6879529356956482,
"step": 660
},
{
"epoch": 0.7141316073354909,
"grad_norm": 1.8356127738952637,
"learning_rate": 4.581548342986609e-07,
"loss": 0.6557474136352539,
"step": 662
},
{
"epoch": 0.7162891046386192,
"grad_norm": 2.937380313873291,
"learning_rate": 4.578433977407401e-07,
"loss": 0.5540982484817505,
"step": 664
},
{
"epoch": 0.7184466019417476,
"grad_norm": 9.279194831848145,
"learning_rate": 4.5753092627417966e-07,
"loss": 0.6095687747001648,
"step": 666
},
{
"epoch": 0.7206040992448759,
"grad_norm": 1.7384564876556396,
"learning_rate": 4.572174216675938e-07,
"loss": 0.5323720574378967,
"step": 668
},
{
"epoch": 0.7227615965480043,
"grad_norm": 3.279447555541992,
"learning_rate": 4.5690288569544423e-07,
"loss": 0.6184368133544922,
"step": 670
},
{
"epoch": 0.7249190938511327,
"grad_norm": 6.155360698699951,
"learning_rate": 4.5658732013803027e-07,
"loss": 0.6536476612091064,
"step": 672
},
{
"epoch": 0.727076591154261,
"grad_norm": 1.8201708793640137,
"learning_rate": 4.5627072678147904e-07,
"loss": 0.5651783347129822,
"step": 674
},
{
"epoch": 0.7292340884573895,
"grad_norm": 3.5887646675109863,
"learning_rate": 4.559531074177349e-07,
"loss": 0.5361768007278442,
"step": 676
},
{
"epoch": 0.7313915857605178,
"grad_norm": 9.772974967956543,
"learning_rate": 4.5563446384454945e-07,
"loss": 0.5051864981651306,
"step": 678
},
{
"epoch": 0.7335490830636462,
"grad_norm": 1.5057132244110107,
"learning_rate": 4.553147978654715e-07,
"loss": 0.6263077855110168,
"step": 680
},
{
"epoch": 0.7357065803667745,
"grad_norm": 1.7822685241699219,
"learning_rate": 4.5499411128983674e-07,
"loss": 0.6016006469726562,
"step": 682
},
{
"epoch": 0.7378640776699029,
"grad_norm": 10.83908462524414,
"learning_rate": 4.546724059327575e-07,
"loss": 0.5432024002075195,
"step": 684
},
{
"epoch": 0.7400215749730313,
"grad_norm": 2.1380414962768555,
"learning_rate": 4.5434968361511263e-07,
"loss": 0.7576265931129456,
"step": 686
},
{
"epoch": 0.7421790722761596,
"grad_norm": 1.9547456502914429,
"learning_rate": 4.5402594616353676e-07,
"loss": 0.5621410012245178,
"step": 688
},
{
"epoch": 0.7443365695792881,
"grad_norm": 1.8721739053726196,
"learning_rate": 4.537011954104105e-07,
"loss": 0.5246554613113403,
"step": 690
},
{
"epoch": 0.7464940668824164,
"grad_norm": 1.6793891191482544,
"learning_rate": 4.533754331938498e-07,
"loss": 0.6016390323638916,
"step": 692
},
{
"epoch": 0.7486515641855448,
"grad_norm": 1.9931007623672485,
"learning_rate": 4.530486613576954e-07,
"loss": 0.6583068370819092,
"step": 694
},
{
"epoch": 0.7508090614886731,
"grad_norm": 4.051970481872559,
"learning_rate": 4.5272088175150305e-07,
"loss": 0.5990911722183228,
"step": 696
},
{
"epoch": 0.7529665587918015,
"grad_norm": 1.6209897994995117,
"learning_rate": 4.523920962305319e-07,
"loss": 0.6055378913879395,
"step": 698
},
{
"epoch": 0.7551240560949298,
"grad_norm": 1.4128165245056152,
"learning_rate": 4.520623066557351e-07,
"loss": 0.7489557862281799,
"step": 700
},
{
"epoch": 0.7572815533980582,
"grad_norm": 2.651704788208008,
"learning_rate": 4.5173151489374874e-07,
"loss": 0.5671336054801941,
"step": 702
},
{
"epoch": 0.7594390507011867,
"grad_norm": 2.9760892391204834,
"learning_rate": 4.5139972281688125e-07,
"loss": 0.5748198628425598,
"step": 704
},
{
"epoch": 0.761596548004315,
"grad_norm": 2.070570230484009,
"learning_rate": 4.510669323031032e-07,
"loss": 0.6631715297698975,
"step": 706
},
{
"epoch": 0.7637540453074434,
"grad_norm": 2.8771958351135254,
"learning_rate": 4.50733145236036e-07,
"loss": 0.538175106048584,
"step": 708
},
{
"epoch": 0.7659115426105717,
"grad_norm": 1.689902901649475,
"learning_rate": 4.50398363504942e-07,
"loss": 0.5201085209846497,
"step": 710
},
{
"epoch": 0.7680690399137001,
"grad_norm": 3.293177366256714,
"learning_rate": 4.500625890047133e-07,
"loss": 0.825022280216217,
"step": 712
},
{
"epoch": 0.7702265372168284,
"grad_norm": 0.5633196234703064,
"learning_rate": 4.49725823635861e-07,
"loss": 0.6611315011978149,
"step": 714
},
{
"epoch": 0.7723840345199569,
"grad_norm": 1.3877530097961426,
"learning_rate": 4.4938806930450476e-07,
"loss": 0.5903546810150146,
"step": 716
},
{
"epoch": 0.7745415318230853,
"grad_norm": 2.2322981357574463,
"learning_rate": 4.4904932792236187e-07,
"loss": 0.4113418459892273,
"step": 718
},
{
"epoch": 0.7766990291262136,
"grad_norm": 1.2284919023513794,
"learning_rate": 4.487096014067363e-07,
"loss": 0.5899285078048706,
"step": 720
},
{
"epoch": 0.778856526429342,
"grad_norm": 1.843123197555542,
"learning_rate": 4.483688916805081e-07,
"loss": 0.5846186280250549,
"step": 722
},
{
"epoch": 0.7810140237324703,
"grad_norm": 3.0655667781829834,
"learning_rate": 4.4802720067212237e-07,
"loss": 0.6830175518989563,
"step": 724
},
{
"epoch": 0.7831715210355987,
"grad_norm": 2.1866564750671387,
"learning_rate": 4.4768453031557797e-07,
"loss": 0.6690636873245239,
"step": 726
},
{
"epoch": 0.785329018338727,
"grad_norm": 8.055279731750488,
"learning_rate": 4.4734088255041747e-07,
"loss": 0.6548261642456055,
"step": 728
},
{
"epoch": 0.7874865156418555,
"grad_norm": 1.804400086402893,
"learning_rate": 4.4699625932171534e-07,
"loss": 0.8144820928573608,
"step": 730
},
{
"epoch": 0.7896440129449838,
"grad_norm": 1.7731058597564697,
"learning_rate": 4.466506625800674e-07,
"loss": 0.6711707711219788,
"step": 732
},
{
"epoch": 0.7918015102481122,
"grad_norm": 4.458347320556641,
"learning_rate": 4.463040942815796e-07,
"loss": 0.6960354447364807,
"step": 734
},
{
"epoch": 0.7939590075512406,
"grad_norm": 1.831221103668213,
"learning_rate": 4.459565563878568e-07,
"loss": 0.6320536732673645,
"step": 736
},
{
"epoch": 0.7961165048543689,
"grad_norm": 1.1004321575164795,
"learning_rate": 4.456080508659922e-07,
"loss": 0.6839619874954224,
"step": 738
},
{
"epoch": 0.7982740021574973,
"grad_norm": 3.9790191650390625,
"learning_rate": 4.452585796885555e-07,
"loss": 0.787294328212738,
"step": 740
},
{
"epoch": 0.8004314994606256,
"grad_norm": 2.5182931423187256,
"learning_rate": 4.449081448335824e-07,
"loss": 0.5662147402763367,
"step": 742
},
{
"epoch": 0.8025889967637541,
"grad_norm": 1.3335113525390625,
"learning_rate": 4.4455674828456285e-07,
"loss": 0.4203084111213684,
"step": 744
},
{
"epoch": 0.8047464940668824,
"grad_norm": 3.6229031085968018,
"learning_rate": 4.442043920304302e-07,
"loss": 0.3892061412334442,
"step": 746
},
{
"epoch": 0.8069039913700108,
"grad_norm": 2.473930835723877,
"learning_rate": 4.4385107806554964e-07,
"loss": 0.6021113395690918,
"step": 748
},
{
"epoch": 0.8090614886731392,
"grad_norm": 5.694954872131348,
"learning_rate": 4.4349680838970745e-07,
"loss": 0.586351752281189,
"step": 750
},
{
"epoch": 0.8112189859762675,
"grad_norm": 1.9584993124008179,
"learning_rate": 4.431415850080989e-07,
"loss": 0.5266885757446289,
"step": 752
},
{
"epoch": 0.8133764832793959,
"grad_norm": 2.9414353370666504,
"learning_rate": 4.427854099313175e-07,
"loss": 0.7066282033920288,
"step": 754
},
{
"epoch": 0.8155339805825242,
"grad_norm": 2.3507425785064697,
"learning_rate": 4.424282851753435e-07,
"loss": 0.6412563323974609,
"step": 756
},
{
"epoch": 0.8176914778856527,
"grad_norm": 1.6683588027954102,
"learning_rate": 4.420702127615323e-07,
"loss": 0.6926671862602234,
"step": 758
},
{
"epoch": 0.819848975188781,
"grad_norm": 2.730984687805176,
"learning_rate": 4.4171119471660315e-07,
"loss": 0.506192684173584,
"step": 760
},
{
"epoch": 0.8220064724919094,
"grad_norm": 1.8006408214569092,
"learning_rate": 4.413512330726276e-07,
"loss": 0.5694448351860046,
"step": 762
},
{
"epoch": 0.8241639697950378,
"grad_norm": 1.66466224193573,
"learning_rate": 4.4099032986701817e-07,
"loss": 0.8421421051025391,
"step": 764
},
{
"epoch": 0.8263214670981661,
"grad_norm": 2.7753241062164307,
"learning_rate": 4.406284871425166e-07,
"loss": 0.5470436215400696,
"step": 766
},
{
"epoch": 0.8284789644012945,
"grad_norm": 2.583517551422119,
"learning_rate": 4.402657069471824e-07,
"loss": 0.5224552154541016,
"step": 768
},
{
"epoch": 0.8306364617044228,
"grad_norm": 3.6843671798706055,
"learning_rate": 4.3990199133438133e-07,
"loss": 0.5659505724906921,
"step": 770
},
{
"epoch": 0.8327939590075513,
"grad_norm": 2.2777364253997803,
"learning_rate": 4.395373423627735e-07,
"loss": 0.5481325387954712,
"step": 772
},
{
"epoch": 0.8349514563106796,
"grad_norm": 12.834761619567871,
"learning_rate": 4.3917176209630216e-07,
"loss": 0.7331764101982117,
"step": 774
},
{
"epoch": 0.837108953613808,
"grad_norm": 4.730658531188965,
"learning_rate": 4.3880525260418143e-07,
"loss": 0.6668429970741272,
"step": 776
},
{
"epoch": 0.8392664509169363,
"grad_norm": 8.263324737548828,
"learning_rate": 4.3843781596088526e-07,
"loss": 0.6778562068939209,
"step": 778
},
{
"epoch": 0.8414239482200647,
"grad_norm": 1.9440997838974,
"learning_rate": 4.380694542461352e-07,
"loss": 0.6994505524635315,
"step": 780
},
{
"epoch": 0.8435814455231931,
"grad_norm": 0.9290769100189209,
"learning_rate": 4.3770016954488887e-07,
"loss": 0.5277756452560425,
"step": 782
},
{
"epoch": 0.8457389428263214,
"grad_norm": 1.2959551811218262,
"learning_rate": 4.373299639473277e-07,
"loss": 0.5180898904800415,
"step": 784
},
{
"epoch": 0.8478964401294499,
"grad_norm": 2.0248026847839355,
"learning_rate": 4.3695883954884616e-07,
"loss": 0.5872831344604492,
"step": 786
},
{
"epoch": 0.8500539374325782,
"grad_norm": 1.9588837623596191,
"learning_rate": 4.365867984500385e-07,
"loss": 0.7239266633987427,
"step": 788
},
{
"epoch": 0.8522114347357066,
"grad_norm": 1.0221301317214966,
"learning_rate": 4.3621384275668796e-07,
"loss": 0.38601911067962646,
"step": 790
},
{
"epoch": 0.8543689320388349,
"grad_norm": 1.3124701976776123,
"learning_rate": 4.3583997457975454e-07,
"loss": 0.5240159630775452,
"step": 792
},
{
"epoch": 0.8565264293419633,
"grad_norm": 7.634000778198242,
"learning_rate": 4.354651960353625e-07,
"loss": 0.3657144010066986,
"step": 794
},
{
"epoch": 0.8586839266450917,
"grad_norm": 1.649867057800293,
"learning_rate": 4.3508950924478943e-07,
"loss": 0.5847235321998596,
"step": 796
},
{
"epoch": 0.86084142394822,
"grad_norm": 2.088364601135254,
"learning_rate": 4.3471291633445334e-07,
"loss": 0.6124690771102905,
"step": 798
},
{
"epoch": 0.8629989212513485,
"grad_norm": 3.3762848377227783,
"learning_rate": 4.343354194359009e-07,
"loss": 0.5077890753746033,
"step": 800
},
{
"epoch": 0.8651564185544768,
"grad_norm": 1.6368306875228882,
"learning_rate": 4.339570206857957e-07,
"loss": 0.4499396085739136,
"step": 802
},
{
"epoch": 0.8673139158576052,
"grad_norm": 1.6739598512649536,
"learning_rate": 4.335777222259056e-07,
"loss": 0.4595860242843628,
"step": 804
},
{
"epoch": 0.8694714131607335,
"grad_norm": 2.5155189037323,
"learning_rate": 4.331975262030911e-07,
"loss": 0.5732553005218506,
"step": 806
},
{
"epoch": 0.8716289104638619,
"grad_norm": 1.8127964735031128,
"learning_rate": 4.3281643476929286e-07,
"loss": 0.6196991801261902,
"step": 808
},
{
"epoch": 0.8737864077669902,
"grad_norm": 3.939059257507324,
"learning_rate": 4.324344500815197e-07,
"loss": 0.5942954421043396,
"step": 810
},
{
"epoch": 0.8759439050701187,
"grad_norm": 4.195562362670898,
"learning_rate": 4.3205157430183627e-07,
"loss": 0.38596200942993164,
"step": 812
},
{
"epoch": 0.8781014023732471,
"grad_norm": 1.3970879316329956,
"learning_rate": 4.316678095973509e-07,
"loss": 0.613640308380127,
"step": 814
},
{
"epoch": 0.8802588996763754,
"grad_norm": 1.3632667064666748,
"learning_rate": 4.312831581402034e-07,
"loss": 0.653675377368927,
"step": 816
},
{
"epoch": 0.8824163969795038,
"grad_norm": 3.1686818599700928,
"learning_rate": 4.3089762210755246e-07,
"loss": 0.5764310956001282,
"step": 818
},
{
"epoch": 0.8845738942826321,
"grad_norm": 3.4244537353515625,
"learning_rate": 4.305112036815639e-07,
"loss": 0.5124704241752625,
"step": 820
},
{
"epoch": 0.8867313915857605,
"grad_norm": 1.641554832458496,
"learning_rate": 4.3012390504939745e-07,
"loss": 0.6814495921134949,
"step": 822
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.3232611417770386,
"learning_rate": 4.2973572840319536e-07,
"loss": 0.6368851661682129,
"step": 824
},
{
"epoch": 0.8910463861920173,
"grad_norm": 2.085556983947754,
"learning_rate": 4.2934667594006917e-07,
"loss": 0.6687034368515015,
"step": 826
},
{
"epoch": 0.8932038834951457,
"grad_norm": 2.6139376163482666,
"learning_rate": 4.2895674986208786e-07,
"loss": 0.2821641266345978,
"step": 828
},
{
"epoch": 0.895361380798274,
"grad_norm": 1.9903957843780518,
"learning_rate": 4.28565952376265e-07,
"loss": 0.5252734422683716,
"step": 830
},
{
"epoch": 0.8975188781014024,
"grad_norm": 2.297610282897949,
"learning_rate": 4.281742856945465e-07,
"loss": 0.745256781578064,
"step": 832
},
{
"epoch": 0.8996763754045307,
"grad_norm": 6.354626655578613,
"learning_rate": 4.277817520337978e-07,
"loss": 0.5809981226921082,
"step": 834
},
{
"epoch": 0.9018338727076591,
"grad_norm": 2.735447883605957,
"learning_rate": 4.273883536157917e-07,
"loss": 0.6928726434707642,
"step": 836
},
{
"epoch": 0.9039913700107874,
"grad_norm": 1.2760671377182007,
"learning_rate": 4.269940926671957e-07,
"loss": 0.6470978856086731,
"step": 838
},
{
"epoch": 0.9061488673139159,
"grad_norm": 9.878061294555664,
"learning_rate": 4.2659897141955876e-07,
"loss": 0.6638087630271912,
"step": 840
},
{
"epoch": 0.9083063646170443,
"grad_norm": 6.520370960235596,
"learning_rate": 4.262029921092999e-07,
"loss": 0.6910456418991089,
"step": 842
},
{
"epoch": 0.9104638619201726,
"grad_norm": 6.460091590881348,
"learning_rate": 4.258061569776944e-07,
"loss": 0.8283501267433167,
"step": 844
},
{
"epoch": 0.912621359223301,
"grad_norm": 5.205661296844482,
"learning_rate": 4.254084682708617e-07,
"loss": 0.8275612592697144,
"step": 846
},
{
"epoch": 0.9147788565264293,
"grad_norm": 1.463245153427124,
"learning_rate": 4.250099282397526e-07,
"loss": 0.46582770347595215,
"step": 848
},
{
"epoch": 0.9169363538295577,
"grad_norm": 5.333581924438477,
"learning_rate": 4.246105391401362e-07,
"loss": 0.7306879758834839,
"step": 850
},
{
"epoch": 0.919093851132686,
"grad_norm": 5.498586654663086,
"learning_rate": 4.2421030323258773e-07,
"loss": 0.4175741672515869,
"step": 852
},
{
"epoch": 0.9212513484358145,
"grad_norm": 1.0770583152770996,
"learning_rate": 4.2380922278247524e-07,
"loss": 0.6730119585990906,
"step": 854
},
{
"epoch": 0.9234088457389428,
"grad_norm": 0.42357996106147766,
"learning_rate": 4.234073000599469e-07,
"loss": 0.8127405643463135,
"step": 856
},
{
"epoch": 0.9255663430420712,
"grad_norm": 7.6792120933532715,
"learning_rate": 4.230045373399185e-07,
"loss": 0.576317310333252,
"step": 858
},
{
"epoch": 0.9277238403451996,
"grad_norm": 5.140259265899658,
"learning_rate": 4.2260093690206007e-07,
"loss": 0.523854672908783,
"step": 860
},
{
"epoch": 0.9298813376483279,
"grad_norm": 3.8896889686584473,
"learning_rate": 4.221965010307831e-07,
"loss": 0.8840075731277466,
"step": 862
},
{
"epoch": 0.9320388349514563,
"grad_norm": 1.3811416625976562,
"learning_rate": 4.2179123201522784e-07,
"loss": 0.7325385212898254,
"step": 864
},
{
"epoch": 0.9341963322545846,
"grad_norm": 1.7766026258468628,
"learning_rate": 4.213851321492503e-07,
"loss": 0.30791109800338745,
"step": 866
},
{
"epoch": 0.9363538295577131,
"grad_norm": 1.1082113981246948,
"learning_rate": 4.209782037314089e-07,
"loss": 0.6958837509155273,
"step": 868
},
{
"epoch": 0.9385113268608414,
"grad_norm": 7.657420635223389,
"learning_rate": 4.2057044906495197e-07,
"loss": 0.42088598012924194,
"step": 870
},
{
"epoch": 0.9406688241639698,
"grad_norm": 9.207883834838867,
"learning_rate": 4.2016187045780445e-07,
"loss": 0.6101109385490417,
"step": 872
},
{
"epoch": 0.9428263214670982,
"grad_norm": 1.3625860214233398,
"learning_rate": 4.197524702225547e-07,
"loss": 0.6703606247901917,
"step": 874
},
{
"epoch": 0.9449838187702265,
"grad_norm": 4.89463996887207,
"learning_rate": 4.1934225067644163e-07,
"loss": 0.5207024216651917,
"step": 876
},
{
"epoch": 0.9471413160733549,
"grad_norm": 5.676528453826904,
"learning_rate": 4.1893121414134165e-07,
"loss": 0.67097008228302,
"step": 878
},
{
"epoch": 0.9492988133764833,
"grad_norm": 3.8142192363739014,
"learning_rate": 4.1851936294375525e-07,
"loss": 0.8798677325248718,
"step": 880
},
{
"epoch": 0.9514563106796117,
"grad_norm": 3.3056960105895996,
"learning_rate": 4.181066994147939e-07,
"loss": 0.5886131525039673,
"step": 882
},
{
"epoch": 0.95361380798274,
"grad_norm": 1.9534400701522827,
"learning_rate": 4.176932258901673e-07,
"loss": 0.553901731967926,
"step": 884
},
{
"epoch": 0.9557713052858684,
"grad_norm": 1.5135817527770996,
"learning_rate": 4.1727894471016933e-07,
"loss": 0.35753270983695984,
"step": 886
},
{
"epoch": 0.9579288025889967,
"grad_norm": 4.861209869384766,
"learning_rate": 4.168638582196654e-07,
"loss": 0.8348190188407898,
"step": 888
},
{
"epoch": 0.9600862998921251,
"grad_norm": 2.2612881660461426,
"learning_rate": 4.164479687680794e-07,
"loss": 0.8166549205780029,
"step": 890
},
{
"epoch": 0.9622437971952535,
"grad_norm": 1.4900096654891968,
"learning_rate": 4.160312787093796e-07,
"loss": 0.47946974635124207,
"step": 892
},
{
"epoch": 0.9644012944983819,
"grad_norm": 2.3414793014526367,
"learning_rate": 4.156137904020659e-07,
"loss": 0.5899794697761536,
"step": 894
},
{
"epoch": 0.9665587918015103,
"grad_norm": 1.6720659732818604,
"learning_rate": 4.1519550620915643e-07,
"loss": 0.6609233617782593,
"step": 896
},
{
"epoch": 0.9687162891046386,
"grad_norm": 1.4975440502166748,
"learning_rate": 4.1477642849817414e-07,
"loss": 0.6876581907272339,
"step": 898
},
{
"epoch": 0.970873786407767,
"grad_norm": 4.46371603012085,
"learning_rate": 4.143565596411331e-07,
"loss": 0.6830723881721497,
"step": 900
},
{
"epoch": 0.9730312837108953,
"grad_norm": 1.2813857793807983,
"learning_rate": 4.139359020145257e-07,
"loss": 0.5846165418624878,
"step": 902
},
{
"epoch": 0.9751887810140237,
"grad_norm": 3.6278672218322754,
"learning_rate": 4.1351445799930837e-07,
"loss": 0.7689055800437927,
"step": 904
},
{
"epoch": 0.9773462783171522,
"grad_norm": 6.095682621002197,
"learning_rate": 4.1309222998088923e-07,
"loss": 0.5822760462760925,
"step": 906
},
{
"epoch": 0.9795037756202805,
"grad_norm": 1.989863634109497,
"learning_rate": 4.126692203491132e-07,
"loss": 0.6141869425773621,
"step": 908
},
{
"epoch": 0.9816612729234089,
"grad_norm": 1.743506908416748,
"learning_rate": 4.1224543149824945e-07,
"loss": 0.6478677988052368,
"step": 910
},
{
"epoch": 0.9838187702265372,
"grad_norm": 1.9053099155426025,
"learning_rate": 4.11820865826978e-07,
"loss": 0.48178091645240784,
"step": 912
},
{
"epoch": 0.9859762675296656,
"grad_norm": 2.2662453651428223,
"learning_rate": 4.1139552573837515e-07,
"loss": 0.607950747013092,
"step": 914
},
{
"epoch": 0.9881337648327939,
"grad_norm": 2.4114274978637695,
"learning_rate": 4.109694136399008e-07,
"loss": 0.6108921766281128,
"step": 916
},
{
"epoch": 0.9902912621359223,
"grad_norm": 7.512186050415039,
"learning_rate": 4.105425319433844e-07,
"loss": 0.6795108318328857,
"step": 918
},
{
"epoch": 0.9924487594390508,
"grad_norm": 1.7533015012741089,
"learning_rate": 4.1011488306501136e-07,
"loss": 0.437101811170578,
"step": 920
},
{
"epoch": 0.9946062567421791,
"grad_norm": 2.403139352798462,
"learning_rate": 4.096864694253095e-07,
"loss": 0.4864053428173065,
"step": 922
},
{
"epoch": 0.9967637540453075,
"grad_norm": 1.6964081525802612,
"learning_rate": 4.0925729344913507e-07,
"loss": 0.643713116645813,
"step": 924
},
{
"epoch": 0.9989212513484358,
"grad_norm": 1.4134613275527954,
"learning_rate": 4.088273575656594e-07,
"loss": 0.6958010196685791,
"step": 926
},
{
"epoch": 1.0010787486515642,
"grad_norm": 1.5019845962524414,
"learning_rate": 4.083966642083549e-07,
"loss": 0.491763174533844,
"step": 928
},
{
"epoch": 1.0032362459546926,
"grad_norm": 1.6836707592010498,
"learning_rate": 4.079652158149813e-07,
"loss": 0.6048216819763184,
"step": 930
},
{
"epoch": 1.0053937432578208,
"grad_norm": 1.5291476249694824,
"learning_rate": 4.075330148275719e-07,
"loss": 0.6078330278396606,
"step": 932
},
{
"epoch": 1.0075512405609492,
"grad_norm": 1.7825332880020142,
"learning_rate": 4.0710006369241984e-07,
"loss": 0.6125837564468384,
"step": 934
},
{
"epoch": 1.0097087378640777,
"grad_norm": 2.508885622024536,
"learning_rate": 4.06666364860064e-07,
"loss": 0.42874282598495483,
"step": 936
},
{
"epoch": 1.011866235167206,
"grad_norm": 3.95334529876709,
"learning_rate": 4.062319207852754e-07,
"loss": 0.7187290191650391,
"step": 938
},
{
"epoch": 1.0140237324703345,
"grad_norm": 1.270442247390747,
"learning_rate": 4.0579673392704315e-07,
"loss": 0.5802730321884155,
"step": 940
},
{
"epoch": 1.0161812297734627,
"grad_norm": 1.6048680543899536,
"learning_rate": 4.0536080674856064e-07,
"loss": 0.5463104248046875,
"step": 942
},
{
"epoch": 1.0183387270765911,
"grad_norm": 4.189075469970703,
"learning_rate": 4.0492414171721137e-07,
"loss": 0.4786475896835327,
"step": 944
},
{
"epoch": 1.0204962243797195,
"grad_norm": 7.608390808105469,
"learning_rate": 4.044867413045554e-07,
"loss": 0.6445322632789612,
"step": 946
},
{
"epoch": 1.022653721682848,
"grad_norm": 3.295640230178833,
"learning_rate": 4.0404860798631497e-07,
"loss": 0.5995020866394043,
"step": 948
},
{
"epoch": 1.0248112189859762,
"grad_norm": 3.5857110023498535,
"learning_rate": 4.0360974424236045e-07,
"loss": 0.5838800072669983,
"step": 950
},
{
"epoch": 1.0269687162891046,
"grad_norm": 2.5072519779205322,
"learning_rate": 4.031701525566968e-07,
"loss": 0.550299882888794,
"step": 952
},
{
"epoch": 1.029126213592233,
"grad_norm": 2.477893352508545,
"learning_rate": 4.0272983541744906e-07,
"loss": 0.48044463992118835,
"step": 954
},
{
"epoch": 1.0312837108953614,
"grad_norm": 2.064870595932007,
"learning_rate": 4.0228879531684825e-07,
"loss": 0.607839822769165,
"step": 956
},
{
"epoch": 1.0334412081984898,
"grad_norm": 1.39699125289917,
"learning_rate": 4.018470347512177e-07,
"loss": 0.6562771797180176,
"step": 958
},
{
"epoch": 1.035598705501618,
"grad_norm": 4.258260250091553,
"learning_rate": 4.0140455622095833e-07,
"loss": 0.6375301480293274,
"step": 960
},
{
"epoch": 1.0377562028047465,
"grad_norm": 2.1129860877990723,
"learning_rate": 4.0096136223053503e-07,
"loss": 0.5195407271385193,
"step": 962
},
{
"epoch": 1.0399137001078749,
"grad_norm": 4.592264652252197,
"learning_rate": 4.005174552884621e-07,
"loss": 0.47522222995758057,
"step": 964
},
{
"epoch": 1.0420711974110033,
"grad_norm": 1.5298984050750732,
"learning_rate": 4.0007283790728937e-07,
"loss": 0.45716097950935364,
"step": 966
},
{
"epoch": 1.0442286947141317,
"grad_norm": 2.2780728340148926,
"learning_rate": 3.996275126035877e-07,
"loss": 0.6386222243309021,
"step": 968
},
{
"epoch": 1.04638619201726,
"grad_norm": 39.363914489746094,
"learning_rate": 3.9918148189793473e-07,
"loss": 0.6134737133979797,
"step": 970
},
{
"epoch": 1.0485436893203883,
"grad_norm": 1.4374394416809082,
"learning_rate": 3.98734748314901e-07,
"loss": 0.49869057536125183,
"step": 972
},
{
"epoch": 1.0507011866235167,
"grad_norm": 2.8738322257995605,
"learning_rate": 3.9828731438303513e-07,
"loss": 0.4262846112251282,
"step": 974
},
{
"epoch": 1.0528586839266452,
"grad_norm": 4.88612174987793,
"learning_rate": 3.978391826348501e-07,
"loss": 0.44011008739471436,
"step": 976
},
{
"epoch": 1.0550161812297734,
"grad_norm": 1.591728925704956,
"learning_rate": 3.973903556068082e-07,
"loss": 0.579682469367981,
"step": 978
},
{
"epoch": 1.0571736785329018,
"grad_norm": 2.144808769226074,
"learning_rate": 3.9694083583930734e-07,
"loss": 0.6116613149642944,
"step": 980
},
{
"epoch": 1.0593311758360302,
"grad_norm": 4.2694525718688965,
"learning_rate": 3.964906258766663e-07,
"loss": 0.5095481872558594,
"step": 982
},
{
"epoch": 1.0614886731391586,
"grad_norm": 1.4594628810882568,
"learning_rate": 3.960397282671104e-07,
"loss": 0.49267393350601196,
"step": 984
},
{
"epoch": 1.063646170442287,
"grad_norm": 2.070873737335205,
"learning_rate": 3.9558814556275705e-07,
"loss": 0.5884362459182739,
"step": 986
},
{
"epoch": 1.0658036677454152,
"grad_norm": 1.6621878147125244,
"learning_rate": 3.9513588031960164e-07,
"loss": 0.5078074336051941,
"step": 988
},
{
"epoch": 1.0679611650485437,
"grad_norm": 1.82585608959198,
"learning_rate": 3.946829350975024e-07,
"loss": 0.5152098536491394,
"step": 990
},
{
"epoch": 1.070118662351672,
"grad_norm": 1.609985589981079,
"learning_rate": 3.942293124601664e-07,
"loss": 0.5379571318626404,
"step": 992
},
{
"epoch": 1.0722761596548005,
"grad_norm": 3.880840301513672,
"learning_rate": 3.937750149751353e-07,
"loss": 0.53695148229599,
"step": 994
},
{
"epoch": 1.074433656957929,
"grad_norm": 1.5093027353286743,
"learning_rate": 3.9332004521376976e-07,
"loss": 0.6305853128433228,
"step": 996
},
{
"epoch": 1.0765911542610571,
"grad_norm": 10.337991714477539,
"learning_rate": 3.9286440575123625e-07,
"loss": 0.46823710203170776,
"step": 998
},
{
"epoch": 1.0787486515641855,
"grad_norm": 6.093511581420898,
"learning_rate": 3.9240809916649146e-07,
"loss": 0.7002033591270447,
"step": 1000
},
{
"epoch": 1.080906148867314,
"grad_norm": 1.6978514194488525,
"learning_rate": 3.919511280422681e-07,
"loss": 0.4546293318271637,
"step": 1002
},
{
"epoch": 1.0830636461704424,
"grad_norm": 1.1726937294006348,
"learning_rate": 3.914934949650603e-07,
"loss": 0.6637395620346069,
"step": 1004
},
{
"epoch": 1.0852211434735706,
"grad_norm": 1.5652375221252441,
"learning_rate": 3.910352025251087e-07,
"loss": 0.5811644196510315,
"step": 1006
},
{
"epoch": 1.087378640776699,
"grad_norm": 1.6414304971694946,
"learning_rate": 3.905762533163863e-07,
"loss": 0.595992386341095,
"step": 1008
},
{
"epoch": 1.0895361380798274,
"grad_norm": 2.038053512573242,
"learning_rate": 3.9011664993658315e-07,
"loss": 0.5396429896354675,
"step": 1010
},
{
"epoch": 1.0916936353829558,
"grad_norm": 1.9649733304977417,
"learning_rate": 3.8965639498709213e-07,
"loss": 0.5478025674819946,
"step": 1012
},
{
"epoch": 1.0938511326860842,
"grad_norm": 3.463732957839966,
"learning_rate": 3.891954910729942e-07,
"loss": 0.49627092480659485,
"step": 1014
},
{
"epoch": 1.0960086299892124,
"grad_norm": 3.943431854248047,
"learning_rate": 3.8873394080304304e-07,
"loss": 0.6524069905281067,
"step": 1016
},
{
"epoch": 1.0981661272923409,
"grad_norm": 3.5497028827667236,
"learning_rate": 3.8827174678965144e-07,
"loss": 0.5223618149757385,
"step": 1018
},
{
"epoch": 1.1003236245954693,
"grad_norm": 7.74370002746582,
"learning_rate": 3.878089116488752e-07,
"loss": 0.5389726161956787,
"step": 1020
},
{
"epoch": 1.1024811218985977,
"grad_norm": 2.1320722103118896,
"learning_rate": 3.873454380003992e-07,
"loss": 0.5913807153701782,
"step": 1022
},
{
"epoch": 1.104638619201726,
"grad_norm": 2.8024096488952637,
"learning_rate": 3.8688132846752246e-07,
"loss": 0.5501705408096313,
"step": 1024
},
{
"epoch": 1.1067961165048543,
"grad_norm": 1.3087513446807861,
"learning_rate": 3.864165856771429e-07,
"loss": 0.5886018872261047,
"step": 1026
},
{
"epoch": 1.1089536138079827,
"grad_norm": 1.2707762718200684,
"learning_rate": 3.85951212259743e-07,
"loss": 0.5950880646705627,
"step": 1028
},
{
"epoch": 1.1111111111111112,
"grad_norm": 5.31006383895874,
"learning_rate": 3.8548521084937434e-07,
"loss": 0.4451335668563843,
"step": 1030
},
{
"epoch": 1.1132686084142396,
"grad_norm": 3.055915117263794,
"learning_rate": 3.8501858408364333e-07,
"loss": 0.47768980264663696,
"step": 1032
},
{
"epoch": 1.1154261057173678,
"grad_norm": 1.2441359758377075,
"learning_rate": 3.845513346036957e-07,
"loss": 0.6385471820831299,
"step": 1034
},
{
"epoch": 1.1175836030204962,
"grad_norm": 6.279306411743164,
"learning_rate": 3.840834650542018e-07,
"loss": 0.45993420481681824,
"step": 1036
},
{
"epoch": 1.1197411003236246,
"grad_norm": 1.52280592918396,
"learning_rate": 3.836149780833418e-07,
"loss": 0.5168780088424683,
"step": 1038
},
{
"epoch": 1.121898597626753,
"grad_norm": 2.0793957710266113,
"learning_rate": 3.8314587634279027e-07,
"loss": 0.5576176643371582,
"step": 1040
},
{
"epoch": 1.1240560949298812,
"grad_norm": 1.4773210287094116,
"learning_rate": 3.8267616248770165e-07,
"loss": 0.5373958945274353,
"step": 1042
},
{
"epoch": 1.1262135922330097,
"grad_norm": 2.390502691268921,
"learning_rate": 3.8220583917669486e-07,
"loss": 0.5401830673217773,
"step": 1044
},
{
"epoch": 1.128371089536138,
"grad_norm": 1.2080022096633911,
"learning_rate": 3.8173490907183854e-07,
"loss": 0.5357274413108826,
"step": 1046
},
{
"epoch": 1.1305285868392665,
"grad_norm": 5.848990440368652,
"learning_rate": 3.8126337483863565e-07,
"loss": 0.5787194967269897,
"step": 1048
},
{
"epoch": 1.132686084142395,
"grad_norm": 2.1143975257873535,
"learning_rate": 3.8079123914600874e-07,
"loss": 0.5402184724807739,
"step": 1050
},
{
"epoch": 1.134843581445523,
"grad_norm": 1.5472838878631592,
"learning_rate": 3.8031850466628446e-07,
"loss": 0.5939875245094299,
"step": 1052
},
{
"epoch": 1.1370010787486515,
"grad_norm": 1.5804020166397095,
"learning_rate": 3.798451740751789e-07,
"loss": 0.5679658651351929,
"step": 1054
},
{
"epoch": 1.13915857605178,
"grad_norm": 1.2764010429382324,
"learning_rate": 3.79371250051782e-07,
"loss": 0.5673144459724426,
"step": 1056
},
{
"epoch": 1.1413160733549084,
"grad_norm": 2.0156118869781494,
"learning_rate": 3.788967352785426e-07,
"loss": 0.6328020095825195,
"step": 1058
},
{
"epoch": 1.1434735706580366,
"grad_norm": 9.465262413024902,
"learning_rate": 3.7842163244125336e-07,
"loss": 0.4857198894023895,
"step": 1060
},
{
"epoch": 1.145631067961165,
"grad_norm": 1.7701698541641235,
"learning_rate": 3.7794594422903524e-07,
"loss": 0.2504948377609253,
"step": 1062
},
{
"epoch": 1.1477885652642934,
"grad_norm": 3.641615629196167,
"learning_rate": 3.7746967333432267e-07,
"loss": 0.5472912192344666,
"step": 1064
},
{
"epoch": 1.1499460625674218,
"grad_norm": 2.0676019191741943,
"learning_rate": 3.769928224528479e-07,
"loss": 0.6364511847496033,
"step": 1066
},
{
"epoch": 1.1521035598705502,
"grad_norm": 4.401365756988525,
"learning_rate": 3.7651539428362613e-07,
"loss": 0.5439355969429016,
"step": 1068
},
{
"epoch": 1.1542610571736784,
"grad_norm": 5.850034236907959,
"learning_rate": 3.7603739152894e-07,
"loss": 0.6640523076057434,
"step": 1070
},
{
"epoch": 1.1564185544768069,
"grad_norm": 1.1790587902069092,
"learning_rate": 3.755588168943242e-07,
"loss": 0.5868851542472839,
"step": 1072
},
{
"epoch": 1.1585760517799353,
"grad_norm": 1.469138503074646,
"learning_rate": 3.7507967308855054e-07,
"loss": 0.5512825846672058,
"step": 1074
},
{
"epoch": 1.1607335490830637,
"grad_norm": 1.2130903005599976,
"learning_rate": 3.7459996282361243e-07,
"loss": 0.5239831805229187,
"step": 1076
},
{
"epoch": 1.162891046386192,
"grad_norm": 7.653203964233398,
"learning_rate": 3.741196888147091e-07,
"loss": 0.6439744234085083,
"step": 1078
},
{
"epoch": 1.1650485436893203,
"grad_norm": 1.5135337114334106,
"learning_rate": 3.7363885378023103e-07,
"loss": 0.5788765549659729,
"step": 1080
},
{
"epoch": 1.1672060409924487,
"grad_norm": 3.182453155517578,
"learning_rate": 3.731574604417439e-07,
"loss": 0.5617559552192688,
"step": 1082
},
{
"epoch": 1.1693635382955772,
"grad_norm": 2.3747951984405518,
"learning_rate": 3.7267551152397357e-07,
"loss": 0.5370228886604309,
"step": 1084
},
{
"epoch": 1.1715210355987056,
"grad_norm": 1.1393845081329346,
"learning_rate": 3.721930097547905e-07,
"loss": 0.6155055165290833,
"step": 1086
},
{
"epoch": 1.173678532901834,
"grad_norm": 2.894418954849243,
"learning_rate": 3.717099578651941e-07,
"loss": 0.6374943256378174,
"step": 1088
},
{
"epoch": 1.1758360302049622,
"grad_norm": 2.416472911834717,
"learning_rate": 3.71226358589298e-07,
"loss": 0.6947451829910278,
"step": 1090
},
{
"epoch": 1.1779935275080906,
"grad_norm": 1.3670376539230347,
"learning_rate": 3.7074221466431373e-07,
"loss": 0.5406089425086975,
"step": 1092
},
{
"epoch": 1.180151024811219,
"grad_norm": 1.2521384954452515,
"learning_rate": 3.702575288305355e-07,
"loss": 0.5395722389221191,
"step": 1094
},
{
"epoch": 1.1823085221143474,
"grad_norm": 1.9795541763305664,
"learning_rate": 3.697723038313251e-07,
"loss": 0.29258376359939575,
"step": 1096
},
{
"epoch": 1.1844660194174756,
"grad_norm": 1.5469352006912231,
"learning_rate": 3.692865424130957e-07,
"loss": 0.28986942768096924,
"step": 1098
},
{
"epoch": 1.186623516720604,
"grad_norm": 2.116645574569702,
"learning_rate": 3.6880024732529693e-07,
"loss": 0.4253014922142029,
"step": 1100
},
{
"epoch": 1.1887810140237325,
"grad_norm": 1.2636767625808716,
"learning_rate": 3.683134213203987e-07,
"loss": 0.3352068066596985,
"step": 1102
},
{
"epoch": 1.190938511326861,
"grad_norm": 4.438840866088867,
"learning_rate": 3.6782606715387635e-07,
"loss": 0.5909640192985535,
"step": 1104
},
{
"epoch": 1.1930960086299893,
"grad_norm": 2.9154276847839355,
"learning_rate": 3.673381875841945e-07,
"loss": 0.6895350217819214,
"step": 1106
},
{
"epoch": 1.1952535059331175,
"grad_norm": 1.7983450889587402,
"learning_rate": 3.668497853727913e-07,
"loss": 0.6385772824287415,
"step": 1108
},
{
"epoch": 1.197411003236246,
"grad_norm": 3.447049140930176,
"learning_rate": 3.6636086328406374e-07,
"loss": 0.6397103071212769,
"step": 1110
},
{
"epoch": 1.1995685005393744,
"grad_norm": 3.8200409412384033,
"learning_rate": 3.6587142408535054e-07,
"loss": 0.5476571321487427,
"step": 1112
},
{
"epoch": 1.2017259978425028,
"grad_norm": 1.2859944105148315,
"learning_rate": 3.6538147054691815e-07,
"loss": 0.5042168498039246,
"step": 1114
},
{
"epoch": 1.203883495145631,
"grad_norm": 1.283574104309082,
"learning_rate": 3.648910054419435e-07,
"loss": 0.5920245051383972,
"step": 1116
},
{
"epoch": 1.2060409924487594,
"grad_norm": 1.5358695983886719,
"learning_rate": 3.6440003154649953e-07,
"loss": 0.4955591559410095,
"step": 1118
},
{
"epoch": 1.2081984897518878,
"grad_norm": 1.0370635986328125,
"learning_rate": 3.639085516395387e-07,
"loss": 0.6023776531219482,
"step": 1120
},
{
"epoch": 1.2103559870550162,
"grad_norm": 3.8796122074127197,
"learning_rate": 3.6341656850287774e-07,
"loss": 0.762068510055542,
"step": 1122
},
{
"epoch": 1.2125134843581447,
"grad_norm": 3.400327444076538,
"learning_rate": 3.629240849211814e-07,
"loss": 0.6205697059631348,
"step": 1124
},
{
"epoch": 1.2146709816612729,
"grad_norm": 2.327786684036255,
"learning_rate": 3.6243110368194737e-07,
"loss": 0.5409681797027588,
"step": 1126
},
{
"epoch": 1.2168284789644013,
"grad_norm": 3.4878270626068115,
"learning_rate": 3.619376275754897e-07,
"loss": 0.5776150226593018,
"step": 1128
},
{
"epoch": 1.2189859762675297,
"grad_norm": 3.1736040115356445,
"learning_rate": 3.614436593949239e-07,
"loss": 0.6846837997436523,
"step": 1130
},
{
"epoch": 1.2211434735706581,
"grad_norm": 2.631258964538574,
"learning_rate": 3.609492019361503e-07,
"loss": 0.415913850069046,
"step": 1132
},
{
"epoch": 1.2233009708737863,
"grad_norm": 2.7962353229522705,
"learning_rate": 3.604542579978387e-07,
"loss": 0.4860919415950775,
"step": 1134
},
{
"epoch": 1.2254584681769147,
"grad_norm": 2.561530113220215,
"learning_rate": 3.599588303814125e-07,
"loss": 0.2348194271326065,
"step": 1136
},
{
"epoch": 1.2276159654800431,
"grad_norm": 1.6811720132827759,
"learning_rate": 3.594629218910325e-07,
"loss": 0.5006701946258545,
"step": 1138
},
{
"epoch": 1.2297734627831716,
"grad_norm": 4.786057472229004,
"learning_rate": 3.589665353335817e-07,
"loss": 0.7145098447799683,
"step": 1140
},
{
"epoch": 1.2319309600863,
"grad_norm": 1.3728108406066895,
"learning_rate": 3.584696735186486e-07,
"loss": 0.7182168960571289,
"step": 1142
},
{
"epoch": 1.2340884573894282,
"grad_norm": 2.7093677520751953,
"learning_rate": 3.579723392585119e-07,
"loss": 0.5651712417602539,
"step": 1144
},
{
"epoch": 1.2362459546925566,
"grad_norm": 1.986265778541565,
"learning_rate": 3.574745353681243e-07,
"loss": 0.5348817110061646,
"step": 1146
},
{
"epoch": 1.238403451995685,
"grad_norm": 6.097311496734619,
"learning_rate": 3.5697626466509663e-07,
"loss": 0.577741801738739,
"step": 1148
},
{
"epoch": 1.2405609492988134,
"grad_norm": 11.053370475769043,
"learning_rate": 3.564775299696821e-07,
"loss": 0.5550810098648071,
"step": 1150
},
{
"epoch": 1.2427184466019416,
"grad_norm": 1.4628392457962036,
"learning_rate": 3.5597833410476006e-07,
"loss": 0.620225727558136,
"step": 1152
},
{
"epoch": 1.24487594390507,
"grad_norm": 7.8950276374816895,
"learning_rate": 3.554786798958199e-07,
"loss": 0.607008159160614,
"step": 1154
},
{
"epoch": 1.2470334412081985,
"grad_norm": 1.5595459938049316,
"learning_rate": 3.549785701709456e-07,
"loss": 0.4987570643424988,
"step": 1156
},
{
"epoch": 1.249190938511327,
"grad_norm": 1.6523284912109375,
"learning_rate": 3.544780077607992e-07,
"loss": 0.43261778354644775,
"step": 1158
},
{
"epoch": 1.2513484358144553,
"grad_norm": 12.226542472839355,
"learning_rate": 3.53976995498605e-07,
"loss": 0.6562701463699341,
"step": 1160
},
{
"epoch": 1.2535059331175837,
"grad_norm": 1.1661919355392456,
"learning_rate": 3.534755362201336e-07,
"loss": 0.501526951789856,
"step": 1162
},
{
"epoch": 1.255663430420712,
"grad_norm": 2.927499532699585,
"learning_rate": 3.529736327636856e-07,
"loss": 0.5619246959686279,
"step": 1164
},
{
"epoch": 1.2578209277238404,
"grad_norm": 2.0249011516571045,
"learning_rate": 3.524712879700758e-07,
"loss": 0.5738718509674072,
"step": 1166
},
{
"epoch": 1.2599784250269688,
"grad_norm": 0.44852355122566223,
"learning_rate": 3.5196850468261694e-07,
"loss": 0.5509487390518188,
"step": 1168
},
{
"epoch": 1.262135922330097,
"grad_norm": 2.7338967323303223,
"learning_rate": 3.514652857471038e-07,
"loss": 0.8337733149528503,
"step": 1170
},
{
"epoch": 1.2642934196332254,
"grad_norm": 1.9451134204864502,
"learning_rate": 3.509616340117968e-07,
"loss": 0.650130033493042,
"step": 1172
},
{
"epoch": 1.2664509169363538,
"grad_norm": 1.3618203401565552,
"learning_rate": 3.50457552327406e-07,
"loss": 0.5888417363166809,
"step": 1174
},
{
"epoch": 1.2686084142394822,
"grad_norm": 3.983638048171997,
"learning_rate": 3.499530435470753e-07,
"loss": 0.5166311264038086,
"step": 1176
},
{
"epoch": 1.2707659115426106,
"grad_norm": 0.49615857005119324,
"learning_rate": 3.4944811052636557e-07,
"loss": 0.24507802724838257,
"step": 1178
},
{
"epoch": 1.272923408845739,
"grad_norm": 1.7828303575515747,
"learning_rate": 3.4894275612323937e-07,
"loss": 0.542197585105896,
"step": 1180
},
{
"epoch": 1.2750809061488673,
"grad_norm": 3.6452059745788574,
"learning_rate": 3.4843698319804406e-07,
"loss": 0.4652722179889679,
"step": 1182
},
{
"epoch": 1.2772384034519957,
"grad_norm": 2.0969831943511963,
"learning_rate": 3.479307946134958e-07,
"loss": 0.615992546081543,
"step": 1184
},
{
"epoch": 1.279395900755124,
"grad_norm": 5.83058500289917,
"learning_rate": 3.4742419323466364e-07,
"loss": 0.6636797189712524,
"step": 1186
},
{
"epoch": 1.2815533980582523,
"grad_norm": 1.9127455949783325,
"learning_rate": 3.469171819289529e-07,
"loss": 0.5533426403999329,
"step": 1188
},
{
"epoch": 1.2837108953613807,
"grad_norm": 3.0942981243133545,
"learning_rate": 3.4640976356608925e-07,
"loss": 0.4518588185310364,
"step": 1190
},
{
"epoch": 1.2858683926645091,
"grad_norm": 4.2268218994140625,
"learning_rate": 3.4590194101810225e-07,
"loss": 0.6738635897636414,
"step": 1192
},
{
"epoch": 1.2880258899676376,
"grad_norm": 1.095913290977478,
"learning_rate": 3.453937171593092e-07,
"loss": 0.5171671509742737,
"step": 1194
},
{
"epoch": 1.290183387270766,
"grad_norm": 1.8377701044082642,
"learning_rate": 3.448850948662989e-07,
"loss": 0.4299532175064087,
"step": 1196
},
{
"epoch": 1.2923408845738944,
"grad_norm": 5.051590442657471,
"learning_rate": 3.443760770179152e-07,
"loss": 0.5502167344093323,
"step": 1198
},
{
"epoch": 1.2944983818770226,
"grad_norm": 1.6523767709732056,
"learning_rate": 3.438666664952409e-07,
"loss": 0.6330602765083313,
"step": 1200
},
{
"epoch": 1.296655879180151,
"grad_norm": 1.3870859146118164,
"learning_rate": 3.4335686618158146e-07,
"loss": 0.5067728757858276,
"step": 1202
},
{
"epoch": 1.2988133764832794,
"grad_norm": 3.1465868949890137,
"learning_rate": 3.428466789624484e-07,
"loss": 0.4402804374694824,
"step": 1204
},
{
"epoch": 1.3009708737864076,
"grad_norm": 9.775895118713379,
"learning_rate": 3.4233610772554327e-07,
"loss": 0.6617931127548218,
"step": 1206
},
{
"epoch": 1.303128371089536,
"grad_norm": 6.532571792602539,
"learning_rate": 3.418251553607414e-07,
"loss": 0.5412026047706604,
"step": 1208
},
{
"epoch": 1.3052858683926645,
"grad_norm": 23.66463279724121,
"learning_rate": 3.4131382476007483e-07,
"loss": 0.5621963739395142,
"step": 1210
},
{
"epoch": 1.307443365695793,
"grad_norm": 4.139750957489014,
"learning_rate": 3.408021188177168e-07,
"loss": 0.43633171916007996,
"step": 1212
},
{
"epoch": 1.3096008629989213,
"grad_norm": 1.5540739297866821,
"learning_rate": 3.40290040429965e-07,
"loss": 0.585831880569458,
"step": 1214
},
{
"epoch": 1.3117583603020497,
"grad_norm": 0.7869778871536255,
"learning_rate": 3.397775924952252e-07,
"loss": 0.6663514375686646,
"step": 1216
},
{
"epoch": 1.313915857605178,
"grad_norm": 3.8079118728637695,
"learning_rate": 3.3926477791399466e-07,
"loss": 0.6986393928527832,
"step": 1218
},
{
"epoch": 1.3160733549083063,
"grad_norm": 80.64041137695312,
"learning_rate": 3.3875159958884604e-07,
"loss": 0.2534840703010559,
"step": 1220
},
{
"epoch": 1.3182308522114348,
"grad_norm": 1.4813616275787354,
"learning_rate": 3.382380604244108e-07,
"loss": 0.5039587020874023,
"step": 1222
},
{
"epoch": 1.3203883495145632,
"grad_norm": 1.3485782146453857,
"learning_rate": 3.3772416332736267e-07,
"loss": 0.49135297536849976,
"step": 1224
},
{
"epoch": 1.3225458468176914,
"grad_norm": 2.1019444465637207,
"learning_rate": 3.372099112064016e-07,
"loss": 0.5730096101760864,
"step": 1226
},
{
"epoch": 1.3247033441208198,
"grad_norm": 1.2675708532333374,
"learning_rate": 3.3669530697223666e-07,
"loss": 0.3039630055427551,
"step": 1228
},
{
"epoch": 1.3268608414239482,
"grad_norm": 4.25254487991333,
"learning_rate": 3.3618035353757004e-07,
"loss": 0.5067458748817444,
"step": 1230
},
{
"epoch": 1.3290183387270766,
"grad_norm": 1.8785464763641357,
"learning_rate": 3.3566505381708053e-07,
"loss": 0.592536449432373,
"step": 1232
},
{
"epoch": 1.331175836030205,
"grad_norm": 3.387016773223877,
"learning_rate": 3.351494107274067e-07,
"loss": 0.5786437392234802,
"step": 1234
},
{
"epoch": 1.3333333333333333,
"grad_norm": 3.627840757369995,
"learning_rate": 3.3463342718713093e-07,
"loss": 0.41272541880607605,
"step": 1236
},
{
"epoch": 1.3354908306364617,
"grad_norm": 2.921630620956421,
"learning_rate": 3.3411710611676245e-07,
"loss": 0.5590270757675171,
"step": 1238
},
{
"epoch": 1.33764832793959,
"grad_norm": 1.4048740863800049,
"learning_rate": 3.3360045043872073e-07,
"loss": 0.5370460748672485,
"step": 1240
},
{
"epoch": 1.3398058252427185,
"grad_norm": 1.9135799407958984,
"learning_rate": 3.3308346307731937e-07,
"loss": 0.6795614361763,
"step": 1242
},
{
"epoch": 1.3419633225458467,
"grad_norm": 5.181615829467773,
"learning_rate": 3.325661469587493e-07,
"loss": 0.43978267908096313,
"step": 1244
},
{
"epoch": 1.3441208198489751,
"grad_norm": 2.0108721256256104,
"learning_rate": 3.320485050110623e-07,
"loss": 0.7196254730224609,
"step": 1246
},
{
"epoch": 1.3462783171521036,
"grad_norm": 5.627135753631592,
"learning_rate": 3.3153054016415404e-07,
"loss": 0.6685677170753479,
"step": 1248
},
{
"epoch": 1.348435814455232,
"grad_norm": 1.5753936767578125,
"learning_rate": 3.3101225534974824e-07,
"loss": 0.47921374440193176,
"step": 1250
},
{
"epoch": 1.3505933117583604,
"grad_norm": 1.3769195079803467,
"learning_rate": 3.304936535013796e-07,
"loss": 0.5930690765380859,
"step": 1252
},
{
"epoch": 1.3527508090614886,
"grad_norm": 3.187718391418457,
"learning_rate": 3.2997473755437694e-07,
"loss": 0.443682998418808,
"step": 1254
},
{
"epoch": 1.354908306364617,
"grad_norm": 1.696946144104004,
"learning_rate": 3.294555104458472e-07,
"loss": 0.5157840251922607,
"step": 1256
},
{
"epoch": 1.3570658036677454,
"grad_norm": 1.3455950021743774,
"learning_rate": 3.289359751146585e-07,
"loss": 0.4192175269126892,
"step": 1258
},
{
"epoch": 1.3592233009708738,
"grad_norm": 2.213014602661133,
"learning_rate": 3.2841613450142326e-07,
"loss": 0.5426623821258545,
"step": 1260
},
{
"epoch": 1.361380798274002,
"grad_norm": 1.4176040887832642,
"learning_rate": 3.278959915484822e-07,
"loss": 0.37126630544662476,
"step": 1262
},
{
"epoch": 1.3635382955771305,
"grad_norm": 1.7383960485458374,
"learning_rate": 3.2737554919988713e-07,
"loss": 0.4560404419898987,
"step": 1264
},
{
"epoch": 1.3656957928802589,
"grad_norm": 1.2408937215805054,
"learning_rate": 3.2685481040138437e-07,
"loss": 0.6066496968269348,
"step": 1266
},
{
"epoch": 1.3678532901833873,
"grad_norm": 14.88650131225586,
"learning_rate": 3.2633377810039837e-07,
"loss": 0.5824995636940002,
"step": 1268
},
{
"epoch": 1.3700107874865157,
"grad_norm": 2.481553792953491,
"learning_rate": 3.2581245524601457e-07,
"loss": 0.5986048579216003,
"step": 1270
},
{
"epoch": 1.3721682847896441,
"grad_norm": 6.574549198150635,
"learning_rate": 3.252908447889633e-07,
"loss": 0.6264061331748962,
"step": 1272
},
{
"epoch": 1.3743257820927723,
"grad_norm": 11.117745399475098,
"learning_rate": 3.2476894968160245e-07,
"loss": 0.47890105843544006,
"step": 1274
},
{
"epoch": 1.3764832793959008,
"grad_norm": 2.9714407920837402,
"learning_rate": 3.2424677287790105e-07,
"loss": 0.6954044699668884,
"step": 1276
},
{
"epoch": 1.3786407766990292,
"grad_norm": 4.487089157104492,
"learning_rate": 3.237243173334229e-07,
"loss": 0.3699471354484558,
"step": 1278
},
{
"epoch": 1.3807982740021574,
"grad_norm": 3.5095317363739014,
"learning_rate": 3.232015860053093e-07,
"loss": 0.49049827456474304,
"step": 1280
},
{
"epoch": 1.3829557713052858,
"grad_norm": 3.8155364990234375,
"learning_rate": 3.226785818522622e-07,
"loss": 0.5451414585113525,
"step": 1282
},
{
"epoch": 1.3851132686084142,
"grad_norm": 1.868401050567627,
"learning_rate": 3.221553078345282e-07,
"loss": 0.4515441060066223,
"step": 1284
},
{
"epoch": 1.3872707659115426,
"grad_norm": 7.603207111358643,
"learning_rate": 3.216317669138812e-07,
"loss": 0.6191388964653015,
"step": 1286
},
{
"epoch": 1.389428263214671,
"grad_norm": 8.218348503112793,
"learning_rate": 3.211079620536058e-07,
"loss": 0.43059730529785156,
"step": 1288
},
{
"epoch": 1.3915857605177995,
"grad_norm": 4.354824542999268,
"learning_rate": 3.205838962184804e-07,
"loss": 0.47998175024986267,
"step": 1290
},
{
"epoch": 1.3937432578209277,
"grad_norm": 2.3470609188079834,
"learning_rate": 3.2005957237476073e-07,
"loss": 0.6489396095275879,
"step": 1292
},
{
"epoch": 1.395900755124056,
"grad_norm": 5.762950897216797,
"learning_rate": 3.1953499349016284e-07,
"loss": 0.6003392934799194,
"step": 1294
},
{
"epoch": 1.3980582524271845,
"grad_norm": 0.4735300838947296,
"learning_rate": 3.190101625338461e-07,
"loss": 0.5412855744361877,
"step": 1296
},
{
"epoch": 1.4002157497303127,
"grad_norm": 1.6221954822540283,
"learning_rate": 3.18485082476397e-07,
"loss": 0.452088862657547,
"step": 1298
},
{
"epoch": 1.4023732470334411,
"grad_norm": 5.19826078414917,
"learning_rate": 3.179597562898116e-07,
"loss": 0.45721590518951416,
"step": 1300
},
{
"epoch": 1.4045307443365695,
"grad_norm": 2.296773910522461,
"learning_rate": 3.1743418694747935e-07,
"loss": 0.6805709600448608,
"step": 1302
},
{
"epoch": 1.406688241639698,
"grad_norm": 1.289931058883667,
"learning_rate": 3.169083774241658e-07,
"loss": 0.48374688625335693,
"step": 1304
},
{
"epoch": 1.4088457389428264,
"grad_norm": 2.8533899784088135,
"learning_rate": 3.1638233069599603e-07,
"loss": 0.5229544639587402,
"step": 1306
},
{
"epoch": 1.4110032362459548,
"grad_norm": 6.371025085449219,
"learning_rate": 3.158560497404377e-07,
"loss": 0.5882778763771057,
"step": 1308
},
{
"epoch": 1.413160733549083,
"grad_norm": 1.4979420900344849,
"learning_rate": 3.153295375362843e-07,
"loss": 0.4229152500629425,
"step": 1310
},
{
"epoch": 1.4153182308522114,
"grad_norm": 1.8354501724243164,
"learning_rate": 3.14802797063638e-07,
"loss": 0.5944955945014954,
"step": 1312
},
{
"epoch": 1.4174757281553398,
"grad_norm": 2.193164825439453,
"learning_rate": 3.1427583130389324e-07,
"loss": 0.5871320366859436,
"step": 1314
},
{
"epoch": 1.419633225458468,
"grad_norm": 3.204235315322876,
"learning_rate": 3.137486432397193e-07,
"loss": 0.2334681898355484,
"step": 1316
},
{
"epoch": 1.4217907227615965,
"grad_norm": 4.964504718780518,
"learning_rate": 3.1322123585504395e-07,
"loss": 0.5250051617622375,
"step": 1318
},
{
"epoch": 1.4239482200647249,
"grad_norm": 2.2570719718933105,
"learning_rate": 3.1269361213503643e-07,
"loss": 0.5495631694793701,
"step": 1320
},
{
"epoch": 1.4261057173678533,
"grad_norm": 1.536373496055603,
"learning_rate": 3.121657750660901e-07,
"loss": 0.48065459728240967,
"step": 1322
},
{
"epoch": 1.4282632146709817,
"grad_norm": 4.908086776733398,
"learning_rate": 3.116377276358063e-07,
"loss": 0.41572993993759155,
"step": 1324
},
{
"epoch": 1.4304207119741101,
"grad_norm": 8.478764533996582,
"learning_rate": 3.111094728329767e-07,
"loss": 0.6322842240333557,
"step": 1326
},
{
"epoch": 1.4325782092772383,
"grad_norm": 1.3569598197937012,
"learning_rate": 3.1058101364756684e-07,
"loss": 0.48605573177337646,
"step": 1328
},
{
"epoch": 1.4347357065803668,
"grad_norm": 2.1617515087127686,
"learning_rate": 3.100523530706991e-07,
"loss": 0.5922558307647705,
"step": 1330
},
{
"epoch": 1.4368932038834952,
"grad_norm": 3.862966775894165,
"learning_rate": 3.095234940946358e-07,
"loss": 0.504891574382782,
"step": 1332
},
{
"epoch": 1.4390507011866236,
"grad_norm": 1.3128629922866821,
"learning_rate": 3.089944397127621e-07,
"loss": 0.5910184383392334,
"step": 1334
},
{
"epoch": 1.4412081984897518,
"grad_norm": 1.3905627727508545,
"learning_rate": 3.0846519291956923e-07,
"loss": 0.4852849543094635,
"step": 1336
},
{
"epoch": 1.4433656957928802,
"grad_norm": 4.070714950561523,
"learning_rate": 3.079357567106375e-07,
"loss": 0.609307050704956,
"step": 1338
},
{
"epoch": 1.4455231930960086,
"grad_norm": 1.5611417293548584,
"learning_rate": 3.074061340826193e-07,
"loss": 0.6423069834709167,
"step": 1340
},
{
"epoch": 1.447680690399137,
"grad_norm": 4.069622993469238,
"learning_rate": 3.0687632803322214e-07,
"loss": 0.4610182046890259,
"step": 1342
},
{
"epoch": 1.4498381877022655,
"grad_norm": 1.5368692874908447,
"learning_rate": 3.0634634156119183e-07,
"loss": 0.596781313419342,
"step": 1344
},
{
"epoch": 1.4519956850053937,
"grad_norm": 4.0829572677612305,
"learning_rate": 3.0581617766629525e-07,
"loss": 0.46168115735054016,
"step": 1346
},
{
"epoch": 1.454153182308522,
"grad_norm": 1.1894795894622803,
"learning_rate": 3.052858393493036e-07,
"loss": 0.3813992738723755,
"step": 1348
},
{
"epoch": 1.4563106796116505,
"grad_norm": 7.187083721160889,
"learning_rate": 3.0475532961197525e-07,
"loss": 0.4869483411312103,
"step": 1350
},
{
"epoch": 1.458468176914779,
"grad_norm": 2.1215786933898926,
"learning_rate": 3.042246514570388e-07,
"loss": 0.44144994020462036,
"step": 1352
},
{
"epoch": 1.4606256742179071,
"grad_norm": 1.7207623720169067,
"learning_rate": 3.036938078881764e-07,
"loss": 0.5316063165664673,
"step": 1354
},
{
"epoch": 1.4627831715210355,
"grad_norm": 1.4604369401931763,
"learning_rate": 3.0316280191000595e-07,
"loss": 0.5334872007369995,
"step": 1356
},
{
"epoch": 1.464940668824164,
"grad_norm": 1.4460911750793457,
"learning_rate": 3.0263163652806497e-07,
"loss": 0.5650609135627747,
"step": 1358
},
{
"epoch": 1.4670981661272924,
"grad_norm": 1.443108320236206,
"learning_rate": 3.0210031474879323e-07,
"loss": 0.3324916660785675,
"step": 1360
},
{
"epoch": 1.4692556634304208,
"grad_norm": 1.309964656829834,
"learning_rate": 3.015688395795154e-07,
"loss": 0.6782206892967224,
"step": 1362
},
{
"epoch": 1.4714131607335492,
"grad_norm": 1.0900267362594604,
"learning_rate": 3.010372140284247e-07,
"loss": 0.42707037925720215,
"step": 1364
},
{
"epoch": 1.4735706580366774,
"grad_norm": 1.5196927785873413,
"learning_rate": 3.0050544110456544e-07,
"loss": 0.561892032623291,
"step": 1366
},
{
"epoch": 1.4757281553398058,
"grad_norm": 1.3471835851669312,
"learning_rate": 2.999735238178159e-07,
"loss": 0.5165982842445374,
"step": 1368
},
{
"epoch": 1.4778856526429343,
"grad_norm": 1.6917212009429932,
"learning_rate": 2.9944146517887166e-07,
"loss": 0.42515087127685547,
"step": 1370
},
{
"epoch": 1.4800431499460625,
"grad_norm": 3.3652212619781494,
"learning_rate": 2.989092681992283e-07,
"loss": 0.5667294859886169,
"step": 1372
},
{
"epoch": 1.4822006472491909,
"grad_norm": 6.991846561431885,
"learning_rate": 2.983769358911643e-07,
"loss": 0.4916223883628845,
"step": 1374
},
{
"epoch": 1.4843581445523193,
"grad_norm": 3.88139271736145,
"learning_rate": 2.9784447126772434e-07,
"loss": 0.5577268600463867,
"step": 1376
},
{
"epoch": 1.4865156418554477,
"grad_norm": 1.634870171546936,
"learning_rate": 2.9731187734270173e-07,
"loss": 0.5713073015213013,
"step": 1378
},
{
"epoch": 1.4886731391585761,
"grad_norm": 2.828623056411743,
"learning_rate": 2.967791571306221e-07,
"loss": 0.7753892540931702,
"step": 1380
},
{
"epoch": 1.4908306364617046,
"grad_norm": 2.1929805278778076,
"learning_rate": 2.962463136467253e-07,
"loss": 0.5757138729095459,
"step": 1382
},
{
"epoch": 1.4929881337648327,
"grad_norm": 1.8372981548309326,
"learning_rate": 2.9571334990694927e-07,
"loss": 0.4425245523452759,
"step": 1384
},
{
"epoch": 1.4951456310679612,
"grad_norm": 1.2296621799468994,
"learning_rate": 2.951802689279126e-07,
"loss": 0.6176034808158875,
"step": 1386
},
{
"epoch": 1.4973031283710896,
"grad_norm": 4.398133277893066,
"learning_rate": 2.9464707372689734e-07,
"loss": 0.6378481984138489,
"step": 1388
},
{
"epoch": 1.4994606256742178,
"grad_norm": 2.845616102218628,
"learning_rate": 2.9411376732183206e-07,
"loss": 0.6340577602386475,
"step": 1390
},
{
"epoch": 1.5016181229773462,
"grad_norm": 1.8457330465316772,
"learning_rate": 2.935803527312748e-07,
"loss": 0.41831356287002563,
"step": 1392
},
{
"epoch": 1.5037756202804746,
"grad_norm": 11.14624309539795,
"learning_rate": 2.930468329743959e-07,
"loss": 0.5453674793243408,
"step": 1394
},
{
"epoch": 1.505933117583603,
"grad_norm": 1.1637507677078247,
"learning_rate": 2.9251321107096105e-07,
"loss": 0.6097413301467896,
"step": 1396
},
{
"epoch": 1.5080906148867315,
"grad_norm": 1.4595943689346313,
"learning_rate": 2.91979490041314e-07,
"loss": 0.5867338180541992,
"step": 1398
},
{
"epoch": 1.5102481121898599,
"grad_norm": 8.647173881530762,
"learning_rate": 2.9144567290635956e-07,
"loss": 0.6274332404136658,
"step": 1400
},
{
"epoch": 1.512405609492988,
"grad_norm": 1.433868169784546,
"learning_rate": 2.909117626875466e-07,
"loss": 0.530730664730072,
"step": 1402
},
{
"epoch": 1.5145631067961165,
"grad_norm": 2.0332090854644775,
"learning_rate": 2.903777624068507e-07,
"loss": 0.38328850269317627,
"step": 1404
},
{
"epoch": 1.516720604099245,
"grad_norm": 2.187605381011963,
"learning_rate": 2.8984367508675735e-07,
"loss": 0.507274866104126,
"step": 1406
},
{
"epoch": 1.5188781014023731,
"grad_norm": 2.2323250770568848,
"learning_rate": 2.8930950375024444e-07,
"loss": 0.47206202149391174,
"step": 1408
},
{
"epoch": 1.5210355987055015,
"grad_norm": 3.1241204738616943,
"learning_rate": 2.8877525142076584e-07,
"loss": 0.45956486463546753,
"step": 1410
},
{
"epoch": 1.52319309600863,
"grad_norm": 1.791256070137024,
"learning_rate": 2.882409211222335e-07,
"loss": 0.29296764731407166,
"step": 1412
},
{
"epoch": 1.5253505933117584,
"grad_norm": 4.025422096252441,
"learning_rate": 2.8770651587900075e-07,
"loss": 0.471237450838089,
"step": 1414
},
{
"epoch": 1.5275080906148868,
"grad_norm": 1.249983787536621,
"learning_rate": 2.8717203871584504e-07,
"loss": 0.5922088027000427,
"step": 1416
},
{
"epoch": 1.5296655879180152,
"grad_norm": 5.017333030700684,
"learning_rate": 2.866374926579512e-07,
"loss": 0.4936787188053131,
"step": 1418
},
{
"epoch": 1.5318230852211436,
"grad_norm": 1.736030101776123,
"learning_rate": 2.8610288073089363e-07,
"loss": 0.5232613682746887,
"step": 1420
},
{
"epoch": 1.5339805825242718,
"grad_norm": 4.345666885375977,
"learning_rate": 2.855682059606196e-07,
"loss": 0.5884013772010803,
"step": 1422
},
{
"epoch": 1.5361380798274002,
"grad_norm": 13.682802200317383,
"learning_rate": 2.850334713734325e-07,
"loss": 0.6141678094863892,
"step": 1424
},
{
"epoch": 1.5382955771305284,
"grad_norm": 1.5001555681228638,
"learning_rate": 2.844986799959738e-07,
"loss": 0.41682037711143494,
"step": 1426
},
{
"epoch": 1.5404530744336569,
"grad_norm": 1.6063748598098755,
"learning_rate": 2.839638348552067e-07,
"loss": 0.5947140455245972,
"step": 1428
},
{
"epoch": 1.5426105717367853,
"grad_norm": 1.9265503883361816,
"learning_rate": 2.8342893897839855e-07,
"loss": 0.504668653011322,
"step": 1430
},
{
"epoch": 1.5447680690399137,
"grad_norm": 2.9574995040893555,
"learning_rate": 2.828939953931038e-07,
"loss": 0.43785667419433594,
"step": 1432
},
{
"epoch": 1.5469255663430421,
"grad_norm": 1.5642776489257812,
"learning_rate": 2.823590071271472e-07,
"loss": 0.42886197566986084,
"step": 1434
},
{
"epoch": 1.5490830636461705,
"grad_norm": 0.6376549005508423,
"learning_rate": 2.818239772086063e-07,
"loss": 0.41987836360931396,
"step": 1436
},
{
"epoch": 1.551240560949299,
"grad_norm": 3.890024423599243,
"learning_rate": 2.8128890866579406e-07,
"loss": 0.5718374252319336,
"step": 1438
},
{
"epoch": 1.5533980582524272,
"grad_norm": 3.199538230895996,
"learning_rate": 2.807538045272427e-07,
"loss": 0.6301546096801758,
"step": 1440
},
{
"epoch": 1.5555555555555556,
"grad_norm": 2.968087911605835,
"learning_rate": 2.8021866782168547e-07,
"loss": 0.5787625908851624,
"step": 1442
},
{
"epoch": 1.5577130528586838,
"grad_norm": 4.71113920211792,
"learning_rate": 2.796835015780398e-07,
"loss": 0.5471571087837219,
"step": 1444
},
{
"epoch": 1.5598705501618122,
"grad_norm": 3.893944025039673,
"learning_rate": 2.79148308825391e-07,
"loss": 0.4724005162715912,
"step": 1446
},
{
"epoch": 1.5620280474649406,
"grad_norm": 0.665489673614502,
"learning_rate": 2.7861309259297354e-07,
"loss": 0.6328169107437134,
"step": 1448
},
{
"epoch": 1.564185544768069,
"grad_norm": 1.5428534746170044,
"learning_rate": 2.780778559101556e-07,
"loss": 0.4981670379638672,
"step": 1450
},
{
"epoch": 1.5663430420711975,
"grad_norm": 1.2379951477050781,
"learning_rate": 2.7754260180642046e-07,
"loss": 0.6006782054901123,
"step": 1452
},
{
"epoch": 1.5685005393743259,
"grad_norm": 1.8184620141983032,
"learning_rate": 2.770073333113504e-07,
"loss": 0.4560186564922333,
"step": 1454
},
{
"epoch": 1.5706580366774543,
"grad_norm": 7.522675037384033,
"learning_rate": 2.7647205345460906e-07,
"loss": 0.611346423625946,
"step": 1456
},
{
"epoch": 1.5728155339805825,
"grad_norm": 1.3059622049331665,
"learning_rate": 2.7593676526592423e-07,
"loss": 0.2933533191680908,
"step": 1458
},
{
"epoch": 1.574973031283711,
"grad_norm": 3.4842753410339355,
"learning_rate": 2.7540147177507123e-07,
"loss": 0.5341723561286926,
"step": 1460
},
{
"epoch": 1.577130528586839,
"grad_norm": 5.351451396942139,
"learning_rate": 2.74866176011855e-07,
"loss": 0.6091599464416504,
"step": 1462
},
{
"epoch": 1.5792880258899675,
"grad_norm": 2.680004119873047,
"learning_rate": 2.743308810060935e-07,
"loss": 0.7066933512687683,
"step": 1464
},
{
"epoch": 1.581445523193096,
"grad_norm": 2.588871955871582,
"learning_rate": 2.737955897876005e-07,
"loss": 0.6565461754798889,
"step": 1466
},
{
"epoch": 1.5836030204962244,
"grad_norm": 1.496903896331787,
"learning_rate": 2.732603053861681e-07,
"loss": 0.47990620136260986,
"step": 1468
},
{
"epoch": 1.5857605177993528,
"grad_norm": 6.212797164916992,
"learning_rate": 2.7272503083155004e-07,
"loss": 0.4671979546546936,
"step": 1470
},
{
"epoch": 1.5879180151024812,
"grad_norm": 4.061429977416992,
"learning_rate": 2.7218976915344416e-07,
"loss": 0.6535285711288452,
"step": 1472
},
{
"epoch": 1.5900755124056096,
"grad_norm": 1.503240942955017,
"learning_rate": 2.7165452338147555e-07,
"loss": 0.5079244375228882,
"step": 1474
},
{
"epoch": 1.5922330097087378,
"grad_norm": 3.7657103538513184,
"learning_rate": 2.7111929654517925e-07,
"loss": 0.6188565492630005,
"step": 1476
},
{
"epoch": 1.5943905070118662,
"grad_norm": 1.6457622051239014,
"learning_rate": 2.7058409167398305e-07,
"loss": 0.5721461772918701,
"step": 1478
},
{
"epoch": 1.5965480043149944,
"grad_norm": 2.564640760421753,
"learning_rate": 2.7004891179719044e-07,
"loss": 0.594935417175293,
"step": 1480
},
{
"epoch": 1.5987055016181229,
"grad_norm": 1.901548147201538,
"learning_rate": 2.695137599439635e-07,
"loss": 0.5292646884918213,
"step": 1482
},
{
"epoch": 1.6008629989212513,
"grad_norm": 2.8939266204833984,
"learning_rate": 2.689786391433055e-07,
"loss": 0.3793540894985199,
"step": 1484
},
{
"epoch": 1.6030204962243797,
"grad_norm": 1.6566553115844727,
"learning_rate": 2.6844355242404434e-07,
"loss": 0.6384937167167664,
"step": 1486
},
{
"epoch": 1.6051779935275081,
"grad_norm": 1.6839215755462646,
"learning_rate": 2.6790850281481455e-07,
"loss": 0.5815557837486267,
"step": 1488
},
{
"epoch": 1.6073354908306365,
"grad_norm": 2.1547319889068604,
"learning_rate": 2.6737349334404086e-07,
"loss": 0.4502698481082916,
"step": 1490
},
{
"epoch": 1.609492988133765,
"grad_norm": 6.962564945220947,
"learning_rate": 2.66838527039921e-07,
"loss": 0.5677059888839722,
"step": 1492
},
{
"epoch": 1.6116504854368932,
"grad_norm": 2.3401830196380615,
"learning_rate": 2.663036069304079e-07,
"loss": 0.7475476264953613,
"step": 1494
},
{
"epoch": 1.6138079827400216,
"grad_norm": 3.999359130859375,
"learning_rate": 2.657687360431935e-07,
"loss": 0.6050864458084106,
"step": 1496
},
{
"epoch": 1.61596548004315,
"grad_norm": 1.3075038194656372,
"learning_rate": 2.6523391740569074e-07,
"loss": 0.5616152286529541,
"step": 1498
},
{
"epoch": 1.6181229773462782,
"grad_norm": 1.1486589908599854,
"learning_rate": 2.646991540450172e-07,
"loss": 0.5269895792007446,
"step": 1500
},
{
"epoch": 1.6202804746494066,
"grad_norm": 7.192336082458496,
"learning_rate": 2.6416444898797716e-07,
"loss": 0.47785210609436035,
"step": 1502
},
{
"epoch": 1.622437971952535,
"grad_norm": 1.773577332496643,
"learning_rate": 2.6362980526104536e-07,
"loss": 0.3188018500804901,
"step": 1504
},
{
"epoch": 1.6245954692556634,
"grad_norm": 4.216702938079834,
"learning_rate": 2.630952258903491e-07,
"loss": 0.5588706135749817,
"step": 1506
},
{
"epoch": 1.6267529665587919,
"grad_norm": 1.6883203983306885,
"learning_rate": 2.6256071390165147e-07,
"loss": 0.3531300723552704,
"step": 1508
},
{
"epoch": 1.6289104638619203,
"grad_norm": 1.359221339225769,
"learning_rate": 2.620262723203342e-07,
"loss": 0.3672279715538025,
"step": 1510
},
{
"epoch": 1.6310679611650487,
"grad_norm": 8.669332504272461,
"learning_rate": 2.6149190417138057e-07,
"loss": 0.6095560193061829,
"step": 1512
},
{
"epoch": 1.633225458468177,
"grad_norm": 3.492126226425171,
"learning_rate": 2.609576124793581e-07,
"loss": 0.41963210701942444,
"step": 1514
},
{
"epoch": 1.6353829557713053,
"grad_norm": 3.135106086730957,
"learning_rate": 2.604234002684016e-07,
"loss": 0.4734860360622406,
"step": 1516
},
{
"epoch": 1.6375404530744335,
"grad_norm": 1.9704272747039795,
"learning_rate": 2.5988927056219613e-07,
"loss": 0.596852719783783,
"step": 1518
},
{
"epoch": 1.639697950377562,
"grad_norm": 2.8910233974456787,
"learning_rate": 2.593552263839596e-07,
"loss": 0.643505871295929,
"step": 1520
},
{
"epoch": 1.6418554476806904,
"grad_norm": 2.2476956844329834,
"learning_rate": 2.588212707564259e-07,
"loss": 0.4490068554878235,
"step": 1522
},
{
"epoch": 1.6440129449838188,
"grad_norm": 6.380528926849365,
"learning_rate": 2.582874067018278e-07,
"loss": 0.48139023780822754,
"step": 1524
},
{
"epoch": 1.6461704422869472,
"grad_norm": 2.361678123474121,
"learning_rate": 2.577536372418795e-07,
"loss": 0.4531154930591583,
"step": 1526
},
{
"epoch": 1.6483279395900756,
"grad_norm": 1.433486819267273,
"learning_rate": 2.572199653977602e-07,
"loss": 0.5615776181221008,
"step": 1528
},
{
"epoch": 1.650485436893204,
"grad_norm": 1.2627114057540894,
"learning_rate": 2.5668639419009606e-07,
"loss": 0.5969760417938232,
"step": 1530
},
{
"epoch": 1.6526429341963322,
"grad_norm": 6.023662090301514,
"learning_rate": 2.5615292663894406e-07,
"loss": 0.7165044546127319,
"step": 1532
},
{
"epoch": 1.6548004314994607,
"grad_norm": 3.729762315750122,
"learning_rate": 2.556195657637744e-07,
"loss": 0.5139379501342773,
"step": 1534
},
{
"epoch": 1.6569579288025889,
"grad_norm": 3.813971996307373,
"learning_rate": 2.5508631458345325e-07,
"loss": 0.40219447016716003,
"step": 1536
},
{
"epoch": 1.6591154261057173,
"grad_norm": 2.2718698978424072,
"learning_rate": 2.545531761162263e-07,
"loss": 0.6281888484954834,
"step": 1538
},
{
"epoch": 1.6612729234088457,
"grad_norm": 1.6029448509216309,
"learning_rate": 2.540201533797007e-07,
"loss": 0.5198391675949097,
"step": 1540
},
{
"epoch": 1.6634304207119741,
"grad_norm": 1.972841501235962,
"learning_rate": 2.5348724939082916e-07,
"loss": 0.5897455811500549,
"step": 1542
},
{
"epoch": 1.6655879180151025,
"grad_norm": 1.6188420057296753,
"learning_rate": 2.5295446716589194e-07,
"loss": 0.36811563372612,
"step": 1544
},
{
"epoch": 1.667745415318231,
"grad_norm": 2.6556005477905273,
"learning_rate": 2.5242180972048e-07,
"loss": 0.48183539509773254,
"step": 1546
},
{
"epoch": 1.6699029126213594,
"grad_norm": 1.075056791305542,
"learning_rate": 2.5188928006947846e-07,
"loss": 0.6169477105140686,
"step": 1548
},
{
"epoch": 1.6720604099244876,
"grad_norm": 4.6703009605407715,
"learning_rate": 2.513568812270487e-07,
"loss": 0.481448233127594,
"step": 1550
},
{
"epoch": 1.674217907227616,
"grad_norm": 2.274782180786133,
"learning_rate": 2.5082461620661196e-07,
"loss": 0.6557754874229431,
"step": 1552
},
{
"epoch": 1.6763754045307442,
"grad_norm": 1.1385339498519897,
"learning_rate": 2.502924880208318e-07,
"loss": 0.4550181031227112,
"step": 1554
},
{
"epoch": 1.6785329018338726,
"grad_norm": 9.544242858886719,
"learning_rate": 2.497604996815976e-07,
"loss": 0.5606685876846313,
"step": 1556
},
{
"epoch": 1.680690399137001,
"grad_norm": 23.09193229675293,
"learning_rate": 2.4922865420000693e-07,
"loss": 0.4275263547897339,
"step": 1558
},
{
"epoch": 1.6828478964401294,
"grad_norm": 1.2717275619506836,
"learning_rate": 2.486969545863489e-07,
"loss": 0.604001522064209,
"step": 1560
},
{
"epoch": 1.6850053937432579,
"grad_norm": 3.0152664184570312,
"learning_rate": 2.4816540385008696e-07,
"loss": 0.5382636189460754,
"step": 1562
},
{
"epoch": 1.6871628910463863,
"grad_norm": 2.152043581008911,
"learning_rate": 2.4763400499984184e-07,
"loss": 0.48857036232948303,
"step": 1564
},
{
"epoch": 1.6893203883495147,
"grad_norm": 1.6351813077926636,
"learning_rate": 2.471027610433748e-07,
"loss": 0.604580819606781,
"step": 1566
},
{
"epoch": 1.691477885652643,
"grad_norm": 1.648799180984497,
"learning_rate": 2.465716749875701e-07,
"loss": 0.5242129564285278,
"step": 1568
},
{
"epoch": 1.6936353829557713,
"grad_norm": 1.386277198791504,
"learning_rate": 2.4604074983841853e-07,
"loss": 0.5764685869216919,
"step": 1570
},
{
"epoch": 1.6957928802588995,
"grad_norm": 1.8634097576141357,
"learning_rate": 2.4550998860099993e-07,
"loss": 0.45359018445014954,
"step": 1572
},
{
"epoch": 1.697950377562028,
"grad_norm": 5.895047187805176,
"learning_rate": 2.4497939427946654e-07,
"loss": 0.6302123069763184,
"step": 1574
},
{
"epoch": 1.7001078748651564,
"grad_norm": 4.849222183227539,
"learning_rate": 2.444489698770256e-07,
"loss": 0.5112524032592773,
"step": 1576
},
{
"epoch": 1.7022653721682848,
"grad_norm": 4.763610363006592,
"learning_rate": 2.43918718395923e-07,
"loss": 0.507486879825592,
"step": 1578
},
{
"epoch": 1.7044228694714132,
"grad_norm": 1.9259899854660034,
"learning_rate": 2.4338864283742554e-07,
"loss": 0.5151503086090088,
"step": 1580
},
{
"epoch": 1.7065803667745416,
"grad_norm": 1.45798659324646,
"learning_rate": 2.428587462018044e-07,
"loss": 0.578480064868927,
"step": 1582
},
{
"epoch": 1.70873786407767,
"grad_norm": 1.3601330518722534,
"learning_rate": 2.4232903148831805e-07,
"loss": 0.5815561413764954,
"step": 1584
},
{
"epoch": 1.7108953613807982,
"grad_norm": 2.1496477127075195,
"learning_rate": 2.4179950169519514e-07,
"loss": 0.48085469007492065,
"step": 1586
},
{
"epoch": 1.7130528586839266,
"grad_norm": 1.8896552324295044,
"learning_rate": 2.4127015981961797e-07,
"loss": 0.44769376516342163,
"step": 1588
},
{
"epoch": 1.715210355987055,
"grad_norm": 3.8881163597106934,
"learning_rate": 2.407410088577047e-07,
"loss": 0.578763484954834,
"step": 1590
},
{
"epoch": 1.7173678532901833,
"grad_norm": 3.5086944103240967,
"learning_rate": 2.402120518044935e-07,
"loss": 0.5796621441841125,
"step": 1592
},
{
"epoch": 1.7195253505933117,
"grad_norm": 1.3541215658187866,
"learning_rate": 2.396832916539247e-07,
"loss": 0.6715743541717529,
"step": 1594
},
{
"epoch": 1.72168284789644,
"grad_norm": 1.8187955617904663,
"learning_rate": 2.391547313988239e-07,
"loss": 0.5842028260231018,
"step": 1596
},
{
"epoch": 1.7238403451995685,
"grad_norm": 2.136215925216675,
"learning_rate": 2.386263740308859e-07,
"loss": 0.5518381595611572,
"step": 1598
},
{
"epoch": 1.725997842502697,
"grad_norm": 5.43556022644043,
"learning_rate": 2.3809822254065637e-07,
"loss": 0.4855960011482239,
"step": 1600
},
{
"epoch": 1.7281553398058254,
"grad_norm": 1.7936867475509644,
"learning_rate": 2.375702799175164e-07,
"loss": 0.5670949816703796,
"step": 1602
},
{
"epoch": 1.7303128371089536,
"grad_norm": 3.9360201358795166,
"learning_rate": 2.3704254914966436e-07,
"loss": 0.4383196234703064,
"step": 1604
},
{
"epoch": 1.732470334412082,
"grad_norm": 1.8262487649917603,
"learning_rate": 2.365150332240999e-07,
"loss": 0.4360693395137787,
"step": 1606
},
{
"epoch": 1.7346278317152104,
"grad_norm": 5.341520309448242,
"learning_rate": 2.3598773512660636e-07,
"loss": 0.5174295902252197,
"step": 1608
},
{
"epoch": 1.7367853290183386,
"grad_norm": 1.4834777116775513,
"learning_rate": 2.3546065784173425e-07,
"loss": 0.5596581697463989,
"step": 1610
},
{
"epoch": 1.738942826321467,
"grad_norm": 2.0027642250061035,
"learning_rate": 2.349338043527843e-07,
"loss": 0.5774880051612854,
"step": 1612
},
{
"epoch": 1.7411003236245954,
"grad_norm": 2.79825758934021,
"learning_rate": 2.3440717764179053e-07,
"loss": 0.34554505348205566,
"step": 1614
},
{
"epoch": 1.7432578209277239,
"grad_norm": 1.6454131603240967,
"learning_rate": 2.338807806895033e-07,
"loss": 0.470796674489975,
"step": 1616
},
{
"epoch": 1.7454153182308523,
"grad_norm": 0.6968181133270264,
"learning_rate": 2.3335461647537252e-07,
"loss": 0.458304226398468,
"step": 1618
},
{
"epoch": 1.7475728155339807,
"grad_norm": 1.9778035879135132,
"learning_rate": 2.3282868797753092e-07,
"loss": 0.3832884132862091,
"step": 1620
},
{
"epoch": 1.7497303128371091,
"grad_norm": 1.3171417713165283,
"learning_rate": 2.3230299817277694e-07,
"loss": 0.33851158618927,
"step": 1622
},
{
"epoch": 1.7518878101402373,
"grad_norm": 1.1305524110794067,
"learning_rate": 2.3177755003655803e-07,
"loss": 0.38404303789138794,
"step": 1624
},
{
"epoch": 1.7540453074433657,
"grad_norm": 3.3389196395874023,
"learning_rate": 2.3125234654295378e-07,
"loss": 0.4796540141105652,
"step": 1626
},
{
"epoch": 1.756202804746494,
"grad_norm": 2.8726422786712646,
"learning_rate": 2.3072739066465906e-07,
"loss": 0.5632970333099365,
"step": 1628
},
{
"epoch": 1.7583603020496223,
"grad_norm": 2.0043997764587402,
"learning_rate": 2.3020268537296728e-07,
"loss": 0.610961377620697,
"step": 1630
},
{
"epoch": 1.7605177993527508,
"grad_norm": 1.977318286895752,
"learning_rate": 2.2967823363775334e-07,
"loss": 0.5584444403648376,
"step": 1632
},
{
"epoch": 1.7626752966558792,
"grad_norm": 1.4552688598632812,
"learning_rate": 2.2915403842745718e-07,
"loss": 0.5306387543678284,
"step": 1634
},
{
"epoch": 1.7648327939590076,
"grad_norm": 1.6729410886764526,
"learning_rate": 2.286301027090668e-07,
"loss": 0.7393071055412292,
"step": 1636
},
{
"epoch": 1.766990291262136,
"grad_norm": 1.8149133920669556,
"learning_rate": 2.2810642944810122e-07,
"loss": 0.6838573217391968,
"step": 1638
},
{
"epoch": 1.7691477885652644,
"grad_norm": 1.5463119745254517,
"learning_rate": 2.2758302160859426e-07,
"loss": 0.6229934692382812,
"step": 1640
},
{
"epoch": 1.7713052858683926,
"grad_norm": 1.5769580602645874,
"learning_rate": 2.2705988215307703e-07,
"loss": 0.5759105682373047,
"step": 1642
},
{
"epoch": 1.773462783171521,
"grad_norm": 1.2159730195999146,
"learning_rate": 2.2653701404256204e-07,
"loss": 0.5320509076118469,
"step": 1644
},
{
"epoch": 1.7756202804746493,
"grad_norm": 20.918190002441406,
"learning_rate": 2.260144202365254e-07,
"loss": 0.7069261074066162,
"step": 1646
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.1241180896759033,
"learning_rate": 2.2549210369289124e-07,
"loss": 0.5508931875228882,
"step": 1648
},
{
"epoch": 1.779935275080906,
"grad_norm": 4.296023368835449,
"learning_rate": 2.24970067368014e-07,
"loss": 0.27031293511390686,
"step": 1650
},
{
"epoch": 1.7820927723840345,
"grad_norm": 1.679374098777771,
"learning_rate": 2.24448314216662e-07,
"loss": 0.4512391686439514,
"step": 1652
},
{
"epoch": 1.784250269687163,
"grad_norm": 2.161573886871338,
"learning_rate": 2.2392684719200116e-07,
"loss": 0.3788191080093384,
"step": 1654
},
{
"epoch": 1.7864077669902914,
"grad_norm": 6.881853103637695,
"learning_rate": 2.2340566924557735e-07,
"loss": 0.5204552412033081,
"step": 1656
},
{
"epoch": 1.7885652642934198,
"grad_norm": 2.566661834716797,
"learning_rate": 2.228847833273007e-07,
"loss": 0.5021325945854187,
"step": 1658
},
{
"epoch": 1.790722761596548,
"grad_norm": 1.374242901802063,
"learning_rate": 2.223641923854282e-07,
"loss": 0.5507632493972778,
"step": 1660
},
{
"epoch": 1.7928802588996764,
"grad_norm": 2.6447982788085938,
"learning_rate": 2.2184389936654736e-07,
"loss": 0.5545051097869873,
"step": 1662
},
{
"epoch": 1.7950377562028046,
"grad_norm": 1.4282158613204956,
"learning_rate": 2.2132390721555933e-07,
"loss": 0.5268256068229675,
"step": 1664
},
{
"epoch": 1.797195253505933,
"grad_norm": 9.742328643798828,
"learning_rate": 2.2080421887566236e-07,
"loss": 0.28805431723594666,
"step": 1666
},
{
"epoch": 1.7993527508090614,
"grad_norm": 2.320671319961548,
"learning_rate": 2.2028483728833524e-07,
"loss": 0.61153244972229,
"step": 1668
},
{
"epoch": 1.8015102481121898,
"grad_norm": 1.3254764080047607,
"learning_rate": 2.197657653933202e-07,
"loss": 0.5806170105934143,
"step": 1670
},
{
"epoch": 1.8036677454153183,
"grad_norm": 1.328969120979309,
"learning_rate": 2.1924700612860692e-07,
"loss": 0.5318849086761475,
"step": 1672
},
{
"epoch": 1.8058252427184467,
"grad_norm": 0.41818344593048096,
"learning_rate": 2.1872856243041532e-07,
"loss": 0.27527254819869995,
"step": 1674
},
{
"epoch": 1.807982740021575,
"grad_norm": 4.447193622589111,
"learning_rate": 2.1821043723317935e-07,
"loss": 0.6419240236282349,
"step": 1676
},
{
"epoch": 1.8101402373247033,
"grad_norm": 1.7327150106430054,
"learning_rate": 2.1769263346953004e-07,
"loss": 0.5605652332305908,
"step": 1678
},
{
"epoch": 1.8122977346278317,
"grad_norm": 1.8278568983078003,
"learning_rate": 2.1717515407027937e-07,
"loss": 0.5259844660758972,
"step": 1680
},
{
"epoch": 1.81445523193096,
"grad_norm": 2.8355183601379395,
"learning_rate": 2.1665800196440314e-07,
"loss": 0.48816215991973877,
"step": 1682
},
{
"epoch": 1.8166127292340883,
"grad_norm": 4.203651428222656,
"learning_rate": 2.161411800790247e-07,
"loss": 0.622113049030304,
"step": 1684
},
{
"epoch": 1.8187702265372168,
"grad_norm": 1.8128265142440796,
"learning_rate": 2.1562469133939836e-07,
"loss": 0.23197607696056366,
"step": 1686
},
{
"epoch": 1.8209277238403452,
"grad_norm": 4.64774227142334,
"learning_rate": 2.1510853866889278e-07,
"loss": 0.5721830129623413,
"step": 1688
},
{
"epoch": 1.8230852211434736,
"grad_norm": 1.5357946157455444,
"learning_rate": 2.1459272498897452e-07,
"loss": 0.47745242714881897,
"step": 1690
},
{
"epoch": 1.825242718446602,
"grad_norm": 6.159779071807861,
"learning_rate": 2.1407725321919107e-07,
"loss": 0.6153979301452637,
"step": 1692
},
{
"epoch": 1.8274002157497304,
"grad_norm": 7.977688312530518,
"learning_rate": 2.1356212627715524e-07,
"loss": 0.5228186845779419,
"step": 1694
},
{
"epoch": 1.8295577130528586,
"grad_norm": 1.6970831155776978,
"learning_rate": 2.1304734707852785e-07,
"loss": 0.49107879400253296,
"step": 1696
},
{
"epoch": 1.831715210355987,
"grad_norm": 1.6975973844528198,
"learning_rate": 2.125329185370011e-07,
"loss": 0.5908475518226624,
"step": 1698
},
{
"epoch": 1.8338727076591155,
"grad_norm": 1.8754093647003174,
"learning_rate": 2.1201884356428313e-07,
"loss": 0.46887385845184326,
"step": 1700
},
{
"epoch": 1.8360302049622437,
"grad_norm": 2.0672781467437744,
"learning_rate": 2.1150512507008016e-07,
"loss": 0.6688355207443237,
"step": 1702
},
{
"epoch": 1.838187702265372,
"grad_norm": 6.069046497344971,
"learning_rate": 2.1099176596208134e-07,
"loss": 0.5004164576530457,
"step": 1704
},
{
"epoch": 1.8403451995685005,
"grad_norm": 2.647517204284668,
"learning_rate": 2.104787691459411e-07,
"loss": 0.6052039265632629,
"step": 1706
},
{
"epoch": 1.842502696871629,
"grad_norm": 24.532392501831055,
"learning_rate": 2.099661375252636e-07,
"loss": 0.5694432854652405,
"step": 1708
},
{
"epoch": 1.8446601941747574,
"grad_norm": 1.9516592025756836,
"learning_rate": 2.0945387400158597e-07,
"loss": 0.38723820447921753,
"step": 1710
},
{
"epoch": 1.8468176914778858,
"grad_norm": 5.9051408767700195,
"learning_rate": 2.0894198147436177e-07,
"loss": 0.5640073418617249,
"step": 1712
},
{
"epoch": 1.8489751887810142,
"grad_norm": 1.8454663753509521,
"learning_rate": 2.0843046284094474e-07,
"loss": 0.5534006357192993,
"step": 1714
},
{
"epoch": 1.8511326860841424,
"grad_norm": 2.155219316482544,
"learning_rate": 2.0791932099657221e-07,
"loss": 0.4856148660182953,
"step": 1716
},
{
"epoch": 1.8532901833872708,
"grad_norm": 6.839637279510498,
"learning_rate": 2.074085588343491e-07,
"loss": 0.41986072063446045,
"step": 1718
},
{
"epoch": 1.855447680690399,
"grad_norm": 1.4975872039794922,
"learning_rate": 2.0689817924523112e-07,
"loss": 0.4183667004108429,
"step": 1720
},
{
"epoch": 1.8576051779935274,
"grad_norm": 1.6008542776107788,
"learning_rate": 2.0638818511800865e-07,
"loss": 0.5753411650657654,
"step": 1722
},
{
"epoch": 1.8597626752966558,
"grad_norm": 1.6697242259979248,
"learning_rate": 2.0587857933929037e-07,
"loss": 0.6102425456047058,
"step": 1724
},
{
"epoch": 1.8619201725997843,
"grad_norm": 1.3475979566574097,
"learning_rate": 2.0536936479348672e-07,
"loss": 0.5690769553184509,
"step": 1726
},
{
"epoch": 1.8640776699029127,
"grad_norm": 2.36917781829834,
"learning_rate": 2.0486054436279394e-07,
"loss": 0.26033759117126465,
"step": 1728
},
{
"epoch": 1.866235167206041,
"grad_norm": 0.5969313383102417,
"learning_rate": 2.0435212092717729e-07,
"loss": 0.2784996032714844,
"step": 1730
},
{
"epoch": 1.8683926645091695,
"grad_norm": 1.8333349227905273,
"learning_rate": 2.0384409736435526e-07,
"loss": 0.5710358619689941,
"step": 1732
},
{
"epoch": 1.8705501618122977,
"grad_norm": 1.0351983308792114,
"learning_rate": 2.033364765497828e-07,
"loss": 0.352516233921051,
"step": 1734
},
{
"epoch": 1.8727076591154261,
"grad_norm": 5.123870849609375,
"learning_rate": 2.0282926135663554e-07,
"loss": 0.5661641359329224,
"step": 1736
},
{
"epoch": 1.8748651564185543,
"grad_norm": 1.9763929843902588,
"learning_rate": 2.0232245465579306e-07,
"loss": 0.6391258239746094,
"step": 1738
},
{
"epoch": 1.8770226537216828,
"grad_norm": 2.7457945346832275,
"learning_rate": 2.0181605931582284e-07,
"loss": 0.4859541952610016,
"step": 1740
},
{
"epoch": 1.8791801510248112,
"grad_norm": 1.343418836593628,
"learning_rate": 2.013100782029641e-07,
"loss": 0.57615065574646,
"step": 1742
},
{
"epoch": 1.8813376483279396,
"grad_norm": 2.796921491622925,
"learning_rate": 2.0080451418111143e-07,
"loss": 0.47713714838027954,
"step": 1744
},
{
"epoch": 1.883495145631068,
"grad_norm": 1.2428369522094727,
"learning_rate": 2.0029937011179882e-07,
"loss": 0.46215853095054626,
"step": 1746
},
{
"epoch": 1.8856526429341964,
"grad_norm": 1.3065931797027588,
"learning_rate": 1.9979464885418295e-07,
"loss": 0.37958696484565735,
"step": 1748
},
{
"epoch": 1.8878101402373249,
"grad_norm": 0.4351181089878082,
"learning_rate": 1.9929035326502773e-07,
"loss": 0.5532968044281006,
"step": 1750
},
{
"epoch": 1.889967637540453,
"grad_norm": 2.7067880630493164,
"learning_rate": 1.9878648619868765e-07,
"loss": 0.5427120923995972,
"step": 1752
},
{
"epoch": 1.8921251348435815,
"grad_norm": 1.2321635484695435,
"learning_rate": 1.9828305050709144e-07,
"loss": 0.4515300989151001,
"step": 1754
},
{
"epoch": 1.8942826321467097,
"grad_norm": 1.237924337387085,
"learning_rate": 1.9778004903972667e-07,
"loss": 0.6264490485191345,
"step": 1756
},
{
"epoch": 1.896440129449838,
"grad_norm": 2.399315118789673,
"learning_rate": 1.9727748464362276e-07,
"loss": 0.5656343698501587,
"step": 1758
},
{
"epoch": 1.8985976267529665,
"grad_norm": 2.262449264526367,
"learning_rate": 1.9677536016333556e-07,
"loss": 0.4433645009994507,
"step": 1760
},
{
"epoch": 1.900755124056095,
"grad_norm": 0.9049375653266907,
"learning_rate": 1.9627367844093078e-07,
"loss": 0.5328507423400879,
"step": 1762
},
{
"epoch": 1.9029126213592233,
"grad_norm": 3.406168222427368,
"learning_rate": 1.9577244231596807e-07,
"loss": 0.5393190979957581,
"step": 1764
},
{
"epoch": 1.9050701186623518,
"grad_norm": 5.9175872802734375,
"learning_rate": 1.9527165462548528e-07,
"loss": 0.5409752130508423,
"step": 1766
},
{
"epoch": 1.9072276159654802,
"grad_norm": 3.4059221744537354,
"learning_rate": 1.9477131820398158e-07,
"loss": 0.4544711410999298,
"step": 1768
},
{
"epoch": 1.9093851132686084,
"grad_norm": 2.6678411960601807,
"learning_rate": 1.942714358834024e-07,
"loss": 0.4812181890010834,
"step": 1770
},
{
"epoch": 1.9115426105717368,
"grad_norm": 2.3084359169006348,
"learning_rate": 1.9377201049312252e-07,
"loss": 0.5532037615776062,
"step": 1772
},
{
"epoch": 1.913700107874865,
"grad_norm": 1.5713317394256592,
"learning_rate": 1.9327304485993084e-07,
"loss": 0.5627604722976685,
"step": 1774
},
{
"epoch": 1.9158576051779934,
"grad_norm": 4.625167369842529,
"learning_rate": 1.9277454180801367e-07,
"loss": 0.5986460447311401,
"step": 1776
},
{
"epoch": 1.9180151024811218,
"grad_norm": 4.1043267250061035,
"learning_rate": 1.9227650415893914e-07,
"loss": 0.5130695700645447,
"step": 1778
},
{
"epoch": 1.9201725997842503,
"grad_norm": 1.6170152425765991,
"learning_rate": 1.9177893473164142e-07,
"loss": 0.3731135129928589,
"step": 1780
},
{
"epoch": 1.9223300970873787,
"grad_norm": 1.3302977085113525,
"learning_rate": 1.9128183634240414e-07,
"loss": 0.4674024283885956,
"step": 1782
},
{
"epoch": 1.924487594390507,
"grad_norm": 1.3277769088745117,
"learning_rate": 1.907852118048451e-07,
"loss": 0.5244305729866028,
"step": 1784
},
{
"epoch": 1.9266450916936355,
"grad_norm": 1.839621663093567,
"learning_rate": 1.902890639298998e-07,
"loss": 0.6807081699371338,
"step": 1786
},
{
"epoch": 1.9288025889967637,
"grad_norm": 22.250324249267578,
"learning_rate": 1.8979339552580615e-07,
"loss": 0.3220374882221222,
"step": 1788
},
{
"epoch": 1.9309600862998921,
"grad_norm": 1.6265594959259033,
"learning_rate": 1.8929820939808783e-07,
"loss": 0.6456558108329773,
"step": 1790
},
{
"epoch": 1.9331175836030206,
"grad_norm": 2.020714282989502,
"learning_rate": 1.8880350834953912e-07,
"loss": 0.5007312297821045,
"step": 1792
},
{
"epoch": 1.9352750809061487,
"grad_norm": 2.1667919158935547,
"learning_rate": 1.8830929518020833e-07,
"loss": 0.46376651525497437,
"step": 1794
},
{
"epoch": 1.9374325782092772,
"grad_norm": 5.430749893188477,
"learning_rate": 1.8781557268738275e-07,
"loss": 0.6586015820503235,
"step": 1796
},
{
"epoch": 1.9395900755124056,
"grad_norm": 0.6731663942337036,
"learning_rate": 1.8732234366557225e-07,
"loss": 0.6122515797615051,
"step": 1798
},
{
"epoch": 1.941747572815534,
"grad_norm": 2.9628219604492188,
"learning_rate": 1.8682961090649342e-07,
"loss": 0.7105916142463684,
"step": 1800
},
{
"epoch": 1.9439050701186624,
"grad_norm": 1.580090045928955,
"learning_rate": 1.8633737719905428e-07,
"loss": 0.4230397939682007,
"step": 1802
},
{
"epoch": 1.9460625674217908,
"grad_norm": 4.340382099151611,
"learning_rate": 1.8584564532933784e-07,
"loss": 0.6302311420440674,
"step": 1804
},
{
"epoch": 1.948220064724919,
"grad_norm": 1.663780689239502,
"learning_rate": 1.853544180805871e-07,
"loss": 0.3612719774246216,
"step": 1806
},
{
"epoch": 1.9503775620280475,
"grad_norm": 1.57147216796875,
"learning_rate": 1.8486369823318833e-07,
"loss": 0.5454095005989075,
"step": 1808
},
{
"epoch": 1.9525350593311759,
"grad_norm": 1.2029516696929932,
"learning_rate": 1.8437348856465623e-07,
"loss": 0.564690351486206,
"step": 1810
},
{
"epoch": 1.954692556634304,
"grad_norm": 1.5036405324935913,
"learning_rate": 1.8388379184961795e-07,
"loss": 0.2203519642353058,
"step": 1812
},
{
"epoch": 1.9568500539374325,
"grad_norm": 1.5001392364501953,
"learning_rate": 1.8339461085979686e-07,
"loss": 0.5622031092643738,
"step": 1814
},
{
"epoch": 1.959007551240561,
"grad_norm": 3.3546156883239746,
"learning_rate": 1.8290594836399765e-07,
"loss": 0.5708537697792053,
"step": 1816
},
{
"epoch": 1.9611650485436893,
"grad_norm": 3.217376232147217,
"learning_rate": 1.8241780712809007e-07,
"loss": 0.5283911228179932,
"step": 1818
},
{
"epoch": 1.9633225458468178,
"grad_norm": 1.1382657289505005,
"learning_rate": 1.8193018991499364e-07,
"loss": 0.6459410786628723,
"step": 1820
},
{
"epoch": 1.9654800431499462,
"grad_norm": 1.5658422708511353,
"learning_rate": 1.8144309948466175e-07,
"loss": 0.5700141191482544,
"step": 1822
},
{
"epoch": 1.9676375404530746,
"grad_norm": 1.9039454460144043,
"learning_rate": 1.8095653859406628e-07,
"loss": 0.4810370206832886,
"step": 1824
},
{
"epoch": 1.9697950377562028,
"grad_norm": 3.176187038421631,
"learning_rate": 1.8047050999718184e-07,
"loss": 0.6500232815742493,
"step": 1826
},
{
"epoch": 1.9719525350593312,
"grad_norm": 1.7816401720046997,
"learning_rate": 1.7998501644497006e-07,
"loss": 0.4381594657897949,
"step": 1828
},
{
"epoch": 1.9741100323624594,
"grad_norm": 2.0123534202575684,
"learning_rate": 1.795000606853646e-07,
"loss": 0.5198497772216797,
"step": 1830
},
{
"epoch": 1.9762675296655878,
"grad_norm": 1.8685188293457031,
"learning_rate": 1.7901564546325436e-07,
"loss": 0.5583903193473816,
"step": 1832
},
{
"epoch": 1.9784250269687162,
"grad_norm": 7.242468357086182,
"learning_rate": 1.7853177352046971e-07,
"loss": 0.6218190789222717,
"step": 1834
},
{
"epoch": 1.9805825242718447,
"grad_norm": 4.002262592315674,
"learning_rate": 1.7804844759576538e-07,
"loss": 0.5924632549285889,
"step": 1836
},
{
"epoch": 1.982740021574973,
"grad_norm": 1.834091067314148,
"learning_rate": 1.775656704248057e-07,
"loss": 0.5840417146682739,
"step": 1838
},
{
"epoch": 1.9848975188781015,
"grad_norm": 1.513541340827942,
"learning_rate": 1.7708344474014924e-07,
"loss": 0.5099426507949829,
"step": 1840
},
{
"epoch": 1.98705501618123,
"grad_norm": 1.4789743423461914,
"learning_rate": 1.7660177327123287e-07,
"loss": 0.5921831130981445,
"step": 1842
},
{
"epoch": 1.9892125134843581,
"grad_norm": 1.290024995803833,
"learning_rate": 1.7612065874435677e-07,
"loss": 0.5426990985870361,
"step": 1844
},
{
"epoch": 1.9913700107874865,
"grad_norm": 1.4434101581573486,
"learning_rate": 1.7564010388266837e-07,
"loss": 0.5949893593788147,
"step": 1846
},
{
"epoch": 1.9935275080906147,
"grad_norm": 7.057824611663818,
"learning_rate": 1.7516011140614795e-07,
"loss": 0.4401338994503021,
"step": 1848
},
{
"epoch": 1.9956850053937432,
"grad_norm": 2.867464303970337,
"learning_rate": 1.7468068403159218e-07,
"loss": 0.4908779263496399,
"step": 1850
},
{
"epoch": 1.9978425026968716,
"grad_norm": 1.0820274353027344,
"learning_rate": 1.7420182447259926e-07,
"loss": 0.41853272914886475,
"step": 1852
},
{
"epoch": 2.0,
"grad_norm": 1.5740938186645508,
"learning_rate": 1.7372353543955375e-07,
"loss": 0.48160526156425476,
"step": 1854
},
{
"epoch": 2.0021574973031284,
"grad_norm": 2.7098584175109863,
"learning_rate": 1.7324581963961088e-07,
"loss": 0.5192286372184753,
"step": 1856
},
{
"epoch": 2.004314994606257,
"grad_norm": 1.2259626388549805,
"learning_rate": 1.7276867977668117e-07,
"loss": 0.38666832447052,
"step": 1858
},
{
"epoch": 2.0064724919093853,
"grad_norm": 2.3588082790374756,
"learning_rate": 1.7229211855141535e-07,
"loss": 0.5268582105636597,
"step": 1860
},
{
"epoch": 2.0086299892125137,
"grad_norm": 3.2187092304229736,
"learning_rate": 1.718161386611892e-07,
"loss": 0.39220139384269714,
"step": 1862
},
{
"epoch": 2.0107874865156417,
"grad_norm": 3.2906999588012695,
"learning_rate": 1.71340742800088e-07,
"loss": 0.5459554195404053,
"step": 1864
},
{
"epoch": 2.01294498381877,
"grad_norm": 1.1693605184555054,
"learning_rate": 1.708659336588912e-07,
"loss": 0.3647141456604004,
"step": 1866
},
{
"epoch": 2.0151024811218985,
"grad_norm": 2.798161268234253,
"learning_rate": 1.703917139250576e-07,
"loss": 0.29213812947273254,
"step": 1868
},
{
"epoch": 2.017259978425027,
"grad_norm": 1.1381080150604248,
"learning_rate": 1.6991808628270987e-07,
"loss": 0.5609018802642822,
"step": 1870
},
{
"epoch": 2.0194174757281553,
"grad_norm": 1.3353042602539062,
"learning_rate": 1.694450534126193e-07,
"loss": 0.6125231981277466,
"step": 1872
},
{
"epoch": 2.0215749730312838,
"grad_norm": 0.5496029853820801,
"learning_rate": 1.689726179921906e-07,
"loss": 0.6636590957641602,
"step": 1874
},
{
"epoch": 2.023732470334412,
"grad_norm": 4.846103668212891,
"learning_rate": 1.6850078269544736e-07,
"loss": 0.5935502648353577,
"step": 1876
},
{
"epoch": 2.0258899676375406,
"grad_norm": 1.36087167263031,
"learning_rate": 1.6802955019301574e-07,
"loss": 0.4898212254047394,
"step": 1878
},
{
"epoch": 2.028047464940669,
"grad_norm": 1.213739037513733,
"learning_rate": 1.6755892315211056e-07,
"loss": 0.5537405014038086,
"step": 1880
},
{
"epoch": 2.030204962243797,
"grad_norm": 2.0111231803894043,
"learning_rate": 1.6708890423651965e-07,
"loss": 0.5984706282615662,
"step": 1882
},
{
"epoch": 2.0323624595469254,
"grad_norm": 1.080739974975586,
"learning_rate": 1.6661949610658831e-07,
"loss": 0.6536235809326172,
"step": 1884
},
{
"epoch": 2.034519956850054,
"grad_norm": 1.0429832935333252,
"learning_rate": 1.6615070141920538e-07,
"loss": 0.4953509569168091,
"step": 1886
},
{
"epoch": 2.0366774541531822,
"grad_norm": 0.8104021549224854,
"learning_rate": 1.656825228277871e-07,
"loss": 0.5021868944168091,
"step": 1888
},
{
"epoch": 2.0388349514563107,
"grad_norm": 2.875866413116455,
"learning_rate": 1.6521496298226293e-07,
"loss": 0.4888242483139038,
"step": 1890
},
{
"epoch": 2.040992448759439,
"grad_norm": 1.8168655633926392,
"learning_rate": 1.647480245290596e-07,
"loss": 0.5183455944061279,
"step": 1892
},
{
"epoch": 2.0431499460625675,
"grad_norm": 2.0136873722076416,
"learning_rate": 1.642817101110875e-07,
"loss": 0.4676356911659241,
"step": 1894
},
{
"epoch": 2.045307443365696,
"grad_norm": 1.8156472444534302,
"learning_rate": 1.6381602236772428e-07,
"loss": 0.39894169569015503,
"step": 1896
},
{
"epoch": 2.0474649406688243,
"grad_norm": 1.542965054512024,
"learning_rate": 1.6335096393480077e-07,
"loss": 0.6107752919197083,
"step": 1898
},
{
"epoch": 2.0496224379719523,
"grad_norm": 4.939133644104004,
"learning_rate": 1.6288653744458603e-07,
"loss": 0.4178003668785095,
"step": 1900
},
{
"epoch": 2.0517799352750807,
"grad_norm": 1.781587839126587,
"learning_rate": 1.62422745525772e-07,
"loss": 0.45616793632507324,
"step": 1902
},
{
"epoch": 2.053937432578209,
"grad_norm": 1.2864458560943604,
"learning_rate": 1.619595908034591e-07,
"loss": 0.4433179795742035,
"step": 1904
},
{
"epoch": 2.0560949298813376,
"grad_norm": 1.3701529502868652,
"learning_rate": 1.6149707589914092e-07,
"loss": 0.30546942353248596,
"step": 1906
},
{
"epoch": 2.058252427184466,
"grad_norm": 1.4088618755340576,
"learning_rate": 1.6103520343068992e-07,
"loss": 0.4966147840023041,
"step": 1908
},
{
"epoch": 2.0604099244875944,
"grad_norm": 1.663954734802246,
"learning_rate": 1.6057397601234218e-07,
"loss": 0.5685679912567139,
"step": 1910
},
{
"epoch": 2.062567421790723,
"grad_norm": 1.3583576679229736,
"learning_rate": 1.6011339625468262e-07,
"loss": 0.42057788372039795,
"step": 1912
},
{
"epoch": 2.0647249190938513,
"grad_norm": 1.4086862802505493,
"learning_rate": 1.5965346676463065e-07,
"loss": 0.47804367542266846,
"step": 1914
},
{
"epoch": 2.0668824163969797,
"grad_norm": 1.02560555934906,
"learning_rate": 1.5919419014542485e-07,
"loss": 0.5550174117088318,
"step": 1916
},
{
"epoch": 2.0690399137001076,
"grad_norm": 1.144000768661499,
"learning_rate": 1.5873556899660858e-07,
"loss": 0.538378894329071,
"step": 1918
},
{
"epoch": 2.071197411003236,
"grad_norm": 3.2516837120056152,
"learning_rate": 1.5827760591401513e-07,
"loss": 0.4809839129447937,
"step": 1920
},
{
"epoch": 2.0733549083063645,
"grad_norm": 3.4512033462524414,
"learning_rate": 1.578203034897533e-07,
"loss": 0.5308358669281006,
"step": 1922
},
{
"epoch": 2.075512405609493,
"grad_norm": 7.391180038452148,
"learning_rate": 1.573636643121922e-07,
"loss": 0.5464600324630737,
"step": 1924
},
{
"epoch": 2.0776699029126213,
"grad_norm": 1.2600277662277222,
"learning_rate": 1.5690769096594703e-07,
"loss": 0.4952971339225769,
"step": 1926
},
{
"epoch": 2.0798274002157497,
"grad_norm": 1.352362036705017,
"learning_rate": 1.5645238603186456e-07,
"loss": 0.5618917346000671,
"step": 1928
},
{
"epoch": 2.081984897518878,
"grad_norm": 2.5637662410736084,
"learning_rate": 1.5599775208700793e-07,
"loss": 0.4722367525100708,
"step": 1930
},
{
"epoch": 2.0841423948220066,
"grad_norm": 1.4300997257232666,
"learning_rate": 1.5554379170464265e-07,
"loss": 0.5235872864723206,
"step": 1932
},
{
"epoch": 2.086299892125135,
"grad_norm": 1.3848403692245483,
"learning_rate": 1.5509050745422164e-07,
"loss": 0.5249854922294617,
"step": 1934
},
{
"epoch": 2.0884573894282634,
"grad_norm": 1.8573588132858276,
"learning_rate": 1.546379019013712e-07,
"loss": 0.4274769127368927,
"step": 1936
},
{
"epoch": 2.0906148867313914,
"grad_norm": 2.3221805095672607,
"learning_rate": 1.5418597760787555e-07,
"loss": 0.4279857277870178,
"step": 1938
},
{
"epoch": 2.09277238403452,
"grad_norm": 1.2415211200714111,
"learning_rate": 1.537347371316635e-07,
"loss": 0.43542686104774475,
"step": 1940
},
{
"epoch": 2.0949298813376482,
"grad_norm": 1.8443889617919922,
"learning_rate": 1.532841830267934e-07,
"loss": 0.46662214398384094,
"step": 1942
},
{
"epoch": 2.0970873786407767,
"grad_norm": 1.2522131204605103,
"learning_rate": 1.5283431784343802e-07,
"loss": 0.5438724160194397,
"step": 1944
},
{
"epoch": 2.099244875943905,
"grad_norm": 1.2235276699066162,
"learning_rate": 1.5238514412787158e-07,
"loss": 0.6343849897384644,
"step": 1946
},
{
"epoch": 2.1014023732470335,
"grad_norm": 3.470675468444824,
"learning_rate": 1.5193666442245402e-07,
"loss": 0.43045446276664734,
"step": 1948
},
{
"epoch": 2.103559870550162,
"grad_norm": 1.038780927658081,
"learning_rate": 1.5148888126561726e-07,
"loss": 0.4236026406288147,
"step": 1950
},
{
"epoch": 2.1057173678532903,
"grad_norm": 2.081264019012451,
"learning_rate": 1.5104179719185075e-07,
"loss": 0.5168135166168213,
"step": 1952
},
{
"epoch": 2.1078748651564188,
"grad_norm": 12.081764221191406,
"learning_rate": 1.5059541473168715e-07,
"loss": 0.5867135524749756,
"step": 1954
},
{
"epoch": 2.1100323624595467,
"grad_norm": 2.5666370391845703,
"learning_rate": 1.5014973641168776e-07,
"loss": 0.5193699598312378,
"step": 1956
},
{
"epoch": 2.112189859762675,
"grad_norm": 1.6125311851501465,
"learning_rate": 1.497047647544283e-07,
"loss": 0.529248833656311,
"step": 1958
},
{
"epoch": 2.1143473570658036,
"grad_norm": 2.1865692138671875,
"learning_rate": 1.4926050227848519e-07,
"loss": 0.0995928943157196,
"step": 1960
},
{
"epoch": 2.116504854368932,
"grad_norm": 1.370114803314209,
"learning_rate": 1.4881695149842027e-07,
"loss": 0.4155382513999939,
"step": 1962
},
{
"epoch": 2.1186623516720604,
"grad_norm": 3.971156358718872,
"learning_rate": 1.4837411492476743e-07,
"loss": 0.42390692234039307,
"step": 1964
},
{
"epoch": 2.120819848975189,
"grad_norm": 1.6040301322937012,
"learning_rate": 1.4793199506401797e-07,
"loss": 0.47764524817466736,
"step": 1966
},
{
"epoch": 2.1229773462783172,
"grad_norm": 10.519671440124512,
"learning_rate": 1.474905944186067e-07,
"loss": 0.34308892488479614,
"step": 1968
},
{
"epoch": 2.1251348435814457,
"grad_norm": 1.1796921491622925,
"learning_rate": 1.4704991548689745e-07,
"loss": 0.5871822834014893,
"step": 1970
},
{
"epoch": 2.127292340884574,
"grad_norm": 1.7361336946487427,
"learning_rate": 1.4660996076316912e-07,
"loss": 0.3765156865119934,
"step": 1972
},
{
"epoch": 2.129449838187702,
"grad_norm": 0.9722249507904053,
"learning_rate": 1.461707327376016e-07,
"loss": 0.5491130352020264,
"step": 1974
},
{
"epoch": 2.1316073354908305,
"grad_norm": 0.8201406598091125,
"learning_rate": 1.457322338962616e-07,
"loss": 0.2312294989824295,
"step": 1976
},
{
"epoch": 2.133764832793959,
"grad_norm": 1.1285258531570435,
"learning_rate": 1.4529446672108852e-07,
"loss": 0.408553808927536,
"step": 1978
},
{
"epoch": 2.1359223300970873,
"grad_norm": 2.267793655395508,
"learning_rate": 1.448574336898804e-07,
"loss": 0.4971245229244232,
"step": 1980
},
{
"epoch": 2.1380798274002157,
"grad_norm": 1.5453788042068481,
"learning_rate": 1.4442113727628024e-07,
"loss": 0.5261057615280151,
"step": 1982
},
{
"epoch": 2.140237324703344,
"grad_norm": 1.3704675436019897,
"learning_rate": 1.439855799497615e-07,
"loss": 0.4675547480583191,
"step": 1984
},
{
"epoch": 2.1423948220064726,
"grad_norm": 1.8390225172042847,
"learning_rate": 1.4355076417561429e-07,
"loss": 0.45672228932380676,
"step": 1986
},
{
"epoch": 2.144552319309601,
"grad_norm": 2.047685384750366,
"learning_rate": 1.4311669241493184e-07,
"loss": 0.4718751013278961,
"step": 1988
},
{
"epoch": 2.1467098166127294,
"grad_norm": 2.519134044647217,
"learning_rate": 1.426833671245956e-07,
"loss": 0.20050865411758423,
"step": 1990
},
{
"epoch": 2.148867313915858,
"grad_norm": 1.5338327884674072,
"learning_rate": 1.422507907572626e-07,
"loss": 0.6630242466926575,
"step": 1992
},
{
"epoch": 2.151024811218986,
"grad_norm": 1.7562376260757446,
"learning_rate": 1.418189657613504e-07,
"loss": 0.48454782366752625,
"step": 1994
},
{
"epoch": 2.1531823085221142,
"grad_norm": 1.1941181421279907,
"learning_rate": 1.4138789458102395e-07,
"loss": 0.39869314432144165,
"step": 1996
},
{
"epoch": 2.1553398058252426,
"grad_norm": 1.5241355895996094,
"learning_rate": 1.409575796561815e-07,
"loss": 0.5645185708999634,
"step": 1998
},
{
"epoch": 2.157497303128371,
"grad_norm": 1.2509868144989014,
"learning_rate": 1.4052802342244085e-07,
"loss": 0.4137888550758362,
"step": 2000
},
{
"epoch": 2.1596548004314995,
"grad_norm": 0.9289142489433289,
"learning_rate": 1.4009922831112576e-07,
"loss": 0.39357897639274597,
"step": 2002
},
{
"epoch": 2.161812297734628,
"grad_norm": 1.4196053743362427,
"learning_rate": 1.3967119674925144e-07,
"loss": 0.4930650293827057,
"step": 2004
},
{
"epoch": 2.1639697950377563,
"grad_norm": 1.6377662420272827,
"learning_rate": 1.3924393115951183e-07,
"loss": 0.5852062702178955,
"step": 2006
},
{
"epoch": 2.1661272923408847,
"grad_norm": 3.294273614883423,
"learning_rate": 1.3881743396026519e-07,
"loss": 0.2635650932788849,
"step": 2008
},
{
"epoch": 2.168284789644013,
"grad_norm": 3.2055675983428955,
"learning_rate": 1.383917075655207e-07,
"loss": 0.45562589168548584,
"step": 2010
},
{
"epoch": 2.170442286947141,
"grad_norm": 1.2780847549438477,
"learning_rate": 1.3796675438492466e-07,
"loss": 0.2905101478099823,
"step": 2012
},
{
"epoch": 2.1725997842502696,
"grad_norm": 1.1974769830703735,
"learning_rate": 1.37542576823747e-07,
"loss": 0.5826109647750854,
"step": 2014
},
{
"epoch": 2.174757281553398,
"grad_norm": 12.196817398071289,
"learning_rate": 1.3711917728286758e-07,
"loss": 0.6894016861915588,
"step": 2016
},
{
"epoch": 2.1769147788565264,
"grad_norm": 1.2338263988494873,
"learning_rate": 1.3669655815876238e-07,
"loss": 0.5621905326843262,
"step": 2018
},
{
"epoch": 2.179072276159655,
"grad_norm": 1.2616736888885498,
"learning_rate": 1.3627472184349054e-07,
"loss": 0.5268079042434692,
"step": 2020
},
{
"epoch": 2.1812297734627832,
"grad_norm": 2.433539867401123,
"learning_rate": 1.3585367072468014e-07,
"loss": 0.5845661163330078,
"step": 2022
},
{
"epoch": 2.1833872707659117,
"grad_norm": 5.173591136932373,
"learning_rate": 1.3543340718551505e-07,
"loss": 0.5688271522521973,
"step": 2024
},
{
"epoch": 2.18554476806904,
"grad_norm": 3.6794235706329346,
"learning_rate": 1.3501393360472135e-07,
"loss": 0.5398727655410767,
"step": 2026
},
{
"epoch": 2.1877022653721685,
"grad_norm": 3.5358774662017822,
"learning_rate": 1.345952523565541e-07,
"loss": 0.4828336238861084,
"step": 2028
},
{
"epoch": 2.1898597626752965,
"grad_norm": 2.045574188232422,
"learning_rate": 1.3417736581078343e-07,
"loss": 0.4576345682144165,
"step": 2030
},
{
"epoch": 2.192017259978425,
"grad_norm": 2.9018726348876953,
"learning_rate": 1.3376027633268145e-07,
"loss": 0.5629587769508362,
"step": 2032
},
{
"epoch": 2.1941747572815533,
"grad_norm": 0.7962714433670044,
"learning_rate": 1.33343986283009e-07,
"loss": 0.3971530795097351,
"step": 2034
},
{
"epoch": 2.1963322545846817,
"grad_norm": 8.073356628417969,
"learning_rate": 1.3292849801800172e-07,
"loss": 0.34259432554244995,
"step": 2036
},
{
"epoch": 2.19848975188781,
"grad_norm": 2.6538028717041016,
"learning_rate": 1.325138138893574e-07,
"loss": 0.3950064778327942,
"step": 2038
},
{
"epoch": 2.2006472491909386,
"grad_norm": 2.4987733364105225,
"learning_rate": 1.3209993624422226e-07,
"loss": 0.5430999398231506,
"step": 2040
},
{
"epoch": 2.202804746494067,
"grad_norm": 2.3891706466674805,
"learning_rate": 1.3168686742517777e-07,
"loss": 0.4212225377559662,
"step": 2042
},
{
"epoch": 2.2049622437971954,
"grad_norm": 3.8016743659973145,
"learning_rate": 1.312746097702273e-07,
"loss": 0.47842153906822205,
"step": 2044
},
{
"epoch": 2.207119741100324,
"grad_norm": 1.4113874435424805,
"learning_rate": 1.3086316561278298e-07,
"loss": 0.6112795472145081,
"step": 2046
},
{
"epoch": 2.209277238403452,
"grad_norm": 1.0874158143997192,
"learning_rate": 1.304525372816527e-07,
"loss": 0.37503018975257874,
"step": 2048
},
{
"epoch": 2.2114347357065802,
"grad_norm": 1.5210480690002441,
"learning_rate": 1.3004272710102627e-07,
"loss": 0.5227410793304443,
"step": 2050
},
{
"epoch": 2.2135922330097086,
"grad_norm": 2.596205234527588,
"learning_rate": 1.2963373739046308e-07,
"loss": 0.5574774742126465,
"step": 2052
},
{
"epoch": 2.215749730312837,
"grad_norm": 1.1875081062316895,
"learning_rate": 1.2922557046487847e-07,
"loss": 0.5108221769332886,
"step": 2054
},
{
"epoch": 2.2179072276159655,
"grad_norm": 1.1905848979949951,
"learning_rate": 1.2881822863453066e-07,
"loss": 0.6219309568405151,
"step": 2056
},
{
"epoch": 2.220064724919094,
"grad_norm": 1.272186279296875,
"learning_rate": 1.2841171420500799e-07,
"loss": 0.5619434118270874,
"step": 2058
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.7432719469070435,
"learning_rate": 1.2800602947721539e-07,
"loss": 0.47846826910972595,
"step": 2060
},
{
"epoch": 2.2243797195253507,
"grad_norm": 3.082793951034546,
"learning_rate": 1.2760117674736174e-07,
"loss": 0.40510886907577515,
"step": 2062
},
{
"epoch": 2.226537216828479,
"grad_norm": 2.457563638687134,
"learning_rate": 1.2719715830694665e-07,
"loss": 0.5271166563034058,
"step": 2064
},
{
"epoch": 2.228694714131607,
"grad_norm": 2.0129661560058594,
"learning_rate": 1.2679397644274786e-07,
"loss": 0.5444518327713013,
"step": 2066
},
{
"epoch": 2.2308522114347356,
"grad_norm": 1.3749797344207764,
"learning_rate": 1.2639163343680764e-07,
"loss": 0.4370856285095215,
"step": 2068
},
{
"epoch": 2.233009708737864,
"grad_norm": 2.305034637451172,
"learning_rate": 1.259901315664204e-07,
"loss": 0.3118676543235779,
"step": 2070
},
{
"epoch": 2.2351672060409924,
"grad_norm": 2.7576963901519775,
"learning_rate": 1.2558947310411988e-07,
"loss": 0.5651037096977234,
"step": 2072
},
{
"epoch": 2.237324703344121,
"grad_norm": 1.6983442306518555,
"learning_rate": 1.251896603176657e-07,
"loss": 0.4641881585121155,
"step": 2074
},
{
"epoch": 2.2394822006472492,
"grad_norm": 1.8414127826690674,
"learning_rate": 1.2479069547003113e-07,
"loss": 0.43197086453437805,
"step": 2076
},
{
"epoch": 2.2416396979503777,
"grad_norm": 2.0715110301971436,
"learning_rate": 1.2439258081938982e-07,
"loss": 0.4157189428806305,
"step": 2078
},
{
"epoch": 2.243797195253506,
"grad_norm": 1.076478123664856,
"learning_rate": 1.2399531861910356e-07,
"loss": 0.47206589579582214,
"step": 2080
},
{
"epoch": 2.2459546925566345,
"grad_norm": 1.650862693786621,
"learning_rate": 1.2359891111770893e-07,
"loss": 0.34252646565437317,
"step": 2082
},
{
"epoch": 2.2481121898597625,
"grad_norm": 10.067177772521973,
"learning_rate": 1.2320336055890485e-07,
"loss": 0.39974507689476013,
"step": 2084
},
{
"epoch": 2.250269687162891,
"grad_norm": 2.362675666809082,
"learning_rate": 1.228086691815401e-07,
"loss": 0.3123304843902588,
"step": 2086
},
{
"epoch": 2.2524271844660193,
"grad_norm": 1.422263741493225,
"learning_rate": 1.224148392196002e-07,
"loss": 0.512368381023407,
"step": 2088
},
{
"epoch": 2.2545846817691477,
"grad_norm": 3.862522602081299,
"learning_rate": 1.2202187290219506e-07,
"loss": 0.6224650144577026,
"step": 2090
},
{
"epoch": 2.256742179072276,
"grad_norm": 1.9737151861190796,
"learning_rate": 1.2162977245354618e-07,
"loss": 0.5244027376174927,
"step": 2092
},
{
"epoch": 2.2588996763754046,
"grad_norm": 1.3677443265914917,
"learning_rate": 1.212385400929746e-07,
"loss": 0.45020678639411926,
"step": 2094
},
{
"epoch": 2.261057173678533,
"grad_norm": 2.039045572280884,
"learning_rate": 1.208481780348872e-07,
"loss": 0.5531943440437317,
"step": 2096
},
{
"epoch": 2.2632146709816614,
"grad_norm": 1.5377328395843506,
"learning_rate": 1.2045868848876553e-07,
"loss": 0.37629038095474243,
"step": 2098
},
{
"epoch": 2.26537216828479,
"grad_norm": 2.3397927284240723,
"learning_rate": 1.2007007365915235e-07,
"loss": 0.3320096433162689,
"step": 2100
},
{
"epoch": 2.267529665587918,
"grad_norm": 3.1143479347229004,
"learning_rate": 1.1968233574563937e-07,
"loss": 0.40027597546577454,
"step": 2102
},
{
"epoch": 2.269687162891046,
"grad_norm": 4.891796112060547,
"learning_rate": 1.1929547694285518e-07,
"loss": 0.4987124502658844,
"step": 2104
},
{
"epoch": 2.2718446601941746,
"grad_norm": 1.840555191040039,
"learning_rate": 1.1890949944045232e-07,
"loss": 0.49355462193489075,
"step": 2106
},
{
"epoch": 2.274002157497303,
"grad_norm": 3.1459503173828125,
"learning_rate": 1.1852440542309507e-07,
"loss": 0.36765629053115845,
"step": 2108
},
{
"epoch": 2.2761596548004315,
"grad_norm": 1.4921073913574219,
"learning_rate": 1.1814019707044715e-07,
"loss": 0.4247042238712311,
"step": 2110
},
{
"epoch": 2.27831715210356,
"grad_norm": 1.55894935131073,
"learning_rate": 1.1775687655715948e-07,
"loss": 0.4907058775424957,
"step": 2112
},
{
"epoch": 2.2804746494066883,
"grad_norm": 1.8256114721298218,
"learning_rate": 1.1737444605285757e-07,
"loss": 0.46969401836395264,
"step": 2114
},
{
"epoch": 2.2826321467098167,
"grad_norm": 2.421661376953125,
"learning_rate": 1.1699290772212944e-07,
"loss": 0.5222725868225098,
"step": 2116
},
{
"epoch": 2.284789644012945,
"grad_norm": 1.7159234285354614,
"learning_rate": 1.1661226372451344e-07,
"loss": 0.38712745904922485,
"step": 2118
},
{
"epoch": 2.286947141316073,
"grad_norm": 1.709659218788147,
"learning_rate": 1.1623251621448581e-07,
"loss": 0.2972549796104431,
"step": 2120
},
{
"epoch": 2.2891046386192015,
"grad_norm": 5.333364486694336,
"learning_rate": 1.1585366734144861e-07,
"loss": 0.5384417772293091,
"step": 2122
},
{
"epoch": 2.29126213592233,
"grad_norm": 1.5736010074615479,
"learning_rate": 1.154757192497175e-07,
"loss": 0.48839452862739563,
"step": 2124
},
{
"epoch": 2.2934196332254584,
"grad_norm": 2.719190835952759,
"learning_rate": 1.1509867407850982e-07,
"loss": 0.4550670385360718,
"step": 2126
},
{
"epoch": 2.295577130528587,
"grad_norm": 1.8119968175888062,
"learning_rate": 1.1472253396193217e-07,
"loss": 0.6911446452140808,
"step": 2128
},
{
"epoch": 2.2977346278317152,
"grad_norm": 4.632421970367432,
"learning_rate": 1.1434730102896833e-07,
"loss": 0.4868852198123932,
"step": 2130
},
{
"epoch": 2.2998921251348436,
"grad_norm": 1.4012049436569214,
"learning_rate": 1.1397297740346771e-07,
"loss": 0.4392762780189514,
"step": 2132
},
{
"epoch": 2.302049622437972,
"grad_norm": 1.4517533779144287,
"learning_rate": 1.1359956520413267e-07,
"loss": 0.6284406185150146,
"step": 2134
},
{
"epoch": 2.3042071197411005,
"grad_norm": 23.886926651000977,
"learning_rate": 1.1322706654450692e-07,
"loss": 0.5837987661361694,
"step": 2136
},
{
"epoch": 2.3063646170442285,
"grad_norm": 2.8387584686279297,
"learning_rate": 1.1285548353296335e-07,
"loss": 0.49964287877082825,
"step": 2138
},
{
"epoch": 2.308522114347357,
"grad_norm": 1.3726048469543457,
"learning_rate": 1.1248481827269252e-07,
"loss": 0.4638864994049072,
"step": 2140
},
{
"epoch": 2.3106796116504853,
"grad_norm": 2.2775590419769287,
"learning_rate": 1.1211507286168997e-07,
"loss": 0.3237634003162384,
"step": 2142
},
{
"epoch": 2.3128371089536137,
"grad_norm": 4.829261779785156,
"learning_rate": 1.1174624939274521e-07,
"loss": 0.37512320280075073,
"step": 2144
},
{
"epoch": 2.314994606256742,
"grad_norm": 1.2612223625183105,
"learning_rate": 1.1137834995342951e-07,
"loss": 0.44408831000328064,
"step": 2146
},
{
"epoch": 2.3171521035598706,
"grad_norm": 1.091060757637024,
"learning_rate": 1.1101137662608356e-07,
"loss": 0.45439931750297546,
"step": 2148
},
{
"epoch": 2.319309600862999,
"grad_norm": 1.9243289232254028,
"learning_rate": 1.1064533148780674e-07,
"loss": 0.5608944296836853,
"step": 2150
},
{
"epoch": 2.3214670981661274,
"grad_norm": 1.0947431325912476,
"learning_rate": 1.1028021661044448e-07,
"loss": 0.43637141585350037,
"step": 2152
},
{
"epoch": 2.323624595469256,
"grad_norm": 1.2952344417572021,
"learning_rate": 1.0991603406057712e-07,
"loss": 0.4882223904132843,
"step": 2154
},
{
"epoch": 2.325782092772384,
"grad_norm": 2.949446201324463,
"learning_rate": 1.0955278589950754e-07,
"loss": 0.4377874732017517,
"step": 2156
},
{
"epoch": 2.3279395900755127,
"grad_norm": 1.3994691371917725,
"learning_rate": 1.0919047418325027e-07,
"loss": 0.4923911392688751,
"step": 2158
},
{
"epoch": 2.3300970873786406,
"grad_norm": 4.1141438484191895,
"learning_rate": 1.088291009625195e-07,
"loss": 0.450039267539978,
"step": 2160
},
{
"epoch": 2.332254584681769,
"grad_norm": 1.905044436454773,
"learning_rate": 1.0846866828271706e-07,
"loss": 0.3132597506046295,
"step": 2162
},
{
"epoch": 2.3344120819848975,
"grad_norm": 2.2338316440582275,
"learning_rate": 1.081091781839217e-07,
"loss": 0.47647953033447266,
"step": 2164
},
{
"epoch": 2.336569579288026,
"grad_norm": 1.30208158493042,
"learning_rate": 1.0775063270087683e-07,
"loss": 0.49173516035079956,
"step": 2166
},
{
"epoch": 2.3387270765911543,
"grad_norm": 1.6397186517715454,
"learning_rate": 1.073930338629793e-07,
"loss": 0.6098090410232544,
"step": 2168
},
{
"epoch": 2.3408845738942827,
"grad_norm": 1.269619107246399,
"learning_rate": 1.0703638369426782e-07,
"loss": 0.36568519473075867,
"step": 2170
},
{
"epoch": 2.343042071197411,
"grad_norm": 1.4696781635284424,
"learning_rate": 1.0668068421341176e-07,
"loss": 0.5783711671829224,
"step": 2172
},
{
"epoch": 2.3451995685005396,
"grad_norm": 7.201903343200684,
"learning_rate": 1.0632593743369927e-07,
"loss": 0.5010417699813843,
"step": 2174
},
{
"epoch": 2.347357065803668,
"grad_norm": 1.4425990581512451,
"learning_rate": 1.0597214536302627e-07,
"loss": 0.4950565695762634,
"step": 2176
},
{
"epoch": 2.349514563106796,
"grad_norm": 1.3873226642608643,
"learning_rate": 1.0561931000388497e-07,
"loss": 0.5758650898933411,
"step": 2178
},
{
"epoch": 2.3516720604099244,
"grad_norm": 2.6027872562408447,
"learning_rate": 1.0526743335335244e-07,
"loss": 0.43806731700897217,
"step": 2180
},
{
"epoch": 2.353829557713053,
"grad_norm": 1.688138723373413,
"learning_rate": 1.0491651740307942e-07,
"loss": 0.45398759841918945,
"step": 2182
},
{
"epoch": 2.355987055016181,
"grad_norm": 1.561120867729187,
"learning_rate": 1.0456656413927885e-07,
"loss": 0.5060495734214783,
"step": 2184
},
{
"epoch": 2.3581445523193096,
"grad_norm": 0.7075545787811279,
"learning_rate": 1.0421757554271513e-07,
"loss": 0.6581755876541138,
"step": 2186
},
{
"epoch": 2.360302049622438,
"grad_norm": 1.6746031045913696,
"learning_rate": 1.0386955358869228e-07,
"loss": 0.4242514669895172,
"step": 2188
},
{
"epoch": 2.3624595469255665,
"grad_norm": 1.3905069828033447,
"learning_rate": 1.0352250024704305e-07,
"loss": 0.5258157849311829,
"step": 2190
},
{
"epoch": 2.364617044228695,
"grad_norm": 1.4214626550674438,
"learning_rate": 1.0317641748211797e-07,
"loss": 0.4111720025539398,
"step": 2192
},
{
"epoch": 2.3667745415318233,
"grad_norm": 2.1955699920654297,
"learning_rate": 1.0283130725277387e-07,
"loss": 0.4222407937049866,
"step": 2194
},
{
"epoch": 2.3689320388349513,
"grad_norm": 3.329183578491211,
"learning_rate": 1.0248717151236292e-07,
"loss": 0.48131048679351807,
"step": 2196
},
{
"epoch": 2.3710895361380797,
"grad_norm": 1.4323745965957642,
"learning_rate": 1.0214401220872165e-07,
"loss": 0.5055102109909058,
"step": 2198
},
{
"epoch": 2.373247033441208,
"grad_norm": 3.3138020038604736,
"learning_rate": 1.0180183128415996e-07,
"loss": 0.4181690514087677,
"step": 2200
},
{
"epoch": 2.3754045307443366,
"grad_norm": 1.2691850662231445,
"learning_rate": 1.0146063067544994e-07,
"loss": 0.5093836188316345,
"step": 2202
},
{
"epoch": 2.377562028047465,
"grad_norm": 1.5404523611068726,
"learning_rate": 1.0112041231381497e-07,
"loss": 0.5514087677001953,
"step": 2204
},
{
"epoch": 2.3797195253505934,
"grad_norm": 0.5054550170898438,
"learning_rate": 1.007811781249191e-07,
"loss": 0.5555412173271179,
"step": 2206
},
{
"epoch": 2.381877022653722,
"grad_norm": 1.4161839485168457,
"learning_rate": 1.0044293002885543e-07,
"loss": 0.40494877099990845,
"step": 2208
},
{
"epoch": 2.3840345199568502,
"grad_norm": 2.184622049331665,
"learning_rate": 1.0010566994013612e-07,
"loss": 0.4903999865055084,
"step": 2210
},
{
"epoch": 2.3861920172599786,
"grad_norm": 1.4374651908874512,
"learning_rate": 9.976939976768092e-08,
"loss": 0.3994739353656769,
"step": 2212
},
{
"epoch": 2.3883495145631066,
"grad_norm": 1.230521559715271,
"learning_rate": 9.943412141480658e-08,
"loss": 0.3945184051990509,
"step": 2214
},
{
"epoch": 2.390507011866235,
"grad_norm": 1.331252932548523,
"learning_rate": 9.909983677921607e-08,
"loss": 0.3576943278312683,
"step": 2216
},
{
"epoch": 2.3926645091693635,
"grad_norm": 2.683382987976074,
"learning_rate": 9.876654775298799e-08,
"loss": 0.5146565437316895,
"step": 2218
},
{
"epoch": 2.394822006472492,
"grad_norm": 1.1658935546875,
"learning_rate": 9.843425622256546e-08,
"loss": 0.4697680175304413,
"step": 2220
},
{
"epoch": 2.3969795037756203,
"grad_norm": 1.2742339372634888,
"learning_rate": 9.810296406874583e-08,
"loss": 0.5487444400787354,
"step": 2222
},
{
"epoch": 2.3991370010787487,
"grad_norm": 2.137641668319702,
"learning_rate": 9.777267316667e-08,
"loss": 0.4315913915634155,
"step": 2224
},
{
"epoch": 2.401294498381877,
"grad_norm": 2.66500186920166,
"learning_rate": 9.744338538581147e-08,
"loss": 0.5236972570419312,
"step": 2226
},
{
"epoch": 2.4034519956850056,
"grad_norm": 1.3068699836730957,
"learning_rate": 9.711510258996617e-08,
"loss": 0.46810081601142883,
"step": 2228
},
{
"epoch": 2.405609492988134,
"grad_norm": 2.274906635284424,
"learning_rate": 9.678782663724156e-08,
"loss": 0.3743009567260742,
"step": 2230
},
{
"epoch": 2.407766990291262,
"grad_norm": 7.867446422576904,
"learning_rate": 9.646155938004655e-08,
"loss": 0.5277361273765564,
"step": 2232
},
{
"epoch": 2.4099244875943904,
"grad_norm": 1.2440294027328491,
"learning_rate": 9.613630266508053e-08,
"loss": 0.5060732960700989,
"step": 2234
},
{
"epoch": 2.412081984897519,
"grad_norm": 2.0601508617401123,
"learning_rate": 9.581205833332316e-08,
"loss": 0.422480046749115,
"step": 2236
},
{
"epoch": 2.414239482200647,
"grad_norm": 3.935227632522583,
"learning_rate": 9.548882822002405e-08,
"loss": 0.5240671634674072,
"step": 2238
},
{
"epoch": 2.4163969795037756,
"grad_norm": 1.6686252355575562,
"learning_rate": 9.516661415469216e-08,
"loss": 0.5927475690841675,
"step": 2240
},
{
"epoch": 2.418554476806904,
"grad_norm": 1.506426215171814,
"learning_rate": 9.484541796108551e-08,
"loss": 0.4393031597137451,
"step": 2242
},
{
"epoch": 2.4207119741100325,
"grad_norm": 2.3679986000061035,
"learning_rate": 9.45252414572009e-08,
"loss": 0.41118109226226807,
"step": 2244
},
{
"epoch": 2.422869471413161,
"grad_norm": 3.8800225257873535,
"learning_rate": 9.420608645526373e-08,
"loss": 0.5971561670303345,
"step": 2246
},
{
"epoch": 2.4250269687162893,
"grad_norm": 3.2100000381469727,
"learning_rate": 9.388795476171742e-08,
"loss": 0.4730355739593506,
"step": 2248
},
{
"epoch": 2.4271844660194173,
"grad_norm": 2.2373881340026855,
"learning_rate": 9.357084817721342e-08,
"loss": 0.5331867337226868,
"step": 2250
},
{
"epoch": 2.4293419633225457,
"grad_norm": 1.286616563796997,
"learning_rate": 9.325476849660124e-08,
"loss": 0.3063022196292877,
"step": 2252
},
{
"epoch": 2.431499460625674,
"grad_norm": 16.60342788696289,
"learning_rate": 9.293971750891755e-08,
"loss": 0.44442903995513916,
"step": 2254
},
{
"epoch": 2.4336569579288025,
"grad_norm": 1.4422513246536255,
"learning_rate": 9.262569699737699e-08,
"loss": 0.45078244805336,
"step": 2256
},
{
"epoch": 2.435814455231931,
"grad_norm": 3.8183093070983887,
"learning_rate": 9.231270873936134e-08,
"loss": 0.6303662657737732,
"step": 2258
},
{
"epoch": 2.4379719525350594,
"grad_norm": 2.514275550842285,
"learning_rate": 9.200075450640982e-08,
"loss": 0.41539642214775085,
"step": 2260
},
{
"epoch": 2.440129449838188,
"grad_norm": 15.767145156860352,
"learning_rate": 9.16898360642091e-08,
"loss": 0.5099453330039978,
"step": 2262
},
{
"epoch": 2.4422869471413162,
"grad_norm": 1.8690749406814575,
"learning_rate": 9.137995517258301e-08,
"loss": 0.4256049394607544,
"step": 2264
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.6698436737060547,
"learning_rate": 9.107111358548284e-08,
"loss": 0.24495507776737213,
"step": 2266
},
{
"epoch": 2.4466019417475726,
"grad_norm": 1.4875117540359497,
"learning_rate": 9.076331305097726e-08,
"loss": 0.5850554704666138,
"step": 2268
},
{
"epoch": 2.448759439050701,
"grad_norm": 1.4028273820877075,
"learning_rate": 9.045655531124265e-08,
"loss": 0.6093006134033203,
"step": 2270
},
{
"epoch": 2.4509169363538295,
"grad_norm": 1.2962590456008911,
"learning_rate": 9.015084210255303e-08,
"loss": 0.47294872999191284,
"step": 2272
},
{
"epoch": 2.453074433656958,
"grad_norm": 1.212159514427185,
"learning_rate": 8.984617515527011e-08,
"loss": 0.49960917234420776,
"step": 2274
},
{
"epoch": 2.4552319309600863,
"grad_norm": 1.358234167098999,
"learning_rate": 8.954255619383396e-08,
"loss": 0.5016006231307983,
"step": 2276
},
{
"epoch": 2.4573894282632147,
"grad_norm": 1.423470139503479,
"learning_rate": 8.92399869367528e-08,
"loss": 0.5450627207756042,
"step": 2278
},
{
"epoch": 2.459546925566343,
"grad_norm": 10.734492301940918,
"learning_rate": 8.893846909659339e-08,
"loss": 0.5570814609527588,
"step": 2280
},
{
"epoch": 2.4617044228694716,
"grad_norm": 1.709943413734436,
"learning_rate": 8.863800437997145e-08,
"loss": 0.487891286611557,
"step": 2282
},
{
"epoch": 2.4638619201726,
"grad_norm": 4.394408226013184,
"learning_rate": 8.833859448754206e-08,
"loss": 0.4305161237716675,
"step": 2284
},
{
"epoch": 2.466019417475728,
"grad_norm": 4.226398468017578,
"learning_rate": 8.804024111398971e-08,
"loss": 0.529059648513794,
"step": 2286
},
{
"epoch": 2.4681769147788564,
"grad_norm": 1.9100017547607422,
"learning_rate": 8.77429459480189e-08,
"loss": 0.5420784950256348,
"step": 2288
},
{
"epoch": 2.470334412081985,
"grad_norm": 2.035076141357422,
"learning_rate": 8.744671067234483e-08,
"loss": 0.3656485080718994,
"step": 2290
},
{
"epoch": 2.472491909385113,
"grad_norm": 0.4504566490650177,
"learning_rate": 8.715153696368342e-08,
"loss": 0.10097479820251465,
"step": 2292
},
{
"epoch": 2.4746494066882416,
"grad_norm": 1.2601561546325684,
"learning_rate": 8.685742649274209e-08,
"loss": 0.5880253314971924,
"step": 2294
},
{
"epoch": 2.47680690399137,
"grad_norm": 1.1769603490829468,
"learning_rate": 8.656438092421015e-08,
"loss": 0.2316816747188568,
"step": 2296
},
{
"epoch": 2.4789644012944985,
"grad_norm": 4.42578125,
"learning_rate": 8.627240191674979e-08,
"loss": 0.5261933207511902,
"step": 2298
},
{
"epoch": 2.481121898597627,
"grad_norm": 2.054516553878784,
"learning_rate": 8.598149112298586e-08,
"loss": 0.3512814939022064,
"step": 2300
},
{
"epoch": 2.4832793959007553,
"grad_norm": 1.8083107471466064,
"learning_rate": 8.569165018949755e-08,
"loss": 0.5700247287750244,
"step": 2302
},
{
"epoch": 2.4854368932038833,
"grad_norm": 0.7789508700370789,
"learning_rate": 8.540288075680832e-08,
"loss": 0.5419098138809204,
"step": 2304
},
{
"epoch": 2.4875943905070117,
"grad_norm": 1.425304889678955,
"learning_rate": 8.511518445937682e-08,
"loss": 0.5239717960357666,
"step": 2306
},
{
"epoch": 2.48975188781014,
"grad_norm": 2.2157175540924072,
"learning_rate": 8.482856292558771e-08,
"loss": 0.5087226629257202,
"step": 2308
},
{
"epoch": 2.4919093851132685,
"grad_norm": 1.6569185256958008,
"learning_rate": 8.454301777774237e-08,
"loss": 0.5517893433570862,
"step": 2310
},
{
"epoch": 2.494066882416397,
"grad_norm": 1.5045207738876343,
"learning_rate": 8.425855063204987e-08,
"loss": 0.44327977299690247,
"step": 2312
},
{
"epoch": 2.4962243797195254,
"grad_norm": 3.6006226539611816,
"learning_rate": 8.397516309861743e-08,
"loss": 0.5266181230545044,
"step": 2314
},
{
"epoch": 2.498381877022654,
"grad_norm": 1.071789264678955,
"learning_rate": 8.369285678144197e-08,
"loss": 0.3867831826210022,
"step": 2316
},
{
"epoch": 2.500539374325782,
"grad_norm": 1.2131776809692383,
"learning_rate": 8.341163327840026e-08,
"loss": 0.4348089396953583,
"step": 2318
},
{
"epoch": 2.5026968716289106,
"grad_norm": 1.3953055143356323,
"learning_rate": 8.313149418124043e-08,
"loss": 0.3074108362197876,
"step": 2320
},
{
"epoch": 2.5048543689320386,
"grad_norm": 1.4362233877182007,
"learning_rate": 8.285244107557284e-08,
"loss": 0.559751033782959,
"step": 2322
},
{
"epoch": 2.5070118662351675,
"grad_norm": 2.6127679347991943,
"learning_rate": 8.257447554086095e-08,
"loss": 0.49618762731552124,
"step": 2324
},
{
"epoch": 2.5091693635382954,
"grad_norm": 1.4416208267211914,
"learning_rate": 8.229759915041243e-08,
"loss": 0.37788355350494385,
"step": 2326
},
{
"epoch": 2.511326860841424,
"grad_norm": 1.3234007358551025,
"learning_rate": 8.202181347137041e-08,
"loss": 0.529360830783844,
"step": 2328
},
{
"epoch": 2.5134843581445523,
"grad_norm": 1.065699577331543,
"learning_rate": 8.174712006470453e-08,
"loss": 0.3420860767364502,
"step": 2330
},
{
"epoch": 2.5156418554476807,
"grad_norm": 0.9615293741226196,
"learning_rate": 8.147352048520198e-08,
"loss": 0.3195402920246124,
"step": 2332
},
{
"epoch": 2.517799352750809,
"grad_norm": 1.6290311813354492,
"learning_rate": 8.12010162814588e-08,
"loss": 0.41644540429115295,
"step": 2334
},
{
"epoch": 2.5199568500539375,
"grad_norm": 3.109558343887329,
"learning_rate": 8.092960899587121e-08,
"loss": 0.4512273371219635,
"step": 2336
},
{
"epoch": 2.522114347357066,
"grad_norm": 2.5993058681488037,
"learning_rate": 8.065930016462671e-08,
"loss": 0.5339113473892212,
"step": 2338
},
{
"epoch": 2.524271844660194,
"grad_norm": 5.516918659210205,
"learning_rate": 8.039009131769548e-08,
"loss": 0.6151620745658875,
"step": 2340
},
{
"epoch": 2.526429341963323,
"grad_norm": 18.284648895263672,
"learning_rate": 8.012198397882164e-08,
"loss": 0.5498458743095398,
"step": 2342
},
{
"epoch": 2.528586839266451,
"grad_norm": 5.122011661529541,
"learning_rate": 7.98549796655148e-08,
"loss": 0.5199579000473022,
"step": 2344
},
{
"epoch": 2.530744336569579,
"grad_norm": 1.5160562992095947,
"learning_rate": 7.958907988904126e-08,
"loss": 0.6041461825370789,
"step": 2346
},
{
"epoch": 2.5329018338727076,
"grad_norm": 1.5068459510803223,
"learning_rate": 7.932428615441553e-08,
"loss": 0.593430757522583,
"step": 2348
},
{
"epoch": 2.535059331175836,
"grad_norm": 1.434343934059143,
"learning_rate": 7.9060599960392e-08,
"loss": 0.31969255208969116,
"step": 2350
},
{
"epoch": 2.5372168284789645,
"grad_norm": 2.9943349361419678,
"learning_rate": 7.879802279945609e-08,
"loss": 0.5539549589157104,
"step": 2352
},
{
"epoch": 2.539374325782093,
"grad_norm": 1.722773790359497,
"learning_rate": 7.85365561578161e-08,
"loss": 0.5361152291297913,
"step": 2354
},
{
"epoch": 2.5415318230852213,
"grad_norm": 1.305713176727295,
"learning_rate": 7.827620151539466e-08,
"loss": 0.4791654050350189,
"step": 2356
},
{
"epoch": 2.5436893203883493,
"grad_norm": 3.0847299098968506,
"learning_rate": 7.801696034582053e-08,
"loss": 0.5922753810882568,
"step": 2358
},
{
"epoch": 2.545846817691478,
"grad_norm": 2.508219003677368,
"learning_rate": 7.77588341164198e-08,
"loss": 0.5202733874320984,
"step": 2360
},
{
"epoch": 2.548004314994606,
"grad_norm": 1.4630476236343384,
"learning_rate": 7.750182428820827e-08,
"loss": 0.5938437581062317,
"step": 2362
},
{
"epoch": 2.5501618122977345,
"grad_norm": 4.091277122497559,
"learning_rate": 7.724593231588272e-08,
"loss": 0.5015133619308472,
"step": 2364
},
{
"epoch": 2.552319309600863,
"grad_norm": 1.16217839717865,
"learning_rate": 7.699115964781254e-08,
"loss": 0.27357974648475647,
"step": 2366
},
{
"epoch": 2.5544768069039914,
"grad_norm": 1.87477707862854,
"learning_rate": 7.673750772603207e-08,
"loss": 0.612422525882721,
"step": 2368
},
{
"epoch": 2.55663430420712,
"grad_norm": 1.7888860702514648,
"learning_rate": 7.6484977986232e-08,
"loss": 0.7179882526397705,
"step": 2370
},
{
"epoch": 2.558791801510248,
"grad_norm": 0.9722982048988342,
"learning_rate": 7.623357185775133e-08,
"loss": 0.4070899486541748,
"step": 2372
},
{
"epoch": 2.5609492988133766,
"grad_norm": 1.3785275220870972,
"learning_rate": 7.598329076356936e-08,
"loss": 0.5034173727035522,
"step": 2374
},
{
"epoch": 2.5631067961165046,
"grad_norm": 1.4615991115570068,
"learning_rate": 7.573413612029774e-08,
"loss": 0.42231953144073486,
"step": 2376
},
{
"epoch": 2.5652642934196335,
"grad_norm": 1.8301200866699219,
"learning_rate": 7.548610933817214e-08,
"loss": 0.4184509217739105,
"step": 2378
},
{
"epoch": 2.5674217907227614,
"grad_norm": 2.232179880142212,
"learning_rate": 7.523921182104446e-08,
"loss": 0.4791230261325836,
"step": 2380
},
{
"epoch": 2.56957928802589,
"grad_norm": 1.1662960052490234,
"learning_rate": 7.499344496637498e-08,
"loss": 0.45605581998825073,
"step": 2382
},
{
"epoch": 2.5717367853290183,
"grad_norm": 1.7335278987884521,
"learning_rate": 7.474881016522429e-08,
"loss": 0.4822132885456085,
"step": 2384
},
{
"epoch": 2.5738942826321467,
"grad_norm": 1.393849492073059,
"learning_rate": 7.45053088022454e-08,
"loss": 0.61832594871521,
"step": 2386
},
{
"epoch": 2.576051779935275,
"grad_norm": 2.2385711669921875,
"learning_rate": 7.426294225567596e-08,
"loss": 0.41741040349006653,
"step": 2388
},
{
"epoch": 2.5782092772384035,
"grad_norm": 1.4406133890151978,
"learning_rate": 7.40217118973306e-08,
"loss": 0.4191496670246124,
"step": 2390
},
{
"epoch": 2.580366774541532,
"grad_norm": 1.7564983367919922,
"learning_rate": 7.378161909259297e-08,
"loss": 0.5521677732467651,
"step": 2392
},
{
"epoch": 2.58252427184466,
"grad_norm": 1.317151665687561,
"learning_rate": 7.354266520040793e-08,
"loss": 0.5924421548843384,
"step": 2394
},
{
"epoch": 2.584681769147789,
"grad_norm": 7.4354753494262695,
"learning_rate": 7.330485157327426e-08,
"loss": 0.46872678399086,
"step": 2396
},
{
"epoch": 2.5868392664509168,
"grad_norm": 1.6784964799880981,
"learning_rate": 7.306817955723654e-08,
"loss": 0.4340111017227173,
"step": 2398
},
{
"epoch": 2.588996763754045,
"grad_norm": 2.780867099761963,
"learning_rate": 7.283265049187784e-08,
"loss": 0.35171282291412354,
"step": 2400
},
{
"epoch": 2.5911542610571736,
"grad_norm": 1.664075493812561,
"learning_rate": 7.259826571031191e-08,
"loss": 0.39083340764045715,
"step": 2402
},
{
"epoch": 2.593311758360302,
"grad_norm": 3.530792713165283,
"learning_rate": 7.236502653917599e-08,
"loss": 0.4641299247741699,
"step": 2404
},
{
"epoch": 2.5954692556634305,
"grad_norm": 1.017684817314148,
"learning_rate": 7.213293429862288e-08,
"loss": 0.3411005437374115,
"step": 2406
},
{
"epoch": 2.597626752966559,
"grad_norm": 3.9479050636291504,
"learning_rate": 7.190199030231364e-08,
"loss": 0.5616810321807861,
"step": 2408
},
{
"epoch": 2.5997842502696873,
"grad_norm": 5.205540180206299,
"learning_rate": 7.167219585741041e-08,
"loss": 0.5188603401184082,
"step": 2410
},
{
"epoch": 2.6019417475728153,
"grad_norm": 1.752669334411621,
"learning_rate": 7.144355226456839e-08,
"loss": 0.622796893119812,
"step": 2412
},
{
"epoch": 2.604099244875944,
"grad_norm": 1.7586170434951782,
"learning_rate": 7.121606081792928e-08,
"loss": 0.4979010820388794,
"step": 2414
},
{
"epoch": 2.606256742179072,
"grad_norm": 1.7928980588912964,
"learning_rate": 7.098972280511323e-08,
"loss": 0.40664538741111755,
"step": 2416
},
{
"epoch": 2.6084142394822005,
"grad_norm": 1.9738396406173706,
"learning_rate": 7.076453950721202e-08,
"loss": 0.5753185153007507,
"step": 2418
},
{
"epoch": 2.610571736785329,
"grad_norm": 1.1539170742034912,
"learning_rate": 7.054051219878153e-08,
"loss": 0.47662532329559326,
"step": 2420
},
{
"epoch": 2.6127292340884574,
"grad_norm": 2.355470895767212,
"learning_rate": 7.031764214783478e-08,
"loss": 0.4526709318161011,
"step": 2422
},
{
"epoch": 2.614886731391586,
"grad_norm": 1.3842222690582275,
"learning_rate": 7.009593061583462e-08,
"loss": 0.4917500615119934,
"step": 2424
},
{
"epoch": 2.617044228694714,
"grad_norm": 7.645388603210449,
"learning_rate": 6.987537885768635e-08,
"loss": 0.504601240158081,
"step": 2426
},
{
"epoch": 2.6192017259978426,
"grad_norm": 1.4394519329071045,
"learning_rate": 6.965598812173118e-08,
"loss": 0.6155430674552917,
"step": 2428
},
{
"epoch": 2.6213592233009706,
"grad_norm": 1.2602229118347168,
"learning_rate": 6.943775964973861e-08,
"loss": 0.5159276723861694,
"step": 2430
},
{
"epoch": 2.6235167206040995,
"grad_norm": 1.7222973108291626,
"learning_rate": 6.922069467689969e-08,
"loss": 0.46511101722717285,
"step": 2432
},
{
"epoch": 2.6256742179072274,
"grad_norm": 3.4029550552368164,
"learning_rate": 6.900479443182e-08,
"loss": 0.5705016851425171,
"step": 2434
},
{
"epoch": 2.627831715210356,
"grad_norm": 3.3381807804107666,
"learning_rate": 6.879006013651269e-08,
"loss": 0.588231086730957,
"step": 2436
},
{
"epoch": 2.6299892125134843,
"grad_norm": 1.722324013710022,
"learning_rate": 6.857649300639145e-08,
"loss": 0.4552815556526184,
"step": 2438
},
{
"epoch": 2.6321467098166127,
"grad_norm": 2.675380229949951,
"learning_rate": 6.836409425026375e-08,
"loss": 0.3620685040950775,
"step": 2440
},
{
"epoch": 2.634304207119741,
"grad_norm": 4.263212203979492,
"learning_rate": 6.815286507032405e-08,
"loss": 0.33681440353393555,
"step": 2442
},
{
"epoch": 2.6364617044228695,
"grad_norm": 3.89007830619812,
"learning_rate": 6.794280666214682e-08,
"loss": 0.4459841251373291,
"step": 2444
},
{
"epoch": 2.638619201725998,
"grad_norm": 3.6217668056488037,
"learning_rate": 6.773392021467987e-08,
"loss": 0.5162920951843262,
"step": 2446
},
{
"epoch": 2.6407766990291264,
"grad_norm": 3.093386173248291,
"learning_rate": 6.752620691023762e-08,
"loss": 0.25055232644081116,
"step": 2448
},
{
"epoch": 2.642934196332255,
"grad_norm": 3.092965602874756,
"learning_rate": 6.731966792449451e-08,
"loss": 0.6372309923171997,
"step": 2450
},
{
"epoch": 2.6450916936353828,
"grad_norm": 1.5319708585739136,
"learning_rate": 6.711430442647809e-08,
"loss": 0.4929147958755493,
"step": 2452
},
{
"epoch": 2.647249190938511,
"grad_norm": 13.12871265411377,
"learning_rate": 6.691011757856258e-08,
"loss": 0.434012770652771,
"step": 2454
},
{
"epoch": 2.6494066882416396,
"grad_norm": 1.5877397060394287,
"learning_rate": 6.670710853646239e-08,
"loss": 0.43648290634155273,
"step": 2456
},
{
"epoch": 2.651564185544768,
"grad_norm": 1.0320699214935303,
"learning_rate": 6.650527844922533e-08,
"loss": 0.4268641471862793,
"step": 2458
},
{
"epoch": 2.6537216828478964,
"grad_norm": 3.019049644470215,
"learning_rate": 6.630462845922622e-08,
"loss": 0.6072458624839783,
"step": 2460
},
{
"epoch": 2.655879180151025,
"grad_norm": 1.9484155178070068,
"learning_rate": 6.610515970216046e-08,
"loss": 0.42939677834510803,
"step": 2462
},
{
"epoch": 2.6580366774541533,
"grad_norm": 1.258055567741394,
"learning_rate": 6.59068733070377e-08,
"loss": 0.45139870047569275,
"step": 2464
},
{
"epoch": 2.6601941747572817,
"grad_norm": 0.20732815563678741,
"learning_rate": 6.570977039617512e-08,
"loss": 0.19261834025382996,
"step": 2466
},
{
"epoch": 2.66235167206041,
"grad_norm": 1.5855305194854736,
"learning_rate": 6.551385208519136e-08,
"loss": 0.609540581703186,
"step": 2468
},
{
"epoch": 2.664509169363538,
"grad_norm": 1.3604331016540527,
"learning_rate": 6.531911948300026e-08,
"loss": 0.4661960303783417,
"step": 2470
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.1402088403701782,
"learning_rate": 6.512557369180416e-08,
"loss": 0.3893601894378662,
"step": 2472
},
{
"epoch": 2.668824163969795,
"grad_norm": 1.1722886562347412,
"learning_rate": 6.493321580708825e-08,
"loss": 0.50113445520401,
"step": 2474
},
{
"epoch": 2.6709816612729234,
"grad_norm": 1.2451552152633667,
"learning_rate": 6.474204691761392e-08,
"loss": 0.5579499006271362,
"step": 2476
},
{
"epoch": 2.6731391585760518,
"grad_norm": 1.9142377376556396,
"learning_rate": 6.455206810541275e-08,
"loss": 0.5365015864372253,
"step": 2478
},
{
"epoch": 2.67529665587918,
"grad_norm": 4.749199867248535,
"learning_rate": 6.436328044578045e-08,
"loss": 0.5498421788215637,
"step": 2480
},
{
"epoch": 2.6774541531823086,
"grad_norm": 4.736466884613037,
"learning_rate": 6.417568500727065e-08,
"loss": 0.474033921957016,
"step": 2482
},
{
"epoch": 2.679611650485437,
"grad_norm": 1.416872262954712,
"learning_rate": 6.398928285168894e-08,
"loss": 0.5008449554443359,
"step": 2484
},
{
"epoch": 2.6817691477885655,
"grad_norm": 1.2803456783294678,
"learning_rate": 6.380407503408675e-08,
"loss": 0.4675408601760864,
"step": 2486
},
{
"epoch": 2.6839266450916934,
"grad_norm": 2.346578598022461,
"learning_rate": 6.362006260275566e-08,
"loss": 0.48824068903923035,
"step": 2488
},
{
"epoch": 2.686084142394822,
"grad_norm": 1.5030266046524048,
"learning_rate": 6.343724659922105e-08,
"loss": 0.4942224323749542,
"step": 2490
},
{
"epoch": 2.6882416396979503,
"grad_norm": 4.458725929260254,
"learning_rate": 6.325562805823647e-08,
"loss": 0.5143862962722778,
"step": 2492
},
{
"epoch": 2.6903991370010787,
"grad_norm": 0.25812989473342896,
"learning_rate": 6.307520800777791e-08,
"loss": 0.06615746021270752,
"step": 2494
},
{
"epoch": 2.692556634304207,
"grad_norm": 1.5842936038970947,
"learning_rate": 6.289598746903753e-08,
"loss": 0.488372266292572,
"step": 2496
},
{
"epoch": 2.6947141316073355,
"grad_norm": 2.356571674346924,
"learning_rate": 6.271796745641836e-08,
"loss": 0.33276641368865967,
"step": 2498
},
{
"epoch": 2.696871628910464,
"grad_norm": 1.8246636390686035,
"learning_rate": 6.254114897752822e-08,
"loss": 0.534456193447113,
"step": 2500
},
{
"epoch": 2.6990291262135924,
"grad_norm": 1.4518077373504639,
"learning_rate": 6.23655330331743e-08,
"loss": 0.48372286558151245,
"step": 2502
},
{
"epoch": 2.701186623516721,
"grad_norm": 1.9352519512176514,
"learning_rate": 6.21911206173572e-08,
"loss": 0.44714611768722534,
"step": 2504
},
{
"epoch": 2.7033441208198488,
"grad_norm": 63.32822036743164,
"learning_rate": 6.20179127172655e-08,
"loss": 0.49458324909210205,
"step": 2506
},
{
"epoch": 2.705501618122977,
"grad_norm": 1.128757357597351,
"learning_rate": 6.184591031327023e-08,
"loss": 0.53676438331604,
"step": 2508
},
{
"epoch": 2.7076591154261056,
"grad_norm": 1.42232346534729,
"learning_rate": 6.1675114378919e-08,
"loss": 0.6496074199676514,
"step": 2510
},
{
"epoch": 2.709816612729234,
"grad_norm": 1.3712254762649536,
"learning_rate": 6.150552588093088e-08,
"loss": 0.30613094568252563,
"step": 2512
},
{
"epoch": 2.7119741100323624,
"grad_norm": 1.141538381576538,
"learning_rate": 6.133714577919062e-08,
"loss": 0.6155597567558289,
"step": 2514
},
{
"epoch": 2.714131607335491,
"grad_norm": 1.5372565984725952,
"learning_rate": 6.116997502674356e-08,
"loss": 0.5866535305976868,
"step": 2516
},
{
"epoch": 2.7162891046386193,
"grad_norm": 1.712314248085022,
"learning_rate": 6.100401456978973e-08,
"loss": 0.6070207357406616,
"step": 2518
},
{
"epoch": 2.7184466019417477,
"grad_norm": 1.3143550157546997,
"learning_rate": 6.0839265347679e-08,
"loss": 0.46849966049194336,
"step": 2520
},
{
"epoch": 2.720604099244876,
"grad_norm": 1.805611491203308,
"learning_rate": 6.06757282929055e-08,
"loss": 0.44359534978866577,
"step": 2522
},
{
"epoch": 2.722761596548004,
"grad_norm": 3.4996039867401123,
"learning_rate": 6.051340433110235e-08,
"loss": 0.4810839295387268,
"step": 2524
},
{
"epoch": 2.724919093851133,
"grad_norm": 0.6923230886459351,
"learning_rate": 6.035229438103654e-08,
"loss": 0.47448840737342834,
"step": 2526
},
{
"epoch": 2.727076591154261,
"grad_norm": 1.4109259843826294,
"learning_rate": 6.019239935460361e-08,
"loss": 0.4482736885547638,
"step": 2528
},
{
"epoch": 2.7292340884573894,
"grad_norm": 2.148198127746582,
"learning_rate": 6.003372015682248e-08,
"loss": 0.47598910331726074,
"step": 2530
},
{
"epoch": 2.7313915857605178,
"grad_norm": 2.2834556102752686,
"learning_rate": 5.987625768583047e-08,
"loss": 0.5227712392807007,
"step": 2532
},
{
"epoch": 2.733549083063646,
"grad_norm": 3.4445295333862305,
"learning_rate": 5.972001283287814e-08,
"loss": 0.4431000053882599,
"step": 2534
},
{
"epoch": 2.7357065803667746,
"grad_norm": 2.449721097946167,
"learning_rate": 5.956498648232411e-08,
"loss": 0.4020468294620514,
"step": 2536
},
{
"epoch": 2.737864077669903,
"grad_norm": 6.9917073249816895,
"learning_rate": 5.9411179511630237e-08,
"loss": 0.3725473880767822,
"step": 2538
},
{
"epoch": 2.7400215749730314,
"grad_norm": 2.097342014312744,
"learning_rate": 5.9258592791356675e-08,
"loss": 0.47959214448928833,
"step": 2540
},
{
"epoch": 2.7421790722761594,
"grad_norm": 1.6001266241073608,
"learning_rate": 5.910722718515675e-08,
"loss": 0.49233609437942505,
"step": 2542
},
{
"epoch": 2.7443365695792883,
"grad_norm": 2.2737677097320557,
"learning_rate": 5.8957083549772227e-08,
"loss": 0.5828397870063782,
"step": 2544
},
{
"epoch": 2.7464940668824163,
"grad_norm": 4.293592929840088,
"learning_rate": 5.880816273502835e-08,
"loss": 0.40149906277656555,
"step": 2546
},
{
"epoch": 2.7486515641855447,
"grad_norm": 2.3264200687408447,
"learning_rate": 5.866046558382924e-08,
"loss": 0.5630208849906921,
"step": 2548
},
{
"epoch": 2.750809061488673,
"grad_norm": 0.47289416193962097,
"learning_rate": 5.851399293215284e-08,
"loss": 0.3701988160610199,
"step": 2550
},
{
"epoch": 2.7529665587918015,
"grad_norm": 4.011696815490723,
"learning_rate": 5.8368745609046394e-08,
"loss": 0.4746440351009369,
"step": 2552
},
{
"epoch": 2.75512405609493,
"grad_norm": 4.506484508514404,
"learning_rate": 5.8224724436621686e-08,
"loss": 0.41828322410583496,
"step": 2554
},
{
"epoch": 2.7572815533980584,
"grad_norm": 1.0973607301712036,
"learning_rate": 5.808193023005037e-08,
"loss": 0.35553479194641113,
"step": 2556
},
{
"epoch": 2.759439050701187,
"grad_norm": 3.0977184772491455,
"learning_rate": 5.7940363797559355e-08,
"loss": 0.6049969792366028,
"step": 2558
},
{
"epoch": 2.7615965480043148,
"grad_norm": 3.566246271133423,
"learning_rate": 5.780002594042628e-08,
"loss": 0.4752573072910309,
"step": 2560
},
{
"epoch": 2.7637540453074436,
"grad_norm": 3.1380441188812256,
"learning_rate": 5.766091745297499e-08,
"loss": 0.42298072576522827,
"step": 2562
},
{
"epoch": 2.7659115426105716,
"grad_norm": 1.4781795740127563,
"learning_rate": 5.752303912257083e-08,
"loss": 0.27772021293640137,
"step": 2564
},
{
"epoch": 2.7680690399137,
"grad_norm": 0.4035155773162842,
"learning_rate": 5.738639172961655e-08,
"loss": 0.2534405291080475,
"step": 2566
},
{
"epoch": 2.7702265372168284,
"grad_norm": 2.6446008682250977,
"learning_rate": 5.725097604754762e-08,
"loss": 0.445311039686203,
"step": 2568
},
{
"epoch": 2.772384034519957,
"grad_norm": 0.6513259410858154,
"learning_rate": 5.7116792842827847e-08,
"loss": 0.3810059428215027,
"step": 2570
},
{
"epoch": 2.7745415318230853,
"grad_norm": 1.0772795677185059,
"learning_rate": 5.698384287494524e-08,
"loss": 0.4859530031681061,
"step": 2572
},
{
"epoch": 2.7766990291262137,
"grad_norm": 1.3191403150558472,
"learning_rate": 5.68521268964075e-08,
"loss": 0.3801627457141876,
"step": 2574
},
{
"epoch": 2.778856526429342,
"grad_norm": 1.1567744016647339,
"learning_rate": 5.672164565273794e-08,
"loss": 0.43105101585388184,
"step": 2576
},
{
"epoch": 2.78101402373247,
"grad_norm": 1.525492548942566,
"learning_rate": 5.6592399882471005e-08,
"loss": 0.46906399726867676,
"step": 2578
},
{
"epoch": 2.783171521035599,
"grad_norm": 1.5278234481811523,
"learning_rate": 5.646439031714843e-08,
"loss": 0.44850075244903564,
"step": 2580
},
{
"epoch": 2.785329018338727,
"grad_norm": 1.656800627708435,
"learning_rate": 5.633761768131492e-08,
"loss": 0.4555439352989197,
"step": 2582
},
{
"epoch": 2.7874865156418553,
"grad_norm": 5.497950077056885,
"learning_rate": 5.6212082692513836e-08,
"loss": 0.5264440774917603,
"step": 2584
},
{
"epoch": 2.7896440129449838,
"grad_norm": 3.069579601287842,
"learning_rate": 5.608778606128367e-08,
"loss": 0.4567970633506775,
"step": 2586
},
{
"epoch": 2.791801510248112,
"grad_norm": 0.7134677767753601,
"learning_rate": 5.59647284911535e-08,
"loss": 0.5471997261047363,
"step": 2588
},
{
"epoch": 2.7939590075512406,
"grad_norm": 1.767069935798645,
"learning_rate": 5.5842910678639274e-08,
"loss": 0.5194593667984009,
"step": 2590
},
{
"epoch": 2.796116504854369,
"grad_norm": 1.2429695129394531,
"learning_rate": 5.5722333313239796e-08,
"loss": 0.472802996635437,
"step": 2592
},
{
"epoch": 2.7982740021574974,
"grad_norm": 3.2857539653778076,
"learning_rate": 5.5602997077432874e-08,
"loss": 0.6141800880432129,
"step": 2594
},
{
"epoch": 2.8004314994606254,
"grad_norm": 1.292257308959961,
"learning_rate": 5.548490264667141e-08,
"loss": 0.4846678376197815,
"step": 2596
},
{
"epoch": 2.8025889967637543,
"grad_norm": 22.469823837280273,
"learning_rate": 5.536805068937954e-08,
"loss": 0.5878589749336243,
"step": 2598
},
{
"epoch": 2.8047464940668823,
"grad_norm": 3.061379909515381,
"learning_rate": 5.525244186694894e-08,
"loss": 0.4901062548160553,
"step": 2600
},
{
"epoch": 2.8069039913700107,
"grad_norm": 2.615751266479492,
"learning_rate": 5.5138076833735084e-08,
"loss": 0.40489572286605835,
"step": 2602
},
{
"epoch": 2.809061488673139,
"grad_norm": 1.320258378982544,
"learning_rate": 5.5024956237053384e-08,
"loss": 0.5788986682891846,
"step": 2604
},
{
"epoch": 2.8112189859762675,
"grad_norm": 1.3249293565750122,
"learning_rate": 5.491308071717573e-08,
"loss": 0.42938145995140076,
"step": 2606
},
{
"epoch": 2.813376483279396,
"grad_norm": 1.0026346445083618,
"learning_rate": 5.480245090732673e-08,
"loss": 0.495646595954895,
"step": 2608
},
{
"epoch": 2.8155339805825244,
"grad_norm": 1.6520612239837646,
"learning_rate": 5.469306743368023e-08,
"loss": 0.4816511273384094,
"step": 2610
},
{
"epoch": 2.8176914778856528,
"grad_norm": 1.3639336824417114,
"learning_rate": 5.458493091535563e-08,
"loss": 0.3476675748825073,
"step": 2612
},
{
"epoch": 2.8198489751887807,
"grad_norm": 1.3383265733718872,
"learning_rate": 5.447804196441453e-08,
"loss": 0.5728883147239685,
"step": 2614
},
{
"epoch": 2.8220064724919096,
"grad_norm": 3.793182134628296,
"learning_rate": 5.4372401185857145e-08,
"loss": 0.6043237447738647,
"step": 2616
},
{
"epoch": 2.8241639697950376,
"grad_norm": 2.1428301334381104,
"learning_rate": 5.426800917761897e-08,
"loss": 0.529897928237915,
"step": 2618
},
{
"epoch": 2.826321467098166,
"grad_norm": 2.7142257690429688,
"learning_rate": 5.41648665305673e-08,
"loss": 0.5976702570915222,
"step": 2620
},
{
"epoch": 2.8284789644012944,
"grad_norm": 1.4826109409332275,
"learning_rate": 5.406297382849803e-08,
"loss": 0.4717695116996765,
"step": 2622
},
{
"epoch": 2.830636461704423,
"grad_norm": 1.6335054636001587,
"learning_rate": 5.396233164813221e-08,
"loss": 0.48008373379707336,
"step": 2624
},
{
"epoch": 2.8327939590075513,
"grad_norm": 1.709679126739502,
"learning_rate": 5.3862940559112795e-08,
"loss": 0.5768192410469055,
"step": 2626
},
{
"epoch": 2.8349514563106797,
"grad_norm": 3.3123061656951904,
"learning_rate": 5.376480112400159e-08,
"loss": 0.5282171368598938,
"step": 2628
},
{
"epoch": 2.837108953613808,
"grad_norm": 6.133842945098877,
"learning_rate": 5.366791389827578e-08,
"loss": 0.47790658473968506,
"step": 2630
},
{
"epoch": 2.839266450916936,
"grad_norm": 2.2591352462768555,
"learning_rate": 5.3572279430325055e-08,
"loss": 0.5901204347610474,
"step": 2632
},
{
"epoch": 2.841423948220065,
"grad_norm": 1.7429542541503906,
"learning_rate": 5.3477898261448344e-08,
"loss": 0.40829578042030334,
"step": 2634
},
{
"epoch": 2.843581445523193,
"grad_norm": 1.7901886701583862,
"learning_rate": 5.3384770925850796e-08,
"loss": 0.6178877353668213,
"step": 2636
},
{
"epoch": 2.8457389428263213,
"grad_norm": 2.5778701305389404,
"learning_rate": 5.3292897950640776e-08,
"loss": 0.5174447298049927,
"step": 2638
},
{
"epoch": 2.8478964401294498,
"grad_norm": 1.1254876852035522,
"learning_rate": 5.3202279855826885e-08,
"loss": 0.48666954040527344,
"step": 2640
},
{
"epoch": 2.850053937432578,
"grad_norm": 1.12764310836792,
"learning_rate": 5.311291715431497e-08,
"loss": 0.5326154828071594,
"step": 2642
},
{
"epoch": 2.8522114347357066,
"grad_norm": 4.193994522094727,
"learning_rate": 5.3024810351905257e-08,
"loss": 0.552856981754303,
"step": 2644
},
{
"epoch": 2.854368932038835,
"grad_norm": 2.1312756538391113,
"learning_rate": 5.2937959947289485e-08,
"loss": 0.31122079491615295,
"step": 2646
},
{
"epoch": 2.8565264293419634,
"grad_norm": 2.0096354484558105,
"learning_rate": 5.2852366432048054e-08,
"loss": 0.4695837199687958,
"step": 2648
},
{
"epoch": 2.858683926645092,
"grad_norm": 2.734739065170288,
"learning_rate": 5.2768030290647315e-08,
"loss": 0.4716711640357971,
"step": 2650
},
{
"epoch": 2.8608414239482203,
"grad_norm": 2.2249631881713867,
"learning_rate": 5.26849520004367e-08,
"loss": 0.44928935170173645,
"step": 2652
},
{
"epoch": 2.8629989212513482,
"grad_norm": 4.771669387817383,
"learning_rate": 5.260313203164621e-08,
"loss": 0.49516862630844116,
"step": 2654
},
{
"epoch": 2.8651564185544767,
"grad_norm": 1.4668776988983154,
"learning_rate": 5.252257084738355e-08,
"loss": 0.4240492582321167,
"step": 2656
},
{
"epoch": 2.867313915857605,
"grad_norm": 1.4637136459350586,
"learning_rate": 5.244326890363166e-08,
"loss": 0.4604833126068115,
"step": 2658
},
{
"epoch": 2.8694714131607335,
"grad_norm": 2.386207342147827,
"learning_rate": 5.2365226649246e-08,
"loss": 0.5221148133277893,
"step": 2660
},
{
"epoch": 2.871628910463862,
"grad_norm": 1.389047622680664,
"learning_rate": 5.2288444525952225e-08,
"loss": 0.5388311147689819,
"step": 2662
},
{
"epoch": 2.8737864077669903,
"grad_norm": 1.2806384563446045,
"learning_rate": 5.221292296834336e-08,
"loss": 0.4201410114765167,
"step": 2664
},
{
"epoch": 2.8759439050701188,
"grad_norm": 2.330186605453491,
"learning_rate": 5.213866240387767e-08,
"loss": 0.48175758123397827,
"step": 2666
},
{
"epoch": 2.878101402373247,
"grad_norm": 1.2908588647842407,
"learning_rate": 5.206566325287606e-08,
"loss": 0.5429530739784241,
"step": 2668
},
{
"epoch": 2.8802588996763756,
"grad_norm": 1.2484322786331177,
"learning_rate": 5.199392592851967e-08,
"loss": 0.3116611838340759,
"step": 2670
},
{
"epoch": 2.8824163969795036,
"grad_norm": 1.2387123107910156,
"learning_rate": 5.192345083684766e-08,
"loss": 0.5519980192184448,
"step": 2672
},
{
"epoch": 2.884573894282632,
"grad_norm": 1.2122451066970825,
"learning_rate": 5.1854238376754894e-08,
"loss": 0.4367588758468628,
"step": 2674
},
{
"epoch": 2.8867313915857604,
"grad_norm": 1.0126200914382935,
"learning_rate": 5.178628893998947e-08,
"loss": 0.5291083455085754,
"step": 2676
},
{
"epoch": 2.888888888888889,
"grad_norm": 17.17489242553711,
"learning_rate": 5.171960291115085e-08,
"loss": 0.440005362033844,
"step": 2678
},
{
"epoch": 2.8910463861920173,
"grad_norm": 2.179588794708252,
"learning_rate": 5.165418066768743e-08,
"loss": 0.33802393078804016,
"step": 2680
},
{
"epoch": 2.8932038834951457,
"grad_norm": 1.7356486320495605,
"learning_rate": 5.1590022579894453e-08,
"loss": 0.6102227568626404,
"step": 2682
},
{
"epoch": 2.895361380798274,
"grad_norm": 1.4570553302764893,
"learning_rate": 5.152712901091197e-08,
"loss": 0.519218921661377,
"step": 2684
},
{
"epoch": 2.8975188781014025,
"grad_norm": 1.47493577003479,
"learning_rate": 5.146550031672273e-08,
"loss": 0.5683881640434265,
"step": 2686
},
{
"epoch": 2.899676375404531,
"grad_norm": 2.640059471130371,
"learning_rate": 5.1405136846150246e-08,
"loss": 0.3549501597881317,
"step": 2688
},
{
"epoch": 2.901833872707659,
"grad_norm": 3.362346887588501,
"learning_rate": 5.1346038940856663e-08,
"loss": 0.4709499180316925,
"step": 2690
},
{
"epoch": 2.9039913700107873,
"grad_norm": 2.4766998291015625,
"learning_rate": 5.1288206935341004e-08,
"loss": 0.43772682547569275,
"step": 2692
},
{
"epoch": 2.9061488673139158,
"grad_norm": 12.43819808959961,
"learning_rate": 5.123164115693719e-08,
"loss": 0.5715151429176331,
"step": 2694
},
{
"epoch": 2.908306364617044,
"grad_norm": 1.2701572179794312,
"learning_rate": 5.11763419258121e-08,
"loss": 0.4489721655845642,
"step": 2696
},
{
"epoch": 2.9104638619201726,
"grad_norm": 2.798393964767456,
"learning_rate": 5.112230955496399e-08,
"loss": 0.483008474111557,
"step": 2698
},
{
"epoch": 2.912621359223301,
"grad_norm": 2.360163688659668,
"learning_rate": 5.106954435022051e-08,
"loss": 0.23574459552764893,
"step": 2700
},
{
"epoch": 2.9147788565264294,
"grad_norm": 1.3117761611938477,
"learning_rate": 5.1018046610236994e-08,
"loss": 0.4940152168273926,
"step": 2702
},
{
"epoch": 2.916936353829558,
"grad_norm": 4.25691556930542,
"learning_rate": 5.0967816626494914e-08,
"loss": 0.6037485003471375,
"step": 2704
},
{
"epoch": 2.9190938511326863,
"grad_norm": 1.7997087240219116,
"learning_rate": 5.09188546833001e-08,
"loss": 0.5916652083396912,
"step": 2706
},
{
"epoch": 2.9212513484358142,
"grad_norm": 1.2903374433517456,
"learning_rate": 5.0871161057781174e-08,
"loss": 0.5085786581039429,
"step": 2708
},
{
"epoch": 2.9234088457389427,
"grad_norm": 2.052659511566162,
"learning_rate": 5.0824736019887965e-08,
"loss": 0.4842919409275055,
"step": 2710
},
{
"epoch": 2.925566343042071,
"grad_norm": 1.6598625183105469,
"learning_rate": 5.077957983239001e-08,
"loss": 0.485705703496933,
"step": 2712
},
{
"epoch": 2.9277238403451995,
"grad_norm": 1.2494041919708252,
"learning_rate": 5.0735692750875014e-08,
"loss": 0.5640828609466553,
"step": 2714
},
{
"epoch": 2.929881337648328,
"grad_norm": 2.9620018005371094,
"learning_rate": 5.0693075023747485e-08,
"loss": 0.4791678786277771,
"step": 2716
},
{
"epoch": 2.9320388349514563,
"grad_norm": 3.0161349773406982,
"learning_rate": 5.0651726892227225e-08,
"loss": 0.5130857229232788,
"step": 2718
},
{
"epoch": 2.9341963322545848,
"grad_norm": 3.2139105796813965,
"learning_rate": 5.061164859034808e-08,
"loss": 0.5509821176528931,
"step": 2720
},
{
"epoch": 2.936353829557713,
"grad_norm": 2.8673787117004395,
"learning_rate": 5.057284034495652e-08,
"loss": 0.4344579875469208,
"step": 2722
},
{
"epoch": 2.9385113268608416,
"grad_norm": 2.557595729827881,
"learning_rate": 5.05353023757104e-08,
"loss": 0.4874427616596222,
"step": 2724
},
{
"epoch": 2.9406688241639696,
"grad_norm": 1.2036933898925781,
"learning_rate": 5.04990348950777e-08,
"loss": 0.3965007960796356,
"step": 2726
},
{
"epoch": 2.9428263214670984,
"grad_norm": 3.1260766983032227,
"learning_rate": 5.0464038108335355e-08,
"loss": 0.5202281475067139,
"step": 2728
},
{
"epoch": 2.9449838187702264,
"grad_norm": 10.786460876464844,
"learning_rate": 5.043031221356804e-08,
"loss": 0.37282276153564453,
"step": 2730
},
{
"epoch": 2.947141316073355,
"grad_norm": 1.7192671298980713,
"learning_rate": 5.039785740166707e-08,
"loss": 0.40285778045654297,
"step": 2732
},
{
"epoch": 2.9492988133764833,
"grad_norm": 1.8213419914245605,
"learning_rate": 5.036667385632939e-08,
"loss": 0.49603918194770813,
"step": 2734
},
{
"epoch": 2.9514563106796117,
"grad_norm": 1.3751518726348877,
"learning_rate": 5.0336761754056387e-08,
"loss": 0.4726869463920593,
"step": 2736
},
{
"epoch": 2.95361380798274,
"grad_norm": 2.531486749649048,
"learning_rate": 5.030812126415301e-08,
"loss": 0.4985443949699402,
"step": 2738
},
{
"epoch": 2.9557713052858685,
"grad_norm": 1.1253113746643066,
"learning_rate": 5.028075254872682e-08,
"loss": 0.43464401364326477,
"step": 2740
},
{
"epoch": 2.957928802588997,
"grad_norm": 1.2070587873458862,
"learning_rate": 5.025465576268697e-08,
"loss": 0.4627860486507416,
"step": 2742
},
{
"epoch": 2.960086299892125,
"grad_norm": 6.266397953033447,
"learning_rate": 5.0229831053743396e-08,
"loss": 0.4669394791126251,
"step": 2744
},
{
"epoch": 2.9622437971952538,
"grad_norm": 1.160976767539978,
"learning_rate": 5.020627856240602e-08,
"loss": 0.5476797819137573,
"step": 2746
},
{
"epoch": 2.9644012944983817,
"grad_norm": 1.2844939231872559,
"learning_rate": 5.018399842198384e-08,
"loss": 0.4904819130897522,
"step": 2748
},
{
"epoch": 2.96655879180151,
"grad_norm": 2.1023972034454346,
"learning_rate": 5.016299075858434e-08,
"loss": 0.5619233250617981,
"step": 2750
},
{
"epoch": 2.9687162891046386,
"grad_norm": 2.740511417388916,
"learning_rate": 5.0143255691112545e-08,
"loss": 0.5617838501930237,
"step": 2752
},
{
"epoch": 2.970873786407767,
"grad_norm": 1.1180198192596436,
"learning_rate": 5.012479333127061e-08,
"loss": 0.6266708374023438,
"step": 2754
},
{
"epoch": 2.9730312837108954,
"grad_norm": 2.0200552940368652,
"learning_rate": 5.0107603783556983e-08,
"loss": 0.2789224684238434,
"step": 2756
},
{
"epoch": 2.975188781014024,
"grad_norm": 1.1091794967651367,
"learning_rate": 5.009168714526591e-08,
"loss": 0.5262956023216248,
"step": 2758
},
{
"epoch": 2.9773462783171523,
"grad_norm": 1.1029998064041138,
"learning_rate": 5.0077043506486894e-08,
"loss": 0.1211983859539032,
"step": 2760
},
{
"epoch": 2.9795037756202802,
"grad_norm": 1.243391990661621,
"learning_rate": 5.006367295010413e-08,
"loss": 0.36533698439598083,
"step": 2762
},
{
"epoch": 2.981661272923409,
"grad_norm": 1.189220666885376,
"learning_rate": 5.005157555179603e-08,
"loss": 0.6093316078186035,
"step": 2764
},
{
"epoch": 2.983818770226537,
"grad_norm": 1.3239524364471436,
"learning_rate": 5.0040751380034905e-08,
"loss": 0.29459065198898315,
"step": 2766
},
{
"epoch": 2.9859762675296655,
"grad_norm": 3.5429818630218506,
"learning_rate": 5.0031200496086436e-08,
"loss": 0.5054223537445068,
"step": 2768
},
{
"epoch": 2.988133764832794,
"grad_norm": 3.4252212047576904,
"learning_rate": 5.0022922954009416e-08,
"loss": 0.34435006976127625,
"step": 2770
},
{
"epoch": 2.9902912621359223,
"grad_norm": 2.453650951385498,
"learning_rate": 5.001591880065541e-08,
"loss": 0.5675299167633057,
"step": 2772
},
{
"epoch": 2.9924487594390508,
"grad_norm": 1.6163811683654785,
"learning_rate": 5.001018807566848e-08,
"loss": 0.5318145751953125,
"step": 2774
},
{
"epoch": 2.994606256742179,
"grad_norm": 4.034750461578369,
"learning_rate": 5.000573081148502e-08,
"loss": 0.5025465488433838,
"step": 2776
},
{
"epoch": 2.9967637540453076,
"grad_norm": 1.6249115467071533,
"learning_rate": 5.0002547033333525e-08,
"loss": 0.235714852809906,
"step": 2778
},
{
"epoch": 2.9989212513484356,
"grad_norm": 2.0912704467773438,
"learning_rate": 5.000063675923442e-08,
"loss": 0.2616070806980133,
"step": 2780
},
{
"epoch": 3.0,
"step": 2781,
"total_flos": 3.284111394515778e+18,
"train_loss": 0.5546778752215686,
"train_runtime": 35021.8231,
"train_samples_per_second": 1.271,
"train_steps_per_second": 0.079
}
],
"logging_steps": 2,
"max_steps": 2781,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.284111394515778e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}