My-Qwen2-Audio-Instruct / SFT /trainer_state.json
MYJOKERML's picture
Upload SFT model
8b40b2e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 2500,
"global_step": 5540,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009025270758122744,
"grad_norm": 9.466440213077052,
"learning_rate": 1.9985559566787006e-05,
"loss": 2.4118,
"step": 5
},
{
"epoch": 0.018050541516245487,
"grad_norm": 5.092031681557354,
"learning_rate": 1.996750902527076e-05,
"loss": 0.8115,
"step": 10
},
{
"epoch": 0.02707581227436823,
"grad_norm": 3.581292467477243,
"learning_rate": 1.9949458483754514e-05,
"loss": 0.7566,
"step": 15
},
{
"epoch": 0.036101083032490974,
"grad_norm": 3.0664426091496577,
"learning_rate": 1.9931407942238267e-05,
"loss": 0.7238,
"step": 20
},
{
"epoch": 0.04512635379061372,
"grad_norm": 2.953202771996899,
"learning_rate": 1.9913357400722025e-05,
"loss": 0.7088,
"step": 25
},
{
"epoch": 0.05415162454873646,
"grad_norm": 3.2107443547649233,
"learning_rate": 1.989530685920578e-05,
"loss": 0.6899,
"step": 30
},
{
"epoch": 0.0631768953068592,
"grad_norm": 2.606024610125901,
"learning_rate": 1.9877256317689532e-05,
"loss": 0.6928,
"step": 35
},
{
"epoch": 0.07220216606498195,
"grad_norm": 3.2049516084300245,
"learning_rate": 1.9859205776173286e-05,
"loss": 0.69,
"step": 40
},
{
"epoch": 0.0812274368231047,
"grad_norm": 2.761288783636034,
"learning_rate": 1.984115523465704e-05,
"loss": 0.6867,
"step": 45
},
{
"epoch": 0.09025270758122744,
"grad_norm": 2.996941813764781,
"learning_rate": 1.9823104693140797e-05,
"loss": 0.6675,
"step": 50
},
{
"epoch": 0.09927797833935018,
"grad_norm": 2.7492945960977617,
"learning_rate": 1.980505415162455e-05,
"loss": 0.6852,
"step": 55
},
{
"epoch": 0.10830324909747292,
"grad_norm": 3.1658110710465124,
"learning_rate": 1.9787003610108305e-05,
"loss": 0.6447,
"step": 60
},
{
"epoch": 0.11732851985559567,
"grad_norm": 2.5627271539534777,
"learning_rate": 1.976895306859206e-05,
"loss": 0.6562,
"step": 65
},
{
"epoch": 0.1263537906137184,
"grad_norm": 2.752341860058262,
"learning_rate": 1.9750902527075816e-05,
"loss": 0.6771,
"step": 70
},
{
"epoch": 0.13537906137184116,
"grad_norm": 3.047967974028345,
"learning_rate": 1.973285198555957e-05,
"loss": 0.6656,
"step": 75
},
{
"epoch": 0.1444043321299639,
"grad_norm": 2.9706224809053916,
"learning_rate": 1.9714801444043323e-05,
"loss": 0.6635,
"step": 80
},
{
"epoch": 0.15342960288808663,
"grad_norm": 2.4472232682052253,
"learning_rate": 1.9696750902527077e-05,
"loss": 0.6474,
"step": 85
},
{
"epoch": 0.1624548736462094,
"grad_norm": 2.701915790395711,
"learning_rate": 1.967870036101083e-05,
"loss": 0.6641,
"step": 90
},
{
"epoch": 0.17148014440433212,
"grad_norm": 2.4548463645426946,
"learning_rate": 1.9660649819494585e-05,
"loss": 0.6509,
"step": 95
},
{
"epoch": 0.18050541516245489,
"grad_norm": 12.210856636716178,
"learning_rate": 1.964259927797834e-05,
"loss": 0.6646,
"step": 100
},
{
"epoch": 0.18953068592057762,
"grad_norm": 2.603197676839273,
"learning_rate": 1.9624548736462096e-05,
"loss": 0.6975,
"step": 105
},
{
"epoch": 0.19855595667870035,
"grad_norm": 2.414541801668063,
"learning_rate": 1.960649819494585e-05,
"loss": 0.6412,
"step": 110
},
{
"epoch": 0.2075812274368231,
"grad_norm": 2.5004150715560787,
"learning_rate": 1.9588447653429607e-05,
"loss": 0.6477,
"step": 115
},
{
"epoch": 0.21660649819494585,
"grad_norm": 2.65590491408136,
"learning_rate": 1.957039711191336e-05,
"loss": 0.6328,
"step": 120
},
{
"epoch": 0.22563176895306858,
"grad_norm": 2.8094738107146107,
"learning_rate": 1.9552346570397115e-05,
"loss": 0.643,
"step": 125
},
{
"epoch": 0.23465703971119134,
"grad_norm": 2.4628672221036787,
"learning_rate": 1.953429602888087e-05,
"loss": 0.6295,
"step": 130
},
{
"epoch": 0.24368231046931407,
"grad_norm": 2.6028074637701315,
"learning_rate": 1.9516245487364622e-05,
"loss": 0.6231,
"step": 135
},
{
"epoch": 0.2527075812274368,
"grad_norm": 2.6627654496834112,
"learning_rate": 1.9498194945848376e-05,
"loss": 0.6164,
"step": 140
},
{
"epoch": 0.26173285198555957,
"grad_norm": 2.428314569670895,
"learning_rate": 1.948014440433213e-05,
"loss": 0.6137,
"step": 145
},
{
"epoch": 0.27075812274368233,
"grad_norm": 2.145860067780341,
"learning_rate": 1.9462093862815884e-05,
"loss": 0.6162,
"step": 150
},
{
"epoch": 0.27978339350180503,
"grad_norm": 2.6330932206114865,
"learning_rate": 1.944404332129964e-05,
"loss": 0.6152,
"step": 155
},
{
"epoch": 0.2888086642599278,
"grad_norm": 1.9933015700312968,
"learning_rate": 1.9425992779783395e-05,
"loss": 0.6333,
"step": 160
},
{
"epoch": 0.29783393501805056,
"grad_norm": 2.532862933716995,
"learning_rate": 1.940794223826715e-05,
"loss": 0.6284,
"step": 165
},
{
"epoch": 0.30685920577617326,
"grad_norm": 2.1667192528112755,
"learning_rate": 1.9389891696750906e-05,
"loss": 0.6237,
"step": 170
},
{
"epoch": 0.315884476534296,
"grad_norm": 2.1701228962066295,
"learning_rate": 1.937184115523466e-05,
"loss": 0.6336,
"step": 175
},
{
"epoch": 0.3249097472924188,
"grad_norm": 2.273883822591775,
"learning_rate": 1.9353790613718413e-05,
"loss": 0.6209,
"step": 180
},
{
"epoch": 0.33393501805054154,
"grad_norm": 2.29978243530194,
"learning_rate": 1.9335740072202167e-05,
"loss": 0.6318,
"step": 185
},
{
"epoch": 0.34296028880866425,
"grad_norm": 2.411589937459258,
"learning_rate": 1.931768953068592e-05,
"loss": 0.6107,
"step": 190
},
{
"epoch": 0.351985559566787,
"grad_norm": 2.175980233783614,
"learning_rate": 1.9299638989169675e-05,
"loss": 0.6034,
"step": 195
},
{
"epoch": 0.36101083032490977,
"grad_norm": 2.3182975023815566,
"learning_rate": 1.9281588447653432e-05,
"loss": 0.6054,
"step": 200
},
{
"epoch": 0.3700361010830325,
"grad_norm": 2.301500912368193,
"learning_rate": 1.9263537906137186e-05,
"loss": 0.608,
"step": 205
},
{
"epoch": 0.37906137184115524,
"grad_norm": 2.3411097096245146,
"learning_rate": 1.924548736462094e-05,
"loss": 0.6096,
"step": 210
},
{
"epoch": 0.388086642599278,
"grad_norm": 2.3477579646460267,
"learning_rate": 1.9227436823104693e-05,
"loss": 0.5981,
"step": 215
},
{
"epoch": 0.3971119133574007,
"grad_norm": 2.574256614403349,
"learning_rate": 1.920938628158845e-05,
"loss": 0.6032,
"step": 220
},
{
"epoch": 0.40613718411552346,
"grad_norm": 2.30445852375371,
"learning_rate": 1.9191335740072204e-05,
"loss": 0.6263,
"step": 225
},
{
"epoch": 0.4151624548736462,
"grad_norm": 1.9463153842646943,
"learning_rate": 1.9173285198555958e-05,
"loss": 0.5888,
"step": 230
},
{
"epoch": 0.42418772563176893,
"grad_norm": 2.2957992259075692,
"learning_rate": 1.9155234657039712e-05,
"loss": 0.5919,
"step": 235
},
{
"epoch": 0.4332129963898917,
"grad_norm": 2.4850310058268397,
"learning_rate": 1.913718411552347e-05,
"loss": 0.5982,
"step": 240
},
{
"epoch": 0.44223826714801445,
"grad_norm": 2.1484807254693665,
"learning_rate": 1.9119133574007223e-05,
"loss": 0.5922,
"step": 245
},
{
"epoch": 0.45126353790613716,
"grad_norm": 2.117521935012365,
"learning_rate": 1.9101083032490977e-05,
"loss": 0.614,
"step": 250
},
{
"epoch": 0.4602888086642599,
"grad_norm": 2.129059383234727,
"learning_rate": 1.908303249097473e-05,
"loss": 0.6065,
"step": 255
},
{
"epoch": 0.4693140794223827,
"grad_norm": 2.1304018405195904,
"learning_rate": 1.9064981949458485e-05,
"loss": 0.5776,
"step": 260
},
{
"epoch": 0.47833935018050544,
"grad_norm": 2.2992759766146973,
"learning_rate": 1.904693140794224e-05,
"loss": 0.5832,
"step": 265
},
{
"epoch": 0.48736462093862815,
"grad_norm": 2.0969841001906704,
"learning_rate": 1.9028880866425992e-05,
"loss": 0.6178,
"step": 270
},
{
"epoch": 0.4963898916967509,
"grad_norm": 2.2377624378205834,
"learning_rate": 1.901083032490975e-05,
"loss": 0.5877,
"step": 275
},
{
"epoch": 0.5054151624548736,
"grad_norm": 2.2091730376930308,
"learning_rate": 1.8992779783393503e-05,
"loss": 0.6093,
"step": 280
},
{
"epoch": 0.5144404332129964,
"grad_norm": 2.0821252876149274,
"learning_rate": 1.897472924187726e-05,
"loss": 0.5971,
"step": 285
},
{
"epoch": 0.5234657039711191,
"grad_norm": 2.4846088029201923,
"learning_rate": 1.8956678700361014e-05,
"loss": 0.5894,
"step": 290
},
{
"epoch": 0.5324909747292419,
"grad_norm": 2.253250469898687,
"learning_rate": 1.8938628158844768e-05,
"loss": 0.6035,
"step": 295
},
{
"epoch": 0.5415162454873647,
"grad_norm": 2.1890406664703237,
"learning_rate": 1.8920577617328522e-05,
"loss": 0.5905,
"step": 300
},
{
"epoch": 0.5505415162454874,
"grad_norm": 1.830902528715668,
"learning_rate": 1.8902527075812276e-05,
"loss": 0.5666,
"step": 305
},
{
"epoch": 0.5595667870036101,
"grad_norm": 2.2712137897785216,
"learning_rate": 1.888447653429603e-05,
"loss": 0.5794,
"step": 310
},
{
"epoch": 0.5685920577617328,
"grad_norm": 2.064463681190312,
"learning_rate": 1.8866425992779783e-05,
"loss": 0.5813,
"step": 315
},
{
"epoch": 0.5776173285198556,
"grad_norm": 2.0670626484908046,
"learning_rate": 1.8848375451263537e-05,
"loss": 0.5989,
"step": 320
},
{
"epoch": 0.5866425992779783,
"grad_norm": 2.4382023992648563,
"learning_rate": 1.8830324909747294e-05,
"loss": 0.5844,
"step": 325
},
{
"epoch": 0.5956678700361011,
"grad_norm": 2.3559788464517935,
"learning_rate": 1.8812274368231048e-05,
"loss": 0.5751,
"step": 330
},
{
"epoch": 0.6046931407942239,
"grad_norm": 2.0417255214726655,
"learning_rate": 1.8794223826714802e-05,
"loss": 0.582,
"step": 335
},
{
"epoch": 0.6137184115523465,
"grad_norm": 2.0606597313764174,
"learning_rate": 1.877617328519856e-05,
"loss": 0.5563,
"step": 340
},
{
"epoch": 0.6227436823104693,
"grad_norm": 2.1279001628524714,
"learning_rate": 1.8758122743682313e-05,
"loss": 0.5646,
"step": 345
},
{
"epoch": 0.631768953068592,
"grad_norm": 2.1228151514711318,
"learning_rate": 1.8740072202166067e-05,
"loss": 0.5687,
"step": 350
},
{
"epoch": 0.6407942238267148,
"grad_norm": 2.1178045046697997,
"learning_rate": 1.872202166064982e-05,
"loss": 0.5808,
"step": 355
},
{
"epoch": 0.6498194945848376,
"grad_norm": 2.110658120249641,
"learning_rate": 1.8703971119133574e-05,
"loss": 0.5576,
"step": 360
},
{
"epoch": 0.6588447653429603,
"grad_norm": 2.0620796847472835,
"learning_rate": 1.8685920577617328e-05,
"loss": 0.5817,
"step": 365
},
{
"epoch": 0.6678700361010831,
"grad_norm": 2.2036387339163217,
"learning_rate": 1.8667870036101086e-05,
"loss": 0.583,
"step": 370
},
{
"epoch": 0.6768953068592057,
"grad_norm": 2.169248099645488,
"learning_rate": 1.864981949458484e-05,
"loss": 0.5711,
"step": 375
},
{
"epoch": 0.6859205776173285,
"grad_norm": 2.0070917491256237,
"learning_rate": 1.8631768953068593e-05,
"loss": 0.5714,
"step": 380
},
{
"epoch": 0.6949458483754513,
"grad_norm": 2.412092286853898,
"learning_rate": 1.8613718411552347e-05,
"loss": 0.558,
"step": 385
},
{
"epoch": 0.703971119133574,
"grad_norm": 2.1573044881131636,
"learning_rate": 1.8595667870036104e-05,
"loss": 0.5605,
"step": 390
},
{
"epoch": 0.7129963898916968,
"grad_norm": 2.0060925615864234,
"learning_rate": 1.8577617328519858e-05,
"loss": 0.5646,
"step": 395
},
{
"epoch": 0.7220216606498195,
"grad_norm": 2.0721270781349856,
"learning_rate": 1.8559566787003612e-05,
"loss": 0.5591,
"step": 400
},
{
"epoch": 0.7310469314079422,
"grad_norm": 1.9740854158027745,
"learning_rate": 1.8541516245487366e-05,
"loss": 0.5749,
"step": 405
},
{
"epoch": 0.740072202166065,
"grad_norm": 2.1666939121131343,
"learning_rate": 1.852346570397112e-05,
"loss": 0.5559,
"step": 410
},
{
"epoch": 0.7490974729241877,
"grad_norm": 2.006153861447105,
"learning_rate": 1.8505415162454877e-05,
"loss": 0.5579,
"step": 415
},
{
"epoch": 0.7581227436823105,
"grad_norm": 2.1562521533225896,
"learning_rate": 1.848736462093863e-05,
"loss": 0.5641,
"step": 420
},
{
"epoch": 0.7671480144404332,
"grad_norm": 2.4213982076348386,
"learning_rate": 1.8469314079422384e-05,
"loss": 0.5697,
"step": 425
},
{
"epoch": 0.776173285198556,
"grad_norm": 2.021492521441799,
"learning_rate": 1.8451263537906138e-05,
"loss": 0.5512,
"step": 430
},
{
"epoch": 0.7851985559566786,
"grad_norm": 2.2387067716748903,
"learning_rate": 1.8433212996389892e-05,
"loss": 0.5567,
"step": 435
},
{
"epoch": 0.7942238267148014,
"grad_norm": 2.1182127501600654,
"learning_rate": 1.8415162454873646e-05,
"loss": 0.5529,
"step": 440
},
{
"epoch": 0.8032490974729242,
"grad_norm": 1.8809686502549097,
"learning_rate": 1.8397111913357403e-05,
"loss": 0.549,
"step": 445
},
{
"epoch": 0.8122743682310469,
"grad_norm": 1.9559681017618193,
"learning_rate": 1.8379061371841157e-05,
"loss": 0.546,
"step": 450
},
{
"epoch": 0.8212996389891697,
"grad_norm": 1.8843502716508203,
"learning_rate": 1.836101083032491e-05,
"loss": 0.5281,
"step": 455
},
{
"epoch": 0.8303249097472925,
"grad_norm": 2.0782130238317365,
"learning_rate": 1.8342960288808668e-05,
"loss": 0.5355,
"step": 460
},
{
"epoch": 0.8393501805054152,
"grad_norm": 2.028491033091686,
"learning_rate": 1.832490974729242e-05,
"loss": 0.5573,
"step": 465
},
{
"epoch": 0.8483754512635379,
"grad_norm": 1.997600040747785,
"learning_rate": 1.8306859205776175e-05,
"loss": 0.5503,
"step": 470
},
{
"epoch": 0.8574007220216606,
"grad_norm": 2.086622795789731,
"learning_rate": 1.828880866425993e-05,
"loss": 0.5271,
"step": 475
},
{
"epoch": 0.8664259927797834,
"grad_norm": 2.2126356409155603,
"learning_rate": 1.8270758122743683e-05,
"loss": 0.5295,
"step": 480
},
{
"epoch": 0.8754512635379061,
"grad_norm": 1.9420226465771289,
"learning_rate": 1.8252707581227437e-05,
"loss": 0.5229,
"step": 485
},
{
"epoch": 0.8844765342960289,
"grad_norm": 2.3845783258984605,
"learning_rate": 1.823465703971119e-05,
"loss": 0.5302,
"step": 490
},
{
"epoch": 0.8935018050541517,
"grad_norm": 1.990644749395453,
"learning_rate": 1.8216606498194948e-05,
"loss": 0.5327,
"step": 495
},
{
"epoch": 0.9025270758122743,
"grad_norm": 1.9296515724126473,
"learning_rate": 1.81985559566787e-05,
"loss": 0.5368,
"step": 500
},
{
"epoch": 0.9115523465703971,
"grad_norm": 2.08354639136672,
"learning_rate": 1.8180505415162456e-05,
"loss": 0.537,
"step": 505
},
{
"epoch": 0.9205776173285198,
"grad_norm": 1.8810121127412915,
"learning_rate": 1.8162454873646213e-05,
"loss": 0.5233,
"step": 510
},
{
"epoch": 0.9296028880866426,
"grad_norm": 2.1865339910493,
"learning_rate": 1.8144404332129967e-05,
"loss": 0.5229,
"step": 515
},
{
"epoch": 0.9386281588447654,
"grad_norm": 1.8530594783548222,
"learning_rate": 1.812635379061372e-05,
"loss": 0.5489,
"step": 520
},
{
"epoch": 0.9476534296028881,
"grad_norm": 1.8045811881947107,
"learning_rate": 1.8108303249097474e-05,
"loss": 0.5216,
"step": 525
},
{
"epoch": 0.9566787003610109,
"grad_norm": 2.0216105196649465,
"learning_rate": 1.8090252707581228e-05,
"loss": 0.5327,
"step": 530
},
{
"epoch": 0.9657039711191335,
"grad_norm": 2.070828240474061,
"learning_rate": 1.8072202166064982e-05,
"loss": 0.5341,
"step": 535
},
{
"epoch": 0.9747292418772563,
"grad_norm": 2.118337434473814,
"learning_rate": 1.8054151624548736e-05,
"loss": 0.5206,
"step": 540
},
{
"epoch": 0.983754512635379,
"grad_norm": 1.9327007846480317,
"learning_rate": 1.8036101083032493e-05,
"loss": 0.5223,
"step": 545
},
{
"epoch": 0.9927797833935018,
"grad_norm": 1.8102667165760828,
"learning_rate": 1.8018050541516247e-05,
"loss": 0.5183,
"step": 550
},
{
"epoch": 1.0018050541516246,
"grad_norm": 1.7565629809184389,
"learning_rate": 1.8e-05,
"loss": 0.5008,
"step": 555
},
{
"epoch": 1.0108303249097472,
"grad_norm": 1.7812661895325377,
"learning_rate": 1.7981949458483758e-05,
"loss": 0.403,
"step": 560
},
{
"epoch": 1.01985559566787,
"grad_norm": 1.9167006814291294,
"learning_rate": 1.796389891696751e-05,
"loss": 0.412,
"step": 565
},
{
"epoch": 1.0288808664259927,
"grad_norm": 1.9923575509001286,
"learning_rate": 1.7945848375451265e-05,
"loss": 0.3926,
"step": 570
},
{
"epoch": 1.0379061371841156,
"grad_norm": 1.7192659741327438,
"learning_rate": 1.792779783393502e-05,
"loss": 0.4072,
"step": 575
},
{
"epoch": 1.0469314079422383,
"grad_norm": 1.8513603883344656,
"learning_rate": 1.7909747292418773e-05,
"loss": 0.4031,
"step": 580
},
{
"epoch": 1.055956678700361,
"grad_norm": 1.828939711963778,
"learning_rate": 1.7891696750902527e-05,
"loss": 0.3984,
"step": 585
},
{
"epoch": 1.0649819494584838,
"grad_norm": 1.789210627896697,
"learning_rate": 1.7873646209386284e-05,
"loss": 0.4054,
"step": 590
},
{
"epoch": 1.0740072202166064,
"grad_norm": 1.8182359140012072,
"learning_rate": 1.7855595667870038e-05,
"loss": 0.4016,
"step": 595
},
{
"epoch": 1.0830324909747293,
"grad_norm": 1.8738010102389915,
"learning_rate": 1.783754512635379e-05,
"loss": 0.3916,
"step": 600
},
{
"epoch": 1.092057761732852,
"grad_norm": 1.7056071281982275,
"learning_rate": 1.7819494584837545e-05,
"loss": 0.4005,
"step": 605
},
{
"epoch": 1.1010830324909748,
"grad_norm": 1.7631642754877512,
"learning_rate": 1.7801444043321303e-05,
"loss": 0.4044,
"step": 610
},
{
"epoch": 1.1101083032490975,
"grad_norm": 1.9174467447157293,
"learning_rate": 1.7783393501805056e-05,
"loss": 0.3994,
"step": 615
},
{
"epoch": 1.1191335740072201,
"grad_norm": 1.9504105068703066,
"learning_rate": 1.776534296028881e-05,
"loss": 0.413,
"step": 620
},
{
"epoch": 1.128158844765343,
"grad_norm": 1.7274668622868,
"learning_rate": 1.7747292418772564e-05,
"loss": 0.4035,
"step": 625
},
{
"epoch": 1.1371841155234657,
"grad_norm": 1.8575861518758143,
"learning_rate": 1.772924187725632e-05,
"loss": 0.4098,
"step": 630
},
{
"epoch": 1.1462093862815885,
"grad_norm": 1.834605129701837,
"learning_rate": 1.7711191335740075e-05,
"loss": 0.4107,
"step": 635
},
{
"epoch": 1.1552346570397112,
"grad_norm": 1.6992491657802056,
"learning_rate": 1.769314079422383e-05,
"loss": 0.4104,
"step": 640
},
{
"epoch": 1.164259927797834,
"grad_norm": 1.6426546320530329,
"learning_rate": 1.7675090252707583e-05,
"loss": 0.4078,
"step": 645
},
{
"epoch": 1.1732851985559567,
"grad_norm": 1.8449792200452055,
"learning_rate": 1.7657039711191337e-05,
"loss": 0.4138,
"step": 650
},
{
"epoch": 1.1823104693140793,
"grad_norm": 1.7495204322144347,
"learning_rate": 1.763898916967509e-05,
"loss": 0.4084,
"step": 655
},
{
"epoch": 1.1913357400722022,
"grad_norm": 1.6260627217547752,
"learning_rate": 1.7620938628158844e-05,
"loss": 0.4197,
"step": 660
},
{
"epoch": 1.2003610108303249,
"grad_norm": 1.803828249935731,
"learning_rate": 1.76028880866426e-05,
"loss": 0.4157,
"step": 665
},
{
"epoch": 1.2093862815884477,
"grad_norm": 1.6698584933682465,
"learning_rate": 1.7584837545126355e-05,
"loss": 0.4108,
"step": 670
},
{
"epoch": 1.2184115523465704,
"grad_norm": 1.8547896347467827,
"learning_rate": 1.756678700361011e-05,
"loss": 0.4049,
"step": 675
},
{
"epoch": 1.2274368231046933,
"grad_norm": 1.9793660391728152,
"learning_rate": 1.7548736462093866e-05,
"loss": 0.4086,
"step": 680
},
{
"epoch": 1.236462093862816,
"grad_norm": 1.9481106812224744,
"learning_rate": 1.753068592057762e-05,
"loss": 0.4217,
"step": 685
},
{
"epoch": 1.2454873646209386,
"grad_norm": 1.8589145184235236,
"learning_rate": 1.7512635379061374e-05,
"loss": 0.4077,
"step": 690
},
{
"epoch": 1.2545126353790614,
"grad_norm": 1.7491717689464192,
"learning_rate": 1.7494584837545128e-05,
"loss": 0.4041,
"step": 695
},
{
"epoch": 1.263537906137184,
"grad_norm": 1.8301618024998683,
"learning_rate": 1.747653429602888e-05,
"loss": 0.4156,
"step": 700
},
{
"epoch": 1.2725631768953067,
"grad_norm": 1.8157533205183023,
"learning_rate": 1.7458483754512635e-05,
"loss": 0.4066,
"step": 705
},
{
"epoch": 1.2815884476534296,
"grad_norm": 1.618452105653501,
"learning_rate": 1.744043321299639e-05,
"loss": 0.4045,
"step": 710
},
{
"epoch": 1.2906137184115525,
"grad_norm": 1.801416162193692,
"learning_rate": 1.7422382671480146e-05,
"loss": 0.4114,
"step": 715
},
{
"epoch": 1.2996389891696751,
"grad_norm": 1.9120529810373577,
"learning_rate": 1.74043321299639e-05,
"loss": 0.4053,
"step": 720
},
{
"epoch": 1.3086642599277978,
"grad_norm": 1.8649971513309,
"learning_rate": 1.7386281588447654e-05,
"loss": 0.3994,
"step": 725
},
{
"epoch": 1.3176895306859207,
"grad_norm": 1.5823876746717431,
"learning_rate": 1.736823104693141e-05,
"loss": 0.405,
"step": 730
},
{
"epoch": 1.3267148014440433,
"grad_norm": 1.6565144465938595,
"learning_rate": 1.7350180505415165e-05,
"loss": 0.3956,
"step": 735
},
{
"epoch": 1.335740072202166,
"grad_norm": 1.7349944474589856,
"learning_rate": 1.733212996389892e-05,
"loss": 0.4004,
"step": 740
},
{
"epoch": 1.3447653429602888,
"grad_norm": 1.8413333458623544,
"learning_rate": 1.7314079422382673e-05,
"loss": 0.3933,
"step": 745
},
{
"epoch": 1.3537906137184115,
"grad_norm": 1.5956337270311123,
"learning_rate": 1.7296028880866426e-05,
"loss": 0.4027,
"step": 750
},
{
"epoch": 1.3628158844765343,
"grad_norm": 1.7395021140734128,
"learning_rate": 1.727797833935018e-05,
"loss": 0.4071,
"step": 755
},
{
"epoch": 1.371841155234657,
"grad_norm": 1.7983719467870525,
"learning_rate": 1.7259927797833937e-05,
"loss": 0.404,
"step": 760
},
{
"epoch": 1.3808664259927799,
"grad_norm": 1.7263325979359656,
"learning_rate": 1.724187725631769e-05,
"loss": 0.3933,
"step": 765
},
{
"epoch": 1.3898916967509025,
"grad_norm": 1.7378004534277407,
"learning_rate": 1.7223826714801445e-05,
"loss": 0.3839,
"step": 770
},
{
"epoch": 1.3989169675090252,
"grad_norm": 1.8567819478825005,
"learning_rate": 1.72057761732852e-05,
"loss": 0.4032,
"step": 775
},
{
"epoch": 1.407942238267148,
"grad_norm": 1.752933416288969,
"learning_rate": 1.7187725631768956e-05,
"loss": 0.3914,
"step": 780
},
{
"epoch": 1.4169675090252707,
"grad_norm": 1.7945976502850631,
"learning_rate": 1.716967509025271e-05,
"loss": 0.4052,
"step": 785
},
{
"epoch": 1.4259927797833936,
"grad_norm": 1.8440958122894435,
"learning_rate": 1.7151624548736464e-05,
"loss": 0.3889,
"step": 790
},
{
"epoch": 1.4350180505415162,
"grad_norm": 1.9541732635364462,
"learning_rate": 1.7133574007220218e-05,
"loss": 0.3897,
"step": 795
},
{
"epoch": 1.444043321299639,
"grad_norm": 1.7128039074068433,
"learning_rate": 1.711552346570397e-05,
"loss": 0.3949,
"step": 800
},
{
"epoch": 1.4530685920577617,
"grad_norm": 1.7805663890604293,
"learning_rate": 1.709747292418773e-05,
"loss": 0.4046,
"step": 805
},
{
"epoch": 1.4620938628158844,
"grad_norm": 1.7176479703166927,
"learning_rate": 1.7079422382671482e-05,
"loss": 0.3948,
"step": 810
},
{
"epoch": 1.4711191335740073,
"grad_norm": 1.7476594687945506,
"learning_rate": 1.7061371841155236e-05,
"loss": 0.3987,
"step": 815
},
{
"epoch": 1.48014440433213,
"grad_norm": 1.7097737410315963,
"learning_rate": 1.704332129963899e-05,
"loss": 0.3971,
"step": 820
},
{
"epoch": 1.4891696750902528,
"grad_norm": 1.6154261828372305,
"learning_rate": 1.7025270758122744e-05,
"loss": 0.3965,
"step": 825
},
{
"epoch": 1.4981949458483754,
"grad_norm": 1.6439021077041545,
"learning_rate": 1.7007220216606498e-05,
"loss": 0.3864,
"step": 830
},
{
"epoch": 1.5072202166064983,
"grad_norm": 1.7335388992906235,
"learning_rate": 1.6989169675090255e-05,
"loss": 0.3904,
"step": 835
},
{
"epoch": 1.516245487364621,
"grad_norm": 1.9280976394130103,
"learning_rate": 1.697111913357401e-05,
"loss": 0.3982,
"step": 840
},
{
"epoch": 1.5252707581227436,
"grad_norm": 1.6397519533163014,
"learning_rate": 1.6953068592057766e-05,
"loss": 0.4071,
"step": 845
},
{
"epoch": 1.5342960288808665,
"grad_norm": 1.7403299732030928,
"learning_rate": 1.693501805054152e-05,
"loss": 0.396,
"step": 850
},
{
"epoch": 1.5433212996389891,
"grad_norm": 1.856917104233432,
"learning_rate": 1.6916967509025274e-05,
"loss": 0.3965,
"step": 855
},
{
"epoch": 1.5523465703971118,
"grad_norm": 2.0561648148768246,
"learning_rate": 1.6898916967509027e-05,
"loss": 0.4011,
"step": 860
},
{
"epoch": 1.5613718411552346,
"grad_norm": 1.5934247442993748,
"learning_rate": 1.688086642599278e-05,
"loss": 0.4047,
"step": 865
},
{
"epoch": 1.5703971119133575,
"grad_norm": 1.6742097942966434,
"learning_rate": 1.6862815884476535e-05,
"loss": 0.3768,
"step": 870
},
{
"epoch": 1.5794223826714802,
"grad_norm": 1.6344395357039199,
"learning_rate": 1.684476534296029e-05,
"loss": 0.3994,
"step": 875
},
{
"epoch": 1.5884476534296028,
"grad_norm": 1.947535712414049,
"learning_rate": 1.6826714801444043e-05,
"loss": 0.3859,
"step": 880
},
{
"epoch": 1.5974729241877257,
"grad_norm": 1.8081716367634284,
"learning_rate": 1.68086642599278e-05,
"loss": 0.3971,
"step": 885
},
{
"epoch": 1.6064981949458483,
"grad_norm": 1.7555061804990388,
"learning_rate": 1.6790613718411554e-05,
"loss": 0.4033,
"step": 890
},
{
"epoch": 1.615523465703971,
"grad_norm": 1.7977625864566746,
"learning_rate": 1.6772563176895307e-05,
"loss": 0.3869,
"step": 895
},
{
"epoch": 1.6245487364620939,
"grad_norm": 2.0151295696094156,
"learning_rate": 1.6754512635379065e-05,
"loss": 0.3896,
"step": 900
},
{
"epoch": 1.6335740072202167,
"grad_norm": 1.7117350496546804,
"learning_rate": 1.673646209386282e-05,
"loss": 0.3931,
"step": 905
},
{
"epoch": 1.6425992779783394,
"grad_norm": 1.5367121125433267,
"learning_rate": 1.6718411552346572e-05,
"loss": 0.3925,
"step": 910
},
{
"epoch": 1.651624548736462,
"grad_norm": 1.7252709384729956,
"learning_rate": 1.6700361010830326e-05,
"loss": 0.3813,
"step": 915
},
{
"epoch": 1.660649819494585,
"grad_norm": 1.6424146369682562,
"learning_rate": 1.668231046931408e-05,
"loss": 0.3959,
"step": 920
},
{
"epoch": 1.6696750902527075,
"grad_norm": 1.7074460801427431,
"learning_rate": 1.6664259927797834e-05,
"loss": 0.3821,
"step": 925
},
{
"epoch": 1.6787003610108302,
"grad_norm": 1.7779369602208115,
"learning_rate": 1.6646209386281588e-05,
"loss": 0.387,
"step": 930
},
{
"epoch": 1.687725631768953,
"grad_norm": 1.7587322948042428,
"learning_rate": 1.6628158844765345e-05,
"loss": 0.3824,
"step": 935
},
{
"epoch": 1.696750902527076,
"grad_norm": 1.719570186327862,
"learning_rate": 1.66101083032491e-05,
"loss": 0.3968,
"step": 940
},
{
"epoch": 1.7057761732851986,
"grad_norm": 1.758936196148823,
"learning_rate": 1.6592057761732852e-05,
"loss": 0.4,
"step": 945
},
{
"epoch": 1.7148014440433212,
"grad_norm": 1.7037689600288302,
"learning_rate": 1.657400722021661e-05,
"loss": 0.3824,
"step": 950
},
{
"epoch": 1.7238267148014441,
"grad_norm": 1.6704621391960432,
"learning_rate": 1.6555956678700363e-05,
"loss": 0.3904,
"step": 955
},
{
"epoch": 1.7328519855595668,
"grad_norm": 1.6276171413365443,
"learning_rate": 1.6537906137184117e-05,
"loss": 0.3936,
"step": 960
},
{
"epoch": 1.7418772563176894,
"grad_norm": 1.4915457367830292,
"learning_rate": 1.651985559566787e-05,
"loss": 0.3878,
"step": 965
},
{
"epoch": 1.7509025270758123,
"grad_norm": 1.6998121121079899,
"learning_rate": 1.6501805054151625e-05,
"loss": 0.3851,
"step": 970
},
{
"epoch": 1.7599277978339352,
"grad_norm": 1.7868333554788238,
"learning_rate": 1.6483754512635382e-05,
"loss": 0.4016,
"step": 975
},
{
"epoch": 1.7689530685920578,
"grad_norm": 1.9509112307215477,
"learning_rate": 1.6465703971119136e-05,
"loss": 0.3947,
"step": 980
},
{
"epoch": 1.7779783393501805,
"grad_norm": 1.8313430184945898,
"learning_rate": 1.644765342960289e-05,
"loss": 0.3883,
"step": 985
},
{
"epoch": 1.7870036101083033,
"grad_norm": 1.69745773450961,
"learning_rate": 1.6429602888086644e-05,
"loss": 0.3927,
"step": 990
},
{
"epoch": 1.796028880866426,
"grad_norm": 1.5706439056988484,
"learning_rate": 1.6411552346570397e-05,
"loss": 0.3885,
"step": 995
},
{
"epoch": 1.8050541516245486,
"grad_norm": 1.9749233060644407,
"learning_rate": 1.639350180505415e-05,
"loss": 0.3911,
"step": 1000
},
{
"epoch": 1.8140794223826715,
"grad_norm": 1.835030597426223,
"learning_rate": 1.637545126353791e-05,
"loss": 0.3942,
"step": 1005
},
{
"epoch": 1.8231046931407944,
"grad_norm": 1.5738451037571743,
"learning_rate": 1.6357400722021662e-05,
"loss": 0.3812,
"step": 1010
},
{
"epoch": 1.8321299638989168,
"grad_norm": 1.751135780497672,
"learning_rate": 1.6339350180505416e-05,
"loss": 0.3862,
"step": 1015
},
{
"epoch": 1.8411552346570397,
"grad_norm": 1.864136563331331,
"learning_rate": 1.6321299638989173e-05,
"loss": 0.3942,
"step": 1020
},
{
"epoch": 1.8501805054151625,
"grad_norm": 1.5420248188805685,
"learning_rate": 1.6303249097472927e-05,
"loss": 0.3828,
"step": 1025
},
{
"epoch": 1.8592057761732852,
"grad_norm": 1.6307207463776452,
"learning_rate": 1.628519855595668e-05,
"loss": 0.3766,
"step": 1030
},
{
"epoch": 1.8682310469314078,
"grad_norm": 1.7022785037029124,
"learning_rate": 1.6267148014440435e-05,
"loss": 0.3847,
"step": 1035
},
{
"epoch": 1.8772563176895307,
"grad_norm": 1.6865169590583908,
"learning_rate": 1.624909747292419e-05,
"loss": 0.3855,
"step": 1040
},
{
"epoch": 1.8862815884476536,
"grad_norm": 1.772735245585654,
"learning_rate": 1.6231046931407942e-05,
"loss": 0.3766,
"step": 1045
},
{
"epoch": 1.895306859205776,
"grad_norm": 1.6414783417710321,
"learning_rate": 1.6212996389891696e-05,
"loss": 0.3769,
"step": 1050
},
{
"epoch": 1.904332129963899,
"grad_norm": 1.638546064732281,
"learning_rate": 1.6194945848375453e-05,
"loss": 0.3811,
"step": 1055
},
{
"epoch": 1.9133574007220218,
"grad_norm": 1.7273768032341619,
"learning_rate": 1.6176895306859207e-05,
"loss": 0.3787,
"step": 1060
},
{
"epoch": 1.9223826714801444,
"grad_norm": 1.7610887591542017,
"learning_rate": 1.615884476534296e-05,
"loss": 0.3901,
"step": 1065
},
{
"epoch": 1.931407942238267,
"grad_norm": 1.7492183570516289,
"learning_rate": 1.6140794223826718e-05,
"loss": 0.385,
"step": 1070
},
{
"epoch": 1.94043321299639,
"grad_norm": 1.6697391799649597,
"learning_rate": 1.6122743682310472e-05,
"loss": 0.3865,
"step": 1075
},
{
"epoch": 1.9494584837545126,
"grad_norm": 1.5675976217251384,
"learning_rate": 1.6104693140794226e-05,
"loss": 0.3757,
"step": 1080
},
{
"epoch": 1.9584837545126352,
"grad_norm": 1.6558779934861987,
"learning_rate": 1.608664259927798e-05,
"loss": 0.379,
"step": 1085
},
{
"epoch": 1.967509025270758,
"grad_norm": 1.6873028676221205,
"learning_rate": 1.6068592057761733e-05,
"loss": 0.3811,
"step": 1090
},
{
"epoch": 1.976534296028881,
"grad_norm": 1.6240908295441967,
"learning_rate": 1.6050541516245487e-05,
"loss": 0.3654,
"step": 1095
},
{
"epoch": 1.9855595667870036,
"grad_norm": 1.6615813580574832,
"learning_rate": 1.603249097472924e-05,
"loss": 0.3796,
"step": 1100
},
{
"epoch": 1.9945848375451263,
"grad_norm": 1.686361226213432,
"learning_rate": 1.6014440433212998e-05,
"loss": 0.3837,
"step": 1105
},
{
"epoch": 2.003610108303249,
"grad_norm": 1.4089105927337982,
"learning_rate": 1.5996389891696752e-05,
"loss": 0.3506,
"step": 1110
},
{
"epoch": 2.012635379061372,
"grad_norm": 1.375915950771055,
"learning_rate": 1.5978339350180506e-05,
"loss": 0.2759,
"step": 1115
},
{
"epoch": 2.0216606498194944,
"grad_norm": 1.5521739291638614,
"learning_rate": 1.5960288808664263e-05,
"loss": 0.2756,
"step": 1120
},
{
"epoch": 2.0306859205776173,
"grad_norm": 1.6072972838350046,
"learning_rate": 1.5942238267148017e-05,
"loss": 0.2673,
"step": 1125
},
{
"epoch": 2.03971119133574,
"grad_norm": 1.6652650820556896,
"learning_rate": 1.592418772563177e-05,
"loss": 0.27,
"step": 1130
},
{
"epoch": 2.0487364620938626,
"grad_norm": 1.4210627524187658,
"learning_rate": 1.5906137184115525e-05,
"loss": 0.2706,
"step": 1135
},
{
"epoch": 2.0577617328519855,
"grad_norm": 1.6471286272058623,
"learning_rate": 1.588808664259928e-05,
"loss": 0.2762,
"step": 1140
},
{
"epoch": 2.0667870036101084,
"grad_norm": 1.467229273738129,
"learning_rate": 1.5870036101083032e-05,
"loss": 0.2828,
"step": 1145
},
{
"epoch": 2.0758122743682312,
"grad_norm": 1.4064489033689571,
"learning_rate": 1.585198555956679e-05,
"loss": 0.2751,
"step": 1150
},
{
"epoch": 2.0848375451263537,
"grad_norm": 1.4888952106078595,
"learning_rate": 1.5833935018050543e-05,
"loss": 0.2785,
"step": 1155
},
{
"epoch": 2.0938628158844765,
"grad_norm": 1.5202276448854608,
"learning_rate": 1.5815884476534297e-05,
"loss": 0.2728,
"step": 1160
},
{
"epoch": 2.1028880866425994,
"grad_norm": 1.4911272606877095,
"learning_rate": 1.579783393501805e-05,
"loss": 0.2735,
"step": 1165
},
{
"epoch": 2.111913357400722,
"grad_norm": 1.5345612677088643,
"learning_rate": 1.5779783393501805e-05,
"loss": 0.2751,
"step": 1170
},
{
"epoch": 2.1209386281588447,
"grad_norm": 1.470603227078021,
"learning_rate": 1.5761732851985562e-05,
"loss": 0.2799,
"step": 1175
},
{
"epoch": 2.1299638989169676,
"grad_norm": 1.6228746622020027,
"learning_rate": 1.5743682310469316e-05,
"loss": 0.2751,
"step": 1180
},
{
"epoch": 2.1389891696750905,
"grad_norm": 1.5092478918219132,
"learning_rate": 1.572563176895307e-05,
"loss": 0.2837,
"step": 1185
},
{
"epoch": 2.148014440433213,
"grad_norm": 1.5022801045910057,
"learning_rate": 1.5707581227436823e-05,
"loss": 0.2806,
"step": 1190
},
{
"epoch": 2.1570397111913358,
"grad_norm": 1.5037599452460795,
"learning_rate": 1.568953068592058e-05,
"loss": 0.2767,
"step": 1195
},
{
"epoch": 2.1660649819494586,
"grad_norm": 1.579515606540816,
"learning_rate": 1.5671480144404334e-05,
"loss": 0.2849,
"step": 1200
},
{
"epoch": 2.175090252707581,
"grad_norm": 1.4754220940034453,
"learning_rate": 1.5653429602888088e-05,
"loss": 0.2791,
"step": 1205
},
{
"epoch": 2.184115523465704,
"grad_norm": 1.3959522376935156,
"learning_rate": 1.5635379061371842e-05,
"loss": 0.2881,
"step": 1210
},
{
"epoch": 2.193140794223827,
"grad_norm": 1.5444043202318694,
"learning_rate": 1.5617328519855596e-05,
"loss": 0.2786,
"step": 1215
},
{
"epoch": 2.2021660649819497,
"grad_norm": 1.4479480310561215,
"learning_rate": 1.559927797833935e-05,
"loss": 0.2797,
"step": 1220
},
{
"epoch": 2.211191335740072,
"grad_norm": 1.3937802391741012,
"learning_rate": 1.5581227436823107e-05,
"loss": 0.2799,
"step": 1225
},
{
"epoch": 2.220216606498195,
"grad_norm": 1.4393031837130634,
"learning_rate": 1.556317689530686e-05,
"loss": 0.2823,
"step": 1230
},
{
"epoch": 2.229241877256318,
"grad_norm": 1.6426238576078906,
"learning_rate": 1.5545126353790614e-05,
"loss": 0.2784,
"step": 1235
},
{
"epoch": 2.2382671480144403,
"grad_norm": 1.6072066180424502,
"learning_rate": 1.552707581227437e-05,
"loss": 0.2872,
"step": 1240
},
{
"epoch": 2.247292418772563,
"grad_norm": 1.3689820360945464,
"learning_rate": 1.5509025270758125e-05,
"loss": 0.2812,
"step": 1245
},
{
"epoch": 2.256317689530686,
"grad_norm": 1.363835208538375,
"learning_rate": 1.549097472924188e-05,
"loss": 0.283,
"step": 1250
},
{
"epoch": 2.265342960288809,
"grad_norm": 1.6184406483983944,
"learning_rate": 1.5472924187725633e-05,
"loss": 0.2819,
"step": 1255
},
{
"epoch": 2.2743682310469313,
"grad_norm": 1.452212492826692,
"learning_rate": 1.5454873646209387e-05,
"loss": 0.2886,
"step": 1260
},
{
"epoch": 2.283393501805054,
"grad_norm": 1.3645649562334174,
"learning_rate": 1.543682310469314e-05,
"loss": 0.2817,
"step": 1265
},
{
"epoch": 2.292418772563177,
"grad_norm": 1.503443086552838,
"learning_rate": 1.5418772563176895e-05,
"loss": 0.2816,
"step": 1270
},
{
"epoch": 2.3014440433212995,
"grad_norm": 1.4467084497351497,
"learning_rate": 1.5400722021660652e-05,
"loss": 0.2857,
"step": 1275
},
{
"epoch": 2.3104693140794224,
"grad_norm": 1.3905500897215815,
"learning_rate": 1.5382671480144406e-05,
"loss": 0.2858,
"step": 1280
},
{
"epoch": 2.3194945848375452,
"grad_norm": 1.5392545695875637,
"learning_rate": 1.536462093862816e-05,
"loss": 0.2828,
"step": 1285
},
{
"epoch": 2.328519855595668,
"grad_norm": 1.5670788162913514,
"learning_rate": 1.5346570397111917e-05,
"loss": 0.2819,
"step": 1290
},
{
"epoch": 2.3375451263537905,
"grad_norm": 1.4159881393523641,
"learning_rate": 1.532851985559567e-05,
"loss": 0.2824,
"step": 1295
},
{
"epoch": 2.3465703971119134,
"grad_norm": 1.4831615682921662,
"learning_rate": 1.5310469314079424e-05,
"loss": 0.2865,
"step": 1300
},
{
"epoch": 2.3555956678700363,
"grad_norm": 1.4219138664036257,
"learning_rate": 1.5292418772563178e-05,
"loss": 0.2868,
"step": 1305
},
{
"epoch": 2.3646209386281587,
"grad_norm": 1.512943638670528,
"learning_rate": 1.5274368231046932e-05,
"loss": 0.2808,
"step": 1310
},
{
"epoch": 2.3736462093862816,
"grad_norm": 1.5368411160330724,
"learning_rate": 1.5256317689530686e-05,
"loss": 0.2818,
"step": 1315
},
{
"epoch": 2.3826714801444044,
"grad_norm": 1.4589365443912974,
"learning_rate": 1.5238267148014441e-05,
"loss": 0.2858,
"step": 1320
},
{
"epoch": 2.3916967509025273,
"grad_norm": 4.062153022713927,
"learning_rate": 1.5220216606498197e-05,
"loss": 0.2859,
"step": 1325
},
{
"epoch": 2.4007220216606497,
"grad_norm": 1.5707167014303813,
"learning_rate": 1.520216606498195e-05,
"loss": 0.2848,
"step": 1330
},
{
"epoch": 2.4097472924187726,
"grad_norm": 1.435828117165283,
"learning_rate": 1.5184115523465706e-05,
"loss": 0.2882,
"step": 1335
},
{
"epoch": 2.4187725631768955,
"grad_norm": 1.4275643368334596,
"learning_rate": 1.516606498194946e-05,
"loss": 0.2867,
"step": 1340
},
{
"epoch": 2.427797833935018,
"grad_norm": 1.5650882814331575,
"learning_rate": 1.5148014440433214e-05,
"loss": 0.2802,
"step": 1345
},
{
"epoch": 2.436823104693141,
"grad_norm": 1.6485980287400839,
"learning_rate": 1.512996389891697e-05,
"loss": 0.2876,
"step": 1350
},
{
"epoch": 2.4458483754512637,
"grad_norm": 1.7648222520273187,
"learning_rate": 1.5111913357400723e-05,
"loss": 0.2918,
"step": 1355
},
{
"epoch": 2.4548736462093865,
"grad_norm": 1.3897537891471694,
"learning_rate": 1.5093862815884477e-05,
"loss": 0.2827,
"step": 1360
},
{
"epoch": 2.463898916967509,
"grad_norm": 1.363449811555752,
"learning_rate": 1.5075812274368234e-05,
"loss": 0.2755,
"step": 1365
},
{
"epoch": 2.472924187725632,
"grad_norm": 1.5054780686538907,
"learning_rate": 1.5057761732851988e-05,
"loss": 0.2818,
"step": 1370
},
{
"epoch": 2.4819494584837547,
"grad_norm": 1.4402872956422859,
"learning_rate": 1.5039711191335742e-05,
"loss": 0.2806,
"step": 1375
},
{
"epoch": 2.490974729241877,
"grad_norm": 1.531753267929379,
"learning_rate": 1.5021660649819495e-05,
"loss": 0.281,
"step": 1380
},
{
"epoch": 2.5,
"grad_norm": 1.2873311123663773,
"learning_rate": 1.5003610108303251e-05,
"loss": 0.2816,
"step": 1385
},
{
"epoch": 2.509025270758123,
"grad_norm": 1.3443259078812042,
"learning_rate": 1.4985559566787005e-05,
"loss": 0.2816,
"step": 1390
},
{
"epoch": 2.5180505415162457,
"grad_norm": 1.5434631176695652,
"learning_rate": 1.4967509025270759e-05,
"loss": 0.2819,
"step": 1395
},
{
"epoch": 2.527075812274368,
"grad_norm": 1.362978897038253,
"learning_rate": 1.4949458483754512e-05,
"loss": 0.2794,
"step": 1400
},
{
"epoch": 2.536101083032491,
"grad_norm": 1.3819832377162544,
"learning_rate": 1.4931407942238268e-05,
"loss": 0.2868,
"step": 1405
},
{
"epoch": 2.5451263537906135,
"grad_norm": 1.6555554545565292,
"learning_rate": 1.4913357400722023e-05,
"loss": 0.2794,
"step": 1410
},
{
"epoch": 2.5541516245487363,
"grad_norm": 1.3799356893593522,
"learning_rate": 1.4895306859205779e-05,
"loss": 0.2895,
"step": 1415
},
{
"epoch": 2.563176895306859,
"grad_norm": 1.3978197779714834,
"learning_rate": 1.4877256317689533e-05,
"loss": 0.2823,
"step": 1420
},
{
"epoch": 2.572202166064982,
"grad_norm": 1.5752923039032896,
"learning_rate": 1.4859205776173287e-05,
"loss": 0.2864,
"step": 1425
},
{
"epoch": 2.581227436823105,
"grad_norm": 1.5198542385453242,
"learning_rate": 1.484115523465704e-05,
"loss": 0.2895,
"step": 1430
},
{
"epoch": 2.5902527075812274,
"grad_norm": 1.5294363816313867,
"learning_rate": 1.4823104693140796e-05,
"loss": 0.2841,
"step": 1435
},
{
"epoch": 2.5992779783393503,
"grad_norm": 1.6460098424826168,
"learning_rate": 1.480505415162455e-05,
"loss": 0.2905,
"step": 1440
},
{
"epoch": 2.6083032490974727,
"grad_norm": 1.4396643499754782,
"learning_rate": 1.4787003610108304e-05,
"loss": 0.2839,
"step": 1445
},
{
"epoch": 2.6173285198555956,
"grad_norm": 1.4770734853739884,
"learning_rate": 1.4768953068592057e-05,
"loss": 0.285,
"step": 1450
},
{
"epoch": 2.6263537906137184,
"grad_norm": 1.5925936800627583,
"learning_rate": 1.4750902527075815e-05,
"loss": 0.2939,
"step": 1455
},
{
"epoch": 2.6353790613718413,
"grad_norm": 1.3085799141153367,
"learning_rate": 1.4732851985559568e-05,
"loss": 0.2755,
"step": 1460
},
{
"epoch": 2.644404332129964,
"grad_norm": 1.3774680740883536,
"learning_rate": 1.4714801444043322e-05,
"loss": 0.2808,
"step": 1465
},
{
"epoch": 2.6534296028880866,
"grad_norm": 1.4223794368692813,
"learning_rate": 1.4696750902527078e-05,
"loss": 0.2767,
"step": 1470
},
{
"epoch": 2.6624548736462095,
"grad_norm": 1.579637524626807,
"learning_rate": 1.4678700361010832e-05,
"loss": 0.2849,
"step": 1475
},
{
"epoch": 2.671480144404332,
"grad_norm": 1.393710344111409,
"learning_rate": 1.4660649819494585e-05,
"loss": 0.2768,
"step": 1480
},
{
"epoch": 2.6805054151624548,
"grad_norm": 1.36136102500023,
"learning_rate": 1.464259927797834e-05,
"loss": 0.2816,
"step": 1485
},
{
"epoch": 2.6895306859205776,
"grad_norm": 1.4211824221206752,
"learning_rate": 1.4624548736462095e-05,
"loss": 0.2856,
"step": 1490
},
{
"epoch": 2.6985559566787005,
"grad_norm": 1.4156029806705734,
"learning_rate": 1.460649819494585e-05,
"loss": 0.2783,
"step": 1495
},
{
"epoch": 2.707581227436823,
"grad_norm": 1.523501338124956,
"learning_rate": 1.4588447653429606e-05,
"loss": 0.2872,
"step": 1500
},
{
"epoch": 2.716606498194946,
"grad_norm": 1.472818222499458,
"learning_rate": 1.457039711191336e-05,
"loss": 0.2806,
"step": 1505
},
{
"epoch": 2.7256317689530687,
"grad_norm": 1.3839972224563968,
"learning_rate": 1.4552346570397113e-05,
"loss": 0.2887,
"step": 1510
},
{
"epoch": 2.734657039711191,
"grad_norm": 1.4093341174867682,
"learning_rate": 1.4534296028880867e-05,
"loss": 0.2812,
"step": 1515
},
{
"epoch": 2.743682310469314,
"grad_norm": 1.4227505403261873,
"learning_rate": 1.4516245487364623e-05,
"loss": 0.2873,
"step": 1520
},
{
"epoch": 2.752707581227437,
"grad_norm": 1.532809014546811,
"learning_rate": 1.4498194945848376e-05,
"loss": 0.2804,
"step": 1525
},
{
"epoch": 2.7617328519855597,
"grad_norm": 1.5838121158952596,
"learning_rate": 1.448014440433213e-05,
"loss": 0.2841,
"step": 1530
},
{
"epoch": 2.770758122743682,
"grad_norm": 1.463955244062951,
"learning_rate": 1.4462093862815884e-05,
"loss": 0.2833,
"step": 1535
},
{
"epoch": 2.779783393501805,
"grad_norm": 1.5918158548066845,
"learning_rate": 1.4444043321299641e-05,
"loss": 0.2815,
"step": 1540
},
{
"epoch": 2.788808664259928,
"grad_norm": 1.4805453499007415,
"learning_rate": 1.4425992779783395e-05,
"loss": 0.2811,
"step": 1545
},
{
"epoch": 2.7978339350180503,
"grad_norm": 1.5502318521124943,
"learning_rate": 1.4407942238267149e-05,
"loss": 0.28,
"step": 1550
},
{
"epoch": 2.806859205776173,
"grad_norm": 1.4288682445201122,
"learning_rate": 1.4389891696750904e-05,
"loss": 0.2868,
"step": 1555
},
{
"epoch": 2.815884476534296,
"grad_norm": 1.4010775386857144,
"learning_rate": 1.4371841155234658e-05,
"loss": 0.2889,
"step": 1560
},
{
"epoch": 2.824909747292419,
"grad_norm": 1.4238966947086382,
"learning_rate": 1.4353790613718412e-05,
"loss": 0.2797,
"step": 1565
},
{
"epoch": 2.8339350180505414,
"grad_norm": 1.6701165033755396,
"learning_rate": 1.4335740072202166e-05,
"loss": 0.2821,
"step": 1570
},
{
"epoch": 2.8429602888086642,
"grad_norm": 1.52168185280516,
"learning_rate": 1.4317689530685921e-05,
"loss": 0.2836,
"step": 1575
},
{
"epoch": 2.851985559566787,
"grad_norm": 1.4368504165059217,
"learning_rate": 1.4299638989169675e-05,
"loss": 0.2821,
"step": 1580
},
{
"epoch": 2.8610108303249095,
"grad_norm": 1.5537901610258407,
"learning_rate": 1.4281588447653432e-05,
"loss": 0.2767,
"step": 1585
},
{
"epoch": 2.8700361010830324,
"grad_norm": 1.5418670823823388,
"learning_rate": 1.4263537906137186e-05,
"loss": 0.286,
"step": 1590
},
{
"epoch": 2.8790613718411553,
"grad_norm": 1.346173892816451,
"learning_rate": 1.424548736462094e-05,
"loss": 0.2855,
"step": 1595
},
{
"epoch": 2.888086642599278,
"grad_norm": 1.4152264182563925,
"learning_rate": 1.4227436823104694e-05,
"loss": 0.2815,
"step": 1600
},
{
"epoch": 2.8971119133574006,
"grad_norm": 1.3391376662233245,
"learning_rate": 1.420938628158845e-05,
"loss": 0.2851,
"step": 1605
},
{
"epoch": 2.9061371841155235,
"grad_norm": 1.4435874024341668,
"learning_rate": 1.4191335740072203e-05,
"loss": 0.2796,
"step": 1610
},
{
"epoch": 2.9151624548736463,
"grad_norm": 1.3751003396664772,
"learning_rate": 1.4173285198555957e-05,
"loss": 0.2825,
"step": 1615
},
{
"epoch": 2.9241877256317688,
"grad_norm": 1.5204342697256372,
"learning_rate": 1.4155234657039711e-05,
"loss": 0.2841,
"step": 1620
},
{
"epoch": 2.9332129963898916,
"grad_norm": 1.6012862613909276,
"learning_rate": 1.4137184115523468e-05,
"loss": 0.2772,
"step": 1625
},
{
"epoch": 2.9422382671480145,
"grad_norm": 1.40233355498611,
"learning_rate": 1.4119133574007222e-05,
"loss": 0.2813,
"step": 1630
},
{
"epoch": 2.9512635379061374,
"grad_norm": 1.4678056138681723,
"learning_rate": 1.4101083032490976e-05,
"loss": 0.2833,
"step": 1635
},
{
"epoch": 2.96028880866426,
"grad_norm": 1.5256510122152633,
"learning_rate": 1.4083032490974731e-05,
"loss": 0.2875,
"step": 1640
},
{
"epoch": 2.9693140794223827,
"grad_norm": 1.5409003183834475,
"learning_rate": 1.4064981949458485e-05,
"loss": 0.2907,
"step": 1645
},
{
"epoch": 2.9783393501805056,
"grad_norm": 1.5329424357705386,
"learning_rate": 1.4046931407942239e-05,
"loss": 0.2795,
"step": 1650
},
{
"epoch": 2.987364620938628,
"grad_norm": 1.4730310617789872,
"learning_rate": 1.4028880866425993e-05,
"loss": 0.2818,
"step": 1655
},
{
"epoch": 2.996389891696751,
"grad_norm": 1.51332909920422,
"learning_rate": 1.4010830324909748e-05,
"loss": 0.2813,
"step": 1660
},
{
"epoch": 3.0054151624548737,
"grad_norm": 1.1148635775562197,
"learning_rate": 1.3992779783393502e-05,
"loss": 0.2436,
"step": 1665
},
{
"epoch": 3.0144404332129966,
"grad_norm": 1.358530663003378,
"learning_rate": 1.397472924187726e-05,
"loss": 0.2095,
"step": 1670
},
{
"epoch": 3.023465703971119,
"grad_norm": 1.3975749965062991,
"learning_rate": 1.3956678700361013e-05,
"loss": 0.2111,
"step": 1675
},
{
"epoch": 3.032490974729242,
"grad_norm": 1.2464319048586523,
"learning_rate": 1.3938628158844767e-05,
"loss": 0.2168,
"step": 1680
},
{
"epoch": 3.0415162454873648,
"grad_norm": 1.2891229687458905,
"learning_rate": 1.392057761732852e-05,
"loss": 0.209,
"step": 1685
},
{
"epoch": 3.050541516245487,
"grad_norm": 1.4037828122476248,
"learning_rate": 1.3902527075812276e-05,
"loss": 0.2153,
"step": 1690
},
{
"epoch": 3.05956678700361,
"grad_norm": 1.2235806246519516,
"learning_rate": 1.388447653429603e-05,
"loss": 0.2096,
"step": 1695
},
{
"epoch": 3.068592057761733,
"grad_norm": 1.378724047451379,
"learning_rate": 1.3866425992779784e-05,
"loss": 0.2085,
"step": 1700
},
{
"epoch": 3.077617328519856,
"grad_norm": 1.382858335186212,
"learning_rate": 1.3848375451263538e-05,
"loss": 0.2204,
"step": 1705
},
{
"epoch": 3.0866425992779782,
"grad_norm": 1.3137128844249182,
"learning_rate": 1.3830324909747293e-05,
"loss": 0.2162,
"step": 1710
},
{
"epoch": 3.095667870036101,
"grad_norm": 1.2029503152905936,
"learning_rate": 1.3812274368231049e-05,
"loss": 0.212,
"step": 1715
},
{
"epoch": 3.104693140794224,
"grad_norm": 1.3679968447380255,
"learning_rate": 1.3794223826714802e-05,
"loss": 0.2112,
"step": 1720
},
{
"epoch": 3.1137184115523464,
"grad_norm": 1.4087458694349797,
"learning_rate": 1.3776173285198558e-05,
"loss": 0.2124,
"step": 1725
},
{
"epoch": 3.1227436823104693,
"grad_norm": 1.2653578136284922,
"learning_rate": 1.3758122743682312e-05,
"loss": 0.2138,
"step": 1730
},
{
"epoch": 3.131768953068592,
"grad_norm": 1.3112584499411382,
"learning_rate": 1.3740072202166066e-05,
"loss": 0.2163,
"step": 1735
},
{
"epoch": 3.140794223826715,
"grad_norm": 1.4058232641289103,
"learning_rate": 1.372202166064982e-05,
"loss": 0.2159,
"step": 1740
},
{
"epoch": 3.1498194945848375,
"grad_norm": 1.41030881061776,
"learning_rate": 1.3703971119133575e-05,
"loss": 0.2145,
"step": 1745
},
{
"epoch": 3.1588447653429603,
"grad_norm": 1.5104671605422084,
"learning_rate": 1.3685920577617329e-05,
"loss": 0.2139,
"step": 1750
},
{
"epoch": 3.167870036101083,
"grad_norm": 1.2630974507680133,
"learning_rate": 1.3667870036101086e-05,
"loss": 0.2157,
"step": 1755
},
{
"epoch": 3.1768953068592056,
"grad_norm": 1.272128771203331,
"learning_rate": 1.364981949458484e-05,
"loss": 0.2161,
"step": 1760
},
{
"epoch": 3.1859205776173285,
"grad_norm": 1.3478004143294164,
"learning_rate": 1.3631768953068594e-05,
"loss": 0.2163,
"step": 1765
},
{
"epoch": 3.1949458483754514,
"grad_norm": 1.450785422963031,
"learning_rate": 1.3613718411552347e-05,
"loss": 0.2073,
"step": 1770
},
{
"epoch": 3.2039711191335742,
"grad_norm": 1.4737969733507372,
"learning_rate": 1.3595667870036103e-05,
"loss": 0.214,
"step": 1775
},
{
"epoch": 3.2129963898916967,
"grad_norm": 1.55495138656805,
"learning_rate": 1.3577617328519857e-05,
"loss": 0.217,
"step": 1780
},
{
"epoch": 3.2220216606498195,
"grad_norm": 1.301565356658425,
"learning_rate": 1.355956678700361e-05,
"loss": 0.2125,
"step": 1785
},
{
"epoch": 3.2310469314079424,
"grad_norm": 1.3526575231198374,
"learning_rate": 1.3541516245487364e-05,
"loss": 0.2139,
"step": 1790
},
{
"epoch": 3.240072202166065,
"grad_norm": 1.3480835110678375,
"learning_rate": 1.352346570397112e-05,
"loss": 0.2163,
"step": 1795
},
{
"epoch": 3.2490974729241877,
"grad_norm": 1.585271243780268,
"learning_rate": 1.3505415162454875e-05,
"loss": 0.2172,
"step": 1800
},
{
"epoch": 3.2581227436823106,
"grad_norm": 1.3914377175448838,
"learning_rate": 1.348736462093863e-05,
"loss": 0.2175,
"step": 1805
},
{
"epoch": 3.2671480144404335,
"grad_norm": 1.3096937764073042,
"learning_rate": 1.3469314079422385e-05,
"loss": 0.2161,
"step": 1810
},
{
"epoch": 3.276173285198556,
"grad_norm": 1.4025247600726756,
"learning_rate": 1.3451263537906139e-05,
"loss": 0.2192,
"step": 1815
},
{
"epoch": 3.2851985559566788,
"grad_norm": 1.341953244519878,
"learning_rate": 1.3433212996389892e-05,
"loss": 0.2193,
"step": 1820
},
{
"epoch": 3.2942238267148016,
"grad_norm": 1.1736050435468526,
"learning_rate": 1.3415162454873646e-05,
"loss": 0.2169,
"step": 1825
},
{
"epoch": 3.303249097472924,
"grad_norm": 7.343497355928463,
"learning_rate": 1.3397111913357402e-05,
"loss": 0.2221,
"step": 1830
},
{
"epoch": 3.312274368231047,
"grad_norm": 1.371834881522216,
"learning_rate": 1.3379061371841155e-05,
"loss": 0.2199,
"step": 1835
},
{
"epoch": 3.32129963898917,
"grad_norm": 1.4365047454230295,
"learning_rate": 1.336101083032491e-05,
"loss": 0.2132,
"step": 1840
},
{
"epoch": 3.3303249097472922,
"grad_norm": 1.272421653704436,
"learning_rate": 1.3342960288808667e-05,
"loss": 0.2184,
"step": 1845
},
{
"epoch": 3.339350180505415,
"grad_norm": 1.4316028096472446,
"learning_rate": 1.332490974729242e-05,
"loss": 0.2132,
"step": 1850
},
{
"epoch": 3.348375451263538,
"grad_norm": 1.2759638728894496,
"learning_rate": 1.3306859205776174e-05,
"loss": 0.2171,
"step": 1855
},
{
"epoch": 3.357400722021661,
"grad_norm": 1.3243009878900587,
"learning_rate": 1.328880866425993e-05,
"loss": 0.2187,
"step": 1860
},
{
"epoch": 3.3664259927797833,
"grad_norm": 1.357548066354826,
"learning_rate": 1.3270758122743683e-05,
"loss": 0.218,
"step": 1865
},
{
"epoch": 3.375451263537906,
"grad_norm": 1.3425006805058106,
"learning_rate": 1.3252707581227437e-05,
"loss": 0.217,
"step": 1870
},
{
"epoch": 3.384476534296029,
"grad_norm": 1.285802529815462,
"learning_rate": 1.3234657039711191e-05,
"loss": 0.216,
"step": 1875
},
{
"epoch": 3.3935018050541514,
"grad_norm": 1.2778875094894446,
"learning_rate": 1.3216606498194947e-05,
"loss": 0.2193,
"step": 1880
},
{
"epoch": 3.4025270758122743,
"grad_norm": 1.1908353789550035,
"learning_rate": 1.3198555956678702e-05,
"loss": 0.2161,
"step": 1885
},
{
"epoch": 3.411552346570397,
"grad_norm": 1.34603474003137,
"learning_rate": 1.3180505415162456e-05,
"loss": 0.2141,
"step": 1890
},
{
"epoch": 3.4205776173285196,
"grad_norm": 1.4297727153398665,
"learning_rate": 1.3162454873646211e-05,
"loss": 0.2237,
"step": 1895
},
{
"epoch": 3.4296028880866425,
"grad_norm": 1.3837512629017574,
"learning_rate": 1.3144404332129965e-05,
"loss": 0.2214,
"step": 1900
},
{
"epoch": 3.4386281588447654,
"grad_norm": 1.4387141423605057,
"learning_rate": 1.3126353790613719e-05,
"loss": 0.2187,
"step": 1905
},
{
"epoch": 3.4476534296028882,
"grad_norm": 1.2799805992130007,
"learning_rate": 1.3108303249097475e-05,
"loss": 0.2179,
"step": 1910
},
{
"epoch": 3.4566787003610107,
"grad_norm": 1.4835446521559619,
"learning_rate": 1.3090252707581228e-05,
"loss": 0.2203,
"step": 1915
},
{
"epoch": 3.4657039711191335,
"grad_norm": 1.341999213127749,
"learning_rate": 1.3072202166064982e-05,
"loss": 0.2168,
"step": 1920
},
{
"epoch": 3.4747292418772564,
"grad_norm": 1.3763087376806546,
"learning_rate": 1.3054151624548736e-05,
"loss": 0.2149,
"step": 1925
},
{
"epoch": 3.483754512635379,
"grad_norm": 1.4007971114232958,
"learning_rate": 1.3036101083032493e-05,
"loss": 0.2182,
"step": 1930
},
{
"epoch": 3.4927797833935017,
"grad_norm": 1.4572957861270215,
"learning_rate": 1.3018050541516247e-05,
"loss": 0.2147,
"step": 1935
},
{
"epoch": 3.5018050541516246,
"grad_norm": 1.3722288775763722,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.2198,
"step": 1940
},
{
"epoch": 3.5108303249097474,
"grad_norm": 1.3099638901670316,
"learning_rate": 1.2981949458483756e-05,
"loss": 0.2195,
"step": 1945
},
{
"epoch": 3.51985559566787,
"grad_norm": 1.2794841153864642,
"learning_rate": 1.296389891696751e-05,
"loss": 0.2181,
"step": 1950
},
{
"epoch": 3.5288808664259927,
"grad_norm": 1.4143673780025412,
"learning_rate": 1.2945848375451264e-05,
"loss": 0.2159,
"step": 1955
},
{
"epoch": 3.5379061371841156,
"grad_norm": 1.2691083266416614,
"learning_rate": 1.2927797833935018e-05,
"loss": 0.2145,
"step": 1960
},
{
"epoch": 3.546931407942238,
"grad_norm": 1.6855255358173022,
"learning_rate": 1.2909747292418773e-05,
"loss": 0.2142,
"step": 1965
},
{
"epoch": 3.555956678700361,
"grad_norm": 1.2577832543255076,
"learning_rate": 1.2891696750902527e-05,
"loss": 0.2185,
"step": 1970
},
{
"epoch": 3.564981949458484,
"grad_norm": 1.4260534179211517,
"learning_rate": 1.2873646209386283e-05,
"loss": 0.2227,
"step": 1975
},
{
"epoch": 3.5740072202166067,
"grad_norm": 1.2525086568956194,
"learning_rate": 1.2855595667870038e-05,
"loss": 0.2193,
"step": 1980
},
{
"epoch": 3.583032490974729,
"grad_norm": 1.2589927076038325,
"learning_rate": 1.2837545126353792e-05,
"loss": 0.2203,
"step": 1985
},
{
"epoch": 3.592057761732852,
"grad_norm": 1.417146294874885,
"learning_rate": 1.2819494584837546e-05,
"loss": 0.2172,
"step": 1990
},
{
"epoch": 3.601083032490975,
"grad_norm": 1.201542111015426,
"learning_rate": 1.2801444043321301e-05,
"loss": 0.2175,
"step": 1995
},
{
"epoch": 3.6101083032490973,
"grad_norm": 1.3003616222573477,
"learning_rate": 1.2783393501805055e-05,
"loss": 0.2218,
"step": 2000
},
{
"epoch": 3.61913357400722,
"grad_norm": 1.2526661517801678,
"learning_rate": 1.2765342960288809e-05,
"loss": 0.2211,
"step": 2005
},
{
"epoch": 3.628158844765343,
"grad_norm": 1.3562427529698038,
"learning_rate": 1.2747292418772563e-05,
"loss": 0.2202,
"step": 2010
},
{
"epoch": 3.637184115523466,
"grad_norm": 1.337359110529062,
"learning_rate": 1.272924187725632e-05,
"loss": 0.2192,
"step": 2015
},
{
"epoch": 3.6462093862815883,
"grad_norm": 1.3247434076106055,
"learning_rate": 1.2711191335740074e-05,
"loss": 0.2175,
"step": 2020
},
{
"epoch": 3.655234657039711,
"grad_norm": 1.401401481932495,
"learning_rate": 1.2693140794223828e-05,
"loss": 0.2176,
"step": 2025
},
{
"epoch": 3.664259927797834,
"grad_norm": 1.3989599012921654,
"learning_rate": 1.2675090252707583e-05,
"loss": 0.2145,
"step": 2030
},
{
"epoch": 3.6732851985559565,
"grad_norm": 1.37942404901644,
"learning_rate": 1.2657039711191337e-05,
"loss": 0.2176,
"step": 2035
},
{
"epoch": 3.6823104693140793,
"grad_norm": 1.3768859929806074,
"learning_rate": 1.263898916967509e-05,
"loss": 0.2216,
"step": 2040
},
{
"epoch": 3.691335740072202,
"grad_norm": 1.4119126477817214,
"learning_rate": 1.2620938628158845e-05,
"loss": 0.2176,
"step": 2045
},
{
"epoch": 3.700361010830325,
"grad_norm": 1.521657968464524,
"learning_rate": 1.26028880866426e-05,
"loss": 0.2215,
"step": 2050
},
{
"epoch": 3.7093862815884475,
"grad_norm": 1.1673555154434911,
"learning_rate": 1.2584837545126354e-05,
"loss": 0.2207,
"step": 2055
},
{
"epoch": 3.7184115523465704,
"grad_norm": 1.3165831513135962,
"learning_rate": 1.256678700361011e-05,
"loss": 0.2199,
"step": 2060
},
{
"epoch": 3.7274368231046933,
"grad_norm": 1.2771969687673677,
"learning_rate": 1.2548736462093865e-05,
"loss": 0.2174,
"step": 2065
},
{
"epoch": 3.7364620938628157,
"grad_norm": 1.3460833673268793,
"learning_rate": 1.2530685920577619e-05,
"loss": 0.2176,
"step": 2070
},
{
"epoch": 3.7454873646209386,
"grad_norm": 1.321801578603406,
"learning_rate": 1.2512635379061373e-05,
"loss": 0.2181,
"step": 2075
},
{
"epoch": 3.7545126353790614,
"grad_norm": 1.296115804685079,
"learning_rate": 1.2494584837545128e-05,
"loss": 0.2174,
"step": 2080
},
{
"epoch": 3.7635379061371843,
"grad_norm": 1.611992163605043,
"learning_rate": 1.2476534296028882e-05,
"loss": 0.2218,
"step": 2085
},
{
"epoch": 3.7725631768953067,
"grad_norm": 1.333178205245191,
"learning_rate": 1.2458483754512636e-05,
"loss": 0.2191,
"step": 2090
},
{
"epoch": 3.7815884476534296,
"grad_norm": 1.3160245659773944,
"learning_rate": 1.244043321299639e-05,
"loss": 0.2103,
"step": 2095
},
{
"epoch": 3.7906137184115525,
"grad_norm": 1.229821228240215,
"learning_rate": 1.2422382671480145e-05,
"loss": 0.2211,
"step": 2100
},
{
"epoch": 3.799638989169675,
"grad_norm": 1.4668142178779533,
"learning_rate": 1.24043321299639e-05,
"loss": 0.2157,
"step": 2105
},
{
"epoch": 3.808664259927798,
"grad_norm": 1.2767987004558847,
"learning_rate": 1.2386281588447654e-05,
"loss": 0.2139,
"step": 2110
},
{
"epoch": 3.8176895306859207,
"grad_norm": 1.3419257818618695,
"learning_rate": 1.236823104693141e-05,
"loss": 0.2177,
"step": 2115
},
{
"epoch": 3.8267148014440435,
"grad_norm": 1.3106254825745933,
"learning_rate": 1.2350180505415164e-05,
"loss": 0.2188,
"step": 2120
},
{
"epoch": 3.835740072202166,
"grad_norm": 1.291543676501794,
"learning_rate": 1.2332129963898918e-05,
"loss": 0.2211,
"step": 2125
},
{
"epoch": 3.844765342960289,
"grad_norm": 1.4261998506808886,
"learning_rate": 1.2314079422382671e-05,
"loss": 0.2197,
"step": 2130
},
{
"epoch": 3.8537906137184117,
"grad_norm": 1.358333414575488,
"learning_rate": 1.2296028880866427e-05,
"loss": 0.2159,
"step": 2135
},
{
"epoch": 3.862815884476534,
"grad_norm": 1.474955377700595,
"learning_rate": 1.227797833935018e-05,
"loss": 0.2218,
"step": 2140
},
{
"epoch": 3.871841155234657,
"grad_norm": 1.3720904636423812,
"learning_rate": 1.2259927797833938e-05,
"loss": 0.2177,
"step": 2145
},
{
"epoch": 3.88086642599278,
"grad_norm": 1.5460881931560433,
"learning_rate": 1.2241877256317692e-05,
"loss": 0.2202,
"step": 2150
},
{
"epoch": 3.8898916967509027,
"grad_norm": 1.372535903476554,
"learning_rate": 1.2223826714801446e-05,
"loss": 0.2133,
"step": 2155
},
{
"epoch": 3.898916967509025,
"grad_norm": 1.3380599019325592,
"learning_rate": 1.22057761732852e-05,
"loss": 0.2156,
"step": 2160
},
{
"epoch": 3.907942238267148,
"grad_norm": 1.5215368016858366,
"learning_rate": 1.2187725631768955e-05,
"loss": 0.2209,
"step": 2165
},
{
"epoch": 3.916967509025271,
"grad_norm": 1.4103734495459146,
"learning_rate": 1.2169675090252709e-05,
"loss": 0.2156,
"step": 2170
},
{
"epoch": 3.9259927797833933,
"grad_norm": 1.3438329909754207,
"learning_rate": 1.2151624548736462e-05,
"loss": 0.2199,
"step": 2175
},
{
"epoch": 3.935018050541516,
"grad_norm": 1.2636238619595532,
"learning_rate": 1.2133574007220216e-05,
"loss": 0.2197,
"step": 2180
},
{
"epoch": 3.944043321299639,
"grad_norm": 1.1840357472659475,
"learning_rate": 1.2115523465703972e-05,
"loss": 0.2122,
"step": 2185
},
{
"epoch": 3.953068592057762,
"grad_norm": 1.2040616851708978,
"learning_rate": 1.2097472924187727e-05,
"loss": 0.2185,
"step": 2190
},
{
"epoch": 3.9620938628158844,
"grad_norm": 1.358963573245228,
"learning_rate": 1.2079422382671481e-05,
"loss": 0.2153,
"step": 2195
},
{
"epoch": 3.9711191335740073,
"grad_norm": 1.2933453928461196,
"learning_rate": 1.2061371841155237e-05,
"loss": 0.2157,
"step": 2200
},
{
"epoch": 3.98014440433213,
"grad_norm": 1.207608106082354,
"learning_rate": 1.204332129963899e-05,
"loss": 0.2199,
"step": 2205
},
{
"epoch": 3.9891696750902526,
"grad_norm": 1.3581034288222624,
"learning_rate": 1.2025270758122744e-05,
"loss": 0.2196,
"step": 2210
},
{
"epoch": 3.9981949458483754,
"grad_norm": 1.508352838653625,
"learning_rate": 1.2007220216606498e-05,
"loss": 0.2219,
"step": 2215
},
{
"epoch": 4.007220216606498,
"grad_norm": 1.1525808259183898,
"learning_rate": 1.1989169675090254e-05,
"loss": 0.172,
"step": 2220
},
{
"epoch": 4.016245487364621,
"grad_norm": 1.6837096670325025,
"learning_rate": 1.1971119133574007e-05,
"loss": 0.1473,
"step": 2225
},
{
"epoch": 4.025270758122744,
"grad_norm": 1.3622634803237335,
"learning_rate": 1.1953068592057765e-05,
"loss": 0.1424,
"step": 2230
},
{
"epoch": 4.034296028880866,
"grad_norm": 1.167482488838041,
"learning_rate": 1.1935018050541518e-05,
"loss": 0.1428,
"step": 2235
},
{
"epoch": 4.043321299638989,
"grad_norm": 1.370361726176911,
"learning_rate": 1.1916967509025272e-05,
"loss": 0.1385,
"step": 2240
},
{
"epoch": 4.052346570397112,
"grad_norm": 1.2641058446971232,
"learning_rate": 1.1898916967509026e-05,
"loss": 0.1358,
"step": 2245
},
{
"epoch": 4.061371841155235,
"grad_norm": 1.4838069133238292,
"learning_rate": 1.1880866425992782e-05,
"loss": 0.1369,
"step": 2250
},
{
"epoch": 4.0703971119133575,
"grad_norm": 1.411880125524948,
"learning_rate": 1.1862815884476535e-05,
"loss": 0.1376,
"step": 2255
},
{
"epoch": 4.07942238267148,
"grad_norm": 1.532801179696419,
"learning_rate": 1.184476534296029e-05,
"loss": 0.144,
"step": 2260
},
{
"epoch": 4.088447653429603,
"grad_norm": 1.2722280030975663,
"learning_rate": 1.1826714801444043e-05,
"loss": 0.1377,
"step": 2265
},
{
"epoch": 4.097472924187725,
"grad_norm": 1.257371634404589,
"learning_rate": 1.1808664259927799e-05,
"loss": 0.1376,
"step": 2270
},
{
"epoch": 4.106498194945848,
"grad_norm": 1.2867994012584483,
"learning_rate": 1.1790613718411554e-05,
"loss": 0.1438,
"step": 2275
},
{
"epoch": 4.115523465703971,
"grad_norm": 1.3235714594660852,
"learning_rate": 1.1772563176895308e-05,
"loss": 0.1375,
"step": 2280
},
{
"epoch": 4.124548736462094,
"grad_norm": 1.3538854718074433,
"learning_rate": 1.1754512635379063e-05,
"loss": 0.1434,
"step": 2285
},
{
"epoch": 4.133574007220217,
"grad_norm": 1.3387220340603523,
"learning_rate": 1.1736462093862817e-05,
"loss": 0.1439,
"step": 2290
},
{
"epoch": 4.14259927797834,
"grad_norm": 1.3342275390499512,
"learning_rate": 1.1718411552346571e-05,
"loss": 0.1423,
"step": 2295
},
{
"epoch": 4.1516245487364625,
"grad_norm": 1.3172292559591101,
"learning_rate": 1.1700361010830325e-05,
"loss": 0.1397,
"step": 2300
},
{
"epoch": 4.1606498194945845,
"grad_norm": 1.5089183076261925,
"learning_rate": 1.168231046931408e-05,
"loss": 0.1478,
"step": 2305
},
{
"epoch": 4.169675090252707,
"grad_norm": 1.2882037882410693,
"learning_rate": 1.1664259927797834e-05,
"loss": 0.1441,
"step": 2310
},
{
"epoch": 4.17870036101083,
"grad_norm": 1.3275271715737593,
"learning_rate": 1.1646209386281588e-05,
"loss": 0.1467,
"step": 2315
},
{
"epoch": 4.187725631768953,
"grad_norm": 1.26669118474737,
"learning_rate": 1.1628158844765345e-05,
"loss": 0.1448,
"step": 2320
},
{
"epoch": 4.196750902527076,
"grad_norm": 1.287627972081866,
"learning_rate": 1.1610108303249099e-05,
"loss": 0.1463,
"step": 2325
},
{
"epoch": 4.205776173285199,
"grad_norm": 1.4698351462515338,
"learning_rate": 1.1592057761732853e-05,
"loss": 0.1418,
"step": 2330
},
{
"epoch": 4.214801444043322,
"grad_norm": 1.214815481124232,
"learning_rate": 1.1574007220216608e-05,
"loss": 0.1414,
"step": 2335
},
{
"epoch": 4.223826714801444,
"grad_norm": 1.3143493372944082,
"learning_rate": 1.1555956678700362e-05,
"loss": 0.1434,
"step": 2340
},
{
"epoch": 4.2328519855595665,
"grad_norm": 1.312607955074974,
"learning_rate": 1.1537906137184116e-05,
"loss": 0.144,
"step": 2345
},
{
"epoch": 4.241877256317689,
"grad_norm": 1.2190888161416562,
"learning_rate": 1.151985559566787e-05,
"loss": 0.1426,
"step": 2350
},
{
"epoch": 4.250902527075812,
"grad_norm": 1.3594194563402886,
"learning_rate": 1.1501805054151625e-05,
"loss": 0.1435,
"step": 2355
},
{
"epoch": 4.259927797833935,
"grad_norm": 1.4782948938985976,
"learning_rate": 1.148375451263538e-05,
"loss": 0.1454,
"step": 2360
},
{
"epoch": 4.268953068592058,
"grad_norm": 1.336811201239736,
"learning_rate": 1.1465703971119135e-05,
"loss": 0.1454,
"step": 2365
},
{
"epoch": 4.277978339350181,
"grad_norm": 1.2482577868629918,
"learning_rate": 1.144765342960289e-05,
"loss": 0.1411,
"step": 2370
},
{
"epoch": 4.287003610108303,
"grad_norm": 1.2732430927693474,
"learning_rate": 1.1429602888086644e-05,
"loss": 0.1417,
"step": 2375
},
{
"epoch": 4.296028880866426,
"grad_norm": 1.5099576743037075,
"learning_rate": 1.1411552346570398e-05,
"loss": 0.1448,
"step": 2380
},
{
"epoch": 4.305054151624549,
"grad_norm": 1.289221803177811,
"learning_rate": 1.1393501805054152e-05,
"loss": 0.147,
"step": 2385
},
{
"epoch": 4.3140794223826715,
"grad_norm": 1.3263143595170082,
"learning_rate": 1.1375451263537907e-05,
"loss": 0.146,
"step": 2390
},
{
"epoch": 4.323104693140794,
"grad_norm": 1.2413770324582891,
"learning_rate": 1.1357400722021661e-05,
"loss": 0.1457,
"step": 2395
},
{
"epoch": 4.332129963898917,
"grad_norm": 1.2639388626402195,
"learning_rate": 1.1339350180505415e-05,
"loss": 0.1507,
"step": 2400
},
{
"epoch": 4.34115523465704,
"grad_norm": 1.264633141154961,
"learning_rate": 1.1321299638989172e-05,
"loss": 0.1449,
"step": 2405
},
{
"epoch": 4.350180505415162,
"grad_norm": 1.436154267765744,
"learning_rate": 1.1303249097472926e-05,
"loss": 0.1507,
"step": 2410
},
{
"epoch": 4.359205776173285,
"grad_norm": 1.1582422344120165,
"learning_rate": 1.128519855595668e-05,
"loss": 0.1484,
"step": 2415
},
{
"epoch": 4.368231046931408,
"grad_norm": 1.279883119070738,
"learning_rate": 1.1267148014440435e-05,
"loss": 0.1427,
"step": 2420
},
{
"epoch": 4.377256317689531,
"grad_norm": 1.3274819211696616,
"learning_rate": 1.1249097472924189e-05,
"loss": 0.1452,
"step": 2425
},
{
"epoch": 4.386281588447654,
"grad_norm": 1.3298646533007747,
"learning_rate": 1.1231046931407943e-05,
"loss": 0.1491,
"step": 2430
},
{
"epoch": 4.3953068592057765,
"grad_norm": 1.2549287331161112,
"learning_rate": 1.1212996389891697e-05,
"loss": 0.1489,
"step": 2435
},
{
"epoch": 4.404332129963899,
"grad_norm": 1.400198206798889,
"learning_rate": 1.1194945848375452e-05,
"loss": 0.1451,
"step": 2440
},
{
"epoch": 4.413357400722021,
"grad_norm": 1.3684946718616282,
"learning_rate": 1.1176895306859206e-05,
"loss": 0.1521,
"step": 2445
},
{
"epoch": 4.422382671480144,
"grad_norm": 1.2087420140469356,
"learning_rate": 1.1158844765342961e-05,
"loss": 0.1454,
"step": 2450
},
{
"epoch": 4.431407942238267,
"grad_norm": 1.2910589959993624,
"learning_rate": 1.1140794223826717e-05,
"loss": 0.1449,
"step": 2455
},
{
"epoch": 4.44043321299639,
"grad_norm": 1.4447075027189171,
"learning_rate": 1.112274368231047e-05,
"loss": 0.1443,
"step": 2460
},
{
"epoch": 4.449458483754513,
"grad_norm": 1.3740604349535315,
"learning_rate": 1.1104693140794225e-05,
"loss": 0.1413,
"step": 2465
},
{
"epoch": 4.458483754512636,
"grad_norm": 1.4050751545666154,
"learning_rate": 1.1086642599277978e-05,
"loss": 0.1483,
"step": 2470
},
{
"epoch": 4.467509025270758,
"grad_norm": 1.2385281470710559,
"learning_rate": 1.1068592057761734e-05,
"loss": 0.1485,
"step": 2475
},
{
"epoch": 4.4765342960288805,
"grad_norm": 1.2281174369886623,
"learning_rate": 1.1050541516245488e-05,
"loss": 0.1485,
"step": 2480
},
{
"epoch": 4.485559566787003,
"grad_norm": 1.2151464275592043,
"learning_rate": 1.1032490974729241e-05,
"loss": 0.1418,
"step": 2485
},
{
"epoch": 4.494584837545126,
"grad_norm": 1.3791171874873505,
"learning_rate": 1.1014440433212999e-05,
"loss": 0.1448,
"step": 2490
},
{
"epoch": 4.503610108303249,
"grad_norm": 1.2082141878609713,
"learning_rate": 1.0996389891696753e-05,
"loss": 0.142,
"step": 2495
},
{
"epoch": 4.512635379061372,
"grad_norm": 1.2284432308372542,
"learning_rate": 1.0978339350180506e-05,
"loss": 0.1462,
"step": 2500
},
{
"epoch": 4.512635379061372,
"eval_loss": 0.11392025649547577,
"eval_runtime": 768.0942,
"eval_samples_per_second": 17.309,
"eval_steps_per_second": 0.721,
"step": 2500
},
{
"epoch": 4.521660649819495,
"grad_norm": 1.230111612515777,
"learning_rate": 1.0960288808664262e-05,
"loss": 0.1425,
"step": 2505
},
{
"epoch": 4.530685920577618,
"grad_norm": 1.3673654833429962,
"learning_rate": 1.0942238267148016e-05,
"loss": 0.146,
"step": 2510
},
{
"epoch": 4.53971119133574,
"grad_norm": 1.470735178829857,
"learning_rate": 1.092418772563177e-05,
"loss": 0.1512,
"step": 2515
},
{
"epoch": 4.548736462093863,
"grad_norm": 1.431415105533255,
"learning_rate": 1.0906137184115523e-05,
"loss": 0.146,
"step": 2520
},
{
"epoch": 4.5577617328519855,
"grad_norm": 1.304958395632544,
"learning_rate": 1.0888086642599279e-05,
"loss": 0.1405,
"step": 2525
},
{
"epoch": 4.566787003610108,
"grad_norm": 1.202244774918958,
"learning_rate": 1.0870036101083033e-05,
"loss": 0.1494,
"step": 2530
},
{
"epoch": 4.575812274368231,
"grad_norm": 1.2061190438112863,
"learning_rate": 1.0851985559566788e-05,
"loss": 0.1454,
"step": 2535
},
{
"epoch": 4.584837545126354,
"grad_norm": 1.2303141902787755,
"learning_rate": 1.0833935018050544e-05,
"loss": 0.1464,
"step": 2540
},
{
"epoch": 4.593862815884476,
"grad_norm": 1.1973084706979935,
"learning_rate": 1.0815884476534297e-05,
"loss": 0.1525,
"step": 2545
},
{
"epoch": 4.602888086642599,
"grad_norm": 1.265181420660065,
"learning_rate": 1.0797833935018051e-05,
"loss": 0.1467,
"step": 2550
},
{
"epoch": 4.611913357400722,
"grad_norm": 1.3348119006510952,
"learning_rate": 1.0779783393501805e-05,
"loss": 0.1503,
"step": 2555
},
{
"epoch": 4.620938628158845,
"grad_norm": 1.3586496560954744,
"learning_rate": 1.076173285198556e-05,
"loss": 0.1478,
"step": 2560
},
{
"epoch": 4.629963898916968,
"grad_norm": 1.2646064815159468,
"learning_rate": 1.0743682310469314e-05,
"loss": 0.1506,
"step": 2565
},
{
"epoch": 4.6389891696750905,
"grad_norm": 1.2208996441560755,
"learning_rate": 1.0725631768953068e-05,
"loss": 0.1489,
"step": 2570
},
{
"epoch": 4.648014440433213,
"grad_norm": 1.2249820178968749,
"learning_rate": 1.0707581227436824e-05,
"loss": 0.1491,
"step": 2575
},
{
"epoch": 4.657039711191336,
"grad_norm": 1.2788771690689633,
"learning_rate": 1.068953068592058e-05,
"loss": 0.1468,
"step": 2580
},
{
"epoch": 4.666064981949458,
"grad_norm": 1.235614926954878,
"learning_rate": 1.0671480144404333e-05,
"loss": 0.1448,
"step": 2585
},
{
"epoch": 4.675090252707581,
"grad_norm": 1.4835623380890686,
"learning_rate": 1.0653429602888089e-05,
"loss": 0.1473,
"step": 2590
},
{
"epoch": 4.684115523465704,
"grad_norm": 1.3282902129276972,
"learning_rate": 1.0635379061371842e-05,
"loss": 0.1513,
"step": 2595
},
{
"epoch": 4.693140794223827,
"grad_norm": 1.3495078303520642,
"learning_rate": 1.0617328519855596e-05,
"loss": 0.1493,
"step": 2600
},
{
"epoch": 4.70216606498195,
"grad_norm": 1.3624724519254527,
"learning_rate": 1.059927797833935e-05,
"loss": 0.1464,
"step": 2605
},
{
"epoch": 4.7111913357400725,
"grad_norm": 1.5149872464672054,
"learning_rate": 1.0581227436823106e-05,
"loss": 0.1476,
"step": 2610
},
{
"epoch": 4.7202166064981945,
"grad_norm": 1.4140631499929084,
"learning_rate": 1.056317689530686e-05,
"loss": 0.1467,
"step": 2615
},
{
"epoch": 4.729241877256317,
"grad_norm": 1.3773886206332047,
"learning_rate": 1.0545126353790615e-05,
"loss": 0.1478,
"step": 2620
},
{
"epoch": 4.73826714801444,
"grad_norm": 1.360291667120557,
"learning_rate": 1.052707581227437e-05,
"loss": 0.1511,
"step": 2625
},
{
"epoch": 4.747292418772563,
"grad_norm": 2.0327952709868455,
"learning_rate": 1.0509025270758124e-05,
"loss": 0.1494,
"step": 2630
},
{
"epoch": 4.756317689530686,
"grad_norm": 1.1791100232489107,
"learning_rate": 1.0490974729241878e-05,
"loss": 0.1489,
"step": 2635
},
{
"epoch": 4.765342960288809,
"grad_norm": 1.3854796935196865,
"learning_rate": 1.0472924187725632e-05,
"loss": 0.1474,
"step": 2640
},
{
"epoch": 4.774368231046932,
"grad_norm": 1.195069413636359,
"learning_rate": 1.0454873646209387e-05,
"loss": 0.1473,
"step": 2645
},
{
"epoch": 4.783393501805055,
"grad_norm": 1.1422586625889126,
"learning_rate": 1.0436823104693141e-05,
"loss": 0.1492,
"step": 2650
},
{
"epoch": 4.792418772563177,
"grad_norm": 1.307147062583015,
"learning_rate": 1.0418772563176895e-05,
"loss": 0.1512,
"step": 2655
},
{
"epoch": 4.8014440433212995,
"grad_norm": 1.3584746863088188,
"learning_rate": 1.040072202166065e-05,
"loss": 0.1457,
"step": 2660
},
{
"epoch": 4.810469314079422,
"grad_norm": 1.3606216007098004,
"learning_rate": 1.0382671480144406e-05,
"loss": 0.1523,
"step": 2665
},
{
"epoch": 4.819494584837545,
"grad_norm": 1.2830676853953231,
"learning_rate": 1.036462093862816e-05,
"loss": 0.1468,
"step": 2670
},
{
"epoch": 4.828519855595668,
"grad_norm": 1.4025094885250153,
"learning_rate": 1.0346570397111915e-05,
"loss": 0.1531,
"step": 2675
},
{
"epoch": 4.837545126353791,
"grad_norm": 1.2538328025809764,
"learning_rate": 1.0328519855595669e-05,
"loss": 0.1511,
"step": 2680
},
{
"epoch": 4.846570397111913,
"grad_norm": 1.2447983687959376,
"learning_rate": 1.0310469314079423e-05,
"loss": 0.1482,
"step": 2685
},
{
"epoch": 4.855595667870036,
"grad_norm": 1.259843451978081,
"learning_rate": 1.0292418772563177e-05,
"loss": 0.1488,
"step": 2690
},
{
"epoch": 4.864620938628159,
"grad_norm": 1.2570447719051847,
"learning_rate": 1.0274368231046932e-05,
"loss": 0.1486,
"step": 2695
},
{
"epoch": 4.873646209386282,
"grad_norm": 1.329856783565704,
"learning_rate": 1.0256317689530686e-05,
"loss": 0.1529,
"step": 2700
},
{
"epoch": 4.882671480144404,
"grad_norm": 1.2638883131188237,
"learning_rate": 1.023826714801444e-05,
"loss": 0.1491,
"step": 2705
},
{
"epoch": 4.891696750902527,
"grad_norm": 1.308091101546035,
"learning_rate": 1.0220216606498197e-05,
"loss": 0.1444,
"step": 2710
},
{
"epoch": 4.90072202166065,
"grad_norm": 1.401420044812705,
"learning_rate": 1.0202166064981951e-05,
"loss": 0.1496,
"step": 2715
},
{
"epoch": 4.909747292418773,
"grad_norm": 1.1802217133046131,
"learning_rate": 1.0184115523465705e-05,
"loss": 0.1447,
"step": 2720
},
{
"epoch": 4.918772563176895,
"grad_norm": 1.418350121725291,
"learning_rate": 1.016606498194946e-05,
"loss": 0.1507,
"step": 2725
},
{
"epoch": 4.927797833935018,
"grad_norm": 1.3201116853524673,
"learning_rate": 1.0148014440433214e-05,
"loss": 0.1486,
"step": 2730
},
{
"epoch": 4.936823104693141,
"grad_norm": 1.3651415755311056,
"learning_rate": 1.0129963898916968e-05,
"loss": 0.1482,
"step": 2735
},
{
"epoch": 4.945848375451264,
"grad_norm": 1.2638304079285558,
"learning_rate": 1.0111913357400722e-05,
"loss": 0.1473,
"step": 2740
},
{
"epoch": 4.9548736462093865,
"grad_norm": 1.1922484627616543,
"learning_rate": 1.0093862815884477e-05,
"loss": 0.1515,
"step": 2745
},
{
"epoch": 4.963898916967509,
"grad_norm": 1.32172317810071,
"learning_rate": 1.0075812274368233e-05,
"loss": 0.151,
"step": 2750
},
{
"epoch": 4.972924187725631,
"grad_norm": 1.3124037260863468,
"learning_rate": 1.0057761732851987e-05,
"loss": 0.1458,
"step": 2755
},
{
"epoch": 4.981949458483754,
"grad_norm": 1.308966677769924,
"learning_rate": 1.0039711191335742e-05,
"loss": 0.1466,
"step": 2760
},
{
"epoch": 4.990974729241877,
"grad_norm": 1.3394659011449825,
"learning_rate": 1.0021660649819496e-05,
"loss": 0.1488,
"step": 2765
},
{
"epoch": 5.0,
"grad_norm": 1.1211076532163786,
"learning_rate": 1.000361010830325e-05,
"loss": 0.1451,
"step": 2770
},
{
"epoch": 5.009025270758123,
"grad_norm": 0.8762321663139153,
"learning_rate": 9.985559566787004e-06,
"loss": 0.0838,
"step": 2775
},
{
"epoch": 5.018050541516246,
"grad_norm": 1.1947630523435986,
"learning_rate": 9.967509025270759e-06,
"loss": 0.0791,
"step": 2780
},
{
"epoch": 5.027075812274369,
"grad_norm": 1.1957279068028621,
"learning_rate": 9.949458483754515e-06,
"loss": 0.0762,
"step": 2785
},
{
"epoch": 5.036101083032491,
"grad_norm": 1.0406919344632632,
"learning_rate": 9.931407942238268e-06,
"loss": 0.0789,
"step": 2790
},
{
"epoch": 5.0451263537906135,
"grad_norm": 1.0243376437328686,
"learning_rate": 9.913357400722022e-06,
"loss": 0.0782,
"step": 2795
},
{
"epoch": 5.054151624548736,
"grad_norm": 1.1064381648099426,
"learning_rate": 9.895306859205776e-06,
"loss": 0.0777,
"step": 2800
},
{
"epoch": 5.063176895306859,
"grad_norm": 0.9669535500625326,
"learning_rate": 9.877256317689532e-06,
"loss": 0.0756,
"step": 2805
},
{
"epoch": 5.072202166064982,
"grad_norm": 1.0367193436042796,
"learning_rate": 9.859205776173287e-06,
"loss": 0.0759,
"step": 2810
},
{
"epoch": 5.081227436823105,
"grad_norm": 1.0786119261971507,
"learning_rate": 9.84115523465704e-06,
"loss": 0.0759,
"step": 2815
},
{
"epoch": 5.090252707581228,
"grad_norm": 1.000167422197771,
"learning_rate": 9.823104693140795e-06,
"loss": 0.0785,
"step": 2820
},
{
"epoch": 5.09927797833935,
"grad_norm": 0.9542524209315384,
"learning_rate": 9.805054151624548e-06,
"loss": 0.0773,
"step": 2825
},
{
"epoch": 5.108303249097473,
"grad_norm": 0.9683260308018407,
"learning_rate": 9.787003610108304e-06,
"loss": 0.0773,
"step": 2830
},
{
"epoch": 5.117328519855596,
"grad_norm": 1.0346987353569643,
"learning_rate": 9.768953068592058e-06,
"loss": 0.0748,
"step": 2835
},
{
"epoch": 5.126353790613718,
"grad_norm": 1.1793270223432109,
"learning_rate": 9.750902527075813e-06,
"loss": 0.078,
"step": 2840
},
{
"epoch": 5.135379061371841,
"grad_norm": 0.8930953181531934,
"learning_rate": 9.732851985559567e-06,
"loss": 0.0772,
"step": 2845
},
{
"epoch": 5.144404332129964,
"grad_norm": 1.023338270678483,
"learning_rate": 9.714801444043323e-06,
"loss": 0.0781,
"step": 2850
},
{
"epoch": 5.153429602888087,
"grad_norm": 0.946465899056401,
"learning_rate": 9.696750902527076e-06,
"loss": 0.0766,
"step": 2855
},
{
"epoch": 5.162454873646209,
"grad_norm": 0.9856919879257939,
"learning_rate": 9.67870036101083e-06,
"loss": 0.078,
"step": 2860
},
{
"epoch": 5.171480144404332,
"grad_norm": 0.9817835099066485,
"learning_rate": 9.660649819494586e-06,
"loss": 0.0778,
"step": 2865
},
{
"epoch": 5.180505415162455,
"grad_norm": 0.9867957619852264,
"learning_rate": 9.642599277978341e-06,
"loss": 0.0794,
"step": 2870
},
{
"epoch": 5.189530685920578,
"grad_norm": 0.9281749841484341,
"learning_rate": 9.624548736462095e-06,
"loss": 0.079,
"step": 2875
},
{
"epoch": 5.1985559566787005,
"grad_norm": 1.1418877678658672,
"learning_rate": 9.606498194945849e-06,
"loss": 0.0816,
"step": 2880
},
{
"epoch": 5.207581227436823,
"grad_norm": 1.2801357990196476,
"learning_rate": 9.588447653429603e-06,
"loss": 0.0802,
"step": 2885
},
{
"epoch": 5.216606498194946,
"grad_norm": 1.08414475669863,
"learning_rate": 9.570397111913358e-06,
"loss": 0.0826,
"step": 2890
},
{
"epoch": 5.225631768953068,
"grad_norm": 1.0307284905851641,
"learning_rate": 9.552346570397114e-06,
"loss": 0.082,
"step": 2895
},
{
"epoch": 5.234657039711191,
"grad_norm": 1.0409677907783568,
"learning_rate": 9.534296028880868e-06,
"loss": 0.0794,
"step": 2900
},
{
"epoch": 5.243682310469314,
"grad_norm": 1.0620698441964218,
"learning_rate": 9.516245487364621e-06,
"loss": 0.0805,
"step": 2905
},
{
"epoch": 5.252707581227437,
"grad_norm": 1.074425732291813,
"learning_rate": 9.498194945848375e-06,
"loss": 0.078,
"step": 2910
},
{
"epoch": 5.26173285198556,
"grad_norm": 1.026175971232318,
"learning_rate": 9.48014440433213e-06,
"loss": 0.0791,
"step": 2915
},
{
"epoch": 5.270758122743683,
"grad_norm": 1.067615156649838,
"learning_rate": 9.462093862815885e-06,
"loss": 0.0801,
"step": 2920
},
{
"epoch": 5.2797833935018055,
"grad_norm": 1.0009258120547495,
"learning_rate": 9.44404332129964e-06,
"loss": 0.0799,
"step": 2925
},
{
"epoch": 5.2888086642599275,
"grad_norm": 0.997000627917853,
"learning_rate": 9.425992779783394e-06,
"loss": 0.0801,
"step": 2930
},
{
"epoch": 5.29783393501805,
"grad_norm": 1.0650332846055963,
"learning_rate": 9.40794223826715e-06,
"loss": 0.083,
"step": 2935
},
{
"epoch": 5.306859205776173,
"grad_norm": 1.0505624675010425,
"learning_rate": 9.389891696750903e-06,
"loss": 0.0806,
"step": 2940
},
{
"epoch": 5.315884476534296,
"grad_norm": 1.0721048636152144,
"learning_rate": 9.371841155234657e-06,
"loss": 0.0835,
"step": 2945
},
{
"epoch": 5.324909747292419,
"grad_norm": 1.133446760779764,
"learning_rate": 9.353790613718413e-06,
"loss": 0.0832,
"step": 2950
},
{
"epoch": 5.333935018050542,
"grad_norm": 0.9772797348599754,
"learning_rate": 9.335740072202168e-06,
"loss": 0.0829,
"step": 2955
},
{
"epoch": 5.342960288808664,
"grad_norm": 1.1072279559914036,
"learning_rate": 9.317689530685922e-06,
"loss": 0.0849,
"step": 2960
},
{
"epoch": 5.351985559566787,
"grad_norm": 1.1628299433020886,
"learning_rate": 9.299638989169676e-06,
"loss": 0.0786,
"step": 2965
},
{
"epoch": 5.3610108303249095,
"grad_norm": 1.098783237460958,
"learning_rate": 9.28158844765343e-06,
"loss": 0.0811,
"step": 2970
},
{
"epoch": 5.370036101083032,
"grad_norm": 1.0282918449980682,
"learning_rate": 9.263537906137185e-06,
"loss": 0.0807,
"step": 2975
},
{
"epoch": 5.379061371841155,
"grad_norm": 0.9904645159012198,
"learning_rate": 9.24548736462094e-06,
"loss": 0.0826,
"step": 2980
},
{
"epoch": 5.388086642599278,
"grad_norm": 0.9548744748187918,
"learning_rate": 9.227436823104694e-06,
"loss": 0.0848,
"step": 2985
},
{
"epoch": 5.397111913357401,
"grad_norm": 0.9743172447638601,
"learning_rate": 9.209386281588448e-06,
"loss": 0.0811,
"step": 2990
},
{
"epoch": 5.406137184115524,
"grad_norm": 1.0785358862799732,
"learning_rate": 9.191335740072202e-06,
"loss": 0.0799,
"step": 2995
},
{
"epoch": 5.415162454873646,
"grad_norm": 1.1287010196459963,
"learning_rate": 9.173285198555957e-06,
"loss": 0.0848,
"step": 3000
},
{
"epoch": 5.424187725631769,
"grad_norm": 1.2004903366809976,
"learning_rate": 9.155234657039711e-06,
"loss": 0.0829,
"step": 3005
},
{
"epoch": 5.433212996389892,
"grad_norm": 1.0193224933306848,
"learning_rate": 9.137184115523467e-06,
"loss": 0.0845,
"step": 3010
},
{
"epoch": 5.4422382671480145,
"grad_norm": 1.0644266825718822,
"learning_rate": 9.11913357400722e-06,
"loss": 0.0839,
"step": 3015
},
{
"epoch": 5.451263537906137,
"grad_norm": 0.8258848911520923,
"learning_rate": 9.101083032490976e-06,
"loss": 0.0822,
"step": 3020
},
{
"epoch": 5.46028880866426,
"grad_norm": 0.9412075089089998,
"learning_rate": 9.08303249097473e-06,
"loss": 0.0796,
"step": 3025
},
{
"epoch": 5.469314079422382,
"grad_norm": 1.1565753144937303,
"learning_rate": 9.064981949458484e-06,
"loss": 0.084,
"step": 3030
},
{
"epoch": 5.478339350180505,
"grad_norm": 1.131138454580635,
"learning_rate": 9.04693140794224e-06,
"loss": 0.0838,
"step": 3035
},
{
"epoch": 5.487364620938628,
"grad_norm": 0.9492827416842319,
"learning_rate": 9.028880866425993e-06,
"loss": 0.0813,
"step": 3040
},
{
"epoch": 5.496389891696751,
"grad_norm": 0.9559475828227638,
"learning_rate": 9.010830324909749e-06,
"loss": 0.0816,
"step": 3045
},
{
"epoch": 5.505415162454874,
"grad_norm": 0.9227409169217686,
"learning_rate": 8.992779783393502e-06,
"loss": 0.0831,
"step": 3050
},
{
"epoch": 5.514440433212997,
"grad_norm": 1.0194412684846976,
"learning_rate": 8.974729241877256e-06,
"loss": 0.0832,
"step": 3055
},
{
"epoch": 5.5234657039711195,
"grad_norm": 1.0830453157290356,
"learning_rate": 8.956678700361012e-06,
"loss": 0.0834,
"step": 3060
},
{
"epoch": 5.532490974729242,
"grad_norm": 1.2132106015297117,
"learning_rate": 8.938628158844767e-06,
"loss": 0.0851,
"step": 3065
},
{
"epoch": 5.541516245487364,
"grad_norm": 1.0126228133692035,
"learning_rate": 8.920577617328521e-06,
"loss": 0.085,
"step": 3070
},
{
"epoch": 5.550541516245487,
"grad_norm": 1.2031735728430073,
"learning_rate": 8.902527075812275e-06,
"loss": 0.0822,
"step": 3075
},
{
"epoch": 5.55956678700361,
"grad_norm": 1.0701261920334157,
"learning_rate": 8.884476534296029e-06,
"loss": 0.0832,
"step": 3080
},
{
"epoch": 5.568592057761733,
"grad_norm": 1.0580816761482588,
"learning_rate": 8.866425992779784e-06,
"loss": 0.0796,
"step": 3085
},
{
"epoch": 5.577617328519856,
"grad_norm": 1.1940482533499739,
"learning_rate": 8.84837545126354e-06,
"loss": 0.0811,
"step": 3090
},
{
"epoch": 5.586642599277979,
"grad_norm": 1.165235309648367,
"learning_rate": 8.830324909747294e-06,
"loss": 0.0863,
"step": 3095
},
{
"epoch": 5.595667870036101,
"grad_norm": 0.9249917161400244,
"learning_rate": 8.812274368231047e-06,
"loss": 0.0826,
"step": 3100
},
{
"epoch": 5.6046931407942235,
"grad_norm": 0.9717526930385568,
"learning_rate": 8.794223826714801e-06,
"loss": 0.0831,
"step": 3105
},
{
"epoch": 5.613718411552346,
"grad_norm": 0.9703434417026183,
"learning_rate": 8.776173285198557e-06,
"loss": 0.0825,
"step": 3110
},
{
"epoch": 5.622743682310469,
"grad_norm": 1.1028319154775599,
"learning_rate": 8.75812274368231e-06,
"loss": 0.0836,
"step": 3115
},
{
"epoch": 5.631768953068592,
"grad_norm": 1.0334242813746233,
"learning_rate": 8.740072202166066e-06,
"loss": 0.0839,
"step": 3120
},
{
"epoch": 5.640794223826715,
"grad_norm": 0.952976186597617,
"learning_rate": 8.72202166064982e-06,
"loss": 0.0829,
"step": 3125
},
{
"epoch": 5.649819494584838,
"grad_norm": 1.1742650529657839,
"learning_rate": 8.703971119133575e-06,
"loss": 0.0851,
"step": 3130
},
{
"epoch": 5.658844765342961,
"grad_norm": 1.0818057279178166,
"learning_rate": 8.68592057761733e-06,
"loss": 0.0843,
"step": 3135
},
{
"epoch": 5.667870036101083,
"grad_norm": 1.0218551853468192,
"learning_rate": 8.667870036101083e-06,
"loss": 0.0835,
"step": 3140
},
{
"epoch": 5.676895306859206,
"grad_norm": 0.9122852593914,
"learning_rate": 8.649819494584839e-06,
"loss": 0.0823,
"step": 3145
},
{
"epoch": 5.6859205776173285,
"grad_norm": 0.9335847424506468,
"learning_rate": 8.631768953068594e-06,
"loss": 0.0816,
"step": 3150
},
{
"epoch": 5.694945848375451,
"grad_norm": 1.0829119759659693,
"learning_rate": 8.613718411552348e-06,
"loss": 0.082,
"step": 3155
},
{
"epoch": 5.703971119133574,
"grad_norm": 0.8918149119424851,
"learning_rate": 8.595667870036102e-06,
"loss": 0.0827,
"step": 3160
},
{
"epoch": 5.712996389891697,
"grad_norm": 0.9986993160612925,
"learning_rate": 8.577617328519855e-06,
"loss": 0.0821,
"step": 3165
},
{
"epoch": 5.722021660649819,
"grad_norm": 1.0234360957137028,
"learning_rate": 8.559566787003611e-06,
"loss": 0.0835,
"step": 3170
},
{
"epoch": 5.731046931407942,
"grad_norm": 1.0744486767713939,
"learning_rate": 8.541516245487366e-06,
"loss": 0.0861,
"step": 3175
},
{
"epoch": 5.740072202166065,
"grad_norm": 1.140075564988142,
"learning_rate": 8.52346570397112e-06,
"loss": 0.0869,
"step": 3180
},
{
"epoch": 5.749097472924188,
"grad_norm": 1.088662389971363,
"learning_rate": 8.505415162454874e-06,
"loss": 0.0828,
"step": 3185
},
{
"epoch": 5.758122743682311,
"grad_norm": 1.0811807303803094,
"learning_rate": 8.487364620938628e-06,
"loss": 0.0847,
"step": 3190
},
{
"epoch": 5.7671480144404335,
"grad_norm": 0.9808903181684402,
"learning_rate": 8.469314079422383e-06,
"loss": 0.0814,
"step": 3195
},
{
"epoch": 5.776173285198556,
"grad_norm": 0.9528918043651443,
"learning_rate": 8.451263537906137e-06,
"loss": 0.0842,
"step": 3200
},
{
"epoch": 5.785198555956678,
"grad_norm": 1.0415470416861194,
"learning_rate": 8.433212996389893e-06,
"loss": 0.0852,
"step": 3205
},
{
"epoch": 5.794223826714801,
"grad_norm": 0.973061380539213,
"learning_rate": 8.415162454873647e-06,
"loss": 0.0823,
"step": 3210
},
{
"epoch": 5.803249097472924,
"grad_norm": 0.9918845339236286,
"learning_rate": 8.397111913357402e-06,
"loss": 0.0855,
"step": 3215
},
{
"epoch": 5.812274368231047,
"grad_norm": 1.0771655009336334,
"learning_rate": 8.379061371841156e-06,
"loss": 0.0834,
"step": 3220
},
{
"epoch": 5.82129963898917,
"grad_norm": 1.2069079266806046,
"learning_rate": 8.36101083032491e-06,
"loss": 0.086,
"step": 3225
},
{
"epoch": 5.830324909747293,
"grad_norm": 1.1089608320239726,
"learning_rate": 8.342960288808665e-06,
"loss": 0.0853,
"step": 3230
},
{
"epoch": 5.8393501805054155,
"grad_norm": 1.0591489304507902,
"learning_rate": 8.324909747292419e-06,
"loss": 0.0851,
"step": 3235
},
{
"epoch": 5.8483754512635375,
"grad_norm": 0.9769104219070994,
"learning_rate": 8.306859205776175e-06,
"loss": 0.0854,
"step": 3240
},
{
"epoch": 5.85740072202166,
"grad_norm": 1.0564740427808548,
"learning_rate": 8.288808664259928e-06,
"loss": 0.0848,
"step": 3245
},
{
"epoch": 5.866425992779783,
"grad_norm": 1.1856181761239502,
"learning_rate": 8.270758122743682e-06,
"loss": 0.0844,
"step": 3250
},
{
"epoch": 5.875451263537906,
"grad_norm": 1.1377866343272012,
"learning_rate": 8.252707581227438e-06,
"loss": 0.083,
"step": 3255
},
{
"epoch": 5.884476534296029,
"grad_norm": 0.960741526303003,
"learning_rate": 8.234657039711193e-06,
"loss": 0.0866,
"step": 3260
},
{
"epoch": 5.893501805054152,
"grad_norm": 1.042073925207735,
"learning_rate": 8.216606498194947e-06,
"loss": 0.0823,
"step": 3265
},
{
"epoch": 5.902527075812275,
"grad_norm": 1.1046114797773166,
"learning_rate": 8.198555956678701e-06,
"loss": 0.0849,
"step": 3270
},
{
"epoch": 5.911552346570397,
"grad_norm": 0.9693673702377407,
"learning_rate": 8.180505415162455e-06,
"loss": 0.0853,
"step": 3275
},
{
"epoch": 5.92057761732852,
"grad_norm": 0.9935728695450117,
"learning_rate": 8.16245487364621e-06,
"loss": 0.0821,
"step": 3280
},
{
"epoch": 5.9296028880866425,
"grad_norm": 1.0804926319490278,
"learning_rate": 8.144404332129964e-06,
"loss": 0.0835,
"step": 3285
},
{
"epoch": 5.938628158844765,
"grad_norm": 1.064140844986384,
"learning_rate": 8.12635379061372e-06,
"loss": 0.0844,
"step": 3290
},
{
"epoch": 5.947653429602888,
"grad_norm": 1.0950234731974577,
"learning_rate": 8.108303249097473e-06,
"loss": 0.0856,
"step": 3295
},
{
"epoch": 5.956678700361011,
"grad_norm": 0.8673519333585892,
"learning_rate": 8.090252707581227e-06,
"loss": 0.083,
"step": 3300
},
{
"epoch": 5.965703971119133,
"grad_norm": 1.0637539730176666,
"learning_rate": 8.072202166064983e-06,
"loss": 0.0876,
"step": 3305
},
{
"epoch": 5.974729241877256,
"grad_norm": 1.1094983157547202,
"learning_rate": 8.054151624548736e-06,
"loss": 0.0824,
"step": 3310
},
{
"epoch": 5.983754512635379,
"grad_norm": 1.0059308649225067,
"learning_rate": 8.036101083032492e-06,
"loss": 0.083,
"step": 3315
},
{
"epoch": 5.992779783393502,
"grad_norm": 1.0366600463191211,
"learning_rate": 8.018050541516246e-06,
"loss": 0.0823,
"step": 3320
},
{
"epoch": 6.001805054151625,
"grad_norm": 0.8077933037227134,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0794,
"step": 3325
},
{
"epoch": 6.0108303249097474,
"grad_norm": 0.6607413873878891,
"learning_rate": 7.981949458483755e-06,
"loss": 0.0592,
"step": 3330
},
{
"epoch": 6.01985559566787,
"grad_norm": 0.8591048311669046,
"learning_rate": 7.963898916967509e-06,
"loss": 0.0581,
"step": 3335
},
{
"epoch": 6.028880866425993,
"grad_norm": 0.70812448877148,
"learning_rate": 7.945848375451264e-06,
"loss": 0.0557,
"step": 3340
},
{
"epoch": 6.037906137184115,
"grad_norm": 0.7669486224905381,
"learning_rate": 7.92779783393502e-06,
"loss": 0.0556,
"step": 3345
},
{
"epoch": 6.046931407942238,
"grad_norm": 0.8323807855804514,
"learning_rate": 7.909747292418774e-06,
"loss": 0.0556,
"step": 3350
},
{
"epoch": 6.055956678700361,
"grad_norm": 0.7520520480227791,
"learning_rate": 7.891696750902528e-06,
"loss": 0.0565,
"step": 3355
},
{
"epoch": 6.064981949458484,
"grad_norm": 0.7627341484745689,
"learning_rate": 7.873646209386281e-06,
"loss": 0.0577,
"step": 3360
},
{
"epoch": 6.074007220216607,
"grad_norm": 0.6284399763085183,
"learning_rate": 7.855595667870037e-06,
"loss": 0.0559,
"step": 3365
},
{
"epoch": 6.0830324909747295,
"grad_norm": 0.6933321046620422,
"learning_rate": 7.83754512635379e-06,
"loss": 0.057,
"step": 3370
},
{
"epoch": 6.092057761732852,
"grad_norm": 0.7124947102302089,
"learning_rate": 7.819494584837546e-06,
"loss": 0.057,
"step": 3375
},
{
"epoch": 6.101083032490974,
"grad_norm": 0.6774361061625748,
"learning_rate": 7.8014440433213e-06,
"loss": 0.0558,
"step": 3380
},
{
"epoch": 6.110108303249097,
"grad_norm": 0.6796571497120569,
"learning_rate": 7.783393501805054e-06,
"loss": 0.0552,
"step": 3385
},
{
"epoch": 6.11913357400722,
"grad_norm": 0.7096655971296874,
"learning_rate": 7.76534296028881e-06,
"loss": 0.0566,
"step": 3390
},
{
"epoch": 6.128158844765343,
"grad_norm": 0.6734447095516544,
"learning_rate": 7.747292418772563e-06,
"loss": 0.0561,
"step": 3395
},
{
"epoch": 6.137184115523466,
"grad_norm": 0.7849122123015055,
"learning_rate": 7.729241877256319e-06,
"loss": 0.0571,
"step": 3400
},
{
"epoch": 6.146209386281589,
"grad_norm": 0.8010538023816386,
"learning_rate": 7.711191335740073e-06,
"loss": 0.0559,
"step": 3405
},
{
"epoch": 6.155234657039712,
"grad_norm": 0.7974825006602668,
"learning_rate": 7.693140794223828e-06,
"loss": 0.0558,
"step": 3410
},
{
"epoch": 6.164259927797834,
"grad_norm": 0.738675744252563,
"learning_rate": 7.675090252707582e-06,
"loss": 0.0589,
"step": 3415
},
{
"epoch": 6.1732851985559565,
"grad_norm": 0.7350542546764174,
"learning_rate": 7.657039711191336e-06,
"loss": 0.0579,
"step": 3420
},
{
"epoch": 6.182310469314079,
"grad_norm": 0.7619517736710059,
"learning_rate": 7.638989169675091e-06,
"loss": 0.0583,
"step": 3425
},
{
"epoch": 6.191335740072202,
"grad_norm": 0.7031956779944963,
"learning_rate": 7.620938628158845e-06,
"loss": 0.0573,
"step": 3430
},
{
"epoch": 6.200361010830325,
"grad_norm": 0.7075380742871121,
"learning_rate": 7.6028880866426006e-06,
"loss": 0.0578,
"step": 3435
},
{
"epoch": 6.209386281588448,
"grad_norm": 0.7290441002589563,
"learning_rate": 7.584837545126354e-06,
"loss": 0.0575,
"step": 3440
},
{
"epoch": 6.21841155234657,
"grad_norm": 0.7207806459527201,
"learning_rate": 7.566787003610109e-06,
"loss": 0.0567,
"step": 3445
},
{
"epoch": 6.227436823104693,
"grad_norm": 0.6314466279187362,
"learning_rate": 7.548736462093863e-06,
"loss": 0.0571,
"step": 3450
},
{
"epoch": 6.236462093862816,
"grad_norm": 0.6887561278193176,
"learning_rate": 7.530685920577618e-06,
"loss": 0.0593,
"step": 3455
},
{
"epoch": 6.245487364620939,
"grad_norm": 0.6206250853652553,
"learning_rate": 7.512635379061373e-06,
"loss": 0.0577,
"step": 3460
},
{
"epoch": 6.254512635379061,
"grad_norm": 0.6522866377346127,
"learning_rate": 7.494584837545127e-06,
"loss": 0.0566,
"step": 3465
},
{
"epoch": 6.263537906137184,
"grad_norm": 0.8267081288646022,
"learning_rate": 7.4765342960288815e-06,
"loss": 0.058,
"step": 3470
},
{
"epoch": 6.272563176895307,
"grad_norm": 0.7317740303239466,
"learning_rate": 7.458483754512636e-06,
"loss": 0.0578,
"step": 3475
},
{
"epoch": 6.28158844765343,
"grad_norm": 0.7147086385332849,
"learning_rate": 7.440433212996391e-06,
"loss": 0.0581,
"step": 3480
},
{
"epoch": 6.290613718411552,
"grad_norm": 0.8128890990175704,
"learning_rate": 7.422382671480145e-06,
"loss": 0.0582,
"step": 3485
},
{
"epoch": 6.299638989169675,
"grad_norm": 0.733774553010267,
"learning_rate": 7.404332129963899e-06,
"loss": 0.0589,
"step": 3490
},
{
"epoch": 6.308664259927798,
"grad_norm": 0.6543459576191735,
"learning_rate": 7.386281588447653e-06,
"loss": 0.0586,
"step": 3495
},
{
"epoch": 6.317689530685921,
"grad_norm": 0.7327266175801325,
"learning_rate": 7.368231046931409e-06,
"loss": 0.0568,
"step": 3500
},
{
"epoch": 6.3267148014440435,
"grad_norm": 0.6332768442841004,
"learning_rate": 7.350180505415163e-06,
"loss": 0.0588,
"step": 3505
},
{
"epoch": 6.335740072202166,
"grad_norm": 0.7514342787787303,
"learning_rate": 7.332129963898917e-06,
"loss": 0.0561,
"step": 3510
},
{
"epoch": 6.344765342960288,
"grad_norm": 0.9126804192240373,
"learning_rate": 7.314079422382672e-06,
"loss": 0.06,
"step": 3515
},
{
"epoch": 6.353790613718411,
"grad_norm": 0.7488940592798972,
"learning_rate": 7.296028880866427e-06,
"loss": 0.0589,
"step": 3520
},
{
"epoch": 6.362815884476534,
"grad_norm": 0.7822412626745566,
"learning_rate": 7.277978339350181e-06,
"loss": 0.057,
"step": 3525
},
{
"epoch": 6.371841155234657,
"grad_norm": 0.641895512790488,
"learning_rate": 7.259927797833936e-06,
"loss": 0.0566,
"step": 3530
},
{
"epoch": 6.38086642599278,
"grad_norm": 0.610560993405513,
"learning_rate": 7.24187725631769e-06,
"loss": 0.0568,
"step": 3535
},
{
"epoch": 6.389891696750903,
"grad_norm": 0.6706724032290002,
"learning_rate": 7.223826714801445e-06,
"loss": 0.0606,
"step": 3540
},
{
"epoch": 6.398916967509026,
"grad_norm": 0.658753045191665,
"learning_rate": 7.2057761732852e-06,
"loss": 0.0593,
"step": 3545
},
{
"epoch": 6.4079422382671485,
"grad_norm": 0.6654660097704281,
"learning_rate": 7.187725631768954e-06,
"loss": 0.0585,
"step": 3550
},
{
"epoch": 6.4169675090252705,
"grad_norm": 0.7183651068540791,
"learning_rate": 7.169675090252708e-06,
"loss": 0.0583,
"step": 3555
},
{
"epoch": 6.425992779783393,
"grad_norm": 0.8088470667398641,
"learning_rate": 7.151624548736462e-06,
"loss": 0.0588,
"step": 3560
},
{
"epoch": 6.435018050541516,
"grad_norm": 0.8263381826886101,
"learning_rate": 7.133574007220218e-06,
"loss": 0.0593,
"step": 3565
},
{
"epoch": 6.444043321299639,
"grad_norm": 0.7405826430937665,
"learning_rate": 7.115523465703971e-06,
"loss": 0.0578,
"step": 3570
},
{
"epoch": 6.453068592057762,
"grad_norm": 0.7185281820450516,
"learning_rate": 7.097472924187726e-06,
"loss": 0.0579,
"step": 3575
},
{
"epoch": 6.462093862815885,
"grad_norm": 0.7382936760799543,
"learning_rate": 7.07942238267148e-06,
"loss": 0.0575,
"step": 3580
},
{
"epoch": 6.471119133574007,
"grad_norm": 0.6877004421197578,
"learning_rate": 7.061371841155235e-06,
"loss": 0.0573,
"step": 3585
},
{
"epoch": 6.48014440433213,
"grad_norm": 0.6861100161163733,
"learning_rate": 7.04332129963899e-06,
"loss": 0.0599,
"step": 3590
},
{
"epoch": 6.4891696750902526,
"grad_norm": 0.6794004024109824,
"learning_rate": 7.025270758122744e-06,
"loss": 0.059,
"step": 3595
},
{
"epoch": 6.498194945848375,
"grad_norm": 0.8104554628255708,
"learning_rate": 7.0072202166064985e-06,
"loss": 0.0587,
"step": 3600
},
{
"epoch": 6.507220216606498,
"grad_norm": 0.8305032099557856,
"learning_rate": 6.989169675090254e-06,
"loss": 0.0585,
"step": 3605
},
{
"epoch": 6.516245487364621,
"grad_norm": 0.8023646626140253,
"learning_rate": 6.971119133574008e-06,
"loss": 0.0595,
"step": 3610
},
{
"epoch": 6.525270758122744,
"grad_norm": 0.765073355799892,
"learning_rate": 6.9530685920577625e-06,
"loss": 0.0584,
"step": 3615
},
{
"epoch": 6.534296028880867,
"grad_norm": 0.7404552154431031,
"learning_rate": 6.935018050541516e-06,
"loss": 0.0577,
"step": 3620
},
{
"epoch": 6.543321299638989,
"grad_norm": 0.7405781286739906,
"learning_rate": 6.916967509025271e-06,
"loss": 0.0574,
"step": 3625
},
{
"epoch": 6.552346570397112,
"grad_norm": 0.7793217854667968,
"learning_rate": 6.8989169675090265e-06,
"loss": 0.0605,
"step": 3630
},
{
"epoch": 6.561371841155235,
"grad_norm": 0.5845887643829276,
"learning_rate": 6.88086642599278e-06,
"loss": 0.0599,
"step": 3635
},
{
"epoch": 6.5703971119133575,
"grad_norm": 0.7145137796672513,
"learning_rate": 6.862815884476535e-06,
"loss": 0.0585,
"step": 3640
},
{
"epoch": 6.57942238267148,
"grad_norm": 0.7417399070524405,
"learning_rate": 6.844765342960289e-06,
"loss": 0.0587,
"step": 3645
},
{
"epoch": 6.588447653429603,
"grad_norm": 0.6846077356552561,
"learning_rate": 6.826714801444044e-06,
"loss": 0.0588,
"step": 3650
},
{
"epoch": 6.597472924187725,
"grad_norm": 0.6999561458466752,
"learning_rate": 6.808664259927798e-06,
"loss": 0.0595,
"step": 3655
},
{
"epoch": 6.606498194945848,
"grad_norm": 0.6567498870286701,
"learning_rate": 6.790613718411553e-06,
"loss": 0.0581,
"step": 3660
},
{
"epoch": 6.615523465703971,
"grad_norm": 0.7350564747052517,
"learning_rate": 6.7725631768953075e-06,
"loss": 0.0596,
"step": 3665
},
{
"epoch": 6.624548736462094,
"grad_norm": 0.6574574010183412,
"learning_rate": 6.754512635379062e-06,
"loss": 0.0584,
"step": 3670
},
{
"epoch": 6.633574007220217,
"grad_norm": 0.754103084530502,
"learning_rate": 6.736462093862817e-06,
"loss": 0.0597,
"step": 3675
},
{
"epoch": 6.64259927797834,
"grad_norm": 0.6204823250912029,
"learning_rate": 6.718411552346571e-06,
"loss": 0.0594,
"step": 3680
},
{
"epoch": 6.6516245487364625,
"grad_norm": 0.6526250315655968,
"learning_rate": 6.700361010830325e-06,
"loss": 0.0586,
"step": 3685
},
{
"epoch": 6.6606498194945845,
"grad_norm": 0.6759108282695475,
"learning_rate": 6.682310469314079e-06,
"loss": 0.0598,
"step": 3690
},
{
"epoch": 6.669675090252707,
"grad_norm": 0.6281015940025027,
"learning_rate": 6.664259927797835e-06,
"loss": 0.0583,
"step": 3695
},
{
"epoch": 6.67870036101083,
"grad_norm": 0.8112328410048235,
"learning_rate": 6.646209386281589e-06,
"loss": 0.0581,
"step": 3700
},
{
"epoch": 6.687725631768953,
"grad_norm": 0.8294177729044503,
"learning_rate": 6.628158844765343e-06,
"loss": 0.0588,
"step": 3705
},
{
"epoch": 6.696750902527076,
"grad_norm": 0.6837556241925122,
"learning_rate": 6.610108303249098e-06,
"loss": 0.0584,
"step": 3710
},
{
"epoch": 6.705776173285199,
"grad_norm": 0.6977698116292954,
"learning_rate": 6.592057761732853e-06,
"loss": 0.0607,
"step": 3715
},
{
"epoch": 6.714801444043322,
"grad_norm": 0.8094312645469506,
"learning_rate": 6.574007220216607e-06,
"loss": 0.0583,
"step": 3720
},
{
"epoch": 6.723826714801444,
"grad_norm": 0.6015350505674695,
"learning_rate": 6.555956678700362e-06,
"loss": 0.0584,
"step": 3725
},
{
"epoch": 6.7328519855595665,
"grad_norm": 0.7144030665488617,
"learning_rate": 6.5379061371841156e-06,
"loss": 0.0586,
"step": 3730
},
{
"epoch": 6.741877256317689,
"grad_norm": 0.7376393032962922,
"learning_rate": 6.519855595667871e-06,
"loss": 0.0591,
"step": 3735
},
{
"epoch": 6.750902527075812,
"grad_norm": 0.8824089248862236,
"learning_rate": 6.501805054151626e-06,
"loss": 0.0613,
"step": 3740
},
{
"epoch": 6.759927797833935,
"grad_norm": 0.6572712036921975,
"learning_rate": 6.4837545126353796e-06,
"loss": 0.0584,
"step": 3745
},
{
"epoch": 6.768953068592058,
"grad_norm": 0.6525445173798352,
"learning_rate": 6.465703971119134e-06,
"loss": 0.0599,
"step": 3750
},
{
"epoch": 6.777978339350181,
"grad_norm": 0.7744924607467405,
"learning_rate": 6.447653429602888e-06,
"loss": 0.0603,
"step": 3755
},
{
"epoch": 6.787003610108303,
"grad_norm": 0.7489114246161626,
"learning_rate": 6.4296028880866436e-06,
"loss": 0.0593,
"step": 3760
},
{
"epoch": 6.796028880866426,
"grad_norm": 0.645441657942143,
"learning_rate": 6.411552346570397e-06,
"loss": 0.0601,
"step": 3765
},
{
"epoch": 6.805054151624549,
"grad_norm": 0.7628853557562308,
"learning_rate": 6.393501805054152e-06,
"loss": 0.059,
"step": 3770
},
{
"epoch": 6.8140794223826715,
"grad_norm": 0.6961794188939842,
"learning_rate": 6.375451263537906e-06,
"loss": 0.0589,
"step": 3775
},
{
"epoch": 6.823104693140794,
"grad_norm": 0.732051818736521,
"learning_rate": 6.357400722021661e-06,
"loss": 0.059,
"step": 3780
},
{
"epoch": 6.832129963898917,
"grad_norm": 0.6559311687860191,
"learning_rate": 6.339350180505416e-06,
"loss": 0.0583,
"step": 3785
},
{
"epoch": 6.841155234657039,
"grad_norm": 0.6593139916118618,
"learning_rate": 6.32129963898917e-06,
"loss": 0.0584,
"step": 3790
},
{
"epoch": 6.850180505415162,
"grad_norm": 0.6976834804145813,
"learning_rate": 6.3032490974729245e-06,
"loss": 0.0592,
"step": 3795
},
{
"epoch": 6.859205776173285,
"grad_norm": 0.9247025540887917,
"learning_rate": 6.28519855595668e-06,
"loss": 0.0579,
"step": 3800
},
{
"epoch": 6.868231046931408,
"grad_norm": 0.8326401902247998,
"learning_rate": 6.267148014440434e-06,
"loss": 0.0604,
"step": 3805
},
{
"epoch": 6.877256317689531,
"grad_norm": 0.5853865156076931,
"learning_rate": 6.2490974729241885e-06,
"loss": 0.0594,
"step": 3810
},
{
"epoch": 6.886281588447654,
"grad_norm": 0.6292032096557062,
"learning_rate": 6.231046931407942e-06,
"loss": 0.0573,
"step": 3815
},
{
"epoch": 6.8953068592057765,
"grad_norm": 0.7744076548642933,
"learning_rate": 6.212996389891697e-06,
"loss": 0.0607,
"step": 3820
},
{
"epoch": 6.904332129963899,
"grad_norm": 0.7914463992742894,
"learning_rate": 6.1949458483754525e-06,
"loss": 0.0606,
"step": 3825
},
{
"epoch": 6.913357400722021,
"grad_norm": 0.701945083370995,
"learning_rate": 6.176895306859206e-06,
"loss": 0.0587,
"step": 3830
},
{
"epoch": 6.922382671480144,
"grad_norm": 0.6696484276570565,
"learning_rate": 6.158844765342961e-06,
"loss": 0.0608,
"step": 3835
},
{
"epoch": 6.931407942238267,
"grad_norm": 0.7132432488001278,
"learning_rate": 6.140794223826715e-06,
"loss": 0.0576,
"step": 3840
},
{
"epoch": 6.94043321299639,
"grad_norm": 0.6745043956786805,
"learning_rate": 6.12274368231047e-06,
"loss": 0.0597,
"step": 3845
},
{
"epoch": 6.949458483754513,
"grad_norm": 0.7735109912206191,
"learning_rate": 6.104693140794224e-06,
"loss": 0.058,
"step": 3850
},
{
"epoch": 6.958483754512636,
"grad_norm": 0.8204235477419153,
"learning_rate": 6.086642599277979e-06,
"loss": 0.0611,
"step": 3855
},
{
"epoch": 6.967509025270758,
"grad_norm": 0.6940007350939205,
"learning_rate": 6.068592057761733e-06,
"loss": 0.0572,
"step": 3860
},
{
"epoch": 6.9765342960288805,
"grad_norm": 0.7293292682342528,
"learning_rate": 6.050541516245488e-06,
"loss": 0.0595,
"step": 3865
},
{
"epoch": 6.985559566787003,
"grad_norm": 0.7122810615357342,
"learning_rate": 6.032490974729243e-06,
"loss": 0.0581,
"step": 3870
},
{
"epoch": 6.994584837545126,
"grad_norm": 0.6537994119990527,
"learning_rate": 6.014440433212997e-06,
"loss": 0.0598,
"step": 3875
},
{
"epoch": 7.003610108303249,
"grad_norm": 0.45575554548627367,
"learning_rate": 5.996389891696751e-06,
"loss": 0.0554,
"step": 3880
},
{
"epoch": 7.012635379061372,
"grad_norm": 0.5070177525024102,
"learning_rate": 5.978339350180505e-06,
"loss": 0.0472,
"step": 3885
},
{
"epoch": 7.021660649819495,
"grad_norm": 0.5588320341624766,
"learning_rate": 5.960288808664261e-06,
"loss": 0.0476,
"step": 3890
},
{
"epoch": 7.030685920577618,
"grad_norm": 0.4730456078169544,
"learning_rate": 5.942238267148015e-06,
"loss": 0.0465,
"step": 3895
},
{
"epoch": 7.03971119133574,
"grad_norm": 0.4802501819942531,
"learning_rate": 5.924187725631769e-06,
"loss": 0.0467,
"step": 3900
},
{
"epoch": 7.048736462093863,
"grad_norm": 0.43376711913832905,
"learning_rate": 5.906137184115524e-06,
"loss": 0.046,
"step": 3905
},
{
"epoch": 7.0577617328519855,
"grad_norm": 0.5467463303002804,
"learning_rate": 5.888086642599279e-06,
"loss": 0.0472,
"step": 3910
},
{
"epoch": 7.066787003610108,
"grad_norm": 0.5482465563214539,
"learning_rate": 5.870036101083033e-06,
"loss": 0.0484,
"step": 3915
},
{
"epoch": 7.075812274368231,
"grad_norm": 0.47912292874862494,
"learning_rate": 5.851985559566788e-06,
"loss": 0.0464,
"step": 3920
},
{
"epoch": 7.084837545126354,
"grad_norm": 0.5614426575228361,
"learning_rate": 5.8339350180505415e-06,
"loss": 0.0473,
"step": 3925
},
{
"epoch": 7.093862815884476,
"grad_norm": 0.45529148018104104,
"learning_rate": 5.815884476534297e-06,
"loss": 0.0475,
"step": 3930
},
{
"epoch": 7.102888086642599,
"grad_norm": 0.4788344545919543,
"learning_rate": 5.797833935018051e-06,
"loss": 0.0479,
"step": 3935
},
{
"epoch": 7.111913357400722,
"grad_norm": 0.46173140924255596,
"learning_rate": 5.7797833935018055e-06,
"loss": 0.0473,
"step": 3940
},
{
"epoch": 7.120938628158845,
"grad_norm": 0.5390906257209109,
"learning_rate": 5.761732851985559e-06,
"loss": 0.0485,
"step": 3945
},
{
"epoch": 7.129963898916968,
"grad_norm": 0.5227509249629044,
"learning_rate": 5.743682310469314e-06,
"loss": 0.0467,
"step": 3950
},
{
"epoch": 7.1389891696750905,
"grad_norm": 0.48947576478003363,
"learning_rate": 5.7256317689530695e-06,
"loss": 0.0472,
"step": 3955
},
{
"epoch": 7.148014440433213,
"grad_norm": 0.49244408024963654,
"learning_rate": 5.707581227436823e-06,
"loss": 0.0483,
"step": 3960
},
{
"epoch": 7.157039711191336,
"grad_norm": 0.5222815328881174,
"learning_rate": 5.689530685920578e-06,
"loss": 0.048,
"step": 3965
},
{
"epoch": 7.166064981949458,
"grad_norm": 0.5817714524678111,
"learning_rate": 5.671480144404332e-06,
"loss": 0.0473,
"step": 3970
},
{
"epoch": 7.175090252707581,
"grad_norm": 0.5925655806700871,
"learning_rate": 5.653429602888087e-06,
"loss": 0.0497,
"step": 3975
},
{
"epoch": 7.184115523465704,
"grad_norm": 0.49153223210393315,
"learning_rate": 5.635379061371842e-06,
"loss": 0.0488,
"step": 3980
},
{
"epoch": 7.193140794223827,
"grad_norm": 0.47623000392936515,
"learning_rate": 5.617328519855596e-06,
"loss": 0.0483,
"step": 3985
},
{
"epoch": 7.20216606498195,
"grad_norm": 0.5429876236829767,
"learning_rate": 5.5992779783393505e-06,
"loss": 0.0475,
"step": 3990
},
{
"epoch": 7.2111913357400725,
"grad_norm": 0.5392076103305925,
"learning_rate": 5.581227436823106e-06,
"loss": 0.0473,
"step": 3995
},
{
"epoch": 7.2202166064981945,
"grad_norm": 0.4442145984166065,
"learning_rate": 5.56317689530686e-06,
"loss": 0.0477,
"step": 4000
},
{
"epoch": 7.229241877256317,
"grad_norm": 0.4474040313759183,
"learning_rate": 5.5451263537906145e-06,
"loss": 0.0478,
"step": 4005
},
{
"epoch": 7.23826714801444,
"grad_norm": 0.49719158373571726,
"learning_rate": 5.527075812274368e-06,
"loss": 0.0471,
"step": 4010
},
{
"epoch": 7.247292418772563,
"grad_norm": 0.5110277695420385,
"learning_rate": 5.509025270758123e-06,
"loss": 0.0472,
"step": 4015
},
{
"epoch": 7.256317689530686,
"grad_norm": 0.551234875347394,
"learning_rate": 5.490974729241878e-06,
"loss": 0.0482,
"step": 4020
},
{
"epoch": 7.265342960288809,
"grad_norm": 0.5164595334266263,
"learning_rate": 5.472924187725632e-06,
"loss": 0.0483,
"step": 4025
},
{
"epoch": 7.274368231046932,
"grad_norm": 0.5120155623092252,
"learning_rate": 5.454873646209387e-06,
"loss": 0.0473,
"step": 4030
},
{
"epoch": 7.283393501805054,
"grad_norm": 0.7138533885415025,
"learning_rate": 5.436823104693141e-06,
"loss": 0.0491,
"step": 4035
},
{
"epoch": 7.292418772563177,
"grad_norm": 0.5256186511567283,
"learning_rate": 5.418772563176896e-06,
"loss": 0.0482,
"step": 4040
},
{
"epoch": 7.3014440433212995,
"grad_norm": 0.5308368446047028,
"learning_rate": 5.40072202166065e-06,
"loss": 0.0483,
"step": 4045
},
{
"epoch": 7.310469314079422,
"grad_norm": 0.6340076056397649,
"learning_rate": 5.382671480144405e-06,
"loss": 0.0497,
"step": 4050
},
{
"epoch": 7.319494584837545,
"grad_norm": 0.5515989056962669,
"learning_rate": 5.3646209386281586e-06,
"loss": 0.0485,
"step": 4055
},
{
"epoch": 7.328519855595668,
"grad_norm": 0.5670979301767023,
"learning_rate": 5.346570397111914e-06,
"loss": 0.049,
"step": 4060
},
{
"epoch": 7.337545126353791,
"grad_norm": 0.5155332846058096,
"learning_rate": 5.328519855595669e-06,
"loss": 0.0491,
"step": 4065
},
{
"epoch": 7.346570397111913,
"grad_norm": 0.4824925439743894,
"learning_rate": 5.3104693140794226e-06,
"loss": 0.0478,
"step": 4070
},
{
"epoch": 7.355595667870036,
"grad_norm": 0.5632294248317786,
"learning_rate": 5.292418772563177e-06,
"loss": 0.0487,
"step": 4075
},
{
"epoch": 7.364620938628159,
"grad_norm": 0.5912408794489219,
"learning_rate": 5.274368231046931e-06,
"loss": 0.0494,
"step": 4080
},
{
"epoch": 7.373646209386282,
"grad_norm": 0.6610618876210643,
"learning_rate": 5.2563176895306866e-06,
"loss": 0.0484,
"step": 4085
},
{
"epoch": 7.382671480144404,
"grad_norm": 0.5715318430573592,
"learning_rate": 5.238267148014441e-06,
"loss": 0.0484,
"step": 4090
},
{
"epoch": 7.391696750902527,
"grad_norm": 0.5661862605906709,
"learning_rate": 5.220216606498195e-06,
"loss": 0.0486,
"step": 4095
},
{
"epoch": 7.40072202166065,
"grad_norm": 0.9029651147919706,
"learning_rate": 5.20216606498195e-06,
"loss": 0.0508,
"step": 4100
},
{
"epoch": 7.409747292418772,
"grad_norm": 0.5165409614311138,
"learning_rate": 5.184115523465705e-06,
"loss": 0.0495,
"step": 4105
},
{
"epoch": 7.418772563176895,
"grad_norm": 0.5306583449154315,
"learning_rate": 5.166064981949459e-06,
"loss": 0.0491,
"step": 4110
},
{
"epoch": 7.427797833935018,
"grad_norm": 0.450162478445713,
"learning_rate": 5.148014440433214e-06,
"loss": 0.0481,
"step": 4115
},
{
"epoch": 7.436823104693141,
"grad_norm": 0.5832168434645013,
"learning_rate": 5.1299638989169675e-06,
"loss": 0.0484,
"step": 4120
},
{
"epoch": 7.445848375451264,
"grad_norm": 0.6015879512932737,
"learning_rate": 5.111913357400723e-06,
"loss": 0.0483,
"step": 4125
},
{
"epoch": 7.4548736462093865,
"grad_norm": 0.516158408533585,
"learning_rate": 5.093862815884477e-06,
"loss": 0.0502,
"step": 4130
},
{
"epoch": 7.463898916967509,
"grad_norm": 0.5739166956674056,
"learning_rate": 5.0758122743682315e-06,
"loss": 0.049,
"step": 4135
},
{
"epoch": 7.472924187725631,
"grad_norm": 0.5545335828483756,
"learning_rate": 5.057761732851985e-06,
"loss": 0.0493,
"step": 4140
},
{
"epoch": 7.481949458483754,
"grad_norm": 0.47988775413231505,
"learning_rate": 5.03971119133574e-06,
"loss": 0.0481,
"step": 4145
},
{
"epoch": 7.490974729241877,
"grad_norm": 0.4135154941397547,
"learning_rate": 5.0216606498194955e-06,
"loss": 0.0485,
"step": 4150
},
{
"epoch": 7.5,
"grad_norm": 0.584778293406958,
"learning_rate": 5.003610108303249e-06,
"loss": 0.0496,
"step": 4155
},
{
"epoch": 7.509025270758123,
"grad_norm": 0.6596870903880235,
"learning_rate": 4.985559566787004e-06,
"loss": 0.0509,
"step": 4160
},
{
"epoch": 7.518050541516246,
"grad_norm": 0.5258493182778312,
"learning_rate": 4.967509025270759e-06,
"loss": 0.0479,
"step": 4165
},
{
"epoch": 7.527075812274369,
"grad_norm": 0.5240346323436935,
"learning_rate": 4.949458483754513e-06,
"loss": 0.0486,
"step": 4170
},
{
"epoch": 7.536101083032491,
"grad_norm": 0.5698373146810196,
"learning_rate": 4.931407942238268e-06,
"loss": 0.0498,
"step": 4175
},
{
"epoch": 7.5451263537906135,
"grad_norm": 0.5168274629904006,
"learning_rate": 4.913357400722022e-06,
"loss": 0.0484,
"step": 4180
},
{
"epoch": 7.554151624548736,
"grad_norm": 0.4967245847978526,
"learning_rate": 4.8953068592057764e-06,
"loss": 0.0494,
"step": 4185
},
{
"epoch": 7.563176895306859,
"grad_norm": 0.5296559595656957,
"learning_rate": 4.877256317689531e-06,
"loss": 0.0487,
"step": 4190
},
{
"epoch": 7.572202166064982,
"grad_norm": 0.4681797361268003,
"learning_rate": 4.859205776173286e-06,
"loss": 0.0492,
"step": 4195
},
{
"epoch": 7.581227436823105,
"grad_norm": 0.5837080132512812,
"learning_rate": 4.8411552346570404e-06,
"loss": 0.0505,
"step": 4200
},
{
"epoch": 7.590252707581227,
"grad_norm": 0.6465877829891032,
"learning_rate": 4.823104693140795e-06,
"loss": 0.0491,
"step": 4205
},
{
"epoch": 7.59927797833935,
"grad_norm": 0.5542060145314779,
"learning_rate": 4.805054151624549e-06,
"loss": 0.0502,
"step": 4210
},
{
"epoch": 7.608303249097473,
"grad_norm": 0.544169659697146,
"learning_rate": 4.787003610108304e-06,
"loss": 0.049,
"step": 4215
},
{
"epoch": 7.617328519855596,
"grad_norm": 0.4766240683764691,
"learning_rate": 4.768953068592058e-06,
"loss": 0.0488,
"step": 4220
},
{
"epoch": 7.626353790613718,
"grad_norm": 0.5857310047242031,
"learning_rate": 4.750902527075812e-06,
"loss": 0.0484,
"step": 4225
},
{
"epoch": 7.635379061371841,
"grad_norm": 0.6584297092461749,
"learning_rate": 4.7328519855595676e-06,
"loss": 0.0504,
"step": 4230
},
{
"epoch": 7.644404332129964,
"grad_norm": 0.6165622003595669,
"learning_rate": 4.714801444043321e-06,
"loss": 0.0497,
"step": 4235
},
{
"epoch": 7.653429602888087,
"grad_norm": 0.5411935348913237,
"learning_rate": 4.696750902527076e-06,
"loss": 0.0493,
"step": 4240
},
{
"epoch": 7.662454873646209,
"grad_norm": 0.5757635354988099,
"learning_rate": 4.678700361010831e-06,
"loss": 0.0487,
"step": 4245
},
{
"epoch": 7.671480144404332,
"grad_norm": 0.4789345734543781,
"learning_rate": 4.660649819494585e-06,
"loss": 0.0493,
"step": 4250
},
{
"epoch": 7.680505415162455,
"grad_norm": 0.5729021031172669,
"learning_rate": 4.64259927797834e-06,
"loss": 0.0494,
"step": 4255
},
{
"epoch": 7.689530685920578,
"grad_norm": 0.5131076676913804,
"learning_rate": 4.624548736462095e-06,
"loss": 0.0486,
"step": 4260
},
{
"epoch": 7.6985559566787005,
"grad_norm": 0.5950116178035108,
"learning_rate": 4.6064981949458485e-06,
"loss": 0.0492,
"step": 4265
},
{
"epoch": 7.707581227436823,
"grad_norm": 0.5524155549870725,
"learning_rate": 4.588447653429603e-06,
"loss": 0.049,
"step": 4270
},
{
"epoch": 7.716606498194945,
"grad_norm": 0.5270091994399845,
"learning_rate": 4.570397111913358e-06,
"loss": 0.0488,
"step": 4275
},
{
"epoch": 7.725631768953068,
"grad_norm": 0.46706887147243975,
"learning_rate": 4.552346570397112e-06,
"loss": 0.0499,
"step": 4280
},
{
"epoch": 7.734657039711191,
"grad_norm": 0.598860629124346,
"learning_rate": 4.534296028880867e-06,
"loss": 0.049,
"step": 4285
},
{
"epoch": 7.743682310469314,
"grad_norm": 0.4792268975334193,
"learning_rate": 4.516245487364621e-06,
"loss": 0.0504,
"step": 4290
},
{
"epoch": 7.752707581227437,
"grad_norm": 0.5423376274444353,
"learning_rate": 4.498194945848376e-06,
"loss": 0.0487,
"step": 4295
},
{
"epoch": 7.76173285198556,
"grad_norm": 0.5483231957576472,
"learning_rate": 4.48014440433213e-06,
"loss": 0.0492,
"step": 4300
},
{
"epoch": 7.770758122743683,
"grad_norm": 0.5272610680544871,
"learning_rate": 4.462093862815885e-06,
"loss": 0.0498,
"step": 4305
},
{
"epoch": 7.7797833935018055,
"grad_norm": 0.5746187590999563,
"learning_rate": 4.444043321299639e-06,
"loss": 0.0476,
"step": 4310
},
{
"epoch": 7.7888086642599275,
"grad_norm": 0.5034631208906547,
"learning_rate": 4.425992779783394e-06,
"loss": 0.0499,
"step": 4315
},
{
"epoch": 7.79783393501805,
"grad_norm": 0.4802523837756993,
"learning_rate": 4.407942238267148e-06,
"loss": 0.0491,
"step": 4320
},
{
"epoch": 7.806859205776173,
"grad_norm": 0.5995873657958978,
"learning_rate": 4.389891696750903e-06,
"loss": 0.0503,
"step": 4325
},
{
"epoch": 7.815884476534296,
"grad_norm": 0.5877452980751235,
"learning_rate": 4.3718411552346575e-06,
"loss": 0.0484,
"step": 4330
},
{
"epoch": 7.824909747292419,
"grad_norm": 0.5912813010558485,
"learning_rate": 4.353790613718412e-06,
"loss": 0.0506,
"step": 4335
},
{
"epoch": 7.833935018050542,
"grad_norm": 0.5241916120291473,
"learning_rate": 4.335740072202167e-06,
"loss": 0.0498,
"step": 4340
},
{
"epoch": 7.842960288808664,
"grad_norm": 0.6235491325166662,
"learning_rate": 4.317689530685921e-06,
"loss": 0.0493,
"step": 4345
},
{
"epoch": 7.851985559566787,
"grad_norm": 0.4918990115478395,
"learning_rate": 4.299638989169675e-06,
"loss": 0.0488,
"step": 4350
},
{
"epoch": 7.8610108303249095,
"grad_norm": 0.5015566746050467,
"learning_rate": 4.28158844765343e-06,
"loss": 0.0492,
"step": 4355
},
{
"epoch": 7.870036101083032,
"grad_norm": 0.5897991332222726,
"learning_rate": 4.263537906137185e-06,
"loss": 0.0499,
"step": 4360
},
{
"epoch": 7.879061371841155,
"grad_norm": 0.5044541687620931,
"learning_rate": 4.245487364620938e-06,
"loss": 0.0496,
"step": 4365
},
{
"epoch": 7.888086642599278,
"grad_norm": 0.5298007896462054,
"learning_rate": 4.227436823104694e-06,
"loss": 0.0499,
"step": 4370
},
{
"epoch": 7.897111913357401,
"grad_norm": 0.6508152344907792,
"learning_rate": 4.209386281588448e-06,
"loss": 0.0499,
"step": 4375
},
{
"epoch": 7.906137184115524,
"grad_norm": 0.4952820870375711,
"learning_rate": 4.191335740072202e-06,
"loss": 0.0486,
"step": 4380
},
{
"epoch": 7.915162454873646,
"grad_norm": 0.534088933198461,
"learning_rate": 4.173285198555957e-06,
"loss": 0.0497,
"step": 4385
},
{
"epoch": 7.924187725631769,
"grad_norm": 0.5456898452361849,
"learning_rate": 4.155234657039712e-06,
"loss": 0.0498,
"step": 4390
},
{
"epoch": 7.933212996389892,
"grad_norm": 0.5206741989861254,
"learning_rate": 4.137184115523466e-06,
"loss": 0.0503,
"step": 4395
},
{
"epoch": 7.9422382671480145,
"grad_norm": 0.5130632228899183,
"learning_rate": 4.119133574007221e-06,
"loss": 0.0503,
"step": 4400
},
{
"epoch": 7.951263537906137,
"grad_norm": 0.5076154969319394,
"learning_rate": 4.101083032490975e-06,
"loss": 0.0497,
"step": 4405
},
{
"epoch": 7.96028880866426,
"grad_norm": 0.4532751901805038,
"learning_rate": 4.0830324909747296e-06,
"loss": 0.0486,
"step": 4410
},
{
"epoch": 7.969314079422382,
"grad_norm": 0.5569018909973789,
"learning_rate": 4.064981949458484e-06,
"loss": 0.0495,
"step": 4415
},
{
"epoch": 7.978339350180505,
"grad_norm": 0.5691379588931518,
"learning_rate": 4.046931407942238e-06,
"loss": 0.0491,
"step": 4420
},
{
"epoch": 7.987364620938628,
"grad_norm": 0.5087632373984906,
"learning_rate": 4.0288808664259935e-06,
"loss": 0.0507,
"step": 4425
},
{
"epoch": 7.996389891696751,
"grad_norm": 0.5438187646707812,
"learning_rate": 4.010830324909747e-06,
"loss": 0.0509,
"step": 4430
},
{
"epoch": 8.005415162454874,
"grad_norm": 0.46810462572343003,
"learning_rate": 3.992779783393502e-06,
"loss": 0.0467,
"step": 4435
},
{
"epoch": 8.014440433212997,
"grad_norm": 0.5110390059676326,
"learning_rate": 3.974729241877257e-06,
"loss": 0.0439,
"step": 4440
},
{
"epoch": 8.02346570397112,
"grad_norm": 0.5350833689826751,
"learning_rate": 3.956678700361011e-06,
"loss": 0.0443,
"step": 4445
},
{
"epoch": 8.032490974729242,
"grad_norm": 0.5246123933567166,
"learning_rate": 3.938628158844765e-06,
"loss": 0.0436,
"step": 4450
},
{
"epoch": 8.041516245487365,
"grad_norm": 0.5240110675258801,
"learning_rate": 3.920577617328521e-06,
"loss": 0.0438,
"step": 4455
},
{
"epoch": 8.050541516245488,
"grad_norm": 0.4508238887199392,
"learning_rate": 3.9025270758122745e-06,
"loss": 0.0434,
"step": 4460
},
{
"epoch": 8.059566787003611,
"grad_norm": 0.4587378398564632,
"learning_rate": 3.884476534296029e-06,
"loss": 0.0433,
"step": 4465
},
{
"epoch": 8.068592057761732,
"grad_norm": 0.4536888628696346,
"learning_rate": 3.866425992779784e-06,
"loss": 0.044,
"step": 4470
},
{
"epoch": 8.077617328519855,
"grad_norm": 0.489763842575816,
"learning_rate": 3.848375451263538e-06,
"loss": 0.0439,
"step": 4475
},
{
"epoch": 8.086642599277978,
"grad_norm": 0.4841182353897231,
"learning_rate": 3.830324909747293e-06,
"loss": 0.0447,
"step": 4480
},
{
"epoch": 8.0956678700361,
"grad_norm": 0.43276031492594386,
"learning_rate": 3.812274368231047e-06,
"loss": 0.0436,
"step": 4485
},
{
"epoch": 8.104693140794224,
"grad_norm": 0.43236829754729605,
"learning_rate": 3.7942238267148016e-06,
"loss": 0.0442,
"step": 4490
},
{
"epoch": 8.113718411552346,
"grad_norm": 0.5146544780130509,
"learning_rate": 3.776173285198556e-06,
"loss": 0.0444,
"step": 4495
},
{
"epoch": 8.12274368231047,
"grad_norm": 0.5155356346240472,
"learning_rate": 3.758122743682311e-06,
"loss": 0.0447,
"step": 4500
},
{
"epoch": 8.131768953068592,
"grad_norm": 0.4360981669075826,
"learning_rate": 3.740072202166065e-06,
"loss": 0.0444,
"step": 4505
},
{
"epoch": 8.140794223826715,
"grad_norm": 0.5047195091481486,
"learning_rate": 3.72202166064982e-06,
"loss": 0.0453,
"step": 4510
},
{
"epoch": 8.149819494584838,
"grad_norm": 0.4206990761101632,
"learning_rate": 3.703971119133574e-06,
"loss": 0.0455,
"step": 4515
},
{
"epoch": 8.15884476534296,
"grad_norm": 0.4082036544428832,
"learning_rate": 3.685920577617329e-06,
"loss": 0.0446,
"step": 4520
},
{
"epoch": 8.167870036101084,
"grad_norm": 0.4584532743829287,
"learning_rate": 3.6678700361010834e-06,
"loss": 0.0437,
"step": 4525
},
{
"epoch": 8.176895306859207,
"grad_norm": 0.5033458410420543,
"learning_rate": 3.649819494584838e-06,
"loss": 0.0443,
"step": 4530
},
{
"epoch": 8.18592057761733,
"grad_norm": 0.4791798460117816,
"learning_rate": 3.6317689530685923e-06,
"loss": 0.0445,
"step": 4535
},
{
"epoch": 8.19494584837545,
"grad_norm": 0.4196203455463231,
"learning_rate": 3.6137184115523466e-06,
"loss": 0.0442,
"step": 4540
},
{
"epoch": 8.203971119133573,
"grad_norm": 0.41063069847051686,
"learning_rate": 3.5956678700361012e-06,
"loss": 0.0445,
"step": 4545
},
{
"epoch": 8.212996389891696,
"grad_norm": 0.45170926636173403,
"learning_rate": 3.5776173285198555e-06,
"loss": 0.0445,
"step": 4550
},
{
"epoch": 8.222021660649819,
"grad_norm": 0.4365171717286736,
"learning_rate": 3.5595667870036106e-06,
"loss": 0.0439,
"step": 4555
},
{
"epoch": 8.231046931407942,
"grad_norm": 0.44704281997750017,
"learning_rate": 3.541516245487365e-06,
"loss": 0.0451,
"step": 4560
},
{
"epoch": 8.240072202166065,
"grad_norm": 0.44891926809808913,
"learning_rate": 3.5234657039711195e-06,
"loss": 0.0444,
"step": 4565
},
{
"epoch": 8.249097472924188,
"grad_norm": 0.4965534347892108,
"learning_rate": 3.5054151624548737e-06,
"loss": 0.045,
"step": 4570
},
{
"epoch": 8.25812274368231,
"grad_norm": 0.4926232662322707,
"learning_rate": 3.487364620938629e-06,
"loss": 0.0458,
"step": 4575
},
{
"epoch": 8.267148014440433,
"grad_norm": 0.4655262777572625,
"learning_rate": 3.469314079422383e-06,
"loss": 0.0447,
"step": 4580
},
{
"epoch": 8.276173285198556,
"grad_norm": 0.42335405356873596,
"learning_rate": 3.4512635379061377e-06,
"loss": 0.0448,
"step": 4585
},
{
"epoch": 8.28519855595668,
"grad_norm": 0.4222745585103908,
"learning_rate": 3.433212996389892e-06,
"loss": 0.0447,
"step": 4590
},
{
"epoch": 8.294223826714802,
"grad_norm": 0.5301226189403145,
"learning_rate": 3.4151624548736466e-06,
"loss": 0.0446,
"step": 4595
},
{
"epoch": 8.303249097472925,
"grad_norm": 0.42819391239433835,
"learning_rate": 3.397111913357401e-06,
"loss": 0.0452,
"step": 4600
},
{
"epoch": 8.312274368231048,
"grad_norm": 0.514730540749935,
"learning_rate": 3.379061371841155e-06,
"loss": 0.0448,
"step": 4605
},
{
"epoch": 8.321299638989169,
"grad_norm": 0.3997116268939232,
"learning_rate": 3.36101083032491e-06,
"loss": 0.046,
"step": 4610
},
{
"epoch": 8.330324909747292,
"grad_norm": 0.5325835981272248,
"learning_rate": 3.3429602888086644e-06,
"loss": 0.0445,
"step": 4615
},
{
"epoch": 8.339350180505415,
"grad_norm": 0.4871783345754918,
"learning_rate": 3.324909747292419e-06,
"loss": 0.0436,
"step": 4620
},
{
"epoch": 8.348375451263538,
"grad_norm": 0.44483140119486997,
"learning_rate": 3.3068592057761733e-06,
"loss": 0.0457,
"step": 4625
},
{
"epoch": 8.35740072202166,
"grad_norm": 0.4988117583696254,
"learning_rate": 3.288808664259928e-06,
"loss": 0.0451,
"step": 4630
},
{
"epoch": 8.366425992779783,
"grad_norm": 0.498516790859318,
"learning_rate": 3.2707581227436822e-06,
"loss": 0.0453,
"step": 4635
},
{
"epoch": 8.375451263537906,
"grad_norm": 0.4871818511915015,
"learning_rate": 3.2527075812274373e-06,
"loss": 0.0449,
"step": 4640
},
{
"epoch": 8.384476534296029,
"grad_norm": 0.4644645285652418,
"learning_rate": 3.2346570397111916e-06,
"loss": 0.045,
"step": 4645
},
{
"epoch": 8.393501805054152,
"grad_norm": 0.48904484199047815,
"learning_rate": 3.2166064981949462e-06,
"loss": 0.0458,
"step": 4650
},
{
"epoch": 8.402527075812275,
"grad_norm": 0.49191409648820944,
"learning_rate": 3.1985559566787005e-06,
"loss": 0.0449,
"step": 4655
},
{
"epoch": 8.411552346570398,
"grad_norm": 0.42381190140416636,
"learning_rate": 3.1805054151624556e-06,
"loss": 0.0451,
"step": 4660
},
{
"epoch": 8.42057761732852,
"grad_norm": 0.5141446233668461,
"learning_rate": 3.16245487364621e-06,
"loss": 0.0448,
"step": 4665
},
{
"epoch": 8.429602888086643,
"grad_norm": 0.43988353370949224,
"learning_rate": 3.1444043321299645e-06,
"loss": 0.0449,
"step": 4670
},
{
"epoch": 8.438628158844764,
"grad_norm": 0.5124332256581936,
"learning_rate": 3.1263537906137187e-06,
"loss": 0.046,
"step": 4675
},
{
"epoch": 8.447653429602887,
"grad_norm": 0.44304572226308614,
"learning_rate": 3.108303249097473e-06,
"loss": 0.0464,
"step": 4680
},
{
"epoch": 8.45667870036101,
"grad_norm": 0.5470624111508224,
"learning_rate": 3.0902527075812276e-06,
"loss": 0.0454,
"step": 4685
},
{
"epoch": 8.465703971119133,
"grad_norm": 0.4030173134885733,
"learning_rate": 3.072202166064982e-06,
"loss": 0.0453,
"step": 4690
},
{
"epoch": 8.474729241877256,
"grad_norm": 0.48989039284185976,
"learning_rate": 3.054151624548737e-06,
"loss": 0.0456,
"step": 4695
},
{
"epoch": 8.483754512635379,
"grad_norm": 0.44373432641009264,
"learning_rate": 3.036101083032491e-06,
"loss": 0.0458,
"step": 4700
},
{
"epoch": 8.492779783393502,
"grad_norm": 0.4167509709185524,
"learning_rate": 3.018050541516246e-06,
"loss": 0.0445,
"step": 4705
},
{
"epoch": 8.501805054151625,
"grad_norm": 0.5383282538033065,
"learning_rate": 3e-06,
"loss": 0.0467,
"step": 4710
},
{
"epoch": 8.510830324909747,
"grad_norm": 0.528737608029945,
"learning_rate": 2.9819494584837547e-06,
"loss": 0.0457,
"step": 4715
},
{
"epoch": 8.51985559566787,
"grad_norm": 0.508286793685187,
"learning_rate": 2.9638989169675094e-06,
"loss": 0.0466,
"step": 4720
},
{
"epoch": 8.528880866425993,
"grad_norm": 0.43030142221157835,
"learning_rate": 2.945848375451264e-06,
"loss": 0.0458,
"step": 4725
},
{
"epoch": 8.537906137184116,
"grad_norm": 0.5345650309751631,
"learning_rate": 2.9277978339350183e-06,
"loss": 0.0451,
"step": 4730
},
{
"epoch": 8.546931407942239,
"grad_norm": 0.4211221353833985,
"learning_rate": 2.909747292418773e-06,
"loss": 0.0453,
"step": 4735
},
{
"epoch": 8.555956678700362,
"grad_norm": 0.45522058199277166,
"learning_rate": 2.8916967509025272e-06,
"loss": 0.0446,
"step": 4740
},
{
"epoch": 8.564981949458485,
"grad_norm": 0.44570872798295774,
"learning_rate": 2.8736462093862815e-06,
"loss": 0.0455,
"step": 4745
},
{
"epoch": 8.574007220216606,
"grad_norm": 0.4552227500869339,
"learning_rate": 2.8555956678700365e-06,
"loss": 0.0449,
"step": 4750
},
{
"epoch": 8.583032490974729,
"grad_norm": 0.5005592666887394,
"learning_rate": 2.8375451263537908e-06,
"loss": 0.0463,
"step": 4755
},
{
"epoch": 8.592057761732852,
"grad_norm": 0.42833298285006977,
"learning_rate": 2.8194945848375454e-06,
"loss": 0.0457,
"step": 4760
},
{
"epoch": 8.601083032490974,
"grad_norm": 0.5140646854966842,
"learning_rate": 2.8014440433212997e-06,
"loss": 0.0471,
"step": 4765
},
{
"epoch": 8.610108303249097,
"grad_norm": 0.5892042283245157,
"learning_rate": 2.7833935018050544e-06,
"loss": 0.0452,
"step": 4770
},
{
"epoch": 8.61913357400722,
"grad_norm": 0.452012962570369,
"learning_rate": 2.7653429602888086e-06,
"loss": 0.0448,
"step": 4775
},
{
"epoch": 8.628158844765343,
"grad_norm": 0.4396464198294571,
"learning_rate": 2.7472924187725637e-06,
"loss": 0.0461,
"step": 4780
},
{
"epoch": 8.637184115523466,
"grad_norm": 0.41842531603143573,
"learning_rate": 2.729241877256318e-06,
"loss": 0.0451,
"step": 4785
},
{
"epoch": 8.646209386281589,
"grad_norm": 0.41898202699131626,
"learning_rate": 2.7111913357400726e-06,
"loss": 0.0457,
"step": 4790
},
{
"epoch": 8.655234657039712,
"grad_norm": 0.5196610269445232,
"learning_rate": 2.693140794223827e-06,
"loss": 0.0452,
"step": 4795
},
{
"epoch": 8.664259927797834,
"grad_norm": 0.46936311363162797,
"learning_rate": 2.675090252707582e-06,
"loss": 0.0455,
"step": 4800
},
{
"epoch": 8.673285198555957,
"grad_norm": 0.4740159197363855,
"learning_rate": 2.657039711191336e-06,
"loss": 0.0447,
"step": 4805
},
{
"epoch": 8.68231046931408,
"grad_norm": 0.43074612764563486,
"learning_rate": 2.6389891696750904e-06,
"loss": 0.0456,
"step": 4810
},
{
"epoch": 8.691335740072201,
"grad_norm": 0.5425044634950232,
"learning_rate": 2.620938628158845e-06,
"loss": 0.0466,
"step": 4815
},
{
"epoch": 8.700361010830324,
"grad_norm": 0.5081339659232526,
"learning_rate": 2.6028880866425993e-06,
"loss": 0.046,
"step": 4820
},
{
"epoch": 8.709386281588447,
"grad_norm": 0.4651038570514386,
"learning_rate": 2.584837545126354e-06,
"loss": 0.0463,
"step": 4825
},
{
"epoch": 8.71841155234657,
"grad_norm": 0.49355797519718886,
"learning_rate": 2.566787003610108e-06,
"loss": 0.0457,
"step": 4830
},
{
"epoch": 8.727436823104693,
"grad_norm": 0.46746434525998604,
"learning_rate": 2.5487364620938633e-06,
"loss": 0.0461,
"step": 4835
},
{
"epoch": 8.736462093862816,
"grad_norm": 0.4210937803108333,
"learning_rate": 2.5306859205776175e-06,
"loss": 0.0449,
"step": 4840
},
{
"epoch": 8.745487364620939,
"grad_norm": 0.45489349719230204,
"learning_rate": 2.512635379061372e-06,
"loss": 0.0451,
"step": 4845
},
{
"epoch": 8.754512635379061,
"grad_norm": 0.49590457857103976,
"learning_rate": 2.4945848375451264e-06,
"loss": 0.0462,
"step": 4850
},
{
"epoch": 8.763537906137184,
"grad_norm": 0.41135841968863646,
"learning_rate": 2.476534296028881e-06,
"loss": 0.0459,
"step": 4855
},
{
"epoch": 8.772563176895307,
"grad_norm": 0.5862358329269642,
"learning_rate": 2.4584837545126353e-06,
"loss": 0.046,
"step": 4860
},
{
"epoch": 8.78158844765343,
"grad_norm": 0.4869407752123959,
"learning_rate": 2.44043321299639e-06,
"loss": 0.046,
"step": 4865
},
{
"epoch": 8.790613718411553,
"grad_norm": 0.4541073046578143,
"learning_rate": 2.4223826714801447e-06,
"loss": 0.0452,
"step": 4870
},
{
"epoch": 8.799638989169676,
"grad_norm": 0.506564617854228,
"learning_rate": 2.4043321299638993e-06,
"loss": 0.0467,
"step": 4875
},
{
"epoch": 8.808664259927799,
"grad_norm": 0.5487992800276584,
"learning_rate": 2.3862815884476536e-06,
"loss": 0.0461,
"step": 4880
},
{
"epoch": 8.81768953068592,
"grad_norm": 0.5911995859863417,
"learning_rate": 2.3682310469314082e-06,
"loss": 0.046,
"step": 4885
},
{
"epoch": 8.826714801444043,
"grad_norm": 0.49900273240686943,
"learning_rate": 2.350180505415163e-06,
"loss": 0.046,
"step": 4890
},
{
"epoch": 8.835740072202166,
"grad_norm": 0.48948946203921534,
"learning_rate": 2.332129963898917e-06,
"loss": 0.0458,
"step": 4895
},
{
"epoch": 8.844765342960288,
"grad_norm": 0.4148924662584737,
"learning_rate": 2.314079422382672e-06,
"loss": 0.0461,
"step": 4900
},
{
"epoch": 8.853790613718411,
"grad_norm": 0.5404388584683619,
"learning_rate": 2.296028880866426e-06,
"loss": 0.046,
"step": 4905
},
{
"epoch": 8.862815884476534,
"grad_norm": 0.4321254910131908,
"learning_rate": 2.2779783393501807e-06,
"loss": 0.0454,
"step": 4910
},
{
"epoch": 8.871841155234657,
"grad_norm": 0.5949655513323762,
"learning_rate": 2.259927797833935e-06,
"loss": 0.046,
"step": 4915
},
{
"epoch": 8.88086642599278,
"grad_norm": 0.4378486480608059,
"learning_rate": 2.2418772563176896e-06,
"loss": 0.0452,
"step": 4920
},
{
"epoch": 8.889891696750903,
"grad_norm": 0.5870449555902121,
"learning_rate": 2.2238267148014443e-06,
"loss": 0.0459,
"step": 4925
},
{
"epoch": 8.898916967509026,
"grad_norm": 0.5683809822588126,
"learning_rate": 2.2057761732851985e-06,
"loss": 0.0462,
"step": 4930
},
{
"epoch": 8.907942238267148,
"grad_norm": 0.46521601312012895,
"learning_rate": 2.187725631768953e-06,
"loss": 0.0467,
"step": 4935
},
{
"epoch": 8.916967509025271,
"grad_norm": 0.47477683194093706,
"learning_rate": 2.169675090252708e-06,
"loss": 0.0462,
"step": 4940
},
{
"epoch": 8.925992779783394,
"grad_norm": 0.4804957099911855,
"learning_rate": 2.1516245487364625e-06,
"loss": 0.0454,
"step": 4945
},
{
"epoch": 8.935018050541515,
"grad_norm": 0.4571767863844299,
"learning_rate": 2.1335740072202168e-06,
"loss": 0.0453,
"step": 4950
},
{
"epoch": 8.944043321299638,
"grad_norm": 0.5601651443669259,
"learning_rate": 2.1155234657039714e-06,
"loss": 0.0463,
"step": 4955
},
{
"epoch": 8.953068592057761,
"grad_norm": 0.5500455060473453,
"learning_rate": 2.097472924187726e-06,
"loss": 0.0463,
"step": 4960
},
{
"epoch": 8.962093862815884,
"grad_norm": 0.5232043738599569,
"learning_rate": 2.0794223826714803e-06,
"loss": 0.0464,
"step": 4965
},
{
"epoch": 8.971119133574007,
"grad_norm": 0.4969509637225851,
"learning_rate": 2.0613718411552346e-06,
"loss": 0.0467,
"step": 4970
},
{
"epoch": 8.98014440433213,
"grad_norm": 0.46415374324212516,
"learning_rate": 2.0433212996389892e-06,
"loss": 0.0464,
"step": 4975
},
{
"epoch": 8.989169675090253,
"grad_norm": 0.495736189400118,
"learning_rate": 2.025270758122744e-06,
"loss": 0.0459,
"step": 4980
},
{
"epoch": 8.998194945848375,
"grad_norm": 0.5020092748082658,
"learning_rate": 2.007220216606498e-06,
"loss": 0.0458,
"step": 4985
},
{
"epoch": 9.007220216606498,
"grad_norm": 0.4534175561356577,
"learning_rate": 1.989169675090253e-06,
"loss": 0.0428,
"step": 4990
},
{
"epoch": 9.016245487364621,
"grad_norm": 0.39468719173310207,
"learning_rate": 1.9711191335740075e-06,
"loss": 0.0425,
"step": 4995
},
{
"epoch": 9.025270758122744,
"grad_norm": 0.5163608939567661,
"learning_rate": 1.9530685920577617e-06,
"loss": 0.0423,
"step": 5000
},
{
"epoch": 9.025270758122744,
"eval_loss": 0.04193449020385742,
"eval_runtime": 759.583,
"eval_samples_per_second": 17.503,
"eval_steps_per_second": 0.729,
"step": 5000
},
{
"epoch": 9.034296028880867,
"grad_norm": 0.4599758437234501,
"learning_rate": 1.9350180505415164e-06,
"loss": 0.0422,
"step": 5005
},
{
"epoch": 9.04332129963899,
"grad_norm": 0.49964626450423844,
"learning_rate": 1.916967509025271e-06,
"loss": 0.0427,
"step": 5010
},
{
"epoch": 9.052346570397113,
"grad_norm": 0.4178907774622966,
"learning_rate": 1.8989169675090255e-06,
"loss": 0.0417,
"step": 5015
},
{
"epoch": 9.061371841155236,
"grad_norm": 0.39048192594633196,
"learning_rate": 1.88086642599278e-06,
"loss": 0.042,
"step": 5020
},
{
"epoch": 9.070397111913357,
"grad_norm": 0.4791548976955057,
"learning_rate": 1.8628158844765346e-06,
"loss": 0.0412,
"step": 5025
},
{
"epoch": 9.07942238267148,
"grad_norm": 0.4638926347317585,
"learning_rate": 1.844765342960289e-06,
"loss": 0.0423,
"step": 5030
},
{
"epoch": 9.088447653429602,
"grad_norm": 0.5289808337845963,
"learning_rate": 1.8267148014440433e-06,
"loss": 0.0421,
"step": 5035
},
{
"epoch": 9.097472924187725,
"grad_norm": 0.48767336384018667,
"learning_rate": 1.808664259927798e-06,
"loss": 0.0421,
"step": 5040
},
{
"epoch": 9.106498194945848,
"grad_norm": 0.4589026757647638,
"learning_rate": 1.7906137184115524e-06,
"loss": 0.0427,
"step": 5045
},
{
"epoch": 9.115523465703971,
"grad_norm": 0.46264566303543775,
"learning_rate": 1.7725631768953069e-06,
"loss": 0.0422,
"step": 5050
},
{
"epoch": 9.124548736462094,
"grad_norm": 0.4410983802446593,
"learning_rate": 1.7545126353790615e-06,
"loss": 0.0412,
"step": 5055
},
{
"epoch": 9.133574007220217,
"grad_norm": 0.4202311860157241,
"learning_rate": 1.736462093862816e-06,
"loss": 0.0421,
"step": 5060
},
{
"epoch": 9.14259927797834,
"grad_norm": 0.41239420931705556,
"learning_rate": 1.7184115523465706e-06,
"loss": 0.0426,
"step": 5065
},
{
"epoch": 9.151624548736462,
"grad_norm": 0.48539351529631136,
"learning_rate": 1.700361010830325e-06,
"loss": 0.0424,
"step": 5070
},
{
"epoch": 9.160649819494585,
"grad_norm": 0.4427031394702614,
"learning_rate": 1.6823104693140795e-06,
"loss": 0.0426,
"step": 5075
},
{
"epoch": 9.169675090252708,
"grad_norm": 0.4752210878563376,
"learning_rate": 1.6642599277978342e-06,
"loss": 0.0421,
"step": 5080
},
{
"epoch": 9.178700361010831,
"grad_norm": 0.5234039982082958,
"learning_rate": 1.6462093862815887e-06,
"loss": 0.0433,
"step": 5085
},
{
"epoch": 9.187725631768952,
"grad_norm": 0.4515971877704036,
"learning_rate": 1.6281588447653431e-06,
"loss": 0.0433,
"step": 5090
},
{
"epoch": 9.196750902527075,
"grad_norm": 0.6669560841097782,
"learning_rate": 1.6101083032490978e-06,
"loss": 0.0426,
"step": 5095
},
{
"epoch": 9.205776173285198,
"grad_norm": 0.4598456495040656,
"learning_rate": 1.592057761732852e-06,
"loss": 0.0423,
"step": 5100
},
{
"epoch": 9.21480144404332,
"grad_norm": 0.4965666374110849,
"learning_rate": 1.5740072202166065e-06,
"loss": 0.0428,
"step": 5105
},
{
"epoch": 9.223826714801444,
"grad_norm": 0.37975291809275413,
"learning_rate": 1.5559566787003611e-06,
"loss": 0.0429,
"step": 5110
},
{
"epoch": 9.232851985559567,
"grad_norm": 0.4716411802408934,
"learning_rate": 1.5379061371841156e-06,
"loss": 0.0428,
"step": 5115
},
{
"epoch": 9.24187725631769,
"grad_norm": 0.4327838862415194,
"learning_rate": 1.51985559566787e-06,
"loss": 0.0433,
"step": 5120
},
{
"epoch": 9.250902527075812,
"grad_norm": 0.5003874549105873,
"learning_rate": 1.5018050541516247e-06,
"loss": 0.0422,
"step": 5125
},
{
"epoch": 9.259927797833935,
"grad_norm": 0.49004225732796214,
"learning_rate": 1.4837545126353792e-06,
"loss": 0.0431,
"step": 5130
},
{
"epoch": 9.268953068592058,
"grad_norm": 0.41580487545033434,
"learning_rate": 1.4657039711191336e-06,
"loss": 0.0423,
"step": 5135
},
{
"epoch": 9.277978339350181,
"grad_norm": 0.6027983626577104,
"learning_rate": 1.4476534296028883e-06,
"loss": 0.0439,
"step": 5140
},
{
"epoch": 9.287003610108304,
"grad_norm": 0.4598712206248585,
"learning_rate": 1.4296028880866427e-06,
"loss": 0.0429,
"step": 5145
},
{
"epoch": 9.296028880866427,
"grad_norm": 0.49342736984840635,
"learning_rate": 1.4115523465703974e-06,
"loss": 0.0426,
"step": 5150
},
{
"epoch": 9.30505415162455,
"grad_norm": 0.49176684451178787,
"learning_rate": 1.3935018050541518e-06,
"loss": 0.0427,
"step": 5155
},
{
"epoch": 9.314079422382672,
"grad_norm": 0.45945014096327996,
"learning_rate": 1.3754512635379063e-06,
"loss": 0.0429,
"step": 5160
},
{
"epoch": 9.323104693140793,
"grad_norm": 0.4114132503638751,
"learning_rate": 1.357400722021661e-06,
"loss": 0.0428,
"step": 5165
},
{
"epoch": 9.332129963898916,
"grad_norm": 0.42473969764225256,
"learning_rate": 1.3393501805054152e-06,
"loss": 0.0431,
"step": 5170
},
{
"epoch": 9.34115523465704,
"grad_norm": 0.48677947222160384,
"learning_rate": 1.3212996389891696e-06,
"loss": 0.0427,
"step": 5175
},
{
"epoch": 9.350180505415162,
"grad_norm": 0.46858607602832114,
"learning_rate": 1.3032490974729243e-06,
"loss": 0.043,
"step": 5180
},
{
"epoch": 9.359205776173285,
"grad_norm": 0.42070686139078084,
"learning_rate": 1.2851985559566788e-06,
"loss": 0.043,
"step": 5185
},
{
"epoch": 9.368231046931408,
"grad_norm": 0.5082683923945276,
"learning_rate": 1.2671480144404332e-06,
"loss": 0.0432,
"step": 5190
},
{
"epoch": 9.37725631768953,
"grad_norm": 0.49146765440804957,
"learning_rate": 1.2490974729241879e-06,
"loss": 0.0433,
"step": 5195
},
{
"epoch": 9.386281588447654,
"grad_norm": 0.4942629872457007,
"learning_rate": 1.2310469314079423e-06,
"loss": 0.0432,
"step": 5200
},
{
"epoch": 9.395306859205776,
"grad_norm": 0.5421985341404022,
"learning_rate": 1.2129963898916968e-06,
"loss": 0.0434,
"step": 5205
},
{
"epoch": 9.4043321299639,
"grad_norm": 0.5216299465972287,
"learning_rate": 1.1949458483754514e-06,
"loss": 0.0427,
"step": 5210
},
{
"epoch": 9.413357400722022,
"grad_norm": 0.41742965195734916,
"learning_rate": 1.176895306859206e-06,
"loss": 0.0435,
"step": 5215
},
{
"epoch": 9.422382671480145,
"grad_norm": 0.50664802662452,
"learning_rate": 1.1588447653429604e-06,
"loss": 0.0427,
"step": 5220
},
{
"epoch": 9.431407942238268,
"grad_norm": 0.44313965879945716,
"learning_rate": 1.1407942238267148e-06,
"loss": 0.0435,
"step": 5225
},
{
"epoch": 9.440433212996389,
"grad_norm": 0.5131645452817757,
"learning_rate": 1.1227436823104695e-06,
"loss": 0.0434,
"step": 5230
},
{
"epoch": 9.449458483754512,
"grad_norm": 0.4474063447070478,
"learning_rate": 1.104693140794224e-06,
"loss": 0.0428,
"step": 5235
},
{
"epoch": 9.458483754512635,
"grad_norm": 0.4729265352475666,
"learning_rate": 1.0866425992779784e-06,
"loss": 0.0434,
"step": 5240
},
{
"epoch": 9.467509025270758,
"grad_norm": 0.45958551411695614,
"learning_rate": 1.068592057761733e-06,
"loss": 0.0432,
"step": 5245
},
{
"epoch": 9.47653429602888,
"grad_norm": 0.41039837943400087,
"learning_rate": 1.0505415162454875e-06,
"loss": 0.0439,
"step": 5250
},
{
"epoch": 9.485559566787003,
"grad_norm": 0.44753217417346286,
"learning_rate": 1.032490974729242e-06,
"loss": 0.042,
"step": 5255
},
{
"epoch": 9.494584837545126,
"grad_norm": 0.4139276562020777,
"learning_rate": 1.0144404332129964e-06,
"loss": 0.0427,
"step": 5260
},
{
"epoch": 9.50361010830325,
"grad_norm": 0.6251275698046171,
"learning_rate": 9.96389891696751e-07,
"loss": 0.0436,
"step": 5265
},
{
"epoch": 9.512635379061372,
"grad_norm": 0.4501260605118548,
"learning_rate": 9.783393501805055e-07,
"loss": 0.0431,
"step": 5270
},
{
"epoch": 9.521660649819495,
"grad_norm": 0.3980490709756831,
"learning_rate": 9.6028880866426e-07,
"loss": 0.0435,
"step": 5275
},
{
"epoch": 9.530685920577618,
"grad_norm": 0.46391281584553307,
"learning_rate": 9.422382671480146e-07,
"loss": 0.0436,
"step": 5280
},
{
"epoch": 9.53971119133574,
"grad_norm": 0.4127369030638665,
"learning_rate": 9.24187725631769e-07,
"loss": 0.043,
"step": 5285
},
{
"epoch": 9.548736462093864,
"grad_norm": 0.40324535228510755,
"learning_rate": 9.061371841155235e-07,
"loss": 0.0424,
"step": 5290
},
{
"epoch": 9.557761732851986,
"grad_norm": 0.4319791954661702,
"learning_rate": 8.880866425992781e-07,
"loss": 0.0435,
"step": 5295
},
{
"epoch": 9.566787003610107,
"grad_norm": 0.46271459456155417,
"learning_rate": 8.700361010830325e-07,
"loss": 0.0437,
"step": 5300
},
{
"epoch": 9.57581227436823,
"grad_norm": 0.427126926635068,
"learning_rate": 8.519855595667871e-07,
"loss": 0.0434,
"step": 5305
},
{
"epoch": 9.584837545126353,
"grad_norm": 0.4691427501429953,
"learning_rate": 8.339350180505417e-07,
"loss": 0.0437,
"step": 5310
},
{
"epoch": 9.593862815884476,
"grad_norm": 0.45679217355753204,
"learning_rate": 8.15884476534296e-07,
"loss": 0.0435,
"step": 5315
},
{
"epoch": 9.602888086642599,
"grad_norm": 0.48200222966829426,
"learning_rate": 7.978339350180506e-07,
"loss": 0.0437,
"step": 5320
},
{
"epoch": 9.611913357400722,
"grad_norm": 0.4901938411136157,
"learning_rate": 7.797833935018051e-07,
"loss": 0.044,
"step": 5325
},
{
"epoch": 9.620938628158845,
"grad_norm": 0.5165751415632741,
"learning_rate": 7.617328519855597e-07,
"loss": 0.0438,
"step": 5330
},
{
"epoch": 9.629963898916968,
"grad_norm": 0.5120078231958552,
"learning_rate": 7.436823104693141e-07,
"loss": 0.0427,
"step": 5335
},
{
"epoch": 9.63898916967509,
"grad_norm": 0.40973988656758753,
"learning_rate": 7.256317689530687e-07,
"loss": 0.0431,
"step": 5340
},
{
"epoch": 9.648014440433213,
"grad_norm": 0.4582389620066886,
"learning_rate": 7.075812274368232e-07,
"loss": 0.0433,
"step": 5345
},
{
"epoch": 9.657039711191336,
"grad_norm": 0.552631347988738,
"learning_rate": 6.895306859205776e-07,
"loss": 0.044,
"step": 5350
},
{
"epoch": 9.666064981949459,
"grad_norm": 0.3783877672799792,
"learning_rate": 6.714801444043322e-07,
"loss": 0.0435,
"step": 5355
},
{
"epoch": 9.675090252707582,
"grad_norm": 0.48138412749053733,
"learning_rate": 6.534296028880867e-07,
"loss": 0.0432,
"step": 5360
},
{
"epoch": 9.684115523465703,
"grad_norm": 0.4338898356243751,
"learning_rate": 6.353790613718413e-07,
"loss": 0.0433,
"step": 5365
},
{
"epoch": 9.693140794223826,
"grad_norm": 0.5225550209175859,
"learning_rate": 6.173285198555957e-07,
"loss": 0.0437,
"step": 5370
},
{
"epoch": 9.702166064981949,
"grad_norm": 0.5167671561018974,
"learning_rate": 5.992779783393502e-07,
"loss": 0.0434,
"step": 5375
},
{
"epoch": 9.711191335740072,
"grad_norm": 0.4115427231842469,
"learning_rate": 5.812274368231047e-07,
"loss": 0.0439,
"step": 5380
},
{
"epoch": 9.720216606498195,
"grad_norm": 0.45107850138578787,
"learning_rate": 5.631768953068593e-07,
"loss": 0.0439,
"step": 5385
},
{
"epoch": 9.729241877256317,
"grad_norm": 0.4342115174653207,
"learning_rate": 5.451263537906137e-07,
"loss": 0.0429,
"step": 5390
},
{
"epoch": 9.73826714801444,
"grad_norm": 0.4398151357748213,
"learning_rate": 5.270758122743683e-07,
"loss": 0.0432,
"step": 5395
},
{
"epoch": 9.747292418772563,
"grad_norm": 0.4242422950084841,
"learning_rate": 5.090252707581228e-07,
"loss": 0.0428,
"step": 5400
},
{
"epoch": 9.756317689530686,
"grad_norm": 0.46470583012028227,
"learning_rate": 4.909747292418773e-07,
"loss": 0.0431,
"step": 5405
},
{
"epoch": 9.765342960288809,
"grad_norm": 0.41760352928012284,
"learning_rate": 4.729241877256318e-07,
"loss": 0.0439,
"step": 5410
},
{
"epoch": 9.774368231046932,
"grad_norm": 0.43617126444221704,
"learning_rate": 4.548736462093863e-07,
"loss": 0.0429,
"step": 5415
},
{
"epoch": 9.783393501805055,
"grad_norm": 0.44372565623267207,
"learning_rate": 4.368231046931409e-07,
"loss": 0.0435,
"step": 5420
},
{
"epoch": 9.792418772563177,
"grad_norm": 0.4428259601085559,
"learning_rate": 4.1877256317689533e-07,
"loss": 0.0436,
"step": 5425
},
{
"epoch": 9.8014440433213,
"grad_norm": 0.5278115547439892,
"learning_rate": 4.0072202166064984e-07,
"loss": 0.0433,
"step": 5430
},
{
"epoch": 9.810469314079423,
"grad_norm": 0.4527700640650528,
"learning_rate": 3.826714801444044e-07,
"loss": 0.043,
"step": 5435
},
{
"epoch": 9.819494584837544,
"grad_norm": 0.401226496888323,
"learning_rate": 3.6462093862815885e-07,
"loss": 0.0428,
"step": 5440
},
{
"epoch": 9.828519855595667,
"grad_norm": 0.49440429063887187,
"learning_rate": 3.465703971119134e-07,
"loss": 0.0428,
"step": 5445
},
{
"epoch": 9.83754512635379,
"grad_norm": 0.5076032586052756,
"learning_rate": 3.285198555956679e-07,
"loss": 0.044,
"step": 5450
},
{
"epoch": 9.846570397111913,
"grad_norm": 0.43280796148087763,
"learning_rate": 3.104693140794224e-07,
"loss": 0.0432,
"step": 5455
},
{
"epoch": 9.855595667870036,
"grad_norm": 0.5015245155705008,
"learning_rate": 2.924187725631769e-07,
"loss": 0.0438,
"step": 5460
},
{
"epoch": 9.864620938628159,
"grad_norm": 0.5286579403439075,
"learning_rate": 2.743682310469314e-07,
"loss": 0.0434,
"step": 5465
},
{
"epoch": 9.873646209386282,
"grad_norm": 0.4224487204668749,
"learning_rate": 2.5631768953068593e-07,
"loss": 0.043,
"step": 5470
},
{
"epoch": 9.882671480144404,
"grad_norm": 0.45900406854255277,
"learning_rate": 2.3826714801444044e-07,
"loss": 0.0437,
"step": 5475
},
{
"epoch": 9.891696750902527,
"grad_norm": 0.45858140540922404,
"learning_rate": 2.2021660649819497e-07,
"loss": 0.0433,
"step": 5480
},
{
"epoch": 9.90072202166065,
"grad_norm": 0.530002579562083,
"learning_rate": 2.0216606498194947e-07,
"loss": 0.0434,
"step": 5485
},
{
"epoch": 9.909747292418773,
"grad_norm": 0.5370134546660643,
"learning_rate": 1.84115523465704e-07,
"loss": 0.0437,
"step": 5490
},
{
"epoch": 9.918772563176896,
"grad_norm": 0.4902526885586167,
"learning_rate": 1.660649819494585e-07,
"loss": 0.0439,
"step": 5495
},
{
"epoch": 9.927797833935019,
"grad_norm": 0.4649182165247798,
"learning_rate": 1.4801444043321301e-07,
"loss": 0.0444,
"step": 5500
},
{
"epoch": 9.93682310469314,
"grad_norm": 0.5575325439198506,
"learning_rate": 1.2996389891696752e-07,
"loss": 0.0424,
"step": 5505
},
{
"epoch": 9.945848375451263,
"grad_norm": 0.45452667158004884,
"learning_rate": 1.1191335740072203e-07,
"loss": 0.0437,
"step": 5510
},
{
"epoch": 9.954873646209386,
"grad_norm": 0.5203515708243794,
"learning_rate": 9.386281588447654e-08,
"loss": 0.0439,
"step": 5515
},
{
"epoch": 9.963898916967509,
"grad_norm": 0.45178261726772573,
"learning_rate": 7.581227436823105e-08,
"loss": 0.0432,
"step": 5520
},
{
"epoch": 9.972924187725631,
"grad_norm": 0.4688905694517184,
"learning_rate": 5.776173285198556e-08,
"loss": 0.0434,
"step": 5525
},
{
"epoch": 9.981949458483754,
"grad_norm": 0.43677959065978206,
"learning_rate": 3.971119133574008e-08,
"loss": 0.0437,
"step": 5530
},
{
"epoch": 9.990974729241877,
"grad_norm": 0.49453568586672814,
"learning_rate": 2.1660649819494588e-08,
"loss": 0.0437,
"step": 5535
},
{
"epoch": 10.0,
"grad_norm": 0.4332227379502704,
"learning_rate": 3.6101083032490975e-09,
"loss": 0.0433,
"step": 5540
},
{
"epoch": 10.0,
"step": 5540,
"total_flos": 613623860887552.0,
"train_loss": 0.19332283668838685,
"train_runtime": 35523.5631,
"train_samples_per_second": 3.743,
"train_steps_per_second": 0.156
}
],
"logging_steps": 5,
"max_steps": 5540,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 613623860887552.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}