{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 14.86035710599649,
"eval_steps": 500,
"global_step": 18000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008256786046031582,
"grad_norm": 1.7616229057312012,
"learning_rate": 1.6515276630883566e-06,
"loss": 10.4208,
"step": 10
},
{
"epoch": 0.016513572092063163,
"grad_norm": 1.7366944551467896,
"learning_rate": 3.3030553261767132e-06,
"loss": 10.4105,
"step": 20
},
{
"epoch": 0.024770358138094747,
"grad_norm": 1.7757558822631836,
"learning_rate": 4.95458298926507e-06,
"loss": 10.387,
"step": 30
},
{
"epoch": 0.03302714418412633,
"grad_norm": 1.747761845588684,
"learning_rate": 6.6061106523534265e-06,
"loss": 10.3514,
"step": 40
},
{
"epoch": 0.04128393023015791,
"grad_norm": 1.6940944194793701,
"learning_rate": 8.257638315441784e-06,
"loss": 10.3033,
"step": 50
},
{
"epoch": 0.04954071627618949,
"grad_norm": 1.608394980430603,
"learning_rate": 9.90916597853014e-06,
"loss": 10.2448,
"step": 60
},
{
"epoch": 0.05779750232222108,
"grad_norm": 1.5383769273757935,
"learning_rate": 1.1560693641618496e-05,
"loss": 10.1765,
"step": 70
},
{
"epoch": 0.06605428836825265,
"grad_norm": 1.4542272090911865,
"learning_rate": 1.3212221304706853e-05,
"loss": 10.1003,
"step": 80
},
{
"epoch": 0.07431107441428424,
"grad_norm": 1.4005941152572632,
"learning_rate": 1.486374896779521e-05,
"loss": 10.0172,
"step": 90
},
{
"epoch": 0.08256786046031582,
"grad_norm": 1.372097134590149,
"learning_rate": 1.6515276630883568e-05,
"loss": 9.9285,
"step": 100
},
{
"epoch": 0.0908246465063474,
"grad_norm": 1.3433704376220703,
"learning_rate": 1.8166804293971927e-05,
"loss": 9.8394,
"step": 110
},
{
"epoch": 0.09908143255237899,
"grad_norm": 1.3353278636932373,
"learning_rate": 1.981833195706028e-05,
"loss": 9.7529,
"step": 120
},
{
"epoch": 0.10733821859841057,
"grad_norm": 1.3279035091400146,
"learning_rate": 2.1469859620148637e-05,
"loss": 9.6614,
"step": 130
},
{
"epoch": 0.11559500464444215,
"grad_norm": 1.3572771549224854,
"learning_rate": 2.3121387283236992e-05,
"loss": 9.5723,
"step": 140
},
{
"epoch": 0.12385179069047374,
"grad_norm": 1.3397237062454224,
"learning_rate": 2.477291494632535e-05,
"loss": 9.4781,
"step": 150
},
{
"epoch": 0.1321085767365053,
"grad_norm": 1.327772617340088,
"learning_rate": 2.6424442609413706e-05,
"loss": 9.3866,
"step": 160
},
{
"epoch": 0.1403653627825369,
"grad_norm": 1.3253681659698486,
"learning_rate": 2.8075970272502064e-05,
"loss": 9.2926,
"step": 170
},
{
"epoch": 0.14862214882856847,
"grad_norm": 1.3415825366973877,
"learning_rate": 2.972749793559042e-05,
"loss": 9.1932,
"step": 180
},
{
"epoch": 0.15687893487460006,
"grad_norm": 1.3262524604797363,
"learning_rate": 3.137902559867878e-05,
"loss": 9.0972,
"step": 190
},
{
"epoch": 0.16513572092063164,
"grad_norm": 1.3293501138687134,
"learning_rate": 3.3030553261767136e-05,
"loss": 8.996,
"step": 200
},
{
"epoch": 0.17339250696666322,
"grad_norm": 1.3101015090942383,
"learning_rate": 3.468208092485549e-05,
"loss": 8.8972,
"step": 210
},
{
"epoch": 0.1816492930126948,
"grad_norm": 1.3407046794891357,
"learning_rate": 3.6333608587943854e-05,
"loss": 8.795,
"step": 220
},
{
"epoch": 0.1899060790587264,
"grad_norm": 1.310673475265503,
"learning_rate": 3.798513625103221e-05,
"loss": 8.6952,
"step": 230
},
{
"epoch": 0.19816286510475797,
"grad_norm": 1.3073076009750366,
"learning_rate": 3.963666391412056e-05,
"loss": 8.5946,
"step": 240
},
{
"epoch": 0.20641965115078956,
"grad_norm": 1.2669146060943604,
"learning_rate": 4.128819157720892e-05,
"loss": 8.4994,
"step": 250
},
{
"epoch": 0.21467643719682114,
"grad_norm": 1.257196307182312,
"learning_rate": 4.2939719240297274e-05,
"loss": 8.3993,
"step": 260
},
{
"epoch": 0.22293322324285272,
"grad_norm": 1.258946418762207,
"learning_rate": 4.459124690338563e-05,
"loss": 8.2995,
"step": 270
},
{
"epoch": 0.2311900092888843,
"grad_norm": 1.2212146520614624,
"learning_rate": 4.6242774566473984e-05,
"loss": 8.2009,
"step": 280
},
{
"epoch": 0.2394467953349159,
"grad_norm": 1.1783366203308105,
"learning_rate": 4.7894302229562346e-05,
"loss": 8.1136,
"step": 290
},
{
"epoch": 0.24770358138094747,
"grad_norm": 1.1963677406311035,
"learning_rate": 4.95458298926507e-05,
"loss": 8.0248,
"step": 300
},
{
"epoch": 0.25596036742697903,
"grad_norm": 1.112446665763855,
"learning_rate": 5.1197357555739056e-05,
"loss": 7.9439,
"step": 310
},
{
"epoch": 0.2642171534730106,
"grad_norm": 1.0816154479980469,
"learning_rate": 5.284888521882741e-05,
"loss": 7.8579,
"step": 320
},
{
"epoch": 0.2724739395190422,
"grad_norm": 1.0233012437820435,
"learning_rate": 5.4500412881915774e-05,
"loss": 7.7776,
"step": 330
},
{
"epoch": 0.2807307255650738,
"grad_norm": 0.9429032206535339,
"learning_rate": 5.615194054500413e-05,
"loss": 7.7092,
"step": 340
},
{
"epoch": 0.28898751161110536,
"grad_norm": 0.8611400127410889,
"learning_rate": 5.7803468208092484e-05,
"loss": 7.6461,
"step": 350
},
{
"epoch": 0.29724429765713695,
"grad_norm": 0.8037594556808472,
"learning_rate": 5.945499587118084e-05,
"loss": 7.5908,
"step": 360
},
{
"epoch": 0.30550108370316853,
"grad_norm": 0.7045464515686035,
"learning_rate": 6.110652353426921e-05,
"loss": 7.5412,
"step": 370
},
{
"epoch": 0.3137578697492001,
"grad_norm": 0.6216972470283508,
"learning_rate": 6.275805119735756e-05,
"loss": 7.4982,
"step": 380
},
{
"epoch": 0.3220146557952317,
"grad_norm": 0.5564426183700562,
"learning_rate": 6.440957886044592e-05,
"loss": 7.4692,
"step": 390
},
{
"epoch": 0.3302714418412633,
"grad_norm": 0.5008960962295532,
"learning_rate": 6.606110652353427e-05,
"loss": 7.4429,
"step": 400
},
{
"epoch": 0.33852822788729486,
"grad_norm": 0.38887500762939453,
"learning_rate": 6.771263418662263e-05,
"loss": 7.4219,
"step": 410
},
{
"epoch": 0.34678501393332645,
"grad_norm": 0.34142008423805237,
"learning_rate": 6.936416184971098e-05,
"loss": 7.3995,
"step": 420
},
{
"epoch": 0.35504179997935803,
"grad_norm": 0.2729811668395996,
"learning_rate": 7.101568951279934e-05,
"loss": 7.3893,
"step": 430
},
{
"epoch": 0.3632985860253896,
"grad_norm": 0.2536115348339081,
"learning_rate": 7.266721717588771e-05,
"loss": 7.3651,
"step": 440
},
{
"epoch": 0.3715553720714212,
"grad_norm": 0.2442627251148224,
"learning_rate": 7.431874483897605e-05,
"loss": 7.3566,
"step": 450
},
{
"epoch": 0.3798121581174528,
"grad_norm": 0.2364257276058197,
"learning_rate": 7.597027250206442e-05,
"loss": 7.3426,
"step": 460
},
{
"epoch": 0.38806894416348436,
"grad_norm": 0.17855331301689148,
"learning_rate": 7.762180016515277e-05,
"loss": 7.3399,
"step": 470
},
{
"epoch": 0.39632573020951595,
"grad_norm": 0.19668430089950562,
"learning_rate": 7.927332782824111e-05,
"loss": 7.3281,
"step": 480
},
{
"epoch": 0.40458251625554753,
"grad_norm": 0.19964857399463654,
"learning_rate": 8.092485549132948e-05,
"loss": 7.3147,
"step": 490
},
{
"epoch": 0.4128393023015791,
"grad_norm": 0.1922236531972885,
"learning_rate": 8.257638315441784e-05,
"loss": 7.307,
"step": 500
},
{
"epoch": 0.4210960883476107,
"grad_norm": 0.18698124587535858,
"learning_rate": 8.422791081750619e-05,
"loss": 7.2961,
"step": 510
},
{
"epoch": 0.4293528743936423,
"grad_norm": 0.22864067554473877,
"learning_rate": 8.587943848059455e-05,
"loss": 7.2856,
"step": 520
},
{
"epoch": 0.43760966043967386,
"grad_norm": 0.21611692011356354,
"learning_rate": 8.75309661436829e-05,
"loss": 7.2777,
"step": 530
},
{
"epoch": 0.44586644648570545,
"grad_norm": 0.24379616975784302,
"learning_rate": 8.918249380677126e-05,
"loss": 7.2656,
"step": 540
},
{
"epoch": 0.45412323253173703,
"grad_norm": 0.2078498750925064,
"learning_rate": 9.083402146985963e-05,
"loss": 7.2623,
"step": 550
},
{
"epoch": 0.4623800185777686,
"grad_norm": 0.21156761050224304,
"learning_rate": 9.248554913294797e-05,
"loss": 7.2523,
"step": 560
},
{
"epoch": 0.4706368046238002,
"grad_norm": 0.2728644609451294,
"learning_rate": 9.413707679603634e-05,
"loss": 7.2481,
"step": 570
},
{
"epoch": 0.4788935906698318,
"grad_norm": 0.2564805746078491,
"learning_rate": 9.578860445912469e-05,
"loss": 7.2328,
"step": 580
},
{
"epoch": 0.48715037671586336,
"grad_norm": 0.239385187625885,
"learning_rate": 9.744013212221305e-05,
"loss": 7.228,
"step": 590
},
{
"epoch": 0.49540716276189495,
"grad_norm": 0.29688534140586853,
"learning_rate": 9.90916597853014e-05,
"loss": 7.2167,
"step": 600
},
{
"epoch": 0.5036639488079265,
"grad_norm": 0.3255228102207184,
"learning_rate": 0.00010074318744838976,
"loss": 7.2115,
"step": 610
},
{
"epoch": 0.5119207348539581,
"grad_norm": 0.26421359181404114,
"learning_rate": 0.00010239471511147811,
"loss": 7.2046,
"step": 620
},
{
"epoch": 0.5201775208999897,
"grad_norm": 0.22493909299373627,
"learning_rate": 0.00010404624277456648,
"loss": 7.1852,
"step": 630
},
{
"epoch": 0.5284343069460212,
"grad_norm": 0.23427563905715942,
"learning_rate": 0.00010569777043765482,
"loss": 7.188,
"step": 640
},
{
"epoch": 0.5366910929920529,
"grad_norm": 0.2601110637187958,
"learning_rate": 0.00010734929810074319,
"loss": 7.1702,
"step": 650
},
{
"epoch": 0.5449478790380844,
"grad_norm": 0.2562381625175476,
"learning_rate": 0.00010900082576383155,
"loss": 7.1727,
"step": 660
},
{
"epoch": 0.553204665084116,
"grad_norm": 0.3765369653701782,
"learning_rate": 0.0001106523534269199,
"loss": 7.1649,
"step": 670
},
{
"epoch": 0.5614614511301476,
"grad_norm": 0.33350440859794617,
"learning_rate": 0.00011230388109000826,
"loss": 7.1527,
"step": 680
},
{
"epoch": 0.5697182371761792,
"grad_norm": 0.27716100215911865,
"learning_rate": 0.00011395540875309663,
"loss": 7.139,
"step": 690
},
{
"epoch": 0.5779750232222107,
"grad_norm": 0.30098262429237366,
"learning_rate": 0.00011560693641618497,
"loss": 7.1353,
"step": 700
},
{
"epoch": 0.5862318092682424,
"grad_norm": 0.32388460636138916,
"learning_rate": 0.00011725846407927334,
"loss": 7.1255,
"step": 710
},
{
"epoch": 0.5944885953142739,
"grad_norm": 0.37705451250076294,
"learning_rate": 0.00011890999174236168,
"loss": 7.1116,
"step": 720
},
{
"epoch": 0.6027453813603055,
"grad_norm": 0.23738576471805573,
"learning_rate": 0.00012056151940545005,
"loss": 7.0997,
"step": 730
},
{
"epoch": 0.6110021674063371,
"grad_norm": 0.6089703440666199,
"learning_rate": 0.00012221304706853842,
"loss": 7.1003,
"step": 740
},
{
"epoch": 0.6192589534523687,
"grad_norm": 0.5266053080558777,
"learning_rate": 0.00012386457473162674,
"loss": 7.085,
"step": 750
},
{
"epoch": 0.6275157394984002,
"grad_norm": 0.2401445358991623,
"learning_rate": 0.00012551610239471513,
"loss": 7.0832,
"step": 760
},
{
"epoch": 0.6357725255444319,
"grad_norm": 0.30341988801956177,
"learning_rate": 0.00012716763005780345,
"loss": 7.0664,
"step": 770
},
{
"epoch": 0.6440293115904634,
"grad_norm": 0.2831466495990753,
"learning_rate": 0.00012881915772089184,
"loss": 7.0738,
"step": 780
},
{
"epoch": 0.652286097636495,
"grad_norm": 0.43197304010391235,
"learning_rate": 0.0001304706853839802,
"loss": 7.0623,
"step": 790
},
{
"epoch": 0.6605428836825266,
"grad_norm": 0.266729474067688,
"learning_rate": 0.00013212221304706855,
"loss": 7.0505,
"step": 800
},
{
"epoch": 0.6687996697285582,
"grad_norm": 0.26868125796318054,
"learning_rate": 0.0001337737407101569,
"loss": 7.036,
"step": 810
},
{
"epoch": 0.6770564557745897,
"grad_norm": 0.33523380756378174,
"learning_rate": 0.00013542526837324526,
"loss": 7.0277,
"step": 820
},
{
"epoch": 0.6853132418206214,
"grad_norm": 0.44178086519241333,
"learning_rate": 0.0001370767960363336,
"loss": 7.0232,
"step": 830
},
{
"epoch": 0.6935700278666529,
"grad_norm": 0.23396629095077515,
"learning_rate": 0.00013872832369942197,
"loss": 7.017,
"step": 840
},
{
"epoch": 0.7018268139126845,
"grad_norm": 0.2513016164302826,
"learning_rate": 0.00014037985136251032,
"loss": 7.0152,
"step": 850
},
{
"epoch": 0.7100835999587161,
"grad_norm": 0.22369173169136047,
"learning_rate": 0.00014203137902559868,
"loss": 7.0076,
"step": 860
},
{
"epoch": 0.7183403860047477,
"grad_norm": 0.31220072507858276,
"learning_rate": 0.00014368290668868703,
"loss": 6.9968,
"step": 870
},
{
"epoch": 0.7265971720507792,
"grad_norm": 0.26952481269836426,
"learning_rate": 0.00014533443435177541,
"loss": 6.9924,
"step": 880
},
{
"epoch": 0.7348539580968109,
"grad_norm": 0.33689916133880615,
"learning_rate": 0.00014698596201486374,
"loss": 6.977,
"step": 890
},
{
"epoch": 0.7431107441428424,
"grad_norm": 0.27384230494499207,
"learning_rate": 0.0001486374896779521,
"loss": 6.9776,
"step": 900
},
{
"epoch": 0.751367530188874,
"grad_norm": 0.3221207559108734,
"learning_rate": 0.00015028901734104048,
"loss": 6.9716,
"step": 910
},
{
"epoch": 0.7596243162349056,
"grad_norm": 0.3615335524082184,
"learning_rate": 0.00015194054500412883,
"loss": 6.9636,
"step": 920
},
{
"epoch": 0.7678811022809372,
"grad_norm": 0.23735912144184113,
"learning_rate": 0.00015359207266721716,
"loss": 6.9528,
"step": 930
},
{
"epoch": 0.7761378883269687,
"grad_norm": 0.3608275353908539,
"learning_rate": 0.00015524360033030554,
"loss": 6.9488,
"step": 940
},
{
"epoch": 0.7843946743730003,
"grad_norm": 0.32908084988594055,
"learning_rate": 0.0001568951279933939,
"loss": 6.9367,
"step": 950
},
{
"epoch": 0.7926514604190319,
"grad_norm": 0.27347663044929504,
"learning_rate": 0.00015854665565648223,
"loss": 6.9458,
"step": 960
},
{
"epoch": 0.8009082464650634,
"grad_norm": 0.2819118797779083,
"learning_rate": 0.0001601981833195706,
"loss": 6.9314,
"step": 970
},
{
"epoch": 0.8091650325110951,
"grad_norm": 0.32597044110298157,
"learning_rate": 0.00016184971098265897,
"loss": 6.9304,
"step": 980
},
{
"epoch": 0.8174218185571266,
"grad_norm": 0.2454291731119156,
"learning_rate": 0.00016350123864574732,
"loss": 6.9015,
"step": 990
},
{
"epoch": 0.8256786046031582,
"grad_norm": 0.3510182499885559,
"learning_rate": 0.00016515276630883568,
"loss": 6.902,
"step": 1000
},
{
"epoch": 0.8339353906491898,
"grad_norm": 0.3421700894832611,
"learning_rate": 0.00016680429397192403,
"loss": 6.9061,
"step": 1010
},
{
"epoch": 0.8421921766952214,
"grad_norm": 0.2990358769893646,
"learning_rate": 0.00016845582163501239,
"loss": 6.896,
"step": 1020
},
{
"epoch": 0.8504489627412529,
"grad_norm": 0.2679702937602997,
"learning_rate": 0.00017010734929810074,
"loss": 6.8927,
"step": 1030
},
{
"epoch": 0.8587057487872846,
"grad_norm": 0.2782333195209503,
"learning_rate": 0.0001717588769611891,
"loss": 6.8892,
"step": 1040
},
{
"epoch": 0.8669625348333161,
"grad_norm": 0.25567376613616943,
"learning_rate": 0.00017341040462427745,
"loss": 6.8829,
"step": 1050
},
{
"epoch": 0.8752193208793477,
"grad_norm": 0.25939440727233887,
"learning_rate": 0.0001750619322873658,
"loss": 6.8573,
"step": 1060
},
{
"epoch": 0.8834761069253793,
"grad_norm": 0.5158926844596863,
"learning_rate": 0.0001767134599504542,
"loss": 6.8639,
"step": 1070
},
{
"epoch": 0.8917328929714109,
"grad_norm": 0.3435886800289154,
"learning_rate": 0.00017836498761354252,
"loss": 6.8615,
"step": 1080
},
{
"epoch": 0.8999896790174424,
"grad_norm": 0.34237366914749146,
"learning_rate": 0.00018001651527663087,
"loss": 6.856,
"step": 1090
},
{
"epoch": 0.9082464650634741,
"grad_norm": 0.5136009454727173,
"learning_rate": 0.00018166804293971925,
"loss": 6.8604,
"step": 1100
},
{
"epoch": 0.9165032511095056,
"grad_norm": 0.42529767751693726,
"learning_rate": 0.0001833195706028076,
"loss": 6.8544,
"step": 1110
},
{
"epoch": 0.9247600371555372,
"grad_norm": 0.29155978560447693,
"learning_rate": 0.00018497109826589594,
"loss": 6.8404,
"step": 1120
},
{
"epoch": 0.9330168232015688,
"grad_norm": 0.5477098226547241,
"learning_rate": 0.00018662262592898432,
"loss": 6.8273,
"step": 1130
},
{
"epoch": 0.9412736092476004,
"grad_norm": 0.6319007873535156,
"learning_rate": 0.00018827415359207267,
"loss": 6.8354,
"step": 1140
},
{
"epoch": 0.9495303952936319,
"grad_norm": 0.5107876062393188,
"learning_rate": 0.00018992568125516103,
"loss": 6.8176,
"step": 1150
},
{
"epoch": 0.9577871813396636,
"grad_norm": 0.3837167024612427,
"learning_rate": 0.00019157720891824938,
"loss": 6.8291,
"step": 1160
},
{
"epoch": 0.9660439673856951,
"grad_norm": 0.4074363708496094,
"learning_rate": 0.00019322873658133774,
"loss": 6.808,
"step": 1170
},
{
"epoch": 0.9743007534317267,
"grad_norm": 0.3952867388725281,
"learning_rate": 0.0001948802642444261,
"loss": 6.8046,
"step": 1180
},
{
"epoch": 0.9825575394777583,
"grad_norm": 0.3207480311393738,
"learning_rate": 0.00019653179190751448,
"loss": 6.8099,
"step": 1190
},
{
"epoch": 0.9908143255237899,
"grad_norm": 0.46632879972457886,
"learning_rate": 0.0001981833195706028,
"loss": 6.7924,
"step": 1200
},
{
"epoch": 0.9990711115698214,
"grad_norm": 0.4463304877281189,
"learning_rate": 0.00019983484723369116,
"loss": 6.8048,
"step": 1210
},
{
"epoch": 1.0066054288368254,
"grad_norm": 0.481222927570343,
"learning_rate": 0.00020148637489677952,
"loss": 6.2066,
"step": 1220
},
{
"epoch": 1.0148622148828568,
"grad_norm": 0.35238635540008545,
"learning_rate": 0.0002031379025598679,
"loss": 6.7881,
"step": 1230
},
{
"epoch": 1.0231190009288884,
"grad_norm": 0.3762964904308319,
"learning_rate": 0.00020478943022295623,
"loss": 6.7819,
"step": 1240
},
{
"epoch": 1.03137578697492,
"grad_norm": 0.40854519605636597,
"learning_rate": 0.00020644095788604458,
"loss": 6.7868,
"step": 1250
},
{
"epoch": 1.0396325730209517,
"grad_norm": 0.7023956179618835,
"learning_rate": 0.00020809248554913296,
"loss": 6.7674,
"step": 1260
},
{
"epoch": 1.047889359066983,
"grad_norm": 0.43909308314323425,
"learning_rate": 0.00020974401321222132,
"loss": 6.7735,
"step": 1270
},
{
"epoch": 1.0561461451130147,
"grad_norm": 0.5223090648651123,
"learning_rate": 0.00021139554087530965,
"loss": 6.7506,
"step": 1280
},
{
"epoch": 1.0644029311590464,
"grad_norm": 0.41518449783325195,
"learning_rate": 0.00021304706853839803,
"loss": 6.7645,
"step": 1290
},
{
"epoch": 1.072659717205078,
"grad_norm": 0.346605509519577,
"learning_rate": 0.00021469859620148638,
"loss": 6.7545,
"step": 1300
},
{
"epoch": 1.0809165032511094,
"grad_norm": 0.50466388463974,
"learning_rate": 0.00021635012386457474,
"loss": 6.7577,
"step": 1310
},
{
"epoch": 1.089173289297141,
"grad_norm": 0.2985013723373413,
"learning_rate": 0.0002180016515276631,
"loss": 6.7553,
"step": 1320
},
{
"epoch": 1.0974300753431727,
"grad_norm": 0.3818993866443634,
"learning_rate": 0.00021965317919075145,
"loss": 6.7531,
"step": 1330
},
{
"epoch": 1.1056868613892044,
"grad_norm": 0.39540722966194153,
"learning_rate": 0.0002213047068538398,
"loss": 6.7366,
"step": 1340
},
{
"epoch": 1.1139436474352358,
"grad_norm": 0.4633065462112427,
"learning_rate": 0.0002229562345169282,
"loss": 6.7335,
"step": 1350
},
{
"epoch": 1.1222004334812674,
"grad_norm": 0.395951509475708,
"learning_rate": 0.00022460776218001651,
"loss": 6.7419,
"step": 1360
},
{
"epoch": 1.130457219527299,
"grad_norm": 0.3049313724040985,
"learning_rate": 0.00022625928984310487,
"loss": 6.7304,
"step": 1370
},
{
"epoch": 1.1387140055733305,
"grad_norm": 0.3902861475944519,
"learning_rate": 0.00022791081750619325,
"loss": 6.73,
"step": 1380
},
{
"epoch": 1.146970791619362,
"grad_norm": 0.336494117975235,
"learning_rate": 0.0002295623451692816,
"loss": 6.7183,
"step": 1390
},
{
"epoch": 1.1552275776653937,
"grad_norm": 0.4714094400405884,
"learning_rate": 0.00023121387283236994,
"loss": 6.7258,
"step": 1400
},
{
"epoch": 1.1634843637114254,
"grad_norm": 0.5000776052474976,
"learning_rate": 0.0002328654004954583,
"loss": 6.7238,
"step": 1410
},
{
"epoch": 1.171741149757457,
"grad_norm": 0.39674362540245056,
"learning_rate": 0.00023451692815854667,
"loss": 6.7216,
"step": 1420
},
{
"epoch": 1.1799979358034884,
"grad_norm": 0.4412456452846527,
"learning_rate": 0.00023616845582163503,
"loss": 6.7075,
"step": 1430
},
{
"epoch": 1.18825472184952,
"grad_norm": 0.5480419397354126,
"learning_rate": 0.00023781998348472336,
"loss": 6.7109,
"step": 1440
},
{
"epoch": 1.1965115078955517,
"grad_norm": 0.5100497007369995,
"learning_rate": 0.00023947151114781174,
"loss": 6.7063,
"step": 1450
},
{
"epoch": 1.2047682939415831,
"grad_norm": 0.48953869938850403,
"learning_rate": 0.0002411230388109001,
"loss": 6.7087,
"step": 1460
},
{
"epoch": 1.2130250799876148,
"grad_norm": 0.6405659317970276,
"learning_rate": 0.00024277456647398842,
"loss": 6.7031,
"step": 1470
},
{
"epoch": 1.2212818660336464,
"grad_norm": 0.6138748526573181,
"learning_rate": 0.00024442609413707683,
"loss": 6.6993,
"step": 1480
},
{
"epoch": 1.229538652079678,
"grad_norm": 0.5157073736190796,
"learning_rate": 0.00024607762180016516,
"loss": 6.6926,
"step": 1490
},
{
"epoch": 1.2377954381257097,
"grad_norm": 0.46693915128707886,
"learning_rate": 0.0002477291494632535,
"loss": 6.6984,
"step": 1500
},
{
"epoch": 1.246052224171741,
"grad_norm": 0.3783016502857208,
"learning_rate": 0.00024938067712634187,
"loss": 6.7014,
"step": 1510
},
{
"epoch": 1.2543090102177727,
"grad_norm": 0.657699704170227,
"learning_rate": 0.00025103220478943025,
"loss": 6.7012,
"step": 1520
},
{
"epoch": 1.2625657962638044,
"grad_norm": 0.28327375650405884,
"learning_rate": 0.0002526837324525186,
"loss": 6.689,
"step": 1530
},
{
"epoch": 1.2708225823098358,
"grad_norm": 0.419355571269989,
"learning_rate": 0.0002543352601156069,
"loss": 6.6892,
"step": 1540
},
{
"epoch": 1.2790793683558674,
"grad_norm": 0.6164056658744812,
"learning_rate": 0.0002559867877786953,
"loss": 6.6925,
"step": 1550
},
{
"epoch": 1.287336154401899,
"grad_norm": 0.5302212238311768,
"learning_rate": 0.00025763831544178367,
"loss": 6.685,
"step": 1560
},
{
"epoch": 1.2955929404479307,
"grad_norm": 0.4732874929904938,
"learning_rate": 0.000259289843104872,
"loss": 6.6831,
"step": 1570
},
{
"epoch": 1.3038497264939624,
"grad_norm": 0.6624964475631714,
"learning_rate": 0.0002609413707679604,
"loss": 6.6712,
"step": 1580
},
{
"epoch": 1.3121065125399938,
"grad_norm": 0.31143829226493835,
"learning_rate": 0.0002625928984310487,
"loss": 6.6742,
"step": 1590
},
{
"epoch": 1.3203632985860254,
"grad_norm": 0.6145860552787781,
"learning_rate": 0.0002642444260941371,
"loss": 6.6835,
"step": 1600
},
{
"epoch": 1.328620084632057,
"grad_norm": 0.3368646204471588,
"learning_rate": 0.0002658959537572254,
"loss": 6.6695,
"step": 1610
},
{
"epoch": 1.3368768706780885,
"grad_norm": 0.412112295627594,
"learning_rate": 0.0002675474814203138,
"loss": 6.6827,
"step": 1620
},
{
"epoch": 1.34513365672412,
"grad_norm": 0.3918752372264862,
"learning_rate": 0.0002691990090834022,
"loss": 6.6718,
"step": 1630
},
{
"epoch": 1.3533904427701517,
"grad_norm": 0.3757690489292145,
"learning_rate": 0.0002708505367464905,
"loss": 6.6744,
"step": 1640
},
{
"epoch": 1.3616472288161834,
"grad_norm": 0.5821499228477478,
"learning_rate": 0.00027250206440957884,
"loss": 6.6673,
"step": 1650
},
{
"epoch": 1.3699040148622148,
"grad_norm": 0.5082701444625854,
"learning_rate": 0.0002741535920726672,
"loss": 6.6474,
"step": 1660
},
{
"epoch": 1.3781608009082464,
"grad_norm": 0.3153243362903595,
"learning_rate": 0.00027580511973575555,
"loss": 6.6526,
"step": 1670
},
{
"epoch": 1.386417586954278,
"grad_norm": 0.5374317169189453,
"learning_rate": 0.00027745664739884393,
"loss": 6.6552,
"step": 1680
},
{
"epoch": 1.3946743730003095,
"grad_norm": 0.3755870759487152,
"learning_rate": 0.0002791081750619323,
"loss": 6.6457,
"step": 1690
},
{
"epoch": 1.4029311590463411,
"grad_norm": 0.39434754848480225,
"learning_rate": 0.00028075970272502064,
"loss": 6.6485,
"step": 1700
},
{
"epoch": 1.4111879450923728,
"grad_norm": 0.6054360270500183,
"learning_rate": 0.000282411230388109,
"loss": 6.6515,
"step": 1710
},
{
"epoch": 1.4194447311384044,
"grad_norm": 0.4798774719238281,
"learning_rate": 0.00028406275805119735,
"loss": 6.6451,
"step": 1720
},
{
"epoch": 1.427701517184436,
"grad_norm": 0.4491690695285797,
"learning_rate": 0.0002857142857142857,
"loss": 6.6524,
"step": 1730
},
{
"epoch": 1.4359583032304675,
"grad_norm": 0.5428591370582581,
"learning_rate": 0.00028736581337737406,
"loss": 6.6488,
"step": 1740
},
{
"epoch": 1.444215089276499,
"grad_norm": 0.5588364601135254,
"learning_rate": 0.00028901734104046245,
"loss": 6.6417,
"step": 1750
},
{
"epoch": 1.4524718753225307,
"grad_norm": 0.2916945517063141,
"learning_rate": 0.00029066886870355083,
"loss": 6.6449,
"step": 1760
},
{
"epoch": 1.4607286613685622,
"grad_norm": 0.40670228004455566,
"learning_rate": 0.00029232039636663916,
"loss": 6.6301,
"step": 1770
},
{
"epoch": 1.4689854474145938,
"grad_norm": 0.3062325119972229,
"learning_rate": 0.0002939719240297275,
"loss": 6.6322,
"step": 1780
},
{
"epoch": 1.4772422334606254,
"grad_norm": 0.46464964747428894,
"learning_rate": 0.00029562345169281587,
"loss": 6.6401,
"step": 1790
},
{
"epoch": 1.485499019506657,
"grad_norm": 0.45769989490509033,
"learning_rate": 0.0002972749793559042,
"loss": 6.6398,
"step": 1800
},
{
"epoch": 1.4937558055526887,
"grad_norm": 0.45375707745552063,
"learning_rate": 0.0002989265070189926,
"loss": 6.6404,
"step": 1810
},
{
"epoch": 1.5020125915987204,
"grad_norm": 0.47998979687690735,
"learning_rate": 0.00030057803468208096,
"loss": 6.6322,
"step": 1820
},
{
"epoch": 1.5102693776447518,
"grad_norm": 0.5479523539543152,
"learning_rate": 0.0003022295623451693,
"loss": 6.623,
"step": 1830
},
{
"epoch": 1.5185261636907834,
"grad_norm": 0.6078441143035889,
"learning_rate": 0.00030388109000825767,
"loss": 6.6307,
"step": 1840
},
{
"epoch": 1.5267829497368148,
"grad_norm": 0.580402672290802,
"learning_rate": 0.000305532617671346,
"loss": 6.6283,
"step": 1850
},
{
"epoch": 1.5350397357828465,
"grad_norm": 0.4065966010093689,
"learning_rate": 0.0003071841453344343,
"loss": 6.6217,
"step": 1860
},
{
"epoch": 1.543296521828878,
"grad_norm": 0.46066299080848694,
"learning_rate": 0.0003088356729975227,
"loss": 6.6222,
"step": 1870
},
{
"epoch": 1.5515533078749097,
"grad_norm": 0.5753124356269836,
"learning_rate": 0.0003104872006606111,
"loss": 6.6148,
"step": 1880
},
{
"epoch": 1.5598100939209414,
"grad_norm": 0.4921572506427765,
"learning_rate": 0.0003121387283236994,
"loss": 6.6167,
"step": 1890
},
{
"epoch": 1.5680668799669728,
"grad_norm": 0.5580713748931885,
"learning_rate": 0.0003137902559867878,
"loss": 6.6081,
"step": 1900
},
{
"epoch": 1.5763236660130044,
"grad_norm": 0.5664856433868408,
"learning_rate": 0.00031544178364987613,
"loss": 6.6037,
"step": 1910
},
{
"epoch": 1.5845804520590359,
"grad_norm": 0.466069757938385,
"learning_rate": 0.00031709331131296446,
"loss": 6.6023,
"step": 1920
},
{
"epoch": 1.5928372381050675,
"grad_norm": 0.594886302947998,
"learning_rate": 0.00031874483897605284,
"loss": 6.606,
"step": 1930
},
{
"epoch": 1.6010940241510991,
"grad_norm": 0.6219709515571594,
"learning_rate": 0.0003203963666391412,
"loss": 6.6031,
"step": 1940
},
{
"epoch": 1.6093508101971308,
"grad_norm": 0.419695109128952,
"learning_rate": 0.0003220478943022296,
"loss": 6.6258,
"step": 1950
},
{
"epoch": 1.6176075962431624,
"grad_norm": 0.6769319772720337,
"learning_rate": 0.00032369942196531793,
"loss": 6.6192,
"step": 1960
},
{
"epoch": 1.625864382289194,
"grad_norm": 0.4297437369823456,
"learning_rate": 0.00032535094962840626,
"loss": 6.5908,
"step": 1970
},
{
"epoch": 1.6341211683352255,
"grad_norm": 0.5890651941299438,
"learning_rate": 0.00032700247729149464,
"loss": 6.598,
"step": 1980
},
{
"epoch": 1.642377954381257,
"grad_norm": 0.42893460392951965,
"learning_rate": 0.00032865400495458297,
"loss": 6.5953,
"step": 1990
},
{
"epoch": 1.6506347404272885,
"grad_norm": 0.6383348107337952,
"learning_rate": 0.00033030553261767135,
"loss": 6.5885,
"step": 2000
},
{
"epoch": 1.6588915264733202,
"grad_norm": 0.4384669363498688,
"learning_rate": 0.00033195706028075973,
"loss": 6.5994,
"step": 2010
},
{
"epoch": 1.6671483125193518,
"grad_norm": 0.6523682475090027,
"learning_rate": 0.00033360858794384806,
"loss": 6.5988,
"step": 2020
},
{
"epoch": 1.6754050985653834,
"grad_norm": 0.48499011993408203,
"learning_rate": 0.00033526011560693644,
"loss": 6.5926,
"step": 2030
},
{
"epoch": 1.683661884611415,
"grad_norm": 0.6244016885757446,
"learning_rate": 0.00033691164327002477,
"loss": 6.5968,
"step": 2040
},
{
"epoch": 1.6919186706574467,
"grad_norm": 0.4214613139629364,
"learning_rate": 0.0003385631709331131,
"loss": 6.5831,
"step": 2050
},
{
"epoch": 1.7001754567034781,
"grad_norm": 0.6971030235290527,
"learning_rate": 0.0003402146985962015,
"loss": 6.5887,
"step": 2060
},
{
"epoch": 1.7084322427495098,
"grad_norm": 0.6625652313232422,
"learning_rate": 0.00034186622625928986,
"loss": 6.594,
"step": 2070
},
{
"epoch": 1.7166890287955412,
"grad_norm": 0.33810535073280334,
"learning_rate": 0.0003435177539223782,
"loss": 6.5831,
"step": 2080
},
{
"epoch": 1.7249458148415728,
"grad_norm": 0.3338748812675476,
"learning_rate": 0.0003451692815854666,
"loss": 6.5915,
"step": 2090
},
{
"epoch": 1.7332026008876045,
"grad_norm": 0.31808751821517944,
"learning_rate": 0.0003468208092485549,
"loss": 6.5816,
"step": 2100
},
{
"epoch": 1.741459386933636,
"grad_norm": 0.403300404548645,
"learning_rate": 0.0003484723369116433,
"loss": 6.5827,
"step": 2110
},
{
"epoch": 1.7497161729796677,
"grad_norm": 0.48524588346481323,
"learning_rate": 0.0003501238645747316,
"loss": 6.5864,
"step": 2120
},
{
"epoch": 1.7579729590256994,
"grad_norm": 0.3188144266605377,
"learning_rate": 0.00035177539223782,
"loss": 6.5745,
"step": 2130
},
{
"epoch": 1.7662297450717308,
"grad_norm": 0.499127596616745,
"learning_rate": 0.0003534269199009084,
"loss": 6.5814,
"step": 2140
},
{
"epoch": 1.7744865311177624,
"grad_norm": 0.3072030544281006,
"learning_rate": 0.0003550784475639967,
"loss": 6.5764,
"step": 2150
},
{
"epoch": 1.7827433171637939,
"grad_norm": 0.45411720871925354,
"learning_rate": 0.00035672997522708503,
"loss": 6.5791,
"step": 2160
},
{
"epoch": 1.7910001032098255,
"grad_norm": 0.5847482085227966,
"learning_rate": 0.0003583815028901734,
"loss": 6.5736,
"step": 2170
},
{
"epoch": 1.7992568892558571,
"grad_norm": 0.7848684787750244,
"learning_rate": 0.00036003303055326174,
"loss": 6.5646,
"step": 2180
},
{
"epoch": 1.8075136753018888,
"grad_norm": 0.49951326847076416,
"learning_rate": 0.0003616845582163502,
"loss": 6.5686,
"step": 2190
},
{
"epoch": 1.8157704613479204,
"grad_norm": 0.3567127287387848,
"learning_rate": 0.0003633360858794385,
"loss": 6.5613,
"step": 2200
},
{
"epoch": 1.824027247393952,
"grad_norm": 0.4178829491138458,
"learning_rate": 0.00036498761354252684,
"loss": 6.5557,
"step": 2210
},
{
"epoch": 1.8322840334399835,
"grad_norm": 0.5776088237762451,
"learning_rate": 0.0003666391412056152,
"loss": 6.5701,
"step": 2220
},
{
"epoch": 1.8405408194860151,
"grad_norm": 0.44506144523620605,
"learning_rate": 0.00036829066886870355,
"loss": 6.5653,
"step": 2230
},
{
"epoch": 1.8487976055320465,
"grad_norm": 0.4636722803115845,
"learning_rate": 0.0003699421965317919,
"loss": 6.5795,
"step": 2240
},
{
"epoch": 1.8570543915780782,
"grad_norm": 0.7135621309280396,
"learning_rate": 0.00037159372419488026,
"loss": 6.5533,
"step": 2250
},
{
"epoch": 1.8653111776241098,
"grad_norm": 0.6034865379333496,
"learning_rate": 0.00037324525185796864,
"loss": 6.5739,
"step": 2260
},
{
"epoch": 1.8735679636701414,
"grad_norm": 0.40661871433258057,
"learning_rate": 0.000374896779521057,
"loss": 6.5669,
"step": 2270
},
{
"epoch": 1.881824749716173,
"grad_norm": 0.43377530574798584,
"learning_rate": 0.00037654830718414535,
"loss": 6.5516,
"step": 2280
},
{
"epoch": 1.8900815357622047,
"grad_norm": 0.4028869867324829,
"learning_rate": 0.0003781998348472337,
"loss": 6.5637,
"step": 2290
},
{
"epoch": 1.8983383218082361,
"grad_norm": 0.45178523659706116,
"learning_rate": 0.00037985136251032206,
"loss": 6.5599,
"step": 2300
},
{
"epoch": 1.9065951078542678,
"grad_norm": 0.4831728935241699,
"learning_rate": 0.0003815028901734104,
"loss": 6.5574,
"step": 2310
},
{
"epoch": 1.9148518939002992,
"grad_norm": 0.7209576368331909,
"learning_rate": 0.00038315441783649877,
"loss": 6.5532,
"step": 2320
},
{
"epoch": 1.9231086799463308,
"grad_norm": 0.8340161442756653,
"learning_rate": 0.00038480594549958715,
"loss": 6.5647,
"step": 2330
},
{
"epoch": 1.9313654659923625,
"grad_norm": 0.42559853196144104,
"learning_rate": 0.0003864574731626755,
"loss": 6.5493,
"step": 2340
},
{
"epoch": 1.9396222520383941,
"grad_norm": 0.44831937551498413,
"learning_rate": 0.00038810900082576386,
"loss": 6.5543,
"step": 2350
},
{
"epoch": 1.9478790380844258,
"grad_norm": 0.4785579442977905,
"learning_rate": 0.0003897605284888522,
"loss": 6.5594,
"step": 2360
},
{
"epoch": 1.9561358241304572,
"grad_norm": 0.5370727181434631,
"learning_rate": 0.0003914120561519405,
"loss": 6.5567,
"step": 2370
},
{
"epoch": 1.9643926101764888,
"grad_norm": 0.7570891976356506,
"learning_rate": 0.00039306358381502895,
"loss": 6.5502,
"step": 2380
},
{
"epoch": 1.9726493962225202,
"grad_norm": 0.3707781136035919,
"learning_rate": 0.0003947151114781173,
"loss": 6.5606,
"step": 2390
},
{
"epoch": 1.9809061822685519,
"grad_norm": 0.733705461025238,
"learning_rate": 0.0003963666391412056,
"loss": 6.539,
"step": 2400
},
{
"epoch": 1.9891629683145835,
"grad_norm": 0.6355495452880859,
"learning_rate": 0.000398018166804294,
"loss": 6.5604,
"step": 2410
},
{
"epoch": 1.9974197543606151,
"grad_norm": 0.7003040313720703,
"learning_rate": 0.0003996696944673823,
"loss": 6.5358,
"step": 2420
},
{
"epoch": 2.004954071627619,
"grad_norm": 0.7953137159347534,
"learning_rate": 0.00040132122213047065,
"loss": 5.9742,
"step": 2430
},
{
"epoch": 2.0132108576736507,
"grad_norm": 0.642260730266571,
"learning_rate": 0.00040297274979355903,
"loss": 6.5496,
"step": 2440
},
{
"epoch": 2.021467643719682,
"grad_norm": 0.534957230091095,
"learning_rate": 0.0004046242774566474,
"loss": 6.5353,
"step": 2450
},
{
"epoch": 2.0297244297657135,
"grad_norm": 0.7051903605461121,
"learning_rate": 0.0004062758051197358,
"loss": 6.5398,
"step": 2460
},
{
"epoch": 2.037981215811745,
"grad_norm": 0.6130774617195129,
"learning_rate": 0.0004079273327828241,
"loss": 6.5238,
"step": 2470
},
{
"epoch": 2.046238001857777,
"grad_norm": 0.5148051977157593,
"learning_rate": 0.00040957886044591245,
"loss": 6.5378,
"step": 2480
},
{
"epoch": 2.0544947879038085,
"grad_norm": 0.47121939063072205,
"learning_rate": 0.00041123038810900083,
"loss": 6.5309,
"step": 2490
},
{
"epoch": 2.06275157394984,
"grad_norm": 0.49403122067451477,
"learning_rate": 0.00041288191577208916,
"loss": 6.5399,
"step": 2500
},
{
"epoch": 2.0710083599958717,
"grad_norm": 0.6388276815414429,
"learning_rate": 0.00041453344343517754,
"loss": 6.5237,
"step": 2510
},
{
"epoch": 2.0792651460419034,
"grad_norm": 0.714484691619873,
"learning_rate": 0.0004161849710982659,
"loss": 6.5343,
"step": 2520
},
{
"epoch": 2.0875219320879346,
"grad_norm": 0.7904228568077087,
"learning_rate": 0.00041783649876135425,
"loss": 6.534,
"step": 2530
},
{
"epoch": 2.095778718133966,
"grad_norm": 0.8133807182312012,
"learning_rate": 0.00041948802642444264,
"loss": 6.5307,
"step": 2540
},
{
"epoch": 2.104035504179998,
"grad_norm": 0.6666577458381653,
"learning_rate": 0.00042113955408753096,
"loss": 6.5222,
"step": 2550
},
{
"epoch": 2.1122922902260295,
"grad_norm": 0.5386178493499756,
"learning_rate": 0.0004227910817506193,
"loss": 6.5311,
"step": 2560
},
{
"epoch": 2.120549076272061,
"grad_norm": 0.7694815397262573,
"learning_rate": 0.00042444260941370773,
"loss": 6.5248,
"step": 2570
},
{
"epoch": 2.1288058623180928,
"grad_norm": 0.5060898065567017,
"learning_rate": 0.00042609413707679606,
"loss": 6.5294,
"step": 2580
},
{
"epoch": 2.1370626483641244,
"grad_norm": 0.6927939057350159,
"learning_rate": 0.0004277456647398844,
"loss": 6.5161,
"step": 2590
},
{
"epoch": 2.145319434410156,
"grad_norm": 0.5611531734466553,
"learning_rate": 0.00042939719240297277,
"loss": 6.5093,
"step": 2600
},
{
"epoch": 2.1535762204561872,
"grad_norm": 0.5405219197273254,
"learning_rate": 0.0004310487200660611,
"loss": 6.5176,
"step": 2610
},
{
"epoch": 2.161833006502219,
"grad_norm": 0.6646769046783447,
"learning_rate": 0.0004327002477291495,
"loss": 6.5141,
"step": 2620
},
{
"epoch": 2.1700897925482505,
"grad_norm": 0.4237206280231476,
"learning_rate": 0.0004343517753922378,
"loss": 6.5156,
"step": 2630
},
{
"epoch": 2.178346578594282,
"grad_norm": 0.5155819654464722,
"learning_rate": 0.0004360033030553262,
"loss": 6.5128,
"step": 2640
},
{
"epoch": 2.186603364640314,
"grad_norm": 0.5071853399276733,
"learning_rate": 0.00043765483071841457,
"loss": 6.5145,
"step": 2650
},
{
"epoch": 2.1948601506863454,
"grad_norm": 0.4715791642665863,
"learning_rate": 0.0004393063583815029,
"loss": 6.4999,
"step": 2660
},
{
"epoch": 2.203116936732377,
"grad_norm": 0.7622984647750854,
"learning_rate": 0.0004409578860445912,
"loss": 6.5098,
"step": 2670
},
{
"epoch": 2.2113737227784087,
"grad_norm": 0.7513878345489502,
"learning_rate": 0.0004426094137076796,
"loss": 6.5038,
"step": 2680
},
{
"epoch": 2.21963050882444,
"grad_norm": 0.5908055305480957,
"learning_rate": 0.00044426094137076794,
"loss": 6.4992,
"step": 2690
},
{
"epoch": 2.2278872948704715,
"grad_norm": 0.6824406981468201,
"learning_rate": 0.0004459124690338564,
"loss": 6.4816,
"step": 2700
},
{
"epoch": 2.236144080916503,
"grad_norm": 0.6593980193138123,
"learning_rate": 0.0004475639966969447,
"loss": 6.4793,
"step": 2710
},
{
"epoch": 2.244400866962535,
"grad_norm": 0.6030164361000061,
"learning_rate": 0.00044921552436003303,
"loss": 6.4714,
"step": 2720
},
{
"epoch": 2.2526576530085665,
"grad_norm": 0.5835041999816895,
"learning_rate": 0.0004508670520231214,
"loss": 6.4707,
"step": 2730
},
{
"epoch": 2.260914439054598,
"grad_norm": 0.7592746615409851,
"learning_rate": 0.00045251857968620974,
"loss": 6.46,
"step": 2740
},
{
"epoch": 2.2691712251006297,
"grad_norm": 0.596171498298645,
"learning_rate": 0.00045417010734929807,
"loss": 6.4632,
"step": 2750
},
{
"epoch": 2.277428011146661,
"grad_norm": 0.6832927465438843,
"learning_rate": 0.0004558216350123865,
"loss": 6.4563,
"step": 2760
},
{
"epoch": 2.2856847971926926,
"grad_norm": 0.6881881952285767,
"learning_rate": 0.00045747316267547483,
"loss": 6.4462,
"step": 2770
},
{
"epoch": 2.293941583238724,
"grad_norm": 0.6391133069992065,
"learning_rate": 0.0004591246903385632,
"loss": 6.4408,
"step": 2780
},
{
"epoch": 2.302198369284756,
"grad_norm": 0.55253666639328,
"learning_rate": 0.00046077621800165154,
"loss": 6.4379,
"step": 2790
},
{
"epoch": 2.3104551553307875,
"grad_norm": 0.7399817109107971,
"learning_rate": 0.00046242774566473987,
"loss": 6.427,
"step": 2800
},
{
"epoch": 2.318711941376819,
"grad_norm": 0.8651242256164551,
"learning_rate": 0.00046407927332782825,
"loss": 6.4348,
"step": 2810
},
{
"epoch": 2.3269687274228508,
"grad_norm": 0.7616157531738281,
"learning_rate": 0.0004657308009909166,
"loss": 6.4292,
"step": 2820
},
{
"epoch": 2.3352255134688824,
"grad_norm": 0.8812574148178101,
"learning_rate": 0.00046738232865400496,
"loss": 6.4302,
"step": 2830
},
{
"epoch": 2.343482299514914,
"grad_norm": 0.5720260143280029,
"learning_rate": 0.00046903385631709334,
"loss": 6.4159,
"step": 2840
},
{
"epoch": 2.3517390855609452,
"grad_norm": 0.6884876489639282,
"learning_rate": 0.0004706853839801817,
"loss": 6.4149,
"step": 2850
},
{
"epoch": 2.359995871606977,
"grad_norm": 0.8449527621269226,
"learning_rate": 0.00047233691164327006,
"loss": 6.4116,
"step": 2860
},
{
"epoch": 2.3682526576530085,
"grad_norm": 0.652077853679657,
"learning_rate": 0.0004739884393063584,
"loss": 6.4141,
"step": 2870
},
{
"epoch": 2.37650944369904,
"grad_norm": 0.7905910015106201,
"learning_rate": 0.0004756399669694467,
"loss": 6.3859,
"step": 2880
},
{
"epoch": 2.384766229745072,
"grad_norm": 0.6471400260925293,
"learning_rate": 0.00047729149463253515,
"loss": 6.3831,
"step": 2890
},
{
"epoch": 2.3930230157911034,
"grad_norm": 0.7152949571609497,
"learning_rate": 0.0004789430222956235,
"loss": 6.3814,
"step": 2900
},
{
"epoch": 2.401279801837135,
"grad_norm": 0.7103463411331177,
"learning_rate": 0.0004805945499587118,
"loss": 6.3843,
"step": 2910
},
{
"epoch": 2.4095365878831663,
"grad_norm": 0.8913406133651733,
"learning_rate": 0.0004822460776218002,
"loss": 6.3788,
"step": 2920
},
{
"epoch": 2.417793373929198,
"grad_norm": 0.7296728491783142,
"learning_rate": 0.0004838976052848885,
"loss": 6.3887,
"step": 2930
},
{
"epoch": 2.4260501599752295,
"grad_norm": 0.809280514717102,
"learning_rate": 0.00048554913294797684,
"loss": 6.3556,
"step": 2940
},
{
"epoch": 2.434306946021261,
"grad_norm": 0.7426701188087463,
"learning_rate": 0.0004872006606110653,
"loss": 6.3556,
"step": 2950
},
{
"epoch": 2.442563732067293,
"grad_norm": 0.6244359612464905,
"learning_rate": 0.0004888521882741537,
"loss": 6.3484,
"step": 2960
},
{
"epoch": 2.4508205181133245,
"grad_norm": 0.7139498591423035,
"learning_rate": 0.000490503715937242,
"loss": 6.344,
"step": 2970
},
{
"epoch": 2.459077304159356,
"grad_norm": 0.811773419380188,
"learning_rate": 0.0004921552436003303,
"loss": 6.3231,
"step": 2980
},
{
"epoch": 2.4673340902053877,
"grad_norm": 0.6412104368209839,
"learning_rate": 0.0004938067712634186,
"loss": 6.3251,
"step": 2990
},
{
"epoch": 2.4755908762514194,
"grad_norm": 1.169380784034729,
"learning_rate": 0.000495458298926507,
"loss": 6.3118,
"step": 3000
},
{
"epoch": 2.4838476622974506,
"grad_norm": 1.2334198951721191,
"learning_rate": 0.0004971098265895954,
"loss": 6.2942,
"step": 3010
},
{
"epoch": 2.492104448343482,
"grad_norm": 0.967470645904541,
"learning_rate": 0.0004987613542526837,
"loss": 6.2815,
"step": 3020
},
{
"epoch": 2.500361234389514,
"grad_norm": 0.8269219398498535,
"learning_rate": 0.0005004128819157721,
"loss": 6.2641,
"step": 3030
},
{
"epoch": 2.5086180204355455,
"grad_norm": 1.445989727973938,
"learning_rate": 0.0005020644095788605,
"loss": 6.2498,
"step": 3040
},
{
"epoch": 2.516874806481577,
"grad_norm": 1.0903043746948242,
"learning_rate": 0.0005037159372419488,
"loss": 6.233,
"step": 3050
},
{
"epoch": 2.5251315925276088,
"grad_norm": 1.0649830102920532,
"learning_rate": 0.0005053674649050372,
"loss": 6.2011,
"step": 3060
},
{
"epoch": 2.5333883785736404,
"grad_norm": 1.4002567529678345,
"learning_rate": 0.0005070189925681256,
"loss": 6.1885,
"step": 3070
},
{
"epoch": 2.5416451646196716,
"grad_norm": 1.6574758291244507,
"learning_rate": 0.0005086705202312138,
"loss": 6.1617,
"step": 3080
},
{
"epoch": 2.5499019506657032,
"grad_norm": 1.6543638706207275,
"learning_rate": 0.0005103220478943023,
"loss": 6.1348,
"step": 3090
},
{
"epoch": 2.558158736711735,
"grad_norm": 1.585507869720459,
"learning_rate": 0.0005119735755573906,
"loss": 6.1079,
"step": 3100
},
{
"epoch": 2.5664155227577665,
"grad_norm": 1.7207673788070679,
"learning_rate": 0.0005136251032204789,
"loss": 6.0665,
"step": 3110
},
{
"epoch": 2.574672308803798,
"grad_norm": 0.8735297918319702,
"learning_rate": 0.0005152766308835673,
"loss": 6.0345,
"step": 3120
},
{
"epoch": 2.58292909484983,
"grad_norm": 1.352499008178711,
"learning_rate": 0.0005169281585466557,
"loss": 6.0208,
"step": 3130
},
{
"epoch": 2.5911858808958614,
"grad_norm": 1.4594937562942505,
"learning_rate": 0.000518579686209744,
"loss": 5.9824,
"step": 3140
},
{
"epoch": 2.5994426669418926,
"grad_norm": 1.2508600950241089,
"learning_rate": 0.0005202312138728323,
"loss": 5.9507,
"step": 3150
},
{
"epoch": 2.6076994529879247,
"grad_norm": 1.80980384349823,
"learning_rate": 0.0005218827415359208,
"loss": 5.9357,
"step": 3160
},
{
"epoch": 2.615956239033956,
"grad_norm": 1.3844646215438843,
"learning_rate": 0.0005235342691990091,
"loss": 5.8944,
"step": 3170
},
{
"epoch": 2.6242130250799875,
"grad_norm": 1.927004098892212,
"learning_rate": 0.0005251857968620974,
"loss": 5.8612,
"step": 3180
},
{
"epoch": 2.632469811126019,
"grad_norm": 1.4003227949142456,
"learning_rate": 0.0005268373245251859,
"loss": 5.8304,
"step": 3190
},
{
"epoch": 2.640726597172051,
"grad_norm": 1.8084081411361694,
"learning_rate": 0.0005284888521882742,
"loss": 5.8157,
"step": 3200
},
{
"epoch": 2.6489833832180825,
"grad_norm": 1.6965852975845337,
"learning_rate": 0.0005301403798513625,
"loss": 5.7719,
"step": 3210
},
{
"epoch": 2.657240169264114,
"grad_norm": 1.2121639251708984,
"learning_rate": 0.0005317919075144508,
"loss": 5.7462,
"step": 3220
},
{
"epoch": 2.6654969553101457,
"grad_norm": 1.6979306936264038,
"learning_rate": 0.0005334434351775393,
"loss": 5.7426,
"step": 3230
},
{
"epoch": 2.673753741356177,
"grad_norm": 1.974894642829895,
"learning_rate": 0.0005350949628406276,
"loss": 5.704,
"step": 3240
},
{
"epoch": 2.6820105274022086,
"grad_norm": 1.8848650455474854,
"learning_rate": 0.0005367464905037159,
"loss": 5.6652,
"step": 3250
},
{
"epoch": 2.69026731344824,
"grad_norm": 1.4253407716751099,
"learning_rate": 0.0005383980181668044,
"loss": 5.652,
"step": 3260
},
{
"epoch": 2.698524099494272,
"grad_norm": 1.3455779552459717,
"learning_rate": 0.0005400495458298926,
"loss": 5.635,
"step": 3270
},
{
"epoch": 2.7067808855403035,
"grad_norm": 1.4485799074172974,
"learning_rate": 0.000541701073492981,
"loss": 5.6146,
"step": 3280
},
{
"epoch": 2.715037671586335,
"grad_norm": 2.0756478309631348,
"learning_rate": 0.0005433526011560694,
"loss": 5.6048,
"step": 3290
},
{
"epoch": 2.7232944576323668,
"grad_norm": 1.5905245542526245,
"learning_rate": 0.0005450041288191577,
"loss": 5.5905,
"step": 3300
},
{
"epoch": 2.731551243678398,
"grad_norm": 1.2772367000579834,
"learning_rate": 0.0005466556564822461,
"loss": 5.5564,
"step": 3310
},
{
"epoch": 2.7398080297244296,
"grad_norm": 1.7051196098327637,
"learning_rate": 0.0005483071841453344,
"loss": 5.5448,
"step": 3320
},
{
"epoch": 2.7480648157704612,
"grad_norm": 1.9072637557983398,
"learning_rate": 0.0005499587118084229,
"loss": 5.5277,
"step": 3330
},
{
"epoch": 2.756321601816493,
"grad_norm": 1.741525411605835,
"learning_rate": 0.0005516102394715111,
"loss": 5.5004,
"step": 3340
},
{
"epoch": 2.7645783878625245,
"grad_norm": 1.8459067344665527,
"learning_rate": 0.0005532617671345995,
"loss": 5.4817,
"step": 3350
},
{
"epoch": 2.772835173908556,
"grad_norm": 1.4545385837554932,
"learning_rate": 0.0005549132947976879,
"loss": 5.452,
"step": 3360
},
{
"epoch": 2.781091959954588,
"grad_norm": 1.9043197631835938,
"learning_rate": 0.0005565648224607762,
"loss": 5.4375,
"step": 3370
},
{
"epoch": 2.789348746000619,
"grad_norm": 1.4310576915740967,
"learning_rate": 0.0005582163501238646,
"loss": 5.4358,
"step": 3380
},
{
"epoch": 2.797605532046651,
"grad_norm": 2.0747084617614746,
"learning_rate": 0.000559867877786953,
"loss": 5.4096,
"step": 3390
},
{
"epoch": 2.8058623180926823,
"grad_norm": 1.9496272802352905,
"learning_rate": 0.0005615194054500413,
"loss": 5.4057,
"step": 3400
},
{
"epoch": 2.814119104138714,
"grad_norm": 1.805301547050476,
"learning_rate": 0.0005631709331131296,
"loss": 5.3882,
"step": 3410
},
{
"epoch": 2.8223758901847456,
"grad_norm": 1.5036094188690186,
"learning_rate": 0.000564822460776218,
"loss": 5.3605,
"step": 3420
},
{
"epoch": 2.830632676230777,
"grad_norm": 1.7904752492904663,
"learning_rate": 0.0005664739884393064,
"loss": 5.3395,
"step": 3430
},
{
"epoch": 2.838889462276809,
"grad_norm": 1.3491814136505127,
"learning_rate": 0.0005681255161023947,
"loss": 5.3045,
"step": 3440
},
{
"epoch": 2.8471462483228405,
"grad_norm": 1.4046028852462769,
"learning_rate": 0.0005697770437654831,
"loss": 5.2824,
"step": 3450
},
{
"epoch": 2.855403034368872,
"grad_norm": 1.8236334323883057,
"learning_rate": 0.0005714285714285714,
"loss": 5.2604,
"step": 3460
},
{
"epoch": 2.8636598204149033,
"grad_norm": 1.4702653884887695,
"learning_rate": 0.0005730800990916598,
"loss": 5.2299,
"step": 3470
},
{
"epoch": 2.871916606460935,
"grad_norm": 1.4314968585968018,
"learning_rate": 0.0005747316267547481,
"loss": 5.2004,
"step": 3480
},
{
"epoch": 2.8801733925069666,
"grad_norm": 2.0397439002990723,
"learning_rate": 0.0005763831544178365,
"loss": 5.1926,
"step": 3490
},
{
"epoch": 2.888430178552998,
"grad_norm": 1.335970163345337,
"learning_rate": 0.0005780346820809249,
"loss": 5.1421,
"step": 3500
},
{
"epoch": 2.89668696459903,
"grad_norm": 1.551160216331482,
"learning_rate": 0.0005796862097440132,
"loss": 5.1393,
"step": 3510
},
{
"epoch": 2.9049437506450615,
"grad_norm": 1.6374657154083252,
"learning_rate": 0.0005813377374071017,
"loss": 5.1064,
"step": 3520
},
{
"epoch": 2.913200536691093,
"grad_norm": 1.8045247793197632,
"learning_rate": 0.0005829892650701899,
"loss": 5.0742,
"step": 3530
},
{
"epoch": 2.9214573227371243,
"grad_norm": 1.6787662506103516,
"learning_rate": 0.0005846407927332783,
"loss": 5.0685,
"step": 3540
},
{
"epoch": 2.9297141087831564,
"grad_norm": 2.1116514205932617,
"learning_rate": 0.0005862923203963666,
"loss": 5.0505,
"step": 3550
},
{
"epoch": 2.9379708948291876,
"grad_norm": 1.637165904045105,
"learning_rate": 0.000587943848059455,
"loss": 5.0354,
"step": 3560
},
{
"epoch": 2.9462276808752192,
"grad_norm": 1.6193593740463257,
"learning_rate": 0.0005895953757225434,
"loss": 5.003,
"step": 3570
},
{
"epoch": 2.954484466921251,
"grad_norm": 1.4651744365692139,
"learning_rate": 0.0005912469033856317,
"loss": 4.9779,
"step": 3580
},
{
"epoch": 2.9627412529672825,
"grad_norm": 1.4064579010009766,
"learning_rate": 0.0005928984310487201,
"loss": 4.9413,
"step": 3590
},
{
"epoch": 2.970998039013314,
"grad_norm": 1.6661295890808105,
"learning_rate": 0.0005945499587118084,
"loss": 4.9438,
"step": 3600
},
{
"epoch": 2.979254825059346,
"grad_norm": 1.8025574684143066,
"learning_rate": 0.0005962014863748968,
"loss": 4.9145,
"step": 3610
},
{
"epoch": 2.9875116111053774,
"grad_norm": 2.135185718536377,
"learning_rate": 0.0005978530140379852,
"loss": 4.8906,
"step": 3620
},
{
"epoch": 2.9957683971514086,
"grad_norm": 1.839839220046997,
"learning_rate": 0.0005995045417010735,
"loss": 4.8848,
"step": 3630
},
{
"epoch": 3.0033027144184126,
"grad_norm": 1.8142980337142944,
"learning_rate": 0.0006011560693641619,
"loss": 4.4459,
"step": 3640
},
{
"epoch": 3.011559500464444,
"grad_norm": 1.6817476749420166,
"learning_rate": 0.0006028075970272501,
"loss": 4.8488,
"step": 3650
},
{
"epoch": 3.019816286510476,
"grad_norm": 1.583275318145752,
"learning_rate": 0.0006044591246903386,
"loss": 4.8085,
"step": 3660
},
{
"epoch": 3.0280730725565075,
"grad_norm": 1.5430843830108643,
"learning_rate": 0.0006061106523534269,
"loss": 4.7979,
"step": 3670
},
{
"epoch": 3.036329858602539,
"grad_norm": 1.5976840257644653,
"learning_rate": 0.0006077621800165153,
"loss": 4.7792,
"step": 3680
},
{
"epoch": 3.0445866446485708,
"grad_norm": 1.5955281257629395,
"learning_rate": 0.0006094137076796037,
"loss": 4.7536,
"step": 3690
},
{
"epoch": 3.052843430694602,
"grad_norm": 1.276328444480896,
"learning_rate": 0.000611065235342692,
"loss": 4.7247,
"step": 3700
},
{
"epoch": 3.0611002167406336,
"grad_norm": 1.3994117975234985,
"learning_rate": 0.0006127167630057804,
"loss": 4.726,
"step": 3710
},
{
"epoch": 3.0693570027866652,
"grad_norm": 1.333193063735962,
"learning_rate": 0.0006143682906688687,
"loss": 4.6887,
"step": 3720
},
{
"epoch": 3.077613788832697,
"grad_norm": 1.2973535060882568,
"learning_rate": 0.0006160198183319571,
"loss": 4.6584,
"step": 3730
},
{
"epoch": 3.0858705748787285,
"grad_norm": 1.3581771850585938,
"learning_rate": 0.0006176713459950454,
"loss": 4.6377,
"step": 3740
},
{
"epoch": 3.09412736092476,
"grad_norm": 1.2961112260818481,
"learning_rate": 0.0006193228736581337,
"loss": 4.6226,
"step": 3750
},
{
"epoch": 3.102384146970792,
"grad_norm": 1.2253503799438477,
"learning_rate": 0.0006209744013212222,
"loss": 4.6013,
"step": 3760
},
{
"epoch": 3.110640933016823,
"grad_norm": 1.4154301881790161,
"learning_rate": 0.0006226259289843105,
"loss": 4.5738,
"step": 3770
},
{
"epoch": 3.1188977190628546,
"grad_norm": 1.083807349205017,
"learning_rate": 0.0006242774566473988,
"loss": 4.5646,
"step": 3780
},
{
"epoch": 3.1271545051088863,
"grad_norm": 1.0674443244934082,
"learning_rate": 0.0006259289843104872,
"loss": 4.5446,
"step": 3790
},
{
"epoch": 3.135411291154918,
"grad_norm": 1.1654369831085205,
"learning_rate": 0.0006275805119735756,
"loss": 4.5398,
"step": 3800
},
{
"epoch": 3.1436680772009495,
"grad_norm": 1.1597775220870972,
"learning_rate": 0.0006292320396366639,
"loss": 4.5162,
"step": 3810
},
{
"epoch": 3.151924863246981,
"grad_norm": 1.133058786392212,
"learning_rate": 0.0006308835672997523,
"loss": 4.4877,
"step": 3820
},
{
"epoch": 3.160181649293013,
"grad_norm": 1.0915247201919556,
"learning_rate": 0.0006325350949628407,
"loss": 4.4808,
"step": 3830
},
{
"epoch": 3.1684384353390445,
"grad_norm": 1.137772560119629,
"learning_rate": 0.0006341866226259289,
"loss": 4.4562,
"step": 3840
},
{
"epoch": 3.1766952213850757,
"grad_norm": 1.1915597915649414,
"learning_rate": 0.0006358381502890173,
"loss": 4.4448,
"step": 3850
},
{
"epoch": 3.1849520074311073,
"grad_norm": 1.2536723613739014,
"learning_rate": 0.0006374896779521057,
"loss": 4.4146,
"step": 3860
},
{
"epoch": 3.193208793477139,
"grad_norm": 1.0724738836288452,
"learning_rate": 0.0006391412056151941,
"loss": 4.4128,
"step": 3870
},
{
"epoch": 3.2014655795231706,
"grad_norm": 1.0709177255630493,
"learning_rate": 0.0006407927332782824,
"loss": 4.3958,
"step": 3880
},
{
"epoch": 3.209722365569202,
"grad_norm": 1.0337883234024048,
"learning_rate": 0.0006424442609413708,
"loss": 4.379,
"step": 3890
},
{
"epoch": 3.217979151615234,
"grad_norm": 0.9574352502822876,
"learning_rate": 0.0006440957886044592,
"loss": 4.3577,
"step": 3900
},
{
"epoch": 3.2262359376612655,
"grad_norm": 0.9089698791503906,
"learning_rate": 0.0006457473162675474,
"loss": 4.3324,
"step": 3910
},
{
"epoch": 3.234492723707297,
"grad_norm": 1.1630803346633911,
"learning_rate": 0.0006473988439306359,
"loss": 4.3419,
"step": 3920
},
{
"epoch": 3.2427495097533283,
"grad_norm": 1.068848967552185,
"learning_rate": 0.0006490503715937242,
"loss": 4.3209,
"step": 3930
},
{
"epoch": 3.25100629579936,
"grad_norm": 0.9788937568664551,
"learning_rate": 0.0006507018992568125,
"loss": 4.2982,
"step": 3940
},
{
"epoch": 3.2592630818453916,
"grad_norm": 0.9281165599822998,
"learning_rate": 0.000652353426919901,
"loss": 4.2847,
"step": 3950
},
{
"epoch": 3.2675198678914232,
"grad_norm": 0.9365155696868896,
"learning_rate": 0.0006540049545829893,
"loss": 4.2756,
"step": 3960
},
{
"epoch": 3.275776653937455,
"grad_norm": 0.8847097158432007,
"learning_rate": 0.0006556564822460776,
"loss": 4.2646,
"step": 3970
},
{
"epoch": 3.2840334399834865,
"grad_norm": 0.8232343196868896,
"learning_rate": 0.0006573080099091659,
"loss": 4.2546,
"step": 3980
},
{
"epoch": 3.292290226029518,
"grad_norm": 0.870100736618042,
"learning_rate": 0.0006589595375722544,
"loss": 4.2119,
"step": 3990
},
{
"epoch": 3.3005470120755493,
"grad_norm": 0.8268401622772217,
"learning_rate": 0.0006606110652353427,
"loss": 4.2242,
"step": 4000
},
{
"epoch": 3.308803798121581,
"grad_norm": 0.9999098777770996,
"learning_rate": 0.000662262592898431,
"loss": 4.1925,
"step": 4010
},
{
"epoch": 3.3170605841676126,
"grad_norm": 0.8971749544143677,
"learning_rate": 0.0006639141205615195,
"loss": 4.1838,
"step": 4020
},
{
"epoch": 3.3253173702136443,
"grad_norm": 1.0182358026504517,
"learning_rate": 0.0006655656482246078,
"loss": 4.1679,
"step": 4030
},
{
"epoch": 3.333574156259676,
"grad_norm": 0.9021536707878113,
"learning_rate": 0.0006672171758876961,
"loss": 4.1573,
"step": 4040
},
{
"epoch": 3.3418309423057075,
"grad_norm": 0.8236122131347656,
"learning_rate": 0.0006688687035507845,
"loss": 4.1271,
"step": 4050
},
{
"epoch": 3.350087728351739,
"grad_norm": 0.9135034084320068,
"learning_rate": 0.0006705202312138729,
"loss": 4.1383,
"step": 4060
},
{
"epoch": 3.358344514397771,
"grad_norm": 0.9369881749153137,
"learning_rate": 0.0006721717588769612,
"loss": 4.1254,
"step": 4070
},
{
"epoch": 3.3666013004438025,
"grad_norm": 0.8571922183036804,
"learning_rate": 0.0006738232865400495,
"loss": 4.1155,
"step": 4080
},
{
"epoch": 3.3748580864898337,
"grad_norm": 0.7268726825714111,
"learning_rate": 0.000675474814203138,
"loss": 4.1042,
"step": 4090
},
{
"epoch": 3.3831148725358653,
"grad_norm": 0.7929525375366211,
"learning_rate": 0.0006771263418662262,
"loss": 4.0846,
"step": 4100
},
{
"epoch": 3.391371658581897,
"grad_norm": 0.9496756792068481,
"learning_rate": 0.0006787778695293146,
"loss": 4.0856,
"step": 4110
},
{
"epoch": 3.3996284446279286,
"grad_norm": 0.9352908730506897,
"learning_rate": 0.000680429397192403,
"loss": 4.0632,
"step": 4120
},
{
"epoch": 3.40788523067396,
"grad_norm": 0.7630198001861572,
"learning_rate": 0.0006820809248554913,
"loss": 4.0605,
"step": 4130
},
{
"epoch": 3.416142016719992,
"grad_norm": 0.8446700572967529,
"learning_rate": 0.0006837324525185797,
"loss": 4.0639,
"step": 4140
},
{
"epoch": 3.4243988027660235,
"grad_norm": 0.8067805767059326,
"learning_rate": 0.0006853839801816681,
"loss": 4.0482,
"step": 4150
},
{
"epoch": 3.4326555888120547,
"grad_norm": 0.9675975441932678,
"learning_rate": 0.0006870355078447564,
"loss": 4.0384,
"step": 4160
},
{
"epoch": 3.4409123748580863,
"grad_norm": 0.847297191619873,
"learning_rate": 0.0006886870355078447,
"loss": 4.0303,
"step": 4170
},
{
"epoch": 3.449169160904118,
"grad_norm": 0.8416860699653625,
"learning_rate": 0.0006903385631709331,
"loss": 4.0099,
"step": 4180
},
{
"epoch": 3.4574259469501496,
"grad_norm": 0.8956720232963562,
"learning_rate": 0.0006919900908340216,
"loss": 4.0159,
"step": 4190
},
{
"epoch": 3.4656827329961812,
"grad_norm": 0.8465309739112854,
"learning_rate": 0.0006936416184971098,
"loss": 3.997,
"step": 4200
},
{
"epoch": 3.473939519042213,
"grad_norm": 0.8289422988891602,
"learning_rate": 0.0006952931461601982,
"loss": 3.9781,
"step": 4210
},
{
"epoch": 3.4821963050882445,
"grad_norm": 0.8076528906822205,
"learning_rate": 0.0006969446738232866,
"loss": 3.9801,
"step": 4220
},
{
"epoch": 3.490453091134276,
"grad_norm": 0.8516877293586731,
"learning_rate": 0.0006985962014863749,
"loss": 3.9584,
"step": 4230
},
{
"epoch": 3.498709877180308,
"grad_norm": 0.8170517683029175,
"learning_rate": 0.0007002477291494632,
"loss": 3.9598,
"step": 4240
},
{
"epoch": 3.506966663226339,
"grad_norm": 0.8019408583641052,
"learning_rate": 0.0007018992568125517,
"loss": 3.947,
"step": 4250
},
{
"epoch": 3.5152234492723706,
"grad_norm": 0.8144872188568115,
"learning_rate": 0.00070355078447564,
"loss": 3.9539,
"step": 4260
},
{
"epoch": 3.5234802353184023,
"grad_norm": 0.7871933579444885,
"learning_rate": 0.0007052023121387283,
"loss": 3.9532,
"step": 4270
},
{
"epoch": 3.531737021364434,
"grad_norm": 0.8461719751358032,
"learning_rate": 0.0007068538398018168,
"loss": 3.9195,
"step": 4280
},
{
"epoch": 3.5399938074104655,
"grad_norm": 0.8719236850738525,
"learning_rate": 0.000708505367464905,
"loss": 3.9151,
"step": 4290
},
{
"epoch": 3.548250593456497,
"grad_norm": 0.8670084476470947,
"learning_rate": 0.0007101568951279934,
"loss": 3.9101,
"step": 4300
},
{
"epoch": 3.556507379502529,
"grad_norm": 0.7470918893814087,
"learning_rate": 0.0007118084227910817,
"loss": 3.9066,
"step": 4310
},
{
"epoch": 3.56476416554856,
"grad_norm": 0.7734981775283813,
"learning_rate": 0.0007134599504541701,
"loss": 3.9026,
"step": 4320
},
{
"epoch": 3.5730209515945917,
"grad_norm": 0.8196832537651062,
"learning_rate": 0.0007151114781172585,
"loss": 3.8973,
"step": 4330
},
{
"epoch": 3.5812777376406233,
"grad_norm": 0.796661376953125,
"learning_rate": 0.0007167630057803468,
"loss": 3.8841,
"step": 4340
},
{
"epoch": 3.589534523686655,
"grad_norm": 0.7249051928520203,
"learning_rate": 0.0007184145334434353,
"loss": 3.8769,
"step": 4350
},
{
"epoch": 3.5977913097326866,
"grad_norm": 0.8851024508476257,
"learning_rate": 0.0007200660611065235,
"loss": 3.8695,
"step": 4360
},
{
"epoch": 3.606048095778718,
"grad_norm": 0.8773587942123413,
"learning_rate": 0.0007217175887696119,
"loss": 3.8665,
"step": 4370
},
{
"epoch": 3.61430488182475,
"grad_norm": 0.7050719857215881,
"learning_rate": 0.0007233691164327004,
"loss": 3.8632,
"step": 4380
},
{
"epoch": 3.622561667870781,
"grad_norm": 0.8079128861427307,
"learning_rate": 0.0007250206440957886,
"loss": 3.8456,
"step": 4390
},
{
"epoch": 3.630818453916813,
"grad_norm": 0.8955399394035339,
"learning_rate": 0.000726672171758877,
"loss": 3.8456,
"step": 4400
},
{
"epoch": 3.6390752399628443,
"grad_norm": 0.8179429769515991,
"learning_rate": 0.0007283236994219653,
"loss": 3.8317,
"step": 4410
},
{
"epoch": 3.647332026008876,
"grad_norm": 0.8095247149467468,
"learning_rate": 0.0007299752270850537,
"loss": 3.8363,
"step": 4420
},
{
"epoch": 3.6555888120549076,
"grad_norm": 0.7325819730758667,
"learning_rate": 0.000731626754748142,
"loss": 3.8162,
"step": 4430
},
{
"epoch": 3.6638455981009392,
"grad_norm": 0.8401527404785156,
"learning_rate": 0.0007332782824112304,
"loss": 3.8204,
"step": 4440
},
{
"epoch": 3.672102384146971,
"grad_norm": 0.9044252634048462,
"learning_rate": 0.0007349298100743188,
"loss": 3.8106,
"step": 4450
},
{
"epoch": 3.6803591701930025,
"grad_norm": 0.8086848258972168,
"learning_rate": 0.0007365813377374071,
"loss": 3.7821,
"step": 4460
},
{
"epoch": 3.688615956239034,
"grad_norm": 0.718523383140564,
"learning_rate": 0.0007382328654004955,
"loss": 3.7964,
"step": 4470
},
{
"epoch": 3.6968727422850653,
"grad_norm": 0.84502112865448,
"learning_rate": 0.0007398843930635837,
"loss": 3.7868,
"step": 4480
},
{
"epoch": 3.705129528331097,
"grad_norm": 0.8375003337860107,
"learning_rate": 0.0007415359207266722,
"loss": 3.7848,
"step": 4490
},
{
"epoch": 3.7133863143771286,
"grad_norm": 0.8933425545692444,
"learning_rate": 0.0007431874483897605,
"loss": 3.776,
"step": 4500
},
{
"epoch": 3.7216431004231603,
"grad_norm": 0.7778623104095459,
"learning_rate": 0.0007448389760528488,
"loss": 3.7683,
"step": 4510
},
{
"epoch": 3.729899886469192,
"grad_norm": 0.704544186592102,
"learning_rate": 0.0007464905037159373,
"loss": 3.7699,
"step": 4520
},
{
"epoch": 3.7381566725152235,
"grad_norm": 0.8176696300506592,
"learning_rate": 0.0007481420313790256,
"loss": 3.7589,
"step": 4530
},
{
"epoch": 3.746413458561255,
"grad_norm": 0.8019874095916748,
"learning_rate": 0.000749793559042114,
"loss": 3.7464,
"step": 4540
},
{
"epoch": 3.7546702446072864,
"grad_norm": 0.8178896307945251,
"learning_rate": 0.0007514450867052023,
"loss": 3.742,
"step": 4550
},
{
"epoch": 3.7629270306533185,
"grad_norm": 0.7082737684249878,
"learning_rate": 0.0007530966143682907,
"loss": 3.7345,
"step": 4560
},
{
"epoch": 3.7711838166993497,
"grad_norm": 0.7834277749061584,
"learning_rate": 0.0007547481420313791,
"loss": 3.7127,
"step": 4570
},
{
"epoch": 3.7794406027453813,
"grad_norm": 0.7585816383361816,
"learning_rate": 0.0007563996696944674,
"loss": 3.7122,
"step": 4580
},
{
"epoch": 3.787697388791413,
"grad_norm": 0.7101882696151733,
"learning_rate": 0.0007580511973575558,
"loss": 3.7167,
"step": 4590
},
{
"epoch": 3.7959541748374446,
"grad_norm": 0.7938413619995117,
"learning_rate": 0.0007597027250206441,
"loss": 3.7045,
"step": 4600
},
{
"epoch": 3.804210960883476,
"grad_norm": 0.694128155708313,
"learning_rate": 0.0007613542526837324,
"loss": 3.6961,
"step": 4610
},
{
"epoch": 3.812467746929508,
"grad_norm": 0.7648592591285706,
"learning_rate": 0.0007630057803468208,
"loss": 3.7112,
"step": 4620
},
{
"epoch": 3.8207245329755395,
"grad_norm": 0.7412601709365845,
"learning_rate": 0.0007646573080099092,
"loss": 3.6992,
"step": 4630
},
{
"epoch": 3.8289813190215707,
"grad_norm": 0.8250954747200012,
"learning_rate": 0.0007663088356729975,
"loss": 3.6947,
"step": 4640
},
{
"epoch": 3.8372381050676023,
"grad_norm": 1.020845890045166,
"learning_rate": 0.0007679603633360859,
"loss": 3.6912,
"step": 4650
},
{
"epoch": 3.845494891113634,
"grad_norm": 0.714709997177124,
"learning_rate": 0.0007696118909991743,
"loss": 3.6897,
"step": 4660
},
{
"epoch": 3.8537516771596656,
"grad_norm": 0.8402379155158997,
"learning_rate": 0.0007712634186622625,
"loss": 3.6728,
"step": 4670
},
{
"epoch": 3.8620084632056972,
"grad_norm": 0.8631258010864258,
"learning_rate": 0.000772914946325351,
"loss": 3.6656,
"step": 4680
},
{
"epoch": 3.870265249251729,
"grad_norm": 0.7027178406715393,
"learning_rate": 0.0007745664739884393,
"loss": 3.6668,
"step": 4690
},
{
"epoch": 3.8785220352977605,
"grad_norm": 0.8082193732261658,
"learning_rate": 0.0007762180016515277,
"loss": 3.6604,
"step": 4700
},
{
"epoch": 3.8867788213437917,
"grad_norm": 0.7741190791130066,
"learning_rate": 0.000777869529314616,
"loss": 3.6598,
"step": 4710
},
{
"epoch": 3.8950356073898234,
"grad_norm": 0.8848899006843567,
"learning_rate": 0.0007795210569777044,
"loss": 3.6496,
"step": 4720
},
{
"epoch": 3.903292393435855,
"grad_norm": 0.8357560634613037,
"learning_rate": 0.0007811725846407928,
"loss": 3.6361,
"step": 4730
},
{
"epoch": 3.9115491794818866,
"grad_norm": 0.7973350286483765,
"learning_rate": 0.000782824112303881,
"loss": 3.6356,
"step": 4740
},
{
"epoch": 3.9198059655279183,
"grad_norm": 0.8782063126564026,
"learning_rate": 0.0007844756399669695,
"loss": 3.6421,
"step": 4750
},
{
"epoch": 3.92806275157395,
"grad_norm": 0.7487813234329224,
"learning_rate": 0.0007861271676300579,
"loss": 3.6245,
"step": 4760
},
{
"epoch": 3.9363195376199815,
"grad_norm": 0.766007125377655,
"learning_rate": 0.0007877786952931461,
"loss": 3.6219,
"step": 4770
},
{
"epoch": 3.9445763236660127,
"grad_norm": 0.7262325882911682,
"learning_rate": 0.0007894302229562346,
"loss": 3.6152,
"step": 4780
},
{
"epoch": 3.952833109712045,
"grad_norm": 0.837656557559967,
"learning_rate": 0.0007910817506193229,
"loss": 3.6129,
"step": 4790
},
{
"epoch": 3.961089895758076,
"grad_norm": 0.7486396431922913,
"learning_rate": 0.0007927332782824112,
"loss": 3.6017,
"step": 4800
},
{
"epoch": 3.9693466818041077,
"grad_norm": 0.7907805442810059,
"learning_rate": 0.0007943848059454995,
"loss": 3.5954,
"step": 4810
},
{
"epoch": 3.9776034678501393,
"grad_norm": 0.8688389658927917,
"learning_rate": 0.000796036333608588,
"loss": 3.5994,
"step": 4820
},
{
"epoch": 3.985860253896171,
"grad_norm": 0.8377218842506409,
"learning_rate": 0.0007976878612716763,
"loss": 3.5908,
"step": 4830
},
{
"epoch": 3.9941170399422026,
"grad_norm": 0.7856019139289856,
"learning_rate": 0.0007993393889347646,
"loss": 3.5889,
"step": 4840
},
{
"epoch": 4.001651357209206,
"grad_norm": 0.7975202202796936,
"learning_rate": 0.0008009909165978531,
"loss": 3.2828,
"step": 4850
},
{
"epoch": 4.009908143255238,
"grad_norm": 0.8581557869911194,
"learning_rate": 0.0008026424442609413,
"loss": 3.5907,
"step": 4860
},
{
"epoch": 4.018164929301269,
"grad_norm": 0.8292841911315918,
"learning_rate": 0.0008042939719240297,
"loss": 3.5775,
"step": 4870
},
{
"epoch": 4.026421715347301,
"grad_norm": 0.7097908854484558,
"learning_rate": 0.0008059454995871181,
"loss": 3.5703,
"step": 4880
},
{
"epoch": 4.034678501393333,
"grad_norm": 0.8221850395202637,
"learning_rate": 0.0008075970272502065,
"loss": 3.5639,
"step": 4890
},
{
"epoch": 4.042935287439364,
"grad_norm": 0.7759121656417847,
"learning_rate": 0.0008092485549132948,
"loss": 3.5577,
"step": 4900
},
{
"epoch": 4.051192073485396,
"grad_norm": 0.8487065434455872,
"learning_rate": 0.0008109000825763832,
"loss": 3.5502,
"step": 4910
},
{
"epoch": 4.059448859531427,
"grad_norm": 0.7345426082611084,
"learning_rate": 0.0008125516102394716,
"loss": 3.5432,
"step": 4920
},
{
"epoch": 4.067705645577459,
"grad_norm": 0.738944411277771,
"learning_rate": 0.0008142031379025598,
"loss": 3.5464,
"step": 4930
},
{
"epoch": 4.07596243162349,
"grad_norm": 0.8091252446174622,
"learning_rate": 0.0008158546655656482,
"loss": 3.5477,
"step": 4940
},
{
"epoch": 4.0842192176695225,
"grad_norm": 0.7931963801383972,
"learning_rate": 0.0008175061932287367,
"loss": 3.5353,
"step": 4950
},
{
"epoch": 4.092476003715554,
"grad_norm": 0.886758029460907,
"learning_rate": 0.0008191577208918249,
"loss": 3.5303,
"step": 4960
},
{
"epoch": 4.100732789761586,
"grad_norm": 0.7010697722434998,
"learning_rate": 0.0008208092485549133,
"loss": 3.5222,
"step": 4970
},
{
"epoch": 4.108989575807617,
"grad_norm": 0.8633137941360474,
"learning_rate": 0.0008224607762180017,
"loss": 3.5354,
"step": 4980
},
{
"epoch": 4.117246361853648,
"grad_norm": 0.8236711025238037,
"learning_rate": 0.00082411230388109,
"loss": 3.5233,
"step": 4990
},
{
"epoch": 4.12550314789968,
"grad_norm": 0.7535457015037537,
"learning_rate": 0.0008257638315441783,
"loss": 3.5156,
"step": 5000
},
{
"epoch": 4.133759933945711,
"grad_norm": 0.7200325727462769,
"learning_rate": 0.0008274153592072668,
"loss": 3.5292,
"step": 5010
},
{
"epoch": 4.1420167199917435,
"grad_norm": 0.6595053672790527,
"learning_rate": 0.0008290668868703551,
"loss": 3.5097,
"step": 5020
},
{
"epoch": 4.150273506037775,
"grad_norm": 0.8142825961112976,
"learning_rate": 0.0008307184145334434,
"loss": 3.5125,
"step": 5030
},
{
"epoch": 4.158530292083807,
"grad_norm": 0.8555150628089905,
"learning_rate": 0.0008323699421965319,
"loss": 3.5013,
"step": 5040
},
{
"epoch": 4.166787078129838,
"grad_norm": 0.9074802994728088,
"learning_rate": 0.0008340214698596202,
"loss": 3.5021,
"step": 5050
},
{
"epoch": 4.175043864175869,
"grad_norm": 0.9074805974960327,
"learning_rate": 0.0008356729975227085,
"loss": 3.5052,
"step": 5060
},
{
"epoch": 4.183300650221901,
"grad_norm": 0.7932230234146118,
"learning_rate": 0.0008373245251857968,
"loss": 3.5048,
"step": 5070
},
{
"epoch": 4.191557436267932,
"grad_norm": 0.8132278323173523,
"learning_rate": 0.0008389760528488853,
"loss": 3.4994,
"step": 5080
},
{
"epoch": 4.1998142223139645,
"grad_norm": 0.7839681506156921,
"learning_rate": 0.0008406275805119736,
"loss": 3.4905,
"step": 5090
},
{
"epoch": 4.208071008359996,
"grad_norm": 0.7674237489700317,
"learning_rate": 0.0008422791081750619,
"loss": 3.4764,
"step": 5100
},
{
"epoch": 4.216327794406028,
"grad_norm": 0.945733904838562,
"learning_rate": 0.0008439306358381504,
"loss": 3.48,
"step": 5110
},
{
"epoch": 4.224584580452059,
"grad_norm": 0.8186456561088562,
"learning_rate": 0.0008455821635012386,
"loss": 3.4886,
"step": 5120
},
{
"epoch": 4.232841366498091,
"grad_norm": 0.7169471383094788,
"learning_rate": 0.000847233691164327,
"loss": 3.4863,
"step": 5130
},
{
"epoch": 4.241098152544122,
"grad_norm": 0.8962691426277161,
"learning_rate": 0.0008488852188274155,
"loss": 3.478,
"step": 5140
},
{
"epoch": 4.2493549385901535,
"grad_norm": 0.7380357980728149,
"learning_rate": 0.0008505367464905037,
"loss": 3.4759,
"step": 5150
},
{
"epoch": 4.2576117246361855,
"grad_norm": 0.7585932612419128,
"learning_rate": 0.0008521882741535921,
"loss": 3.4725,
"step": 5160
},
{
"epoch": 4.265868510682217,
"grad_norm": 0.8082647919654846,
"learning_rate": 0.0008538398018166804,
"loss": 3.4546,
"step": 5170
},
{
"epoch": 4.274125296728249,
"grad_norm": 0.8778128027915955,
"learning_rate": 0.0008554913294797688,
"loss": 3.4585,
"step": 5180
},
{
"epoch": 4.28238208277428,
"grad_norm": 0.7410449981689453,
"learning_rate": 0.0008571428571428571,
"loss": 3.4558,
"step": 5190
},
{
"epoch": 4.290638868820312,
"grad_norm": 0.9528789520263672,
"learning_rate": 0.0008587943848059455,
"loss": 3.4529,
"step": 5200
},
{
"epoch": 4.298895654866343,
"grad_norm": 0.8182398080825806,
"learning_rate": 0.000860445912469034,
"loss": 3.454,
"step": 5210
},
{
"epoch": 4.3071524409123745,
"grad_norm": 0.8230072855949402,
"learning_rate": 0.0008620974401321222,
"loss": 3.4561,
"step": 5220
},
{
"epoch": 4.315409226958407,
"grad_norm": 0.8617272973060608,
"learning_rate": 0.0008637489677952106,
"loss": 3.4471,
"step": 5230
},
{
"epoch": 4.323666013004438,
"grad_norm": 0.7993802428245544,
"learning_rate": 0.000865400495458299,
"loss": 3.4389,
"step": 5240
},
{
"epoch": 4.33192279905047,
"grad_norm": 0.9033696055412292,
"learning_rate": 0.0008670520231213873,
"loss": 3.4565,
"step": 5250
},
{
"epoch": 4.340179585096501,
"grad_norm": 0.7320334911346436,
"learning_rate": 0.0008687035507844756,
"loss": 3.448,
"step": 5260
},
{
"epoch": 4.348436371142533,
"grad_norm": 0.7799825072288513,
"learning_rate": 0.000870355078447564,
"loss": 3.4408,
"step": 5270
},
{
"epoch": 4.356693157188564,
"grad_norm": 0.7929351329803467,
"learning_rate": 0.0008720066061106524,
"loss": 3.4409,
"step": 5280
},
{
"epoch": 4.3649499432345955,
"grad_norm": 0.821667492389679,
"learning_rate": 0.0008736581337737407,
"loss": 3.4242,
"step": 5290
},
{
"epoch": 4.373206729280628,
"grad_norm": 0.7827187180519104,
"learning_rate": 0.0008753096614368291,
"loss": 3.4301,
"step": 5300
},
{
"epoch": 4.381463515326659,
"grad_norm": 0.7317821383476257,
"learning_rate": 0.0008769611890999174,
"loss": 3.4305,
"step": 5310
},
{
"epoch": 4.389720301372691,
"grad_norm": 0.7912768125534058,
"learning_rate": 0.0008786127167630058,
"loss": 3.4159,
"step": 5320
},
{
"epoch": 4.397977087418722,
"grad_norm": 0.8757966756820679,
"learning_rate": 0.0008802642444260942,
"loss": 3.4325,
"step": 5330
},
{
"epoch": 4.406233873464754,
"grad_norm": 0.7433684468269348,
"learning_rate": 0.0008819157720891825,
"loss": 3.4247,
"step": 5340
},
{
"epoch": 4.414490659510785,
"grad_norm": 0.8170937895774841,
"learning_rate": 0.0008835672997522709,
"loss": 3.422,
"step": 5350
},
{
"epoch": 4.422747445556817,
"grad_norm": 0.7130184769630432,
"learning_rate": 0.0008852188274153592,
"loss": 3.4137,
"step": 5360
},
{
"epoch": 4.431004231602849,
"grad_norm": 0.8633317947387695,
"learning_rate": 0.0008868703550784475,
"loss": 3.4101,
"step": 5370
},
{
"epoch": 4.43926101764888,
"grad_norm": 0.6866912841796875,
"learning_rate": 0.0008885218827415359,
"loss": 3.3982,
"step": 5380
},
{
"epoch": 4.447517803694912,
"grad_norm": 0.7277703285217285,
"learning_rate": 0.0008901734104046243,
"loss": 3.4016,
"step": 5390
},
{
"epoch": 4.455774589740943,
"grad_norm": 0.7942615151405334,
"learning_rate": 0.0008918249380677127,
"loss": 3.3961,
"step": 5400
},
{
"epoch": 4.464031375786975,
"grad_norm": 0.9028828144073486,
"learning_rate": 0.000893476465730801,
"loss": 3.3883,
"step": 5410
},
{
"epoch": 4.472288161833006,
"grad_norm": 0.6972488760948181,
"learning_rate": 0.0008951279933938894,
"loss": 3.3828,
"step": 5420
},
{
"epoch": 4.4805449478790385,
"grad_norm": 0.746987521648407,
"learning_rate": 0.0008967795210569777,
"loss": 3.4034,
"step": 5430
},
{
"epoch": 4.48880173392507,
"grad_norm": 0.8450544476509094,
"learning_rate": 0.0008984310487200661,
"loss": 3.3972,
"step": 5440
},
{
"epoch": 4.497058519971101,
"grad_norm": 0.7922062873840332,
"learning_rate": 0.0009000825763831544,
"loss": 3.3915,
"step": 5450
},
{
"epoch": 4.505315306017133,
"grad_norm": 0.6909300088882446,
"learning_rate": 0.0009017341040462428,
"loss": 3.3797,
"step": 5460
},
{
"epoch": 4.513572092063164,
"grad_norm": 0.7204782366752625,
"learning_rate": 0.0009033856317093312,
"loss": 3.3781,
"step": 5470
},
{
"epoch": 4.521828878109196,
"grad_norm": 0.8381190299987793,
"learning_rate": 0.0009050371593724195,
"loss": 3.37,
"step": 5480
},
{
"epoch": 4.530085664155227,
"grad_norm": 0.8983927965164185,
"learning_rate": 0.0009066886870355079,
"loss": 3.3779,
"step": 5490
},
{
"epoch": 4.5383424502012595,
"grad_norm": 0.7274337410926819,
"learning_rate": 0.0009083402146985961,
"loss": 3.3777,
"step": 5500
},
{
"epoch": 4.546599236247291,
"grad_norm": 0.7718445062637329,
"learning_rate": 0.0009099917423616846,
"loss": 3.3728,
"step": 5510
},
{
"epoch": 4.554856022293322,
"grad_norm": 0.7668145298957825,
"learning_rate": 0.000911643270024773,
"loss": 3.3756,
"step": 5520
},
{
"epoch": 4.563112808339354,
"grad_norm": 0.7342451810836792,
"learning_rate": 0.0009132947976878612,
"loss": 3.3628,
"step": 5530
},
{
"epoch": 4.571369594385385,
"grad_norm": 0.6988268494606018,
"learning_rate": 0.0009149463253509497,
"loss": 3.3683,
"step": 5540
},
{
"epoch": 4.579626380431417,
"grad_norm": 0.7202860116958618,
"learning_rate": 0.000916597853014038,
"loss": 3.3681,
"step": 5550
},
{
"epoch": 4.587883166477448,
"grad_norm": 0.8209216594696045,
"learning_rate": 0.0009182493806771264,
"loss": 3.3652,
"step": 5560
},
{
"epoch": 4.5961399525234805,
"grad_norm": 0.7195369601249695,
"learning_rate": 0.0009199009083402146,
"loss": 3.3596,
"step": 5570
},
{
"epoch": 4.604396738569512,
"grad_norm": 0.706985354423523,
"learning_rate": 0.0009215524360033031,
"loss": 3.357,
"step": 5580
},
{
"epoch": 4.612653524615544,
"grad_norm": 0.8509654402732849,
"learning_rate": 0.0009232039636663915,
"loss": 3.3697,
"step": 5590
},
{
"epoch": 4.620910310661575,
"grad_norm": 0.867950439453125,
"learning_rate": 0.0009248554913294797,
"loss": 3.3569,
"step": 5600
},
{
"epoch": 4.629167096707606,
"grad_norm": 0.7825170159339905,
"learning_rate": 0.0009265070189925682,
"loss": 3.3529,
"step": 5610
},
{
"epoch": 4.637423882753638,
"grad_norm": 0.7278405427932739,
"learning_rate": 0.0009281585466556565,
"loss": 3.3587,
"step": 5620
},
{
"epoch": 4.6456806687996695,
"grad_norm": 0.7527414560317993,
"learning_rate": 0.0009298100743187448,
"loss": 3.3555,
"step": 5630
},
{
"epoch": 4.6539374548457015,
"grad_norm": 0.6936579346656799,
"learning_rate": 0.0009314616019818332,
"loss": 3.3483,
"step": 5640
},
{
"epoch": 4.662194240891733,
"grad_norm": 0.7889197468757629,
"learning_rate": 0.0009331131296449216,
"loss": 3.3545,
"step": 5650
},
{
"epoch": 4.670451026937765,
"grad_norm": 0.7300989627838135,
"learning_rate": 0.0009347646573080099,
"loss": 3.3435,
"step": 5660
},
{
"epoch": 4.678707812983796,
"grad_norm": 0.7153423428535461,
"learning_rate": 0.0009364161849710983,
"loss": 3.3284,
"step": 5670
},
{
"epoch": 4.686964599029828,
"grad_norm": 0.716394305229187,
"learning_rate": 0.0009380677126341867,
"loss": 3.3285,
"step": 5680
},
{
"epoch": 4.695221385075859,
"grad_norm": 0.748479425907135,
"learning_rate": 0.0009397192402972749,
"loss": 3.3384,
"step": 5690
},
{
"epoch": 4.7034781711218905,
"grad_norm": 0.7145617604255676,
"learning_rate": 0.0009413707679603633,
"loss": 3.3297,
"step": 5700
},
{
"epoch": 4.711734957167923,
"grad_norm": 0.7331937551498413,
"learning_rate": 0.0009430222956234518,
"loss": 3.329,
"step": 5710
},
{
"epoch": 4.719991743213954,
"grad_norm": 0.8153555393218994,
"learning_rate": 0.0009446738232865401,
"loss": 3.326,
"step": 5720
},
{
"epoch": 4.728248529259986,
"grad_norm": 0.803225576877594,
"learning_rate": 0.0009463253509496284,
"loss": 3.3375,
"step": 5730
},
{
"epoch": 4.736505315306017,
"grad_norm": 0.7512196898460388,
"learning_rate": 0.0009479768786127168,
"loss": 3.3291,
"step": 5740
},
{
"epoch": 4.744762101352048,
"grad_norm": 0.7939392328262329,
"learning_rate": 0.0009496284062758052,
"loss": 3.3263,
"step": 5750
},
{
"epoch": 4.75301888739808,
"grad_norm": 0.6965025663375854,
"learning_rate": 0.0009512799339388934,
"loss": 3.3139,
"step": 5760
},
{
"epoch": 4.7612756734441115,
"grad_norm": 0.7877525687217712,
"learning_rate": 0.0009529314616019819,
"loss": 3.3244,
"step": 5770
},
{
"epoch": 4.769532459490144,
"grad_norm": 0.7251325249671936,
"learning_rate": 0.0009545829892650703,
"loss": 3.3212,
"step": 5780
},
{
"epoch": 4.777789245536175,
"grad_norm": 0.7695476412773132,
"learning_rate": 0.0009562345169281585,
"loss": 3.3155,
"step": 5790
},
{
"epoch": 4.786046031582207,
"grad_norm": 0.7189447283744812,
"learning_rate": 0.000957886044591247,
"loss": 3.3115,
"step": 5800
},
{
"epoch": 4.794302817628238,
"grad_norm": 0.762616753578186,
"learning_rate": 0.0009595375722543353,
"loss": 3.3193,
"step": 5810
},
{
"epoch": 4.80255960367427,
"grad_norm": 0.7273391485214233,
"learning_rate": 0.0009611890999174236,
"loss": 3.2929,
"step": 5820
},
{
"epoch": 4.810816389720301,
"grad_norm": 0.7077900171279907,
"learning_rate": 0.0009628406275805119,
"loss": 3.2998,
"step": 5830
},
{
"epoch": 4.8190731757663325,
"grad_norm": 0.7202854156494141,
"learning_rate": 0.0009644921552436004,
"loss": 3.3005,
"step": 5840
},
{
"epoch": 4.827329961812365,
"grad_norm": 0.758812427520752,
"learning_rate": 0.0009661436829066887,
"loss": 3.2916,
"step": 5850
},
{
"epoch": 4.835586747858396,
"grad_norm": 0.7209702730178833,
"learning_rate": 0.000967795210569777,
"loss": 3.3037,
"step": 5860
},
{
"epoch": 4.843843533904428,
"grad_norm": 0.7841807007789612,
"learning_rate": 0.0009694467382328655,
"loss": 3.305,
"step": 5870
},
{
"epoch": 4.852100319950459,
"grad_norm": 0.7753096222877502,
"learning_rate": 0.0009710982658959537,
"loss": 3.31,
"step": 5880
},
{
"epoch": 4.860357105996491,
"grad_norm": 0.7271151542663574,
"learning_rate": 0.0009727497935590421,
"loss": 3.2891,
"step": 5890
},
{
"epoch": 4.868613892042522,
"grad_norm": 0.7262945175170898,
"learning_rate": 0.0009744013212221306,
"loss": 3.3034,
"step": 5900
},
{
"epoch": 4.8768706780885545,
"grad_norm": 0.698553740978241,
"learning_rate": 0.0009760528488852189,
"loss": 3.2953,
"step": 5910
},
{
"epoch": 4.885127464134586,
"grad_norm": 0.8064056634902954,
"learning_rate": 0.0009777043765483073,
"loss": 3.287,
"step": 5920
},
{
"epoch": 4.893384250180617,
"grad_norm": 0.7037026286125183,
"learning_rate": 0.0009793559042113955,
"loss": 3.2919,
"step": 5930
},
{
"epoch": 4.901641036226649,
"grad_norm": 0.7652758359909058,
"learning_rate": 0.000981007431874484,
"loss": 3.2884,
"step": 5940
},
{
"epoch": 4.90989782227268,
"grad_norm": 0.7884798049926758,
"learning_rate": 0.0009826589595375722,
"loss": 3.2911,
"step": 5950
},
{
"epoch": 4.918154608318712,
"grad_norm": 0.6904022693634033,
"learning_rate": 0.0009843104872006606,
"loss": 3.2835,
"step": 5960
},
{
"epoch": 4.926411394364743,
"grad_norm": 0.724676251411438,
"learning_rate": 0.000985962014863749,
"loss": 3.2875,
"step": 5970
},
{
"epoch": 4.9346681804107755,
"grad_norm": 0.8747690916061401,
"learning_rate": 0.0009876135425268373,
"loss": 3.2847,
"step": 5980
},
{
"epoch": 4.942924966456807,
"grad_norm": 0.793563187122345,
"learning_rate": 0.0009892650701899257,
"loss": 3.2873,
"step": 5990
},
{
"epoch": 4.951181752502839,
"grad_norm": 0.7522445917129517,
"learning_rate": 0.000990916597853014,
"loss": 3.2833,
"step": 6000
},
{
"epoch": 4.95943853854887,
"grad_norm": 0.7040495276451111,
"learning_rate": 0.0009925681255161024,
"loss": 3.276,
"step": 6010
},
{
"epoch": 4.967695324594901,
"grad_norm": 0.7230294942855835,
"learning_rate": 0.0009942196531791908,
"loss": 3.255,
"step": 6020
},
{
"epoch": 4.975952110640933,
"grad_norm": 0.7457141280174255,
"learning_rate": 0.000995871180842279,
"loss": 3.2603,
"step": 6030
},
{
"epoch": 4.984208896686964,
"grad_norm": 0.6743199229240417,
"learning_rate": 0.0009975227085053675,
"loss": 3.2682,
"step": 6040
},
{
"epoch": 4.9924656827329965,
"grad_norm": 0.7625213861465454,
"learning_rate": 0.000999174236168456,
"loss": 3.2795,
"step": 6050
},
{
"epoch": 5.0,
"grad_norm": 0.2576947510242462,
"learning_rate": 0.0009999082484631618,
"loss": 2.9799,
"step": 6060
},
{
"epoch": 5.008256786046031,
"grad_norm": 0.6936799883842468,
"learning_rate": 0.0009997247453894854,
"loss": 3.2597,
"step": 6070
},
{
"epoch": 5.016513572092063,
"grad_norm": 0.7428798675537109,
"learning_rate": 0.0009995412423158088,
"loss": 3.257,
"step": 6080
},
{
"epoch": 5.0247703581380945,
"grad_norm": 0.720032811164856,
"learning_rate": 0.0009993577392421323,
"loss": 3.2575,
"step": 6090
},
{
"epoch": 5.0330271441841266,
"grad_norm": 0.7074559330940247,
"learning_rate": 0.000999174236168456,
"loss": 3.2607,
"step": 6100
},
{
"epoch": 5.041283930230158,
"grad_norm": 0.713192880153656,
"learning_rate": 0.0009989907330947795,
"loss": 3.2604,
"step": 6110
},
{
"epoch": 5.04954071627619,
"grad_norm": 0.7576326727867126,
"learning_rate": 0.0009988072300211029,
"loss": 3.2604,
"step": 6120
},
{
"epoch": 5.057797502322221,
"grad_norm": 0.7588953375816345,
"learning_rate": 0.0009986237269474264,
"loss": 3.2452,
"step": 6130
},
{
"epoch": 5.066054288368252,
"grad_norm": 0.7049972414970398,
"learning_rate": 0.0009984402238737498,
"loss": 3.2655,
"step": 6140
},
{
"epoch": 5.074311074414284,
"grad_norm": 0.7510067820549011,
"learning_rate": 0.0009982567208000734,
"loss": 3.2359,
"step": 6150
},
{
"epoch": 5.0825678604603155,
"grad_norm": 0.7003161907196045,
"learning_rate": 0.000998073217726397,
"loss": 3.2391,
"step": 6160
},
{
"epoch": 5.090824646506348,
"grad_norm": 0.6871075630187988,
"learning_rate": 0.0009978897146527205,
"loss": 3.2435,
"step": 6170
},
{
"epoch": 5.099081432552379,
"grad_norm": 0.7129902243614197,
"learning_rate": 0.000997706211579044,
"loss": 3.2562,
"step": 6180
},
{
"epoch": 5.107338218598411,
"grad_norm": 0.6472665071487427,
"learning_rate": 0.0009975227085053675,
"loss": 3.235,
"step": 6190
},
{
"epoch": 5.115595004644442,
"grad_norm": 0.5904700756072998,
"learning_rate": 0.000997339205431691,
"loss": 3.2397,
"step": 6200
},
{
"epoch": 5.123851790690474,
"grad_norm": 0.6430502533912659,
"learning_rate": 0.0009971557023580146,
"loss": 3.2403,
"step": 6210
},
{
"epoch": 5.132108576736505,
"grad_norm": 0.7082479000091553,
"learning_rate": 0.000996972199284338,
"loss": 3.234,
"step": 6220
},
{
"epoch": 5.1403653627825365,
"grad_norm": 0.7464805245399475,
"learning_rate": 0.0009967886962106616,
"loss": 3.2178,
"step": 6230
},
{
"epoch": 5.148622148828569,
"grad_norm": 0.7043919563293457,
"learning_rate": 0.0009966051931369852,
"loss": 3.2287,
"step": 6240
},
{
"epoch": 5.1568789348746,
"grad_norm": 0.7245175838470459,
"learning_rate": 0.0009964216900633087,
"loss": 3.2352,
"step": 6250
},
{
"epoch": 5.165135720920632,
"grad_norm": 0.7505689859390259,
"learning_rate": 0.000996238186989632,
"loss": 3.2309,
"step": 6260
},
{
"epoch": 5.173392506966663,
"grad_norm": 0.7331697940826416,
"learning_rate": 0.0009960546839159557,
"loss": 3.2283,
"step": 6270
},
{
"epoch": 5.181649293012695,
"grad_norm": 0.6291115283966064,
"learning_rate": 0.000995871180842279,
"loss": 3.2195,
"step": 6280
},
{
"epoch": 5.189906079058726,
"grad_norm": 0.6958070397377014,
"learning_rate": 0.0009956876777686026,
"loss": 3.2255,
"step": 6290
},
{
"epoch": 5.198162865104758,
"grad_norm": 0.7305887937545776,
"learning_rate": 0.0009955041746949262,
"loss": 3.2089,
"step": 6300
},
{
"epoch": 5.20641965115079,
"grad_norm": 0.6707571744918823,
"learning_rate": 0.0009953206716212498,
"loss": 3.2188,
"step": 6310
},
{
"epoch": 5.214676437196821,
"grad_norm": 0.6920966506004333,
"learning_rate": 0.0009951371685475731,
"loss": 3.2288,
"step": 6320
},
{
"epoch": 5.222933223242853,
"grad_norm": 0.6794142127037048,
"learning_rate": 0.0009949536654738967,
"loss": 3.2277,
"step": 6330
},
{
"epoch": 5.231190009288884,
"grad_norm": 0.6484349966049194,
"learning_rate": 0.0009947701624002203,
"loss": 3.2327,
"step": 6340
},
{
"epoch": 5.239446795334916,
"grad_norm": 0.7271141409873962,
"learning_rate": 0.0009945866593265437,
"loss": 3.2253,
"step": 6350
},
{
"epoch": 5.247703581380947,
"grad_norm": 0.6956265568733215,
"learning_rate": 0.0009944031562528672,
"loss": 3.223,
"step": 6360
},
{
"epoch": 5.2559603674269795,
"grad_norm": 0.6692689061164856,
"learning_rate": 0.0009942196531791908,
"loss": 3.2136,
"step": 6370
},
{
"epoch": 5.264217153473011,
"grad_norm": 0.7369921803474426,
"learning_rate": 0.0009940361501055144,
"loss": 3.2078,
"step": 6380
},
{
"epoch": 5.272473939519042,
"grad_norm": 0.607624351978302,
"learning_rate": 0.0009938526470318378,
"loss": 3.21,
"step": 6390
},
{
"epoch": 5.280730725565074,
"grad_norm": 0.7406266331672668,
"learning_rate": 0.0009936691439581613,
"loss": 3.1989,
"step": 6400
},
{
"epoch": 5.288987511611105,
"grad_norm": 0.8179799318313599,
"learning_rate": 0.0009934856408844847,
"loss": 3.2034,
"step": 6410
},
{
"epoch": 5.297244297657137,
"grad_norm": 0.6973315477371216,
"learning_rate": 0.0009933021378108083,
"loss": 3.2205,
"step": 6420
},
{
"epoch": 5.305501083703168,
"grad_norm": 0.7340269088745117,
"learning_rate": 0.0009931186347371319,
"loss": 3.1984,
"step": 6430
},
{
"epoch": 5.3137578697492005,
"grad_norm": 0.6557930111885071,
"learning_rate": 0.0009929351316634554,
"loss": 3.1927,
"step": 6440
},
{
"epoch": 5.322014655795232,
"grad_norm": 0.7558073401451111,
"learning_rate": 0.0009927516285897788,
"loss": 3.1934,
"step": 6450
},
{
"epoch": 5.330271441841263,
"grad_norm": 0.7387466430664062,
"learning_rate": 0.0009925681255161024,
"loss": 3.1974,
"step": 6460
},
{
"epoch": 5.338528227887295,
"grad_norm": 0.6814390420913696,
"learning_rate": 0.000992384622442426,
"loss": 3.1997,
"step": 6470
},
{
"epoch": 5.346785013933326,
"grad_norm": 0.7269142866134644,
"learning_rate": 0.0009922011193687495,
"loss": 3.2081,
"step": 6480
},
{
"epoch": 5.355041799979358,
"grad_norm": 0.7551733255386353,
"learning_rate": 0.000992017616295073,
"loss": 3.2013,
"step": 6490
},
{
"epoch": 5.3632985860253894,
"grad_norm": 0.6852086186408997,
"learning_rate": 0.0009918341132213965,
"loss": 3.1996,
"step": 6500
},
{
"epoch": 5.3715553720714215,
"grad_norm": 0.7068336009979248,
"learning_rate": 0.00099165061014772,
"loss": 3.197,
"step": 6510
},
{
"epoch": 5.379812158117453,
"grad_norm": 0.6530427932739258,
"learning_rate": 0.0009914671070740434,
"loss": 3.1939,
"step": 6520
},
{
"epoch": 5.388068944163484,
"grad_norm": 0.7301046252250671,
"learning_rate": 0.000991283604000367,
"loss": 3.1973,
"step": 6530
},
{
"epoch": 5.396325730209516,
"grad_norm": 0.6607205271720886,
"learning_rate": 0.0009911001009266906,
"loss": 3.1844,
"step": 6540
},
{
"epoch": 5.404582516255547,
"grad_norm": 0.7713533043861389,
"learning_rate": 0.000990916597853014,
"loss": 3.1842,
"step": 6550
},
{
"epoch": 5.412839302301579,
"grad_norm": 0.6876000165939331,
"learning_rate": 0.0009907330947793375,
"loss": 3.1819,
"step": 6560
},
{
"epoch": 5.4210960883476105,
"grad_norm": 0.7219623327255249,
"learning_rate": 0.000990549591705661,
"loss": 3.1839,
"step": 6570
},
{
"epoch": 5.429352874393643,
"grad_norm": 0.5987829566001892,
"learning_rate": 0.0009903660886319847,
"loss": 3.1795,
"step": 6580
},
{
"epoch": 5.437609660439674,
"grad_norm": 0.6070224046707153,
"learning_rate": 0.000990182585558308,
"loss": 3.1921,
"step": 6590
},
{
"epoch": 5.445866446485706,
"grad_norm": 0.648897647857666,
"learning_rate": 0.0009899990824846316,
"loss": 3.1893,
"step": 6600
},
{
"epoch": 5.454123232531737,
"grad_norm": 0.6025215983390808,
"learning_rate": 0.0009898155794109552,
"loss": 3.1848,
"step": 6610
},
{
"epoch": 5.462380018577768,
"grad_norm": 0.6439123749732971,
"learning_rate": 0.0009896320763372788,
"loss": 3.1912,
"step": 6620
},
{
"epoch": 5.4706368046238,
"grad_norm": 0.6637933254241943,
"learning_rate": 0.0009894485732636021,
"loss": 3.1695,
"step": 6630
},
{
"epoch": 5.4788935906698315,
"grad_norm": 0.6922410726547241,
"learning_rate": 0.0009892650701899257,
"loss": 3.1787,
"step": 6640
},
{
"epoch": 5.487150376715864,
"grad_norm": 0.6986757516860962,
"learning_rate": 0.000989081567116249,
"loss": 3.1635,
"step": 6650
},
{
"epoch": 5.495407162761895,
"grad_norm": 0.7966019511222839,
"learning_rate": 0.0009888980640425727,
"loss": 3.1833,
"step": 6660
},
{
"epoch": 5.503663948807927,
"grad_norm": 0.6623300313949585,
"learning_rate": 0.0009887145609688962,
"loss": 3.1653,
"step": 6670
},
{
"epoch": 5.511920734853958,
"grad_norm": 0.6772647500038147,
"learning_rate": 0.0009885310578952198,
"loss": 3.185,
"step": 6680
},
{
"epoch": 5.52017752089999,
"grad_norm": 0.69997239112854,
"learning_rate": 0.0009883475548215432,
"loss": 3.1709,
"step": 6690
},
{
"epoch": 5.528434306946021,
"grad_norm": 0.6997058987617493,
"learning_rate": 0.0009881640517478668,
"loss": 3.1592,
"step": 6700
},
{
"epoch": 5.5366910929920525,
"grad_norm": 0.6697850227355957,
"learning_rate": 0.0009879805486741903,
"loss": 3.1748,
"step": 6710
},
{
"epoch": 5.544947879038085,
"grad_norm": 0.6371259093284607,
"learning_rate": 0.000987797045600514,
"loss": 3.1651,
"step": 6720
},
{
"epoch": 5.553204665084116,
"grad_norm": 0.6484488844871521,
"learning_rate": 0.0009876135425268373,
"loss": 3.1566,
"step": 6730
},
{
"epoch": 5.561461451130148,
"grad_norm": 0.6380677223205566,
"learning_rate": 0.0009874300394531609,
"loss": 3.1609,
"step": 6740
},
{
"epoch": 5.569718237176179,
"grad_norm": 0.7111419439315796,
"learning_rate": 0.0009872465363794844,
"loss": 3.1685,
"step": 6750
},
{
"epoch": 5.57797502322221,
"grad_norm": 0.7145205140113831,
"learning_rate": 0.000987063033305808,
"loss": 3.1482,
"step": 6760
},
{
"epoch": 5.586231809268242,
"grad_norm": 0.6077954769134521,
"learning_rate": 0.0009868795302321314,
"loss": 3.1577,
"step": 6770
},
{
"epoch": 5.594488595314274,
"grad_norm": 0.6183308959007263,
"learning_rate": 0.000986696027158455,
"loss": 3.159,
"step": 6780
},
{
"epoch": 5.602745381360306,
"grad_norm": 0.7077763080596924,
"learning_rate": 0.0009865125240847783,
"loss": 3.1583,
"step": 6790
},
{
"epoch": 5.611002167406337,
"grad_norm": 0.7214525938034058,
"learning_rate": 0.000986329021011102,
"loss": 3.1493,
"step": 6800
},
{
"epoch": 5.619258953452369,
"grad_norm": 0.6487968564033508,
"learning_rate": 0.0009861455179374255,
"loss": 3.142,
"step": 6810
},
{
"epoch": 5.6275157394984,
"grad_norm": 0.7745679020881653,
"learning_rate": 0.000985962014863749,
"loss": 3.1525,
"step": 6820
},
{
"epoch": 5.635772525544432,
"grad_norm": 0.7056599259376526,
"learning_rate": 0.0009857785117900724,
"loss": 3.144,
"step": 6830
},
{
"epoch": 5.644029311590463,
"grad_norm": 0.7179878354072571,
"learning_rate": 0.000985595008716396,
"loss": 3.1379,
"step": 6840
},
{
"epoch": 5.652286097636495,
"grad_norm": 0.6427177786827087,
"learning_rate": 0.0009854115056427196,
"loss": 3.1306,
"step": 6850
},
{
"epoch": 5.660542883682527,
"grad_norm": 0.6616296768188477,
"learning_rate": 0.0009852280025690432,
"loss": 3.1354,
"step": 6860
},
{
"epoch": 5.668799669728558,
"grad_norm": 0.6171796917915344,
"learning_rate": 0.0009850444994953665,
"loss": 3.1296,
"step": 6870
},
{
"epoch": 5.67705645577459,
"grad_norm": 0.7268235087394714,
"learning_rate": 0.0009848609964216901,
"loss": 3.1347,
"step": 6880
},
{
"epoch": 5.685313241820621,
"grad_norm": 0.7473070621490479,
"learning_rate": 0.0009846774933480137,
"loss": 3.14,
"step": 6890
},
{
"epoch": 5.693570027866653,
"grad_norm": 0.6529579162597656,
"learning_rate": 0.0009844939902743373,
"loss": 3.1364,
"step": 6900
},
{
"epoch": 5.701826813912684,
"grad_norm": 0.6876893043518066,
"learning_rate": 0.0009843104872006606,
"loss": 3.1376,
"step": 6910
},
{
"epoch": 5.7100835999587165,
"grad_norm": 0.7397525310516357,
"learning_rate": 0.000984126984126984,
"loss": 3.1271,
"step": 6920
},
{
"epoch": 5.718340386004748,
"grad_norm": 0.7049607634544373,
"learning_rate": 0.0009839434810533076,
"loss": 3.1335,
"step": 6930
},
{
"epoch": 5.726597172050779,
"grad_norm": 0.6563366651535034,
"learning_rate": 0.0009837599779796312,
"loss": 3.1174,
"step": 6940
},
{
"epoch": 5.734853958096811,
"grad_norm": 0.7188289761543274,
"learning_rate": 0.0009835764749059547,
"loss": 3.1344,
"step": 6950
},
{
"epoch": 5.743110744142842,
"grad_norm": 0.6964494585990906,
"learning_rate": 0.000983392971832278,
"loss": 3.1183,
"step": 6960
},
{
"epoch": 5.751367530188874,
"grad_norm": 0.6440771818161011,
"learning_rate": 0.0009832094687586017,
"loss": 3.125,
"step": 6970
},
{
"epoch": 5.7596243162349055,
"grad_norm": 0.6640235185623169,
"learning_rate": 0.0009830259656849253,
"loss": 3.1288,
"step": 6980
},
{
"epoch": 5.7678811022809375,
"grad_norm": 0.660474956035614,
"learning_rate": 0.0009828424626112488,
"loss": 3.1244,
"step": 6990
},
{
"epoch": 5.776137888326969,
"grad_norm": 0.6896589994430542,
"learning_rate": 0.0009826589595375722,
"loss": 3.1238,
"step": 7000
},
{
"epoch": 5.784394674373,
"grad_norm": 0.6928004026412964,
"learning_rate": 0.0009824754564638958,
"loss": 3.1281,
"step": 7010
},
{
"epoch": 5.792651460419032,
"grad_norm": 0.6702253222465515,
"learning_rate": 0.0009822919533902194,
"loss": 3.1199,
"step": 7020
},
{
"epoch": 5.800908246465063,
"grad_norm": 0.6199045777320862,
"learning_rate": 0.000982108450316543,
"loss": 3.1273,
"step": 7030
},
{
"epoch": 5.809165032511095,
"grad_norm": 0.6956904530525208,
"learning_rate": 0.0009819249472428663,
"loss": 3.1273,
"step": 7040
},
{
"epoch": 5.8174218185571265,
"grad_norm": 0.7308268547058105,
"learning_rate": 0.0009817414441691899,
"loss": 3.1214,
"step": 7050
},
{
"epoch": 5.825678604603159,
"grad_norm": 0.6409997940063477,
"learning_rate": 0.0009815579410955132,
"loss": 3.1102,
"step": 7060
},
{
"epoch": 5.83393539064919,
"grad_norm": 0.6429135203361511,
"learning_rate": 0.0009813744380218368,
"loss": 3.1109,
"step": 7070
},
{
"epoch": 5.842192176695221,
"grad_norm": 0.7045457363128662,
"learning_rate": 0.0009811909349481604,
"loss": 3.1168,
"step": 7080
},
{
"epoch": 5.850448962741253,
"grad_norm": 0.6149047613143921,
"learning_rate": 0.000981007431874484,
"loss": 3.0994,
"step": 7090
},
{
"epoch": 5.858705748787284,
"grad_norm": 0.6406427621841431,
"learning_rate": 0.0009808239288008073,
"loss": 3.1184,
"step": 7100
},
{
"epoch": 5.866962534833316,
"grad_norm": 0.6805707216262817,
"learning_rate": 0.000980640425727131,
"loss": 3.112,
"step": 7110
},
{
"epoch": 5.8752193208793475,
"grad_norm": 0.6262876987457275,
"learning_rate": 0.0009804569226534545,
"loss": 3.1104,
"step": 7120
},
{
"epoch": 5.88347610692538,
"grad_norm": 0.7171155214309692,
"learning_rate": 0.000980273419579778,
"loss": 3.1017,
"step": 7130
},
{
"epoch": 5.891732892971411,
"grad_norm": 0.6478092670440674,
"learning_rate": 0.0009800899165061014,
"loss": 3.0997,
"step": 7140
},
{
"epoch": 5.899989679017443,
"grad_norm": 0.6612927317619324,
"learning_rate": 0.000979906413432425,
"loss": 3.0963,
"step": 7150
},
{
"epoch": 5.908246465063474,
"grad_norm": 0.689495325088501,
"learning_rate": 0.0009797229103587486,
"loss": 3.1038,
"step": 7160
},
{
"epoch": 5.916503251109505,
"grad_norm": 0.6566335558891296,
"learning_rate": 0.0009795394072850722,
"loss": 3.1062,
"step": 7170
},
{
"epoch": 5.924760037155537,
"grad_norm": 0.7480162382125854,
"learning_rate": 0.0009793559042113955,
"loss": 3.0928,
"step": 7180
},
{
"epoch": 5.9330168232015685,
"grad_norm": 0.6011252403259277,
"learning_rate": 0.0009791724011377191,
"loss": 3.1066,
"step": 7190
},
{
"epoch": 5.941273609247601,
"grad_norm": 0.6580034494400024,
"learning_rate": 0.0009789888980640425,
"loss": 3.0919,
"step": 7200
},
{
"epoch": 5.949530395293632,
"grad_norm": 0.6794580817222595,
"learning_rate": 0.000978805394990366,
"loss": 3.0912,
"step": 7210
},
{
"epoch": 5.957787181339664,
"grad_norm": 0.6901310682296753,
"learning_rate": 0.0009786218919166896,
"loss": 3.0976,
"step": 7220
},
{
"epoch": 5.966043967385695,
"grad_norm": 0.7033196687698364,
"learning_rate": 0.0009784383888430132,
"loss": 3.0969,
"step": 7230
},
{
"epoch": 5.974300753431727,
"grad_norm": 0.5777009129524231,
"learning_rate": 0.0009782548857693366,
"loss": 3.0985,
"step": 7240
},
{
"epoch": 5.982557539477758,
"grad_norm": 0.6438208818435669,
"learning_rate": 0.0009780713826956602,
"loss": 3.0915,
"step": 7250
},
{
"epoch": 5.99081432552379,
"grad_norm": 0.6833881139755249,
"learning_rate": 0.0009778878796219837,
"loss": 3.1012,
"step": 7260
},
{
"epoch": 5.999071111569822,
"grad_norm": 0.5851444602012634,
"learning_rate": 0.0009777043765483073,
"loss": 3.0961,
"step": 7270
},
{
"epoch": 6.006605428836825,
"grad_norm": 0.6323698163032532,
"learning_rate": 0.0009775208734746307,
"loss": 2.8359,
"step": 7280
},
{
"epoch": 6.014862214882857,
"grad_norm": 0.6244434118270874,
"learning_rate": 0.0009773373704009543,
"loss": 3.0765,
"step": 7290
},
{
"epoch": 6.023119000928888,
"grad_norm": 0.6204081177711487,
"learning_rate": 0.0009771538673272778,
"loss": 3.0839,
"step": 7300
},
{
"epoch": 6.03137578697492,
"grad_norm": 0.6735767126083374,
"learning_rate": 0.0009769703642536012,
"loss": 3.0678,
"step": 7310
},
{
"epoch": 6.039632573020952,
"grad_norm": 0.6244058609008789,
"learning_rate": 0.0009767868611799248,
"loss": 3.0883,
"step": 7320
},
{
"epoch": 6.047889359066983,
"grad_norm": 0.581949770450592,
"learning_rate": 0.0009766033581062484,
"loss": 3.082,
"step": 7330
},
{
"epoch": 6.056146145113015,
"grad_norm": 0.6072763204574585,
"learning_rate": 0.0009764198550325718,
"loss": 3.0771,
"step": 7340
},
{
"epoch": 6.064402931159046,
"grad_norm": 0.5899455547332764,
"learning_rate": 0.0009762363519588954,
"loss": 3.0745,
"step": 7350
},
{
"epoch": 6.072659717205078,
"grad_norm": 0.636332094669342,
"learning_rate": 0.0009760528488852189,
"loss": 3.0777,
"step": 7360
},
{
"epoch": 6.080916503251109,
"grad_norm": 0.623324990272522,
"learning_rate": 0.0009758693458115424,
"loss": 3.0854,
"step": 7370
},
{
"epoch": 6.0891732892971415,
"grad_norm": 0.6235571503639221,
"learning_rate": 0.0009756858427378658,
"loss": 3.0639,
"step": 7380
},
{
"epoch": 6.097430075343173,
"grad_norm": 0.657364010810852,
"learning_rate": 0.0009755023396641894,
"loss": 3.0843,
"step": 7390
},
{
"epoch": 6.105686861389204,
"grad_norm": 0.678801953792572,
"learning_rate": 0.0009753188365905129,
"loss": 3.0809,
"step": 7400
},
{
"epoch": 6.113943647435236,
"grad_norm": 0.6138309836387634,
"learning_rate": 0.0009751353335168365,
"loss": 3.0832,
"step": 7410
},
{
"epoch": 6.122200433481267,
"grad_norm": 0.6104526519775391,
"learning_rate": 0.0009749518304431599,
"loss": 3.0667,
"step": 7420
},
{
"epoch": 6.130457219527299,
"grad_norm": 0.6059489250183105,
"learning_rate": 0.0009747683273694835,
"loss": 3.0592,
"step": 7430
},
{
"epoch": 6.1387140055733305,
"grad_norm": 0.640777587890625,
"learning_rate": 0.000974584824295807,
"loss": 3.0643,
"step": 7440
},
{
"epoch": 6.1469707916193626,
"grad_norm": 0.7542473077774048,
"learning_rate": 0.0009744013212221306,
"loss": 3.064,
"step": 7450
},
{
"epoch": 6.155227577665394,
"grad_norm": 0.6118050217628479,
"learning_rate": 0.000974217818148454,
"loss": 3.0737,
"step": 7460
},
{
"epoch": 6.163484363711425,
"grad_norm": 0.6154510378837585,
"learning_rate": 0.0009740343150747776,
"loss": 3.0759,
"step": 7470
},
{
"epoch": 6.171741149757457,
"grad_norm": 0.6432428359985352,
"learning_rate": 0.0009738508120011011,
"loss": 3.0799,
"step": 7480
},
{
"epoch": 6.179997935803488,
"grad_norm": 0.705723226070404,
"learning_rate": 0.0009736673089274245,
"loss": 3.0685,
"step": 7490
},
{
"epoch": 6.18825472184952,
"grad_norm": 0.6126253008842468,
"learning_rate": 0.000973483805853748,
"loss": 3.0565,
"step": 7500
},
{
"epoch": 6.1965115078955515,
"grad_norm": 0.6755325198173523,
"learning_rate": 0.0009733003027800715,
"loss": 3.0589,
"step": 7510
},
{
"epoch": 6.204768293941584,
"grad_norm": 0.5887700319290161,
"learning_rate": 0.0009731167997063951,
"loss": 3.0569,
"step": 7520
},
{
"epoch": 6.213025079987615,
"grad_norm": 0.627024233341217,
"learning_rate": 0.0009729332966327185,
"loss": 3.053,
"step": 7530
},
{
"epoch": 6.221281866033646,
"grad_norm": 0.7310320734977722,
"learning_rate": 0.0009727497935590421,
"loss": 3.0601,
"step": 7540
},
{
"epoch": 6.229538652079678,
"grad_norm": 0.5800510048866272,
"learning_rate": 0.0009725662904853656,
"loss": 3.0632,
"step": 7550
},
{
"epoch": 6.237795438125709,
"grad_norm": 0.6462418437004089,
"learning_rate": 0.0009723827874116892,
"loss": 3.0601,
"step": 7560
},
{
"epoch": 6.246052224171741,
"grad_norm": 0.5978024005889893,
"learning_rate": 0.0009721992843380126,
"loss": 3.0607,
"step": 7570
},
{
"epoch": 6.2543090102177725,
"grad_norm": 0.7113734483718872,
"learning_rate": 0.0009720157812643362,
"loss": 3.0441,
"step": 7580
},
{
"epoch": 6.262565796263805,
"grad_norm": 0.7136105298995972,
"learning_rate": 0.0009718322781906597,
"loss": 3.0627,
"step": 7590
},
{
"epoch": 6.270822582309836,
"grad_norm": 0.6299401521682739,
"learning_rate": 0.0009716487751169833,
"loss": 3.0632,
"step": 7600
},
{
"epoch": 6.279079368355868,
"grad_norm": 0.5755194425582886,
"learning_rate": 0.0009714652720433067,
"loss": 3.0655,
"step": 7610
},
{
"epoch": 6.287336154401899,
"grad_norm": 0.6883841753005981,
"learning_rate": 0.0009712817689696303,
"loss": 3.0555,
"step": 7620
},
{
"epoch": 6.29559294044793,
"grad_norm": 0.6997891664505005,
"learning_rate": 0.0009710982658959537,
"loss": 3.0537,
"step": 7630
},
{
"epoch": 6.303849726493962,
"grad_norm": 0.7656093835830688,
"learning_rate": 0.0009709147628222773,
"loss": 3.0597,
"step": 7640
},
{
"epoch": 6.3121065125399936,
"grad_norm": 0.6529414653778076,
"learning_rate": 0.0009707312597486007,
"loss": 3.0395,
"step": 7650
},
{
"epoch": 6.320363298586026,
"grad_norm": 0.6436627507209778,
"learning_rate": 0.0009705477566749243,
"loss": 3.0545,
"step": 7660
},
{
"epoch": 6.328620084632057,
"grad_norm": 0.6693470478057861,
"learning_rate": 0.0009703642536012478,
"loss": 3.0467,
"step": 7670
},
{
"epoch": 6.336876870678089,
"grad_norm": 0.6640053987503052,
"learning_rate": 0.0009701807505275714,
"loss": 3.0484,
"step": 7680
},
{
"epoch": 6.34513365672412,
"grad_norm": 0.615193784236908,
"learning_rate": 0.0009699972474538948,
"loss": 3.0456,
"step": 7690
},
{
"epoch": 6.353390442770151,
"grad_norm": 0.6501012444496155,
"learning_rate": 0.0009698137443802184,
"loss": 3.0536,
"step": 7700
},
{
"epoch": 6.361647228816183,
"grad_norm": 0.7172884941101074,
"learning_rate": 0.0009696302413065419,
"loss": 3.041,
"step": 7710
},
{
"epoch": 6.369904014862215,
"grad_norm": 0.6863964200019836,
"learning_rate": 0.0009694467382328655,
"loss": 3.0422,
"step": 7720
},
{
"epoch": 6.378160800908247,
"grad_norm": 0.6568806171417236,
"learning_rate": 0.0009692632351591889,
"loss": 3.0516,
"step": 7730
},
{
"epoch": 6.386417586954278,
"grad_norm": 0.7293218374252319,
"learning_rate": 0.0009690797320855125,
"loss": 3.0388,
"step": 7740
},
{
"epoch": 6.39467437300031,
"grad_norm": 0.651716947555542,
"learning_rate": 0.000968896229011836,
"loss": 3.0529,
"step": 7750
},
{
"epoch": 6.402931159046341,
"grad_norm": 0.6633101105690002,
"learning_rate": 0.0009687127259381596,
"loss": 3.0411,
"step": 7760
},
{
"epoch": 6.411187945092372,
"grad_norm": 0.685100793838501,
"learning_rate": 0.0009685292228644829,
"loss": 3.0472,
"step": 7770
},
{
"epoch": 6.419444731138404,
"grad_norm": 0.6207525730133057,
"learning_rate": 0.0009683457197908065,
"loss": 3.0368,
"step": 7780
},
{
"epoch": 6.427701517184436,
"grad_norm": 0.6622489094734192,
"learning_rate": 0.00096816221671713,
"loss": 3.0413,
"step": 7790
},
{
"epoch": 6.435958303230468,
"grad_norm": 0.640729546546936,
"learning_rate": 0.0009679787136434536,
"loss": 3.0358,
"step": 7800
},
{
"epoch": 6.444215089276499,
"grad_norm": 0.6243358254432678,
"learning_rate": 0.000967795210569777,
"loss": 3.0353,
"step": 7810
},
{
"epoch": 6.452471875322531,
"grad_norm": 0.7254058718681335,
"learning_rate": 0.0009676117074961006,
"loss": 3.0346,
"step": 7820
},
{
"epoch": 6.460728661368562,
"grad_norm": 0.6251292824745178,
"learning_rate": 0.0009674282044224241,
"loss": 3.0504,
"step": 7830
},
{
"epoch": 6.468985447414594,
"grad_norm": 0.6604384779930115,
"learning_rate": 0.0009672447013487477,
"loss": 3.037,
"step": 7840
},
{
"epoch": 6.477242233460625,
"grad_norm": 0.6694011092185974,
"learning_rate": 0.0009670611982750711,
"loss": 3.0383,
"step": 7850
},
{
"epoch": 6.485499019506657,
"grad_norm": 0.6231392025947571,
"learning_rate": 0.0009668776952013947,
"loss": 3.0465,
"step": 7860
},
{
"epoch": 6.493755805552689,
"grad_norm": 0.6012188792228699,
"learning_rate": 0.0009666941921277182,
"loss": 3.0319,
"step": 7870
},
{
"epoch": 6.50201259159872,
"grad_norm": 0.5632750988006592,
"learning_rate": 0.0009665106890540418,
"loss": 3.0311,
"step": 7880
},
{
"epoch": 6.510269377644752,
"grad_norm": 0.6662549376487732,
"learning_rate": 0.0009663271859803652,
"loss": 3.0278,
"step": 7890
},
{
"epoch": 6.518526163690783,
"grad_norm": 0.6620095372200012,
"learning_rate": 0.0009661436829066887,
"loss": 3.0341,
"step": 7900
},
{
"epoch": 6.526782949736815,
"grad_norm": 0.6526013612747192,
"learning_rate": 0.0009659601798330122,
"loss": 3.044,
"step": 7910
},
{
"epoch": 6.5350397357828465,
"grad_norm": 0.6725477576255798,
"learning_rate": 0.0009657766767593358,
"loss": 3.0221,
"step": 7920
},
{
"epoch": 6.5432965218288786,
"grad_norm": 0.5865882039070129,
"learning_rate": 0.0009655931736856592,
"loss": 3.0441,
"step": 7930
},
{
"epoch": 6.55155330787491,
"grad_norm": 0.6650230288505554,
"learning_rate": 0.0009654096706119828,
"loss": 3.0306,
"step": 7940
},
{
"epoch": 6.559810093920941,
"grad_norm": 0.7044249773025513,
"learning_rate": 0.0009652261675383063,
"loss": 3.0343,
"step": 7950
},
{
"epoch": 6.568066879966973,
"grad_norm": 0.6340664625167847,
"learning_rate": 0.0009650426644646299,
"loss": 3.0324,
"step": 7960
},
{
"epoch": 6.576323666013004,
"grad_norm": 0.6298174262046814,
"learning_rate": 0.0009648591613909533,
"loss": 3.0411,
"step": 7970
},
{
"epoch": 6.584580452059036,
"grad_norm": 0.6297299265861511,
"learning_rate": 0.0009646756583172769,
"loss": 3.0303,
"step": 7980
},
{
"epoch": 6.5928372381050675,
"grad_norm": 0.6586875915527344,
"learning_rate": 0.0009644921552436004,
"loss": 3.0271,
"step": 7990
},
{
"epoch": 6.601094024151099,
"grad_norm": 0.6087930798530579,
"learning_rate": 0.000964308652169924,
"loss": 3.0277,
"step": 8000
},
{
"epoch": 6.609350810197131,
"grad_norm": 0.6917185187339783,
"learning_rate": 0.0009641251490962474,
"loss": 3.0362,
"step": 8010
},
{
"epoch": 6.617607596243162,
"grad_norm": 0.6129333972930908,
"learning_rate": 0.000963941646022571,
"loss": 3.0206,
"step": 8020
},
{
"epoch": 6.625864382289194,
"grad_norm": 0.5826658606529236,
"learning_rate": 0.0009637581429488944,
"loss": 3.0132,
"step": 8030
},
{
"epoch": 6.634121168335225,
"grad_norm": 0.6190428733825684,
"learning_rate": 0.0009635746398752179,
"loss": 3.0196,
"step": 8040
},
{
"epoch": 6.642377954381257,
"grad_norm": 0.6231646537780762,
"learning_rate": 0.0009633911368015414,
"loss": 3.0279,
"step": 8050
},
{
"epoch": 6.6506347404272885,
"grad_norm": 0.7201693058013916,
"learning_rate": 0.0009632076337278649,
"loss": 3.0134,
"step": 8060
},
{
"epoch": 6.658891526473321,
"grad_norm": 0.616397500038147,
"learning_rate": 0.0009630241306541885,
"loss": 3.0182,
"step": 8070
},
{
"epoch": 6.667148312519352,
"grad_norm": 0.6851087212562561,
"learning_rate": 0.0009628406275805119,
"loss": 3.0179,
"step": 8080
},
{
"epoch": 6.675405098565383,
"grad_norm": 0.6185948252677917,
"learning_rate": 0.0009626571245068355,
"loss": 3.0191,
"step": 8090
},
{
"epoch": 6.683661884611415,
"grad_norm": 0.5413244962692261,
"learning_rate": 0.000962473621433159,
"loss": 3.0243,
"step": 8100
},
{
"epoch": 6.691918670657446,
"grad_norm": 0.7104983925819397,
"learning_rate": 0.0009622901183594826,
"loss": 3.0189,
"step": 8110
},
{
"epoch": 6.700175456703478,
"grad_norm": 0.5723142623901367,
"learning_rate": 0.000962106615285806,
"loss": 3.0015,
"step": 8120
},
{
"epoch": 6.7084322427495096,
"grad_norm": 0.6276829242706299,
"learning_rate": 0.0009619231122121296,
"loss": 3.0215,
"step": 8130
},
{
"epoch": 6.716689028795542,
"grad_norm": 0.6671704053878784,
"learning_rate": 0.0009617396091384531,
"loss": 3.0153,
"step": 8140
},
{
"epoch": 6.724945814841573,
"grad_norm": 0.7471591234207153,
"learning_rate": 0.0009615561060647767,
"loss": 3.0091,
"step": 8150
},
{
"epoch": 6.733202600887605,
"grad_norm": 0.6197100281715393,
"learning_rate": 0.0009613726029911,
"loss": 3.0037,
"step": 8160
},
{
"epoch": 6.741459386933636,
"grad_norm": 0.6177218556404114,
"learning_rate": 0.0009611890999174236,
"loss": 3.0122,
"step": 8170
},
{
"epoch": 6.749716172979667,
"grad_norm": 0.6349440813064575,
"learning_rate": 0.0009610055968437471,
"loss": 3.0155,
"step": 8180
},
{
"epoch": 6.757972959025699,
"grad_norm": 0.6462443470954895,
"learning_rate": 0.0009608220937700707,
"loss": 3.0141,
"step": 8190
},
{
"epoch": 6.766229745071731,
"grad_norm": 0.7159162163734436,
"learning_rate": 0.0009606385906963941,
"loss": 3.0119,
"step": 8200
},
{
"epoch": 6.774486531117763,
"grad_norm": 0.592444658279419,
"learning_rate": 0.0009604550876227177,
"loss": 3.0087,
"step": 8210
},
{
"epoch": 6.782743317163794,
"grad_norm": 0.6107344627380371,
"learning_rate": 0.0009602715845490412,
"loss": 3.0186,
"step": 8220
},
{
"epoch": 6.791000103209826,
"grad_norm": 0.6150995492935181,
"learning_rate": 0.0009600880814753648,
"loss": 3.0112,
"step": 8230
},
{
"epoch": 6.799256889255857,
"grad_norm": 0.6124362945556641,
"learning_rate": 0.0009599045784016882,
"loss": 2.9989,
"step": 8240
},
{
"epoch": 6.807513675301889,
"grad_norm": 0.6340455412864685,
"learning_rate": 0.0009597210753280118,
"loss": 3.0154,
"step": 8250
},
{
"epoch": 6.81577046134792,
"grad_norm": 0.5861290097236633,
"learning_rate": 0.0009595375722543353,
"loss": 3.0115,
"step": 8260
},
{
"epoch": 6.824027247393952,
"grad_norm": 0.5904505848884583,
"learning_rate": 0.0009593540691806589,
"loss": 3.0065,
"step": 8270
},
{
"epoch": 6.832284033439984,
"grad_norm": 0.6523525714874268,
"learning_rate": 0.0009591705661069823,
"loss": 3.0069,
"step": 8280
},
{
"epoch": 6.840540819486015,
"grad_norm": 0.6429992318153381,
"learning_rate": 0.0009589870630333058,
"loss": 3.0033,
"step": 8290
},
{
"epoch": 6.848797605532047,
"grad_norm": 0.6393450498580933,
"learning_rate": 0.0009588035599596293,
"loss": 3.0094,
"step": 8300
},
{
"epoch": 6.857054391578078,
"grad_norm": 0.6140925884246826,
"learning_rate": 0.0009586200568859529,
"loss": 3.0092,
"step": 8310
},
{
"epoch": 6.865311177624109,
"grad_norm": 0.5966553092002869,
"learning_rate": 0.0009584365538122763,
"loss": 3.005,
"step": 8320
},
{
"epoch": 6.8735679636701414,
"grad_norm": 0.5963024497032166,
"learning_rate": 0.0009582530507385999,
"loss": 3.009,
"step": 8330
},
{
"epoch": 6.881824749716173,
"grad_norm": 0.5785512924194336,
"learning_rate": 0.0009580695476649234,
"loss": 3.0075,
"step": 8340
},
{
"epoch": 6.890081535762205,
"grad_norm": 0.5979735851287842,
"learning_rate": 0.000957886044591247,
"loss": 2.9997,
"step": 8350
},
{
"epoch": 6.898338321808236,
"grad_norm": 0.6088021397590637,
"learning_rate": 0.0009577025415175704,
"loss": 3.0102,
"step": 8360
},
{
"epoch": 6.906595107854268,
"grad_norm": 0.6511215567588806,
"learning_rate": 0.000957519038443894,
"loss": 3.0058,
"step": 8370
},
{
"epoch": 6.914851893900299,
"grad_norm": 0.6001556515693665,
"learning_rate": 0.0009573355353702175,
"loss": 3.0005,
"step": 8380
},
{
"epoch": 6.923108679946331,
"grad_norm": 0.7033063173294067,
"learning_rate": 0.000957152032296541,
"loss": 2.9954,
"step": 8390
},
{
"epoch": 6.9313654659923625,
"grad_norm": 0.6751210689544678,
"learning_rate": 0.0009569685292228645,
"loss": 3.0032,
"step": 8400
},
{
"epoch": 6.939622252038394,
"grad_norm": 0.6629015207290649,
"learning_rate": 0.0009567850261491881,
"loss": 3.0148,
"step": 8410
},
{
"epoch": 6.947879038084426,
"grad_norm": 0.6272764801979065,
"learning_rate": 0.0009566015230755115,
"loss": 3.002,
"step": 8420
},
{
"epoch": 6.956135824130457,
"grad_norm": 0.6458156108856201,
"learning_rate": 0.000956418020001835,
"loss": 3.0066,
"step": 8430
},
{
"epoch": 6.964392610176489,
"grad_norm": 0.6023524403572083,
"learning_rate": 0.0009562345169281585,
"loss": 2.9992,
"step": 8440
},
{
"epoch": 6.97264939622252,
"grad_norm": 0.6430317759513855,
"learning_rate": 0.0009560510138544821,
"loss": 2.9976,
"step": 8450
},
{
"epoch": 6.980906182268552,
"grad_norm": 0.6168457269668579,
"learning_rate": 0.0009558675107808056,
"loss": 2.9931,
"step": 8460
},
{
"epoch": 6.9891629683145835,
"grad_norm": 0.6400942802429199,
"learning_rate": 0.0009556840077071291,
"loss": 2.9795,
"step": 8470
},
{
"epoch": 6.997419754360616,
"grad_norm": 0.5995707511901855,
"learning_rate": 0.0009555005046334526,
"loss": 3.0002,
"step": 8480
},
{
"epoch": 7.004954071627619,
"grad_norm": 0.7327253222465515,
"learning_rate": 0.0009553170015597762,
"loss": 2.7253,
"step": 8490
},
{
"epoch": 7.01321085767365,
"grad_norm": 0.6455899477005005,
"learning_rate": 0.0009551334984860997,
"loss": 2.9832,
"step": 8500
},
{
"epoch": 7.021467643719682,
"grad_norm": 0.6111765503883362,
"learning_rate": 0.0009549499954124232,
"loss": 2.9918,
"step": 8510
},
{
"epoch": 7.0297244297657135,
"grad_norm": 0.6223667860031128,
"learning_rate": 0.0009547664923387467,
"loss": 2.9752,
"step": 8520
},
{
"epoch": 7.037981215811746,
"grad_norm": 0.6821649074554443,
"learning_rate": 0.0009545829892650703,
"loss": 2.9938,
"step": 8530
},
{
"epoch": 7.046238001857777,
"grad_norm": 0.5645655989646912,
"learning_rate": 0.0009543994861913938,
"loss": 2.9932,
"step": 8540
},
{
"epoch": 7.054494787903808,
"grad_norm": 0.6132038235664368,
"learning_rate": 0.0009542159831177172,
"loss": 2.9796,
"step": 8550
},
{
"epoch": 7.06275157394984,
"grad_norm": 0.6503163576126099,
"learning_rate": 0.0009540324800440407,
"loss": 2.9844,
"step": 8560
},
{
"epoch": 7.071008359995871,
"grad_norm": 0.5986816883087158,
"learning_rate": 0.0009538489769703643,
"loss": 2.9817,
"step": 8570
},
{
"epoch": 7.079265146041903,
"grad_norm": 0.6171458959579468,
"learning_rate": 0.0009536654738966878,
"loss": 2.9802,
"step": 8580
},
{
"epoch": 7.087521932087935,
"grad_norm": 0.624758243560791,
"learning_rate": 0.0009534819708230113,
"loss": 2.9903,
"step": 8590
},
{
"epoch": 7.095778718133967,
"grad_norm": 0.6675239205360413,
"learning_rate": 0.0009532984677493348,
"loss": 2.9838,
"step": 8600
},
{
"epoch": 7.104035504179998,
"grad_norm": 0.6595028042793274,
"learning_rate": 0.0009531149646756584,
"loss": 2.9938,
"step": 8610
},
{
"epoch": 7.11229229022603,
"grad_norm": 0.7010105848312378,
"learning_rate": 0.0009529314616019819,
"loss": 2.9871,
"step": 8620
},
{
"epoch": 7.120549076272061,
"grad_norm": 0.6516680121421814,
"learning_rate": 0.0009527479585283053,
"loss": 2.9889,
"step": 8630
},
{
"epoch": 7.128805862318092,
"grad_norm": 0.6057817935943604,
"learning_rate": 0.0009525644554546289,
"loss": 2.9926,
"step": 8640
},
{
"epoch": 7.137062648364124,
"grad_norm": 0.6336268782615662,
"learning_rate": 0.0009523809523809524,
"loss": 2.9963,
"step": 8650
},
{
"epoch": 7.145319434410156,
"grad_norm": 0.5994205474853516,
"learning_rate": 0.000952197449307276,
"loss": 2.9847,
"step": 8660
},
{
"epoch": 7.153576220456188,
"grad_norm": 0.6255319118499756,
"learning_rate": 0.0009520139462335994,
"loss": 2.98,
"step": 8670
},
{
"epoch": 7.161833006502219,
"grad_norm": 0.5612902641296387,
"learning_rate": 0.0009518304431599229,
"loss": 2.9802,
"step": 8680
},
{
"epoch": 7.170089792548251,
"grad_norm": 0.6441757082939148,
"learning_rate": 0.0009516469400862464,
"loss": 2.9847,
"step": 8690
},
{
"epoch": 7.178346578594282,
"grad_norm": 0.6565569639205933,
"learning_rate": 0.00095146343701257,
"loss": 2.9879,
"step": 8700
},
{
"epoch": 7.186603364640313,
"grad_norm": 0.609322726726532,
"learning_rate": 0.0009512799339388934,
"loss": 2.9798,
"step": 8710
},
{
"epoch": 7.194860150686345,
"grad_norm": 0.6805379986763,
"learning_rate": 0.000951096430865217,
"loss": 2.9741,
"step": 8720
},
{
"epoch": 7.203116936732377,
"grad_norm": 0.674920380115509,
"learning_rate": 0.0009509129277915405,
"loss": 2.9833,
"step": 8730
},
{
"epoch": 7.211373722778409,
"grad_norm": 0.6178304553031921,
"learning_rate": 0.000950729424717864,
"loss": 2.9818,
"step": 8740
},
{
"epoch": 7.21963050882444,
"grad_norm": 0.5889567136764526,
"learning_rate": 0.0009505459216441875,
"loss": 2.9855,
"step": 8750
},
{
"epoch": 7.227887294870472,
"grad_norm": 0.5856685638427734,
"learning_rate": 0.0009503624185705111,
"loss": 2.97,
"step": 8760
},
{
"epoch": 7.236144080916503,
"grad_norm": 0.660362958908081,
"learning_rate": 0.0009501789154968346,
"loss": 2.9745,
"step": 8770
},
{
"epoch": 7.244400866962534,
"grad_norm": 0.7222636342048645,
"learning_rate": 0.0009499954124231582,
"loss": 2.9836,
"step": 8780
},
{
"epoch": 7.2526576530085665,
"grad_norm": 0.7483038306236267,
"learning_rate": 0.0009498119093494816,
"loss": 2.9743,
"step": 8790
},
{
"epoch": 7.260914439054598,
"grad_norm": 0.6627931594848633,
"learning_rate": 0.0009496284062758052,
"loss": 2.9716,
"step": 8800
},
{
"epoch": 7.26917122510063,
"grad_norm": 0.6666322350502014,
"learning_rate": 0.0009494449032021286,
"loss": 2.9693,
"step": 8810
},
{
"epoch": 7.277428011146661,
"grad_norm": 0.6174741387367249,
"learning_rate": 0.0009492614001284521,
"loss": 2.9638,
"step": 8820
},
{
"epoch": 7.285684797192693,
"grad_norm": 0.5936954617500305,
"learning_rate": 0.0009490778970547756,
"loss": 2.9701,
"step": 8830
},
{
"epoch": 7.293941583238724,
"grad_norm": 0.6383837461471558,
"learning_rate": 0.0009488943939810992,
"loss": 2.9706,
"step": 8840
},
{
"epoch": 7.302198369284756,
"grad_norm": 0.6035402417182922,
"learning_rate": 0.0009487108909074227,
"loss": 2.9639,
"step": 8850
},
{
"epoch": 7.3104551553307875,
"grad_norm": 0.6518993377685547,
"learning_rate": 0.0009485273878337462,
"loss": 2.9693,
"step": 8860
},
{
"epoch": 7.318711941376819,
"grad_norm": 0.5939560532569885,
"learning_rate": 0.0009483438847600697,
"loss": 2.9686,
"step": 8870
},
{
"epoch": 7.326968727422851,
"grad_norm": 0.6224295496940613,
"learning_rate": 0.0009481603816863933,
"loss": 2.9779,
"step": 8880
},
{
"epoch": 7.335225513468882,
"grad_norm": 0.6374024748802185,
"learning_rate": 0.0009479768786127168,
"loss": 2.969,
"step": 8890
},
{
"epoch": 7.343482299514914,
"grad_norm": 0.6577615141868591,
"learning_rate": 0.0009477933755390403,
"loss": 2.9645,
"step": 8900
},
{
"epoch": 7.351739085560945,
"grad_norm": 0.659116268157959,
"learning_rate": 0.0009476098724653638,
"loss": 2.9694,
"step": 8910
},
{
"epoch": 7.359995871606977,
"grad_norm": 0.618446946144104,
"learning_rate": 0.0009474263693916874,
"loss": 2.9615,
"step": 8920
},
{
"epoch": 7.3682526576530085,
"grad_norm": 0.6356460452079773,
"learning_rate": 0.0009472428663180109,
"loss": 2.9812,
"step": 8930
},
{
"epoch": 7.376509443699041,
"grad_norm": 0.5520789623260498,
"learning_rate": 0.0009470593632443344,
"loss": 2.9557,
"step": 8940
},
{
"epoch": 7.384766229745072,
"grad_norm": 0.6499543190002441,
"learning_rate": 0.0009468758601706578,
"loss": 2.9669,
"step": 8950
},
{
"epoch": 7.393023015791103,
"grad_norm": 0.6642090678215027,
"learning_rate": 0.0009466923570969814,
"loss": 2.956,
"step": 8960
},
{
"epoch": 7.401279801837135,
"grad_norm": 0.6019958257675171,
"learning_rate": 0.0009465088540233049,
"loss": 2.962,
"step": 8970
},
{
"epoch": 7.409536587883166,
"grad_norm": 0.6056467890739441,
"learning_rate": 0.0009463253509496284,
"loss": 2.9704,
"step": 8980
},
{
"epoch": 7.417793373929198,
"grad_norm": 0.5770221948623657,
"learning_rate": 0.0009461418478759519,
"loss": 2.9487,
"step": 8990
},
{
"epoch": 7.4260501599752295,
"grad_norm": 0.5907398462295532,
"learning_rate": 0.0009459583448022755,
"loss": 2.9609,
"step": 9000
},
{
"epoch": 7.434306946021262,
"grad_norm": 0.6140010952949524,
"learning_rate": 0.000945774841728599,
"loss": 2.9691,
"step": 9010
},
{
"epoch": 7.442563732067293,
"grad_norm": 0.5944181084632874,
"learning_rate": 0.0009455913386549225,
"loss": 2.957,
"step": 9020
},
{
"epoch": 7.450820518113324,
"grad_norm": 0.6197523474693298,
"learning_rate": 0.000945407835581246,
"loss": 2.9656,
"step": 9030
},
{
"epoch": 7.459077304159356,
"grad_norm": 0.6460192799568176,
"learning_rate": 0.0009452243325075696,
"loss": 2.9599,
"step": 9040
},
{
"epoch": 7.467334090205387,
"grad_norm": 0.6181427836418152,
"learning_rate": 0.0009450408294338931,
"loss": 2.9458,
"step": 9050
},
{
"epoch": 7.475590876251419,
"grad_norm": 0.6719056367874146,
"learning_rate": 0.0009448573263602166,
"loss": 2.9665,
"step": 9060
},
{
"epoch": 7.483847662297451,
"grad_norm": 0.6406500339508057,
"learning_rate": 0.0009446738232865401,
"loss": 2.9555,
"step": 9070
},
{
"epoch": 7.492104448343483,
"grad_norm": 0.6553565263748169,
"learning_rate": 0.0009444903202128636,
"loss": 2.9581,
"step": 9080
},
{
"epoch": 7.500361234389514,
"grad_norm": 0.5775774121284485,
"learning_rate": 0.000944306817139187,
"loss": 2.9582,
"step": 9090
},
{
"epoch": 7.508618020435545,
"grad_norm": 0.6064974665641785,
"learning_rate": 0.0009441233140655106,
"loss": 2.9567,
"step": 9100
},
{
"epoch": 7.516874806481577,
"grad_norm": 0.6577678322792053,
"learning_rate": 0.0009439398109918341,
"loss": 2.9727,
"step": 9110
},
{
"epoch": 7.525131592527608,
"grad_norm": 0.7013944387435913,
"learning_rate": 0.0009437563079181577,
"loss": 2.9625,
"step": 9120
},
{
"epoch": 7.53338837857364,
"grad_norm": 0.5832070112228394,
"learning_rate": 0.0009435728048444812,
"loss": 2.9544,
"step": 9130
},
{
"epoch": 7.541645164619672,
"grad_norm": 0.633455753326416,
"learning_rate": 0.0009433893017708047,
"loss": 2.966,
"step": 9140
},
{
"epoch": 7.549901950665704,
"grad_norm": 0.6928477883338928,
"learning_rate": 0.0009432057986971282,
"loss": 2.9606,
"step": 9150
},
{
"epoch": 7.558158736711735,
"grad_norm": 0.6043297052383423,
"learning_rate": 0.0009430222956234518,
"loss": 2.965,
"step": 9160
},
{
"epoch": 7.566415522757767,
"grad_norm": 0.6551850438117981,
"learning_rate": 0.0009428387925497753,
"loss": 2.9584,
"step": 9170
},
{
"epoch": 7.574672308803798,
"grad_norm": 0.5572656989097595,
"learning_rate": 0.0009426552894760988,
"loss": 2.9622,
"step": 9180
},
{
"epoch": 7.582929094849829,
"grad_norm": 0.5612010359764099,
"learning_rate": 0.0009424717864024223,
"loss": 2.9484,
"step": 9190
},
{
"epoch": 7.591185880895861,
"grad_norm": 0.6252767443656921,
"learning_rate": 0.0009422882833287458,
"loss": 2.9578,
"step": 9200
},
{
"epoch": 7.599442666941893,
"grad_norm": 0.569965124130249,
"learning_rate": 0.0009421047802550692,
"loss": 2.9554,
"step": 9210
},
{
"epoch": 7.607699452987925,
"grad_norm": 0.6037718057632446,
"learning_rate": 0.0009419212771813927,
"loss": 2.9417,
"step": 9220
},
{
"epoch": 7.615956239033956,
"grad_norm": 0.5498155355453491,
"learning_rate": 0.0009417377741077163,
"loss": 2.9473,
"step": 9230
},
{
"epoch": 7.624213025079988,
"grad_norm": 0.6004564166069031,
"learning_rate": 0.0009415542710340398,
"loss": 2.9404,
"step": 9240
},
{
"epoch": 7.632469811126019,
"grad_norm": 0.6017456650733948,
"learning_rate": 0.0009413707679603633,
"loss": 2.9513,
"step": 9250
},
{
"epoch": 7.640726597172051,
"grad_norm": 0.6328597068786621,
"learning_rate": 0.0009411872648866868,
"loss": 2.9446,
"step": 9260
},
{
"epoch": 7.6489833832180825,
"grad_norm": 0.5953946709632874,
"learning_rate": 0.0009410037618130104,
"loss": 2.9435,
"step": 9270
},
{
"epoch": 7.657240169264114,
"grad_norm": 0.6098210215568542,
"learning_rate": 0.0009408202587393339,
"loss": 2.9538,
"step": 9280
},
{
"epoch": 7.665496955310146,
"grad_norm": 0.592674732208252,
"learning_rate": 0.0009406367556656574,
"loss": 2.9527,
"step": 9290
},
{
"epoch": 7.673753741356177,
"grad_norm": 0.5980309247970581,
"learning_rate": 0.0009404532525919809,
"loss": 2.9348,
"step": 9300
},
{
"epoch": 7.682010527402209,
"grad_norm": 0.5754213333129883,
"learning_rate": 0.0009402697495183045,
"loss": 2.9523,
"step": 9310
},
{
"epoch": 7.69026731344824,
"grad_norm": 0.624748945236206,
"learning_rate": 0.000940086246444628,
"loss": 2.9538,
"step": 9320
},
{
"epoch": 7.698524099494271,
"grad_norm": 0.5637576580047607,
"learning_rate": 0.0009399027433709515,
"loss": 2.9414,
"step": 9330
},
{
"epoch": 7.7067808855403035,
"grad_norm": 0.6265804171562195,
"learning_rate": 0.0009397192402972749,
"loss": 2.9429,
"step": 9340
},
{
"epoch": 7.715037671586335,
"grad_norm": 0.6041392087936401,
"learning_rate": 0.0009395357372235985,
"loss": 2.9428,
"step": 9350
},
{
"epoch": 7.723294457632367,
"grad_norm": 0.5320299863815308,
"learning_rate": 0.000939352234149922,
"loss": 2.9391,
"step": 9360
},
{
"epoch": 7.731551243678398,
"grad_norm": 0.6173900365829468,
"learning_rate": 0.0009391687310762455,
"loss": 2.9374,
"step": 9370
},
{
"epoch": 7.73980802972443,
"grad_norm": 0.5725083351135254,
"learning_rate": 0.000938985228002569,
"loss": 2.9609,
"step": 9380
},
{
"epoch": 7.748064815770461,
"grad_norm": 0.5768330097198486,
"learning_rate": 0.0009388017249288926,
"loss": 2.9498,
"step": 9390
},
{
"epoch": 7.756321601816493,
"grad_norm": 0.6300333142280579,
"learning_rate": 0.0009386182218552161,
"loss": 2.9493,
"step": 9400
},
{
"epoch": 7.7645783878625245,
"grad_norm": 0.6431629061698914,
"learning_rate": 0.0009384347187815396,
"loss": 2.9324,
"step": 9410
},
{
"epoch": 7.772835173908556,
"grad_norm": 0.5805600881576538,
"learning_rate": 0.0009382512157078631,
"loss": 2.947,
"step": 9420
},
{
"epoch": 7.781091959954588,
"grad_norm": 0.6539075970649719,
"learning_rate": 0.0009380677126341867,
"loss": 2.9421,
"step": 9430
},
{
"epoch": 7.789348746000619,
"grad_norm": 0.6129085421562195,
"learning_rate": 0.0009378842095605102,
"loss": 2.9413,
"step": 9440
},
{
"epoch": 7.797605532046651,
"grad_norm": 0.6538434624671936,
"learning_rate": 0.0009377007064868337,
"loss": 2.935,
"step": 9450
},
{
"epoch": 7.805862318092682,
"grad_norm": 0.617875337600708,
"learning_rate": 0.0009375172034131572,
"loss": 2.9439,
"step": 9460
},
{
"epoch": 7.814119104138714,
"grad_norm": 0.6133493781089783,
"learning_rate": 0.0009373337003394807,
"loss": 2.9428,
"step": 9470
},
{
"epoch": 7.8223758901847456,
"grad_norm": 0.6544171571731567,
"learning_rate": 0.0009371501972658042,
"loss": 2.936,
"step": 9480
},
{
"epoch": 7.830632676230778,
"grad_norm": 0.6270118355751038,
"learning_rate": 0.0009369666941921277,
"loss": 2.9486,
"step": 9490
},
{
"epoch": 7.838889462276809,
"grad_norm": 0.6458065509796143,
"learning_rate": 0.0009367831911184512,
"loss": 2.9396,
"step": 9500
},
{
"epoch": 7.84714624832284,
"grad_norm": 0.6657986640930176,
"learning_rate": 0.0009365996880447748,
"loss": 2.9461,
"step": 9510
},
{
"epoch": 7.855403034368872,
"grad_norm": 0.6538524627685547,
"learning_rate": 0.0009364161849710983,
"loss": 2.9358,
"step": 9520
},
{
"epoch": 7.863659820414903,
"grad_norm": 0.6204900741577148,
"learning_rate": 0.0009362326818974218,
"loss": 2.9375,
"step": 9530
},
{
"epoch": 7.871916606460935,
"grad_norm": 0.5772661566734314,
"learning_rate": 0.0009360491788237453,
"loss": 2.9371,
"step": 9540
},
{
"epoch": 7.880173392506967,
"grad_norm": 0.7631484270095825,
"learning_rate": 0.0009358656757500689,
"loss": 2.9518,
"step": 9550
},
{
"epoch": 7.888430178552998,
"grad_norm": 0.5904896855354309,
"learning_rate": 0.0009356821726763924,
"loss": 2.9401,
"step": 9560
},
{
"epoch": 7.89668696459903,
"grad_norm": 0.6027041077613831,
"learning_rate": 0.0009354986696027159,
"loss": 2.935,
"step": 9570
},
{
"epoch": 7.904943750645061,
"grad_norm": 0.5784376859664917,
"learning_rate": 0.0009353151665290394,
"loss": 2.9314,
"step": 9580
},
{
"epoch": 7.913200536691093,
"grad_norm": 0.6234803795814514,
"learning_rate": 0.000935131663455363,
"loss": 2.9341,
"step": 9590
},
{
"epoch": 7.921457322737124,
"grad_norm": 0.5850915312767029,
"learning_rate": 0.0009349481603816863,
"loss": 2.9266,
"step": 9600
},
{
"epoch": 7.929714108783156,
"grad_norm": 0.6063703894615173,
"learning_rate": 0.0009347646573080099,
"loss": 2.9421,
"step": 9610
},
{
"epoch": 7.937970894829188,
"grad_norm": 0.5547103881835938,
"learning_rate": 0.0009345811542343334,
"loss": 2.9294,
"step": 9620
},
{
"epoch": 7.94622768087522,
"grad_norm": 0.5692980885505676,
"learning_rate": 0.000934397651160657,
"loss": 2.9347,
"step": 9630
},
{
"epoch": 7.954484466921251,
"grad_norm": 0.6392699480056763,
"learning_rate": 0.0009342141480869804,
"loss": 2.9386,
"step": 9640
},
{
"epoch": 7.962741252967282,
"grad_norm": 0.5906763076782227,
"learning_rate": 0.000934030645013304,
"loss": 2.9407,
"step": 9650
},
{
"epoch": 7.970998039013314,
"grad_norm": 0.5717517733573914,
"learning_rate": 0.0009338471419396275,
"loss": 2.93,
"step": 9660
},
{
"epoch": 7.979254825059345,
"grad_norm": 0.63603675365448,
"learning_rate": 0.0009336636388659511,
"loss": 2.9334,
"step": 9670
},
{
"epoch": 7.987511611105377,
"grad_norm": 0.6233087778091431,
"learning_rate": 0.0009334801357922745,
"loss": 2.9247,
"step": 9680
},
{
"epoch": 7.995768397151409,
"grad_norm": 0.6149667501449585,
"learning_rate": 0.0009332966327185981,
"loss": 2.9218,
"step": 9690
},
{
"epoch": 8.003302714418412,
"grad_norm": 0.6047292947769165,
"learning_rate": 0.0009331131296449216,
"loss": 2.6704,
"step": 9700
},
{
"epoch": 8.011559500464445,
"grad_norm": 0.6108692288398743,
"learning_rate": 0.0009329296265712452,
"loss": 2.9237,
"step": 9710
},
{
"epoch": 8.019816286510476,
"grad_norm": 0.5642316341400146,
"learning_rate": 0.0009327461234975686,
"loss": 2.9258,
"step": 9720
},
{
"epoch": 8.028073072556507,
"grad_norm": 0.6315813660621643,
"learning_rate": 0.0009325626204238921,
"loss": 2.9293,
"step": 9730
},
{
"epoch": 8.036329858602539,
"grad_norm": 0.6231210827827454,
"learning_rate": 0.0009323791173502156,
"loss": 2.9161,
"step": 9740
},
{
"epoch": 8.04458664464857,
"grad_norm": 0.5583593249320984,
"learning_rate": 0.0009321956142765392,
"loss": 2.923,
"step": 9750
},
{
"epoch": 8.052843430694603,
"grad_norm": 0.5963938236236572,
"learning_rate": 0.0009320121112028626,
"loss": 2.9282,
"step": 9760
},
{
"epoch": 8.061100216740634,
"grad_norm": 0.6553643941879272,
"learning_rate": 0.0009318286081291861,
"loss": 2.9218,
"step": 9770
},
{
"epoch": 8.069357002786665,
"grad_norm": 0.5880711674690247,
"learning_rate": 0.0009316451050555097,
"loss": 2.9278,
"step": 9780
},
{
"epoch": 8.077613788832696,
"grad_norm": 0.584306001663208,
"learning_rate": 0.0009314616019818332,
"loss": 2.9275,
"step": 9790
},
{
"epoch": 8.085870574878728,
"grad_norm": 0.655783474445343,
"learning_rate": 0.0009312780989081567,
"loss": 2.9148,
"step": 9800
},
{
"epoch": 8.09412736092476,
"grad_norm": 0.6076985001564026,
"learning_rate": 0.0009310945958344802,
"loss": 2.9243,
"step": 9810
},
{
"epoch": 8.102384146970792,
"grad_norm": 0.5802444815635681,
"learning_rate": 0.0009309110927608038,
"loss": 2.9269,
"step": 9820
},
{
"epoch": 8.110640933016823,
"grad_norm": 0.6020260453224182,
"learning_rate": 0.0009307275896871273,
"loss": 2.9156,
"step": 9830
},
{
"epoch": 8.118897719062854,
"grad_norm": 0.6201086044311523,
"learning_rate": 0.0009305440866134508,
"loss": 2.9187,
"step": 9840
},
{
"epoch": 8.127154505108887,
"grad_norm": 0.6539363861083984,
"learning_rate": 0.0009303605835397743,
"loss": 2.9242,
"step": 9850
},
{
"epoch": 8.135411291154918,
"grad_norm": 0.6557437777519226,
"learning_rate": 0.0009301770804660978,
"loss": 2.9149,
"step": 9860
},
{
"epoch": 8.14366807720095,
"grad_norm": 0.563693106174469,
"learning_rate": 0.0009299935773924213,
"loss": 2.9283,
"step": 9870
},
{
"epoch": 8.15192486324698,
"grad_norm": 0.610340416431427,
"learning_rate": 0.0009298100743187448,
"loss": 2.9233,
"step": 9880
},
{
"epoch": 8.160181649293012,
"grad_norm": 0.5527334809303284,
"learning_rate": 0.0009296265712450683,
"loss": 2.9088,
"step": 9890
},
{
"epoch": 8.168438435339045,
"grad_norm": 0.5965984463691711,
"learning_rate": 0.0009294430681713919,
"loss": 2.9233,
"step": 9900
},
{
"epoch": 8.176695221385076,
"grad_norm": 0.6083648204803467,
"learning_rate": 0.0009292595650977154,
"loss": 2.9182,
"step": 9910
},
{
"epoch": 8.184952007431107,
"grad_norm": 0.5621761083602905,
"learning_rate": 0.0009290760620240389,
"loss": 2.9146,
"step": 9920
},
{
"epoch": 8.193208793477138,
"grad_norm": 0.5425733923912048,
"learning_rate": 0.0009288925589503624,
"loss": 2.9133,
"step": 9930
},
{
"epoch": 8.201465579523171,
"grad_norm": 0.5596359372138977,
"learning_rate": 0.000928709055876686,
"loss": 2.9141,
"step": 9940
},
{
"epoch": 8.209722365569203,
"grad_norm": 0.5979769825935364,
"learning_rate": 0.0009285255528030095,
"loss": 2.9155,
"step": 9950
},
{
"epoch": 8.217979151615234,
"grad_norm": 0.6086379289627075,
"learning_rate": 0.000928342049729333,
"loss": 2.9253,
"step": 9960
},
{
"epoch": 8.226235937661265,
"grad_norm": 0.6083199381828308,
"learning_rate": 0.0009281585466556565,
"loss": 2.913,
"step": 9970
},
{
"epoch": 8.234492723707296,
"grad_norm": 0.6459252238273621,
"learning_rate": 0.0009279750435819801,
"loss": 2.914,
"step": 9980
},
{
"epoch": 8.24274950975333,
"grad_norm": 0.5913544297218323,
"learning_rate": 0.0009277915405083034,
"loss": 2.9142,
"step": 9990
},
{
"epoch": 8.25100629579936,
"grad_norm": 0.6325271129608154,
"learning_rate": 0.000927608037434627,
"loss": 2.9161,
"step": 10000
},
{
"epoch": 8.259263081845392,
"grad_norm": 0.5974222421646118,
"learning_rate": 0.0009274245343609505,
"loss": 2.907,
"step": 10010
},
{
"epoch": 8.267519867891423,
"grad_norm": 0.5887889862060547,
"learning_rate": 0.0009272410312872741,
"loss": 2.906,
"step": 10020
},
{
"epoch": 8.275776653937454,
"grad_norm": 0.6619329452514648,
"learning_rate": 0.0009270575282135975,
"loss": 2.9102,
"step": 10030
},
{
"epoch": 8.284033439983487,
"grad_norm": 0.5642185211181641,
"learning_rate": 0.0009268740251399211,
"loss": 2.9119,
"step": 10040
},
{
"epoch": 8.292290226029518,
"grad_norm": 0.6225172877311707,
"learning_rate": 0.0009266905220662446,
"loss": 2.9189,
"step": 10050
},
{
"epoch": 8.30054701207555,
"grad_norm": 0.6109263300895691,
"learning_rate": 0.0009265070189925682,
"loss": 2.9113,
"step": 10060
},
{
"epoch": 8.30880379812158,
"grad_norm": 0.6616942286491394,
"learning_rate": 0.0009263235159188916,
"loss": 2.9115,
"step": 10070
},
{
"epoch": 8.317060584167614,
"grad_norm": 0.5564186573028564,
"learning_rate": 0.0009261400128452152,
"loss": 2.9175,
"step": 10080
},
{
"epoch": 8.325317370213645,
"grad_norm": 0.5995142459869385,
"learning_rate": 0.0009259565097715387,
"loss": 2.9014,
"step": 10090
},
{
"epoch": 8.333574156259676,
"grad_norm": 0.599012553691864,
"learning_rate": 0.0009257730066978623,
"loss": 2.9076,
"step": 10100
},
{
"epoch": 8.341830942305707,
"grad_norm": 0.5985011458396912,
"learning_rate": 0.0009255895036241857,
"loss": 2.9071,
"step": 10110
},
{
"epoch": 8.350087728351738,
"grad_norm": 0.6194997429847717,
"learning_rate": 0.0009254060005505093,
"loss": 2.9066,
"step": 10120
},
{
"epoch": 8.358344514397771,
"grad_norm": 0.6201893091201782,
"learning_rate": 0.0009252224974768327,
"loss": 2.8995,
"step": 10130
},
{
"epoch": 8.366601300443802,
"grad_norm": 0.5880855321884155,
"learning_rate": 0.0009250389944031563,
"loss": 2.9174,
"step": 10140
},
{
"epoch": 8.374858086489834,
"grad_norm": 0.574177086353302,
"learning_rate": 0.0009248554913294797,
"loss": 2.9035,
"step": 10150
},
{
"epoch": 8.383114872535865,
"grad_norm": 0.6537944674491882,
"learning_rate": 0.0009246719882558033,
"loss": 2.9017,
"step": 10160
},
{
"epoch": 8.391371658581898,
"grad_norm": 0.5747184753417969,
"learning_rate": 0.0009244884851821268,
"loss": 2.9057,
"step": 10170
},
{
"epoch": 8.399628444627929,
"grad_norm": 0.6202713251113892,
"learning_rate": 0.0009243049821084504,
"loss": 2.9102,
"step": 10180
},
{
"epoch": 8.40788523067396,
"grad_norm": 0.5950630307197571,
"learning_rate": 0.0009241214790347738,
"loss": 2.9159,
"step": 10190
},
{
"epoch": 8.416142016719991,
"grad_norm": 0.6630895733833313,
"learning_rate": 0.0009239379759610974,
"loss": 2.9099,
"step": 10200
},
{
"epoch": 8.424398802766023,
"grad_norm": 0.6798600554466248,
"learning_rate": 0.0009237544728874209,
"loss": 2.9168,
"step": 10210
},
{
"epoch": 8.432655588812056,
"grad_norm": 0.6319479942321777,
"learning_rate": 0.0009235709698137445,
"loss": 2.9081,
"step": 10220
},
{
"epoch": 8.440912374858087,
"grad_norm": 0.6305397152900696,
"learning_rate": 0.0009233874667400679,
"loss": 2.9087,
"step": 10230
},
{
"epoch": 8.449169160904118,
"grad_norm": 0.5864200592041016,
"learning_rate": 0.0009232039636663915,
"loss": 2.8947,
"step": 10240
},
{
"epoch": 8.45742594695015,
"grad_norm": 0.5810872316360474,
"learning_rate": 0.000923020460592715,
"loss": 2.909,
"step": 10250
},
{
"epoch": 8.465682732996182,
"grad_norm": 0.6141155362129211,
"learning_rate": 0.0009228369575190385,
"loss": 2.9061,
"step": 10260
},
{
"epoch": 8.473939519042213,
"grad_norm": 0.6127697825431824,
"learning_rate": 0.0009226534544453619,
"loss": 2.9052,
"step": 10270
},
{
"epoch": 8.482196305088245,
"grad_norm": 0.6289766430854797,
"learning_rate": 0.0009224699513716855,
"loss": 2.8969,
"step": 10280
},
{
"epoch": 8.490453091134276,
"grad_norm": 0.6233021020889282,
"learning_rate": 0.000922286448298009,
"loss": 2.9047,
"step": 10290
},
{
"epoch": 8.498709877180307,
"grad_norm": 0.6213576197624207,
"learning_rate": 0.0009221029452243326,
"loss": 2.9005,
"step": 10300
},
{
"epoch": 8.50696666322634,
"grad_norm": 0.6397675275802612,
"learning_rate": 0.000921919442150656,
"loss": 2.9018,
"step": 10310
},
{
"epoch": 8.515223449272371,
"grad_norm": 0.6674553751945496,
"learning_rate": 0.0009217359390769796,
"loss": 2.9055,
"step": 10320
},
{
"epoch": 8.523480235318402,
"grad_norm": 0.636461615562439,
"learning_rate": 0.0009215524360033031,
"loss": 2.908,
"step": 10330
},
{
"epoch": 8.531737021364433,
"grad_norm": 0.593784511089325,
"learning_rate": 0.0009213689329296266,
"loss": 2.9028,
"step": 10340
},
{
"epoch": 8.539993807410465,
"grad_norm": 0.5959449410438538,
"learning_rate": 0.0009211854298559501,
"loss": 2.9042,
"step": 10350
},
{
"epoch": 8.548250593456498,
"grad_norm": 0.6200835704803467,
"learning_rate": 0.0009210019267822736,
"loss": 2.9108,
"step": 10360
},
{
"epoch": 8.556507379502529,
"grad_norm": 0.6081064939498901,
"learning_rate": 0.0009208184237085972,
"loss": 2.9078,
"step": 10370
},
{
"epoch": 8.56476416554856,
"grad_norm": 0.5773234963417053,
"learning_rate": 0.0009206349206349207,
"loss": 2.9061,
"step": 10380
},
{
"epoch": 8.573020951594591,
"grad_norm": 0.6200804710388184,
"learning_rate": 0.0009204514175612441,
"loss": 2.9125,
"step": 10390
},
{
"epoch": 8.581277737640624,
"grad_norm": 0.602094829082489,
"learning_rate": 0.0009202679144875676,
"loss": 2.8998,
"step": 10400
},
{
"epoch": 8.589534523686655,
"grad_norm": 0.6243281364440918,
"learning_rate": 0.0009200844114138912,
"loss": 2.899,
"step": 10410
},
{
"epoch": 8.597791309732687,
"grad_norm": 0.5654193758964539,
"learning_rate": 0.0009199009083402146,
"loss": 2.8988,
"step": 10420
},
{
"epoch": 8.606048095778718,
"grad_norm": 0.5849204063415527,
"learning_rate": 0.0009197174052665382,
"loss": 2.8968,
"step": 10430
},
{
"epoch": 8.614304881824749,
"grad_norm": 0.6373389363288879,
"learning_rate": 0.0009195339021928617,
"loss": 2.8907,
"step": 10440
},
{
"epoch": 8.622561667870782,
"grad_norm": 0.5677966475486755,
"learning_rate": 0.0009193503991191853,
"loss": 2.8966,
"step": 10450
},
{
"epoch": 8.630818453916813,
"grad_norm": 0.5700002908706665,
"learning_rate": 0.0009191668960455087,
"loss": 2.8932,
"step": 10460
},
{
"epoch": 8.639075239962844,
"grad_norm": 0.5689521431922913,
"learning_rate": 0.0009189833929718323,
"loss": 2.9058,
"step": 10470
},
{
"epoch": 8.647332026008876,
"grad_norm": 0.579234778881073,
"learning_rate": 0.0009187998898981558,
"loss": 2.8854,
"step": 10480
},
{
"epoch": 8.655588812054908,
"grad_norm": 0.5431221127510071,
"learning_rate": 0.0009186163868244794,
"loss": 2.8935,
"step": 10490
},
{
"epoch": 8.66384559810094,
"grad_norm": 0.5348896980285645,
"learning_rate": 0.0009184328837508028,
"loss": 2.9021,
"step": 10500
},
{
"epoch": 8.67210238414697,
"grad_norm": 0.5952715873718262,
"learning_rate": 0.0009182493806771264,
"loss": 2.902,
"step": 10510
},
{
"epoch": 8.680359170193002,
"grad_norm": 0.6143502593040466,
"learning_rate": 0.0009180658776034498,
"loss": 2.8996,
"step": 10520
},
{
"epoch": 8.688615956239033,
"grad_norm": 0.5976707339286804,
"learning_rate": 0.0009178823745297734,
"loss": 2.8936,
"step": 10530
},
{
"epoch": 8.696872742285066,
"grad_norm": 0.6755147576332092,
"learning_rate": 0.0009176988714560968,
"loss": 2.8962,
"step": 10540
},
{
"epoch": 8.705129528331097,
"grad_norm": 0.6825839281082153,
"learning_rate": 0.0009175153683824204,
"loss": 2.9054,
"step": 10550
},
{
"epoch": 8.713386314377129,
"grad_norm": 0.6553934812545776,
"learning_rate": 0.0009173318653087439,
"loss": 2.9012,
"step": 10560
},
{
"epoch": 8.72164310042316,
"grad_norm": 0.6154677867889404,
"learning_rate": 0.0009171483622350675,
"loss": 2.8977,
"step": 10570
},
{
"epoch": 8.729899886469191,
"grad_norm": 0.6081441044807434,
"learning_rate": 0.0009169648591613909,
"loss": 2.8851,
"step": 10580
},
{
"epoch": 8.738156672515224,
"grad_norm": 0.6328044533729553,
"learning_rate": 0.0009167813560877145,
"loss": 2.8979,
"step": 10590
},
{
"epoch": 8.746413458561255,
"grad_norm": 0.5969833731651306,
"learning_rate": 0.000916597853014038,
"loss": 2.8985,
"step": 10600
},
{
"epoch": 8.754670244607286,
"grad_norm": 0.5929258465766907,
"learning_rate": 0.0009164143499403616,
"loss": 2.8965,
"step": 10610
},
{
"epoch": 8.762927030653318,
"grad_norm": 0.5987407565116882,
"learning_rate": 0.000916230846866685,
"loss": 2.8874,
"step": 10620
},
{
"epoch": 8.77118381669935,
"grad_norm": 0.568051278591156,
"learning_rate": 0.0009160473437930086,
"loss": 2.9001,
"step": 10630
},
{
"epoch": 8.779440602745382,
"grad_norm": 0.6252589225769043,
"learning_rate": 0.0009158638407193321,
"loss": 2.8974,
"step": 10640
},
{
"epoch": 8.787697388791413,
"grad_norm": 0.5795060992240906,
"learning_rate": 0.0009156803376456556,
"loss": 2.8898,
"step": 10650
},
{
"epoch": 8.795954174837444,
"grad_norm": 0.5712361931800842,
"learning_rate": 0.000915496834571979,
"loss": 2.8999,
"step": 10660
},
{
"epoch": 8.804210960883475,
"grad_norm": 0.5985157489776611,
"learning_rate": 0.0009153133314983026,
"loss": 2.8827,
"step": 10670
},
{
"epoch": 8.812467746929508,
"grad_norm": 0.6716547608375549,
"learning_rate": 0.0009151298284246261,
"loss": 2.8915,
"step": 10680
},
{
"epoch": 8.82072453297554,
"grad_norm": 0.572161853313446,
"learning_rate": 0.0009149463253509497,
"loss": 2.8874,
"step": 10690
},
{
"epoch": 8.82898131902157,
"grad_norm": 0.6197661757469177,
"learning_rate": 0.0009147628222772731,
"loss": 2.8814,
"step": 10700
},
{
"epoch": 8.837238105067602,
"grad_norm": 0.5292848348617554,
"learning_rate": 0.0009145793192035967,
"loss": 2.8972,
"step": 10710
},
{
"epoch": 8.845494891113635,
"grad_norm": 0.6543566584587097,
"learning_rate": 0.0009143958161299202,
"loss": 2.8988,
"step": 10720
},
{
"epoch": 8.853751677159666,
"grad_norm": 0.5767044425010681,
"learning_rate": 0.0009142123130562438,
"loss": 2.8984,
"step": 10730
},
{
"epoch": 8.862008463205697,
"grad_norm": 0.6067584156990051,
"learning_rate": 0.0009140288099825672,
"loss": 2.8843,
"step": 10740
},
{
"epoch": 8.870265249251728,
"grad_norm": 0.7177631258964539,
"learning_rate": 0.0009138453069088908,
"loss": 2.8775,
"step": 10750
},
{
"epoch": 8.87852203529776,
"grad_norm": 0.5992334485054016,
"learning_rate": 0.0009136618038352143,
"loss": 2.8936,
"step": 10760
},
{
"epoch": 8.886778821343793,
"grad_norm": 0.5875272750854492,
"learning_rate": 0.0009134783007615379,
"loss": 2.892,
"step": 10770
},
{
"epoch": 8.895035607389824,
"grad_norm": 0.6319445967674255,
"learning_rate": 0.0009132947976878612,
"loss": 2.89,
"step": 10780
},
{
"epoch": 8.903292393435855,
"grad_norm": 0.6280015110969543,
"learning_rate": 0.0009131112946141848,
"loss": 2.892,
"step": 10790
},
{
"epoch": 8.911549179481886,
"grad_norm": 0.5766534805297852,
"learning_rate": 0.0009129277915405083,
"loss": 2.8984,
"step": 10800
},
{
"epoch": 8.919805965527917,
"grad_norm": 0.5661517381668091,
"learning_rate": 0.0009127442884668319,
"loss": 2.8816,
"step": 10810
},
{
"epoch": 8.92806275157395,
"grad_norm": 0.6289181709289551,
"learning_rate": 0.0009125607853931553,
"loss": 2.8935,
"step": 10820
},
{
"epoch": 8.936319537619982,
"grad_norm": 0.5980255603790283,
"learning_rate": 0.0009123772823194789,
"loss": 2.8961,
"step": 10830
},
{
"epoch": 8.944576323666013,
"grad_norm": 0.5405508279800415,
"learning_rate": 0.0009121937792458024,
"loss": 2.893,
"step": 10840
},
{
"epoch": 8.952833109712044,
"grad_norm": 0.5458992719650269,
"learning_rate": 0.000912010276172126,
"loss": 2.8879,
"step": 10850
},
{
"epoch": 8.961089895758077,
"grad_norm": 0.6285332441329956,
"learning_rate": 0.0009118267730984494,
"loss": 2.8697,
"step": 10860
},
{
"epoch": 8.969346681804108,
"grad_norm": 0.5860605239868164,
"learning_rate": 0.000911643270024773,
"loss": 2.8808,
"step": 10870
},
{
"epoch": 8.97760346785014,
"grad_norm": 0.6316761374473572,
"learning_rate": 0.0009114597669510965,
"loss": 2.8776,
"step": 10880
},
{
"epoch": 8.98586025389617,
"grad_norm": 0.6294664144515991,
"learning_rate": 0.0009112762638774201,
"loss": 2.8836,
"step": 10890
},
{
"epoch": 8.994117039942202,
"grad_norm": 0.5913059711456299,
"learning_rate": 0.0009110927608037435,
"loss": 2.8694,
"step": 10900
},
{
"epoch": 9.001651357209207,
"grad_norm": 0.5948079228401184,
"learning_rate": 0.0009109092577300669,
"loss": 2.6295,
"step": 10910
},
{
"epoch": 9.009908143255238,
"grad_norm": 0.6169693470001221,
"learning_rate": 0.0009107257546563905,
"loss": 2.8809,
"step": 10920
},
{
"epoch": 9.01816492930127,
"grad_norm": 0.5843782424926758,
"learning_rate": 0.0009105422515827139,
"loss": 2.8656,
"step": 10930
},
{
"epoch": 9.0264217153473,
"grad_norm": 0.5803791284561157,
"learning_rate": 0.0009103587485090375,
"loss": 2.8739,
"step": 10940
},
{
"epoch": 9.034678501393334,
"grad_norm": 0.5891194343566895,
"learning_rate": 0.000910175245435361,
"loss": 2.8722,
"step": 10950
},
{
"epoch": 9.042935287439365,
"grad_norm": 0.6038116216659546,
"learning_rate": 0.0009099917423616846,
"loss": 2.8822,
"step": 10960
},
{
"epoch": 9.051192073485396,
"grad_norm": 0.589483380317688,
"learning_rate": 0.000909808239288008,
"loss": 2.8651,
"step": 10970
},
{
"epoch": 9.059448859531427,
"grad_norm": 0.6521607041358948,
"learning_rate": 0.0009096247362143316,
"loss": 2.8731,
"step": 10980
},
{
"epoch": 9.067705645577458,
"grad_norm": 0.6631231307983398,
"learning_rate": 0.0009094412331406551,
"loss": 2.8701,
"step": 10990
},
{
"epoch": 9.075962431623491,
"grad_norm": 0.5744627714157104,
"learning_rate": 0.0009092577300669787,
"loss": 2.8745,
"step": 11000
},
{
"epoch": 9.084219217669522,
"grad_norm": 0.6196519732475281,
"learning_rate": 0.0009090742269933021,
"loss": 2.8726,
"step": 11010
},
{
"epoch": 9.092476003715554,
"grad_norm": 0.6212047934532166,
"learning_rate": 0.0009088907239196257,
"loss": 2.8806,
"step": 11020
},
{
"epoch": 9.100732789761585,
"grad_norm": 0.5632530450820923,
"learning_rate": 0.0009087072208459492,
"loss": 2.8707,
"step": 11030
},
{
"epoch": 9.108989575807616,
"grad_norm": 0.6230122447013855,
"learning_rate": 0.0009085237177722727,
"loss": 2.8752,
"step": 11040
},
{
"epoch": 9.117246361853649,
"grad_norm": 0.6368362307548523,
"learning_rate": 0.0009083402146985961,
"loss": 2.8752,
"step": 11050
},
{
"epoch": 9.12550314789968,
"grad_norm": 0.6354774236679077,
"learning_rate": 0.0009081567116249197,
"loss": 2.8783,
"step": 11060
},
{
"epoch": 9.133759933945711,
"grad_norm": 0.5921966433525085,
"learning_rate": 0.0009079732085512432,
"loss": 2.866,
"step": 11070
},
{
"epoch": 9.142016719991743,
"grad_norm": 0.6098789572715759,
"learning_rate": 0.0009077897054775668,
"loss": 2.8635,
"step": 11080
},
{
"epoch": 9.150273506037776,
"grad_norm": 0.6147322058677673,
"learning_rate": 0.0009076062024038902,
"loss": 2.879,
"step": 11090
},
{
"epoch": 9.158530292083807,
"grad_norm": 0.554958164691925,
"learning_rate": 0.0009074226993302138,
"loss": 2.8811,
"step": 11100
},
{
"epoch": 9.166787078129838,
"grad_norm": 0.5771721601486206,
"learning_rate": 0.0009072391962565373,
"loss": 2.8716,
"step": 11110
},
{
"epoch": 9.17504386417587,
"grad_norm": 0.5154232382774353,
"learning_rate": 0.0009070556931828609,
"loss": 2.8749,
"step": 11120
},
{
"epoch": 9.1833006502219,
"grad_norm": 0.6075816750526428,
"learning_rate": 0.0009068721901091843,
"loss": 2.8602,
"step": 11130
},
{
"epoch": 9.191557436267933,
"grad_norm": 0.6058173775672913,
"learning_rate": 0.0009066886870355079,
"loss": 2.8639,
"step": 11140
},
{
"epoch": 9.199814222313965,
"grad_norm": 0.6568463444709778,
"learning_rate": 0.0009065051839618314,
"loss": 2.8737,
"step": 11150
},
{
"epoch": 9.208071008359996,
"grad_norm": 0.6088699698448181,
"learning_rate": 0.000906321680888155,
"loss": 2.8674,
"step": 11160
},
{
"epoch": 9.216327794406027,
"grad_norm": 0.635866105556488,
"learning_rate": 0.0009061381778144783,
"loss": 2.8711,
"step": 11170
},
{
"epoch": 9.22458458045206,
"grad_norm": 0.6134654879570007,
"learning_rate": 0.0009059546747408019,
"loss": 2.8702,
"step": 11180
},
{
"epoch": 9.232841366498091,
"grad_norm": 0.6194204688072205,
"learning_rate": 0.0009057711716671254,
"loss": 2.8653,
"step": 11190
},
{
"epoch": 9.241098152544122,
"grad_norm": 0.5850259065628052,
"learning_rate": 0.000905587668593449,
"loss": 2.8682,
"step": 11200
},
{
"epoch": 9.249354938590153,
"grad_norm": 0.5745192766189575,
"learning_rate": 0.0009054041655197724,
"loss": 2.8726,
"step": 11210
},
{
"epoch": 9.257611724636185,
"grad_norm": 0.5950500965118408,
"learning_rate": 0.000905220662446096,
"loss": 2.8816,
"step": 11220
},
{
"epoch": 9.265868510682218,
"grad_norm": 0.5739644765853882,
"learning_rate": 0.0009050371593724195,
"loss": 2.8672,
"step": 11230
},
{
"epoch": 9.274125296728249,
"grad_norm": 0.632830798625946,
"learning_rate": 0.0009048536562987431,
"loss": 2.8733,
"step": 11240
},
{
"epoch": 9.28238208277428,
"grad_norm": 0.5722547769546509,
"learning_rate": 0.0009046701532250665,
"loss": 2.8772,
"step": 11250
},
{
"epoch": 9.290638868820311,
"grad_norm": 0.6459839344024658,
"learning_rate": 0.0009044866501513901,
"loss": 2.8625,
"step": 11260
},
{
"epoch": 9.298895654866342,
"grad_norm": 0.6177144050598145,
"learning_rate": 0.0009043031470777136,
"loss": 2.8658,
"step": 11270
},
{
"epoch": 9.307152440912375,
"grad_norm": 0.5970734357833862,
"learning_rate": 0.0009041196440040372,
"loss": 2.8572,
"step": 11280
},
{
"epoch": 9.315409226958407,
"grad_norm": 0.5540674924850464,
"learning_rate": 0.0009039361409303606,
"loss": 2.8578,
"step": 11290
},
{
"epoch": 9.323666013004438,
"grad_norm": 0.5886362791061401,
"learning_rate": 0.0009037526378566842,
"loss": 2.8622,
"step": 11300
},
{
"epoch": 9.331922799050469,
"grad_norm": 0.563347339630127,
"learning_rate": 0.0009035691347830076,
"loss": 2.8613,
"step": 11310
},
{
"epoch": 9.340179585096502,
"grad_norm": 0.6594980359077454,
"learning_rate": 0.0009033856317093312,
"loss": 2.8611,
"step": 11320
},
{
"epoch": 9.348436371142533,
"grad_norm": 0.6381516456604004,
"learning_rate": 0.0009032021286356546,
"loss": 2.8629,
"step": 11330
},
{
"epoch": 9.356693157188564,
"grad_norm": 0.5937607884407043,
"learning_rate": 0.0009030186255619782,
"loss": 2.8714,
"step": 11340
},
{
"epoch": 9.364949943234596,
"grad_norm": 0.6181517243385315,
"learning_rate": 0.0009028351224883017,
"loss": 2.8608,
"step": 11350
},
{
"epoch": 9.373206729280627,
"grad_norm": 0.601092517375946,
"learning_rate": 0.0009026516194146253,
"loss": 2.8648,
"step": 11360
},
{
"epoch": 9.38146351532666,
"grad_norm": 0.532781183719635,
"learning_rate": 0.0009024681163409487,
"loss": 2.8596,
"step": 11370
},
{
"epoch": 9.38972030137269,
"grad_norm": 0.6382347941398621,
"learning_rate": 0.0009022846132672723,
"loss": 2.8776,
"step": 11380
},
{
"epoch": 9.397977087418722,
"grad_norm": 0.617072343826294,
"learning_rate": 0.0009021011101935958,
"loss": 2.8641,
"step": 11390
},
{
"epoch": 9.406233873464753,
"grad_norm": 0.6701762676239014,
"learning_rate": 0.0009019176071199194,
"loss": 2.8749,
"step": 11400
},
{
"epoch": 9.414490659510786,
"grad_norm": 0.7021268010139465,
"learning_rate": 0.0009017341040462428,
"loss": 2.8707,
"step": 11410
},
{
"epoch": 9.422747445556817,
"grad_norm": 0.6214231848716736,
"learning_rate": 0.0009015506009725664,
"loss": 2.866,
"step": 11420
},
{
"epoch": 9.431004231602849,
"grad_norm": 0.5644016861915588,
"learning_rate": 0.0009013670978988899,
"loss": 2.869,
"step": 11430
},
{
"epoch": 9.43926101764888,
"grad_norm": 0.6352203488349915,
"learning_rate": 0.0009011835948252133,
"loss": 2.8674,
"step": 11440
},
{
"epoch": 9.447517803694911,
"grad_norm": 0.5540649890899658,
"learning_rate": 0.0009010000917515368,
"loss": 2.8574,
"step": 11450
},
{
"epoch": 9.455774589740944,
"grad_norm": 0.5691086649894714,
"learning_rate": 0.0009008165886778604,
"loss": 2.8652,
"step": 11460
},
{
"epoch": 9.464031375786975,
"grad_norm": 0.5646165013313293,
"learning_rate": 0.0009006330856041839,
"loss": 2.8704,
"step": 11470
},
{
"epoch": 9.472288161833006,
"grad_norm": 0.6189112067222595,
"learning_rate": 0.0009004495825305073,
"loss": 2.8685,
"step": 11480
},
{
"epoch": 9.480544947879038,
"grad_norm": 0.5498800873756409,
"learning_rate": 0.0009002660794568309,
"loss": 2.8595,
"step": 11490
},
{
"epoch": 9.488801733925069,
"grad_norm": 0.5840670466423035,
"learning_rate": 0.0009000825763831544,
"loss": 2.8675,
"step": 11500
},
{
"epoch": 9.497058519971102,
"grad_norm": 0.5607289671897888,
"learning_rate": 0.000899899073309478,
"loss": 2.8593,
"step": 11510
},
{
"epoch": 9.505315306017133,
"grad_norm": 0.6241579055786133,
"learning_rate": 0.0008997155702358014,
"loss": 2.8488,
"step": 11520
},
{
"epoch": 9.513572092063164,
"grad_norm": 0.6067299246788025,
"learning_rate": 0.000899532067162125,
"loss": 2.8573,
"step": 11530
},
{
"epoch": 9.521828878109195,
"grad_norm": 0.6034315824508667,
"learning_rate": 0.0008993485640884485,
"loss": 2.8672,
"step": 11540
},
{
"epoch": 9.530085664155228,
"grad_norm": 0.5804450511932373,
"learning_rate": 0.0008991650610147721,
"loss": 2.8576,
"step": 11550
},
{
"epoch": 9.53834245020126,
"grad_norm": 0.6092609167098999,
"learning_rate": 0.0008989815579410955,
"loss": 2.8654,
"step": 11560
},
{
"epoch": 9.54659923624729,
"grad_norm": 0.5359856486320496,
"learning_rate": 0.000898798054867419,
"loss": 2.8616,
"step": 11570
},
{
"epoch": 9.554856022293322,
"grad_norm": 0.6626849174499512,
"learning_rate": 0.0008986145517937425,
"loss": 2.8635,
"step": 11580
},
{
"epoch": 9.563112808339355,
"grad_norm": 0.6117586493492126,
"learning_rate": 0.0008984310487200661,
"loss": 2.8682,
"step": 11590
},
{
"epoch": 9.571369594385386,
"grad_norm": 0.6978448629379272,
"learning_rate": 0.0008982475456463895,
"loss": 2.8591,
"step": 11600
},
{
"epoch": 9.579626380431417,
"grad_norm": 0.569664478302002,
"learning_rate": 0.0008980640425727131,
"loss": 2.8576,
"step": 11610
},
{
"epoch": 9.587883166477448,
"grad_norm": 0.6535126566886902,
"learning_rate": 0.0008978805394990366,
"loss": 2.85,
"step": 11620
},
{
"epoch": 9.59613995252348,
"grad_norm": 0.5983597636222839,
"learning_rate": 0.0008976970364253602,
"loss": 2.8674,
"step": 11630
},
{
"epoch": 9.604396738569513,
"grad_norm": 0.5989744067192078,
"learning_rate": 0.0008975135333516836,
"loss": 2.8606,
"step": 11640
},
{
"epoch": 9.612653524615544,
"grad_norm": 0.6094872355461121,
"learning_rate": 0.0008973300302780072,
"loss": 2.8586,
"step": 11650
},
{
"epoch": 9.620910310661575,
"grad_norm": 0.5862686038017273,
"learning_rate": 0.0008971465272043307,
"loss": 2.8566,
"step": 11660
},
{
"epoch": 9.629167096707606,
"grad_norm": 0.6004934310913086,
"learning_rate": 0.0008969630241306543,
"loss": 2.854,
"step": 11670
},
{
"epoch": 9.637423882753637,
"grad_norm": 0.6094337105751038,
"learning_rate": 0.0008967795210569777,
"loss": 2.8599,
"step": 11680
},
{
"epoch": 9.64568066879967,
"grad_norm": 0.5388069748878479,
"learning_rate": 0.0008965960179833013,
"loss": 2.8595,
"step": 11690
},
{
"epoch": 9.653937454845702,
"grad_norm": 0.5832782983779907,
"learning_rate": 0.0008964125149096247,
"loss": 2.8579,
"step": 11700
},
{
"epoch": 9.662194240891733,
"grad_norm": 0.6066370606422424,
"learning_rate": 0.0008962290118359483,
"loss": 2.8632,
"step": 11710
},
{
"epoch": 9.670451026937764,
"grad_norm": 0.6169841289520264,
"learning_rate": 0.0008960455087622717,
"loss": 2.8664,
"step": 11720
},
{
"epoch": 9.678707812983795,
"grad_norm": 0.5962358713150024,
"learning_rate": 0.0008958620056885953,
"loss": 2.8569,
"step": 11730
},
{
"epoch": 9.686964599029828,
"grad_norm": 0.6182219386100769,
"learning_rate": 0.0008956785026149188,
"loss": 2.8409,
"step": 11740
},
{
"epoch": 9.69522138507586,
"grad_norm": 0.5909119248390198,
"learning_rate": 0.0008954949995412424,
"loss": 2.8491,
"step": 11750
},
{
"epoch": 9.70347817112189,
"grad_norm": 0.6216540932655334,
"learning_rate": 0.0008953114964675658,
"loss": 2.8516,
"step": 11760
},
{
"epoch": 9.711734957167922,
"grad_norm": 0.5488907694816589,
"learning_rate": 0.0008951279933938894,
"loss": 2.854,
"step": 11770
},
{
"epoch": 9.719991743213955,
"grad_norm": 0.6433009505271912,
"learning_rate": 0.0008949444903202129,
"loss": 2.8601,
"step": 11780
},
{
"epoch": 9.728248529259986,
"grad_norm": 0.6110396385192871,
"learning_rate": 0.0008947609872465365,
"loss": 2.8499,
"step": 11790
},
{
"epoch": 9.736505315306017,
"grad_norm": 0.5883538722991943,
"learning_rate": 0.0008945774841728599,
"loss": 2.8494,
"step": 11800
},
{
"epoch": 9.744762101352048,
"grad_norm": 0.6048309803009033,
"learning_rate": 0.0008943939810991835,
"loss": 2.8598,
"step": 11810
},
{
"epoch": 9.753018887398081,
"grad_norm": 0.5529934763908386,
"learning_rate": 0.000894210478025507,
"loss": 2.8556,
"step": 11820
},
{
"epoch": 9.761275673444112,
"grad_norm": 0.5549076199531555,
"learning_rate": 0.0008940269749518304,
"loss": 2.8615,
"step": 11830
},
{
"epoch": 9.769532459490144,
"grad_norm": 0.6248366832733154,
"learning_rate": 0.0008938434718781539,
"loss": 2.842,
"step": 11840
},
{
"epoch": 9.777789245536175,
"grad_norm": 0.5666365027427673,
"learning_rate": 0.0008936599688044775,
"loss": 2.8444,
"step": 11850
},
{
"epoch": 9.786046031582206,
"grad_norm": 0.5991445183753967,
"learning_rate": 0.000893476465730801,
"loss": 2.8415,
"step": 11860
},
{
"epoch": 9.794302817628239,
"grad_norm": 0.5583236217498779,
"learning_rate": 0.0008932929626571245,
"loss": 2.8519,
"step": 11870
},
{
"epoch": 9.80255960367427,
"grad_norm": 0.6396259069442749,
"learning_rate": 0.000893109459583448,
"loss": 2.8603,
"step": 11880
},
{
"epoch": 9.810816389720301,
"grad_norm": 0.6023778915405273,
"learning_rate": 0.0008929259565097716,
"loss": 2.856,
"step": 11890
},
{
"epoch": 9.819073175766333,
"grad_norm": 0.582880437374115,
"learning_rate": 0.0008927424534360951,
"loss": 2.8575,
"step": 11900
},
{
"epoch": 9.827329961812364,
"grad_norm": 0.6072121262550354,
"learning_rate": 0.0008925589503624186,
"loss": 2.8533,
"step": 11910
},
{
"epoch": 9.835586747858397,
"grad_norm": 0.6384845972061157,
"learning_rate": 0.0008923754472887421,
"loss": 2.8593,
"step": 11920
},
{
"epoch": 9.843843533904428,
"grad_norm": 0.6040202379226685,
"learning_rate": 0.0008921919442150657,
"loss": 2.8579,
"step": 11930
},
{
"epoch": 9.852100319950459,
"grad_norm": 0.5378623008728027,
"learning_rate": 0.0008920084411413892,
"loss": 2.8493,
"step": 11940
},
{
"epoch": 9.86035710599649,
"grad_norm": 0.6223618388175964,
"learning_rate": 0.0008918249380677127,
"loss": 2.8474,
"step": 11950
},
{
"epoch": 9.868613892042523,
"grad_norm": 0.5807675719261169,
"learning_rate": 0.0008916414349940361,
"loss": 2.8442,
"step": 11960
},
{
"epoch": 9.876870678088554,
"grad_norm": 0.6609598398208618,
"learning_rate": 0.0008914579319203597,
"loss": 2.8692,
"step": 11970
},
{
"epoch": 9.885127464134586,
"grad_norm": 0.55182945728302,
"learning_rate": 0.0008912744288466832,
"loss": 2.8533,
"step": 11980
},
{
"epoch": 9.893384250180617,
"grad_norm": 0.6168049573898315,
"learning_rate": 0.0008910909257730067,
"loss": 2.8432,
"step": 11990
},
{
"epoch": 9.901641036226648,
"grad_norm": 0.5642480850219727,
"learning_rate": 0.0008909074226993302,
"loss": 2.8483,
"step": 12000
},
{
"epoch": 9.91732892971411,
"grad_norm": 0.5881712436676025,
"learning_rate": 0.0008907239196256538,
"loss": 2.8515,
"step": 12010
},
{
"epoch": 9.92558571576014,
"grad_norm": 0.597673773765564,
"learning_rate": 0.0008905404165519773,
"loss": 2.843,
"step": 12020
},
{
"epoch": 9.933842501806172,
"grad_norm": 0.5990006923675537,
"learning_rate": 0.0008903569134783008,
"loss": 2.848,
"step": 12030
},
{
"epoch": 9.942099287852203,
"grad_norm": 0.6145173907279968,
"learning_rate": 0.0008901734104046243,
"loss": 2.8411,
"step": 12040
},
{
"epoch": 9.950356073898234,
"grad_norm": 0.5862278938293457,
"learning_rate": 0.0008899899073309478,
"loss": 2.8561,
"step": 12050
},
{
"epoch": 9.958612859944267,
"grad_norm": 0.5999264717102051,
"learning_rate": 0.0008898064042572714,
"loss": 2.8487,
"step": 12060
},
{
"epoch": 9.966869645990299,
"grad_norm": 0.5286862850189209,
"learning_rate": 0.0008896229011835948,
"loss": 2.851,
"step": 12070
},
{
"epoch": 9.97512643203633,
"grad_norm": 0.5677134394645691,
"learning_rate": 0.0008894393981099184,
"loss": 2.8516,
"step": 12080
},
{
"epoch": 9.983383218082361,
"grad_norm": 0.5856079459190369,
"learning_rate": 0.0008892558950362418,
"loss": 2.8545,
"step": 12090
},
{
"epoch": 9.991640004128392,
"grad_norm": 0.6451898813247681,
"learning_rate": 0.0008890723919625654,
"loss": 2.8358,
"step": 12100
},
{
"epoch": 9.999896790174425,
"grad_norm": 0.6016899347305298,
"learning_rate": 0.0008888888888888888,
"loss": 2.8434,
"step": 12110
},
{
"epoch": 10.008256786046031,
"grad_norm": 0.5568205714225769,
"learning_rate": 0.0008887053858152124,
"loss": 2.8738,
"step": 12120
},
{
"epoch": 10.016513572092062,
"grad_norm": 0.544348955154419,
"learning_rate": 0.0008885218827415359,
"loss": 2.8332,
"step": 12130
},
{
"epoch": 10.024770358138095,
"grad_norm": 0.6535346508026123,
"learning_rate": 0.0008883383796678595,
"loss": 2.8394,
"step": 12140
},
{
"epoch": 10.033027144184127,
"grad_norm": 0.5878455638885498,
"learning_rate": 0.0008881548765941829,
"loss": 2.8475,
"step": 12150
},
{
"epoch": 10.041283930230158,
"grad_norm": 0.5842605829238892,
"learning_rate": 0.0008879713735205065,
"loss": 2.8403,
"step": 12160
},
{
"epoch": 10.049540716276189,
"grad_norm": 0.6385082006454468,
"learning_rate": 0.00088778787044683,
"loss": 2.8376,
"step": 12170
},
{
"epoch": 10.057797502322222,
"grad_norm": 0.6178941130638123,
"learning_rate": 0.0008876043673731536,
"loss": 2.8441,
"step": 12180
},
{
"epoch": 10.066054288368253,
"grad_norm": 0.5717580318450928,
"learning_rate": 0.000887420864299477,
"loss": 2.8356,
"step": 12190
},
{
"epoch": 10.074311074414284,
"grad_norm": 0.5871554613113403,
"learning_rate": 0.0008872373612258006,
"loss": 2.8412,
"step": 12200
},
{
"epoch": 10.082567860460316,
"grad_norm": 0.6004984974861145,
"learning_rate": 0.0008870538581521241,
"loss": 2.8338,
"step": 12210
},
{
"epoch": 10.090824646506347,
"grad_norm": 0.6046565175056458,
"learning_rate": 0.0008868703550784475,
"loss": 2.8372,
"step": 12220
},
{
"epoch": 10.09908143255238,
"grad_norm": 0.5893774032592773,
"learning_rate": 0.000886686852004771,
"loss": 2.8295,
"step": 12230
},
{
"epoch": 10.10733821859841,
"grad_norm": 0.5833553671836853,
"learning_rate": 0.0008865033489310946,
"loss": 2.8371,
"step": 12240
},
{
"epoch": 10.115595004644442,
"grad_norm": 0.6019455194473267,
"learning_rate": 0.0008863198458574181,
"loss": 2.8246,
"step": 12250
},
{
"epoch": 10.123851790690473,
"grad_norm": 0.6151683926582336,
"learning_rate": 0.0008861363427837416,
"loss": 2.841,
"step": 12260
},
{
"epoch": 10.132108576736504,
"grad_norm": 0.6026824116706848,
"learning_rate": 0.0008859528397100651,
"loss": 2.8392,
"step": 12270
},
{
"epoch": 10.140365362782537,
"grad_norm": 0.5783131718635559,
"learning_rate": 0.0008857693366363887,
"loss": 2.8479,
"step": 12280
},
{
"epoch": 10.148622148828569,
"grad_norm": 0.6481205821037292,
"learning_rate": 0.0008855858335627122,
"loss": 2.8423,
"step": 12290
},
{
"epoch": 10.1568789348746,
"grad_norm": 0.5748919248580933,
"learning_rate": 0.0008854023304890357,
"loss": 2.8349,
"step": 12300
},
{
"epoch": 10.165135720920631,
"grad_norm": 0.5705230832099915,
"learning_rate": 0.0008852188274153592,
"loss": 2.8463,
"step": 12310
},
{
"epoch": 10.173392506966664,
"grad_norm": 0.5699977278709412,
"learning_rate": 0.0008850353243416828,
"loss": 2.8438,
"step": 12320
},
{
"epoch": 10.181649293012695,
"grad_norm": 0.544175386428833,
"learning_rate": 0.0008848518212680063,
"loss": 2.8363,
"step": 12330
},
{
"epoch": 10.189906079058726,
"grad_norm": 0.568715512752533,
"learning_rate": 0.0008846683181943298,
"loss": 2.8362,
"step": 12340
},
{
"epoch": 10.198162865104758,
"grad_norm": 0.5720770955085754,
"learning_rate": 0.0008844848151206532,
"loss": 2.8284,
"step": 12350
},
{
"epoch": 10.206419651150789,
"grad_norm": 0.626235842704773,
"learning_rate": 0.0008843013120469768,
"loss": 2.8393,
"step": 12360
},
{
"epoch": 10.214676437196822,
"grad_norm": 0.5661699175834656,
"learning_rate": 0.0008841178089733003,
"loss": 2.8333,
"step": 12370
},
{
"epoch": 10.222933223242853,
"grad_norm": 0.6092801094055176,
"learning_rate": 0.0008839343058996238,
"loss": 2.8513,
"step": 12380
},
{
"epoch": 10.231190009288884,
"grad_norm": 0.6037712097167969,
"learning_rate": 0.0008837508028259473,
"loss": 2.8328,
"step": 12390
},
{
"epoch": 10.239446795334915,
"grad_norm": 0.5994784832000732,
"learning_rate": 0.0008835672997522709,
"loss": 2.8268,
"step": 12400
},
{
"epoch": 10.247703581380948,
"grad_norm": 0.5821447968482971,
"learning_rate": 0.0008833837966785944,
"loss": 2.8376,
"step": 12410
},
{
"epoch": 10.25596036742698,
"grad_norm": 0.6151066422462463,
"learning_rate": 0.0008832002936049179,
"loss": 2.8338,
"step": 12420
},
{
"epoch": 10.26421715347301,
"grad_norm": 0.6016796231269836,
"learning_rate": 0.0008830167905312414,
"loss": 2.8295,
"step": 12430
},
{
"epoch": 10.272473939519042,
"grad_norm": 0.5741587281227112,
"learning_rate": 0.000882833287457565,
"loss": 2.8283,
"step": 12440
},
{
"epoch": 10.280730725565073,
"grad_norm": 0.5840280055999756,
"learning_rate": 0.0008826497843838885,
"loss": 2.8268,
"step": 12450
},
{
"epoch": 10.288987511611106,
"grad_norm": 0.5622872710227966,
"learning_rate": 0.000882466281310212,
"loss": 2.8424,
"step": 12460
},
{
"epoch": 10.297244297657137,
"grad_norm": 0.6184718608856201,
"learning_rate": 0.0008822827782365355,
"loss": 2.8269,
"step": 12470
},
{
"epoch": 10.305501083703168,
"grad_norm": 0.5796384215354919,
"learning_rate": 0.0008820992751628591,
"loss": 2.8383,
"step": 12480
},
{
"epoch": 10.3137578697492,
"grad_norm": 0.617235541343689,
"learning_rate": 0.0008819157720891825,
"loss": 2.8268,
"step": 12490
},
{
"epoch": 10.322014655795233,
"grad_norm": 0.5677554607391357,
"learning_rate": 0.000881732269015506,
"loss": 2.8349,
"step": 12500
},
{
"epoch": 10.330271441841264,
"grad_norm": 0.5938097238540649,
"learning_rate": 0.0008815487659418295,
"loss": 2.8362,
"step": 12510
},
{
"epoch": 10.338528227887295,
"grad_norm": 0.6369422078132629,
"learning_rate": 0.0008813652628681531,
"loss": 2.8364,
"step": 12520
},
{
"epoch": 10.346785013933326,
"grad_norm": 0.6142675280570984,
"learning_rate": 0.0008811817597944766,
"loss": 2.8259,
"step": 12530
},
{
"epoch": 10.355041799979357,
"grad_norm": 0.5718218684196472,
"learning_rate": 0.0008809982567208001,
"loss": 2.8473,
"step": 12540
},
{
"epoch": 10.36329858602539,
"grad_norm": 0.5698361992835999,
"learning_rate": 0.0008808147536471236,
"loss": 2.8398,
"step": 12550
},
{
"epoch": 10.371555372071422,
"grad_norm": 0.5833884477615356,
"learning_rate": 0.0008806312505734472,
"loss": 2.8171,
"step": 12560
},
{
"epoch": 10.379812158117453,
"grad_norm": 0.6157854795455933,
"learning_rate": 0.0008804477474997707,
"loss": 2.8409,
"step": 12570
},
{
"epoch": 10.388068944163484,
"grad_norm": 0.5915418863296509,
"learning_rate": 0.0008802642444260942,
"loss": 2.8369,
"step": 12580
},
{
"epoch": 10.396325730209515,
"grad_norm": 0.5849014520645142,
"learning_rate": 0.0008800807413524177,
"loss": 2.8329,
"step": 12590
},
{
"epoch": 10.404582516255548,
"grad_norm": 0.6383744478225708,
"learning_rate": 0.0008798972382787413,
"loss": 2.8299,
"step": 12600
},
{
"epoch": 10.41283930230158,
"grad_norm": 0.5256903767585754,
"learning_rate": 0.0008797137352050648,
"loss": 2.8244,
"step": 12610
},
{
"epoch": 10.42109608834761,
"grad_norm": 0.6176425218582153,
"learning_rate": 0.0008795302321313881,
"loss": 2.8318,
"step": 12620
},
{
"epoch": 10.429352874393642,
"grad_norm": 0.625028133392334,
"learning_rate": 0.0008793467290577117,
"loss": 2.8367,
"step": 12630
},
{
"epoch": 10.437609660439675,
"grad_norm": 0.626335620880127,
"learning_rate": 0.0008791632259840352,
"loss": 2.8346,
"step": 12640
},
{
"epoch": 10.445866446485706,
"grad_norm": 0.5328546166419983,
"learning_rate": 0.0008789797229103587,
"loss": 2.8279,
"step": 12650
},
{
"epoch": 10.454123232531737,
"grad_norm": 0.5871540904045105,
"learning_rate": 0.0008787962198366822,
"loss": 2.8268,
"step": 12660
},
{
"epoch": 10.462380018577768,
"grad_norm": 0.5590776205062866,
"learning_rate": 0.0008786127167630058,
"loss": 2.8227,
"step": 12670
},
{
"epoch": 10.4706368046238,
"grad_norm": 0.5899330973625183,
"learning_rate": 0.0008784292136893293,
"loss": 2.8186,
"step": 12680
},
{
"epoch": 10.478893590669832,
"grad_norm": 0.653564989566803,
"learning_rate": 0.0008782457106156528,
"loss": 2.8333,
"step": 12690
},
{
"epoch": 10.487150376715864,
"grad_norm": 0.627564013004303,
"learning_rate": 0.0008780622075419763,
"loss": 2.823,
"step": 12700
},
{
"epoch": 10.495407162761895,
"grad_norm": 0.6121799945831299,
"learning_rate": 0.0008778787044682999,
"loss": 2.8394,
"step": 12710
},
{
"epoch": 10.503663948807926,
"grad_norm": 0.6052922010421753,
"learning_rate": 0.0008776952013946234,
"loss": 2.815,
"step": 12720
},
{
"epoch": 10.511920734853959,
"grad_norm": 0.592348039150238,
"learning_rate": 0.000877511698320947,
"loss": 2.8293,
"step": 12730
},
{
"epoch": 10.52017752089999,
"grad_norm": 0.5429986119270325,
"learning_rate": 0.0008773281952472704,
"loss": 2.8258,
"step": 12740
},
{
"epoch": 10.528434306946021,
"grad_norm": 0.6261007785797119,
"learning_rate": 0.0008771446921735939,
"loss": 2.8287,
"step": 12750
},
{
"epoch": 10.536691092992053,
"grad_norm": 0.5362280011177063,
"learning_rate": 0.0008769611890999174,
"loss": 2.8271,
"step": 12760
},
{
"epoch": 10.544947879038084,
"grad_norm": 0.5826970338821411,
"learning_rate": 0.0008767776860262409,
"loss": 2.8285,
"step": 12770
},
{
"epoch": 10.553204665084117,
"grad_norm": 0.597993791103363,
"learning_rate": 0.0008765941829525644,
"loss": 2.8213,
"step": 12780
},
{
"epoch": 10.561461451130148,
"grad_norm": 0.5747185945510864,
"learning_rate": 0.000876410679878888,
"loss": 2.8282,
"step": 12790
},
{
"epoch": 10.569718237176179,
"grad_norm": 0.5573180317878723,
"learning_rate": 0.0008762271768052115,
"loss": 2.824,
"step": 12800
},
{
"epoch": 10.57797502322221,
"grad_norm": 0.5840964317321777,
"learning_rate": 0.000876043673731535,
"loss": 2.8178,
"step": 12810
},
{
"epoch": 10.586231809268241,
"grad_norm": 0.5690692663192749,
"learning_rate": 0.0008758601706578585,
"loss": 2.8155,
"step": 12820
},
{
"epoch": 10.594488595314274,
"grad_norm": 0.5685713887214661,
"learning_rate": 0.0008756766675841821,
"loss": 2.8147,
"step": 12830
},
{
"epoch": 10.602745381360306,
"grad_norm": 0.6194620132446289,
"learning_rate": 0.0008754931645105056,
"loss": 2.8374,
"step": 12840
},
{
"epoch": 10.611002167406337,
"grad_norm": 0.5465943217277527,
"learning_rate": 0.0008753096614368291,
"loss": 2.8312,
"step": 12850
},
{
"epoch": 10.619258953452368,
"grad_norm": 0.5942501425743103,
"learning_rate": 0.0008751261583631526,
"loss": 2.8235,
"step": 12860
},
{
"epoch": 10.627515739498401,
"grad_norm": 0.5760926008224487,
"learning_rate": 0.0008749426552894762,
"loss": 2.8196,
"step": 12870
},
{
"epoch": 10.635772525544432,
"grad_norm": 0.5682793259620667,
"learning_rate": 0.0008747591522157996,
"loss": 2.8349,
"step": 12880
},
{
"epoch": 10.644029311590463,
"grad_norm": 0.5754048228263855,
"learning_rate": 0.0008745756491421231,
"loss": 2.8215,
"step": 12890
},
{
"epoch": 10.652286097636495,
"grad_norm": 0.5868312120437622,
"learning_rate": 0.0008743921460684466,
"loss": 2.8359,
"step": 12900
},
{
"epoch": 10.660542883682526,
"grad_norm": 0.5740572214126587,
"learning_rate": 0.0008742086429947702,
"loss": 2.8321,
"step": 12910
},
{
"epoch": 10.668799669728559,
"grad_norm": 0.570972740650177,
"learning_rate": 0.0008740251399210937,
"loss": 2.8291,
"step": 12920
},
{
"epoch": 10.67705645577459,
"grad_norm": 0.5573681592941284,
"learning_rate": 0.0008738416368474172,
"loss": 2.8211,
"step": 12930
},
{
"epoch": 10.685313241820621,
"grad_norm": 0.6186919212341309,
"learning_rate": 0.0008736581337737407,
"loss": 2.8122,
"step": 12940
},
{
"epoch": 10.693570027866652,
"grad_norm": 0.6006292700767517,
"learning_rate": 0.0008734746307000643,
"loss": 2.8187,
"step": 12950
},
{
"epoch": 10.701826813912685,
"grad_norm": 0.571305513381958,
"learning_rate": 0.0008732911276263878,
"loss": 2.8213,
"step": 12960
},
{
"epoch": 10.710083599958717,
"grad_norm": 0.5861838459968567,
"learning_rate": 0.0008731076245527113,
"loss": 2.8176,
"step": 12970
},
{
"epoch": 10.718340386004748,
"grad_norm": 0.618885338306427,
"learning_rate": 0.0008729241214790348,
"loss": 2.8325,
"step": 12980
},
{
"epoch": 10.726597172050779,
"grad_norm": 0.6155752539634705,
"learning_rate": 0.0008727406184053584,
"loss": 2.8359,
"step": 12990
},
{
"epoch": 10.73485395809681,
"grad_norm": 0.5645089149475098,
"learning_rate": 0.0008725571153316819,
"loss": 2.8272,
"step": 13000
},
{
"epoch": 10.743110744142843,
"grad_norm": 0.5860604643821716,
"learning_rate": 0.0008723736122580053,
"loss": 2.8277,
"step": 13010
},
{
"epoch": 10.751367530188874,
"grad_norm": 0.6243773698806763,
"learning_rate": 0.0008721901091843288,
"loss": 2.8401,
"step": 13020
},
{
"epoch": 10.759624316234905,
"grad_norm": 0.6127947568893433,
"learning_rate": 0.0008720066061106524,
"loss": 2.8302,
"step": 13030
},
{
"epoch": 10.767881102280937,
"grad_norm": 0.6391910910606384,
"learning_rate": 0.0008718231030369758,
"loss": 2.8283,
"step": 13040
},
{
"epoch": 10.776137888326968,
"grad_norm": 0.5912919044494629,
"learning_rate": 0.0008716395999632994,
"loss": 2.8191,
"step": 13050
},
{
"epoch": 10.784394674373,
"grad_norm": 0.5919018983840942,
"learning_rate": 0.0008714560968896229,
"loss": 2.819,
"step": 13060
},
{
"epoch": 10.792651460419032,
"grad_norm": 0.6195237040519714,
"learning_rate": 0.0008712725938159465,
"loss": 2.8178,
"step": 13070
},
{
"epoch": 10.800908246465063,
"grad_norm": 0.6050122976303101,
"learning_rate": 0.00087108909074227,
"loss": 2.825,
"step": 13080
},
{
"epoch": 10.809165032511094,
"grad_norm": 0.6022667288780212,
"learning_rate": 0.0008709055876685935,
"loss": 2.8237,
"step": 13090
},
{
"epoch": 10.817421818557127,
"grad_norm": 0.6186034679412842,
"learning_rate": 0.000870722084594917,
"loss": 2.8207,
"step": 13100
},
{
"epoch": 10.825678604603159,
"grad_norm": 0.5731164813041687,
"learning_rate": 0.0008705385815212406,
"loss": 2.8229,
"step": 13110
},
{
"epoch": 10.83393539064919,
"grad_norm": 0.5715698003768921,
"learning_rate": 0.000870355078447564,
"loss": 2.8319,
"step": 13120
},
{
"epoch": 10.842192176695221,
"grad_norm": 0.5976336002349854,
"learning_rate": 0.0008701715753738876,
"loss": 2.8099,
"step": 13130
},
{
"epoch": 10.850448962741252,
"grad_norm": 0.5506688952445984,
"learning_rate": 0.000869988072300211,
"loss": 2.8256,
"step": 13140
},
{
"epoch": 10.858705748787285,
"grad_norm": 0.5799828767776489,
"learning_rate": 0.0008698045692265346,
"loss": 2.8198,
"step": 13150
},
{
"epoch": 10.866962534833316,
"grad_norm": 0.5726258754730225,
"learning_rate": 0.000869621066152858,
"loss": 2.8296,
"step": 13160
},
{
"epoch": 10.875219320879348,
"grad_norm": 0.5829789042472839,
"learning_rate": 0.0008694375630791816,
"loss": 2.8207,
"step": 13170
},
{
"epoch": 10.883476106925379,
"grad_norm": 0.5800747871398926,
"learning_rate": 0.0008692540600055051,
"loss": 2.8167,
"step": 13180
},
{
"epoch": 10.891732892971412,
"grad_norm": 0.6349780559539795,
"learning_rate": 0.0008690705569318286,
"loss": 2.8033,
"step": 13190
},
{
"epoch": 10.899989679017443,
"grad_norm": 0.6027595400810242,
"learning_rate": 0.0008688870538581521,
"loss": 2.8244,
"step": 13200
},
{
"epoch": 10.908246465063474,
"grad_norm": 0.5879138708114624,
"learning_rate": 0.0008687035507844756,
"loss": 2.8161,
"step": 13210
},
{
"epoch": 10.916503251109505,
"grad_norm": 0.5601217746734619,
"learning_rate": 0.0008685200477107992,
"loss": 2.8168,
"step": 13220
},
{
"epoch": 10.924760037155536,
"grad_norm": 0.6721534729003906,
"learning_rate": 0.0008683365446371227,
"loss": 2.8268,
"step": 13230
},
{
"epoch": 10.93301682320157,
"grad_norm": 0.6043145060539246,
"learning_rate": 0.0008681530415634462,
"loss": 2.8145,
"step": 13240
},
{
"epoch": 10.9412736092476,
"grad_norm": 0.6050463914871216,
"learning_rate": 0.0008679695384897697,
"loss": 2.8168,
"step": 13250
},
{
"epoch": 10.949530395293632,
"grad_norm": 0.6205517649650574,
"learning_rate": 0.0008677860354160933,
"loss": 2.8108,
"step": 13260
},
{
"epoch": 10.957787181339663,
"grad_norm": 0.6242938041687012,
"learning_rate": 0.0008676025323424167,
"loss": 2.8152,
"step": 13270
},
{
"epoch": 10.966043967385694,
"grad_norm": 0.5615045428276062,
"learning_rate": 0.0008674190292687402,
"loss": 2.817,
"step": 13280
},
{
"epoch": 10.974300753431727,
"grad_norm": 0.606850802898407,
"learning_rate": 0.0008672355261950637,
"loss": 2.8072,
"step": 13290
},
{
"epoch": 10.982557539477758,
"grad_norm": 0.6060166358947754,
"learning_rate": 0.0008670520231213873,
"loss": 2.8135,
"step": 13300
},
{
"epoch": 10.99081432552379,
"grad_norm": 0.5779221653938293,
"learning_rate": 0.0008668685200477108,
"loss": 2.8174,
"step": 13310
},
{
"epoch": 10.99907111156982,
"grad_norm": 0.5719656944274902,
"learning_rate": 0.0008666850169740343,
"loss": 2.8116,
"step": 13320
},
{
"epoch": 11.006605428836826,
"grad_norm": 0.5705454349517822,
"learning_rate": 0.0008665015139003578,
"loss": 2.5797,
"step": 13330
},
{
"epoch": 11.014862214882857,
"grad_norm": 0.6058195233345032,
"learning_rate": 0.0008663180108266814,
"loss": 2.807,
"step": 13340
},
{
"epoch": 11.023119000928888,
"grad_norm": 0.6259657740592957,
"learning_rate": 0.0008661345077530049,
"loss": 2.8049,
"step": 13350
},
{
"epoch": 11.03137578697492,
"grad_norm": 0.5490705966949463,
"learning_rate": 0.0008659510046793284,
"loss": 2.8092,
"step": 13360
},
{
"epoch": 11.03963257302095,
"grad_norm": 0.5268445611000061,
"learning_rate": 0.0008657675016056519,
"loss": 2.7993,
"step": 13370
},
{
"epoch": 11.047889359066984,
"grad_norm": 0.553423285484314,
"learning_rate": 0.0008655839985319755,
"loss": 2.7974,
"step": 13380
},
{
"epoch": 11.056146145113015,
"grad_norm": 0.5482957363128662,
"learning_rate": 0.000865400495458299,
"loss": 2.8158,
"step": 13390
},
{
"epoch": 11.064402931159046,
"grad_norm": 0.5422245860099792,
"learning_rate": 0.0008652169923846225,
"loss": 2.8082,
"step": 13400
},
{
"epoch": 11.072659717205077,
"grad_norm": 0.6090859174728394,
"learning_rate": 0.0008650334893109459,
"loss": 2.8095,
"step": 13410
},
{
"epoch": 11.08091650325111,
"grad_norm": 0.6134405732154846,
"learning_rate": 0.0008648499862372695,
"loss": 2.8136,
"step": 13420
},
{
"epoch": 11.089173289297142,
"grad_norm": 0.5978251099586487,
"learning_rate": 0.000864666483163593,
"loss": 2.8098,
"step": 13430
},
{
"epoch": 11.097430075343173,
"grad_norm": 0.6461356282234192,
"learning_rate": 0.0008644829800899165,
"loss": 2.8117,
"step": 13440
},
{
"epoch": 11.105686861389204,
"grad_norm": 0.5783932209014893,
"learning_rate": 0.00086429947701624,
"loss": 2.816,
"step": 13450
},
{
"epoch": 11.113943647435235,
"grad_norm": 0.5924204587936401,
"learning_rate": 0.0008641159739425636,
"loss": 2.8083,
"step": 13460
},
{
"epoch": 11.122200433481268,
"grad_norm": 0.5752689838409424,
"learning_rate": 0.000863932470868887,
"loss": 2.808,
"step": 13470
},
{
"epoch": 11.1304572195273,
"grad_norm": 0.6291837692260742,
"learning_rate": 0.0008637489677952106,
"loss": 2.815,
"step": 13480
},
{
"epoch": 11.13871400557333,
"grad_norm": 0.637244701385498,
"learning_rate": 0.0008635654647215341,
"loss": 2.8058,
"step": 13490
},
{
"epoch": 11.146970791619362,
"grad_norm": 0.5656126737594604,
"learning_rate": 0.0008633819616478577,
"loss": 2.8029,
"step": 13500
},
{
"epoch": 11.155227577665393,
"grad_norm": 0.5883386731147766,
"learning_rate": 0.0008631984585741811,
"loss": 2.8116,
"step": 13510
},
{
"epoch": 11.163484363711426,
"grad_norm": 0.541492760181427,
"learning_rate": 0.0008630149555005047,
"loss": 2.8034,
"step": 13520
},
{
"epoch": 11.171741149757457,
"grad_norm": 0.5849348902702332,
"learning_rate": 0.0008628314524268282,
"loss": 2.8121,
"step": 13530
},
{
"epoch": 11.179997935803488,
"grad_norm": 0.5753573179244995,
"learning_rate": 0.0008626479493531517,
"loss": 2.8147,
"step": 13540
},
{
"epoch": 11.18825472184952,
"grad_norm": 0.5690391659736633,
"learning_rate": 0.0008624644462794751,
"loss": 2.8021,
"step": 13550
},
{
"epoch": 11.196511507895552,
"grad_norm": 0.5915670990943909,
"learning_rate": 0.0008622809432057987,
"loss": 2.8023,
"step": 13560
},
{
"epoch": 11.204768293941584,
"grad_norm": 0.634675145149231,
"learning_rate": 0.0008620974401321222,
"loss": 2.8189,
"step": 13570
},
{
"epoch": 11.213025079987615,
"grad_norm": 0.5452571511268616,
"learning_rate": 0.0008619139370584458,
"loss": 2.8067,
"step": 13580
},
{
"epoch": 11.221281866033646,
"grad_norm": 0.5526494383811951,
"learning_rate": 0.0008617304339847692,
"loss": 2.8281,
"step": 13590
},
{
"epoch": 11.229538652079677,
"grad_norm": 0.6009969115257263,
"learning_rate": 0.0008615469309110928,
"loss": 2.8102,
"step": 13600
},
{
"epoch": 11.23779543812571,
"grad_norm": 0.5761014819145203,
"learning_rate": 0.0008613634278374163,
"loss": 2.8073,
"step": 13610
},
{
"epoch": 11.246052224171741,
"grad_norm": 0.6111817359924316,
"learning_rate": 0.0008611799247637399,
"loss": 2.8056,
"step": 13620
},
{
"epoch": 11.254309010217773,
"grad_norm": 0.5755062699317932,
"learning_rate": 0.0008609964216900633,
"loss": 2.8099,
"step": 13630
},
{
"epoch": 11.262565796263804,
"grad_norm": 0.5578922033309937,
"learning_rate": 0.0008608129186163869,
"loss": 2.8047,
"step": 13640
},
{
"epoch": 11.270822582309837,
"grad_norm": 0.6050003170967102,
"learning_rate": 0.0008606294155427104,
"loss": 2.8096,
"step": 13650
},
{
"epoch": 11.279079368355868,
"grad_norm": 0.6092653870582581,
"learning_rate": 0.000860445912469034,
"loss": 2.8029,
"step": 13660
},
{
"epoch": 11.287336154401899,
"grad_norm": 0.6080880165100098,
"learning_rate": 0.0008602624093953573,
"loss": 2.8025,
"step": 13670
},
{
"epoch": 11.29559294044793,
"grad_norm": 0.5565916895866394,
"learning_rate": 0.0008600789063216809,
"loss": 2.8062,
"step": 13680
},
{
"epoch": 11.303849726493961,
"grad_norm": 0.6291329860687256,
"learning_rate": 0.0008598954032480044,
"loss": 2.8212,
"step": 13690
},
{
"epoch": 11.312106512539994,
"grad_norm": 0.6040759682655334,
"learning_rate": 0.000859711900174328,
"loss": 2.7991,
"step": 13700
},
{
"epoch": 11.320363298586026,
"grad_norm": 0.5415501594543457,
"learning_rate": 0.0008595283971006514,
"loss": 2.7978,
"step": 13710
},
{
"epoch": 11.328620084632057,
"grad_norm": 0.5466763973236084,
"learning_rate": 0.000859344894026975,
"loss": 2.807,
"step": 13720
},
{
"epoch": 11.336876870678088,
"grad_norm": 0.5396016836166382,
"learning_rate": 0.0008591613909532985,
"loss": 2.8042,
"step": 13730
},
{
"epoch": 11.345133656724121,
"grad_norm": 0.6255636215209961,
"learning_rate": 0.0008589778878796221,
"loss": 2.8082,
"step": 13740
},
{
"epoch": 11.353390442770152,
"grad_norm": 0.6161576509475708,
"learning_rate": 0.0008587943848059455,
"loss": 2.8061,
"step": 13750
},
{
"epoch": 11.361647228816183,
"grad_norm": 0.62225741147995,
"learning_rate": 0.000858610881732269,
"loss": 2.8042,
"step": 13760
},
{
"epoch": 11.369904014862215,
"grad_norm": 0.6520695090293884,
"learning_rate": 0.0008584273786585926,
"loss": 2.8062,
"step": 13770
},
{
"epoch": 11.378160800908246,
"grad_norm": 0.6661168932914734,
"learning_rate": 0.0008582438755849161,
"loss": 2.8053,
"step": 13780
},
{
"epoch": 11.386417586954279,
"grad_norm": 0.5990477204322815,
"learning_rate": 0.0008580603725112396,
"loss": 2.8013,
"step": 13790
},
{
"epoch": 11.39467437300031,
"grad_norm": 0.6206037402153015,
"learning_rate": 0.000857876869437563,
"loss": 2.8089,
"step": 13800
},
{
"epoch": 11.402931159046341,
"grad_norm": 0.6662552356719971,
"learning_rate": 0.0008576933663638866,
"loss": 2.8048,
"step": 13810
},
{
"epoch": 11.411187945092372,
"grad_norm": 0.6055031418800354,
"learning_rate": 0.00085750986329021,
"loss": 2.8057,
"step": 13820
},
{
"epoch": 11.419444731138404,
"grad_norm": 0.618643045425415,
"learning_rate": 0.0008573263602165336,
"loss": 2.8175,
"step": 13830
},
{
"epoch": 11.427701517184436,
"grad_norm": 0.58855140209198,
"learning_rate": 0.0008571428571428571,
"loss": 2.7973,
"step": 13840
},
{
"epoch": 11.435958303230468,
"grad_norm": 0.5836468935012817,
"learning_rate": 0.0008569593540691807,
"loss": 2.8031,
"step": 13850
},
{
"epoch": 11.444215089276499,
"grad_norm": 0.6513998508453369,
"learning_rate": 0.0008567758509955041,
"loss": 2.8049,
"step": 13860
},
{
"epoch": 11.45247187532253,
"grad_norm": 0.6231095790863037,
"learning_rate": 0.0008565923479218277,
"loss": 2.7965,
"step": 13870
},
{
"epoch": 11.460728661368563,
"grad_norm": 0.598556637763977,
"learning_rate": 0.0008564088448481512,
"loss": 2.794,
"step": 13880
},
{
"epoch": 11.468985447414594,
"grad_norm": 0.613278329372406,
"learning_rate": 0.0008562253417744748,
"loss": 2.8008,
"step": 13890
},
{
"epoch": 11.477242233460625,
"grad_norm": 0.5937925577163696,
"learning_rate": 0.0008560418387007982,
"loss": 2.7992,
"step": 13900
},
{
"epoch": 11.485499019506657,
"grad_norm": 0.5671007037162781,
"learning_rate": 0.0008558583356271218,
"loss": 2.8071,
"step": 13910
},
{
"epoch": 11.493755805552688,
"grad_norm": 0.5720387101173401,
"learning_rate": 0.0008556748325534453,
"loss": 2.8006,
"step": 13920
},
{
"epoch": 11.50201259159872,
"grad_norm": 0.5988256335258484,
"learning_rate": 0.0008554913294797688,
"loss": 2.794,
"step": 13930
},
{
"epoch": 11.510269377644752,
"grad_norm": 0.5751326680183411,
"learning_rate": 0.0008553078264060922,
"loss": 2.809,
"step": 13940
},
{
"epoch": 11.518526163690783,
"grad_norm": 0.5636781454086304,
"learning_rate": 0.0008551243233324158,
"loss": 2.792,
"step": 13950
},
{
"epoch": 11.526782949736814,
"grad_norm": 0.6231285929679871,
"learning_rate": 0.0008549408202587393,
"loss": 2.8034,
"step": 13960
},
{
"epoch": 11.535039735782847,
"grad_norm": 0.5834125280380249,
"learning_rate": 0.0008547573171850629,
"loss": 2.7947,
"step": 13970
},
{
"epoch": 11.543296521828879,
"grad_norm": 0.5725896954536438,
"learning_rate": 0.0008545738141113863,
"loss": 2.7883,
"step": 13980
},
{
"epoch": 11.55155330787491,
"grad_norm": 0.6235449314117432,
"learning_rate": 0.0008543903110377099,
"loss": 2.794,
"step": 13990
},
{
"epoch": 11.559810093920941,
"grad_norm": 0.5574560165405273,
"learning_rate": 0.0008542068079640334,
"loss": 2.804,
"step": 14000
},
{
"epoch": 11.568066879966972,
"grad_norm": 0.6278049349784851,
"learning_rate": 0.000854023304890357,
"loss": 2.8071,
"step": 14010
},
{
"epoch": 11.576323666013005,
"grad_norm": 0.618698239326477,
"learning_rate": 0.0008538398018166804,
"loss": 2.8062,
"step": 14020
},
{
"epoch": 11.584580452059036,
"grad_norm": 0.5747182369232178,
"learning_rate": 0.000853656298743004,
"loss": 2.8152,
"step": 14030
},
{
"epoch": 11.592837238105067,
"grad_norm": 0.590527355670929,
"learning_rate": 0.0008534727956693275,
"loss": 2.8008,
"step": 14040
},
{
"epoch": 11.601094024151099,
"grad_norm": 0.5996799468994141,
"learning_rate": 0.0008532892925956511,
"loss": 2.8007,
"step": 14050
},
{
"epoch": 11.609350810197132,
"grad_norm": 0.5726416110992432,
"learning_rate": 0.0008531057895219744,
"loss": 2.8028,
"step": 14060
},
{
"epoch": 11.617607596243163,
"grad_norm": 0.5995892286300659,
"learning_rate": 0.000852922286448298,
"loss": 2.8071,
"step": 14070
},
{
"epoch": 11.625864382289194,
"grad_norm": 0.5530434250831604,
"learning_rate": 0.0008527387833746215,
"loss": 2.8062,
"step": 14080
},
{
"epoch": 11.634121168335225,
"grad_norm": 0.5788170695304871,
"learning_rate": 0.0008525552803009451,
"loss": 2.7975,
"step": 14090
},
{
"epoch": 11.642377954381256,
"grad_norm": 0.5959973931312561,
"learning_rate": 0.0008523717772272685,
"loss": 2.7908,
"step": 14100
},
{
"epoch": 11.65063474042729,
"grad_norm": 0.6316953301429749,
"learning_rate": 0.0008521882741535921,
"loss": 2.798,
"step": 14110
},
{
"epoch": 11.65889152647332,
"grad_norm": 0.5617077350616455,
"learning_rate": 0.0008520047710799156,
"loss": 2.8042,
"step": 14120
},
{
"epoch": 11.667148312519352,
"grad_norm": 0.553451657295227,
"learning_rate": 0.0008518212680062392,
"loss": 2.796,
"step": 14130
},
{
"epoch": 11.675405098565383,
"grad_norm": 0.5701197385787964,
"learning_rate": 0.0008516377649325626,
"loss": 2.7992,
"step": 14140
},
{
"epoch": 11.683661884611414,
"grad_norm": 0.6057118773460388,
"learning_rate": 0.0008514542618588862,
"loss": 2.7976,
"step": 14150
},
{
"epoch": 11.691918670657447,
"grad_norm": 0.5956297516822815,
"learning_rate": 0.0008512707587852097,
"loss": 2.8001,
"step": 14160
},
{
"epoch": 11.700175456703478,
"grad_norm": 0.5502737164497375,
"learning_rate": 0.0008510872557115333,
"loss": 2.7948,
"step": 14170
},
{
"epoch": 11.70843224274951,
"grad_norm": 0.6299700736999512,
"learning_rate": 0.0008509037526378567,
"loss": 2.7963,
"step": 14180
},
{
"epoch": 11.71668902879554,
"grad_norm": 0.5706774592399597,
"learning_rate": 0.0008507202495641802,
"loss": 2.7897,
"step": 14190
},
{
"epoch": 11.724945814841574,
"grad_norm": 0.5503284335136414,
"learning_rate": 0.0008505367464905037,
"loss": 2.7954,
"step": 14200
},
{
"epoch": 11.733202600887605,
"grad_norm": 0.6203439235687256,
"learning_rate": 0.0008503532434168273,
"loss": 2.7927,
"step": 14210
},
{
"epoch": 11.741459386933636,
"grad_norm": 0.5536445379257202,
"learning_rate": 0.0008501697403431507,
"loss": 2.785,
"step": 14220
},
{
"epoch": 11.749716172979667,
"grad_norm": 0.5857203006744385,
"learning_rate": 0.0008499862372694743,
"loss": 2.7982,
"step": 14230
},
{
"epoch": 11.757972959025699,
"grad_norm": 0.5552855730056763,
"learning_rate": 0.0008498027341957978,
"loss": 2.7839,
"step": 14240
},
{
"epoch": 11.766229745071731,
"grad_norm": 0.5858961939811707,
"learning_rate": 0.0008496192311221214,
"loss": 2.7923,
"step": 14250
},
{
"epoch": 11.774486531117763,
"grad_norm": 0.6338097453117371,
"learning_rate": 0.0008494357280484448,
"loss": 2.8024,
"step": 14260
},
{
"epoch": 11.782743317163794,
"grad_norm": 0.6377038359642029,
"learning_rate": 0.0008492522249747684,
"loss": 2.7864,
"step": 14270
},
{
"epoch": 11.791000103209825,
"grad_norm": 0.6072639226913452,
"learning_rate": 0.0008490687219010919,
"loss": 2.788,
"step": 14280
},
{
"epoch": 11.799256889255858,
"grad_norm": 0.5601785778999329,
"learning_rate": 0.0008488852188274155,
"loss": 2.7999,
"step": 14290
},
{
"epoch": 11.80751367530189,
"grad_norm": 0.6033042669296265,
"learning_rate": 0.0008487017157537389,
"loss": 2.7998,
"step": 14300
},
{
"epoch": 11.81577046134792,
"grad_norm": 0.5611660480499268,
"learning_rate": 0.0008485182126800625,
"loss": 2.795,
"step": 14310
},
{
"epoch": 11.824027247393952,
"grad_norm": 0.5943251848220825,
"learning_rate": 0.0008483347096063859,
"loss": 2.7871,
"step": 14320
},
{
"epoch": 11.832284033439983,
"grad_norm": 0.6414892077445984,
"learning_rate": 0.0008481512065327093,
"loss": 2.8011,
"step": 14330
},
{
"epoch": 11.840540819486016,
"grad_norm": 0.6055446267127991,
"learning_rate": 0.0008479677034590329,
"loss": 2.799,
"step": 14340
},
{
"epoch": 11.848797605532047,
"grad_norm": 0.6286283135414124,
"learning_rate": 0.0008477842003853564,
"loss": 2.7998,
"step": 14350
},
{
"epoch": 11.857054391578078,
"grad_norm": 0.6823182702064514,
"learning_rate": 0.00084760069731168,
"loss": 2.7979,
"step": 14360
},
{
"epoch": 11.86531117762411,
"grad_norm": 0.555995523929596,
"learning_rate": 0.0008474171942380034,
"loss": 2.7882,
"step": 14370
},
{
"epoch": 11.87356796367014,
"grad_norm": 0.5597317814826965,
"learning_rate": 0.000847233691164327,
"loss": 2.7941,
"step": 14380
},
{
"epoch": 11.881824749716174,
"grad_norm": 0.6191929578781128,
"learning_rate": 0.0008470501880906505,
"loss": 2.793,
"step": 14390
},
{
"epoch": 11.890081535762205,
"grad_norm": 0.6188380122184753,
"learning_rate": 0.0008468666850169741,
"loss": 2.7849,
"step": 14400
},
{
"epoch": 11.898338321808236,
"grad_norm": 0.6156478524208069,
"learning_rate": 0.0008466831819432975,
"loss": 2.8008,
"step": 14410
},
{
"epoch": 11.906595107854267,
"grad_norm": 0.5268288850784302,
"learning_rate": 0.0008464996788696211,
"loss": 2.7886,
"step": 14420
},
{
"epoch": 11.9148518939003,
"grad_norm": 0.5729905962944031,
"learning_rate": 0.0008463161757959446,
"loss": 2.7831,
"step": 14430
},
{
"epoch": 11.923108679946331,
"grad_norm": 0.6199338436126709,
"learning_rate": 0.0008461326727222682,
"loss": 2.7981,
"step": 14440
},
{
"epoch": 11.931365465992362,
"grad_norm": 0.5851151943206787,
"learning_rate": 0.0008459491696485915,
"loss": 2.7842,
"step": 14450
},
{
"epoch": 11.939622252038394,
"grad_norm": 0.6265865564346313,
"learning_rate": 0.0008457656665749151,
"loss": 2.7953,
"step": 14460
},
{
"epoch": 11.947879038084425,
"grad_norm": 0.5881917476654053,
"learning_rate": 0.0008455821635012386,
"loss": 2.7959,
"step": 14470
},
{
"epoch": 11.956135824130458,
"grad_norm": 0.5763278603553772,
"learning_rate": 0.0008453986604275622,
"loss": 2.8031,
"step": 14480
},
{
"epoch": 11.964392610176489,
"grad_norm": 0.5758784413337708,
"learning_rate": 0.0008452151573538856,
"loss": 2.7987,
"step": 14490
},
{
"epoch": 11.97264939622252,
"grad_norm": 0.5578325986862183,
"learning_rate": 0.0008450316542802092,
"loss": 2.7885,
"step": 14500
},
{
"epoch": 11.980906182268551,
"grad_norm": 0.5875093936920166,
"learning_rate": 0.0008448481512065327,
"loss": 2.789,
"step": 14510
},
{
"epoch": 11.989162968314584,
"grad_norm": 0.5956554412841797,
"learning_rate": 0.0008446646481328563,
"loss": 2.7899,
"step": 14520
},
{
"epoch": 11.997419754360616,
"grad_norm": 0.5572901368141174,
"learning_rate": 0.0008444811450591797,
"loss": 2.7874,
"step": 14530
},
{
"epoch": 12.004954071627619,
"grad_norm": 0.6897820234298706,
"learning_rate": 0.0008442976419855033,
"loss": 2.5443,
"step": 14540
},
{
"epoch": 12.01321085767365,
"grad_norm": 0.6007983684539795,
"learning_rate": 0.0008441141389118268,
"loss": 2.7947,
"step": 14550
},
{
"epoch": 12.021467643719681,
"grad_norm": 0.5293102860450745,
"learning_rate": 0.0008439306358381504,
"loss": 2.78,
"step": 14560
},
{
"epoch": 12.029724429765714,
"grad_norm": 0.5955855250358582,
"learning_rate": 0.0008437471327644738,
"loss": 2.7833,
"step": 14570
},
{
"epoch": 12.037981215811746,
"grad_norm": 0.6245818734169006,
"learning_rate": 0.0008435636296907974,
"loss": 2.7857,
"step": 14580
},
{
"epoch": 12.046238001857777,
"grad_norm": 0.6412973403930664,
"learning_rate": 0.0008433801266171208,
"loss": 2.7848,
"step": 14590
},
{
"epoch": 12.054494787903808,
"grad_norm": 0.5761491060256958,
"learning_rate": 0.0008431966235434444,
"loss": 2.7827,
"step": 14600
},
{
"epoch": 12.06275157394984,
"grad_norm": 0.5610695481300354,
"learning_rate": 0.0008430131204697678,
"loss": 2.7837,
"step": 14610
},
{
"epoch": 12.071008359995872,
"grad_norm": 0.5644015073776245,
"learning_rate": 0.0008428296173960914,
"loss": 2.7887,
"step": 14620
},
{
"epoch": 12.079265146041903,
"grad_norm": 0.5807902216911316,
"learning_rate": 0.0008426461143224149,
"loss": 2.7811,
"step": 14630
},
{
"epoch": 12.087521932087935,
"grad_norm": 0.6081882119178772,
"learning_rate": 0.0008424626112487385,
"loss": 2.7874,
"step": 14640
},
{
"epoch": 12.095778718133966,
"grad_norm": 0.5682186484336853,
"learning_rate": 0.0008422791081750619,
"loss": 2.7769,
"step": 14650
},
{
"epoch": 12.104035504179999,
"grad_norm": 0.5743092894554138,
"learning_rate": 0.0008420956051013855,
"loss": 2.7838,
"step": 14660
},
{
"epoch": 12.11229229022603,
"grad_norm": 0.5903926491737366,
"learning_rate": 0.000841912102027709,
"loss": 2.7798,
"step": 14670
},
{
"epoch": 12.120549076272061,
"grad_norm": 0.5655919909477234,
"learning_rate": 0.0008417285989540326,
"loss": 2.775,
"step": 14680
},
{
"epoch": 12.128805862318092,
"grad_norm": 0.6008566617965698,
"learning_rate": 0.000841545095880356,
"loss": 2.778,
"step": 14690
},
{
"epoch": 12.137062648364124,
"grad_norm": 0.5972955822944641,
"learning_rate": 0.0008413615928066796,
"loss": 2.7869,
"step": 14700
},
{
"epoch": 12.145319434410156,
"grad_norm": 0.6555289626121521,
"learning_rate": 0.0008411780897330031,
"loss": 2.7882,
"step": 14710
},
{
"epoch": 12.153576220456188,
"grad_norm": 0.571524441242218,
"learning_rate": 0.0008409945866593266,
"loss": 2.7853,
"step": 14720
},
{
"epoch": 12.161833006502219,
"grad_norm": 0.6336715221405029,
"learning_rate": 0.00084081108358565,
"loss": 2.7711,
"step": 14730
},
{
"epoch": 12.17008979254825,
"grad_norm": 0.6028571724891663,
"learning_rate": 0.0008406275805119736,
"loss": 2.7839,
"step": 14740
},
{
"epoch": 12.178346578594283,
"grad_norm": 0.6054027080535889,
"learning_rate": 0.0008404440774382971,
"loss": 2.7947,
"step": 14750
},
{
"epoch": 12.186603364640314,
"grad_norm": 0.6011176109313965,
"learning_rate": 0.0008402605743646207,
"loss": 2.7799,
"step": 14760
},
{
"epoch": 12.194860150686345,
"grad_norm": 0.5620320439338684,
"learning_rate": 0.0008400770712909441,
"loss": 2.7835,
"step": 14770
},
{
"epoch": 12.203116936732377,
"grad_norm": 0.6046602129936218,
"learning_rate": 0.0008398935682172677,
"loss": 2.7928,
"step": 14780
},
{
"epoch": 12.211373722778408,
"grad_norm": 0.5642755627632141,
"learning_rate": 0.0008397100651435912,
"loss": 2.7915,
"step": 14790
},
{
"epoch": 12.21963050882444,
"grad_norm": 0.5510666370391846,
"learning_rate": 0.0008395265620699148,
"loss": 2.783,
"step": 14800
},
{
"epoch": 12.227887294870472,
"grad_norm": 0.5766282081604004,
"learning_rate": 0.0008393430589962382,
"loss": 2.7827,
"step": 14810
},
{
"epoch": 12.236144080916503,
"grad_norm": 0.564561128616333,
"learning_rate": 0.0008391595559225618,
"loss": 2.7754,
"step": 14820
},
{
"epoch": 12.244400866962534,
"grad_norm": 0.5662837028503418,
"learning_rate": 0.0008389760528488853,
"loss": 2.7778,
"step": 14830
},
{
"epoch": 12.252657653008566,
"grad_norm": 0.5977376699447632,
"learning_rate": 0.0008387925497752089,
"loss": 2.7718,
"step": 14840
},
{
"epoch": 12.260914439054599,
"grad_norm": 0.6425307989120483,
"learning_rate": 0.0008386090467015322,
"loss": 2.785,
"step": 14850
},
{
"epoch": 12.26917122510063,
"grad_norm": 0.6152507066726685,
"learning_rate": 0.0008384255436278558,
"loss": 2.7817,
"step": 14860
},
{
"epoch": 12.277428011146661,
"grad_norm": 0.5792108774185181,
"learning_rate": 0.0008382420405541793,
"loss": 2.7763,
"step": 14870
},
{
"epoch": 12.285684797192692,
"grad_norm": 0.5410921573638916,
"learning_rate": 0.0008380585374805028,
"loss": 2.7731,
"step": 14880
},
{
"epoch": 12.293941583238725,
"grad_norm": 0.5805179476737976,
"learning_rate": 0.0008378750344068263,
"loss": 2.7858,
"step": 14890
},
{
"epoch": 12.302198369284756,
"grad_norm": 0.5664601922035217,
"learning_rate": 0.0008376915313331498,
"loss": 2.7816,
"step": 14900
},
{
"epoch": 12.310455155330787,
"grad_norm": 0.5598365664482117,
"learning_rate": 0.0008375080282594734,
"loss": 2.7788,
"step": 14910
},
{
"epoch": 12.318711941376819,
"grad_norm": 0.5913284420967102,
"learning_rate": 0.0008373245251857968,
"loss": 2.7721,
"step": 14920
},
{
"epoch": 12.32696872742285,
"grad_norm": 0.6424931287765503,
"learning_rate": 0.0008371410221121204,
"loss": 2.774,
"step": 14930
},
{
"epoch": 12.335225513468883,
"grad_norm": 0.5732784271240234,
"learning_rate": 0.0008369575190384439,
"loss": 2.7865,
"step": 14940
},
{
"epoch": 12.343482299514914,
"grad_norm": 0.6080560088157654,
"learning_rate": 0.0008367740159647675,
"loss": 2.7926,
"step": 14950
},
{
"epoch": 12.351739085560945,
"grad_norm": 0.6897197961807251,
"learning_rate": 0.0008365905128910909,
"loss": 2.7813,
"step": 14960
},
{
"epoch": 12.359995871606976,
"grad_norm": 0.5854954719543457,
"learning_rate": 0.0008364070098174145,
"loss": 2.7781,
"step": 14970
},
{
"epoch": 12.36825265765301,
"grad_norm": 0.6034757494926453,
"learning_rate": 0.0008362235067437379,
"loss": 2.7685,
"step": 14980
},
{
"epoch": 12.37650944369904,
"grad_norm": 0.6345445513725281,
"learning_rate": 0.0008360400036700615,
"loss": 2.784,
"step": 14990
},
{
"epoch": 12.384766229745072,
"grad_norm": 0.5897849798202515,
"learning_rate": 0.0008358565005963849,
"loss": 2.7849,
"step": 15000
},
{
"epoch": 12.393023015791103,
"grad_norm": 0.5857816338539124,
"learning_rate": 0.0008356729975227085,
"loss": 2.7702,
"step": 15010
},
{
"epoch": 12.401279801837134,
"grad_norm": 0.5820302367210388,
"learning_rate": 0.000835489494449032,
"loss": 2.7862,
"step": 15020
},
{
"epoch": 12.409536587883167,
"grad_norm": 0.6015300750732422,
"learning_rate": 0.0008353059913753556,
"loss": 2.7827,
"step": 15030
},
{
"epoch": 12.417793373929198,
"grad_norm": 0.5810590386390686,
"learning_rate": 0.000835122488301679,
"loss": 2.7848,
"step": 15040
},
{
"epoch": 12.42605015997523,
"grad_norm": 0.525604784488678,
"learning_rate": 0.0008349389852280026,
"loss": 2.7693,
"step": 15050
},
{
"epoch": 12.43430694602126,
"grad_norm": 0.5634535551071167,
"learning_rate": 0.0008347554821543261,
"loss": 2.782,
"step": 15060
},
{
"epoch": 12.442563732067292,
"grad_norm": 0.5564500689506531,
"learning_rate": 0.0008345719790806497,
"loss": 2.7656,
"step": 15070
},
{
"epoch": 12.450820518113325,
"grad_norm": 0.570466160774231,
"learning_rate": 0.0008343884760069731,
"loss": 2.7781,
"step": 15080
},
{
"epoch": 12.459077304159356,
"grad_norm": 0.5621691942214966,
"learning_rate": 0.0008342049729332967,
"loss": 2.774,
"step": 15090
},
{
"epoch": 12.467334090205387,
"grad_norm": 0.5975548624992371,
"learning_rate": 0.0008340214698596202,
"loss": 2.7771,
"step": 15100
},
{
"epoch": 12.475590876251418,
"grad_norm": 0.5807538628578186,
"learning_rate": 0.0008338379667859437,
"loss": 2.7786,
"step": 15110
},
{
"epoch": 12.483847662297451,
"grad_norm": 0.61223965883255,
"learning_rate": 0.0008336544637122671,
"loss": 2.7769,
"step": 15120
},
{
"epoch": 12.492104448343483,
"grad_norm": 0.5965583324432373,
"learning_rate": 0.0008334709606385907,
"loss": 2.777,
"step": 15130
},
{
"epoch": 12.500361234389514,
"grad_norm": 0.5752902626991272,
"learning_rate": 0.0008332874575649142,
"loss": 2.7799,
"step": 15140
},
{
"epoch": 12.508618020435545,
"grad_norm": 0.5716910362243652,
"learning_rate": 0.0008331039544912378,
"loss": 2.7791,
"step": 15150
},
{
"epoch": 12.516874806481576,
"grad_norm": 0.6005849242210388,
"learning_rate": 0.0008329204514175612,
"loss": 2.7852,
"step": 15160
},
{
"epoch": 12.52513159252761,
"grad_norm": 0.5944279432296753,
"learning_rate": 0.0008327369483438848,
"loss": 2.7757,
"step": 15170
},
{
"epoch": 12.53338837857364,
"grad_norm": 0.6027126908302307,
"learning_rate": 0.0008325534452702083,
"loss": 2.7776,
"step": 15180
},
{
"epoch": 12.541645164619672,
"grad_norm": 0.6317790746688843,
"learning_rate": 0.0008323699421965319,
"loss": 2.773,
"step": 15190
},
{
"epoch": 12.549901950665703,
"grad_norm": 0.5477875471115112,
"learning_rate": 0.0008321864391228553,
"loss": 2.7858,
"step": 15200
},
{
"epoch": 12.558158736711736,
"grad_norm": 0.5689815282821655,
"learning_rate": 0.0008320029360491789,
"loss": 2.7735,
"step": 15210
},
{
"epoch": 12.566415522757767,
"grad_norm": 0.6152288317680359,
"learning_rate": 0.0008318194329755024,
"loss": 2.7855,
"step": 15220
},
{
"epoch": 12.574672308803798,
"grad_norm": 0.5703557133674622,
"learning_rate": 0.000831635929901826,
"loss": 2.7674,
"step": 15230
},
{
"epoch": 12.58292909484983,
"grad_norm": 0.6394575834274292,
"learning_rate": 0.0008314524268281493,
"loss": 2.7703,
"step": 15240
},
{
"epoch": 12.59118588089586,
"grad_norm": 0.5734837055206299,
"learning_rate": 0.0008312689237544729,
"loss": 2.7765,
"step": 15250
},
{
"epoch": 12.599442666941894,
"grad_norm": 0.594292402267456,
"learning_rate": 0.0008310854206807964,
"loss": 2.7674,
"step": 15260
},
{
"epoch": 12.607699452987925,
"grad_norm": 0.5458073616027832,
"learning_rate": 0.0008309019176071199,
"loss": 2.7778,
"step": 15270
},
{
"epoch": 12.615956239033956,
"grad_norm": 0.5974953174591064,
"learning_rate": 0.0008307184145334434,
"loss": 2.771,
"step": 15280
},
{
"epoch": 12.624213025079987,
"grad_norm": 0.6053661108016968,
"learning_rate": 0.000830534911459767,
"loss": 2.7737,
"step": 15290
},
{
"epoch": 12.632469811126018,
"grad_norm": 0.5710778832435608,
"learning_rate": 0.0008303514083860905,
"loss": 2.7705,
"step": 15300
},
{
"epoch": 12.640726597172051,
"grad_norm": 0.5878491401672363,
"learning_rate": 0.000830167905312414,
"loss": 2.7832,
"step": 15310
},
{
"epoch": 12.648983383218082,
"grad_norm": 0.5833500623703003,
"learning_rate": 0.0008299844022387375,
"loss": 2.7734,
"step": 15320
},
{
"epoch": 12.657240169264114,
"grad_norm": 0.5963436961174011,
"learning_rate": 0.0008298008991650611,
"loss": 2.7795,
"step": 15330
},
{
"epoch": 12.665496955310145,
"grad_norm": 0.6217861175537109,
"learning_rate": 0.0008296173960913846,
"loss": 2.7715,
"step": 15340
},
{
"epoch": 12.673753741356178,
"grad_norm": 0.546258807182312,
"learning_rate": 0.0008294338930177081,
"loss": 2.7821,
"step": 15350
},
{
"epoch": 12.682010527402209,
"grad_norm": 0.6429739594459534,
"learning_rate": 0.0008292503899440316,
"loss": 2.7808,
"step": 15360
},
{
"epoch": 12.69026731344824,
"grad_norm": 0.6150422096252441,
"learning_rate": 0.0008290668868703551,
"loss": 2.7709,
"step": 15370
},
{
"epoch": 12.698524099494271,
"grad_norm": 0.5569972991943359,
"learning_rate": 0.0008288833837966786,
"loss": 2.7778,
"step": 15380
},
{
"epoch": 12.706780885540303,
"grad_norm": 0.5828894972801208,
"learning_rate": 0.0008286998807230021,
"loss": 2.7719,
"step": 15390
},
{
"epoch": 12.715037671586336,
"grad_norm": 0.5625948309898376,
"learning_rate": 0.0008285163776493256,
"loss": 2.7635,
"step": 15400
},
{
"epoch": 12.723294457632367,
"grad_norm": 0.6146851778030396,
"learning_rate": 0.0008283328745756492,
"loss": 2.791,
"step": 15410
},
{
"epoch": 12.731551243678398,
"grad_norm": 0.5903885364532471,
"learning_rate": 0.0008281493715019727,
"loss": 2.7738,
"step": 15420
},
{
"epoch": 12.73980802972443,
"grad_norm": 0.5333955883979797,
"learning_rate": 0.0008279658684282962,
"loss": 2.7711,
"step": 15430
},
{
"epoch": 12.748064815770462,
"grad_norm": 0.5588700175285339,
"learning_rate": 0.0008277823653546197,
"loss": 2.7776,
"step": 15440
},
{
"epoch": 12.756321601816493,
"grad_norm": 0.6176479458808899,
"learning_rate": 0.0008275988622809433,
"loss": 2.7769,
"step": 15450
},
{
"epoch": 12.764578387862525,
"grad_norm": 0.5709108114242554,
"learning_rate": 0.0008274153592072668,
"loss": 2.7691,
"step": 15460
},
{
"epoch": 12.772835173908556,
"grad_norm": 0.5612215995788574,
"learning_rate": 0.0008272318561335903,
"loss": 2.7771,
"step": 15470
},
{
"epoch": 12.781091959954587,
"grad_norm": 0.582386314868927,
"learning_rate": 0.0008270483530599138,
"loss": 2.7688,
"step": 15480
},
{
"epoch": 12.78934874600062,
"grad_norm": 0.5977119207382202,
"learning_rate": 0.0008268648499862373,
"loss": 2.776,
"step": 15490
},
{
"epoch": 12.797605532046651,
"grad_norm": 0.5754312872886658,
"learning_rate": 0.0008266813469125608,
"loss": 2.761,
"step": 15500
},
{
"epoch": 12.805862318092682,
"grad_norm": 0.56341552734375,
"learning_rate": 0.0008264978438388842,
"loss": 2.7812,
"step": 15510
},
{
"epoch": 12.814119104138713,
"grad_norm": 0.5888708829879761,
"learning_rate": 0.0008263143407652078,
"loss": 2.7708,
"step": 15520
},
{
"epoch": 12.822375890184745,
"grad_norm": 0.5750503540039062,
"learning_rate": 0.0008261308376915313,
"loss": 2.7895,
"step": 15530
},
{
"epoch": 12.830632676230778,
"grad_norm": 0.5679807662963867,
"learning_rate": 0.0008259473346178549,
"loss": 2.7826,
"step": 15540
},
{
"epoch": 12.838889462276809,
"grad_norm": 0.5332905054092407,
"learning_rate": 0.0008257638315441783,
"loss": 2.774,
"step": 15550
},
{
"epoch": 12.84714624832284,
"grad_norm": 0.5367740392684937,
"learning_rate": 0.0008255803284705019,
"loss": 2.7621,
"step": 15560
},
{
"epoch": 12.855403034368871,
"grad_norm": 0.6053501963615417,
"learning_rate": 0.0008253968253968254,
"loss": 2.7633,
"step": 15570
},
{
"epoch": 12.863659820414904,
"grad_norm": 0.5788416862487793,
"learning_rate": 0.000825213322323149,
"loss": 2.7689,
"step": 15580
},
{
"epoch": 12.871916606460935,
"grad_norm": 0.5835745334625244,
"learning_rate": 0.0008250298192494724,
"loss": 2.7746,
"step": 15590
},
{
"epoch": 12.880173392506967,
"grad_norm": 0.6038824915885925,
"learning_rate": 0.000824846316175796,
"loss": 2.7778,
"step": 15600
},
{
"epoch": 12.888430178552998,
"grad_norm": 0.5711358785629272,
"learning_rate": 0.0008246628131021195,
"loss": 2.7828,
"step": 15610
},
{
"epoch": 12.89668696459903,
"grad_norm": 0.6088118553161621,
"learning_rate": 0.0008244793100284431,
"loss": 2.7833,
"step": 15620
},
{
"epoch": 12.904943750645062,
"grad_norm": 0.6028804183006287,
"learning_rate": 0.0008242958069547664,
"loss": 2.7823,
"step": 15630
},
{
"epoch": 12.913200536691093,
"grad_norm": 0.5889461636543274,
"learning_rate": 0.00082411230388109,
"loss": 2.7751,
"step": 15640
},
{
"epoch": 12.921457322737124,
"grad_norm": 0.5903311967849731,
"learning_rate": 0.0008239288008074135,
"loss": 2.7712,
"step": 15650
},
{
"epoch": 12.929714108783156,
"grad_norm": 0.5665178894996643,
"learning_rate": 0.000823745297733737,
"loss": 2.7639,
"step": 15660
},
{
"epoch": 12.937970894829188,
"grad_norm": 0.5634979605674744,
"learning_rate": 0.0008235617946600605,
"loss": 2.7664,
"step": 15670
},
{
"epoch": 12.94622768087522,
"grad_norm": 0.5990162491798401,
"learning_rate": 0.0008233782915863841,
"loss": 2.778,
"step": 15680
},
{
"epoch": 12.95448446692125,
"grad_norm": 0.558689296245575,
"learning_rate": 0.0008231947885127076,
"loss": 2.7743,
"step": 15690
},
{
"epoch": 12.962741252967282,
"grad_norm": 0.546913206577301,
"learning_rate": 0.0008230112854390311,
"loss": 2.7657,
"step": 15700
},
{
"epoch": 12.970998039013313,
"grad_norm": 0.6025896072387695,
"learning_rate": 0.0008228277823653546,
"loss": 2.768,
"step": 15710
},
{
"epoch": 12.979254825059346,
"grad_norm": 0.5498492121696472,
"learning_rate": 0.0008226442792916782,
"loss": 2.7725,
"step": 15720
},
{
"epoch": 12.987511611105377,
"grad_norm": 0.6049798130989075,
"learning_rate": 0.0008224607762180017,
"loss": 2.7694,
"step": 15730
},
{
"epoch": 12.995768397151409,
"grad_norm": 0.5635313987731934,
"learning_rate": 0.0008222772731443252,
"loss": 2.7658,
"step": 15740
},
{
"epoch": 13.003302714418412,
"grad_norm": 0.6339975595474243,
"learning_rate": 0.0008220937700706487,
"loss": 2.5294,
"step": 15750
},
{
"epoch": 13.011559500464445,
"grad_norm": 0.5738035440444946,
"learning_rate": 0.0008219102669969723,
"loss": 2.768,
"step": 15760
},
{
"epoch": 13.019816286510476,
"grad_norm": 0.6072455644607544,
"learning_rate": 0.0008217267639232957,
"loss": 2.7673,
"step": 15770
},
{
"epoch": 13.028073072556507,
"grad_norm": 0.5527037978172302,
"learning_rate": 0.0008215432608496192,
"loss": 2.7617,
"step": 15780
},
{
"epoch": 13.036329858602539,
"grad_norm": 0.5809829831123352,
"learning_rate": 0.0008213597577759427,
"loss": 2.7681,
"step": 15790
},
{
"epoch": 13.04458664464857,
"grad_norm": 0.6070693135261536,
"learning_rate": 0.0008211762547022663,
"loss": 2.7738,
"step": 15800
},
{
"epoch": 13.052843430694603,
"grad_norm": 0.6138054132461548,
"learning_rate": 0.0008209927516285898,
"loss": 2.7669,
"step": 15810
},
{
"epoch": 13.061100216740634,
"grad_norm": 0.5852046012878418,
"learning_rate": 0.0008208092485549133,
"loss": 2.757,
"step": 15820
},
{
"epoch": 13.069357002786665,
"grad_norm": 0.5757762789726257,
"learning_rate": 0.0008206257454812368,
"loss": 2.7668,
"step": 15830
},
{
"epoch": 13.077613788832696,
"grad_norm": 0.5901942253112793,
"learning_rate": 0.0008204422424075604,
"loss": 2.7672,
"step": 15840
},
{
"epoch": 13.085870574878728,
"grad_norm": 0.6006907224655151,
"learning_rate": 0.0008202587393338839,
"loss": 2.7778,
"step": 15850
},
{
"epoch": 13.09412736092476,
"grad_norm": 0.6453226208686829,
"learning_rate": 0.0008200752362602074,
"loss": 2.7627,
"step": 15860
},
{
"epoch": 13.102384146970792,
"grad_norm": 0.5957190990447998,
"learning_rate": 0.0008198917331865309,
"loss": 2.7666,
"step": 15870
},
{
"epoch": 13.110640933016823,
"grad_norm": 0.5730419754981995,
"learning_rate": 0.0008197082301128545,
"loss": 2.7531,
"step": 15880
},
{
"epoch": 13.118897719062854,
"grad_norm": 0.5487211346626282,
"learning_rate": 0.000819524727039178,
"loss": 2.762,
"step": 15890
},
{
"epoch": 13.127154505108887,
"grad_norm": 0.605567216873169,
"learning_rate": 0.0008193412239655014,
"loss": 2.7731,
"step": 15900
},
{
"epoch": 13.135411291154918,
"grad_norm": 0.6126657128334045,
"learning_rate": 0.0008191577208918249,
"loss": 2.7694,
"step": 15910
},
{
"epoch": 13.14366807720095,
"grad_norm": 0.5657448172569275,
"learning_rate": 0.0008189742178181485,
"loss": 2.7749,
"step": 15920
},
{
"epoch": 13.15192486324698,
"grad_norm": 0.5270867347717285,
"learning_rate": 0.000818790714744472,
"loss": 2.7536,
"step": 15930
},
{
"epoch": 13.160181649293012,
"grad_norm": 0.5588110685348511,
"learning_rate": 0.0008186072116707955,
"loss": 2.7682,
"step": 15940
},
{
"epoch": 13.168438435339045,
"grad_norm": 0.5444889664649963,
"learning_rate": 0.000818423708597119,
"loss": 2.7521,
"step": 15950
},
{
"epoch": 13.176695221385076,
"grad_norm": 0.5641809105873108,
"learning_rate": 0.0008182402055234426,
"loss": 2.7611,
"step": 15960
},
{
"epoch": 13.184952007431107,
"grad_norm": 0.572223424911499,
"learning_rate": 0.0008180567024497661,
"loss": 2.7601,
"step": 15970
},
{
"epoch": 13.193208793477138,
"grad_norm": 0.6031824946403503,
"learning_rate": 0.0008178731993760896,
"loss": 2.7721,
"step": 15980
},
{
"epoch": 13.201465579523171,
"grad_norm": 0.5700808167457581,
"learning_rate": 0.0008176896963024131,
"loss": 2.7592,
"step": 15990
},
{
"epoch": 13.209722365569203,
"grad_norm": 0.5641034245491028,
"learning_rate": 0.0008175061932287367,
"loss": 2.7622,
"step": 16000
},
{
"epoch": 13.217979151615234,
"grad_norm": 0.5520575642585754,
"learning_rate": 0.0008173226901550602,
"loss": 2.7675,
"step": 16010
},
{
"epoch": 13.226235937661265,
"grad_norm": 0.6408013701438904,
"learning_rate": 0.0008171391870813837,
"loss": 2.7563,
"step": 16020
},
{
"epoch": 13.234492723707296,
"grad_norm": 0.5883532762527466,
"learning_rate": 0.0008169556840077071,
"loss": 2.7659,
"step": 16030
},
{
"epoch": 13.24274950975333,
"grad_norm": 0.5909677147865295,
"learning_rate": 0.0008167721809340307,
"loss": 2.7637,
"step": 16040
},
{
"epoch": 13.25100629579936,
"grad_norm": 0.5736511945724487,
"learning_rate": 0.0008165886778603541,
"loss": 2.7526,
"step": 16050
},
{
"epoch": 13.259263081845392,
"grad_norm": 0.5763906240463257,
"learning_rate": 0.0008164051747866776,
"loss": 2.7717,
"step": 16060
},
{
"epoch": 13.267519867891423,
"grad_norm": 0.5538901090621948,
"learning_rate": 0.0008162216717130012,
"loss": 2.7666,
"step": 16070
},
{
"epoch": 13.275776653937454,
"grad_norm": 0.6136773824691772,
"learning_rate": 0.0008160381686393247,
"loss": 2.7473,
"step": 16080
},
{
"epoch": 13.284033439983487,
"grad_norm": 0.5361644625663757,
"learning_rate": 0.0008158546655656482,
"loss": 2.7581,
"step": 16090
},
{
"epoch": 13.292290226029518,
"grad_norm": 0.5708776116371155,
"learning_rate": 0.0008156711624919717,
"loss": 2.758,
"step": 16100
},
{
"epoch": 13.30054701207555,
"grad_norm": 0.5603443384170532,
"learning_rate": 0.0008154876594182953,
"loss": 2.7581,
"step": 16110
},
{
"epoch": 13.30880379812158,
"grad_norm": 0.56572026014328,
"learning_rate": 0.0008153041563446188,
"loss": 2.7618,
"step": 16120
},
{
"epoch": 13.317060584167614,
"grad_norm": 0.6429868936538696,
"learning_rate": 0.0008151206532709423,
"loss": 2.7641,
"step": 16130
},
{
"epoch": 13.325317370213645,
"grad_norm": 0.6419973969459534,
"learning_rate": 0.0008149371501972658,
"loss": 2.7685,
"step": 16140
},
{
"epoch": 13.333574156259676,
"grad_norm": 0.6389254331588745,
"learning_rate": 0.0008147536471235894,
"loss": 2.7607,
"step": 16150
},
{
"epoch": 13.341830942305707,
"grad_norm": 0.5886973142623901,
"learning_rate": 0.0008145701440499128,
"loss": 2.7545,
"step": 16160
},
{
"epoch": 13.350087728351738,
"grad_norm": 0.6030955910682678,
"learning_rate": 0.0008143866409762363,
"loss": 2.7703,
"step": 16170
},
{
"epoch": 13.358344514397771,
"grad_norm": 0.5616552233695984,
"learning_rate": 0.0008142031379025598,
"loss": 2.763,
"step": 16180
},
{
"epoch": 13.366601300443802,
"grad_norm": 0.6055645942687988,
"learning_rate": 0.0008140196348288834,
"loss": 2.7692,
"step": 16190
},
{
"epoch": 13.374858086489834,
"grad_norm": 0.6489038467407227,
"learning_rate": 0.0008138361317552069,
"loss": 2.7599,
"step": 16200
},
{
"epoch": 13.383114872535865,
"grad_norm": 0.5819231867790222,
"learning_rate": 0.0008136526286815304,
"loss": 2.7609,
"step": 16210
},
{
"epoch": 13.391371658581898,
"grad_norm": 0.6634029150009155,
"learning_rate": 0.0008134691256078539,
"loss": 2.7662,
"step": 16220
},
{
"epoch": 13.399628444627929,
"grad_norm": 0.591643750667572,
"learning_rate": 0.0008132856225341775,
"loss": 2.762,
"step": 16230
},
{
"epoch": 13.40788523067396,
"grad_norm": 0.5705656409263611,
"learning_rate": 0.000813102119460501,
"loss": 2.7483,
"step": 16240
},
{
"epoch": 13.416142016719991,
"grad_norm": 0.597012996673584,
"learning_rate": 0.0008129186163868245,
"loss": 2.77,
"step": 16250
},
{
"epoch": 13.424398802766023,
"grad_norm": 0.6613156795501709,
"learning_rate": 0.000812735113313148,
"loss": 2.7585,
"step": 16260
},
{
"epoch": 13.432655588812056,
"grad_norm": 0.5879620313644409,
"learning_rate": 0.0008125516102394716,
"loss": 2.7628,
"step": 16270
},
{
"epoch": 13.440912374858087,
"grad_norm": 0.5478769540786743,
"learning_rate": 0.0008123681071657951,
"loss": 2.7576,
"step": 16280
},
{
"epoch": 13.449169160904118,
"grad_norm": 0.5667441487312317,
"learning_rate": 0.0008121846040921185,
"loss": 2.7507,
"step": 16290
},
{
"epoch": 13.45742594695015,
"grad_norm": 0.5518149733543396,
"learning_rate": 0.000812001101018442,
"loss": 2.7519,
"step": 16300
},
{
"epoch": 13.465682732996182,
"grad_norm": 0.6225078701972961,
"learning_rate": 0.0008118175979447656,
"loss": 2.7632,
"step": 16310
},
{
"epoch": 13.473939519042213,
"grad_norm": 0.587518572807312,
"learning_rate": 0.0008116340948710891,
"loss": 2.7691,
"step": 16320
},
{
"epoch": 13.482196305088245,
"grad_norm": 0.5977440476417542,
"learning_rate": 0.0008114505917974126,
"loss": 2.7503,
"step": 16330
},
{
"epoch": 13.490453091134276,
"grad_norm": 0.5600082278251648,
"learning_rate": 0.0008112670887237361,
"loss": 2.7622,
"step": 16340
},
{
"epoch": 13.498709877180307,
"grad_norm": 0.560407817363739,
"learning_rate": 0.0008110835856500597,
"loss": 2.7697,
"step": 16350
},
{
"epoch": 13.50696666322634,
"grad_norm": 0.5983129143714905,
"learning_rate": 0.0008109000825763832,
"loss": 2.7498,
"step": 16360
},
{
"epoch": 13.515223449272371,
"grad_norm": 0.5887816548347473,
"learning_rate": 0.0008107165795027067,
"loss": 2.7707,
"step": 16370
},
{
"epoch": 13.523480235318402,
"grad_norm": 0.5647554993629456,
"learning_rate": 0.0008105330764290302,
"loss": 2.7625,
"step": 16380
},
{
"epoch": 13.531737021364433,
"grad_norm": 0.551149845123291,
"learning_rate": 0.0008103495733553538,
"loss": 2.7564,
"step": 16390
},
{
"epoch": 13.539993807410465,
"grad_norm": 0.568866491317749,
"learning_rate": 0.0008101660702816773,
"loss": 2.751,
"step": 16400
},
{
"epoch": 13.548250593456498,
"grad_norm": 0.5884142518043518,
"learning_rate": 0.0008099825672080008,
"loss": 2.7613,
"step": 16410
},
{
"epoch": 13.556507379502529,
"grad_norm": 0.5169154405593872,
"learning_rate": 0.0008097990641343242,
"loss": 2.7544,
"step": 16420
},
{
"epoch": 13.56476416554856,
"grad_norm": 0.6176019310951233,
"learning_rate": 0.0008096155610606478,
"loss": 2.7581,
"step": 16430
},
{
"epoch": 13.573020951594591,
"grad_norm": 0.6097131967544556,
"learning_rate": 0.0008094320579869712,
"loss": 2.7634,
"step": 16440
},
{
"epoch": 13.581277737640624,
"grad_norm": 0.6191734075546265,
"learning_rate": 0.0008092485549132948,
"loss": 2.758,
"step": 16450
},
{
"epoch": 13.589534523686655,
"grad_norm": 0.5689364075660706,
"learning_rate": 0.0008090650518396183,
"loss": 2.7614,
"step": 16460
},
{
"epoch": 13.597791309732687,
"grad_norm": 0.6023751497268677,
"learning_rate": 0.0008088815487659419,
"loss": 2.775,
"step": 16470
},
{
"epoch": 13.606048095778718,
"grad_norm": 0.5691829323768616,
"learning_rate": 0.0008086980456922653,
"loss": 2.7631,
"step": 16480
},
{
"epoch": 13.614304881824749,
"grad_norm": 0.5723507404327393,
"learning_rate": 0.0008085145426185889,
"loss": 2.7529,
"step": 16490
},
{
"epoch": 13.622561667870782,
"grad_norm": 0.5878212451934814,
"learning_rate": 0.0008083310395449124,
"loss": 2.7629,
"step": 16500
},
{
"epoch": 13.630818453916813,
"grad_norm": 0.651943564414978,
"learning_rate": 0.000808147536471236,
"loss": 2.7539,
"step": 16510
},
{
"epoch": 13.639075239962844,
"grad_norm": 0.6334558129310608,
"learning_rate": 0.0008079640333975594,
"loss": 2.7562,
"step": 16520
},
{
"epoch": 13.647332026008876,
"grad_norm": 0.6675853133201599,
"learning_rate": 0.000807780530323883,
"loss": 2.7666,
"step": 16530
},
{
"epoch": 13.655588812054908,
"grad_norm": 0.5692960023880005,
"learning_rate": 0.0008075970272502065,
"loss": 2.7527,
"step": 16540
},
{
"epoch": 13.66384559810094,
"grad_norm": 0.5518311858177185,
"learning_rate": 0.00080741352417653,
"loss": 2.7626,
"step": 16550
},
{
"epoch": 13.67210238414697,
"grad_norm": 0.6077815890312195,
"learning_rate": 0.0008072300211028534,
"loss": 2.7613,
"step": 16560
},
{
"epoch": 13.680359170193002,
"grad_norm": 0.5508092641830444,
"learning_rate": 0.000807046518029177,
"loss": 2.7574,
"step": 16570
},
{
"epoch": 13.688615956239033,
"grad_norm": 0.5735660791397095,
"learning_rate": 0.0008068630149555005,
"loss": 2.7573,
"step": 16580
},
{
"epoch": 13.696872742285066,
"grad_norm": 0.5603192448616028,
"learning_rate": 0.0008066795118818241,
"loss": 2.7544,
"step": 16590
},
{
"epoch": 13.705129528331097,
"grad_norm": 0.556424081325531,
"learning_rate": 0.0008064960088081475,
"loss": 2.756,
"step": 16600
},
{
"epoch": 13.713386314377129,
"grad_norm": 0.5140565037727356,
"learning_rate": 0.0008063125057344711,
"loss": 2.7666,
"step": 16610
},
{
"epoch": 13.72164310042316,
"grad_norm": 0.5534517168998718,
"learning_rate": 0.0008061290026607946,
"loss": 2.7569,
"step": 16620
},
{
"epoch": 13.729899886469191,
"grad_norm": 0.6492647528648376,
"learning_rate": 0.0008059454995871181,
"loss": 2.7567,
"step": 16630
},
{
"epoch": 13.738156672515224,
"grad_norm": 0.5888465642929077,
"learning_rate": 0.0008057619965134416,
"loss": 2.7451,
"step": 16640
},
{
"epoch": 13.746413458561255,
"grad_norm": 0.6425179243087769,
"learning_rate": 0.0008055784934397651,
"loss": 2.7575,
"step": 16650
},
{
"epoch": 13.754670244607286,
"grad_norm": 0.5842881202697754,
"learning_rate": 0.0008053949903660887,
"loss": 2.756,
"step": 16660
},
{
"epoch": 13.762927030653318,
"grad_norm": 0.5675920248031616,
"learning_rate": 0.0008052114872924122,
"loss": 2.761,
"step": 16670
},
{
"epoch": 13.77118381669935,
"grad_norm": 0.532641589641571,
"learning_rate": 0.0008050279842187356,
"loss": 2.7615,
"step": 16680
},
{
"epoch": 13.779440602745382,
"grad_norm": 0.5731536149978638,
"learning_rate": 0.0008048444811450591,
"loss": 2.7562,
"step": 16690
},
{
"epoch": 13.787697388791413,
"grad_norm": 0.567754328250885,
"learning_rate": 0.0008046609780713827,
"loss": 2.7479,
"step": 16700
},
{
"epoch": 13.795954174837444,
"grad_norm": 0.524221658706665,
"learning_rate": 0.0008044774749977062,
"loss": 2.7509,
"step": 16710
},
{
"epoch": 13.804210960883475,
"grad_norm": 0.5846814513206482,
"learning_rate": 0.0008042939719240297,
"loss": 2.7475,
"step": 16720
},
{
"epoch": 13.812467746929508,
"grad_norm": 0.5527751445770264,
"learning_rate": 0.0008041104688503532,
"loss": 2.7608,
"step": 16730
},
{
"epoch": 13.82072453297554,
"grad_norm": 0.6005294919013977,
"learning_rate": 0.0008039269657766768,
"loss": 2.7499,
"step": 16740
},
{
"epoch": 13.82898131902157,
"grad_norm": 0.5409100651741028,
"learning_rate": 0.0008037434627030003,
"loss": 2.7473,
"step": 16750
},
{
"epoch": 13.837238105067602,
"grad_norm": 0.5972150564193726,
"learning_rate": 0.0008035599596293238,
"loss": 2.7597,
"step": 16760
},
{
"epoch": 13.845494891113635,
"grad_norm": 0.5449065566062927,
"learning_rate": 0.0008033764565556473,
"loss": 2.7471,
"step": 16770
},
{
"epoch": 13.853751677159666,
"grad_norm": 0.5764107704162598,
"learning_rate": 0.0008031929534819709,
"loss": 2.7548,
"step": 16780
},
{
"epoch": 13.862008463205697,
"grad_norm": 0.5843521356582642,
"learning_rate": 0.0008030094504082944,
"loss": 2.7527,
"step": 16790
},
{
"epoch": 13.870265249251728,
"grad_norm": 0.5988937020301819,
"learning_rate": 0.0008028259473346179,
"loss": 2.7538,
"step": 16800
},
{
"epoch": 13.87852203529776,
"grad_norm": 0.5904337763786316,
"learning_rate": 0.0008026424442609413,
"loss": 2.7502,
"step": 16810
},
{
"epoch": 13.886778821343793,
"grad_norm": 0.5412918329238892,
"learning_rate": 0.0008024589411872649,
"loss": 2.7522,
"step": 16820
},
{
"epoch": 13.895035607389824,
"grad_norm": 0.5681438446044922,
"learning_rate": 0.0008022754381135883,
"loss": 2.7576,
"step": 16830
},
{
"epoch": 13.903292393435855,
"grad_norm": 0.5728694796562195,
"learning_rate": 0.0008020919350399119,
"loss": 2.7549,
"step": 16840
},
{
"epoch": 13.911549179481886,
"grad_norm": 0.5923236608505249,
"learning_rate": 0.0008019084319662354,
"loss": 2.7553,
"step": 16850
},
{
"epoch": 13.919805965527917,
"grad_norm": 0.5946152210235596,
"learning_rate": 0.000801724928892559,
"loss": 2.7457,
"step": 16860
},
{
"epoch": 13.92806275157395,
"grad_norm": 0.5166122913360596,
"learning_rate": 0.0008015414258188824,
"loss": 2.7488,
"step": 16870
},
{
"epoch": 13.936319537619982,
"grad_norm": 0.5555543303489685,
"learning_rate": 0.000801357922745206,
"loss": 2.7606,
"step": 16880
},
{
"epoch": 13.944576323666013,
"grad_norm": 0.5452257990837097,
"learning_rate": 0.0008011744196715295,
"loss": 2.7558,
"step": 16890
},
{
"epoch": 13.952833109712044,
"grad_norm": 0.5303358435630798,
"learning_rate": 0.0008009909165978531,
"loss": 2.7481,
"step": 16900
},
{
"epoch": 13.961089895758077,
"grad_norm": 0.5449009537696838,
"learning_rate": 0.0008008074135241765,
"loss": 2.7548,
"step": 16910
},
{
"epoch": 13.969346681804108,
"grad_norm": 0.5688961148262024,
"learning_rate": 0.0008006239104505001,
"loss": 2.7543,
"step": 16920
},
{
"epoch": 13.97760346785014,
"grad_norm": 0.6097021698951721,
"learning_rate": 0.0008004404073768236,
"loss": 2.7521,
"step": 16930
},
{
"epoch": 13.98586025389617,
"grad_norm": 0.6139764189720154,
"learning_rate": 0.0008002569043031472,
"loss": 2.7544,
"step": 16940
},
{
"epoch": 13.994117039942202,
"grad_norm": 0.5823282599449158,
"learning_rate": 0.0008000734012294705,
"loss": 2.7485,
"step": 16950
},
{
"epoch": 14.001651357209207,
"grad_norm": 0.5491234064102173,
"learning_rate": 0.0007998898981557941,
"loss": 2.5171,
"step": 16960
},
{
"epoch": 14.009908143255238,
"grad_norm": 0.6469337940216064,
"learning_rate": 0.0007997063950821176,
"loss": 2.7401,
"step": 16970
},
{
"epoch": 14.01816492930127,
"grad_norm": 0.622250497341156,
"learning_rate": 0.0007995228920084412,
"loss": 2.7526,
"step": 16980
},
{
"epoch": 14.0264217153473,
"grad_norm": 0.6488636136054993,
"learning_rate": 0.0007993393889347646,
"loss": 2.7349,
"step": 16990
},
{
"epoch": 14.034678501393334,
"grad_norm": 0.5935384631156921,
"learning_rate": 0.0007991558858610882,
"loss": 2.7503,
"step": 17000
},
{
"epoch": 14.042935287439365,
"grad_norm": 0.6315668821334839,
"learning_rate": 0.0007989723827874117,
"loss": 2.7522,
"step": 17010
},
{
"epoch": 14.051192073485396,
"grad_norm": 0.607702910900116,
"learning_rate": 0.0007987888797137353,
"loss": 2.7474,
"step": 17020
},
{
"epoch": 14.059448859531427,
"grad_norm": 0.55247962474823,
"learning_rate": 0.0007986053766400587,
"loss": 2.7366,
"step": 17030
},
{
"epoch": 14.067705645577458,
"grad_norm": 0.5892691016197205,
"learning_rate": 0.0007984218735663823,
"loss": 2.7319,
"step": 17040
},
{
"epoch": 14.075962431623491,
"grad_norm": 0.5575072765350342,
"learning_rate": 0.0007982383704927058,
"loss": 2.753,
"step": 17050
},
{
"epoch": 14.084219217669522,
"grad_norm": 0.6110917329788208,
"learning_rate": 0.0007980548674190294,
"loss": 2.7465,
"step": 17060
},
{
"epoch": 14.092476003715554,
"grad_norm": 0.6070433855056763,
"learning_rate": 0.0007978713643453528,
"loss": 2.7533,
"step": 17070
},
{
"epoch": 14.100732789761585,
"grad_norm": 0.5724040865898132,
"learning_rate": 0.0007976878612716763,
"loss": 2.7412,
"step": 17080
},
{
"epoch": 14.108989575807616,
"grad_norm": 0.5734650492668152,
"learning_rate": 0.0007975043581979998,
"loss": 2.7417,
"step": 17090
},
{
"epoch": 14.117246361853649,
"grad_norm": 0.5555775165557861,
"learning_rate": 0.0007973208551243234,
"loss": 2.7436,
"step": 17100
},
{
"epoch": 14.12550314789968,
"grad_norm": 0.5774323344230652,
"learning_rate": 0.0007971373520506468,
"loss": 2.7413,
"step": 17110
},
{
"epoch": 14.133759933945711,
"grad_norm": 0.6438599824905396,
"learning_rate": 0.0007969538489769704,
"loss": 2.7539,
"step": 17120
},
{
"epoch": 14.142016719991743,
"grad_norm": 0.5561356544494629,
"learning_rate": 0.0007967703459032939,
"loss": 2.7405,
"step": 17130
},
{
"epoch": 14.150273506037776,
"grad_norm": 0.5886418223381042,
"learning_rate": 0.0007965868428296175,
"loss": 2.7599,
"step": 17140
},
{
"epoch": 14.158530292083807,
"grad_norm": 0.5819487571716309,
"learning_rate": 0.0007964033397559409,
"loss": 2.7569,
"step": 17150
},
{
"epoch": 14.166787078129838,
"grad_norm": 0.5723300576210022,
"learning_rate": 0.0007962198366822645,
"loss": 2.7404,
"step": 17160
},
{
"epoch": 14.17504386417587,
"grad_norm": 0.5738250017166138,
"learning_rate": 0.000796036333608588,
"loss": 2.7518,
"step": 17170
},
{
"epoch": 14.1833006502219,
"grad_norm": 0.5601485967636108,
"learning_rate": 0.0007958528305349116,
"loss": 2.7477,
"step": 17180
},
{
"epoch": 14.191557436267933,
"grad_norm": 0.5593155026435852,
"learning_rate": 0.000795669327461235,
"loss": 2.751,
"step": 17190
},
{
"epoch": 14.199814222313965,
"grad_norm": 0.5404049158096313,
"learning_rate": 0.0007954858243875585,
"loss": 2.7537,
"step": 17200
},
{
"epoch": 14.208071008359996,
"grad_norm": 0.5567106008529663,
"learning_rate": 0.000795302321313882,
"loss": 2.7499,
"step": 17210
},
{
"epoch": 14.216327794406027,
"grad_norm": 0.5681931376457214,
"learning_rate": 0.0007951188182402054,
"loss": 2.7598,
"step": 17220
},
{
"epoch": 14.22458458045206,
"grad_norm": 0.5726577639579773,
"learning_rate": 0.000794935315166529,
"loss": 2.7431,
"step": 17230
},
{
"epoch": 14.232841366498091,
"grad_norm": 0.5552230477333069,
"learning_rate": 0.0007947518120928525,
"loss": 2.7498,
"step": 17240
},
{
"epoch": 14.241098152544122,
"grad_norm": 0.5898513793945312,
"learning_rate": 0.0007945683090191761,
"loss": 2.7503,
"step": 17250
},
{
"epoch": 14.249354938590153,
"grad_norm": 0.5322459936141968,
"learning_rate": 0.0007943848059454995,
"loss": 2.7343,
"step": 17260
},
{
"epoch": 14.257611724636185,
"grad_norm": 0.62173992395401,
"learning_rate": 0.0007942013028718231,
"loss": 2.7424,
"step": 17270
},
{
"epoch": 14.265868510682218,
"grad_norm": 0.5796912908554077,
"learning_rate": 0.0007940177997981466,
"loss": 2.7477,
"step": 17280
},
{
"epoch": 14.274125296728249,
"grad_norm": 0.6236594915390015,
"learning_rate": 0.0007938342967244702,
"loss": 2.7553,
"step": 17290
},
{
"epoch": 14.28238208277428,
"grad_norm": 0.5684297680854797,
"learning_rate": 0.0007936507936507937,
"loss": 2.7347,
"step": 17300
},
{
"epoch": 14.290638868820311,
"grad_norm": 0.576805830001831,
"learning_rate": 0.0007934672905771172,
"loss": 2.7465,
"step": 17310
},
{
"epoch": 14.298895654866342,
"grad_norm": 0.6182284951210022,
"learning_rate": 0.0007932837875034407,
"loss": 2.746,
"step": 17320
},
{
"epoch": 14.307152440912375,
"grad_norm": 0.5486750602722168,
"learning_rate": 0.0007931002844297643,
"loss": 2.7496,
"step": 17330
},
{
"epoch": 14.315409226958407,
"grad_norm": 0.5673812627792358,
"learning_rate": 0.0007929167813560876,
"loss": 2.7365,
"step": 17340
},
{
"epoch": 14.323666013004438,
"grad_norm": 0.606238067150116,
"learning_rate": 0.0007927332782824112,
"loss": 2.7423,
"step": 17350
},
{
"epoch": 14.331922799050469,
"grad_norm": 0.555072009563446,
"learning_rate": 0.0007925497752087347,
"loss": 2.746,
"step": 17360
},
{
"epoch": 14.340179585096502,
"grad_norm": 0.5399696826934814,
"learning_rate": 0.0007923662721350583,
"loss": 2.7488,
"step": 17370
},
{
"epoch": 14.348436371142533,
"grad_norm": 0.5781683921813965,
"learning_rate": 0.0007921827690613817,
"loss": 2.7525,
"step": 17380
},
{
"epoch": 14.356693157188564,
"grad_norm": 0.5473909378051758,
"learning_rate": 0.0007919992659877053,
"loss": 2.7469,
"step": 17390
},
{
"epoch": 14.364949943234596,
"grad_norm": 0.5242516398429871,
"learning_rate": 0.0007918157629140288,
"loss": 2.737,
"step": 17400
},
{
"epoch": 14.373206729280627,
"grad_norm": 0.5968852043151855,
"learning_rate": 0.0007916322598403524,
"loss": 2.7457,
"step": 17410
},
{
"epoch": 14.38146351532666,
"grad_norm": 0.5766412615776062,
"learning_rate": 0.0007914487567666758,
"loss": 2.7326,
"step": 17420
},
{
"epoch": 14.38972030137269,
"grad_norm": 0.6067407131195068,
"learning_rate": 0.0007912652536929994,
"loss": 2.7426,
"step": 17430
},
{
"epoch": 14.397977087418722,
"grad_norm": 0.6106924414634705,
"learning_rate": 0.0007910817506193229,
"loss": 2.7525,
"step": 17440
},
{
"epoch": 14.406233873464753,
"grad_norm": 0.6435558199882507,
"learning_rate": 0.0007908982475456465,
"loss": 2.7428,
"step": 17450
},
{
"epoch": 14.414490659510786,
"grad_norm": 0.6241771578788757,
"learning_rate": 0.0007907147444719699,
"loss": 2.7438,
"step": 17460
},
{
"epoch": 14.422747445556817,
"grad_norm": 0.6236996054649353,
"learning_rate": 0.0007905312413982934,
"loss": 2.7496,
"step": 17470
},
{
"epoch": 14.431004231602849,
"grad_norm": 0.6004934310913086,
"learning_rate": 0.0007903477383246169,
"loss": 2.7483,
"step": 17480
},
{
"epoch": 14.43926101764888,
"grad_norm": 0.5864703059196472,
"learning_rate": 0.0007901642352509405,
"loss": 2.7421,
"step": 17490
},
{
"epoch": 14.447517803694911,
"grad_norm": 0.5803243517875671,
"learning_rate": 0.0007899807321772639,
"loss": 2.7512,
"step": 17500
},
{
"epoch": 14.455774589740944,
"grad_norm": 0.5815431475639343,
"learning_rate": 0.0007897972291035875,
"loss": 2.7539,
"step": 17510
},
{
"epoch": 14.464031375786975,
"grad_norm": 0.5773807168006897,
"learning_rate": 0.000789613726029911,
"loss": 2.7282,
"step": 17520
},
{
"epoch": 14.472288161833006,
"grad_norm": 0.561482846736908,
"learning_rate": 0.0007894302229562346,
"loss": 2.7368,
"step": 17530
},
{
"epoch": 14.480544947879038,
"grad_norm": 0.6419026255607605,
"learning_rate": 0.000789246719882558,
"loss": 2.752,
"step": 17540
},
{
"epoch": 14.488801733925069,
"grad_norm": 0.5817477107048035,
"learning_rate": 0.0007890632168088816,
"loss": 2.7464,
"step": 17550
},
{
"epoch": 14.497058519971102,
"grad_norm": 0.6521551609039307,
"learning_rate": 0.0007888797137352051,
"loss": 2.7582,
"step": 17560
},
{
"epoch": 14.505315306017133,
"grad_norm": 0.6004222631454468,
"learning_rate": 0.0007886962106615287,
"loss": 2.7429,
"step": 17570
},
{
"epoch": 14.513572092063164,
"grad_norm": 0.6220718026161194,
"learning_rate": 0.0007885127075878521,
"loss": 2.7376,
"step": 17580
},
{
"epoch": 14.521828878109195,
"grad_norm": 0.5441803336143494,
"learning_rate": 0.0007883292045141757,
"loss": 2.7418,
"step": 17590
},
{
"epoch": 14.530085664155228,
"grad_norm": 0.5832270383834839,
"learning_rate": 0.0007881457014404991,
"loss": 2.7377,
"step": 17600
},
{
"epoch": 14.53834245020126,
"grad_norm": 0.536746621131897,
"learning_rate": 0.0007879621983668227,
"loss": 2.7456,
"step": 17610
},
{
"epoch": 14.54659923624729,
"grad_norm": 0.5866507887840271,
"learning_rate": 0.0007877786952931461,
"loss": 2.7429,
"step": 17620
},
{
"epoch": 14.554856022293322,
"grad_norm": 0.5756723880767822,
"learning_rate": 0.0007875951922194697,
"loss": 2.743,
"step": 17630
},
{
"epoch": 14.563112808339355,
"grad_norm": 0.5826034545898438,
"learning_rate": 0.0007874116891457932,
"loss": 2.7452,
"step": 17640
},
{
"epoch": 14.571369594385386,
"grad_norm": 0.5977003574371338,
"learning_rate": 0.0007872281860721168,
"loss": 2.7443,
"step": 17650
},
{
"epoch": 14.579626380431417,
"grad_norm": 0.551539957523346,
"learning_rate": 0.0007870446829984402,
"loss": 2.7434,
"step": 17660
},
{
"epoch": 14.587883166477448,
"grad_norm": 0.6162058115005493,
"learning_rate": 0.0007868611799247638,
"loss": 2.734,
"step": 17670
},
{
"epoch": 14.59613995252348,
"grad_norm": 0.5811628103256226,
"learning_rate": 0.0007866776768510873,
"loss": 2.7447,
"step": 17680
},
{
"epoch": 14.604396738569513,
"grad_norm": 0.6103553771972656,
"learning_rate": 0.0007864941737774109,
"loss": 2.7472,
"step": 17690
},
{
"epoch": 14.612653524615544,
"grad_norm": 0.569419264793396,
"learning_rate": 0.0007863106707037343,
"loss": 2.7578,
"step": 17700
},
{
"epoch": 14.620910310661575,
"grad_norm": 0.6102364659309387,
"learning_rate": 0.0007861271676300579,
"loss": 2.7364,
"step": 17710
},
{
"epoch": 14.629167096707606,
"grad_norm": 0.5832472443580627,
"learning_rate": 0.0007859436645563814,
"loss": 2.7449,
"step": 17720
},
{
"epoch": 14.637423882753637,
"grad_norm": 0.5760400891304016,
"learning_rate": 0.0007857601614827049,
"loss": 2.7581,
"step": 17730
},
{
"epoch": 14.64568066879967,
"grad_norm": 0.6216306686401367,
"learning_rate": 0.0007855766584090283,
"loss": 2.732,
"step": 17740
},
{
"epoch": 14.653937454845702,
"grad_norm": 0.5639564394950867,
"learning_rate": 0.0007853931553353519,
"loss": 2.7582,
"step": 17750
},
{
"epoch": 14.662194240891733,
"grad_norm": 0.5887823700904846,
"learning_rate": 0.0007852096522616754,
"loss": 2.7492,
"step": 17760
},
{
"epoch": 14.670451026937764,
"grad_norm": 0.5743685364723206,
"learning_rate": 0.0007850261491879988,
"loss": 2.7484,
"step": 17770
},
{
"epoch": 14.678707812983795,
"grad_norm": 0.6122255921363831,
"learning_rate": 0.0007848426461143224,
"loss": 2.7326,
"step": 17780
},
{
"epoch": 14.686964599029828,
"grad_norm": 0.6089203357696533,
"learning_rate": 0.0007846591430406459,
"loss": 2.7411,
"step": 17790
},
{
"epoch": 14.69522138507586,
"grad_norm": 0.5829803347587585,
"learning_rate": 0.0007844756399669695,
"loss": 2.7406,
"step": 17800
},
{
"epoch": 14.70347817112189,
"grad_norm": 0.5928598642349243,
"learning_rate": 0.0007842921368932929,
"loss": 2.7462,
"step": 17810
},
{
"epoch": 14.711734957167922,
"grad_norm": 0.6143853664398193,
"learning_rate": 0.0007841086338196165,
"loss": 2.7446,
"step": 17820
},
{
"epoch": 14.719991743213955,
"grad_norm": 0.6457964777946472,
"learning_rate": 0.00078392513074594,
"loss": 2.7416,
"step": 17830
},
{
"epoch": 14.728248529259986,
"grad_norm": 0.6104548573493958,
"learning_rate": 0.0007837416276722636,
"loss": 2.7332,
"step": 17840
},
{
"epoch": 14.736505315306017,
"grad_norm": 0.5743314623832703,
"learning_rate": 0.000783558124598587,
"loss": 2.7459,
"step": 17850
},
{
"epoch": 14.744762101352048,
"grad_norm": 0.552040159702301,
"learning_rate": 0.0007833746215249105,
"loss": 2.747,
"step": 17860
},
{
"epoch": 14.753018887398081,
"grad_norm": 0.57485431432724,
"learning_rate": 0.000783191118451234,
"loss": 2.7543,
"step": 17870
},
{
"epoch": 14.761275673444112,
"grad_norm": 0.5415575504302979,
"learning_rate": 0.0007830076153775576,
"loss": 2.7432,
"step": 17880
},
{
"epoch": 14.769532459490144,
"grad_norm": 0.58236163854599,
"learning_rate": 0.000782824112303881,
"loss": 2.7429,
"step": 17890
},
{
"epoch": 14.777789245536175,
"grad_norm": 0.5532475709915161,
"learning_rate": 0.0007826406092302046,
"loss": 2.7298,
"step": 17900
},
{
"epoch": 14.786046031582206,
"grad_norm": 0.5620941519737244,
"learning_rate": 0.0007824571061565281,
"loss": 2.7398,
"step": 17910
},
{
"epoch": 14.794302817628239,
"grad_norm": 0.5772944688796997,
"learning_rate": 0.0007822736030828517,
"loss": 2.7326,
"step": 17920
},
{
"epoch": 14.80255960367427,
"grad_norm": 0.6066027879714966,
"learning_rate": 0.0007820901000091751,
"loss": 2.7341,
"step": 17930
},
{
"epoch": 14.810816389720301,
"grad_norm": 0.5544676184654236,
"learning_rate": 0.0007819065969354987,
"loss": 2.7498,
"step": 17940
},
{
"epoch": 14.819073175766333,
"grad_norm": 0.6160995364189148,
"learning_rate": 0.0007817230938618222,
"loss": 2.7362,
"step": 17950
},
{
"epoch": 14.827329961812364,
"grad_norm": 0.6500398516654968,
"learning_rate": 0.0007815395907881458,
"loss": 2.7412,
"step": 17960
},
{
"epoch": 14.835586747858397,
"grad_norm": 0.5683214068412781,
"learning_rate": 0.0007813560877144692,
"loss": 2.7469,
"step": 17970
},
{
"epoch": 14.843843533904428,
"grad_norm": 0.5637840032577515,
"learning_rate": 0.0007811725846407928,
"loss": 2.7378,
"step": 17980
},
{
"epoch": 14.852100319950459,
"grad_norm": 0.5927807092666626,
"learning_rate": 0.0007809890815671162,
"loss": 2.727,
"step": 17990
},
{
"epoch": 14.86035710599649,
"grad_norm": 0.611671507358551,
"learning_rate": 0.0007808055784934398,
"loss": 2.7437,
"step": 18000
}
],
"logging_steps": 10,
"max_steps": 60550,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.215610343472333e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}