KashiwaByte's picture
add LoRA model
71abb64
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.8648734680884926,
"eval_steps": 500,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0,
"loss": 7.7169,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 10.55540657043457,
"learning_rate": 9.997877083112197e-05,
"loss": 9.0438,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 6.060225009918213,
"learning_rate": 9.987262498673178e-05,
"loss": 3.211,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 2.5255496501922607,
"learning_rate": 9.976647914234159e-05,
"loss": 0.6387,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 2.976543664932251,
"learning_rate": 9.966033329795139e-05,
"loss": 0.5633,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 2.2680673599243164,
"learning_rate": 9.95541874535612e-05,
"loss": 0.474,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 3.136930465698242,
"learning_rate": 9.944804160917101e-05,
"loss": 0.3379,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 4.159604072570801,
"learning_rate": 9.935251034921983e-05,
"loss": 0.4444,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 1.704042911529541,
"learning_rate": 9.924636450482963e-05,
"loss": 0.4925,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 3.9414522647857666,
"learning_rate": 9.914021866043945e-05,
"loss": 0.4583,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 2.938662052154541,
"learning_rate": 9.903407281604927e-05,
"loss": 0.3838,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 1.8753790855407715,
"learning_rate": 9.892792697165907e-05,
"loss": 0.3247,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 1.75948965549469,
"learning_rate": 9.882178112726887e-05,
"loss": 0.3609,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 1.9066141843795776,
"learning_rate": 9.871563528287868e-05,
"loss": 0.3453,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": 1.7767695188522339,
"learning_rate": 9.86094894384885e-05,
"loss": 0.5076,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 2.5219664573669434,
"learning_rate": 9.85033435940983e-05,
"loss": 0.4999,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 2.0505383014678955,
"learning_rate": 9.83971977497081e-05,
"loss": 0.5429,
"step": 170
},
{
"epoch": 0.06,
"grad_norm": 6.132015705108643,
"learning_rate": 9.82910519053179e-05,
"loss": 0.5099,
"step": 180
},
{
"epoch": 0.06,
"grad_norm": 1.057868480682373,
"learning_rate": 9.818490606092772e-05,
"loss": 0.4416,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 2.6155290603637695,
"learning_rate": 9.807876021653753e-05,
"loss": 0.3986,
"step": 200
},
{
"epoch": 0.07,
"grad_norm": 2.1468820571899414,
"learning_rate": 9.797261437214733e-05,
"loss": 0.3216,
"step": 210
},
{
"epoch": 0.07,
"grad_norm": 0.6600925326347351,
"learning_rate": 9.786646852775713e-05,
"loss": 0.3552,
"step": 220
},
{
"epoch": 0.07,
"grad_norm": 5.129382133483887,
"learning_rate": 9.776032268336695e-05,
"loss": 0.3221,
"step": 230
},
{
"epoch": 0.08,
"grad_norm": 0.3891478180885315,
"learning_rate": 9.765417683897677e-05,
"loss": 0.4073,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 3.254958391189575,
"learning_rate": 9.754803099458657e-05,
"loss": 0.4212,
"step": 250
},
{
"epoch": 0.08,
"grad_norm": 3.34332013130188,
"learning_rate": 9.744188515019638e-05,
"loss": 0.2167,
"step": 260
},
{
"epoch": 0.09,
"grad_norm": 3.801086902618408,
"learning_rate": 9.733573930580618e-05,
"loss": 0.5605,
"step": 270
},
{
"epoch": 0.09,
"grad_norm": 5.026745796203613,
"learning_rate": 9.7229593461416e-05,
"loss": 0.3527,
"step": 280
},
{
"epoch": 0.09,
"grad_norm": 3.8389620780944824,
"learning_rate": 9.71234476170258e-05,
"loss": 0.295,
"step": 290
},
{
"epoch": 0.1,
"grad_norm": 2.0584566593170166,
"learning_rate": 9.70173017726356e-05,
"loss": 0.2759,
"step": 300
},
{
"epoch": 0.1,
"grad_norm": 3.132164239883423,
"learning_rate": 9.691115592824541e-05,
"loss": 0.3888,
"step": 310
},
{
"epoch": 0.1,
"grad_norm": 0.5387492179870605,
"learning_rate": 9.680501008385522e-05,
"loss": 0.2285,
"step": 320
},
{
"epoch": 0.11,
"grad_norm": 3.0382373332977295,
"learning_rate": 9.669886423946503e-05,
"loss": 0.2549,
"step": 330
},
{
"epoch": 0.11,
"grad_norm": 6.465576648712158,
"learning_rate": 9.659271839507483e-05,
"loss": 0.7377,
"step": 340
},
{
"epoch": 0.11,
"grad_norm": 4.1156134605407715,
"learning_rate": 9.648657255068465e-05,
"loss": 0.3387,
"step": 350
},
{
"epoch": 0.11,
"grad_norm": 4.147655963897705,
"learning_rate": 9.638042670629445e-05,
"loss": 0.2605,
"step": 360
},
{
"epoch": 0.12,
"grad_norm": 1.4572869539260864,
"learning_rate": 9.627428086190427e-05,
"loss": 0.3024,
"step": 370
},
{
"epoch": 0.12,
"grad_norm": 1.906175971031189,
"learning_rate": 9.616813501751407e-05,
"loss": 0.3728,
"step": 380
},
{
"epoch": 0.12,
"grad_norm": 1.169878363609314,
"learning_rate": 9.606198917312388e-05,
"loss": 0.3961,
"step": 390
},
{
"epoch": 0.13,
"grad_norm": 1.2084730863571167,
"learning_rate": 9.595584332873368e-05,
"loss": 0.4887,
"step": 400
},
{
"epoch": 0.13,
"grad_norm": 0.7927988171577454,
"learning_rate": 9.58496974843435e-05,
"loss": 0.4519,
"step": 410
},
{
"epoch": 0.13,
"grad_norm": 6.37067985534668,
"learning_rate": 9.57435516399533e-05,
"loss": 0.237,
"step": 420
},
{
"epoch": 0.14,
"grad_norm": 2.9806203842163086,
"learning_rate": 9.56374057955631e-05,
"loss": 0.2917,
"step": 430
},
{
"epoch": 0.14,
"grad_norm": 5.05634880065918,
"learning_rate": 9.553125995117291e-05,
"loss": 0.2794,
"step": 440
},
{
"epoch": 0.14,
"grad_norm": 3.0483241081237793,
"learning_rate": 9.542511410678273e-05,
"loss": 0.3182,
"step": 450
},
{
"epoch": 0.15,
"grad_norm": 3.2123796939849854,
"learning_rate": 9.531896826239253e-05,
"loss": 0.2872,
"step": 460
},
{
"epoch": 0.15,
"grad_norm": 1.532020092010498,
"learning_rate": 9.521282241800233e-05,
"loss": 0.3258,
"step": 470
},
{
"epoch": 0.15,
"grad_norm": 1.1242539882659912,
"learning_rate": 9.510667657361215e-05,
"loss": 0.3356,
"step": 480
},
{
"epoch": 0.16,
"grad_norm": 4.846567153930664,
"learning_rate": 9.500053072922196e-05,
"loss": 0.3551,
"step": 490
},
{
"epoch": 0.16,
"grad_norm": 3.233238458633423,
"learning_rate": 9.489438488483177e-05,
"loss": 0.3971,
"step": 500
},
{
"epoch": 0.16,
"grad_norm": 1.7334824800491333,
"learning_rate": 9.478823904044158e-05,
"loss": 0.1896,
"step": 510
},
{
"epoch": 0.17,
"grad_norm": 7.36009407043457,
"learning_rate": 9.468209319605138e-05,
"loss": 0.3338,
"step": 520
},
{
"epoch": 0.17,
"grad_norm": 2.7838549613952637,
"learning_rate": 9.457594735166118e-05,
"loss": 0.3331,
"step": 530
},
{
"epoch": 0.17,
"grad_norm": 2.643627405166626,
"learning_rate": 9.4469801507271e-05,
"loss": 0.4575,
"step": 540
},
{
"epoch": 0.18,
"grad_norm": 5.420917510986328,
"learning_rate": 9.43636556628808e-05,
"loss": 0.37,
"step": 550
},
{
"epoch": 0.18,
"grad_norm": 2.1689090728759766,
"learning_rate": 9.425750981849061e-05,
"loss": 0.3551,
"step": 560
},
{
"epoch": 0.18,
"grad_norm": 0.7210526466369629,
"learning_rate": 9.415136397410041e-05,
"loss": 0.4028,
"step": 570
},
{
"epoch": 0.18,
"grad_norm": 0.3214457929134369,
"learning_rate": 9.404521812971022e-05,
"loss": 0.2391,
"step": 580
},
{
"epoch": 0.19,
"grad_norm": 3.8258142471313477,
"learning_rate": 9.393907228532003e-05,
"loss": 0.3399,
"step": 590
},
{
"epoch": 0.19,
"grad_norm": 1.5249234437942505,
"learning_rate": 9.383292644092985e-05,
"loss": 0.4449,
"step": 600
},
{
"epoch": 0.19,
"grad_norm": 0.22292350232601166,
"learning_rate": 9.372678059653965e-05,
"loss": 0.2156,
"step": 610
},
{
"epoch": 0.2,
"grad_norm": 1.3040258884429932,
"learning_rate": 9.362063475214946e-05,
"loss": 0.4175,
"step": 620
},
{
"epoch": 0.2,
"grad_norm": 1.3762481212615967,
"learning_rate": 9.351448890775926e-05,
"loss": 0.3191,
"step": 630
},
{
"epoch": 0.2,
"grad_norm": 2.706467866897583,
"learning_rate": 9.340834306336908e-05,
"loss": 0.5163,
"step": 640
},
{
"epoch": 0.21,
"grad_norm": 1.8577134609222412,
"learning_rate": 9.330219721897888e-05,
"loss": 0.1832,
"step": 650
},
{
"epoch": 0.21,
"grad_norm": 5.450695037841797,
"learning_rate": 9.319605137458869e-05,
"loss": 0.269,
"step": 660
},
{
"epoch": 0.21,
"grad_norm": 3.1967124938964844,
"learning_rate": 9.308990553019849e-05,
"loss": 0.3387,
"step": 670
},
{
"epoch": 0.22,
"grad_norm": 2.2148098945617676,
"learning_rate": 9.29837596858083e-05,
"loss": 0.3407,
"step": 680
},
{
"epoch": 0.22,
"grad_norm": 2.2693583965301514,
"learning_rate": 9.287761384141811e-05,
"loss": 0.2758,
"step": 690
},
{
"epoch": 0.22,
"grad_norm": 4.460744857788086,
"learning_rate": 9.277146799702791e-05,
"loss": 0.2493,
"step": 700
},
{
"epoch": 0.23,
"grad_norm": 8.331945419311523,
"learning_rate": 9.266532215263772e-05,
"loss": 0.2264,
"step": 710
},
{
"epoch": 0.23,
"grad_norm": 2.7469747066497803,
"learning_rate": 9.255917630824753e-05,
"loss": 0.3038,
"step": 720
},
{
"epoch": 0.23,
"grad_norm": 3.013535737991333,
"learning_rate": 9.245303046385735e-05,
"loss": 0.3136,
"step": 730
},
{
"epoch": 0.24,
"grad_norm": 3.508979558944702,
"learning_rate": 9.234688461946716e-05,
"loss": 0.3502,
"step": 740
},
{
"epoch": 0.24,
"grad_norm": 5.0464301109313965,
"learning_rate": 9.224073877507696e-05,
"loss": 0.1776,
"step": 750
},
{
"epoch": 0.24,
"grad_norm": 1.6929841041564941,
"learning_rate": 9.213459293068676e-05,
"loss": 0.2984,
"step": 760
},
{
"epoch": 0.25,
"grad_norm": 1.1452223062515259,
"learning_rate": 9.202844708629658e-05,
"loss": 0.2503,
"step": 770
},
{
"epoch": 0.25,
"grad_norm": 1.3975647687911987,
"learning_rate": 9.192230124190638e-05,
"loss": 0.2423,
"step": 780
},
{
"epoch": 0.25,
"grad_norm": 1.8630661964416504,
"learning_rate": 9.181615539751619e-05,
"loss": 0.327,
"step": 790
},
{
"epoch": 0.25,
"grad_norm": 5.333163261413574,
"learning_rate": 9.171000955312599e-05,
"loss": 0.4495,
"step": 800
},
{
"epoch": 0.26,
"grad_norm": 1.6478999853134155,
"learning_rate": 9.160386370873581e-05,
"loss": 0.2546,
"step": 810
},
{
"epoch": 0.26,
"grad_norm": 1.2132633924484253,
"learning_rate": 9.149771786434561e-05,
"loss": 0.2439,
"step": 820
},
{
"epoch": 0.26,
"grad_norm": 2.2123448848724365,
"learning_rate": 9.139157201995542e-05,
"loss": 0.3715,
"step": 830
},
{
"epoch": 0.27,
"grad_norm": 2.148674726486206,
"learning_rate": 9.128542617556523e-05,
"loss": 0.252,
"step": 840
},
{
"epoch": 0.27,
"grad_norm": 3.6980788707733154,
"learning_rate": 9.117928033117504e-05,
"loss": 0.4487,
"step": 850
},
{
"epoch": 0.27,
"grad_norm": 6.548594951629639,
"learning_rate": 9.107313448678485e-05,
"loss": 0.2199,
"step": 860
},
{
"epoch": 0.28,
"grad_norm": 3.5746383666992188,
"learning_rate": 9.096698864239466e-05,
"loss": 0.2728,
"step": 870
},
{
"epoch": 0.28,
"grad_norm": 0.9120383858680725,
"learning_rate": 9.086084279800446e-05,
"loss": 0.2737,
"step": 880
},
{
"epoch": 0.28,
"grad_norm": 4.220329761505127,
"learning_rate": 9.075469695361427e-05,
"loss": 0.4124,
"step": 890
},
{
"epoch": 0.29,
"grad_norm": 2.5000956058502197,
"learning_rate": 9.064855110922408e-05,
"loss": 0.302,
"step": 900
},
{
"epoch": 0.29,
"grad_norm": 5.3845906257629395,
"learning_rate": 9.054240526483389e-05,
"loss": 0.4177,
"step": 910
},
{
"epoch": 0.29,
"grad_norm": 1.0533277988433838,
"learning_rate": 9.043625942044369e-05,
"loss": 0.3834,
"step": 920
},
{
"epoch": 0.3,
"grad_norm": 2.482363224029541,
"learning_rate": 9.03301135760535e-05,
"loss": 0.3497,
"step": 930
},
{
"epoch": 0.3,
"grad_norm": 2.785825729370117,
"learning_rate": 9.022396773166331e-05,
"loss": 0.2696,
"step": 940
},
{
"epoch": 0.3,
"grad_norm": 0.9899762868881226,
"learning_rate": 9.011782188727311e-05,
"loss": 0.3139,
"step": 950
},
{
"epoch": 0.31,
"grad_norm": 3.0521786212921143,
"learning_rate": 9.001167604288293e-05,
"loss": 0.4116,
"step": 960
},
{
"epoch": 0.31,
"grad_norm": 1.1553211212158203,
"learning_rate": 8.990553019849274e-05,
"loss": 0.3239,
"step": 970
},
{
"epoch": 0.31,
"grad_norm": 2.973958730697632,
"learning_rate": 8.979938435410254e-05,
"loss": 0.297,
"step": 980
},
{
"epoch": 0.32,
"grad_norm": 1.3011306524276733,
"learning_rate": 8.969323850971236e-05,
"loss": 0.3136,
"step": 990
},
{
"epoch": 0.32,
"grad_norm": 2.6845755577087402,
"learning_rate": 8.958709266532216e-05,
"loss": 0.3207,
"step": 1000
},
{
"epoch": 0.32,
"grad_norm": 0.33025118708610535,
"learning_rate": 8.948094682093196e-05,
"loss": 0.1847,
"step": 1010
},
{
"epoch": 0.32,
"grad_norm": 1.9631307125091553,
"learning_rate": 8.937480097654177e-05,
"loss": 0.2798,
"step": 1020
},
{
"epoch": 0.33,
"grad_norm": 1.952580451965332,
"learning_rate": 8.926865513215158e-05,
"loss": 0.2184,
"step": 1030
},
{
"epoch": 0.33,
"grad_norm": 5.541811466217041,
"learning_rate": 8.916250928776139e-05,
"loss": 0.2649,
"step": 1040
},
{
"epoch": 0.33,
"grad_norm": 1.0800001621246338,
"learning_rate": 8.905636344337119e-05,
"loss": 0.3064,
"step": 1050
},
{
"epoch": 0.34,
"grad_norm": 4.908554553985596,
"learning_rate": 8.8950217598981e-05,
"loss": 0.2,
"step": 1060
},
{
"epoch": 0.34,
"grad_norm": 0.08677980303764343,
"learning_rate": 8.884407175459081e-05,
"loss": 0.1262,
"step": 1070
},
{
"epoch": 0.34,
"grad_norm": 1.9461978673934937,
"learning_rate": 8.873792591020062e-05,
"loss": 0.3098,
"step": 1080
},
{
"epoch": 0.35,
"grad_norm": 0.11714805662631989,
"learning_rate": 8.863178006581043e-05,
"loss": 0.3596,
"step": 1090
},
{
"epoch": 0.35,
"grad_norm": 2.0041699409484863,
"learning_rate": 8.852563422142024e-05,
"loss": 0.2518,
"step": 1100
},
{
"epoch": 0.35,
"grad_norm": 5.036510467529297,
"learning_rate": 8.841948837703004e-05,
"loss": 0.3654,
"step": 1110
},
{
"epoch": 0.36,
"grad_norm": 2.267143726348877,
"learning_rate": 8.831334253263986e-05,
"loss": 0.2812,
"step": 1120
},
{
"epoch": 0.36,
"grad_norm": 3.063321113586426,
"learning_rate": 8.820719668824966e-05,
"loss": 0.3135,
"step": 1130
},
{
"epoch": 0.36,
"grad_norm": 4.012215614318848,
"learning_rate": 8.810105084385947e-05,
"loss": 0.2423,
"step": 1140
},
{
"epoch": 0.37,
"grad_norm": 1.7306702136993408,
"learning_rate": 8.799490499946927e-05,
"loss": 0.187,
"step": 1150
},
{
"epoch": 0.37,
"grad_norm": 1.7319563627243042,
"learning_rate": 8.788875915507909e-05,
"loss": 0.3792,
"step": 1160
},
{
"epoch": 0.37,
"grad_norm": 4.382763862609863,
"learning_rate": 8.778261331068889e-05,
"loss": 0.483,
"step": 1170
},
{
"epoch": 0.38,
"grad_norm": 1.3643946647644043,
"learning_rate": 8.76764674662987e-05,
"loss": 0.1497,
"step": 1180
},
{
"epoch": 0.38,
"grad_norm": 5.549211025238037,
"learning_rate": 8.75703216219085e-05,
"loss": 0.2628,
"step": 1190
},
{
"epoch": 0.38,
"grad_norm": 2.2046520709991455,
"learning_rate": 8.747479036195734e-05,
"loss": 0.3474,
"step": 1200
},
{
"epoch": 0.39,
"grad_norm": 3.313180446624756,
"learning_rate": 8.736864451756715e-05,
"loss": 0.3096,
"step": 1210
},
{
"epoch": 0.39,
"grad_norm": 2.811859130859375,
"learning_rate": 8.726249867317695e-05,
"loss": 0.1371,
"step": 1220
},
{
"epoch": 0.39,
"grad_norm": 0.43377700448036194,
"learning_rate": 8.715635282878675e-05,
"loss": 0.2461,
"step": 1230
},
{
"epoch": 0.39,
"grad_norm": 2.7710583209991455,
"learning_rate": 8.705020698439657e-05,
"loss": 0.3332,
"step": 1240
},
{
"epoch": 0.4,
"grad_norm": 0.4188406467437744,
"learning_rate": 8.694406114000637e-05,
"loss": 0.3196,
"step": 1250
},
{
"epoch": 0.4,
"grad_norm": 0.7705641388893127,
"learning_rate": 8.683791529561618e-05,
"loss": 0.1709,
"step": 1260
},
{
"epoch": 0.4,
"grad_norm": 2.6247994899749756,
"learning_rate": 8.673176945122598e-05,
"loss": 0.3033,
"step": 1270
},
{
"epoch": 0.41,
"grad_norm": 1.033170461654663,
"learning_rate": 8.66256236068358e-05,
"loss": 0.2506,
"step": 1280
},
{
"epoch": 0.41,
"grad_norm": 4.289760112762451,
"learning_rate": 8.65194777624456e-05,
"loss": 0.2839,
"step": 1290
},
{
"epoch": 0.41,
"grad_norm": 1.3554538488388062,
"learning_rate": 8.64133319180554e-05,
"loss": 0.2703,
"step": 1300
},
{
"epoch": 0.42,
"grad_norm": 1.9523005485534668,
"learning_rate": 8.630718607366522e-05,
"loss": 0.1133,
"step": 1310
},
{
"epoch": 0.42,
"grad_norm": 5.332389831542969,
"learning_rate": 8.620104022927503e-05,
"loss": 0.3579,
"step": 1320
},
{
"epoch": 0.42,
"grad_norm": 5.874100208282471,
"learning_rate": 8.609489438488484e-05,
"loss": 0.4038,
"step": 1330
},
{
"epoch": 0.43,
"grad_norm": 1.4143377542495728,
"learning_rate": 8.598874854049465e-05,
"loss": 0.2451,
"step": 1340
},
{
"epoch": 0.43,
"grad_norm": 0.5176362991333008,
"learning_rate": 8.588260269610445e-05,
"loss": 0.2561,
"step": 1350
},
{
"epoch": 0.43,
"grad_norm": 1.5968561172485352,
"learning_rate": 8.577645685171426e-05,
"loss": 0.3456,
"step": 1360
},
{
"epoch": 0.44,
"grad_norm": 1.039812445640564,
"learning_rate": 8.567031100732407e-05,
"loss": 0.2792,
"step": 1370
},
{
"epoch": 0.44,
"grad_norm": 5.390068531036377,
"learning_rate": 8.556416516293388e-05,
"loss": 0.398,
"step": 1380
},
{
"epoch": 0.44,
"grad_norm": 1.3645654916763306,
"learning_rate": 8.545801931854368e-05,
"loss": 0.4537,
"step": 1390
},
{
"epoch": 0.45,
"grad_norm": 2.444027900695801,
"learning_rate": 8.535187347415348e-05,
"loss": 0.218,
"step": 1400
},
{
"epoch": 0.45,
"grad_norm": 4.201082229614258,
"learning_rate": 8.52457276297633e-05,
"loss": 0.3146,
"step": 1410
},
{
"epoch": 0.45,
"grad_norm": 4.080310344696045,
"learning_rate": 8.51395817853731e-05,
"loss": 0.2769,
"step": 1420
},
{
"epoch": 0.46,
"grad_norm": 2.712216377258301,
"learning_rate": 8.503343594098292e-05,
"loss": 0.2795,
"step": 1430
},
{
"epoch": 0.46,
"grad_norm": 3.2429492473602295,
"learning_rate": 8.492729009659273e-05,
"loss": 0.2956,
"step": 1440
},
{
"epoch": 0.46,
"grad_norm": 6.107478618621826,
"learning_rate": 8.482114425220253e-05,
"loss": 0.3381,
"step": 1450
},
{
"epoch": 0.46,
"grad_norm": 0.9037106037139893,
"learning_rate": 8.471499840781235e-05,
"loss": 0.4196,
"step": 1460
},
{
"epoch": 0.47,
"grad_norm": 1.2487717866897583,
"learning_rate": 8.460885256342215e-05,
"loss": 0.2471,
"step": 1470
},
{
"epoch": 0.47,
"grad_norm": 2.8922715187072754,
"learning_rate": 8.450270671903195e-05,
"loss": 0.2664,
"step": 1480
},
{
"epoch": 0.47,
"grad_norm": 0.6493813991546631,
"learning_rate": 8.439656087464176e-05,
"loss": 0.206,
"step": 1490
},
{
"epoch": 0.48,
"grad_norm": 0.11327870935201645,
"learning_rate": 8.429041503025157e-05,
"loss": 0.2593,
"step": 1500
},
{
"epoch": 0.48,
"grad_norm": 4.4462690353393555,
"learning_rate": 8.418426918586138e-05,
"loss": 0.4474,
"step": 1510
},
{
"epoch": 0.48,
"grad_norm": 2.0405867099761963,
"learning_rate": 8.407812334147118e-05,
"loss": 0.1657,
"step": 1520
},
{
"epoch": 0.49,
"grad_norm": 0.3047516942024231,
"learning_rate": 8.397197749708099e-05,
"loss": 0.1691,
"step": 1530
},
{
"epoch": 0.49,
"grad_norm": 6.330657958984375,
"learning_rate": 8.386583165269079e-05,
"loss": 0.2041,
"step": 1540
},
{
"epoch": 0.49,
"grad_norm": 2.403702974319458,
"learning_rate": 8.375968580830062e-05,
"loss": 0.3408,
"step": 1550
},
{
"epoch": 0.5,
"grad_norm": 3.2958528995513916,
"learning_rate": 8.365353996391042e-05,
"loss": 0.3271,
"step": 1560
},
{
"epoch": 0.5,
"grad_norm": 3.2511487007141113,
"learning_rate": 8.354739411952023e-05,
"loss": 0.1719,
"step": 1570
},
{
"epoch": 0.5,
"grad_norm": 2.447939872741699,
"learning_rate": 8.344124827513003e-05,
"loss": 0.2823,
"step": 1580
},
{
"epoch": 0.51,
"grad_norm": 1.9992095232009888,
"learning_rate": 8.333510243073985e-05,
"loss": 0.2479,
"step": 1590
},
{
"epoch": 0.51,
"grad_norm": 3.8574376106262207,
"learning_rate": 8.322895658634965e-05,
"loss": 0.2539,
"step": 1600
},
{
"epoch": 0.51,
"grad_norm": 3.184896230697632,
"learning_rate": 8.312281074195946e-05,
"loss": 0.2826,
"step": 1610
},
{
"epoch": 0.52,
"grad_norm": 0.6027563810348511,
"learning_rate": 8.301666489756926e-05,
"loss": 0.1404,
"step": 1620
},
{
"epoch": 0.52,
"grad_norm": 1.0776386260986328,
"learning_rate": 8.291051905317906e-05,
"loss": 0.3887,
"step": 1630
},
{
"epoch": 0.52,
"grad_norm": 2.386305093765259,
"learning_rate": 8.280437320878888e-05,
"loss": 0.4232,
"step": 1640
},
{
"epoch": 0.53,
"grad_norm": 1.299332618713379,
"learning_rate": 8.269822736439868e-05,
"loss": 0.2855,
"step": 1650
},
{
"epoch": 0.53,
"grad_norm": 1.3506910800933838,
"learning_rate": 8.259208152000849e-05,
"loss": 0.2412,
"step": 1660
},
{
"epoch": 0.53,
"grad_norm": 2.2037456035614014,
"learning_rate": 8.24859356756183e-05,
"loss": 0.2399,
"step": 1670
},
{
"epoch": 0.53,
"grad_norm": 2.2852354049682617,
"learning_rate": 8.237978983122812e-05,
"loss": 0.202,
"step": 1680
},
{
"epoch": 0.54,
"grad_norm": 0.2693609297275543,
"learning_rate": 8.227364398683793e-05,
"loss": 0.3235,
"step": 1690
},
{
"epoch": 0.54,
"grad_norm": 3.526648998260498,
"learning_rate": 8.216749814244773e-05,
"loss": 0.3102,
"step": 1700
},
{
"epoch": 0.54,
"grad_norm": 1.9742597341537476,
"learning_rate": 8.206135229805753e-05,
"loss": 0.3293,
"step": 1710
},
{
"epoch": 0.55,
"grad_norm": 2.933436155319214,
"learning_rate": 8.195520645366734e-05,
"loss": 0.207,
"step": 1720
},
{
"epoch": 0.55,
"grad_norm": 0.5870353579521179,
"learning_rate": 8.184906060927715e-05,
"loss": 0.3731,
"step": 1730
},
{
"epoch": 0.55,
"grad_norm": 1.7825034856796265,
"learning_rate": 8.174291476488696e-05,
"loss": 0.1747,
"step": 1740
},
{
"epoch": 0.56,
"grad_norm": 4.706550598144531,
"learning_rate": 8.163676892049676e-05,
"loss": 0.2143,
"step": 1750
},
{
"epoch": 0.56,
"grad_norm": 3.326359748840332,
"learning_rate": 8.153062307610657e-05,
"loss": 0.363,
"step": 1760
},
{
"epoch": 0.56,
"grad_norm": 1.3437646627426147,
"learning_rate": 8.142447723171638e-05,
"loss": 0.2806,
"step": 1770
},
{
"epoch": 0.57,
"grad_norm": 4.6950249671936035,
"learning_rate": 8.131833138732619e-05,
"loss": 0.2547,
"step": 1780
},
{
"epoch": 0.57,
"grad_norm": 1.557305097579956,
"learning_rate": 8.1212185542936e-05,
"loss": 0.277,
"step": 1790
},
{
"epoch": 0.57,
"grad_norm": 1.5373164415359497,
"learning_rate": 8.110603969854581e-05,
"loss": 0.2878,
"step": 1800
},
{
"epoch": 0.58,
"grad_norm": 1.3761144876480103,
"learning_rate": 8.099989385415561e-05,
"loss": 0.4071,
"step": 1810
},
{
"epoch": 0.58,
"grad_norm": 0.7141520977020264,
"learning_rate": 8.089374800976543e-05,
"loss": 0.2002,
"step": 1820
},
{
"epoch": 0.58,
"grad_norm": 0.6471810340881348,
"learning_rate": 8.078760216537523e-05,
"loss": 0.1962,
"step": 1830
},
{
"epoch": 0.59,
"grad_norm": 1.8333234786987305,
"learning_rate": 8.068145632098504e-05,
"loss": 0.23,
"step": 1840
},
{
"epoch": 0.59,
"grad_norm": 0.7382714152336121,
"learning_rate": 8.057531047659484e-05,
"loss": 0.1602,
"step": 1850
},
{
"epoch": 0.59,
"grad_norm": 2.2624874114990234,
"learning_rate": 8.046916463220466e-05,
"loss": 0.3355,
"step": 1860
},
{
"epoch": 0.6,
"grad_norm": 1.3432509899139404,
"learning_rate": 8.036301878781446e-05,
"loss": 0.1226,
"step": 1870
},
{
"epoch": 0.6,
"grad_norm": 1.3153080940246582,
"learning_rate": 8.025687294342426e-05,
"loss": 0.2797,
"step": 1880
},
{
"epoch": 0.6,
"grad_norm": 0.13998636603355408,
"learning_rate": 8.015072709903407e-05,
"loss": 0.3126,
"step": 1890
},
{
"epoch": 0.6,
"grad_norm": 7.6837382316589355,
"learning_rate": 8.004458125464388e-05,
"loss": 0.348,
"step": 1900
},
{
"epoch": 0.61,
"grad_norm": 2.536726236343384,
"learning_rate": 7.993843541025369e-05,
"loss": 0.2518,
"step": 1910
},
{
"epoch": 0.61,
"grad_norm": 2.798586130142212,
"learning_rate": 7.98322895658635e-05,
"loss": 0.187,
"step": 1920
},
{
"epoch": 0.61,
"grad_norm": 2.047030210494995,
"learning_rate": 7.972614372147331e-05,
"loss": 0.1801,
"step": 1930
},
{
"epoch": 0.62,
"grad_norm": 2.5127789974212646,
"learning_rate": 7.961999787708311e-05,
"loss": 0.2613,
"step": 1940
},
{
"epoch": 0.62,
"grad_norm": 5.015801429748535,
"learning_rate": 7.951385203269293e-05,
"loss": 0.4155,
"step": 1950
},
{
"epoch": 0.62,
"grad_norm": 4.095780849456787,
"learning_rate": 7.940770618830273e-05,
"loss": 0.2413,
"step": 1960
},
{
"epoch": 0.63,
"grad_norm": 0.575307309627533,
"learning_rate": 7.930156034391254e-05,
"loss": 0.2799,
"step": 1970
},
{
"epoch": 0.63,
"grad_norm": 0.26382434368133545,
"learning_rate": 7.919541449952234e-05,
"loss": 0.1894,
"step": 1980
},
{
"epoch": 0.63,
"grad_norm": 1.7955100536346436,
"learning_rate": 7.908926865513216e-05,
"loss": 0.199,
"step": 1990
},
{
"epoch": 0.64,
"grad_norm": 0.4029354453086853,
"learning_rate": 7.898312281074196e-05,
"loss": 0.2465,
"step": 2000
},
{
"epoch": 0.64,
"grad_norm": 1.4386157989501953,
"learning_rate": 7.887697696635177e-05,
"loss": 0.2603,
"step": 2010
},
{
"epoch": 0.64,
"grad_norm": 4.048315525054932,
"learning_rate": 7.877083112196157e-05,
"loss": 0.3663,
"step": 2020
},
{
"epoch": 0.65,
"grad_norm": 4.0357255935668945,
"learning_rate": 7.866468527757139e-05,
"loss": 0.2365,
"step": 2030
},
{
"epoch": 0.65,
"grad_norm": 0.6603661775588989,
"learning_rate": 7.85585394331812e-05,
"loss": 0.2848,
"step": 2040
},
{
"epoch": 0.65,
"grad_norm": 2.005911111831665,
"learning_rate": 7.845239358879101e-05,
"loss": 0.316,
"step": 2050
},
{
"epoch": 0.66,
"grad_norm": 1.5447591543197632,
"learning_rate": 7.834624774440081e-05,
"loss": 0.2741,
"step": 2060
},
{
"epoch": 0.66,
"grad_norm": 3.2413675785064697,
"learning_rate": 7.824010190001062e-05,
"loss": 0.4234,
"step": 2070
},
{
"epoch": 0.66,
"grad_norm": 2.6230356693267822,
"learning_rate": 7.813395605562043e-05,
"loss": 0.1797,
"step": 2080
},
{
"epoch": 0.67,
"grad_norm": 1.5376132726669312,
"learning_rate": 7.802781021123024e-05,
"loss": 0.3815,
"step": 2090
},
{
"epoch": 0.67,
"grad_norm": 1.4491734504699707,
"learning_rate": 7.792166436684004e-05,
"loss": 0.3153,
"step": 2100
},
{
"epoch": 0.67,
"grad_norm": 1.949112057685852,
"learning_rate": 7.781551852244984e-05,
"loss": 0.2751,
"step": 2110
},
{
"epoch": 0.67,
"grad_norm": 0.3488381803035736,
"learning_rate": 7.770937267805966e-05,
"loss": 0.3558,
"step": 2120
},
{
"epoch": 0.68,
"grad_norm": 1.4437161684036255,
"learning_rate": 7.760322683366946e-05,
"loss": 0.2827,
"step": 2130
},
{
"epoch": 0.68,
"grad_norm": 1.1105573177337646,
"learning_rate": 7.749708098927927e-05,
"loss": 0.1867,
"step": 2140
},
{
"epoch": 0.68,
"grad_norm": 2.1235313415527344,
"learning_rate": 7.739093514488907e-05,
"loss": 0.1689,
"step": 2150
},
{
"epoch": 0.69,
"grad_norm": 1.60935378074646,
"learning_rate": 7.728478930049889e-05,
"loss": 0.3198,
"step": 2160
},
{
"epoch": 0.69,
"grad_norm": 1.3222334384918213,
"learning_rate": 7.71786434561087e-05,
"loss": 0.1978,
"step": 2170
},
{
"epoch": 0.69,
"grad_norm": 1.4521784782409668,
"learning_rate": 7.707249761171851e-05,
"loss": 0.3276,
"step": 2180
},
{
"epoch": 0.7,
"grad_norm": 0.4480780363082886,
"learning_rate": 7.696635176732831e-05,
"loss": 0.2151,
"step": 2190
},
{
"epoch": 0.7,
"grad_norm": 1.5750231742858887,
"learning_rate": 7.686020592293812e-05,
"loss": 0.1659,
"step": 2200
},
{
"epoch": 0.7,
"grad_norm": 2.5736334323883057,
"learning_rate": 7.675406007854793e-05,
"loss": 0.3704,
"step": 2210
},
{
"epoch": 0.71,
"grad_norm": 3.719284772872925,
"learning_rate": 7.664791423415774e-05,
"loss": 0.1645,
"step": 2220
},
{
"epoch": 0.71,
"grad_norm": 3.429244041442871,
"learning_rate": 7.654176838976754e-05,
"loss": 0.3323,
"step": 2230
},
{
"epoch": 0.71,
"grad_norm": 2.801398277282715,
"learning_rate": 7.643562254537735e-05,
"loss": 0.2805,
"step": 2240
},
{
"epoch": 0.72,
"grad_norm": 2.050607204437256,
"learning_rate": 7.632947670098716e-05,
"loss": 0.2308,
"step": 2250
},
{
"epoch": 0.72,
"grad_norm": 3.164123773574829,
"learning_rate": 7.622333085659697e-05,
"loss": 0.2401,
"step": 2260
},
{
"epoch": 0.72,
"grad_norm": 3.276832342147827,
"learning_rate": 7.611718501220677e-05,
"loss": 0.2399,
"step": 2270
},
{
"epoch": 0.73,
"grad_norm": 2.8366944789886475,
"learning_rate": 7.601103916781659e-05,
"loss": 0.4004,
"step": 2280
},
{
"epoch": 0.73,
"grad_norm": 2.4258265495300293,
"learning_rate": 7.590489332342639e-05,
"loss": 0.3202,
"step": 2290
},
{
"epoch": 0.73,
"grad_norm": 1.4008164405822754,
"learning_rate": 7.579874747903621e-05,
"loss": 0.1952,
"step": 2300
},
{
"epoch": 0.74,
"grad_norm": 1.1098754405975342,
"learning_rate": 7.569260163464601e-05,
"loss": 0.1867,
"step": 2310
},
{
"epoch": 0.74,
"grad_norm": 0.15033583343029022,
"learning_rate": 7.558645579025582e-05,
"loss": 0.1995,
"step": 2320
},
{
"epoch": 0.74,
"grad_norm": 0.9557719230651855,
"learning_rate": 7.548030994586562e-05,
"loss": 0.2475,
"step": 2330
},
{
"epoch": 0.74,
"grad_norm": 8.91406536102295,
"learning_rate": 7.537416410147544e-05,
"loss": 0.2756,
"step": 2340
},
{
"epoch": 0.75,
"grad_norm": 1.9521056413650513,
"learning_rate": 7.526801825708524e-05,
"loss": 0.2595,
"step": 2350
},
{
"epoch": 0.75,
"grad_norm": 3.3855483531951904,
"learning_rate": 7.516187241269504e-05,
"loss": 0.2948,
"step": 2360
},
{
"epoch": 0.75,
"grad_norm": 1.6990065574645996,
"learning_rate": 7.506634115274387e-05,
"loss": 0.2755,
"step": 2370
},
{
"epoch": 0.76,
"grad_norm": 2.098942518234253,
"learning_rate": 7.496019530835369e-05,
"loss": 0.175,
"step": 2380
},
{
"epoch": 0.76,
"grad_norm": 0.9781967997550964,
"learning_rate": 7.48540494639635e-05,
"loss": 0.4592,
"step": 2390
},
{
"epoch": 0.76,
"grad_norm": 0.4728473722934723,
"learning_rate": 7.47479036195733e-05,
"loss": 0.3847,
"step": 2400
},
{
"epoch": 0.77,
"grad_norm": 3.3047373294830322,
"learning_rate": 7.46417577751831e-05,
"loss": 0.1848,
"step": 2410
},
{
"epoch": 0.77,
"grad_norm": 2.424025535583496,
"learning_rate": 7.453561193079292e-05,
"loss": 0.2197,
"step": 2420
},
{
"epoch": 0.77,
"grad_norm": 2.697960376739502,
"learning_rate": 7.442946608640272e-05,
"loss": 0.2314,
"step": 2430
},
{
"epoch": 0.78,
"grad_norm": 0.496898353099823,
"learning_rate": 7.432332024201253e-05,
"loss": 0.3299,
"step": 2440
},
{
"epoch": 0.78,
"grad_norm": 1.4845099449157715,
"learning_rate": 7.421717439762233e-05,
"loss": 0.2832,
"step": 2450
},
{
"epoch": 0.78,
"grad_norm": 3.8896942138671875,
"learning_rate": 7.411102855323215e-05,
"loss": 0.2837,
"step": 2460
},
{
"epoch": 0.79,
"grad_norm": 4.288979530334473,
"learning_rate": 7.400488270884195e-05,
"loss": 0.1653,
"step": 2470
},
{
"epoch": 0.79,
"grad_norm": 3.0013909339904785,
"learning_rate": 7.389873686445176e-05,
"loss": 0.3207,
"step": 2480
},
{
"epoch": 0.79,
"grad_norm": 0.38008421659469604,
"learning_rate": 7.379259102006156e-05,
"loss": 0.2916,
"step": 2490
},
{
"epoch": 0.8,
"grad_norm": 3.843106985092163,
"learning_rate": 7.368644517567138e-05,
"loss": 0.4216,
"step": 2500
},
{
"epoch": 0.8,
"grad_norm": 0.46844518184661865,
"learning_rate": 7.35802993312812e-05,
"loss": 0.3038,
"step": 2510
},
{
"epoch": 0.8,
"grad_norm": 0.5063233375549316,
"learning_rate": 7.3474153486891e-05,
"loss": 0.2392,
"step": 2520
},
{
"epoch": 0.81,
"grad_norm": 6.260082721710205,
"learning_rate": 7.33680076425008e-05,
"loss": 0.317,
"step": 2530
},
{
"epoch": 0.81,
"grad_norm": 1.771292805671692,
"learning_rate": 7.32618617981106e-05,
"loss": 0.2229,
"step": 2540
},
{
"epoch": 0.81,
"grad_norm": 5.619741439819336,
"learning_rate": 7.315571595372042e-05,
"loss": 0.1364,
"step": 2550
},
{
"epoch": 0.81,
"grad_norm": 2.196967363357544,
"learning_rate": 7.304957010933023e-05,
"loss": 0.2732,
"step": 2560
},
{
"epoch": 0.82,
"grad_norm": 0.6409101486206055,
"learning_rate": 7.294342426494003e-05,
"loss": 0.2754,
"step": 2570
},
{
"epoch": 0.82,
"grad_norm": 1.4790414571762085,
"learning_rate": 7.283727842054983e-05,
"loss": 0.2017,
"step": 2580
},
{
"epoch": 0.82,
"grad_norm": 2.013932943344116,
"learning_rate": 7.273113257615965e-05,
"loss": 0.24,
"step": 2590
},
{
"epoch": 0.83,
"grad_norm": 3.7832634449005127,
"learning_rate": 7.262498673176945e-05,
"loss": 0.3675,
"step": 2600
},
{
"epoch": 0.83,
"grad_norm": 0.3102867007255554,
"learning_rate": 7.251884088737926e-05,
"loss": 0.379,
"step": 2610
},
{
"epoch": 0.83,
"grad_norm": 2.4098093509674072,
"learning_rate": 7.241269504298906e-05,
"loss": 0.381,
"step": 2620
},
{
"epoch": 0.84,
"grad_norm": 2.3519186973571777,
"learning_rate": 7.230654919859888e-05,
"loss": 0.2574,
"step": 2630
},
{
"epoch": 0.84,
"grad_norm": 1.1589571237564087,
"learning_rate": 7.22004033542087e-05,
"loss": 0.1603,
"step": 2640
},
{
"epoch": 0.84,
"grad_norm": 3.823918342590332,
"learning_rate": 7.20942575098185e-05,
"loss": 0.2485,
"step": 2650
},
{
"epoch": 0.85,
"grad_norm": 1.778441071510315,
"learning_rate": 7.19881116654283e-05,
"loss": 0.234,
"step": 2660
},
{
"epoch": 0.85,
"grad_norm": 2.2710683345794678,
"learning_rate": 7.188196582103811e-05,
"loss": 0.1746,
"step": 2670
},
{
"epoch": 0.85,
"grad_norm": 6.078259468078613,
"learning_rate": 7.177581997664792e-05,
"loss": 0.3255,
"step": 2680
},
{
"epoch": 0.86,
"grad_norm": 0.585472583770752,
"learning_rate": 7.166967413225773e-05,
"loss": 0.3718,
"step": 2690
},
{
"epoch": 0.86,
"grad_norm": 1.9394687414169312,
"learning_rate": 7.156352828786753e-05,
"loss": 0.3181,
"step": 2700
},
{
"epoch": 0.86,
"grad_norm": 1.6753870248794556,
"learning_rate": 7.145738244347734e-05,
"loss": 0.2424,
"step": 2710
},
{
"epoch": 0.87,
"grad_norm": 0.37682977318763733,
"learning_rate": 7.135123659908714e-05,
"loss": 0.2963,
"step": 2720
},
{
"epoch": 0.87,
"grad_norm": 3.564805507659912,
"learning_rate": 7.124509075469696e-05,
"loss": 0.2822,
"step": 2730
},
{
"epoch": 0.87,
"grad_norm": 0.22953364253044128,
"learning_rate": 7.113894491030676e-05,
"loss": 0.3489,
"step": 2740
},
{
"epoch": 0.88,
"grad_norm": 4.16074275970459,
"learning_rate": 7.103279906591658e-05,
"loss": 0.405,
"step": 2750
},
{
"epoch": 0.88,
"grad_norm": 1.4540446996688843,
"learning_rate": 7.092665322152638e-05,
"loss": 0.2634,
"step": 2760
},
{
"epoch": 0.88,
"grad_norm": 1.9992202520370483,
"learning_rate": 7.082050737713618e-05,
"loss": 0.2762,
"step": 2770
},
{
"epoch": 0.88,
"grad_norm": 1.3939869403839111,
"learning_rate": 7.0714361532746e-05,
"loss": 0.3462,
"step": 2780
},
{
"epoch": 0.89,
"grad_norm": 0.6099751591682434,
"learning_rate": 7.06082156883558e-05,
"loss": 0.367,
"step": 2790
},
{
"epoch": 0.89,
"grad_norm": 6.303842067718506,
"learning_rate": 7.050206984396561e-05,
"loss": 0.2596,
"step": 2800
},
{
"epoch": 0.89,
"grad_norm": 1.5723298788070679,
"learning_rate": 7.039592399957541e-05,
"loss": 0.3136,
"step": 2810
},
{
"epoch": 0.9,
"grad_norm": 1.3614245653152466,
"learning_rate": 7.028977815518523e-05,
"loss": 0.2983,
"step": 2820
},
{
"epoch": 0.9,
"grad_norm": 2.220656633377075,
"learning_rate": 7.018363231079503e-05,
"loss": 0.3549,
"step": 2830
},
{
"epoch": 0.9,
"grad_norm": 2.8158984184265137,
"learning_rate": 7.007748646640484e-05,
"loss": 0.2431,
"step": 2840
},
{
"epoch": 0.91,
"grad_norm": 0.46454083919525146,
"learning_rate": 6.997134062201464e-05,
"loss": 0.204,
"step": 2850
},
{
"epoch": 0.91,
"grad_norm": 2.5426604747772217,
"learning_rate": 6.986519477762446e-05,
"loss": 0.1241,
"step": 2860
},
{
"epoch": 0.91,
"grad_norm": 2.6442790031433105,
"learning_rate": 6.975904893323428e-05,
"loss": 0.2026,
"step": 2870
},
{
"epoch": 0.92,
"grad_norm": 0.07216634601354599,
"learning_rate": 6.965290308884408e-05,
"loss": 0.1619,
"step": 2880
},
{
"epoch": 0.92,
"grad_norm": 1.6410995721817017,
"learning_rate": 6.954675724445388e-05,
"loss": 0.309,
"step": 2890
},
{
"epoch": 0.92,
"grad_norm": 1.0634126663208008,
"learning_rate": 6.944061140006369e-05,
"loss": 0.2269,
"step": 2900
},
{
"epoch": 0.93,
"grad_norm": 1.272518277168274,
"learning_rate": 6.93344655556735e-05,
"loss": 0.2748,
"step": 2910
},
{
"epoch": 0.93,
"grad_norm": 8.030739784240723,
"learning_rate": 6.922831971128331e-05,
"loss": 0.2386,
"step": 2920
},
{
"epoch": 0.93,
"grad_norm": 1.0459538698196411,
"learning_rate": 6.912217386689311e-05,
"loss": 0.2162,
"step": 2930
},
{
"epoch": 0.94,
"grad_norm": 2.7766873836517334,
"learning_rate": 6.901602802250292e-05,
"loss": 0.18,
"step": 2940
},
{
"epoch": 0.94,
"grad_norm": 1.345751166343689,
"learning_rate": 6.890988217811273e-05,
"loss": 0.1927,
"step": 2950
},
{
"epoch": 0.94,
"grad_norm": 3.475550889968872,
"learning_rate": 6.880373633372254e-05,
"loss": 0.1593,
"step": 2960
},
{
"epoch": 0.95,
"grad_norm": 4.3208088874816895,
"learning_rate": 6.869759048933234e-05,
"loss": 0.3782,
"step": 2970
},
{
"epoch": 0.95,
"grad_norm": 0.5283639430999756,
"learning_rate": 6.859144464494214e-05,
"loss": 0.2065,
"step": 2980
},
{
"epoch": 0.95,
"grad_norm": 0.3912002444267273,
"learning_rate": 6.848529880055196e-05,
"loss": 0.2094,
"step": 2990
},
{
"epoch": 0.95,
"grad_norm": 5.560369968414307,
"learning_rate": 6.837915295616178e-05,
"loss": 0.2598,
"step": 3000
},
{
"epoch": 0.96,
"grad_norm": 2.0859804153442383,
"learning_rate": 6.827300711177158e-05,
"loss": 0.2396,
"step": 3010
},
{
"epoch": 0.96,
"grad_norm": 1.9198240041732788,
"learning_rate": 6.816686126738139e-05,
"loss": 0.326,
"step": 3020
},
{
"epoch": 0.96,
"grad_norm": 2.559525728225708,
"learning_rate": 6.806071542299119e-05,
"loss": 0.2846,
"step": 3030
},
{
"epoch": 0.97,
"grad_norm": 8.122730255126953,
"learning_rate": 6.7954569578601e-05,
"loss": 0.3404,
"step": 3040
},
{
"epoch": 0.97,
"grad_norm": 1.4377597570419312,
"learning_rate": 6.784842373421081e-05,
"loss": 0.3534,
"step": 3050
},
{
"epoch": 0.97,
"grad_norm": 1.3202710151672363,
"learning_rate": 6.774227788982061e-05,
"loss": 0.3151,
"step": 3060
},
{
"epoch": 0.98,
"grad_norm": 1.2933627367019653,
"learning_rate": 6.763613204543042e-05,
"loss": 0.1983,
"step": 3070
},
{
"epoch": 0.98,
"grad_norm": 0.8253432512283325,
"learning_rate": 6.752998620104023e-05,
"loss": 0.1989,
"step": 3080
},
{
"epoch": 0.98,
"grad_norm": 1.008435606956482,
"learning_rate": 6.742384035665004e-05,
"loss": 0.2045,
"step": 3090
},
{
"epoch": 0.99,
"grad_norm": 4.022599220275879,
"learning_rate": 6.731769451225984e-05,
"loss": 0.2166,
"step": 3100
},
{
"epoch": 0.99,
"grad_norm": 0.5018757581710815,
"learning_rate": 6.721154866786966e-05,
"loss": 0.1841,
"step": 3110
},
{
"epoch": 0.99,
"grad_norm": 1.1110012531280518,
"learning_rate": 6.710540282347946e-05,
"loss": 0.208,
"step": 3120
},
{
"epoch": 1.0,
"grad_norm": 4.160871505737305,
"learning_rate": 6.699925697908928e-05,
"loss": 0.2853,
"step": 3130
},
{
"epoch": 1.0,
"grad_norm": 3.1839327812194824,
"learning_rate": 6.689311113469908e-05,
"loss": 0.239,
"step": 3140
},
{
"epoch": 1.0,
"grad_norm": 1.2867355346679688,
"learning_rate": 6.678696529030889e-05,
"loss": 0.1678,
"step": 3150
},
{
"epoch": 1.01,
"grad_norm": 0.3853776454925537,
"learning_rate": 6.668081944591869e-05,
"loss": 0.1119,
"step": 3160
},
{
"epoch": 1.01,
"grad_norm": 0.9403756856918335,
"learning_rate": 6.657467360152851e-05,
"loss": 0.1772,
"step": 3170
},
{
"epoch": 1.01,
"grad_norm": 2.8056976795196533,
"learning_rate": 6.646852775713831e-05,
"loss": 0.1438,
"step": 3180
},
{
"epoch": 1.02,
"grad_norm": 0.9233602285385132,
"learning_rate": 6.636238191274812e-05,
"loss": 0.2491,
"step": 3190
},
{
"epoch": 1.02,
"grad_norm": 2.179743766784668,
"learning_rate": 6.625623606835792e-05,
"loss": 0.1493,
"step": 3200
},
{
"epoch": 1.02,
"grad_norm": 1.8002713918685913,
"learning_rate": 6.615009022396774e-05,
"loss": 0.1557,
"step": 3210
},
{
"epoch": 1.02,
"grad_norm": 1.0567578077316284,
"learning_rate": 6.604394437957754e-05,
"loss": 0.1573,
"step": 3220
},
{
"epoch": 1.03,
"grad_norm": 1.7498853206634521,
"learning_rate": 6.593779853518734e-05,
"loss": 0.2639,
"step": 3230
},
{
"epoch": 1.03,
"grad_norm": 0.14960238337516785,
"learning_rate": 6.583165269079716e-05,
"loss": 0.2314,
"step": 3240
},
{
"epoch": 1.03,
"grad_norm": 0.858378529548645,
"learning_rate": 6.572550684640697e-05,
"loss": 0.1898,
"step": 3250
},
{
"epoch": 1.04,
"grad_norm": 4.104907989501953,
"learning_rate": 6.561936100201678e-05,
"loss": 0.2381,
"step": 3260
},
{
"epoch": 1.04,
"grad_norm": 0.1154847964644432,
"learning_rate": 6.551321515762659e-05,
"loss": 0.0987,
"step": 3270
},
{
"epoch": 1.04,
"grad_norm": 1.8907705545425415,
"learning_rate": 6.540706931323639e-05,
"loss": 0.125,
"step": 3280
},
{
"epoch": 1.05,
"grad_norm": 1.2750372886657715,
"learning_rate": 6.53009234688462e-05,
"loss": 0.234,
"step": 3290
},
{
"epoch": 1.05,
"grad_norm": 1.584429144859314,
"learning_rate": 6.519477762445601e-05,
"loss": 0.1328,
"step": 3300
},
{
"epoch": 1.05,
"grad_norm": 2.3900089263916016,
"learning_rate": 6.508863178006581e-05,
"loss": 0.2681,
"step": 3310
},
{
"epoch": 1.06,
"grad_norm": 1.9859068393707275,
"learning_rate": 6.498248593567562e-05,
"loss": 0.4136,
"step": 3320
},
{
"epoch": 1.06,
"grad_norm": 3.4652695655822754,
"learning_rate": 6.487634009128542e-05,
"loss": 0.2059,
"step": 3330
},
{
"epoch": 1.06,
"grad_norm": 4.06072473526001,
"learning_rate": 6.477019424689524e-05,
"loss": 0.2378,
"step": 3340
},
{
"epoch": 1.07,
"grad_norm": 1.2823538780212402,
"learning_rate": 6.466404840250504e-05,
"loss": 0.1772,
"step": 3350
},
{
"epoch": 1.07,
"grad_norm": 0.545313835144043,
"learning_rate": 6.455790255811486e-05,
"loss": 0.1587,
"step": 3360
},
{
"epoch": 1.07,
"grad_norm": 5.666371822357178,
"learning_rate": 6.445175671372466e-05,
"loss": 0.1486,
"step": 3370
},
{
"epoch": 1.08,
"grad_norm": 0.3175773620605469,
"learning_rate": 6.434561086933447e-05,
"loss": 0.2295,
"step": 3380
},
{
"epoch": 1.08,
"grad_norm": 3.88968563079834,
"learning_rate": 6.423946502494428e-05,
"loss": 0.16,
"step": 3390
},
{
"epoch": 1.08,
"grad_norm": 2.4445409774780273,
"learning_rate": 6.413331918055409e-05,
"loss": 0.1766,
"step": 3400
},
{
"epoch": 1.09,
"grad_norm": 0.5478050708770752,
"learning_rate": 6.402717333616389e-05,
"loss": 0.1299,
"step": 3410
},
{
"epoch": 1.09,
"grad_norm": 4.029285907745361,
"learning_rate": 6.393164207621272e-05,
"loss": 0.3463,
"step": 3420
},
{
"epoch": 1.09,
"grad_norm": 0.3899819552898407,
"learning_rate": 6.382549623182253e-05,
"loss": 0.1214,
"step": 3430
},
{
"epoch": 1.1,
"grad_norm": 0.7180734276771545,
"learning_rate": 6.371935038743233e-05,
"loss": 0.2756,
"step": 3440
},
{
"epoch": 1.1,
"grad_norm": 3.6423099040985107,
"learning_rate": 6.361320454304213e-05,
"loss": 0.2059,
"step": 3450
},
{
"epoch": 1.1,
"grad_norm": 3.006516933441162,
"learning_rate": 6.350705869865195e-05,
"loss": 0.2151,
"step": 3460
},
{
"epoch": 1.1,
"grad_norm": 2.1426503658294678,
"learning_rate": 6.340091285426177e-05,
"loss": 0.2644,
"step": 3470
},
{
"epoch": 1.11,
"grad_norm": 1.4418883323669434,
"learning_rate": 6.329476700987157e-05,
"loss": 0.1675,
"step": 3480
},
{
"epoch": 1.11,
"grad_norm": 1.2576738595962524,
"learning_rate": 6.318862116548138e-05,
"loss": 0.1612,
"step": 3490
},
{
"epoch": 1.11,
"grad_norm": 3.26369309425354,
"learning_rate": 6.308247532109118e-05,
"loss": 0.2346,
"step": 3500
},
{
"epoch": 1.12,
"grad_norm": 0.9214788675308228,
"learning_rate": 6.2976329476701e-05,
"loss": 0.1714,
"step": 3510
},
{
"epoch": 1.12,
"grad_norm": 1.696925163269043,
"learning_rate": 6.28701836323108e-05,
"loss": 0.1306,
"step": 3520
},
{
"epoch": 1.12,
"grad_norm": 1.1808693408966064,
"learning_rate": 6.27640377879206e-05,
"loss": 0.1135,
"step": 3530
},
{
"epoch": 1.13,
"grad_norm": 4.710297107696533,
"learning_rate": 6.265789194353041e-05,
"loss": 0.158,
"step": 3540
},
{
"epoch": 1.13,
"grad_norm": 0.5521005988121033,
"learning_rate": 6.255174609914022e-05,
"loss": 0.3224,
"step": 3550
},
{
"epoch": 1.13,
"grad_norm": 2.172825336456299,
"learning_rate": 6.244560025475003e-05,
"loss": 0.0946,
"step": 3560
},
{
"epoch": 1.14,
"grad_norm": 1.8690552711486816,
"learning_rate": 6.233945441035983e-05,
"loss": 0.1972,
"step": 3570
},
{
"epoch": 1.14,
"grad_norm": 0.059970393776893616,
"learning_rate": 6.223330856596965e-05,
"loss": 0.0601,
"step": 3580
},
{
"epoch": 1.14,
"grad_norm": 0.0773802176117897,
"learning_rate": 6.212716272157945e-05,
"loss": 0.2881,
"step": 3590
},
{
"epoch": 1.15,
"grad_norm": 1.320061206817627,
"learning_rate": 6.202101687718927e-05,
"loss": 0.1966,
"step": 3600
},
{
"epoch": 1.15,
"grad_norm": 2.4339261054992676,
"learning_rate": 6.191487103279907e-05,
"loss": 0.1808,
"step": 3610
},
{
"epoch": 1.15,
"grad_norm": 5.3104729652404785,
"learning_rate": 6.180872518840888e-05,
"loss": 0.1737,
"step": 3620
},
{
"epoch": 1.16,
"grad_norm": 3.9139719009399414,
"learning_rate": 6.170257934401868e-05,
"loss": 0.239,
"step": 3630
},
{
"epoch": 1.16,
"grad_norm": 0.9480198621749878,
"learning_rate": 6.15964334996285e-05,
"loss": 0.1556,
"step": 3640
},
{
"epoch": 1.16,
"grad_norm": 0.807107150554657,
"learning_rate": 6.14902876552383e-05,
"loss": 0.131,
"step": 3650
},
{
"epoch": 1.17,
"grad_norm": 0.059983473271131516,
"learning_rate": 6.13841418108481e-05,
"loss": 0.1479,
"step": 3660
},
{
"epoch": 1.17,
"grad_norm": 0.7000637650489807,
"learning_rate": 6.127799596645791e-05,
"loss": 0.0861,
"step": 3670
},
{
"epoch": 1.17,
"grad_norm": 0.43273600935935974,
"learning_rate": 6.117185012206771e-05,
"loss": 0.1848,
"step": 3680
},
{
"epoch": 1.17,
"grad_norm": 0.056298673152923584,
"learning_rate": 6.106570427767753e-05,
"loss": 0.1313,
"step": 3690
},
{
"epoch": 1.18,
"grad_norm": 0.6714267134666443,
"learning_rate": 6.095955843328735e-05,
"loss": 0.2817,
"step": 3700
},
{
"epoch": 1.18,
"grad_norm": 2.8052423000335693,
"learning_rate": 6.085341258889715e-05,
"loss": 0.2095,
"step": 3710
},
{
"epoch": 1.18,
"grad_norm": 3.0490353107452393,
"learning_rate": 6.074726674450696e-05,
"loss": 0.2707,
"step": 3720
},
{
"epoch": 1.19,
"grad_norm": 2.3823633193969727,
"learning_rate": 6.0641120900116766e-05,
"loss": 0.1918,
"step": 3730
},
{
"epoch": 1.19,
"grad_norm": 5.9893293380737305,
"learning_rate": 6.0534975055726576e-05,
"loss": 0.1855,
"step": 3740
},
{
"epoch": 1.19,
"grad_norm": 5.253934383392334,
"learning_rate": 6.042882921133638e-05,
"loss": 0.1286,
"step": 3750
},
{
"epoch": 1.2,
"grad_norm": 3.3353893756866455,
"learning_rate": 6.0322683366946183e-05,
"loss": 0.1656,
"step": 3760
},
{
"epoch": 1.2,
"grad_norm": 1.5391966104507446,
"learning_rate": 6.0216537522555994e-05,
"loss": 0.1783,
"step": 3770
},
{
"epoch": 1.2,
"grad_norm": 3.3716678619384766,
"learning_rate": 6.01103916781658e-05,
"loss": 0.1025,
"step": 3780
},
{
"epoch": 1.21,
"grad_norm": 0.8058392405509949,
"learning_rate": 6.000424583377561e-05,
"loss": 0.1224,
"step": 3790
},
{
"epoch": 1.21,
"grad_norm": 1.5231162309646606,
"learning_rate": 5.989809998938541e-05,
"loss": 0.0579,
"step": 3800
},
{
"epoch": 1.21,
"grad_norm": 3.7527573108673096,
"learning_rate": 5.979195414499522e-05,
"loss": 0.3109,
"step": 3810
},
{
"epoch": 1.22,
"grad_norm": 1.884722113609314,
"learning_rate": 5.968580830060504e-05,
"loss": 0.2569,
"step": 3820
},
{
"epoch": 1.22,
"grad_norm": 1.2949138879776,
"learning_rate": 5.957966245621484e-05,
"loss": 0.2067,
"step": 3830
},
{
"epoch": 1.22,
"grad_norm": 1.9406439065933228,
"learning_rate": 5.9473516611824654e-05,
"loss": 0.1397,
"step": 3840
},
{
"epoch": 1.23,
"grad_norm": 3.048089027404785,
"learning_rate": 5.936737076743446e-05,
"loss": 0.1903,
"step": 3850
},
{
"epoch": 1.23,
"grad_norm": 2.7827141284942627,
"learning_rate": 5.926122492304427e-05,
"loss": 0.2375,
"step": 3860
},
{
"epoch": 1.23,
"grad_norm": 0.30664700269699097,
"learning_rate": 5.915507907865407e-05,
"loss": 0.2605,
"step": 3870
},
{
"epoch": 1.24,
"grad_norm": 5.038077354431152,
"learning_rate": 5.904893323426388e-05,
"loss": 0.2249,
"step": 3880
},
{
"epoch": 1.24,
"grad_norm": 0.5563170313835144,
"learning_rate": 5.8942787389873686e-05,
"loss": 0.1407,
"step": 3890
},
{
"epoch": 1.24,
"grad_norm": 3.5176491737365723,
"learning_rate": 5.8836641545483496e-05,
"loss": 0.1955,
"step": 3900
},
{
"epoch": 1.24,
"grad_norm": 0.16444259881973267,
"learning_rate": 5.87304957010933e-05,
"loss": 0.2973,
"step": 3910
},
{
"epoch": 1.25,
"grad_norm": 2.3163607120513916,
"learning_rate": 5.862434985670311e-05,
"loss": 0.1388,
"step": 3920
},
{
"epoch": 1.25,
"grad_norm": 2.4921140670776367,
"learning_rate": 5.8518204012312914e-05,
"loss": 0.2844,
"step": 3930
},
{
"epoch": 1.25,
"grad_norm": 6.664550304412842,
"learning_rate": 5.841205816792273e-05,
"loss": 0.5434,
"step": 3940
},
{
"epoch": 1.26,
"grad_norm": 0.27615758776664734,
"learning_rate": 5.830591232353254e-05,
"loss": 0.2716,
"step": 3950
},
{
"epoch": 1.26,
"grad_norm": 7.205143451690674,
"learning_rate": 5.8199766479142345e-05,
"loss": 0.1927,
"step": 3960
},
{
"epoch": 1.26,
"grad_norm": 2.423842191696167,
"learning_rate": 5.8093620634752156e-05,
"loss": 0.2013,
"step": 3970
},
{
"epoch": 1.27,
"grad_norm": 0.6563037037849426,
"learning_rate": 5.798747479036196e-05,
"loss": 0.2597,
"step": 3980
},
{
"epoch": 1.27,
"grad_norm": 2.216214418411255,
"learning_rate": 5.788132894597177e-05,
"loss": 0.1484,
"step": 3990
},
{
"epoch": 1.27,
"grad_norm": 0.21049724519252777,
"learning_rate": 5.7775183101581574e-05,
"loss": 0.1205,
"step": 4000
},
{
"epoch": 1.28,
"grad_norm": 1.838711142539978,
"learning_rate": 5.7669037257191384e-05,
"loss": 0.1806,
"step": 4010
},
{
"epoch": 1.28,
"grad_norm": 4.584275245666504,
"learning_rate": 5.756289141280119e-05,
"loss": 0.1459,
"step": 4020
},
{
"epoch": 1.28,
"grad_norm": 3.7076704502105713,
"learning_rate": 5.7456745568411e-05,
"loss": 0.2119,
"step": 4030
},
{
"epoch": 1.29,
"grad_norm": 4.600487232208252,
"learning_rate": 5.73505997240208e-05,
"loss": 0.1846,
"step": 4040
},
{
"epoch": 1.29,
"grad_norm": 2.9479613304138184,
"learning_rate": 5.724445387963061e-05,
"loss": 0.1373,
"step": 4050
},
{
"epoch": 1.29,
"grad_norm": 2.7824301719665527,
"learning_rate": 5.7138308035240416e-05,
"loss": 0.1573,
"step": 4060
},
{
"epoch": 1.3,
"grad_norm": 1.3697668313980103,
"learning_rate": 5.703216219085023e-05,
"loss": 0.1067,
"step": 4070
},
{
"epoch": 1.3,
"grad_norm": 4.134962558746338,
"learning_rate": 5.6926016346460044e-05,
"loss": 0.3154,
"step": 4080
},
{
"epoch": 1.3,
"grad_norm": 1.986623764038086,
"learning_rate": 5.681987050206985e-05,
"loss": 0.162,
"step": 4090
},
{
"epoch": 1.31,
"grad_norm": 1.7553232908248901,
"learning_rate": 5.671372465767966e-05,
"loss": 0.2197,
"step": 4100
},
{
"epoch": 1.31,
"grad_norm": 1.666942834854126,
"learning_rate": 5.660757881328946e-05,
"loss": 0.2144,
"step": 4110
},
{
"epoch": 1.31,
"grad_norm": 1.3620635271072388,
"learning_rate": 5.650143296889927e-05,
"loss": 0.2823,
"step": 4120
},
{
"epoch": 1.31,
"grad_norm": 3.4056193828582764,
"learning_rate": 5.6395287124509076e-05,
"loss": 0.3223,
"step": 4130
},
{
"epoch": 1.32,
"grad_norm": 0.8397992253303528,
"learning_rate": 5.6289141280118886e-05,
"loss": 0.1297,
"step": 4140
},
{
"epoch": 1.32,
"grad_norm": 0.09627294540405273,
"learning_rate": 5.618299543572869e-05,
"loss": 0.1154,
"step": 4150
},
{
"epoch": 1.32,
"grad_norm": 2.1529462337493896,
"learning_rate": 5.60768495913385e-05,
"loss": 0.1903,
"step": 4160
},
{
"epoch": 1.33,
"grad_norm": 0.42282378673553467,
"learning_rate": 5.5970703746948304e-05,
"loss": 0.0992,
"step": 4170
},
{
"epoch": 1.33,
"grad_norm": 0.34097906947135925,
"learning_rate": 5.5864557902558115e-05,
"loss": 0.2193,
"step": 4180
},
{
"epoch": 1.33,
"grad_norm": 0.11647669225931168,
"learning_rate": 5.575841205816793e-05,
"loss": 0.1511,
"step": 4190
},
{
"epoch": 1.34,
"grad_norm": 7.489476680755615,
"learning_rate": 5.5652266213777736e-05,
"loss": 0.182,
"step": 4200
},
{
"epoch": 1.34,
"grad_norm": 0.0627538189291954,
"learning_rate": 5.5546120369387546e-05,
"loss": 0.2056,
"step": 4210
},
{
"epoch": 1.34,
"grad_norm": 1.6038990020751953,
"learning_rate": 5.543997452499735e-05,
"loss": 0.317,
"step": 4220
},
{
"epoch": 1.35,
"grad_norm": 2.0296130180358887,
"learning_rate": 5.533382868060716e-05,
"loss": 0.221,
"step": 4230
},
{
"epoch": 1.35,
"grad_norm": 3.08427357673645,
"learning_rate": 5.5227682836216964e-05,
"loss": 0.309,
"step": 4240
},
{
"epoch": 1.35,
"grad_norm": 6.700926303863525,
"learning_rate": 5.5121536991826774e-05,
"loss": 0.3862,
"step": 4250
},
{
"epoch": 1.36,
"grad_norm": 3.3283987045288086,
"learning_rate": 5.501539114743658e-05,
"loss": 0.1449,
"step": 4260
},
{
"epoch": 1.36,
"grad_norm": 2.7718186378479004,
"learning_rate": 5.490924530304639e-05,
"loss": 0.1237,
"step": 4270
},
{
"epoch": 1.36,
"grad_norm": 1.7264149188995361,
"learning_rate": 5.480309945865619e-05,
"loss": 0.0537,
"step": 4280
},
{
"epoch": 1.37,
"grad_norm": 2.8292267322540283,
"learning_rate": 5.4696953614266e-05,
"loss": 0.1139,
"step": 4290
},
{
"epoch": 1.37,
"grad_norm": 2.6377663612365723,
"learning_rate": 5.4590807769875806e-05,
"loss": 0.1632,
"step": 4300
},
{
"epoch": 1.37,
"grad_norm": 0.1827862560749054,
"learning_rate": 5.4484661925485624e-05,
"loss": 0.1809,
"step": 4310
},
{
"epoch": 1.38,
"grad_norm": 5.187005996704102,
"learning_rate": 5.4378516081095434e-05,
"loss": 0.1735,
"step": 4320
},
{
"epoch": 1.38,
"grad_norm": 2.064953327178955,
"learning_rate": 5.427237023670524e-05,
"loss": 0.3226,
"step": 4330
},
{
"epoch": 1.38,
"grad_norm": 0.03769757226109505,
"learning_rate": 5.416622439231505e-05,
"loss": 0.1563,
"step": 4340
},
{
"epoch": 1.38,
"grad_norm": 5.220246315002441,
"learning_rate": 5.406007854792485e-05,
"loss": 0.2403,
"step": 4350
},
{
"epoch": 1.39,
"grad_norm": 0.1891440451145172,
"learning_rate": 5.395393270353466e-05,
"loss": 0.1741,
"step": 4360
},
{
"epoch": 1.39,
"grad_norm": 5.661322116851807,
"learning_rate": 5.3847786859144466e-05,
"loss": 0.1514,
"step": 4370
},
{
"epoch": 1.39,
"grad_norm": 8.325531005859375,
"learning_rate": 5.3741641014754277e-05,
"loss": 0.1954,
"step": 4380
},
{
"epoch": 1.4,
"grad_norm": 3.1849327087402344,
"learning_rate": 5.363549517036408e-05,
"loss": 0.2667,
"step": 4390
},
{
"epoch": 1.4,
"grad_norm": 4.426061153411865,
"learning_rate": 5.352934932597389e-05,
"loss": 0.1621,
"step": 4400
},
{
"epoch": 1.4,
"grad_norm": 0.08511369675397873,
"learning_rate": 5.3423203481583694e-05,
"loss": 0.2384,
"step": 4410
},
{
"epoch": 1.41,
"grad_norm": 2.6035985946655273,
"learning_rate": 5.3317057637193505e-05,
"loss": 0.2029,
"step": 4420
},
{
"epoch": 1.41,
"grad_norm": 3.637746810913086,
"learning_rate": 5.321091179280332e-05,
"loss": 0.2054,
"step": 4430
},
{
"epoch": 1.41,
"grad_norm": 2.6887290477752686,
"learning_rate": 5.3104765948413126e-05,
"loss": 0.194,
"step": 4440
},
{
"epoch": 1.42,
"grad_norm": 0.5362237691879272,
"learning_rate": 5.2998620104022936e-05,
"loss": 0.1243,
"step": 4450
},
{
"epoch": 1.42,
"grad_norm": 6.602662086486816,
"learning_rate": 5.289247425963274e-05,
"loss": 0.1005,
"step": 4460
},
{
"epoch": 1.42,
"grad_norm": 0.16585449874401093,
"learning_rate": 5.278632841524255e-05,
"loss": 0.116,
"step": 4470
},
{
"epoch": 1.43,
"grad_norm": 3.062458038330078,
"learning_rate": 5.2690797155291374e-05,
"loss": 0.2236,
"step": 4480
},
{
"epoch": 1.43,
"grad_norm": 3.1578338146209717,
"learning_rate": 5.258465131090118e-05,
"loss": 0.1248,
"step": 4490
},
{
"epoch": 1.43,
"grad_norm": 6.487752914428711,
"learning_rate": 5.247850546651099e-05,
"loss": 0.2268,
"step": 4500
},
{
"epoch": 1.44,
"grad_norm": 4.561209678649902,
"learning_rate": 5.237235962212079e-05,
"loss": 0.3183,
"step": 4510
},
{
"epoch": 1.44,
"grad_norm": 1.6614716053009033,
"learning_rate": 5.22662137777306e-05,
"loss": 0.2555,
"step": 4520
},
{
"epoch": 1.44,
"grad_norm": 2.4814791679382324,
"learning_rate": 5.216006793334042e-05,
"loss": 0.1524,
"step": 4530
},
{
"epoch": 1.45,
"grad_norm": 0.17691956460475922,
"learning_rate": 5.205392208895022e-05,
"loss": 0.1934,
"step": 4540
},
{
"epoch": 1.45,
"grad_norm": 5.082562446594238,
"learning_rate": 5.1947776244560033e-05,
"loss": 0.4279,
"step": 4550
},
{
"epoch": 1.45,
"grad_norm": 3.106387138366699,
"learning_rate": 5.184163040016984e-05,
"loss": 0.1194,
"step": 4560
},
{
"epoch": 1.45,
"grad_norm": 7.02073335647583,
"learning_rate": 5.173548455577965e-05,
"loss": 0.1109,
"step": 4570
},
{
"epoch": 1.46,
"grad_norm": 0.2526942193508148,
"learning_rate": 5.162933871138945e-05,
"loss": 0.1913,
"step": 4580
},
{
"epoch": 1.46,
"grad_norm": 4.575504302978516,
"learning_rate": 5.152319286699926e-05,
"loss": 0.2151,
"step": 4590
},
{
"epoch": 1.46,
"grad_norm": 2.3890509605407715,
"learning_rate": 5.1417047022609066e-05,
"loss": 0.2336,
"step": 4600
},
{
"epoch": 1.47,
"grad_norm": 0.8267619013786316,
"learning_rate": 5.1310901178218876e-05,
"loss": 0.0856,
"step": 4610
},
{
"epoch": 1.47,
"grad_norm": 4.056538105010986,
"learning_rate": 5.120475533382868e-05,
"loss": 0.1947,
"step": 4620
},
{
"epoch": 1.47,
"grad_norm": 6.964923858642578,
"learning_rate": 5.109860948943849e-05,
"loss": 0.1195,
"step": 4630
},
{
"epoch": 1.48,
"grad_norm": 2.813004970550537,
"learning_rate": 5.100307822948732e-05,
"loss": 0.1225,
"step": 4640
},
{
"epoch": 1.48,
"grad_norm": 2.654339075088501,
"learning_rate": 5.089693238509713e-05,
"loss": 0.1006,
"step": 4650
},
{
"epoch": 1.48,
"grad_norm": 6.5991644859313965,
"learning_rate": 5.0790786540706934e-05,
"loss": 0.2646,
"step": 4660
},
{
"epoch": 1.49,
"grad_norm": 5.099368572235107,
"learning_rate": 5.0684640696316745e-05,
"loss": 0.2748,
"step": 4670
},
{
"epoch": 1.49,
"grad_norm": 5.0444655418396,
"learning_rate": 5.057849485192655e-05,
"loss": 0.2295,
"step": 4680
},
{
"epoch": 1.49,
"grad_norm": 0.07431354373693466,
"learning_rate": 5.047234900753636e-05,
"loss": 0.1348,
"step": 4690
},
{
"epoch": 1.5,
"grad_norm": 0.1366661787033081,
"learning_rate": 5.036620316314616e-05,
"loss": 0.1164,
"step": 4700
},
{
"epoch": 1.5,
"grad_norm": 4.550073146820068,
"learning_rate": 5.026005731875597e-05,
"loss": 0.2377,
"step": 4710
},
{
"epoch": 1.5,
"grad_norm": 0.12663549184799194,
"learning_rate": 5.015391147436578e-05,
"loss": 0.0871,
"step": 4720
},
{
"epoch": 1.51,
"grad_norm": 5.191462993621826,
"learning_rate": 5.004776562997559e-05,
"loss": 0.2778,
"step": 4730
},
{
"epoch": 1.51,
"grad_norm": 2.7582337856292725,
"learning_rate": 4.99416197855854e-05,
"loss": 0.203,
"step": 4740
},
{
"epoch": 1.51,
"grad_norm": 7.114481449127197,
"learning_rate": 4.98354739411952e-05,
"loss": 0.1426,
"step": 4750
},
{
"epoch": 1.52,
"grad_norm": 0.41717416048049927,
"learning_rate": 4.972932809680501e-05,
"loss": 0.2009,
"step": 4760
},
{
"epoch": 1.52,
"grad_norm": 1.8175145387649536,
"learning_rate": 4.9623182252414816e-05,
"loss": 0.1152,
"step": 4770
},
{
"epoch": 1.52,
"grad_norm": 3.585702419281006,
"learning_rate": 4.951703640802463e-05,
"loss": 0.1615,
"step": 4780
},
{
"epoch": 1.52,
"grad_norm": 0.385105699300766,
"learning_rate": 4.9410890563634437e-05,
"loss": 0.1569,
"step": 4790
},
{
"epoch": 1.53,
"grad_norm": 2.8163392543792725,
"learning_rate": 4.930474471924425e-05,
"loss": 0.0942,
"step": 4800
},
{
"epoch": 1.53,
"grad_norm": 5.181662082672119,
"learning_rate": 4.919859887485405e-05,
"loss": 0.2076,
"step": 4810
},
{
"epoch": 1.53,
"grad_norm": 0.15229104459285736,
"learning_rate": 4.909245303046386e-05,
"loss": 0.2249,
"step": 4820
},
{
"epoch": 1.54,
"grad_norm": 3.2373440265655518,
"learning_rate": 4.8986307186073665e-05,
"loss": 0.5439,
"step": 4830
},
{
"epoch": 1.54,
"grad_norm": 1.7857202291488647,
"learning_rate": 4.8880161341683475e-05,
"loss": 0.1806,
"step": 4840
},
{
"epoch": 1.54,
"grad_norm": 1.1035951375961304,
"learning_rate": 4.8774015497293286e-05,
"loss": 0.1309,
"step": 4850
},
{
"epoch": 1.55,
"grad_norm": 7.660123825073242,
"learning_rate": 4.866786965290309e-05,
"loss": 0.1587,
"step": 4860
},
{
"epoch": 1.55,
"grad_norm": 0.20227286219596863,
"learning_rate": 4.85617238085129e-05,
"loss": 0.3051,
"step": 4870
},
{
"epoch": 1.55,
"grad_norm": 6.558931827545166,
"learning_rate": 4.8455577964122704e-05,
"loss": 0.2137,
"step": 4880
},
{
"epoch": 1.56,
"grad_norm": 2.683018922805786,
"learning_rate": 4.8349432119732514e-05,
"loss": 0.1528,
"step": 4890
},
{
"epoch": 1.56,
"grad_norm": 1.2843786478042603,
"learning_rate": 4.8243286275342325e-05,
"loss": 0.1525,
"step": 4900
},
{
"epoch": 1.56,
"grad_norm": 0.9824750423431396,
"learning_rate": 4.8137140430952135e-05,
"loss": 0.1682,
"step": 4910
},
{
"epoch": 1.57,
"grad_norm": 1.0165822505950928,
"learning_rate": 4.803099458656194e-05,
"loss": 0.2397,
"step": 4920
},
{
"epoch": 1.57,
"grad_norm": 2.0921578407287598,
"learning_rate": 4.792484874217175e-05,
"loss": 0.2342,
"step": 4930
},
{
"epoch": 1.57,
"grad_norm": 2.5232343673706055,
"learning_rate": 4.781870289778155e-05,
"loss": 0.2216,
"step": 4940
},
{
"epoch": 1.58,
"grad_norm": 5.7156782150268555,
"learning_rate": 4.7712557053391363e-05,
"loss": 0.2342,
"step": 4950
},
{
"epoch": 1.58,
"grad_norm": 3.128016233444214,
"learning_rate": 4.760641120900117e-05,
"loss": 0.1759,
"step": 4960
},
{
"epoch": 1.58,
"grad_norm": 2.2040598392486572,
"learning_rate": 4.750026536461098e-05,
"loss": 0.1414,
"step": 4970
},
{
"epoch": 1.59,
"grad_norm": 2.1795644760131836,
"learning_rate": 4.739411952022079e-05,
"loss": 0.1648,
"step": 4980
},
{
"epoch": 1.59,
"grad_norm": 5.399777412414551,
"learning_rate": 4.728797367583059e-05,
"loss": 0.1344,
"step": 4990
},
{
"epoch": 1.59,
"grad_norm": 0.06098851189017296,
"learning_rate": 4.71818278314404e-05,
"loss": 0.1188,
"step": 5000
},
{
"epoch": 1.59,
"grad_norm": 3.174159049987793,
"learning_rate": 4.7075681987050206e-05,
"loss": 0.3419,
"step": 5010
},
{
"epoch": 1.6,
"grad_norm": 4.566168308258057,
"learning_rate": 4.6969536142660016e-05,
"loss": 0.2582,
"step": 5020
},
{
"epoch": 1.6,
"grad_norm": 0.5227226614952087,
"learning_rate": 4.686339029826983e-05,
"loss": 0.1691,
"step": 5030
},
{
"epoch": 1.6,
"grad_norm": 5.8460869789123535,
"learning_rate": 4.675724445387963e-05,
"loss": 0.1399,
"step": 5040
},
{
"epoch": 1.61,
"grad_norm": 2.2399487495422363,
"learning_rate": 4.665109860948944e-05,
"loss": 0.1549,
"step": 5050
},
{
"epoch": 1.61,
"grad_norm": 2.9508166313171387,
"learning_rate": 4.6544952765099245e-05,
"loss": 0.1665,
"step": 5060
},
{
"epoch": 1.61,
"grad_norm": 2.5230746269226074,
"learning_rate": 4.6438806920709055e-05,
"loss": 0.2108,
"step": 5070
},
{
"epoch": 1.62,
"grad_norm": 0.5516650080680847,
"learning_rate": 4.633266107631886e-05,
"loss": 0.2275,
"step": 5080
},
{
"epoch": 1.62,
"grad_norm": 8.398303985595703,
"learning_rate": 4.6226515231928676e-05,
"loss": 0.1801,
"step": 5090
},
{
"epoch": 1.62,
"grad_norm": 0.2512928247451782,
"learning_rate": 4.612036938753848e-05,
"loss": 0.2654,
"step": 5100
},
{
"epoch": 1.63,
"grad_norm": 5.312344551086426,
"learning_rate": 4.601422354314829e-05,
"loss": 0.2992,
"step": 5110
},
{
"epoch": 1.63,
"grad_norm": 1.728023648262024,
"learning_rate": 4.5908077698758094e-05,
"loss": 0.1638,
"step": 5120
},
{
"epoch": 1.63,
"grad_norm": 1.6222649812698364,
"learning_rate": 4.5801931854367904e-05,
"loss": 0.2216,
"step": 5130
},
{
"epoch": 1.64,
"grad_norm": 0.5581383109092712,
"learning_rate": 4.569578600997771e-05,
"loss": 0.2467,
"step": 5140
},
{
"epoch": 1.64,
"grad_norm": 3.051811456680298,
"learning_rate": 4.558964016558752e-05,
"loss": 0.1486,
"step": 5150
},
{
"epoch": 1.64,
"grad_norm": 0.6013765931129456,
"learning_rate": 4.548349432119733e-05,
"loss": 0.123,
"step": 5160
},
{
"epoch": 1.65,
"grad_norm": 3.8984789848327637,
"learning_rate": 4.537734847680713e-05,
"loss": 0.3698,
"step": 5170
},
{
"epoch": 1.65,
"grad_norm": 1.3346749544143677,
"learning_rate": 4.527120263241694e-05,
"loss": 0.1814,
"step": 5180
},
{
"epoch": 1.65,
"grad_norm": 11.491423606872559,
"learning_rate": 4.516505678802675e-05,
"loss": 0.1745,
"step": 5190
},
{
"epoch": 1.66,
"grad_norm": 2.358656883239746,
"learning_rate": 4.505891094363656e-05,
"loss": 0.2734,
"step": 5200
},
{
"epoch": 1.66,
"grad_norm": 3.3352041244506836,
"learning_rate": 4.495276509924637e-05,
"loss": 0.2054,
"step": 5210
},
{
"epoch": 1.66,
"grad_norm": 0.052441373467445374,
"learning_rate": 4.484661925485618e-05,
"loss": 0.1389,
"step": 5220
},
{
"epoch": 1.66,
"grad_norm": 0.20047003030776978,
"learning_rate": 4.474047341046598e-05,
"loss": 0.1197,
"step": 5230
},
{
"epoch": 1.67,
"grad_norm": 1.4837030172348022,
"learning_rate": 4.463432756607579e-05,
"loss": 0.2446,
"step": 5240
},
{
"epoch": 1.67,
"grad_norm": 0.3104861378669739,
"learning_rate": 4.4528181721685596e-05,
"loss": 0.1842,
"step": 5250
},
{
"epoch": 1.67,
"grad_norm": 7.954286098480225,
"learning_rate": 4.442203587729541e-05,
"loss": 0.1221,
"step": 5260
},
{
"epoch": 1.68,
"grad_norm": 0.03400198742747307,
"learning_rate": 4.431589003290522e-05,
"loss": 0.1513,
"step": 5270
},
{
"epoch": 1.68,
"grad_norm": 0.08371475338935852,
"learning_rate": 4.420974418851502e-05,
"loss": 0.2098,
"step": 5280
},
{
"epoch": 1.68,
"grad_norm": 1.2470760345458984,
"learning_rate": 4.410359834412483e-05,
"loss": 0.117,
"step": 5290
},
{
"epoch": 1.69,
"grad_norm": 1.5426656007766724,
"learning_rate": 4.3997452499734635e-05,
"loss": 0.1826,
"step": 5300
},
{
"epoch": 1.69,
"grad_norm": 3.978109121322632,
"learning_rate": 4.3891306655344445e-05,
"loss": 0.1103,
"step": 5310
},
{
"epoch": 1.69,
"grad_norm": 1.6321693658828735,
"learning_rate": 4.378516081095425e-05,
"loss": 0.151,
"step": 5320
},
{
"epoch": 1.7,
"grad_norm": 2.555723190307617,
"learning_rate": 4.3679014966564066e-05,
"loss": 0.1786,
"step": 5330
},
{
"epoch": 1.7,
"grad_norm": 0.2461155354976654,
"learning_rate": 4.357286912217387e-05,
"loss": 0.1914,
"step": 5340
},
{
"epoch": 1.7,
"grad_norm": 0.41670894622802734,
"learning_rate": 4.346672327778368e-05,
"loss": 0.2582,
"step": 5350
},
{
"epoch": 1.71,
"grad_norm": 4.785902976989746,
"learning_rate": 4.3360577433393484e-05,
"loss": 0.0911,
"step": 5360
},
{
"epoch": 1.71,
"grad_norm": 4.179080963134766,
"learning_rate": 4.3254431589003295e-05,
"loss": 0.2264,
"step": 5370
},
{
"epoch": 1.71,
"grad_norm": 0.9344226717948914,
"learning_rate": 4.31482857446131e-05,
"loss": 0.2003,
"step": 5380
},
{
"epoch": 1.72,
"grad_norm": 0.3643859624862671,
"learning_rate": 4.304213990022291e-05,
"loss": 0.1,
"step": 5390
},
{
"epoch": 1.72,
"grad_norm": 2.3688154220581055,
"learning_rate": 4.293599405583272e-05,
"loss": 0.2461,
"step": 5400
},
{
"epoch": 1.72,
"grad_norm": 4.223112106323242,
"learning_rate": 4.282984821144252e-05,
"loss": 0.1316,
"step": 5410
},
{
"epoch": 1.73,
"grad_norm": 1.52751886844635,
"learning_rate": 4.2723702367052333e-05,
"loss": 0.162,
"step": 5420
},
{
"epoch": 1.73,
"grad_norm": 0.06534834951162338,
"learning_rate": 4.261755652266214e-05,
"loss": 0.1787,
"step": 5430
},
{
"epoch": 1.73,
"grad_norm": 0.0435919463634491,
"learning_rate": 4.251141067827195e-05,
"loss": 0.2196,
"step": 5440
},
{
"epoch": 1.73,
"grad_norm": 1.0877362489700317,
"learning_rate": 4.240526483388176e-05,
"loss": 0.2829,
"step": 5450
},
{
"epoch": 1.74,
"grad_norm": 1.7220368385314941,
"learning_rate": 4.229911898949156e-05,
"loss": 0.211,
"step": 5460
},
{
"epoch": 1.74,
"grad_norm": 1.6200969219207764,
"learning_rate": 4.219297314510137e-05,
"loss": 0.2046,
"step": 5470
},
{
"epoch": 1.74,
"grad_norm": 2.376384735107422,
"learning_rate": 4.2086827300711176e-05,
"loss": 0.2518,
"step": 5480
},
{
"epoch": 1.75,
"grad_norm": 1.6646453142166138,
"learning_rate": 4.1980681456320986e-05,
"loss": 0.1542,
"step": 5490
},
{
"epoch": 1.75,
"grad_norm": 0.580792248249054,
"learning_rate": 4.187453561193079e-05,
"loss": 0.1503,
"step": 5500
},
{
"epoch": 1.75,
"grad_norm": 2.325477123260498,
"learning_rate": 4.176838976754061e-05,
"loss": 0.1867,
"step": 5510
},
{
"epoch": 1.76,
"grad_norm": 3.004499673843384,
"learning_rate": 4.166224392315041e-05,
"loss": 0.1816,
"step": 5520
},
{
"epoch": 1.76,
"grad_norm": 1.7592769861221313,
"learning_rate": 4.155609807876022e-05,
"loss": 0.2155,
"step": 5530
},
{
"epoch": 1.76,
"grad_norm": 0.4255143105983734,
"learning_rate": 4.1449952234370025e-05,
"loss": 0.2298,
"step": 5540
},
{
"epoch": 1.77,
"grad_norm": 4.217332363128662,
"learning_rate": 4.1343806389979836e-05,
"loss": 0.1263,
"step": 5550
},
{
"epoch": 1.77,
"grad_norm": 1.6670517921447754,
"learning_rate": 4.123766054558964e-05,
"loss": 0.1993,
"step": 5560
},
{
"epoch": 1.77,
"grad_norm": 0.2432798445224762,
"learning_rate": 4.113151470119945e-05,
"loss": 0.1992,
"step": 5570
},
{
"epoch": 1.78,
"grad_norm": 5.0905070304870605,
"learning_rate": 4.102536885680926e-05,
"loss": 0.1381,
"step": 5580
},
{
"epoch": 1.78,
"grad_norm": 12.299093246459961,
"learning_rate": 4.0919223012419064e-05,
"loss": 0.2233,
"step": 5590
},
{
"epoch": 1.78,
"grad_norm": 0.27092546224594116,
"learning_rate": 4.0813077168028874e-05,
"loss": 0.1675,
"step": 5600
},
{
"epoch": 1.79,
"grad_norm": 3.4481306076049805,
"learning_rate": 4.070693132363868e-05,
"loss": 0.3113,
"step": 5610
},
{
"epoch": 1.79,
"grad_norm": 12.642804145812988,
"learning_rate": 4.060078547924849e-05,
"loss": 0.1557,
"step": 5620
},
{
"epoch": 1.79,
"grad_norm": 4.341307163238525,
"learning_rate": 4.049463963485829e-05,
"loss": 0.0825,
"step": 5630
},
{
"epoch": 1.8,
"grad_norm": 0.728386402130127,
"learning_rate": 4.038849379046811e-05,
"loss": 0.1589,
"step": 5640
},
{
"epoch": 1.8,
"grad_norm": 4.2692084312438965,
"learning_rate": 4.028234794607791e-05,
"loss": 0.0908,
"step": 5650
},
{
"epoch": 1.8,
"grad_norm": 3.5218265056610107,
"learning_rate": 4.0176202101687724e-05,
"loss": 0.2008,
"step": 5660
},
{
"epoch": 1.8,
"grad_norm": 0.6934779286384583,
"learning_rate": 4.007005625729753e-05,
"loss": 0.1652,
"step": 5670
},
{
"epoch": 1.81,
"grad_norm": 7.079185485839844,
"learning_rate": 3.996391041290734e-05,
"loss": 0.1854,
"step": 5680
},
{
"epoch": 1.81,
"grad_norm": 2.6828112602233887,
"learning_rate": 3.985776456851714e-05,
"loss": 0.0911,
"step": 5690
},
{
"epoch": 1.81,
"grad_norm": 5.049779891967773,
"learning_rate": 3.975161872412695e-05,
"loss": 0.1191,
"step": 5700
},
{
"epoch": 1.82,
"grad_norm": 2.4732673168182373,
"learning_rate": 3.9656087464175775e-05,
"loss": 0.2192,
"step": 5710
},
{
"epoch": 1.82,
"grad_norm": 0.11808130145072937,
"learning_rate": 3.9549941619785586e-05,
"loss": 0.1782,
"step": 5720
},
{
"epoch": 1.82,
"grad_norm": 3.8879833221435547,
"learning_rate": 3.944379577539539e-05,
"loss": 0.1692,
"step": 5730
},
{
"epoch": 1.83,
"grad_norm": 3.667048931121826,
"learning_rate": 3.933764993100521e-05,
"loss": 0.1236,
"step": 5740
},
{
"epoch": 1.83,
"grad_norm": 4.494665622711182,
"learning_rate": 3.923150408661501e-05,
"loss": 0.2373,
"step": 5750
},
{
"epoch": 1.83,
"grad_norm": 0.3976966440677643,
"learning_rate": 3.912535824222482e-05,
"loss": 0.2805,
"step": 5760
},
{
"epoch": 1.84,
"grad_norm": 2.046142578125,
"learning_rate": 3.9019212397834625e-05,
"loss": 0.1198,
"step": 5770
},
{
"epoch": 1.84,
"grad_norm": 0.27937573194503784,
"learning_rate": 3.8913066553444435e-05,
"loss": 0.1443,
"step": 5780
},
{
"epoch": 1.84,
"grad_norm": 6.109045028686523,
"learning_rate": 3.880692070905424e-05,
"loss": 0.3341,
"step": 5790
},
{
"epoch": 1.85,
"grad_norm": 0.7306396961212158,
"learning_rate": 3.870077486466405e-05,
"loss": 0.1208,
"step": 5800
},
{
"epoch": 1.85,
"grad_norm": 1.7087950706481934,
"learning_rate": 3.859462902027386e-05,
"loss": 0.1464,
"step": 5810
},
{
"epoch": 1.85,
"grad_norm": 0.5200537443161011,
"learning_rate": 3.8488483175883663e-05,
"loss": 0.1639,
"step": 5820
},
{
"epoch": 1.86,
"grad_norm": 6.455096244812012,
"learning_rate": 3.8382337331493474e-05,
"loss": 0.1885,
"step": 5830
},
{
"epoch": 1.86,
"grad_norm": 7.437272548675537,
"learning_rate": 3.827619148710328e-05,
"loss": 0.1916,
"step": 5840
},
{
"epoch": 1.86,
"grad_norm": 6.395534515380859,
"learning_rate": 3.817004564271309e-05,
"loss": 0.2988,
"step": 5850
},
{
"epoch": 1.87,
"grad_norm": 20.61446762084961,
"learning_rate": 3.80638997983229e-05,
"loss": 0.0853,
"step": 5860
},
{
"epoch": 1.87,
"grad_norm": 1.0395785570144653,
"learning_rate": 3.795775395393271e-05,
"loss": 0.2113,
"step": 5870
},
{
"epoch": 1.87,
"grad_norm": 8.83860969543457,
"learning_rate": 3.785160810954251e-05,
"loss": 0.1904,
"step": 5880
},
{
"epoch": 1.87,
"grad_norm": 5.42601203918457,
"learning_rate": 3.774546226515232e-05,
"loss": 0.3887,
"step": 5890
},
{
"epoch": 1.88,
"grad_norm": 3.3505442142486572,
"learning_rate": 3.763931642076213e-05,
"loss": 0.1397,
"step": 5900
},
{
"epoch": 1.88,
"grad_norm": 4.929141521453857,
"learning_rate": 3.753317057637194e-05,
"loss": 0.2773,
"step": 5910
},
{
"epoch": 1.88,
"grad_norm": 2.1540703773498535,
"learning_rate": 3.742702473198175e-05,
"loss": 0.1679,
"step": 5920
},
{
"epoch": 1.89,
"grad_norm": 10.82689094543457,
"learning_rate": 3.732087888759155e-05,
"loss": 0.1776,
"step": 5930
},
{
"epoch": 1.89,
"grad_norm": 3.0525174140930176,
"learning_rate": 3.721473304320136e-05,
"loss": 0.1619,
"step": 5940
},
{
"epoch": 1.89,
"grad_norm": 5.296212196350098,
"learning_rate": 3.7108587198811166e-05,
"loss": 0.3294,
"step": 5950
},
{
"epoch": 1.9,
"grad_norm": 3.4226958751678467,
"learning_rate": 3.7002441354420976e-05,
"loss": 0.3229,
"step": 5960
},
{
"epoch": 1.9,
"grad_norm": 0.4734908938407898,
"learning_rate": 3.689629551003078e-05,
"loss": 0.1179,
"step": 5970
},
{
"epoch": 1.9,
"grad_norm": 5.436024188995361,
"learning_rate": 3.67901496656406e-05,
"loss": 0.1892,
"step": 5980
},
{
"epoch": 1.91,
"grad_norm": 5.233070373535156,
"learning_rate": 3.66840038212504e-05,
"loss": 0.2054,
"step": 5990
},
{
"epoch": 1.91,
"grad_norm": 0.5661432147026062,
"learning_rate": 3.657785797686021e-05,
"loss": 0.2202,
"step": 6000
},
{
"epoch": 1.91,
"grad_norm": 0.23524077236652374,
"learning_rate": 3.6471712132470015e-05,
"loss": 0.2318,
"step": 6010
},
{
"epoch": 1.92,
"grad_norm": 0.05953243002295494,
"learning_rate": 3.6365566288079825e-05,
"loss": 0.2486,
"step": 6020
},
{
"epoch": 1.92,
"grad_norm": 1.3823449611663818,
"learning_rate": 3.625942044368963e-05,
"loss": 0.1171,
"step": 6030
},
{
"epoch": 1.92,
"grad_norm": 7.733388423919678,
"learning_rate": 3.615327459929944e-05,
"loss": 0.2469,
"step": 6040
},
{
"epoch": 1.93,
"grad_norm": 1.4917621612548828,
"learning_rate": 3.604712875490925e-05,
"loss": 0.2045,
"step": 6050
},
{
"epoch": 1.93,
"grad_norm": 7.689728736877441,
"learning_rate": 3.5940982910519054e-05,
"loss": 0.1648,
"step": 6060
},
{
"epoch": 1.93,
"grad_norm": 2.2216577529907227,
"learning_rate": 3.5834837066128864e-05,
"loss": 0.2779,
"step": 6070
},
{
"epoch": 1.94,
"grad_norm": 1.7362425327301025,
"learning_rate": 3.572869122173867e-05,
"loss": 0.1664,
"step": 6080
},
{
"epoch": 1.94,
"grad_norm": 4.933811187744141,
"learning_rate": 3.562254537734848e-05,
"loss": 0.293,
"step": 6090
},
{
"epoch": 1.94,
"grad_norm": 4.054910182952881,
"learning_rate": 3.551639953295829e-05,
"loss": 0.1539,
"step": 6100
},
{
"epoch": 1.94,
"grad_norm": 0.9219651222229004,
"learning_rate": 3.541025368856809e-05,
"loss": 0.1111,
"step": 6110
},
{
"epoch": 1.95,
"grad_norm": 4.558506488800049,
"learning_rate": 3.53041078441779e-05,
"loss": 0.1783,
"step": 6120
},
{
"epoch": 1.95,
"grad_norm": 2.6951773166656494,
"learning_rate": 3.5197961999787707e-05,
"loss": 0.2916,
"step": 6130
},
{
"epoch": 1.95,
"grad_norm": 0.9989050626754761,
"learning_rate": 3.509181615539752e-05,
"loss": 0.2099,
"step": 6140
},
{
"epoch": 1.96,
"grad_norm": 0.08494656533002853,
"learning_rate": 3.498567031100732e-05,
"loss": 0.1255,
"step": 6150
},
{
"epoch": 1.96,
"grad_norm": 0.20273062586784363,
"learning_rate": 3.487952446661714e-05,
"loss": 0.1523,
"step": 6160
},
{
"epoch": 1.96,
"grad_norm": 0.2878829538822174,
"learning_rate": 3.477337862222694e-05,
"loss": 0.1732,
"step": 6170
},
{
"epoch": 1.97,
"grad_norm": 2.026616096496582,
"learning_rate": 3.466723277783675e-05,
"loss": 0.1037,
"step": 6180
},
{
"epoch": 1.97,
"grad_norm": 0.8350101709365845,
"learning_rate": 3.4561086933446556e-05,
"loss": 0.1169,
"step": 6190
},
{
"epoch": 1.97,
"grad_norm": 0.6492775082588196,
"learning_rate": 3.4454941089056366e-05,
"loss": 0.1758,
"step": 6200
},
{
"epoch": 1.98,
"grad_norm": 4.830353736877441,
"learning_rate": 3.434879524466617e-05,
"loss": 0.3367,
"step": 6210
},
{
"epoch": 1.98,
"grad_norm": 5.267330169677734,
"learning_rate": 3.424264940027598e-05,
"loss": 0.1753,
"step": 6220
},
{
"epoch": 1.98,
"grad_norm": 0.11368358880281448,
"learning_rate": 3.413650355588579e-05,
"loss": 0.2409,
"step": 6230
},
{
"epoch": 1.99,
"grad_norm": 0.10408168286085129,
"learning_rate": 3.4030357711495595e-05,
"loss": 0.1407,
"step": 6240
},
{
"epoch": 1.99,
"grad_norm": 4.495917320251465,
"learning_rate": 3.3924211867105405e-05,
"loss": 0.1504,
"step": 6250
},
{
"epoch": 1.99,
"grad_norm": 0.16925585269927979,
"learning_rate": 3.381806602271521e-05,
"loss": 0.1323,
"step": 6260
},
{
"epoch": 2.0,
"grad_norm": 2.5475289821624756,
"learning_rate": 3.371192017832502e-05,
"loss": 0.1902,
"step": 6270
},
{
"epoch": 2.0,
"grad_norm": 2.21207332611084,
"learning_rate": 3.360577433393483e-05,
"loss": 0.1019,
"step": 6280
},
{
"epoch": 2.0,
"grad_norm": 2.7308425903320312,
"learning_rate": 3.349962848954464e-05,
"loss": 0.1368,
"step": 6290
},
{
"epoch": 2.01,
"grad_norm": 0.8695929646492004,
"learning_rate": 3.3393482645154444e-05,
"loss": 0.1979,
"step": 6300
},
{
"epoch": 2.01,
"grad_norm": 5.150228977203369,
"learning_rate": 3.3287336800764254e-05,
"loss": 0.1237,
"step": 6310
},
{
"epoch": 2.01,
"grad_norm": 0.1432078331708908,
"learning_rate": 3.318119095637406e-05,
"loss": 0.1547,
"step": 6320
},
{
"epoch": 2.01,
"grad_norm": 3.952962875366211,
"learning_rate": 3.307504511198387e-05,
"loss": 0.1682,
"step": 6330
},
{
"epoch": 2.02,
"grad_norm": 0.044416822493076324,
"learning_rate": 3.296889926759367e-05,
"loss": 0.0388,
"step": 6340
},
{
"epoch": 2.02,
"grad_norm": 6.307524681091309,
"learning_rate": 3.286275342320348e-05,
"loss": 0.1418,
"step": 6350
},
{
"epoch": 2.02,
"grad_norm": 0.1354295015335083,
"learning_rate": 3.275660757881329e-05,
"loss": 0.2588,
"step": 6360
},
{
"epoch": 2.03,
"grad_norm": 3.275066614151001,
"learning_rate": 3.26504617344231e-05,
"loss": 0.1091,
"step": 6370
},
{
"epoch": 2.03,
"grad_norm": 0.0923081785440445,
"learning_rate": 3.254431589003291e-05,
"loss": 0.1384,
"step": 6380
},
{
"epoch": 2.03,
"grad_norm": 3.508528232574463,
"learning_rate": 3.243817004564271e-05,
"loss": 0.217,
"step": 6390
},
{
"epoch": 2.04,
"grad_norm": 2.36240291595459,
"learning_rate": 3.233202420125252e-05,
"loss": 0.0337,
"step": 6400
},
{
"epoch": 2.04,
"grad_norm": 0.20124652981758118,
"learning_rate": 3.222587835686233e-05,
"loss": 0.0982,
"step": 6410
},
{
"epoch": 2.04,
"grad_norm": 0.8248081803321838,
"learning_rate": 3.211973251247214e-05,
"loss": 0.2217,
"step": 6420
},
{
"epoch": 2.05,
"grad_norm": 1.1201878786087036,
"learning_rate": 3.2013586668081946e-05,
"loss": 0.0651,
"step": 6430
},
{
"epoch": 2.05,
"grad_norm": 1.6418076753616333,
"learning_rate": 3.1907440823691757e-05,
"loss": 0.0738,
"step": 6440
},
{
"epoch": 2.05,
"grad_norm": 2.1913180351257324,
"learning_rate": 3.180129497930156e-05,
"loss": 0.0863,
"step": 6450
},
{
"epoch": 2.06,
"grad_norm": 1.3282325267791748,
"learning_rate": 3.1695149134911364e-05,
"loss": 0.0582,
"step": 6460
},
{
"epoch": 2.06,
"grad_norm": 2.451772451400757,
"learning_rate": 3.158900329052118e-05,
"loss": 0.1187,
"step": 6470
},
{
"epoch": 2.06,
"grad_norm": 0.1372409611940384,
"learning_rate": 3.1482857446130985e-05,
"loss": 0.0618,
"step": 6480
},
{
"epoch": 2.07,
"grad_norm": 0.08469751477241516,
"learning_rate": 3.1376711601740795e-05,
"loss": 0.0316,
"step": 6490
},
{
"epoch": 2.07,
"grad_norm": 0.1473696529865265,
"learning_rate": 3.12705657573506e-05,
"loss": 0.0954,
"step": 6500
},
{
"epoch": 2.07,
"grad_norm": 0.06819278746843338,
"learning_rate": 3.116441991296041e-05,
"loss": 0.1365,
"step": 6510
},
{
"epoch": 2.08,
"grad_norm": 8.832886695861816,
"learning_rate": 3.105827406857021e-05,
"loss": 0.1828,
"step": 6520
},
{
"epoch": 2.08,
"grad_norm": 0.043228354305028915,
"learning_rate": 3.0952128224180024e-05,
"loss": 0.1541,
"step": 6530
},
{
"epoch": 2.08,
"grad_norm": 0.1457592248916626,
"learning_rate": 3.0845982379789834e-05,
"loss": 0.0291,
"step": 6540
},
{
"epoch": 2.08,
"grad_norm": 1.5548399686813354,
"learning_rate": 3.073983653539964e-05,
"loss": 0.123,
"step": 6550
},
{
"epoch": 2.09,
"grad_norm": 5.61803674697876,
"learning_rate": 3.063369069100945e-05,
"loss": 0.1871,
"step": 6560
},
{
"epoch": 2.09,
"grad_norm": 0.020372767001390457,
"learning_rate": 3.052754484661925e-05,
"loss": 0.0865,
"step": 6570
},
{
"epoch": 2.09,
"grad_norm": 5.178860664367676,
"learning_rate": 3.0421399002229062e-05,
"loss": 0.1568,
"step": 6580
},
{
"epoch": 2.1,
"grad_norm": 4.118620872497559,
"learning_rate": 3.0315253157838873e-05,
"loss": 0.0729,
"step": 6590
},
{
"epoch": 2.1,
"grad_norm": 3.9899566173553467,
"learning_rate": 3.020910731344868e-05,
"loss": 0.2327,
"step": 6600
},
{
"epoch": 2.1,
"grad_norm": 1.3902517557144165,
"learning_rate": 3.0102961469058487e-05,
"loss": 0.1305,
"step": 6610
},
{
"epoch": 2.11,
"grad_norm": 5.5835957527160645,
"learning_rate": 2.9996815624668294e-05,
"loss": 0.1032,
"step": 6620
},
{
"epoch": 2.11,
"grad_norm": 1.521474003791809,
"learning_rate": 2.98906697802781e-05,
"loss": 0.1188,
"step": 6630
},
{
"epoch": 2.11,
"grad_norm": 0.19501766562461853,
"learning_rate": 2.978452393588791e-05,
"loss": 0.0989,
"step": 6640
},
{
"epoch": 2.12,
"grad_norm": 0.03989823907613754,
"learning_rate": 2.9678378091497722e-05,
"loss": 0.0736,
"step": 6650
},
{
"epoch": 2.12,
"grad_norm": 3.9346630573272705,
"learning_rate": 2.957223224710753e-05,
"loss": 0.0347,
"step": 6660
},
{
"epoch": 2.12,
"grad_norm": 0.05866791680455208,
"learning_rate": 2.9466086402717336e-05,
"loss": 0.1317,
"step": 6670
},
{
"epoch": 2.13,
"grad_norm": 0.660900890827179,
"learning_rate": 2.9359940558327143e-05,
"loss": 0.1365,
"step": 6680
},
{
"epoch": 2.13,
"grad_norm": 0.20864763855934143,
"learning_rate": 2.925379471393695e-05,
"loss": 0.2221,
"step": 6690
},
{
"epoch": 2.13,
"grad_norm": 2.8652963638305664,
"learning_rate": 2.9147648869546758e-05,
"loss": 0.0355,
"step": 6700
},
{
"epoch": 2.14,
"grad_norm": 3.0343375205993652,
"learning_rate": 2.9041503025156568e-05,
"loss": 0.2081,
"step": 6710
},
{
"epoch": 2.14,
"grad_norm": 2.393002510070801,
"learning_rate": 2.8935357180766375e-05,
"loss": 0.1076,
"step": 6720
},
{
"epoch": 2.14,
"grad_norm": 0.08225111663341522,
"learning_rate": 2.8829211336376182e-05,
"loss": 0.1367,
"step": 6730
},
{
"epoch": 2.15,
"grad_norm": 4.09624719619751,
"learning_rate": 2.872306549198599e-05,
"loss": 0.2712,
"step": 6740
},
{
"epoch": 2.15,
"grad_norm": 0.667273998260498,
"learning_rate": 2.8616919647595796e-05,
"loss": 0.152,
"step": 6750
},
{
"epoch": 2.15,
"grad_norm": 1.4781357049942017,
"learning_rate": 2.8510773803205603e-05,
"loss": 0.0949,
"step": 6760
},
{
"epoch": 2.16,
"grad_norm": 4.563651084899902,
"learning_rate": 2.8404627958815417e-05,
"loss": 0.0873,
"step": 6770
},
{
"epoch": 2.16,
"grad_norm": 3.7740418910980225,
"learning_rate": 2.830909669886424e-05,
"loss": 0.1207,
"step": 6780
},
{
"epoch": 2.16,
"grad_norm": 10.370115280151367,
"learning_rate": 2.8202950854474048e-05,
"loss": 0.1369,
"step": 6790
},
{
"epoch": 2.16,
"grad_norm": 0.13098500669002533,
"learning_rate": 2.8096805010083855e-05,
"loss": 0.1016,
"step": 6800
},
{
"epoch": 2.17,
"grad_norm": 9.170578956604004,
"learning_rate": 2.7990659165693665e-05,
"loss": 0.0463,
"step": 6810
},
{
"epoch": 2.17,
"grad_norm": 10.379976272583008,
"learning_rate": 2.7884513321303472e-05,
"loss": 0.0798,
"step": 6820
},
{
"epoch": 2.17,
"grad_norm": 0.10993140935897827,
"learning_rate": 2.777836747691328e-05,
"loss": 0.084,
"step": 6830
},
{
"epoch": 2.18,
"grad_norm": 0.4707590937614441,
"learning_rate": 2.7672221632523087e-05,
"loss": 0.1232,
"step": 6840
},
{
"epoch": 2.18,
"grad_norm": 4.587014198303223,
"learning_rate": 2.7566075788132894e-05,
"loss": 0.1474,
"step": 6850
},
{
"epoch": 2.18,
"grad_norm": 10.61086654663086,
"learning_rate": 2.74599299437427e-05,
"loss": 0.1282,
"step": 6860
},
{
"epoch": 2.19,
"grad_norm": 0.2299477756023407,
"learning_rate": 2.7353784099352515e-05,
"loss": 0.0667,
"step": 6870
},
{
"epoch": 2.19,
"grad_norm": 5.911661624908447,
"learning_rate": 2.724763825496232e-05,
"loss": 0.1222,
"step": 6880
},
{
"epoch": 2.19,
"grad_norm": 0.1657014936208725,
"learning_rate": 2.714149241057213e-05,
"loss": 0.072,
"step": 6890
},
{
"epoch": 2.2,
"grad_norm": 0.04870441555976868,
"learning_rate": 2.7035346566181936e-05,
"loss": 0.1132,
"step": 6900
},
{
"epoch": 2.2,
"grad_norm": 0.7382871508598328,
"learning_rate": 2.6929200721791743e-05,
"loss": 0.0272,
"step": 6910
},
{
"epoch": 2.2,
"grad_norm": 1.1875141859054565,
"learning_rate": 2.682305487740155e-05,
"loss": 0.0833,
"step": 6920
},
{
"epoch": 2.21,
"grad_norm": 0.070220448076725,
"learning_rate": 2.671690903301136e-05,
"loss": 0.1321,
"step": 6930
},
{
"epoch": 2.21,
"grad_norm": 3.7514150142669678,
"learning_rate": 2.6610763188621167e-05,
"loss": 0.0971,
"step": 6940
},
{
"epoch": 2.21,
"grad_norm": 0.04383459314703941,
"learning_rate": 2.6504617344230975e-05,
"loss": 0.0878,
"step": 6950
},
{
"epoch": 2.22,
"grad_norm": 0.11518880724906921,
"learning_rate": 2.639847149984078e-05,
"loss": 0.0679,
"step": 6960
},
{
"epoch": 2.22,
"grad_norm": 5.474330902099609,
"learning_rate": 2.629232565545059e-05,
"loss": 0.0497,
"step": 6970
},
{
"epoch": 2.22,
"grad_norm": 0.03785128891468048,
"learning_rate": 2.6186179811060396e-05,
"loss": 0.1183,
"step": 6980
},
{
"epoch": 2.23,
"grad_norm": 0.050687942653894424,
"learning_rate": 2.608003396667021e-05,
"loss": 0.1141,
"step": 6990
},
{
"epoch": 2.23,
"grad_norm": 5.501091003417969,
"learning_rate": 2.5973888122280017e-05,
"loss": 0.1175,
"step": 7000
},
{
"epoch": 2.23,
"grad_norm": 1.3896145820617676,
"learning_rate": 2.5867742277889824e-05,
"loss": 0.1665,
"step": 7010
},
{
"epoch": 2.23,
"grad_norm": 5.888062000274658,
"learning_rate": 2.576159643349963e-05,
"loss": 0.1868,
"step": 7020
},
{
"epoch": 2.24,
"grad_norm": 0.3350411653518677,
"learning_rate": 2.5655450589109438e-05,
"loss": 0.0262,
"step": 7030
},
{
"epoch": 2.24,
"grad_norm": 0.12134930491447449,
"learning_rate": 2.5549304744719245e-05,
"loss": 0.1391,
"step": 7040
},
{
"epoch": 2.24,
"grad_norm": 2.653724193572998,
"learning_rate": 2.5443158900329056e-05,
"loss": 0.044,
"step": 7050
},
{
"epoch": 2.25,
"grad_norm": 1.480675458908081,
"learning_rate": 2.5337013055938863e-05,
"loss": 0.125,
"step": 7060
},
{
"epoch": 2.25,
"grad_norm": 2.112579584121704,
"learning_rate": 2.523086721154867e-05,
"loss": 0.0774,
"step": 7070
},
{
"epoch": 2.25,
"grad_norm": 0.03731192275881767,
"learning_rate": 2.5124721367158477e-05,
"loss": 0.0703,
"step": 7080
},
{
"epoch": 2.26,
"grad_norm": 0.06327365338802338,
"learning_rate": 2.5018575522768284e-05,
"loss": 0.1557,
"step": 7090
},
{
"epoch": 2.26,
"grad_norm": 0.10991324484348297,
"learning_rate": 2.4912429678378094e-05,
"loss": 0.0682,
"step": 7100
},
{
"epoch": 2.26,
"grad_norm": 0.03156714513897896,
"learning_rate": 2.48062838339879e-05,
"loss": 0.1716,
"step": 7110
},
{
"epoch": 2.27,
"grad_norm": 9.979147911071777,
"learning_rate": 2.470013798959771e-05,
"loss": 0.1797,
"step": 7120
},
{
"epoch": 2.27,
"grad_norm": 3.263706684112549,
"learning_rate": 2.459399214520752e-05,
"loss": 0.0659,
"step": 7130
},
{
"epoch": 2.27,
"grad_norm": 6.261413097381592,
"learning_rate": 2.4487846300817323e-05,
"loss": 0.0968,
"step": 7140
},
{
"epoch": 2.28,
"grad_norm": 1.550948143005371,
"learning_rate": 2.438170045642713e-05,
"loss": 0.1336,
"step": 7150
},
{
"epoch": 2.28,
"grad_norm": 1.1487703323364258,
"learning_rate": 2.427555461203694e-05,
"loss": 0.0647,
"step": 7160
},
{
"epoch": 2.28,
"grad_norm": 6.673706531524658,
"learning_rate": 2.4169408767646747e-05,
"loss": 0.1567,
"step": 7170
},
{
"epoch": 2.29,
"grad_norm": 0.17169363796710968,
"learning_rate": 2.4063262923256554e-05,
"loss": 0.1096,
"step": 7180
},
{
"epoch": 2.29,
"grad_norm": 8.660694122314453,
"learning_rate": 2.3957117078866365e-05,
"loss": 0.1589,
"step": 7190
},
{
"epoch": 2.29,
"grad_norm": 1.5906010866165161,
"learning_rate": 2.3850971234476172e-05,
"loss": 0.1224,
"step": 7200
},
{
"epoch": 2.3,
"grad_norm": 0.8341835141181946,
"learning_rate": 2.374482539008598e-05,
"loss": 0.02,
"step": 7210
},
{
"epoch": 2.3,
"grad_norm": 10.785898208618164,
"learning_rate": 2.363867954569579e-05,
"loss": 0.1153,
"step": 7220
},
{
"epoch": 2.3,
"grad_norm": 5.174521446228027,
"learning_rate": 2.3532533701305597e-05,
"loss": 0.0843,
"step": 7230
},
{
"epoch": 2.3,
"grad_norm": 0.7447335720062256,
"learning_rate": 2.3426387856915404e-05,
"loss": 0.1465,
"step": 7240
},
{
"epoch": 2.31,
"grad_norm": 3.2618470191955566,
"learning_rate": 2.332024201252521e-05,
"loss": 0.0874,
"step": 7250
},
{
"epoch": 2.31,
"grad_norm": 1.483594298362732,
"learning_rate": 2.3214096168135018e-05,
"loss": 0.1061,
"step": 7260
},
{
"epoch": 2.31,
"grad_norm": 0.303654283285141,
"learning_rate": 2.3107950323744825e-05,
"loss": 0.117,
"step": 7270
},
{
"epoch": 2.32,
"grad_norm": 10.942138671875,
"learning_rate": 2.3001804479354635e-05,
"loss": 0.2048,
"step": 7280
},
{
"epoch": 2.32,
"grad_norm": 7.95550012588501,
"learning_rate": 2.2895658634964442e-05,
"loss": 0.1158,
"step": 7290
},
{
"epoch": 2.32,
"grad_norm": 0.05263487249612808,
"learning_rate": 2.278951279057425e-05,
"loss": 0.0142,
"step": 7300
},
{
"epoch": 2.33,
"grad_norm": 0.04684547707438469,
"learning_rate": 2.268336694618406e-05,
"loss": 0.15,
"step": 7310
},
{
"epoch": 2.33,
"grad_norm": 6.8654890060424805,
"learning_rate": 2.2577221101793867e-05,
"loss": 0.1816,
"step": 7320
},
{
"epoch": 2.33,
"grad_norm": 11.469459533691406,
"learning_rate": 2.2471075257403674e-05,
"loss": 0.1426,
"step": 7330
},
{
"epoch": 2.34,
"grad_norm": 5.302177906036377,
"learning_rate": 2.236492941301348e-05,
"loss": 0.2248,
"step": 7340
},
{
"epoch": 2.34,
"grad_norm": 2.6794090270996094,
"learning_rate": 2.2258783568623288e-05,
"loss": 0.157,
"step": 7350
},
{
"epoch": 2.34,
"grad_norm": 1.3895156383514404,
"learning_rate": 2.2152637724233095e-05,
"loss": 0.159,
"step": 7360
},
{
"epoch": 2.35,
"grad_norm": 0.17077626287937164,
"learning_rate": 2.2046491879842906e-05,
"loss": 0.1298,
"step": 7370
},
{
"epoch": 2.35,
"grad_norm": 0.14379891753196716,
"learning_rate": 2.1940346035452713e-05,
"loss": 0.072,
"step": 7380
},
{
"epoch": 2.35,
"grad_norm": 0.946506142616272,
"learning_rate": 2.183420019106252e-05,
"loss": 0.0414,
"step": 7390
},
{
"epoch": 2.36,
"grad_norm": 0.10742925852537155,
"learning_rate": 2.172805434667233e-05,
"loss": 0.1991,
"step": 7400
},
{
"epoch": 2.36,
"grad_norm": 4.503111362457275,
"learning_rate": 2.1621908502282138e-05,
"loss": 0.1018,
"step": 7410
},
{
"epoch": 2.36,
"grad_norm": 0.025181856006383896,
"learning_rate": 2.1515762657891945e-05,
"loss": 0.2192,
"step": 7420
},
{
"epoch": 2.37,
"grad_norm": 0.2496863454580307,
"learning_rate": 2.140961681350175e-05,
"loss": 0.1513,
"step": 7430
},
{
"epoch": 2.37,
"grad_norm": 0.18356376886367798,
"learning_rate": 2.1303470969111562e-05,
"loss": 0.0928,
"step": 7440
},
{
"epoch": 2.37,
"grad_norm": 4.700144290924072,
"learning_rate": 2.119732512472137e-05,
"loss": 0.1076,
"step": 7450
},
{
"epoch": 2.37,
"grad_norm": 1.5925829410552979,
"learning_rate": 2.1091179280331176e-05,
"loss": 0.0673,
"step": 7460
},
{
"epoch": 2.38,
"grad_norm": 0.5920007228851318,
"learning_rate": 2.0985033435940983e-05,
"loss": 0.1291,
"step": 7470
},
{
"epoch": 2.38,
"grad_norm": 5.156589508056641,
"learning_rate": 2.087888759155079e-05,
"loss": 0.3071,
"step": 7480
},
{
"epoch": 2.38,
"grad_norm": 0.03765925392508507,
"learning_rate": 2.0772741747160598e-05,
"loss": 0.1093,
"step": 7490
},
{
"epoch": 2.39,
"grad_norm": 0.4249335825443268,
"learning_rate": 2.0666595902770408e-05,
"loss": 0.1279,
"step": 7500
},
{
"epoch": 2.39,
"grad_norm": 0.016695374622941017,
"learning_rate": 2.0560450058380215e-05,
"loss": 0.2595,
"step": 7510
},
{
"epoch": 2.39,
"grad_norm": 0.8157448768615723,
"learning_rate": 2.0454304213990022e-05,
"loss": 0.2879,
"step": 7520
},
{
"epoch": 2.4,
"grad_norm": 0.43193209171295166,
"learning_rate": 2.0348158369599833e-05,
"loss": 0.1147,
"step": 7530
},
{
"epoch": 2.4,
"grad_norm": 1.1754236221313477,
"learning_rate": 2.024201252520964e-05,
"loss": 0.1228,
"step": 7540
},
{
"epoch": 2.4,
"grad_norm": 0.073044553399086,
"learning_rate": 2.0135866680819447e-05,
"loss": 0.0953,
"step": 7550
},
{
"epoch": 2.41,
"grad_norm": 6.481806755065918,
"learning_rate": 2.0029720836429254e-05,
"loss": 0.0925,
"step": 7560
},
{
"epoch": 2.41,
"grad_norm": 3.421597719192505,
"learning_rate": 1.992357499203906e-05,
"loss": 0.1844,
"step": 7570
},
{
"epoch": 2.41,
"grad_norm": 0.15194571018218994,
"learning_rate": 1.9817429147648868e-05,
"loss": 0.3675,
"step": 7580
},
{
"epoch": 2.42,
"grad_norm": 0.44171637296676636,
"learning_rate": 1.971128330325868e-05,
"loss": 0.043,
"step": 7590
},
{
"epoch": 2.42,
"grad_norm": 0.06510256975889206,
"learning_rate": 1.9605137458868486e-05,
"loss": 0.1354,
"step": 7600
},
{
"epoch": 2.42,
"grad_norm": 2.7437000274658203,
"learning_rate": 1.9498991614478293e-05,
"loss": 0.0389,
"step": 7610
},
{
"epoch": 2.43,
"grad_norm": 1.2895437479019165,
"learning_rate": 1.9392845770088103e-05,
"loss": 0.1704,
"step": 7620
},
{
"epoch": 2.43,
"grad_norm": 0.03322044759988785,
"learning_rate": 1.928669992569791e-05,
"loss": 0.1065,
"step": 7630
},
{
"epoch": 2.43,
"grad_norm": 2.8655242919921875,
"learning_rate": 1.9180554081307717e-05,
"loss": 0.1504,
"step": 7640
},
{
"epoch": 2.44,
"grad_norm": 0.2032870352268219,
"learning_rate": 1.9074408236917528e-05,
"loss": 0.0229,
"step": 7650
},
{
"epoch": 2.44,
"grad_norm": 1.7102253437042236,
"learning_rate": 1.8968262392527335e-05,
"loss": 0.1019,
"step": 7660
},
{
"epoch": 2.44,
"grad_norm": 2.740474224090576,
"learning_rate": 1.8862116548137142e-05,
"loss": 0.1235,
"step": 7670
},
{
"epoch": 2.44,
"grad_norm": 2.9120683670043945,
"learning_rate": 1.875597070374695e-05,
"loss": 0.0516,
"step": 7680
},
{
"epoch": 2.45,
"grad_norm": 0.11502601206302643,
"learning_rate": 1.8649824859356756e-05,
"loss": 0.2791,
"step": 7690
},
{
"epoch": 2.45,
"grad_norm": 0.7027528882026672,
"learning_rate": 1.8543679014966563e-05,
"loss": 0.0385,
"step": 7700
},
{
"epoch": 2.45,
"grad_norm": 2.4370245933532715,
"learning_rate": 1.8437533170576374e-05,
"loss": 0.0936,
"step": 7710
},
{
"epoch": 2.46,
"grad_norm": 6.21151876449585,
"learning_rate": 1.833138732618618e-05,
"loss": 0.0806,
"step": 7720
},
{
"epoch": 2.46,
"grad_norm": 0.052706655114889145,
"learning_rate": 1.8225241481795988e-05,
"loss": 0.1684,
"step": 7730
},
{
"epoch": 2.46,
"grad_norm": 0.24665802717208862,
"learning_rate": 1.8119095637405798e-05,
"loss": 0.1383,
"step": 7740
},
{
"epoch": 2.47,
"grad_norm": 3.6017708778381348,
"learning_rate": 1.8012949793015605e-05,
"loss": 0.1204,
"step": 7750
},
{
"epoch": 2.47,
"grad_norm": 3.1942765712738037,
"learning_rate": 1.7906803948625412e-05,
"loss": 0.0627,
"step": 7760
},
{
"epoch": 2.47,
"grad_norm": 3.020968437194824,
"learning_rate": 1.780065810423522e-05,
"loss": 0.1656,
"step": 7770
},
{
"epoch": 2.48,
"grad_norm": 0.13594868779182434,
"learning_rate": 1.7694512259845027e-05,
"loss": 0.0529,
"step": 7780
},
{
"epoch": 2.48,
"grad_norm": 0.0280010886490345,
"learning_rate": 1.7588366415454834e-05,
"loss": 0.1539,
"step": 7790
},
{
"epoch": 2.48,
"grad_norm": 8.52804946899414,
"learning_rate": 1.7482220571064644e-05,
"loss": 0.0498,
"step": 7800
},
{
"epoch": 2.49,
"grad_norm": 0.20770138502120972,
"learning_rate": 1.737607472667445e-05,
"loss": 0.0903,
"step": 7810
},
{
"epoch": 2.49,
"grad_norm": 0.06971104443073273,
"learning_rate": 1.7269928882284258e-05,
"loss": 0.2458,
"step": 7820
},
{
"epoch": 2.49,
"grad_norm": 0.022506361827254295,
"learning_rate": 1.716378303789407e-05,
"loss": 0.0741,
"step": 7830
},
{
"epoch": 2.5,
"grad_norm": 4.818386077880859,
"learning_rate": 1.7057637193503876e-05,
"loss": 0.0586,
"step": 7840
},
{
"epoch": 2.5,
"grad_norm": 0.05160210281610489,
"learning_rate": 1.6951491349113683e-05,
"loss": 0.0817,
"step": 7850
},
{
"epoch": 2.5,
"grad_norm": 0.15953780710697174,
"learning_rate": 1.6845345504723493e-05,
"loss": 0.0905,
"step": 7860
},
{
"epoch": 2.51,
"grad_norm": 0.015429453924298286,
"learning_rate": 1.67391996603333e-05,
"loss": 0.0719,
"step": 7870
},
{
"epoch": 2.51,
"grad_norm": 3.159700632095337,
"learning_rate": 1.6633053815943108e-05,
"loss": 0.048,
"step": 7880
},
{
"epoch": 2.51,
"grad_norm": 1.702974796295166,
"learning_rate": 1.6526907971552915e-05,
"loss": 0.1025,
"step": 7890
},
{
"epoch": 2.51,
"grad_norm": 0.7218146324157715,
"learning_rate": 1.6420762127162722e-05,
"loss": 0.0534,
"step": 7900
},
{
"epoch": 2.52,
"grad_norm": 4.001716136932373,
"learning_rate": 1.632523086721155e-05,
"loss": 0.1611,
"step": 7910
},
{
"epoch": 2.52,
"grad_norm": 0.6529110074043274,
"learning_rate": 1.6219085022821356e-05,
"loss": 0.1739,
"step": 7920
},
{
"epoch": 2.52,
"grad_norm": 3.3086657524108887,
"learning_rate": 1.6112939178431166e-05,
"loss": 0.1373,
"step": 7930
},
{
"epoch": 2.53,
"grad_norm": 2.368133068084717,
"learning_rate": 1.6006793334040973e-05,
"loss": 0.0841,
"step": 7940
},
{
"epoch": 2.53,
"grad_norm": 5.263741970062256,
"learning_rate": 1.590064748965078e-05,
"loss": 0.1226,
"step": 7950
},
{
"epoch": 2.53,
"grad_norm": 10.581872940063477,
"learning_rate": 1.579450164526059e-05,
"loss": 0.1063,
"step": 7960
},
{
"epoch": 2.54,
"grad_norm": 0.07476484030485153,
"learning_rate": 1.5688355800870398e-05,
"loss": 0.3208,
"step": 7970
},
{
"epoch": 2.54,
"grad_norm": 0.976747453212738,
"learning_rate": 1.5582209956480205e-05,
"loss": 0.2336,
"step": 7980
},
{
"epoch": 2.54,
"grad_norm": 0.2981054186820984,
"learning_rate": 1.5476064112090012e-05,
"loss": 0.0603,
"step": 7990
},
{
"epoch": 2.55,
"grad_norm": 0.032338302582502365,
"learning_rate": 1.536991826769982e-05,
"loss": 0.0123,
"step": 8000
},
{
"epoch": 2.55,
"grad_norm": 7.625821590423584,
"learning_rate": 1.5263772423309626e-05,
"loss": 0.1735,
"step": 8010
},
{
"epoch": 2.55,
"grad_norm": 4.120946407318115,
"learning_rate": 1.5157626578919436e-05,
"loss": 0.1199,
"step": 8020
},
{
"epoch": 2.56,
"grad_norm": 0.04417848959565163,
"learning_rate": 1.5051480734529244e-05,
"loss": 0.0629,
"step": 8030
},
{
"epoch": 2.56,
"grad_norm": 3.9831886291503906,
"learning_rate": 1.494533489013905e-05,
"loss": 0.1507,
"step": 8040
},
{
"epoch": 2.56,
"grad_norm": 0.2706742286682129,
"learning_rate": 1.4839189045748861e-05,
"loss": 0.1195,
"step": 8050
},
{
"epoch": 2.57,
"grad_norm": 0.045659586787223816,
"learning_rate": 1.4733043201358668e-05,
"loss": 0.0875,
"step": 8060
},
{
"epoch": 2.57,
"grad_norm": 2.9574756622314453,
"learning_rate": 1.4626897356968475e-05,
"loss": 0.0828,
"step": 8070
},
{
"epoch": 2.57,
"grad_norm": 11.923121452331543,
"learning_rate": 1.4520751512578284e-05,
"loss": 0.1937,
"step": 8080
},
{
"epoch": 2.58,
"grad_norm": 0.8571139574050903,
"learning_rate": 1.4414605668188091e-05,
"loss": 0.1385,
"step": 8090
},
{
"epoch": 2.58,
"grad_norm": 11.532151222229004,
"learning_rate": 1.4308459823797898e-05,
"loss": 0.1644,
"step": 8100
},
{
"epoch": 2.58,
"grad_norm": 0.02608746476471424,
"learning_rate": 1.4202313979407709e-05,
"loss": 0.0845,
"step": 8110
},
{
"epoch": 2.58,
"grad_norm": 0.3875482976436615,
"learning_rate": 1.4096168135017516e-05,
"loss": 0.1526,
"step": 8120
},
{
"epoch": 2.59,
"grad_norm": 0.46190938353538513,
"learning_rate": 1.3990022290627323e-05,
"loss": 0.0656,
"step": 8130
},
{
"epoch": 2.59,
"grad_norm": 0.06178577244281769,
"learning_rate": 1.3883876446237132e-05,
"loss": 0.0245,
"step": 8140
},
{
"epoch": 2.59,
"grad_norm": 0.41626548767089844,
"learning_rate": 1.3777730601846939e-05,
"loss": 0.1448,
"step": 8150
},
{
"epoch": 2.6,
"grad_norm": 0.8394218683242798,
"learning_rate": 1.3671584757456746e-05,
"loss": 0.1566,
"step": 8160
},
{
"epoch": 2.6,
"grad_norm": 0.030064724385738373,
"learning_rate": 1.3565438913066556e-05,
"loss": 0.1614,
"step": 8170
},
{
"epoch": 2.6,
"grad_norm": 0.7408326864242554,
"learning_rate": 1.3459293068676362e-05,
"loss": 0.0493,
"step": 8180
},
{
"epoch": 2.61,
"grad_norm": 6.210927486419678,
"learning_rate": 1.3353147224286169e-05,
"loss": 0.173,
"step": 8190
},
{
"epoch": 2.61,
"grad_norm": 0.3989274501800537,
"learning_rate": 1.3247001379895976e-05,
"loss": 0.2242,
"step": 8200
},
{
"epoch": 2.61,
"grad_norm": 0.21221469342708588,
"learning_rate": 1.3140855535505786e-05,
"loss": 0.1021,
"step": 8210
},
{
"epoch": 2.62,
"grad_norm": 0.018684396520256996,
"learning_rate": 1.3034709691115593e-05,
"loss": 0.1168,
"step": 8220
},
{
"epoch": 2.62,
"grad_norm": 4.258501052856445,
"learning_rate": 1.29285638467254e-05,
"loss": 0.101,
"step": 8230
},
{
"epoch": 2.62,
"grad_norm": 0.18293698132038116,
"learning_rate": 1.282241800233521e-05,
"loss": 0.0888,
"step": 8240
},
{
"epoch": 2.63,
"grad_norm": 5.2559685707092285,
"learning_rate": 1.2716272157945016e-05,
"loss": 0.1593,
"step": 8250
},
{
"epoch": 2.63,
"grad_norm": 0.714055597782135,
"learning_rate": 1.2610126313554823e-05,
"loss": 0.0548,
"step": 8260
},
{
"epoch": 2.63,
"grad_norm": 5.772704124450684,
"learning_rate": 1.2503980469164634e-05,
"loss": 0.1758,
"step": 8270
},
{
"epoch": 2.64,
"grad_norm": 0.15256932377815247,
"learning_rate": 1.2397834624774441e-05,
"loss": 0.1546,
"step": 8280
},
{
"epoch": 2.64,
"grad_norm": 0.17343765497207642,
"learning_rate": 1.2291688780384248e-05,
"loss": 0.0422,
"step": 8290
},
{
"epoch": 2.64,
"grad_norm": 5.067286491394043,
"learning_rate": 1.2185542935994057e-05,
"loss": 0.0384,
"step": 8300
},
{
"epoch": 2.65,
"grad_norm": 2.087721109390259,
"learning_rate": 1.2079397091603864e-05,
"loss": 0.1132,
"step": 8310
},
{
"epoch": 2.65,
"grad_norm": 6.7488017082214355,
"learning_rate": 1.1973251247213673e-05,
"loss": 0.0729,
"step": 8320
},
{
"epoch": 2.65,
"grad_norm": 10.669734954833984,
"learning_rate": 1.1867105402823481e-05,
"loss": 0.0677,
"step": 8330
},
{
"epoch": 2.65,
"grad_norm": 4.734282970428467,
"learning_rate": 1.1760959558433288e-05,
"loss": 0.1741,
"step": 8340
},
{
"epoch": 2.66,
"grad_norm": 1.1807498931884766,
"learning_rate": 1.1665428298482115e-05,
"loss": 0.1219,
"step": 8350
},
{
"epoch": 2.66,
"grad_norm": 0.1118198037147522,
"learning_rate": 1.1559282454091922e-05,
"loss": 0.0724,
"step": 8360
},
{
"epoch": 2.66,
"grad_norm": 8.563444137573242,
"learning_rate": 1.1453136609701731e-05,
"loss": 0.1453,
"step": 8370
},
{
"epoch": 2.67,
"grad_norm": 1.4987778663635254,
"learning_rate": 1.1346990765311538e-05,
"loss": 0.1087,
"step": 8380
},
{
"epoch": 2.67,
"grad_norm": 6.070169448852539,
"learning_rate": 1.1240844920921347e-05,
"loss": 0.1372,
"step": 8390
},
{
"epoch": 2.67,
"grad_norm": 3.5378408432006836,
"learning_rate": 1.1134699076531156e-05,
"loss": 0.1421,
"step": 8400
},
{
"epoch": 2.68,
"grad_norm": 0.18879607319831848,
"learning_rate": 1.1028553232140961e-05,
"loss": 0.0617,
"step": 8410
},
{
"epoch": 2.68,
"grad_norm": 3.873791217803955,
"learning_rate": 1.092240738775077e-05,
"loss": 0.1256,
"step": 8420
},
{
"epoch": 2.68,
"grad_norm": 3.0632710456848145,
"learning_rate": 1.0816261543360579e-05,
"loss": 0.1084,
"step": 8430
},
{
"epoch": 2.69,
"grad_norm": 0.044198133051395416,
"learning_rate": 1.0710115698970386e-05,
"loss": 0.0972,
"step": 8440
},
{
"epoch": 2.69,
"grad_norm": 0.06533059477806091,
"learning_rate": 1.0603969854580194e-05,
"loss": 0.0659,
"step": 8450
},
{
"epoch": 2.69,
"grad_norm": 0.024154966697096825,
"learning_rate": 1.0497824010190002e-05,
"loss": 0.2245,
"step": 8460
},
{
"epoch": 2.7,
"grad_norm": 0.06551453471183777,
"learning_rate": 1.0391678165799809e-05,
"loss": 0.0679,
"step": 8470
},
{
"epoch": 2.7,
"grad_norm": 2.244358777999878,
"learning_rate": 1.0285532321409617e-05,
"loss": 0.1138,
"step": 8480
},
{
"epoch": 2.7,
"grad_norm": 1.3429971933364868,
"learning_rate": 1.0179386477019426e-05,
"loss": 0.1286,
"step": 8490
},
{
"epoch": 2.71,
"grad_norm": 13.364596366882324,
"learning_rate": 1.0073240632629233e-05,
"loss": 0.1304,
"step": 8500
},
{
"epoch": 2.71,
"grad_norm": 1.5777560472488403,
"learning_rate": 9.96709478823904e-06,
"loss": 0.0324,
"step": 8510
},
{
"epoch": 2.71,
"grad_norm": 3.5468719005584717,
"learning_rate": 9.860948943848847e-06,
"loss": 0.1142,
"step": 8520
},
{
"epoch": 2.72,
"grad_norm": 9.198564529418945,
"learning_rate": 9.754803099458656e-06,
"loss": 0.1208,
"step": 8530
},
{
"epoch": 2.72,
"grad_norm": 0.10464298725128174,
"learning_rate": 9.648657255068465e-06,
"loss": 0.062,
"step": 8540
},
{
"epoch": 2.72,
"grad_norm": 7.4889702796936035,
"learning_rate": 9.542511410678272e-06,
"loss": 0.1081,
"step": 8550
},
{
"epoch": 2.72,
"grad_norm": 4.211546897888184,
"learning_rate": 9.43636556628808e-06,
"loss": 0.122,
"step": 8560
},
{
"epoch": 2.73,
"grad_norm": 5.125463008880615,
"learning_rate": 9.330219721897888e-06,
"loss": 0.2547,
"step": 8570
},
{
"epoch": 2.73,
"grad_norm": 0.17111606895923615,
"learning_rate": 9.224073877507695e-06,
"loss": 0.0792,
"step": 8580
},
{
"epoch": 2.73,
"grad_norm": 0.17677658796310425,
"learning_rate": 9.117928033117504e-06,
"loss": 0.2517,
"step": 8590
},
{
"epoch": 2.74,
"grad_norm": 0.88303542137146,
"learning_rate": 9.011782188727312e-06,
"loss": 0.1207,
"step": 8600
},
{
"epoch": 2.74,
"grad_norm": 0.934140682220459,
"learning_rate": 8.90563634433712e-06,
"loss": 0.0874,
"step": 8610
},
{
"epoch": 2.74,
"grad_norm": 0.1124495416879654,
"learning_rate": 8.799490499946927e-06,
"loss": 0.2207,
"step": 8620
},
{
"epoch": 2.75,
"grad_norm": 1.9301073551177979,
"learning_rate": 8.693344655556735e-06,
"loss": 0.1351,
"step": 8630
},
{
"epoch": 2.75,
"grad_norm": 0.42326900362968445,
"learning_rate": 8.587198811166543e-06,
"loss": 0.1563,
"step": 8640
},
{
"epoch": 2.75,
"grad_norm": 0.01322962436825037,
"learning_rate": 8.481052966776351e-06,
"loss": 0.0387,
"step": 8650
},
{
"epoch": 2.76,
"grad_norm": 3.7665517330169678,
"learning_rate": 8.37490712238616e-06,
"loss": 0.2157,
"step": 8660
},
{
"epoch": 2.76,
"grad_norm": 0.2205476611852646,
"learning_rate": 8.268761277995967e-06,
"loss": 0.0491,
"step": 8670
},
{
"epoch": 2.76,
"grad_norm": 0.10910103470087051,
"learning_rate": 8.162615433605774e-06,
"loss": 0.0924,
"step": 8680
},
{
"epoch": 2.77,
"grad_norm": 0.030913598835468292,
"learning_rate": 8.056469589215583e-06,
"loss": 0.0553,
"step": 8690
},
{
"epoch": 2.77,
"grad_norm": 0.08986567705869675,
"learning_rate": 7.95032374482539e-06,
"loss": 0.0613,
"step": 8700
},
{
"epoch": 2.77,
"grad_norm": 0.21952463686466217,
"learning_rate": 7.844177900435199e-06,
"loss": 0.0898,
"step": 8710
},
{
"epoch": 2.78,
"grad_norm": 0.6068935990333557,
"learning_rate": 7.738032056045006e-06,
"loss": 0.043,
"step": 8720
},
{
"epoch": 2.78,
"grad_norm": 0.03201749920845032,
"learning_rate": 7.631886211654813e-06,
"loss": 0.1957,
"step": 8730
},
{
"epoch": 2.78,
"grad_norm": 3.205738067626953,
"learning_rate": 7.525740367264622e-06,
"loss": 0.1425,
"step": 8740
},
{
"epoch": 2.79,
"grad_norm": 3.265514612197876,
"learning_rate": 7.4195945228744306e-06,
"loss": 0.1652,
"step": 8750
},
{
"epoch": 2.79,
"grad_norm": 0.11868763715028763,
"learning_rate": 7.313448678484238e-06,
"loss": 0.0193,
"step": 8760
},
{
"epoch": 2.79,
"grad_norm": 0.03614291548728943,
"learning_rate": 7.2073028340940456e-06,
"loss": 0.1368,
"step": 8770
},
{
"epoch": 2.79,
"grad_norm": 2.512045383453369,
"learning_rate": 7.101156989703854e-06,
"loss": 0.0949,
"step": 8780
},
{
"epoch": 2.8,
"grad_norm": 5.77540922164917,
"learning_rate": 6.995011145313661e-06,
"loss": 0.1639,
"step": 8790
},
{
"epoch": 2.8,
"grad_norm": 7.473822116851807,
"learning_rate": 6.888865300923469e-06,
"loss": 0.1023,
"step": 8800
},
{
"epoch": 2.8,
"grad_norm": 0.0789722427725792,
"learning_rate": 6.782719456533278e-06,
"loss": 0.0627,
"step": 8810
},
{
"epoch": 2.81,
"grad_norm": 2.9245636463165283,
"learning_rate": 6.676573612143084e-06,
"loss": 0.1771,
"step": 8820
},
{
"epoch": 2.81,
"grad_norm": 2.1707448959350586,
"learning_rate": 6.570427767752893e-06,
"loss": 0.0423,
"step": 8830
},
{
"epoch": 2.81,
"grad_norm": 4.990893363952637,
"learning_rate": 6.4642819233627e-06,
"loss": 0.073,
"step": 8840
},
{
"epoch": 2.82,
"grad_norm": 6.3620452880859375,
"learning_rate": 6.358136078972508e-06,
"loss": 0.0627,
"step": 8850
},
{
"epoch": 2.82,
"grad_norm": 0.09669307619333267,
"learning_rate": 6.251990234582317e-06,
"loss": 0.0879,
"step": 8860
},
{
"epoch": 2.82,
"grad_norm": 5.8794779777526855,
"learning_rate": 6.145844390192124e-06,
"loss": 0.1667,
"step": 8870
},
{
"epoch": 2.83,
"grad_norm": 0.0750487744808197,
"learning_rate": 6.039698545801932e-06,
"loss": 0.1538,
"step": 8880
},
{
"epoch": 2.83,
"grad_norm": 4.174580097198486,
"learning_rate": 5.933552701411741e-06,
"loss": 0.1782,
"step": 8890
},
{
"epoch": 2.83,
"grad_norm": 2.7931034564971924,
"learning_rate": 5.827406857021548e-06,
"loss": 0.1047,
"step": 8900
},
{
"epoch": 2.84,
"grad_norm": 0.11179756373167038,
"learning_rate": 5.721261012631356e-06,
"loss": 0.0648,
"step": 8910
},
{
"epoch": 2.84,
"grad_norm": 0.25602421164512634,
"learning_rate": 5.615115168241164e-06,
"loss": 0.1657,
"step": 8920
},
{
"epoch": 2.84,
"grad_norm": 0.030272111296653748,
"learning_rate": 5.5089693238509715e-06,
"loss": 0.1344,
"step": 8930
},
{
"epoch": 2.85,
"grad_norm": 1.8802919387817383,
"learning_rate": 5.4028234794607795e-06,
"loss": 0.1284,
"step": 8940
},
{
"epoch": 2.85,
"grad_norm": 0.9859854578971863,
"learning_rate": 5.296677635070587e-06,
"loss": 0.0504,
"step": 8950
},
{
"epoch": 2.85,
"grad_norm": 0.5083135962486267,
"learning_rate": 5.190531790680395e-06,
"loss": 0.0193,
"step": 8960
},
{
"epoch": 2.86,
"grad_norm": 3.466031789779663,
"learning_rate": 5.084385946290203e-06,
"loss": 0.0459,
"step": 8970
},
{
"epoch": 2.86,
"grad_norm": 8.049098014831543,
"learning_rate": 4.97824010190001e-06,
"loss": 0.1025,
"step": 8980
},
{
"epoch": 2.86,
"grad_norm": 5.528136730194092,
"learning_rate": 4.872094257509819e-06,
"loss": 0.109,
"step": 8990
},
{
"epoch": 2.86,
"grad_norm": 0.02654377557337284,
"learning_rate": 4.765948413119627e-06,
"loss": 0.1655,
"step": 9000
}
],
"logging_steps": 10,
"max_steps": 9423,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.0453875280157082e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}