{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05429397472615476,
"eval_steps": 2000,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 9.048995787692461e-05,
"grad_norm": 1.1874778270721436,
"learning_rate": 2.7146864537145957e-07,
"loss": 10.3312,
"step": 20
},
{
"epoch": 0.00018097991575384922,
"grad_norm": 1.3932149410247803,
"learning_rate": 5.429372907429191e-07,
"loss": 10.3266,
"step": 40
},
{
"epoch": 0.00027146987363077383,
"grad_norm": 1.2732529640197754,
"learning_rate": 8.144059361143787e-07,
"loss": 10.3163,
"step": 60
},
{
"epoch": 0.00036195983150769844,
"grad_norm": 1.07429039478302,
"learning_rate": 1.0858745814858383e-06,
"loss": 10.3044,
"step": 80
},
{
"epoch": 0.00045244978938462305,
"grad_norm": 1.0309141874313354,
"learning_rate": 1.357343226857298e-06,
"loss": 10.2959,
"step": 100
},
{
"epoch": 0.0005429397472615477,
"grad_norm": 0.9270058870315552,
"learning_rate": 1.6288118722287574e-06,
"loss": 10.2818,
"step": 120
},
{
"epoch": 0.0006334297051384723,
"grad_norm": 0.8409116864204407,
"learning_rate": 1.900280517600217e-06,
"loss": 10.2724,
"step": 140
},
{
"epoch": 0.0007239196630153969,
"grad_norm": 0.7587267160415649,
"learning_rate": 2.1717491629716765e-06,
"loss": 10.2662,
"step": 160
},
{
"epoch": 0.0008144096208923215,
"grad_norm": 0.8605366945266724,
"learning_rate": 2.4432178083431364e-06,
"loss": 10.2567,
"step": 180
},
{
"epoch": 0.0009048995787692461,
"grad_norm": 0.8124440908432007,
"learning_rate": 2.714686453714596e-06,
"loss": 10.2513,
"step": 200
},
{
"epoch": 0.0009953895366461706,
"grad_norm": 0.8214222192764282,
"learning_rate": 2.9861550990860553e-06,
"loss": 10.2396,
"step": 220
},
{
"epoch": 0.0010858794945230953,
"grad_norm": 0.7500312924385071,
"learning_rate": 3.2576237444575148e-06,
"loss": 10.2378,
"step": 240
},
{
"epoch": 0.0011763694524000198,
"grad_norm": 0.7709519267082214,
"learning_rate": 3.529092389828975e-06,
"loss": 10.2287,
"step": 260
},
{
"epoch": 0.0012668594102769445,
"grad_norm": 0.8319140672683716,
"learning_rate": 3.800561035200434e-06,
"loss": 10.2214,
"step": 280
},
{
"epoch": 0.001357349368153869,
"grad_norm": 0.8057898283004761,
"learning_rate": 4.072029680571894e-06,
"loss": 10.2072,
"step": 300
},
{
"epoch": 0.0014478393260307938,
"grad_norm": 0.6834843754768372,
"learning_rate": 4.343498325943353e-06,
"loss": 10.1983,
"step": 320
},
{
"epoch": 0.0015383292839077183,
"grad_norm": 0.8223700523376465,
"learning_rate": 4.614966971314813e-06,
"loss": 10.1884,
"step": 340
},
{
"epoch": 0.001628819241784643,
"grad_norm": 0.8147690892219543,
"learning_rate": 4.886435616686273e-06,
"loss": 10.1814,
"step": 360
},
{
"epoch": 0.0017193091996615675,
"grad_norm": 0.8512526750564575,
"learning_rate": 5.157904262057733e-06,
"loss": 10.1713,
"step": 380
},
{
"epoch": 0.0018097991575384922,
"grad_norm": 0.8844230771064758,
"learning_rate": 5.429372907429192e-06,
"loss": 10.1572,
"step": 400
},
{
"epoch": 0.0019002891154154167,
"grad_norm": 0.9605993628501892,
"learning_rate": 5.700841552800652e-06,
"loss": 10.1496,
"step": 420
},
{
"epoch": 0.001990779073292341,
"grad_norm": 1.2027961015701294,
"learning_rate": 5.972310198172111e-06,
"loss": 10.1298,
"step": 440
},
{
"epoch": 0.002081269031169266,
"grad_norm": 1.4069308042526245,
"learning_rate": 6.2437788435435705e-06,
"loss": 10.1092,
"step": 460
},
{
"epoch": 0.0021717589890461906,
"grad_norm": 1.7658456563949585,
"learning_rate": 6.5152474889150296e-06,
"loss": 10.0954,
"step": 480
},
{
"epoch": 0.002262248946923115,
"grad_norm": 1.6941689252853394,
"learning_rate": 6.7867161342864895e-06,
"loss": 10.0746,
"step": 500
},
{
"epoch": 0.0023527389048000396,
"grad_norm": 2.362786293029785,
"learning_rate": 7.05818477965795e-06,
"loss": 10.0613,
"step": 520
},
{
"epoch": 0.0024432288626769646,
"grad_norm": 1.827091932296753,
"learning_rate": 7.329653425029408e-06,
"loss": 10.045,
"step": 540
},
{
"epoch": 0.002533718820553889,
"grad_norm": 2.136615753173828,
"learning_rate": 7.601122070400868e-06,
"loss": 10.0243,
"step": 560
},
{
"epoch": 0.0026242087784308136,
"grad_norm": 2.501790761947632,
"learning_rate": 7.872590715772328e-06,
"loss": 10.0091,
"step": 580
},
{
"epoch": 0.002714698736307738,
"grad_norm": 2.7978005409240723,
"learning_rate": 8.144059361143788e-06,
"loss": 9.9957,
"step": 600
},
{
"epoch": 0.002805188694184663,
"grad_norm": 3.0485517978668213,
"learning_rate": 8.415528006515246e-06,
"loss": 9.9819,
"step": 620
},
{
"epoch": 0.0028956786520615875,
"grad_norm": 2.761986255645752,
"learning_rate": 8.686996651886706e-06,
"loss": 9.9596,
"step": 640
},
{
"epoch": 0.002986168609938512,
"grad_norm": 3.0985260009765625,
"learning_rate": 8.958465297258166e-06,
"loss": 9.9436,
"step": 660
},
{
"epoch": 0.0030766585678154365,
"grad_norm": 2.40391206741333,
"learning_rate": 9.229933942629626e-06,
"loss": 9.9226,
"step": 680
},
{
"epoch": 0.0031671485256923614,
"grad_norm": 1.933740496635437,
"learning_rate": 9.501402588001086e-06,
"loss": 9.9069,
"step": 700
},
{
"epoch": 0.003257638483569286,
"grad_norm": 2.518874168395996,
"learning_rate": 9.772871233372546e-06,
"loss": 9.881,
"step": 720
},
{
"epoch": 0.0033481284414462104,
"grad_norm": 2.8025624752044678,
"learning_rate": 1.0044339878744006e-05,
"loss": 9.8686,
"step": 740
},
{
"epoch": 0.003438618399323135,
"grad_norm": 1.943656086921692,
"learning_rate": 1.0315808524115465e-05,
"loss": 9.8463,
"step": 760
},
{
"epoch": 0.00352910835720006,
"grad_norm": 1.753179907798767,
"learning_rate": 1.0587277169486925e-05,
"loss": 9.8344,
"step": 780
},
{
"epoch": 0.0036195983150769844,
"grad_norm": 1.9388506412506104,
"learning_rate": 1.0858745814858383e-05,
"loss": 9.8144,
"step": 800
},
{
"epoch": 0.003710088272953909,
"grad_norm": 2.6278536319732666,
"learning_rate": 1.1130214460229843e-05,
"loss": 9.8008,
"step": 820
},
{
"epoch": 0.0038005782308308334,
"grad_norm": 1.8270655870437622,
"learning_rate": 1.1401683105601303e-05,
"loss": 9.7791,
"step": 840
},
{
"epoch": 0.0038910681887077583,
"grad_norm": 1.656563639640808,
"learning_rate": 1.1673151750972763e-05,
"loss": 9.7677,
"step": 860
},
{
"epoch": 0.003981558146584682,
"grad_norm": 1.6003910303115845,
"learning_rate": 1.1944620396344221e-05,
"loss": 9.7465,
"step": 880
},
{
"epoch": 0.004072048104461608,
"grad_norm": 1.5632762908935547,
"learning_rate": 1.2216089041715681e-05,
"loss": 9.73,
"step": 900
},
{
"epoch": 0.004162538062338532,
"grad_norm": 1.4974184036254883,
"learning_rate": 1.2487557687087141e-05,
"loss": 9.7067,
"step": 920
},
{
"epoch": 0.004253028020215457,
"grad_norm": 1.811112880706787,
"learning_rate": 1.2759026332458601e-05,
"loss": 9.6956,
"step": 940
},
{
"epoch": 0.004343517978092381,
"grad_norm": 1.505334734916687,
"learning_rate": 1.3030494977830059e-05,
"loss": 9.6667,
"step": 960
},
{
"epoch": 0.004434007935969306,
"grad_norm": 1.6951265335083008,
"learning_rate": 1.3301963623201519e-05,
"loss": 9.6505,
"step": 980
},
{
"epoch": 0.00452449789384623,
"grad_norm": 1.6119604110717773,
"learning_rate": 1.3573432268572979e-05,
"loss": 9.6381,
"step": 1000
},
{
"epoch": 0.004614987851723155,
"grad_norm": 1.1929903030395508,
"learning_rate": 1.3844900913944439e-05,
"loss": 9.6209,
"step": 1020
},
{
"epoch": 0.004705477809600079,
"grad_norm": 1.5701353549957275,
"learning_rate": 1.41163695593159e-05,
"loss": 9.5956,
"step": 1040
},
{
"epoch": 0.004795967767477005,
"grad_norm": 1.32628333568573,
"learning_rate": 1.4387838204687359e-05,
"loss": 9.5899,
"step": 1060
},
{
"epoch": 0.004886457725353929,
"grad_norm": 1.5850657224655151,
"learning_rate": 1.4659306850058817e-05,
"loss": 9.5779,
"step": 1080
},
{
"epoch": 0.004976947683230854,
"grad_norm": 1.3933109045028687,
"learning_rate": 1.4930775495430278e-05,
"loss": 9.5701,
"step": 1100
},
{
"epoch": 0.005067437641107778,
"grad_norm": 1.258367657661438,
"learning_rate": 1.5202244140801737e-05,
"loss": 9.5468,
"step": 1120
},
{
"epoch": 0.005157927598984703,
"grad_norm": 1.3926512002944946,
"learning_rate": 1.5473712786173196e-05,
"loss": 9.5392,
"step": 1140
},
{
"epoch": 0.005248417556861627,
"grad_norm": 1.1674704551696777,
"learning_rate": 1.5745181431544656e-05,
"loss": 9.5291,
"step": 1160
},
{
"epoch": 0.005338907514738552,
"grad_norm": 1.4704829454421997,
"learning_rate": 1.6016650076916116e-05,
"loss": 9.5219,
"step": 1180
},
{
"epoch": 0.005429397472615476,
"grad_norm": 1.6223082542419434,
"learning_rate": 1.6288118722287576e-05,
"loss": 9.4918,
"step": 1200
},
{
"epoch": 0.0055198874304924015,
"grad_norm": 1.8586570024490356,
"learning_rate": 1.6559587367659036e-05,
"loss": 9.4895,
"step": 1220
},
{
"epoch": 0.005610377388369326,
"grad_norm": 1.4105405807495117,
"learning_rate": 1.6831056013030492e-05,
"loss": 9.4886,
"step": 1240
},
{
"epoch": 0.0057008673462462505,
"grad_norm": 1.4756163358688354,
"learning_rate": 1.7102524658401956e-05,
"loss": 9.4702,
"step": 1260
},
{
"epoch": 0.005791357304123175,
"grad_norm": 1.3847874402999878,
"learning_rate": 1.7373993303773412e-05,
"loss": 9.4638,
"step": 1280
},
{
"epoch": 0.0058818472620000995,
"grad_norm": 1.5135865211486816,
"learning_rate": 1.7645461949144875e-05,
"loss": 9.4583,
"step": 1300
},
{
"epoch": 0.005972337219877024,
"grad_norm": 1.462760329246521,
"learning_rate": 1.7916930594516332e-05,
"loss": 9.4292,
"step": 1320
},
{
"epoch": 0.0060628271777539485,
"grad_norm": 1.646760106086731,
"learning_rate": 1.8188399239887792e-05,
"loss": 9.4419,
"step": 1340
},
{
"epoch": 0.006153317135630873,
"grad_norm": 1.3564046621322632,
"learning_rate": 1.8459867885259252e-05,
"loss": 9.4283,
"step": 1360
},
{
"epoch": 0.006243807093507798,
"grad_norm": 1.4385489225387573,
"learning_rate": 1.873133653063071e-05,
"loss": 9.4208,
"step": 1380
},
{
"epoch": 0.006334297051384723,
"grad_norm": 1.3975261449813843,
"learning_rate": 1.900280517600217e-05,
"loss": 9.4015,
"step": 1400
},
{
"epoch": 0.006424787009261647,
"grad_norm": 1.4809174537658691,
"learning_rate": 1.927427382137363e-05,
"loss": 9.4009,
"step": 1420
},
{
"epoch": 0.006515276967138572,
"grad_norm": 1.5181605815887451,
"learning_rate": 1.954574246674509e-05,
"loss": 9.3969,
"step": 1440
},
{
"epoch": 0.006605766925015496,
"grad_norm": 1.4760838747024536,
"learning_rate": 1.981721111211655e-05,
"loss": 9.395,
"step": 1460
},
{
"epoch": 0.006696256882892421,
"grad_norm": 1.6140539646148682,
"learning_rate": 2.008867975748801e-05,
"loss": 9.3868,
"step": 1480
},
{
"epoch": 0.006786746840769345,
"grad_norm": 1.469307541847229,
"learning_rate": 2.0360148402859468e-05,
"loss": 9.3766,
"step": 1500
},
{
"epoch": 0.00687723679864627,
"grad_norm": 1.8742159605026245,
"learning_rate": 2.063161704823093e-05,
"loss": 9.3715,
"step": 1520
},
{
"epoch": 0.006967726756523195,
"grad_norm": 1.5996043682098389,
"learning_rate": 2.0903085693602387e-05,
"loss": 9.3622,
"step": 1540
},
{
"epoch": 0.00705821671440012,
"grad_norm": 1.867632508277893,
"learning_rate": 2.117455433897385e-05,
"loss": 9.3704,
"step": 1560
},
{
"epoch": 0.007148706672277044,
"grad_norm": 1.4762872457504272,
"learning_rate": 2.1446022984345307e-05,
"loss": 9.3741,
"step": 1580
},
{
"epoch": 0.007239196630153969,
"grad_norm": 1.5752198696136475,
"learning_rate": 2.1717491629716767e-05,
"loss": 9.3561,
"step": 1600
},
{
"epoch": 0.007329686588030893,
"grad_norm": 1.637786865234375,
"learning_rate": 2.1988960275088227e-05,
"loss": 9.3535,
"step": 1620
},
{
"epoch": 0.007420176545907818,
"grad_norm": 2.6087028980255127,
"learning_rate": 2.2260428920459687e-05,
"loss": 9.3541,
"step": 1640
},
{
"epoch": 0.007510666503784742,
"grad_norm": 1.977252721786499,
"learning_rate": 2.2531897565831143e-05,
"loss": 9.3341,
"step": 1660
},
{
"epoch": 0.007601156461661667,
"grad_norm": 1.9511388540267944,
"learning_rate": 2.2803366211202606e-05,
"loss": 9.339,
"step": 1680
},
{
"epoch": 0.007691646419538592,
"grad_norm": 1.8821523189544678,
"learning_rate": 2.3074834856574063e-05,
"loss": 9.3234,
"step": 1700
},
{
"epoch": 0.007782136377415517,
"grad_norm": 1.5517367124557495,
"learning_rate": 2.3346303501945526e-05,
"loss": 9.3367,
"step": 1720
},
{
"epoch": 0.00787262633529244,
"grad_norm": 2.164625883102417,
"learning_rate": 2.3617772147316983e-05,
"loss": 9.3366,
"step": 1740
},
{
"epoch": 0.007963116293169365,
"grad_norm": 2.4158406257629395,
"learning_rate": 2.3889240792688443e-05,
"loss": 9.3221,
"step": 1760
},
{
"epoch": 0.00805360625104629,
"grad_norm": 1.8652360439300537,
"learning_rate": 2.4160709438059906e-05,
"loss": 9.3098,
"step": 1780
},
{
"epoch": 0.008144096208923216,
"grad_norm": 1.8249917030334473,
"learning_rate": 2.4432178083431362e-05,
"loss": 9.3094,
"step": 1800
},
{
"epoch": 0.00823458616680014,
"grad_norm": 2.06990647315979,
"learning_rate": 2.4703646728802822e-05,
"loss": 9.2994,
"step": 1820
},
{
"epoch": 0.008325076124677065,
"grad_norm": 2.461805582046509,
"learning_rate": 2.4975115374174282e-05,
"loss": 9.3157,
"step": 1840
},
{
"epoch": 0.008415566082553989,
"grad_norm": 2.1320767402648926,
"learning_rate": 2.5246584019545742e-05,
"loss": 9.281,
"step": 1860
},
{
"epoch": 0.008506056040430914,
"grad_norm": 2.6872756481170654,
"learning_rate": 2.5518052664917202e-05,
"loss": 9.2917,
"step": 1880
},
{
"epoch": 0.008596545998307838,
"grad_norm": 2.4759294986724854,
"learning_rate": 2.5789521310288662e-05,
"loss": 9.2941,
"step": 1900
},
{
"epoch": 0.008687035956184763,
"grad_norm": 1.8129667043685913,
"learning_rate": 2.6060989955660118e-05,
"loss": 9.2815,
"step": 1920
},
{
"epoch": 0.008777525914061687,
"grad_norm": 2.9053220748901367,
"learning_rate": 2.633245860103158e-05,
"loss": 9.2801,
"step": 1940
},
{
"epoch": 0.008868015871938612,
"grad_norm": 2.412623167037964,
"learning_rate": 2.6603927246403038e-05,
"loss": 9.2719,
"step": 1960
},
{
"epoch": 0.008958505829815536,
"grad_norm": 1.972790002822876,
"learning_rate": 2.6875395891774498e-05,
"loss": 9.2729,
"step": 1980
},
{
"epoch": 0.00904899578769246,
"grad_norm": 3.04768705368042,
"learning_rate": 2.7146864537145958e-05,
"loss": 9.2653,
"step": 2000
},
{
"epoch": 0.00904899578769246,
"eval_accuracy": 0.10545615706904701,
"eval_loss": 9.261013984680176,
"eval_runtime": 215.2628,
"eval_samples_per_second": 2823.711,
"eval_steps_per_second": 11.033,
"step": 2000
},
{
"epoch": 0.009139485745569385,
"grad_norm": 2.2706515789031982,
"learning_rate": 2.7418333182517418e-05,
"loss": 9.2604,
"step": 2020
},
{
"epoch": 0.00922997570344631,
"grad_norm": 2.297621011734009,
"learning_rate": 2.7689801827888878e-05,
"loss": 9.2367,
"step": 2040
},
{
"epoch": 0.009320465661323234,
"grad_norm": 2.049971342086792,
"learning_rate": 2.7961270473260337e-05,
"loss": 9.2545,
"step": 2060
},
{
"epoch": 0.009410955619200159,
"grad_norm": 2.3538951873779297,
"learning_rate": 2.82327391186318e-05,
"loss": 9.2511,
"step": 2080
},
{
"epoch": 0.009501445577077083,
"grad_norm": 3.1383931636810303,
"learning_rate": 2.8504207764003254e-05,
"loss": 9.2319,
"step": 2100
},
{
"epoch": 0.00959193553495401,
"grad_norm": 2.6480958461761475,
"learning_rate": 2.8775676409374717e-05,
"loss": 9.2353,
"step": 2120
},
{
"epoch": 0.009682425492830934,
"grad_norm": 2.3209128379821777,
"learning_rate": 2.9047145054746177e-05,
"loss": 9.241,
"step": 2140
},
{
"epoch": 0.009772915450707858,
"grad_norm": 2.3225491046905518,
"learning_rate": 2.9318613700117634e-05,
"loss": 9.2133,
"step": 2160
},
{
"epoch": 0.009863405408584783,
"grad_norm": 2.0134568214416504,
"learning_rate": 2.9590082345489093e-05,
"loss": 9.2188,
"step": 2180
},
{
"epoch": 0.009953895366461707,
"grad_norm": 3.033569574356079,
"learning_rate": 2.9861550990860557e-05,
"loss": 9.2131,
"step": 2200
},
{
"epoch": 0.010044385324338632,
"grad_norm": 2.8993263244628906,
"learning_rate": 3.0133019636232017e-05,
"loss": 9.2119,
"step": 2220
},
{
"epoch": 0.010134875282215556,
"grad_norm": 2.718588352203369,
"learning_rate": 3.0404488281603473e-05,
"loss": 9.2187,
"step": 2240
},
{
"epoch": 0.01022536524009248,
"grad_norm": 2.635470390319824,
"learning_rate": 3.0675956926974936e-05,
"loss": 9.1953,
"step": 2260
},
{
"epoch": 0.010315855197969405,
"grad_norm": 2.6032440662384033,
"learning_rate": 3.094742557234639e-05,
"loss": 9.1967,
"step": 2280
},
{
"epoch": 0.01040634515584633,
"grad_norm": 2.4713950157165527,
"learning_rate": 3.121889421771785e-05,
"loss": 9.1881,
"step": 2300
},
{
"epoch": 0.010496835113723254,
"grad_norm": 2.4573025703430176,
"learning_rate": 3.149036286308931e-05,
"loss": 9.1827,
"step": 2320
},
{
"epoch": 0.010587325071600179,
"grad_norm": 2.6169447898864746,
"learning_rate": 3.1761831508460776e-05,
"loss": 9.1865,
"step": 2340
},
{
"epoch": 0.010677815029477103,
"grad_norm": 2.6744954586029053,
"learning_rate": 3.203330015383223e-05,
"loss": 9.1829,
"step": 2360
},
{
"epoch": 0.010768304987354028,
"grad_norm": 2.766223907470703,
"learning_rate": 3.230476879920369e-05,
"loss": 9.177,
"step": 2380
},
{
"epoch": 0.010858794945230952,
"grad_norm": 2.8083655834198,
"learning_rate": 3.257623744457515e-05,
"loss": 9.1853,
"step": 2400
},
{
"epoch": 0.010949284903107877,
"grad_norm": 4.484155178070068,
"learning_rate": 3.284770608994661e-05,
"loss": 9.1655,
"step": 2420
},
{
"epoch": 0.011039774860984803,
"grad_norm": 3.5152087211608887,
"learning_rate": 3.311917473531807e-05,
"loss": 9.1516,
"step": 2440
},
{
"epoch": 0.011130264818861728,
"grad_norm": 2.3122165203094482,
"learning_rate": 3.339064338068953e-05,
"loss": 9.1552,
"step": 2460
},
{
"epoch": 0.011220754776738652,
"grad_norm": 3.0563108921051025,
"learning_rate": 3.3662112026060985e-05,
"loss": 9.1494,
"step": 2480
},
{
"epoch": 0.011311244734615577,
"grad_norm": 3.926668882369995,
"learning_rate": 3.393358067143245e-05,
"loss": 9.1425,
"step": 2500
},
{
"epoch": 0.011401734692492501,
"grad_norm": 2.7006709575653076,
"learning_rate": 3.420504931680391e-05,
"loss": 9.1328,
"step": 2520
},
{
"epoch": 0.011492224650369426,
"grad_norm": 3.1082751750946045,
"learning_rate": 3.447651796217537e-05,
"loss": 9.1316,
"step": 2540
},
{
"epoch": 0.01158271460824635,
"grad_norm": 2.744490385055542,
"learning_rate": 3.4747986607546824e-05,
"loss": 9.1193,
"step": 2560
},
{
"epoch": 0.011673204566123275,
"grad_norm": 2.8441922664642334,
"learning_rate": 3.501945525291829e-05,
"loss": 9.1174,
"step": 2580
},
{
"epoch": 0.011763694524000199,
"grad_norm": 3.7371647357940674,
"learning_rate": 3.529092389828975e-05,
"loss": 9.1217,
"step": 2600
},
{
"epoch": 0.011854184481877124,
"grad_norm": 3.0141730308532715,
"learning_rate": 3.556239254366121e-05,
"loss": 9.0999,
"step": 2620
},
{
"epoch": 0.011944674439754048,
"grad_norm": 2.9731669425964355,
"learning_rate": 3.5833861189032664e-05,
"loss": 9.1044,
"step": 2640
},
{
"epoch": 0.012035164397630973,
"grad_norm": 3.166254997253418,
"learning_rate": 3.610532983440413e-05,
"loss": 9.103,
"step": 2660
},
{
"epoch": 0.012125654355507897,
"grad_norm": 2.949646472930908,
"learning_rate": 3.6376798479775584e-05,
"loss": 9.1026,
"step": 2680
},
{
"epoch": 0.012216144313384822,
"grad_norm": 2.762843132019043,
"learning_rate": 3.664826712514705e-05,
"loss": 9.1047,
"step": 2700
},
{
"epoch": 0.012306634271261746,
"grad_norm": 3.188957929611206,
"learning_rate": 3.6919735770518503e-05,
"loss": 9.0968,
"step": 2720
},
{
"epoch": 0.01239712422913867,
"grad_norm": 4.116425037384033,
"learning_rate": 3.719120441588996e-05,
"loss": 9.0993,
"step": 2740
},
{
"epoch": 0.012487614187015597,
"grad_norm": 2.7521297931671143,
"learning_rate": 3.746267306126142e-05,
"loss": 9.063,
"step": 2760
},
{
"epoch": 0.012578104144892521,
"grad_norm": 3.1481823921203613,
"learning_rate": 3.7734141706632886e-05,
"loss": 9.062,
"step": 2780
},
{
"epoch": 0.012668594102769446,
"grad_norm": 2.48091721534729,
"learning_rate": 3.800561035200434e-05,
"loss": 9.0727,
"step": 2800
},
{
"epoch": 0.01275908406064637,
"grad_norm": 3.0816426277160645,
"learning_rate": 3.82770789973758e-05,
"loss": 9.0525,
"step": 2820
},
{
"epoch": 0.012849574018523295,
"grad_norm": 2.86342191696167,
"learning_rate": 3.854854764274726e-05,
"loss": 9.0447,
"step": 2840
},
{
"epoch": 0.01294006397640022,
"grad_norm": 2.769746780395508,
"learning_rate": 3.8820016288118726e-05,
"loss": 9.0524,
"step": 2860
},
{
"epoch": 0.013030553934277144,
"grad_norm": 3.4716339111328125,
"learning_rate": 3.909148493349018e-05,
"loss": 9.0453,
"step": 2880
},
{
"epoch": 0.013121043892154068,
"grad_norm": 4.585721969604492,
"learning_rate": 3.936295357886164e-05,
"loss": 9.0466,
"step": 2900
},
{
"epoch": 0.013211533850030993,
"grad_norm": 3.7394728660583496,
"learning_rate": 3.96344222242331e-05,
"loss": 9.0405,
"step": 2920
},
{
"epoch": 0.013302023807907917,
"grad_norm": 3.9100561141967773,
"learning_rate": 3.990589086960456e-05,
"loss": 9.0415,
"step": 2940
},
{
"epoch": 0.013392513765784842,
"grad_norm": 2.94941782951355,
"learning_rate": 4.017735951497602e-05,
"loss": 9.0265,
"step": 2960
},
{
"epoch": 0.013483003723661766,
"grad_norm": 2.6733226776123047,
"learning_rate": 4.044882816034748e-05,
"loss": 9.0195,
"step": 2980
},
{
"epoch": 0.01357349368153869,
"grad_norm": 3.4839463233947754,
"learning_rate": 4.0720296805718935e-05,
"loss": 9.0204,
"step": 3000
},
{
"epoch": 0.013663983639415615,
"grad_norm": 3.460050344467163,
"learning_rate": 4.09917654510904e-05,
"loss": 9.0086,
"step": 3020
},
{
"epoch": 0.01375447359729254,
"grad_norm": 4.007343769073486,
"learning_rate": 4.126323409646186e-05,
"loss": 9.0185,
"step": 3040
},
{
"epoch": 0.013844963555169464,
"grad_norm": 3.917860746383667,
"learning_rate": 4.153470274183331e-05,
"loss": 9.0032,
"step": 3060
},
{
"epoch": 0.01393545351304639,
"grad_norm": 3.5258123874664307,
"learning_rate": 4.1806171387204775e-05,
"loss": 8.9983,
"step": 3080
},
{
"epoch": 0.014025943470923315,
"grad_norm": 3.002183198928833,
"learning_rate": 4.207764003257624e-05,
"loss": 8.9898,
"step": 3100
},
{
"epoch": 0.01411643342880024,
"grad_norm": 3.2682976722717285,
"learning_rate": 4.23491086779477e-05,
"loss": 8.9932,
"step": 3120
},
{
"epoch": 0.014206923386677164,
"grad_norm": 3.7955832481384277,
"learning_rate": 4.262057732331915e-05,
"loss": 8.9879,
"step": 3140
},
{
"epoch": 0.014297413344554089,
"grad_norm": 3.3697524070739746,
"learning_rate": 4.2892045968690614e-05,
"loss": 8.9757,
"step": 3160
},
{
"epoch": 0.014387903302431013,
"grad_norm": 3.756788730621338,
"learning_rate": 4.316351461406208e-05,
"loss": 8.9811,
"step": 3180
},
{
"epoch": 0.014478393260307938,
"grad_norm": 3.024722099304199,
"learning_rate": 4.3434983259433534e-05,
"loss": 8.9613,
"step": 3200
},
{
"epoch": 0.014568883218184862,
"grad_norm": 3.258375406265259,
"learning_rate": 4.3706451904805e-05,
"loss": 8.9614,
"step": 3220
},
{
"epoch": 0.014659373176061787,
"grad_norm": 2.970426559448242,
"learning_rate": 4.3977920550176454e-05,
"loss": 8.9624,
"step": 3240
},
{
"epoch": 0.014749863133938711,
"grad_norm": 4.601590156555176,
"learning_rate": 4.424938919554791e-05,
"loss": 8.9615,
"step": 3260
},
{
"epoch": 0.014840353091815636,
"grad_norm": 4.773068428039551,
"learning_rate": 4.4520857840919373e-05,
"loss": 8.9668,
"step": 3280
},
{
"epoch": 0.01493084304969256,
"grad_norm": 3.182677984237671,
"learning_rate": 4.479232648629084e-05,
"loss": 8.933,
"step": 3300
},
{
"epoch": 0.015021333007569485,
"grad_norm": 3.160553455352783,
"learning_rate": 4.5063795131662286e-05,
"loss": 8.9409,
"step": 3320
},
{
"epoch": 0.015111822965446409,
"grad_norm": 3.0617620944976807,
"learning_rate": 4.533526377703375e-05,
"loss": 8.95,
"step": 3340
},
{
"epoch": 0.015202312923323334,
"grad_norm": 3.1966211795806885,
"learning_rate": 4.560673242240521e-05,
"loss": 8.9379,
"step": 3360
},
{
"epoch": 0.015292802881200258,
"grad_norm": 2.3314368724823,
"learning_rate": 4.587820106777667e-05,
"loss": 8.9246,
"step": 3380
},
{
"epoch": 0.015383292839077184,
"grad_norm": 3.1242740154266357,
"learning_rate": 4.6149669713148126e-05,
"loss": 8.9409,
"step": 3400
},
{
"epoch": 0.015473782796954109,
"grad_norm": 3.042051315307617,
"learning_rate": 4.642113835851959e-05,
"loss": 8.9204,
"step": 3420
},
{
"epoch": 0.015564272754831033,
"grad_norm": 4.102015495300293,
"learning_rate": 4.669260700389105e-05,
"loss": 8.8915,
"step": 3440
},
{
"epoch": 0.015654762712707958,
"grad_norm": 3.2991299629211426,
"learning_rate": 4.696407564926251e-05,
"loss": 8.8897,
"step": 3460
},
{
"epoch": 0.01574525267058488,
"grad_norm": 3.501094102859497,
"learning_rate": 4.7235544294633965e-05,
"loss": 8.9223,
"step": 3480
},
{
"epoch": 0.015835742628461807,
"grad_norm": 6.248113632202148,
"learning_rate": 4.750701294000543e-05,
"loss": 8.8925,
"step": 3500
},
{
"epoch": 0.01592623258633873,
"grad_norm": 4.329127788543701,
"learning_rate": 4.7778481585376885e-05,
"loss": 8.8891,
"step": 3520
},
{
"epoch": 0.016016722544215656,
"grad_norm": 3.575141191482544,
"learning_rate": 4.804995023074835e-05,
"loss": 8.8741,
"step": 3540
},
{
"epoch": 0.01610721250209258,
"grad_norm": 3.301194429397583,
"learning_rate": 4.832141887611981e-05,
"loss": 8.8965,
"step": 3560
},
{
"epoch": 0.016197702459969505,
"grad_norm": 3.7364182472229004,
"learning_rate": 4.859288752149126e-05,
"loss": 8.8899,
"step": 3580
},
{
"epoch": 0.01628819241784643,
"grad_norm": 5.336267471313477,
"learning_rate": 4.8864356166862725e-05,
"loss": 8.8959,
"step": 3600
},
{
"epoch": 0.016378682375723354,
"grad_norm": 4.769089221954346,
"learning_rate": 4.913582481223419e-05,
"loss": 8.8981,
"step": 3620
},
{
"epoch": 0.01646917233360028,
"grad_norm": 3.369799852371216,
"learning_rate": 4.9407293457605645e-05,
"loss": 8.8954,
"step": 3640
},
{
"epoch": 0.016559662291477203,
"grad_norm": 3.063030481338501,
"learning_rate": 4.96787621029771e-05,
"loss": 8.8694,
"step": 3660
},
{
"epoch": 0.01665015224935413,
"grad_norm": 4.988938331604004,
"learning_rate": 4.9950230748348564e-05,
"loss": 8.8611,
"step": 3680
},
{
"epoch": 0.016740642207231052,
"grad_norm": 3.5118601322174072,
"learning_rate": 5.022169939372003e-05,
"loss": 8.8525,
"step": 3700
},
{
"epoch": 0.016831132165107978,
"grad_norm": 4.257157325744629,
"learning_rate": 5.0493168039091484e-05,
"loss": 8.8547,
"step": 3720
},
{
"epoch": 0.0169216221229849,
"grad_norm": 3.7021615505218506,
"learning_rate": 5.076463668446294e-05,
"loss": 8.8572,
"step": 3740
},
{
"epoch": 0.017012112080861827,
"grad_norm": 4.868439197540283,
"learning_rate": 5.1036105329834404e-05,
"loss": 8.8684,
"step": 3760
},
{
"epoch": 0.01710260203873875,
"grad_norm": 6.547580718994141,
"learning_rate": 5.130757397520586e-05,
"loss": 8.828,
"step": 3780
},
{
"epoch": 0.017193091996615676,
"grad_norm": 5.9254374504089355,
"learning_rate": 5.1579042620577324e-05,
"loss": 8.838,
"step": 3800
},
{
"epoch": 0.0172835819544926,
"grad_norm": 6.061065196990967,
"learning_rate": 5.185051126594879e-05,
"loss": 8.8405,
"step": 3820
},
{
"epoch": 0.017374071912369525,
"grad_norm": 6.026751518249512,
"learning_rate": 5.2121979911320237e-05,
"loss": 8.8305,
"step": 3840
},
{
"epoch": 0.017464561870246448,
"grad_norm": 4.982965469360352,
"learning_rate": 5.23934485566917e-05,
"loss": 8.8316,
"step": 3860
},
{
"epoch": 0.017555051828123374,
"grad_norm": 9.080221176147461,
"learning_rate": 5.266491720206316e-05,
"loss": 8.8267,
"step": 3880
},
{
"epoch": 0.0176455417860003,
"grad_norm": 6.644583225250244,
"learning_rate": 5.293638584743462e-05,
"loss": 8.8331,
"step": 3900
},
{
"epoch": 0.017736031743877223,
"grad_norm": 6.022925853729248,
"learning_rate": 5.3207854492806076e-05,
"loss": 8.8198,
"step": 3920
},
{
"epoch": 0.01782652170175415,
"grad_norm": 4.794320583343506,
"learning_rate": 5.347932313817754e-05,
"loss": 8.8075,
"step": 3940
},
{
"epoch": 0.017917011659631072,
"grad_norm": 5.949656963348389,
"learning_rate": 5.3750791783548996e-05,
"loss": 8.8175,
"step": 3960
},
{
"epoch": 0.018007501617508,
"grad_norm": 7.972283840179443,
"learning_rate": 5.402226042892046e-05,
"loss": 8.8263,
"step": 3980
},
{
"epoch": 0.01809799157538492,
"grad_norm": 6.132015228271484,
"learning_rate": 5.4293729074291916e-05,
"loss": 8.8035,
"step": 4000
},
{
"epoch": 0.01809799157538492,
"eval_accuracy": 0.10955227810888264,
"eval_loss": 8.793069839477539,
"eval_runtime": 217.825,
"eval_samples_per_second": 2790.497,
"eval_steps_per_second": 10.903,
"step": 4000
},
{
"epoch": 0.018188481533261847,
"grad_norm": 3.9714837074279785,
"learning_rate": 5.455162428739481e-05,
"loss": 8.8029,
"step": 4020
},
{
"epoch": 0.01827897149113877,
"grad_norm": 3.9775164127349854,
"learning_rate": 5.482309293276626e-05,
"loss": 8.7859,
"step": 4040
},
{
"epoch": 0.018369461449015696,
"grad_norm": 4.350288391113281,
"learning_rate": 5.509456157813772e-05,
"loss": 8.8049,
"step": 4060
},
{
"epoch": 0.01845995140689262,
"grad_norm": 5.212925910949707,
"learning_rate": 5.5366030223509186e-05,
"loss": 8.7768,
"step": 4080
},
{
"epoch": 0.018550441364769545,
"grad_norm": 5.585092544555664,
"learning_rate": 5.563749886888064e-05,
"loss": 8.7792,
"step": 4100
},
{
"epoch": 0.018640931322646468,
"grad_norm": 5.019256114959717,
"learning_rate": 5.59089675142521e-05,
"loss": 8.7843,
"step": 4120
},
{
"epoch": 0.018731421280523394,
"grad_norm": 5.925191402435303,
"learning_rate": 5.616686272735499e-05,
"loss": 8.7693,
"step": 4140
},
{
"epoch": 0.018821911238400317,
"grad_norm": 4.334403991699219,
"learning_rate": 5.643833137272645e-05,
"loss": 8.7652,
"step": 4160
},
{
"epoch": 0.018912401196277243,
"grad_norm": 6.786751747131348,
"learning_rate": 5.670980001809791e-05,
"loss": 8.76,
"step": 4180
},
{
"epoch": 0.019002891154154166,
"grad_norm": 5.805715084075928,
"learning_rate": 5.698126866346936e-05,
"loss": 8.7835,
"step": 4200
},
{
"epoch": 0.019093381112031092,
"grad_norm": 7.2905120849609375,
"learning_rate": 5.7252737308840826e-05,
"loss": 8.7524,
"step": 4220
},
{
"epoch": 0.01918387106990802,
"grad_norm": 4.692761421203613,
"learning_rate": 5.752420595421228e-05,
"loss": 8.7274,
"step": 4240
},
{
"epoch": 0.01927436102778494,
"grad_norm": 5.6952924728393555,
"learning_rate": 5.7795674599583746e-05,
"loss": 8.7625,
"step": 4260
},
{
"epoch": 0.019364850985661868,
"grad_norm": 7.725805759429932,
"learning_rate": 5.806714324495521e-05,
"loss": 8.7313,
"step": 4280
},
{
"epoch": 0.01945534094353879,
"grad_norm": 5.154263496398926,
"learning_rate": 5.833861189032667e-05,
"loss": 8.7433,
"step": 4300
},
{
"epoch": 0.019545830901415717,
"grad_norm": 7.734066963195801,
"learning_rate": 5.861008053569812e-05,
"loss": 8.738,
"step": 4320
},
{
"epoch": 0.01963632085929264,
"grad_norm": 6.757390022277832,
"learning_rate": 5.888154918106958e-05,
"loss": 8.6971,
"step": 4340
},
{
"epoch": 0.019726810817169566,
"grad_norm": 9.869467735290527,
"learning_rate": 5.915301782644104e-05,
"loss": 8.7437,
"step": 4360
},
{
"epoch": 0.01981730077504649,
"grad_norm": 4.825913429260254,
"learning_rate": 5.9424486471812505e-05,
"loss": 8.712,
"step": 4380
},
{
"epoch": 0.019907790732923415,
"grad_norm": 8.725457191467285,
"learning_rate": 5.969595511718397e-05,
"loss": 8.7054,
"step": 4400
},
{
"epoch": 0.019998280690800337,
"grad_norm": 9.08804702758789,
"learning_rate": 5.9967423762555425e-05,
"loss": 8.6968,
"step": 4420
},
{
"epoch": 0.020088770648677264,
"grad_norm": 7.369052886962891,
"learning_rate": 6.023889240792689e-05,
"loss": 8.6736,
"step": 4440
},
{
"epoch": 0.020179260606554186,
"grad_norm": 9.925745964050293,
"learning_rate": 6.051036105329834e-05,
"loss": 8.7043,
"step": 4460
},
{
"epoch": 0.020269750564431113,
"grad_norm": 10.998024940490723,
"learning_rate": 6.07818296986698e-05,
"loss": 8.7098,
"step": 4480
},
{
"epoch": 0.020360240522308035,
"grad_norm": 9.010730743408203,
"learning_rate": 6.105329834404126e-05,
"loss": 8.6893,
"step": 4500
},
{
"epoch": 0.02045073048018496,
"grad_norm": 5.833269119262695,
"learning_rate": 6.132476698941272e-05,
"loss": 8.6928,
"step": 4520
},
{
"epoch": 0.020541220438061888,
"grad_norm": 5.778794288635254,
"learning_rate": 6.159623563478418e-05,
"loss": 8.6813,
"step": 4540
},
{
"epoch": 0.02063171039593881,
"grad_norm": 6.518376350402832,
"learning_rate": 6.186770428015565e-05,
"loss": 8.6679,
"step": 4560
},
{
"epoch": 0.020722200353815737,
"grad_norm": 7.985169887542725,
"learning_rate": 6.21391729255271e-05,
"loss": 8.6912,
"step": 4580
},
{
"epoch": 0.02081269031169266,
"grad_norm": 6.066607475280762,
"learning_rate": 6.241064157089856e-05,
"loss": 8.67,
"step": 4600
},
{
"epoch": 0.020903180269569586,
"grad_norm": 7.519238471984863,
"learning_rate": 6.268211021627002e-05,
"loss": 8.648,
"step": 4620
},
{
"epoch": 0.02099367022744651,
"grad_norm": 9.485710144042969,
"learning_rate": 6.295357886164147e-05,
"loss": 8.6484,
"step": 4640
},
{
"epoch": 0.021084160185323435,
"grad_norm": 9.786864280700684,
"learning_rate": 6.322504750701294e-05,
"loss": 8.637,
"step": 4660
},
{
"epoch": 0.021174650143200358,
"grad_norm": 8.231635093688965,
"learning_rate": 6.34965161523844e-05,
"loss": 8.648,
"step": 4680
},
{
"epoch": 0.021265140101077284,
"grad_norm": 7.283841609954834,
"learning_rate": 6.376798479775586e-05,
"loss": 8.64,
"step": 4700
},
{
"epoch": 0.021355630058954207,
"grad_norm": 7.625393390655518,
"learning_rate": 6.403945344312731e-05,
"loss": 8.6713,
"step": 4720
},
{
"epoch": 0.021446120016831133,
"grad_norm": 7.758394241333008,
"learning_rate": 6.431092208849878e-05,
"loss": 8.6473,
"step": 4740
},
{
"epoch": 0.021536609974708056,
"grad_norm": 7.519627571105957,
"learning_rate": 6.458239073387024e-05,
"loss": 8.6144,
"step": 4760
},
{
"epoch": 0.021627099932584982,
"grad_norm": 7.698405742645264,
"learning_rate": 6.48538593792417e-05,
"loss": 8.6678,
"step": 4780
},
{
"epoch": 0.021717589890461905,
"grad_norm": 7.843724727630615,
"learning_rate": 6.512532802461315e-05,
"loss": 8.6292,
"step": 4800
},
{
"epoch": 0.02180807984833883,
"grad_norm": 9.748797416687012,
"learning_rate": 6.539679666998462e-05,
"loss": 8.6059,
"step": 4820
},
{
"epoch": 0.021898569806215754,
"grad_norm": 8.68276596069336,
"learning_rate": 6.566826531535607e-05,
"loss": 8.6153,
"step": 4840
},
{
"epoch": 0.02198905976409268,
"grad_norm": 9.26171588897705,
"learning_rate": 6.593973396072753e-05,
"loss": 8.6343,
"step": 4860
},
{
"epoch": 0.022079549721969606,
"grad_norm": 10.164648056030273,
"learning_rate": 6.621120260609899e-05,
"loss": 8.6255,
"step": 4880
},
{
"epoch": 0.02217003967984653,
"grad_norm": 8.388748168945312,
"learning_rate": 6.648267125147046e-05,
"loss": 8.6111,
"step": 4900
},
{
"epoch": 0.022260529637723455,
"grad_norm": 9.701128005981445,
"learning_rate": 6.675413989684192e-05,
"loss": 8.5902,
"step": 4920
},
{
"epoch": 0.022351019595600378,
"grad_norm": 9.261332511901855,
"learning_rate": 6.702560854221338e-05,
"loss": 8.6013,
"step": 4940
},
{
"epoch": 0.022441509553477304,
"grad_norm": 7.0918354988098145,
"learning_rate": 6.729707718758483e-05,
"loss": 8.5595,
"step": 4960
},
{
"epoch": 0.022531999511354227,
"grad_norm": 8.793268203735352,
"learning_rate": 6.756854583295628e-05,
"loss": 8.5862,
"step": 4980
},
{
"epoch": 0.022622489469231153,
"grad_norm": 8.539192199707031,
"learning_rate": 6.784001447832774e-05,
"loss": 8.5938,
"step": 5000
},
{
"epoch": 0.022712979427108076,
"grad_norm": 8.60251522064209,
"learning_rate": 6.811148312369921e-05,
"loss": 8.598,
"step": 5020
},
{
"epoch": 0.022803469384985002,
"grad_norm": 8.976070404052734,
"learning_rate": 6.838295176907067e-05,
"loss": 8.5896,
"step": 5040
},
{
"epoch": 0.022893959342861925,
"grad_norm": 8.834037780761719,
"learning_rate": 6.865442041444213e-05,
"loss": 8.5654,
"step": 5060
},
{
"epoch": 0.02298444930073885,
"grad_norm": 7.039853096008301,
"learning_rate": 6.89258890598136e-05,
"loss": 8.574,
"step": 5080
},
{
"epoch": 0.023074939258615774,
"grad_norm": 4.989284515380859,
"learning_rate": 6.919735770518505e-05,
"loss": 8.584,
"step": 5100
},
{
"epoch": 0.0231654292164927,
"grad_norm": 10.530620574951172,
"learning_rate": 6.946882635055651e-05,
"loss": 8.5884,
"step": 5120
},
{
"epoch": 0.023255919174369623,
"grad_norm": 10.483266830444336,
"learning_rate": 6.974029499592797e-05,
"loss": 8.573,
"step": 5140
},
{
"epoch": 0.02334640913224655,
"grad_norm": 9.433408737182617,
"learning_rate": 7.001176364129942e-05,
"loss": 8.5553,
"step": 5160
},
{
"epoch": 0.023436899090123475,
"grad_norm": 10.707608222961426,
"learning_rate": 7.028323228667089e-05,
"loss": 8.5672,
"step": 5180
},
{
"epoch": 0.023527389048000398,
"grad_norm": 11.35906982421875,
"learning_rate": 7.055470093204235e-05,
"loss": 8.5374,
"step": 5200
},
{
"epoch": 0.023617879005877324,
"grad_norm": 9.386375427246094,
"learning_rate": 7.08261695774138e-05,
"loss": 8.5199,
"step": 5220
},
{
"epoch": 0.023708368963754247,
"grad_norm": 10.813016891479492,
"learning_rate": 7.109763822278526e-05,
"loss": 8.5296,
"step": 5240
},
{
"epoch": 0.023798858921631173,
"grad_norm": 10.738064765930176,
"learning_rate": 7.136910686815673e-05,
"loss": 8.5293,
"step": 5260
},
{
"epoch": 0.023889348879508096,
"grad_norm": 12.89620590209961,
"learning_rate": 7.164057551352819e-05,
"loss": 8.5494,
"step": 5280
},
{
"epoch": 0.023979838837385022,
"grad_norm": 11.624608039855957,
"learning_rate": 7.191204415889965e-05,
"loss": 8.5179,
"step": 5300
},
{
"epoch": 0.024070328795261945,
"grad_norm": 7.694511413574219,
"learning_rate": 7.21835128042711e-05,
"loss": 8.5528,
"step": 5320
},
{
"epoch": 0.02416081875313887,
"grad_norm": 9.326581954956055,
"learning_rate": 7.245498144964257e-05,
"loss": 8.5307,
"step": 5340
},
{
"epoch": 0.024251308711015794,
"grad_norm": 8.548121452331543,
"learning_rate": 7.272645009501402e-05,
"loss": 8.5031,
"step": 5360
},
{
"epoch": 0.02434179866889272,
"grad_norm": 10.232369422912598,
"learning_rate": 7.299791874038548e-05,
"loss": 8.4905,
"step": 5380
},
{
"epoch": 0.024432288626769643,
"grad_norm": 9.751016616821289,
"learning_rate": 7.326938738575694e-05,
"loss": 8.4996,
"step": 5400
},
{
"epoch": 0.02452277858464657,
"grad_norm": 11.058146476745605,
"learning_rate": 7.35408560311284e-05,
"loss": 8.4889,
"step": 5420
},
{
"epoch": 0.024613268542523492,
"grad_norm": 8.11478042602539,
"learning_rate": 7.381232467649987e-05,
"loss": 8.5099,
"step": 5440
},
{
"epoch": 0.02470375850040042,
"grad_norm": 8.138284683227539,
"learning_rate": 7.408379332187133e-05,
"loss": 8.4854,
"step": 5460
},
{
"epoch": 0.02479424845827734,
"grad_norm": 7.7438459396362305,
"learning_rate": 7.435526196724278e-05,
"loss": 8.4877,
"step": 5480
},
{
"epoch": 0.024884738416154267,
"grad_norm": 9.896592140197754,
"learning_rate": 7.462673061261423e-05,
"loss": 8.4662,
"step": 5500
},
{
"epoch": 0.024975228374031194,
"grad_norm": 7.162434101104736,
"learning_rate": 7.48981992579857e-05,
"loss": 8.4772,
"step": 5520
},
{
"epoch": 0.025065718331908116,
"grad_norm": 8.252161026000977,
"learning_rate": 7.516966790335716e-05,
"loss": 8.4936,
"step": 5540
},
{
"epoch": 0.025156208289785043,
"grad_norm": 7.313194751739502,
"learning_rate": 7.544113654872862e-05,
"loss": 8.493,
"step": 5560
},
{
"epoch": 0.025246698247661965,
"grad_norm": 11.324033737182617,
"learning_rate": 7.571260519410008e-05,
"loss": 8.4776,
"step": 5580
},
{
"epoch": 0.02533718820553889,
"grad_norm": 9.4235258102417,
"learning_rate": 7.598407383947155e-05,
"loss": 8.4769,
"step": 5600
},
{
"epoch": 0.025427678163415814,
"grad_norm": 6.676479339599609,
"learning_rate": 7.6255542484843e-05,
"loss": 8.4389,
"step": 5620
},
{
"epoch": 0.02551816812129274,
"grad_norm": 9.530123710632324,
"learning_rate": 7.652701113021446e-05,
"loss": 8.4704,
"step": 5640
},
{
"epoch": 0.025608658079169663,
"grad_norm": 10.614904403686523,
"learning_rate": 7.679847977558591e-05,
"loss": 8.4507,
"step": 5660
},
{
"epoch": 0.02569914803704659,
"grad_norm": 7.5254974365234375,
"learning_rate": 7.706994842095737e-05,
"loss": 8.464,
"step": 5680
},
{
"epoch": 0.025789637994923512,
"grad_norm": 7.461385250091553,
"learning_rate": 7.734141706632884e-05,
"loss": 8.4516,
"step": 5700
},
{
"epoch": 0.02588012795280044,
"grad_norm": 9.106521606445312,
"learning_rate": 7.76128857117003e-05,
"loss": 8.4142,
"step": 5720
},
{
"epoch": 0.02597061791067736,
"grad_norm": 8.536205291748047,
"learning_rate": 7.788435435707175e-05,
"loss": 8.4497,
"step": 5740
},
{
"epoch": 0.026061107868554288,
"grad_norm": 7.925720691680908,
"learning_rate": 7.815582300244321e-05,
"loss": 8.4783,
"step": 5760
},
{
"epoch": 0.02615159782643121,
"grad_norm": 11.187898635864258,
"learning_rate": 7.842729164781468e-05,
"loss": 8.4054,
"step": 5780
},
{
"epoch": 0.026242087784308137,
"grad_norm": 6.965084075927734,
"learning_rate": 7.869876029318614e-05,
"loss": 8.4079,
"step": 5800
},
{
"epoch": 0.02633257774218506,
"grad_norm": 8.090741157531738,
"learning_rate": 7.89702289385576e-05,
"loss": 8.4474,
"step": 5820
},
{
"epoch": 0.026423067700061986,
"grad_norm": 9.698216438293457,
"learning_rate": 7.924169758392905e-05,
"loss": 8.3945,
"step": 5840
},
{
"epoch": 0.026513557657938912,
"grad_norm": 7.889448642730713,
"learning_rate": 7.951316622930052e-05,
"loss": 8.4046,
"step": 5860
},
{
"epoch": 0.026604047615815835,
"grad_norm": 11.487144470214844,
"learning_rate": 7.978463487467197e-05,
"loss": 8.4195,
"step": 5880
},
{
"epoch": 0.02669453757369276,
"grad_norm": 9.28532886505127,
"learning_rate": 8.005610352004343e-05,
"loss": 8.406,
"step": 5900
},
{
"epoch": 0.026785027531569684,
"grad_norm": 8.982071876525879,
"learning_rate": 8.032757216541489e-05,
"loss": 8.4221,
"step": 5920
},
{
"epoch": 0.02687551748944661,
"grad_norm": 11.42358684539795,
"learning_rate": 8.059904081078636e-05,
"loss": 8.4423,
"step": 5940
},
{
"epoch": 0.026966007447323533,
"grad_norm": 8.633251190185547,
"learning_rate": 8.087050945615782e-05,
"loss": 8.4233,
"step": 5960
},
{
"epoch": 0.02705649740520046,
"grad_norm": 9.28022575378418,
"learning_rate": 8.114197810152928e-05,
"loss": 8.4169,
"step": 5980
},
{
"epoch": 0.02714698736307738,
"grad_norm": 11.166740417480469,
"learning_rate": 8.141344674690073e-05,
"loss": 8.4018,
"step": 6000
},
{
"epoch": 0.02714698736307738,
"eval_accuracy": 0.11314150543417859,
"eval_loss": 8.402518272399902,
"eval_runtime": 218.3209,
"eval_samples_per_second": 2784.158,
"eval_steps_per_second": 10.878,
"step": 6000
},
{
"epoch": 0.027237477320954308,
"grad_norm": 7.100822925567627,
"learning_rate": 8.167134196000362e-05,
"loss": 8.4131,
"step": 6020
},
{
"epoch": 0.02732796727883123,
"grad_norm": 8.460954666137695,
"learning_rate": 8.194281060537508e-05,
"loss": 8.4087,
"step": 6040
},
{
"epoch": 0.027418457236708157,
"grad_norm": 7.642125129699707,
"learning_rate": 8.221427925074653e-05,
"loss": 8.3806,
"step": 6060
},
{
"epoch": 0.02750894719458508,
"grad_norm": 8.104974746704102,
"learning_rate": 8.2485747896118e-05,
"loss": 8.404,
"step": 6080
},
{
"epoch": 0.027599437152462006,
"grad_norm": 8.082459449768066,
"learning_rate": 8.275721654148946e-05,
"loss": 8.3865,
"step": 6100
},
{
"epoch": 0.02768992711033893,
"grad_norm": 8.786911010742188,
"learning_rate": 8.302868518686092e-05,
"loss": 8.3475,
"step": 6120
},
{
"epoch": 0.027780417068215855,
"grad_norm": 7.780808925628662,
"learning_rate": 8.330015383223237e-05,
"loss": 8.3798,
"step": 6140
},
{
"epoch": 0.02787090702609278,
"grad_norm": 10.508188247680664,
"learning_rate": 8.357162247760384e-05,
"loss": 8.3718,
"step": 6160
},
{
"epoch": 0.027961396983969704,
"grad_norm": 9.833992004394531,
"learning_rate": 8.38430911229753e-05,
"loss": 8.3952,
"step": 6180
},
{
"epoch": 0.02805188694184663,
"grad_norm": 9.917244911193848,
"learning_rate": 8.411455976834675e-05,
"loss": 8.3828,
"step": 6200
},
{
"epoch": 0.028142376899723553,
"grad_norm": 8.893899917602539,
"learning_rate": 8.438602841371821e-05,
"loss": 8.3853,
"step": 6220
},
{
"epoch": 0.02823286685760048,
"grad_norm": 8.206876754760742,
"learning_rate": 8.465749705908967e-05,
"loss": 8.3686,
"step": 6240
},
{
"epoch": 0.028323356815477402,
"grad_norm": 6.771660327911377,
"learning_rate": 8.492896570446114e-05,
"loss": 8.3699,
"step": 6260
},
{
"epoch": 0.028413846773354328,
"grad_norm": 8.602880477905273,
"learning_rate": 8.52004343498326e-05,
"loss": 8.3388,
"step": 6280
},
{
"epoch": 0.02850433673123125,
"grad_norm": 12.602445602416992,
"learning_rate": 8.547190299520405e-05,
"loss": 8.3127,
"step": 6300
},
{
"epoch": 0.028594826689108177,
"grad_norm": 6.581843852996826,
"learning_rate": 8.57433716405755e-05,
"loss": 8.3345,
"step": 6320
},
{
"epoch": 0.0286853166469851,
"grad_norm": 11.11732292175293,
"learning_rate": 8.601484028594696e-05,
"loss": 8.3442,
"step": 6340
},
{
"epoch": 0.028775806604862026,
"grad_norm": 7.795157432556152,
"learning_rate": 8.628630893131843e-05,
"loss": 8.3477,
"step": 6360
},
{
"epoch": 0.02886629656273895,
"grad_norm": 7.013496398925781,
"learning_rate": 8.655777757668989e-05,
"loss": 8.3444,
"step": 6380
},
{
"epoch": 0.028956786520615875,
"grad_norm": 7.039948463439941,
"learning_rate": 8.682924622206135e-05,
"loss": 8.3242,
"step": 6400
},
{
"epoch": 0.029047276478492798,
"grad_norm": 9.261716842651367,
"learning_rate": 8.710071486743282e-05,
"loss": 8.3209,
"step": 6420
},
{
"epoch": 0.029137766436369724,
"grad_norm": 7.255875587463379,
"learning_rate": 8.737218351280428e-05,
"loss": 8.304,
"step": 6440
},
{
"epoch": 0.029228256394246647,
"grad_norm": 7.955538749694824,
"learning_rate": 8.764365215817573e-05,
"loss": 8.2953,
"step": 6460
},
{
"epoch": 0.029318746352123573,
"grad_norm": 9.364811897277832,
"learning_rate": 8.791512080354718e-05,
"loss": 8.2936,
"step": 6480
},
{
"epoch": 0.0294092363100005,
"grad_norm": 9.385396957397461,
"learning_rate": 8.818658944891864e-05,
"loss": 8.3276,
"step": 6500
},
{
"epoch": 0.029499726267877422,
"grad_norm": 8.448295593261719,
"learning_rate": 8.84580580942901e-05,
"loss": 8.2975,
"step": 6520
},
{
"epoch": 0.02959021622575435,
"grad_norm": 9.282604217529297,
"learning_rate": 8.872952673966157e-05,
"loss": 8.3217,
"step": 6540
},
{
"epoch": 0.02968070618363127,
"grad_norm": 7.898446559906006,
"learning_rate": 8.900099538503303e-05,
"loss": 8.3006,
"step": 6560
},
{
"epoch": 0.029771196141508197,
"grad_norm": 9.186493873596191,
"learning_rate": 8.927246403040448e-05,
"loss": 8.2981,
"step": 6580
},
{
"epoch": 0.02986168609938512,
"grad_norm": 9.346575736999512,
"learning_rate": 8.954393267577595e-05,
"loss": 8.2883,
"step": 6600
},
{
"epoch": 0.029952176057262046,
"grad_norm": 6.458785057067871,
"learning_rate": 8.981540132114741e-05,
"loss": 8.2966,
"step": 6620
},
{
"epoch": 0.03004266601513897,
"grad_norm": 8.704976081848145,
"learning_rate": 9.008686996651886e-05,
"loss": 8.2986,
"step": 6640
},
{
"epoch": 0.030133155973015895,
"grad_norm": 7.744259357452393,
"learning_rate": 9.035833861189032e-05,
"loss": 8.2868,
"step": 6660
},
{
"epoch": 0.030223645930892818,
"grad_norm": 8.345844268798828,
"learning_rate": 9.062980725726179e-05,
"loss": 8.2931,
"step": 6680
},
{
"epoch": 0.030314135888769744,
"grad_norm": 7.604759216308594,
"learning_rate": 9.090127590263323e-05,
"loss": 8.2847,
"step": 6700
},
{
"epoch": 0.030404625846646667,
"grad_norm": 10.3920259475708,
"learning_rate": 9.11727445480047e-05,
"loss": 8.273,
"step": 6720
},
{
"epoch": 0.030495115804523593,
"grad_norm": 7.095389366149902,
"learning_rate": 9.144421319337616e-05,
"loss": 8.2768,
"step": 6740
},
{
"epoch": 0.030585605762400516,
"grad_norm": 7.211811542510986,
"learning_rate": 9.171568183874762e-05,
"loss": 8.2918,
"step": 6760
},
{
"epoch": 0.030676095720277442,
"grad_norm": 8.639713287353516,
"learning_rate": 9.198715048411909e-05,
"loss": 8.2845,
"step": 6780
},
{
"epoch": 0.03076658567815437,
"grad_norm": 7.687414169311523,
"learning_rate": 9.225861912949055e-05,
"loss": 8.2992,
"step": 6800
},
{
"epoch": 0.03085707563603129,
"grad_norm": 8.479426383972168,
"learning_rate": 9.2530087774862e-05,
"loss": 8.2848,
"step": 6820
},
{
"epoch": 0.030947565593908218,
"grad_norm": 8.185149192810059,
"learning_rate": 9.280155642023345e-05,
"loss": 8.3037,
"step": 6840
},
{
"epoch": 0.03103805555178514,
"grad_norm": 8.295937538146973,
"learning_rate": 9.307302506560491e-05,
"loss": 8.3179,
"step": 6860
},
{
"epoch": 0.031128545509662067,
"grad_norm": 10.772727012634277,
"learning_rate": 9.334449371097638e-05,
"loss": 8.264,
"step": 6880
},
{
"epoch": 0.03121903546753899,
"grad_norm": 8.465076446533203,
"learning_rate": 9.361596235634784e-05,
"loss": 8.2303,
"step": 6900
},
{
"epoch": 0.031309525425415916,
"grad_norm": 9.096773147583008,
"learning_rate": 9.38874310017193e-05,
"loss": 8.2473,
"step": 6920
},
{
"epoch": 0.03140001538329284,
"grad_norm": 10.57555866241455,
"learning_rate": 9.415889964709077e-05,
"loss": 8.27,
"step": 6940
},
{
"epoch": 0.03149050534116976,
"grad_norm": 7.5089850425720215,
"learning_rate": 9.443036829246222e-05,
"loss": 8.27,
"step": 6960
},
{
"epoch": 0.03158099529904669,
"grad_norm": 10.865699768066406,
"learning_rate": 9.470183693783368e-05,
"loss": 8.2451,
"step": 6980
},
{
"epoch": 0.031671485256923614,
"grad_norm": 12.514881134033203,
"learning_rate": 9.497330558320513e-05,
"loss": 8.259,
"step": 7000
},
{
"epoch": 0.031761975214800536,
"grad_norm": 9.914373397827148,
"learning_rate": 9.524477422857659e-05,
"loss": 8.2727,
"step": 7020
},
{
"epoch": 0.03185246517267746,
"grad_norm": 7.3313984870910645,
"learning_rate": 9.551624287394806e-05,
"loss": 8.2421,
"step": 7040
},
{
"epoch": 0.03194295513055439,
"grad_norm": 5.989616394042969,
"learning_rate": 9.578771151931952e-05,
"loss": 8.2363,
"step": 7060
},
{
"epoch": 0.03203344508843131,
"grad_norm": 7.4773430824279785,
"learning_rate": 9.605918016469098e-05,
"loss": 8.2718,
"step": 7080
},
{
"epoch": 0.032123935046308234,
"grad_norm": 6.605820655822754,
"learning_rate": 9.633064881006243e-05,
"loss": 8.257,
"step": 7100
},
{
"epoch": 0.03221442500418516,
"grad_norm": 8.294914245605469,
"learning_rate": 9.658854402316532e-05,
"loss": 8.2478,
"step": 7120
},
{
"epoch": 0.03230491496206209,
"grad_norm": 10.011855125427246,
"learning_rate": 9.686001266853678e-05,
"loss": 8.2525,
"step": 7140
},
{
"epoch": 0.03239540491993901,
"grad_norm": 7.529365062713623,
"learning_rate": 9.713148131390823e-05,
"loss": 8.2728,
"step": 7160
},
{
"epoch": 0.03248589487781593,
"grad_norm": 8.781538009643555,
"learning_rate": 9.74029499592797e-05,
"loss": 8.2305,
"step": 7180
},
{
"epoch": 0.03257638483569286,
"grad_norm": 12.758204460144043,
"learning_rate": 9.767441860465116e-05,
"loss": 8.2382,
"step": 7200
},
{
"epoch": 0.032666874793569785,
"grad_norm": 10.523704528808594,
"learning_rate": 9.794588725002262e-05,
"loss": 8.2364,
"step": 7220
},
{
"epoch": 0.03275736475144671,
"grad_norm": 6.50457239151001,
"learning_rate": 9.821735589539409e-05,
"loss": 8.2384,
"step": 7240
},
{
"epoch": 0.03284785470932363,
"grad_norm": 9.191271781921387,
"learning_rate": 9.848882454076555e-05,
"loss": 8.2148,
"step": 7260
},
{
"epoch": 0.03293834466720056,
"grad_norm": 8.93270206451416,
"learning_rate": 9.8760293186137e-05,
"loss": 8.2352,
"step": 7280
},
{
"epoch": 0.03302883462507748,
"grad_norm": 9.895100593566895,
"learning_rate": 9.903176183150845e-05,
"loss": 8.2376,
"step": 7300
},
{
"epoch": 0.033119324582954406,
"grad_norm": 10.420171737670898,
"learning_rate": 9.930323047687991e-05,
"loss": 8.2479,
"step": 7320
},
{
"epoch": 0.03320981454083133,
"grad_norm": 9.649170875549316,
"learning_rate": 9.957469912225138e-05,
"loss": 8.2557,
"step": 7340
},
{
"epoch": 0.03330030449870826,
"grad_norm": 7.854948043823242,
"learning_rate": 9.984616776762284e-05,
"loss": 8.2145,
"step": 7360
},
{
"epoch": 0.03339079445658518,
"grad_norm": 8.486404418945312,
"learning_rate": 0.0001001176364129943,
"loss": 8.2132,
"step": 7380
},
{
"epoch": 0.033481284414462104,
"grad_norm": 11.286945343017578,
"learning_rate": 0.00010038910505836577,
"loss": 8.2169,
"step": 7400
},
{
"epoch": 0.033571774372339026,
"grad_norm": 6.662302494049072,
"learning_rate": 0.00010066057370373721,
"loss": 8.2318,
"step": 7420
},
{
"epoch": 0.033662264330215956,
"grad_norm": 10.467026710510254,
"learning_rate": 0.00010093204234910868,
"loss": 8.2089,
"step": 7440
},
{
"epoch": 0.03375275428809288,
"grad_norm": 12.113288879394531,
"learning_rate": 0.00010120351099448013,
"loss": 8.2194,
"step": 7460
},
{
"epoch": 0.0338432442459698,
"grad_norm": 13.295260429382324,
"learning_rate": 0.00010147497963985159,
"loss": 8.2526,
"step": 7480
},
{
"epoch": 0.03393373420384673,
"grad_norm": 9.79587173461914,
"learning_rate": 0.00010174644828522305,
"loss": 8.2253,
"step": 7500
},
{
"epoch": 0.034024224161723654,
"grad_norm": 10.251439094543457,
"learning_rate": 0.00010201791693059452,
"loss": 8.2248,
"step": 7520
},
{
"epoch": 0.03411471411960058,
"grad_norm": 10.583033561706543,
"learning_rate": 0.00010228938557596597,
"loss": 8.211,
"step": 7540
},
{
"epoch": 0.0342052040774775,
"grad_norm": 10.661384582519531,
"learning_rate": 0.00010256085422133743,
"loss": 8.2053,
"step": 7560
},
{
"epoch": 0.03429569403535443,
"grad_norm": 8.133881568908691,
"learning_rate": 0.0001028323228667089,
"loss": 8.1948,
"step": 7580
},
{
"epoch": 0.03438618399323135,
"grad_norm": 9.278162002563477,
"learning_rate": 0.00010310379151208036,
"loss": 8.2235,
"step": 7600
},
{
"epoch": 0.034476673951108275,
"grad_norm": 10.354171752929688,
"learning_rate": 0.00010337526015745181,
"loss": 8.1704,
"step": 7620
},
{
"epoch": 0.0345671639089852,
"grad_norm": 9.4600830078125,
"learning_rate": 0.00010364672880282327,
"loss": 8.2008,
"step": 7640
},
{
"epoch": 0.03465765386686213,
"grad_norm": 10.290422439575195,
"learning_rate": 0.00010391819744819473,
"loss": 8.2084,
"step": 7660
},
{
"epoch": 0.03474814382473905,
"grad_norm": 9.98493480682373,
"learning_rate": 0.00010418966609356618,
"loss": 8.1878,
"step": 7680
},
{
"epoch": 0.03483863378261597,
"grad_norm": 8.021723747253418,
"learning_rate": 0.00010446113473893765,
"loss": 8.1865,
"step": 7700
},
{
"epoch": 0.034929123740492896,
"grad_norm": 6.915677070617676,
"learning_rate": 0.00010473260338430911,
"loss": 8.1795,
"step": 7720
},
{
"epoch": 0.035019613698369825,
"grad_norm": 9.64877986907959,
"learning_rate": 0.00010500407202968057,
"loss": 8.1756,
"step": 7740
},
{
"epoch": 0.03511010365624675,
"grad_norm": 9.673460960388184,
"learning_rate": 0.00010527554067505204,
"loss": 8.1877,
"step": 7760
},
{
"epoch": 0.03520059361412367,
"grad_norm": 10.429800033569336,
"learning_rate": 0.0001055470093204235,
"loss": 8.1803,
"step": 7780
},
{
"epoch": 0.0352910835720006,
"grad_norm": 9.610269546508789,
"learning_rate": 0.00010581847796579494,
"loss": 8.214,
"step": 7800
},
{
"epoch": 0.03538157352987752,
"grad_norm": 9.696439743041992,
"learning_rate": 0.0001060899466111664,
"loss": 8.1585,
"step": 7820
},
{
"epoch": 0.035472063487754446,
"grad_norm": 10.302108764648438,
"learning_rate": 0.00010636141525653786,
"loss": 8.1495,
"step": 7840
},
{
"epoch": 0.03556255344563137,
"grad_norm": 10.439906120300293,
"learning_rate": 0.00010663288390190933,
"loss": 8.1636,
"step": 7860
},
{
"epoch": 0.0356530434035083,
"grad_norm": 13.941293716430664,
"learning_rate": 0.00010690435254728079,
"loss": 8.1674,
"step": 7880
},
{
"epoch": 0.03574353336138522,
"grad_norm": 11.378789901733398,
"learning_rate": 0.00010717582119265225,
"loss": 8.1704,
"step": 7900
},
{
"epoch": 0.035834023319262144,
"grad_norm": 10.802684783935547,
"learning_rate": 0.00010744728983802372,
"loss": 8.1902,
"step": 7920
},
{
"epoch": 0.03592451327713907,
"grad_norm": 13.995284080505371,
"learning_rate": 0.00010771875848339517,
"loss": 8.1502,
"step": 7940
},
{
"epoch": 0.036015003235016,
"grad_norm": 11.473008155822754,
"learning_rate": 0.00010799022712876663,
"loss": 8.2082,
"step": 7960
},
{
"epoch": 0.03610549319289292,
"grad_norm": 9.314510345458984,
"learning_rate": 0.00010826169577413808,
"loss": 8.19,
"step": 7980
},
{
"epoch": 0.03619598315076984,
"grad_norm": 11.141118049621582,
"learning_rate": 0.00010853316441950954,
"loss": 8.2093,
"step": 8000
},
{
"epoch": 0.03619598315076984,
"eval_accuracy": 0.11013720949528932,
"eval_loss": 8.173333168029785,
"eval_runtime": 219.4541,
"eval_samples_per_second": 2769.782,
"eval_steps_per_second": 10.822,
"step": 8000
},
{
"epoch": 0.036286473108646765,
"grad_norm": 12.62540054321289,
"learning_rate": 0.000108804633064881,
"loss": 8.1561,
"step": 8020
},
{
"epoch": 0.036376963066523695,
"grad_norm": 12.97541332244873,
"learning_rate": 0.00010907610171025247,
"loss": 8.1708,
"step": 8040
},
{
"epoch": 0.03646745302440062,
"grad_norm": 8.305766105651855,
"learning_rate": 0.00010934757035562392,
"loss": 8.1671,
"step": 8060
},
{
"epoch": 0.03655794298227754,
"grad_norm": 14.076859474182129,
"learning_rate": 0.00010961903900099538,
"loss": 8.1659,
"step": 8080
},
{
"epoch": 0.03664843294015447,
"grad_norm": 11.951278686523438,
"learning_rate": 0.00010989050764636684,
"loss": 8.1893,
"step": 8100
},
{
"epoch": 0.03673892289803139,
"grad_norm": 10.796624183654785,
"learning_rate": 0.00011016197629173831,
"loss": 8.1942,
"step": 8120
},
{
"epoch": 0.036829412855908315,
"grad_norm": 10.49177074432373,
"learning_rate": 0.00011043344493710976,
"loss": 8.1589,
"step": 8140
},
{
"epoch": 0.03691990281378524,
"grad_norm": 12.82060432434082,
"learning_rate": 0.00011070491358248122,
"loss": 8.1957,
"step": 8160
},
{
"epoch": 0.03701039277166217,
"grad_norm": 11.00941276550293,
"learning_rate": 0.00011097638222785267,
"loss": 8.1609,
"step": 8180
},
{
"epoch": 0.03710088272953909,
"grad_norm": 10.24111270904541,
"learning_rate": 0.00011124785087322413,
"loss": 8.1769,
"step": 8200
},
{
"epoch": 0.03719137268741601,
"grad_norm": 11.292909622192383,
"learning_rate": 0.0001115193195185956,
"loss": 8.1628,
"step": 8220
},
{
"epoch": 0.037281862645292936,
"grad_norm": 9.362674713134766,
"learning_rate": 0.00011179078816396706,
"loss": 8.1638,
"step": 8240
},
{
"epoch": 0.037372352603169866,
"grad_norm": 12.9249906539917,
"learning_rate": 0.00011206225680933852,
"loss": 8.1957,
"step": 8260
},
{
"epoch": 0.03746284256104679,
"grad_norm": 10.386489868164062,
"learning_rate": 0.00011233372545470999,
"loss": 8.1525,
"step": 8280
},
{
"epoch": 0.03755333251892371,
"grad_norm": 12.65300464630127,
"learning_rate": 0.00011260519410008144,
"loss": 8.1558,
"step": 8300
},
{
"epoch": 0.037643822476800634,
"grad_norm": 11.562602996826172,
"learning_rate": 0.0001128766627454529,
"loss": 8.148,
"step": 8320
},
{
"epoch": 0.037734312434677564,
"grad_norm": 14.783183097839355,
"learning_rate": 0.00011314813139082436,
"loss": 8.1448,
"step": 8340
},
{
"epoch": 0.03782480239255449,
"grad_norm": 15.469168663024902,
"learning_rate": 0.00011341960003619583,
"loss": 8.1801,
"step": 8360
},
{
"epoch": 0.03791529235043141,
"grad_norm": 11.361299514770508,
"learning_rate": 0.00011369106868156726,
"loss": 8.1549,
"step": 8380
},
{
"epoch": 0.03800578230830833,
"grad_norm": 9.814708709716797,
"learning_rate": 0.00011396253732693873,
"loss": 8.1663,
"step": 8400
},
{
"epoch": 0.03809627226618526,
"grad_norm": 10.522832870483398,
"learning_rate": 0.00011423400597231019,
"loss": 8.1459,
"step": 8420
},
{
"epoch": 0.038186762224062185,
"grad_norm": 10.637961387634277,
"learning_rate": 0.00011450547461768165,
"loss": 8.1554,
"step": 8440
},
{
"epoch": 0.03827725218193911,
"grad_norm": 14.578750610351562,
"learning_rate": 0.00011477694326305312,
"loss": 8.1758,
"step": 8460
},
{
"epoch": 0.03836774213981604,
"grad_norm": 12.179791450500488,
"learning_rate": 0.00011504841190842457,
"loss": 8.1117,
"step": 8480
},
{
"epoch": 0.03845823209769296,
"grad_norm": 11.189960479736328,
"learning_rate": 0.00011531988055379603,
"loss": 8.1517,
"step": 8500
},
{
"epoch": 0.03854872205556988,
"grad_norm": 11.662614822387695,
"learning_rate": 0.00011559134919916749,
"loss": 8.129,
"step": 8520
},
{
"epoch": 0.038639212013446805,
"grad_norm": 9.089029312133789,
"learning_rate": 0.00011584924441227038,
"loss": 8.1452,
"step": 8540
},
{
"epoch": 0.038729701971323735,
"grad_norm": 15.1500825881958,
"learning_rate": 0.00011612071305764184,
"loss": 8.1623,
"step": 8560
},
{
"epoch": 0.03882019192920066,
"grad_norm": 15.177955627441406,
"learning_rate": 0.0001163921817030133,
"loss": 8.1138,
"step": 8580
},
{
"epoch": 0.03891068188707758,
"grad_norm": 9.620798110961914,
"learning_rate": 0.00011666365034838476,
"loss": 8.1472,
"step": 8600
},
{
"epoch": 0.0390011718449545,
"grad_norm": 13.227412223815918,
"learning_rate": 0.00011693511899375622,
"loss": 8.1436,
"step": 8620
},
{
"epoch": 0.03909166180283143,
"grad_norm": 12.561627388000488,
"learning_rate": 0.00011720658763912768,
"loss": 8.1478,
"step": 8640
},
{
"epoch": 0.039182151760708356,
"grad_norm": 12.864951133728027,
"learning_rate": 0.00011747805628449915,
"loss": 8.1727,
"step": 8660
},
{
"epoch": 0.03927264171858528,
"grad_norm": 12.883962631225586,
"learning_rate": 0.00011774952492987061,
"loss": 8.1396,
"step": 8680
},
{
"epoch": 0.0393631316764622,
"grad_norm": 7.435621738433838,
"learning_rate": 0.00011802099357524204,
"loss": 8.1774,
"step": 8700
},
{
"epoch": 0.03945362163433913,
"grad_norm": 12.7384672164917,
"learning_rate": 0.00011829246222061351,
"loss": 8.1297,
"step": 8720
},
{
"epoch": 0.039544111592216054,
"grad_norm": 14.0343017578125,
"learning_rate": 0.00011856393086598497,
"loss": 8.1406,
"step": 8740
},
{
"epoch": 0.03963460155009298,
"grad_norm": 15.325870513916016,
"learning_rate": 0.00011883539951135643,
"loss": 8.1619,
"step": 8760
},
{
"epoch": 0.039725091507969906,
"grad_norm": 21.650548934936523,
"learning_rate": 0.00011910686815672788,
"loss": 8.193,
"step": 8780
},
{
"epoch": 0.03981558146584683,
"grad_norm": 15.605712890625,
"learning_rate": 0.00011937833680209935,
"loss": 8.1709,
"step": 8800
},
{
"epoch": 0.03990607142372375,
"grad_norm": 10.788895606994629,
"learning_rate": 0.00011964980544747081,
"loss": 8.1451,
"step": 8820
},
{
"epoch": 0.039996561381600675,
"grad_norm": 16.377477645874023,
"learning_rate": 0.00011992127409284227,
"loss": 8.134,
"step": 8840
},
{
"epoch": 0.040087051339477604,
"grad_norm": 13.106194496154785,
"learning_rate": 0.00012019274273821374,
"loss": 8.1352,
"step": 8860
},
{
"epoch": 0.04017754129735453,
"grad_norm": 11.152835845947266,
"learning_rate": 0.0001204642113835852,
"loss": 8.1138,
"step": 8880
},
{
"epoch": 0.04026803125523145,
"grad_norm": 9.210712432861328,
"learning_rate": 0.00012073568002895666,
"loss": 8.1769,
"step": 8900
},
{
"epoch": 0.04035852121310837,
"grad_norm": 12.555234909057617,
"learning_rate": 0.00012100714867432813,
"loss": 8.1383,
"step": 8920
},
{
"epoch": 0.0404490111709853,
"grad_norm": 12.013688087463379,
"learning_rate": 0.00012127861731969958,
"loss": 8.1564,
"step": 8940
},
{
"epoch": 0.040539501128862225,
"grad_norm": 9.827411651611328,
"learning_rate": 0.00012155008596507101,
"loss": 8.1348,
"step": 8960
},
{
"epoch": 0.04062999108673915,
"grad_norm": 11.609356880187988,
"learning_rate": 0.00012182155461044248,
"loss": 8.1646,
"step": 8980
},
{
"epoch": 0.04072048104461607,
"grad_norm": 13.045088768005371,
"learning_rate": 0.00012209302325581395,
"loss": 8.1628,
"step": 9000
},
{
"epoch": 0.040810971002493,
"grad_norm": 12.780691146850586,
"learning_rate": 0.00012236449190118542,
"loss": 8.1487,
"step": 9020
},
{
"epoch": 0.04090146096036992,
"grad_norm": 10.65334701538086,
"learning_rate": 0.00012263596054655685,
"loss": 8.1275,
"step": 9040
},
{
"epoch": 0.040991950918246846,
"grad_norm": 8.080134391784668,
"learning_rate": 0.00012290742919192832,
"loss": 8.1356,
"step": 9060
},
{
"epoch": 0.041082440876123776,
"grad_norm": 12.708916664123535,
"learning_rate": 0.00012317889783729978,
"loss": 8.1606,
"step": 9080
},
{
"epoch": 0.0411729308340007,
"grad_norm": 13.570298194885254,
"learning_rate": 0.00012345036648267124,
"loss": 8.1389,
"step": 9100
},
{
"epoch": 0.04126342079187762,
"grad_norm": 13.237983703613281,
"learning_rate": 0.0001237218351280427,
"loss": 8.1243,
"step": 9120
},
{
"epoch": 0.041353910749754544,
"grad_norm": 14.53023910522461,
"learning_rate": 0.00012399330377341417,
"loss": 8.1191,
"step": 9140
},
{
"epoch": 0.041444400707631474,
"grad_norm": 11.765192031860352,
"learning_rate": 0.00012426477241878563,
"loss": 8.1031,
"step": 9160
},
{
"epoch": 0.041534890665508396,
"grad_norm": 11.261069297790527,
"learning_rate": 0.0001245362410641571,
"loss": 8.1504,
"step": 9180
},
{
"epoch": 0.04162538062338532,
"grad_norm": 13.039865493774414,
"learning_rate": 0.00012480770970952856,
"loss": 8.1186,
"step": 9200
},
{
"epoch": 0.04171587058126224,
"grad_norm": 11.21242904663086,
"learning_rate": 0.0001250791783549,
"loss": 8.1244,
"step": 9220
},
{
"epoch": 0.04180636053913917,
"grad_norm": 13.84521770477295,
"learning_rate": 0.00012535064700027146,
"loss": 8.1442,
"step": 9240
},
{
"epoch": 0.041896850497016094,
"grad_norm": 14.333518981933594,
"learning_rate": 0.00012562211564564292,
"loss": 8.1628,
"step": 9260
},
{
"epoch": 0.04198734045489302,
"grad_norm": 12.016851425170898,
"learning_rate": 0.00012589358429101438,
"loss": 8.1037,
"step": 9280
},
{
"epoch": 0.04207783041276994,
"grad_norm": 9.183259010314941,
"learning_rate": 0.00012616505293638585,
"loss": 8.1429,
"step": 9300
},
{
"epoch": 0.04216832037064687,
"grad_norm": 13.651033401489258,
"learning_rate": 0.0001264365215817573,
"loss": 8.1202,
"step": 9320
},
{
"epoch": 0.04225881032852379,
"grad_norm": 11.869391441345215,
"learning_rate": 0.00012670799022712877,
"loss": 8.1125,
"step": 9340
},
{
"epoch": 0.042349300286400715,
"grad_norm": 15.943286895751953,
"learning_rate": 0.00012697945887250024,
"loss": 8.1694,
"step": 9360
},
{
"epoch": 0.04243979024427764,
"grad_norm": 13.450387001037598,
"learning_rate": 0.00012725092751787167,
"loss": 8.1379,
"step": 9380
},
{
"epoch": 0.04253028020215457,
"grad_norm": 15.152196884155273,
"learning_rate": 0.00012752239616324314,
"loss": 8.1391,
"step": 9400
},
{
"epoch": 0.04262077016003149,
"grad_norm": 15.109274864196777,
"learning_rate": 0.0001277938648086146,
"loss": 8.0963,
"step": 9420
},
{
"epoch": 0.04271126011790841,
"grad_norm": 10.3173189163208,
"learning_rate": 0.00012806533345398606,
"loss": 8.1557,
"step": 9440
},
{
"epoch": 0.04280175007578534,
"grad_norm": 11.38595962524414,
"learning_rate": 0.00012833680209935753,
"loss": 8.173,
"step": 9460
},
{
"epoch": 0.042892240033662266,
"grad_norm": 11.458219528198242,
"learning_rate": 0.00012859469731246043,
"loss": 8.2542,
"step": 9480
},
{
"epoch": 0.04298272999153919,
"grad_norm": 14.253256797790527,
"learning_rate": 0.00012886616595783186,
"loss": 8.1687,
"step": 9500
},
{
"epoch": 0.04307321994941611,
"grad_norm": 14.074560165405273,
"learning_rate": 0.00012913763460320333,
"loss": 8.1175,
"step": 9520
},
{
"epoch": 0.04316370990729304,
"grad_norm": 14.521282196044922,
"learning_rate": 0.00012939552981630623,
"loss": 8.1456,
"step": 9540
},
{
"epoch": 0.043254199865169964,
"grad_norm": 12.537208557128906,
"learning_rate": 0.0001296669984616777,
"loss": 8.1432,
"step": 9560
},
{
"epoch": 0.043344689823046886,
"grad_norm": 10.885902404785156,
"learning_rate": 0.00012993846710704915,
"loss": 8.1875,
"step": 9580
},
{
"epoch": 0.04343517978092381,
"grad_norm": 10.156676292419434,
"learning_rate": 0.0001302099357524206,
"loss": 8.1728,
"step": 9600
},
{
"epoch": 0.04352566973880074,
"grad_norm": 13.31322193145752,
"learning_rate": 0.00013048140439779205,
"loss": 8.1394,
"step": 9620
},
{
"epoch": 0.04361615969667766,
"grad_norm": 7.779819488525391,
"learning_rate": 0.0001307528730431635,
"loss": 8.139,
"step": 9640
},
{
"epoch": 0.043706649654554584,
"grad_norm": 12.208565711975098,
"learning_rate": 0.00013102434168853495,
"loss": 8.1346,
"step": 9660
},
{
"epoch": 0.04379713961243151,
"grad_norm": 11.362008094787598,
"learning_rate": 0.00013129581033390642,
"loss": 8.1419,
"step": 9680
},
{
"epoch": 0.04388762957030844,
"grad_norm": 11.86789321899414,
"learning_rate": 0.00013156727897927788,
"loss": 8.1475,
"step": 9700
},
{
"epoch": 0.04397811952818536,
"grad_norm": 14.61185073852539,
"learning_rate": 0.00013183874762464934,
"loss": 8.1582,
"step": 9720
},
{
"epoch": 0.04406860948606228,
"grad_norm": 11.60112190246582,
"learning_rate": 0.0001321102162700208,
"loss": 8.1073,
"step": 9740
},
{
"epoch": 0.04415909944393921,
"grad_norm": 13.442856788635254,
"learning_rate": 0.00013238168491539227,
"loss": 8.1358,
"step": 9760
},
{
"epoch": 0.044249589401816135,
"grad_norm": 11.524395942687988,
"learning_rate": 0.00013265315356076373,
"loss": 8.1083,
"step": 9780
},
{
"epoch": 0.04434007935969306,
"grad_norm": 13.528814315795898,
"learning_rate": 0.0001329246222061352,
"loss": 8.1392,
"step": 9800
},
{
"epoch": 0.04443056931756998,
"grad_norm": 18.11868667602539,
"learning_rate": 0.00013319609085150666,
"loss": 8.1784,
"step": 9820
},
{
"epoch": 0.04452105927544691,
"grad_norm": 15.858280181884766,
"learning_rate": 0.00013346755949687812,
"loss": 8.1597,
"step": 9840
},
{
"epoch": 0.04461154923332383,
"grad_norm": 14.466769218444824,
"learning_rate": 0.00013373902814224956,
"loss": 8.1632,
"step": 9860
},
{
"epoch": 0.044702039191200756,
"grad_norm": 11.416616439819336,
"learning_rate": 0.00013401049678762102,
"loss": 8.1681,
"step": 9880
},
{
"epoch": 0.04479252914907768,
"grad_norm": 39.87081527709961,
"learning_rate": 0.00013428196543299249,
"loss": 8.1384,
"step": 9900
},
{
"epoch": 0.04488301910695461,
"grad_norm": 11.689374923706055,
"learning_rate": 0.0001345398606460954,
"loss": 8.5619,
"step": 9920
},
{
"epoch": 0.04497350906483153,
"grad_norm": 10.53484058380127,
"learning_rate": 0.00013481132929146682,
"loss": 9.1495,
"step": 9940
},
{
"epoch": 0.045063999022708454,
"grad_norm": 12.07006549835205,
"learning_rate": 0.00013508279793683829,
"loss": 9.1771,
"step": 9960
},
{
"epoch": 0.045154488980585376,
"grad_norm": 9.795348167419434,
"learning_rate": 0.00013535426658220975,
"loss": 9.1545,
"step": 9980
},
{
"epoch": 0.045244978938462306,
"grad_norm": 10.068339347839355,
"learning_rate": 0.0001356257352275812,
"loss": 9.1969,
"step": 10000
},
{
"epoch": 0.045244978938462306,
"eval_accuracy": 0.022879129772772476,
"eval_loss": 9.148832321166992,
"eval_runtime": 212.7494,
"eval_samples_per_second": 2857.071,
"eval_steps_per_second": 11.163,
"step": 10000
},
{
"epoch": 0.04533546889633923,
"grad_norm": 12.951713562011719,
"learning_rate": 0.00013589720387295268,
"loss": 9.154,
"step": 10020
},
{
"epoch": 0.04542595885421615,
"grad_norm": 9.139362335205078,
"learning_rate": 0.00013616867251832414,
"loss": 9.154,
"step": 10040
},
{
"epoch": 0.04551644881209308,
"grad_norm": 8.388337135314941,
"learning_rate": 0.0001364401411636956,
"loss": 9.1391,
"step": 10060
},
{
"epoch": 0.045606938769970004,
"grad_norm": 10.0809326171875,
"learning_rate": 0.00013671160980906704,
"loss": 9.1417,
"step": 10080
},
{
"epoch": 0.04569742872784693,
"grad_norm": 8.565701484680176,
"learning_rate": 0.0001369830784544385,
"loss": 9.1112,
"step": 10100
},
{
"epoch": 0.04578791868572385,
"grad_norm": 10.437520027160645,
"learning_rate": 0.00013725454709980997,
"loss": 9.1169,
"step": 10120
},
{
"epoch": 0.04587840864360078,
"grad_norm": 8.615896224975586,
"learning_rate": 0.00013752601574518143,
"loss": 9.1003,
"step": 10140
},
{
"epoch": 0.0459688986014777,
"grad_norm": 10.89583683013916,
"learning_rate": 0.0001377974843905529,
"loss": 9.101,
"step": 10160
},
{
"epoch": 0.046059388559354625,
"grad_norm": 9.786931991577148,
"learning_rate": 0.00013806895303592433,
"loss": 9.0689,
"step": 10180
},
{
"epoch": 0.04614987851723155,
"grad_norm": 9.010174751281738,
"learning_rate": 0.0001383404216812958,
"loss": 9.0579,
"step": 10200
},
{
"epoch": 0.04624036847510848,
"grad_norm": 11.039669036865234,
"learning_rate": 0.00013861189032666725,
"loss": 9.0865,
"step": 10220
},
{
"epoch": 0.0463308584329854,
"grad_norm": 12.055830001831055,
"learning_rate": 0.00013888335897203872,
"loss": 9.0955,
"step": 10240
},
{
"epoch": 0.04642134839086232,
"grad_norm": 8.361885070800781,
"learning_rate": 0.00013915482761741018,
"loss": 9.07,
"step": 10260
},
{
"epoch": 0.046511838348739246,
"grad_norm": 7.196146011352539,
"learning_rate": 0.00013942629626278164,
"loss": 9.0528,
"step": 10280
},
{
"epoch": 0.046602328306616175,
"grad_norm": 9.67076587677002,
"learning_rate": 0.0001396977649081531,
"loss": 9.0546,
"step": 10300
},
{
"epoch": 0.0466928182644931,
"grad_norm": 10.09327220916748,
"learning_rate": 0.00013996923355352457,
"loss": 9.0741,
"step": 10320
},
{
"epoch": 0.04678330822237002,
"grad_norm": 9.639015197753906,
"learning_rate": 0.00014024070219889603,
"loss": 9.0633,
"step": 10340
},
{
"epoch": 0.04687379818024695,
"grad_norm": 10.251932144165039,
"learning_rate": 0.0001405121708442675,
"loss": 9.0446,
"step": 10360
},
{
"epoch": 0.04696428813812387,
"grad_norm": 11.07875919342041,
"learning_rate": 0.00014078363948963896,
"loss": 9.0418,
"step": 10380
},
{
"epoch": 0.047054778096000796,
"grad_norm": 9.328507423400879,
"learning_rate": 0.00014105510813501042,
"loss": 9.0287,
"step": 10400
},
{
"epoch": 0.04714526805387772,
"grad_norm": 7.056753635406494,
"learning_rate": 0.00014132657678038186,
"loss": 9.0362,
"step": 10420
},
{
"epoch": 0.04723575801175465,
"grad_norm": 8.899680137634277,
"learning_rate": 0.0001415980454257533,
"loss": 9.036,
"step": 10440
},
{
"epoch": 0.04732624796963157,
"grad_norm": 9.175132751464844,
"learning_rate": 0.00014186951407112476,
"loss": 9.0444,
"step": 10460
},
{
"epoch": 0.047416737927508494,
"grad_norm": 9.374978065490723,
"learning_rate": 0.00014214098271649622,
"loss": 9.0372,
"step": 10480
},
{
"epoch": 0.04750722788538542,
"grad_norm": 9.893750190734863,
"learning_rate": 0.00014241245136186769,
"loss": 9.0424,
"step": 10500
},
{
"epoch": 0.04759771784326235,
"grad_norm": 7.787280082702637,
"learning_rate": 0.00014265677314270202,
"loss": 8.9691,
"step": 10520
},
{
"epoch": 0.04768820780113927,
"grad_norm": 17.40734100341797,
"learning_rate": 0.00014277893403311917,
"loss": 8.2225,
"step": 10540
},
{
"epoch": 0.04777869775901619,
"grad_norm": null,
"learning_rate": 0.00014286037462673062,
"loss": 6.6046,
"step": 10560
},
{
"epoch": 0.047869187716893115,
"grad_norm": null,
"learning_rate": 0.0001429146683558049,
"loss": 3.0921,
"step": 10580
},
{
"epoch": 0.047959677674770045,
"grad_norm": null,
"learning_rate": 0.00014294181522034205,
"loss": 3.9765,
"step": 10600
},
{
"epoch": 0.04805016763264697,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 6.9972,
"step": 10620
},
{
"epoch": 0.04814065759052389,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10640
},
{
"epoch": 0.04823114754840081,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10660
},
{
"epoch": 0.04832163750627774,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10680
},
{
"epoch": 0.048412127464154665,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10700
},
{
"epoch": 0.04850261742203159,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10720
},
{
"epoch": 0.04859310737990852,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10740
},
{
"epoch": 0.04868359733778544,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10760
},
{
"epoch": 0.04877408729566236,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10780
},
{
"epoch": 0.048864577253539286,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10800
},
{
"epoch": 0.048955067211416216,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10820
},
{
"epoch": 0.04904555716929314,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10840
},
{
"epoch": 0.04913604712717006,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10860
},
{
"epoch": 0.049226537085046984,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10880
},
{
"epoch": 0.049317027042923914,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10900
},
{
"epoch": 0.04940751700080084,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10920
},
{
"epoch": 0.04949800695867776,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10940
},
{
"epoch": 0.04958849691655468,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10960
},
{
"epoch": 0.04967898687443161,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 10980
},
{
"epoch": 0.049769476832308535,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11000
},
{
"epoch": 0.04985996679018546,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11020
},
{
"epoch": 0.04995045674806239,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11040
},
{
"epoch": 0.05004094670593931,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11060
},
{
"epoch": 0.05013143666381623,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11080
},
{
"epoch": 0.050221926621693155,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11100
},
{
"epoch": 0.050312416579570085,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11120
},
{
"epoch": 0.05040290653744701,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11140
},
{
"epoch": 0.05049339649532393,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11160
},
{
"epoch": 0.05058388645320085,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11180
},
{
"epoch": 0.05067437641107778,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11200
},
{
"epoch": 0.050764866368954706,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11220
},
{
"epoch": 0.05085535632683163,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11240
},
{
"epoch": 0.05094584628470855,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11260
},
{
"epoch": 0.05103633624258548,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11280
},
{
"epoch": 0.051126826200462404,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11300
},
{
"epoch": 0.05121731615833933,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11320
},
{
"epoch": 0.051307806116216256,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11340
},
{
"epoch": 0.05139829607409318,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11360
},
{
"epoch": 0.0514887860319701,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11380
},
{
"epoch": 0.051579275989847025,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11400
},
{
"epoch": 0.051669765947723954,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11420
},
{
"epoch": 0.05176025590560088,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11440
},
{
"epoch": 0.0518507458634778,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11460
},
{
"epoch": 0.05194123582135472,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11480
},
{
"epoch": 0.05203172577923165,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11500
},
{
"epoch": 0.052122215737108575,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11520
},
{
"epoch": 0.0522127056949855,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11540
},
{
"epoch": 0.05230319565286242,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11560
},
{
"epoch": 0.05239368561073935,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11580
},
{
"epoch": 0.05248417556861627,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11600
},
{
"epoch": 0.052574665526493196,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11620
},
{
"epoch": 0.05266515548437012,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11640
},
{
"epoch": 0.05275564544224705,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11660
},
{
"epoch": 0.05284613540012397,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11680
},
{
"epoch": 0.052936625358000894,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11700
},
{
"epoch": 0.053027115315877824,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11720
},
{
"epoch": 0.053117605273754746,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11740
},
{
"epoch": 0.05320809523163167,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11760
},
{
"epoch": 0.05329858518950859,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11780
},
{
"epoch": 0.05338907514738552,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11800
},
{
"epoch": 0.053479565105262444,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11820
},
{
"epoch": 0.05357005506313937,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11840
},
{
"epoch": 0.05366054502101629,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11860
},
{
"epoch": 0.05375103497889322,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11880
},
{
"epoch": 0.05384152493677014,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11900
},
{
"epoch": 0.053932014894647065,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11920
},
{
"epoch": 0.05402250485252399,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11940
},
{
"epoch": 0.05411299481040092,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11960
},
{
"epoch": 0.05420348476827784,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 11980
},
{
"epoch": 0.05429397472615476,
"grad_norm": null,
"learning_rate": 0.00014298253551714776,
"loss": 0.0,
"step": 12000
},
{
"epoch": 0.05429397472615476,
"eval_accuracy": 0.021626624590642192,
"eval_loss": null,
"eval_runtime": 218.9297,
"eval_samples_per_second": 2776.417,
"eval_steps_per_second": 10.848,
"step": 12000
}
],
"logging_steps": 20,
"max_steps": 663057,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 4315086323712000.0,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}