{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05429397472615476, "eval_steps": 2000, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.048995787692461e-05, "grad_norm": 1.1874778270721436, "learning_rate": 2.7146864537145957e-07, "loss": 10.3312, "step": 20 }, { "epoch": 0.00018097991575384922, "grad_norm": 1.3932149410247803, "learning_rate": 5.429372907429191e-07, "loss": 10.3266, "step": 40 }, { "epoch": 0.00027146987363077383, "grad_norm": 1.2732529640197754, "learning_rate": 8.144059361143787e-07, "loss": 10.3163, "step": 60 }, { "epoch": 0.00036195983150769844, "grad_norm": 1.07429039478302, "learning_rate": 1.0858745814858383e-06, "loss": 10.3044, "step": 80 }, { "epoch": 0.00045244978938462305, "grad_norm": 1.0309141874313354, "learning_rate": 1.357343226857298e-06, "loss": 10.2959, "step": 100 }, { "epoch": 0.0005429397472615477, "grad_norm": 0.9270058870315552, "learning_rate": 1.6288118722287574e-06, "loss": 10.2818, "step": 120 }, { "epoch": 0.0006334297051384723, "grad_norm": 0.8409116864204407, "learning_rate": 1.900280517600217e-06, "loss": 10.2724, "step": 140 }, { "epoch": 0.0007239196630153969, "grad_norm": 0.7587267160415649, "learning_rate": 2.1717491629716765e-06, "loss": 10.2662, "step": 160 }, { "epoch": 0.0008144096208923215, "grad_norm": 0.8605366945266724, "learning_rate": 2.4432178083431364e-06, "loss": 10.2567, "step": 180 }, { "epoch": 0.0009048995787692461, "grad_norm": 0.8124440908432007, "learning_rate": 2.714686453714596e-06, "loss": 10.2513, "step": 200 }, { "epoch": 0.0009953895366461706, "grad_norm": 0.8214222192764282, "learning_rate": 2.9861550990860553e-06, "loss": 10.2396, "step": 220 }, { "epoch": 0.0010858794945230953, "grad_norm": 0.7500312924385071, "learning_rate": 3.2576237444575148e-06, "loss": 10.2378, "step": 240 }, { "epoch": 0.0011763694524000198, "grad_norm": 0.7709519267082214, "learning_rate": 3.529092389828975e-06, "loss": 10.2287, "step": 260 }, { "epoch": 0.0012668594102769445, "grad_norm": 0.8319140672683716, "learning_rate": 3.800561035200434e-06, "loss": 10.2214, "step": 280 }, { "epoch": 0.001357349368153869, "grad_norm": 0.8057898283004761, "learning_rate": 4.072029680571894e-06, "loss": 10.2072, "step": 300 }, { "epoch": 0.0014478393260307938, "grad_norm": 0.6834843754768372, "learning_rate": 4.343498325943353e-06, "loss": 10.1983, "step": 320 }, { "epoch": 0.0015383292839077183, "grad_norm": 0.8223700523376465, "learning_rate": 4.614966971314813e-06, "loss": 10.1884, "step": 340 }, { "epoch": 0.001628819241784643, "grad_norm": 0.8147690892219543, "learning_rate": 4.886435616686273e-06, "loss": 10.1814, "step": 360 }, { "epoch": 0.0017193091996615675, "grad_norm": 0.8512526750564575, "learning_rate": 5.157904262057733e-06, "loss": 10.1713, "step": 380 }, { "epoch": 0.0018097991575384922, "grad_norm": 0.8844230771064758, "learning_rate": 5.429372907429192e-06, "loss": 10.1572, "step": 400 }, { "epoch": 0.0019002891154154167, "grad_norm": 0.9605993628501892, "learning_rate": 5.700841552800652e-06, "loss": 10.1496, "step": 420 }, { "epoch": 0.001990779073292341, "grad_norm": 1.2027961015701294, "learning_rate": 5.972310198172111e-06, "loss": 10.1298, "step": 440 }, { "epoch": 0.002081269031169266, "grad_norm": 1.4069308042526245, "learning_rate": 6.2437788435435705e-06, "loss": 10.1092, "step": 460 }, { "epoch": 0.0021717589890461906, "grad_norm": 1.7658456563949585, "learning_rate": 6.5152474889150296e-06, "loss": 10.0954, "step": 480 }, { "epoch": 0.002262248946923115, "grad_norm": 1.6941689252853394, "learning_rate": 6.7867161342864895e-06, "loss": 10.0746, "step": 500 }, { "epoch": 0.0023527389048000396, "grad_norm": 2.362786293029785, "learning_rate": 7.05818477965795e-06, "loss": 10.0613, "step": 520 }, { "epoch": 0.0024432288626769646, "grad_norm": 1.827091932296753, "learning_rate": 7.329653425029408e-06, "loss": 10.045, "step": 540 }, { "epoch": 0.002533718820553889, "grad_norm": 2.136615753173828, "learning_rate": 7.601122070400868e-06, "loss": 10.0243, "step": 560 }, { "epoch": 0.0026242087784308136, "grad_norm": 2.501790761947632, "learning_rate": 7.872590715772328e-06, "loss": 10.0091, "step": 580 }, { "epoch": 0.002714698736307738, "grad_norm": 2.7978005409240723, "learning_rate": 8.144059361143788e-06, "loss": 9.9957, "step": 600 }, { "epoch": 0.002805188694184663, "grad_norm": 3.0485517978668213, "learning_rate": 8.415528006515246e-06, "loss": 9.9819, "step": 620 }, { "epoch": 0.0028956786520615875, "grad_norm": 2.761986255645752, "learning_rate": 8.686996651886706e-06, "loss": 9.9596, "step": 640 }, { "epoch": 0.002986168609938512, "grad_norm": 3.0985260009765625, "learning_rate": 8.958465297258166e-06, "loss": 9.9436, "step": 660 }, { "epoch": 0.0030766585678154365, "grad_norm": 2.40391206741333, "learning_rate": 9.229933942629626e-06, "loss": 9.9226, "step": 680 }, { "epoch": 0.0031671485256923614, "grad_norm": 1.933740496635437, "learning_rate": 9.501402588001086e-06, "loss": 9.9069, "step": 700 }, { "epoch": 0.003257638483569286, "grad_norm": 2.518874168395996, "learning_rate": 9.772871233372546e-06, "loss": 9.881, "step": 720 }, { "epoch": 0.0033481284414462104, "grad_norm": 2.8025624752044678, "learning_rate": 1.0044339878744006e-05, "loss": 9.8686, "step": 740 }, { "epoch": 0.003438618399323135, "grad_norm": 1.943656086921692, "learning_rate": 1.0315808524115465e-05, "loss": 9.8463, "step": 760 }, { "epoch": 0.00352910835720006, "grad_norm": 1.753179907798767, "learning_rate": 1.0587277169486925e-05, "loss": 9.8344, "step": 780 }, { "epoch": 0.0036195983150769844, "grad_norm": 1.9388506412506104, "learning_rate": 1.0858745814858383e-05, "loss": 9.8144, "step": 800 }, { "epoch": 0.003710088272953909, "grad_norm": 2.6278536319732666, "learning_rate": 1.1130214460229843e-05, "loss": 9.8008, "step": 820 }, { "epoch": 0.0038005782308308334, "grad_norm": 1.8270655870437622, "learning_rate": 1.1401683105601303e-05, "loss": 9.7791, "step": 840 }, { "epoch": 0.0038910681887077583, "grad_norm": 1.656563639640808, "learning_rate": 1.1673151750972763e-05, "loss": 9.7677, "step": 860 }, { "epoch": 0.003981558146584682, "grad_norm": 1.6003910303115845, "learning_rate": 1.1944620396344221e-05, "loss": 9.7465, "step": 880 }, { "epoch": 0.004072048104461608, "grad_norm": 1.5632762908935547, "learning_rate": 1.2216089041715681e-05, "loss": 9.73, "step": 900 }, { "epoch": 0.004162538062338532, "grad_norm": 1.4974184036254883, "learning_rate": 1.2487557687087141e-05, "loss": 9.7067, "step": 920 }, { "epoch": 0.004253028020215457, "grad_norm": 1.811112880706787, "learning_rate": 1.2759026332458601e-05, "loss": 9.6956, "step": 940 }, { "epoch": 0.004343517978092381, "grad_norm": 1.505334734916687, "learning_rate": 1.3030494977830059e-05, "loss": 9.6667, "step": 960 }, { "epoch": 0.004434007935969306, "grad_norm": 1.6951265335083008, "learning_rate": 1.3301963623201519e-05, "loss": 9.6505, "step": 980 }, { "epoch": 0.00452449789384623, "grad_norm": 1.6119604110717773, "learning_rate": 1.3573432268572979e-05, "loss": 9.6381, "step": 1000 }, { "epoch": 0.004614987851723155, "grad_norm": 1.1929903030395508, "learning_rate": 1.3844900913944439e-05, "loss": 9.6209, "step": 1020 }, { "epoch": 0.004705477809600079, "grad_norm": 1.5701353549957275, "learning_rate": 1.41163695593159e-05, "loss": 9.5956, "step": 1040 }, { "epoch": 0.004795967767477005, "grad_norm": 1.32628333568573, "learning_rate": 1.4387838204687359e-05, "loss": 9.5899, "step": 1060 }, { "epoch": 0.004886457725353929, "grad_norm": 1.5850657224655151, "learning_rate": 1.4659306850058817e-05, "loss": 9.5779, "step": 1080 }, { "epoch": 0.004976947683230854, "grad_norm": 1.3933109045028687, "learning_rate": 1.4930775495430278e-05, "loss": 9.5701, "step": 1100 }, { "epoch": 0.005067437641107778, "grad_norm": 1.258367657661438, "learning_rate": 1.5202244140801737e-05, "loss": 9.5468, "step": 1120 }, { "epoch": 0.005157927598984703, "grad_norm": 1.3926512002944946, "learning_rate": 1.5473712786173196e-05, "loss": 9.5392, "step": 1140 }, { "epoch": 0.005248417556861627, "grad_norm": 1.1674704551696777, "learning_rate": 1.5745181431544656e-05, "loss": 9.5291, "step": 1160 }, { "epoch": 0.005338907514738552, "grad_norm": 1.4704829454421997, "learning_rate": 1.6016650076916116e-05, "loss": 9.5219, "step": 1180 }, { "epoch": 0.005429397472615476, "grad_norm": 1.6223082542419434, "learning_rate": 1.6288118722287576e-05, "loss": 9.4918, "step": 1200 }, { "epoch": 0.0055198874304924015, "grad_norm": 1.8586570024490356, "learning_rate": 1.6559587367659036e-05, "loss": 9.4895, "step": 1220 }, { "epoch": 0.005610377388369326, "grad_norm": 1.4105405807495117, "learning_rate": 1.6831056013030492e-05, "loss": 9.4886, "step": 1240 }, { "epoch": 0.0057008673462462505, "grad_norm": 1.4756163358688354, "learning_rate": 1.7102524658401956e-05, "loss": 9.4702, "step": 1260 }, { "epoch": 0.005791357304123175, "grad_norm": 1.3847874402999878, "learning_rate": 1.7373993303773412e-05, "loss": 9.4638, "step": 1280 }, { "epoch": 0.0058818472620000995, "grad_norm": 1.5135865211486816, "learning_rate": 1.7645461949144875e-05, "loss": 9.4583, "step": 1300 }, { "epoch": 0.005972337219877024, "grad_norm": 1.462760329246521, "learning_rate": 1.7916930594516332e-05, "loss": 9.4292, "step": 1320 }, { "epoch": 0.0060628271777539485, "grad_norm": 1.646760106086731, "learning_rate": 1.8188399239887792e-05, "loss": 9.4419, "step": 1340 }, { "epoch": 0.006153317135630873, "grad_norm": 1.3564046621322632, "learning_rate": 1.8459867885259252e-05, "loss": 9.4283, "step": 1360 }, { "epoch": 0.006243807093507798, "grad_norm": 1.4385489225387573, "learning_rate": 1.873133653063071e-05, "loss": 9.4208, "step": 1380 }, { "epoch": 0.006334297051384723, "grad_norm": 1.3975261449813843, "learning_rate": 1.900280517600217e-05, "loss": 9.4015, "step": 1400 }, { "epoch": 0.006424787009261647, "grad_norm": 1.4809174537658691, "learning_rate": 1.927427382137363e-05, "loss": 9.4009, "step": 1420 }, { "epoch": 0.006515276967138572, "grad_norm": 1.5181605815887451, "learning_rate": 1.954574246674509e-05, "loss": 9.3969, "step": 1440 }, { "epoch": 0.006605766925015496, "grad_norm": 1.4760838747024536, "learning_rate": 1.981721111211655e-05, "loss": 9.395, "step": 1460 }, { "epoch": 0.006696256882892421, "grad_norm": 1.6140539646148682, "learning_rate": 2.008867975748801e-05, "loss": 9.3868, "step": 1480 }, { "epoch": 0.006786746840769345, "grad_norm": 1.469307541847229, "learning_rate": 2.0360148402859468e-05, "loss": 9.3766, "step": 1500 }, { "epoch": 0.00687723679864627, "grad_norm": 1.8742159605026245, "learning_rate": 2.063161704823093e-05, "loss": 9.3715, "step": 1520 }, { "epoch": 0.006967726756523195, "grad_norm": 1.5996043682098389, "learning_rate": 2.0903085693602387e-05, "loss": 9.3622, "step": 1540 }, { "epoch": 0.00705821671440012, "grad_norm": 1.867632508277893, "learning_rate": 2.117455433897385e-05, "loss": 9.3704, "step": 1560 }, { "epoch": 0.007148706672277044, "grad_norm": 1.4762872457504272, "learning_rate": 2.1446022984345307e-05, "loss": 9.3741, "step": 1580 }, { "epoch": 0.007239196630153969, "grad_norm": 1.5752198696136475, "learning_rate": 2.1717491629716767e-05, "loss": 9.3561, "step": 1600 }, { "epoch": 0.007329686588030893, "grad_norm": 1.637786865234375, "learning_rate": 2.1988960275088227e-05, "loss": 9.3535, "step": 1620 }, { "epoch": 0.007420176545907818, "grad_norm": 2.6087028980255127, "learning_rate": 2.2260428920459687e-05, "loss": 9.3541, "step": 1640 }, { "epoch": 0.007510666503784742, "grad_norm": 1.977252721786499, "learning_rate": 2.2531897565831143e-05, "loss": 9.3341, "step": 1660 }, { "epoch": 0.007601156461661667, "grad_norm": 1.9511388540267944, "learning_rate": 2.2803366211202606e-05, "loss": 9.339, "step": 1680 }, { "epoch": 0.007691646419538592, "grad_norm": 1.8821523189544678, "learning_rate": 2.3074834856574063e-05, "loss": 9.3234, "step": 1700 }, { "epoch": 0.007782136377415517, "grad_norm": 1.5517367124557495, "learning_rate": 2.3346303501945526e-05, "loss": 9.3367, "step": 1720 }, { "epoch": 0.00787262633529244, "grad_norm": 2.164625883102417, "learning_rate": 2.3617772147316983e-05, "loss": 9.3366, "step": 1740 }, { "epoch": 0.007963116293169365, "grad_norm": 2.4158406257629395, "learning_rate": 2.3889240792688443e-05, "loss": 9.3221, "step": 1760 }, { "epoch": 0.00805360625104629, "grad_norm": 1.8652360439300537, "learning_rate": 2.4160709438059906e-05, "loss": 9.3098, "step": 1780 }, { "epoch": 0.008144096208923216, "grad_norm": 1.8249917030334473, "learning_rate": 2.4432178083431362e-05, "loss": 9.3094, "step": 1800 }, { "epoch": 0.00823458616680014, "grad_norm": 2.06990647315979, "learning_rate": 2.4703646728802822e-05, "loss": 9.2994, "step": 1820 }, { "epoch": 0.008325076124677065, "grad_norm": 2.461805582046509, "learning_rate": 2.4975115374174282e-05, "loss": 9.3157, "step": 1840 }, { "epoch": 0.008415566082553989, "grad_norm": 2.1320767402648926, "learning_rate": 2.5246584019545742e-05, "loss": 9.281, "step": 1860 }, { "epoch": 0.008506056040430914, "grad_norm": 2.6872756481170654, "learning_rate": 2.5518052664917202e-05, "loss": 9.2917, "step": 1880 }, { "epoch": 0.008596545998307838, "grad_norm": 2.4759294986724854, "learning_rate": 2.5789521310288662e-05, "loss": 9.2941, "step": 1900 }, { "epoch": 0.008687035956184763, "grad_norm": 1.8129667043685913, "learning_rate": 2.6060989955660118e-05, "loss": 9.2815, "step": 1920 }, { "epoch": 0.008777525914061687, "grad_norm": 2.9053220748901367, "learning_rate": 2.633245860103158e-05, "loss": 9.2801, "step": 1940 }, { "epoch": 0.008868015871938612, "grad_norm": 2.412623167037964, "learning_rate": 2.6603927246403038e-05, "loss": 9.2719, "step": 1960 }, { "epoch": 0.008958505829815536, "grad_norm": 1.972790002822876, "learning_rate": 2.6875395891774498e-05, "loss": 9.2729, "step": 1980 }, { "epoch": 0.00904899578769246, "grad_norm": 3.04768705368042, "learning_rate": 2.7146864537145958e-05, "loss": 9.2653, "step": 2000 }, { "epoch": 0.00904899578769246, "eval_accuracy": 0.10545615706904701, "eval_loss": 9.261013984680176, "eval_runtime": 215.2628, "eval_samples_per_second": 2823.711, "eval_steps_per_second": 11.033, "step": 2000 }, { "epoch": 0.009139485745569385, "grad_norm": 2.2706515789031982, "learning_rate": 2.7418333182517418e-05, "loss": 9.2604, "step": 2020 }, { "epoch": 0.00922997570344631, "grad_norm": 2.297621011734009, "learning_rate": 2.7689801827888878e-05, "loss": 9.2367, "step": 2040 }, { "epoch": 0.009320465661323234, "grad_norm": 2.049971342086792, "learning_rate": 2.7961270473260337e-05, "loss": 9.2545, "step": 2060 }, { "epoch": 0.009410955619200159, "grad_norm": 2.3538951873779297, "learning_rate": 2.82327391186318e-05, "loss": 9.2511, "step": 2080 }, { "epoch": 0.009501445577077083, "grad_norm": 3.1383931636810303, "learning_rate": 2.8504207764003254e-05, "loss": 9.2319, "step": 2100 }, { "epoch": 0.00959193553495401, "grad_norm": 2.6480958461761475, "learning_rate": 2.8775676409374717e-05, "loss": 9.2353, "step": 2120 }, { "epoch": 0.009682425492830934, "grad_norm": 2.3209128379821777, "learning_rate": 2.9047145054746177e-05, "loss": 9.241, "step": 2140 }, { "epoch": 0.009772915450707858, "grad_norm": 2.3225491046905518, "learning_rate": 2.9318613700117634e-05, "loss": 9.2133, "step": 2160 }, { "epoch": 0.009863405408584783, "grad_norm": 2.0134568214416504, "learning_rate": 2.9590082345489093e-05, "loss": 9.2188, "step": 2180 }, { "epoch": 0.009953895366461707, "grad_norm": 3.033569574356079, "learning_rate": 2.9861550990860557e-05, "loss": 9.2131, "step": 2200 }, { "epoch": 0.010044385324338632, "grad_norm": 2.8993263244628906, "learning_rate": 3.0133019636232017e-05, "loss": 9.2119, "step": 2220 }, { "epoch": 0.010134875282215556, "grad_norm": 2.718588352203369, "learning_rate": 3.0404488281603473e-05, "loss": 9.2187, "step": 2240 }, { "epoch": 0.01022536524009248, "grad_norm": 2.635470390319824, "learning_rate": 3.0675956926974936e-05, "loss": 9.1953, "step": 2260 }, { "epoch": 0.010315855197969405, "grad_norm": 2.6032440662384033, "learning_rate": 3.094742557234639e-05, "loss": 9.1967, "step": 2280 }, { "epoch": 0.01040634515584633, "grad_norm": 2.4713950157165527, "learning_rate": 3.121889421771785e-05, "loss": 9.1881, "step": 2300 }, { "epoch": 0.010496835113723254, "grad_norm": 2.4573025703430176, "learning_rate": 3.149036286308931e-05, "loss": 9.1827, "step": 2320 }, { "epoch": 0.010587325071600179, "grad_norm": 2.6169447898864746, "learning_rate": 3.1761831508460776e-05, "loss": 9.1865, "step": 2340 }, { "epoch": 0.010677815029477103, "grad_norm": 2.6744954586029053, "learning_rate": 3.203330015383223e-05, "loss": 9.1829, "step": 2360 }, { "epoch": 0.010768304987354028, "grad_norm": 2.766223907470703, "learning_rate": 3.230476879920369e-05, "loss": 9.177, "step": 2380 }, { "epoch": 0.010858794945230952, "grad_norm": 2.8083655834198, "learning_rate": 3.257623744457515e-05, "loss": 9.1853, "step": 2400 }, { "epoch": 0.010949284903107877, "grad_norm": 4.484155178070068, "learning_rate": 3.284770608994661e-05, "loss": 9.1655, "step": 2420 }, { "epoch": 0.011039774860984803, "grad_norm": 3.5152087211608887, "learning_rate": 3.311917473531807e-05, "loss": 9.1516, "step": 2440 }, { "epoch": 0.011130264818861728, "grad_norm": 2.3122165203094482, "learning_rate": 3.339064338068953e-05, "loss": 9.1552, "step": 2460 }, { "epoch": 0.011220754776738652, "grad_norm": 3.0563108921051025, "learning_rate": 3.3662112026060985e-05, "loss": 9.1494, "step": 2480 }, { "epoch": 0.011311244734615577, "grad_norm": 3.926668882369995, "learning_rate": 3.393358067143245e-05, "loss": 9.1425, "step": 2500 }, { "epoch": 0.011401734692492501, "grad_norm": 2.7006709575653076, "learning_rate": 3.420504931680391e-05, "loss": 9.1328, "step": 2520 }, { "epoch": 0.011492224650369426, "grad_norm": 3.1082751750946045, "learning_rate": 3.447651796217537e-05, "loss": 9.1316, "step": 2540 }, { "epoch": 0.01158271460824635, "grad_norm": 2.744490385055542, "learning_rate": 3.4747986607546824e-05, "loss": 9.1193, "step": 2560 }, { "epoch": 0.011673204566123275, "grad_norm": 2.8441922664642334, "learning_rate": 3.501945525291829e-05, "loss": 9.1174, "step": 2580 }, { "epoch": 0.011763694524000199, "grad_norm": 3.7371647357940674, "learning_rate": 3.529092389828975e-05, "loss": 9.1217, "step": 2600 }, { "epoch": 0.011854184481877124, "grad_norm": 3.0141730308532715, "learning_rate": 3.556239254366121e-05, "loss": 9.0999, "step": 2620 }, { "epoch": 0.011944674439754048, "grad_norm": 2.9731669425964355, "learning_rate": 3.5833861189032664e-05, "loss": 9.1044, "step": 2640 }, { "epoch": 0.012035164397630973, "grad_norm": 3.166254997253418, "learning_rate": 3.610532983440413e-05, "loss": 9.103, "step": 2660 }, { "epoch": 0.012125654355507897, "grad_norm": 2.949646472930908, "learning_rate": 3.6376798479775584e-05, "loss": 9.1026, "step": 2680 }, { "epoch": 0.012216144313384822, "grad_norm": 2.762843132019043, "learning_rate": 3.664826712514705e-05, "loss": 9.1047, "step": 2700 }, { "epoch": 0.012306634271261746, "grad_norm": 3.188957929611206, "learning_rate": 3.6919735770518503e-05, "loss": 9.0968, "step": 2720 }, { "epoch": 0.01239712422913867, "grad_norm": 4.116425037384033, "learning_rate": 3.719120441588996e-05, "loss": 9.0993, "step": 2740 }, { "epoch": 0.012487614187015597, "grad_norm": 2.7521297931671143, "learning_rate": 3.746267306126142e-05, "loss": 9.063, "step": 2760 }, { "epoch": 0.012578104144892521, "grad_norm": 3.1481823921203613, "learning_rate": 3.7734141706632886e-05, "loss": 9.062, "step": 2780 }, { "epoch": 0.012668594102769446, "grad_norm": 2.48091721534729, "learning_rate": 3.800561035200434e-05, "loss": 9.0727, "step": 2800 }, { "epoch": 0.01275908406064637, "grad_norm": 3.0816426277160645, "learning_rate": 3.82770789973758e-05, "loss": 9.0525, "step": 2820 }, { "epoch": 0.012849574018523295, "grad_norm": 2.86342191696167, "learning_rate": 3.854854764274726e-05, "loss": 9.0447, "step": 2840 }, { "epoch": 0.01294006397640022, "grad_norm": 2.769746780395508, "learning_rate": 3.8820016288118726e-05, "loss": 9.0524, "step": 2860 }, { "epoch": 0.013030553934277144, "grad_norm": 3.4716339111328125, "learning_rate": 3.909148493349018e-05, "loss": 9.0453, "step": 2880 }, { "epoch": 0.013121043892154068, "grad_norm": 4.585721969604492, "learning_rate": 3.936295357886164e-05, "loss": 9.0466, "step": 2900 }, { "epoch": 0.013211533850030993, "grad_norm": 3.7394728660583496, "learning_rate": 3.96344222242331e-05, "loss": 9.0405, "step": 2920 }, { "epoch": 0.013302023807907917, "grad_norm": 3.9100561141967773, "learning_rate": 3.990589086960456e-05, "loss": 9.0415, "step": 2940 }, { "epoch": 0.013392513765784842, "grad_norm": 2.94941782951355, "learning_rate": 4.017735951497602e-05, "loss": 9.0265, "step": 2960 }, { "epoch": 0.013483003723661766, "grad_norm": 2.6733226776123047, "learning_rate": 4.044882816034748e-05, "loss": 9.0195, "step": 2980 }, { "epoch": 0.01357349368153869, "grad_norm": 3.4839463233947754, "learning_rate": 4.0720296805718935e-05, "loss": 9.0204, "step": 3000 }, { "epoch": 0.013663983639415615, "grad_norm": 3.460050344467163, "learning_rate": 4.09917654510904e-05, "loss": 9.0086, "step": 3020 }, { "epoch": 0.01375447359729254, "grad_norm": 4.007343769073486, "learning_rate": 4.126323409646186e-05, "loss": 9.0185, "step": 3040 }, { "epoch": 0.013844963555169464, "grad_norm": 3.917860746383667, "learning_rate": 4.153470274183331e-05, "loss": 9.0032, "step": 3060 }, { "epoch": 0.01393545351304639, "grad_norm": 3.5258123874664307, "learning_rate": 4.1806171387204775e-05, "loss": 8.9983, "step": 3080 }, { "epoch": 0.014025943470923315, "grad_norm": 3.002183198928833, "learning_rate": 4.207764003257624e-05, "loss": 8.9898, "step": 3100 }, { "epoch": 0.01411643342880024, "grad_norm": 3.2682976722717285, "learning_rate": 4.23491086779477e-05, "loss": 8.9932, "step": 3120 }, { "epoch": 0.014206923386677164, "grad_norm": 3.7955832481384277, "learning_rate": 4.262057732331915e-05, "loss": 8.9879, "step": 3140 }, { "epoch": 0.014297413344554089, "grad_norm": 3.3697524070739746, "learning_rate": 4.2892045968690614e-05, "loss": 8.9757, "step": 3160 }, { "epoch": 0.014387903302431013, "grad_norm": 3.756788730621338, "learning_rate": 4.316351461406208e-05, "loss": 8.9811, "step": 3180 }, { "epoch": 0.014478393260307938, "grad_norm": 3.024722099304199, "learning_rate": 4.3434983259433534e-05, "loss": 8.9613, "step": 3200 }, { "epoch": 0.014568883218184862, "grad_norm": 3.258375406265259, "learning_rate": 4.3706451904805e-05, "loss": 8.9614, "step": 3220 }, { "epoch": 0.014659373176061787, "grad_norm": 2.970426559448242, "learning_rate": 4.3977920550176454e-05, "loss": 8.9624, "step": 3240 }, { "epoch": 0.014749863133938711, "grad_norm": 4.601590156555176, "learning_rate": 4.424938919554791e-05, "loss": 8.9615, "step": 3260 }, { "epoch": 0.014840353091815636, "grad_norm": 4.773068428039551, "learning_rate": 4.4520857840919373e-05, "loss": 8.9668, "step": 3280 }, { "epoch": 0.01493084304969256, "grad_norm": 3.182677984237671, "learning_rate": 4.479232648629084e-05, "loss": 8.933, "step": 3300 }, { "epoch": 0.015021333007569485, "grad_norm": 3.160553455352783, "learning_rate": 4.5063795131662286e-05, "loss": 8.9409, "step": 3320 }, { "epoch": 0.015111822965446409, "grad_norm": 3.0617620944976807, "learning_rate": 4.533526377703375e-05, "loss": 8.95, "step": 3340 }, { "epoch": 0.015202312923323334, "grad_norm": 3.1966211795806885, "learning_rate": 4.560673242240521e-05, "loss": 8.9379, "step": 3360 }, { "epoch": 0.015292802881200258, "grad_norm": 2.3314368724823, "learning_rate": 4.587820106777667e-05, "loss": 8.9246, "step": 3380 }, { "epoch": 0.015383292839077184, "grad_norm": 3.1242740154266357, "learning_rate": 4.6149669713148126e-05, "loss": 8.9409, "step": 3400 }, { "epoch": 0.015473782796954109, "grad_norm": 3.042051315307617, "learning_rate": 4.642113835851959e-05, "loss": 8.9204, "step": 3420 }, { "epoch": 0.015564272754831033, "grad_norm": 4.102015495300293, "learning_rate": 4.669260700389105e-05, "loss": 8.8915, "step": 3440 }, { "epoch": 0.015654762712707958, "grad_norm": 3.2991299629211426, "learning_rate": 4.696407564926251e-05, "loss": 8.8897, "step": 3460 }, { "epoch": 0.01574525267058488, "grad_norm": 3.501094102859497, "learning_rate": 4.7235544294633965e-05, "loss": 8.9223, "step": 3480 }, { "epoch": 0.015835742628461807, "grad_norm": 6.248113632202148, "learning_rate": 4.750701294000543e-05, "loss": 8.8925, "step": 3500 }, { "epoch": 0.01592623258633873, "grad_norm": 4.329127788543701, "learning_rate": 4.7778481585376885e-05, "loss": 8.8891, "step": 3520 }, { "epoch": 0.016016722544215656, "grad_norm": 3.575141191482544, "learning_rate": 4.804995023074835e-05, "loss": 8.8741, "step": 3540 }, { "epoch": 0.01610721250209258, "grad_norm": 3.301194429397583, "learning_rate": 4.832141887611981e-05, "loss": 8.8965, "step": 3560 }, { "epoch": 0.016197702459969505, "grad_norm": 3.7364182472229004, "learning_rate": 4.859288752149126e-05, "loss": 8.8899, "step": 3580 }, { "epoch": 0.01628819241784643, "grad_norm": 5.336267471313477, "learning_rate": 4.8864356166862725e-05, "loss": 8.8959, "step": 3600 }, { "epoch": 0.016378682375723354, "grad_norm": 4.769089221954346, "learning_rate": 4.913582481223419e-05, "loss": 8.8981, "step": 3620 }, { "epoch": 0.01646917233360028, "grad_norm": 3.369799852371216, "learning_rate": 4.9407293457605645e-05, "loss": 8.8954, "step": 3640 }, { "epoch": 0.016559662291477203, "grad_norm": 3.063030481338501, "learning_rate": 4.96787621029771e-05, "loss": 8.8694, "step": 3660 }, { "epoch": 0.01665015224935413, "grad_norm": 4.988938331604004, "learning_rate": 4.9950230748348564e-05, "loss": 8.8611, "step": 3680 }, { "epoch": 0.016740642207231052, "grad_norm": 3.5118601322174072, "learning_rate": 5.022169939372003e-05, "loss": 8.8525, "step": 3700 }, { "epoch": 0.016831132165107978, "grad_norm": 4.257157325744629, "learning_rate": 5.0493168039091484e-05, "loss": 8.8547, "step": 3720 }, { "epoch": 0.0169216221229849, "grad_norm": 3.7021615505218506, "learning_rate": 5.076463668446294e-05, "loss": 8.8572, "step": 3740 }, { "epoch": 0.017012112080861827, "grad_norm": 4.868439197540283, "learning_rate": 5.1036105329834404e-05, "loss": 8.8684, "step": 3760 }, { "epoch": 0.01710260203873875, "grad_norm": 6.547580718994141, "learning_rate": 5.130757397520586e-05, "loss": 8.828, "step": 3780 }, { "epoch": 0.017193091996615676, "grad_norm": 5.9254374504089355, "learning_rate": 5.1579042620577324e-05, "loss": 8.838, "step": 3800 }, { "epoch": 0.0172835819544926, "grad_norm": 6.061065196990967, "learning_rate": 5.185051126594879e-05, "loss": 8.8405, "step": 3820 }, { "epoch": 0.017374071912369525, "grad_norm": 6.026751518249512, "learning_rate": 5.2121979911320237e-05, "loss": 8.8305, "step": 3840 }, { "epoch": 0.017464561870246448, "grad_norm": 4.982965469360352, "learning_rate": 5.23934485566917e-05, "loss": 8.8316, "step": 3860 }, { "epoch": 0.017555051828123374, "grad_norm": 9.080221176147461, "learning_rate": 5.266491720206316e-05, "loss": 8.8267, "step": 3880 }, { "epoch": 0.0176455417860003, "grad_norm": 6.644583225250244, "learning_rate": 5.293638584743462e-05, "loss": 8.8331, "step": 3900 }, { "epoch": 0.017736031743877223, "grad_norm": 6.022925853729248, "learning_rate": 5.3207854492806076e-05, "loss": 8.8198, "step": 3920 }, { "epoch": 0.01782652170175415, "grad_norm": 4.794320583343506, "learning_rate": 5.347932313817754e-05, "loss": 8.8075, "step": 3940 }, { "epoch": 0.017917011659631072, "grad_norm": 5.949656963348389, "learning_rate": 5.3750791783548996e-05, "loss": 8.8175, "step": 3960 }, { "epoch": 0.018007501617508, "grad_norm": 7.972283840179443, "learning_rate": 5.402226042892046e-05, "loss": 8.8263, "step": 3980 }, { "epoch": 0.01809799157538492, "grad_norm": 6.132015228271484, "learning_rate": 5.4293729074291916e-05, "loss": 8.8035, "step": 4000 }, { "epoch": 0.01809799157538492, "eval_accuracy": 0.10955227810888264, "eval_loss": 8.793069839477539, "eval_runtime": 217.825, "eval_samples_per_second": 2790.497, "eval_steps_per_second": 10.903, "step": 4000 }, { "epoch": 0.018188481533261847, "grad_norm": 3.9714837074279785, "learning_rate": 5.455162428739481e-05, "loss": 8.8029, "step": 4020 }, { "epoch": 0.01827897149113877, "grad_norm": 3.9775164127349854, "learning_rate": 5.482309293276626e-05, "loss": 8.7859, "step": 4040 }, { "epoch": 0.018369461449015696, "grad_norm": 4.350288391113281, "learning_rate": 5.509456157813772e-05, "loss": 8.8049, "step": 4060 }, { "epoch": 0.01845995140689262, "grad_norm": 5.212925910949707, "learning_rate": 5.5366030223509186e-05, "loss": 8.7768, "step": 4080 }, { "epoch": 0.018550441364769545, "grad_norm": 5.585092544555664, "learning_rate": 5.563749886888064e-05, "loss": 8.7792, "step": 4100 }, { "epoch": 0.018640931322646468, "grad_norm": 5.019256114959717, "learning_rate": 5.59089675142521e-05, "loss": 8.7843, "step": 4120 }, { "epoch": 0.018731421280523394, "grad_norm": 5.925191402435303, "learning_rate": 5.616686272735499e-05, "loss": 8.7693, "step": 4140 }, { "epoch": 0.018821911238400317, "grad_norm": 4.334403991699219, "learning_rate": 5.643833137272645e-05, "loss": 8.7652, "step": 4160 }, { "epoch": 0.018912401196277243, "grad_norm": 6.786751747131348, "learning_rate": 5.670980001809791e-05, "loss": 8.76, "step": 4180 }, { "epoch": 0.019002891154154166, "grad_norm": 5.805715084075928, "learning_rate": 5.698126866346936e-05, "loss": 8.7835, "step": 4200 }, { "epoch": 0.019093381112031092, "grad_norm": 7.2905120849609375, "learning_rate": 5.7252737308840826e-05, "loss": 8.7524, "step": 4220 }, { "epoch": 0.01918387106990802, "grad_norm": 4.692761421203613, "learning_rate": 5.752420595421228e-05, "loss": 8.7274, "step": 4240 }, { "epoch": 0.01927436102778494, "grad_norm": 5.6952924728393555, "learning_rate": 5.7795674599583746e-05, "loss": 8.7625, "step": 4260 }, { "epoch": 0.019364850985661868, "grad_norm": 7.725805759429932, "learning_rate": 5.806714324495521e-05, "loss": 8.7313, "step": 4280 }, { "epoch": 0.01945534094353879, "grad_norm": 5.154263496398926, "learning_rate": 5.833861189032667e-05, "loss": 8.7433, "step": 4300 }, { "epoch": 0.019545830901415717, "grad_norm": 7.734066963195801, "learning_rate": 5.861008053569812e-05, "loss": 8.738, "step": 4320 }, { "epoch": 0.01963632085929264, "grad_norm": 6.757390022277832, "learning_rate": 5.888154918106958e-05, "loss": 8.6971, "step": 4340 }, { "epoch": 0.019726810817169566, "grad_norm": 9.869467735290527, "learning_rate": 5.915301782644104e-05, "loss": 8.7437, "step": 4360 }, { "epoch": 0.01981730077504649, "grad_norm": 4.825913429260254, "learning_rate": 5.9424486471812505e-05, "loss": 8.712, "step": 4380 }, { "epoch": 0.019907790732923415, "grad_norm": 8.725457191467285, "learning_rate": 5.969595511718397e-05, "loss": 8.7054, "step": 4400 }, { "epoch": 0.019998280690800337, "grad_norm": 9.08804702758789, "learning_rate": 5.9967423762555425e-05, "loss": 8.6968, "step": 4420 }, { "epoch": 0.020088770648677264, "grad_norm": 7.369052886962891, "learning_rate": 6.023889240792689e-05, "loss": 8.6736, "step": 4440 }, { "epoch": 0.020179260606554186, "grad_norm": 9.925745964050293, "learning_rate": 6.051036105329834e-05, "loss": 8.7043, "step": 4460 }, { "epoch": 0.020269750564431113, "grad_norm": 10.998024940490723, "learning_rate": 6.07818296986698e-05, "loss": 8.7098, "step": 4480 }, { "epoch": 0.020360240522308035, "grad_norm": 9.010730743408203, "learning_rate": 6.105329834404126e-05, "loss": 8.6893, "step": 4500 }, { "epoch": 0.02045073048018496, "grad_norm": 5.833269119262695, "learning_rate": 6.132476698941272e-05, "loss": 8.6928, "step": 4520 }, { "epoch": 0.020541220438061888, "grad_norm": 5.778794288635254, "learning_rate": 6.159623563478418e-05, "loss": 8.6813, "step": 4540 }, { "epoch": 0.02063171039593881, "grad_norm": 6.518376350402832, "learning_rate": 6.186770428015565e-05, "loss": 8.6679, "step": 4560 }, { "epoch": 0.020722200353815737, "grad_norm": 7.985169887542725, "learning_rate": 6.21391729255271e-05, "loss": 8.6912, "step": 4580 }, { "epoch": 0.02081269031169266, "grad_norm": 6.066607475280762, "learning_rate": 6.241064157089856e-05, "loss": 8.67, "step": 4600 }, { "epoch": 0.020903180269569586, "grad_norm": 7.519238471984863, "learning_rate": 6.268211021627002e-05, "loss": 8.648, "step": 4620 }, { "epoch": 0.02099367022744651, "grad_norm": 9.485710144042969, "learning_rate": 6.295357886164147e-05, "loss": 8.6484, "step": 4640 }, { "epoch": 0.021084160185323435, "grad_norm": 9.786864280700684, "learning_rate": 6.322504750701294e-05, "loss": 8.637, "step": 4660 }, { "epoch": 0.021174650143200358, "grad_norm": 8.231635093688965, "learning_rate": 6.34965161523844e-05, "loss": 8.648, "step": 4680 }, { "epoch": 0.021265140101077284, "grad_norm": 7.283841609954834, "learning_rate": 6.376798479775586e-05, "loss": 8.64, "step": 4700 }, { "epoch": 0.021355630058954207, "grad_norm": 7.625393390655518, "learning_rate": 6.403945344312731e-05, "loss": 8.6713, "step": 4720 }, { "epoch": 0.021446120016831133, "grad_norm": 7.758394241333008, "learning_rate": 6.431092208849878e-05, "loss": 8.6473, "step": 4740 }, { "epoch": 0.021536609974708056, "grad_norm": 7.519627571105957, "learning_rate": 6.458239073387024e-05, "loss": 8.6144, "step": 4760 }, { "epoch": 0.021627099932584982, "grad_norm": 7.698405742645264, "learning_rate": 6.48538593792417e-05, "loss": 8.6678, "step": 4780 }, { "epoch": 0.021717589890461905, "grad_norm": 7.843724727630615, "learning_rate": 6.512532802461315e-05, "loss": 8.6292, "step": 4800 }, { "epoch": 0.02180807984833883, "grad_norm": 9.748797416687012, "learning_rate": 6.539679666998462e-05, "loss": 8.6059, "step": 4820 }, { "epoch": 0.021898569806215754, "grad_norm": 8.68276596069336, "learning_rate": 6.566826531535607e-05, "loss": 8.6153, "step": 4840 }, { "epoch": 0.02198905976409268, "grad_norm": 9.26171588897705, "learning_rate": 6.593973396072753e-05, "loss": 8.6343, "step": 4860 }, { "epoch": 0.022079549721969606, "grad_norm": 10.164648056030273, "learning_rate": 6.621120260609899e-05, "loss": 8.6255, "step": 4880 }, { "epoch": 0.02217003967984653, "grad_norm": 8.388748168945312, "learning_rate": 6.648267125147046e-05, "loss": 8.6111, "step": 4900 }, { "epoch": 0.022260529637723455, "grad_norm": 9.701128005981445, "learning_rate": 6.675413989684192e-05, "loss": 8.5902, "step": 4920 }, { "epoch": 0.022351019595600378, "grad_norm": 9.261332511901855, "learning_rate": 6.702560854221338e-05, "loss": 8.6013, "step": 4940 }, { "epoch": 0.022441509553477304, "grad_norm": 7.0918354988098145, "learning_rate": 6.729707718758483e-05, "loss": 8.5595, "step": 4960 }, { "epoch": 0.022531999511354227, "grad_norm": 8.793268203735352, "learning_rate": 6.756854583295628e-05, "loss": 8.5862, "step": 4980 }, { "epoch": 0.022622489469231153, "grad_norm": 8.539192199707031, "learning_rate": 6.784001447832774e-05, "loss": 8.5938, "step": 5000 }, { "epoch": 0.022712979427108076, "grad_norm": 8.60251522064209, "learning_rate": 6.811148312369921e-05, "loss": 8.598, "step": 5020 }, { "epoch": 0.022803469384985002, "grad_norm": 8.976070404052734, "learning_rate": 6.838295176907067e-05, "loss": 8.5896, "step": 5040 }, { "epoch": 0.022893959342861925, "grad_norm": 8.834037780761719, "learning_rate": 6.865442041444213e-05, "loss": 8.5654, "step": 5060 }, { "epoch": 0.02298444930073885, "grad_norm": 7.039853096008301, "learning_rate": 6.89258890598136e-05, "loss": 8.574, "step": 5080 }, { "epoch": 0.023074939258615774, "grad_norm": 4.989284515380859, "learning_rate": 6.919735770518505e-05, "loss": 8.584, "step": 5100 }, { "epoch": 0.0231654292164927, "grad_norm": 10.530620574951172, "learning_rate": 6.946882635055651e-05, "loss": 8.5884, "step": 5120 }, { "epoch": 0.023255919174369623, "grad_norm": 10.483266830444336, "learning_rate": 6.974029499592797e-05, "loss": 8.573, "step": 5140 }, { "epoch": 0.02334640913224655, "grad_norm": 9.433408737182617, "learning_rate": 7.001176364129942e-05, "loss": 8.5553, "step": 5160 }, { "epoch": 0.023436899090123475, "grad_norm": 10.707608222961426, "learning_rate": 7.028323228667089e-05, "loss": 8.5672, "step": 5180 }, { "epoch": 0.023527389048000398, "grad_norm": 11.35906982421875, "learning_rate": 7.055470093204235e-05, "loss": 8.5374, "step": 5200 }, { "epoch": 0.023617879005877324, "grad_norm": 9.386375427246094, "learning_rate": 7.08261695774138e-05, "loss": 8.5199, "step": 5220 }, { "epoch": 0.023708368963754247, "grad_norm": 10.813016891479492, "learning_rate": 7.109763822278526e-05, "loss": 8.5296, "step": 5240 }, { "epoch": 0.023798858921631173, "grad_norm": 10.738064765930176, "learning_rate": 7.136910686815673e-05, "loss": 8.5293, "step": 5260 }, { "epoch": 0.023889348879508096, "grad_norm": 12.89620590209961, "learning_rate": 7.164057551352819e-05, "loss": 8.5494, "step": 5280 }, { "epoch": 0.023979838837385022, "grad_norm": 11.624608039855957, "learning_rate": 7.191204415889965e-05, "loss": 8.5179, "step": 5300 }, { "epoch": 0.024070328795261945, "grad_norm": 7.694511413574219, "learning_rate": 7.21835128042711e-05, "loss": 8.5528, "step": 5320 }, { "epoch": 0.02416081875313887, "grad_norm": 9.326581954956055, "learning_rate": 7.245498144964257e-05, "loss": 8.5307, "step": 5340 }, { "epoch": 0.024251308711015794, "grad_norm": 8.548121452331543, "learning_rate": 7.272645009501402e-05, "loss": 8.5031, "step": 5360 }, { "epoch": 0.02434179866889272, "grad_norm": 10.232369422912598, "learning_rate": 7.299791874038548e-05, "loss": 8.4905, "step": 5380 }, { "epoch": 0.024432288626769643, "grad_norm": 9.751016616821289, "learning_rate": 7.326938738575694e-05, "loss": 8.4996, "step": 5400 }, { "epoch": 0.02452277858464657, "grad_norm": 11.058146476745605, "learning_rate": 7.35408560311284e-05, "loss": 8.4889, "step": 5420 }, { "epoch": 0.024613268542523492, "grad_norm": 8.11478042602539, "learning_rate": 7.381232467649987e-05, "loss": 8.5099, "step": 5440 }, { "epoch": 0.02470375850040042, "grad_norm": 8.138284683227539, "learning_rate": 7.408379332187133e-05, "loss": 8.4854, "step": 5460 }, { "epoch": 0.02479424845827734, "grad_norm": 7.7438459396362305, "learning_rate": 7.435526196724278e-05, "loss": 8.4877, "step": 5480 }, { "epoch": 0.024884738416154267, "grad_norm": 9.896592140197754, "learning_rate": 7.462673061261423e-05, "loss": 8.4662, "step": 5500 }, { "epoch": 0.024975228374031194, "grad_norm": 7.162434101104736, "learning_rate": 7.48981992579857e-05, "loss": 8.4772, "step": 5520 }, { "epoch": 0.025065718331908116, "grad_norm": 8.252161026000977, "learning_rate": 7.516966790335716e-05, "loss": 8.4936, "step": 5540 }, { "epoch": 0.025156208289785043, "grad_norm": 7.313194751739502, "learning_rate": 7.544113654872862e-05, "loss": 8.493, "step": 5560 }, { "epoch": 0.025246698247661965, "grad_norm": 11.324033737182617, "learning_rate": 7.571260519410008e-05, "loss": 8.4776, "step": 5580 }, { "epoch": 0.02533718820553889, "grad_norm": 9.4235258102417, "learning_rate": 7.598407383947155e-05, "loss": 8.4769, "step": 5600 }, { "epoch": 0.025427678163415814, "grad_norm": 6.676479339599609, "learning_rate": 7.6255542484843e-05, "loss": 8.4389, "step": 5620 }, { "epoch": 0.02551816812129274, "grad_norm": 9.530123710632324, "learning_rate": 7.652701113021446e-05, "loss": 8.4704, "step": 5640 }, { "epoch": 0.025608658079169663, "grad_norm": 10.614904403686523, "learning_rate": 7.679847977558591e-05, "loss": 8.4507, "step": 5660 }, { "epoch": 0.02569914803704659, "grad_norm": 7.5254974365234375, "learning_rate": 7.706994842095737e-05, "loss": 8.464, "step": 5680 }, { "epoch": 0.025789637994923512, "grad_norm": 7.461385250091553, "learning_rate": 7.734141706632884e-05, "loss": 8.4516, "step": 5700 }, { "epoch": 0.02588012795280044, "grad_norm": 9.106521606445312, "learning_rate": 7.76128857117003e-05, "loss": 8.4142, "step": 5720 }, { "epoch": 0.02597061791067736, "grad_norm": 8.536205291748047, "learning_rate": 7.788435435707175e-05, "loss": 8.4497, "step": 5740 }, { "epoch": 0.026061107868554288, "grad_norm": 7.925720691680908, "learning_rate": 7.815582300244321e-05, "loss": 8.4783, "step": 5760 }, { "epoch": 0.02615159782643121, "grad_norm": 11.187898635864258, "learning_rate": 7.842729164781468e-05, "loss": 8.4054, "step": 5780 }, { "epoch": 0.026242087784308137, "grad_norm": 6.965084075927734, "learning_rate": 7.869876029318614e-05, "loss": 8.4079, "step": 5800 }, { "epoch": 0.02633257774218506, "grad_norm": 8.090741157531738, "learning_rate": 7.89702289385576e-05, "loss": 8.4474, "step": 5820 }, { "epoch": 0.026423067700061986, "grad_norm": 9.698216438293457, "learning_rate": 7.924169758392905e-05, "loss": 8.3945, "step": 5840 }, { "epoch": 0.026513557657938912, "grad_norm": 7.889448642730713, "learning_rate": 7.951316622930052e-05, "loss": 8.4046, "step": 5860 }, { "epoch": 0.026604047615815835, "grad_norm": 11.487144470214844, "learning_rate": 7.978463487467197e-05, "loss": 8.4195, "step": 5880 }, { "epoch": 0.02669453757369276, "grad_norm": 9.28532886505127, "learning_rate": 8.005610352004343e-05, "loss": 8.406, "step": 5900 }, { "epoch": 0.026785027531569684, "grad_norm": 8.982071876525879, "learning_rate": 8.032757216541489e-05, "loss": 8.4221, "step": 5920 }, { "epoch": 0.02687551748944661, "grad_norm": 11.42358684539795, "learning_rate": 8.059904081078636e-05, "loss": 8.4423, "step": 5940 }, { "epoch": 0.026966007447323533, "grad_norm": 8.633251190185547, "learning_rate": 8.087050945615782e-05, "loss": 8.4233, "step": 5960 }, { "epoch": 0.02705649740520046, "grad_norm": 9.28022575378418, "learning_rate": 8.114197810152928e-05, "loss": 8.4169, "step": 5980 }, { "epoch": 0.02714698736307738, "grad_norm": 11.166740417480469, "learning_rate": 8.141344674690073e-05, "loss": 8.4018, "step": 6000 }, { "epoch": 0.02714698736307738, "eval_accuracy": 0.11314150543417859, "eval_loss": 8.402518272399902, "eval_runtime": 218.3209, "eval_samples_per_second": 2784.158, "eval_steps_per_second": 10.878, "step": 6000 }, { "epoch": 0.027237477320954308, "grad_norm": 7.100822925567627, "learning_rate": 8.167134196000362e-05, "loss": 8.4131, "step": 6020 }, { "epoch": 0.02732796727883123, "grad_norm": 8.460954666137695, "learning_rate": 8.194281060537508e-05, "loss": 8.4087, "step": 6040 }, { "epoch": 0.027418457236708157, "grad_norm": 7.642125129699707, "learning_rate": 8.221427925074653e-05, "loss": 8.3806, "step": 6060 }, { "epoch": 0.02750894719458508, "grad_norm": 8.104974746704102, "learning_rate": 8.2485747896118e-05, "loss": 8.404, "step": 6080 }, { "epoch": 0.027599437152462006, "grad_norm": 8.082459449768066, "learning_rate": 8.275721654148946e-05, "loss": 8.3865, "step": 6100 }, { "epoch": 0.02768992711033893, "grad_norm": 8.786911010742188, "learning_rate": 8.302868518686092e-05, "loss": 8.3475, "step": 6120 }, { "epoch": 0.027780417068215855, "grad_norm": 7.780808925628662, "learning_rate": 8.330015383223237e-05, "loss": 8.3798, "step": 6140 }, { "epoch": 0.02787090702609278, "grad_norm": 10.508188247680664, "learning_rate": 8.357162247760384e-05, "loss": 8.3718, "step": 6160 }, { "epoch": 0.027961396983969704, "grad_norm": 9.833992004394531, "learning_rate": 8.38430911229753e-05, "loss": 8.3952, "step": 6180 }, { "epoch": 0.02805188694184663, "grad_norm": 9.917244911193848, "learning_rate": 8.411455976834675e-05, "loss": 8.3828, "step": 6200 }, { "epoch": 0.028142376899723553, "grad_norm": 8.893899917602539, "learning_rate": 8.438602841371821e-05, "loss": 8.3853, "step": 6220 }, { "epoch": 0.02823286685760048, "grad_norm": 8.206876754760742, "learning_rate": 8.465749705908967e-05, "loss": 8.3686, "step": 6240 }, { "epoch": 0.028323356815477402, "grad_norm": 6.771660327911377, "learning_rate": 8.492896570446114e-05, "loss": 8.3699, "step": 6260 }, { "epoch": 0.028413846773354328, "grad_norm": 8.602880477905273, "learning_rate": 8.52004343498326e-05, "loss": 8.3388, "step": 6280 }, { "epoch": 0.02850433673123125, "grad_norm": 12.602445602416992, "learning_rate": 8.547190299520405e-05, "loss": 8.3127, "step": 6300 }, { "epoch": 0.028594826689108177, "grad_norm": 6.581843852996826, "learning_rate": 8.57433716405755e-05, "loss": 8.3345, "step": 6320 }, { "epoch": 0.0286853166469851, "grad_norm": 11.11732292175293, "learning_rate": 8.601484028594696e-05, "loss": 8.3442, "step": 6340 }, { "epoch": 0.028775806604862026, "grad_norm": 7.795157432556152, "learning_rate": 8.628630893131843e-05, "loss": 8.3477, "step": 6360 }, { "epoch": 0.02886629656273895, "grad_norm": 7.013496398925781, "learning_rate": 8.655777757668989e-05, "loss": 8.3444, "step": 6380 }, { "epoch": 0.028956786520615875, "grad_norm": 7.039948463439941, "learning_rate": 8.682924622206135e-05, "loss": 8.3242, "step": 6400 }, { "epoch": 0.029047276478492798, "grad_norm": 9.261716842651367, "learning_rate": 8.710071486743282e-05, "loss": 8.3209, "step": 6420 }, { "epoch": 0.029137766436369724, "grad_norm": 7.255875587463379, "learning_rate": 8.737218351280428e-05, "loss": 8.304, "step": 6440 }, { "epoch": 0.029228256394246647, "grad_norm": 7.955538749694824, "learning_rate": 8.764365215817573e-05, "loss": 8.2953, "step": 6460 }, { "epoch": 0.029318746352123573, "grad_norm": 9.364811897277832, "learning_rate": 8.791512080354718e-05, "loss": 8.2936, "step": 6480 }, { "epoch": 0.0294092363100005, "grad_norm": 9.385396957397461, "learning_rate": 8.818658944891864e-05, "loss": 8.3276, "step": 6500 }, { "epoch": 0.029499726267877422, "grad_norm": 8.448295593261719, "learning_rate": 8.84580580942901e-05, "loss": 8.2975, "step": 6520 }, { "epoch": 0.02959021622575435, "grad_norm": 9.282604217529297, "learning_rate": 8.872952673966157e-05, "loss": 8.3217, "step": 6540 }, { "epoch": 0.02968070618363127, "grad_norm": 7.898446559906006, "learning_rate": 8.900099538503303e-05, "loss": 8.3006, "step": 6560 }, { "epoch": 0.029771196141508197, "grad_norm": 9.186493873596191, "learning_rate": 8.927246403040448e-05, "loss": 8.2981, "step": 6580 }, { "epoch": 0.02986168609938512, "grad_norm": 9.346575736999512, "learning_rate": 8.954393267577595e-05, "loss": 8.2883, "step": 6600 }, { "epoch": 0.029952176057262046, "grad_norm": 6.458785057067871, "learning_rate": 8.981540132114741e-05, "loss": 8.2966, "step": 6620 }, { "epoch": 0.03004266601513897, "grad_norm": 8.704976081848145, "learning_rate": 9.008686996651886e-05, "loss": 8.2986, "step": 6640 }, { "epoch": 0.030133155973015895, "grad_norm": 7.744259357452393, "learning_rate": 9.035833861189032e-05, "loss": 8.2868, "step": 6660 }, { "epoch": 0.030223645930892818, "grad_norm": 8.345844268798828, "learning_rate": 9.062980725726179e-05, "loss": 8.2931, "step": 6680 }, { "epoch": 0.030314135888769744, "grad_norm": 7.604759216308594, "learning_rate": 9.090127590263323e-05, "loss": 8.2847, "step": 6700 }, { "epoch": 0.030404625846646667, "grad_norm": 10.3920259475708, "learning_rate": 9.11727445480047e-05, "loss": 8.273, "step": 6720 }, { "epoch": 0.030495115804523593, "grad_norm": 7.095389366149902, "learning_rate": 9.144421319337616e-05, "loss": 8.2768, "step": 6740 }, { "epoch": 0.030585605762400516, "grad_norm": 7.211811542510986, "learning_rate": 9.171568183874762e-05, "loss": 8.2918, "step": 6760 }, { "epoch": 0.030676095720277442, "grad_norm": 8.639713287353516, "learning_rate": 9.198715048411909e-05, "loss": 8.2845, "step": 6780 }, { "epoch": 0.03076658567815437, "grad_norm": 7.687414169311523, "learning_rate": 9.225861912949055e-05, "loss": 8.2992, "step": 6800 }, { "epoch": 0.03085707563603129, "grad_norm": 8.479426383972168, "learning_rate": 9.2530087774862e-05, "loss": 8.2848, "step": 6820 }, { "epoch": 0.030947565593908218, "grad_norm": 8.185149192810059, "learning_rate": 9.280155642023345e-05, "loss": 8.3037, "step": 6840 }, { "epoch": 0.03103805555178514, "grad_norm": 8.295937538146973, "learning_rate": 9.307302506560491e-05, "loss": 8.3179, "step": 6860 }, { "epoch": 0.031128545509662067, "grad_norm": 10.772727012634277, "learning_rate": 9.334449371097638e-05, "loss": 8.264, "step": 6880 }, { "epoch": 0.03121903546753899, "grad_norm": 8.465076446533203, "learning_rate": 9.361596235634784e-05, "loss": 8.2303, "step": 6900 }, { "epoch": 0.031309525425415916, "grad_norm": 9.096773147583008, "learning_rate": 9.38874310017193e-05, "loss": 8.2473, "step": 6920 }, { "epoch": 0.03140001538329284, "grad_norm": 10.57555866241455, "learning_rate": 9.415889964709077e-05, "loss": 8.27, "step": 6940 }, { "epoch": 0.03149050534116976, "grad_norm": 7.5089850425720215, "learning_rate": 9.443036829246222e-05, "loss": 8.27, "step": 6960 }, { "epoch": 0.03158099529904669, "grad_norm": 10.865699768066406, "learning_rate": 9.470183693783368e-05, "loss": 8.2451, "step": 6980 }, { "epoch": 0.031671485256923614, "grad_norm": 12.514881134033203, "learning_rate": 9.497330558320513e-05, "loss": 8.259, "step": 7000 }, { "epoch": 0.031761975214800536, "grad_norm": 9.914373397827148, "learning_rate": 9.524477422857659e-05, "loss": 8.2727, "step": 7020 }, { "epoch": 0.03185246517267746, "grad_norm": 7.3313984870910645, "learning_rate": 9.551624287394806e-05, "loss": 8.2421, "step": 7040 }, { "epoch": 0.03194295513055439, "grad_norm": 5.989616394042969, "learning_rate": 9.578771151931952e-05, "loss": 8.2363, "step": 7060 }, { "epoch": 0.03203344508843131, "grad_norm": 7.4773430824279785, "learning_rate": 9.605918016469098e-05, "loss": 8.2718, "step": 7080 }, { "epoch": 0.032123935046308234, "grad_norm": 6.605820655822754, "learning_rate": 9.633064881006243e-05, "loss": 8.257, "step": 7100 }, { "epoch": 0.03221442500418516, "grad_norm": 8.294914245605469, "learning_rate": 9.658854402316532e-05, "loss": 8.2478, "step": 7120 }, { "epoch": 0.03230491496206209, "grad_norm": 10.011855125427246, "learning_rate": 9.686001266853678e-05, "loss": 8.2525, "step": 7140 }, { "epoch": 0.03239540491993901, "grad_norm": 7.529365062713623, "learning_rate": 9.713148131390823e-05, "loss": 8.2728, "step": 7160 }, { "epoch": 0.03248589487781593, "grad_norm": 8.781538009643555, "learning_rate": 9.74029499592797e-05, "loss": 8.2305, "step": 7180 }, { "epoch": 0.03257638483569286, "grad_norm": 12.758204460144043, "learning_rate": 9.767441860465116e-05, "loss": 8.2382, "step": 7200 }, { "epoch": 0.032666874793569785, "grad_norm": 10.523704528808594, "learning_rate": 9.794588725002262e-05, "loss": 8.2364, "step": 7220 }, { "epoch": 0.03275736475144671, "grad_norm": 6.50457239151001, "learning_rate": 9.821735589539409e-05, "loss": 8.2384, "step": 7240 }, { "epoch": 0.03284785470932363, "grad_norm": 9.191271781921387, "learning_rate": 9.848882454076555e-05, "loss": 8.2148, "step": 7260 }, { "epoch": 0.03293834466720056, "grad_norm": 8.93270206451416, "learning_rate": 9.8760293186137e-05, "loss": 8.2352, "step": 7280 }, { "epoch": 0.03302883462507748, "grad_norm": 9.895100593566895, "learning_rate": 9.903176183150845e-05, "loss": 8.2376, "step": 7300 }, { "epoch": 0.033119324582954406, "grad_norm": 10.420171737670898, "learning_rate": 9.930323047687991e-05, "loss": 8.2479, "step": 7320 }, { "epoch": 0.03320981454083133, "grad_norm": 9.649170875549316, "learning_rate": 9.957469912225138e-05, "loss": 8.2557, "step": 7340 }, { "epoch": 0.03330030449870826, "grad_norm": 7.854948043823242, "learning_rate": 9.984616776762284e-05, "loss": 8.2145, "step": 7360 }, { "epoch": 0.03339079445658518, "grad_norm": 8.486404418945312, "learning_rate": 0.0001001176364129943, "loss": 8.2132, "step": 7380 }, { "epoch": 0.033481284414462104, "grad_norm": 11.286945343017578, "learning_rate": 0.00010038910505836577, "loss": 8.2169, "step": 7400 }, { "epoch": 0.033571774372339026, "grad_norm": 6.662302494049072, "learning_rate": 0.00010066057370373721, "loss": 8.2318, "step": 7420 }, { "epoch": 0.033662264330215956, "grad_norm": 10.467026710510254, "learning_rate": 0.00010093204234910868, "loss": 8.2089, "step": 7440 }, { "epoch": 0.03375275428809288, "grad_norm": 12.113288879394531, "learning_rate": 0.00010120351099448013, "loss": 8.2194, "step": 7460 }, { "epoch": 0.0338432442459698, "grad_norm": 13.295260429382324, "learning_rate": 0.00010147497963985159, "loss": 8.2526, "step": 7480 }, { "epoch": 0.03393373420384673, "grad_norm": 9.79587173461914, "learning_rate": 0.00010174644828522305, "loss": 8.2253, "step": 7500 }, { "epoch": 0.034024224161723654, "grad_norm": 10.251439094543457, "learning_rate": 0.00010201791693059452, "loss": 8.2248, "step": 7520 }, { "epoch": 0.03411471411960058, "grad_norm": 10.583033561706543, "learning_rate": 0.00010228938557596597, "loss": 8.211, "step": 7540 }, { "epoch": 0.0342052040774775, "grad_norm": 10.661384582519531, "learning_rate": 0.00010256085422133743, "loss": 8.2053, "step": 7560 }, { "epoch": 0.03429569403535443, "grad_norm": 8.133881568908691, "learning_rate": 0.0001028323228667089, "loss": 8.1948, "step": 7580 }, { "epoch": 0.03438618399323135, "grad_norm": 9.278162002563477, "learning_rate": 0.00010310379151208036, "loss": 8.2235, "step": 7600 }, { "epoch": 0.034476673951108275, "grad_norm": 10.354171752929688, "learning_rate": 0.00010337526015745181, "loss": 8.1704, "step": 7620 }, { "epoch": 0.0345671639089852, "grad_norm": 9.4600830078125, "learning_rate": 0.00010364672880282327, "loss": 8.2008, "step": 7640 }, { "epoch": 0.03465765386686213, "grad_norm": 10.290422439575195, "learning_rate": 0.00010391819744819473, "loss": 8.2084, "step": 7660 }, { "epoch": 0.03474814382473905, "grad_norm": 9.98493480682373, "learning_rate": 0.00010418966609356618, "loss": 8.1878, "step": 7680 }, { "epoch": 0.03483863378261597, "grad_norm": 8.021723747253418, "learning_rate": 0.00010446113473893765, "loss": 8.1865, "step": 7700 }, { "epoch": 0.034929123740492896, "grad_norm": 6.915677070617676, "learning_rate": 0.00010473260338430911, "loss": 8.1795, "step": 7720 }, { "epoch": 0.035019613698369825, "grad_norm": 9.64877986907959, "learning_rate": 0.00010500407202968057, "loss": 8.1756, "step": 7740 }, { "epoch": 0.03511010365624675, "grad_norm": 9.673460960388184, "learning_rate": 0.00010527554067505204, "loss": 8.1877, "step": 7760 }, { "epoch": 0.03520059361412367, "grad_norm": 10.429800033569336, "learning_rate": 0.0001055470093204235, "loss": 8.1803, "step": 7780 }, { "epoch": 0.0352910835720006, "grad_norm": 9.610269546508789, "learning_rate": 0.00010581847796579494, "loss": 8.214, "step": 7800 }, { "epoch": 0.03538157352987752, "grad_norm": 9.696439743041992, "learning_rate": 0.0001060899466111664, "loss": 8.1585, "step": 7820 }, { "epoch": 0.035472063487754446, "grad_norm": 10.302108764648438, "learning_rate": 0.00010636141525653786, "loss": 8.1495, "step": 7840 }, { "epoch": 0.03556255344563137, "grad_norm": 10.439906120300293, "learning_rate": 0.00010663288390190933, "loss": 8.1636, "step": 7860 }, { "epoch": 0.0356530434035083, "grad_norm": 13.941293716430664, "learning_rate": 0.00010690435254728079, "loss": 8.1674, "step": 7880 }, { "epoch": 0.03574353336138522, "grad_norm": 11.378789901733398, "learning_rate": 0.00010717582119265225, "loss": 8.1704, "step": 7900 }, { "epoch": 0.035834023319262144, "grad_norm": 10.802684783935547, "learning_rate": 0.00010744728983802372, "loss": 8.1902, "step": 7920 }, { "epoch": 0.03592451327713907, "grad_norm": 13.995284080505371, "learning_rate": 0.00010771875848339517, "loss": 8.1502, "step": 7940 }, { "epoch": 0.036015003235016, "grad_norm": 11.473008155822754, "learning_rate": 0.00010799022712876663, "loss": 8.2082, "step": 7960 }, { "epoch": 0.03610549319289292, "grad_norm": 9.314510345458984, "learning_rate": 0.00010826169577413808, "loss": 8.19, "step": 7980 }, { "epoch": 0.03619598315076984, "grad_norm": 11.141118049621582, "learning_rate": 0.00010853316441950954, "loss": 8.2093, "step": 8000 }, { "epoch": 0.03619598315076984, "eval_accuracy": 0.11013720949528932, "eval_loss": 8.173333168029785, "eval_runtime": 219.4541, "eval_samples_per_second": 2769.782, "eval_steps_per_second": 10.822, "step": 8000 }, { "epoch": 0.036286473108646765, "grad_norm": 12.62540054321289, "learning_rate": 0.000108804633064881, "loss": 8.1561, "step": 8020 }, { "epoch": 0.036376963066523695, "grad_norm": 12.97541332244873, "learning_rate": 0.00010907610171025247, "loss": 8.1708, "step": 8040 }, { "epoch": 0.03646745302440062, "grad_norm": 8.305766105651855, "learning_rate": 0.00010934757035562392, "loss": 8.1671, "step": 8060 }, { "epoch": 0.03655794298227754, "grad_norm": 14.076859474182129, "learning_rate": 0.00010961903900099538, "loss": 8.1659, "step": 8080 }, { "epoch": 0.03664843294015447, "grad_norm": 11.951278686523438, "learning_rate": 0.00010989050764636684, "loss": 8.1893, "step": 8100 }, { "epoch": 0.03673892289803139, "grad_norm": 10.796624183654785, "learning_rate": 0.00011016197629173831, "loss": 8.1942, "step": 8120 }, { "epoch": 0.036829412855908315, "grad_norm": 10.49177074432373, "learning_rate": 0.00011043344493710976, "loss": 8.1589, "step": 8140 }, { "epoch": 0.03691990281378524, "grad_norm": 12.82060432434082, "learning_rate": 0.00011070491358248122, "loss": 8.1957, "step": 8160 }, { "epoch": 0.03701039277166217, "grad_norm": 11.00941276550293, "learning_rate": 0.00011097638222785267, "loss": 8.1609, "step": 8180 }, { "epoch": 0.03710088272953909, "grad_norm": 10.24111270904541, "learning_rate": 0.00011124785087322413, "loss": 8.1769, "step": 8200 }, { "epoch": 0.03719137268741601, "grad_norm": 11.292909622192383, "learning_rate": 0.0001115193195185956, "loss": 8.1628, "step": 8220 }, { "epoch": 0.037281862645292936, "grad_norm": 9.362674713134766, "learning_rate": 0.00011179078816396706, "loss": 8.1638, "step": 8240 }, { "epoch": 0.037372352603169866, "grad_norm": 12.9249906539917, "learning_rate": 0.00011206225680933852, "loss": 8.1957, "step": 8260 }, { "epoch": 0.03746284256104679, "grad_norm": 10.386489868164062, "learning_rate": 0.00011233372545470999, "loss": 8.1525, "step": 8280 }, { "epoch": 0.03755333251892371, "grad_norm": 12.65300464630127, "learning_rate": 0.00011260519410008144, "loss": 8.1558, "step": 8300 }, { "epoch": 0.037643822476800634, "grad_norm": 11.562602996826172, "learning_rate": 0.0001128766627454529, "loss": 8.148, "step": 8320 }, { "epoch": 0.037734312434677564, "grad_norm": 14.783183097839355, "learning_rate": 0.00011314813139082436, "loss": 8.1448, "step": 8340 }, { "epoch": 0.03782480239255449, "grad_norm": 15.469168663024902, "learning_rate": 0.00011341960003619583, "loss": 8.1801, "step": 8360 }, { "epoch": 0.03791529235043141, "grad_norm": 11.361299514770508, "learning_rate": 0.00011369106868156726, "loss": 8.1549, "step": 8380 }, { "epoch": 0.03800578230830833, "grad_norm": 9.814708709716797, "learning_rate": 0.00011396253732693873, "loss": 8.1663, "step": 8400 }, { "epoch": 0.03809627226618526, "grad_norm": 10.522832870483398, "learning_rate": 0.00011423400597231019, "loss": 8.1459, "step": 8420 }, { "epoch": 0.038186762224062185, "grad_norm": 10.637961387634277, "learning_rate": 0.00011450547461768165, "loss": 8.1554, "step": 8440 }, { "epoch": 0.03827725218193911, "grad_norm": 14.578750610351562, "learning_rate": 0.00011477694326305312, "loss": 8.1758, "step": 8460 }, { "epoch": 0.03836774213981604, "grad_norm": 12.179791450500488, "learning_rate": 0.00011504841190842457, "loss": 8.1117, "step": 8480 }, { "epoch": 0.03845823209769296, "grad_norm": 11.189960479736328, "learning_rate": 0.00011531988055379603, "loss": 8.1517, "step": 8500 }, { "epoch": 0.03854872205556988, "grad_norm": 11.662614822387695, "learning_rate": 0.00011559134919916749, "loss": 8.129, "step": 8520 }, { "epoch": 0.038639212013446805, "grad_norm": 9.089029312133789, "learning_rate": 0.00011584924441227038, "loss": 8.1452, "step": 8540 }, { "epoch": 0.038729701971323735, "grad_norm": 15.1500825881958, "learning_rate": 0.00011612071305764184, "loss": 8.1623, "step": 8560 }, { "epoch": 0.03882019192920066, "grad_norm": 15.177955627441406, "learning_rate": 0.0001163921817030133, "loss": 8.1138, "step": 8580 }, { "epoch": 0.03891068188707758, "grad_norm": 9.620798110961914, "learning_rate": 0.00011666365034838476, "loss": 8.1472, "step": 8600 }, { "epoch": 0.0390011718449545, "grad_norm": 13.227412223815918, "learning_rate": 0.00011693511899375622, "loss": 8.1436, "step": 8620 }, { "epoch": 0.03909166180283143, "grad_norm": 12.561627388000488, "learning_rate": 0.00011720658763912768, "loss": 8.1478, "step": 8640 }, { "epoch": 0.039182151760708356, "grad_norm": 12.864951133728027, "learning_rate": 0.00011747805628449915, "loss": 8.1727, "step": 8660 }, { "epoch": 0.03927264171858528, "grad_norm": 12.883962631225586, "learning_rate": 0.00011774952492987061, "loss": 8.1396, "step": 8680 }, { "epoch": 0.0393631316764622, "grad_norm": 7.435621738433838, "learning_rate": 0.00011802099357524204, "loss": 8.1774, "step": 8700 }, { "epoch": 0.03945362163433913, "grad_norm": 12.7384672164917, "learning_rate": 0.00011829246222061351, "loss": 8.1297, "step": 8720 }, { "epoch": 0.039544111592216054, "grad_norm": 14.0343017578125, "learning_rate": 0.00011856393086598497, "loss": 8.1406, "step": 8740 }, { "epoch": 0.03963460155009298, "grad_norm": 15.325870513916016, "learning_rate": 0.00011883539951135643, "loss": 8.1619, "step": 8760 }, { "epoch": 0.039725091507969906, "grad_norm": 21.650548934936523, "learning_rate": 0.00011910686815672788, "loss": 8.193, "step": 8780 }, { "epoch": 0.03981558146584683, "grad_norm": 15.605712890625, "learning_rate": 0.00011937833680209935, "loss": 8.1709, "step": 8800 }, { "epoch": 0.03990607142372375, "grad_norm": 10.788895606994629, "learning_rate": 0.00011964980544747081, "loss": 8.1451, "step": 8820 }, { "epoch": 0.039996561381600675, "grad_norm": 16.377477645874023, "learning_rate": 0.00011992127409284227, "loss": 8.134, "step": 8840 }, { "epoch": 0.040087051339477604, "grad_norm": 13.106194496154785, "learning_rate": 0.00012019274273821374, "loss": 8.1352, "step": 8860 }, { "epoch": 0.04017754129735453, "grad_norm": 11.152835845947266, "learning_rate": 0.0001204642113835852, "loss": 8.1138, "step": 8880 }, { "epoch": 0.04026803125523145, "grad_norm": 9.210712432861328, "learning_rate": 0.00012073568002895666, "loss": 8.1769, "step": 8900 }, { "epoch": 0.04035852121310837, "grad_norm": 12.555234909057617, "learning_rate": 0.00012100714867432813, "loss": 8.1383, "step": 8920 }, { "epoch": 0.0404490111709853, "grad_norm": 12.013688087463379, "learning_rate": 0.00012127861731969958, "loss": 8.1564, "step": 8940 }, { "epoch": 0.040539501128862225, "grad_norm": 9.827411651611328, "learning_rate": 0.00012155008596507101, "loss": 8.1348, "step": 8960 }, { "epoch": 0.04062999108673915, "grad_norm": 11.609356880187988, "learning_rate": 0.00012182155461044248, "loss": 8.1646, "step": 8980 }, { "epoch": 0.04072048104461607, "grad_norm": 13.045088768005371, "learning_rate": 0.00012209302325581395, "loss": 8.1628, "step": 9000 }, { "epoch": 0.040810971002493, "grad_norm": 12.780691146850586, "learning_rate": 0.00012236449190118542, "loss": 8.1487, "step": 9020 }, { "epoch": 0.04090146096036992, "grad_norm": 10.65334701538086, "learning_rate": 0.00012263596054655685, "loss": 8.1275, "step": 9040 }, { "epoch": 0.040991950918246846, "grad_norm": 8.080134391784668, "learning_rate": 0.00012290742919192832, "loss": 8.1356, "step": 9060 }, { "epoch": 0.041082440876123776, "grad_norm": 12.708916664123535, "learning_rate": 0.00012317889783729978, "loss": 8.1606, "step": 9080 }, { "epoch": 0.0411729308340007, "grad_norm": 13.570298194885254, "learning_rate": 0.00012345036648267124, "loss": 8.1389, "step": 9100 }, { "epoch": 0.04126342079187762, "grad_norm": 13.237983703613281, "learning_rate": 0.0001237218351280427, "loss": 8.1243, "step": 9120 }, { "epoch": 0.041353910749754544, "grad_norm": 14.53023910522461, "learning_rate": 0.00012399330377341417, "loss": 8.1191, "step": 9140 }, { "epoch": 0.041444400707631474, "grad_norm": 11.765192031860352, "learning_rate": 0.00012426477241878563, "loss": 8.1031, "step": 9160 }, { "epoch": 0.041534890665508396, "grad_norm": 11.261069297790527, "learning_rate": 0.0001245362410641571, "loss": 8.1504, "step": 9180 }, { "epoch": 0.04162538062338532, "grad_norm": 13.039865493774414, "learning_rate": 0.00012480770970952856, "loss": 8.1186, "step": 9200 }, { "epoch": 0.04171587058126224, "grad_norm": 11.21242904663086, "learning_rate": 0.0001250791783549, "loss": 8.1244, "step": 9220 }, { "epoch": 0.04180636053913917, "grad_norm": 13.84521770477295, "learning_rate": 0.00012535064700027146, "loss": 8.1442, "step": 9240 }, { "epoch": 0.041896850497016094, "grad_norm": 14.333518981933594, "learning_rate": 0.00012562211564564292, "loss": 8.1628, "step": 9260 }, { "epoch": 0.04198734045489302, "grad_norm": 12.016851425170898, "learning_rate": 0.00012589358429101438, "loss": 8.1037, "step": 9280 }, { "epoch": 0.04207783041276994, "grad_norm": 9.183259010314941, "learning_rate": 0.00012616505293638585, "loss": 8.1429, "step": 9300 }, { "epoch": 0.04216832037064687, "grad_norm": 13.651033401489258, "learning_rate": 0.0001264365215817573, "loss": 8.1202, "step": 9320 }, { "epoch": 0.04225881032852379, "grad_norm": 11.869391441345215, "learning_rate": 0.00012670799022712877, "loss": 8.1125, "step": 9340 }, { "epoch": 0.042349300286400715, "grad_norm": 15.943286895751953, "learning_rate": 0.00012697945887250024, "loss": 8.1694, "step": 9360 }, { "epoch": 0.04243979024427764, "grad_norm": 13.450387001037598, "learning_rate": 0.00012725092751787167, "loss": 8.1379, "step": 9380 }, { "epoch": 0.04253028020215457, "grad_norm": 15.152196884155273, "learning_rate": 0.00012752239616324314, "loss": 8.1391, "step": 9400 }, { "epoch": 0.04262077016003149, "grad_norm": 15.109274864196777, "learning_rate": 0.0001277938648086146, "loss": 8.0963, "step": 9420 }, { "epoch": 0.04271126011790841, "grad_norm": 10.3173189163208, "learning_rate": 0.00012806533345398606, "loss": 8.1557, "step": 9440 }, { "epoch": 0.04280175007578534, "grad_norm": 11.38595962524414, "learning_rate": 0.00012833680209935753, "loss": 8.173, "step": 9460 }, { "epoch": 0.042892240033662266, "grad_norm": 11.458219528198242, "learning_rate": 0.00012859469731246043, "loss": 8.2542, "step": 9480 }, { "epoch": 0.04298272999153919, "grad_norm": 14.253256797790527, "learning_rate": 0.00012886616595783186, "loss": 8.1687, "step": 9500 }, { "epoch": 0.04307321994941611, "grad_norm": 14.074560165405273, "learning_rate": 0.00012913763460320333, "loss": 8.1175, "step": 9520 }, { "epoch": 0.04316370990729304, "grad_norm": 14.521282196044922, "learning_rate": 0.00012939552981630623, "loss": 8.1456, "step": 9540 }, { "epoch": 0.043254199865169964, "grad_norm": 12.537208557128906, "learning_rate": 0.0001296669984616777, "loss": 8.1432, "step": 9560 }, { "epoch": 0.043344689823046886, "grad_norm": 10.885902404785156, "learning_rate": 0.00012993846710704915, "loss": 8.1875, "step": 9580 }, { "epoch": 0.04343517978092381, "grad_norm": 10.156676292419434, "learning_rate": 0.0001302099357524206, "loss": 8.1728, "step": 9600 }, { "epoch": 0.04352566973880074, "grad_norm": 13.31322193145752, "learning_rate": 0.00013048140439779205, "loss": 8.1394, "step": 9620 }, { "epoch": 0.04361615969667766, "grad_norm": 7.779819488525391, "learning_rate": 0.0001307528730431635, "loss": 8.139, "step": 9640 }, { "epoch": 0.043706649654554584, "grad_norm": 12.208565711975098, "learning_rate": 0.00013102434168853495, "loss": 8.1346, "step": 9660 }, { "epoch": 0.04379713961243151, "grad_norm": 11.362008094787598, "learning_rate": 0.00013129581033390642, "loss": 8.1419, "step": 9680 }, { "epoch": 0.04388762957030844, "grad_norm": 11.86789321899414, "learning_rate": 0.00013156727897927788, "loss": 8.1475, "step": 9700 }, { "epoch": 0.04397811952818536, "grad_norm": 14.61185073852539, "learning_rate": 0.00013183874762464934, "loss": 8.1582, "step": 9720 }, { "epoch": 0.04406860948606228, "grad_norm": 11.60112190246582, "learning_rate": 0.0001321102162700208, "loss": 8.1073, "step": 9740 }, { "epoch": 0.04415909944393921, "grad_norm": 13.442856788635254, "learning_rate": 0.00013238168491539227, "loss": 8.1358, "step": 9760 }, { "epoch": 0.044249589401816135, "grad_norm": 11.524395942687988, "learning_rate": 0.00013265315356076373, "loss": 8.1083, "step": 9780 }, { "epoch": 0.04434007935969306, "grad_norm": 13.528814315795898, "learning_rate": 0.0001329246222061352, "loss": 8.1392, "step": 9800 }, { "epoch": 0.04443056931756998, "grad_norm": 18.11868667602539, "learning_rate": 0.00013319609085150666, "loss": 8.1784, "step": 9820 }, { "epoch": 0.04452105927544691, "grad_norm": 15.858280181884766, "learning_rate": 0.00013346755949687812, "loss": 8.1597, "step": 9840 }, { "epoch": 0.04461154923332383, "grad_norm": 14.466769218444824, "learning_rate": 0.00013373902814224956, "loss": 8.1632, "step": 9860 }, { "epoch": 0.044702039191200756, "grad_norm": 11.416616439819336, "learning_rate": 0.00013401049678762102, "loss": 8.1681, "step": 9880 }, { "epoch": 0.04479252914907768, "grad_norm": 39.87081527709961, "learning_rate": 0.00013428196543299249, "loss": 8.1384, "step": 9900 }, { "epoch": 0.04488301910695461, "grad_norm": 11.689374923706055, "learning_rate": 0.0001345398606460954, "loss": 8.5619, "step": 9920 }, { "epoch": 0.04497350906483153, "grad_norm": 10.53484058380127, "learning_rate": 0.00013481132929146682, "loss": 9.1495, "step": 9940 }, { "epoch": 0.045063999022708454, "grad_norm": 12.07006549835205, "learning_rate": 0.00013508279793683829, "loss": 9.1771, "step": 9960 }, { "epoch": 0.045154488980585376, "grad_norm": 9.795348167419434, "learning_rate": 0.00013535426658220975, "loss": 9.1545, "step": 9980 }, { "epoch": 0.045244978938462306, "grad_norm": 10.068339347839355, "learning_rate": 0.0001356257352275812, "loss": 9.1969, "step": 10000 }, { "epoch": 0.045244978938462306, "eval_accuracy": 0.022879129772772476, "eval_loss": 9.148832321166992, "eval_runtime": 212.7494, "eval_samples_per_second": 2857.071, "eval_steps_per_second": 11.163, "step": 10000 }, { "epoch": 0.04533546889633923, "grad_norm": 12.951713562011719, "learning_rate": 0.00013589720387295268, "loss": 9.154, "step": 10020 }, { "epoch": 0.04542595885421615, "grad_norm": 9.139362335205078, "learning_rate": 0.00013616867251832414, "loss": 9.154, "step": 10040 }, { "epoch": 0.04551644881209308, "grad_norm": 8.388337135314941, "learning_rate": 0.0001364401411636956, "loss": 9.1391, "step": 10060 }, { "epoch": 0.045606938769970004, "grad_norm": 10.0809326171875, "learning_rate": 0.00013671160980906704, "loss": 9.1417, "step": 10080 }, { "epoch": 0.04569742872784693, "grad_norm": 8.565701484680176, "learning_rate": 0.0001369830784544385, "loss": 9.1112, "step": 10100 }, { "epoch": 0.04578791868572385, "grad_norm": 10.437520027160645, "learning_rate": 0.00013725454709980997, "loss": 9.1169, "step": 10120 }, { "epoch": 0.04587840864360078, "grad_norm": 8.615896224975586, "learning_rate": 0.00013752601574518143, "loss": 9.1003, "step": 10140 }, { "epoch": 0.0459688986014777, "grad_norm": 10.89583683013916, "learning_rate": 0.0001377974843905529, "loss": 9.101, "step": 10160 }, { "epoch": 0.046059388559354625, "grad_norm": 9.786931991577148, "learning_rate": 0.00013806895303592433, "loss": 9.0689, "step": 10180 }, { "epoch": 0.04614987851723155, "grad_norm": 9.010174751281738, "learning_rate": 0.0001383404216812958, "loss": 9.0579, "step": 10200 }, { "epoch": 0.04624036847510848, "grad_norm": 11.039669036865234, "learning_rate": 0.00013861189032666725, "loss": 9.0865, "step": 10220 }, { "epoch": 0.0463308584329854, "grad_norm": 12.055830001831055, "learning_rate": 0.00013888335897203872, "loss": 9.0955, "step": 10240 }, { "epoch": 0.04642134839086232, "grad_norm": 8.361885070800781, "learning_rate": 0.00013915482761741018, "loss": 9.07, "step": 10260 }, { "epoch": 0.046511838348739246, "grad_norm": 7.196146011352539, "learning_rate": 0.00013942629626278164, "loss": 9.0528, "step": 10280 }, { "epoch": 0.046602328306616175, "grad_norm": 9.67076587677002, "learning_rate": 0.0001396977649081531, "loss": 9.0546, "step": 10300 }, { "epoch": 0.0466928182644931, "grad_norm": 10.09327220916748, "learning_rate": 0.00013996923355352457, "loss": 9.0741, "step": 10320 }, { "epoch": 0.04678330822237002, "grad_norm": 9.639015197753906, "learning_rate": 0.00014024070219889603, "loss": 9.0633, "step": 10340 }, { "epoch": 0.04687379818024695, "grad_norm": 10.251932144165039, "learning_rate": 0.0001405121708442675, "loss": 9.0446, "step": 10360 }, { "epoch": 0.04696428813812387, "grad_norm": 11.07875919342041, "learning_rate": 0.00014078363948963896, "loss": 9.0418, "step": 10380 }, { "epoch": 0.047054778096000796, "grad_norm": 9.328507423400879, "learning_rate": 0.00014105510813501042, "loss": 9.0287, "step": 10400 }, { "epoch": 0.04714526805387772, "grad_norm": 7.056753635406494, "learning_rate": 0.00014132657678038186, "loss": 9.0362, "step": 10420 }, { "epoch": 0.04723575801175465, "grad_norm": 8.899680137634277, "learning_rate": 0.0001415980454257533, "loss": 9.036, "step": 10440 }, { "epoch": 0.04732624796963157, "grad_norm": 9.175132751464844, "learning_rate": 0.00014186951407112476, "loss": 9.0444, "step": 10460 }, { "epoch": 0.047416737927508494, "grad_norm": 9.374978065490723, "learning_rate": 0.00014214098271649622, "loss": 9.0372, "step": 10480 }, { "epoch": 0.04750722788538542, "grad_norm": 9.893750190734863, "learning_rate": 0.00014241245136186769, "loss": 9.0424, "step": 10500 }, { "epoch": 0.04759771784326235, "grad_norm": 7.787280082702637, "learning_rate": 0.00014265677314270202, "loss": 8.9691, "step": 10520 }, { "epoch": 0.04768820780113927, "grad_norm": 17.40734100341797, "learning_rate": 0.00014277893403311917, "loss": 8.2225, "step": 10540 }, { "epoch": 0.04777869775901619, "grad_norm": NaN, "learning_rate": 0.00014286037462673062, "loss": 6.6046, "step": 10560 }, { "epoch": 0.047869187716893115, "grad_norm": NaN, "learning_rate": 0.0001429146683558049, "loss": 3.0921, "step": 10580 }, { "epoch": 0.047959677674770045, "grad_norm": NaN, "learning_rate": 0.00014294181522034205, "loss": 3.9765, "step": 10600 }, { "epoch": 0.04805016763264697, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 6.9972, "step": 10620 }, { "epoch": 0.04814065759052389, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10640 }, { "epoch": 0.04823114754840081, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10660 }, { "epoch": 0.04832163750627774, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10680 }, { "epoch": 0.048412127464154665, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10700 }, { "epoch": 0.04850261742203159, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10720 }, { "epoch": 0.04859310737990852, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10740 }, { "epoch": 0.04868359733778544, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10760 }, { "epoch": 0.04877408729566236, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10780 }, { "epoch": 0.048864577253539286, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10800 }, { "epoch": 0.048955067211416216, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10820 }, { "epoch": 0.04904555716929314, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10840 }, { "epoch": 0.04913604712717006, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10860 }, { "epoch": 0.049226537085046984, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10880 }, { "epoch": 0.049317027042923914, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10900 }, { "epoch": 0.04940751700080084, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10920 }, { "epoch": 0.04949800695867776, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10940 }, { "epoch": 0.04958849691655468, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10960 }, { "epoch": 0.04967898687443161, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 10980 }, { "epoch": 0.049769476832308535, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11000 }, { "epoch": 0.04985996679018546, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11020 }, { "epoch": 0.04995045674806239, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11040 }, { "epoch": 0.05004094670593931, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11060 }, { "epoch": 0.05013143666381623, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11080 }, { "epoch": 0.050221926621693155, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11100 }, { "epoch": 0.050312416579570085, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11120 }, { "epoch": 0.05040290653744701, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11140 }, { "epoch": 0.05049339649532393, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11160 }, { "epoch": 0.05058388645320085, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11180 }, { "epoch": 0.05067437641107778, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11200 }, { "epoch": 0.050764866368954706, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11220 }, { "epoch": 0.05085535632683163, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11240 }, { "epoch": 0.05094584628470855, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11260 }, { "epoch": 0.05103633624258548, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11280 }, { "epoch": 0.051126826200462404, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11300 }, { "epoch": 0.05121731615833933, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11320 }, { "epoch": 0.051307806116216256, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11340 }, { "epoch": 0.05139829607409318, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11360 }, { "epoch": 0.0514887860319701, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11380 }, { "epoch": 0.051579275989847025, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11400 }, { "epoch": 0.051669765947723954, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11420 }, { "epoch": 0.05176025590560088, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11440 }, { "epoch": 0.0518507458634778, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11460 }, { "epoch": 0.05194123582135472, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11480 }, { "epoch": 0.05203172577923165, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11500 }, { "epoch": 0.052122215737108575, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11520 }, { "epoch": 0.0522127056949855, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11540 }, { "epoch": 0.05230319565286242, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11560 }, { "epoch": 0.05239368561073935, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11580 }, { "epoch": 0.05248417556861627, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11600 }, { "epoch": 0.052574665526493196, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11620 }, { "epoch": 0.05266515548437012, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11640 }, { "epoch": 0.05275564544224705, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11660 }, { "epoch": 0.05284613540012397, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11680 }, { "epoch": 0.052936625358000894, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11700 }, { "epoch": 0.053027115315877824, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11720 }, { "epoch": 0.053117605273754746, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11740 }, { "epoch": 0.05320809523163167, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11760 }, { "epoch": 0.05329858518950859, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11780 }, { "epoch": 0.05338907514738552, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11800 }, { "epoch": 0.053479565105262444, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11820 }, { "epoch": 0.05357005506313937, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11840 }, { "epoch": 0.05366054502101629, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11860 }, { "epoch": 0.05375103497889322, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11880 }, { "epoch": 0.05384152493677014, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11900 }, { "epoch": 0.053932014894647065, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11920 }, { "epoch": 0.05402250485252399, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11940 }, { "epoch": 0.05411299481040092, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11960 }, { "epoch": 0.05420348476827784, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 11980 }, { "epoch": 0.05429397472615476, "grad_norm": NaN, "learning_rate": 0.00014298253551714776, "loss": 0.0, "step": 12000 }, { "epoch": 0.05429397472615476, "eval_accuracy": 0.021626624590642192, "eval_loss": NaN, "eval_runtime": 218.9297, "eval_samples_per_second": 2776.417, "eval_steps_per_second": 10.848, "step": 12000 } ], "logging_steps": 20, "max_steps": 663057, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 4315086323712000.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }