Invalid JSON:
Unexpected token 'N', ..."ad_norm": NaN,
"... is not valid JSON
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.05429397472615476, | |
| "eval_steps": 2000, | |
| "global_step": 12000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 9.048995787692461e-05, | |
| "grad_norm": 1.1874778270721436, | |
| "learning_rate": 2.7146864537145957e-07, | |
| "loss": 10.3312, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.00018097991575384922, | |
| "grad_norm": 1.3932149410247803, | |
| "learning_rate": 5.429372907429191e-07, | |
| "loss": 10.3266, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.00027146987363077383, | |
| "grad_norm": 1.2732529640197754, | |
| "learning_rate": 8.144059361143787e-07, | |
| "loss": 10.3163, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.00036195983150769844, | |
| "grad_norm": 1.07429039478302, | |
| "learning_rate": 1.0858745814858383e-06, | |
| "loss": 10.3044, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.00045244978938462305, | |
| "grad_norm": 1.0309141874313354, | |
| "learning_rate": 1.357343226857298e-06, | |
| "loss": 10.2959, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0005429397472615477, | |
| "grad_norm": 0.9270058870315552, | |
| "learning_rate": 1.6288118722287574e-06, | |
| "loss": 10.2818, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0006334297051384723, | |
| "grad_norm": 0.8409116864204407, | |
| "learning_rate": 1.900280517600217e-06, | |
| "loss": 10.2724, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.0007239196630153969, | |
| "grad_norm": 0.7587267160415649, | |
| "learning_rate": 2.1717491629716765e-06, | |
| "loss": 10.2662, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0008144096208923215, | |
| "grad_norm": 0.8605366945266724, | |
| "learning_rate": 2.4432178083431364e-06, | |
| "loss": 10.2567, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0009048995787692461, | |
| "grad_norm": 0.8124440908432007, | |
| "learning_rate": 2.714686453714596e-06, | |
| "loss": 10.2513, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0009953895366461706, | |
| "grad_norm": 0.8214222192764282, | |
| "learning_rate": 2.9861550990860553e-06, | |
| "loss": 10.2396, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0010858794945230953, | |
| "grad_norm": 0.7500312924385071, | |
| "learning_rate": 3.2576237444575148e-06, | |
| "loss": 10.2378, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0011763694524000198, | |
| "grad_norm": 0.7709519267082214, | |
| "learning_rate": 3.529092389828975e-06, | |
| "loss": 10.2287, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0012668594102769445, | |
| "grad_norm": 0.8319140672683716, | |
| "learning_rate": 3.800561035200434e-06, | |
| "loss": 10.2214, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.001357349368153869, | |
| "grad_norm": 0.8057898283004761, | |
| "learning_rate": 4.072029680571894e-06, | |
| "loss": 10.2072, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0014478393260307938, | |
| "grad_norm": 0.6834843754768372, | |
| "learning_rate": 4.343498325943353e-06, | |
| "loss": 10.1983, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.0015383292839077183, | |
| "grad_norm": 0.8223700523376465, | |
| "learning_rate": 4.614966971314813e-06, | |
| "loss": 10.1884, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.001628819241784643, | |
| "grad_norm": 0.8147690892219543, | |
| "learning_rate": 4.886435616686273e-06, | |
| "loss": 10.1814, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0017193091996615675, | |
| "grad_norm": 0.8512526750564575, | |
| "learning_rate": 5.157904262057733e-06, | |
| "loss": 10.1713, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.0018097991575384922, | |
| "grad_norm": 0.8844230771064758, | |
| "learning_rate": 5.429372907429192e-06, | |
| "loss": 10.1572, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0019002891154154167, | |
| "grad_norm": 0.9605993628501892, | |
| "learning_rate": 5.700841552800652e-06, | |
| "loss": 10.1496, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.001990779073292341, | |
| "grad_norm": 1.2027961015701294, | |
| "learning_rate": 5.972310198172111e-06, | |
| "loss": 10.1298, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.002081269031169266, | |
| "grad_norm": 1.4069308042526245, | |
| "learning_rate": 6.2437788435435705e-06, | |
| "loss": 10.1092, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.0021717589890461906, | |
| "grad_norm": 1.7658456563949585, | |
| "learning_rate": 6.5152474889150296e-06, | |
| "loss": 10.0954, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.002262248946923115, | |
| "grad_norm": 1.6941689252853394, | |
| "learning_rate": 6.7867161342864895e-06, | |
| "loss": 10.0746, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0023527389048000396, | |
| "grad_norm": 2.362786293029785, | |
| "learning_rate": 7.05818477965795e-06, | |
| "loss": 10.0613, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.0024432288626769646, | |
| "grad_norm": 1.827091932296753, | |
| "learning_rate": 7.329653425029408e-06, | |
| "loss": 10.045, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.002533718820553889, | |
| "grad_norm": 2.136615753173828, | |
| "learning_rate": 7.601122070400868e-06, | |
| "loss": 10.0243, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.0026242087784308136, | |
| "grad_norm": 2.501790761947632, | |
| "learning_rate": 7.872590715772328e-06, | |
| "loss": 10.0091, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.002714698736307738, | |
| "grad_norm": 2.7978005409240723, | |
| "learning_rate": 8.144059361143788e-06, | |
| "loss": 9.9957, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.002805188694184663, | |
| "grad_norm": 3.0485517978668213, | |
| "learning_rate": 8.415528006515246e-06, | |
| "loss": 9.9819, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.0028956786520615875, | |
| "grad_norm": 2.761986255645752, | |
| "learning_rate": 8.686996651886706e-06, | |
| "loss": 9.9596, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.002986168609938512, | |
| "grad_norm": 3.0985260009765625, | |
| "learning_rate": 8.958465297258166e-06, | |
| "loss": 9.9436, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.0030766585678154365, | |
| "grad_norm": 2.40391206741333, | |
| "learning_rate": 9.229933942629626e-06, | |
| "loss": 9.9226, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.0031671485256923614, | |
| "grad_norm": 1.933740496635437, | |
| "learning_rate": 9.501402588001086e-06, | |
| "loss": 9.9069, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.003257638483569286, | |
| "grad_norm": 2.518874168395996, | |
| "learning_rate": 9.772871233372546e-06, | |
| "loss": 9.881, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.0033481284414462104, | |
| "grad_norm": 2.8025624752044678, | |
| "learning_rate": 1.0044339878744006e-05, | |
| "loss": 9.8686, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.003438618399323135, | |
| "grad_norm": 1.943656086921692, | |
| "learning_rate": 1.0315808524115465e-05, | |
| "loss": 9.8463, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.00352910835720006, | |
| "grad_norm": 1.753179907798767, | |
| "learning_rate": 1.0587277169486925e-05, | |
| "loss": 9.8344, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.0036195983150769844, | |
| "grad_norm": 1.9388506412506104, | |
| "learning_rate": 1.0858745814858383e-05, | |
| "loss": 9.8144, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.003710088272953909, | |
| "grad_norm": 2.6278536319732666, | |
| "learning_rate": 1.1130214460229843e-05, | |
| "loss": 9.8008, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.0038005782308308334, | |
| "grad_norm": 1.8270655870437622, | |
| "learning_rate": 1.1401683105601303e-05, | |
| "loss": 9.7791, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.0038910681887077583, | |
| "grad_norm": 1.656563639640808, | |
| "learning_rate": 1.1673151750972763e-05, | |
| "loss": 9.7677, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.003981558146584682, | |
| "grad_norm": 1.6003910303115845, | |
| "learning_rate": 1.1944620396344221e-05, | |
| "loss": 9.7465, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.004072048104461608, | |
| "grad_norm": 1.5632762908935547, | |
| "learning_rate": 1.2216089041715681e-05, | |
| "loss": 9.73, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.004162538062338532, | |
| "grad_norm": 1.4974184036254883, | |
| "learning_rate": 1.2487557687087141e-05, | |
| "loss": 9.7067, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.004253028020215457, | |
| "grad_norm": 1.811112880706787, | |
| "learning_rate": 1.2759026332458601e-05, | |
| "loss": 9.6956, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.004343517978092381, | |
| "grad_norm": 1.505334734916687, | |
| "learning_rate": 1.3030494977830059e-05, | |
| "loss": 9.6667, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.004434007935969306, | |
| "grad_norm": 1.6951265335083008, | |
| "learning_rate": 1.3301963623201519e-05, | |
| "loss": 9.6505, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.00452449789384623, | |
| "grad_norm": 1.6119604110717773, | |
| "learning_rate": 1.3573432268572979e-05, | |
| "loss": 9.6381, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.004614987851723155, | |
| "grad_norm": 1.1929903030395508, | |
| "learning_rate": 1.3844900913944439e-05, | |
| "loss": 9.6209, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.004705477809600079, | |
| "grad_norm": 1.5701353549957275, | |
| "learning_rate": 1.41163695593159e-05, | |
| "loss": 9.5956, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.004795967767477005, | |
| "grad_norm": 1.32628333568573, | |
| "learning_rate": 1.4387838204687359e-05, | |
| "loss": 9.5899, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.004886457725353929, | |
| "grad_norm": 1.5850657224655151, | |
| "learning_rate": 1.4659306850058817e-05, | |
| "loss": 9.5779, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.004976947683230854, | |
| "grad_norm": 1.3933109045028687, | |
| "learning_rate": 1.4930775495430278e-05, | |
| "loss": 9.5701, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.005067437641107778, | |
| "grad_norm": 1.258367657661438, | |
| "learning_rate": 1.5202244140801737e-05, | |
| "loss": 9.5468, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.005157927598984703, | |
| "grad_norm": 1.3926512002944946, | |
| "learning_rate": 1.5473712786173196e-05, | |
| "loss": 9.5392, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.005248417556861627, | |
| "grad_norm": 1.1674704551696777, | |
| "learning_rate": 1.5745181431544656e-05, | |
| "loss": 9.5291, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.005338907514738552, | |
| "grad_norm": 1.4704829454421997, | |
| "learning_rate": 1.6016650076916116e-05, | |
| "loss": 9.5219, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.005429397472615476, | |
| "grad_norm": 1.6223082542419434, | |
| "learning_rate": 1.6288118722287576e-05, | |
| "loss": 9.4918, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.0055198874304924015, | |
| "grad_norm": 1.8586570024490356, | |
| "learning_rate": 1.6559587367659036e-05, | |
| "loss": 9.4895, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.005610377388369326, | |
| "grad_norm": 1.4105405807495117, | |
| "learning_rate": 1.6831056013030492e-05, | |
| "loss": 9.4886, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.0057008673462462505, | |
| "grad_norm": 1.4756163358688354, | |
| "learning_rate": 1.7102524658401956e-05, | |
| "loss": 9.4702, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.005791357304123175, | |
| "grad_norm": 1.3847874402999878, | |
| "learning_rate": 1.7373993303773412e-05, | |
| "loss": 9.4638, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.0058818472620000995, | |
| "grad_norm": 1.5135865211486816, | |
| "learning_rate": 1.7645461949144875e-05, | |
| "loss": 9.4583, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.005972337219877024, | |
| "grad_norm": 1.462760329246521, | |
| "learning_rate": 1.7916930594516332e-05, | |
| "loss": 9.4292, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.0060628271777539485, | |
| "grad_norm": 1.646760106086731, | |
| "learning_rate": 1.8188399239887792e-05, | |
| "loss": 9.4419, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.006153317135630873, | |
| "grad_norm": 1.3564046621322632, | |
| "learning_rate": 1.8459867885259252e-05, | |
| "loss": 9.4283, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.006243807093507798, | |
| "grad_norm": 1.4385489225387573, | |
| "learning_rate": 1.873133653063071e-05, | |
| "loss": 9.4208, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.006334297051384723, | |
| "grad_norm": 1.3975261449813843, | |
| "learning_rate": 1.900280517600217e-05, | |
| "loss": 9.4015, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.006424787009261647, | |
| "grad_norm": 1.4809174537658691, | |
| "learning_rate": 1.927427382137363e-05, | |
| "loss": 9.4009, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.006515276967138572, | |
| "grad_norm": 1.5181605815887451, | |
| "learning_rate": 1.954574246674509e-05, | |
| "loss": 9.3969, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.006605766925015496, | |
| "grad_norm": 1.4760838747024536, | |
| "learning_rate": 1.981721111211655e-05, | |
| "loss": 9.395, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.006696256882892421, | |
| "grad_norm": 1.6140539646148682, | |
| "learning_rate": 2.008867975748801e-05, | |
| "loss": 9.3868, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.006786746840769345, | |
| "grad_norm": 1.469307541847229, | |
| "learning_rate": 2.0360148402859468e-05, | |
| "loss": 9.3766, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.00687723679864627, | |
| "grad_norm": 1.8742159605026245, | |
| "learning_rate": 2.063161704823093e-05, | |
| "loss": 9.3715, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.006967726756523195, | |
| "grad_norm": 1.5996043682098389, | |
| "learning_rate": 2.0903085693602387e-05, | |
| "loss": 9.3622, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.00705821671440012, | |
| "grad_norm": 1.867632508277893, | |
| "learning_rate": 2.117455433897385e-05, | |
| "loss": 9.3704, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.007148706672277044, | |
| "grad_norm": 1.4762872457504272, | |
| "learning_rate": 2.1446022984345307e-05, | |
| "loss": 9.3741, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.007239196630153969, | |
| "grad_norm": 1.5752198696136475, | |
| "learning_rate": 2.1717491629716767e-05, | |
| "loss": 9.3561, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.007329686588030893, | |
| "grad_norm": 1.637786865234375, | |
| "learning_rate": 2.1988960275088227e-05, | |
| "loss": 9.3535, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.007420176545907818, | |
| "grad_norm": 2.6087028980255127, | |
| "learning_rate": 2.2260428920459687e-05, | |
| "loss": 9.3541, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.007510666503784742, | |
| "grad_norm": 1.977252721786499, | |
| "learning_rate": 2.2531897565831143e-05, | |
| "loss": 9.3341, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.007601156461661667, | |
| "grad_norm": 1.9511388540267944, | |
| "learning_rate": 2.2803366211202606e-05, | |
| "loss": 9.339, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.007691646419538592, | |
| "grad_norm": 1.8821523189544678, | |
| "learning_rate": 2.3074834856574063e-05, | |
| "loss": 9.3234, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.007782136377415517, | |
| "grad_norm": 1.5517367124557495, | |
| "learning_rate": 2.3346303501945526e-05, | |
| "loss": 9.3367, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.00787262633529244, | |
| "grad_norm": 2.164625883102417, | |
| "learning_rate": 2.3617772147316983e-05, | |
| "loss": 9.3366, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.007963116293169365, | |
| "grad_norm": 2.4158406257629395, | |
| "learning_rate": 2.3889240792688443e-05, | |
| "loss": 9.3221, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.00805360625104629, | |
| "grad_norm": 1.8652360439300537, | |
| "learning_rate": 2.4160709438059906e-05, | |
| "loss": 9.3098, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.008144096208923216, | |
| "grad_norm": 1.8249917030334473, | |
| "learning_rate": 2.4432178083431362e-05, | |
| "loss": 9.3094, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.00823458616680014, | |
| "grad_norm": 2.06990647315979, | |
| "learning_rate": 2.4703646728802822e-05, | |
| "loss": 9.2994, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.008325076124677065, | |
| "grad_norm": 2.461805582046509, | |
| "learning_rate": 2.4975115374174282e-05, | |
| "loss": 9.3157, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.008415566082553989, | |
| "grad_norm": 2.1320767402648926, | |
| "learning_rate": 2.5246584019545742e-05, | |
| "loss": 9.281, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.008506056040430914, | |
| "grad_norm": 2.6872756481170654, | |
| "learning_rate": 2.5518052664917202e-05, | |
| "loss": 9.2917, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.008596545998307838, | |
| "grad_norm": 2.4759294986724854, | |
| "learning_rate": 2.5789521310288662e-05, | |
| "loss": 9.2941, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.008687035956184763, | |
| "grad_norm": 1.8129667043685913, | |
| "learning_rate": 2.6060989955660118e-05, | |
| "loss": 9.2815, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.008777525914061687, | |
| "grad_norm": 2.9053220748901367, | |
| "learning_rate": 2.633245860103158e-05, | |
| "loss": 9.2801, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.008868015871938612, | |
| "grad_norm": 2.412623167037964, | |
| "learning_rate": 2.6603927246403038e-05, | |
| "loss": 9.2719, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.008958505829815536, | |
| "grad_norm": 1.972790002822876, | |
| "learning_rate": 2.6875395891774498e-05, | |
| "loss": 9.2729, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.00904899578769246, | |
| "grad_norm": 3.04768705368042, | |
| "learning_rate": 2.7146864537145958e-05, | |
| "loss": 9.2653, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.00904899578769246, | |
| "eval_accuracy": 0.10545615706904701, | |
| "eval_loss": 9.261013984680176, | |
| "eval_runtime": 215.2628, | |
| "eval_samples_per_second": 2823.711, | |
| "eval_steps_per_second": 11.033, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.009139485745569385, | |
| "grad_norm": 2.2706515789031982, | |
| "learning_rate": 2.7418333182517418e-05, | |
| "loss": 9.2604, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.00922997570344631, | |
| "grad_norm": 2.297621011734009, | |
| "learning_rate": 2.7689801827888878e-05, | |
| "loss": 9.2367, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.009320465661323234, | |
| "grad_norm": 2.049971342086792, | |
| "learning_rate": 2.7961270473260337e-05, | |
| "loss": 9.2545, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.009410955619200159, | |
| "grad_norm": 2.3538951873779297, | |
| "learning_rate": 2.82327391186318e-05, | |
| "loss": 9.2511, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.009501445577077083, | |
| "grad_norm": 3.1383931636810303, | |
| "learning_rate": 2.8504207764003254e-05, | |
| "loss": 9.2319, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.00959193553495401, | |
| "grad_norm": 2.6480958461761475, | |
| "learning_rate": 2.8775676409374717e-05, | |
| "loss": 9.2353, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.009682425492830934, | |
| "grad_norm": 2.3209128379821777, | |
| "learning_rate": 2.9047145054746177e-05, | |
| "loss": 9.241, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.009772915450707858, | |
| "grad_norm": 2.3225491046905518, | |
| "learning_rate": 2.9318613700117634e-05, | |
| "loss": 9.2133, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.009863405408584783, | |
| "grad_norm": 2.0134568214416504, | |
| "learning_rate": 2.9590082345489093e-05, | |
| "loss": 9.2188, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.009953895366461707, | |
| "grad_norm": 3.033569574356079, | |
| "learning_rate": 2.9861550990860557e-05, | |
| "loss": 9.2131, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.010044385324338632, | |
| "grad_norm": 2.8993263244628906, | |
| "learning_rate": 3.0133019636232017e-05, | |
| "loss": 9.2119, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.010134875282215556, | |
| "grad_norm": 2.718588352203369, | |
| "learning_rate": 3.0404488281603473e-05, | |
| "loss": 9.2187, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.01022536524009248, | |
| "grad_norm": 2.635470390319824, | |
| "learning_rate": 3.0675956926974936e-05, | |
| "loss": 9.1953, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.010315855197969405, | |
| "grad_norm": 2.6032440662384033, | |
| "learning_rate": 3.094742557234639e-05, | |
| "loss": 9.1967, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.01040634515584633, | |
| "grad_norm": 2.4713950157165527, | |
| "learning_rate": 3.121889421771785e-05, | |
| "loss": 9.1881, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.010496835113723254, | |
| "grad_norm": 2.4573025703430176, | |
| "learning_rate": 3.149036286308931e-05, | |
| "loss": 9.1827, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.010587325071600179, | |
| "grad_norm": 2.6169447898864746, | |
| "learning_rate": 3.1761831508460776e-05, | |
| "loss": 9.1865, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.010677815029477103, | |
| "grad_norm": 2.6744954586029053, | |
| "learning_rate": 3.203330015383223e-05, | |
| "loss": 9.1829, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.010768304987354028, | |
| "grad_norm": 2.766223907470703, | |
| "learning_rate": 3.230476879920369e-05, | |
| "loss": 9.177, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.010858794945230952, | |
| "grad_norm": 2.8083655834198, | |
| "learning_rate": 3.257623744457515e-05, | |
| "loss": 9.1853, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.010949284903107877, | |
| "grad_norm": 4.484155178070068, | |
| "learning_rate": 3.284770608994661e-05, | |
| "loss": 9.1655, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.011039774860984803, | |
| "grad_norm": 3.5152087211608887, | |
| "learning_rate": 3.311917473531807e-05, | |
| "loss": 9.1516, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.011130264818861728, | |
| "grad_norm": 2.3122165203094482, | |
| "learning_rate": 3.339064338068953e-05, | |
| "loss": 9.1552, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.011220754776738652, | |
| "grad_norm": 3.0563108921051025, | |
| "learning_rate": 3.3662112026060985e-05, | |
| "loss": 9.1494, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.011311244734615577, | |
| "grad_norm": 3.926668882369995, | |
| "learning_rate": 3.393358067143245e-05, | |
| "loss": 9.1425, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.011401734692492501, | |
| "grad_norm": 2.7006709575653076, | |
| "learning_rate": 3.420504931680391e-05, | |
| "loss": 9.1328, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.011492224650369426, | |
| "grad_norm": 3.1082751750946045, | |
| "learning_rate": 3.447651796217537e-05, | |
| "loss": 9.1316, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.01158271460824635, | |
| "grad_norm": 2.744490385055542, | |
| "learning_rate": 3.4747986607546824e-05, | |
| "loss": 9.1193, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.011673204566123275, | |
| "grad_norm": 2.8441922664642334, | |
| "learning_rate": 3.501945525291829e-05, | |
| "loss": 9.1174, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.011763694524000199, | |
| "grad_norm": 3.7371647357940674, | |
| "learning_rate": 3.529092389828975e-05, | |
| "loss": 9.1217, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.011854184481877124, | |
| "grad_norm": 3.0141730308532715, | |
| "learning_rate": 3.556239254366121e-05, | |
| "loss": 9.0999, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.011944674439754048, | |
| "grad_norm": 2.9731669425964355, | |
| "learning_rate": 3.5833861189032664e-05, | |
| "loss": 9.1044, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.012035164397630973, | |
| "grad_norm": 3.166254997253418, | |
| "learning_rate": 3.610532983440413e-05, | |
| "loss": 9.103, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.012125654355507897, | |
| "grad_norm": 2.949646472930908, | |
| "learning_rate": 3.6376798479775584e-05, | |
| "loss": 9.1026, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.012216144313384822, | |
| "grad_norm": 2.762843132019043, | |
| "learning_rate": 3.664826712514705e-05, | |
| "loss": 9.1047, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.012306634271261746, | |
| "grad_norm": 3.188957929611206, | |
| "learning_rate": 3.6919735770518503e-05, | |
| "loss": 9.0968, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.01239712422913867, | |
| "grad_norm": 4.116425037384033, | |
| "learning_rate": 3.719120441588996e-05, | |
| "loss": 9.0993, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.012487614187015597, | |
| "grad_norm": 2.7521297931671143, | |
| "learning_rate": 3.746267306126142e-05, | |
| "loss": 9.063, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.012578104144892521, | |
| "grad_norm": 3.1481823921203613, | |
| "learning_rate": 3.7734141706632886e-05, | |
| "loss": 9.062, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.012668594102769446, | |
| "grad_norm": 2.48091721534729, | |
| "learning_rate": 3.800561035200434e-05, | |
| "loss": 9.0727, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.01275908406064637, | |
| "grad_norm": 3.0816426277160645, | |
| "learning_rate": 3.82770789973758e-05, | |
| "loss": 9.0525, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.012849574018523295, | |
| "grad_norm": 2.86342191696167, | |
| "learning_rate": 3.854854764274726e-05, | |
| "loss": 9.0447, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.01294006397640022, | |
| "grad_norm": 2.769746780395508, | |
| "learning_rate": 3.8820016288118726e-05, | |
| "loss": 9.0524, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.013030553934277144, | |
| "grad_norm": 3.4716339111328125, | |
| "learning_rate": 3.909148493349018e-05, | |
| "loss": 9.0453, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.013121043892154068, | |
| "grad_norm": 4.585721969604492, | |
| "learning_rate": 3.936295357886164e-05, | |
| "loss": 9.0466, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.013211533850030993, | |
| "grad_norm": 3.7394728660583496, | |
| "learning_rate": 3.96344222242331e-05, | |
| "loss": 9.0405, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.013302023807907917, | |
| "grad_norm": 3.9100561141967773, | |
| "learning_rate": 3.990589086960456e-05, | |
| "loss": 9.0415, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.013392513765784842, | |
| "grad_norm": 2.94941782951355, | |
| "learning_rate": 4.017735951497602e-05, | |
| "loss": 9.0265, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.013483003723661766, | |
| "grad_norm": 2.6733226776123047, | |
| "learning_rate": 4.044882816034748e-05, | |
| "loss": 9.0195, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.01357349368153869, | |
| "grad_norm": 3.4839463233947754, | |
| "learning_rate": 4.0720296805718935e-05, | |
| "loss": 9.0204, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.013663983639415615, | |
| "grad_norm": 3.460050344467163, | |
| "learning_rate": 4.09917654510904e-05, | |
| "loss": 9.0086, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.01375447359729254, | |
| "grad_norm": 4.007343769073486, | |
| "learning_rate": 4.126323409646186e-05, | |
| "loss": 9.0185, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.013844963555169464, | |
| "grad_norm": 3.917860746383667, | |
| "learning_rate": 4.153470274183331e-05, | |
| "loss": 9.0032, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.01393545351304639, | |
| "grad_norm": 3.5258123874664307, | |
| "learning_rate": 4.1806171387204775e-05, | |
| "loss": 8.9983, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.014025943470923315, | |
| "grad_norm": 3.002183198928833, | |
| "learning_rate": 4.207764003257624e-05, | |
| "loss": 8.9898, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.01411643342880024, | |
| "grad_norm": 3.2682976722717285, | |
| "learning_rate": 4.23491086779477e-05, | |
| "loss": 8.9932, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.014206923386677164, | |
| "grad_norm": 3.7955832481384277, | |
| "learning_rate": 4.262057732331915e-05, | |
| "loss": 8.9879, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.014297413344554089, | |
| "grad_norm": 3.3697524070739746, | |
| "learning_rate": 4.2892045968690614e-05, | |
| "loss": 8.9757, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.014387903302431013, | |
| "grad_norm": 3.756788730621338, | |
| "learning_rate": 4.316351461406208e-05, | |
| "loss": 8.9811, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.014478393260307938, | |
| "grad_norm": 3.024722099304199, | |
| "learning_rate": 4.3434983259433534e-05, | |
| "loss": 8.9613, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.014568883218184862, | |
| "grad_norm": 3.258375406265259, | |
| "learning_rate": 4.3706451904805e-05, | |
| "loss": 8.9614, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.014659373176061787, | |
| "grad_norm": 2.970426559448242, | |
| "learning_rate": 4.3977920550176454e-05, | |
| "loss": 8.9624, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.014749863133938711, | |
| "grad_norm": 4.601590156555176, | |
| "learning_rate": 4.424938919554791e-05, | |
| "loss": 8.9615, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.014840353091815636, | |
| "grad_norm": 4.773068428039551, | |
| "learning_rate": 4.4520857840919373e-05, | |
| "loss": 8.9668, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.01493084304969256, | |
| "grad_norm": 3.182677984237671, | |
| "learning_rate": 4.479232648629084e-05, | |
| "loss": 8.933, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.015021333007569485, | |
| "grad_norm": 3.160553455352783, | |
| "learning_rate": 4.5063795131662286e-05, | |
| "loss": 8.9409, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.015111822965446409, | |
| "grad_norm": 3.0617620944976807, | |
| "learning_rate": 4.533526377703375e-05, | |
| "loss": 8.95, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.015202312923323334, | |
| "grad_norm": 3.1966211795806885, | |
| "learning_rate": 4.560673242240521e-05, | |
| "loss": 8.9379, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.015292802881200258, | |
| "grad_norm": 2.3314368724823, | |
| "learning_rate": 4.587820106777667e-05, | |
| "loss": 8.9246, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.015383292839077184, | |
| "grad_norm": 3.1242740154266357, | |
| "learning_rate": 4.6149669713148126e-05, | |
| "loss": 8.9409, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.015473782796954109, | |
| "grad_norm": 3.042051315307617, | |
| "learning_rate": 4.642113835851959e-05, | |
| "loss": 8.9204, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.015564272754831033, | |
| "grad_norm": 4.102015495300293, | |
| "learning_rate": 4.669260700389105e-05, | |
| "loss": 8.8915, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.015654762712707958, | |
| "grad_norm": 3.2991299629211426, | |
| "learning_rate": 4.696407564926251e-05, | |
| "loss": 8.8897, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.01574525267058488, | |
| "grad_norm": 3.501094102859497, | |
| "learning_rate": 4.7235544294633965e-05, | |
| "loss": 8.9223, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.015835742628461807, | |
| "grad_norm": 6.248113632202148, | |
| "learning_rate": 4.750701294000543e-05, | |
| "loss": 8.8925, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.01592623258633873, | |
| "grad_norm": 4.329127788543701, | |
| "learning_rate": 4.7778481585376885e-05, | |
| "loss": 8.8891, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.016016722544215656, | |
| "grad_norm": 3.575141191482544, | |
| "learning_rate": 4.804995023074835e-05, | |
| "loss": 8.8741, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.01610721250209258, | |
| "grad_norm": 3.301194429397583, | |
| "learning_rate": 4.832141887611981e-05, | |
| "loss": 8.8965, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.016197702459969505, | |
| "grad_norm": 3.7364182472229004, | |
| "learning_rate": 4.859288752149126e-05, | |
| "loss": 8.8899, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.01628819241784643, | |
| "grad_norm": 5.336267471313477, | |
| "learning_rate": 4.8864356166862725e-05, | |
| "loss": 8.8959, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.016378682375723354, | |
| "grad_norm": 4.769089221954346, | |
| "learning_rate": 4.913582481223419e-05, | |
| "loss": 8.8981, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.01646917233360028, | |
| "grad_norm": 3.369799852371216, | |
| "learning_rate": 4.9407293457605645e-05, | |
| "loss": 8.8954, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.016559662291477203, | |
| "grad_norm": 3.063030481338501, | |
| "learning_rate": 4.96787621029771e-05, | |
| "loss": 8.8694, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.01665015224935413, | |
| "grad_norm": 4.988938331604004, | |
| "learning_rate": 4.9950230748348564e-05, | |
| "loss": 8.8611, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.016740642207231052, | |
| "grad_norm": 3.5118601322174072, | |
| "learning_rate": 5.022169939372003e-05, | |
| "loss": 8.8525, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.016831132165107978, | |
| "grad_norm": 4.257157325744629, | |
| "learning_rate": 5.0493168039091484e-05, | |
| "loss": 8.8547, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.0169216221229849, | |
| "grad_norm": 3.7021615505218506, | |
| "learning_rate": 5.076463668446294e-05, | |
| "loss": 8.8572, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.017012112080861827, | |
| "grad_norm": 4.868439197540283, | |
| "learning_rate": 5.1036105329834404e-05, | |
| "loss": 8.8684, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.01710260203873875, | |
| "grad_norm": 6.547580718994141, | |
| "learning_rate": 5.130757397520586e-05, | |
| "loss": 8.828, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.017193091996615676, | |
| "grad_norm": 5.9254374504089355, | |
| "learning_rate": 5.1579042620577324e-05, | |
| "loss": 8.838, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.0172835819544926, | |
| "grad_norm": 6.061065196990967, | |
| "learning_rate": 5.185051126594879e-05, | |
| "loss": 8.8405, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.017374071912369525, | |
| "grad_norm": 6.026751518249512, | |
| "learning_rate": 5.2121979911320237e-05, | |
| "loss": 8.8305, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.017464561870246448, | |
| "grad_norm": 4.982965469360352, | |
| "learning_rate": 5.23934485566917e-05, | |
| "loss": 8.8316, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.017555051828123374, | |
| "grad_norm": 9.080221176147461, | |
| "learning_rate": 5.266491720206316e-05, | |
| "loss": 8.8267, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.0176455417860003, | |
| "grad_norm": 6.644583225250244, | |
| "learning_rate": 5.293638584743462e-05, | |
| "loss": 8.8331, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.017736031743877223, | |
| "grad_norm": 6.022925853729248, | |
| "learning_rate": 5.3207854492806076e-05, | |
| "loss": 8.8198, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.01782652170175415, | |
| "grad_norm": 4.794320583343506, | |
| "learning_rate": 5.347932313817754e-05, | |
| "loss": 8.8075, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.017917011659631072, | |
| "grad_norm": 5.949656963348389, | |
| "learning_rate": 5.3750791783548996e-05, | |
| "loss": 8.8175, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.018007501617508, | |
| "grad_norm": 7.972283840179443, | |
| "learning_rate": 5.402226042892046e-05, | |
| "loss": 8.8263, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.01809799157538492, | |
| "grad_norm": 6.132015228271484, | |
| "learning_rate": 5.4293729074291916e-05, | |
| "loss": 8.8035, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.01809799157538492, | |
| "eval_accuracy": 0.10955227810888264, | |
| "eval_loss": 8.793069839477539, | |
| "eval_runtime": 217.825, | |
| "eval_samples_per_second": 2790.497, | |
| "eval_steps_per_second": 10.903, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.018188481533261847, | |
| "grad_norm": 3.9714837074279785, | |
| "learning_rate": 5.455162428739481e-05, | |
| "loss": 8.8029, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.01827897149113877, | |
| "grad_norm": 3.9775164127349854, | |
| "learning_rate": 5.482309293276626e-05, | |
| "loss": 8.7859, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.018369461449015696, | |
| "grad_norm": 4.350288391113281, | |
| "learning_rate": 5.509456157813772e-05, | |
| "loss": 8.8049, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.01845995140689262, | |
| "grad_norm": 5.212925910949707, | |
| "learning_rate": 5.5366030223509186e-05, | |
| "loss": 8.7768, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.018550441364769545, | |
| "grad_norm": 5.585092544555664, | |
| "learning_rate": 5.563749886888064e-05, | |
| "loss": 8.7792, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.018640931322646468, | |
| "grad_norm": 5.019256114959717, | |
| "learning_rate": 5.59089675142521e-05, | |
| "loss": 8.7843, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.018731421280523394, | |
| "grad_norm": 5.925191402435303, | |
| "learning_rate": 5.616686272735499e-05, | |
| "loss": 8.7693, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.018821911238400317, | |
| "grad_norm": 4.334403991699219, | |
| "learning_rate": 5.643833137272645e-05, | |
| "loss": 8.7652, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.018912401196277243, | |
| "grad_norm": 6.786751747131348, | |
| "learning_rate": 5.670980001809791e-05, | |
| "loss": 8.76, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.019002891154154166, | |
| "grad_norm": 5.805715084075928, | |
| "learning_rate": 5.698126866346936e-05, | |
| "loss": 8.7835, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.019093381112031092, | |
| "grad_norm": 7.2905120849609375, | |
| "learning_rate": 5.7252737308840826e-05, | |
| "loss": 8.7524, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.01918387106990802, | |
| "grad_norm": 4.692761421203613, | |
| "learning_rate": 5.752420595421228e-05, | |
| "loss": 8.7274, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.01927436102778494, | |
| "grad_norm": 5.6952924728393555, | |
| "learning_rate": 5.7795674599583746e-05, | |
| "loss": 8.7625, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.019364850985661868, | |
| "grad_norm": 7.725805759429932, | |
| "learning_rate": 5.806714324495521e-05, | |
| "loss": 8.7313, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.01945534094353879, | |
| "grad_norm": 5.154263496398926, | |
| "learning_rate": 5.833861189032667e-05, | |
| "loss": 8.7433, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.019545830901415717, | |
| "grad_norm": 7.734066963195801, | |
| "learning_rate": 5.861008053569812e-05, | |
| "loss": 8.738, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.01963632085929264, | |
| "grad_norm": 6.757390022277832, | |
| "learning_rate": 5.888154918106958e-05, | |
| "loss": 8.6971, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.019726810817169566, | |
| "grad_norm": 9.869467735290527, | |
| "learning_rate": 5.915301782644104e-05, | |
| "loss": 8.7437, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.01981730077504649, | |
| "grad_norm": 4.825913429260254, | |
| "learning_rate": 5.9424486471812505e-05, | |
| "loss": 8.712, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.019907790732923415, | |
| "grad_norm": 8.725457191467285, | |
| "learning_rate": 5.969595511718397e-05, | |
| "loss": 8.7054, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.019998280690800337, | |
| "grad_norm": 9.08804702758789, | |
| "learning_rate": 5.9967423762555425e-05, | |
| "loss": 8.6968, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.020088770648677264, | |
| "grad_norm": 7.369052886962891, | |
| "learning_rate": 6.023889240792689e-05, | |
| "loss": 8.6736, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.020179260606554186, | |
| "grad_norm": 9.925745964050293, | |
| "learning_rate": 6.051036105329834e-05, | |
| "loss": 8.7043, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.020269750564431113, | |
| "grad_norm": 10.998024940490723, | |
| "learning_rate": 6.07818296986698e-05, | |
| "loss": 8.7098, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.020360240522308035, | |
| "grad_norm": 9.010730743408203, | |
| "learning_rate": 6.105329834404126e-05, | |
| "loss": 8.6893, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.02045073048018496, | |
| "grad_norm": 5.833269119262695, | |
| "learning_rate": 6.132476698941272e-05, | |
| "loss": 8.6928, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.020541220438061888, | |
| "grad_norm": 5.778794288635254, | |
| "learning_rate": 6.159623563478418e-05, | |
| "loss": 8.6813, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.02063171039593881, | |
| "grad_norm": 6.518376350402832, | |
| "learning_rate": 6.186770428015565e-05, | |
| "loss": 8.6679, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.020722200353815737, | |
| "grad_norm": 7.985169887542725, | |
| "learning_rate": 6.21391729255271e-05, | |
| "loss": 8.6912, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.02081269031169266, | |
| "grad_norm": 6.066607475280762, | |
| "learning_rate": 6.241064157089856e-05, | |
| "loss": 8.67, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.020903180269569586, | |
| "grad_norm": 7.519238471984863, | |
| "learning_rate": 6.268211021627002e-05, | |
| "loss": 8.648, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.02099367022744651, | |
| "grad_norm": 9.485710144042969, | |
| "learning_rate": 6.295357886164147e-05, | |
| "loss": 8.6484, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.021084160185323435, | |
| "grad_norm": 9.786864280700684, | |
| "learning_rate": 6.322504750701294e-05, | |
| "loss": 8.637, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.021174650143200358, | |
| "grad_norm": 8.231635093688965, | |
| "learning_rate": 6.34965161523844e-05, | |
| "loss": 8.648, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.021265140101077284, | |
| "grad_norm": 7.283841609954834, | |
| "learning_rate": 6.376798479775586e-05, | |
| "loss": 8.64, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.021355630058954207, | |
| "grad_norm": 7.625393390655518, | |
| "learning_rate": 6.403945344312731e-05, | |
| "loss": 8.6713, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.021446120016831133, | |
| "grad_norm": 7.758394241333008, | |
| "learning_rate": 6.431092208849878e-05, | |
| "loss": 8.6473, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.021536609974708056, | |
| "grad_norm": 7.519627571105957, | |
| "learning_rate": 6.458239073387024e-05, | |
| "loss": 8.6144, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.021627099932584982, | |
| "grad_norm": 7.698405742645264, | |
| "learning_rate": 6.48538593792417e-05, | |
| "loss": 8.6678, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.021717589890461905, | |
| "grad_norm": 7.843724727630615, | |
| "learning_rate": 6.512532802461315e-05, | |
| "loss": 8.6292, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.02180807984833883, | |
| "grad_norm": 9.748797416687012, | |
| "learning_rate": 6.539679666998462e-05, | |
| "loss": 8.6059, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.021898569806215754, | |
| "grad_norm": 8.68276596069336, | |
| "learning_rate": 6.566826531535607e-05, | |
| "loss": 8.6153, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.02198905976409268, | |
| "grad_norm": 9.26171588897705, | |
| "learning_rate": 6.593973396072753e-05, | |
| "loss": 8.6343, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.022079549721969606, | |
| "grad_norm": 10.164648056030273, | |
| "learning_rate": 6.621120260609899e-05, | |
| "loss": 8.6255, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.02217003967984653, | |
| "grad_norm": 8.388748168945312, | |
| "learning_rate": 6.648267125147046e-05, | |
| "loss": 8.6111, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.022260529637723455, | |
| "grad_norm": 9.701128005981445, | |
| "learning_rate": 6.675413989684192e-05, | |
| "loss": 8.5902, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.022351019595600378, | |
| "grad_norm": 9.261332511901855, | |
| "learning_rate": 6.702560854221338e-05, | |
| "loss": 8.6013, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.022441509553477304, | |
| "grad_norm": 7.0918354988098145, | |
| "learning_rate": 6.729707718758483e-05, | |
| "loss": 8.5595, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.022531999511354227, | |
| "grad_norm": 8.793268203735352, | |
| "learning_rate": 6.756854583295628e-05, | |
| "loss": 8.5862, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.022622489469231153, | |
| "grad_norm": 8.539192199707031, | |
| "learning_rate": 6.784001447832774e-05, | |
| "loss": 8.5938, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.022712979427108076, | |
| "grad_norm": 8.60251522064209, | |
| "learning_rate": 6.811148312369921e-05, | |
| "loss": 8.598, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.022803469384985002, | |
| "grad_norm": 8.976070404052734, | |
| "learning_rate": 6.838295176907067e-05, | |
| "loss": 8.5896, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.022893959342861925, | |
| "grad_norm": 8.834037780761719, | |
| "learning_rate": 6.865442041444213e-05, | |
| "loss": 8.5654, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.02298444930073885, | |
| "grad_norm": 7.039853096008301, | |
| "learning_rate": 6.89258890598136e-05, | |
| "loss": 8.574, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.023074939258615774, | |
| "grad_norm": 4.989284515380859, | |
| "learning_rate": 6.919735770518505e-05, | |
| "loss": 8.584, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.0231654292164927, | |
| "grad_norm": 10.530620574951172, | |
| "learning_rate": 6.946882635055651e-05, | |
| "loss": 8.5884, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.023255919174369623, | |
| "grad_norm": 10.483266830444336, | |
| "learning_rate": 6.974029499592797e-05, | |
| "loss": 8.573, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.02334640913224655, | |
| "grad_norm": 9.433408737182617, | |
| "learning_rate": 7.001176364129942e-05, | |
| "loss": 8.5553, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.023436899090123475, | |
| "grad_norm": 10.707608222961426, | |
| "learning_rate": 7.028323228667089e-05, | |
| "loss": 8.5672, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.023527389048000398, | |
| "grad_norm": 11.35906982421875, | |
| "learning_rate": 7.055470093204235e-05, | |
| "loss": 8.5374, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.023617879005877324, | |
| "grad_norm": 9.386375427246094, | |
| "learning_rate": 7.08261695774138e-05, | |
| "loss": 8.5199, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.023708368963754247, | |
| "grad_norm": 10.813016891479492, | |
| "learning_rate": 7.109763822278526e-05, | |
| "loss": 8.5296, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.023798858921631173, | |
| "grad_norm": 10.738064765930176, | |
| "learning_rate": 7.136910686815673e-05, | |
| "loss": 8.5293, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.023889348879508096, | |
| "grad_norm": 12.89620590209961, | |
| "learning_rate": 7.164057551352819e-05, | |
| "loss": 8.5494, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.023979838837385022, | |
| "grad_norm": 11.624608039855957, | |
| "learning_rate": 7.191204415889965e-05, | |
| "loss": 8.5179, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.024070328795261945, | |
| "grad_norm": 7.694511413574219, | |
| "learning_rate": 7.21835128042711e-05, | |
| "loss": 8.5528, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.02416081875313887, | |
| "grad_norm": 9.326581954956055, | |
| "learning_rate": 7.245498144964257e-05, | |
| "loss": 8.5307, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.024251308711015794, | |
| "grad_norm": 8.548121452331543, | |
| "learning_rate": 7.272645009501402e-05, | |
| "loss": 8.5031, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.02434179866889272, | |
| "grad_norm": 10.232369422912598, | |
| "learning_rate": 7.299791874038548e-05, | |
| "loss": 8.4905, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.024432288626769643, | |
| "grad_norm": 9.751016616821289, | |
| "learning_rate": 7.326938738575694e-05, | |
| "loss": 8.4996, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.02452277858464657, | |
| "grad_norm": 11.058146476745605, | |
| "learning_rate": 7.35408560311284e-05, | |
| "loss": 8.4889, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.024613268542523492, | |
| "grad_norm": 8.11478042602539, | |
| "learning_rate": 7.381232467649987e-05, | |
| "loss": 8.5099, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.02470375850040042, | |
| "grad_norm": 8.138284683227539, | |
| "learning_rate": 7.408379332187133e-05, | |
| "loss": 8.4854, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.02479424845827734, | |
| "grad_norm": 7.7438459396362305, | |
| "learning_rate": 7.435526196724278e-05, | |
| "loss": 8.4877, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.024884738416154267, | |
| "grad_norm": 9.896592140197754, | |
| "learning_rate": 7.462673061261423e-05, | |
| "loss": 8.4662, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.024975228374031194, | |
| "grad_norm": 7.162434101104736, | |
| "learning_rate": 7.48981992579857e-05, | |
| "loss": 8.4772, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.025065718331908116, | |
| "grad_norm": 8.252161026000977, | |
| "learning_rate": 7.516966790335716e-05, | |
| "loss": 8.4936, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.025156208289785043, | |
| "grad_norm": 7.313194751739502, | |
| "learning_rate": 7.544113654872862e-05, | |
| "loss": 8.493, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.025246698247661965, | |
| "grad_norm": 11.324033737182617, | |
| "learning_rate": 7.571260519410008e-05, | |
| "loss": 8.4776, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.02533718820553889, | |
| "grad_norm": 9.4235258102417, | |
| "learning_rate": 7.598407383947155e-05, | |
| "loss": 8.4769, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.025427678163415814, | |
| "grad_norm": 6.676479339599609, | |
| "learning_rate": 7.6255542484843e-05, | |
| "loss": 8.4389, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.02551816812129274, | |
| "grad_norm": 9.530123710632324, | |
| "learning_rate": 7.652701113021446e-05, | |
| "loss": 8.4704, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.025608658079169663, | |
| "grad_norm": 10.614904403686523, | |
| "learning_rate": 7.679847977558591e-05, | |
| "loss": 8.4507, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.02569914803704659, | |
| "grad_norm": 7.5254974365234375, | |
| "learning_rate": 7.706994842095737e-05, | |
| "loss": 8.464, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.025789637994923512, | |
| "grad_norm": 7.461385250091553, | |
| "learning_rate": 7.734141706632884e-05, | |
| "loss": 8.4516, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.02588012795280044, | |
| "grad_norm": 9.106521606445312, | |
| "learning_rate": 7.76128857117003e-05, | |
| "loss": 8.4142, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.02597061791067736, | |
| "grad_norm": 8.536205291748047, | |
| "learning_rate": 7.788435435707175e-05, | |
| "loss": 8.4497, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.026061107868554288, | |
| "grad_norm": 7.925720691680908, | |
| "learning_rate": 7.815582300244321e-05, | |
| "loss": 8.4783, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.02615159782643121, | |
| "grad_norm": 11.187898635864258, | |
| "learning_rate": 7.842729164781468e-05, | |
| "loss": 8.4054, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.026242087784308137, | |
| "grad_norm": 6.965084075927734, | |
| "learning_rate": 7.869876029318614e-05, | |
| "loss": 8.4079, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.02633257774218506, | |
| "grad_norm": 8.090741157531738, | |
| "learning_rate": 7.89702289385576e-05, | |
| "loss": 8.4474, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.026423067700061986, | |
| "grad_norm": 9.698216438293457, | |
| "learning_rate": 7.924169758392905e-05, | |
| "loss": 8.3945, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.026513557657938912, | |
| "grad_norm": 7.889448642730713, | |
| "learning_rate": 7.951316622930052e-05, | |
| "loss": 8.4046, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.026604047615815835, | |
| "grad_norm": 11.487144470214844, | |
| "learning_rate": 7.978463487467197e-05, | |
| "loss": 8.4195, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.02669453757369276, | |
| "grad_norm": 9.28532886505127, | |
| "learning_rate": 8.005610352004343e-05, | |
| "loss": 8.406, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.026785027531569684, | |
| "grad_norm": 8.982071876525879, | |
| "learning_rate": 8.032757216541489e-05, | |
| "loss": 8.4221, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.02687551748944661, | |
| "grad_norm": 11.42358684539795, | |
| "learning_rate": 8.059904081078636e-05, | |
| "loss": 8.4423, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.026966007447323533, | |
| "grad_norm": 8.633251190185547, | |
| "learning_rate": 8.087050945615782e-05, | |
| "loss": 8.4233, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.02705649740520046, | |
| "grad_norm": 9.28022575378418, | |
| "learning_rate": 8.114197810152928e-05, | |
| "loss": 8.4169, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.02714698736307738, | |
| "grad_norm": 11.166740417480469, | |
| "learning_rate": 8.141344674690073e-05, | |
| "loss": 8.4018, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.02714698736307738, | |
| "eval_accuracy": 0.11314150543417859, | |
| "eval_loss": 8.402518272399902, | |
| "eval_runtime": 218.3209, | |
| "eval_samples_per_second": 2784.158, | |
| "eval_steps_per_second": 10.878, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.027237477320954308, | |
| "grad_norm": 7.100822925567627, | |
| "learning_rate": 8.167134196000362e-05, | |
| "loss": 8.4131, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.02732796727883123, | |
| "grad_norm": 8.460954666137695, | |
| "learning_rate": 8.194281060537508e-05, | |
| "loss": 8.4087, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.027418457236708157, | |
| "grad_norm": 7.642125129699707, | |
| "learning_rate": 8.221427925074653e-05, | |
| "loss": 8.3806, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.02750894719458508, | |
| "grad_norm": 8.104974746704102, | |
| "learning_rate": 8.2485747896118e-05, | |
| "loss": 8.404, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.027599437152462006, | |
| "grad_norm": 8.082459449768066, | |
| "learning_rate": 8.275721654148946e-05, | |
| "loss": 8.3865, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.02768992711033893, | |
| "grad_norm": 8.786911010742188, | |
| "learning_rate": 8.302868518686092e-05, | |
| "loss": 8.3475, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.027780417068215855, | |
| "grad_norm": 7.780808925628662, | |
| "learning_rate": 8.330015383223237e-05, | |
| "loss": 8.3798, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.02787090702609278, | |
| "grad_norm": 10.508188247680664, | |
| "learning_rate": 8.357162247760384e-05, | |
| "loss": 8.3718, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.027961396983969704, | |
| "grad_norm": 9.833992004394531, | |
| "learning_rate": 8.38430911229753e-05, | |
| "loss": 8.3952, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.02805188694184663, | |
| "grad_norm": 9.917244911193848, | |
| "learning_rate": 8.411455976834675e-05, | |
| "loss": 8.3828, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.028142376899723553, | |
| "grad_norm": 8.893899917602539, | |
| "learning_rate": 8.438602841371821e-05, | |
| "loss": 8.3853, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.02823286685760048, | |
| "grad_norm": 8.206876754760742, | |
| "learning_rate": 8.465749705908967e-05, | |
| "loss": 8.3686, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.028323356815477402, | |
| "grad_norm": 6.771660327911377, | |
| "learning_rate": 8.492896570446114e-05, | |
| "loss": 8.3699, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.028413846773354328, | |
| "grad_norm": 8.602880477905273, | |
| "learning_rate": 8.52004343498326e-05, | |
| "loss": 8.3388, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.02850433673123125, | |
| "grad_norm": 12.602445602416992, | |
| "learning_rate": 8.547190299520405e-05, | |
| "loss": 8.3127, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.028594826689108177, | |
| "grad_norm": 6.581843852996826, | |
| "learning_rate": 8.57433716405755e-05, | |
| "loss": 8.3345, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.0286853166469851, | |
| "grad_norm": 11.11732292175293, | |
| "learning_rate": 8.601484028594696e-05, | |
| "loss": 8.3442, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.028775806604862026, | |
| "grad_norm": 7.795157432556152, | |
| "learning_rate": 8.628630893131843e-05, | |
| "loss": 8.3477, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.02886629656273895, | |
| "grad_norm": 7.013496398925781, | |
| "learning_rate": 8.655777757668989e-05, | |
| "loss": 8.3444, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.028956786520615875, | |
| "grad_norm": 7.039948463439941, | |
| "learning_rate": 8.682924622206135e-05, | |
| "loss": 8.3242, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.029047276478492798, | |
| "grad_norm": 9.261716842651367, | |
| "learning_rate": 8.710071486743282e-05, | |
| "loss": 8.3209, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.029137766436369724, | |
| "grad_norm": 7.255875587463379, | |
| "learning_rate": 8.737218351280428e-05, | |
| "loss": 8.304, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.029228256394246647, | |
| "grad_norm": 7.955538749694824, | |
| "learning_rate": 8.764365215817573e-05, | |
| "loss": 8.2953, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.029318746352123573, | |
| "grad_norm": 9.364811897277832, | |
| "learning_rate": 8.791512080354718e-05, | |
| "loss": 8.2936, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.0294092363100005, | |
| "grad_norm": 9.385396957397461, | |
| "learning_rate": 8.818658944891864e-05, | |
| "loss": 8.3276, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.029499726267877422, | |
| "grad_norm": 8.448295593261719, | |
| "learning_rate": 8.84580580942901e-05, | |
| "loss": 8.2975, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.02959021622575435, | |
| "grad_norm": 9.282604217529297, | |
| "learning_rate": 8.872952673966157e-05, | |
| "loss": 8.3217, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.02968070618363127, | |
| "grad_norm": 7.898446559906006, | |
| "learning_rate": 8.900099538503303e-05, | |
| "loss": 8.3006, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 0.029771196141508197, | |
| "grad_norm": 9.186493873596191, | |
| "learning_rate": 8.927246403040448e-05, | |
| "loss": 8.2981, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 0.02986168609938512, | |
| "grad_norm": 9.346575736999512, | |
| "learning_rate": 8.954393267577595e-05, | |
| "loss": 8.2883, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.029952176057262046, | |
| "grad_norm": 6.458785057067871, | |
| "learning_rate": 8.981540132114741e-05, | |
| "loss": 8.2966, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 0.03004266601513897, | |
| "grad_norm": 8.704976081848145, | |
| "learning_rate": 9.008686996651886e-05, | |
| "loss": 8.2986, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 0.030133155973015895, | |
| "grad_norm": 7.744259357452393, | |
| "learning_rate": 9.035833861189032e-05, | |
| "loss": 8.2868, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 0.030223645930892818, | |
| "grad_norm": 8.345844268798828, | |
| "learning_rate": 9.062980725726179e-05, | |
| "loss": 8.2931, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 0.030314135888769744, | |
| "grad_norm": 7.604759216308594, | |
| "learning_rate": 9.090127590263323e-05, | |
| "loss": 8.2847, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.030404625846646667, | |
| "grad_norm": 10.3920259475708, | |
| "learning_rate": 9.11727445480047e-05, | |
| "loss": 8.273, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 0.030495115804523593, | |
| "grad_norm": 7.095389366149902, | |
| "learning_rate": 9.144421319337616e-05, | |
| "loss": 8.2768, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 0.030585605762400516, | |
| "grad_norm": 7.211811542510986, | |
| "learning_rate": 9.171568183874762e-05, | |
| "loss": 8.2918, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 0.030676095720277442, | |
| "grad_norm": 8.639713287353516, | |
| "learning_rate": 9.198715048411909e-05, | |
| "loss": 8.2845, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 0.03076658567815437, | |
| "grad_norm": 7.687414169311523, | |
| "learning_rate": 9.225861912949055e-05, | |
| "loss": 8.2992, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.03085707563603129, | |
| "grad_norm": 8.479426383972168, | |
| "learning_rate": 9.2530087774862e-05, | |
| "loss": 8.2848, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 0.030947565593908218, | |
| "grad_norm": 8.185149192810059, | |
| "learning_rate": 9.280155642023345e-05, | |
| "loss": 8.3037, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 0.03103805555178514, | |
| "grad_norm": 8.295937538146973, | |
| "learning_rate": 9.307302506560491e-05, | |
| "loss": 8.3179, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 0.031128545509662067, | |
| "grad_norm": 10.772727012634277, | |
| "learning_rate": 9.334449371097638e-05, | |
| "loss": 8.264, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 0.03121903546753899, | |
| "grad_norm": 8.465076446533203, | |
| "learning_rate": 9.361596235634784e-05, | |
| "loss": 8.2303, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.031309525425415916, | |
| "grad_norm": 9.096773147583008, | |
| "learning_rate": 9.38874310017193e-05, | |
| "loss": 8.2473, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 0.03140001538329284, | |
| "grad_norm": 10.57555866241455, | |
| "learning_rate": 9.415889964709077e-05, | |
| "loss": 8.27, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 0.03149050534116976, | |
| "grad_norm": 7.5089850425720215, | |
| "learning_rate": 9.443036829246222e-05, | |
| "loss": 8.27, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 0.03158099529904669, | |
| "grad_norm": 10.865699768066406, | |
| "learning_rate": 9.470183693783368e-05, | |
| "loss": 8.2451, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 0.031671485256923614, | |
| "grad_norm": 12.514881134033203, | |
| "learning_rate": 9.497330558320513e-05, | |
| "loss": 8.259, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.031761975214800536, | |
| "grad_norm": 9.914373397827148, | |
| "learning_rate": 9.524477422857659e-05, | |
| "loss": 8.2727, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 0.03185246517267746, | |
| "grad_norm": 7.3313984870910645, | |
| "learning_rate": 9.551624287394806e-05, | |
| "loss": 8.2421, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 0.03194295513055439, | |
| "grad_norm": 5.989616394042969, | |
| "learning_rate": 9.578771151931952e-05, | |
| "loss": 8.2363, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 0.03203344508843131, | |
| "grad_norm": 7.4773430824279785, | |
| "learning_rate": 9.605918016469098e-05, | |
| "loss": 8.2718, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 0.032123935046308234, | |
| "grad_norm": 6.605820655822754, | |
| "learning_rate": 9.633064881006243e-05, | |
| "loss": 8.257, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.03221442500418516, | |
| "grad_norm": 8.294914245605469, | |
| "learning_rate": 9.658854402316532e-05, | |
| "loss": 8.2478, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 0.03230491496206209, | |
| "grad_norm": 10.011855125427246, | |
| "learning_rate": 9.686001266853678e-05, | |
| "loss": 8.2525, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 0.03239540491993901, | |
| "grad_norm": 7.529365062713623, | |
| "learning_rate": 9.713148131390823e-05, | |
| "loss": 8.2728, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 0.03248589487781593, | |
| "grad_norm": 8.781538009643555, | |
| "learning_rate": 9.74029499592797e-05, | |
| "loss": 8.2305, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 0.03257638483569286, | |
| "grad_norm": 12.758204460144043, | |
| "learning_rate": 9.767441860465116e-05, | |
| "loss": 8.2382, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.032666874793569785, | |
| "grad_norm": 10.523704528808594, | |
| "learning_rate": 9.794588725002262e-05, | |
| "loss": 8.2364, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 0.03275736475144671, | |
| "grad_norm": 6.50457239151001, | |
| "learning_rate": 9.821735589539409e-05, | |
| "loss": 8.2384, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 0.03284785470932363, | |
| "grad_norm": 9.191271781921387, | |
| "learning_rate": 9.848882454076555e-05, | |
| "loss": 8.2148, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 0.03293834466720056, | |
| "grad_norm": 8.93270206451416, | |
| "learning_rate": 9.8760293186137e-05, | |
| "loss": 8.2352, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 0.03302883462507748, | |
| "grad_norm": 9.895100593566895, | |
| "learning_rate": 9.903176183150845e-05, | |
| "loss": 8.2376, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.033119324582954406, | |
| "grad_norm": 10.420171737670898, | |
| "learning_rate": 9.930323047687991e-05, | |
| "loss": 8.2479, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 0.03320981454083133, | |
| "grad_norm": 9.649170875549316, | |
| "learning_rate": 9.957469912225138e-05, | |
| "loss": 8.2557, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 0.03330030449870826, | |
| "grad_norm": 7.854948043823242, | |
| "learning_rate": 9.984616776762284e-05, | |
| "loss": 8.2145, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 0.03339079445658518, | |
| "grad_norm": 8.486404418945312, | |
| "learning_rate": 0.0001001176364129943, | |
| "loss": 8.2132, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.033481284414462104, | |
| "grad_norm": 11.286945343017578, | |
| "learning_rate": 0.00010038910505836577, | |
| "loss": 8.2169, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.033571774372339026, | |
| "grad_norm": 6.662302494049072, | |
| "learning_rate": 0.00010066057370373721, | |
| "loss": 8.2318, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.033662264330215956, | |
| "grad_norm": 10.467026710510254, | |
| "learning_rate": 0.00010093204234910868, | |
| "loss": 8.2089, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.03375275428809288, | |
| "grad_norm": 12.113288879394531, | |
| "learning_rate": 0.00010120351099448013, | |
| "loss": 8.2194, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.0338432442459698, | |
| "grad_norm": 13.295260429382324, | |
| "learning_rate": 0.00010147497963985159, | |
| "loss": 8.2526, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.03393373420384673, | |
| "grad_norm": 9.79587173461914, | |
| "learning_rate": 0.00010174644828522305, | |
| "loss": 8.2253, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.034024224161723654, | |
| "grad_norm": 10.251439094543457, | |
| "learning_rate": 0.00010201791693059452, | |
| "loss": 8.2248, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.03411471411960058, | |
| "grad_norm": 10.583033561706543, | |
| "learning_rate": 0.00010228938557596597, | |
| "loss": 8.211, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.0342052040774775, | |
| "grad_norm": 10.661384582519531, | |
| "learning_rate": 0.00010256085422133743, | |
| "loss": 8.2053, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 0.03429569403535443, | |
| "grad_norm": 8.133881568908691, | |
| "learning_rate": 0.0001028323228667089, | |
| "loss": 8.1948, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 0.03438618399323135, | |
| "grad_norm": 9.278162002563477, | |
| "learning_rate": 0.00010310379151208036, | |
| "loss": 8.2235, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.034476673951108275, | |
| "grad_norm": 10.354171752929688, | |
| "learning_rate": 0.00010337526015745181, | |
| "loss": 8.1704, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 0.0345671639089852, | |
| "grad_norm": 9.4600830078125, | |
| "learning_rate": 0.00010364672880282327, | |
| "loss": 8.2008, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 0.03465765386686213, | |
| "grad_norm": 10.290422439575195, | |
| "learning_rate": 0.00010391819744819473, | |
| "loss": 8.2084, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 0.03474814382473905, | |
| "grad_norm": 9.98493480682373, | |
| "learning_rate": 0.00010418966609356618, | |
| "loss": 8.1878, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.03483863378261597, | |
| "grad_norm": 8.021723747253418, | |
| "learning_rate": 0.00010446113473893765, | |
| "loss": 8.1865, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.034929123740492896, | |
| "grad_norm": 6.915677070617676, | |
| "learning_rate": 0.00010473260338430911, | |
| "loss": 8.1795, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 0.035019613698369825, | |
| "grad_norm": 9.64877986907959, | |
| "learning_rate": 0.00010500407202968057, | |
| "loss": 8.1756, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 0.03511010365624675, | |
| "grad_norm": 9.673460960388184, | |
| "learning_rate": 0.00010527554067505204, | |
| "loss": 8.1877, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 0.03520059361412367, | |
| "grad_norm": 10.429800033569336, | |
| "learning_rate": 0.0001055470093204235, | |
| "loss": 8.1803, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 0.0352910835720006, | |
| "grad_norm": 9.610269546508789, | |
| "learning_rate": 0.00010581847796579494, | |
| "loss": 8.214, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.03538157352987752, | |
| "grad_norm": 9.696439743041992, | |
| "learning_rate": 0.0001060899466111664, | |
| "loss": 8.1585, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 0.035472063487754446, | |
| "grad_norm": 10.302108764648438, | |
| "learning_rate": 0.00010636141525653786, | |
| "loss": 8.1495, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 0.03556255344563137, | |
| "grad_norm": 10.439906120300293, | |
| "learning_rate": 0.00010663288390190933, | |
| "loss": 8.1636, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 0.0356530434035083, | |
| "grad_norm": 13.941293716430664, | |
| "learning_rate": 0.00010690435254728079, | |
| "loss": 8.1674, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 0.03574353336138522, | |
| "grad_norm": 11.378789901733398, | |
| "learning_rate": 0.00010717582119265225, | |
| "loss": 8.1704, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.035834023319262144, | |
| "grad_norm": 10.802684783935547, | |
| "learning_rate": 0.00010744728983802372, | |
| "loss": 8.1902, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 0.03592451327713907, | |
| "grad_norm": 13.995284080505371, | |
| "learning_rate": 0.00010771875848339517, | |
| "loss": 8.1502, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 0.036015003235016, | |
| "grad_norm": 11.473008155822754, | |
| "learning_rate": 0.00010799022712876663, | |
| "loss": 8.2082, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 0.03610549319289292, | |
| "grad_norm": 9.314510345458984, | |
| "learning_rate": 0.00010826169577413808, | |
| "loss": 8.19, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 0.03619598315076984, | |
| "grad_norm": 11.141118049621582, | |
| "learning_rate": 0.00010853316441950954, | |
| "loss": 8.2093, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.03619598315076984, | |
| "eval_accuracy": 0.11013720949528932, | |
| "eval_loss": 8.173333168029785, | |
| "eval_runtime": 219.4541, | |
| "eval_samples_per_second": 2769.782, | |
| "eval_steps_per_second": 10.822, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.036286473108646765, | |
| "grad_norm": 12.62540054321289, | |
| "learning_rate": 0.000108804633064881, | |
| "loss": 8.1561, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 0.036376963066523695, | |
| "grad_norm": 12.97541332244873, | |
| "learning_rate": 0.00010907610171025247, | |
| "loss": 8.1708, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 0.03646745302440062, | |
| "grad_norm": 8.305766105651855, | |
| "learning_rate": 0.00010934757035562392, | |
| "loss": 8.1671, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 0.03655794298227754, | |
| "grad_norm": 14.076859474182129, | |
| "learning_rate": 0.00010961903900099538, | |
| "loss": 8.1659, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 0.03664843294015447, | |
| "grad_norm": 11.951278686523438, | |
| "learning_rate": 0.00010989050764636684, | |
| "loss": 8.1893, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.03673892289803139, | |
| "grad_norm": 10.796624183654785, | |
| "learning_rate": 0.00011016197629173831, | |
| "loss": 8.1942, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 0.036829412855908315, | |
| "grad_norm": 10.49177074432373, | |
| "learning_rate": 0.00011043344493710976, | |
| "loss": 8.1589, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 0.03691990281378524, | |
| "grad_norm": 12.82060432434082, | |
| "learning_rate": 0.00011070491358248122, | |
| "loss": 8.1957, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 0.03701039277166217, | |
| "grad_norm": 11.00941276550293, | |
| "learning_rate": 0.00011097638222785267, | |
| "loss": 8.1609, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 0.03710088272953909, | |
| "grad_norm": 10.24111270904541, | |
| "learning_rate": 0.00011124785087322413, | |
| "loss": 8.1769, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.03719137268741601, | |
| "grad_norm": 11.292909622192383, | |
| "learning_rate": 0.0001115193195185956, | |
| "loss": 8.1628, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 0.037281862645292936, | |
| "grad_norm": 9.362674713134766, | |
| "learning_rate": 0.00011179078816396706, | |
| "loss": 8.1638, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 0.037372352603169866, | |
| "grad_norm": 12.9249906539917, | |
| "learning_rate": 0.00011206225680933852, | |
| "loss": 8.1957, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 0.03746284256104679, | |
| "grad_norm": 10.386489868164062, | |
| "learning_rate": 0.00011233372545470999, | |
| "loss": 8.1525, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 0.03755333251892371, | |
| "grad_norm": 12.65300464630127, | |
| "learning_rate": 0.00011260519410008144, | |
| "loss": 8.1558, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.037643822476800634, | |
| "grad_norm": 11.562602996826172, | |
| "learning_rate": 0.0001128766627454529, | |
| "loss": 8.148, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 0.037734312434677564, | |
| "grad_norm": 14.783183097839355, | |
| "learning_rate": 0.00011314813139082436, | |
| "loss": 8.1448, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 0.03782480239255449, | |
| "grad_norm": 15.469168663024902, | |
| "learning_rate": 0.00011341960003619583, | |
| "loss": 8.1801, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 0.03791529235043141, | |
| "grad_norm": 11.361299514770508, | |
| "learning_rate": 0.00011369106868156726, | |
| "loss": 8.1549, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 0.03800578230830833, | |
| "grad_norm": 9.814708709716797, | |
| "learning_rate": 0.00011396253732693873, | |
| "loss": 8.1663, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.03809627226618526, | |
| "grad_norm": 10.522832870483398, | |
| "learning_rate": 0.00011423400597231019, | |
| "loss": 8.1459, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 0.038186762224062185, | |
| "grad_norm": 10.637961387634277, | |
| "learning_rate": 0.00011450547461768165, | |
| "loss": 8.1554, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 0.03827725218193911, | |
| "grad_norm": 14.578750610351562, | |
| "learning_rate": 0.00011477694326305312, | |
| "loss": 8.1758, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 0.03836774213981604, | |
| "grad_norm": 12.179791450500488, | |
| "learning_rate": 0.00011504841190842457, | |
| "loss": 8.1117, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 0.03845823209769296, | |
| "grad_norm": 11.189960479736328, | |
| "learning_rate": 0.00011531988055379603, | |
| "loss": 8.1517, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.03854872205556988, | |
| "grad_norm": 11.662614822387695, | |
| "learning_rate": 0.00011559134919916749, | |
| "loss": 8.129, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 0.038639212013446805, | |
| "grad_norm": 9.089029312133789, | |
| "learning_rate": 0.00011584924441227038, | |
| "loss": 8.1452, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 0.038729701971323735, | |
| "grad_norm": 15.1500825881958, | |
| "learning_rate": 0.00011612071305764184, | |
| "loss": 8.1623, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 0.03882019192920066, | |
| "grad_norm": 15.177955627441406, | |
| "learning_rate": 0.0001163921817030133, | |
| "loss": 8.1138, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 0.03891068188707758, | |
| "grad_norm": 9.620798110961914, | |
| "learning_rate": 0.00011666365034838476, | |
| "loss": 8.1472, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.0390011718449545, | |
| "grad_norm": 13.227412223815918, | |
| "learning_rate": 0.00011693511899375622, | |
| "loss": 8.1436, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 0.03909166180283143, | |
| "grad_norm": 12.561627388000488, | |
| "learning_rate": 0.00011720658763912768, | |
| "loss": 8.1478, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 0.039182151760708356, | |
| "grad_norm": 12.864951133728027, | |
| "learning_rate": 0.00011747805628449915, | |
| "loss": 8.1727, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 0.03927264171858528, | |
| "grad_norm": 12.883962631225586, | |
| "learning_rate": 0.00011774952492987061, | |
| "loss": 8.1396, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 0.0393631316764622, | |
| "grad_norm": 7.435621738433838, | |
| "learning_rate": 0.00011802099357524204, | |
| "loss": 8.1774, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.03945362163433913, | |
| "grad_norm": 12.7384672164917, | |
| "learning_rate": 0.00011829246222061351, | |
| "loss": 8.1297, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 0.039544111592216054, | |
| "grad_norm": 14.0343017578125, | |
| "learning_rate": 0.00011856393086598497, | |
| "loss": 8.1406, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 0.03963460155009298, | |
| "grad_norm": 15.325870513916016, | |
| "learning_rate": 0.00011883539951135643, | |
| "loss": 8.1619, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 0.039725091507969906, | |
| "grad_norm": 21.650548934936523, | |
| "learning_rate": 0.00011910686815672788, | |
| "loss": 8.193, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 0.03981558146584683, | |
| "grad_norm": 15.605712890625, | |
| "learning_rate": 0.00011937833680209935, | |
| "loss": 8.1709, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.03990607142372375, | |
| "grad_norm": 10.788895606994629, | |
| "learning_rate": 0.00011964980544747081, | |
| "loss": 8.1451, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 0.039996561381600675, | |
| "grad_norm": 16.377477645874023, | |
| "learning_rate": 0.00011992127409284227, | |
| "loss": 8.134, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 0.040087051339477604, | |
| "grad_norm": 13.106194496154785, | |
| "learning_rate": 0.00012019274273821374, | |
| "loss": 8.1352, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 0.04017754129735453, | |
| "grad_norm": 11.152835845947266, | |
| "learning_rate": 0.0001204642113835852, | |
| "loss": 8.1138, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 0.04026803125523145, | |
| "grad_norm": 9.210712432861328, | |
| "learning_rate": 0.00012073568002895666, | |
| "loss": 8.1769, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.04035852121310837, | |
| "grad_norm": 12.555234909057617, | |
| "learning_rate": 0.00012100714867432813, | |
| "loss": 8.1383, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 0.0404490111709853, | |
| "grad_norm": 12.013688087463379, | |
| "learning_rate": 0.00012127861731969958, | |
| "loss": 8.1564, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 0.040539501128862225, | |
| "grad_norm": 9.827411651611328, | |
| "learning_rate": 0.00012155008596507101, | |
| "loss": 8.1348, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 0.04062999108673915, | |
| "grad_norm": 11.609356880187988, | |
| "learning_rate": 0.00012182155461044248, | |
| "loss": 8.1646, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 0.04072048104461607, | |
| "grad_norm": 13.045088768005371, | |
| "learning_rate": 0.00012209302325581395, | |
| "loss": 8.1628, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.040810971002493, | |
| "grad_norm": 12.780691146850586, | |
| "learning_rate": 0.00012236449190118542, | |
| "loss": 8.1487, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 0.04090146096036992, | |
| "grad_norm": 10.65334701538086, | |
| "learning_rate": 0.00012263596054655685, | |
| "loss": 8.1275, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 0.040991950918246846, | |
| "grad_norm": 8.080134391784668, | |
| "learning_rate": 0.00012290742919192832, | |
| "loss": 8.1356, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 0.041082440876123776, | |
| "grad_norm": 12.708916664123535, | |
| "learning_rate": 0.00012317889783729978, | |
| "loss": 8.1606, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 0.0411729308340007, | |
| "grad_norm": 13.570298194885254, | |
| "learning_rate": 0.00012345036648267124, | |
| "loss": 8.1389, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.04126342079187762, | |
| "grad_norm": 13.237983703613281, | |
| "learning_rate": 0.0001237218351280427, | |
| "loss": 8.1243, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 0.041353910749754544, | |
| "grad_norm": 14.53023910522461, | |
| "learning_rate": 0.00012399330377341417, | |
| "loss": 8.1191, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 0.041444400707631474, | |
| "grad_norm": 11.765192031860352, | |
| "learning_rate": 0.00012426477241878563, | |
| "loss": 8.1031, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 0.041534890665508396, | |
| "grad_norm": 11.261069297790527, | |
| "learning_rate": 0.0001245362410641571, | |
| "loss": 8.1504, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 0.04162538062338532, | |
| "grad_norm": 13.039865493774414, | |
| "learning_rate": 0.00012480770970952856, | |
| "loss": 8.1186, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.04171587058126224, | |
| "grad_norm": 11.21242904663086, | |
| "learning_rate": 0.0001250791783549, | |
| "loss": 8.1244, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 0.04180636053913917, | |
| "grad_norm": 13.84521770477295, | |
| "learning_rate": 0.00012535064700027146, | |
| "loss": 8.1442, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 0.041896850497016094, | |
| "grad_norm": 14.333518981933594, | |
| "learning_rate": 0.00012562211564564292, | |
| "loss": 8.1628, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 0.04198734045489302, | |
| "grad_norm": 12.016851425170898, | |
| "learning_rate": 0.00012589358429101438, | |
| "loss": 8.1037, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 0.04207783041276994, | |
| "grad_norm": 9.183259010314941, | |
| "learning_rate": 0.00012616505293638585, | |
| "loss": 8.1429, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.04216832037064687, | |
| "grad_norm": 13.651033401489258, | |
| "learning_rate": 0.0001264365215817573, | |
| "loss": 8.1202, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 0.04225881032852379, | |
| "grad_norm": 11.869391441345215, | |
| "learning_rate": 0.00012670799022712877, | |
| "loss": 8.1125, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 0.042349300286400715, | |
| "grad_norm": 15.943286895751953, | |
| "learning_rate": 0.00012697945887250024, | |
| "loss": 8.1694, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 0.04243979024427764, | |
| "grad_norm": 13.450387001037598, | |
| "learning_rate": 0.00012725092751787167, | |
| "loss": 8.1379, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 0.04253028020215457, | |
| "grad_norm": 15.152196884155273, | |
| "learning_rate": 0.00012752239616324314, | |
| "loss": 8.1391, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.04262077016003149, | |
| "grad_norm": 15.109274864196777, | |
| "learning_rate": 0.0001277938648086146, | |
| "loss": 8.0963, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 0.04271126011790841, | |
| "grad_norm": 10.3173189163208, | |
| "learning_rate": 0.00012806533345398606, | |
| "loss": 8.1557, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 0.04280175007578534, | |
| "grad_norm": 11.38595962524414, | |
| "learning_rate": 0.00012833680209935753, | |
| "loss": 8.173, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 0.042892240033662266, | |
| "grad_norm": 11.458219528198242, | |
| "learning_rate": 0.00012859469731246043, | |
| "loss": 8.2542, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 0.04298272999153919, | |
| "grad_norm": 14.253256797790527, | |
| "learning_rate": 0.00012886616595783186, | |
| "loss": 8.1687, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.04307321994941611, | |
| "grad_norm": 14.074560165405273, | |
| "learning_rate": 0.00012913763460320333, | |
| "loss": 8.1175, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 0.04316370990729304, | |
| "grad_norm": 14.521282196044922, | |
| "learning_rate": 0.00012939552981630623, | |
| "loss": 8.1456, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 0.043254199865169964, | |
| "grad_norm": 12.537208557128906, | |
| "learning_rate": 0.0001296669984616777, | |
| "loss": 8.1432, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 0.043344689823046886, | |
| "grad_norm": 10.885902404785156, | |
| "learning_rate": 0.00012993846710704915, | |
| "loss": 8.1875, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 0.04343517978092381, | |
| "grad_norm": 10.156676292419434, | |
| "learning_rate": 0.0001302099357524206, | |
| "loss": 8.1728, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.04352566973880074, | |
| "grad_norm": 13.31322193145752, | |
| "learning_rate": 0.00013048140439779205, | |
| "loss": 8.1394, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 0.04361615969667766, | |
| "grad_norm": 7.779819488525391, | |
| "learning_rate": 0.0001307528730431635, | |
| "loss": 8.139, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 0.043706649654554584, | |
| "grad_norm": 12.208565711975098, | |
| "learning_rate": 0.00013102434168853495, | |
| "loss": 8.1346, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 0.04379713961243151, | |
| "grad_norm": 11.362008094787598, | |
| "learning_rate": 0.00013129581033390642, | |
| "loss": 8.1419, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 0.04388762957030844, | |
| "grad_norm": 11.86789321899414, | |
| "learning_rate": 0.00013156727897927788, | |
| "loss": 8.1475, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.04397811952818536, | |
| "grad_norm": 14.61185073852539, | |
| "learning_rate": 0.00013183874762464934, | |
| "loss": 8.1582, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 0.04406860948606228, | |
| "grad_norm": 11.60112190246582, | |
| "learning_rate": 0.0001321102162700208, | |
| "loss": 8.1073, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 0.04415909944393921, | |
| "grad_norm": 13.442856788635254, | |
| "learning_rate": 0.00013238168491539227, | |
| "loss": 8.1358, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 0.044249589401816135, | |
| "grad_norm": 11.524395942687988, | |
| "learning_rate": 0.00013265315356076373, | |
| "loss": 8.1083, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 0.04434007935969306, | |
| "grad_norm": 13.528814315795898, | |
| "learning_rate": 0.0001329246222061352, | |
| "loss": 8.1392, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.04443056931756998, | |
| "grad_norm": 18.11868667602539, | |
| "learning_rate": 0.00013319609085150666, | |
| "loss": 8.1784, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 0.04452105927544691, | |
| "grad_norm": 15.858280181884766, | |
| "learning_rate": 0.00013346755949687812, | |
| "loss": 8.1597, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 0.04461154923332383, | |
| "grad_norm": 14.466769218444824, | |
| "learning_rate": 0.00013373902814224956, | |
| "loss": 8.1632, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 0.044702039191200756, | |
| "grad_norm": 11.416616439819336, | |
| "learning_rate": 0.00013401049678762102, | |
| "loss": 8.1681, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 0.04479252914907768, | |
| "grad_norm": 39.87081527709961, | |
| "learning_rate": 0.00013428196543299249, | |
| "loss": 8.1384, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.04488301910695461, | |
| "grad_norm": 11.689374923706055, | |
| "learning_rate": 0.0001345398606460954, | |
| "loss": 8.5619, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 0.04497350906483153, | |
| "grad_norm": 10.53484058380127, | |
| "learning_rate": 0.00013481132929146682, | |
| "loss": 9.1495, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 0.045063999022708454, | |
| "grad_norm": 12.07006549835205, | |
| "learning_rate": 0.00013508279793683829, | |
| "loss": 9.1771, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 0.045154488980585376, | |
| "grad_norm": 9.795348167419434, | |
| "learning_rate": 0.00013535426658220975, | |
| "loss": 9.1545, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 0.045244978938462306, | |
| "grad_norm": 10.068339347839355, | |
| "learning_rate": 0.0001356257352275812, | |
| "loss": 9.1969, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.045244978938462306, | |
| "eval_accuracy": 0.022879129772772476, | |
| "eval_loss": 9.148832321166992, | |
| "eval_runtime": 212.7494, | |
| "eval_samples_per_second": 2857.071, | |
| "eval_steps_per_second": 11.163, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.04533546889633923, | |
| "grad_norm": 12.951713562011719, | |
| "learning_rate": 0.00013589720387295268, | |
| "loss": 9.154, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 0.04542595885421615, | |
| "grad_norm": 9.139362335205078, | |
| "learning_rate": 0.00013616867251832414, | |
| "loss": 9.154, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 0.04551644881209308, | |
| "grad_norm": 8.388337135314941, | |
| "learning_rate": 0.0001364401411636956, | |
| "loss": 9.1391, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 0.045606938769970004, | |
| "grad_norm": 10.0809326171875, | |
| "learning_rate": 0.00013671160980906704, | |
| "loss": 9.1417, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 0.04569742872784693, | |
| "grad_norm": 8.565701484680176, | |
| "learning_rate": 0.0001369830784544385, | |
| "loss": 9.1112, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.04578791868572385, | |
| "grad_norm": 10.437520027160645, | |
| "learning_rate": 0.00013725454709980997, | |
| "loss": 9.1169, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 0.04587840864360078, | |
| "grad_norm": 8.615896224975586, | |
| "learning_rate": 0.00013752601574518143, | |
| "loss": 9.1003, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 0.0459688986014777, | |
| "grad_norm": 10.89583683013916, | |
| "learning_rate": 0.0001377974843905529, | |
| "loss": 9.101, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 0.046059388559354625, | |
| "grad_norm": 9.786931991577148, | |
| "learning_rate": 0.00013806895303592433, | |
| "loss": 9.0689, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 0.04614987851723155, | |
| "grad_norm": 9.010174751281738, | |
| "learning_rate": 0.0001383404216812958, | |
| "loss": 9.0579, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.04624036847510848, | |
| "grad_norm": 11.039669036865234, | |
| "learning_rate": 0.00013861189032666725, | |
| "loss": 9.0865, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 0.0463308584329854, | |
| "grad_norm": 12.055830001831055, | |
| "learning_rate": 0.00013888335897203872, | |
| "loss": 9.0955, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.04642134839086232, | |
| "grad_norm": 8.361885070800781, | |
| "learning_rate": 0.00013915482761741018, | |
| "loss": 9.07, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 0.046511838348739246, | |
| "grad_norm": 7.196146011352539, | |
| "learning_rate": 0.00013942629626278164, | |
| "loss": 9.0528, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 0.046602328306616175, | |
| "grad_norm": 9.67076587677002, | |
| "learning_rate": 0.0001396977649081531, | |
| "loss": 9.0546, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.0466928182644931, | |
| "grad_norm": 10.09327220916748, | |
| "learning_rate": 0.00013996923355352457, | |
| "loss": 9.0741, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 0.04678330822237002, | |
| "grad_norm": 9.639015197753906, | |
| "learning_rate": 0.00014024070219889603, | |
| "loss": 9.0633, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 0.04687379818024695, | |
| "grad_norm": 10.251932144165039, | |
| "learning_rate": 0.0001405121708442675, | |
| "loss": 9.0446, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 0.04696428813812387, | |
| "grad_norm": 11.07875919342041, | |
| "learning_rate": 0.00014078363948963896, | |
| "loss": 9.0418, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 0.047054778096000796, | |
| "grad_norm": 9.328507423400879, | |
| "learning_rate": 0.00014105510813501042, | |
| "loss": 9.0287, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.04714526805387772, | |
| "grad_norm": 7.056753635406494, | |
| "learning_rate": 0.00014132657678038186, | |
| "loss": 9.0362, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 0.04723575801175465, | |
| "grad_norm": 8.899680137634277, | |
| "learning_rate": 0.0001415980454257533, | |
| "loss": 9.036, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 0.04732624796963157, | |
| "grad_norm": 9.175132751464844, | |
| "learning_rate": 0.00014186951407112476, | |
| "loss": 9.0444, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 0.047416737927508494, | |
| "grad_norm": 9.374978065490723, | |
| "learning_rate": 0.00014214098271649622, | |
| "loss": 9.0372, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 0.04750722788538542, | |
| "grad_norm": 9.893750190734863, | |
| "learning_rate": 0.00014241245136186769, | |
| "loss": 9.0424, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.04759771784326235, | |
| "grad_norm": 7.787280082702637, | |
| "learning_rate": 0.00014265677314270202, | |
| "loss": 8.9691, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 0.04768820780113927, | |
| "grad_norm": 17.40734100341797, | |
| "learning_rate": 0.00014277893403311917, | |
| "loss": 8.2225, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 0.04777869775901619, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014286037462673062, | |
| "loss": 6.6046, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 0.047869187716893115, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0001429146683558049, | |
| "loss": 3.0921, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 0.047959677674770045, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014294181522034205, | |
| "loss": 3.9765, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.04805016763264697, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 6.9972, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 0.04814065759052389, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 0.04823114754840081, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 0.04832163750627774, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 0.048412127464154665, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.04850261742203159, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 0.04859310737990852, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 0.04868359733778544, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 0.04877408729566236, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 0.048864577253539286, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.048955067211416216, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 0.04904555716929314, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 0.04913604712717006, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 0.049226537085046984, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 0.049317027042923914, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.04940751700080084, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 0.04949800695867776, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 0.04958849691655468, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 0.04967898687443161, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 0.049769476832308535, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.04985996679018546, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 0.04995045674806239, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 0.05004094670593931, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 0.05013143666381623, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 0.050221926621693155, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.050312416579570085, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 0.05040290653744701, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 0.05049339649532393, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 0.05058388645320085, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 0.05067437641107778, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.050764866368954706, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 0.05085535632683163, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 0.05094584628470855, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 0.05103633624258548, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 0.051126826200462404, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.05121731615833933, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 0.051307806116216256, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 0.05139829607409318, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 0.0514887860319701, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 0.051579275989847025, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.051669765947723954, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 0.05176025590560088, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 0.0518507458634778, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 0.05194123582135472, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 0.05203172577923165, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.052122215737108575, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 0.0522127056949855, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 0.05230319565286242, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 0.05239368561073935, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 0.05248417556861627, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.052574665526493196, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 0.05266515548437012, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 0.05275564544224705, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 0.05284613540012397, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 0.052936625358000894, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.053027115315877824, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 0.053117605273754746, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 0.05320809523163167, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 0.05329858518950859, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 0.05338907514738552, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.053479565105262444, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 0.05357005506313937, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 0.05366054502101629, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 0.05375103497889322, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 0.05384152493677014, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.053932014894647065, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 0.05402250485252399, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 0.05411299481040092, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 0.05420348476827784, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 0.05429397472615476, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.00014298253551714776, | |
| "loss": 0.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.05429397472615476, | |
| "eval_accuracy": 0.021626624590642192, | |
| "eval_loss": NaN, | |
| "eval_runtime": 218.9297, | |
| "eval_samples_per_second": 2776.417, | |
| "eval_steps_per_second": 10.848, | |
| "step": 12000 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 663057, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "total_flos": 4315086323712000.0, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |