| { |
| "best_metric": 3.753229856491089, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_high_2000_495/checkpoint-10000", |
| "epoch": 1.0781671159029649, |
| "eval_steps": 1000, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005390835579514825, |
| "grad_norm": 1.4431322813034058, |
| "learning_rate": 0.000294, |
| "loss": 8.5391, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.01078167115902965, |
| "grad_norm": 3.01206111907959, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.9498, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.016172506738544475, |
| "grad_norm": 2.0440971851348877, |
| "learning_rate": 0.0005996826767404209, |
| "loss": 6.5454, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.0215633423180593, |
| "grad_norm": 1.441593885421753, |
| "learning_rate": 0.0005993588774959525, |
| "loss": 6.29, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.026954177897574125, |
| "grad_norm": 1.1196218729019165, |
| "learning_rate": 0.000599035078251484, |
| "loss": 6.1031, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.03234501347708895, |
| "grad_norm": 1.7114529609680176, |
| "learning_rate": 0.0005987112790070156, |
| "loss": 5.9896, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03773584905660377, |
| "grad_norm": 1.0614324808120728, |
| "learning_rate": 0.0005983874797625472, |
| "loss": 5.8898, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0431266846361186, |
| "grad_norm": 0.9476490616798401, |
| "learning_rate": 0.0005980636805180788, |
| "loss": 5.8227, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.04851752021563342, |
| "grad_norm": 1.198143482208252, |
| "learning_rate": 0.0005977398812736103, |
| "loss": 5.7384, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05390835579514825, |
| "grad_norm": 1.5752428770065308, |
| "learning_rate": 0.0005974160820291419, |
| "loss": 5.6852, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05929919137466307, |
| "grad_norm": 2.0579349994659424, |
| "learning_rate": 0.0005970922827846734, |
| "loss": 5.5695, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.0646900269541779, |
| "grad_norm": 1.3935744762420654, |
| "learning_rate": 0.0005967684835402051, |
| "loss": 5.535, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07008086253369272, |
| "grad_norm": 1.303139090538025, |
| "learning_rate": 0.0005964446842957366, |
| "loss": 5.4307, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07547169811320754, |
| "grad_norm": 0.969357967376709, |
| "learning_rate": 0.0005961208850512682, |
| "loss": 5.4075, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08086253369272237, |
| "grad_norm": 1.0970923900604248, |
| "learning_rate": 0.0005957970858067997, |
| "loss": 5.3339, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.0862533692722372, |
| "grad_norm": 1.444526195526123, |
| "learning_rate": 0.0005954732865623314, |
| "loss": 5.2734, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.09164420485175202, |
| "grad_norm": 1.2398624420166016, |
| "learning_rate": 0.0005951494873178628, |
| "loss": 5.2435, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.09703504043126684, |
| "grad_norm": 1.0231698751449585, |
| "learning_rate": 0.0005948256880733944, |
| "loss": 5.1615, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10242587601078167, |
| "grad_norm": 0.8052211403846741, |
| "learning_rate": 0.000594501888828926, |
| "loss": 5.1565, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.1078167115902965, |
| "grad_norm": 0.9758635759353638, |
| "learning_rate": 0.0005941780895844576, |
| "loss": 5.0973, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1078167115902965, |
| "eval_accuracy": 0.22682387725439804, |
| "eval_loss": 5.0240583419799805, |
| "eval_runtime": 187.7541, |
| "eval_samples_per_second": 95.929, |
| "eval_steps_per_second": 5.997, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11320754716981132, |
| "grad_norm": 1.1241644620895386, |
| "learning_rate": 0.0005938542903399891, |
| "loss": 5.0594, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.11859838274932614, |
| "grad_norm": 1.3577865362167358, |
| "learning_rate": 0.0005935304910955207, |
| "loss": 5.0184, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.12398921832884097, |
| "grad_norm": 1.2023968696594238, |
| "learning_rate": 0.0005932066918510523, |
| "loss": 4.9997, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.1293800539083558, |
| "grad_norm": 1.0888073444366455, |
| "learning_rate": 0.0005928828926065839, |
| "loss": 4.9799, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.1347708894878706, |
| "grad_norm": 1.1558417081832886, |
| "learning_rate": 0.0005925590933621154, |
| "loss": 4.9525, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.14016172506738545, |
| "grad_norm": 0.8390737771987915, |
| "learning_rate": 0.000592235294117647, |
| "loss": 4.8991, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.14555256064690028, |
| "grad_norm": 1.2147760391235352, |
| "learning_rate": 0.0005919114948731785, |
| "loss": 4.8807, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.1509433962264151, |
| "grad_norm": 0.8272018432617188, |
| "learning_rate": 0.0005915876956287102, |
| "loss": 4.8533, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.15633423180592992, |
| "grad_norm": 0.9127699732780457, |
| "learning_rate": 0.0005912638963842416, |
| "loss": 4.8361, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.16172506738544473, |
| "grad_norm": 1.053229808807373, |
| "learning_rate": 0.0005909400971397733, |
| "loss": 4.822, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.16711590296495957, |
| "grad_norm": 0.7915006279945374, |
| "learning_rate": 0.0005906162978953049, |
| "loss": 4.7808, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.1725067385444744, |
| "grad_norm": 1.0942384004592896, |
| "learning_rate": 0.0005902924986508364, |
| "loss": 4.7932, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.1778975741239892, |
| "grad_norm": 1.114819884300232, |
| "learning_rate": 0.000589968699406368, |
| "loss": 4.7716, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.18328840970350405, |
| "grad_norm": 0.8059048652648926, |
| "learning_rate": 0.0005896449001618996, |
| "loss": 4.6935, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.18867924528301888, |
| "grad_norm": 1.0432419776916504, |
| "learning_rate": 0.0005893211009174312, |
| "loss": 4.7057, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.1940700808625337, |
| "grad_norm": 1.0190030336380005, |
| "learning_rate": 0.0005889973016729627, |
| "loss": 4.6937, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.19946091644204852, |
| "grad_norm": 0.9340100884437561, |
| "learning_rate": 0.0005886735024284943, |
| "loss": 4.6527, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.20485175202156333, |
| "grad_norm": 1.0046193599700928, |
| "learning_rate": 0.0005883497031840258, |
| "loss": 4.65, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.21024258760107817, |
| "grad_norm": 1.0945899486541748, |
| "learning_rate": 0.0005880259039395575, |
| "loss": 4.6138, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.215633423180593, |
| "grad_norm": 1.0984909534454346, |
| "learning_rate": 0.000587702104695089, |
| "loss": 4.5905, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.215633423180593, |
| "eval_accuracy": 0.26831504687122604, |
| "eval_loss": 4.5331034660339355, |
| "eval_runtime": 188.7106, |
| "eval_samples_per_second": 95.442, |
| "eval_steps_per_second": 5.967, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2210242587601078, |
| "grad_norm": 0.9245219230651855, |
| "learning_rate": 0.0005873783054506206, |
| "loss": 4.6042, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.22641509433962265, |
| "grad_norm": 1.0915874242782593, |
| "learning_rate": 0.0005870545062061521, |
| "loss": 4.5669, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.23180592991913745, |
| "grad_norm": 1.1114705801010132, |
| "learning_rate": 0.0005867307069616837, |
| "loss": 4.513, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2371967654986523, |
| "grad_norm": 1.0507372617721558, |
| "learning_rate": 0.0005864069077172152, |
| "loss": 4.555, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.24258760107816713, |
| "grad_norm": 0.7661042809486389, |
| "learning_rate": 0.0005860831084727468, |
| "loss": 4.4936, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.24797843665768193, |
| "grad_norm": 0.9031612873077393, |
| "learning_rate": 0.0005857593092282784, |
| "loss": 4.4732, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.25336927223719674, |
| "grad_norm": 0.8872079849243164, |
| "learning_rate": 0.00058543550998381, |
| "loss": 4.468, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.2587601078167116, |
| "grad_norm": 1.007972002029419, |
| "learning_rate": 0.0005851117107393415, |
| "loss": 4.4588, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2641509433962264, |
| "grad_norm": 0.9107605218887329, |
| "learning_rate": 0.0005847879114948731, |
| "loss": 4.4611, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.2695417789757412, |
| "grad_norm": 0.8212679624557495, |
| "learning_rate": 0.0005844641122504048, |
| "loss": 4.4549, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2749326145552561, |
| "grad_norm": 0.8247270584106445, |
| "learning_rate": 0.0005841403130059363, |
| "loss": 4.4192, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.2803234501347709, |
| "grad_norm": 0.8410664796829224, |
| "learning_rate": 0.0005838165137614679, |
| "loss": 4.4058, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.8341009616851807, |
| "learning_rate": 0.0005834927145169994, |
| "loss": 4.4019, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.29110512129380056, |
| "grad_norm": 0.7168475985527039, |
| "learning_rate": 0.000583168915272531, |
| "loss": 4.3961, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.29649595687331537, |
| "grad_norm": 0.9485689997673035, |
| "learning_rate": 0.0005828451160280626, |
| "loss": 4.3549, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.3018867924528302, |
| "grad_norm": 0.7494977116584778, |
| "learning_rate": 0.0005825213167835941, |
| "loss": 4.394, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.30727762803234504, |
| "grad_norm": 0.7997616529464722, |
| "learning_rate": 0.0005821975175391257, |
| "loss": 4.3639, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.31266846361185985, |
| "grad_norm": 0.7571331262588501, |
| "learning_rate": 0.0005818737182946573, |
| "loss": 4.3431, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.31805929919137466, |
| "grad_norm": 0.7700881958007812, |
| "learning_rate": 0.0005815499190501888, |
| "loss": 4.3449, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.32345013477088946, |
| "grad_norm": 0.670320451259613, |
| "learning_rate": 0.0005812261198057204, |
| "loss": 4.3425, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.32345013477088946, |
| "eval_accuracy": 0.2970671000033574, |
| "eval_loss": 4.244440078735352, |
| "eval_runtime": 181.2734, |
| "eval_samples_per_second": 99.358, |
| "eval_steps_per_second": 6.212, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3288409703504043, |
| "grad_norm": 0.7345431447029114, |
| "learning_rate": 0.0005809023205612519, |
| "loss": 4.3171, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.33423180592991913, |
| "grad_norm": 0.7953465580940247, |
| "learning_rate": 0.0005805785213167836, |
| "loss": 4.3147, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.33962264150943394, |
| "grad_norm": 0.7929427027702332, |
| "learning_rate": 0.0005802547220723151, |
| "loss": 4.2801, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.3450134770889488, |
| "grad_norm": 0.8242636322975159, |
| "learning_rate": 0.0005799309228278467, |
| "loss": 4.2861, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3504043126684636, |
| "grad_norm": 0.9226231575012207, |
| "learning_rate": 0.0005796071235833782, |
| "loss": 4.2607, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.3557951482479784, |
| "grad_norm": 0.8058574199676514, |
| "learning_rate": 0.0005792833243389099, |
| "loss": 4.2833, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.3611859838274933, |
| "grad_norm": 0.5928198099136353, |
| "learning_rate": 0.0005789595250944414, |
| "loss": 4.262, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.3665768194070081, |
| "grad_norm": 0.7675554752349854, |
| "learning_rate": 0.000578635725849973, |
| "loss": 4.2542, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3719676549865229, |
| "grad_norm": 0.611778974533081, |
| "learning_rate": 0.0005783119266055045, |
| "loss": 4.2414, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.37735849056603776, |
| "grad_norm": 0.6161898970603943, |
| "learning_rate": 0.0005779881273610361, |
| "loss": 4.2189, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.38274932614555257, |
| "grad_norm": 0.739580512046814, |
| "learning_rate": 0.0005776643281165676, |
| "loss": 4.2229, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.3881401617250674, |
| "grad_norm": 0.8803327679634094, |
| "learning_rate": 0.0005773405288720992, |
| "loss": 4.2268, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3935309973045822, |
| "grad_norm": 1.1461224555969238, |
| "learning_rate": 0.0005770167296276308, |
| "loss": 4.2344, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.39892183288409705, |
| "grad_norm": 0.9922895431518555, |
| "learning_rate": 0.0005766929303831624, |
| "loss": 4.2043, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.40431266846361186, |
| "grad_norm": 0.7285529375076294, |
| "learning_rate": 0.000576369131138694, |
| "loss": 4.2012, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.40970350404312667, |
| "grad_norm": 0.809013307094574, |
| "learning_rate": 0.0005760453318942255, |
| "loss": 4.2178, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.41509433962264153, |
| "grad_norm": 0.8063985109329224, |
| "learning_rate": 0.000575721532649757, |
| "loss": 4.2018, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.42048517520215634, |
| "grad_norm": 0.847189724445343, |
| "learning_rate": 0.0005753977334052887, |
| "loss": 4.1669, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.42587601078167114, |
| "grad_norm": 0.6911249756813049, |
| "learning_rate": 0.0005750739341608203, |
| "loss": 4.1661, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.431266846361186, |
| "grad_norm": 0.7047509551048279, |
| "learning_rate": 0.0005747501349163518, |
| "loss": 4.1711, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.431266846361186, |
| "eval_accuracy": 0.31155715777518433, |
| "eval_loss": 4.099443435668945, |
| "eval_runtime": 181.2897, |
| "eval_samples_per_second": 99.349, |
| "eval_steps_per_second": 6.211, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4366576819407008, |
| "grad_norm": 0.6963608264923096, |
| "learning_rate": 0.0005744263356718834, |
| "loss": 4.1712, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.4420485175202156, |
| "grad_norm": 0.5528955459594727, |
| "learning_rate": 0.000574102536427415, |
| "loss": 4.1704, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.4474393530997305, |
| "grad_norm": 0.6239518523216248, |
| "learning_rate": 0.0005737787371829465, |
| "loss": 4.1571, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.4528301886792453, |
| "grad_norm": 0.6668544411659241, |
| "learning_rate": 0.0005734549379384781, |
| "loss": 4.1498, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.4582210242587601, |
| "grad_norm": 0.6054083704948425, |
| "learning_rate": 0.0005731311386940097, |
| "loss": 4.1415, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.4636118598382749, |
| "grad_norm": 0.6274952292442322, |
| "learning_rate": 0.0005728073394495412, |
| "loss": 4.1351, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.46900269541778977, |
| "grad_norm": 0.6964167952537537, |
| "learning_rate": 0.0005724835402050728, |
| "loss": 4.1408, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.4743935309973046, |
| "grad_norm": 0.7939971685409546, |
| "learning_rate": 0.0005721597409606043, |
| "loss": 4.1304, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.4797843665768194, |
| "grad_norm": 0.7325452566146851, |
| "learning_rate": 0.000571835941716136, |
| "loss": 4.1334, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.48517520215633425, |
| "grad_norm": 0.6969997882843018, |
| "learning_rate": 0.0005715121424716675, |
| "loss": 4.1142, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.49056603773584906, |
| "grad_norm": 0.6884226202964783, |
| "learning_rate": 0.0005711883432271991, |
| "loss": 4.1028, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.49595687331536387, |
| "grad_norm": 0.6321718096733093, |
| "learning_rate": 0.0005708645439827306, |
| "loss": 4.1078, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5013477088948787, |
| "grad_norm": 0.701016366481781, |
| "learning_rate": 0.0005705407447382623, |
| "loss": 4.0883, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.5067385444743935, |
| "grad_norm": 0.7334260940551758, |
| "learning_rate": 0.0005702169454937938, |
| "loss": 4.0958, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5121293800539084, |
| "grad_norm": 0.5910300612449646, |
| "learning_rate": 0.0005698931462493253, |
| "loss": 4.075, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5175202156334232, |
| "grad_norm": 0.6013472080230713, |
| "learning_rate": 0.0005695693470048569, |
| "loss": 4.0922, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.522911051212938, |
| "grad_norm": 0.6121587753295898, |
| "learning_rate": 0.0005692455477603885, |
| "loss": 4.0841, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.5283018867924528, |
| "grad_norm": 0.613993227481842, |
| "learning_rate": 0.00056892174851592, |
| "loss": 4.0931, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5336927223719676, |
| "grad_norm": 0.6811482310295105, |
| "learning_rate": 0.0005685979492714516, |
| "loss": 4.0664, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.5390835579514824, |
| "grad_norm": 0.6156988739967346, |
| "learning_rate": 0.0005682741500269833, |
| "loss": 4.0584, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5390835579514824, |
| "eval_accuracy": 0.3207276788125022, |
| "eval_loss": 3.9989216327667236, |
| "eval_runtime": 181.0844, |
| "eval_samples_per_second": 99.462, |
| "eval_steps_per_second": 6.218, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5444743935309974, |
| "grad_norm": 0.5171638131141663, |
| "learning_rate": 0.0005679503507825148, |
| "loss": 4.0485, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.5498652291105122, |
| "grad_norm": 0.8664311170578003, |
| "learning_rate": 0.0005676265515380464, |
| "loss": 4.0568, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.555256064690027, |
| "grad_norm": 0.5895658731460571, |
| "learning_rate": 0.0005673027522935779, |
| "loss": 4.0612, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.5606469002695418, |
| "grad_norm": 0.7551826238632202, |
| "learning_rate": 0.0005669789530491095, |
| "loss": 4.0396, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5660377358490566, |
| "grad_norm": 0.6080760359764099, |
| "learning_rate": 0.0005666551538046411, |
| "loss": 4.0427, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.6198018193244934, |
| "learning_rate": 0.0005663313545601727, |
| "loss": 4.0237, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5768194070080862, |
| "grad_norm": 0.6986076235771179, |
| "learning_rate": 0.0005660075553157042, |
| "loss": 4.037, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.5822102425876011, |
| "grad_norm": 0.6296870708465576, |
| "learning_rate": 0.0005656837560712358, |
| "loss": 4.0294, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5876010781671159, |
| "grad_norm": 0.7363589406013489, |
| "learning_rate": 0.0005653599568267674, |
| "loss": 4.0366, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.5929919137466307, |
| "grad_norm": 0.5726944804191589, |
| "learning_rate": 0.0005650361575822989, |
| "loss": 4.038, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5983827493261455, |
| "grad_norm": 0.6348768472671509, |
| "learning_rate": 0.0005647123583378305, |
| "loss": 4.0435, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.6037735849056604, |
| "grad_norm": 0.6801629662513733, |
| "learning_rate": 0.0005643885590933621, |
| "loss": 4.0221, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6091644204851752, |
| "grad_norm": 0.5451263785362244, |
| "learning_rate": 0.0005640647598488936, |
| "loss": 4.0095, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.6145552560646901, |
| "grad_norm": 0.6517798900604248, |
| "learning_rate": 0.0005637409606044252, |
| "loss": 4.0171, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.6199460916442049, |
| "grad_norm": 0.5931031107902527, |
| "learning_rate": 0.0005634171613599567, |
| "loss": 4.0271, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.6253369272237197, |
| "grad_norm": 0.6558272838592529, |
| "learning_rate": 0.0005630933621154884, |
| "loss": 4.0187, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6307277628032345, |
| "grad_norm": 0.6062823534011841, |
| "learning_rate": 0.0005627695628710199, |
| "loss": 4.0266, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.6361185983827493, |
| "grad_norm": 0.6019988656044006, |
| "learning_rate": 0.0005624457636265515, |
| "loss": 3.996, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.6415094339622641, |
| "grad_norm": 0.6201125383377075, |
| "learning_rate": 0.000562121964382083, |
| "loss": 3.9898, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.6469002695417789, |
| "grad_norm": 0.6050779819488525, |
| "learning_rate": 0.0005617981651376146, |
| "loss": 4.0004, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6469002695417789, |
| "eval_accuracy": 0.32684505370223305, |
| "eval_loss": 3.9261674880981445, |
| "eval_runtime": 185.9413, |
| "eval_samples_per_second": 96.864, |
| "eval_steps_per_second": 6.056, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6522911051212938, |
| "grad_norm": 0.6865056753158569, |
| "learning_rate": 0.0005614743658931463, |
| "loss": 3.9731, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.6576819407008087, |
| "grad_norm": 0.7325416803359985, |
| "learning_rate": 0.0005611505666486777, |
| "loss": 4.0069, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.6630727762803235, |
| "grad_norm": 0.5644854307174683, |
| "learning_rate": 0.0005608267674042094, |
| "loss": 3.9862, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.6684636118598383, |
| "grad_norm": 0.6596018075942993, |
| "learning_rate": 0.0005605029681597409, |
| "loss": 3.9661, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.6738544474393531, |
| "grad_norm": 0.6351631879806519, |
| "learning_rate": 0.0005601791689152725, |
| "loss": 3.9823, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.6792452830188679, |
| "grad_norm": 0.7349021434783936, |
| "learning_rate": 0.000559855369670804, |
| "loss": 4.0002, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6846361185983828, |
| "grad_norm": 0.6522649526596069, |
| "learning_rate": 0.0005595315704263357, |
| "loss": 3.9623, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.6900269541778976, |
| "grad_norm": 0.5307708978652954, |
| "learning_rate": 0.0005592077711818672, |
| "loss": 3.9669, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6954177897574124, |
| "grad_norm": 0.5935479998588562, |
| "learning_rate": 0.0005588839719373988, |
| "loss": 3.9545, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.7008086253369272, |
| "grad_norm": 0.6537756323814392, |
| "learning_rate": 0.0005585601726929303, |
| "loss": 3.9651, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.706199460916442, |
| "grad_norm": 0.5890859961509705, |
| "learning_rate": 0.0005582363734484619, |
| "loss": 3.9888, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.7115902964959568, |
| "grad_norm": 0.6212690472602844, |
| "learning_rate": 0.0005579125742039935, |
| "loss": 3.9448, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.7169811320754716, |
| "grad_norm": 0.6124947667121887, |
| "learning_rate": 0.0005575887749595251, |
| "loss": 3.9736, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.7223719676549866, |
| "grad_norm": 0.5879746675491333, |
| "learning_rate": 0.0005572649757150566, |
| "loss": 3.9494, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.7277628032345014, |
| "grad_norm": 0.5893728733062744, |
| "learning_rate": 0.0005569411764705882, |
| "loss": 3.9548, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.7331536388140162, |
| "grad_norm": 0.626844584941864, |
| "learning_rate": 0.0005566173772261198, |
| "loss": 3.9442, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.738544474393531, |
| "grad_norm": 0.6960020065307617, |
| "learning_rate": 0.0005562935779816513, |
| "loss": 3.9258, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.7439353099730458, |
| "grad_norm": 0.6511778235435486, |
| "learning_rate": 0.0005559697787371828, |
| "loss": 3.9527, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.7493261455525606, |
| "grad_norm": 0.6318475604057312, |
| "learning_rate": 0.0005556459794927145, |
| "loss": 3.9468, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 0.7971281409263611, |
| "learning_rate": 0.000555322180248246, |
| "loss": 3.9359, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "eval_accuracy": 0.3327894531945633, |
| "eval_loss": 3.869997501373291, |
| "eval_runtime": 181.2103, |
| "eval_samples_per_second": 99.393, |
| "eval_steps_per_second": 6.214, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7601078167115903, |
| "grad_norm": 0.6572127342224121, |
| "learning_rate": 0.0005549983810037776, |
| "loss": 3.9277, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.7654986522911051, |
| "grad_norm": 0.5682153105735779, |
| "learning_rate": 0.0005546745817593091, |
| "loss": 3.9367, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.77088948787062, |
| "grad_norm": 0.5704119801521301, |
| "learning_rate": 0.0005543507825148408, |
| "loss": 3.9349, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.7762803234501348, |
| "grad_norm": 0.6218463182449341, |
| "learning_rate": 0.0005540269832703723, |
| "loss": 3.9385, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.7816711590296496, |
| "grad_norm": 0.5849258303642273, |
| "learning_rate": 0.0005537031840259039, |
| "loss": 3.8932, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.7870619946091644, |
| "grad_norm": 0.5835541486740112, |
| "learning_rate": 0.0005533793847814354, |
| "loss": 3.9344, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7924528301886793, |
| "grad_norm": 0.5649335980415344, |
| "learning_rate": 0.000553055585536967, |
| "loss": 3.9192, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.7978436657681941, |
| "grad_norm": 0.6361489295959473, |
| "learning_rate": 0.0005527317862924987, |
| "loss": 3.9193, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.8032345013477089, |
| "grad_norm": 0.5219013094902039, |
| "learning_rate": 0.0005524079870480301, |
| "loss": 3.9079, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.8086253369272237, |
| "grad_norm": 0.6409909129142761, |
| "learning_rate": 0.0005520841878035618, |
| "loss": 3.8963, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8140161725067385, |
| "grad_norm": 0.6128087043762207, |
| "learning_rate": 0.0005517603885590933, |
| "loss": 3.9236, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.8194070080862533, |
| "grad_norm": 0.6087100505828857, |
| "learning_rate": 0.0005514365893146249, |
| "loss": 3.9237, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.8247978436657682, |
| "grad_norm": 0.6042186617851257, |
| "learning_rate": 0.0005511127900701564, |
| "loss": 3.8995, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.8301886792452831, |
| "grad_norm": 0.566831111907959, |
| "learning_rate": 0.000550788990825688, |
| "loss": 3.905, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.8355795148247979, |
| "grad_norm": 0.544748842716217, |
| "learning_rate": 0.0005504651915812196, |
| "loss": 3.8854, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.8409703504043127, |
| "grad_norm": 0.5742127299308777, |
| "learning_rate": 0.0005501413923367512, |
| "loss": 3.9136, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.8463611859838275, |
| "grad_norm": 0.6579509973526001, |
| "learning_rate": 0.0005498175930922827, |
| "loss": 3.8913, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.8517520215633423, |
| "grad_norm": 0.5239347219467163, |
| "learning_rate": 0.0005494937938478143, |
| "loss": 3.8935, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.8571428571428571, |
| "grad_norm": 0.6472986340522766, |
| "learning_rate": 0.0005491699946033459, |
| "loss": 3.8914, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.862533692722372, |
| "grad_norm": 0.5899927616119385, |
| "learning_rate": 0.0005488461953588775, |
| "loss": 3.8808, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.862533692722372, |
| "eval_accuracy": 0.33687349794173405, |
| "eval_loss": 3.8257834911346436, |
| "eval_runtime": 181.2863, |
| "eval_samples_per_second": 99.351, |
| "eval_steps_per_second": 6.211, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8679245283018868, |
| "grad_norm": 0.5935076475143433, |
| "learning_rate": 0.000548522396114409, |
| "loss": 3.8878, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.8733153638814016, |
| "grad_norm": 0.6126903891563416, |
| "learning_rate": 0.0005481985968699406, |
| "loss": 3.873, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.8787061994609164, |
| "grad_norm": 0.5333702564239502, |
| "learning_rate": 0.0005478747976254721, |
| "loss": 3.8833, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.8840970350404312, |
| "grad_norm": 0.542793869972229, |
| "learning_rate": 0.0005475509983810037, |
| "loss": 3.874, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.889487870619946, |
| "grad_norm": 0.5891146659851074, |
| "learning_rate": 0.0005472271991365352, |
| "loss": 3.8604, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.894878706199461, |
| "grad_norm": 0.6112461090087891, |
| "learning_rate": 0.0005469033998920669, |
| "loss": 3.8849, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.9002695417789758, |
| "grad_norm": 0.5972184538841248, |
| "learning_rate": 0.0005465796006475984, |
| "loss": 3.8678, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.9056603773584906, |
| "grad_norm": 0.685249388217926, |
| "learning_rate": 0.00054625580140313, |
| "loss": 3.8825, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.9110512129380054, |
| "grad_norm": 0.6276729702949524, |
| "learning_rate": 0.0005459320021586615, |
| "loss": 3.8911, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.9164420485175202, |
| "grad_norm": 0.5620297193527222, |
| "learning_rate": 0.0005456082029141932, |
| "loss": 3.8825, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.921832884097035, |
| "grad_norm": 0.5803284049034119, |
| "learning_rate": 0.0005452844036697248, |
| "loss": 3.8637, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.9272237196765498, |
| "grad_norm": 0.5740498900413513, |
| "learning_rate": 0.0005449606044252563, |
| "loss": 3.8559, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.9326145552560647, |
| "grad_norm": 0.5694971680641174, |
| "learning_rate": 0.0005446368051807879, |
| "loss": 3.8485, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.9380053908355795, |
| "grad_norm": 0.555069625377655, |
| "learning_rate": 0.0005443130059363194, |
| "loss": 3.857, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.9433962264150944, |
| "grad_norm": 0.5356409549713135, |
| "learning_rate": 0.0005439892066918511, |
| "loss": 3.8539, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.9487870619946092, |
| "grad_norm": 0.5425722002983093, |
| "learning_rate": 0.0005436654074473825, |
| "loss": 3.8465, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.954177897574124, |
| "grad_norm": 0.5436581373214722, |
| "learning_rate": 0.0005433416082029142, |
| "loss": 3.8369, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.9595687331536388, |
| "grad_norm": 0.5613462328910828, |
| "learning_rate": 0.0005430178089584457, |
| "loss": 3.8548, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.9649595687331537, |
| "grad_norm": 0.5960737466812134, |
| "learning_rate": 0.0005426940097139773, |
| "loss": 3.8608, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.9703504043126685, |
| "grad_norm": 0.5757009387016296, |
| "learning_rate": 0.0005423702104695088, |
| "loss": 3.8477, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9703504043126685, |
| "eval_accuracy": 0.340650489627941, |
| "eval_loss": 3.7848594188690186, |
| "eval_runtime": 181.5098, |
| "eval_samples_per_second": 99.229, |
| "eval_steps_per_second": 6.204, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9757412398921833, |
| "grad_norm": 0.5870667695999146, |
| "learning_rate": 0.0005420464112250404, |
| "loss": 3.8409, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.9811320754716981, |
| "grad_norm": 0.6312296986579895, |
| "learning_rate": 0.000541722611980572, |
| "loss": 3.8558, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.9865229110512129, |
| "grad_norm": 0.5529806613922119, |
| "learning_rate": 0.0005413988127361036, |
| "loss": 3.8301, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.9919137466307277, |
| "grad_norm": 0.5621755123138428, |
| "learning_rate": 0.0005410750134916351, |
| "loss": 3.8279, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.9973045822102425, |
| "grad_norm": 0.564895510673523, |
| "learning_rate": 0.0005407512142471667, |
| "loss": 3.8412, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.0026954177897573, |
| "grad_norm": 0.5690886974334717, |
| "learning_rate": 0.0005404274150026983, |
| "loss": 3.8157, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.0080862533692723, |
| "grad_norm": 0.544973611831665, |
| "learning_rate": 0.0005401100917431192, |
| "loss": 3.7794, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.013477088948787, |
| "grad_norm": 0.61162269115448, |
| "learning_rate": 0.0005397862924986508, |
| "loss": 3.774, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.0188679245283019, |
| "grad_norm": 0.6406456828117371, |
| "learning_rate": 0.0005394624932541824, |
| "loss": 3.773, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.0242587601078168, |
| "grad_norm": 0.6477643847465515, |
| "learning_rate": 0.0005391386940097139, |
| "loss": 3.7741, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.0296495956873315, |
| "grad_norm": 0.5913816094398499, |
| "learning_rate": 0.0005388148947652455, |
| "loss": 3.7717, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.0350404312668464, |
| "grad_norm": 0.6406410932540894, |
| "learning_rate": 0.000538491095520777, |
| "loss": 3.76, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.0404312668463611, |
| "grad_norm": 0.6126238107681274, |
| "learning_rate": 0.0005381672962763086, |
| "loss": 3.801, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.045822102425876, |
| "grad_norm": 0.5817605257034302, |
| "learning_rate": 0.0005378434970318403, |
| "loss": 3.7861, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.0512129380053907, |
| "grad_norm": 0.5051379203796387, |
| "learning_rate": 0.0005375196977873718, |
| "loss": 3.7861, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.0566037735849056, |
| "grad_norm": 0.5685185194015503, |
| "learning_rate": 0.0005371958985429034, |
| "loss": 3.7939, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.0619946091644206, |
| "grad_norm": 0.561697244644165, |
| "learning_rate": 0.0005368720992984349, |
| "loss": 3.788, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.0673854447439353, |
| "grad_norm": 0.5285218954086304, |
| "learning_rate": 0.0005365483000539665, |
| "loss": 3.7792, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.0727762803234502, |
| "grad_norm": 0.6429543495178223, |
| "learning_rate": 0.0005362245008094981, |
| "loss": 3.7742, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.0781671159029649, |
| "grad_norm": 0.6265733242034912, |
| "learning_rate": 0.0005359007015650297, |
| "loss": 3.7666, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.0781671159029649, |
| "eval_accuracy": 0.34411564752612045, |
| "eval_loss": 3.753229856491089, |
| "eval_runtime": 181.6252, |
| "eval_samples_per_second": 99.166, |
| "eval_steps_per_second": 6.2, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 92750, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.36069179392e+16, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|