| { | |
| "best_global_step": 50000, | |
| "best_metric": 0.5460931658744812, | |
| "best_model_checkpoint": "checkpoints/mla-o_baseline/checkpoint-50000", | |
| "epoch": 7.105300554213443, | |
| "eval_steps": 2000, | |
| "global_step": 50000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0071053005542134435, | |
| "grad_norm": 0.3552660346031189, | |
| "learning_rate": 4.9e-06, | |
| "loss": 10.3029, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.014210601108426887, | |
| "grad_norm": 0.5037913918495178, | |
| "learning_rate": 9.900000000000002e-06, | |
| "loss": 10.2361, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02131590166264033, | |
| "grad_norm": 1.8683347702026367, | |
| "learning_rate": 1.49e-05, | |
| "loss": 9.9763, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.028421202216853774, | |
| "grad_norm": 4.1887102127075195, | |
| "learning_rate": 1.9900000000000003e-05, | |
| "loss": 8.7258, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03552650277106722, | |
| "grad_norm": 1.4753533601760864, | |
| "learning_rate": 2.49e-05, | |
| "loss": 7.5735, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04263180332528066, | |
| "grad_norm": 1.4821197986602783, | |
| "learning_rate": 2.9900000000000002e-05, | |
| "loss": 7.3076, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0497371038794941, | |
| "grad_norm": 1.4782558679580688, | |
| "learning_rate": 3.49e-05, | |
| "loss": 7.2083, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.05684240443370755, | |
| "grad_norm": 1.6048325300216675, | |
| "learning_rate": 3.99e-05, | |
| "loss": 7.1503, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06394770498792099, | |
| "grad_norm": 1.4144378900527954, | |
| "learning_rate": 4.49e-05, | |
| "loss": 7.1062, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.07105300554213444, | |
| "grad_norm": 1.399983525276184, | |
| "learning_rate": 4.99e-05, | |
| "loss": 7.1068, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07815830609634787, | |
| "grad_norm": 1.180097222328186, | |
| "learning_rate": 5.49e-05, | |
| "loss": 7.0583, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.08526360665056132, | |
| "grad_norm": 1.253781795501709, | |
| "learning_rate": 5.9900000000000006e-05, | |
| "loss": 7.04, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09236890720477477, | |
| "grad_norm": 1.1676445007324219, | |
| "learning_rate": 6.49e-05, | |
| "loss": 7.0169, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0994742077589882, | |
| "grad_norm": 1.308653473854065, | |
| "learning_rate": 6.99e-05, | |
| "loss": 6.94, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.10657950831320165, | |
| "grad_norm": 1.7658427953720093, | |
| "learning_rate": 7.489999999999999e-05, | |
| "loss": 6.899, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.1136848088674151, | |
| "grad_norm": 1.4796422719955444, | |
| "learning_rate": 7.99e-05, | |
| "loss": 6.8586, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12079010942162853, | |
| "grad_norm": 1.3457406759262085, | |
| "learning_rate": 8.49e-05, | |
| "loss": 6.8178, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.12789540997584198, | |
| "grad_norm": 1.6725692749023438, | |
| "learning_rate": 8.989999999999999e-05, | |
| "loss": 6.7367, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1350007105300554, | |
| "grad_norm": 2.1698403358459473, | |
| "learning_rate": 9.49e-05, | |
| "loss": 6.7096, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.14210601108426887, | |
| "grad_norm": 1.6723932027816772, | |
| "learning_rate": 9.99e-05, | |
| "loss": 6.6399, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1492113116384823, | |
| "grad_norm": 1.8007622957229614, | |
| "learning_rate": 0.0001049, | |
| "loss": 6.597, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.15631661219269574, | |
| "grad_norm": 2.0344436168670654, | |
| "learning_rate": 0.0001099, | |
| "loss": 6.5591, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.1634219127469092, | |
| "grad_norm": 2.2933220863342285, | |
| "learning_rate": 0.0001149, | |
| "loss": 6.503, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.17052721330112264, | |
| "grad_norm": 2.4448184967041016, | |
| "learning_rate": 0.00011990000000000001, | |
| "loss": 6.4439, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.17763251385533607, | |
| "grad_norm": 2.009326219558716, | |
| "learning_rate": 0.0001249, | |
| "loss": 6.4007, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.18473781440954953, | |
| "grad_norm": 3.5166053771972656, | |
| "learning_rate": 0.00012989999999999999, | |
| "loss": 6.3217, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.19184311496376297, | |
| "grad_norm": 2.2721781730651855, | |
| "learning_rate": 0.0001349, | |
| "loss": 6.2818, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.1989484155179764, | |
| "grad_norm": 2.715651035308838, | |
| "learning_rate": 0.0001399, | |
| "loss": 6.192, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.20605371607218986, | |
| "grad_norm": 2.155087471008301, | |
| "learning_rate": 0.0001449, | |
| "loss": 6.1615, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.2131590166264033, | |
| "grad_norm": 2.541161060333252, | |
| "learning_rate": 0.0001499, | |
| "loss": 6.0676, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.22026431718061673, | |
| "grad_norm": 2.9829161167144775, | |
| "learning_rate": 0.00015490000000000002, | |
| "loss": 6.0268, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.2273696177348302, | |
| "grad_norm": 3.020001173019409, | |
| "learning_rate": 0.00015989999999999998, | |
| "loss": 5.9497, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.23447491828904363, | |
| "grad_norm": 2.6197469234466553, | |
| "learning_rate": 0.0001649, | |
| "loss": 5.888, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.24158021884325706, | |
| "grad_norm": 2.939798355102539, | |
| "learning_rate": 0.0001699, | |
| "loss": 5.8187, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.24868551939747052, | |
| "grad_norm": 4.240734100341797, | |
| "learning_rate": 0.0001749, | |
| "loss": 5.7487, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.25579081995168396, | |
| "grad_norm": 2.2773475646972656, | |
| "learning_rate": 0.0001799, | |
| "loss": 5.6821, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2628961205058974, | |
| "grad_norm": 4.022791862487793, | |
| "learning_rate": 0.00018490000000000002, | |
| "loss": 5.6257, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2700014210601108, | |
| "grad_norm": 2.742016315460205, | |
| "learning_rate": 0.0001899, | |
| "loss": 5.5804, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.27710672161432426, | |
| "grad_norm": 2.627027988433838, | |
| "learning_rate": 0.0001949, | |
| "loss": 5.5132, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.28421202216853775, | |
| "grad_norm": 2.689333200454712, | |
| "learning_rate": 0.0001999, | |
| "loss": 5.4796, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.28421202216853775, | |
| "eval_accuracy": 0.27806881070137024, | |
| "eval_loss": 5.384032726287842, | |
| "eval_runtime": 1.4127, | |
| "eval_samples_per_second": 2661.587, | |
| "eval_steps_per_second": 41.764, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2913173227227512, | |
| "grad_norm": 2.620975971221924, | |
| "learning_rate": 0.0002049, | |
| "loss": 5.442, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.2984226232769646, | |
| "grad_norm": 3.2960896492004395, | |
| "learning_rate": 0.0002099, | |
| "loss": 5.3711, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.30552792383117805, | |
| "grad_norm": 2.6958446502685547, | |
| "learning_rate": 0.00021490000000000002, | |
| "loss": 5.3401, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3126332243853915, | |
| "grad_norm": 2.4613704681396484, | |
| "learning_rate": 0.0002199, | |
| "loss": 5.317, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3197385249396049, | |
| "grad_norm": 2.736056089401245, | |
| "learning_rate": 0.0002249, | |
| "loss": 5.2626, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.3268438254938184, | |
| "grad_norm": 3.220209836959839, | |
| "learning_rate": 0.0002299, | |
| "loss": 5.2429, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.33394912604803184, | |
| "grad_norm": 2.666049003601074, | |
| "learning_rate": 0.0002349, | |
| "loss": 5.2345, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3410544266022453, | |
| "grad_norm": 2.9080348014831543, | |
| "learning_rate": 0.0002399, | |
| "loss": 5.2253, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.3481597271564587, | |
| "grad_norm": 2.500917673110962, | |
| "learning_rate": 0.0002449, | |
| "loss": 5.1528, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.35526502771067214, | |
| "grad_norm": 2.4509730339050293, | |
| "learning_rate": 0.0002499, | |
| "loss": 5.1371, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.3623703282648856, | |
| "grad_norm": 2.1525373458862305, | |
| "learning_rate": 0.0002549, | |
| "loss": 5.1283, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.36947562881909907, | |
| "grad_norm": 2.3644602298736572, | |
| "learning_rate": 0.00025990000000000003, | |
| "loss": 5.1132, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.3765809293733125, | |
| "grad_norm": 2.4550833702087402, | |
| "learning_rate": 0.00026490000000000004, | |
| "loss": 5.0745, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.38368622992752593, | |
| "grad_norm": 2.2757322788238525, | |
| "learning_rate": 0.0002699, | |
| "loss": 5.0384, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.39079153048173937, | |
| "grad_norm": 2.3816654682159424, | |
| "learning_rate": 0.00027489999999999996, | |
| "loss": 5.0265, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.3978968310359528, | |
| "grad_norm": 2.689765453338623, | |
| "learning_rate": 0.0002799, | |
| "loss": 5.0075, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.40500213159016624, | |
| "grad_norm": 2.3210747241973877, | |
| "learning_rate": 0.0002849, | |
| "loss": 4.9886, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.4121074321443797, | |
| "grad_norm": 2.2462069988250732, | |
| "learning_rate": 0.0002899, | |
| "loss": 4.9609, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.41921273269859316, | |
| "grad_norm": 2.407829999923706, | |
| "learning_rate": 0.0002949, | |
| "loss": 4.9633, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.4263180332528066, | |
| "grad_norm": 2.6128649711608887, | |
| "learning_rate": 0.0002999, | |
| "loss": 4.9396, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.43342333380702003, | |
| "grad_norm": 2.2061820030212402, | |
| "learning_rate": 0.0003049, | |
| "loss": 4.928, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.44052863436123346, | |
| "grad_norm": 2.3088581562042236, | |
| "learning_rate": 0.0003099, | |
| "loss": 4.9005, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.4476339349154469, | |
| "grad_norm": 2.3053336143493652, | |
| "learning_rate": 0.0003149, | |
| "loss": 4.8898, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.4547392354696604, | |
| "grad_norm": 2.061445951461792, | |
| "learning_rate": 0.0003199, | |
| "loss": 4.849, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4618445360238738, | |
| "grad_norm": 2.8992912769317627, | |
| "learning_rate": 0.00032490000000000004, | |
| "loss": 4.8438, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.46894983657808725, | |
| "grad_norm": 2.3059287071228027, | |
| "learning_rate": 0.00032990000000000005, | |
| "loss": 4.7987, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.4760551371323007, | |
| "grad_norm": 2.248922824859619, | |
| "learning_rate": 0.0003349, | |
| "loss": 4.7949, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.4831604376865141, | |
| "grad_norm": 2.3223118782043457, | |
| "learning_rate": 0.00033989999999999997, | |
| "loss": 4.7798, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.49026573824072756, | |
| "grad_norm": 2.4109292030334473, | |
| "learning_rate": 0.0003449, | |
| "loss": 4.7713, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.49737103879494104, | |
| "grad_norm": 2.629187822341919, | |
| "learning_rate": 0.0003499, | |
| "loss": 4.7844, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5044763393491545, | |
| "grad_norm": 2.707880735397339, | |
| "learning_rate": 0.0003549, | |
| "loss": 4.7225, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5115816399033679, | |
| "grad_norm": 1.9972128868103027, | |
| "learning_rate": 0.0003599, | |
| "loss": 4.7385, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5186869404575813, | |
| "grad_norm": 2.0002362728118896, | |
| "learning_rate": 0.00036490000000000003, | |
| "loss": 4.7187, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5257922410117948, | |
| "grad_norm": 2.079793691635132, | |
| "learning_rate": 0.0003699, | |
| "loss": 4.6875, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5328975415660082, | |
| "grad_norm": 2.2726001739501953, | |
| "learning_rate": 0.0003749, | |
| "loss": 4.6636, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.5400028421202216, | |
| "grad_norm": 2.250825881958008, | |
| "learning_rate": 0.0003799, | |
| "loss": 4.6815, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.5471081426744351, | |
| "grad_norm": 2.145050287246704, | |
| "learning_rate": 0.00038490000000000003, | |
| "loss": 4.669, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5542134432286485, | |
| "grad_norm": 2.17869234085083, | |
| "learning_rate": 0.00038990000000000004, | |
| "loss": 4.651, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5613187437828621, | |
| "grad_norm": 1.988489031791687, | |
| "learning_rate": 0.0003949, | |
| "loss": 4.6282, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.5684240443370755, | |
| "grad_norm": 2.583235025405884, | |
| "learning_rate": 0.00039989999999999996, | |
| "loss": 4.6166, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5684240443370755, | |
| "eval_accuracy": 0.3410300612449646, | |
| "eval_loss": 4.496020317077637, | |
| "eval_runtime": 1.2758, | |
| "eval_samples_per_second": 2947.074, | |
| "eval_steps_per_second": 46.244, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5755293448912889, | |
| "grad_norm": 2.1131043434143066, | |
| "learning_rate": 0.0004049, | |
| "loss": 4.6117, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.5826346454455024, | |
| "grad_norm": 2.024203062057495, | |
| "learning_rate": 0.0004099, | |
| "loss": 4.58, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.5897399459997158, | |
| "grad_norm": 2.1560163497924805, | |
| "learning_rate": 0.0004149, | |
| "loss": 4.5912, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.5968452465539292, | |
| "grad_norm": 1.8401826620101929, | |
| "learning_rate": 0.0004199, | |
| "loss": 4.5721, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6039505471081427, | |
| "grad_norm": 1.9240000247955322, | |
| "learning_rate": 0.00042490000000000003, | |
| "loss": 4.5552, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6110558476623561, | |
| "grad_norm": 1.9883395433425903, | |
| "learning_rate": 0.0004299, | |
| "loss": 4.5461, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6181611482165695, | |
| "grad_norm": 1.7731436491012573, | |
| "learning_rate": 0.0004349, | |
| "loss": 4.5235, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.625266448770783, | |
| "grad_norm": 2.082683801651001, | |
| "learning_rate": 0.0004399, | |
| "loss": 4.4835, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6323717493249964, | |
| "grad_norm": 1.7258926630020142, | |
| "learning_rate": 0.0004449, | |
| "loss": 4.4901, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.6394770498792098, | |
| "grad_norm": 1.8295708894729614, | |
| "learning_rate": 0.00044990000000000004, | |
| "loss": 4.4861, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6465823504334234, | |
| "grad_norm": 1.955956220626831, | |
| "learning_rate": 0.00045490000000000005, | |
| "loss": 4.4275, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.6536876509876368, | |
| "grad_norm": 2.007366180419922, | |
| "learning_rate": 0.0004599, | |
| "loss": 4.4404, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.6607929515418502, | |
| "grad_norm": 1.8152107000350952, | |
| "learning_rate": 0.00046489999999999997, | |
| "loss": 4.4501, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.6678982520960637, | |
| "grad_norm": 1.5465844869613647, | |
| "learning_rate": 0.0004699, | |
| "loss": 4.4577, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.6750035526502771, | |
| "grad_norm": 1.9919116497039795, | |
| "learning_rate": 0.0004749, | |
| "loss": 4.4107, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.6821088532044906, | |
| "grad_norm": 1.665770173072815, | |
| "learning_rate": 0.0004799, | |
| "loss": 4.3893, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.689214153758704, | |
| "grad_norm": 1.6241706609725952, | |
| "learning_rate": 0.0004849, | |
| "loss": 4.3998, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.6963194543129174, | |
| "grad_norm": 1.9763983488082886, | |
| "learning_rate": 0.0004899, | |
| "loss": 4.3905, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7034247548671309, | |
| "grad_norm": 1.7900429964065552, | |
| "learning_rate": 0.0004949, | |
| "loss": 4.3574, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.7105300554213443, | |
| "grad_norm": 1.7338964939117432, | |
| "learning_rate": 0.0004999000000000001, | |
| "loss": 4.3495, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7176353559755577, | |
| "grad_norm": 1.7506004571914673, | |
| "learning_rate": 0.0004994555555555555, | |
| "loss": 4.3368, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.7247406565297712, | |
| "grad_norm": 1.6576265096664429, | |
| "learning_rate": 0.0004989, | |
| "loss": 4.3262, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.7318459570839847, | |
| "grad_norm": 1.6786417961120605, | |
| "learning_rate": 0.0004983444444444444, | |
| "loss": 4.3262, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.7389512576381981, | |
| "grad_norm": 1.7846481800079346, | |
| "learning_rate": 0.0004977888888888889, | |
| "loss": 4.3184, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.7460565581924116, | |
| "grad_norm": 1.50885009765625, | |
| "learning_rate": 0.0004972333333333334, | |
| "loss": 4.2821, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.753161858746625, | |
| "grad_norm": 1.887153148651123, | |
| "learning_rate": 0.0004966777777777778, | |
| "loss": 4.2808, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.7602671593008384, | |
| "grad_norm": 1.6502448320388794, | |
| "learning_rate": 0.0004961222222222223, | |
| "loss": 4.2995, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.7673724598550519, | |
| "grad_norm": 1.6177388429641724, | |
| "learning_rate": 0.0004955666666666667, | |
| "loss": 4.2695, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.7744777604092653, | |
| "grad_norm": 1.56751549243927, | |
| "learning_rate": 0.0004950111111111112, | |
| "loss": 4.2314, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.7815830609634787, | |
| "grad_norm": 1.7374509572982788, | |
| "learning_rate": 0.0004944555555555555, | |
| "loss": 4.2265, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.7886883615176922, | |
| "grad_norm": 1.7130149602890015, | |
| "learning_rate": 0.0004939, | |
| "loss": 4.2313, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.7957936620719056, | |
| "grad_norm": 1.669311285018921, | |
| "learning_rate": 0.0004933444444444444, | |
| "loss": 4.21, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.802898962626119, | |
| "grad_norm": 1.6400610208511353, | |
| "learning_rate": 0.0004927888888888889, | |
| "loss": 4.1947, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.8100042631803325, | |
| "grad_norm": 1.6835277080535889, | |
| "learning_rate": 0.0004922333333333334, | |
| "loss": 4.1966, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.817109563734546, | |
| "grad_norm": 1.7618376016616821, | |
| "learning_rate": 0.0004916777777777778, | |
| "loss": 4.1575, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.8242148642887595, | |
| "grad_norm": 1.6249600648880005, | |
| "learning_rate": 0.0004911222222222223, | |
| "loss": 4.1816, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.8313201648429729, | |
| "grad_norm": 1.5585474967956543, | |
| "learning_rate": 0.0004905666666666666, | |
| "loss": 4.1563, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.8384254653971863, | |
| "grad_norm": 1.5878101587295532, | |
| "learning_rate": 0.0004900111111111111, | |
| "loss": 4.1433, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.8455307659513998, | |
| "grad_norm": 1.7434005737304688, | |
| "learning_rate": 0.0004894555555555555, | |
| "loss": 4.1249, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.8526360665056132, | |
| "grad_norm": 1.6475976705551147, | |
| "learning_rate": 0.0004889, | |
| "loss": 4.1271, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8526360665056132, | |
| "eval_accuracy": 0.3815571367740631, | |
| "eval_loss": 4.0217814445495605, | |
| "eval_runtime": 1.347, | |
| "eval_samples_per_second": 2791.439, | |
| "eval_steps_per_second": 43.802, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8597413670598266, | |
| "grad_norm": 1.5479772090911865, | |
| "learning_rate": 0.0004883444444444445, | |
| "loss": 4.123, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.8668466676140401, | |
| "grad_norm": 1.6345648765563965, | |
| "learning_rate": 0.0004877888888888889, | |
| "loss": 4.0898, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.8739519681682535, | |
| "grad_norm": 1.6345279216766357, | |
| "learning_rate": 0.0004872333333333334, | |
| "loss": 4.1015, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.8810572687224669, | |
| "grad_norm": 1.691853642463684, | |
| "learning_rate": 0.00048667777777777776, | |
| "loss": 4.0686, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.8881625692766804, | |
| "grad_norm": 1.5769050121307373, | |
| "learning_rate": 0.00048612222222222225, | |
| "loss": 4.0704, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.8952678698308938, | |
| "grad_norm": 1.539277195930481, | |
| "learning_rate": 0.00048556666666666663, | |
| "loss": 4.0777, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9023731703851073, | |
| "grad_norm": 1.5936651229858398, | |
| "learning_rate": 0.0004850111111111111, | |
| "loss": 4.0539, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.9094784709393208, | |
| "grad_norm": 1.5352519750595093, | |
| "learning_rate": 0.00048445555555555556, | |
| "loss": 4.0572, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.9165837714935342, | |
| "grad_norm": 1.5943176746368408, | |
| "learning_rate": 0.0004839, | |
| "loss": 4.0325, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.9236890720477476, | |
| "grad_norm": 1.6698144674301147, | |
| "learning_rate": 0.0004833444444444445, | |
| "loss": 4.021, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9307943726019611, | |
| "grad_norm": 1.4587650299072266, | |
| "learning_rate": 0.00048278888888888887, | |
| "loss": 4.0297, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.9378996731561745, | |
| "grad_norm": 1.6008667945861816, | |
| "learning_rate": 0.00048223333333333336, | |
| "loss": 4.0161, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.9450049737103879, | |
| "grad_norm": 1.550215721130371, | |
| "learning_rate": 0.00048167777777777775, | |
| "loss": 3.974, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.9521102742646014, | |
| "grad_norm": 1.6898516416549683, | |
| "learning_rate": 0.00048112222222222224, | |
| "loss": 3.992, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.9592155748188148, | |
| "grad_norm": 1.4572670459747314, | |
| "learning_rate": 0.0004805666666666667, | |
| "loss": 3.9695, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.9663208753730282, | |
| "grad_norm": 1.482657790184021, | |
| "learning_rate": 0.0004800111111111111, | |
| "loss": 3.9757, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.9734261759272417, | |
| "grad_norm": 1.404305100440979, | |
| "learning_rate": 0.0004794555555555556, | |
| "loss": 3.9483, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.9805314764814551, | |
| "grad_norm": 1.5593451261520386, | |
| "learning_rate": 0.0004789, | |
| "loss": 3.9546, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.9876367770356687, | |
| "grad_norm": 1.6054081916809082, | |
| "learning_rate": 0.0004783444444444445, | |
| "loss": 3.9455, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.9947420775898821, | |
| "grad_norm": 1.6005113124847412, | |
| "learning_rate": 0.00047778888888888886, | |
| "loss": 3.9228, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0018473781440955, | |
| "grad_norm": 1.5673545598983765, | |
| "learning_rate": 0.00047723333333333335, | |
| "loss": 3.9479, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.008952678698309, | |
| "grad_norm": 1.5405232906341553, | |
| "learning_rate": 0.0004766777777777778, | |
| "loss": 3.9094, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.0160579792525224, | |
| "grad_norm": 1.550139307975769, | |
| "learning_rate": 0.0004761222222222222, | |
| "loss": 3.9012, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.0231632798067358, | |
| "grad_norm": 1.513722538948059, | |
| "learning_rate": 0.0004755666666666667, | |
| "loss": 3.9027, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.0302685803609493, | |
| "grad_norm": 1.5611164569854736, | |
| "learning_rate": 0.0004750111111111111, | |
| "loss": 3.889, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.0373738809151627, | |
| "grad_norm": 1.5630569458007812, | |
| "learning_rate": 0.0004744555555555556, | |
| "loss": 3.8921, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.0444791814693761, | |
| "grad_norm": 1.5469523668289185, | |
| "learning_rate": 0.00047389999999999997, | |
| "loss": 3.8684, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.0515844820235896, | |
| "grad_norm": 1.4361380338668823, | |
| "learning_rate": 0.00047334444444444446, | |
| "loss": 3.8806, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.058689782577803, | |
| "grad_norm": 1.5062557458877563, | |
| "learning_rate": 0.0004727888888888889, | |
| "loss": 3.8982, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.0657950831320164, | |
| "grad_norm": 1.5810432434082031, | |
| "learning_rate": 0.00047223333333333334, | |
| "loss": 3.8627, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.0729003836862299, | |
| "grad_norm": 1.4955435991287231, | |
| "learning_rate": 0.0004716777777777778, | |
| "loss": 3.8626, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.0800056842404433, | |
| "grad_norm": 1.4136838912963867, | |
| "learning_rate": 0.0004711222222222222, | |
| "loss": 3.8536, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.0871109847946567, | |
| "grad_norm": 1.5749014616012573, | |
| "learning_rate": 0.0004705666666666667, | |
| "loss": 3.8523, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.0942162853488702, | |
| "grad_norm": 1.5885382890701294, | |
| "learning_rate": 0.0004700111111111111, | |
| "loss": 3.8116, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.1013215859030836, | |
| "grad_norm": 1.5716185569763184, | |
| "learning_rate": 0.0004694555555555556, | |
| "loss": 3.8193, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.108426886457297, | |
| "grad_norm": 1.4737333059310913, | |
| "learning_rate": 0.0004689, | |
| "loss": 3.8319, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.1155321870115107, | |
| "grad_norm": 1.5206536054611206, | |
| "learning_rate": 0.00046834444444444445, | |
| "loss": 3.7899, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.1226374875657241, | |
| "grad_norm": 1.584385633468628, | |
| "learning_rate": 0.0004677888888888889, | |
| "loss": 3.8117, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.1297427881199376, | |
| "grad_norm": 1.614679217338562, | |
| "learning_rate": 0.0004672333333333333, | |
| "loss": 3.8008, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.136848088674151, | |
| "grad_norm": 1.7981069087982178, | |
| "learning_rate": 0.0004666777777777778, | |
| "loss": 3.8065, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.136848088674151, | |
| "eval_accuracy": 0.4039747714996338, | |
| "eval_loss": 3.722554922103882, | |
| "eval_runtime": 1.2852, | |
| "eval_samples_per_second": 2925.727, | |
| "eval_steps_per_second": 45.909, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.1439533892283644, | |
| "grad_norm": 1.3700244426727295, | |
| "learning_rate": 0.0004661222222222222, | |
| "loss": 3.7843, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.1510586897825779, | |
| "grad_norm": 1.6801719665527344, | |
| "learning_rate": 0.0004655666666666667, | |
| "loss": 3.7919, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.1581639903367913, | |
| "grad_norm": 1.5943037271499634, | |
| "learning_rate": 0.0004650111111111111, | |
| "loss": 3.782, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.1652692908910047, | |
| "grad_norm": 1.434883713722229, | |
| "learning_rate": 0.00046445555555555556, | |
| "loss": 3.7404, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.1723745914452182, | |
| "grad_norm": 1.5571914911270142, | |
| "learning_rate": 0.0004639, | |
| "loss": 3.7607, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.1794798919994316, | |
| "grad_norm": 1.4568759202957153, | |
| "learning_rate": 0.00046334444444444444, | |
| "loss": 3.7598, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.186585192553645, | |
| "grad_norm": 1.5035616159439087, | |
| "learning_rate": 0.00046278888888888893, | |
| "loss": 3.74, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.1936904931078585, | |
| "grad_norm": 1.396409273147583, | |
| "learning_rate": 0.00046223333333333337, | |
| "loss": 3.7324, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.200795793662072, | |
| "grad_norm": 1.4661195278167725, | |
| "learning_rate": 0.0004616777777777778, | |
| "loss": 3.7349, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.2079010942162853, | |
| "grad_norm": 1.4614042043685913, | |
| "learning_rate": 0.00046112222222222224, | |
| "loss": 3.7451, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.2150063947704988, | |
| "grad_norm": 1.4750593900680542, | |
| "learning_rate": 0.0004605666666666667, | |
| "loss": 3.7319, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.2221116953247122, | |
| "grad_norm": 1.4696282148361206, | |
| "learning_rate": 0.0004600111111111111, | |
| "loss": 3.7226, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.2292169958789256, | |
| "grad_norm": 1.4502874612808228, | |
| "learning_rate": 0.00045945555555555555, | |
| "loss": 3.7188, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.236322296433139, | |
| "grad_norm": 1.5446569919586182, | |
| "learning_rate": 0.0004589, | |
| "loss": 3.7264, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.2434275969873525, | |
| "grad_norm": 1.7836312055587769, | |
| "learning_rate": 0.0004583444444444445, | |
| "loss": 3.7129, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.250532897541566, | |
| "grad_norm": 1.430017113685608, | |
| "learning_rate": 0.0004577888888888889, | |
| "loss": 3.7156, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.2576381980957794, | |
| "grad_norm": 1.4015753269195557, | |
| "learning_rate": 0.00045723333333333335, | |
| "loss": 3.7096, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.264743498649993, | |
| "grad_norm": 1.4344234466552734, | |
| "learning_rate": 0.0004566777777777778, | |
| "loss": 3.6902, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.2718487992042062, | |
| "grad_norm": 1.450966715812683, | |
| "learning_rate": 0.0004561222222222222, | |
| "loss": 3.6944, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.27895409975842, | |
| "grad_norm": 1.482603669166565, | |
| "learning_rate": 0.00045556666666666666, | |
| "loss": 3.6716, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.286059400312633, | |
| "grad_norm": 1.5204460620880127, | |
| "learning_rate": 0.0004550111111111111, | |
| "loss": 3.6782, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.2931647008668468, | |
| "grad_norm": 1.3356878757476807, | |
| "learning_rate": 0.0004544555555555556, | |
| "loss": 3.6502, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.30027000142106, | |
| "grad_norm": 1.5904278755187988, | |
| "learning_rate": 0.00045390000000000003, | |
| "loss": 3.6615, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.3073753019752736, | |
| "grad_norm": 1.4965075254440308, | |
| "learning_rate": 0.00045334444444444447, | |
| "loss": 3.6604, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.314480602529487, | |
| "grad_norm": 1.5077792406082153, | |
| "learning_rate": 0.0004527888888888889, | |
| "loss": 3.638, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.3215859030837005, | |
| "grad_norm": 1.470985770225525, | |
| "learning_rate": 0.00045223333333333334, | |
| "loss": 3.6634, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.328691203637914, | |
| "grad_norm": 1.591996192932129, | |
| "learning_rate": 0.0004516777777777778, | |
| "loss": 3.6287, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.3357965041921274, | |
| "grad_norm": 1.4681540727615356, | |
| "learning_rate": 0.0004511222222222222, | |
| "loss": 3.6118, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.3429018047463408, | |
| "grad_norm": 1.5497246980667114, | |
| "learning_rate": 0.0004505666666666667, | |
| "loss": 3.6179, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.3500071053005542, | |
| "grad_norm": 1.499262809753418, | |
| "learning_rate": 0.0004500111111111111, | |
| "loss": 3.62, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.3571124058547677, | |
| "grad_norm": 1.4057672023773193, | |
| "learning_rate": 0.0004494555555555556, | |
| "loss": 3.6019, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.364217706408981, | |
| "grad_norm": 1.5262913703918457, | |
| "learning_rate": 0.0004489, | |
| "loss": 3.588, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.3713230069631945, | |
| "grad_norm": 1.4854165315628052, | |
| "learning_rate": 0.00044834444444444445, | |
| "loss": 3.5968, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.378428307517408, | |
| "grad_norm": 1.4390678405761719, | |
| "learning_rate": 0.0004477888888888889, | |
| "loss": 3.6228, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.3855336080716214, | |
| "grad_norm": 1.527435541152954, | |
| "learning_rate": 0.0004472333333333333, | |
| "loss": 3.6099, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.3926389086258348, | |
| "grad_norm": 1.4992133378982544, | |
| "learning_rate": 0.0004466777777777778, | |
| "loss": 3.6263, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.3997442091800483, | |
| "grad_norm": 1.5344403982162476, | |
| "learning_rate": 0.0004461222222222222, | |
| "loss": 3.5873, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.4068495097342617, | |
| "grad_norm": 1.4833427667617798, | |
| "learning_rate": 0.0004455666666666667, | |
| "loss": 3.57, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.4139548102884751, | |
| "grad_norm": 1.5680700540542603, | |
| "learning_rate": 0.00044501111111111113, | |
| "loss": 3.5832, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.4210601108426886, | |
| "grad_norm": 1.560567855834961, | |
| "learning_rate": 0.00044445555555555557, | |
| "loss": 3.5744, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.4210601108426886, | |
| "eval_accuracy": 0.4289895296096802, | |
| "eval_loss": 3.51322603225708, | |
| "eval_runtime": 1.2882, | |
| "eval_samples_per_second": 2918.705, | |
| "eval_steps_per_second": 45.799, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.428165411396902, | |
| "grad_norm": 1.402680516242981, | |
| "learning_rate": 0.0004439, | |
| "loss": 3.5277, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.4352707119511154, | |
| "grad_norm": 1.4061716794967651, | |
| "learning_rate": 0.00044334444444444444, | |
| "loss": 3.5653, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.442376012505329, | |
| "grad_norm": 1.5107778310775757, | |
| "learning_rate": 0.00044278888888888893, | |
| "loss": 3.5627, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.4494813130595423, | |
| "grad_norm": 1.4344958066940308, | |
| "learning_rate": 0.0004422333333333333, | |
| "loss": 3.5403, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.456586613613756, | |
| "grad_norm": 1.4615083932876587, | |
| "learning_rate": 0.0004416777777777778, | |
| "loss": 3.5344, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.4636919141679692, | |
| "grad_norm": 1.491568684577942, | |
| "learning_rate": 0.00044112222222222224, | |
| "loss": 3.5576, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.4707972147221828, | |
| "grad_norm": 1.4227489233016968, | |
| "learning_rate": 0.0004405666666666667, | |
| "loss": 3.533, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.4779025152763963, | |
| "grad_norm": 1.5831272602081299, | |
| "learning_rate": 0.0004400111111111111, | |
| "loss": 3.54, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.4850078158306097, | |
| "grad_norm": 1.3513504266738892, | |
| "learning_rate": 0.00043945555555555555, | |
| "loss": 3.5517, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.4921131163848231, | |
| "grad_norm": 1.704670786857605, | |
| "learning_rate": 0.00043890000000000004, | |
| "loss": 3.5364, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.4992184169390366, | |
| "grad_norm": 1.4062697887420654, | |
| "learning_rate": 0.0004383444444444444, | |
| "loss": 3.5202, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.50632371749325, | |
| "grad_norm": 1.3823599815368652, | |
| "learning_rate": 0.0004377888888888889, | |
| "loss": 3.5123, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.5134290180474634, | |
| "grad_norm": 1.617918848991394, | |
| "learning_rate": 0.0004372333333333333, | |
| "loss": 3.53, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.5205343186016769, | |
| "grad_norm": 1.5281829833984375, | |
| "learning_rate": 0.0004366777777777778, | |
| "loss": 3.5191, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.5276396191558903, | |
| "grad_norm": 1.454651951789856, | |
| "learning_rate": 0.00043612222222222223, | |
| "loss": 3.5129, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.5347449197101037, | |
| "grad_norm": 1.5583391189575195, | |
| "learning_rate": 0.00043556666666666666, | |
| "loss": 3.4786, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.5418502202643172, | |
| "grad_norm": 1.4228161573410034, | |
| "learning_rate": 0.00043501111111111116, | |
| "loss": 3.5236, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.5489555208185306, | |
| "grad_norm": 1.5005617141723633, | |
| "learning_rate": 0.00043445555555555554, | |
| "loss": 3.4942, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.556060821372744, | |
| "grad_norm": 1.4787464141845703, | |
| "learning_rate": 0.00043390000000000003, | |
| "loss": 3.4913, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.5631661219269575, | |
| "grad_norm": 1.5321528911590576, | |
| "learning_rate": 0.0004333444444444444, | |
| "loss": 3.4675, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.570271422481171, | |
| "grad_norm": 1.3829543590545654, | |
| "learning_rate": 0.0004327888888888889, | |
| "loss": 3.4547, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.5773767230353843, | |
| "grad_norm": 1.7110611200332642, | |
| "learning_rate": 0.00043223333333333334, | |
| "loss": 3.4481, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.5844820235895978, | |
| "grad_norm": 1.5753222703933716, | |
| "learning_rate": 0.0004316777777777778, | |
| "loss": 3.484, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.5915873241438114, | |
| "grad_norm": 1.454259991645813, | |
| "learning_rate": 0.00043112222222222227, | |
| "loss": 3.4728, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.5986926246980246, | |
| "grad_norm": 1.4018853902816772, | |
| "learning_rate": 0.00043056666666666665, | |
| "loss": 3.4706, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.6057979252522383, | |
| "grad_norm": 1.6062161922454834, | |
| "learning_rate": 0.00043001111111111114, | |
| "loss": 3.4497, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.6129032258064515, | |
| "grad_norm": 1.585882306098938, | |
| "learning_rate": 0.0004294555555555555, | |
| "loss": 3.4331, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.6200085263606652, | |
| "grad_norm": 1.4439284801483154, | |
| "learning_rate": 0.0004289, | |
| "loss": 3.4684, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.6271138269148784, | |
| "grad_norm": 1.3603633642196655, | |
| "learning_rate": 0.0004283444444444445, | |
| "loss": 3.4541, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.634219127469092, | |
| "grad_norm": 1.4939123392105103, | |
| "learning_rate": 0.0004277888888888889, | |
| "loss": 3.46, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.6413244280233052, | |
| "grad_norm": 1.4626458883285522, | |
| "learning_rate": 0.0004272333333333334, | |
| "loss": 3.4453, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.648429728577519, | |
| "grad_norm": 1.5030425786972046, | |
| "learning_rate": 0.00042667777777777776, | |
| "loss": 3.4265, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.6555350291317321, | |
| "grad_norm": 1.4823453426361084, | |
| "learning_rate": 0.00042612222222222226, | |
| "loss": 3.4356, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.6626403296859458, | |
| "grad_norm": 1.510815978050232, | |
| "learning_rate": 0.00042556666666666664, | |
| "loss": 3.464, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.6697456302401592, | |
| "grad_norm": 1.4318405389785767, | |
| "learning_rate": 0.00042501111111111113, | |
| "loss": 3.4274, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.6768509307943726, | |
| "grad_norm": 1.4325045347213745, | |
| "learning_rate": 0.0004244555555555555, | |
| "loss": 3.4449, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.683956231348586, | |
| "grad_norm": 1.6433807611465454, | |
| "learning_rate": 0.0004239, | |
| "loss": 3.4162, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 1.6910615319027995, | |
| "grad_norm": 1.6231051683425903, | |
| "learning_rate": 0.0004233444444444445, | |
| "loss": 3.4107, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.698166832457013, | |
| "grad_norm": 1.468326449394226, | |
| "learning_rate": 0.0004227888888888889, | |
| "loss": 3.3951, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 1.7052721330112264, | |
| "grad_norm": 1.5077193975448608, | |
| "learning_rate": 0.00042223333333333337, | |
| "loss": 3.4196, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7052721330112264, | |
| "eval_accuracy": 0.44879472255706787, | |
| "eval_loss": 3.3239564895629883, | |
| "eval_runtime": 1.3407, | |
| "eval_samples_per_second": 2804.532, | |
| "eval_steps_per_second": 44.007, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7123774335654398, | |
| "grad_norm": 1.4623733758926392, | |
| "learning_rate": 0.00042167777777777775, | |
| "loss": 3.392, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 1.7194827341196532, | |
| "grad_norm": 1.561031699180603, | |
| "learning_rate": 0.00042112222222222224, | |
| "loss": 3.4159, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.7265880346738667, | |
| "grad_norm": 1.495694875717163, | |
| "learning_rate": 0.0004205666666666667, | |
| "loss": 3.411, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 1.7336933352280801, | |
| "grad_norm": 1.462576150894165, | |
| "learning_rate": 0.0004200111111111111, | |
| "loss": 3.4034, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.7407986357822935, | |
| "grad_norm": 1.582942247390747, | |
| "learning_rate": 0.0004194555555555556, | |
| "loss": 3.3976, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 1.747903936336507, | |
| "grad_norm": 1.4122976064682007, | |
| "learning_rate": 0.0004189, | |
| "loss": 3.3722, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.7550092368907206, | |
| "grad_norm": 1.7197649478912354, | |
| "learning_rate": 0.0004183444444444445, | |
| "loss": 3.3926, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 1.7621145374449338, | |
| "grad_norm": 1.452899694442749, | |
| "learning_rate": 0.00041778888888888886, | |
| "loss": 3.3943, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.7692198379991475, | |
| "grad_norm": 1.4426175355911255, | |
| "learning_rate": 0.00041723333333333336, | |
| "loss": 3.375, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 1.7763251385533607, | |
| "grad_norm": 1.4973036050796509, | |
| "learning_rate": 0.0004166777777777778, | |
| "loss": 3.3823, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.7834304391075744, | |
| "grad_norm": 1.5862528085708618, | |
| "learning_rate": 0.00041612222222222223, | |
| "loss": 3.3689, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 1.7905357396617876, | |
| "grad_norm": 1.5989631414413452, | |
| "learning_rate": 0.00041556666666666667, | |
| "loss": 3.3464, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.7976410402160012, | |
| "grad_norm": 1.54103422164917, | |
| "learning_rate": 0.0004150111111111111, | |
| "loss": 3.3558, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 1.8047463407702145, | |
| "grad_norm": 1.4894845485687256, | |
| "learning_rate": 0.0004144555555555556, | |
| "loss": 3.3531, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.811851641324428, | |
| "grad_norm": 1.3978097438812256, | |
| "learning_rate": 0.0004139, | |
| "loss": 3.3272, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 1.8189569418786413, | |
| "grad_norm": 1.5314109325408936, | |
| "learning_rate": 0.00041334444444444447, | |
| "loss": 3.3423, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.826062242432855, | |
| "grad_norm": 1.595335841178894, | |
| "learning_rate": 0.0004127888888888889, | |
| "loss": 3.3897, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 1.8331675429870682, | |
| "grad_norm": 1.457351565361023, | |
| "learning_rate": 0.00041223333333333334, | |
| "loss": 3.3346, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.8402728435412818, | |
| "grad_norm": 1.4411284923553467, | |
| "learning_rate": 0.0004116777777777778, | |
| "loss": 3.3504, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 1.8473781440954953, | |
| "grad_norm": 1.4269949197769165, | |
| "learning_rate": 0.0004111222222222222, | |
| "loss": 3.3533, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.8544834446497087, | |
| "grad_norm": 1.4080978631973267, | |
| "learning_rate": 0.0004105666666666667, | |
| "loss": 3.3357, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 1.8615887452039221, | |
| "grad_norm": 1.4291956424713135, | |
| "learning_rate": 0.0004100111111111111, | |
| "loss": 3.3473, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.8686940457581356, | |
| "grad_norm": 1.411839246749878, | |
| "learning_rate": 0.0004094555555555556, | |
| "loss": 3.3267, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.875799346312349, | |
| "grad_norm": 1.5854250192642212, | |
| "learning_rate": 0.0004089, | |
| "loss": 3.3545, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.8829046468665624, | |
| "grad_norm": 1.330222725868225, | |
| "learning_rate": 0.00040834444444444446, | |
| "loss": 3.3221, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.8900099474207759, | |
| "grad_norm": 1.4597231149673462, | |
| "learning_rate": 0.0004077888888888889, | |
| "loss": 3.333, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.8971152479749893, | |
| "grad_norm": 1.4997475147247314, | |
| "learning_rate": 0.00040723333333333333, | |
| "loss": 3.3313, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.9042205485292028, | |
| "grad_norm": 1.5232534408569336, | |
| "learning_rate": 0.0004066777777777778, | |
| "loss": 3.3063, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.9113258490834162, | |
| "grad_norm": 1.451190710067749, | |
| "learning_rate": 0.0004061222222222222, | |
| "loss": 3.3367, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.9184311496376296, | |
| "grad_norm": 1.4469232559204102, | |
| "learning_rate": 0.0004055666666666667, | |
| "loss": 3.3166, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.925536450191843, | |
| "grad_norm": 1.5176293849945068, | |
| "learning_rate": 0.00040501111111111113, | |
| "loss": 3.3052, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.9326417507460567, | |
| "grad_norm": 1.4334474802017212, | |
| "learning_rate": 0.00040445555555555557, | |
| "loss": 3.2966, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.93974705130027, | |
| "grad_norm": 1.3909268379211426, | |
| "learning_rate": 0.0004039, | |
| "loss": 3.2984, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.9468523518544836, | |
| "grad_norm": 1.4753868579864502, | |
| "learning_rate": 0.00040334444444444444, | |
| "loss": 3.2928, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.9539576524086968, | |
| "grad_norm": 1.414066195487976, | |
| "learning_rate": 0.0004027888888888889, | |
| "loss": 3.3084, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.9610629529629104, | |
| "grad_norm": 1.5064789056777954, | |
| "learning_rate": 0.0004022333333333333, | |
| "loss": 3.3257, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.9681682535171237, | |
| "grad_norm": 1.5383269786834717, | |
| "learning_rate": 0.0004016777777777778, | |
| "loss": 3.3141, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.9752735540713373, | |
| "grad_norm": 1.6095879077911377, | |
| "learning_rate": 0.00040112222222222224, | |
| "loss": 3.2907, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.9823788546255505, | |
| "grad_norm": 1.447340488433838, | |
| "learning_rate": 0.0004005666666666667, | |
| "loss": 3.26, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.9894841551797642, | |
| "grad_norm": 1.5441666841506958, | |
| "learning_rate": 0.0004000111111111111, | |
| "loss": 3.2894, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.9894841551797642, | |
| "eval_accuracy": 0.45595255494117737, | |
| "eval_loss": 3.245368719100952, | |
| "eval_runtime": 1.4692, | |
| "eval_samples_per_second": 2559.288, | |
| "eval_steps_per_second": 40.159, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.9965894557339774, | |
| "grad_norm": 1.566861629486084, | |
| "learning_rate": 0.00039945555555555556, | |
| "loss": 3.294, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 2.003694756288191, | |
| "grad_norm": 1.5748252868652344, | |
| "learning_rate": 0.0003989, | |
| "loss": 3.2552, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.0108000568424043, | |
| "grad_norm": 1.5415462255477905, | |
| "learning_rate": 0.00039834444444444443, | |
| "loss": 3.2492, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 2.017905357396618, | |
| "grad_norm": 1.5561825037002563, | |
| "learning_rate": 0.0003977888888888889, | |
| "loss": 3.2842, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.025010657950831, | |
| "grad_norm": 1.6461340188980103, | |
| "learning_rate": 0.00039723333333333336, | |
| "loss": 3.2462, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 2.032115958505045, | |
| "grad_norm": 1.5616611242294312, | |
| "learning_rate": 0.0003966777777777778, | |
| "loss": 3.2676, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.039221259059258, | |
| "grad_norm": 1.4216052293777466, | |
| "learning_rate": 0.00039612222222222223, | |
| "loss": 3.2811, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 2.0463265596134717, | |
| "grad_norm": 1.5665044784545898, | |
| "learning_rate": 0.00039556666666666667, | |
| "loss": 3.2648, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.0534318601676853, | |
| "grad_norm": 1.4758416414260864, | |
| "learning_rate": 0.0003950111111111111, | |
| "loss": 3.2494, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 2.0605371607218985, | |
| "grad_norm": 1.5897761583328247, | |
| "learning_rate": 0.00039445555555555554, | |
| "loss": 3.2562, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.067642461276112, | |
| "grad_norm": 1.5198686122894287, | |
| "learning_rate": 0.0003939, | |
| "loss": 3.2323, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 2.0747477618303254, | |
| "grad_norm": 1.5690261125564575, | |
| "learning_rate": 0.00039334444444444447, | |
| "loss": 3.2525, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.081853062384539, | |
| "grad_norm": 1.4182822704315186, | |
| "learning_rate": 0.0003927888888888889, | |
| "loss": 3.253, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 2.0889583629387523, | |
| "grad_norm": 1.4269342422485352, | |
| "learning_rate": 0.00039223333333333334, | |
| "loss": 3.2332, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.096063663492966, | |
| "grad_norm": 1.3798179626464844, | |
| "learning_rate": 0.0003916777777777778, | |
| "loss": 3.2357, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 2.103168964047179, | |
| "grad_norm": 1.5191919803619385, | |
| "learning_rate": 0.0003911222222222222, | |
| "loss": 3.2516, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.110274264601393, | |
| "grad_norm": 1.4717212915420532, | |
| "learning_rate": 0.00039056666666666666, | |
| "loss": 3.2508, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 2.117379565155606, | |
| "grad_norm": 1.468482494354248, | |
| "learning_rate": 0.0003900111111111111, | |
| "loss": 3.2272, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.1244848657098196, | |
| "grad_norm": 1.424468994140625, | |
| "learning_rate": 0.0003894555555555556, | |
| "loss": 3.2288, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 2.131590166264033, | |
| "grad_norm": 1.4754891395568848, | |
| "learning_rate": 0.0003889, | |
| "loss": 3.2092, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.1386954668182465, | |
| "grad_norm": 1.6779147386550903, | |
| "learning_rate": 0.00038834444444444446, | |
| "loss": 3.2062, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 2.1458007673724597, | |
| "grad_norm": 1.3882695436477661, | |
| "learning_rate": 0.0003877888888888889, | |
| "loss": 3.23, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.1529060679266734, | |
| "grad_norm": 1.6576505899429321, | |
| "learning_rate": 0.00038723333333333333, | |
| "loss": 3.2042, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 2.1600113684808866, | |
| "grad_norm": 1.4277151823043823, | |
| "learning_rate": 0.00038667777777777777, | |
| "loss": 3.2246, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.1671166690351003, | |
| "grad_norm": 1.588564395904541, | |
| "learning_rate": 0.0003861222222222222, | |
| "loss": 3.2095, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 2.1742219695893135, | |
| "grad_norm": 1.5983752012252808, | |
| "learning_rate": 0.0003855666666666667, | |
| "loss": 3.22, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.181327270143527, | |
| "grad_norm": 1.5480470657348633, | |
| "learning_rate": 0.00038501111111111113, | |
| "loss": 3.1973, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 2.1884325706977403, | |
| "grad_norm": 1.7334848642349243, | |
| "learning_rate": 0.00038445555555555557, | |
| "loss": 3.1972, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.195537871251954, | |
| "grad_norm": 1.4713610410690308, | |
| "learning_rate": 0.0003839, | |
| "loss": 3.206, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 2.202643171806167, | |
| "grad_norm": 1.4612007141113281, | |
| "learning_rate": 0.00038334444444444444, | |
| "loss": 3.1949, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.209748472360381, | |
| "grad_norm": 1.5234096050262451, | |
| "learning_rate": 0.00038278888888888894, | |
| "loss": 3.1958, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 2.216853772914594, | |
| "grad_norm": 1.554410457611084, | |
| "learning_rate": 0.0003822333333333333, | |
| "loss": 3.2034, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.2239590734688077, | |
| "grad_norm": 1.4644205570220947, | |
| "learning_rate": 0.0003816777777777778, | |
| "loss": 3.2038, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 2.2310643740230214, | |
| "grad_norm": 1.5510737895965576, | |
| "learning_rate": 0.0003811222222222222, | |
| "loss": 3.1962, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.2381696745772346, | |
| "grad_norm": 1.564414381980896, | |
| "learning_rate": 0.0003805666666666667, | |
| "loss": 3.2068, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 2.2452749751314482, | |
| "grad_norm": 1.5432263612747192, | |
| "learning_rate": 0.0003800111111111111, | |
| "loss": 3.1981, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.2523802756856615, | |
| "grad_norm": 1.6288096904754639, | |
| "learning_rate": 0.00037945555555555556, | |
| "loss": 3.2062, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 2.259485576239875, | |
| "grad_norm": 1.5214354991912842, | |
| "learning_rate": 0.00037890000000000005, | |
| "loss": 3.194, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.2665908767940883, | |
| "grad_norm": 1.4612019062042236, | |
| "learning_rate": 0.00037834444444444443, | |
| "loss": 3.2283, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 2.273696177348302, | |
| "grad_norm": 1.5259031057357788, | |
| "learning_rate": 0.0003777888888888889, | |
| "loss": 3.173, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.273696177348302, | |
| "eval_accuracy": 0.4649949371814728, | |
| "eval_loss": 3.1757428646087646, | |
| "eval_runtime": 1.3251, | |
| "eval_samples_per_second": 2837.513, | |
| "eval_steps_per_second": 44.525, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.280801477902515, | |
| "grad_norm": 1.536083698272705, | |
| "learning_rate": 0.0003772333333333333, | |
| "loss": 3.1828, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 2.287906778456729, | |
| "grad_norm": 1.5883125066757202, | |
| "learning_rate": 0.0003766777777777778, | |
| "loss": 3.1711, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.295012079010942, | |
| "grad_norm": 1.5141252279281616, | |
| "learning_rate": 0.00037612222222222223, | |
| "loss": 3.1669, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 2.3021173795651557, | |
| "grad_norm": 1.5442508459091187, | |
| "learning_rate": 0.00037556666666666667, | |
| "loss": 3.1835, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.309222680119369, | |
| "grad_norm": 1.5498298406600952, | |
| "learning_rate": 0.00037501111111111116, | |
| "loss": 3.1566, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 2.3163279806735826, | |
| "grad_norm": 1.5234136581420898, | |
| "learning_rate": 0.00037445555555555554, | |
| "loss": 3.1623, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.323433281227796, | |
| "grad_norm": 1.557966947555542, | |
| "learning_rate": 0.00037390000000000004, | |
| "loss": 3.152, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 2.3305385817820095, | |
| "grad_norm": 1.641692042350769, | |
| "learning_rate": 0.0003733444444444444, | |
| "loss": 3.1508, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.3376438823362227, | |
| "grad_norm": 1.5023847818374634, | |
| "learning_rate": 0.0003727888888888889, | |
| "loss": 3.1556, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 2.3447491828904363, | |
| "grad_norm": 1.4382113218307495, | |
| "learning_rate": 0.00037223333333333335, | |
| "loss": 3.154, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.3518544834446495, | |
| "grad_norm": 1.6160063743591309, | |
| "learning_rate": 0.0003716777777777778, | |
| "loss": 3.1567, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 2.358959783998863, | |
| "grad_norm": 1.6019223928451538, | |
| "learning_rate": 0.0003711222222222223, | |
| "loss": 3.1607, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.3660650845530764, | |
| "grad_norm": 1.4011200666427612, | |
| "learning_rate": 0.00037056666666666666, | |
| "loss": 3.1499, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 2.37317038510729, | |
| "grad_norm": 1.4653880596160889, | |
| "learning_rate": 0.00037001111111111115, | |
| "loss": 3.1396, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.3802756856615037, | |
| "grad_norm": 1.5659642219543457, | |
| "learning_rate": 0.00036945555555555553, | |
| "loss": 3.1565, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 2.387380986215717, | |
| "grad_norm": 1.6067794561386108, | |
| "learning_rate": 0.0003689, | |
| "loss": 3.169, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.39448628676993, | |
| "grad_norm": 1.4000523090362549, | |
| "learning_rate": 0.0003683444444444444, | |
| "loss": 3.1505, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 2.401591587324144, | |
| "grad_norm": 1.4956333637237549, | |
| "learning_rate": 0.0003677888888888889, | |
| "loss": 3.1618, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.4086968878783575, | |
| "grad_norm": 1.564629316329956, | |
| "learning_rate": 0.0003672333333333334, | |
| "loss": 3.1462, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 2.4158021884325707, | |
| "grad_norm": 1.533850073814392, | |
| "learning_rate": 0.00036667777777777777, | |
| "loss": 3.1486, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.4229074889867843, | |
| "grad_norm": 1.4568897485733032, | |
| "learning_rate": 0.00036612222222222226, | |
| "loss": 3.138, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 2.4300127895409975, | |
| "grad_norm": 1.513048768043518, | |
| "learning_rate": 0.00036556666666666664, | |
| "loss": 3.1468, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 2.437118090095211, | |
| "grad_norm": 1.400328516960144, | |
| "learning_rate": 0.00036501111111111114, | |
| "loss": 3.1431, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 2.4442233906494244, | |
| "grad_norm": 1.554104208946228, | |
| "learning_rate": 0.0003644555555555555, | |
| "loss": 3.1236, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 2.451328691203638, | |
| "grad_norm": 1.4198516607284546, | |
| "learning_rate": 0.0003639, | |
| "loss": 3.1398, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 2.4584339917578513, | |
| "grad_norm": 1.5592104196548462, | |
| "learning_rate": 0.0003633444444444445, | |
| "loss": 3.1229, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 2.465539292312065, | |
| "grad_norm": 1.5739942789077759, | |
| "learning_rate": 0.0003627888888888889, | |
| "loss": 3.1334, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 2.472644592866278, | |
| "grad_norm": 1.6278862953186035, | |
| "learning_rate": 0.0003622333333333334, | |
| "loss": 3.1245, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 2.479749893420492, | |
| "grad_norm": 1.5226835012435913, | |
| "learning_rate": 0.00036167777777777776, | |
| "loss": 3.1179, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 2.486855193974705, | |
| "grad_norm": 1.4963643550872803, | |
| "learning_rate": 0.00036112222222222225, | |
| "loss": 3.1252, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.4939604945289187, | |
| "grad_norm": 1.571409821510315, | |
| "learning_rate": 0.00036056666666666663, | |
| "loss": 3.1178, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 2.501065795083132, | |
| "grad_norm": 1.5292372703552246, | |
| "learning_rate": 0.0003600111111111111, | |
| "loss": 3.1328, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 2.5081710956373455, | |
| "grad_norm": 1.5522916316986084, | |
| "learning_rate": 0.00035945555555555556, | |
| "loss": 3.1346, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 2.5152763961915587, | |
| "grad_norm": 1.526452660560608, | |
| "learning_rate": 0.0003589, | |
| "loss": 3.1153, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 2.5223816967457724, | |
| "grad_norm": 1.5139750242233276, | |
| "learning_rate": 0.0003583444444444445, | |
| "loss": 3.109, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 2.529486997299986, | |
| "grad_norm": 1.616378903388977, | |
| "learning_rate": 0.00035778888888888887, | |
| "loss": 3.1086, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 2.5365922978541993, | |
| "grad_norm": 1.4770593643188477, | |
| "learning_rate": 0.00035723333333333336, | |
| "loss": 3.11, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 2.5436975984084125, | |
| "grad_norm": 1.4987894296646118, | |
| "learning_rate": 0.00035667777777777774, | |
| "loss": 3.1083, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 2.550802898962626, | |
| "grad_norm": 1.5630767345428467, | |
| "learning_rate": 0.00035612222222222223, | |
| "loss": 3.1057, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 2.55790819951684, | |
| "grad_norm": 1.495004415512085, | |
| "learning_rate": 0.00035556666666666667, | |
| "loss": 3.0933, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.55790819951684, | |
| "eval_accuracy": 0.480011910200119, | |
| "eval_loss": 3.0557689666748047, | |
| "eval_runtime": 1.3066, | |
| "eval_samples_per_second": 2877.766, | |
| "eval_steps_per_second": 45.156, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.565013500071053, | |
| "grad_norm": 1.4845120906829834, | |
| "learning_rate": 0.0003550111111111111, | |
| "loss": 3.1214, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 2.572118800625266, | |
| "grad_norm": 1.5265833139419556, | |
| "learning_rate": 0.0003544555555555556, | |
| "loss": 3.1069, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 2.57922410117948, | |
| "grad_norm": 1.4948880672454834, | |
| "learning_rate": 0.0003539, | |
| "loss": 3.108, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 2.5863294017336935, | |
| "grad_norm": 1.5253654718399048, | |
| "learning_rate": 0.0003533444444444445, | |
| "loss": 3.0819, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 2.5934347022879067, | |
| "grad_norm": 1.5354139804840088, | |
| "learning_rate": 0.00035278888888888886, | |
| "loss": 3.0993, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 2.60054000284212, | |
| "grad_norm": 1.5608172416687012, | |
| "learning_rate": 0.00035223333333333335, | |
| "loss": 3.0908, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 2.6076453033963336, | |
| "grad_norm": 1.6600279808044434, | |
| "learning_rate": 0.0003516777777777778, | |
| "loss": 3.0814, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 2.6147506039505473, | |
| "grad_norm": 1.5104360580444336, | |
| "learning_rate": 0.0003511222222222222, | |
| "loss": 3.0731, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 2.6218559045047605, | |
| "grad_norm": 1.682814359664917, | |
| "learning_rate": 0.0003505666666666667, | |
| "loss": 3.0993, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 2.628961205058974, | |
| "grad_norm": 1.5391907691955566, | |
| "learning_rate": 0.0003500111111111111, | |
| "loss": 3.0771, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.6360665056131873, | |
| "grad_norm": 1.5016599893569946, | |
| "learning_rate": 0.0003494555555555556, | |
| "loss": 3.0913, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 2.643171806167401, | |
| "grad_norm": 1.6000250577926636, | |
| "learning_rate": 0.00034889999999999997, | |
| "loss": 3.1168, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.650277106721614, | |
| "grad_norm": 1.5141435861587524, | |
| "learning_rate": 0.00034834444444444446, | |
| "loss": 3.0673, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 2.657382407275828, | |
| "grad_norm": 1.5580344200134277, | |
| "learning_rate": 0.0003477888888888889, | |
| "loss": 3.0857, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.664487707830041, | |
| "grad_norm": 1.4756735563278198, | |
| "learning_rate": 0.00034723333333333333, | |
| "loss": 3.092, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 2.6715930083842547, | |
| "grad_norm": 1.4011656045913696, | |
| "learning_rate": 0.00034667777777777777, | |
| "loss": 3.0678, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.678698308938468, | |
| "grad_norm": 1.4854072332382202, | |
| "learning_rate": 0.0003461222222222222, | |
| "loss": 3.0864, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 2.6858036094926816, | |
| "grad_norm": 1.5279541015625, | |
| "learning_rate": 0.0003455666666666667, | |
| "loss": 3.0626, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.692908910046895, | |
| "grad_norm": 1.4168810844421387, | |
| "learning_rate": 0.0003450111111111111, | |
| "loss": 3.0822, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 2.7000142106011085, | |
| "grad_norm": 1.5052216053009033, | |
| "learning_rate": 0.0003444555555555556, | |
| "loss": 3.1007, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.707119511155322, | |
| "grad_norm": 1.524707317352295, | |
| "learning_rate": 0.0003439, | |
| "loss": 3.0528, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 2.7142248117095353, | |
| "grad_norm": 1.5324409008026123, | |
| "learning_rate": 0.00034334444444444445, | |
| "loss": 3.0779, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.7213301122637485, | |
| "grad_norm": 1.5060224533081055, | |
| "learning_rate": 0.0003427888888888889, | |
| "loss": 3.0828, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 2.728435412817962, | |
| "grad_norm": 1.4482327699661255, | |
| "learning_rate": 0.0003422333333333333, | |
| "loss": 3.0503, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.735540713372176, | |
| "grad_norm": 1.4725353717803955, | |
| "learning_rate": 0.0003416777777777778, | |
| "loss": 3.0714, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 2.742646013926389, | |
| "grad_norm": 1.528380274772644, | |
| "learning_rate": 0.0003411222222222222, | |
| "loss": 3.0669, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.7497513144806023, | |
| "grad_norm": 1.517346739768982, | |
| "learning_rate": 0.0003405666666666667, | |
| "loss": 3.0601, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 2.756856615034816, | |
| "grad_norm": 1.5343750715255737, | |
| "learning_rate": 0.0003400111111111111, | |
| "loss": 3.0632, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.7639619155890296, | |
| "grad_norm": 1.5137101411819458, | |
| "learning_rate": 0.00033945555555555556, | |
| "loss": 3.0508, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 2.771067216143243, | |
| "grad_norm": 1.4996198415756226, | |
| "learning_rate": 0.0003389, | |
| "loss": 3.0681, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.7781725166974565, | |
| "grad_norm": 1.4925514459609985, | |
| "learning_rate": 0.00033834444444444443, | |
| "loss": 3.0629, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 2.7852778172516697, | |
| "grad_norm": 1.5612093210220337, | |
| "learning_rate": 0.0003377888888888889, | |
| "loss": 3.0824, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.7923831178058833, | |
| "grad_norm": 1.442588210105896, | |
| "learning_rate": 0.00033723333333333336, | |
| "loss": 3.054, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 2.7994884183600965, | |
| "grad_norm": 1.6014519929885864, | |
| "learning_rate": 0.0003366777777777778, | |
| "loss": 3.0527, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.80659371891431, | |
| "grad_norm": 1.5083961486816406, | |
| "learning_rate": 0.00033612222222222224, | |
| "loss": 3.0582, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.8136990194685234, | |
| "grad_norm": 1.504754662513733, | |
| "learning_rate": 0.0003355666666666667, | |
| "loss": 3.0809, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.820804320022737, | |
| "grad_norm": 1.5361518859863281, | |
| "learning_rate": 0.0003350111111111111, | |
| "loss": 3.0485, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 2.8279096205769503, | |
| "grad_norm": 1.5289889574050903, | |
| "learning_rate": 0.00033445555555555555, | |
| "loss": 3.057, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.835014921131164, | |
| "grad_norm": 1.4377233982086182, | |
| "learning_rate": 0.0003339, | |
| "loss": 3.0319, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 2.842120221685377, | |
| "grad_norm": 1.4774348735809326, | |
| "learning_rate": 0.0003333444444444445, | |
| "loss": 3.0391, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.842120221685377, | |
| "eval_accuracy": 0.4836238920688629, | |
| "eval_loss": 2.9734725952148438, | |
| "eval_runtime": 1.3307, | |
| "eval_samples_per_second": 2825.609, | |
| "eval_steps_per_second": 44.338, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.849225522239591, | |
| "grad_norm": 1.490995168685913, | |
| "learning_rate": 0.0003327888888888889, | |
| "loss": 3.03, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 2.856330822793804, | |
| "grad_norm": 1.5990046262741089, | |
| "learning_rate": 0.00033223333333333335, | |
| "loss": 3.0526, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.8634361233480177, | |
| "grad_norm": 1.4851841926574707, | |
| "learning_rate": 0.0003316777777777778, | |
| "loss": 3.038, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 2.870541423902231, | |
| "grad_norm": 1.617862582206726, | |
| "learning_rate": 0.0003311222222222222, | |
| "loss": 3.0544, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.8776467244564445, | |
| "grad_norm": 1.5293192863464355, | |
| "learning_rate": 0.00033056666666666666, | |
| "loss": 3.0331, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.884752025010658, | |
| "grad_norm": 1.5155032873153687, | |
| "learning_rate": 0.0003300111111111111, | |
| "loss": 3.0372, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.8918573255648714, | |
| "grad_norm": 1.4643704891204834, | |
| "learning_rate": 0.0003294555555555556, | |
| "loss": 3.0362, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 2.8989626261190846, | |
| "grad_norm": 1.4701436758041382, | |
| "learning_rate": 0.0003289, | |
| "loss": 3.038, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.9060679266732983, | |
| "grad_norm": 1.4922205209732056, | |
| "learning_rate": 0.00032834444444444446, | |
| "loss": 3.0214, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 2.913173227227512, | |
| "grad_norm": 1.5764334201812744, | |
| "learning_rate": 0.0003277888888888889, | |
| "loss": 3.0282, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.920278527781725, | |
| "grad_norm": 1.4649850130081177, | |
| "learning_rate": 0.00032723333333333334, | |
| "loss": 3.0165, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 2.9273838283359384, | |
| "grad_norm": 1.5053377151489258, | |
| "learning_rate": 0.0003266777777777778, | |
| "loss": 3.0112, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.934489128890152, | |
| "grad_norm": 1.5205187797546387, | |
| "learning_rate": 0.0003261222222222222, | |
| "loss": 3.0337, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 2.9415944294443657, | |
| "grad_norm": 1.6407885551452637, | |
| "learning_rate": 0.0003255666666666667, | |
| "loss": 3.0087, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.948699729998579, | |
| "grad_norm": 1.6242375373840332, | |
| "learning_rate": 0.0003250111111111111, | |
| "loss": 3.0223, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.9558050305527925, | |
| "grad_norm": 1.6333444118499756, | |
| "learning_rate": 0.0003244555555555556, | |
| "loss": 3.0213, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.9629103311070057, | |
| "grad_norm": 1.5015469789505005, | |
| "learning_rate": 0.0003239, | |
| "loss": 3.024, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 2.9700156316612194, | |
| "grad_norm": 1.5481396913528442, | |
| "learning_rate": 0.00032334444444444445, | |
| "loss": 2.997, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.9771209322154326, | |
| "grad_norm": 1.5252339839935303, | |
| "learning_rate": 0.0003227888888888889, | |
| "loss": 3.0115, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 2.9842262327696463, | |
| "grad_norm": 1.524375081062317, | |
| "learning_rate": 0.0003222333333333333, | |
| "loss": 3.0242, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.9913315333238595, | |
| "grad_norm": 1.471494197845459, | |
| "learning_rate": 0.0003216777777777778, | |
| "loss": 3.0049, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 2.998436833878073, | |
| "grad_norm": 1.4367444515228271, | |
| "learning_rate": 0.0003211222222222222, | |
| "loss": 3.0026, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 3.0055421344322863, | |
| "grad_norm": 1.6314510107040405, | |
| "learning_rate": 0.0003205666666666667, | |
| "loss": 2.9714, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 3.0126474349865, | |
| "grad_norm": 1.5607901811599731, | |
| "learning_rate": 0.0003200111111111111, | |
| "loss": 2.9827, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 3.019752735540713, | |
| "grad_norm": 1.5266896486282349, | |
| "learning_rate": 0.00031945555555555556, | |
| "loss": 2.9824, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 3.026858036094927, | |
| "grad_norm": 1.453753113746643, | |
| "learning_rate": 0.0003189, | |
| "loss": 3.0012, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 3.03396333664914, | |
| "grad_norm": 1.618914246559143, | |
| "learning_rate": 0.00031834444444444444, | |
| "loss": 2.996, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 3.0410686372033537, | |
| "grad_norm": 1.5350817441940308, | |
| "learning_rate": 0.00031778888888888893, | |
| "loss": 2.9959, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 3.048173937757567, | |
| "grad_norm": 1.5359801054000854, | |
| "learning_rate": 0.0003172333333333333, | |
| "loss": 3.006, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 3.0552792383117806, | |
| "grad_norm": 1.571722388267517, | |
| "learning_rate": 0.0003166777777777778, | |
| "loss": 2.965, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.062384538865994, | |
| "grad_norm": 1.5369782447814941, | |
| "learning_rate": 0.00031612222222222224, | |
| "loss": 2.9839, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 3.0694898394202075, | |
| "grad_norm": 1.4015165567398071, | |
| "learning_rate": 0.0003155666666666667, | |
| "loss": 2.9783, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 3.0765951399744207, | |
| "grad_norm": 1.4333746433258057, | |
| "learning_rate": 0.0003150111111111111, | |
| "loss": 2.9957, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 3.0837004405286343, | |
| "grad_norm": 1.555273175239563, | |
| "learning_rate": 0.00031445555555555555, | |
| "loss": 2.9952, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 3.090805741082848, | |
| "grad_norm": 1.4734160900115967, | |
| "learning_rate": 0.00031390000000000004, | |
| "loss": 2.958, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 3.097911041637061, | |
| "grad_norm": 1.5799846649169922, | |
| "learning_rate": 0.0003133444444444444, | |
| "loss": 2.979, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 3.105016342191275, | |
| "grad_norm": 1.608751654624939, | |
| "learning_rate": 0.0003127888888888889, | |
| "loss": 3.0088, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 3.112121642745488, | |
| "grad_norm": 1.4913153648376465, | |
| "learning_rate": 0.0003122333333333333, | |
| "loss": 2.9614, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 3.1192269432997017, | |
| "grad_norm": 1.4407751560211182, | |
| "learning_rate": 0.0003116777777777778, | |
| "loss": 2.9573, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 3.126332243853915, | |
| "grad_norm": 1.4962221384048462, | |
| "learning_rate": 0.0003111222222222222, | |
| "loss": 2.9946, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.126332243853915, | |
| "eval_accuracy": 0.49943476915359497, | |
| "eval_loss": 2.9075963497161865, | |
| "eval_runtime": 1.3119, | |
| "eval_samples_per_second": 2865.999, | |
| "eval_steps_per_second": 44.972, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.1334375444081286, | |
| "grad_norm": 1.5972894430160522, | |
| "learning_rate": 0.00031056666666666666, | |
| "loss": 2.9793, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 3.140542844962342, | |
| "grad_norm": 1.4097793102264404, | |
| "learning_rate": 0.00031001111111111115, | |
| "loss": 2.9575, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 3.1476481455165555, | |
| "grad_norm": 1.4834725856781006, | |
| "learning_rate": 0.00030945555555555554, | |
| "loss": 2.9657, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 3.1547534460707687, | |
| "grad_norm": 1.5428999662399292, | |
| "learning_rate": 0.00030890000000000003, | |
| "loss": 2.9836, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 3.1618587466249823, | |
| "grad_norm": 1.4695510864257812, | |
| "learning_rate": 0.0003083444444444444, | |
| "loss": 2.9748, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 3.1689640471791956, | |
| "grad_norm": 1.6710104942321777, | |
| "learning_rate": 0.0003077888888888889, | |
| "loss": 2.9663, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 3.176069347733409, | |
| "grad_norm": 1.473408818244934, | |
| "learning_rate": 0.00030723333333333334, | |
| "loss": 2.9748, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 3.1831746482876224, | |
| "grad_norm": 1.5751774311065674, | |
| "learning_rate": 0.0003066777777777778, | |
| "loss": 2.9596, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 3.190279948841836, | |
| "grad_norm": 1.4606789350509644, | |
| "learning_rate": 0.00030612222222222227, | |
| "loss": 2.972, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 3.1973852493960493, | |
| "grad_norm": 1.5335890054702759, | |
| "learning_rate": 0.00030556666666666665, | |
| "loss": 2.9667, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 3.204490549950263, | |
| "grad_norm": 1.5628565549850464, | |
| "learning_rate": 0.00030501111111111114, | |
| "loss": 2.9691, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 3.211595850504476, | |
| "grad_norm": 1.4582511186599731, | |
| "learning_rate": 0.0003044555555555555, | |
| "loss": 2.9518, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 3.21870115105869, | |
| "grad_norm": 1.4858638048171997, | |
| "learning_rate": 0.0003039, | |
| "loss": 2.9607, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 3.225806451612903, | |
| "grad_norm": 1.5188275575637817, | |
| "learning_rate": 0.0003033444444444445, | |
| "loss": 2.9544, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 3.2329117521671167, | |
| "grad_norm": 1.564578652381897, | |
| "learning_rate": 0.0003027888888888889, | |
| "loss": 2.9541, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 3.2400170527213303, | |
| "grad_norm": 1.453080177307129, | |
| "learning_rate": 0.0003022333333333334, | |
| "loss": 2.9404, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 3.2471223532755435, | |
| "grad_norm": 1.4518792629241943, | |
| "learning_rate": 0.00030167777777777776, | |
| "loss": 2.9623, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 3.2542276538297568, | |
| "grad_norm": 1.548088550567627, | |
| "learning_rate": 0.00030112222222222225, | |
| "loss": 2.9484, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 3.2613329543839704, | |
| "grad_norm": 1.5216915607452393, | |
| "learning_rate": 0.00030056666666666664, | |
| "loss": 2.9565, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 3.268438254938184, | |
| "grad_norm": 1.6633533239364624, | |
| "learning_rate": 0.00030001111111111113, | |
| "loss": 2.9563, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.2755435554923973, | |
| "grad_norm": 1.429800271987915, | |
| "learning_rate": 0.0002994555555555555, | |
| "loss": 2.9477, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 3.282648856046611, | |
| "grad_norm": 1.6086186170578003, | |
| "learning_rate": 0.0002989, | |
| "loss": 2.9605, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 3.289754156600824, | |
| "grad_norm": 1.481373906135559, | |
| "learning_rate": 0.0002983444444444445, | |
| "loss": 2.9466, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 3.296859457155038, | |
| "grad_norm": 1.5376640558242798, | |
| "learning_rate": 0.0002977888888888889, | |
| "loss": 2.9213, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 3.303964757709251, | |
| "grad_norm": 1.5091279745101929, | |
| "learning_rate": 0.00029723333333333337, | |
| "loss": 2.9449, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 3.3110700582634647, | |
| "grad_norm": 1.5239790678024292, | |
| "learning_rate": 0.00029667777777777775, | |
| "loss": 2.9451, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 3.318175358817678, | |
| "grad_norm": 1.6087703704833984, | |
| "learning_rate": 0.00029612222222222224, | |
| "loss": 2.9639, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 3.3252806593718915, | |
| "grad_norm": 1.3892955780029297, | |
| "learning_rate": 0.0002955666666666667, | |
| "loss": 2.9194, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 3.3323859599261048, | |
| "grad_norm": 1.447771668434143, | |
| "learning_rate": 0.0002950111111111111, | |
| "loss": 2.9282, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 3.3394912604803184, | |
| "grad_norm": 1.476275086402893, | |
| "learning_rate": 0.0002944555555555556, | |
| "loss": 2.9522, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 3.3465965610345316, | |
| "grad_norm": 1.6364308595657349, | |
| "learning_rate": 0.0002939, | |
| "loss": 2.9321, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 3.3537018615887453, | |
| "grad_norm": 1.6567586660385132, | |
| "learning_rate": 0.0002933444444444445, | |
| "loss": 2.9454, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 3.3608071621429585, | |
| "grad_norm": 1.5751633644104004, | |
| "learning_rate": 0.00029278888888888886, | |
| "loss": 2.942, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 3.367912462697172, | |
| "grad_norm": 1.575961947441101, | |
| "learning_rate": 0.00029223333333333335, | |
| "loss": 2.942, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 3.3750177632513854, | |
| "grad_norm": 1.5118695497512817, | |
| "learning_rate": 0.0002916777777777778, | |
| "loss": 2.9382, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 3.382123063805599, | |
| "grad_norm": 1.5508843660354614, | |
| "learning_rate": 0.00029112222222222223, | |
| "loss": 2.9573, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 3.3892283643598122, | |
| "grad_norm": 1.5142732858657837, | |
| "learning_rate": 0.00029056666666666666, | |
| "loss": 2.9448, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 3.396333664914026, | |
| "grad_norm": 1.4622957706451416, | |
| "learning_rate": 0.0002900111111111111, | |
| "loss": 2.938, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 3.403438965468239, | |
| "grad_norm": 1.4186699390411377, | |
| "learning_rate": 0.0002894555555555556, | |
| "loss": 2.9116, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 3.4105442660224528, | |
| "grad_norm": 1.5300025939941406, | |
| "learning_rate": 0.0002889, | |
| "loss": 2.9046, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.4105442660224528, | |
| "eval_accuracy": 0.4978906214237213, | |
| "eval_loss": 2.8925819396972656, | |
| "eval_runtime": 1.3351, | |
| "eval_samples_per_second": 2816.374, | |
| "eval_steps_per_second": 44.193, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.4176495665766664, | |
| "grad_norm": 1.4906315803527832, | |
| "learning_rate": 0.00028834444444444447, | |
| "loss": 2.936, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 3.4247548671308796, | |
| "grad_norm": 1.5187666416168213, | |
| "learning_rate": 0.0002877888888888889, | |
| "loss": 2.8953, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 3.431860167685093, | |
| "grad_norm": 1.4870028495788574, | |
| "learning_rate": 0.00028723333333333334, | |
| "loss": 2.923, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 3.4389654682393065, | |
| "grad_norm": 1.4789930582046509, | |
| "learning_rate": 0.0002866777777777778, | |
| "loss": 2.9348, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 3.44607076879352, | |
| "grad_norm": 1.6222258806228638, | |
| "learning_rate": 0.0002861222222222222, | |
| "loss": 2.9264, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 3.4531760693477334, | |
| "grad_norm": 1.4804177284240723, | |
| "learning_rate": 0.0002855666666666667, | |
| "loss": 2.9254, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 3.460281369901947, | |
| "grad_norm": 1.4715673923492432, | |
| "learning_rate": 0.0002850111111111111, | |
| "loss": 2.9074, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 3.4673866704561602, | |
| "grad_norm": 1.4322458505630493, | |
| "learning_rate": 0.0002844555555555556, | |
| "loss": 2.9132, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 3.474491971010374, | |
| "grad_norm": 1.4830477237701416, | |
| "learning_rate": 0.0002839, | |
| "loss": 2.9204, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 3.481597271564587, | |
| "grad_norm": 1.5052211284637451, | |
| "learning_rate": 0.00028334444444444445, | |
| "loss": 2.9064, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 3.4887025721188007, | |
| "grad_norm": 1.7150460481643677, | |
| "learning_rate": 0.0002827888888888889, | |
| "loss": 2.9238, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 3.495807872673014, | |
| "grad_norm": 1.4955459833145142, | |
| "learning_rate": 0.0002822333333333333, | |
| "loss": 2.9269, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 3.5029131732272276, | |
| "grad_norm": 1.5976444482803345, | |
| "learning_rate": 0.0002816777777777778, | |
| "loss": 2.9462, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 3.510018473781441, | |
| "grad_norm": 1.6254138946533203, | |
| "learning_rate": 0.0002811222222222222, | |
| "loss": 2.898, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 3.5171237743356545, | |
| "grad_norm": 1.4726321697235107, | |
| "learning_rate": 0.0002805666666666667, | |
| "loss": 2.9065, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 3.5242290748898677, | |
| "grad_norm": 1.5584523677825928, | |
| "learning_rate": 0.00028001111111111113, | |
| "loss": 2.9357, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 3.5313343754440814, | |
| "grad_norm": 1.4915764331817627, | |
| "learning_rate": 0.00027945555555555557, | |
| "loss": 2.9165, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 3.5384396759982946, | |
| "grad_norm": 1.5386276245117188, | |
| "learning_rate": 0.0002789, | |
| "loss": 2.9232, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 3.545544976552508, | |
| "grad_norm": 1.5589542388916016, | |
| "learning_rate": 0.00027834444444444444, | |
| "loss": 2.9032, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 3.5526502771067214, | |
| "grad_norm": 1.4987221956253052, | |
| "learning_rate": 0.0002777888888888889, | |
| "loss": 2.9246, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 3.559755577660935, | |
| "grad_norm": 1.4958397150039673, | |
| "learning_rate": 0.0002772333333333333, | |
| "loss": 2.8938, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 3.5668608782151487, | |
| "grad_norm": 1.5078121423721313, | |
| "learning_rate": 0.0002766777777777778, | |
| "loss": 2.9069, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 3.573966178769362, | |
| "grad_norm": 1.5042097568511963, | |
| "learning_rate": 0.00027612222222222224, | |
| "loss": 2.8918, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 3.581071479323575, | |
| "grad_norm": 1.671631932258606, | |
| "learning_rate": 0.0002755666666666667, | |
| "loss": 2.9023, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 3.588176779877789, | |
| "grad_norm": 1.4698235988616943, | |
| "learning_rate": 0.0002750111111111111, | |
| "loss": 2.883, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 3.5952820804320025, | |
| "grad_norm": 1.4968258142471313, | |
| "learning_rate": 0.00027445555555555555, | |
| "loss": 2.9238, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 3.6023873809862157, | |
| "grad_norm": 1.4356114864349365, | |
| "learning_rate": 0.0002739, | |
| "loss": 2.9126, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 3.609492681540429, | |
| "grad_norm": 1.5779649019241333, | |
| "learning_rate": 0.0002733444444444444, | |
| "loss": 2.9067, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 3.6165979820946426, | |
| "grad_norm": 1.580443024635315, | |
| "learning_rate": 0.0002727888888888889, | |
| "loss": 2.8901, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 3.623703282648856, | |
| "grad_norm": 1.555375576019287, | |
| "learning_rate": 0.00027223333333333335, | |
| "loss": 2.9117, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 3.6308085832030694, | |
| "grad_norm": 1.5805009603500366, | |
| "learning_rate": 0.0002716777777777778, | |
| "loss": 2.8997, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 3.637913883757283, | |
| "grad_norm": 1.3847641944885254, | |
| "learning_rate": 0.00027112222222222223, | |
| "loss": 2.9138, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 3.6450191843114963, | |
| "grad_norm": 1.647619605064392, | |
| "learning_rate": 0.00027056666666666667, | |
| "loss": 2.8953, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 3.65212448486571, | |
| "grad_norm": 1.5242257118225098, | |
| "learning_rate": 0.0002700111111111111, | |
| "loss": 2.9007, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 3.659229785419923, | |
| "grad_norm": 1.59304940700531, | |
| "learning_rate": 0.00026945555555555554, | |
| "loss": 2.8855, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 3.666335085974137, | |
| "grad_norm": 1.4405932426452637, | |
| "learning_rate": 0.0002689, | |
| "loss": 2.8963, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 3.67344038652835, | |
| "grad_norm": 1.5832966566085815, | |
| "learning_rate": 0.00026834444444444447, | |
| "loss": 2.8782, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 3.6805456870825637, | |
| "grad_norm": 1.4739205837249756, | |
| "learning_rate": 0.0002677888888888889, | |
| "loss": 2.875, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 3.687650987636777, | |
| "grad_norm": 1.439565658569336, | |
| "learning_rate": 0.00026723333333333334, | |
| "loss": 2.8956, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 3.6947562881909906, | |
| "grad_norm": 1.4591790437698364, | |
| "learning_rate": 0.0002666777777777778, | |
| "loss": 2.88, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 3.6947562881909906, | |
| "eval_accuracy": 0.4965103268623352, | |
| "eval_loss": 2.9005048274993896, | |
| "eval_runtime": 1.4001, | |
| "eval_samples_per_second": 2685.439, | |
| "eval_steps_per_second": 42.139, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 3.7018615887452038, | |
| "grad_norm": 1.6253418922424316, | |
| "learning_rate": 0.0002661222222222222, | |
| "loss": 2.8801, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 3.7089668892994174, | |
| "grad_norm": 1.652553915977478, | |
| "learning_rate": 0.00026556666666666665, | |
| "loss": 2.8799, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 3.716072189853631, | |
| "grad_norm": 1.4921988248825073, | |
| "learning_rate": 0.0002650111111111111, | |
| "loss": 2.8733, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 3.7231774904078443, | |
| "grad_norm": 1.5398036241531372, | |
| "learning_rate": 0.0002644555555555556, | |
| "loss": 2.8914, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 3.7302827909620575, | |
| "grad_norm": 1.5585711002349854, | |
| "learning_rate": 0.0002639, | |
| "loss": 2.894, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 3.737388091516271, | |
| "grad_norm": 1.5460278987884521, | |
| "learning_rate": 0.00026334444444444445, | |
| "loss": 2.8801, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 3.744493392070485, | |
| "grad_norm": 1.5803145170211792, | |
| "learning_rate": 0.0002627888888888889, | |
| "loss": 2.8933, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 3.751598692624698, | |
| "grad_norm": 1.649328589439392, | |
| "learning_rate": 0.00026223333333333333, | |
| "loss": 2.8536, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 3.7587039931789112, | |
| "grad_norm": 1.4576084613800049, | |
| "learning_rate": 0.00026167777777777777, | |
| "loss": 2.8657, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 3.765809293733125, | |
| "grad_norm": 1.6992441415786743, | |
| "learning_rate": 0.0002611222222222222, | |
| "loss": 2.8739, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 3.7729145942873386, | |
| "grad_norm": 1.479893445968628, | |
| "learning_rate": 0.0002605666666666667, | |
| "loss": 2.865, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 3.7800198948415518, | |
| "grad_norm": 1.599707841873169, | |
| "learning_rate": 0.00026001111111111113, | |
| "loss": 2.8904, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 3.787125195395765, | |
| "grad_norm": 1.5905715227127075, | |
| "learning_rate": 0.00025945555555555557, | |
| "loss": 2.8701, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 3.7942304959499786, | |
| "grad_norm": 1.6773452758789062, | |
| "learning_rate": 0.0002589, | |
| "loss": 2.8836, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 3.8013357965041923, | |
| "grad_norm": 1.4900513887405396, | |
| "learning_rate": 0.00025834444444444444, | |
| "loss": 2.8706, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 3.8084410970584055, | |
| "grad_norm": 1.6035610437393188, | |
| "learning_rate": 0.00025778888888888893, | |
| "loss": 2.8387, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 3.815546397612619, | |
| "grad_norm": 1.5936367511749268, | |
| "learning_rate": 0.0002572333333333333, | |
| "loss": 2.8678, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 3.8226516981668324, | |
| "grad_norm": 1.4388103485107422, | |
| "learning_rate": 0.0002566777777777778, | |
| "loss": 2.8685, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 3.829756998721046, | |
| "grad_norm": 1.6475917100906372, | |
| "learning_rate": 0.0002561222222222222, | |
| "loss": 2.8578, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 3.8368622992752592, | |
| "grad_norm": 1.4207926988601685, | |
| "learning_rate": 0.0002555666666666667, | |
| "loss": 2.8609, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 3.843967599829473, | |
| "grad_norm": 1.626503586769104, | |
| "learning_rate": 0.0002550111111111111, | |
| "loss": 2.8567, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 3.851072900383686, | |
| "grad_norm": 1.7468626499176025, | |
| "learning_rate": 0.00025445555555555555, | |
| "loss": 2.8804, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 3.8581782009378998, | |
| "grad_norm": 1.525142788887024, | |
| "learning_rate": 0.00025390000000000005, | |
| "loss": 2.8795, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 3.865283501492113, | |
| "grad_norm": 1.6227000951766968, | |
| "learning_rate": 0.00025334444444444443, | |
| "loss": 2.872, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 3.8723888020463266, | |
| "grad_norm": 1.4426857233047485, | |
| "learning_rate": 0.0002527888888888889, | |
| "loss": 2.859, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 3.87949410260054, | |
| "grad_norm": 1.6314607858657837, | |
| "learning_rate": 0.0002522333333333333, | |
| "loss": 2.8617, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 3.8865994031547535, | |
| "grad_norm": 1.6732324361801147, | |
| "learning_rate": 0.0002516777777777778, | |
| "loss": 2.8533, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 3.893704703708967, | |
| "grad_norm": 1.671032190322876, | |
| "learning_rate": 0.00025112222222222223, | |
| "loss": 2.8521, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 3.9008100042631804, | |
| "grad_norm": 1.5386325120925903, | |
| "learning_rate": 0.00025056666666666667, | |
| "loss": 2.8558, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 3.9079153048173936, | |
| "grad_norm": 1.48641037940979, | |
| "learning_rate": 0.00025001111111111116, | |
| "loss": 2.8706, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 3.9150206053716072, | |
| "grad_norm": 1.487549901008606, | |
| "learning_rate": 0.0002494555555555556, | |
| "loss": 2.8681, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 3.922125905925821, | |
| "grad_norm": 1.5427532196044922, | |
| "learning_rate": 0.00024890000000000003, | |
| "loss": 2.8411, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 3.929231206480034, | |
| "grad_norm": 1.6081897020339966, | |
| "learning_rate": 0.00024834444444444447, | |
| "loss": 2.8598, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 3.9363365070342473, | |
| "grad_norm": 1.4773359298706055, | |
| "learning_rate": 0.0002477888888888889, | |
| "loss": 2.8975, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 3.943441807588461, | |
| "grad_norm": 1.6076672077178955, | |
| "learning_rate": 0.00024723333333333334, | |
| "loss": 2.8473, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 3.9505471081426746, | |
| "grad_norm": 1.5951896905899048, | |
| "learning_rate": 0.0002466777777777778, | |
| "loss": 2.8476, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 3.957652408696888, | |
| "grad_norm": 1.5349781513214111, | |
| "learning_rate": 0.0002461222222222222, | |
| "loss": 2.8614, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 3.964757709251101, | |
| "grad_norm": 1.531628131866455, | |
| "learning_rate": 0.00024556666666666665, | |
| "loss": 2.8488, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 3.9718630098053147, | |
| "grad_norm": 1.4372403621673584, | |
| "learning_rate": 0.00024501111111111115, | |
| "loss": 2.8677, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 3.9789683103595284, | |
| "grad_norm": 1.4930979013442993, | |
| "learning_rate": 0.0002444555555555556, | |
| "loss": 2.8408, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.9789683103595284, | |
| "eval_accuracy": 0.5038213729858398, | |
| "eval_loss": 2.8301947116851807, | |
| "eval_runtime": 1.3222, | |
| "eval_samples_per_second": 2843.851, | |
| "eval_steps_per_second": 44.624, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.9860736109137416, | |
| "grad_norm": 1.5813312530517578, | |
| "learning_rate": 0.00024390000000000002, | |
| "loss": 2.8482, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 3.9931789114679552, | |
| "grad_norm": 1.5875399112701416, | |
| "learning_rate": 0.00024334444444444446, | |
| "loss": 2.8717, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 4.000284212022168, | |
| "grad_norm": 1.5822824239730835, | |
| "learning_rate": 0.0002427888888888889, | |
| "loss": 2.8383, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 4.007389512576382, | |
| "grad_norm": 1.4386391639709473, | |
| "learning_rate": 0.00024223333333333333, | |
| "loss": 2.8187, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 4.014494813130596, | |
| "grad_norm": 1.545289158821106, | |
| "learning_rate": 0.00024167777777777777, | |
| "loss": 2.811, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 4.0216001136848085, | |
| "grad_norm": 1.6034023761749268, | |
| "learning_rate": 0.0002411222222222222, | |
| "loss": 2.8284, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 4.028705414239022, | |
| "grad_norm": 1.6022422313690186, | |
| "learning_rate": 0.0002405666666666667, | |
| "loss": 2.8463, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 4.035810714793236, | |
| "grad_norm": 1.7777578830718994, | |
| "learning_rate": 0.00024001111111111113, | |
| "loss": 2.8322, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 4.0429160153474495, | |
| "grad_norm": 1.4642590284347534, | |
| "learning_rate": 0.00023945555555555557, | |
| "loss": 2.8207, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 4.050021315901662, | |
| "grad_norm": 1.5625720024108887, | |
| "learning_rate": 0.0002389, | |
| "loss": 2.8108, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 4.057126616455876, | |
| "grad_norm": 1.5230220556259155, | |
| "learning_rate": 0.00023834444444444444, | |
| "loss": 2.8204, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 4.06423191701009, | |
| "grad_norm": 1.7173887491226196, | |
| "learning_rate": 0.00023778888888888888, | |
| "loss": 2.8146, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 4.071337217564303, | |
| "grad_norm": 1.551294207572937, | |
| "learning_rate": 0.00023723333333333332, | |
| "loss": 2.845, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 4.078442518118516, | |
| "grad_norm": 1.4647568464279175, | |
| "learning_rate": 0.00023667777777777778, | |
| "loss": 2.8286, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 4.08554781867273, | |
| "grad_norm": 1.5898845195770264, | |
| "learning_rate": 0.00023612222222222225, | |
| "loss": 2.8352, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 4.092653119226943, | |
| "grad_norm": 1.4554657936096191, | |
| "learning_rate": 0.00023556666666666668, | |
| "loss": 2.8263, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 4.099758419781157, | |
| "grad_norm": 1.603566288948059, | |
| "learning_rate": 0.00023501111111111112, | |
| "loss": 2.7992, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 4.106863720335371, | |
| "grad_norm": 1.590300440788269, | |
| "learning_rate": 0.00023445555555555556, | |
| "loss": 2.7964, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 4.113969020889583, | |
| "grad_norm": 1.7716232538223267, | |
| "learning_rate": 0.0002339, | |
| "loss": 2.8384, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 4.121074321443797, | |
| "grad_norm": 1.6027381420135498, | |
| "learning_rate": 0.00023334444444444443, | |
| "loss": 2.8064, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 4.128179621998011, | |
| "grad_norm": 1.6521331071853638, | |
| "learning_rate": 0.0002327888888888889, | |
| "loss": 2.8155, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 4.135284922552224, | |
| "grad_norm": 1.635908603668213, | |
| "learning_rate": 0.00023223333333333336, | |
| "loss": 2.8135, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 4.142390223106437, | |
| "grad_norm": 1.4520660638809204, | |
| "learning_rate": 0.0002316777777777778, | |
| "loss": 2.8221, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 4.149495523660651, | |
| "grad_norm": 1.57816743850708, | |
| "learning_rate": 0.00023112222222222223, | |
| "loss": 2.8149, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 4.156600824214864, | |
| "grad_norm": 1.5677634477615356, | |
| "learning_rate": 0.00023056666666666667, | |
| "loss": 2.8118, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 4.163706124769078, | |
| "grad_norm": 1.4617812633514404, | |
| "learning_rate": 0.0002300111111111111, | |
| "loss": 2.8092, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 4.170811425323291, | |
| "grad_norm": 1.501502513885498, | |
| "learning_rate": 0.00022945555555555554, | |
| "loss": 2.812, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 4.1779167258775045, | |
| "grad_norm": 1.6901543140411377, | |
| "learning_rate": 0.0002289, | |
| "loss": 2.821, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 4.185022026431718, | |
| "grad_norm": 1.5179857015609741, | |
| "learning_rate": 0.00022834444444444444, | |
| "loss": 2.8395, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 4.192127326985932, | |
| "grad_norm": 1.5568187236785889, | |
| "learning_rate": 0.0002277888888888889, | |
| "loss": 2.8194, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 4.199232627540145, | |
| "grad_norm": 1.584041714668274, | |
| "learning_rate": 0.00022723333333333335, | |
| "loss": 2.7991, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 4.206337928094358, | |
| "grad_norm": 1.479154109954834, | |
| "learning_rate": 0.00022667777777777778, | |
| "loss": 2.8233, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 4.213443228648572, | |
| "grad_norm": 1.4960718154907227, | |
| "learning_rate": 0.00022612222222222222, | |
| "loss": 2.7909, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 4.220548529202786, | |
| "grad_norm": 1.6419041156768799, | |
| "learning_rate": 0.00022556666666666668, | |
| "loss": 2.8214, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 4.227653829756998, | |
| "grad_norm": 1.555156946182251, | |
| "learning_rate": 0.00022501111111111112, | |
| "loss": 2.8272, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 4.234759130311212, | |
| "grad_norm": 1.6603648662567139, | |
| "learning_rate": 0.00022445555555555556, | |
| "loss": 2.8172, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 4.241864430865426, | |
| "grad_norm": 1.4937909841537476, | |
| "learning_rate": 0.0002239, | |
| "loss": 2.7813, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 4.248969731419639, | |
| "grad_norm": 1.5089703798294067, | |
| "learning_rate": 0.00022334444444444446, | |
| "loss": 2.8092, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 4.256075031973852, | |
| "grad_norm": 1.4582308530807495, | |
| "learning_rate": 0.0002227888888888889, | |
| "loss": 2.7991, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 4.263180332528066, | |
| "grad_norm": 1.507042646408081, | |
| "learning_rate": 0.00022223333333333333, | |
| "loss": 2.7974, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 4.263180332528066, | |
| "eval_accuracy": 0.5150346159934998, | |
| "eval_loss": 2.7642641067504883, | |
| "eval_runtime": 1.4549, | |
| "eval_samples_per_second": 2584.342, | |
| "eval_steps_per_second": 40.552, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 4.270285633082279, | |
| "grad_norm": 1.5285909175872803, | |
| "learning_rate": 0.0002216777777777778, | |
| "loss": 2.7813, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 4.277390933636493, | |
| "grad_norm": 1.5540330410003662, | |
| "learning_rate": 0.00022112222222222223, | |
| "loss": 2.7818, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 4.284496234190707, | |
| "grad_norm": 1.6137828826904297, | |
| "learning_rate": 0.00022056666666666667, | |
| "loss": 2.8132, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 4.2916015347449195, | |
| "grad_norm": 1.5205047130584717, | |
| "learning_rate": 0.0002200111111111111, | |
| "loss": 2.7866, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 4.298706835299133, | |
| "grad_norm": 1.541063904762268, | |
| "learning_rate": 0.00021945555555555554, | |
| "loss": 2.8089, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 4.305812135853347, | |
| "grad_norm": 1.6706923246383667, | |
| "learning_rate": 0.0002189, | |
| "loss": 2.7891, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 4.31291743640756, | |
| "grad_norm": 1.6725937128067017, | |
| "learning_rate": 0.00021834444444444445, | |
| "loss": 2.7965, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 4.320022736961773, | |
| "grad_norm": 1.731666922569275, | |
| "learning_rate": 0.0002177888888888889, | |
| "loss": 2.7659, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 4.327128037515987, | |
| "grad_norm": 1.5914294719696045, | |
| "learning_rate": 0.00021723333333333335, | |
| "loss": 2.7915, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 4.3342333380702005, | |
| "grad_norm": 1.5213390588760376, | |
| "learning_rate": 0.00021667777777777778, | |
| "loss": 2.8122, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 4.341338638624414, | |
| "grad_norm": 1.7755457162857056, | |
| "learning_rate": 0.00021612222222222222, | |
| "loss": 2.7972, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 4.348443939178627, | |
| "grad_norm": 1.6222538948059082, | |
| "learning_rate": 0.00021556666666666666, | |
| "loss": 2.7935, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 4.355549239732841, | |
| "grad_norm": 1.7443130016326904, | |
| "learning_rate": 0.0002150111111111111, | |
| "loss": 2.8111, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 4.362654540287054, | |
| "grad_norm": 1.5261310338974, | |
| "learning_rate": 0.00021445555555555556, | |
| "loss": 2.7848, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 4.369759840841268, | |
| "grad_norm": 1.5985714197158813, | |
| "learning_rate": 0.00021390000000000002, | |
| "loss": 2.7745, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 4.376865141395481, | |
| "grad_norm": 1.5596888065338135, | |
| "learning_rate": 0.00021334444444444446, | |
| "loss": 2.8015, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 4.383970441949694, | |
| "grad_norm": 1.4315415620803833, | |
| "learning_rate": 0.0002127888888888889, | |
| "loss": 2.7807, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 4.391075742503908, | |
| "grad_norm": 1.545339822769165, | |
| "learning_rate": 0.00021223333333333333, | |
| "loss": 2.7805, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 4.398181043058122, | |
| "grad_norm": 1.5493332147598267, | |
| "learning_rate": 0.00021167777777777777, | |
| "loss": 2.8105, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 4.405286343612334, | |
| "grad_norm": 1.536258339881897, | |
| "learning_rate": 0.0002111222222222222, | |
| "loss": 2.7994, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 4.412391644166548, | |
| "grad_norm": 1.4628187417984009, | |
| "learning_rate": 0.00021056666666666667, | |
| "loss": 2.7527, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 4.419496944720762, | |
| "grad_norm": 1.5581260919570923, | |
| "learning_rate": 0.00021001111111111114, | |
| "loss": 2.8013, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 4.426602245274975, | |
| "grad_norm": 1.6152867078781128, | |
| "learning_rate": 0.00020945555555555557, | |
| "loss": 2.8103, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 4.433707545829188, | |
| "grad_norm": 1.5480724573135376, | |
| "learning_rate": 0.0002089, | |
| "loss": 2.7777, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 4.440812846383402, | |
| "grad_norm": 1.5430995225906372, | |
| "learning_rate": 0.00020834444444444445, | |
| "loss": 2.7929, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 4.4479181469376154, | |
| "grad_norm": 1.5283119678497314, | |
| "learning_rate": 0.00020778888888888888, | |
| "loss": 2.779, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 4.455023447491829, | |
| "grad_norm": 1.5975980758666992, | |
| "learning_rate": 0.00020723333333333332, | |
| "loss": 2.8032, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 4.462128748046043, | |
| "grad_norm": 1.5572302341461182, | |
| "learning_rate": 0.00020667777777777776, | |
| "loss": 2.7722, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 4.4692340486002555, | |
| "grad_norm": 1.6391266584396362, | |
| "learning_rate": 0.00020612222222222225, | |
| "loss": 2.7831, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 4.476339349154469, | |
| "grad_norm": 1.6277772188186646, | |
| "learning_rate": 0.00020556666666666669, | |
| "loss": 2.7939, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 4.483444649708683, | |
| "grad_norm": 1.6991925239562988, | |
| "learning_rate": 0.00020501111111111112, | |
| "loss": 2.7588, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 4.4905499502628965, | |
| "grad_norm": 1.5235546827316284, | |
| "learning_rate": 0.00020445555555555556, | |
| "loss": 2.7738, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 4.497655250817109, | |
| "grad_norm": 1.751791000366211, | |
| "learning_rate": 0.0002039, | |
| "loss": 2.7764, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 4.504760551371323, | |
| "grad_norm": 1.6844935417175293, | |
| "learning_rate": 0.00020334444444444443, | |
| "loss": 2.7975, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 4.511865851925537, | |
| "grad_norm": 1.4950183629989624, | |
| "learning_rate": 0.0002027888888888889, | |
| "loss": 2.7504, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 4.51897115247975, | |
| "grad_norm": 1.6350730657577515, | |
| "learning_rate": 0.00020223333333333333, | |
| "loss": 2.797, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 4.526076453033963, | |
| "grad_norm": 1.5607668161392212, | |
| "learning_rate": 0.0002016777777777778, | |
| "loss": 2.7793, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 4.533181753588177, | |
| "grad_norm": 1.506750226020813, | |
| "learning_rate": 0.00020112222222222223, | |
| "loss": 2.7641, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 4.54028705414239, | |
| "grad_norm": 1.5720961093902588, | |
| "learning_rate": 0.00020056666666666667, | |
| "loss": 2.782, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 4.547392354696604, | |
| "grad_norm": 1.601806640625, | |
| "learning_rate": 0.0002000111111111111, | |
| "loss": 2.7733, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 4.547392354696604, | |
| "eval_accuracy": 0.5182931423187256, | |
| "eval_loss": 2.738097906112671, | |
| "eval_runtime": 1.2841, | |
| "eval_samples_per_second": 2928.059, | |
| "eval_steps_per_second": 45.946, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 4.554497655250817, | |
| "grad_norm": 1.5717216730117798, | |
| "learning_rate": 0.00019945555555555555, | |
| "loss": 2.7728, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 4.56160295580503, | |
| "grad_norm": 1.6468063592910767, | |
| "learning_rate": 0.0001989, | |
| "loss": 2.7679, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 4.568708256359244, | |
| "grad_norm": 1.6788794994354248, | |
| "learning_rate": 0.00019834444444444445, | |
| "loss": 2.7733, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 4.575813556913458, | |
| "grad_norm": 1.6205172538757324, | |
| "learning_rate": 0.00019778888888888888, | |
| "loss": 2.7692, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 4.582918857467671, | |
| "grad_norm": 1.57520592212677, | |
| "learning_rate": 0.00019723333333333335, | |
| "loss": 2.7715, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 4.590024158021884, | |
| "grad_norm": 1.5816936492919922, | |
| "learning_rate": 0.00019667777777777778, | |
| "loss": 2.767, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 4.597129458576098, | |
| "grad_norm": 1.518502116203308, | |
| "learning_rate": 0.00019612222222222222, | |
| "loss": 2.7713, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 4.604234759130311, | |
| "grad_norm": 1.516652226448059, | |
| "learning_rate": 0.00019556666666666666, | |
| "loss": 2.7631, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 4.611340059684524, | |
| "grad_norm": 1.6790320873260498, | |
| "learning_rate": 0.00019501111111111112, | |
| "loss": 2.7556, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 4.618445360238738, | |
| "grad_norm": 1.5499494075775146, | |
| "learning_rate": 0.00019445555555555556, | |
| "loss": 2.7683, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 4.6255506607929515, | |
| "grad_norm": 1.4426767826080322, | |
| "learning_rate": 0.0001939, | |
| "loss": 2.7845, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 4.632655961347165, | |
| "grad_norm": 1.5415791273117065, | |
| "learning_rate": 0.00019334444444444446, | |
| "loss": 2.7534, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 4.639761261901379, | |
| "grad_norm": 1.4372013807296753, | |
| "learning_rate": 0.0001927888888888889, | |
| "loss": 2.7741, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 4.646866562455592, | |
| "grad_norm": 1.555567979812622, | |
| "learning_rate": 0.00019223333333333333, | |
| "loss": 2.7633, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 4.653971863009805, | |
| "grad_norm": 1.7388957738876343, | |
| "learning_rate": 0.00019167777777777777, | |
| "loss": 2.7753, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 4.661077163564019, | |
| "grad_norm": 1.6249463558197021, | |
| "learning_rate": 0.00019112222222222224, | |
| "loss": 2.7339, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 4.668182464118233, | |
| "grad_norm": 1.642773985862732, | |
| "learning_rate": 0.00019056666666666667, | |
| "loss": 2.7721, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 4.675287764672445, | |
| "grad_norm": 1.544217824935913, | |
| "learning_rate": 0.0001900111111111111, | |
| "loss": 2.743, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 4.682393065226659, | |
| "grad_norm": 1.688503384590149, | |
| "learning_rate": 0.00018945555555555555, | |
| "loss": 2.7594, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 4.689498365780873, | |
| "grad_norm": 1.5570727586746216, | |
| "learning_rate": 0.0001889, | |
| "loss": 2.7632, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 4.696603666335086, | |
| "grad_norm": 1.4844149351119995, | |
| "learning_rate": 0.00018834444444444445, | |
| "loss": 2.7619, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 4.703708966889299, | |
| "grad_norm": 1.5461333990097046, | |
| "learning_rate": 0.00018778888888888888, | |
| "loss": 2.7787, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 4.710814267443513, | |
| "grad_norm": 1.6980046033859253, | |
| "learning_rate": 0.00018723333333333335, | |
| "loss": 2.7698, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 4.717919567997726, | |
| "grad_norm": 1.4724769592285156, | |
| "learning_rate": 0.00018667777777777779, | |
| "loss": 2.765, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 4.72502486855194, | |
| "grad_norm": 1.5083281993865967, | |
| "learning_rate": 0.00018612222222222222, | |
| "loss": 2.7679, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 4.732130169106153, | |
| "grad_norm": 1.7240538597106934, | |
| "learning_rate": 0.00018556666666666666, | |
| "loss": 2.7522, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 4.7392354696603665, | |
| "grad_norm": 1.508767008781433, | |
| "learning_rate": 0.0001850111111111111, | |
| "loss": 2.7485, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 4.74634077021458, | |
| "grad_norm": 1.636539101600647, | |
| "learning_rate": 0.00018445555555555556, | |
| "loss": 2.742, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 4.753446070768794, | |
| "grad_norm": 1.4749336242675781, | |
| "learning_rate": 0.00018390000000000002, | |
| "loss": 2.7663, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 4.760551371323007, | |
| "grad_norm": 1.504374623298645, | |
| "learning_rate": 0.00018334444444444446, | |
| "loss": 2.7579, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 4.76765667187722, | |
| "grad_norm": 1.668945550918579, | |
| "learning_rate": 0.0001827888888888889, | |
| "loss": 2.7532, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 4.774761972431434, | |
| "grad_norm": 1.6482501029968262, | |
| "learning_rate": 0.00018223333333333334, | |
| "loss": 2.7407, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 4.7818672729856475, | |
| "grad_norm": 1.57821524143219, | |
| "learning_rate": 0.00018167777777777777, | |
| "loss": 2.7731, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 4.78897257353986, | |
| "grad_norm": 1.5972754955291748, | |
| "learning_rate": 0.0001811222222222222, | |
| "loss": 2.74, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 4.796077874094074, | |
| "grad_norm": 1.537202000617981, | |
| "learning_rate": 0.00018056666666666665, | |
| "loss": 2.7553, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 4.803183174648288, | |
| "grad_norm": 1.4743520021438599, | |
| "learning_rate": 0.00018001111111111114, | |
| "loss": 2.7268, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 4.810288475202501, | |
| "grad_norm": 1.5363125801086426, | |
| "learning_rate": 0.00017945555555555557, | |
| "loss": 2.7475, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 4.817393775756715, | |
| "grad_norm": 1.6069902181625366, | |
| "learning_rate": 0.0001789, | |
| "loss": 2.7431, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 4.824499076310928, | |
| "grad_norm": 1.5969340801239014, | |
| "learning_rate": 0.00017834444444444445, | |
| "loss": 2.7594, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 4.831604376865141, | |
| "grad_norm": 1.5152361392974854, | |
| "learning_rate": 0.00017778888888888889, | |
| "loss": 2.7431, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 4.831604376865141, | |
| "eval_accuracy": 0.5208772420883179, | |
| "eval_loss": 2.706029176712036, | |
| "eval_runtime": 1.4028, | |
| "eval_samples_per_second": 2680.365, | |
| "eval_steps_per_second": 42.059, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 4.838709677419355, | |
| "grad_norm": 1.5891512632369995, | |
| "learning_rate": 0.00017723333333333332, | |
| "loss": 2.7503, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 4.845814977973569, | |
| "grad_norm": 1.6110639572143555, | |
| "learning_rate": 0.00017667777777777776, | |
| "loss": 2.7449, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 4.852920278527781, | |
| "grad_norm": 1.7298341989517212, | |
| "learning_rate": 0.00017612222222222225, | |
| "loss": 2.7515, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 4.860025579081995, | |
| "grad_norm": 1.5957584381103516, | |
| "learning_rate": 0.0001755666666666667, | |
| "loss": 2.7374, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 4.867130879636209, | |
| "grad_norm": 1.4720990657806396, | |
| "learning_rate": 0.00017501111111111112, | |
| "loss": 2.7342, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 4.874236180190422, | |
| "grad_norm": 1.6834919452667236, | |
| "learning_rate": 0.00017445555555555556, | |
| "loss": 2.7247, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 4.881341480744635, | |
| "grad_norm": 1.521272897720337, | |
| "learning_rate": 0.0001739, | |
| "loss": 2.739, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 4.888446781298849, | |
| "grad_norm": 1.671993374824524, | |
| "learning_rate": 0.00017334444444444444, | |
| "loss": 2.7624, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 4.8955520818530625, | |
| "grad_norm": 1.6694529056549072, | |
| "learning_rate": 0.00017278888888888887, | |
| "loss": 2.7306, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 4.902657382407276, | |
| "grad_norm": 1.4730091094970703, | |
| "learning_rate": 0.00017223333333333334, | |
| "loss": 2.7275, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 4.909762682961489, | |
| "grad_norm": 1.5129313468933105, | |
| "learning_rate": 0.0001716777777777778, | |
| "loss": 2.7323, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 4.9168679835157025, | |
| "grad_norm": 1.5696403980255127, | |
| "learning_rate": 0.00017112222222222224, | |
| "loss": 2.7295, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 4.923973284069916, | |
| "grad_norm": 1.7613575458526611, | |
| "learning_rate": 0.00017056666666666667, | |
| "loss": 2.7267, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 4.93107858462413, | |
| "grad_norm": 1.5198171138763428, | |
| "learning_rate": 0.0001700111111111111, | |
| "loss": 2.7287, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 4.9381838851783435, | |
| "grad_norm": 1.5724774599075317, | |
| "learning_rate": 0.00016945555555555555, | |
| "loss": 2.7424, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 4.945289185732556, | |
| "grad_norm": 1.5571963787078857, | |
| "learning_rate": 0.00016889999999999999, | |
| "loss": 2.7071, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 4.95239448628677, | |
| "grad_norm": 1.590010404586792, | |
| "learning_rate": 0.00016834444444444445, | |
| "loss": 2.7464, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 4.959499786840984, | |
| "grad_norm": 1.6421610116958618, | |
| "learning_rate": 0.0001677888888888889, | |
| "loss": 2.7104, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 4.966605087395196, | |
| "grad_norm": 1.5812050104141235, | |
| "learning_rate": 0.00016723333333333335, | |
| "loss": 2.748, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 4.97371038794941, | |
| "grad_norm": 1.5282282829284668, | |
| "learning_rate": 0.0001666777777777778, | |
| "loss": 2.763, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 4.980815688503624, | |
| "grad_norm": 1.509498953819275, | |
| "learning_rate": 0.00016612222222222222, | |
| "loss": 2.7346, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 4.987920989057837, | |
| "grad_norm": 1.6109907627105713, | |
| "learning_rate": 0.00016556666666666666, | |
| "loss": 2.7215, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 4.995026289612051, | |
| "grad_norm": 1.5312402248382568, | |
| "learning_rate": 0.0001650111111111111, | |
| "loss": 2.7163, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 5.002131590166264, | |
| "grad_norm": 1.620906949043274, | |
| "learning_rate": 0.00016445555555555556, | |
| "loss": 2.7254, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 5.009236890720477, | |
| "grad_norm": 1.5913270711898804, | |
| "learning_rate": 0.0001639, | |
| "loss": 2.6946, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 5.016342191274691, | |
| "grad_norm": 1.6128920316696167, | |
| "learning_rate": 0.00016334444444444444, | |
| "loss": 2.7067, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 5.023447491828905, | |
| "grad_norm": 1.598669171333313, | |
| "learning_rate": 0.0001627888888888889, | |
| "loss": 2.7214, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 5.0305527923831175, | |
| "grad_norm": 1.576566457748413, | |
| "learning_rate": 0.00016223333333333334, | |
| "loss": 2.7142, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 5.037658092937331, | |
| "grad_norm": 1.5625091791152954, | |
| "learning_rate": 0.00016167777777777777, | |
| "loss": 2.7218, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 5.044763393491545, | |
| "grad_norm": 1.595491886138916, | |
| "learning_rate": 0.00016112222222222224, | |
| "loss": 2.7142, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 5.0518686940457584, | |
| "grad_norm": 1.8340257406234741, | |
| "learning_rate": 0.00016056666666666668, | |
| "loss": 2.6812, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 5.058973994599971, | |
| "grad_norm": 1.5786815881729126, | |
| "learning_rate": 0.0001600111111111111, | |
| "loss": 2.6866, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 5.066079295154185, | |
| "grad_norm": 1.6554768085479736, | |
| "learning_rate": 0.00015945555555555555, | |
| "loss": 2.7131, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 5.0731845957083985, | |
| "grad_norm": 1.7100026607513428, | |
| "learning_rate": 0.0001589, | |
| "loss": 2.7062, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 5.080289896262612, | |
| "grad_norm": 1.4170799255371094, | |
| "learning_rate": 0.00015834444444444445, | |
| "loss": 2.703, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 5.087395196816825, | |
| "grad_norm": 1.573917031288147, | |
| "learning_rate": 0.0001577888888888889, | |
| "loss": 2.7189, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 5.094500497371039, | |
| "grad_norm": 1.5848358869552612, | |
| "learning_rate": 0.00015723333333333335, | |
| "loss": 2.7044, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 5.101605797925252, | |
| "grad_norm": 1.602611780166626, | |
| "learning_rate": 0.0001566777777777778, | |
| "loss": 2.7033, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 5.108711098479466, | |
| "grad_norm": 1.728371262550354, | |
| "learning_rate": 0.00015612222222222223, | |
| "loss": 2.709, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 5.11581639903368, | |
| "grad_norm": 1.5747239589691162, | |
| "learning_rate": 0.00015556666666666666, | |
| "loss": 2.6966, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 5.11581639903368, | |
| "eval_accuracy": 0.521781861782074, | |
| "eval_loss": 2.6849184036254883, | |
| "eval_runtime": 1.4614, | |
| "eval_samples_per_second": 2572.842, | |
| "eval_steps_per_second": 40.372, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 5.122921699587892, | |
| "grad_norm": 1.4712780714035034, | |
| "learning_rate": 0.0001550111111111111, | |
| "loss": 2.7073, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 5.130027000142106, | |
| "grad_norm": 1.6234378814697266, | |
| "learning_rate": 0.00015445555555555556, | |
| "loss": 2.7039, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 5.13713230069632, | |
| "grad_norm": 1.6177140474319458, | |
| "learning_rate": 0.0001539, | |
| "loss": 2.7057, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 5.144237601250533, | |
| "grad_norm": 1.6571050882339478, | |
| "learning_rate": 0.00015334444444444446, | |
| "loss": 2.6836, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 5.151342901804746, | |
| "grad_norm": 1.4596971273422241, | |
| "learning_rate": 0.0001527888888888889, | |
| "loss": 2.7236, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 5.15844820235896, | |
| "grad_norm": 1.5042576789855957, | |
| "learning_rate": 0.00015223333333333334, | |
| "loss": 2.6988, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 5.165553502913173, | |
| "grad_norm": 1.5042853355407715, | |
| "learning_rate": 0.00015167777777777778, | |
| "loss": 2.7274, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 5.172658803467387, | |
| "grad_norm": 1.6068828105926514, | |
| "learning_rate": 0.0001511222222222222, | |
| "loss": 2.6741, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 5.1797641040216, | |
| "grad_norm": 1.4389551877975464, | |
| "learning_rate": 0.00015056666666666665, | |
| "loss": 2.7118, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 5.1868694045758135, | |
| "grad_norm": 1.5431450605392456, | |
| "learning_rate": 0.0001500111111111111, | |
| "loss": 2.7079, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 5.193974705130027, | |
| "grad_norm": 1.6403303146362305, | |
| "learning_rate": 0.00014945555555555558, | |
| "loss": 2.7069, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 5.201080005684241, | |
| "grad_norm": 1.7466474771499634, | |
| "learning_rate": 0.00014890000000000001, | |
| "loss": 2.6675, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 5.2081853062384535, | |
| "grad_norm": 1.5739290714263916, | |
| "learning_rate": 0.00014834444444444445, | |
| "loss": 2.7048, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 5.215290606792667, | |
| "grad_norm": 1.6087383031845093, | |
| "learning_rate": 0.0001477888888888889, | |
| "loss": 2.6779, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 5.222395907346881, | |
| "grad_norm": 1.5931442975997925, | |
| "learning_rate": 0.00014723333333333333, | |
| "loss": 2.6958, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 5.2295012079010945, | |
| "grad_norm": 1.8047977685928345, | |
| "learning_rate": 0.00014667777777777776, | |
| "loss": 2.6802, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 5.236606508455307, | |
| "grad_norm": 1.7050204277038574, | |
| "learning_rate": 0.0001461222222222222, | |
| "loss": 2.7088, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 5.243711809009521, | |
| "grad_norm": 1.5651583671569824, | |
| "learning_rate": 0.0001455666666666667, | |
| "loss": 2.6978, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 5.250817109563735, | |
| "grad_norm": 1.519932508468628, | |
| "learning_rate": 0.00014501111111111113, | |
| "loss": 2.6886, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 5.257922410117948, | |
| "grad_norm": 1.6694849729537964, | |
| "learning_rate": 0.00014445555555555556, | |
| "loss": 2.7073, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 5.265027710672161, | |
| "grad_norm": 1.5857856273651123, | |
| "learning_rate": 0.0001439, | |
| "loss": 2.7079, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 5.272133011226375, | |
| "grad_norm": 1.78690505027771, | |
| "learning_rate": 0.00014334444444444444, | |
| "loss": 2.6991, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 5.279238311780588, | |
| "grad_norm": 1.7014470100402832, | |
| "learning_rate": 0.00014278888888888888, | |
| "loss": 2.6957, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 5.286343612334802, | |
| "grad_norm": 1.4961591958999634, | |
| "learning_rate": 0.00014223333333333334, | |
| "loss": 2.6875, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 5.293448912889016, | |
| "grad_norm": 1.682266116142273, | |
| "learning_rate": 0.0001416777777777778, | |
| "loss": 2.717, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 5.300554213443228, | |
| "grad_norm": 1.613564372062683, | |
| "learning_rate": 0.00014112222222222224, | |
| "loss": 2.6989, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 5.307659513997442, | |
| "grad_norm": 1.5683411359786987, | |
| "learning_rate": 0.00014056666666666668, | |
| "loss": 2.6864, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 5.314764814551656, | |
| "grad_norm": 1.5730217695236206, | |
| "learning_rate": 0.00014001111111111111, | |
| "loss": 2.6997, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 5.321870115105869, | |
| "grad_norm": 1.8172276020050049, | |
| "learning_rate": 0.00013945555555555555, | |
| "loss": 2.6929, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 5.328975415660082, | |
| "grad_norm": 1.5851980447769165, | |
| "learning_rate": 0.0001389, | |
| "loss": 2.6976, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 5.336080716214296, | |
| "grad_norm": 1.569605827331543, | |
| "learning_rate": 0.00013834444444444445, | |
| "loss": 2.7113, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 5.3431860167685095, | |
| "grad_norm": 1.4252275228500366, | |
| "learning_rate": 0.0001377888888888889, | |
| "loss": 2.6696, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 5.350291317322723, | |
| "grad_norm": 1.5046370029449463, | |
| "learning_rate": 0.00013723333333333335, | |
| "loss": 2.6994, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 5.357396617876936, | |
| "grad_norm": 1.4966368675231934, | |
| "learning_rate": 0.0001366777777777778, | |
| "loss": 2.6858, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 5.3645019184311495, | |
| "grad_norm": 1.635502576828003, | |
| "learning_rate": 0.00013612222222222223, | |
| "loss": 2.6776, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 5.371607218985363, | |
| "grad_norm": 1.703403115272522, | |
| "learning_rate": 0.00013556666666666666, | |
| "loss": 2.6876, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 5.378712519539577, | |
| "grad_norm": 1.569338321685791, | |
| "learning_rate": 0.0001350111111111111, | |
| "loss": 2.6779, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 5.38581782009379, | |
| "grad_norm": 1.4831880331039429, | |
| "learning_rate": 0.00013445555555555557, | |
| "loss": 2.6771, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 5.392923120648003, | |
| "grad_norm": 1.5764096975326538, | |
| "learning_rate": 0.0001339, | |
| "loss": 2.6808, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 5.400028421202217, | |
| "grad_norm": 1.4974082708358765, | |
| "learning_rate": 0.00013334444444444444, | |
| "loss": 2.6965, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 5.400028421202217, | |
| "eval_accuracy": 0.5244875550270081, | |
| "eval_loss": 2.6772782802581787, | |
| "eval_runtime": 1.2877, | |
| "eval_samples_per_second": 2919.822, | |
| "eval_steps_per_second": 45.816, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 5.407133721756431, | |
| "grad_norm": 1.7094296216964722, | |
| "learning_rate": 0.0001327888888888889, | |
| "loss": 2.6857, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 5.414239022310643, | |
| "grad_norm": 1.6493133306503296, | |
| "learning_rate": 0.00013223333333333334, | |
| "loss": 2.6767, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 5.421344322864857, | |
| "grad_norm": 1.6794978380203247, | |
| "learning_rate": 0.00013167777777777778, | |
| "loss": 2.6659, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 5.428449623419071, | |
| "grad_norm": 1.5671613216400146, | |
| "learning_rate": 0.00013112222222222221, | |
| "loss": 2.6793, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 5.435554923973284, | |
| "grad_norm": 1.6433796882629395, | |
| "learning_rate": 0.00013056666666666668, | |
| "loss": 2.6739, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 5.442660224527497, | |
| "grad_norm": 1.6001204252243042, | |
| "learning_rate": 0.00013001111111111112, | |
| "loss": 2.6673, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 5.449765525081711, | |
| "grad_norm": 1.6381980180740356, | |
| "learning_rate": 0.00012945555555555555, | |
| "loss": 2.6867, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 5.456870825635924, | |
| "grad_norm": 1.443001627922058, | |
| "learning_rate": 0.0001289, | |
| "loss": 2.6864, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 5.463976126190138, | |
| "grad_norm": 1.6225110292434692, | |
| "learning_rate": 0.00012834444444444445, | |
| "loss": 2.6858, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 5.471081426744352, | |
| "grad_norm": 1.5502654314041138, | |
| "learning_rate": 0.0001277888888888889, | |
| "loss": 2.6823, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 5.4781867272985645, | |
| "grad_norm": 1.579957127571106, | |
| "learning_rate": 0.00012723333333333333, | |
| "loss": 2.6662, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 5.485292027852778, | |
| "grad_norm": 1.6774084568023682, | |
| "learning_rate": 0.0001266777777777778, | |
| "loss": 2.677, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 5.492397328406992, | |
| "grad_norm": 1.554765224456787, | |
| "learning_rate": 0.00012612222222222223, | |
| "loss": 2.6806, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 5.4995026289612055, | |
| "grad_norm": 1.5972063541412354, | |
| "learning_rate": 0.00012556666666666666, | |
| "loss": 2.6703, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 5.506607929515418, | |
| "grad_norm": 1.596091866493225, | |
| "learning_rate": 0.0001250111111111111, | |
| "loss": 2.6884, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 5.513713230069632, | |
| "grad_norm": 1.5787779092788696, | |
| "learning_rate": 0.00012445555555555557, | |
| "loss": 2.6518, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 5.5208185306238455, | |
| "grad_norm": 1.4552741050720215, | |
| "learning_rate": 0.0001239, | |
| "loss": 2.674, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 5.527923831178059, | |
| "grad_norm": 1.5038365125656128, | |
| "learning_rate": 0.00012334444444444447, | |
| "loss": 2.6618, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 5.535029131732272, | |
| "grad_norm": 1.5640016794204712, | |
| "learning_rate": 0.0001227888888888889, | |
| "loss": 2.6654, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 5.542134432286486, | |
| "grad_norm": 1.6808786392211914, | |
| "learning_rate": 0.00012223333333333334, | |
| "loss": 2.654, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 5.549239732840699, | |
| "grad_norm": 1.6088745594024658, | |
| "learning_rate": 0.00012167777777777778, | |
| "loss": 2.6694, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 5.556345033394913, | |
| "grad_norm": 1.5467897653579712, | |
| "learning_rate": 0.00012112222222222223, | |
| "loss": 2.6646, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 5.563450333949126, | |
| "grad_norm": 1.5007407665252686, | |
| "learning_rate": 0.00012056666666666667, | |
| "loss": 2.681, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 5.570555634503339, | |
| "grad_norm": 1.5999568700790405, | |
| "learning_rate": 0.00012001111111111112, | |
| "loss": 2.6965, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 5.577660935057553, | |
| "grad_norm": 1.7177321910858154, | |
| "learning_rate": 0.00011945555555555555, | |
| "loss": 2.6713, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 5.584766235611767, | |
| "grad_norm": 1.7111984491348267, | |
| "learning_rate": 0.0001189, | |
| "loss": 2.6562, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 5.59187153616598, | |
| "grad_norm": 1.4950600862503052, | |
| "learning_rate": 0.00011834444444444445, | |
| "loss": 2.646, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 5.598976836720193, | |
| "grad_norm": 1.7324713468551636, | |
| "learning_rate": 0.00011778888888888889, | |
| "loss": 2.6701, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 5.606082137274407, | |
| "grad_norm": 1.685196042060852, | |
| "learning_rate": 0.00011723333333333333, | |
| "loss": 2.6571, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 5.61318743782862, | |
| "grad_norm": 1.4760277271270752, | |
| "learning_rate": 0.00011667777777777779, | |
| "loss": 2.6617, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 5.620292738382833, | |
| "grad_norm": 1.6101094484329224, | |
| "learning_rate": 0.00011612222222222223, | |
| "loss": 2.6783, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 5.627398038937047, | |
| "grad_norm": 1.649571418762207, | |
| "learning_rate": 0.00011556666666666667, | |
| "loss": 2.6717, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 5.6345033394912605, | |
| "grad_norm": 1.6487045288085938, | |
| "learning_rate": 0.0001150111111111111, | |
| "loss": 2.6572, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 5.641608640045474, | |
| "grad_norm": 1.711259365081787, | |
| "learning_rate": 0.00011445555555555557, | |
| "loss": 2.6626, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 5.648713940599688, | |
| "grad_norm": 1.5313870906829834, | |
| "learning_rate": 0.0001139, | |
| "loss": 2.677, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 5.655819241153901, | |
| "grad_norm": 1.729377031326294, | |
| "learning_rate": 0.00011334444444444444, | |
| "loss": 2.674, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 5.662924541708114, | |
| "grad_norm": 1.6128586530685425, | |
| "learning_rate": 0.00011278888888888889, | |
| "loss": 2.6651, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 5.670029842262328, | |
| "grad_norm": 1.6688061952590942, | |
| "learning_rate": 0.00011223333333333334, | |
| "loss": 2.6423, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 5.6771351428165415, | |
| "grad_norm": 1.5778543949127197, | |
| "learning_rate": 0.00011167777777777778, | |
| "loss": 2.6659, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 5.684240443370754, | |
| "grad_norm": 1.7078243494033813, | |
| "learning_rate": 0.00011112222222222222, | |
| "loss": 2.6466, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 5.684240443370754, | |
| "eval_accuracy": 0.5260953307151794, | |
| "eval_loss": 2.670067310333252, | |
| "eval_runtime": 1.4203, | |
| "eval_samples_per_second": 2647.336, | |
| "eval_steps_per_second": 41.541, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 5.691345743924968, | |
| "grad_norm": 1.573383092880249, | |
| "learning_rate": 0.00011056666666666667, | |
| "loss": 2.6674, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 5.698451044479182, | |
| "grad_norm": 1.6435350179672241, | |
| "learning_rate": 0.00011001111111111112, | |
| "loss": 2.6686, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 5.705556345033395, | |
| "grad_norm": 1.5594240427017212, | |
| "learning_rate": 0.00010945555555555555, | |
| "loss": 2.6544, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 5.712661645587608, | |
| "grad_norm": 1.7234139442443848, | |
| "learning_rate": 0.0001089, | |
| "loss": 2.6475, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 5.719766946141822, | |
| "grad_norm": 1.4562207460403442, | |
| "learning_rate": 0.00010834444444444444, | |
| "loss": 2.6673, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 5.726872246696035, | |
| "grad_norm": 1.5317538976669312, | |
| "learning_rate": 0.00010778888888888889, | |
| "loss": 2.6356, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 5.733977547250249, | |
| "grad_norm": 1.5807552337646484, | |
| "learning_rate": 0.00010723333333333334, | |
| "loss": 2.6816, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 5.741082847804462, | |
| "grad_norm": 1.5644679069519043, | |
| "learning_rate": 0.00010667777777777778, | |
| "loss": 2.663, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 5.748188148358675, | |
| "grad_norm": 1.5229665040969849, | |
| "learning_rate": 0.00010612222222222223, | |
| "loss": 2.6588, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 5.755293448912889, | |
| "grad_norm": 1.532139778137207, | |
| "learning_rate": 0.00010556666666666667, | |
| "loss": 2.6606, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 5.762398749467103, | |
| "grad_norm": 1.6157749891281128, | |
| "learning_rate": 0.00010501111111111112, | |
| "loss": 2.6476, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 5.769504050021316, | |
| "grad_norm": 1.5783989429473877, | |
| "learning_rate": 0.00010445555555555555, | |
| "loss": 2.6586, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 5.776609350575529, | |
| "grad_norm": 1.6066045761108398, | |
| "learning_rate": 0.0001039, | |
| "loss": 2.6452, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 5.783714651129743, | |
| "grad_norm": 1.563295841217041, | |
| "learning_rate": 0.00010334444444444446, | |
| "loss": 2.6814, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 5.7908199516839565, | |
| "grad_norm": 1.7197033166885376, | |
| "learning_rate": 0.00010278888888888889, | |
| "loss": 2.6531, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 5.797925252238169, | |
| "grad_norm": 1.6908972263336182, | |
| "learning_rate": 0.00010223333333333333, | |
| "loss": 2.6506, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 5.805030552792383, | |
| "grad_norm": 1.5352091789245605, | |
| "learning_rate": 0.00010167777777777778, | |
| "loss": 2.6505, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 5.8121358533465965, | |
| "grad_norm": 1.5693176984786987, | |
| "learning_rate": 0.00010112222222222223, | |
| "loss": 2.6653, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 5.81924115390081, | |
| "grad_norm": 1.5169340372085571, | |
| "learning_rate": 0.00010056666666666667, | |
| "loss": 2.6262, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 5.826346454455024, | |
| "grad_norm": 1.6942225694656372, | |
| "learning_rate": 0.0001000111111111111, | |
| "loss": 2.6643, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 5.833451755009237, | |
| "grad_norm": 1.6022897958755493, | |
| "learning_rate": 9.945555555555557e-05, | |
| "loss": 2.6171, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 5.84055705556345, | |
| "grad_norm": 1.5821208953857422, | |
| "learning_rate": 9.89e-05, | |
| "loss": 2.6481, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 5.847662356117664, | |
| "grad_norm": 1.5729005336761475, | |
| "learning_rate": 9.834444444444444e-05, | |
| "loss": 2.6347, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 5.854767656671878, | |
| "grad_norm": 1.587731957435608, | |
| "learning_rate": 9.778888888888888e-05, | |
| "loss": 2.6503, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 5.86187295722609, | |
| "grad_norm": 1.5719395875930786, | |
| "learning_rate": 9.723333333333334e-05, | |
| "loss": 2.662, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 5.868978257780304, | |
| "grad_norm": 1.6065324544906616, | |
| "learning_rate": 9.667777777777778e-05, | |
| "loss": 2.6593, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 5.876083558334518, | |
| "grad_norm": 1.6367747783660889, | |
| "learning_rate": 9.612222222222222e-05, | |
| "loss": 2.647, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 5.883188858888731, | |
| "grad_norm": 1.6349313259124756, | |
| "learning_rate": 9.556666666666667e-05, | |
| "loss": 2.6427, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 5.890294159442944, | |
| "grad_norm": 1.597489356994629, | |
| "learning_rate": 9.501111111111112e-05, | |
| "loss": 2.6612, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 5.897399459997158, | |
| "grad_norm": 1.696363091468811, | |
| "learning_rate": 9.445555555555556e-05, | |
| "loss": 2.6384, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 5.904504760551371, | |
| "grad_norm": 1.6225945949554443, | |
| "learning_rate": 9.39e-05, | |
| "loss": 2.6575, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 5.911610061105585, | |
| "grad_norm": 1.6240595579147339, | |
| "learning_rate": 9.334444444444444e-05, | |
| "loss": 2.6414, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 5.918715361659798, | |
| "grad_norm": 1.6538727283477783, | |
| "learning_rate": 9.278888888888889e-05, | |
| "loss": 2.6411, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 5.9258206622140115, | |
| "grad_norm": 1.543818473815918, | |
| "learning_rate": 9.223333333333333e-05, | |
| "loss": 2.637, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 5.932925962768225, | |
| "grad_norm": 1.6847704648971558, | |
| "learning_rate": 9.167777777777778e-05, | |
| "loss": 2.6321, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 5.940031263322439, | |
| "grad_norm": 1.536786675453186, | |
| "learning_rate": 9.112222222222222e-05, | |
| "loss": 2.6215, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 5.9471365638766525, | |
| "grad_norm": 1.6496859788894653, | |
| "learning_rate": 9.056666666666667e-05, | |
| "loss": 2.6411, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 5.954241864430865, | |
| "grad_norm": 1.5533984899520874, | |
| "learning_rate": 9.001111111111112e-05, | |
| "loss": 2.6462, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 5.961347164985079, | |
| "grad_norm": 1.6307629346847534, | |
| "learning_rate": 8.945555555555556e-05, | |
| "loss": 2.6413, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 5.9684524655392925, | |
| "grad_norm": 1.693447470664978, | |
| "learning_rate": 8.89e-05, | |
| "loss": 2.6365, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 5.9684524655392925, | |
| "eval_accuracy": 0.5310431718826294, | |
| "eval_loss": 2.6503593921661377, | |
| "eval_runtime": 1.5353, | |
| "eval_samples_per_second": 2448.955, | |
| "eval_steps_per_second": 38.428, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 5.975557766093505, | |
| "grad_norm": 1.7850193977355957, | |
| "learning_rate": 8.834444444444444e-05, | |
| "loss": 2.6257, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 5.982663066647719, | |
| "grad_norm": 1.711273431777954, | |
| "learning_rate": 8.77888888888889e-05, | |
| "loss": 2.6191, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 5.989768367201933, | |
| "grad_norm": 1.7241146564483643, | |
| "learning_rate": 8.723333333333333e-05, | |
| "loss": 2.6228, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 5.996873667756146, | |
| "grad_norm": 1.6184428930282593, | |
| "learning_rate": 8.667777777777778e-05, | |
| "loss": 2.62, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 6.00397896831036, | |
| "grad_norm": 1.6427233219146729, | |
| "learning_rate": 8.612222222222223e-05, | |
| "loss": 2.6255, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 6.011084268864573, | |
| "grad_norm": 1.6040061712265015, | |
| "learning_rate": 8.556666666666667e-05, | |
| "loss": 2.6212, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 6.018189569418786, | |
| "grad_norm": 1.539577603340149, | |
| "learning_rate": 8.50111111111111e-05, | |
| "loss": 2.6142, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 6.025294869973, | |
| "grad_norm": 1.5189242362976074, | |
| "learning_rate": 8.445555555555557e-05, | |
| "loss": 2.6396, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 6.032400170527214, | |
| "grad_norm": 1.6133103370666504, | |
| "learning_rate": 8.39e-05, | |
| "loss": 2.6211, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 6.039505471081426, | |
| "grad_norm": 1.625777006149292, | |
| "learning_rate": 8.334444444444444e-05, | |
| "loss": 2.6138, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 6.04661077163564, | |
| "grad_norm": 1.6675585508346558, | |
| "learning_rate": 8.278888888888888e-05, | |
| "loss": 2.6357, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 6.053716072189854, | |
| "grad_norm": 1.5562056303024292, | |
| "learning_rate": 8.223333333333334e-05, | |
| "loss": 2.6409, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 6.060821372744067, | |
| "grad_norm": 1.4880784749984741, | |
| "learning_rate": 8.167777777777778e-05, | |
| "loss": 2.6206, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 6.06792667329828, | |
| "grad_norm": 1.6204643249511719, | |
| "learning_rate": 8.112222222222222e-05, | |
| "loss": 2.6226, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 6.075031973852494, | |
| "grad_norm": 1.6648606061935425, | |
| "learning_rate": 8.056666666666667e-05, | |
| "loss": 2.6139, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 6.0821372744067075, | |
| "grad_norm": 1.6851210594177246, | |
| "learning_rate": 8.001111111111112e-05, | |
| "loss": 2.6204, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 6.089242574960921, | |
| "grad_norm": 1.7008609771728516, | |
| "learning_rate": 7.945555555555556e-05, | |
| "loss": 2.6229, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 6.096347875515134, | |
| "grad_norm": 1.6615877151489258, | |
| "learning_rate": 7.89e-05, | |
| "loss": 2.6106, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 6.103453176069348, | |
| "grad_norm": 1.7349096536636353, | |
| "learning_rate": 7.834444444444444e-05, | |
| "loss": 2.6035, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 6.110558476623561, | |
| "grad_norm": 1.6119391918182373, | |
| "learning_rate": 7.77888888888889e-05, | |
| "loss": 2.6262, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 6.117663777177775, | |
| "grad_norm": 1.651769995689392, | |
| "learning_rate": 7.723333333333333e-05, | |
| "loss": 2.6131, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 6.124769077731988, | |
| "grad_norm": 1.5095596313476562, | |
| "learning_rate": 7.667777777777778e-05, | |
| "loss": 2.6036, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 6.131874378286201, | |
| "grad_norm": 1.5748904943466187, | |
| "learning_rate": 7.612222222222222e-05, | |
| "loss": 2.6016, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 6.138979678840415, | |
| "grad_norm": 1.5882796049118042, | |
| "learning_rate": 7.556666666666667e-05, | |
| "loss": 2.611, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 6.146084979394629, | |
| "grad_norm": 1.6430201530456543, | |
| "learning_rate": 7.501111111111112e-05, | |
| "loss": 2.6039, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 6.153190279948841, | |
| "grad_norm": 1.5699527263641357, | |
| "learning_rate": 7.445555555555556e-05, | |
| "loss": 2.6238, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 6.160295580503055, | |
| "grad_norm": 1.5793179273605347, | |
| "learning_rate": 7.39e-05, | |
| "loss": 2.6269, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 6.167400881057269, | |
| "grad_norm": 1.6893434524536133, | |
| "learning_rate": 7.334444444444444e-05, | |
| "loss": 2.6256, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 6.174506181611482, | |
| "grad_norm": 1.6231344938278198, | |
| "learning_rate": 7.27888888888889e-05, | |
| "loss": 2.6153, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 6.181611482165696, | |
| "grad_norm": 1.6697924137115479, | |
| "learning_rate": 7.223333333333333e-05, | |
| "loss": 2.5972, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 6.188716782719909, | |
| "grad_norm": 1.5698109865188599, | |
| "learning_rate": 7.167777777777778e-05, | |
| "loss": 2.5996, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 6.195822083274122, | |
| "grad_norm": 1.6216611862182617, | |
| "learning_rate": 7.112222222222223e-05, | |
| "loss": 2.5944, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 6.202927383828336, | |
| "grad_norm": 1.549578309059143, | |
| "learning_rate": 7.056666666666667e-05, | |
| "loss": 2.6129, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 6.21003268438255, | |
| "grad_norm": 1.5020294189453125, | |
| "learning_rate": 7.00111111111111e-05, | |
| "loss": 2.6016, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 6.2171379849367625, | |
| "grad_norm": 1.6891061067581177, | |
| "learning_rate": 6.945555555555556e-05, | |
| "loss": 2.5895, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 6.224243285490976, | |
| "grad_norm": 1.6669466495513916, | |
| "learning_rate": 6.890000000000001e-05, | |
| "loss": 2.6106, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 6.23134858604519, | |
| "grad_norm": 1.6679954528808594, | |
| "learning_rate": 6.834444444444444e-05, | |
| "loss": 2.6225, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 6.2384538865994035, | |
| "grad_norm": 1.5900484323501587, | |
| "learning_rate": 6.778888888888888e-05, | |
| "loss": 2.6264, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 6.245559187153616, | |
| "grad_norm": 1.8170428276062012, | |
| "learning_rate": 6.723333333333335e-05, | |
| "loss": 2.6022, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 6.25266448770783, | |
| "grad_norm": 1.6536431312561035, | |
| "learning_rate": 6.667777777777778e-05, | |
| "loss": 2.6184, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 6.25266448770783, | |
| "eval_accuracy": 0.5310567021369934, | |
| "eval_loss": 2.6401333808898926, | |
| "eval_runtime": 1.3799, | |
| "eval_samples_per_second": 2724.796, | |
| "eval_steps_per_second": 42.756, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 6.2597697882620436, | |
| "grad_norm": 1.6533645391464233, | |
| "learning_rate": 6.612222222222222e-05, | |
| "loss": 2.5819, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 6.266875088816257, | |
| "grad_norm": 1.6523014307022095, | |
| "learning_rate": 6.556666666666666e-05, | |
| "loss": 2.6304, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 6.27398038937047, | |
| "grad_norm": 1.6646928787231445, | |
| "learning_rate": 6.501111111111112e-05, | |
| "loss": 2.6044, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 6.281085689924684, | |
| "grad_norm": 1.5884506702423096, | |
| "learning_rate": 6.445555555555556e-05, | |
| "loss": 2.5986, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 6.288190990478897, | |
| "grad_norm": 1.666008472442627, | |
| "learning_rate": 6.39e-05, | |
| "loss": 2.6306, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 6.295296291033111, | |
| "grad_norm": 1.7450133562088013, | |
| "learning_rate": 6.334444444444445e-05, | |
| "loss": 2.6172, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 6.302401591587325, | |
| "grad_norm": 1.6662380695343018, | |
| "learning_rate": 6.27888888888889e-05, | |
| "loss": 2.5978, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 6.309506892141537, | |
| "grad_norm": 1.7020577192306519, | |
| "learning_rate": 6.223333333333333e-05, | |
| "loss": 2.6189, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 6.316612192695751, | |
| "grad_norm": 1.6353223323822021, | |
| "learning_rate": 6.167777777777778e-05, | |
| "loss": 2.6044, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 6.323717493249965, | |
| "grad_norm": 1.668189525604248, | |
| "learning_rate": 6.112222222222222e-05, | |
| "loss": 2.5963, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 6.3308227938041775, | |
| "grad_norm": 1.6153579950332642, | |
| "learning_rate": 6.0566666666666664e-05, | |
| "loss": 2.5861, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 6.337928094358391, | |
| "grad_norm": 1.5848743915557861, | |
| "learning_rate": 6.0011111111111114e-05, | |
| "loss": 2.5926, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 6.345033394912605, | |
| "grad_norm": 1.603839635848999, | |
| "learning_rate": 5.945555555555555e-05, | |
| "loss": 2.593, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 6.352138695466818, | |
| "grad_norm": 1.665961503982544, | |
| "learning_rate": 5.89e-05, | |
| "loss": 2.5935, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 6.359243996021032, | |
| "grad_norm": 1.5604987144470215, | |
| "learning_rate": 5.8344444444444446e-05, | |
| "loss": 2.6095, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 6.366349296575245, | |
| "grad_norm": 1.6461862325668335, | |
| "learning_rate": 5.778888888888889e-05, | |
| "loss": 2.6373, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 6.3734545971294585, | |
| "grad_norm": 1.5700501203536987, | |
| "learning_rate": 5.723333333333333e-05, | |
| "loss": 2.6208, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 6.380559897683672, | |
| "grad_norm": 1.5315876007080078, | |
| "learning_rate": 5.667777777777778e-05, | |
| "loss": 2.609, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 6.387665198237886, | |
| "grad_norm": 1.4918439388275146, | |
| "learning_rate": 5.612222222222222e-05, | |
| "loss": 2.6031, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 6.394770498792099, | |
| "grad_norm": 1.6920193433761597, | |
| "learning_rate": 5.556666666666667e-05, | |
| "loss": 2.6149, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 6.401875799346312, | |
| "grad_norm": 1.6514570713043213, | |
| "learning_rate": 5.501111111111111e-05, | |
| "loss": 2.6164, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 6.408981099900526, | |
| "grad_norm": 1.57295823097229, | |
| "learning_rate": 5.445555555555556e-05, | |
| "loss": 2.599, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 6.4160864004547395, | |
| "grad_norm": 1.6922849416732788, | |
| "learning_rate": 5.39e-05, | |
| "loss": 2.614, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 6.423191701008952, | |
| "grad_norm": 1.7391736507415771, | |
| "learning_rate": 5.3344444444444446e-05, | |
| "loss": 2.5909, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 6.430297001563166, | |
| "grad_norm": 1.584605097770691, | |
| "learning_rate": 5.2788888888888897e-05, | |
| "loss": 2.5942, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 6.43740230211738, | |
| "grad_norm": 1.7213895320892334, | |
| "learning_rate": 5.2233333333333334e-05, | |
| "loss": 2.5806, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 6.444507602671593, | |
| "grad_norm": 1.710915207862854, | |
| "learning_rate": 5.1677777777777784e-05, | |
| "loss": 2.5869, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 6.451612903225806, | |
| "grad_norm": 1.5835086107254028, | |
| "learning_rate": 5.112222222222222e-05, | |
| "loss": 2.6056, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 6.45871820378002, | |
| "grad_norm": 1.8420978784561157, | |
| "learning_rate": 5.056666666666667e-05, | |
| "loss": 2.6039, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 6.465823504334233, | |
| "grad_norm": 1.7443053722381592, | |
| "learning_rate": 5.001111111111111e-05, | |
| "loss": 2.5945, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 6.472928804888447, | |
| "grad_norm": 1.6628175973892212, | |
| "learning_rate": 4.945555555555556e-05, | |
| "loss": 2.5674, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 6.480034105442661, | |
| "grad_norm": 1.5999755859375, | |
| "learning_rate": 4.89e-05, | |
| "loss": 2.6151, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 6.487139405996873, | |
| "grad_norm": 1.5596177577972412, | |
| "learning_rate": 4.8344444444444447e-05, | |
| "loss": 2.5888, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 6.494244706551087, | |
| "grad_norm": 1.651270866394043, | |
| "learning_rate": 4.778888888888889e-05, | |
| "loss": 2.6043, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 6.501350007105301, | |
| "grad_norm": 1.840889573097229, | |
| "learning_rate": 4.7233333333333334e-05, | |
| "loss": 2.5918, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 6.5084553076595135, | |
| "grad_norm": 1.56648588180542, | |
| "learning_rate": 4.667777777777778e-05, | |
| "loss": 2.5825, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 6.515560608213727, | |
| "grad_norm": 1.5391209125518799, | |
| "learning_rate": 4.612222222222223e-05, | |
| "loss": 2.5998, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 6.522665908767941, | |
| "grad_norm": 1.6329506635665894, | |
| "learning_rate": 4.5566666666666665e-05, | |
| "loss": 2.5932, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 6.5297712093221545, | |
| "grad_norm": 1.5591578483581543, | |
| "learning_rate": 4.5011111111111116e-05, | |
| "loss": 2.6017, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 6.536876509876368, | |
| "grad_norm": 1.5110472440719604, | |
| "learning_rate": 4.445555555555555e-05, | |
| "loss": 2.5993, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 6.536876509876368, | |
| "eval_accuracy": 0.5387080907821655, | |
| "eval_loss": 2.6144747734069824, | |
| "eval_runtime": 1.4386, | |
| "eval_samples_per_second": 2613.657, | |
| "eval_steps_per_second": 41.012, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 6.543981810430581, | |
| "grad_norm": 1.7392023801803589, | |
| "learning_rate": 4.39e-05, | |
| "loss": 2.5975, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 6.551087110984795, | |
| "grad_norm": 1.5147992372512817, | |
| "learning_rate": 4.334444444444444e-05, | |
| "loss": 2.6104, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 6.558192411539008, | |
| "grad_norm": 1.6592369079589844, | |
| "learning_rate": 4.278888888888889e-05, | |
| "loss": 2.5908, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 6.565297712093222, | |
| "grad_norm": 1.6960110664367676, | |
| "learning_rate": 4.2233333333333334e-05, | |
| "loss": 2.5899, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 6.572403012647435, | |
| "grad_norm": 1.6082763671875, | |
| "learning_rate": 4.167777777777778e-05, | |
| "loss": 2.591, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 6.579508313201648, | |
| "grad_norm": 1.7125688791275024, | |
| "learning_rate": 4.112222222222222e-05, | |
| "loss": 2.6009, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 6.586613613755862, | |
| "grad_norm": 1.5538341999053955, | |
| "learning_rate": 4.0566666666666666e-05, | |
| "loss": 2.5956, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 6.593718914310076, | |
| "grad_norm": 1.5610955953598022, | |
| "learning_rate": 4.001111111111111e-05, | |
| "loss": 2.5926, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 6.600824214864288, | |
| "grad_norm": 1.7150379419326782, | |
| "learning_rate": 3.945555555555556e-05, | |
| "loss": 2.5879, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 6.607929515418502, | |
| "grad_norm": 1.703142523765564, | |
| "learning_rate": 3.89e-05, | |
| "loss": 2.5992, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 6.615034815972716, | |
| "grad_norm": 1.6176077127456665, | |
| "learning_rate": 3.834444444444445e-05, | |
| "loss": 2.5836, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 6.622140116526929, | |
| "grad_norm": 1.619447112083435, | |
| "learning_rate": 3.7788888888888884e-05, | |
| "loss": 2.5961, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 6.629245417081142, | |
| "grad_norm": 1.7121543884277344, | |
| "learning_rate": 3.7233333333333335e-05, | |
| "loss": 2.5927, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 6.636350717635356, | |
| "grad_norm": 1.5830241441726685, | |
| "learning_rate": 3.667777777777777e-05, | |
| "loss": 2.5861, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 6.643456018189569, | |
| "grad_norm": 1.7162401676177979, | |
| "learning_rate": 3.612222222222222e-05, | |
| "loss": 2.5839, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 6.650561318743783, | |
| "grad_norm": 1.6397333145141602, | |
| "learning_rate": 3.556666666666667e-05, | |
| "loss": 2.5975, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 6.657666619297997, | |
| "grad_norm": 1.5965735912322998, | |
| "learning_rate": 3.501111111111111e-05, | |
| "loss": 2.5734, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 6.6647719198522095, | |
| "grad_norm": 1.6135231256484985, | |
| "learning_rate": 3.445555555555556e-05, | |
| "loss": 2.5634, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 6.671877220406423, | |
| "grad_norm": 1.681933045387268, | |
| "learning_rate": 3.39e-05, | |
| "loss": 2.5795, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 6.678982520960637, | |
| "grad_norm": 1.7126643657684326, | |
| "learning_rate": 3.334444444444445e-05, | |
| "loss": 2.5771, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 6.68608782151485, | |
| "grad_norm": 1.7277473211288452, | |
| "learning_rate": 3.278888888888889e-05, | |
| "loss": 2.579, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 6.693193122069063, | |
| "grad_norm": 1.5491448640823364, | |
| "learning_rate": 3.2233333333333335e-05, | |
| "loss": 2.5959, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 6.700298422623277, | |
| "grad_norm": 1.6598379611968994, | |
| "learning_rate": 3.167777777777778e-05, | |
| "loss": 2.5826, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 6.707403723177491, | |
| "grad_norm": 1.711911678314209, | |
| "learning_rate": 3.112222222222222e-05, | |
| "loss": 2.5639, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 6.714509023731704, | |
| "grad_norm": 1.6216599941253662, | |
| "learning_rate": 3.0566666666666667e-05, | |
| "loss": 2.5626, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 6.721614324285917, | |
| "grad_norm": 1.6412239074707031, | |
| "learning_rate": 3.001111111111111e-05, | |
| "loss": 2.5635, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 6.728719624840131, | |
| "grad_norm": 1.738433599472046, | |
| "learning_rate": 2.9455555555555554e-05, | |
| "loss": 2.5778, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 6.735824925394344, | |
| "grad_norm": 1.73790442943573, | |
| "learning_rate": 2.8899999999999998e-05, | |
| "loss": 2.599, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 6.742930225948558, | |
| "grad_norm": 1.5688190460205078, | |
| "learning_rate": 2.8344444444444445e-05, | |
| "loss": 2.5767, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 6.750035526502771, | |
| "grad_norm": 1.561957597732544, | |
| "learning_rate": 2.778888888888889e-05, | |
| "loss": 2.5636, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 6.757140827056984, | |
| "grad_norm": 1.6916581392288208, | |
| "learning_rate": 2.7233333333333332e-05, | |
| "loss": 2.5993, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 6.764246127611198, | |
| "grad_norm": 1.5806390047073364, | |
| "learning_rate": 2.667777777777778e-05, | |
| "loss": 2.5508, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 6.771351428165412, | |
| "grad_norm": 1.6202126741409302, | |
| "learning_rate": 2.6122222222222223e-05, | |
| "loss": 2.5829, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 6.7784567287196245, | |
| "grad_norm": 1.7637797594070435, | |
| "learning_rate": 2.5566666666666667e-05, | |
| "loss": 2.5919, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 6.785562029273838, | |
| "grad_norm": 1.7567532062530518, | |
| "learning_rate": 2.5011111111111114e-05, | |
| "loss": 2.5543, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 6.792667329828052, | |
| "grad_norm": 1.568477988243103, | |
| "learning_rate": 2.4455555555555558e-05, | |
| "loss": 2.571, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 6.799772630382265, | |
| "grad_norm": 1.640987515449524, | |
| "learning_rate": 2.39e-05, | |
| "loss": 2.5779, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 6.806877930936478, | |
| "grad_norm": 1.6859703063964844, | |
| "learning_rate": 2.3344444444444445e-05, | |
| "loss": 2.5774, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 6.813983231490692, | |
| "grad_norm": 1.6906059980392456, | |
| "learning_rate": 2.278888888888889e-05, | |
| "loss": 2.5953, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 6.8210885320449055, | |
| "grad_norm": 1.5696355104446411, | |
| "learning_rate": 2.2233333333333336e-05, | |
| "loss": 2.6005, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 6.8210885320449055, | |
| "eval_accuracy": 0.540194034576416, | |
| "eval_loss": 2.564614772796631, | |
| "eval_runtime": 1.3856, | |
| "eval_samples_per_second": 2713.711, | |
| "eval_steps_per_second": 42.582, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 6.828193832599119, | |
| "grad_norm": 1.6884602308273315, | |
| "learning_rate": 2.167777777777778e-05, | |
| "loss": 2.5969, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 6.835299133153333, | |
| "grad_norm": 1.663227915763855, | |
| "learning_rate": 2.1122222222222224e-05, | |
| "loss": 2.5542, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 6.842404433707546, | |
| "grad_norm": 1.5696767568588257, | |
| "learning_rate": 2.0566666666666667e-05, | |
| "loss": 2.5988, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 6.849509734261759, | |
| "grad_norm": 1.564605951309204, | |
| "learning_rate": 2.001111111111111e-05, | |
| "loss": 2.5946, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 6.856615034815973, | |
| "grad_norm": 1.5115107297897339, | |
| "learning_rate": 1.9455555555555555e-05, | |
| "loss": 2.5823, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 6.863720335370186, | |
| "grad_norm": 1.765989065170288, | |
| "learning_rate": 1.8900000000000002e-05, | |
| "loss": 2.5893, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 6.870825635924399, | |
| "grad_norm": 1.6937189102172852, | |
| "learning_rate": 1.8344444444444446e-05, | |
| "loss": 2.5764, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 6.877930936478613, | |
| "grad_norm": 1.6565457582473755, | |
| "learning_rate": 1.778888888888889e-05, | |
| "loss": 2.5779, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 6.885036237032827, | |
| "grad_norm": 1.5109210014343262, | |
| "learning_rate": 1.7233333333333333e-05, | |
| "loss": 2.5582, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 6.89214153758704, | |
| "grad_norm": 1.562355399131775, | |
| "learning_rate": 1.6677777777777777e-05, | |
| "loss": 2.5819, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 6.899246838141253, | |
| "grad_norm": 1.5795271396636963, | |
| "learning_rate": 1.612222222222222e-05, | |
| "loss": 2.5704, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 6.906352138695467, | |
| "grad_norm": 1.5818406343460083, | |
| "learning_rate": 1.5566666666666668e-05, | |
| "loss": 2.595, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 6.91345743924968, | |
| "grad_norm": 1.639119029045105, | |
| "learning_rate": 1.5011111111111112e-05, | |
| "loss": 2.5691, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 6.920562739803894, | |
| "grad_norm": 1.5755335092544556, | |
| "learning_rate": 1.4455555555555555e-05, | |
| "loss": 2.5333, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 6.927668040358107, | |
| "grad_norm": 1.711458683013916, | |
| "learning_rate": 1.3899999999999999e-05, | |
| "loss": 2.5762, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 6.9347733409123204, | |
| "grad_norm": 1.4931989908218384, | |
| "learning_rate": 1.3344444444444446e-05, | |
| "loss": 2.5961, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 6.941878641466534, | |
| "grad_norm": 1.5901076793670654, | |
| "learning_rate": 1.278888888888889e-05, | |
| "loss": 2.5502, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 6.948983942020748, | |
| "grad_norm": 1.7123291492462158, | |
| "learning_rate": 1.2233333333333334e-05, | |
| "loss": 2.5433, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 6.956089242574961, | |
| "grad_norm": 1.555472493171692, | |
| "learning_rate": 1.1677777777777779e-05, | |
| "loss": 2.5781, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 6.963194543129174, | |
| "grad_norm": 1.6610956192016602, | |
| "learning_rate": 1.1122222222222223e-05, | |
| "loss": 2.5722, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 6.970299843683388, | |
| "grad_norm": 1.657886028289795, | |
| "learning_rate": 1.0566666666666667e-05, | |
| "loss": 2.555, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 6.9774051442376015, | |
| "grad_norm": 1.5435242652893066, | |
| "learning_rate": 1.0011111111111112e-05, | |
| "loss": 2.5819, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 6.984510444791814, | |
| "grad_norm": 1.6257867813110352, | |
| "learning_rate": 9.455555555555556e-06, | |
| "loss": 2.5871, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 6.991615745346028, | |
| "grad_norm": 1.5763481855392456, | |
| "learning_rate": 8.9e-06, | |
| "loss": 2.5863, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 6.998721045900242, | |
| "grad_norm": 1.6068406105041504, | |
| "learning_rate": 8.344444444444445e-06, | |
| "loss": 2.5862, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 7.005826346454455, | |
| "grad_norm": 1.6784113645553589, | |
| "learning_rate": 7.788888888888889e-06, | |
| "loss": 2.5952, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 7.012931647008669, | |
| "grad_norm": 1.6384174823760986, | |
| "learning_rate": 7.233333333333333e-06, | |
| "loss": 2.5681, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 7.020036947562882, | |
| "grad_norm": 1.6811999082565308, | |
| "learning_rate": 6.677777777777779e-06, | |
| "loss": 2.5668, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 7.027142248117095, | |
| "grad_norm": 1.7817960977554321, | |
| "learning_rate": 6.1222222222222224e-06, | |
| "loss": 2.5302, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 7.034247548671309, | |
| "grad_norm": 1.679125428199768, | |
| "learning_rate": 5.566666666666667e-06, | |
| "loss": 2.5674, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 7.041352849225523, | |
| "grad_norm": 1.8110630512237549, | |
| "learning_rate": 5.011111111111112e-06, | |
| "loss": 2.5557, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 7.048458149779735, | |
| "grad_norm": 1.6951816082000732, | |
| "learning_rate": 4.455555555555555e-06, | |
| "loss": 2.5737, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 7.055563450333949, | |
| "grad_norm": 1.5674768686294556, | |
| "learning_rate": 3.9e-06, | |
| "loss": 2.5628, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 7.062668750888163, | |
| "grad_norm": 1.8455121517181396, | |
| "learning_rate": 3.3444444444444445e-06, | |
| "loss": 2.577, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 7.069774051442376, | |
| "grad_norm": 1.6875885725021362, | |
| "learning_rate": 2.788888888888889e-06, | |
| "loss": 2.5594, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 7.076879351996589, | |
| "grad_norm": 1.582588791847229, | |
| "learning_rate": 2.2333333333333333e-06, | |
| "loss": 2.5725, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 7.083984652550803, | |
| "grad_norm": 1.682963490486145, | |
| "learning_rate": 1.6777777777777779e-06, | |
| "loss": 2.556, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 7.091089953105016, | |
| "grad_norm": 1.7682604789733887, | |
| "learning_rate": 1.1222222222222222e-06, | |
| "loss": 2.5602, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 7.09819525365923, | |
| "grad_norm": 1.5831440687179565, | |
| "learning_rate": 5.666666666666667e-07, | |
| "loss": 2.5521, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 7.105300554213443, | |
| "grad_norm": 1.5679473876953125, | |
| "learning_rate": 1.1111111111111112e-08, | |
| "loss": 2.5724, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 7.105300554213443, | |
| "eval_accuracy": 0.5460931658744812, | |
| "eval_loss": 2.5473690032958984, | |
| "eval_runtime": 1.4123, | |
| "eval_samples_per_second": 2662.272, | |
| "eval_steps_per_second": 41.775, | |
| "step": 50000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 50000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.186896471100621e+16, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |