diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7259 @@ +{ + "best_global_step": 50000, + "best_metric": 0.5550713539123535, + "best_model_checkpoint": "checkpoints/mla_baseline/checkpoint-50000", + "epoch": 7.105300554213443, + "eval_steps": 2000, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0071053005542134435, + "grad_norm": 0.3619440197944641, + "learning_rate": 4.9e-06, + "loss": 10.3021, + "step": 50 + }, + { + "epoch": 0.014210601108426887, + "grad_norm": 0.6546474099159241, + "learning_rate": 9.900000000000002e-06, + "loss": 10.2243, + "step": 100 + }, + { + "epoch": 0.02131590166264033, + "grad_norm": 2.2018182277679443, + "learning_rate": 1.49e-05, + "loss": 9.8459, + "step": 150 + }, + { + "epoch": 0.028421202216853774, + "grad_norm": 3.6677067279815674, + "learning_rate": 1.9900000000000003e-05, + "loss": 8.5751, + "step": 200 + }, + { + "epoch": 0.03552650277106722, + "grad_norm": 1.577499508857727, + "learning_rate": 2.49e-05, + "loss": 7.543, + "step": 250 + }, + { + "epoch": 0.04263180332528066, + "grad_norm": 1.397471308708191, + "learning_rate": 2.9900000000000002e-05, + "loss": 7.2528, + "step": 300 + }, + { + "epoch": 0.0497371038794941, + "grad_norm": 1.4008537530899048, + "learning_rate": 3.49e-05, + "loss": 7.1711, + "step": 350 + }, + { + "epoch": 0.05684240443370755, + "grad_norm": 1.5214428901672363, + "learning_rate": 3.99e-05, + "loss": 7.1438, + "step": 400 + }, + { + "epoch": 0.06394770498792099, + "grad_norm": 1.3703827857971191, + "learning_rate": 4.49e-05, + "loss": 7.1052, + "step": 450 + }, + { + "epoch": 0.07105300554213444, + "grad_norm": 1.3568859100341797, + "learning_rate": 4.99e-05, + "loss": 7.1017, + "step": 500 + }, + { + "epoch": 0.07815830609634787, + "grad_norm": 1.3110448122024536, + "learning_rate": 5.49e-05, + "loss": 7.0499, + "step": 550 + }, + { + "epoch": 0.08526360665056132, + "grad_norm": 1.4580223560333252, + "learning_rate": 5.9900000000000006e-05, + "loss": 7.0325, + "step": 600 + }, + { + "epoch": 0.09236890720477477, + "grad_norm": 1.3007315397262573, + "learning_rate": 6.49e-05, + "loss": 7.0092, + "step": 650 + }, + { + "epoch": 0.0994742077589882, + "grad_norm": 1.3965587615966797, + "learning_rate": 6.99e-05, + "loss": 6.9465, + "step": 700 + }, + { + "epoch": 0.10657950831320165, + "grad_norm": 1.5621792078018188, + "learning_rate": 7.489999999999999e-05, + "loss": 6.8977, + "step": 750 + }, + { + "epoch": 0.1136848088674151, + "grad_norm": 1.5563201904296875, + "learning_rate": 7.99e-05, + "loss": 6.8554, + "step": 800 + }, + { + "epoch": 0.12079010942162853, + "grad_norm": 1.4112974405288696, + "learning_rate": 8.49e-05, + "loss": 6.8084, + "step": 850 + }, + { + "epoch": 0.12789540997584198, + "grad_norm": 1.6433038711547852, + "learning_rate": 8.989999999999999e-05, + "loss": 6.7257, + "step": 900 + }, + { + "epoch": 0.1350007105300554, + "grad_norm": 1.9007697105407715, + "learning_rate": 9.49e-05, + "loss": 6.6945, + "step": 950 + }, + { + "epoch": 0.14210601108426887, + "grad_norm": 2.0347580909729004, + "learning_rate": 9.99e-05, + "loss": 6.6074, + "step": 1000 + }, + { + "epoch": 0.1492113116384823, + "grad_norm": 2.3306286334991455, + "learning_rate": 0.0001049, + "loss": 6.5402, + "step": 1050 + }, + { + "epoch": 0.15631661219269574, + "grad_norm": 3.961554527282715, + "learning_rate": 0.0001099, + "loss": 6.462, + "step": 1100 + }, + { + "epoch": 0.1634219127469092, + "grad_norm": 2.189784288406372, + "learning_rate": 0.0001149, + "loss": 6.3737, + "step": 1150 + }, + { + "epoch": 0.17052721330112264, + "grad_norm": 2.473756790161133, + "learning_rate": 0.00011990000000000001, + "loss": 6.2955, + "step": 1200 + }, + { + "epoch": 0.17763251385533607, + "grad_norm": 3.5680646896362305, + "learning_rate": 0.0001249, + "loss": 6.2441, + "step": 1250 + }, + { + "epoch": 0.18473781440954953, + "grad_norm": 3.6250619888305664, + "learning_rate": 0.00012989999999999999, + "loss": 6.1729, + "step": 1300 + }, + { + "epoch": 0.19184311496376297, + "grad_norm": 4.737584590911865, + "learning_rate": 0.0001349, + "loss": 6.1345, + "step": 1350 + }, + { + "epoch": 0.1989484155179764, + "grad_norm": 3.0449159145355225, + "learning_rate": 0.0001399, + "loss": 6.0541, + "step": 1400 + }, + { + "epoch": 0.20605371607218986, + "grad_norm": 3.516432285308838, + "learning_rate": 0.0001449, + "loss": 6.0235, + "step": 1450 + }, + { + "epoch": 0.2131590166264033, + "grad_norm": 3.8344614505767822, + "learning_rate": 0.0001499, + "loss": 5.9358, + "step": 1500 + }, + { + "epoch": 0.22026431718061673, + "grad_norm": 3.620025873184204, + "learning_rate": 0.00015490000000000002, + "loss": 5.9013, + "step": 1550 + }, + { + "epoch": 0.2273696177348302, + "grad_norm": 3.103908061981201, + "learning_rate": 0.00015989999999999998, + "loss": 5.8423, + "step": 1600 + }, + { + "epoch": 0.23447491828904363, + "grad_norm": 3.2771809101104736, + "learning_rate": 0.0001649, + "loss": 5.8007, + "step": 1650 + }, + { + "epoch": 0.24158021884325706, + "grad_norm": 3.067811965942383, + "learning_rate": 0.0001699, + "loss": 5.7556, + "step": 1700 + }, + { + "epoch": 0.24868551939747052, + "grad_norm": 3.275484085083008, + "learning_rate": 0.0001749, + "loss": 5.7074, + "step": 1750 + }, + { + "epoch": 0.25579081995168396, + "grad_norm": 3.4723703861236572, + "learning_rate": 0.0001799, + "loss": 5.6587, + "step": 1800 + }, + { + "epoch": 0.2628961205058974, + "grad_norm": 4.599445343017578, + "learning_rate": 0.00018490000000000002, + "loss": 5.6147, + "step": 1850 + }, + { + "epoch": 0.2700014210601108, + "grad_norm": 3.8192200660705566, + "learning_rate": 0.0001899, + "loss": 5.5743, + "step": 1900 + }, + { + "epoch": 0.27710672161432426, + "grad_norm": 4.365124702453613, + "learning_rate": 0.0001949, + "loss": 5.5159, + "step": 1950 + }, + { + "epoch": 0.28421202216853775, + "grad_norm": 4.356955051422119, + "learning_rate": 0.0001999, + "loss": 5.4936, + "step": 2000 + }, + { + "epoch": 0.28421202216853775, + "eval_accuracy": 0.2797648310661316, + "eval_loss": 5.373836994171143, + "eval_runtime": 1.4619, + "eval_samples_per_second": 2572.026, + "eval_steps_per_second": 40.359, + "step": 2000 + }, + { + "epoch": 0.2913173227227512, + "grad_norm": 3.7968027591705322, + "learning_rate": 0.0002049, + "loss": 5.4659, + "step": 2050 + }, + { + "epoch": 0.2984226232769646, + "grad_norm": 3.9787704944610596, + "learning_rate": 0.0002099, + "loss": 5.3956, + "step": 2100 + }, + { + "epoch": 0.30552792383117805, + "grad_norm": 4.154063701629639, + "learning_rate": 0.00021490000000000002, + "loss": 5.3716, + "step": 2150 + }, + { + "epoch": 0.3126332243853915, + "grad_norm": 3.5421087741851807, + "learning_rate": 0.0002199, + "loss": 5.3506, + "step": 2200 + }, + { + "epoch": 0.3197385249396049, + "grad_norm": 4.011205673217773, + "learning_rate": 0.0002249, + "loss": 5.295, + "step": 2250 + }, + { + "epoch": 0.3268438254938184, + "grad_norm": 3.6677510738372803, + "learning_rate": 0.0002299, + "loss": 5.2688, + "step": 2300 + }, + { + "epoch": 0.33394912604803184, + "grad_norm": 3.733154773712158, + "learning_rate": 0.0002349, + "loss": 5.2771, + "step": 2350 + }, + { + "epoch": 0.3410544266022453, + "grad_norm": 3.485593795776367, + "learning_rate": 0.0002399, + "loss": 5.2569, + "step": 2400 + }, + { + "epoch": 0.3481597271564587, + "grad_norm": 3.414398193359375, + "learning_rate": 0.0002449, + "loss": 5.1798, + "step": 2450 + }, + { + "epoch": 0.35526502771067214, + "grad_norm": 4.8961896896362305, + "learning_rate": 0.0002499, + "loss": 5.1564, + "step": 2500 + }, + { + "epoch": 0.3623703282648856, + "grad_norm": 3.7125649452209473, + "learning_rate": 0.0002549, + "loss": 5.1338, + "step": 2550 + }, + { + "epoch": 0.36947562881909907, + "grad_norm": 3.6601812839508057, + "learning_rate": 0.00025990000000000003, + "loss": 5.1297, + "step": 2600 + }, + { + "epoch": 0.3765809293733125, + "grad_norm": 3.997366189956665, + "learning_rate": 0.00026490000000000004, + "loss": 5.0859, + "step": 2650 + }, + { + "epoch": 0.38368622992752593, + "grad_norm": 3.54914927482605, + "learning_rate": 0.0002699, + "loss": 5.0545, + "step": 2700 + }, + { + "epoch": 0.39079153048173937, + "grad_norm": 3.6549172401428223, + "learning_rate": 0.00027489999999999996, + "loss": 5.0494, + "step": 2750 + }, + { + "epoch": 0.3978968310359528, + "grad_norm": 3.403808355331421, + "learning_rate": 0.0002799, + "loss": 5.0164, + "step": 2800 + }, + { + "epoch": 0.40500213159016624, + "grad_norm": 3.342942476272583, + "learning_rate": 0.0002849, + "loss": 4.9945, + "step": 2850 + }, + { + "epoch": 0.4121074321443797, + "grad_norm": 3.8801138401031494, + "learning_rate": 0.0002899, + "loss": 4.9666, + "step": 2900 + }, + { + "epoch": 0.41921273269859316, + "grad_norm": 3.295564651489258, + "learning_rate": 0.0002949, + "loss": 4.9704, + "step": 2950 + }, + { + "epoch": 0.4263180332528066, + "grad_norm": 3.265148639678955, + "learning_rate": 0.0002999, + "loss": 4.9451, + "step": 3000 + }, + { + "epoch": 0.43342333380702003, + "grad_norm": 3.1508865356445312, + "learning_rate": 0.0003049, + "loss": 4.9264, + "step": 3050 + }, + { + "epoch": 0.44052863436123346, + "grad_norm": 3.8297386169433594, + "learning_rate": 0.0003099, + "loss": 4.9052, + "step": 3100 + }, + { + "epoch": 0.4476339349154469, + "grad_norm": 3.190356969833374, + "learning_rate": 0.0003149, + "loss": 4.8845, + "step": 3150 + }, + { + "epoch": 0.4547392354696604, + "grad_norm": 3.5795695781707764, + "learning_rate": 0.0003199, + "loss": 4.8459, + "step": 3200 + }, + { + "epoch": 0.4618445360238738, + "grad_norm": 3.614764451980591, + "learning_rate": 0.00032490000000000004, + "loss": 4.8369, + "step": 3250 + }, + { + "epoch": 0.46894983657808725, + "grad_norm": 3.8342933654785156, + "learning_rate": 0.00032990000000000005, + "loss": 4.7909, + "step": 3300 + }, + { + "epoch": 0.4760551371323007, + "grad_norm": 3.1796512603759766, + "learning_rate": 0.0003349, + "loss": 4.7868, + "step": 3350 + }, + { + "epoch": 0.4831604376865141, + "grad_norm": 3.308342695236206, + "learning_rate": 0.00033989999999999997, + "loss": 4.7647, + "step": 3400 + }, + { + "epoch": 0.49026573824072756, + "grad_norm": 3.8211753368377686, + "learning_rate": 0.0003449, + "loss": 4.7579, + "step": 3450 + }, + { + "epoch": 0.49737103879494104, + "grad_norm": 3.686267375946045, + "learning_rate": 0.0003499, + "loss": 4.7682, + "step": 3500 + }, + { + "epoch": 0.5044763393491545, + "grad_norm": 3.1786656379699707, + "learning_rate": 0.0003549, + "loss": 4.7018, + "step": 3550 + }, + { + "epoch": 0.5115816399033679, + "grad_norm": 3.0715014934539795, + "learning_rate": 0.0003599, + "loss": 4.7151, + "step": 3600 + }, + { + "epoch": 0.5186869404575813, + "grad_norm": 2.9246482849121094, + "learning_rate": 0.00036490000000000003, + "loss": 4.6897, + "step": 3650 + }, + { + "epoch": 0.5257922410117948, + "grad_norm": 3.4063315391540527, + "learning_rate": 0.0003699, + "loss": 4.6572, + "step": 3700 + }, + { + "epoch": 0.5328975415660082, + "grad_norm": 2.8954086303710938, + "learning_rate": 0.0003749, + "loss": 4.6345, + "step": 3750 + }, + { + "epoch": 0.5400028421202216, + "grad_norm": 3.2454769611358643, + "learning_rate": 0.0003799, + "loss": 4.6397, + "step": 3800 + }, + { + "epoch": 0.5471081426744351, + "grad_norm": 3.0364902019500732, + "learning_rate": 0.00038490000000000003, + "loss": 4.6329, + "step": 3850 + }, + { + "epoch": 0.5542134432286485, + "grad_norm": 2.843865156173706, + "learning_rate": 0.00038990000000000004, + "loss": 4.6107, + "step": 3900 + }, + { + "epoch": 0.5613187437828621, + "grad_norm": 3.196857213973999, + "learning_rate": 0.0003949, + "loss": 4.5821, + "step": 3950 + }, + { + "epoch": 0.5684240443370755, + "grad_norm": 3.3867523670196533, + "learning_rate": 0.00039989999999999996, + "loss": 4.5726, + "step": 4000 + }, + { + "epoch": 0.5684240443370755, + "eval_accuracy": 0.3473648726940155, + "eval_loss": 4.424046516418457, + "eval_runtime": 1.2247, + "eval_samples_per_second": 3070.052, + "eval_steps_per_second": 48.174, + "step": 4000 + }, + { + "epoch": 0.5755293448912889, + "grad_norm": 2.6906516551971436, + "learning_rate": 0.0004049, + "loss": 4.5637, + "step": 4050 + }, + { + "epoch": 0.5826346454455024, + "grad_norm": 2.7520558834075928, + "learning_rate": 0.0004099, + "loss": 4.5297, + "step": 4100 + }, + { + "epoch": 0.5897399459997158, + "grad_norm": 2.7561872005462646, + "learning_rate": 0.0004149, + "loss": 4.5373, + "step": 4150 + }, + { + "epoch": 0.5968452465539292, + "grad_norm": 2.703505754470825, + "learning_rate": 0.0004199, + "loss": 4.5301, + "step": 4200 + }, + { + "epoch": 0.6039505471081427, + "grad_norm": 2.954202175140381, + "learning_rate": 0.00042490000000000003, + "loss": 4.4973, + "step": 4250 + }, + { + "epoch": 0.6110558476623561, + "grad_norm": 2.657067060470581, + "learning_rate": 0.0004299, + "loss": 4.4914, + "step": 4300 + }, + { + "epoch": 0.6181611482165695, + "grad_norm": 2.738647222518921, + "learning_rate": 0.0004349, + "loss": 4.4614, + "step": 4350 + }, + { + "epoch": 0.625266448770783, + "grad_norm": 2.5784761905670166, + "learning_rate": 0.0004399, + "loss": 4.43, + "step": 4400 + }, + { + "epoch": 0.6323717493249964, + "grad_norm": 2.204275608062744, + "learning_rate": 0.0004449, + "loss": 4.4255, + "step": 4450 + }, + { + "epoch": 0.6394770498792098, + "grad_norm": 2.524068832397461, + "learning_rate": 0.00044990000000000004, + "loss": 4.4287, + "step": 4500 + }, + { + "epoch": 0.6465823504334234, + "grad_norm": 2.4962570667266846, + "learning_rate": 0.00045490000000000005, + "loss": 4.3569, + "step": 4550 + }, + { + "epoch": 0.6536876509876368, + "grad_norm": 2.712149143218994, + "learning_rate": 0.0004599, + "loss": 4.3769, + "step": 4600 + }, + { + "epoch": 0.6607929515418502, + "grad_norm": 2.447850465774536, + "learning_rate": 0.00046489999999999997, + "loss": 4.3919, + "step": 4650 + }, + { + "epoch": 0.6678982520960637, + "grad_norm": 1.9662154912948608, + "learning_rate": 0.0004699, + "loss": 4.3945, + "step": 4700 + }, + { + "epoch": 0.6750035526502771, + "grad_norm": 2.773207426071167, + "learning_rate": 0.0004749, + "loss": 4.3456, + "step": 4750 + }, + { + "epoch": 0.6821088532044906, + "grad_norm": 2.235621690750122, + "learning_rate": 0.0004799, + "loss": 4.3292, + "step": 4800 + }, + { + "epoch": 0.689214153758704, + "grad_norm": 2.2144429683685303, + "learning_rate": 0.0004849, + "loss": 4.3334, + "step": 4850 + }, + { + "epoch": 0.6963194543129174, + "grad_norm": 2.3465662002563477, + "learning_rate": 0.0004899, + "loss": 4.3216, + "step": 4900 + }, + { + "epoch": 0.7034247548671309, + "grad_norm": 2.3879144191741943, + "learning_rate": 0.0004949, + "loss": 4.293, + "step": 4950 + }, + { + "epoch": 0.7105300554213443, + "grad_norm": 2.186339855194092, + "learning_rate": 0.0004999000000000001, + "loss": 4.2757, + "step": 5000 + }, + { + "epoch": 0.7176353559755577, + "grad_norm": 2.113415241241455, + "learning_rate": 0.0004994555555555555, + "loss": 4.2609, + "step": 5050 + }, + { + "epoch": 0.7247406565297712, + "grad_norm": 2.384539842605591, + "learning_rate": 0.0004989, + "loss": 4.2621, + "step": 5100 + }, + { + "epoch": 0.7318459570839847, + "grad_norm": 2.57897686958313, + "learning_rate": 0.0004983444444444444, + "loss": 4.2507, + "step": 5150 + }, + { + "epoch": 0.7389512576381981, + "grad_norm": 2.3281402587890625, + "learning_rate": 0.0004977888888888889, + "loss": 4.2517, + "step": 5200 + }, + { + "epoch": 0.7460565581924116, + "grad_norm": 2.055142879486084, + "learning_rate": 0.0004972333333333334, + "loss": 4.2112, + "step": 5250 + }, + { + "epoch": 0.753161858746625, + "grad_norm": 2.339130163192749, + "learning_rate": 0.0004966777777777778, + "loss": 4.205, + "step": 5300 + }, + { + "epoch": 0.7602671593008384, + "grad_norm": 2.2506520748138428, + "learning_rate": 0.0004961222222222223, + "loss": 4.2257, + "step": 5350 + }, + { + "epoch": 0.7673724598550519, + "grad_norm": 2.1256449222564697, + "learning_rate": 0.0004955666666666667, + "loss": 4.1923, + "step": 5400 + }, + { + "epoch": 0.7744777604092653, + "grad_norm": 1.9763379096984863, + "learning_rate": 0.0004950111111111112, + "loss": 4.154, + "step": 5450 + }, + { + "epoch": 0.7815830609634787, + "grad_norm": 2.272706985473633, + "learning_rate": 0.0004944555555555555, + "loss": 4.1438, + "step": 5500 + }, + { + "epoch": 0.7886883615176922, + "grad_norm": 2.2818517684936523, + "learning_rate": 0.0004939, + "loss": 4.1481, + "step": 5550 + }, + { + "epoch": 0.7957936620719056, + "grad_norm": 2.158761739730835, + "learning_rate": 0.0004933444444444444, + "loss": 4.1347, + "step": 5600 + }, + { + "epoch": 0.802898962626119, + "grad_norm": 2.0649828910827637, + "learning_rate": 0.0004927888888888889, + "loss": 4.1106, + "step": 5650 + }, + { + "epoch": 0.8100042631803325, + "grad_norm": 2.074234962463379, + "learning_rate": 0.0004922333333333334, + "loss": 4.1174, + "step": 5700 + }, + { + "epoch": 0.817109563734546, + "grad_norm": 2.0552144050598145, + "learning_rate": 0.0004916777777777778, + "loss": 4.0816, + "step": 5750 + }, + { + "epoch": 0.8242148642887595, + "grad_norm": 2.1459226608276367, + "learning_rate": 0.0004911222222222223, + "loss": 4.1048, + "step": 5800 + }, + { + "epoch": 0.8313201648429729, + "grad_norm": 2.0078556537628174, + "learning_rate": 0.0004905666666666666, + "loss": 4.0741, + "step": 5850 + }, + { + "epoch": 0.8384254653971863, + "grad_norm": 1.962790608406067, + "learning_rate": 0.0004900111111111111, + "loss": 4.0615, + "step": 5900 + }, + { + "epoch": 0.8455307659513998, + "grad_norm": 2.1868464946746826, + "learning_rate": 0.0004894555555555555, + "loss": 4.0513, + "step": 5950 + }, + { + "epoch": 0.8526360665056132, + "grad_norm": 2.160015344619751, + "learning_rate": 0.0004889, + "loss": 4.0401, + "step": 6000 + }, + { + "epoch": 0.8526360665056132, + "eval_accuracy": 0.389207124710083, + "eval_loss": 3.9373788833618164, + "eval_runtime": 1.4022, + "eval_samples_per_second": 2681.497, + "eval_steps_per_second": 42.077, + "step": 6000 + }, + { + "epoch": 0.8597413670598266, + "grad_norm": 1.8545076847076416, + "learning_rate": 0.0004883444444444445, + "loss": 4.0452, + "step": 6050 + }, + { + "epoch": 0.8668466676140401, + "grad_norm": 2.2014081478118896, + "learning_rate": 0.0004877888888888889, + "loss": 4.0081, + "step": 6100 + }, + { + "epoch": 0.8739519681682535, + "grad_norm": 1.886440396308899, + "learning_rate": 0.0004872333333333334, + "loss": 4.0104, + "step": 6150 + }, + { + "epoch": 0.8810572687224669, + "grad_norm": 2.234833240509033, + "learning_rate": 0.00048667777777777776, + "loss": 3.9821, + "step": 6200 + }, + { + "epoch": 0.8881625692766804, + "grad_norm": 2.0455291271209717, + "learning_rate": 0.00048612222222222225, + "loss": 3.9822, + "step": 6250 + }, + { + "epoch": 0.8952678698308938, + "grad_norm": 2.097593069076538, + "learning_rate": 0.00048556666666666663, + "loss": 3.9991, + "step": 6300 + }, + { + "epoch": 0.9023731703851073, + "grad_norm": 2.0400683879852295, + "learning_rate": 0.0004850111111111111, + "loss": 3.9679, + "step": 6350 + }, + { + "epoch": 0.9094784709393208, + "grad_norm": 2.0201427936553955, + "learning_rate": 0.00048445555555555556, + "loss": 3.9712, + "step": 6400 + }, + { + "epoch": 0.9165837714935342, + "grad_norm": 2.175518035888672, + "learning_rate": 0.0004839, + "loss": 3.9371, + "step": 6450 + }, + { + "epoch": 0.9236890720477476, + "grad_norm": 1.8984272480010986, + "learning_rate": 0.0004833444444444445, + "loss": 3.9372, + "step": 6500 + }, + { + "epoch": 0.9307943726019611, + "grad_norm": 2.0253663063049316, + "learning_rate": 0.00048278888888888887, + "loss": 3.9369, + "step": 6550 + }, + { + "epoch": 0.9378996731561745, + "grad_norm": 1.9045063257217407, + "learning_rate": 0.00048223333333333336, + "loss": 3.9284, + "step": 6600 + }, + { + "epoch": 0.9450049737103879, + "grad_norm": 2.129824161529541, + "learning_rate": 0.00048167777777777775, + "loss": 3.8851, + "step": 6650 + }, + { + "epoch": 0.9521102742646014, + "grad_norm": 2.022346258163452, + "learning_rate": 0.00048112222222222224, + "loss": 3.9011, + "step": 6700 + }, + { + "epoch": 0.9592155748188148, + "grad_norm": 1.9612109661102295, + "learning_rate": 0.0004805666666666667, + "loss": 3.8745, + "step": 6750 + }, + { + "epoch": 0.9663208753730282, + "grad_norm": 1.9245905876159668, + "learning_rate": 0.0004800111111111111, + "loss": 3.8801, + "step": 6800 + }, + { + "epoch": 0.9734261759272417, + "grad_norm": 2.065843343734741, + "learning_rate": 0.0004794555555555556, + "loss": 3.8506, + "step": 6850 + }, + { + "epoch": 0.9805314764814551, + "grad_norm": 2.1017391681671143, + "learning_rate": 0.0004789, + "loss": 3.8554, + "step": 6900 + }, + { + "epoch": 0.9876367770356687, + "grad_norm": 1.9719597101211548, + "learning_rate": 0.0004783444444444445, + "loss": 3.8459, + "step": 6950 + }, + { + "epoch": 0.9947420775898821, + "grad_norm": 2.016110897064209, + "learning_rate": 0.00047778888888888886, + "loss": 3.8261, + "step": 7000 + }, + { + "epoch": 1.0018473781440955, + "grad_norm": 1.8949023485183716, + "learning_rate": 0.00047723333333333335, + "loss": 3.8615, + "step": 7050 + }, + { + "epoch": 1.008952678698309, + "grad_norm": 1.8229880332946777, + "learning_rate": 0.0004766777777777778, + "loss": 3.8052, + "step": 7100 + }, + { + "epoch": 1.0160579792525224, + "grad_norm": 2.2516708374023438, + "learning_rate": 0.0004761222222222222, + "loss": 3.8019, + "step": 7150 + }, + { + "epoch": 1.0231632798067358, + "grad_norm": 2.0374722480773926, + "learning_rate": 0.0004755666666666667, + "loss": 3.8016, + "step": 7200 + }, + { + "epoch": 1.0302685803609493, + "grad_norm": 2.0370278358459473, + "learning_rate": 0.0004750111111111111, + "loss": 3.7872, + "step": 7250 + }, + { + "epoch": 1.0373738809151627, + "grad_norm": 2.0572760105133057, + "learning_rate": 0.0004744555555555556, + "loss": 3.7898, + "step": 7300 + }, + { + "epoch": 1.0444791814693761, + "grad_norm": 2.3242228031158447, + "learning_rate": 0.00047389999999999997, + "loss": 3.7669, + "step": 7350 + }, + { + "epoch": 1.0515844820235896, + "grad_norm": 2.0401973724365234, + "learning_rate": 0.00047334444444444446, + "loss": 3.7725, + "step": 7400 + }, + { + "epoch": 1.058689782577803, + "grad_norm": 1.954636812210083, + "learning_rate": 0.0004727888888888889, + "loss": 3.7939, + "step": 7450 + }, + { + "epoch": 1.0657950831320164, + "grad_norm": 1.949588418006897, + "learning_rate": 0.00047223333333333334, + "loss": 3.7626, + "step": 7500 + }, + { + "epoch": 1.0729003836862299, + "grad_norm": 1.9143036603927612, + "learning_rate": 0.0004716777777777778, + "loss": 3.7593, + "step": 7550 + }, + { + "epoch": 1.0800056842404433, + "grad_norm": 1.8144210577011108, + "learning_rate": 0.0004711222222222222, + "loss": 3.7563, + "step": 7600 + }, + { + "epoch": 1.0871109847946567, + "grad_norm": 1.971367597579956, + "learning_rate": 0.0004705666666666667, + "loss": 3.7496, + "step": 7650 + }, + { + "epoch": 1.0942162853488702, + "grad_norm": 2.0770599842071533, + "learning_rate": 0.0004700111111111111, + "loss": 3.7137, + "step": 7700 + }, + { + "epoch": 1.1013215859030836, + "grad_norm": 1.8662265539169312, + "learning_rate": 0.0004694555555555556, + "loss": 3.7118, + "step": 7750 + }, + { + "epoch": 1.108426886457297, + "grad_norm": 1.9397633075714111, + "learning_rate": 0.0004689, + "loss": 3.7241, + "step": 7800 + }, + { + "epoch": 1.1155321870115107, + "grad_norm": 2.0750577449798584, + "learning_rate": 0.00046834444444444445, + "loss": 3.6847, + "step": 7850 + }, + { + "epoch": 1.1226374875657241, + "grad_norm": 2.2893929481506348, + "learning_rate": 0.0004677888888888889, + "loss": 3.7101, + "step": 7900 + }, + { + "epoch": 1.1297427881199376, + "grad_norm": 1.8849507570266724, + "learning_rate": 0.0004672333333333333, + "loss": 3.6931, + "step": 7950 + }, + { + "epoch": 1.136848088674151, + "grad_norm": 2.108226776123047, + "learning_rate": 0.0004666777777777778, + "loss": 3.6962, + "step": 8000 + }, + { + "epoch": 1.136848088674151, + "eval_accuracy": 0.41970664262771606, + "eval_loss": 3.6095504760742188, + "eval_runtime": 1.3165, + "eval_samples_per_second": 2855.969, + "eval_steps_per_second": 44.814, + "step": 8000 + }, + { + "epoch": 1.1439533892283644, + "grad_norm": 1.8215097188949585, + "learning_rate": 0.0004661222222222222, + "loss": 3.6758, + "step": 8050 + }, + { + "epoch": 1.1510586897825779, + "grad_norm": 2.1105434894561768, + "learning_rate": 0.0004655666666666667, + "loss": 3.6832, + "step": 8100 + }, + { + "epoch": 1.1581639903367913, + "grad_norm": 1.933396339416504, + "learning_rate": 0.0004650111111111111, + "loss": 3.6691, + "step": 8150 + }, + { + "epoch": 1.1652692908910047, + "grad_norm": 1.8069454431533813, + "learning_rate": 0.00046445555555555556, + "loss": 3.632, + "step": 8200 + }, + { + "epoch": 1.1723745914452182, + "grad_norm": 1.9175587892532349, + "learning_rate": 0.0004639, + "loss": 3.6569, + "step": 8250 + }, + { + "epoch": 1.1794798919994316, + "grad_norm": 1.936651587486267, + "learning_rate": 0.00046334444444444444, + "loss": 3.6549, + "step": 8300 + }, + { + "epoch": 1.186585192553645, + "grad_norm": 1.8092012405395508, + "learning_rate": 0.00046278888888888893, + "loss": 3.6326, + "step": 8350 + }, + { + "epoch": 1.1936904931078585, + "grad_norm": 1.741718053817749, + "learning_rate": 0.00046223333333333337, + "loss": 3.6232, + "step": 8400 + }, + { + "epoch": 1.200795793662072, + "grad_norm": 1.9609713554382324, + "learning_rate": 0.0004616777777777778, + "loss": 3.6275, + "step": 8450 + }, + { + "epoch": 1.2079010942162853, + "grad_norm": 1.6691536903381348, + "learning_rate": 0.00046112222222222224, + "loss": 3.6359, + "step": 8500 + }, + { + "epoch": 1.2150063947704988, + "grad_norm": 1.8362239599227905, + "learning_rate": 0.0004605666666666667, + "loss": 3.6195, + "step": 8550 + }, + { + "epoch": 1.2221116953247122, + "grad_norm": 1.9224746227264404, + "learning_rate": 0.0004600111111111111, + "loss": 3.6203, + "step": 8600 + }, + { + "epoch": 1.2292169958789256, + "grad_norm": 1.791446566581726, + "learning_rate": 0.00045945555555555555, + "loss": 3.605, + "step": 8650 + }, + { + "epoch": 1.236322296433139, + "grad_norm": 2.0857369899749756, + "learning_rate": 0.0004589, + "loss": 3.599, + "step": 8700 + }, + { + "epoch": 1.2434275969873525, + "grad_norm": 2.1251144409179688, + "learning_rate": 0.0004583444444444445, + "loss": 3.6082, + "step": 8750 + }, + { + "epoch": 1.250532897541566, + "grad_norm": 1.901963710784912, + "learning_rate": 0.0004577888888888889, + "loss": 3.6067, + "step": 8800 + }, + { + "epoch": 1.2576381980957794, + "grad_norm": 1.745914340019226, + "learning_rate": 0.00045723333333333335, + "loss": 3.6024, + "step": 8850 + }, + { + "epoch": 1.264743498649993, + "grad_norm": 1.7997791767120361, + "learning_rate": 0.0004566777777777778, + "loss": 3.5741, + "step": 8900 + }, + { + "epoch": 1.2718487992042062, + "grad_norm": 1.8684380054473877, + "learning_rate": 0.0004561222222222222, + "loss": 3.5767, + "step": 8950 + }, + { + "epoch": 1.27895409975842, + "grad_norm": 1.9908355474472046, + "learning_rate": 0.00045556666666666666, + "loss": 3.5529, + "step": 9000 + }, + { + "epoch": 1.286059400312633, + "grad_norm": 1.8683586120605469, + "learning_rate": 0.0004550111111111111, + "loss": 3.5614, + "step": 9050 + }, + { + "epoch": 1.2931647008668468, + "grad_norm": 1.766913890838623, + "learning_rate": 0.0004544555555555556, + "loss": 3.5393, + "step": 9100 + }, + { + "epoch": 1.30027000142106, + "grad_norm": 2.004033088684082, + "learning_rate": 0.00045390000000000003, + "loss": 3.5424, + "step": 9150 + }, + { + "epoch": 1.3073753019752736, + "grad_norm": 1.9604741334915161, + "learning_rate": 0.00045334444444444447, + "loss": 3.5526, + "step": 9200 + }, + { + "epoch": 1.314480602529487, + "grad_norm": 1.9911085367202759, + "learning_rate": 0.0004527888888888889, + "loss": 3.5261, + "step": 9250 + }, + { + "epoch": 1.3215859030837005, + "grad_norm": 1.9417774677276611, + "learning_rate": 0.00045223333333333334, + "loss": 3.545, + "step": 9300 + }, + { + "epoch": 1.328691203637914, + "grad_norm": 1.9904927015304565, + "learning_rate": 0.0004516777777777778, + "loss": 3.5167, + "step": 9350 + }, + { + "epoch": 1.3357965041921274, + "grad_norm": 1.8322969675064087, + "learning_rate": 0.0004511222222222222, + "loss": 3.498, + "step": 9400 + }, + { + "epoch": 1.3429018047463408, + "grad_norm": 1.797074794769287, + "learning_rate": 0.0004505666666666667, + "loss": 3.5084, + "step": 9450 + }, + { + "epoch": 1.3500071053005542, + "grad_norm": 1.8438552618026733, + "learning_rate": 0.0004500111111111111, + "loss": 3.5053, + "step": 9500 + }, + { + "epoch": 1.3571124058547677, + "grad_norm": 1.8281489610671997, + "learning_rate": 0.0004494555555555556, + "loss": 3.4848, + "step": 9550 + }, + { + "epoch": 1.364217706408981, + "grad_norm": 1.9461642503738403, + "learning_rate": 0.0004489, + "loss": 3.4721, + "step": 9600 + }, + { + "epoch": 1.3713230069631945, + "grad_norm": 1.9854313135147095, + "learning_rate": 0.00044834444444444445, + "loss": 3.4813, + "step": 9650 + }, + { + "epoch": 1.378428307517408, + "grad_norm": 1.9185129404067993, + "learning_rate": 0.0004477888888888889, + "loss": 3.5082, + "step": 9700 + }, + { + "epoch": 1.3855336080716214, + "grad_norm": 1.9525820016860962, + "learning_rate": 0.0004472333333333333, + "loss": 3.4965, + "step": 9750 + }, + { + "epoch": 1.3926389086258348, + "grad_norm": 1.9303693771362305, + "learning_rate": 0.0004466777777777778, + "loss": 3.5032, + "step": 9800 + }, + { + "epoch": 1.3997442091800483, + "grad_norm": 1.8870890140533447, + "learning_rate": 0.0004461222222222222, + "loss": 3.4689, + "step": 9850 + }, + { + "epoch": 1.4068495097342617, + "grad_norm": 1.9007459878921509, + "learning_rate": 0.0004455666666666667, + "loss": 3.4519, + "step": 9900 + }, + { + "epoch": 1.4139548102884751, + "grad_norm": 2.0525524616241455, + "learning_rate": 0.00044501111111111113, + "loss": 3.4673, + "step": 9950 + }, + { + "epoch": 1.4210601108426886, + "grad_norm": 1.9106584787368774, + "learning_rate": 0.00044445555555555557, + "loss": 3.4609, + "step": 10000 + }, + { + "epoch": 1.4210601108426886, + "eval_accuracy": 0.44234737753868103, + "eval_loss": 3.3924427032470703, + "eval_runtime": 1.3159, + "eval_samples_per_second": 2857.31, + "eval_steps_per_second": 44.835, + "step": 10000 + }, + { + "epoch": 1.428165411396902, + "grad_norm": 1.6311852931976318, + "learning_rate": 0.0004439, + "loss": 3.4179, + "step": 10050 + }, + { + "epoch": 1.4352707119511154, + "grad_norm": 1.6985563039779663, + "learning_rate": 0.00044334444444444444, + "loss": 3.4466, + "step": 10100 + }, + { + "epoch": 1.442376012505329, + "grad_norm": 1.7384531497955322, + "learning_rate": 0.00044278888888888893, + "loss": 3.4444, + "step": 10150 + }, + { + "epoch": 1.4494813130595423, + "grad_norm": 1.8844335079193115, + "learning_rate": 0.0004422333333333333, + "loss": 3.4162, + "step": 10200 + }, + { + "epoch": 1.456586613613756, + "grad_norm": 1.8525067567825317, + "learning_rate": 0.0004416777777777778, + "loss": 3.4217, + "step": 10250 + }, + { + "epoch": 1.4636919141679692, + "grad_norm": 1.924688458442688, + "learning_rate": 0.00044112222222222224, + "loss": 3.441, + "step": 10300 + }, + { + "epoch": 1.4707972147221828, + "grad_norm": 1.8880536556243896, + "learning_rate": 0.0004405666666666667, + "loss": 3.4166, + "step": 10350 + }, + { + "epoch": 1.4779025152763963, + "grad_norm": 1.8569647073745728, + "learning_rate": 0.0004400111111111111, + "loss": 3.4239, + "step": 10400 + }, + { + "epoch": 1.4850078158306097, + "grad_norm": 1.7077707052230835, + "learning_rate": 0.00043945555555555555, + "loss": 3.4355, + "step": 10450 + }, + { + "epoch": 1.4921131163848231, + "grad_norm": 2.164099931716919, + "learning_rate": 0.00043890000000000004, + "loss": 3.4213, + "step": 10500 + }, + { + "epoch": 1.4992184169390366, + "grad_norm": 1.7151411771774292, + "learning_rate": 0.0004383444444444444, + "loss": 3.4066, + "step": 10550 + }, + { + "epoch": 1.50632371749325, + "grad_norm": 1.7368758916854858, + "learning_rate": 0.0004377888888888889, + "loss": 3.3885, + "step": 10600 + }, + { + "epoch": 1.5134290180474634, + "grad_norm": 1.9382424354553223, + "learning_rate": 0.0004372333333333333, + "loss": 3.4183, + "step": 10650 + }, + { + "epoch": 1.5205343186016769, + "grad_norm": 1.9375344514846802, + "learning_rate": 0.0004366777777777778, + "loss": 3.4014, + "step": 10700 + }, + { + "epoch": 1.5276396191558903, + "grad_norm": 1.8492896556854248, + "learning_rate": 0.00043612222222222223, + "loss": 3.3994, + "step": 10750 + }, + { + "epoch": 1.5347449197101037, + "grad_norm": 1.8383281230926514, + "learning_rate": 0.00043556666666666666, + "loss": 3.3695, + "step": 10800 + }, + { + "epoch": 1.5418502202643172, + "grad_norm": 1.785221815109253, + "learning_rate": 0.00043501111111111116, + "loss": 3.4025, + "step": 10850 + }, + { + "epoch": 1.5489555208185306, + "grad_norm": 1.8311967849731445, + "learning_rate": 0.00043445555555555554, + "loss": 3.3803, + "step": 10900 + }, + { + "epoch": 1.556060821372744, + "grad_norm": 2.0591444969177246, + "learning_rate": 0.00043390000000000003, + "loss": 3.3792, + "step": 10950 + }, + { + "epoch": 1.5631661219269575, + "grad_norm": 1.7114486694335938, + "learning_rate": 0.0004333444444444444, + "loss": 3.3562, + "step": 11000 + }, + { + "epoch": 1.570271422481171, + "grad_norm": 1.8185391426086426, + "learning_rate": 0.0004327888888888889, + "loss": 3.3397, + "step": 11050 + }, + { + "epoch": 1.5773767230353843, + "grad_norm": 2.1638777256011963, + "learning_rate": 0.00043223333333333334, + "loss": 3.3304, + "step": 11100 + }, + { + "epoch": 1.5844820235895978, + "grad_norm": 1.8578804731369019, + "learning_rate": 0.0004316777777777778, + "loss": 3.3724, + "step": 11150 + }, + { + "epoch": 1.5915873241438114, + "grad_norm": 1.8585854768753052, + "learning_rate": 0.00043112222222222227, + "loss": 3.3607, + "step": 11200 + }, + { + "epoch": 1.5986926246980246, + "grad_norm": 1.7061985731124878, + "learning_rate": 0.00043056666666666665, + "loss": 3.3526, + "step": 11250 + }, + { + "epoch": 1.6057979252522383, + "grad_norm": 1.9890382289886475, + "learning_rate": 0.00043001111111111114, + "loss": 3.3264, + "step": 11300 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 1.9284510612487793, + "learning_rate": 0.0004294555555555555, + "loss": 3.322, + "step": 11350 + }, + { + "epoch": 1.6200085263606652, + "grad_norm": 1.802822470664978, + "learning_rate": 0.0004289, + "loss": 3.356, + "step": 11400 + }, + { + "epoch": 1.6271138269148784, + "grad_norm": 1.6525734663009644, + "learning_rate": 0.0004283444444444445, + "loss": 3.343, + "step": 11450 + }, + { + "epoch": 1.634219127469092, + "grad_norm": 1.9781888723373413, + "learning_rate": 0.0004277888888888889, + "loss": 3.3424, + "step": 11500 + }, + { + "epoch": 1.6413244280233052, + "grad_norm": 1.78948175907135, + "learning_rate": 0.0004272333333333334, + "loss": 3.3334, + "step": 11550 + }, + { + "epoch": 1.648429728577519, + "grad_norm": 1.9074255228042603, + "learning_rate": 0.00042667777777777776, + "loss": 3.3221, + "step": 11600 + }, + { + "epoch": 1.6555350291317321, + "grad_norm": 1.723588466644287, + "learning_rate": 0.00042612222222222226, + "loss": 3.3214, + "step": 11650 + }, + { + "epoch": 1.6626403296859458, + "grad_norm": 1.7900450229644775, + "learning_rate": 0.00042556666666666664, + "loss": 3.3557, + "step": 11700 + }, + { + "epoch": 1.6697456302401592, + "grad_norm": 1.8456897735595703, + "learning_rate": 0.00042501111111111113, + "loss": 3.3195, + "step": 11750 + }, + { + "epoch": 1.6768509307943726, + "grad_norm": 1.9056233167648315, + "learning_rate": 0.0004244555555555555, + "loss": 3.3365, + "step": 11800 + }, + { + "epoch": 1.683956231348586, + "grad_norm": 1.88231360912323, + "learning_rate": 0.0004239, + "loss": 3.3063, + "step": 11850 + }, + { + "epoch": 1.6910615319027995, + "grad_norm": 1.961814045906067, + "learning_rate": 0.0004233444444444445, + "loss": 3.3011, + "step": 11900 + }, + { + "epoch": 1.698166832457013, + "grad_norm": 1.6388925313949585, + "learning_rate": 0.0004227888888888889, + "loss": 3.29, + "step": 11950 + }, + { + "epoch": 1.7052721330112264, + "grad_norm": 1.8381247520446777, + "learning_rate": 0.00042223333333333337, + "loss": 3.3135, + "step": 12000 + }, + { + "epoch": 1.7052721330112264, + "eval_accuracy": 0.4627563953399658, + "eval_loss": 3.2176170349121094, + "eval_runtime": 1.4396, + "eval_samples_per_second": 2611.836, + "eval_steps_per_second": 40.984, + "step": 12000 + }, + { + "epoch": 1.7123774335654398, + "grad_norm": 1.9075206518173218, + "learning_rate": 0.00042167777777777775, + "loss": 3.2829, + "step": 12050 + }, + { + "epoch": 1.7194827341196532, + "grad_norm": 1.8718541860580444, + "learning_rate": 0.00042112222222222224, + "loss": 3.3016, + "step": 12100 + }, + { + "epoch": 1.7265880346738667, + "grad_norm": 1.821109414100647, + "learning_rate": 0.0004205666666666667, + "loss": 3.2981, + "step": 12150 + }, + { + "epoch": 1.7336933352280801, + "grad_norm": 1.821947455406189, + "learning_rate": 0.0004200111111111111, + "loss": 3.2915, + "step": 12200 + }, + { + "epoch": 1.7407986357822935, + "grad_norm": 1.940075397491455, + "learning_rate": 0.0004194555555555556, + "loss": 3.2839, + "step": 12250 + }, + { + "epoch": 1.747903936336507, + "grad_norm": 1.779321312904358, + "learning_rate": 0.0004189, + "loss": 3.2556, + "step": 12300 + }, + { + "epoch": 1.7550092368907206, + "grad_norm": 1.8860771656036377, + "learning_rate": 0.0004183444444444445, + "loss": 3.29, + "step": 12350 + }, + { + "epoch": 1.7621145374449338, + "grad_norm": 1.7763786315917969, + "learning_rate": 0.00041778888888888886, + "loss": 3.284, + "step": 12400 + }, + { + "epoch": 1.7692198379991475, + "grad_norm": 1.822303295135498, + "learning_rate": 0.00041723333333333336, + "loss": 3.2684, + "step": 12450 + }, + { + "epoch": 1.7763251385533607, + "grad_norm": 1.8546795845031738, + "learning_rate": 0.0004166777777777778, + "loss": 3.2722, + "step": 12500 + }, + { + "epoch": 1.7834304391075744, + "grad_norm": 1.8313467502593994, + "learning_rate": 0.00041612222222222223, + "loss": 3.2599, + "step": 12550 + }, + { + "epoch": 1.7905357396617876, + "grad_norm": 1.8988237380981445, + "learning_rate": 0.00041556666666666667, + "loss": 3.2451, + "step": 12600 + }, + { + "epoch": 1.7976410402160012, + "grad_norm": 1.7995522022247314, + "learning_rate": 0.0004150111111111111, + "loss": 3.2431, + "step": 12650 + }, + { + "epoch": 1.8047463407702145, + "grad_norm": 1.8105956315994263, + "learning_rate": 0.0004144555555555556, + "loss": 3.2426, + "step": 12700 + }, + { + "epoch": 1.811851641324428, + "grad_norm": 1.672658920288086, + "learning_rate": 0.0004139, + "loss": 3.2177, + "step": 12750 + }, + { + "epoch": 1.8189569418786413, + "grad_norm": 1.6561784744262695, + "learning_rate": 0.00041334444444444447, + "loss": 3.2395, + "step": 12800 + }, + { + "epoch": 1.826062242432855, + "grad_norm": 1.965090274810791, + "learning_rate": 0.0004127888888888889, + "loss": 3.2797, + "step": 12850 + }, + { + "epoch": 1.8331675429870682, + "grad_norm": 1.803202509880066, + "learning_rate": 0.00041223333333333334, + "loss": 3.2293, + "step": 12900 + }, + { + "epoch": 1.8402728435412818, + "grad_norm": 1.8006333112716675, + "learning_rate": 0.0004116777777777778, + "loss": 3.2415, + "step": 12950 + }, + { + "epoch": 1.8473781440954953, + "grad_norm": 1.754680871963501, + "learning_rate": 0.0004111222222222222, + "loss": 3.2471, + "step": 13000 + }, + { + "epoch": 1.8544834446497087, + "grad_norm": 1.9389216899871826, + "learning_rate": 0.0004105666666666667, + "loss": 3.2215, + "step": 13050 + }, + { + "epoch": 1.8615887452039221, + "grad_norm": 1.6737974882125854, + "learning_rate": 0.0004100111111111111, + "loss": 3.2347, + "step": 13100 + }, + { + "epoch": 1.8686940457581356, + "grad_norm": 1.608807921409607, + "learning_rate": 0.0004094555555555556, + "loss": 3.2219, + "step": 13150 + }, + { + "epoch": 1.875799346312349, + "grad_norm": 1.9688620567321777, + "learning_rate": 0.0004089, + "loss": 3.2489, + "step": 13200 + }, + { + "epoch": 1.8829046468665624, + "grad_norm": 1.6064422130584717, + "learning_rate": 0.00040834444444444446, + "loss": 3.2206, + "step": 13250 + }, + { + "epoch": 1.8900099474207759, + "grad_norm": 1.7799787521362305, + "learning_rate": 0.0004077888888888889, + "loss": 3.2219, + "step": 13300 + }, + { + "epoch": 1.8971152479749893, + "grad_norm": 1.880999207496643, + "learning_rate": 0.00040723333333333333, + "loss": 3.2213, + "step": 13350 + }, + { + "epoch": 1.9042205485292028, + "grad_norm": 1.9010646343231201, + "learning_rate": 0.0004066777777777778, + "loss": 3.2001, + "step": 13400 + }, + { + "epoch": 1.9113258490834162, + "grad_norm": 1.7856796979904175, + "learning_rate": 0.0004061222222222222, + "loss": 3.2293, + "step": 13450 + }, + { + "epoch": 1.9184311496376296, + "grad_norm": 1.7832741737365723, + "learning_rate": 0.0004055666666666667, + "loss": 3.2087, + "step": 13500 + }, + { + "epoch": 1.925536450191843, + "grad_norm": 1.817251205444336, + "learning_rate": 0.00040501111111111113, + "loss": 3.2004, + "step": 13550 + }, + { + "epoch": 1.9326417507460567, + "grad_norm": 1.5597301721572876, + "learning_rate": 0.00040445555555555557, + "loss": 3.1885, + "step": 13600 + }, + { + "epoch": 1.93974705130027, + "grad_norm": 1.6983693838119507, + "learning_rate": 0.0004039, + "loss": 3.1858, + "step": 13650 + }, + { + "epoch": 1.9468523518544836, + "grad_norm": 1.6369034051895142, + "learning_rate": 0.00040334444444444444, + "loss": 3.1833, + "step": 13700 + }, + { + "epoch": 1.9539576524086968, + "grad_norm": 1.6948421001434326, + "learning_rate": 0.0004027888888888889, + "loss": 3.1958, + "step": 13750 + }, + { + "epoch": 1.9610629529629104, + "grad_norm": 1.742189645767212, + "learning_rate": 0.0004022333333333333, + "loss": 3.2171, + "step": 13800 + }, + { + "epoch": 1.9681682535171237, + "grad_norm": 1.7551114559173584, + "learning_rate": 0.0004016777777777778, + "loss": 3.1991, + "step": 13850 + }, + { + "epoch": 1.9752735540713373, + "grad_norm": 1.8701705932617188, + "learning_rate": 0.00040112222222222224, + "loss": 3.1838, + "step": 13900 + }, + { + "epoch": 1.9823788546255505, + "grad_norm": 1.7148219347000122, + "learning_rate": 0.0004005666666666667, + "loss": 3.1522, + "step": 13950 + }, + { + "epoch": 1.9894841551797642, + "grad_norm": 1.7814823389053345, + "learning_rate": 0.0004000111111111111, + "loss": 3.1841, + "step": 14000 + }, + { + "epoch": 1.9894841551797642, + "eval_accuracy": 0.471990168094635, + "eval_loss": 3.1252591609954834, + "eval_runtime": 1.4365, + "eval_samples_per_second": 2617.479, + "eval_steps_per_second": 41.072, + "step": 14000 + }, + { + "epoch": 1.9965894557339774, + "grad_norm": 1.8632270097732544, + "learning_rate": 0.00039945555555555556, + "loss": 3.1887, + "step": 14050 + }, + { + "epoch": 2.003694756288191, + "grad_norm": 1.8517438173294067, + "learning_rate": 0.0003989, + "loss": 3.1434, + "step": 14100 + }, + { + "epoch": 2.0108000568424043, + "grad_norm": 1.9222501516342163, + "learning_rate": 0.00039834444444444443, + "loss": 3.1432, + "step": 14150 + }, + { + "epoch": 2.017905357396618, + "grad_norm": 1.9569604396820068, + "learning_rate": 0.0003977888888888889, + "loss": 3.1752, + "step": 14200 + }, + { + "epoch": 2.025010657950831, + "grad_norm": 1.9165287017822266, + "learning_rate": 0.00039723333333333336, + "loss": 3.1438, + "step": 14250 + }, + { + "epoch": 2.032115958505045, + "grad_norm": 1.8013373613357544, + "learning_rate": 0.0003966777777777778, + "loss": 3.1602, + "step": 14300 + }, + { + "epoch": 2.039221259059258, + "grad_norm": 1.7178173065185547, + "learning_rate": 0.00039612222222222223, + "loss": 3.1763, + "step": 14350 + }, + { + "epoch": 2.0463265596134717, + "grad_norm": 1.762027382850647, + "learning_rate": 0.00039556666666666667, + "loss": 3.1565, + "step": 14400 + }, + { + "epoch": 2.0534318601676853, + "grad_norm": 1.7229429483413696, + "learning_rate": 0.0003950111111111111, + "loss": 3.1485, + "step": 14450 + }, + { + "epoch": 2.0605371607218985, + "grad_norm": 1.9075498580932617, + "learning_rate": 0.00039445555555555554, + "loss": 3.156, + "step": 14500 + }, + { + "epoch": 2.067642461276112, + "grad_norm": 1.7284672260284424, + "learning_rate": 0.0003939, + "loss": 3.1324, + "step": 14550 + }, + { + "epoch": 2.0747477618303254, + "grad_norm": 1.9364721775054932, + "learning_rate": 0.00039334444444444447, + "loss": 3.1528, + "step": 14600 + }, + { + "epoch": 2.081853062384539, + "grad_norm": 1.8764890432357788, + "learning_rate": 0.0003927888888888889, + "loss": 3.1491, + "step": 14650 + }, + { + "epoch": 2.0889583629387523, + "grad_norm": 1.8724685907363892, + "learning_rate": 0.00039223333333333334, + "loss": 3.1342, + "step": 14700 + }, + { + "epoch": 2.096063663492966, + "grad_norm": 1.7195576429367065, + "learning_rate": 0.0003916777777777778, + "loss": 3.1256, + "step": 14750 + }, + { + "epoch": 2.103168964047179, + "grad_norm": 1.8319681882858276, + "learning_rate": 0.0003911222222222222, + "loss": 3.1422, + "step": 14800 + }, + { + "epoch": 2.110274264601393, + "grad_norm": 1.840844750404358, + "learning_rate": 0.00039056666666666666, + "loss": 3.1501, + "step": 14850 + }, + { + "epoch": 2.117379565155606, + "grad_norm": 1.756561517715454, + "learning_rate": 0.0003900111111111111, + "loss": 3.124, + "step": 14900 + }, + { + "epoch": 2.1244848657098196, + "grad_norm": 1.6923362016677856, + "learning_rate": 0.0003894555555555556, + "loss": 3.1231, + "step": 14950 + }, + { + "epoch": 2.131590166264033, + "grad_norm": 1.7511463165283203, + "learning_rate": 0.0003889, + "loss": 3.105, + "step": 15000 + }, + { + "epoch": 2.1386954668182465, + "grad_norm": 1.9665418863296509, + "learning_rate": 0.00038834444444444446, + "loss": 3.1074, + "step": 15050 + }, + { + "epoch": 2.1458007673724597, + "grad_norm": 1.7199262380599976, + "learning_rate": 0.0003877888888888889, + "loss": 3.1231, + "step": 15100 + }, + { + "epoch": 2.1529060679266734, + "grad_norm": 2.058184862136841, + "learning_rate": 0.00038723333333333333, + "loss": 3.096, + "step": 15150 + }, + { + "epoch": 2.1600113684808866, + "grad_norm": 1.7477974891662598, + "learning_rate": 0.00038667777777777777, + "loss": 3.117, + "step": 15200 + }, + { + "epoch": 2.1671166690351003, + "grad_norm": 1.8791770935058594, + "learning_rate": 0.0003861222222222222, + "loss": 3.1073, + "step": 15250 + }, + { + "epoch": 2.1742219695893135, + "grad_norm": 1.924623727798462, + "learning_rate": 0.0003855666666666667, + "loss": 3.1212, + "step": 15300 + }, + { + "epoch": 2.181327270143527, + "grad_norm": 1.8112562894821167, + "learning_rate": 0.00038501111111111113, + "loss": 3.092, + "step": 15350 + }, + { + "epoch": 2.1884325706977403, + "grad_norm": 1.9288897514343262, + "learning_rate": 0.00038445555555555557, + "loss": 3.0911, + "step": 15400 + }, + { + "epoch": 2.195537871251954, + "grad_norm": 1.8161592483520508, + "learning_rate": 0.0003839, + "loss": 3.0984, + "step": 15450 + }, + { + "epoch": 2.202643171806167, + "grad_norm": 1.7706139087677002, + "learning_rate": 0.00038334444444444444, + "loss": 3.0908, + "step": 15500 + }, + { + "epoch": 2.209748472360381, + "grad_norm": 2.003417491912842, + "learning_rate": 0.00038278888888888894, + "loss": 3.092, + "step": 15550 + }, + { + "epoch": 2.216853772914594, + "grad_norm": 1.8415908813476562, + "learning_rate": 0.0003822333333333333, + "loss": 3.1011, + "step": 15600 + }, + { + "epoch": 2.2239590734688077, + "grad_norm": 1.7236992120742798, + "learning_rate": 0.0003816777777777778, + "loss": 3.1022, + "step": 15650 + }, + { + "epoch": 2.2310643740230214, + "grad_norm": 1.8835231065750122, + "learning_rate": 0.0003811222222222222, + "loss": 3.0866, + "step": 15700 + }, + { + "epoch": 2.2381696745772346, + "grad_norm": 1.805284023284912, + "learning_rate": 0.0003805666666666667, + "loss": 3.1058, + "step": 15750 + }, + { + "epoch": 2.2452749751314482, + "grad_norm": 1.9442837238311768, + "learning_rate": 0.0003800111111111111, + "loss": 3.0923, + "step": 15800 + }, + { + "epoch": 2.2523802756856615, + "grad_norm": 1.8718860149383545, + "learning_rate": 0.00037945555555555556, + "loss": 3.1035, + "step": 15850 + }, + { + "epoch": 2.259485576239875, + "grad_norm": 1.8578605651855469, + "learning_rate": 0.00037890000000000005, + "loss": 3.0904, + "step": 15900 + }, + { + "epoch": 2.2665908767940883, + "grad_norm": 1.7682795524597168, + "learning_rate": 0.00037834444444444443, + "loss": 3.1211, + "step": 15950 + }, + { + "epoch": 2.273696177348302, + "grad_norm": 1.7548738718032837, + "learning_rate": 0.0003777888888888889, + "loss": 3.0719, + "step": 16000 + }, + { + "epoch": 2.273696177348302, + "eval_accuracy": 0.47854650020599365, + "eval_loss": 3.0652825832366943, + "eval_runtime": 1.4549, + "eval_samples_per_second": 2584.303, + "eval_steps_per_second": 40.552, + "step": 16000 + }, + { + "epoch": 2.280801477902515, + "grad_norm": 1.7292568683624268, + "learning_rate": 0.0003772333333333333, + "loss": 3.0797, + "step": 16050 + }, + { + "epoch": 2.287906778456729, + "grad_norm": 1.8499785661697388, + "learning_rate": 0.0003766777777777778, + "loss": 3.0689, + "step": 16100 + }, + { + "epoch": 2.295012079010942, + "grad_norm": 1.7851368188858032, + "learning_rate": 0.00037612222222222223, + "loss": 3.07, + "step": 16150 + }, + { + "epoch": 2.3021173795651557, + "grad_norm": 1.7935354709625244, + "learning_rate": 0.00037556666666666667, + "loss": 3.0801, + "step": 16200 + }, + { + "epoch": 2.309222680119369, + "grad_norm": 1.778581142425537, + "learning_rate": 0.00037501111111111116, + "loss": 3.0556, + "step": 16250 + }, + { + "epoch": 2.3163279806735826, + "grad_norm": 1.8388044834136963, + "learning_rate": 0.00037445555555555554, + "loss": 3.0656, + "step": 16300 + }, + { + "epoch": 2.323433281227796, + "grad_norm": 1.9298747777938843, + "learning_rate": 0.00037390000000000004, + "loss": 3.054, + "step": 16350 + }, + { + "epoch": 2.3305385817820095, + "grad_norm": 1.7658896446228027, + "learning_rate": 0.0003733444444444444, + "loss": 3.0465, + "step": 16400 + }, + { + "epoch": 2.3376438823362227, + "grad_norm": 1.7524223327636719, + "learning_rate": 0.0003727888888888889, + "loss": 3.0571, + "step": 16450 + }, + { + "epoch": 2.3447491828904363, + "grad_norm": 1.7193357944488525, + "learning_rate": 0.00037223333333333335, + "loss": 3.0535, + "step": 16500 + }, + { + "epoch": 2.3518544834446495, + "grad_norm": 2.003408193588257, + "learning_rate": 0.0003716777777777778, + "loss": 3.0622, + "step": 16550 + }, + { + "epoch": 2.358959783998863, + "grad_norm": 2.0907719135284424, + "learning_rate": 0.0003711222222222223, + "loss": 3.0641, + "step": 16600 + }, + { + "epoch": 2.3660650845530764, + "grad_norm": 1.819555401802063, + "learning_rate": 0.00037056666666666666, + "loss": 3.0539, + "step": 16650 + }, + { + "epoch": 2.37317038510729, + "grad_norm": 1.6507291793823242, + "learning_rate": 0.00037001111111111115, + "loss": 3.0291, + "step": 16700 + }, + { + "epoch": 2.3802756856615037, + "grad_norm": 1.763790249824524, + "learning_rate": 0.00036945555555555553, + "loss": 3.0576, + "step": 16750 + }, + { + "epoch": 2.387380986215717, + "grad_norm": 1.8909801244735718, + "learning_rate": 0.0003689, + "loss": 3.0673, + "step": 16800 + }, + { + "epoch": 2.39448628676993, + "grad_norm": 1.701228380203247, + "learning_rate": 0.0003683444444444444, + "loss": 3.0441, + "step": 16850 + }, + { + "epoch": 2.401591587324144, + "grad_norm": 1.7287544012069702, + "learning_rate": 0.0003677888888888889, + "loss": 3.0622, + "step": 16900 + }, + { + "epoch": 2.4086968878783575, + "grad_norm": 1.7561144828796387, + "learning_rate": 0.0003672333333333334, + "loss": 3.0488, + "step": 16950 + }, + { + "epoch": 2.4158021884325707, + "grad_norm": 1.8787589073181152, + "learning_rate": 0.00036667777777777777, + "loss": 3.057, + "step": 17000 + }, + { + "epoch": 2.4229074889867843, + "grad_norm": 1.7158374786376953, + "learning_rate": 0.00036612222222222226, + "loss": 3.0378, + "step": 17050 + }, + { + "epoch": 2.4300127895409975, + "grad_norm": 1.8142306804656982, + "learning_rate": 0.00036556666666666664, + "loss": 3.0515, + "step": 17100 + }, + { + "epoch": 2.437118090095211, + "grad_norm": 1.5873689651489258, + "learning_rate": 0.00036501111111111114, + "loss": 3.0495, + "step": 17150 + }, + { + "epoch": 2.4442233906494244, + "grad_norm": 1.8045600652694702, + "learning_rate": 0.0003644555555555555, + "loss": 3.025, + "step": 17200 + }, + { + "epoch": 2.451328691203638, + "grad_norm": 1.6360293626785278, + "learning_rate": 0.0003639, + "loss": 3.0391, + "step": 17250 + }, + { + "epoch": 2.4584339917578513, + "grad_norm": 1.805924892425537, + "learning_rate": 0.0003633444444444445, + "loss": 3.028, + "step": 17300 + }, + { + "epoch": 2.465539292312065, + "grad_norm": 1.8847789764404297, + "learning_rate": 0.0003627888888888889, + "loss": 3.0413, + "step": 17350 + }, + { + "epoch": 2.472644592866278, + "grad_norm": 1.900485873222351, + "learning_rate": 0.0003622333333333334, + "loss": 3.023, + "step": 17400 + }, + { + "epoch": 2.479749893420492, + "grad_norm": 1.7782105207443237, + "learning_rate": 0.00036167777777777776, + "loss": 3.0128, + "step": 17450 + }, + { + "epoch": 2.486855193974705, + "grad_norm": 1.69002103805542, + "learning_rate": 0.00036112222222222225, + "loss": 3.0246, + "step": 17500 + }, + { + "epoch": 2.4939604945289187, + "grad_norm": 1.8271868228912354, + "learning_rate": 0.00036056666666666663, + "loss": 3.0155, + "step": 17550 + }, + { + "epoch": 2.501065795083132, + "grad_norm": 1.7090950012207031, + "learning_rate": 0.0003600111111111111, + "loss": 3.0339, + "step": 17600 + }, + { + "epoch": 2.5081710956373455, + "grad_norm": 1.7666364908218384, + "learning_rate": 0.00035945555555555556, + "loss": 3.0362, + "step": 17650 + }, + { + "epoch": 2.5152763961915587, + "grad_norm": 1.8835707902908325, + "learning_rate": 0.0003589, + "loss": 3.0236, + "step": 17700 + }, + { + "epoch": 2.5223816967457724, + "grad_norm": 1.8882673978805542, + "learning_rate": 0.0003583444444444445, + "loss": 3.0096, + "step": 17750 + }, + { + "epoch": 2.529486997299986, + "grad_norm": 1.7961220741271973, + "learning_rate": 0.00035778888888888887, + "loss": 3.0135, + "step": 17800 + }, + { + "epoch": 2.5365922978541993, + "grad_norm": 1.7673027515411377, + "learning_rate": 0.00035723333333333336, + "loss": 3.0151, + "step": 17850 + }, + { + "epoch": 2.5436975984084125, + "grad_norm": 1.7343908548355103, + "learning_rate": 0.00035667777777777774, + "loss": 3.0174, + "step": 17900 + }, + { + "epoch": 2.550802898962626, + "grad_norm": 1.7693558931350708, + "learning_rate": 0.00035612222222222223, + "loss": 3.0008, + "step": 17950 + }, + { + "epoch": 2.55790819951684, + "grad_norm": 1.762675404548645, + "learning_rate": 0.00035556666666666667, + "loss": 2.986, + "step": 18000 + }, + { + "epoch": 2.55790819951684, + "eval_accuracy": 0.4936477839946747, + "eval_loss": 2.9375457763671875, + "eval_runtime": 1.4066, + "eval_samples_per_second": 2673.043, + "eval_steps_per_second": 41.944, + "step": 18000 + }, + { + "epoch": 2.565013500071053, + "grad_norm": 1.7804518938064575, + "learning_rate": 0.0003550111111111111, + "loss": 3.0187, + "step": 18050 + }, + { + "epoch": 2.572118800625266, + "grad_norm": 1.7693209648132324, + "learning_rate": 0.0003544555555555556, + "loss": 3.011, + "step": 18100 + }, + { + "epoch": 2.57922410117948, + "grad_norm": 1.8292006254196167, + "learning_rate": 0.0003539, + "loss": 3.0151, + "step": 18150 + }, + { + "epoch": 2.5863294017336935, + "grad_norm": 1.659195065498352, + "learning_rate": 0.0003533444444444445, + "loss": 2.9828, + "step": 18200 + }, + { + "epoch": 2.5934347022879067, + "grad_norm": 1.8265076875686646, + "learning_rate": 0.00035278888888888886, + "loss": 3.0014, + "step": 18250 + }, + { + "epoch": 2.60054000284212, + "grad_norm": 1.7613290548324585, + "learning_rate": 0.00035223333333333335, + "loss": 2.9924, + "step": 18300 + }, + { + "epoch": 2.6076453033963336, + "grad_norm": 1.8945201635360718, + "learning_rate": 0.0003516777777777778, + "loss": 2.9883, + "step": 18350 + }, + { + "epoch": 2.6147506039505473, + "grad_norm": 1.6011463403701782, + "learning_rate": 0.0003511222222222222, + "loss": 2.9815, + "step": 18400 + }, + { + "epoch": 2.6218559045047605, + "grad_norm": 1.730685830116272, + "learning_rate": 0.0003505666666666667, + "loss": 3.0054, + "step": 18450 + }, + { + "epoch": 2.628961205058974, + "grad_norm": 1.7386492490768433, + "learning_rate": 0.0003500111111111111, + "loss": 2.9788, + "step": 18500 + }, + { + "epoch": 2.6360665056131873, + "grad_norm": 1.8171436786651611, + "learning_rate": 0.0003494555555555556, + "loss": 2.9997, + "step": 18550 + }, + { + "epoch": 2.643171806167401, + "grad_norm": 2.123448610305786, + "learning_rate": 0.00034889999999999997, + "loss": 3.014, + "step": 18600 + }, + { + "epoch": 2.650277106721614, + "grad_norm": 1.7145969867706299, + "learning_rate": 0.00034834444444444446, + "loss": 2.9728, + "step": 18650 + }, + { + "epoch": 2.657382407275828, + "grad_norm": 1.7777656316757202, + "learning_rate": 0.0003477888888888889, + "loss": 2.9886, + "step": 18700 + }, + { + "epoch": 2.664487707830041, + "grad_norm": 1.7456960678100586, + "learning_rate": 0.00034723333333333333, + "loss": 2.9896, + "step": 18750 + }, + { + "epoch": 2.6715930083842547, + "grad_norm": 1.6129354238510132, + "learning_rate": 0.00034667777777777777, + "loss": 2.9732, + "step": 18800 + }, + { + "epoch": 2.678698308938468, + "grad_norm": 1.6911518573760986, + "learning_rate": 0.0003461222222222222, + "loss": 2.9867, + "step": 18850 + }, + { + "epoch": 2.6858036094926816, + "grad_norm": 1.7017191648483276, + "learning_rate": 0.0003455666666666667, + "loss": 2.97, + "step": 18900 + }, + { + "epoch": 2.692908910046895, + "grad_norm": 1.6771681308746338, + "learning_rate": 0.0003450111111111111, + "loss": 2.9767, + "step": 18950 + }, + { + "epoch": 2.7000142106011085, + "grad_norm": 1.8211736679077148, + "learning_rate": 0.0003444555555555556, + "loss": 3.0055, + "step": 19000 + }, + { + "epoch": 2.707119511155322, + "grad_norm": 1.8175971508026123, + "learning_rate": 0.0003439, + "loss": 2.9567, + "step": 19050 + }, + { + "epoch": 2.7142248117095353, + "grad_norm": 1.8108701705932617, + "learning_rate": 0.00034334444444444445, + "loss": 2.9767, + "step": 19100 + }, + { + "epoch": 2.7213301122637485, + "grad_norm": 1.737069845199585, + "learning_rate": 0.0003427888888888889, + "loss": 2.9864, + "step": 19150 + }, + { + "epoch": 2.728435412817962, + "grad_norm": 1.6564186811447144, + "learning_rate": 0.0003422333333333333, + "loss": 2.9684, + "step": 19200 + }, + { + "epoch": 2.735540713372176, + "grad_norm": 1.7465981245040894, + "learning_rate": 0.0003416777777777778, + "loss": 2.9774, + "step": 19250 + }, + { + "epoch": 2.742646013926389, + "grad_norm": 1.7666462659835815, + "learning_rate": 0.0003411222222222222, + "loss": 2.9737, + "step": 19300 + }, + { + "epoch": 2.7497513144806023, + "grad_norm": 1.7104542255401611, + "learning_rate": 0.0003405666666666667, + "loss": 2.9636, + "step": 19350 + }, + { + "epoch": 2.756856615034816, + "grad_norm": 1.6974040269851685, + "learning_rate": 0.0003400111111111111, + "loss": 2.96, + "step": 19400 + }, + { + "epoch": 2.7639619155890296, + "grad_norm": 1.7548801898956299, + "learning_rate": 0.00033945555555555556, + "loss": 2.9621, + "step": 19450 + }, + { + "epoch": 2.771067216143243, + "grad_norm": 1.6098867654800415, + "learning_rate": 0.0003389, + "loss": 2.9714, + "step": 19500 + }, + { + "epoch": 2.7781725166974565, + "grad_norm": 1.7277166843414307, + "learning_rate": 0.00033834444444444443, + "loss": 2.9735, + "step": 19550 + }, + { + "epoch": 2.7852778172516697, + "grad_norm": 1.7835525274276733, + "learning_rate": 0.0003377888888888889, + "loss": 2.9882, + "step": 19600 + }, + { + "epoch": 2.7923831178058833, + "grad_norm": 1.6993199586868286, + "learning_rate": 0.00033723333333333336, + "loss": 2.9597, + "step": 19650 + }, + { + "epoch": 2.7994884183600965, + "grad_norm": 1.764560580253601, + "learning_rate": 0.0003366777777777778, + "loss": 2.9507, + "step": 19700 + }, + { + "epoch": 2.80659371891431, + "grad_norm": 1.816872000694275, + "learning_rate": 0.00033612222222222224, + "loss": 2.9655, + "step": 19750 + }, + { + "epoch": 2.8136990194685234, + "grad_norm": 1.8321980237960815, + "learning_rate": 0.0003355666666666667, + "loss": 2.9932, + "step": 19800 + }, + { + "epoch": 2.820804320022737, + "grad_norm": 1.8136756420135498, + "learning_rate": 0.0003350111111111111, + "loss": 2.9514, + "step": 19850 + }, + { + "epoch": 2.8279096205769503, + "grad_norm": 1.7299060821533203, + "learning_rate": 0.00033445555555555555, + "loss": 2.9607, + "step": 19900 + }, + { + "epoch": 2.835014921131164, + "grad_norm": 1.6662367582321167, + "learning_rate": 0.0003339, + "loss": 2.941, + "step": 19950 + }, + { + "epoch": 2.842120221685377, + "grad_norm": 1.646530032157898, + "learning_rate": 0.0003333444444444445, + "loss": 2.9477, + "step": 20000 + }, + { + "epoch": 2.842120221685377, + "eval_accuracy": 0.49664175510406494, + "eval_loss": 2.882197141647339, + "eval_runtime": 1.406, + "eval_samples_per_second": 2674.307, + "eval_steps_per_second": 41.964, + "step": 20000 + }, + { + "epoch": 2.849225522239591, + "grad_norm": 1.721617579460144, + "learning_rate": 0.0003327888888888889, + "loss": 2.9421, + "step": 20050 + }, + { + "epoch": 2.856330822793804, + "grad_norm": 1.8738772869110107, + "learning_rate": 0.00033223333333333335, + "loss": 2.9624, + "step": 20100 + }, + { + "epoch": 2.8634361233480177, + "grad_norm": 1.7513505220413208, + "learning_rate": 0.0003316777777777778, + "loss": 2.9429, + "step": 20150 + }, + { + "epoch": 2.870541423902231, + "grad_norm": 1.9003950357437134, + "learning_rate": 0.0003311222222222222, + "loss": 2.9538, + "step": 20200 + }, + { + "epoch": 2.8776467244564445, + "grad_norm": 1.6538949012756348, + "learning_rate": 0.00033056666666666666, + "loss": 2.9437, + "step": 20250 + }, + { + "epoch": 2.884752025010658, + "grad_norm": 1.7895572185516357, + "learning_rate": 0.0003300111111111111, + "loss": 2.9445, + "step": 20300 + }, + { + "epoch": 2.8918573255648714, + "grad_norm": 1.6757142543792725, + "learning_rate": 0.0003294555555555556, + "loss": 2.9393, + "step": 20350 + }, + { + "epoch": 2.8989626261190846, + "grad_norm": 1.717271327972412, + "learning_rate": 0.0003289, + "loss": 2.9467, + "step": 20400 + }, + { + "epoch": 2.9060679266732983, + "grad_norm": 1.7182637453079224, + "learning_rate": 0.00032834444444444446, + "loss": 2.9295, + "step": 20450 + }, + { + "epoch": 2.913173227227512, + "grad_norm": 1.770296573638916, + "learning_rate": 0.0003277888888888889, + "loss": 2.9355, + "step": 20500 + }, + { + "epoch": 2.920278527781725, + "grad_norm": 1.6502301692962646, + "learning_rate": 0.00032723333333333334, + "loss": 2.9241, + "step": 20550 + }, + { + "epoch": 2.9273838283359384, + "grad_norm": 1.689746618270874, + "learning_rate": 0.0003266777777777778, + "loss": 2.9195, + "step": 20600 + }, + { + "epoch": 2.934489128890152, + "grad_norm": 1.8485779762268066, + "learning_rate": 0.0003261222222222222, + "loss": 2.934, + "step": 20650 + }, + { + "epoch": 2.9415944294443657, + "grad_norm": 1.8438777923583984, + "learning_rate": 0.0003255666666666667, + "loss": 2.9208, + "step": 20700 + }, + { + "epoch": 2.948699729998579, + "grad_norm": 1.8864026069641113, + "learning_rate": 0.0003250111111111111, + "loss": 2.9299, + "step": 20750 + }, + { + "epoch": 2.9558050305527925, + "grad_norm": 1.799882411956787, + "learning_rate": 0.0003244555555555556, + "loss": 2.9306, + "step": 20800 + }, + { + "epoch": 2.9629103311070057, + "grad_norm": 1.7453547716140747, + "learning_rate": 0.0003239, + "loss": 2.9329, + "step": 20850 + }, + { + "epoch": 2.9700156316612194, + "grad_norm": 1.6310656070709229, + "learning_rate": 0.00032334444444444445, + "loss": 2.9027, + "step": 20900 + }, + { + "epoch": 2.9771209322154326, + "grad_norm": 1.8118422031402588, + "learning_rate": 0.0003227888888888889, + "loss": 2.925, + "step": 20950 + }, + { + "epoch": 2.9842262327696463, + "grad_norm": 1.7698137760162354, + "learning_rate": 0.0003222333333333333, + "loss": 2.9347, + "step": 21000 + }, + { + "epoch": 2.9913315333238595, + "grad_norm": 1.7710407972335815, + "learning_rate": 0.0003216777777777778, + "loss": 2.9159, + "step": 21050 + }, + { + "epoch": 2.998436833878073, + "grad_norm": 1.613924264907837, + "learning_rate": 0.0003211222222222222, + "loss": 2.9188, + "step": 21100 + }, + { + "epoch": 3.0055421344322863, + "grad_norm": 1.8230890035629272, + "learning_rate": 0.0003205666666666667, + "loss": 2.8801, + "step": 21150 + }, + { + "epoch": 3.0126474349865, + "grad_norm": 1.745085597038269, + "learning_rate": 0.0003200111111111111, + "loss": 2.8948, + "step": 21200 + }, + { + "epoch": 3.019752735540713, + "grad_norm": 1.6332448720932007, + "learning_rate": 0.00031945555555555556, + "loss": 2.8926, + "step": 21250 + }, + { + "epoch": 3.026858036094927, + "grad_norm": 1.6312452554702759, + "learning_rate": 0.0003189, + "loss": 2.9057, + "step": 21300 + }, + { + "epoch": 3.03396333664914, + "grad_norm": 1.7664523124694824, + "learning_rate": 0.00031834444444444444, + "loss": 2.9123, + "step": 21350 + }, + { + "epoch": 3.0410686372033537, + "grad_norm": 1.8374122381210327, + "learning_rate": 0.00031778888888888893, + "loss": 2.9046, + "step": 21400 + }, + { + "epoch": 3.048173937757567, + "grad_norm": 1.686972737312317, + "learning_rate": 0.0003172333333333333, + "loss": 2.9136, + "step": 21450 + }, + { + "epoch": 3.0552792383117806, + "grad_norm": 1.7806686162948608, + "learning_rate": 0.0003166777777777778, + "loss": 2.867, + "step": 21500 + }, + { + "epoch": 3.062384538865994, + "grad_norm": 1.7213020324707031, + "learning_rate": 0.00031612222222222224, + "loss": 2.8885, + "step": 21550 + }, + { + "epoch": 3.0694898394202075, + "grad_norm": 1.659408688545227, + "learning_rate": 0.0003155666666666667, + "loss": 2.8824, + "step": 21600 + }, + { + "epoch": 3.0765951399744207, + "grad_norm": 1.71113121509552, + "learning_rate": 0.0003150111111111111, + "loss": 2.9071, + "step": 21650 + }, + { + "epoch": 3.0837004405286343, + "grad_norm": 1.6978799104690552, + "learning_rate": 0.00031445555555555555, + "loss": 2.9053, + "step": 21700 + }, + { + "epoch": 3.090805741082848, + "grad_norm": 1.7844059467315674, + "learning_rate": 0.00031390000000000004, + "loss": 2.8661, + "step": 21750 + }, + { + "epoch": 3.097911041637061, + "grad_norm": 1.7128864526748657, + "learning_rate": 0.0003133444444444444, + "loss": 2.8892, + "step": 21800 + }, + { + "epoch": 3.105016342191275, + "grad_norm": 1.7532225847244263, + "learning_rate": 0.0003127888888888889, + "loss": 2.9181, + "step": 21850 + }, + { + "epoch": 3.112121642745488, + "grad_norm": 1.7945367097854614, + "learning_rate": 0.0003122333333333333, + "loss": 2.8686, + "step": 21900 + }, + { + "epoch": 3.1192269432997017, + "grad_norm": 1.6655150651931763, + "learning_rate": 0.0003116777777777778, + "loss": 2.8604, + "step": 21950 + }, + { + "epoch": 3.126332243853915, + "grad_norm": 1.6884666681289673, + "learning_rate": 0.0003111222222222222, + "loss": 2.9021, + "step": 22000 + }, + { + "epoch": 3.126332243853915, + "eval_accuracy": 0.5131886601448059, + "eval_loss": 2.801339864730835, + "eval_runtime": 1.3759, + "eval_samples_per_second": 2732.73, + "eval_steps_per_second": 42.881, + "step": 22000 + }, + { + "epoch": 3.1334375444081286, + "grad_norm": 1.801597237586975, + "learning_rate": 0.00031056666666666666, + "loss": 2.889, + "step": 22050 + }, + { + "epoch": 3.140542844962342, + "grad_norm": 1.6757820844650269, + "learning_rate": 0.00031001111111111115, + "loss": 2.8707, + "step": 22100 + }, + { + "epoch": 3.1476481455165555, + "grad_norm": 1.6857764720916748, + "learning_rate": 0.00030945555555555554, + "loss": 2.877, + "step": 22150 + }, + { + "epoch": 3.1547534460707687, + "grad_norm": 1.7177927494049072, + "learning_rate": 0.00030890000000000003, + "loss": 2.8937, + "step": 22200 + }, + { + "epoch": 3.1618587466249823, + "grad_norm": 1.7618646621704102, + "learning_rate": 0.0003083444444444444, + "loss": 2.8927, + "step": 22250 + }, + { + "epoch": 3.1689640471791956, + "grad_norm": 1.862821102142334, + "learning_rate": 0.0003077888888888889, + "loss": 2.8737, + "step": 22300 + }, + { + "epoch": 3.176069347733409, + "grad_norm": 1.6600791215896606, + "learning_rate": 0.00030723333333333334, + "loss": 2.8844, + "step": 22350 + }, + { + "epoch": 3.1831746482876224, + "grad_norm": 1.715598702430725, + "learning_rate": 0.0003066777777777778, + "loss": 2.8733, + "step": 22400 + }, + { + "epoch": 3.190279948841836, + "grad_norm": 1.615591049194336, + "learning_rate": 0.00030612222222222227, + "loss": 2.8793, + "step": 22450 + }, + { + "epoch": 3.1973852493960493, + "grad_norm": 1.6799874305725098, + "learning_rate": 0.00030556666666666665, + "loss": 2.8833, + "step": 22500 + }, + { + "epoch": 3.204490549950263, + "grad_norm": 1.8471604585647583, + "learning_rate": 0.00030501111111111114, + "loss": 2.8819, + "step": 22550 + }, + { + "epoch": 3.211595850504476, + "grad_norm": 1.721903681755066, + "learning_rate": 0.0003044555555555555, + "loss": 2.8663, + "step": 22600 + }, + { + "epoch": 3.21870115105869, + "grad_norm": 1.7450604438781738, + "learning_rate": 0.0003039, + "loss": 2.8662, + "step": 22650 + }, + { + "epoch": 3.225806451612903, + "grad_norm": 1.7081820964813232, + "learning_rate": 0.0003033444444444445, + "loss": 2.8641, + "step": 22700 + }, + { + "epoch": 3.2329117521671167, + "grad_norm": 1.834999918937683, + "learning_rate": 0.0003027888888888889, + "loss": 2.862, + "step": 22750 + }, + { + "epoch": 3.2400170527213303, + "grad_norm": 1.6457868814468384, + "learning_rate": 0.0003022333333333334, + "loss": 2.847, + "step": 22800 + }, + { + "epoch": 3.2471223532755435, + "grad_norm": 1.613499641418457, + "learning_rate": 0.00030167777777777776, + "loss": 2.8755, + "step": 22850 + }, + { + "epoch": 3.2542276538297568, + "grad_norm": 1.6897544860839844, + "learning_rate": 0.00030112222222222225, + "loss": 2.8551, + "step": 22900 + }, + { + "epoch": 3.2613329543839704, + "grad_norm": 1.7230019569396973, + "learning_rate": 0.00030056666666666664, + "loss": 2.8636, + "step": 22950 + }, + { + "epoch": 3.268438254938184, + "grad_norm": 1.8159016370773315, + "learning_rate": 0.00030001111111111113, + "loss": 2.8669, + "step": 23000 + }, + { + "epoch": 3.2755435554923973, + "grad_norm": 1.6577616930007935, + "learning_rate": 0.0002994555555555555, + "loss": 2.8526, + "step": 23050 + }, + { + "epoch": 3.282648856046611, + "grad_norm": 1.6508777141571045, + "learning_rate": 0.0002989, + "loss": 2.8717, + "step": 23100 + }, + { + "epoch": 3.289754156600824, + "grad_norm": 1.630266785621643, + "learning_rate": 0.0002983444444444445, + "loss": 2.8593, + "step": 23150 + }, + { + "epoch": 3.296859457155038, + "grad_norm": 1.7836819887161255, + "learning_rate": 0.0002977888888888889, + "loss": 2.8409, + "step": 23200 + }, + { + "epoch": 3.303964757709251, + "grad_norm": 1.7919524908065796, + "learning_rate": 0.00029723333333333337, + "loss": 2.8644, + "step": 23250 + }, + { + "epoch": 3.3110700582634647, + "grad_norm": 1.8404020071029663, + "learning_rate": 0.00029667777777777775, + "loss": 2.856, + "step": 23300 + }, + { + "epoch": 3.318175358817678, + "grad_norm": 1.913402795791626, + "learning_rate": 0.00029612222222222224, + "loss": 2.8724, + "step": 23350 + }, + { + "epoch": 3.3252806593718915, + "grad_norm": 1.5555598735809326, + "learning_rate": 0.0002955666666666667, + "loss": 2.834, + "step": 23400 + }, + { + "epoch": 3.3323859599261048, + "grad_norm": 1.6552711725234985, + "learning_rate": 0.0002950111111111111, + "loss": 2.8363, + "step": 23450 + }, + { + "epoch": 3.3394912604803184, + "grad_norm": 1.7023571729660034, + "learning_rate": 0.0002944555555555556, + "loss": 2.8586, + "step": 23500 + }, + { + "epoch": 3.3465965610345316, + "grad_norm": 1.8556574583053589, + "learning_rate": 0.0002939, + "loss": 2.8384, + "step": 23550 + }, + { + "epoch": 3.3537018615887453, + "grad_norm": 1.8642303943634033, + "learning_rate": 0.0002933444444444445, + "loss": 2.8554, + "step": 23600 + }, + { + "epoch": 3.3608071621429585, + "grad_norm": 1.8594276905059814, + "learning_rate": 0.00029278888888888886, + "loss": 2.8562, + "step": 23650 + }, + { + "epoch": 3.367912462697172, + "grad_norm": 1.7641949653625488, + "learning_rate": 0.00029223333333333335, + "loss": 2.8509, + "step": 23700 + }, + { + "epoch": 3.3750177632513854, + "grad_norm": 1.780263900756836, + "learning_rate": 0.0002916777777777778, + "loss": 2.8511, + "step": 23750 + }, + { + "epoch": 3.382123063805599, + "grad_norm": 1.7801567316055298, + "learning_rate": 0.00029112222222222223, + "loss": 2.8701, + "step": 23800 + }, + { + "epoch": 3.3892283643598122, + "grad_norm": 1.7215375900268555, + "learning_rate": 0.00029056666666666666, + "loss": 2.8586, + "step": 23850 + }, + { + "epoch": 3.396333664914026, + "grad_norm": 1.547910451889038, + "learning_rate": 0.0002900111111111111, + "loss": 2.843, + "step": 23900 + }, + { + "epoch": 3.403438965468239, + "grad_norm": 1.6910938024520874, + "learning_rate": 0.0002894555555555556, + "loss": 2.8203, + "step": 23950 + }, + { + "epoch": 3.4105442660224528, + "grad_norm": 1.791414499282837, + "learning_rate": 0.0002889, + "loss": 2.823, + "step": 24000 + }, + { + "epoch": 3.4105442660224528, + "eval_accuracy": 0.5083815455436707, + "eval_loss": 2.793198823928833, + "eval_runtime": 1.4287, + "eval_samples_per_second": 2631.689, + "eval_steps_per_second": 41.295, + "step": 24000 + }, + { + "epoch": 3.4176495665766664, + "grad_norm": 1.8300690650939941, + "learning_rate": 0.00028834444444444447, + "loss": 2.8477, + "step": 24050 + }, + { + "epoch": 3.4247548671308796, + "grad_norm": 1.8223072290420532, + "learning_rate": 0.0002877888888888889, + "loss": 2.7996, + "step": 24100 + }, + { + "epoch": 3.431860167685093, + "grad_norm": 1.781424880027771, + "learning_rate": 0.00028723333333333334, + "loss": 2.8303, + "step": 24150 + }, + { + "epoch": 3.4389654682393065, + "grad_norm": 1.6363815069198608, + "learning_rate": 0.0002866777777777778, + "loss": 2.852, + "step": 24200 + }, + { + "epoch": 3.44607076879352, + "grad_norm": 1.838799238204956, + "learning_rate": 0.0002861222222222222, + "loss": 2.8356, + "step": 24250 + }, + { + "epoch": 3.4531760693477334, + "grad_norm": 1.6546505689620972, + "learning_rate": 0.0002855666666666667, + "loss": 2.8423, + "step": 24300 + }, + { + "epoch": 3.460281369901947, + "grad_norm": 1.6957730054855347, + "learning_rate": 0.0002850111111111111, + "loss": 2.8331, + "step": 24350 + }, + { + "epoch": 3.4673866704561602, + "grad_norm": 1.7324293851852417, + "learning_rate": 0.0002844555555555556, + "loss": 2.8258, + "step": 24400 + }, + { + "epoch": 3.474491971010374, + "grad_norm": 1.7163538932800293, + "learning_rate": 0.0002839, + "loss": 2.8265, + "step": 24450 + }, + { + "epoch": 3.481597271564587, + "grad_norm": 1.7319365739822388, + "learning_rate": 0.00028334444444444445, + "loss": 2.8256, + "step": 24500 + }, + { + "epoch": 3.4887025721188007, + "grad_norm": 2.031334161758423, + "learning_rate": 0.0002827888888888889, + "loss": 2.8359, + "step": 24550 + }, + { + "epoch": 3.495807872673014, + "grad_norm": 1.750105857849121, + "learning_rate": 0.0002822333333333333, + "loss": 2.8361, + "step": 24600 + }, + { + "epoch": 3.5029131732272276, + "grad_norm": 1.8582334518432617, + "learning_rate": 0.0002816777777777778, + "loss": 2.8611, + "step": 24650 + }, + { + "epoch": 3.510018473781441, + "grad_norm": 1.8061821460723877, + "learning_rate": 0.0002811222222222222, + "loss": 2.8137, + "step": 24700 + }, + { + "epoch": 3.5171237743356545, + "grad_norm": 1.7302175760269165, + "learning_rate": 0.0002805666666666667, + "loss": 2.8224, + "step": 24750 + }, + { + "epoch": 3.5242290748898677, + "grad_norm": 1.7751331329345703, + "learning_rate": 0.00028001111111111113, + "loss": 2.8508, + "step": 24800 + }, + { + "epoch": 3.5313343754440814, + "grad_norm": 1.7023547887802124, + "learning_rate": 0.00027945555555555557, + "loss": 2.8283, + "step": 24850 + }, + { + "epoch": 3.5384396759982946, + "grad_norm": 1.6634979248046875, + "learning_rate": 0.0002789, + "loss": 2.8343, + "step": 24900 + }, + { + "epoch": 3.545544976552508, + "grad_norm": 1.8010696172714233, + "learning_rate": 0.00027834444444444444, + "loss": 2.8175, + "step": 24950 + }, + { + "epoch": 3.5526502771067214, + "grad_norm": 1.7478526830673218, + "learning_rate": 0.0002777888888888889, + "loss": 2.832, + "step": 25000 + }, + { + "epoch": 3.559755577660935, + "grad_norm": 1.8158820867538452, + "learning_rate": 0.0002772333333333333, + "loss": 2.8119, + "step": 25050 + }, + { + "epoch": 3.5668608782151487, + "grad_norm": 1.8094425201416016, + "learning_rate": 0.0002766777777777778, + "loss": 2.8173, + "step": 25100 + }, + { + "epoch": 3.573966178769362, + "grad_norm": 1.6954437494277954, + "learning_rate": 0.00027612222222222224, + "loss": 2.8067, + "step": 25150 + }, + { + "epoch": 3.581071479323575, + "grad_norm": 1.9532020092010498, + "learning_rate": 0.0002755666666666667, + "loss": 2.8207, + "step": 25200 + }, + { + "epoch": 3.588176779877789, + "grad_norm": 1.652669072151184, + "learning_rate": 0.0002750111111111111, + "loss": 2.7992, + "step": 25250 + }, + { + "epoch": 3.5952820804320025, + "grad_norm": 1.7369633913040161, + "learning_rate": 0.00027445555555555555, + "loss": 2.8382, + "step": 25300 + }, + { + "epoch": 3.6023873809862157, + "grad_norm": 1.5968304872512817, + "learning_rate": 0.0002739, + "loss": 2.8245, + "step": 25350 + }, + { + "epoch": 3.609492681540429, + "grad_norm": 1.791428565979004, + "learning_rate": 0.0002733444444444444, + "loss": 2.8243, + "step": 25400 + }, + { + "epoch": 3.6165979820946426, + "grad_norm": 1.7652702331542969, + "learning_rate": 0.0002727888888888889, + "loss": 2.8047, + "step": 25450 + }, + { + "epoch": 3.623703282648856, + "grad_norm": 1.7601053714752197, + "learning_rate": 0.00027223333333333335, + "loss": 2.8247, + "step": 25500 + }, + { + "epoch": 3.6308085832030694, + "grad_norm": 1.7859609127044678, + "learning_rate": 0.0002716777777777778, + "loss": 2.8192, + "step": 25550 + }, + { + "epoch": 3.637913883757283, + "grad_norm": 1.5694724321365356, + "learning_rate": 0.00027112222222222223, + "loss": 2.8291, + "step": 25600 + }, + { + "epoch": 3.6450191843114963, + "grad_norm": 1.839003324508667, + "learning_rate": 0.00027056666666666667, + "loss": 2.8119, + "step": 25650 + }, + { + "epoch": 3.65212448486571, + "grad_norm": 1.8402965068817139, + "learning_rate": 0.0002700111111111111, + "loss": 2.8155, + "step": 25700 + }, + { + "epoch": 3.659229785419923, + "grad_norm": 1.7180988788604736, + "learning_rate": 0.00026945555555555554, + "loss": 2.7935, + "step": 25750 + }, + { + "epoch": 3.666335085974137, + "grad_norm": 1.6569797992706299, + "learning_rate": 0.0002689, + "loss": 2.8104, + "step": 25800 + }, + { + "epoch": 3.67344038652835, + "grad_norm": 1.7790822982788086, + "learning_rate": 0.00026834444444444447, + "loss": 2.7944, + "step": 25850 + }, + { + "epoch": 3.6805456870825637, + "grad_norm": 1.7186955213546753, + "learning_rate": 0.0002677888888888889, + "loss": 2.7895, + "step": 25900 + }, + { + "epoch": 3.687650987636777, + "grad_norm": 1.7662361860275269, + "learning_rate": 0.00026723333333333334, + "loss": 2.8101, + "step": 25950 + }, + { + "epoch": 3.6947562881909906, + "grad_norm": 1.7198424339294434, + "learning_rate": 0.0002666777777777778, + "loss": 2.7967, + "step": 26000 + }, + { + "epoch": 3.6947562881909906, + "eval_accuracy": 0.5110318660736084, + "eval_loss": 2.814082384109497, + "eval_runtime": 1.4763, + "eval_samples_per_second": 2546.889, + "eval_steps_per_second": 39.964, + "step": 26000 + }, + { + "epoch": 3.7018615887452038, + "grad_norm": 1.8493636846542358, + "learning_rate": 0.0002661222222222222, + "loss": 2.7912, + "step": 26050 + }, + { + "epoch": 3.7089668892994174, + "grad_norm": 1.9021114110946655, + "learning_rate": 0.00026556666666666665, + "loss": 2.7961, + "step": 26100 + }, + { + "epoch": 3.716072189853631, + "grad_norm": 1.756255030632019, + "learning_rate": 0.0002650111111111111, + "loss": 2.7835, + "step": 26150 + }, + { + "epoch": 3.7231774904078443, + "grad_norm": 1.7606886625289917, + "learning_rate": 0.0002644555555555556, + "loss": 2.8018, + "step": 26200 + }, + { + "epoch": 3.7302827909620575, + "grad_norm": 1.6332037448883057, + "learning_rate": 0.0002639, + "loss": 2.8003, + "step": 26250 + }, + { + "epoch": 3.737388091516271, + "grad_norm": 1.7682067155838013, + "learning_rate": 0.00026334444444444445, + "loss": 2.7961, + "step": 26300 + }, + { + "epoch": 3.744493392070485, + "grad_norm": 1.7425826787948608, + "learning_rate": 0.0002627888888888889, + "loss": 2.8068, + "step": 26350 + }, + { + "epoch": 3.751598692624698, + "grad_norm": 1.7267491817474365, + "learning_rate": 0.00026223333333333333, + "loss": 2.7612, + "step": 26400 + }, + { + "epoch": 3.7587039931789112, + "grad_norm": 1.5135743618011475, + "learning_rate": 0.00026167777777777777, + "loss": 2.7745, + "step": 26450 + }, + { + "epoch": 3.765809293733125, + "grad_norm": 1.8557426929473877, + "learning_rate": 0.0002611222222222222, + "loss": 2.7918, + "step": 26500 + }, + { + "epoch": 3.7729145942873386, + "grad_norm": 1.60994553565979, + "learning_rate": 0.0002605666666666667, + "loss": 2.7807, + "step": 26550 + }, + { + "epoch": 3.7800198948415518, + "grad_norm": 1.8855258226394653, + "learning_rate": 0.00026001111111111113, + "loss": 2.8044, + "step": 26600 + }, + { + "epoch": 3.787125195395765, + "grad_norm": 1.6651372909545898, + "learning_rate": 0.00025945555555555557, + "loss": 2.7805, + "step": 26650 + }, + { + "epoch": 3.7942304959499786, + "grad_norm": 1.8007607460021973, + "learning_rate": 0.0002589, + "loss": 2.7983, + "step": 26700 + }, + { + "epoch": 3.8013357965041923, + "grad_norm": 1.6439241170883179, + "learning_rate": 0.00025834444444444444, + "loss": 2.7896, + "step": 26750 + }, + { + "epoch": 3.8084410970584055, + "grad_norm": 1.8518551588058472, + "learning_rate": 0.00025778888888888893, + "loss": 2.756, + "step": 26800 + }, + { + "epoch": 3.815546397612619, + "grad_norm": 1.7910232543945312, + "learning_rate": 0.0002572333333333333, + "loss": 2.7825, + "step": 26850 + }, + { + "epoch": 3.8226516981668324, + "grad_norm": 1.678464651107788, + "learning_rate": 0.0002566777777777778, + "loss": 2.7798, + "step": 26900 + }, + { + "epoch": 3.829756998721046, + "grad_norm": 1.7954827547073364, + "learning_rate": 0.0002561222222222222, + "loss": 2.771, + "step": 26950 + }, + { + "epoch": 3.8368622992752592, + "grad_norm": 1.6391098499298096, + "learning_rate": 0.0002555666666666667, + "loss": 2.771, + "step": 27000 + }, + { + "epoch": 3.843967599829473, + "grad_norm": 1.8526887893676758, + "learning_rate": 0.0002550111111111111, + "loss": 2.7749, + "step": 27050 + }, + { + "epoch": 3.851072900383686, + "grad_norm": 1.9011675119400024, + "learning_rate": 0.00025445555555555555, + "loss": 2.7898, + "step": 27100 + }, + { + "epoch": 3.8581782009378998, + "grad_norm": 1.711075782775879, + "learning_rate": 0.00025390000000000005, + "loss": 2.7933, + "step": 27150 + }, + { + "epoch": 3.865283501492113, + "grad_norm": 1.9099894762039185, + "learning_rate": 0.00025334444444444443, + "loss": 2.7883, + "step": 27200 + }, + { + "epoch": 3.8723888020463266, + "grad_norm": 1.6617141962051392, + "learning_rate": 0.0002527888888888889, + "loss": 2.7768, + "step": 27250 + }, + { + "epoch": 3.87949410260054, + "grad_norm": 1.7626140117645264, + "learning_rate": 0.0002522333333333333, + "loss": 2.7703, + "step": 27300 + }, + { + "epoch": 3.8865994031547535, + "grad_norm": 1.8242555856704712, + "learning_rate": 0.0002516777777777778, + "loss": 2.7734, + "step": 27350 + }, + { + "epoch": 3.893704703708967, + "grad_norm": 1.8651511669158936, + "learning_rate": 0.00025112222222222223, + "loss": 2.7654, + "step": 27400 + }, + { + "epoch": 3.9008100042631804, + "grad_norm": 1.8385076522827148, + "learning_rate": 0.00025056666666666667, + "loss": 2.7626, + "step": 27450 + }, + { + "epoch": 3.9079153048173936, + "grad_norm": 1.7049630880355835, + "learning_rate": 0.00025001111111111116, + "loss": 2.7816, + "step": 27500 + }, + { + "epoch": 3.9150206053716072, + "grad_norm": 1.6070621013641357, + "learning_rate": 0.0002494555555555556, + "loss": 2.7825, + "step": 27550 + }, + { + "epoch": 3.922125905925821, + "grad_norm": 1.6320933103561401, + "learning_rate": 0.00024890000000000003, + "loss": 2.756, + "step": 27600 + }, + { + "epoch": 3.929231206480034, + "grad_norm": 1.8926113843917847, + "learning_rate": 0.00024834444444444447, + "loss": 2.779, + "step": 27650 + }, + { + "epoch": 3.9363365070342473, + "grad_norm": 1.6653364896774292, + "learning_rate": 0.0002477888888888889, + "loss": 2.8109, + "step": 27700 + }, + { + "epoch": 3.943441807588461, + "grad_norm": 1.8551247119903564, + "learning_rate": 0.00024723333333333334, + "loss": 2.75, + "step": 27750 + }, + { + "epoch": 3.9505471081426746, + "grad_norm": 1.8100675344467163, + "learning_rate": 0.0002466777777777778, + "loss": 2.7643, + "step": 27800 + }, + { + "epoch": 3.957652408696888, + "grad_norm": 1.6926288604736328, + "learning_rate": 0.0002461222222222222, + "loss": 2.7711, + "step": 27850 + }, + { + "epoch": 3.964757709251101, + "grad_norm": 1.6786293983459473, + "learning_rate": 0.00024556666666666665, + "loss": 2.7595, + "step": 27900 + }, + { + "epoch": 3.9718630098053147, + "grad_norm": 1.569153070449829, + "learning_rate": 0.00024501111111111115, + "loss": 2.7817, + "step": 27950 + }, + { + "epoch": 3.9789683103595284, + "grad_norm": 1.7269905805587769, + "learning_rate": 0.0002444555555555556, + "loss": 2.7554, + "step": 28000 + }, + { + "epoch": 3.9789683103595284, + "eval_accuracy": 0.5206505656242371, + "eval_loss": 2.7269434928894043, + "eval_runtime": 1.313, + "eval_samples_per_second": 2863.767, + "eval_steps_per_second": 44.937, + "step": 28000 + }, + { + "epoch": 3.9860736109137416, + "grad_norm": 1.8113017082214355, + "learning_rate": 0.00024390000000000002, + "loss": 2.7633, + "step": 28050 + }, + { + "epoch": 3.9931789114679552, + "grad_norm": 1.878679871559143, + "learning_rate": 0.00024334444444444446, + "loss": 2.7855, + "step": 28100 + }, + { + "epoch": 4.000284212022168, + "grad_norm": 1.683408260345459, + "learning_rate": 0.0002427888888888889, + "loss": 2.751, + "step": 28150 + }, + { + "epoch": 4.007389512576382, + "grad_norm": 1.6192328929901123, + "learning_rate": 0.00024223333333333333, + "loss": 2.7363, + "step": 28200 + }, + { + "epoch": 4.014494813130596, + "grad_norm": 1.7787748575210571, + "learning_rate": 0.00024167777777777777, + "loss": 2.7267, + "step": 28250 + }, + { + "epoch": 4.0216001136848085, + "grad_norm": 1.8885560035705566, + "learning_rate": 0.0002411222222222222, + "loss": 2.7409, + "step": 28300 + }, + { + "epoch": 4.028705414239022, + "grad_norm": 1.8589296340942383, + "learning_rate": 0.0002405666666666667, + "loss": 2.7602, + "step": 28350 + }, + { + "epoch": 4.035810714793236, + "grad_norm": 1.9907305240631104, + "learning_rate": 0.00024001111111111113, + "loss": 2.7435, + "step": 28400 + }, + { + "epoch": 4.0429160153474495, + "grad_norm": 1.7080715894699097, + "learning_rate": 0.00023945555555555557, + "loss": 2.7307, + "step": 28450 + }, + { + "epoch": 4.050021315901662, + "grad_norm": 1.65744948387146, + "learning_rate": 0.0002389, + "loss": 2.7272, + "step": 28500 + }, + { + "epoch": 4.057126616455876, + "grad_norm": 1.6736458539962769, + "learning_rate": 0.00023834444444444444, + "loss": 2.737, + "step": 28550 + }, + { + "epoch": 4.06423191701009, + "grad_norm": 1.7677834033966064, + "learning_rate": 0.00023778888888888888, + "loss": 2.7256, + "step": 28600 + }, + { + "epoch": 4.071337217564303, + "grad_norm": 1.7060825824737549, + "learning_rate": 0.00023723333333333332, + "loss": 2.7578, + "step": 28650 + }, + { + "epoch": 4.078442518118516, + "grad_norm": 1.5596199035644531, + "learning_rate": 0.00023667777777777778, + "loss": 2.7446, + "step": 28700 + }, + { + "epoch": 4.08554781867273, + "grad_norm": 1.7688475847244263, + "learning_rate": 0.00023612222222222225, + "loss": 2.7451, + "step": 28750 + }, + { + "epoch": 4.092653119226943, + "grad_norm": 1.6317684650421143, + "learning_rate": 0.00023556666666666668, + "loss": 2.7385, + "step": 28800 + }, + { + "epoch": 4.099758419781157, + "grad_norm": 1.7691974639892578, + "learning_rate": 0.00023501111111111112, + "loss": 2.7098, + "step": 28850 + }, + { + "epoch": 4.106863720335371, + "grad_norm": 1.7797774076461792, + "learning_rate": 0.00023445555555555556, + "loss": 2.709, + "step": 28900 + }, + { + "epoch": 4.113969020889583, + "grad_norm": 1.9547299146652222, + "learning_rate": 0.0002339, + "loss": 2.7476, + "step": 28950 + }, + { + "epoch": 4.121074321443797, + "grad_norm": 1.8123749494552612, + "learning_rate": 0.00023334444444444443, + "loss": 2.726, + "step": 29000 + }, + { + "epoch": 4.128179621998011, + "grad_norm": 1.8036295175552368, + "learning_rate": 0.0002327888888888889, + "loss": 2.7269, + "step": 29050 + }, + { + "epoch": 4.135284922552224, + "grad_norm": 1.7694047689437866, + "learning_rate": 0.00023223333333333336, + "loss": 2.728, + "step": 29100 + }, + { + "epoch": 4.142390223106437, + "grad_norm": 1.6515389680862427, + "learning_rate": 0.0002316777777777778, + "loss": 2.7279, + "step": 29150 + }, + { + "epoch": 4.149495523660651, + "grad_norm": 1.6699292659759521, + "learning_rate": 0.00023112222222222223, + "loss": 2.7281, + "step": 29200 + }, + { + "epoch": 4.156600824214864, + "grad_norm": 1.878513216972351, + "learning_rate": 0.00023056666666666667, + "loss": 2.7259, + "step": 29250 + }, + { + "epoch": 4.163706124769078, + "grad_norm": 1.5836261510849, + "learning_rate": 0.0002300111111111111, + "loss": 2.7269, + "step": 29300 + }, + { + "epoch": 4.170811425323291, + "grad_norm": 1.7325093746185303, + "learning_rate": 0.00022945555555555554, + "loss": 2.7192, + "step": 29350 + }, + { + "epoch": 4.1779167258775045, + "grad_norm": 1.793212652206421, + "learning_rate": 0.0002289, + "loss": 2.7374, + "step": 29400 + }, + { + "epoch": 4.185022026431718, + "grad_norm": 1.658182978630066, + "learning_rate": 0.00022834444444444444, + "loss": 2.7541, + "step": 29450 + }, + { + "epoch": 4.192127326985932, + "grad_norm": 1.9298884868621826, + "learning_rate": 0.0002277888888888889, + "loss": 2.7343, + "step": 29500 + }, + { + "epoch": 4.199232627540145, + "grad_norm": 1.6460927724838257, + "learning_rate": 0.00022723333333333335, + "loss": 2.7191, + "step": 29550 + }, + { + "epoch": 4.206337928094358, + "grad_norm": 1.6624342203140259, + "learning_rate": 0.00022667777777777778, + "loss": 2.7415, + "step": 29600 + }, + { + "epoch": 4.213443228648572, + "grad_norm": 1.710582971572876, + "learning_rate": 0.00022612222222222222, + "loss": 2.7037, + "step": 29650 + }, + { + "epoch": 4.220548529202786, + "grad_norm": 1.7348077297210693, + "learning_rate": 0.00022556666666666668, + "loss": 2.7315, + "step": 29700 + }, + { + "epoch": 4.227653829756998, + "grad_norm": 1.7268335819244385, + "learning_rate": 0.00022501111111111112, + "loss": 2.7443, + "step": 29750 + }, + { + "epoch": 4.234759130311212, + "grad_norm": 1.8017711639404297, + "learning_rate": 0.00022445555555555556, + "loss": 2.735, + "step": 29800 + }, + { + "epoch": 4.241864430865426, + "grad_norm": 1.776839017868042, + "learning_rate": 0.0002239, + "loss": 2.7009, + "step": 29850 + }, + { + "epoch": 4.248969731419639, + "grad_norm": 1.7648807764053345, + "learning_rate": 0.00022334444444444446, + "loss": 2.7173, + "step": 29900 + }, + { + "epoch": 4.256075031973852, + "grad_norm": 1.6997913122177124, + "learning_rate": 0.0002227888888888889, + "loss": 2.7113, + "step": 29950 + }, + { + "epoch": 4.263180332528066, + "grad_norm": 1.723713994026184, + "learning_rate": 0.00022223333333333333, + "loss": 2.7182, + "step": 30000 + }, + { + "epoch": 4.263180332528066, + "eval_accuracy": 0.529339075088501, + "eval_loss": 2.672091245651245, + "eval_runtime": 1.5216, + "eval_samples_per_second": 2471.011, + "eval_steps_per_second": 38.774, + "step": 30000 + }, + { + "epoch": 4.270285633082279, + "grad_norm": 1.6741389036178589, + "learning_rate": 0.0002216777777777778, + "loss": 2.6956, + "step": 30050 + }, + { + "epoch": 4.277390933636493, + "grad_norm": 1.7774298191070557, + "learning_rate": 0.00022112222222222223, + "loss": 2.6983, + "step": 30100 + }, + { + "epoch": 4.284496234190707, + "grad_norm": 1.742165207862854, + "learning_rate": 0.00022056666666666667, + "loss": 2.7294, + "step": 30150 + }, + { + "epoch": 4.2916015347449195, + "grad_norm": 1.7675608396530151, + "learning_rate": 0.0002200111111111111, + "loss": 2.6987, + "step": 30200 + }, + { + "epoch": 4.298706835299133, + "grad_norm": 1.7561299800872803, + "learning_rate": 0.00021945555555555554, + "loss": 2.7197, + "step": 30250 + }, + { + "epoch": 4.305812135853347, + "grad_norm": 1.7825928926467896, + "learning_rate": 0.0002189, + "loss": 2.7139, + "step": 30300 + }, + { + "epoch": 4.31291743640756, + "grad_norm": 2.1004021167755127, + "learning_rate": 0.00021834444444444445, + "loss": 2.7166, + "step": 30350 + }, + { + "epoch": 4.320022736961773, + "grad_norm": 1.8277966976165771, + "learning_rate": 0.0002177888888888889, + "loss": 2.6808, + "step": 30400 + }, + { + "epoch": 4.327128037515987, + "grad_norm": 1.7409828901290894, + "learning_rate": 0.00021723333333333335, + "loss": 2.7069, + "step": 30450 + }, + { + "epoch": 4.3342333380702005, + "grad_norm": 1.7453832626342773, + "learning_rate": 0.00021667777777777778, + "loss": 2.7218, + "step": 30500 + }, + { + "epoch": 4.341338638624414, + "grad_norm": 1.8581887483596802, + "learning_rate": 0.00021612222222222222, + "loss": 2.7126, + "step": 30550 + }, + { + "epoch": 4.348443939178627, + "grad_norm": 1.8236676454544067, + "learning_rate": 0.00021556666666666666, + "loss": 2.7147, + "step": 30600 + }, + { + "epoch": 4.355549239732841, + "grad_norm": 1.944787621498108, + "learning_rate": 0.0002150111111111111, + "loss": 2.724, + "step": 30650 + }, + { + "epoch": 4.362654540287054, + "grad_norm": 1.663387417793274, + "learning_rate": 0.00021445555555555556, + "loss": 2.7077, + "step": 30700 + }, + { + "epoch": 4.369759840841268, + "grad_norm": 1.8352930545806885, + "learning_rate": 0.00021390000000000002, + "loss": 2.6954, + "step": 30750 + }, + { + "epoch": 4.376865141395481, + "grad_norm": 1.635725736618042, + "learning_rate": 0.00021334444444444446, + "loss": 2.7108, + "step": 30800 + }, + { + "epoch": 4.383970441949694, + "grad_norm": 1.6727386713027954, + "learning_rate": 0.0002127888888888889, + "loss": 2.7002, + "step": 30850 + }, + { + "epoch": 4.391075742503908, + "grad_norm": 1.6804115772247314, + "learning_rate": 0.00021223333333333333, + "loss": 2.6961, + "step": 30900 + }, + { + "epoch": 4.398181043058122, + "grad_norm": 1.7196424007415771, + "learning_rate": 0.00021167777777777777, + "loss": 2.7234, + "step": 30950 + }, + { + "epoch": 4.405286343612334, + "grad_norm": 1.8474704027175903, + "learning_rate": 0.0002111222222222222, + "loss": 2.7161, + "step": 31000 + }, + { + "epoch": 4.412391644166548, + "grad_norm": 1.6778532266616821, + "learning_rate": 0.00021056666666666667, + "loss": 2.6654, + "step": 31050 + }, + { + "epoch": 4.419496944720762, + "grad_norm": 1.7179423570632935, + "learning_rate": 0.00021001111111111114, + "loss": 2.7152, + "step": 31100 + }, + { + "epoch": 4.426602245274975, + "grad_norm": 1.6991947889328003, + "learning_rate": 0.00020945555555555557, + "loss": 2.723, + "step": 31150 + }, + { + "epoch": 4.433707545829188, + "grad_norm": 1.705942153930664, + "learning_rate": 0.0002089, + "loss": 2.7024, + "step": 31200 + }, + { + "epoch": 4.440812846383402, + "grad_norm": 1.7053892612457275, + "learning_rate": 0.00020834444444444445, + "loss": 2.7086, + "step": 31250 + }, + { + "epoch": 4.4479181469376154, + "grad_norm": 1.735185146331787, + "learning_rate": 0.00020778888888888888, + "loss": 2.6899, + "step": 31300 + }, + { + "epoch": 4.455023447491829, + "grad_norm": 1.7392066717147827, + "learning_rate": 0.00020723333333333332, + "loss": 2.7156, + "step": 31350 + }, + { + "epoch": 4.462128748046043, + "grad_norm": 1.7509199380874634, + "learning_rate": 0.00020667777777777776, + "loss": 2.6833, + "step": 31400 + }, + { + "epoch": 4.4692340486002555, + "grad_norm": 1.877554178237915, + "learning_rate": 0.00020612222222222225, + "loss": 2.702, + "step": 31450 + }, + { + "epoch": 4.476339349154469, + "grad_norm": 1.859157681465149, + "learning_rate": 0.00020556666666666669, + "loss": 2.7088, + "step": 31500 + }, + { + "epoch": 4.483444649708683, + "grad_norm": 1.9033279418945312, + "learning_rate": 0.00020501111111111112, + "loss": 2.6754, + "step": 31550 + }, + { + "epoch": 4.4905499502628965, + "grad_norm": 1.8347678184509277, + "learning_rate": 0.00020445555555555556, + "loss": 2.6952, + "step": 31600 + }, + { + "epoch": 4.497655250817109, + "grad_norm": 1.8469839096069336, + "learning_rate": 0.0002039, + "loss": 2.6879, + "step": 31650 + }, + { + "epoch": 4.504760551371323, + "grad_norm": 1.740691065788269, + "learning_rate": 0.00020334444444444443, + "loss": 2.7068, + "step": 31700 + }, + { + "epoch": 4.511865851925537, + "grad_norm": 1.6777381896972656, + "learning_rate": 0.0002027888888888889, + "loss": 2.664, + "step": 31750 + }, + { + "epoch": 4.51897115247975, + "grad_norm": 1.709246039390564, + "learning_rate": 0.00020223333333333333, + "loss": 2.7154, + "step": 31800 + }, + { + "epoch": 4.526076453033963, + "grad_norm": 1.7748068571090698, + "learning_rate": 0.0002016777777777778, + "loss": 2.6943, + "step": 31850 + }, + { + "epoch": 4.533181753588177, + "grad_norm": 1.613324522972107, + "learning_rate": 0.00020112222222222223, + "loss": 2.6844, + "step": 31900 + }, + { + "epoch": 4.54028705414239, + "grad_norm": 1.6655322313308716, + "learning_rate": 0.00020056666666666667, + "loss": 2.6986, + "step": 31950 + }, + { + "epoch": 4.547392354696604, + "grad_norm": 1.8054392337799072, + "learning_rate": 0.0002000111111111111, + "loss": 2.6879, + "step": 32000 + }, + { + "epoch": 4.547392354696604, + "eval_accuracy": 0.5284602046012878, + "eval_loss": 2.64477801322937, + "eval_runtime": 1.3859, + "eval_samples_per_second": 2713.044, + "eval_steps_per_second": 42.572, + "step": 32000 + }, + { + "epoch": 4.554497655250817, + "grad_norm": 1.7321134805679321, + "learning_rate": 0.00019945555555555555, + "loss": 2.6896, + "step": 32050 + }, + { + "epoch": 4.56160295580503, + "grad_norm": 1.9145094156265259, + "learning_rate": 0.0001989, + "loss": 2.6834, + "step": 32100 + }, + { + "epoch": 4.568708256359244, + "grad_norm": 1.8805677890777588, + "learning_rate": 0.00019834444444444445, + "loss": 2.6949, + "step": 32150 + }, + { + "epoch": 4.575813556913458, + "grad_norm": 1.7424559593200684, + "learning_rate": 0.00019778888888888888, + "loss": 2.6867, + "step": 32200 + }, + { + "epoch": 4.582918857467671, + "grad_norm": 1.9168509244918823, + "learning_rate": 0.00019723333333333335, + "loss": 2.6895, + "step": 32250 + }, + { + "epoch": 4.590024158021884, + "grad_norm": 1.6305192708969116, + "learning_rate": 0.00019667777777777778, + "loss": 2.6829, + "step": 32300 + }, + { + "epoch": 4.597129458576098, + "grad_norm": 1.5996636152267456, + "learning_rate": 0.00019612222222222222, + "loss": 2.6894, + "step": 32350 + }, + { + "epoch": 4.604234759130311, + "grad_norm": 1.7526198625564575, + "learning_rate": 0.00019556666666666666, + "loss": 2.6857, + "step": 32400 + }, + { + "epoch": 4.611340059684524, + "grad_norm": 1.8293813467025757, + "learning_rate": 0.00019501111111111112, + "loss": 2.6659, + "step": 32450 + }, + { + "epoch": 4.618445360238738, + "grad_norm": 1.8178389072418213, + "learning_rate": 0.00019445555555555556, + "loss": 2.6877, + "step": 32500 + }, + { + "epoch": 4.6255506607929515, + "grad_norm": 1.628410816192627, + "learning_rate": 0.0001939, + "loss": 2.699, + "step": 32550 + }, + { + "epoch": 4.632655961347165, + "grad_norm": 1.7101975679397583, + "learning_rate": 0.00019334444444444446, + "loss": 2.6754, + "step": 32600 + }, + { + "epoch": 4.639761261901379, + "grad_norm": 1.6504610776901245, + "learning_rate": 0.0001927888888888889, + "loss": 2.6874, + "step": 32650 + }, + { + "epoch": 4.646866562455592, + "grad_norm": 1.7311458587646484, + "learning_rate": 0.00019223333333333333, + "loss": 2.6898, + "step": 32700 + }, + { + "epoch": 4.653971863009805, + "grad_norm": 1.9511997699737549, + "learning_rate": 0.00019167777777777777, + "loss": 2.6902, + "step": 32750 + }, + { + "epoch": 4.661077163564019, + "grad_norm": 1.7774189710617065, + "learning_rate": 0.00019112222222222224, + "loss": 2.6501, + "step": 32800 + }, + { + "epoch": 4.668182464118233, + "grad_norm": 1.8208081722259521, + "learning_rate": 0.00019056666666666667, + "loss": 2.6812, + "step": 32850 + }, + { + "epoch": 4.675287764672445, + "grad_norm": 1.7172284126281738, + "learning_rate": 0.0001900111111111111, + "loss": 2.6661, + "step": 32900 + }, + { + "epoch": 4.682393065226659, + "grad_norm": 1.98916494846344, + "learning_rate": 0.00018945555555555555, + "loss": 2.6739, + "step": 32950 + }, + { + "epoch": 4.689498365780873, + "grad_norm": 1.7716188430786133, + "learning_rate": 0.0001889, + "loss": 2.6826, + "step": 33000 + }, + { + "epoch": 4.696603666335086, + "grad_norm": 1.7136895656585693, + "learning_rate": 0.00018834444444444445, + "loss": 2.6791, + "step": 33050 + }, + { + "epoch": 4.703708966889299, + "grad_norm": 1.8110146522521973, + "learning_rate": 0.00018778888888888888, + "loss": 2.6942, + "step": 33100 + }, + { + "epoch": 4.710814267443513, + "grad_norm": 1.9212117195129395, + "learning_rate": 0.00018723333333333335, + "loss": 2.6818, + "step": 33150 + }, + { + "epoch": 4.717919567997726, + "grad_norm": 1.7386603355407715, + "learning_rate": 0.00018667777777777779, + "loss": 2.6818, + "step": 33200 + }, + { + "epoch": 4.72502486855194, + "grad_norm": 1.7467000484466553, + "learning_rate": 0.00018612222222222222, + "loss": 2.684, + "step": 33250 + }, + { + "epoch": 4.732130169106153, + "grad_norm": 1.8801017999649048, + "learning_rate": 0.00018556666666666666, + "loss": 2.668, + "step": 33300 + }, + { + "epoch": 4.7392354696603665, + "grad_norm": 1.751801609992981, + "learning_rate": 0.0001850111111111111, + "loss": 2.66, + "step": 33350 + }, + { + "epoch": 4.74634077021458, + "grad_norm": 1.9448400735855103, + "learning_rate": 0.00018445555555555556, + "loss": 2.6685, + "step": 33400 + }, + { + "epoch": 4.753446070768794, + "grad_norm": 1.7627147436141968, + "learning_rate": 0.00018390000000000002, + "loss": 2.6803, + "step": 33450 + }, + { + "epoch": 4.760551371323007, + "grad_norm": 1.7280786037445068, + "learning_rate": 0.00018334444444444446, + "loss": 2.6735, + "step": 33500 + }, + { + "epoch": 4.76765667187722, + "grad_norm": 1.8190041780471802, + "learning_rate": 0.0001827888888888889, + "loss": 2.6678, + "step": 33550 + }, + { + "epoch": 4.774761972431434, + "grad_norm": 1.735298991203308, + "learning_rate": 0.00018223333333333334, + "loss": 2.6547, + "step": 33600 + }, + { + "epoch": 4.7818672729856475, + "grad_norm": 1.7868086099624634, + "learning_rate": 0.00018167777777777777, + "loss": 2.6876, + "step": 33650 + }, + { + "epoch": 4.78897257353986, + "grad_norm": 1.749411940574646, + "learning_rate": 0.0001811222222222222, + "loss": 2.6575, + "step": 33700 + }, + { + "epoch": 4.796077874094074, + "grad_norm": 1.6397782564163208, + "learning_rate": 0.00018056666666666665, + "loss": 2.6762, + "step": 33750 + }, + { + "epoch": 4.803183174648288, + "grad_norm": 1.6440680027008057, + "learning_rate": 0.00018001111111111114, + "loss": 2.6445, + "step": 33800 + }, + { + "epoch": 4.810288475202501, + "grad_norm": 1.664414405822754, + "learning_rate": 0.00017945555555555557, + "loss": 2.6687, + "step": 33850 + }, + { + "epoch": 4.817393775756715, + "grad_norm": 1.6941622495651245, + "learning_rate": 0.0001789, + "loss": 2.6661, + "step": 33900 + }, + { + "epoch": 4.824499076310928, + "grad_norm": 1.8345458507537842, + "learning_rate": 0.00017834444444444445, + "loss": 2.6758, + "step": 33950 + }, + { + "epoch": 4.831604376865141, + "grad_norm": 1.7914514541625977, + "learning_rate": 0.00017778888888888889, + "loss": 2.6659, + "step": 34000 + }, + { + "epoch": 4.831604376865141, + "eval_accuracy": 0.5343063473701477, + "eval_loss": 2.6134769916534424, + "eval_runtime": 1.4838, + "eval_samples_per_second": 2534.091, + "eval_steps_per_second": 39.764, + "step": 34000 + }, + { + "epoch": 4.838709677419355, + "grad_norm": 1.7636709213256836, + "learning_rate": 0.00017723333333333332, + "loss": 2.6714, + "step": 34050 + }, + { + "epoch": 4.845814977973569, + "grad_norm": 1.712996244430542, + "learning_rate": 0.00017667777777777776, + "loss": 2.6592, + "step": 34100 + }, + { + "epoch": 4.852920278527781, + "grad_norm": 1.8048306703567505, + "learning_rate": 0.00017612222222222225, + "loss": 2.6626, + "step": 34150 + }, + { + "epoch": 4.860025579081995, + "grad_norm": 1.6417750120162964, + "learning_rate": 0.0001755666666666667, + "loss": 2.6569, + "step": 34200 + }, + { + "epoch": 4.867130879636209, + "grad_norm": 1.678472638130188, + "learning_rate": 0.00017501111111111112, + "loss": 2.6508, + "step": 34250 + }, + { + "epoch": 4.874236180190422, + "grad_norm": 1.9373923540115356, + "learning_rate": 0.00017445555555555556, + "loss": 2.6428, + "step": 34300 + }, + { + "epoch": 4.881341480744635, + "grad_norm": 1.6744999885559082, + "learning_rate": 0.0001739, + "loss": 2.6595, + "step": 34350 + }, + { + "epoch": 4.888446781298849, + "grad_norm": 1.796247959136963, + "learning_rate": 0.00017334444444444444, + "loss": 2.6739, + "step": 34400 + }, + { + "epoch": 4.8955520818530625, + "grad_norm": 1.8660869598388672, + "learning_rate": 0.00017278888888888887, + "loss": 2.6538, + "step": 34450 + }, + { + "epoch": 4.902657382407276, + "grad_norm": 1.6597950458526611, + "learning_rate": 0.00017223333333333334, + "loss": 2.6417, + "step": 34500 + }, + { + "epoch": 4.909762682961489, + "grad_norm": 1.6842671632766724, + "learning_rate": 0.0001716777777777778, + "loss": 2.6521, + "step": 34550 + }, + { + "epoch": 4.9168679835157025, + "grad_norm": 1.8126052618026733, + "learning_rate": 0.00017112222222222224, + "loss": 2.6465, + "step": 34600 + }, + { + "epoch": 4.923973284069916, + "grad_norm": 1.8387752771377563, + "learning_rate": 0.00017056666666666667, + "loss": 2.6434, + "step": 34650 + }, + { + "epoch": 4.93107858462413, + "grad_norm": 1.7043359279632568, + "learning_rate": 0.0001700111111111111, + "loss": 2.6521, + "step": 34700 + }, + { + "epoch": 4.9381838851783435, + "grad_norm": 1.7130815982818604, + "learning_rate": 0.00016945555555555555, + "loss": 2.6609, + "step": 34750 + }, + { + "epoch": 4.945289185732556, + "grad_norm": 1.6739017963409424, + "learning_rate": 0.00016889999999999999, + "loss": 2.6231, + "step": 34800 + }, + { + "epoch": 4.95239448628677, + "grad_norm": 1.6555219888687134, + "learning_rate": 0.00016834444444444445, + "loss": 2.6672, + "step": 34850 + }, + { + "epoch": 4.959499786840984, + "grad_norm": 1.7497371435165405, + "learning_rate": 0.0001677888888888889, + "loss": 2.6369, + "step": 34900 + }, + { + "epoch": 4.966605087395196, + "grad_norm": 1.7139822244644165, + "learning_rate": 0.00016723333333333335, + "loss": 2.6625, + "step": 34950 + }, + { + "epoch": 4.97371038794941, + "grad_norm": 1.7247073650360107, + "learning_rate": 0.0001666777777777778, + "loss": 2.6793, + "step": 35000 + }, + { + "epoch": 4.980815688503624, + "grad_norm": 1.792324185371399, + "learning_rate": 0.00016612222222222222, + "loss": 2.6511, + "step": 35050 + }, + { + "epoch": 4.987920989057837, + "grad_norm": 1.7975879907608032, + "learning_rate": 0.00016556666666666666, + "loss": 2.6459, + "step": 35100 + }, + { + "epoch": 4.995026289612051, + "grad_norm": 1.6427263021469116, + "learning_rate": 0.0001650111111111111, + "loss": 2.6298, + "step": 35150 + }, + { + "epoch": 5.002131590166264, + "grad_norm": 1.7684862613677979, + "learning_rate": 0.00016445555555555556, + "loss": 2.6456, + "step": 35200 + }, + { + "epoch": 5.009236890720477, + "grad_norm": 1.6923253536224365, + "learning_rate": 0.0001639, + "loss": 2.6107, + "step": 35250 + }, + { + "epoch": 5.016342191274691, + "grad_norm": 1.7871248722076416, + "learning_rate": 0.00016334444444444444, + "loss": 2.6249, + "step": 35300 + }, + { + "epoch": 5.023447491828905, + "grad_norm": 1.738742709159851, + "learning_rate": 0.0001627888888888889, + "loss": 2.6468, + "step": 35350 + }, + { + "epoch": 5.0305527923831175, + "grad_norm": 1.688292384147644, + "learning_rate": 0.00016223333333333334, + "loss": 2.632, + "step": 35400 + }, + { + "epoch": 5.037658092937331, + "grad_norm": 1.7152246236801147, + "learning_rate": 0.00016167777777777777, + "loss": 2.6373, + "step": 35450 + }, + { + "epoch": 5.044763393491545, + "grad_norm": 1.6816190481185913, + "learning_rate": 0.00016112222222222224, + "loss": 2.631, + "step": 35500 + }, + { + "epoch": 5.0518686940457584, + "grad_norm": 1.9319465160369873, + "learning_rate": 0.00016056666666666668, + "loss": 2.5949, + "step": 35550 + }, + { + "epoch": 5.058973994599971, + "grad_norm": 1.7346752882003784, + "learning_rate": 0.0001600111111111111, + "loss": 2.6062, + "step": 35600 + }, + { + "epoch": 5.066079295154185, + "grad_norm": 1.95259428024292, + "learning_rate": 0.00015945555555555555, + "loss": 2.6281, + "step": 35650 + }, + { + "epoch": 5.0731845957083985, + "grad_norm": 1.9597879648208618, + "learning_rate": 0.0001589, + "loss": 2.6211, + "step": 35700 + }, + { + "epoch": 5.080289896262612, + "grad_norm": 1.7877590656280518, + "learning_rate": 0.00015834444444444445, + "loss": 2.6229, + "step": 35750 + }, + { + "epoch": 5.087395196816825, + "grad_norm": 1.8384203910827637, + "learning_rate": 0.0001577888888888889, + "loss": 2.6415, + "step": 35800 + }, + { + "epoch": 5.094500497371039, + "grad_norm": 1.7532262802124023, + "learning_rate": 0.00015723333333333335, + "loss": 2.6277, + "step": 35850 + }, + { + "epoch": 5.101605797925252, + "grad_norm": 1.8662482500076294, + "learning_rate": 0.0001566777777777778, + "loss": 2.6251, + "step": 35900 + }, + { + "epoch": 5.108711098479466, + "grad_norm": 1.817358374595642, + "learning_rate": 0.00015612222222222223, + "loss": 2.6221, + "step": 35950 + }, + { + "epoch": 5.11581639903368, + "grad_norm": 1.6843544244766235, + "learning_rate": 0.00015556666666666666, + "loss": 2.6143, + "step": 36000 + }, + { + "epoch": 5.11581639903368, + "eval_accuracy": 0.5344390869140625, + "eval_loss": 2.5895192623138428, + "eval_runtime": 1.3175, + "eval_samples_per_second": 2853.919, + "eval_steps_per_second": 44.782, + "step": 36000 + }, + { + "epoch": 5.122921699587892, + "grad_norm": 1.6144895553588867, + "learning_rate": 0.0001550111111111111, + "loss": 2.628, + "step": 36050 + }, + { + "epoch": 5.130027000142106, + "grad_norm": 1.7719066143035889, + "learning_rate": 0.00015445555555555556, + "loss": 2.622, + "step": 36100 + }, + { + "epoch": 5.13713230069632, + "grad_norm": 1.775813102722168, + "learning_rate": 0.0001539, + "loss": 2.629, + "step": 36150 + }, + { + "epoch": 5.144237601250533, + "grad_norm": 1.7993078231811523, + "learning_rate": 0.00015334444444444446, + "loss": 2.6059, + "step": 36200 + }, + { + "epoch": 5.151342901804746, + "grad_norm": 1.6277464628219604, + "learning_rate": 0.0001527888888888889, + "loss": 2.6393, + "step": 36250 + }, + { + "epoch": 5.15844820235896, + "grad_norm": 1.6855401992797852, + "learning_rate": 0.00015223333333333334, + "loss": 2.616, + "step": 36300 + }, + { + "epoch": 5.165553502913173, + "grad_norm": 1.7239676713943481, + "learning_rate": 0.00015167777777777778, + "loss": 2.6438, + "step": 36350 + }, + { + "epoch": 5.172658803467387, + "grad_norm": 1.7113441228866577, + "learning_rate": 0.0001511222222222222, + "loss": 2.5931, + "step": 36400 + }, + { + "epoch": 5.1797641040216, + "grad_norm": 1.7422834634780884, + "learning_rate": 0.00015056666666666665, + "loss": 2.6281, + "step": 36450 + }, + { + "epoch": 5.1868694045758135, + "grad_norm": 1.7386971712112427, + "learning_rate": 0.0001500111111111111, + "loss": 2.6301, + "step": 36500 + }, + { + "epoch": 5.193974705130027, + "grad_norm": 1.8851107358932495, + "learning_rate": 0.00014945555555555558, + "loss": 2.623, + "step": 36550 + }, + { + "epoch": 5.201080005684241, + "grad_norm": 1.901906967163086, + "learning_rate": 0.00014890000000000001, + "loss": 2.5805, + "step": 36600 + }, + { + "epoch": 5.2081853062384535, + "grad_norm": 1.6902124881744385, + "learning_rate": 0.00014834444444444445, + "loss": 2.6301, + "step": 36650 + }, + { + "epoch": 5.215290606792667, + "grad_norm": 1.8009217977523804, + "learning_rate": 0.0001477888888888889, + "loss": 2.5984, + "step": 36700 + }, + { + "epoch": 5.222395907346881, + "grad_norm": 1.7949461936950684, + "learning_rate": 0.00014723333333333333, + "loss": 2.612, + "step": 36750 + }, + { + "epoch": 5.2295012079010945, + "grad_norm": 1.8531763553619385, + "learning_rate": 0.00014667777777777776, + "loss": 2.6009, + "step": 36800 + }, + { + "epoch": 5.236606508455307, + "grad_norm": 1.9773792028427124, + "learning_rate": 0.0001461222222222222, + "loss": 2.6264, + "step": 36850 + }, + { + "epoch": 5.243711809009521, + "grad_norm": 1.7207331657409668, + "learning_rate": 0.0001455666666666667, + "loss": 2.6109, + "step": 36900 + }, + { + "epoch": 5.250817109563735, + "grad_norm": 1.6260844469070435, + "learning_rate": 0.00014501111111111113, + "loss": 2.6107, + "step": 36950 + }, + { + "epoch": 5.257922410117948, + "grad_norm": 1.8471879959106445, + "learning_rate": 0.00014445555555555556, + "loss": 2.6236, + "step": 37000 + }, + { + "epoch": 5.265027710672161, + "grad_norm": 1.7721649408340454, + "learning_rate": 0.0001439, + "loss": 2.6253, + "step": 37050 + }, + { + "epoch": 5.272133011226375, + "grad_norm": 1.9177438020706177, + "learning_rate": 0.00014334444444444444, + "loss": 2.6143, + "step": 37100 + }, + { + "epoch": 5.279238311780588, + "grad_norm": 1.777573823928833, + "learning_rate": 0.00014278888888888888, + "loss": 2.6128, + "step": 37150 + }, + { + "epoch": 5.286343612334802, + "grad_norm": 1.6359736919403076, + "learning_rate": 0.00014223333333333334, + "loss": 2.602, + "step": 37200 + }, + { + "epoch": 5.293448912889016, + "grad_norm": 1.793562412261963, + "learning_rate": 0.0001416777777777778, + "loss": 2.6356, + "step": 37250 + }, + { + "epoch": 5.300554213443228, + "grad_norm": 1.7116947174072266, + "learning_rate": 0.00014112222222222224, + "loss": 2.6169, + "step": 37300 + }, + { + "epoch": 5.307659513997442, + "grad_norm": 1.9178073406219482, + "learning_rate": 0.00014056666666666668, + "loss": 2.6071, + "step": 37350 + }, + { + "epoch": 5.314764814551656, + "grad_norm": 1.7767285108566284, + "learning_rate": 0.00014001111111111111, + "loss": 2.6182, + "step": 37400 + }, + { + "epoch": 5.321870115105869, + "grad_norm": 1.8449881076812744, + "learning_rate": 0.00013945555555555555, + "loss": 2.6065, + "step": 37450 + }, + { + "epoch": 5.328975415660082, + "grad_norm": 1.814612865447998, + "learning_rate": 0.0001389, + "loss": 2.6139, + "step": 37500 + }, + { + "epoch": 5.336080716214296, + "grad_norm": 1.7973238229751587, + "learning_rate": 0.00013834444444444445, + "loss": 2.6266, + "step": 37550 + }, + { + "epoch": 5.3431860167685095, + "grad_norm": 1.525099277496338, + "learning_rate": 0.0001377888888888889, + "loss": 2.5997, + "step": 37600 + }, + { + "epoch": 5.350291317322723, + "grad_norm": 1.7055003643035889, + "learning_rate": 0.00013723333333333335, + "loss": 2.6203, + "step": 37650 + }, + { + "epoch": 5.357396617876936, + "grad_norm": 1.6953563690185547, + "learning_rate": 0.0001366777777777778, + "loss": 2.6076, + "step": 37700 + }, + { + "epoch": 5.3645019184311495, + "grad_norm": 1.8244370222091675, + "learning_rate": 0.00013612222222222223, + "loss": 2.5991, + "step": 37750 + }, + { + "epoch": 5.371607218985363, + "grad_norm": 1.956551194190979, + "learning_rate": 0.00013556666666666666, + "loss": 2.6084, + "step": 37800 + }, + { + "epoch": 5.378712519539577, + "grad_norm": 1.725568413734436, + "learning_rate": 0.0001350111111111111, + "loss": 2.5965, + "step": 37850 + }, + { + "epoch": 5.38581782009379, + "grad_norm": 1.644600749015808, + "learning_rate": 0.00013445555555555557, + "loss": 2.5927, + "step": 37900 + }, + { + "epoch": 5.392923120648003, + "grad_norm": 1.8263912200927734, + "learning_rate": 0.0001339, + "loss": 2.5936, + "step": 37950 + }, + { + "epoch": 5.400028421202217, + "grad_norm": 1.664300799369812, + "learning_rate": 0.00013334444444444444, + "loss": 2.6121, + "step": 38000 + }, + { + "epoch": 5.400028421202217, + "eval_accuracy": 0.5335931181907654, + "eval_loss": 2.589524030685425, + "eval_runtime": 1.4723, + "eval_samples_per_second": 2553.85, + "eval_steps_per_second": 40.074, + "step": 38000 + }, + { + "epoch": 5.407133721756431, + "grad_norm": 1.7988592386245728, + "learning_rate": 0.0001327888888888889, + "loss": 2.5998, + "step": 38050 + }, + { + "epoch": 5.414239022310643, + "grad_norm": 1.78681480884552, + "learning_rate": 0.00013223333333333334, + "loss": 2.5973, + "step": 38100 + }, + { + "epoch": 5.421344322864857, + "grad_norm": 1.8385677337646484, + "learning_rate": 0.00013167777777777778, + "loss": 2.5859, + "step": 38150 + }, + { + "epoch": 5.428449623419071, + "grad_norm": 1.695863962173462, + "learning_rate": 0.00013112222222222221, + "loss": 2.5974, + "step": 38200 + }, + { + "epoch": 5.435554923973284, + "grad_norm": 1.8921480178833008, + "learning_rate": 0.00013056666666666668, + "loss": 2.5927, + "step": 38250 + }, + { + "epoch": 5.442660224527497, + "grad_norm": 1.7668559551239014, + "learning_rate": 0.00013001111111111112, + "loss": 2.5875, + "step": 38300 + }, + { + "epoch": 5.449765525081711, + "grad_norm": 1.7700510025024414, + "learning_rate": 0.00012945555555555555, + "loss": 2.6111, + "step": 38350 + }, + { + "epoch": 5.456870825635924, + "grad_norm": 1.6881382465362549, + "learning_rate": 0.0001289, + "loss": 2.6026, + "step": 38400 + }, + { + "epoch": 5.463976126190138, + "grad_norm": 1.8298293352127075, + "learning_rate": 0.00012834444444444445, + "loss": 2.6006, + "step": 38450 + }, + { + "epoch": 5.471081426744352, + "grad_norm": 1.6942826509475708, + "learning_rate": 0.0001277888888888889, + "loss": 2.6035, + "step": 38500 + }, + { + "epoch": 5.4781867272985645, + "grad_norm": 1.8194513320922852, + "learning_rate": 0.00012723333333333333, + "loss": 2.5884, + "step": 38550 + }, + { + "epoch": 5.485292027852778, + "grad_norm": 1.8685965538024902, + "learning_rate": 0.0001266777777777778, + "loss": 2.5953, + "step": 38600 + }, + { + "epoch": 5.492397328406992, + "grad_norm": 1.7005841732025146, + "learning_rate": 0.00012612222222222223, + "loss": 2.5994, + "step": 38650 + }, + { + "epoch": 5.4995026289612055, + "grad_norm": 1.833150029182434, + "learning_rate": 0.00012556666666666666, + "loss": 2.5881, + "step": 38700 + }, + { + "epoch": 5.506607929515418, + "grad_norm": 1.691675066947937, + "learning_rate": 0.0001250111111111111, + "loss": 2.6069, + "step": 38750 + }, + { + "epoch": 5.513713230069632, + "grad_norm": 1.769320011138916, + "learning_rate": 0.00012445555555555557, + "loss": 2.5756, + "step": 38800 + }, + { + "epoch": 5.5208185306238455, + "grad_norm": 1.6686408519744873, + "learning_rate": 0.0001239, + "loss": 2.5906, + "step": 38850 + }, + { + "epoch": 5.527923831178059, + "grad_norm": 1.6487681865692139, + "learning_rate": 0.00012334444444444447, + "loss": 2.5795, + "step": 38900 + }, + { + "epoch": 5.535029131732272, + "grad_norm": 1.5991772413253784, + "learning_rate": 0.0001227888888888889, + "loss": 2.5886, + "step": 38950 + }, + { + "epoch": 5.542134432286486, + "grad_norm": 1.8373521566390991, + "learning_rate": 0.00012223333333333334, + "loss": 2.5726, + "step": 39000 + }, + { + "epoch": 5.549239732840699, + "grad_norm": 1.832866907119751, + "learning_rate": 0.00012167777777777778, + "loss": 2.5869, + "step": 39050 + }, + { + "epoch": 5.556345033394913, + "grad_norm": 1.6868762969970703, + "learning_rate": 0.00012112222222222223, + "loss": 2.5834, + "step": 39100 + }, + { + "epoch": 5.563450333949126, + "grad_norm": 1.7114180326461792, + "learning_rate": 0.00012056666666666667, + "loss": 2.5955, + "step": 39150 + }, + { + "epoch": 5.570555634503339, + "grad_norm": 1.8619048595428467, + "learning_rate": 0.00012001111111111112, + "loss": 2.618, + "step": 39200 + }, + { + "epoch": 5.577660935057553, + "grad_norm": 1.9599003791809082, + "learning_rate": 0.00011945555555555555, + "loss": 2.5879, + "step": 39250 + }, + { + "epoch": 5.584766235611767, + "grad_norm": 1.8127872943878174, + "learning_rate": 0.0001189, + "loss": 2.5773, + "step": 39300 + }, + { + "epoch": 5.59187153616598, + "grad_norm": 1.6214098930358887, + "learning_rate": 0.00011834444444444445, + "loss": 2.5677, + "step": 39350 + }, + { + "epoch": 5.598976836720193, + "grad_norm": 1.8787380456924438, + "learning_rate": 0.00011778888888888889, + "loss": 2.5936, + "step": 39400 + }, + { + "epoch": 5.606082137274407, + "grad_norm": 1.7826387882232666, + "learning_rate": 0.00011723333333333333, + "loss": 2.5734, + "step": 39450 + }, + { + "epoch": 5.61318743782862, + "grad_norm": 1.6517889499664307, + "learning_rate": 0.00011667777777777779, + "loss": 2.5787, + "step": 39500 + }, + { + "epoch": 5.620292738382833, + "grad_norm": 1.9160776138305664, + "learning_rate": 0.00011612222222222223, + "loss": 2.5911, + "step": 39550 + }, + { + "epoch": 5.627398038937047, + "grad_norm": 1.7249836921691895, + "learning_rate": 0.00011556666666666667, + "loss": 2.5813, + "step": 39600 + }, + { + "epoch": 5.6345033394912605, + "grad_norm": 1.815263032913208, + "learning_rate": 0.0001150111111111111, + "loss": 2.5825, + "step": 39650 + }, + { + "epoch": 5.641608640045474, + "grad_norm": 1.912611722946167, + "learning_rate": 0.00011445555555555557, + "loss": 2.5846, + "step": 39700 + }, + { + "epoch": 5.648713940599688, + "grad_norm": 1.7393444776535034, + "learning_rate": 0.0001139, + "loss": 2.5919, + "step": 39750 + }, + { + "epoch": 5.655819241153901, + "grad_norm": 1.740699291229248, + "learning_rate": 0.00011334444444444444, + "loss": 2.5997, + "step": 39800 + }, + { + "epoch": 5.662924541708114, + "grad_norm": 1.7837730646133423, + "learning_rate": 0.00011278888888888889, + "loss": 2.5815, + "step": 39850 + }, + { + "epoch": 5.670029842262328, + "grad_norm": 1.9134184122085571, + "learning_rate": 0.00011223333333333334, + "loss": 2.5646, + "step": 39900 + }, + { + "epoch": 5.6771351428165415, + "grad_norm": 1.7678228616714478, + "learning_rate": 0.00011167777777777778, + "loss": 2.5827, + "step": 39950 + }, + { + "epoch": 5.684240443370754, + "grad_norm": 1.7933886051177979, + "learning_rate": 0.00011112222222222222, + "loss": 2.5639, + "step": 40000 + }, + { + "epoch": 5.684240443370754, + "eval_accuracy": 0.5388505458831787, + "eval_loss": 2.571995496749878, + "eval_runtime": 1.3688, + "eval_samples_per_second": 2747.025, + "eval_steps_per_second": 43.105, + "step": 40000 + }, + { + "epoch": 5.691345743924968, + "grad_norm": 1.7596800327301025, + "learning_rate": 0.00011056666666666667, + "loss": 2.5873, + "step": 40050 + }, + { + "epoch": 5.698451044479182, + "grad_norm": 1.852180004119873, + "learning_rate": 0.00011001111111111112, + "loss": 2.5909, + "step": 40100 + }, + { + "epoch": 5.705556345033395, + "grad_norm": 1.6055049896240234, + "learning_rate": 0.00010945555555555555, + "loss": 2.568, + "step": 40150 + }, + { + "epoch": 5.712661645587608, + "grad_norm": 1.9267030954360962, + "learning_rate": 0.0001089, + "loss": 2.5681, + "step": 40200 + }, + { + "epoch": 5.719766946141822, + "grad_norm": 1.663442850112915, + "learning_rate": 0.00010834444444444444, + "loss": 2.5927, + "step": 40250 + }, + { + "epoch": 5.726872246696035, + "grad_norm": 1.6687923669815063, + "learning_rate": 0.00010778888888888889, + "loss": 2.5585, + "step": 40300 + }, + { + "epoch": 5.733977547250249, + "grad_norm": 1.64219331741333, + "learning_rate": 0.00010723333333333334, + "loss": 2.6002, + "step": 40350 + }, + { + "epoch": 5.741082847804462, + "grad_norm": 1.7011926174163818, + "learning_rate": 0.00010667777777777778, + "loss": 2.5828, + "step": 40400 + }, + { + "epoch": 5.748188148358675, + "grad_norm": 1.7362624406814575, + "learning_rate": 0.00010612222222222223, + "loss": 2.5832, + "step": 40450 + }, + { + "epoch": 5.755293448912889, + "grad_norm": 1.7169482707977295, + "learning_rate": 0.00010556666666666667, + "loss": 2.5808, + "step": 40500 + }, + { + "epoch": 5.762398749467103, + "grad_norm": 1.7829616069793701, + "learning_rate": 0.00010501111111111112, + "loss": 2.566, + "step": 40550 + }, + { + "epoch": 5.769504050021316, + "grad_norm": 1.8037691116333008, + "learning_rate": 0.00010445555555555555, + "loss": 2.5777, + "step": 40600 + }, + { + "epoch": 5.776609350575529, + "grad_norm": 1.8303767442703247, + "learning_rate": 0.0001039, + "loss": 2.5691, + "step": 40650 + }, + { + "epoch": 5.783714651129743, + "grad_norm": 1.6861969232559204, + "learning_rate": 0.00010334444444444446, + "loss": 2.5929, + "step": 40700 + }, + { + "epoch": 5.7908199516839565, + "grad_norm": 2.0748815536499023, + "learning_rate": 0.00010278888888888889, + "loss": 2.5749, + "step": 40750 + }, + { + "epoch": 5.797925252238169, + "grad_norm": 1.96970534324646, + "learning_rate": 0.00010223333333333333, + "loss": 2.5705, + "step": 40800 + }, + { + "epoch": 5.805030552792383, + "grad_norm": 1.6883175373077393, + "learning_rate": 0.00010167777777777778, + "loss": 2.5644, + "step": 40850 + }, + { + "epoch": 5.8121358533465965, + "grad_norm": 1.8087552785873413, + "learning_rate": 0.00010112222222222223, + "loss": 2.5855, + "step": 40900 + }, + { + "epoch": 5.81924115390081, + "grad_norm": 1.771265983581543, + "learning_rate": 0.00010056666666666667, + "loss": 2.5527, + "step": 40950 + }, + { + "epoch": 5.826346454455024, + "grad_norm": 1.7859134674072266, + "learning_rate": 0.0001000111111111111, + "loss": 2.5787, + "step": 41000 + }, + { + "epoch": 5.833451755009237, + "grad_norm": 1.8071932792663574, + "learning_rate": 9.945555555555557e-05, + "loss": 2.5371, + "step": 41050 + }, + { + "epoch": 5.84055705556345, + "grad_norm": 1.6997110843658447, + "learning_rate": 9.89e-05, + "loss": 2.5671, + "step": 41100 + }, + { + "epoch": 5.847662356117664, + "grad_norm": 1.623996615409851, + "learning_rate": 9.834444444444444e-05, + "loss": 2.5568, + "step": 41150 + }, + { + "epoch": 5.854767656671878, + "grad_norm": 1.720940351486206, + "learning_rate": 9.778888888888888e-05, + "loss": 2.5701, + "step": 41200 + }, + { + "epoch": 5.86187295722609, + "grad_norm": 1.5865821838378906, + "learning_rate": 9.723333333333334e-05, + "loss": 2.5884, + "step": 41250 + }, + { + "epoch": 5.868978257780304, + "grad_norm": 1.7232975959777832, + "learning_rate": 9.667777777777778e-05, + "loss": 2.5765, + "step": 41300 + }, + { + "epoch": 5.876083558334518, + "grad_norm": 1.879902958869934, + "learning_rate": 9.612222222222222e-05, + "loss": 2.5735, + "step": 41350 + }, + { + "epoch": 5.883188858888731, + "grad_norm": 1.7776896953582764, + "learning_rate": 9.556666666666667e-05, + "loss": 2.5701, + "step": 41400 + }, + { + "epoch": 5.890294159442944, + "grad_norm": 1.6992663145065308, + "learning_rate": 9.501111111111112e-05, + "loss": 2.5841, + "step": 41450 + }, + { + "epoch": 5.897399459997158, + "grad_norm": 1.8483794927597046, + "learning_rate": 9.445555555555556e-05, + "loss": 2.5563, + "step": 41500 + }, + { + "epoch": 5.904504760551371, + "grad_norm": 1.7901180982589722, + "learning_rate": 9.39e-05, + "loss": 2.5739, + "step": 41550 + }, + { + "epoch": 5.911610061105585, + "grad_norm": 1.8169100284576416, + "learning_rate": 9.334444444444444e-05, + "loss": 2.5597, + "step": 41600 + }, + { + "epoch": 5.918715361659798, + "grad_norm": 1.8494465351104736, + "learning_rate": 9.278888888888889e-05, + "loss": 2.5641, + "step": 41650 + }, + { + "epoch": 5.9258206622140115, + "grad_norm": 1.711912989616394, + "learning_rate": 9.223333333333333e-05, + "loss": 2.558, + "step": 41700 + }, + { + "epoch": 5.932925962768225, + "grad_norm": 1.789607048034668, + "learning_rate": 9.167777777777778e-05, + "loss": 2.5553, + "step": 41750 + }, + { + "epoch": 5.940031263322439, + "grad_norm": 1.6762250661849976, + "learning_rate": 9.112222222222222e-05, + "loss": 2.5429, + "step": 41800 + }, + { + "epoch": 5.9471365638766525, + "grad_norm": 1.8096888065338135, + "learning_rate": 9.056666666666667e-05, + "loss": 2.5565, + "step": 41850 + }, + { + "epoch": 5.954241864430865, + "grad_norm": 1.6990909576416016, + "learning_rate": 9.001111111111112e-05, + "loss": 2.5566, + "step": 41900 + }, + { + "epoch": 5.961347164985079, + "grad_norm": 1.7819483280181885, + "learning_rate": 8.945555555555556e-05, + "loss": 2.5609, + "step": 41950 + }, + { + "epoch": 5.9684524655392925, + "grad_norm": 1.7949304580688477, + "learning_rate": 8.89e-05, + "loss": 2.5517, + "step": 42000 + }, + { + "epoch": 5.9684524655392925, + "eval_accuracy": 0.5439066290855408, + "eval_loss": 2.5534567832946777, + "eval_runtime": 1.4415, + "eval_samples_per_second": 2608.377, + "eval_steps_per_second": 40.929, + "step": 42000 + }, + { + "epoch": 5.975557766093505, + "grad_norm": 1.8308558464050293, + "learning_rate": 8.834444444444444e-05, + "loss": 2.5442, + "step": 42050 + }, + { + "epoch": 5.982663066647719, + "grad_norm": 1.9043349027633667, + "learning_rate": 8.77888888888889e-05, + "loss": 2.5432, + "step": 42100 + }, + { + "epoch": 5.989768367201933, + "grad_norm": 1.8522542715072632, + "learning_rate": 8.723333333333333e-05, + "loss": 2.5459, + "step": 42150 + }, + { + "epoch": 5.996873667756146, + "grad_norm": 1.8050007820129395, + "learning_rate": 8.667777777777778e-05, + "loss": 2.5387, + "step": 42200 + }, + { + "epoch": 6.00397896831036, + "grad_norm": 1.7550387382507324, + "learning_rate": 8.612222222222223e-05, + "loss": 2.5406, + "step": 42250 + }, + { + "epoch": 6.011084268864573, + "grad_norm": 1.7701274156570435, + "learning_rate": 8.556666666666667e-05, + "loss": 2.5472, + "step": 42300 + }, + { + "epoch": 6.018189569418786, + "grad_norm": 1.6725575923919678, + "learning_rate": 8.50111111111111e-05, + "loss": 2.537, + "step": 42350 + }, + { + "epoch": 6.025294869973, + "grad_norm": 1.7032172679901123, + "learning_rate": 8.445555555555557e-05, + "loss": 2.5599, + "step": 42400 + }, + { + "epoch": 6.032400170527214, + "grad_norm": 1.8186277151107788, + "learning_rate": 8.39e-05, + "loss": 2.5486, + "step": 42450 + }, + { + "epoch": 6.039505471081426, + "grad_norm": 1.6973793506622314, + "learning_rate": 8.334444444444444e-05, + "loss": 2.5327, + "step": 42500 + }, + { + "epoch": 6.04661077163564, + "grad_norm": 1.7290997505187988, + "learning_rate": 8.278888888888888e-05, + "loss": 2.5486, + "step": 42550 + }, + { + "epoch": 6.053716072189854, + "grad_norm": 1.72194504737854, + "learning_rate": 8.223333333333334e-05, + "loss": 2.5563, + "step": 42600 + }, + { + "epoch": 6.060821372744067, + "grad_norm": 1.6357437372207642, + "learning_rate": 8.167777777777778e-05, + "loss": 2.5448, + "step": 42650 + }, + { + "epoch": 6.06792667329828, + "grad_norm": 1.7466579675674438, + "learning_rate": 8.112222222222222e-05, + "loss": 2.5516, + "step": 42700 + }, + { + "epoch": 6.075031973852494, + "grad_norm": 1.7971217632293701, + "learning_rate": 8.056666666666667e-05, + "loss": 2.5372, + "step": 42750 + }, + { + "epoch": 6.0821372744067075, + "grad_norm": 1.7370117902755737, + "learning_rate": 8.001111111111112e-05, + "loss": 2.5451, + "step": 42800 + }, + { + "epoch": 6.089242574960921, + "grad_norm": 1.9375853538513184, + "learning_rate": 7.945555555555556e-05, + "loss": 2.5472, + "step": 42850 + }, + { + "epoch": 6.096347875515134, + "grad_norm": 1.8141741752624512, + "learning_rate": 7.89e-05, + "loss": 2.5328, + "step": 42900 + }, + { + "epoch": 6.103453176069348, + "grad_norm": 1.9149723052978516, + "learning_rate": 7.834444444444444e-05, + "loss": 2.5162, + "step": 42950 + }, + { + "epoch": 6.110558476623561, + "grad_norm": 1.7717311382293701, + "learning_rate": 7.77888888888889e-05, + "loss": 2.5438, + "step": 43000 + }, + { + "epoch": 6.117663777177775, + "grad_norm": 1.7755837440490723, + "learning_rate": 7.723333333333333e-05, + "loss": 2.5316, + "step": 43050 + }, + { + "epoch": 6.124769077731988, + "grad_norm": 1.7218981981277466, + "learning_rate": 7.667777777777778e-05, + "loss": 2.5293, + "step": 43100 + }, + { + "epoch": 6.131874378286201, + "grad_norm": 1.824884295463562, + "learning_rate": 7.612222222222222e-05, + "loss": 2.5248, + "step": 43150 + }, + { + "epoch": 6.138979678840415, + "grad_norm": 1.7106846570968628, + "learning_rate": 7.556666666666667e-05, + "loss": 2.5321, + "step": 43200 + }, + { + "epoch": 6.146084979394629, + "grad_norm": 1.805311679840088, + "learning_rate": 7.501111111111112e-05, + "loss": 2.5245, + "step": 43250 + }, + { + "epoch": 6.153190279948841, + "grad_norm": 1.7311850786209106, + "learning_rate": 7.445555555555556e-05, + "loss": 2.5471, + "step": 43300 + }, + { + "epoch": 6.160295580503055, + "grad_norm": 1.7861510515213013, + "learning_rate": 7.39e-05, + "loss": 2.5501, + "step": 43350 + }, + { + "epoch": 6.167400881057269, + "grad_norm": 1.9243968725204468, + "learning_rate": 7.334444444444444e-05, + "loss": 2.5437, + "step": 43400 + }, + { + "epoch": 6.174506181611482, + "grad_norm": 1.706551432609558, + "learning_rate": 7.27888888888889e-05, + "loss": 2.535, + "step": 43450 + }, + { + "epoch": 6.181611482165696, + "grad_norm": 1.8230974674224854, + "learning_rate": 7.223333333333333e-05, + "loss": 2.5183, + "step": 43500 + }, + { + "epoch": 6.188716782719909, + "grad_norm": 1.8202252388000488, + "learning_rate": 7.167777777777778e-05, + "loss": 2.5224, + "step": 43550 + }, + { + "epoch": 6.195822083274122, + "grad_norm": 1.7891016006469727, + "learning_rate": 7.112222222222223e-05, + "loss": 2.5142, + "step": 43600 + }, + { + "epoch": 6.202927383828336, + "grad_norm": 1.6762447357177734, + "learning_rate": 7.056666666666667e-05, + "loss": 2.5286, + "step": 43650 + }, + { + "epoch": 6.21003268438255, + "grad_norm": 1.7952409982681274, + "learning_rate": 7.00111111111111e-05, + "loss": 2.5212, + "step": 43700 + }, + { + "epoch": 6.2171379849367625, + "grad_norm": 1.9008454084396362, + "learning_rate": 6.945555555555556e-05, + "loss": 2.5084, + "step": 43750 + }, + { + "epoch": 6.224243285490976, + "grad_norm": 1.9572248458862305, + "learning_rate": 6.890000000000001e-05, + "loss": 2.542, + "step": 43800 + }, + { + "epoch": 6.23134858604519, + "grad_norm": 1.8827017545700073, + "learning_rate": 6.834444444444444e-05, + "loss": 2.5489, + "step": 43850 + }, + { + "epoch": 6.2384538865994035, + "grad_norm": 1.8700237274169922, + "learning_rate": 6.778888888888888e-05, + "loss": 2.5443, + "step": 43900 + }, + { + "epoch": 6.245559187153616, + "grad_norm": 2.1006288528442383, + "learning_rate": 6.723333333333335e-05, + "loss": 2.5255, + "step": 43950 + }, + { + "epoch": 6.25266448770783, + "grad_norm": 1.7663155794143677, + "learning_rate": 6.667777777777778e-05, + "loss": 2.5377, + "step": 44000 + }, + { + "epoch": 6.25266448770783, + "eval_accuracy": 0.5429770946502686, + "eval_loss": 2.5491862297058105, + "eval_runtime": 1.3913, + "eval_samples_per_second": 2702.566, + "eval_steps_per_second": 42.407, + "step": 44000 + }, + { + "epoch": 6.2597697882620436, + "grad_norm": 1.747223973274231, + "learning_rate": 6.612222222222222e-05, + "loss": 2.5047, + "step": 44050 + }, + { + "epoch": 6.266875088816257, + "grad_norm": 1.837746500968933, + "learning_rate": 6.556666666666666e-05, + "loss": 2.5486, + "step": 44100 + }, + { + "epoch": 6.27398038937047, + "grad_norm": 1.8067456483840942, + "learning_rate": 6.501111111111112e-05, + "loss": 2.5246, + "step": 44150 + }, + { + "epoch": 6.281085689924684, + "grad_norm": 1.7894113063812256, + "learning_rate": 6.445555555555556e-05, + "loss": 2.5244, + "step": 44200 + }, + { + "epoch": 6.288190990478897, + "grad_norm": 1.8320538997650146, + "learning_rate": 6.39e-05, + "loss": 2.5486, + "step": 44250 + }, + { + "epoch": 6.295296291033111, + "grad_norm": 1.9445518255233765, + "learning_rate": 6.334444444444445e-05, + "loss": 2.5449, + "step": 44300 + }, + { + "epoch": 6.302401591587325, + "grad_norm": 1.8100392818450928, + "learning_rate": 6.27888888888889e-05, + "loss": 2.516, + "step": 44350 + }, + { + "epoch": 6.309506892141537, + "grad_norm": 1.7949355840682983, + "learning_rate": 6.223333333333333e-05, + "loss": 2.5361, + "step": 44400 + }, + { + "epoch": 6.316612192695751, + "grad_norm": 1.7824511528015137, + "learning_rate": 6.167777777777778e-05, + "loss": 2.5252, + "step": 44450 + }, + { + "epoch": 6.323717493249965, + "grad_norm": 1.8611977100372314, + "learning_rate": 6.112222222222222e-05, + "loss": 2.5231, + "step": 44500 + }, + { + "epoch": 6.3308227938041775, + "grad_norm": 1.7862845659255981, + "learning_rate": 6.0566666666666664e-05, + "loss": 2.5104, + "step": 44550 + }, + { + "epoch": 6.337928094358391, + "grad_norm": 1.7541511058807373, + "learning_rate": 6.0011111111111114e-05, + "loss": 2.511, + "step": 44600 + }, + { + "epoch": 6.345033394912605, + "grad_norm": 1.7871416807174683, + "learning_rate": 5.945555555555555e-05, + "loss": 2.5121, + "step": 44650 + }, + { + "epoch": 6.352138695466818, + "grad_norm": 1.8592162132263184, + "learning_rate": 5.89e-05, + "loss": 2.5114, + "step": 44700 + }, + { + "epoch": 6.359243996021032, + "grad_norm": 1.7054407596588135, + "learning_rate": 5.8344444444444446e-05, + "loss": 2.53, + "step": 44750 + }, + { + "epoch": 6.366349296575245, + "grad_norm": 1.788488745689392, + "learning_rate": 5.778888888888889e-05, + "loss": 2.5508, + "step": 44800 + }, + { + "epoch": 6.3734545971294585, + "grad_norm": 1.8061481714248657, + "learning_rate": 5.723333333333333e-05, + "loss": 2.5408, + "step": 44850 + }, + { + "epoch": 6.380559897683672, + "grad_norm": 1.7002582550048828, + "learning_rate": 5.667777777777778e-05, + "loss": 2.5298, + "step": 44900 + }, + { + "epoch": 6.387665198237886, + "grad_norm": 1.707021951675415, + "learning_rate": 5.612222222222222e-05, + "loss": 2.522, + "step": 44950 + }, + { + "epoch": 6.394770498792099, + "grad_norm": 1.855074405670166, + "learning_rate": 5.556666666666667e-05, + "loss": 2.5345, + "step": 45000 + }, + { + "epoch": 6.401875799346312, + "grad_norm": 1.866926670074463, + "learning_rate": 5.501111111111111e-05, + "loss": 2.5368, + "step": 45050 + }, + { + "epoch": 6.408981099900526, + "grad_norm": 1.7042498588562012, + "learning_rate": 5.445555555555556e-05, + "loss": 2.5188, + "step": 45100 + }, + { + "epoch": 6.4160864004547395, + "grad_norm": 1.8467471599578857, + "learning_rate": 5.39e-05, + "loss": 2.5368, + "step": 45150 + }, + { + "epoch": 6.423191701008952, + "grad_norm": 1.7979286909103394, + "learning_rate": 5.3344444444444446e-05, + "loss": 2.518, + "step": 45200 + }, + { + "epoch": 6.430297001563166, + "grad_norm": 1.772163987159729, + "learning_rate": 5.2788888888888897e-05, + "loss": 2.5179, + "step": 45250 + }, + { + "epoch": 6.43740230211738, + "grad_norm": 1.9672173261642456, + "learning_rate": 5.2233333333333334e-05, + "loss": 2.5017, + "step": 45300 + }, + { + "epoch": 6.444507602671593, + "grad_norm": 1.8143894672393799, + "learning_rate": 5.1677777777777784e-05, + "loss": 2.5096, + "step": 45350 + }, + { + "epoch": 6.451612903225806, + "grad_norm": 1.794852614402771, + "learning_rate": 5.112222222222222e-05, + "loss": 2.5268, + "step": 45400 + }, + { + "epoch": 6.45871820378002, + "grad_norm": 2.0142173767089844, + "learning_rate": 5.056666666666667e-05, + "loss": 2.523, + "step": 45450 + }, + { + "epoch": 6.465823504334233, + "grad_norm": 1.9065548181533813, + "learning_rate": 5.001111111111111e-05, + "loss": 2.5207, + "step": 45500 + }, + { + "epoch": 6.472928804888447, + "grad_norm": 1.9128227233886719, + "learning_rate": 4.945555555555556e-05, + "loss": 2.4899, + "step": 45550 + }, + { + "epoch": 6.480034105442661, + "grad_norm": 1.729750633239746, + "learning_rate": 4.89e-05, + "loss": 2.538, + "step": 45600 + }, + { + "epoch": 6.487139405996873, + "grad_norm": 1.7502912282943726, + "learning_rate": 4.8344444444444447e-05, + "loss": 2.5154, + "step": 45650 + }, + { + "epoch": 6.494244706551087, + "grad_norm": 1.7217427492141724, + "learning_rate": 4.778888888888889e-05, + "loss": 2.5228, + "step": 45700 + }, + { + "epoch": 6.501350007105301, + "grad_norm": 1.852655053138733, + "learning_rate": 4.7233333333333334e-05, + "loss": 2.5175, + "step": 45750 + }, + { + "epoch": 6.5084553076595135, + "grad_norm": 1.8434449434280396, + "learning_rate": 4.667777777777778e-05, + "loss": 2.5029, + "step": 45800 + }, + { + "epoch": 6.515560608213727, + "grad_norm": 1.6561399698257446, + "learning_rate": 4.612222222222223e-05, + "loss": 2.5213, + "step": 45850 + }, + { + "epoch": 6.522665908767941, + "grad_norm": 1.8549339771270752, + "learning_rate": 4.5566666666666665e-05, + "loss": 2.5187, + "step": 45900 + }, + { + "epoch": 6.5297712093221545, + "grad_norm": 1.7742987871170044, + "learning_rate": 4.5011111111111116e-05, + "loss": 2.5192, + "step": 45950 + }, + { + "epoch": 6.536876509876368, + "grad_norm": 1.772049903869629, + "learning_rate": 4.445555555555555e-05, + "loss": 2.5169, + "step": 46000 + }, + { + "epoch": 6.536876509876368, + "eval_accuracy": 0.5479844808578491, + "eval_loss": 2.523486614227295, + "eval_runtime": 1.4553, + "eval_samples_per_second": 2583.605, + "eval_steps_per_second": 40.541, + "step": 46000 + }, + { + "epoch": 6.543981810430581, + "grad_norm": 1.9687917232513428, + "learning_rate": 4.39e-05, + "loss": 2.5184, + "step": 46050 + }, + { + "epoch": 6.551087110984795, + "grad_norm": 1.6084198951721191, + "learning_rate": 4.334444444444444e-05, + "loss": 2.5259, + "step": 46100 + }, + { + "epoch": 6.558192411539008, + "grad_norm": 1.8368006944656372, + "learning_rate": 4.278888888888889e-05, + "loss": 2.5118, + "step": 46150 + }, + { + "epoch": 6.565297712093222, + "grad_norm": 1.8750320672988892, + "learning_rate": 4.2233333333333334e-05, + "loss": 2.5157, + "step": 46200 + }, + { + "epoch": 6.572403012647435, + "grad_norm": 1.7846745252609253, + "learning_rate": 4.167777777777778e-05, + "loss": 2.5076, + "step": 46250 + }, + { + "epoch": 6.579508313201648, + "grad_norm": 1.8737770318984985, + "learning_rate": 4.112222222222222e-05, + "loss": 2.5257, + "step": 46300 + }, + { + "epoch": 6.586613613755862, + "grad_norm": 1.870085597038269, + "learning_rate": 4.0566666666666666e-05, + "loss": 2.5151, + "step": 46350 + }, + { + "epoch": 6.593718914310076, + "grad_norm": 1.7165669202804565, + "learning_rate": 4.001111111111111e-05, + "loss": 2.5106, + "step": 46400 + }, + { + "epoch": 6.600824214864288, + "grad_norm": 2.036309242248535, + "learning_rate": 3.945555555555556e-05, + "loss": 2.5137, + "step": 46450 + }, + { + "epoch": 6.607929515418502, + "grad_norm": 2.068466901779175, + "learning_rate": 3.89e-05, + "loss": 2.5248, + "step": 46500 + }, + { + "epoch": 6.615034815972716, + "grad_norm": 1.8142298460006714, + "learning_rate": 3.834444444444445e-05, + "loss": 2.5043, + "step": 46550 + }, + { + "epoch": 6.622140116526929, + "grad_norm": 1.8824352025985718, + "learning_rate": 3.7788888888888884e-05, + "loss": 2.5186, + "step": 46600 + }, + { + "epoch": 6.629245417081142, + "grad_norm": 1.865993618965149, + "learning_rate": 3.7233333333333335e-05, + "loss": 2.5131, + "step": 46650 + }, + { + "epoch": 6.636350717635356, + "grad_norm": 1.7249752283096313, + "learning_rate": 3.667777777777777e-05, + "loss": 2.508, + "step": 46700 + }, + { + "epoch": 6.643456018189569, + "grad_norm": 1.8466631174087524, + "learning_rate": 3.612222222222222e-05, + "loss": 2.5059, + "step": 46750 + }, + { + "epoch": 6.650561318743783, + "grad_norm": 1.7760319709777832, + "learning_rate": 3.556666666666667e-05, + "loss": 2.5165, + "step": 46800 + }, + { + "epoch": 6.657666619297997, + "grad_norm": 1.7560021877288818, + "learning_rate": 3.501111111111111e-05, + "loss": 2.4964, + "step": 46850 + }, + { + "epoch": 6.6647719198522095, + "grad_norm": 1.7918329238891602, + "learning_rate": 3.445555555555556e-05, + "loss": 2.4828, + "step": 46900 + }, + { + "epoch": 6.671877220406423, + "grad_norm": 1.89692223072052, + "learning_rate": 3.39e-05, + "loss": 2.5003, + "step": 46950 + }, + { + "epoch": 6.678982520960637, + "grad_norm": 1.7761543989181519, + "learning_rate": 3.334444444444445e-05, + "loss": 2.4997, + "step": 47000 + }, + { + "epoch": 6.68608782151485, + "grad_norm": 1.837147831916809, + "learning_rate": 3.278888888888889e-05, + "loss": 2.4969, + "step": 47050 + }, + { + "epoch": 6.693193122069063, + "grad_norm": 1.7900179624557495, + "learning_rate": 3.2233333333333335e-05, + "loss": 2.5142, + "step": 47100 + }, + { + "epoch": 6.700298422623277, + "grad_norm": 1.8336573839187622, + "learning_rate": 3.167777777777778e-05, + "loss": 2.4998, + "step": 47150 + }, + { + "epoch": 6.707403723177491, + "grad_norm": 1.9420949220657349, + "learning_rate": 3.112222222222222e-05, + "loss": 2.4862, + "step": 47200 + }, + { + "epoch": 6.714509023731704, + "grad_norm": 1.8640563488006592, + "learning_rate": 3.0566666666666667e-05, + "loss": 2.4828, + "step": 47250 + }, + { + "epoch": 6.721614324285917, + "grad_norm": 1.7071493864059448, + "learning_rate": 3.001111111111111e-05, + "loss": 2.4917, + "step": 47300 + }, + { + "epoch": 6.728719624840131, + "grad_norm": 1.986015796661377, + "learning_rate": 2.9455555555555554e-05, + "loss": 2.5003, + "step": 47350 + }, + { + "epoch": 6.735824925394344, + "grad_norm": 1.985974907875061, + "learning_rate": 2.8899999999999998e-05, + "loss": 2.5244, + "step": 47400 + }, + { + "epoch": 6.742930225948558, + "grad_norm": 1.7473855018615723, + "learning_rate": 2.8344444444444445e-05, + "loss": 2.4982, + "step": 47450 + }, + { + "epoch": 6.750035526502771, + "grad_norm": 1.6116970777511597, + "learning_rate": 2.778888888888889e-05, + "loss": 2.4841, + "step": 47500 + }, + { + "epoch": 6.757140827056984, + "grad_norm": 1.7973964214324951, + "learning_rate": 2.7233333333333332e-05, + "loss": 2.5168, + "step": 47550 + }, + { + "epoch": 6.764246127611198, + "grad_norm": 1.7002062797546387, + "learning_rate": 2.667777777777778e-05, + "loss": 2.4778, + "step": 47600 + }, + { + "epoch": 6.771351428165412, + "grad_norm": 1.6538755893707275, + "learning_rate": 2.6122222222222223e-05, + "loss": 2.5103, + "step": 47650 + }, + { + "epoch": 6.7784567287196245, + "grad_norm": 1.9194822311401367, + "learning_rate": 2.5566666666666667e-05, + "loss": 2.5176, + "step": 47700 + }, + { + "epoch": 6.785562029273838, + "grad_norm": 1.9157203435897827, + "learning_rate": 2.5011111111111114e-05, + "loss": 2.4771, + "step": 47750 + }, + { + "epoch": 6.792667329828052, + "grad_norm": 1.752124547958374, + "learning_rate": 2.4455555555555558e-05, + "loss": 2.4962, + "step": 47800 + }, + { + "epoch": 6.799772630382265, + "grad_norm": 1.8622421026229858, + "learning_rate": 2.39e-05, + "loss": 2.5011, + "step": 47850 + }, + { + "epoch": 6.806877930936478, + "grad_norm": 1.9002290964126587, + "learning_rate": 2.3344444444444445e-05, + "loss": 2.501, + "step": 47900 + }, + { + "epoch": 6.813983231490692, + "grad_norm": 1.779293179512024, + "learning_rate": 2.278888888888889e-05, + "loss": 2.5198, + "step": 47950 + }, + { + "epoch": 6.8210885320449055, + "grad_norm": 1.7831950187683105, + "learning_rate": 2.2233333333333336e-05, + "loss": 2.5213, + "step": 48000 + }, + { + "epoch": 6.8210885320449055, + "eval_accuracy": 0.5504963397979736, + "eval_loss": 2.474808692932129, + "eval_runtime": 1.4017, + "eval_samples_per_second": 2682.379, + "eval_steps_per_second": 42.091, + "step": 48000 + }, + { + "epoch": 6.828193832599119, + "grad_norm": 1.9210829734802246, + "learning_rate": 2.167777777777778e-05, + "loss": 2.5188, + "step": 48050 + }, + { + "epoch": 6.835299133153333, + "grad_norm": 1.8451883792877197, + "learning_rate": 2.1122222222222224e-05, + "loss": 2.4747, + "step": 48100 + }, + { + "epoch": 6.842404433707546, + "grad_norm": 1.761411428451538, + "learning_rate": 2.0566666666666667e-05, + "loss": 2.5166, + "step": 48150 + }, + { + "epoch": 6.849509734261759, + "grad_norm": 1.6441208124160767, + "learning_rate": 2.001111111111111e-05, + "loss": 2.5105, + "step": 48200 + }, + { + "epoch": 6.856615034815973, + "grad_norm": 1.8028146028518677, + "learning_rate": 1.9455555555555555e-05, + "loss": 2.507, + "step": 48250 + }, + { + "epoch": 6.863720335370186, + "grad_norm": 1.8731715679168701, + "learning_rate": 1.8900000000000002e-05, + "loss": 2.5134, + "step": 48300 + }, + { + "epoch": 6.870825635924399, + "grad_norm": 1.891373634338379, + "learning_rate": 1.8344444444444446e-05, + "loss": 2.498, + "step": 48350 + }, + { + "epoch": 6.877930936478613, + "grad_norm": 1.762410044670105, + "learning_rate": 1.778888888888889e-05, + "loss": 2.5021, + "step": 48400 + }, + { + "epoch": 6.885036237032827, + "grad_norm": 1.608912706375122, + "learning_rate": 1.7233333333333333e-05, + "loss": 2.4846, + "step": 48450 + }, + { + "epoch": 6.89214153758704, + "grad_norm": 1.6867483854293823, + "learning_rate": 1.6677777777777777e-05, + "loss": 2.5007, + "step": 48500 + }, + { + "epoch": 6.899246838141253, + "grad_norm": 1.753810167312622, + "learning_rate": 1.612222222222222e-05, + "loss": 2.4897, + "step": 48550 + }, + { + "epoch": 6.906352138695467, + "grad_norm": 1.7778904438018799, + "learning_rate": 1.5566666666666668e-05, + "loss": 2.516, + "step": 48600 + }, + { + "epoch": 6.91345743924968, + "grad_norm": 1.8375523090362549, + "learning_rate": 1.5011111111111112e-05, + "loss": 2.4946, + "step": 48650 + }, + { + "epoch": 6.920562739803894, + "grad_norm": 1.7306774854660034, + "learning_rate": 1.4455555555555555e-05, + "loss": 2.4534, + "step": 48700 + }, + { + "epoch": 6.927668040358107, + "grad_norm": 1.8564552068710327, + "learning_rate": 1.3899999999999999e-05, + "loss": 2.494, + "step": 48750 + }, + { + "epoch": 6.9347733409123204, + "grad_norm": 1.687759518623352, + "learning_rate": 1.3344444444444446e-05, + "loss": 2.5194, + "step": 48800 + }, + { + "epoch": 6.941878641466534, + "grad_norm": 1.7956315279006958, + "learning_rate": 1.278888888888889e-05, + "loss": 2.4794, + "step": 48850 + }, + { + "epoch": 6.948983942020748, + "grad_norm": 1.8576797246932983, + "learning_rate": 1.2233333333333334e-05, + "loss": 2.4653, + "step": 48900 + }, + { + "epoch": 6.956089242574961, + "grad_norm": 1.7479002475738525, + "learning_rate": 1.1677777777777779e-05, + "loss": 2.5062, + "step": 48950 + }, + { + "epoch": 6.963194543129174, + "grad_norm": 1.786080002784729, + "learning_rate": 1.1122222222222223e-05, + "loss": 2.4969, + "step": 49000 + }, + { + "epoch": 6.970299843683388, + "grad_norm": 1.8831062316894531, + "learning_rate": 1.0566666666666667e-05, + "loss": 2.4833, + "step": 49050 + }, + { + "epoch": 6.9774051442376015, + "grad_norm": 1.7100664377212524, + "learning_rate": 1.0011111111111112e-05, + "loss": 2.5084, + "step": 49100 + }, + { + "epoch": 6.984510444791814, + "grad_norm": 1.8145115375518799, + "learning_rate": 9.455555555555556e-06, + "loss": 2.5161, + "step": 49150 + }, + { + "epoch": 6.991615745346028, + "grad_norm": 1.7452220916748047, + "learning_rate": 8.9e-06, + "loss": 2.5122, + "step": 49200 + }, + { + "epoch": 6.998721045900242, + "grad_norm": 1.7257003784179688, + "learning_rate": 8.344444444444445e-06, + "loss": 2.507, + "step": 49250 + }, + { + "epoch": 7.005826346454455, + "grad_norm": 1.9227982759475708, + "learning_rate": 7.788888888888889e-06, + "loss": 2.5138, + "step": 49300 + }, + { + "epoch": 7.012931647008669, + "grad_norm": 1.8527491092681885, + "learning_rate": 7.233333333333333e-06, + "loss": 2.4882, + "step": 49350 + }, + { + "epoch": 7.020036947562882, + "grad_norm": 1.9063125848770142, + "learning_rate": 6.677777777777779e-06, + "loss": 2.4867, + "step": 49400 + }, + { + "epoch": 7.027142248117095, + "grad_norm": 1.886391520500183, + "learning_rate": 6.1222222222222224e-06, + "loss": 2.4559, + "step": 49450 + }, + { + "epoch": 7.034247548671309, + "grad_norm": 1.8591769933700562, + "learning_rate": 5.566666666666667e-06, + "loss": 2.487, + "step": 49500 + }, + { + "epoch": 7.041352849225523, + "grad_norm": 1.9463618993759155, + "learning_rate": 5.011111111111112e-06, + "loss": 2.4784, + "step": 49550 + }, + { + "epoch": 7.048458149779735, + "grad_norm": 1.8545664548873901, + "learning_rate": 4.455555555555555e-06, + "loss": 2.4978, + "step": 49600 + }, + { + "epoch": 7.055563450333949, + "grad_norm": 1.77763831615448, + "learning_rate": 3.9e-06, + "loss": 2.4797, + "step": 49650 + }, + { + "epoch": 7.062668750888163, + "grad_norm": 2.1448755264282227, + "learning_rate": 3.3444444444444445e-06, + "loss": 2.5023, + "step": 49700 + }, + { + "epoch": 7.069774051442376, + "grad_norm": 1.895645022392273, + "learning_rate": 2.788888888888889e-06, + "loss": 2.4801, + "step": 49750 + }, + { + "epoch": 7.076879351996589, + "grad_norm": 1.6657183170318604, + "learning_rate": 2.2333333333333333e-06, + "loss": 2.4922, + "step": 49800 + }, + { + "epoch": 7.083984652550803, + "grad_norm": 1.82796311378479, + "learning_rate": 1.6777777777777779e-06, + "loss": 2.475, + "step": 49850 + }, + { + "epoch": 7.091089953105016, + "grad_norm": 1.904001235961914, + "learning_rate": 1.1222222222222222e-06, + "loss": 2.4825, + "step": 49900 + }, + { + "epoch": 7.09819525365923, + "grad_norm": 1.7330507040023804, + "learning_rate": 5.666666666666667e-07, + "loss": 2.4798, + "step": 49950 + }, + { + "epoch": 7.105300554213443, + "grad_norm": 1.712110161781311, + "learning_rate": 1.1111111111111112e-08, + "loss": 2.4916, + "step": 50000 + }, + { + "epoch": 7.105300554213443, + "eval_accuracy": 0.5550713539123535, + "eval_loss": 2.4701426029205322, + "eval_runtime": 1.4603, + "eval_samples_per_second": 2574.769, + "eval_steps_per_second": 40.402, + "step": 50000 + } + ], + "logging_steps": 50, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.380157104452403e+16, + "train_batch_size": 256, + "trial_name": null, + "trial_params": null +}