| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.962183374955405, |
| "eval_steps": 200, |
| "global_step": 5500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0035676061362825543, |
| "grad_norm": 3.328125, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 1.6871, |
| "mean_token_accuracy": 0.6493056863546371, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.007135212272565109, |
| "grad_norm": 3.703125, |
| "learning_rate": 6.333333333333334e-07, |
| "loss": 1.6679, |
| "mean_token_accuracy": 0.6548885881900788, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.010702818408847663, |
| "grad_norm": 3.34375, |
| "learning_rate": 9.666666666666668e-07, |
| "loss": 1.6503, |
| "mean_token_accuracy": 0.6580708235502243, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.014270424545130217, |
| "grad_norm": 3.03125, |
| "learning_rate": 1.3e-06, |
| "loss": 1.6875, |
| "mean_token_accuracy": 0.6484389722347259, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.01783803068141277, |
| "grad_norm": 2.875, |
| "learning_rate": 1.6333333333333335e-06, |
| "loss": 1.611, |
| "mean_token_accuracy": 0.6590644121170044, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.021405636817695327, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.9666666666666668e-06, |
| "loss": 1.6627, |
| "mean_token_accuracy": 0.6492084830999374, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.024973242953977882, |
| "grad_norm": 2.453125, |
| "learning_rate": 2.3000000000000004e-06, |
| "loss": 1.5776, |
| "mean_token_accuracy": 0.6581329733133316, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.028540849090260435, |
| "grad_norm": 2.3125, |
| "learning_rate": 2.6333333333333332e-06, |
| "loss": 1.5193, |
| "mean_token_accuracy": 0.663571435213089, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.03210845522654299, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.9666666666666673e-06, |
| "loss": 1.5103, |
| "mean_token_accuracy": 0.6620537608861923, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.03567606136282554, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.3000000000000006e-06, |
| "loss": 1.398, |
| "mean_token_accuracy": 0.6780021399259567, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0392436674991081, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.633333333333334e-06, |
| "loss": 1.4068, |
| "mean_token_accuracy": 0.6717195540666581, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.042811273635390654, |
| "grad_norm": 0.953125, |
| "learning_rate": 3.966666666666667e-06, |
| "loss": 1.3377, |
| "mean_token_accuracy": 0.6818253934383393, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.04637887977167321, |
| "grad_norm": 0.66015625, |
| "learning_rate": 4.3e-06, |
| "loss": 1.3095, |
| "mean_token_accuracy": 0.6851063162088394, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.049946485907955765, |
| "grad_norm": 0.65234375, |
| "learning_rate": 4.633333333333334e-06, |
| "loss": 1.239, |
| "mean_token_accuracy": 0.699410155415535, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.053514092044238314, |
| "grad_norm": 0.80859375, |
| "learning_rate": 4.966666666666667e-06, |
| "loss": 1.2225, |
| "mean_token_accuracy": 0.7006685018539429, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05708169818052087, |
| "grad_norm": 0.5546875, |
| "learning_rate": 5.300000000000001e-06, |
| "loss": 1.2636, |
| "mean_token_accuracy": 0.6928021967411041, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.060649304316803425, |
| "grad_norm": 0.5234375, |
| "learning_rate": 5.633333333333334e-06, |
| "loss": 1.185, |
| "mean_token_accuracy": 0.7068213105201722, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.06421691045308597, |
| "grad_norm": 0.55859375, |
| "learning_rate": 5.966666666666667e-06, |
| "loss": 1.2134, |
| "mean_token_accuracy": 0.7014941990375518, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.06778451658936853, |
| "grad_norm": 0.490234375, |
| "learning_rate": 6.300000000000001e-06, |
| "loss": 1.279, |
| "mean_token_accuracy": 0.6920940101146698, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.07135212272565108, |
| "grad_norm": 0.474609375, |
| "learning_rate": 6.633333333333334e-06, |
| "loss": 1.2059, |
| "mean_token_accuracy": 0.7041285097599029, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07135212272565108, |
| "eval_loss": 1.2062970399856567, |
| "eval_mean_token_accuracy": 0.7016672051142132, |
| "eval_runtime": 129.7363, |
| "eval_samples_per_second": 7.284, |
| "eval_steps_per_second": 7.284, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07491972886193364, |
| "grad_norm": 0.51171875, |
| "learning_rate": 6.966666666666667e-06, |
| "loss": 1.2212, |
| "mean_token_accuracy": 0.6997595220804215, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0784873349982162, |
| "grad_norm": 0.4609375, |
| "learning_rate": 7.3e-06, |
| "loss": 1.2062, |
| "mean_token_accuracy": 0.7019762545824051, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.08205494113449875, |
| "grad_norm": 0.51953125, |
| "learning_rate": 7.633333333333334e-06, |
| "loss": 1.1967, |
| "mean_token_accuracy": 0.7059087961912155, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.08562254727078131, |
| "grad_norm": 0.51953125, |
| "learning_rate": 7.966666666666668e-06, |
| "loss": 1.2293, |
| "mean_token_accuracy": 0.6980852127075196, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.08919015340706386, |
| "grad_norm": 0.447265625, |
| "learning_rate": 8.3e-06, |
| "loss": 1.1805, |
| "mean_token_accuracy": 0.707709014415741, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.09275775954334642, |
| "grad_norm": 0.4375, |
| "learning_rate": 8.633333333333334e-06, |
| "loss": 1.2241, |
| "mean_token_accuracy": 0.6983526796102524, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.09632536567962897, |
| "grad_norm": 0.423828125, |
| "learning_rate": 8.966666666666667e-06, |
| "loss": 1.1792, |
| "mean_token_accuracy": 0.7078437715768814, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.09989297181591153, |
| "grad_norm": 0.447265625, |
| "learning_rate": 9.3e-06, |
| "loss": 1.2231, |
| "mean_token_accuracy": 0.6988429129123688, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.10346057795219407, |
| "grad_norm": 0.451171875, |
| "learning_rate": 9.633333333333335e-06, |
| "loss": 1.186, |
| "mean_token_accuracy": 0.7060195624828338, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.10702818408847663, |
| "grad_norm": 0.470703125, |
| "learning_rate": 9.966666666666667e-06, |
| "loss": 1.1457, |
| "mean_token_accuracy": 0.7139957278966904, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.11059579022475918, |
| "grad_norm": 0.470703125, |
| "learning_rate": 9.999938485971279e-06, |
| "loss": 1.1713, |
| "mean_token_accuracy": 0.7086452007293701, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.11416339636104174, |
| "grad_norm": 0.408203125, |
| "learning_rate": 9.999725846827562e-06, |
| "loss": 1.1357, |
| "mean_token_accuracy": 0.7179197162389755, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.1177310024973243, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.999361329594255e-06, |
| "loss": 1.1963, |
| "mean_token_accuracy": 0.7035378068685532, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.12129860863360685, |
| "grad_norm": 0.466796875, |
| "learning_rate": 9.998844945344404e-06, |
| "loss": 1.1991, |
| "mean_token_accuracy": 0.7027044087648392, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.1248662147698894, |
| "grad_norm": 0.4375, |
| "learning_rate": 9.99817670976436e-06, |
| "loss": 1.2241, |
| "mean_token_accuracy": 0.697895684838295, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.12843382090617195, |
| "grad_norm": 0.416015625, |
| "learning_rate": 9.997356643153303e-06, |
| "loss": 1.153, |
| "mean_token_accuracy": 0.7134905397891999, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.13200142704245452, |
| "grad_norm": 0.431640625, |
| "learning_rate": 9.99638477042263e-06, |
| "loss": 1.1724, |
| "mean_token_accuracy": 0.707893255352974, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.13556903317873706, |
| "grad_norm": 0.4296875, |
| "learning_rate": 9.995261121095194e-06, |
| "loss": 1.141, |
| "mean_token_accuracy": 0.7150793671607971, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.13913663931501963, |
| "grad_norm": 0.423828125, |
| "learning_rate": 9.993985729304409e-06, |
| "loss": 1.1605, |
| "mean_token_accuracy": 0.7104953229427338, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.14270424545130217, |
| "grad_norm": 0.44140625, |
| "learning_rate": 9.992558633793212e-06, |
| "loss": 1.218, |
| "mean_token_accuracy": 0.6982508540153504, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.14270424545130217, |
| "eval_loss": 1.1587024927139282, |
| "eval_mean_token_accuracy": 0.7100714037658046, |
| "eval_runtime": 129.716, |
| "eval_samples_per_second": 7.285, |
| "eval_steps_per_second": 7.285, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.14627185158758474, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.990979877912893e-06, |
| "loss": 1.1043, |
| "mean_token_accuracy": 0.7205588966608047, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.14983945772386728, |
| "grad_norm": 0.4140625, |
| "learning_rate": 9.989249509621759e-06, |
| "loss": 1.1833, |
| "mean_token_accuracy": 0.7077071905136109, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.15340706386014985, |
| "grad_norm": 0.4296875, |
| "learning_rate": 9.987367581483705e-06, |
| "loss": 1.1747, |
| "mean_token_accuracy": 0.7064232885837555, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.1569746699964324, |
| "grad_norm": 0.39453125, |
| "learning_rate": 9.985334150666592e-06, |
| "loss": 1.2035, |
| "mean_token_accuracy": 0.7038203418254853, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.16054227613271496, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.983149278940526e-06, |
| "loss": 1.1481, |
| "mean_token_accuracy": 0.7118978053331375, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1641098822689975, |
| "grad_norm": 0.41796875, |
| "learning_rate": 9.980813032675975e-06, |
| "loss": 1.1586, |
| "mean_token_accuracy": 0.7104112029075622, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.16767748840528005, |
| "grad_norm": 0.404296875, |
| "learning_rate": 9.978325482841752e-06, |
| "loss": 1.1386, |
| "mean_token_accuracy": 0.7149664223194122, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.17124509454156261, |
| "grad_norm": 0.578125, |
| "learning_rate": 9.975686705002868e-06, |
| "loss": 1.1623, |
| "mean_token_accuracy": 0.7120841652154922, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.17481270067784516, |
| "grad_norm": 0.408203125, |
| "learning_rate": 9.97289677931822e-06, |
| "loss": 1.1351, |
| "mean_token_accuracy": 0.7119597673416138, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.17838030681412773, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.969955790538175e-06, |
| "loss": 1.1403, |
| "mean_token_accuracy": 0.7143330305814743, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.18194791295041027, |
| "grad_norm": 0.396484375, |
| "learning_rate": 9.966863828001982e-06, |
| "loss": 1.1655, |
| "mean_token_accuracy": 0.7093423008918762, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.18551551908669284, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.963620985635065e-06, |
| "loss": 1.1989, |
| "mean_token_accuracy": 0.7034634321928024, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.18908312522297538, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.960227361946164e-06, |
| "loss": 1.1878, |
| "mean_token_accuracy": 0.7056231737136841, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.19265073135925795, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.95668306002435e-06, |
| "loss": 1.1426, |
| "mean_token_accuracy": 0.7126373589038849, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.1962183374955405, |
| "grad_norm": 0.4375, |
| "learning_rate": 9.952988187535886e-06, |
| "loss": 1.1426, |
| "mean_token_accuracy": 0.7135271608829499, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.19978594363182306, |
| "grad_norm": 0.396484375, |
| "learning_rate": 9.949142856720962e-06, |
| "loss": 1.1458, |
| "mean_token_accuracy": 0.7125060319900512, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.2033535497681056, |
| "grad_norm": 0.4296875, |
| "learning_rate": 9.94514718439028e-06, |
| "loss": 1.1709, |
| "mean_token_accuracy": 0.7099435269832611, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.20692115590438814, |
| "grad_norm": 0.45703125, |
| "learning_rate": 9.941001291921512e-06, |
| "loss": 1.1607, |
| "mean_token_accuracy": 0.7122893363237381, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.2104887620406707, |
| "grad_norm": 0.390625, |
| "learning_rate": 9.936705305255614e-06, |
| "loss": 1.158, |
| "mean_token_accuracy": 0.7115995079278946, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.21405636817695325, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.932259354892984e-06, |
| "loss": 1.1663, |
| "mean_token_accuracy": 0.7099356561899185, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.21405636817695325, |
| "eval_loss": 1.1424596309661865, |
| "eval_mean_token_accuracy": 0.7128433112114195, |
| "eval_runtime": 130.0274, |
| "eval_samples_per_second": 7.268, |
| "eval_steps_per_second": 7.268, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.21762397431323582, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.92766357588952e-06, |
| "loss": 1.1284, |
| "mean_token_accuracy": 0.716127896308899, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.22119158044951837, |
| "grad_norm": 0.41796875, |
| "learning_rate": 9.922918107852504e-06, |
| "loss": 1.1973, |
| "mean_token_accuracy": 0.7034493207931518, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.22475918658580094, |
| "grad_norm": 0.4375, |
| "learning_rate": 9.918023094936364e-06, |
| "loss": 1.1346, |
| "mean_token_accuracy": 0.7147135883569717, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.22832679272208348, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.912978685838294e-06, |
| "loss": 1.1396, |
| "mean_token_accuracy": 0.7132570236921311, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.23189439885836605, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.90778503379374e-06, |
| "loss": 1.1593, |
| "mean_token_accuracy": 0.7094680279493332, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.2354620049946486, |
| "grad_norm": 0.43359375, |
| "learning_rate": 9.902442296571744e-06, |
| "loss": 1.1453, |
| "mean_token_accuracy": 0.7149360835552215, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.23902961113093116, |
| "grad_norm": 0.447265625, |
| "learning_rate": 9.896950636470147e-06, |
| "loss": 1.1577, |
| "mean_token_accuracy": 0.7109050691127777, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.2425972172672137, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.891310220310668e-06, |
| "loss": 1.1138, |
| "mean_token_accuracy": 0.7179471969604492, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.24616482340349624, |
| "grad_norm": 0.43359375, |
| "learning_rate": 9.885521219433824e-06, |
| "loss": 1.1285, |
| "mean_token_accuracy": 0.7142092436552048, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.2497324295397788, |
| "grad_norm": 0.4140625, |
| "learning_rate": 9.879583809693737e-06, |
| "loss": 1.1468, |
| "mean_token_accuracy": 0.7132605969905853, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.25330003567606135, |
| "grad_norm": 0.447265625, |
| "learning_rate": 9.873498171452788e-06, |
| "loss": 1.0896, |
| "mean_token_accuracy": 0.7208470612764358, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.2568676418123439, |
| "grad_norm": 0.41015625, |
| "learning_rate": 9.867264489576135e-06, |
| "loss": 1.162, |
| "mean_token_accuracy": 0.7106604993343353, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2604352479486265, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.8608829534261e-06, |
| "loss": 1.093, |
| "mean_token_accuracy": 0.7230006068944931, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.26400285408490903, |
| "grad_norm": 0.423828125, |
| "learning_rate": 9.854353756856413e-06, |
| "loss": 1.1405, |
| "mean_token_accuracy": 0.7145604640245438, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.2675704602211916, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.847677098206332e-06, |
| "loss": 1.1675, |
| "mean_token_accuracy": 0.7108287513256073, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2711380663574741, |
| "grad_norm": 0.4453125, |
| "learning_rate": 9.840853180294609e-06, |
| "loss": 1.1449, |
| "mean_token_accuracy": 0.7134768009185791, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.2747056724937567, |
| "grad_norm": 0.419921875, |
| "learning_rate": 9.833882210413333e-06, |
| "loss": 1.1526, |
| "mean_token_accuracy": 0.7124755054712295, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.27827327863003926, |
| "grad_norm": 0.404296875, |
| "learning_rate": 9.826764400321634e-06, |
| "loss": 1.1413, |
| "mean_token_accuracy": 0.7144547283649445, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.2818408847663218, |
| "grad_norm": 0.4375, |
| "learning_rate": 9.819499966239243e-06, |
| "loss": 1.1208, |
| "mean_token_accuracy": 0.7170589029788971, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.28540849090260434, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.812089128839939e-06, |
| "loss": 1.1465, |
| "mean_token_accuracy": 0.7129792451858521, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.28540849090260434, |
| "eval_loss": 1.1330844163894653, |
| "eval_mean_token_accuracy": 0.7144628852132767, |
| "eval_runtime": 130.1009, |
| "eval_samples_per_second": 7.264, |
| "eval_steps_per_second": 7.264, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.2889760970388869, |
| "grad_norm": 0.408203125, |
| "learning_rate": 9.804532113244829e-06, |
| "loss": 1.1472, |
| "mean_token_accuracy": 0.712533500790596, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.2925437031751695, |
| "grad_norm": 0.443359375, |
| "learning_rate": 9.796829149015517e-06, |
| "loss": 1.1492, |
| "mean_token_accuracy": 0.7120617270469666, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.296111309311452, |
| "grad_norm": 0.408203125, |
| "learning_rate": 9.788980470147132e-06, |
| "loss": 1.1661, |
| "mean_token_accuracy": 0.7102774143218994, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.29967891544773456, |
| "grad_norm": 0.423828125, |
| "learning_rate": 9.780986315061218e-06, |
| "loss": 1.1302, |
| "mean_token_accuracy": 0.7161981076002121, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.3032465215840171, |
| "grad_norm": 0.408203125, |
| "learning_rate": 9.772846926598492e-06, |
| "loss": 1.1356, |
| "mean_token_accuracy": 0.7167322486639023, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.3068141277202997, |
| "grad_norm": 0.478515625, |
| "learning_rate": 9.76456255201146e-06, |
| "loss": 1.0996, |
| "mean_token_accuracy": 0.7234615385532379, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.31038173385658224, |
| "grad_norm": 0.423828125, |
| "learning_rate": 9.756133442956923e-06, |
| "loss": 1.1543, |
| "mean_token_accuracy": 0.7091817498207093, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.3139493399928648, |
| "grad_norm": 0.384765625, |
| "learning_rate": 9.747559855488314e-06, |
| "loss": 1.1344, |
| "mean_token_accuracy": 0.7144211947917938, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.3175169461291473, |
| "grad_norm": 0.4140625, |
| "learning_rate": 9.73884205004793e-06, |
| "loss": 1.1804, |
| "mean_token_accuracy": 0.7077479273080826, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.3210845522654299, |
| "grad_norm": 0.392578125, |
| "learning_rate": 9.72998029145902e-06, |
| "loss": 1.1579, |
| "mean_token_accuracy": 0.7100216567516326, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.32465215840171247, |
| "grad_norm": 0.4296875, |
| "learning_rate": 9.720974848917734e-06, |
| "loss": 1.1425, |
| "mean_token_accuracy": 0.7118788123130798, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.328219764537995, |
| "grad_norm": 0.41796875, |
| "learning_rate": 9.711825995984957e-06, |
| "loss": 1.1004, |
| "mean_token_accuracy": 0.7200456827878952, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.33178737067427755, |
| "grad_norm": 0.43359375, |
| "learning_rate": 9.70253401057799e-06, |
| "loss": 1.1737, |
| "mean_token_accuracy": 0.7067985355854034, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.3353549768105601, |
| "grad_norm": 0.41015625, |
| "learning_rate": 9.693099174962103e-06, |
| "loss": 1.1259, |
| "mean_token_accuracy": 0.7149448841810226, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.3389225829468427, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.683521775741976e-06, |
| "loss": 1.1501, |
| "mean_token_accuracy": 0.71326465010643, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.34249018908312523, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.67380210385298e-06, |
| "loss": 1.1507, |
| "mean_token_accuracy": 0.711698266863823, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.34605779521940777, |
| "grad_norm": 0.3984375, |
| "learning_rate": 9.663940454552342e-06, |
| "loss": 1.162, |
| "mean_token_accuracy": 0.7098847538232803, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.3496254013556903, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.65393712741018e-06, |
| "loss": 1.1452, |
| "mean_token_accuracy": 0.7119550198316574, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.3531930074919729, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.6437924263004e-06, |
| "loss": 1.085, |
| "mean_token_accuracy": 0.7231919884681701, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.35676061362825545, |
| "grad_norm": 0.416015625, |
| "learning_rate": 9.633506659391461e-06, |
| "loss": 1.179, |
| "mean_token_accuracy": 0.7087622046470642, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.35676061362825545, |
| "eval_loss": 1.1262521743774414, |
| "eval_mean_token_accuracy": 0.7156485801020628, |
| "eval_runtime": 129.9397, |
| "eval_samples_per_second": 7.273, |
| "eval_steps_per_second": 7.273, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.360328219764538, |
| "grad_norm": 0.4296875, |
| "learning_rate": 9.623080139137023e-06, |
| "loss": 1.1494, |
| "mean_token_accuracy": 0.7136828035116196, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.36389582590082054, |
| "grad_norm": 0.400390625, |
| "learning_rate": 9.612513182266447e-06, |
| "loss": 1.1174, |
| "mean_token_accuracy": 0.7193785756826401, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.3674634320371031, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.601806109775178e-06, |
| "loss": 1.0843, |
| "mean_token_accuracy": 0.7244871765375137, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.3710310381733857, |
| "grad_norm": 0.451171875, |
| "learning_rate": 9.590959246914995e-06, |
| "loss": 1.1474, |
| "mean_token_accuracy": 0.7139957219362258, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.3745986443096682, |
| "grad_norm": 0.3984375, |
| "learning_rate": 9.579972923184123e-06, |
| "loss": 1.1285, |
| "mean_token_accuracy": 0.7164849281311035, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.37816625044595076, |
| "grad_norm": 0.41796875, |
| "learning_rate": 9.568847472317232e-06, |
| "loss": 1.1582, |
| "mean_token_accuracy": 0.7093528658151627, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3817338565822333, |
| "grad_norm": 0.41015625, |
| "learning_rate": 9.557583232275303e-06, |
| "loss": 1.1206, |
| "mean_token_accuracy": 0.7188614189624787, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.3853014627185159, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.546180545235344e-06, |
| "loss": 1.1307, |
| "mean_token_accuracy": 0.715569081902504, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.38886906885479844, |
| "grad_norm": 0.419921875, |
| "learning_rate": 9.534639757580014e-06, |
| "loss": 1.1597, |
| "mean_token_accuracy": 0.7111599564552307, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.392436674991081, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.522961219887092e-06, |
| "loss": 1.123, |
| "mean_token_accuracy": 0.7160988986492157, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3960042811273635, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.511145286918829e-06, |
| "loss": 1.1452, |
| "mean_token_accuracy": 0.7111187487840652, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.3995718872636461, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.499192317611169e-06, |
| "loss": 1.1138, |
| "mean_token_accuracy": 0.7196794927120209, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.40313949339992866, |
| "grad_norm": 0.4296875, |
| "learning_rate": 9.487102675062851e-06, |
| "loss": 1.1225, |
| "mean_token_accuracy": 0.7172648102045059, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.4067070995362112, |
| "grad_norm": 0.3984375, |
| "learning_rate": 9.474876726524375e-06, |
| "loss": 1.1446, |
| "mean_token_accuracy": 0.7108858436346054, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.41027470567249374, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.462514843386846e-06, |
| "loss": 1.1113, |
| "mean_token_accuracy": 0.7194084107875824, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.4138423118087763, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.45001740117069e-06, |
| "loss": 1.1038, |
| "mean_token_accuracy": 0.7193791836500167, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.4174099179450589, |
| "grad_norm": 0.416015625, |
| "learning_rate": 9.437384779514255e-06, |
| "loss": 1.1063, |
| "mean_token_accuracy": 0.7218341737985611, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.4209775240813414, |
| "grad_norm": 0.439453125, |
| "learning_rate": 9.424617362162272e-06, |
| "loss": 1.1003, |
| "mean_token_accuracy": 0.720566239953041, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.42454513021762397, |
| "grad_norm": 0.3984375, |
| "learning_rate": 9.411715536954195e-06, |
| "loss": 1.1184, |
| "mean_token_accuracy": 0.7180521368980408, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.4281127363539065, |
| "grad_norm": 0.400390625, |
| "learning_rate": 9.398679695812429e-06, |
| "loss": 1.123, |
| "mean_token_accuracy": 0.7158424884080887, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4281127363539065, |
| "eval_loss": 1.1210980415344238, |
| "eval_mean_token_accuracy": 0.7164756011079859, |
| "eval_runtime": 129.7635, |
| "eval_samples_per_second": 7.282, |
| "eval_steps_per_second": 7.282, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4316803424901891, |
| "grad_norm": 0.408203125, |
| "learning_rate": 9.385510234730415e-06, |
| "loss": 1.1279, |
| "mean_token_accuracy": 0.7153876632452011, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.43524794862647165, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.372207553760604e-06, |
| "loss": 1.0931, |
| "mean_token_accuracy": 0.7221202790737152, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.4388155547627542, |
| "grad_norm": 0.392578125, |
| "learning_rate": 9.358772057002311e-06, |
| "loss": 1.1358, |
| "mean_token_accuracy": 0.7130582332611084, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.44238316089903673, |
| "grad_norm": 0.447265625, |
| "learning_rate": 9.345204152589429e-06, |
| "loss": 1.1416, |
| "mean_token_accuracy": 0.713940778374672, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.4459507670353193, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.331504252678036e-06, |
| "loss": 1.0999, |
| "mean_token_accuracy": 0.722961962223053, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.44951837317160187, |
| "grad_norm": 0.43359375, |
| "learning_rate": 9.317672773433877e-06, |
| "loss": 1.1298, |
| "mean_token_accuracy": 0.7171943247318268, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.4530859793078844, |
| "grad_norm": 0.4140625, |
| "learning_rate": 9.30371013501972e-06, |
| "loss": 1.1318, |
| "mean_token_accuracy": 0.7135118931531906, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.45665358544416695, |
| "grad_norm": 0.392578125, |
| "learning_rate": 9.289616761582587e-06, |
| "loss": 1.1095, |
| "mean_token_accuracy": 0.7176755160093308, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.4602211915804495, |
| "grad_norm": 0.4140625, |
| "learning_rate": 9.275393081240882e-06, |
| "loss": 1.1432, |
| "mean_token_accuracy": 0.7123174369335175, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.4637887977167321, |
| "grad_norm": 0.390625, |
| "learning_rate": 9.261039526071374e-06, |
| "loss": 1.181, |
| "mean_token_accuracy": 0.7043406635522842, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.46735640385301463, |
| "grad_norm": 0.404296875, |
| "learning_rate": 9.246556532096079e-06, |
| "loss": 1.1444, |
| "mean_token_accuracy": 0.7133805394172669, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.4709240099892972, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.231944539269009e-06, |
| "loss": 1.1314, |
| "mean_token_accuracy": 0.7159987896680832, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.4744916161255797, |
| "grad_norm": 0.3984375, |
| "learning_rate": 9.217203991462816e-06, |
| "loss": 1.1424, |
| "mean_token_accuracy": 0.7153406947851181, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.4780592222618623, |
| "grad_norm": 0.40625, |
| "learning_rate": 9.202335336455297e-06, |
| "loss": 1.0974, |
| "mean_token_accuracy": 0.720839849114418, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.48162682839814486, |
| "grad_norm": 0.39453125, |
| "learning_rate": 9.187339025915802e-06, |
| "loss": 1.1449, |
| "mean_token_accuracy": 0.714079761505127, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.4851944345344274, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.172215515391511e-06, |
| "loss": 1.1234, |
| "mean_token_accuracy": 0.7155411690473557, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.48876204067070994, |
| "grad_norm": 0.38671875, |
| "learning_rate": 9.156965264293587e-06, |
| "loss": 1.1625, |
| "mean_token_accuracy": 0.7104980289936066, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.4923296468069925, |
| "grad_norm": 0.423828125, |
| "learning_rate": 9.141588735883232e-06, |
| "loss": 1.1397, |
| "mean_token_accuracy": 0.7147118121385574, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.4958972529432751, |
| "grad_norm": 0.404296875, |
| "learning_rate": 9.126086397257613e-06, |
| "loss": 1.1055, |
| "mean_token_accuracy": 0.7200381577014923, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.4994648590795576, |
| "grad_norm": 0.392578125, |
| "learning_rate": 9.110458719335658e-06, |
| "loss": 1.1128, |
| "mean_token_accuracy": 0.7173902809619903, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4994648590795576, |
| "eval_loss": 1.1170151233673096, |
| "eval_mean_token_accuracy": 0.7172522515846939, |
| "eval_runtime": 129.7541, |
| "eval_samples_per_second": 7.283, |
| "eval_steps_per_second": 7.283, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.5030324652158402, |
| "grad_norm": 0.404296875, |
| "learning_rate": 9.094706176843777e-06, |
| "loss": 1.1184, |
| "mean_token_accuracy": 0.7172956675291061, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.5066000713521227, |
| "grad_norm": 0.4296875, |
| "learning_rate": 9.078829248301418e-06, |
| "loss": 1.1457, |
| "mean_token_accuracy": 0.7121971815824508, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.5101676774884053, |
| "grad_norm": 0.400390625, |
| "learning_rate": 9.06282841600654e-06, |
| "loss": 1.1171, |
| "mean_token_accuracy": 0.7190193891525268, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.5137352836246878, |
| "grad_norm": 0.41015625, |
| "learning_rate": 9.046704166020962e-06, |
| "loss": 1.1006, |
| "mean_token_accuracy": 0.7216590374708176, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.5173028897609704, |
| "grad_norm": 0.400390625, |
| "learning_rate": 9.030456988155596e-06, |
| "loss": 1.1472, |
| "mean_token_accuracy": 0.7126083642244339, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.520870495897253, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.014087375955574e-06, |
| "loss": 1.1549, |
| "mean_token_accuracy": 0.7121048629283905, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.5244381020335355, |
| "grad_norm": 0.396484375, |
| "learning_rate": 8.997595826685244e-06, |
| "loss": 1.1367, |
| "mean_token_accuracy": 0.7131784975528717, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.5280057081698181, |
| "grad_norm": 0.408203125, |
| "learning_rate": 8.980982841313074e-06, |
| "loss": 1.1021, |
| "mean_token_accuracy": 0.7187942564487457, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.5315733143061006, |
| "grad_norm": 0.41796875, |
| "learning_rate": 8.964248924496436e-06, |
| "loss": 1.1671, |
| "mean_token_accuracy": 0.7080662369728088, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.5351409204423832, |
| "grad_norm": 0.384765625, |
| "learning_rate": 8.947394584566258e-06, |
| "loss": 1.119, |
| "mean_token_accuracy": 0.7194337636232376, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5387085265786657, |
| "grad_norm": 0.4296875, |
| "learning_rate": 8.930420333511606e-06, |
| "loss": 1.0863, |
| "mean_token_accuracy": 0.7273092210292816, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.5422761327149482, |
| "grad_norm": 0.40234375, |
| "learning_rate": 8.913326686964118e-06, |
| "loss": 1.1094, |
| "mean_token_accuracy": 0.7193956017494202, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.5458437388512308, |
| "grad_norm": 0.423828125, |
| "learning_rate": 8.89611416418234e-06, |
| "loss": 1.1228, |
| "mean_token_accuracy": 0.717705026268959, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.5494113449875134, |
| "grad_norm": 0.41796875, |
| "learning_rate": 8.878783288035958e-06, |
| "loss": 1.1298, |
| "mean_token_accuracy": 0.7152670890092849, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.5529789511237959, |
| "grad_norm": 0.376953125, |
| "learning_rate": 8.861334584989909e-06, |
| "loss": 1.0833, |
| "mean_token_accuracy": 0.7264007419347763, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5565465572600785, |
| "grad_norm": 0.419921875, |
| "learning_rate": 8.843768585088394e-06, |
| "loss": 1.1246, |
| "mean_token_accuracy": 0.7142621994018554, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.560114163396361, |
| "grad_norm": 0.37890625, |
| "learning_rate": 8.82608582193877e-06, |
| "loss": 1.1455, |
| "mean_token_accuracy": 0.7122760713100433, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.5636817695326436, |
| "grad_norm": 0.40625, |
| "learning_rate": 8.80828683269535e-06, |
| "loss": 1.0687, |
| "mean_token_accuracy": 0.7281849682331085, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.5672493756689262, |
| "grad_norm": 0.41796875, |
| "learning_rate": 8.790372158043075e-06, |
| "loss": 1.1762, |
| "mean_token_accuracy": 0.7067204713821411, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.5708169818052087, |
| "grad_norm": 0.41796875, |
| "learning_rate": 8.772342342181094e-06, |
| "loss": 1.1204, |
| "mean_token_accuracy": 0.7168204188346863, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5708169818052087, |
| "eval_loss": 1.113885760307312, |
| "eval_mean_token_accuracy": 0.7179328374131017, |
| "eval_runtime": 129.7287, |
| "eval_samples_per_second": 7.284, |
| "eval_steps_per_second": 7.284, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5743845879414913, |
| "grad_norm": 0.4140625, |
| "learning_rate": 8.754197932806241e-06, |
| "loss": 1.0923, |
| "mean_token_accuracy": 0.7226126015186309, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.5779521940777738, |
| "grad_norm": 0.388671875, |
| "learning_rate": 8.735939481096378e-06, |
| "loss": 1.1291, |
| "mean_token_accuracy": 0.7160821080207824, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.5815198002140564, |
| "grad_norm": 0.396484375, |
| "learning_rate": 8.717567541693673e-06, |
| "loss": 1.1157, |
| "mean_token_accuracy": 0.7175381571054459, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.585087406350339, |
| "grad_norm": 0.443359375, |
| "learning_rate": 8.699082672687734e-06, |
| "loss": 1.1565, |
| "mean_token_accuracy": 0.7133760660886764, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.5886550124866214, |
| "grad_norm": 0.392578125, |
| "learning_rate": 8.680485435598674e-06, |
| "loss": 1.128, |
| "mean_token_accuracy": 0.7148131161928177, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.592222618622904, |
| "grad_norm": 0.38671875, |
| "learning_rate": 8.66177639536003e-06, |
| "loss": 1.1639, |
| "mean_token_accuracy": 0.7102029919624329, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5957902247591866, |
| "grad_norm": 0.423828125, |
| "learning_rate": 8.642956120301626e-06, |
| "loss": 1.1691, |
| "mean_token_accuracy": 0.7109829038381577, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.5993578308954691, |
| "grad_norm": 0.4375, |
| "learning_rate": 8.624025182132293e-06, |
| "loss": 1.1153, |
| "mean_token_accuracy": 0.7207663655281067, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.6029254370317517, |
| "grad_norm": 0.408203125, |
| "learning_rate": 8.604984155922507e-06, |
| "loss": 1.1443, |
| "mean_token_accuracy": 0.7137151628732681, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.6064930431680342, |
| "grad_norm": 0.41015625, |
| "learning_rate": 8.585833620086919e-06, |
| "loss": 1.1282, |
| "mean_token_accuracy": 0.7165161371231079, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.6100606493043168, |
| "grad_norm": 0.408203125, |
| "learning_rate": 8.566574156366784e-06, |
| "loss": 1.1128, |
| "mean_token_accuracy": 0.7188989132642746, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.6136282554405994, |
| "grad_norm": 0.408203125, |
| "learning_rate": 8.547206349812298e-06, |
| "loss": 1.0959, |
| "mean_token_accuracy": 0.7228006720542908, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.6171958615768819, |
| "grad_norm": 0.404296875, |
| "learning_rate": 8.527730788764806e-06, |
| "loss": 1.0866, |
| "mean_token_accuracy": 0.7220207571983337, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.6207634677131645, |
| "grad_norm": 0.3828125, |
| "learning_rate": 8.508148064838948e-06, |
| "loss": 1.1204, |
| "mean_token_accuracy": 0.7184960007667541, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.624331073849447, |
| "grad_norm": 0.40625, |
| "learning_rate": 8.488458772904685e-06, |
| "loss": 1.1005, |
| "mean_token_accuracy": 0.7205128222703934, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.6278986799857296, |
| "grad_norm": 0.412109375, |
| "learning_rate": 8.468663511069217e-06, |
| "loss": 1.093, |
| "mean_token_accuracy": 0.7237001955509186, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.6314662861220122, |
| "grad_norm": 0.40625, |
| "learning_rate": 8.448762880658824e-06, |
| "loss": 1.1175, |
| "mean_token_accuracy": 0.7174406319856643, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.6350338922582947, |
| "grad_norm": 0.41015625, |
| "learning_rate": 8.428757486200603e-06, |
| "loss": 1.1455, |
| "mean_token_accuracy": 0.7134257555007935, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.6386014983945772, |
| "grad_norm": 0.37109375, |
| "learning_rate": 8.40864793540409e-06, |
| "loss": 1.0907, |
| "mean_token_accuracy": 0.7222222208976745, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.6421691045308598, |
| "grad_norm": 0.439453125, |
| "learning_rate": 8.388434839142814e-06, |
| "loss": 1.1291, |
| "mean_token_accuracy": 0.7152737528085709, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6421691045308598, |
| "eval_loss": 1.1112759113311768, |
| "eval_mean_token_accuracy": 0.7183094984009153, |
| "eval_runtime": 129.5552, |
| "eval_samples_per_second": 7.294, |
| "eval_steps_per_second": 7.294, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6457367106671423, |
| "grad_norm": 0.416015625, |
| "learning_rate": 8.368118811435727e-06, |
| "loss": 1.1139, |
| "mean_token_accuracy": 0.7180525124073028, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.6493043168034249, |
| "grad_norm": 0.43359375, |
| "learning_rate": 8.347700469428564e-06, |
| "loss": 1.0676, |
| "mean_token_accuracy": 0.7244535982608795, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.6528719229397074, |
| "grad_norm": 0.4296875, |
| "learning_rate": 8.327180433375091e-06, |
| "loss": 1.1539, |
| "mean_token_accuracy": 0.7107728093862533, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.65643952907599, |
| "grad_norm": 0.41015625, |
| "learning_rate": 8.30655932661826e-06, |
| "loss": 1.0917, |
| "mean_token_accuracy": 0.7237072676420212, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.6600071352122726, |
| "grad_norm": 0.419921875, |
| "learning_rate": 8.285837775571277e-06, |
| "loss": 1.1071, |
| "mean_token_accuracy": 0.7205540329217911, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.6635747413485551, |
| "grad_norm": 0.40625, |
| "learning_rate": 8.265016409698575e-06, |
| "loss": 1.1168, |
| "mean_token_accuracy": 0.7185046076774597, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.6671423474848377, |
| "grad_norm": 0.392578125, |
| "learning_rate": 8.244095861496686e-06, |
| "loss": 1.126, |
| "mean_token_accuracy": 0.7147445350885391, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.6707099536211202, |
| "grad_norm": 0.416015625, |
| "learning_rate": 8.223076766475035e-06, |
| "loss": 1.1393, |
| "mean_token_accuracy": 0.7144123941659928, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.6742775597574028, |
| "grad_norm": 0.400390625, |
| "learning_rate": 8.201959763136633e-06, |
| "loss": 1.0996, |
| "mean_token_accuracy": 0.720915749669075, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.6778451658936854, |
| "grad_norm": 0.4453125, |
| "learning_rate": 8.180745492958675e-06, |
| "loss": 1.1228, |
| "mean_token_accuracy": 0.7163150638341904, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.6814127720299679, |
| "grad_norm": 0.388671875, |
| "learning_rate": 8.15943460037306e-06, |
| "loss": 1.0919, |
| "mean_token_accuracy": 0.7232828825712204, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.6849803781662505, |
| "grad_norm": 0.375, |
| "learning_rate": 8.138027732746818e-06, |
| "loss": 1.1326, |
| "mean_token_accuracy": 0.7160445064306259, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.688547984302533, |
| "grad_norm": 0.41015625, |
| "learning_rate": 8.116525540362434e-06, |
| "loss": 1.0976, |
| "mean_token_accuracy": 0.7222598105669021, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.6921155904388155, |
| "grad_norm": 0.404296875, |
| "learning_rate": 8.094928676398102e-06, |
| "loss": 1.1088, |
| "mean_token_accuracy": 0.7192872434854507, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.6956831965750981, |
| "grad_norm": 0.453125, |
| "learning_rate": 8.073237796907882e-06, |
| "loss": 1.1296, |
| "mean_token_accuracy": 0.7178067773580551, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.6992508027113806, |
| "grad_norm": 0.408203125, |
| "learning_rate": 8.051453560801772e-06, |
| "loss": 1.126, |
| "mean_token_accuracy": 0.7155622184276581, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.7028184088476632, |
| "grad_norm": 0.400390625, |
| "learning_rate": 8.029576629825688e-06, |
| "loss": 1.1065, |
| "mean_token_accuracy": 0.7209303051233291, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.7063860149839458, |
| "grad_norm": 0.40625, |
| "learning_rate": 8.007607668541362e-06, |
| "loss": 1.112, |
| "mean_token_accuracy": 0.7199948191642761, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.7099536211202283, |
| "grad_norm": 0.423828125, |
| "learning_rate": 7.98554734430616e-06, |
| "loss": 1.1144, |
| "mean_token_accuracy": 0.7163010597229004, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.7135212272565109, |
| "grad_norm": 0.4140625, |
| "learning_rate": 7.963396327252812e-06, |
| "loss": 1.1727, |
| "mean_token_accuracy": 0.7071001201868057, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7135212272565109, |
| "eval_loss": 1.1092555522918701, |
| "eval_mean_token_accuracy": 0.71876708281734, |
| "eval_runtime": 129.8182, |
| "eval_samples_per_second": 7.279, |
| "eval_steps_per_second": 7.279, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7170888333927934, |
| "grad_norm": 0.38671875, |
| "learning_rate": 7.941155290269038e-06, |
| "loss": 1.1294, |
| "mean_token_accuracy": 0.7160616219043732, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.720656439529076, |
| "grad_norm": 0.392578125, |
| "learning_rate": 7.918824908977122e-06, |
| "loss": 1.1077, |
| "mean_token_accuracy": 0.717936509847641, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.7242240456653586, |
| "grad_norm": 0.43359375, |
| "learning_rate": 7.896405861713393e-06, |
| "loss": 1.1357, |
| "mean_token_accuracy": 0.7146857738494873, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.7277916518016411, |
| "grad_norm": 0.396484375, |
| "learning_rate": 7.873898829507606e-06, |
| "loss": 1.1267, |
| "mean_token_accuracy": 0.7182570159435272, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.7313592579379237, |
| "grad_norm": 0.4140625, |
| "learning_rate": 7.851304496062255e-06, |
| "loss": 1.1009, |
| "mean_token_accuracy": 0.7209157526493073, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.7349268640742062, |
| "grad_norm": 0.421875, |
| "learning_rate": 7.828623547731817e-06, |
| "loss": 1.114, |
| "mean_token_accuracy": 0.7182909816503524, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.7384944702104888, |
| "grad_norm": 0.41015625, |
| "learning_rate": 7.80585667350189e-06, |
| "loss": 1.1389, |
| "mean_token_accuracy": 0.7153458327054978, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.7420620763467713, |
| "grad_norm": 0.431640625, |
| "learning_rate": 7.783004564968264e-06, |
| "loss": 1.1433, |
| "mean_token_accuracy": 0.7121430188417435, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.7456296824830538, |
| "grad_norm": 0.3984375, |
| "learning_rate": 7.760067916315921e-06, |
| "loss": 1.1025, |
| "mean_token_accuracy": 0.7199740529060363, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.7491972886193364, |
| "grad_norm": 0.408203125, |
| "learning_rate": 7.73704742429794e-06, |
| "loss": 1.1541, |
| "mean_token_accuracy": 0.7129777103662491, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.752764894755619, |
| "grad_norm": 0.421875, |
| "learning_rate": 7.713943788214337e-06, |
| "loss": 1.1749, |
| "mean_token_accuracy": 0.7083684355020523, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.7563325008919015, |
| "grad_norm": 0.423828125, |
| "learning_rate": 7.690757709890812e-06, |
| "loss": 1.1163, |
| "mean_token_accuracy": 0.7175366312265397, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.7599001070281841, |
| "grad_norm": 0.41015625, |
| "learning_rate": 7.66748989365744e-06, |
| "loss": 1.1011, |
| "mean_token_accuracy": 0.7209264308214187, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.7634677131644666, |
| "grad_norm": 0.439453125, |
| "learning_rate": 7.644141046327272e-06, |
| "loss": 1.0954, |
| "mean_token_accuracy": 0.7210824608802795, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.7670353193007492, |
| "grad_norm": 0.390625, |
| "learning_rate": 7.620711877174865e-06, |
| "loss": 1.0982, |
| "mean_token_accuracy": 0.7221284538507462, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.7706029254370318, |
| "grad_norm": 0.43359375, |
| "learning_rate": 7.597203097914732e-06, |
| "loss": 1.1388, |
| "mean_token_accuracy": 0.7131964445114136, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.7741705315733143, |
| "grad_norm": 0.376953125, |
| "learning_rate": 7.573615422679726e-06, |
| "loss": 1.0986, |
| "mean_token_accuracy": 0.7218159317970276, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.7777381377095969, |
| "grad_norm": 0.404296875, |
| "learning_rate": 7.549949567999345e-06, |
| "loss": 1.1109, |
| "mean_token_accuracy": 0.720161783695221, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.7813057438458794, |
| "grad_norm": 0.392578125, |
| "learning_rate": 7.526206252777968e-06, |
| "loss": 1.1174, |
| "mean_token_accuracy": 0.7193341284990311, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.784873349982162, |
| "grad_norm": 0.4375, |
| "learning_rate": 7.50238619827301e-06, |
| "loss": 1.1282, |
| "mean_token_accuracy": 0.7151526242494584, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.784873349982162, |
| "eval_loss": 1.1076802015304565, |
| "eval_mean_token_accuracy": 0.7190840867461351, |
| "eval_runtime": 129.5985, |
| "eval_samples_per_second": 7.292, |
| "eval_steps_per_second": 7.292, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.7884409561184446, |
| "grad_norm": 0.38671875, |
| "learning_rate": 7.478490128073021e-06, |
| "loss": 1.1367, |
| "mean_token_accuracy": 0.7158470660448074, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.792008562254727, |
| "grad_norm": 0.455078125, |
| "learning_rate": 7.454518768075705e-06, |
| "loss": 1.1032, |
| "mean_token_accuracy": 0.7226166218519211, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.7955761683910096, |
| "grad_norm": 0.408203125, |
| "learning_rate": 7.430472846465856e-06, |
| "loss": 1.0852, |
| "mean_token_accuracy": 0.7247954785823822, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.7991437745272922, |
| "grad_norm": 0.40234375, |
| "learning_rate": 7.406353093693254e-06, |
| "loss": 1.1243, |
| "mean_token_accuracy": 0.7176846712827682, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.8027113806635747, |
| "grad_norm": 0.421875, |
| "learning_rate": 7.382160242450469e-06, |
| "loss": 1.1019, |
| "mean_token_accuracy": 0.7193223416805268, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.8062789867998573, |
| "grad_norm": 0.369140625, |
| "learning_rate": 7.3578950276505986e-06, |
| "loss": 1.1045, |
| "mean_token_accuracy": 0.7196703284978867, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.8098465929361398, |
| "grad_norm": 0.41015625, |
| "learning_rate": 7.333558186404957e-06, |
| "loss": 1.1259, |
| "mean_token_accuracy": 0.7183966040611267, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.8134141990724224, |
| "grad_norm": 0.4296875, |
| "learning_rate": 7.309150458000668e-06, |
| "loss": 1.1084, |
| "mean_token_accuracy": 0.719590961933136, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.816981805208705, |
| "grad_norm": 0.376953125, |
| "learning_rate": 7.284672583878218e-06, |
| "loss": 1.1115, |
| "mean_token_accuracy": 0.7197863161563873, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.8205494113449875, |
| "grad_norm": 0.40625, |
| "learning_rate": 7.260125307608929e-06, |
| "loss": 1.1326, |
| "mean_token_accuracy": 0.7145008385181427, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.8241170174812701, |
| "grad_norm": 0.40625, |
| "learning_rate": 7.2355093748723736e-06, |
| "loss": 1.0616, |
| "mean_token_accuracy": 0.7284486919641495, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.8276846236175526, |
| "grad_norm": 0.416015625, |
| "learning_rate": 7.21082553343372e-06, |
| "loss": 1.1128, |
| "mean_token_accuracy": 0.718557596206665, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.8312522297538352, |
| "grad_norm": 0.39453125, |
| "learning_rate": 7.1860745331210134e-06, |
| "loss": 1.0932, |
| "mean_token_accuracy": 0.7211322575807572, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.8348198358901178, |
| "grad_norm": 0.38671875, |
| "learning_rate": 7.161257125802412e-06, |
| "loss": 1.0894, |
| "mean_token_accuracy": 0.7218823105096817, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.8383874420264003, |
| "grad_norm": 0.400390625, |
| "learning_rate": 7.136374065363334e-06, |
| "loss": 1.108, |
| "mean_token_accuracy": 0.7213227808475494, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.8419550481626829, |
| "grad_norm": 0.396484375, |
| "learning_rate": 7.11142610768356e-06, |
| "loss": 1.1143, |
| "mean_token_accuracy": 0.7198183685541153, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.8455226542989654, |
| "grad_norm": 0.400390625, |
| "learning_rate": 7.086414010614275e-06, |
| "loss": 1.0941, |
| "mean_token_accuracy": 0.7209905326366425, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.8490902604352479, |
| "grad_norm": 0.41015625, |
| "learning_rate": 7.061338533955042e-06, |
| "loss": 1.0979, |
| "mean_token_accuracy": 0.7203459590673447, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.8526578665715305, |
| "grad_norm": 0.41796875, |
| "learning_rate": 7.036200439430726e-06, |
| "loss": 1.1143, |
| "mean_token_accuracy": 0.719848895072937, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.856225472707813, |
| "grad_norm": 0.39453125, |
| "learning_rate": 7.011000490668351e-06, |
| "loss": 1.1283, |
| "mean_token_accuracy": 0.7175860285758973, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.856225472707813, |
| "eval_loss": 1.1063700914382935, |
| "eval_mean_token_accuracy": 0.7192220385743197, |
| "eval_runtime": 129.9411, |
| "eval_samples_per_second": 7.273, |
| "eval_steps_per_second": 7.273, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.8597930788440956, |
| "grad_norm": 0.400390625, |
| "learning_rate": 6.985739453173903e-06, |
| "loss": 1.1124, |
| "mean_token_accuracy": 0.7205769211053848, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.8633606849803782, |
| "grad_norm": 0.41015625, |
| "learning_rate": 6.960418094309085e-06, |
| "loss": 1.0814, |
| "mean_token_accuracy": 0.7245164632797241, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.8669282911166607, |
| "grad_norm": 0.38671875, |
| "learning_rate": 6.935037183267991e-06, |
| "loss": 1.0952, |
| "mean_token_accuracy": 0.7253048270940781, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.8704958972529433, |
| "grad_norm": 0.380859375, |
| "learning_rate": 6.909597491053752e-06, |
| "loss": 1.0957, |
| "mean_token_accuracy": 0.7212597787380218, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.8740635033892258, |
| "grad_norm": 0.421875, |
| "learning_rate": 6.8840997904551135e-06, |
| "loss": 1.1455, |
| "mean_token_accuracy": 0.7126419723033905, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.8776311095255084, |
| "grad_norm": 0.4140625, |
| "learning_rate": 6.858544856022953e-06, |
| "loss": 1.1135, |
| "mean_token_accuracy": 0.7193411946296692, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.881198715661791, |
| "grad_norm": 0.41015625, |
| "learning_rate": 6.83293346404676e-06, |
| "loss": 1.1466, |
| "mean_token_accuracy": 0.7116483449935913, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.8847663217980735, |
| "grad_norm": 0.4140625, |
| "learning_rate": 6.807266392531051e-06, |
| "loss": 1.1267, |
| "mean_token_accuracy": 0.7160818904638291, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.8883339279343561, |
| "grad_norm": 0.3984375, |
| "learning_rate": 6.781544421171733e-06, |
| "loss": 1.0753, |
| "mean_token_accuracy": 0.7256105065345764, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.8919015340706385, |
| "grad_norm": 0.44140625, |
| "learning_rate": 6.755768331332424e-06, |
| "loss": 1.0929, |
| "mean_token_accuracy": 0.7246196806430817, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.8954691402069211, |
| "grad_norm": 0.376953125, |
| "learning_rate": 6.729938906020713e-06, |
| "loss": 1.1191, |
| "mean_token_accuracy": 0.7169597029685975, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.8990367463432037, |
| "grad_norm": 0.39453125, |
| "learning_rate": 6.704056929864376e-06, |
| "loss": 1.0886, |
| "mean_token_accuracy": 0.7246688008308411, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.9026043524794862, |
| "grad_norm": 0.376953125, |
| "learning_rate": 6.67812318908754e-06, |
| "loss": 1.0827, |
| "mean_token_accuracy": 0.7253311902284623, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.9061719586157688, |
| "grad_norm": 0.396484375, |
| "learning_rate": 6.6521384714868005e-06, |
| "loss": 1.1279, |
| "mean_token_accuracy": 0.7162562996149063, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.9097395647520514, |
| "grad_norm": 0.3984375, |
| "learning_rate": 6.626103566407296e-06, |
| "loss": 1.1595, |
| "mean_token_accuracy": 0.7120533227920532, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.9133071708883339, |
| "grad_norm": 0.451171875, |
| "learning_rate": 6.600019264718713e-06, |
| "loss": 1.1472, |
| "mean_token_accuracy": 0.711984121799469, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.9168747770246165, |
| "grad_norm": 0.4296875, |
| "learning_rate": 6.573886358791285e-06, |
| "loss": 1.0996, |
| "mean_token_accuracy": 0.7185855805873871, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.920442383160899, |
| "grad_norm": 0.42578125, |
| "learning_rate": 6.547705642471703e-06, |
| "loss": 1.1321, |
| "mean_token_accuracy": 0.7178098350763321, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.9240099892971816, |
| "grad_norm": 0.42578125, |
| "learning_rate": 6.521477911059009e-06, |
| "loss": 1.1223, |
| "mean_token_accuracy": 0.7173870533704758, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.9275775954334642, |
| "grad_norm": 0.390625, |
| "learning_rate": 6.495203961280434e-06, |
| "loss": 1.0704, |
| "mean_token_accuracy": 0.7261828422546387, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.9275775954334642, |
| "eval_loss": 1.1053811311721802, |
| "eval_mean_token_accuracy": 0.719482549977681, |
| "eval_runtime": 129.7866, |
| "eval_samples_per_second": 7.281, |
| "eval_steps_per_second": 7.281, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.9311452015697467, |
| "grad_norm": 0.42578125, |
| "learning_rate": 6.468884591267204e-06, |
| "loss": 1.0919, |
| "mean_token_accuracy": 0.7208716124296188, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.9347128077060293, |
| "grad_norm": 0.388671875, |
| "learning_rate": 6.442520600530281e-06, |
| "loss": 1.1299, |
| "mean_token_accuracy": 0.7162574619054795, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.9382804138423118, |
| "grad_norm": 0.396484375, |
| "learning_rate": 6.416112789936087e-06, |
| "loss": 1.1068, |
| "mean_token_accuracy": 0.719712033867836, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.9418480199785944, |
| "grad_norm": 0.41796875, |
| "learning_rate": 6.389661961682173e-06, |
| "loss": 1.1466, |
| "mean_token_accuracy": 0.7130582392215729, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.945415626114877, |
| "grad_norm": 0.396484375, |
| "learning_rate": 6.363168919272846e-06, |
| "loss": 1.0967, |
| "mean_token_accuracy": 0.7230816513299942, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.9489832322511594, |
| "grad_norm": 0.40234375, |
| "learning_rate": 6.336634467494769e-06, |
| "loss": 1.0929, |
| "mean_token_accuracy": 0.7232824087142944, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.952550838387442, |
| "grad_norm": 0.37890625, |
| "learning_rate": 6.310059412392506e-06, |
| "loss": 1.1006, |
| "mean_token_accuracy": 0.721543088555336, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.9561184445237246, |
| "grad_norm": 0.431640625, |
| "learning_rate": 6.283444561244042e-06, |
| "loss": 1.1479, |
| "mean_token_accuracy": 0.7155443519353867, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.9596860506600071, |
| "grad_norm": 0.400390625, |
| "learning_rate": 6.256790722536252e-06, |
| "loss": 1.1503, |
| "mean_token_accuracy": 0.7140598267316818, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.9632536567962897, |
| "grad_norm": 0.392578125, |
| "learning_rate": 6.230098705940354e-06, |
| "loss": 1.1413, |
| "mean_token_accuracy": 0.7128863990306854, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.9668212629325722, |
| "grad_norm": 0.39453125, |
| "learning_rate": 6.203369322287306e-06, |
| "loss": 1.1257, |
| "mean_token_accuracy": 0.7178693532943725, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.9703888690688548, |
| "grad_norm": 0.40625, |
| "learning_rate": 6.17660338354317e-06, |
| "loss": 1.1121, |
| "mean_token_accuracy": 0.7180402904748917, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.9739564752051374, |
| "grad_norm": 0.392578125, |
| "learning_rate": 6.149801702784457e-06, |
| "loss": 1.0664, |
| "mean_token_accuracy": 0.726373627781868, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.9775240813414199, |
| "grad_norm": 0.37890625, |
| "learning_rate": 6.122965094173424e-06, |
| "loss": 1.143, |
| "mean_token_accuracy": 0.7148909151554108, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.9810916874777025, |
| "grad_norm": 0.423828125, |
| "learning_rate": 6.0960943729333374e-06, |
| "loss": 1.1238, |
| "mean_token_accuracy": 0.7170207530260087, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.984659293613985, |
| "grad_norm": 0.40625, |
| "learning_rate": 6.0691903553237175e-06, |
| "loss": 1.1131, |
| "mean_token_accuracy": 0.720502644777298, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.9882268997502676, |
| "grad_norm": 0.443359375, |
| "learning_rate": 6.042253858615532e-06, |
| "loss": 1.1352, |
| "mean_token_accuracy": 0.7152940958738327, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.9917945058865502, |
| "grad_norm": 0.400390625, |
| "learning_rate": 6.015285701066382e-06, |
| "loss": 1.1455, |
| "mean_token_accuracy": 0.7113858342170716, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.9953621120228326, |
| "grad_norm": 0.412109375, |
| "learning_rate": 5.988286701895631e-06, |
| "loss": 1.1069, |
| "mean_token_accuracy": 0.7202442049980163, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.9989297181591152, |
| "grad_norm": 0.416015625, |
| "learning_rate": 5.961257681259536e-06, |
| "loss": 1.1122, |
| "mean_token_accuracy": 0.7190551698207855, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.9989297181591152, |
| "eval_loss": 1.1045879125595093, |
| "eval_mean_token_accuracy": 0.71954735315666, |
| "eval_runtime": 129.8512, |
| "eval_samples_per_second": 7.278, |
| "eval_steps_per_second": 7.278, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.0024973242953978, |
| "grad_norm": 0.390625, |
| "learning_rate": 5.934199460226318e-06, |
| "loss": 1.1141, |
| "mean_token_accuracy": 0.7181821793317795, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.0060649304316804, |
| "grad_norm": 0.41015625, |
| "learning_rate": 5.907112860751229e-06, |
| "loss": 1.0648, |
| "mean_token_accuracy": 0.7280068606138229, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.0096325365679628, |
| "grad_norm": 0.39453125, |
| "learning_rate": 5.87999870565158e-06, |
| "loss": 1.0918, |
| "mean_token_accuracy": 0.7223397463560104, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.0132001427042454, |
| "grad_norm": 0.390625, |
| "learning_rate": 5.852857818581752e-06, |
| "loss": 1.1075, |
| "mean_token_accuracy": 0.7205494433641434, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.016767748840528, |
| "grad_norm": 0.408203125, |
| "learning_rate": 5.8256910240081625e-06, |
| "loss": 1.1043, |
| "mean_token_accuracy": 0.7207432836294174, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.0203353549768106, |
| "grad_norm": 0.4140625, |
| "learning_rate": 5.798499147184233e-06, |
| "loss": 1.1094, |
| "mean_token_accuracy": 0.7200416833162308, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.0239029611130932, |
| "grad_norm": 0.42578125, |
| "learning_rate": 5.771283014125317e-06, |
| "loss": 1.1142, |
| "mean_token_accuracy": 0.7189957290887833, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.0274705672493756, |
| "grad_norm": 0.396484375, |
| "learning_rate": 5.744043451583606e-06, |
| "loss": 1.0526, |
| "mean_token_accuracy": 0.729624542593956, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.0310381733856582, |
| "grad_norm": 0.416015625, |
| "learning_rate": 5.71678128702301e-06, |
| "loss": 1.0836, |
| "mean_token_accuracy": 0.7231720060110092, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.0346057795219408, |
| "grad_norm": 0.400390625, |
| "learning_rate": 5.689497348594035e-06, |
| "loss": 1.0864, |
| "mean_token_accuracy": 0.7251443862915039, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.0381733856582234, |
| "grad_norm": 0.37890625, |
| "learning_rate": 5.662192465108613e-06, |
| "loss": 1.1158, |
| "mean_token_accuracy": 0.7158897966146469, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.041740991794506, |
| "grad_norm": 0.39453125, |
| "learning_rate": 5.634867466014932e-06, |
| "loss": 1.0744, |
| "mean_token_accuracy": 0.7273076504468918, |
| "step": 2920 |
| }, |
| { |
| "epoch": 1.0453085979307883, |
| "grad_norm": 0.40625, |
| "learning_rate": 5.607523181372234e-06, |
| "loss": 1.124, |
| "mean_token_accuracy": 0.7161523163318634, |
| "step": 2930 |
| }, |
| { |
| "epoch": 1.048876204067071, |
| "grad_norm": 0.42578125, |
| "learning_rate": 5.580160441825612e-06, |
| "loss": 1.1439, |
| "mean_token_accuracy": 0.7132133394479752, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.0524438102033535, |
| "grad_norm": 0.404296875, |
| "learning_rate": 5.552780078580756e-06, |
| "loss": 1.1128, |
| "mean_token_accuracy": 0.7194567203521729, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.0560114163396361, |
| "grad_norm": 0.3984375, |
| "learning_rate": 5.525382923378728e-06, |
| "loss": 1.1078, |
| "mean_token_accuracy": 0.7216997921466828, |
| "step": 2960 |
| }, |
| { |
| "epoch": 1.0595790224759187, |
| "grad_norm": 0.400390625, |
| "learning_rate": 5.49796980847068e-06, |
| "loss": 1.0823, |
| "mean_token_accuracy": 0.7254059910774231, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.063146628612201, |
| "grad_norm": 0.4140625, |
| "learning_rate": 5.470541566592573e-06, |
| "loss": 1.0758, |
| "mean_token_accuracy": 0.7267078697681427, |
| "step": 2980 |
| }, |
| { |
| "epoch": 1.0667142347484837, |
| "grad_norm": 0.40234375, |
| "learning_rate": 5.443099030939887e-06, |
| "loss": 1.1415, |
| "mean_token_accuracy": 0.7161742985248566, |
| "step": 2990 |
| }, |
| { |
| "epoch": 1.0702818408847663, |
| "grad_norm": 0.41015625, |
| "learning_rate": 5.41564303514231e-06, |
| "loss": 1.0969, |
| "mean_token_accuracy": 0.7204415529966355, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.0702818408847663, |
| "eval_loss": 1.104050874710083, |
| "eval_mean_token_accuracy": 0.7196486196189961, |
| "eval_runtime": 129.5743, |
| "eval_samples_per_second": 7.293, |
| "eval_steps_per_second": 7.293, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.073849447021049, |
| "grad_norm": 0.41015625, |
| "learning_rate": 5.388174413238411e-06, |
| "loss": 1.1008, |
| "mean_token_accuracy": 0.722458791732788, |
| "step": 3010 |
| }, |
| { |
| "epoch": 1.0774170531573315, |
| "grad_norm": 0.3984375, |
| "learning_rate": 5.360693999650303e-06, |
| "loss": 1.116, |
| "mean_token_accuracy": 0.7188027024269104, |
| "step": 3020 |
| }, |
| { |
| "epoch": 1.0809846592936139, |
| "grad_norm": 0.38671875, |
| "learning_rate": 5.333202629158301e-06, |
| "loss": 1.1376, |
| "mean_token_accuracy": 0.7148137897253036, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.0845522654298965, |
| "grad_norm": 0.408203125, |
| "learning_rate": 5.305701136875566e-06, |
| "loss": 1.0712, |
| "mean_token_accuracy": 0.7277883917093277, |
| "step": 3040 |
| }, |
| { |
| "epoch": 1.088119871566179, |
| "grad_norm": 0.42578125, |
| "learning_rate": 5.278190358222721e-06, |
| "loss": 1.0932, |
| "mean_token_accuracy": 0.7220012903213501, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.0916874777024617, |
| "grad_norm": 0.419921875, |
| "learning_rate": 5.250671128902491e-06, |
| "loss": 1.1314, |
| "mean_token_accuracy": 0.7159400582313538, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.0952550838387443, |
| "grad_norm": 0.396484375, |
| "learning_rate": 5.223144284874307e-06, |
| "loss": 1.0971, |
| "mean_token_accuracy": 0.721205735206604, |
| "step": 3070 |
| }, |
| { |
| "epoch": 1.0988226899750269, |
| "grad_norm": 0.390625, |
| "learning_rate": 5.1956106623289145e-06, |
| "loss": 1.0998, |
| "mean_token_accuracy": 0.7201095730066299, |
| "step": 3080 |
| }, |
| { |
| "epoch": 1.1023902961113092, |
| "grad_norm": 0.451171875, |
| "learning_rate": 5.168071097662972e-06, |
| "loss": 1.0961, |
| "mean_token_accuracy": 0.7220069378614425, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.1059579022475918, |
| "grad_norm": 0.416015625, |
| "learning_rate": 5.140526427453645e-06, |
| "loss": 1.1041, |
| "mean_token_accuracy": 0.7202136754989624, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.1095255083838744, |
| "grad_norm": 0.388671875, |
| "learning_rate": 5.112977488433188e-06, |
| "loss": 1.1036, |
| "mean_token_accuracy": 0.7218604981899261, |
| "step": 3110 |
| }, |
| { |
| "epoch": 1.113093114520157, |
| "grad_norm": 0.40234375, |
| "learning_rate": 5.085425117463533e-06, |
| "loss": 1.1064, |
| "mean_token_accuracy": 0.7209546893835068, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.1166607206564396, |
| "grad_norm": 0.3828125, |
| "learning_rate": 5.057870151510864e-06, |
| "loss": 1.0473, |
| "mean_token_accuracy": 0.7317735016345978, |
| "step": 3130 |
| }, |
| { |
| "epoch": 1.120228326792722, |
| "grad_norm": 0.400390625, |
| "learning_rate": 5.030313427620197e-06, |
| "loss": 1.0901, |
| "mean_token_accuracy": 0.7229151397943496, |
| "step": 3140 |
| }, |
| { |
| "epoch": 1.1237959329290046, |
| "grad_norm": 0.455078125, |
| "learning_rate": 5.002755782889943e-06, |
| "loss": 1.1259, |
| "mean_token_accuracy": 0.7180311203002929, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.1273635390652872, |
| "grad_norm": 0.400390625, |
| "learning_rate": 4.9751980544464916e-06, |
| "loss": 1.1069, |
| "mean_token_accuracy": 0.7184020102024078, |
| "step": 3160 |
| }, |
| { |
| "epoch": 1.1309311452015698, |
| "grad_norm": 0.408203125, |
| "learning_rate": 4.9476410794187726e-06, |
| "loss": 1.107, |
| "mean_token_accuracy": 0.7216697186231613, |
| "step": 3170 |
| }, |
| { |
| "epoch": 1.1344987513378524, |
| "grad_norm": 0.4140625, |
| "learning_rate": 4.9200856949128285e-06, |
| "loss": 1.1157, |
| "mean_token_accuracy": 0.7185103803873062, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.1380663574741348, |
| "grad_norm": 0.38671875, |
| "learning_rate": 4.892532737986387e-06, |
| "loss": 1.0641, |
| "mean_token_accuracy": 0.7288919419050217, |
| "step": 3190 |
| }, |
| { |
| "epoch": 1.1416339636104174, |
| "grad_norm": 0.3984375, |
| "learning_rate": 4.864983045623434e-06, |
| "loss": 1.093, |
| "mean_token_accuracy": 0.7208064585924149, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.1416339636104174, |
| "eval_loss": 1.1036481857299805, |
| "eval_mean_token_accuracy": 0.7197175011432991, |
| "eval_runtime": 129.7798, |
| "eval_samples_per_second": 7.282, |
| "eval_steps_per_second": 7.282, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.1452015697467, |
| "grad_norm": 0.384765625, |
| "learning_rate": 4.837437454708784e-06, |
| "loss": 1.087, |
| "mean_token_accuracy": 0.7225021064281464, |
| "step": 3210 |
| }, |
| { |
| "epoch": 1.1487691758829826, |
| "grad_norm": 0.41796875, |
| "learning_rate": 4.809896802002662e-06, |
| "loss": 1.126, |
| "mean_token_accuracy": 0.717538720369339, |
| "step": 3220 |
| }, |
| { |
| "epoch": 1.1523367820192651, |
| "grad_norm": 0.404296875, |
| "learning_rate": 4.782361924115286e-06, |
| "loss": 1.1145, |
| "mean_token_accuracy": 0.7184861123561859, |
| "step": 3230 |
| }, |
| { |
| "epoch": 1.1559043881555477, |
| "grad_norm": 0.390625, |
| "learning_rate": 4.754833657481445e-06, |
| "loss": 1.1047, |
| "mean_token_accuracy": 0.7220670431852341, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.1594719942918301, |
| "grad_norm": 0.419921875, |
| "learning_rate": 4.727312838335101e-06, |
| "loss": 1.13, |
| "mean_token_accuracy": 0.716909921169281, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.1630396004281127, |
| "grad_norm": 0.38671875, |
| "learning_rate": 4.699800302683981e-06, |
| "loss": 1.0671, |
| "mean_token_accuracy": 0.7277197778224945, |
| "step": 3260 |
| }, |
| { |
| "epoch": 1.1666072065643953, |
| "grad_norm": 0.404296875, |
| "learning_rate": 4.6722968862841805e-06, |
| "loss": 1.1069, |
| "mean_token_accuracy": 0.7204835414886475, |
| "step": 3270 |
| }, |
| { |
| "epoch": 1.170174812700678, |
| "grad_norm": 0.3984375, |
| "learning_rate": 4.644803424614775e-06, |
| "loss": 1.0959, |
| "mean_token_accuracy": 0.7221046984195709, |
| "step": 3280 |
| }, |
| { |
| "epoch": 1.1737424188369605, |
| "grad_norm": 0.412109375, |
| "learning_rate": 4.617320752852448e-06, |
| "loss": 1.1679, |
| "mean_token_accuracy": 0.7081861823797226, |
| "step": 3290 |
| }, |
| { |
| "epoch": 1.1773100249732429, |
| "grad_norm": 0.42578125, |
| "learning_rate": 4.58984970584611e-06, |
| "loss": 1.0946, |
| "mean_token_accuracy": 0.7215434044599534, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.1808776311095255, |
| "grad_norm": 0.41796875, |
| "learning_rate": 4.562391118091544e-06, |
| "loss": 1.1253, |
| "mean_token_accuracy": 0.7170281201601029, |
| "step": 3310 |
| }, |
| { |
| "epoch": 1.184445237245808, |
| "grad_norm": 0.38671875, |
| "learning_rate": 4.5349458237060565e-06, |
| "loss": 1.0819, |
| "mean_token_accuracy": 0.72613705098629, |
| "step": 3320 |
| }, |
| { |
| "epoch": 1.1880128433820907, |
| "grad_norm": 0.416015625, |
| "learning_rate": 4.507514656403137e-06, |
| "loss": 1.1511, |
| "mean_token_accuracy": 0.7124078363180161, |
| "step": 3330 |
| }, |
| { |
| "epoch": 1.1915804495183733, |
| "grad_norm": 0.37890625, |
| "learning_rate": 4.480098449467132e-06, |
| "loss": 1.1156, |
| "mean_token_accuracy": 0.7186054766178132, |
| "step": 3340 |
| }, |
| { |
| "epoch": 1.1951480556546556, |
| "grad_norm": 0.392578125, |
| "learning_rate": 4.4526980357279294e-06, |
| "loss": 1.0783, |
| "mean_token_accuracy": 0.725648021697998, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.1987156617909382, |
| "grad_norm": 0.392578125, |
| "learning_rate": 4.425314247535668e-06, |
| "loss": 1.1226, |
| "mean_token_accuracy": 0.7171672821044922, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.2022832679272208, |
| "grad_norm": 0.416015625, |
| "learning_rate": 4.397947916735448e-06, |
| "loss": 1.1166, |
| "mean_token_accuracy": 0.7191376656293869, |
| "step": 3370 |
| }, |
| { |
| "epoch": 1.2058508740635034, |
| "grad_norm": 0.376953125, |
| "learning_rate": 4.3705998746420555e-06, |
| "loss": 1.1059, |
| "mean_token_accuracy": 0.720007112622261, |
| "step": 3380 |
| }, |
| { |
| "epoch": 1.209418480199786, |
| "grad_norm": 0.408203125, |
| "learning_rate": 4.343270952014721e-06, |
| "loss": 1.1415, |
| "mean_token_accuracy": 0.7136272579431534, |
| "step": 3390 |
| }, |
| { |
| "epoch": 1.2129860863360684, |
| "grad_norm": 0.388671875, |
| "learning_rate": 4.315961979031875e-06, |
| "loss": 1.0703, |
| "mean_token_accuracy": 0.7283745348453522, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.2129860863360684, |
| "eval_loss": 1.1033363342285156, |
| "eval_mean_token_accuracy": 0.719832913333146, |
| "eval_runtime": 129.8299, |
| "eval_samples_per_second": 7.279, |
| "eval_steps_per_second": 7.279, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.216553692472351, |
| "grad_norm": 0.43359375, |
| "learning_rate": 4.288673785265933e-06, |
| "loss": 1.1456, |
| "mean_token_accuracy": 0.7134081184864044, |
| "step": 3410 |
| }, |
| { |
| "epoch": 1.2201212986086336, |
| "grad_norm": 0.408203125, |
| "learning_rate": 4.261407199658093e-06, |
| "loss": 1.0958, |
| "mean_token_accuracy": 0.7243350833654404, |
| "step": 3420 |
| }, |
| { |
| "epoch": 1.2236889047449162, |
| "grad_norm": 0.41015625, |
| "learning_rate": 4.234163050493158e-06, |
| "loss": 1.1189, |
| "mean_token_accuracy": 0.7177170753479004, |
| "step": 3430 |
| }, |
| { |
| "epoch": 1.2272565108811988, |
| "grad_norm": 0.41015625, |
| "learning_rate": 4.206942165374371e-06, |
| "loss": 1.076, |
| "mean_token_accuracy": 0.7244078129529953, |
| "step": 3440 |
| }, |
| { |
| "epoch": 1.2308241170174812, |
| "grad_norm": 0.41796875, |
| "learning_rate": 4.179745371198276e-06, |
| "loss": 1.0819, |
| "mean_token_accuracy": 0.7244366586208344, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.2343917231537638, |
| "grad_norm": 0.43359375, |
| "learning_rate": 4.1525734941296024e-06, |
| "loss": 1.1043, |
| "mean_token_accuracy": 0.719652670621872, |
| "step": 3460 |
| }, |
| { |
| "epoch": 1.2379593292900464, |
| "grad_norm": 0.40234375, |
| "learning_rate": 4.125427359576162e-06, |
| "loss": 1.0647, |
| "mean_token_accuracy": 0.7277228325605393, |
| "step": 3470 |
| }, |
| { |
| "epoch": 1.241526935426329, |
| "grad_norm": 0.37890625, |
| "learning_rate": 4.098307792163781e-06, |
| "loss": 1.0939, |
| "mean_token_accuracy": 0.7211065292358398, |
| "step": 3480 |
| }, |
| { |
| "epoch": 1.2450945415626116, |
| "grad_norm": 0.408203125, |
| "learning_rate": 4.071215615711251e-06, |
| "loss": 1.1171, |
| "mean_token_accuracy": 0.717405378818512, |
| "step": 3490 |
| }, |
| { |
| "epoch": 1.248662147698894, |
| "grad_norm": 0.400390625, |
| "learning_rate": 4.044151653205292e-06, |
| "loss": 1.0871, |
| "mean_token_accuracy": 0.7227793008089065, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.2522297538351765, |
| "grad_norm": 0.412109375, |
| "learning_rate": 4.0171167267755696e-06, |
| "loss": 1.0915, |
| "mean_token_accuracy": 0.7235286891460418, |
| "step": 3510 |
| }, |
| { |
| "epoch": 1.2557973599714591, |
| "grad_norm": 0.42578125, |
| "learning_rate": 3.990111657669709e-06, |
| "loss": 1.0939, |
| "mean_token_accuracy": 0.7201143652200699, |
| "step": 3520 |
| }, |
| { |
| "epoch": 1.2593649661077417, |
| "grad_norm": 0.404296875, |
| "learning_rate": 3.963137266228349e-06, |
| "loss": 1.0854, |
| "mean_token_accuracy": 0.7254441410303116, |
| "step": 3530 |
| }, |
| { |
| "epoch": 1.2629325722440243, |
| "grad_norm": 0.40625, |
| "learning_rate": 3.93619437186023e-06, |
| "loss": 1.1012, |
| "mean_token_accuracy": 0.7225957334041595, |
| "step": 3540 |
| }, |
| { |
| "epoch": 1.2665001783803067, |
| "grad_norm": 0.447265625, |
| "learning_rate": 3.909283793017289e-06, |
| "loss": 1.1192, |
| "mean_token_accuracy": 0.716712012887001, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.2700677845165893, |
| "grad_norm": 0.39453125, |
| "learning_rate": 3.88240634716981e-06, |
| "loss": 1.0832, |
| "mean_token_accuracy": 0.7240831553936005, |
| "step": 3560 |
| }, |
| { |
| "epoch": 1.273635390652872, |
| "grad_norm": 0.419921875, |
| "learning_rate": 3.855562850781589e-06, |
| "loss": 1.1122, |
| "mean_token_accuracy": 0.7198260009288788, |
| "step": 3570 |
| }, |
| { |
| "epoch": 1.2772029967891545, |
| "grad_norm": 0.431640625, |
| "learning_rate": 3.828754119285123e-06, |
| "loss": 1.1111, |
| "mean_token_accuracy": 0.7183409601449966, |
| "step": 3580 |
| }, |
| { |
| "epoch": 1.280770602925437, |
| "grad_norm": 0.400390625, |
| "learning_rate": 3.801980967056851e-06, |
| "loss": 1.1049, |
| "mean_token_accuracy": 0.7193833947181701, |
| "step": 3590 |
| }, |
| { |
| "epoch": 1.2843382090617195, |
| "grad_norm": 0.41015625, |
| "learning_rate": 3.77524420739241e-06, |
| "loss": 1.0567, |
| "mean_token_accuracy": 0.7300051569938659, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.2843382090617195, |
| "eval_loss": 1.1031303405761719, |
| "eval_mean_token_accuracy": 0.7198326473513609, |
| "eval_runtime": 129.7756, |
| "eval_samples_per_second": 7.282, |
| "eval_steps_per_second": 7.282, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.287905815198002, |
| "grad_norm": 0.416015625, |
| "learning_rate": 3.748544652481927e-06, |
| "loss": 1.0862, |
| "mean_token_accuracy": 0.723472598195076, |
| "step": 3610 |
| }, |
| { |
| "epoch": 1.2914734213342847, |
| "grad_norm": 0.44140625, |
| "learning_rate": 3.721883113385353e-06, |
| "loss": 1.1064, |
| "mean_token_accuracy": 0.7193070739507675, |
| "step": 3620 |
| }, |
| { |
| "epoch": 1.2950410274705673, |
| "grad_norm": 0.396484375, |
| "learning_rate": 3.6952604000078197e-06, |
| "loss": 1.1207, |
| "mean_token_accuracy": 0.7172163218259812, |
| "step": 3630 |
| }, |
| { |
| "epoch": 1.2986086336068499, |
| "grad_norm": 0.3984375, |
| "learning_rate": 3.6686773210750386e-06, |
| "loss": 1.1333, |
| "mean_token_accuracy": 0.7135063827037811, |
| "step": 3640 |
| }, |
| { |
| "epoch": 1.3021762397431322, |
| "grad_norm": 0.41796875, |
| "learning_rate": 3.642134684108737e-06, |
| "loss": 1.0796, |
| "mean_token_accuracy": 0.7254454314708709, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.3057438458794148, |
| "grad_norm": 0.41796875, |
| "learning_rate": 3.6156332954021233e-06, |
| "loss": 1.1124, |
| "mean_token_accuracy": 0.7201877295970917, |
| "step": 3660 |
| }, |
| { |
| "epoch": 1.3093114520156974, |
| "grad_norm": 0.4296875, |
| "learning_rate": 3.5891739599953945e-06, |
| "loss": 1.0879, |
| "mean_token_accuracy": 0.7234185546636581, |
| "step": 3670 |
| }, |
| { |
| "epoch": 1.31287905815198, |
| "grad_norm": 0.388671875, |
| "learning_rate": 3.562757481651285e-06, |
| "loss": 1.1103, |
| "mean_token_accuracy": 0.7212854534387588, |
| "step": 3680 |
| }, |
| { |
| "epoch": 1.3164466642882626, |
| "grad_norm": 0.396484375, |
| "learning_rate": 3.5363846628306486e-06, |
| "loss": 1.1443, |
| "mean_token_accuracy": 0.7132387101650238, |
| "step": 3690 |
| }, |
| { |
| "epoch": 1.320014270424545, |
| "grad_norm": 0.41796875, |
| "learning_rate": 3.510056304668077e-06, |
| "loss": 1.1172, |
| "mean_token_accuracy": 0.7189575970172882, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.3235818765608278, |
| "grad_norm": 0.3984375, |
| "learning_rate": 3.483773206947572e-06, |
| "loss": 1.1179, |
| "mean_token_accuracy": 0.7171127736568451, |
| "step": 3710 |
| }, |
| { |
| "epoch": 1.3271494826971102, |
| "grad_norm": 0.396484375, |
| "learning_rate": 3.457536168078247e-06, |
| "loss": 1.078, |
| "mean_token_accuracy": 0.7240683853626251, |
| "step": 3720 |
| }, |
| { |
| "epoch": 1.3307170888333928, |
| "grad_norm": 0.400390625, |
| "learning_rate": 3.4313459850700678e-06, |
| "loss": 1.0843, |
| "mean_token_accuracy": 0.7244907855987549, |
| "step": 3730 |
| }, |
| { |
| "epoch": 1.3342846949696754, |
| "grad_norm": 0.384765625, |
| "learning_rate": 3.40520345350965e-06, |
| "loss": 1.1142, |
| "mean_token_accuracy": 0.7188033819198608, |
| "step": 3740 |
| }, |
| { |
| "epoch": 1.3378523011059578, |
| "grad_norm": 0.400390625, |
| "learning_rate": 3.3791093675360886e-06, |
| "loss": 1.1201, |
| "mean_token_accuracy": 0.7192677736282349, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.3414199072422406, |
| "grad_norm": 0.40625, |
| "learning_rate": 3.3530645198168293e-06, |
| "loss": 1.107, |
| "mean_token_accuracy": 0.7217560350894928, |
| "step": 3760 |
| }, |
| { |
| "epoch": 1.344987513378523, |
| "grad_norm": 0.400390625, |
| "learning_rate": 3.3270697015235955e-06, |
| "loss": 1.1118, |
| "mean_token_accuracy": 0.7190796703100204, |
| "step": 3770 |
| }, |
| { |
| "epoch": 1.3485551195148056, |
| "grad_norm": 0.396484375, |
| "learning_rate": 3.3011257023083525e-06, |
| "loss": 1.1232, |
| "mean_token_accuracy": 0.7180998176336288, |
| "step": 3780 |
| }, |
| { |
| "epoch": 1.3521227256510882, |
| "grad_norm": 0.400390625, |
| "learning_rate": 3.2752333102793217e-06, |
| "loss": 1.132, |
| "mean_token_accuracy": 0.7163138449192047, |
| "step": 3790 |
| }, |
| { |
| "epoch": 1.3556903317873708, |
| "grad_norm": 0.3828125, |
| "learning_rate": 3.249393311977037e-06, |
| "loss": 1.0722, |
| "mean_token_accuracy": 0.7268070787191391, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.3556903317873708, |
| "eval_loss": 1.1029784679412842, |
| "eval_mean_token_accuracy": 0.7198866987985278, |
| "eval_runtime": 129.6054, |
| "eval_samples_per_second": 7.291, |
| "eval_steps_per_second": 7.291, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.3592579379236533, |
| "grad_norm": 0.384765625, |
| "learning_rate": 3.2236064923504508e-06, |
| "loss": 1.0568, |
| "mean_token_accuracy": 0.7289782732725143, |
| "step": 3810 |
| }, |
| { |
| "epoch": 1.3628255440599357, |
| "grad_norm": 0.408203125, |
| "learning_rate": 3.197873634733096e-06, |
| "loss": 1.0981, |
| "mean_token_accuracy": 0.7203329116106033, |
| "step": 3820 |
| }, |
| { |
| "epoch": 1.3663931501962183, |
| "grad_norm": 0.412109375, |
| "learning_rate": 3.1721955208192843e-06, |
| "loss": 1.1181, |
| "mean_token_accuracy": 0.7192750304937363, |
| "step": 3830 |
| }, |
| { |
| "epoch": 1.369960756332501, |
| "grad_norm": 0.439453125, |
| "learning_rate": 3.146572930640362e-06, |
| "loss": 1.1537, |
| "mean_token_accuracy": 0.7088033139705658, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.3735283624687835, |
| "grad_norm": 0.39453125, |
| "learning_rate": 3.1210066425410148e-06, |
| "loss": 1.0876, |
| "mean_token_accuracy": 0.7232223749160767, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.3770959686050661, |
| "grad_norm": 0.390625, |
| "learning_rate": 3.0954974331556264e-06, |
| "loss": 1.0996, |
| "mean_token_accuracy": 0.7225809276103974, |
| "step": 3860 |
| }, |
| { |
| "epoch": 1.3806635747413485, |
| "grad_norm": 0.431640625, |
| "learning_rate": 3.0700460773846817e-06, |
| "loss": 1.1041, |
| "mean_token_accuracy": 0.7193689078092576, |
| "step": 3870 |
| }, |
| { |
| "epoch": 1.384231180877631, |
| "grad_norm": 0.388671875, |
| "learning_rate": 3.0446533483712304e-06, |
| "loss": 1.1126, |
| "mean_token_accuracy": 0.7207145065069198, |
| "step": 3880 |
| }, |
| { |
| "epoch": 1.3877987870139137, |
| "grad_norm": 0.392578125, |
| "learning_rate": 3.019320017477404e-06, |
| "loss": 1.0962, |
| "mean_token_accuracy": 0.7231746077537536, |
| "step": 3890 |
| }, |
| { |
| "epoch": 1.3913663931501963, |
| "grad_norm": 0.42578125, |
| "learning_rate": 2.994046854260974e-06, |
| "loss": 1.133, |
| "mean_token_accuracy": 0.7163553088903427, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.3949339992864789, |
| "grad_norm": 0.388671875, |
| "learning_rate": 2.968834626451987e-06, |
| "loss": 1.1341, |
| "mean_token_accuracy": 0.7145072937011718, |
| "step": 3910 |
| }, |
| { |
| "epoch": 1.3985016054227613, |
| "grad_norm": 0.40625, |
| "learning_rate": 2.943684099929436e-06, |
| "loss": 1.1396, |
| "mean_token_accuracy": 0.7137275010347366, |
| "step": 3920 |
| }, |
| { |
| "epoch": 1.4020692115590438, |
| "grad_norm": 0.390625, |
| "learning_rate": 2.918596038697995e-06, |
| "loss": 1.0991, |
| "mean_token_accuracy": 0.7207221776247025, |
| "step": 3930 |
| }, |
| { |
| "epoch": 1.4056368176953264, |
| "grad_norm": 0.43359375, |
| "learning_rate": 2.893571204864811e-06, |
| "loss": 1.0946, |
| "mean_token_accuracy": 0.7236782640218735, |
| "step": 3940 |
| }, |
| { |
| "epoch": 1.409204423831609, |
| "grad_norm": 0.427734375, |
| "learning_rate": 2.8686103586163626e-06, |
| "loss": 1.073, |
| "mean_token_accuracy": 0.7256010293960571, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.4127720299678916, |
| "grad_norm": 0.396484375, |
| "learning_rate": 2.843714258195346e-06, |
| "loss": 1.0855, |
| "mean_token_accuracy": 0.7233177602291108, |
| "step": 3960 |
| }, |
| { |
| "epoch": 1.416339636104174, |
| "grad_norm": 0.44921875, |
| "learning_rate": 2.8188836598776662e-06, |
| "loss": 1.1262, |
| "mean_token_accuracy": 0.7158330619335175, |
| "step": 3970 |
| }, |
| { |
| "epoch": 1.4199072422404566, |
| "grad_norm": 0.3984375, |
| "learning_rate": 2.7941193179494487e-06, |
| "loss": 1.0975, |
| "mean_token_accuracy": 0.7226381480693818, |
| "step": 3980 |
| }, |
| { |
| "epoch": 1.4234748483767392, |
| "grad_norm": 0.39453125, |
| "learning_rate": 2.7694219846841263e-06, |
| "loss": 1.0941, |
| "mean_token_accuracy": 0.7227201044559479, |
| "step": 3990 |
| }, |
| { |
| "epoch": 1.4270424545130218, |
| "grad_norm": 0.41796875, |
| "learning_rate": 2.7447924103195976e-06, |
| "loss": 1.1058, |
| "mean_token_accuracy": 0.7205296069383621, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.4270424545130218, |
| "eval_loss": 1.1028817892074585, |
| "eval_mean_token_accuracy": 0.719914884605105, |
| "eval_runtime": 129.7381, |
| "eval_samples_per_second": 7.284, |
| "eval_steps_per_second": 7.284, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.4306100606493044, |
| "grad_norm": 0.408203125, |
| "learning_rate": 2.7202313430354256e-06, |
| "loss": 1.0807, |
| "mean_token_accuracy": 0.7271443784236908, |
| "step": 4010 |
| }, |
| { |
| "epoch": 1.4341776667855868, |
| "grad_norm": 0.42578125, |
| "learning_rate": 2.6957395289301113e-06, |
| "loss": 1.111, |
| "mean_token_accuracy": 0.7193177580833435, |
| "step": 4020 |
| }, |
| { |
| "epoch": 1.4377452729218694, |
| "grad_norm": 0.416015625, |
| "learning_rate": 2.6713177119984402e-06, |
| "loss": 1.1057, |
| "mean_token_accuracy": 0.7203785151243209, |
| "step": 4030 |
| }, |
| { |
| "epoch": 1.441312879058152, |
| "grad_norm": 0.388671875, |
| "learning_rate": 2.646966634108868e-06, |
| "loss": 1.0787, |
| "mean_token_accuracy": 0.7256379693746566, |
| "step": 4040 |
| }, |
| { |
| "epoch": 1.4448804851944346, |
| "grad_norm": 0.431640625, |
| "learning_rate": 2.6226870349809883e-06, |
| "loss": 1.1009, |
| "mean_token_accuracy": 0.7196383833885193, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.4484480913307172, |
| "grad_norm": 0.380859375, |
| "learning_rate": 2.598479652163074e-06, |
| "loss": 1.1047, |
| "mean_token_accuracy": 0.7177596479654312, |
| "step": 4060 |
| }, |
| { |
| "epoch": 1.4520156974669995, |
| "grad_norm": 0.4609375, |
| "learning_rate": 2.5743452210096533e-06, |
| "loss": 1.1265, |
| "mean_token_accuracy": 0.7146128952503205, |
| "step": 4070 |
| }, |
| { |
| "epoch": 1.4555833036032821, |
| "grad_norm": 0.4453125, |
| "learning_rate": 2.5502844746591803e-06, |
| "loss": 1.1007, |
| "mean_token_accuracy": 0.7225989043712616, |
| "step": 4080 |
| }, |
| { |
| "epoch": 1.4591509097395647, |
| "grad_norm": 0.4296875, |
| "learning_rate": 2.526298144011775e-06, |
| "loss": 1.0782, |
| "mean_token_accuracy": 0.7250162631273269, |
| "step": 4090 |
| }, |
| { |
| "epoch": 1.4627185158758473, |
| "grad_norm": 0.416015625, |
| "learning_rate": 2.5023869577070014e-06, |
| "loss": 1.0739, |
| "mean_token_accuracy": 0.7271322458982468, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.46628612201213, |
| "grad_norm": 0.439453125, |
| "learning_rate": 2.478551642101743e-06, |
| "loss": 1.1167, |
| "mean_token_accuracy": 0.7192445009946823, |
| "step": 4110 |
| }, |
| { |
| "epoch": 1.4698537281484123, |
| "grad_norm": 0.419921875, |
| "learning_rate": 2.4547929212481436e-06, |
| "loss": 1.0806, |
| "mean_token_accuracy": 0.7259793847799301, |
| "step": 4120 |
| }, |
| { |
| "epoch": 1.473421334284695, |
| "grad_norm": 0.421875, |
| "learning_rate": 2.4311115168716017e-06, |
| "loss": 1.1226, |
| "mean_token_accuracy": 0.7186828255653381, |
| "step": 4130 |
| }, |
| { |
| "epoch": 1.4769889404209775, |
| "grad_norm": 0.408203125, |
| "learning_rate": 2.4075081483488494e-06, |
| "loss": 1.1148, |
| "mean_token_accuracy": 0.718857717514038, |
| "step": 4140 |
| }, |
| { |
| "epoch": 1.48055654655726, |
| "grad_norm": 0.41015625, |
| "learning_rate": 2.3839835326861106e-06, |
| "loss": 1.1094, |
| "mean_token_accuracy": 0.7207616001367569, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.4841241526935427, |
| "grad_norm": 0.404296875, |
| "learning_rate": 2.360538384497297e-06, |
| "loss": 1.1264, |
| "mean_token_accuracy": 0.7167964816093445, |
| "step": 4160 |
| }, |
| { |
| "epoch": 1.487691758829825, |
| "grad_norm": 0.40625, |
| "learning_rate": 2.3371734159823283e-06, |
| "loss": 1.1273, |
| "mean_token_accuracy": 0.7168554663658142, |
| "step": 4170 |
| }, |
| { |
| "epoch": 1.4912593649661077, |
| "grad_norm": 0.40625, |
| "learning_rate": 2.3138893369054764e-06, |
| "loss": 1.1307, |
| "mean_token_accuracy": 0.716307619214058, |
| "step": 4180 |
| }, |
| { |
| "epoch": 1.4948269711023903, |
| "grad_norm": 0.390625, |
| "learning_rate": 2.2906868545738105e-06, |
| "loss": 1.1064, |
| "mean_token_accuracy": 0.7192932486534118, |
| "step": 4190 |
| }, |
| { |
| "epoch": 1.4983945772386729, |
| "grad_norm": 0.431640625, |
| "learning_rate": 2.267566673815719e-06, |
| "loss": 1.1028, |
| "mean_token_accuracy": 0.7194180607795715, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.4983945772386729, |
| "eval_loss": 1.1028162240982056, |
| "eval_mean_token_accuracy": 0.7199414636092211, |
| "eval_runtime": 129.9105, |
| "eval_samples_per_second": 7.274, |
| "eval_steps_per_second": 7.274, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.5019621833749555, |
| "grad_norm": 0.41796875, |
| "learning_rate": 2.2445294969594843e-06, |
| "loss": 1.142, |
| "mean_token_accuracy": 0.7116603493690491, |
| "step": 4210 |
| }, |
| { |
| "epoch": 1.5055297895112378, |
| "grad_norm": 0.388671875, |
| "learning_rate": 2.22157602381196e-06, |
| "loss": 1.137, |
| "mean_token_accuracy": 0.7143406510353089, |
| "step": 4220 |
| }, |
| { |
| "epoch": 1.5090973956475207, |
| "grad_norm": 0.41796875, |
| "learning_rate": 2.19870695163731e-06, |
| "loss": 1.1154, |
| "mean_token_accuracy": 0.7157580763101578, |
| "step": 4230 |
| }, |
| { |
| "epoch": 1.512665001783803, |
| "grad_norm": 0.419921875, |
| "learning_rate": 2.175922975135822e-06, |
| "loss": 1.12, |
| "mean_token_accuracy": 0.7179748564958572, |
| "step": 4240 |
| }, |
| { |
| "epoch": 1.5162326079200856, |
| "grad_norm": 0.4140625, |
| "learning_rate": 2.1532247864228083e-06, |
| "loss": 1.0879, |
| "mean_token_accuracy": 0.724085783958435, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.5198002140563682, |
| "grad_norm": 0.3984375, |
| "learning_rate": 2.1306130750075865e-06, |
| "loss": 1.0788, |
| "mean_token_accuracy": 0.7249435156583786, |
| "step": 4260 |
| }, |
| { |
| "epoch": 1.5233678201926506, |
| "grad_norm": 0.416015625, |
| "learning_rate": 2.108088527772524e-06, |
| "loss": 1.0919, |
| "mean_token_accuracy": 0.7243706405162811, |
| "step": 4270 |
| }, |
| { |
| "epoch": 1.5269354263289334, |
| "grad_norm": 0.37890625, |
| "learning_rate": 2.0856518289521747e-06, |
| "loss": 1.1294, |
| "mean_token_accuracy": 0.7154888361692429, |
| "step": 4280 |
| }, |
| { |
| "epoch": 1.5305030324652158, |
| "grad_norm": 0.40625, |
| "learning_rate": 2.0633036601125062e-06, |
| "loss": 1.1221, |
| "mean_token_accuracy": 0.7178556174039841, |
| "step": 4290 |
| }, |
| { |
| "epoch": 1.5340706386014984, |
| "grad_norm": 0.3984375, |
| "learning_rate": 2.0410447001301754e-06, |
| "loss": 1.1133, |
| "mean_token_accuracy": 0.7192623257637024, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.537638244737781, |
| "grad_norm": 0.396484375, |
| "learning_rate": 2.0188756251719204e-06, |
| "loss": 1.0671, |
| "mean_token_accuracy": 0.7278983563184738, |
| "step": 4310 |
| }, |
| { |
| "epoch": 1.5412058508740634, |
| "grad_norm": 0.40234375, |
| "learning_rate": 1.9967971086740195e-06, |
| "loss": 1.1107, |
| "mean_token_accuracy": 0.7185012221336364, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.5447734570103462, |
| "grad_norm": 0.4140625, |
| "learning_rate": 1.974809821321827e-06, |
| "loss": 1.1444, |
| "mean_token_accuracy": 0.7138689398765564, |
| "step": 4330 |
| }, |
| { |
| "epoch": 1.5483410631466286, |
| "grad_norm": 0.408203125, |
| "learning_rate": 1.9529144310294025e-06, |
| "loss": 1.1183, |
| "mean_token_accuracy": 0.7177895784378052, |
| "step": 4340 |
| }, |
| { |
| "epoch": 1.5519086692829112, |
| "grad_norm": 0.392578125, |
| "learning_rate": 1.9311116029192277e-06, |
| "loss": 1.1121, |
| "mean_token_accuracy": 0.7182875365018845, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.5554762754191938, |
| "grad_norm": 0.392578125, |
| "learning_rate": 1.909401999301993e-06, |
| "loss": 1.1103, |
| "mean_token_accuracy": 0.7180555552244187, |
| "step": 4360 |
| }, |
| { |
| "epoch": 1.5590438815554761, |
| "grad_norm": 0.40625, |
| "learning_rate": 1.8877862796564821e-06, |
| "loss": 1.1366, |
| "mean_token_accuracy": 0.7148736745119095, |
| "step": 4370 |
| }, |
| { |
| "epoch": 1.562611487691759, |
| "grad_norm": 0.4140625, |
| "learning_rate": 1.866265100609539e-06, |
| "loss": 1.0949, |
| "mean_token_accuracy": 0.7195869028568268, |
| "step": 4380 |
| }, |
| { |
| "epoch": 1.5661790938280413, |
| "grad_norm": 0.3984375, |
| "learning_rate": 1.8448391159161206e-06, |
| "loss": 1.0945, |
| "mean_token_accuracy": 0.7243360817432404, |
| "step": 4390 |
| }, |
| { |
| "epoch": 1.569746699964324, |
| "grad_norm": 0.40234375, |
| "learning_rate": 1.8235089764394408e-06, |
| "loss": 1.1096, |
| "mean_token_accuracy": 0.7223778516054153, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.569746699964324, |
| "eval_loss": 1.1027787923812866, |
| "eval_mean_token_accuracy": 0.7199580997386307, |
| "eval_runtime": 130.0033, |
| "eval_samples_per_second": 7.269, |
| "eval_steps_per_second": 7.269, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.5733143061006065, |
| "grad_norm": 0.431640625, |
| "learning_rate": 1.8022753301311935e-06, |
| "loss": 1.057, |
| "mean_token_accuracy": 0.7320741713047028, |
| "step": 4410 |
| }, |
| { |
| "epoch": 1.576881912236889, |
| "grad_norm": 0.3984375, |
| "learning_rate": 1.7811388220118707e-06, |
| "loss": 1.0779, |
| "mean_token_accuracy": 0.727427190542221, |
| "step": 4420 |
| }, |
| { |
| "epoch": 1.5804495183731717, |
| "grad_norm": 0.404296875, |
| "learning_rate": 1.760100094151176e-06, |
| "loss": 1.1037, |
| "mean_token_accuracy": 0.7183500498533248, |
| "step": 4430 |
| }, |
| { |
| "epoch": 1.584017124509454, |
| "grad_norm": 0.412109375, |
| "learning_rate": 1.7391597856485083e-06, |
| "loss": 1.0871, |
| "mean_token_accuracy": 0.7250396728515625, |
| "step": 4440 |
| }, |
| { |
| "epoch": 1.5875847306457367, |
| "grad_norm": 0.3671875, |
| "learning_rate": 1.7183185326135543e-06, |
| "loss": 1.0795, |
| "mean_token_accuracy": 0.7259785860776902, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.5911523367820193, |
| "grad_norm": 0.41796875, |
| "learning_rate": 1.6975769681469705e-06, |
| "loss": 1.0992, |
| "mean_token_accuracy": 0.7215543150901794, |
| "step": 4460 |
| }, |
| { |
| "epoch": 1.5947199429183017, |
| "grad_norm": 0.408203125, |
| "learning_rate": 1.6769357223211392e-06, |
| "loss": 1.119, |
| "mean_token_accuracy": 0.7176971554756164, |
| "step": 4470 |
| }, |
| { |
| "epoch": 1.5982875490545845, |
| "grad_norm": 0.396484375, |
| "learning_rate": 1.6563954221610356e-06, |
| "loss": 1.1187, |
| "mean_token_accuracy": 0.7163354724645614, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.6018551551908669, |
| "grad_norm": 0.408203125, |
| "learning_rate": 1.6359566916251846e-06, |
| "loss": 1.1075, |
| "mean_token_accuracy": 0.7200702011585236, |
| "step": 4490 |
| }, |
| { |
| "epoch": 1.6054227613271495, |
| "grad_norm": 0.39453125, |
| "learning_rate": 1.6156201515866971e-06, |
| "loss": 1.1027, |
| "mean_token_accuracy": 0.7201297283172607, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.608990367463432, |
| "grad_norm": 0.400390625, |
| "learning_rate": 1.5953864198144137e-06, |
| "loss": 1.1081, |
| "mean_token_accuracy": 0.7206196546554565, |
| "step": 4510 |
| }, |
| { |
| "epoch": 1.6125579735997146, |
| "grad_norm": 0.40234375, |
| "learning_rate": 1.5752561109541447e-06, |
| "loss": 1.0819, |
| "mean_token_accuracy": 0.7242395669221878, |
| "step": 4520 |
| }, |
| { |
| "epoch": 1.6161255797359972, |
| "grad_norm": 0.4140625, |
| "learning_rate": 1.5552298365099883e-06, |
| "loss": 1.0762, |
| "mean_token_accuracy": 0.7243531912565231, |
| "step": 4530 |
| }, |
| { |
| "epoch": 1.6196931858722796, |
| "grad_norm": 0.421875, |
| "learning_rate": 1.5353082048257596e-06, |
| "loss": 1.1185, |
| "mean_token_accuracy": 0.7174435228109359, |
| "step": 4540 |
| }, |
| { |
| "epoch": 1.6232607920085622, |
| "grad_norm": 0.396484375, |
| "learning_rate": 1.5154918210665148e-06, |
| "loss": 1.0798, |
| "mean_token_accuracy": 0.7255944669246673, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.6268283981448448, |
| "grad_norm": 0.39453125, |
| "learning_rate": 1.4957812872001614e-06, |
| "loss": 1.0723, |
| "mean_token_accuracy": 0.7288858354091644, |
| "step": 4560 |
| }, |
| { |
| "epoch": 1.6303960042811274, |
| "grad_norm": 0.3984375, |
| "learning_rate": 1.4761772019791749e-06, |
| "loss": 1.1132, |
| "mean_token_accuracy": 0.7194352895021439, |
| "step": 4570 |
| }, |
| { |
| "epoch": 1.63396361041741, |
| "grad_norm": 0.458984375, |
| "learning_rate": 1.4566801609224096e-06, |
| "loss": 1.1258, |
| "mean_token_accuracy": 0.7161971390247345, |
| "step": 4580 |
| }, |
| { |
| "epoch": 1.6375312165536924, |
| "grad_norm": 0.416015625, |
| "learning_rate": 1.4372907562970078e-06, |
| "loss": 1.0896, |
| "mean_token_accuracy": 0.7243341475725174, |
| "step": 4590 |
| }, |
| { |
| "epoch": 1.641098822689975, |
| "grad_norm": 0.390625, |
| "learning_rate": 1.4180095771004155e-06, |
| "loss": 1.1161, |
| "mean_token_accuracy": 0.7176478147506714, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.641098822689975, |
| "eval_loss": 1.1027382612228394, |
| "eval_mean_token_accuracy": 0.7199298966498602, |
| "eval_runtime": 129.3274, |
| "eval_samples_per_second": 7.307, |
| "eval_steps_per_second": 7.307, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.6446664288262576, |
| "grad_norm": 0.39453125, |
| "learning_rate": 1.3988372090424773e-06, |
| "loss": 1.1084, |
| "mean_token_accuracy": 0.7220268547534943, |
| "step": 4610 |
| }, |
| { |
| "epoch": 1.6482340349625402, |
| "grad_norm": 0.40625, |
| "learning_rate": 1.3797742345276522e-06, |
| "loss": 1.0884, |
| "mean_token_accuracy": 0.7240757316350936, |
| "step": 4620 |
| }, |
| { |
| "epoch": 1.6518016410988228, |
| "grad_norm": 0.408203125, |
| "learning_rate": 1.3608212326373248e-06, |
| "loss": 1.1107, |
| "mean_token_accuracy": 0.7184859544038773, |
| "step": 4630 |
| }, |
| { |
| "epoch": 1.6553692472351051, |
| "grad_norm": 0.41015625, |
| "learning_rate": 1.3419787791122063e-06, |
| "loss": 1.0812, |
| "mean_token_accuracy": 0.7257325768470764, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.658936853371388, |
| "grad_norm": 0.384765625, |
| "learning_rate": 1.3232474463348472e-06, |
| "loss": 1.0959, |
| "mean_token_accuracy": 0.7212332129478455, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.6625044595076703, |
| "grad_norm": 0.390625, |
| "learning_rate": 1.3046278033122578e-06, |
| "loss": 1.0898, |
| "mean_token_accuracy": 0.7245996713638305, |
| "step": 4660 |
| }, |
| { |
| "epoch": 1.666072065643953, |
| "grad_norm": 0.41015625, |
| "learning_rate": 1.286120415658611e-06, |
| "loss": 1.0961, |
| "mean_token_accuracy": 0.7228555232286453, |
| "step": 4670 |
| }, |
| { |
| "epoch": 1.6696396717802355, |
| "grad_norm": 0.42578125, |
| "learning_rate": 1.2677258455780682e-06, |
| "loss": 1.1533, |
| "mean_token_accuracy": 0.7131746053695679, |
| "step": 4680 |
| }, |
| { |
| "epoch": 1.673207277916518, |
| "grad_norm": 0.390625, |
| "learning_rate": 1.2494446518477022e-06, |
| "loss": 1.1423, |
| "mean_token_accuracy": 0.7150006622076035, |
| "step": 4690 |
| }, |
| { |
| "epoch": 1.6767748840528007, |
| "grad_norm": 0.3984375, |
| "learning_rate": 1.2312773898005175e-06, |
| "loss": 1.1138, |
| "mean_token_accuracy": 0.7159749954938889, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.680342490189083, |
| "grad_norm": 0.3984375, |
| "learning_rate": 1.2132246113085823e-06, |
| "loss": 1.1093, |
| "mean_token_accuracy": 0.7196764290332794, |
| "step": 4710 |
| }, |
| { |
| "epoch": 1.6839100963253657, |
| "grad_norm": 0.416015625, |
| "learning_rate": 1.1952868647662696e-06, |
| "loss": 1.1136, |
| "mean_token_accuracy": 0.7188949972391129, |
| "step": 4720 |
| }, |
| { |
| "epoch": 1.6874777024616483, |
| "grad_norm": 0.388671875, |
| "learning_rate": 1.1774646950735914e-06, |
| "loss": 1.1072, |
| "mean_token_accuracy": 0.7206517100334168, |
| "step": 4730 |
| }, |
| { |
| "epoch": 1.6910453085979307, |
| "grad_norm": 0.41796875, |
| "learning_rate": 1.1597586436196473e-06, |
| "loss": 1.1105, |
| "mean_token_accuracy": 0.7209351748228073, |
| "step": 4740 |
| }, |
| { |
| "epoch": 1.6946129147342135, |
| "grad_norm": 0.3828125, |
| "learning_rate": 1.1421692482661855e-06, |
| "loss": 1.1187, |
| "mean_token_accuracy": 0.7177319884300232, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.6981805208704959, |
| "grad_norm": 0.39453125, |
| "learning_rate": 1.124697043331256e-06, |
| "loss": 1.0603, |
| "mean_token_accuracy": 0.7291674822568893, |
| "step": 4760 |
| }, |
| { |
| "epoch": 1.7017481270067785, |
| "grad_norm": 0.39453125, |
| "learning_rate": 1.107342559572977e-06, |
| "loss": 1.0875, |
| "mean_token_accuracy": 0.7248851746320725, |
| "step": 4770 |
| }, |
| { |
| "epoch": 1.705315733143061, |
| "grad_norm": 0.400390625, |
| "learning_rate": 1.0901063241734262e-06, |
| "loss": 1.0983, |
| "mean_token_accuracy": 0.7222309172153473, |
| "step": 4780 |
| }, |
| { |
| "epoch": 1.7088833392793434, |
| "grad_norm": 0.3828125, |
| "learning_rate": 1.0729888607226114e-06, |
| "loss": 1.0718, |
| "mean_token_accuracy": 0.727744197845459, |
| "step": 4790 |
| }, |
| { |
| "epoch": 1.7124509454156263, |
| "grad_norm": 0.404296875, |
| "learning_rate": 1.0559906892025744e-06, |
| "loss": 1.0327, |
| "mean_token_accuracy": 0.734174844622612, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.7124509454156263, |
| "eval_loss": 1.1027302742004395, |
| "eval_mean_token_accuracy": 0.7199333585128582, |
| "eval_runtime": 130.0492, |
| "eval_samples_per_second": 7.266, |
| "eval_steps_per_second": 7.266, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.7160185515519086, |
| "grad_norm": 0.412109375, |
| "learning_rate": 1.0391123259715907e-06, |
| "loss": 1.0924, |
| "mean_token_accuracy": 0.7229822933673858, |
| "step": 4810 |
| }, |
| { |
| "epoch": 1.7195861576881912, |
| "grad_norm": 0.44140625, |
| "learning_rate": 1.022354283748484e-06, |
| "loss": 1.1117, |
| "mean_token_accuracy": 0.7192734986543655, |
| "step": 4820 |
| }, |
| { |
| "epoch": 1.7231537638244738, |
| "grad_norm": 0.4375, |
| "learning_rate": 1.005717071597056e-06, |
| "loss": 1.0888, |
| "mean_token_accuracy": 0.7252772331237793, |
| "step": 4830 |
| }, |
| { |
| "epoch": 1.7267213699607562, |
| "grad_norm": 0.421875, |
| "learning_rate": 9.892011949106173e-07, |
| "loss": 1.119, |
| "mean_token_accuracy": 0.7155101299285889, |
| "step": 4840 |
| }, |
| { |
| "epoch": 1.730288976097039, |
| "grad_norm": 0.40234375, |
| "learning_rate": 9.72807155396634e-07, |
| "loss": 1.1094, |
| "mean_token_accuracy": 0.7175802975893021, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.7338565822333214, |
| "grad_norm": 0.404296875, |
| "learning_rate": 9.56535451061496e-07, |
| "loss": 1.1328, |
| "mean_token_accuracy": 0.7159011840820313, |
| "step": 4860 |
| }, |
| { |
| "epoch": 1.737424188369604, |
| "grad_norm": 0.419921875, |
| "learning_rate": 9.403865761953778e-07, |
| "loss": 1.119, |
| "mean_token_accuracy": 0.7185006946325302, |
| "step": 4870 |
| }, |
| { |
| "epoch": 1.7409917945058866, |
| "grad_norm": 0.390625, |
| "learning_rate": 9.243610213572285e-07, |
| "loss": 1.0974, |
| "mean_token_accuracy": 0.7215916812419891, |
| "step": 4880 |
| }, |
| { |
| "epoch": 1.744559400642169, |
| "grad_norm": 0.412109375, |
| "learning_rate": 9.084592733598735e-07, |
| "loss": 1.1092, |
| "mean_token_accuracy": 0.7196306467056275, |
| "step": 4890 |
| }, |
| { |
| "epoch": 1.7481270067784518, |
| "grad_norm": 0.40625, |
| "learning_rate": 8.926818152552191e-07, |
| "loss": 1.1035, |
| "mean_token_accuracy": 0.7213721007108689, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.7516946129147342, |
| "grad_norm": 0.396484375, |
| "learning_rate": 8.770291263195818e-07, |
| "loss": 1.0939, |
| "mean_token_accuracy": 0.7234235763549804, |
| "step": 4910 |
| }, |
| { |
| "epoch": 1.7552622190510168, |
| "grad_norm": 0.38671875, |
| "learning_rate": 8.615016820391342e-07, |
| "loss": 1.091, |
| "mean_token_accuracy": 0.7227918088436127, |
| "step": 4920 |
| }, |
| { |
| "epoch": 1.7588298251872994, |
| "grad_norm": 0.3984375, |
| "learning_rate": 8.460999540954518e-07, |
| "loss": 1.0923, |
| "mean_token_accuracy": 0.7203070998191834, |
| "step": 4930 |
| }, |
| { |
| "epoch": 1.7623974313235817, |
| "grad_norm": 0.416015625, |
| "learning_rate": 8.308244103511909e-07, |
| "loss": 1.1246, |
| "mean_token_accuracy": 0.7148092061281204, |
| "step": 4940 |
| }, |
| { |
| "epoch": 1.7659650374598646, |
| "grad_norm": 0.41015625, |
| "learning_rate": 8.156755148358763e-07, |
| "loss": 1.1148, |
| "mean_token_accuracy": 0.7176388889551163, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.769532643596147, |
| "grad_norm": 0.37890625, |
| "learning_rate": 8.006537277318011e-07, |
| "loss": 1.0959, |
| "mean_token_accuracy": 0.7230799734592438, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.7731002497324295, |
| "grad_norm": 0.408203125, |
| "learning_rate": 7.857595053600514e-07, |
| "loss": 1.0804, |
| "mean_token_accuracy": 0.7231711894273758, |
| "step": 4970 |
| }, |
| { |
| "epoch": 1.7766678558687121, |
| "grad_norm": 0.39453125, |
| "learning_rate": 7.709933001666431e-07, |
| "loss": 1.0831, |
| "mean_token_accuracy": 0.7241422474384308, |
| "step": 4980 |
| }, |
| { |
| "epoch": 1.7802354620049945, |
| "grad_norm": 0.42578125, |
| "learning_rate": 7.56355560708778e-07, |
| "loss": 1.0837, |
| "mean_token_accuracy": 0.7250366419553756, |
| "step": 4990 |
| }, |
| { |
| "epoch": 1.7838030681412773, |
| "grad_norm": 0.4453125, |
| "learning_rate": 7.41846731641216e-07, |
| "loss": 1.1205, |
| "mean_token_accuracy": 0.7213740587234497, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.7838030681412773, |
| "eval_loss": 1.102725863456726, |
| "eval_mean_token_accuracy": 0.719940029439472, |
| "eval_runtime": 130.069, |
| "eval_samples_per_second": 7.265, |
| "eval_steps_per_second": 7.265, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.7873706742775597, |
| "grad_norm": 0.427734375, |
| "learning_rate": 7.274672537027744e-07, |
| "loss": 1.1027, |
| "mean_token_accuracy": 0.7189759016036987, |
| "step": 5010 |
| }, |
| { |
| "epoch": 1.7909382804138423, |
| "grad_norm": 0.390625, |
| "learning_rate": 7.132175637029292e-07, |
| "loss": 1.1089, |
| "mean_token_accuracy": 0.7223053216934204, |
| "step": 5020 |
| }, |
| { |
| "epoch": 1.7945058865501249, |
| "grad_norm": 0.392578125, |
| "learning_rate": 6.990980945085535e-07, |
| "loss": 1.0885, |
| "mean_token_accuracy": 0.7228141844272613, |
| "step": 5030 |
| }, |
| { |
| "epoch": 1.7980734926864073, |
| "grad_norm": 0.412109375, |
| "learning_rate": 6.851092750307687e-07, |
| "loss": 1.1324, |
| "mean_token_accuracy": 0.7142399311065674, |
| "step": 5040 |
| }, |
| { |
| "epoch": 1.80164109882269, |
| "grad_norm": 0.404296875, |
| "learning_rate": 6.712515302119077e-07, |
| "loss": 1.1329, |
| "mean_token_accuracy": 0.7173153191804886, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.8052087049589725, |
| "grad_norm": 0.3828125, |
| "learning_rate": 6.575252810126143e-07, |
| "loss": 1.1205, |
| "mean_token_accuracy": 0.7187498182058334, |
| "step": 5060 |
| }, |
| { |
| "epoch": 1.808776311095255, |
| "grad_norm": 0.404296875, |
| "learning_rate": 6.439309443990532e-07, |
| "loss": 1.0917, |
| "mean_token_accuracy": 0.7252810209989548, |
| "step": 5070 |
| }, |
| { |
| "epoch": 1.8123439172315376, |
| "grad_norm": 0.40234375, |
| "learning_rate": 6.304689333302416e-07, |
| "loss": 1.1159, |
| "mean_token_accuracy": 0.717368358373642, |
| "step": 5080 |
| }, |
| { |
| "epoch": 1.8159115233678202, |
| "grad_norm": 0.42578125, |
| "learning_rate": 6.171396567455051e-07, |
| "loss": 1.0996, |
| "mean_token_accuracy": 0.7219736486673355, |
| "step": 5090 |
| }, |
| { |
| "epoch": 1.8194791295041028, |
| "grad_norm": 0.388671875, |
| "learning_rate": 6.039435195520604e-07, |
| "loss": 1.0866, |
| "mean_token_accuracy": 0.7217567145824433, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.8230467356403852, |
| "grad_norm": 0.41015625, |
| "learning_rate": 5.908809226127055e-07, |
| "loss": 1.0752, |
| "mean_token_accuracy": 0.7249330550432205, |
| "step": 5110 |
| }, |
| { |
| "epoch": 1.8266143417766678, |
| "grad_norm": 0.3984375, |
| "learning_rate": 5.779522627336536e-07, |
| "loss": 1.1156, |
| "mean_token_accuracy": 0.7156333893537521, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.8301819479129504, |
| "grad_norm": 0.392578125, |
| "learning_rate": 5.651579326524708e-07, |
| "loss": 1.0806, |
| "mean_token_accuracy": 0.7244322270154953, |
| "step": 5130 |
| }, |
| { |
| "epoch": 1.833749554049233, |
| "grad_norm": 0.470703125, |
| "learning_rate": 5.524983210261481e-07, |
| "loss": 1.1064, |
| "mean_token_accuracy": 0.719607749581337, |
| "step": 5140 |
| }, |
| { |
| "epoch": 1.8373171601855156, |
| "grad_norm": 0.412109375, |
| "learning_rate": 5.399738124192988e-07, |
| "loss": 1.0813, |
| "mean_token_accuracy": 0.7249496281147003, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.840884766321798, |
| "grad_norm": 0.41796875, |
| "learning_rate": 5.275847872924716e-07, |
| "loss": 1.1177, |
| "mean_token_accuracy": 0.7161726891994477, |
| "step": 5160 |
| }, |
| { |
| "epoch": 1.8444523724580806, |
| "grad_norm": 0.41015625, |
| "learning_rate": 5.153316219905947e-07, |
| "loss": 1.0884, |
| "mean_token_accuracy": 0.7215735673904419, |
| "step": 5170 |
| }, |
| { |
| "epoch": 1.8480199785943632, |
| "grad_norm": 0.396484375, |
| "learning_rate": 5.032146887315448e-07, |
| "loss": 1.1004, |
| "mean_token_accuracy": 0.7204856693744659, |
| "step": 5180 |
| }, |
| { |
| "epoch": 1.8515875847306458, |
| "grad_norm": 0.421875, |
| "learning_rate": 4.91234355594839e-07, |
| "loss": 1.0845, |
| "mean_token_accuracy": 0.7245700657367706, |
| "step": 5190 |
| }, |
| { |
| "epoch": 1.8551551908669284, |
| "grad_norm": 0.443359375, |
| "learning_rate": 4.793909865104524e-07, |
| "loss": 1.1528, |
| "mean_token_accuracy": 0.7120543330907821, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.8551551908669284, |
| "eval_loss": 1.1027225255966187, |
| "eval_mean_token_accuracy": 0.7199266590138592, |
| "eval_runtime": 129.727, |
| "eval_samples_per_second": 7.285, |
| "eval_steps_per_second": 7.285, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.8587227970032107, |
| "grad_norm": 0.41796875, |
| "learning_rate": 4.67684941247768e-07, |
| "loss": 1.1167, |
| "mean_token_accuracy": 0.7193272441625596, |
| "step": 5210 |
| }, |
| { |
| "epoch": 1.8622904031394936, |
| "grad_norm": 0.43359375, |
| "learning_rate": 4.561165754046404e-07, |
| "loss": 1.06, |
| "mean_token_accuracy": 0.7279850870370865, |
| "step": 5220 |
| }, |
| { |
| "epoch": 1.865858009275776, |
| "grad_norm": 0.40625, |
| "learning_rate": 4.4468624039659847e-07, |
| "loss": 1.073, |
| "mean_token_accuracy": 0.7259253591299057, |
| "step": 5230 |
| }, |
| { |
| "epoch": 1.8694256154120585, |
| "grad_norm": 0.400390625, |
| "learning_rate": 4.333942834461702e-07, |
| "loss": 1.1598, |
| "mean_token_accuracy": 0.7092094957828522, |
| "step": 5240 |
| }, |
| { |
| "epoch": 1.8729932215483411, |
| "grad_norm": 0.41796875, |
| "learning_rate": 4.2224104757233263e-07, |
| "loss": 1.0985, |
| "mean_token_accuracy": 0.7210815221071243, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.8765608276846235, |
| "grad_norm": 0.416015625, |
| "learning_rate": 4.112268715800943e-07, |
| "loss": 1.1095, |
| "mean_token_accuracy": 0.7216496378183365, |
| "step": 5260 |
| }, |
| { |
| "epoch": 1.8801284338209063, |
| "grad_norm": 0.40234375, |
| "learning_rate": 4.003520900502028e-07, |
| "loss": 1.1034, |
| "mean_token_accuracy": 0.720407497882843, |
| "step": 5270 |
| }, |
| { |
| "epoch": 1.8836960399571887, |
| "grad_norm": 0.439453125, |
| "learning_rate": 3.8961703332898003e-07, |
| "loss": 1.1309, |
| "mean_token_accuracy": 0.7161675840616226, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.8872636460934713, |
| "grad_norm": 0.39453125, |
| "learning_rate": 3.7902202751828544e-07, |
| "loss": 1.1306, |
| "mean_token_accuracy": 0.7164841383695603, |
| "step": 5290 |
| }, |
| { |
| "epoch": 1.890831252229754, |
| "grad_norm": 0.416015625, |
| "learning_rate": 3.685673944656176e-07, |
| "loss": 1.0887, |
| "mean_token_accuracy": 0.722822043299675, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.8943988583660363, |
| "grad_norm": 0.41015625, |
| "learning_rate": 3.5825345175432677e-07, |
| "loss": 1.1201, |
| "mean_token_accuracy": 0.7180888473987579, |
| "step": 5310 |
| }, |
| { |
| "epoch": 1.897966464502319, |
| "grad_norm": 0.416015625, |
| "learning_rate": 3.4808051269397513e-07, |
| "loss": 1.1129, |
| "mean_token_accuracy": 0.7190331071615219, |
| "step": 5320 |
| }, |
| { |
| "epoch": 1.9015340706386015, |
| "grad_norm": 0.39453125, |
| "learning_rate": 3.3804888631081834e-07, |
| "loss": 1.1337, |
| "mean_token_accuracy": 0.7162440955638886, |
| "step": 5330 |
| }, |
| { |
| "epoch": 1.905101676774884, |
| "grad_norm": 0.40234375, |
| "learning_rate": 3.281588773384137e-07, |
| "loss": 1.1106, |
| "mean_token_accuracy": 0.7199198633432389, |
| "step": 5340 |
| }, |
| { |
| "epoch": 1.9086692829111667, |
| "grad_norm": 0.42578125, |
| "learning_rate": 3.1841078620836687e-07, |
| "loss": 1.1027, |
| "mean_token_accuracy": 0.7217979252338409, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.912236889047449, |
| "grad_norm": 0.4140625, |
| "learning_rate": 3.0880490904120877e-07, |
| "loss": 1.0784, |
| "mean_token_accuracy": 0.7263754487037659, |
| "step": 5360 |
| }, |
| { |
| "epoch": 1.9158044951837319, |
| "grad_norm": 0.423828125, |
| "learning_rate": 2.99341537637392e-07, |
| "loss": 1.1339, |
| "mean_token_accuracy": 0.7150900274515152, |
| "step": 5370 |
| }, |
| { |
| "epoch": 1.9193721013200142, |
| "grad_norm": 0.390625, |
| "learning_rate": 2.9002095946843276e-07, |
| "loss": 1.0923, |
| "mean_token_accuracy": 0.7226948976516724, |
| "step": 5380 |
| }, |
| { |
| "epoch": 1.9229397074562968, |
| "grad_norm": 0.400390625, |
| "learning_rate": 2.8084345766817676e-07, |
| "loss": 1.1026, |
| "mean_token_accuracy": 0.7200335741043091, |
| "step": 5390 |
| }, |
| { |
| "epoch": 1.9265073135925794, |
| "grad_norm": 0.4140625, |
| "learning_rate": 2.718093110241976e-07, |
| "loss": 1.141, |
| "mean_token_accuracy": 0.7119520723819732, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.9265073135925794, |
| "eval_loss": 1.1027096509933472, |
| "eval_mean_token_accuracy": 0.7199140159541337, |
| "eval_runtime": 130.0788, |
| "eval_samples_per_second": 7.265, |
| "eval_steps_per_second": 7.265, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.9300749197288618, |
| "grad_norm": 0.41796875, |
| "learning_rate": 2.6291879396933004e-07, |
| "loss": 1.1296, |
| "mean_token_accuracy": 0.7158176630735398, |
| "step": 5410 |
| }, |
| { |
| "epoch": 1.9336425258651446, |
| "grad_norm": 0.39453125, |
| "learning_rate": 2.5417217657333184e-07, |
| "loss": 1.118, |
| "mean_token_accuracy": 0.7163922816514969, |
| "step": 5420 |
| }, |
| { |
| "epoch": 1.937210132001427, |
| "grad_norm": 0.396484375, |
| "learning_rate": 2.455697245346783e-07, |
| "loss": 1.0926, |
| "mean_token_accuracy": 0.7246672809123993, |
| "step": 5430 |
| }, |
| { |
| "epoch": 1.9407777381377096, |
| "grad_norm": 0.427734375, |
| "learning_rate": 2.3711169917249533e-07, |
| "loss": 1.1075, |
| "mean_token_accuracy": 0.7199786335229874, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.9443453442739922, |
| "grad_norm": 0.388671875, |
| "learning_rate": 2.287983574186159e-07, |
| "loss": 1.1034, |
| "mean_token_accuracy": 0.7227095514535904, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.9479129504102746, |
| "grad_norm": 0.41796875, |
| "learning_rate": 2.2062995180978043e-07, |
| "loss": 1.1322, |
| "mean_token_accuracy": 0.7170233994722366, |
| "step": 5460 |
| }, |
| { |
| "epoch": 1.9514805565465574, |
| "grad_norm": 0.435546875, |
| "learning_rate": 2.1260673047996227e-07, |
| "loss": 1.1113, |
| "mean_token_accuracy": 0.7198122620582581, |
| "step": 5470 |
| }, |
| { |
| "epoch": 1.9550481626828398, |
| "grad_norm": 0.416015625, |
| "learning_rate": 2.0472893715282993e-07, |
| "loss": 1.0686, |
| "mean_token_accuracy": 0.7249145358800888, |
| "step": 5480 |
| }, |
| { |
| "epoch": 1.9586157688191224, |
| "grad_norm": 0.38671875, |
| "learning_rate": 1.9699681113434398e-07, |
| "loss": 1.1166, |
| "mean_token_accuracy": 0.7198186457157135, |
| "step": 5490 |
| }, |
| { |
| "epoch": 1.962183374955405, |
| "grad_norm": 0.435546875, |
| "learning_rate": 1.8941058730549134e-07, |
| "loss": 1.1484, |
| "mean_token_accuracy": 0.7104803085327148, |
| "step": 5500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 6000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.9844252622378762e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|