| { | |
| "best_metric": 0.8745647668838501, | |
| "best_model_checkpoint": "./beans_outputs/checkpoint-1680", | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 1680, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02976190476190476, | |
| "grad_norm": 3.170905113220215, | |
| "learning_rate": 1.9880952380952384e-05, | |
| "loss": 3.6048, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05952380952380952, | |
| "grad_norm": 2.51796555519104, | |
| "learning_rate": 1.9761904761904763e-05, | |
| "loss": 3.5551, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08928571428571429, | |
| "grad_norm": 2.868872880935669, | |
| "learning_rate": 1.9642857142857145e-05, | |
| "loss": 3.4868, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.11904761904761904, | |
| "grad_norm": 3.2086801528930664, | |
| "learning_rate": 1.9523809523809524e-05, | |
| "loss": 3.4105, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1488095238095238, | |
| "grad_norm": 2.825397253036499, | |
| "learning_rate": 1.9404761904761906e-05, | |
| "loss": 3.3538, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 3.486938238143921, | |
| "learning_rate": 1.928571428571429e-05, | |
| "loss": 3.3201, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.20833333333333334, | |
| "grad_norm": 2.802475929260254, | |
| "learning_rate": 1.916666666666667e-05, | |
| "loss": 3.2432, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 2.789459228515625, | |
| "learning_rate": 1.904761904761905e-05, | |
| "loss": 3.2041, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.26785714285714285, | |
| "grad_norm": 3.008307933807373, | |
| "learning_rate": 1.892857142857143e-05, | |
| "loss": 3.1679, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2976190476190476, | |
| "grad_norm": 2.6487619876861572, | |
| "learning_rate": 1.880952380952381e-05, | |
| "loss": 3.1249, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3273809523809524, | |
| "grad_norm": 2.947179079055786, | |
| "learning_rate": 1.8690476190476193e-05, | |
| "loss": 3.0909, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 3.1243131160736084, | |
| "learning_rate": 1.8571428571428575e-05, | |
| "loss": 3.0953, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3869047619047619, | |
| "grad_norm": 2.9400837421417236, | |
| "learning_rate": 1.8452380952380954e-05, | |
| "loss": 2.9629, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 2.7061338424682617, | |
| "learning_rate": 1.8333333333333333e-05, | |
| "loss": 2.9307, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.44642857142857145, | |
| "grad_norm": 2.6359243392944336, | |
| "learning_rate": 1.8214285714285715e-05, | |
| "loss": 2.8238, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 2.740408420562744, | |
| "learning_rate": 1.8095238095238097e-05, | |
| "loss": 2.8961, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5059523809523809, | |
| "grad_norm": 2.858968496322632, | |
| "learning_rate": 1.797619047619048e-05, | |
| "loss": 2.7505, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 2.7578256130218506, | |
| "learning_rate": 1.785714285714286e-05, | |
| "loss": 2.7989, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5654761904761905, | |
| "grad_norm": 2.9766931533813477, | |
| "learning_rate": 1.7738095238095237e-05, | |
| "loss": 2.6722, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5952380952380952, | |
| "grad_norm": 2.7900352478027344, | |
| "learning_rate": 1.761904761904762e-05, | |
| "loss": 2.7213, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 3.004939556121826, | |
| "learning_rate": 1.7500000000000002e-05, | |
| "loss": 2.7287, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6547619047619048, | |
| "grad_norm": 2.7375917434692383, | |
| "learning_rate": 1.7380952380952384e-05, | |
| "loss": 2.6691, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6845238095238095, | |
| "grad_norm": 3.2530713081359863, | |
| "learning_rate": 1.7261904761904763e-05, | |
| "loss": 2.5742, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 3.0463545322418213, | |
| "learning_rate": 1.7142857142857142e-05, | |
| "loss": 2.4523, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7440476190476191, | |
| "grad_norm": 3.0471720695495605, | |
| "learning_rate": 1.7023809523809524e-05, | |
| "loss": 2.4592, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7738095238095238, | |
| "grad_norm": 3.4415907859802246, | |
| "learning_rate": 1.6904761904761906e-05, | |
| "loss": 2.4316, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8035714285714286, | |
| "grad_norm": 2.830673933029175, | |
| "learning_rate": 1.678571428571429e-05, | |
| "loss": 2.3903, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 3.584303617477417, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 2.4643, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8630952380952381, | |
| "grad_norm": 3.9748589992523193, | |
| "learning_rate": 1.6547619047619046e-05, | |
| "loss": 2.3237, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8928571428571429, | |
| "grad_norm": 2.929922103881836, | |
| "learning_rate": 1.642857142857143e-05, | |
| "loss": 2.2639, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9226190476190477, | |
| "grad_norm": 4.647745132446289, | |
| "learning_rate": 1.630952380952381e-05, | |
| "loss": 2.4637, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 3.6543118953704834, | |
| "learning_rate": 1.6190476190476193e-05, | |
| "loss": 2.2519, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9821428571428571, | |
| "grad_norm": 3.3143322467803955, | |
| "learning_rate": 1.6071428571428572e-05, | |
| "loss": 2.1775, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.7616033755274262, | |
| "eval_loss": 2.1820600032806396, | |
| "eval_runtime": 50.7645, | |
| "eval_samples_per_second": 9.337, | |
| "eval_steps_per_second": 1.182, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.0119047619047619, | |
| "grad_norm": 3.666236639022827, | |
| "learning_rate": 1.5952380952380954e-05, | |
| "loss": 2.1187, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.0416666666666667, | |
| "grad_norm": 3.736830472946167, | |
| "learning_rate": 1.5833333333333333e-05, | |
| "loss": 2.1312, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 3.002455711364746, | |
| "learning_rate": 1.5714285714285715e-05, | |
| "loss": 2.2274, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.1011904761904763, | |
| "grad_norm": 3.2685108184814453, | |
| "learning_rate": 1.5595238095238098e-05, | |
| "loss": 2.1347, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.130952380952381, | |
| "grad_norm": 3.4998621940612793, | |
| "learning_rate": 1.5476190476190476e-05, | |
| "loss": 2.0757, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.1607142857142858, | |
| "grad_norm": 3.306267738342285, | |
| "learning_rate": 1.535714285714286e-05, | |
| "loss": 2.0177, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 3.8774032592773438, | |
| "learning_rate": 1.523809523809524e-05, | |
| "loss": 1.9748, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2202380952380953, | |
| "grad_norm": 2.662797212600708, | |
| "learning_rate": 1.511904761904762e-05, | |
| "loss": 1.9628, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 3.9353742599487305, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 2.0104, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2797619047619047, | |
| "grad_norm": 3.3460521697998047, | |
| "learning_rate": 1.4880952380952383e-05, | |
| "loss": 2.0678, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.3095238095238095, | |
| "grad_norm": 3.0211353302001953, | |
| "learning_rate": 1.4761904761904763e-05, | |
| "loss": 2.0294, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.3392857142857144, | |
| "grad_norm": 2.827756404876709, | |
| "learning_rate": 1.4642857142857144e-05, | |
| "loss": 1.9104, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.369047619047619, | |
| "grad_norm": 2.606844663619995, | |
| "learning_rate": 1.4523809523809524e-05, | |
| "loss": 1.933, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.3988095238095237, | |
| "grad_norm": 3.994950294494629, | |
| "learning_rate": 1.4404761904761907e-05, | |
| "loss": 1.9977, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 3.6433207988739014, | |
| "learning_rate": 1.4285714285714287e-05, | |
| "loss": 1.897, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.4583333333333333, | |
| "grad_norm": 3.1899826526641846, | |
| "learning_rate": 1.416666666666667e-05, | |
| "loss": 1.9046, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.4880952380952381, | |
| "grad_norm": 3.352928638458252, | |
| "learning_rate": 1.4047619047619048e-05, | |
| "loss": 1.7378, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.5178571428571428, | |
| "grad_norm": 4.73577880859375, | |
| "learning_rate": 1.3928571428571429e-05, | |
| "loss": 1.7998, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.5476190476190477, | |
| "grad_norm": 3.118739366531372, | |
| "learning_rate": 1.3809523809523811e-05, | |
| "loss": 1.7316, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5773809523809523, | |
| "grad_norm": 2.617877721786499, | |
| "learning_rate": 1.3690476190476192e-05, | |
| "loss": 1.6478, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.6071428571428572, | |
| "grad_norm": 3.3894600868225098, | |
| "learning_rate": 1.3571428571428574e-05, | |
| "loss": 1.7311, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.6369047619047619, | |
| "grad_norm": 4.088054656982422, | |
| "learning_rate": 1.3452380952380954e-05, | |
| "loss": 1.5008, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 3.2209737300872803, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 1.6994, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.6964285714285714, | |
| "grad_norm": 3.8286681175231934, | |
| "learning_rate": 1.3214285714285716e-05, | |
| "loss": 1.6879, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.7261904761904763, | |
| "grad_norm": 2.611720561981201, | |
| "learning_rate": 1.3095238095238096e-05, | |
| "loss": 1.6061, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.755952380952381, | |
| "grad_norm": 2.898097276687622, | |
| "learning_rate": 1.2976190476190478e-05, | |
| "loss": 1.5223, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 2.2522895336151123, | |
| "learning_rate": 1.2857142857142859e-05, | |
| "loss": 1.5095, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.8154761904761905, | |
| "grad_norm": 3.5610804557800293, | |
| "learning_rate": 1.2738095238095238e-05, | |
| "loss": 1.6524, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.8452380952380953, | |
| "grad_norm": 3.532130002975464, | |
| "learning_rate": 1.261904761904762e-05, | |
| "loss": 1.5345, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.875, | |
| "grad_norm": 3.8648953437805176, | |
| "learning_rate": 1.25e-05, | |
| "loss": 1.691, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.9047619047619047, | |
| "grad_norm": 2.4936046600341797, | |
| "learning_rate": 1.2380952380952383e-05, | |
| "loss": 1.4573, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.9345238095238095, | |
| "grad_norm": 3.499699592590332, | |
| "learning_rate": 1.2261904761904763e-05, | |
| "loss": 1.5181, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.9642857142857144, | |
| "grad_norm": 2.7815959453582764, | |
| "learning_rate": 1.2142857142857142e-05, | |
| "loss": 1.4333, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.994047619047619, | |
| "grad_norm": 3.007183790206909, | |
| "learning_rate": 1.2023809523809525e-05, | |
| "loss": 1.4653, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.8839662447257384, | |
| "eval_loss": 1.4698303937911987, | |
| "eval_runtime": 51.4369, | |
| "eval_samples_per_second": 9.215, | |
| "eval_steps_per_second": 1.166, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 2.0238095238095237, | |
| "grad_norm": 3.4663267135620117, | |
| "learning_rate": 1.1904761904761905e-05, | |
| "loss": 1.4428, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.0535714285714284, | |
| "grad_norm": 2.2934768199920654, | |
| "learning_rate": 1.1785714285714287e-05, | |
| "loss": 1.4135, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.0833333333333335, | |
| "grad_norm": 2.601954221725464, | |
| "learning_rate": 1.1666666666666668e-05, | |
| "loss": 1.456, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.113095238095238, | |
| "grad_norm": 3.2254600524902344, | |
| "learning_rate": 1.1547619047619047e-05, | |
| "loss": 1.5227, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 3.2958316802978516, | |
| "learning_rate": 1.1428571428571429e-05, | |
| "loss": 1.4248, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.1726190476190474, | |
| "grad_norm": 4.993536472320557, | |
| "learning_rate": 1.130952380952381e-05, | |
| "loss": 1.4717, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.2023809523809526, | |
| "grad_norm": 3.3640084266662598, | |
| "learning_rate": 1.1190476190476192e-05, | |
| "loss": 1.4265, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.232142857142857, | |
| "grad_norm": 2.6835250854492188, | |
| "learning_rate": 1.1071428571428572e-05, | |
| "loss": 1.408, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.261904761904762, | |
| "grad_norm": 3.8518381118774414, | |
| "learning_rate": 1.0952380952380955e-05, | |
| "loss": 1.2666, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.2916666666666665, | |
| "grad_norm": 3.553366184234619, | |
| "learning_rate": 1.0833333333333334e-05, | |
| "loss": 1.4052, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.3214285714285716, | |
| "grad_norm": 2.657440423965454, | |
| "learning_rate": 1.0714285714285714e-05, | |
| "loss": 1.3953, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.3511904761904763, | |
| "grad_norm": 4.050617694854736, | |
| "learning_rate": 1.0595238095238096e-05, | |
| "loss": 1.3073, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.380952380952381, | |
| "grad_norm": 3.039287567138672, | |
| "learning_rate": 1.0476190476190477e-05, | |
| "loss": 1.3765, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.4107142857142856, | |
| "grad_norm": 3.350076913833618, | |
| "learning_rate": 1.0357142857142859e-05, | |
| "loss": 1.2713, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.4404761904761907, | |
| "grad_norm": 4.112967491149902, | |
| "learning_rate": 1.0238095238095238e-05, | |
| "loss": 1.3557, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.4702380952380953, | |
| "grad_norm": 2.587895154953003, | |
| "learning_rate": 1.011904761904762e-05, | |
| "loss": 1.2106, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.2189221382141113, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1529, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.5297619047619047, | |
| "grad_norm": 1.7763313055038452, | |
| "learning_rate": 9.880952380952381e-06, | |
| "loss": 1.2066, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.5595238095238093, | |
| "grad_norm": 2.5652577877044678, | |
| "learning_rate": 9.761904761904762e-06, | |
| "loss": 1.2206, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.5892857142857144, | |
| "grad_norm": 2.4081642627716064, | |
| "learning_rate": 9.642857142857144e-06, | |
| "loss": 1.2288, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.619047619047619, | |
| "grad_norm": 3.4448933601379395, | |
| "learning_rate": 9.523809523809525e-06, | |
| "loss": 1.2764, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.6488095238095237, | |
| "grad_norm": 3.65535044670105, | |
| "learning_rate": 9.404761904761905e-06, | |
| "loss": 1.1818, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.678571428571429, | |
| "grad_norm": 2.902886152267456, | |
| "learning_rate": 9.285714285714288e-06, | |
| "loss": 1.2662, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.7083333333333335, | |
| "grad_norm": 2.8251378536224365, | |
| "learning_rate": 9.166666666666666e-06, | |
| "loss": 1.1246, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.738095238095238, | |
| "grad_norm": 2.1443264484405518, | |
| "learning_rate": 9.047619047619049e-06, | |
| "loss": 1.2486, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.767857142857143, | |
| "grad_norm": 4.930934429168701, | |
| "learning_rate": 8.92857142857143e-06, | |
| "loss": 1.1865, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.7976190476190474, | |
| "grad_norm": 3.2018985748291016, | |
| "learning_rate": 8.80952380952381e-06, | |
| "loss": 1.1047, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.8273809523809526, | |
| "grad_norm": 3.2998268604278564, | |
| "learning_rate": 8.690476190476192e-06, | |
| "loss": 1.2098, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 2.1316542625427246, | |
| "learning_rate": 8.571428571428571e-06, | |
| "loss": 1.0918, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.886904761904762, | |
| "grad_norm": 3.8014087677001953, | |
| "learning_rate": 8.452380952380953e-06, | |
| "loss": 1.1139, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.9166666666666665, | |
| "grad_norm": 2.8320999145507812, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 1.213, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.946428571428571, | |
| "grad_norm": 3.016481876373291, | |
| "learning_rate": 8.214285714285714e-06, | |
| "loss": 1.1398, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.9761904761904763, | |
| "grad_norm": 3.9006187915802, | |
| "learning_rate": 8.095238095238097e-06, | |
| "loss": 1.1052, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.930379746835443, | |
| "eval_loss": 1.0801581144332886, | |
| "eval_runtime": 51.0077, | |
| "eval_samples_per_second": 9.293, | |
| "eval_steps_per_second": 1.176, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 3.005952380952381, | |
| "grad_norm": 2.796464204788208, | |
| "learning_rate": 7.976190476190477e-06, | |
| "loss": 1.1341, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 3.0357142857142856, | |
| "grad_norm": 2.1846368312835693, | |
| "learning_rate": 7.857142857142858e-06, | |
| "loss": 1.173, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 3.0654761904761907, | |
| "grad_norm": 3.3909096717834473, | |
| "learning_rate": 7.738095238095238e-06, | |
| "loss": 1.0198, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 3.0952380952380953, | |
| "grad_norm": 3.5887138843536377, | |
| "learning_rate": 7.61904761904762e-06, | |
| "loss": 1.0729, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 3.125, | |
| "grad_norm": 2.7871737480163574, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.9676, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.1547619047619047, | |
| "grad_norm": 3.3368754386901855, | |
| "learning_rate": 7.380952380952382e-06, | |
| "loss": 0.9599, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 3.1845238095238093, | |
| "grad_norm": 3.748992919921875, | |
| "learning_rate": 7.261904761904762e-06, | |
| "loss": 1.1599, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 3.2142857142857144, | |
| "grad_norm": 4.470694065093994, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": 1.155, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 3.244047619047619, | |
| "grad_norm": 1.8315823078155518, | |
| "learning_rate": 7.023809523809524e-06, | |
| "loss": 0.979, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 3.2738095238095237, | |
| "grad_norm": 2.505209445953369, | |
| "learning_rate": 6.9047619047619055e-06, | |
| "loss": 1.142, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.3035714285714284, | |
| "grad_norm": 3.056353807449341, | |
| "learning_rate": 6.785714285714287e-06, | |
| "loss": 1.0072, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 3.9302310943603516, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 1.0705, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 3.363095238095238, | |
| "grad_norm": 4.6520490646362305, | |
| "learning_rate": 6.547619047619048e-06, | |
| "loss": 1.0325, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 3.392857142857143, | |
| "grad_norm": 3.9381701946258545, | |
| "learning_rate": 6.4285714285714295e-06, | |
| "loss": 0.9674, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 3.4226190476190474, | |
| "grad_norm": 5.080965042114258, | |
| "learning_rate": 6.30952380952381e-06, | |
| "loss": 0.9812, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.4523809523809526, | |
| "grad_norm": 4.649317264556885, | |
| "learning_rate": 6.1904761904761914e-06, | |
| "loss": 1.1093, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 3.482142857142857, | |
| "grad_norm": 5.5956315994262695, | |
| "learning_rate": 6.071428571428571e-06, | |
| "loss": 1.0133, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 3.511904761904762, | |
| "grad_norm": 4.99602746963501, | |
| "learning_rate": 5.9523809523809525e-06, | |
| "loss": 1.075, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 3.5416666666666665, | |
| "grad_norm": 3.875300407409668, | |
| "learning_rate": 5.833333333333334e-06, | |
| "loss": 1.1469, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 3.571428571428571, | |
| "grad_norm": 2.9351279735565186, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": 1.1746, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.6011904761904763, | |
| "grad_norm": 3.581909418106079, | |
| "learning_rate": 5.595238095238096e-06, | |
| "loss": 1.0452, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 3.630952380952381, | |
| "grad_norm": 2.4383697509765625, | |
| "learning_rate": 5.476190476190477e-06, | |
| "loss": 0.884, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 3.6607142857142856, | |
| "grad_norm": 3.386600971221924, | |
| "learning_rate": 5.357142857142857e-06, | |
| "loss": 0.9479, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 3.6904761904761907, | |
| "grad_norm": 1.5890535116195679, | |
| "learning_rate": 5.2380952380952384e-06, | |
| "loss": 0.8953, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 3.7202380952380953, | |
| "grad_norm": 2.729491710662842, | |
| "learning_rate": 5.119047619047619e-06, | |
| "loss": 0.9071, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 4.265748977661133, | |
| "learning_rate": 5e-06, | |
| "loss": 1.0496, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 3.7797619047619047, | |
| "grad_norm": 3.6234512329101562, | |
| "learning_rate": 4.880952380952381e-06, | |
| "loss": 0.9945, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 3.8095238095238093, | |
| "grad_norm": 3.0296449661254883, | |
| "learning_rate": 4.761904761904762e-06, | |
| "loss": 1.0592, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 3.8392857142857144, | |
| "grad_norm": 3.7550673484802246, | |
| "learning_rate": 4.642857142857144e-06, | |
| "loss": 0.9102, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 3.869047619047619, | |
| "grad_norm": 2.3732712268829346, | |
| "learning_rate": 4.523809523809524e-06, | |
| "loss": 0.9721, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.8988095238095237, | |
| "grad_norm": 4.049142360687256, | |
| "learning_rate": 4.404761904761905e-06, | |
| "loss": 0.9409, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 3.928571428571429, | |
| "grad_norm": 2.1877949237823486, | |
| "learning_rate": 4.2857142857142855e-06, | |
| "loss": 1.0235, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 3.9583333333333335, | |
| "grad_norm": 1.8449411392211914, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 0.978, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 3.988095238095238, | |
| "grad_norm": 2.8841190338134766, | |
| "learning_rate": 4.047619047619048e-06, | |
| "loss": 1.0055, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.9493670886075949, | |
| "eval_loss": 0.9248189926147461, | |
| "eval_runtime": 51.0423, | |
| "eval_samples_per_second": 9.286, | |
| "eval_steps_per_second": 1.175, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 4.017857142857143, | |
| "grad_norm": 2.242076873779297, | |
| "learning_rate": 3.928571428571429e-06, | |
| "loss": 0.9244, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.0476190476190474, | |
| "grad_norm": 1.98090660572052, | |
| "learning_rate": 3.80952380952381e-06, | |
| "loss": 0.8568, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 4.0773809523809526, | |
| "grad_norm": 3.927706718444824, | |
| "learning_rate": 3.690476190476191e-06, | |
| "loss": 0.9644, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 4.107142857142857, | |
| "grad_norm": 2.3780994415283203, | |
| "learning_rate": 3.5714285714285718e-06, | |
| "loss": 0.97, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 4.136904761904762, | |
| "grad_norm": 2.21608304977417, | |
| "learning_rate": 3.4523809523809528e-06, | |
| "loss": 0.9728, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 6.764073848724365, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.8729, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.196428571428571, | |
| "grad_norm": 1.5746071338653564, | |
| "learning_rate": 3.2142857142857147e-06, | |
| "loss": 0.7702, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 4.226190476190476, | |
| "grad_norm": 1.8241825103759766, | |
| "learning_rate": 3.0952380952380957e-06, | |
| "loss": 0.9121, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 4.255952380952381, | |
| "grad_norm": 3.9683926105499268, | |
| "learning_rate": 2.9761904761904763e-06, | |
| "loss": 0.8749, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 4.285714285714286, | |
| "grad_norm": 1.5732113122940063, | |
| "learning_rate": 2.8571428571428573e-06, | |
| "loss": 0.9421, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 4.315476190476191, | |
| "grad_norm": 2.5848405361175537, | |
| "learning_rate": 2.7380952380952387e-06, | |
| "loss": 0.9617, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.345238095238095, | |
| "grad_norm": 3.7017910480499268, | |
| "learning_rate": 2.6190476190476192e-06, | |
| "loss": 0.905, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 4.375, | |
| "grad_norm": 5.973739147186279, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.89, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 4.404761904761905, | |
| "grad_norm": 1.8716737031936646, | |
| "learning_rate": 2.380952380952381e-06, | |
| "loss": 0.9635, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 4.434523809523809, | |
| "grad_norm": 3.3029792308807373, | |
| "learning_rate": 2.261904761904762e-06, | |
| "loss": 0.933, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 4.464285714285714, | |
| "grad_norm": 2.5819740295410156, | |
| "learning_rate": 2.1428571428571427e-06, | |
| "loss": 0.8899, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 4.494047619047619, | |
| "grad_norm": 3.5635058879852295, | |
| "learning_rate": 2.023809523809524e-06, | |
| "loss": 0.8539, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 4.523809523809524, | |
| "grad_norm": 2.5672874450683594, | |
| "learning_rate": 1.904761904761905e-06, | |
| "loss": 1.0972, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 4.553571428571429, | |
| "grad_norm": 5.11098051071167, | |
| "learning_rate": 1.7857142857142859e-06, | |
| "loss": 0.9862, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 4.583333333333333, | |
| "grad_norm": 2.5244972705841064, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 1.0213, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 4.613095238095238, | |
| "grad_norm": 3.5044398307800293, | |
| "learning_rate": 1.5476190476190479e-06, | |
| "loss": 0.9144, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 4.642857142857143, | |
| "grad_norm": 2.4903435707092285, | |
| "learning_rate": 1.4285714285714286e-06, | |
| "loss": 0.9331, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 4.6726190476190474, | |
| "grad_norm": 3.208696126937866, | |
| "learning_rate": 1.3095238095238096e-06, | |
| "loss": 1.013, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 4.7023809523809526, | |
| "grad_norm": 2.255563735961914, | |
| "learning_rate": 1.1904761904761906e-06, | |
| "loss": 0.7625, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 4.732142857142857, | |
| "grad_norm": 2.1157748699188232, | |
| "learning_rate": 1.0714285714285714e-06, | |
| "loss": 0.8885, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 4.761904761904762, | |
| "grad_norm": 3.0076255798339844, | |
| "learning_rate": 9.523809523809525e-07, | |
| "loss": 1.0166, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 4.791666666666667, | |
| "grad_norm": 2.899481773376465, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.9983, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 4.821428571428571, | |
| "grad_norm": 6.084941387176514, | |
| "learning_rate": 7.142857142857143e-07, | |
| "loss": 1.1526, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 4.851190476190476, | |
| "grad_norm": 3.8710179328918457, | |
| "learning_rate": 5.952380952380953e-07, | |
| "loss": 0.8589, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 4.880952380952381, | |
| "grad_norm": 2.1053106784820557, | |
| "learning_rate": 4.7619047619047623e-07, | |
| "loss": 0.8788, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 4.910714285714286, | |
| "grad_norm": 2.2121217250823975, | |
| "learning_rate": 3.5714285714285716e-07, | |
| "loss": 0.8718, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 4.940476190476191, | |
| "grad_norm": 2.3137481212615967, | |
| "learning_rate": 2.3809523809523811e-07, | |
| "loss": 0.7878, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 4.970238095238095, | |
| "grad_norm": 2.676529884338379, | |
| "learning_rate": 1.1904761904761906e-07, | |
| "loss": 0.7782, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 7.775545597076416, | |
| "learning_rate": 0.0, | |
| "loss": 0.7847, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.9514767932489452, | |
| "eval_loss": 0.8745647668838501, | |
| "eval_runtime": 50.8678, | |
| "eval_samples_per_second": 9.318, | |
| "eval_steps_per_second": 1.18, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 1680, | |
| "total_flos": 1.0410532148820787e+18, | |
| "train_loss": 1.5688391100792658, | |
| "train_runtime": 1801.1044, | |
| "train_samples_per_second": 7.457, | |
| "train_steps_per_second": 0.933 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1680, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0410532148820787e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |