{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9972396871645453, "eval_steps": 200.0, "global_step": 814, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 9.996276622795847e-06, "loss": 0.1401, "step": 10 }, { "epoch": 0.05, "learning_rate": 9.98511203659851e-06, "loss": 0.0684, "step": 20 }, { "epoch": 0.07, "learning_rate": 9.966522869394282e-06, "loss": 0.051, "step": 30 }, { "epoch": 0.1, "learning_rate": 9.940536806975732e-06, "loss": 0.0444, "step": 40 }, { "epoch": 0.12, "learning_rate": 9.907192551707831e-06, "loss": 0.0436, "step": 50 }, { "epoch": 0.15, "learning_rate": 9.866539764886562e-06, "loss": 0.038, "step": 60 }, { "epoch": 0.17, "learning_rate": 9.818638992775822e-06, "loss": 0.0421, "step": 70 }, { "epoch": 0.2, "learning_rate": 9.763561576432781e-06, "loss": 0.0415, "step": 80 }, { "epoch": 0.22, "learning_rate": 9.70138954545603e-06, "loss": 0.0402, "step": 90 }, { "epoch": 0.25, "learning_rate": 9.632215495814724e-06, "loss": 0.0375, "step": 100 }, { "epoch": 0.27, "learning_rate": 9.55614245194068e-06, "loss": 0.0362, "step": 110 }, { "epoch": 0.29, "learning_rate": 9.473283713288862e-06, "loss": 0.0373, "step": 120 }, { "epoch": 0.32, "learning_rate": 9.383762685594736e-06, "loss": 0.0375, "step": 130 }, { "epoch": 0.34, "learning_rate": 9.287712697079827e-06, "loss": 0.0374, "step": 140 }, { "epoch": 0.37, "learning_rate": 9.185276799879212e-06, "loss": 0.0358, "step": 150 }, { "epoch": 0.39, "learning_rate": 9.076607556986699e-06, "loss": 0.0365, "step": 160 }, { "epoch": 0.42, "learning_rate": 8.961866815035e-06, "loss": 0.0378, "step": 170 }, { "epoch": 0.44, "learning_rate": 8.841225463249305e-06, "loss": 0.0367, "step": 180 }, { "epoch": 0.47, "learning_rate": 8.714863178933258e-06, "loss": 0.0359, "step": 190 }, { "epoch": 0.49, "learning_rate": 8.582968159866416e-06, "loss": 0.0346, "step": 200 }, { "epoch": 0.52, "learning_rate": 8.445736844011712e-06, "loss": 0.0361, "step": 210 }, { "epoch": 0.54, "learning_rate": 8.303373616950408e-06, "loss": 0.0352, "step": 220 }, { "epoch": 0.56, "learning_rate": 8.156090507480242e-06, "loss": 0.0351, "step": 230 }, { "epoch": 0.59, "learning_rate": 8.004106871830155e-06, "loss": 0.0345, "step": 240 }, { "epoch": 0.61, "learning_rate": 7.847649066961905e-06, "loss": 0.0364, "step": 250 }, { "epoch": 0.64, "learning_rate": 7.68695011344511e-06, "loss": 0.0313, "step": 260 }, { "epoch": 0.66, "learning_rate": 7.52224934840788e-06, "loss": 0.0322, "step": 270 }, { "epoch": 0.69, "learning_rate": 7.353792069079826e-06, "loss": 0.0348, "step": 280 }, { "epoch": 0.71, "learning_rate": 7.181829167458441e-06, "loss": 0.0374, "step": 290 }, { "epoch": 0.74, "learning_rate": 7.006616756642867e-06, "loss": 0.0348, "step": 300 }, { "epoch": 0.76, "learning_rate": 6.828415789391632e-06, "loss": 0.0318, "step": 310 }, { "epoch": 0.79, "learning_rate": 6.647491669472421e-06, "loss": 0.0345, "step": 320 }, { "epoch": 0.81, "learning_rate": 6.464113856382752e-06, "loss": 0.034, "step": 330 }, { "epoch": 0.83, "learning_rate": 6.278555464030228e-06, "loss": 0.0305, "step": 340 }, { "epoch": 0.86, "learning_rate": 6.091092853970098e-06, "loss": 0.0337, "step": 350 }, { "epoch": 0.88, "learning_rate": 5.902005223805931e-06, "loss": 0.0326, "step": 360 }, { "epoch": 0.91, "learning_rate": 5.711574191366427e-06, "loss": 0.0344, "step": 370 }, { "epoch": 0.93, "learning_rate": 5.520083375277644e-06, "loss": 0.032, "step": 380 }, { "epoch": 0.96, "learning_rate": 5.3278179725553525e-06, "loss": 0.0329, "step": 390 }, { "epoch": 0.98, "learning_rate": 5.135064333846612e-06, "loss": 0.0337, "step": 400 }, { "epoch": 1.01, "learning_rate": 4.942109536953177e-06, "loss": 0.033, "step": 410 }, { "epoch": 1.03, "learning_rate": 4.749240959271918e-06, "loss": 0.0298, "step": 420 }, { "epoch": 1.06, "learning_rate": 4.556745849789055e-06, "loss": 0.0338, "step": 430 }, { "epoch": 1.08, "learning_rate": 4.364910901265607e-06, "loss": 0.0332, "step": 440 }, { "epoch": 1.1, "learning_rate": 4.174021823251294e-06, "loss": 0.0328, "step": 450 }, { "epoch": 1.13, "learning_rate": 3.984362916562753e-06, "loss": 0.0329, "step": 460 }, { "epoch": 1.15, "learning_rate": 3.7962166498598785e-06, "loss": 0.0315, "step": 470 }, { "epoch": 1.18, "learning_rate": 3.6098632389508637e-06, "loss": 0.0299, "step": 480 }, { "epoch": 1.2, "learning_rate": 3.4255802294525464e-06, "loss": 0.0319, "step": 490 }, { "epoch": 1.23, "learning_rate": 3.2436420834276013e-06, "loss": 0.0319, "step": 500 }, { "epoch": 1.25, "learning_rate": 3.0643197706142136e-06, "loss": 0.0333, "step": 510 }, { "epoch": 1.28, "learning_rate": 2.8878803648570773e-06, "loss": 0.0306, "step": 520 }, { "epoch": 1.3, "learning_rate": 2.7145866463407163e-06, "loss": 0.0309, "step": 530 }, { "epoch": 1.32, "learning_rate": 2.544696710217588e-06, "loss": 0.0325, "step": 540 }, { "epoch": 1.35, "learning_rate": 2.3784635822138424e-06, "loss": 0.032, "step": 550 }, { "epoch": 1.37, "learning_rate": 2.2161348417852346e-06, "loss": 0.034, "step": 560 }, { "epoch": 1.4, "learning_rate": 2.05795225338444e-06, "loss": 0.0322, "step": 570 }, { "epoch": 1.42, "learning_rate": 1.9041514063889571e-06, "loss": 0.0328, "step": 580 }, { "epoch": 1.45, "learning_rate": 1.7549613642258573e-06, "loss": 0.0336, "step": 590 }, { "epoch": 1.47, "learning_rate": 1.6106043232159745e-06, "loss": 0.0339, "step": 600 }, { "epoch": 1.5, "learning_rate": 1.4712952816456095e-06, "loss": 0.0325, "step": 610 }, { "epoch": 1.52, "learning_rate": 1.337241719558648e-06, "loss": 0.0327, "step": 620 }, { "epoch": 1.55, "learning_rate": 1.2086432897459738e-06, "loss": 0.0304, "step": 630 }, { "epoch": 1.57, "learning_rate": 1.0856915203924096e-06, "loss": 0.0329, "step": 640 }, { "epoch": 1.59, "learning_rate": 9.685695298240432e-07, "loss": 0.0322, "step": 650 }, { "epoch": 1.62, "learning_rate": 8.574517537807897e-07, "loss": 0.0343, "step": 660 }, { "epoch": 1.64, "learning_rate": 7.525036856203677e-07, "loss": 0.0328, "step": 670 }, { "epoch": 1.67, "learning_rate": 6.538816298406203e-07, "loss": 0.0334, "step": 680 }, { "epoch": 1.69, "learning_rate": 5.617324692872744e-07, "loss": 0.0334, "step": 690 }, { "epoch": 1.72, "learning_rate": 4.7619344639384447e-07, "loss": 0.0337, "step": 700 }, { "epoch": 1.74, "learning_rate": 3.9739195877949223e-07, "loss": 0.0306, "step": 710 }, { "epoch": 1.77, "learning_rate": 3.254453695092752e-07, "loss": 0.032, "step": 720 }, { "epoch": 1.79, "learning_rate": 2.604608322993518e-07, "loss": 0.0325, "step": 730 }, { "epoch": 1.82, "learning_rate": 2.0253513192751374e-07, "loss": 0.0351, "step": 740 }, { "epoch": 1.84, "learning_rate": 1.5175454008667712e-07, "loss": 0.0309, "step": 750 }, { "epoch": 1.86, "learning_rate": 1.0819468689607426e-07, "loss": 0.0305, "step": 760 }, { "epoch": 1.89, "learning_rate": 7.192044826145772e-08, "loss": 0.0284, "step": 770 }, { "epoch": 1.91, "learning_rate": 4.298584925212068e-08, "loss": 0.0305, "step": 780 }, { "epoch": 1.94, "learning_rate": 2.143398363860738e-08, "loss": 0.0303, "step": 790 }, { "epoch": 1.96, "learning_rate": 7.2969497109715016e-09, "loss": 0.0313, "step": 800 }, { "epoch": 1.99, "learning_rate": 5.958024731567147e-10, "loss": 0.0331, "step": 810 }, { "epoch": 2.0, "step": 814, "tflops": 490.2053045046936, "token/s": 1736.022262942593, "total_flos": 1.5851362853622645e+19, "train_loss": 0.035910474946516446, "train_runtime": 31598.4886, "train_samples_per_second": 6.603, "train_steps_per_second": 0.026 } ], "log_save_evaluate_time": 2188.9126505851746, "logging_steps": 10, "max_steps": 814, "num_train_epochs": 2, "save_steps": 200, "total_flos": 1.5851362853622645e+19, "total_tokens": 51055680.0, "trial_name": null, "trial_params": null }