{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998778998778999, "eval_steps": 500, "global_step": 921, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03256003256003256, "grad_norm": 5.323436630800234, "learning_rate": 5e-06, "loss": 1.0327, "step": 10 }, { "epoch": 0.06512006512006512, "grad_norm": 1.688128003809466, "learning_rate": 5e-06, "loss": 0.9062, "step": 20 }, { "epoch": 0.09768009768009768, "grad_norm": 1.2140455994223702, "learning_rate": 5e-06, "loss": 0.8619, "step": 30 }, { "epoch": 0.13024013024013023, "grad_norm": 1.892078953122378, "learning_rate": 5e-06, "loss": 0.8392, "step": 40 }, { "epoch": 0.1628001628001628, "grad_norm": 1.3493280342186107, "learning_rate": 5e-06, "loss": 0.8302, "step": 50 }, { "epoch": 0.19536019536019536, "grad_norm": 1.404398870605967, "learning_rate": 5e-06, "loss": 0.8146, "step": 60 }, { "epoch": 0.22792022792022792, "grad_norm": 1.716904927996173, "learning_rate": 5e-06, "loss": 0.7984, "step": 70 }, { "epoch": 0.26048026048026046, "grad_norm": 1.1043041468212171, "learning_rate": 5e-06, "loss": 0.7822, "step": 80 }, { "epoch": 0.29304029304029305, "grad_norm": 1.1394393992724225, "learning_rate": 5e-06, "loss": 0.7747, "step": 90 }, { "epoch": 0.3256003256003256, "grad_norm": 0.9041488774288327, "learning_rate": 5e-06, "loss": 0.7717, "step": 100 }, { "epoch": 0.3581603581603582, "grad_norm": 0.9299755697835647, "learning_rate": 5e-06, "loss": 0.7678, "step": 110 }, { "epoch": 0.3907203907203907, "grad_norm": 0.993036115619975, "learning_rate": 5e-06, "loss": 0.7683, "step": 120 }, { "epoch": 0.42328042328042326, "grad_norm": 0.6582951208355411, "learning_rate": 5e-06, "loss": 0.7628, "step": 130 }, { "epoch": 0.45584045584045585, "grad_norm": 0.7519239236084948, "learning_rate": 5e-06, "loss": 0.7565, "step": 140 }, { "epoch": 0.4884004884004884, "grad_norm": 0.6532749233523701, "learning_rate": 5e-06, "loss": 0.7542, "step": 150 }, { "epoch": 0.5209605209605209, "grad_norm": 0.6538729701357949, "learning_rate": 5e-06, "loss": 0.752, "step": 160 }, { "epoch": 0.5535205535205535, "grad_norm": 0.5799393154553659, "learning_rate": 5e-06, "loss": 0.7495, "step": 170 }, { "epoch": 0.5860805860805861, "grad_norm": 0.5825669893589298, "learning_rate": 5e-06, "loss": 0.7428, "step": 180 }, { "epoch": 0.6186406186406186, "grad_norm": 0.7509089505557253, "learning_rate": 5e-06, "loss": 0.7449, "step": 190 }, { "epoch": 0.6512006512006512, "grad_norm": 0.8999008774030093, "learning_rate": 5e-06, "loss": 0.7512, "step": 200 }, { "epoch": 0.6837606837606838, "grad_norm": 0.6383792756493567, "learning_rate": 5e-06, "loss": 0.7357, "step": 210 }, { "epoch": 0.7163207163207164, "grad_norm": 0.7456320321321589, "learning_rate": 5e-06, "loss": 0.7407, "step": 220 }, { "epoch": 0.7488807488807488, "grad_norm": 0.6143588238010131, "learning_rate": 5e-06, "loss": 0.7365, "step": 230 }, { "epoch": 0.7814407814407814, "grad_norm": 0.7213470058027147, "learning_rate": 5e-06, "loss": 0.74, "step": 240 }, { "epoch": 0.814000814000814, "grad_norm": 0.6247707373236687, "learning_rate": 5e-06, "loss": 0.737, "step": 250 }, { "epoch": 0.8465608465608465, "grad_norm": 0.7179277915838116, "learning_rate": 5e-06, "loss": 0.7391, "step": 260 }, { "epoch": 0.8791208791208791, "grad_norm": 0.5751713737264728, "learning_rate": 5e-06, "loss": 0.7256, "step": 270 }, { "epoch": 0.9116809116809117, "grad_norm": 1.0076623893106156, "learning_rate": 5e-06, "loss": 0.7292, "step": 280 }, { "epoch": 0.9442409442409443, "grad_norm": 0.9261278576758778, "learning_rate": 5e-06, "loss": 0.7307, "step": 290 }, { "epoch": 0.9768009768009768, "grad_norm": 0.5709384175023418, "learning_rate": 5e-06, "loss": 0.7273, "step": 300 }, { "epoch": 0.9995929995929996, "eval_loss": 0.727676272392273, "eval_runtime": 323.9497, "eval_samples_per_second": 25.544, "eval_steps_per_second": 0.401, "step": 307 }, { "epoch": 1.0093610093610093, "grad_norm": 0.9162973627143661, "learning_rate": 5e-06, "loss": 0.775, "step": 310 }, { "epoch": 1.0419210419210418, "grad_norm": 0.6625376118384887, "learning_rate": 5e-06, "loss": 0.679, "step": 320 }, { "epoch": 1.0744810744810744, "grad_norm": 0.6807366677436052, "learning_rate": 5e-06, "loss": 0.6787, "step": 330 }, { "epoch": 1.107041107041107, "grad_norm": 0.7495116413050847, "learning_rate": 5e-06, "loss": 0.6805, "step": 340 }, { "epoch": 1.1396011396011396, "grad_norm": 0.6693097946610393, "learning_rate": 5e-06, "loss": 0.6759, "step": 350 }, { "epoch": 1.1721611721611722, "grad_norm": 0.6269071277974789, "learning_rate": 5e-06, "loss": 0.6793, "step": 360 }, { "epoch": 1.2047212047212048, "grad_norm": 0.701447231936067, "learning_rate": 5e-06, "loss": 0.6739, "step": 370 }, { "epoch": 1.2372812372812372, "grad_norm": 0.6286463507729596, "learning_rate": 5e-06, "loss": 0.6738, "step": 380 }, { "epoch": 1.2698412698412698, "grad_norm": 0.678099904425436, "learning_rate": 5e-06, "loss": 0.6775, "step": 390 }, { "epoch": 1.3024013024013024, "grad_norm": 0.7402015170342834, "learning_rate": 5e-06, "loss": 0.6755, "step": 400 }, { "epoch": 1.334961334961335, "grad_norm": 0.8963076513215479, "learning_rate": 5e-06, "loss": 0.6747, "step": 410 }, { "epoch": 1.3675213675213675, "grad_norm": 0.758606230124057, "learning_rate": 5e-06, "loss": 0.6795, "step": 420 }, { "epoch": 1.4000814000814001, "grad_norm": 0.755225029704983, "learning_rate": 5e-06, "loss": 0.6782, "step": 430 }, { "epoch": 1.4326414326414327, "grad_norm": 0.9296144227043265, "learning_rate": 5e-06, "loss": 0.6706, "step": 440 }, { "epoch": 1.4652014652014653, "grad_norm": 0.6465031575836382, "learning_rate": 5e-06, "loss": 0.6708, "step": 450 }, { "epoch": 1.4977614977614977, "grad_norm": 0.5928157369911277, "learning_rate": 5e-06, "loss": 0.6763, "step": 460 }, { "epoch": 1.5303215303215303, "grad_norm": 0.6210442459071823, "learning_rate": 5e-06, "loss": 0.6749, "step": 470 }, { "epoch": 1.5628815628815629, "grad_norm": 0.6163699649091662, "learning_rate": 5e-06, "loss": 0.6755, "step": 480 }, { "epoch": 1.5954415954415955, "grad_norm": 0.8040173316442683, "learning_rate": 5e-06, "loss": 0.6704, "step": 490 }, { "epoch": 1.6280016280016278, "grad_norm": 0.6887993451391516, "learning_rate": 5e-06, "loss": 0.6701, "step": 500 }, { "epoch": 1.6605616605616604, "grad_norm": 0.6197281649463939, "learning_rate": 5e-06, "loss": 0.6726, "step": 510 }, { "epoch": 1.693121693121693, "grad_norm": 0.619478860918107, "learning_rate": 5e-06, "loss": 0.6751, "step": 520 }, { "epoch": 1.7256817256817256, "grad_norm": 0.6773051427286838, "learning_rate": 5e-06, "loss": 0.6641, "step": 530 }, { "epoch": 1.7582417582417582, "grad_norm": 0.6286720338866559, "learning_rate": 5e-06, "loss": 0.6796, "step": 540 }, { "epoch": 1.7908017908017908, "grad_norm": 0.7533774989219207, "learning_rate": 5e-06, "loss": 0.672, "step": 550 }, { "epoch": 1.8233618233618234, "grad_norm": 0.7731184689615994, "learning_rate": 5e-06, "loss": 0.6659, "step": 560 }, { "epoch": 1.855921855921856, "grad_norm": 0.7416671731262793, "learning_rate": 5e-06, "loss": 0.6746, "step": 570 }, { "epoch": 1.8884818884818886, "grad_norm": 0.6128076594680967, "learning_rate": 5e-06, "loss": 0.6716, "step": 580 }, { "epoch": 1.9210419210419212, "grad_norm": 0.7891628980046747, "learning_rate": 5e-06, "loss": 0.6697, "step": 590 }, { "epoch": 1.9536019536019538, "grad_norm": 0.6796937767570254, "learning_rate": 5e-06, "loss": 0.6651, "step": 600 }, { "epoch": 1.9861619861619861, "grad_norm": 0.6743284971968594, "learning_rate": 5e-06, "loss": 0.6701, "step": 610 }, { "epoch": 1.999185999185999, "eval_loss": 0.711691677570343, "eval_runtime": 324.9391, "eval_samples_per_second": 25.466, "eval_steps_per_second": 0.4, "step": 614 }, { "epoch": 2.0187220187220185, "grad_norm": 1.025163234157577, "learning_rate": 5e-06, "loss": 0.6947, "step": 620 }, { "epoch": 2.051282051282051, "grad_norm": 0.8628252708559125, "learning_rate": 5e-06, "loss": 0.6187, "step": 630 }, { "epoch": 2.0838420838420837, "grad_norm": 0.6783100628721863, "learning_rate": 5e-06, "loss": 0.6179, "step": 640 }, { "epoch": 2.1164021164021163, "grad_norm": 0.6421565201150183, "learning_rate": 5e-06, "loss": 0.6162, "step": 650 }, { "epoch": 2.148962148962149, "grad_norm": 0.685352763219904, "learning_rate": 5e-06, "loss": 0.6177, "step": 660 }, { "epoch": 2.1815221815221815, "grad_norm": 0.6061382299294098, "learning_rate": 5e-06, "loss": 0.613, "step": 670 }, { "epoch": 2.214082214082214, "grad_norm": 0.690472583201057, "learning_rate": 5e-06, "loss": 0.6157, "step": 680 }, { "epoch": 2.2466422466422467, "grad_norm": 0.627437676785234, "learning_rate": 5e-06, "loss": 0.6187, "step": 690 }, { "epoch": 2.2792022792022792, "grad_norm": 0.6938080734685778, "learning_rate": 5e-06, "loss": 0.6226, "step": 700 }, { "epoch": 2.311762311762312, "grad_norm": 0.772959190894534, "learning_rate": 5e-06, "loss": 0.6182, "step": 710 }, { "epoch": 2.3443223443223444, "grad_norm": 0.5713521350519779, "learning_rate": 5e-06, "loss": 0.6213, "step": 720 }, { "epoch": 2.376882376882377, "grad_norm": 0.6443040936760203, "learning_rate": 5e-06, "loss": 0.6224, "step": 730 }, { "epoch": 2.4094424094424096, "grad_norm": 0.5889564557828441, "learning_rate": 5e-06, "loss": 0.6203, "step": 740 }, { "epoch": 2.442002442002442, "grad_norm": 0.709826472700304, "learning_rate": 5e-06, "loss": 0.6193, "step": 750 }, { "epoch": 2.4745624745624744, "grad_norm": 0.7335788363502472, "learning_rate": 5e-06, "loss": 0.623, "step": 760 }, { "epoch": 2.5071225071225074, "grad_norm": 0.6283405015720556, "learning_rate": 5e-06, "loss": 0.6188, "step": 770 }, { "epoch": 2.5396825396825395, "grad_norm": 0.6952325423084712, "learning_rate": 5e-06, "loss": 0.6209, "step": 780 }, { "epoch": 2.572242572242572, "grad_norm": 0.6559620535420857, "learning_rate": 5e-06, "loss": 0.6201, "step": 790 }, { "epoch": 2.6048026048026047, "grad_norm": 0.7118809496834119, "learning_rate": 5e-06, "loss": 0.6198, "step": 800 }, { "epoch": 2.6373626373626373, "grad_norm": 0.7214373132810955, "learning_rate": 5e-06, "loss": 0.621, "step": 810 }, { "epoch": 2.66992266992267, "grad_norm": 0.6415485259710608, "learning_rate": 5e-06, "loss": 0.6229, "step": 820 }, { "epoch": 2.7024827024827025, "grad_norm": 0.5655240384143257, "learning_rate": 5e-06, "loss": 0.623, "step": 830 }, { "epoch": 2.735042735042735, "grad_norm": 0.7379491136681863, "learning_rate": 5e-06, "loss": 0.6204, "step": 840 }, { "epoch": 2.7676027676027677, "grad_norm": 0.6816392250287217, "learning_rate": 5e-06, "loss": 0.6213, "step": 850 }, { "epoch": 2.8001628001628003, "grad_norm": 0.6788149050134666, "learning_rate": 5e-06, "loss": 0.6219, "step": 860 }, { "epoch": 2.832722832722833, "grad_norm": 0.5660568888358906, "learning_rate": 5e-06, "loss": 0.6211, "step": 870 }, { "epoch": 2.8652828652828655, "grad_norm": 0.59814839030772, "learning_rate": 5e-06, "loss": 0.6172, "step": 880 }, { "epoch": 2.8978428978428976, "grad_norm": 0.7047473448081863, "learning_rate": 5e-06, "loss": 0.6207, "step": 890 }, { "epoch": 2.9304029304029307, "grad_norm": 0.9367887506446253, "learning_rate": 5e-06, "loss": 0.6287, "step": 900 }, { "epoch": 2.962962962962963, "grad_norm": 0.5836189827444674, "learning_rate": 5e-06, "loss": 0.6189, "step": 910 }, { "epoch": 2.9955229955229954, "grad_norm": 0.8008103377337297, "learning_rate": 5e-06, "loss": 0.6223, "step": 920 }, { "epoch": 2.998778998778999, "eval_loss": 0.7124439477920532, "eval_runtime": 324.9155, "eval_samples_per_second": 25.468, "eval_steps_per_second": 0.4, "step": 921 }, { "epoch": 2.998778998778999, "step": 921, "total_flos": 1542499923394560.0, "train_loss": 0.6911445101490498, "train_runtime": 53981.7348, "train_samples_per_second": 8.737, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 921, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1542499923394560.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }