{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.998778998778999,
  "eval_steps": 500,
  "global_step": 921,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03256003256003256,
      "grad_norm": 5.323436630800234,
      "learning_rate": 5e-06,
      "loss": 1.0327,
      "step": 10
    },
    {
      "epoch": 0.06512006512006512,
      "grad_norm": 1.688128003809466,
      "learning_rate": 5e-06,
      "loss": 0.9062,
      "step": 20
    },
    {
      "epoch": 0.09768009768009768,
      "grad_norm": 1.2140455994223702,
      "learning_rate": 5e-06,
      "loss": 0.8619,
      "step": 30
    },
    {
      "epoch": 0.13024013024013023,
      "grad_norm": 1.892078953122378,
      "learning_rate": 5e-06,
      "loss": 0.8392,
      "step": 40
    },
    {
      "epoch": 0.1628001628001628,
      "grad_norm": 1.3493280342186107,
      "learning_rate": 5e-06,
      "loss": 0.8302,
      "step": 50
    },
    {
      "epoch": 0.19536019536019536,
      "grad_norm": 1.404398870605967,
      "learning_rate": 5e-06,
      "loss": 0.8146,
      "step": 60
    },
    {
      "epoch": 0.22792022792022792,
      "grad_norm": 1.716904927996173,
      "learning_rate": 5e-06,
      "loss": 0.7984,
      "step": 70
    },
    {
      "epoch": 0.26048026048026046,
      "grad_norm": 1.1043041468212171,
      "learning_rate": 5e-06,
      "loss": 0.7822,
      "step": 80
    },
    {
      "epoch": 0.29304029304029305,
      "grad_norm": 1.1394393992724225,
      "learning_rate": 5e-06,
      "loss": 0.7747,
      "step": 90
    },
    {
      "epoch": 0.3256003256003256,
      "grad_norm": 0.9041488774288327,
      "learning_rate": 5e-06,
      "loss": 0.7717,
      "step": 100
    },
    {
      "epoch": 0.3581603581603582,
      "grad_norm": 0.9299755697835647,
      "learning_rate": 5e-06,
      "loss": 0.7678,
      "step": 110
    },
    {
      "epoch": 0.3907203907203907,
      "grad_norm": 0.993036115619975,
      "learning_rate": 5e-06,
      "loss": 0.7683,
      "step": 120
    },
    {
      "epoch": 0.42328042328042326,
      "grad_norm": 0.6582951208355411,
      "learning_rate": 5e-06,
      "loss": 0.7628,
      "step": 130
    },
    {
      "epoch": 0.45584045584045585,
      "grad_norm": 0.7519239236084948,
      "learning_rate": 5e-06,
      "loss": 0.7565,
      "step": 140
    },
    {
      "epoch": 0.4884004884004884,
      "grad_norm": 0.6532749233523701,
      "learning_rate": 5e-06,
      "loss": 0.7542,
      "step": 150
    },
    {
      "epoch": 0.5209605209605209,
      "grad_norm": 0.6538729701357949,
      "learning_rate": 5e-06,
      "loss": 0.752,
      "step": 160
    },
    {
      "epoch": 0.5535205535205535,
      "grad_norm": 0.5799393154553659,
      "learning_rate": 5e-06,
      "loss": 0.7495,
      "step": 170
    },
    {
      "epoch": 0.5860805860805861,
      "grad_norm": 0.5825669893589298,
      "learning_rate": 5e-06,
      "loss": 0.7428,
      "step": 180
    },
    {
      "epoch": 0.6186406186406186,
      "grad_norm": 0.7509089505557253,
      "learning_rate": 5e-06,
      "loss": 0.7449,
      "step": 190
    },
    {
      "epoch": 0.6512006512006512,
      "grad_norm": 0.8999008774030093,
      "learning_rate": 5e-06,
      "loss": 0.7512,
      "step": 200
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.6383792756493567,
      "learning_rate": 5e-06,
      "loss": 0.7357,
      "step": 210
    },
    {
      "epoch": 0.7163207163207164,
      "grad_norm": 0.7456320321321589,
      "learning_rate": 5e-06,
      "loss": 0.7407,
      "step": 220
    },
    {
      "epoch": 0.7488807488807488,
      "grad_norm": 0.6143588238010131,
      "learning_rate": 5e-06,
      "loss": 0.7365,
      "step": 230
    },
    {
      "epoch": 0.7814407814407814,
      "grad_norm": 0.7213470058027147,
      "learning_rate": 5e-06,
      "loss": 0.74,
      "step": 240
    },
    {
      "epoch": 0.814000814000814,
      "grad_norm": 0.6247707373236687,
      "learning_rate": 5e-06,
      "loss": 0.737,
      "step": 250
    },
    {
      "epoch": 0.8465608465608465,
      "grad_norm": 0.7179277915838116,
      "learning_rate": 5e-06,
      "loss": 0.7391,
      "step": 260
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 0.5751713737264728,
      "learning_rate": 5e-06,
      "loss": 0.7256,
      "step": 270
    },
    {
      "epoch": 0.9116809116809117,
      "grad_norm": 1.0076623893106156,
      "learning_rate": 5e-06,
      "loss": 0.7292,
      "step": 280
    },
    {
      "epoch": 0.9442409442409443,
      "grad_norm": 0.9261278576758778,
      "learning_rate": 5e-06,
      "loss": 0.7307,
      "step": 290
    },
    {
      "epoch": 0.9768009768009768,
      "grad_norm": 0.5709384175023418,
      "learning_rate": 5e-06,
      "loss": 0.7273,
      "step": 300
    },
    {
      "epoch": 0.9995929995929996,
      "eval_loss": 0.727676272392273,
      "eval_runtime": 323.9497,
      "eval_samples_per_second": 25.544,
      "eval_steps_per_second": 0.401,
      "step": 307
    },
    {
      "epoch": 1.0093610093610093,
      "grad_norm": 0.9162973627143661,
      "learning_rate": 5e-06,
      "loss": 0.775,
      "step": 310
    },
    {
      "epoch": 1.0419210419210418,
      "grad_norm": 0.6625376118384887,
      "learning_rate": 5e-06,
      "loss": 0.679,
      "step": 320
    },
    {
      "epoch": 1.0744810744810744,
      "grad_norm": 0.6807366677436052,
      "learning_rate": 5e-06,
      "loss": 0.6787,
      "step": 330
    },
    {
      "epoch": 1.107041107041107,
      "grad_norm": 0.7495116413050847,
      "learning_rate": 5e-06,
      "loss": 0.6805,
      "step": 340
    },
    {
      "epoch": 1.1396011396011396,
      "grad_norm": 0.6693097946610393,
      "learning_rate": 5e-06,
      "loss": 0.6759,
      "step": 350
    },
    {
      "epoch": 1.1721611721611722,
      "grad_norm": 0.6269071277974789,
      "learning_rate": 5e-06,
      "loss": 0.6793,
      "step": 360
    },
    {
      "epoch": 1.2047212047212048,
      "grad_norm": 0.701447231936067,
      "learning_rate": 5e-06,
      "loss": 0.6739,
      "step": 370
    },
    {
      "epoch": 1.2372812372812372,
      "grad_norm": 0.6286463507729596,
      "learning_rate": 5e-06,
      "loss": 0.6738,
      "step": 380
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 0.678099904425436,
      "learning_rate": 5e-06,
      "loss": 0.6775,
      "step": 390
    },
    {
      "epoch": 1.3024013024013024,
      "grad_norm": 0.7402015170342834,
      "learning_rate": 5e-06,
      "loss": 0.6755,
      "step": 400
    },
    {
      "epoch": 1.334961334961335,
      "grad_norm": 0.8963076513215479,
      "learning_rate": 5e-06,
      "loss": 0.6747,
      "step": 410
    },
    {
      "epoch": 1.3675213675213675,
      "grad_norm": 0.758606230124057,
      "learning_rate": 5e-06,
      "loss": 0.6795,
      "step": 420
    },
    {
      "epoch": 1.4000814000814001,
      "grad_norm": 0.755225029704983,
      "learning_rate": 5e-06,
      "loss": 0.6782,
      "step": 430
    },
    {
      "epoch": 1.4326414326414327,
      "grad_norm": 0.9296144227043265,
      "learning_rate": 5e-06,
      "loss": 0.6706,
      "step": 440
    },
    {
      "epoch": 1.4652014652014653,
      "grad_norm": 0.6465031575836382,
      "learning_rate": 5e-06,
      "loss": 0.6708,
      "step": 450
    },
    {
      "epoch": 1.4977614977614977,
      "grad_norm": 0.5928157369911277,
      "learning_rate": 5e-06,
      "loss": 0.6763,
      "step": 460
    },
    {
      "epoch": 1.5303215303215303,
      "grad_norm": 0.6210442459071823,
      "learning_rate": 5e-06,
      "loss": 0.6749,
      "step": 470
    },
    {
      "epoch": 1.5628815628815629,
      "grad_norm": 0.6163699649091662,
      "learning_rate": 5e-06,
      "loss": 0.6755,
      "step": 480
    },
    {
      "epoch": 1.5954415954415955,
      "grad_norm": 0.8040173316442683,
      "learning_rate": 5e-06,
      "loss": 0.6704,
      "step": 490
    },
    {
      "epoch": 1.6280016280016278,
      "grad_norm": 0.6887993451391516,
      "learning_rate": 5e-06,
      "loss": 0.6701,
      "step": 500
    },
    {
      "epoch": 1.6605616605616604,
      "grad_norm": 0.6197281649463939,
      "learning_rate": 5e-06,
      "loss": 0.6726,
      "step": 510
    },
    {
      "epoch": 1.693121693121693,
      "grad_norm": 0.619478860918107,
      "learning_rate": 5e-06,
      "loss": 0.6751,
      "step": 520
    },
    {
      "epoch": 1.7256817256817256,
      "grad_norm": 0.6773051427286838,
      "learning_rate": 5e-06,
      "loss": 0.6641,
      "step": 530
    },
    {
      "epoch": 1.7582417582417582,
      "grad_norm": 0.6286720338866559,
      "learning_rate": 5e-06,
      "loss": 0.6796,
      "step": 540
    },
    {
      "epoch": 1.7908017908017908,
      "grad_norm": 0.7533774989219207,
      "learning_rate": 5e-06,
      "loss": 0.672,
      "step": 550
    },
    {
      "epoch": 1.8233618233618234,
      "grad_norm": 0.7731184689615994,
      "learning_rate": 5e-06,
      "loss": 0.6659,
      "step": 560
    },
    {
      "epoch": 1.855921855921856,
      "grad_norm": 0.7416671731262793,
      "learning_rate": 5e-06,
      "loss": 0.6746,
      "step": 570
    },
    {
      "epoch": 1.8884818884818886,
      "grad_norm": 0.6128076594680967,
      "learning_rate": 5e-06,
      "loss": 0.6716,
      "step": 580
    },
    {
      "epoch": 1.9210419210419212,
      "grad_norm": 0.7891628980046747,
      "learning_rate": 5e-06,
      "loss": 0.6697,
      "step": 590
    },
    {
      "epoch": 1.9536019536019538,
      "grad_norm": 0.6796937767570254,
      "learning_rate": 5e-06,
      "loss": 0.6651,
      "step": 600
    },
    {
      "epoch": 1.9861619861619861,
      "grad_norm": 0.6743284971968594,
      "learning_rate": 5e-06,
      "loss": 0.6701,
      "step": 610
    },
    {
      "epoch": 1.999185999185999,
      "eval_loss": 0.711691677570343,
      "eval_runtime": 324.9391,
      "eval_samples_per_second": 25.466,
      "eval_steps_per_second": 0.4,
      "step": 614
    },
    {
      "epoch": 2.0187220187220185,
      "grad_norm": 1.025163234157577,
      "learning_rate": 5e-06,
      "loss": 0.6947,
      "step": 620
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.8628252708559125,
      "learning_rate": 5e-06,
      "loss": 0.6187,
      "step": 630
    },
    {
      "epoch": 2.0838420838420837,
      "grad_norm": 0.6783100628721863,
      "learning_rate": 5e-06,
      "loss": 0.6179,
      "step": 640
    },
    {
      "epoch": 2.1164021164021163,
      "grad_norm": 0.6421565201150183,
      "learning_rate": 5e-06,
      "loss": 0.6162,
      "step": 650
    },
    {
      "epoch": 2.148962148962149,
      "grad_norm": 0.685352763219904,
      "learning_rate": 5e-06,
      "loss": 0.6177,
      "step": 660
    },
    {
      "epoch": 2.1815221815221815,
      "grad_norm": 0.6061382299294098,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 670
    },
    {
      "epoch": 2.214082214082214,
      "grad_norm": 0.690472583201057,
      "learning_rate": 5e-06,
      "loss": 0.6157,
      "step": 680
    },
    {
      "epoch": 2.2466422466422467,
      "grad_norm": 0.627437676785234,
      "learning_rate": 5e-06,
      "loss": 0.6187,
      "step": 690
    },
    {
      "epoch": 2.2792022792022792,
      "grad_norm": 0.6938080734685778,
      "learning_rate": 5e-06,
      "loss": 0.6226,
      "step": 700
    },
    {
      "epoch": 2.311762311762312,
      "grad_norm": 0.772959190894534,
      "learning_rate": 5e-06,
      "loss": 0.6182,
      "step": 710
    },
    {
      "epoch": 2.3443223443223444,
      "grad_norm": 0.5713521350519779,
      "learning_rate": 5e-06,
      "loss": 0.6213,
      "step": 720
    },
    {
      "epoch": 2.376882376882377,
      "grad_norm": 0.6443040936760203,
      "learning_rate": 5e-06,
      "loss": 0.6224,
      "step": 730
    },
    {
      "epoch": 2.4094424094424096,
      "grad_norm": 0.5889564557828441,
      "learning_rate": 5e-06,
      "loss": 0.6203,
      "step": 740
    },
    {
      "epoch": 2.442002442002442,
      "grad_norm": 0.709826472700304,
      "learning_rate": 5e-06,
      "loss": 0.6193,
      "step": 750
    },
    {
      "epoch": 2.4745624745624744,
      "grad_norm": 0.7335788363502472,
      "learning_rate": 5e-06,
      "loss": 0.623,
      "step": 760
    },
    {
      "epoch": 2.5071225071225074,
      "grad_norm": 0.6283405015720556,
      "learning_rate": 5e-06,
      "loss": 0.6188,
      "step": 770
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.6952325423084712,
      "learning_rate": 5e-06,
      "loss": 0.6209,
      "step": 780
    },
    {
      "epoch": 2.572242572242572,
      "grad_norm": 0.6559620535420857,
      "learning_rate": 5e-06,
      "loss": 0.6201,
      "step": 790
    },
    {
      "epoch": 2.6048026048026047,
      "grad_norm": 0.7118809496834119,
      "learning_rate": 5e-06,
      "loss": 0.6198,
      "step": 800
    },
    {
      "epoch": 2.6373626373626373,
      "grad_norm": 0.7214373132810955,
      "learning_rate": 5e-06,
      "loss": 0.621,
      "step": 810
    },
    {
      "epoch": 2.66992266992267,
      "grad_norm": 0.6415485259710608,
      "learning_rate": 5e-06,
      "loss": 0.6229,
      "step": 820
    },
    {
      "epoch": 2.7024827024827025,
      "grad_norm": 0.5655240384143257,
      "learning_rate": 5e-06,
      "loss": 0.623,
      "step": 830
    },
    {
      "epoch": 2.735042735042735,
      "grad_norm": 0.7379491136681863,
      "learning_rate": 5e-06,
      "loss": 0.6204,
      "step": 840
    },
    {
      "epoch": 2.7676027676027677,
      "grad_norm": 0.6816392250287217,
      "learning_rate": 5e-06,
      "loss": 0.6213,
      "step": 850
    },
    {
      "epoch": 2.8001628001628003,
      "grad_norm": 0.6788149050134666,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 860
    },
    {
      "epoch": 2.832722832722833,
      "grad_norm": 0.5660568888358906,
      "learning_rate": 5e-06,
      "loss": 0.6211,
      "step": 870
    },
    {
      "epoch": 2.8652828652828655,
      "grad_norm": 0.59814839030772,
      "learning_rate": 5e-06,
      "loss": 0.6172,
      "step": 880
    },
    {
      "epoch": 2.8978428978428976,
      "grad_norm": 0.7047473448081863,
      "learning_rate": 5e-06,
      "loss": 0.6207,
      "step": 890
    },
    {
      "epoch": 2.9304029304029307,
      "grad_norm": 0.9367887506446253,
      "learning_rate": 5e-06,
      "loss": 0.6287,
      "step": 900
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.5836189827444674,
      "learning_rate": 5e-06,
      "loss": 0.6189,
      "step": 910
    },
    {
      "epoch": 2.9955229955229954,
      "grad_norm": 0.8008103377337297,
      "learning_rate": 5e-06,
      "loss": 0.6223,
      "step": 920
    },
    {
      "epoch": 2.998778998778999,
      "eval_loss": 0.7124439477920532,
      "eval_runtime": 324.9155,
      "eval_samples_per_second": 25.468,
      "eval_steps_per_second": 0.4,
      "step": 921
    },
    {
      "epoch": 2.998778998778999,
      "step": 921,
      "total_flos": 1542499923394560.0,
      "train_loss": 0.6911445101490498,
      "train_runtime": 53981.7348,
      "train_samples_per_second": 8.737,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 921,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1542499923394560.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}