| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9988536492166604, |
| "eval_steps": 500, |
| "global_step": 981, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.030569354222392053, |
| "grad_norm": 2.5210809779069177, |
| "learning_rate": 5e-06, |
| "loss": 1.0377, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.061138708444784105, |
| "grad_norm": 8.843402948999215, |
| "learning_rate": 5e-06, |
| "loss": 0.9357, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09170806266717615, |
| "grad_norm": 5.905493309213603, |
| "learning_rate": 5e-06, |
| "loss": 0.8957, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.12227741688956821, |
| "grad_norm": 1.8539641142842107, |
| "learning_rate": 5e-06, |
| "loss": 0.8741, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.15284677111196027, |
| "grad_norm": 2.070310187410454, |
| "learning_rate": 5e-06, |
| "loss": 0.8547, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1834161253343523, |
| "grad_norm": 1.6459728758706702, |
| "learning_rate": 5e-06, |
| "loss": 0.8415, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.21398547955674435, |
| "grad_norm": 1.0031157681260168, |
| "learning_rate": 5e-06, |
| "loss": 0.8256, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.24455483377913642, |
| "grad_norm": 1.1608879789418929, |
| "learning_rate": 5e-06, |
| "loss": 0.8178, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2751241880015285, |
| "grad_norm": 0.8809818540535138, |
| "learning_rate": 5e-06, |
| "loss": 0.8075, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.30569354222392053, |
| "grad_norm": 0.6485177640239032, |
| "learning_rate": 5e-06, |
| "loss": 0.7984, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3362628964463126, |
| "grad_norm": 0.6091087407506572, |
| "learning_rate": 5e-06, |
| "loss": 0.7954, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3668322506687046, |
| "grad_norm": 0.6370713623745252, |
| "learning_rate": 5e-06, |
| "loss": 0.7896, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.39740160489109666, |
| "grad_norm": 0.9042512916702462, |
| "learning_rate": 5e-06, |
| "loss": 0.7887, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4279709591134887, |
| "grad_norm": 0.6657970582129965, |
| "learning_rate": 5e-06, |
| "loss": 0.7875, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4585403133358808, |
| "grad_norm": 0.6623117082135378, |
| "learning_rate": 5e-06, |
| "loss": 0.7829, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.48910966755827284, |
| "grad_norm": 0.8758486633363365, |
| "learning_rate": 5e-06, |
| "loss": 0.7835, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5196790217806648, |
| "grad_norm": 0.687987022965735, |
| "learning_rate": 5e-06, |
| "loss": 0.7774, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.550248376003057, |
| "grad_norm": 0.5715159943996615, |
| "learning_rate": 5e-06, |
| "loss": 0.7757, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.580817730225449, |
| "grad_norm": 0.6235956778260581, |
| "learning_rate": 5e-06, |
| "loss": 0.7738, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6113870844478411, |
| "grad_norm": 0.625289072408262, |
| "learning_rate": 5e-06, |
| "loss": 0.7785, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6419564386702331, |
| "grad_norm": 0.8374831890605251, |
| "learning_rate": 5e-06, |
| "loss": 0.773, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6725257928926252, |
| "grad_norm": 0.7439050014272415, |
| "learning_rate": 5e-06, |
| "loss": 0.7733, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7030951471150172, |
| "grad_norm": 0.8205921908676899, |
| "learning_rate": 5e-06, |
| "loss": 0.7696, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7336645013374092, |
| "grad_norm": 0.5336955969561407, |
| "learning_rate": 5e-06, |
| "loss": 0.7705, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7642338555598013, |
| "grad_norm": 0.8042702021993413, |
| "learning_rate": 5e-06, |
| "loss": 0.7633, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7948032097821933, |
| "grad_norm": 0.7443163831237201, |
| "learning_rate": 5e-06, |
| "loss": 0.7642, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8253725640045854, |
| "grad_norm": 0.7100151672475864, |
| "learning_rate": 5e-06, |
| "loss": 0.7672, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8559419182269774, |
| "grad_norm": 0.8122666871045187, |
| "learning_rate": 5e-06, |
| "loss": 0.7658, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8865112724493696, |
| "grad_norm": 0.9650852170253817, |
| "learning_rate": 5e-06, |
| "loss": 0.761, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9170806266717616, |
| "grad_norm": 0.9066342061905646, |
| "learning_rate": 5e-06, |
| "loss": 0.7623, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9476499808941536, |
| "grad_norm": 0.8736567540109669, |
| "learning_rate": 5e-06, |
| "loss": 0.7605, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9782193351165457, |
| "grad_norm": 0.581848021872034, |
| "learning_rate": 5e-06, |
| "loss": 0.759, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9996178830722201, |
| "eval_loss": 0.7565266489982605, |
| "eval_runtime": 348.0672, |
| "eval_samples_per_second": 25.323, |
| "eval_steps_per_second": 0.396, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.0087886893389377, |
| "grad_norm": 1.2716473505505381, |
| "learning_rate": 5e-06, |
| "loss": 0.8102, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.0393580435613297, |
| "grad_norm": 0.6878381986732218, |
| "learning_rate": 5e-06, |
| "loss": 0.7133, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.0699273977837218, |
| "grad_norm": 0.6062487967329377, |
| "learning_rate": 5e-06, |
| "loss": 0.7114, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.100496752006114, |
| "grad_norm": 0.6358424479273052, |
| "learning_rate": 5e-06, |
| "loss": 0.7172, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.131066106228506, |
| "grad_norm": 0.6758954139263628, |
| "learning_rate": 5e-06, |
| "loss": 0.7155, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.161635460450898, |
| "grad_norm": 0.6698110186698876, |
| "learning_rate": 5e-06, |
| "loss": 0.7112, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.19220481467329, |
| "grad_norm": 0.7109888242262707, |
| "learning_rate": 5e-06, |
| "loss": 0.717, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.2227741688956821, |
| "grad_norm": 0.8395391271484969, |
| "learning_rate": 5e-06, |
| "loss": 0.7124, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.253343523118074, |
| "grad_norm": 0.7312759566081071, |
| "learning_rate": 5e-06, |
| "loss": 0.7118, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2839128773404662, |
| "grad_norm": 0.7632165886503746, |
| "learning_rate": 5e-06, |
| "loss": 0.7158, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.3144822315628581, |
| "grad_norm": 0.6457766924924516, |
| "learning_rate": 5e-06, |
| "loss": 0.7125, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.3450515857852503, |
| "grad_norm": 0.6072207383876659, |
| "learning_rate": 5e-06, |
| "loss": 0.7073, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3756209400076425, |
| "grad_norm": 0.7504686700687067, |
| "learning_rate": 5e-06, |
| "loss": 0.7126, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4061902942300344, |
| "grad_norm": 0.5951557615102034, |
| "learning_rate": 5e-06, |
| "loss": 0.7153, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.4367596484524263, |
| "grad_norm": 0.7769347892834471, |
| "learning_rate": 5e-06, |
| "loss": 0.7121, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.4673290026748185, |
| "grad_norm": 0.6602990935189074, |
| "learning_rate": 5e-06, |
| "loss": 0.7109, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4978983568972106, |
| "grad_norm": 0.6128725764352912, |
| "learning_rate": 5e-06, |
| "loss": 0.7107, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.5284677111196026, |
| "grad_norm": 0.6273289786548926, |
| "learning_rate": 5e-06, |
| "loss": 0.7163, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.5590370653419945, |
| "grad_norm": 0.7489331538468206, |
| "learning_rate": 5e-06, |
| "loss": 0.7089, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.5896064195643866, |
| "grad_norm": 0.5681605011723666, |
| "learning_rate": 5e-06, |
| "loss": 0.7095, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.6201757737867788, |
| "grad_norm": 0.5985303729183026, |
| "learning_rate": 5e-06, |
| "loss": 0.7121, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.650745128009171, |
| "grad_norm": 0.797108834271602, |
| "learning_rate": 5e-06, |
| "loss": 0.7076, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.6813144822315629, |
| "grad_norm": 0.7153053885200517, |
| "learning_rate": 5e-06, |
| "loss": 0.7122, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.7118838364539548, |
| "grad_norm": 0.6550919899983794, |
| "learning_rate": 5e-06, |
| "loss": 0.7081, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.742453190676347, |
| "grad_norm": 0.518606008941313, |
| "learning_rate": 5e-06, |
| "loss": 0.7079, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.7730225448987391, |
| "grad_norm": 0.5838363697325892, |
| "learning_rate": 5e-06, |
| "loss": 0.7099, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.803591899121131, |
| "grad_norm": 0.7286231789681924, |
| "learning_rate": 5e-06, |
| "loss": 0.7114, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.834161253343523, |
| "grad_norm": 0.5756454212301187, |
| "learning_rate": 5e-06, |
| "loss": 0.7097, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.8647306075659151, |
| "grad_norm": 0.6043070572041148, |
| "learning_rate": 5e-06, |
| "loss": 0.7088, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.8952999617883073, |
| "grad_norm": 0.6701129239334284, |
| "learning_rate": 5e-06, |
| "loss": 0.7102, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.9258693160106994, |
| "grad_norm": 0.682232504464511, |
| "learning_rate": 5e-06, |
| "loss": 0.7065, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.9564386702330914, |
| "grad_norm": 0.7372520465078513, |
| "learning_rate": 5e-06, |
| "loss": 0.7075, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.9870080244554833, |
| "grad_norm": 0.643870845568192, |
| "learning_rate": 5e-06, |
| "loss": 0.7082, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.9992357661444402, |
| "eval_loss": 0.7433667182922363, |
| "eval_runtime": 347.9317, |
| "eval_samples_per_second": 25.333, |
| "eval_steps_per_second": 0.397, |
| "step": 654 |
| }, |
| { |
| "epoch": 2.0175773786778755, |
| "grad_norm": 0.8491803882899337, |
| "learning_rate": 5e-06, |
| "loss": 0.7384, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.0481467329002676, |
| "grad_norm": 0.6533686347265598, |
| "learning_rate": 5e-06, |
| "loss": 0.6584, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.0787160871226593, |
| "grad_norm": 0.7193242535677702, |
| "learning_rate": 5e-06, |
| "loss": 0.6626, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.1092854413450515, |
| "grad_norm": 0.6207686304604076, |
| "learning_rate": 5e-06, |
| "loss": 0.6584, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.1398547955674436, |
| "grad_norm": 0.8022673797865767, |
| "learning_rate": 5e-06, |
| "loss": 0.6637, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.1704241497898358, |
| "grad_norm": 0.7500169227153624, |
| "learning_rate": 5e-06, |
| "loss": 0.664, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.200993504012228, |
| "grad_norm": 0.8712969762773869, |
| "learning_rate": 5e-06, |
| "loss": 0.6639, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.2315628582346196, |
| "grad_norm": 0.8613714358485228, |
| "learning_rate": 5e-06, |
| "loss": 0.6644, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.262132212457012, |
| "grad_norm": 0.947467641650837, |
| "learning_rate": 5e-06, |
| "loss": 0.6597, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.292701566679404, |
| "grad_norm": 0.6868607018777386, |
| "learning_rate": 5e-06, |
| "loss": 0.6649, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.323270920901796, |
| "grad_norm": 0.6251862332352117, |
| "learning_rate": 5e-06, |
| "loss": 0.6659, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.353840275124188, |
| "grad_norm": 0.5757269555166172, |
| "learning_rate": 5e-06, |
| "loss": 0.663, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.38440962934658, |
| "grad_norm": 0.584698830183716, |
| "learning_rate": 5e-06, |
| "loss": 0.6615, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.414978983568972, |
| "grad_norm": 0.6077791824133885, |
| "learning_rate": 5e-06, |
| "loss": 0.6668, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.4455483377913643, |
| "grad_norm": 0.760997414953586, |
| "learning_rate": 5e-06, |
| "loss": 0.6652, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.476117692013756, |
| "grad_norm": 0.8057818067666922, |
| "learning_rate": 5e-06, |
| "loss": 0.6658, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.506687046236148, |
| "grad_norm": 0.7538900658925544, |
| "learning_rate": 5e-06, |
| "loss": 0.6652, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.5372564004585403, |
| "grad_norm": 0.5345492924904168, |
| "learning_rate": 5e-06, |
| "loss": 0.6667, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.5678257546809324, |
| "grad_norm": 0.631444604730353, |
| "learning_rate": 5e-06, |
| "loss": 0.6675, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.5983951089033246, |
| "grad_norm": 0.5462915586952234, |
| "learning_rate": 5e-06, |
| "loss": 0.6645, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.6289644631257163, |
| "grad_norm": 0.54738098576822, |
| "learning_rate": 5e-06, |
| "loss": 0.6632, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.6595338173481085, |
| "grad_norm": 0.5479245307940317, |
| "learning_rate": 5e-06, |
| "loss": 0.668, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.6901031715705006, |
| "grad_norm": 0.573639898499825, |
| "learning_rate": 5e-06, |
| "loss": 0.668, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.7206725257928928, |
| "grad_norm": 0.7814076044018249, |
| "learning_rate": 5e-06, |
| "loss": 0.6673, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.751241880015285, |
| "grad_norm": 0.7321434596516128, |
| "learning_rate": 5e-06, |
| "loss": 0.6682, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.7818112342376766, |
| "grad_norm": 0.6195747454218001, |
| "learning_rate": 5e-06, |
| "loss": 0.6654, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.8123805884600688, |
| "grad_norm": 0.6928375671510486, |
| "learning_rate": 5e-06, |
| "loss": 0.6644, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.842949942682461, |
| "grad_norm": 0.6500798961347346, |
| "learning_rate": 5e-06, |
| "loss": 0.6662, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.8735192969048526, |
| "grad_norm": 0.6034455141876413, |
| "learning_rate": 5e-06, |
| "loss": 0.6663, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.904088651127245, |
| "grad_norm": 0.6232438975926966, |
| "learning_rate": 5e-06, |
| "loss": 0.6658, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.934658005349637, |
| "grad_norm": 0.6210983063190099, |
| "learning_rate": 5e-06, |
| "loss": 0.667, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.965227359572029, |
| "grad_norm": 0.5251770576443071, |
| "learning_rate": 5e-06, |
| "loss": 0.6669, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.9957967137944213, |
| "grad_norm": 0.7539461515741752, |
| "learning_rate": 5e-06, |
| "loss": 0.6679, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.9988536492166604, |
| "eval_loss": 0.7435035109519958, |
| "eval_runtime": 349.5805, |
| "eval_samples_per_second": 25.213, |
| "eval_steps_per_second": 0.395, |
| "step": 981 |
| }, |
| { |
| "epoch": 2.9988536492166604, |
| "step": 981, |
| "total_flos": 1643002158120960.0, |
| "train_loss": 0.7275239749907475, |
| "train_runtime": 58192.229, |
| "train_samples_per_second": 8.633, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 981, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1643002158120960.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|