{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 6.393901263952026, "learning_rate": 5e-06, "loss": 1.1278, "step": 10 }, { "epoch": 0.05, "grad_norm": 1.705956397184389, "learning_rate": 5e-06, "loss": 1.0093, "step": 20 }, { "epoch": 0.075, "grad_norm": 1.68971140105189, "learning_rate": 5e-06, "loss": 0.9776, "step": 30 }, { "epoch": 0.1, "grad_norm": 2.2896479687833042, "learning_rate": 5e-06, "loss": 0.9461, "step": 40 }, { "epoch": 0.125, "grad_norm": 2.337387810419128, "learning_rate": 5e-06, "loss": 0.9341, "step": 50 }, { "epoch": 0.15, "grad_norm": 1.882007339408848, "learning_rate": 5e-06, "loss": 0.9133, "step": 60 }, { "epoch": 0.175, "grad_norm": 1.0405536021297026, "learning_rate": 5e-06, "loss": 0.9034, "step": 70 }, { "epoch": 0.2, "grad_norm": 1.1273905617480575, "learning_rate": 5e-06, "loss": 0.8946, "step": 80 }, { "epoch": 0.225, "grad_norm": 0.8899956341459144, "learning_rate": 5e-06, "loss": 0.8897, "step": 90 }, { "epoch": 0.25, "grad_norm": 0.7569552721791574, "learning_rate": 5e-06, "loss": 0.8838, "step": 100 }, { "epoch": 0.275, "grad_norm": 0.775187782225504, "learning_rate": 5e-06, "loss": 0.8804, "step": 110 }, { "epoch": 0.3, "grad_norm": 0.7058567462838982, "learning_rate": 5e-06, "loss": 0.8742, "step": 120 }, { "epoch": 0.325, "grad_norm": 1.2857109201084875, "learning_rate": 5e-06, "loss": 0.8689, "step": 130 }, { "epoch": 0.35, "grad_norm": 0.833965423282999, "learning_rate": 5e-06, "loss": 0.8664, "step": 140 }, { "epoch": 0.375, "grad_norm": 0.8452787015074426, "learning_rate": 5e-06, "loss": 0.8647, "step": 150 }, { "epoch": 0.4, "grad_norm": 0.6905100077591924, "learning_rate": 5e-06, "loss": 0.8614, "step": 160 }, { "epoch": 0.425, "grad_norm": 0.6534167059133549, "learning_rate": 5e-06, "loss": 0.8591, "step": 170 }, { "epoch": 0.45, "grad_norm": 0.7163070342801734, "learning_rate": 5e-06, "loss": 0.8538, "step": 180 }, { "epoch": 0.475, "grad_norm": 0.8379014374017074, "learning_rate": 5e-06, "loss": 0.8519, "step": 190 }, { "epoch": 0.5, "grad_norm": 0.7058002653208569, "learning_rate": 5e-06, "loss": 0.8519, "step": 200 }, { "epoch": 0.525, "grad_norm": 0.7433143437252816, "learning_rate": 5e-06, "loss": 0.8486, "step": 210 }, { "epoch": 0.55, "grad_norm": 0.7600103941580408, "learning_rate": 5e-06, "loss": 0.844, "step": 220 }, { "epoch": 0.575, "grad_norm": 0.7419761211751795, "learning_rate": 5e-06, "loss": 0.8447, "step": 230 }, { "epoch": 0.6, "grad_norm": 0.6020153457959969, "learning_rate": 5e-06, "loss": 0.8448, "step": 240 }, { "epoch": 0.625, "grad_norm": 0.5834136183612872, "learning_rate": 5e-06, "loss": 0.8415, "step": 250 }, { "epoch": 0.65, "grad_norm": 0.6861809869702314, "learning_rate": 5e-06, "loss": 0.8412, "step": 260 }, { "epoch": 0.675, "grad_norm": 0.6424298466812792, "learning_rate": 5e-06, "loss": 0.8446, "step": 270 }, { "epoch": 0.7, "grad_norm": 0.7062264581938785, "learning_rate": 5e-06, "loss": 0.8371, "step": 280 }, { "epoch": 0.725, "grad_norm": 0.7413013325508084, "learning_rate": 5e-06, "loss": 0.8368, "step": 290 }, { "epoch": 0.75, "grad_norm": 0.6755467656899127, "learning_rate": 5e-06, "loss": 0.8384, "step": 300 }, { "epoch": 0.775, "grad_norm": 0.7186164830975353, "learning_rate": 5e-06, "loss": 0.8346, "step": 310 }, { "epoch": 0.8, "grad_norm": 0.5707506171843091, "learning_rate": 5e-06, "loss": 0.8331, "step": 320 }, { "epoch": 0.825, "grad_norm": 0.9112862077459049, "learning_rate": 5e-06, "loss": 0.8324, "step": 330 }, { "epoch": 0.85, "grad_norm": 0.6828679795547313, "learning_rate": 5e-06, "loss": 0.8354, "step": 340 }, { "epoch": 0.875, "grad_norm": 0.6927842206009195, "learning_rate": 5e-06, "loss": 0.829, "step": 350 }, { "epoch": 0.9, "grad_norm": 0.6008408676809543, "learning_rate": 5e-06, "loss": 0.8307, "step": 360 }, { "epoch": 0.925, "grad_norm": 0.6206381764947855, "learning_rate": 5e-06, "loss": 0.8267, "step": 370 }, { "epoch": 0.95, "grad_norm": 0.5454612469710348, "learning_rate": 5e-06, "loss": 0.8247, "step": 380 }, { "epoch": 0.975, "grad_norm": 0.5220893940655968, "learning_rate": 5e-06, "loss": 0.8256, "step": 390 }, { "epoch": 1.0, "grad_norm": 0.5672238225320995, "learning_rate": 5e-06, "loss": 0.8272, "step": 400 }, { "epoch": 1.0, "eval_loss": 0.8262304067611694, "eval_runtime": 283.5953, "eval_samples_per_second": 38.001, "eval_steps_per_second": 0.596, "step": 400 }, { "epoch": 1.025, "grad_norm": 1.1102286013620262, "learning_rate": 5e-06, "loss": 0.7842, "step": 410 }, { "epoch": 1.05, "grad_norm": 0.7607811738450544, "learning_rate": 5e-06, "loss": 0.7849, "step": 420 }, { "epoch": 1.075, "grad_norm": 0.7653482131592415, "learning_rate": 5e-06, "loss": 0.7849, "step": 430 }, { "epoch": 1.1, "grad_norm": 0.705031839749241, "learning_rate": 5e-06, "loss": 0.779, "step": 440 }, { "epoch": 1.125, "grad_norm": 0.5370550513294627, "learning_rate": 5e-06, "loss": 0.7831, "step": 450 }, { "epoch": 1.15, "grad_norm": 0.5298114725731297, "learning_rate": 5e-06, "loss": 0.7818, "step": 460 }, { "epoch": 1.175, "grad_norm": 0.6734194289163318, "learning_rate": 5e-06, "loss": 0.7772, "step": 470 }, { "epoch": 1.2, "grad_norm": 0.5897636893907213, "learning_rate": 5e-06, "loss": 0.7813, "step": 480 }, { "epoch": 1.225, "grad_norm": 0.604266980297563, "learning_rate": 5e-06, "loss": 0.7836, "step": 490 }, { "epoch": 1.25, "grad_norm": 0.6714899816376678, "learning_rate": 5e-06, "loss": 0.7801, "step": 500 }, { "epoch": 1.275, "grad_norm": 0.8168625249318959, "learning_rate": 5e-06, "loss": 0.7806, "step": 510 }, { "epoch": 1.3, "grad_norm": 0.7932681977775804, "learning_rate": 5e-06, "loss": 0.7776, "step": 520 }, { "epoch": 1.325, "grad_norm": 0.8862643278939233, "learning_rate": 5e-06, "loss": 0.7796, "step": 530 }, { "epoch": 1.35, "grad_norm": 0.7421035460384174, "learning_rate": 5e-06, "loss": 0.7823, "step": 540 }, { "epoch": 1.375, "grad_norm": 0.5962649233174171, "learning_rate": 5e-06, "loss": 0.7783, "step": 550 }, { "epoch": 1.4, "grad_norm": 0.9067010653184415, "learning_rate": 5e-06, "loss": 0.7755, "step": 560 }, { "epoch": 1.425, "grad_norm": 0.9272486884932043, "learning_rate": 5e-06, "loss": 0.7811, "step": 570 }, { "epoch": 1.45, "grad_norm": 0.634573401992294, "learning_rate": 5e-06, "loss": 0.7783, "step": 580 }, { "epoch": 1.475, "grad_norm": 0.7647615867543145, "learning_rate": 5e-06, "loss": 0.7789, "step": 590 }, { "epoch": 1.5, "grad_norm": 0.7186976301425001, "learning_rate": 5e-06, "loss": 0.7732, "step": 600 }, { "epoch": 1.525, "grad_norm": 0.6741640245568876, "learning_rate": 5e-06, "loss": 0.7803, "step": 610 }, { "epoch": 1.55, "grad_norm": 0.7673936498272227, "learning_rate": 5e-06, "loss": 0.7771, "step": 620 }, { "epoch": 1.575, "grad_norm": 0.6762446070662301, "learning_rate": 5e-06, "loss": 0.7781, "step": 630 }, { "epoch": 1.6, "grad_norm": 0.694582981661087, "learning_rate": 5e-06, "loss": 0.7803, "step": 640 }, { "epoch": 1.625, "grad_norm": 0.6911825951192573, "learning_rate": 5e-06, "loss": 0.7755, "step": 650 }, { "epoch": 1.65, "grad_norm": 0.7214035778878081, "learning_rate": 5e-06, "loss": 0.7784, "step": 660 }, { "epoch": 1.675, "grad_norm": 0.565532898038658, "learning_rate": 5e-06, "loss": 0.7798, "step": 670 }, { "epoch": 1.7, "grad_norm": 0.7144730054018653, "learning_rate": 5e-06, "loss": 0.7779, "step": 680 }, { "epoch": 1.725, "grad_norm": 0.68929751507137, "learning_rate": 5e-06, "loss": 0.7743, "step": 690 }, { "epoch": 1.75, "grad_norm": 0.6232088446530882, "learning_rate": 5e-06, "loss": 0.7764, "step": 700 }, { "epoch": 1.775, "grad_norm": 0.5516916387958084, "learning_rate": 5e-06, "loss": 0.7743, "step": 710 }, { "epoch": 1.8, "grad_norm": 0.6556060585205762, "learning_rate": 5e-06, "loss": 0.7795, "step": 720 }, { "epoch": 1.825, "grad_norm": 0.6395664223230667, "learning_rate": 5e-06, "loss": 0.7727, "step": 730 }, { "epoch": 1.85, "grad_norm": 0.7264470243374237, "learning_rate": 5e-06, "loss": 0.7716, "step": 740 }, { "epoch": 1.875, "grad_norm": 0.7348089904512866, "learning_rate": 5e-06, "loss": 0.7759, "step": 750 }, { "epoch": 1.9, "grad_norm": 0.5362704236504767, "learning_rate": 5e-06, "loss": 0.7725, "step": 760 }, { "epoch": 1.925, "grad_norm": 0.6132209869820257, "learning_rate": 5e-06, "loss": 0.7746, "step": 770 }, { "epoch": 1.95, "grad_norm": 0.5674492579544563, "learning_rate": 5e-06, "loss": 0.7762, "step": 780 }, { "epoch": 1.975, "grad_norm": 0.8144733224752085, "learning_rate": 5e-06, "loss": 0.771, "step": 790 }, { "epoch": 2.0, "grad_norm": 0.6438437017098873, "learning_rate": 5e-06, "loss": 0.7753, "step": 800 }, { "epoch": 2.0, "eval_loss": 0.8092445731163025, "eval_runtime": 279.3948, "eval_samples_per_second": 38.573, "eval_steps_per_second": 0.605, "step": 800 }, { "epoch": 2.025, "grad_norm": 0.741980442805714, "learning_rate": 5e-06, "loss": 0.731, "step": 810 }, { "epoch": 2.05, "grad_norm": 0.6348046724923234, "learning_rate": 5e-06, "loss": 0.7254, "step": 820 }, { "epoch": 2.075, "grad_norm": 0.6998662875286445, "learning_rate": 5e-06, "loss": 0.722, "step": 830 }, { "epoch": 2.1, "grad_norm": 0.6662824597359785, "learning_rate": 5e-06, "loss": 0.7251, "step": 840 }, { "epoch": 2.125, "grad_norm": 0.5887524561855076, "learning_rate": 5e-06, "loss": 0.7329, "step": 850 }, { "epoch": 2.15, "grad_norm": 0.773871523528161, "learning_rate": 5e-06, "loss": 0.7282, "step": 860 }, { "epoch": 2.175, "grad_norm": 0.6520605524681873, "learning_rate": 5e-06, "loss": 0.7283, "step": 870 }, { "epoch": 2.2, "grad_norm": 0.6481774807052064, "learning_rate": 5e-06, "loss": 0.7333, "step": 880 }, { "epoch": 2.225, "grad_norm": 0.5980371010517554, "learning_rate": 5e-06, "loss": 0.7307, "step": 890 }, { "epoch": 2.25, "grad_norm": 0.6698106025945666, "learning_rate": 5e-06, "loss": 0.7276, "step": 900 }, { "epoch": 2.275, "grad_norm": 0.6064989242913533, "learning_rate": 5e-06, "loss": 0.7284, "step": 910 }, { "epoch": 2.3, "grad_norm": 0.5694949139445875, "learning_rate": 5e-06, "loss": 0.7287, "step": 920 }, { "epoch": 2.325, "grad_norm": 0.6518761067013964, "learning_rate": 5e-06, "loss": 0.7272, "step": 930 }, { "epoch": 2.35, "grad_norm": 0.6644802771746268, "learning_rate": 5e-06, "loss": 0.7301, "step": 940 }, { "epoch": 2.375, "grad_norm": 0.564358869683258, "learning_rate": 5e-06, "loss": 0.7333, "step": 950 }, { "epoch": 2.4, "grad_norm": 0.6052088075734858, "learning_rate": 5e-06, "loss": 0.7279, "step": 960 }, { "epoch": 2.425, "grad_norm": 0.5588110088845489, "learning_rate": 5e-06, "loss": 0.7307, "step": 970 }, { "epoch": 2.45, "grad_norm": 0.7921721794918313, "learning_rate": 5e-06, "loss": 0.727, "step": 980 }, { "epoch": 2.475, "grad_norm": 0.633397399719071, "learning_rate": 5e-06, "loss": 0.7315, "step": 990 }, { "epoch": 2.5, "grad_norm": 0.5866962766860978, "learning_rate": 5e-06, "loss": 0.732, "step": 1000 }, { "epoch": 2.525, "grad_norm": 0.7798403321881853, "learning_rate": 5e-06, "loss": 0.7301, "step": 1010 }, { "epoch": 2.55, "grad_norm": 0.6522313154237275, "learning_rate": 5e-06, "loss": 0.727, "step": 1020 }, { "epoch": 2.575, "grad_norm": 0.5918752307210033, "learning_rate": 5e-06, "loss": 0.732, "step": 1030 }, { "epoch": 2.6, "grad_norm": 0.7013642379103808, "learning_rate": 5e-06, "loss": 0.7304, "step": 1040 }, { "epoch": 2.625, "grad_norm": 0.6019984161152461, "learning_rate": 5e-06, "loss": 0.7301, "step": 1050 }, { "epoch": 2.65, "grad_norm": 0.6025521914638767, "learning_rate": 5e-06, "loss": 0.7303, "step": 1060 }, { "epoch": 2.675, "grad_norm": 0.8383133859288988, "learning_rate": 5e-06, "loss": 0.7352, "step": 1070 }, { "epoch": 2.7, "grad_norm": 0.7176072744526589, "learning_rate": 5e-06, "loss": 0.7325, "step": 1080 }, { "epoch": 2.725, "grad_norm": 0.6330508627766337, "learning_rate": 5e-06, "loss": 0.7297, "step": 1090 }, { "epoch": 2.75, "grad_norm": 0.708854127664677, "learning_rate": 5e-06, "loss": 0.7302, "step": 1100 }, { "epoch": 2.775, "grad_norm": 0.7080026819402018, "learning_rate": 5e-06, "loss": 0.7338, "step": 1110 }, { "epoch": 2.8, "grad_norm": 0.6486046388056755, "learning_rate": 5e-06, "loss": 0.7333, "step": 1120 }, { "epoch": 2.825, "grad_norm": 0.6098247605462774, "learning_rate": 5e-06, "loss": 0.7368, "step": 1130 }, { "epoch": 2.85, "grad_norm": 0.707595465934189, "learning_rate": 5e-06, "loss": 0.7315, "step": 1140 }, { "epoch": 2.875, "grad_norm": 0.6598068743344158, "learning_rate": 5e-06, "loss": 0.7328, "step": 1150 }, { "epoch": 2.9, "grad_norm": 0.6256793408527994, "learning_rate": 5e-06, "loss": 0.7335, "step": 1160 }, { "epoch": 2.925, "grad_norm": 0.6477061017951039, "learning_rate": 5e-06, "loss": 0.7296, "step": 1170 }, { "epoch": 2.95, "grad_norm": 0.6150717312512685, "learning_rate": 5e-06, "loss": 0.7336, "step": 1180 }, { "epoch": 2.975, "grad_norm": 0.5375035486490041, "learning_rate": 5e-06, "loss": 0.7316, "step": 1190 }, { "epoch": 3.0, "grad_norm": 0.6000906442308059, "learning_rate": 5e-06, "loss": 0.7331, "step": 1200 }, { "epoch": 3.0, "eval_loss": 0.8070117235183716, "eval_runtime": 271.2171, "eval_samples_per_second": 39.736, "eval_steps_per_second": 0.623, "step": 1200 }, { "epoch": 3.0, "step": 1200, "total_flos": 2009835314872320.0, "train_loss": 0.7931587568918864, "train_runtime": 40308.7538, "train_samples_per_second": 15.238, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2009835314872320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }