{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025,
      "grad_norm": 6.393901263952026,
      "learning_rate": 5e-06,
      "loss": 1.1278,
      "step": 10
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.705956397184389,
      "learning_rate": 5e-06,
      "loss": 1.0093,
      "step": 20
    },
    {
      "epoch": 0.075,
      "grad_norm": 1.68971140105189,
      "learning_rate": 5e-06,
      "loss": 0.9776,
      "step": 30
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.2896479687833042,
      "learning_rate": 5e-06,
      "loss": 0.9461,
      "step": 40
    },
    {
      "epoch": 0.125,
      "grad_norm": 2.337387810419128,
      "learning_rate": 5e-06,
      "loss": 0.9341,
      "step": 50
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.882007339408848,
      "learning_rate": 5e-06,
      "loss": 0.9133,
      "step": 60
    },
    {
      "epoch": 0.175,
      "grad_norm": 1.0405536021297026,
      "learning_rate": 5e-06,
      "loss": 0.9034,
      "step": 70
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.1273905617480575,
      "learning_rate": 5e-06,
      "loss": 0.8946,
      "step": 80
    },
    {
      "epoch": 0.225,
      "grad_norm": 0.8899956341459144,
      "learning_rate": 5e-06,
      "loss": 0.8897,
      "step": 90
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.7569552721791574,
      "learning_rate": 5e-06,
      "loss": 0.8838,
      "step": 100
    },
    {
      "epoch": 0.275,
      "grad_norm": 0.775187782225504,
      "learning_rate": 5e-06,
      "loss": 0.8804,
      "step": 110
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.7058567462838982,
      "learning_rate": 5e-06,
      "loss": 0.8742,
      "step": 120
    },
    {
      "epoch": 0.325,
      "grad_norm": 1.2857109201084875,
      "learning_rate": 5e-06,
      "loss": 0.8689,
      "step": 130
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.833965423282999,
      "learning_rate": 5e-06,
      "loss": 0.8664,
      "step": 140
    },
    {
      "epoch": 0.375,
      "grad_norm": 0.8452787015074426,
      "learning_rate": 5e-06,
      "loss": 0.8647,
      "step": 150
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.6905100077591924,
      "learning_rate": 5e-06,
      "loss": 0.8614,
      "step": 160
    },
    {
      "epoch": 0.425,
      "grad_norm": 0.6534167059133549,
      "learning_rate": 5e-06,
      "loss": 0.8591,
      "step": 170
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.7163070342801734,
      "learning_rate": 5e-06,
      "loss": 0.8538,
      "step": 180
    },
    {
      "epoch": 0.475,
      "grad_norm": 0.8379014374017074,
      "learning_rate": 5e-06,
      "loss": 0.8519,
      "step": 190
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.7058002653208569,
      "learning_rate": 5e-06,
      "loss": 0.8519,
      "step": 200
    },
    {
      "epoch": 0.525,
      "grad_norm": 0.7433143437252816,
      "learning_rate": 5e-06,
      "loss": 0.8486,
      "step": 210
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.7600103941580408,
      "learning_rate": 5e-06,
      "loss": 0.844,
      "step": 220
    },
    {
      "epoch": 0.575,
      "grad_norm": 0.7419761211751795,
      "learning_rate": 5e-06,
      "loss": 0.8447,
      "step": 230
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.6020153457959969,
      "learning_rate": 5e-06,
      "loss": 0.8448,
      "step": 240
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.5834136183612872,
      "learning_rate": 5e-06,
      "loss": 0.8415,
      "step": 250
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.6861809869702314,
      "learning_rate": 5e-06,
      "loss": 0.8412,
      "step": 260
    },
    {
      "epoch": 0.675,
      "grad_norm": 0.6424298466812792,
      "learning_rate": 5e-06,
      "loss": 0.8446,
      "step": 270
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.7062264581938785,
      "learning_rate": 5e-06,
      "loss": 0.8371,
      "step": 280
    },
    {
      "epoch": 0.725,
      "grad_norm": 0.7413013325508084,
      "learning_rate": 5e-06,
      "loss": 0.8368,
      "step": 290
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.6755467656899127,
      "learning_rate": 5e-06,
      "loss": 0.8384,
      "step": 300
    },
    {
      "epoch": 0.775,
      "grad_norm": 0.7186164830975353,
      "learning_rate": 5e-06,
      "loss": 0.8346,
      "step": 310
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5707506171843091,
      "learning_rate": 5e-06,
      "loss": 0.8331,
      "step": 320
    },
    {
      "epoch": 0.825,
      "grad_norm": 0.9112862077459049,
      "learning_rate": 5e-06,
      "loss": 0.8324,
      "step": 330
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.6828679795547313,
      "learning_rate": 5e-06,
      "loss": 0.8354,
      "step": 340
    },
    {
      "epoch": 0.875,
      "grad_norm": 0.6927842206009195,
      "learning_rate": 5e-06,
      "loss": 0.829,
      "step": 350
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.6008408676809543,
      "learning_rate": 5e-06,
      "loss": 0.8307,
      "step": 360
    },
    {
      "epoch": 0.925,
      "grad_norm": 0.6206381764947855,
      "learning_rate": 5e-06,
      "loss": 0.8267,
      "step": 370
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.5454612469710348,
      "learning_rate": 5e-06,
      "loss": 0.8247,
      "step": 380
    },
    {
      "epoch": 0.975,
      "grad_norm": 0.5220893940655968,
      "learning_rate": 5e-06,
      "loss": 0.8256,
      "step": 390
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5672238225320995,
      "learning_rate": 5e-06,
      "loss": 0.8272,
      "step": 400
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.8262304067611694,
      "eval_runtime": 283.5953,
      "eval_samples_per_second": 38.001,
      "eval_steps_per_second": 0.596,
      "step": 400
    },
    {
      "epoch": 1.025,
      "grad_norm": 1.1102286013620262,
      "learning_rate": 5e-06,
      "loss": 0.7842,
      "step": 410
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.7607811738450544,
      "learning_rate": 5e-06,
      "loss": 0.7849,
      "step": 420
    },
    {
      "epoch": 1.075,
      "grad_norm": 0.7653482131592415,
      "learning_rate": 5e-06,
      "loss": 0.7849,
      "step": 430
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.705031839749241,
      "learning_rate": 5e-06,
      "loss": 0.779,
      "step": 440
    },
    {
      "epoch": 1.125,
      "grad_norm": 0.5370550513294627,
      "learning_rate": 5e-06,
      "loss": 0.7831,
      "step": 450
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.5298114725731297,
      "learning_rate": 5e-06,
      "loss": 0.7818,
      "step": 460
    },
    {
      "epoch": 1.175,
      "grad_norm": 0.6734194289163318,
      "learning_rate": 5e-06,
      "loss": 0.7772,
      "step": 470
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.5897636893907213,
      "learning_rate": 5e-06,
      "loss": 0.7813,
      "step": 480
    },
    {
      "epoch": 1.225,
      "grad_norm": 0.604266980297563,
      "learning_rate": 5e-06,
      "loss": 0.7836,
      "step": 490
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.6714899816376678,
      "learning_rate": 5e-06,
      "loss": 0.7801,
      "step": 500
    },
    {
      "epoch": 1.275,
      "grad_norm": 0.8168625249318959,
      "learning_rate": 5e-06,
      "loss": 0.7806,
      "step": 510
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.7932681977775804,
      "learning_rate": 5e-06,
      "loss": 0.7776,
      "step": 520
    },
    {
      "epoch": 1.325,
      "grad_norm": 0.8862643278939233,
      "learning_rate": 5e-06,
      "loss": 0.7796,
      "step": 530
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.7421035460384174,
      "learning_rate": 5e-06,
      "loss": 0.7823,
      "step": 540
    },
    {
      "epoch": 1.375,
      "grad_norm": 0.5962649233174171,
      "learning_rate": 5e-06,
      "loss": 0.7783,
      "step": 550
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.9067010653184415,
      "learning_rate": 5e-06,
      "loss": 0.7755,
      "step": 560
    },
    {
      "epoch": 1.425,
      "grad_norm": 0.9272486884932043,
      "learning_rate": 5e-06,
      "loss": 0.7811,
      "step": 570
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.634573401992294,
      "learning_rate": 5e-06,
      "loss": 0.7783,
      "step": 580
    },
    {
      "epoch": 1.475,
      "grad_norm": 0.7647615867543145,
      "learning_rate": 5e-06,
      "loss": 0.7789,
      "step": 590
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.7186976301425001,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 600
    },
    {
      "epoch": 1.525,
      "grad_norm": 0.6741640245568876,
      "learning_rate": 5e-06,
      "loss": 0.7803,
      "step": 610
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.7673936498272227,
      "learning_rate": 5e-06,
      "loss": 0.7771,
      "step": 620
    },
    {
      "epoch": 1.575,
      "grad_norm": 0.6762446070662301,
      "learning_rate": 5e-06,
      "loss": 0.7781,
      "step": 630
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.694582981661087,
      "learning_rate": 5e-06,
      "loss": 0.7803,
      "step": 640
    },
    {
      "epoch": 1.625,
      "grad_norm": 0.6911825951192573,
      "learning_rate": 5e-06,
      "loss": 0.7755,
      "step": 650
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.7214035778878081,
      "learning_rate": 5e-06,
      "loss": 0.7784,
      "step": 660
    },
    {
      "epoch": 1.675,
      "grad_norm": 0.565532898038658,
      "learning_rate": 5e-06,
      "loss": 0.7798,
      "step": 670
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.7144730054018653,
      "learning_rate": 5e-06,
      "loss": 0.7779,
      "step": 680
    },
    {
      "epoch": 1.725,
      "grad_norm": 0.68929751507137,
      "learning_rate": 5e-06,
      "loss": 0.7743,
      "step": 690
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.6232088446530882,
      "learning_rate": 5e-06,
      "loss": 0.7764,
      "step": 700
    },
    {
      "epoch": 1.775,
      "grad_norm": 0.5516916387958084,
      "learning_rate": 5e-06,
      "loss": 0.7743,
      "step": 710
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.6556060585205762,
      "learning_rate": 5e-06,
      "loss": 0.7795,
      "step": 720
    },
    {
      "epoch": 1.825,
      "grad_norm": 0.6395664223230667,
      "learning_rate": 5e-06,
      "loss": 0.7727,
      "step": 730
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.7264470243374237,
      "learning_rate": 5e-06,
      "loss": 0.7716,
      "step": 740
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.7348089904512866,
      "learning_rate": 5e-06,
      "loss": 0.7759,
      "step": 750
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.5362704236504767,
      "learning_rate": 5e-06,
      "loss": 0.7725,
      "step": 760
    },
    {
      "epoch": 1.925,
      "grad_norm": 0.6132209869820257,
      "learning_rate": 5e-06,
      "loss": 0.7746,
      "step": 770
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.5674492579544563,
      "learning_rate": 5e-06,
      "loss": 0.7762,
      "step": 780
    },
    {
      "epoch": 1.975,
      "grad_norm": 0.8144733224752085,
      "learning_rate": 5e-06,
      "loss": 0.771,
      "step": 790
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.6438437017098873,
      "learning_rate": 5e-06,
      "loss": 0.7753,
      "step": 800
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.8092445731163025,
      "eval_runtime": 279.3948,
      "eval_samples_per_second": 38.573,
      "eval_steps_per_second": 0.605,
      "step": 800
    },
    {
      "epoch": 2.025,
      "grad_norm": 0.741980442805714,
      "learning_rate": 5e-06,
      "loss": 0.731,
      "step": 810
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.6348046724923234,
      "learning_rate": 5e-06,
      "loss": 0.7254,
      "step": 820
    },
    {
      "epoch": 2.075,
      "grad_norm": 0.6998662875286445,
      "learning_rate": 5e-06,
      "loss": 0.722,
      "step": 830
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.6662824597359785,
      "learning_rate": 5e-06,
      "loss": 0.7251,
      "step": 840
    },
    {
      "epoch": 2.125,
      "grad_norm": 0.5887524561855076,
      "learning_rate": 5e-06,
      "loss": 0.7329,
      "step": 850
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.773871523528161,
      "learning_rate": 5e-06,
      "loss": 0.7282,
      "step": 860
    },
    {
      "epoch": 2.175,
      "grad_norm": 0.6520605524681873,
      "learning_rate": 5e-06,
      "loss": 0.7283,
      "step": 870
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.6481774807052064,
      "learning_rate": 5e-06,
      "loss": 0.7333,
      "step": 880
    },
    {
      "epoch": 2.225,
      "grad_norm": 0.5980371010517554,
      "learning_rate": 5e-06,
      "loss": 0.7307,
      "step": 890
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.6698106025945666,
      "learning_rate": 5e-06,
      "loss": 0.7276,
      "step": 900
    },
    {
      "epoch": 2.275,
      "grad_norm": 0.6064989242913533,
      "learning_rate": 5e-06,
      "loss": 0.7284,
      "step": 910
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.5694949139445875,
      "learning_rate": 5e-06,
      "loss": 0.7287,
      "step": 920
    },
    {
      "epoch": 2.325,
      "grad_norm": 0.6518761067013964,
      "learning_rate": 5e-06,
      "loss": 0.7272,
      "step": 930
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.6644802771746268,
      "learning_rate": 5e-06,
      "loss": 0.7301,
      "step": 940
    },
    {
      "epoch": 2.375,
      "grad_norm": 0.564358869683258,
      "learning_rate": 5e-06,
      "loss": 0.7333,
      "step": 950
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.6052088075734858,
      "learning_rate": 5e-06,
      "loss": 0.7279,
      "step": 960
    },
    {
      "epoch": 2.425,
      "grad_norm": 0.5588110088845489,
      "learning_rate": 5e-06,
      "loss": 0.7307,
      "step": 970
    },
    {
      "epoch": 2.45,
      "grad_norm": 0.7921721794918313,
      "learning_rate": 5e-06,
      "loss": 0.727,
      "step": 980
    },
    {
      "epoch": 2.475,
      "grad_norm": 0.633397399719071,
      "learning_rate": 5e-06,
      "loss": 0.7315,
      "step": 990
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.5866962766860978,
      "learning_rate": 5e-06,
      "loss": 0.732,
      "step": 1000
    },
    {
      "epoch": 2.525,
      "grad_norm": 0.7798403321881853,
      "learning_rate": 5e-06,
      "loss": 0.7301,
      "step": 1010
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.6522313154237275,
      "learning_rate": 5e-06,
      "loss": 0.727,
      "step": 1020
    },
    {
      "epoch": 2.575,
      "grad_norm": 0.5918752307210033,
      "learning_rate": 5e-06,
      "loss": 0.732,
      "step": 1030
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.7013642379103808,
      "learning_rate": 5e-06,
      "loss": 0.7304,
      "step": 1040
    },
    {
      "epoch": 2.625,
      "grad_norm": 0.6019984161152461,
      "learning_rate": 5e-06,
      "loss": 0.7301,
      "step": 1050
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.6025521914638767,
      "learning_rate": 5e-06,
      "loss": 0.7303,
      "step": 1060
    },
    {
      "epoch": 2.675,
      "grad_norm": 0.8383133859288988,
      "learning_rate": 5e-06,
      "loss": 0.7352,
      "step": 1070
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.7176072744526589,
      "learning_rate": 5e-06,
      "loss": 0.7325,
      "step": 1080
    },
    {
      "epoch": 2.725,
      "grad_norm": 0.6330508627766337,
      "learning_rate": 5e-06,
      "loss": 0.7297,
      "step": 1090
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.708854127664677,
      "learning_rate": 5e-06,
      "loss": 0.7302,
      "step": 1100
    },
    {
      "epoch": 2.775,
      "grad_norm": 0.7080026819402018,
      "learning_rate": 5e-06,
      "loss": 0.7338,
      "step": 1110
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.6486046388056755,
      "learning_rate": 5e-06,
      "loss": 0.7333,
      "step": 1120
    },
    {
      "epoch": 2.825,
      "grad_norm": 0.6098247605462774,
      "learning_rate": 5e-06,
      "loss": 0.7368,
      "step": 1130
    },
    {
      "epoch": 2.85,
      "grad_norm": 0.707595465934189,
      "learning_rate": 5e-06,
      "loss": 0.7315,
      "step": 1140
    },
    {
      "epoch": 2.875,
      "grad_norm": 0.6598068743344158,
      "learning_rate": 5e-06,
      "loss": 0.7328,
      "step": 1150
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.6256793408527994,
      "learning_rate": 5e-06,
      "loss": 0.7335,
      "step": 1160
    },
    {
      "epoch": 2.925,
      "grad_norm": 0.6477061017951039,
      "learning_rate": 5e-06,
      "loss": 0.7296,
      "step": 1170
    },
    {
      "epoch": 2.95,
      "grad_norm": 0.6150717312512685,
      "learning_rate": 5e-06,
      "loss": 0.7336,
      "step": 1180
    },
    {
      "epoch": 2.975,
      "grad_norm": 0.5375035486490041,
      "learning_rate": 5e-06,
      "loss": 0.7316,
      "step": 1190
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.6000906442308059,
      "learning_rate": 5e-06,
      "loss": 0.7331,
      "step": 1200
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.8070117235183716,
      "eval_runtime": 271.2171,
      "eval_samples_per_second": 39.736,
      "eval_steps_per_second": 0.623,
      "step": 1200
    },
    {
      "epoch": 3.0,
      "step": 1200,
      "total_flos": 2009835314872320.0,
      "train_loss": 0.7931587568918864,
      "train_runtime": 40308.7538,
      "train_samples_per_second": 15.238,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 1200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2009835314872320.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}