{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00020262979704937243, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.377163284156207e-06, "grad_norm": 3.2998340276743727, "learning_rate": 0.0, "loss": 0.6698, "num_tokens": 15452.0, "step": 1 }, { "epoch": 6.754326568312414e-06, "grad_norm": 3.314867812248204, "learning_rate": 1e-05, "loss": 0.7748, "num_tokens": 31805.0, "step": 2 }, { "epoch": 1.0131489852468621e-05, "grad_norm": 2.636858491346485, "learning_rate": 2e-05, "loss": 0.6761, "num_tokens": 48037.0, "step": 3 }, { "epoch": 1.3508653136624828e-05, "grad_norm": 2.862558765495362, "learning_rate": 1.9986800724660115e-05, "loss": 0.7604, "num_tokens": 64421.0, "step": 4 }, { "epoch": 1.6885816420781036e-05, "grad_norm": 4.699701254023154, "learning_rate": 1.994724161438924e-05, "loss": 0.8179, "num_tokens": 80610.0, "step": 5 }, { "epoch": 2.0262979704937243e-05, "grad_norm": 2.6445710424782325, "learning_rate": 1.988143870287374e-05, "loss": 0.6983, "num_tokens": 96889.0, "step": 6 }, { "epoch": 2.364014298909345e-05, "grad_norm": 2.639879020578115, "learning_rate": 1.978958500139078e-05, "loss": 0.5182, "num_tokens": 113102.0, "step": 7 }, { "epoch": 2.7017306273249657e-05, "grad_norm": 3.823695385004499, "learning_rate": 1.9671949932673007e-05, "loss": 0.7172, "num_tokens": 128765.0, "step": 8 }, { "epoch": 3.0394469557405867e-05, "grad_norm": 2.534376407017609, "learning_rate": 1.9528878540645225e-05, "loss": 0.7418, "num_tokens": 145149.0, "step": 9 }, { "epoch": 3.377163284156207e-05, "grad_norm": 2.435524195364598, "learning_rate": 1.9360790478351125e-05, "loss": 0.7219, "num_tokens": 161533.0, "step": 10 }, { "epoch": 3.714879612571828e-05, "grad_norm": 3.0208638335811986, "learning_rate": 1.9168178777038614e-05, "loss": 0.625, "num_tokens": 177844.0, "step": 11 }, { "epoch": 4.0525959409874485e-05, "grad_norm": 2.0415192942047784, "learning_rate": 1.8951608400014208e-05, "loss": 0.763, "num_tokens": 194228.0, "step": 12 }, { "epoch": 4.390312269403069e-05, "grad_norm": 2.8653779028111974, "learning_rate": 1.8711714585508303e-05, "loss": 0.7449, "num_tokens": 210612.0, "step": 13 }, { "epoch": 4.72802859781869e-05, "grad_norm": 1.9417779398411579, "learning_rate": 1.8449200983412017e-05, "loss": 0.759, "num_tokens": 226876.0, "step": 14 }, { "epoch": 5.065744926234311e-05, "grad_norm": 3.9805173199464834, "learning_rate": 1.8164837591350794e-05, "loss": 0.7392, "num_tokens": 243260.0, "step": 15 }, { "epoch": 5.4034612546499314e-05, "grad_norm": 2.476541943825435, "learning_rate": 1.7859458496148728e-05, "loss": 0.9031, "num_tokens": 259119.0, "step": 16 }, { "epoch": 5.741177583065553e-05, "grad_norm": 2.651204324427315, "learning_rate": 1.753395942730818e-05, "loss": 0.6205, "num_tokens": 275503.0, "step": 17 }, { "epoch": 6.0788939114811735e-05, "grad_norm": 1.953799253533995, "learning_rate": 1.7189295129680813e-05, "loss": 0.8365, "num_tokens": 291589.0, "step": 18 }, { "epoch": 6.416610239896794e-05, "grad_norm": 2.5313894503087644, "learning_rate": 1.682647656303645e-05, "loss": 0.7259, "num_tokens": 307784.0, "step": 19 }, { "epoch": 6.754326568312414e-05, "grad_norm": 2.491889329796246, "learning_rate": 1.644656793674389e-05, "loss": 0.8129, "num_tokens": 323930.0, "step": 20 }, { "epoch": 7.092042896728035e-05, "grad_norm": 2.1154387961448253, "learning_rate": 1.6050683588261443e-05, "loss": 0.7697, "num_tokens": 339986.0, "step": 21 }, { "epoch": 7.429759225143656e-05, "grad_norm": 2.164937404359563, "learning_rate": 1.56399847145932e-05, "loss": 0.6625, "num_tokens": 356006.0, "step": 22 }, { "epoch": 7.767475553559276e-05, "grad_norm": 2.1656607929946015, "learning_rate": 1.5215675966298114e-05, "loss": 0.6913, "num_tokens": 372204.0, "step": 23 }, { "epoch": 8.105191881974897e-05, "grad_norm": 2.7914504787622487, "learning_rate": 1.4779001914042384e-05, "loss": 0.6953, "num_tokens": 388588.0, "step": 24 }, { "epoch": 8.442908210390518e-05, "grad_norm": 2.537959588442396, "learning_rate": 1.433124339805923e-05, "loss": 0.9529, "num_tokens": 404218.0, "step": 25 }, { "epoch": 8.780624538806138e-05, "grad_norm": 2.1684714858541168, "learning_rate": 1.387371377122382e-05, "loss": 0.7472, "num_tokens": 420364.0, "step": 26 }, { "epoch": 9.118340867221759e-05, "grad_norm": 2.6761066929636073, "learning_rate": 1.340775504676299e-05, "loss": 0.7507, "num_tokens": 436748.0, "step": 27 }, { "epoch": 9.45605719563738e-05, "grad_norm": 2.174938128059083, "learning_rate": 1.293473396189922e-05, "loss": 0.9251, "num_tokens": 453046.0, "step": 28 }, { "epoch": 9.793773524053e-05, "grad_norm": 2.4115251558621984, "learning_rate": 1.2456037968974885e-05, "loss": 0.7905, "num_tokens": 469005.0, "step": 29 }, { "epoch": 0.00010131489852468621, "grad_norm": 1.60837374564969, "learning_rate": 1.1973071165815478e-05, "loss": 0.7351, "num_tokens": 485002.0, "step": 30 }, { "epoch": 0.00010469206180884242, "grad_norm": 2.4309846178168866, "learning_rate": 1.148725017726876e-05, "loss": 0.9979, "num_tokens": 501386.0, "step": 31 }, { "epoch": 0.00010806922509299863, "grad_norm": 2.263832588986116, "learning_rate": 1.1000000000000001e-05, "loss": 0.7958, "num_tokens": 517770.0, "step": 32 }, { "epoch": 0.00011144638837715483, "grad_norm": 1.6117737174897266, "learning_rate": 1.0512749822731243e-05, "loss": 0.6614, "num_tokens": 533776.0, "step": 33 }, { "epoch": 0.00011482355166131106, "grad_norm": 2.0488856375912445, "learning_rate": 1.0026928834184527e-05, "loss": 0.8444, "num_tokens": 550160.0, "step": 34 }, { "epoch": 0.00011820071494546726, "grad_norm": 1.9496889317534967, "learning_rate": 9.543962031025118e-06, "loss": 0.9086, "num_tokens": 566346.0, "step": 35 }, { "epoch": 0.00012157787822962347, "grad_norm": 1.6614668753060753, "learning_rate": 9.065266038100783e-06, "loss": 0.739, "num_tokens": 582525.0, "step": 36 }, { "epoch": 0.00012495504151377966, "grad_norm": 1.9466008912375943, "learning_rate": 8.592244953237014e-06, "loss": 0.749, "num_tokens": 598644.0, "step": 37 }, { "epoch": 0.00012833220479793587, "grad_norm": 1.8574288792595999, "learning_rate": 8.126286228776183e-06, "loss": 0.7908, "num_tokens": 615028.0, "step": 38 }, { "epoch": 0.00013170936808209208, "grad_norm": 2.050591553128843, "learning_rate": 7.66875660194077e-06, "loss": 0.7322, "num_tokens": 631265.0, "step": 39 }, { "epoch": 0.00013508653136624828, "grad_norm": 2.0008482722767122, "learning_rate": 7.2209980859576204e-06, "loss": 0.818, "num_tokens": 647649.0, "step": 40 }, { "epoch": 0.0001384636946504045, "grad_norm": 1.833258401205397, "learning_rate": 6.78432403370189e-06, "loss": 0.7454, "num_tokens": 664033.0, "step": 41 }, { "epoch": 0.0001418408579345607, "grad_norm": 1.9661355226698174, "learning_rate": 6.360015285406804e-06, "loss": 0.7122, "num_tokens": 679962.0, "step": 42 }, { "epoch": 0.0001452180212187169, "grad_norm": 2.058428378432495, "learning_rate": 5.9493164117385605e-06, "loss": 0.6922, "num_tokens": 696076.0, "step": 43 }, { "epoch": 0.0001485951845028731, "grad_norm": 1.7144233362081016, "learning_rate": 5.5534320632561165e-06, "loss": 0.7376, "num_tokens": 712293.0, "step": 44 }, { "epoch": 0.00015197234778702932, "grad_norm": 2.1359696786830575, "learning_rate": 5.173523436963552e-06, "loss": 0.7869, "num_tokens": 728402.0, "step": 45 }, { "epoch": 0.00015534951107118553, "grad_norm": 1.6013689228796348, "learning_rate": 4.81070487031919e-06, "loss": 0.6969, "num_tokens": 744784.0, "step": 46 }, { "epoch": 0.00015872667435534173, "grad_norm": 2.2478354102121467, "learning_rate": 4.466040572691825e-06, "loss": 0.8044, "num_tokens": 761011.0, "step": 47 }, { "epoch": 0.00016210383763949794, "grad_norm": 1.695729707502705, "learning_rate": 4.140541503851273e-06, "loss": 0.7718, "num_tokens": 777278.0, "step": 48 }, { "epoch": 0.00016548100092365415, "grad_norm": 1.9847435959437771, "learning_rate": 3.835162408649207e-06, "loss": 0.6821, "num_tokens": 793662.0, "step": 49 }, { "epoch": 0.00016885816420781036, "grad_norm": 1.5498551098436795, "learning_rate": 3.5507990165879863e-06, "loss": 0.6231, "num_tokens": 810046.0, "step": 50 }, { "epoch": 0.00017223532749196656, "grad_norm": 2.134645146571127, "learning_rate": 3.2882854144916986e-06, "loss": 0.6843, "num_tokens": 826430.0, "step": 51 }, { "epoch": 0.00017561249077612277, "grad_norm": 2.3239289472585334, "learning_rate": 3.0483915999857948e-06, "loss": 0.6446, "num_tokens": 842569.0, "step": 52 }, { "epoch": 0.00017898965406027898, "grad_norm": 2.5992128021859764, "learning_rate": 2.8318212229613886e-06, "loss": 0.7124, "num_tokens": 858632.0, "step": 53 }, { "epoch": 0.00018236681734443518, "grad_norm": 1.704578656265245, "learning_rate": 2.639209521648878e-06, "loss": 0.8026, "num_tokens": 874433.0, "step": 54 }, { "epoch": 0.0001857439806285914, "grad_norm": 2.519268314944392, "learning_rate": 2.4711214593547793e-06, "loss": 0.7972, "num_tokens": 890719.0, "step": 55 }, { "epoch": 0.0001891211439127476, "grad_norm": 1.9980417958286467, "learning_rate": 2.328050067326994e-06, "loss": 0.8193, "num_tokens": 907103.0, "step": 56 }, { "epoch": 0.0001924983071969038, "grad_norm": 3.1384042825103644, "learning_rate": 2.2104149986092204e-06, "loss": 0.7287, "num_tokens": 923253.0, "step": 57 }, { "epoch": 0.00019587547048106, "grad_norm": 1.56025595889078, "learning_rate": 2.118561297126265e-06, "loss": 0.6933, "num_tokens": 939637.0, "step": 58 }, { "epoch": 0.00019925263376521622, "grad_norm": 1.6418440558647935, "learning_rate": 2.052758385610764e-06, "loss": 0.6185, "num_tokens": 955674.0, "step": 59 }, { "epoch": 0.00020262979704937243, "grad_norm": 2.3942563402200143, "learning_rate": 2.013199275339886e-06, "loss": 0.7311, "num_tokens": 972044.0, "step": 60 }, { "epoch": 0.00020262979704937243, "step": 60, "total_flos": 864992169984.0, "train_loss": 0.747809229294459, "train_runtime": 385.7034, "train_samples_per_second": 2.489, "train_steps_per_second": 0.156 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 30, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 864992169984.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }