| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.00020262979704937243, | |
| "eval_steps": 500, | |
| "global_step": 60, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 3.377163284156207e-06, | |
| "grad_norm": 3.2998340276743727, | |
| "learning_rate": 0.0, | |
| "loss": 0.6698, | |
| "num_tokens": 15452.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 6.754326568312414e-06, | |
| "grad_norm": 3.314867812248204, | |
| "learning_rate": 1e-05, | |
| "loss": 0.7748, | |
| "num_tokens": 31805.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 1.0131489852468621e-05, | |
| "grad_norm": 2.636858491346485, | |
| "learning_rate": 2e-05, | |
| "loss": 0.6761, | |
| "num_tokens": 48037.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 1.3508653136624828e-05, | |
| "grad_norm": 2.862558765495362, | |
| "learning_rate": 1.9986800724660115e-05, | |
| "loss": 0.7604, | |
| "num_tokens": 64421.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 1.6885816420781036e-05, | |
| "grad_norm": 4.699701254023154, | |
| "learning_rate": 1.994724161438924e-05, | |
| "loss": 0.8179, | |
| "num_tokens": 80610.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 2.0262979704937243e-05, | |
| "grad_norm": 2.6445710424782325, | |
| "learning_rate": 1.988143870287374e-05, | |
| "loss": 0.6983, | |
| "num_tokens": 96889.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 2.364014298909345e-05, | |
| "grad_norm": 2.639879020578115, | |
| "learning_rate": 1.978958500139078e-05, | |
| "loss": 0.5182, | |
| "num_tokens": 113102.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 2.7017306273249657e-05, | |
| "grad_norm": 3.823695385004499, | |
| "learning_rate": 1.9671949932673007e-05, | |
| "loss": 0.7172, | |
| "num_tokens": 128765.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 3.0394469557405867e-05, | |
| "grad_norm": 2.534376407017609, | |
| "learning_rate": 1.9528878540645225e-05, | |
| "loss": 0.7418, | |
| "num_tokens": 145149.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 3.377163284156207e-05, | |
| "grad_norm": 2.435524195364598, | |
| "learning_rate": 1.9360790478351125e-05, | |
| "loss": 0.7219, | |
| "num_tokens": 161533.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 3.714879612571828e-05, | |
| "grad_norm": 3.0208638335811986, | |
| "learning_rate": 1.9168178777038614e-05, | |
| "loss": 0.625, | |
| "num_tokens": 177844.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 4.0525959409874485e-05, | |
| "grad_norm": 2.0415192942047784, | |
| "learning_rate": 1.8951608400014208e-05, | |
| "loss": 0.763, | |
| "num_tokens": 194228.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 4.390312269403069e-05, | |
| "grad_norm": 2.8653779028111974, | |
| "learning_rate": 1.8711714585508303e-05, | |
| "loss": 0.7449, | |
| "num_tokens": 210612.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 4.72802859781869e-05, | |
| "grad_norm": 1.9417779398411579, | |
| "learning_rate": 1.8449200983412017e-05, | |
| "loss": 0.759, | |
| "num_tokens": 226876.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 5.065744926234311e-05, | |
| "grad_norm": 3.9805173199464834, | |
| "learning_rate": 1.8164837591350794e-05, | |
| "loss": 0.7392, | |
| "num_tokens": 243260.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 5.4034612546499314e-05, | |
| "grad_norm": 2.476541943825435, | |
| "learning_rate": 1.7859458496148728e-05, | |
| "loss": 0.9031, | |
| "num_tokens": 259119.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 5.741177583065553e-05, | |
| "grad_norm": 2.651204324427315, | |
| "learning_rate": 1.753395942730818e-05, | |
| "loss": 0.6205, | |
| "num_tokens": 275503.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 6.0788939114811735e-05, | |
| "grad_norm": 1.953799253533995, | |
| "learning_rate": 1.7189295129680813e-05, | |
| "loss": 0.8365, | |
| "num_tokens": 291589.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 6.416610239896794e-05, | |
| "grad_norm": 2.5313894503087644, | |
| "learning_rate": 1.682647656303645e-05, | |
| "loss": 0.7259, | |
| "num_tokens": 307784.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 6.754326568312414e-05, | |
| "grad_norm": 2.491889329796246, | |
| "learning_rate": 1.644656793674389e-05, | |
| "loss": 0.8129, | |
| "num_tokens": 323930.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 7.092042896728035e-05, | |
| "grad_norm": 2.1154387961448253, | |
| "learning_rate": 1.6050683588261443e-05, | |
| "loss": 0.7697, | |
| "num_tokens": 339986.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 7.429759225143656e-05, | |
| "grad_norm": 2.164937404359563, | |
| "learning_rate": 1.56399847145932e-05, | |
| "loss": 0.6625, | |
| "num_tokens": 356006.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 7.767475553559276e-05, | |
| "grad_norm": 2.1656607929946015, | |
| "learning_rate": 1.5215675966298114e-05, | |
| "loss": 0.6913, | |
| "num_tokens": 372204.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 8.105191881974897e-05, | |
| "grad_norm": 2.7914504787622487, | |
| "learning_rate": 1.4779001914042384e-05, | |
| "loss": 0.6953, | |
| "num_tokens": 388588.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 8.442908210390518e-05, | |
| "grad_norm": 2.537959588442396, | |
| "learning_rate": 1.433124339805923e-05, | |
| "loss": 0.9529, | |
| "num_tokens": 404218.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 8.780624538806138e-05, | |
| "grad_norm": 2.1684714858541168, | |
| "learning_rate": 1.387371377122382e-05, | |
| "loss": 0.7472, | |
| "num_tokens": 420364.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 9.118340867221759e-05, | |
| "grad_norm": 2.6761066929636073, | |
| "learning_rate": 1.340775504676299e-05, | |
| "loss": 0.7507, | |
| "num_tokens": 436748.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 9.45605719563738e-05, | |
| "grad_norm": 2.174938128059083, | |
| "learning_rate": 1.293473396189922e-05, | |
| "loss": 0.9251, | |
| "num_tokens": 453046.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 9.793773524053e-05, | |
| "grad_norm": 2.4115251558621984, | |
| "learning_rate": 1.2456037968974885e-05, | |
| "loss": 0.7905, | |
| "num_tokens": 469005.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.00010131489852468621, | |
| "grad_norm": 1.60837374564969, | |
| "learning_rate": 1.1973071165815478e-05, | |
| "loss": 0.7351, | |
| "num_tokens": 485002.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.00010469206180884242, | |
| "grad_norm": 2.4309846178168866, | |
| "learning_rate": 1.148725017726876e-05, | |
| "loss": 0.9979, | |
| "num_tokens": 501386.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.00010806922509299863, | |
| "grad_norm": 2.263832588986116, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 0.7958, | |
| "num_tokens": 517770.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.00011144638837715483, | |
| "grad_norm": 1.6117737174897266, | |
| "learning_rate": 1.0512749822731243e-05, | |
| "loss": 0.6614, | |
| "num_tokens": 533776.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.00011482355166131106, | |
| "grad_norm": 2.0488856375912445, | |
| "learning_rate": 1.0026928834184527e-05, | |
| "loss": 0.8444, | |
| "num_tokens": 550160.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.00011820071494546726, | |
| "grad_norm": 1.9496889317534967, | |
| "learning_rate": 9.543962031025118e-06, | |
| "loss": 0.9086, | |
| "num_tokens": 566346.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.00012157787822962347, | |
| "grad_norm": 1.6614668753060753, | |
| "learning_rate": 9.065266038100783e-06, | |
| "loss": 0.739, | |
| "num_tokens": 582525.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.00012495504151377966, | |
| "grad_norm": 1.9466008912375943, | |
| "learning_rate": 8.592244953237014e-06, | |
| "loss": 0.749, | |
| "num_tokens": 598644.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.00012833220479793587, | |
| "grad_norm": 1.8574288792595999, | |
| "learning_rate": 8.126286228776183e-06, | |
| "loss": 0.7908, | |
| "num_tokens": 615028.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.00013170936808209208, | |
| "grad_norm": 2.050591553128843, | |
| "learning_rate": 7.66875660194077e-06, | |
| "loss": 0.7322, | |
| "num_tokens": 631265.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.00013508653136624828, | |
| "grad_norm": 2.0008482722767122, | |
| "learning_rate": 7.2209980859576204e-06, | |
| "loss": 0.818, | |
| "num_tokens": 647649.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0001384636946504045, | |
| "grad_norm": 1.833258401205397, | |
| "learning_rate": 6.78432403370189e-06, | |
| "loss": 0.7454, | |
| "num_tokens": 664033.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.0001418408579345607, | |
| "grad_norm": 1.9661355226698174, | |
| "learning_rate": 6.360015285406804e-06, | |
| "loss": 0.7122, | |
| "num_tokens": 679962.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.0001452180212187169, | |
| "grad_norm": 2.058428378432495, | |
| "learning_rate": 5.9493164117385605e-06, | |
| "loss": 0.6922, | |
| "num_tokens": 696076.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.0001485951845028731, | |
| "grad_norm": 1.7144233362081016, | |
| "learning_rate": 5.5534320632561165e-06, | |
| "loss": 0.7376, | |
| "num_tokens": 712293.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.00015197234778702932, | |
| "grad_norm": 2.1359696786830575, | |
| "learning_rate": 5.173523436963552e-06, | |
| "loss": 0.7869, | |
| "num_tokens": 728402.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.00015534951107118553, | |
| "grad_norm": 1.6013689228796348, | |
| "learning_rate": 4.81070487031919e-06, | |
| "loss": 0.6969, | |
| "num_tokens": 744784.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.00015872667435534173, | |
| "grad_norm": 2.2478354102121467, | |
| "learning_rate": 4.466040572691825e-06, | |
| "loss": 0.8044, | |
| "num_tokens": 761011.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.00016210383763949794, | |
| "grad_norm": 1.695729707502705, | |
| "learning_rate": 4.140541503851273e-06, | |
| "loss": 0.7718, | |
| "num_tokens": 777278.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.00016548100092365415, | |
| "grad_norm": 1.9847435959437771, | |
| "learning_rate": 3.835162408649207e-06, | |
| "loss": 0.6821, | |
| "num_tokens": 793662.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.00016885816420781036, | |
| "grad_norm": 1.5498551098436795, | |
| "learning_rate": 3.5507990165879863e-06, | |
| "loss": 0.6231, | |
| "num_tokens": 810046.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.00017223532749196656, | |
| "grad_norm": 2.134645146571127, | |
| "learning_rate": 3.2882854144916986e-06, | |
| "loss": 0.6843, | |
| "num_tokens": 826430.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.00017561249077612277, | |
| "grad_norm": 2.3239289472585334, | |
| "learning_rate": 3.0483915999857948e-06, | |
| "loss": 0.6446, | |
| "num_tokens": 842569.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.00017898965406027898, | |
| "grad_norm": 2.5992128021859764, | |
| "learning_rate": 2.8318212229613886e-06, | |
| "loss": 0.7124, | |
| "num_tokens": 858632.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.00018236681734443518, | |
| "grad_norm": 1.704578656265245, | |
| "learning_rate": 2.639209521648878e-06, | |
| "loss": 0.8026, | |
| "num_tokens": 874433.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.0001857439806285914, | |
| "grad_norm": 2.519268314944392, | |
| "learning_rate": 2.4711214593547793e-06, | |
| "loss": 0.7972, | |
| "num_tokens": 890719.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0001891211439127476, | |
| "grad_norm": 1.9980417958286467, | |
| "learning_rate": 2.328050067326994e-06, | |
| "loss": 0.8193, | |
| "num_tokens": 907103.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.0001924983071969038, | |
| "grad_norm": 3.1384042825103644, | |
| "learning_rate": 2.2104149986092204e-06, | |
| "loss": 0.7287, | |
| "num_tokens": 923253.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.00019587547048106, | |
| "grad_norm": 1.56025595889078, | |
| "learning_rate": 2.118561297126265e-06, | |
| "loss": 0.6933, | |
| "num_tokens": 939637.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.00019925263376521622, | |
| "grad_norm": 1.6418440558647935, | |
| "learning_rate": 2.052758385610764e-06, | |
| "loss": 0.6185, | |
| "num_tokens": 955674.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.00020262979704937243, | |
| "grad_norm": 2.3942563402200143, | |
| "learning_rate": 2.013199275339886e-06, | |
| "loss": 0.7311, | |
| "num_tokens": 972044.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.00020262979704937243, | |
| "step": 60, | |
| "total_flos": 864992169984.0, | |
| "train_loss": 0.747809229294459, | |
| "train_runtime": 385.7034, | |
| "train_samples_per_second": 2.489, | |
| "train_steps_per_second": 0.156 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 60, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 30, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 864992169984.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |