{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00020262979704937243,
"eval_steps": 500,
"global_step": 60,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.377163284156207e-06,
"grad_norm": 3.2998340276743727,
"learning_rate": 0.0,
"loss": 0.6698,
"num_tokens": 15452.0,
"step": 1
},
{
"epoch": 6.754326568312414e-06,
"grad_norm": 3.314867812248204,
"learning_rate": 1e-05,
"loss": 0.7748,
"num_tokens": 31805.0,
"step": 2
},
{
"epoch": 1.0131489852468621e-05,
"grad_norm": 2.636858491346485,
"learning_rate": 2e-05,
"loss": 0.6761,
"num_tokens": 48037.0,
"step": 3
},
{
"epoch": 1.3508653136624828e-05,
"grad_norm": 2.862558765495362,
"learning_rate": 1.9986800724660115e-05,
"loss": 0.7604,
"num_tokens": 64421.0,
"step": 4
},
{
"epoch": 1.6885816420781036e-05,
"grad_norm": 4.699701254023154,
"learning_rate": 1.994724161438924e-05,
"loss": 0.8179,
"num_tokens": 80610.0,
"step": 5
},
{
"epoch": 2.0262979704937243e-05,
"grad_norm": 2.6445710424782325,
"learning_rate": 1.988143870287374e-05,
"loss": 0.6983,
"num_tokens": 96889.0,
"step": 6
},
{
"epoch": 2.364014298909345e-05,
"grad_norm": 2.639879020578115,
"learning_rate": 1.978958500139078e-05,
"loss": 0.5182,
"num_tokens": 113102.0,
"step": 7
},
{
"epoch": 2.7017306273249657e-05,
"grad_norm": 3.823695385004499,
"learning_rate": 1.9671949932673007e-05,
"loss": 0.7172,
"num_tokens": 128765.0,
"step": 8
},
{
"epoch": 3.0394469557405867e-05,
"grad_norm": 2.534376407017609,
"learning_rate": 1.9528878540645225e-05,
"loss": 0.7418,
"num_tokens": 145149.0,
"step": 9
},
{
"epoch": 3.377163284156207e-05,
"grad_norm": 2.435524195364598,
"learning_rate": 1.9360790478351125e-05,
"loss": 0.7219,
"num_tokens": 161533.0,
"step": 10
},
{
"epoch": 3.714879612571828e-05,
"grad_norm": 3.0208638335811986,
"learning_rate": 1.9168178777038614e-05,
"loss": 0.625,
"num_tokens": 177844.0,
"step": 11
},
{
"epoch": 4.0525959409874485e-05,
"grad_norm": 2.0415192942047784,
"learning_rate": 1.8951608400014208e-05,
"loss": 0.763,
"num_tokens": 194228.0,
"step": 12
},
{
"epoch": 4.390312269403069e-05,
"grad_norm": 2.8653779028111974,
"learning_rate": 1.8711714585508303e-05,
"loss": 0.7449,
"num_tokens": 210612.0,
"step": 13
},
{
"epoch": 4.72802859781869e-05,
"grad_norm": 1.9417779398411579,
"learning_rate": 1.8449200983412017e-05,
"loss": 0.759,
"num_tokens": 226876.0,
"step": 14
},
{
"epoch": 5.065744926234311e-05,
"grad_norm": 3.9805173199464834,
"learning_rate": 1.8164837591350794e-05,
"loss": 0.7392,
"num_tokens": 243260.0,
"step": 15
},
{
"epoch": 5.4034612546499314e-05,
"grad_norm": 2.476541943825435,
"learning_rate": 1.7859458496148728e-05,
"loss": 0.9031,
"num_tokens": 259119.0,
"step": 16
},
{
"epoch": 5.741177583065553e-05,
"grad_norm": 2.651204324427315,
"learning_rate": 1.753395942730818e-05,
"loss": 0.6205,
"num_tokens": 275503.0,
"step": 17
},
{
"epoch": 6.0788939114811735e-05,
"grad_norm": 1.953799253533995,
"learning_rate": 1.7189295129680813e-05,
"loss": 0.8365,
"num_tokens": 291589.0,
"step": 18
},
{
"epoch": 6.416610239896794e-05,
"grad_norm": 2.5313894503087644,
"learning_rate": 1.682647656303645e-05,
"loss": 0.7259,
"num_tokens": 307784.0,
"step": 19
},
{
"epoch": 6.754326568312414e-05,
"grad_norm": 2.491889329796246,
"learning_rate": 1.644656793674389e-05,
"loss": 0.8129,
"num_tokens": 323930.0,
"step": 20
},
{
"epoch": 7.092042896728035e-05,
"grad_norm": 2.1154387961448253,
"learning_rate": 1.6050683588261443e-05,
"loss": 0.7697,
"num_tokens": 339986.0,
"step": 21
},
{
"epoch": 7.429759225143656e-05,
"grad_norm": 2.164937404359563,
"learning_rate": 1.56399847145932e-05,
"loss": 0.6625,
"num_tokens": 356006.0,
"step": 22
},
{
"epoch": 7.767475553559276e-05,
"grad_norm": 2.1656607929946015,
"learning_rate": 1.5215675966298114e-05,
"loss": 0.6913,
"num_tokens": 372204.0,
"step": 23
},
{
"epoch": 8.105191881974897e-05,
"grad_norm": 2.7914504787622487,
"learning_rate": 1.4779001914042384e-05,
"loss": 0.6953,
"num_tokens": 388588.0,
"step": 24
},
{
"epoch": 8.442908210390518e-05,
"grad_norm": 2.537959588442396,
"learning_rate": 1.433124339805923e-05,
"loss": 0.9529,
"num_tokens": 404218.0,
"step": 25
},
{
"epoch": 8.780624538806138e-05,
"grad_norm": 2.1684714858541168,
"learning_rate": 1.387371377122382e-05,
"loss": 0.7472,
"num_tokens": 420364.0,
"step": 26
},
{
"epoch": 9.118340867221759e-05,
"grad_norm": 2.6761066929636073,
"learning_rate": 1.340775504676299e-05,
"loss": 0.7507,
"num_tokens": 436748.0,
"step": 27
},
{
"epoch": 9.45605719563738e-05,
"grad_norm": 2.174938128059083,
"learning_rate": 1.293473396189922e-05,
"loss": 0.9251,
"num_tokens": 453046.0,
"step": 28
},
{
"epoch": 9.793773524053e-05,
"grad_norm": 2.4115251558621984,
"learning_rate": 1.2456037968974885e-05,
"loss": 0.7905,
"num_tokens": 469005.0,
"step": 29
},
{
"epoch": 0.00010131489852468621,
"grad_norm": 1.60837374564969,
"learning_rate": 1.1973071165815478e-05,
"loss": 0.7351,
"num_tokens": 485002.0,
"step": 30
},
{
"epoch": 0.00010469206180884242,
"grad_norm": 2.4309846178168866,
"learning_rate": 1.148725017726876e-05,
"loss": 0.9979,
"num_tokens": 501386.0,
"step": 31
},
{
"epoch": 0.00010806922509299863,
"grad_norm": 2.263832588986116,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.7958,
"num_tokens": 517770.0,
"step": 32
},
{
"epoch": 0.00011144638837715483,
"grad_norm": 1.6117737174897266,
"learning_rate": 1.0512749822731243e-05,
"loss": 0.6614,
"num_tokens": 533776.0,
"step": 33
},
{
"epoch": 0.00011482355166131106,
"grad_norm": 2.0488856375912445,
"learning_rate": 1.0026928834184527e-05,
"loss": 0.8444,
"num_tokens": 550160.0,
"step": 34
},
{
"epoch": 0.00011820071494546726,
"grad_norm": 1.9496889317534967,
"learning_rate": 9.543962031025118e-06,
"loss": 0.9086,
"num_tokens": 566346.0,
"step": 35
},
{
"epoch": 0.00012157787822962347,
"grad_norm": 1.6614668753060753,
"learning_rate": 9.065266038100783e-06,
"loss": 0.739,
"num_tokens": 582525.0,
"step": 36
},
{
"epoch": 0.00012495504151377966,
"grad_norm": 1.9466008912375943,
"learning_rate": 8.592244953237014e-06,
"loss": 0.749,
"num_tokens": 598644.0,
"step": 37
},
{
"epoch": 0.00012833220479793587,
"grad_norm": 1.8574288792595999,
"learning_rate": 8.126286228776183e-06,
"loss": 0.7908,
"num_tokens": 615028.0,
"step": 38
},
{
"epoch": 0.00013170936808209208,
"grad_norm": 2.050591553128843,
"learning_rate": 7.66875660194077e-06,
"loss": 0.7322,
"num_tokens": 631265.0,
"step": 39
},
{
"epoch": 0.00013508653136624828,
"grad_norm": 2.0008482722767122,
"learning_rate": 7.2209980859576204e-06,
"loss": 0.818,
"num_tokens": 647649.0,
"step": 40
},
{
"epoch": 0.0001384636946504045,
"grad_norm": 1.833258401205397,
"learning_rate": 6.78432403370189e-06,
"loss": 0.7454,
"num_tokens": 664033.0,
"step": 41
},
{
"epoch": 0.0001418408579345607,
"grad_norm": 1.9661355226698174,
"learning_rate": 6.360015285406804e-06,
"loss": 0.7122,
"num_tokens": 679962.0,
"step": 42
},
{
"epoch": 0.0001452180212187169,
"grad_norm": 2.058428378432495,
"learning_rate": 5.9493164117385605e-06,
"loss": 0.6922,
"num_tokens": 696076.0,
"step": 43
},
{
"epoch": 0.0001485951845028731,
"grad_norm": 1.7144233362081016,
"learning_rate": 5.5534320632561165e-06,
"loss": 0.7376,
"num_tokens": 712293.0,
"step": 44
},
{
"epoch": 0.00015197234778702932,
"grad_norm": 2.1359696786830575,
"learning_rate": 5.173523436963552e-06,
"loss": 0.7869,
"num_tokens": 728402.0,
"step": 45
},
{
"epoch": 0.00015534951107118553,
"grad_norm": 1.6013689228796348,
"learning_rate": 4.81070487031919e-06,
"loss": 0.6969,
"num_tokens": 744784.0,
"step": 46
},
{
"epoch": 0.00015872667435534173,
"grad_norm": 2.2478354102121467,
"learning_rate": 4.466040572691825e-06,
"loss": 0.8044,
"num_tokens": 761011.0,
"step": 47
},
{
"epoch": 0.00016210383763949794,
"grad_norm": 1.695729707502705,
"learning_rate": 4.140541503851273e-06,
"loss": 0.7718,
"num_tokens": 777278.0,
"step": 48
},
{
"epoch": 0.00016548100092365415,
"grad_norm": 1.9847435959437771,
"learning_rate": 3.835162408649207e-06,
"loss": 0.6821,
"num_tokens": 793662.0,
"step": 49
},
{
"epoch": 0.00016885816420781036,
"grad_norm": 1.5498551098436795,
"learning_rate": 3.5507990165879863e-06,
"loss": 0.6231,
"num_tokens": 810046.0,
"step": 50
},
{
"epoch": 0.00017223532749196656,
"grad_norm": 2.134645146571127,
"learning_rate": 3.2882854144916986e-06,
"loss": 0.6843,
"num_tokens": 826430.0,
"step": 51
},
{
"epoch": 0.00017561249077612277,
"grad_norm": 2.3239289472585334,
"learning_rate": 3.0483915999857948e-06,
"loss": 0.6446,
"num_tokens": 842569.0,
"step": 52
},
{
"epoch": 0.00017898965406027898,
"grad_norm": 2.5992128021859764,
"learning_rate": 2.8318212229613886e-06,
"loss": 0.7124,
"num_tokens": 858632.0,
"step": 53
},
{
"epoch": 0.00018236681734443518,
"grad_norm": 1.704578656265245,
"learning_rate": 2.639209521648878e-06,
"loss": 0.8026,
"num_tokens": 874433.0,
"step": 54
},
{
"epoch": 0.0001857439806285914,
"grad_norm": 2.519268314944392,
"learning_rate": 2.4711214593547793e-06,
"loss": 0.7972,
"num_tokens": 890719.0,
"step": 55
},
{
"epoch": 0.0001891211439127476,
"grad_norm": 1.9980417958286467,
"learning_rate": 2.328050067326994e-06,
"loss": 0.8193,
"num_tokens": 907103.0,
"step": 56
},
{
"epoch": 0.0001924983071969038,
"grad_norm": 3.1384042825103644,
"learning_rate": 2.2104149986092204e-06,
"loss": 0.7287,
"num_tokens": 923253.0,
"step": 57
},
{
"epoch": 0.00019587547048106,
"grad_norm": 1.56025595889078,
"learning_rate": 2.118561297126265e-06,
"loss": 0.6933,
"num_tokens": 939637.0,
"step": 58
},
{
"epoch": 0.00019925263376521622,
"grad_norm": 1.6418440558647935,
"learning_rate": 2.052758385610764e-06,
"loss": 0.6185,
"num_tokens": 955674.0,
"step": 59
},
{
"epoch": 0.00020262979704937243,
"grad_norm": 2.3942563402200143,
"learning_rate": 2.013199275339886e-06,
"loss": 0.7311,
"num_tokens": 972044.0,
"step": 60
},
{
"epoch": 0.00020262979704937243,
"step": 60,
"total_flos": 864992169984.0,
"train_loss": 0.747809229294459,
"train_runtime": 385.7034,
"train_samples_per_second": 2.489,
"train_steps_per_second": 0.156
}
],
"logging_steps": 1,
"max_steps": 60,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 30,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 864992169984.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}