{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 453,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.033112582781456956,
"grad_norm": 2.2606041431427,
"learning_rate": 4.998497170031657e-05,
"loss": 0.6997,
"num_input_tokens_seen": 4912,
"step": 5
},
{
"epoch": 0.06622516556291391,
"grad_norm": 2.3543715476989746,
"learning_rate": 4.9939904869249616e-05,
"loss": 0.3509,
"num_input_tokens_seen": 9936,
"step": 10
},
{
"epoch": 0.09933774834437085,
"grad_norm": 1.801797866821289,
"learning_rate": 4.9864853689026556e-05,
"loss": 0.2853,
"num_input_tokens_seen": 14864,
"step": 15
},
{
"epoch": 0.13245033112582782,
"grad_norm": 1.3575855493545532,
"learning_rate": 4.975990839097764e-05,
"loss": 0.2456,
"num_input_tokens_seen": 19824,
"step": 20
},
{
"epoch": 0.16556291390728478,
"grad_norm": 2.5929694175720215,
"learning_rate": 4.9625195147054034e-05,
"loss": 0.2084,
"num_input_tokens_seen": 24480,
"step": 25
},
{
"epoch": 0.1986754966887417,
"grad_norm": 0.8744802474975586,
"learning_rate": 4.9460875918135804e-05,
"loss": 0.1626,
"num_input_tokens_seen": 29296,
"step": 30
},
{
"epoch": 0.23178807947019867,
"grad_norm": 2.857471466064453,
"learning_rate": 4.9267148259312224e-05,
"loss": 0.1405,
"num_input_tokens_seen": 33936,
"step": 35
},
{
"epoch": 0.26490066225165565,
"grad_norm": 1.7763206958770752,
"learning_rate": 4.9044245082368415e-05,
"loss": 0.1182,
"num_input_tokens_seen": 39056,
"step": 40
},
{
"epoch": 0.2980132450331126,
"grad_norm": 2.78945255279541,
"learning_rate": 4.879243437576383e-05,
"loss": 0.1285,
"num_input_tokens_seen": 43520,
"step": 45
},
{
"epoch": 0.33112582781456956,
"grad_norm": 1.1319329738616943,
"learning_rate": 4.8512018882439475e-05,
"loss": 0.0906,
"num_input_tokens_seen": 48656,
"step": 50
},
{
"epoch": 0.36423841059602646,
"grad_norm": 0.8884170055389404,
"learning_rate": 4.820333573584091e-05,
"loss": 0.054,
"num_input_tokens_seen": 53600,
"step": 55
},
{
"epoch": 0.3973509933774834,
"grad_norm": 2.0057179927825928,
"learning_rate": 4.786675605459487e-05,
"loss": 0.073,
"num_input_tokens_seen": 58384,
"step": 60
},
{
"epoch": 0.4304635761589404,
"grad_norm": 1.3076105117797852,
"learning_rate": 4.7502684496326746e-05,
"loss": 0.0524,
"num_input_tokens_seen": 63152,
"step": 65
},
{
"epoch": 0.46357615894039733,
"grad_norm": 1.074171543121338,
"learning_rate": 4.711155877115523e-05,
"loss": 0.0534,
"num_input_tokens_seen": 67888,
"step": 70
},
{
"epoch": 0.4966887417218543,
"grad_norm": 1.634122371673584,
"learning_rate": 4.669384911544927e-05,
"loss": 0.0759,
"num_input_tokens_seen": 72480,
"step": 75
},
{
"epoch": 0.5298013245033113,
"grad_norm": 1.0059384107589722,
"learning_rate": 4.625005772647979e-05,
"loss": 0.0766,
"num_input_tokens_seen": 77120,
"step": 80
},
{
"epoch": 0.5629139072847682,
"grad_norm": 1.3958277702331543,
"learning_rate": 4.578071815864602e-05,
"loss": 0.0383,
"num_input_tokens_seen": 81584,
"step": 85
},
{
"epoch": 0.5960264900662252,
"grad_norm": 2.721231698989868,
"learning_rate": 4.528639468200226e-05,
"loss": 0.0393,
"num_input_tokens_seen": 86416,
"step": 90
},
{
"epoch": 0.6291390728476821,
"grad_norm": 1.4776098728179932,
"learning_rate": 4.476768160385632e-05,
"loss": 0.0589,
"num_input_tokens_seen": 91248,
"step": 95
},
{
"epoch": 0.6622516556291391,
"grad_norm": 1.1105955839157104,
"learning_rate": 4.4225202554255227e-05,
"loss": 0.0351,
"num_input_tokens_seen": 95936,
"step": 100
},
{
"epoch": 0.695364238410596,
"grad_norm": 1.1301405429840088,
"learning_rate": 4.3659609736217344e-05,
"loss": 0.0417,
"num_input_tokens_seen": 100704,
"step": 105
},
{
"epoch": 0.7284768211920529,
"grad_norm": 1.869552493095398,
"learning_rate": 4.3071583141612135e-05,
"loss": 0.0437,
"num_input_tokens_seen": 105376,
"step": 110
},
{
"epoch": 0.7615894039735099,
"grad_norm": 1.0497004985809326,
"learning_rate": 4.2461829733630435e-05,
"loss": 0.0498,
"num_input_tokens_seen": 110208,
"step": 115
},
{
"epoch": 0.7947019867549668,
"grad_norm": 0.4084171652793884,
"learning_rate": 4.1831082596828106e-05,
"loss": 0.0239,
"num_input_tokens_seen": 114704,
"step": 120
},
{
"epoch": 0.8278145695364238,
"grad_norm": 0.4082948565483093,
"learning_rate": 4.118010005576485e-05,
"loss": 0.0228,
"num_input_tokens_seen": 119744,
"step": 125
},
{
"epoch": 0.8609271523178808,
"grad_norm": 0.246117502450943,
"learning_rate": 4.050966476329793e-05,
"loss": 0.039,
"num_input_tokens_seen": 124736,
"step": 130
},
{
"epoch": 0.8940397350993378,
"grad_norm": 0.5470453500747681,
"learning_rate": 3.9820582759626825e-05,
"loss": 0.05,
"num_input_tokens_seen": 129552,
"step": 135
},
{
"epoch": 0.9271523178807947,
"grad_norm": 0.6097426414489746,
"learning_rate": 3.911368250322014e-05,
"loss": 0.02,
"num_input_tokens_seen": 134400,
"step": 140
},
{
"epoch": 0.9602649006622517,
"grad_norm": 0.4274413585662842,
"learning_rate": 3.8389813874789856e-05,
"loss": 0.0159,
"num_input_tokens_seen": 139424,
"step": 145
},
{
"epoch": 0.9933774834437086,
"grad_norm": 0.1949557512998581,
"learning_rate": 3.764984715551032e-05,
"loss": 0.017,
"num_input_tokens_seen": 144368,
"step": 150
},
{
"epoch": 1.0264900662251655,
"grad_norm": 0.9226788878440857,
"learning_rate": 3.6894671980710574e-05,
"loss": 0.0298,
"num_input_tokens_seen": 149040,
"step": 155
},
{
"epoch": 1.0596026490066226,
"grad_norm": 1.5730527639389038,
"learning_rate": 3.612519627029787e-05,
"loss": 0.0357,
"num_input_tokens_seen": 153712,
"step": 160
},
{
"epoch": 1.0927152317880795,
"grad_norm": 0.5750863552093506,
"learning_rate": 3.534234513719821e-05,
"loss": 0.0185,
"num_input_tokens_seen": 158640,
"step": 165
},
{
"epoch": 1.1258278145695364,
"grad_norm": 0.4966561198234558,
"learning_rate": 3.4547059775126445e-05,
"loss": 0.0339,
"num_input_tokens_seen": 163552,
"step": 170
},
{
"epoch": 1.1589403973509933,
"grad_norm": 0.6601276993751526,
"learning_rate": 3.3740296327022984e-05,
"loss": 0.0224,
"num_input_tokens_seen": 168352,
"step": 175
},
{
"epoch": 1.1920529801324504,
"grad_norm": 1.2051963806152344,
"learning_rate": 3.292302473551757e-05,
"loss": 0.0192,
"num_input_tokens_seen": 173312,
"step": 180
},
{
"epoch": 1.2251655629139073,
"grad_norm": 0.48197436332702637,
"learning_rate": 3.20962275768022e-05,
"loss": 0.0275,
"num_input_tokens_seen": 178160,
"step": 185
},
{
"epoch": 1.2582781456953642,
"grad_norm": 0.3365241289138794,
"learning_rate": 3.126089887931515e-05,
"loss": 0.0086,
"num_input_tokens_seen": 182944,
"step": 190
},
{
"epoch": 1.2913907284768211,
"grad_norm": 0.5984787940979004,
"learning_rate": 3.0418042928656414e-05,
"loss": 0.0166,
"num_input_tokens_seen": 187920,
"step": 195
},
{
"epoch": 1.3245033112582782,
"grad_norm": 0.3351554274559021,
"learning_rate": 2.9568673060171326e-05,
"loss": 0.0167,
"num_input_tokens_seen": 192992,
"step": 200
},
{
"epoch": 1.3576158940397351,
"grad_norm": 0.21267688274383545,
"learning_rate": 2.8713810440653926e-05,
"loss": 0.0137,
"num_input_tokens_seen": 197616,
"step": 205
},
{
"epoch": 1.390728476821192,
"grad_norm": 0.17382262647151947,
"learning_rate": 2.7854482840634965e-05,
"loss": 0.0142,
"num_input_tokens_seen": 202400,
"step": 210
},
{
"epoch": 1.423841059602649,
"grad_norm": 0.552691638469696,
"learning_rate": 2.6991723398730383e-05,
"loss": 0.0136,
"num_input_tokens_seen": 207216,
"step": 215
},
{
"epoch": 1.4569536423841059,
"grad_norm": 0.4015306532382965,
"learning_rate": 2.6126569379535985e-05,
"loss": 0.0184,
"num_input_tokens_seen": 211744,
"step": 220
},
{
"epoch": 1.490066225165563,
"grad_norm": 0.20611760020256042,
"learning_rate": 2.526006092656161e-05,
"loss": 0.0098,
"num_input_tokens_seen": 216608,
"step": 225
},
{
"epoch": 1.5231788079470199,
"grad_norm": 0.3923095762729645,
"learning_rate": 2.4393239811704e-05,
"loss": 0.012,
"num_input_tokens_seen": 221552,
"step": 230
},
{
"epoch": 1.5562913907284768,
"grad_norm": 0.2283228635787964,
"learning_rate": 2.3527148182762054e-05,
"loss": 0.0106,
"num_input_tokens_seen": 226272,
"step": 235
},
{
"epoch": 1.589403973509934,
"grad_norm": 1.2066081762313843,
"learning_rate": 2.2662827310499995e-05,
"loss": 0.0128,
"num_input_tokens_seen": 231072,
"step": 240
},
{
"epoch": 1.6225165562913908,
"grad_norm": 0.39357009530067444,
"learning_rate": 2.1801316336765126e-05,
"loss": 0.0139,
"num_input_tokens_seen": 235728,
"step": 245
},
{
"epoch": 1.6556291390728477,
"grad_norm": 0.2383226603269577,
"learning_rate": 2.0943651025164932e-05,
"loss": 0.0084,
"num_input_tokens_seen": 240560,
"step": 250
},
{
"epoch": 1.6887417218543046,
"grad_norm": 1.2794650793075562,
"learning_rate": 2.0090862515805898e-05,
"loss": 0.0143,
"num_input_tokens_seen": 245408,
"step": 255
},
{
"epoch": 1.7218543046357615,
"grad_norm": 0.5334519147872925,
"learning_rate": 1.9243976085590824e-05,
"loss": 0.011,
"num_input_tokens_seen": 250400,
"step": 260
},
{
"epoch": 1.7549668874172184,
"grad_norm": 0.32287150621414185,
"learning_rate": 1.840400991556541e-05,
"loss": 0.0127,
"num_input_tokens_seen": 255216,
"step": 265
},
{
"epoch": 1.7880794701986755,
"grad_norm": 0.7233603596687317,
"learning_rate": 1.7571973866795815e-05,
"loss": 0.0127,
"num_input_tokens_seen": 260080,
"step": 270
},
{
"epoch": 1.8211920529801324,
"grad_norm": 0.2672172784805298,
"learning_rate": 1.6748868266249114e-05,
"loss": 0.0122,
"num_input_tokens_seen": 264848,
"step": 275
},
{
"epoch": 1.8543046357615895,
"grad_norm": 0.32350099086761475,
"learning_rate": 1.5935682704136183e-05,
"loss": 0.0169,
"num_input_tokens_seen": 269776,
"step": 280
},
{
"epoch": 1.8874172185430464,
"grad_norm": 0.26331964135169983,
"learning_rate": 1.5133394844163093e-05,
"loss": 0.0242,
"num_input_tokens_seen": 274752,
"step": 285
},
{
"epoch": 1.9205298013245033,
"grad_norm": 0.25610876083374023,
"learning_rate": 1.4342969248121185e-05,
"loss": 0.0079,
"num_input_tokens_seen": 279440,
"step": 290
},
{
"epoch": 1.9536423841059603,
"grad_norm": 1.2221907377243042,
"learning_rate": 1.3565356216229268e-05,
"loss": 0.0194,
"num_input_tokens_seen": 284288,
"step": 295
},
{
"epoch": 1.9867549668874172,
"grad_norm": 0.46571576595306396,
"learning_rate": 1.2801490644621789e-05,
"loss": 0.0133,
"num_input_tokens_seen": 289216,
"step": 300
},
{
"epoch": 2.019867549668874,
"grad_norm": 0.06845066696405411,
"learning_rate": 1.2052290901357025e-05,
"loss": 0.0049,
"num_input_tokens_seen": 293992,
"step": 305
},
{
"epoch": 2.052980132450331,
"grad_norm": 0.3654076159000397,
"learning_rate": 1.1318657722296097e-05,
"loss": 0.0121,
"num_input_tokens_seen": 299048,
"step": 310
},
{
"epoch": 2.0860927152317883,
"grad_norm": 0.1736595779657364,
"learning_rate": 1.0601473128180855e-05,
"loss": 0.0102,
"num_input_tokens_seen": 303768,
"step": 315
},
{
"epoch": 2.119205298013245,
"grad_norm": 0.15334352850914001,
"learning_rate": 9.90159936421197e-06,
"loss": 0.0072,
"num_input_tokens_seen": 308872,
"step": 320
},
{
"epoch": 2.152317880794702,
"grad_norm": 0.15292225778102875,
"learning_rate": 9.219877863402682e-06,
"loss": 0.0066,
"num_input_tokens_seen": 313800,
"step": 325
},
{
"epoch": 2.185430463576159,
"grad_norm": 0.12052378058433533,
"learning_rate": 8.55712823495419e-06,
"loss": 0.0119,
"num_input_tokens_seen": 318536,
"step": 330
},
{
"epoch": 2.218543046357616,
"grad_norm": 0.16140712797641754,
"learning_rate": 7.91414727886898e-06,
"loss": 0.0057,
"num_input_tokens_seen": 323512,
"step": 335
},
{
"epoch": 2.251655629139073,
"grad_norm": 0.49767032265663147,
"learning_rate": 7.291708027986988e-06,
"loss": 0.0136,
"num_input_tokens_seen": 328552,
"step": 340
},
{
"epoch": 2.2847682119205297,
"grad_norm": 0.10487841814756393,
"learning_rate": 6.690558818595943e-06,
"loss": 0.0078,
"num_input_tokens_seen": 333272,
"step": 345
},
{
"epoch": 2.3178807947019866,
"grad_norm": 0.2247195988893509,
"learning_rate": 6.111422390733715e-06,
"loss": 0.0141,
"num_input_tokens_seen": 337816,
"step": 350
},
{
"epoch": 2.3509933774834435,
"grad_norm": 0.15797987580299377,
"learning_rate": 5.55499501926394e-06,
"loss": 0.014,
"num_input_tokens_seen": 342792,
"step": 355
},
{
"epoch": 2.384105960264901,
"grad_norm": 0.33236005902290344,
"learning_rate": 5.02194567676986e-06,
"loss": 0.0045,
"num_input_tokens_seen": 347288,
"step": 360
},
{
"epoch": 2.4172185430463577,
"grad_norm": 0.3886609375476837,
"learning_rate": 4.51291522927268e-06,
"loss": 0.0062,
"num_input_tokens_seen": 352088,
"step": 365
},
{
"epoch": 2.4503311258278146,
"grad_norm": 0.3580506145954132,
"learning_rate": 4.028515665741439e-06,
"loss": 0.0028,
"num_input_tokens_seen": 357096,
"step": 370
},
{
"epoch": 2.4834437086092715,
"grad_norm": 0.045303523540496826,
"learning_rate": 3.5693293623207086e-06,
"loss": 0.0022,
"num_input_tokens_seen": 361928,
"step": 375
},
{
"epoch": 2.5165562913907285,
"grad_norm": 0.3307873010635376,
"learning_rate": 3.135908382160771e-06,
"loss": 0.0061,
"num_input_tokens_seen": 366632,
"step": 380
},
{
"epoch": 2.5496688741721854,
"grad_norm": 0.31569671630859375,
"learning_rate": 2.728773811691923e-06,
"loss": 0.0077,
"num_input_tokens_seen": 371352,
"step": 385
},
{
"epoch": 2.5827814569536423,
"grad_norm": 0.25220000743865967,
"learning_rate": 2.348415134141102e-06,
"loss": 0.0053,
"num_input_tokens_seen": 375976,
"step": 390
},
{
"epoch": 2.6158940397350996,
"grad_norm": 0.12679828703403473,
"learning_rate": 1.995289641043768e-06,
"loss": 0.0021,
"num_input_tokens_seen": 381016,
"step": 395
},
{
"epoch": 2.6490066225165565,
"grad_norm": 0.2858022153377533,
"learning_rate": 1.6698218824588164e-06,
"loss": 0.0063,
"num_input_tokens_seen": 385688,
"step": 400
},
{
"epoch": 2.6821192052980134,
"grad_norm": 0.3448869287967682,
"learning_rate": 1.3724031565473112e-06,
"loss": 0.0052,
"num_input_tokens_seen": 390584,
"step": 405
},
{
"epoch": 2.7152317880794703,
"grad_norm": 0.26856377720832825,
"learning_rate": 1.1033910391288065e-06,
"loss": 0.0051,
"num_input_tokens_seen": 395576,
"step": 410
},
{
"epoch": 2.748344370860927,
"grad_norm": 0.3648998737335205,
"learning_rate": 8.631089537808307e-07,
"loss": 0.0084,
"num_input_tokens_seen": 400328,
"step": 415
},
{
"epoch": 2.781456953642384,
"grad_norm": 1.5257205963134766,
"learning_rate": 6.51845782998356e-07,
"loss": 0.0122,
"num_input_tokens_seen": 405192,
"step": 420
},
{
"epoch": 2.814569536423841,
"grad_norm": 0.4184490144252777,
"learning_rate": 4.698555208807853e-07,
"loss": 0.006,
"num_input_tokens_seen": 409912,
"step": 425
},
{
"epoch": 2.847682119205298,
"grad_norm": 0.39113691449165344,
"learning_rate": 3.1735696776400703e-07,
"loss": 0.0044,
"num_input_tokens_seen": 414792,
"step": 430
},
{
"epoch": 2.880794701986755,
"grad_norm": 0.49995261430740356,
"learning_rate": 1.9453346716462317e-07,
"loss": 0.0091,
"num_input_tokens_seen": 419752,
"step": 435
},
{
"epoch": 2.9139072847682117,
"grad_norm": 0.45024681091308594,
"learning_rate": 1.0153268535264827e-07,
"loss": 0.005,
"num_input_tokens_seen": 424808,
"step": 440
},
{
"epoch": 2.9470198675496686,
"grad_norm": 0.11354193091392517,
"learning_rate": 3.846643381766879e-08,
"loss": 0.0051,
"num_input_tokens_seen": 429704,
"step": 445
},
{
"epoch": 2.980132450331126,
"grad_norm": 0.07164571434259415,
"learning_rate": 5.41053484192644e-09,
"loss": 0.0049,
"num_input_tokens_seen": 434536,
"step": 450
},
{
"epoch": 3.0,
"num_input_tokens_seen": 437216,
"step": 453,
"total_flos": 1.8708380691726336e+16,
"train_loss": 0.044493223137528556,
"train_runtime": 988.3228,
"train_samples_per_second": 7.331,
"train_steps_per_second": 0.458
}
],
"logging_steps": 5,
"max_steps": 453,
"num_input_tokens_seen": 437216,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8708380691726336e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}