htech_compliance / training_data_v3 /trainer_state.json
cpiuk's picture
Add v3 training logs (trainer_state.json)
e829e30 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 870,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06896551724137931,
"grad_norm": 0.7040123343467712,
"learning_rate": 6e-05,
"loss": 2.4602,
"step": 10
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.5209792256355286,
"learning_rate": 0.00012666666666666666,
"loss": 2.082,
"step": 20
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.5385194420814514,
"learning_rate": 0.00019333333333333333,
"loss": 1.7231,
"step": 30
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.39407166838645935,
"learning_rate": 0.00019995880424308071,
"loss": 1.4061,
"step": 40
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.41344210505485535,
"learning_rate": 0.00019981644273304322,
"loss": 1.1996,
"step": 50
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.7267842292785645,
"learning_rate": 0.00019957255165063584,
"loss": 1.0567,
"step": 60
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.4600793719291687,
"learning_rate": 0.0001992273790727949,
"loss": 0.9303,
"step": 70
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.5658743381500244,
"learning_rate": 0.00019878127609622607,
"loss": 0.872,
"step": 80
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.6658106446266174,
"learning_rate": 0.00019823469648028207,
"loss": 0.8219,
"step": 90
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.8075541853904724,
"learning_rate": 0.0001975881961854155,
"loss": 0.663,
"step": 100
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.5715014934539795,
"learning_rate": 0.00019684243280767633,
"loss": 0.612,
"step": 110
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.6722455024719238,
"learning_rate": 0.00019599816490983006,
"loss": 0.5205,
"step": 120
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.665215253829956,
"learning_rate": 0.0001950562512497755,
"loss": 0.4817,
"step": 130
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.7088465690612793,
"learning_rate": 0.00019401764990704842,
"loss": 0.4479,
"step": 140
},
{
"epoch": 1.0,
"eval_loss": 0.40697577595710754,
"eval_runtime": 125.2582,
"eval_samples_per_second": 8.255,
"eval_steps_per_second": 2.068,
"step": 145
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.6929165720939636,
"learning_rate": 0.0001928834173082986,
"loss": 0.3906,
"step": 150
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.5821729898452759,
"learning_rate": 0.00019165470715273197,
"loss": 0.3082,
"step": 160
},
{
"epoch": 1.1724137931034484,
"grad_norm": 0.8513327836990356,
"learning_rate": 0.0001903327692386107,
"loss": 0.3073,
"step": 170
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.5871995687484741,
"learning_rate": 0.0001889189481920048,
"loss": 0.2877,
"step": 180
},
{
"epoch": 1.3103448275862069,
"grad_norm": 0.5694791078567505,
"learning_rate": 0.0001874146820990887,
"loss": 0.2462,
"step": 190
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.53709876537323,
"learning_rate": 0.00018582150104337326,
"loss": 0.2469,
"step": 200
},
{
"epoch": 1.4482758620689655,
"grad_norm": 0.5729167461395264,
"learning_rate": 0.00018414102554936194,
"loss": 0.204,
"step": 210
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.5852716565132141,
"learning_rate": 0.0001823749649342135,
"loss": 0.1935,
"step": 220
},
{
"epoch": 1.5862068965517242,
"grad_norm": 0.5843812227249146,
"learning_rate": 0.000180525115569088,
"loss": 0.2048,
"step": 230
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.471234530210495,
"learning_rate": 0.00017859335905194521,
"loss": 0.1982,
"step": 240
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.45676925778388977,
"learning_rate": 0.00017658166029365288,
"loss": 0.1631,
"step": 250
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.44925084710121155,
"learning_rate": 0.00017449206551935258,
"loss": 0.1504,
"step": 260
},
{
"epoch": 1.8620689655172413,
"grad_norm": 0.4521825313568115,
"learning_rate": 0.00017232670018711572,
"loss": 0.1397,
"step": 270
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.3619280755519867,
"learning_rate": 0.0001700877668260065,
"loss": 0.129,
"step": 280
},
{
"epoch": 2.0,
"grad_norm": 0.46234917640686035,
"learning_rate": 0.00016777754279575136,
"loss": 0.1276,
"step": 290
},
{
"epoch": 2.0,
"eval_loss": 0.14752867817878723,
"eval_runtime": 112.8661,
"eval_samples_per_second": 9.161,
"eval_steps_per_second": 2.295,
"step": 290
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.4741278886795044,
"learning_rate": 0.00016539837797029341,
"loss": 0.1097,
"step": 300
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.46682700514793396,
"learning_rate": 0.00016295269234758796,
"loss": 0.1014,
"step": 310
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.48358821868896484,
"learning_rate": 0.00016044297358807085,
"loss": 0.096,
"step": 320
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.4410136044025421,
"learning_rate": 0.00015787177448430253,
"loss": 0.1056,
"step": 330
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.46466073393821716,
"learning_rate": 0.00015524171036436255,
"loss": 0.09,
"step": 340
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.37848514318466187,
"learning_rate": 0.00015255545643163516,
"loss": 0.095,
"step": 350
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.3724570870399475,
"learning_rate": 0.00014981574504369194,
"loss": 0.0897,
"step": 360
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.4251348078250885,
"learning_rate": 0.00014702536293303924,
"loss": 0.0953,
"step": 370
},
{
"epoch": 2.6206896551724137,
"grad_norm": 0.33401480317115784,
"learning_rate": 0.00014418714837255764,
"loss": 0.0856,
"step": 380
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.3629361689090729,
"learning_rate": 0.00014130398828851625,
"loss": 0.0986,
"step": 390
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.3115435540676117,
"learning_rate": 0.00013837881532409888,
"loss": 0.087,
"step": 400
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.32981187105178833,
"learning_rate": 0.00013541460485642825,
"loss": 0.0755,
"step": 410
},
{
"epoch": 2.896551724137931,
"grad_norm": 0.4125177264213562,
"learning_rate": 0.00013241437197012326,
"loss": 0.0765,
"step": 420
},
{
"epoch": 2.9655172413793105,
"grad_norm": 0.3462369740009308,
"learning_rate": 0.00012938116839046704,
"loss": 0.0754,
"step": 430
},
{
"epoch": 3.0,
"eval_loss": 0.09198899567127228,
"eval_runtime": 105.7366,
"eval_samples_per_second": 9.779,
"eval_steps_per_second": 2.449,
"step": 435
},
{
"epoch": 3.0344827586206895,
"grad_norm": 0.41924241185188293,
"learning_rate": 0.0001263180793793054,
"loss": 0.0753,
"step": 440
},
{
"epoch": 3.103448275862069,
"grad_norm": 0.30730345845222473,
"learning_rate": 0.00012322822059683344,
"loss": 0.0608,
"step": 450
},
{
"epoch": 3.1724137931034484,
"grad_norm": 0.2622484564781189,
"learning_rate": 0.00012011473493246166,
"loss": 0.0537,
"step": 460
},
{
"epoch": 3.2413793103448274,
"grad_norm": 0.4255964756011963,
"learning_rate": 0.00011698078930798606,
"loss": 0.0608,
"step": 470
},
{
"epoch": 3.310344827586207,
"grad_norm": 0.32836616039276123,
"learning_rate": 0.00011382957145631284,
"loss": 0.0603,
"step": 480
},
{
"epoch": 3.3793103448275863,
"grad_norm": 0.3328634798526764,
"learning_rate": 0.00011066428667901523,
"loss": 0.0491,
"step": 490
},
{
"epoch": 3.4482758620689653,
"grad_norm": 0.33855220675468445,
"learning_rate": 0.00010748815458601989,
"loss": 0.0565,
"step": 500
},
{
"epoch": 3.5172413793103448,
"grad_norm": 0.35493630170822144,
"learning_rate": 0.00010430440582073946,
"loss": 0.0516,
"step": 510
},
{
"epoch": 3.586206896551724,
"grad_norm": 0.31457582116127014,
"learning_rate": 0.00010111627877398236,
"loss": 0.0483,
"step": 520
},
{
"epoch": 3.655172413793103,
"grad_norm": 0.2784205377101898,
"learning_rate": 9.7927016289982e-05,
"loss": 0.0561,
"step": 530
},
{
"epoch": 3.7241379310344827,
"grad_norm": 0.31077679991722107,
"learning_rate": 9.473986236789633e-05,
"loss": 0.0513,
"step": 540
},
{
"epoch": 3.793103448275862,
"grad_norm": 0.262256383895874,
"learning_rate": 9.155805886213265e-05,
"loss": 0.0553,
"step": 550
},
{
"epoch": 3.862068965517241,
"grad_norm": 0.2638000547885895,
"learning_rate": 8.838484218485358e-05,
"loss": 0.0488,
"step": 560
},
{
"epoch": 3.9310344827586206,
"grad_norm": 0.2960629165172577,
"learning_rate": 8.522344001401945e-05,
"loss": 0.0539,
"step": 570
},
{
"epoch": 4.0,
"grad_norm": 0.3162899911403656,
"learning_rate": 8.207706801031408e-05,
"loss": 0.0505,
"step": 580
},
{
"epoch": 4.0,
"eval_loss": 0.06869391351938248,
"eval_runtime": 107.7637,
"eval_samples_per_second": 9.595,
"eval_steps_per_second": 2.403,
"step": 580
},
{
"epoch": 4.068965517241379,
"grad_norm": 0.23831477761268616,
"learning_rate": 7.894892654629438e-05,
"loss": 0.0346,
"step": 590
},
{
"epoch": 4.137931034482759,
"grad_norm": 0.20031996071338654,
"learning_rate": 7.584219745109047e-05,
"loss": 0.0359,
"step": 600
},
{
"epoch": 4.206896551724138,
"grad_norm": 0.1699780821800232,
"learning_rate": 7.276004077396747e-05,
"loss": 0.0401,
"step": 610
},
{
"epoch": 4.275862068965517,
"grad_norm": 0.22816252708435059,
"learning_rate": 6.970559157004097e-05,
"loss": 0.0461,
"step": 620
},
{
"epoch": 4.344827586206897,
"grad_norm": 0.2750411927700043,
"learning_rate": 6.668195671141542e-05,
"loss": 0.0358,
"step": 630
},
{
"epoch": 4.413793103448276,
"grad_norm": 0.25011658668518066,
"learning_rate": 6.369221172698963e-05,
"loss": 0.0402,
"step": 640
},
{
"epoch": 4.482758620689655,
"grad_norm": 0.19483405351638794,
"learning_rate": 6.073939767414305e-05,
"loss": 0.0317,
"step": 650
},
{
"epoch": 4.551724137931035,
"grad_norm": 0.2074318677186966,
"learning_rate": 5.782651804548538e-05,
"loss": 0.034,
"step": 660
},
{
"epoch": 4.620689655172414,
"grad_norm": 0.19932489097118378,
"learning_rate": 5.495653571381554e-05,
"loss": 0.0358,
"step": 670
},
{
"epoch": 4.689655172413794,
"grad_norm": 0.19241374731063843,
"learning_rate": 5.213236991839781e-05,
"loss": 0.0368,
"step": 680
},
{
"epoch": 4.758620689655173,
"grad_norm": 0.21923574805259705,
"learning_rate": 4.93568932956201e-05,
"loss": 0.0367,
"step": 690
},
{
"epoch": 4.827586206896552,
"grad_norm": 0.24106892943382263,
"learning_rate": 4.663292895705526e-05,
"loss": 0.0424,
"step": 700
},
{
"epoch": 4.896551724137931,
"grad_norm": 0.21531100571155548,
"learning_rate": 4.396324761789672e-05,
"loss": 0.0436,
"step": 710
},
{
"epoch": 4.9655172413793105,
"grad_norm": 0.22390952706336975,
"learning_rate": 4.1350564778690424e-05,
"loss": 0.0415,
"step": 720
},
{
"epoch": 5.0,
"eval_loss": 0.06067777797579765,
"eval_runtime": 107.7054,
"eval_samples_per_second": 9.6,
"eval_steps_per_second": 2.405,
"step": 725
},
{
"epoch": 5.0344827586206895,
"grad_norm": 0.1765316277742386,
"learning_rate": 3.879753796322845e-05,
"loss": 0.0304,
"step": 730
},
{
"epoch": 5.103448275862069,
"grad_norm": 0.14481499791145325,
"learning_rate": 3.630676401541466e-05,
"loss": 0.028,
"step": 740
},
{
"epoch": 5.172413793103448,
"grad_norm": 0.2429375946521759,
"learning_rate": 3.388077645785186e-05,
"loss": 0.0314,
"step": 750
},
{
"epoch": 5.241379310344827,
"grad_norm": 0.1938163936138153,
"learning_rate": 3.1522042914836704e-05,
"loss": 0.0298,
"step": 760
},
{
"epoch": 5.310344827586207,
"grad_norm": 0.2667997181415558,
"learning_rate": 2.923296260238412e-05,
"loss": 0.0299,
"step": 770
},
{
"epoch": 5.379310344827586,
"grad_norm": 0.2025449424982071,
"learning_rate": 2.7015863887833947e-05,
"loss": 0.0299,
"step": 780
},
{
"epoch": 5.448275862068965,
"grad_norm": 0.28111398220062256,
"learning_rate": 2.4873001921522444e-05,
"loss": 0.0319,
"step": 790
},
{
"epoch": 5.517241379310345,
"grad_norm": 0.17337462306022644,
"learning_rate": 2.2806556342927142e-05,
"loss": 0.0308,
"step": 800
},
{
"epoch": 5.586206896551724,
"grad_norm": 0.1848144233226776,
"learning_rate": 2.0818629063618656e-05,
"loss": 0.0307,
"step": 810
},
{
"epoch": 5.655172413793103,
"grad_norm": 0.16859489679336548,
"learning_rate": 1.8911242129274498e-05,
"loss": 0.0294,
"step": 820
},
{
"epoch": 5.724137931034483,
"grad_norm": 0.20610836148262024,
"learning_rate": 1.7086335662929352e-05,
"loss": 0.0288,
"step": 830
},
{
"epoch": 5.793103448275862,
"grad_norm": 0.17157162725925446,
"learning_rate": 1.5345765891554163e-05,
"loss": 0.0263,
"step": 840
},
{
"epoch": 5.862068965517241,
"grad_norm": 0.21943090856075287,
"learning_rate": 1.3691303257971033e-05,
"loss": 0.0336,
"step": 850
},
{
"epoch": 5.931034482758621,
"grad_norm": 0.17992551624774933,
"learning_rate": 1.2124630620024746e-05,
"loss": 0.0288,
"step": 860
},
{
"epoch": 6.0,
"grad_norm": 0.2768034040927887,
"learning_rate": 1.0647341538842282e-05,
"loss": 0.0327,
"step": 870
},
{
"epoch": 6.0,
"eval_loss": 0.05765723064541817,
"eval_runtime": 106.1855,
"eval_samples_per_second": 9.738,
"eval_steps_per_second": 2.439,
"step": 870
}
],
"logging_steps": 10,
"max_steps": 1015,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.3175319780130816e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}