gpt2_m070_tiny-stories_1024_dpos / trainer_state.json
jonasknobloch's picture
Upload folder using huggingface_hub
46b5190 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 19132,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02613422538155969,
"grad_norm": 1.0069942474365234,
"learning_rate": 4.8693288730922014e-05,
"loss": 4.3789,
"step": 500
},
{
"epoch": 0.05226845076311938,
"grad_norm": 1.8369040489196777,
"learning_rate": 4.738657746184403e-05,
"loss": 2.9037,
"step": 1000
},
{
"epoch": 0.05226845076311938,
"eval_accuracy": 0.4486757469685219,
"eval_loss": 2.4443750381469727,
"eval_runtime": 52.7365,
"eval_samples_per_second": 116.731,
"eval_steps_per_second": 3.66,
"step": 1000
},
{
"epoch": 0.07840267614467906,
"grad_norm": 5.2621636390686035,
"learning_rate": 4.607986619276605e-05,
"loss": 2.2813,
"step": 1500
},
{
"epoch": 0.10453690152623876,
"grad_norm": 1.531705617904663,
"learning_rate": 4.477315492368807e-05,
"loss": 1.9735,
"step": 2000
},
{
"epoch": 0.10453690152623876,
"eval_accuracy": 0.5685718087623388,
"eval_loss": 1.796225905418396,
"eval_runtime": 52.7973,
"eval_samples_per_second": 116.597,
"eval_steps_per_second": 3.655,
"step": 2000
},
{
"epoch": 0.13067112690779845,
"grad_norm": 1.563498854637146,
"learning_rate": 4.346644365461008e-05,
"loss": 1.8191,
"step": 2500
},
{
"epoch": 0.15680535228935813,
"grad_norm": 1.4150679111480713,
"learning_rate": 4.21597323855321e-05,
"loss": 1.7239,
"step": 3000
},
{
"epoch": 0.15680535228935813,
"eval_accuracy": 0.6016185879419231,
"eval_loss": 1.6086262464523315,
"eval_runtime": 52.8364,
"eval_samples_per_second": 116.511,
"eval_steps_per_second": 3.653,
"step": 3000
},
{
"epoch": 0.18293957767091784,
"grad_norm": 1.4226484298706055,
"learning_rate": 4.085302111645411e-05,
"loss": 1.6536,
"step": 3500
},
{
"epoch": 0.20907380305247752,
"grad_norm": 1.2391685247421265,
"learning_rate": 3.954630984737613e-05,
"loss": 1.6013,
"step": 4000
},
{
"epoch": 0.20907380305247752,
"eval_accuracy": 0.6210625401344134,
"eval_loss": 1.50575852394104,
"eval_runtime": 52.7674,
"eval_samples_per_second": 116.663,
"eval_steps_per_second": 3.658,
"step": 4000
},
{
"epoch": 0.2352080284340372,
"grad_norm": 1.4180678129196167,
"learning_rate": 3.823959857829814e-05,
"loss": 1.5616,
"step": 4500
},
{
"epoch": 0.2613422538155969,
"grad_norm": 1.303842306137085,
"learning_rate": 3.6932887309220156e-05,
"loss": 1.5297,
"step": 5000
},
{
"epoch": 0.2613422538155969,
"eval_accuracy": 0.6315313100825268,
"eval_loss": 1.446053385734558,
"eval_runtime": 52.6764,
"eval_samples_per_second": 116.865,
"eval_steps_per_second": 3.664,
"step": 5000
},
{
"epoch": 0.2874764791971566,
"grad_norm": 1.2424602508544922,
"learning_rate": 3.5626176040142174e-05,
"loss": 1.5007,
"step": 5500
},
{
"epoch": 0.31361070457871626,
"grad_norm": 1.2661899328231812,
"learning_rate": 3.4319464771064186e-05,
"loss": 1.478,
"step": 6000
},
{
"epoch": 0.31361070457871626,
"eval_accuracy": 0.640106656707298,
"eval_loss": 1.4016118049621582,
"eval_runtime": 52.5005,
"eval_samples_per_second": 117.256,
"eval_steps_per_second": 3.676,
"step": 6000
},
{
"epoch": 0.339744929960276,
"grad_norm": 1.1755191087722778,
"learning_rate": 3.3012753501986204e-05,
"loss": 1.4558,
"step": 6500
},
{
"epoch": 0.3658791553418357,
"grad_norm": 1.3452744483947754,
"learning_rate": 3.1706042232908215e-05,
"loss": 1.4343,
"step": 7000
},
{
"epoch": 0.3658791553418357,
"eval_accuracy": 0.6475163189462378,
"eval_loss": 1.3634966611862183,
"eval_runtime": 52.4288,
"eval_samples_per_second": 117.416,
"eval_steps_per_second": 3.681,
"step": 7000
},
{
"epoch": 0.39201338072339537,
"grad_norm": 1.1488875150680542,
"learning_rate": 3.0399330963830237e-05,
"loss": 1.4183,
"step": 7500
},
{
"epoch": 0.41814760610495505,
"grad_norm": 1.1783889532089233,
"learning_rate": 2.9092619694752248e-05,
"loss": 1.4042,
"step": 8000
},
{
"epoch": 0.41814760610495505,
"eval_accuracy": 0.6532186608587288,
"eval_loss": 1.3342305421829224,
"eval_runtime": 52.5027,
"eval_samples_per_second": 117.251,
"eval_steps_per_second": 3.676,
"step": 8000
},
{
"epoch": 0.44428183148651473,
"grad_norm": 1.1511882543563843,
"learning_rate": 2.7785908425674263e-05,
"loss": 1.3937,
"step": 8500
},
{
"epoch": 0.4704160568680744,
"grad_norm": 1.1524832248687744,
"learning_rate": 2.647919715659628e-05,
"loss": 1.3786,
"step": 9000
},
{
"epoch": 0.4704160568680744,
"eval_accuracy": 0.6576487696559381,
"eval_loss": 1.3107670545578003,
"eval_runtime": 53.6197,
"eval_samples_per_second": 114.808,
"eval_steps_per_second": 3.599,
"step": 9000
},
{
"epoch": 0.4965502822496341,
"grad_norm": 1.1211012601852417,
"learning_rate": 2.5172485887518292e-05,
"loss": 1.3674,
"step": 9500
},
{
"epoch": 0.5226845076311938,
"grad_norm": 1.1335201263427734,
"learning_rate": 2.3865774618440313e-05,
"loss": 1.3566,
"step": 10000
},
{
"epoch": 0.5226845076311938,
"eval_accuracy": 0.6616726911954227,
"eval_loss": 1.2912213802337646,
"eval_runtime": 53.4337,
"eval_samples_per_second": 115.208,
"eval_steps_per_second": 3.612,
"step": 10000
},
{
"epoch": 0.5488187330127535,
"grad_norm": 1.109776496887207,
"learning_rate": 2.2559063349362325e-05,
"loss": 1.3481,
"step": 10500
},
{
"epoch": 0.5749529583943132,
"grad_norm": 1.0994426012039185,
"learning_rate": 2.125235208028434e-05,
"loss": 1.3389,
"step": 11000
},
{
"epoch": 0.5749529583943132,
"eval_accuracy": 0.665065101114903,
"eval_loss": 1.273192286491394,
"eval_runtime": 53.8053,
"eval_samples_per_second": 114.413,
"eval_steps_per_second": 3.587,
"step": 11000
},
{
"epoch": 0.6010871837758729,
"grad_norm": 1.1425756216049194,
"learning_rate": 1.9945640811206358e-05,
"loss": 1.3294,
"step": 11500
},
{
"epoch": 0.6272214091574325,
"grad_norm": 1.1189128160476685,
"learning_rate": 1.8638929542128372e-05,
"loss": 1.3228,
"step": 12000
},
{
"epoch": 0.6272214091574325,
"eval_accuracy": 0.6674850752383293,
"eval_loss": 1.2623748779296875,
"eval_runtime": 53.8318,
"eval_samples_per_second": 114.356,
"eval_steps_per_second": 3.585,
"step": 12000
},
{
"epoch": 0.6533556345389923,
"grad_norm": 1.1318820714950562,
"learning_rate": 1.7332218273050387e-05,
"loss": 1.3188,
"step": 12500
},
{
"epoch": 0.679489859920552,
"grad_norm": 1.1322083473205566,
"learning_rate": 1.6025507003972405e-05,
"loss": 1.3105,
"step": 13000
},
{
"epoch": 0.679489859920552,
"eval_accuracy": 0.6702737619545769,
"eval_loss": 1.2479877471923828,
"eval_runtime": 53.2542,
"eval_samples_per_second": 115.596,
"eval_steps_per_second": 3.624,
"step": 13000
},
{
"epoch": 0.7056240853021116,
"grad_norm": 1.105891227722168,
"learning_rate": 1.471879573489442e-05,
"loss": 1.3034,
"step": 13500
},
{
"epoch": 0.7317583106836714,
"grad_norm": 1.1217329502105713,
"learning_rate": 1.3412084465816434e-05,
"loss": 1.2968,
"step": 14000
},
{
"epoch": 0.7317583106836714,
"eval_accuracy": 0.6723256586489939,
"eval_loss": 1.237587809562683,
"eval_runtime": 52.8726,
"eval_samples_per_second": 116.431,
"eval_steps_per_second": 3.65,
"step": 14000
},
{
"epoch": 0.757892536065231,
"grad_norm": 1.0946693420410156,
"learning_rate": 1.210537319673845e-05,
"loss": 1.2924,
"step": 14500
},
{
"epoch": 0.7840267614467907,
"grad_norm": 1.081743836402893,
"learning_rate": 1.0798661927660466e-05,
"loss": 1.2894,
"step": 15000
},
{
"epoch": 0.7840267614467907,
"eval_accuracy": 0.674236548977164,
"eval_loss": 1.2293522357940674,
"eval_runtime": 53.025,
"eval_samples_per_second": 116.096,
"eval_steps_per_second": 3.64,
"step": 15000
},
{
"epoch": 0.8101609868283504,
"grad_norm": 1.1411429643630981,
"learning_rate": 9.49195065858248e-06,
"loss": 1.2839,
"step": 15500
},
{
"epoch": 0.8362952122099101,
"grad_norm": 1.1418683528900146,
"learning_rate": 8.185239389504495e-06,
"loss": 1.2795,
"step": 16000
},
{
"epoch": 0.8362952122099101,
"eval_accuracy": 0.6758117552307328,
"eval_loss": 1.2207015752792358,
"eval_runtime": 53.0931,
"eval_samples_per_second": 115.947,
"eval_steps_per_second": 3.635,
"step": 16000
},
{
"epoch": 0.8624294375914698,
"grad_norm": 1.115324854850769,
"learning_rate": 6.878528120426511e-06,
"loss": 1.2737,
"step": 16500
},
{
"epoch": 0.8885636629730295,
"grad_norm": 1.1184618473052979,
"learning_rate": 5.571816851348526e-06,
"loss": 1.2718,
"step": 17000
},
{
"epoch": 0.8885636629730295,
"eval_accuracy": 0.6774212603301455,
"eval_loss": 1.2131068706512451,
"eval_runtime": 53.0358,
"eval_samples_per_second": 116.073,
"eval_steps_per_second": 3.639,
"step": 17000
},
{
"epoch": 0.9146978883545892,
"grad_norm": 1.1303164958953857,
"learning_rate": 4.265105582270542e-06,
"loss": 1.2676,
"step": 17500
},
{
"epoch": 0.9408321137361488,
"grad_norm": 1.0928161144256592,
"learning_rate": 2.958394313192557e-06,
"loss": 1.2679,
"step": 18000
},
{
"epoch": 0.9408321137361488,
"eval_accuracy": 0.6784538778973791,
"eval_loss": 1.2084242105484009,
"eval_runtime": 52.7134,
"eval_samples_per_second": 116.782,
"eval_steps_per_second": 3.661,
"step": 18000
},
{
"epoch": 0.9669663391177086,
"grad_norm": 1.1279501914978027,
"learning_rate": 1.6516830441145725e-06,
"loss": 1.2626,
"step": 18500
},
{
"epoch": 0.9931005644992682,
"grad_norm": 1.1036733388900757,
"learning_rate": 3.4497177503658794e-07,
"loss": 1.2646,
"step": 19000
},
{
"epoch": 0.9931005644992682,
"eval_accuracy": 0.6793310391216447,
"eval_loss": 1.204505443572998,
"eval_runtime": 52.9636,
"eval_samples_per_second": 116.231,
"eval_steps_per_second": 3.644,
"step": 19000
},
{
"epoch": 1.0,
"step": 19132,
"total_flos": 3.19927531732992e+17,
"train_loss": 1.5489698861050551,
"train_runtime": 7449.0972,
"train_samples_per_second": 82.185,
"train_steps_per_second": 2.568
}
],
"logging_steps": 500,
"max_steps": 19132,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.19927531732992e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}