gpt-a / trainer_state.json
luxopes's picture
Upload 13 files
719fe40 verified
{
"best_global_step": 4000,
"best_metric": 2.539825201034546,
"best_model_checkpoint": "./qlora_gpt2/checkpoint-4000",
"epoch": 2.2785531187695813,
"eval_steps": 500,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05696382796923953,
"grad_norm": 0.22663557529449463,
"learning_rate": 3.960000000000001e-05,
"loss": 3.4511,
"step": 100
},
{
"epoch": 0.11392765593847906,
"grad_norm": 0.2373093217611313,
"learning_rate": 7.960000000000001e-05,
"loss": 3.068,
"step": 200
},
{
"epoch": 0.1708914839077186,
"grad_norm": 0.25586986541748047,
"learning_rate": 0.00011960000000000001,
"loss": 2.9652,
"step": 300
},
{
"epoch": 0.22785531187695812,
"grad_norm": 0.24306726455688477,
"learning_rate": 0.0001596,
"loss": 2.9145,
"step": 400
},
{
"epoch": 0.28481913984619767,
"grad_norm": 0.2453053742647171,
"learning_rate": 0.0001996,
"loss": 2.8514,
"step": 500
},
{
"epoch": 0.28481913984619767,
"eval_loss": 2.73831844329834,
"eval_runtime": 780.3351,
"eval_samples_per_second": 15.997,
"eval_steps_per_second": 1.001,
"step": 500
},
{
"epoch": 0.3417829678154372,
"grad_norm": 0.2287718951702118,
"learning_rate": 0.00019584731543624163,
"loss": 2.8526,
"step": 600
},
{
"epoch": 0.39874679578467676,
"grad_norm": 0.23401789367198944,
"learning_rate": 0.0001916526845637584,
"loss": 2.8242,
"step": 700
},
{
"epoch": 0.45571062375391624,
"grad_norm": 0.2420588731765747,
"learning_rate": 0.0001874580536912752,
"loss": 2.8295,
"step": 800
},
{
"epoch": 0.5126744517231558,
"grad_norm": 0.24802158772945404,
"learning_rate": 0.00018326342281879197,
"loss": 2.7982,
"step": 900
},
{
"epoch": 0.5696382796923953,
"grad_norm": 0.24083346128463745,
"learning_rate": 0.00017906879194630872,
"loss": 2.7647,
"step": 1000
},
{
"epoch": 0.5696382796923953,
"eval_loss": 2.657050371170044,
"eval_runtime": 780.7988,
"eval_samples_per_second": 15.987,
"eval_steps_per_second": 1.0,
"step": 1000
},
{
"epoch": 0.6266021076616348,
"grad_norm": 0.23890583217144012,
"learning_rate": 0.0001748741610738255,
"loss": 2.7541,
"step": 1100
},
{
"epoch": 0.6835659356308744,
"grad_norm": 0.2339860498905182,
"learning_rate": 0.00017067953020134227,
"loss": 2.7423,
"step": 1200
},
{
"epoch": 0.7405297636001139,
"grad_norm": 0.224105566740036,
"learning_rate": 0.00016648489932885908,
"loss": 2.7567,
"step": 1300
},
{
"epoch": 0.7974935915693535,
"grad_norm": 0.21676279604434967,
"learning_rate": 0.00016229026845637586,
"loss": 2.7369,
"step": 1400
},
{
"epoch": 0.854457419538593,
"grad_norm": 0.22006016969680786,
"learning_rate": 0.00015809563758389263,
"loss": 2.7527,
"step": 1500
},
{
"epoch": 0.854457419538593,
"eval_loss": 2.614635467529297,
"eval_runtime": 780.923,
"eval_samples_per_second": 15.985,
"eval_steps_per_second": 1.0,
"step": 1500
},
{
"epoch": 0.9114212475078325,
"grad_norm": 0.2208578735589981,
"learning_rate": 0.0001539010067114094,
"loss": 2.7038,
"step": 1600
},
{
"epoch": 0.9683850754770721,
"grad_norm": 0.245719775557518,
"learning_rate": 0.00014970637583892616,
"loss": 2.7085,
"step": 1700
},
{
"epoch": 1.0256337225861578,
"grad_norm": 0.22791120409965515,
"learning_rate": 0.00014551174496644294,
"loss": 2.7286,
"step": 1800
},
{
"epoch": 1.0825975505553973,
"grad_norm": 0.2143191248178482,
"learning_rate": 0.00014131711409395975,
"loss": 2.6748,
"step": 1900
},
{
"epoch": 1.1395613785246368,
"grad_norm": 0.2522701323032379,
"learning_rate": 0.00013712248322147652,
"loss": 2.6871,
"step": 2000
},
{
"epoch": 1.1395613785246368,
"eval_loss": 2.5919432640075684,
"eval_runtime": 777.5861,
"eval_samples_per_second": 16.054,
"eval_steps_per_second": 1.004,
"step": 2000
},
{
"epoch": 1.1965252064938765,
"grad_norm": 0.21982581913471222,
"learning_rate": 0.0001329278523489933,
"loss": 2.644,
"step": 2100
},
{
"epoch": 1.253489034463116,
"grad_norm": 0.19931554794311523,
"learning_rate": 0.00012873322147651008,
"loss": 2.6782,
"step": 2200
},
{
"epoch": 1.3104528624323555,
"grad_norm": 0.22992636263370514,
"learning_rate": 0.00012453859060402686,
"loss": 2.6368,
"step": 2300
},
{
"epoch": 1.367416690401595,
"grad_norm": 0.257996529340744,
"learning_rate": 0.00012034395973154362,
"loss": 2.6744,
"step": 2400
},
{
"epoch": 1.4243805183708345,
"grad_norm": 0.23609480261802673,
"learning_rate": 0.0001161493288590604,
"loss": 2.662,
"step": 2500
},
{
"epoch": 1.4243805183708345,
"eval_loss": 2.572186231613159,
"eval_runtime": 779.8917,
"eval_samples_per_second": 16.006,
"eval_steps_per_second": 1.001,
"step": 2500
},
{
"epoch": 1.481344346340074,
"grad_norm": 0.22605575621128082,
"learning_rate": 0.00011195469798657718,
"loss": 2.6789,
"step": 2600
},
{
"epoch": 1.5383081743093134,
"grad_norm": 0.2314230501651764,
"learning_rate": 0.00010776006711409397,
"loss": 2.6356,
"step": 2700
},
{
"epoch": 1.5952720022785531,
"grad_norm": 0.23274995386600494,
"learning_rate": 0.00010356543624161075,
"loss": 2.661,
"step": 2800
},
{
"epoch": 1.6522358302477926,
"grad_norm": 0.20582696795463562,
"learning_rate": 9.937080536912751e-05,
"loss": 2.643,
"step": 2900
},
{
"epoch": 1.7091996582170323,
"grad_norm": 0.2208004742860794,
"learning_rate": 9.51761744966443e-05,
"loss": 2.6328,
"step": 3000
},
{
"epoch": 1.7091996582170323,
"eval_loss": 2.558286190032959,
"eval_runtime": 779.5251,
"eval_samples_per_second": 16.014,
"eval_steps_per_second": 1.002,
"step": 3000
},
{
"epoch": 1.7661634861862718,
"grad_norm": 0.23239333927631378,
"learning_rate": 9.098154362416108e-05,
"loss": 2.6733,
"step": 3100
},
{
"epoch": 1.8231273141555113,
"grad_norm": 0.2152535766363144,
"learning_rate": 8.678691275167785e-05,
"loss": 2.6505,
"step": 3200
},
{
"epoch": 1.8800911421247508,
"grad_norm": 0.21094359457492828,
"learning_rate": 8.259228187919464e-05,
"loss": 2.6153,
"step": 3300
},
{
"epoch": 1.9370549700939903,
"grad_norm": 0.20640310645103455,
"learning_rate": 7.839765100671142e-05,
"loss": 2.6411,
"step": 3400
},
{
"epoch": 1.9940187980632298,
"grad_norm": 0.2932434678077698,
"learning_rate": 7.42030201342282e-05,
"loss": 2.6608,
"step": 3500
},
{
"epoch": 1.9940187980632298,
"eval_loss": 2.5474600791931152,
"eval_runtime": 779.7457,
"eval_samples_per_second": 16.009,
"eval_steps_per_second": 1.002,
"step": 3500
},
{
"epoch": 2.0506978068926234,
"grad_norm": 0.22819621860980988,
"learning_rate": 7.000838926174496e-05,
"loss": 2.6262,
"step": 3600
},
{
"epoch": 2.107661634861863,
"grad_norm": 0.23161746561527252,
"learning_rate": 6.581375838926175e-05,
"loss": 2.6318,
"step": 3700
},
{
"epoch": 2.1646254628311024,
"grad_norm": 0.26975372433662415,
"learning_rate": 6.161912751677853e-05,
"loss": 2.6433,
"step": 3800
},
{
"epoch": 2.221589290800342,
"grad_norm": 0.23308990895748138,
"learning_rate": 5.74244966442953e-05,
"loss": 2.6375,
"step": 3900
},
{
"epoch": 2.2785531187695813,
"grad_norm": 0.23112072050571442,
"learning_rate": 5.322986577181208e-05,
"loss": 2.6248,
"step": 4000
},
{
"epoch": 2.2785531187695813,
"eval_loss": 2.539825201034546,
"eval_runtime": 779.2483,
"eval_samples_per_second": 16.019,
"eval_steps_per_second": 1.002,
"step": 4000
}
],
"logging_steps": 100,
"max_steps": 5268,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.439796133185454e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}