outputs / trainer_state.json
starlineventures's picture
starlineventures/pilot-talk
a4cc6c5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 480,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0625,
"grad_norm": 0.5966607332229614,
"learning_rate": 9.8125e-05,
"loss": 1.9853,
"mean_token_accuracy": 0.7986810088157654,
"num_tokens": 81920.0,
"step": 10
},
{
"epoch": 0.125,
"grad_norm": 0.517964780330658,
"learning_rate": 9.604166666666668e-05,
"loss": 0.1767,
"mean_token_accuracy": 0.9620053827762604,
"num_tokens": 163840.0,
"step": 20
},
{
"epoch": 0.1875,
"grad_norm": 0.3082728683948517,
"learning_rate": 9.395833333333333e-05,
"loss": 0.0745,
"mean_token_accuracy": 0.9809599459171295,
"num_tokens": 245760.0,
"step": 30
},
{
"epoch": 0.25,
"grad_norm": 0.1719624400138855,
"learning_rate": 9.1875e-05,
"loss": 0.0512,
"mean_token_accuracy": 0.9850024461746216,
"num_tokens": 327680.0,
"step": 40
},
{
"epoch": 0.3125,
"grad_norm": 0.16888852417469025,
"learning_rate": 8.979166666666668e-05,
"loss": 0.0432,
"mean_token_accuracy": 0.9855642437934875,
"num_tokens": 409600.0,
"step": 50
},
{
"epoch": 0.375,
"grad_norm": 0.18019415438175201,
"learning_rate": 8.770833333333334e-05,
"loss": 0.0358,
"mean_token_accuracy": 0.9871152937412262,
"num_tokens": 491520.0,
"step": 60
},
{
"epoch": 0.4375,
"grad_norm": 0.14396421611309052,
"learning_rate": 8.5625e-05,
"loss": 0.0322,
"mean_token_accuracy": 0.9876770913600922,
"num_tokens": 573440.0,
"step": 70
},
{
"epoch": 0.5,
"grad_norm": 0.1333707869052887,
"learning_rate": 8.354166666666667e-05,
"loss": 0.0307,
"mean_token_accuracy": 0.9872984886169434,
"num_tokens": 655360.0,
"step": 80
},
{
"epoch": 0.5625,
"grad_norm": 0.118904247879982,
"learning_rate": 8.145833333333334e-05,
"loss": 0.0298,
"mean_token_accuracy": 0.9874206185340881,
"num_tokens": 737280.0,
"step": 90
},
{
"epoch": 0.625,
"grad_norm": 0.10813665390014648,
"learning_rate": 7.9375e-05,
"loss": 0.0288,
"mean_token_accuracy": 0.9879335641860962,
"num_tokens": 819200.0,
"step": 100
},
{
"epoch": 0.6875,
"grad_norm": 0.11911392956972122,
"learning_rate": 7.729166666666667e-05,
"loss": 0.0287,
"mean_token_accuracy": 0.987469470500946,
"num_tokens": 901120.0,
"step": 110
},
{
"epoch": 0.75,
"grad_norm": 0.11651206016540527,
"learning_rate": 7.520833333333334e-05,
"loss": 0.0281,
"mean_token_accuracy": 0.9881045460700989,
"num_tokens": 983040.0,
"step": 120
},
{
"epoch": 0.8125,
"grad_norm": 0.12028653174638748,
"learning_rate": 7.3125e-05,
"loss": 0.0279,
"mean_token_accuracy": 0.9879335641860962,
"num_tokens": 1064960.0,
"step": 130
},
{
"epoch": 0.875,
"grad_norm": 0.10548015683889389,
"learning_rate": 7.104166666666667e-05,
"loss": 0.0276,
"mean_token_accuracy": 0.9877259433269501,
"num_tokens": 1146880.0,
"step": 140
},
{
"epoch": 0.9375,
"grad_norm": 0.10059994459152222,
"learning_rate": 6.895833333333333e-05,
"loss": 0.0276,
"mean_token_accuracy": 0.9879824161529541,
"num_tokens": 1228800.0,
"step": 150
},
{
"epoch": 1.0,
"grad_norm": 0.09910538047552109,
"learning_rate": 6.6875e-05,
"loss": 0.0275,
"mean_token_accuracy": 0.9881045460700989,
"num_tokens": 1310720.0,
"step": 160
},
{
"epoch": 1.0,
"eval_runtime": 11.8471,
"eval_samples_per_second": 13.505,
"eval_steps_per_second": 0.844,
"step": 160
},
{
"epoch": 1.0,
"eval_runtime": 10.9136,
"eval_samples_per_second": 14.661,
"eval_steps_per_second": 0.916,
"step": 160
},
{
"epoch": 1.0625,
"grad_norm": 0.10032763332128525,
"learning_rate": 6.479166666666668e-05,
"loss": 0.0267,
"mean_token_accuracy": 0.9882633149623871,
"num_tokens": 1392640.0,
"step": 170
},
{
"epoch": 1.125,
"grad_norm": 0.10291949659585953,
"learning_rate": 6.270833333333333e-05,
"loss": 0.0271,
"mean_token_accuracy": 0.9882144629955292,
"num_tokens": 1474560.0,
"step": 180
},
{
"epoch": 1.1875,
"grad_norm": 0.10503465682268143,
"learning_rate": 6.0624999999999996e-05,
"loss": 0.027,
"mean_token_accuracy": 0.988031268119812,
"num_tokens": 1556480.0,
"step": 190
},
{
"epoch": 1.25,
"grad_norm": 0.09680064767599106,
"learning_rate": 5.8541666666666676e-05,
"loss": 0.0268,
"mean_token_accuracy": 0.9883976578712463,
"num_tokens": 1638400.0,
"step": 200
},
{
"epoch": 1.3125,
"grad_norm": 0.09701237827539444,
"learning_rate": 5.6458333333333335e-05,
"loss": 0.0268,
"mean_token_accuracy": 0.9880923330783844,
"num_tokens": 1720320.0,
"step": 210
},
{
"epoch": 1.375,
"grad_norm": 0.09592857956886292,
"learning_rate": 5.4375e-05,
"loss": 0.027,
"mean_token_accuracy": 0.9881533980369568,
"num_tokens": 1802240.0,
"step": 220
},
{
"epoch": 1.4375,
"grad_norm": 0.09052903950214386,
"learning_rate": 5.229166666666667e-05,
"loss": 0.027,
"mean_token_accuracy": 0.9882999539375306,
"num_tokens": 1884160.0,
"step": 230
},
{
"epoch": 1.5,
"grad_norm": 0.1032903790473938,
"learning_rate": 5.020833333333333e-05,
"loss": 0.0268,
"mean_token_accuracy": 0.988312166929245,
"num_tokens": 1966080.0,
"step": 240
},
{
"epoch": 1.5625,
"grad_norm": 0.10449512302875519,
"learning_rate": 4.8125000000000004e-05,
"loss": 0.0266,
"mean_token_accuracy": 0.9881900370121002,
"num_tokens": 2048000.0,
"step": 250
},
{
"epoch": 1.625,
"grad_norm": 0.09428944438695908,
"learning_rate": 4.604166666666666e-05,
"loss": 0.0267,
"mean_token_accuracy": 0.9882511019706726,
"num_tokens": 2129920.0,
"step": 260
},
{
"epoch": 1.6875,
"grad_norm": 0.10462497174739838,
"learning_rate": 4.3958333333333336e-05,
"loss": 0.0266,
"mean_token_accuracy": 0.9879091382026672,
"num_tokens": 2211840.0,
"step": 270
},
{
"epoch": 1.75,
"grad_norm": 0.09638702869415283,
"learning_rate": 4.1875e-05,
"loss": 0.0266,
"mean_token_accuracy": 0.9883488059043884,
"num_tokens": 2293760.0,
"step": 280
},
{
"epoch": 1.8125,
"grad_norm": 0.10269024223089218,
"learning_rate": 3.979166666666667e-05,
"loss": 0.0265,
"mean_token_accuracy": 0.9881533980369568,
"num_tokens": 2375680.0,
"step": 290
},
{
"epoch": 1.875,
"grad_norm": 0.09432139992713928,
"learning_rate": 3.770833333333333e-05,
"loss": 0.0264,
"mean_token_accuracy": 0.9882633149623871,
"num_tokens": 2457600.0,
"step": 300
},
{
"epoch": 1.9375,
"grad_norm": 0.10591922700405121,
"learning_rate": 3.5625000000000005e-05,
"loss": 0.0265,
"mean_token_accuracy": 0.9878358602523803,
"num_tokens": 2539520.0,
"step": 310
},
{
"epoch": 2.0,
"grad_norm": 0.0988362580537796,
"learning_rate": 3.3541666666666664e-05,
"loss": 0.0264,
"mean_token_accuracy": 0.9880923330783844,
"num_tokens": 2621440.0,
"step": 320
},
{
"epoch": 2.0,
"eval_runtime": 10.7547,
"eval_samples_per_second": 14.877,
"eval_steps_per_second": 0.93,
"step": 320
},
{
"epoch": 2.0625,
"grad_norm": 0.08879899233579636,
"learning_rate": 3.145833333333334e-05,
"loss": 0.026,
"mean_token_accuracy": 0.9881656110286713,
"num_tokens": 2703360.0,
"step": 330
},
{
"epoch": 2.125,
"grad_norm": 0.09961431473493576,
"learning_rate": 2.9375000000000003e-05,
"loss": 0.0257,
"mean_token_accuracy": 0.9886052787303925,
"num_tokens": 2785280.0,
"step": 340
},
{
"epoch": 2.1875,
"grad_norm": 0.1122380793094635,
"learning_rate": 2.7291666666666665e-05,
"loss": 0.0261,
"mean_token_accuracy": 0.9886541306972504,
"num_tokens": 2867200.0,
"step": 350
},
{
"epoch": 2.25,
"grad_norm": 0.09964418411254883,
"learning_rate": 2.5208333333333334e-05,
"loss": 0.0255,
"mean_token_accuracy": 0.9889594554901123,
"num_tokens": 2949120.0,
"step": 360
},
{
"epoch": 2.3125,
"grad_norm": 0.09933824837207794,
"learning_rate": 2.3125000000000003e-05,
"loss": 0.0259,
"mean_token_accuracy": 0.9884220838546753,
"num_tokens": 3031040.0,
"step": 370
},
{
"epoch": 2.375,
"grad_norm": 0.09340930730104446,
"learning_rate": 2.104166666666667e-05,
"loss": 0.0262,
"mean_token_accuracy": 0.9882755279541016,
"num_tokens": 3112960.0,
"step": 380
},
{
"epoch": 2.4375,
"grad_norm": 0.09159277379512787,
"learning_rate": 1.8958333333333334e-05,
"loss": 0.0259,
"mean_token_accuracy": 0.9886297047138214,
"num_tokens": 3194880.0,
"step": 390
},
{
"epoch": 2.5,
"grad_norm": 0.10940947383642197,
"learning_rate": 1.6875000000000004e-05,
"loss": 0.0258,
"mean_token_accuracy": 0.9885075747966766,
"num_tokens": 3276800.0,
"step": 400
},
{
"epoch": 2.5625,
"grad_norm": 0.09535407274961472,
"learning_rate": 1.4791666666666668e-05,
"loss": 0.0259,
"mean_token_accuracy": 0.9886174917221069,
"num_tokens": 3358720.0,
"step": 410
},
{
"epoch": 2.625,
"grad_norm": 0.08938491344451904,
"learning_rate": 1.2708333333333333e-05,
"loss": 0.0257,
"mean_token_accuracy": 0.9884831488132477,
"num_tokens": 3440640.0,
"step": 420
},
{
"epoch": 2.6875,
"grad_norm": 0.09536239504814148,
"learning_rate": 1.0625e-05,
"loss": 0.0257,
"mean_token_accuracy": 0.9886052787303925,
"num_tokens": 3522560.0,
"step": 430
},
{
"epoch": 2.75,
"grad_norm": 0.0934009775519371,
"learning_rate": 8.541666666666666e-06,
"loss": 0.0257,
"mean_token_accuracy": 0.9886907696723938,
"num_tokens": 3604480.0,
"step": 440
},
{
"epoch": 2.8125,
"grad_norm": 0.09570059180259705,
"learning_rate": 6.458333333333334e-06,
"loss": 0.0255,
"mean_token_accuracy": 0.9888617515563964,
"num_tokens": 3686400.0,
"step": 450
},
{
"epoch": 2.875,
"grad_norm": 0.09678570926189423,
"learning_rate": 4.375e-06,
"loss": 0.0255,
"mean_token_accuracy": 0.988361018896103,
"num_tokens": 3768320.0,
"step": 460
},
{
"epoch": 2.9375,
"grad_norm": 0.0881657674908638,
"learning_rate": 2.2916666666666666e-06,
"loss": 0.0255,
"mean_token_accuracy": 0.9887884736061097,
"num_tokens": 3850240.0,
"step": 470
},
{
"epoch": 3.0,
"grad_norm": 0.10601094365119934,
"learning_rate": 2.0833333333333333e-07,
"loss": 0.0257,
"mean_token_accuracy": 0.9887518346309662,
"num_tokens": 3932160.0,
"step": 480
},
{
"epoch": 3.0,
"eval_runtime": 10.7566,
"eval_samples_per_second": 14.875,
"eval_steps_per_second": 0.93,
"step": 480
},
{
"epoch": 3.0,
"step": 480,
"total_flos": 0.0,
"train_loss": 0.07282223819444576,
"train_runtime": 407.0393,
"train_samples_per_second": 4.717,
"train_steps_per_second": 1.179
}
],
"logging_steps": 10,
"max_steps": 480,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}