7do5jtsy / checkpoint-500 /trainer_state.json
roonbug's picture
Upload folder using huggingface_hub
298773e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8901746967842439,
"eval_steps": 100,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.8410565290600062,
"epoch": 0.017803493935684877,
"grad_norm": 41.25,
"learning_rate": 1.8e-07,
"loss": 14.231,
"mean_token_accuracy": 0.766436281055212,
"num_tokens": 257189.0,
"step": 10
},
{
"entropy": 0.8121558612212538,
"epoch": 0.035606987871369754,
"grad_norm": 43.25,
"learning_rate": 3.8e-07,
"loss": 13.7028,
"mean_token_accuracy": 0.7704860582947731,
"num_tokens": 528285.0,
"step": 20
},
{
"entropy": 0.8321617901325226,
"epoch": 0.053410481807054634,
"grad_norm": 38.75,
"learning_rate": 5.800000000000001e-07,
"loss": 14.0203,
"mean_token_accuracy": 0.7678851690143347,
"num_tokens": 801635.0,
"step": 30
},
{
"entropy": 0.8524315822869539,
"epoch": 0.07121397574273951,
"grad_norm": 37.75,
"learning_rate": 7.8e-07,
"loss": 14.3071,
"mean_token_accuracy": 0.7648834150284529,
"num_tokens": 1060812.0,
"step": 40
},
{
"entropy": 0.8403167355805635,
"epoch": 0.08901746967842439,
"grad_norm": 34.5,
"learning_rate": 9.800000000000001e-07,
"loss": 14.0049,
"mean_token_accuracy": 0.7667405389249324,
"num_tokens": 1327380.0,
"step": 50
},
{
"entropy": 0.8425804980099201,
"epoch": 0.10682096361410927,
"grad_norm": 34.25,
"learning_rate": 1.1800000000000001e-06,
"loss": 13.9802,
"mean_token_accuracy": 0.7680465068668128,
"num_tokens": 1597405.0,
"step": 60
},
{
"entropy": 0.8755916632711888,
"epoch": 0.12462445754979415,
"grad_norm": 32.25,
"learning_rate": 1.3800000000000001e-06,
"loss": 14.4557,
"mean_token_accuracy": 0.7614609662443399,
"num_tokens": 1859684.0,
"step": 70
},
{
"entropy": 0.8682003870606423,
"epoch": 0.14242795148547902,
"grad_norm": 30.375,
"learning_rate": 1.5800000000000001e-06,
"loss": 14.1174,
"mean_token_accuracy": 0.7653359699994325,
"num_tokens": 2118068.0,
"step": 80
},
{
"entropy": 0.8360268581658602,
"epoch": 0.1602314454211639,
"grad_norm": 28.375,
"learning_rate": 1.7800000000000001e-06,
"loss": 13.6773,
"mean_token_accuracy": 0.7724899325519801,
"num_tokens": 2388824.0,
"step": 90
},
{
"entropy": 0.8634540975093842,
"epoch": 0.17803493935684878,
"grad_norm": 34.0,
"learning_rate": 1.98e-06,
"loss": 14.0689,
"mean_token_accuracy": 0.7672080259770155,
"num_tokens": 2644330.0,
"step": 100
},
{
"epoch": 0.17803493935684878,
"eval_biology_entropy": 1.128741333961487,
"eval_biology_loss": 1.1601769924163818,
"eval_biology_mean_token_accuracy": 0.7042496418952942,
"eval_biology_num_tokens": 2644330.0,
"eval_biology_runtime": 54.4222,
"eval_biology_samples_per_second": 9.187,
"eval_biology_steps_per_second": 2.297,
"step": 100
},
{
"epoch": 0.17803493935684878,
"eval_chemistry_entropy": 0.8544474849700928,
"eval_chemistry_loss": 0.8545747995376587,
"eval_chemistry_mean_token_accuracy": 0.7713311352729797,
"eval_chemistry_num_tokens": 2644330.0,
"eval_chemistry_runtime": 65.3258,
"eval_chemistry_samples_per_second": 7.654,
"eval_chemistry_steps_per_second": 1.913,
"step": 100
},
{
"entropy": 0.8453936260193586,
"epoch": 0.19583843329253367,
"grad_norm": 29.5,
"learning_rate": 2.1800000000000003e-06,
"loss": 13.6681,
"mean_token_accuracy": 0.7730784911662341,
"num_tokens": 2913700.0,
"step": 110
},
{
"entropy": 0.8460511896759272,
"epoch": 0.21364192722821854,
"grad_norm": 27.625,
"learning_rate": 2.38e-06,
"loss": 13.728,
"mean_token_accuracy": 0.7713178683072328,
"num_tokens": 3185255.0,
"step": 120
},
{
"entropy": 0.8403830077499151,
"epoch": 0.2314454211639034,
"grad_norm": 28.25,
"learning_rate": 2.5800000000000003e-06,
"loss": 13.5642,
"mean_token_accuracy": 0.7730850588530302,
"num_tokens": 3454750.0,
"step": 130
},
{
"entropy": 0.8515023712068797,
"epoch": 0.2492489150995883,
"grad_norm": 30.0,
"learning_rate": 2.7800000000000005e-06,
"loss": 13.8032,
"mean_token_accuracy": 0.7721988521516323,
"num_tokens": 3719113.0,
"step": 140
},
{
"entropy": 0.8474330805242062,
"epoch": 0.26705240903527316,
"grad_norm": 27.375,
"learning_rate": 2.9800000000000003e-06,
"loss": 13.728,
"mean_token_accuracy": 0.7698215767741203,
"num_tokens": 3990505.0,
"step": 150
},
{
"entropy": 0.8292176539078355,
"epoch": 0.28485590297095803,
"grad_norm": 24.625,
"learning_rate": 3.1800000000000005e-06,
"loss": 13.3391,
"mean_token_accuracy": 0.776393149420619,
"num_tokens": 4267403.0,
"step": 160
},
{
"entropy": 0.8280835278332234,
"epoch": 0.30265939690664295,
"grad_norm": 29.875,
"learning_rate": 3.3800000000000007e-06,
"loss": 13.3175,
"mean_token_accuracy": 0.7770637154579163,
"num_tokens": 4535458.0,
"step": 170
},
{
"entropy": 0.842758315615356,
"epoch": 0.3204628908423278,
"grad_norm": 27.5,
"learning_rate": 3.58e-06,
"loss": 13.6062,
"mean_token_accuracy": 0.7731371156871318,
"num_tokens": 4796815.0,
"step": 180
},
{
"entropy": 0.8247860476374627,
"epoch": 0.3382663847780127,
"grad_norm": 22.75,
"learning_rate": 3.7800000000000002e-06,
"loss": 13.2445,
"mean_token_accuracy": 0.7772165313363075,
"num_tokens": 5066948.0,
"step": 190
},
{
"entropy": 0.8495354067534209,
"epoch": 0.35606987871369755,
"grad_norm": 26.125,
"learning_rate": 3.980000000000001e-06,
"loss": 13.7137,
"mean_token_accuracy": 0.7696468211710453,
"num_tokens": 5324751.0,
"step": 200
},
{
"epoch": 0.35606987871369755,
"eval_biology_entropy": 1.1391102423667907,
"eval_biology_loss": 1.1645851135253906,
"eval_biology_mean_token_accuracy": 0.7036963219642639,
"eval_biology_num_tokens": 5324751.0,
"eval_biology_runtime": 47.5522,
"eval_biology_samples_per_second": 10.515,
"eval_biology_steps_per_second": 2.629,
"step": 200
},
{
"epoch": 0.35606987871369755,
"eval_chemistry_entropy": 0.8357059621810913,
"eval_chemistry_loss": 0.8271888494491577,
"eval_chemistry_mean_token_accuracy": 0.7765080814361572,
"eval_chemistry_num_tokens": 5324751.0,
"eval_chemistry_runtime": 58.2133,
"eval_chemistry_samples_per_second": 8.589,
"eval_chemistry_steps_per_second": 2.147,
"step": 200
},
{
"entropy": 0.8138596788048744,
"epoch": 0.3738733726493824,
"grad_norm": 27.875,
"learning_rate": 4.18e-06,
"loss": 13.0337,
"mean_token_accuracy": 0.7797718059271574,
"num_tokens": 5585508.0,
"step": 210
},
{
"entropy": 0.8215312957763672,
"epoch": 0.39167686658506734,
"grad_norm": 25.75,
"learning_rate": 4.38e-06,
"loss": 13.2224,
"mean_token_accuracy": 0.7784637857228518,
"num_tokens": 5848889.0,
"step": 220
},
{
"entropy": 0.8100055737420917,
"epoch": 0.4094803605207522,
"grad_norm": 27.625,
"learning_rate": 4.58e-06,
"loss": 13.0714,
"mean_token_accuracy": 0.7809950839728117,
"num_tokens": 6114855.0,
"step": 230
},
{
"entropy": 0.8117449183017016,
"epoch": 0.4272838544564371,
"grad_norm": 25.875,
"learning_rate": 4.78e-06,
"loss": 13.0914,
"mean_token_accuracy": 0.7797230206429958,
"num_tokens": 6378152.0,
"step": 240
},
{
"entropy": 0.8300687098875642,
"epoch": 0.44508734839212194,
"grad_norm": 24.5,
"learning_rate": 4.980000000000001e-06,
"loss": 13.3199,
"mean_token_accuracy": 0.7753712415695191,
"num_tokens": 6637273.0,
"step": 250
},
{
"entropy": 0.8000802919268608,
"epoch": 0.4628908423278068,
"grad_norm": 23.75,
"learning_rate": 5.18e-06,
"loss": 12.8911,
"mean_token_accuracy": 0.7822641927748919,
"num_tokens": 6896684.0,
"step": 260
},
{
"entropy": 0.8173866732046008,
"epoch": 0.48069433626349173,
"grad_norm": 25.75,
"learning_rate": 5.380000000000001e-06,
"loss": 13.1839,
"mean_token_accuracy": 0.7791078709065914,
"num_tokens": 7166608.0,
"step": 270
},
{
"entropy": 0.7851757485419512,
"epoch": 0.4984978301991766,
"grad_norm": 23.125,
"learning_rate": 5.580000000000001e-06,
"loss": 12.5647,
"mean_token_accuracy": 0.7872234936803579,
"num_tokens": 7444923.0,
"step": 280
},
{
"entropy": 0.8196133345365524,
"epoch": 0.5163013241348615,
"grad_norm": 23.75,
"learning_rate": 5.78e-06,
"loss": 13.195,
"mean_token_accuracy": 0.7780409008264542,
"num_tokens": 7706502.0,
"step": 290
},
{
"entropy": 0.8108824253082275,
"epoch": 0.5341048180705463,
"grad_norm": 25.75,
"learning_rate": 5.98e-06,
"loss": 13.1243,
"mean_token_accuracy": 0.7791608296334743,
"num_tokens": 7969704.0,
"step": 300
},
{
"epoch": 0.5341048180705463,
"eval_biology_entropy": 1.1140506463050843,
"eval_biology_loss": 1.1709669828414917,
"eval_biology_mean_token_accuracy": 0.7033902740478516,
"eval_biology_num_tokens": 7969704.0,
"eval_biology_runtime": 47.6418,
"eval_biology_samples_per_second": 10.495,
"eval_biology_steps_per_second": 2.624,
"step": 300
},
{
"epoch": 0.5341048180705463,
"eval_chemistry_entropy": 0.7940924577713012,
"eval_chemistry_loss": 0.8012509942054749,
"eval_chemistry_mean_token_accuracy": 0.7820386853218079,
"eval_chemistry_num_tokens": 7969704.0,
"eval_chemistry_runtime": 59.4475,
"eval_chemistry_samples_per_second": 8.411,
"eval_chemistry_steps_per_second": 2.103,
"step": 300
},
{
"entropy": 0.7919084688648581,
"epoch": 0.5519083120062312,
"grad_norm": 24.375,
"learning_rate": 6.18e-06,
"loss": 12.7303,
"mean_token_accuracy": 0.7834540419280529,
"num_tokens": 8242162.0,
"step": 310
},
{
"entropy": 0.8003328915685415,
"epoch": 0.5697118059419161,
"grad_norm": 22.875,
"learning_rate": 6.380000000000001e-06,
"loss": 12.8333,
"mean_token_accuracy": 0.7824427511543035,
"num_tokens": 8497852.0,
"step": 320
},
{
"entropy": 0.7982260027900339,
"epoch": 0.587515299877601,
"grad_norm": 28.0,
"learning_rate": 6.5800000000000005e-06,
"loss": 12.8827,
"mean_token_accuracy": 0.7826927099376917,
"num_tokens": 8757753.0,
"step": 330
},
{
"entropy": 0.7888958260416985,
"epoch": 0.6053187938132859,
"grad_norm": 25.125,
"learning_rate": 6.780000000000001e-06,
"loss": 12.7464,
"mean_token_accuracy": 0.7863177515566349,
"num_tokens": 9024677.0,
"step": 340
},
{
"entropy": 0.7913781819865108,
"epoch": 0.6231222877489707,
"grad_norm": 26.0,
"learning_rate": 6.98e-06,
"loss": 12.7471,
"mean_token_accuracy": 0.7842035111039877,
"num_tokens": 9291760.0,
"step": 350
},
{
"entropy": 0.7765169985592365,
"epoch": 0.6409257816846556,
"grad_norm": 22.0,
"learning_rate": 7.180000000000001e-06,
"loss": 12.5378,
"mean_token_accuracy": 0.7859202962368727,
"num_tokens": 9561091.0,
"step": 360
},
{
"entropy": 0.7792716162279248,
"epoch": 0.6587292756203404,
"grad_norm": 25.0,
"learning_rate": 7.3800000000000005e-06,
"loss": 12.4956,
"mean_token_accuracy": 0.7880564954131841,
"num_tokens": 9827272.0,
"step": 370
},
{
"entropy": 0.758336128294468,
"epoch": 0.6765327695560254,
"grad_norm": 25.0,
"learning_rate": 7.58e-06,
"loss": 12.1732,
"mean_token_accuracy": 0.7918921418488025,
"num_tokens": 10096065.0,
"step": 380
},
{
"entropy": 0.7533971995115281,
"epoch": 0.6943362634917103,
"grad_norm": 24.375,
"learning_rate": 7.78e-06,
"loss": 12.1132,
"mean_token_accuracy": 0.7922064792364836,
"num_tokens": 10364601.0,
"step": 390
},
{
"entropy": 0.7620638139545918,
"epoch": 0.7121397574273951,
"grad_norm": 23.625,
"learning_rate": 7.980000000000002e-06,
"loss": 12.3379,
"mean_token_accuracy": 0.7897147350013256,
"num_tokens": 10633325.0,
"step": 400
},
{
"epoch": 0.7121397574273951,
"eval_biology_entropy": 1.1209784712791442,
"eval_biology_loss": 1.175487995147705,
"eval_biology_mean_token_accuracy": 0.7023502192497253,
"eval_biology_num_tokens": 10633325.0,
"eval_biology_runtime": 45.7349,
"eval_biology_samples_per_second": 10.933,
"eval_biology_steps_per_second": 2.733,
"step": 400
},
{
"epoch": 0.7121397574273951,
"eval_chemistry_entropy": 0.7846209690570831,
"eval_chemistry_loss": 0.7767007946968079,
"eval_chemistry_mean_token_accuracy": 0.7876848134994506,
"eval_chemistry_num_tokens": 10633325.0,
"eval_chemistry_runtime": 56.279,
"eval_chemistry_samples_per_second": 8.884,
"eval_chemistry_steps_per_second": 2.221,
"step": 400
},
{
"entropy": 0.7557655736804009,
"epoch": 0.72994325136308,
"grad_norm": 25.0,
"learning_rate": 8.18e-06,
"loss": 12.1754,
"mean_token_accuracy": 0.7914514016360045,
"num_tokens": 10897916.0,
"step": 410
},
{
"entropy": 0.7626162808388471,
"epoch": 0.7477467452987648,
"grad_norm": 22.375,
"learning_rate": 8.380000000000001e-06,
"loss": 12.1991,
"mean_token_accuracy": 0.7916438620537519,
"num_tokens": 11165356.0,
"step": 420
},
{
"entropy": 0.7673018729314208,
"epoch": 0.7655502392344498,
"grad_norm": 24.375,
"learning_rate": 8.580000000000001e-06,
"loss": 12.3987,
"mean_token_accuracy": 0.7888187035918236,
"num_tokens": 11436799.0,
"step": 430
},
{
"entropy": 0.7744929634034634,
"epoch": 0.7833537331701347,
"grad_norm": 24.125,
"learning_rate": 8.78e-06,
"loss": 12.4708,
"mean_token_accuracy": 0.787474300712347,
"num_tokens": 11703496.0,
"step": 440
},
{
"entropy": 0.7655632747337222,
"epoch": 0.8011572271058195,
"grad_norm": 24.875,
"learning_rate": 8.98e-06,
"loss": 12.299,
"mean_token_accuracy": 0.7900604665279388,
"num_tokens": 11965530.0,
"step": 450
},
{
"entropy": 0.7661685338243842,
"epoch": 0.8189607210415044,
"grad_norm": 23.0,
"learning_rate": 9.180000000000002e-06,
"loss": 12.3101,
"mean_token_accuracy": 0.7891276117414237,
"num_tokens": 12224427.0,
"step": 460
},
{
"entropy": 0.7463042287155985,
"epoch": 0.8367642149771892,
"grad_norm": 24.125,
"learning_rate": 9.38e-06,
"loss": 12.0346,
"mean_token_accuracy": 0.7945169288665056,
"num_tokens": 12509124.0,
"step": 470
},
{
"entropy": 0.753023486584425,
"epoch": 0.8545677089128741,
"grad_norm": 21.25,
"learning_rate": 9.58e-06,
"loss": 12.0236,
"mean_token_accuracy": 0.7931863989681005,
"num_tokens": 12778408.0,
"step": 480
},
{
"entropy": 0.7310503415763379,
"epoch": 0.8723712028485591,
"grad_norm": 22.25,
"learning_rate": 9.780000000000001e-06,
"loss": 11.8094,
"mean_token_accuracy": 0.7964685469865799,
"num_tokens": 13046473.0,
"step": 490
},
{
"entropy": 0.7645759535953403,
"epoch": 0.8901746967842439,
"grad_norm": 22.375,
"learning_rate": 9.980000000000001e-06,
"loss": 12.2645,
"mean_token_accuracy": 0.7906601417809724,
"num_tokens": 13301659.0,
"step": 500
},
{
"epoch": 0.8901746967842439,
"eval_biology_entropy": 1.092752426624298,
"eval_biology_loss": 1.1775906085968018,
"eval_biology_mean_token_accuracy": 0.7025688862800599,
"eval_biology_num_tokens": 13301659.0,
"eval_biology_runtime": 268.8833,
"eval_biology_samples_per_second": 1.86,
"eval_biology_steps_per_second": 0.465,
"step": 500
},
{
"epoch": 0.8901746967842439,
"eval_chemistry_entropy": 0.7384080934524536,
"eval_chemistry_loss": 0.7555699944496155,
"eval_chemistry_mean_token_accuracy": 0.7920897974967956,
"eval_chemistry_num_tokens": 13301659.0,
"eval_chemistry_runtime": 450.4029,
"eval_chemistry_samples_per_second": 1.11,
"eval_chemistry_steps_per_second": 0.278,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 18,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3874307456966482e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}