{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9761904761904763,
"eval_steps": 50,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0992063492063492,
"grad_norm": 2.4192488193511963,
"learning_rate": 4.7619047619047615e-06,
"loss": 1.3237,
"step": 25
},
{
"epoch": 0.1984126984126984,
"grad_norm": 0.8920113444328308,
"learning_rate": 9.722222222222223e-06,
"loss": 0.6721,
"step": 50
},
{
"epoch": 0.1984126984126984,
"eval_loss": 0.48523738980293274,
"eval_runtime": 17.2583,
"eval_samples_per_second": 4.867,
"eval_steps_per_second": 2.434,
"step": 50
},
{
"epoch": 0.2976190476190476,
"grad_norm": 0.5771058797836304,
"learning_rate": 1.4682539682539683e-05,
"loss": 0.3509,
"step": 75
},
{
"epoch": 0.3968253968253968,
"grad_norm": 0.5407119393348694,
"learning_rate": 1.9642857142857145e-05,
"loss": 0.2735,
"step": 100
},
{
"epoch": 0.3968253968253968,
"eval_loss": 0.28276199102401733,
"eval_runtime": 17.2337,
"eval_samples_per_second": 4.874,
"eval_steps_per_second": 2.437,
"step": 100
},
{
"epoch": 0.49603174603174605,
"grad_norm": 0.496040016412735,
"learning_rate": 2.4603174603174602e-05,
"loss": 0.2463,
"step": 125
},
{
"epoch": 0.5952380952380952,
"grad_norm": 0.4996989667415619,
"learning_rate": 2.9563492063492066e-05,
"loss": 0.2279,
"step": 150
},
{
"epoch": 0.5952380952380952,
"eval_loss": 0.24540141224861145,
"eval_runtime": 17.2092,
"eval_samples_per_second": 4.881,
"eval_steps_per_second": 2.441,
"step": 150
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.5580428838729858,
"learning_rate": 3.4523809523809526e-05,
"loss": 0.214,
"step": 175
},
{
"epoch": 0.7936507936507936,
"grad_norm": 0.4554564356803894,
"learning_rate": 3.9484126984126986e-05,
"loss": 0.2046,
"step": 200
},
{
"epoch": 0.7936507936507936,
"eval_loss": 0.22907117009162903,
"eval_runtime": 17.3315,
"eval_samples_per_second": 4.847,
"eval_steps_per_second": 2.423,
"step": 200
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.5286790132522583,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.1943,
"step": 225
},
{
"epoch": 0.9920634920634921,
"grad_norm": 0.4137091040611267,
"learning_rate": 4.940476190476191e-05,
"loss": 0.1885,
"step": 250
},
{
"epoch": 0.9920634920634921,
"eval_loss": 0.20971575379371643,
"eval_runtime": 17.2857,
"eval_samples_per_second": 4.859,
"eval_steps_per_second": 2.43,
"step": 250
},
{
"epoch": 1.0912698412698412,
"grad_norm": 0.4036062955856323,
"learning_rate": 5.436507936507936e-05,
"loss": 0.1656,
"step": 275
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.43942078948020935,
"learning_rate": 5.932539682539683e-05,
"loss": 0.1668,
"step": 300
},
{
"epoch": 1.1904761904761905,
"eval_loss": 0.19860731065273285,
"eval_runtime": 17.4085,
"eval_samples_per_second": 4.825,
"eval_steps_per_second": 2.413,
"step": 300
},
{
"epoch": 1.2896825396825398,
"grad_norm": 0.38786229491233826,
"learning_rate": 6.428571428571429e-05,
"loss": 0.1575,
"step": 325
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.3493235409259796,
"learning_rate": 6.924603174603174e-05,
"loss": 0.1619,
"step": 350
},
{
"epoch": 1.3888888888888888,
"eval_loss": 0.19389225542545319,
"eval_runtime": 17.2223,
"eval_samples_per_second": 4.877,
"eval_steps_per_second": 2.439,
"step": 350
},
{
"epoch": 1.4880952380952381,
"grad_norm": 0.39436835050582886,
"learning_rate": 7.420634920634921e-05,
"loss": 0.1524,
"step": 375
},
{
"epoch": 1.5873015873015874,
"grad_norm": 0.3463725745677948,
"learning_rate": 7.916666666666666e-05,
"loss": 0.1482,
"step": 400
},
{
"epoch": 1.5873015873015874,
"eval_loss": 0.185992032289505,
"eval_runtime": 17.0685,
"eval_samples_per_second": 4.921,
"eval_steps_per_second": 2.461,
"step": 400
},
{
"epoch": 1.6865079365079365,
"grad_norm": 0.3713228702545166,
"learning_rate": 8.412698412698413e-05,
"loss": 0.149,
"step": 425
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.30970633029937744,
"learning_rate": 8.90873015873016e-05,
"loss": 0.1486,
"step": 450
},
{
"epoch": 1.7857142857142856,
"eval_loss": 0.17747902870178223,
"eval_runtime": 17.1226,
"eval_samples_per_second": 4.906,
"eval_steps_per_second": 2.453,
"step": 450
},
{
"epoch": 1.8849206349206349,
"grad_norm": 0.3172999918460846,
"learning_rate": 9.404761904761905e-05,
"loss": 0.147,
"step": 475
},
{
"epoch": 1.9841269841269842,
"grad_norm": 0.3906099498271942,
"learning_rate": 9.900793650793652e-05,
"loss": 0.1427,
"step": 500
},
{
"epoch": 1.9841269841269842,
"eval_loss": 0.17465326189994812,
"eval_runtime": 17.3304,
"eval_samples_per_second": 4.847,
"eval_steps_per_second": 2.423,
"step": 500
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.402475506067276,
"learning_rate": 9.999520325413887e-05,
"loss": 0.1208,
"step": 525
},
{
"epoch": 2.1825396825396823,
"grad_norm": 0.37534043192863464,
"learning_rate": 9.997571805142639e-05,
"loss": 0.1163,
"step": 550
},
{
"epoch": 2.1825396825396823,
"eval_loss": 0.1724633127450943,
"eval_runtime": 17.1218,
"eval_samples_per_second": 4.906,
"eval_steps_per_second": 2.453,
"step": 550
},
{
"epoch": 2.2817460317460316,
"grad_norm": 0.30866020917892456,
"learning_rate": 9.994125043229752e-05,
"loss": 0.1104,
"step": 575
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.3464547395706177,
"learning_rate": 9.989181072993494e-05,
"loss": 0.1111,
"step": 600
},
{
"epoch": 2.380952380952381,
"eval_loss": 0.16887226700782776,
"eval_runtime": 17.1039,
"eval_samples_per_second": 4.911,
"eval_steps_per_second": 2.456,
"step": 600
},
{
"epoch": 2.4801587301587302,
"grad_norm": 0.3208065927028656,
"learning_rate": 9.982741376606078e-05,
"loss": 0.1153,
"step": 625
},
{
"epoch": 2.5793650793650795,
"grad_norm": 0.2699253559112549,
"learning_rate": 9.97480788464933e-05,
"loss": 0.106,
"step": 650
},
{
"epoch": 2.5793650793650795,
"eval_loss": 0.16333948075771332,
"eval_runtime": 17.2826,
"eval_samples_per_second": 4.86,
"eval_steps_per_second": 2.43,
"step": 650
},
{
"epoch": 2.678571428571429,
"grad_norm": 0.2834033668041229,
"learning_rate": 9.965382975535902e-05,
"loss": 0.1121,
"step": 675
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.2606027126312256,
"learning_rate": 9.954469474796241e-05,
"loss": 0.1127,
"step": 700
},
{
"epoch": 2.7777777777777777,
"eval_loss": 0.1618286520242691,
"eval_runtime": 17.132,
"eval_samples_per_second": 4.903,
"eval_steps_per_second": 2.452,
"step": 700
},
{
"epoch": 2.876984126984127,
"grad_norm": 0.30125725269317627,
"learning_rate": 9.942070654231517e-05,
"loss": 0.1044,
"step": 725
},
{
"epoch": 2.9761904761904763,
"grad_norm": 0.29861846566200256,
"learning_rate": 9.928190230932746e-05,
"loss": 0.1136,
"step": 750
},
{
"epoch": 2.9761904761904763,
"eval_loss": 0.1563536822795868,
"eval_runtime": 17.1692,
"eval_samples_per_second": 4.892,
"eval_steps_per_second": 2.446,
"step": 750
}
],
"logging_steps": 25,
"max_steps": 5040,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.9601356198961e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}