dwhite2003's picture
Upload folder using huggingface_hub
cc24a4a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 375,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.7578534603118896,
"epoch": 0.08,
"grad_norm": 0.0264892578125,
"learning_rate": 1.3500000000000001e-05,
"loss": 1.0626055717468261,
"mean_token_accuracy": 0.7818616509437561,
"num_tokens": 145593.0,
"step": 10
},
{
"entropy": 0.8365898035466671,
"epoch": 0.16,
"grad_norm": 0.0245361328125,
"learning_rate": 1.4977508774437613e-05,
"loss": 1.0916013717651367,
"mean_token_accuracy": 0.774065163731575,
"num_tokens": 275483.0,
"step": 20
},
{
"entropy": 0.8604681774973869,
"epoch": 0.24,
"grad_norm": 0.0216064453125,
"learning_rate": 1.4899934496687427e-05,
"loss": 0.9568503379821778,
"mean_token_accuracy": 0.7914068281650544,
"num_tokens": 412016.0,
"step": 30
},
{
"entropy": 0.9091467261314392,
"epoch": 0.32,
"grad_norm": 0.0242919921875,
"learning_rate": 1.4767573692933046e-05,
"loss": 0.896739387512207,
"mean_token_accuracy": 0.7991590544581413,
"num_tokens": 546070.0,
"step": 40
},
{
"entropy": 0.8796953782439232,
"epoch": 0.4,
"grad_norm": 0.026123046875,
"learning_rate": 1.4581406316256497e-05,
"loss": 0.8293215751647949,
"mean_token_accuracy": 0.8040232986211777,
"num_tokens": 671836.0,
"step": 50
},
{
"entropy": 0.7109395630657673,
"epoch": 0.48,
"grad_norm": 0.0257568359375,
"learning_rate": 1.4342810684780456e-05,
"loss": 0.6653613567352294,
"mean_token_accuracy": 0.8423441141843796,
"num_tokens": 808474.0,
"step": 60
},
{
"entropy": 0.595400421321392,
"epoch": 0.56,
"grad_norm": 0.038818359375,
"learning_rate": 1.4053553277083107e-05,
"loss": 0.5540878295898437,
"mean_token_accuracy": 0.8688836485147476,
"num_tokens": 935307.0,
"step": 70
},
{
"entropy": 0.4914353840053082,
"epoch": 0.64,
"grad_norm": 0.033447265625,
"learning_rate": 1.3715775653808777e-05,
"loss": 0.464099645614624,
"mean_token_accuracy": 0.889048607647419,
"num_tokens": 1064855.0,
"step": 80
},
{
"entropy": 0.46343609765172006,
"epoch": 0.72,
"grad_norm": 0.038818359375,
"learning_rate": 1.3331978602302202e-05,
"loss": 0.441998291015625,
"mean_token_accuracy": 0.8911783829331398,
"num_tokens": 1190651.0,
"step": 90
},
{
"entropy": 0.4148942559957504,
"epoch": 0.8,
"grad_norm": 0.052001953125,
"learning_rate": 1.2905003621653957e-05,
"loss": 0.3941481590270996,
"mean_token_accuracy": 0.9037791520357132,
"num_tokens": 1326261.0,
"step": 100
},
{
"entropy": 0.3749663054943085,
"epoch": 0.88,
"grad_norm": 0.0341796875,
"learning_rate": 1.2438011885235367e-05,
"loss": 0.3582408666610718,
"mean_token_accuracy": 0.9127597466111184,
"num_tokens": 1459975.0,
"step": 110
},
{
"entropy": 0.3347546439617872,
"epoch": 0.96,
"grad_norm": 0.037109375,
"learning_rate": 1.1934460836476838e-05,
"loss": 0.31285719871520995,
"mean_token_accuracy": 0.9229100957512856,
"num_tokens": 1595413.0,
"step": 120
},
{
"entropy": 0.3288331624120474,
"epoch": 1.04,
"grad_norm": 0.03515625,
"learning_rate": 1.139807859116637e-05,
"loss": 0.31476891040802,
"mean_token_accuracy": 0.9243608936667442,
"num_tokens": 1722937.0,
"step": 130
},
{
"entropy": 0.287095432728529,
"epoch": 1.12,
"grad_norm": 0.032470703125,
"learning_rate": 1.0832836335784602e-05,
"loss": 0.272609543800354,
"mean_token_accuracy": 0.9351037934422493,
"num_tokens": 1851236.0,
"step": 140
},
{
"entropy": 0.2753055978566408,
"epoch": 1.2,
"grad_norm": 0.0216064453125,
"learning_rate": 1.024291892622952e-05,
"loss": 0.2653245210647583,
"mean_token_accuracy": 0.9374801725149154,
"num_tokens": 1991297.0,
"step": 150
},
{
"entropy": 0.2545565586537123,
"epoch": 1.28,
"grad_norm": 0.033203125,
"learning_rate": 9.63269390460753e-06,
"loss": 0.24248099327087402,
"mean_token_accuracy": 0.943722878396511,
"num_tokens": 2116751.0,
"step": 160
},
{
"entropy": 0.21991128847002983,
"epoch": 1.3599999999999999,
"grad_norm": 0.0341796875,
"learning_rate": 9.006679163479767e-06,
"loss": 0.20926618576049805,
"mean_token_accuracy": 0.9520752727985382,
"num_tokens": 2253965.0,
"step": 170
},
{
"entropy": 0.21300265919417144,
"epoch": 1.44,
"grad_norm": 0.0284423828125,
"learning_rate": 8.369509496966254e-06,
"loss": 0.201596999168396,
"mean_token_accuracy": 0.9541342169046402,
"num_tokens": 2387688.0,
"step": 180
},
{
"entropy": 0.1999088702723384,
"epoch": 1.52,
"grad_norm": 0.0274658203125,
"learning_rate": 7.725902286351813e-06,
"loss": 0.18661935329437257,
"mean_token_accuracy": 0.9588151663541794,
"num_tokens": 2513947.0,
"step": 190
},
{
"entropy": 0.21373646408319474,
"epoch": 1.6,
"grad_norm": 0.0257568359375,
"learning_rate": 7.0806225742454765e-06,
"loss": 0.20502221584320068,
"mean_token_accuracy": 0.9535915687680244,
"num_tokens": 2641508.0,
"step": 200
},
{
"entropy": 0.19736929275095463,
"epoch": 1.6800000000000002,
"grad_norm": 0.0257568359375,
"learning_rate": 6.438447785872176e-06,
"loss": 0.18847702741622924,
"mean_token_accuracy": 0.9587804660201072,
"num_tokens": 2783740.0,
"step": 210
},
{
"entropy": 0.19301388934254646,
"epoch": 1.76,
"grad_norm": 0.0302734375,
"learning_rate": 5.804132358687839e-06,
"loss": 0.18001898527145385,
"mean_token_accuracy": 0.9596618011593818,
"num_tokens": 2912164.0,
"step": 220
},
{
"entropy": 0.16831486914306878,
"epoch": 1.8399999999999999,
"grad_norm": 0.020751953125,
"learning_rate": 5.182372542187895e-06,
"loss": 0.15890954732894896,
"mean_token_accuracy": 0.9644858300685882,
"num_tokens": 3049526.0,
"step": 230
},
{
"entropy": 0.1791680719703436,
"epoch": 1.92,
"grad_norm": 0.022705078125,
"learning_rate": 4.577771628519091e-06,
"loss": 0.16924891471862794,
"mean_token_accuracy": 0.9632360026240349,
"num_tokens": 3179372.0,
"step": 240
},
{
"entropy": 0.17665232978761197,
"epoch": 2.0,
"grad_norm": 0.0184326171875,
"learning_rate": 3.9948058713149845e-06,
"loss": 0.16852205991744995,
"mean_token_accuracy": 0.9628902286291122,
"num_tokens": 3318608.0,
"step": 250
},
{
"entropy": 0.17881185598671437,
"epoch": 2.08,
"grad_norm": 0.0264892578125,
"learning_rate": 3.4377913450801405e-06,
"loss": 0.17007871866226196,
"mean_token_accuracy": 0.963550227880478,
"num_tokens": 3453713.0,
"step": 260
},
{
"entropy": 0.18688333593308926,
"epoch": 2.16,
"grad_norm": 0.029052734375,
"learning_rate": 2.9108519904845677e-06,
"loss": 0.1791600227355957,
"mean_token_accuracy": 0.961190114915371,
"num_tokens": 3577609.0,
"step": 270
},
{
"entropy": 0.17757816668599843,
"epoch": 2.24,
"grad_norm": 0.01953125,
"learning_rate": 2.417889082149834e-06,
"loss": 0.17080665826797486,
"mean_token_accuracy": 0.9637410417199135,
"num_tokens": 3707617.0,
"step": 280
},
{
"entropy": 0.17086777035146952,
"epoch": 2.32,
"grad_norm": 0.0172119140625,
"learning_rate": 1.962552344976702e-06,
"loss": 0.1627667188644409,
"mean_token_accuracy": 0.9644756108522415,
"num_tokens": 3837778.0,
"step": 290
},
{
"entropy": 0.16370586045086383,
"epoch": 2.4,
"grad_norm": 0.02294921875,
"learning_rate": 1.5482129328588954e-06,
"loss": 0.15820531845092772,
"mean_token_accuracy": 0.9661343678832054,
"num_tokens": 3972183.0,
"step": 300
},
{
"entropy": 0.17486946266144515,
"epoch": 2.48,
"grad_norm": 0.0264892578125,
"learning_rate": 1.1779384698391296e-06,
"loss": 0.16707814931869508,
"mean_token_accuracy": 0.9641691878437996,
"num_tokens": 4102229.0,
"step": 310
},
{
"entropy": 0.17446570619940757,
"epoch": 2.56,
"grad_norm": 0.017333984375,
"learning_rate": 8.544703384939661e-07,
"loss": 0.1683019757270813,
"mean_token_accuracy": 0.9637968197464943,
"num_tokens": 4235775.0,
"step": 320
},
{
"entropy": 0.1773039098829031,
"epoch": 2.64,
"grad_norm": 0.0206298828125,
"learning_rate": 5.802033836963438e-07,
"loss": 0.1716697096824646,
"mean_token_accuracy": 0.9628803566098213,
"num_tokens": 4369651.0,
"step": 330
},
{
"entropy": 0.17304655965417623,
"epoch": 2.7199999999999998,
"grad_norm": 0.0233154296875,
"learning_rate": 3.571681820220196e-07,
"loss": 0.16543103456497193,
"mean_token_accuracy": 0.9639842137694359,
"num_tokens": 4504684.0,
"step": 340
},
{
"entropy": 0.17868599500507115,
"epoch": 2.8,
"grad_norm": 0.0201416015625,
"learning_rate": 1.870160080710398e-07,
"loss": 0.17553088665008545,
"mean_token_accuracy": 0.9631485670804978,
"num_tokens": 4633399.0,
"step": 350
},
{
"entropy": 0.16202218551188707,
"epoch": 2.88,
"grad_norm": 0.0162353515625,
"learning_rate": 7.100660900832945e-08,
"loss": 0.15266696214675904,
"mean_token_accuracy": 0.9657531261444092,
"num_tokens": 4780905.0,
"step": 360
},
{
"entropy": 0.17599052861332892,
"epoch": 2.96,
"grad_norm": 0.0206298828125,
"learning_rate": 9.998877836430998e-09,
"loss": 0.16831759214401246,
"mean_token_accuracy": 0.9632442593574524,
"num_tokens": 4916029.0,
"step": 370
}
],
"logging_steps": 10,
"max_steps": 375,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1006213706296852e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}