| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 375, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.7578534603118896, |
| "epoch": 0.08, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 1.3500000000000001e-05, |
| "loss": 1.0626055717468261, |
| "mean_token_accuracy": 0.7818616509437561, |
| "num_tokens": 145593.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.8365898035466671, |
| "epoch": 0.16, |
| "grad_norm": 0.0245361328125, |
| "learning_rate": 1.4977508774437613e-05, |
| "loss": 1.0916013717651367, |
| "mean_token_accuracy": 0.774065163731575, |
| "num_tokens": 275483.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.8604681774973869, |
| "epoch": 0.24, |
| "grad_norm": 0.0216064453125, |
| "learning_rate": 1.4899934496687427e-05, |
| "loss": 0.9568503379821778, |
| "mean_token_accuracy": 0.7914068281650544, |
| "num_tokens": 412016.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.9091467261314392, |
| "epoch": 0.32, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 1.4767573692933046e-05, |
| "loss": 0.896739387512207, |
| "mean_token_accuracy": 0.7991590544581413, |
| "num_tokens": 546070.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8796953782439232, |
| "epoch": 0.4, |
| "grad_norm": 0.026123046875, |
| "learning_rate": 1.4581406316256497e-05, |
| "loss": 0.8293215751647949, |
| "mean_token_accuracy": 0.8040232986211777, |
| "num_tokens": 671836.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.7109395630657673, |
| "epoch": 0.48, |
| "grad_norm": 0.0257568359375, |
| "learning_rate": 1.4342810684780456e-05, |
| "loss": 0.6653613567352294, |
| "mean_token_accuracy": 0.8423441141843796, |
| "num_tokens": 808474.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.595400421321392, |
| "epoch": 0.56, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 1.4053553277083107e-05, |
| "loss": 0.5540878295898437, |
| "mean_token_accuracy": 0.8688836485147476, |
| "num_tokens": 935307.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.4914353840053082, |
| "epoch": 0.64, |
| "grad_norm": 0.033447265625, |
| "learning_rate": 1.3715775653808777e-05, |
| "loss": 0.464099645614624, |
| "mean_token_accuracy": 0.889048607647419, |
| "num_tokens": 1064855.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.46343609765172006, |
| "epoch": 0.72, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 1.3331978602302202e-05, |
| "loss": 0.441998291015625, |
| "mean_token_accuracy": 0.8911783829331398, |
| "num_tokens": 1190651.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.4148942559957504, |
| "epoch": 0.8, |
| "grad_norm": 0.052001953125, |
| "learning_rate": 1.2905003621653957e-05, |
| "loss": 0.3941481590270996, |
| "mean_token_accuracy": 0.9037791520357132, |
| "num_tokens": 1326261.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.3749663054943085, |
| "epoch": 0.88, |
| "grad_norm": 0.0341796875, |
| "learning_rate": 1.2438011885235367e-05, |
| "loss": 0.3582408666610718, |
| "mean_token_accuracy": 0.9127597466111184, |
| "num_tokens": 1459975.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.3347546439617872, |
| "epoch": 0.96, |
| "grad_norm": 0.037109375, |
| "learning_rate": 1.1934460836476838e-05, |
| "loss": 0.31285719871520995, |
| "mean_token_accuracy": 0.9229100957512856, |
| "num_tokens": 1595413.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.3288331624120474, |
| "epoch": 1.04, |
| "grad_norm": 0.03515625, |
| "learning_rate": 1.139807859116637e-05, |
| "loss": 0.31476891040802, |
| "mean_token_accuracy": 0.9243608936667442, |
| "num_tokens": 1722937.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.287095432728529, |
| "epoch": 1.12, |
| "grad_norm": 0.032470703125, |
| "learning_rate": 1.0832836335784602e-05, |
| "loss": 0.272609543800354, |
| "mean_token_accuracy": 0.9351037934422493, |
| "num_tokens": 1851236.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.2753055978566408, |
| "epoch": 1.2, |
| "grad_norm": 0.0216064453125, |
| "learning_rate": 1.024291892622952e-05, |
| "loss": 0.2653245210647583, |
| "mean_token_accuracy": 0.9374801725149154, |
| "num_tokens": 1991297.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.2545565586537123, |
| "epoch": 1.28, |
| "grad_norm": 0.033203125, |
| "learning_rate": 9.63269390460753e-06, |
| "loss": 0.24248099327087402, |
| "mean_token_accuracy": 0.943722878396511, |
| "num_tokens": 2116751.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.21991128847002983, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.0341796875, |
| "learning_rate": 9.006679163479767e-06, |
| "loss": 0.20926618576049805, |
| "mean_token_accuracy": 0.9520752727985382, |
| "num_tokens": 2253965.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.21300265919417144, |
| "epoch": 1.44, |
| "grad_norm": 0.0284423828125, |
| "learning_rate": 8.369509496966254e-06, |
| "loss": 0.201596999168396, |
| "mean_token_accuracy": 0.9541342169046402, |
| "num_tokens": 2387688.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.1999088702723384, |
| "epoch": 1.52, |
| "grad_norm": 0.0274658203125, |
| "learning_rate": 7.725902286351813e-06, |
| "loss": 0.18661935329437257, |
| "mean_token_accuracy": 0.9588151663541794, |
| "num_tokens": 2513947.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.21373646408319474, |
| "epoch": 1.6, |
| "grad_norm": 0.0257568359375, |
| "learning_rate": 7.0806225742454765e-06, |
| "loss": 0.20502221584320068, |
| "mean_token_accuracy": 0.9535915687680244, |
| "num_tokens": 2641508.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.19736929275095463, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 0.0257568359375, |
| "learning_rate": 6.438447785872176e-06, |
| "loss": 0.18847702741622924, |
| "mean_token_accuracy": 0.9587804660201072, |
| "num_tokens": 2783740.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.19301388934254646, |
| "epoch": 1.76, |
| "grad_norm": 0.0302734375, |
| "learning_rate": 5.804132358687839e-06, |
| "loss": 0.18001898527145385, |
| "mean_token_accuracy": 0.9596618011593818, |
| "num_tokens": 2912164.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.16831486914306878, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 5.182372542187895e-06, |
| "loss": 0.15890954732894896, |
| "mean_token_accuracy": 0.9644858300685882, |
| "num_tokens": 3049526.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.1791680719703436, |
| "epoch": 1.92, |
| "grad_norm": 0.022705078125, |
| "learning_rate": 4.577771628519091e-06, |
| "loss": 0.16924891471862794, |
| "mean_token_accuracy": 0.9632360026240349, |
| "num_tokens": 3179372.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.17665232978761197, |
| "epoch": 2.0, |
| "grad_norm": 0.0184326171875, |
| "learning_rate": 3.9948058713149845e-06, |
| "loss": 0.16852205991744995, |
| "mean_token_accuracy": 0.9628902286291122, |
| "num_tokens": 3318608.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.17881185598671437, |
| "epoch": 2.08, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 3.4377913450801405e-06, |
| "loss": 0.17007871866226196, |
| "mean_token_accuracy": 0.963550227880478, |
| "num_tokens": 3453713.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.18688333593308926, |
| "epoch": 2.16, |
| "grad_norm": 0.029052734375, |
| "learning_rate": 2.9108519904845677e-06, |
| "loss": 0.1791600227355957, |
| "mean_token_accuracy": 0.961190114915371, |
| "num_tokens": 3577609.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.17757816668599843, |
| "epoch": 2.24, |
| "grad_norm": 0.01953125, |
| "learning_rate": 2.417889082149834e-06, |
| "loss": 0.17080665826797486, |
| "mean_token_accuracy": 0.9637410417199135, |
| "num_tokens": 3707617.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.17086777035146952, |
| "epoch": 2.32, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 1.962552344976702e-06, |
| "loss": 0.1627667188644409, |
| "mean_token_accuracy": 0.9644756108522415, |
| "num_tokens": 3837778.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.16370586045086383, |
| "epoch": 2.4, |
| "grad_norm": 0.02294921875, |
| "learning_rate": 1.5482129328588954e-06, |
| "loss": 0.15820531845092772, |
| "mean_token_accuracy": 0.9661343678832054, |
| "num_tokens": 3972183.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.17486946266144515, |
| "epoch": 2.48, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 1.1779384698391296e-06, |
| "loss": 0.16707814931869508, |
| "mean_token_accuracy": 0.9641691878437996, |
| "num_tokens": 4102229.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.17446570619940757, |
| "epoch": 2.56, |
| "grad_norm": 0.017333984375, |
| "learning_rate": 8.544703384939661e-07, |
| "loss": 0.1683019757270813, |
| "mean_token_accuracy": 0.9637968197464943, |
| "num_tokens": 4235775.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.1773039098829031, |
| "epoch": 2.64, |
| "grad_norm": 0.0206298828125, |
| "learning_rate": 5.802033836963438e-07, |
| "loss": 0.1716697096824646, |
| "mean_token_accuracy": 0.9628803566098213, |
| "num_tokens": 4369651.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.17304655965417623, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 0.0233154296875, |
| "learning_rate": 3.571681820220196e-07, |
| "loss": 0.16543103456497193, |
| "mean_token_accuracy": 0.9639842137694359, |
| "num_tokens": 4504684.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.17868599500507115, |
| "epoch": 2.8, |
| "grad_norm": 0.0201416015625, |
| "learning_rate": 1.870160080710398e-07, |
| "loss": 0.17553088665008545, |
| "mean_token_accuracy": 0.9631485670804978, |
| "num_tokens": 4633399.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.16202218551188707, |
| "epoch": 2.88, |
| "grad_norm": 0.0162353515625, |
| "learning_rate": 7.100660900832945e-08, |
| "loss": 0.15266696214675904, |
| "mean_token_accuracy": 0.9657531261444092, |
| "num_tokens": 4780905.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.17599052861332892, |
| "epoch": 2.96, |
| "grad_norm": 0.0206298828125, |
| "learning_rate": 9.998877836430998e-09, |
| "loss": 0.16831759214401246, |
| "mean_token_accuracy": 0.9632442593574524, |
| "num_tokens": 4916029.0, |
| "step": 370 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 375, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.1006213706296852e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|