| { | |
| "best_metric": 0.31098222732543945, | |
| "best_model_checkpoint": "./seq_clf/0523013301/checkpoint-24528", | |
| "epoch": 25.0, | |
| "eval_steps": 500, | |
| "global_step": 102200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.1223091976516634, | |
| "grad_norm": 3.0948333740234375, | |
| "learning_rate": 1.990215264187867e-05, | |
| "loss": 1.4987, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2446183953033268, | |
| "grad_norm": 4.519032955169678, | |
| "learning_rate": 1.9804305283757338e-05, | |
| "loss": 0.8512, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3669275929549902, | |
| "grad_norm": 6.118898868560791, | |
| "learning_rate": 1.970645792563601e-05, | |
| "loss": 0.7319, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4892367906066536, | |
| "grad_norm": 5.870689392089844, | |
| "learning_rate": 1.9608610567514678e-05, | |
| "loss": 0.673, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6115459882583171, | |
| "grad_norm": 5.046342849731445, | |
| "learning_rate": 1.951076320939335e-05, | |
| "loss": 0.6408, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7338551859099804, | |
| "grad_norm": 5.137794494628906, | |
| "learning_rate": 1.9412915851272015e-05, | |
| "loss": 0.6029, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8561643835616438, | |
| "grad_norm": 5.616779327392578, | |
| "learning_rate": 1.9315068493150686e-05, | |
| "loss": 0.5867, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9784735812133072, | |
| "grad_norm": 3.4356954097747803, | |
| "learning_rate": 1.9217221135029354e-05, | |
| "loss": 0.5654, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.8249802208386364, | |
| "eval_loss": 0.5318922400474548, | |
| "eval_runtime": 77.9312, | |
| "eval_samples_per_second": 373.034, | |
| "eval_steps_per_second": 11.664, | |
| "step": 4088 | |
| }, | |
| { | |
| "epoch": 1.1007827788649707, | |
| "grad_norm": 5.042200088500977, | |
| "learning_rate": 1.9119373776908026e-05, | |
| "loss": 0.5171, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.223091976516634, | |
| "grad_norm": 5.630020618438721, | |
| "learning_rate": 1.9021526418786694e-05, | |
| "loss": 0.4983, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.3454011741682974, | |
| "grad_norm": 3.5620667934417725, | |
| "learning_rate": 1.8923679060665363e-05, | |
| "loss": 0.488, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.467710371819961, | |
| "grad_norm": 6.017724990844727, | |
| "learning_rate": 1.8825831702544034e-05, | |
| "loss": 0.4867, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.5900195694716244, | |
| "grad_norm": 4.974002361297607, | |
| "learning_rate": 1.8727984344422703e-05, | |
| "loss": 0.4645, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.7123287671232876, | |
| "grad_norm": 5.986278057098389, | |
| "learning_rate": 1.863013698630137e-05, | |
| "loss": 0.4582, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.8346379647749511, | |
| "grad_norm": 5.255044937133789, | |
| "learning_rate": 1.853228962818004e-05, | |
| "loss": 0.4609, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.9569471624266144, | |
| "grad_norm": 5.023875713348389, | |
| "learning_rate": 1.843444227005871e-05, | |
| "loss": 0.4495, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.8506759313405111, | |
| "eval_loss": 0.4426937699317932, | |
| "eval_runtime": 78.0987, | |
| "eval_samples_per_second": 372.234, | |
| "eval_steps_per_second": 11.639, | |
| "step": 8176 | |
| }, | |
| { | |
| "epoch": 2.079256360078278, | |
| "grad_norm": 6.947430610656738, | |
| "learning_rate": 1.833659491193738e-05, | |
| "loss": 0.4058, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.2015655577299413, | |
| "grad_norm": 2.9549953937530518, | |
| "learning_rate": 1.8238747553816047e-05, | |
| "loss": 0.3764, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.323874755381605, | |
| "grad_norm": 6.063886642456055, | |
| "learning_rate": 1.814090019569472e-05, | |
| "loss": 0.3785, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.446183953033268, | |
| "grad_norm": 6.911349296569824, | |
| "learning_rate": 1.8043052837573387e-05, | |
| "loss": 0.3736, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.5684931506849313, | |
| "grad_norm": 5.118221282958984, | |
| "learning_rate": 1.7945205479452055e-05, | |
| "loss": 0.3684, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.690802348336595, | |
| "grad_norm": 4.369062423706055, | |
| "learning_rate": 1.7847358121330724e-05, | |
| "loss": 0.3607, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.8131115459882583, | |
| "grad_norm": 6.040197849273682, | |
| "learning_rate": 1.7749510763209395e-05, | |
| "loss": 0.3657, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.935420743639922, | |
| "grad_norm": 6.751057147979736, | |
| "learning_rate": 1.7651663405088064e-05, | |
| "loss": 0.3517, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.8710398679096006, | |
| "eval_loss": 0.3752569854259491, | |
| "eval_runtime": 78.0858, | |
| "eval_samples_per_second": 372.295, | |
| "eval_steps_per_second": 11.641, | |
| "step": 12264 | |
| }, | |
| { | |
| "epoch": 3.0577299412915853, | |
| "grad_norm": 5.28397798538208, | |
| "learning_rate": 1.7553816046966735e-05, | |
| "loss": 0.3172, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.1800391389432487, | |
| "grad_norm": 3.873892068862915, | |
| "learning_rate": 1.74559686888454e-05, | |
| "loss": 0.2865, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.302348336594912, | |
| "grad_norm": 3.584193468093872, | |
| "learning_rate": 1.735812133072407e-05, | |
| "loss": 0.285, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.4246575342465753, | |
| "grad_norm": 6.080500602722168, | |
| "learning_rate": 1.726027397260274e-05, | |
| "loss": 0.2867, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.5469667318982387, | |
| "grad_norm": 5.05246114730835, | |
| "learning_rate": 1.716242661448141e-05, | |
| "loss": 0.2793, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 3.6692759295499022, | |
| "grad_norm": 4.842950344085693, | |
| "learning_rate": 1.706457925636008e-05, | |
| "loss": 0.2829, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.7915851272015657, | |
| "grad_norm": 6.52248477935791, | |
| "learning_rate": 1.6966731898238748e-05, | |
| "loss": 0.2776, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 3.9138943248532287, | |
| "grad_norm": 5.814143657684326, | |
| "learning_rate": 1.686888454011742e-05, | |
| "loss": 0.268, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.8836985311822779, | |
| "eval_loss": 0.33874690532684326, | |
| "eval_runtime": 78.7296, | |
| "eval_samples_per_second": 369.251, | |
| "eval_steps_per_second": 11.546, | |
| "step": 16352 | |
| }, | |
| { | |
| "epoch": 4.036203522504892, | |
| "grad_norm": 4.4718194007873535, | |
| "learning_rate": 1.6771037181996088e-05, | |
| "loss": 0.2524, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.158512720156556, | |
| "grad_norm": 6.8225531578063965, | |
| "learning_rate": 1.6673189823874756e-05, | |
| "loss": 0.2121, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.280821917808219, | |
| "grad_norm": 4.292703151702881, | |
| "learning_rate": 1.6575342465753425e-05, | |
| "loss": 0.2168, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 4.403131115459883, | |
| "grad_norm": 6.740828514099121, | |
| "learning_rate": 1.6477495107632096e-05, | |
| "loss": 0.2079, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.525440313111546, | |
| "grad_norm": 7.279580593109131, | |
| "learning_rate": 1.6379647749510764e-05, | |
| "loss": 0.2178, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 4.64774951076321, | |
| "grad_norm": 8.839160919189453, | |
| "learning_rate": 1.6281800391389433e-05, | |
| "loss": 0.2135, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 4.770058708414873, | |
| "grad_norm": 5.748088836669922, | |
| "learning_rate": 1.6183953033268104e-05, | |
| "loss": 0.215, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 4.892367906066536, | |
| "grad_norm": 6.222439765930176, | |
| "learning_rate": 1.6086105675146773e-05, | |
| "loss": 0.2114, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.895531629458911, | |
| "eval_loss": 0.3136844038963318, | |
| "eval_runtime": 78.002, | |
| "eval_samples_per_second": 372.695, | |
| "eval_steps_per_second": 11.654, | |
| "step": 20440 | |
| }, | |
| { | |
| "epoch": 5.014677103718199, | |
| "grad_norm": 5.067689895629883, | |
| "learning_rate": 1.598825831702544e-05, | |
| "loss": 0.2001, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.136986301369863, | |
| "grad_norm": 5.334500789642334, | |
| "learning_rate": 1.589041095890411e-05, | |
| "loss": 0.1574, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 5.259295499021526, | |
| "grad_norm": 8.226490020751953, | |
| "learning_rate": 1.579256360078278e-05, | |
| "loss": 0.1599, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 5.38160469667319, | |
| "grad_norm": 5.279560565948486, | |
| "learning_rate": 1.569471624266145e-05, | |
| "loss": 0.1594, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 5.503913894324853, | |
| "grad_norm": 8.88842487335205, | |
| "learning_rate": 1.559686888454012e-05, | |
| "loss": 0.1577, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 5.626223091976517, | |
| "grad_norm": 6.64274787902832, | |
| "learning_rate": 1.5499021526418785e-05, | |
| "loss": 0.1566, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 5.74853228962818, | |
| "grad_norm": 8.864998817443848, | |
| "learning_rate": 1.5401174168297457e-05, | |
| "loss": 0.163, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 5.870841487279844, | |
| "grad_norm": 4.494806289672852, | |
| "learning_rate": 1.5303326810176125e-05, | |
| "loss": 0.159, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 5.993150684931507, | |
| "grad_norm": 8.841043472290039, | |
| "learning_rate": 1.5205479452054797e-05, | |
| "loss": 0.1593, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.9032025042138213, | |
| "eval_loss": 0.31098222732543945, | |
| "eval_runtime": 78.1256, | |
| "eval_samples_per_second": 372.106, | |
| "eval_steps_per_second": 11.635, | |
| "step": 24528 | |
| }, | |
| { | |
| "epoch": 6.1154598825831705, | |
| "grad_norm": 5.516913890838623, | |
| "learning_rate": 1.5107632093933464e-05, | |
| "loss": 0.1186, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 6.237769080234834, | |
| "grad_norm": 17.19109344482422, | |
| "learning_rate": 1.5009784735812134e-05, | |
| "loss": 0.1242, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 6.3600782778864975, | |
| "grad_norm": 4.817780494689941, | |
| "learning_rate": 1.4911937377690804e-05, | |
| "loss": 0.1264, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 6.48238747553816, | |
| "grad_norm": 9.93541431427002, | |
| "learning_rate": 1.4814090019569473e-05, | |
| "loss": 0.1251, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 6.604696673189824, | |
| "grad_norm": 7.402759552001953, | |
| "learning_rate": 1.4716242661448142e-05, | |
| "loss": 0.1205, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 6.727005870841487, | |
| "grad_norm": 4.997498989105225, | |
| "learning_rate": 1.4618395303326812e-05, | |
| "loss": 0.1221, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 6.8493150684931505, | |
| "grad_norm": 8.2695951461792, | |
| "learning_rate": 1.4520547945205482e-05, | |
| "loss": 0.126, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 6.971624266144814, | |
| "grad_norm": 7.226652145385742, | |
| "learning_rate": 1.4422700587084152e-05, | |
| "loss": 0.1238, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_accuracy": 0.9069863437790238, | |
| "eval_loss": 0.3151151239871979, | |
| "eval_runtime": 77.3038, | |
| "eval_samples_per_second": 376.062, | |
| "eval_steps_per_second": 11.759, | |
| "step": 28616 | |
| }, | |
| { | |
| "epoch": 7.0939334637964775, | |
| "grad_norm": 6.771386623382568, | |
| "learning_rate": 1.4324853228962818e-05, | |
| "loss": 0.1014, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 7.216242661448141, | |
| "grad_norm": 7.908430099487305, | |
| "learning_rate": 1.4227005870841488e-05, | |
| "loss": 0.0917, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 7.3385518590998045, | |
| "grad_norm": 11.553496360778809, | |
| "learning_rate": 1.4129158512720158e-05, | |
| "loss": 0.0989, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 7.460861056751468, | |
| "grad_norm": 5.48296594619751, | |
| "learning_rate": 1.4031311154598828e-05, | |
| "loss": 0.0945, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 7.583170254403131, | |
| "grad_norm": 4.382979869842529, | |
| "learning_rate": 1.3933463796477496e-05, | |
| "loss": 0.0904, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 7.705479452054795, | |
| "grad_norm": 4.479061126708984, | |
| "learning_rate": 1.3835616438356164e-05, | |
| "loss": 0.1003, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 7.8277886497064575, | |
| "grad_norm": 2.2252697944641113, | |
| "learning_rate": 1.3737769080234834e-05, | |
| "loss": 0.0963, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 7.950097847358121, | |
| "grad_norm": 5.149953842163086, | |
| "learning_rate": 1.3639921722113504e-05, | |
| "loss": 0.0969, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.9127652987513329, | |
| "eval_loss": 0.3146636486053467, | |
| "eval_runtime": 77.5909, | |
| "eval_samples_per_second": 374.67, | |
| "eval_steps_per_second": 11.715, | |
| "step": 32704 | |
| }, | |
| { | |
| "epoch": 8.072407045009784, | |
| "grad_norm": 10.789752960205078, | |
| "learning_rate": 1.3542074363992173e-05, | |
| "loss": 0.0788, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 8.194716242661448, | |
| "grad_norm": 5.515634536743164, | |
| "learning_rate": 1.3444227005870843e-05, | |
| "loss": 0.0717, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 8.317025440313111, | |
| "grad_norm": 10.454425811767578, | |
| "learning_rate": 1.3346379647749513e-05, | |
| "loss": 0.0719, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 8.439334637964775, | |
| "grad_norm": 7.216288089752197, | |
| "learning_rate": 1.3248532289628183e-05, | |
| "loss": 0.0764, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 8.561643835616438, | |
| "grad_norm": 6.058413028717041, | |
| "learning_rate": 1.3150684931506849e-05, | |
| "loss": 0.0775, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 8.683953033268102, | |
| "grad_norm": 1.9321287870407104, | |
| "learning_rate": 1.3052837573385519e-05, | |
| "loss": 0.0762, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 8.806262230919765, | |
| "grad_norm": 5.581308841705322, | |
| "learning_rate": 1.2954990215264189e-05, | |
| "loss": 0.0763, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 8.928571428571429, | |
| "grad_norm": 8.296586036682129, | |
| "learning_rate": 1.2857142857142859e-05, | |
| "loss": 0.0826, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_accuracy": 0.9133156754153624, | |
| "eval_loss": 0.33166706562042236, | |
| "eval_runtime": 77.4584, | |
| "eval_samples_per_second": 375.311, | |
| "eval_steps_per_second": 11.735, | |
| "step": 36792 | |
| }, | |
| { | |
| "epoch": 9.050880626223092, | |
| "grad_norm": 6.40161657333374, | |
| "learning_rate": 1.2759295499021527e-05, | |
| "loss": 0.0689, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 9.173189823874756, | |
| "grad_norm": 9.357067108154297, | |
| "learning_rate": 1.2661448140900197e-05, | |
| "loss": 0.0581, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 9.29549902152642, | |
| "grad_norm": 4.852538108825684, | |
| "learning_rate": 1.2563600782778867e-05, | |
| "loss": 0.0582, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 9.417808219178083, | |
| "grad_norm": 6.204359531402588, | |
| "learning_rate": 1.2465753424657537e-05, | |
| "loss": 0.06, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 9.540117416829746, | |
| "grad_norm": 11.658947944641113, | |
| "learning_rate": 1.2367906066536204e-05, | |
| "loss": 0.0626, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 9.66242661448141, | |
| "grad_norm": 12.047215461730957, | |
| "learning_rate": 1.2270058708414874e-05, | |
| "loss": 0.0626, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 9.784735812133073, | |
| "grad_norm": 11.078405380249023, | |
| "learning_rate": 1.2172211350293543e-05, | |
| "loss": 0.0655, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 9.907045009784735, | |
| "grad_norm": 3.5147736072540283, | |
| "learning_rate": 1.2074363992172213e-05, | |
| "loss": 0.0663, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.9173747033125795, | |
| "eval_loss": 0.34417206048965454, | |
| "eval_runtime": 77.3814, | |
| "eval_samples_per_second": 375.685, | |
| "eval_steps_per_second": 11.747, | |
| "step": 40880 | |
| }, | |
| { | |
| "epoch": 10.029354207436398, | |
| "grad_norm": 2.839688539505005, | |
| "learning_rate": 1.1976516634050882e-05, | |
| "loss": 0.0609, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 10.151663405088062, | |
| "grad_norm": 10.794103622436523, | |
| "learning_rate": 1.187866927592955e-05, | |
| "loss": 0.0467, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 10.273972602739725, | |
| "grad_norm": 3.0600426197052, | |
| "learning_rate": 1.178082191780822e-05, | |
| "loss": 0.0464, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 10.396281800391389, | |
| "grad_norm": 7.135795593261719, | |
| "learning_rate": 1.168297455968689e-05, | |
| "loss": 0.051, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 10.518590998043052, | |
| "grad_norm": 5.585446834564209, | |
| "learning_rate": 1.1585127201565558e-05, | |
| "loss": 0.0531, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 10.640900195694716, | |
| "grad_norm": 0.05372029170393944, | |
| "learning_rate": 1.1487279843444228e-05, | |
| "loss": 0.0504, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 10.76320939334638, | |
| "grad_norm": 12.072264671325684, | |
| "learning_rate": 1.1389432485322898e-05, | |
| "loss": 0.0511, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 10.885518590998043, | |
| "grad_norm": 5.787899971008301, | |
| "learning_rate": 1.1291585127201568e-05, | |
| "loss": 0.0519, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_accuracy": 0.9171339135220666, | |
| "eval_loss": 0.3852131962776184, | |
| "eval_runtime": 77.3953, | |
| "eval_samples_per_second": 375.617, | |
| "eval_steps_per_second": 11.745, | |
| "step": 44968 | |
| }, | |
| { | |
| "epoch": 11.007827788649706, | |
| "grad_norm": 8.351863861083984, | |
| "learning_rate": 1.1193737769080235e-05, | |
| "loss": 0.0508, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 11.13013698630137, | |
| "grad_norm": 6.567861080169678, | |
| "learning_rate": 1.1095890410958904e-05, | |
| "loss": 0.0403, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 11.252446183953033, | |
| "grad_norm": 5.368165016174316, | |
| "learning_rate": 1.0998043052837574e-05, | |
| "loss": 0.0419, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 11.374755381604697, | |
| "grad_norm": 11.540104866027832, | |
| "learning_rate": 1.0900195694716244e-05, | |
| "loss": 0.0466, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 11.49706457925636, | |
| "grad_norm": 2.5703656673431396, | |
| "learning_rate": 1.0802348336594913e-05, | |
| "loss": 0.0422, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 11.619373776908024, | |
| "grad_norm": 10.161659240722656, | |
| "learning_rate": 1.0704500978473583e-05, | |
| "loss": 0.0492, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 11.741682974559687, | |
| "grad_norm": 9.083320617675781, | |
| "learning_rate": 1.0606653620352253e-05, | |
| "loss": 0.0444, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 11.86399217221135, | |
| "grad_norm": 7.005116939544678, | |
| "learning_rate": 1.0508806262230922e-05, | |
| "loss": 0.0431, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 11.986301369863014, | |
| "grad_norm": 5.291450023651123, | |
| "learning_rate": 1.0410958904109589e-05, | |
| "loss": 0.045, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_accuracy": 0.9189226376801624, | |
| "eval_loss": 0.3933347463607788, | |
| "eval_runtime": 77.3468, | |
| "eval_samples_per_second": 375.852, | |
| "eval_steps_per_second": 11.752, | |
| "step": 49056 | |
| }, | |
| { | |
| "epoch": 12.108610567514678, | |
| "grad_norm": 3.0380940437316895, | |
| "learning_rate": 1.0313111545988259e-05, | |
| "loss": 0.0338, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 12.230919765166341, | |
| "grad_norm": 0.6975650787353516, | |
| "learning_rate": 1.0215264187866929e-05, | |
| "loss": 0.0343, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 12.353228962818005, | |
| "grad_norm": 0.18672548234462738, | |
| "learning_rate": 1.0117416829745599e-05, | |
| "loss": 0.0349, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 12.475538160469668, | |
| "grad_norm": 6.868373394012451, | |
| "learning_rate": 1.0019569471624267e-05, | |
| "loss": 0.0343, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 12.597847358121331, | |
| "grad_norm": 1.4025439023971558, | |
| "learning_rate": 9.921722113502935e-06, | |
| "loss": 0.0396, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 12.720156555772995, | |
| "grad_norm": 4.942438125610352, | |
| "learning_rate": 9.823874755381605e-06, | |
| "loss": 0.0381, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 12.842465753424658, | |
| "grad_norm": 4.044002056121826, | |
| "learning_rate": 9.726027397260275e-06, | |
| "loss": 0.0382, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 12.96477495107632, | |
| "grad_norm": 0.6736146211624146, | |
| "learning_rate": 9.628180039138944e-06, | |
| "loss": 0.0368, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_accuracy": 0.9228096728698704, | |
| "eval_loss": 0.38157734274864197, | |
| "eval_runtime": 77.3834, | |
| "eval_samples_per_second": 375.675, | |
| "eval_steps_per_second": 11.747, | |
| "step": 53144 | |
| }, | |
| { | |
| "epoch": 13.087084148727984, | |
| "grad_norm": 1.1303954124450684, | |
| "learning_rate": 9.530332681017614e-06, | |
| "loss": 0.0306, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 13.209393346379647, | |
| "grad_norm": 1.899757742881775, | |
| "learning_rate": 9.432485322896282e-06, | |
| "loss": 0.0302, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 13.33170254403131, | |
| "grad_norm": 10.160910606384277, | |
| "learning_rate": 9.334637964774952e-06, | |
| "loss": 0.0284, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 13.454011741682974, | |
| "grad_norm": 6.715288162231445, | |
| "learning_rate": 9.23679060665362e-06, | |
| "loss": 0.0351, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 13.576320939334638, | |
| "grad_norm": 9.383013725280762, | |
| "learning_rate": 9.13894324853229e-06, | |
| "loss": 0.0342, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 13.698630136986301, | |
| "grad_norm": 0.5024349093437195, | |
| "learning_rate": 9.04109589041096e-06, | |
| "loss": 0.0358, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 13.820939334637965, | |
| "grad_norm": 6.495013236999512, | |
| "learning_rate": 8.943248532289628e-06, | |
| "loss": 0.0361, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 13.943248532289628, | |
| "grad_norm": 3.360276699066162, | |
| "learning_rate": 8.845401174168298e-06, | |
| "loss": 0.0332, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_accuracy": 0.9229816655773795, | |
| "eval_loss": 0.4262143075466156, | |
| "eval_runtime": 77.1924, | |
| "eval_samples_per_second": 376.604, | |
| "eval_steps_per_second": 11.776, | |
| "step": 57232 | |
| }, | |
| { | |
| "epoch": 14.065557729941291, | |
| "grad_norm": 2.5697081089019775, | |
| "learning_rate": 8.747553816046968e-06, | |
| "loss": 0.0271, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 14.187866927592955, | |
| "grad_norm": 2.5552070140838623, | |
| "learning_rate": 8.649706457925636e-06, | |
| "loss": 0.0229, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 14.310176125244618, | |
| "grad_norm": 11.48392105102539, | |
| "learning_rate": 8.551859099804306e-06, | |
| "loss": 0.0275, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 14.432485322896282, | |
| "grad_norm": 4.35728645324707, | |
| "learning_rate": 8.454011741682975e-06, | |
| "loss": 0.026, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 14.554794520547945, | |
| "grad_norm": 18.579906463623047, | |
| "learning_rate": 8.356164383561644e-06, | |
| "loss": 0.0259, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 14.677103718199609, | |
| "grad_norm": 7.566853046417236, | |
| "learning_rate": 8.258317025440313e-06, | |
| "loss": 0.0281, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 14.799412915851272, | |
| "grad_norm": 0.4659290611743927, | |
| "learning_rate": 8.160469667318983e-06, | |
| "loss": 0.0283, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 14.921722113502936, | |
| "grad_norm": 5.014148235321045, | |
| "learning_rate": 8.062622309197653e-06, | |
| "loss": 0.0289, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_accuracy": 0.9236008393244126, | |
| "eval_loss": 0.43280109763145447, | |
| "eval_runtime": 77.9069, | |
| "eval_samples_per_second": 373.151, | |
| "eval_steps_per_second": 11.668, | |
| "step": 61320 | |
| }, | |
| { | |
| "epoch": 15.0440313111546, | |
| "grad_norm": 4.811933994293213, | |
| "learning_rate": 7.964774951076321e-06, | |
| "loss": 0.027, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 15.166340508806263, | |
| "grad_norm": 9.811365127563477, | |
| "learning_rate": 7.86692759295499e-06, | |
| "loss": 0.0214, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 15.288649706457926, | |
| "grad_norm": 1.3940095901489258, | |
| "learning_rate": 7.76908023483366e-06, | |
| "loss": 0.0219, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 15.41095890410959, | |
| "grad_norm": 0.06226726993918419, | |
| "learning_rate": 7.671232876712329e-06, | |
| "loss": 0.0226, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 15.533268101761253, | |
| "grad_norm": 8.294939994812012, | |
| "learning_rate": 7.573385518590999e-06, | |
| "loss": 0.0251, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 15.655577299412915, | |
| "grad_norm": 1.8952441215515137, | |
| "learning_rate": 7.475538160469667e-06, | |
| "loss": 0.0232, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 15.777886497064578, | |
| "grad_norm": 5.1830315589904785, | |
| "learning_rate": 7.377690802348337e-06, | |
| "loss": 0.0254, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 15.900195694716242, | |
| "grad_norm": 1.0473072528839111, | |
| "learning_rate": 7.279843444227006e-06, | |
| "loss": 0.0248, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_accuracy": 0.9260431357710434, | |
| "eval_loss": 0.4366961419582367, | |
| "eval_runtime": 77.3755, | |
| "eval_samples_per_second": 375.713, | |
| "eval_steps_per_second": 11.748, | |
| "step": 65408 | |
| }, | |
| { | |
| "epoch": 16.022504892367905, | |
| "grad_norm": 0.5687731504440308, | |
| "learning_rate": 7.181996086105676e-06, | |
| "loss": 0.0238, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 16.14481409001957, | |
| "grad_norm": 1.2872580289840698, | |
| "learning_rate": 7.0841487279843445e-06, | |
| "loss": 0.0194, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 16.267123287671232, | |
| "grad_norm": 5.059693813323975, | |
| "learning_rate": 6.9863013698630145e-06, | |
| "loss": 0.0199, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 16.389432485322896, | |
| "grad_norm": 6.738853931427002, | |
| "learning_rate": 6.8884540117416836e-06, | |
| "loss": 0.0205, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 16.51174168297456, | |
| "grad_norm": 11.091848373413086, | |
| "learning_rate": 6.790606653620353e-06, | |
| "loss": 0.0215, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 16.634050880626223, | |
| "grad_norm": 0.19167885184288025, | |
| "learning_rate": 6.692759295499022e-06, | |
| "loss": 0.022, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 16.756360078277886, | |
| "grad_norm": 5.966000080108643, | |
| "learning_rate": 6.594911937377692e-06, | |
| "loss": 0.0203, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 16.87866927592955, | |
| "grad_norm": 3.5066215991973877, | |
| "learning_rate": 6.49706457925636e-06, | |
| "loss": 0.0187, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_accuracy": 0.9272470847236077, | |
| "eval_loss": 0.44777625799179077, | |
| "eval_runtime": 77.1492, | |
| "eval_samples_per_second": 376.815, | |
| "eval_steps_per_second": 11.782, | |
| "step": 69496 | |
| }, | |
| { | |
| "epoch": 17.000978473581213, | |
| "grad_norm": 0.6858524680137634, | |
| "learning_rate": 6.39921722113503e-06, | |
| "loss": 0.0204, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 17.123287671232877, | |
| "grad_norm": 9.703988075256348, | |
| "learning_rate": 6.301369863013699e-06, | |
| "loss": 0.0161, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 17.24559686888454, | |
| "grad_norm": 4.51020622253418, | |
| "learning_rate": 6.203522504892369e-06, | |
| "loss": 0.016, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 17.367906066536204, | |
| "grad_norm": 0.09826533496379852, | |
| "learning_rate": 6.105675146771037e-06, | |
| "loss": 0.0177, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 17.490215264187867, | |
| "grad_norm": 11.620870590209961, | |
| "learning_rate": 6.007827788649707e-06, | |
| "loss": 0.02, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 17.61252446183953, | |
| "grad_norm": 8.661576271057129, | |
| "learning_rate": 5.909980430528376e-06, | |
| "loss": 0.017, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 17.734833659491194, | |
| "grad_norm": 0.7512950897216797, | |
| "learning_rate": 5.812133072407045e-06, | |
| "loss": 0.0186, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 17.857142857142858, | |
| "grad_norm": 0.44471457600593567, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": 0.0228, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 17.97945205479452, | |
| "grad_norm": 1.0926498174667358, | |
| "learning_rate": 5.6164383561643845e-06, | |
| "loss": 0.0178, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_accuracy": 0.9264559182690654, | |
| "eval_loss": 0.46360349655151367, | |
| "eval_runtime": 77.3263, | |
| "eval_samples_per_second": 375.952, | |
| "eval_steps_per_second": 11.755, | |
| "step": 73584 | |
| }, | |
| { | |
| "epoch": 18.101761252446185, | |
| "grad_norm": 8.134523391723633, | |
| "learning_rate": 5.518590998043053e-06, | |
| "loss": 0.0153, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 18.224070450097848, | |
| "grad_norm": 0.08053633570671082, | |
| "learning_rate": 5.420743639921723e-06, | |
| "loss": 0.016, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 18.34637964774951, | |
| "grad_norm": 0.44435006380081177, | |
| "learning_rate": 5.322896281800392e-06, | |
| "loss": 0.0146, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 18.468688845401175, | |
| "grad_norm": 2.0229480266571045, | |
| "learning_rate": 5.225048923679062e-06, | |
| "loss": 0.017, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 18.59099804305284, | |
| "grad_norm": 4.502400875091553, | |
| "learning_rate": 5.12720156555773e-06, | |
| "loss": 0.017, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 18.713307240704502, | |
| "grad_norm": 0.14978571236133575, | |
| "learning_rate": 5.0293542074364e-06, | |
| "loss": 0.0154, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 18.835616438356166, | |
| "grad_norm": 9.524068832397461, | |
| "learning_rate": 4.931506849315069e-06, | |
| "loss": 0.0165, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 18.95792563600783, | |
| "grad_norm": 0.9707121253013611, | |
| "learning_rate": 4.833659491193738e-06, | |
| "loss": 0.0154, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_accuracy": 0.9266623095180765, | |
| "eval_loss": 0.47839030623435974, | |
| "eval_runtime": 77.6682, | |
| "eval_samples_per_second": 374.297, | |
| "eval_steps_per_second": 11.704, | |
| "step": 77672 | |
| }, | |
| { | |
| "epoch": 19.080234833659492, | |
| "grad_norm": 11.346772193908691, | |
| "learning_rate": 4.735812133072407e-06, | |
| "loss": 0.0136, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 19.202544031311156, | |
| "grad_norm": 0.7873362302780151, | |
| "learning_rate": 4.637964774951076e-06, | |
| "loss": 0.0131, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 19.32485322896282, | |
| "grad_norm": 1.0734236240386963, | |
| "learning_rate": 4.5401174168297455e-06, | |
| "loss": 0.0156, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 19.447162426614483, | |
| "grad_norm": 0.031163902953267097, | |
| "learning_rate": 4.442270058708415e-06, | |
| "loss": 0.0148, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 19.569471624266146, | |
| "grad_norm": 3.0132200717926025, | |
| "learning_rate": 4.3444227005870845e-06, | |
| "loss": 0.0142, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 19.69178082191781, | |
| "grad_norm": 5.072915554046631, | |
| "learning_rate": 4.246575342465754e-06, | |
| "loss": 0.0133, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 19.81409001956947, | |
| "grad_norm": 0.12334894388914108, | |
| "learning_rate": 4.148727984344423e-06, | |
| "loss": 0.0153, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 19.936399217221137, | |
| "grad_norm": 3.8586535453796387, | |
| "learning_rate": 4.050880626223092e-06, | |
| "loss": 0.0138, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_accuracy": 0.9280726497196519, | |
| "eval_loss": 0.4880678355693817, | |
| "eval_runtime": 77.1562, | |
| "eval_samples_per_second": 376.781, | |
| "eval_steps_per_second": 11.781, | |
| "step": 81760 | |
| }, | |
| { | |
| "epoch": 20.058708414872797, | |
| "grad_norm": 5.422935485839844, | |
| "learning_rate": 3.953033268101762e-06, | |
| "loss": 0.0123, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 20.18101761252446, | |
| "grad_norm": 0.02908429130911827, | |
| "learning_rate": 3.855185909980431e-06, | |
| "loss": 0.0127, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 20.303326810176124, | |
| "grad_norm": 0.9683161973953247, | |
| "learning_rate": 3.7573385518591e-06, | |
| "loss": 0.0106, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 20.425636007827787, | |
| "grad_norm": 2.011960029602051, | |
| "learning_rate": 3.659491193737769e-06, | |
| "loss": 0.0129, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 20.54794520547945, | |
| "grad_norm": 0.011290821246802807, | |
| "learning_rate": 3.5616438356164386e-06, | |
| "loss": 0.0098, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 20.670254403131114, | |
| "grad_norm": 3.6643381118774414, | |
| "learning_rate": 3.4637964774951077e-06, | |
| "loss": 0.0123, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 20.792563600782778, | |
| "grad_norm": 0.28946495056152344, | |
| "learning_rate": 3.365949119373777e-06, | |
| "loss": 0.0119, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 20.91487279843444, | |
| "grad_norm": 0.8526332974433899, | |
| "learning_rate": 3.2681017612524463e-06, | |
| "loss": 0.0127, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_accuracy": 0.9281758453441574, | |
| "eval_loss": 0.49737828969955444, | |
| "eval_runtime": 77.6066, | |
| "eval_samples_per_second": 374.594, | |
| "eval_steps_per_second": 11.713, | |
| "step": 85848 | |
| }, | |
| { | |
| "epoch": 21.037181996086105, | |
| "grad_norm": 0.038626112043857574, | |
| "learning_rate": 3.1702544031311154e-06, | |
| "loss": 0.0133, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 21.159491193737768, | |
| "grad_norm": 1.5674916505813599, | |
| "learning_rate": 3.072407045009785e-06, | |
| "loss": 0.0079, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 21.28180039138943, | |
| "grad_norm": 0.023863431066274643, | |
| "learning_rate": 2.974559686888454e-06, | |
| "loss": 0.0089, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 21.404109589041095, | |
| "grad_norm": 0.027295144274830818, | |
| "learning_rate": 2.876712328767123e-06, | |
| "loss": 0.0099, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 21.52641878669276, | |
| "grad_norm": 1.8621610403060913, | |
| "learning_rate": 2.7788649706457927e-06, | |
| "loss": 0.01, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 21.648727984344422, | |
| "grad_norm": 0.09760987758636475, | |
| "learning_rate": 2.681017612524462e-06, | |
| "loss": 0.0115, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 21.771037181996086, | |
| "grad_norm": 0.0476505346596241, | |
| "learning_rate": 2.5831702544031313e-06, | |
| "loss": 0.0096, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 21.89334637964775, | |
| "grad_norm": 0.08127936720848083, | |
| "learning_rate": 2.4853228962818004e-06, | |
| "loss": 0.0116, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_accuracy": 0.9293797942967218, | |
| "eval_loss": 0.4935356378555298, | |
| "eval_runtime": 79.4825, | |
| "eval_samples_per_second": 365.754, | |
| "eval_steps_per_second": 11.436, | |
| "step": 89936 | |
| }, | |
| { | |
| "epoch": 22.015655577299412, | |
| "grad_norm": 1.2286592721939087, | |
| "learning_rate": 2.3874755381604695e-06, | |
| "loss": 0.0086, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 22.137964774951076, | |
| "grad_norm": 1.9946578741073608, | |
| "learning_rate": 2.289628180039139e-06, | |
| "loss": 0.008, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 22.26027397260274, | |
| "grad_norm": 0.024408530443906784, | |
| "learning_rate": 2.191780821917808e-06, | |
| "loss": 0.0115, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 22.382583170254403, | |
| "grad_norm": 0.011225187219679356, | |
| "learning_rate": 2.0939334637964777e-06, | |
| "loss": 0.0085, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 22.504892367906066, | |
| "grad_norm": 1.2969368696212769, | |
| "learning_rate": 1.996086105675147e-06, | |
| "loss": 0.0089, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 22.62720156555773, | |
| "grad_norm": 6.824276447296143, | |
| "learning_rate": 1.8982387475538161e-06, | |
| "loss": 0.0088, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 22.749510763209393, | |
| "grad_norm": 0.052455250173807144, | |
| "learning_rate": 1.8003913894324854e-06, | |
| "loss": 0.0088, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 22.871819960861057, | |
| "grad_norm": 0.04021551460027695, | |
| "learning_rate": 1.7025440313111545e-06, | |
| "loss": 0.009, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 22.99412915851272, | |
| "grad_norm": 0.06902284175157547, | |
| "learning_rate": 1.6046966731898239e-06, | |
| "loss": 0.0106, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_accuracy": 0.93065254033229, | |
| "eval_loss": 0.4948473870754242, | |
| "eval_runtime": 79.6762, | |
| "eval_samples_per_second": 364.864, | |
| "eval_steps_per_second": 11.409, | |
| "step": 94024 | |
| }, | |
| { | |
| "epoch": 23.116438356164384, | |
| "grad_norm": 0.5471883416175842, | |
| "learning_rate": 1.5068493150684932e-06, | |
| "loss": 0.0068, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 23.238747553816047, | |
| "grad_norm": 0.16019482910633087, | |
| "learning_rate": 1.4090019569471625e-06, | |
| "loss": 0.0089, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 23.36105675146771, | |
| "grad_norm": 6.4080963134765625, | |
| "learning_rate": 1.3111545988258318e-06, | |
| "loss": 0.0077, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 23.483365949119374, | |
| "grad_norm": 0.13792355358600616, | |
| "learning_rate": 1.213307240704501e-06, | |
| "loss": 0.0076, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 23.605675146771038, | |
| "grad_norm": 2.563711404800415, | |
| "learning_rate": 1.1154598825831702e-06, | |
| "loss": 0.0075, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 23.7279843444227, | |
| "grad_norm": 0.04809940978884697, | |
| "learning_rate": 1.0176125244618395e-06, | |
| "loss": 0.0067, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 23.850293542074365, | |
| "grad_norm": 0.057318106293678284, | |
| "learning_rate": 9.197651663405089e-07, | |
| "loss": 0.0062, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 23.972602739726028, | |
| "grad_norm": 4.935258865356445, | |
| "learning_rate": 8.219178082191781e-07, | |
| "loss": 0.0085, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_accuracy": 0.9311685184548175, | |
| "eval_loss": 0.49324169754981995, | |
| "eval_runtime": 79.011, | |
| "eval_samples_per_second": 367.936, | |
| "eval_steps_per_second": 11.505, | |
| "step": 98112 | |
| }, | |
| { | |
| "epoch": 24.09491193737769, | |
| "grad_norm": 6.268829345703125, | |
| "learning_rate": 7.240704500978474e-07, | |
| "loss": 0.0055, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 24.217221135029355, | |
| "grad_norm": 0.00456130551174283, | |
| "learning_rate": 6.262230919765167e-07, | |
| "loss": 0.0054, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 24.33953033268102, | |
| "grad_norm": 0.014587494544684887, | |
| "learning_rate": 5.283757338551859e-07, | |
| "loss": 0.0067, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 24.461839530332682, | |
| "grad_norm": 0.7037560343742371, | |
| "learning_rate": 4.305283757338552e-07, | |
| "loss": 0.0059, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 24.584148727984346, | |
| "grad_norm": 0.006079969462007284, | |
| "learning_rate": 3.326810176125245e-07, | |
| "loss": 0.0074, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 24.70645792563601, | |
| "grad_norm": 1.3056970834732056, | |
| "learning_rate": 2.3483365949119375e-07, | |
| "loss": 0.0075, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 24.828767123287673, | |
| "grad_norm": 8.87458324432373, | |
| "learning_rate": 1.36986301369863e-07, | |
| "loss": 0.0065, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 24.951076320939336, | |
| "grad_norm": 14.917941093444824, | |
| "learning_rate": 3.9138943248532294e-08, | |
| "loss": 0.0072, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_accuracy": 0.9311341199133156, | |
| "eval_loss": 0.49332907795906067, | |
| "eval_runtime": 79.9831, | |
| "eval_samples_per_second": 363.464, | |
| "eval_steps_per_second": 11.365, | |
| "step": 102200 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 102200, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 25, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.33360446554112e+17, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |