{
  "best_metric": 0.9545454382896423,
  "best_model_checkpoint": "/kaggle/working/ckpts/checkpoint-2453",
  "epoch": 14.0,
  "eval_steps": 500,
  "global_step": 3122,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.45,
      "grad_norm": 2.3771965503692627,
      "learning_rate": 9.701046337817639e-06,
      "loss": 1.5285,
      "step": 100
    },
    {
      "epoch": 0.9,
      "grad_norm": 2.858564615249634,
      "learning_rate": 9.402092675635277e-06,
      "loss": 1.1628,
      "step": 200
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.7727272510528564,
      "eval_loss": 0.712559700012207,
      "eval_runtime": 9.2054,
      "eval_samples_per_second": 21.509,
      "eval_steps_per_second": 5.432,
      "step": 223
    },
    {
      "epoch": 1.35,
      "grad_norm": 12.418754577636719,
      "learning_rate": 9.106128550074738e-06,
      "loss": 0.8286,
      "step": 300
    },
    {
      "epoch": 1.79,
      "grad_norm": 7.768007755279541,
      "learning_rate": 8.807174887892378e-06,
      "loss": 0.6562,
      "step": 400
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.8484848737716675,
      "eval_loss": 0.5068599581718445,
      "eval_runtime": 9.5278,
      "eval_samples_per_second": 20.781,
      "eval_steps_per_second": 5.248,
      "step": 446
    },
    {
      "epoch": 2.24,
      "grad_norm": 9.381482124328613,
      "learning_rate": 8.51121076233184e-06,
      "loss": 0.5053,
      "step": 500
    },
    {
      "epoch": 2.69,
      "grad_norm": 2.293752908706665,
      "learning_rate": 8.212257100149478e-06,
      "loss": 0.4199,
      "step": 600
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.8989899158477783,
      "eval_loss": 0.356963574886322,
      "eval_runtime": 9.2472,
      "eval_samples_per_second": 21.412,
      "eval_steps_per_second": 5.407,
      "step": 669
    },
    {
      "epoch": 3.14,
      "grad_norm": 23.293209075927734,
      "learning_rate": 7.916292974588939e-06,
      "loss": 0.3121,
      "step": 700
    },
    {
      "epoch": 3.59,
      "grad_norm": 3.8754687309265137,
      "learning_rate": 7.617339312406578e-06,
      "loss": 0.325,
      "step": 800
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.939393937587738,
      "eval_loss": 0.20920716226100922,
      "eval_runtime": 9.2568,
      "eval_samples_per_second": 21.39,
      "eval_steps_per_second": 5.401,
      "step": 892
    },
    {
      "epoch": 4.04,
      "grad_norm": 62.81392288208008,
      "learning_rate": 7.318385650224216e-06,
      "loss": 0.2896,
      "step": 900
    },
    {
      "epoch": 4.48,
      "grad_norm": 35.08163833618164,
      "learning_rate": 7.019431988041854e-06,
      "loss": 0.2535,
      "step": 1000
    },
    {
      "epoch": 4.93,
      "grad_norm": 14.269490242004395,
      "learning_rate": 6.720478325859492e-06,
      "loss": 0.2217,
      "step": 1100
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.9444444179534912,
      "eval_loss": 0.23924072086811066,
      "eval_runtime": 9.2044,
      "eval_samples_per_second": 21.511,
      "eval_steps_per_second": 5.432,
      "step": 1115
    },
    {
      "epoch": 5.38,
      "grad_norm": 0.41719043254852295,
      "learning_rate": 6.421524663677131e-06,
      "loss": 0.2165,
      "step": 1200
    },
    {
      "epoch": 5.83,
      "grad_norm": 1.484471321105957,
      "learning_rate": 6.1225710014947695e-06,
      "loss": 0.1831,
      "step": 1300
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.9292929172515869,
      "eval_loss": 0.27538299560546875,
      "eval_runtime": 9.1435,
      "eval_samples_per_second": 21.655,
      "eval_steps_per_second": 5.468,
      "step": 1338
    },
    {
      "epoch": 6.28,
      "grad_norm": 0.09743738174438477,
      "learning_rate": 5.823617339312408e-06,
      "loss": 0.2059,
      "step": 1400
    },
    {
      "epoch": 6.73,
      "grad_norm": 0.3065042793750763,
      "learning_rate": 5.524663677130046e-06,
      "loss": 0.1598,
      "step": 1500
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.9343434572219849,
      "eval_loss": 0.3294394910335541,
      "eval_runtime": 9.1064,
      "eval_samples_per_second": 21.743,
      "eval_steps_per_second": 5.491,
      "step": 1561
    },
    {
      "epoch": 7.17,
      "grad_norm": 0.05342373996973038,
      "learning_rate": 5.228699551569507e-06,
      "loss": 0.1455,
      "step": 1600
    },
    {
      "epoch": 7.62,
      "grad_norm": 1.5460679531097412,
      "learning_rate": 4.929745889387145e-06,
      "loss": 0.1676,
      "step": 1700
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.9494949579238892,
      "eval_loss": 0.2668905258178711,
      "eval_runtime": 9.2118,
      "eval_samples_per_second": 21.494,
      "eval_steps_per_second": 5.428,
      "step": 1784
    },
    {
      "epoch": 8.07,
      "grad_norm": 17.537992477416992,
      "learning_rate": 4.630792227204783e-06,
      "loss": 0.1762,
      "step": 1800
    },
    {
      "epoch": 8.52,
      "grad_norm": 0.20349286496639252,
      "learning_rate": 4.3318385650224224e-06,
      "loss": 0.1566,
      "step": 1900
    },
    {
      "epoch": 8.97,
      "grad_norm": 15.300110816955566,
      "learning_rate": 4.03288490284006e-06,
      "loss": 0.1597,
      "step": 2000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.9292929172515869,
      "eval_loss": 0.34383586049079895,
      "eval_runtime": 9.179,
      "eval_samples_per_second": 21.571,
      "eval_steps_per_second": 5.447,
      "step": 2007
    },
    {
      "epoch": 9.42,
      "grad_norm": 0.4512959420681,
      "learning_rate": 3.7339312406576984e-06,
      "loss": 0.1416,
      "step": 2100
    },
    {
      "epoch": 9.87,
      "grad_norm": 0.7455862760543823,
      "learning_rate": 3.4349775784753366e-06,
      "loss": 0.1132,
      "step": 2200
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.9444444179534912,
      "eval_loss": 0.31586208939552307,
      "eval_runtime": 9.1631,
      "eval_samples_per_second": 21.608,
      "eval_steps_per_second": 5.457,
      "step": 2230
    },
    {
      "epoch": 10.31,
      "grad_norm": 0.25966259837150574,
      "learning_rate": 3.136023916292975e-06,
      "loss": 0.1654,
      "step": 2300
    },
    {
      "epoch": 10.76,
      "grad_norm": 0.45347365736961365,
      "learning_rate": 2.8370702541106134e-06,
      "loss": 0.1224,
      "step": 2400
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.9545454382896423,
      "eval_loss": 0.29796990752220154,
      "eval_runtime": 9.1354,
      "eval_samples_per_second": 21.674,
      "eval_steps_per_second": 5.473,
      "step": 2453
    },
    {
      "epoch": 11.21,
      "grad_norm": 27.043094635009766,
      "learning_rate": 2.538116591928251e-06,
      "loss": 0.1021,
      "step": 2500
    },
    {
      "epoch": 11.66,
      "grad_norm": 72.37726593017578,
      "learning_rate": 2.2391629297458894e-06,
      "loss": 0.095,
      "step": 2600
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.9444444179534912,
      "eval_loss": 0.2970119118690491,
      "eval_runtime": 9.1388,
      "eval_samples_per_second": 21.666,
      "eval_steps_per_second": 5.471,
      "step": 2676
    },
    {
      "epoch": 12.11,
      "grad_norm": 0.6068007946014404,
      "learning_rate": 1.940209267563528e-06,
      "loss": 0.1307,
      "step": 2700
    },
    {
      "epoch": 12.56,
      "grad_norm": 4.567564964294434,
      "learning_rate": 1.641255605381166e-06,
      "loss": 0.1087,
      "step": 2800
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.9343434572219849,
      "eval_loss": 0.34486597776412964,
      "eval_runtime": 9.3094,
      "eval_samples_per_second": 21.269,
      "eval_steps_per_second": 5.371,
      "step": 2899
    },
    {
      "epoch": 13.0,
      "grad_norm": 41.62958908081055,
      "learning_rate": 1.3423019431988044e-06,
      "loss": 0.0917,
      "step": 2900
    },
    {
      "epoch": 13.45,
      "grad_norm": 0.026164406910538673,
      "learning_rate": 1.0433482810164425e-06,
      "loss": 0.0904,
      "step": 3000
    },
    {
      "epoch": 13.9,
      "grad_norm": 52.47389221191406,
      "learning_rate": 7.443946188340807e-07,
      "loss": 0.1254,
      "step": 3100
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.9444444179534912,
      "eval_loss": 0.31978654861450195,
      "eval_runtime": 9.2595,
      "eval_samples_per_second": 21.384,
      "eval_steps_per_second": 5.4,
      "step": 3122
    },
    {
      "epoch": 14.0,
      "step": 3122,
      "total_flos": 7.9842219974856e+17,
      "train_loss": 0.30023268478028214,
      "train_runtime": 1742.0026,
      "train_samples_per_second": 15.336,
      "train_steps_per_second": 1.92
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.9545454382896423,
      "eval_loss": 0.29796990752220154,
      "eval_runtime": 8.9823,
      "eval_samples_per_second": 22.043,
      "eval_steps_per_second": 5.566,
      "step": 3122
    }
  ],
  "logging_steps": 100,
  "max_steps": 3345,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "total_flos": 7.9842219974856e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}