{
  "best_global_step": 450,
  "best_metric": 0.5457363724708557,
  "best_model_checkpoint": "./vit-results/checkpoint-400",
  "epoch": 10.0,
  "eval_steps": 50,
  "global_step": 490,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.20408163265306123,
      "grad_norm": 1.3306224346160889,
      "learning_rate": 1.963265306122449e-05,
      "loss": 1.3374,
      "step": 10
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 1.4784319400787354,
      "learning_rate": 1.922448979591837e-05,
      "loss": 1.265,
      "step": 20
    },
    {
      "epoch": 0.6122448979591837,
      "grad_norm": 2.0332705974578857,
      "learning_rate": 1.8816326530612246e-05,
      "loss": 1.2081,
      "step": 30
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 1.4771761894226074,
      "learning_rate": 1.8408163265306125e-05,
      "loss": 1.2104,
      "step": 40
    },
    {
      "epoch": 1.0204081632653061,
      "grad_norm": 1.5638868808746338,
      "learning_rate": 1.8e-05,
      "loss": 1.1453,
      "step": 50
    },
    {
      "epoch": 1.0204081632653061,
      "eval_accuracy": 0.475,
      "eval_loss": 1.1232645511627197,
      "eval_runtime": 1.3068,
      "eval_samples_per_second": 61.216,
      "eval_steps_per_second": 7.652,
      "step": 50
    },
    {
      "epoch": 1.2244897959183674,
      "grad_norm": 2.1241655349731445,
      "learning_rate": 1.759183673469388e-05,
      "loss": 1.0284,
      "step": 60
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 2.0577926635742188,
      "learning_rate": 1.7183673469387755e-05,
      "loss": 1.0428,
      "step": 70
    },
    {
      "epoch": 1.6326530612244898,
      "grad_norm": 1.7612333297729492,
      "learning_rate": 1.6775510204081634e-05,
      "loss": 1.008,
      "step": 80
    },
    {
      "epoch": 1.836734693877551,
      "grad_norm": 2.2085816860198975,
      "learning_rate": 1.6367346938775513e-05,
      "loss": 1.0095,
      "step": 90
    },
    {
      "epoch": 2.0408163265306123,
      "grad_norm": 1.5611677169799805,
      "learning_rate": 1.595918367346939e-05,
      "loss": 0.9155,
      "step": 100
    },
    {
      "epoch": 2.0408163265306123,
      "eval_accuracy": 0.7,
      "eval_loss": 0.964668869972229,
      "eval_runtime": 1.2647,
      "eval_samples_per_second": 63.255,
      "eval_steps_per_second": 7.907,
      "step": 100
    },
    {
      "epoch": 2.2448979591836733,
      "grad_norm": 2.1504409313201904,
      "learning_rate": 1.5551020408163265e-05,
      "loss": 0.8396,
      "step": 110
    },
    {
      "epoch": 2.4489795918367347,
      "grad_norm": 1.9358028173446655,
      "learning_rate": 1.5142857142857144e-05,
      "loss": 0.8618,
      "step": 120
    },
    {
      "epoch": 2.6530612244897958,
      "grad_norm": 1.5132843255996704,
      "learning_rate": 1.4734693877551021e-05,
      "loss": 0.8077,
      "step": 130
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 2.5402004718780518,
      "learning_rate": 1.43265306122449e-05,
      "loss": 0.7742,
      "step": 140
    },
    {
      "epoch": 3.061224489795918,
      "grad_norm": 2.259558916091919,
      "learning_rate": 1.3918367346938776e-05,
      "loss": 0.7638,
      "step": 150
    },
    {
      "epoch": 3.061224489795918,
      "eval_accuracy": 0.75,
      "eval_loss": 0.8326537013053894,
      "eval_runtime": 1.2728,
      "eval_samples_per_second": 62.853,
      "eval_steps_per_second": 7.857,
      "step": 150
    },
    {
      "epoch": 3.2653061224489797,
      "grad_norm": 2.010672092437744,
      "learning_rate": 1.3510204081632654e-05,
      "loss": 0.672,
      "step": 160
    },
    {
      "epoch": 3.4693877551020407,
      "grad_norm": 1.8449556827545166,
      "learning_rate": 1.3102040816326531e-05,
      "loss": 0.6367,
      "step": 170
    },
    {
      "epoch": 3.673469387755102,
      "grad_norm": 1.8838822841644287,
      "learning_rate": 1.2693877551020409e-05,
      "loss": 0.6125,
      "step": 180
    },
    {
      "epoch": 3.877551020408163,
      "grad_norm": 2.325084686279297,
      "learning_rate": 1.2285714285714288e-05,
      "loss": 0.5768,
      "step": 190
    },
    {
      "epoch": 4.081632653061225,
      "grad_norm": 2.084455966949463,
      "learning_rate": 1.1877551020408165e-05,
      "loss": 0.537,
      "step": 200
    },
    {
      "epoch": 4.081632653061225,
      "eval_accuracy": 0.7875,
      "eval_loss": 0.7291887402534485,
      "eval_runtime": 1.2754,
      "eval_samples_per_second": 62.726,
      "eval_steps_per_second": 7.841,
      "step": 200
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 2.3336734771728516,
      "learning_rate": 1.146938775510204e-05,
      "loss": 0.5,
      "step": 210
    },
    {
      "epoch": 4.489795918367347,
      "grad_norm": 2.6175546646118164,
      "learning_rate": 1.1061224489795918e-05,
      "loss": 0.4665,
      "step": 220
    },
    {
      "epoch": 4.6938775510204085,
      "grad_norm": 1.6590876579284668,
      "learning_rate": 1.0653061224489796e-05,
      "loss": 0.448,
      "step": 230
    },
    {
      "epoch": 4.8979591836734695,
      "grad_norm": 1.8714507818222046,
      "learning_rate": 1.0244897959183675e-05,
      "loss": 0.4491,
      "step": 240
    },
    {
      "epoch": 5.1020408163265305,
      "grad_norm": 2.239349365234375,
      "learning_rate": 9.836734693877552e-06,
      "loss": 0.3957,
      "step": 250
    },
    {
      "epoch": 5.1020408163265305,
      "eval_accuracy": 0.825,
      "eval_loss": 0.6328426599502563,
      "eval_runtime": 1.2652,
      "eval_samples_per_second": 63.23,
      "eval_steps_per_second": 7.904,
      "step": 250
    },
    {
      "epoch": 5.3061224489795915,
      "grad_norm": 2.4021127223968506,
      "learning_rate": 9.42857142857143e-06,
      "loss": 0.3601,
      "step": 260
    },
    {
      "epoch": 5.510204081632653,
      "grad_norm": 1.2185922861099243,
      "learning_rate": 9.020408163265307e-06,
      "loss": 0.3431,
      "step": 270
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 1.0699914693832397,
      "learning_rate": 8.612244897959184e-06,
      "loss": 0.3646,
      "step": 280
    },
    {
      "epoch": 5.918367346938775,
      "grad_norm": 1.5863635540008545,
      "learning_rate": 8.204081632653062e-06,
      "loss": 0.3129,
      "step": 290
    },
    {
      "epoch": 6.122448979591836,
      "grad_norm": 1.0668057203292847,
      "learning_rate": 7.79591836734694e-06,
      "loss": 0.2842,
      "step": 300
    },
    {
      "epoch": 6.122448979591836,
      "eval_accuracy": 0.7875,
      "eval_loss": 0.5894995927810669,
      "eval_runtime": 1.2447,
      "eval_samples_per_second": 64.272,
      "eval_steps_per_second": 8.034,
      "step": 300
    },
    {
      "epoch": 6.326530612244898,
      "grad_norm": 1.0372223854064941,
      "learning_rate": 7.387755102040817e-06,
      "loss": 0.2725,
      "step": 310
    },
    {
      "epoch": 6.530612244897959,
      "grad_norm": 1.855797529220581,
      "learning_rate": 6.979591836734695e-06,
      "loss": 0.2596,
      "step": 320
    },
    {
      "epoch": 6.73469387755102,
      "grad_norm": 1.3400880098342896,
      "learning_rate": 6.571428571428572e-06,
      "loss": 0.2579,
      "step": 330
    },
    {
      "epoch": 6.938775510204081,
      "grad_norm": 1.0782897472381592,
      "learning_rate": 6.163265306122449e-06,
      "loss": 0.2394,
      "step": 340
    },
    {
      "epoch": 7.142857142857143,
      "grad_norm": 1.1473782062530518,
      "learning_rate": 5.755102040816327e-06,
      "loss": 0.2266,
      "step": 350
    },
    {
      "epoch": 7.142857142857143,
      "eval_accuracy": 0.8125,
      "eval_loss": 0.5921751260757446,
      "eval_runtime": 1.2621,
      "eval_samples_per_second": 63.388,
      "eval_steps_per_second": 7.924,
      "step": 350
    },
    {
      "epoch": 7.346938775510204,
      "grad_norm": 0.8374194502830505,
      "learning_rate": 5.3469387755102045e-06,
      "loss": 0.2043,
      "step": 360
    },
    {
      "epoch": 7.551020408163265,
      "grad_norm": 0.9697467684745789,
      "learning_rate": 4.938775510204082e-06,
      "loss": 0.2076,
      "step": 370
    },
    {
      "epoch": 7.755102040816326,
      "grad_norm": 1.7021687030792236,
      "learning_rate": 4.530612244897959e-06,
      "loss": 0.2111,
      "step": 380
    },
    {
      "epoch": 7.959183673469388,
      "grad_norm": 0.8718199729919434,
      "learning_rate": 4.122448979591837e-06,
      "loss": 0.188,
      "step": 390
    },
    {
      "epoch": 8.16326530612245,
      "grad_norm": 0.7752737998962402,
      "learning_rate": 3.7142857142857146e-06,
      "loss": 0.1798,
      "step": 400
    },
    {
      "epoch": 8.16326530612245,
      "eval_accuracy": 0.8125,
      "eval_loss": 0.5628954172134399,
      "eval_runtime": 1.291,
      "eval_samples_per_second": 61.967,
      "eval_steps_per_second": 7.746,
      "step": 400
    },
    {
      "epoch": 8.36734693877551,
      "grad_norm": 1.137776494026184,
      "learning_rate": 3.3061224489795924e-06,
      "loss": 0.1778,
      "step": 410
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.7163811326026917,
      "learning_rate": 2.8979591836734694e-06,
      "loss": 0.1697,
      "step": 420
    },
    {
      "epoch": 8.775510204081632,
      "grad_norm": 0.860792875289917,
      "learning_rate": 2.489795918367347e-06,
      "loss": 0.1803,
      "step": 430
    },
    {
      "epoch": 8.979591836734693,
      "grad_norm": 0.8092204928398132,
      "learning_rate": 2.0816326530612247e-06,
      "loss": 0.1792,
      "step": 440
    },
    {
      "epoch": 9.183673469387756,
      "grad_norm": 3.376288890838623,
      "learning_rate": 1.6734693877551023e-06,
      "loss": 0.1746,
      "step": 450
    },
    {
      "epoch": 9.183673469387756,
      "eval_accuracy": 0.825,
      "eval_loss": 0.5457363724708557,
      "eval_runtime": 1.2999,
      "eval_samples_per_second": 61.544,
      "eval_steps_per_second": 7.693,
      "step": 450
    },
    {
      "epoch": 9.387755102040817,
      "grad_norm": 0.6227843165397644,
      "learning_rate": 1.2653061224489795e-06,
      "loss": 0.1631,
      "step": 460
    },
    {
      "epoch": 9.591836734693878,
      "grad_norm": 0.7905530333518982,
      "learning_rate": 8.571428571428572e-07,
      "loss": 0.1603,
      "step": 470
    },
    {
      "epoch": 9.795918367346939,
      "grad_norm": 0.6496562361717224,
      "learning_rate": 4.489795918367347e-07,
      "loss": 0.1581,
      "step": 480
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.9721047282218933,
      "learning_rate": 4.0816326530612253e-08,
      "loss": 0.1594,
      "step": 490
    },
    {
      "epoch": 10.0,
      "step": 490,
      "total_flos": 6.013486186109338e+17,
      "train_loss": 0.5295110111333886,
      "train_runtime": 345.9048,
      "train_samples_per_second": 22.434,
      "train_steps_per_second": 1.417
    }
  ],
  "logging_steps": 10,
  "max_steps": 490,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.013486186109338e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}