{ "best_global_step": 450, "best_metric": 0.5457363724708557, "best_model_checkpoint": "./vit-results/checkpoint-400", "epoch": 10.0, "eval_steps": 50, "global_step": 490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.20408163265306123, "grad_norm": 1.3306224346160889, "learning_rate": 1.963265306122449e-05, "loss": 1.3374, "step": 10 }, { "epoch": 0.40816326530612246, "grad_norm": 1.4784319400787354, "learning_rate": 1.922448979591837e-05, "loss": 1.265, "step": 20 }, { "epoch": 0.6122448979591837, "grad_norm": 2.0332705974578857, "learning_rate": 1.8816326530612246e-05, "loss": 1.2081, "step": 30 }, { "epoch": 0.8163265306122449, "grad_norm": 1.4771761894226074, "learning_rate": 1.8408163265306125e-05, "loss": 1.2104, "step": 40 }, { "epoch": 1.0204081632653061, "grad_norm": 1.5638868808746338, "learning_rate": 1.8e-05, "loss": 1.1453, "step": 50 }, { "epoch": 1.0204081632653061, "eval_accuracy": 0.475, "eval_loss": 1.1232645511627197, "eval_runtime": 1.3068, "eval_samples_per_second": 61.216, "eval_steps_per_second": 7.652, "step": 50 }, { "epoch": 1.2244897959183674, "grad_norm": 2.1241655349731445, "learning_rate": 1.759183673469388e-05, "loss": 1.0284, "step": 60 }, { "epoch": 1.4285714285714286, "grad_norm": 2.0577926635742188, "learning_rate": 1.7183673469387755e-05, "loss": 1.0428, "step": 70 }, { "epoch": 1.6326530612244898, "grad_norm": 1.7612333297729492, "learning_rate": 1.6775510204081634e-05, "loss": 1.008, "step": 80 }, { "epoch": 1.836734693877551, "grad_norm": 2.2085816860198975, "learning_rate": 1.6367346938775513e-05, "loss": 1.0095, "step": 90 }, { "epoch": 2.0408163265306123, "grad_norm": 1.5611677169799805, "learning_rate": 1.595918367346939e-05, "loss": 0.9155, "step": 100 }, { "epoch": 2.0408163265306123, "eval_accuracy": 0.7, "eval_loss": 0.964668869972229, "eval_runtime": 1.2647, "eval_samples_per_second": 63.255, "eval_steps_per_second": 7.907, "step": 100 }, { "epoch": 2.2448979591836733, "grad_norm": 2.1504409313201904, "learning_rate": 1.5551020408163265e-05, "loss": 0.8396, "step": 110 }, { "epoch": 2.4489795918367347, "grad_norm": 1.9358028173446655, "learning_rate": 1.5142857142857144e-05, "loss": 0.8618, "step": 120 }, { "epoch": 2.6530612244897958, "grad_norm": 1.5132843255996704, "learning_rate": 1.4734693877551021e-05, "loss": 0.8077, "step": 130 }, { "epoch": 2.857142857142857, "grad_norm": 2.5402004718780518, "learning_rate": 1.43265306122449e-05, "loss": 0.7742, "step": 140 }, { "epoch": 3.061224489795918, "grad_norm": 2.259558916091919, "learning_rate": 1.3918367346938776e-05, "loss": 0.7638, "step": 150 }, { "epoch": 3.061224489795918, "eval_accuracy": 0.75, "eval_loss": 0.8326537013053894, "eval_runtime": 1.2728, "eval_samples_per_second": 62.853, "eval_steps_per_second": 7.857, "step": 150 }, { "epoch": 3.2653061224489797, "grad_norm": 2.010672092437744, "learning_rate": 1.3510204081632654e-05, "loss": 0.672, "step": 160 }, { "epoch": 3.4693877551020407, "grad_norm": 1.8449556827545166, "learning_rate": 1.3102040816326531e-05, "loss": 0.6367, "step": 170 }, { "epoch": 3.673469387755102, "grad_norm": 1.8838822841644287, "learning_rate": 1.2693877551020409e-05, "loss": 0.6125, "step": 180 }, { "epoch": 3.877551020408163, "grad_norm": 2.325084686279297, "learning_rate": 1.2285714285714288e-05, "loss": 0.5768, "step": 190 }, { "epoch": 4.081632653061225, "grad_norm": 2.084455966949463, "learning_rate": 1.1877551020408165e-05, "loss": 0.537, "step": 200 }, { "epoch": 4.081632653061225, "eval_accuracy": 0.7875, "eval_loss": 0.7291887402534485, "eval_runtime": 1.2754, "eval_samples_per_second": 62.726, "eval_steps_per_second": 7.841, "step": 200 }, { "epoch": 4.285714285714286, "grad_norm": 2.3336734771728516, "learning_rate": 1.146938775510204e-05, "loss": 0.5, "step": 210 }, { "epoch": 4.489795918367347, "grad_norm": 2.6175546646118164, "learning_rate": 1.1061224489795918e-05, "loss": 0.4665, "step": 220 }, { "epoch": 4.6938775510204085, "grad_norm": 1.6590876579284668, "learning_rate": 1.0653061224489796e-05, "loss": 0.448, "step": 230 }, { "epoch": 4.8979591836734695, "grad_norm": 1.8714507818222046, "learning_rate": 1.0244897959183675e-05, "loss": 0.4491, "step": 240 }, { "epoch": 5.1020408163265305, "grad_norm": 2.239349365234375, "learning_rate": 9.836734693877552e-06, "loss": 0.3957, "step": 250 }, { "epoch": 5.1020408163265305, "eval_accuracy": 0.825, "eval_loss": 0.6328426599502563, "eval_runtime": 1.2652, "eval_samples_per_second": 63.23, "eval_steps_per_second": 7.904, "step": 250 }, { "epoch": 5.3061224489795915, "grad_norm": 2.4021127223968506, "learning_rate": 9.42857142857143e-06, "loss": 0.3601, "step": 260 }, { "epoch": 5.510204081632653, "grad_norm": 1.2185922861099243, "learning_rate": 9.020408163265307e-06, "loss": 0.3431, "step": 270 }, { "epoch": 5.714285714285714, "grad_norm": 1.0699914693832397, "learning_rate": 8.612244897959184e-06, "loss": 0.3646, "step": 280 }, { "epoch": 5.918367346938775, "grad_norm": 1.5863635540008545, "learning_rate": 8.204081632653062e-06, "loss": 0.3129, "step": 290 }, { "epoch": 6.122448979591836, "grad_norm": 1.0668057203292847, "learning_rate": 7.79591836734694e-06, "loss": 0.2842, "step": 300 }, { "epoch": 6.122448979591836, "eval_accuracy": 0.7875, "eval_loss": 0.5894995927810669, "eval_runtime": 1.2447, "eval_samples_per_second": 64.272, "eval_steps_per_second": 8.034, "step": 300 }, { "epoch": 6.326530612244898, "grad_norm": 1.0372223854064941, "learning_rate": 7.387755102040817e-06, "loss": 0.2725, "step": 310 }, { "epoch": 6.530612244897959, "grad_norm": 1.855797529220581, "learning_rate": 6.979591836734695e-06, "loss": 0.2596, "step": 320 }, { "epoch": 6.73469387755102, "grad_norm": 1.3400880098342896, "learning_rate": 6.571428571428572e-06, "loss": 0.2579, "step": 330 }, { "epoch": 6.938775510204081, "grad_norm": 1.0782897472381592, "learning_rate": 6.163265306122449e-06, "loss": 0.2394, "step": 340 }, { "epoch": 7.142857142857143, "grad_norm": 1.1473782062530518, "learning_rate": 5.755102040816327e-06, "loss": 0.2266, "step": 350 }, { "epoch": 7.142857142857143, "eval_accuracy": 0.8125, "eval_loss": 0.5921751260757446, "eval_runtime": 1.2621, "eval_samples_per_second": 63.388, "eval_steps_per_second": 7.924, "step": 350 }, { "epoch": 7.346938775510204, "grad_norm": 0.8374194502830505, "learning_rate": 5.3469387755102045e-06, "loss": 0.2043, "step": 360 }, { "epoch": 7.551020408163265, "grad_norm": 0.9697467684745789, "learning_rate": 4.938775510204082e-06, "loss": 0.2076, "step": 370 }, { "epoch": 7.755102040816326, "grad_norm": 1.7021687030792236, "learning_rate": 4.530612244897959e-06, "loss": 0.2111, "step": 380 }, { "epoch": 7.959183673469388, "grad_norm": 0.8718199729919434, "learning_rate": 4.122448979591837e-06, "loss": 0.188, "step": 390 }, { "epoch": 8.16326530612245, "grad_norm": 0.7752737998962402, "learning_rate": 3.7142857142857146e-06, "loss": 0.1798, "step": 400 }, { "epoch": 8.16326530612245, "eval_accuracy": 0.8125, "eval_loss": 0.5628954172134399, "eval_runtime": 1.291, "eval_samples_per_second": 61.967, "eval_steps_per_second": 7.746, "step": 400 }, { "epoch": 8.36734693877551, "grad_norm": 1.137776494026184, "learning_rate": 3.3061224489795924e-06, "loss": 0.1778, "step": 410 }, { "epoch": 8.571428571428571, "grad_norm": 0.7163811326026917, "learning_rate": 2.8979591836734694e-06, "loss": 0.1697, "step": 420 }, { "epoch": 8.775510204081632, "grad_norm": 0.860792875289917, "learning_rate": 2.489795918367347e-06, "loss": 0.1803, "step": 430 }, { "epoch": 8.979591836734693, "grad_norm": 0.8092204928398132, "learning_rate": 2.0816326530612247e-06, "loss": 0.1792, "step": 440 }, { "epoch": 9.183673469387756, "grad_norm": 3.376288890838623, "learning_rate": 1.6734693877551023e-06, "loss": 0.1746, "step": 450 }, { "epoch": 9.183673469387756, "eval_accuracy": 0.825, "eval_loss": 0.5457363724708557, "eval_runtime": 1.2999, "eval_samples_per_second": 61.544, "eval_steps_per_second": 7.693, "step": 450 }, { "epoch": 9.387755102040817, "grad_norm": 0.6227843165397644, "learning_rate": 1.2653061224489795e-06, "loss": 0.1631, "step": 460 }, { "epoch": 9.591836734693878, "grad_norm": 0.7905530333518982, "learning_rate": 8.571428571428572e-07, "loss": 0.1603, "step": 470 }, { "epoch": 9.795918367346939, "grad_norm": 0.6496562361717224, "learning_rate": 4.489795918367347e-07, "loss": 0.1581, "step": 480 }, { "epoch": 10.0, "grad_norm": 0.9721047282218933, "learning_rate": 4.0816326530612253e-08, "loss": 0.1594, "step": 490 }, { "epoch": 10.0, "step": 490, "total_flos": 6.013486186109338e+17, "train_loss": 0.5295110111333886, "train_runtime": 345.9048, "train_samples_per_second": 22.434, "train_steps_per_second": 1.417 } ], "logging_steps": 10, "max_steps": 490, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.013486186109338e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }