{ "best_metric": 0.0003763487620744854, "best_model_checkpoint": "./vit-base-fruit-punch/checkpoint-1000", "epoch": 8.0, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 100187.125, "learning_rate": 4.9500000000000004e-05, "loss": 1.127, "step": 10 }, { "epoch": 0.16, "grad_norm": 96563.390625, "learning_rate": 4.9e-05, "loss": 0.7007, "step": 20 }, { "epoch": 0.24, "grad_norm": 64723.85546875, "learning_rate": 4.85e-05, "loss": 0.4167, "step": 30 }, { "epoch": 0.32, "grad_norm": 40675.4140625, "learning_rate": 4.8e-05, "loss": 0.2602, "step": 40 }, { "epoch": 0.4, "grad_norm": 28872.85546875, "learning_rate": 4.75e-05, "loss": 0.16, "step": 50 }, { "epoch": 0.48, "grad_norm": 26935.421875, "learning_rate": 4.7e-05, "loss": 0.1115, "step": 60 }, { "epoch": 0.56, "grad_norm": 19970.119140625, "learning_rate": 4.6500000000000005e-05, "loss": 0.0944, "step": 70 }, { "epoch": 0.64, "grad_norm": 19774.392578125, "learning_rate": 4.600000000000001e-05, "loss": 0.0761, "step": 80 }, { "epoch": 0.72, "grad_norm": 14233.1318359375, "learning_rate": 4.55e-05, "loss": 0.0588, "step": 90 }, { "epoch": 0.8, "grad_norm": 12306.6767578125, "learning_rate": 4.5e-05, "loss": 0.0488, "step": 100 }, { "epoch": 0.8, "eval_accuracy": 1.0, "eval_loss": 0.04701722040772438, "eval_runtime": 12.891, "eval_samples_per_second": 77.574, "eval_steps_per_second": 4.887, "step": 100 }, { "epoch": 0.88, "grad_norm": 11777.6708984375, "learning_rate": 4.4500000000000004e-05, "loss": 0.0429, "step": 110 }, { "epoch": 0.96, "grad_norm": 10813.1064453125, "learning_rate": 4.4000000000000006e-05, "loss": 0.0458, "step": 120 }, { "epoch": 1.04, "grad_norm": 8880.1279296875, "learning_rate": 4.35e-05, "loss": 0.0384, "step": 130 }, { "epoch": 1.12, "grad_norm": 8182.60107421875, "learning_rate": 4.3e-05, "loss": 0.0309, "step": 140 }, { "epoch": 1.2, "grad_norm": 7128.07275390625, "learning_rate": 4.25e-05, "loss": 0.0281, "step": 150 }, { "epoch": 1.28, "grad_norm": 6803.498046875, "learning_rate": 4.2e-05, "loss": 0.0254, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 6198.88037109375, "learning_rate": 4.15e-05, "loss": 0.0232, "step": 170 }, { "epoch": 1.44, "grad_norm": 5394.99072265625, "learning_rate": 4.1e-05, "loss": 0.021, "step": 180 }, { "epoch": 1.52, "grad_norm": 5170.45458984375, "learning_rate": 4.05e-05, "loss": 0.0191, "step": 190 }, { "epoch": 1.6, "grad_norm": 4661.20263671875, "learning_rate": 4e-05, "loss": 0.0174, "step": 200 }, { "epoch": 1.6, "eval_accuracy": 1.0, "eval_loss": 0.017339378595352173, "eval_runtime": 13.5483, "eval_samples_per_second": 73.81, "eval_steps_per_second": 4.65, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 4417.46337890625, "learning_rate": 3.9500000000000005e-05, "loss": 0.016, "step": 210 }, { "epoch": 1.76, "grad_norm": 3967.432861328125, "learning_rate": 3.9000000000000006e-05, "loss": 0.0146, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 3568.837646484375, "learning_rate": 3.85e-05, "loss": 0.0134, "step": 230 }, { "epoch": 1.92, "grad_norm": 3472.84716796875, "learning_rate": 3.8e-05, "loss": 0.0123, "step": 240 }, { "epoch": 2.0, "grad_norm": 3190.490966796875, "learning_rate": 3.7500000000000003e-05, "loss": 0.0113, "step": 250 }, { "epoch": 2.08, "grad_norm": 2825.7802734375, "learning_rate": 3.7e-05, "loss": 0.0104, "step": 260 }, { "epoch": 2.16, "grad_norm": 2605.496337890625, "learning_rate": 3.65e-05, "loss": 0.0095, "step": 270 }, { "epoch": 2.24, "grad_norm": 2393.7314453125, "learning_rate": 3.6e-05, "loss": 0.0088, "step": 280 }, { "epoch": 2.32, "grad_norm": 2354.353515625, "learning_rate": 3.55e-05, "loss": 0.0081, "step": 290 }, { "epoch": 2.4, "grad_norm": 2027.639404296875, "learning_rate": 3.5e-05, "loss": 0.0074, "step": 300 }, { "epoch": 2.4, "eval_accuracy": 1.0, "eval_loss": 0.007476483471691608, "eval_runtime": 13.6387, "eval_samples_per_second": 73.321, "eval_steps_per_second": 4.619, "step": 300 }, { "epoch": 2.48, "grad_norm": 1890.090087890625, "learning_rate": 3.45e-05, "loss": 0.0068, "step": 310 }, { "epoch": 2.56, "grad_norm": 1713.1453857421875, "learning_rate": 3.4000000000000007e-05, "loss": 0.0063, "step": 320 }, { "epoch": 2.64, "grad_norm": 1611.7166748046875, "learning_rate": 3.35e-05, "loss": 0.0059, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 1491.3282470703125, "learning_rate": 3.3e-05, "loss": 0.0054, "step": 340 }, { "epoch": 2.8, "grad_norm": 1385.7913818359375, "learning_rate": 3.2500000000000004e-05, "loss": 0.005, "step": 350 }, { "epoch": 2.88, "grad_norm": 1317.3277587890625, "learning_rate": 3.2000000000000005e-05, "loss": 0.0047, "step": 360 }, { "epoch": 2.96, "grad_norm": 1197.3973388671875, "learning_rate": 3.15e-05, "loss": 0.0043, "step": 370 }, { "epoch": 3.04, "grad_norm": 1110.10693359375, "learning_rate": 3.1e-05, "loss": 0.004, "step": 380 }, { "epoch": 3.12, "grad_norm": 1046.7801513671875, "learning_rate": 3.05e-05, "loss": 0.0037, "step": 390 }, { "epoch": 3.2, "grad_norm": 958.0781860351562, "learning_rate": 3e-05, "loss": 0.0035, "step": 400 }, { "epoch": 3.2, "eval_accuracy": 1.0, "eval_loss": 0.0034720886033028364, "eval_runtime": 13.6477, "eval_samples_per_second": 73.272, "eval_steps_per_second": 4.616, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 986.9517822265625, "learning_rate": 2.95e-05, "loss": 0.0032, "step": 410 }, { "epoch": 3.36, "grad_norm": 841.5371704101562, "learning_rate": 2.9e-05, "loss": 0.003, "step": 420 }, { "epoch": 3.44, "grad_norm": 797.5939331054688, "learning_rate": 2.8499999999999998e-05, "loss": 0.0028, "step": 430 }, { "epoch": 3.52, "grad_norm": 735.5321655273438, "learning_rate": 2.8000000000000003e-05, "loss": 0.0026, "step": 440 }, { "epoch": 3.6, "grad_norm": 701.638427734375, "learning_rate": 2.7500000000000004e-05, "loss": 0.0024, "step": 450 }, { "epoch": 3.68, "grad_norm": 647.36279296875, "learning_rate": 2.7000000000000002e-05, "loss": 0.0023, "step": 460 }, { "epoch": 3.76, "grad_norm": 599.30126953125, "learning_rate": 2.6500000000000004e-05, "loss": 0.0021, "step": 470 }, { "epoch": 3.84, "grad_norm": 590.8321533203125, "learning_rate": 2.6000000000000002e-05, "loss": 0.002, "step": 480 }, { "epoch": 3.92, "grad_norm": 546.0530395507812, "learning_rate": 2.5500000000000003e-05, "loss": 0.0019, "step": 490 }, { "epoch": 4.0, "grad_norm": 500.25738525390625, "learning_rate": 2.5e-05, "loss": 0.0018, "step": 500 }, { "epoch": 4.0, "eval_accuracy": 1.0, "eval_loss": 0.0022775332909077406, "eval_runtime": 13.3322, "eval_samples_per_second": 75.006, "eval_steps_per_second": 4.725, "step": 500 }, { "epoch": 4.08, "grad_norm": 494.0545349121094, "learning_rate": 2.45e-05, "loss": 0.0017, "step": 510 }, { "epoch": 4.16, "grad_norm": 452.84375, "learning_rate": 2.4e-05, "loss": 0.0016, "step": 520 }, { "epoch": 4.24, "grad_norm": 434.6565246582031, "learning_rate": 2.35e-05, "loss": 0.0015, "step": 530 }, { "epoch": 4.32, "grad_norm": 404.3072204589844, "learning_rate": 2.3000000000000003e-05, "loss": 0.0014, "step": 540 }, { "epoch": 4.4, "grad_norm": 389.670166015625, "learning_rate": 2.25e-05, "loss": 0.0013, "step": 550 }, { "epoch": 4.48, "grad_norm": 387.8797607421875, "learning_rate": 2.2000000000000003e-05, "loss": 0.0012, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 357.6133728027344, "learning_rate": 2.15e-05, "loss": 0.0012, "step": 570 }, { "epoch": 4.64, "grad_norm": 320.7620544433594, "learning_rate": 2.1e-05, "loss": 0.0011, "step": 580 }, { "epoch": 4.72, "grad_norm": 309.20062255859375, "learning_rate": 2.05e-05, "loss": 0.001, "step": 590 }, { "epoch": 4.8, "grad_norm": 292.0805358886719, "learning_rate": 2e-05, "loss": 0.001, "step": 600 }, { "epoch": 4.8, "eval_accuracy": 1.0, "eval_loss": 0.001073041232302785, "eval_runtime": 13.7046, "eval_samples_per_second": 72.968, "eval_steps_per_second": 4.597, "step": 600 }, { "epoch": 4.88, "grad_norm": 283.4959411621094, "learning_rate": 1.9500000000000003e-05, "loss": 0.0009, "step": 610 }, { "epoch": 4.96, "grad_norm": 261.8572998046875, "learning_rate": 1.9e-05, "loss": 0.0009, "step": 620 }, { "epoch": 5.04, "grad_norm": 252.91981506347656, "learning_rate": 1.85e-05, "loss": 0.0009, "step": 630 }, { "epoch": 5.12, "grad_norm": 239.09896850585938, "learning_rate": 1.8e-05, "loss": 0.0008, "step": 640 }, { "epoch": 5.2, "grad_norm": 232.7013397216797, "learning_rate": 1.75e-05, "loss": 0.0008, "step": 650 }, { "epoch": 5.28, "grad_norm": 220.06301879882812, "learning_rate": 1.7000000000000003e-05, "loss": 0.0007, "step": 660 }, { "epoch": 5.36, "grad_norm": 219.54986572265625, "learning_rate": 1.65e-05, "loss": 0.0007, "step": 670 }, { "epoch": 5.44, "grad_norm": 206.40716552734375, "learning_rate": 1.6000000000000003e-05, "loss": 0.0007, "step": 680 }, { "epoch": 5.52, "grad_norm": 196.642578125, "learning_rate": 1.55e-05, "loss": 0.0007, "step": 690 }, { "epoch": 5.6, "grad_norm": 187.69554138183594, "learning_rate": 1.5e-05, "loss": 0.0006, "step": 700 }, { "epoch": 5.6, "eval_accuracy": 1.0, "eval_loss": 0.0006605549133382738, "eval_runtime": 13.6953, "eval_samples_per_second": 73.018, "eval_steps_per_second": 4.6, "step": 700 }, { "epoch": 5.68, "grad_norm": 183.08045959472656, "learning_rate": 1.45e-05, "loss": 0.0006, "step": 710 }, { "epoch": 5.76, "grad_norm": 174.93222045898438, "learning_rate": 1.4000000000000001e-05, "loss": 0.0006, "step": 720 }, { "epoch": 5.84, "grad_norm": 177.07530212402344, "learning_rate": 1.3500000000000001e-05, "loss": 0.0006, "step": 730 }, { "epoch": 5.92, "grad_norm": 166.14947509765625, "learning_rate": 1.3000000000000001e-05, "loss": 0.0005, "step": 740 }, { "epoch": 6.0, "grad_norm": 165.67318725585938, "learning_rate": 1.25e-05, "loss": 0.0005, "step": 750 }, { "epoch": 6.08, "grad_norm": 158.77545166015625, "learning_rate": 1.2e-05, "loss": 0.0005, "step": 760 }, { "epoch": 6.16, "grad_norm": 149.71511840820312, "learning_rate": 1.1500000000000002e-05, "loss": 0.0005, "step": 770 }, { "epoch": 6.24, "grad_norm": 144.20770263671875, "learning_rate": 1.1000000000000001e-05, "loss": 0.0005, "step": 780 }, { "epoch": 6.32, "grad_norm": 148.6312255859375, "learning_rate": 1.05e-05, "loss": 0.0005, "step": 790 }, { "epoch": 6.4, "grad_norm": 135.4142303466797, "learning_rate": 1e-05, "loss": 0.0005, "step": 800 }, { "epoch": 6.4, "eval_accuracy": 1.0, "eval_loss": 0.0004884201916866004, "eval_runtime": 13.5521, "eval_samples_per_second": 73.79, "eval_steps_per_second": 4.649, "step": 800 }, { "epoch": 6.48, "grad_norm": 133.83642578125, "learning_rate": 9.5e-06, "loss": 0.0004, "step": 810 }, { "epoch": 6.5600000000000005, "grad_norm": 138.82203674316406, "learning_rate": 9e-06, "loss": 0.0004, "step": 820 }, { "epoch": 6.64, "grad_norm": 127.43915557861328, "learning_rate": 8.500000000000002e-06, "loss": 0.0004, "step": 830 }, { "epoch": 6.72, "grad_norm": 126.12251281738281, "learning_rate": 8.000000000000001e-06, "loss": 0.0004, "step": 840 }, { "epoch": 6.8, "grad_norm": 121.66053771972656, "learning_rate": 7.5e-06, "loss": 0.0004, "step": 850 }, { "epoch": 6.88, "grad_norm": 121.25574493408203, "learning_rate": 7.000000000000001e-06, "loss": 0.0004, "step": 860 }, { "epoch": 6.96, "grad_norm": 126.3290023803711, "learning_rate": 6.5000000000000004e-06, "loss": 0.0004, "step": 870 }, { "epoch": 7.04, "grad_norm": 117.99575805664062, "learning_rate": 6e-06, "loss": 0.0004, "step": 880 }, { "epoch": 7.12, "grad_norm": 116.10645294189453, "learning_rate": 5.500000000000001e-06, "loss": 0.0004, "step": 890 }, { "epoch": 7.2, "grad_norm": 113.11275482177734, "learning_rate": 5e-06, "loss": 0.0004, "step": 900 }, { "epoch": 7.2, "eval_accuracy": 1.0, "eval_loss": 0.0003921452153008431, "eval_runtime": 13.7013, "eval_samples_per_second": 72.986, "eval_steps_per_second": 4.598, "step": 900 }, { "epoch": 7.28, "grad_norm": 114.9211196899414, "learning_rate": 4.5e-06, "loss": 0.0004, "step": 910 }, { "epoch": 7.36, "grad_norm": 110.5498046875, "learning_rate": 4.000000000000001e-06, "loss": 0.0004, "step": 920 }, { "epoch": 7.44, "grad_norm": 110.70841979980469, "learning_rate": 3.5000000000000004e-06, "loss": 0.0004, "step": 930 }, { "epoch": 7.52, "grad_norm": 115.6305160522461, "learning_rate": 3e-06, "loss": 0.0004, "step": 940 }, { "epoch": 7.6, "grad_norm": 106.5681381225586, "learning_rate": 2.5e-06, "loss": 0.0004, "step": 950 }, { "epoch": 7.68, "grad_norm": 109.81066131591797, "learning_rate": 2.0000000000000003e-06, "loss": 0.0003, "step": 960 }, { "epoch": 7.76, "grad_norm": 107.74824523925781, "learning_rate": 1.5e-06, "loss": 0.0003, "step": 970 }, { "epoch": 7.84, "grad_norm": 108.34854888916016, "learning_rate": 1.0000000000000002e-06, "loss": 0.0003, "step": 980 }, { "epoch": 7.92, "grad_norm": 107.01416778564453, "learning_rate": 5.000000000000001e-07, "loss": 0.0003, "step": 990 }, { "epoch": 8.0, "grad_norm": 111.06143188476562, "learning_rate": 0.0, "loss": 0.0003, "step": 1000 }, { "epoch": 8.0, "eval_accuracy": 1.0, "eval_loss": 0.0003763487620744854, "eval_runtime": 13.1171, "eval_samples_per_second": 76.236, "eval_steps_per_second": 4.803, "step": 1000 }, { "epoch": 8.0, "step": 1000, "total_flos": 2.479168170953736e+18, "train_loss": 0.03565365221118554, "train_runtime": 1044.6113, "train_samples_per_second": 30.626, "train_steps_per_second": 0.957 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.479168170953736e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }