{ "best_metric": 0.3908935785293579, "best_model_checkpoint": "./exper_batch_32_e4/checkpoint-1200", "epoch": 4.0, "global_step": 1280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 0.00019843750000000002, "loss": 4.9283, "step": 10 }, { "epoch": 0.06, "learning_rate": 0.000196875, "loss": 4.7121, "step": 20 }, { "epoch": 0.09, "learning_rate": 0.0001953125, "loss": 4.4878, "step": 30 }, { "epoch": 0.12, "learning_rate": 0.00019375000000000002, "loss": 4.3163, "step": 40 }, { "epoch": 0.16, "learning_rate": 0.0001921875, "loss": 4.1709, "step": 50 }, { "epoch": 0.19, "learning_rate": 0.000190625, "loss": 4.0397, "step": 60 }, { "epoch": 0.22, "learning_rate": 0.00018906250000000002, "loss": 3.8221, "step": 70 }, { "epoch": 0.25, "learning_rate": 0.0001875, "loss": 3.7326, "step": 80 }, { "epoch": 0.28, "learning_rate": 0.0001859375, "loss": 3.541, "step": 90 }, { "epoch": 0.31, "learning_rate": 0.000184375, "loss": 3.4295, "step": 100 }, { "epoch": 0.31, "eval_accuracy": 0.28373168851195063, "eval_loss": 3.4026525020599365, "eval_runtime": 43.01, "eval_samples_per_second": 60.312, "eval_steps_per_second": 7.556, "step": 100 }, { "epoch": 0.34, "learning_rate": 0.0001828125, "loss": 3.2807, "step": 110 }, { "epoch": 0.38, "learning_rate": 0.00018125000000000001, "loss": 3.2503, "step": 120 }, { "epoch": 0.41, "learning_rate": 0.0001796875, "loss": 3.0753, "step": 130 }, { "epoch": 0.44, "learning_rate": 0.000178125, "loss": 3.0426, "step": 140 }, { "epoch": 0.47, "learning_rate": 0.00017656250000000002, "loss": 2.8475, "step": 150 }, { "epoch": 0.5, "learning_rate": 0.000175, "loss": 2.8298, "step": 160 }, { "epoch": 0.53, "learning_rate": 0.0001734375, "loss": 2.7558, "step": 170 }, { "epoch": 0.56, "learning_rate": 0.00017187500000000002, "loss": 2.5849, "step": 180 }, { "epoch": 0.59, "learning_rate": 0.0001703125, "loss": 2.5335, "step": 190 }, { "epoch": 0.62, "learning_rate": 0.00016875, "loss": 2.5035, "step": 200 }, { "epoch": 0.62, "eval_accuracy": 0.5246723207401697, "eval_loss": 2.433885097503662, "eval_runtime": 43.0144, "eval_samples_per_second": 60.305, "eval_steps_per_second": 7.556, "step": 200 }, { "epoch": 0.66, "learning_rate": 0.00016718750000000002, "loss": 2.4416, "step": 210 }, { "epoch": 0.69, "learning_rate": 0.000165625, "loss": 2.2758, "step": 220 }, { "epoch": 0.72, "learning_rate": 0.0001640625, "loss": 2.2845, "step": 230 }, { "epoch": 0.75, "learning_rate": 0.00016250000000000002, "loss": 2.2053, "step": 240 }, { "epoch": 0.78, "learning_rate": 0.0001609375, "loss": 2.165, "step": 250 }, { "epoch": 0.81, "learning_rate": 0.000159375, "loss": 1.9497, "step": 260 }, { "epoch": 0.84, "learning_rate": 0.00015781250000000002, "loss": 1.9581, "step": 270 }, { "epoch": 0.88, "learning_rate": 0.00015625, "loss": 1.8753, "step": 280 }, { "epoch": 0.91, "learning_rate": 0.0001546875, "loss": 1.7976, "step": 290 }, { "epoch": 0.94, "learning_rate": 0.000153125, "loss": 1.6542, "step": 300 }, { "epoch": 0.94, "eval_accuracy": 0.6387818041634541, "eval_loss": 1.7689646482467651, "eval_runtime": 42.9608, "eval_samples_per_second": 60.381, "eval_steps_per_second": 7.565, "step": 300 }, { "epoch": 0.97, "learning_rate": 0.0001515625, "loss": 1.7968, "step": 310 }, { "epoch": 1.0, "learning_rate": 0.00015000000000000001, "loss": 1.84, "step": 320 }, { "epoch": 1.03, "learning_rate": 0.0001484375, "loss": 1.5215, "step": 330 }, { "epoch": 1.06, "learning_rate": 0.000146875, "loss": 1.3148, "step": 340 }, { "epoch": 1.09, "learning_rate": 0.00014531250000000002, "loss": 1.3781, "step": 350 }, { "epoch": 1.12, "learning_rate": 0.00014375, "loss": 1.3547, "step": 360 }, { "epoch": 1.16, "learning_rate": 0.0001421875, "loss": 1.2466, "step": 370 }, { "epoch": 1.19, "learning_rate": 0.00014062500000000002, "loss": 1.1866, "step": 380 }, { "epoch": 1.22, "learning_rate": 0.0001390625, "loss": 1.187, "step": 390 }, { "epoch": 1.25, "learning_rate": 0.0001375, "loss": 1.1589, "step": 400 }, { "epoch": 1.25, "eval_accuracy": 0.7459521973785659, "eval_loss": 1.3106426000595093, "eval_runtime": 43.0294, "eval_samples_per_second": 60.284, "eval_steps_per_second": 7.553, "step": 400 }, { "epoch": 1.28, "learning_rate": 0.00013593750000000002, "loss": 1.0901, "step": 410 }, { "epoch": 1.31, "learning_rate": 0.000134375, "loss": 1.0998, "step": 420 }, { "epoch": 1.34, "learning_rate": 0.0001328125, "loss": 1.028, "step": 430 }, { "epoch": 1.38, "learning_rate": 0.00013125000000000002, "loss": 1.1139, "step": 440 }, { "epoch": 1.41, "learning_rate": 0.0001296875, "loss": 0.9936, "step": 450 }, { "epoch": 1.44, "learning_rate": 0.000128125, "loss": 0.9392, "step": 460 }, { "epoch": 1.47, "learning_rate": 0.0001265625, "loss": 0.8785, "step": 470 }, { "epoch": 1.5, "learning_rate": 0.000125, "loss": 0.8851, "step": 480 }, { "epoch": 1.53, "learning_rate": 0.0001234375, "loss": 0.8302, "step": 490 }, { "epoch": 1.56, "learning_rate": 0.00012187500000000001, "loss": 0.9363, "step": 500 }, { "epoch": 1.56, "eval_accuracy": 0.7802621434078643, "eval_loss": 0.997726321220398, "eval_runtime": 42.8785, "eval_samples_per_second": 60.496, "eval_steps_per_second": 7.58, "step": 500 }, { "epoch": 1.59, "learning_rate": 0.0001203125, "loss": 0.826, "step": 510 }, { "epoch": 1.62, "learning_rate": 0.00011875, "loss": 0.8231, "step": 520 }, { "epoch": 1.66, "learning_rate": 0.00011718750000000001, "loss": 0.8144, "step": 530 }, { "epoch": 1.69, "learning_rate": 0.000115625, "loss": 0.8248, "step": 540 }, { "epoch": 1.72, "learning_rate": 0.0001140625, "loss": 0.8278, "step": 550 }, { "epoch": 1.75, "learning_rate": 0.00011250000000000001, "loss": 0.8477, "step": 560 }, { "epoch": 1.78, "learning_rate": 0.0001109375, "loss": 0.714, "step": 570 }, { "epoch": 1.81, "learning_rate": 0.000109375, "loss": 0.7819, "step": 580 }, { "epoch": 1.84, "learning_rate": 0.00010781250000000001, "loss": 0.7466, "step": 590 }, { "epoch": 1.88, "learning_rate": 0.00010625000000000001, "loss": 0.6946, "step": 600 }, { "epoch": 1.88, "eval_accuracy": 0.8207401696222051, "eval_loss": 0.8138005137443542, "eval_runtime": 42.9052, "eval_samples_per_second": 60.459, "eval_steps_per_second": 7.575, "step": 600 }, { "epoch": 1.91, "learning_rate": 0.0001046875, "loss": 0.699, "step": 610 }, { "epoch": 1.94, "learning_rate": 0.000103125, "loss": 0.5999, "step": 620 }, { "epoch": 1.97, "learning_rate": 0.00010156250000000001, "loss": 0.5976, "step": 630 }, { "epoch": 2.0, "learning_rate": 0.0001, "loss": 0.5776, "step": 640 }, { "epoch": 2.03, "learning_rate": 9.84375e-05, "loss": 0.4293, "step": 650 }, { "epoch": 2.06, "learning_rate": 9.687500000000001e-05, "loss": 0.4565, "step": 660 }, { "epoch": 2.09, "learning_rate": 9.53125e-05, "loss": 0.4259, "step": 670 }, { "epoch": 2.12, "learning_rate": 9.375e-05, "loss": 0.3683, "step": 680 }, { "epoch": 2.16, "learning_rate": 9.21875e-05, "loss": 0.4605, "step": 690 }, { "epoch": 2.19, "learning_rate": 9.062500000000001e-05, "loss": 0.3488, "step": 700 }, { "epoch": 2.19, "eval_accuracy": 0.848882035466461, "eval_loss": 0.6592601537704468, "eval_runtime": 42.672, "eval_samples_per_second": 60.789, "eval_steps_per_second": 7.616, "step": 700 }, { "epoch": 2.22, "learning_rate": 8.90625e-05, "loss": 0.4594, "step": 710 }, { "epoch": 2.25, "learning_rate": 8.75e-05, "loss": 0.3706, "step": 720 }, { "epoch": 2.28, "learning_rate": 8.593750000000001e-05, "loss": 0.3855, "step": 730 }, { "epoch": 2.31, "learning_rate": 8.4375e-05, "loss": 0.4005, "step": 740 }, { "epoch": 2.34, "learning_rate": 8.28125e-05, "loss": 0.3648, "step": 750 }, { "epoch": 2.38, "learning_rate": 8.125000000000001e-05, "loss": 0.3759, "step": 760 }, { "epoch": 2.41, "learning_rate": 7.96875e-05, "loss": 0.3622, "step": 770 }, { "epoch": 2.44, "learning_rate": 7.8125e-05, "loss": 0.3209, "step": 780 }, { "epoch": 2.47, "learning_rate": 7.65625e-05, "loss": 0.2794, "step": 790 }, { "epoch": 2.5, "learning_rate": 7.500000000000001e-05, "loss": 0.2935, "step": 800 }, { "epoch": 2.5, "eval_accuracy": 0.8662297609868929, "eval_loss": 0.5725119113922119, "eval_runtime": 42.6907, "eval_samples_per_second": 60.763, "eval_steps_per_second": 7.613, "step": 800 }, { "epoch": 2.53, "learning_rate": 7.34375e-05, "loss": 0.3629, "step": 810 }, { "epoch": 2.56, "learning_rate": 7.1875e-05, "loss": 0.3006, "step": 820 }, { "epoch": 2.59, "learning_rate": 7.031250000000001e-05, "loss": 0.2876, "step": 830 }, { "epoch": 2.62, "learning_rate": 6.875e-05, "loss": 0.2612, "step": 840 }, { "epoch": 2.66, "learning_rate": 6.71875e-05, "loss": 0.3033, "step": 850 }, { "epoch": 2.69, "learning_rate": 6.562500000000001e-05, "loss": 0.2857, "step": 860 }, { "epoch": 2.72, "learning_rate": 6.40625e-05, "loss": 0.2877, "step": 870 }, { "epoch": 2.75, "learning_rate": 6.25e-05, "loss": 0.2429, "step": 880 }, { "epoch": 2.78, "learning_rate": 6.0937500000000004e-05, "loss": 0.2133, "step": 890 }, { "epoch": 2.81, "learning_rate": 5.9375e-05, "loss": 0.2557, "step": 900 }, { "epoch": 2.81, "eval_accuracy": 0.8855050115651504, "eval_loss": 0.5088278651237488, "eval_runtime": 43.0972, "eval_samples_per_second": 60.189, "eval_steps_per_second": 7.541, "step": 900 }, { "epoch": 2.84, "learning_rate": 5.78125e-05, "loss": 0.3011, "step": 910 }, { "epoch": 2.88, "learning_rate": 5.6250000000000005e-05, "loss": 0.2739, "step": 920 }, { "epoch": 2.91, "learning_rate": 5.46875e-05, "loss": 0.215, "step": 930 }, { "epoch": 2.94, "learning_rate": 5.3125000000000004e-05, "loss": 0.2399, "step": 940 }, { "epoch": 2.97, "learning_rate": 5.15625e-05, "loss": 0.2356, "step": 950 }, { "epoch": 3.0, "learning_rate": 5e-05, "loss": 0.2304, "step": 960 }, { "epoch": 3.03, "learning_rate": 4.8437500000000005e-05, "loss": 0.21, "step": 970 }, { "epoch": 3.06, "learning_rate": 4.6875e-05, "loss": 0.1801, "step": 980 }, { "epoch": 3.09, "learning_rate": 4.5312500000000004e-05, "loss": 0.12, "step": 990 }, { "epoch": 3.12, "learning_rate": 4.375e-05, "loss": 0.1509, "step": 1000 }, { "epoch": 3.12, "eval_accuracy": 0.8970701619121049, "eval_loss": 0.4571980834007263, "eval_runtime": 42.5823, "eval_samples_per_second": 60.917, "eval_steps_per_second": 7.632, "step": 1000 }, { "epoch": 3.16, "learning_rate": 4.21875e-05, "loss": 0.1448, "step": 1010 }, { "epoch": 3.19, "learning_rate": 4.0625000000000005e-05, "loss": 0.1272, "step": 1020 }, { "epoch": 3.22, "learning_rate": 3.90625e-05, "loss": 0.1215, "step": 1030 }, { "epoch": 3.25, "learning_rate": 3.7500000000000003e-05, "loss": 0.1236, "step": 1040 }, { "epoch": 3.28, "learning_rate": 3.59375e-05, "loss": 0.1364, "step": 1050 }, { "epoch": 3.31, "learning_rate": 3.4375e-05, "loss": 0.1074, "step": 1060 }, { "epoch": 3.34, "learning_rate": 3.2812500000000005e-05, "loss": 0.1383, "step": 1070 }, { "epoch": 3.38, "learning_rate": 3.125e-05, "loss": 0.1304, "step": 1080 }, { "epoch": 3.41, "learning_rate": 2.96875e-05, "loss": 0.1431, "step": 1090 }, { "epoch": 3.44, "learning_rate": 2.8125000000000003e-05, "loss": 0.1367, "step": 1100 }, { "epoch": 3.44, "eval_accuracy": 0.9090208172706246, "eval_loss": 0.4128749370574951, "eval_runtime": 42.4657, "eval_samples_per_second": 61.085, "eval_steps_per_second": 7.653, "step": 1100 }, { "epoch": 3.47, "learning_rate": 2.6562500000000002e-05, "loss": 0.1202, "step": 1110 }, { "epoch": 3.5, "learning_rate": 2.5e-05, "loss": 0.1309, "step": 1120 }, { "epoch": 3.53, "learning_rate": 2.34375e-05, "loss": 0.1081, "step": 1130 }, { "epoch": 3.56, "learning_rate": 2.1875e-05, "loss": 0.144, "step": 1140 }, { "epoch": 3.59, "learning_rate": 2.0312500000000002e-05, "loss": 0.1104, "step": 1150 }, { "epoch": 3.62, "learning_rate": 1.8750000000000002e-05, "loss": 0.0947, "step": 1160 }, { "epoch": 3.66, "learning_rate": 1.71875e-05, "loss": 0.1008, "step": 1170 }, { "epoch": 3.69, "learning_rate": 1.5625e-05, "loss": 0.1177, "step": 1180 }, { "epoch": 3.72, "learning_rate": 1.4062500000000001e-05, "loss": 0.1347, "step": 1190 }, { "epoch": 3.75, "learning_rate": 1.25e-05, "loss": 0.1078, "step": 1200 }, { "epoch": 3.75, "eval_accuracy": 0.9067077872012336, "eval_loss": 0.3908935785293579, "eval_runtime": 42.3296, "eval_samples_per_second": 61.281, "eval_steps_per_second": 7.678, "step": 1200 }, { "epoch": 3.78, "learning_rate": 1.09375e-05, "loss": 0.1007, "step": 1210 }, { "epoch": 3.81, "learning_rate": 9.375000000000001e-06, "loss": 0.1205, "step": 1220 }, { "epoch": 3.84, "learning_rate": 7.8125e-06, "loss": 0.1124, "step": 1230 }, { "epoch": 3.88, "learning_rate": 6.25e-06, "loss": 0.1064, "step": 1240 }, { "epoch": 3.91, "learning_rate": 4.6875000000000004e-06, "loss": 0.1117, "step": 1250 }, { "epoch": 3.94, "learning_rate": 3.125e-06, "loss": 0.1043, "step": 1260 }, { "epoch": 3.97, "learning_rate": 1.5625e-06, "loss": 0.1033, "step": 1270 }, { "epoch": 4.0, "learning_rate": 0.0, "loss": 0.1101, "step": 1280 }, { "epoch": 4.0, "step": 1280, "total_flos": 3.1689424899978854e+18, "train_loss": 1.0863585265353322, "train_runtime": 1526.0713, "train_samples_per_second": 26.762, "train_steps_per_second": 0.839 } ], "max_steps": 1280, "num_train_epochs": 4, "total_flos": 3.1689424899978854e+18, "trial_name": null, "trial_params": null }