{ "best_metric": 0.3000366985797882, "best_model_checkpoint": "./new_exper3/checkpoint-4200", "epoch": 8.0, "global_step": 5112, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 9.980438184663537e-05, "loss": 4.9498, "step": 10 }, { "epoch": 0.03, "learning_rate": 9.960876369327074e-05, "loss": 4.8556, "step": 20 }, { "epoch": 0.05, "learning_rate": 9.94131455399061e-05, "loss": 4.7389, "step": 30 }, { "epoch": 0.06, "learning_rate": 9.921752738654147e-05, "loss": 4.6112, "step": 40 }, { "epoch": 0.08, "learning_rate": 9.902190923317684e-05, "loss": 4.54, "step": 50 }, { "epoch": 0.09, "learning_rate": 9.882629107981222e-05, "loss": 4.448, "step": 60 }, { "epoch": 0.11, "learning_rate": 9.863067292644758e-05, "loss": 4.3441, "step": 70 }, { "epoch": 0.13, "learning_rate": 9.843505477308295e-05, "loss": 4.2899, "step": 80 }, { "epoch": 0.14, "learning_rate": 9.823943661971832e-05, "loss": 4.2878, "step": 90 }, { "epoch": 0.16, "learning_rate": 9.804381846635369e-05, "loss": 4.093, "step": 100 }, { "epoch": 0.16, "eval_accuracy": 0.18851195065535853, "eval_loss": 4.104459285736084, "eval_runtime": 43.0936, "eval_samples_per_second": 60.195, "eval_steps_per_second": 7.542, "step": 100 }, { "epoch": 0.17, "learning_rate": 9.784820031298904e-05, "loss": 4.1228, "step": 110 }, { "epoch": 0.19, "learning_rate": 9.765258215962441e-05, "loss": 4.0244, "step": 120 }, { "epoch": 0.2, "learning_rate": 9.745696400625978e-05, "loss": 3.9376, "step": 130 }, { "epoch": 0.22, "learning_rate": 9.726134585289515e-05, "loss": 3.7948, "step": 140 }, { "epoch": 0.23, "learning_rate": 9.706572769953052e-05, "loss": 3.8001, "step": 150 }, { "epoch": 0.25, "learning_rate": 9.687010954616589e-05, "loss": 3.7539, "step": 160 }, { "epoch": 0.27, "learning_rate": 9.667449139280126e-05, "loss": 3.6066, "step": 170 }, { "epoch": 0.28, "learning_rate": 9.647887323943663e-05, "loss": 3.5595, "step": 180 }, { "epoch": 0.3, "learning_rate": 9.628325508607199e-05, "loss": 3.4972, "step": 190 }, { "epoch": 0.31, "learning_rate": 9.608763693270736e-05, "loss": 3.5057, "step": 200 }, { "epoch": 0.31, "eval_accuracy": 0.323053199691596, "eval_loss": 3.444770336151123, "eval_runtime": 43.0641, "eval_samples_per_second": 60.236, "eval_steps_per_second": 7.547, "step": 200 }, { "epoch": 0.33, "learning_rate": 9.589201877934273e-05, "loss": 3.3105, "step": 210 }, { "epoch": 0.34, "learning_rate": 9.56964006259781e-05, "loss": 3.4108, "step": 220 }, { "epoch": 0.36, "learning_rate": 9.550078247261345e-05, "loss": 3.4285, "step": 230 }, { "epoch": 0.38, "learning_rate": 9.530516431924882e-05, "loss": 3.213, "step": 240 }, { "epoch": 0.39, "learning_rate": 9.510954616588421e-05, "loss": 3.1473, "step": 250 }, { "epoch": 0.41, "learning_rate": 9.491392801251958e-05, "loss": 3.1898, "step": 260 }, { "epoch": 0.42, "learning_rate": 9.471830985915493e-05, "loss": 3.1523, "step": 270 }, { "epoch": 0.44, "learning_rate": 9.45226917057903e-05, "loss": 3.186, "step": 280 }, { "epoch": 0.45, "learning_rate": 9.432707355242567e-05, "loss": 2.9603, "step": 290 }, { "epoch": 0.47, "learning_rate": 9.413145539906104e-05, "loss": 2.9116, "step": 300 }, { "epoch": 0.47, "eval_accuracy": 0.45373939861218193, "eval_loss": 2.9483001232147217, "eval_runtime": 42.4599, "eval_samples_per_second": 61.093, "eval_steps_per_second": 7.654, "step": 300 }, { "epoch": 0.49, "learning_rate": 9.39358372456964e-05, "loss": 3.0387, "step": 310 }, { "epoch": 0.5, "learning_rate": 9.374021909233177e-05, "loss": 2.8818, "step": 320 }, { "epoch": 0.52, "learning_rate": 9.354460093896714e-05, "loss": 2.9362, "step": 330 }, { "epoch": 0.53, "learning_rate": 9.334898278560251e-05, "loss": 2.9021, "step": 340 }, { "epoch": 0.55, "learning_rate": 9.315336463223788e-05, "loss": 2.7243, "step": 350 }, { "epoch": 0.56, "learning_rate": 9.295774647887325e-05, "loss": 2.6124, "step": 360 }, { "epoch": 0.58, "learning_rate": 9.276212832550862e-05, "loss": 2.6288, "step": 370 }, { "epoch": 0.59, "learning_rate": 9.256651017214399e-05, "loss": 2.6455, "step": 380 }, { "epoch": 0.61, "learning_rate": 9.237089201877934e-05, "loss": 2.6136, "step": 390 }, { "epoch": 0.63, "learning_rate": 9.217527386541471e-05, "loss": 2.561, "step": 400 }, { "epoch": 0.63, "eval_accuracy": 0.525828835774865, "eval_loss": 2.5700132846832275, "eval_runtime": 42.7904, "eval_samples_per_second": 60.621, "eval_steps_per_second": 7.595, "step": 400 }, { "epoch": 0.64, "learning_rate": 9.197965571205008e-05, "loss": 2.5745, "step": 410 }, { "epoch": 0.66, "learning_rate": 9.178403755868545e-05, "loss": 2.5422, "step": 420 }, { "epoch": 0.67, "learning_rate": 9.158841940532081e-05, "loss": 2.4631, "step": 430 }, { "epoch": 0.69, "learning_rate": 9.139280125195618e-05, "loss": 2.3095, "step": 440 }, { "epoch": 0.7, "learning_rate": 9.119718309859156e-05, "loss": 2.3756, "step": 450 }, { "epoch": 0.72, "learning_rate": 9.100156494522693e-05, "loss": 2.485, "step": 460 }, { "epoch": 0.74, "learning_rate": 9.080594679186229e-05, "loss": 2.3036, "step": 470 }, { "epoch": 0.75, "learning_rate": 9.061032863849766e-05, "loss": 2.2608, "step": 480 }, { "epoch": 0.77, "learning_rate": 9.041471048513303e-05, "loss": 2.296, "step": 490 }, { "epoch": 0.78, "learning_rate": 9.02190923317684e-05, "loss": 2.1611, "step": 500 }, { "epoch": 0.78, "eval_accuracy": 0.6144949884348496, "eval_loss": 2.172065258026123, "eval_runtime": 42.9818, "eval_samples_per_second": 60.351, "eval_steps_per_second": 7.561, "step": 500 }, { "epoch": 0.8, "learning_rate": 9.002347417840375e-05, "loss": 2.0372, "step": 510 }, { "epoch": 0.81, "learning_rate": 8.982785602503912e-05, "loss": 2.0652, "step": 520 }, { "epoch": 0.83, "learning_rate": 8.963223787167449e-05, "loss": 2.1529, "step": 530 }, { "epoch": 0.85, "learning_rate": 8.943661971830986e-05, "loss": 1.9779, "step": 540 }, { "epoch": 0.86, "learning_rate": 8.924100156494523e-05, "loss": 1.9492, "step": 550 }, { "epoch": 0.88, "learning_rate": 8.90453834115806e-05, "loss": 1.9624, "step": 560 }, { "epoch": 0.89, "learning_rate": 8.884976525821597e-05, "loss": 1.8139, "step": 570 }, { "epoch": 0.91, "learning_rate": 8.865414710485134e-05, "loss": 1.9124, "step": 580 }, { "epoch": 0.92, "learning_rate": 8.845852895148671e-05, "loss": 1.7514, "step": 590 }, { "epoch": 0.94, "learning_rate": 8.826291079812207e-05, "loss": 1.715, "step": 600 }, { "epoch": 0.94, "eval_accuracy": 0.6407093292212799, "eval_loss": 1.8254655599594116, "eval_runtime": 42.9275, "eval_samples_per_second": 60.428, "eval_steps_per_second": 7.571, "step": 600 }, { "epoch": 0.95, "learning_rate": 8.806729264475744e-05, "loss": 1.864, "step": 610 }, { "epoch": 0.97, "learning_rate": 8.78716744913928e-05, "loss": 1.8157, "step": 620 }, { "epoch": 0.99, "learning_rate": 8.767605633802817e-05, "loss": 1.8333, "step": 630 }, { "epoch": 1.0, "learning_rate": 8.748043818466354e-05, "loss": 1.6364, "step": 640 }, { "epoch": 1.02, "learning_rate": 8.728482003129891e-05, "loss": 1.5205, "step": 650 }, { "epoch": 1.03, "learning_rate": 8.708920187793428e-05, "loss": 1.5193, "step": 660 }, { "epoch": 1.05, "learning_rate": 8.689358372456965e-05, "loss": 1.5602, "step": 670 }, { "epoch": 1.06, "learning_rate": 8.669796557120501e-05, "loss": 1.3509, "step": 680 }, { "epoch": 1.08, "learning_rate": 8.650234741784038e-05, "loss": 1.408, "step": 690 }, { "epoch": 1.1, "learning_rate": 8.630672926447575e-05, "loss": 1.2752, "step": 700 }, { "epoch": 1.1, "eval_accuracy": 0.70508866615266, "eval_loss": 1.5340100526809692, "eval_runtime": 42.6015, "eval_samples_per_second": 60.89, "eval_steps_per_second": 7.629, "step": 700 }, { "epoch": 1.11, "learning_rate": 8.611111111111112e-05, "loss": 1.184, "step": 710 }, { "epoch": 1.13, "learning_rate": 8.591549295774647e-05, "loss": 1.284, "step": 720 }, { "epoch": 1.14, "learning_rate": 8.571987480438184e-05, "loss": 1.2331, "step": 730 }, { "epoch": 1.16, "learning_rate": 8.552425665101721e-05, "loss": 1.2059, "step": 740 }, { "epoch": 1.17, "learning_rate": 8.53286384976526e-05, "loss": 1.3036, "step": 750 }, { "epoch": 1.19, "learning_rate": 8.513302034428795e-05, "loss": 1.2848, "step": 760 }, { "epoch": 1.21, "learning_rate": 8.493740219092332e-05, "loss": 1.073, "step": 770 }, { "epoch": 1.22, "learning_rate": 8.474178403755869e-05, "loss": 1.4211, "step": 780 }, { "epoch": 1.24, "learning_rate": 8.454616588419406e-05, "loss": 1.067, "step": 790 }, { "epoch": 1.25, "learning_rate": 8.435054773082942e-05, "loss": 1.2487, "step": 800 }, { "epoch": 1.25, "eval_accuracy": 0.7201233616037008, "eval_loss": 1.353263258934021, "eval_runtime": 42.5111, "eval_samples_per_second": 61.019, "eval_steps_per_second": 7.645, "step": 800 }, { "epoch": 1.27, "learning_rate": 8.415492957746479e-05, "loss": 1.2377, "step": 810 }, { "epoch": 1.28, "learning_rate": 8.395931142410016e-05, "loss": 1.1797, "step": 820 }, { "epoch": 1.3, "learning_rate": 8.376369327073553e-05, "loss": 1.2482, "step": 830 }, { "epoch": 1.31, "learning_rate": 8.35680751173709e-05, "loss": 1.0641, "step": 840 }, { "epoch": 1.33, "learning_rate": 8.337245696400627e-05, "loss": 1.14, "step": 850 }, { "epoch": 1.35, "learning_rate": 8.317683881064164e-05, "loss": 1.1587, "step": 860 }, { "epoch": 1.36, "learning_rate": 8.298122065727701e-05, "loss": 1.1029, "step": 870 }, { "epoch": 1.38, "learning_rate": 8.278560250391236e-05, "loss": 0.9988, "step": 880 }, { "epoch": 1.39, "learning_rate": 8.258998435054773e-05, "loss": 1.0692, "step": 890 }, { "epoch": 1.41, "learning_rate": 8.23943661971831e-05, "loss": 1.0333, "step": 900 }, { "epoch": 1.41, "eval_accuracy": 0.7825751734772552, "eval_loss": 1.1474497318267822, "eval_runtime": 43.2872, "eval_samples_per_second": 59.925, "eval_steps_per_second": 7.508, "step": 900 }, { "epoch": 1.42, "learning_rate": 8.219874804381847e-05, "loss": 1.0357, "step": 910 }, { "epoch": 1.44, "learning_rate": 8.200312989045383e-05, "loss": 0.8625, "step": 920 }, { "epoch": 1.46, "learning_rate": 8.18075117370892e-05, "loss": 1.0712, "step": 930 }, { "epoch": 1.47, "learning_rate": 8.161189358372458e-05, "loss": 1.1329, "step": 940 }, { "epoch": 1.49, "learning_rate": 8.141627543035995e-05, "loss": 1.0017, "step": 950 }, { "epoch": 1.5, "learning_rate": 8.122065727699531e-05, "loss": 1.1086, "step": 960 }, { "epoch": 1.52, "learning_rate": 8.102503912363068e-05, "loss": 0.8891, "step": 970 }, { "epoch": 1.53, "learning_rate": 8.082942097026605e-05, "loss": 0.8059, "step": 980 }, { "epoch": 1.55, "learning_rate": 8.063380281690142e-05, "loss": 1.0192, "step": 990 }, { "epoch": 1.56, "learning_rate": 8.043818466353677e-05, "loss": 0.8856, "step": 1000 }, { "epoch": 1.56, "eval_accuracy": 0.7644564379336931, "eval_loss": 1.0914219617843628, "eval_runtime": 42.8447, "eval_samples_per_second": 60.544, "eval_steps_per_second": 7.586, "step": 1000 }, { "epoch": 1.58, "learning_rate": 8.024256651017214e-05, "loss": 0.911, "step": 1010 }, { "epoch": 1.6, "learning_rate": 8.004694835680751e-05, "loss": 0.8939, "step": 1020 }, { "epoch": 1.61, "learning_rate": 7.985133020344288e-05, "loss": 0.7816, "step": 1030 }, { "epoch": 1.63, "learning_rate": 7.965571205007825e-05, "loss": 0.8397, "step": 1040 }, { "epoch": 1.64, "learning_rate": 7.946009389671362e-05, "loss": 0.8172, "step": 1050 }, { "epoch": 1.66, "learning_rate": 7.926447574334899e-05, "loss": 0.7408, "step": 1060 }, { "epoch": 1.67, "learning_rate": 7.906885758998436e-05, "loss": 0.6926, "step": 1070 }, { "epoch": 1.69, "learning_rate": 7.887323943661972e-05, "loss": 0.8984, "step": 1080 }, { "epoch": 1.71, "learning_rate": 7.867762128325509e-05, "loss": 0.7221, "step": 1090 }, { "epoch": 1.72, "learning_rate": 7.848200312989046e-05, "loss": 0.7512, "step": 1100 }, { "epoch": 1.72, "eval_accuracy": 0.8118735543562067, "eval_loss": 0.8893365263938904, "eval_runtime": 42.2991, "eval_samples_per_second": 61.325, "eval_steps_per_second": 7.683, "step": 1100 }, { "epoch": 1.74, "learning_rate": 7.828638497652583e-05, "loss": 0.7297, "step": 1110 }, { "epoch": 1.75, "learning_rate": 7.809076682316118e-05, "loss": 0.6638, "step": 1120 }, { "epoch": 1.77, "learning_rate": 7.789514866979655e-05, "loss": 0.7419, "step": 1130 }, { "epoch": 1.78, "learning_rate": 7.769953051643193e-05, "loss": 0.7263, "step": 1140 }, { "epoch": 1.8, "learning_rate": 7.75039123630673e-05, "loss": 0.6707, "step": 1150 }, { "epoch": 1.82, "learning_rate": 7.730829420970266e-05, "loss": 0.7675, "step": 1160 }, { "epoch": 1.83, "learning_rate": 7.711267605633803e-05, "loss": 0.9189, "step": 1170 }, { "epoch": 1.85, "learning_rate": 7.69170579029734e-05, "loss": 0.5663, "step": 1180 }, { "epoch": 1.86, "learning_rate": 7.672143974960877e-05, "loss": 0.679, "step": 1190 }, { "epoch": 1.88, "learning_rate": 7.652582159624414e-05, "loss": 0.747, "step": 1200 }, { "epoch": 1.88, "eval_accuracy": 0.8303777949113339, "eval_loss": 0.8370148539543152, "eval_runtime": 42.5695, "eval_samples_per_second": 60.936, "eval_steps_per_second": 7.635, "step": 1200 }, { "epoch": 1.89, "learning_rate": 7.63302034428795e-05, "loss": 0.8146, "step": 1210 }, { "epoch": 1.91, "learning_rate": 7.613458528951487e-05, "loss": 0.715, "step": 1220 }, { "epoch": 1.92, "learning_rate": 7.593896713615024e-05, "loss": 0.8337, "step": 1230 }, { "epoch": 1.94, "learning_rate": 7.57433489827856e-05, "loss": 0.6553, "step": 1240 }, { "epoch": 1.96, "learning_rate": 7.554773082942097e-05, "loss": 0.7805, "step": 1250 }, { "epoch": 1.97, "learning_rate": 7.535211267605634e-05, "loss": 0.6115, "step": 1260 }, { "epoch": 1.99, "learning_rate": 7.515649452269171e-05, "loss": 0.7046, "step": 1270 }, { "epoch": 2.0, "learning_rate": 7.496087636932708e-05, "loss": 0.551, "step": 1280 }, { "epoch": 2.02, "learning_rate": 7.476525821596244e-05, "loss": 0.4497, "step": 1290 }, { "epoch": 2.03, "learning_rate": 7.456964006259781e-05, "loss": 0.5082, "step": 1300 }, { "epoch": 2.03, "eval_accuracy": 0.856592135697764, "eval_loss": 0.7130723595619202, "eval_runtime": 43.188, "eval_samples_per_second": 60.063, "eval_steps_per_second": 7.525, "step": 1300 }, { "epoch": 2.05, "learning_rate": 7.437402190923318e-05, "loss": 0.347, "step": 1310 }, { "epoch": 2.07, "learning_rate": 7.417840375586855e-05, "loss": 0.463, "step": 1320 }, { "epoch": 2.08, "learning_rate": 7.398278560250392e-05, "loss": 0.4014, "step": 1330 }, { "epoch": 2.1, "learning_rate": 7.378716744913929e-05, "loss": 0.488, "step": 1340 }, { "epoch": 2.11, "learning_rate": 7.359154929577466e-05, "loss": 0.4239, "step": 1350 }, { "epoch": 2.13, "learning_rate": 7.339593114241003e-05, "loss": 0.4246, "step": 1360 }, { "epoch": 2.14, "learning_rate": 7.320031298904538e-05, "loss": 0.3698, "step": 1370 }, { "epoch": 2.16, "learning_rate": 7.300469483568075e-05, "loss": 0.4264, "step": 1380 }, { "epoch": 2.18, "learning_rate": 7.280907668231612e-05, "loss": 0.3167, "step": 1390 }, { "epoch": 2.19, "learning_rate": 7.261345852895149e-05, "loss": 0.4449, "step": 1400 }, { "epoch": 2.19, "eval_accuracy": 0.8546646106399384, "eval_loss": 0.6572707295417786, "eval_runtime": 43.2163, "eval_samples_per_second": 60.024, "eval_steps_per_second": 7.52, "step": 1400 }, { "epoch": 2.21, "learning_rate": 7.241784037558685e-05, "loss": 0.3504, "step": 1410 }, { "epoch": 2.22, "learning_rate": 7.222222222222222e-05, "loss": 0.397, "step": 1420 }, { "epoch": 2.24, "learning_rate": 7.202660406885759e-05, "loss": 0.4599, "step": 1430 }, { "epoch": 2.25, "learning_rate": 7.183098591549297e-05, "loss": 0.4151, "step": 1440 }, { "epoch": 2.27, "learning_rate": 7.163536776212833e-05, "loss": 0.3605, "step": 1450 }, { "epoch": 2.28, "learning_rate": 7.14397496087637e-05, "loss": 0.3248, "step": 1460 }, { "epoch": 2.3, "learning_rate": 7.124413145539907e-05, "loss": 0.32, "step": 1470 }, { "epoch": 2.32, "learning_rate": 7.104851330203444e-05, "loss": 0.4012, "step": 1480 }, { "epoch": 2.33, "learning_rate": 7.08528951486698e-05, "loss": 0.3526, "step": 1490 }, { "epoch": 2.35, "learning_rate": 7.065727699530516e-05, "loss": 0.2912, "step": 1500 }, { "epoch": 2.35, "eval_accuracy": 0.8596761757902853, "eval_loss": 0.6183947324752808, "eval_runtime": 43.0374, "eval_samples_per_second": 60.273, "eval_steps_per_second": 7.552, "step": 1500 }, { "epoch": 2.36, "learning_rate": 7.046165884194053e-05, "loss": 0.427, "step": 1510 }, { "epoch": 2.38, "learning_rate": 7.02660406885759e-05, "loss": 0.4321, "step": 1520 }, { "epoch": 2.39, "learning_rate": 7.007042253521127e-05, "loss": 0.363, "step": 1530 }, { "epoch": 2.41, "learning_rate": 6.987480438184664e-05, "loss": 0.2761, "step": 1540 }, { "epoch": 2.43, "learning_rate": 6.967918622848201e-05, "loss": 0.3189, "step": 1550 }, { "epoch": 2.44, "learning_rate": 6.948356807511738e-05, "loss": 0.3227, "step": 1560 }, { "epoch": 2.46, "learning_rate": 6.928794992175274e-05, "loss": 0.2792, "step": 1570 }, { "epoch": 2.47, "learning_rate": 6.909233176838811e-05, "loss": 0.1959, "step": 1580 }, { "epoch": 2.49, "learning_rate": 6.889671361502348e-05, "loss": 0.2785, "step": 1590 }, { "epoch": 2.5, "learning_rate": 6.870109546165885e-05, "loss": 0.285, "step": 1600 }, { "epoch": 2.5, "eval_accuracy": 0.8569776407093292, "eval_loss": 0.5973872542381287, "eval_runtime": 42.9634, "eval_samples_per_second": 60.377, "eval_steps_per_second": 7.565, "step": 1600 }, { "epoch": 2.52, "learning_rate": 6.85054773082942e-05, "loss": 0.2649, "step": 1610 }, { "epoch": 2.54, "learning_rate": 6.830985915492957e-05, "loss": 0.3259, "step": 1620 }, { "epoch": 2.55, "learning_rate": 6.811424100156496e-05, "loss": 0.3085, "step": 1630 }, { "epoch": 2.57, "learning_rate": 6.791862284820033e-05, "loss": 0.3485, "step": 1640 }, { "epoch": 2.58, "learning_rate": 6.772300469483568e-05, "loss": 0.2735, "step": 1650 }, { "epoch": 2.6, "learning_rate": 6.752738654147105e-05, "loss": 0.3112, "step": 1660 }, { "epoch": 2.61, "learning_rate": 6.733176838810642e-05, "loss": 0.3373, "step": 1670 }, { "epoch": 2.63, "learning_rate": 6.713615023474179e-05, "loss": 0.1682, "step": 1680 }, { "epoch": 2.64, "learning_rate": 6.694053208137715e-05, "loss": 0.2515, "step": 1690 }, { "epoch": 2.66, "learning_rate": 6.674491392801252e-05, "loss": 0.2267, "step": 1700 }, { "epoch": 2.66, "eval_accuracy": 0.8646877409406323, "eval_loss": 0.5621365904808044, "eval_runtime": 43.0986, "eval_samples_per_second": 60.188, "eval_steps_per_second": 7.541, "step": 1700 }, { "epoch": 2.68, "learning_rate": 6.654929577464789e-05, "loss": 0.1861, "step": 1710 }, { "epoch": 2.69, "learning_rate": 6.635367762128326e-05, "loss": 0.3325, "step": 1720 }, { "epoch": 2.71, "learning_rate": 6.615805946791863e-05, "loss": 0.3621, "step": 1730 }, { "epoch": 2.72, "learning_rate": 6.5962441314554e-05, "loss": 0.3046, "step": 1740 }, { "epoch": 2.74, "learning_rate": 6.576682316118937e-05, "loss": 0.347, "step": 1750 }, { "epoch": 2.75, "learning_rate": 6.557120500782473e-05, "loss": 0.3916, "step": 1760 }, { "epoch": 2.77, "learning_rate": 6.53755868544601e-05, "loss": 0.3221, "step": 1770 }, { "epoch": 2.79, "learning_rate": 6.517996870109546e-05, "loss": 0.1877, "step": 1780 }, { "epoch": 2.8, "learning_rate": 6.498435054773083e-05, "loss": 0.3638, "step": 1790 }, { "epoch": 2.82, "learning_rate": 6.47887323943662e-05, "loss": 0.2553, "step": 1800 }, { "epoch": 2.82, "eval_accuracy": 0.8816499614494988, "eval_loss": 0.5043683052062988, "eval_runtime": 42.7859, "eval_samples_per_second": 60.628, "eval_steps_per_second": 7.596, "step": 1800 }, { "epoch": 2.83, "learning_rate": 6.459311424100157e-05, "loss": 0.3439, "step": 1810 }, { "epoch": 2.85, "learning_rate": 6.439749608763693e-05, "loss": 0.2463, "step": 1820 }, { "epoch": 2.86, "learning_rate": 6.420187793427231e-05, "loss": 0.2192, "step": 1830 }, { "epoch": 2.88, "learning_rate": 6.400625978090768e-05, "loss": 0.236, "step": 1840 }, { "epoch": 2.9, "learning_rate": 6.381064162754305e-05, "loss": 0.2441, "step": 1850 }, { "epoch": 2.91, "learning_rate": 6.36150234741784e-05, "loss": 0.2125, "step": 1860 }, { "epoch": 2.93, "learning_rate": 6.341940532081377e-05, "loss": 0.2112, "step": 1870 }, { "epoch": 2.94, "learning_rate": 6.322378716744914e-05, "loss": 0.2905, "step": 1880 }, { "epoch": 2.96, "learning_rate": 6.302816901408451e-05, "loss": 0.3244, "step": 1890 }, { "epoch": 2.97, "learning_rate": 6.283255086071987e-05, "loss": 0.2029, "step": 1900 }, { "epoch": 2.97, "eval_accuracy": 0.8955281418658443, "eval_loss": 0.43422141671180725, "eval_runtime": 42.7572, "eval_samples_per_second": 60.668, "eval_steps_per_second": 7.601, "step": 1900 }, { "epoch": 2.99, "learning_rate": 6.263693270735524e-05, "loss": 0.2284, "step": 1910 }, { "epoch": 3.0, "learning_rate": 6.244131455399061e-05, "loss": 0.2399, "step": 1920 }, { "epoch": 3.02, "learning_rate": 6.224569640062598e-05, "loss": 0.1875, "step": 1930 }, { "epoch": 3.04, "learning_rate": 6.205007824726135e-05, "loss": 0.1721, "step": 1940 }, { "epoch": 3.05, "learning_rate": 6.185446009389672e-05, "loss": 0.2115, "step": 1950 }, { "epoch": 3.07, "learning_rate": 6.165884194053209e-05, "loss": 0.1698, "step": 1960 }, { "epoch": 3.08, "learning_rate": 6.146322378716746e-05, "loss": 0.1321, "step": 1970 }, { "epoch": 3.1, "learning_rate": 6.126760563380281e-05, "loss": 0.1165, "step": 1980 }, { "epoch": 3.11, "learning_rate": 6.107198748043818e-05, "loss": 0.1121, "step": 1990 }, { "epoch": 3.13, "learning_rate": 6.0876369327073554e-05, "loss": 0.1763, "step": 2000 }, { "epoch": 3.13, "eval_accuracy": 0.8905165767154973, "eval_loss": 0.44871243834495544, "eval_runtime": 42.6933, "eval_samples_per_second": 60.759, "eval_steps_per_second": 7.612, "step": 2000 }, { "epoch": 3.15, "learning_rate": 6.068075117370893e-05, "loss": 0.1473, "step": 2010 }, { "epoch": 3.16, "learning_rate": 6.0485133020344286e-05, "loss": 0.1697, "step": 2020 }, { "epoch": 3.18, "learning_rate": 6.0289514866979656e-05, "loss": 0.1712, "step": 2030 }, { "epoch": 3.19, "learning_rate": 6.0093896713615026e-05, "loss": 0.1468, "step": 2040 }, { "epoch": 3.21, "learning_rate": 5.9898278560250395e-05, "loss": 0.1395, "step": 2050 }, { "epoch": 3.22, "learning_rate": 5.970266040688576e-05, "loss": 0.1122, "step": 2060 }, { "epoch": 3.24, "learning_rate": 5.950704225352113e-05, "loss": 0.1426, "step": 2070 }, { "epoch": 3.26, "learning_rate": 5.93114241001565e-05, "loss": 0.111, "step": 2080 }, { "epoch": 3.27, "learning_rate": 5.911580594679187e-05, "loss": 0.141, "step": 2090 }, { "epoch": 3.29, "learning_rate": 5.892018779342723e-05, "loss": 0.1418, "step": 2100 }, { "epoch": 3.29, "eval_accuracy": 0.9005397070161912, "eval_loss": 0.41731029748916626, "eval_runtime": 42.6624, "eval_samples_per_second": 60.803, "eval_steps_per_second": 7.618, "step": 2100 }, { "epoch": 3.3, "learning_rate": 5.87245696400626e-05, "loss": 0.1462, "step": 2110 }, { "epoch": 3.32, "learning_rate": 5.852895148669797e-05, "loss": 0.0985, "step": 2120 }, { "epoch": 3.33, "learning_rate": 5.833333333333334e-05, "loss": 0.133, "step": 2130 }, { "epoch": 3.35, "learning_rate": 5.81377151799687e-05, "loss": 0.1414, "step": 2140 }, { "epoch": 3.36, "learning_rate": 5.794209702660407e-05, "loss": 0.1297, "step": 2150 }, { "epoch": 3.38, "learning_rate": 5.774647887323944e-05, "loss": 0.1346, "step": 2160 }, { "epoch": 3.4, "learning_rate": 5.755086071987481e-05, "loss": 0.1237, "step": 2170 }, { "epoch": 3.41, "learning_rate": 5.735524256651017e-05, "loss": 0.1664, "step": 2180 }, { "epoch": 3.43, "learning_rate": 5.715962441314554e-05, "loss": 0.0701, "step": 2190 }, { "epoch": 3.44, "learning_rate": 5.6964006259780914e-05, "loss": 0.0563, "step": 2200 }, { "epoch": 3.44, "eval_accuracy": 0.9047802621434079, "eval_loss": 0.387023389339447, "eval_runtime": 42.2972, "eval_samples_per_second": 61.328, "eval_steps_per_second": 7.684, "step": 2200 }, { "epoch": 3.46, "learning_rate": 5.6768388106416284e-05, "loss": 0.0832, "step": 2210 }, { "epoch": 3.47, "learning_rate": 5.657276995305164e-05, "loss": 0.0966, "step": 2220 }, { "epoch": 3.49, "learning_rate": 5.637715179968701e-05, "loss": 0.0926, "step": 2230 }, { "epoch": 3.51, "learning_rate": 5.618153364632238e-05, "loss": 0.1285, "step": 2240 }, { "epoch": 3.52, "learning_rate": 5.598591549295775e-05, "loss": 0.0809, "step": 2250 }, { "epoch": 3.54, "learning_rate": 5.579029733959311e-05, "loss": 0.1119, "step": 2260 }, { "epoch": 3.55, "learning_rate": 5.559467918622848e-05, "loss": 0.067, "step": 2270 }, { "epoch": 3.57, "learning_rate": 5.539906103286385e-05, "loss": 0.1022, "step": 2280 }, { "epoch": 3.58, "learning_rate": 5.520344287949922e-05, "loss": 0.1318, "step": 2290 }, { "epoch": 3.6, "learning_rate": 5.5007824726134584e-05, "loss": 0.0579, "step": 2300 }, { "epoch": 3.6, "eval_accuracy": 0.9036237471087124, "eval_loss": 0.38491636514663696, "eval_runtime": 42.9889, "eval_samples_per_second": 60.341, "eval_steps_per_second": 7.56, "step": 2300 }, { "epoch": 3.62, "learning_rate": 5.4812206572769954e-05, "loss": 0.1294, "step": 2310 }, { "epoch": 3.63, "learning_rate": 5.461658841940532e-05, "loss": 0.0777, "step": 2320 }, { "epoch": 3.65, "learning_rate": 5.442097026604069e-05, "loss": 0.0754, "step": 2330 }, { "epoch": 3.66, "learning_rate": 5.422535211267606e-05, "loss": 0.1463, "step": 2340 }, { "epoch": 3.68, "learning_rate": 5.4029733959311426e-05, "loss": 0.0578, "step": 2350 }, { "epoch": 3.69, "learning_rate": 5.3834115805946795e-05, "loss": 0.1084, "step": 2360 }, { "epoch": 3.71, "learning_rate": 5.3638497652582165e-05, "loss": 0.0534, "step": 2370 }, { "epoch": 3.72, "learning_rate": 5.3442879499217535e-05, "loss": 0.0598, "step": 2380 }, { "epoch": 3.74, "learning_rate": 5.32472613458529e-05, "loss": 0.1353, "step": 2390 }, { "epoch": 3.76, "learning_rate": 5.305164319248827e-05, "loss": 0.166, "step": 2400 }, { "epoch": 3.76, "eval_accuracy": 0.9024672320740169, "eval_loss": 0.3933294415473938, "eval_runtime": 42.6729, "eval_samples_per_second": 60.788, "eval_steps_per_second": 7.616, "step": 2400 }, { "epoch": 3.77, "learning_rate": 5.285602503912364e-05, "loss": 0.0961, "step": 2410 }, { "epoch": 3.79, "learning_rate": 5.266040688575901e-05, "loss": 0.0647, "step": 2420 }, { "epoch": 3.8, "learning_rate": 5.246478873239436e-05, "loss": 0.0744, "step": 2430 }, { "epoch": 3.82, "learning_rate": 5.226917057902973e-05, "loss": 0.1046, "step": 2440 }, { "epoch": 3.83, "learning_rate": 5.207355242566511e-05, "loss": 0.0925, "step": 2450 }, { "epoch": 3.85, "learning_rate": 5.187793427230048e-05, "loss": 0.1343, "step": 2460 }, { "epoch": 3.87, "learning_rate": 5.1682316118935835e-05, "loss": 0.0721, "step": 2470 }, { "epoch": 3.88, "learning_rate": 5.1486697965571205e-05, "loss": 0.1446, "step": 2480 }, { "epoch": 3.9, "learning_rate": 5.1291079812206575e-05, "loss": 0.0807, "step": 2490 }, { "epoch": 3.91, "learning_rate": 5.109546165884195e-05, "loss": 0.11, "step": 2500 }, { "epoch": 3.91, "eval_accuracy": 0.9055512721665382, "eval_loss": 0.39182865619659424, "eval_runtime": 42.4672, "eval_samples_per_second": 61.082, "eval_steps_per_second": 7.653, "step": 2500 }, { "epoch": 3.93, "learning_rate": 5.089984350547731e-05, "loss": 0.1331, "step": 2510 }, { "epoch": 3.94, "learning_rate": 5.070422535211268e-05, "loss": 0.0506, "step": 2520 }, { "epoch": 3.96, "learning_rate": 5.0508607198748047e-05, "loss": 0.1025, "step": 2530 }, { "epoch": 3.97, "learning_rate": 5.0312989045383416e-05, "loss": 0.0792, "step": 2540 }, { "epoch": 3.99, "learning_rate": 5.011737089201878e-05, "loss": 0.099, "step": 2550 }, { "epoch": 4.01, "learning_rate": 4.992175273865415e-05, "loss": 0.0861, "step": 2560 }, { "epoch": 4.02, "learning_rate": 4.972613458528952e-05, "loss": 0.0467, "step": 2570 }, { "epoch": 4.04, "learning_rate": 4.953051643192488e-05, "loss": 0.0587, "step": 2580 }, { "epoch": 4.05, "learning_rate": 4.933489827856025e-05, "loss": 0.064, "step": 2590 }, { "epoch": 4.07, "learning_rate": 4.913928012519562e-05, "loss": 0.0356, "step": 2600 }, { "epoch": 4.07, "eval_accuracy": 0.9202004626060138, "eval_loss": 0.3298385739326477, "eval_runtime": 42.9604, "eval_samples_per_second": 60.381, "eval_steps_per_second": 7.565, "step": 2600 }, { "epoch": 4.08, "learning_rate": 4.894366197183099e-05, "loss": 0.0376, "step": 2610 }, { "epoch": 4.1, "learning_rate": 4.8748043818466354e-05, "loss": 0.0275, "step": 2620 }, { "epoch": 4.12, "learning_rate": 4.855242566510172e-05, "loss": 0.0297, "step": 2630 }, { "epoch": 4.13, "learning_rate": 4.835680751173709e-05, "loss": 0.0323, "step": 2640 }, { "epoch": 4.15, "learning_rate": 4.816118935837246e-05, "loss": 0.0561, "step": 2650 }, { "epoch": 4.16, "learning_rate": 4.7965571205007826e-05, "loss": 0.0489, "step": 2660 }, { "epoch": 4.18, "learning_rate": 4.7769953051643195e-05, "loss": 0.0448, "step": 2670 }, { "epoch": 4.19, "learning_rate": 4.757433489827856e-05, "loss": 0.0398, "step": 2680 }, { "epoch": 4.21, "learning_rate": 4.737871674491393e-05, "loss": 0.0588, "step": 2690 }, { "epoch": 4.23, "learning_rate": 4.71830985915493e-05, "loss": 0.0513, "step": 2700 }, { "epoch": 4.23, "eval_accuracy": 0.9209714726291441, "eval_loss": 0.337054580450058, "eval_runtime": 42.6677, "eval_samples_per_second": 60.795, "eval_steps_per_second": 7.617, "step": 2700 }, { "epoch": 4.24, "learning_rate": 4.698748043818467e-05, "loss": 0.0396, "step": 2710 }, { "epoch": 4.26, "learning_rate": 4.679186228482003e-05, "loss": 0.0352, "step": 2720 }, { "epoch": 4.27, "learning_rate": 4.65962441314554e-05, "loss": 0.0695, "step": 2730 }, { "epoch": 4.29, "learning_rate": 4.640062597809077e-05, "loss": 0.0614, "step": 2740 }, { "epoch": 4.3, "learning_rate": 4.620500782472614e-05, "loss": 0.0702, "step": 2750 }, { "epoch": 4.32, "learning_rate": 4.60093896713615e-05, "loss": 0.037, "step": 2760 }, { "epoch": 4.33, "learning_rate": 4.581377151799687e-05, "loss": 0.0567, "step": 2770 }, { "epoch": 4.35, "learning_rate": 4.5618153364632235e-05, "loss": 0.0327, "step": 2780 }, { "epoch": 4.37, "learning_rate": 4.542253521126761e-05, "loss": 0.0358, "step": 2790 }, { "epoch": 4.38, "learning_rate": 4.5226917057902975e-05, "loss": 0.0762, "step": 2800 }, { "epoch": 4.38, "eval_accuracy": 0.9225134926754048, "eval_loss": 0.32532238960266113, "eval_runtime": 42.6755, "eval_samples_per_second": 60.784, "eval_steps_per_second": 7.616, "step": 2800 }, { "epoch": 4.4, "learning_rate": 4.5031298904538344e-05, "loss": 0.0223, "step": 2810 }, { "epoch": 4.41, "learning_rate": 4.483568075117371e-05, "loss": 0.028, "step": 2820 }, { "epoch": 4.43, "learning_rate": 4.464006259780908e-05, "loss": 0.0572, "step": 2830 }, { "epoch": 4.44, "learning_rate": 4.4444444444444447e-05, "loss": 0.0487, "step": 2840 }, { "epoch": 4.46, "learning_rate": 4.4248826291079816e-05, "loss": 0.0711, "step": 2850 }, { "epoch": 4.48, "learning_rate": 4.405320813771518e-05, "loss": 0.0334, "step": 2860 }, { "epoch": 4.49, "learning_rate": 4.385758998435055e-05, "loss": 0.0219, "step": 2870 }, { "epoch": 4.51, "learning_rate": 4.366197183098591e-05, "loss": 0.0405, "step": 2880 }, { "epoch": 4.52, "learning_rate": 4.346635367762129e-05, "loss": 0.0329, "step": 2890 }, { "epoch": 4.54, "learning_rate": 4.327073552425665e-05, "loss": 0.018, "step": 2900 }, { "epoch": 4.54, "eval_accuracy": 0.9148033924441018, "eval_loss": 0.34668266773223877, "eval_runtime": 42.9733, "eval_samples_per_second": 60.363, "eval_steps_per_second": 7.563, "step": 2900 }, { "epoch": 4.55, "learning_rate": 4.307511737089202e-05, "loss": 0.0413, "step": 2910 }, { "epoch": 4.57, "learning_rate": 4.287949921752739e-05, "loss": 0.018, "step": 2920 }, { "epoch": 4.59, "learning_rate": 4.2683881064162754e-05, "loss": 0.0417, "step": 2930 }, { "epoch": 4.6, "learning_rate": 4.248826291079812e-05, "loss": 0.0254, "step": 2940 }, { "epoch": 4.62, "learning_rate": 4.229264475743349e-05, "loss": 0.0411, "step": 2950 }, { "epoch": 4.63, "learning_rate": 4.209702660406886e-05, "loss": 0.0625, "step": 2960 }, { "epoch": 4.65, "learning_rate": 4.1901408450704226e-05, "loss": 0.0416, "step": 2970 }, { "epoch": 4.66, "learning_rate": 4.1705790297339595e-05, "loss": 0.0486, "step": 2980 }, { "epoch": 4.68, "learning_rate": 4.1510172143974965e-05, "loss": 0.0378, "step": 2990 }, { "epoch": 4.69, "learning_rate": 4.1314553990610335e-05, "loss": 0.0263, "step": 3000 }, { "epoch": 4.69, "eval_accuracy": 0.9144178874325366, "eval_loss": 0.3544096052646637, "eval_runtime": 43.2558, "eval_samples_per_second": 59.969, "eval_steps_per_second": 7.513, "step": 3000 }, { "epoch": 4.71, "learning_rate": 4.11189358372457e-05, "loss": 0.0449, "step": 3010 }, { "epoch": 4.73, "learning_rate": 4.092331768388107e-05, "loss": 0.0758, "step": 3020 }, { "epoch": 4.74, "learning_rate": 4.072769953051643e-05, "loss": 0.0418, "step": 3030 }, { "epoch": 4.76, "learning_rate": 4.053208137715181e-05, "loss": 0.0392, "step": 3040 }, { "epoch": 4.77, "learning_rate": 4.033646322378717e-05, "loss": 0.0195, "step": 3050 }, { "epoch": 4.79, "learning_rate": 4.014084507042254e-05, "loss": 0.0317, "step": 3060 }, { "epoch": 4.8, "learning_rate": 3.99452269170579e-05, "loss": 0.0162, "step": 3070 }, { "epoch": 4.82, "learning_rate": 3.974960876369327e-05, "loss": 0.0256, "step": 3080 }, { "epoch": 4.84, "learning_rate": 3.955399061032864e-05, "loss": 0.017, "step": 3090 }, { "epoch": 4.85, "learning_rate": 3.935837245696401e-05, "loss": 0.0205, "step": 3100 }, { "epoch": 4.85, "eval_accuracy": 0.9221279876638396, "eval_loss": 0.33404412865638733, "eval_runtime": 43.1306, "eval_samples_per_second": 60.143, "eval_steps_per_second": 7.535, "step": 3100 }, { "epoch": 4.87, "learning_rate": 3.9162754303599375e-05, "loss": 0.0549, "step": 3110 }, { "epoch": 4.88, "learning_rate": 3.8967136150234744e-05, "loss": 0.066, "step": 3120 }, { "epoch": 4.9, "learning_rate": 3.877151799687011e-05, "loss": 0.0307, "step": 3130 }, { "epoch": 4.91, "learning_rate": 3.8575899843505484e-05, "loss": 0.0212, "step": 3140 }, { "epoch": 4.93, "learning_rate": 3.8380281690140847e-05, "loss": 0.0311, "step": 3150 }, { "epoch": 4.95, "learning_rate": 3.8184663536776216e-05, "loss": 0.0146, "step": 3160 }, { "epoch": 4.96, "learning_rate": 3.798904538341158e-05, "loss": 0.0173, "step": 3170 }, { "epoch": 4.98, "learning_rate": 3.779342723004695e-05, "loss": 0.0336, "step": 3180 }, { "epoch": 4.99, "learning_rate": 3.759780907668232e-05, "loss": 0.0295, "step": 3190 }, { "epoch": 5.01, "learning_rate": 3.740219092331769e-05, "loss": 0.0237, "step": 3200 }, { "epoch": 5.01, "eval_accuracy": 0.9144178874325366, "eval_loss": 0.33526894450187683, "eval_runtime": 42.9414, "eval_samples_per_second": 60.408, "eval_steps_per_second": 7.568, "step": 3200 }, { "epoch": 5.02, "learning_rate": 3.720657276995305e-05, "loss": 0.0137, "step": 3210 }, { "epoch": 5.04, "learning_rate": 3.701095461658842e-05, "loss": 0.0161, "step": 3220 }, { "epoch": 5.05, "learning_rate": 3.681533646322379e-05, "loss": 0.0217, "step": 3230 }, { "epoch": 5.07, "learning_rate": 3.661971830985916e-05, "loss": 0.0184, "step": 3240 }, { "epoch": 5.09, "learning_rate": 3.642410015649452e-05, "loss": 0.0177, "step": 3250 }, { "epoch": 5.1, "learning_rate": 3.622848200312989e-05, "loss": 0.051, "step": 3260 }, { "epoch": 5.12, "learning_rate": 3.6032863849765256e-05, "loss": 0.012, "step": 3270 }, { "epoch": 5.13, "learning_rate": 3.5837245696400626e-05, "loss": 0.0156, "step": 3280 }, { "epoch": 5.15, "learning_rate": 3.5641627543035995e-05, "loss": 0.0129, "step": 3290 }, { "epoch": 5.16, "learning_rate": 3.5446009389671365e-05, "loss": 0.013, "step": 3300 }, { "epoch": 5.16, "eval_accuracy": 0.9228989976869699, "eval_loss": 0.3218042850494385, "eval_runtime": 42.5041, "eval_samples_per_second": 61.029, "eval_steps_per_second": 7.646, "step": 3300 }, { "epoch": 5.18, "learning_rate": 3.525039123630673e-05, "loss": 0.0106, "step": 3310 }, { "epoch": 5.2, "learning_rate": 3.50547730829421e-05, "loss": 0.0132, "step": 3320 }, { "epoch": 5.21, "learning_rate": 3.485915492957747e-05, "loss": 0.0212, "step": 3330 }, { "epoch": 5.23, "learning_rate": 3.466353677621284e-05, "loss": 0.0124, "step": 3340 }, { "epoch": 5.24, "learning_rate": 3.44679186228482e-05, "loss": 0.0135, "step": 3350 }, { "epoch": 5.26, "learning_rate": 3.427230046948357e-05, "loss": 0.0112, "step": 3360 }, { "epoch": 5.27, "learning_rate": 3.407668231611893e-05, "loss": 0.0175, "step": 3370 }, { "epoch": 5.29, "learning_rate": 3.38810641627543e-05, "loss": 0.0234, "step": 3380 }, { "epoch": 5.31, "learning_rate": 3.368544600938967e-05, "loss": 0.0133, "step": 3390 }, { "epoch": 5.32, "learning_rate": 3.348982785602504e-05, "loss": 0.0116, "step": 3400 }, { "epoch": 5.32, "eval_accuracy": 0.9290670778720124, "eval_loss": 0.308786541223526, "eval_runtime": 43.1692, "eval_samples_per_second": 60.089, "eval_steps_per_second": 7.529, "step": 3400 }, { "epoch": 5.34, "learning_rate": 3.3294209702660405e-05, "loss": 0.011, "step": 3410 }, { "epoch": 5.35, "learning_rate": 3.3098591549295775e-05, "loss": 0.0119, "step": 3420 }, { "epoch": 5.37, "learning_rate": 3.2902973395931144e-05, "loss": 0.0175, "step": 3430 }, { "epoch": 5.38, "learning_rate": 3.2707355242566514e-05, "loss": 0.0116, "step": 3440 }, { "epoch": 5.4, "learning_rate": 3.251173708920188e-05, "loss": 0.0113, "step": 3450 }, { "epoch": 5.41, "learning_rate": 3.2316118935837247e-05, "loss": 0.0152, "step": 3460 }, { "epoch": 5.43, "learning_rate": 3.212050078247261e-05, "loss": 0.012, "step": 3470 }, { "epoch": 5.45, "learning_rate": 3.1924882629107986e-05, "loss": 0.0096, "step": 3480 }, { "epoch": 5.46, "learning_rate": 3.1729264475743356e-05, "loss": 0.0109, "step": 3490 }, { "epoch": 5.48, "learning_rate": 3.153364632237872e-05, "loss": 0.0119, "step": 3500 }, { "epoch": 5.48, "eval_accuracy": 0.9279105628373169, "eval_loss": 0.3046722412109375, "eval_runtime": 42.5452, "eval_samples_per_second": 60.97, "eval_steps_per_second": 7.639, "step": 3500 }, { "epoch": 5.49, "learning_rate": 3.133802816901409e-05, "loss": 0.0091, "step": 3510 }, { "epoch": 5.51, "learning_rate": 3.114241001564945e-05, "loss": 0.0089, "step": 3520 }, { "epoch": 5.52, "learning_rate": 3.094679186228482e-05, "loss": 0.0113, "step": 3530 }, { "epoch": 5.54, "learning_rate": 3.075117370892019e-05, "loss": 0.0162, "step": 3540 }, { "epoch": 5.56, "learning_rate": 3.055555555555556e-05, "loss": 0.0089, "step": 3550 }, { "epoch": 5.57, "learning_rate": 3.0359937402190923e-05, "loss": 0.0105, "step": 3560 }, { "epoch": 5.59, "learning_rate": 3.0164319248826296e-05, "loss": 0.037, "step": 3570 }, { "epoch": 5.6, "learning_rate": 2.996870109546166e-05, "loss": 0.0097, "step": 3580 }, { "epoch": 5.62, "learning_rate": 2.977308294209703e-05, "loss": 0.0087, "step": 3590 }, { "epoch": 5.63, "learning_rate": 2.9577464788732395e-05, "loss": 0.0098, "step": 3600 }, { "epoch": 5.63, "eval_accuracy": 0.9282960678488821, "eval_loss": 0.30633866786956787, "eval_runtime": 42.8022, "eval_samples_per_second": 60.604, "eval_steps_per_second": 7.593, "step": 3600 }, { "epoch": 5.65, "learning_rate": 2.9381846635367765e-05, "loss": 0.009, "step": 3610 }, { "epoch": 5.67, "learning_rate": 2.918622848200313e-05, "loss": 0.0263, "step": 3620 }, { "epoch": 5.68, "learning_rate": 2.89906103286385e-05, "loss": 0.0112, "step": 3630 }, { "epoch": 5.7, "learning_rate": 2.8794992175273867e-05, "loss": 0.0104, "step": 3640 }, { "epoch": 5.71, "learning_rate": 2.8599374021909237e-05, "loss": 0.0082, "step": 3650 }, { "epoch": 5.73, "learning_rate": 2.84037558685446e-05, "loss": 0.0091, "step": 3660 }, { "epoch": 5.74, "learning_rate": 2.8208137715179973e-05, "loss": 0.0099, "step": 3670 }, { "epoch": 5.76, "learning_rate": 2.8012519561815336e-05, "loss": 0.0355, "step": 3680 }, { "epoch": 5.77, "learning_rate": 2.7816901408450706e-05, "loss": 0.0088, "step": 3690 }, { "epoch": 5.79, "learning_rate": 2.7621283255086072e-05, "loss": 0.0086, "step": 3700 }, { "epoch": 5.79, "eval_accuracy": 0.9267540478026214, "eval_loss": 0.30737537145614624, "eval_runtime": 42.6258, "eval_samples_per_second": 60.855, "eval_steps_per_second": 7.624, "step": 3700 }, { "epoch": 5.81, "learning_rate": 2.7425665101721442e-05, "loss": 0.0097, "step": 3710 }, { "epoch": 5.82, "learning_rate": 2.7230046948356808e-05, "loss": 0.0092, "step": 3720 }, { "epoch": 5.84, "learning_rate": 2.7034428794992178e-05, "loss": 0.0085, "step": 3730 }, { "epoch": 5.85, "learning_rate": 2.6838810641627544e-05, "loss": 0.0089, "step": 3740 }, { "epoch": 5.87, "learning_rate": 2.6643192488262914e-05, "loss": 0.0341, "step": 3750 }, { "epoch": 5.88, "learning_rate": 2.6447574334898277e-05, "loss": 0.0357, "step": 3760 }, { "epoch": 5.9, "learning_rate": 2.625195618153365e-05, "loss": 0.0088, "step": 3770 }, { "epoch": 5.92, "learning_rate": 2.6056338028169013e-05, "loss": 0.0101, "step": 3780 }, { "epoch": 5.93, "learning_rate": 2.5860719874804386e-05, "loss": 0.0124, "step": 3790 }, { "epoch": 5.95, "learning_rate": 2.566510172143975e-05, "loss": 0.0081, "step": 3800 }, { "epoch": 5.95, "eval_accuracy": 0.9236700077101002, "eval_loss": 0.32199642062187195, "eval_runtime": 43.3628, "eval_samples_per_second": 59.821, "eval_steps_per_second": 7.495, "step": 3800 }, { "epoch": 5.96, "learning_rate": 2.546948356807512e-05, "loss": 0.0096, "step": 3810 }, { "epoch": 5.98, "learning_rate": 2.5273865414710485e-05, "loss": 0.0217, "step": 3820 }, { "epoch": 5.99, "learning_rate": 2.5078247261345855e-05, "loss": 0.0112, "step": 3830 }, { "epoch": 6.01, "learning_rate": 2.4882629107981224e-05, "loss": 0.0115, "step": 3840 }, { "epoch": 6.03, "learning_rate": 2.468701095461659e-05, "loss": 0.0081, "step": 3850 }, { "epoch": 6.04, "learning_rate": 2.4491392801251957e-05, "loss": 0.0076, "step": 3860 }, { "epoch": 6.06, "learning_rate": 2.4295774647887327e-05, "loss": 0.0078, "step": 3870 }, { "epoch": 6.07, "learning_rate": 2.4100156494522693e-05, "loss": 0.0082, "step": 3880 }, { "epoch": 6.09, "learning_rate": 2.3904538341158063e-05, "loss": 0.0079, "step": 3890 }, { "epoch": 6.1, "learning_rate": 2.370892018779343e-05, "loss": 0.0078, "step": 3900 }, { "epoch": 6.1, "eval_accuracy": 0.9267540478026214, "eval_loss": 0.30635374784469604, "eval_runtime": 43.3159, "eval_samples_per_second": 59.886, "eval_steps_per_second": 7.503, "step": 3900 }, { "epoch": 6.12, "learning_rate": 2.3513302034428795e-05, "loss": 0.008, "step": 3910 }, { "epoch": 6.13, "learning_rate": 2.3317683881064165e-05, "loss": 0.0081, "step": 3920 }, { "epoch": 6.15, "learning_rate": 2.312206572769953e-05, "loss": 0.0074, "step": 3930 }, { "epoch": 6.17, "learning_rate": 2.29264475743349e-05, "loss": 0.0075, "step": 3940 }, { "epoch": 6.18, "learning_rate": 2.2730829420970267e-05, "loss": 0.0082, "step": 3950 }, { "epoch": 6.2, "learning_rate": 2.2535211267605634e-05, "loss": 0.0079, "step": 3960 }, { "epoch": 6.21, "learning_rate": 2.2339593114241003e-05, "loss": 0.0075, "step": 3970 }, { "epoch": 6.23, "learning_rate": 2.214397496087637e-05, "loss": 0.0074, "step": 3980 }, { "epoch": 6.24, "learning_rate": 2.194835680751174e-05, "loss": 0.0067, "step": 3990 }, { "epoch": 6.26, "learning_rate": 2.1752738654147106e-05, "loss": 0.0074, "step": 4000 }, { "epoch": 6.26, "eval_accuracy": 0.9279105628373169, "eval_loss": 0.30622774362564087, "eval_runtime": 42.6237, "eval_samples_per_second": 60.858, "eval_steps_per_second": 7.625, "step": 4000 }, { "epoch": 6.28, "learning_rate": 2.1557120500782476e-05, "loss": 0.0077, "step": 4010 }, { "epoch": 6.29, "learning_rate": 2.1361502347417842e-05, "loss": 0.0074, "step": 4020 }, { "epoch": 6.31, "learning_rate": 2.1165884194053208e-05, "loss": 0.0076, "step": 4030 }, { "epoch": 6.32, "learning_rate": 2.0970266040688578e-05, "loss": 0.0077, "step": 4040 }, { "epoch": 6.34, "learning_rate": 2.0774647887323944e-05, "loss": 0.0075, "step": 4050 }, { "epoch": 6.35, "learning_rate": 2.0579029733959314e-05, "loss": 0.0066, "step": 4060 }, { "epoch": 6.37, "learning_rate": 2.038341158059468e-05, "loss": 0.0077, "step": 4070 }, { "epoch": 6.38, "learning_rate": 2.0187793427230047e-05, "loss": 0.0076, "step": 4080 }, { "epoch": 6.4, "learning_rate": 1.9992175273865416e-05, "loss": 0.0073, "step": 4090 }, { "epoch": 6.42, "learning_rate": 1.9796557120500783e-05, "loss": 0.0068, "step": 4100 }, { "epoch": 6.42, "eval_accuracy": 0.9290670778720124, "eval_loss": 0.3051210343837738, "eval_runtime": 43.4448, "eval_samples_per_second": 59.708, "eval_steps_per_second": 7.481, "step": 4100 }, { "epoch": 6.43, "learning_rate": 1.9600938967136152e-05, "loss": 0.0067, "step": 4110 }, { "epoch": 6.45, "learning_rate": 1.940532081377152e-05, "loss": 0.0382, "step": 4120 }, { "epoch": 6.46, "learning_rate": 1.9209702660406885e-05, "loss": 0.0071, "step": 4130 }, { "epoch": 6.48, "learning_rate": 1.9014084507042255e-05, "loss": 0.0155, "step": 4140 }, { "epoch": 6.49, "learning_rate": 1.881846635367762e-05, "loss": 0.0075, "step": 4150 }, { "epoch": 6.51, "learning_rate": 1.862284820031299e-05, "loss": 0.0073, "step": 4160 }, { "epoch": 6.53, "learning_rate": 1.8427230046948357e-05, "loss": 0.0076, "step": 4170 }, { "epoch": 6.54, "learning_rate": 1.8231611893583723e-05, "loss": 0.0064, "step": 4180 }, { "epoch": 6.56, "learning_rate": 1.8035993740219093e-05, "loss": 0.0097, "step": 4190 }, { "epoch": 6.57, "learning_rate": 1.784037558685446e-05, "loss": 0.006, "step": 4200 }, { "epoch": 6.57, "eval_accuracy": 0.9298380878951427, "eval_loss": 0.3000366985797882, "eval_runtime": 42.6162, "eval_samples_per_second": 60.869, "eval_steps_per_second": 7.626, "step": 4200 }, { "epoch": 6.59, "learning_rate": 1.764475743348983e-05, "loss": 0.0077, "step": 4210 }, { "epoch": 6.6, "learning_rate": 1.7449139280125195e-05, "loss": 0.0076, "step": 4220 }, { "epoch": 6.62, "learning_rate": 1.7253521126760565e-05, "loss": 0.0069, "step": 4230 }, { "epoch": 6.64, "learning_rate": 1.705790297339593e-05, "loss": 0.0067, "step": 4240 }, { "epoch": 6.65, "learning_rate": 1.6862284820031298e-05, "loss": 0.0075, "step": 4250 }, { "epoch": 6.67, "learning_rate": 1.6666666666666667e-05, "loss": 0.0068, "step": 4260 }, { "epoch": 6.68, "learning_rate": 1.6471048513302034e-05, "loss": 0.0062, "step": 4270 }, { "epoch": 6.7, "learning_rate": 1.6275430359937403e-05, "loss": 0.0066, "step": 4280 }, { "epoch": 6.71, "learning_rate": 1.607981220657277e-05, "loss": 0.0072, "step": 4290 }, { "epoch": 6.73, "learning_rate": 1.5884194053208136e-05, "loss": 0.0075, "step": 4300 }, { "epoch": 6.73, "eval_accuracy": 0.930994602929838, "eval_loss": 0.30098453164100647, "eval_runtime": 42.908, "eval_samples_per_second": 60.455, "eval_steps_per_second": 7.574, "step": 4300 }, { "epoch": 6.74, "learning_rate": 1.5688575899843506e-05, "loss": 0.0066, "step": 4310 }, { "epoch": 6.76, "learning_rate": 1.5492957746478872e-05, "loss": 0.0061, "step": 4320 }, { "epoch": 6.78, "learning_rate": 1.5297339593114242e-05, "loss": 0.0073, "step": 4330 }, { "epoch": 6.79, "learning_rate": 1.5101721439749608e-05, "loss": 0.013, "step": 4340 }, { "epoch": 6.81, "learning_rate": 1.4906103286384976e-05, "loss": 0.0071, "step": 4350 }, { "epoch": 6.82, "learning_rate": 1.4710485133020346e-05, "loss": 0.0063, "step": 4360 }, { "epoch": 6.84, "learning_rate": 1.4514866979655714e-05, "loss": 0.0071, "step": 4370 }, { "epoch": 6.85, "learning_rate": 1.4319248826291082e-05, "loss": 0.0067, "step": 4380 }, { "epoch": 6.87, "learning_rate": 1.412363067292645e-05, "loss": 0.0475, "step": 4390 }, { "epoch": 6.89, "learning_rate": 1.3928012519561818e-05, "loss": 0.0057, "step": 4400 }, { "epoch": 6.89, "eval_accuracy": 0.9298380878951427, "eval_loss": 0.3036852180957794, "eval_runtime": 42.308, "eval_samples_per_second": 61.312, "eval_steps_per_second": 7.682, "step": 4400 }, { "epoch": 6.9, "learning_rate": 1.3732394366197184e-05, "loss": 0.0058, "step": 4410 }, { "epoch": 6.92, "learning_rate": 1.3536776212832552e-05, "loss": 0.0068, "step": 4420 }, { "epoch": 6.93, "learning_rate": 1.334115805946792e-05, "loss": 0.0061, "step": 4430 }, { "epoch": 6.95, "learning_rate": 1.3145539906103288e-05, "loss": 0.0245, "step": 4440 }, { "epoch": 6.96, "learning_rate": 1.2949921752738656e-05, "loss": 0.0304, "step": 4450 }, { "epoch": 6.98, "learning_rate": 1.2754303599374023e-05, "loss": 0.0068, "step": 4460 }, { "epoch": 7.0, "learning_rate": 1.255868544600939e-05, "loss": 0.0066, "step": 4470 }, { "epoch": 7.01, "learning_rate": 1.2363067292644757e-05, "loss": 0.0071, "step": 4480 }, { "epoch": 7.03, "learning_rate": 1.2167449139280125e-05, "loss": 0.0053, "step": 4490 }, { "epoch": 7.04, "learning_rate": 1.1971830985915493e-05, "loss": 0.0058, "step": 4500 }, { "epoch": 7.04, "eval_accuracy": 0.9279105628373169, "eval_loss": 0.30713140964508057, "eval_runtime": 42.5583, "eval_samples_per_second": 60.952, "eval_steps_per_second": 7.637, "step": 4500 }, { "epoch": 7.06, "learning_rate": 1.1776212832550863e-05, "loss": 0.0061, "step": 4510 }, { "epoch": 7.07, "learning_rate": 1.1580594679186229e-05, "loss": 0.0064, "step": 4520 }, { "epoch": 7.09, "learning_rate": 1.1384976525821597e-05, "loss": 0.0069, "step": 4530 }, { "epoch": 7.1, "learning_rate": 1.1189358372456965e-05, "loss": 0.0057, "step": 4540 }, { "epoch": 7.12, "learning_rate": 1.0993740219092333e-05, "loss": 0.0061, "step": 4550 }, { "epoch": 7.14, "learning_rate": 1.0798122065727701e-05, "loss": 0.0062, "step": 4560 }, { "epoch": 7.15, "learning_rate": 1.0602503912363067e-05, "loss": 0.0062, "step": 4570 }, { "epoch": 7.17, "learning_rate": 1.0406885758998435e-05, "loss": 0.0058, "step": 4580 }, { "epoch": 7.18, "learning_rate": 1.0211267605633803e-05, "loss": 0.0073, "step": 4590 }, { "epoch": 7.2, "learning_rate": 1.0015649452269172e-05, "loss": 0.0075, "step": 4600 }, { "epoch": 7.2, "eval_accuracy": 0.9282960678488821, "eval_loss": 0.307522177696228, "eval_runtime": 43.0156, "eval_samples_per_second": 60.304, "eval_steps_per_second": 7.555, "step": 4600 }, { "epoch": 7.21, "learning_rate": 9.82003129890454e-06, "loss": 0.0059, "step": 4610 }, { "epoch": 7.23, "learning_rate": 9.624413145539906e-06, "loss": 0.0064, "step": 4620 }, { "epoch": 7.25, "learning_rate": 9.428794992175274e-06, "loss": 0.0063, "step": 4630 }, { "epoch": 7.26, "learning_rate": 9.233176838810642e-06, "loss": 0.0052, "step": 4640 }, { "epoch": 7.28, "learning_rate": 9.03755868544601e-06, "loss": 0.0053, "step": 4650 }, { "epoch": 7.29, "learning_rate": 8.841940532081378e-06, "loss": 0.0059, "step": 4660 }, { "epoch": 7.31, "learning_rate": 8.646322378716746e-06, "loss": 0.0059, "step": 4670 }, { "epoch": 7.32, "learning_rate": 8.450704225352112e-06, "loss": 0.0058, "step": 4680 }, { "epoch": 7.34, "learning_rate": 8.25508607198748e-06, "loss": 0.0054, "step": 4690 }, { "epoch": 7.36, "learning_rate": 8.059467918622848e-06, "loss": 0.0066, "step": 4700 }, { "epoch": 7.36, "eval_accuracy": 0.9294525828835775, "eval_loss": 0.30765867233276367, "eval_runtime": 43.157, "eval_samples_per_second": 60.106, "eval_steps_per_second": 7.531, "step": 4700 }, { "epoch": 7.37, "learning_rate": 7.863849765258216e-06, "loss": 0.0054, "step": 4710 }, { "epoch": 7.39, "learning_rate": 7.668231611893584e-06, "loss": 0.0064, "step": 4720 }, { "epoch": 7.4, "learning_rate": 7.4726134585289515e-06, "loss": 0.0059, "step": 4730 }, { "epoch": 7.42, "learning_rate": 7.2769953051643195e-06, "loss": 0.0057, "step": 4740 }, { "epoch": 7.43, "learning_rate": 7.081377151799687e-06, "loss": 0.0066, "step": 4750 }, { "epoch": 7.45, "learning_rate": 6.885758998435055e-06, "loss": 0.0113, "step": 4760 }, { "epoch": 7.46, "learning_rate": 6.690140845070423e-06, "loss": 0.0059, "step": 4770 }, { "epoch": 7.48, "learning_rate": 6.49452269170579e-06, "loss": 0.0061, "step": 4780 }, { "epoch": 7.5, "learning_rate": 6.298904538341158e-06, "loss": 0.0069, "step": 4790 }, { "epoch": 7.51, "learning_rate": 6.103286384976526e-06, "loss": 0.0056, "step": 4800 }, { "epoch": 7.51, "eval_accuracy": 0.9294525828835775, "eval_loss": 0.30838659405708313, "eval_runtime": 43.0465, "eval_samples_per_second": 60.26, "eval_steps_per_second": 7.55, "step": 4800 }, { "epoch": 7.53, "learning_rate": 5.907668231611894e-06, "loss": 0.0059, "step": 4810 }, { "epoch": 7.54, "learning_rate": 5.712050078247261e-06, "loss": 0.0054, "step": 4820 }, { "epoch": 7.56, "learning_rate": 5.516431924882629e-06, "loss": 0.0054, "step": 4830 }, { "epoch": 7.57, "learning_rate": 5.320813771517997e-06, "loss": 0.0059, "step": 4840 }, { "epoch": 7.59, "learning_rate": 5.125195618153364e-06, "loss": 0.0061, "step": 4850 }, { "epoch": 7.61, "learning_rate": 4.929577464788732e-06, "loss": 0.0062, "step": 4860 }, { "epoch": 7.62, "learning_rate": 4.7339593114241e-06, "loss": 0.0215, "step": 4870 }, { "epoch": 7.64, "learning_rate": 4.538341158059468e-06, "loss": 0.0057, "step": 4880 }, { "epoch": 7.65, "learning_rate": 4.342723004694836e-06, "loss": 0.0061, "step": 4890 }, { "epoch": 7.67, "learning_rate": 4.1471048513302035e-06, "loss": 0.0053, "step": 4900 }, { "epoch": 7.67, "eval_accuracy": 0.930994602929838, "eval_loss": 0.3063901364803314, "eval_runtime": 42.9974, "eval_samples_per_second": 60.329, "eval_steps_per_second": 7.559, "step": 4900 }, { "epoch": 7.68, "learning_rate": 3.9514866979655715e-06, "loss": 0.006, "step": 4910 }, { "epoch": 7.7, "learning_rate": 3.755868544600939e-06, "loss": 0.0053, "step": 4920 }, { "epoch": 7.72, "learning_rate": 3.560250391236307e-06, "loss": 0.0051, "step": 4930 }, { "epoch": 7.73, "learning_rate": 3.3646322378716747e-06, "loss": 0.0057, "step": 4940 }, { "epoch": 7.75, "learning_rate": 3.1690140845070423e-06, "loss": 0.0259, "step": 4950 }, { "epoch": 7.76, "learning_rate": 2.97339593114241e-06, "loss": 0.0055, "step": 4960 }, { "epoch": 7.78, "learning_rate": 2.777777777777778e-06, "loss": 0.0051, "step": 4970 }, { "epoch": 7.79, "learning_rate": 2.582159624413146e-06, "loss": 0.006, "step": 4980 }, { "epoch": 7.81, "learning_rate": 2.3865414710485135e-06, "loss": 0.0062, "step": 4990 }, { "epoch": 7.82, "learning_rate": 2.190923317683881e-06, "loss": 0.0057, "step": 5000 }, { "epoch": 7.82, "eval_accuracy": 0.9317656129529683, "eval_loss": 0.3068486750125885, "eval_runtime": 42.7527, "eval_samples_per_second": 60.675, "eval_steps_per_second": 7.602, "step": 5000 }, { "epoch": 7.84, "learning_rate": 1.995305164319249e-06, "loss": 0.0057, "step": 5010 }, { "epoch": 7.86, "learning_rate": 1.7996870109546167e-06, "loss": 0.0063, "step": 5020 }, { "epoch": 7.87, "learning_rate": 1.6040688575899843e-06, "loss": 0.0063, "step": 5030 }, { "epoch": 7.89, "learning_rate": 1.4084507042253521e-06, "loss": 0.0061, "step": 5040 }, { "epoch": 7.9, "learning_rate": 1.21283255086072e-06, "loss": 0.0051, "step": 5050 }, { "epoch": 7.92, "learning_rate": 1.0172143974960877e-06, "loss": 0.0061, "step": 5060 }, { "epoch": 7.93, "learning_rate": 8.215962441314555e-07, "loss": 0.0061, "step": 5070 }, { "epoch": 7.95, "learning_rate": 6.259780907668232e-07, "loss": 0.0049, "step": 5080 }, { "epoch": 7.97, "learning_rate": 4.303599374021909e-07, "loss": 0.0049, "step": 5090 }, { "epoch": 7.98, "learning_rate": 2.347417840375587e-07, "loss": 0.0055, "step": 5100 }, { "epoch": 7.98, "eval_accuracy": 0.9317656129529683, "eval_loss": 0.30683887004852295, "eval_runtime": 43.5191, "eval_samples_per_second": 59.606, "eval_steps_per_second": 7.468, "step": 5100 }, { "epoch": 8.0, "learning_rate": 3.912363067292645e-08, "loss": 0.0062, "step": 5110 }, { "epoch": 8.0, "step": 5112, "total_flos": 6.337884979995771e+18, "train_loss": 0.5634685837152139, "train_runtime": 5666.239, "train_samples_per_second": 14.415, "train_steps_per_second": 0.902 } ], "max_steps": 5112, "num_train_epochs": 8, "total_flos": 6.337884979995771e+18, "trial_name": null, "trial_params": null }