{ "best_metric": 2.318600654602051, "best_model_checkpoint": "/data4/share_nlp/data/luannd/78.52.project/weight_saving/PoetGPT_vietnamese_with_deepspeed_v0/checkpoint-7744", "epoch": 11.0, "global_step": 7744, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.3499999999999998e-05, "loss": 5.9125, "step": 10 }, { "epoch": 0.03, "learning_rate": 2.8499999999999998e-05, "loss": 5.6567, "step": 20 }, { "epoch": 0.04, "learning_rate": 4.3499999999999993e-05, "loss": 5.3131, "step": 30 }, { "epoch": 0.06, "learning_rate": 5.6999999999999996e-05, "loss": 4.8718, "step": 40 }, { "epoch": 0.07, "learning_rate": 7.199999999999999e-05, "loss": 4.3081, "step": 50 }, { "epoch": 0.09, "learning_rate": 8.699999999999999e-05, "loss": 3.6688, "step": 60 }, { "epoch": 0.1, "learning_rate": 0.000102, "loss": 3.3306, "step": 70 }, { "epoch": 0.11, "learning_rate": 0.000117, "loss": 3.2425, "step": 80 }, { "epoch": 0.13, "learning_rate": 0.00013199999999999998, "loss": 3.1231, "step": 90 }, { "epoch": 0.14, "learning_rate": 0.000147, "loss": 3.1498, "step": 100 }, { "epoch": 0.16, "learning_rate": 0.000162, "loss": 3.0767, "step": 110 }, { "epoch": 0.17, "learning_rate": 0.00017699999999999997, "loss": 3.0715, "step": 120 }, { "epoch": 0.18, "learning_rate": 0.00019199999999999998, "loss": 3.0144, "step": 130 }, { "epoch": 0.2, "learning_rate": 0.00020699999999999996, "loss": 3.0438, "step": 140 }, { "epoch": 0.21, "learning_rate": 0.00022199999999999998, "loss": 2.9902, "step": 150 }, { "epoch": 0.23, "learning_rate": 0.000237, "loss": 2.9502, "step": 160 }, { "epoch": 0.24, "learning_rate": 0.00025199999999999995, "loss": 2.9778, "step": 170 }, { "epoch": 0.26, "learning_rate": 0.000267, "loss": 2.9055, "step": 180 }, { "epoch": 0.27, "learning_rate": 0.00028199999999999997, "loss": 2.8714, "step": 190 }, { "epoch": 0.28, "learning_rate": 0.00029699999999999996, "loss": 2.8467, "step": 200 }, { "epoch": 0.3, "learning_rate": 0.0002998135381828383, "loss": 2.846, "step": 210 }, { "epoch": 0.31, "learning_rate": 0.0002990568314839864, "loss": 2.8177, "step": 220 }, { "epoch": 0.33, "learning_rate": 0.0002977211629518312, "loss": 2.8653, "step": 230 }, { "epoch": 0.34, "learning_rate": 0.00029581172054786616, "loss": 2.8117, "step": 240 }, { "epoch": 0.36, "learning_rate": 0.00029333592086792107, "loss": 2.8291, "step": 250 }, { "epoch": 0.37, "learning_rate": 0.0002903033803348551, "loss": 2.797, "step": 260 }, { "epoch": 0.38, "learning_rate": 0.00028672587784675096, "loss": 2.7837, "step": 270 }, { "epoch": 0.4, "learning_rate": 0.00028261730902569146, "loss": 2.7499, "step": 280 }, { "epoch": 0.41, "learning_rate": 0.0002779936322448233, "loss": 2.7736, "step": 290 }, { "epoch": 0.43, "learning_rate": 0.00027287280664334875, "loss": 2.7529, "step": 300 }, { "epoch": 0.44, "learning_rate": 0.00026727472237020447, "loss": 2.7289, "step": 310 }, { "epoch": 0.45, "learning_rate": 0.000261221123327374, "loss": 2.7221, "step": 320 }, { "epoch": 0.47, "learning_rate": 0.0002547355227129109, "loss": 2.7534, "step": 330 }, { "epoch": 0.48, "learning_rate": 0.00024784311169171814, "loss": 2.7273, "step": 340 }, { "epoch": 0.5, "learning_rate": 0.0002405706615488216, "loss": 2.705, "step": 350 }, { "epoch": 0.51, "learning_rate": 0.0002329464197051909, "loss": 2.6387, "step": 360 }, { "epoch": 0.53, "learning_rate": 0.000225, "loss": 2.736, "step": 370 }, { "epoch": 0.54, "learning_rate": 0.00021676226766548882, "loss": 2.688, "step": 380 }, { "epoch": 0.55, "learning_rate": 0.0002082652194412042, "loss": 2.7384, "step": 390 }, { "epoch": 0.57, "learning_rate": 0.00019954185929327506, "loss": 2.6948, "step": 400 }, { "epoch": 0.58, "learning_rate": 0.00019062607022145078, "loss": 2.7189, "step": 410 }, { "epoch": 0.6, "learning_rate": 0.00018155248265182435, "loss": 2.6816, "step": 420 }, { "epoch": 0.61, "learning_rate": 0.00017235633992642615, "loss": 2.7302, "step": 430 }, { "epoch": 0.62, "learning_rate": 0.00016307336141214873, "loss": 2.6641, "step": 440 }, { "epoch": 0.64, "learning_rate": 0.00015373960376071093, "loss": 2.7036, "step": 450 }, { "epoch": 0.65, "learning_rate": 0.00014439132085855116, "loss": 2.6852, "step": 460 }, { "epoch": 0.67, "learning_rate": 0.0001350648230106275, "loss": 2.6867, "step": 470 }, { "epoch": 0.68, "learning_rate": 0.000125796335905079, "loss": 2.7016, "step": 480 }, { "epoch": 0.7, "learning_rate": 0.00011662185990655284, "loss": 2.7191, "step": 490 }, { "epoch": 0.71, "learning_rate": 0.00010757703022472587, "loss": 2.7024, "step": 500 }, { "epoch": 0.72, "learning_rate": 9.869697850114969e-05, "loss": 2.6439, "step": 510 }, { "epoch": 0.74, "learning_rate": 9.001619635203888e-05, "loss": 2.7095, "step": 520 }, { "epoch": 0.75, "learning_rate": 8.156840139702554e-05, "loss": 2.684, "step": 530 }, { "epoch": 0.77, "learning_rate": 7.33864062942472e-05, "loss": 2.6504, "step": 540 }, { "epoch": 0.78, "learning_rate": 6.550199129045668e-05, "loss": 2.6889, "step": 550 }, { "epoch": 0.8, "learning_rate": 5.794578078119291e-05, "loss": 2.6694, "step": 560 }, { "epoch": 0.81, "learning_rate": 5.074712436047112e-05, "loss": 2.669, "step": 570 }, { "epoch": 0.82, "learning_rate": 4.3933982822017876e-05, "loss": 2.661, "step": 580 }, { "epoch": 0.84, "learning_rate": 3.753281955483985e-05, "loss": 2.6576, "step": 590 }, { "epoch": 0.85, "learning_rate": 3.15684977549647e-05, "loss": 2.665, "step": 600 }, { "epoch": 0.87, "learning_rate": 2.6064183852600797e-05, "loss": 2.6302, "step": 610 }, { "epoch": 0.88, "learning_rate": 2.1041257529821453e-05, "loss": 2.7063, "step": 620 }, { "epoch": 0.89, "learning_rate": 1.6519228678279718e-05, "loss": 2.6988, "step": 630 }, { "epoch": 0.91, "learning_rate": 1.251566161950357e-05, "loss": 2.6588, "step": 640 }, { "epoch": 0.92, "learning_rate": 9.046106882113751e-06, "loss": 2.6728, "step": 650 }, { "epoch": 0.94, "learning_rate": 6.1240408009518346e-06, "loss": 2.6478, "step": 660 }, { "epoch": 0.95, "learning_rate": 3.760813172726457e-06, "loss": 2.6409, "step": 670 }, { "epoch": 0.97, "learning_rate": 1.9656031714918365e-06, "loss": 2.73, "step": 680 }, { "epoch": 0.98, "learning_rate": 7.453836951897885e-07, "loss": 2.6736, "step": 690 }, { "epoch": 0.99, "learning_rate": 1.0489428174020875e-07, "loss": 2.6633, "step": 700 }, { "epoch": 1.0, "eval_loss": 2.6718015670776367, "eval_runtime": 11.0736, "eval_samples_per_second": 904.038, "eval_steps_per_second": 7.134, "step": 704 }, { "epoch": 1.01, "learning_rate": 4.662269987756317e-08, "loss": 2.6824, "step": 710 }, { "epoch": 1.02, "learning_rate": 5.707952862381681e-07, "loss": 2.6342, "step": 720 }, { "epoch": 1.04, "learning_rate": 1.6753760662307215e-06, "loss": 2.6547, "step": 730 }, { "epoch": 1.05, "learning_rate": 3.356074662104319e-06, "loss": 2.6604, "step": 740 }, { "epoch": 1.07, "learning_rate": 5.606362957498179e-06, "loss": 2.6865, "step": 750 }, { "epoch": 1.08, "learning_rate": 8.417500453744864e-06, "loss": 2.6392, "step": 760 }, { "epoch": 1.09, "learning_rate": 1.1778568219438839e-05, "loss": 2.6564, "step": 770 }, { "epoch": 1.11, "learning_rate": 1.567651130140481e-05, "loss": 2.6074, "step": 780 }, { "epoch": 1.12, "learning_rate": 2.0096189432334208e-05, "loss": 2.6392, "step": 790 }, { "epoch": 1.14, "learning_rate": 2.5020435838132658e-05, "loss": 2.6144, "step": 800 }, { "epoch": 1.15, "learning_rate": 3.0430123916561604e-05, "loss": 2.6486, "step": 810 }, { "epoch": 1.16, "learning_rate": 3.630424152818204e-05, "loss": 2.6122, "step": 820 }, { "epoch": 1.18, "learning_rate": 4.2619972611042214e-05, "loss": 2.6399, "step": 830 }, { "epoch": 1.19, "learning_rate": 4.935278580210442e-05, "loss": 2.6323, "step": 840 }, { "epoch": 1.21, "learning_rate": 5.647652972118994e-05, "loss": 2.6635, "step": 850 }, { "epoch": 1.22, "learning_rate": 6.396353454734303e-05, "loss": 2.6061, "step": 860 }, { "epoch": 1.24, "learning_rate": 7.178471949307521e-05, "loss": 2.5807, "step": 870 }, { "epoch": 1.25, "learning_rate": 7.990970575904072e-05, "loss": 2.6347, "step": 880 }, { "epoch": 1.26, "learning_rate": 8.830693453040826e-05, "loss": 2.6889, "step": 890 }, { "epoch": 1.28, "learning_rate": 9.694378955661275e-05, "loss": 2.6108, "step": 900 }, { "epoch": 1.29, "learning_rate": 0.00010578672383836428, "loss": 2.6338, "step": 910 }, { "epoch": 1.31, "learning_rate": 0.00011480138992984273, "loss": 2.596, "step": 920 }, { "epoch": 1.32, "learning_rate": 0.00012395277334996044, "loss": 2.6379, "step": 930 }, { "epoch": 1.34, "learning_rate": 0.00013320532858450377, "loss": 2.6312, "step": 940 }, { "epoch": 1.35, "learning_rate": 0.00014252311715089535, "loss": 2.6165, "step": 950 }, { "epoch": 1.36, "learning_rate": 0.00015186994718931215, "loss": 2.6148, "step": 960 }, { "epoch": 1.38, "learning_rate": 0.0001612095140379635, "loss": 2.6115, "step": 970 }, { "epoch": 1.39, "learning_rate": 0.00017050554124651096, "loss": 2.6476, "step": 980 }, { "epoch": 1.41, "learning_rate": 0.00017972192147990958, "loss": 2.6233, "step": 990 }, { "epoch": 1.42, "learning_rate": 0.00018882285676537802, "loss": 2.6507, "step": 1000 }, { "epoch": 1.43, "learning_rate": 0.00019777299753775265, "loss": 2.6474, "step": 1010 }, { "epoch": 1.45, "learning_rate": 0.00020653757994315076, "loss": 2.6162, "step": 1020 }, { "epoch": 1.46, "learning_rate": 0.00021508256086763368, "loss": 2.6377, "step": 1030 }, { "epoch": 1.48, "learning_rate": 0.00022337475016639342, "loss": 2.6132, "step": 1040 }, { "epoch": 1.49, "learning_rate": 0.00023138193957986393, "loss": 2.6239, "step": 1050 }, { "epoch": 1.51, "learning_rate": 0.00023907302783602514, "loss": 2.6212, "step": 1060 }, { "epoch": 1.52, "learning_rate": 0.0002464181414529809, "loss": 2.599, "step": 1070 }, { "epoch": 1.53, "learning_rate": 0.000253388750772592, "loss": 2.6263, "step": 1080 }, { "epoch": 1.55, "learning_rate": 0.0002599577807744739, "loss": 2.5971, "step": 1090 }, { "epoch": 1.56, "learning_rate": 0.0002660997162399341, "loss": 2.5784, "step": 1100 }, { "epoch": 1.58, "learning_rate": 0.0002717907008573785, "loss": 2.6068, "step": 1110 }, { "epoch": 1.59, "learning_rate": 0.0002770086298842426, "loss": 2.6139, "step": 1120 }, { "epoch": 1.61, "learning_rate": 0.00028173323600553423, "loss": 2.6228, "step": 1130 }, { "epoch": 1.62, "learning_rate": 0.0002859461680554975, "loss": 2.5909, "step": 1140 }, { "epoch": 1.63, "learning_rate": 0.00028963106229663063, "loss": 2.5944, "step": 1150 }, { "epoch": 1.65, "learning_rate": 0.0002927736059791983, "loss": 2.6474, "step": 1160 }, { "epoch": 1.66, "learning_rate": 0.0002953615929343616, "loss": 2.6134, "step": 1170 }, { "epoch": 1.68, "learning_rate": 0.0002973849709849932, "loss": 2.5762, "step": 1180 }, { "epoch": 1.69, "learning_rate": 0.0002988358809900258, "loss": 2.5956, "step": 1190 }, { "epoch": 1.7, "learning_rate": 0.0002997086873706798, "loss": 2.5878, "step": 1200 }, { "epoch": 1.72, "learning_rate": 0.0003, "loss": 2.6209, "step": 1210 }, { "epoch": 1.73, "learning_rate": 0.0002997086873706798, "loss": 2.6205, "step": 1220 }, { "epoch": 1.75, "learning_rate": 0.0002988358809900258, "loss": 2.5864, "step": 1230 }, { "epoch": 1.76, "learning_rate": 0.00029738497098499324, "loss": 2.5826, "step": 1240 }, { "epoch": 1.78, "learning_rate": 0.00029536159293436166, "loss": 2.5817, "step": 1250 }, { "epoch": 1.79, "learning_rate": 0.0002927736059791983, "loss": 2.5889, "step": 1260 }, { "epoch": 1.8, "learning_rate": 0.00028963106229663063, "loss": 2.5749, "step": 1270 }, { "epoch": 1.82, "learning_rate": 0.0002859461680554975, "loss": 2.6131, "step": 1280 }, { "epoch": 1.83, "learning_rate": 0.00028173323600553434, "loss": 2.5982, "step": 1290 }, { "epoch": 1.85, "learning_rate": 0.0002770086298842427, "loss": 2.5702, "step": 1300 }, { "epoch": 1.86, "learning_rate": 0.0002717907008573784, "loss": 2.5854, "step": 1310 }, { "epoch": 1.88, "learning_rate": 0.00026609971623993406, "loss": 2.554, "step": 1320 }, { "epoch": 1.89, "learning_rate": 0.0002599577807744739, "loss": 2.5928, "step": 1330 }, { "epoch": 1.9, "learning_rate": 0.00025338875077259205, "loss": 2.5768, "step": 1340 }, { "epoch": 1.92, "learning_rate": 0.00024641814145298093, "loss": 2.5395, "step": 1350 }, { "epoch": 1.93, "learning_rate": 0.00023907302783602525, "loss": 2.5265, "step": 1360 }, { "epoch": 1.95, "learning_rate": 0.00023138193957986385, "loss": 2.5535, "step": 1370 }, { "epoch": 1.96, "learning_rate": 0.0002233747501663934, "loss": 2.5308, "step": 1380 }, { "epoch": 1.97, "learning_rate": 0.00021508256086763376, "loss": 2.5611, "step": 1390 }, { "epoch": 1.99, "learning_rate": 0.00020653757994315084, "loss": 2.5567, "step": 1400 }, { "epoch": 2.0, "eval_loss": 2.5756430625915527, "eval_runtime": 11.0382, "eval_samples_per_second": 906.942, "eval_steps_per_second": 7.157, "step": 1408 }, { "epoch": 2.0, "learning_rate": 0.00019777299753775273, "loss": 2.5977, "step": 1410 }, { "epoch": 2.02, "learning_rate": 0.00018882285676537824, "loss": 2.5147, "step": 1420 }, { "epoch": 2.03, "learning_rate": 0.0001797219214799098, "loss": 2.5313, "step": 1430 }, { "epoch": 2.05, "learning_rate": 0.0001705055412465109, "loss": 2.5303, "step": 1440 }, { "epoch": 2.06, "learning_rate": 0.00016120951403796358, "loss": 2.5039, "step": 1450 }, { "epoch": 2.07, "learning_rate": 0.00015186994718931223, "loss": 2.5199, "step": 1460 }, { "epoch": 2.09, "learning_rate": 0.00014252311715089543, "loss": 2.4787, "step": 1470 }, { "epoch": 2.1, "learning_rate": 0.00013320532858450385, "loss": 2.5371, "step": 1480 }, { "epoch": 2.12, "learning_rate": 0.00012395277334996052, "loss": 2.5247, "step": 1490 }, { "epoch": 2.13, "learning_rate": 0.00011480138992984267, "loss": 2.488, "step": 1500 }, { "epoch": 2.14, "learning_rate": 0.00010578672383836424, "loss": 2.5005, "step": 1510 }, { "epoch": 2.16, "learning_rate": 9.694378955661282e-05, "loss": 2.4841, "step": 1520 }, { "epoch": 2.17, "learning_rate": 8.830693453040844e-05, "loss": 2.5214, "step": 1530 }, { "epoch": 2.19, "learning_rate": 7.990970575904079e-05, "loss": 2.4842, "step": 1540 }, { "epoch": 2.2, "learning_rate": 7.178471949307551e-05, "loss": 2.5118, "step": 1550 }, { "epoch": 2.22, "learning_rate": 6.396353454734299e-05, "loss": 2.468, "step": 1560 }, { "epoch": 2.23, "learning_rate": 5.647652972119001e-05, "loss": 2.5586, "step": 1570 }, { "epoch": 2.24, "learning_rate": 4.935278580210444e-05, "loss": 2.4862, "step": 1580 }, { "epoch": 2.26, "learning_rate": 4.2619972611042316e-05, "loss": 2.4773, "step": 1590 }, { "epoch": 2.27, "learning_rate": 3.630424152818206e-05, "loss": 2.4787, "step": 1600 }, { "epoch": 2.29, "learning_rate": 3.0430123916561723e-05, "loss": 2.4914, "step": 1610 }, { "epoch": 2.3, "learning_rate": 2.5020435838132692e-05, "loss": 2.5184, "step": 1620 }, { "epoch": 2.32, "learning_rate": 2.0096189432334177e-05, "loss": 2.5503, "step": 1630 }, { "epoch": 2.33, "learning_rate": 1.5676511301404892e-05, "loss": 2.4975, "step": 1640 }, { "epoch": 2.34, "learning_rate": 1.1778568219438856e-05, "loss": 2.4797, "step": 1650 }, { "epoch": 2.36, "learning_rate": 8.41750045374493e-06, "loss": 2.5437, "step": 1660 }, { "epoch": 2.37, "learning_rate": 5.606362957498212e-06, "loss": 2.5227, "step": 1670 }, { "epoch": 2.39, "learning_rate": 3.356074662104369e-06, "loss": 2.4699, "step": 1680 }, { "epoch": 2.4, "learning_rate": 1.6753760662307048e-06, "loss": 2.5275, "step": 1690 }, { "epoch": 2.41, "learning_rate": 5.707952862381681e-07, "loss": 2.5028, "step": 1700 }, { "epoch": 2.43, "learning_rate": 4.662269987756317e-08, "loss": 2.4967, "step": 1710 }, { "epoch": 2.44, "learning_rate": 1.0489428174020875e-07, "loss": 2.5104, "step": 1720 }, { "epoch": 2.46, "learning_rate": 7.453836951897885e-07, "loss": 2.4775, "step": 1730 }, { "epoch": 2.47, "learning_rate": 1.965603171491803e-06, "loss": 2.4989, "step": 1740 }, { "epoch": 2.49, "learning_rate": 3.76081317272644e-06, "loss": 2.5113, "step": 1750 }, { "epoch": 2.5, "learning_rate": 6.1240408009518185e-06, "loss": 2.5008, "step": 1760 }, { "epoch": 2.51, "learning_rate": 9.046106882113702e-06, "loss": 2.5256, "step": 1770 }, { "epoch": 2.53, "learning_rate": 1.251566161950357e-05, "loss": 2.5026, "step": 1780 }, { "epoch": 2.54, "learning_rate": 1.6519228678279633e-05, "loss": 2.4909, "step": 1790 }, { "epoch": 2.56, "learning_rate": 2.104125752982142e-05, "loss": 2.5, "step": 1800 }, { "epoch": 2.57, "learning_rate": 2.606418385260078e-05, "loss": 2.5199, "step": 1810 }, { "epoch": 2.59, "learning_rate": 3.156849775496477e-05, "loss": 2.4495, "step": 1820 }, { "epoch": 2.6, "learning_rate": 3.753281955483985e-05, "loss": 2.497, "step": 1830 }, { "epoch": 2.61, "learning_rate": 4.3933982822017924e-05, "loss": 2.4602, "step": 1840 }, { "epoch": 2.63, "learning_rate": 5.074712436047102e-05, "loss": 2.4893, "step": 1850 }, { "epoch": 2.64, "learning_rate": 5.794578078119269e-05, "loss": 2.4704, "step": 1860 }, { "epoch": 2.66, "learning_rate": 6.550199129045656e-05, "loss": 2.487, "step": 1870 }, { "epoch": 2.67, "learning_rate": 7.338640629424713e-05, "loss": 2.5259, "step": 1880 }, { "epoch": 2.68, "learning_rate": 8.15684013970256e-05, "loss": 2.5138, "step": 1890 }, { "epoch": 2.7, "learning_rate": 9.001619635203874e-05, "loss": 2.4926, "step": 1900 }, { "epoch": 2.71, "learning_rate": 9.869697850114967e-05, "loss": 2.5033, "step": 1910 }, { "epoch": 2.73, "learning_rate": 0.00010757703022472571, "loss": 2.4881, "step": 1920 }, { "epoch": 2.74, "learning_rate": 0.00011662185990655278, "loss": 2.5102, "step": 1930 }, { "epoch": 2.76, "learning_rate": 0.00012579633590507902, "loss": 2.5005, "step": 1940 }, { "epoch": 2.77, "learning_rate": 0.00013506482301062737, "loss": 2.5107, "step": 1950 }, { "epoch": 2.78, "learning_rate": 0.00014439132085855113, "loss": 2.5019, "step": 1960 }, { "epoch": 2.8, "learning_rate": 0.00015373960376071074, "loss": 2.5012, "step": 1970 }, { "epoch": 2.81, "learning_rate": 0.00016307336141214865, "loss": 2.5013, "step": 1980 }, { "epoch": 2.83, "learning_rate": 0.0001723563399264259, "loss": 2.4734, "step": 1990 }, { "epoch": 2.84, "learning_rate": 0.00018155248265182446, "loss": 2.4939, "step": 2000 }, { "epoch": 2.86, "learning_rate": 0.00019062607022145073, "loss": 2.4969, "step": 2010 }, { "epoch": 2.87, "learning_rate": 0.0001995418592932751, "loss": 2.4625, "step": 2020 }, { "epoch": 2.88, "learning_rate": 0.0002082652194412041, "loss": 2.5128, "step": 2030 }, { "epoch": 2.9, "learning_rate": 0.00021676226766548885, "loss": 2.4877, "step": 2040 }, { "epoch": 2.91, "learning_rate": 0.00022499999999999986, "loss": 2.4856, "step": 2050 }, { "epoch": 2.93, "learning_rate": 0.0002329464197051909, "loss": 2.4225, "step": 2060 }, { "epoch": 2.94, "learning_rate": 0.00024057066154882162, "loss": 2.5323, "step": 2070 }, { "epoch": 2.95, "learning_rate": 0.0002478431116917181, "loss": 2.4751, "step": 2080 }, { "epoch": 2.97, "learning_rate": 0.0002547355227129109, "loss": 2.4638, "step": 2090 }, { "epoch": 2.98, "learning_rate": 0.0002612211233273739, "loss": 2.4553, "step": 2100 }, { "epoch": 3.0, "learning_rate": 0.0002672747223702044, "loss": 2.4885, "step": 2110 }, { "epoch": 3.0, "eval_loss": 2.5283169746398926, "eval_runtime": 11.0137, "eval_samples_per_second": 908.962, "eval_steps_per_second": 7.173, "step": 2112 }, { "epoch": 3.01, "learning_rate": 0.00027287280664334865, "loss": 2.4728, "step": 2120 }, { "epoch": 3.03, "learning_rate": 0.00027799363224482337, "loss": 2.4908, "step": 2130 }, { "epoch": 3.04, "learning_rate": 0.00028261730902569146, "loss": 2.4329, "step": 2140 }, { "epoch": 3.05, "learning_rate": 0.00028672587784675096, "loss": 2.4558, "step": 2150 }, { "epoch": 3.07, "learning_rate": 0.0002903033803348551, "loss": 2.4522, "step": 2160 }, { "epoch": 3.08, "learning_rate": 0.000293335920867921, "loss": 2.4269, "step": 2170 }, { "epoch": 3.1, "learning_rate": 0.00029581172054786616, "loss": 2.4584, "step": 2180 }, { "epoch": 3.11, "learning_rate": 0.0002977211629518312, "loss": 2.4243, "step": 2190 }, { "epoch": 3.12, "learning_rate": 0.0002990568314839864, "loss": 2.4672, "step": 2200 }, { "epoch": 3.14, "learning_rate": 0.0002998135381828383, "loss": 2.4333, "step": 2210 }, { "epoch": 3.15, "learning_rate": 0.0002999883438721462, "loss": 2.3994, "step": 2220 }, { "epoch": 3.17, "learning_rate": 0.000299580569577177, "loss": 2.4639, "step": 2230 }, { "epoch": 3.18, "learning_rate": 0.00029859179916195787, "loss": 2.4175, "step": 2240 }, { "epoch": 3.2, "learning_rate": 0.0002970258731772816, "loss": 2.4103, "step": 2250 }, { "epoch": 3.21, "learning_rate": 0.00029488887394336027, "loss": 2.4623, "step": 2260 }, { "epoch": 3.22, "learning_rate": 0.00029218910192506983, "loss": 2.4217, "step": 2270 }, { "epoch": 3.24, "learning_rate": 0.0002889370434915463, "loss": 2.4537, "step": 2280 }, { "epoch": 3.25, "learning_rate": 0.00028514533018536277, "loss": 2.4471, "step": 2290 }, { "epoch": 3.27, "learning_rate": 0.00028082868965949076, "loss": 2.3821, "step": 2300 }, { "epoch": 3.28, "learning_rate": 0.0002760038884726156, "loss": 2.406, "step": 2310 }, { "epoch": 3.3, "learning_rate": 0.0002706896669650002, "loss": 2.4075, "step": 2320 }, { "epoch": 3.31, "learning_rate": 0.00026490666646784665, "loss": 2.4283, "step": 2330 }, { "epoch": 3.32, "learning_rate": 0.00025867734912889096, "loss": 2.3801, "step": 2340 }, { "epoch": 3.34, "learning_rate": 0.00025202591066563786, "loss": 2.4049, "step": 2350 }, { "epoch": 3.35, "learning_rate": 0.00024497818638512107, "loss": 2.3783, "step": 2360 }, { "epoch": 3.37, "learning_rate": 0.00023756155083521851, "loss": 2.4167, "step": 2370 }, { "epoch": 3.38, "learning_rate": 0.00022980481147730062, "loss": 2.3844, "step": 2380 }, { "epoch": 3.39, "learning_rate": 0.00022173809679319783, "loss": 2.4114, "step": 2390 }, { "epoch": 3.41, "learning_rate": 0.00021339273926110515, "loss": 2.4202, "step": 2400 }, { "epoch": 3.42, "learning_rate": 0.00020480115365495915, "loss": 2.3722, "step": 2410 }, { "epoch": 3.44, "learning_rate": 0.00019599671113999995, "loss": 2.4075, "step": 2420 }, { "epoch": 3.45, "learning_rate": 0.00018701360965354402, "loss": 2.3886, "step": 2430 }, { "epoch": 3.47, "learning_rate": 0.00017788674107443704, "loss": 2.3833, "step": 2440 }, { "epoch": 3.48, "learning_rate": 0.00016865155569712278, "loss": 2.3775, "step": 2450 }, { "epoch": 3.49, "learning_rate": 0.00015934392453672772, "loss": 2.413, "step": 2460 }, { "epoch": 3.51, "learning_rate": 0.00015000000000000004, "loss": 2.3845, "step": 2470 }, { "epoch": 3.52, "learning_rate": 0.00014065607546327242, "loss": 2.3666, "step": 2480 }, { "epoch": 3.54, "learning_rate": 0.00013134844430287736, "loss": 2.4041, "step": 2490 }, { "epoch": 3.55, "learning_rate": 0.0001221132589255631, "loss": 2.4064, "step": 2500 }, { "epoch": 3.57, "learning_rate": 0.00011298639034645613, "loss": 2.4397, "step": 2510 }, { "epoch": 3.58, "learning_rate": 0.00010400328886000018, "loss": 2.4145, "step": 2520 }, { "epoch": 3.59, "learning_rate": 9.519884634504099e-05, "loss": 2.3853, "step": 2530 }, { "epoch": 3.61, "learning_rate": 8.660726073889497e-05, "loss": 2.4271, "step": 2540 }, { "epoch": 3.62, "learning_rate": 7.82619032068023e-05, "loss": 2.3364, "step": 2550 }, { "epoch": 3.64, "learning_rate": 7.019518852269947e-05, "loss": 2.3847, "step": 2560 }, { "epoch": 3.65, "learning_rate": 6.24384491647816e-05, "loss": 2.4106, "step": 2570 }, { "epoch": 3.66, "learning_rate": 5.502181361487904e-05, "loss": 2.3956, "step": 2580 }, { "epoch": 3.68, "learning_rate": 4.7974089334362206e-05, "loss": 2.3839, "step": 2590 }, { "epoch": 3.69, "learning_rate": 4.132265087110915e-05, "loss": 2.3884, "step": 2600 }, { "epoch": 3.71, "learning_rate": 3.5093333532153445e-05, "loss": 2.396, "step": 2610 }, { "epoch": 3.72, "learning_rate": 2.9310333034999828e-05, "loss": 2.3785, "step": 2620 }, { "epoch": 3.74, "learning_rate": 2.3996111527384437e-05, "loss": 2.4004, "step": 2630 }, { "epoch": 3.75, "learning_rate": 1.91713103405093e-05, "loss": 2.3918, "step": 2640 }, { "epoch": 3.76, "learning_rate": 1.4854669814637276e-05, "loss": 2.3066, "step": 2650 }, { "epoch": 3.78, "learning_rate": 1.1062956508453685e-05, "loss": 2.3492, "step": 2660 }, { "epoch": 3.79, "learning_rate": 7.810898074930194e-06, "loss": 2.3618, "step": 2670 }, { "epoch": 3.81, "learning_rate": 5.11112605663977e-06, "loss": 2.3854, "step": 2680 }, { "epoch": 3.82, "learning_rate": 2.974126822718409e-06, "loss": 2.3978, "step": 2690 }, { "epoch": 3.84, "learning_rate": 1.408200838042095e-06, "loss": 2.3669, "step": 2700 }, { "epoch": 3.85, "learning_rate": 4.194304228229806e-07, "loss": 2.3758, "step": 2710 }, { "epoch": 3.86, "learning_rate": 1.1656127853787445e-08, "loss": 2.4071, "step": 2720 }, { "epoch": 3.88, "learning_rate": 1.8646181716164831e-07, "loss": 2.3569, "step": 2730 }, { "epoch": 3.89, "learning_rate": 9.431685160135927e-07, "loss": 2.3831, "step": 2740 }, { "epoch": 3.91, "learning_rate": 2.2788370481687635e-06, "loss": 2.3951, "step": 2750 }, { "epoch": 3.92, "learning_rate": 4.188279452133741e-06, "loss": 2.4046, "step": 2760 }, { "epoch": 3.93, "learning_rate": 6.664079132078764e-06, "loss": 2.3911, "step": 2770 }, { "epoch": 3.95, "learning_rate": 9.696619665144767e-06, "loss": 2.3894, "step": 2780 }, { "epoch": 3.96, "learning_rate": 1.3274122153249145e-05, "loss": 2.4069, "step": 2790 }, { "epoch": 3.98, "learning_rate": 1.7382690974308465e-05, "loss": 2.4119, "step": 2800 }, { "epoch": 3.99, "learning_rate": 2.2006367755176625e-05, "loss": 2.3552, "step": 2810 }, { "epoch": 4.0, "eval_loss": 2.4395244121551514, "eval_runtime": 11.0583, "eval_samples_per_second": 905.292, "eval_steps_per_second": 7.144, "step": 2816 }, { "epoch": 4.01, "learning_rate": 2.712719335665126e-05, "loss": 2.3469, "step": 2820 }, { "epoch": 4.02, "learning_rate": 3.272527762979563e-05, "loss": 2.3646, "step": 2830 }, { "epoch": 4.03, "learning_rate": 3.877887667262582e-05, "loss": 2.3076, "step": 2840 }, { "epoch": 4.05, "learning_rate": 4.526447728708897e-05, "loss": 2.3542, "step": 2850 }, { "epoch": 4.06, "learning_rate": 5.2156888308281784e-05, "loss": 2.3176, "step": 2860 }, { "epoch": 4.08, "learning_rate": 5.9429338451178436e-05, "loss": 2.3324, "step": 2870 }, { "epoch": 4.09, "learning_rate": 6.705358029480876e-05, "loss": 2.3942, "step": 2880 }, { "epoch": 4.11, "learning_rate": 7.499999999999976e-05, "loss": 2.3256, "step": 2890 }, { "epoch": 4.12, "learning_rate": 8.323773233451096e-05, "loss": 2.3241, "step": 2900 }, { "epoch": 4.13, "learning_rate": 9.173478055879573e-05, "loss": 2.3697, "step": 2910 }, { "epoch": 4.15, "learning_rate": 0.00010045814070672496, "loss": 2.3174, "step": 2920 }, { "epoch": 4.16, "learning_rate": 0.00010937392977854936, "loss": 2.3787, "step": 2930 }, { "epoch": 4.18, "learning_rate": 0.00011844751734817587, "loss": 2.3298, "step": 2940 }, { "epoch": 4.19, "learning_rate": 0.00012764366007357364, "loss": 2.2992, "step": 2950 }, { "epoch": 4.2, "learning_rate": 0.00013692663858785116, "loss": 2.3446, "step": 2960 }, { "epoch": 4.22, "learning_rate": 0.00014626039623928907, "loss": 2.3374, "step": 2970 }, { "epoch": 4.23, "learning_rate": 0.00015560867914144898, "loss": 2.345, "step": 2980 }, { "epoch": 4.25, "learning_rate": 0.00016493517698937217, "loss": 2.3703, "step": 2990 }, { "epoch": 4.26, "learning_rate": 0.0001742036640949208, "loss": 2.3734, "step": 3000 }, { "epoch": 4.28, "learning_rate": 0.00018337814009344703, "loss": 2.3125, "step": 3010 }, { "epoch": 4.29, "learning_rate": 0.00019242296977527412, "loss": 2.3347, "step": 3020 }, { "epoch": 4.3, "learning_rate": 0.00020130302149884988, "loss": 2.406, "step": 3030 }, { "epoch": 4.32, "learning_rate": 0.00020998380364796131, "loss": 2.3703, "step": 3040 }, { "epoch": 4.33, "learning_rate": 0.0002184315986029747, "loss": 2.3179, "step": 3050 }, { "epoch": 4.35, "learning_rate": 0.0002266135937057527, "loss": 2.3096, "step": 3060 }, { "epoch": 4.36, "learning_rate": 0.00023449800870954326, "loss": 2.372, "step": 3070 }, { "epoch": 4.38, "learning_rate": 0.00024205421921880715, "loss": 2.3321, "step": 3080 }, { "epoch": 4.39, "learning_rate": 0.00024925287563952903, "loss": 2.3247, "step": 3090 }, { "epoch": 4.4, "learning_rate": 0.0002560660171779819, "loss": 2.3729, "step": 3100 }, { "epoch": 4.42, "learning_rate": 0.00026246718044516, "loss": 2.3266, "step": 3110 }, { "epoch": 4.43, "learning_rate": 0.0002684315022450353, "loss": 2.3244, "step": 3120 }, { "epoch": 4.45, "learning_rate": 0.00027393581614739896, "loss": 2.3222, "step": 3130 }, { "epoch": 4.46, "learning_rate": 0.0002789587424701784, "loss": 2.3556, "step": 3140 }, { "epoch": 4.47, "learning_rate": 0.00028348077132172016, "loss": 2.3062, "step": 3150 }, { "epoch": 4.49, "learning_rate": 0.0002874843383804963, "loss": 2.3373, "step": 3160 }, { "epoch": 4.5, "learning_rate": 0.0002909538931178862, "loss": 2.3501, "step": 3170 }, { "epoch": 4.52, "learning_rate": 0.0002938759591990482, "loss": 2.3569, "step": 3180 }, { "epoch": 4.53, "learning_rate": 0.00029623918682727355, "loss": 2.3147, "step": 3190 }, { "epoch": 4.55, "learning_rate": 0.0002980343968285081, "loss": 2.2977, "step": 3200 }, { "epoch": 4.56, "learning_rate": 0.0002992546163048102, "loss": 2.3241, "step": 3210 }, { "epoch": 4.57, "learning_rate": 0.0002998951057182598, "loss": 2.3418, "step": 3220 }, { "epoch": 4.59, "learning_rate": 0.0002999533773001224, "loss": 2.3403, "step": 3230 }, { "epoch": 4.6, "learning_rate": 0.0002994292047137618, "loss": 2.306, "step": 3240 }, { "epoch": 4.62, "learning_rate": 0.00029832462393376933, "loss": 2.2761, "step": 3250 }, { "epoch": 4.63, "learning_rate": 0.0002966439253378957, "loss": 2.3357, "step": 3260 }, { "epoch": 4.64, "learning_rate": 0.0002943936370425018, "loss": 2.3251, "step": 3270 }, { "epoch": 4.66, "learning_rate": 0.0002915824995462553, "loss": 2.3676, "step": 3280 }, { "epoch": 4.67, "learning_rate": 0.00028822143178056103, "loss": 2.3547, "step": 3290 }, { "epoch": 4.69, "learning_rate": 0.00028432348869859505, "loss": 2.3155, "step": 3300 }, { "epoch": 4.7, "learning_rate": 0.00027990381056766585, "loss": 2.3275, "step": 3310 }, { "epoch": 4.72, "learning_rate": 0.00027497956416186735, "loss": 2.3019, "step": 3320 }, { "epoch": 4.73, "learning_rate": 0.0002695698760834384, "loss": 2.3236, "step": 3330 }, { "epoch": 4.74, "learning_rate": 0.00026369575847181784, "loss": 2.2911, "step": 3340 }, { "epoch": 4.76, "learning_rate": 0.000257380027388958, "loss": 2.3297, "step": 3350 }, { "epoch": 4.77, "learning_rate": 0.0002506472141978957, "loss": 2.3119, "step": 3360 }, { "epoch": 4.79, "learning_rate": 0.0002435234702788101, "loss": 2.3075, "step": 3370 }, { "epoch": 4.8, "learning_rate": 0.00023603646545265687, "loss": 2.3113, "step": 3380 }, { "epoch": 4.82, "learning_rate": 0.00022821528050692507, "loss": 2.3155, "step": 3390 }, { "epoch": 4.83, "learning_rate": 0.00022009029424095958, "loss": 2.315, "step": 3400 }, { "epoch": 4.84, "learning_rate": 0.00021169306546959193, "loss": 2.281, "step": 3410 }, { "epoch": 4.86, "learning_rate": 0.00020305621044338731, "loss": 2.2756, "step": 3420 }, { "epoch": 4.87, "learning_rate": 0.00019421327616163563, "loss": 2.2932, "step": 3430 }, { "epoch": 4.89, "learning_rate": 0.0001851986100701572, "loss": 2.2784, "step": 3440 }, { "epoch": 4.9, "learning_rate": 0.00017604722665003937, "loss": 2.3281, "step": 3450 }, { "epoch": 4.91, "learning_rate": 0.00016679467141549642, "loss": 2.3306, "step": 3460 }, { "epoch": 4.93, "learning_rate": 0.00015747688284910473, "loss": 2.311, "step": 3470 }, { "epoch": 4.94, "learning_rate": 0.0001481300528106878, "loss": 2.3097, "step": 3480 }, { "epoch": 4.96, "learning_rate": 0.00013879048596203628, "loss": 2.3104, "step": 3490 }, { "epoch": 4.97, "learning_rate": 0.00012949445875348934, "loss": 2.2894, "step": 3500 }, { "epoch": 4.99, "learning_rate": 0.00012027807852009062, "loss": 2.3139, "step": 3510 }, { "epoch": 5.0, "learning_rate": 0.00011117714323462205, "loss": 2.3084, "step": 3520 }, { "epoch": 5.0, "eval_loss": 2.3810958862304688, "eval_runtime": 11.048, "eval_samples_per_second": 906.136, "eval_steps_per_second": 7.151, "step": 3520 }, { "epoch": 5.01, "learning_rate": 0.00010222700246224737, "loss": 2.2791, "step": 3530 }, { "epoch": 5.03, "learning_rate": 9.346242005684964e-05, "loss": 2.2723, "step": 3540 }, { "epoch": 5.04, "learning_rate": 8.491743913236614e-05, "loss": 2.2202, "step": 3550 }, { "epoch": 5.06, "learning_rate": 7.662524983360638e-05, "loss": 2.2527, "step": 3560 }, { "epoch": 5.07, "learning_rate": 6.861806042013623e-05, "loss": 2.2603, "step": 3570 }, { "epoch": 5.09, "learning_rate": 6.092697216397482e-05, "loss": 2.2643, "step": 3580 }, { "epoch": 5.1, "learning_rate": 5.3581858547019076e-05, "loss": 2.3002, "step": 3590 }, { "epoch": 5.11, "learning_rate": 4.661124922740784e-05, "loss": 2.2654, "step": 3600 }, { "epoch": 5.13, "learning_rate": 4.004221922552624e-05, "loss": 2.2413, "step": 3610 }, { "epoch": 5.14, "learning_rate": 3.3900283760066006e-05, "loss": 2.2112, "step": 3620 }, { "epoch": 5.16, "learning_rate": 2.8209299142621573e-05, "loss": 2.2311, "step": 3630 }, { "epoch": 5.17, "learning_rate": 2.2991370115757362e-05, "loss": 2.258, "step": 3640 }, { "epoch": 5.18, "learning_rate": 1.8266763994465914e-05, "loss": 2.2304, "step": 3650 }, { "epoch": 5.2, "learning_rate": 1.4053831944502642e-05, "loss": 2.2356, "step": 3660 }, { "epoch": 5.21, "learning_rate": 1.0368937703369245e-05, "loss": 2.2375, "step": 3670 }, { "epoch": 5.23, "learning_rate": 7.226394020801679e-06, "loss": 2.2039, "step": 3680 }, { "epoch": 5.24, "learning_rate": 4.6384070656383054e-06, "loss": 2.232, "step": 3690 }, { "epoch": 5.26, "learning_rate": 2.6150290150067422e-06, "loss": 2.2633, "step": 3700 }, { "epoch": 5.27, "learning_rate": 1.1641190099741572e-06, "loss": 2.2459, "step": 3710 }, { "epoch": 5.28, "learning_rate": 2.913126293202395e-07, "loss": 2.2498, "step": 3720 }, { "epoch": 5.3, "learning_rate": 0.0, "loss": 2.228, "step": 3730 }, { "epoch": 5.31, "learning_rate": 2.913126293202228e-07, "loss": 2.2358, "step": 3740 }, { "epoch": 5.33, "learning_rate": 1.1641190099741237e-06, "loss": 2.2618, "step": 3750 }, { "epoch": 5.34, "learning_rate": 2.6150290150066923e-06, "loss": 2.2322, "step": 3760 }, { "epoch": 5.36, "learning_rate": 4.6384070656382385e-06, "loss": 2.2604, "step": 3770 }, { "epoch": 5.37, "learning_rate": 7.2263940208015954e-06, "loss": 2.2499, "step": 3780 }, { "epoch": 5.38, "learning_rate": 1.0368937703369145e-05, "loss": 2.2434, "step": 3790 }, { "epoch": 5.4, "learning_rate": 1.4053831944502525e-05, "loss": 2.2428, "step": 3800 }, { "epoch": 5.41, "learning_rate": 1.82667639944658e-05, "loss": 2.2561, "step": 3810 }, { "epoch": 5.43, "learning_rate": 2.299137011575723e-05, "loss": 2.2548, "step": 3820 }, { "epoch": 5.44, "learning_rate": 2.820929914262142e-05, "loss": 2.2404, "step": 3830 }, { "epoch": 5.45, "learning_rate": 3.3900283760065837e-05, "loss": 2.2243, "step": 3840 }, { "epoch": 5.47, "learning_rate": 4.004221922552608e-05, "loss": 2.2692, "step": 3850 }, { "epoch": 5.48, "learning_rate": 4.6611249227407644e-05, "loss": 2.2215, "step": 3860 }, { "epoch": 5.5, "learning_rate": 5.358185854701887e-05, "loss": 2.2461, "step": 3870 }, { "epoch": 5.51, "learning_rate": 6.0926972163974606e-05, "loss": 2.2281, "step": 3880 }, { "epoch": 5.53, "learning_rate": 6.861806042013602e-05, "loss": 2.2252, "step": 3890 }, { "epoch": 5.54, "learning_rate": 7.662524983360616e-05, "loss": 2.2318, "step": 3900 }, { "epoch": 5.55, "learning_rate": 8.491743913236591e-05, "loss": 2.2259, "step": 3910 }, { "epoch": 5.57, "learning_rate": 9.34624200568494e-05, "loss": 2.195, "step": 3920 }, { "epoch": 5.58, "learning_rate": 0.00010222700246224714, "loss": 2.297, "step": 3930 }, { "epoch": 5.6, "learning_rate": 0.00011117714323462178, "loss": 2.245, "step": 3940 }, { "epoch": 5.61, "learning_rate": 0.00012027807852009036, "loss": 2.2401, "step": 3950 }, { "epoch": 5.62, "learning_rate": 0.0001294944587534891, "loss": 2.2179, "step": 3960 }, { "epoch": 5.64, "learning_rate": 0.00013879048596203604, "loss": 2.2755, "step": 3970 }, { "epoch": 5.65, "learning_rate": 0.00014813005281068752, "loss": 2.2392, "step": 3980 }, { "epoch": 5.67, "learning_rate": 0.00015747688284910446, "loss": 2.2611, "step": 3990 }, { "epoch": 5.68, "learning_rate": 0.00016679467141549617, "loss": 2.2239, "step": 4000 }, { "epoch": 5.7, "learning_rate": 0.0001760472266500391, "loss": 2.2209, "step": 4010 }, { "epoch": 5.71, "learning_rate": 0.00018519861007015696, "loss": 2.2581, "step": 4020 }, { "epoch": 5.72, "learning_rate": 0.00019421327616163538, "loss": 2.2747, "step": 4030 }, { "epoch": 5.74, "learning_rate": 0.0002030562104433871, "loss": 2.2458, "step": 4040 }, { "epoch": 5.75, "learning_rate": 0.00021169306546959168, "loss": 2.2509, "step": 4050 }, { "epoch": 5.77, "learning_rate": 0.00022009029424095936, "loss": 2.2503, "step": 4060 }, { "epoch": 5.78, "learning_rate": 0.00022821528050692485, "loss": 2.2346, "step": 4070 }, { "epoch": 5.8, "learning_rate": 0.00023603646545265668, "loss": 2.2528, "step": 4080 }, { "epoch": 5.81, "learning_rate": 0.00024352347027880992, "loss": 2.2423, "step": 4090 }, { "epoch": 5.82, "learning_rate": 0.0002506472141978955, "loss": 2.2623, "step": 4100 }, { "epoch": 5.84, "learning_rate": 0.0002573800273889578, "loss": 2.2391, "step": 4110 }, { "epoch": 5.85, "learning_rate": 0.0002636957584718177, "loss": 2.252, "step": 4120 }, { "epoch": 5.87, "learning_rate": 0.0002695698760834382, "loss": 2.2544, "step": 4130 }, { "epoch": 5.88, "learning_rate": 0.0002749795641618672, "loss": 2.2513, "step": 4140 }, { "epoch": 5.89, "learning_rate": 0.00027990381056766574, "loss": 2.2127, "step": 4150 }, { "epoch": 5.91, "learning_rate": 0.0002843234886985949, "loss": 2.2516, "step": 4160 }, { "epoch": 5.92, "learning_rate": 0.000288221431780561, "loss": 2.2518, "step": 4170 }, { "epoch": 5.94, "learning_rate": 0.0002915824995462552, "loss": 2.2243, "step": 4180 }, { "epoch": 5.95, "learning_rate": 0.00029439363704250176, "loss": 2.2356, "step": 4190 }, { "epoch": 5.97, "learning_rate": 0.00029664392533789563, "loss": 2.2513, "step": 4200 }, { "epoch": 5.98, "learning_rate": 0.0002983246239337692, "loss": 2.2249, "step": 4210 }, { "epoch": 5.99, "learning_rate": 0.0002994292047137618, "loss": 2.2587, "step": 4220 }, { "epoch": 6.0, "eval_loss": 2.3698606491088867, "eval_runtime": 11.0095, "eval_samples_per_second": 909.309, "eval_steps_per_second": 7.176, "step": 4224 }, { "epoch": 6.01, "learning_rate": 0.0002999533773001224, "loss": 2.2167, "step": 4230 }, { "epoch": 6.02, "learning_rate": 0.0002998951057182598, "loss": 2.1821, "step": 4240 }, { "epoch": 6.04, "learning_rate": 0.0002992546163048102, "loss": 2.2571, "step": 4250 }, { "epoch": 6.05, "learning_rate": 0.0002980343968285081, "loss": 2.213, "step": 4260 }, { "epoch": 6.07, "learning_rate": 0.0002962391868272736, "loss": 2.1956, "step": 4270 }, { "epoch": 6.08, "learning_rate": 0.0002938759591990481, "loss": 2.2039, "step": 4280 }, { "epoch": 6.09, "learning_rate": 0.0002909538931178863, "loss": 2.2088, "step": 4290 }, { "epoch": 6.11, "learning_rate": 0.00028748433838049643, "loss": 2.1783, "step": 4300 }, { "epoch": 6.12, "learning_rate": 0.0002834807713217205, "loss": 2.2402, "step": 4310 }, { "epoch": 6.14, "learning_rate": 0.0002789587424701785, "loss": 2.1845, "step": 4320 }, { "epoch": 6.15, "learning_rate": 0.0002739358161473994, "loss": 2.2124, "step": 4330 }, { "epoch": 6.16, "learning_rate": 0.0002684315022450354, "loss": 2.1913, "step": 4340 }, { "epoch": 6.18, "learning_rate": 0.00026246718044516056, "loss": 2.2663, "step": 4350 }, { "epoch": 6.19, "learning_rate": 0.00025606601717798207, "loss": 2.2365, "step": 4360 }, { "epoch": 6.21, "learning_rate": 0.0002492528756395288, "loss": 2.2329, "step": 4370 }, { "epoch": 6.22, "learning_rate": 0.00024205421921880737, "loss": 2.1998, "step": 4380 }, { "epoch": 6.24, "learning_rate": 0.00023449800870954305, "loss": 2.2371, "step": 4390 }, { "epoch": 6.25, "learning_rate": 0.00022661359370575293, "loss": 2.2358, "step": 4400 }, { "epoch": 6.26, "learning_rate": 0.00021843159860297448, "loss": 2.1672, "step": 4410 }, { "epoch": 6.28, "learning_rate": 0.00020998380364796156, "loss": 2.1938, "step": 4420 }, { "epoch": 6.29, "learning_rate": 0.00020130302149885012, "loss": 2.1829, "step": 4430 }, { "epoch": 6.31, "learning_rate": 0.00019242296977527433, "loss": 2.2059, "step": 4440 }, { "epoch": 6.32, "learning_rate": 0.0001833781400934473, "loss": 2.2004, "step": 4450 }, { "epoch": 6.34, "learning_rate": 0.00017420366409492155, "loss": 2.2182, "step": 4460 }, { "epoch": 6.35, "learning_rate": 0.00016493517698937244, "loss": 2.2017, "step": 4470 }, { "epoch": 6.36, "learning_rate": 0.0001556086791414492, "loss": 2.173, "step": 4480 }, { "epoch": 6.38, "learning_rate": 0.00014626039623928932, "loss": 2.1953, "step": 4490 }, { "epoch": 6.39, "learning_rate": 0.0001369266385878509, "loss": 2.2158, "step": 4500 }, { "epoch": 6.41, "learning_rate": 0.00012764366007357388, "loss": 2.1664, "step": 4510 }, { "epoch": 6.42, "learning_rate": 0.0001184475173481756, "loss": 2.1875, "step": 4520 }, { "epoch": 6.43, "learning_rate": 0.0001093739297785496, "loss": 2.2206, "step": 4530 }, { "epoch": 6.45, "learning_rate": 0.0001004581407067247, "loss": 2.2095, "step": 4540 }, { "epoch": 6.46, "learning_rate": 9.173478055879596e-05, "loss": 2.218, "step": 4550 }, { "epoch": 6.48, "learning_rate": 8.323773233451119e-05, "loss": 2.1932, "step": 4560 }, { "epoch": 6.49, "learning_rate": 7.500000000000044e-05, "loss": 2.2209, "step": 4570 }, { "epoch": 6.51, "learning_rate": 6.705358029480897e-05, "loss": 2.1806, "step": 4580 }, { "epoch": 6.52, "learning_rate": 5.942933845117864e-05, "loss": 2.2136, "step": 4590 }, { "epoch": 6.53, "learning_rate": 5.215688830828199e-05, "loss": 2.2195, "step": 4600 }, { "epoch": 6.55, "learning_rate": 4.5264477287089516e-05, "loss": 2.1989, "step": 4610 }, { "epoch": 6.56, "learning_rate": 3.877887667262599e-05, "loss": 2.1715, "step": 4620 }, { "epoch": 6.58, "learning_rate": 3.272527762979546e-05, "loss": 2.1788, "step": 4630 }, { "epoch": 6.59, "learning_rate": 2.7127193356651412e-05, "loss": 2.1751, "step": 4640 }, { "epoch": 6.61, "learning_rate": 2.200636775517649e-05, "loss": 2.2413, "step": 4650 }, { "epoch": 6.62, "learning_rate": 1.7382690974308584e-05, "loss": 2.1537, "step": 4660 }, { "epoch": 6.63, "learning_rate": 1.3274122153249028e-05, "loss": 2.2157, "step": 4670 }, { "epoch": 6.65, "learning_rate": 9.696619665145034e-06, "loss": 2.2003, "step": 4680 }, { "epoch": 6.66, "learning_rate": 6.664079132078831e-06, "loss": 2.1929, "step": 4690 }, { "epoch": 6.68, "learning_rate": 4.188279452133875e-06, "loss": 2.2119, "step": 4700 }, { "epoch": 6.69, "learning_rate": 2.2788370481688135e-06, "loss": 2.1549, "step": 4710 }, { "epoch": 6.7, "learning_rate": 9.431685160136759e-07, "loss": 2.1555, "step": 4720 }, { "epoch": 6.72, "learning_rate": 1.8646181716164831e-07, "loss": 2.1977, "step": 4730 }, { "epoch": 6.73, "learning_rate": 1.1656127853787445e-08, "loss": 2.1932, "step": 4740 }, { "epoch": 6.75, "learning_rate": 4.194304228229639e-07, "loss": 2.1683, "step": 4750 }, { "epoch": 6.76, "learning_rate": 1.408200838042145e-06, "loss": 2.1984, "step": 4760 }, { "epoch": 6.78, "learning_rate": 2.974126822718409e-06, "loss": 2.1759, "step": 4770 }, { "epoch": 6.79, "learning_rate": 5.11112605663977e-06, "loss": 2.1944, "step": 4780 }, { "epoch": 6.8, "learning_rate": 7.810898074930111e-06, "loss": 2.1653, "step": 4790 }, { "epoch": 6.82, "learning_rate": 1.1062956508453785e-05, "loss": 2.1588, "step": 4800 }, { "epoch": 6.83, "learning_rate": 1.485466981463706e-05, "loss": 2.2177, "step": 4810 }, { "epoch": 6.85, "learning_rate": 1.9171310340509167e-05, "loss": 2.1766, "step": 4820 }, { "epoch": 6.86, "learning_rate": 2.399611152738402e-05, "loss": 2.2032, "step": 4830 }, { "epoch": 6.88, "learning_rate": 2.9310333034999828e-05, "loss": 2.1885, "step": 4840 }, { "epoch": 6.89, "learning_rate": 3.509333353215311e-05, "loss": 2.1923, "step": 4850 }, { "epoch": 6.9, "learning_rate": 4.1322650871108964e-05, "loss": 2.2324, "step": 4860 }, { "epoch": 6.92, "learning_rate": 4.797408933436242e-05, "loss": 2.2365, "step": 4870 }, { "epoch": 6.93, "learning_rate": 5.5021813614879056e-05, "loss": 2.2018, "step": 4880 }, { "epoch": 6.95, "learning_rate": 6.243844916478162e-05, "loss": 2.2164, "step": 4890 }, { "epoch": 6.96, "learning_rate": 7.019518852269926e-05, "loss": 2.1815, "step": 4900 }, { "epoch": 6.97, "learning_rate": 7.826190320680255e-05, "loss": 2.1761, "step": 4910 }, { "epoch": 6.99, "learning_rate": 8.660726073889499e-05, "loss": 2.1938, "step": 4920 }, { "epoch": 7.0, "eval_loss": 2.346975326538086, "eval_runtime": 11.0292, "eval_samples_per_second": 907.68, "eval_steps_per_second": 7.163, "step": 4928 }, { "epoch": 7.0, "learning_rate": 9.519884634504074e-05, "loss": 2.157, "step": 4930 }, { "epoch": 7.02, "learning_rate": 0.00010400328885999944, "loss": 2.1342, "step": 4940 }, { "epoch": 7.03, "learning_rate": 0.00011298639034645615, "loss": 2.169, "step": 4950 }, { "epoch": 7.05, "learning_rate": 0.00012211325892556206, "loss": 2.1329, "step": 4960 }, { "epoch": 7.06, "learning_rate": 0.0001313484443028771, "loss": 2.1549, "step": 4970 }, { "epoch": 7.07, "learning_rate": 0.00014065607546327163, "loss": 2.1629, "step": 4980 }, { "epoch": 7.09, "learning_rate": 0.00014999999999999955, "loss": 2.1504, "step": 4990 }, { "epoch": 7.1, "learning_rate": 0.000159343924536728, "loss": 2.1788, "step": 5000 }, { "epoch": 7.12, "learning_rate": 0.0001686515556971225, "loss": 2.1536, "step": 5010 }, { "epoch": 7.13, "learning_rate": 0.00017788674107443756, "loss": 2.1729, "step": 5020 }, { "epoch": 7.14, "learning_rate": 0.00018701360965354348, "loss": 2.1063, "step": 5030 }, { "epoch": 7.16, "learning_rate": 0.0001959967111400002, "loss": 2.1522, "step": 5040 }, { "epoch": 7.17, "learning_rate": 0.0002048011536549589, "loss": 2.1394, "step": 5050 }, { "epoch": 7.19, "learning_rate": 0.00021339273926110466, "loss": 2.1487, "step": 5060 }, { "epoch": 7.2, "learning_rate": 0.00022173809679319713, "loss": 2.1357, "step": 5070 }, { "epoch": 7.22, "learning_rate": 0.00022980481147730043, "loss": 2.1608, "step": 5080 }, { "epoch": 7.23, "learning_rate": 0.00023756155083521808, "loss": 2.1628, "step": 5090 }, { "epoch": 7.24, "learning_rate": 0.00024497818638512063, "loss": 2.1579, "step": 5100 }, { "epoch": 7.26, "learning_rate": 0.0002520259106656373, "loss": 2.1437, "step": 5110 }, { "epoch": 7.27, "learning_rate": 0.0002586773491288908, "loss": 2.173, "step": 5120 }, { "epoch": 7.29, "learning_rate": 0.00026490666646784665, "loss": 2.1681, "step": 5130 }, { "epoch": 7.3, "learning_rate": 0.00027068966696499995, "loss": 2.168, "step": 5140 }, { "epoch": 7.32, "learning_rate": 0.00027600388847261577, "loss": 2.1713, "step": 5150 }, { "epoch": 7.33, "learning_rate": 0.00028082868965949065, "loss": 2.1775, "step": 5160 }, { "epoch": 7.34, "learning_rate": 0.00028514533018536277, "loss": 2.1691, "step": 5170 }, { "epoch": 7.36, "learning_rate": 0.00028893704349154605, "loss": 2.201, "step": 5180 }, { "epoch": 7.37, "learning_rate": 0.0002921891019250697, "loss": 2.1381, "step": 5190 }, { "epoch": 7.39, "learning_rate": 0.0002948888739433601, "loss": 2.1945, "step": 5200 }, { "epoch": 7.4, "learning_rate": 0.0002970258731772815, "loss": 2.212, "step": 5210 }, { "epoch": 7.41, "learning_rate": 0.00029859179916195776, "loss": 2.1935, "step": 5220 }, { "epoch": 7.43, "learning_rate": 0.00029958056957717696, "loss": 2.1762, "step": 5230 }, { "epoch": 7.44, "learning_rate": 0.0002999883438721462, "loss": 2.1784, "step": 5240 }, { "epoch": 7.46, "learning_rate": 0.00029981353818283837, "loss": 2.156, "step": 5250 }, { "epoch": 7.47, "learning_rate": 0.00029905683148398634, "loss": 2.1444, "step": 5260 }, { "epoch": 7.49, "learning_rate": 0.0002977211629518312, "loss": 2.1508, "step": 5270 }, { "epoch": 7.5, "learning_rate": 0.00029581172054786616, "loss": 2.1578, "step": 5280 }, { "epoch": 7.51, "learning_rate": 0.00029333592086792123, "loss": 2.1284, "step": 5290 }, { "epoch": 7.53, "learning_rate": 0.0002903033803348551, "loss": 2.1465, "step": 5300 }, { "epoch": 7.54, "learning_rate": 0.0002867258778467511, "loss": 2.1643, "step": 5310 }, { "epoch": 7.56, "learning_rate": 0.0002826173090256916, "loss": 2.2208, "step": 5320 }, { "epoch": 7.57, "learning_rate": 0.00027799363224482364, "loss": 2.1873, "step": 5330 }, { "epoch": 7.59, "learning_rate": 0.00027287280664334875, "loss": 2.1881, "step": 5340 }, { "epoch": 7.6, "learning_rate": 0.00026727472237020506, "loss": 2.1576, "step": 5350 }, { "epoch": 7.61, "learning_rate": 0.0002612211233273738, "loss": 2.1402, "step": 5360 }, { "epoch": 7.63, "learning_rate": 0.0002547355227129107, "loss": 2.1339, "step": 5370 }, { "epoch": 7.64, "learning_rate": 0.0002478431116917187, "loss": 2.1533, "step": 5380 }, { "epoch": 7.66, "learning_rate": 0.00024057066154882119, "loss": 2.185, "step": 5390 }, { "epoch": 7.67, "learning_rate": 0.00023294641970519127, "loss": 2.1806, "step": 5400 }, { "epoch": 7.68, "learning_rate": 0.00022500000000000032, "loss": 2.134, "step": 5410 }, { "epoch": 7.7, "learning_rate": 0.00021676226766548907, "loss": 2.1749, "step": 5420 }, { "epoch": 7.71, "learning_rate": 0.00020826521944120434, "loss": 2.1698, "step": 5430 }, { "epoch": 7.73, "learning_rate": 0.0001995418592932751, "loss": 2.1687, "step": 5440 }, { "epoch": 7.74, "learning_rate": 0.0001906260702214507, "loss": 2.1888, "step": 5450 }, { "epoch": 7.76, "learning_rate": 0.00018155248265182522, "loss": 2.145, "step": 5460 }, { "epoch": 7.77, "learning_rate": 0.0001723563399264259, "loss": 2.1814, "step": 5470 }, { "epoch": 7.78, "learning_rate": 0.0001630733614121494, "loss": 2.1604, "step": 5480 }, { "epoch": 7.8, "learning_rate": 0.00015373960376071152, "loss": 2.1584, "step": 5490 }, { "epoch": 7.81, "learning_rate": 0.00014439132085855056, "loss": 2.146, "step": 5500 }, { "epoch": 7.83, "learning_rate": 0.0001350648230106279, "loss": 2.1995, "step": 5510 }, { "epoch": 7.84, "learning_rate": 0.0001257963359050793, "loss": 2.1522, "step": 5520 }, { "epoch": 7.86, "learning_rate": 0.00011662185990655302, "loss": 2.1457, "step": 5530 }, { "epoch": 7.87, "learning_rate": 0.00010757703022472596, "loss": 2.1699, "step": 5540 }, { "epoch": 7.88, "learning_rate": 9.869697850114966e-05, "loss": 2.1149, "step": 5550 }, { "epoch": 7.9, "learning_rate": 9.001619635203874e-05, "loss": 2.1681, "step": 5560 }, { "epoch": 7.91, "learning_rate": 8.156840139702631e-05, "loss": 2.1431, "step": 5570 }, { "epoch": 7.93, "learning_rate": 7.33864062942469e-05, "loss": 2.1279, "step": 5580 }, { "epoch": 7.94, "learning_rate": 6.550199129045721e-05, "loss": 2.1389, "step": 5590 }, { "epoch": 7.95, "learning_rate": 5.794578078119331e-05, "loss": 2.1316, "step": 5600 }, { "epoch": 7.97, "learning_rate": 5.0747124360471404e-05, "loss": 2.0931, "step": 5610 }, { "epoch": 7.98, "learning_rate": 4.393398282201809e-05, "loss": 2.1771, "step": 5620 }, { "epoch": 8.0, "learning_rate": 3.7532819554840014e-05, "loss": 2.1491, "step": 5630 }, { "epoch": 8.0, "eval_loss": 2.3224949836730957, "eval_runtime": 11.0019, "eval_samples_per_second": 909.934, "eval_steps_per_second": 7.181, "step": 5632 }, { "epoch": 8.01, "learning_rate": 3.156849775496477e-05, "loss": 2.0926, "step": 5640 }, { "epoch": 8.03, "learning_rate": 2.606418385260078e-05, "loss": 2.077, "step": 5650 }, { "epoch": 8.04, "learning_rate": 2.104125752982142e-05, "loss": 2.0516, "step": 5660 }, { "epoch": 8.05, "learning_rate": 1.6519228678279616e-05, "loss": 2.0906, "step": 5670 }, { "epoch": 8.07, "learning_rate": 1.2515661619503886e-05, "loss": 2.095, "step": 5680 }, { "epoch": 8.08, "learning_rate": 9.046106882113602e-06, "loss": 2.0801, "step": 5690 }, { "epoch": 8.1, "learning_rate": 6.124040800951968e-06, "loss": 2.0714, "step": 5700 }, { "epoch": 8.11, "learning_rate": 3.760813172726557e-06, "loss": 2.1109, "step": 5710 }, { "epoch": 8.12, "learning_rate": 1.9656031714918865e-06, "loss": 2.12, "step": 5720 }, { "epoch": 8.14, "learning_rate": 7.453836951898218e-07, "loss": 2.0788, "step": 5730 }, { "epoch": 8.15, "learning_rate": 1.048942817402254e-07, "loss": 2.0993, "step": 5740 }, { "epoch": 8.17, "learning_rate": 4.662269987756317e-08, "loss": 2.0666, "step": 5750 }, { "epoch": 8.18, "learning_rate": 5.707952862381681e-07, "loss": 2.1035, "step": 5760 }, { "epoch": 8.2, "learning_rate": 1.675376066230738e-06, "loss": 2.0755, "step": 5770 }, { "epoch": 8.21, "learning_rate": 3.356074662104369e-06, "loss": 2.0395, "step": 5780 }, { "epoch": 8.22, "learning_rate": 5.606362957497995e-06, "loss": 2.049, "step": 5790 }, { "epoch": 8.24, "learning_rate": 8.417500453745013e-06, "loss": 2.1038, "step": 5800 }, { "epoch": 8.25, "learning_rate": 1.1778568219438656e-05, "loss": 2.0996, "step": 5810 }, { "epoch": 8.27, "learning_rate": 1.5676511301404658e-05, "loss": 2.055, "step": 5820 }, { "epoch": 8.28, "learning_rate": 2.009618943233406e-05, "loss": 2.0607, "step": 5830 }, { "epoch": 8.3, "learning_rate": 2.5020435838132556e-05, "loss": 2.0802, "step": 5840 }, { "epoch": 8.31, "learning_rate": 3.043012391656157e-05, "loss": 2.0957, "step": 5850 }, { "epoch": 8.32, "learning_rate": 3.630424152818206e-05, "loss": 2.0716, "step": 5860 }, { "epoch": 8.34, "learning_rate": 4.2619972611041564e-05, "loss": 2.1055, "step": 5870 }, { "epoch": 8.35, "learning_rate": 4.935278580210464e-05, "loss": 2.0589, "step": 5880 }, { "epoch": 8.37, "learning_rate": 5.6476529721190224e-05, "loss": 2.0697, "step": 5890 }, { "epoch": 8.38, "learning_rate": 6.396353454734256e-05, "loss": 2.0682, "step": 5900 }, { "epoch": 8.39, "learning_rate": 7.178471949307574e-05, "loss": 2.0672, "step": 5910 }, { "epoch": 8.41, "learning_rate": 7.990970575904032e-05, "loss": 2.0863, "step": 5920 }, { "epoch": 8.42, "learning_rate": 8.830693453040795e-05, "loss": 2.089, "step": 5930 }, { "epoch": 8.44, "learning_rate": 9.694378955661256e-05, "loss": 2.0583, "step": 5940 }, { "epoch": 8.45, "learning_rate": 0.00010578672383836425, "loss": 2.0616, "step": 5950 }, { "epoch": 8.47, "learning_rate": 0.00011480138992984269, "loss": 2.0936, "step": 5960 }, { "epoch": 8.48, "learning_rate": 0.00012395277334996052, "loss": 2.0863, "step": 5970 }, { "epoch": 8.49, "learning_rate": 0.00013320532858450296, "loss": 2.1113, "step": 5980 }, { "epoch": 8.51, "learning_rate": 0.0001425231171508957, "loss": 2.04, "step": 5990 }, { "epoch": 8.52, "learning_rate": 0.0001518699471893126, "loss": 2.0697, "step": 6000 }, { "epoch": 8.54, "learning_rate": 0.00016120951403796307, "loss": 2.0468, "step": 6010 }, { "epoch": 8.55, "learning_rate": 0.0001705055412465116, "loss": 2.0831, "step": 6020 }, { "epoch": 8.57, "learning_rate": 0.00017972192147990926, "loss": 2.1102, "step": 6030 }, { "epoch": 8.58, "learning_rate": 0.00018882285676537786, "loss": 2.0944, "step": 6040 }, { "epoch": 8.59, "learning_rate": 0.0001977729975377525, "loss": 2.108, "step": 6050 }, { "epoch": 8.61, "learning_rate": 0.00020653757994315073, "loss": 2.0661, "step": 6060 }, { "epoch": 8.62, "learning_rate": 0.00021508256086763376, "loss": 2.078, "step": 6070 }, { "epoch": 8.64, "learning_rate": 0.00022337475016639353, "loss": 2.1152, "step": 6080 }, { "epoch": 8.65, "learning_rate": 0.0002313819395798632, "loss": 2.1097, "step": 6090 }, { "epoch": 8.66, "learning_rate": 0.00023907302783602552, "loss": 2.0812, "step": 6100 }, { "epoch": 8.68, "learning_rate": 0.0002464181414529804, "loss": 2.1115, "step": 6110 }, { "epoch": 8.69, "learning_rate": 0.00025338875077259167, "loss": 2.0616, "step": 6120 }, { "epoch": 8.71, "learning_rate": 0.0002599577807744744, "loss": 2.1264, "step": 6130 }, { "epoch": 8.72, "learning_rate": 0.0002655054352858896, "loss": 2.0978, "step": 6140 }, { "epoch": 8.74, "learning_rate": 0.0002712425392983004, "loss": 2.1099, "step": 6150 }, { "epoch": 8.75, "learning_rate": 0.00027650871687193255, "loss": 2.0902, "step": 6160 }, { "epoch": 8.76, "learning_rate": 0.0002812835132863128, "loss": 2.102, "step": 6170 }, { "epoch": 8.78, "learning_rate": 0.0002855483824281353, "loss": 2.082, "step": 6180 }, { "epoch": 8.79, "learning_rate": 0.0002892867588274879, "loss": 2.1016, "step": 6190 }, { "epoch": 8.81, "learning_rate": 0.00029248412200092686, "loss": 2.0823, "step": 6200 }, { "epoch": 8.82, "learning_rate": 0.0002951280528514794, "loss": 2.1198, "step": 6210 }, { "epoch": 8.84, "learning_rate": 0.00029720828190650815, "loss": 2.0912, "step": 6220 }, { "epoch": 8.85, "learning_rate": 0.00029871672920607153, "loss": 2.0547, "step": 6230 }, { "epoch": 8.86, "learning_rate": 0.00029964753568684926, "loss": 2.1026, "step": 6240 }, { "epoch": 8.88, "learning_rate": 0.0002999970859397307, "loss": 2.106, "step": 6250 }, { "epoch": 8.89, "learning_rate": 0.00029976402225267247, "loss": 2.1424, "step": 6260 }, { "epoch": 8.91, "learning_rate": 0.0002989492498842809, "loss": 2.1021, "step": 6270 }, { "epoch": 8.92, "learning_rate": 0.00029755593354763527, "loss": 2.1026, "step": 6280 }, { "epoch": 8.93, "learning_rate": 0.00029558948511800866, "loss": 2.0828, "step": 6290 }, { "epoch": 8.95, "learning_rate": 0.00029305754261223406, "loss": 2.1417, "step": 6300 }, { "epoch": 8.96, "learning_rate": 0.00028996994052135996, "loss": 2.0579, "step": 6310 }, { "epoch": 8.98, "learning_rate": 0.00028633867161183155, "loss": 2.1051, "step": 6320 }, { "epoch": 8.99, "learning_rate": 0.00028217784034356626, "loss": 2.0623, "step": 6330 }, { "epoch": 9.0, "eval_loss": 2.327604293823242, "eval_runtime": 11.0081, "eval_samples_per_second": 909.418, "eval_steps_per_second": 7.177, "step": 6336 }, { "epoch": 9.01, "learning_rate": 0.0002775036080858562, "loss": 2.091, "step": 6340 }, { "epoch": 9.02, "learning_rate": 0.0002723341303438894, "loss": 2.0147, "step": 6350 }, { "epoch": 9.03, "learning_rate": 0.00026668948623970694, "loss": 2.0584, "step": 6360 }, { "epoch": 9.05, "learning_rate": 0.0002605916005215189, "loss": 2.0441, "step": 6370 }, { "epoch": 9.06, "learning_rate": 0.00025406415840428147, "loss": 2.0037, "step": 6380 }, { "epoch": 9.08, "learning_rate": 0.00024713251357234075, "loss": 2.0342, "step": 6390 }, { "epoch": 9.09, "learning_rate": 0.00023982358970145017, "loss": 2.0517, "step": 6400 }, { "epoch": 9.11, "learning_rate": 0.00023216577588268072, "loss": 2.0496, "step": 6410 }, { "epoch": 9.12, "learning_rate": 0.00022418881635441105, "loss": 2.0683, "step": 6420 }, { "epoch": 9.13, "learning_rate": 0.00021592369497069755, "loss": 2.0416, "step": 6430 }, { "epoch": 9.15, "learning_rate": 0.00020740251485476324, "loss": 2.0329, "step": 6440 }, { "epoch": 9.16, "learning_rate": 0.00019865837370507073, "loss": 2.0409, "step": 6450 }, { "epoch": 9.18, "learning_rate": 0.00018972523523827966, "loss": 2.0436, "step": 6460 }, { "epoch": 9.19, "learning_rate": 0.00018063779726845152, "loss": 2.0631, "step": 6470 }, { "epoch": 9.2, "learning_rate": 0.0001714313569349074, "loss": 2.0563, "step": 6480 }, { "epoch": 9.22, "learning_rate": 0.0001621416736021808, "loss": 2.0395, "step": 6490 }, { "epoch": 9.23, "learning_rate": 0.00015280482996463552, "loss": 2.0422, "step": 6500 }, { "epoch": 9.25, "learning_rate": 0.0001434570918951997, "loss": 2.038, "step": 6510 }, { "epoch": 9.26, "learning_rate": 0.00013413476758260932, "loss": 2.0441, "step": 6520 }, { "epoch": 9.28, "learning_rate": 0.00012487406650428943, "loss": 2.036, "step": 6530 }, { "epoch": 9.29, "learning_rate": 0.00011571095878264738, "loss": 2.074, "step": 6540 }, { "epoch": 9.3, "learning_rate": 0.00010668103547105523, "loss": 2.063, "step": 6550 }, { "epoch": 9.32, "learning_rate": 9.781937031221653e-05, "loss": 2.0583, "step": 6560 }, { "epoch": 9.33, "learning_rate": 8.916038350582922e-05, "loss": 2.0469, "step": 6570 }, { "epoch": 9.35, "learning_rate": 8.073770801474436e-05, "loss": 2.0674, "step": 6580 }, { "epoch": 9.36, "learning_rate": 7.25840589288743e-05, "loss": 2.0586, "step": 6590 }, { "epoch": 9.38, "learning_rate": 6.473110639426635e-05, "loss": 2.0402, "step": 6600 }, { "epoch": 9.39, "learning_rate": 5.72093526009319e-05, "loss": 2.0753, "step": 6610 }, { "epoch": 9.4, "learning_rate": 5.0048013307199424e-05, "loss": 1.9955, "step": 6620 }, { "epoch": 9.42, "learning_rate": 4.3274904360790423e-05, "loss": 2.0728, "step": 6630 }, { "epoch": 9.43, "learning_rate": 3.691633365738289e-05, "loss": 2.0395, "step": 6640 }, { "epoch": 9.45, "learning_rate": 3.09969989563152e-05, "loss": 2.0321, "step": 6650 }, { "epoch": 9.46, "learning_rate": 2.5539891950326658e-05, "loss": 2.0234, "step": 6660 }, { "epoch": 9.47, "learning_rate": 2.056620896195836e-05, "loss": 2.0023, "step": 6670 }, { "epoch": 9.49, "learning_rate": 1.6095268613458483e-05, "loss": 2.0256, "step": 6680 }, { "epoch": 9.5, "learning_rate": 1.2144436790007034e-05, "loss": 2.005, "step": 6690 }, { "epoch": 9.52, "learning_rate": 8.729059187690579e-06, "loss": 2.0098, "step": 6700 }, { "epoch": 9.53, "learning_rate": 5.862401708235109e-06, "loss": 2.0488, "step": 6710 }, { "epoch": 9.55, "learning_rate": 3.555598932010012e-06, "loss": 2.0012, "step": 6720 }, { "epoch": 9.56, "learning_rate": 1.8176108694427927e-06, "loss": 2.0404, "step": 6730 }, { "epoch": 9.57, "learning_rate": 6.551881588300112e-07, "loss": 2.0141, "step": 6740 }, { "epoch": 9.59, "learning_rate": 7.284584572083696e-08, "loss": 2.0593, "step": 6750 }, { "epoch": 9.6, "learning_rate": 7.284584572082031e-08, "loss": 2.0341, "step": 6760 }, { "epoch": 9.62, "learning_rate": 6.551881588299612e-07, "loss": 2.037, "step": 6770 }, { "epoch": 9.63, "learning_rate": 1.817610869442726e-06, "loss": 2.0146, "step": 6780 }, { "epoch": 9.64, "learning_rate": 3.5555989320099122e-06, "loss": 2.0295, "step": 6790 }, { "epoch": 9.66, "learning_rate": 5.862401708234976e-06, "loss": 2.0359, "step": 6800 }, { "epoch": 9.67, "learning_rate": 8.729059187690413e-06, "loss": 2.0138, "step": 6810 }, { "epoch": 9.69, "learning_rate": 1.214443679000685e-05, "loss": 2.028, "step": 6820 }, { "epoch": 9.7, "learning_rate": 1.6095268613458266e-05, "loss": 2.0097, "step": 6830 }, { "epoch": 9.72, "learning_rate": 2.0566208961958125e-05, "loss": 2.0522, "step": 6840 }, { "epoch": 9.73, "learning_rate": 2.553989195032639e-05, "loss": 2.0516, "step": 6850 }, { "epoch": 9.74, "learning_rate": 3.0996998956314905e-05, "loss": 2.022, "step": 6860 }, { "epoch": 9.76, "learning_rate": 3.691633365738259e-05, "loss": 2.0524, "step": 6870 }, { "epoch": 9.77, "learning_rate": 4.327490436079009e-05, "loss": 2.0464, "step": 6880 }, { "epoch": 9.79, "learning_rate": 5.004801330719908e-05, "loss": 2.0572, "step": 6890 }, { "epoch": 9.8, "learning_rate": 5.720935260093152e-05, "loss": 2.0273, "step": 6900 }, { "epoch": 9.82, "learning_rate": 6.473110639426594e-05, "loss": 2.03, "step": 6910 }, { "epoch": 9.83, "learning_rate": 7.258405892887389e-05, "loss": 2.027, "step": 6920 }, { "epoch": 9.84, "learning_rate": 8.073770801474394e-05, "loss": 2.0345, "step": 6930 }, { "epoch": 9.86, "learning_rate": 8.916038350582879e-05, "loss": 2.0404, "step": 6940 }, { "epoch": 9.87, "learning_rate": 9.781937031221607e-05, "loss": 2.0137, "step": 6950 }, { "epoch": 9.89, "learning_rate": 0.00010668103547105477, "loss": 2.0181, "step": 6960 }, { "epoch": 9.9, "learning_rate": 0.0001157109587826469, "loss": 2.0438, "step": 6970 }, { "epoch": 9.91, "learning_rate": 0.00012487406650428897, "loss": 2.0424, "step": 6980 }, { "epoch": 9.93, "learning_rate": 0.00013413476758260883, "loss": 2.0435, "step": 6990 }, { "epoch": 9.94, "learning_rate": 0.0001434570918951992, "loss": 2.0117, "step": 7000 }, { "epoch": 9.96, "learning_rate": 0.00015280482996463506, "loss": 2.0098, "step": 7010 }, { "epoch": 9.97, "learning_rate": 0.00016214167360218033, "loss": 2.0263, "step": 7020 }, { "epoch": 9.99, "learning_rate": 0.00017143135693490692, "loss": 2.0697, "step": 7030 }, { "epoch": 10.0, "learning_rate": 0.00018063779726845106, "loss": 2.0672, "step": 7040 }, { "epoch": 10.0, "eval_loss": 2.330075263977051, "eval_runtime": 11.0756, "eval_samples_per_second": 903.883, "eval_steps_per_second": 7.133, "step": 7040 }, { "epoch": 10.01, "learning_rate": 0.0001897252352382792, "loss": 2.0053, "step": 7050 }, { "epoch": 10.03, "learning_rate": 0.0001986583737050703, "loss": 2.0154, "step": 7060 }, { "epoch": 10.04, "learning_rate": 0.0002074025148547628, "loss": 1.998, "step": 7070 }, { "epoch": 10.06, "learning_rate": 0.00021592369497069712, "loss": 2.0056, "step": 7080 }, { "epoch": 10.07, "learning_rate": 0.00022418881635441067, "loss": 1.9888, "step": 7090 }, { "epoch": 10.09, "learning_rate": 0.00023216577588268034, "loss": 2.0266, "step": 7100 }, { "epoch": 10.1, "learning_rate": 0.00023982358970144976, "loss": 1.9969, "step": 7110 }, { "epoch": 10.11, "learning_rate": 0.0002471325135723404, "loss": 2.0036, "step": 7120 }, { "epoch": 10.13, "learning_rate": 0.00025406415840428115, "loss": 2.0173, "step": 7130 }, { "epoch": 10.14, "learning_rate": 0.0002605916005215186, "loss": 2.0406, "step": 7140 }, { "epoch": 10.16, "learning_rate": 0.0002666894862397066, "loss": 2.0007, "step": 7150 }, { "epoch": 10.17, "learning_rate": 0.0002723341303438891, "loss": 2.0359, "step": 7160 }, { "epoch": 10.18, "learning_rate": 0.000277503608085856, "loss": 2.0217, "step": 7170 }, { "epoch": 10.2, "learning_rate": 0.00028217784034356605, "loss": 2.0029, "step": 7180 }, { "epoch": 10.21, "learning_rate": 0.0002863386716118314, "loss": 2.0112, "step": 7190 }, { "epoch": 10.23, "learning_rate": 0.0002899699405213598, "loss": 2.0221, "step": 7200 }, { "epoch": 10.24, "learning_rate": 0.00029305754261223395, "loss": 2.0064, "step": 7210 }, { "epoch": 10.26, "learning_rate": 0.00029558948511800855, "loss": 2.019, "step": 7220 }, { "epoch": 10.27, "learning_rate": 0.00029755593354763516, "loss": 2.0286, "step": 7230 }, { "epoch": 10.28, "learning_rate": 0.00029894924988428087, "loss": 2.0046, "step": 7240 }, { "epoch": 10.3, "learning_rate": 0.00029976402225267247, "loss": 2.0264, "step": 7250 }, { "epoch": 10.31, "learning_rate": 0.0002999970859397307, "loss": 2.0141, "step": 7260 }, { "epoch": 10.33, "learning_rate": 0.00029964753568684926, "loss": 2.0015, "step": 7270 }, { "epoch": 10.34, "learning_rate": 0.0002987167292060716, "loss": 2.0057, "step": 7280 }, { "epoch": 10.36, "learning_rate": 0.00029720828190650826, "loss": 2.0256, "step": 7290 }, { "epoch": 10.37, "learning_rate": 0.00029512805285147956, "loss": 2.0477, "step": 7300 }, { "epoch": 10.38, "learning_rate": 0.00029248412200092697, "loss": 2.0311, "step": 7310 }, { "epoch": 10.4, "learning_rate": 0.00028928675882748813, "loss": 1.9869, "step": 7320 }, { "epoch": 10.41, "learning_rate": 0.00028554838242813554, "loss": 2.0306, "step": 7330 }, { "epoch": 10.43, "learning_rate": 0.00028128351328631304, "loss": 2.0188, "step": 7340 }, { "epoch": 10.44, "learning_rate": 0.0002765087168719328, "loss": 2.0359, "step": 7350 }, { "epoch": 10.45, "learning_rate": 0.00027124253929830067, "loss": 2.008, "step": 7360 }, { "epoch": 10.47, "learning_rate": 0.00026550543528588993, "loss": 1.9957, "step": 7370 }, { "epoch": 10.48, "learning_rate": 0.000259319688712759, "loss": 2.0123, "step": 7380 }, { "epoch": 10.5, "learning_rate": 0.0002527093260602245, "loss": 2.0441, "step": 7390 }, { "epoch": 10.51, "learning_rate": 0.00024570002308995167, "loss": 2.0213, "step": 7400 }, { "epoch": 10.53, "learning_rate": 0.00023831900511498102, "loss": 2.0308, "step": 7410 }, { "epoch": 10.54, "learning_rate": 0.00023059494125202381, "loss": 2.0259, "step": 7420 }, { "epoch": 10.55, "learning_rate": 0.00022255783306578613, "loss": 2.0105, "step": 7430 }, { "epoch": 10.57, "learning_rate": 0.0002142388980378395, "loss": 2.0307, "step": 7440 }, { "epoch": 10.58, "learning_rate": 0.00020567044831266566, "loss": 2.0057, "step": 7450 }, { "epoch": 10.6, "learning_rate": 0.00019688576519184654, "loss": 1.9967, "step": 7460 }, { "epoch": 10.61, "learning_rate": 0.0001879189698638844, "loss": 2.0092, "step": 7470 }, { "epoch": 10.62, "learning_rate": 0.00017880489087176112, "loss": 1.9987, "step": 7480 }, { "epoch": 10.64, "learning_rate": 0.00016957892883300732, "loss": 2.0128, "step": 7490 }, { "epoch": 10.65, "learning_rate": 0.000160276918937754, "loss": 2.0124, "step": 7500 }, { "epoch": 10.67, "learning_rate": 0.00015093499175880545, "loss": 2.0014, "step": 7510 }, { "epoch": 10.68, "learning_rate": 0.00014158943291442156, "loss": 2.0083, "step": 7520 }, { "epoch": 10.7, "learning_rate": 0.0001322765421288613, "loss": 2.0226, "step": 7530 }, { "epoch": 10.71, "learning_rate": 0.0001230324922381422, "loss": 2.0203, "step": 7540 }, { "epoch": 10.72, "learning_rate": 0.00011389318868865406, "loss": 2.0343, "step": 7550 }, { "epoch": 10.74, "learning_rate": 0.00010489413007435998, "loss": 2.0033, "step": 7560 }, { "epoch": 10.75, "learning_rate": 9.60702702542747e-05, "loss": 2.0476, "step": 7570 }, { "epoch": 10.77, "learning_rate": 8.745588258580053e-05, "loss": 1.9919, "step": 7580 }, { "epoch": 10.78, "learning_rate": 7.908442680122653e-05, "loss": 1.999, "step": 7590 }, { "epoch": 10.8, "learning_rate": 7.098841904449448e-05, "loss": 2.0013, "step": 7600 }, { "epoch": 10.81, "learning_rate": 6.319930557302952e-05, "loss": 1.9996, "step": 7610 }, { "epoch": 10.82, "learning_rate": 5.57473406151682e-05, "loss": 1.9714, "step": 7620 }, { "epoch": 10.84, "learning_rate": 4.8661468857651156e-05, "loss": 2.0055, "step": 7630 }, { "epoch": 10.85, "learning_rate": 4.196921301958112e-05, "loss": 2.0256, "step": 7640 }, { "epoch": 10.87, "learning_rate": 3.569656694954841e-05, "loss": 2.0128, "step": 7650 }, { "epoch": 10.88, "learning_rate": 2.9867894661145786e-05, "loss": 1.9936, "step": 7660 }, { "epoch": 10.89, "learning_rate": 2.4505835699037535e-05, "loss": 1.9781, "step": 7670 }, { "epoch": 10.91, "learning_rate": 1.9631217203152903e-05, "loss": 2.0075, "step": 7680 }, { "epoch": 10.92, "learning_rate": 1.5262973012573674e-05, "loss": 1.9988, "step": 7690 }, { "epoch": 10.94, "learning_rate": 1.141807012330722e-05, "loss": 2.0045, "step": 7700 }, { "epoch": 10.95, "learning_rate": 8.111442785622413e-06, "loss": 1.9608, "step": 7710 }, { "epoch": 10.97, "learning_rate": 5.3559344969059505e-06, "loss": 1.9888, "step": 7720 }, { "epoch": 10.98, "learning_rate": 3.1622481153527944e-06, "loss": 1.9767, "step": 7730 }, { "epoch": 10.99, "learning_rate": 1.5389042882661184e-06, "loss": 2.0293, "step": 7740 }, { "epoch": 11.0, "eval_loss": 2.318600654602051, "eval_runtime": 11.0328, "eval_samples_per_second": 907.383, "eval_steps_per_second": 7.16, "step": 7744 } ], "max_steps": 21120, "num_train_epochs": 30, "total_flos": 7.58623645990912e+16, "trial_name": null, "trial_params": null }