{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.110671936758893, "eval_steps": 10, "global_step": 1201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 0.00019999576610920983, "loss": 3.0929, "step": 10 }, { "epoch": 0.02, "eval_loss": 2.167867660522461, "eval_runtime": 21.3399, "eval_samples_per_second": 23.383, "eval_steps_per_second": 5.858, "step": 10 }, { "epoch": 0.04, "learning_rate": 0.00019998306479535586, "loss": 1.8074, "step": 20 }, { "epoch": 0.04, "eval_loss": 1.4092761278152466, "eval_runtime": 21.3617, "eval_samples_per_second": 23.36, "eval_steps_per_second": 5.852, "step": 20 }, { "epoch": 0.05, "learning_rate": 0.00019996189713395766, "loss": 1.2188, "step": 30 }, { "epoch": 0.05, "eval_loss": 1.1258704662322998, "eval_runtime": 21.3588, "eval_samples_per_second": 23.363, "eval_steps_per_second": 5.852, "step": 30 }, { "epoch": 0.07, "learning_rate": 0.00019993226491744662, "loss": 1.0841, "step": 40 }, { "epoch": 0.07, "eval_loss": 0.9605854153633118, "eval_runtime": 21.3693, "eval_samples_per_second": 23.351, "eval_steps_per_second": 5.85, "step": 40 }, { "epoch": 0.09, "learning_rate": 0.00019989417065501396, "loss": 0.9594, "step": 50 }, { "epoch": 0.09, "eval_loss": 0.8681771159172058, "eval_runtime": 21.3662, "eval_samples_per_second": 23.355, "eval_steps_per_second": 5.85, "step": 50 }, { "epoch": 0.11, "learning_rate": 0.00019984761757239875, "loss": 0.8765, "step": 60 }, { "epoch": 0.11, "eval_loss": 0.816562294960022, "eval_runtime": 21.3405, "eval_samples_per_second": 23.383, "eval_steps_per_second": 5.857, "step": 60 }, { "epoch": 0.12, "learning_rate": 0.00019979260961161427, "loss": 0.852, "step": 70 }, { "epoch": 0.12, "eval_loss": 0.7803006768226624, "eval_runtime": 21.3622, "eval_samples_per_second": 23.359, "eval_steps_per_second": 5.851, "step": 70 }, { "epoch": 0.14, "learning_rate": 0.00019972915143061455, "loss": 0.8404, "step": 80 }, { "epoch": 0.14, "eval_loss": 0.760216236114502, "eval_runtime": 21.379, "eval_samples_per_second": 23.341, "eval_steps_per_second": 5.847, "step": 80 }, { "epoch": 0.16, "learning_rate": 0.0001996572484028997, "loss": 0.8183, "step": 90 }, { "epoch": 0.16, "eval_loss": 0.7222956418991089, "eval_runtime": 21.3895, "eval_samples_per_second": 23.329, "eval_steps_per_second": 5.844, "step": 90 }, { "epoch": 0.18, "learning_rate": 0.00019957690661706108, "loss": 0.7816, "step": 100 }, { "epoch": 0.18, "eval_loss": 0.7133845686912537, "eval_runtime": 21.3742, "eval_samples_per_second": 23.346, "eval_steps_per_second": 5.848, "step": 100 }, { "epoch": 0.19, "learning_rate": 0.00019948813287626563, "loss": 0.7792, "step": 110 }, { "epoch": 0.19, "eval_loss": 0.7233743667602539, "eval_runtime": 21.3703, "eval_samples_per_second": 23.35, "eval_steps_per_second": 5.849, "step": 110 }, { "epoch": 0.21, "learning_rate": 0.0001993909346976798, "loss": 0.7648, "step": 120 }, { "epoch": 0.21, "eval_loss": 0.6882979273796082, "eval_runtime": 21.3682, "eval_samples_per_second": 23.352, "eval_steps_per_second": 5.85, "step": 120 }, { "epoch": 0.23, "learning_rate": 0.0001992853203118331, "loss": 0.8132, "step": 130 }, { "epoch": 0.23, "eval_loss": 0.7019714117050171, "eval_runtime": 21.374, "eval_samples_per_second": 23.346, "eval_steps_per_second": 5.848, "step": 130 }, { "epoch": 0.25, "learning_rate": 0.000199171298661921, "loss": 0.7599, "step": 140 }, { "epoch": 0.25, "eval_loss": 0.668350338935852, "eval_runtime": 21.323, "eval_samples_per_second": 23.402, "eval_steps_per_second": 5.862, "step": 140 }, { "epoch": 0.26, "learning_rate": 0.0001990488794030478, "loss": 0.7518, "step": 150 }, { "epoch": 0.26, "eval_loss": 0.6716361045837402, "eval_runtime": 21.3312, "eval_samples_per_second": 23.393, "eval_steps_per_second": 5.86, "step": 150 }, { "epoch": 0.28, "learning_rate": 0.00019891807290140892, "loss": 0.7452, "step": 160 }, { "epoch": 0.28, "eval_loss": 0.6634441018104553, "eval_runtime": 21.3388, "eval_samples_per_second": 23.385, "eval_steps_per_second": 5.858, "step": 160 }, { "epoch": 0.3, "learning_rate": 0.00019877889023341323, "loss": 0.7215, "step": 170 }, { "epoch": 0.3, "eval_loss": 0.6609596610069275, "eval_runtime": 21.3355, "eval_samples_per_second": 23.388, "eval_steps_per_second": 5.859, "step": 170 }, { "epoch": 0.32, "learning_rate": 0.00019863134318474503, "loss": 0.7088, "step": 180 }, { "epoch": 0.32, "eval_loss": 0.659795880317688, "eval_runtime": 21.3459, "eval_samples_per_second": 23.377, "eval_steps_per_second": 5.856, "step": 180 }, { "epoch": 0.33, "learning_rate": 0.0001984754442493662, "loss": 0.7237, "step": 190 }, { "epoch": 0.33, "eval_loss": 0.6469973921775818, "eval_runtime": 21.3405, "eval_samples_per_second": 23.383, "eval_steps_per_second": 5.857, "step": 190 }, { "epoch": 0.35, "learning_rate": 0.000198311206628458, "loss": 0.7353, "step": 200 }, { "epoch": 0.35, "eval_loss": 0.6315197348594666, "eval_runtime": 21.3347, "eval_samples_per_second": 23.389, "eval_steps_per_second": 5.859, "step": 200 }, { "epoch": 0.37, "learning_rate": 0.00019813864422930347, "loss": 0.7111, "step": 210 }, { "epoch": 0.37, "eval_loss": 0.6466153860092163, "eval_runtime": 21.3369, "eval_samples_per_second": 23.387, "eval_steps_per_second": 5.858, "step": 210 }, { "epoch": 0.39, "learning_rate": 0.00019795777166410966, "loss": 0.7136, "step": 220 }, { "epoch": 0.39, "eval_loss": 0.632926344871521, "eval_runtime": 21.3244, "eval_samples_per_second": 23.4, "eval_steps_per_second": 5.862, "step": 220 }, { "epoch": 0.4, "learning_rate": 0.00019776860424877032, "loss": 0.7044, "step": 230 }, { "epoch": 0.4, "eval_loss": 0.6356912851333618, "eval_runtime": 21.3235, "eval_samples_per_second": 23.401, "eval_steps_per_second": 5.862, "step": 230 }, { "epoch": 0.42, "learning_rate": 0.000197571158001569, "loss": 0.7369, "step": 240 }, { "epoch": 0.42, "eval_loss": 0.6214553713798523, "eval_runtime": 21.3355, "eval_samples_per_second": 23.388, "eval_steps_per_second": 5.859, "step": 240 }, { "epoch": 0.44, "learning_rate": 0.00019736544964182268, "loss": 0.6995, "step": 250 }, { "epoch": 0.44, "eval_loss": 0.6103290915489197, "eval_runtime": 21.3392, "eval_samples_per_second": 23.384, "eval_steps_per_second": 5.858, "step": 250 }, { "epoch": 0.46, "learning_rate": 0.00019715149658846591, "loss": 0.7027, "step": 260 }, { "epoch": 0.46, "eval_loss": 0.5964030027389526, "eval_runtime": 21.3644, "eval_samples_per_second": 23.357, "eval_steps_per_second": 5.851, "step": 260 }, { "epoch": 0.47, "learning_rate": 0.000196929316958576, "loss": 0.6872, "step": 270 }, { "epoch": 0.47, "eval_loss": 0.60444176197052, "eval_runtime": 21.3353, "eval_samples_per_second": 23.389, "eval_steps_per_second": 5.859, "step": 270 }, { "epoch": 0.49, "learning_rate": 0.00019669892956583867, "loss": 0.7182, "step": 280 }, { "epoch": 0.49, "eval_loss": 0.6127080917358398, "eval_runtime": 21.3451, "eval_samples_per_second": 23.378, "eval_steps_per_second": 5.856, "step": 280 }, { "epoch": 0.51, "learning_rate": 0.00019646035391895512, "loss": 0.6897, "step": 290 }, { "epoch": 0.51, "eval_loss": 0.6016324758529663, "eval_runtime": 21.324, "eval_samples_per_second": 23.401, "eval_steps_per_second": 5.862, "step": 290 }, { "epoch": 0.53, "learning_rate": 0.00019621361021999008, "loss": 0.6824, "step": 300 }, { "epoch": 0.53, "eval_loss": 0.5880205631256104, "eval_runtime": 21.3379, "eval_samples_per_second": 23.386, "eval_steps_per_second": 5.858, "step": 300 }, { "epoch": 0.54, "learning_rate": 0.000195958719362661, "loss": 0.673, "step": 310 }, { "epoch": 0.54, "eval_loss": 0.5902190804481506, "eval_runtime": 21.3191, "eval_samples_per_second": 23.406, "eval_steps_per_second": 5.863, "step": 310 }, { "epoch": 0.56, "learning_rate": 0.00019569570293056894, "loss": 0.6956, "step": 320 }, { "epoch": 0.56, "eval_loss": 0.5811321139335632, "eval_runtime": 21.3365, "eval_samples_per_second": 23.387, "eval_steps_per_second": 5.859, "step": 320 }, { "epoch": 0.58, "learning_rate": 0.00019542458319537093, "loss": 0.6889, "step": 330 }, { "epoch": 0.58, "eval_loss": 0.5849844813346863, "eval_runtime": 21.3231, "eval_samples_per_second": 23.402, "eval_steps_per_second": 5.862, "step": 330 }, { "epoch": 0.6, "learning_rate": 0.00019514538311489395, "loss": 0.6773, "step": 340 }, { "epoch": 0.6, "eval_loss": 0.5933501720428467, "eval_runtime": 21.3446, "eval_samples_per_second": 23.378, "eval_steps_per_second": 5.856, "step": 340 }, { "epoch": 0.61, "learning_rate": 0.00019485812633119096, "loss": 0.6782, "step": 350 }, { "epoch": 0.61, "eval_loss": 0.594153642654419, "eval_runtime": 21.3347, "eval_samples_per_second": 23.389, "eval_steps_per_second": 5.859, "step": 350 }, { "epoch": 0.63, "learning_rate": 0.00019456283716853904, "loss": 0.719, "step": 360 }, { "epoch": 0.63, "eval_loss": 0.5848734974861145, "eval_runtime": 21.3162, "eval_samples_per_second": 23.409, "eval_steps_per_second": 5.864, "step": 360 }, { "epoch": 0.65, "learning_rate": 0.00019425954063137947, "loss": 0.6809, "step": 370 }, { "epoch": 0.65, "eval_loss": 0.579924464225769, "eval_runtime": 21.3339, "eval_samples_per_second": 23.39, "eval_steps_per_second": 5.859, "step": 370 }, { "epoch": 0.67, "learning_rate": 0.00019394826240220057, "loss": 0.6412, "step": 380 }, { "epoch": 0.67, "eval_loss": 0.5709846019744873, "eval_runtime": 21.3459, "eval_samples_per_second": 23.377, "eval_steps_per_second": 5.856, "step": 380 }, { "epoch": 0.69, "learning_rate": 0.00019362902883936288, "loss": 0.6411, "step": 390 }, { "epoch": 0.69, "eval_loss": 0.562785267829895, "eval_runtime": 21.3375, "eval_samples_per_second": 23.386, "eval_steps_per_second": 5.858, "step": 390 }, { "epoch": 0.7, "learning_rate": 0.00019330186697486722, "loss": 0.6519, "step": 400 }, { "epoch": 0.7, "eval_loss": 0.5611785650253296, "eval_runtime": 21.3506, "eval_samples_per_second": 23.372, "eval_steps_per_second": 5.855, "step": 400 }, { "epoch": 0.72, "learning_rate": 0.00019296680451206575, "loss": 0.6446, "step": 410 }, { "epoch": 0.72, "eval_loss": 0.5562126636505127, "eval_runtime": 21.3482, "eval_samples_per_second": 23.374, "eval_steps_per_second": 5.855, "step": 410 }, { "epoch": 0.74, "learning_rate": 0.00019262386982331594, "loss": 0.6574, "step": 420 }, { "epoch": 0.74, "eval_loss": 0.5644647479057312, "eval_runtime": 21.3719, "eval_samples_per_second": 23.348, "eval_steps_per_second": 5.849, "step": 420 }, { "epoch": 0.76, "learning_rate": 0.00019227309194757818, "loss": 0.6633, "step": 430 }, { "epoch": 0.76, "eval_loss": 0.5663937926292419, "eval_runtime": 21.3728, "eval_samples_per_second": 23.347, "eval_steps_per_second": 5.849, "step": 430 }, { "epoch": 0.77, "learning_rate": 0.00019191450058795683, "loss": 0.6673, "step": 440 }, { "epoch": 0.77, "eval_loss": 0.5483366847038269, "eval_runtime": 21.352, "eval_samples_per_second": 23.37, "eval_steps_per_second": 5.854, "step": 440 }, { "epoch": 0.79, "learning_rate": 0.00019154812610918501, "loss": 0.6466, "step": 450 }, { "epoch": 0.79, "eval_loss": 0.554151713848114, "eval_runtime": 21.4045, "eval_samples_per_second": 23.313, "eval_steps_per_second": 5.84, "step": 450 }, { "epoch": 0.81, "learning_rate": 0.00019117399953505335, "loss": 0.653, "step": 460 }, { "epoch": 0.81, "eval_loss": 0.5411431789398193, "eval_runtime": 21.3349, "eval_samples_per_second": 23.389, "eval_steps_per_second": 5.859, "step": 460 }, { "epoch": 0.83, "learning_rate": 0.00019079215254578293, "loss": 0.6384, "step": 470 }, { "epoch": 0.83, "eval_loss": 0.5362362265586853, "eval_runtime": 21.3477, "eval_samples_per_second": 23.375, "eval_steps_per_second": 5.855, "step": 470 }, { "epoch": 0.84, "learning_rate": 0.00019040261747534283, "loss": 0.6287, "step": 480 }, { "epoch": 0.84, "eval_loss": 0.5452967286109924, "eval_runtime": 21.3462, "eval_samples_per_second": 23.377, "eval_steps_per_second": 5.856, "step": 480 }, { "epoch": 0.86, "learning_rate": 0.00019000542730871197, "loss": 0.661, "step": 490 }, { "epoch": 0.86, "eval_loss": 0.5644904971122742, "eval_runtime": 21.3569, "eval_samples_per_second": 23.365, "eval_steps_per_second": 5.853, "step": 490 }, { "epoch": 0.88, "learning_rate": 0.0001896006156790861, "loss": 0.608, "step": 500 }, { "epoch": 0.88, "eval_loss": 0.5245234370231628, "eval_runtime": 21.3459, "eval_samples_per_second": 23.377, "eval_steps_per_second": 5.856, "step": 500 }, { "epoch": 0.9, "learning_rate": 0.00018918821686502989, "loss": 0.6584, "step": 510 }, { "epoch": 0.9, "eval_loss": 0.5376425385475159, "eval_runtime": 21.3569, "eval_samples_per_second": 23.365, "eval_steps_per_second": 5.853, "step": 510 }, { "epoch": 0.91, "learning_rate": 0.0001887682657875741, "loss": 0.6416, "step": 520 }, { "epoch": 0.91, "eval_loss": 0.5471484661102295, "eval_runtime": 21.3644, "eval_samples_per_second": 23.357, "eval_steps_per_second": 5.851, "step": 520 }, { "epoch": 0.93, "learning_rate": 0.00018834079800725872, "loss": 0.6527, "step": 530 }, { "epoch": 0.93, "eval_loss": 0.5425943732261658, "eval_runtime": 21.3669, "eval_samples_per_second": 23.354, "eval_steps_per_second": 5.85, "step": 530 }, { "epoch": 0.95, "learning_rate": 0.00018790584972112174, "loss": 0.6164, "step": 540 }, { "epoch": 0.95, "eval_loss": 0.5284227728843689, "eval_runtime": 21.3751, "eval_samples_per_second": 23.345, "eval_steps_per_second": 5.848, "step": 540 }, { "epoch": 0.97, "learning_rate": 0.00018746345775963395, "loss": 0.611, "step": 550 }, { "epoch": 0.97, "eval_loss": 0.5312528014183044, "eval_runtime": 21.3628, "eval_samples_per_second": 23.358, "eval_steps_per_second": 5.851, "step": 550 }, { "epoch": 0.98, "learning_rate": 0.00018701365958358047, "loss": 0.614, "step": 560 }, { "epoch": 0.98, "eval_loss": 0.5262718796730042, "eval_runtime": 21.3578, "eval_samples_per_second": 23.364, "eval_steps_per_second": 5.853, "step": 560 }, { "epoch": 1.0, "learning_rate": 0.00018655649328088835, "loss": 0.6382, "step": 570 }, { "epoch": 1.0, "eval_loss": 0.5316660404205322, "eval_runtime": 21.3512, "eval_samples_per_second": 23.371, "eval_steps_per_second": 5.854, "step": 570 }, { "epoch": 1.02, "learning_rate": 0.00018609199756340156, "loss": 0.5804, "step": 580 }, { "epoch": 1.02, "eval_loss": 0.5207402110099792, "eval_runtime": 21.3663, "eval_samples_per_second": 23.355, "eval_steps_per_second": 5.85, "step": 580 }, { "epoch": 1.04, "learning_rate": 0.0001856202117636029, "loss": 0.6291, "step": 590 }, { "epoch": 1.04, "eval_loss": 0.5237697958946228, "eval_runtime": 21.3922, "eval_samples_per_second": 23.326, "eval_steps_per_second": 5.843, "step": 590 }, { "epoch": 1.05, "learning_rate": 0.00018514117583128347, "loss": 0.5911, "step": 600 }, { "epoch": 1.05, "eval_loss": 0.517393171787262, "eval_runtime": 21.3498, "eval_samples_per_second": 23.373, "eval_steps_per_second": 5.855, "step": 600 }, { "epoch": 1.07, "learning_rate": 0.00018465493033015967, "loss": 0.6111, "step": 610 }, { "epoch": 1.07, "eval_loss": 0.5281241536140442, "eval_runtime": 21.3522, "eval_samples_per_second": 23.37, "eval_steps_per_second": 5.854, "step": 610 }, { "epoch": 1.09, "learning_rate": 0.0001841615164344385, "loss": 0.5578, "step": 620 }, { "epoch": 1.09, "eval_loss": 0.5255175232887268, "eval_runtime": 21.3637, "eval_samples_per_second": 23.357, "eval_steps_per_second": 5.851, "step": 620 }, { "epoch": 1.11, "learning_rate": 0.00018366097592533093, "loss": 0.6055, "step": 630 }, { "epoch": 1.11, "eval_loss": 0.5177362561225891, "eval_runtime": 21.3827, "eval_samples_per_second": 23.337, "eval_steps_per_second": 5.846, "step": 630 }, { "epoch": 1.12, "learning_rate": 0.00018315335118751396, "loss": 0.6015, "step": 640 }, { "epoch": 1.12, "eval_loss": 0.5130926370620728, "eval_runtime": 21.4348, "eval_samples_per_second": 23.28, "eval_steps_per_second": 5.832, "step": 640 }, { "epoch": 1.14, "learning_rate": 0.0001826386852055417, "loss": 0.6072, "step": 650 }, { "epoch": 1.14, "eval_loss": 0.5168054103851318, "eval_runtime": 21.3583, "eval_samples_per_second": 23.363, "eval_steps_per_second": 5.853, "step": 650 }, { "epoch": 1.16, "learning_rate": 0.0001821170215602053, "loss": 0.5956, "step": 660 }, { "epoch": 1.16, "eval_loss": 0.5168840289115906, "eval_runtime": 21.3753, "eval_samples_per_second": 23.345, "eval_steps_per_second": 5.848, "step": 660 }, { "epoch": 1.18, "learning_rate": 0.0001815884044248429, "loss": 0.6099, "step": 670 }, { "epoch": 1.18, "eval_loss": 0.5169732570648193, "eval_runtime": 21.3472, "eval_samples_per_second": 23.375, "eval_steps_per_second": 5.856, "step": 670 }, { "epoch": 1.19, "learning_rate": 0.0001810528785615989, "loss": 0.6038, "step": 680 }, { "epoch": 1.19, "eval_loss": 0.5055590867996216, "eval_runtime": 21.3489, "eval_samples_per_second": 23.374, "eval_steps_per_second": 5.855, "step": 680 }, { "epoch": 1.21, "learning_rate": 0.00018051048931763366, "loss": 0.583, "step": 690 }, { "epoch": 1.21, "eval_loss": 0.5121394395828247, "eval_runtime": 21.3436, "eval_samples_per_second": 23.379, "eval_steps_per_second": 5.857, "step": 690 }, { "epoch": 1.23, "learning_rate": 0.0001799612826212837, "loss": 0.5885, "step": 700 }, { "epoch": 1.23, "eval_loss": 0.523388683795929, "eval_runtime": 21.3616, "eval_samples_per_second": 23.36, "eval_steps_per_second": 5.852, "step": 700 }, { "epoch": 1.25, "learning_rate": 0.00017940530497817254, "loss": 0.5784, "step": 710 }, { "epoch": 1.25, "eval_loss": 0.5028228163719177, "eval_runtime": 21.3469, "eval_samples_per_second": 23.376, "eval_steps_per_second": 5.856, "step": 710 }, { "epoch": 1.26, "learning_rate": 0.00017884260346727254, "loss": 0.5744, "step": 720 }, { "epoch": 1.26, "eval_loss": 0.5100187063217163, "eval_runtime": 21.3603, "eval_samples_per_second": 23.361, "eval_steps_per_second": 5.852, "step": 720 }, { "epoch": 1.28, "learning_rate": 0.00017827322573691872, "loss": 0.6014, "step": 730 }, { "epoch": 1.28, "eval_loss": 0.5038166046142578, "eval_runtime": 21.3431, "eval_samples_per_second": 23.38, "eval_steps_per_second": 5.857, "step": 730 }, { "epoch": 1.3, "learning_rate": 0.0001776972200007735, "loss": 0.6185, "step": 740 }, { "epoch": 1.3, "eval_loss": 0.5146144032478333, "eval_runtime": 21.3367, "eval_samples_per_second": 23.387, "eval_steps_per_second": 5.858, "step": 740 }, { "epoch": 1.32, "learning_rate": 0.00017711463503374466, "loss": 0.6184, "step": 750 }, { "epoch": 1.32, "eval_loss": 0.5316588282585144, "eval_runtime": 21.3617, "eval_samples_per_second": 23.36, "eval_steps_per_second": 5.852, "step": 750 }, { "epoch": 1.34, "learning_rate": 0.0001765255201678546, "loss": 0.6141, "step": 760 }, { "epoch": 1.34, "eval_loss": 0.5080065727233887, "eval_runtime": 21.3913, "eval_samples_per_second": 23.327, "eval_steps_per_second": 5.844, "step": 760 }, { "epoch": 1.35, "learning_rate": 0.00017592992528806352, "loss": 0.6146, "step": 770 }, { "epoch": 1.35, "eval_loss": 0.5165488719940186, "eval_runtime": 21.3517, "eval_samples_per_second": 23.371, "eval_steps_per_second": 5.854, "step": 770 }, { "epoch": 1.37, "learning_rate": 0.0001753279008280449, "loss": 0.5721, "step": 780 }, { "epoch": 1.37, "eval_loss": 0.5040128231048584, "eval_runtime": 21.3467, "eval_samples_per_second": 23.376, "eval_steps_per_second": 5.856, "step": 780 }, { "epoch": 1.39, "learning_rate": 0.00017471949776591504, "loss": 0.5931, "step": 790 }, { "epoch": 1.39, "eval_loss": 0.49337631464004517, "eval_runtime": 21.3473, "eval_samples_per_second": 23.375, "eval_steps_per_second": 5.856, "step": 790 }, { "epoch": 1.41, "learning_rate": 0.00017410476761991643, "loss": 0.5944, "step": 800 }, { "epoch": 1.41, "eval_loss": 0.487575501203537, "eval_runtime": 21.3451, "eval_samples_per_second": 23.378, "eval_steps_per_second": 5.856, "step": 800 }, { "epoch": 1.42, "learning_rate": 0.00017348376244405512, "loss": 0.6002, "step": 810 }, { "epoch": 1.42, "eval_loss": 0.4929651618003845, "eval_runtime": 21.3604, "eval_samples_per_second": 23.361, "eval_steps_per_second": 5.852, "step": 810 }, { "epoch": 1.44, "learning_rate": 0.000172856534823693, "loss": 0.5557, "step": 820 }, { "epoch": 1.44, "eval_loss": 0.4913093149662018, "eval_runtime": 21.3626, "eval_samples_per_second": 23.359, "eval_steps_per_second": 5.851, "step": 820 }, { "epoch": 1.46, "learning_rate": 0.00017222313787109496, "loss": 0.58, "step": 830 }, { "epoch": 1.46, "eval_loss": 0.4909800887107849, "eval_runtime": 21.4163, "eval_samples_per_second": 23.3, "eval_steps_per_second": 5.837, "step": 830 }, { "epoch": 1.48, "learning_rate": 0.00017158362522093153, "loss": 0.5459, "step": 840 }, { "epoch": 1.48, "eval_loss": 0.4883653223514557, "eval_runtime": 21.3483, "eval_samples_per_second": 23.374, "eval_steps_per_second": 5.855, "step": 840 }, { "epoch": 1.49, "learning_rate": 0.00017093805102573706, "loss": 0.5871, "step": 850 }, { "epoch": 1.49, "eval_loss": 0.48601067066192627, "eval_runtime": 21.3674, "eval_samples_per_second": 23.353, "eval_steps_per_second": 5.85, "step": 850 }, { "epoch": 1.51, "learning_rate": 0.00017028646995132435, "loss": 0.5554, "step": 860 }, { "epoch": 1.51, "eval_loss": 0.4856807291507721, "eval_runtime": 21.366, "eval_samples_per_second": 23.355, "eval_steps_per_second": 5.85, "step": 860 }, { "epoch": 1.53, "learning_rate": 0.0001696289371721556, "loss": 0.5819, "step": 870 }, { "epoch": 1.53, "eval_loss": 0.4648899435997009, "eval_runtime": 21.3516, "eval_samples_per_second": 23.371, "eval_steps_per_second": 5.854, "step": 870 }, { "epoch": 1.55, "learning_rate": 0.00016896550836667035, "loss": 0.5649, "step": 880 }, { "epoch": 1.55, "eval_loss": 0.47903972864151, "eval_runtime": 21.3528, "eval_samples_per_second": 23.369, "eval_steps_per_second": 5.854, "step": 880 }, { "epoch": 1.56, "learning_rate": 0.00016829623971257088, "loss": 0.5779, "step": 890 }, { "epoch": 1.56, "eval_loss": 0.4807458519935608, "eval_runtime": 21.377, "eval_samples_per_second": 23.343, "eval_steps_per_second": 5.847, "step": 890 }, { "epoch": 1.58, "learning_rate": 0.00016762118788206487, "loss": 0.5756, "step": 900 }, { "epoch": 1.58, "eval_loss": 0.483437180519104, "eval_runtime": 21.3913, "eval_samples_per_second": 23.327, "eval_steps_per_second": 5.843, "step": 900 }, { "epoch": 1.6, "learning_rate": 0.000166940410037067, "loss": 0.5563, "step": 910 }, { "epoch": 1.6, "eval_loss": 0.49455228447914124, "eval_runtime": 21.365, "eval_samples_per_second": 23.356, "eval_steps_per_second": 5.851, "step": 910 }, { "epoch": 1.62, "learning_rate": 0.00016625396382435813, "loss": 0.5393, "step": 920 }, { "epoch": 1.62, "eval_loss": 0.4847542643547058, "eval_runtime": 21.3614, "eval_samples_per_second": 23.36, "eval_steps_per_second": 5.852, "step": 920 }, { "epoch": 1.63, "learning_rate": 0.00016556190737070428, "loss": 0.5551, "step": 930 }, { "epoch": 1.63, "eval_loss": 0.4845309257507324, "eval_runtime": 21.4008, "eval_samples_per_second": 23.317, "eval_steps_per_second": 5.841, "step": 930 }, { "epoch": 1.65, "learning_rate": 0.00016486429927793436, "loss": 0.5687, "step": 940 }, { "epoch": 1.65, "eval_loss": 0.4806869626045227, "eval_runtime": 21.374, "eval_samples_per_second": 23.346, "eval_steps_per_second": 5.848, "step": 940 }, { "epoch": 1.67, "learning_rate": 0.00016416119861797796, "loss": 0.5469, "step": 950 }, { "epoch": 1.67, "eval_loss": 0.4748505651950836, "eval_runtime": 21.355, "eval_samples_per_second": 23.367, "eval_steps_per_second": 5.853, "step": 950 }, { "epoch": 1.69, "learning_rate": 0.0001634526649278632, "loss": 0.5771, "step": 960 }, { "epoch": 1.69, "eval_loss": 0.4859110414981842, "eval_runtime": 21.3418, "eval_samples_per_second": 23.381, "eval_steps_per_second": 5.857, "step": 960 }, { "epoch": 1.7, "learning_rate": 0.00016273875820467545, "loss": 0.5689, "step": 970 }, { "epoch": 1.7, "eval_loss": 0.4734295606613159, "eval_runtime": 21.3607, "eval_samples_per_second": 23.361, "eval_steps_per_second": 5.852, "step": 970 }, { "epoch": 1.72, "learning_rate": 0.0001620195389004767, "loss": 0.5741, "step": 980 }, { "epoch": 1.72, "eval_loss": 0.4881950914859772, "eval_runtime": 21.3786, "eval_samples_per_second": 23.341, "eval_steps_per_second": 5.847, "step": 980 }, { "epoch": 1.74, "learning_rate": 0.00016129506791718665, "loss": 0.5643, "step": 990 }, { "epoch": 1.74, "eval_loss": 0.4815501570701599, "eval_runtime": 21.3699, "eval_samples_per_second": 23.351, "eval_steps_per_second": 5.849, "step": 990 }, { "epoch": 1.76, "learning_rate": 0.00016056540660142586, "loss": 0.5603, "step": 1000 }, { "epoch": 1.76, "eval_loss": 0.46760401129722595, "eval_runtime": 21.354, "eval_samples_per_second": 23.368, "eval_steps_per_second": 5.854, "step": 1000 }, { "epoch": 1.77, "learning_rate": 0.0001598306167393208, "loss": 0.5925, "step": 1010 }, { "epoch": 1.77, "eval_loss": 0.46860912442207336, "eval_runtime": 21.3958, "eval_samples_per_second": 23.322, "eval_steps_per_second": 5.842, "step": 1010 }, { "epoch": 1.79, "learning_rate": 0.00015909076055127202, "loss": 0.5834, "step": 1020 }, { "epoch": 1.79, "eval_loss": 0.47431066632270813, "eval_runtime": 21.4034, "eval_samples_per_second": 23.314, "eval_steps_per_second": 5.84, "step": 1020 }, { "epoch": 1.81, "learning_rate": 0.00019360595357389735, "loss": 0.5902, "step": 1030 }, { "epoch": 1.81, "eval_loss": 0.49162757396698, "eval_runtime": 38.6636, "eval_samples_per_second": 12.906, "eval_steps_per_second": 3.233, "step": 1030 }, { "epoch": 1.83, "learning_rate": 0.00019348256763960145, "loss": 0.5777, "step": 1040 }, { "epoch": 1.83, "eval_loss": 0.47481468319892883, "eval_runtime": 41.2268, "eval_samples_per_second": 12.104, "eval_steps_per_second": 3.032, "step": 1040 }, { "epoch": 1.84, "learning_rate": 0.00019335804264972018, "loss": 0.5921, "step": 1050 }, { "epoch": 1.84, "eval_loss": 0.48432889580726624, "eval_runtime": 30.5847, "eval_samples_per_second": 16.315, "eval_steps_per_second": 4.087, "step": 1050 }, { "epoch": 1.86, "learning_rate": 0.00019323238012155123, "loss": 0.5877, "step": 1060 }, { "epoch": 1.86, "eval_loss": 0.47419798374176025, "eval_runtime": 37.8731, "eval_samples_per_second": 13.176, "eval_steps_per_second": 3.3, "step": 1060 }, { "epoch": 1.88, "learning_rate": 0.00019310558158625285, "loss": 0.5453, "step": 1070 }, { "epoch": 1.88, "eval_loss": 0.4705266058444977, "eval_runtime": 22.4824, "eval_samples_per_second": 22.195, "eval_steps_per_second": 2.802, "step": 1070 }, { "epoch": 1.9, "learning_rate": 0.00019297764858882514, "loss": 0.5445, "step": 1080 }, { "epoch": 1.9, "eval_loss": 0.4662667214870453, "eval_runtime": 31.6864, "eval_samples_per_second": 15.748, "eval_steps_per_second": 1.988, "step": 1080 }, { "epoch": 1.92, "learning_rate": 0.00019284858268809137, "loss": 0.5686, "step": 1090 }, { "epoch": 1.92, "eval_loss": 0.47445282340049744, "eval_runtime": 27.1784, "eval_samples_per_second": 18.36, "eval_steps_per_second": 2.318, "step": 1090 }, { "epoch": 1.93, "learning_rate": 0.00019271838545667876, "loss": 0.5712, "step": 1100 }, { "epoch": 1.93, "eval_loss": 0.48884764313697815, "eval_runtime": 37.118, "eval_samples_per_second": 13.444, "eval_steps_per_second": 1.697, "step": 1100 }, { "epoch": 1.95, "learning_rate": 0.0001925870584809995, "loss": 0.6032, "step": 1110 }, { "epoch": 1.95, "eval_loss": 0.48608502745628357, "eval_runtime": 36.4351, "eval_samples_per_second": 13.696, "eval_steps_per_second": 1.729, "step": 1110 }, { "epoch": 1.97, "learning_rate": 0.00019245460336123134, "loss": 0.5491, "step": 1120 }, { "epoch": 1.97, "eval_loss": 0.472098708152771, "eval_runtime": 22.4827, "eval_samples_per_second": 22.195, "eval_steps_per_second": 2.802, "step": 1120 }, { "epoch": 1.99, "learning_rate": 0.00019232102171129811, "loss": 0.5452, "step": 1130 }, { "epoch": 1.99, "eval_loss": 0.4644794762134552, "eval_runtime": 27.0746, "eval_samples_per_second": 18.431, "eval_steps_per_second": 2.327, "step": 1130 }, { "epoch": 2.0, "learning_rate": 0.00019218631515885006, "loss": 0.5526, "step": 1140 }, { "epoch": 2.0, "eval_loss": 0.48768851161003113, "eval_runtime": 32.1887, "eval_samples_per_second": 15.502, "eval_steps_per_second": 1.957, "step": 1140 }, { "epoch": 2.02, "learning_rate": 0.00019205048534524406, "loss": 0.5443, "step": 1150 }, { "epoch": 2.02, "eval_loss": 0.4716103971004486, "eval_runtime": 30.3462, "eval_samples_per_second": 16.444, "eval_steps_per_second": 2.076, "step": 1150 }, { "epoch": 2.04, "learning_rate": 0.00019191353392552344, "loss": 0.5103, "step": 1160 }, { "epoch": 2.04, "eval_loss": 0.46319034695625305, "eval_runtime": 23.7207, "eval_samples_per_second": 21.037, "eval_steps_per_second": 2.656, "step": 1160 }, { "epoch": 2.06, "learning_rate": 0.00019177546256839812, "loss": 0.5202, "step": 1170 }, { "epoch": 2.06, "eval_loss": 0.4802156984806061, "eval_runtime": 24.1547, "eval_samples_per_second": 20.658, "eval_steps_per_second": 2.608, "step": 1170 }, { "epoch": 2.07, "learning_rate": 0.00019163627295622397, "loss": 0.5436, "step": 1180 }, { "epoch": 2.07, "eval_loss": 0.4681110680103302, "eval_runtime": 26.9736, "eval_samples_per_second": 18.5, "eval_steps_per_second": 2.336, "step": 1180 }, { "epoch": 2.09, "learning_rate": 0.0001914959667849825, "loss": 0.5454, "step": 1190 }, { "epoch": 2.09, "eval_loss": 0.470931738615036, "eval_runtime": 36.9982, "eval_samples_per_second": 13.487, "eval_steps_per_second": 1.703, "step": 1190 }, { "epoch": 2.11, "learning_rate": 0.0001913545457642601, "loss": 0.5183, "step": 1200 }, { "epoch": 2.11, "eval_loss": 0.47423675656318665, "eval_runtime": 33.685, "eval_samples_per_second": 14.814, "eval_steps_per_second": 1.87, "step": 1200 }, { "epoch": 2.11, "step": 1201, "total_flos": 3.450559153050747e+17, "train_loss": 0.000492346822768822, "train_runtime": 4.9952, "train_samples_per_second": 3843.723, "train_steps_per_second": 240.233 } ], "logging_steps": 10, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "total_flos": 3.450559153050747e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }