{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9752701960065946, "eval_steps": 100, "global_step": 5391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.0001999998159397344, "loss": 1.4113, "step": 10 }, { "epoch": 0.01, "learning_rate": 0.00019999926375961516, "loss": 0.8357, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.00019999834346167496, "loss": 0.7122, "step": 30 }, { "epoch": 0.01, "learning_rate": 0.0001999970550493016, "loss": 0.6653, "step": 40 }, { "epoch": 0.02, "learning_rate": 0.000199995398527238, "loss": 0.6152, "step": 50 }, { "epoch": 0.02, "learning_rate": 0.00019999337390158218, "loss": 0.5963, "step": 60 }, { "epoch": 0.03, "learning_rate": 0.00019999098117978715, "loss": 0.5923, "step": 70 }, { "epoch": 0.03, "learning_rate": 0.00019998822037066105, "loss": 0.5783, "step": 80 }, { "epoch": 0.03, "learning_rate": 0.00019998509148436697, "loss": 0.5529, "step": 90 }, { "epoch": 0.04, "learning_rate": 0.000199981594532423, "loss": 0.5382, "step": 100 }, { "epoch": 0.04, "eval_loss": 0.5621132850646973, "eval_runtime": 398.8595, "eval_samples_per_second": 11.992, "eval_steps_per_second": 2.999, "step": 100 }, { "epoch": 0.04, "learning_rate": 0.00019997772952770216, "loss": 0.5489, "step": 110 }, { "epoch": 0.04, "learning_rate": 0.00019997349648443225, "loss": 0.5268, "step": 120 }, { "epoch": 0.05, "learning_rate": 0.00019996889541819602, "loss": 0.525, "step": 130 }, { "epoch": 0.05, "learning_rate": 0.00019996392634593092, "loss": 0.4926, "step": 140 }, { "epoch": 0.05, "learning_rate": 0.00019995858928592916, "loss": 0.5181, "step": 150 }, { "epoch": 0.06, "learning_rate": 0.00019995288425783754, "loss": 0.5225, "step": 160 }, { "epoch": 0.06, "learning_rate": 0.00019994681128265743, "loss": 0.4993, "step": 170 }, { "epoch": 0.07, "learning_rate": 0.00019994037038274467, "loss": 0.5387, "step": 180 }, { "epoch": 0.07, "learning_rate": 0.0001999335615818096, "loss": 0.465, "step": 190 }, { "epoch": 0.07, "learning_rate": 0.00019992638490491676, "loss": 0.4788, "step": 200 }, { "epoch": 0.07, "eval_loss": 0.5201168060302734, "eval_runtime": 398.3332, "eval_samples_per_second": 12.008, "eval_steps_per_second": 3.003, "step": 200 }, { "epoch": 0.08, "learning_rate": 0.00019991884037848497, "loss": 0.451, "step": 210 }, { "epoch": 0.08, "learning_rate": 0.00019991092803028725, "loss": 0.4586, "step": 220 }, { "epoch": 0.08, "learning_rate": 0.00019990264788945052, "loss": 0.4894, "step": 230 }, { "epoch": 0.09, "learning_rate": 0.00019989399998645568, "loss": 0.4666, "step": 240 }, { "epoch": 0.09, "learning_rate": 0.00019988498435313744, "loss": 0.4763, "step": 250 }, { "epoch": 0.1, "learning_rate": 0.00019987560102268422, "loss": 0.4539, "step": 260 }, { "epoch": 0.1, "learning_rate": 0.00019986585002963793, "loss": 0.5133, "step": 270 }, { "epoch": 0.1, "learning_rate": 0.00019985573140989405, "loss": 0.4704, "step": 280 }, { "epoch": 0.11, "learning_rate": 0.00019984524520070125, "loss": 0.4548, "step": 290 }, { "epoch": 0.11, "learning_rate": 0.00019983439144066143, "loss": 0.4682, "step": 300 }, { "epoch": 0.11, "eval_loss": 0.497054785490036, "eval_runtime": 399.4057, "eval_samples_per_second": 11.975, "eval_steps_per_second": 2.994, "step": 300 }, { "epoch": 0.11, "learning_rate": 0.0001998231701697295, "loss": 0.4867, "step": 310 }, { "epoch": 0.12, "learning_rate": 0.0001998115814292133, "loss": 0.4486, "step": 320 }, { "epoch": 0.12, "learning_rate": 0.0001997996252617733, "loss": 0.4472, "step": 330 }, { "epoch": 0.12, "learning_rate": 0.00019978730171142268, "loss": 0.4821, "step": 340 }, { "epoch": 0.13, "learning_rate": 0.0001997746108235269, "loss": 0.4475, "step": 350 }, { "epoch": 0.13, "learning_rate": 0.00019976155264480377, "loss": 0.4485, "step": 360 }, { "epoch": 0.14, "learning_rate": 0.00019974812722332308, "loss": 0.4547, "step": 370 }, { "epoch": 0.14, "learning_rate": 0.0001997343346085066, "loss": 0.4279, "step": 380 }, { "epoch": 0.14, "learning_rate": 0.00019972017485112774, "loss": 0.4748, "step": 390 }, { "epoch": 0.15, "learning_rate": 0.0001997056480033115, "loss": 0.4778, "step": 400 }, { "epoch": 0.15, "eval_loss": 0.4730743169784546, "eval_runtime": 403.2392, "eval_samples_per_second": 11.861, "eval_steps_per_second": 2.966, "step": 400 }, { "epoch": 0.15, "learning_rate": 0.0001996907541185342, "loss": 0.4393, "step": 410 }, { "epoch": 0.15, "learning_rate": 0.00019967549325162324, "loss": 0.4562, "step": 420 }, { "epoch": 0.16, "learning_rate": 0.00019965986545875708, "loss": 0.4389, "step": 430 }, { "epoch": 0.16, "learning_rate": 0.0001996438707974648, "loss": 0.4282, "step": 440 }, { "epoch": 0.16, "learning_rate": 0.000199627509326626, "loss": 0.4254, "step": 450 }, { "epoch": 0.17, "learning_rate": 0.0001996107811064706, "loss": 0.419, "step": 460 }, { "epoch": 0.17, "learning_rate": 0.00019959368619857872, "loss": 0.4261, "step": 470 }, { "epoch": 0.18, "learning_rate": 0.0001995762246658801, "loss": 0.4581, "step": 480 }, { "epoch": 0.18, "learning_rate": 0.00019955839657265432, "loss": 0.4333, "step": 490 }, { "epoch": 0.18, "learning_rate": 0.00019954020198453018, "loss": 0.4541, "step": 500 }, { "epoch": 0.18, "eval_loss": 0.46874529123306274, "eval_runtime": 403.7251, "eval_samples_per_second": 11.847, "eval_steps_per_second": 2.962, "step": 500 }, { "epoch": 0.19, "learning_rate": 0.00019952164096848578, "loss": 0.4482, "step": 510 }, { "epoch": 0.19, "learning_rate": 0.00019950271359284795, "loss": 0.4475, "step": 520 }, { "epoch": 0.19, "learning_rate": 0.00019948341992729227, "loss": 0.4339, "step": 530 }, { "epoch": 0.2, "learning_rate": 0.00019946376004284272, "loss": 0.4527, "step": 540 }, { "epoch": 0.2, "learning_rate": 0.0001994437340118713, "loss": 0.4009, "step": 550 }, { "epoch": 0.21, "learning_rate": 0.000199423341908098, "loss": 0.4496, "step": 560 }, { "epoch": 0.21, "learning_rate": 0.0001994025838065903, "loss": 0.4314, "step": 570 }, { "epoch": 0.21, "learning_rate": 0.0001993814597837631, "loss": 0.4454, "step": 580 }, { "epoch": 0.22, "learning_rate": 0.00019935996991737818, "loss": 0.4076, "step": 590 }, { "epoch": 0.22, "learning_rate": 0.0001993381142865442, "loss": 0.4786, "step": 600 }, { "epoch": 0.22, "eval_loss": 0.4500006139278412, "eval_runtime": 399.7496, "eval_samples_per_second": 11.965, "eval_steps_per_second": 2.992, "step": 600 }, { "epoch": 0.22, "learning_rate": 0.00019931589297171628, "loss": 0.4386, "step": 610 }, { "epoch": 0.23, "learning_rate": 0.0001992933060546955, "loss": 0.4805, "step": 620 }, { "epoch": 0.23, "learning_rate": 0.00019927035361862904, "loss": 0.4135, "step": 630 }, { "epoch": 0.23, "learning_rate": 0.0001992470357480095, "loss": 0.421, "step": 640 }, { "epoch": 0.24, "learning_rate": 0.00019922335252867476, "loss": 0.4224, "step": 650 }, { "epoch": 0.24, "learning_rate": 0.00019919930404780766, "loss": 0.4324, "step": 660 }, { "epoch": 0.25, "learning_rate": 0.0001991748903939355, "loss": 0.4062, "step": 670 }, { "epoch": 0.25, "learning_rate": 0.00019915011165692997, "loss": 0.4107, "step": 680 }, { "epoch": 0.25, "learning_rate": 0.00019912496792800677, "loss": 0.3953, "step": 690 }, { "epoch": 0.26, "learning_rate": 0.00019909945929972502, "loss": 0.3974, "step": 700 }, { "epoch": 0.26, "eval_loss": 0.4456511437892914, "eval_runtime": 403.5923, "eval_samples_per_second": 11.851, "eval_steps_per_second": 2.963, "step": 700 }, { "epoch": 0.26, "learning_rate": 0.0001990735858659873, "loss": 0.4006, "step": 710 }, { "epoch": 0.26, "learning_rate": 0.000199047347722039, "loss": 0.4258, "step": 720 }, { "epoch": 0.27, "learning_rate": 0.00019902074496446815, "loss": 0.4083, "step": 730 }, { "epoch": 0.27, "learning_rate": 0.00019899377769120487, "loss": 0.4097, "step": 740 }, { "epoch": 0.27, "learning_rate": 0.00019896644600152135, "loss": 0.4262, "step": 750 }, { "epoch": 0.28, "learning_rate": 0.00019893874999603103, "loss": 0.4204, "step": 760 }, { "epoch": 0.28, "learning_rate": 0.0001989106897766887, "loss": 0.4233, "step": 770 }, { "epoch": 0.29, "learning_rate": 0.0001988822654467897, "loss": 0.4107, "step": 780 }, { "epoch": 0.29, "learning_rate": 0.00019885347711096993, "loss": 0.4013, "step": 790 }, { "epoch": 0.29, "learning_rate": 0.00019882432487520506, "loss": 0.4142, "step": 800 }, { "epoch": 0.29, "eval_loss": 0.4460389018058777, "eval_runtime": 400.2256, "eval_samples_per_second": 11.951, "eval_steps_per_second": 2.988, "step": 800 }, { "epoch": 0.3, "learning_rate": 0.0001987948088468105, "loss": 0.4324, "step": 810 }, { "epoch": 0.3, "learning_rate": 0.0001987649291344408, "loss": 0.3975, "step": 820 }, { "epoch": 0.3, "learning_rate": 0.00019873468584808934, "loss": 0.4109, "step": 830 }, { "epoch": 0.31, "learning_rate": 0.00019870407909908786, "loss": 0.3987, "step": 840 }, { "epoch": 0.31, "learning_rate": 0.00019867310900010605, "loss": 0.3885, "step": 850 }, { "epoch": 0.32, "learning_rate": 0.00019864177566515122, "loss": 0.4122, "step": 860 }, { "epoch": 0.32, "learning_rate": 0.00019861007920956786, "loss": 0.405, "step": 870 }, { "epoch": 0.32, "learning_rate": 0.00019857801975003704, "loss": 0.404, "step": 880 }, { "epoch": 0.33, "learning_rate": 0.0001985455974045763, "loss": 0.382, "step": 890 }, { "epoch": 0.33, "learning_rate": 0.0001985128122925389, "loss": 0.4374, "step": 900 }, { "epoch": 0.33, "eval_loss": 0.44203105568885803, "eval_runtime": 401.2967, "eval_samples_per_second": 11.919, "eval_steps_per_second": 2.98, "step": 900 }, { "epoch": 0.33, "learning_rate": 0.00019847966453461358, "loss": 0.3859, "step": 910 }, { "epoch": 0.34, "learning_rate": 0.00019844615425282405, "loss": 0.4187, "step": 920 }, { "epoch": 0.34, "learning_rate": 0.00019841228157052853, "loss": 0.4078, "step": 930 }, { "epoch": 0.34, "learning_rate": 0.0001983780466124193, "loss": 0.4218, "step": 940 }, { "epoch": 0.35, "learning_rate": 0.0001983434495045223, "loss": 0.4109, "step": 950 }, { "epoch": 0.35, "learning_rate": 0.00019830849037419656, "loss": 0.4249, "step": 960 }, { "epoch": 0.36, "learning_rate": 0.00019827316935013388, "loss": 0.4073, "step": 970 }, { "epoch": 0.36, "learning_rate": 0.0001982374865623581, "loss": 0.4336, "step": 980 }, { "epoch": 0.36, "learning_rate": 0.00019820144214222497, "loss": 0.4056, "step": 990 }, { "epoch": 0.37, "learning_rate": 0.00019816503622242137, "loss": 0.4008, "step": 1000 }, { "epoch": 0.37, "eval_loss": 0.4418930411338806, "eval_runtime": 404.9805, "eval_samples_per_second": 11.81, "eval_steps_per_second": 2.953, "step": 1000 }, { "epoch": 0.37, "learning_rate": 0.00019812826893696495, "loss": 0.365, "step": 1010 }, { "epoch": 0.37, "learning_rate": 0.00019809114042120367, "loss": 0.4006, "step": 1020 }, { "epoch": 0.38, "learning_rate": 0.0001980536508118152, "loss": 0.3714, "step": 1030 }, { "epoch": 0.38, "learning_rate": 0.00019801580024680652, "loss": 0.3945, "step": 1040 }, { "epoch": 0.38, "learning_rate": 0.00019797758886551324, "loss": 0.4316, "step": 1050 }, { "epoch": 0.39, "learning_rate": 0.0001979390168085994, "loss": 0.3958, "step": 1060 }, { "epoch": 0.39, "learning_rate": 0.00019790008421805664, "loss": 0.419, "step": 1070 }, { "epoch": 0.4, "learning_rate": 0.00019786079123720377, "loss": 0.4048, "step": 1080 }, { "epoch": 0.4, "learning_rate": 0.0001978211380106864, "loss": 0.4002, "step": 1090 }, { "epoch": 0.4, "learning_rate": 0.0001977811246844761, "loss": 0.3979, "step": 1100 }, { "epoch": 0.4, "eval_loss": 0.43328720331192017, "eval_runtime": 398.734, "eval_samples_per_second": 11.995, "eval_steps_per_second": 2.999, "step": 1100 }, { "epoch": 0.41, "learning_rate": 0.00019774075140587024, "loss": 0.4047, "step": 1110 }, { "epoch": 0.41, "learning_rate": 0.00019770001832349106, "loss": 0.389, "step": 1120 }, { "epoch": 0.41, "learning_rate": 0.00019765892558728542, "loss": 0.3628, "step": 1130 }, { "epoch": 0.42, "learning_rate": 0.0001976174733485242, "loss": 0.3914, "step": 1140 }, { "epoch": 0.42, "learning_rate": 0.00019757566175980146, "loss": 0.3885, "step": 1150 }, { "epoch": 0.42, "learning_rate": 0.00019753349097503437, "loss": 0.4023, "step": 1160 }, { "epoch": 0.43, "learning_rate": 0.0001974909611494622, "loss": 0.4358, "step": 1170 }, { "epoch": 0.43, "learning_rate": 0.00019744807243964597, "loss": 0.3965, "step": 1180 }, { "epoch": 0.44, "learning_rate": 0.00019740482500346779, "loss": 0.3762, "step": 1190 }, { "epoch": 0.44, "learning_rate": 0.0001973612190001304, "loss": 0.4108, "step": 1200 }, { "epoch": 0.44, "eval_loss": 0.4304308295249939, "eval_runtime": 397.6793, "eval_samples_per_second": 12.027, "eval_steps_per_second": 3.007, "step": 1200 }, { "epoch": 0.44, "learning_rate": 0.00019731725459015643, "loss": 0.3838, "step": 1210 }, { "epoch": 0.45, "learning_rate": 0.00019727293193538793, "loss": 0.3782, "step": 1220 }, { "epoch": 0.45, "learning_rate": 0.00019722825119898566, "loss": 0.4034, "step": 1230 }, { "epoch": 0.45, "learning_rate": 0.00019718321254542858, "loss": 0.391, "step": 1240 }, { "epoch": 0.46, "learning_rate": 0.0001971378161405132, "loss": 0.4082, "step": 1250 }, { "epoch": 0.46, "learning_rate": 0.0001970920621513531, "loss": 0.3772, "step": 1260 }, { "epoch": 0.47, "learning_rate": 0.00019704595074637805, "loss": 0.3894, "step": 1270 }, { "epoch": 0.47, "learning_rate": 0.00019699948209533355, "loss": 0.3882, "step": 1280 }, { "epoch": 0.47, "learning_rate": 0.00019695265636928032, "loss": 0.3782, "step": 1290 }, { "epoch": 0.48, "learning_rate": 0.0001969054737405934, "loss": 0.3578, "step": 1300 }, { "epoch": 0.48, "eval_loss": 0.42550337314605713, "eval_runtime": 402.9262, "eval_samples_per_second": 11.871, "eval_steps_per_second": 2.968, "step": 1300 }, { "epoch": 0.48, "learning_rate": 0.00019685793438296183, "loss": 0.3644, "step": 1310 }, { "epoch": 0.48, "learning_rate": 0.00019681003847138765, "loss": 0.3702, "step": 1320 }, { "epoch": 0.49, "learning_rate": 0.00019676178618218565, "loss": 0.4213, "step": 1330 }, { "epoch": 0.49, "learning_rate": 0.0001967131776929823, "loss": 0.4126, "step": 1340 }, { "epoch": 0.49, "learning_rate": 0.00019666421318271547, "loss": 0.3643, "step": 1350 }, { "epoch": 0.5, "learning_rate": 0.00019661489283163362, "loss": 0.3854, "step": 1360 }, { "epoch": 0.5, "learning_rate": 0.00019656521682129502, "loss": 0.3933, "step": 1370 }, { "epoch": 0.51, "learning_rate": 0.00019651518533456733, "loss": 0.3841, "step": 1380 }, { "epoch": 0.51, "learning_rate": 0.00019646479855562666, "loss": 0.3677, "step": 1390 }, { "epoch": 0.51, "learning_rate": 0.00019641405666995715, "loss": 0.3895, "step": 1400 }, { "epoch": 0.51, "eval_loss": 0.41961902379989624, "eval_runtime": 402.2386, "eval_samples_per_second": 11.891, "eval_steps_per_second": 2.973, "step": 1400 }, { "epoch": 0.52, "learning_rate": 0.00019636295986435003, "loss": 0.3578, "step": 1410 }, { "epoch": 0.52, "learning_rate": 0.00019631150832690318, "loss": 0.3878, "step": 1420 }, { "epoch": 0.52, "learning_rate": 0.00019625970224702025, "loss": 0.3784, "step": 1430 }, { "epoch": 0.53, "learning_rate": 0.00019620754181541008, "loss": 0.4042, "step": 1440 }, { "epoch": 0.53, "learning_rate": 0.0001961550272240859, "loss": 0.361, "step": 1450 }, { "epoch": 0.53, "learning_rate": 0.00019610215866636477, "loss": 0.3877, "step": 1460 }, { "epoch": 0.54, "learning_rate": 0.00019604893633686662, "loss": 0.3822, "step": 1470 }, { "epoch": 0.54, "learning_rate": 0.00019599536043151384, "loss": 0.3726, "step": 1480 }, { "epoch": 0.55, "learning_rate": 0.00019594143114753026, "loss": 0.3552, "step": 1490 }, { "epoch": 0.55, "learning_rate": 0.00019588714868344073, "loss": 0.3725, "step": 1500 }, { "epoch": 0.55, "eval_loss": 0.4203263521194458, "eval_runtime": 396.7561, "eval_samples_per_second": 12.055, "eval_steps_per_second": 3.014, "step": 1500 }, { "epoch": 0.55, "learning_rate": 0.00019583251323907006, "loss": 0.3993, "step": 1510 }, { "epoch": 0.56, "learning_rate": 0.0001957775250155426, "loss": 0.3557, "step": 1520 }, { "epoch": 0.56, "learning_rate": 0.0001957221842152813, "loss": 0.4027, "step": 1530 }, { "epoch": 0.56, "learning_rate": 0.00019566649104200696, "loss": 0.3547, "step": 1540 }, { "epoch": 0.57, "learning_rate": 0.00019561044570073763, "loss": 0.3978, "step": 1550 }, { "epoch": 0.57, "learning_rate": 0.00019555404839778767, "loss": 0.3919, "step": 1560 }, { "epoch": 0.58, "learning_rate": 0.00019549729934076717, "loss": 0.3713, "step": 1570 }, { "epoch": 0.58, "learning_rate": 0.00019544019873858102, "loss": 0.408, "step": 1580 }, { "epoch": 0.58, "learning_rate": 0.00019538274680142834, "loss": 0.3792, "step": 1590 }, { "epoch": 0.59, "learning_rate": 0.00019532494374080144, "loss": 0.3836, "step": 1600 }, { "epoch": 0.59, "eval_loss": 0.42036038637161255, "eval_runtime": 402.4927, "eval_samples_per_second": 11.883, "eval_steps_per_second": 2.971, "step": 1600 }, { "epoch": 0.59, "learning_rate": 0.00019526678976948525, "loss": 0.3411, "step": 1610 }, { "epoch": 0.59, "learning_rate": 0.0001952082851015565, "loss": 0.3863, "step": 1620 }, { "epoch": 0.6, "learning_rate": 0.00019514942995238287, "loss": 0.375, "step": 1630 }, { "epoch": 0.6, "learning_rate": 0.00019509022453862226, "loss": 0.3747, "step": 1640 }, { "epoch": 0.6, "learning_rate": 0.00019503066907822198, "loss": 0.36, "step": 1650 }, { "epoch": 0.61, "learning_rate": 0.00019497076379041786, "loss": 0.3919, "step": 1660 }, { "epoch": 0.61, "learning_rate": 0.00019491050889573357, "loss": 0.3699, "step": 1670 }, { "epoch": 0.62, "learning_rate": 0.00019484990461597978, "loss": 0.3829, "step": 1680 }, { "epoch": 0.62, "learning_rate": 0.00019478895117425323, "loss": 0.3589, "step": 1690 }, { "epoch": 0.62, "learning_rate": 0.00019472764879493616, "loss": 0.3784, "step": 1700 }, { "epoch": 0.62, "eval_loss": 0.4183010458946228, "eval_runtime": 404.1371, "eval_samples_per_second": 11.835, "eval_steps_per_second": 2.959, "step": 1700 }, { "epoch": 0.63, "learning_rate": 0.00019466599770369509, "loss": 0.3912, "step": 1710 }, { "epoch": 0.63, "learning_rate": 0.00019460399812748041, "loss": 0.374, "step": 1720 }, { "epoch": 0.63, "learning_rate": 0.0001945416502945253, "loss": 0.384, "step": 1730 }, { "epoch": 0.64, "learning_rate": 0.00019447895443434486, "loss": 0.3519, "step": 1740 }, { "epoch": 0.64, "learning_rate": 0.00019441591077773554, "loss": 0.3598, "step": 1750 }, { "epoch": 0.64, "learning_rate": 0.0001943525195567739, "loss": 0.3145, "step": 1760 }, { "epoch": 0.65, "learning_rate": 0.00019428878100481606, "loss": 0.3842, "step": 1770 }, { "epoch": 0.65, "learning_rate": 0.0001942246953564967, "loss": 0.3674, "step": 1780 }, { "epoch": 0.66, "learning_rate": 0.00019416026284772825, "loss": 0.3603, "step": 1790 }, { "epoch": 0.66, "learning_rate": 0.00019409548371570007, "loss": 0.369, "step": 1800 }, { "epoch": 0.66, "eval_loss": 0.4111176133155823, "eval_runtime": 402.9829, "eval_samples_per_second": 11.869, "eval_steps_per_second": 2.968, "step": 1800 }, { "epoch": 0.66, "learning_rate": 0.00019403035819887734, "loss": 0.3713, "step": 1810 }, { "epoch": 0.67, "learning_rate": 0.00019396488653700055, "loss": 0.3539, "step": 1820 }, { "epoch": 0.67, "learning_rate": 0.00019389906897108428, "loss": 0.3932, "step": 1830 }, { "epoch": 0.67, "learning_rate": 0.0001938329057434165, "loss": 0.3371, "step": 1840 }, { "epoch": 0.68, "learning_rate": 0.00019376639709755766, "loss": 0.3832, "step": 1850 }, { "epoch": 0.68, "learning_rate": 0.00019369954327833972, "loss": 0.3658, "step": 1860 }, { "epoch": 0.69, "learning_rate": 0.00019363234453186534, "loss": 0.3452, "step": 1870 }, { "epoch": 0.69, "learning_rate": 0.00019356480110550687, "loss": 0.3721, "step": 1880 }, { "epoch": 0.69, "learning_rate": 0.00019349691324790555, "loss": 0.3637, "step": 1890 }, { "epoch": 0.7, "learning_rate": 0.00019342868120897054, "loss": 0.3409, "step": 1900 }, { "epoch": 0.7, "eval_loss": 0.41198766231536865, "eval_runtime": 402.9688, "eval_samples_per_second": 11.869, "eval_steps_per_second": 2.968, "step": 1900 }, { "epoch": 0.7, "learning_rate": 0.00019336010523987796, "loss": 0.3424, "step": 1910 }, { "epoch": 0.7, "learning_rate": 0.00019329118559307, "loss": 0.3532, "step": 1920 }, { "epoch": 0.71, "learning_rate": 0.0001932219225222541, "loss": 0.3591, "step": 1930 }, { "epoch": 0.71, "learning_rate": 0.00019315231628240178, "loss": 0.3794, "step": 1940 }, { "epoch": 0.71, "learning_rate": 0.00019308236712974795, "loss": 0.3825, "step": 1950 }, { "epoch": 0.72, "learning_rate": 0.0001930120753217898, "loss": 0.3482, "step": 1960 }, { "epoch": 0.72, "learning_rate": 0.00019294144111728584, "loss": 0.3643, "step": 1970 }, { "epoch": 0.73, "learning_rate": 0.00019287046477625515, "loss": 0.3522, "step": 1980 }, { "epoch": 0.73, "learning_rate": 0.00019279914655997619, "loss": 0.3723, "step": 1990 }, { "epoch": 0.73, "learning_rate": 0.00019272748673098596, "loss": 0.388, "step": 2000 }, { "epoch": 0.73, "eval_loss": 0.41505494713783264, "eval_runtime": 404.7184, "eval_samples_per_second": 11.818, "eval_steps_per_second": 2.955, "step": 2000 }, { "epoch": 0.74, "learning_rate": 0.000192655485553079, "loss": 0.3739, "step": 2010 }, { "epoch": 0.74, "learning_rate": 0.00019258314329130641, "loss": 0.3625, "step": 2020 }, { "epoch": 0.74, "learning_rate": 0.00019251046021197496, "loss": 0.3174, "step": 2030 }, { "epoch": 0.75, "learning_rate": 0.00019243743658264593, "loss": 0.347, "step": 2040 }, { "epoch": 0.75, "learning_rate": 0.00019236407267213433, "loss": 0.3837, "step": 2050 }, { "epoch": 0.75, "learning_rate": 0.00019229036875050777, "loss": 0.3854, "step": 2060 }, { "epoch": 0.76, "learning_rate": 0.0001922163250890855, "loss": 0.3449, "step": 2070 }, { "epoch": 0.76, "learning_rate": 0.00019214194196043741, "loss": 0.3507, "step": 2080 }, { "epoch": 0.77, "learning_rate": 0.00019206721963838317, "loss": 0.3786, "step": 2090 }, { "epoch": 0.77, "learning_rate": 0.00019199215839799092, "loss": 0.3608, "step": 2100 }, { "epoch": 0.77, "eval_loss": 0.4092780351638794, "eval_runtime": 403.6583, "eval_samples_per_second": 11.849, "eval_steps_per_second": 2.963, "step": 2100 }, { "epoch": 0.77, "learning_rate": 0.0001919167585155765, "loss": 0.3233, "step": 2110 }, { "epoch": 0.78, "learning_rate": 0.00019184102026870235, "loss": 0.3679, "step": 2120 }, { "epoch": 0.78, "learning_rate": 0.0001917649439361765, "loss": 0.3426, "step": 2130 }, { "epoch": 0.78, "learning_rate": 0.00019168852979805162, "loss": 0.3272, "step": 2140 }, { "epoch": 0.79, "learning_rate": 0.00019161177813562379, "loss": 0.3572, "step": 2150 }, { "epoch": 0.79, "learning_rate": 0.0001915346892314316, "loss": 0.3429, "step": 2160 }, { "epoch": 0.8, "learning_rate": 0.0001914572633692552, "loss": 0.3325, "step": 2170 }, { "epoch": 0.8, "learning_rate": 0.00019137950083411505, "loss": 0.3444, "step": 2180 }, { "epoch": 0.8, "learning_rate": 0.000191301401912271, "loss": 0.351, "step": 2190 }, { "epoch": 0.81, "learning_rate": 0.00019122296689122123, "loss": 0.3171, "step": 2200 }, { "epoch": 0.81, "eval_loss": 0.4078885316848755, "eval_runtime": 403.041, "eval_samples_per_second": 11.867, "eval_steps_per_second": 2.967, "step": 2200 }, { "epoch": 0.81, "learning_rate": 0.0001911441960597012, "loss": 0.3626, "step": 2210 }, { "epoch": 0.81, "learning_rate": 0.0001910650897076824, "loss": 0.3529, "step": 2220 }, { "epoch": 0.82, "learning_rate": 0.00019098564812637165, "loss": 0.3727, "step": 2230 }, { "epoch": 0.82, "learning_rate": 0.0001909058716082097, "loss": 0.3111, "step": 2240 }, { "epoch": 0.82, "learning_rate": 0.0001908257604468703, "loss": 0.3846, "step": 2250 }, { "epoch": 0.83, "learning_rate": 0.00019074531493725906, "loss": 0.3612, "step": 2260 }, { "epoch": 0.83, "learning_rate": 0.0001906645353755124, "loss": 0.3573, "step": 2270 }, { "epoch": 0.84, "learning_rate": 0.00019058342205899656, "loss": 0.3902, "step": 2280 }, { "epoch": 0.84, "learning_rate": 0.0001905019752863062, "loss": 0.345, "step": 2290 }, { "epoch": 0.84, "learning_rate": 0.0001904201953572637, "loss": 0.3581, "step": 2300 }, { "epoch": 0.84, "eval_loss": 0.4118511974811554, "eval_runtime": 405.9796, "eval_samples_per_second": 11.781, "eval_steps_per_second": 2.946, "step": 2300 }, { "epoch": 0.85, "learning_rate": 0.00019033808257291768, "loss": 0.3303, "step": 2310 }, { "epoch": 0.85, "learning_rate": 0.00019025563723554223, "loss": 0.3432, "step": 2320 }, { "epoch": 0.85, "learning_rate": 0.00019017285964863554, "loss": 0.3436, "step": 2330 }, { "epoch": 0.86, "learning_rate": 0.0001900897501169189, "loss": 0.338, "step": 2340 }, { "epoch": 0.86, "learning_rate": 0.0001900063089463356, "loss": 0.3611, "step": 2350 }, { "epoch": 0.86, "learning_rate": 0.00018992253644404967, "loss": 0.3311, "step": 2360 }, { "epoch": 0.87, "learning_rate": 0.00018983843291844492, "loss": 0.3337, "step": 2370 }, { "epoch": 0.87, "learning_rate": 0.00018975399867912364, "loss": 0.358, "step": 2380 }, { "epoch": 0.88, "learning_rate": 0.0001896692340369057, "loss": 0.3802, "step": 2390 }, { "epoch": 0.88, "learning_rate": 0.00018958413930382704, "loss": 0.3389, "step": 2400 }, { "epoch": 0.88, "eval_loss": 0.41193756461143494, "eval_runtime": 402.9836, "eval_samples_per_second": 11.869, "eval_steps_per_second": 2.968, "step": 2400 }, { "epoch": 0.88, "learning_rate": 0.0001894987147931389, "loss": 0.3398, "step": 2410 }, { "epoch": 0.89, "learning_rate": 0.00018941296081930646, "loss": 0.3459, "step": 2420 }, { "epoch": 0.89, "learning_rate": 0.00018932687769800767, "loss": 0.3546, "step": 2430 }, { "epoch": 0.89, "learning_rate": 0.00018924046574613222, "loss": 0.3158, "step": 2440 }, { "epoch": 0.9, "learning_rate": 0.0001891537252817802, "loss": 0.3484, "step": 2450 }, { "epoch": 0.9, "learning_rate": 0.00018906665662426104, "loss": 0.3437, "step": 2460 }, { "epoch": 0.9, "learning_rate": 0.0001889792600940924, "loss": 0.3637, "step": 2470 }, { "epoch": 0.91, "learning_rate": 0.00018889153601299888, "loss": 0.3658, "step": 2480 }, { "epoch": 0.91, "learning_rate": 0.00018880348470391077, "loss": 0.3452, "step": 2490 }, { "epoch": 0.92, "learning_rate": 0.000188715106490963, "loss": 0.3302, "step": 2500 }, { "epoch": 0.92, "eval_loss": 0.4021553695201874, "eval_runtime": 399.2274, "eval_samples_per_second": 11.981, "eval_steps_per_second": 2.996, "step": 2500 }, { "epoch": 0.92, "learning_rate": 0.000188626401699494, "loss": 0.3511, "step": 2510 }, { "epoch": 0.92, "learning_rate": 0.00018853737065604426, "loss": 0.3232, "step": 2520 }, { "epoch": 0.93, "learning_rate": 0.00018844801368835532, "loss": 0.366, "step": 2530 }, { "epoch": 0.93, "learning_rate": 0.00018835833112536857, "loss": 0.3151, "step": 2540 }, { "epoch": 0.93, "learning_rate": 0.0001882683232972239, "loss": 0.3534, "step": 2550 }, { "epoch": 0.94, "learning_rate": 0.00018817799053525862, "loss": 0.3779, "step": 2560 }, { "epoch": 0.94, "learning_rate": 0.0001880873331720062, "loss": 0.3338, "step": 2570 }, { "epoch": 0.95, "learning_rate": 0.00018799635154119495, "loss": 0.3243, "step": 2580 }, { "epoch": 0.95, "learning_rate": 0.00018790504597774698, "loss": 0.3729, "step": 2590 }, { "epoch": 0.95, "learning_rate": 0.0001878134168177768, "loss": 0.3553, "step": 2600 }, { "epoch": 0.95, "eval_loss": 0.4054717421531677, "eval_runtime": 400.3737, "eval_samples_per_second": 11.946, "eval_steps_per_second": 2.987, "step": 2600 }, { "epoch": 0.96, "learning_rate": 0.00018772146439859015, "loss": 0.3268, "step": 2610 }, { "epoch": 0.96, "learning_rate": 0.00018762918905868277, "loss": 0.3304, "step": 2620 }, { "epoch": 0.96, "learning_rate": 0.00018753659113773913, "loss": 0.3606, "step": 2630 }, { "epoch": 0.97, "learning_rate": 0.0001874436709766312, "loss": 0.3421, "step": 2640 }, { "epoch": 0.97, "learning_rate": 0.00018735042891741718, "loss": 0.3629, "step": 2650 }, { "epoch": 0.97, "learning_rate": 0.0001872568653033402, "loss": 0.3636, "step": 2660 }, { "epoch": 0.98, "learning_rate": 0.00018716298047882714, "loss": 0.3485, "step": 2670 }, { "epoch": 0.98, "learning_rate": 0.00018706877478948735, "loss": 0.3169, "step": 2680 }, { "epoch": 0.99, "learning_rate": 0.00018697424858211126, "loss": 0.306, "step": 2690 }, { "epoch": 0.99, "learning_rate": 0.0001868794022046693, "loss": 0.3586, "step": 2700 }, { "epoch": 0.99, "eval_loss": 0.4048784375190735, "eval_runtime": 400.9367, "eval_samples_per_second": 11.93, "eval_steps_per_second": 2.983, "step": 2700 }, { "epoch": 0.99, "learning_rate": 0.00018678423600631042, "loss": 0.3311, "step": 2710 }, { "epoch": 1.0, "learning_rate": 0.00018668875033736094, "loss": 0.3066, "step": 2720 }, { "epoch": 1.0, "learning_rate": 0.00018659294554932324, "loss": 0.338, "step": 2730 }, { "epoch": 1.0, "learning_rate": 0.00018649682199487437, "loss": 0.3205, "step": 2740 }, { "epoch": 1.01, "learning_rate": 0.0001864003800278649, "loss": 0.3176, "step": 2750 }, { "epoch": 1.01, "learning_rate": 0.00018630362000331753, "loss": 0.2986, "step": 2760 }, { "epoch": 1.01, "learning_rate": 0.00018620654227742572, "loss": 0.3175, "step": 2770 }, { "epoch": 1.02, "learning_rate": 0.0001861091472075526, "loss": 0.3174, "step": 2780 }, { "epoch": 1.02, "learning_rate": 0.0001860114351522293, "loss": 0.2752, "step": 2790 }, { "epoch": 1.03, "learning_rate": 0.00018591340647115402, "loss": 0.3014, "step": 2800 }, { "epoch": 1.03, "eval_loss": 0.41257622838020325, "eval_runtime": 223.7363, "eval_samples_per_second": 21.378, "eval_steps_per_second": 5.346, "step": 2800 }, { "epoch": 1.03, "learning_rate": 0.0001858150615251905, "loss": 0.3013, "step": 2810 }, { "epoch": 1.03, "learning_rate": 0.00018571640067636662, "loss": 0.3364, "step": 2820 }, { "epoch": 1.04, "learning_rate": 0.00018561742428787324, "loss": 0.3159, "step": 2830 }, { "epoch": 1.04, "learning_rate": 0.00018552807605292504, "loss": 0.3037, "step": 2840 }, { "epoch": 1.04, "learning_rate": 0.00018542850114380946, "loss": 0.3087, "step": 2850 }, { "epoch": 1.05, "learning_rate": 0.00018532861175484162, "loss": 0.2858, "step": 2860 }, { "epoch": 1.05, "learning_rate": 0.00018522840825373492, "loss": 0.3208, "step": 2870 }, { "epoch": 1.06, "learning_rate": 0.00018512789100935906, "loss": 0.3117, "step": 2880 }, { "epoch": 1.06, "learning_rate": 0.00018502706039173856, "loss": 0.2921, "step": 2890 }, { "epoch": 1.06, "learning_rate": 0.0001849259167720517, "loss": 0.3354, "step": 2900 }, { "epoch": 1.06, "eval_loss": 0.4081941545009613, "eval_runtime": 223.8258, "eval_samples_per_second": 21.369, "eval_steps_per_second": 5.343, "step": 2900 }, { "epoch": 1.07, "learning_rate": 0.0001848244605226289, "loss": 0.2866, "step": 2910 }, { "epoch": 1.07, "learning_rate": 0.0001847226920169514, "loss": 0.308, "step": 2920 }, { "epoch": 1.07, "learning_rate": 0.00018462061162965, "loss": 0.3142, "step": 2930 }, { "epoch": 1.08, "learning_rate": 0.0001845182197365036, "loss": 0.3125, "step": 2940 }, { "epoch": 1.08, "learning_rate": 0.00018441551671443768, "loss": 0.307, "step": 2950 }, { "epoch": 1.08, "learning_rate": 0.00018431250294152323, "loss": 0.2804, "step": 2960 }, { "epoch": 1.09, "learning_rate": 0.00018420917879697507, "loss": 0.304, "step": 2970 }, { "epoch": 1.09, "learning_rate": 0.0001841055446611506, "loss": 0.3152, "step": 2980 }, { "epoch": 1.1, "learning_rate": 0.00018400160091554835, "loss": 0.3095, "step": 2990 }, { "epoch": 1.1, "learning_rate": 0.0001838973479428066, "loss": 0.2954, "step": 3000 }, { "epoch": 1.1, "eval_loss": 0.4158288836479187, "eval_runtime": 223.8063, "eval_samples_per_second": 21.371, "eval_steps_per_second": 5.344, "step": 3000 }, { "epoch": 1.1, "learning_rate": 0.00018379278612670193, "loss": 0.3167, "step": 3010 }, { "epoch": 1.11, "learning_rate": 0.00018368791585214784, "loss": 0.2872, "step": 3020 }, { "epoch": 1.11, "learning_rate": 0.00018358273750519337, "loss": 0.2999, "step": 3030 }, { "epoch": 1.11, "learning_rate": 0.00018347725147302158, "loss": 0.3229, "step": 3040 }, { "epoch": 1.12, "learning_rate": 0.00018337145814394825, "loss": 0.3332, "step": 3050 }, { "epoch": 1.12, "learning_rate": 0.0001832653579074203, "loss": 0.2818, "step": 3060 }, { "epoch": 1.12, "learning_rate": 0.00018315895115401457, "loss": 0.3198, "step": 3070 }, { "epoch": 1.13, "learning_rate": 0.00018305223827543604, "loss": 0.3504, "step": 3080 }, { "epoch": 1.13, "learning_rate": 0.0001829452196645168, "loss": 0.3392, "step": 3090 }, { "epoch": 1.14, "learning_rate": 0.00018283789571521436, "loss": 0.3023, "step": 3100 }, { "epoch": 1.14, "eval_loss": 0.40502193570137024, "eval_runtime": 223.8877, "eval_samples_per_second": 21.363, "eval_steps_per_second": 5.342, "step": 3100 }, { "epoch": 1.14, "learning_rate": 0.00018273026682261013, "loss": 0.3164, "step": 3110 }, { "epoch": 1.14, "learning_rate": 0.0001826223333829082, "loss": 0.2985, "step": 3120 }, { "epoch": 1.15, "learning_rate": 0.00018251409579343375, "loss": 0.2992, "step": 3130 }, { "epoch": 1.15, "learning_rate": 0.00018240555445263153, "loss": 0.3174, "step": 3140 }, { "epoch": 1.15, "learning_rate": 0.00018229670976006453, "loss": 0.3025, "step": 3150 }, { "epoch": 1.16, "learning_rate": 0.00018218756211641236, "loss": 0.3297, "step": 3160 }, { "epoch": 1.16, "learning_rate": 0.00018207811192346996, "loss": 0.3139, "step": 3170 }, { "epoch": 1.17, "learning_rate": 0.00018196835958414598, "loss": 0.2956, "step": 3180 }, { "epoch": 1.17, "learning_rate": 0.00018185830550246124, "loss": 0.3189, "step": 3190 }, { "epoch": 1.17, "learning_rate": 0.00018174795008354743, "loss": 0.2896, "step": 3200 }, { "epoch": 1.17, "eval_loss": 0.40529006719589233, "eval_runtime": 223.8738, "eval_samples_per_second": 21.365, "eval_steps_per_second": 5.342, "step": 3200 }, { "epoch": 1.18, "learning_rate": 0.00018163729373364554, "loss": 0.3085, "step": 3210 }, { "epoch": 1.18, "learning_rate": 0.0001815263368601043, "loss": 0.316, "step": 3220 }, { "epoch": 1.18, "learning_rate": 0.00018141507987137873, "loss": 0.326, "step": 3230 }, { "epoch": 1.19, "learning_rate": 0.00018130352317702865, "loss": 0.293, "step": 3240 }, { "epoch": 1.19, "learning_rate": 0.00018119166718771716, "loss": 0.2887, "step": 3250 }, { "epoch": 1.19, "learning_rate": 0.00018107951231520911, "loss": 0.3266, "step": 3260 }, { "epoch": 1.2, "learning_rate": 0.00018096705897236966, "loss": 0.3327, "step": 3270 }, { "epoch": 1.2, "learning_rate": 0.00018085430757316256, "loss": 0.2991, "step": 3280 }, { "epoch": 1.21, "learning_rate": 0.00018074125853264898, "loss": 0.3076, "step": 3290 }, { "epoch": 1.21, "learning_rate": 0.00018062791226698558, "loss": 0.3339, "step": 3300 }, { "epoch": 1.21, "eval_loss": 0.4053775668144226, "eval_runtime": 223.9954, "eval_samples_per_second": 21.353, "eval_steps_per_second": 5.339, "step": 3300 }, { "epoch": 1.21, "learning_rate": 0.00018051426919342317, "loss": 0.2935, "step": 3310 }, { "epoch": 1.22, "learning_rate": 0.00018040032973030536, "loss": 0.3497, "step": 3320 }, { "epoch": 1.22, "learning_rate": 0.00018028609429706664, "loss": 0.2897, "step": 3330 }, { "epoch": 1.22, "learning_rate": 0.00018017156331423114, "loss": 0.312, "step": 3340 }, { "epoch": 1.23, "learning_rate": 0.00018005673720341086, "loss": 0.303, "step": 3350 }, { "epoch": 1.23, "learning_rate": 0.00017994161638730432, "loss": 0.3253, "step": 3360 }, { "epoch": 1.23, "learning_rate": 0.00017982620128969488, "loss": 0.3322, "step": 3370 }, { "epoch": 1.24, "learning_rate": 0.0001797104923354492, "loss": 0.2988, "step": 3380 }, { "epoch": 1.24, "learning_rate": 0.00017959448995051575, "loss": 0.3094, "step": 3390 }, { "epoch": 1.25, "learning_rate": 0.00017947819456192306, "loss": 0.3118, "step": 3400 }, { "epoch": 1.25, "eval_loss": 0.3964312672615051, "eval_runtime": 223.8688, "eval_samples_per_second": 21.365, "eval_steps_per_second": 5.342, "step": 3400 }, { "epoch": 1.25, "learning_rate": 0.00017936160659777833, "loss": 0.2958, "step": 3410 }, { "epoch": 1.25, "learning_rate": 0.00017924472648726583, "loss": 0.3119, "step": 3420 }, { "epoch": 1.26, "learning_rate": 0.00017912755466064525, "loss": 0.3096, "step": 3430 }, { "epoch": 1.26, "learning_rate": 0.00017901009154925007, "loss": 0.2778, "step": 3440 }, { "epoch": 1.26, "learning_rate": 0.00017889233758548625, "loss": 0.2953, "step": 3450 }, { "epoch": 1.27, "learning_rate": 0.00017877429320283016, "loss": 0.3063, "step": 3460 }, { "epoch": 1.27, "learning_rate": 0.0001786559588358275, "loss": 0.2945, "step": 3470 }, { "epoch": 1.27, "learning_rate": 0.00017853733492009135, "loss": 0.3027, "step": 3480 }, { "epoch": 1.28, "learning_rate": 0.0001784184218923007, "loss": 0.2654, "step": 3490 }, { "epoch": 1.28, "learning_rate": 0.0001782992201901988, "loss": 0.3289, "step": 3500 }, { "epoch": 1.28, "eval_loss": 0.39914268255233765, "eval_runtime": 223.8716, "eval_samples_per_second": 21.365, "eval_steps_per_second": 5.342, "step": 3500 }, { "epoch": 1.29, "learning_rate": 0.0001781797302525916, "loss": 0.2894, "step": 3510 }, { "epoch": 1.29, "learning_rate": 0.00017805995251934614, "loss": 0.2776, "step": 3520 }, { "epoch": 1.29, "learning_rate": 0.00017793988743138877, "loss": 0.288, "step": 3530 }, { "epoch": 1.3, "learning_rate": 0.00017781953543070372, "loss": 0.3229, "step": 3540 }, { "epoch": 1.3, "learning_rate": 0.00017769889696033154, "loss": 0.285, "step": 3550 }, { "epoch": 1.3, "learning_rate": 0.0001775779724643671, "loss": 0.321, "step": 3560 }, { "epoch": 1.31, "learning_rate": 0.0001774567623879583, "loss": 0.3222, "step": 3570 }, { "epoch": 1.31, "learning_rate": 0.00017733526717730435, "loss": 0.3087, "step": 3580 }, { "epoch": 1.32, "learning_rate": 0.00017721348727965408, "loss": 0.2904, "step": 3590 }, { "epoch": 1.32, "learning_rate": 0.00017709142314330424, "loss": 0.2984, "step": 3600 }, { "epoch": 1.32, "eval_loss": 0.40588298439979553, "eval_runtime": 223.9372, "eval_samples_per_second": 21.359, "eval_steps_per_second": 5.341, "step": 3600 }, { "epoch": 1.32, "learning_rate": 0.00017696907521759804, "loss": 0.3216, "step": 3610 }, { "epoch": 1.33, "learning_rate": 0.00017684644395292326, "loss": 0.3019, "step": 3620 }, { "epoch": 1.33, "learning_rate": 0.00017672352980071078, "loss": 0.3272, "step": 3630 }, { "epoch": 1.33, "learning_rate": 0.00017660033321343285, "loss": 0.2892, "step": 3640 }, { "epoch": 1.34, "learning_rate": 0.0001764768546446014, "loss": 0.2921, "step": 3650 }, { "epoch": 1.34, "learning_rate": 0.00017635309454876636, "loss": 0.3105, "step": 3660 }, { "epoch": 1.34, "learning_rate": 0.00017622905338151408, "loss": 0.3205, "step": 3670 }, { "epoch": 1.35, "learning_rate": 0.00017610473159946556, "loss": 0.2838, "step": 3680 }, { "epoch": 1.35, "learning_rate": 0.00017598012966027482, "loss": 0.2762, "step": 3690 }, { "epoch": 1.36, "learning_rate": 0.0001758552480226271, "loss": 0.3277, "step": 3700 }, { "epoch": 1.36, "eval_loss": 0.3979549705982208, "eval_runtime": 223.8768, "eval_samples_per_second": 21.364, "eval_steps_per_second": 5.342, "step": 3700 }, { "epoch": 1.36, "learning_rate": 0.00017573008714623746, "loss": 0.2709, "step": 3710 }, { "epoch": 1.36, "learning_rate": 0.00017560464749184876, "loss": 0.2918, "step": 3720 }, { "epoch": 1.37, "learning_rate": 0.00017547892952123005, "loss": 0.3098, "step": 3730 }, { "epoch": 1.37, "learning_rate": 0.00017535293369717506, "loss": 0.3233, "step": 3740 }, { "epoch": 1.37, "learning_rate": 0.00017522666048350023, "loss": 0.31, "step": 3750 }, { "epoch": 1.38, "learning_rate": 0.00017510011034504324, "loss": 0.3337, "step": 3760 }, { "epoch": 1.38, "learning_rate": 0.00017497328374766112, "loss": 0.2845, "step": 3770 }, { "epoch": 1.38, "learning_rate": 0.00017484618115822857, "loss": 0.3073, "step": 3780 }, { "epoch": 1.39, "learning_rate": 0.00017471880304463638, "loss": 0.2977, "step": 3790 }, { "epoch": 1.39, "learning_rate": 0.0001745911498757895, "loss": 0.3011, "step": 3800 }, { "epoch": 1.39, "eval_loss": 0.4045478105545044, "eval_runtime": 223.7983, "eval_samples_per_second": 21.372, "eval_steps_per_second": 5.344, "step": 3800 }, { "epoch": 1.4, "learning_rate": 0.00017446322212160545, "loss": 0.3029, "step": 3810 }, { "epoch": 1.4, "learning_rate": 0.0001743350202530126, "loss": 0.3017, "step": 3820 }, { "epoch": 1.4, "learning_rate": 0.00017420654474194832, "loss": 0.3179, "step": 3830 }, { "epoch": 1.41, "learning_rate": 0.00017407779606135732, "loss": 0.2728, "step": 3840 }, { "epoch": 1.41, "learning_rate": 0.00017394877468518996, "loss": 0.286, "step": 3850 }, { "epoch": 1.41, "learning_rate": 0.00017381948108840042, "loss": 0.3009, "step": 3860 }, { "epoch": 1.42, "learning_rate": 0.00017368991574694495, "loss": 0.2946, "step": 3870 }, { "epoch": 1.42, "learning_rate": 0.0001735600791377802, "loss": 0.3229, "step": 3880 }, { "epoch": 1.43, "learning_rate": 0.00017342997173886134, "loss": 0.3372, "step": 3890 }, { "epoch": 1.43, "learning_rate": 0.00017329959402914046, "loss": 0.3194, "step": 3900 }, { "epoch": 1.43, "eval_loss": 0.40152063965797424, "eval_runtime": 223.8706, "eval_samples_per_second": 21.365, "eval_steps_per_second": 5.342, "step": 3900 }, { "epoch": 1.43, "learning_rate": 0.0001731689464885647, "loss": 0.2674, "step": 3910 }, { "epoch": 1.44, "learning_rate": 0.00017303802959807443, "loss": 0.2558, "step": 3920 }, { "epoch": 1.44, "learning_rate": 0.0001729068438396016, "loss": 0.3186, "step": 3930 }, { "epoch": 1.44, "learning_rate": 0.00017277538969606793, "loss": 0.3298, "step": 3940 }, { "epoch": 1.45, "learning_rate": 0.00017264366765138317, "loss": 0.2991, "step": 3950 }, { "epoch": 1.45, "learning_rate": 0.00017251167819044315, "loss": 0.2652, "step": 3960 }, { "epoch": 1.45, "learning_rate": 0.0001723794217991282, "loss": 0.3177, "step": 3970 }, { "epoch": 1.46, "learning_rate": 0.00017224689896430117, "loss": 0.3045, "step": 3980 }, { "epoch": 1.46, "learning_rate": 0.00017211411017380594, "loss": 0.3146, "step": 3990 }, { "epoch": 1.47, "learning_rate": 0.00017198105591646528, "loss": 0.2921, "step": 4000 }, { "epoch": 1.47, "eval_loss": 0.400907427072525, "eval_runtime": 224.0601, "eval_samples_per_second": 21.347, "eval_steps_per_second": 5.338, "step": 4000 }, { "epoch": 1.47, "learning_rate": 0.00017184773668207917, "loss": 0.3441, "step": 4010 }, { "epoch": 1.47, "learning_rate": 0.00017171415296142315, "loss": 0.2876, "step": 4020 }, { "epoch": 1.48, "learning_rate": 0.00017158030524624626, "loss": 0.2755, "step": 4030 }, { "epoch": 1.48, "learning_rate": 0.0001714461940292695, "loss": 0.3169, "step": 4040 }, { "epoch": 1.48, "learning_rate": 0.00017131181980418374, "loss": 0.2844, "step": 4050 }, { "epoch": 1.49, "learning_rate": 0.00017117718306564812, "loss": 0.2821, "step": 4060 }, { "epoch": 1.49, "learning_rate": 0.00017104228430928805, "loss": 0.3002, "step": 4070 }, { "epoch": 1.49, "learning_rate": 0.00017090712403169364, "loss": 0.3232, "step": 4080 }, { "epoch": 1.5, "learning_rate": 0.00017077170273041757, "loss": 0.3135, "step": 4090 }, { "epoch": 1.5, "learning_rate": 0.00017063602090397346, "loss": 0.2917, "step": 4100 }, { "epoch": 1.5, "eval_loss": 0.4019235670566559, "eval_runtime": 224.3869, "eval_samples_per_second": 21.316, "eval_steps_per_second": 5.33, "step": 4100 }, { "epoch": 1.51, "learning_rate": 0.00017050007905183398, "loss": 0.3089, "step": 4110 }, { "epoch": 1.51, "learning_rate": 0.000170363877674429, "loss": 0.3063, "step": 4120 }, { "epoch": 1.51, "learning_rate": 0.00017022741727314373, "loss": 0.3056, "step": 4130 }, { "epoch": 1.52, "learning_rate": 0.00017009069835031694, "loss": 0.3168, "step": 4140 }, { "epoch": 1.52, "learning_rate": 0.00016995372140923907, "loss": 0.3062, "step": 4150 }, { "epoch": 1.52, "learning_rate": 0.00016981648695415033, "loss": 0.2744, "step": 4160 }, { "epoch": 1.53, "learning_rate": 0.00016967899549023895, "loss": 0.2877, "step": 4170 }, { "epoch": 1.53, "learning_rate": 0.00016954124752363922, "loss": 0.3173, "step": 4180 }, { "epoch": 1.54, "learning_rate": 0.00016940324356142972, "loss": 0.2954, "step": 4190 }, { "epoch": 1.54, "learning_rate": 0.00016926498411163135, "loss": 0.2792, "step": 4200 }, { "epoch": 1.54, "eval_loss": 0.4046282172203064, "eval_runtime": 224.0328, "eval_samples_per_second": 21.35, "eval_steps_per_second": 5.339, "step": 4200 }, { "epoch": 1.54, "learning_rate": 0.00016912646968320552, "loss": 0.3168, "step": 4210 }, { "epoch": 1.55, "learning_rate": 0.00016898770078605226, "loss": 0.2854, "step": 4220 }, { "epoch": 1.55, "learning_rate": 0.00016884867793100843, "loss": 0.2979, "step": 4230 }, { "epoch": 1.55, "learning_rate": 0.00016870940162984566, "loss": 0.2656, "step": 4240 }, { "epoch": 1.56, "learning_rate": 0.00016856987239526863, "loss": 0.3169, "step": 4250 }, { "epoch": 1.56, "learning_rate": 0.00016843009074091306, "loss": 0.2872, "step": 4260 }, { "epoch": 1.56, "learning_rate": 0.00016829005718134397, "loss": 0.2954, "step": 4270 }, { "epoch": 1.57, "learning_rate": 0.00016814977223205362, "loss": 0.2885, "step": 4280 }, { "epoch": 1.57, "learning_rate": 0.00016800923640945974, "loss": 0.2927, "step": 4290 }, { "epoch": 1.58, "learning_rate": 0.0001678684502309035, "loss": 0.2886, "step": 4300 }, { "epoch": 1.58, "eval_loss": 0.4055146872997284, "eval_runtime": 224.1142, "eval_samples_per_second": 21.342, "eval_steps_per_second": 5.337, "step": 4300 }, { "epoch": 1.58, "learning_rate": 0.00016772741421464772, "loss": 0.3042, "step": 4310 }, { "epoch": 1.58, "learning_rate": 0.00016758612887987498, "loss": 0.2858, "step": 4320 }, { "epoch": 1.59, "learning_rate": 0.00016744459474668557, "loss": 0.3072, "step": 4330 }, { "epoch": 1.59, "learning_rate": 0.00016730281233609572, "loss": 0.3126, "step": 4340 }, { "epoch": 1.59, "learning_rate": 0.00016716078217003557, "loss": 0.2748, "step": 4350 }, { "epoch": 1.6, "learning_rate": 0.00016701850477134734, "loss": 0.2873, "step": 4360 }, { "epoch": 1.6, "learning_rate": 0.00016687598066378336, "loss": 0.3051, "step": 4370 }, { "epoch": 1.6, "learning_rate": 0.00016673321037200407, "loss": 0.3077, "step": 4380 }, { "epoch": 1.61, "learning_rate": 0.0001665901944215763, "loss": 0.2708, "step": 4390 }, { "epoch": 1.61, "learning_rate": 0.00016644693333897108, "loss": 0.2947, "step": 4400 }, { "epoch": 1.61, "eval_loss": 0.40514013171195984, "eval_runtime": 224.1094, "eval_samples_per_second": 21.342, "eval_steps_per_second": 5.337, "step": 4400 }, { "epoch": 1.62, "learning_rate": 0.0001663034276515619, "loss": 0.306, "step": 4410 }, { "epoch": 1.62, "learning_rate": 0.00016615967788762261, "loss": 0.3076, "step": 4420 }, { "epoch": 1.62, "learning_rate": 0.00016601568457632566, "loss": 0.2895, "step": 4430 }, { "epoch": 1.63, "learning_rate": 0.00016587144824773992, "loss": 0.2885, "step": 4440 }, { "epoch": 1.63, "learning_rate": 0.00016572696943282903, "loss": 0.3046, "step": 4450 }, { "epoch": 1.63, "learning_rate": 0.00016558224866344907, "loss": 0.3037, "step": 4460 }, { "epoch": 1.64, "learning_rate": 0.000165437286472347, "loss": 0.28, "step": 4470 }, { "epoch": 1.64, "learning_rate": 0.00016529208339315833, "loss": 0.2903, "step": 4480 }, { "epoch": 1.64, "learning_rate": 0.00016514663996040544, "loss": 0.2849, "step": 4490 }, { "epoch": 1.65, "learning_rate": 0.00016500095670949548, "loss": 0.2975, "step": 4500 }, { "epoch": 1.65, "eval_loss": 0.4067119359970093, "eval_runtime": 224.4914, "eval_samples_per_second": 21.306, "eval_steps_per_second": 5.328, "step": 4500 }, { "epoch": 1.65, "learning_rate": 0.00016485503417671836, "loss": 0.3115, "step": 4510 }, { "epoch": 1.66, "learning_rate": 0.00016470887289924492, "loss": 0.3409, "step": 4520 }, { "epoch": 1.66, "learning_rate": 0.00016456247341512485, "loss": 0.2604, "step": 4530 }, { "epoch": 1.66, "learning_rate": 0.00016441583626328467, "loss": 0.3226, "step": 4540 }, { "epoch": 1.67, "learning_rate": 0.00016426896198352587, "loss": 0.2816, "step": 4550 }, { "epoch": 1.67, "learning_rate": 0.00016412185111652278, "loss": 0.319, "step": 4560 }, { "epoch": 1.67, "learning_rate": 0.00016397450420382076, "loss": 0.2831, "step": 4570 }, { "epoch": 1.68, "learning_rate": 0.00016382692178783402, "loss": 0.2974, "step": 4580 }, { "epoch": 1.68, "learning_rate": 0.00016367910441184374, "loss": 0.2772, "step": 4590 }, { "epoch": 1.69, "learning_rate": 0.00016353105261999605, "loss": 0.3091, "step": 4600 }, { "epoch": 1.69, "eval_loss": 0.3946963846683502, "eval_runtime": 224.3647, "eval_samples_per_second": 21.318, "eval_steps_per_second": 5.331, "step": 4600 }, { "epoch": 1.69, "learning_rate": 0.00016338276695729994, "loss": 0.286, "step": 4610 }, { "epoch": 1.69, "learning_rate": 0.00016323424796962544, "loss": 0.2721, "step": 4620 }, { "epoch": 1.7, "learning_rate": 0.0001630854962037014, "loss": 0.2983, "step": 4630 }, { "epoch": 1.7, "learning_rate": 0.00016293651220711364, "loss": 0.2875, "step": 4640 }, { "epoch": 1.7, "learning_rate": 0.0001627872965283028, "loss": 0.2859, "step": 4650 }, { "epoch": 1.71, "learning_rate": 0.00016263784971656247, "loss": 0.2603, "step": 4660 }, { "epoch": 1.71, "learning_rate": 0.00016248817232203698, "loss": 0.3231, "step": 4670 }, { "epoch": 1.71, "learning_rate": 0.00016233826489571963, "loss": 0.2927, "step": 4680 }, { "epoch": 1.72, "learning_rate": 0.00016218812798945038, "loss": 0.297, "step": 4690 }, { "epoch": 1.72, "learning_rate": 0.00016203776215591403, "loss": 0.2908, "step": 4700 }, { "epoch": 1.72, "eval_loss": 0.403292179107666, "eval_runtime": 224.3888, "eval_samples_per_second": 21.316, "eval_steps_per_second": 5.33, "step": 4700 }, { "epoch": 1.73, "learning_rate": 0.0001618871679486381, "loss": 0.3094, "step": 4710 }, { "epoch": 1.73, "learning_rate": 0.00016173634592199076, "loss": 0.29, "step": 4720 }, { "epoch": 1.73, "learning_rate": 0.00016158529663117888, "loss": 0.3115, "step": 4730 }, { "epoch": 1.74, "learning_rate": 0.00016143402063224584, "loss": 0.3139, "step": 4740 }, { "epoch": 1.74, "learning_rate": 0.00016128251848206976, "loss": 0.302, "step": 4750 }, { "epoch": 1.74, "learning_rate": 0.00016113079073836107, "loss": 0.2777, "step": 4760 }, { "epoch": 1.75, "learning_rate": 0.0001609788379596608, "loss": 0.2906, "step": 4770 }, { "epoch": 1.75, "learning_rate": 0.00016082666070533832, "loss": 0.3028, "step": 4780 }, { "epoch": 1.75, "learning_rate": 0.0001606742595355893, "loss": 0.3159, "step": 4790 }, { "epoch": 1.76, "learning_rate": 0.0001605216350114338, "loss": 0.2864, "step": 4800 }, { "epoch": 1.76, "eval_loss": 0.40955105423927307, "eval_runtime": 224.3793, "eval_samples_per_second": 21.317, "eval_steps_per_second": 5.33, "step": 4800 }, { "epoch": 1.76, "learning_rate": 0.00016036878769471401, "loss": 0.3017, "step": 4810 }, { "epoch": 1.77, "learning_rate": 0.00016021571814809227, "loss": 0.2986, "step": 4820 }, { "epoch": 1.77, "learning_rate": 0.000160062426935049, "loss": 0.3077, "step": 4830 }, { "epoch": 1.77, "learning_rate": 0.00015990891461988065, "loss": 0.2758, "step": 4840 }, { "epoch": 1.78, "learning_rate": 0.00015975518176769755, "loss": 0.2592, "step": 4850 }, { "epoch": 1.78, "learning_rate": 0.0001596012289444219, "loss": 0.3021, "step": 4860 }, { "epoch": 1.78, "learning_rate": 0.00015944705671678565, "loss": 0.3047, "step": 4870 }, { "epoch": 1.79, "learning_rate": 0.00015929266565232846, "loss": 0.2785, "step": 4880 }, { "epoch": 1.79, "learning_rate": 0.00015913805631939546, "loss": 0.2839, "step": 4890 }, { "epoch": 1.8, "learning_rate": 0.00015898322928713544, "loss": 0.2788, "step": 4900 }, { "epoch": 1.8, "eval_loss": 0.40693148970603943, "eval_runtime": 224.6917, "eval_samples_per_second": 21.287, "eval_steps_per_second": 5.323, "step": 4900 }, { "epoch": 1.8, "learning_rate": 0.00015882818512549836, "loss": 0.2875, "step": 4910 }, { "epoch": 1.8, "learning_rate": 0.00015867292440523378, "loss": 0.2907, "step": 4920 }, { "epoch": 1.81, "learning_rate": 0.00015851744769788818, "loss": 0.2776, "step": 4930 }, { "epoch": 1.81, "learning_rate": 0.00015836175557580324, "loss": 0.2608, "step": 4940 }, { "epoch": 1.81, "learning_rate": 0.00015820584861211368, "loss": 0.2945, "step": 4950 }, { "epoch": 1.82, "learning_rate": 0.000158049727380745, "loss": 0.2771, "step": 4960 }, { "epoch": 1.82, "learning_rate": 0.0001578933924564115, "loss": 0.2802, "step": 4970 }, { "epoch": 1.82, "learning_rate": 0.0001577368444146142, "loss": 0.2849, "step": 4980 }, { "epoch": 1.83, "learning_rate": 0.00015758008383163854, "loss": 0.3013, "step": 4990 }, { "epoch": 1.83, "learning_rate": 0.0001574231112845524, "loss": 0.2942, "step": 5000 }, { "epoch": 1.83, "eval_loss": 0.399748831987381, "eval_runtime": 224.6973, "eval_samples_per_second": 21.286, "eval_steps_per_second": 5.323, "step": 5000 }, { "epoch": 1.84, "learning_rate": 0.00015726592735120393, "loss": 0.2883, "step": 5010 }, { "epoch": 1.84, "learning_rate": 0.0001571085326102195, "loss": 0.2639, "step": 5020 }, { "epoch": 1.84, "learning_rate": 0.0001569509276410015, "loss": 0.2624, "step": 5030 }, { "epoch": 1.85, "learning_rate": 0.00015679311302372614, "loss": 0.2823, "step": 5040 }, { "epoch": 1.85, "learning_rate": 0.0001566350893393414, "loss": 0.3026, "step": 5050 }, { "epoch": 1.85, "learning_rate": 0.00015647685716956494, "loss": 0.2803, "step": 5060 }, { "epoch": 1.86, "learning_rate": 0.00015631841709688184, "loss": 0.2991, "step": 5070 }, { "epoch": 1.86, "learning_rate": 0.00015615976970454257, "loss": 0.2734, "step": 5080 }, { "epoch": 1.86, "learning_rate": 0.00015600091557656072, "loss": 0.2674, "step": 5090 }, { "epoch": 1.87, "learning_rate": 0.000155841855297711, "loss": 0.2736, "step": 5100 }, { "epoch": 1.87, "eval_loss": 0.40985968708992004, "eval_runtime": 224.6419, "eval_samples_per_second": 21.292, "eval_steps_per_second": 5.324, "step": 5100 }, { "epoch": 1.87, "learning_rate": 0.0001556825894535269, "loss": 0.2828, "step": 5110 }, { "epoch": 1.88, "learning_rate": 0.00015552311863029875, "loss": 0.2872, "step": 5120 }, { "epoch": 1.88, "learning_rate": 0.00015536344341507129, "loss": 0.2669, "step": 5130 }, { "epoch": 1.88, "learning_rate": 0.0001552035643956419, "loss": 0.2708, "step": 5140 }, { "epoch": 1.89, "learning_rate": 0.00015504348216055798, "loss": 0.3048, "step": 5150 }, { "epoch": 1.89, "learning_rate": 0.00015488319729911512, "loss": 0.2964, "step": 5160 }, { "epoch": 1.89, "learning_rate": 0.00015472271040135483, "loss": 0.266, "step": 5170 }, { "epoch": 1.9, "learning_rate": 0.00015456202205806234, "loss": 0.2885, "step": 5180 }, { "epoch": 1.9, "learning_rate": 0.0001544011328607644, "loss": 0.3317, "step": 5190 }, { "epoch": 1.91, "learning_rate": 0.00015424004340172719, "loss": 0.2905, "step": 5200 }, { "epoch": 1.91, "eval_loss": 0.39825478196144104, "eval_runtime": 224.8976, "eval_samples_per_second": 21.267, "eval_steps_per_second": 5.318, "step": 5200 }, { "epoch": 1.91, "learning_rate": 0.0001540787542739541, "loss": 0.2663, "step": 5210 }, { "epoch": 1.91, "learning_rate": 0.00015391726607118345, "loss": 0.2838, "step": 5220 }, { "epoch": 1.92, "learning_rate": 0.00015375557938788657, "loss": 0.2962, "step": 5230 }, { "epoch": 1.92, "learning_rate": 0.0001535936948192653, "loss": 0.2884, "step": 5240 }, { "epoch": 1.92, "learning_rate": 0.00015343161296124994, "loss": 0.3026, "step": 5250 }, { "epoch": 1.93, "learning_rate": 0.00015326933441049714, "loss": 0.3234, "step": 5260 }, { "epoch": 1.93, "learning_rate": 0.00015310685976438753, "loss": 0.2547, "step": 5270 }, { "epoch": 1.93, "learning_rate": 0.00015294418962102363, "loss": 0.2896, "step": 5280 }, { "epoch": 1.94, "learning_rate": 0.00015278132457922764, "loss": 0.3173, "step": 5290 }, { "epoch": 1.94, "learning_rate": 0.00015261826523853926, "loss": 0.2848, "step": 5300 }, { "epoch": 1.94, "eval_loss": 0.4000326693058014, "eval_runtime": 224.8241, "eval_samples_per_second": 21.274, "eval_steps_per_second": 5.32, "step": 5300 }, { "epoch": 1.95, "learning_rate": 0.00015245501219921336, "loss": 0.2819, "step": 5310 }, { "epoch": 1.95, "learning_rate": 0.00015229156606221792, "loss": 0.2578, "step": 5320 }, { "epoch": 1.95, "learning_rate": 0.0001521279274292317, "loss": 0.2893, "step": 5330 }, { "epoch": 1.96, "learning_rate": 0.00015196409690264212, "loss": 0.2897, "step": 5340 }, { "epoch": 1.96, "learning_rate": 0.000151800075085543, "loss": 0.2718, "step": 5350 }, { "epoch": 1.96, "learning_rate": 0.0001516358625817323, "loss": 0.2908, "step": 5360 }, { "epoch": 1.97, "learning_rate": 0.00015147145999570998, "loss": 0.2694, "step": 5370 }, { "epoch": 1.97, "learning_rate": 0.0001513068679326757, "loss": 0.3028, "step": 5380 }, { "epoch": 1.97, "learning_rate": 0.00015114208699852663, "loss": 0.2707, "step": 5390 }, { "epoch": 1.98, "step": 5391, "total_flos": 1.7572375333880463e+18, "train_loss": 4.6051067095080594e-05, "train_runtime": 5.408, "train_samples_per_second": 15680.552, "train_steps_per_second": 980.035 } ], "logging_steps": 10, "max_steps": 5300, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "total_flos": 1.7572375333880463e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }