| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9954430379746837, |
| "eval_steps": 500, |
| "global_step": 1479, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.020253164556962026, |
| "grad_norm": 8.556459004852009, |
| "learning_rate": 3.378378378378379e-07, |
| "loss": 0.8354, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04050632911392405, |
| "grad_norm": 3.681642301527027, |
| "learning_rate": 6.756756756756758e-07, |
| "loss": 0.7599, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.060759493670886074, |
| "grad_norm": 1.726936094306069, |
| "learning_rate": 1.0135135135135136e-06, |
| "loss": 0.6959, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0810126582278481, |
| "grad_norm": 2.37353153377641, |
| "learning_rate": 1.3513513513513515e-06, |
| "loss": 0.6575, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10126582278481013, |
| "grad_norm": 2.0089356033589145, |
| "learning_rate": 1.6891891891891894e-06, |
| "loss": 0.6351, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12151898734177215, |
| "grad_norm": 1.391624997798933, |
| "learning_rate": 2.0270270270270273e-06, |
| "loss": 0.6213, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.14177215189873418, |
| "grad_norm": 1.6331791503446509, |
| "learning_rate": 2.364864864864865e-06, |
| "loss": 0.6121, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.1620253164556962, |
| "grad_norm": 2.381527517296881, |
| "learning_rate": 2.702702702702703e-06, |
| "loss": 0.6048, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.18227848101265823, |
| "grad_norm": 2.1493391697359403, |
| "learning_rate": 3.040540540540541e-06, |
| "loss": 0.6016, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.20253164556962025, |
| "grad_norm": 2.907906626199151, |
| "learning_rate": 3.3783783783783788e-06, |
| "loss": 0.6034, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.22278481012658227, |
| "grad_norm": 2.742004973375118, |
| "learning_rate": 3.7162162162162162e-06, |
| "loss": 0.5956, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.2430379746835443, |
| "grad_norm": 2.7366062341726476, |
| "learning_rate": 4.0540540540540545e-06, |
| "loss": 0.592, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.26329113924050634, |
| "grad_norm": 2.61940583080256, |
| "learning_rate": 4.391891891891892e-06, |
| "loss": 0.5894, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.28354430379746837, |
| "grad_norm": 2.2813205470058837, |
| "learning_rate": 4.72972972972973e-06, |
| "loss": 0.5782, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.3037974683544304, |
| "grad_norm": 2.5170304525110594, |
| "learning_rate": 4.992486851990984e-06, |
| "loss": 0.5882, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3240506329113924, |
| "grad_norm": 1.9290797099657084, |
| "learning_rate": 4.954921111945906e-06, |
| "loss": 0.5859, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.34430379746835443, |
| "grad_norm": 1.4336818702016718, |
| "learning_rate": 4.917355371900827e-06, |
| "loss": 0.5874, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.36455696202531646, |
| "grad_norm": 1.8411955872042784, |
| "learning_rate": 4.879789631855748e-06, |
| "loss": 0.583, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3848101265822785, |
| "grad_norm": 1.6410006944167816, |
| "learning_rate": 4.842223891810669e-06, |
| "loss": 0.5823, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4050632911392405, |
| "grad_norm": 1.8701658637773764, |
| "learning_rate": 4.80465815176559e-06, |
| "loss": 0.5779, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4253164556962025, |
| "grad_norm": 1.8645518662157583, |
| "learning_rate": 4.767092411720512e-06, |
| "loss": 0.5731, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.44556962025316454, |
| "grad_norm": 1.4882959577748913, |
| "learning_rate": 4.729526671675433e-06, |
| "loss": 0.5769, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.46582278481012657, |
| "grad_norm": 1.3995352940191192, |
| "learning_rate": 4.6919609316303534e-06, |
| "loss": 0.5754, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.4860759493670886, |
| "grad_norm": 1.6983010693170986, |
| "learning_rate": 4.654395191585275e-06, |
| "loss": 0.5708, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5063291139240507, |
| "grad_norm": 2.061097436379402, |
| "learning_rate": 4.616829451540196e-06, |
| "loss": 0.5743, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5265822784810127, |
| "grad_norm": 2.0033851006854624, |
| "learning_rate": 4.579263711495117e-06, |
| "loss": 0.5597, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5468354430379747, |
| "grad_norm": 1.4371666580128166, |
| "learning_rate": 4.541697971450038e-06, |
| "loss": 0.568, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5670886075949367, |
| "grad_norm": 1.5842138481650203, |
| "learning_rate": 4.504132231404959e-06, |
| "loss": 0.5636, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5873417721518988, |
| "grad_norm": 1.4152120878177024, |
| "learning_rate": 4.46656649135988e-06, |
| "loss": 0.5642, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6075949367088608, |
| "grad_norm": 1.5290735652174632, |
| "learning_rate": 4.429000751314802e-06, |
| "loss": 0.5635, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6278481012658228, |
| "grad_norm": 1.3811776146486536, |
| "learning_rate": 4.3914350112697225e-06, |
| "loss": 0.5685, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6481012658227848, |
| "grad_norm": 1.2489817397894598, |
| "learning_rate": 4.353869271224643e-06, |
| "loss": 0.5645, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6683544303797468, |
| "grad_norm": 1.38609447055016, |
| "learning_rate": 4.316303531179565e-06, |
| "loss": 0.5643, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.6886075949367089, |
| "grad_norm": 1.5318268452835269, |
| "learning_rate": 4.278737791134486e-06, |
| "loss": 0.5636, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7088607594936709, |
| "grad_norm": 1.4550855449189222, |
| "learning_rate": 4.2411720510894065e-06, |
| "loss": 0.5646, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7291139240506329, |
| "grad_norm": 1.4357986151802806, |
| "learning_rate": 4.203606311044328e-06, |
| "loss": 0.5659, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7493670886075949, |
| "grad_norm": 1.4758039776130085, |
| "learning_rate": 4.166040570999249e-06, |
| "loss": 0.5581, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.769620253164557, |
| "grad_norm": 1.4285167058069708, |
| "learning_rate": 4.12847483095417e-06, |
| "loss": 0.5548, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.789873417721519, |
| "grad_norm": 1.7405622458735588, |
| "learning_rate": 4.0909090909090915e-06, |
| "loss": 0.5581, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.810126582278481, |
| "grad_norm": 1.7189240150684444, |
| "learning_rate": 4.053343350864012e-06, |
| "loss": 0.558, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.830379746835443, |
| "grad_norm": 1.6501569274132042, |
| "learning_rate": 4.015777610818933e-06, |
| "loss": 0.5519, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.850632911392405, |
| "grad_norm": 1.6731630114484979, |
| "learning_rate": 3.978211870773855e-06, |
| "loss": 0.5587, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.8708860759493671, |
| "grad_norm": 2.1187915177073178, |
| "learning_rate": 3.9406461307287756e-06, |
| "loss": 0.5513, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.8911392405063291, |
| "grad_norm": 1.5433420890497953, |
| "learning_rate": 3.903080390683696e-06, |
| "loss": 0.5564, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9113924050632911, |
| "grad_norm": 1.8747825342264348, |
| "learning_rate": 3.865514650638618e-06, |
| "loss": 0.5471, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9316455696202531, |
| "grad_norm": 1.59749808793934, |
| "learning_rate": 3.827948910593539e-06, |
| "loss": 0.5517, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9518987341772152, |
| "grad_norm": 1.703614465575271, |
| "learning_rate": 3.79038317054846e-06, |
| "loss": 0.5527, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.9721518987341772, |
| "grad_norm": 1.6904288437756008, |
| "learning_rate": 3.752817430503381e-06, |
| "loss": 0.5499, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.9924050632911392, |
| "grad_norm": 1.2436069436662953, |
| "learning_rate": 3.7152516904583025e-06, |
| "loss": 0.5465, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.9984810126582279, |
| "eval_loss": 0.0686691552400589, |
| "eval_runtime": 504.1479, |
| "eval_samples_per_second": 26.389, |
| "eval_steps_per_second": 0.413, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.0126582278481013, |
| "grad_norm": 1.8140129271933714, |
| "learning_rate": 3.6776859504132234e-06, |
| "loss": 0.5154, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.0329113924050632, |
| "grad_norm": 1.2711128895195876, |
| "learning_rate": 3.640120210368144e-06, |
| "loss": 0.4864, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.0531645569620254, |
| "grad_norm": 1.594060189939638, |
| "learning_rate": 3.602554470323066e-06, |
| "loss": 0.4845, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.0734177215189873, |
| "grad_norm": 1.5462198272155392, |
| "learning_rate": 3.5649887302779866e-06, |
| "loss": 0.4869, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.0936708860759494, |
| "grad_norm": 1.3581074119881988, |
| "learning_rate": 3.527422990232908e-06, |
| "loss": 0.4847, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1139240506329113, |
| "grad_norm": 1.413570881088854, |
| "learning_rate": 3.489857250187829e-06, |
| "loss": 0.4866, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.1341772151898735, |
| "grad_norm": 1.2703043443689073, |
| "learning_rate": 3.45229151014275e-06, |
| "loss": 0.484, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.1544303797468354, |
| "grad_norm": 1.2116910881911624, |
| "learning_rate": 3.414725770097671e-06, |
| "loss": 0.49, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.1746835443037975, |
| "grad_norm": 1.22591839207132, |
| "learning_rate": 3.3771600300525924e-06, |
| "loss": 0.4883, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.1949367088607594, |
| "grad_norm": 1.1731315032521277, |
| "learning_rate": 3.3395942900075136e-06, |
| "loss": 0.4823, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.2151898734177216, |
| "grad_norm": 1.2756842843152258, |
| "learning_rate": 3.3020285499624344e-06, |
| "loss": 0.4894, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.2354430379746835, |
| "grad_norm": 1.3941039610425434, |
| "learning_rate": 3.264462809917356e-06, |
| "loss": 0.485, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.2556962025316456, |
| "grad_norm": 1.2927381511779246, |
| "learning_rate": 3.226897069872277e-06, |
| "loss": 0.4891, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.2759493670886077, |
| "grad_norm": 1.239246362421845, |
| "learning_rate": 3.1893313298271977e-06, |
| "loss": 0.4884, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.2962025316455696, |
| "grad_norm": 1.4428586131832628, |
| "learning_rate": 3.1517655897821194e-06, |
| "loss": 0.4875, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.3164556962025316, |
| "grad_norm": 1.1942547572789066, |
| "learning_rate": 3.11419984973704e-06, |
| "loss": 0.4911, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.3367088607594937, |
| "grad_norm": 1.2924811153254836, |
| "learning_rate": 3.076634109691961e-06, |
| "loss": 0.4794, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.3569620253164558, |
| "grad_norm": 1.345772029679538, |
| "learning_rate": 3.0390683696468826e-06, |
| "loss": 0.4929, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.3772151898734177, |
| "grad_norm": 1.1190552420786224, |
| "learning_rate": 3.0015026296018034e-06, |
| "loss": 0.4897, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.3974683544303796, |
| "grad_norm": 1.196893927879723, |
| "learning_rate": 2.9639368895567243e-06, |
| "loss": 0.4891, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.4177215189873418, |
| "grad_norm": 1.2638908138618863, |
| "learning_rate": 2.9263711495116455e-06, |
| "loss": 0.4847, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.437974683544304, |
| "grad_norm": 1.1163514888014268, |
| "learning_rate": 2.8888054094665667e-06, |
| "loss": 0.4906, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.4582278481012658, |
| "grad_norm": 1.155377067438078, |
| "learning_rate": 2.851239669421488e-06, |
| "loss": 0.4898, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.4784810126582277, |
| "grad_norm": 1.2738771806551292, |
| "learning_rate": 2.8136739293764088e-06, |
| "loss": 0.4922, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.4987341772151899, |
| "grad_norm": 1.2452118202307745, |
| "learning_rate": 2.77610818933133e-06, |
| "loss": 0.4845, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.518987341772152, |
| "grad_norm": 1.219390205919516, |
| "learning_rate": 2.7385424492862512e-06, |
| "loss": 0.4872, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.539240506329114, |
| "grad_norm": 1.3476108111898892, |
| "learning_rate": 2.700976709241172e-06, |
| "loss": 0.4874, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.5594936708860758, |
| "grad_norm": 1.1947882615361192, |
| "learning_rate": 2.6634109691960937e-06, |
| "loss": 0.4854, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.579746835443038, |
| "grad_norm": 1.2664227895565825, |
| "learning_rate": 2.6258452291510145e-06, |
| "loss": 0.4863, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.0918882362910753, |
| "learning_rate": 2.5882794891059353e-06, |
| "loss": 0.4884, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.620253164556962, |
| "grad_norm": 1.1799041159893287, |
| "learning_rate": 2.550713749060857e-06, |
| "loss": 0.4843, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.640506329113924, |
| "grad_norm": 1.2814036637876116, |
| "learning_rate": 2.513148009015778e-06, |
| "loss": 0.4774, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.660759493670886, |
| "grad_norm": 1.215664740050223, |
| "learning_rate": 2.475582268970699e-06, |
| "loss": 0.4856, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.6810126582278482, |
| "grad_norm": 1.1813078713632477, |
| "learning_rate": 2.43801652892562e-06, |
| "loss": 0.4808, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.70126582278481, |
| "grad_norm": 1.1757543183194046, |
| "learning_rate": 2.400450788880541e-06, |
| "loss": 0.4813, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.721518987341772, |
| "grad_norm": 1.1411860521089428, |
| "learning_rate": 2.3628850488354623e-06, |
| "loss": 0.4834, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.7417721518987341, |
| "grad_norm": 1.0342492999295814, |
| "learning_rate": 2.325319308790383e-06, |
| "loss": 0.4832, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.7620253164556963, |
| "grad_norm": 1.1434911938672212, |
| "learning_rate": 2.2877535687453044e-06, |
| "loss": 0.4866, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.7822784810126582, |
| "grad_norm": 1.246279092334059, |
| "learning_rate": 2.2501878287002256e-06, |
| "loss": 0.4871, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.80253164556962, |
| "grad_norm": 1.1509212405407068, |
| "learning_rate": 2.212622088655147e-06, |
| "loss": 0.4845, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.8227848101265822, |
| "grad_norm": 1.2639874530877144, |
| "learning_rate": 2.175056348610068e-06, |
| "loss": 0.483, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.8430379746835444, |
| "grad_norm": 1.2029946729772611, |
| "learning_rate": 2.137490608564989e-06, |
| "loss": 0.4852, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.8632911392405065, |
| "grad_norm": 1.1169305596935761, |
| "learning_rate": 2.09992486851991e-06, |
| "loss": 0.4831, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.8835443037974684, |
| "grad_norm": 1.09196132946494, |
| "learning_rate": 2.0623591284748313e-06, |
| "loss": 0.4794, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.9037974683544303, |
| "grad_norm": 1.089272978011173, |
| "learning_rate": 2.024793388429752e-06, |
| "loss": 0.483, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.9240506329113924, |
| "grad_norm": 1.0410920443753975, |
| "learning_rate": 1.9872276483846734e-06, |
| "loss": 0.4846, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.9443037974683546, |
| "grad_norm": 1.1956033603187548, |
| "learning_rate": 1.9496619083395946e-06, |
| "loss": 0.4879, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.9645569620253165, |
| "grad_norm": 1.0920561603907453, |
| "learning_rate": 1.9120961682945154e-06, |
| "loss": 0.4866, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.9848101265822784, |
| "grad_norm": 1.0451509677263875, |
| "learning_rate": 1.8745304282494367e-06, |
| "loss": 0.4856, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.998987341772152, |
| "eval_loss": 0.06770560145378113, |
| "eval_runtime": 506.4671, |
| "eval_samples_per_second": 26.268, |
| "eval_steps_per_second": 0.411, |
| "step": 987 |
| }, |
| { |
| "epoch": 2.0050632911392405, |
| "grad_norm": 1.8162363718859433, |
| "learning_rate": 1.8369646882043577e-06, |
| "loss": 0.4629, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.0253164556962027, |
| "grad_norm": 1.5873140436853967, |
| "learning_rate": 1.799398948159279e-06, |
| "loss": 0.4148, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.0455696202531644, |
| "grad_norm": 1.3250480015078225, |
| "learning_rate": 1.7618332081142001e-06, |
| "loss": 0.4141, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.0658227848101265, |
| "grad_norm": 1.2559122162053822, |
| "learning_rate": 1.724267468069121e-06, |
| "loss": 0.4097, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.0860759493670886, |
| "grad_norm": 1.11730327433827, |
| "learning_rate": 1.6867017280240422e-06, |
| "loss": 0.4098, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.1063291139240508, |
| "grad_norm": 1.1488792674788184, |
| "learning_rate": 1.6491359879789634e-06, |
| "loss": 0.4063, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.1265822784810124, |
| "grad_norm": 1.2194570653560266, |
| "learning_rate": 1.6115702479338842e-06, |
| "loss": 0.4148, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.1468354430379746, |
| "grad_norm": 1.2246780862208726, |
| "learning_rate": 1.5740045078888055e-06, |
| "loss": 0.4095, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.1670886075949367, |
| "grad_norm": 1.1221422744938137, |
| "learning_rate": 1.5364387678437267e-06, |
| "loss": 0.4124, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.187341772151899, |
| "grad_norm": 1.1845225517845253, |
| "learning_rate": 1.4988730277986477e-06, |
| "loss": 0.4106, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.207594936708861, |
| "grad_norm": 1.151643245126613, |
| "learning_rate": 1.461307287753569e-06, |
| "loss": 0.4079, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.2278481012658227, |
| "grad_norm": 1.1727300795743778, |
| "learning_rate": 1.4237415477084902e-06, |
| "loss": 0.4166, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.248101265822785, |
| "grad_norm": 1.140792166225886, |
| "learning_rate": 1.386175807663411e-06, |
| "loss": 0.4123, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.268354430379747, |
| "grad_norm": 1.1592975056739614, |
| "learning_rate": 1.3486100676183322e-06, |
| "loss": 0.413, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.2886075949367086, |
| "grad_norm": 1.144847715133614, |
| "learning_rate": 1.3110443275732533e-06, |
| "loss": 0.4142, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.3088607594936708, |
| "grad_norm": 1.094302060235976, |
| "learning_rate": 1.2734785875281743e-06, |
| "loss": 0.409, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.329113924050633, |
| "grad_norm": 1.8324152294122444, |
| "learning_rate": 1.2359128474830955e-06, |
| "loss": 0.4157, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.349367088607595, |
| "grad_norm": 1.1755976482888877, |
| "learning_rate": 1.1983471074380167e-06, |
| "loss": 0.4155, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.369620253164557, |
| "grad_norm": 1.1610490026245264, |
| "learning_rate": 1.1607813673929378e-06, |
| "loss": 0.4143, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.389873417721519, |
| "grad_norm": 1.1593249771779028, |
| "learning_rate": 1.1232156273478588e-06, |
| "loss": 0.4143, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.410126582278481, |
| "grad_norm": 1.2447447706790171, |
| "learning_rate": 1.08564988730278e-06, |
| "loss": 0.4134, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.430379746835443, |
| "grad_norm": 1.117095640965601, |
| "learning_rate": 1.048084147257701e-06, |
| "loss": 0.4113, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.4506329113924052, |
| "grad_norm": 1.1370447334516511, |
| "learning_rate": 1.010518407212622e-06, |
| "loss": 0.4107, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.470886075949367, |
| "grad_norm": 1.181102859042412, |
| "learning_rate": 9.729526671675433e-07, |
| "loss": 0.4137, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.491139240506329, |
| "grad_norm": 1.2140589114207827, |
| "learning_rate": 9.353869271224644e-07, |
| "loss": 0.4123, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.511392405063291, |
| "grad_norm": 1.1778505184014514, |
| "learning_rate": 8.978211870773855e-07, |
| "loss": 0.4162, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.5316455696202533, |
| "grad_norm": 1.1699977693871288, |
| "learning_rate": 8.602554470323066e-07, |
| "loss": 0.4094, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.5518987341772155, |
| "grad_norm": 1.124119808595154, |
| "learning_rate": 8.226897069872278e-07, |
| "loss": 0.4041, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.572151898734177, |
| "grad_norm": 1.131590151917583, |
| "learning_rate": 7.851239669421488e-07, |
| "loss": 0.411, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.5924050632911393, |
| "grad_norm": 1.1610048489913083, |
| "learning_rate": 7.4755822689707e-07, |
| "loss": 0.4127, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.6126582278481014, |
| "grad_norm": 1.175325258081783, |
| "learning_rate": 7.09992486851991e-07, |
| "loss": 0.4181, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.632911392405063, |
| "grad_norm": 1.1266790071767538, |
| "learning_rate": 6.724267468069122e-07, |
| "loss": 0.4114, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.6531645569620252, |
| "grad_norm": 1.1274406714120246, |
| "learning_rate": 6.348610067618332e-07, |
| "loss": 0.4104, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.6734177215189874, |
| "grad_norm": 1.0855642192635904, |
| "learning_rate": 5.972952667167544e-07, |
| "loss": 0.4119, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.6936708860759495, |
| "grad_norm": 1.0950717526822358, |
| "learning_rate": 5.597295266716755e-07, |
| "loss": 0.4145, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.7139240506329116, |
| "grad_norm": 1.1434523085263464, |
| "learning_rate": 5.221637866265966e-07, |
| "loss": 0.4124, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.7341772151898733, |
| "grad_norm": 1.105475212905674, |
| "learning_rate": 4.845980465815176e-07, |
| "loss": 0.4104, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.7544303797468355, |
| "grad_norm": 1.0837702546358798, |
| "learning_rate": 4.4703230653643883e-07, |
| "loss": 0.4063, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.7746835443037976, |
| "grad_norm": 1.1427611200757817, |
| "learning_rate": 4.094665664913599e-07, |
| "loss": 0.4114, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.7949367088607593, |
| "grad_norm": 1.0776017124660127, |
| "learning_rate": 3.7190082644628103e-07, |
| "loss": 0.4092, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.8151898734177214, |
| "grad_norm": 1.335687040637302, |
| "learning_rate": 3.343350864012021e-07, |
| "loss": 0.4121, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.8354430379746836, |
| "grad_norm": 1.067862661628945, |
| "learning_rate": 2.9676934635612324e-07, |
| "loss": 0.4045, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.8556962025316457, |
| "grad_norm": 1.083663558722376, |
| "learning_rate": 2.5920360631104436e-07, |
| "loss": 0.4132, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.875949367088608, |
| "grad_norm": 1.0900237221256193, |
| "learning_rate": 2.2163786626596544e-07, |
| "loss": 0.4077, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.8962025316455695, |
| "grad_norm": 1.1077884473894029, |
| "learning_rate": 1.8407212622088657e-07, |
| "loss": 0.4089, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.9164556962025316, |
| "grad_norm": 1.0569922478203235, |
| "learning_rate": 1.4650638617580767e-07, |
| "loss": 0.4108, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.9367088607594938, |
| "grad_norm": 1.0634188950315056, |
| "learning_rate": 1.0894064613072878e-07, |
| "loss": 0.4193, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.9569620253164555, |
| "grad_norm": 1.0687631065939336, |
| "learning_rate": 7.13749060856499e-08, |
| "loss": 0.4063, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.9772151898734176, |
| "grad_norm": 1.0945464445026132, |
| "learning_rate": 3.3809166040571e-08, |
| "loss": 0.4105, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.9954430379746837, |
| "eval_loss": 0.0698537528514862, |
| "eval_runtime": 508.665, |
| "eval_samples_per_second": 26.155, |
| "eval_steps_per_second": 0.409, |
| "step": 1479 |
| }, |
| { |
| "epoch": 2.9954430379746837, |
| "step": 1479, |
| "total_flos": 2477170706350080.0, |
| "train_loss": 0.494840882935179, |
| "train_runtime": 84658.1688, |
| "train_samples_per_second": 8.957, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1479, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2477170706350080.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |