{ "best_metric": 0.04752533510327339, "best_model_checkpoint": "results/checkpoint-35000", "epoch": 10.0, "eval_steps": 500, "global_step": 36070, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02772387025228722, "grad_norm": 1.750556230545044, "learning_rate": 9.999814117181637e-06, "loss": 6.049, "step": 100 }, { "epoch": 0.05544774050457444, "grad_norm": 0.824866533279419, "learning_rate": 9.999248953493363e-06, "loss": 3.0817, "step": 200 }, { "epoch": 0.08317161075686166, "grad_norm": 0.4907461702823639, "learning_rate": 9.998304532844263e-06, "loss": 2.3969, "step": 300 }, { "epoch": 0.11089548100914888, "grad_norm": 0.4534800946712494, "learning_rate": 9.996980926880713e-06, "loss": 2.0935, "step": 400 }, { "epoch": 0.1386193512614361, "grad_norm": 0.47491493821144104, "learning_rate": 9.995278236015153e-06, "loss": 1.9245, "step": 500 }, { "epoch": 0.1386193512614361, "eval_valid_loss": 1.7945984601974487, "eval_valid_runtime": 6.4498, "eval_valid_samples_per_second": 214.426, "eval_valid_steps_per_second": 6.822, "step": 500 }, { "epoch": 0.1386193512614361, "eval_valid_target_loss": 1.875697374343872, "eval_valid_target_runtime": 6.5527, "eval_valid_target_samples_per_second": 218.841, "eval_valid_target_steps_per_second": 6.867, "step": 500 }, { "epoch": 0.16634322151372333, "grad_norm": 0.5983259677886963, "learning_rate": 9.99319658941846e-06, "loss": 1.8294, "step": 600 }, { "epoch": 0.19406709176601053, "grad_norm": 0.6906803846359253, "learning_rate": 9.990736145010146e-06, "loss": 1.7625, "step": 700 }, { "epoch": 0.22179096201829776, "grad_norm": 1.4024661779403687, "learning_rate": 9.987897089446381e-06, "loss": 1.709, "step": 800 }, { "epoch": 0.24951483227058496, "grad_norm": 1.073205590248108, "learning_rate": 9.984679638105837e-06, "loss": 1.6595, "step": 900 }, { "epoch": 0.2772387025228722, "grad_norm": 1.280462384223938, "learning_rate": 9.981084035073337e-06, "loss": 1.6153, "step": 1000 }, { "epoch": 0.2772387025228722, "eval_valid_loss": 1.5186923742294312, "eval_valid_runtime": 6.4198, "eval_valid_samples_per_second": 215.427, "eval_valid_steps_per_second": 6.854, "step": 1000 }, { "epoch": 0.2772387025228722, "eval_valid_target_loss": 1.5994268655776978, "eval_valid_target_runtime": 6.5778, "eval_valid_target_samples_per_second": 218.006, "eval_valid_target_steps_per_second": 6.841, "step": 1000 }, { "epoch": 0.3049625727751594, "grad_norm": 0.9407665133476257, "learning_rate": 9.977110553121353e-06, "loss": 1.567, "step": 1100 }, { "epoch": 0.33268644302744665, "grad_norm": 1.5439337491989136, "learning_rate": 9.972759493689301e-06, "loss": 1.5275, "step": 1200 }, { "epoch": 0.36041031327973383, "grad_norm": 2.2176036834716797, "learning_rate": 9.968031186860677e-06, "loss": 1.4833, "step": 1300 }, { "epoch": 0.38813418353202106, "grad_norm": 1.6237233877182007, "learning_rate": 9.962925991338018e-06, "loss": 1.4457, "step": 1400 }, { "epoch": 0.4158580537843083, "grad_norm": 1.3075989484786987, "learning_rate": 9.957444294415685e-06, "loss": 1.407, "step": 1500 }, { "epoch": 0.4158580537843083, "eval_valid_loss": 1.326136589050293, "eval_valid_runtime": 6.413, "eval_valid_samples_per_second": 215.655, "eval_valid_steps_per_second": 6.861, "step": 1500 }, { "epoch": 0.4158580537843083, "eval_valid_target_loss": 1.3982958793640137, "eval_valid_target_runtime": 6.5728, "eval_valid_target_samples_per_second": 218.172, "eval_valid_target_steps_per_second": 6.846, "step": 1500 }, { "epoch": 0.4435819240365955, "grad_norm": 1.379807472229004, "learning_rate": 9.951586511950491e-06, "loss": 1.3768, "step": 1600 }, { "epoch": 0.47130579428888275, "grad_norm": 0.737086832523346, "learning_rate": 9.945353088330137e-06, "loss": 1.347, "step": 1700 }, { "epoch": 0.4990296645411699, "grad_norm": 0.6332296133041382, "learning_rate": 9.93874449643952e-06, "loss": 1.3188, "step": 1800 }, { "epoch": 0.5267535347934572, "grad_norm": 0.6948099732398987, "learning_rate": 9.931761237624833e-06, "loss": 1.2903, "step": 1900 }, { "epoch": 0.5544774050457444, "grad_norm": 0.9397527575492859, "learning_rate": 9.924403841655565e-06, "loss": 1.2671, "step": 2000 }, { "epoch": 0.5544774050457444, "eval_valid_loss": 1.2014020681381226, "eval_valid_runtime": 6.4367, "eval_valid_samples_per_second": 214.861, "eval_valid_steps_per_second": 6.836, "step": 2000 }, { "epoch": 0.5544774050457444, "eval_valid_target_loss": 1.2820453643798828, "eval_valid_target_runtime": 6.5614, "eval_valid_target_samples_per_second": 218.55, "eval_valid_target_steps_per_second": 6.858, "step": 2000 }, { "epoch": 0.5822012752980316, "grad_norm": 0.5302172303199768, "learning_rate": 9.916672866684275e-06, "loss": 1.2439, "step": 2100 }, { "epoch": 0.6099251455503188, "grad_norm": 0.5439279675483704, "learning_rate": 9.908568899204281e-06, "loss": 1.2231, "step": 2200 }, { "epoch": 0.637649015802606, "grad_norm": 0.7026234865188599, "learning_rate": 9.90009255400514e-06, "loss": 1.2027, "step": 2300 }, { "epoch": 0.6653728860548933, "grad_norm": 0.642803430557251, "learning_rate": 9.89124447412603e-06, "loss": 1.1864, "step": 2400 }, { "epoch": 0.6930967563071805, "grad_norm": 1.3601601123809814, "learning_rate": 9.882025330806952e-06, "loss": 1.1654, "step": 2500 }, { "epoch": 0.6930967563071805, "eval_valid_loss": 1.1063387393951416, "eval_valid_runtime": 6.4314, "eval_valid_samples_per_second": 215.037, "eval_valid_steps_per_second": 6.841, "step": 2500 }, { "epoch": 0.6930967563071805, "eval_valid_target_loss": 1.208246111869812, "eval_valid_target_runtime": 6.5564, "eval_valid_target_samples_per_second": 218.719, "eval_valid_target_steps_per_second": 6.864, "step": 2500 }, { "epoch": 0.7208206265594677, "grad_norm": 0.7053922414779663, "learning_rate": 9.872435823437816e-06, "loss": 1.1433, "step": 2600 }, { "epoch": 0.748544496811755, "grad_norm": 0.6601741909980774, "learning_rate": 9.862476679505384e-06, "loss": 1.1193, "step": 2700 }, { "epoch": 0.7762683670640421, "grad_norm": 0.7706498503684998, "learning_rate": 9.852148654538072e-06, "loss": 1.0954, "step": 2800 }, { "epoch": 0.8039922373163294, "grad_norm": 0.8355486392974854, "learning_rate": 9.841452532048648e-06, "loss": 1.069, "step": 2900 }, { "epoch": 0.8317161075686166, "grad_norm": 0.8369494676589966, "learning_rate": 9.830389123474773e-06, "loss": 1.0384, "step": 3000 }, { "epoch": 0.8317161075686166, "eval_valid_loss": 0.9615023732185364, "eval_valid_runtime": 6.4156, "eval_valid_samples_per_second": 215.57, "eval_valid_steps_per_second": 6.858, "step": 3000 }, { "epoch": 0.8317161075686166, "eval_valid_target_loss": 1.0947415828704834, "eval_valid_target_runtime": 6.5753, "eval_valid_target_samples_per_second": 218.088, "eval_valid_target_steps_per_second": 6.844, "step": 3000 }, { "epoch": 0.8594399778209038, "grad_norm": 1.4864110946655273, "learning_rate": 9.818959268117464e-06, "loss": 1.0103, "step": 3100 }, { "epoch": 0.887163848073191, "grad_norm": 0.7728907465934753, "learning_rate": 9.807163833077407e-06, "loss": 0.982, "step": 3200 }, { "epoch": 0.9148877183254782, "grad_norm": 0.6881595253944397, "learning_rate": 9.795003713189187e-06, "loss": 0.9492, "step": 3300 }, { "epoch": 0.9426115885777655, "grad_norm": 1.0222816467285156, "learning_rate": 9.782479830953388e-06, "loss": 0.9142, "step": 3400 }, { "epoch": 0.9703354588300527, "grad_norm": 0.6671555042266846, "learning_rate": 9.769593136466633e-06, "loss": 0.8838, "step": 3500 }, { "epoch": 0.9703354588300527, "eval_valid_loss": 0.8037808537483215, "eval_valid_runtime": 6.4314, "eval_valid_samples_per_second": 215.038, "eval_valid_steps_per_second": 6.841, "step": 3500 }, { "epoch": 0.9703354588300527, "eval_valid_target_loss": 0.9639121294021606, "eval_valid_target_runtime": 6.6053, "eval_valid_target_samples_per_second": 217.1, "eval_valid_target_steps_per_second": 6.813, "step": 3500 }, { "epoch": 0.9980593290823399, "grad_norm": 0.7793981432914734, "learning_rate": 9.756344607349483e-06, "loss": 0.8496, "step": 3600 }, { "epoch": 1.0257831993346271, "grad_norm": 0.7545821070671082, "learning_rate": 9.74273524867229e-06, "loss": 0.8117, "step": 3700 }, { "epoch": 1.0535070695869144, "grad_norm": 0.631118893623352, "learning_rate": 9.728766092878934e-06, "loss": 0.7749, "step": 3800 }, { "epoch": 1.0812309398392015, "grad_norm": 0.7934292554855347, "learning_rate": 9.714438199708516e-06, "loss": 0.7321, "step": 3900 }, { "epoch": 1.1089548100914888, "grad_norm": 0.6160613298416138, "learning_rate": 9.699752656114947e-06, "loss": 0.6891, "step": 4000 }, { "epoch": 1.1089548100914888, "eval_valid_loss": 0.5853330492973328, "eval_valid_runtime": 6.4069, "eval_valid_samples_per_second": 215.861, "eval_valid_steps_per_second": 6.868, "step": 4000 }, { "epoch": 1.1089548100914888, "eval_valid_target_loss": 0.7543638944625854, "eval_valid_target_runtime": 6.5591, "eval_valid_target_samples_per_second": 218.627, "eval_valid_target_steps_per_second": 6.861, "step": 4000 }, { "epoch": 1.136678680343776, "grad_norm": 0.4765689969062805, "learning_rate": 9.684710576184504e-06, "loss": 0.6383, "step": 4100 }, { "epoch": 1.1644025505960631, "grad_norm": 0.7610909938812256, "learning_rate": 9.669313101051295e-06, "loss": 0.5894, "step": 4200 }, { "epoch": 1.1921264208483504, "grad_norm": 0.5010733008384705, "learning_rate": 9.653561398810706e-06, "loss": 0.5446, "step": 4300 }, { "epoch": 1.2198502911006377, "grad_norm": 0.6305666565895081, "learning_rate": 9.637456664430776e-06, "loss": 0.5097, "step": 4400 }, { "epoch": 1.247574161352925, "grad_norm": 0.8064519762992859, "learning_rate": 9.621000119661545e-06, "loss": 0.4678, "step": 4500 }, { "epoch": 1.247574161352925, "eval_valid_loss": 0.38276800513267517, "eval_valid_runtime": 6.4349, "eval_valid_samples_per_second": 214.922, "eval_valid_steps_per_second": 6.838, "step": 4500 }, { "epoch": 1.247574161352925, "eval_valid_target_loss": 0.4976137578487396, "eval_valid_target_runtime": 6.5738, "eval_valid_target_samples_per_second": 218.139, "eval_valid_target_steps_per_second": 6.845, "step": 4500 }, { "epoch": 1.275298031605212, "grad_norm": 0.49154090881347656, "learning_rate": 9.604193012942375e-06, "loss": 0.4326, "step": 4600 }, { "epoch": 1.3030219018574993, "grad_norm": 0.5592367053031921, "learning_rate": 9.587036619307226e-06, "loss": 0.4054, "step": 4700 }, { "epoch": 1.3307457721097866, "grad_norm": 0.48195400834083557, "learning_rate": 9.569532240287946e-06, "loss": 0.3828, "step": 4800 }, { "epoch": 1.3584696423620737, "grad_norm": 0.5364578366279602, "learning_rate": 9.551681203815517e-06, "loss": 0.3595, "step": 4900 }, { "epoch": 1.386193512614361, "grad_norm": 0.5409713387489319, "learning_rate": 9.533484864119327e-06, "loss": 0.3405, "step": 5000 }, { "epoch": 1.386193512614361, "eval_valid_loss": 0.2857649326324463, "eval_valid_runtime": 6.4118, "eval_valid_samples_per_second": 215.697, "eval_valid_steps_per_second": 6.862, "step": 5000 }, { "epoch": 1.386193512614361, "eval_valid_target_loss": 0.33146464824676514, "eval_valid_target_runtime": 6.5717, "eval_valid_target_samples_per_second": 218.209, "eval_valid_target_steps_per_second": 6.848, "step": 5000 }, { "epoch": 1.4139173828666483, "grad_norm": 0.7294422388076782, "learning_rate": 9.514944601624427e-06, "loss": 0.328, "step": 5100 }, { "epoch": 1.4416412531189353, "grad_norm": 0.4695785343647003, "learning_rate": 9.49606182284681e-06, "loss": 0.3095, "step": 5200 }, { "epoch": 1.4693651233712226, "grad_norm": 0.5484552979469299, "learning_rate": 9.476837960286707e-06, "loss": 0.3016, "step": 5300 }, { "epoch": 1.49708899362351, "grad_norm": 0.38614729046821594, "learning_rate": 9.457274472319919e-06, "loss": 0.2875, "step": 5400 }, { "epoch": 1.524812863875797, "grad_norm": 0.3303731381893158, "learning_rate": 9.437372843087175e-06, "loss": 0.2821, "step": 5500 }, { "epoch": 1.524812863875797, "eval_valid_loss": 0.23669035732746124, "eval_valid_runtime": 6.4303, "eval_valid_samples_per_second": 215.074, "eval_valid_steps_per_second": 6.843, "step": 5500 }, { "epoch": 1.524812863875797, "eval_valid_target_loss": 0.2617432773113251, "eval_valid_target_runtime": 6.5556, "eval_valid_target_samples_per_second": 218.744, "eval_valid_target_steps_per_second": 6.864, "step": 5500 }, { "epoch": 1.5525367341280842, "grad_norm": 0.5144414305686951, "learning_rate": 9.417134582381548e-06, "loss": 0.2696, "step": 5600 }, { "epoch": 1.5802606043803715, "grad_norm": 0.5522892475128174, "learning_rate": 9.396561225533902e-06, "loss": 0.2617, "step": 5700 }, { "epoch": 1.6079844746326586, "grad_norm": 0.4152807295322418, "learning_rate": 9.37565433329644e-06, "loss": 0.2522, "step": 5800 }, { "epoch": 1.635708344884946, "grad_norm": 0.3866608142852783, "learning_rate": 9.35441549172428e-06, "loss": 0.2469, "step": 5900 }, { "epoch": 1.6634322151372332, "grad_norm": 0.3131564259529114, "learning_rate": 9.33284631205515e-06, "loss": 0.2425, "step": 6000 }, { "epoch": 1.6634322151372332, "eval_valid_loss": 0.20471729338169098, "eval_valid_runtime": 6.4284, "eval_valid_samples_per_second": 215.138, "eval_valid_steps_per_second": 6.845, "step": 6000 }, { "epoch": 1.6634322151372332, "eval_valid_target_loss": 0.2232024222612381, "eval_valid_target_runtime": 6.5873, "eval_valid_target_samples_per_second": 217.69, "eval_valid_target_steps_per_second": 6.831, "step": 6000 }, { "epoch": 1.6911560853895202, "grad_norm": 0.4385012090206146, "learning_rate": 9.31094843058714e-06, "loss": 0.2346, "step": 6100 }, { "epoch": 1.7188799556418077, "grad_norm": 0.3904290497303009, "learning_rate": 9.28872350855458e-06, "loss": 0.2279, "step": 6200 }, { "epoch": 1.7466038258940948, "grad_norm": 0.4294661581516266, "learning_rate": 9.266173232002005e-06, "loss": 0.2218, "step": 6300 }, { "epoch": 1.774327696146382, "grad_norm": 0.40256062150001526, "learning_rate": 9.243299311656253e-06, "loss": 0.2189, "step": 6400 }, { "epoch": 1.8020515663986694, "grad_norm": 0.39798569679260254, "learning_rate": 9.220103482796683e-06, "loss": 0.2154, "step": 6500 }, { "epoch": 1.8020515663986694, "eval_valid_loss": 0.18116505444049835, "eval_valid_runtime": 6.4306, "eval_valid_samples_per_second": 215.065, "eval_valid_steps_per_second": 6.842, "step": 6500 }, { "epoch": 1.8020515663986694, "eval_valid_target_loss": 0.19611063599586487, "eval_valid_target_runtime": 6.5521, "eval_valid_target_samples_per_second": 218.86, "eval_valid_target_steps_per_second": 6.868, "step": 6500 }, { "epoch": 1.8297754366509564, "grad_norm": 0.2555886507034302, "learning_rate": 9.196587505123526e-06, "loss": 0.2082, "step": 6600 }, { "epoch": 1.8574993069032437, "grad_norm": 0.278145968914032, "learning_rate": 9.172753162624401e-06, "loss": 0.2025, "step": 6700 }, { "epoch": 1.885223177155531, "grad_norm": 0.43592485785484314, "learning_rate": 9.148602263438967e-06, "loss": 0.2006, "step": 6800 }, { "epoch": 1.912947047407818, "grad_norm": 0.3828723430633545, "learning_rate": 9.124136639721757e-06, "loss": 0.1963, "step": 6900 }, { "epoch": 1.9406709176601054, "grad_norm": 0.3468044102191925, "learning_rate": 9.09935814750318e-06, "loss": 0.1928, "step": 7000 }, { "epoch": 1.9406709176601054, "eval_valid_loss": 0.16255635023117065, "eval_valid_runtime": 6.4262, "eval_valid_samples_per_second": 215.213, "eval_valid_steps_per_second": 6.847, "step": 7000 }, { "epoch": 1.9406709176601054, "eval_valid_target_loss": 0.17588204145431519, "eval_valid_target_runtime": 6.5759, "eval_valid_target_samples_per_second": 218.07, "eval_valid_target_steps_per_second": 6.843, "step": 7000 }, { "epoch": 1.9683947879123926, "grad_norm": 0.28793609142303467, "learning_rate": 9.074268666548728e-06, "loss": 0.1868, "step": 7100 }, { "epoch": 1.9961186581646797, "grad_norm": 0.4627343714237213, "learning_rate": 9.04887010021636e-06, "loss": 0.1857, "step": 7200 }, { "epoch": 2.023842528416967, "grad_norm": 0.4490989148616791, "learning_rate": 9.023164375312117e-06, "loss": 0.1786, "step": 7300 }, { "epoch": 2.0515663986692543, "grad_norm": 0.319859117269516, "learning_rate": 8.997153441943944e-06, "loss": 0.1779, "step": 7400 }, { "epoch": 2.0792902689215413, "grad_norm": 0.3379845917224884, "learning_rate": 8.970839273373748e-06, "loss": 0.1717, "step": 7500 }, { "epoch": 2.0792902689215413, "eval_valid_loss": 0.1455078125, "eval_valid_runtime": 6.4396, "eval_valid_samples_per_second": 214.766, "eval_valid_steps_per_second": 6.833, "step": 7500 }, { "epoch": 2.0792902689215413, "eval_valid_target_loss": 0.15758885443210602, "eval_valid_target_runtime": 6.5627, "eval_valid_target_samples_per_second": 218.508, "eval_valid_target_steps_per_second": 6.857, "step": 7500 }, { "epoch": 2.107014139173829, "grad_norm": 0.3079555928707123, "learning_rate": 8.944223865867712e-06, "loss": 0.1688, "step": 7600 }, { "epoch": 2.134738009426116, "grad_norm": 0.346603125333786, "learning_rate": 8.917309238544834e-06, "loss": 0.1661, "step": 7700 }, { "epoch": 2.162461879678403, "grad_norm": 0.3899448812007904, "learning_rate": 8.890097433223766e-06, "loss": 0.1653, "step": 7800 }, { "epoch": 2.1901857499306905, "grad_norm": 0.31352731585502625, "learning_rate": 8.862590514267915e-06, "loss": 0.1609, "step": 7900 }, { "epoch": 2.2179096201829775, "grad_norm": 0.29558128118515015, "learning_rate": 8.834790568428827e-06, "loss": 0.158, "step": 8000 }, { "epoch": 2.2179096201829775, "eval_valid_loss": 0.1319538652896881, "eval_valid_runtime": 6.417, "eval_valid_samples_per_second": 215.521, "eval_valid_steps_per_second": 6.857, "step": 8000 }, { "epoch": 2.2179096201829775, "eval_valid_target_loss": 0.1427442878484726, "eval_valid_target_runtime": 6.5854, "eval_valid_target_samples_per_second": 217.754, "eval_valid_target_steps_per_second": 6.833, "step": 8000 }, { "epoch": 2.2456334904352646, "grad_norm": 0.29061177372932434, "learning_rate": 8.80669970468788e-06, "loss": 0.1545, "step": 8100 }, { "epoch": 2.273357360687552, "grad_norm": 0.3253875970840454, "learning_rate": 8.778320054096306e-06, "loss": 0.1528, "step": 8200 }, { "epoch": 2.301081230939839, "grad_norm": 0.2402360886335373, "learning_rate": 8.749653769613502e-06, "loss": 0.1511, "step": 8300 }, { "epoch": 2.3288051011921262, "grad_norm": 0.31634458899497986, "learning_rate": 8.720703025943717e-06, "loss": 0.1461, "step": 8400 }, { "epoch": 2.3565289714444138, "grad_norm": 0.21685920655727386, "learning_rate": 8.691470019371065e-06, "loss": 0.143, "step": 8500 }, { "epoch": 2.3565289714444138, "eval_valid_loss": 0.12121625989675522, "eval_valid_runtime": 6.4171, "eval_valid_samples_per_second": 215.519, "eval_valid_steps_per_second": 6.857, "step": 8500 }, { "epoch": 2.3565289714444138, "eval_valid_target_loss": 0.1312141716480255, "eval_valid_target_runtime": 6.57, "eval_valid_target_samples_per_second": 218.266, "eval_valid_target_steps_per_second": 6.849, "step": 8500 }, { "epoch": 2.384252841696701, "grad_norm": 0.24635937809944153, "learning_rate": 8.661956967592907e-06, "loss": 0.1424, "step": 8600 }, { "epoch": 2.411976711948988, "grad_norm": 0.21958141028881073, "learning_rate": 8.632166109551623e-06, "loss": 0.1388, "step": 8700 }, { "epoch": 2.4397005822012754, "grad_norm": 0.2693657875061035, "learning_rate": 8.60209970526474e-06, "loss": 0.1392, "step": 8800 }, { "epoch": 2.4674244524535625, "grad_norm": 0.22512082755565643, "learning_rate": 8.5717600356535e-06, "loss": 0.1356, "step": 8900 }, { "epoch": 2.49514832270585, "grad_norm": 0.3446211516857147, "learning_rate": 8.541149402369806e-06, "loss": 0.1324, "step": 9000 }, { "epoch": 2.49514832270585, "eval_valid_loss": 0.11042323708534241, "eval_valid_runtime": 6.4273, "eval_valid_samples_per_second": 215.176, "eval_valid_steps_per_second": 6.846, "step": 9000 }, { "epoch": 2.49514832270585, "eval_valid_target_loss": 0.11918216943740845, "eval_valid_target_runtime": 6.5885, "eval_valid_target_samples_per_second": 217.651, "eval_valid_target_steps_per_second": 6.83, "step": 9000 }, { "epoch": 2.522872192958137, "grad_norm": 0.21913643181324005, "learning_rate": 8.51027012762163e-06, "loss": 0.1303, "step": 9100 }, { "epoch": 2.550596063210424, "grad_norm": 0.24243904650211334, "learning_rate": 8.479124553996824e-06, "loss": 0.1268, "step": 9200 }, { "epoch": 2.578319933462711, "grad_norm": 0.22184187173843384, "learning_rate": 8.447715044285425e-06, "loss": 0.1251, "step": 9300 }, { "epoch": 2.6060438037149987, "grad_norm": 0.22888724505901337, "learning_rate": 8.41604398130039e-06, "loss": 0.1221, "step": 9400 }, { "epoch": 2.6337676739672857, "grad_norm": 0.24152572453022003, "learning_rate": 8.384113767696838e-06, "loss": 0.121, "step": 9500 }, { "epoch": 2.6337676739672857, "eval_valid_loss": 0.10074004530906677, "eval_valid_runtime": 6.4317, "eval_valid_samples_per_second": 215.03, "eval_valid_steps_per_second": 6.841, "step": 9500 }, { "epoch": 2.6337676739672857, "eval_valid_target_loss": 0.10891123861074448, "eval_valid_target_runtime": 6.5593, "eval_valid_target_samples_per_second": 218.622, "eval_valid_target_steps_per_second": 6.861, "step": 9500 }, { "epoch": 2.6614915442195732, "grad_norm": 0.2756216526031494, "learning_rate": 8.35192682578978e-06, "loss": 0.1195, "step": 9600 }, { "epoch": 2.6892154144718603, "grad_norm": 0.24438254535198212, "learning_rate": 8.319485597370348e-06, "loss": 0.1157, "step": 9700 }, { "epoch": 2.7169392847241474, "grad_norm": 0.35991132259368896, "learning_rate": 8.286792543520556e-06, "loss": 0.115, "step": 9800 }, { "epoch": 2.744663154976435, "grad_norm": 0.22763152420520782, "learning_rate": 8.253850144426606e-06, "loss": 0.1134, "step": 9900 }, { "epoch": 2.772387025228722, "grad_norm": 0.24357567727565765, "learning_rate": 8.220660899190712e-06, "loss": 0.1106, "step": 10000 }, { "epoch": 2.772387025228722, "eval_valid_loss": 0.092686228454113, "eval_valid_runtime": 6.4287, "eval_valid_samples_per_second": 215.129, "eval_valid_steps_per_second": 6.844, "step": 10000 }, { "epoch": 2.772387025228722, "eval_valid_target_loss": 0.1005280539393425, "eval_valid_target_runtime": 6.5902, "eval_valid_target_samples_per_second": 217.596, "eval_valid_target_steps_per_second": 6.828, "step": 10000 }, { "epoch": 2.800110895481009, "grad_norm": 0.20446299016475677, "learning_rate": 8.187227325641534e-06, "loss": 0.109, "step": 10100 }, { "epoch": 2.8278347657332965, "grad_norm": 0.24309873580932617, "learning_rate": 8.153551960143157e-06, "loss": 0.1087, "step": 10200 }, { "epoch": 2.8555586359855836, "grad_norm": 0.21243679523468018, "learning_rate": 8.119637357402676e-06, "loss": 0.1063, "step": 10300 }, { "epoch": 2.8832825062378706, "grad_norm": 0.2227753847837448, "learning_rate": 8.085486090276391e-06, "loss": 0.1057, "step": 10400 }, { "epoch": 2.911006376490158, "grad_norm": 0.1933346837759018, "learning_rate": 8.05110074957462e-06, "loss": 0.1037, "step": 10500 }, { "epoch": 2.911006376490158, "eval_valid_loss": 0.08755628019571304, "eval_valid_runtime": 6.4374, "eval_valid_samples_per_second": 214.84, "eval_valid_steps_per_second": 6.835, "step": 10500 }, { "epoch": 2.911006376490158, "eval_valid_target_loss": 0.09479602426290512, "eval_valid_target_runtime": 6.5624, "eval_valid_target_samples_per_second": 218.517, "eval_valid_target_steps_per_second": 6.857, "step": 10500 }, { "epoch": 2.938730246742445, "grad_norm": 0.24507193267345428, "learning_rate": 8.016483943865158e-06, "loss": 0.1026, "step": 10600 }, { "epoch": 2.9664541169947327, "grad_norm": 0.16903254389762878, "learning_rate": 7.98163829927538e-06, "loss": 0.1019, "step": 10700 }, { "epoch": 2.99417798724702, "grad_norm": 0.21406187117099762, "learning_rate": 7.946566459293014e-06, "loss": 0.1016, "step": 10800 }, { "epoch": 3.021901857499307, "grad_norm": 0.17749078571796417, "learning_rate": 7.911271084565603e-06, "loss": 0.0988, "step": 10900 }, { "epoch": 3.049625727751594, "grad_norm": 0.2052767425775528, "learning_rate": 7.875754852698658e-06, "loss": 0.099, "step": 11000 }, { "epoch": 3.049625727751594, "eval_valid_loss": 0.08359777182340622, "eval_valid_runtime": 6.4134, "eval_valid_samples_per_second": 215.643, "eval_valid_steps_per_second": 6.861, "step": 11000 }, { "epoch": 3.049625727751594, "eval_valid_target_loss": 0.09044167399406433, "eval_valid_target_runtime": 6.5678, "eval_valid_target_samples_per_second": 218.336, "eval_valid_target_steps_per_second": 6.852, "step": 11000 }, { "epoch": 3.0773495980038814, "grad_norm": 0.20621031522750854, "learning_rate": 7.840020458052529e-06, "loss": 0.0961, "step": 11100 }, { "epoch": 3.1050734682561685, "grad_norm": 0.18608888983726501, "learning_rate": 7.804070611538001e-06, "loss": 0.0964, "step": 11200 }, { "epoch": 3.132797338508456, "grad_norm": 0.14550629258155823, "learning_rate": 7.767908040410642e-06, "loss": 0.0957, "step": 11300 }, { "epoch": 3.160521208760743, "grad_norm": 0.21664443612098694, "learning_rate": 7.731535488063895e-06, "loss": 0.0948, "step": 11400 }, { "epoch": 3.18824507901303, "grad_norm": 0.17702756822109222, "learning_rate": 7.694955713820974e-06, "loss": 0.0935, "step": 11500 }, { "epoch": 3.18824507901303, "eval_valid_loss": 0.07985392957925797, "eval_valid_runtime": 6.4194, "eval_valid_samples_per_second": 215.442, "eval_valid_steps_per_second": 6.854, "step": 11500 }, { "epoch": 3.18824507901303, "eval_valid_target_loss": 0.08640262484550476, "eval_valid_target_runtime": 6.5608, "eval_valid_target_samples_per_second": 218.572, "eval_valid_target_steps_per_second": 6.859, "step": 11500 }, { "epoch": 3.2159689492653176, "grad_norm": 0.19913919270038605, "learning_rate": 7.658171492725513e-06, "loss": 0.0936, "step": 11600 }, { "epoch": 3.2436928195176047, "grad_norm": 0.18789726495742798, "learning_rate": 7.621185615331061e-06, "loss": 0.0924, "step": 11700 }, { "epoch": 3.2714166897698918, "grad_norm": 0.18376338481903076, "learning_rate": 7.584000887489373e-06, "loss": 0.0911, "step": 11800 }, { "epoch": 3.2991405600221793, "grad_norm": 0.19736219942569733, "learning_rate": 7.546620130137557e-06, "loss": 0.0912, "step": 11900 }, { "epoch": 3.3268644302744663, "grad_norm": 0.19527922570705414, "learning_rate": 7.509046179084061e-06, "loss": 0.0912, "step": 12000 }, { "epoch": 3.3268644302744663, "eval_valid_loss": 0.07622889429330826, "eval_valid_runtime": 6.4437, "eval_valid_samples_per_second": 214.627, "eval_valid_steps_per_second": 6.828, "step": 12000 }, { "epoch": 3.3268644302744663, "eval_valid_target_loss": 0.0823676660656929, "eval_valid_target_runtime": 6.5589, "eval_valid_target_samples_per_second": 218.635, "eval_valid_target_steps_per_second": 6.861, "step": 12000 }, { "epoch": 3.3545883005267534, "grad_norm": 0.18916228413581848, "learning_rate": 7.471281884793544e-06, "loss": 0.0896, "step": 12100 }, { "epoch": 3.382312170779041, "grad_norm": 0.1649465262889862, "learning_rate": 7.4333301121706445e-06, "loss": 0.0881, "step": 12200 }, { "epoch": 3.410036041031328, "grad_norm": 0.18362993001937866, "learning_rate": 7.3951937403426186e-06, "loss": 0.0892, "step": 12300 }, { "epoch": 3.437759911283615, "grad_norm": 0.19268861413002014, "learning_rate": 7.356875662440939e-06, "loss": 0.0879, "step": 12400 }, { "epoch": 3.4654837815359025, "grad_norm": 0.17124581336975098, "learning_rate": 7.318378785381802e-06, "loss": 0.086, "step": 12500 }, { "epoch": 3.4654837815359025, "eval_valid_loss": 0.07317828387022018, "eval_valid_runtime": 6.4273, "eval_valid_samples_per_second": 215.177, "eval_valid_steps_per_second": 6.846, "step": 12500 }, { "epoch": 3.4654837815359025, "eval_valid_target_loss": 0.07900213450193405, "eval_valid_target_runtime": 6.5852, "eval_valid_target_samples_per_second": 217.76, "eval_valid_target_steps_per_second": 6.833, "step": 12500 }, { "epoch": 3.4932076517881896, "grad_norm": 0.23004941642284393, "learning_rate": 7.279706029645615e-06, "loss": 0.0855, "step": 12600 }, { "epoch": 3.5209315220404767, "grad_norm": 0.16131635010242462, "learning_rate": 7.240860329055422e-06, "loss": 0.0848, "step": 12700 }, { "epoch": 3.548655392292764, "grad_norm": 0.19867731630802155, "learning_rate": 7.201844630554353e-06, "loss": 0.0851, "step": 12800 }, { "epoch": 3.5763792625450512, "grad_norm": 0.17405714094638824, "learning_rate": 7.162661893982052e-06, "loss": 0.0839, "step": 12900 }, { "epoch": 3.6041031327973387, "grad_norm": 0.19404906034469604, "learning_rate": 7.123315091850136e-06, "loss": 0.0839, "step": 13000 }, { "epoch": 3.6041031327973387, "eval_valid_loss": 0.07132507115602493, "eval_valid_runtime": 6.4118, "eval_valid_samples_per_second": 215.695, "eval_valid_steps_per_second": 6.862, "step": 13000 }, { "epoch": 3.6041031327973387, "eval_valid_target_loss": 0.0771123468875885, "eval_valid_target_runtime": 6.5745, "eval_valid_target_samples_per_second": 218.117, "eval_valid_target_steps_per_second": 6.845, "step": 13000 }, { "epoch": 3.631827003049626, "grad_norm": 0.15152141451835632, "learning_rate": 7.083807209116689e-06, "loss": 0.0836, "step": 13100 }, { "epoch": 3.659550873301913, "grad_norm": 0.18368007242679596, "learning_rate": 7.044141242959826e-06, "loss": 0.0827, "step": 13200 }, { "epoch": 3.6872747435542, "grad_norm": 0.18081355094909668, "learning_rate": 7.004320202550303e-06, "loss": 0.0823, "step": 13300 }, { "epoch": 3.7149986138064874, "grad_norm": 0.15222586691379547, "learning_rate": 6.9643471088232506e-06, "loss": 0.0801, "step": 13400 }, { "epoch": 3.7427224840587745, "grad_norm": 0.1571241021156311, "learning_rate": 6.9242249942489755e-06, "loss": 0.0807, "step": 13500 }, { "epoch": 3.7427224840587745, "eval_valid_loss": 0.06911951303482056, "eval_valid_runtime": 6.4701, "eval_valid_samples_per_second": 213.752, "eval_valid_steps_per_second": 6.8, "step": 13500 }, { "epoch": 3.7427224840587745, "eval_valid_target_loss": 0.07482416182756424, "eval_valid_target_runtime": 6.5611, "eval_valid_target_samples_per_second": 218.56, "eval_valid_target_steps_per_second": 6.859, "step": 13500 }, { "epoch": 3.770446354311062, "grad_norm": 0.1546078324317932, "learning_rate": 6.883956902602933e-06, "loss": 0.0811, "step": 13600 }, { "epoch": 3.798170224563349, "grad_norm": 0.1428447812795639, "learning_rate": 6.843545888734801e-06, "loss": 0.0795, "step": 13700 }, { "epoch": 3.825894094815636, "grad_norm": 0.1369272619485855, "learning_rate": 6.802995018336736e-06, "loss": 0.0794, "step": 13800 }, { "epoch": 3.8536179650679236, "grad_norm": 0.1972970962524414, "learning_rate": 6.762307367710797e-06, "loss": 0.0785, "step": 13900 }, { "epoch": 3.8813418353202107, "grad_norm": 0.15961000323295593, "learning_rate": 6.721486023535577e-06, "loss": 0.0787, "step": 14000 }, { "epoch": 3.8813418353202107, "eval_valid_loss": 0.06712613999843597, "eval_valid_runtime": 6.4106, "eval_valid_samples_per_second": 215.737, "eval_valid_steps_per_second": 6.864, "step": 14000 }, { "epoch": 3.8813418353202107, "eval_valid_target_loss": 0.07271508872509003, "eval_valid_target_runtime": 6.5891, "eval_valid_target_samples_per_second": 217.633, "eval_valid_target_steps_per_second": 6.829, "step": 14000 }, { "epoch": 3.9090657055724978, "grad_norm": 0.15836742520332336, "learning_rate": 6.680534082632036e-06, "loss": 0.0779, "step": 14100 }, { "epoch": 3.9367895758247853, "grad_norm": 0.1906501203775406, "learning_rate": 6.639454651728561e-06, "loss": 0.0772, "step": 14200 }, { "epoch": 3.9645134460770723, "grad_norm": 0.1872212439775467, "learning_rate": 6.598250847225286e-06, "loss": 0.0772, "step": 14300 }, { "epoch": 3.9922373163293594, "grad_norm": 0.1689438670873642, "learning_rate": 6.556925794957678e-06, "loss": 0.0769, "step": 14400 }, { "epoch": 4.0199611865816465, "grad_norm": 0.1830626279115677, "learning_rate": 6.515482629959392e-06, "loss": 0.0764, "step": 14500 }, { "epoch": 4.0199611865816465, "eval_valid_loss": 0.0653899684548378, "eval_valid_runtime": 6.4271, "eval_valid_samples_per_second": 215.181, "eval_valid_steps_per_second": 6.846, "step": 14500 }, { "epoch": 4.0199611865816465, "eval_valid_target_loss": 0.0708317682147026, "eval_valid_target_runtime": 6.5574, "eval_valid_target_samples_per_second": 218.684, "eval_valid_target_steps_per_second": 6.862, "step": 14500 }, { "epoch": 4.047685056833934, "grad_norm": 0.1517285257577896, "learning_rate": 6.473924496224447e-06, "loss": 0.0757, "step": 14600 }, { "epoch": 4.0754089270862215, "grad_norm": 0.15981799364089966, "learning_rate": 6.432254546468708e-06, "loss": 0.0751, "step": 14700 }, { "epoch": 4.1031327973385086, "grad_norm": 0.14974670112133026, "learning_rate": 6.3904759418907194e-06, "loss": 0.0755, "step": 14800 }, { "epoch": 4.130856667590796, "grad_norm": 0.15918827056884766, "learning_rate": 6.348591851931879e-06, "loss": 0.0743, "step": 14900 }, { "epoch": 4.158580537843083, "grad_norm": 0.17248332500457764, "learning_rate": 6.306605454036001e-06, "loss": 0.0747, "step": 15000 }, { "epoch": 4.158580537843083, "eval_valid_loss": 0.06470626592636108, "eval_valid_runtime": 6.4429, "eval_valid_samples_per_second": 214.654, "eval_valid_steps_per_second": 6.829, "step": 15000 }, { "epoch": 4.158580537843083, "eval_valid_target_loss": 0.07004554569721222, "eval_valid_target_runtime": 6.5941, "eval_valid_target_samples_per_second": 217.468, "eval_valid_target_steps_per_second": 6.824, "step": 15000 }, { "epoch": 4.18630440809537, "grad_norm": 0.18200209736824036, "learning_rate": 6.2645199334082674e-06, "loss": 0.0735, "step": 15100 }, { "epoch": 4.214028278347658, "grad_norm": 0.12851852178573608, "learning_rate": 6.222338482773584e-06, "loss": 0.0736, "step": 15200 }, { "epoch": 4.241752148599945, "grad_norm": 0.15132804214954376, "learning_rate": 6.180064302134374e-06, "loss": 0.0738, "step": 15300 }, { "epoch": 4.269476018852232, "grad_norm": 0.15047667920589447, "learning_rate": 6.1377005985278205e-06, "loss": 0.073, "step": 15400 }, { "epoch": 4.297199889104519, "grad_norm": 0.19985252618789673, "learning_rate": 6.095250585782562e-06, "loss": 0.0732, "step": 15500 }, { "epoch": 4.297199889104519, "eval_valid_loss": 0.062382254749536514, "eval_valid_runtime": 6.4347, "eval_valid_samples_per_second": 214.927, "eval_valid_steps_per_second": 6.838, "step": 15500 }, { "epoch": 4.297199889104519, "eval_valid_target_loss": 0.06759324669837952, "eval_valid_target_runtime": 6.5646, "eval_valid_target_samples_per_second": 218.446, "eval_valid_target_steps_per_second": 6.855, "step": 15500 }, { "epoch": 4.324923759356806, "grad_norm": 0.16384641826152802, "learning_rate": 6.0527174842748994e-06, "loss": 0.0716, "step": 15600 }, { "epoch": 4.352647629609093, "grad_norm": 0.14244656264781952, "learning_rate": 6.0101045206844676e-06, "loss": 0.0716, "step": 15700 }, { "epoch": 4.380371499861381, "grad_norm": 0.16209416091442108, "learning_rate": 5.9674149277494694e-06, "loss": 0.0714, "step": 15800 }, { "epoch": 4.408095370113668, "grad_norm": 0.17041273415088654, "learning_rate": 5.92465194402142e-06, "loss": 0.0715, "step": 15900 }, { "epoch": 4.435819240365955, "grad_norm": 0.16730940341949463, "learning_rate": 5.881818813619463e-06, "loss": 0.0714, "step": 16000 }, { "epoch": 4.435819240365955, "eval_valid_loss": 0.061134014278650284, "eval_valid_runtime": 6.4104, "eval_valid_samples_per_second": 215.742, "eval_valid_steps_per_second": 6.864, "step": 16000 }, { "epoch": 4.435819240365955, "eval_valid_target_loss": 0.06638547778129578, "eval_valid_target_runtime": 6.5651, "eval_valid_target_samples_per_second": 218.427, "eval_valid_target_steps_per_second": 6.854, "step": 16000 }, { "epoch": 4.463543110618242, "grad_norm": 0.13161396980285645, "learning_rate": 5.8389187859842675e-06, "loss": 0.0703, "step": 16100 }, { "epoch": 4.491266980870529, "grad_norm": 0.13423210382461548, "learning_rate": 5.7959551156315156e-06, "loss": 0.0707, "step": 16200 }, { "epoch": 4.518990851122817, "grad_norm": 0.20051045715808868, "learning_rate": 5.752931061904994e-06, "loss": 0.0699, "step": 16300 }, { "epoch": 4.546714721375104, "grad_norm": 0.15945318341255188, "learning_rate": 5.709849888729351e-06, "loss": 0.0697, "step": 16400 }, { "epoch": 4.574438591627391, "grad_norm": 0.13749030232429504, "learning_rate": 5.666714864362468e-06, "loss": 0.0704, "step": 16500 }, { "epoch": 4.574438591627391, "eval_valid_loss": 0.06001834571361542, "eval_valid_runtime": 6.4467, "eval_valid_samples_per_second": 214.529, "eval_valid_steps_per_second": 6.825, "step": 16500 }, { "epoch": 4.574438591627391, "eval_valid_target_loss": 0.06535307317972183, "eval_valid_target_runtime": 6.5686, "eval_valid_target_samples_per_second": 218.311, "eval_valid_target_steps_per_second": 6.851, "step": 16500 }, { "epoch": 4.602162461879678, "grad_norm": 0.133077010512352, "learning_rate": 5.6235292611475326e-06, "loss": 0.0693, "step": 16600 }, { "epoch": 4.629886332131965, "grad_norm": 0.1508035957813263, "learning_rate": 5.580296355264783e-06, "loss": 0.069, "step": 16700 }, { "epoch": 4.6576102023842525, "grad_norm": 0.14195485413074493, "learning_rate": 5.537019426482966e-06, "loss": 0.0695, "step": 16800 }, { "epoch": 4.6853340726365404, "grad_norm": 0.16586261987686157, "learning_rate": 5.493701757910536e-06, "loss": 0.0684, "step": 16900 }, { "epoch": 4.7130579428888275, "grad_norm": 0.13865657150745392, "learning_rate": 5.4503466357465765e-06, "loss": 0.0682, "step": 17000 }, { "epoch": 4.7130579428888275, "eval_valid_loss": 0.0584811232984066, "eval_valid_runtime": 6.422, "eval_valid_samples_per_second": 215.352, "eval_valid_steps_per_second": 6.851, "step": 17000 }, { "epoch": 4.7130579428888275, "eval_valid_target_loss": 0.06370435655117035, "eval_valid_target_runtime": 6.5705, "eval_valid_target_samples_per_second": 218.247, "eval_valid_target_steps_per_second": 6.849, "step": 17000 }, { "epoch": 4.740781813141115, "grad_norm": 0.1934811919927597, "learning_rate": 5.406957349031504e-06, "loss": 0.0686, "step": 17100 }, { "epoch": 4.768505683393402, "grad_norm": 0.16662567853927612, "learning_rate": 5.363537189397556e-06, "loss": 0.0682, "step": 17200 }, { "epoch": 4.796229553645689, "grad_norm": 0.15507076680660248, "learning_rate": 5.320089450819075e-06, "loss": 0.0673, "step": 17300 }, { "epoch": 4.823953423897976, "grad_norm": 0.12763585150241852, "learning_rate": 5.276617429362616e-06, "loss": 0.0671, "step": 17400 }, { "epoch": 4.851677294150264, "grad_norm": 0.15640078485012054, "learning_rate": 5.233124422936906e-06, "loss": 0.0669, "step": 17500 }, { "epoch": 4.851677294150264, "eval_valid_loss": 0.05754322186112404, "eval_valid_runtime": 6.4388, "eval_valid_samples_per_second": 214.792, "eval_valid_steps_per_second": 6.834, "step": 17500 }, { "epoch": 4.851677294150264, "eval_valid_target_loss": 0.06262939423322678, "eval_valid_target_runtime": 6.5536, "eval_valid_target_samples_per_second": 218.81, "eval_valid_target_steps_per_second": 6.866, "step": 17500 }, { "epoch": 4.879401164402551, "grad_norm": 0.16545389592647552, "learning_rate": 5.189613731042645e-06, "loss": 0.0663, "step": 17600 }, { "epoch": 4.907125034654838, "grad_norm": 0.17085812985897064, "learning_rate": 5.146088654522208e-06, "loss": 0.0657, "step": 17700 }, { "epoch": 4.934848904907125, "grad_norm": 0.14638109505176544, "learning_rate": 5.102552495309222e-06, "loss": 0.0677, "step": 17800 }, { "epoch": 4.962572775159412, "grad_norm": 0.15568013489246368, "learning_rate": 5.059008556178079e-06, "loss": 0.0657, "step": 17900 }, { "epoch": 4.9902966454117, "grad_norm": 0.16898399591445923, "learning_rate": 5.015460140493381e-06, "loss": 0.0661, "step": 18000 }, { "epoch": 4.9902966454117, "eval_valid_loss": 0.05648580938577652, "eval_valid_runtime": 6.4207, "eval_valid_samples_per_second": 215.397, "eval_valid_steps_per_second": 6.853, "step": 18000 }, { "epoch": 4.9902966454117, "eval_valid_target_loss": 0.06151015684008598, "eval_valid_target_runtime": 6.5952, "eval_valid_target_samples_per_second": 217.432, "eval_valid_target_steps_per_second": 6.823, "step": 18000 }, { "epoch": 5.018020515663987, "grad_norm": 0.13535688817501068, "learning_rate": 4.971910551959332e-06, "loss": 0.0654, "step": 18100 }, { "epoch": 5.045744385916274, "grad_norm": 0.16001687943935394, "learning_rate": 4.928363094369108e-06, "loss": 0.0656, "step": 18200 }, { "epoch": 5.073468256168561, "grad_norm": 0.1575719267129898, "learning_rate": 4.88482107135423e-06, "loss": 0.0641, "step": 18300 }, { "epoch": 5.101192126420848, "grad_norm": 0.1607745736837387, "learning_rate": 4.841287786133937e-06, "loss": 0.0642, "step": 18400 }, { "epoch": 5.128915996673135, "grad_norm": 0.13689269125461578, "learning_rate": 4.797766541264592e-06, "loss": 0.0646, "step": 18500 }, { "epoch": 5.128915996673135, "eval_valid_loss": 0.05563423037528992, "eval_valid_runtime": 6.4248, "eval_valid_samples_per_second": 215.261, "eval_valid_steps_per_second": 6.849, "step": 18500 }, { "epoch": 5.128915996673135, "eval_valid_target_loss": 0.06068035215139389, "eval_valid_target_runtime": 6.561, "eval_valid_target_samples_per_second": 218.566, "eval_valid_target_steps_per_second": 6.859, "step": 18500 }, { "epoch": 5.156639866925423, "grad_norm": 0.13576319813728333, "learning_rate": 4.754260638389145e-06, "loss": 0.0641, "step": 18600 }, { "epoch": 5.18436373717771, "grad_norm": 0.13574448227882385, "learning_rate": 4.710773377986659e-06, "loss": 0.0643, "step": 18700 }, { "epoch": 5.212087607429997, "grad_norm": 0.11536768078804016, "learning_rate": 4.667308059121928e-06, "loss": 0.064, "step": 18800 }, { "epoch": 5.239811477682284, "grad_norm": 0.1470881700515747, "learning_rate": 4.623867979195196e-06, "loss": 0.0637, "step": 18900 }, { "epoch": 5.2675353479345715, "grad_norm": 0.13156047463417053, "learning_rate": 4.580456433692017e-06, "loss": 0.0635, "step": 19000 }, { "epoch": 5.2675353479345715, "eval_valid_loss": 0.05473410338163376, "eval_valid_runtime": 6.4623, "eval_valid_samples_per_second": 214.012, "eval_valid_steps_per_second": 6.809, "step": 19000 }, { "epoch": 5.2675353479345715, "eval_valid_target_loss": 0.05973204970359802, "eval_valid_target_runtime": 6.5636, "eval_valid_target_samples_per_second": 218.477, "eval_valid_target_steps_per_second": 6.856, "step": 19000 }, { "epoch": 5.2952592181868585, "grad_norm": 0.132376030087471, "learning_rate": 4.537076715933242e-06, "loss": 0.0638, "step": 19100 }, { "epoch": 5.3229830884391465, "grad_norm": 0.14191821217536926, "learning_rate": 4.493732116825174e-06, "loss": 0.064, "step": 19200 }, { "epoch": 5.3507069586914335, "grad_norm": 0.1247839480638504, "learning_rate": 4.45042592460993e-06, "loss": 0.0627, "step": 19300 }, { "epoch": 5.378430828943721, "grad_norm": 0.12980355322360992, "learning_rate": 4.4071614246159596e-06, "loss": 0.0632, "step": 19400 }, { "epoch": 5.406154699196008, "grad_norm": 0.1391134262084961, "learning_rate": 4.363941899008833e-06, "loss": 0.0625, "step": 19500 }, { "epoch": 5.406154699196008, "eval_valid_loss": 0.05415208637714386, "eval_valid_runtime": 6.4065, "eval_valid_samples_per_second": 215.873, "eval_valid_steps_per_second": 6.868, "step": 19500 }, { "epoch": 5.406154699196008, "eval_valid_target_loss": 0.05894719064235687, "eval_valid_target_runtime": 6.569, "eval_valid_target_samples_per_second": 218.299, "eval_valid_target_steps_per_second": 6.85, "step": 19500 }, { "epoch": 5.433878569448295, "grad_norm": 0.2045671045780182, "learning_rate": 4.320770626542238e-06, "loss": 0.0629, "step": 19600 }, { "epoch": 5.461602439700582, "grad_norm": 0.1417771577835083, "learning_rate": 4.277650882309238e-06, "loss": 0.0625, "step": 19700 }, { "epoch": 5.48932630995287, "grad_norm": 0.14284995198249817, "learning_rate": 4.234585937493829e-06, "loss": 0.0623, "step": 19800 }, { "epoch": 5.517050180205157, "grad_norm": 0.1546027809381485, "learning_rate": 4.1915790591227615e-06, "loss": 0.0625, "step": 19900 }, { "epoch": 5.544774050457444, "grad_norm": 0.1454819142818451, "learning_rate": 4.148633509817715e-06, "loss": 0.0613, "step": 20000 }, { "epoch": 5.544774050457444, "eval_valid_loss": 0.05364985764026642, "eval_valid_runtime": 6.436, "eval_valid_samples_per_second": 214.885, "eval_valid_steps_per_second": 6.837, "step": 20000 }, { "epoch": 5.544774050457444, "eval_valid_target_loss": 0.05850011110305786, "eval_valid_target_runtime": 6.5534, "eval_valid_target_samples_per_second": 218.819, "eval_valid_target_steps_per_second": 6.867, "step": 20000 }, { "epoch": 5.572497920709731, "grad_norm": 0.12440012395381927, "learning_rate": 4.105752547547764e-06, "loss": 0.0613, "step": 20100 }, { "epoch": 5.600221790962018, "grad_norm": 0.14089658856391907, "learning_rate": 4.062939425382236e-06, "loss": 0.0616, "step": 20200 }, { "epoch": 5.627945661214305, "grad_norm": 0.24770374596118927, "learning_rate": 4.020197391243922e-06, "loss": 0.0621, "step": 20300 }, { "epoch": 5.655669531466593, "grad_norm": 0.11835476011037827, "learning_rate": 3.977529687662671e-06, "loss": 0.0619, "step": 20400 }, { "epoch": 5.68339340171888, "grad_norm": 0.12585273385047913, "learning_rate": 3.93493955152941e-06, "loss": 0.0612, "step": 20500 }, { "epoch": 5.68339340171888, "eval_valid_loss": 0.05319705978035927, "eval_valid_runtime": 6.4196, "eval_valid_samples_per_second": 215.435, "eval_valid_steps_per_second": 6.854, "step": 20500 }, { "epoch": 5.68339340171888, "eval_valid_target_loss": 0.058061882853507996, "eval_valid_target_runtime": 6.5894, "eval_valid_target_samples_per_second": 217.622, "eval_valid_target_steps_per_second": 6.829, "step": 20500 }, { "epoch": 5.711117271971167, "grad_norm": 0.15103484690189362, "learning_rate": 3.892430213850587e-06, "loss": 0.0615, "step": 20600 }, { "epoch": 5.738841142223454, "grad_norm": 0.1266421228647232, "learning_rate": 3.850004899503051e-06, "loss": 0.0613, "step": 20700 }, { "epoch": 5.766565012475741, "grad_norm": 0.1100655049085617, "learning_rate": 3.8076668269894045e-06, "loss": 0.0606, "step": 20800 }, { "epoch": 5.794288882728029, "grad_norm": 0.1395365446805954, "learning_rate": 3.765419208193848e-06, "loss": 0.0614, "step": 20900 }, { "epoch": 5.822012752980316, "grad_norm": 0.12668344378471375, "learning_rate": 3.723265248138506e-06, "loss": 0.0614, "step": 21000 }, { "epoch": 5.822012752980316, "eval_valid_loss": 0.052489351481199265, "eval_valid_runtime": 6.4455, "eval_valid_samples_per_second": 214.567, "eval_valid_steps_per_second": 6.826, "step": 21000 }, { "epoch": 5.822012752980316, "eval_valid_target_loss": 0.057213690131902695, "eval_valid_target_runtime": 6.5546, "eval_valid_target_samples_per_second": 218.777, "eval_valid_target_steps_per_second": 6.865, "step": 21000 }, { "epoch": 5.849736623232603, "grad_norm": 0.12728376686573029, "learning_rate": 3.681208144740291e-06, "loss": 0.0612, "step": 21100 }, { "epoch": 5.87746049348489, "grad_norm": 0.14501620829105377, "learning_rate": 3.6392510885682965e-06, "loss": 0.0601, "step": 21200 }, { "epoch": 5.9051843637371775, "grad_norm": 0.1082565188407898, "learning_rate": 3.5973972626017594e-06, "loss": 0.0608, "step": 21300 }, { "epoch": 5.9329082339894645, "grad_norm": 0.14926603436470032, "learning_rate": 3.5556498419885867e-06, "loss": 0.0603, "step": 21400 }, { "epoch": 5.9606321042417525, "grad_norm": 0.1263745278120041, "learning_rate": 3.514011993804469e-06, "loss": 0.0602, "step": 21500 }, { "epoch": 5.9606321042417525, "eval_valid_loss": 0.05212084576487541, "eval_valid_runtime": 6.439, "eval_valid_samples_per_second": 214.785, "eval_valid_steps_per_second": 6.833, "step": 21500 }, { "epoch": 5.9606321042417525, "eval_valid_target_loss": 0.05688408389687538, "eval_valid_target_runtime": 6.5822, "eval_valid_target_samples_per_second": 217.862, "eval_valid_target_steps_per_second": 6.837, "step": 21500 }, { "epoch": 5.98835597449404, "grad_norm": 0.1368781179189682, "learning_rate": 3.4724868768126384e-06, "loss": 0.0604, "step": 21600 }, { "epoch": 6.016079844746327, "grad_norm": 0.15087148547172546, "learning_rate": 3.4310776412242195e-06, "loss": 0.06, "step": 21700 }, { "epoch": 6.043803714998614, "grad_norm": 0.11400382220745087, "learning_rate": 3.3897874284592467e-06, "loss": 0.0594, "step": 21800 }, { "epoch": 6.071527585250901, "grad_norm": 0.1169167011976242, "learning_rate": 3.348619370908361e-06, "loss": 0.0598, "step": 21900 }, { "epoch": 6.099251455503188, "grad_norm": 0.12172160297632217, "learning_rate": 3.3075765916951576e-06, "loss": 0.0599, "step": 22000 }, { "epoch": 6.099251455503188, "eval_valid_loss": 0.05157113075256348, "eval_valid_runtime": 6.4258, "eval_valid_samples_per_second": 215.224, "eval_valid_steps_per_second": 6.847, "step": 22000 }, { "epoch": 6.099251455503188, "eval_valid_target_loss": 0.056347791105508804, "eval_valid_target_runtime": 6.5915, "eval_valid_target_samples_per_second": 217.554, "eval_valid_target_steps_per_second": 6.827, "step": 22000 }, { "epoch": 6.126975325755476, "grad_norm": 0.1324358880519867, "learning_rate": 3.2666622044392765e-06, "loss": 0.0591, "step": 22100 }, { "epoch": 6.154699196007763, "grad_norm": 0.12708991765975952, "learning_rate": 3.225879313020178e-06, "loss": 0.0591, "step": 22200 }, { "epoch": 6.18242306626005, "grad_norm": 0.11844506114721298, "learning_rate": 3.18523101134169e-06, "loss": 0.0592, "step": 22300 }, { "epoch": 6.210146936512337, "grad_norm": 0.12888644635677338, "learning_rate": 3.1447203830972827e-06, "loss": 0.0597, "step": 22400 }, { "epoch": 6.237870806764624, "grad_norm": 0.1485096514225006, "learning_rate": 3.104350501536134e-06, "loss": 0.0598, "step": 22500 }, { "epoch": 6.237870806764624, "eval_valid_loss": 0.051265206187963486, "eval_valid_runtime": 6.437, "eval_valid_samples_per_second": 214.85, "eval_valid_steps_per_second": 6.835, "step": 22500 }, { "epoch": 6.237870806764624, "eval_valid_target_loss": 0.056084584444761276, "eval_valid_target_runtime": 6.6, "eval_valid_target_samples_per_second": 217.273, "eval_valid_target_steps_per_second": 6.818, "step": 22500 }, { "epoch": 6.265594677016912, "grad_norm": 0.11319620907306671, "learning_rate": 3.064124429229992e-06, "loss": 0.0581, "step": 22600 }, { "epoch": 6.293318547269199, "grad_norm": 0.125896617770195, "learning_rate": 3.0240452178408286e-06, "loss": 0.0594, "step": 22700 }, { "epoch": 6.321042417521486, "grad_norm": 0.13202796876430511, "learning_rate": 2.9841159078893377e-06, "loss": 0.0587, "step": 22800 }, { "epoch": 6.348766287773773, "grad_norm": 0.12477891147136688, "learning_rate": 2.944339528524278e-06, "loss": 0.0582, "step": 22900 }, { "epoch": 6.37649015802606, "grad_norm": 0.13174673914909363, "learning_rate": 2.9047190972926597e-06, "loss": 0.0585, "step": 23000 }, { "epoch": 6.37649015802606, "eval_valid_loss": 0.05099370330572128, "eval_valid_runtime": 6.4377, "eval_valid_samples_per_second": 214.828, "eval_valid_steps_per_second": 6.835, "step": 23000 }, { "epoch": 6.37649015802606, "eval_valid_target_loss": 0.055660318583250046, "eval_valid_target_runtime": 6.5668, "eval_valid_target_samples_per_second": 218.37, "eval_valid_target_steps_per_second": 6.853, "step": 23000 }, { "epoch": 6.404214028278347, "grad_norm": 0.12851925194263458, "learning_rate": 2.8652576199108395e-06, "loss": 0.0586, "step": 23100 }, { "epoch": 6.431937898530635, "grad_norm": 0.10676029324531555, "learning_rate": 2.8259580900364825e-06, "loss": 0.0584, "step": 23200 }, { "epoch": 6.459661768782922, "grad_norm": 0.1461838185787201, "learning_rate": 2.786823489041478e-06, "loss": 0.0583, "step": 23300 }, { "epoch": 6.487385639035209, "grad_norm": 0.12321025878190994, "learning_rate": 2.747856785785743e-06, "loss": 0.0579, "step": 23400 }, { "epoch": 6.515109509287496, "grad_norm": 0.1209678128361702, "learning_rate": 2.7090609363919986e-06, "loss": 0.0581, "step": 23500 }, { "epoch": 6.515109509287496, "eval_valid_loss": 0.050510190427303314, "eval_valid_runtime": 6.447, "eval_valid_samples_per_second": 214.517, "eval_valid_steps_per_second": 6.825, "step": 23500 }, { "epoch": 6.515109509287496, "eval_valid_target_loss": 0.0551883801817894, "eval_valid_target_runtime": 6.5701, "eval_valid_target_samples_per_second": 218.262, "eval_valid_target_steps_per_second": 6.849, "step": 23500 }, { "epoch": 6.5428333795397835, "grad_norm": 0.15566356480121613, "learning_rate": 2.6704388840215277e-06, "loss": 0.0578, "step": 23600 }, { "epoch": 6.570557249792071, "grad_norm": 0.10754121840000153, "learning_rate": 2.6319935586508814e-06, "loss": 0.058, "step": 23700 }, { "epoch": 6.5982811200443585, "grad_norm": 0.12134023010730743, "learning_rate": 2.593727876849601e-06, "loss": 0.0577, "step": 23800 }, { "epoch": 6.626004990296646, "grad_norm": 0.12984460592269897, "learning_rate": 2.555644741558979e-06, "loss": 0.0575, "step": 23900 }, { "epoch": 6.653728860548933, "grad_norm": 0.13557353615760803, "learning_rate": 2.51774704187181e-06, "loss": 0.0571, "step": 24000 }, { "epoch": 6.653728860548933, "eval_valid_loss": 0.0503346286714077, "eval_valid_runtime": 6.419, "eval_valid_samples_per_second": 215.455, "eval_valid_steps_per_second": 6.855, "step": 24000 }, { "epoch": 6.653728860548933, "eval_valid_target_loss": 0.0548863522708416, "eval_valid_target_runtime": 6.5823, "eval_valid_target_samples_per_second": 217.857, "eval_valid_target_steps_per_second": 6.837, "step": 24000 }, { "epoch": 6.68145273080122, "grad_norm": 0.10979162156581879, "learning_rate": 2.4800376528132297e-06, "loss": 0.0576, "step": 24100 }, { "epoch": 6.709176601053507, "grad_norm": 0.16127757728099823, "learning_rate": 2.4425194351226082e-06, "loss": 0.0579, "step": 24200 }, { "epoch": 6.736900471305795, "grad_norm": 0.13306181132793427, "learning_rate": 2.4051952350365194e-06, "loss": 0.0572, "step": 24300 }, { "epoch": 6.764624341558082, "grad_norm": 0.11353787779808044, "learning_rate": 2.368067884072821e-06, "loss": 0.0573, "step": 24400 }, { "epoch": 6.792348211810369, "grad_norm": 0.10115820914506912, "learning_rate": 2.331140198815849e-06, "loss": 0.0574, "step": 24500 }, { "epoch": 6.792348211810369, "eval_valid_loss": 0.049953412264585495, "eval_valid_runtime": 6.4338, "eval_valid_samples_per_second": 214.958, "eval_valid_steps_per_second": 6.839, "step": 24500 }, { "epoch": 6.792348211810369, "eval_valid_target_loss": 0.054579559713602066, "eval_valid_target_runtime": 6.5694, "eval_valid_target_samples_per_second": 218.283, "eval_valid_target_steps_per_second": 6.85, "step": 24500 }, { "epoch": 6.820072082062656, "grad_norm": 0.10899285972118378, "learning_rate": 2.294414980702741e-06, "loss": 0.0573, "step": 24600 }, { "epoch": 6.847795952314943, "grad_norm": 0.1248159185051918, "learning_rate": 2.257895015810913e-06, "loss": 0.0568, "step": 24700 }, { "epoch": 6.87551982256723, "grad_norm": 0.10761197656393051, "learning_rate": 2.221583074646701e-06, "loss": 0.0574, "step": 24800 }, { "epoch": 6.903243692819517, "grad_norm": 0.13541601598262787, "learning_rate": 2.1854819119351784e-06, "loss": 0.0562, "step": 24900 }, { "epoch": 6.930967563071805, "grad_norm": 0.10959000140428543, "learning_rate": 2.1495942664111814e-06, "loss": 0.0576, "step": 25000 }, { "epoch": 6.930967563071805, "eval_valid_loss": 0.049802832305431366, "eval_valid_runtime": 6.4091, "eval_valid_samples_per_second": 215.786, "eval_valid_steps_per_second": 6.865, "step": 25000 }, { "epoch": 6.930967563071805, "eval_valid_target_loss": 0.05434631556272507, "eval_valid_target_runtime": 6.5766, "eval_valid_target_samples_per_second": 218.047, "eval_valid_target_steps_per_second": 6.842, "step": 25000 }, { "epoch": 6.958691433324092, "grad_norm": 0.11864270269870758, "learning_rate": 2.113922860611532e-06, "loss": 0.0571, "step": 25100 }, { "epoch": 6.986415303576379, "grad_norm": 0.10493431985378265, "learning_rate": 2.078470400668506e-06, "loss": 0.0572, "step": 25200 }, { "epoch": 7.014139173828666, "grad_norm": 0.10294145345687866, "learning_rate": 2.0432395761045427e-06, "loss": 0.0562, "step": 25300 }, { "epoch": 7.041863044080953, "grad_norm": 0.11174608767032623, "learning_rate": 2.008233059628193e-06, "loss": 0.0562, "step": 25400 }, { "epoch": 7.069586914333241, "grad_norm": 0.10171514004468918, "learning_rate": 1.9734535069313753e-06, "loss": 0.056, "step": 25500 }, { "epoch": 7.069586914333241, "eval_valid_loss": 0.04948737472295761, "eval_valid_runtime": 6.442, "eval_valid_samples_per_second": 214.685, "eval_valid_steps_per_second": 6.83, "step": 25500 }, { "epoch": 7.069586914333241, "eval_valid_target_loss": 0.05410830304026604, "eval_valid_target_runtime": 6.5896, "eval_valid_target_samples_per_second": 217.617, "eval_valid_target_steps_per_second": 6.829, "step": 25500 }, { "epoch": 7.097310784585528, "grad_norm": 0.10731488466262817, "learning_rate": 1.9389035564879104e-06, "loss": 0.0569, "step": 25600 }, { "epoch": 7.125034654837815, "grad_norm": 0.0954216718673706, "learning_rate": 1.9045858293533399e-06, "loss": 0.0566, "step": 25700 }, { "epoch": 7.1527585250901025, "grad_norm": 0.11443454772233963, "learning_rate": 1.8705029289661054e-06, "loss": 0.057, "step": 25800 }, { "epoch": 7.1804823953423895, "grad_norm": 0.10671606659889221, "learning_rate": 1.8366574409500344e-06, "loss": 0.0561, "step": 25900 }, { "epoch": 7.208206265594677, "grad_norm": 0.1028604656457901, "learning_rate": 1.8030519329181916e-06, "loss": 0.0561, "step": 26000 }, { "epoch": 7.208206265594677, "eval_valid_loss": 0.04931313917040825, "eval_valid_runtime": 6.431, "eval_valid_samples_per_second": 215.053, "eval_valid_steps_per_second": 6.842, "step": 26000 }, { "epoch": 7.208206265594677, "eval_valid_target_loss": 0.053888678550720215, "eval_valid_target_runtime": 6.5712, "eval_valid_target_samples_per_second": 218.225, "eval_valid_target_steps_per_second": 6.848, "step": 26000 }, { "epoch": 7.2359301358469645, "grad_norm": 0.11538730561733246, "learning_rate": 1.7696889542780904e-06, "loss": 0.0564, "step": 26100 }, { "epoch": 7.263654006099252, "grad_norm": 0.10585539788007736, "learning_rate": 1.7365710360382882e-06, "loss": 0.0562, "step": 26200 }, { "epoch": 7.291377876351539, "grad_norm": 0.09750411659479141, "learning_rate": 1.7037006906163773e-06, "loss": 0.0563, "step": 26300 }, { "epoch": 7.319101746603826, "grad_norm": 0.10777630656957626, "learning_rate": 1.6710804116483886e-06, "loss": 0.0556, "step": 26400 }, { "epoch": 7.346825616856113, "grad_norm": 0.13231071829795837, "learning_rate": 1.6387126737996067e-06, "loss": 0.0559, "step": 26500 }, { "epoch": 7.346825616856113, "eval_valid_loss": 0.04909936338663101, "eval_valid_runtime": 6.4292, "eval_valid_samples_per_second": 215.112, "eval_valid_steps_per_second": 6.844, "step": 26500 }, { "epoch": 7.346825616856113, "eval_valid_target_loss": 0.05357712134718895, "eval_valid_target_runtime": 6.5542, "eval_valid_target_samples_per_second": 218.792, "eval_valid_target_steps_per_second": 6.866, "step": 26500 }, { "epoch": 7.374549487108401, "grad_norm": 0.10591776669025421, "learning_rate": 1.6065999325768544e-06, "loss": 0.0559, "step": 26600 }, { "epoch": 7.402273357360688, "grad_norm": 0.11603645980358124, "learning_rate": 1.5747446241421931e-06, "loss": 0.0557, "step": 26700 }, { "epoch": 7.429997227612975, "grad_norm": 0.09715123474597931, "learning_rate": 1.5431491651281123e-06, "loss": 0.0563, "step": 26800 }, { "epoch": 7.457721097865262, "grad_norm": 0.10046205669641495, "learning_rate": 1.511815952454208e-06, "loss": 0.0556, "step": 26900 }, { "epoch": 7.485444968117549, "grad_norm": 0.11805932968854904, "learning_rate": 1.480747363145334e-06, "loss": 0.0556, "step": 27000 }, { "epoch": 7.485444968117549, "eval_valid_loss": 0.04887402430176735, "eval_valid_runtime": 6.4098, "eval_valid_samples_per_second": 215.763, "eval_valid_steps_per_second": 6.864, "step": 27000 }, { "epoch": 7.485444968117549, "eval_valid_target_loss": 0.05348382145166397, "eval_valid_target_runtime": 6.5773, "eval_valid_target_samples_per_second": 218.023, "eval_valid_target_steps_per_second": 6.842, "step": 27000 }, { "epoch": 7.513168838369836, "grad_norm": 0.1107444316148758, "learning_rate": 1.4499457541512746e-06, "loss": 0.0554, "step": 27100 }, { "epoch": 7.540892708622124, "grad_norm": 0.10029349476099014, "learning_rate": 1.4194134621679478e-06, "loss": 0.0559, "step": 27200 }, { "epoch": 7.568616578874411, "grad_norm": 0.09976372122764587, "learning_rate": 1.3891528034601316e-06, "loss": 0.0565, "step": 27300 }, { "epoch": 7.596340449126698, "grad_norm": 0.10560230165719986, "learning_rate": 1.3591660736857453e-06, "loss": 0.0553, "step": 27400 }, { "epoch": 7.624064319378985, "grad_norm": 0.09814602881669998, "learning_rate": 1.329455547721697e-06, "loss": 0.0552, "step": 27500 }, { "epoch": 7.624064319378985, "eval_valid_loss": 0.04867083579301834, "eval_valid_runtime": 6.4389, "eval_valid_samples_per_second": 214.79, "eval_valid_steps_per_second": 6.834, "step": 27500 }, { "epoch": 7.624064319378985, "eval_valid_target_loss": 0.053231850266456604, "eval_valid_target_runtime": 6.5692, "eval_valid_target_samples_per_second": 218.292, "eval_valid_target_steps_per_second": 6.85, "step": 27500 }, { "epoch": 7.651788189631272, "grad_norm": 0.10253589600324631, "learning_rate": 1.300023479491303e-06, "loss": 0.0555, "step": 27600 }, { "epoch": 7.67951205988356, "grad_norm": 0.10933282226324081, "learning_rate": 1.2708721017933007e-06, "loss": 0.0551, "step": 27700 }, { "epoch": 7.707235930135847, "grad_norm": 0.11853484809398651, "learning_rate": 1.2420036261324598e-06, "loss": 0.056, "step": 27800 }, { "epoch": 7.734959800388134, "grad_norm": 0.0992041826248169, "learning_rate": 1.2134202425518139e-06, "loss": 0.0547, "step": 27900 }, { "epoch": 7.762683670640421, "grad_norm": 0.10824355483055115, "learning_rate": 1.185124119466517e-06, "loss": 0.0554, "step": 28000 }, { "epoch": 7.762683670640421, "eval_valid_loss": 0.048471271991729736, "eval_valid_runtime": 6.414, "eval_valid_samples_per_second": 215.623, "eval_valid_steps_per_second": 6.86, "step": 28000 }, { "epoch": 7.762683670640421, "eval_valid_target_loss": 0.05302482470870018, "eval_valid_target_runtime": 6.5682, "eval_valid_target_samples_per_second": 218.326, "eval_valid_target_steps_per_second": 6.851, "step": 28000 }, { "epoch": 7.7904075408927085, "grad_norm": 0.09927680343389511, "learning_rate": 1.1571174034993416e-06, "loss": 0.0555, "step": 28100 }, { "epoch": 7.8181314111449955, "grad_norm": 0.09600567072629929, "learning_rate": 1.129402219317825e-06, "loss": 0.0553, "step": 28200 }, { "epoch": 7.845855281397283, "grad_norm": 0.11057105660438538, "learning_rate": 1.1019806694730989e-06, "loss": 0.0557, "step": 28300 }, { "epoch": 7.873579151649571, "grad_norm": 0.10991726815700531, "learning_rate": 1.074854834240368e-06, "loss": 0.0553, "step": 28400 }, { "epoch": 7.901303021901858, "grad_norm": 0.09168905019760132, "learning_rate": 1.0480267714611048e-06, "loss": 0.0551, "step": 28500 }, { "epoch": 7.901303021901858, "eval_valid_loss": 0.04835043475031853, "eval_valid_runtime": 6.4532, "eval_valid_samples_per_second": 214.313, "eval_valid_steps_per_second": 6.818, "step": 28500 }, { "epoch": 7.901303021901858, "eval_valid_target_loss": 0.05293356999754906, "eval_valid_target_runtime": 6.5812, "eval_valid_target_samples_per_second": 217.894, "eval_valid_target_steps_per_second": 6.838, "step": 28500 }, { "epoch": 7.929026892154145, "grad_norm": 0.09465237706899643, "learning_rate": 1.0214985163869378e-06, "loss": 0.0556, "step": 28600 }, { "epoch": 7.956750762406432, "grad_norm": 0.10842736065387726, "learning_rate": 9.952720815252397e-07, "loss": 0.0543, "step": 28700 }, { "epoch": 7.984474632658719, "grad_norm": 0.09609558433294296, "learning_rate": 9.693494564864648e-07, "loss": 0.0554, "step": 28800 }, { "epoch": 8.012198502911007, "grad_norm": 0.10819283127784729, "learning_rate": 9.437326078332099e-07, "loss": 0.0545, "step": 28900 }, { "epoch": 8.039922373163293, "grad_norm": 0.09054001420736313, "learning_rate": 9.18423478931016e-07, "loss": 0.0554, "step": 29000 }, { "epoch": 8.039922373163293, "eval_valid_loss": 0.04819526523351669, "eval_valid_runtime": 6.4165, "eval_valid_samples_per_second": 215.536, "eval_valid_steps_per_second": 6.857, "step": 29000 }, { "epoch": 8.039922373163293, "eval_valid_target_loss": 0.05275378376245499, "eval_valid_target_runtime": 6.5635, "eval_valid_target_samples_per_second": 218.482, "eval_valid_target_steps_per_second": 6.856, "step": 29000 }, { "epoch": 8.067646243415581, "grad_norm": 0.10373499244451523, "learning_rate": 8.934239898009517e-07, "loss": 0.0552, "step": 29100 }, { "epoch": 8.095370113667869, "grad_norm": 0.09614498168230057, "learning_rate": 8.687360369739473e-07, "loss": 0.0545, "step": 29200 }, { "epoch": 8.123093983920155, "grad_norm": 0.1014479324221611, "learning_rate": 8.443614933469208e-07, "loss": 0.0549, "step": 29300 }, { "epoch": 8.150817854172443, "grad_norm": 0.08971751481294632, "learning_rate": 8.203022080406952e-07, "loss": 0.0546, "step": 29400 }, { "epoch": 8.17854172442473, "grad_norm": 0.09659924358129501, "learning_rate": 7.965600062597184e-07, "loss": 0.0542, "step": 29500 }, { "epoch": 8.17854172442473, "eval_valid_loss": 0.04812739044427872, "eval_valid_runtime": 6.4674, "eval_valid_samples_per_second": 213.843, "eval_valid_steps_per_second": 6.803, "step": 29500 }, { "epoch": 8.17854172442473, "eval_valid_target_loss": 0.05264822766184807, "eval_valid_target_runtime": 6.5912, "eval_valid_target_samples_per_second": 217.563, "eval_valid_target_steps_per_second": 6.827, "step": 29500 }, { "epoch": 8.206265594677017, "grad_norm": 0.1034499853849411, "learning_rate": 7.731366891535969e-07, "loss": 0.0548, "step": 29600 }, { "epoch": 8.233989464929303, "grad_norm": 0.0934043675661087, "learning_rate": 7.500340336804607e-07, "loss": 0.0542, "step": 29700 }, { "epoch": 8.261713335181591, "grad_norm": 0.09693789482116699, "learning_rate": 7.272537924721467e-07, "loss": 0.0553, "step": 29800 }, { "epoch": 8.28943720543388, "grad_norm": 0.09552415460348129, "learning_rate": 7.047976937012568e-07, "loss": 0.0543, "step": 29900 }, { "epoch": 8.317161075686165, "grad_norm": 0.0978178158402443, "learning_rate": 6.826674409500389e-07, "loss": 0.0548, "step": 30000 }, { "epoch": 8.317161075686165, "eval_valid_loss": 0.04797354340553284, "eval_valid_runtime": 6.442, "eval_valid_samples_per_second": 214.683, "eval_valid_steps_per_second": 6.83, "step": 30000 }, { "epoch": 8.317161075686165, "eval_valid_target_loss": 0.052511684596538544, "eval_valid_target_runtime": 6.5615, "eval_valid_target_samples_per_second": 218.549, "eval_valid_target_steps_per_second": 6.858, "step": 30000 }, { "epoch": 8.344884945938453, "grad_norm": 0.09591928869485855, "learning_rate": 6.608647130811502e-07, "loss": 0.0543, "step": 30100 }, { "epoch": 8.37260881619074, "grad_norm": 0.09678730368614197, "learning_rate": 6.393911641103051e-07, "loss": 0.0542, "step": 30200 }, { "epoch": 8.400332686443027, "grad_norm": 0.10894029587507248, "learning_rate": 6.182484230807845e-07, "loss": 0.0542, "step": 30300 }, { "epoch": 8.428056556695315, "grad_norm": 0.10065341740846634, "learning_rate": 5.974380939398555e-07, "loss": 0.0549, "step": 30400 }, { "epoch": 8.455780426947602, "grad_norm": 0.11015477776527405, "learning_rate": 5.769617554170959e-07, "loss": 0.0544, "step": 30500 }, { "epoch": 8.455780426947602, "eval_valid_loss": 0.04785359278321266, "eval_valid_runtime": 6.4159, "eval_valid_samples_per_second": 215.558, "eval_valid_steps_per_second": 6.858, "step": 30500 }, { "epoch": 8.455780426947602, "eval_valid_target_loss": 0.05238433927297592, "eval_valid_target_runtime": 6.575, "eval_valid_target_samples_per_second": 218.1, "eval_valid_target_steps_per_second": 6.844, "step": 30500 }, { "epoch": 8.48350429719989, "grad_norm": 0.10229642689228058, "learning_rate": 5.568209609046238e-07, "loss": 0.0542, "step": 30600 }, { "epoch": 8.511228167452176, "grad_norm": 0.1019807681441307, "learning_rate": 5.370172383392514e-07, "loss": 0.0548, "step": 30700 }, { "epoch": 8.538952037704464, "grad_norm": 0.1037830114364624, "learning_rate": 5.175520900865754e-07, "loss": 0.0538, "step": 30800 }, { "epoch": 8.56667590795675, "grad_norm": 0.0952112227678299, "learning_rate": 4.984269928270002e-07, "loss": 0.0537, "step": 30900 }, { "epoch": 8.594399778209038, "grad_norm": 0.09642232209444046, "learning_rate": 4.796433974437148e-07, "loss": 0.0533, "step": 31000 }, { "epoch": 8.594399778209038, "eval_valid_loss": 0.04777803644537926, "eval_valid_runtime": 6.4399, "eval_valid_samples_per_second": 214.756, "eval_valid_steps_per_second": 6.832, "step": 31000 }, { "epoch": 8.594399778209038, "eval_valid_target_loss": 0.052354373037815094, "eval_valid_target_runtime": 6.5668, "eval_valid_target_samples_per_second": 218.371, "eval_valid_target_steps_per_second": 6.853, "step": 31000 }, { "epoch": 8.622123648461326, "grad_norm": 0.10211507230997086, "learning_rate": 4.6120272891262365e-07, "loss": 0.0544, "step": 31100 }, { "epoch": 8.649847518713612, "grad_norm": 0.0912129357457161, "learning_rate": 4.4310638619424363e-07, "loss": 0.0536, "step": 31200 }, { "epoch": 8.6775713889659, "grad_norm": 0.10558176785707474, "learning_rate": 4.2535574212757667e-07, "loss": 0.0542, "step": 31300 }, { "epoch": 8.705295259218186, "grad_norm": 0.10381397604942322, "learning_rate": 4.0795214332596145e-07, "loss": 0.0547, "step": 31400 }, { "epoch": 8.733019129470474, "grad_norm": 0.09383094310760498, "learning_rate": 3.908969100749121e-07, "loss": 0.055, "step": 31500 }, { "epoch": 8.733019129470474, "eval_valid_loss": 0.047727905213832855, "eval_valid_runtime": 6.4171, "eval_valid_samples_per_second": 215.518, "eval_valid_steps_per_second": 6.857, "step": 31500 }, { "epoch": 8.733019129470474, "eval_valid_target_loss": 0.05224745720624924, "eval_valid_target_runtime": 6.5727, "eval_valid_target_samples_per_second": 218.174, "eval_valid_target_steps_per_second": 6.846, "step": 31500 }, { "epoch": 8.760742999722762, "grad_norm": 0.10438426584005356, "learning_rate": 3.7419133623196825e-07, "loss": 0.0541, "step": 31600 }, { "epoch": 8.788466869975048, "grad_norm": 0.09324101358652115, "learning_rate": 3.5783668912852453e-07, "loss": 0.0537, "step": 31700 }, { "epoch": 8.816190740227336, "grad_norm": 0.09235464036464691, "learning_rate": 3.4183420947369873e-07, "loss": 0.0544, "step": 31800 }, { "epoch": 8.843914610479622, "grad_norm": 0.09870747476816177, "learning_rate": 3.261851112602055e-07, "loss": 0.0543, "step": 31900 }, { "epoch": 8.87163848073191, "grad_norm": 0.10918495059013367, "learning_rate": 3.108905816722546e-07, "loss": 0.054, "step": 32000 }, { "epoch": 8.87163848073191, "eval_valid_loss": 0.047707512974739075, "eval_valid_runtime": 6.4362, "eval_valid_samples_per_second": 214.879, "eval_valid_steps_per_second": 6.836, "step": 32000 }, { "epoch": 8.87163848073191, "eval_valid_target_loss": 0.05221306532621384, "eval_valid_target_runtime": 6.5779, "eval_valid_target_samples_per_second": 218.002, "eval_valid_target_steps_per_second": 6.841, "step": 32000 }, { "epoch": 8.899362350984198, "grad_norm": 0.09537260234355927, "learning_rate": 2.9595178099549315e-07, "loss": 0.054, "step": 32100 }, { "epoch": 8.927086221236484, "grad_norm": 0.09188380092382431, "learning_rate": 2.8136984252898515e-07, "loss": 0.0542, "step": 32200 }, { "epoch": 8.954810091488772, "grad_norm": 0.09919969737529755, "learning_rate": 2.671458724992254e-07, "loss": 0.0542, "step": 32300 }, { "epoch": 8.982533961741058, "grad_norm": 0.09692647308111191, "learning_rate": 2.532809499762312e-07, "loss": 0.0544, "step": 32400 }, { "epoch": 9.010257831993346, "grad_norm": 0.09277132153511047, "learning_rate": 2.397761267916726e-07, "loss": 0.0539, "step": 32500 }, { "epoch": 9.010257831993346, "eval_valid_loss": 0.047637518495321274, "eval_valid_runtime": 6.4471, "eval_valid_samples_per_second": 214.516, "eval_valid_steps_per_second": 6.825, "step": 32500 }, { "epoch": 9.010257831993346, "eval_valid_target_loss": 0.052208978682756424, "eval_valid_target_runtime": 6.5636, "eval_valid_target_samples_per_second": 218.477, "eval_valid_target_steps_per_second": 6.856, "step": 32500 }, { "epoch": 9.037981702245634, "grad_norm": 0.09585940837860107, "learning_rate": 2.2663242745908087e-07, "loss": 0.0542, "step": 32600 }, { "epoch": 9.06570557249792, "grad_norm": 0.09488432109355927, "learning_rate": 2.138508490961244e-07, "loss": 0.0533, "step": 32700 }, { "epoch": 9.093429442750208, "grad_norm": 0.09499957412481308, "learning_rate": 2.014323613489666e-07, "loss": 0.0543, "step": 32800 }, { "epoch": 9.121153313002495, "grad_norm": 0.09435317665338516, "learning_rate": 1.8937790631870345e-07, "loss": 0.0536, "step": 32900 }, { "epoch": 9.148877183254783, "grad_norm": 0.10342779755592346, "learning_rate": 1.7768839848989584e-07, "loss": 0.0539, "step": 33000 }, { "epoch": 9.148877183254783, "eval_valid_loss": 0.047598063945770264, "eval_valid_runtime": 6.4315, "eval_valid_samples_per_second": 215.037, "eval_valid_steps_per_second": 6.841, "step": 33000 }, { "epoch": 9.148877183254783, "eval_valid_target_loss": 0.05212317034602165, "eval_valid_target_runtime": 6.5736, "eval_valid_target_samples_per_second": 218.146, "eval_valid_target_steps_per_second": 6.846, "step": 33000 }, { "epoch": 9.176601053507069, "grad_norm": 0.09814909845590591, "learning_rate": 1.6636472466118992e-07, "loss": 0.0542, "step": 33100 }, { "epoch": 9.204324923759357, "grad_norm": 0.09484022855758667, "learning_rate": 1.5540774387804825e-07, "loss": 0.0544, "step": 33200 }, { "epoch": 9.232048794011645, "grad_norm": 0.07888332009315491, "learning_rate": 1.448182873675752e-07, "loss": 0.0539, "step": 33300 }, { "epoch": 9.25977266426393, "grad_norm": 0.0964021384716034, "learning_rate": 1.345971584754585e-07, "loss": 0.0539, "step": 33400 }, { "epoch": 9.287496534516219, "grad_norm": 0.10322096943855286, "learning_rate": 1.2474513260502695e-07, "loss": 0.0536, "step": 33500 }, { "epoch": 9.287496534516219, "eval_valid_loss": 0.047564879059791565, "eval_valid_runtime": 6.4358, "eval_valid_samples_per_second": 214.89, "eval_valid_steps_per_second": 6.837, "step": 33500 }, { "epoch": 9.287496534516219, "eval_valid_target_loss": 0.05209695175290108, "eval_valid_target_runtime": 6.5809, "eval_valid_target_samples_per_second": 217.904, "eval_valid_target_steps_per_second": 6.838, "step": 33500 }, { "epoch": 9.315220404768505, "grad_norm": 0.10957927256822586, "learning_rate": 1.1526295715842628e-07, "loss": 0.0541, "step": 33600 }, { "epoch": 9.342944275020793, "grad_norm": 0.09433583915233612, "learning_rate": 1.0615135147991562e-07, "loss": 0.0542, "step": 33700 }, { "epoch": 9.370668145273081, "grad_norm": 0.09703412652015686, "learning_rate": 9.741100680130122e-08, "loss": 0.0535, "step": 33800 }, { "epoch": 9.398392015525367, "grad_norm": 0.10180799663066864, "learning_rate": 8.904258618949335e-08, "loss": 0.054, "step": 33900 }, { "epoch": 9.426115885777655, "grad_norm": 0.09336613118648529, "learning_rate": 8.104672449620598e-08, "loss": 0.0532, "step": 34000 }, { "epoch": 9.426115885777655, "eval_valid_loss": 0.047556404024362564, "eval_valid_runtime": 6.42, "eval_valid_samples_per_second": 215.421, "eval_valid_steps_per_second": 6.854, "step": 34000 }, { "epoch": 9.426115885777655, "eval_valid_target_loss": 0.05208129063248634, "eval_valid_target_runtime": 6.595, "eval_valid_target_samples_per_second": 217.437, "eval_valid_target_steps_per_second": 6.823, "step": 34000 }, { "epoch": 9.453839756029941, "grad_norm": 0.0890408605337143, "learning_rate": 7.342402830979589e-08, "loss": 0.054, "step": 34100 }, { "epoch": 9.48156362628223, "grad_norm": 0.09568461775779724, "learning_rate": 6.617507590924332e-08, "loss": 0.0535, "step": 34200 }, { "epoch": 9.509287496534515, "grad_norm": 0.09256019443273544, "learning_rate": 5.930041722028379e-08, "loss": 0.054, "step": 34300 }, { "epoch": 9.537011366786803, "grad_norm": 0.09314898401498795, "learning_rate": 5.280057377368863e-08, "loss": 0.0535, "step": 34400 }, { "epoch": 9.564735237039091, "grad_norm": 0.10256827622652054, "learning_rate": 4.667603866569892e-08, "loss": 0.0537, "step": 34500 }, { "epoch": 9.564735237039091, "eval_valid_loss": 0.047560639679431915, "eval_valid_runtime": 6.4632, "eval_valid_samples_per_second": 213.979, "eval_valid_steps_per_second": 6.808, "step": 34500 }, { "epoch": 9.564735237039091, "eval_valid_target_loss": 0.05206665024161339, "eval_valid_target_runtime": 6.5886, "eval_valid_target_samples_per_second": 217.649, "eval_valid_target_steps_per_second": 6.83, "step": 34500 }, { "epoch": 9.592459107291377, "grad_norm": 0.0861942321062088, "learning_rate": 4.092727652062034e-08, "loss": 0.0537, "step": 34600 }, { "epoch": 9.620182977543665, "grad_norm": 0.09521106630563736, "learning_rate": 3.555472345557365e-08, "loss": 0.0535, "step": 34700 }, { "epoch": 9.647906847795952, "grad_norm": 0.10885845869779587, "learning_rate": 3.055878704741e-08, "loss": 0.0542, "step": 34800 }, { "epoch": 9.67563071804824, "grad_norm": 0.09145703911781311, "learning_rate": 2.5939846301791804e-08, "loss": 0.0541, "step": 34900 }, { "epoch": 9.703354588300527, "grad_norm": 0.09051796793937683, "learning_rate": 2.1698251624438503e-08, "loss": 0.0544, "step": 35000 }, { "epoch": 9.703354588300527, "eval_valid_loss": 0.04752533510327339, "eval_valid_runtime": 6.4168, "eval_valid_samples_per_second": 215.528, "eval_valid_steps_per_second": 6.857, "step": 35000 }, { "epoch": 9.703354588300527, "eval_valid_target_loss": 0.05207618325948715, "eval_valid_target_runtime": 6.57, "eval_valid_target_samples_per_second": 218.265, "eval_valid_target_steps_per_second": 6.849, "step": 35000 }, { "epoch": 9.731078458552814, "grad_norm": 0.0903056338429451, "learning_rate": 1.7834324794546164e-08, "loss": 0.0539, "step": 35100 }, { "epoch": 9.758802328805102, "grad_norm": 0.0897304117679596, "learning_rate": 1.434835894037423e-08, "loss": 0.0539, "step": 35200 }, { "epoch": 9.786526199057388, "grad_norm": 0.10058806836605072, "learning_rate": 1.1240618517009416e-08, "loss": 0.0542, "step": 35300 }, { "epoch": 9.814250069309676, "grad_norm": 0.1056876927614212, "learning_rate": 8.511339286303432e-09, "loss": 0.0537, "step": 35400 }, { "epoch": 9.841973939561964, "grad_norm": 0.08990786969661713, "learning_rate": 6.1607282989856184e-09, "loss": 0.0547, "step": 35500 }, { "epoch": 9.841973939561964, "eval_valid_loss": 0.047528158873319626, "eval_valid_runtime": 6.4412, "eval_valid_samples_per_second": 214.712, "eval_valid_steps_per_second": 6.831, "step": 35500 }, { "epoch": 9.841973939561964, "eval_valid_target_loss": 0.05206017941236496, "eval_valid_target_runtime": 6.5864, "eval_valid_target_samples_per_second": 217.72, "eval_valid_target_steps_per_second": 6.832, "step": 35500 }, { "epoch": 9.86969780981425, "grad_norm": 0.08090436458587646, "learning_rate": 4.188963878958841e-09, "loss": 0.0536, "step": 35600 }, { "epoch": 9.897421680066538, "grad_norm": 0.08319131284952164, "learning_rate": 2.5961956097669827e-09, "loss": 0.0541, "step": 35700 }, { "epoch": 9.925145550318824, "grad_norm": 0.10666873306035995, "learning_rate": 1.3825443232517999e-09, "loss": 0.0541, "step": 35800 }, { "epoch": 9.952869420571112, "grad_norm": 0.10748881101608276, "learning_rate": 5.48102090381919e-10, "loss": 0.0543, "step": 35900 }, { "epoch": 9.9805932908234, "grad_norm": 0.10198221355676651, "learning_rate": 9.293221427231214e-11, "loss": 0.0533, "step": 36000 }, { "epoch": 9.9805932908234, "eval_valid_loss": 0.04753027856349945, "eval_valid_runtime": 6.4518, "eval_valid_samples_per_second": 214.359, "eval_valid_steps_per_second": 6.82, "step": 36000 }, { "epoch": 9.9805932908234, "eval_valid_target_loss": 0.05205439031124115, "eval_valid_target_runtime": 6.5698, "eval_valid_target_samples_per_second": 218.272, "eval_valid_target_steps_per_second": 6.85, "step": 36000 } ], "logging_steps": 100, "max_steps": 36070, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.429394066302619e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }