| { | |
| "best_metric": 0.04752533510327339, | |
| "best_model_checkpoint": "results/checkpoint-35000", | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 36070, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02772387025228722, | |
| "grad_norm": 1.750556230545044, | |
| "learning_rate": 9.999814117181637e-06, | |
| "loss": 6.049, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05544774050457444, | |
| "grad_norm": 0.824866533279419, | |
| "learning_rate": 9.999248953493363e-06, | |
| "loss": 3.0817, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08317161075686166, | |
| "grad_norm": 0.4907461702823639, | |
| "learning_rate": 9.998304532844263e-06, | |
| "loss": 2.3969, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.11089548100914888, | |
| "grad_norm": 0.4534800946712494, | |
| "learning_rate": 9.996980926880713e-06, | |
| "loss": 2.0935, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1386193512614361, | |
| "grad_norm": 0.47491493821144104, | |
| "learning_rate": 9.995278236015153e-06, | |
| "loss": 1.9245, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1386193512614361, | |
| "eval_valid_loss": 1.7945984601974487, | |
| "eval_valid_runtime": 6.4498, | |
| "eval_valid_samples_per_second": 214.426, | |
| "eval_valid_steps_per_second": 6.822, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1386193512614361, | |
| "eval_valid_target_loss": 1.875697374343872, | |
| "eval_valid_target_runtime": 6.5527, | |
| "eval_valid_target_samples_per_second": 218.841, | |
| "eval_valid_target_steps_per_second": 6.867, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16634322151372333, | |
| "grad_norm": 0.5983259677886963, | |
| "learning_rate": 9.99319658941846e-06, | |
| "loss": 1.8294, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.19406709176601053, | |
| "grad_norm": 0.6906803846359253, | |
| "learning_rate": 9.990736145010146e-06, | |
| "loss": 1.7625, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.22179096201829776, | |
| "grad_norm": 1.4024661779403687, | |
| "learning_rate": 9.987897089446381e-06, | |
| "loss": 1.709, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24951483227058496, | |
| "grad_norm": 1.073205590248108, | |
| "learning_rate": 9.984679638105837e-06, | |
| "loss": 1.6595, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2772387025228722, | |
| "grad_norm": 1.280462384223938, | |
| "learning_rate": 9.981084035073337e-06, | |
| "loss": 1.6153, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2772387025228722, | |
| "eval_valid_loss": 1.5186923742294312, | |
| "eval_valid_runtime": 6.4198, | |
| "eval_valid_samples_per_second": 215.427, | |
| "eval_valid_steps_per_second": 6.854, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2772387025228722, | |
| "eval_valid_target_loss": 1.5994268655776978, | |
| "eval_valid_target_runtime": 6.5778, | |
| "eval_valid_target_samples_per_second": 218.006, | |
| "eval_valid_target_steps_per_second": 6.841, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3049625727751594, | |
| "grad_norm": 0.9407665133476257, | |
| "learning_rate": 9.977110553121353e-06, | |
| "loss": 1.567, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.33268644302744665, | |
| "grad_norm": 1.5439337491989136, | |
| "learning_rate": 9.972759493689301e-06, | |
| "loss": 1.5275, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.36041031327973383, | |
| "grad_norm": 2.2176036834716797, | |
| "learning_rate": 9.968031186860677e-06, | |
| "loss": 1.4833, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.38813418353202106, | |
| "grad_norm": 1.6237233877182007, | |
| "learning_rate": 9.962925991338018e-06, | |
| "loss": 1.4457, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4158580537843083, | |
| "grad_norm": 1.3075989484786987, | |
| "learning_rate": 9.957444294415685e-06, | |
| "loss": 1.407, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4158580537843083, | |
| "eval_valid_loss": 1.326136589050293, | |
| "eval_valid_runtime": 6.413, | |
| "eval_valid_samples_per_second": 215.655, | |
| "eval_valid_steps_per_second": 6.861, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4158580537843083, | |
| "eval_valid_target_loss": 1.3982958793640137, | |
| "eval_valid_target_runtime": 6.5728, | |
| "eval_valid_target_samples_per_second": 218.172, | |
| "eval_valid_target_steps_per_second": 6.846, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4435819240365955, | |
| "grad_norm": 1.379807472229004, | |
| "learning_rate": 9.951586511950491e-06, | |
| "loss": 1.3768, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.47130579428888275, | |
| "grad_norm": 0.737086832523346, | |
| "learning_rate": 9.945353088330137e-06, | |
| "loss": 1.347, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.4990296645411699, | |
| "grad_norm": 0.6332296133041382, | |
| "learning_rate": 9.93874449643952e-06, | |
| "loss": 1.3188, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5267535347934572, | |
| "grad_norm": 0.6948099732398987, | |
| "learning_rate": 9.931761237624833e-06, | |
| "loss": 1.2903, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5544774050457444, | |
| "grad_norm": 0.9397527575492859, | |
| "learning_rate": 9.924403841655565e-06, | |
| "loss": 1.2671, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5544774050457444, | |
| "eval_valid_loss": 1.2014020681381226, | |
| "eval_valid_runtime": 6.4367, | |
| "eval_valid_samples_per_second": 214.861, | |
| "eval_valid_steps_per_second": 6.836, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5544774050457444, | |
| "eval_valid_target_loss": 1.2820453643798828, | |
| "eval_valid_target_runtime": 6.5614, | |
| "eval_valid_target_samples_per_second": 218.55, | |
| "eval_valid_target_steps_per_second": 6.858, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5822012752980316, | |
| "grad_norm": 0.5302172303199768, | |
| "learning_rate": 9.916672866684275e-06, | |
| "loss": 1.2439, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6099251455503188, | |
| "grad_norm": 0.5439279675483704, | |
| "learning_rate": 9.908568899204281e-06, | |
| "loss": 1.2231, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.637649015802606, | |
| "grad_norm": 0.7026234865188599, | |
| "learning_rate": 9.90009255400514e-06, | |
| "loss": 1.2027, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6653728860548933, | |
| "grad_norm": 0.642803430557251, | |
| "learning_rate": 9.89124447412603e-06, | |
| "loss": 1.1864, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6930967563071805, | |
| "grad_norm": 1.3601601123809814, | |
| "learning_rate": 9.882025330806952e-06, | |
| "loss": 1.1654, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6930967563071805, | |
| "eval_valid_loss": 1.1063387393951416, | |
| "eval_valid_runtime": 6.4314, | |
| "eval_valid_samples_per_second": 215.037, | |
| "eval_valid_steps_per_second": 6.841, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6930967563071805, | |
| "eval_valid_target_loss": 1.208246111869812, | |
| "eval_valid_target_runtime": 6.5564, | |
| "eval_valid_target_samples_per_second": 218.719, | |
| "eval_valid_target_steps_per_second": 6.864, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7208206265594677, | |
| "grad_norm": 0.7053922414779663, | |
| "learning_rate": 9.872435823437816e-06, | |
| "loss": 1.1433, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.748544496811755, | |
| "grad_norm": 0.6601741909980774, | |
| "learning_rate": 9.862476679505384e-06, | |
| "loss": 1.1193, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7762683670640421, | |
| "grad_norm": 0.7706498503684998, | |
| "learning_rate": 9.852148654538072e-06, | |
| "loss": 1.0954, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8039922373163294, | |
| "grad_norm": 0.8355486392974854, | |
| "learning_rate": 9.841452532048648e-06, | |
| "loss": 1.069, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8317161075686166, | |
| "grad_norm": 0.8369494676589966, | |
| "learning_rate": 9.830389123474773e-06, | |
| "loss": 1.0384, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8317161075686166, | |
| "eval_valid_loss": 0.9615023732185364, | |
| "eval_valid_runtime": 6.4156, | |
| "eval_valid_samples_per_second": 215.57, | |
| "eval_valid_steps_per_second": 6.858, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8317161075686166, | |
| "eval_valid_target_loss": 1.0947415828704834, | |
| "eval_valid_target_runtime": 6.5753, | |
| "eval_valid_target_samples_per_second": 218.088, | |
| "eval_valid_target_steps_per_second": 6.844, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8594399778209038, | |
| "grad_norm": 1.4864110946655273, | |
| "learning_rate": 9.818959268117464e-06, | |
| "loss": 1.0103, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.887163848073191, | |
| "grad_norm": 0.7728907465934753, | |
| "learning_rate": 9.807163833077407e-06, | |
| "loss": 0.982, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9148877183254782, | |
| "grad_norm": 0.6881595253944397, | |
| "learning_rate": 9.795003713189187e-06, | |
| "loss": 0.9492, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9426115885777655, | |
| "grad_norm": 1.0222816467285156, | |
| "learning_rate": 9.782479830953388e-06, | |
| "loss": 0.9142, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.9703354588300527, | |
| "grad_norm": 0.6671555042266846, | |
| "learning_rate": 9.769593136466633e-06, | |
| "loss": 0.8838, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9703354588300527, | |
| "eval_valid_loss": 0.8037808537483215, | |
| "eval_valid_runtime": 6.4314, | |
| "eval_valid_samples_per_second": 215.038, | |
| "eval_valid_steps_per_second": 6.841, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9703354588300527, | |
| "eval_valid_target_loss": 0.9639121294021606, | |
| "eval_valid_target_runtime": 6.6053, | |
| "eval_valid_target_samples_per_second": 217.1, | |
| "eval_valid_target_steps_per_second": 6.813, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9980593290823399, | |
| "grad_norm": 0.7793981432914734, | |
| "learning_rate": 9.756344607349483e-06, | |
| "loss": 0.8496, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0257831993346271, | |
| "grad_norm": 0.7545821070671082, | |
| "learning_rate": 9.74273524867229e-06, | |
| "loss": 0.8117, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0535070695869144, | |
| "grad_norm": 0.631118893623352, | |
| "learning_rate": 9.728766092878934e-06, | |
| "loss": 0.7749, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.0812309398392015, | |
| "grad_norm": 0.7934292554855347, | |
| "learning_rate": 9.714438199708516e-06, | |
| "loss": 0.7321, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1089548100914888, | |
| "grad_norm": 0.6160613298416138, | |
| "learning_rate": 9.699752656114947e-06, | |
| "loss": 0.6891, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1089548100914888, | |
| "eval_valid_loss": 0.5853330492973328, | |
| "eval_valid_runtime": 6.4069, | |
| "eval_valid_samples_per_second": 215.861, | |
| "eval_valid_steps_per_second": 6.868, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1089548100914888, | |
| "eval_valid_target_loss": 0.7543638944625854, | |
| "eval_valid_target_runtime": 6.5591, | |
| "eval_valid_target_samples_per_second": 218.627, | |
| "eval_valid_target_steps_per_second": 6.861, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.136678680343776, | |
| "grad_norm": 0.4765689969062805, | |
| "learning_rate": 9.684710576184504e-06, | |
| "loss": 0.6383, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.1644025505960631, | |
| "grad_norm": 0.7610909938812256, | |
| "learning_rate": 9.669313101051295e-06, | |
| "loss": 0.5894, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.1921264208483504, | |
| "grad_norm": 0.5010733008384705, | |
| "learning_rate": 9.653561398810706e-06, | |
| "loss": 0.5446, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2198502911006377, | |
| "grad_norm": 0.6305666565895081, | |
| "learning_rate": 9.637456664430776e-06, | |
| "loss": 0.5097, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.247574161352925, | |
| "grad_norm": 0.8064519762992859, | |
| "learning_rate": 9.621000119661545e-06, | |
| "loss": 0.4678, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.247574161352925, | |
| "eval_valid_loss": 0.38276800513267517, | |
| "eval_valid_runtime": 6.4349, | |
| "eval_valid_samples_per_second": 214.922, | |
| "eval_valid_steps_per_second": 6.838, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.247574161352925, | |
| "eval_valid_target_loss": 0.4976137578487396, | |
| "eval_valid_target_runtime": 6.5738, | |
| "eval_valid_target_samples_per_second": 218.139, | |
| "eval_valid_target_steps_per_second": 6.845, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.275298031605212, | |
| "grad_norm": 0.49154090881347656, | |
| "learning_rate": 9.604193012942375e-06, | |
| "loss": 0.4326, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.3030219018574993, | |
| "grad_norm": 0.5592367053031921, | |
| "learning_rate": 9.587036619307226e-06, | |
| "loss": 0.4054, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.3307457721097866, | |
| "grad_norm": 0.48195400834083557, | |
| "learning_rate": 9.569532240287946e-06, | |
| "loss": 0.3828, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.3584696423620737, | |
| "grad_norm": 0.5364578366279602, | |
| "learning_rate": 9.551681203815517e-06, | |
| "loss": 0.3595, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.386193512614361, | |
| "grad_norm": 0.5409713387489319, | |
| "learning_rate": 9.533484864119327e-06, | |
| "loss": 0.3405, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.386193512614361, | |
| "eval_valid_loss": 0.2857649326324463, | |
| "eval_valid_runtime": 6.4118, | |
| "eval_valid_samples_per_second": 215.697, | |
| "eval_valid_steps_per_second": 6.862, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.386193512614361, | |
| "eval_valid_target_loss": 0.33146464824676514, | |
| "eval_valid_target_runtime": 6.5717, | |
| "eval_valid_target_samples_per_second": 218.209, | |
| "eval_valid_target_steps_per_second": 6.848, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4139173828666483, | |
| "grad_norm": 0.7294422388076782, | |
| "learning_rate": 9.514944601624427e-06, | |
| "loss": 0.328, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.4416412531189353, | |
| "grad_norm": 0.4695785343647003, | |
| "learning_rate": 9.49606182284681e-06, | |
| "loss": 0.3095, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.4693651233712226, | |
| "grad_norm": 0.5484552979469299, | |
| "learning_rate": 9.476837960286707e-06, | |
| "loss": 0.3016, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.49708899362351, | |
| "grad_norm": 0.38614729046821594, | |
| "learning_rate": 9.457274472319919e-06, | |
| "loss": 0.2875, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.524812863875797, | |
| "grad_norm": 0.3303731381893158, | |
| "learning_rate": 9.437372843087175e-06, | |
| "loss": 0.2821, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.524812863875797, | |
| "eval_valid_loss": 0.23669035732746124, | |
| "eval_valid_runtime": 6.4303, | |
| "eval_valid_samples_per_second": 215.074, | |
| "eval_valid_steps_per_second": 6.843, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.524812863875797, | |
| "eval_valid_target_loss": 0.2617432773113251, | |
| "eval_valid_target_runtime": 6.5556, | |
| "eval_valid_target_samples_per_second": 218.744, | |
| "eval_valid_target_steps_per_second": 6.864, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.5525367341280842, | |
| "grad_norm": 0.5144414305686951, | |
| "learning_rate": 9.417134582381548e-06, | |
| "loss": 0.2696, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.5802606043803715, | |
| "grad_norm": 0.5522892475128174, | |
| "learning_rate": 9.396561225533902e-06, | |
| "loss": 0.2617, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.6079844746326586, | |
| "grad_norm": 0.4152807295322418, | |
| "learning_rate": 9.37565433329644e-06, | |
| "loss": 0.2522, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.635708344884946, | |
| "grad_norm": 0.3866608142852783, | |
| "learning_rate": 9.35441549172428e-06, | |
| "loss": 0.2469, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.6634322151372332, | |
| "grad_norm": 0.3131564259529114, | |
| "learning_rate": 9.33284631205515e-06, | |
| "loss": 0.2425, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.6634322151372332, | |
| "eval_valid_loss": 0.20471729338169098, | |
| "eval_valid_runtime": 6.4284, | |
| "eval_valid_samples_per_second": 215.138, | |
| "eval_valid_steps_per_second": 6.845, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.6634322151372332, | |
| "eval_valid_target_loss": 0.2232024222612381, | |
| "eval_valid_target_runtime": 6.5873, | |
| "eval_valid_target_samples_per_second": 217.69, | |
| "eval_valid_target_steps_per_second": 6.831, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.6911560853895202, | |
| "grad_norm": 0.4385012090206146, | |
| "learning_rate": 9.31094843058714e-06, | |
| "loss": 0.2346, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.7188799556418077, | |
| "grad_norm": 0.3904290497303009, | |
| "learning_rate": 9.28872350855458e-06, | |
| "loss": 0.2279, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.7466038258940948, | |
| "grad_norm": 0.4294661581516266, | |
| "learning_rate": 9.266173232002005e-06, | |
| "loss": 0.2218, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.774327696146382, | |
| "grad_norm": 0.40256062150001526, | |
| "learning_rate": 9.243299311656253e-06, | |
| "loss": 0.2189, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.8020515663986694, | |
| "grad_norm": 0.39798569679260254, | |
| "learning_rate": 9.220103482796683e-06, | |
| "loss": 0.2154, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.8020515663986694, | |
| "eval_valid_loss": 0.18116505444049835, | |
| "eval_valid_runtime": 6.4306, | |
| "eval_valid_samples_per_second": 215.065, | |
| "eval_valid_steps_per_second": 6.842, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.8020515663986694, | |
| "eval_valid_target_loss": 0.19611063599586487, | |
| "eval_valid_target_runtime": 6.5521, | |
| "eval_valid_target_samples_per_second": 218.86, | |
| "eval_valid_target_steps_per_second": 6.868, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.8297754366509564, | |
| "grad_norm": 0.2555886507034302, | |
| "learning_rate": 9.196587505123526e-06, | |
| "loss": 0.2082, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.8574993069032437, | |
| "grad_norm": 0.278145968914032, | |
| "learning_rate": 9.172753162624401e-06, | |
| "loss": 0.2025, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.885223177155531, | |
| "grad_norm": 0.43592485785484314, | |
| "learning_rate": 9.148602263438967e-06, | |
| "loss": 0.2006, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.912947047407818, | |
| "grad_norm": 0.3828723430633545, | |
| "learning_rate": 9.124136639721757e-06, | |
| "loss": 0.1963, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.9406709176601054, | |
| "grad_norm": 0.3468044102191925, | |
| "learning_rate": 9.09935814750318e-06, | |
| "loss": 0.1928, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.9406709176601054, | |
| "eval_valid_loss": 0.16255635023117065, | |
| "eval_valid_runtime": 6.4262, | |
| "eval_valid_samples_per_second": 215.213, | |
| "eval_valid_steps_per_second": 6.847, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.9406709176601054, | |
| "eval_valid_target_loss": 0.17588204145431519, | |
| "eval_valid_target_runtime": 6.5759, | |
| "eval_valid_target_samples_per_second": 218.07, | |
| "eval_valid_target_steps_per_second": 6.843, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.9683947879123926, | |
| "grad_norm": 0.28793609142303467, | |
| "learning_rate": 9.074268666548728e-06, | |
| "loss": 0.1868, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.9961186581646797, | |
| "grad_norm": 0.4627343714237213, | |
| "learning_rate": 9.04887010021636e-06, | |
| "loss": 0.1857, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.023842528416967, | |
| "grad_norm": 0.4490989148616791, | |
| "learning_rate": 9.023164375312117e-06, | |
| "loss": 0.1786, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.0515663986692543, | |
| "grad_norm": 0.319859117269516, | |
| "learning_rate": 8.997153441943944e-06, | |
| "loss": 0.1779, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.0792902689215413, | |
| "grad_norm": 0.3379845917224884, | |
| "learning_rate": 8.970839273373748e-06, | |
| "loss": 0.1717, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.0792902689215413, | |
| "eval_valid_loss": 0.1455078125, | |
| "eval_valid_runtime": 6.4396, | |
| "eval_valid_samples_per_second": 214.766, | |
| "eval_valid_steps_per_second": 6.833, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.0792902689215413, | |
| "eval_valid_target_loss": 0.15758885443210602, | |
| "eval_valid_target_runtime": 6.5627, | |
| "eval_valid_target_samples_per_second": 218.508, | |
| "eval_valid_target_steps_per_second": 6.857, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.107014139173829, | |
| "grad_norm": 0.3079555928707123, | |
| "learning_rate": 8.944223865867712e-06, | |
| "loss": 0.1688, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.134738009426116, | |
| "grad_norm": 0.346603125333786, | |
| "learning_rate": 8.917309238544834e-06, | |
| "loss": 0.1661, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.162461879678403, | |
| "grad_norm": 0.3899448812007904, | |
| "learning_rate": 8.890097433223766e-06, | |
| "loss": 0.1653, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.1901857499306905, | |
| "grad_norm": 0.31352731585502625, | |
| "learning_rate": 8.862590514267915e-06, | |
| "loss": 0.1609, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.2179096201829775, | |
| "grad_norm": 0.29558128118515015, | |
| "learning_rate": 8.834790568428827e-06, | |
| "loss": 0.158, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.2179096201829775, | |
| "eval_valid_loss": 0.1319538652896881, | |
| "eval_valid_runtime": 6.417, | |
| "eval_valid_samples_per_second": 215.521, | |
| "eval_valid_steps_per_second": 6.857, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.2179096201829775, | |
| "eval_valid_target_loss": 0.1427442878484726, | |
| "eval_valid_target_runtime": 6.5854, | |
| "eval_valid_target_samples_per_second": 217.754, | |
| "eval_valid_target_steps_per_second": 6.833, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.2456334904352646, | |
| "grad_norm": 0.29061177372932434, | |
| "learning_rate": 8.80669970468788e-06, | |
| "loss": 0.1545, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.273357360687552, | |
| "grad_norm": 0.3253875970840454, | |
| "learning_rate": 8.778320054096306e-06, | |
| "loss": 0.1528, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.301081230939839, | |
| "grad_norm": 0.2402360886335373, | |
| "learning_rate": 8.749653769613502e-06, | |
| "loss": 0.1511, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.3288051011921262, | |
| "grad_norm": 0.31634458899497986, | |
| "learning_rate": 8.720703025943717e-06, | |
| "loss": 0.1461, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.3565289714444138, | |
| "grad_norm": 0.21685920655727386, | |
| "learning_rate": 8.691470019371065e-06, | |
| "loss": 0.143, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.3565289714444138, | |
| "eval_valid_loss": 0.12121625989675522, | |
| "eval_valid_runtime": 6.4171, | |
| "eval_valid_samples_per_second": 215.519, | |
| "eval_valid_steps_per_second": 6.857, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.3565289714444138, | |
| "eval_valid_target_loss": 0.1312141716480255, | |
| "eval_valid_target_runtime": 6.57, | |
| "eval_valid_target_samples_per_second": 218.266, | |
| "eval_valid_target_steps_per_second": 6.849, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.384252841696701, | |
| "grad_norm": 0.24635937809944153, | |
| "learning_rate": 8.661956967592907e-06, | |
| "loss": 0.1424, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.411976711948988, | |
| "grad_norm": 0.21958141028881073, | |
| "learning_rate": 8.632166109551623e-06, | |
| "loss": 0.1388, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.4397005822012754, | |
| "grad_norm": 0.2693657875061035, | |
| "learning_rate": 8.60209970526474e-06, | |
| "loss": 0.1392, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.4674244524535625, | |
| "grad_norm": 0.22512082755565643, | |
| "learning_rate": 8.5717600356535e-06, | |
| "loss": 0.1356, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.49514832270585, | |
| "grad_norm": 0.3446211516857147, | |
| "learning_rate": 8.541149402369806e-06, | |
| "loss": 0.1324, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.49514832270585, | |
| "eval_valid_loss": 0.11042323708534241, | |
| "eval_valid_runtime": 6.4273, | |
| "eval_valid_samples_per_second": 215.176, | |
| "eval_valid_steps_per_second": 6.846, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.49514832270585, | |
| "eval_valid_target_loss": 0.11918216943740845, | |
| "eval_valid_target_runtime": 6.5885, | |
| "eval_valid_target_samples_per_second": 217.651, | |
| "eval_valid_target_steps_per_second": 6.83, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.522872192958137, | |
| "grad_norm": 0.21913643181324005, | |
| "learning_rate": 8.51027012762163e-06, | |
| "loss": 0.1303, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.550596063210424, | |
| "grad_norm": 0.24243904650211334, | |
| "learning_rate": 8.479124553996824e-06, | |
| "loss": 0.1268, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.578319933462711, | |
| "grad_norm": 0.22184187173843384, | |
| "learning_rate": 8.447715044285425e-06, | |
| "loss": 0.1251, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.6060438037149987, | |
| "grad_norm": 0.22888724505901337, | |
| "learning_rate": 8.41604398130039e-06, | |
| "loss": 0.1221, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.6337676739672857, | |
| "grad_norm": 0.24152572453022003, | |
| "learning_rate": 8.384113767696838e-06, | |
| "loss": 0.121, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.6337676739672857, | |
| "eval_valid_loss": 0.10074004530906677, | |
| "eval_valid_runtime": 6.4317, | |
| "eval_valid_samples_per_second": 215.03, | |
| "eval_valid_steps_per_second": 6.841, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.6337676739672857, | |
| "eval_valid_target_loss": 0.10891123861074448, | |
| "eval_valid_target_runtime": 6.5593, | |
| "eval_valid_target_samples_per_second": 218.622, | |
| "eval_valid_target_steps_per_second": 6.861, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.6614915442195732, | |
| "grad_norm": 0.2756216526031494, | |
| "learning_rate": 8.35192682578978e-06, | |
| "loss": 0.1195, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.6892154144718603, | |
| "grad_norm": 0.24438254535198212, | |
| "learning_rate": 8.319485597370348e-06, | |
| "loss": 0.1157, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.7169392847241474, | |
| "grad_norm": 0.35991132259368896, | |
| "learning_rate": 8.286792543520556e-06, | |
| "loss": 0.115, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.744663154976435, | |
| "grad_norm": 0.22763152420520782, | |
| "learning_rate": 8.253850144426606e-06, | |
| "loss": 0.1134, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.772387025228722, | |
| "grad_norm": 0.24357567727565765, | |
| "learning_rate": 8.220660899190712e-06, | |
| "loss": 0.1106, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.772387025228722, | |
| "eval_valid_loss": 0.092686228454113, | |
| "eval_valid_runtime": 6.4287, | |
| "eval_valid_samples_per_second": 215.129, | |
| "eval_valid_steps_per_second": 6.844, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.772387025228722, | |
| "eval_valid_target_loss": 0.1005280539393425, | |
| "eval_valid_target_runtime": 6.5902, | |
| "eval_valid_target_samples_per_second": 217.596, | |
| "eval_valid_target_steps_per_second": 6.828, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.800110895481009, | |
| "grad_norm": 0.20446299016475677, | |
| "learning_rate": 8.187227325641534e-06, | |
| "loss": 0.109, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.8278347657332965, | |
| "grad_norm": 0.24309873580932617, | |
| "learning_rate": 8.153551960143157e-06, | |
| "loss": 0.1087, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.8555586359855836, | |
| "grad_norm": 0.21243679523468018, | |
| "learning_rate": 8.119637357402676e-06, | |
| "loss": 0.1063, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 2.8832825062378706, | |
| "grad_norm": 0.2227753847837448, | |
| "learning_rate": 8.085486090276391e-06, | |
| "loss": 0.1057, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 2.911006376490158, | |
| "grad_norm": 0.1933346837759018, | |
| "learning_rate": 8.05110074957462e-06, | |
| "loss": 0.1037, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.911006376490158, | |
| "eval_valid_loss": 0.08755628019571304, | |
| "eval_valid_runtime": 6.4374, | |
| "eval_valid_samples_per_second": 214.84, | |
| "eval_valid_steps_per_second": 6.835, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.911006376490158, | |
| "eval_valid_target_loss": 0.09479602426290512, | |
| "eval_valid_target_runtime": 6.5624, | |
| "eval_valid_target_samples_per_second": 218.517, | |
| "eval_valid_target_steps_per_second": 6.857, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.938730246742445, | |
| "grad_norm": 0.24507193267345428, | |
| "learning_rate": 8.016483943865158e-06, | |
| "loss": 0.1026, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 2.9664541169947327, | |
| "grad_norm": 0.16903254389762878, | |
| "learning_rate": 7.98163829927538e-06, | |
| "loss": 0.1019, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 2.99417798724702, | |
| "grad_norm": 0.21406187117099762, | |
| "learning_rate": 7.946566459293014e-06, | |
| "loss": 0.1016, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.021901857499307, | |
| "grad_norm": 0.17749078571796417, | |
| "learning_rate": 7.911271084565603e-06, | |
| "loss": 0.0988, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.049625727751594, | |
| "grad_norm": 0.2052767425775528, | |
| "learning_rate": 7.875754852698658e-06, | |
| "loss": 0.099, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.049625727751594, | |
| "eval_valid_loss": 0.08359777182340622, | |
| "eval_valid_runtime": 6.4134, | |
| "eval_valid_samples_per_second": 215.643, | |
| "eval_valid_steps_per_second": 6.861, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.049625727751594, | |
| "eval_valid_target_loss": 0.09044167399406433, | |
| "eval_valid_target_runtime": 6.5678, | |
| "eval_valid_target_samples_per_second": 218.336, | |
| "eval_valid_target_steps_per_second": 6.852, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.0773495980038814, | |
| "grad_norm": 0.20621031522750854, | |
| "learning_rate": 7.840020458052529e-06, | |
| "loss": 0.0961, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.1050734682561685, | |
| "grad_norm": 0.18608888983726501, | |
| "learning_rate": 7.804070611538001e-06, | |
| "loss": 0.0964, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.132797338508456, | |
| "grad_norm": 0.14550629258155823, | |
| "learning_rate": 7.767908040410642e-06, | |
| "loss": 0.0957, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.160521208760743, | |
| "grad_norm": 0.21664443612098694, | |
| "learning_rate": 7.731535488063895e-06, | |
| "loss": 0.0948, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.18824507901303, | |
| "grad_norm": 0.17702756822109222, | |
| "learning_rate": 7.694955713820974e-06, | |
| "loss": 0.0935, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.18824507901303, | |
| "eval_valid_loss": 0.07985392957925797, | |
| "eval_valid_runtime": 6.4194, | |
| "eval_valid_samples_per_second": 215.442, | |
| "eval_valid_steps_per_second": 6.854, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.18824507901303, | |
| "eval_valid_target_loss": 0.08640262484550476, | |
| "eval_valid_target_runtime": 6.5608, | |
| "eval_valid_target_samples_per_second": 218.572, | |
| "eval_valid_target_steps_per_second": 6.859, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.2159689492653176, | |
| "grad_norm": 0.19913919270038605, | |
| "learning_rate": 7.658171492725513e-06, | |
| "loss": 0.0936, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.2436928195176047, | |
| "grad_norm": 0.18789726495742798, | |
| "learning_rate": 7.621185615331061e-06, | |
| "loss": 0.0924, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.2714166897698918, | |
| "grad_norm": 0.18376338481903076, | |
| "learning_rate": 7.584000887489373e-06, | |
| "loss": 0.0911, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.2991405600221793, | |
| "grad_norm": 0.19736219942569733, | |
| "learning_rate": 7.546620130137557e-06, | |
| "loss": 0.0912, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.3268644302744663, | |
| "grad_norm": 0.19527922570705414, | |
| "learning_rate": 7.509046179084061e-06, | |
| "loss": 0.0912, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.3268644302744663, | |
| "eval_valid_loss": 0.07622889429330826, | |
| "eval_valid_runtime": 6.4437, | |
| "eval_valid_samples_per_second": 214.627, | |
| "eval_valid_steps_per_second": 6.828, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.3268644302744663, | |
| "eval_valid_target_loss": 0.0823676660656929, | |
| "eval_valid_target_runtime": 6.5589, | |
| "eval_valid_target_samples_per_second": 218.635, | |
| "eval_valid_target_steps_per_second": 6.861, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.3545883005267534, | |
| "grad_norm": 0.18916228413581848, | |
| "learning_rate": 7.471281884793544e-06, | |
| "loss": 0.0896, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.382312170779041, | |
| "grad_norm": 0.1649465262889862, | |
| "learning_rate": 7.4333301121706445e-06, | |
| "loss": 0.0881, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.410036041031328, | |
| "grad_norm": 0.18362993001937866, | |
| "learning_rate": 7.3951937403426186e-06, | |
| "loss": 0.0892, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.437759911283615, | |
| "grad_norm": 0.19268861413002014, | |
| "learning_rate": 7.356875662440939e-06, | |
| "loss": 0.0879, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.4654837815359025, | |
| "grad_norm": 0.17124581336975098, | |
| "learning_rate": 7.318378785381802e-06, | |
| "loss": 0.086, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.4654837815359025, | |
| "eval_valid_loss": 0.07317828387022018, | |
| "eval_valid_runtime": 6.4273, | |
| "eval_valid_samples_per_second": 215.177, | |
| "eval_valid_steps_per_second": 6.846, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.4654837815359025, | |
| "eval_valid_target_loss": 0.07900213450193405, | |
| "eval_valid_target_runtime": 6.5852, | |
| "eval_valid_target_samples_per_second": 217.76, | |
| "eval_valid_target_steps_per_second": 6.833, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.4932076517881896, | |
| "grad_norm": 0.23004941642284393, | |
| "learning_rate": 7.279706029645615e-06, | |
| "loss": 0.0855, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.5209315220404767, | |
| "grad_norm": 0.16131635010242462, | |
| "learning_rate": 7.240860329055422e-06, | |
| "loss": 0.0848, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.548655392292764, | |
| "grad_norm": 0.19867731630802155, | |
| "learning_rate": 7.201844630554353e-06, | |
| "loss": 0.0851, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.5763792625450512, | |
| "grad_norm": 0.17405714094638824, | |
| "learning_rate": 7.162661893982052e-06, | |
| "loss": 0.0839, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.6041031327973387, | |
| "grad_norm": 0.19404906034469604, | |
| "learning_rate": 7.123315091850136e-06, | |
| "loss": 0.0839, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.6041031327973387, | |
| "eval_valid_loss": 0.07132507115602493, | |
| "eval_valid_runtime": 6.4118, | |
| "eval_valid_samples_per_second": 215.695, | |
| "eval_valid_steps_per_second": 6.862, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.6041031327973387, | |
| "eval_valid_target_loss": 0.0771123468875885, | |
| "eval_valid_target_runtime": 6.5745, | |
| "eval_valid_target_samples_per_second": 218.117, | |
| "eval_valid_target_steps_per_second": 6.845, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.631827003049626, | |
| "grad_norm": 0.15152141451835632, | |
| "learning_rate": 7.083807209116689e-06, | |
| "loss": 0.0836, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.659550873301913, | |
| "grad_norm": 0.18368007242679596, | |
| "learning_rate": 7.044141242959826e-06, | |
| "loss": 0.0827, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.6872747435542, | |
| "grad_norm": 0.18081355094909668, | |
| "learning_rate": 7.004320202550303e-06, | |
| "loss": 0.0823, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.7149986138064874, | |
| "grad_norm": 0.15222586691379547, | |
| "learning_rate": 6.9643471088232506e-06, | |
| "loss": 0.0801, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.7427224840587745, | |
| "grad_norm": 0.1571241021156311, | |
| "learning_rate": 6.9242249942489755e-06, | |
| "loss": 0.0807, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.7427224840587745, | |
| "eval_valid_loss": 0.06911951303482056, | |
| "eval_valid_runtime": 6.4701, | |
| "eval_valid_samples_per_second": 213.752, | |
| "eval_valid_steps_per_second": 6.8, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.7427224840587745, | |
| "eval_valid_target_loss": 0.07482416182756424, | |
| "eval_valid_target_runtime": 6.5611, | |
| "eval_valid_target_samples_per_second": 218.56, | |
| "eval_valid_target_steps_per_second": 6.859, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.770446354311062, | |
| "grad_norm": 0.1546078324317932, | |
| "learning_rate": 6.883956902602933e-06, | |
| "loss": 0.0811, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.798170224563349, | |
| "grad_norm": 0.1428447812795639, | |
| "learning_rate": 6.843545888734801e-06, | |
| "loss": 0.0795, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 3.825894094815636, | |
| "grad_norm": 0.1369272619485855, | |
| "learning_rate": 6.802995018336736e-06, | |
| "loss": 0.0794, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 3.8536179650679236, | |
| "grad_norm": 0.1972970962524414, | |
| "learning_rate": 6.762307367710797e-06, | |
| "loss": 0.0785, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 3.8813418353202107, | |
| "grad_norm": 0.15961000323295593, | |
| "learning_rate": 6.721486023535577e-06, | |
| "loss": 0.0787, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.8813418353202107, | |
| "eval_valid_loss": 0.06712613999843597, | |
| "eval_valid_runtime": 6.4106, | |
| "eval_valid_samples_per_second": 215.737, | |
| "eval_valid_steps_per_second": 6.864, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.8813418353202107, | |
| "eval_valid_target_loss": 0.07271508872509003, | |
| "eval_valid_target_runtime": 6.5891, | |
| "eval_valid_target_samples_per_second": 217.633, | |
| "eval_valid_target_steps_per_second": 6.829, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.9090657055724978, | |
| "grad_norm": 0.15836742520332336, | |
| "learning_rate": 6.680534082632036e-06, | |
| "loss": 0.0779, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 3.9367895758247853, | |
| "grad_norm": 0.1906501203775406, | |
| "learning_rate": 6.639454651728561e-06, | |
| "loss": 0.0772, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 3.9645134460770723, | |
| "grad_norm": 0.1872212439775467, | |
| "learning_rate": 6.598250847225286e-06, | |
| "loss": 0.0772, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 3.9922373163293594, | |
| "grad_norm": 0.1689438670873642, | |
| "learning_rate": 6.556925794957678e-06, | |
| "loss": 0.0769, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.0199611865816465, | |
| "grad_norm": 0.1830626279115677, | |
| "learning_rate": 6.515482629959392e-06, | |
| "loss": 0.0764, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.0199611865816465, | |
| "eval_valid_loss": 0.0653899684548378, | |
| "eval_valid_runtime": 6.4271, | |
| "eval_valid_samples_per_second": 215.181, | |
| "eval_valid_steps_per_second": 6.846, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.0199611865816465, | |
| "eval_valid_target_loss": 0.0708317682147026, | |
| "eval_valid_target_runtime": 6.5574, | |
| "eval_valid_target_samples_per_second": 218.684, | |
| "eval_valid_target_steps_per_second": 6.862, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.047685056833934, | |
| "grad_norm": 0.1517285257577896, | |
| "learning_rate": 6.473924496224447e-06, | |
| "loss": 0.0757, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.0754089270862215, | |
| "grad_norm": 0.15981799364089966, | |
| "learning_rate": 6.432254546468708e-06, | |
| "loss": 0.0751, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.1031327973385086, | |
| "grad_norm": 0.14974670112133026, | |
| "learning_rate": 6.3904759418907194e-06, | |
| "loss": 0.0755, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.130856667590796, | |
| "grad_norm": 0.15918827056884766, | |
| "learning_rate": 6.348591851931879e-06, | |
| "loss": 0.0743, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.158580537843083, | |
| "grad_norm": 0.17248332500457764, | |
| "learning_rate": 6.306605454036001e-06, | |
| "loss": 0.0747, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.158580537843083, | |
| "eval_valid_loss": 0.06470626592636108, | |
| "eval_valid_runtime": 6.4429, | |
| "eval_valid_samples_per_second": 214.654, | |
| "eval_valid_steps_per_second": 6.829, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.158580537843083, | |
| "eval_valid_target_loss": 0.07004554569721222, | |
| "eval_valid_target_runtime": 6.5941, | |
| "eval_valid_target_samples_per_second": 217.468, | |
| "eval_valid_target_steps_per_second": 6.824, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.18630440809537, | |
| "grad_norm": 0.18200209736824036, | |
| "learning_rate": 6.2645199334082674e-06, | |
| "loss": 0.0735, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.214028278347658, | |
| "grad_norm": 0.12851852178573608, | |
| "learning_rate": 6.222338482773584e-06, | |
| "loss": 0.0736, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.241752148599945, | |
| "grad_norm": 0.15132804214954376, | |
| "learning_rate": 6.180064302134374e-06, | |
| "loss": 0.0738, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.269476018852232, | |
| "grad_norm": 0.15047667920589447, | |
| "learning_rate": 6.1377005985278205e-06, | |
| "loss": 0.073, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.297199889104519, | |
| "grad_norm": 0.19985252618789673, | |
| "learning_rate": 6.095250585782562e-06, | |
| "loss": 0.0732, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.297199889104519, | |
| "eval_valid_loss": 0.062382254749536514, | |
| "eval_valid_runtime": 6.4347, | |
| "eval_valid_samples_per_second": 214.927, | |
| "eval_valid_steps_per_second": 6.838, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.297199889104519, | |
| "eval_valid_target_loss": 0.06759324669837952, | |
| "eval_valid_target_runtime": 6.5646, | |
| "eval_valid_target_samples_per_second": 218.446, | |
| "eval_valid_target_steps_per_second": 6.855, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.324923759356806, | |
| "grad_norm": 0.16384641826152802, | |
| "learning_rate": 6.0527174842748994e-06, | |
| "loss": 0.0716, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.352647629609093, | |
| "grad_norm": 0.14244656264781952, | |
| "learning_rate": 6.0101045206844676e-06, | |
| "loss": 0.0716, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.380371499861381, | |
| "grad_norm": 0.16209416091442108, | |
| "learning_rate": 5.9674149277494694e-06, | |
| "loss": 0.0714, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.408095370113668, | |
| "grad_norm": 0.17041273415088654, | |
| "learning_rate": 5.92465194402142e-06, | |
| "loss": 0.0715, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.435819240365955, | |
| "grad_norm": 0.16730940341949463, | |
| "learning_rate": 5.881818813619463e-06, | |
| "loss": 0.0714, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.435819240365955, | |
| "eval_valid_loss": 0.061134014278650284, | |
| "eval_valid_runtime": 6.4104, | |
| "eval_valid_samples_per_second": 215.742, | |
| "eval_valid_steps_per_second": 6.864, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.435819240365955, | |
| "eval_valid_target_loss": 0.06638547778129578, | |
| "eval_valid_target_runtime": 6.5651, | |
| "eval_valid_target_samples_per_second": 218.427, | |
| "eval_valid_target_steps_per_second": 6.854, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.463543110618242, | |
| "grad_norm": 0.13161396980285645, | |
| "learning_rate": 5.8389187859842675e-06, | |
| "loss": 0.0703, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.491266980870529, | |
| "grad_norm": 0.13423210382461548, | |
| "learning_rate": 5.7959551156315156e-06, | |
| "loss": 0.0707, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.518990851122817, | |
| "grad_norm": 0.20051045715808868, | |
| "learning_rate": 5.752931061904994e-06, | |
| "loss": 0.0699, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.546714721375104, | |
| "grad_norm": 0.15945318341255188, | |
| "learning_rate": 5.709849888729351e-06, | |
| "loss": 0.0697, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.574438591627391, | |
| "grad_norm": 0.13749030232429504, | |
| "learning_rate": 5.666714864362468e-06, | |
| "loss": 0.0704, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.574438591627391, | |
| "eval_valid_loss": 0.06001834571361542, | |
| "eval_valid_runtime": 6.4467, | |
| "eval_valid_samples_per_second": 214.529, | |
| "eval_valid_steps_per_second": 6.825, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.574438591627391, | |
| "eval_valid_target_loss": 0.06535307317972183, | |
| "eval_valid_target_runtime": 6.5686, | |
| "eval_valid_target_samples_per_second": 218.311, | |
| "eval_valid_target_steps_per_second": 6.851, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.602162461879678, | |
| "grad_norm": 0.133077010512352, | |
| "learning_rate": 5.6235292611475326e-06, | |
| "loss": 0.0693, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.629886332131965, | |
| "grad_norm": 0.1508035957813263, | |
| "learning_rate": 5.580296355264783e-06, | |
| "loss": 0.069, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.6576102023842525, | |
| "grad_norm": 0.14195485413074493, | |
| "learning_rate": 5.537019426482966e-06, | |
| "loss": 0.0695, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.6853340726365404, | |
| "grad_norm": 0.16586261987686157, | |
| "learning_rate": 5.493701757910536e-06, | |
| "loss": 0.0684, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.7130579428888275, | |
| "grad_norm": 0.13865657150745392, | |
| "learning_rate": 5.4503466357465765e-06, | |
| "loss": 0.0682, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.7130579428888275, | |
| "eval_valid_loss": 0.0584811232984066, | |
| "eval_valid_runtime": 6.422, | |
| "eval_valid_samples_per_second": 215.352, | |
| "eval_valid_steps_per_second": 6.851, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.7130579428888275, | |
| "eval_valid_target_loss": 0.06370435655117035, | |
| "eval_valid_target_runtime": 6.5705, | |
| "eval_valid_target_samples_per_second": 218.247, | |
| "eval_valid_target_steps_per_second": 6.849, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.740781813141115, | |
| "grad_norm": 0.1934811919927597, | |
| "learning_rate": 5.406957349031504e-06, | |
| "loss": 0.0686, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.768505683393402, | |
| "grad_norm": 0.16662567853927612, | |
| "learning_rate": 5.363537189397556e-06, | |
| "loss": 0.0682, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 4.796229553645689, | |
| "grad_norm": 0.15507076680660248, | |
| "learning_rate": 5.320089450819075e-06, | |
| "loss": 0.0673, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 4.823953423897976, | |
| "grad_norm": 0.12763585150241852, | |
| "learning_rate": 5.276617429362616e-06, | |
| "loss": 0.0671, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 4.851677294150264, | |
| "grad_norm": 0.15640078485012054, | |
| "learning_rate": 5.233124422936906e-06, | |
| "loss": 0.0669, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 4.851677294150264, | |
| "eval_valid_loss": 0.05754322186112404, | |
| "eval_valid_runtime": 6.4388, | |
| "eval_valid_samples_per_second": 214.792, | |
| "eval_valid_steps_per_second": 6.834, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 4.851677294150264, | |
| "eval_valid_target_loss": 0.06262939423322678, | |
| "eval_valid_target_runtime": 6.5536, | |
| "eval_valid_target_samples_per_second": 218.81, | |
| "eval_valid_target_steps_per_second": 6.866, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 4.879401164402551, | |
| "grad_norm": 0.16545389592647552, | |
| "learning_rate": 5.189613731042645e-06, | |
| "loss": 0.0663, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 4.907125034654838, | |
| "grad_norm": 0.17085812985897064, | |
| "learning_rate": 5.146088654522208e-06, | |
| "loss": 0.0657, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 4.934848904907125, | |
| "grad_norm": 0.14638109505176544, | |
| "learning_rate": 5.102552495309222e-06, | |
| "loss": 0.0677, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 4.962572775159412, | |
| "grad_norm": 0.15568013489246368, | |
| "learning_rate": 5.059008556178079e-06, | |
| "loss": 0.0657, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 4.9902966454117, | |
| "grad_norm": 0.16898399591445923, | |
| "learning_rate": 5.015460140493381e-06, | |
| "loss": 0.0661, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.9902966454117, | |
| "eval_valid_loss": 0.05648580938577652, | |
| "eval_valid_runtime": 6.4207, | |
| "eval_valid_samples_per_second": 215.397, | |
| "eval_valid_steps_per_second": 6.853, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.9902966454117, | |
| "eval_valid_target_loss": 0.06151015684008598, | |
| "eval_valid_target_runtime": 6.5952, | |
| "eval_valid_target_samples_per_second": 217.432, | |
| "eval_valid_target_steps_per_second": 6.823, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.018020515663987, | |
| "grad_norm": 0.13535688817501068, | |
| "learning_rate": 4.971910551959332e-06, | |
| "loss": 0.0654, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.045744385916274, | |
| "grad_norm": 0.16001687943935394, | |
| "learning_rate": 4.928363094369108e-06, | |
| "loss": 0.0656, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.073468256168561, | |
| "grad_norm": 0.1575719267129898, | |
| "learning_rate": 4.88482107135423e-06, | |
| "loss": 0.0641, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.101192126420848, | |
| "grad_norm": 0.1607745736837387, | |
| "learning_rate": 4.841287786133937e-06, | |
| "loss": 0.0642, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.128915996673135, | |
| "grad_norm": 0.13689269125461578, | |
| "learning_rate": 4.797766541264592e-06, | |
| "loss": 0.0646, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.128915996673135, | |
| "eval_valid_loss": 0.05563423037528992, | |
| "eval_valid_runtime": 6.4248, | |
| "eval_valid_samples_per_second": 215.261, | |
| "eval_valid_steps_per_second": 6.849, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.128915996673135, | |
| "eval_valid_target_loss": 0.06068035215139389, | |
| "eval_valid_target_runtime": 6.561, | |
| "eval_valid_target_samples_per_second": 218.566, | |
| "eval_valid_target_steps_per_second": 6.859, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.156639866925423, | |
| "grad_norm": 0.13576319813728333, | |
| "learning_rate": 4.754260638389145e-06, | |
| "loss": 0.0641, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.18436373717771, | |
| "grad_norm": 0.13574448227882385, | |
| "learning_rate": 4.710773377986659e-06, | |
| "loss": 0.0643, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.212087607429997, | |
| "grad_norm": 0.11536768078804016, | |
| "learning_rate": 4.667308059121928e-06, | |
| "loss": 0.064, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.239811477682284, | |
| "grad_norm": 0.1470881700515747, | |
| "learning_rate": 4.623867979195196e-06, | |
| "loss": 0.0637, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.2675353479345715, | |
| "grad_norm": 0.13156047463417053, | |
| "learning_rate": 4.580456433692017e-06, | |
| "loss": 0.0635, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.2675353479345715, | |
| "eval_valid_loss": 0.05473410338163376, | |
| "eval_valid_runtime": 6.4623, | |
| "eval_valid_samples_per_second": 214.012, | |
| "eval_valid_steps_per_second": 6.809, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.2675353479345715, | |
| "eval_valid_target_loss": 0.05973204970359802, | |
| "eval_valid_target_runtime": 6.5636, | |
| "eval_valid_target_samples_per_second": 218.477, | |
| "eval_valid_target_steps_per_second": 6.856, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.2952592181868585, | |
| "grad_norm": 0.132376030087471, | |
| "learning_rate": 4.537076715933242e-06, | |
| "loss": 0.0638, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.3229830884391465, | |
| "grad_norm": 0.14191821217536926, | |
| "learning_rate": 4.493732116825174e-06, | |
| "loss": 0.064, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.3507069586914335, | |
| "grad_norm": 0.1247839480638504, | |
| "learning_rate": 4.45042592460993e-06, | |
| "loss": 0.0627, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.378430828943721, | |
| "grad_norm": 0.12980355322360992, | |
| "learning_rate": 4.4071614246159596e-06, | |
| "loss": 0.0632, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.406154699196008, | |
| "grad_norm": 0.1391134262084961, | |
| "learning_rate": 4.363941899008833e-06, | |
| "loss": 0.0625, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.406154699196008, | |
| "eval_valid_loss": 0.05415208637714386, | |
| "eval_valid_runtime": 6.4065, | |
| "eval_valid_samples_per_second": 215.873, | |
| "eval_valid_steps_per_second": 6.868, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.406154699196008, | |
| "eval_valid_target_loss": 0.05894719064235687, | |
| "eval_valid_target_runtime": 6.569, | |
| "eval_valid_target_samples_per_second": 218.299, | |
| "eval_valid_target_steps_per_second": 6.85, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.433878569448295, | |
| "grad_norm": 0.2045671045780182, | |
| "learning_rate": 4.320770626542238e-06, | |
| "loss": 0.0629, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.461602439700582, | |
| "grad_norm": 0.1417771577835083, | |
| "learning_rate": 4.277650882309238e-06, | |
| "loss": 0.0625, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.48932630995287, | |
| "grad_norm": 0.14284995198249817, | |
| "learning_rate": 4.234585937493829e-06, | |
| "loss": 0.0623, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.517050180205157, | |
| "grad_norm": 0.1546027809381485, | |
| "learning_rate": 4.1915790591227615e-06, | |
| "loss": 0.0625, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.544774050457444, | |
| "grad_norm": 0.1454819142818451, | |
| "learning_rate": 4.148633509817715e-06, | |
| "loss": 0.0613, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.544774050457444, | |
| "eval_valid_loss": 0.05364985764026642, | |
| "eval_valid_runtime": 6.436, | |
| "eval_valid_samples_per_second": 214.885, | |
| "eval_valid_steps_per_second": 6.837, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.544774050457444, | |
| "eval_valid_target_loss": 0.05850011110305786, | |
| "eval_valid_target_runtime": 6.5534, | |
| "eval_valid_target_samples_per_second": 218.819, | |
| "eval_valid_target_steps_per_second": 6.867, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.572497920709731, | |
| "grad_norm": 0.12440012395381927, | |
| "learning_rate": 4.105752547547764e-06, | |
| "loss": 0.0613, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.600221790962018, | |
| "grad_norm": 0.14089658856391907, | |
| "learning_rate": 4.062939425382236e-06, | |
| "loss": 0.0616, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.627945661214305, | |
| "grad_norm": 0.24770374596118927, | |
| "learning_rate": 4.020197391243922e-06, | |
| "loss": 0.0621, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.655669531466593, | |
| "grad_norm": 0.11835476011037827, | |
| "learning_rate": 3.977529687662671e-06, | |
| "loss": 0.0619, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.68339340171888, | |
| "grad_norm": 0.12585273385047913, | |
| "learning_rate": 3.93493955152941e-06, | |
| "loss": 0.0612, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.68339340171888, | |
| "eval_valid_loss": 0.05319705978035927, | |
| "eval_valid_runtime": 6.4196, | |
| "eval_valid_samples_per_second": 215.435, | |
| "eval_valid_steps_per_second": 6.854, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.68339340171888, | |
| "eval_valid_target_loss": 0.058061882853507996, | |
| "eval_valid_target_runtime": 6.5894, | |
| "eval_valid_target_samples_per_second": 217.622, | |
| "eval_valid_target_steps_per_second": 6.829, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.711117271971167, | |
| "grad_norm": 0.15103484690189362, | |
| "learning_rate": 3.892430213850587e-06, | |
| "loss": 0.0615, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 5.738841142223454, | |
| "grad_norm": 0.1266421228647232, | |
| "learning_rate": 3.850004899503051e-06, | |
| "loss": 0.0613, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 5.766565012475741, | |
| "grad_norm": 0.1100655049085617, | |
| "learning_rate": 3.8076668269894045e-06, | |
| "loss": 0.0606, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 5.794288882728029, | |
| "grad_norm": 0.1395365446805954, | |
| "learning_rate": 3.765419208193848e-06, | |
| "loss": 0.0614, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 5.822012752980316, | |
| "grad_norm": 0.12668344378471375, | |
| "learning_rate": 3.723265248138506e-06, | |
| "loss": 0.0614, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 5.822012752980316, | |
| "eval_valid_loss": 0.052489351481199265, | |
| "eval_valid_runtime": 6.4455, | |
| "eval_valid_samples_per_second": 214.567, | |
| "eval_valid_steps_per_second": 6.826, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 5.822012752980316, | |
| "eval_valid_target_loss": 0.057213690131902695, | |
| "eval_valid_target_runtime": 6.5546, | |
| "eval_valid_target_samples_per_second": 218.777, | |
| "eval_valid_target_steps_per_second": 6.865, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 5.849736623232603, | |
| "grad_norm": 0.12728376686573029, | |
| "learning_rate": 3.681208144740291e-06, | |
| "loss": 0.0612, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 5.87746049348489, | |
| "grad_norm": 0.14501620829105377, | |
| "learning_rate": 3.6392510885682965e-06, | |
| "loss": 0.0601, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 5.9051843637371775, | |
| "grad_norm": 0.1082565188407898, | |
| "learning_rate": 3.5973972626017594e-06, | |
| "loss": 0.0608, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 5.9329082339894645, | |
| "grad_norm": 0.14926603436470032, | |
| "learning_rate": 3.5556498419885867e-06, | |
| "loss": 0.0603, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 5.9606321042417525, | |
| "grad_norm": 0.1263745278120041, | |
| "learning_rate": 3.514011993804469e-06, | |
| "loss": 0.0602, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 5.9606321042417525, | |
| "eval_valid_loss": 0.05212084576487541, | |
| "eval_valid_runtime": 6.439, | |
| "eval_valid_samples_per_second": 214.785, | |
| "eval_valid_steps_per_second": 6.833, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 5.9606321042417525, | |
| "eval_valid_target_loss": 0.05688408389687538, | |
| "eval_valid_target_runtime": 6.5822, | |
| "eval_valid_target_samples_per_second": 217.862, | |
| "eval_valid_target_steps_per_second": 6.837, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 5.98835597449404, | |
| "grad_norm": 0.1368781179189682, | |
| "learning_rate": 3.4724868768126384e-06, | |
| "loss": 0.0604, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.016079844746327, | |
| "grad_norm": 0.15087148547172546, | |
| "learning_rate": 3.4310776412242195e-06, | |
| "loss": 0.06, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.043803714998614, | |
| "grad_norm": 0.11400382220745087, | |
| "learning_rate": 3.3897874284592467e-06, | |
| "loss": 0.0594, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.071527585250901, | |
| "grad_norm": 0.1169167011976242, | |
| "learning_rate": 3.348619370908361e-06, | |
| "loss": 0.0598, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.099251455503188, | |
| "grad_norm": 0.12172160297632217, | |
| "learning_rate": 3.3075765916951576e-06, | |
| "loss": 0.0599, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.099251455503188, | |
| "eval_valid_loss": 0.05157113075256348, | |
| "eval_valid_runtime": 6.4258, | |
| "eval_valid_samples_per_second": 215.224, | |
| "eval_valid_steps_per_second": 6.847, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.099251455503188, | |
| "eval_valid_target_loss": 0.056347791105508804, | |
| "eval_valid_target_runtime": 6.5915, | |
| "eval_valid_target_samples_per_second": 217.554, | |
| "eval_valid_target_steps_per_second": 6.827, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.126975325755476, | |
| "grad_norm": 0.1324358880519867, | |
| "learning_rate": 3.2666622044392765e-06, | |
| "loss": 0.0591, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.154699196007763, | |
| "grad_norm": 0.12708991765975952, | |
| "learning_rate": 3.225879313020178e-06, | |
| "loss": 0.0591, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.18242306626005, | |
| "grad_norm": 0.11844506114721298, | |
| "learning_rate": 3.18523101134169e-06, | |
| "loss": 0.0592, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.210146936512337, | |
| "grad_norm": 0.12888644635677338, | |
| "learning_rate": 3.1447203830972827e-06, | |
| "loss": 0.0597, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.237870806764624, | |
| "grad_norm": 0.1485096514225006, | |
| "learning_rate": 3.104350501536134e-06, | |
| "loss": 0.0598, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.237870806764624, | |
| "eval_valid_loss": 0.051265206187963486, | |
| "eval_valid_runtime": 6.437, | |
| "eval_valid_samples_per_second": 214.85, | |
| "eval_valid_steps_per_second": 6.835, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.237870806764624, | |
| "eval_valid_target_loss": 0.056084584444761276, | |
| "eval_valid_target_runtime": 6.6, | |
| "eval_valid_target_samples_per_second": 217.273, | |
| "eval_valid_target_steps_per_second": 6.818, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.265594677016912, | |
| "grad_norm": 0.11319620907306671, | |
| "learning_rate": 3.064124429229992e-06, | |
| "loss": 0.0581, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.293318547269199, | |
| "grad_norm": 0.125896617770195, | |
| "learning_rate": 3.0240452178408286e-06, | |
| "loss": 0.0594, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.321042417521486, | |
| "grad_norm": 0.13202796876430511, | |
| "learning_rate": 2.9841159078893377e-06, | |
| "loss": 0.0587, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.348766287773773, | |
| "grad_norm": 0.12477891147136688, | |
| "learning_rate": 2.944339528524278e-06, | |
| "loss": 0.0582, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.37649015802606, | |
| "grad_norm": 0.13174673914909363, | |
| "learning_rate": 2.9047190972926597e-06, | |
| "loss": 0.0585, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.37649015802606, | |
| "eval_valid_loss": 0.05099370330572128, | |
| "eval_valid_runtime": 6.4377, | |
| "eval_valid_samples_per_second": 214.828, | |
| "eval_valid_steps_per_second": 6.835, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.37649015802606, | |
| "eval_valid_target_loss": 0.055660318583250046, | |
| "eval_valid_target_runtime": 6.5668, | |
| "eval_valid_target_samples_per_second": 218.37, | |
| "eval_valid_target_steps_per_second": 6.853, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.404214028278347, | |
| "grad_norm": 0.12851925194263458, | |
| "learning_rate": 2.8652576199108395e-06, | |
| "loss": 0.0586, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.431937898530635, | |
| "grad_norm": 0.10676029324531555, | |
| "learning_rate": 2.8259580900364825e-06, | |
| "loss": 0.0584, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.459661768782922, | |
| "grad_norm": 0.1461838185787201, | |
| "learning_rate": 2.786823489041478e-06, | |
| "loss": 0.0583, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.487385639035209, | |
| "grad_norm": 0.12321025878190994, | |
| "learning_rate": 2.747856785785743e-06, | |
| "loss": 0.0579, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.515109509287496, | |
| "grad_norm": 0.1209678128361702, | |
| "learning_rate": 2.7090609363919986e-06, | |
| "loss": 0.0581, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.515109509287496, | |
| "eval_valid_loss": 0.050510190427303314, | |
| "eval_valid_runtime": 6.447, | |
| "eval_valid_samples_per_second": 214.517, | |
| "eval_valid_steps_per_second": 6.825, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.515109509287496, | |
| "eval_valid_target_loss": 0.0551883801817894, | |
| "eval_valid_target_runtime": 6.5701, | |
| "eval_valid_target_samples_per_second": 218.262, | |
| "eval_valid_target_steps_per_second": 6.849, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.5428333795397835, | |
| "grad_norm": 0.15566356480121613, | |
| "learning_rate": 2.6704388840215277e-06, | |
| "loss": 0.0578, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.570557249792071, | |
| "grad_norm": 0.10754121840000153, | |
| "learning_rate": 2.6319935586508814e-06, | |
| "loss": 0.058, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.5982811200443585, | |
| "grad_norm": 0.12134023010730743, | |
| "learning_rate": 2.593727876849601e-06, | |
| "loss": 0.0577, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.626004990296646, | |
| "grad_norm": 0.12984460592269897, | |
| "learning_rate": 2.555644741558979e-06, | |
| "loss": 0.0575, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.653728860548933, | |
| "grad_norm": 0.13557353615760803, | |
| "learning_rate": 2.51774704187181e-06, | |
| "loss": 0.0571, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.653728860548933, | |
| "eval_valid_loss": 0.0503346286714077, | |
| "eval_valid_runtime": 6.419, | |
| "eval_valid_samples_per_second": 215.455, | |
| "eval_valid_steps_per_second": 6.855, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.653728860548933, | |
| "eval_valid_target_loss": 0.0548863522708416, | |
| "eval_valid_target_runtime": 6.5823, | |
| "eval_valid_target_samples_per_second": 217.857, | |
| "eval_valid_target_steps_per_second": 6.837, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.68145273080122, | |
| "grad_norm": 0.10979162156581879, | |
| "learning_rate": 2.4800376528132297e-06, | |
| "loss": 0.0576, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 6.709176601053507, | |
| "grad_norm": 0.16127757728099823, | |
| "learning_rate": 2.4425194351226082e-06, | |
| "loss": 0.0579, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 6.736900471305795, | |
| "grad_norm": 0.13306181132793427, | |
| "learning_rate": 2.4051952350365194e-06, | |
| "loss": 0.0572, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 6.764624341558082, | |
| "grad_norm": 0.11353787779808044, | |
| "learning_rate": 2.368067884072821e-06, | |
| "loss": 0.0573, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 6.792348211810369, | |
| "grad_norm": 0.10115820914506912, | |
| "learning_rate": 2.331140198815849e-06, | |
| "loss": 0.0574, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 6.792348211810369, | |
| "eval_valid_loss": 0.049953412264585495, | |
| "eval_valid_runtime": 6.4338, | |
| "eval_valid_samples_per_second": 214.958, | |
| "eval_valid_steps_per_second": 6.839, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 6.792348211810369, | |
| "eval_valid_target_loss": 0.054579559713602066, | |
| "eval_valid_target_runtime": 6.5694, | |
| "eval_valid_target_samples_per_second": 218.283, | |
| "eval_valid_target_steps_per_second": 6.85, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 6.820072082062656, | |
| "grad_norm": 0.10899285972118378, | |
| "learning_rate": 2.294414980702741e-06, | |
| "loss": 0.0573, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 6.847795952314943, | |
| "grad_norm": 0.1248159185051918, | |
| "learning_rate": 2.257895015810913e-06, | |
| "loss": 0.0568, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 6.87551982256723, | |
| "grad_norm": 0.10761197656393051, | |
| "learning_rate": 2.221583074646701e-06, | |
| "loss": 0.0574, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 6.903243692819517, | |
| "grad_norm": 0.13541601598262787, | |
| "learning_rate": 2.1854819119351784e-06, | |
| "loss": 0.0562, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 6.930967563071805, | |
| "grad_norm": 0.10959000140428543, | |
| "learning_rate": 2.1495942664111814e-06, | |
| "loss": 0.0576, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 6.930967563071805, | |
| "eval_valid_loss": 0.049802832305431366, | |
| "eval_valid_runtime": 6.4091, | |
| "eval_valid_samples_per_second": 215.786, | |
| "eval_valid_steps_per_second": 6.865, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 6.930967563071805, | |
| "eval_valid_target_loss": 0.05434631556272507, | |
| "eval_valid_target_runtime": 6.5766, | |
| "eval_valid_target_samples_per_second": 218.047, | |
| "eval_valid_target_steps_per_second": 6.842, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 6.958691433324092, | |
| "grad_norm": 0.11864270269870758, | |
| "learning_rate": 2.113922860611532e-06, | |
| "loss": 0.0571, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 6.986415303576379, | |
| "grad_norm": 0.10493431985378265, | |
| "learning_rate": 2.078470400668506e-06, | |
| "loss": 0.0572, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.014139173828666, | |
| "grad_norm": 0.10294145345687866, | |
| "learning_rate": 2.0432395761045427e-06, | |
| "loss": 0.0562, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.041863044080953, | |
| "grad_norm": 0.11174608767032623, | |
| "learning_rate": 2.008233059628193e-06, | |
| "loss": 0.0562, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.069586914333241, | |
| "grad_norm": 0.10171514004468918, | |
| "learning_rate": 1.9734535069313753e-06, | |
| "loss": 0.056, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.069586914333241, | |
| "eval_valid_loss": 0.04948737472295761, | |
| "eval_valid_runtime": 6.442, | |
| "eval_valid_samples_per_second": 214.685, | |
| "eval_valid_steps_per_second": 6.83, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.069586914333241, | |
| "eval_valid_target_loss": 0.05410830304026604, | |
| "eval_valid_target_runtime": 6.5896, | |
| "eval_valid_target_samples_per_second": 217.617, | |
| "eval_valid_target_steps_per_second": 6.829, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.097310784585528, | |
| "grad_norm": 0.10731488466262817, | |
| "learning_rate": 1.9389035564879104e-06, | |
| "loss": 0.0569, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.125034654837815, | |
| "grad_norm": 0.0954216718673706, | |
| "learning_rate": 1.9045858293533399e-06, | |
| "loss": 0.0566, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.1527585250901025, | |
| "grad_norm": 0.11443454772233963, | |
| "learning_rate": 1.8705029289661054e-06, | |
| "loss": 0.057, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.1804823953423895, | |
| "grad_norm": 0.10671606659889221, | |
| "learning_rate": 1.8366574409500344e-06, | |
| "loss": 0.0561, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.208206265594677, | |
| "grad_norm": 0.1028604656457901, | |
| "learning_rate": 1.8030519329181916e-06, | |
| "loss": 0.0561, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.208206265594677, | |
| "eval_valid_loss": 0.04931313917040825, | |
| "eval_valid_runtime": 6.431, | |
| "eval_valid_samples_per_second": 215.053, | |
| "eval_valid_steps_per_second": 6.842, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.208206265594677, | |
| "eval_valid_target_loss": 0.053888678550720215, | |
| "eval_valid_target_runtime": 6.5712, | |
| "eval_valid_target_samples_per_second": 218.225, | |
| "eval_valid_target_steps_per_second": 6.848, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.2359301358469645, | |
| "grad_norm": 0.11538730561733246, | |
| "learning_rate": 1.7696889542780904e-06, | |
| "loss": 0.0564, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.263654006099252, | |
| "grad_norm": 0.10585539788007736, | |
| "learning_rate": 1.7365710360382882e-06, | |
| "loss": 0.0562, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.291377876351539, | |
| "grad_norm": 0.09750411659479141, | |
| "learning_rate": 1.7037006906163773e-06, | |
| "loss": 0.0563, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.319101746603826, | |
| "grad_norm": 0.10777630656957626, | |
| "learning_rate": 1.6710804116483886e-06, | |
| "loss": 0.0556, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.346825616856113, | |
| "grad_norm": 0.13231071829795837, | |
| "learning_rate": 1.6387126737996067e-06, | |
| "loss": 0.0559, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.346825616856113, | |
| "eval_valid_loss": 0.04909936338663101, | |
| "eval_valid_runtime": 6.4292, | |
| "eval_valid_samples_per_second": 215.112, | |
| "eval_valid_steps_per_second": 6.844, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.346825616856113, | |
| "eval_valid_target_loss": 0.05357712134718895, | |
| "eval_valid_target_runtime": 6.5542, | |
| "eval_valid_target_samples_per_second": 218.792, | |
| "eval_valid_target_steps_per_second": 6.866, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.374549487108401, | |
| "grad_norm": 0.10591776669025421, | |
| "learning_rate": 1.6065999325768544e-06, | |
| "loss": 0.0559, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.402273357360688, | |
| "grad_norm": 0.11603645980358124, | |
| "learning_rate": 1.5747446241421931e-06, | |
| "loss": 0.0557, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.429997227612975, | |
| "grad_norm": 0.09715123474597931, | |
| "learning_rate": 1.5431491651281123e-06, | |
| "loss": 0.0563, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.457721097865262, | |
| "grad_norm": 0.10046205669641495, | |
| "learning_rate": 1.511815952454208e-06, | |
| "loss": 0.0556, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.485444968117549, | |
| "grad_norm": 0.11805932968854904, | |
| "learning_rate": 1.480747363145334e-06, | |
| "loss": 0.0556, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.485444968117549, | |
| "eval_valid_loss": 0.04887402430176735, | |
| "eval_valid_runtime": 6.4098, | |
| "eval_valid_samples_per_second": 215.763, | |
| "eval_valid_steps_per_second": 6.864, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.485444968117549, | |
| "eval_valid_target_loss": 0.05348382145166397, | |
| "eval_valid_target_runtime": 6.5773, | |
| "eval_valid_target_samples_per_second": 218.023, | |
| "eval_valid_target_steps_per_second": 6.842, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.513168838369836, | |
| "grad_norm": 0.1107444316148758, | |
| "learning_rate": 1.4499457541512746e-06, | |
| "loss": 0.0554, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.540892708622124, | |
| "grad_norm": 0.10029349476099014, | |
| "learning_rate": 1.4194134621679478e-06, | |
| "loss": 0.0559, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.568616578874411, | |
| "grad_norm": 0.09976372122764587, | |
| "learning_rate": 1.3891528034601316e-06, | |
| "loss": 0.0565, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.596340449126698, | |
| "grad_norm": 0.10560230165719986, | |
| "learning_rate": 1.3591660736857453e-06, | |
| "loss": 0.0553, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.624064319378985, | |
| "grad_norm": 0.09814602881669998, | |
| "learning_rate": 1.329455547721697e-06, | |
| "loss": 0.0552, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 7.624064319378985, | |
| "eval_valid_loss": 0.04867083579301834, | |
| "eval_valid_runtime": 6.4389, | |
| "eval_valid_samples_per_second": 214.79, | |
| "eval_valid_steps_per_second": 6.834, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 7.624064319378985, | |
| "eval_valid_target_loss": 0.053231850266456604, | |
| "eval_valid_target_runtime": 6.5692, | |
| "eval_valid_target_samples_per_second": 218.292, | |
| "eval_valid_target_steps_per_second": 6.85, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 7.651788189631272, | |
| "grad_norm": 0.10253589600324631, | |
| "learning_rate": 1.300023479491303e-06, | |
| "loss": 0.0555, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 7.67951205988356, | |
| "grad_norm": 0.10933282226324081, | |
| "learning_rate": 1.2708721017933007e-06, | |
| "loss": 0.0551, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 7.707235930135847, | |
| "grad_norm": 0.11853484809398651, | |
| "learning_rate": 1.2420036261324598e-06, | |
| "loss": 0.056, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 7.734959800388134, | |
| "grad_norm": 0.0992041826248169, | |
| "learning_rate": 1.2134202425518139e-06, | |
| "loss": 0.0547, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 7.762683670640421, | |
| "grad_norm": 0.10824355483055115, | |
| "learning_rate": 1.185124119466517e-06, | |
| "loss": 0.0554, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 7.762683670640421, | |
| "eval_valid_loss": 0.048471271991729736, | |
| "eval_valid_runtime": 6.414, | |
| "eval_valid_samples_per_second": 215.623, | |
| "eval_valid_steps_per_second": 6.86, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 7.762683670640421, | |
| "eval_valid_target_loss": 0.05302482470870018, | |
| "eval_valid_target_runtime": 6.5682, | |
| "eval_valid_target_samples_per_second": 218.326, | |
| "eval_valid_target_steps_per_second": 6.851, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 7.7904075408927085, | |
| "grad_norm": 0.09927680343389511, | |
| "learning_rate": 1.1571174034993416e-06, | |
| "loss": 0.0555, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 7.8181314111449955, | |
| "grad_norm": 0.09600567072629929, | |
| "learning_rate": 1.129402219317825e-06, | |
| "loss": 0.0553, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 7.845855281397283, | |
| "grad_norm": 0.11057105660438538, | |
| "learning_rate": 1.1019806694730989e-06, | |
| "loss": 0.0557, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 7.873579151649571, | |
| "grad_norm": 0.10991726815700531, | |
| "learning_rate": 1.074854834240368e-06, | |
| "loss": 0.0553, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 7.901303021901858, | |
| "grad_norm": 0.09168905019760132, | |
| "learning_rate": 1.0480267714611048e-06, | |
| "loss": 0.0551, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 7.901303021901858, | |
| "eval_valid_loss": 0.04835043475031853, | |
| "eval_valid_runtime": 6.4532, | |
| "eval_valid_samples_per_second": 214.313, | |
| "eval_valid_steps_per_second": 6.818, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 7.901303021901858, | |
| "eval_valid_target_loss": 0.05293356999754906, | |
| "eval_valid_target_runtime": 6.5812, | |
| "eval_valid_target_samples_per_second": 217.894, | |
| "eval_valid_target_steps_per_second": 6.838, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 7.929026892154145, | |
| "grad_norm": 0.09465237706899643, | |
| "learning_rate": 1.0214985163869378e-06, | |
| "loss": 0.0556, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 7.956750762406432, | |
| "grad_norm": 0.10842736065387726, | |
| "learning_rate": 9.952720815252397e-07, | |
| "loss": 0.0543, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 7.984474632658719, | |
| "grad_norm": 0.09609558433294296, | |
| "learning_rate": 9.693494564864648e-07, | |
| "loss": 0.0554, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.012198502911007, | |
| "grad_norm": 0.10819283127784729, | |
| "learning_rate": 9.437326078332099e-07, | |
| "loss": 0.0545, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.039922373163293, | |
| "grad_norm": 0.09054001420736313, | |
| "learning_rate": 9.18423478931016e-07, | |
| "loss": 0.0554, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.039922373163293, | |
| "eval_valid_loss": 0.04819526523351669, | |
| "eval_valid_runtime": 6.4165, | |
| "eval_valid_samples_per_second": 215.536, | |
| "eval_valid_steps_per_second": 6.857, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.039922373163293, | |
| "eval_valid_target_loss": 0.05275378376245499, | |
| "eval_valid_target_runtime": 6.5635, | |
| "eval_valid_target_samples_per_second": 218.482, | |
| "eval_valid_target_steps_per_second": 6.856, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.067646243415581, | |
| "grad_norm": 0.10373499244451523, | |
| "learning_rate": 8.934239898009517e-07, | |
| "loss": 0.0552, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.095370113667869, | |
| "grad_norm": 0.09614498168230057, | |
| "learning_rate": 8.687360369739473e-07, | |
| "loss": 0.0545, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.123093983920155, | |
| "grad_norm": 0.1014479324221611, | |
| "learning_rate": 8.443614933469208e-07, | |
| "loss": 0.0549, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.150817854172443, | |
| "grad_norm": 0.08971751481294632, | |
| "learning_rate": 8.203022080406952e-07, | |
| "loss": 0.0546, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.17854172442473, | |
| "grad_norm": 0.09659924358129501, | |
| "learning_rate": 7.965600062597184e-07, | |
| "loss": 0.0542, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.17854172442473, | |
| "eval_valid_loss": 0.04812739044427872, | |
| "eval_valid_runtime": 6.4674, | |
| "eval_valid_samples_per_second": 213.843, | |
| "eval_valid_steps_per_second": 6.803, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.17854172442473, | |
| "eval_valid_target_loss": 0.05264822766184807, | |
| "eval_valid_target_runtime": 6.5912, | |
| "eval_valid_target_samples_per_second": 217.563, | |
| "eval_valid_target_steps_per_second": 6.827, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.206265594677017, | |
| "grad_norm": 0.1034499853849411, | |
| "learning_rate": 7.731366891535969e-07, | |
| "loss": 0.0548, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.233989464929303, | |
| "grad_norm": 0.0934043675661087, | |
| "learning_rate": 7.500340336804607e-07, | |
| "loss": 0.0542, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.261713335181591, | |
| "grad_norm": 0.09693789482116699, | |
| "learning_rate": 7.272537924721467e-07, | |
| "loss": 0.0553, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.28943720543388, | |
| "grad_norm": 0.09552415460348129, | |
| "learning_rate": 7.047976937012568e-07, | |
| "loss": 0.0543, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.317161075686165, | |
| "grad_norm": 0.0978178158402443, | |
| "learning_rate": 6.826674409500389e-07, | |
| "loss": 0.0548, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.317161075686165, | |
| "eval_valid_loss": 0.04797354340553284, | |
| "eval_valid_runtime": 6.442, | |
| "eval_valid_samples_per_second": 214.683, | |
| "eval_valid_steps_per_second": 6.83, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.317161075686165, | |
| "eval_valid_target_loss": 0.052511684596538544, | |
| "eval_valid_target_runtime": 6.5615, | |
| "eval_valid_target_samples_per_second": 218.549, | |
| "eval_valid_target_steps_per_second": 6.858, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.344884945938453, | |
| "grad_norm": 0.09591928869485855, | |
| "learning_rate": 6.608647130811502e-07, | |
| "loss": 0.0543, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.37260881619074, | |
| "grad_norm": 0.09678730368614197, | |
| "learning_rate": 6.393911641103051e-07, | |
| "loss": 0.0542, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.400332686443027, | |
| "grad_norm": 0.10894029587507248, | |
| "learning_rate": 6.182484230807845e-07, | |
| "loss": 0.0542, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.428056556695315, | |
| "grad_norm": 0.10065341740846634, | |
| "learning_rate": 5.974380939398555e-07, | |
| "loss": 0.0549, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.455780426947602, | |
| "grad_norm": 0.11015477776527405, | |
| "learning_rate": 5.769617554170959e-07, | |
| "loss": 0.0544, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.455780426947602, | |
| "eval_valid_loss": 0.04785359278321266, | |
| "eval_valid_runtime": 6.4159, | |
| "eval_valid_samples_per_second": 215.558, | |
| "eval_valid_steps_per_second": 6.858, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.455780426947602, | |
| "eval_valid_target_loss": 0.05238433927297592, | |
| "eval_valid_target_runtime": 6.575, | |
| "eval_valid_target_samples_per_second": 218.1, | |
| "eval_valid_target_steps_per_second": 6.844, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.48350429719989, | |
| "grad_norm": 0.10229642689228058, | |
| "learning_rate": 5.568209609046238e-07, | |
| "loss": 0.0542, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.511228167452176, | |
| "grad_norm": 0.1019807681441307, | |
| "learning_rate": 5.370172383392514e-07, | |
| "loss": 0.0548, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.538952037704464, | |
| "grad_norm": 0.1037830114364624, | |
| "learning_rate": 5.175520900865754e-07, | |
| "loss": 0.0538, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.56667590795675, | |
| "grad_norm": 0.0952112227678299, | |
| "learning_rate": 4.984269928270002e-07, | |
| "loss": 0.0537, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 8.594399778209038, | |
| "grad_norm": 0.09642232209444046, | |
| "learning_rate": 4.796433974437148e-07, | |
| "loss": 0.0533, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 8.594399778209038, | |
| "eval_valid_loss": 0.04777803644537926, | |
| "eval_valid_runtime": 6.4399, | |
| "eval_valid_samples_per_second": 214.756, | |
| "eval_valid_steps_per_second": 6.832, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 8.594399778209038, | |
| "eval_valid_target_loss": 0.052354373037815094, | |
| "eval_valid_target_runtime": 6.5668, | |
| "eval_valid_target_samples_per_second": 218.371, | |
| "eval_valid_target_steps_per_second": 6.853, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 8.622123648461326, | |
| "grad_norm": 0.10211507230997086, | |
| "learning_rate": 4.6120272891262365e-07, | |
| "loss": 0.0544, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 8.649847518713612, | |
| "grad_norm": 0.0912129357457161, | |
| "learning_rate": 4.4310638619424363e-07, | |
| "loss": 0.0536, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 8.6775713889659, | |
| "grad_norm": 0.10558176785707474, | |
| "learning_rate": 4.2535574212757667e-07, | |
| "loss": 0.0542, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 8.705295259218186, | |
| "grad_norm": 0.10381397604942322, | |
| "learning_rate": 4.0795214332596145e-07, | |
| "loss": 0.0547, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 8.733019129470474, | |
| "grad_norm": 0.09383094310760498, | |
| "learning_rate": 3.908969100749121e-07, | |
| "loss": 0.055, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 8.733019129470474, | |
| "eval_valid_loss": 0.047727905213832855, | |
| "eval_valid_runtime": 6.4171, | |
| "eval_valid_samples_per_second": 215.518, | |
| "eval_valid_steps_per_second": 6.857, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 8.733019129470474, | |
| "eval_valid_target_loss": 0.05224745720624924, | |
| "eval_valid_target_runtime": 6.5727, | |
| "eval_valid_target_samples_per_second": 218.174, | |
| "eval_valid_target_steps_per_second": 6.846, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 8.760742999722762, | |
| "grad_norm": 0.10438426584005356, | |
| "learning_rate": 3.7419133623196825e-07, | |
| "loss": 0.0541, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 8.788466869975048, | |
| "grad_norm": 0.09324101358652115, | |
| "learning_rate": 3.5783668912852453e-07, | |
| "loss": 0.0537, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 8.816190740227336, | |
| "grad_norm": 0.09235464036464691, | |
| "learning_rate": 3.4183420947369873e-07, | |
| "loss": 0.0544, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 8.843914610479622, | |
| "grad_norm": 0.09870747476816177, | |
| "learning_rate": 3.261851112602055e-07, | |
| "loss": 0.0543, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 8.87163848073191, | |
| "grad_norm": 0.10918495059013367, | |
| "learning_rate": 3.108905816722546e-07, | |
| "loss": 0.054, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 8.87163848073191, | |
| "eval_valid_loss": 0.047707512974739075, | |
| "eval_valid_runtime": 6.4362, | |
| "eval_valid_samples_per_second": 214.879, | |
| "eval_valid_steps_per_second": 6.836, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 8.87163848073191, | |
| "eval_valid_target_loss": 0.05221306532621384, | |
| "eval_valid_target_runtime": 6.5779, | |
| "eval_valid_target_samples_per_second": 218.002, | |
| "eval_valid_target_steps_per_second": 6.841, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 8.899362350984198, | |
| "grad_norm": 0.09537260234355927, | |
| "learning_rate": 2.9595178099549315e-07, | |
| "loss": 0.054, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 8.927086221236484, | |
| "grad_norm": 0.09188380092382431, | |
| "learning_rate": 2.8136984252898515e-07, | |
| "loss": 0.0542, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 8.954810091488772, | |
| "grad_norm": 0.09919969737529755, | |
| "learning_rate": 2.671458724992254e-07, | |
| "loss": 0.0542, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 8.982533961741058, | |
| "grad_norm": 0.09692647308111191, | |
| "learning_rate": 2.532809499762312e-07, | |
| "loss": 0.0544, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.010257831993346, | |
| "grad_norm": 0.09277132153511047, | |
| "learning_rate": 2.397761267916726e-07, | |
| "loss": 0.0539, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.010257831993346, | |
| "eval_valid_loss": 0.047637518495321274, | |
| "eval_valid_runtime": 6.4471, | |
| "eval_valid_samples_per_second": 214.516, | |
| "eval_valid_steps_per_second": 6.825, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.010257831993346, | |
| "eval_valid_target_loss": 0.052208978682756424, | |
| "eval_valid_target_runtime": 6.5636, | |
| "eval_valid_target_samples_per_second": 218.477, | |
| "eval_valid_target_steps_per_second": 6.856, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.037981702245634, | |
| "grad_norm": 0.09585940837860107, | |
| "learning_rate": 2.2663242745908087e-07, | |
| "loss": 0.0542, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.06570557249792, | |
| "grad_norm": 0.09488432109355927, | |
| "learning_rate": 2.138508490961244e-07, | |
| "loss": 0.0533, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.093429442750208, | |
| "grad_norm": 0.09499957412481308, | |
| "learning_rate": 2.014323613489666e-07, | |
| "loss": 0.0543, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.121153313002495, | |
| "grad_norm": 0.09435317665338516, | |
| "learning_rate": 1.8937790631870345e-07, | |
| "loss": 0.0536, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.148877183254783, | |
| "grad_norm": 0.10342779755592346, | |
| "learning_rate": 1.7768839848989584e-07, | |
| "loss": 0.0539, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.148877183254783, | |
| "eval_valid_loss": 0.047598063945770264, | |
| "eval_valid_runtime": 6.4315, | |
| "eval_valid_samples_per_second": 215.037, | |
| "eval_valid_steps_per_second": 6.841, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.148877183254783, | |
| "eval_valid_target_loss": 0.05212317034602165, | |
| "eval_valid_target_runtime": 6.5736, | |
| "eval_valid_target_samples_per_second": 218.146, | |
| "eval_valid_target_steps_per_second": 6.846, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.176601053507069, | |
| "grad_norm": 0.09814909845590591, | |
| "learning_rate": 1.6636472466118992e-07, | |
| "loss": 0.0542, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.204324923759357, | |
| "grad_norm": 0.09484022855758667, | |
| "learning_rate": 1.5540774387804825e-07, | |
| "loss": 0.0544, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.232048794011645, | |
| "grad_norm": 0.07888332009315491, | |
| "learning_rate": 1.448182873675752e-07, | |
| "loss": 0.0539, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.25977266426393, | |
| "grad_norm": 0.0964021384716034, | |
| "learning_rate": 1.345971584754585e-07, | |
| "loss": 0.0539, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.287496534516219, | |
| "grad_norm": 0.10322096943855286, | |
| "learning_rate": 1.2474513260502695e-07, | |
| "loss": 0.0536, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.287496534516219, | |
| "eval_valid_loss": 0.047564879059791565, | |
| "eval_valid_runtime": 6.4358, | |
| "eval_valid_samples_per_second": 214.89, | |
| "eval_valid_steps_per_second": 6.837, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.287496534516219, | |
| "eval_valid_target_loss": 0.05209695175290108, | |
| "eval_valid_target_runtime": 6.5809, | |
| "eval_valid_target_samples_per_second": 217.904, | |
| "eval_valid_target_steps_per_second": 6.838, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.315220404768505, | |
| "grad_norm": 0.10957927256822586, | |
| "learning_rate": 1.1526295715842628e-07, | |
| "loss": 0.0541, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.342944275020793, | |
| "grad_norm": 0.09433583915233612, | |
| "learning_rate": 1.0615135147991562e-07, | |
| "loss": 0.0542, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.370668145273081, | |
| "grad_norm": 0.09703412652015686, | |
| "learning_rate": 9.741100680130122e-08, | |
| "loss": 0.0535, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.398392015525367, | |
| "grad_norm": 0.10180799663066864, | |
| "learning_rate": 8.904258618949335e-08, | |
| "loss": 0.054, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.426115885777655, | |
| "grad_norm": 0.09336613118648529, | |
| "learning_rate": 8.104672449620598e-08, | |
| "loss": 0.0532, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.426115885777655, | |
| "eval_valid_loss": 0.047556404024362564, | |
| "eval_valid_runtime": 6.42, | |
| "eval_valid_samples_per_second": 215.421, | |
| "eval_valid_steps_per_second": 6.854, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.426115885777655, | |
| "eval_valid_target_loss": 0.05208129063248634, | |
| "eval_valid_target_runtime": 6.595, | |
| "eval_valid_target_samples_per_second": 217.437, | |
| "eval_valid_target_steps_per_second": 6.823, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.453839756029941, | |
| "grad_norm": 0.0890408605337143, | |
| "learning_rate": 7.342402830979589e-08, | |
| "loss": 0.054, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.48156362628223, | |
| "grad_norm": 0.09568461775779724, | |
| "learning_rate": 6.617507590924332e-08, | |
| "loss": 0.0535, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.509287496534515, | |
| "grad_norm": 0.09256019443273544, | |
| "learning_rate": 5.930041722028379e-08, | |
| "loss": 0.054, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 9.537011366786803, | |
| "grad_norm": 0.09314898401498795, | |
| "learning_rate": 5.280057377368863e-08, | |
| "loss": 0.0535, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 9.564735237039091, | |
| "grad_norm": 0.10256827622652054, | |
| "learning_rate": 4.667603866569892e-08, | |
| "loss": 0.0537, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 9.564735237039091, | |
| "eval_valid_loss": 0.047560639679431915, | |
| "eval_valid_runtime": 6.4632, | |
| "eval_valid_samples_per_second": 213.979, | |
| "eval_valid_steps_per_second": 6.808, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 9.564735237039091, | |
| "eval_valid_target_loss": 0.05206665024161339, | |
| "eval_valid_target_runtime": 6.5886, | |
| "eval_valid_target_samples_per_second": 217.649, | |
| "eval_valid_target_steps_per_second": 6.83, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 9.592459107291377, | |
| "grad_norm": 0.0861942321062088, | |
| "learning_rate": 4.092727652062034e-08, | |
| "loss": 0.0537, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 9.620182977543665, | |
| "grad_norm": 0.09521106630563736, | |
| "learning_rate": 3.555472345557365e-08, | |
| "loss": 0.0535, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 9.647906847795952, | |
| "grad_norm": 0.10885845869779587, | |
| "learning_rate": 3.055878704741e-08, | |
| "loss": 0.0542, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 9.67563071804824, | |
| "grad_norm": 0.09145703911781311, | |
| "learning_rate": 2.5939846301791804e-08, | |
| "loss": 0.0541, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 9.703354588300527, | |
| "grad_norm": 0.09051796793937683, | |
| "learning_rate": 2.1698251624438503e-08, | |
| "loss": 0.0544, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 9.703354588300527, | |
| "eval_valid_loss": 0.04752533510327339, | |
| "eval_valid_runtime": 6.4168, | |
| "eval_valid_samples_per_second": 215.528, | |
| "eval_valid_steps_per_second": 6.857, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 9.703354588300527, | |
| "eval_valid_target_loss": 0.05207618325948715, | |
| "eval_valid_target_runtime": 6.57, | |
| "eval_valid_target_samples_per_second": 218.265, | |
| "eval_valid_target_steps_per_second": 6.849, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 9.731078458552814, | |
| "grad_norm": 0.0903056338429451, | |
| "learning_rate": 1.7834324794546164e-08, | |
| "loss": 0.0539, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 9.758802328805102, | |
| "grad_norm": 0.0897304117679596, | |
| "learning_rate": 1.434835894037423e-08, | |
| "loss": 0.0539, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 9.786526199057388, | |
| "grad_norm": 0.10058806836605072, | |
| "learning_rate": 1.1240618517009416e-08, | |
| "loss": 0.0542, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 9.814250069309676, | |
| "grad_norm": 0.1056876927614212, | |
| "learning_rate": 8.511339286303432e-09, | |
| "loss": 0.0537, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 9.841973939561964, | |
| "grad_norm": 0.08990786969661713, | |
| "learning_rate": 6.1607282989856184e-09, | |
| "loss": 0.0547, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 9.841973939561964, | |
| "eval_valid_loss": 0.047528158873319626, | |
| "eval_valid_runtime": 6.4412, | |
| "eval_valid_samples_per_second": 214.712, | |
| "eval_valid_steps_per_second": 6.831, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 9.841973939561964, | |
| "eval_valid_target_loss": 0.05206017941236496, | |
| "eval_valid_target_runtime": 6.5864, | |
| "eval_valid_target_samples_per_second": 217.72, | |
| "eval_valid_target_steps_per_second": 6.832, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 9.86969780981425, | |
| "grad_norm": 0.08090436458587646, | |
| "learning_rate": 4.188963878958841e-09, | |
| "loss": 0.0536, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 9.897421680066538, | |
| "grad_norm": 0.08319131284952164, | |
| "learning_rate": 2.5961956097669827e-09, | |
| "loss": 0.0541, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 9.925145550318824, | |
| "grad_norm": 0.10666873306035995, | |
| "learning_rate": 1.3825443232517999e-09, | |
| "loss": 0.0541, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 9.952869420571112, | |
| "grad_norm": 0.10748881101608276, | |
| "learning_rate": 5.48102090381919e-10, | |
| "loss": 0.0543, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 9.9805932908234, | |
| "grad_norm": 0.10198221355676651, | |
| "learning_rate": 9.293221427231214e-11, | |
| "loss": 0.0533, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 9.9805932908234, | |
| "eval_valid_loss": 0.04753027856349945, | |
| "eval_valid_runtime": 6.4518, | |
| "eval_valid_samples_per_second": 214.359, | |
| "eval_valid_steps_per_second": 6.82, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 9.9805932908234, | |
| "eval_valid_target_loss": 0.05205439031124115, | |
| "eval_valid_target_runtime": 6.5698, | |
| "eval_valid_target_samples_per_second": 218.272, | |
| "eval_valid_target_steps_per_second": 6.85, | |
| "step": 36000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 36070, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.429394066302619e+19, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |