| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 395, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012658227848101266, |
| "grad_norm": 0.5064402371961658, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.7968, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.02531645569620253, |
| "grad_norm": 0.4347700160422649, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.7971, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0379746835443038, |
| "grad_norm": 0.4600069866451295, |
| "learning_rate": 3e-06, |
| "loss": 0.7682, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.05063291139240506, |
| "grad_norm": 0.38856164898778117, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.804, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.06329113924050633, |
| "grad_norm": 0.2550662772218312, |
| "learning_rate": 5e-06, |
| "loss": 0.8014, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0759493670886076, |
| "grad_norm": 0.9890291578960435, |
| "learning_rate": 6e-06, |
| "loss": 0.8146, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.08860759493670886, |
| "grad_norm": 0.8553074952653977, |
| "learning_rate": 7e-06, |
| "loss": 0.7909, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.10126582278481013, |
| "grad_norm": 0.5481080562458982, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.806, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.11392405063291139, |
| "grad_norm": 0.37793043326257825, |
| "learning_rate": 9e-06, |
| "loss": 0.7858, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.12658227848101267, |
| "grad_norm": 0.5081915085652062, |
| "learning_rate": 1e-05, |
| "loss": 0.749, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.13924050632911392, |
| "grad_norm": 0.6521886644076518, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.8069, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.1518987341772152, |
| "grad_norm": 0.5026663766470839, |
| "learning_rate": 1.2e-05, |
| "loss": 0.7862, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.16455696202531644, |
| "grad_norm": 0.43919041970979394, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.7846, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.17721518987341772, |
| "grad_norm": 0.49349482728877997, |
| "learning_rate": 1.4e-05, |
| "loss": 0.7633, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.189873417721519, |
| "grad_norm": 0.44284955692741484, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.7951, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.20253164556962025, |
| "grad_norm": 0.46628061714192676, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.8081, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.21518987341772153, |
| "grad_norm": 0.41362438662118695, |
| "learning_rate": 1.7e-05, |
| "loss": 0.767, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.22784810126582278, |
| "grad_norm": 0.43028098922164054, |
| "learning_rate": 1.8e-05, |
| "loss": 0.8082, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.24050632911392406, |
| "grad_norm": 0.4088201650759829, |
| "learning_rate": 1.9e-05, |
| "loss": 0.7876, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.25316455696202533, |
| "grad_norm": 0.4236896815328937, |
| "learning_rate": 2e-05, |
| "loss": 0.7934, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.26582278481012656, |
| "grad_norm": 0.3801519316427054, |
| "learning_rate": 2.1000000000000002e-05, |
| "loss": 0.8217, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.27848101265822783, |
| "grad_norm": 0.4353103208992625, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 0.802, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.2911392405063291, |
| "grad_norm": 0.4056304765778736, |
| "learning_rate": 2.3e-05, |
| "loss": 0.8004, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.3037974683544304, |
| "grad_norm": 0.42870239352523093, |
| "learning_rate": 2.4e-05, |
| "loss": 0.8005, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.31645569620253167, |
| "grad_norm": 0.4388221915073862, |
| "learning_rate": 2.5e-05, |
| "loss": 0.7949, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.3291139240506329, |
| "grad_norm": 0.47344060385708536, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 0.7977, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.34177215189873417, |
| "grad_norm": 0.4871886869801061, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 0.8074, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.35443037974683544, |
| "grad_norm": 0.5731199334104394, |
| "learning_rate": 2.8e-05, |
| "loss": 0.8203, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.3670886075949367, |
| "grad_norm": 0.9389275389340311, |
| "learning_rate": 2.9e-05, |
| "loss": 0.8262, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.379746835443038, |
| "grad_norm": 1.2115180715535663, |
| "learning_rate": 3.0000000000000004e-05, |
| "loss": 0.8104, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.3924050632911392, |
| "grad_norm": 0.8412491217200665, |
| "learning_rate": 3.1e-05, |
| "loss": 0.7697, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.4050632911392405, |
| "grad_norm": 1.1810315994151994, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 0.8183, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.4177215189873418, |
| "grad_norm": 0.7336114416401421, |
| "learning_rate": 3.3e-05, |
| "loss": 0.7915, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.43037974683544306, |
| "grad_norm": 1.0322176098181708, |
| "learning_rate": 3.4e-05, |
| "loss": 0.8054, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.4430379746835443, |
| "grad_norm": 1.0870886821908625, |
| "learning_rate": 3.5000000000000004e-05, |
| "loss": 0.7942, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.45569620253164556, |
| "grad_norm": 1.1509564204460234, |
| "learning_rate": 3.6e-05, |
| "loss": 0.8058, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.46835443037974683, |
| "grad_norm": 0.8072450413210277, |
| "learning_rate": 3.7000000000000005e-05, |
| "loss": 0.8012, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.4810126582278481, |
| "grad_norm": 1.1375575820376653, |
| "learning_rate": 3.8e-05, |
| "loss": 0.7727, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.4936708860759494, |
| "grad_norm": 0.6984855676314381, |
| "learning_rate": 3.9e-05, |
| "loss": 0.8279, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.5063291139240507, |
| "grad_norm": 0.6890356108321417, |
| "learning_rate": 4e-05, |
| "loss": 0.8, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5189873417721519, |
| "grad_norm": 0.7836814209745863, |
| "learning_rate": 3.9999216858560603e-05, |
| "loss": 0.849, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.5316455696202531, |
| "grad_norm": 0.770626888486783, |
| "learning_rate": 3.999686749557346e-05, |
| "loss": 0.7986, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.5443037974683544, |
| "grad_norm": 0.5889698309751599, |
| "learning_rate": 3.999295209502691e-05, |
| "loss": 0.7991, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.5569620253164557, |
| "grad_norm": 0.7643640268290252, |
| "learning_rate": 3.998747096355221e-05, |
| "loss": 0.8069, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.569620253164557, |
| "grad_norm": 0.5973666051464454, |
| "learning_rate": 3.9980424530399475e-05, |
| "loss": 0.818, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.5822784810126582, |
| "grad_norm": 0.6018566974452557, |
| "learning_rate": 3.997181334740408e-05, |
| "loss": 0.8202, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.5949367088607594, |
| "grad_norm": 0.500026806418615, |
| "learning_rate": 3.9961638088943465e-05, |
| "loss": 0.7914, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.6075949367088608, |
| "grad_norm": 0.5536025945061174, |
| "learning_rate": 3.994989955188427e-05, |
| "loss": 0.7837, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.620253164556962, |
| "grad_norm": 0.4765009895810983, |
| "learning_rate": 3.993659865551998e-05, |
| "loss": 0.814, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.6329113924050633, |
| "grad_norm": 0.47847721633882073, |
| "learning_rate": 3.99217364414989e-05, |
| "loss": 0.8074, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.6455696202531646, |
| "grad_norm": 0.47280549738166827, |
| "learning_rate": 3.990531407374262e-05, |
| "loss": 0.8049, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.6582278481012658, |
| "grad_norm": 0.42005120027664167, |
| "learning_rate": 3.9887332838354784e-05, |
| "loss": 0.8341, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.6708860759493671, |
| "grad_norm": 0.423268612947329, |
| "learning_rate": 3.986779414352047e-05, |
| "loss": 0.7464, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.6835443037974683, |
| "grad_norm": 0.42906392610316346, |
| "learning_rate": 3.984669951939583e-05, |
| "loss": 0.7935, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.6962025316455697, |
| "grad_norm": 0.4670274491545538, |
| "learning_rate": 3.982405061798829e-05, |
| "loss": 0.7739, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.7088607594936709, |
| "grad_norm": 0.5205590281674367, |
| "learning_rate": 3.9799849213027186e-05, |
| "loss": 0.8025, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.7215189873417721, |
| "grad_norm": 0.47278324921832826, |
| "learning_rate": 3.9774097199824824e-05, |
| "loss": 0.7848, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.7341772151898734, |
| "grad_norm": 0.45158006725544253, |
| "learning_rate": 3.974679659512807e-05, |
| "loss": 0.8095, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.7468354430379747, |
| "grad_norm": 0.4391143464763561, |
| "learning_rate": 3.971794953696041e-05, |
| "loss": 0.783, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.759493670886076, |
| "grad_norm": 0.45441260412279694, |
| "learning_rate": 3.9687558284454515e-05, |
| "loss": 0.7963, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7721518987341772, |
| "grad_norm": 0.4550922518852085, |
| "learning_rate": 3.96556252176753e-05, |
| "loss": 0.8334, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.7848101265822784, |
| "grad_norm": 0.3712309670648417, |
| "learning_rate": 3.962215283743356e-05, |
| "loss": 0.8051, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.7974683544303798, |
| "grad_norm": 0.3585063618209627, |
| "learning_rate": 3.9587143765090096e-05, |
| "loss": 0.8459, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.810126582278481, |
| "grad_norm": 0.37138144105274296, |
| "learning_rate": 3.955060074235045e-05, |
| "loss": 0.853, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.8227848101265823, |
| "grad_norm": 0.4363375524824417, |
| "learning_rate": 3.951252663105014e-05, |
| "loss": 0.8164, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.8354430379746836, |
| "grad_norm": 0.47787123482225796, |
| "learning_rate": 3.947292441293062e-05, |
| "loss": 0.8222, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.8481012658227848, |
| "grad_norm": 0.5017223706338163, |
| "learning_rate": 3.943179718940569e-05, |
| "loss": 0.7934, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.8607594936708861, |
| "grad_norm": 0.4949845610321875, |
| "learning_rate": 3.938914818131866e-05, |
| "loss": 0.7981, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.8734177215189873, |
| "grad_norm": 0.49920491454568733, |
| "learning_rate": 3.934498072869008e-05, |
| "loss": 0.8281, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.8860759493670886, |
| "grad_norm": 0.45034846062729655, |
| "learning_rate": 3.92992982904562e-05, |
| "loss": 0.7719, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.8987341772151899, |
| "grad_norm": 0.44394745297725413, |
| "learning_rate": 3.925210444419806e-05, |
| "loss": 0.7916, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.9113924050632911, |
| "grad_norm": 0.38715911801365394, |
| "learning_rate": 3.9203402885861334e-05, |
| "loss": 0.8019, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.9240506329113924, |
| "grad_norm": 0.48760649952731494, |
| "learning_rate": 3.915319742946687e-05, |
| "loss": 0.8223, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.9367088607594937, |
| "grad_norm": 0.5616971172642145, |
| "learning_rate": 3.910149200681199e-05, |
| "loss": 0.8079, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.9493670886075949, |
| "grad_norm": 0.5425560304150727, |
| "learning_rate": 3.904829066716263e-05, |
| "loss": 0.8329, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.9620253164556962, |
| "grad_norm": 0.4498377825200447, |
| "learning_rate": 3.8993597576936154e-05, |
| "loss": 0.8053, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.9746835443037974, |
| "grad_norm": 0.34642979074048774, |
| "learning_rate": 3.893741701937509e-05, |
| "loss": 0.8226, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.9873417721518988, |
| "grad_norm": 0.4402127889952929, |
| "learning_rate": 3.887975339421172e-05, |
| "loss": 0.8552, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.44688397413240505, |
| "learning_rate": 3.882061121732349e-05, |
| "loss": 0.7747, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.0126582278481013, |
| "grad_norm": 0.4768098067060385, |
| "learning_rate": 3.8759995120379355e-05, |
| "loss": 0.7735, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.0253164556962024, |
| "grad_norm": 0.5623984337396924, |
| "learning_rate": 3.869790985047704e-05, |
| "loss": 0.7581, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.0379746835443038, |
| "grad_norm": 0.576604325996257, |
| "learning_rate": 3.863436026977132e-05, |
| "loss": 0.7318, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.0506329113924051, |
| "grad_norm": 0.5059595351594764, |
| "learning_rate": 3.85693513550932e-05, |
| "loss": 0.7125, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.0632911392405062, |
| "grad_norm": 0.6040561142397214, |
| "learning_rate": 3.850288819756019e-05, |
| "loss": 0.7377, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.0759493670886076, |
| "grad_norm": 0.5450877100328588, |
| "learning_rate": 3.843497600217757e-05, |
| "loss": 0.7555, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.0886075949367089, |
| "grad_norm": 0.6556623197542489, |
| "learning_rate": 3.836562008743079e-05, |
| "loss": 0.7659, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.1012658227848102, |
| "grad_norm": 0.6204913635746594, |
| "learning_rate": 3.8294825884868926e-05, |
| "loss": 0.6745, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.1139240506329113, |
| "grad_norm": 0.6187620108671996, |
| "learning_rate": 3.822259893867936e-05, |
| "loss": 0.7572, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.1265822784810127, |
| "grad_norm": 0.63340304982857, |
| "learning_rate": 3.814894490525356e-05, |
| "loss": 0.735, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.139240506329114, |
| "grad_norm": 0.6506066390683658, |
| "learning_rate": 3.807386955274408e-05, |
| "loss": 0.7628, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.1518987341772151, |
| "grad_norm": 0.7725263361365678, |
| "learning_rate": 3.79973787606129e-05, |
| "loss": 0.7295, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.1645569620253164, |
| "grad_norm": 0.6852645943983061, |
| "learning_rate": 3.7919478519170917e-05, |
| "loss": 0.7416, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.1772151898734178, |
| "grad_norm": 0.47014030202252644, |
| "learning_rate": 3.7840174929108854e-05, |
| "loss": 0.7671, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.189873417721519, |
| "grad_norm": 0.7194557838652339, |
| "learning_rate": 3.775947420101948e-05, |
| "loss": 0.7862, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.2025316455696202, |
| "grad_norm": 0.6351379876630061, |
| "learning_rate": 3.767738265491122e-05, |
| "loss": 0.7313, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.2151898734177216, |
| "grad_norm": 0.5068026128077132, |
| "learning_rate": 3.759390671971325e-05, |
| "loss": 0.753, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.2278481012658227, |
| "grad_norm": 0.5153165957337801, |
| "learning_rate": 3.750905293277197e-05, |
| "loss": 0.7108, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.240506329113924, |
| "grad_norm": 0.4463914078700893, |
| "learning_rate": 3.742282793933906e-05, |
| "loss": 0.7415, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.2531645569620253, |
| "grad_norm": 0.43884131852560887, |
| "learning_rate": 3.733523849205105e-05, |
| "loss": 0.7492, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.2658227848101267, |
| "grad_norm": 0.47035841712987453, |
| "learning_rate": 3.724629145040056e-05, |
| "loss": 0.7238, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.2784810126582278, |
| "grad_norm": 0.3948950399020623, |
| "learning_rate": 3.715599378019899e-05, |
| "loss": 0.7715, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.2911392405063291, |
| "grad_norm": 0.4199178359283047, |
| "learning_rate": 3.7064352553031077e-05, |
| "loss": 0.7384, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.3037974683544304, |
| "grad_norm": 0.3810474521618989, |
| "learning_rate": 3.6971374945701076e-05, |
| "loss": 0.7311, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.3164556962025316, |
| "grad_norm": 0.43493588286043355, |
| "learning_rate": 3.687706823967073e-05, |
| "loss": 0.7197, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.3291139240506329, |
| "grad_norm": 0.41688680545253287, |
| "learning_rate": 3.6781439820488974e-05, |
| "loss": 0.7375, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.3417721518987342, |
| "grad_norm": 0.4278932668997099, |
| "learning_rate": 3.66844971772136e-05, |
| "loss": 0.7262, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.3544303797468356, |
| "grad_norm": 0.4174836859191787, |
| "learning_rate": 3.6586247901824724e-05, |
| "loss": 0.7262, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.3670886075949367, |
| "grad_norm": 0.38016614127893766, |
| "learning_rate": 3.648669968863023e-05, |
| "loss": 0.758, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.379746835443038, |
| "grad_norm": 0.3858746148822291, |
| "learning_rate": 3.6385860333663236e-05, |
| "loss": 0.7175, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.3924050632911391, |
| "grad_norm": 0.4231365876121473, |
| "learning_rate": 3.628373773407149e-05, |
| "loss": 0.7308, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.4050632911392404, |
| "grad_norm": 0.3877572843485212, |
| "learning_rate": 3.6180339887498953e-05, |
| "loss": 0.7385, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.4177215189873418, |
| "grad_norm": 0.4824392041857397, |
| "learning_rate": 3.6075674891459466e-05, |
| "loss": 0.7468, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.4303797468354431, |
| "grad_norm": 0.3857299814846945, |
| "learning_rate": 3.59697509427026e-05, |
| "loss": 0.7357, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.4430379746835442, |
| "grad_norm": 0.3846553226693622, |
| "learning_rate": 3.5862576336571725e-05, |
| "loss": 0.7079, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.4556962025316456, |
| "grad_norm": 0.3824538614301241, |
| "learning_rate": 3.575415946635437e-05, |
| "loss": 0.7305, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.4683544303797469, |
| "grad_norm": 0.48456126880227873, |
| "learning_rate": 3.564450882262491e-05, |
| "loss": 0.7712, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.481012658227848, |
| "grad_norm": 0.43078583341853544, |
| "learning_rate": 3.5533632992579644e-05, |
| "loss": 0.7397, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.4936708860759493, |
| "grad_norm": 0.3228096958908832, |
| "learning_rate": 3.542154065936429e-05, |
| "loss": 0.7628, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.5063291139240507, |
| "grad_norm": 0.36882818550163615, |
| "learning_rate": 3.530824060139396e-05, |
| "loss": 0.7512, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.518987341772152, |
| "grad_norm": 0.4152208277284043, |
| "learning_rate": 3.51937416916657e-05, |
| "loss": 0.7277, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.5316455696202531, |
| "grad_norm": 0.45042265983600566, |
| "learning_rate": 3.507805289706362e-05, |
| "loss": 0.7774, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.5443037974683544, |
| "grad_norm": 0.4093269543947324, |
| "learning_rate": 3.496118327765662e-05, |
| "loss": 0.7377, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.5569620253164556, |
| "grad_norm": 0.34666215542123036, |
| "learning_rate": 3.48431419859889e-05, |
| "loss": 0.6989, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.5696202531645569, |
| "grad_norm": 0.41171436492196756, |
| "learning_rate": 3.472393826636317e-05, |
| "loss": 0.7268, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.5822784810126582, |
| "grad_norm": 0.39148886668508454, |
| "learning_rate": 3.460358145411669e-05, |
| "loss": 0.7261, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.5949367088607596, |
| "grad_norm": 0.35889572408473175, |
| "learning_rate": 3.4482080974890163e-05, |
| "loss": 0.7589, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.6075949367088609, |
| "grad_norm": 0.31803696533319203, |
| "learning_rate": 3.4359446343889624e-05, |
| "loss": 0.7575, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.620253164556962, |
| "grad_norm": 0.3442315711735276, |
| "learning_rate": 3.4235687165141215e-05, |
| "loss": 0.7554, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.6329113924050633, |
| "grad_norm": 0.3499026072634049, |
| "learning_rate": 3.411081313073906e-05, |
| "loss": 0.7267, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.6455696202531644, |
| "grad_norm": 0.3260690025254289, |
| "learning_rate": 3.398483402008629e-05, |
| "loss": 0.7418, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.6582278481012658, |
| "grad_norm": 0.34543980322658424, |
| "learning_rate": 3.385775969912908e-05, |
| "loss": 0.7427, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.6708860759493671, |
| "grad_norm": 0.2988847374657102, |
| "learning_rate": 3.3729600119584115e-05, |
| "loss": 0.7638, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.6835443037974684, |
| "grad_norm": 0.35687120319995136, |
| "learning_rate": 3.3600365318159136e-05, |
| "loss": 0.757, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.6962025316455698, |
| "grad_norm": 0.38899552043256846, |
| "learning_rate": 3.3470065415767004e-05, |
| "loss": 0.7182, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.7088607594936709, |
| "grad_norm": 0.3737695829412944, |
| "learning_rate": 3.333871061673302e-05, |
| "loss": 0.7499, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.721518987341772, |
| "grad_norm": 0.34946652396486505, |
| "learning_rate": 3.3206311207995816e-05, |
| "loss": 0.751, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.7341772151898733, |
| "grad_norm": 0.3905323790588904, |
| "learning_rate": 3.307287755830176e-05, |
| "loss": 0.7622, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.7468354430379747, |
| "grad_norm": 0.4436210357200594, |
| "learning_rate": 3.2938420117392896e-05, |
| "loss": 0.7522, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.759493670886076, |
| "grad_norm": 0.38693991633101565, |
| "learning_rate": 3.28029494151886e-05, |
| "loss": 0.7213, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.7721518987341773, |
| "grad_norm": 0.39844510406756306, |
| "learning_rate": 3.2666476060960945e-05, |
| "loss": 0.7457, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.7848101265822784, |
| "grad_norm": 0.3438391926323602, |
| "learning_rate": 3.252901074250384e-05, |
| "loss": 0.7379, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.7974683544303798, |
| "grad_norm": 0.3706564483674349, |
| "learning_rate": 3.2390564225296015e-05, |
| "loss": 0.7182, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.810126582278481, |
| "grad_norm": 0.3372688197355965, |
| "learning_rate": 3.225114735165797e-05, |
| "loss": 0.7246, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.8227848101265822, |
| "grad_norm": 0.3490816845008961, |
| "learning_rate": 3.211077103990278e-05, |
| "loss": 0.7545, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.8354430379746836, |
| "grad_norm": 0.3243213100104822, |
| "learning_rate": 3.1969446283481157e-05, |
| "loss": 0.7326, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.8481012658227849, |
| "grad_norm": 0.33681274516080584, |
| "learning_rate": 3.18271841501204e-05, |
| "loss": 0.7286, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.8607594936708862, |
| "grad_norm": 0.3263960804788961, |
| "learning_rate": 3.1683995780957706e-05, |
| "loss": 0.7408, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.8734177215189873, |
| "grad_norm": 0.4287588920382392, |
| "learning_rate": 3.153989238966763e-05, |
| "loss": 0.7244, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.8860759493670884, |
| "grad_norm": 0.38141083746028315, |
| "learning_rate": 3.13948852615839e-05, |
| "loss": 0.784, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.8987341772151898, |
| "grad_norm": 0.34623678740897884, |
| "learning_rate": 3.124898575281562e-05, |
| "loss": 0.7436, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.9113924050632911, |
| "grad_norm": 0.31200271097016913, |
| "learning_rate": 3.11022052893579e-05, |
| "loss": 0.7289, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.9240506329113924, |
| "grad_norm": 0.345655007271936, |
| "learning_rate": 3.095455536619711e-05, |
| "loss": 0.7175, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.9367088607594938, |
| "grad_norm": 0.2985815658075589, |
| "learning_rate": 3.080604754641057e-05, |
| "loss": 0.7395, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.9493670886075949, |
| "grad_norm": 0.29992477661085093, |
| "learning_rate": 3.065669346026106e-05, |
| "loss": 0.7121, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.9620253164556962, |
| "grad_norm": 0.3091766644852409, |
| "learning_rate": 3.0506504804285977e-05, |
| "loss": 0.7742, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.9746835443037973, |
| "grad_norm": 0.3004387796291261, |
| "learning_rate": 3.0355493340381347e-05, |
| "loss": 0.7566, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.9873417721518987, |
| "grad_norm": 0.2818258498162388, |
| "learning_rate": 3.020367089488069e-05, |
| "loss": 0.7343, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.46120255431120377, |
| "learning_rate": 3.0051049357628855e-05, |
| "loss": 0.7143, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.0126582278481013, |
| "grad_norm": 0.7007547558193586, |
| "learning_rate": 2.9897640681050877e-05, |
| "loss": 0.6753, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.0253164556962027, |
| "grad_norm": 0.9125428937129891, |
| "learning_rate": 2.9743456879215934e-05, |
| "loss": 0.6672, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.037974683544304, |
| "grad_norm": 0.6576444790472206, |
| "learning_rate": 2.9588510026896485e-05, |
| "loss": 0.6357, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.050632911392405, |
| "grad_norm": 0.6240263238612856, |
| "learning_rate": 2.9432812258622615e-05, |
| "loss": 0.654, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.0632911392405062, |
| "grad_norm": 0.636252929878224, |
| "learning_rate": 2.9276375767731762e-05, |
| "loss": 0.6887, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.0759493670886076, |
| "grad_norm": 0.5494250514074286, |
| "learning_rate": 2.91192128054138e-05, |
| "loss": 0.6473, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.088607594936709, |
| "grad_norm": 0.5045894432855015, |
| "learning_rate": 2.8961335679751573e-05, |
| "loss": 0.6344, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.1012658227848102, |
| "grad_norm": 0.5005394468401111, |
| "learning_rate": 2.8802756754757023e-05, |
| "loss": 0.6585, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.1139240506329116, |
| "grad_norm": 0.49433013853859775, |
| "learning_rate": 2.864348844940292e-05, |
| "loss": 0.6407, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.1265822784810124, |
| "grad_norm": 0.4621959574809036, |
| "learning_rate": 2.8483543236650245e-05, |
| "loss": 0.6551, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.1392405063291138, |
| "grad_norm": 0.4815573164281136, |
| "learning_rate": 2.832293364247141e-05, |
| "loss": 0.646, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.151898734177215, |
| "grad_norm": 0.41298258433472074, |
| "learning_rate": 2.81616722448693e-05, |
| "loss": 0.6421, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.1645569620253164, |
| "grad_norm": 0.482755731008833, |
| "learning_rate": 2.7999771672892213e-05, |
| "loss": 0.6706, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.1772151898734178, |
| "grad_norm": 0.4103357196002942, |
| "learning_rate": 2.783724460564485e-05, |
| "loss": 0.6301, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.189873417721519, |
| "grad_norm": 0.4305743048719905, |
| "learning_rate": 2.7674103771295345e-05, |
| "loss": 0.645, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.2025316455696204, |
| "grad_norm": 0.3767776887976166, |
| "learning_rate": 2.7510361946078482e-05, |
| "loss": 0.6574, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.2151898734177213, |
| "grad_norm": 0.4176066280959923, |
| "learning_rate": 2.734603195329514e-05, |
| "loss": 0.6892, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.2278481012658227, |
| "grad_norm": 0.43231771488851645, |
| "learning_rate": 2.7181126662308015e-05, |
| "loss": 0.6604, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.240506329113924, |
| "grad_norm": 0.31970836274455994, |
| "learning_rate": 2.7015658987533807e-05, |
| "loss": 0.6631, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.2531645569620253, |
| "grad_norm": 0.40412759499171763, |
| "learning_rate": 2.684964188743182e-05, |
| "loss": 0.6245, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.2658227848101267, |
| "grad_norm": 0.33056157659688, |
| "learning_rate": 2.6683088363489118e-05, |
| "loss": 0.6591, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.278481012658228, |
| "grad_norm": 0.3825570425350689, |
| "learning_rate": 2.6516011459202356e-05, |
| "loss": 0.6735, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.291139240506329, |
| "grad_norm": 0.44429816345893774, |
| "learning_rate": 2.634842425905626e-05, |
| "loss": 0.6572, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.3037974683544302, |
| "grad_norm": 0.36759937742377186, |
| "learning_rate": 2.618033988749895e-05, |
| "loss": 0.6773, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.3164556962025316, |
| "grad_norm": 0.7066277320761736, |
| "learning_rate": 2.6011771507914096e-05, |
| "loss": 0.6712, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.329113924050633, |
| "grad_norm": 0.3328197240799398, |
| "learning_rate": 2.5842732321590034e-05, |
| "loss": 0.6364, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.3417721518987342, |
| "grad_norm": 0.32651931336905726, |
| "learning_rate": 2.5673235566685935e-05, |
| "loss": 0.6351, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.3544303797468356, |
| "grad_norm": 0.30693242039281354, |
| "learning_rate": 2.5503294517195062e-05, |
| "loss": 0.6547, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.367088607594937, |
| "grad_norm": 0.3203146955735119, |
| "learning_rate": 2.533292248190523e-05, |
| "loss": 0.6399, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.379746835443038, |
| "grad_norm": 0.34990420700832414, |
| "learning_rate": 2.5162132803356522e-05, |
| "loss": 0.647, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.392405063291139, |
| "grad_norm": 0.3521480721118919, |
| "learning_rate": 2.499093885679642e-05, |
| "loss": 0.6339, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.4050632911392404, |
| "grad_norm": 0.30986847706468584, |
| "learning_rate": 2.481935404913229e-05, |
| "loss": 0.6454, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.4177215189873418, |
| "grad_norm": 0.323632412960347, |
| "learning_rate": 2.4647391817881457e-05, |
| "loss": 0.6729, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.430379746835443, |
| "grad_norm": 0.30405952286836885, |
| "learning_rate": 2.4475065630118855e-05, |
| "loss": 0.6392, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.4430379746835444, |
| "grad_norm": 0.3154660761081758, |
| "learning_rate": 2.4302388981422355e-05, |
| "loss": 0.6673, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.4556962025316453, |
| "grad_norm": 0.2941674571513733, |
| "learning_rate": 2.4129375394815878e-05, |
| "loss": 0.6517, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.4683544303797467, |
| "grad_norm": 0.30895015825392347, |
| "learning_rate": 2.3956038419710348e-05, |
| "loss": 0.6633, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.481012658227848, |
| "grad_norm": 0.27566382378720106, |
| "learning_rate": 2.3782391630842587e-05, |
| "loss": 0.6045, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.4936708860759493, |
| "grad_norm": 0.3384225646038617, |
| "learning_rate": 2.3608448627212207e-05, |
| "loss": 0.6405, |
| "step": 197 |
| }, |
| { |
| "epoch": 2.5063291139240507, |
| "grad_norm": 0.2832906947274659, |
| "learning_rate": 2.343422303101664e-05, |
| "loss": 0.6848, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.518987341772152, |
| "grad_norm": 0.3299659781184637, |
| "learning_rate": 2.3259728486584297e-05, |
| "loss": 0.6615, |
| "step": 199 |
| }, |
| { |
| "epoch": 2.5316455696202533, |
| "grad_norm": 0.3200656687512706, |
| "learning_rate": 2.3084978659306048e-05, |
| "loss": 0.6912, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.5443037974683547, |
| "grad_norm": 0.308174008124388, |
| "learning_rate": 2.2909987234565026e-05, |
| "loss": 0.6785, |
| "step": 201 |
| }, |
| { |
| "epoch": 2.5569620253164556, |
| "grad_norm": 0.3544663621063538, |
| "learning_rate": 2.2734767916664854e-05, |
| "loss": 0.6323, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.569620253164557, |
| "grad_norm": 0.3354628232787944, |
| "learning_rate": 2.255933442775641e-05, |
| "loss": 0.6524, |
| "step": 203 |
| }, |
| { |
| "epoch": 2.5822784810126582, |
| "grad_norm": 0.32689453372708943, |
| "learning_rate": 2.2383700506763204e-05, |
| "loss": 0.6748, |
| "step": 204 |
| }, |
| { |
| "epoch": 2.5949367088607596, |
| "grad_norm": 0.31737671727647254, |
| "learning_rate": 2.22078799083054e-05, |
| "loss": 0.6363, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.607594936708861, |
| "grad_norm": 0.2825787669688371, |
| "learning_rate": 2.203188640162265e-05, |
| "loss": 0.6218, |
| "step": 206 |
| }, |
| { |
| "epoch": 2.620253164556962, |
| "grad_norm": 0.336620459599055, |
| "learning_rate": 2.185573376949578e-05, |
| "loss": 0.6731, |
| "step": 207 |
| }, |
| { |
| "epoch": 2.632911392405063, |
| "grad_norm": 0.3156677816260327, |
| "learning_rate": 2.167943580716737e-05, |
| "loss": 0.621, |
| "step": 208 |
| }, |
| { |
| "epoch": 2.6455696202531644, |
| "grad_norm": 0.34535943079382136, |
| "learning_rate": 2.150300632126142e-05, |
| "loss": 0.6288, |
| "step": 209 |
| }, |
| { |
| "epoch": 2.6582278481012658, |
| "grad_norm": 0.3124546569938381, |
| "learning_rate": 2.132645912870208e-05, |
| "loss": 0.6761, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.670886075949367, |
| "grad_norm": 0.34600235147462305, |
| "learning_rate": 2.1149808055631603e-05, |
| "loss": 0.6679, |
| "step": 211 |
| }, |
| { |
| "epoch": 2.6835443037974684, |
| "grad_norm": 0.27065178228808984, |
| "learning_rate": 2.0973066936327565e-05, |
| "loss": 0.6321, |
| "step": 212 |
| }, |
| { |
| "epoch": 2.6962025316455698, |
| "grad_norm": 0.3362060155593516, |
| "learning_rate": 2.0796249612119405e-05, |
| "loss": 0.6677, |
| "step": 213 |
| }, |
| { |
| "epoch": 2.708860759493671, |
| "grad_norm": 0.2973638329827599, |
| "learning_rate": 2.061936993030451e-05, |
| "loss": 0.6412, |
| "step": 214 |
| }, |
| { |
| "epoch": 2.721518987341772, |
| "grad_norm": 0.3220694196130532, |
| "learning_rate": 2.044244174306374e-05, |
| "loss": 0.6541, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.7341772151898733, |
| "grad_norm": 0.3027641705895448, |
| "learning_rate": 2.026547890637662e-05, |
| "loss": 0.6019, |
| "step": 216 |
| }, |
| { |
| "epoch": 2.7468354430379747, |
| "grad_norm": 0.3207247320311963, |
| "learning_rate": 2.0088495278936212e-05, |
| "loss": 0.6417, |
| "step": 217 |
| }, |
| { |
| "epoch": 2.759493670886076, |
| "grad_norm": 0.36163473633397253, |
| "learning_rate": 1.991150472106379e-05, |
| "loss": 0.6766, |
| "step": 218 |
| }, |
| { |
| "epoch": 2.7721518987341773, |
| "grad_norm": 0.2691926923083107, |
| "learning_rate": 1.9734521093623388e-05, |
| "loss": 0.6562, |
| "step": 219 |
| }, |
| { |
| "epoch": 2.7848101265822782, |
| "grad_norm": 0.34359844342765505, |
| "learning_rate": 1.9557558256936266e-05, |
| "loss": 0.6446, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.7974683544303796, |
| "grad_norm": 0.2645899195098649, |
| "learning_rate": 1.9380630069695498e-05, |
| "loss": 0.6554, |
| "step": 221 |
| }, |
| { |
| "epoch": 2.810126582278481, |
| "grad_norm": 0.29583460748029244, |
| "learning_rate": 1.9203750387880602e-05, |
| "loss": 0.6227, |
| "step": 222 |
| }, |
| { |
| "epoch": 2.8227848101265822, |
| "grad_norm": 0.264820920107283, |
| "learning_rate": 1.9026933063672445e-05, |
| "loss": 0.6447, |
| "step": 223 |
| }, |
| { |
| "epoch": 2.8354430379746836, |
| "grad_norm": 0.30771232323975206, |
| "learning_rate": 1.88501919443684e-05, |
| "loss": 0.6315, |
| "step": 224 |
| }, |
| { |
| "epoch": 2.848101265822785, |
| "grad_norm": 0.2553957352820967, |
| "learning_rate": 1.8673540871297927e-05, |
| "loss": 0.6478, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.8607594936708862, |
| "grad_norm": 0.32629305580702406, |
| "learning_rate": 1.8496993678738587e-05, |
| "loss": 0.6201, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.8734177215189876, |
| "grad_norm": 0.26966933565658424, |
| "learning_rate": 1.8320564192832634e-05, |
| "loss": 0.6299, |
| "step": 227 |
| }, |
| { |
| "epoch": 2.8860759493670884, |
| "grad_norm": 0.31554171568490863, |
| "learning_rate": 1.8144266230504227e-05, |
| "loss": 0.6681, |
| "step": 228 |
| }, |
| { |
| "epoch": 2.8987341772151898, |
| "grad_norm": 0.24839739411092723, |
| "learning_rate": 1.7968113598377356e-05, |
| "loss": 0.6602, |
| "step": 229 |
| }, |
| { |
| "epoch": 2.911392405063291, |
| "grad_norm": 0.3091165612385795, |
| "learning_rate": 1.779212009169461e-05, |
| "loss": 0.6233, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.9240506329113924, |
| "grad_norm": 0.2952522337046769, |
| "learning_rate": 1.7616299493236806e-05, |
| "loss": 0.5962, |
| "step": 231 |
| }, |
| { |
| "epoch": 2.9367088607594938, |
| "grad_norm": 0.25845370633874015, |
| "learning_rate": 1.74406655722436e-05, |
| "loss": 0.6636, |
| "step": 232 |
| }, |
| { |
| "epoch": 2.9493670886075947, |
| "grad_norm": 0.2648453369227927, |
| "learning_rate": 1.7265232083335153e-05, |
| "loss": 0.6138, |
| "step": 233 |
| }, |
| { |
| "epoch": 2.962025316455696, |
| "grad_norm": 0.273906606291419, |
| "learning_rate": 1.7090012765434974e-05, |
| "loss": 0.6455, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.9746835443037973, |
| "grad_norm": 0.27228477538008333, |
| "learning_rate": 1.6915021340693952e-05, |
| "loss": 0.6657, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.9873417721518987, |
| "grad_norm": 0.2522393915225999, |
| "learning_rate": 1.6740271513415706e-05, |
| "loss": 0.6516, |
| "step": 236 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.8108189735549292, |
| "learning_rate": 1.6565776968983365e-05, |
| "loss": 0.5983, |
| "step": 237 |
| }, |
| { |
| "epoch": 3.0126582278481013, |
| "grad_norm": 1.232958795490363, |
| "learning_rate": 1.6391551372787796e-05, |
| "loss": 0.5634, |
| "step": 238 |
| }, |
| { |
| "epoch": 3.0253164556962027, |
| "grad_norm": 0.7844357772988507, |
| "learning_rate": 1.6217608369157417e-05, |
| "loss": 0.5224, |
| "step": 239 |
| }, |
| { |
| "epoch": 3.037974683544304, |
| "grad_norm": 0.7401617998186302, |
| "learning_rate": 1.6043961580289656e-05, |
| "loss": 0.5593, |
| "step": 240 |
| }, |
| { |
| "epoch": 3.050632911392405, |
| "grad_norm": 0.7914854111998476, |
| "learning_rate": 1.5870624605184125e-05, |
| "loss": 0.5482, |
| "step": 241 |
| }, |
| { |
| "epoch": 3.0632911392405062, |
| "grad_norm": 0.41812530802608444, |
| "learning_rate": 1.569761101857765e-05, |
| "loss": 0.5137, |
| "step": 242 |
| }, |
| { |
| "epoch": 3.0759493670886076, |
| "grad_norm": 0.7451435988290669, |
| "learning_rate": 1.552493436988115e-05, |
| "loss": 0.563, |
| "step": 243 |
| }, |
| { |
| "epoch": 3.088607594936709, |
| "grad_norm": 0.49313219014296494, |
| "learning_rate": 1.5352608182118546e-05, |
| "loss": 0.5424, |
| "step": 244 |
| }, |
| { |
| "epoch": 3.1012658227848102, |
| "grad_norm": 0.49266839267171114, |
| "learning_rate": 1.5180645950867714e-05, |
| "loss": 0.5249, |
| "step": 245 |
| }, |
| { |
| "epoch": 3.1139240506329116, |
| "grad_norm": 0.4851788656855937, |
| "learning_rate": 1.5009061143203585e-05, |
| "loss": 0.578, |
| "step": 246 |
| }, |
| { |
| "epoch": 3.1265822784810124, |
| "grad_norm": 0.4031242612434171, |
| "learning_rate": 1.4837867196643481e-05, |
| "loss": 0.5394, |
| "step": 247 |
| }, |
| { |
| "epoch": 3.1392405063291138, |
| "grad_norm": 0.4659828084891507, |
| "learning_rate": 1.466707751809478e-05, |
| "loss": 0.5356, |
| "step": 248 |
| }, |
| { |
| "epoch": 3.151898734177215, |
| "grad_norm": 0.4410019177616585, |
| "learning_rate": 1.4496705482804943e-05, |
| "loss": 0.5746, |
| "step": 249 |
| }, |
| { |
| "epoch": 3.1645569620253164, |
| "grad_norm": 0.4151476929851989, |
| "learning_rate": 1.4326764433314066e-05, |
| "loss": 0.5787, |
| "step": 250 |
| }, |
| { |
| "epoch": 3.1772151898734178, |
| "grad_norm": 0.383715772905628, |
| "learning_rate": 1.4157267678409969e-05, |
| "loss": 0.527, |
| "step": 251 |
| }, |
| { |
| "epoch": 3.189873417721519, |
| "grad_norm": 0.3626013559607639, |
| "learning_rate": 1.3988228492085911e-05, |
| "loss": 0.5433, |
| "step": 252 |
| }, |
| { |
| "epoch": 3.2025316455696204, |
| "grad_norm": 0.4357243664660663, |
| "learning_rate": 1.3819660112501054e-05, |
| "loss": 0.5408, |
| "step": 253 |
| }, |
| { |
| "epoch": 3.2151898734177213, |
| "grad_norm": 0.3559742228799713, |
| "learning_rate": 1.3651575740943746e-05, |
| "loss": 0.5162, |
| "step": 254 |
| }, |
| { |
| "epoch": 3.2278481012658227, |
| "grad_norm": 0.37225929452921475, |
| "learning_rate": 1.3483988540797652e-05, |
| "loss": 0.558, |
| "step": 255 |
| }, |
| { |
| "epoch": 3.240506329113924, |
| "grad_norm": 0.37206907021086916, |
| "learning_rate": 1.331691163651089e-05, |
| "loss": 0.533, |
| "step": 256 |
| }, |
| { |
| "epoch": 3.2531645569620253, |
| "grad_norm": 0.33216380945004786, |
| "learning_rate": 1.315035811256819e-05, |
| "loss": 0.5075, |
| "step": 257 |
| }, |
| { |
| "epoch": 3.2658227848101267, |
| "grad_norm": 0.3559597126097541, |
| "learning_rate": 1.29843410124662e-05, |
| "loss": 0.537, |
| "step": 258 |
| }, |
| { |
| "epoch": 3.278481012658228, |
| "grad_norm": 0.320506398509115, |
| "learning_rate": 1.2818873337691993e-05, |
| "loss": 0.5483, |
| "step": 259 |
| }, |
| { |
| "epoch": 3.291139240506329, |
| "grad_norm": 0.42262029967412273, |
| "learning_rate": 1.265396804670487e-05, |
| "loss": 0.5238, |
| "step": 260 |
| }, |
| { |
| "epoch": 3.3037974683544302, |
| "grad_norm": 0.30332749065508885, |
| "learning_rate": 1.2489638053921525e-05, |
| "loss": 0.5401, |
| "step": 261 |
| }, |
| { |
| "epoch": 3.3164556962025316, |
| "grad_norm": 0.29727127727930525, |
| "learning_rate": 1.2325896228704656e-05, |
| "loss": 0.5448, |
| "step": 262 |
| }, |
| { |
| "epoch": 3.329113924050633, |
| "grad_norm": 0.30191023413246443, |
| "learning_rate": 1.2162755394355152e-05, |
| "loss": 0.525, |
| "step": 263 |
| }, |
| { |
| "epoch": 3.3417721518987342, |
| "grad_norm": 0.26659773269162895, |
| "learning_rate": 1.2000228327107787e-05, |
| "loss": 0.5156, |
| "step": 264 |
| }, |
| { |
| "epoch": 3.3544303797468356, |
| "grad_norm": 0.2914720363368646, |
| "learning_rate": 1.1838327755130701e-05, |
| "loss": 0.5414, |
| "step": 265 |
| }, |
| { |
| "epoch": 3.367088607594937, |
| "grad_norm": 0.27291872478588397, |
| "learning_rate": 1.1677066357528591e-05, |
| "loss": 0.5423, |
| "step": 266 |
| }, |
| { |
| "epoch": 3.379746835443038, |
| "grad_norm": 0.26389957420834564, |
| "learning_rate": 1.151645676334976e-05, |
| "loss": 0.5566, |
| "step": 267 |
| }, |
| { |
| "epoch": 3.392405063291139, |
| "grad_norm": 0.2860279883558859, |
| "learning_rate": 1.1356511550597085e-05, |
| "loss": 0.5107, |
| "step": 268 |
| }, |
| { |
| "epoch": 3.4050632911392404, |
| "grad_norm": 0.25422431621057706, |
| "learning_rate": 1.1197243245242978e-05, |
| "loss": 0.5218, |
| "step": 269 |
| }, |
| { |
| "epoch": 3.4177215189873418, |
| "grad_norm": 0.25158942663841277, |
| "learning_rate": 1.103866432024843e-05, |
| "loss": 0.5262, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.430379746835443, |
| "grad_norm": 0.2589285114479854, |
| "learning_rate": 1.0880787194586206e-05, |
| "loss": 0.5134, |
| "step": 271 |
| }, |
| { |
| "epoch": 3.4430379746835444, |
| "grad_norm": 0.2430388328861912, |
| "learning_rate": 1.0723624232268244e-05, |
| "loss": 0.5422, |
| "step": 272 |
| }, |
| { |
| "epoch": 3.4556962025316453, |
| "grad_norm": 0.25667840874038994, |
| "learning_rate": 1.0567187741377394e-05, |
| "loss": 0.5442, |
| "step": 273 |
| }, |
| { |
| "epoch": 3.4683544303797467, |
| "grad_norm": 0.2444208638869945, |
| "learning_rate": 1.0411489973103525e-05, |
| "loss": 0.5236, |
| "step": 274 |
| }, |
| { |
| "epoch": 3.481012658227848, |
| "grad_norm": 0.25520542385502243, |
| "learning_rate": 1.0256543120784074e-05, |
| "loss": 0.5307, |
| "step": 275 |
| }, |
| { |
| "epoch": 3.4936708860759493, |
| "grad_norm": 0.22684740767572276, |
| "learning_rate": 1.0102359318949136e-05, |
| "loss": 0.5146, |
| "step": 276 |
| }, |
| { |
| "epoch": 3.5063291139240507, |
| "grad_norm": 0.25241690666255856, |
| "learning_rate": 9.948950642371157e-06, |
| "loss": 0.5518, |
| "step": 277 |
| }, |
| { |
| "epoch": 3.518987341772152, |
| "grad_norm": 0.23534671335842364, |
| "learning_rate": 9.79632910511932e-06, |
| "loss": 0.5216, |
| "step": 278 |
| }, |
| { |
| "epoch": 3.5316455696202533, |
| "grad_norm": 0.24981549650103071, |
| "learning_rate": 9.64450665961866e-06, |
| "loss": 0.4958, |
| "step": 279 |
| }, |
| { |
| "epoch": 3.5443037974683547, |
| "grad_norm": 0.22674198828661088, |
| "learning_rate": 9.493495195714028e-06, |
| "loss": 0.5204, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.5569620253164556, |
| "grad_norm": 0.22144196882208078, |
| "learning_rate": 9.343306539738947e-06, |
| "loss": 0.5227, |
| "step": 281 |
| }, |
| { |
| "epoch": 3.569620253164557, |
| "grad_norm": 0.2448947065204887, |
| "learning_rate": 9.193952453589436e-06, |
| "loss": 0.5384, |
| "step": 282 |
| }, |
| { |
| "epoch": 3.5822784810126582, |
| "grad_norm": 0.23465042331194777, |
| "learning_rate": 9.045444633802891e-06, |
| "loss": 0.535, |
| "step": 283 |
| }, |
| { |
| "epoch": 3.5949367088607596, |
| "grad_norm": 0.23044954454232866, |
| "learning_rate": 8.897794710642098e-06, |
| "loss": 0.543, |
| "step": 284 |
| }, |
| { |
| "epoch": 3.607594936708861, |
| "grad_norm": 0.23172133896309619, |
| "learning_rate": 8.751014247184388e-06, |
| "loss": 0.5676, |
| "step": 285 |
| }, |
| { |
| "epoch": 3.620253164556962, |
| "grad_norm": 0.2239517484243891, |
| "learning_rate": 8.605114738416103e-06, |
| "loss": 0.5513, |
| "step": 286 |
| }, |
| { |
| "epoch": 3.632911392405063, |
| "grad_norm": 0.22102907836542232, |
| "learning_rate": 8.460107610332371e-06, |
| "loss": 0.5388, |
| "step": 287 |
| }, |
| { |
| "epoch": 3.6455696202531644, |
| "grad_norm": 0.20401099424371477, |
| "learning_rate": 8.316004219042297e-06, |
| "loss": 0.5481, |
| "step": 288 |
| }, |
| { |
| "epoch": 3.6582278481012658, |
| "grad_norm": 0.228631540338843, |
| "learning_rate": 8.172815849879607e-06, |
| "loss": 0.528, |
| "step": 289 |
| }, |
| { |
| "epoch": 3.670886075949367, |
| "grad_norm": 0.22456084983117358, |
| "learning_rate": 8.030553716518853e-06, |
| "loss": 0.5367, |
| "step": 290 |
| }, |
| { |
| "epoch": 3.6835443037974684, |
| "grad_norm": 0.2144948006077472, |
| "learning_rate": 7.889228960097228e-06, |
| "loss": 0.5495, |
| "step": 291 |
| }, |
| { |
| "epoch": 3.6962025316455698, |
| "grad_norm": 0.20727905170409366, |
| "learning_rate": 7.748852648342038e-06, |
| "loss": 0.5353, |
| "step": 292 |
| }, |
| { |
| "epoch": 3.708860759493671, |
| "grad_norm": 0.22303267730870863, |
| "learning_rate": 7.609435774703982e-06, |
| "loss": 0.5529, |
| "step": 293 |
| }, |
| { |
| "epoch": 3.721518987341772, |
| "grad_norm": 0.22134421892787995, |
| "learning_rate": 7.470989257496164e-06, |
| "loss": 0.5393, |
| "step": 294 |
| }, |
| { |
| "epoch": 3.7341772151898733, |
| "grad_norm": 0.21417557348033345, |
| "learning_rate": 7.333523939039057e-06, |
| "loss": 0.5545, |
| "step": 295 |
| }, |
| { |
| "epoch": 3.7468354430379747, |
| "grad_norm": 0.21894766689359085, |
| "learning_rate": 7.197050584811405e-06, |
| "loss": 0.5298, |
| "step": 296 |
| }, |
| { |
| "epoch": 3.759493670886076, |
| "grad_norm": 0.2168491406607123, |
| "learning_rate": 7.061579882607108e-06, |
| "loss": 0.5169, |
| "step": 297 |
| }, |
| { |
| "epoch": 3.7721518987341773, |
| "grad_norm": 0.21650711756284644, |
| "learning_rate": 6.9271224416982394e-06, |
| "loss": 0.5638, |
| "step": 298 |
| }, |
| { |
| "epoch": 3.7848101265822782, |
| "grad_norm": 0.21621186272192375, |
| "learning_rate": 6.7936887920041825e-06, |
| "loss": 0.5747, |
| "step": 299 |
| }, |
| { |
| "epoch": 3.7974683544303796, |
| "grad_norm": 0.22551644100804194, |
| "learning_rate": 6.661289383266984e-06, |
| "loss": 0.526, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.810126582278481, |
| "grad_norm": 0.21154644204339484, |
| "learning_rate": 6.529934584233e-06, |
| "loss": 0.5246, |
| "step": 301 |
| }, |
| { |
| "epoch": 3.8227848101265822, |
| "grad_norm": 0.21191179945056313, |
| "learning_rate": 6.399634681840865e-06, |
| "loss": 0.5202, |
| "step": 302 |
| }, |
| { |
| "epoch": 3.8354430379746836, |
| "grad_norm": 0.21949330053946264, |
| "learning_rate": 6.270399880415894e-06, |
| "loss": 0.5362, |
| "step": 303 |
| }, |
| { |
| "epoch": 3.848101265822785, |
| "grad_norm": 0.21496707551459943, |
| "learning_rate": 6.1422403008709255e-06, |
| "loss": 0.5578, |
| "step": 304 |
| }, |
| { |
| "epoch": 3.8607594936708862, |
| "grad_norm": 0.21879240379023276, |
| "learning_rate": 6.01516597991372e-06, |
| "loss": 0.5491, |
| "step": 305 |
| }, |
| { |
| "epoch": 3.8734177215189876, |
| "grad_norm": 0.20979663130601378, |
| "learning_rate": 5.889186869260941e-06, |
| "loss": 0.5531, |
| "step": 306 |
| }, |
| { |
| "epoch": 3.8860759493670884, |
| "grad_norm": 0.21587617052561336, |
| "learning_rate": 5.764312834858792e-06, |
| "loss": 0.55, |
| "step": 307 |
| }, |
| { |
| "epoch": 3.8987341772151898, |
| "grad_norm": 0.20020932496573068, |
| "learning_rate": 5.640553656110379e-06, |
| "loss": 0.5522, |
| "step": 308 |
| }, |
| { |
| "epoch": 3.911392405063291, |
| "grad_norm": 0.20804979119508457, |
| "learning_rate": 5.517919025109839e-06, |
| "loss": 0.5716, |
| "step": 309 |
| }, |
| { |
| "epoch": 3.9240506329113924, |
| "grad_norm": 0.20614216741584623, |
| "learning_rate": 5.396418545883318e-06, |
| "loss": 0.4969, |
| "step": 310 |
| }, |
| { |
| "epoch": 3.9367088607594938, |
| "grad_norm": 0.20211375359560393, |
| "learning_rate": 5.276061733636833e-06, |
| "loss": 0.5284, |
| "step": 311 |
| }, |
| { |
| "epoch": 3.9493670886075947, |
| "grad_norm": 0.19804610287780805, |
| "learning_rate": 5.156858014011104e-06, |
| "loss": 0.5728, |
| "step": 312 |
| }, |
| { |
| "epoch": 3.962025316455696, |
| "grad_norm": 0.20530850942815684, |
| "learning_rate": 5.038816722343387e-06, |
| "loss": 0.5184, |
| "step": 313 |
| }, |
| { |
| "epoch": 3.9746835443037973, |
| "grad_norm": 0.21108690974572725, |
| "learning_rate": 4.921947102936388e-06, |
| "loss": 0.5706, |
| "step": 314 |
| }, |
| { |
| "epoch": 3.9873417721518987, |
| "grad_norm": 0.19727815232903248, |
| "learning_rate": 4.806258308334306e-06, |
| "loss": 0.5692, |
| "step": 315 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.9831501621136417, |
| "learning_rate": 4.69175939860605e-06, |
| "loss": 0.4787, |
| "step": 316 |
| }, |
| { |
| "epoch": 4.012658227848101, |
| "grad_norm": 0.4870908032169019, |
| "learning_rate": 4.578459340635719e-06, |
| "loss": 0.4425, |
| "step": 317 |
| }, |
| { |
| "epoch": 4.025316455696203, |
| "grad_norm": 0.4998371951593972, |
| "learning_rate": 4.466367007420365e-06, |
| "loss": 0.4669, |
| "step": 318 |
| }, |
| { |
| "epoch": 4.037974683544304, |
| "grad_norm": 0.7307305347874821, |
| "learning_rate": 4.3554911773751e-06, |
| "loss": 0.4637, |
| "step": 319 |
| }, |
| { |
| "epoch": 4.050632911392405, |
| "grad_norm": 0.5837987958939078, |
| "learning_rate": 4.2458405336456395e-06, |
| "loss": 0.5016, |
| "step": 320 |
| }, |
| { |
| "epoch": 4.063291139240507, |
| "grad_norm": 0.4363099151425374, |
| "learning_rate": 4.137423663428281e-06, |
| "loss": 0.4438, |
| "step": 321 |
| }, |
| { |
| "epoch": 4.075949367088608, |
| "grad_norm": 0.38653794232932476, |
| "learning_rate": 4.0302490572973996e-06, |
| "loss": 0.4521, |
| "step": 322 |
| }, |
| { |
| "epoch": 4.0886075949367084, |
| "grad_norm": 0.4955367791540508, |
| "learning_rate": 3.924325108540534e-06, |
| "loss": 0.4322, |
| "step": 323 |
| }, |
| { |
| "epoch": 4.10126582278481, |
| "grad_norm": 0.4852002953831795, |
| "learning_rate": 3.819660112501053e-06, |
| "loss": 0.4106, |
| "step": 324 |
| }, |
| { |
| "epoch": 4.113924050632911, |
| "grad_norm": 0.4095691127815476, |
| "learning_rate": 3.7162622659285185e-06, |
| "loss": 0.4734, |
| "step": 325 |
| }, |
| { |
| "epoch": 4.1265822784810124, |
| "grad_norm": 0.3007661855202624, |
| "learning_rate": 3.614139666336769e-06, |
| "loss": 0.4694, |
| "step": 326 |
| }, |
| { |
| "epoch": 4.139240506329114, |
| "grad_norm": 0.318953245064844, |
| "learning_rate": 3.5133003113697717e-06, |
| "loss": 0.4766, |
| "step": 327 |
| }, |
| { |
| "epoch": 4.151898734177215, |
| "grad_norm": 0.4934240339772568, |
| "learning_rate": 3.413752098175285e-06, |
| "loss": 0.4708, |
| "step": 328 |
| }, |
| { |
| "epoch": 4.1645569620253164, |
| "grad_norm": 0.4367189363583746, |
| "learning_rate": 3.315502822786407e-06, |
| "loss": 0.4662, |
| "step": 329 |
| }, |
| { |
| "epoch": 4.177215189873418, |
| "grad_norm": 0.29691965725456393, |
| "learning_rate": 3.21856017951103e-06, |
| "loss": 0.4829, |
| "step": 330 |
| }, |
| { |
| "epoch": 4.189873417721519, |
| "grad_norm": 0.21949706369317049, |
| "learning_rate": 3.1229317603292707e-06, |
| "loss": 0.4508, |
| "step": 331 |
| }, |
| { |
| "epoch": 4.2025316455696204, |
| "grad_norm": 0.30563669994626963, |
| "learning_rate": 3.0286250542989215e-06, |
| "loss": 0.4324, |
| "step": 332 |
| }, |
| { |
| "epoch": 4.215189873417722, |
| "grad_norm": 0.342864683828975, |
| "learning_rate": 2.93564744696893e-06, |
| "loss": 0.4352, |
| "step": 333 |
| }, |
| { |
| "epoch": 4.227848101265823, |
| "grad_norm": 0.31679071734680664, |
| "learning_rate": 2.8440062198010187e-06, |
| "loss": 0.4622, |
| "step": 334 |
| }, |
| { |
| "epoch": 4.2405063291139244, |
| "grad_norm": 0.24543561211413686, |
| "learning_rate": 2.753708549599443e-06, |
| "loss": 0.4803, |
| "step": 335 |
| }, |
| { |
| "epoch": 4.253164556962025, |
| "grad_norm": 0.21593306695142758, |
| "learning_rate": 2.664761507948945e-06, |
| "loss": 0.4645, |
| "step": 336 |
| }, |
| { |
| "epoch": 4.265822784810126, |
| "grad_norm": 0.23125129304548442, |
| "learning_rate": 2.5771720606609486e-06, |
| "loss": 0.4732, |
| "step": 337 |
| }, |
| { |
| "epoch": 4.2784810126582276, |
| "grad_norm": 0.3013286878261682, |
| "learning_rate": 2.4909470672280334e-06, |
| "loss": 0.4543, |
| "step": 338 |
| }, |
| { |
| "epoch": 4.291139240506329, |
| "grad_norm": 0.24233569882928516, |
| "learning_rate": 2.4060932802867498e-06, |
| "loss": 0.4459, |
| "step": 339 |
| }, |
| { |
| "epoch": 4.30379746835443, |
| "grad_norm": 0.23681738509390698, |
| "learning_rate": 2.322617345088778e-06, |
| "loss": 0.4625, |
| "step": 340 |
| }, |
| { |
| "epoch": 4.3164556962025316, |
| "grad_norm": 0.20138071955548387, |
| "learning_rate": 2.2405257989805264e-06, |
| "loss": 0.4611, |
| "step": 341 |
| }, |
| { |
| "epoch": 4.329113924050633, |
| "grad_norm": 0.21014293227174918, |
| "learning_rate": 2.1598250708911504e-06, |
| "loss": 0.5027, |
| "step": 342 |
| }, |
| { |
| "epoch": 4.341772151898734, |
| "grad_norm": 0.2289994155198736, |
| "learning_rate": 2.0805214808290896e-06, |
| "loss": 0.4529, |
| "step": 343 |
| }, |
| { |
| "epoch": 4.3544303797468356, |
| "grad_norm": 0.22414927849521216, |
| "learning_rate": 2.0026212393871057e-06, |
| "loss": 0.4592, |
| "step": 344 |
| }, |
| { |
| "epoch": 4.367088607594937, |
| "grad_norm": 0.22167034678197836, |
| "learning_rate": 1.926130447255925e-06, |
| "loss": 0.4539, |
| "step": 345 |
| }, |
| { |
| "epoch": 4.379746835443038, |
| "grad_norm": 0.18660957212383408, |
| "learning_rate": 1.8510550947464479e-06, |
| "loss": 0.4544, |
| "step": 346 |
| }, |
| { |
| "epoch": 4.3924050632911396, |
| "grad_norm": 0.18945678506406408, |
| "learning_rate": 1.7774010613206406e-06, |
| "loss": 0.4953, |
| "step": 347 |
| }, |
| { |
| "epoch": 4.405063291139241, |
| "grad_norm": 0.19337415817857806, |
| "learning_rate": 1.7051741151310786e-06, |
| "loss": 0.4648, |
| "step": 348 |
| }, |
| { |
| "epoch": 4.417721518987342, |
| "grad_norm": 0.19973937848994527, |
| "learning_rate": 1.6343799125692194e-06, |
| "loss": 0.4364, |
| "step": 349 |
| }, |
| { |
| "epoch": 4.430379746835443, |
| "grad_norm": 0.1999439373837463, |
| "learning_rate": 1.5650239978224346e-06, |
| "loss": 0.4487, |
| "step": 350 |
| }, |
| { |
| "epoch": 4.443037974683544, |
| "grad_norm": 0.19944223834974942, |
| "learning_rate": 1.4971118024398124e-06, |
| "loss": 0.5032, |
| "step": 351 |
| }, |
| { |
| "epoch": 4.455696202531645, |
| "grad_norm": 0.18772542243736196, |
| "learning_rate": 1.4306486449068002e-06, |
| "loss": 0.4732, |
| "step": 352 |
| }, |
| { |
| "epoch": 4.468354430379747, |
| "grad_norm": 0.1851071919628138, |
| "learning_rate": 1.365639730228685e-06, |
| "loss": 0.4691, |
| "step": 353 |
| }, |
| { |
| "epoch": 4.481012658227848, |
| "grad_norm": 0.19283845554051482, |
| "learning_rate": 1.3020901495229632e-06, |
| "loss": 0.4812, |
| "step": 354 |
| }, |
| { |
| "epoch": 4.493670886075949, |
| "grad_norm": 0.17316072830820867, |
| "learning_rate": 1.240004879620651e-06, |
| "loss": 0.4464, |
| "step": 355 |
| }, |
| { |
| "epoch": 4.506329113924051, |
| "grad_norm": 0.26638042061870965, |
| "learning_rate": 1.1793887826765094e-06, |
| "loss": 0.4568, |
| "step": 356 |
| }, |
| { |
| "epoch": 4.518987341772152, |
| "grad_norm": 0.1769364498679372, |
| "learning_rate": 1.1202466057882777e-06, |
| "loss": 0.4351, |
| "step": 357 |
| }, |
| { |
| "epoch": 4.531645569620253, |
| "grad_norm": 0.21054583188705406, |
| "learning_rate": 1.0625829806249133e-06, |
| "loss": 0.4736, |
| "step": 358 |
| }, |
| { |
| "epoch": 4.544303797468355, |
| "grad_norm": 0.18281527921298735, |
| "learning_rate": 1.0064024230638547e-06, |
| "loss": 0.464, |
| "step": 359 |
| }, |
| { |
| "epoch": 4.556962025316456, |
| "grad_norm": 0.18749354572565427, |
| "learning_rate": 9.517093328373739e-07, |
| "loss": 0.4347, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.569620253164557, |
| "grad_norm": 0.19801669246463768, |
| "learning_rate": 8.985079931880114e-07, |
| "loss": 0.4556, |
| "step": 361 |
| }, |
| { |
| "epoch": 4.582278481012658, |
| "grad_norm": 0.1707845437181247, |
| "learning_rate": 8.468025705331406e-07, |
| "loss": 0.4678, |
| "step": 362 |
| }, |
| { |
| "epoch": 4.594936708860759, |
| "grad_norm": 0.17927167077263217, |
| "learning_rate": 7.965971141386708e-07, |
| "loss": 0.4805, |
| "step": 363 |
| }, |
| { |
| "epoch": 4.6075949367088604, |
| "grad_norm": 0.18123408677924785, |
| "learning_rate": 7.478955558019408e-07, |
| "loss": 0.509, |
| "step": 364 |
| }, |
| { |
| "epoch": 4.620253164556962, |
| "grad_norm": 0.17934939572634542, |
| "learning_rate": 7.007017095438029e-07, |
| "loss": 0.4385, |
| "step": 365 |
| }, |
| { |
| "epoch": 4.632911392405063, |
| "grad_norm": 0.18707997046238803, |
| "learning_rate": 6.550192713099224e-07, |
| "loss": 0.4341, |
| "step": 366 |
| }, |
| { |
| "epoch": 4.6455696202531644, |
| "grad_norm": 0.18084289990559332, |
| "learning_rate": 6.108518186813462e-07, |
| "loss": 0.4702, |
| "step": 367 |
| }, |
| { |
| "epoch": 4.658227848101266, |
| "grad_norm": 0.17471257906003163, |
| "learning_rate": 5.682028105943161e-07, |
| "loss": 0.4348, |
| "step": 368 |
| }, |
| { |
| "epoch": 4.670886075949367, |
| "grad_norm": 0.17312815925285896, |
| "learning_rate": 5.270755870693877e-07, |
| "loss": 0.4369, |
| "step": 369 |
| }, |
| { |
| "epoch": 4.6835443037974684, |
| "grad_norm": 0.17442658053131146, |
| "learning_rate": 4.874733689498645e-07, |
| "loss": 0.4593, |
| "step": 370 |
| }, |
| { |
| "epoch": 4.69620253164557, |
| "grad_norm": 0.17553291293796242, |
| "learning_rate": 4.493992576495609e-07, |
| "loss": 0.4607, |
| "step": 371 |
| }, |
| { |
| "epoch": 4.708860759493671, |
| "grad_norm": 0.17573326550033613, |
| "learning_rate": 4.1285623490990413e-07, |
| "loss": 0.4422, |
| "step": 372 |
| }, |
| { |
| "epoch": 4.7215189873417724, |
| "grad_norm": 0.17271762501593244, |
| "learning_rate": 3.778471625664404e-07, |
| "loss": 0.4476, |
| "step": 373 |
| }, |
| { |
| "epoch": 4.734177215189874, |
| "grad_norm": 0.16852411866432154, |
| "learning_rate": 3.4437478232470123e-07, |
| "loss": 0.4392, |
| "step": 374 |
| }, |
| { |
| "epoch": 4.746835443037975, |
| "grad_norm": 0.17227172970015317, |
| "learning_rate": 3.124417155454884e-07, |
| "loss": 0.433, |
| "step": 375 |
| }, |
| { |
| "epoch": 4.759493670886076, |
| "grad_norm": 0.16744296803228972, |
| "learning_rate": 2.820504630395915e-07, |
| "loss": 0.4389, |
| "step": 376 |
| }, |
| { |
| "epoch": 4.772151898734177, |
| "grad_norm": 0.16903573661066207, |
| "learning_rate": 2.532034048719312e-07, |
| "loss": 0.4574, |
| "step": 377 |
| }, |
| { |
| "epoch": 4.784810126582278, |
| "grad_norm": 0.17807232065432885, |
| "learning_rate": 2.259028001751773e-07, |
| "loss": 0.4046, |
| "step": 378 |
| }, |
| { |
| "epoch": 4.7974683544303796, |
| "grad_norm": 0.18476464618425043, |
| "learning_rate": 2.0015078697281477e-07, |
| "loss": 0.4509, |
| "step": 379 |
| }, |
| { |
| "epoch": 4.810126582278481, |
| "grad_norm": 0.16737994039781423, |
| "learning_rate": 1.7594938201170863e-07, |
| "loss": 0.4227, |
| "step": 380 |
| }, |
| { |
| "epoch": 4.822784810126582, |
| "grad_norm": 0.2648344981339244, |
| "learning_rate": 1.533004806041727e-07, |
| "loss": 0.4483, |
| "step": 381 |
| }, |
| { |
| "epoch": 4.8354430379746836, |
| "grad_norm": 0.16978562036616543, |
| "learning_rate": 1.3220585647953256e-07, |
| "loss": 0.4477, |
| "step": 382 |
| }, |
| { |
| "epoch": 4.848101265822785, |
| "grad_norm": 0.1642498077680966, |
| "learning_rate": 1.1266716164521906e-07, |
| "loss": 0.4584, |
| "step": 383 |
| }, |
| { |
| "epoch": 4.860759493670886, |
| "grad_norm": 0.1690920633180304, |
| "learning_rate": 9.46859262573896e-08, |
| "loss": 0.4531, |
| "step": 384 |
| }, |
| { |
| "epoch": 4.8734177215189876, |
| "grad_norm": 0.17582359187836866, |
| "learning_rate": 7.826355850110378e-08, |
| "loss": 0.4448, |
| "step": 385 |
| }, |
| { |
| "epoch": 4.886075949367089, |
| "grad_norm": 0.17280369527452052, |
| "learning_rate": 6.340134448002966e-08, |
| "loss": 0.4594, |
| "step": 386 |
| }, |
| { |
| "epoch": 4.89873417721519, |
| "grad_norm": 0.16986660465712172, |
| "learning_rate": 5.0100448115737446e-08, |
| "loss": 0.4524, |
| "step": 387 |
| }, |
| { |
| "epoch": 4.911392405063291, |
| "grad_norm": 0.1791349245983108, |
| "learning_rate": 3.836191105654141e-08, |
| "loss": 0.4791, |
| "step": 388 |
| }, |
| { |
| "epoch": 4.924050632911392, |
| "grad_norm": 0.1651267565065545, |
| "learning_rate": 2.8186652595918464e-08, |
| "loss": 0.4937, |
| "step": 389 |
| }, |
| { |
| "epoch": 4.936708860759493, |
| "grad_norm": 0.17381577748034596, |
| "learning_rate": 1.957546960052792e-08, |
| "loss": 0.457, |
| "step": 390 |
| }, |
| { |
| "epoch": 4.949367088607595, |
| "grad_norm": 0.16169549660015017, |
| "learning_rate": 1.2529036447792576e-08, |
| "loss": 0.4568, |
| "step": 391 |
| }, |
| { |
| "epoch": 4.962025316455696, |
| "grad_norm": 0.1649898709443881, |
| "learning_rate": 7.0479049730920454e-09, |
| "loss": 0.4363, |
| "step": 392 |
| }, |
| { |
| "epoch": 4.974683544303797, |
| "grad_norm": 0.171903361970066, |
| "learning_rate": 3.132504426548444e-09, |
| "loss": 0.4822, |
| "step": 393 |
| }, |
| { |
| "epoch": 4.987341772151899, |
| "grad_norm": 0.1729850210189732, |
| "learning_rate": 7.831414393999481e-10, |
| "loss": 0.4518, |
| "step": 394 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.38031759897505635, |
| "learning_rate": 0.0, |
| "loss": 0.4016, |
| "step": 395 |
| }, |
| { |
| "epoch": 5.0, |
| "step": 395, |
| "total_flos": 5028535166042112.0, |
| "train_loss": 0.637588576350031, |
| "train_runtime": 11226.821, |
| "train_samples_per_second": 4.454, |
| "train_steps_per_second": 0.035 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 395, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5028535166042112.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|