| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.955414012738854, | |
| "eval_steps": 500, | |
| "global_step": 364, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01910828025477707, | |
| "grad_norm": 2.6096313072602495, | |
| "learning_rate": 5.405405405405406e-09, | |
| "loss": 0.4868, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.03821656050955414, | |
| "grad_norm": 2.485833239472616, | |
| "learning_rate": 1.0810810810810811e-08, | |
| "loss": 0.4952, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.05732484076433121, | |
| "grad_norm": 2.4354566981342787, | |
| "learning_rate": 1.6216216216216218e-08, | |
| "loss": 0.5004, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.07643312101910828, | |
| "grad_norm": 2.69399408718244, | |
| "learning_rate": 2.1621621621621623e-08, | |
| "loss": 0.4742, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.09554140127388536, | |
| "grad_norm": 2.668365174634573, | |
| "learning_rate": 2.7027027027027028e-08, | |
| "loss": 0.4986, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.11464968152866242, | |
| "grad_norm": 2.5930151821263263, | |
| "learning_rate": 3.2432432432432436e-08, | |
| "loss": 0.4814, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.1337579617834395, | |
| "grad_norm": 2.501772570078749, | |
| "learning_rate": 3.783783783783784e-08, | |
| "loss": 0.4915, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.15286624203821655, | |
| "grad_norm": 2.397400915307242, | |
| "learning_rate": 4.3243243243243246e-08, | |
| "loss": 0.4875, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.17197452229299362, | |
| "grad_norm": 2.490822236098687, | |
| "learning_rate": 4.864864864864865e-08, | |
| "loss": 0.491, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.1910828025477707, | |
| "grad_norm": 2.5012006390851873, | |
| "learning_rate": 5.4054054054054056e-08, | |
| "loss": 0.5067, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.21019108280254778, | |
| "grad_norm": 2.6416774026317325, | |
| "learning_rate": 5.945945945945946e-08, | |
| "loss": 0.4942, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.22929936305732485, | |
| "grad_norm": 2.7487556432425997, | |
| "learning_rate": 6.486486486486487e-08, | |
| "loss": 0.5075, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.2484076433121019, | |
| "grad_norm": 2.4970308931627256, | |
| "learning_rate": 7.027027027027027e-08, | |
| "loss": 0.4895, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.267515923566879, | |
| "grad_norm": 2.5273192931380897, | |
| "learning_rate": 7.567567567567568e-08, | |
| "loss": 0.4813, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.28662420382165604, | |
| "grad_norm": 2.6313225516871395, | |
| "learning_rate": 8.108108108108108e-08, | |
| "loss": 0.4867, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.3057324840764331, | |
| "grad_norm": 2.3941944376719113, | |
| "learning_rate": 8.648648648648649e-08, | |
| "loss": 0.4816, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.3248407643312102, | |
| "grad_norm": 2.396927453705762, | |
| "learning_rate": 9.189189189189189e-08, | |
| "loss": 0.487, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.34394904458598724, | |
| "grad_norm": 2.4427598740118777, | |
| "learning_rate": 9.72972972972973e-08, | |
| "loss": 0.483, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.3630573248407643, | |
| "grad_norm": 2.3754544947268035, | |
| "learning_rate": 1.027027027027027e-07, | |
| "loss": 0.4837, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.3821656050955414, | |
| "grad_norm": 2.564444044871758, | |
| "learning_rate": 1.0810810810810811e-07, | |
| "loss": 0.5065, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.4012738853503185, | |
| "grad_norm": 2.4115957977957274, | |
| "learning_rate": 1.135135135135135e-07, | |
| "loss": 0.4962, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.42038216560509556, | |
| "grad_norm": 2.5329095962208665, | |
| "learning_rate": 1.1891891891891891e-07, | |
| "loss": 0.4996, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.4394904458598726, | |
| "grad_norm": 2.4093237985701506, | |
| "learning_rate": 1.2432432432432432e-07, | |
| "loss": 0.4972, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.4585987261146497, | |
| "grad_norm": 2.389370360307483, | |
| "learning_rate": 1.2972972972972974e-07, | |
| "loss": 0.4831, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.47770700636942676, | |
| "grad_norm": 2.4000353855448058, | |
| "learning_rate": 1.3513513513513512e-07, | |
| "loss": 0.5065, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.4968152866242038, | |
| "grad_norm": 2.489575248228843, | |
| "learning_rate": 1.4054054054054055e-07, | |
| "loss": 0.4716, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.5159235668789809, | |
| "grad_norm": 2.5917592445480784, | |
| "learning_rate": 1.4594594594594595e-07, | |
| "loss": 0.5162, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.535031847133758, | |
| "grad_norm": 2.681299634238704, | |
| "learning_rate": 1.5135135135135135e-07, | |
| "loss": 0.4704, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.554140127388535, | |
| "grad_norm": 2.608824979278355, | |
| "learning_rate": 1.5675675675675675e-07, | |
| "loss": 0.5204, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.5732484076433121, | |
| "grad_norm": 2.395493492503365, | |
| "learning_rate": 1.6216216216216215e-07, | |
| "loss": 0.4752, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.5923566878980892, | |
| "grad_norm": 2.3454050765128804, | |
| "learning_rate": 1.6756756756756755e-07, | |
| "loss": 0.4821, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.6114649681528662, | |
| "grad_norm": 2.500652429228352, | |
| "learning_rate": 1.7297297297297298e-07, | |
| "loss": 0.4933, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.6305732484076433, | |
| "grad_norm": 2.287679137272454, | |
| "learning_rate": 1.7837837837837836e-07, | |
| "loss": 0.4781, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.6496815286624203, | |
| "grad_norm": 2.3784928759155934, | |
| "learning_rate": 1.8378378378378379e-07, | |
| "loss": 0.4969, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.6687898089171974, | |
| "grad_norm": 2.44085570936594, | |
| "learning_rate": 1.891891891891892e-07, | |
| "loss": 0.523, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.6878980891719745, | |
| "grad_norm": 2.31145296336897, | |
| "learning_rate": 1.945945945945946e-07, | |
| "loss": 0.4863, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.7070063694267515, | |
| "grad_norm": 2.3645970243471623, | |
| "learning_rate": 2e-07, | |
| "loss": 0.4936, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.7261146496815286, | |
| "grad_norm": 2.3453615625368553, | |
| "learning_rate": 1.999953850085163e-07, | |
| "loss": 0.4848, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.7452229299363057, | |
| "grad_norm": 2.3293217503132806, | |
| "learning_rate": 1.999815404600282e-07, | |
| "loss": 0.4893, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.7643312101910829, | |
| "grad_norm": 2.1222113857893237, | |
| "learning_rate": 1.999584676323851e-07, | |
| "loss": 0.4644, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.7834394904458599, | |
| "grad_norm": 2.255275631085622, | |
| "learning_rate": 1.9992616865520512e-07, | |
| "loss": 0.4541, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.802547770700637, | |
| "grad_norm": 2.012214823415202, | |
| "learning_rate": 1.998846465096783e-07, | |
| "loss": 0.466, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.821656050955414, | |
| "grad_norm": 2.189296418289646, | |
| "learning_rate": 1.9983390502829166e-07, | |
| "loss": 0.4717, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.8407643312101911, | |
| "grad_norm": 2.099897798957949, | |
| "learning_rate": 1.9977394889447523e-07, | |
| "loss": 0.4574, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.8598726114649682, | |
| "grad_norm": 2.188648446058437, | |
| "learning_rate": 1.9970478364216996e-07, | |
| "loss": 0.4674, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.8789808917197452, | |
| "grad_norm": 1.9210110621178644, | |
| "learning_rate": 1.996264156553169e-07, | |
| "loss": 0.4529, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.8980891719745223, | |
| "grad_norm": 1.9817997941615975, | |
| "learning_rate": 1.9953885216726785e-07, | |
| "loss": 0.4677, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.9171974522292994, | |
| "grad_norm": 2.1547734650281276, | |
| "learning_rate": 1.9944210126011788e-07, | |
| "loss": 0.4752, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.9363057324840764, | |
| "grad_norm": 1.8645798339123572, | |
| "learning_rate": 1.9933617186395914e-07, | |
| "loss": 0.4428, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.9554140127388535, | |
| "grad_norm": 2.016567397630417, | |
| "learning_rate": 1.9922107375605698e-07, | |
| "loss": 0.4816, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.9745222929936306, | |
| "grad_norm": 1.786631190203663, | |
| "learning_rate": 1.990968175599471e-07, | |
| "loss": 0.4674, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.9936305732484076, | |
| "grad_norm": 2.04989081734429, | |
| "learning_rate": 1.9896341474445524e-07, | |
| "loss": 0.4748, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.0127388535031847, | |
| "grad_norm": 1.9769794080531382, | |
| "learning_rate": 1.9882087762263852e-07, | |
| "loss": 0.4728, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 1.0318471337579618, | |
| "grad_norm": 1.9938931161500741, | |
| "learning_rate": 1.9866921935064905e-07, | |
| "loss": 0.453, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 1.0509554140127388, | |
| "grad_norm": 1.7014310774732082, | |
| "learning_rate": 1.9850845392651947e-07, | |
| "loss": 0.4541, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 1.070063694267516, | |
| "grad_norm": 1.7639902909661307, | |
| "learning_rate": 1.983385961888711e-07, | |
| "loss": 0.4484, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.089171974522293, | |
| "grad_norm": 1.872307114043005, | |
| "learning_rate": 1.981596618155441e-07, | |
| "loss": 0.4779, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 1.10828025477707, | |
| "grad_norm": 1.9070131132592882, | |
| "learning_rate": 1.9797166732215075e-07, | |
| "loss": 0.4376, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 1.127388535031847, | |
| "grad_norm": 1.8165653538892719, | |
| "learning_rate": 1.977746300605507e-07, | |
| "loss": 0.4424, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 1.1464968152866242, | |
| "grad_norm": 1.7003231286369576, | |
| "learning_rate": 1.9756856821724967e-07, | |
| "loss": 0.4485, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.1656050955414012, | |
| "grad_norm": 1.676286306752662, | |
| "learning_rate": 1.9735350081172067e-07, | |
| "loss": 0.449, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 1.1847133757961783, | |
| "grad_norm": 1.7123905470024705, | |
| "learning_rate": 1.9712944769464862e-07, | |
| "loss": 0.4622, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 1.2038216560509554, | |
| "grad_norm": 1.4763448653935922, | |
| "learning_rate": 1.9689642954609806e-07, | |
| "loss": 0.4396, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 1.2229299363057324, | |
| "grad_norm": 1.6374280386081919, | |
| "learning_rate": 1.966544678736044e-07, | |
| "loss": 0.4535, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 1.2420382165605095, | |
| "grad_norm": 1.4620089914563061, | |
| "learning_rate": 1.9640358501018882e-07, | |
| "loss": 0.4296, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 1.2611464968152866, | |
| "grad_norm": 1.5335907875182524, | |
| "learning_rate": 1.961438041122969e-07, | |
| "loss": 0.4386, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 1.2802547770700636, | |
| "grad_norm": 1.5655814455212236, | |
| "learning_rate": 1.9587514915766122e-07, | |
| "loss": 0.4462, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 1.2993630573248407, | |
| "grad_norm": 1.4879500094292435, | |
| "learning_rate": 1.9559764494308834e-07, | |
| "loss": 0.436, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 1.3184713375796178, | |
| "grad_norm": 1.4464449790937988, | |
| "learning_rate": 1.9531131708217004e-07, | |
| "loss": 0.458, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 1.3375796178343948, | |
| "grad_norm": 1.4077038156465773, | |
| "learning_rate": 1.9501619200291905e-07, | |
| "loss": 0.4233, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.356687898089172, | |
| "grad_norm": 1.4596682102109033, | |
| "learning_rate": 1.9471229694533e-07, | |
| "loss": 0.4322, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 1.3757961783439492, | |
| "grad_norm": 1.299363763403682, | |
| "learning_rate": 1.9439965995886488e-07, | |
| "loss": 0.4383, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.394904458598726, | |
| "grad_norm": 1.4211694941800017, | |
| "learning_rate": 1.9407830989986428e-07, | |
| "loss": 0.4404, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 1.4140127388535033, | |
| "grad_norm": 1.4172304910905302, | |
| "learning_rate": 1.9374827642888395e-07, | |
| "loss": 0.4323, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 1.4331210191082802, | |
| "grad_norm": 1.3076536857533756, | |
| "learning_rate": 1.9340959000795706e-07, | |
| "loss": 0.4009, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.4522292993630574, | |
| "grad_norm": 1.3024081707340522, | |
| "learning_rate": 1.9306228189778253e-07, | |
| "loss": 0.4499, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 1.4713375796178343, | |
| "grad_norm": 1.3118587098446126, | |
| "learning_rate": 1.927063841548398e-07, | |
| "loss": 0.4322, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 1.4904458598726116, | |
| "grad_norm": 1.350632356042114, | |
| "learning_rate": 1.923419296284299e-07, | |
| "loss": 0.4321, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.5095541401273884, | |
| "grad_norm": 1.3513117365592737, | |
| "learning_rate": 1.919689519576436e-07, | |
| "loss": 0.4334, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 1.5286624203821657, | |
| "grad_norm": 1.302417278861503, | |
| "learning_rate": 1.9158748556825634e-07, | |
| "loss": 0.4316, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.5477707006369426, | |
| "grad_norm": 1.367016731118727, | |
| "learning_rate": 1.911975656695509e-07, | |
| "loss": 0.4387, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.5668789808917198, | |
| "grad_norm": 1.34754516501708, | |
| "learning_rate": 1.907992282510675e-07, | |
| "loss": 0.4372, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.5859872611464967, | |
| "grad_norm": 1.2443808808257364, | |
| "learning_rate": 1.90392510079282e-07, | |
| "loss": 0.4299, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.605095541401274, | |
| "grad_norm": 1.389465502979021, | |
| "learning_rate": 1.8997744869421245e-07, | |
| "loss": 0.4319, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.6242038216560508, | |
| "grad_norm": 1.3395108076243718, | |
| "learning_rate": 1.8955408240595392e-07, | |
| "loss": 0.4154, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.643312101910828, | |
| "grad_norm": 1.399829650509336, | |
| "learning_rate": 1.8912245029114278e-07, | |
| "loss": 0.4227, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.662420382165605, | |
| "grad_norm": 1.2821155822840626, | |
| "learning_rate": 1.8868259218934966e-07, | |
| "loss": 0.4424, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.6815286624203822, | |
| "grad_norm": 1.228514092123298, | |
| "learning_rate": 1.882345486994024e-07, | |
| "loss": 0.4211, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.700636942675159, | |
| "grad_norm": 1.199257741732802, | |
| "learning_rate": 1.877783611756389e-07, | |
| "loss": 0.4271, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.7197452229299364, | |
| "grad_norm": 1.2278711006510985, | |
| "learning_rate": 1.8731407172408987e-07, | |
| "loss": 0.4309, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.7388535031847132, | |
| "grad_norm": 1.2446640718011999, | |
| "learning_rate": 1.8684172319859257e-07, | |
| "loss": 0.4203, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.7579617834394905, | |
| "grad_norm": 1.2657944122598679, | |
| "learning_rate": 1.863613591968355e-07, | |
| "loss": 0.4431, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.7770700636942676, | |
| "grad_norm": 1.212712280704066, | |
| "learning_rate": 1.8587302405633417e-07, | |
| "loss": 0.4304, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.7961783439490446, | |
| "grad_norm": 1.1895034383874779, | |
| "learning_rate": 1.8537676285033885e-07, | |
| "loss": 0.4083, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.8152866242038217, | |
| "grad_norm": 1.1891183671713428, | |
| "learning_rate": 1.848726213836744e-07, | |
| "loss": 0.4244, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.8343949044585988, | |
| "grad_norm": 1.181853541101605, | |
| "learning_rate": 1.8436064618851224e-07, | |
| "loss": 0.434, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.8535031847133758, | |
| "grad_norm": 1.1581194250533744, | |
| "learning_rate": 1.8384088452007576e-07, | |
| "loss": 0.4225, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.872611464968153, | |
| "grad_norm": 1.0795760687778084, | |
| "learning_rate": 1.8331338435227837e-07, | |
| "loss": 0.4103, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.89171974522293, | |
| "grad_norm": 1.111129745427032, | |
| "learning_rate": 1.8277819437329574e-07, | |
| "loss": 0.4279, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.910828025477707, | |
| "grad_norm": 1.1105627528286264, | |
| "learning_rate": 1.8223536398107174e-07, | |
| "loss": 0.4129, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.929936305732484, | |
| "grad_norm": 1.0293128045168645, | |
| "learning_rate": 1.8168494327875916e-07, | |
| "loss": 0.4042, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.9490445859872612, | |
| "grad_norm": 1.029497275251704, | |
| "learning_rate": 1.8112698307009504e-07, | |
| "loss": 0.4157, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.9681528662420382, | |
| "grad_norm": 1.05771473532387, | |
| "learning_rate": 1.8056153485471165e-07, | |
| "loss": 0.4163, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.9872611464968153, | |
| "grad_norm": 1.044232706779553, | |
| "learning_rate": 1.7998865082338287e-07, | |
| "loss": 0.411, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 2.0063694267515926, | |
| "grad_norm": 1.0728042673445248, | |
| "learning_rate": 1.7940838385320732e-07, | |
| "loss": 0.4282, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 2.0254777070063694, | |
| "grad_norm": 1.0124517008376481, | |
| "learning_rate": 1.788207875027274e-07, | |
| "loss": 0.3981, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 2.0445859872611467, | |
| "grad_norm": 0.9892098063897707, | |
| "learning_rate": 1.7822591600698629e-07, | |
| "loss": 0.4033, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 2.0636942675159236, | |
| "grad_norm": 0.9882249812487839, | |
| "learning_rate": 1.7762382427252165e-07, | |
| "loss": 0.4124, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 2.082802547770701, | |
| "grad_norm": 1.0014484680247562, | |
| "learning_rate": 1.7701456787229803e-07, | |
| "loss": 0.4155, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 2.1019108280254777, | |
| "grad_norm": 0.9692056423689174, | |
| "learning_rate": 1.7639820304057742e-07, | |
| "loss": 0.4005, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.121019108280255, | |
| "grad_norm": 0.9672739874787071, | |
| "learning_rate": 1.7577478666772882e-07, | |
| "loss": 0.4239, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 2.140127388535032, | |
| "grad_norm": 0.9718929588628977, | |
| "learning_rate": 1.7514437629497717e-07, | |
| "loss": 0.3962, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 2.159235668789809, | |
| "grad_norm": 0.986945591632025, | |
| "learning_rate": 1.7450703010909262e-07, | |
| "loss": 0.4134, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 2.178343949044586, | |
| "grad_norm": 0.9390145485362039, | |
| "learning_rate": 1.738628069370195e-07, | |
| "loss": 0.3845, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 2.1974522292993632, | |
| "grad_norm": 1.0524118338702655, | |
| "learning_rate": 1.7321176624044687e-07, | |
| "loss": 0.4186, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 2.21656050955414, | |
| "grad_norm": 1.0444330849016288, | |
| "learning_rate": 1.7255396811032013e-07, | |
| "loss": 0.4024, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 2.2356687898089174, | |
| "grad_norm": 0.951189410440023, | |
| "learning_rate": 1.718894732612947e-07, | |
| "loss": 0.4007, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 2.254777070063694, | |
| "grad_norm": 0.9707318478778935, | |
| "learning_rate": 1.7121834302613186e-07, | |
| "loss": 0.4081, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 2.2738853503184715, | |
| "grad_norm": 0.9577436750510042, | |
| "learning_rate": 1.7054063935003812e-07, | |
| "loss": 0.407, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 2.2929936305732483, | |
| "grad_norm": 0.9668619767258039, | |
| "learning_rate": 1.6985642478494727e-07, | |
| "loss": 0.4095, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.3121019108280256, | |
| "grad_norm": 1.0171913879297245, | |
| "learning_rate": 1.6916576248374716e-07, | |
| "loss": 0.4069, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 2.3312101910828025, | |
| "grad_norm": 1.0128147920459283, | |
| "learning_rate": 1.684687161944506e-07, | |
| "loss": 0.3945, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 2.3503184713375798, | |
| "grad_norm": 0.9324167222212655, | |
| "learning_rate": 1.6776535025431129e-07, | |
| "loss": 0.3979, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 2.3694267515923566, | |
| "grad_norm": 0.9284341391341622, | |
| "learning_rate": 1.6705572958388573e-07, | |
| "loss": 0.3799, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 2.388535031847134, | |
| "grad_norm": 0.9419873392238237, | |
| "learning_rate": 1.6633991968104092e-07, | |
| "loss": 0.4152, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 2.4076433121019107, | |
| "grad_norm": 0.9461512919509681, | |
| "learning_rate": 1.6561798661490902e-07, | |
| "loss": 0.42, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 2.426751592356688, | |
| "grad_norm": 0.8953846015326995, | |
| "learning_rate": 1.6488999701978902e-07, | |
| "loss": 0.3988, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 2.445859872611465, | |
| "grad_norm": 0.8817601115413886, | |
| "learning_rate": 1.6415601808899658e-07, | |
| "loss": 0.3941, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 2.464968152866242, | |
| "grad_norm": 0.9329382880500549, | |
| "learning_rate": 1.63416117568662e-07, | |
| "loss": 0.4108, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 2.484076433121019, | |
| "grad_norm": 0.9466218469022187, | |
| "learning_rate": 1.6267036375147723e-07, | |
| "loss": 0.3977, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.5031847133757963, | |
| "grad_norm": 0.9073527766440519, | |
| "learning_rate": 1.6191882547039266e-07, | |
| "loss": 0.3973, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 2.522292993630573, | |
| "grad_norm": 0.9277154102961712, | |
| "learning_rate": 1.6116157209226352e-07, | |
| "loss": 0.3842, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 2.5414012738853504, | |
| "grad_norm": 0.9132161554797611, | |
| "learning_rate": 1.6039867351144777e-07, | |
| "loss": 0.39, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 2.5605095541401273, | |
| "grad_norm": 0.9506909219497836, | |
| "learning_rate": 1.5963020014335436e-07, | |
| "loss": 0.3836, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 2.5796178343949046, | |
| "grad_norm": 0.9031771763957327, | |
| "learning_rate": 1.5885622291794428e-07, | |
| "loss": 0.4173, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 2.5987261146496814, | |
| "grad_norm": 0.9589857428653794, | |
| "learning_rate": 1.580768132731837e-07, | |
| "loss": 0.3959, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 2.6178343949044587, | |
| "grad_norm": 0.9063396256825602, | |
| "learning_rate": 1.5729204314845e-07, | |
| "loss": 0.41, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 2.6369426751592355, | |
| "grad_norm": 0.8875281986575585, | |
| "learning_rate": 1.56501984977892e-07, | |
| "loss": 0.4012, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 2.656050955414013, | |
| "grad_norm": 0.8990703005870186, | |
| "learning_rate": 1.5570671168374436e-07, | |
| "loss": 0.4024, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 2.6751592356687897, | |
| "grad_norm": 0.8977323600712754, | |
| "learning_rate": 1.5490629666959666e-07, | |
| "loss": 0.3899, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.694267515923567, | |
| "grad_norm": 0.9571943294092002, | |
| "learning_rate": 1.5410081381361829e-07, | |
| "loss": 0.401, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 2.713375796178344, | |
| "grad_norm": 0.857629614904157, | |
| "learning_rate": 1.5329033746173973e-07, | |
| "loss": 0.3886, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 2.732484076433121, | |
| "grad_norm": 0.9442460364317703, | |
| "learning_rate": 1.5247494242079021e-07, | |
| "loss": 0.4211, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 2.7515923566878984, | |
| "grad_norm": 0.9057891101965285, | |
| "learning_rate": 1.5165470395159313e-07, | |
| "loss": 0.3977, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 2.770700636942675, | |
| "grad_norm": 0.862105132626373, | |
| "learning_rate": 1.5082969776201945e-07, | |
| "loss": 0.3916, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 2.789808917197452, | |
| "grad_norm": 0.8863875640101582, | |
| "learning_rate": 1.5e-07, | |
| "loss": 0.375, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 2.8089171974522293, | |
| "grad_norm": 0.9095481357284252, | |
| "learning_rate": 1.4916568724649686e-07, | |
| "loss": 0.3965, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 2.8280254777070066, | |
| "grad_norm": 0.8835243071238383, | |
| "learning_rate": 1.4832683650843506e-07, | |
| "loss": 0.3857, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 2.8471337579617835, | |
| "grad_norm": 0.8444620499975368, | |
| "learning_rate": 1.4748352521159491e-07, | |
| "loss": 0.3868, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 2.8662420382165603, | |
| "grad_norm": 0.9320821828556558, | |
| "learning_rate": 1.4663583119346538e-07, | |
| "loss": 0.4109, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.8853503184713376, | |
| "grad_norm": 0.9093581394378585, | |
| "learning_rate": 1.4578383269606002e-07, | |
| "loss": 0.3965, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 2.904458598726115, | |
| "grad_norm": 0.8671575603335869, | |
| "learning_rate": 1.4492760835869502e-07, | |
| "loss": 0.3765, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 2.9235668789808917, | |
| "grad_norm": 0.8770313068851825, | |
| "learning_rate": 1.4406723721073087e-07, | |
| "loss": 0.4112, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 2.9426751592356686, | |
| "grad_norm": 0.8731275332990308, | |
| "learning_rate": 1.4320279866427796e-07, | |
| "loss": 0.3931, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 2.961783439490446, | |
| "grad_norm": 0.929306686854739, | |
| "learning_rate": 1.4233437250686693e-07, | |
| "loss": 0.4045, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 2.980891719745223, | |
| "grad_norm": 0.920864970116778, | |
| "learning_rate": 1.4146203889408418e-07, | |
| "loss": 0.4011, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.9256172158294934, | |
| "learning_rate": 1.4058587834217354e-07, | |
| "loss": 0.4051, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 3.0191082802547773, | |
| "grad_norm": 0.8701795137759962, | |
| "learning_rate": 1.397059717206048e-07, | |
| "loss": 0.3837, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 3.038216560509554, | |
| "grad_norm": 0.8658948728093971, | |
| "learning_rate": 1.3882240024460924e-07, | |
| "loss": 0.3993, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 3.0573248407643314, | |
| "grad_norm": 0.8550607152625319, | |
| "learning_rate": 1.3793524546768356e-07, | |
| "loss": 0.4131, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 3.0764331210191083, | |
| "grad_norm": 0.8522647937778317, | |
| "learning_rate": 1.370445892740626e-07, | |
| "loss": 0.3922, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 3.0955414012738856, | |
| "grad_norm": 0.8942298522208958, | |
| "learning_rate": 1.361505138711613e-07, | |
| "loss": 0.3886, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 3.1146496815286624, | |
| "grad_norm": 0.8173283412874084, | |
| "learning_rate": 1.3525310178198706e-07, | |
| "loss": 0.3795, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 3.1337579617834397, | |
| "grad_norm": 0.8975022792003201, | |
| "learning_rate": 1.343524358375229e-07, | |
| "loss": 0.3788, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 3.1528662420382165, | |
| "grad_norm": 0.8121287355981773, | |
| "learning_rate": 1.3344859916908204e-07, | |
| "loss": 0.3714, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 3.171974522292994, | |
| "grad_norm": 0.8780441128171443, | |
| "learning_rate": 1.325416752006351e-07, | |
| "loss": 0.384, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 3.1910828025477707, | |
| "grad_norm": 0.8709703387717114, | |
| "learning_rate": 1.3163174764110982e-07, | |
| "loss": 0.3937, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 3.210191082802548, | |
| "grad_norm": 0.8352312687881929, | |
| "learning_rate": 1.3071890047666496e-07, | |
| "loss": 0.4, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 3.229299363057325, | |
| "grad_norm": 0.8370238487705333, | |
| "learning_rate": 1.2980321796293835e-07, | |
| "loss": 0.3929, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 3.248407643312102, | |
| "grad_norm": 0.8754516601719645, | |
| "learning_rate": 1.288847846172701e-07, | |
| "loss": 0.3858, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 3.267515923566879, | |
| "grad_norm": 0.8493944760687623, | |
| "learning_rate": 1.2796368521090143e-07, | |
| "loss": 0.3753, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 3.286624203821656, | |
| "grad_norm": 0.8672706457828706, | |
| "learning_rate": 1.270400047611508e-07, | |
| "loss": 0.3889, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 3.305732484076433, | |
| "grad_norm": 0.8578114494949349, | |
| "learning_rate": 1.261138285235663e-07, | |
| "loss": 0.3909, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 3.3248407643312103, | |
| "grad_norm": 0.88573195675492, | |
| "learning_rate": 1.2518524198405698e-07, | |
| "loss": 0.4025, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 3.343949044585987, | |
| "grad_norm": 0.8388605708511595, | |
| "learning_rate": 1.2425433085100222e-07, | |
| "loss": 0.3965, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 3.3630573248407645, | |
| "grad_norm": 0.8254754751848654, | |
| "learning_rate": 1.2332118104734109e-07, | |
| "loss": 0.3962, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 3.3821656050955413, | |
| "grad_norm": 0.8404360589007722, | |
| "learning_rate": 1.223858787026415e-07, | |
| "loss": 0.3836, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 3.4012738853503186, | |
| "grad_norm": 0.8797609449074615, | |
| "learning_rate": 1.2144851014515054e-07, | |
| "loss": 0.3936, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 3.4203821656050954, | |
| "grad_norm": 0.8426178024076959, | |
| "learning_rate": 1.2050916189382645e-07, | |
| "loss": 0.3931, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 3.4394904458598727, | |
| "grad_norm": 0.8675648308269749, | |
| "learning_rate": 1.195679206503528e-07, | |
| "loss": 0.3867, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 3.4585987261146496, | |
| "grad_norm": 0.870049660418513, | |
| "learning_rate": 1.1862487329113604e-07, | |
| "loss": 0.3943, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 3.477707006369427, | |
| "grad_norm": 0.8394567213573604, | |
| "learning_rate": 1.1768010685928685e-07, | |
| "loss": 0.3856, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 3.4968152866242037, | |
| "grad_norm": 0.8307302061680673, | |
| "learning_rate": 1.1673370855658591e-07, | |
| "loss": 0.392, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 3.515923566878981, | |
| "grad_norm": 0.8963343005230818, | |
| "learning_rate": 1.1578576573543539e-07, | |
| "loss": 0.3962, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 3.535031847133758, | |
| "grad_norm": 0.8672782496610111, | |
| "learning_rate": 1.1483636589079626e-07, | |
| "loss": 0.3898, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 3.554140127388535, | |
| "grad_norm": 0.8164723155704265, | |
| "learning_rate": 1.138855966521124e-07, | |
| "loss": 0.3876, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 3.573248407643312, | |
| "grad_norm": 0.8104375092397201, | |
| "learning_rate": 1.1293354577522263e-07, | |
| "loss": 0.3972, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 3.5923566878980893, | |
| "grad_norm": 0.8800104752834295, | |
| "learning_rate": 1.1198030113426074e-07, | |
| "loss": 0.3887, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 3.611464968152866, | |
| "grad_norm": 0.8243201770417823, | |
| "learning_rate": 1.110259507135447e-07, | |
| "loss": 0.4074, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 3.6305732484076434, | |
| "grad_norm": 0.818166857201434, | |
| "learning_rate": 1.1007058259945583e-07, | |
| "loss": 0.3903, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 3.6496815286624202, | |
| "grad_norm": 0.8265082024151587, | |
| "learning_rate": 1.0911428497230832e-07, | |
| "loss": 0.3961, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 3.6687898089171975, | |
| "grad_norm": 0.885762495423559, | |
| "learning_rate": 1.0815714609821025e-07, | |
| "loss": 0.3728, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 3.6878980891719744, | |
| "grad_norm": 0.8413664352472379, | |
| "learning_rate": 1.071992543209167e-07, | |
| "loss": 0.4015, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 3.7070063694267517, | |
| "grad_norm": 0.8676923040006305, | |
| "learning_rate": 1.0624069805367557e-07, | |
| "loss": 0.3792, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 3.7261146496815285, | |
| "grad_norm": 0.8027532451126252, | |
| "learning_rate": 1.0528156577106702e-07, | |
| "loss": 0.3695, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 3.745222929936306, | |
| "grad_norm": 0.8005652974605109, | |
| "learning_rate": 1.0432194600083739e-07, | |
| "loss": 0.3844, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 3.7643312101910826, | |
| "grad_norm": 0.8538263494199514, | |
| "learning_rate": 1.0336192731572803e-07, | |
| "loss": 0.3728, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 3.78343949044586, | |
| "grad_norm": 0.858813703248373, | |
| "learning_rate": 1.0240159832530007e-07, | |
| "loss": 0.3982, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 3.802547770700637, | |
| "grad_norm": 0.8995549935364188, | |
| "learning_rate": 1.0144104766775572e-07, | |
| "loss": 0.4082, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 3.821656050955414, | |
| "grad_norm": 0.8240235309228603, | |
| "learning_rate": 1.0048036400175708e-07, | |
| "loss": 0.3817, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 3.840764331210191, | |
| "grad_norm": 0.7944949119815911, | |
| "learning_rate": 9.951963599824293e-08, | |
| "loss": 0.4014, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 3.859872611464968, | |
| "grad_norm": 0.8080274716468211, | |
| "learning_rate": 9.855895233224429e-08, | |
| "loss": 0.3874, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 3.8789808917197455, | |
| "grad_norm": 0.8704734628918741, | |
| "learning_rate": 9.759840167469994e-08, | |
| "loss": 0.3776, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 3.8980891719745223, | |
| "grad_norm": 0.7760511777395571, | |
| "learning_rate": 9.663807268427197e-08, | |
| "loss": 0.3834, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 3.917197452229299, | |
| "grad_norm": 0.8332364115746148, | |
| "learning_rate": 9.567805399916259e-08, | |
| "loss": 0.374, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 3.9363057324840764, | |
| "grad_norm": 0.8277765731528189, | |
| "learning_rate": 9.471843422893297e-08, | |
| "loss": 0.3868, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 3.9554140127388537, | |
| "grad_norm": 0.8608849591995096, | |
| "learning_rate": 9.375930194632446e-08, | |
| "loss": 0.385, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 3.9745222929936306, | |
| "grad_norm": 0.8787510622754615, | |
| "learning_rate": 9.28007456790833e-08, | |
| "loss": 0.3726, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 3.9936305732484074, | |
| "grad_norm": 0.7972052654644051, | |
| "learning_rate": 9.184285390178977e-08, | |
| "loss": 0.3775, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 4.012738853503185, | |
| "grad_norm": 0.8068665514966669, | |
| "learning_rate": 9.088571502769167e-08, | |
| "loss": 0.3872, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 4.031847133757962, | |
| "grad_norm": 0.8146646388290683, | |
| "learning_rate": 8.992941740054417e-08, | |
| "loss": 0.3878, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 4.050955414012739, | |
| "grad_norm": 0.8655592462822442, | |
| "learning_rate": 8.897404928645527e-08, | |
| "loss": 0.3886, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 4.070063694267516, | |
| "grad_norm": 0.7920711667344885, | |
| "learning_rate": 8.801969886573929e-08, | |
| "loss": 0.3854, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 4.089171974522293, | |
| "grad_norm": 0.8379714883887506, | |
| "learning_rate": 8.706645422477737e-08, | |
| "loss": 0.3691, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 4.10828025477707, | |
| "grad_norm": 0.8437233766568482, | |
| "learning_rate": 8.611440334788762e-08, | |
| "loss": 0.3744, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 4.127388535031847, | |
| "grad_norm": 0.8909392753897909, | |
| "learning_rate": 8.516363410920375e-08, | |
| "loss": 0.3962, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 4.146496815286624, | |
| "grad_norm": 0.8203045296400541, | |
| "learning_rate": 8.42142342645646e-08, | |
| "loss": 0.3862, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 4.165605095541402, | |
| "grad_norm": 0.7744540478897702, | |
| "learning_rate": 8.326629144341405e-08, | |
| "loss": 0.3642, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 4.1847133757961785, | |
| "grad_norm": 0.863804040299928, | |
| "learning_rate": 8.231989314071316e-08, | |
| "loss": 0.3964, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 4.203821656050955, | |
| "grad_norm": 0.801440996806109, | |
| "learning_rate": 8.137512670886396e-08, | |
| "loss": 0.3837, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 4.222929936305732, | |
| "grad_norm": 0.813319837107103, | |
| "learning_rate": 8.04320793496472e-08, | |
| "loss": 0.4017, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 4.24203821656051, | |
| "grad_norm": 0.8601056640259767, | |
| "learning_rate": 7.949083810617357e-08, | |
| "loss": 0.3857, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 4.261146496815287, | |
| "grad_norm": 0.8181368232094955, | |
| "learning_rate": 7.855148985484945e-08, | |
| "loss": 0.3812, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 4.280254777070064, | |
| "grad_norm": 0.8346128486570273, | |
| "learning_rate": 7.761412129735851e-08, | |
| "loss": 0.3852, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 4.2993630573248405, | |
| "grad_norm": 0.8108319089562595, | |
| "learning_rate": 7.667881895265893e-08, | |
| "loss": 0.3732, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 4.318471337579618, | |
| "grad_norm": 0.8157847645705257, | |
| "learning_rate": 7.574566914899778e-08, | |
| "loss": 0.3724, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 4.337579617834395, | |
| "grad_norm": 0.8304647677067434, | |
| "learning_rate": 7.481475801594301e-08, | |
| "loss": 0.3727, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 4.356687898089172, | |
| "grad_norm": 0.7881108642715712, | |
| "learning_rate": 7.38861714764337e-08, | |
| "loss": 0.3878, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 4.375796178343949, | |
| "grad_norm": 0.8099514584757647, | |
| "learning_rate": 7.29599952388492e-08, | |
| "loss": 0.3782, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 4.3949044585987265, | |
| "grad_norm": 0.8755861192813177, | |
| "learning_rate": 7.203631478909857e-08, | |
| "loss": 0.3689, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 4.414012738853503, | |
| "grad_norm": 0.7788977257515683, | |
| "learning_rate": 7.111521538272996e-08, | |
| "loss": 0.3685, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 4.43312101910828, | |
| "grad_norm": 0.8404128728999694, | |
| "learning_rate": 7.019678203706163e-08, | |
| "loss": 0.3904, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 4.452229299363057, | |
| "grad_norm": 0.8342544645155691, | |
| "learning_rate": 6.928109952333506e-08, | |
| "loss": 0.3962, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 4.471337579617835, | |
| "grad_norm": 0.7989245810139801, | |
| "learning_rate": 6.836825235889018e-08, | |
| "loss": 0.3964, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 4.490445859872612, | |
| "grad_norm": 0.8134468790899694, | |
| "learning_rate": 6.74583247993649e-08, | |
| "loss": 0.4079, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 4.509554140127388, | |
| "grad_norm": 0.8029199573915637, | |
| "learning_rate": 6.655140083091793e-08, | |
| "loss": 0.3887, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 4.528662420382165, | |
| "grad_norm": 0.7735464879684218, | |
| "learning_rate": 6.56475641624771e-08, | |
| "loss": 0.3738, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 4.547770700636943, | |
| "grad_norm": 0.8847195049321371, | |
| "learning_rate": 6.474689821801294e-08, | |
| "loss": 0.3777, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 4.56687898089172, | |
| "grad_norm": 0.846199572978138, | |
| "learning_rate": 6.384948612883871e-08, | |
| "loss": 0.3851, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 4.585987261146497, | |
| "grad_norm": 0.8240989805661456, | |
| "learning_rate": 6.29554107259374e-08, | |
| "loss": 0.3928, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 4.6050955414012735, | |
| "grad_norm": 0.835310477492176, | |
| "learning_rate": 6.206475453231643e-08, | |
| "loss": 0.3839, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 4.624203821656051, | |
| "grad_norm": 0.8319583235045928, | |
| "learning_rate": 6.117759975539074e-08, | |
| "loss": 0.3698, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 4.643312101910828, | |
| "grad_norm": 0.8181812452470683, | |
| "learning_rate": 6.029402827939519e-08, | |
| "loss": 0.3683, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 4.662420382165605, | |
| "grad_norm": 0.8783636137594268, | |
| "learning_rate": 5.941412165782644e-08, | |
| "loss": 0.3785, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 4.681528662420382, | |
| "grad_norm": 0.8165663248513478, | |
| "learning_rate": 5.853796110591582e-08, | |
| "loss": 0.3838, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 4.7006369426751595, | |
| "grad_norm": 0.8484264910287741, | |
| "learning_rate": 5.7665627493133084e-08, | |
| "loss": 0.3759, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 4.719745222929936, | |
| "grad_norm": 0.8285714683962552, | |
| "learning_rate": 5.6797201335722055e-08, | |
| "loss": 0.3824, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 4.738853503184713, | |
| "grad_norm": 0.8112577497500143, | |
| "learning_rate": 5.593276278926912e-08, | |
| "loss": 0.3645, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 4.757961783439491, | |
| "grad_norm": 0.8374711715413657, | |
| "learning_rate": 5.5072391641305003e-08, | |
| "loss": 0.3961, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 4.777070063694268, | |
| "grad_norm": 0.8019560325800055, | |
| "learning_rate": 5.4216167303939996e-08, | |
| "loss": 0.3838, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 4.796178343949045, | |
| "grad_norm": 0.8122463812859406, | |
| "learning_rate": 5.33641688065346e-08, | |
| "loss": 0.3731, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 4.8152866242038215, | |
| "grad_norm": 0.812582559765451, | |
| "learning_rate": 5.251647478840511e-08, | |
| "loss": 0.3858, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 4.834394904458598, | |
| "grad_norm": 0.8306986525054737, | |
| "learning_rate": 5.167316349156494e-08, | |
| "loss": 0.3934, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 4.853503184713376, | |
| "grad_norm": 0.8503018239839694, | |
| "learning_rate": 5.0834312753503117e-08, | |
| "loss": 0.3998, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 4.872611464968153, | |
| "grad_norm": 0.7713185185461976, | |
| "learning_rate": 5.000000000000002e-08, | |
| "loss": 0.397, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 4.89171974522293, | |
| "grad_norm": 0.8282244186999879, | |
| "learning_rate": 4.9170302237980564e-08, | |
| "loss": 0.3874, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 4.9108280254777075, | |
| "grad_norm": 0.8193062680508688, | |
| "learning_rate": 4.8345296048406856e-08, | |
| "loss": 0.3856, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 4.929936305732484, | |
| "grad_norm": 0.7737494059301828, | |
| "learning_rate": 4.752505757920977e-08, | |
| "loss": 0.3679, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 4.949044585987261, | |
| "grad_norm": 0.8625873751464171, | |
| "learning_rate": 4.6709662538260266e-08, | |
| "loss": 0.3743, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 4.968152866242038, | |
| "grad_norm": 0.8362733962777252, | |
| "learning_rate": 4.5899186186381725e-08, | |
| "loss": 0.4043, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 4.987261146496815, | |
| "grad_norm": 0.8032355034719745, | |
| "learning_rate": 4.5093703330403374e-08, | |
| "loss": 0.377, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 5.006369426751593, | |
| "grad_norm": 0.7748662472235436, | |
| "learning_rate": 4.429328831625565e-08, | |
| "loss": 0.386, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 5.025477707006369, | |
| "grad_norm": 0.8066132016507707, | |
| "learning_rate": 4.3498015022108e-08, | |
| "loss": 0.3887, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 5.044585987261146, | |
| "grad_norm": 0.7758364698198985, | |
| "learning_rate": 4.270795685155001e-08, | |
| "loss": 0.3826, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 5.063694267515924, | |
| "grad_norm": 0.7661247386701426, | |
| "learning_rate": 4.1923186726816305e-08, | |
| "loss": 0.3622, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 5.082802547770701, | |
| "grad_norm": 0.905331062267788, | |
| "learning_rate": 4.114377708205571e-08, | |
| "loss": 0.3933, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 5.101910828025478, | |
| "grad_norm": 0.8011338273436825, | |
| "learning_rate": 4.036979985664566e-08, | |
| "loss": 0.3928, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 5.1210191082802545, | |
| "grad_norm": 0.8194712847479001, | |
| "learning_rate": 3.9601326488552255e-08, | |
| "loss": 0.3817, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 5.140127388535032, | |
| "grad_norm": 0.7802367995341835, | |
| "learning_rate": 3.883842790773647e-08, | |
| "loss": 0.351, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 5.159235668789809, | |
| "grad_norm": 0.8239153884425812, | |
| "learning_rate": 3.808117452960734e-08, | |
| "loss": 0.3937, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 5.178343949044586, | |
| "grad_norm": 0.859267053614819, | |
| "learning_rate": 3.732963624852274e-08, | |
| "loss": 0.388, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 5.197452229299363, | |
| "grad_norm": 0.7725510162309273, | |
| "learning_rate": 3.658388243133804e-08, | |
| "loss": 0.3867, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 5.2165605095541405, | |
| "grad_norm": 0.8062649067737858, | |
| "learning_rate": 3.584398191100341e-08, | |
| "loss": 0.3778, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 5.235668789808917, | |
| "grad_norm": 0.8136961652490752, | |
| "learning_rate": 3.5110002980210973e-08, | |
| "loss": 0.3856, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 5.254777070063694, | |
| "grad_norm": 0.8069370792436726, | |
| "learning_rate": 3.438201338509098e-08, | |
| "loss": 0.381, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 5.273885350318471, | |
| "grad_norm": 0.7912165067167394, | |
| "learning_rate": 3.366008031895904e-08, | |
| "loss": 0.3947, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 5.292993630573249, | |
| "grad_norm": 0.7925916759275095, | |
| "learning_rate": 3.294427041611425e-08, | |
| "loss": 0.3663, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 5.312101910828026, | |
| "grad_norm": 0.8380350829763229, | |
| "learning_rate": 3.223464974568874e-08, | |
| "loss": 0.3998, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 5.3312101910828025, | |
| "grad_norm": 0.8059508700292909, | |
| "learning_rate": 3.15312838055494e-08, | |
| "loss": 0.3811, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 5.350318471337579, | |
| "grad_norm": 0.8537272020739671, | |
| "learning_rate": 3.083423751625281e-08, | |
| "loss": 0.3908, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 5.369426751592357, | |
| "grad_norm": 0.7769880588570798, | |
| "learning_rate": 3.014357521505273e-08, | |
| "loss": 0.3876, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 5.388535031847134, | |
| "grad_norm": 0.7970916009843863, | |
| "learning_rate": 2.9459360649961896e-08, | |
| "loss": 0.3915, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 5.407643312101911, | |
| "grad_norm": 0.7822357636738501, | |
| "learning_rate": 2.878165697386812e-08, | |
| "loss": 0.3925, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 5.426751592356688, | |
| "grad_norm": 0.8155762137747297, | |
| "learning_rate": 2.811052673870534e-08, | |
| "loss": 0.3804, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 5.445859872611465, | |
| "grad_norm": 0.8167709665350011, | |
| "learning_rate": 2.7446031889679888e-08, | |
| "loss": 0.341, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 5.464968152866242, | |
| "grad_norm": 0.8378049495290578, | |
| "learning_rate": 2.6788233759553138e-08, | |
| "loss": 0.383, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 5.484076433121019, | |
| "grad_norm": 0.852274854933172, | |
| "learning_rate": 2.61371930629805e-08, | |
| "loss": 0.3752, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 5.503184713375796, | |
| "grad_norm": 0.8005697568656431, | |
| "learning_rate": 2.549296989090738e-08, | |
| "loss": 0.3817, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 5.522292993630574, | |
| "grad_norm": 0.8111352520735085, | |
| "learning_rate": 2.4855623705022788e-08, | |
| "loss": 0.3924, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 5.54140127388535, | |
| "grad_norm": 0.8502144978449777, | |
| "learning_rate": 2.4225213332271198e-08, | |
| "loss": 0.3982, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 5.560509554140127, | |
| "grad_norm": 0.8175467759233411, | |
| "learning_rate": 2.3601796959422582e-08, | |
| "loss": 0.3713, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 5.579617834394904, | |
| "grad_norm": 0.8048011556362245, | |
| "learning_rate": 2.2985432127701942e-08, | |
| "loss": 0.3716, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 5.598726114649682, | |
| "grad_norm": 0.8222153363537893, | |
| "learning_rate": 2.237617572747834e-08, | |
| "loss": 0.3644, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 5.617834394904459, | |
| "grad_norm": 0.7973734267283885, | |
| "learning_rate": 2.1774083993013716e-08, | |
| "loss": 0.3768, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 5.6369426751592355, | |
| "grad_norm": 0.7885706909780547, | |
| "learning_rate": 2.117921249727258e-08, | |
| "loss": 0.3772, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 5.656050955414012, | |
| "grad_norm": 0.8460370600218118, | |
| "learning_rate": 2.0591616146792702e-08, | |
| "loss": 0.3722, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 5.67515923566879, | |
| "grad_norm": 0.7822794422028582, | |
| "learning_rate": 2.001134917661713e-08, | |
| "loss": 0.3833, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 5.694267515923567, | |
| "grad_norm": 0.8417179595540075, | |
| "learning_rate": 1.9438465145288373e-08, | |
| "loss": 0.3852, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 5.713375796178344, | |
| "grad_norm": 0.8260325891913302, | |
| "learning_rate": 1.8873016929904938e-08, | |
| "loss": 0.3822, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 5.732484076433121, | |
| "grad_norm": 0.8235232563783705, | |
| "learning_rate": 1.831505672124083e-08, | |
| "loss": 0.3909, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 5.751592356687898, | |
| "grad_norm": 0.82251787065623, | |
| "learning_rate": 1.776463601892825e-08, | |
| "loss": 0.3806, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 5.770700636942675, | |
| "grad_norm": 0.8219732366109578, | |
| "learning_rate": 1.7221805626704277e-08, | |
| "loss": 0.3932, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 5.789808917197452, | |
| "grad_norm": 0.8500419948581102, | |
| "learning_rate": 1.6686615647721637e-08, | |
| "loss": 0.3969, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 5.80891719745223, | |
| "grad_norm": 0.8195669552486191, | |
| "learning_rate": 1.615911547992426e-08, | |
| "loss": 0.3777, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 5.828025477707007, | |
| "grad_norm": 0.8203695492022786, | |
| "learning_rate": 1.5639353811487744e-08, | |
| "loss": 0.3683, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 5.8471337579617835, | |
| "grad_norm": 0.8228092581987145, | |
| "learning_rate": 1.5127378616325602e-08, | |
| "loss": 0.3779, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 5.86624203821656, | |
| "grad_norm": 0.7965205823215074, | |
| "learning_rate": 1.4623237149661139e-08, | |
| "loss": 0.3753, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 5.885350318471337, | |
| "grad_norm": 0.8408800872828248, | |
| "learning_rate": 1.4126975943665842e-08, | |
| "loss": 0.377, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 5.904458598726115, | |
| "grad_norm": 0.8319451588726945, | |
| "learning_rate": 1.3638640803164514e-08, | |
| "loss": 0.3891, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 5.923566878980892, | |
| "grad_norm": 0.8007835686361502, | |
| "learning_rate": 1.3158276801407431e-08, | |
| "loss": 0.3858, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 5.942675159235669, | |
| "grad_norm": 0.8095992909101725, | |
| "learning_rate": 1.268592827591014e-08, | |
| "loss": 0.3629, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 5.961783439490446, | |
| "grad_norm": 0.7963300257867078, | |
| "learning_rate": 1.2221638824361069e-08, | |
| "loss": 0.3782, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 5.980891719745223, | |
| "grad_norm": 0.7726630979855738, | |
| "learning_rate": 1.1765451300597573e-08, | |
| "loss": 0.3834, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.7479398982894709, | |
| "learning_rate": 1.131740781065037e-08, | |
| "loss": 0.3633, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 6.019108280254777, | |
| "grad_norm": 0.7967021599978159, | |
| "learning_rate": 1.0877549708857225e-08, | |
| "loss": 0.3777, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 6.038216560509555, | |
| "grad_norm": 0.7649932643254016, | |
| "learning_rate": 1.0445917594046071e-08, | |
| "loss": 0.3533, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 6.057324840764331, | |
| "grad_norm": 0.7847115578358245, | |
| "learning_rate": 1.0022551305787563e-08, | |
| "loss": 0.3847, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 6.076433121019108, | |
| "grad_norm": 0.8113396532764215, | |
| "learning_rate": 9.607489920717981e-09, | |
| "loss": 0.4053, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 6.095541401273885, | |
| "grad_norm": 0.7933481355886415, | |
| "learning_rate": 9.200771748932512e-09, | |
| "loss": 0.3712, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 6.114649681528663, | |
| "grad_norm": 0.8031446051590815, | |
| "learning_rate": 8.802434330449127e-09, | |
| "loss": 0.3706, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 6.13375796178344, | |
| "grad_norm": 0.7899975029998316, | |
| "learning_rate": 8.412514431743656e-09, | |
| "loss": 0.3926, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 6.1528662420382165, | |
| "grad_norm": 0.83711210181786, | |
| "learning_rate": 8.031048042356392e-09, | |
| "loss": 0.3828, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 6.171974522292993, | |
| "grad_norm": 0.8108835667977151, | |
| "learning_rate": 7.65807037157007e-09, | |
| "loss": 0.3953, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 6.191082802547771, | |
| "grad_norm": 0.7940213310862922, | |
| "learning_rate": 7.293615845160195e-09, | |
| "loss": 0.3798, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 6.210191082802548, | |
| "grad_norm": 0.8090923493531231, | |
| "learning_rate": 6.9377181022174604e-09, | |
| "loss": 0.3489, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 6.229299363057325, | |
| "grad_norm": 0.8150664351790909, | |
| "learning_rate": 6.590409992042956e-09, | |
| "loss": 0.3652, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 6.248407643312102, | |
| "grad_norm": 0.8247491005855746, | |
| "learning_rate": 6.25172357111603e-09, | |
| "loss": 0.3793, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 6.267515923566879, | |
| "grad_norm": 0.8107418363338103, | |
| "learning_rate": 5.921690100135712e-09, | |
| "loss": 0.3737, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 6.286624203821656, | |
| "grad_norm": 0.7755128906973803, | |
| "learning_rate": 5.600340041135132e-09, | |
| "loss": 0.3662, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 6.305732484076433, | |
| "grad_norm": 0.7795624367146271, | |
| "learning_rate": 5.2877030546700115e-09, | |
| "loss": 0.3737, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 6.32484076433121, | |
| "grad_norm": 0.7803109557900149, | |
| "learning_rate": 4.9838079970809245e-09, | |
| "loss": 0.3644, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 6.343949044585988, | |
| "grad_norm": 0.8101411487647653, | |
| "learning_rate": 4.688682917829967e-09, | |
| "loss": 0.3822, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 6.3630573248407645, | |
| "grad_norm": 0.7817124326904151, | |
| "learning_rate": 4.402355056911655e-09, | |
| "loss": 0.3877, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 6.382165605095541, | |
| "grad_norm": 0.7937959337403729, | |
| "learning_rate": 4.124850842338778e-09, | |
| "loss": 0.3831, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 6.401273885350318, | |
| "grad_norm": 0.8137404165669514, | |
| "learning_rate": 3.856195887703095e-09, | |
| "loss": 0.3774, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 6.420382165605096, | |
| "grad_norm": 0.7855307116650134, | |
| "learning_rate": 3.5964149898111585e-09, | |
| "loss": 0.3837, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 6.439490445859873, | |
| "grad_norm": 0.8358282343705817, | |
| "learning_rate": 3.345532126395578e-09, | |
| "loss": 0.3764, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 6.45859872611465, | |
| "grad_norm": 0.8278966939308083, | |
| "learning_rate": 3.103570453901938e-09, | |
| "loss": 0.3764, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 6.477707006369426, | |
| "grad_norm": 0.8027521273066138, | |
| "learning_rate": 2.8705523053513814e-09, | |
| "loss": 0.3924, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 6.496815286624204, | |
| "grad_norm": 0.8392704792372261, | |
| "learning_rate": 2.6464991882793277e-09, | |
| "loss": 0.3735, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 6.515923566878981, | |
| "grad_norm": 0.8018405582197404, | |
| "learning_rate": 2.4314317827503373e-09, | |
| "loss": 0.4072, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 6.535031847133758, | |
| "grad_norm": 0.8060724582366596, | |
| "learning_rate": 2.2253699394493065e-09, | |
| "loss": 0.3985, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 6.554140127388535, | |
| "grad_norm": 0.7983964289622926, | |
| "learning_rate": 2.0283326778492536e-09, | |
| "loss": 0.3623, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 6.573248407643312, | |
| "grad_norm": 0.804309012435283, | |
| "learning_rate": 1.8403381844558808e-09, | |
| "loss": 0.3869, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 6.592356687898089, | |
| "grad_norm": 0.8306246032091474, | |
| "learning_rate": 1.661403811128903e-09, | |
| "loss": 0.3854, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 6.611464968152866, | |
| "grad_norm": 0.7989197463157789, | |
| "learning_rate": 1.4915460734805096e-09, | |
| "loss": 0.3805, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 6.630573248407643, | |
| "grad_norm": 0.7970938285648654, | |
| "learning_rate": 1.3307806493509377e-09, | |
| "loss": 0.3928, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 6.649681528662421, | |
| "grad_norm": 0.7755028655582555, | |
| "learning_rate": 1.1791223773614634e-09, | |
| "loss": 0.3836, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 6.6687898089171975, | |
| "grad_norm": 0.8335219559439776, | |
| "learning_rate": 1.036585255544764e-09, | |
| "loss": 0.3884, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 6.687898089171974, | |
| "grad_norm": 0.8565667245306822, | |
| "learning_rate": 9.031824400528854e-10, | |
| "loss": 0.3725, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 6.707006369426751, | |
| "grad_norm": 0.7761233262394732, | |
| "learning_rate": 7.789262439430012e-10, | |
| "loss": 0.3911, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 6.726114649681529, | |
| "grad_norm": 0.7917666091257868, | |
| "learning_rate": 6.638281360408338e-10, | |
| "loss": 0.3621, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 6.745222929936306, | |
| "grad_norm": 0.8117653905421635, | |
| "learning_rate": 5.578987398821344e-10, | |
| "loss": 0.389, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 6.764331210191083, | |
| "grad_norm": 0.77977323640196, | |
| "learning_rate": 4.611478327321339e-10, | |
| "loss": 0.3728, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 6.7834394904458595, | |
| "grad_norm": 0.8277863837971791, | |
| "learning_rate": 3.735843446830866e-10, | |
| "loss": 0.3943, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 6.802547770700637, | |
| "grad_norm": 0.8004170000824867, | |
| "learning_rate": 2.952163578300193e-10, | |
| "loss": 0.3571, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 6.821656050955414, | |
| "grad_norm": 0.8632389118864879, | |
| "learning_rate": 2.2605110552477157e-10, | |
| "loss": 0.4046, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 6.840764331210191, | |
| "grad_norm": 0.8235570172827087, | |
| "learning_rate": 1.6609497170834154e-10, | |
| "loss": 0.4033, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 6.859872611464969, | |
| "grad_norm": 0.8187520426135899, | |
| "learning_rate": 1.1535349032167907e-10, | |
| "loss": 0.3824, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 6.8789808917197455, | |
| "grad_norm": 0.8083284571775858, | |
| "learning_rate": 7.38313447948724e-11, | |
| "loss": 0.3867, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 6.898089171974522, | |
| "grad_norm": 0.8626430719203882, | |
| "learning_rate": 4.153236761488266e-11, | |
| "loss": 0.3762, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 6.917197452229299, | |
| "grad_norm": 0.8074033893142732, | |
| "learning_rate": 1.8459539971804605e-11, | |
| "loss": 0.3756, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 6.936305732484076, | |
| "grad_norm": 0.7789034490224797, | |
| "learning_rate": 4.614991483686825e-12, | |
| "loss": 0.3659, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 6.955414012738854, | |
| "grad_norm": 0.8786861995300276, | |
| "learning_rate": 0.0, | |
| "loss": 0.3722, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 6.955414012738854, | |
| "step": 364, | |
| "total_flos": 1.6645198251751014e+17, | |
| "train_loss": 0.407676433632662, | |
| "train_runtime": 2489.5823, | |
| "train_samples_per_second": 14.059, | |
| "train_steps_per_second": 0.146 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 364, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 7, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6645198251751014e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
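The file closes with the top-level Trainer fields (`max_steps`, `num_train_epochs`, the `TrainerControl` flags, `total_flos`). As a quick, illustrative sketch of how this structure can be consumed, the snippet below loads the state and pulls the per-step loss out of `log_history`; the filename `trainer_state.json` and the plain-JSON form (without the table wrapping shown above) are assumptions for the example, not something stated in the log itself.

```python
# Illustrative sketch only: summarize the loss curve from a Trainer state file.
# Assumes the JSON above was saved as "trainer_state.json" (the usual name the
# Hugging Face Trainer uses for this structure) without the table formatting.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry carries epoch/step plus whatever was logged at that
# step (loss, grad_norm, learning_rate, or the final train_* summary).
steps, losses = [], []
for entry in state["log_history"]:
    if "loss" in entry:  # the final summary entry has train_loss, not loss
        steps.append(entry["step"])
        losses.append(entry["loss"])

print(f"logged steps: {len(steps)} (max_steps={state['max_steps']})")
print(f"first/last logged loss: {losses[0]:.4f} -> {losses[-1]:.4f}")
print(f"reported train_loss: {state['log_history'][-1].get('train_loss')}")
```

Run against this log, the sketch would report 364 logged steps and a loss moving from roughly 0.49 at step 1 to about 0.37 at step 364, matching the `train_loss` of ~0.4077 recorded in the final summary entry.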