{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.977457168620378, "eval_steps": 500, "global_step": 345, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014427412082957619, "grad_norm": 6.30643255752035, "learning_rate": 2.285714285714286e-06, "loss": 0.8669, "step": 1 }, { "epoch": 0.028854824165915238, "grad_norm": 6.348193987490592, "learning_rate": 4.571428571428572e-06, "loss": 0.87, "step": 2 }, { "epoch": 0.04328223624887286, "grad_norm": 5.780015353700753, "learning_rate": 6.857142857142858e-06, "loss": 0.8477, "step": 3 }, { "epoch": 0.057709648331830475, "grad_norm": 4.234197549907419, "learning_rate": 9.142857142857144e-06, "loss": 0.8118, "step": 4 }, { "epoch": 0.0721370604147881, "grad_norm": 2.3155214212097306, "learning_rate": 1.1428571428571429e-05, "loss": 0.7747, "step": 5 }, { "epoch": 0.08656447249774572, "grad_norm": 5.1118273278397846, "learning_rate": 1.3714285714285716e-05, "loss": 0.8053, "step": 6 }, { "epoch": 0.10099188458070334, "grad_norm": 7.1277227926615, "learning_rate": 1.6000000000000003e-05, "loss": 0.7794, "step": 7 }, { "epoch": 0.11541929666366095, "grad_norm": 8.187688384697006, "learning_rate": 1.8285714285714288e-05, "loss": 0.7978, "step": 8 }, { "epoch": 0.12984670874661858, "grad_norm": 5.093195709417533, "learning_rate": 2.057142857142857e-05, "loss": 0.7518, "step": 9 }, { "epoch": 0.1442741208295762, "grad_norm": 3.1265792874681977, "learning_rate": 2.2857142857142858e-05, "loss": 0.7116, "step": 10 }, { "epoch": 0.1587015329125338, "grad_norm": 2.5188366289278323, "learning_rate": 2.5142857142857143e-05, "loss": 0.6783, "step": 11 }, { "epoch": 0.17312894499549145, "grad_norm": 1.4940573526303949, "learning_rate": 2.742857142857143e-05, "loss": 0.6532, "step": 12 }, { "epoch": 0.18755635707844906, "grad_norm": 1.6352594758559187, "learning_rate": 2.9714285714285717e-05, "loss": 0.6347, "step": 13 }, { "epoch": 0.20198376916140667, "grad_norm": 1.2187588508966425, "learning_rate": 3.2000000000000005e-05, "loss": 0.6195, "step": 14 }, { "epoch": 0.2164111812443643, "grad_norm": 1.2755607008946352, "learning_rate": 3.4285714285714284e-05, "loss": 0.6142, "step": 15 }, { "epoch": 0.2308385933273219, "grad_norm": 1.0456746979084692, "learning_rate": 3.6571428571428576e-05, "loss": 0.6038, "step": 16 }, { "epoch": 0.24526600541027954, "grad_norm": 1.4162214811220066, "learning_rate": 3.885714285714286e-05, "loss": 0.5997, "step": 17 }, { "epoch": 0.25969341749323716, "grad_norm": 1.123092592959995, "learning_rate": 4.114285714285714e-05, "loss": 0.5855, "step": 18 }, { "epoch": 0.27412082957619477, "grad_norm": 1.247093949716292, "learning_rate": 4.342857142857143e-05, "loss": 0.5783, "step": 19 }, { "epoch": 0.2885482416591524, "grad_norm": 0.9162444696210892, "learning_rate": 4.5714285714285716e-05, "loss": 0.5762, "step": 20 }, { "epoch": 0.30297565374211, "grad_norm": 1.7011597008607717, "learning_rate": 4.8e-05, "loss": 0.5788, "step": 21 }, { "epoch": 0.3174030658250676, "grad_norm": 1.0313827493696333, "learning_rate": 5.0285714285714286e-05, "loss": 0.5711, "step": 22 }, { "epoch": 0.3318304779080252, "grad_norm": 2.0638257126228083, "learning_rate": 5.257142857142858e-05, "loss": 0.589, "step": 23 }, { "epoch": 0.3462578899909829, "grad_norm": 1.2864831655829803, "learning_rate": 5.485714285714286e-05, "loss": 0.5638, "step": 24 }, { "epoch": 0.3606853020739405, "grad_norm": 1.9339557478184641, "learning_rate": 5.714285714285715e-05, "loss": 0.58, "step": 25 }, { "epoch": 0.3751127141568981, "grad_norm": 1.5799611967220424, "learning_rate": 5.9428571428571434e-05, "loss": 0.5647, "step": 26 }, { "epoch": 0.38954012623985573, "grad_norm": 1.251826447578104, "learning_rate": 6.171428571428573e-05, "loss": 0.5586, "step": 27 }, { "epoch": 0.40396753832281335, "grad_norm": 1.5898160634262704, "learning_rate": 6.400000000000001e-05, "loss": 0.5526, "step": 28 }, { "epoch": 0.41839495040577096, "grad_norm": 1.1139638002205856, "learning_rate": 6.62857142857143e-05, "loss": 0.5503, "step": 29 }, { "epoch": 0.4328223624887286, "grad_norm": 1.2940676544230694, "learning_rate": 6.857142857142857e-05, "loss": 0.556, "step": 30 }, { "epoch": 0.4472497745716862, "grad_norm": 1.4125777791117147, "learning_rate": 7.085714285714287e-05, "loss": 0.5429, "step": 31 }, { "epoch": 0.4616771866546438, "grad_norm": 0.6917885537888634, "learning_rate": 7.314285714285715e-05, "loss": 0.537, "step": 32 }, { "epoch": 0.47610459873760147, "grad_norm": 1.0491224041512421, "learning_rate": 7.542857142857144e-05, "loss": 0.5431, "step": 33 }, { "epoch": 0.4905320108205591, "grad_norm": 1.1841266996810977, "learning_rate": 7.771428571428572e-05, "loss": 0.5408, "step": 34 }, { "epoch": 0.5049594229035167, "grad_norm": 1.3485916713931527, "learning_rate": 8e-05, "loss": 0.5369, "step": 35 }, { "epoch": 0.5193868349864743, "grad_norm": 1.381633742212091, "learning_rate": 7.999794598960815e-05, "loss": 0.5447, "step": 36 }, { "epoch": 0.5338142470694319, "grad_norm": 1.113965104171002, "learning_rate": 7.999178416938051e-05, "loss": 0.5343, "step": 37 }, { "epoch": 0.5482416591523895, "grad_norm": 3.2942969304440877, "learning_rate": 7.998151517213926e-05, "loss": 0.5223, "step": 38 }, { "epoch": 0.5626690712353472, "grad_norm": 1.8632536229114898, "learning_rate": 7.996714005251569e-05, "loss": 0.5358, "step": 39 }, { "epoch": 0.5770964833183048, "grad_norm": 1.3196787168296893, "learning_rate": 7.994866028684212e-05, "loss": 0.5372, "step": 40 }, { "epoch": 0.5915238954012624, "grad_norm": 1.1177800163283473, "learning_rate": 7.992607777300004e-05, "loss": 0.5274, "step": 41 }, { "epoch": 0.60595130748422, "grad_norm": 1.016337599180268, "learning_rate": 7.989939483022537e-05, "loss": 0.5209, "step": 42 }, { "epoch": 0.6203787195671776, "grad_norm": 1.395873276215236, "learning_rate": 7.98686141988702e-05, "loss": 0.5209, "step": 43 }, { "epoch": 0.6348061316501352, "grad_norm": 0.5976851739788052, "learning_rate": 7.983373904012138e-05, "loss": 0.5189, "step": 44 }, { "epoch": 0.6492335437330928, "grad_norm": 0.8805510948652211, "learning_rate": 7.97947729356758e-05, "loss": 0.5158, "step": 45 }, { "epoch": 0.6636609558160504, "grad_norm": 0.9279297933278495, "learning_rate": 7.975171988737267e-05, "loss": 0.5237, "step": 46 }, { "epoch": 0.6780883678990082, "grad_norm": 1.2997587022594848, "learning_rate": 7.970458431678239e-05, "loss": 0.5426, "step": 47 }, { "epoch": 0.6925157799819658, "grad_norm": 0.7359557953904674, "learning_rate": 7.965337106475256e-05, "loss": 0.5146, "step": 48 }, { "epoch": 0.7069431920649234, "grad_norm": 0.950140709774444, "learning_rate": 7.959808539091077e-05, "loss": 0.5207, "step": 49 }, { "epoch": 0.721370604147881, "grad_norm": 0.9262413860600672, "learning_rate": 7.953873297312447e-05, "loss": 0.5114, "step": 50 }, { "epoch": 0.7357980162308386, "grad_norm": 0.659161414732944, "learning_rate": 7.947531990691778e-05, "loss": 0.5065, "step": 51 }, { "epoch": 0.7502254283137962, "grad_norm": 0.6193539698881468, "learning_rate": 7.940785270484556e-05, "loss": 0.5082, "step": 52 }, { "epoch": 0.7646528403967539, "grad_norm": 0.6484786655929663, "learning_rate": 7.933633829582451e-05, "loss": 0.5073, "step": 53 }, { "epoch": 0.7790802524797115, "grad_norm": 0.5246514937970682, "learning_rate": 7.926078402442161e-05, "loss": 0.5034, "step": 54 }, { "epoch": 0.7935076645626691, "grad_norm": 0.7555014101024001, "learning_rate": 7.918119765009979e-05, "loss": 0.5011, "step": 55 }, { "epoch": 0.8079350766456267, "grad_norm": 0.7908819876305151, "learning_rate": 7.909758734642103e-05, "loss": 0.5034, "step": 56 }, { "epoch": 0.8223624887285843, "grad_norm": 0.8314592679002677, "learning_rate": 7.900996170020697e-05, "loss": 0.4941, "step": 57 }, { "epoch": 0.8367899008115419, "grad_norm": 0.6541181086969657, "learning_rate": 7.8918329710657e-05, "loss": 0.4971, "step": 58 }, { "epoch": 0.8512173128944995, "grad_norm": 0.572541817797756, "learning_rate": 7.882270078842407e-05, "loss": 0.4945, "step": 59 }, { "epoch": 0.8656447249774571, "grad_norm": 0.7068090025887374, "learning_rate": 7.872308475464818e-05, "loss": 0.496, "step": 60 }, { "epoch": 0.8800721370604148, "grad_norm": 0.5503187486959946, "learning_rate": 7.861949183994774e-05, "loss": 0.4921, "step": 61 }, { "epoch": 0.8944995491433724, "grad_norm": 0.6381310310432255, "learning_rate": 7.851193268336894e-05, "loss": 0.4993, "step": 62 }, { "epoch": 0.90892696122633, "grad_norm": 0.7118765648942602, "learning_rate": 7.840041833129304e-05, "loss": 0.488, "step": 63 }, { "epoch": 0.9233543733092876, "grad_norm": 0.8229902231799235, "learning_rate": 7.828496023630193e-05, "loss": 0.4886, "step": 64 }, { "epoch": 0.9377817853922452, "grad_norm": 0.8472076467453001, "learning_rate": 7.816557025600196e-05, "loss": 0.4954, "step": 65 }, { "epoch": 0.9522091974752029, "grad_norm": 0.9110400155251186, "learning_rate": 7.804226065180615e-05, "loss": 0.4869, "step": 66 }, { "epoch": 0.9666366095581606, "grad_norm": 1.0607731617003107, "learning_rate": 7.791504408767492e-05, "loss": 0.4867, "step": 67 }, { "epoch": 0.9810640216411182, "grad_norm": 0.8705421661545191, "learning_rate": 7.778393362881549e-05, "loss": 0.4873, "step": 68 }, { "epoch": 0.9954914337240758, "grad_norm": 0.598223438503932, "learning_rate": 7.764894274034014e-05, "loss": 0.4866, "step": 69 }, { "epoch": 1.0099188458070334, "grad_norm": 1.3726649121858878, "learning_rate": 7.751008528588322e-05, "loss": 0.8287, "step": 70 }, { "epoch": 1.024346257889991, "grad_norm": 1.4843499216749163, "learning_rate": 7.736737552617749e-05, "loss": 0.4874, "step": 71 }, { "epoch": 1.0387736699729486, "grad_norm": 0.5218559993997877, "learning_rate": 7.722082811758939e-05, "loss": 0.4768, "step": 72 }, { "epoch": 1.0532010820559061, "grad_norm": 1.3743390311137267, "learning_rate": 7.707045811061396e-05, "loss": 0.4805, "step": 73 }, { "epoch": 1.0676284941388638, "grad_norm": 0.6090236020230758, "learning_rate": 7.691628094832901e-05, "loss": 0.4731, "step": 74 }, { "epoch": 1.0820559062218216, "grad_norm": 0.897847351023635, "learning_rate": 7.675831246480923e-05, "loss": 0.4821, "step": 75 }, { "epoch": 1.096483318304779, "grad_norm": 0.7029565517391541, "learning_rate": 7.659656888349997e-05, "loss": 0.4724, "step": 76 }, { "epoch": 1.1109107303877368, "grad_norm": 0.6183945563644426, "learning_rate": 7.643106681555106e-05, "loss": 0.4763, "step": 77 }, { "epoch": 1.1253381424706943, "grad_norm": 0.5847084533277753, "learning_rate": 7.626182325811089e-05, "loss": 0.4664, "step": 78 }, { "epoch": 1.139765554553652, "grad_norm": 0.6749737142635733, "learning_rate": 7.60888555925807e-05, "loss": 0.4671, "step": 79 }, { "epoch": 1.1541929666366095, "grad_norm": 0.4481798380260001, "learning_rate": 7.591218158282968e-05, "loss": 0.4656, "step": 80 }, { "epoch": 1.1686203787195673, "grad_norm": 0.649578910722019, "learning_rate": 7.573181937337037e-05, "loss": 0.4685, "step": 81 }, { "epoch": 1.1830477908025248, "grad_norm": 0.511575614745596, "learning_rate": 7.554778748749543e-05, "loss": 0.4608, "step": 82 }, { "epoch": 1.1974752028854825, "grad_norm": 0.5161021268023156, "learning_rate": 7.536010482537514e-05, "loss": 0.4613, "step": 83 }, { "epoch": 1.21190261496844, "grad_norm": 0.46897677283090455, "learning_rate": 7.516879066211644e-05, "loss": 0.4691, "step": 84 }, { "epoch": 1.2263300270513977, "grad_norm": 0.5762897639302133, "learning_rate": 7.497386464578329e-05, "loss": 0.4654, "step": 85 }, { "epoch": 1.2407574391343552, "grad_norm": 0.3969665274048659, "learning_rate": 7.477534679537885e-05, "loss": 0.4587, "step": 86 }, { "epoch": 1.255184851217313, "grad_norm": 0.4524782612023369, "learning_rate": 7.457325749878951e-05, "loss": 0.4534, "step": 87 }, { "epoch": 1.2696122633002704, "grad_norm": 0.5470294599409099, "learning_rate": 7.436761751069103e-05, "loss": 0.4643, "step": 88 }, { "epoch": 1.2840396753832282, "grad_norm": 0.5658245365895949, "learning_rate": 7.415844795041704e-05, "loss": 0.4602, "step": 89 }, { "epoch": 1.2984670874661859, "grad_norm": 0.6284954594621484, "learning_rate": 7.394577029979004e-05, "loss": 0.4676, "step": 90 }, { "epoch": 1.3128944995491434, "grad_norm": 0.7345913995003851, "learning_rate": 7.372960640091529e-05, "loss": 0.4606, "step": 91 }, { "epoch": 1.327321911632101, "grad_norm": 0.8342633496573308, "learning_rate": 7.350997845393752e-05, "loss": 0.4557, "step": 92 }, { "epoch": 1.3417493237150586, "grad_norm": 0.8330096859025692, "learning_rate": 7.328690901476095e-05, "loss": 0.4647, "step": 93 }, { "epoch": 1.3561767357980163, "grad_norm": 0.6546676985057208, "learning_rate": 7.306042099273297e-05, "loss": 0.4592, "step": 94 }, { "epoch": 1.3706041478809738, "grad_norm": 0.47502637705371126, "learning_rate": 7.283053764829106e-05, "loss": 0.4605, "step": 95 }, { "epoch": 1.3850315599639313, "grad_norm": 0.5531078683869538, "learning_rate": 7.259728259057417e-05, "loss": 0.4567, "step": 96 }, { "epoch": 1.399458972046889, "grad_norm": 0.515899958416822, "learning_rate": 7.236067977499791e-05, "loss": 0.4578, "step": 97 }, { "epoch": 1.4138863841298468, "grad_norm": 0.3492664441384964, "learning_rate": 7.212075350079437e-05, "loss": 0.4561, "step": 98 }, { "epoch": 1.4283137962128043, "grad_norm": 0.42413300170898927, "learning_rate": 7.187752840851661e-05, "loss": 0.4569, "step": 99 }, { "epoch": 1.442741208295762, "grad_norm": 0.4947663891832909, "learning_rate": 7.163102947750794e-05, "loss": 0.456, "step": 100 }, { "epoch": 1.4571686203787195, "grad_norm": 0.36507776313239376, "learning_rate": 7.13812820233367e-05, "loss": 0.4592, "step": 101 }, { "epoch": 1.4715960324616773, "grad_norm": 0.37547804843247373, "learning_rate": 7.112831169519617e-05, "loss": 0.459, "step": 102 }, { "epoch": 1.4860234445446348, "grad_norm": 0.36635807000670995, "learning_rate": 7.087214447327049e-05, "loss": 0.4561, "step": 103 }, { "epoch": 1.5004508566275925, "grad_norm": 0.315478417939894, "learning_rate": 7.061280666606646e-05, "loss": 0.4563, "step": 104 }, { "epoch": 1.5148782687105502, "grad_norm": 0.4096625613828037, "learning_rate": 7.035032490771165e-05, "loss": 0.4541, "step": 105 }, { "epoch": 1.5293056807935077, "grad_norm": 0.4422620826291203, "learning_rate": 7.008472615521898e-05, "loss": 0.4508, "step": 106 }, { "epoch": 1.5437330928764652, "grad_norm": 0.3213468597989991, "learning_rate": 6.98160376857184e-05, "loss": 0.458, "step": 107 }, { "epoch": 1.558160504959423, "grad_norm": 0.35471415827924724, "learning_rate": 6.954428709365527e-05, "loss": 0.4563, "step": 108 }, { "epoch": 1.5725879170423807, "grad_norm": 0.4247233136060684, "learning_rate": 6.926950228795663e-05, "loss": 0.4516, "step": 109 }, { "epoch": 1.5870153291253382, "grad_norm": 0.31840084731849594, "learning_rate": 6.89917114891648e-05, "loss": 0.4547, "step": 110 }, { "epoch": 1.6014427412082957, "grad_norm": 0.3573055805732088, "learning_rate": 6.871094322653916e-05, "loss": 0.4574, "step": 111 }, { "epoch": 1.6158701532912534, "grad_norm": 0.33089511640034097, "learning_rate": 6.842722633512614e-05, "loss": 0.4568, "step": 112 }, { "epoch": 1.630297565374211, "grad_norm": 0.32234159444311866, "learning_rate": 6.814058995279793e-05, "loss": 0.4506, "step": 113 }, { "epoch": 1.6447249774571686, "grad_norm": 0.2842035714714675, "learning_rate": 6.785106351725992e-05, "loss": 0.4451, "step": 114 }, { "epoch": 1.6591523895401261, "grad_norm": 0.24782641096472402, "learning_rate": 6.755867676302747e-05, "loss": 0.4524, "step": 115 }, { "epoch": 1.6735798016230838, "grad_norm": 0.29530488172037256, "learning_rate": 6.726345971837217e-05, "loss": 0.4523, "step": 116 }, { "epoch": 1.6880072137060416, "grad_norm": 0.29231108013584617, "learning_rate": 6.69654427022379e-05, "loss": 0.448, "step": 117 }, { "epoch": 1.702434625788999, "grad_norm": 0.3209263624489444, "learning_rate": 6.666465632112707e-05, "loss": 0.4523, "step": 118 }, { "epoch": 1.7168620378719566, "grad_norm": 0.4315596822756952, "learning_rate": 6.636113146595729e-05, "loss": 0.4491, "step": 119 }, { "epoch": 1.7312894499549143, "grad_norm": 0.4570225349432179, "learning_rate": 6.60548993088889e-05, "loss": 0.4464, "step": 120 }, { "epoch": 1.745716862037872, "grad_norm": 0.44762480786064185, "learning_rate": 6.574599130012355e-05, "loss": 0.4548, "step": 121 }, { "epoch": 1.7601442741208295, "grad_norm": 0.4937434929135096, "learning_rate": 6.543443916467426e-05, "loss": 0.4503, "step": 122 }, { "epoch": 1.7745716862037872, "grad_norm": 0.606568052119448, "learning_rate": 6.512027489910718e-05, "loss": 0.4486, "step": 123 }, { "epoch": 1.788999098286745, "grad_norm": 0.6858758315433683, "learning_rate": 6.480353076825566e-05, "loss": 0.449, "step": 124 }, { "epoch": 1.8034265103697025, "grad_norm": 0.5123808792652511, "learning_rate": 6.448423930190653e-05, "loss": 0.4464, "step": 125 }, { "epoch": 1.81785392245266, "grad_norm": 0.38964320431553595, "learning_rate": 6.416243329145923e-05, "loss": 0.4475, "step": 126 }, { "epoch": 1.8322813345356177, "grad_norm": 0.35099016991264836, "learning_rate": 6.383814578655829e-05, "loss": 0.4547, "step": 127 }, { "epoch": 1.8467087466185754, "grad_norm": 0.3451471240491199, "learning_rate": 6.351141009169893e-05, "loss": 0.4502, "step": 128 }, { "epoch": 1.861136158701533, "grad_norm": 0.33153601295599006, "learning_rate": 6.31822597628068e-05, "loss": 0.4487, "step": 129 }, { "epoch": 1.8755635707844904, "grad_norm": 0.34266592441777854, "learning_rate": 6.28507286037917e-05, "loss": 0.4477, "step": 130 }, { "epoch": 1.8899909828674482, "grad_norm": 0.3492224166038735, "learning_rate": 6.251685066307592e-05, "loss": 0.4577, "step": 131 }, { "epoch": 1.9044183949504059, "grad_norm": 0.2600600833378922, "learning_rate": 6.218066023009743e-05, "loss": 0.4491, "step": 132 }, { "epoch": 1.9188458070333634, "grad_norm": 0.2930478733859803, "learning_rate": 6.184219183178842e-05, "loss": 0.4378, "step": 133 }, { "epoch": 1.9332732191163209, "grad_norm": 0.344123397095677, "learning_rate": 6.150148022902922e-05, "loss": 0.4486, "step": 134 }, { "epoch": 1.9477006311992786, "grad_norm": 0.32732494053257644, "learning_rate": 6.11585604130785e-05, "loss": 0.4451, "step": 135 }, { "epoch": 1.9621280432822363, "grad_norm": 0.25454887232448653, "learning_rate": 6.081346760197953e-05, "loss": 0.4435, "step": 136 }, { "epoch": 1.9765554553651938, "grad_norm": 0.21336525188734806, "learning_rate": 6.04662372369433e-05, "loss": 0.4459, "step": 137 }, { "epoch": 1.9909828674481513, "grad_norm": 0.21510264038063648, "learning_rate": 6.0116904978708716e-05, "loss": 0.4451, "step": 138 }, { "epoch": 2.0054102795311093, "grad_norm": 0.3886088967850276, "learning_rate": 5.976550670388023e-05, "loss": 0.7365, "step": 139 }, { "epoch": 2.019837691614067, "grad_norm": 0.5461141945560231, "learning_rate": 5.941207850124325e-05, "loss": 0.4274, "step": 140 }, { "epoch": 2.0342651036970243, "grad_norm": 0.7233438360497401, "learning_rate": 5.9056656668057806e-05, "loss": 0.4257, "step": 141 }, { "epoch": 2.048692515779982, "grad_norm": 0.902604447839341, "learning_rate": 5.8699277706330854e-05, "loss": 0.4327, "step": 142 }, { "epoch": 2.0631199278629397, "grad_norm": 0.9842345625256362, "learning_rate": 5.833997831906746e-05, "loss": 0.4206, "step": 143 }, { "epoch": 2.0775473399458972, "grad_norm": 0.7550138291557669, "learning_rate": 5.7978795406501365e-05, "loss": 0.4213, "step": 144 }, { "epoch": 2.0919747520288547, "grad_norm": 0.5725375243656562, "learning_rate": 5.761576606230538e-05, "loss": 0.4232, "step": 145 }, { "epoch": 2.1064021641118122, "grad_norm": 0.5871563051625412, "learning_rate": 5.725092756978177e-05, "loss": 0.4268, "step": 146 }, { "epoch": 2.12082957619477, "grad_norm": 0.6848078352834541, "learning_rate": 5.688431739803328e-05, "loss": 0.4231, "step": 147 }, { "epoch": 2.1352569882777277, "grad_norm": 0.47360287031992565, "learning_rate": 5.651597319811505e-05, "loss": 0.4245, "step": 148 }, { "epoch": 2.149684400360685, "grad_norm": 0.43267908202913546, "learning_rate": 5.6145932799167795e-05, "loss": 0.421, "step": 149 }, { "epoch": 2.164111812443643, "grad_norm": 0.5225940009578477, "learning_rate": 5.5774234204532746e-05, "loss": 0.4171, "step": 150 }, { "epoch": 2.1785392245266006, "grad_norm": 0.345292795118154, "learning_rate": 5.5400915587848713e-05, "loss": 0.4176, "step": 151 }, { "epoch": 2.192966636609558, "grad_norm": 0.37397788119190706, "learning_rate": 5.502601528913161e-05, "loss": 0.4185, "step": 152 }, { "epoch": 2.2073940486925157, "grad_norm": 0.33142951490345385, "learning_rate": 5.464957181083692e-05, "loss": 0.4185, "step": 153 }, { "epoch": 2.2218214607754736, "grad_norm": 0.2921058390866845, "learning_rate": 5.427162381390543e-05, "loss": 0.417, "step": 154 }, { "epoch": 2.236248872858431, "grad_norm": 0.34198696626119557, "learning_rate": 5.389221011379281e-05, "loss": 0.4165, "step": 155 }, { "epoch": 2.2506762849413886, "grad_norm": 0.26908479849148176, "learning_rate": 5.351136967648323e-05, "loss": 0.4193, "step": 156 }, { "epoch": 2.265103697024346, "grad_norm": 0.31962185227765055, "learning_rate": 5.3129141614487456e-05, "loss": 0.4279, "step": 157 }, { "epoch": 2.279531109107304, "grad_norm": 0.376211538661627, "learning_rate": 5.274556518282607e-05, "loss": 0.4195, "step": 158 }, { "epoch": 2.2939585211902616, "grad_norm": 0.28546559354766526, "learning_rate": 5.23606797749979e-05, "loss": 0.4199, "step": 159 }, { "epoch": 2.308385933273219, "grad_norm": 0.35404717031780875, "learning_rate": 5.1974524918934336e-05, "loss": 0.4194, "step": 160 }, { "epoch": 2.3228133453561766, "grad_norm": 0.32804234637360613, "learning_rate": 5.15871402729397e-05, "loss": 0.4215, "step": 161 }, { "epoch": 2.3372407574391345, "grad_norm": 0.25853378935309307, "learning_rate": 5.1198565621618444e-05, "loss": 0.42, "step": 162 }, { "epoch": 2.351668169522092, "grad_norm": 0.29254513463752485, "learning_rate": 5.0808840871789155e-05, "loss": 0.4137, "step": 163 }, { "epoch": 2.3660955816050495, "grad_norm": 0.2324430211066698, "learning_rate": 5.0418006048386134e-05, "loss": 0.4174, "step": 164 }, { "epoch": 2.3805229936880075, "grad_norm": 0.22977260261166277, "learning_rate": 5.002610129034883e-05, "loss": 0.418, "step": 165 }, { "epoch": 2.394950405770965, "grad_norm": 0.25178175225388516, "learning_rate": 4.963316684649951e-05, "loss": 0.4215, "step": 166 }, { "epoch": 2.4093778178539225, "grad_norm": 0.18022661655296157, "learning_rate": 4.923924307140974e-05, "loss": 0.414, "step": 167 }, { "epoch": 2.42380522993688, "grad_norm": 0.23950853172671158, "learning_rate": 4.8844370421255886e-05, "loss": 0.419, "step": 168 }, { "epoch": 2.4382326420198375, "grad_norm": 0.19718161732313788, "learning_rate": 4.8448589449664305e-05, "loss": 0.4124, "step": 169 }, { "epoch": 2.4526600541027954, "grad_norm": 0.1804834440563653, "learning_rate": 4.805194080354641e-05, "loss": 0.4179, "step": 170 }, { "epoch": 2.467087466185753, "grad_norm": 0.20353053079969263, "learning_rate": 4.765446521892426e-05, "loss": 0.4104, "step": 171 }, { "epoch": 2.4815148782687104, "grad_norm": 0.16177819342894753, "learning_rate": 4.725620351674693e-05, "loss": 0.4202, "step": 172 }, { "epoch": 2.4959422903516684, "grad_norm": 0.16071769506654357, "learning_rate": 4.685719659869815e-05, "loss": 0.4083, "step": 173 }, { "epoch": 2.510369702434626, "grad_norm": 0.1725361486750181, "learning_rate": 4.645748544299574e-05, "loss": 0.4153, "step": 174 }, { "epoch": 2.5247971145175834, "grad_norm": 0.16753825050295582, "learning_rate": 4.605711110018307e-05, "loss": 0.4123, "step": 175 }, { "epoch": 2.539224526600541, "grad_norm": 0.17717032081528933, "learning_rate": 4.565611468891318e-05, "loss": 0.4129, "step": 176 }, { "epoch": 2.5536519386834984, "grad_norm": 0.1564598566236543, "learning_rate": 4.525453739172586e-05, "loss": 0.4117, "step": 177 }, { "epoch": 2.5680793507664563, "grad_norm": 0.15287663289000603, "learning_rate": 4.48524204508182e-05, "loss": 0.4183, "step": 178 }, { "epoch": 2.582506762849414, "grad_norm": 0.18206218031669835, "learning_rate": 4.444980516380895e-05, "loss": 0.4117, "step": 179 }, { "epoch": 2.5969341749323718, "grad_norm": 0.16895498094131148, "learning_rate": 4.4046732879497295e-05, "loss": 0.4148, "step": 180 }, { "epoch": 2.6113615870153293, "grad_norm": 0.20384116046961775, "learning_rate": 4.364324499361626e-05, "loss": 0.4121, "step": 181 }, { "epoch": 2.625788999098287, "grad_norm": 0.18201505177744084, "learning_rate": 4.3239382944581384e-05, "loss": 0.4154, "step": 182 }, { "epoch": 2.6402164111812443, "grad_norm": 0.16531279832670212, "learning_rate": 4.283518820923492e-05, "loss": 0.4134, "step": 183 }, { "epoch": 2.654643823264202, "grad_norm": 0.17869608399055636, "learning_rate": 4.243070229858624e-05, "loss": 0.4167, "step": 184 }, { "epoch": 2.6690712353471597, "grad_norm": 0.15659192579938305, "learning_rate": 4.202596675354851e-05, "loss": 0.415, "step": 185 }, { "epoch": 2.6834986474301172, "grad_norm": 0.1729110016630772, "learning_rate": 4.1621023140672524e-05, "loss": 0.4149, "step": 186 }, { "epoch": 2.6979260595130747, "grad_norm": 0.17987624911793657, "learning_rate": 4.121591304787772e-05, "loss": 0.4128, "step": 187 }, { "epoch": 2.7123534715960327, "grad_norm": 0.16277022431055213, "learning_rate": 4.081067808018111e-05, "loss": 0.4115, "step": 188 }, { "epoch": 2.72678088367899, "grad_norm": 0.1614060894054725, "learning_rate": 4.040535985542445e-05, "loss": 0.4188, "step": 189 }, { "epoch": 2.7412082957619477, "grad_norm": 0.1498519807080618, "learning_rate": 4e-05, "loss": 0.4172, "step": 190 }, { "epoch": 2.755635707844905, "grad_norm": 0.1604036678202687, "learning_rate": 3.959464014457557e-05, "loss": 0.4077, "step": 191 }, { "epoch": 2.7700631199278627, "grad_norm": 0.13770932722249057, "learning_rate": 3.91893219198189e-05, "loss": 0.4195, "step": 192 }, { "epoch": 2.7844905320108206, "grad_norm": 0.15035210016285183, "learning_rate": 3.87840869521223e-05, "loss": 0.4134, "step": 193 }, { "epoch": 2.798917944093778, "grad_norm": 0.15201640612716522, "learning_rate": 3.837897685932748e-05, "loss": 0.4106, "step": 194 }, { "epoch": 2.8133453561767356, "grad_norm": 0.13650157280906988, "learning_rate": 3.7974033246451496e-05, "loss": 0.4156, "step": 195 }, { "epoch": 2.8277727682596936, "grad_norm": 0.17964938669673042, "learning_rate": 3.7569297701413765e-05, "loss": 0.4154, "step": 196 }, { "epoch": 2.842200180342651, "grad_norm": 0.1243561060549184, "learning_rate": 3.716481179076509e-05, "loss": 0.4197, "step": 197 }, { "epoch": 2.8566275924256086, "grad_norm": 0.17089769484487582, "learning_rate": 3.676061705541864e-05, "loss": 0.4152, "step": 198 }, { "epoch": 2.871055004508566, "grad_norm": 0.17561155960318975, "learning_rate": 3.635675500638375e-05, "loss": 0.4167, "step": 199 }, { "epoch": 2.885482416591524, "grad_norm": 0.16307396978150157, "learning_rate": 3.595326712050272e-05, "loss": 0.418, "step": 200 }, { "epoch": 2.8999098286744815, "grad_norm": 0.18681533479112983, "learning_rate": 3.555019483619106e-05, "loss": 0.418, "step": 201 }, { "epoch": 2.914337240757439, "grad_norm": 0.1692680534023291, "learning_rate": 3.5147579549181805e-05, "loss": 0.4095, "step": 202 }, { "epoch": 2.928764652840397, "grad_norm": 0.1647112968457325, "learning_rate": 3.4745462608274143e-05, "loss": 0.421, "step": 203 }, { "epoch": 2.9431920649233545, "grad_norm": 0.1645824019282664, "learning_rate": 3.434388531108683e-05, "loss": 0.4201, "step": 204 }, { "epoch": 2.957619477006312, "grad_norm": 0.16193821543079018, "learning_rate": 3.394288889981695e-05, "loss": 0.4144, "step": 205 }, { "epoch": 2.9720468890892695, "grad_norm": 0.15576654979169963, "learning_rate": 3.354251455700427e-05, "loss": 0.421, "step": 206 }, { "epoch": 2.986474301172227, "grad_norm": 0.11204665102016201, "learning_rate": 3.314280340130187e-05, "loss": 0.4169, "step": 207 }, { "epoch": 3.000901713255185, "grad_norm": 0.2597744580379488, "learning_rate": 3.274379648325308e-05, "loss": 0.7047, "step": 208 }, { "epoch": 3.0153291253381425, "grad_norm": 0.29242080182868324, "learning_rate": 3.234553478107575e-05, "loss": 0.3922, "step": 209 }, { "epoch": 3.0297565374211, "grad_norm": 0.15554632560519327, "learning_rate": 3.194805919645359e-05, "loss": 0.3914, "step": 210 }, { "epoch": 3.044183949504058, "grad_norm": 0.22638176078144323, "learning_rate": 3.155141055033571e-05, "loss": 0.389, "step": 211 }, { "epoch": 3.0586113615870154, "grad_norm": 0.22235251051875934, "learning_rate": 3.115562957874413e-05, "loss": 0.3894, "step": 212 }, { "epoch": 3.073038773669973, "grad_norm": 0.14895254239929756, "learning_rate": 3.0760756928590265e-05, "loss": 0.3855, "step": 213 }, { "epoch": 3.0874661857529304, "grad_norm": 0.21985837426895496, "learning_rate": 3.0366833153500502e-05, "loss": 0.3899, "step": 214 }, { "epoch": 3.1018935978358884, "grad_norm": 0.1448532296100453, "learning_rate": 2.997389870965118e-05, "loss": 0.3853, "step": 215 }, { "epoch": 3.116321009918846, "grad_norm": 0.18340169272282977, "learning_rate": 2.958199395161388e-05, "loss": 0.3885, "step": 216 }, { "epoch": 3.1307484220018034, "grad_norm": 0.16252308646393857, "learning_rate": 2.9191159128210865e-05, "loss": 0.388, "step": 217 }, { "epoch": 3.145175834084761, "grad_norm": 0.15643803474572993, "learning_rate": 2.8801434378381566e-05, "loss": 0.3918, "step": 218 }, { "epoch": 3.159603246167719, "grad_norm": 0.16477382717354483, "learning_rate": 2.841285972706032e-05, "loss": 0.3848, "step": 219 }, { "epoch": 3.1740306582506763, "grad_norm": 0.1428234224200868, "learning_rate": 2.8025475081065684e-05, "loss": 0.3916, "step": 220 }, { "epoch": 3.188458070333634, "grad_norm": 0.15459593532143248, "learning_rate": 2.7639320225002108e-05, "loss": 0.3868, "step": 221 }, { "epoch": 3.2028854824165913, "grad_norm": 0.1376918823828853, "learning_rate": 2.725443481717394e-05, "loss": 0.3869, "step": 222 }, { "epoch": 3.2173128944995493, "grad_norm": 0.12950376396508245, "learning_rate": 2.687085838551255e-05, "loss": 0.391, "step": 223 }, { "epoch": 3.2317403065825068, "grad_norm": 0.15236052575941866, "learning_rate": 2.6488630323516785e-05, "loss": 0.3854, "step": 224 }, { "epoch": 3.2461677186654643, "grad_norm": 0.12413662200660247, "learning_rate": 2.6107789886207195e-05, "loss": 0.3932, "step": 225 }, { "epoch": 3.260595130748422, "grad_norm": 0.12948714851227347, "learning_rate": 2.5728376186094582e-05, "loss": 0.392, "step": 226 }, { "epoch": 3.2750225428313797, "grad_norm": 0.13509083763614343, "learning_rate": 2.5350428189163095e-05, "loss": 0.3893, "step": 227 }, { "epoch": 3.2894499549143372, "grad_norm": 0.11596299194935494, "learning_rate": 2.4973984710868394e-05, "loss": 0.3853, "step": 228 }, { "epoch": 3.3038773669972947, "grad_norm": 0.11495064647362904, "learning_rate": 2.4599084412151283e-05, "loss": 0.3881, "step": 229 }, { "epoch": 3.3183047790802522, "grad_norm": 0.11377790156854924, "learning_rate": 2.4225765795467267e-05, "loss": 0.3881, "step": 230 }, { "epoch": 3.33273219116321, "grad_norm": 0.11176541174980999, "learning_rate": 2.3854067200832226e-05, "loss": 0.3849, "step": 231 }, { "epoch": 3.3471596032461677, "grad_norm": 0.10932782133038507, "learning_rate": 2.348402680188496e-05, "loss": 0.3913, "step": 232 }, { "epoch": 3.361587015329125, "grad_norm": 0.12116739999194517, "learning_rate": 2.3115682601966726e-05, "loss": 0.3909, "step": 233 }, { "epoch": 3.376014427412083, "grad_norm": 0.11683332854779228, "learning_rate": 2.274907243021824e-05, "loss": 0.384, "step": 234 }, { "epoch": 3.3904418394950406, "grad_norm": 0.10329122415194655, "learning_rate": 2.2384233937694626e-05, "loss": 0.3891, "step": 235 }, { "epoch": 3.404869251577998, "grad_norm": 0.11676332764357526, "learning_rate": 2.202120459349864e-05, "loss": 0.3879, "step": 236 }, { "epoch": 3.4192966636609556, "grad_norm": 0.11043415196225377, "learning_rate": 2.1660021680932565e-05, "loss": 0.3907, "step": 237 }, { "epoch": 3.4337240757439136, "grad_norm": 0.10352103209720392, "learning_rate": 2.130072229366916e-05, "loss": 0.3868, "step": 238 }, { "epoch": 3.448151487826871, "grad_norm": 0.11106271281959253, "learning_rate": 2.0943343331942208e-05, "loss": 0.3872, "step": 239 }, { "epoch": 3.4625788999098286, "grad_norm": 0.100859861129825, "learning_rate": 2.0587921498756768e-05, "loss": 0.3841, "step": 240 }, { "epoch": 3.4770063119927865, "grad_norm": 0.11902184783806945, "learning_rate": 2.0234493296119776e-05, "loss": 0.389, "step": 241 }, { "epoch": 3.491433724075744, "grad_norm": 0.09752054045307186, "learning_rate": 1.9883095021291294e-05, "loss": 0.3894, "step": 242 }, { "epoch": 3.5058611361587015, "grad_norm": 0.1157887524298405, "learning_rate": 1.9533762763056714e-05, "loss": 0.3864, "step": 243 }, { "epoch": 3.520288548241659, "grad_norm": 0.0962545228356216, "learning_rate": 1.918653239802048e-05, "loss": 0.3911, "step": 244 }, { "epoch": 3.5347159603246165, "grad_norm": 0.11589978846585437, "learning_rate": 1.8841439586921515e-05, "loss": 0.3873, "step": 245 }, { "epoch": 3.5491433724075745, "grad_norm": 0.10235501875925748, "learning_rate": 1.849851977097078e-05, "loss": 0.3919, "step": 246 }, { "epoch": 3.563570784490532, "grad_norm": 0.10642762647275054, "learning_rate": 1.8157808168211605e-05, "loss": 0.3862, "step": 247 }, { "epoch": 3.5779981965734895, "grad_norm": 0.10705125409234852, "learning_rate": 1.7819339769902568e-05, "loss": 0.3826, "step": 248 }, { "epoch": 3.5924256086564474, "grad_norm": 0.11011000435589068, "learning_rate": 1.7483149336924105e-05, "loss": 0.3896, "step": 249 }, { "epoch": 3.606853020739405, "grad_norm": 0.10299367912221409, "learning_rate": 1.71492713962083e-05, "loss": 0.3818, "step": 250 }, { "epoch": 3.6212804328223624, "grad_norm": 0.09896534243305305, "learning_rate": 1.6817740237193213e-05, "loss": 0.3899, "step": 251 }, { "epoch": 3.63570784490532, "grad_norm": 0.10057029872247607, "learning_rate": 1.648858990830108e-05, "loss": 0.3865, "step": 252 }, { "epoch": 3.6501352569882775, "grad_norm": 0.10556137735012057, "learning_rate": 1.6161854213441724e-05, "loss": 0.3857, "step": 253 }, { "epoch": 3.6645626690712354, "grad_norm": 0.09912849463045817, "learning_rate": 1.5837566708540776e-05, "loss": 0.3882, "step": 254 }, { "epoch": 3.678990081154193, "grad_norm": 0.10873331871358806, "learning_rate": 1.5515760698093485e-05, "loss": 0.3913, "step": 255 }, { "epoch": 3.693417493237151, "grad_norm": 0.10135375429134282, "learning_rate": 1.5196469231744338e-05, "loss": 0.3918, "step": 256 }, { "epoch": 3.7078449053201084, "grad_norm": 0.101442765978251, "learning_rate": 1.4879725100892821e-05, "loss": 0.3898, "step": 257 }, { "epoch": 3.722272317403066, "grad_norm": 0.09944828474807176, "learning_rate": 1.456556083532577e-05, "loss": 0.3888, "step": 258 }, { "epoch": 3.7366997294860234, "grad_norm": 0.10063514199400612, "learning_rate": 1.4254008699876468e-05, "loss": 0.3875, "step": 259 }, { "epoch": 3.751127141568981, "grad_norm": 0.1073202284969319, "learning_rate": 1.394510069111112e-05, "loss": 0.3825, "step": 260 }, { "epoch": 3.765554553651939, "grad_norm": 0.11199636802016412, "learning_rate": 1.3638868534042732e-05, "loss": 0.3912, "step": 261 }, { "epoch": 3.7799819657348963, "grad_norm": 0.09460154248342248, "learning_rate": 1.3335343678872947e-05, "loss": 0.3919, "step": 262 }, { "epoch": 3.794409377817854, "grad_norm": 0.10030251095406782, "learning_rate": 1.3034557297762108e-05, "loss": 0.3897, "step": 263 }, { "epoch": 3.8088367899008118, "grad_norm": 0.09707674946485532, "learning_rate": 1.2736540281627833e-05, "loss": 0.3882, "step": 264 }, { "epoch": 3.8232642019837693, "grad_norm": 0.10066191197693501, "learning_rate": 1.2441323236972536e-05, "loss": 0.3838, "step": 265 }, { "epoch": 3.8376916140667268, "grad_norm": 0.09882561767806158, "learning_rate": 1.2148936482740106e-05, "loss": 0.3876, "step": 266 }, { "epoch": 3.8521190261496843, "grad_norm": 0.09393577574639751, "learning_rate": 1.1859410047202076e-05, "loss": 0.3949, "step": 267 }, { "epoch": 3.8665464382326418, "grad_norm": 0.10491601613830169, "learning_rate": 1.1572773664873877e-05, "loss": 0.3945, "step": 268 }, { "epoch": 3.8809738503155997, "grad_norm": 0.09433909557518863, "learning_rate": 1.1289056773460848e-05, "loss": 0.3907, "step": 269 }, { "epoch": 3.895401262398557, "grad_norm": 0.09718276334267877, "learning_rate": 1.100828851083521e-05, "loss": 0.3892, "step": 270 }, { "epoch": 3.9098286744815147, "grad_norm": 0.09130729699370443, "learning_rate": 1.0730497712043375e-05, "loss": 0.3877, "step": 271 }, { "epoch": 3.9242560865644727, "grad_norm": 0.0989960086350818, "learning_rate": 1.0455712906344742e-05, "loss": 0.3905, "step": 272 }, { "epoch": 3.93868349864743, "grad_norm": 0.08478658948822386, "learning_rate": 1.0183962314281616e-05, "loss": 0.3809, "step": 273 }, { "epoch": 3.9531109107303877, "grad_norm": 0.08732293651393247, "learning_rate": 9.91527384478102e-06, "loss": 0.3909, "step": 274 }, { "epoch": 3.967538322813345, "grad_norm": 0.09248422321017552, "learning_rate": 9.649675092288366e-06, "loss": 0.3904, "step": 275 }, { "epoch": 3.981965734896303, "grad_norm": 0.08874068919252195, "learning_rate": 9.387193333933542e-06, "loss": 0.3901, "step": 276 }, { "epoch": 3.9963931469792606, "grad_norm": 0.10386496694166722, "learning_rate": 9.127855526729518e-06, "loss": 0.4421, "step": 277 }, { "epoch": 4.010820559062219, "grad_norm": 0.17465874605366377, "learning_rate": 8.87168830480385e-06, "loss": 0.5908, "step": 278 }, { "epoch": 4.025247971145176, "grad_norm": 0.10653039126787628, "learning_rate": 8.618717976663316e-06, "loss": 0.3731, "step": 279 }, { "epoch": 4.039675383228134, "grad_norm": 0.09575070816517416, "learning_rate": 8.368970522492064e-06, "loss": 0.368, "step": 280 }, { "epoch": 4.054102795311091, "grad_norm": 0.10014800890252488, "learning_rate": 8.122471591483405e-06, "loss": 0.379, "step": 281 }, { "epoch": 4.068530207394049, "grad_norm": 0.10719719334181581, "learning_rate": 7.879246499205635e-06, "loss": 0.3747, "step": 282 }, { "epoch": 4.082957619477006, "grad_norm": 0.1034029506118448, "learning_rate": 7.639320225002106e-06, "loss": 0.3675, "step": 283 }, { "epoch": 4.097385031559964, "grad_norm": 0.09719883839292859, "learning_rate": 7.402717409425846e-06, "loss": 0.3745, "step": 284 }, { "epoch": 4.111812443642922, "grad_norm": 0.09348527541393592, "learning_rate": 7.169462351708958e-06, "loss": 0.3746, "step": 285 }, { "epoch": 4.1262398557258795, "grad_norm": 0.09622234742870885, "learning_rate": 6.939579007267041e-06, "loss": 0.3669, "step": 286 }, { "epoch": 4.140667267808837, "grad_norm": 0.10062068129651956, "learning_rate": 6.7130909852390504e-06, "loss": 0.377, "step": 287 }, { "epoch": 4.1550946798917945, "grad_norm": 0.09063820087374816, "learning_rate": 6.490021546062495e-06, "loss": 0.3725, "step": 288 }, { "epoch": 4.169522091974752, "grad_norm": 0.0978449706487065, "learning_rate": 6.270393599084719e-06, "loss": 0.3701, "step": 289 }, { "epoch": 4.1839495040577095, "grad_norm": 0.09420394070648874, "learning_rate": 6.054229700209959e-06, "loss": 0.3686, "step": 290 }, { "epoch": 4.198376916140667, "grad_norm": 0.09135183952593588, "learning_rate": 5.841552049582979e-06, "loss": 0.3668, "step": 291 }, { "epoch": 4.2128043282236245, "grad_norm": 0.08941854382744684, "learning_rate": 5.632382489308983e-06, "loss": 0.3753, "step": 292 }, { "epoch": 4.227231740306583, "grad_norm": 0.09033071999727058, "learning_rate": 5.4267425012105e-06, "loss": 0.371, "step": 293 }, { "epoch": 4.24165915238954, "grad_norm": 0.08363022499101917, "learning_rate": 5.224653204621155e-06, "loss": 0.3699, "step": 294 }, { "epoch": 4.256086564472498, "grad_norm": 0.0794997380043983, "learning_rate": 5.026135354216717e-06, "loss": 0.3703, "step": 295 }, { "epoch": 4.270513976555455, "grad_norm": 0.08331150232989441, "learning_rate": 4.8312093378835645e-06, "loss": 0.3729, "step": 296 }, { "epoch": 4.284941388638413, "grad_norm": 0.08516826877199297, "learning_rate": 4.63989517462486e-06, "loss": 0.3757, "step": 297 }, { "epoch": 4.29936880072137, "grad_norm": 0.08386630568073708, "learning_rate": 4.452212512504579e-06, "loss": 0.3766, "step": 298 }, { "epoch": 4.313796212804328, "grad_norm": 0.08120526732790356, "learning_rate": 4.268180626629641e-06, "loss": 0.3751, "step": 299 }, { "epoch": 4.328223624887286, "grad_norm": 0.0797417427617793, "learning_rate": 4.087818417170337e-06, "loss": 0.3711, "step": 300 }, { "epoch": 4.342651036970244, "grad_norm": 0.08091306914486351, "learning_rate": 3.9111444074193e-06, "loss": 0.3704, "step": 301 }, { "epoch": 4.357078449053201, "grad_norm": 0.08277906868820106, "learning_rate": 3.7381767418891303e-06, "loss": 0.3736, "step": 302 }, { "epoch": 4.371505861136159, "grad_norm": 0.08109844725998167, "learning_rate": 3.568933184448944e-06, "loss": 0.3679, "step": 303 }, { "epoch": 4.385933273219116, "grad_norm": 0.076043565671384, "learning_rate": 3.403431116500038e-06, "loss": 0.3737, "step": 304 }, { "epoch": 4.400360685302074, "grad_norm": 0.0786325856472425, "learning_rate": 3.241687535190776e-06, "loss": 0.3722, "step": 305 }, { "epoch": 4.414788097385031, "grad_norm": 0.07882441543843179, "learning_rate": 3.08371905167101e-06, "loss": 0.3746, "step": 306 }, { "epoch": 4.429215509467989, "grad_norm": 0.0813528283180034, "learning_rate": 2.929541889386056e-06, "loss": 0.3698, "step": 307 }, { "epoch": 4.443642921550947, "grad_norm": 0.07778147610676125, "learning_rate": 2.7791718824106186e-06, "loss": 0.3747, "step": 308 }, { "epoch": 4.458070333633905, "grad_norm": 0.07497215994153009, "learning_rate": 2.6326244738225183e-06, "loss": 0.3793, "step": 309 }, { "epoch": 4.472497745716862, "grad_norm": 0.0751254260494879, "learning_rate": 2.489914714116788e-06, "loss": 0.3707, "step": 310 }, { "epoch": 4.48692515779982, "grad_norm": 0.0748304985473765, "learning_rate": 2.3510572596598678e-06, "loss": 0.3728, "step": 311 }, { "epoch": 4.501352569882777, "grad_norm": 0.07793338755657392, "learning_rate": 2.2160663711845176e-06, "loss": 0.3733, "step": 312 }, { "epoch": 4.515779981965735, "grad_norm": 0.07545418335066799, "learning_rate": 2.084955912325093e-06, "loss": 0.3663, "step": 313 }, { "epoch": 4.530207394048692, "grad_norm": 0.0784362383534773, "learning_rate": 1.957739348193859e-06, "loss": 0.3694, "step": 314 }, { "epoch": 4.544634806131651, "grad_norm": 0.07501282928300265, "learning_rate": 1.8344297439980475e-06, "loss": 0.3739, "step": 315 }, { "epoch": 4.559062218214608, "grad_norm": 0.07184746061800508, "learning_rate": 1.715039763698081e-06, "loss": 0.372, "step": 316 }, { "epoch": 4.573489630297566, "grad_norm": 0.0768286859431056, "learning_rate": 1.5995816687069687e-06, "loss": 0.367, "step": 317 }, { "epoch": 4.587917042380523, "grad_norm": 0.07399515759551432, "learning_rate": 1.4880673166310612e-06, "loss": 0.3734, "step": 318 }, { "epoch": 4.602344454463481, "grad_norm": 0.07232964813350647, "learning_rate": 1.3805081600522585e-06, "loss": 0.3697, "step": 319 }, { "epoch": 4.616771866546438, "grad_norm": 0.07389169724144744, "learning_rate": 1.276915245351833e-06, "loss": 0.3666, "step": 320 }, { "epoch": 4.631199278629396, "grad_norm": 0.071013638237935, "learning_rate": 1.1772992115759351e-06, "loss": 0.3704, "step": 321 }, { "epoch": 4.645626690712353, "grad_norm": 0.07478471657750946, "learning_rate": 1.081670289343002e-06, "loss": 0.372, "step": 322 }, { "epoch": 4.660054102795311, "grad_norm": 0.0713434887145259, "learning_rate": 9.900382997930413e-07, "loss": 0.3754, "step": 323 }, { "epoch": 4.674481514878269, "grad_norm": 0.0730981387918326, "learning_rate": 9.024126535789812e-07, "loss": 0.3684, "step": 324 }, { "epoch": 4.6889089269612265, "grad_norm": 0.07158489318241744, "learning_rate": 8.188023499002206e-07, "loss": 0.3808, "step": 325 }, { "epoch": 4.703336339044184, "grad_norm": 0.07013751377947393, "learning_rate": 7.392159755783957e-07, "loss": 0.3626, "step": 326 }, { "epoch": 4.7177637511271415, "grad_norm": 0.07150689349177662, "learning_rate": 6.636617041754978e-07, "loss": 0.3723, "step": 327 }, { "epoch": 4.732191163210099, "grad_norm": 0.07003675588222115, "learning_rate": 5.921472951544527e-07, "loss": 0.3689, "step": 328 }, { "epoch": 4.7466185752930565, "grad_norm": 0.06957634634931112, "learning_rate": 5.246800930822371e-07, "loss": 0.3751, "step": 329 }, { "epoch": 4.761045987376015, "grad_norm": 0.07044531133634477, "learning_rate": 4.6126702687554483e-07, "loss": 0.371, "step": 330 }, { "epoch": 4.775473399458972, "grad_norm": 0.07077686025594564, "learning_rate": 4.0191460908923563e-07, "loss": 0.3676, "step": 331 }, { "epoch": 4.78990081154193, "grad_norm": 0.07253004421887527, "learning_rate": 3.4662893524745276e-07, "loss": 0.3781, "step": 332 }, { "epoch": 4.804328223624887, "grad_norm": 0.07510496456067554, "learning_rate": 2.954156832176214e-07, "loss": 0.3783, "step": 333 }, { "epoch": 4.818755635707845, "grad_norm": 0.0706180896307413, "learning_rate": 2.482801126273371e-07, "loss": 0.371, "step": 334 }, { "epoch": 4.833183047790802, "grad_norm": 0.06847497940017412, "learning_rate": 2.0522706432419382e-07, "loss": 0.3702, "step": 335 }, { "epoch": 4.84761045987376, "grad_norm": 0.07040073406374138, "learning_rate": 1.6626095987862134e-07, "loss": 0.3703, "step": 336 }, { "epoch": 4.862037871956717, "grad_norm": 0.06922886880619938, "learning_rate": 1.3138580112979083e-07, "loss": 0.3693, "step": 337 }, { "epoch": 4.876465284039675, "grad_norm": 0.07071379713825318, "learning_rate": 1.0060516977462797e-07, "loss": 0.3683, "step": 338 }, { "epoch": 4.890892696122633, "grad_norm": 0.0707082821934966, "learning_rate": 7.39222269999651e-08, "loss": 0.3795, "step": 339 }, { "epoch": 4.905320108205591, "grad_norm": 0.06885902870580198, "learning_rate": 5.133971315788966e-08, "loss": 0.3671, "step": 340 }, { "epoch": 4.919747520288548, "grad_norm": 0.06873108316208869, "learning_rate": 3.285994748430721e-08, "loss": 0.3738, "step": 341 }, { "epoch": 4.934174932371506, "grad_norm": 0.06944038886061686, "learning_rate": 1.8484827860754118e-08, "loss": 0.3691, "step": 342 }, { "epoch": 4.948602344454463, "grad_norm": 0.07137425763584286, "learning_rate": 8.215830619486831e-09, "loss": 0.3709, "step": 343 }, { "epoch": 4.963029756537421, "grad_norm": 0.0716171836993154, "learning_rate": 2.054010391856487e-09, "loss": 0.3704, "step": 344 }, { "epoch": 4.977457168620378, "grad_norm": 0.0713787661901706, "learning_rate": 0.0, "loss": 0.3736, "step": 345 }, { "epoch": 4.977457168620378, "step": 345, "total_flos": 9.173613467414823e+18, "train_loss": 0.4462836230146712, "train_runtime": 80545.5288, "train_samples_per_second": 2.202, "train_steps_per_second": 0.004 } ], "logging_steps": 1, "max_steps": 345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.173613467414823e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }