{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.989937106918239, "eval_steps": 50000, "global_step": 594, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010062893081761006, "grad_norm": 1.6687748432159424, "learning_rate": 6.666666666666667e-08, "loss": 0.369, "step": 2 }, { "epoch": 0.02012578616352201, "grad_norm": 1.5292283296585083, "learning_rate": 1.3333333333333334e-07, "loss": 0.3732, "step": 4 }, { "epoch": 0.03018867924528302, "grad_norm": 1.5824713706970215, "learning_rate": 2e-07, "loss": -0.1619, "step": 6 }, { "epoch": 0.04025157232704402, "grad_norm": 4.105996608734131, "learning_rate": 2.6666666666666667e-07, "loss": 0.2398, "step": 8 }, { "epoch": 0.050314465408805034, "grad_norm": 1.4400302171707153, "learning_rate": 3.333333333333333e-07, "loss": -0.5621, "step": 10 }, { "epoch": 0.06037735849056604, "grad_norm": 2.3048486709594727, "learning_rate": 4e-07, "loss": -0.6602, "step": 12 }, { "epoch": 0.07044025157232704, "grad_norm": 2.4866607189178467, "learning_rate": 4.6666666666666666e-07, "loss": -1.24, "step": 14 }, { "epoch": 0.08050314465408805, "grad_norm": 3.4124677181243896, "learning_rate": 5.333333333333333e-07, "loss": 0.462, "step": 16 }, { "epoch": 0.09056603773584905, "grad_norm": 1.5936415195465088, "learning_rate": 6e-07, "loss": -0.0692, "step": 18 }, { "epoch": 0.10062893081761007, "grad_norm": 1.9987062215805054, "learning_rate": 6.666666666666666e-07, "loss": 0.5051, "step": 20 }, { "epoch": 0.11069182389937107, "grad_norm": 2.565603017807007, "learning_rate": 7.333333333333332e-07, "loss": -0.0248, "step": 22 }, { "epoch": 0.12075471698113208, "grad_norm": 3.2282676696777344, "learning_rate": 8e-07, "loss": -0.6335, "step": 24 }, { "epoch": 0.13081761006289308, "grad_norm": 1.868457555770874, "learning_rate": 8.666666666666667e-07, "loss": -0.8462, "step": 26 }, { "epoch": 0.14088050314465408, "grad_norm": 2.7205371856689453, "learning_rate": 9.333333333333333e-07, "loss": 2.6132, "step": 28 }, { "epoch": 0.1509433962264151, "grad_norm": 3.2904088497161865, "learning_rate": 1e-06, "loss": 0.4139, "step": 30 }, { "epoch": 0.1610062893081761, "grad_norm": 1.7929654121398926, "learning_rate": 1.0666666666666667e-06, "loss": 1.9297, "step": 32 }, { "epoch": 0.1710691823899371, "grad_norm": 2.788813591003418, "learning_rate": 1.1333333333333332e-06, "loss": -1.4279, "step": 34 }, { "epoch": 0.1811320754716981, "grad_norm": 1.792971134185791, "learning_rate": 1.2e-06, "loss": 0.1433, "step": 36 }, { "epoch": 0.19119496855345913, "grad_norm": 2.238489866256714, "learning_rate": 1.2666666666666665e-06, "loss": 0.3927, "step": 38 }, { "epoch": 0.20125786163522014, "grad_norm": 2.905518054962158, "learning_rate": 1.3333333333333332e-06, "loss": 1.079, "step": 40 }, { "epoch": 0.21132075471698114, "grad_norm": 1.6354607343673706, "learning_rate": 1.4e-06, "loss": 0.1258, "step": 42 }, { "epoch": 0.22138364779874214, "grad_norm": 2.0974748134613037, "learning_rate": 1.4666666666666665e-06, "loss": 0.0546, "step": 44 }, { "epoch": 0.23144654088050315, "grad_norm": 1.619780421257019, "learning_rate": 1.5333333333333334e-06, "loss": -1.0396, "step": 46 }, { "epoch": 0.24150943396226415, "grad_norm": 1.9667820930480957, "learning_rate": 1.6e-06, "loss": -0.4011, "step": 48 }, { "epoch": 0.25157232704402516, "grad_norm": 1.9112639427185059, "learning_rate": 1.6666666666666667e-06, "loss": 0.8607, "step": 50 }, { "epoch": 0.26163522012578616, "grad_norm": 2.6148829460144043, "learning_rate": 1.7333333333333334e-06, "loss": 0.6988, "step": 52 }, { "epoch": 0.27169811320754716, "grad_norm": 2.6693756580352783, "learning_rate": 1.8e-06, "loss": -1.0175, "step": 54 }, { "epoch": 0.28176100628930817, "grad_norm": 2.0184097290039062, "learning_rate": 1.8666666666666667e-06, "loss": -0.1263, "step": 56 }, { "epoch": 0.2918238993710692, "grad_norm": 1.4805622100830078, "learning_rate": 1.933333333333333e-06, "loss": -0.4554, "step": 58 }, { "epoch": 0.3018867924528302, "grad_norm": 1.6097267866134644, "learning_rate": 2e-06, "loss": 0.5408, "step": 60 }, { "epoch": 0.3119496855345912, "grad_norm": 1.720683217048645, "learning_rate": 1.9999307783070657e-06, "loss": 1.3892, "step": 62 }, { "epoch": 0.3220125786163522, "grad_norm": 2.825670003890991, "learning_rate": 1.999723122811548e-06, "loss": 0.9162, "step": 64 }, { "epoch": 0.3320754716981132, "grad_norm": 2.550844430923462, "learning_rate": 1.9993770622619783e-06, "loss": -0.1783, "step": 66 }, { "epoch": 0.3421383647798742, "grad_norm": 2.4842543601989746, "learning_rate": 1.998892644568149e-06, "loss": -1.0679, "step": 68 }, { "epoch": 0.3522012578616352, "grad_norm": 1.9450500011444092, "learning_rate": 1.9982699367944866e-06, "loss": 1.4075, "step": 70 }, { "epoch": 0.3622641509433962, "grad_norm": 2.419877052307129, "learning_rate": 1.9975090251507638e-06, "loss": -0.5993, "step": 72 }, { "epoch": 0.3723270440251572, "grad_norm": 1.7247552871704102, "learning_rate": 1.9966100149801647e-06, "loss": 1.2249, "step": 74 }, { "epoch": 0.38238993710691827, "grad_norm": 2.8694651126861572, "learning_rate": 1.995573030744701e-06, "loss": 0.279, "step": 76 }, { "epoch": 0.39245283018867927, "grad_norm": 3.444533586502075, "learning_rate": 1.994398216007982e-06, "loss": 2.4944, "step": 78 }, { "epoch": 0.4025157232704403, "grad_norm": 1.145507574081421, "learning_rate": 1.993085733415337e-06, "loss": -0.0775, "step": 80 }, { "epoch": 0.4125786163522013, "grad_norm": 2.018376111984253, "learning_rate": 1.9916357646713006e-06, "loss": -0.1244, "step": 82 }, { "epoch": 0.4226415094339623, "grad_norm": 3.317014694213867, "learning_rate": 1.9900485105144544e-06, "loss": -0.5761, "step": 84 }, { "epoch": 0.4327044025157233, "grad_norm": 1.426088809967041, "learning_rate": 1.9883241906896385e-06, "loss": 1.364, "step": 86 }, { "epoch": 0.4427672955974843, "grad_norm": 2.031130790710449, "learning_rate": 1.986463043917528e-06, "loss": 0.9214, "step": 88 }, { "epoch": 0.4528301886792453, "grad_norm": 2.133758068084717, "learning_rate": 1.984465327861583e-06, "loss": -1.4531, "step": 90 }, { "epoch": 0.4628930817610063, "grad_norm": 2.5162205696105957, "learning_rate": 1.9823313190923794e-06, "loss": -0.7078, "step": 92 }, { "epoch": 0.4729559748427673, "grad_norm": 1.5902796983718872, "learning_rate": 1.980061313049315e-06, "loss": -1.3553, "step": 94 }, { "epoch": 0.4830188679245283, "grad_norm": 2.366024971008301, "learning_rate": 1.9776556239997142e-06, "loss": 0.4744, "step": 96 }, { "epoch": 0.4930817610062893, "grad_norm": 2.211918354034424, "learning_rate": 1.975114584995313e-06, "loss": 0.532, "step": 98 }, { "epoch": 0.5031446540880503, "grad_norm": 1.664931058883667, "learning_rate": 1.972438547826156e-06, "loss": -0.5974, "step": 100 }, { "epoch": 0.5132075471698113, "grad_norm": 2.5771172046661377, "learning_rate": 1.969627882971888e-06, "loss": -0.4213, "step": 102 }, { "epoch": 0.5232704402515723, "grad_norm": 3.083601236343384, "learning_rate": 1.9666829795504693e-06, "loss": -1.491, "step": 104 }, { "epoch": 0.5333333333333333, "grad_norm": 3.069186210632324, "learning_rate": 1.9636042452643e-06, "loss": -0.6719, "step": 106 }, { "epoch": 0.5433962264150943, "grad_norm": 1.642295479774475, "learning_rate": 1.960392106343779e-06, "loss": -0.8876, "step": 108 }, { "epoch": 0.5534591194968553, "grad_norm": 2.7487986087799072, "learning_rate": 1.9570470074882946e-06, "loss": -0.8838, "step": 110 }, { "epoch": 0.5635220125786163, "grad_norm": 4.342981338500977, "learning_rate": 1.9535694118046583e-06, "loss": 0.6486, "step": 112 }, { "epoch": 0.5735849056603773, "grad_norm": 2.6165924072265625, "learning_rate": 1.949959800742991e-06, "loss": 0.901, "step": 114 }, { "epoch": 0.5836477987421383, "grad_norm": 3.7529544830322266, "learning_rate": 1.9462186740300695e-06, "loss": -1.5828, "step": 116 }, { "epoch": 0.5937106918238994, "grad_norm": 0.95662921667099, "learning_rate": 1.942346549600144e-06, "loss": -1.2115, "step": 118 }, { "epoch": 0.6037735849056604, "grad_norm": 3.2608375549316406, "learning_rate": 1.9383439635232293e-06, "loss": 1.1846, "step": 120 }, { "epoch": 0.6138364779874214, "grad_norm": 2.937685966491699, "learning_rate": 1.9342114699308956e-06, "loss": 0.5849, "step": 122 }, { "epoch": 0.6238993710691824, "grad_norm": 3.030308485031128, "learning_rate": 1.929949640939548e-06, "loss": 1.0768, "step": 124 }, { "epoch": 0.6339622641509434, "grad_norm": 1.6450515985488892, "learning_rate": 1.925559066571221e-06, "loss": -0.815, "step": 126 }, { "epoch": 0.6440251572327044, "grad_norm": 4.359044075012207, "learning_rate": 1.9210403546718966e-06, "loss": 1.4768, "step": 128 }, { "epoch": 0.6540880503144654, "grad_norm": 2.591158628463745, "learning_rate": 1.91639413082735e-06, "loss": 0.4688, "step": 130 }, { "epoch": 0.6641509433962264, "grad_norm": 3.594324827194214, "learning_rate": 1.9116210382765418e-06, "loss": -0.4207, "step": 132 }, { "epoch": 0.6742138364779874, "grad_norm": 4.136204242706299, "learning_rate": 1.9067217378225652e-06, "loss": -1.2546, "step": 134 }, { "epoch": 0.6842767295597484, "grad_norm": 3.1914331912994385, "learning_rate": 1.9016969077411645e-06, "loss": -1.6023, "step": 136 }, { "epoch": 0.6943396226415094, "grad_norm": 2.6611359119415283, "learning_rate": 1.8965472436868284e-06, "loss": 0.0919, "step": 138 }, { "epoch": 0.7044025157232704, "grad_norm": 3.068580150604248, "learning_rate": 1.8912734585964855e-06, "loss": 0.3057, "step": 140 }, { "epoch": 0.7144654088050314, "grad_norm": 7.307640552520752, "learning_rate": 1.8858762825907997e-06, "loss": 1.6571, "step": 142 }, { "epoch": 0.7245283018867924, "grad_norm": 2.129241943359375, "learning_rate": 1.8803564628730913e-06, "loss": 0.5422, "step": 144 }, { "epoch": 0.7345911949685534, "grad_norm": 3.773325204849243, "learning_rate": 1.8747147636258916e-06, "loss": 0.7144, "step": 146 }, { "epoch": 0.7446540880503144, "grad_norm": 1.3420393466949463, "learning_rate": 1.8689519659051466e-06, "loss": -1.1075, "step": 148 }, { "epoch": 0.7547169811320755, "grad_norm": 6.70538854598999, "learning_rate": 1.8630688675320841e-06, "loss": -1.9595, "step": 150 }, { "epoch": 0.7647798742138365, "grad_norm": 4.187305927276611, "learning_rate": 1.857066282982763e-06, "loss": -0.5234, "step": 152 }, { "epoch": 0.7748427672955975, "grad_norm": 2.975940465927124, "learning_rate": 1.850945043275312e-06, "loss": -0.3984, "step": 154 }, { "epoch": 0.7849056603773585, "grad_norm": 2.44286847114563, "learning_rate": 1.844705995854882e-06, "loss": 1.109, "step": 156 }, { "epoch": 0.7949685534591195, "grad_norm": 12.523564338684082, "learning_rate": 1.8383500044763226e-06, "loss": -2.0379, "step": 158 }, { "epoch": 0.8050314465408805, "grad_norm": 4.5152716636657715, "learning_rate": 1.8318779490846e-06, "loss": -0.6498, "step": 160 }, { "epoch": 0.8150943396226416, "grad_norm": 2.565892457962036, "learning_rate": 1.8252907256929774e-06, "loss": 0.039, "step": 162 }, { "epoch": 0.8251572327044026, "grad_norm": 3.789813756942749, "learning_rate": 1.8185892462589636e-06, "loss": -0.0521, "step": 164 }, { "epoch": 0.8352201257861636, "grad_norm": 4.709334373474121, "learning_rate": 1.8117744385580623e-06, "loss": -0.7899, "step": 166 }, { "epoch": 0.8452830188679246, "grad_norm": 2.444716453552246, "learning_rate": 1.8048472460553256e-06, "loss": 0.2275, "step": 168 }, { "epoch": 0.8553459119496856, "grad_norm": 2.314274549484253, "learning_rate": 1.7978086277747379e-06, "loss": -0.9168, "step": 170 }, { "epoch": 0.8654088050314466, "grad_norm": 3.4260716438293457, "learning_rate": 1.7906595581664461e-06, "loss": -0.6274, "step": 172 }, { "epoch": 0.8754716981132076, "grad_norm": 2.7144453525543213, "learning_rate": 1.7834010269718524e-06, "loss": -0.9649, "step": 174 }, { "epoch": 0.8855345911949686, "grad_norm": 3.8050897121429443, "learning_rate": 1.7760340390865917e-06, "loss": -0.262, "step": 176 }, { "epoch": 0.8955974842767296, "grad_norm": 2.8164639472961426, "learning_rate": 1.7685596144214107e-06, "loss": -1.1909, "step": 178 }, { "epoch": 0.9056603773584906, "grad_norm": 4.633458614349365, "learning_rate": 1.7609787877609676e-06, "loss": 0.4428, "step": 180 }, { "epoch": 0.9157232704402516, "grad_norm": 2.8389792442321777, "learning_rate": 1.7532926086205726e-06, "loss": -0.5821, "step": 182 }, { "epoch": 0.9257861635220126, "grad_norm": 2.226238965988159, "learning_rate": 1.7455021411008906e-06, "loss": 0.3515, "step": 184 }, { "epoch": 0.9358490566037736, "grad_norm": 2.591329336166382, "learning_rate": 1.737608463740622e-06, "loss": -0.306, "step": 186 }, { "epoch": 0.9459119496855346, "grad_norm": 3.7576334476470947, "learning_rate": 1.7296126693671882e-06, "loss": 0.1704, "step": 188 }, { "epoch": 0.9559748427672956, "grad_norm": 2.887920618057251, "learning_rate": 1.7215158649454346e-06, "loss": -0.2494, "step": 190 }, { "epoch": 0.9660377358490566, "grad_norm": 4.349538326263428, "learning_rate": 1.7133191714243802e-06, "loss": 2.3405, "step": 192 }, { "epoch": 0.9761006289308176, "grad_norm": 4.317368984222412, "learning_rate": 1.7050237235820287e-06, "loss": 0.4566, "step": 194 }, { "epoch": 0.9861635220125786, "grad_norm": 5.087897300720215, "learning_rate": 1.696630669868267e-06, "loss": 0.1502, "step": 196 }, { "epoch": 0.9962264150943396, "grad_norm": 4.70991325378418, "learning_rate": 1.6881411722458687e-06, "loss": -0.3574, "step": 198 }, { "epoch": 1.0069182389937108, "grad_norm": 2.966017007827759, "learning_rate": 1.6795564060296292e-06, "loss": 0.9311, "step": 200 }, { "epoch": 1.0169811320754718, "grad_norm": 2.300924777984619, "learning_rate": 1.6708775597236505e-06, "loss": 0.2717, "step": 202 }, { "epoch": 1.0270440251572328, "grad_norm": 6.384905815124512, "learning_rate": 1.6621058348568004e-06, "loss": -0.0504, "step": 204 }, { "epoch": 1.0371069182389938, "grad_norm": 4.002950668334961, "learning_rate": 1.6532424458163691e-06, "loss": -0.2334, "step": 206 }, { "epoch": 1.0471698113207548, "grad_norm": 12.800736427307129, "learning_rate": 1.6442886196799464e-06, "loss": -1.2455, "step": 208 }, { "epoch": 1.0572327044025158, "grad_norm": 5.464755535125732, "learning_rate": 1.6352455960455384e-06, "loss": 1.8264, "step": 210 }, { "epoch": 1.0672955974842768, "grad_norm": 5.672085762023926, "learning_rate": 1.6261146268599562e-06, "loss": -1.0013, "step": 212 }, { "epoch": 1.0773584905660378, "grad_norm": 4.908372402191162, "learning_rate": 1.6168969762454894e-06, "loss": -1.0382, "step": 214 }, { "epoch": 1.0874213836477988, "grad_norm": 7.087652683258057, "learning_rate": 1.607593920324899e-06, "loss": -0.4295, "step": 216 }, { "epoch": 1.0974842767295598, "grad_norm": 3.5187363624572754, "learning_rate": 1.5982067470447458e-06, "loss": -0.0398, "step": 218 }, { "epoch": 1.1075471698113208, "grad_norm": 2.593596935272217, "learning_rate": 1.5887367559970822e-06, "loss": 0.7915, "step": 220 }, { "epoch": 1.1176100628930818, "grad_norm": 6.099729061126709, "learning_rate": 1.5791852582395332e-06, "loss": -1.0834, "step": 222 }, { "epoch": 1.1276729559748428, "grad_norm": 6.590648174285889, "learning_rate": 1.5695535761137888e-06, "loss": 0.9158, "step": 224 }, { "epoch": 1.1377358490566039, "grad_norm": 5.639819145202637, "learning_rate": 1.5598430430625333e-06, "loss": -1.5288, "step": 226 }, { "epoch": 1.1477987421383649, "grad_norm": 3.02219820022583, "learning_rate": 1.550055003444841e-06, "loss": -0.0297, "step": 228 }, { "epoch": 1.1578616352201259, "grad_norm": 6.338824272155762, "learning_rate": 1.5401908123500586e-06, "loss": -0.7611, "step": 230 }, { "epoch": 1.1679245283018869, "grad_norm": 3.917799949645996, "learning_rate": 1.530251835410199e-06, "loss": 0.4777, "step": 232 }, { "epoch": 1.1779874213836479, "grad_norm": 6.309770584106445, "learning_rate": 1.520239448610882e-06, "loss": 1.729, "step": 234 }, { "epoch": 1.1880503144654089, "grad_norm": 1.9973816871643066, "learning_rate": 1.5101550381008375e-06, "loss": -1.5997, "step": 236 }, { "epoch": 1.1981132075471699, "grad_norm": 6.434890270233154, "learning_rate": 1.5e-06, "loss": -1.5788, "step": 238 }, { "epoch": 1.2081761006289309, "grad_norm": 2.8913328647613525, "learning_rate": 1.4897757402062284e-06, "loss": 0.2666, "step": 240 }, { "epoch": 1.2182389937106919, "grad_norm": 5.833925724029541, "learning_rate": 1.4794836742006664e-06, "loss": 0.969, "step": 242 }, { "epoch": 1.228301886792453, "grad_norm": 3.047639846801758, "learning_rate": 1.4691252268517794e-06, "loss": -0.7864, "step": 244 }, { "epoch": 1.238364779874214, "grad_norm": 11.185049057006836, "learning_rate": 1.4587018322180904e-06, "loss": -1.8447, "step": 246 }, { "epoch": 1.248427672955975, "grad_norm": 3.9488909244537354, "learning_rate": 1.4482149333496455e-06, "loss": 1.3762, "step": 248 }, { "epoch": 1.258490566037736, "grad_norm": 8.695211410522461, "learning_rate": 1.4376659820882306e-06, "loss": 2.1336, "step": 250 }, { "epoch": 1.268553459119497, "grad_norm": 6.01567268371582, "learning_rate": 1.427056438866376e-06, "loss": -0.8317, "step": 252 }, { "epoch": 1.278616352201258, "grad_norm": 4.584295272827148, "learning_rate": 1.4163877725051677e-06, "loss": 0.409, "step": 254 }, { "epoch": 1.288679245283019, "grad_norm": 5.3349480628967285, "learning_rate": 1.4056614600108995e-06, "loss": 0.106, "step": 256 }, { "epoch": 1.29874213836478, "grad_norm": 2.8550000190734863, "learning_rate": 1.3948789863705913e-06, "loss": 0.6895, "step": 258 }, { "epoch": 1.308805031446541, "grad_norm": 6.208876132965088, "learning_rate": 1.3840418443464013e-06, "loss": -0.5366, "step": 260 }, { "epoch": 1.318867924528302, "grad_norm": 4.392048358917236, "learning_rate": 1.3731515342689651e-06, "loss": 0.9175, "step": 262 }, { "epoch": 1.328930817610063, "grad_norm": 5.677616596221924, "learning_rate": 1.3622095638296825e-06, "loss": -0.8256, "step": 264 }, { "epoch": 1.338993710691824, "grad_norm": 3.6334376335144043, "learning_rate": 1.3512174478719892e-06, "loss": -1.949, "step": 266 }, { "epoch": 1.349056603773585, "grad_norm": 4.466569423675537, "learning_rate": 1.3401767081816368e-06, "loss": 1.0635, "step": 268 }, { "epoch": 1.359119496855346, "grad_norm": 6.331056594848633, "learning_rate": 1.32908887327601e-06, "loss": -0.801, "step": 270 }, { "epoch": 1.369182389937107, "grad_norm": 5.03653621673584, "learning_rate": 1.317955478192515e-06, "loss": -0.2086, "step": 272 }, { "epoch": 1.379245283018868, "grad_norm": 2.39367413520813, "learning_rate": 1.3067780642760637e-06, "loss": -1.0548, "step": 274 }, { "epoch": 1.389308176100629, "grad_norm": 6.588123321533203, "learning_rate": 1.295558178965684e-06, "loss": 1.0341, "step": 276 }, { "epoch": 1.39937106918239, "grad_norm": 3.3789021968841553, "learning_rate": 1.284297375580287e-06, "loss": -0.1079, "step": 278 }, { "epoch": 1.409433962264151, "grad_norm": 4.275945663452148, "learning_rate": 1.272997213103621e-06, "loss": 1.3644, "step": 280 }, { "epoch": 1.419496855345912, "grad_norm": 5.876030921936035, "learning_rate": 1.2616592559684408e-06, "loss": -1.5156, "step": 282 }, { "epoch": 1.429559748427673, "grad_norm": 3.4462649822235107, "learning_rate": 1.2502850738399199e-06, "loss": 0.2908, "step": 284 }, { "epoch": 1.439622641509434, "grad_norm": 3.7064943313598633, "learning_rate": 1.2388762413983444e-06, "loss": -1.058, "step": 286 }, { "epoch": 1.449685534591195, "grad_norm": 4.951382637023926, "learning_rate": 1.2274343381211066e-06, "loss": 0.4712, "step": 288 }, { "epoch": 1.459748427672956, "grad_norm": 4.248599052429199, "learning_rate": 1.215960948064036e-06, "loss": 0.1037, "step": 290 }, { "epoch": 1.469811320754717, "grad_norm": 4.509840488433838, "learning_rate": 1.2044576596421002e-06, "loss": 0.6964, "step": 292 }, { "epoch": 1.479874213836478, "grad_norm": 1.8829210996627808, "learning_rate": 1.1929260654094969e-06, "loss": -0.0571, "step": 294 }, { "epoch": 1.489937106918239, "grad_norm": 6.426050662994385, "learning_rate": 1.1813677618391757e-06, "loss": 0.5038, "step": 296 }, { "epoch": 1.5, "grad_norm": 3.1166653633117676, "learning_rate": 1.1697843491018187e-06, "loss": -1.3007, "step": 298 }, { "epoch": 1.510062893081761, "grad_norm": 2.824904680252075, "learning_rate": 1.1581774308443039e-06, "loss": 0.6687, "step": 300 }, { "epoch": 1.520125786163522, "grad_norm": 1.3138232231140137, "learning_rate": 1.1465486139676953e-06, "loss": 0.8043, "step": 302 }, { "epoch": 1.530188679245283, "grad_norm": 3.3225157260894775, "learning_rate": 1.1348995084047749e-06, "loss": 0.5529, "step": 304 }, { "epoch": 1.540251572327044, "grad_norm": 5.321311950683594, "learning_rate": 1.1232317268971584e-06, "loss": 0.1101, "step": 306 }, { "epoch": 1.550314465408805, "grad_norm": 10.030771255493164, "learning_rate": 1.1115468847720245e-06, "loss": -0.9142, "step": 308 }, { "epoch": 1.560377358490566, "grad_norm": 2.3845436573028564, "learning_rate": 1.0998465997184796e-06, "loss": 0.6053, "step": 310 }, { "epoch": 1.570440251572327, "grad_norm": 3.853327512741089, "learning_rate": 1.0881324915636018e-06, "loss": 0.1398, "step": 312 }, { "epoch": 1.580503144654088, "grad_norm": 2.7320926189422607, "learning_rate": 1.076406182048187e-06, "loss": -1.7586, "step": 314 }, { "epoch": 1.590566037735849, "grad_norm": 2.23327374458313, "learning_rate": 1.0646692946022285e-06, "loss": -0.8936, "step": 316 }, { "epoch": 1.60062893081761, "grad_norm": 6.662895679473877, "learning_rate": 1.0529234541201631e-06, "loss": 1.1678, "step": 318 }, { "epoch": 1.610691823899371, "grad_norm": 2.96289324760437, "learning_rate": 1.0411702867359178e-06, "loss": -0.3086, "step": 320 }, { "epoch": 1.620754716981132, "grad_norm": 2.9261276721954346, "learning_rate": 1.0294114195977794e-06, "loss": 0.7558, "step": 322 }, { "epoch": 1.630817610062893, "grad_norm": 3.917189598083496, "learning_rate": 1.0176484806431287e-06, "loss": 0.1406, "step": 324 }, { "epoch": 1.640880503144654, "grad_norm": 8.924764633178711, "learning_rate": 1.0058830983730622e-06, "loss": -3.2015, "step": 326 }, { "epoch": 1.650943396226415, "grad_norm": 3.501892328262329, "learning_rate": 9.94116901626938e-07, "loss": -1.6323, "step": 328 }, { "epoch": 1.661006289308176, "grad_norm": 2.972134828567505, "learning_rate": 9.823515193568714e-07, "loss": -1.4688, "step": 330 }, { "epoch": 1.671069182389937, "grad_norm": 6.309866428375244, "learning_rate": 9.705885804022205e-07, "loss": 0.4812, "step": 332 }, { "epoch": 1.681132075471698, "grad_norm": 4.435581207275391, "learning_rate": 9.588297132640824e-07, "loss": 0.0122, "step": 334 }, { "epoch": 1.691194968553459, "grad_norm": 4.168426513671875, "learning_rate": 9.470765458798368e-07, "loss": -0.787, "step": 336 }, { "epoch": 1.70125786163522, "grad_norm": 3.8862287998199463, "learning_rate": 9.353307053977715e-07, "loss": -0.3479, "step": 338 }, { "epoch": 1.711320754716981, "grad_norm": 4.058013439178467, "learning_rate": 9.23593817951813e-07, "loss": 0.7891, "step": 340 }, { "epoch": 1.721383647798742, "grad_norm": 9.581009864807129, "learning_rate": 9.118675084363985e-07, "loss": -0.5769, "step": 342 }, { "epoch": 1.731446540880503, "grad_norm": 4.200214862823486, "learning_rate": 9.001534002815207e-07, "loss": -1.3016, "step": 344 }, { "epoch": 1.741509433962264, "grad_norm": 2.9621429443359375, "learning_rate": 8.884531152279755e-07, "loss": -1.772, "step": 346 }, { "epoch": 1.751572327044025, "grad_norm": 3.36149001121521, "learning_rate": 8.767682731028414e-07, "loss": -0.7338, "step": 348 }, { "epoch": 1.761635220125786, "grad_norm": 3.888066053390503, "learning_rate": 8.651004915952252e-07, "loss": -0.5376, "step": 350 }, { "epoch": 1.771698113207547, "grad_norm": 2.9135375022888184, "learning_rate": 8.534513860323045e-07, "loss": -0.2755, "step": 352 }, { "epoch": 1.7817610062893081, "grad_norm": 2.2403316497802734, "learning_rate": 8.41822569155696e-07, "loss": -0.5882, "step": 354 }, { "epoch": 1.7918238993710691, "grad_norm": 6.112231731414795, "learning_rate": 8.302156508981815e-07, "loss": 0.1197, "step": 356 }, { "epoch": 1.8018867924528301, "grad_norm": 6.92394495010376, "learning_rate": 8.18632238160824e-07, "loss": 0.122, "step": 358 }, { "epoch": 1.8119496855345911, "grad_norm": 8.573149681091309, "learning_rate": 8.070739345905031e-07, "loss": -1.2034, "step": 360 }, { "epoch": 1.8220125786163521, "grad_norm": 3.436896562576294, "learning_rate": 7.955423403578997e-07, "loss": -0.336, "step": 362 }, { "epoch": 1.8320754716981131, "grad_norm": 3.0969924926757812, "learning_rate": 7.840390519359643e-07, "loss": -0.6976, "step": 364 }, { "epoch": 1.8421383647798741, "grad_norm": 3.821650266647339, "learning_rate": 7.725656618788937e-07, "loss": -1.231, "step": 366 }, { "epoch": 1.8522012578616351, "grad_norm": 3.3464226722717285, "learning_rate": 7.611237586016557e-07, "loss": 0.8503, "step": 368 }, { "epoch": 1.8622641509433961, "grad_norm": 3.881531238555908, "learning_rate": 7.497149261600802e-07, "loss": 0.3178, "step": 370 }, { "epoch": 1.8723270440251572, "grad_norm": 1.9269695281982422, "learning_rate": 7.383407440315595e-07, "loss": -0.2027, "step": 372 }, { "epoch": 1.8823899371069182, "grad_norm": 11.40230941772461, "learning_rate": 7.27002786896379e-07, "loss": -0.1666, "step": 374 }, { "epoch": 1.8924528301886792, "grad_norm": 2.309051752090454, "learning_rate": 7.157026244197131e-07, "loss": -0.0113, "step": 376 }, { "epoch": 1.9025157232704402, "grad_norm": 13.750130653381348, "learning_rate": 7.044418210343159e-07, "loss": -0.5592, "step": 378 }, { "epoch": 1.9125786163522012, "grad_norm": 2.372840166091919, "learning_rate": 6.932219357239361e-07, "loss": -0.173, "step": 380 }, { "epoch": 1.9226415094339622, "grad_norm": 11.330310821533203, "learning_rate": 6.820445218074848e-07, "loss": -1.36, "step": 382 }, { "epoch": 1.9327044025157232, "grad_norm": 7.450850009918213, "learning_rate": 6.7091112672399e-07, "loss": -1.447, "step": 384 }, { "epoch": 1.9427672955974842, "grad_norm": 12.863826751708984, "learning_rate": 6.598232918183631e-07, "loss": 1.0882, "step": 386 }, { "epoch": 1.9528301886792452, "grad_norm": 5.197085380554199, "learning_rate": 6.487825521280108e-07, "loss": -0.2821, "step": 388 }, { "epoch": 1.9628930817610062, "grad_norm": 2.8584909439086914, "learning_rate": 6.377904361703177e-07, "loss": 0.6447, "step": 390 }, { "epoch": 1.9729559748427672, "grad_norm": 9.712791442871094, "learning_rate": 6.26848465731035e-07, "loss": 1.5534, "step": 392 }, { "epoch": 1.9830188679245282, "grad_norm": 8.965962409973145, "learning_rate": 6.159581556535987e-07, "loss": 1.1777, "step": 394 }, { "epoch": 1.9930817610062892, "grad_norm": 2.6333396434783936, "learning_rate": 6.051210136294088e-07, "loss": 0.6377, "step": 396 }, { "epoch": 2.0037735849056606, "grad_norm": 4.632491588592529, "learning_rate": 5.943385399891003e-07, "loss": 0.7307, "step": 398 }, { "epoch": 2.0138364779874216, "grad_norm": 4.375370979309082, "learning_rate": 5.836122274948324e-07, "loss": 1.2132, "step": 400 }, { "epoch": 2.0238993710691826, "grad_norm": 3.335942268371582, "learning_rate": 5.729435611336239e-07, "loss": -0.5918, "step": 402 }, { "epoch": 2.0339622641509436, "grad_norm": 6.7062506675720215, "learning_rate": 5.623340179117694e-07, "loss": -0.9562, "step": 404 }, { "epoch": 2.0440251572327046, "grad_norm": 3.223489761352539, "learning_rate": 5.517850666503546e-07, "loss": 0.6964, "step": 406 }, { "epoch": 2.0540880503144656, "grad_norm": 7.602553367614746, "learning_rate": 5.412981677819093e-07, "loss": -2.6532, "step": 408 }, { "epoch": 2.0641509433962266, "grad_norm": 2.123918056488037, "learning_rate": 5.308747731482206e-07, "loss": -1.1065, "step": 410 }, { "epoch": 2.0742138364779876, "grad_norm": 5.430229187011719, "learning_rate": 5.20516325799334e-07, "loss": -0.7525, "step": 412 }, { "epoch": 2.0842767295597486, "grad_norm": 5.109172344207764, "learning_rate": 5.102242597937717e-07, "loss": -1.5795, "step": 414 }, { "epoch": 2.0943396226415096, "grad_norm": 3.5902011394500732, "learning_rate": 5.000000000000002e-07, "loss": -0.4448, "step": 416 }, { "epoch": 2.1044025157232706, "grad_norm": 3.8342630863189697, "learning_rate": 4.89844961899163e-07, "loss": -1.3424, "step": 418 }, { "epoch": 2.1144654088050316, "grad_norm": 5.093093395233154, "learning_rate": 4.797605513891178e-07, "loss": 0.6365, "step": 420 }, { "epoch": 2.1245283018867926, "grad_norm": 6.690524578094482, "learning_rate": 4.6974816458980116e-07, "loss": 0.0718, "step": 422 }, { "epoch": 2.1345911949685537, "grad_norm": 3.328261375427246, "learning_rate": 4.598091876499417e-07, "loss": -1.2867, "step": 424 }, { "epoch": 2.1446540880503147, "grad_norm": 2.5299105644226074, "learning_rate": 4.499449965551586e-07, "loss": -0.0399, "step": 426 }, { "epoch": 2.1547169811320757, "grad_norm": 7.731986045837402, "learning_rate": 4.401569569374668e-07, "loss": 0.4734, "step": 428 }, { "epoch": 2.1647798742138367, "grad_norm": 6.546573162078857, "learning_rate": 4.3044642388621144e-07, "loss": -0.9198, "step": 430 }, { "epoch": 2.1748427672955977, "grad_norm": 5.20041561126709, "learning_rate": 4.208147417604664e-07, "loss": 0.1999, "step": 432 }, { "epoch": 2.1849056603773587, "grad_norm": 7.04267692565918, "learning_rate": 4.1126324400291756e-07, "loss": -0.0014, "step": 434 }, { "epoch": 2.1949685534591197, "grad_norm": 1.8967030048370361, "learning_rate": 4.0179325295525426e-07, "loss": -0.4547, "step": 436 }, { "epoch": 2.2050314465408807, "grad_norm": 7.423833847045898, "learning_rate": 3.924060796751012e-07, "loss": 1.2133, "step": 438 }, { "epoch": 2.2150943396226417, "grad_norm": 5.08156156539917, "learning_rate": 3.83103023754511e-07, "loss": -0.5562, "step": 440 }, { "epoch": 2.2251572327044027, "grad_norm": 2.8167994022369385, "learning_rate": 3.738853731400439e-07, "loss": 0.1852, "step": 442 }, { "epoch": 2.2352201257861637, "grad_norm": 3.1104578971862793, "learning_rate": 3.6475440395446147e-07, "loss": -0.9611, "step": 444 }, { "epoch": 2.2452830188679247, "grad_norm": 2.3350167274475098, "learning_rate": 3.5571138032005365e-07, "loss": 0.3598, "step": 446 }, { "epoch": 2.2553459119496857, "grad_norm": 3.4781851768493652, "learning_rate": 3.4675755418363053e-07, "loss": 0.1132, "step": 448 }, { "epoch": 2.2654088050314467, "grad_norm": 5.0868706703186035, "learning_rate": 3.378941651431996e-07, "loss": 0.7901, "step": 450 }, { "epoch": 2.2754716981132077, "grad_norm": 4.737022876739502, "learning_rate": 3.291224402763495e-07, "loss": -0.5819, "step": 452 }, { "epoch": 2.2855345911949687, "grad_norm": 3.6209828853607178, "learning_rate": 3.2044359397037046e-07, "loss": -0.2148, "step": 454 }, { "epoch": 2.2955974842767297, "grad_norm": 6.26187801361084, "learning_rate": 3.118588277541312e-07, "loss": -0.7123, "step": 456 }, { "epoch": 2.3056603773584907, "grad_norm": 3.300475597381592, "learning_rate": 3.0336933013173305e-07, "loss": 0.3813, "step": 458 }, { "epoch": 2.3157232704402517, "grad_norm": 4.379162311553955, "learning_rate": 2.9497627641797106e-07, "loss": -0.9063, "step": 460 }, { "epoch": 2.3257861635220127, "grad_norm": 4.494270324707031, "learning_rate": 2.8668082857562004e-07, "loss": 0.7504, "step": 462 }, { "epoch": 2.3358490566037737, "grad_norm": 4.654480457305908, "learning_rate": 2.784841350545656e-07, "loss": -0.4204, "step": 464 }, { "epoch": 2.3459119496855347, "grad_norm": 3.090691089630127, "learning_rate": 2.7038733063281173e-07, "loss": 0.6562, "step": 466 }, { "epoch": 2.3559748427672957, "grad_norm": 3.110882520675659, "learning_rate": 2.623915362593778e-07, "loss": -0.6948, "step": 468 }, { "epoch": 2.3660377358490567, "grad_norm": 8.367574691772461, "learning_rate": 2.5449785889910956e-07, "loss": -1.445, "step": 470 }, { "epoch": 2.3761006289308177, "grad_norm": 1.8932026624679565, "learning_rate": 2.467073913794272e-07, "loss": 0.3359, "step": 472 }, { "epoch": 2.3861635220125788, "grad_norm": 4.765536785125732, "learning_rate": 2.3902121223903226e-07, "loss": -0.9514, "step": 474 }, { "epoch": 2.3962264150943398, "grad_norm": 4.574184894561768, "learning_rate": 2.3144038557858913e-07, "loss": 0.6839, "step": 476 }, { "epoch": 2.4062893081761008, "grad_norm": 6.006104469299316, "learning_rate": 2.2396596091340803e-07, "loss": 0.0796, "step": 478 }, { "epoch": 2.4163522012578618, "grad_norm": 4.098776340484619, "learning_rate": 2.1659897302814744e-07, "loss": -0.9333, "step": 480 }, { "epoch": 2.4264150943396228, "grad_norm": 4.418032646179199, "learning_rate": 2.0934044183355383e-07, "loss": -1.8508, "step": 482 }, { "epoch": 2.4364779874213838, "grad_norm": 11.399324417114258, "learning_rate": 2.0219137222526183e-07, "loss": 1.1837, "step": 484 }, { "epoch": 2.4465408805031448, "grad_norm": 5.924710273742676, "learning_rate": 1.9515275394467446e-07, "loss": -0.0577, "step": 486 }, { "epoch": 2.456603773584906, "grad_norm": 7.316831111907959, "learning_rate": 1.8822556144193756e-07, "loss": 0.1237, "step": 488 }, { "epoch": 2.466666666666667, "grad_norm": 6.5416765213012695, "learning_rate": 1.8141075374103632e-07, "loss": -1.9742, "step": 490 }, { "epoch": 2.476729559748428, "grad_norm": 5.302765369415283, "learning_rate": 1.7470927430702276e-07, "loss": 1.6366, "step": 492 }, { "epoch": 2.486792452830189, "grad_norm": 6.104937553405762, "learning_rate": 1.6812205091539978e-07, "loss": -0.9508, "step": 494 }, { "epoch": 2.49685534591195, "grad_norm": 3.6209168434143066, "learning_rate": 1.6164999552367765e-07, "loss": -0.6157, "step": 496 }, { "epoch": 2.506918238993711, "grad_norm": 11.832756996154785, "learning_rate": 1.5529400414511805e-07, "loss": -1.168, "step": 498 }, { "epoch": 2.516981132075472, "grad_norm": 9.809549331665039, "learning_rate": 1.4905495672468783e-07, "loss": 0.3619, "step": 500 }, { "epoch": 2.527044025157233, "grad_norm": 5.026820182800293, "learning_rate": 1.42933717017237e-07, "loss": -0.3516, "step": 502 }, { "epoch": 2.537106918238994, "grad_norm": 4.968526363372803, "learning_rate": 1.3693113246791588e-07, "loss": -0.383, "step": 504 }, { "epoch": 2.547169811320755, "grad_norm": 5.452160835266113, "learning_rate": 1.3104803409485354e-07, "loss": -0.3609, "step": 506 }, { "epoch": 2.557232704402516, "grad_norm": 6.929769992828369, "learning_rate": 1.2528523637410836e-07, "loss": -0.109, "step": 508 }, { "epoch": 2.567295597484277, "grad_norm": 5.186896800994873, "learning_rate": 1.1964353712690888e-07, "loss": 0.3748, "step": 510 }, { "epoch": 2.577358490566038, "grad_norm": 2.7618138790130615, "learning_rate": 1.1412371740920035e-07, "loss": 0.6345, "step": 512 }, { "epoch": 2.587421383647799, "grad_norm": 9.840655326843262, "learning_rate": 1.0872654140351457e-07, "loss": -0.4424, "step": 514 }, { "epoch": 2.59748427672956, "grad_norm": 5.229491233825684, "learning_rate": 1.0345275631317163e-07, "loss": 0.1269, "step": 516 }, { "epoch": 2.607547169811321, "grad_norm": 3.292207956314087, "learning_rate": 9.830309225883559e-08, "loss": -0.8045, "step": 518 }, { "epoch": 2.617610062893082, "grad_norm": 2.8611297607421875, "learning_rate": 9.327826217743451e-08, "loss": 0.6012, "step": 520 }, { "epoch": 2.627672955974843, "grad_norm": 6.323940277099609, "learning_rate": 8.837896172345827e-08, "loss": -0.5895, "step": 522 }, { "epoch": 2.637735849056604, "grad_norm": 7.645895957946777, "learning_rate": 8.360586917264977e-08, "loss": 0.5182, "step": 524 }, { "epoch": 2.647798742138365, "grad_norm": 6.323966979980469, "learning_rate": 7.895964532810317e-08, "loss": -0.3837, "step": 526 }, { "epoch": 2.657861635220126, "grad_norm": 7.799415588378906, "learning_rate": 7.444093342877899e-08, "loss": -0.7239, "step": 528 }, { "epoch": 2.667924528301887, "grad_norm": 6.719019889831543, "learning_rate": 7.005035906045197e-08, "loss": 0.2248, "step": 530 }, { "epoch": 2.677987421383648, "grad_norm": 5.086057186126709, "learning_rate": 6.578853006910402e-08, "loss": 0.5775, "step": 532 }, { "epoch": 2.688050314465409, "grad_norm": 3.6781728267669678, "learning_rate": 6.165603647677054e-08, "loss": 0.0562, "step": 534 }, { "epoch": 2.69811320754717, "grad_norm": 9.493392944335938, "learning_rate": 5.765345039985647e-08, "loss": 0.205, "step": 536 }, { "epoch": 2.708176100628931, "grad_norm": 4.998286247253418, "learning_rate": 5.378132596993046e-08, "loss": 0.9461, "step": 538 }, { "epoch": 2.718238993710692, "grad_norm": 4.373546600341797, "learning_rate": 5.0040199257009196e-08, "loss": -0.7566, "step": 540 }, { "epoch": 2.728301886792453, "grad_norm": 8.538968086242676, "learning_rate": 4.6430588195341847e-08, "loss": 0.9457, "step": 542 }, { "epoch": 2.738364779874214, "grad_norm": 8.773660659790039, "learning_rate": 4.295299251170537e-08, "loss": -0.2537, "step": 544 }, { "epoch": 2.748427672955975, "grad_norm": 5.2722978591918945, "learning_rate": 3.9607893656220745e-08, "loss": 0.8571, "step": 546 }, { "epoch": 2.758490566037736, "grad_norm": 7.540788650512695, "learning_rate": 3.639575473569989e-08, "loss": -2.1415, "step": 548 }, { "epoch": 2.768553459119497, "grad_norm": 3.7448925971984863, "learning_rate": 3.331702044953066e-08, "loss": -1.1784, "step": 550 }, { "epoch": 2.778616352201258, "grad_norm": 3.103691577911377, "learning_rate": 3.037211702811182e-08, "loss": -0.3766, "step": 552 }, { "epoch": 2.788679245283019, "grad_norm": 4.002925872802734, "learning_rate": 2.75614521738442e-08, "loss": -1.5215, "step": 554 }, { "epoch": 2.79874213836478, "grad_norm": 6.615825176239014, "learning_rate": 2.488541500468666e-08, "loss": 0.4594, "step": 556 }, { "epoch": 2.808805031446541, "grad_norm": 4.420342922210693, "learning_rate": 2.2344376000285604e-08, "loss": 0.0622, "step": 558 }, { "epoch": 2.818867924528302, "grad_norm": 5.796300888061523, "learning_rate": 1.9938686950684567e-08, "loss": -0.9306, "step": 560 }, { "epoch": 2.828930817610063, "grad_norm": 4.024370193481445, "learning_rate": 1.766868090762075e-08, "loss": -0.4119, "step": 562 }, { "epoch": 2.838993710691824, "grad_norm": 9.87598705291748, "learning_rate": 1.553467213841664e-08, "loss": -0.1066, "step": 564 }, { "epoch": 2.849056603773585, "grad_norm": 6.048956871032715, "learning_rate": 1.3536956082472073e-08, "loss": -0.7316, "step": 566 }, { "epoch": 2.859119496855346, "grad_norm": 5.084702968597412, "learning_rate": 1.1675809310361495e-08, "loss": -1.3274, "step": 568 }, { "epoch": 2.869182389937107, "grad_norm": 4.490642070770264, "learning_rate": 9.951489485545694e-09, "loss": 0.1211, "step": 570 }, { "epoch": 2.879245283018868, "grad_norm": 9.895052909851074, "learning_rate": 8.364235328699564e-09, "loss": 1.1259, "step": 572 }, { "epoch": 2.889308176100629, "grad_norm": 4.905172348022461, "learning_rate": 6.914266584662987e-09, "loss": -0.1241, "step": 574 }, { "epoch": 2.89937106918239, "grad_norm": 3.0340776443481445, "learning_rate": 5.60178399201805e-09, "loss": -0.6671, "step": 576 }, { "epoch": 2.909433962264151, "grad_norm": 3.124040126800537, "learning_rate": 4.42696925529884e-09, "loss": -1.8007, "step": 578 }, { "epoch": 2.919496855345912, "grad_norm": 5.664621353149414, "learning_rate": 3.3899850198353397e-09, "loss": 0.1159, "step": 580 }, { "epoch": 2.929559748427673, "grad_norm": 4.976583957672119, "learning_rate": 2.4909748492362158e-09, "loss": -1.2106, "step": 582 }, { "epoch": 2.939622641509434, "grad_norm": 5.037308216094971, "learning_rate": 1.730063205513277e-09, "loss": 0.8336, "step": 584 }, { "epoch": 2.949685534591195, "grad_norm": 4.580456733703613, "learning_rate": 1.1073554318509203e-09, "loss": 0.378, "step": 586 }, { "epoch": 2.959748427672956, "grad_norm": 4.7945990562438965, "learning_rate": 6.229377380218003e-10, "loss": -0.0708, "step": 588 }, { "epoch": 2.969811320754717, "grad_norm": 14.472588539123535, "learning_rate": 2.7687718845148535e-10, "loss": -0.0673, "step": 590 }, { "epoch": 2.979874213836478, "grad_norm": 9.063091278076172, "learning_rate": 6.92216929342182e-11, "loss": -0.5115, "step": 592 }, { "epoch": 2.989937106918239, "grad_norm": 5.872649192810059, "learning_rate": 0.0, "loss": 0.3585, "step": 594 }, { "epoch": 2.989937106918239, "step": 594, "total_flos": 5.151263974762742e+17, "train_loss": -0.14183720302852718, "train_runtime": 1424.6739, "train_samples_per_second": 13.386, "train_steps_per_second": 0.417 } ], "logging_steps": 2, "max_steps": 594, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.151263974762742e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }