{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.989937106918239,
  "eval_steps": 50000,
  "global_step": 594,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010062893081761006,
      "grad_norm": 1.6687748432159424,
      "learning_rate": 6.666666666666667e-08,
      "loss": 0.369,
      "step": 2
    },
    {
      "epoch": 0.02012578616352201,
      "grad_norm": 1.5292283296585083,
      "learning_rate": 1.3333333333333334e-07,
      "loss": 0.3732,
      "step": 4
    },
    {
      "epoch": 0.03018867924528302,
      "grad_norm": 1.5824713706970215,
      "learning_rate": 2e-07,
      "loss": -0.1619,
      "step": 6
    },
    {
      "epoch": 0.04025157232704402,
      "grad_norm": 4.105996608734131,
      "learning_rate": 2.6666666666666667e-07,
      "loss": 0.2398,
      "step": 8
    },
    {
      "epoch": 0.050314465408805034,
      "grad_norm": 1.4400302171707153,
      "learning_rate": 3.333333333333333e-07,
      "loss": -0.5621,
      "step": 10
    },
    {
      "epoch": 0.06037735849056604,
      "grad_norm": 2.3048486709594727,
      "learning_rate": 4e-07,
      "loss": -0.6602,
      "step": 12
    },
    {
      "epoch": 0.07044025157232704,
      "grad_norm": 2.4866607189178467,
      "learning_rate": 4.6666666666666666e-07,
      "loss": -1.24,
      "step": 14
    },
    {
      "epoch": 0.08050314465408805,
      "grad_norm": 3.4124677181243896,
      "learning_rate": 5.333333333333333e-07,
      "loss": 0.462,
      "step": 16
    },
    {
      "epoch": 0.09056603773584905,
      "grad_norm": 1.5936415195465088,
      "learning_rate": 6e-07,
      "loss": -0.0692,
      "step": 18
    },
    {
      "epoch": 0.10062893081761007,
      "grad_norm": 1.9987062215805054,
      "learning_rate": 6.666666666666666e-07,
      "loss": 0.5051,
      "step": 20
    },
    {
      "epoch": 0.11069182389937107,
      "grad_norm": 2.565603017807007,
      "learning_rate": 7.333333333333332e-07,
      "loss": -0.0248,
      "step": 22
    },
    {
      "epoch": 0.12075471698113208,
      "grad_norm": 3.2282676696777344,
      "learning_rate": 8e-07,
      "loss": -0.6335,
      "step": 24
    },
    {
      "epoch": 0.13081761006289308,
      "grad_norm": 1.868457555770874,
      "learning_rate": 8.666666666666667e-07,
      "loss": -0.8462,
      "step": 26
    },
    {
      "epoch": 0.14088050314465408,
      "grad_norm": 2.7205371856689453,
      "learning_rate": 9.333333333333333e-07,
      "loss": 2.6132,
      "step": 28
    },
    {
      "epoch": 0.1509433962264151,
      "grad_norm": 3.2904088497161865,
      "learning_rate": 1e-06,
      "loss": 0.4139,
      "step": 30
    },
    {
      "epoch": 0.1610062893081761,
      "grad_norm": 1.7929654121398926,
      "learning_rate": 1.0666666666666667e-06,
      "loss": 1.9297,
      "step": 32
    },
    {
      "epoch": 0.1710691823899371,
      "grad_norm": 2.788813591003418,
      "learning_rate": 1.1333333333333332e-06,
      "loss": -1.4279,
      "step": 34
    },
    {
      "epoch": 0.1811320754716981,
      "grad_norm": 1.792971134185791,
      "learning_rate": 1.2e-06,
      "loss": 0.1433,
      "step": 36
    },
    {
      "epoch": 0.19119496855345913,
      "grad_norm": 2.238489866256714,
      "learning_rate": 1.2666666666666665e-06,
      "loss": 0.3927,
      "step": 38
    },
    {
      "epoch": 0.20125786163522014,
      "grad_norm": 2.905518054962158,
      "learning_rate": 1.3333333333333332e-06,
      "loss": 1.079,
      "step": 40
    },
    {
      "epoch": 0.21132075471698114,
      "grad_norm": 1.6354607343673706,
      "learning_rate": 1.4e-06,
      "loss": 0.1258,
      "step": 42
    },
    {
      "epoch": 0.22138364779874214,
      "grad_norm": 2.0974748134613037,
      "learning_rate": 1.4666666666666665e-06,
      "loss": 0.0546,
      "step": 44
    },
    {
      "epoch": 0.23144654088050315,
      "grad_norm": 1.619780421257019,
      "learning_rate": 1.5333333333333334e-06,
      "loss": -1.0396,
      "step": 46
    },
    {
      "epoch": 0.24150943396226415,
      "grad_norm": 1.9667820930480957,
      "learning_rate": 1.6e-06,
      "loss": -0.4011,
      "step": 48
    },
    {
      "epoch": 0.25157232704402516,
      "grad_norm": 1.9112639427185059,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.8607,
      "step": 50
    },
    {
      "epoch": 0.26163522012578616,
      "grad_norm": 2.6148829460144043,
      "learning_rate": 1.7333333333333334e-06,
      "loss": 0.6988,
      "step": 52
    },
    {
      "epoch": 0.27169811320754716,
      "grad_norm": 2.6693756580352783,
      "learning_rate": 1.8e-06,
      "loss": -1.0175,
      "step": 54
    },
    {
      "epoch": 0.28176100628930817,
      "grad_norm": 2.0184097290039062,
      "learning_rate": 1.8666666666666667e-06,
      "loss": -0.1263,
      "step": 56
    },
    {
      "epoch": 0.2918238993710692,
      "grad_norm": 1.4805622100830078,
      "learning_rate": 1.933333333333333e-06,
      "loss": -0.4554,
      "step": 58
    },
    {
      "epoch": 0.3018867924528302,
      "grad_norm": 1.6097267866134644,
      "learning_rate": 2e-06,
      "loss": 0.5408,
      "step": 60
    },
    {
      "epoch": 0.3119496855345912,
      "grad_norm": 1.720683217048645,
      "learning_rate": 1.9999307783070657e-06,
      "loss": 1.3892,
      "step": 62
    },
    {
      "epoch": 0.3220125786163522,
      "grad_norm": 2.825670003890991,
      "learning_rate": 1.999723122811548e-06,
      "loss": 0.9162,
      "step": 64
    },
    {
      "epoch": 0.3320754716981132,
      "grad_norm": 2.550844430923462,
      "learning_rate": 1.9993770622619783e-06,
      "loss": -0.1783,
      "step": 66
    },
    {
      "epoch": 0.3421383647798742,
      "grad_norm": 2.4842543601989746,
      "learning_rate": 1.998892644568149e-06,
      "loss": -1.0679,
      "step": 68
    },
    {
      "epoch": 0.3522012578616352,
      "grad_norm": 1.9450500011444092,
      "learning_rate": 1.9982699367944866e-06,
      "loss": 1.4075,
      "step": 70
    },
    {
      "epoch": 0.3622641509433962,
      "grad_norm": 2.419877052307129,
      "learning_rate": 1.9975090251507638e-06,
      "loss": -0.5993,
      "step": 72
    },
    {
      "epoch": 0.3723270440251572,
      "grad_norm": 1.7247552871704102,
      "learning_rate": 1.9966100149801647e-06,
      "loss": 1.2249,
      "step": 74
    },
    {
      "epoch": 0.38238993710691827,
      "grad_norm": 2.8694651126861572,
      "learning_rate": 1.995573030744701e-06,
      "loss": 0.279,
      "step": 76
    },
    {
      "epoch": 0.39245283018867927,
      "grad_norm": 3.444533586502075,
      "learning_rate": 1.994398216007982e-06,
      "loss": 2.4944,
      "step": 78
    },
    {
      "epoch": 0.4025157232704403,
      "grad_norm": 1.145507574081421,
      "learning_rate": 1.993085733415337e-06,
      "loss": -0.0775,
      "step": 80
    },
    {
      "epoch": 0.4125786163522013,
      "grad_norm": 2.018376111984253,
      "learning_rate": 1.9916357646713006e-06,
      "loss": -0.1244,
      "step": 82
    },
    {
      "epoch": 0.4226415094339623,
      "grad_norm": 3.317014694213867,
      "learning_rate": 1.9900485105144544e-06,
      "loss": -0.5761,
      "step": 84
    },
    {
      "epoch": 0.4327044025157233,
      "grad_norm": 1.426088809967041,
      "learning_rate": 1.9883241906896385e-06,
      "loss": 1.364,
      "step": 86
    },
    {
      "epoch": 0.4427672955974843,
      "grad_norm": 2.031130790710449,
      "learning_rate": 1.986463043917528e-06,
      "loss": 0.9214,
      "step": 88
    },
    {
      "epoch": 0.4528301886792453,
      "grad_norm": 2.133758068084717,
      "learning_rate": 1.984465327861583e-06,
      "loss": -1.4531,
      "step": 90
    },
    {
      "epoch": 0.4628930817610063,
      "grad_norm": 2.5162205696105957,
      "learning_rate": 1.9823313190923794e-06,
      "loss": -0.7078,
      "step": 92
    },
    {
      "epoch": 0.4729559748427673,
      "grad_norm": 1.5902796983718872,
      "learning_rate": 1.980061313049315e-06,
      "loss": -1.3553,
      "step": 94
    },
    {
      "epoch": 0.4830188679245283,
      "grad_norm": 2.366024971008301,
      "learning_rate": 1.9776556239997142e-06,
      "loss": 0.4744,
      "step": 96
    },
    {
      "epoch": 0.4930817610062893,
      "grad_norm": 2.211918354034424,
      "learning_rate": 1.975114584995313e-06,
      "loss": 0.532,
      "step": 98
    },
    {
      "epoch": 0.5031446540880503,
      "grad_norm": 1.664931058883667,
      "learning_rate": 1.972438547826156e-06,
      "loss": -0.5974,
      "step": 100
    },
    {
      "epoch": 0.5132075471698113,
      "grad_norm": 2.5771172046661377,
      "learning_rate": 1.969627882971888e-06,
      "loss": -0.4213,
      "step": 102
    },
    {
      "epoch": 0.5232704402515723,
      "grad_norm": 3.083601236343384,
      "learning_rate": 1.9666829795504693e-06,
      "loss": -1.491,
      "step": 104
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 3.069186210632324,
      "learning_rate": 1.9636042452643e-06,
      "loss": -0.6719,
      "step": 106
    },
    {
      "epoch": 0.5433962264150943,
      "grad_norm": 1.642295479774475,
      "learning_rate": 1.960392106343779e-06,
      "loss": -0.8876,
      "step": 108
    },
    {
      "epoch": 0.5534591194968553,
      "grad_norm": 2.7487986087799072,
      "learning_rate": 1.9570470074882946e-06,
      "loss": -0.8838,
      "step": 110
    },
    {
      "epoch": 0.5635220125786163,
      "grad_norm": 4.342981338500977,
      "learning_rate": 1.9535694118046583e-06,
      "loss": 0.6486,
      "step": 112
    },
    {
      "epoch": 0.5735849056603773,
      "grad_norm": 2.6165924072265625,
      "learning_rate": 1.949959800742991e-06,
      "loss": 0.901,
      "step": 114
    },
    {
      "epoch": 0.5836477987421383,
      "grad_norm": 3.7529544830322266,
      "learning_rate": 1.9462186740300695e-06,
      "loss": -1.5828,
      "step": 116
    },
    {
      "epoch": 0.5937106918238994,
      "grad_norm": 0.95662921667099,
      "learning_rate": 1.942346549600144e-06,
      "loss": -1.2115,
      "step": 118
    },
    {
      "epoch": 0.6037735849056604,
      "grad_norm": 3.2608375549316406,
      "learning_rate": 1.9383439635232293e-06,
      "loss": 1.1846,
      "step": 120
    },
    {
      "epoch": 0.6138364779874214,
      "grad_norm": 2.937685966491699,
      "learning_rate": 1.9342114699308956e-06,
      "loss": 0.5849,
      "step": 122
    },
    {
      "epoch": 0.6238993710691824,
      "grad_norm": 3.030308485031128,
      "learning_rate": 1.929949640939548e-06,
      "loss": 1.0768,
      "step": 124
    },
    {
      "epoch": 0.6339622641509434,
      "grad_norm": 1.6450515985488892,
      "learning_rate": 1.925559066571221e-06,
      "loss": -0.815,
      "step": 126
    },
    {
      "epoch": 0.6440251572327044,
      "grad_norm": 4.359044075012207,
      "learning_rate": 1.9210403546718966e-06,
      "loss": 1.4768,
      "step": 128
    },
    {
      "epoch": 0.6540880503144654,
      "grad_norm": 2.591158628463745,
      "learning_rate": 1.91639413082735e-06,
      "loss": 0.4688,
      "step": 130
    },
    {
      "epoch": 0.6641509433962264,
      "grad_norm": 3.594324827194214,
      "learning_rate": 1.9116210382765418e-06,
      "loss": -0.4207,
      "step": 132
    },
    {
      "epoch": 0.6742138364779874,
      "grad_norm": 4.136204242706299,
      "learning_rate": 1.9067217378225652e-06,
      "loss": -1.2546,
      "step": 134
    },
    {
      "epoch": 0.6842767295597484,
      "grad_norm": 3.1914331912994385,
      "learning_rate": 1.9016969077411645e-06,
      "loss": -1.6023,
      "step": 136
    },
    {
      "epoch": 0.6943396226415094,
      "grad_norm": 2.6611359119415283,
      "learning_rate": 1.8965472436868284e-06,
      "loss": 0.0919,
      "step": 138
    },
    {
      "epoch": 0.7044025157232704,
      "grad_norm": 3.068580150604248,
      "learning_rate": 1.8912734585964855e-06,
      "loss": 0.3057,
      "step": 140
    },
    {
      "epoch": 0.7144654088050314,
      "grad_norm": 7.307640552520752,
      "learning_rate": 1.8858762825907997e-06,
      "loss": 1.6571,
      "step": 142
    },
    {
      "epoch": 0.7245283018867924,
      "grad_norm": 2.129241943359375,
      "learning_rate": 1.8803564628730913e-06,
      "loss": 0.5422,
      "step": 144
    },
    {
      "epoch": 0.7345911949685534,
      "grad_norm": 3.773325204849243,
      "learning_rate": 1.8747147636258916e-06,
      "loss": 0.7144,
      "step": 146
    },
    {
      "epoch": 0.7446540880503144,
      "grad_norm": 1.3420393466949463,
      "learning_rate": 1.8689519659051466e-06,
      "loss": -1.1075,
      "step": 148
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 6.70538854598999,
      "learning_rate": 1.8630688675320841e-06,
      "loss": -1.9595,
      "step": 150
    },
    {
      "epoch": 0.7647798742138365,
      "grad_norm": 4.187305927276611,
      "learning_rate": 1.857066282982763e-06,
      "loss": -0.5234,
      "step": 152
    },
    {
      "epoch": 0.7748427672955975,
      "grad_norm": 2.975940465927124,
      "learning_rate": 1.850945043275312e-06,
      "loss": -0.3984,
      "step": 154
    },
    {
      "epoch": 0.7849056603773585,
      "grad_norm": 2.44286847114563,
      "learning_rate": 1.844705995854882e-06,
      "loss": 1.109,
      "step": 156
    },
    {
      "epoch": 0.7949685534591195,
      "grad_norm": 12.523564338684082,
      "learning_rate": 1.8383500044763226e-06,
      "loss": -2.0379,
      "step": 158
    },
    {
      "epoch": 0.8050314465408805,
      "grad_norm": 4.5152716636657715,
      "learning_rate": 1.8318779490846e-06,
      "loss": -0.6498,
      "step": 160
    },
    {
      "epoch": 0.8150943396226416,
      "grad_norm": 2.565892457962036,
      "learning_rate": 1.8252907256929774e-06,
      "loss": 0.039,
      "step": 162
    },
    {
      "epoch": 0.8251572327044026,
      "grad_norm": 3.789813756942749,
      "learning_rate": 1.8185892462589636e-06,
      "loss": -0.0521,
      "step": 164
    },
    {
      "epoch": 0.8352201257861636,
      "grad_norm": 4.709334373474121,
      "learning_rate": 1.8117744385580623e-06,
      "loss": -0.7899,
      "step": 166
    },
    {
      "epoch": 0.8452830188679246,
      "grad_norm": 2.444716453552246,
      "learning_rate": 1.8048472460553256e-06,
      "loss": 0.2275,
      "step": 168
    },
    {
      "epoch": 0.8553459119496856,
      "grad_norm": 2.314274549484253,
      "learning_rate": 1.7978086277747379e-06,
      "loss": -0.9168,
      "step": 170
    },
    {
      "epoch": 0.8654088050314466,
      "grad_norm": 3.4260716438293457,
      "learning_rate": 1.7906595581664461e-06,
      "loss": -0.6274,
      "step": 172
    },
    {
      "epoch": 0.8754716981132076,
      "grad_norm": 2.7144453525543213,
      "learning_rate": 1.7834010269718524e-06,
      "loss": -0.9649,
      "step": 174
    },
    {
      "epoch": 0.8855345911949686,
      "grad_norm": 3.8050897121429443,
      "learning_rate": 1.7760340390865917e-06,
      "loss": -0.262,
      "step": 176
    },
    {
      "epoch": 0.8955974842767296,
      "grad_norm": 2.8164639472961426,
      "learning_rate": 1.7685596144214107e-06,
      "loss": -1.1909,
      "step": 178
    },
    {
      "epoch": 0.9056603773584906,
      "grad_norm": 4.633458614349365,
      "learning_rate": 1.7609787877609676e-06,
      "loss": 0.4428,
      "step": 180
    },
    {
      "epoch": 0.9157232704402516,
      "grad_norm": 2.8389792442321777,
      "learning_rate": 1.7532926086205726e-06,
      "loss": -0.5821,
      "step": 182
    },
    {
      "epoch": 0.9257861635220126,
      "grad_norm": 2.226238965988159,
      "learning_rate": 1.7455021411008906e-06,
      "loss": 0.3515,
      "step": 184
    },
    {
      "epoch": 0.9358490566037736,
      "grad_norm": 2.591329336166382,
      "learning_rate": 1.737608463740622e-06,
      "loss": -0.306,
      "step": 186
    },
    {
      "epoch": 0.9459119496855346,
      "grad_norm": 3.7576334476470947,
      "learning_rate": 1.7296126693671882e-06,
      "loss": 0.1704,
      "step": 188
    },
    {
      "epoch": 0.9559748427672956,
      "grad_norm": 2.887920618057251,
      "learning_rate": 1.7215158649454346e-06,
      "loss": -0.2494,
      "step": 190
    },
    {
      "epoch": 0.9660377358490566,
      "grad_norm": 4.349538326263428,
      "learning_rate": 1.7133191714243802e-06,
      "loss": 2.3405,
      "step": 192
    },
    {
      "epoch": 0.9761006289308176,
      "grad_norm": 4.317368984222412,
      "learning_rate": 1.7050237235820287e-06,
      "loss": 0.4566,
      "step": 194
    },
    {
      "epoch": 0.9861635220125786,
      "grad_norm": 5.087897300720215,
      "learning_rate": 1.696630669868267e-06,
      "loss": 0.1502,
      "step": 196
    },
    {
      "epoch": 0.9962264150943396,
      "grad_norm": 4.70991325378418,
      "learning_rate": 1.6881411722458687e-06,
      "loss": -0.3574,
      "step": 198
    },
    {
      "epoch": 1.0069182389937108,
      "grad_norm": 2.966017007827759,
      "learning_rate": 1.6795564060296292e-06,
      "loss": 0.9311,
      "step": 200
    },
    {
      "epoch": 1.0169811320754718,
      "grad_norm": 2.300924777984619,
      "learning_rate": 1.6708775597236505e-06,
      "loss": 0.2717,
      "step": 202
    },
    {
      "epoch": 1.0270440251572328,
      "grad_norm": 6.384905815124512,
      "learning_rate": 1.6621058348568004e-06,
      "loss": -0.0504,
      "step": 204
    },
    {
      "epoch": 1.0371069182389938,
      "grad_norm": 4.002950668334961,
      "learning_rate": 1.6532424458163691e-06,
      "loss": -0.2334,
      "step": 206
    },
    {
      "epoch": 1.0471698113207548,
      "grad_norm": 12.800736427307129,
      "learning_rate": 1.6442886196799464e-06,
      "loss": -1.2455,
      "step": 208
    },
    {
      "epoch": 1.0572327044025158,
      "grad_norm": 5.464755535125732,
      "learning_rate": 1.6352455960455384e-06,
      "loss": 1.8264,
      "step": 210
    },
    {
      "epoch": 1.0672955974842768,
      "grad_norm": 5.672085762023926,
      "learning_rate": 1.6261146268599562e-06,
      "loss": -1.0013,
      "step": 212
    },
    {
      "epoch": 1.0773584905660378,
      "grad_norm": 4.908372402191162,
      "learning_rate": 1.6168969762454894e-06,
      "loss": -1.0382,
      "step": 214
    },
    {
      "epoch": 1.0874213836477988,
      "grad_norm": 7.087652683258057,
      "learning_rate": 1.607593920324899e-06,
      "loss": -0.4295,
      "step": 216
    },
    {
      "epoch": 1.0974842767295598,
      "grad_norm": 3.5187363624572754,
      "learning_rate": 1.5982067470447458e-06,
      "loss": -0.0398,
      "step": 218
    },
    {
      "epoch": 1.1075471698113208,
      "grad_norm": 2.593596935272217,
      "learning_rate": 1.5887367559970822e-06,
      "loss": 0.7915,
      "step": 220
    },
    {
      "epoch": 1.1176100628930818,
      "grad_norm": 6.099729061126709,
      "learning_rate": 1.5791852582395332e-06,
      "loss": -1.0834,
      "step": 222
    },
    {
      "epoch": 1.1276729559748428,
      "grad_norm": 6.590648174285889,
      "learning_rate": 1.5695535761137888e-06,
      "loss": 0.9158,
      "step": 224
    },
    {
      "epoch": 1.1377358490566039,
      "grad_norm": 5.639819145202637,
      "learning_rate": 1.5598430430625333e-06,
      "loss": -1.5288,
      "step": 226
    },
    {
      "epoch": 1.1477987421383649,
      "grad_norm": 3.02219820022583,
      "learning_rate": 1.550055003444841e-06,
      "loss": -0.0297,
      "step": 228
    },
    {
      "epoch": 1.1578616352201259,
      "grad_norm": 6.338824272155762,
      "learning_rate": 1.5401908123500586e-06,
      "loss": -0.7611,
      "step": 230
    },
    {
      "epoch": 1.1679245283018869,
      "grad_norm": 3.917799949645996,
      "learning_rate": 1.530251835410199e-06,
      "loss": 0.4777,
      "step": 232
    },
    {
      "epoch": 1.1779874213836479,
      "grad_norm": 6.309770584106445,
      "learning_rate": 1.520239448610882e-06,
      "loss": 1.729,
      "step": 234
    },
    {
      "epoch": 1.1880503144654089,
      "grad_norm": 1.9973816871643066,
      "learning_rate": 1.5101550381008375e-06,
      "loss": -1.5997,
      "step": 236
    },
    {
      "epoch": 1.1981132075471699,
      "grad_norm": 6.434890270233154,
      "learning_rate": 1.5e-06,
      "loss": -1.5788,
      "step": 238
    },
    {
      "epoch": 1.2081761006289309,
      "grad_norm": 2.8913328647613525,
      "learning_rate": 1.4897757402062284e-06,
      "loss": 0.2666,
      "step": 240
    },
    {
      "epoch": 1.2182389937106919,
      "grad_norm": 5.833925724029541,
      "learning_rate": 1.4794836742006664e-06,
      "loss": 0.969,
      "step": 242
    },
    {
      "epoch": 1.228301886792453,
      "grad_norm": 3.047639846801758,
      "learning_rate": 1.4691252268517794e-06,
      "loss": -0.7864,
      "step": 244
    },
    {
      "epoch": 1.238364779874214,
      "grad_norm": 11.185049057006836,
      "learning_rate": 1.4587018322180904e-06,
      "loss": -1.8447,
      "step": 246
    },
    {
      "epoch": 1.248427672955975,
      "grad_norm": 3.9488909244537354,
      "learning_rate": 1.4482149333496455e-06,
      "loss": 1.3762,
      "step": 248
    },
    {
      "epoch": 1.258490566037736,
      "grad_norm": 8.695211410522461,
      "learning_rate": 1.4376659820882306e-06,
      "loss": 2.1336,
      "step": 250
    },
    {
      "epoch": 1.268553459119497,
      "grad_norm": 6.01567268371582,
      "learning_rate": 1.427056438866376e-06,
      "loss": -0.8317,
      "step": 252
    },
    {
      "epoch": 1.278616352201258,
      "grad_norm": 4.584295272827148,
      "learning_rate": 1.4163877725051677e-06,
      "loss": 0.409,
      "step": 254
    },
    {
      "epoch": 1.288679245283019,
      "grad_norm": 5.3349480628967285,
      "learning_rate": 1.4056614600108995e-06,
      "loss": 0.106,
      "step": 256
    },
    {
      "epoch": 1.29874213836478,
      "grad_norm": 2.8550000190734863,
      "learning_rate": 1.3948789863705913e-06,
      "loss": 0.6895,
      "step": 258
    },
    {
      "epoch": 1.308805031446541,
      "grad_norm": 6.208876132965088,
      "learning_rate": 1.3840418443464013e-06,
      "loss": -0.5366,
      "step": 260
    },
    {
      "epoch": 1.318867924528302,
      "grad_norm": 4.392048358917236,
      "learning_rate": 1.3731515342689651e-06,
      "loss": 0.9175,
      "step": 262
    },
    {
      "epoch": 1.328930817610063,
      "grad_norm": 5.677616596221924,
      "learning_rate": 1.3622095638296825e-06,
      "loss": -0.8256,
      "step": 264
    },
    {
      "epoch": 1.338993710691824,
      "grad_norm": 3.6334376335144043,
      "learning_rate": 1.3512174478719892e-06,
      "loss": -1.949,
      "step": 266
    },
    {
      "epoch": 1.349056603773585,
      "grad_norm": 4.466569423675537,
      "learning_rate": 1.3401767081816368e-06,
      "loss": 1.0635,
      "step": 268
    },
    {
      "epoch": 1.359119496855346,
      "grad_norm": 6.331056594848633,
      "learning_rate": 1.32908887327601e-06,
      "loss": -0.801,
      "step": 270
    },
    {
      "epoch": 1.369182389937107,
      "grad_norm": 5.03653621673584,
      "learning_rate": 1.317955478192515e-06,
      "loss": -0.2086,
      "step": 272
    },
    {
      "epoch": 1.379245283018868,
      "grad_norm": 2.39367413520813,
      "learning_rate": 1.3067780642760637e-06,
      "loss": -1.0548,
      "step": 274
    },
    {
      "epoch": 1.389308176100629,
      "grad_norm": 6.588123321533203,
      "learning_rate": 1.295558178965684e-06,
      "loss": 1.0341,
      "step": 276
    },
    {
      "epoch": 1.39937106918239,
      "grad_norm": 3.3789021968841553,
      "learning_rate": 1.284297375580287e-06,
      "loss": -0.1079,
      "step": 278
    },
    {
      "epoch": 1.409433962264151,
      "grad_norm": 4.275945663452148,
      "learning_rate": 1.272997213103621e-06,
      "loss": 1.3644,
      "step": 280
    },
    {
      "epoch": 1.419496855345912,
      "grad_norm": 5.876030921936035,
      "learning_rate": 1.2616592559684408e-06,
      "loss": -1.5156,
      "step": 282
    },
    {
      "epoch": 1.429559748427673,
      "grad_norm": 3.4462649822235107,
      "learning_rate": 1.2502850738399199e-06,
      "loss": 0.2908,
      "step": 284
    },
    {
      "epoch": 1.439622641509434,
      "grad_norm": 3.7064943313598633,
      "learning_rate": 1.2388762413983444e-06,
      "loss": -1.058,
      "step": 286
    },
    {
      "epoch": 1.449685534591195,
      "grad_norm": 4.951382637023926,
      "learning_rate": 1.2274343381211066e-06,
      "loss": 0.4712,
      "step": 288
    },
    {
      "epoch": 1.459748427672956,
      "grad_norm": 4.248599052429199,
      "learning_rate": 1.215960948064036e-06,
      "loss": 0.1037,
      "step": 290
    },
    {
      "epoch": 1.469811320754717,
      "grad_norm": 4.509840488433838,
      "learning_rate": 1.2044576596421002e-06,
      "loss": 0.6964,
      "step": 292
    },
    {
      "epoch": 1.479874213836478,
      "grad_norm": 1.8829210996627808,
      "learning_rate": 1.1929260654094969e-06,
      "loss": -0.0571,
      "step": 294
    },
    {
      "epoch": 1.489937106918239,
      "grad_norm": 6.426050662994385,
      "learning_rate": 1.1813677618391757e-06,
      "loss": 0.5038,
      "step": 296
    },
    {
      "epoch": 1.5,
      "grad_norm": 3.1166653633117676,
      "learning_rate": 1.1697843491018187e-06,
      "loss": -1.3007,
      "step": 298
    },
    {
      "epoch": 1.510062893081761,
      "grad_norm": 2.824904680252075,
      "learning_rate": 1.1581774308443039e-06,
      "loss": 0.6687,
      "step": 300
    },
    {
      "epoch": 1.520125786163522,
      "grad_norm": 1.3138232231140137,
      "learning_rate": 1.1465486139676953e-06,
      "loss": 0.8043,
      "step": 302
    },
    {
      "epoch": 1.530188679245283,
      "grad_norm": 3.3225157260894775,
      "learning_rate": 1.1348995084047749e-06,
      "loss": 0.5529,
      "step": 304
    },
    {
      "epoch": 1.540251572327044,
      "grad_norm": 5.321311950683594,
      "learning_rate": 1.1232317268971584e-06,
      "loss": 0.1101,
      "step": 306
    },
    {
      "epoch": 1.550314465408805,
      "grad_norm": 10.030771255493164,
      "learning_rate": 1.1115468847720245e-06,
      "loss": -0.9142,
      "step": 308
    },
    {
      "epoch": 1.560377358490566,
      "grad_norm": 2.3845436573028564,
      "learning_rate": 1.0998465997184796e-06,
      "loss": 0.6053,
      "step": 310
    },
    {
      "epoch": 1.570440251572327,
      "grad_norm": 3.853327512741089,
      "learning_rate": 1.0881324915636018e-06,
      "loss": 0.1398,
      "step": 312
    },
    {
      "epoch": 1.580503144654088,
      "grad_norm": 2.7320926189422607,
      "learning_rate": 1.076406182048187e-06,
      "loss": -1.7586,
      "step": 314
    },
    {
      "epoch": 1.590566037735849,
      "grad_norm": 2.23327374458313,
      "learning_rate": 1.0646692946022285e-06,
      "loss": -0.8936,
      "step": 316
    },
    {
      "epoch": 1.60062893081761,
      "grad_norm": 6.662895679473877,
      "learning_rate": 1.0529234541201631e-06,
      "loss": 1.1678,
      "step": 318
    },
    {
      "epoch": 1.610691823899371,
      "grad_norm": 2.96289324760437,
      "learning_rate": 1.0411702867359178e-06,
      "loss": -0.3086,
      "step": 320
    },
    {
      "epoch": 1.620754716981132,
      "grad_norm": 2.9261276721954346,
      "learning_rate": 1.0294114195977794e-06,
      "loss": 0.7558,
      "step": 322
    },
    {
      "epoch": 1.630817610062893,
      "grad_norm": 3.917189598083496,
      "learning_rate": 1.0176484806431287e-06,
      "loss": 0.1406,
      "step": 324
    },
    {
      "epoch": 1.640880503144654,
      "grad_norm": 8.924764633178711,
      "learning_rate": 1.0058830983730622e-06,
      "loss": -3.2015,
      "step": 326
    },
    {
      "epoch": 1.650943396226415,
      "grad_norm": 3.501892328262329,
      "learning_rate": 9.94116901626938e-07,
      "loss": -1.6323,
      "step": 328
    },
    {
      "epoch": 1.661006289308176,
      "grad_norm": 2.972134828567505,
      "learning_rate": 9.823515193568714e-07,
      "loss": -1.4688,
      "step": 330
    },
    {
      "epoch": 1.671069182389937,
      "grad_norm": 6.309866428375244,
      "learning_rate": 9.705885804022205e-07,
      "loss": 0.4812,
      "step": 332
    },
    {
      "epoch": 1.681132075471698,
      "grad_norm": 4.435581207275391,
      "learning_rate": 9.588297132640824e-07,
      "loss": 0.0122,
      "step": 334
    },
    {
      "epoch": 1.691194968553459,
      "grad_norm": 4.168426513671875,
      "learning_rate": 9.470765458798368e-07,
      "loss": -0.787,
      "step": 336
    },
    {
      "epoch": 1.70125786163522,
      "grad_norm": 3.8862287998199463,
      "learning_rate": 9.353307053977715e-07,
      "loss": -0.3479,
      "step": 338
    },
    {
      "epoch": 1.711320754716981,
      "grad_norm": 4.058013439178467,
      "learning_rate": 9.23593817951813e-07,
      "loss": 0.7891,
      "step": 340
    },
    {
      "epoch": 1.721383647798742,
      "grad_norm": 9.581009864807129,
      "learning_rate": 9.118675084363985e-07,
      "loss": -0.5769,
      "step": 342
    },
    {
      "epoch": 1.731446540880503,
      "grad_norm": 4.200214862823486,
      "learning_rate": 9.001534002815207e-07,
      "loss": -1.3016,
      "step": 344
    },
    {
      "epoch": 1.741509433962264,
      "grad_norm": 2.9621429443359375,
      "learning_rate": 8.884531152279755e-07,
      "loss": -1.772,
      "step": 346
    },
    {
      "epoch": 1.751572327044025,
      "grad_norm": 3.36149001121521,
      "learning_rate": 8.767682731028414e-07,
      "loss": -0.7338,
      "step": 348
    },
    {
      "epoch": 1.761635220125786,
      "grad_norm": 3.888066053390503,
      "learning_rate": 8.651004915952252e-07,
      "loss": -0.5376,
      "step": 350
    },
    {
      "epoch": 1.771698113207547,
      "grad_norm": 2.9135375022888184,
      "learning_rate": 8.534513860323045e-07,
      "loss": -0.2755,
      "step": 352
    },
    {
      "epoch": 1.7817610062893081,
      "grad_norm": 2.2403316497802734,
      "learning_rate": 8.41822569155696e-07,
      "loss": -0.5882,
      "step": 354
    },
    {
      "epoch": 1.7918238993710691,
      "grad_norm": 6.112231731414795,
      "learning_rate": 8.302156508981815e-07,
      "loss": 0.1197,
      "step": 356
    },
    {
      "epoch": 1.8018867924528301,
      "grad_norm": 6.92394495010376,
      "learning_rate": 8.18632238160824e-07,
      "loss": 0.122,
      "step": 358
    },
    {
      "epoch": 1.8119496855345911,
      "grad_norm": 8.573149681091309,
      "learning_rate": 8.070739345905031e-07,
      "loss": -1.2034,
      "step": 360
    },
    {
      "epoch": 1.8220125786163521,
      "grad_norm": 3.436896562576294,
      "learning_rate": 7.955423403578997e-07,
      "loss": -0.336,
      "step": 362
    },
    {
      "epoch": 1.8320754716981131,
      "grad_norm": 3.0969924926757812,
      "learning_rate": 7.840390519359643e-07,
      "loss": -0.6976,
      "step": 364
    },
    {
      "epoch": 1.8421383647798741,
      "grad_norm": 3.821650266647339,
      "learning_rate": 7.725656618788937e-07,
      "loss": -1.231,
      "step": 366
    },
    {
      "epoch": 1.8522012578616351,
      "grad_norm": 3.3464226722717285,
      "learning_rate": 7.611237586016557e-07,
      "loss": 0.8503,
      "step": 368
    },
    {
      "epoch": 1.8622641509433961,
      "grad_norm": 3.881531238555908,
      "learning_rate": 7.497149261600802e-07,
      "loss": 0.3178,
      "step": 370
    },
    {
      "epoch": 1.8723270440251572,
      "grad_norm": 1.9269695281982422,
      "learning_rate": 7.383407440315595e-07,
      "loss": -0.2027,
      "step": 372
    },
    {
      "epoch": 1.8823899371069182,
      "grad_norm": 11.40230941772461,
      "learning_rate": 7.27002786896379e-07,
      "loss": -0.1666,
      "step": 374
    },
    {
      "epoch": 1.8924528301886792,
      "grad_norm": 2.309051752090454,
      "learning_rate": 7.157026244197131e-07,
      "loss": -0.0113,
      "step": 376
    },
    {
      "epoch": 1.9025157232704402,
      "grad_norm": 13.750130653381348,
      "learning_rate": 7.044418210343159e-07,
      "loss": -0.5592,
      "step": 378
    },
    {
      "epoch": 1.9125786163522012,
      "grad_norm": 2.372840166091919,
      "learning_rate": 6.932219357239361e-07,
      "loss": -0.173,
      "step": 380
    },
    {
      "epoch": 1.9226415094339622,
      "grad_norm": 11.330310821533203,
      "learning_rate": 6.820445218074848e-07,
      "loss": -1.36,
      "step": 382
    },
    {
      "epoch": 1.9327044025157232,
      "grad_norm": 7.450850009918213,
      "learning_rate": 6.7091112672399e-07,
      "loss": -1.447,
      "step": 384
    },
    {
      "epoch": 1.9427672955974842,
      "grad_norm": 12.863826751708984,
      "learning_rate": 6.598232918183631e-07,
      "loss": 1.0882,
      "step": 386
    },
    {
      "epoch": 1.9528301886792452,
      "grad_norm": 5.197085380554199,
      "learning_rate": 6.487825521280108e-07,
      "loss": -0.2821,
      "step": 388
    },
    {
      "epoch": 1.9628930817610062,
      "grad_norm": 2.8584909439086914,
      "learning_rate": 6.377904361703177e-07,
      "loss": 0.6447,
      "step": 390
    },
    {
      "epoch": 1.9729559748427672,
      "grad_norm": 9.712791442871094,
      "learning_rate": 6.26848465731035e-07,
      "loss": 1.5534,
      "step": 392
    },
    {
      "epoch": 1.9830188679245282,
      "grad_norm": 8.965962409973145,
      "learning_rate": 6.159581556535987e-07,
      "loss": 1.1777,
      "step": 394
    },
    {
      "epoch": 1.9930817610062892,
      "grad_norm": 2.6333396434783936,
      "learning_rate": 6.051210136294088e-07,
      "loss": 0.6377,
      "step": 396
    },
    {
      "epoch": 2.0037735849056606,
      "grad_norm": 4.632491588592529,
      "learning_rate": 5.943385399891003e-07,
      "loss": 0.7307,
      "step": 398
    },
    {
      "epoch": 2.0138364779874216,
      "grad_norm": 4.375370979309082,
      "learning_rate": 5.836122274948324e-07,
      "loss": 1.2132,
      "step": 400
    },
    {
      "epoch": 2.0238993710691826,
      "grad_norm": 3.335942268371582,
      "learning_rate": 5.729435611336239e-07,
      "loss": -0.5918,
      "step": 402
    },
    {
      "epoch": 2.0339622641509436,
      "grad_norm": 6.7062506675720215,
      "learning_rate": 5.623340179117694e-07,
      "loss": -0.9562,
      "step": 404
    },
    {
      "epoch": 2.0440251572327046,
      "grad_norm": 3.223489761352539,
      "learning_rate": 5.517850666503546e-07,
      "loss": 0.6964,
      "step": 406
    },
    {
      "epoch": 2.0540880503144656,
      "grad_norm": 7.602553367614746,
      "learning_rate": 5.412981677819093e-07,
      "loss": -2.6532,
      "step": 408
    },
    {
      "epoch": 2.0641509433962266,
      "grad_norm": 2.123918056488037,
      "learning_rate": 5.308747731482206e-07,
      "loss": -1.1065,
      "step": 410
    },
    {
      "epoch": 2.0742138364779876,
      "grad_norm": 5.430229187011719,
      "learning_rate": 5.20516325799334e-07,
      "loss": -0.7525,
      "step": 412
    },
    {
      "epoch": 2.0842767295597486,
      "grad_norm": 5.109172344207764,
      "learning_rate": 5.102242597937717e-07,
      "loss": -1.5795,
      "step": 414
    },
    {
      "epoch": 2.0943396226415096,
      "grad_norm": 3.5902011394500732,
      "learning_rate": 5.000000000000002e-07,
      "loss": -0.4448,
      "step": 416
    },
    {
      "epoch": 2.1044025157232706,
      "grad_norm": 3.8342630863189697,
      "learning_rate": 4.89844961899163e-07,
      "loss": -1.3424,
      "step": 418
    },
    {
      "epoch": 2.1144654088050316,
      "grad_norm": 5.093093395233154,
      "learning_rate": 4.797605513891178e-07,
      "loss": 0.6365,
      "step": 420
    },
    {
      "epoch": 2.1245283018867926,
      "grad_norm": 6.690524578094482,
      "learning_rate": 4.6974816458980116e-07,
      "loss": 0.0718,
      "step": 422
    },
    {
      "epoch": 2.1345911949685537,
      "grad_norm": 3.328261375427246,
      "learning_rate": 4.598091876499417e-07,
      "loss": -1.2867,
      "step": 424
    },
    {
      "epoch": 2.1446540880503147,
      "grad_norm": 2.5299105644226074,
      "learning_rate": 4.499449965551586e-07,
      "loss": -0.0399,
      "step": 426
    },
    {
      "epoch": 2.1547169811320757,
      "grad_norm": 7.731986045837402,
      "learning_rate": 4.401569569374668e-07,
      "loss": 0.4734,
      "step": 428
    },
    {
      "epoch": 2.1647798742138367,
      "grad_norm": 6.546573162078857,
      "learning_rate": 4.3044642388621144e-07,
      "loss": -0.9198,
      "step": 430
    },
    {
      "epoch": 2.1748427672955977,
      "grad_norm": 5.20041561126709,
      "learning_rate": 4.208147417604664e-07,
      "loss": 0.1999,
      "step": 432
    },
    {
      "epoch": 2.1849056603773587,
      "grad_norm": 7.04267692565918,
      "learning_rate": 4.1126324400291756e-07,
      "loss": -0.0014,
      "step": 434
    },
    {
      "epoch": 2.1949685534591197,
      "grad_norm": 1.8967030048370361,
      "learning_rate": 4.0179325295525426e-07,
      "loss": -0.4547,
      "step": 436
    },
    {
      "epoch": 2.2050314465408807,
      "grad_norm": 7.423833847045898,
      "learning_rate": 3.924060796751012e-07,
      "loss": 1.2133,
      "step": 438
    },
    {
      "epoch": 2.2150943396226417,
      "grad_norm": 5.08156156539917,
      "learning_rate": 3.83103023754511e-07,
      "loss": -0.5562,
      "step": 440
    },
    {
      "epoch": 2.2251572327044027,
      "grad_norm": 2.8167994022369385,
      "learning_rate": 3.738853731400439e-07,
      "loss": 0.1852,
      "step": 442
    },
    {
      "epoch": 2.2352201257861637,
      "grad_norm": 3.1104578971862793,
      "learning_rate": 3.6475440395446147e-07,
      "loss": -0.9611,
      "step": 444
    },
    {
      "epoch": 2.2452830188679247,
      "grad_norm": 2.3350167274475098,
      "learning_rate": 3.5571138032005365e-07,
      "loss": 0.3598,
      "step": 446
    },
    {
      "epoch": 2.2553459119496857,
      "grad_norm": 3.4781851768493652,
      "learning_rate": 3.4675755418363053e-07,
      "loss": 0.1132,
      "step": 448
    },
    {
      "epoch": 2.2654088050314467,
      "grad_norm": 5.0868706703186035,
      "learning_rate": 3.378941651431996e-07,
      "loss": 0.7901,
      "step": 450
    },
    {
      "epoch": 2.2754716981132077,
      "grad_norm": 4.737022876739502,
      "learning_rate": 3.291224402763495e-07,
      "loss": -0.5819,
      "step": 452
    },
    {
      "epoch": 2.2855345911949687,
      "grad_norm": 3.6209828853607178,
      "learning_rate": 3.2044359397037046e-07,
      "loss": -0.2148,
      "step": 454
    },
    {
      "epoch": 2.2955974842767297,
      "grad_norm": 6.26187801361084,
      "learning_rate": 3.118588277541312e-07,
      "loss": -0.7123,
      "step": 456
    },
    {
      "epoch": 2.3056603773584907,
      "grad_norm": 3.300475597381592,
      "learning_rate": 3.0336933013173305e-07,
      "loss": 0.3813,
      "step": 458
    },
    {
      "epoch": 2.3157232704402517,
      "grad_norm": 4.379162311553955,
      "learning_rate": 2.9497627641797106e-07,
      "loss": -0.9063,
      "step": 460
    },
    {
      "epoch": 2.3257861635220127,
      "grad_norm": 4.494270324707031,
      "learning_rate": 2.8668082857562004e-07,
      "loss": 0.7504,
      "step": 462
    },
    {
      "epoch": 2.3358490566037737,
      "grad_norm": 4.654480457305908,
      "learning_rate": 2.784841350545656e-07,
      "loss": -0.4204,
      "step": 464
    },
    {
      "epoch": 2.3459119496855347,
      "grad_norm": 3.090691089630127,
      "learning_rate": 2.7038733063281173e-07,
      "loss": 0.6562,
      "step": 466
    },
    {
      "epoch": 2.3559748427672957,
      "grad_norm": 3.110882520675659,
      "learning_rate": 2.623915362593778e-07,
      "loss": -0.6948,
      "step": 468
    },
    {
      "epoch": 2.3660377358490567,
      "grad_norm": 8.367574691772461,
      "learning_rate": 2.5449785889910956e-07,
      "loss": -1.445,
      "step": 470
    },
    {
      "epoch": 2.3761006289308177,
      "grad_norm": 1.8932026624679565,
      "learning_rate": 2.467073913794272e-07,
      "loss": 0.3359,
      "step": 472
    },
    {
      "epoch": 2.3861635220125788,
      "grad_norm": 4.765536785125732,
      "learning_rate": 2.3902121223903226e-07,
      "loss": -0.9514,
      "step": 474
    },
    {
      "epoch": 2.3962264150943398,
      "grad_norm": 4.574184894561768,
      "learning_rate": 2.3144038557858913e-07,
      "loss": 0.6839,
      "step": 476
    },
    {
      "epoch": 2.4062893081761008,
      "grad_norm": 6.006104469299316,
      "learning_rate": 2.2396596091340803e-07,
      "loss": 0.0796,
      "step": 478
    },
    {
      "epoch": 2.4163522012578618,
      "grad_norm": 4.098776340484619,
      "learning_rate": 2.1659897302814744e-07,
      "loss": -0.9333,
      "step": 480
    },
    {
      "epoch": 2.4264150943396228,
      "grad_norm": 4.418032646179199,
      "learning_rate": 2.0934044183355383e-07,
      "loss": -1.8508,
      "step": 482
    },
    {
      "epoch": 2.4364779874213838,
      "grad_norm": 11.399324417114258,
      "learning_rate": 2.0219137222526183e-07,
      "loss": 1.1837,
      "step": 484
    },
    {
      "epoch": 2.4465408805031448,
      "grad_norm": 5.924710273742676,
      "learning_rate": 1.9515275394467446e-07,
      "loss": -0.0577,
      "step": 486
    },
    {
      "epoch": 2.456603773584906,
      "grad_norm": 7.316831111907959,
      "learning_rate": 1.8822556144193756e-07,
      "loss": 0.1237,
      "step": 488
    },
    {
      "epoch": 2.466666666666667,
      "grad_norm": 6.5416765213012695,
      "learning_rate": 1.8141075374103632e-07,
      "loss": -1.9742,
      "step": 490
    },
    {
      "epoch": 2.476729559748428,
      "grad_norm": 5.302765369415283,
      "learning_rate": 1.7470927430702276e-07,
      "loss": 1.6366,
      "step": 492
    },
    {
      "epoch": 2.486792452830189,
      "grad_norm": 6.104937553405762,
      "learning_rate": 1.6812205091539978e-07,
      "loss": -0.9508,
      "step": 494
    },
    {
      "epoch": 2.49685534591195,
      "grad_norm": 3.6209168434143066,
      "learning_rate": 1.6164999552367765e-07,
      "loss": -0.6157,
      "step": 496
    },
    {
      "epoch": 2.506918238993711,
      "grad_norm": 11.832756996154785,
      "learning_rate": 1.5529400414511805e-07,
      "loss": -1.168,
      "step": 498
    },
    {
      "epoch": 2.516981132075472,
      "grad_norm": 9.809549331665039,
      "learning_rate": 1.4905495672468783e-07,
      "loss": 0.3619,
      "step": 500
    },
    {
      "epoch": 2.527044025157233,
      "grad_norm": 5.026820182800293,
      "learning_rate": 1.42933717017237e-07,
      "loss": -0.3516,
      "step": 502
    },
    {
      "epoch": 2.537106918238994,
      "grad_norm": 4.968526363372803,
      "learning_rate": 1.3693113246791588e-07,
      "loss": -0.383,
      "step": 504
    },
    {
      "epoch": 2.547169811320755,
      "grad_norm": 5.452160835266113,
      "learning_rate": 1.3104803409485354e-07,
      "loss": -0.3609,
      "step": 506
    },
    {
      "epoch": 2.557232704402516,
      "grad_norm": 6.929769992828369,
      "learning_rate": 1.2528523637410836e-07,
      "loss": -0.109,
      "step": 508
    },
    {
      "epoch": 2.567295597484277,
      "grad_norm": 5.186896800994873,
      "learning_rate": 1.1964353712690888e-07,
      "loss": 0.3748,
      "step": 510
    },
    {
      "epoch": 2.577358490566038,
      "grad_norm": 2.7618138790130615,
      "learning_rate": 1.1412371740920035e-07,
      "loss": 0.6345,
      "step": 512
    },
    {
      "epoch": 2.587421383647799,
      "grad_norm": 9.840655326843262,
      "learning_rate": 1.0872654140351457e-07,
      "loss": -0.4424,
      "step": 514
    },
    {
      "epoch": 2.59748427672956,
      "grad_norm": 5.229491233825684,
      "learning_rate": 1.0345275631317163e-07,
      "loss": 0.1269,
      "step": 516
    },
    {
      "epoch": 2.607547169811321,
      "grad_norm": 3.292207956314087,
      "learning_rate": 9.830309225883559e-08,
      "loss": -0.8045,
      "step": 518
    },
    {
      "epoch": 2.617610062893082,
      "grad_norm": 2.8611297607421875,
      "learning_rate": 9.327826217743451e-08,
      "loss": 0.6012,
      "step": 520
    },
    {
      "epoch": 2.627672955974843,
      "grad_norm": 6.323940277099609,
      "learning_rate": 8.837896172345827e-08,
      "loss": -0.5895,
      "step": 522
    },
    {
      "epoch": 2.637735849056604,
      "grad_norm": 7.645895957946777,
      "learning_rate": 8.360586917264977e-08,
      "loss": 0.5182,
      "step": 524
    },
    {
      "epoch": 2.647798742138365,
      "grad_norm": 6.323966979980469,
      "learning_rate": 7.895964532810317e-08,
      "loss": -0.3837,
      "step": 526
    },
    {
      "epoch": 2.657861635220126,
      "grad_norm": 7.799415588378906,
      "learning_rate": 7.444093342877899e-08,
      "loss": -0.7239,
      "step": 528
    },
    {
      "epoch": 2.667924528301887,
      "grad_norm": 6.719019889831543,
      "learning_rate": 7.005035906045197e-08,
      "loss": 0.2248,
      "step": 530
    },
    {
      "epoch": 2.677987421383648,
      "grad_norm": 5.086057186126709,
      "learning_rate": 6.578853006910402e-08,
      "loss": 0.5775,
      "step": 532
    },
    {
      "epoch": 2.688050314465409,
      "grad_norm": 3.6781728267669678,
      "learning_rate": 6.165603647677054e-08,
      "loss": 0.0562,
      "step": 534
    },
    {
      "epoch": 2.69811320754717,
      "grad_norm": 9.493392944335938,
      "learning_rate": 5.765345039985647e-08,
      "loss": 0.205,
      "step": 536
    },
    {
      "epoch": 2.708176100628931,
      "grad_norm": 4.998286247253418,
      "learning_rate": 5.378132596993046e-08,
      "loss": 0.9461,
      "step": 538
    },
    {
      "epoch": 2.718238993710692,
      "grad_norm": 4.373546600341797,
      "learning_rate": 5.0040199257009196e-08,
      "loss": -0.7566,
      "step": 540
    },
    {
      "epoch": 2.728301886792453,
      "grad_norm": 8.538968086242676,
      "learning_rate": 4.6430588195341847e-08,
      "loss": 0.9457,
      "step": 542
    },
    {
      "epoch": 2.738364779874214,
      "grad_norm": 8.773660659790039,
      "learning_rate": 4.295299251170537e-08,
      "loss": -0.2537,
      "step": 544
    },
    {
      "epoch": 2.748427672955975,
      "grad_norm": 5.2722978591918945,
      "learning_rate": 3.9607893656220745e-08,
      "loss": 0.8571,
      "step": 546
    },
    {
      "epoch": 2.758490566037736,
      "grad_norm": 7.540788650512695,
      "learning_rate": 3.639575473569989e-08,
      "loss": -2.1415,
      "step": 548
    },
    {
      "epoch": 2.768553459119497,
      "grad_norm": 3.7448925971984863,
      "learning_rate": 3.331702044953066e-08,
      "loss": -1.1784,
      "step": 550
    },
    {
      "epoch": 2.778616352201258,
      "grad_norm": 3.103691577911377,
      "learning_rate": 3.037211702811182e-08,
      "loss": -0.3766,
      "step": 552
    },
    {
      "epoch": 2.788679245283019,
      "grad_norm": 4.002925872802734,
      "learning_rate": 2.75614521738442e-08,
      "loss": -1.5215,
      "step": 554
    },
    {
      "epoch": 2.79874213836478,
      "grad_norm": 6.615825176239014,
      "learning_rate": 2.488541500468666e-08,
      "loss": 0.4594,
      "step": 556
    },
    {
      "epoch": 2.808805031446541,
      "grad_norm": 4.420342922210693,
      "learning_rate": 2.2344376000285604e-08,
      "loss": 0.0622,
      "step": 558
    },
    {
      "epoch": 2.818867924528302,
      "grad_norm": 5.796300888061523,
      "learning_rate": 1.9938686950684567e-08,
      "loss": -0.9306,
      "step": 560
    },
    {
      "epoch": 2.828930817610063,
      "grad_norm": 4.024370193481445,
      "learning_rate": 1.766868090762075e-08,
      "loss": -0.4119,
      "step": 562
    },
    {
      "epoch": 2.838993710691824,
      "grad_norm": 9.87598705291748,
      "learning_rate": 1.553467213841664e-08,
      "loss": -0.1066,
      "step": 564
    },
    {
      "epoch": 2.849056603773585,
      "grad_norm": 6.048956871032715,
      "learning_rate": 1.3536956082472073e-08,
      "loss": -0.7316,
      "step": 566
    },
    {
      "epoch": 2.859119496855346,
      "grad_norm": 5.084702968597412,
      "learning_rate": 1.1675809310361495e-08,
      "loss": -1.3274,
      "step": 568
    },
    {
      "epoch": 2.869182389937107,
      "grad_norm": 4.490642070770264,
      "learning_rate": 9.951489485545694e-09,
      "loss": 0.1211,
      "step": 570
    },
    {
      "epoch": 2.879245283018868,
      "grad_norm": 9.895052909851074,
      "learning_rate": 8.364235328699564e-09,
      "loss": 1.1259,
      "step": 572
    },
    {
      "epoch": 2.889308176100629,
      "grad_norm": 4.905172348022461,
      "learning_rate": 6.914266584662987e-09,
      "loss": -0.1241,
      "step": 574
    },
    {
      "epoch": 2.89937106918239,
      "grad_norm": 3.0340776443481445,
      "learning_rate": 5.60178399201805e-09,
      "loss": -0.6671,
      "step": 576
    },
    {
      "epoch": 2.909433962264151,
      "grad_norm": 3.124040126800537,
      "learning_rate": 4.42696925529884e-09,
      "loss": -1.8007,
      "step": 578
    },
    {
      "epoch": 2.919496855345912,
      "grad_norm": 5.664621353149414,
      "learning_rate": 3.3899850198353397e-09,
      "loss": 0.1159,
      "step": 580
    },
    {
      "epoch": 2.929559748427673,
      "grad_norm": 4.976583957672119,
      "learning_rate": 2.4909748492362158e-09,
      "loss": -1.2106,
      "step": 582
    },
    {
      "epoch": 2.939622641509434,
      "grad_norm": 5.037308216094971,
      "learning_rate": 1.730063205513277e-09,
      "loss": 0.8336,
      "step": 584
    },
    {
      "epoch": 2.949685534591195,
      "grad_norm": 4.580456733703613,
      "learning_rate": 1.1073554318509203e-09,
      "loss": 0.378,
      "step": 586
    },
    {
      "epoch": 2.959748427672956,
      "grad_norm": 4.7945990562438965,
      "learning_rate": 6.229377380218003e-10,
      "loss": -0.0708,
      "step": 588
    },
    {
      "epoch": 2.969811320754717,
      "grad_norm": 14.472588539123535,
      "learning_rate": 2.7687718845148535e-10,
      "loss": -0.0673,
      "step": 590
    },
    {
      "epoch": 2.979874213836478,
      "grad_norm": 9.063091278076172,
      "learning_rate": 6.92216929342182e-11,
      "loss": -0.5115,
      "step": 592
    },
    {
      "epoch": 2.989937106918239,
      "grad_norm": 5.872649192810059,
      "learning_rate": 0.0,
      "loss": 0.3585,
      "step": 594
    },
    {
      "epoch": 2.989937106918239,
      "step": 594,
      "total_flos": 5.151263974762742e+17,
      "train_loss": -0.14183720302852718,
      "train_runtime": 1424.6739,
      "train_samples_per_second": 13.386,
      "train_steps_per_second": 0.417
    }
  ],
  "logging_steps": 2,
  "max_steps": 594,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.151263974762742e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}