| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 200, | |
| "global_step": 170, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011764705882352941, | |
| "grad_norm": 0.035323630468616266, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.1079, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.023529411764705882, | |
| "grad_norm": 0.03136221824048882, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.1141, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03529411764705882, | |
| "grad_norm": 0.03804619743095702, | |
| "learning_rate": 5e-05, | |
| "loss": 0.1204, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.047058823529411764, | |
| "grad_norm": 0.04070206833240694, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 0.1276, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.058823529411764705, | |
| "grad_norm": 0.07589226019496102, | |
| "learning_rate": 8.333333333333334e-05, | |
| "loss": 0.1689, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.07058823529411765, | |
| "grad_norm": 0.07319090807548947, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1345, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.08235294117647059, | |
| "grad_norm": 0.06996995505114396, | |
| "learning_rate": 9.999082642158973e-05, | |
| "loss": 0.1202, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.09411764705882353, | |
| "grad_norm": 0.05248182376644038, | |
| "learning_rate": 9.99633090525405e-05, | |
| "loss": 0.0983, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.10588235294117647, | |
| "grad_norm": 0.06219777780481942, | |
| "learning_rate": 9.991745799016206e-05, | |
| "loss": 0.0954, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.11764705882352941, | |
| "grad_norm": 0.06595531802798588, | |
| "learning_rate": 9.985329005918702e-05, | |
| "loss": 0.0977, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12941176470588237, | |
| "grad_norm": 0.03606112500648923, | |
| "learning_rate": 9.977082880559725e-05, | |
| "loss": 0.0626, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.1411764705882353, | |
| "grad_norm": 0.0722550944651642, | |
| "learning_rate": 9.967010448798375e-05, | |
| "loss": 0.0929, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.15294117647058825, | |
| "grad_norm": 0.054771186088142604, | |
| "learning_rate": 9.955115406644356e-05, | |
| "loss": 0.0845, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.16470588235294117, | |
| "grad_norm": 0.051628745573165394, | |
| "learning_rate": 9.941402118901744e-05, | |
| "loss": 0.0659, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.17647058823529413, | |
| "grad_norm": 0.0486190850226073, | |
| "learning_rate": 9.92587561756735e-05, | |
| "loss": 0.0531, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.18823529411764706, | |
| "grad_norm": 0.05588842841049585, | |
| "learning_rate": 9.908541599984276e-05, | |
| "loss": 0.0642, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.05641046123379798, | |
| "learning_rate": 9.889406426751296e-05, | |
| "loss": 0.0588, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.21176470588235294, | |
| "grad_norm": 0.054058284278693136, | |
| "learning_rate": 9.868477119388896e-05, | |
| "loss": 0.0649, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.2235294117647059, | |
| "grad_norm": 0.0531804463754218, | |
| "learning_rate": 9.84576135776276e-05, | |
| "loss": 0.0677, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 0.058590111826598894, | |
| "learning_rate": 9.821267477265705e-05, | |
| "loss": 0.0606, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.24705882352941178, | |
| "grad_norm": 0.04785785363882365, | |
| "learning_rate": 9.795004465759065e-05, | |
| "loss": 0.0609, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.25882352941176473, | |
| "grad_norm": 0.05206751702161926, | |
| "learning_rate": 9.766981960274653e-05, | |
| "loss": 0.0475, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.27058823529411763, | |
| "grad_norm": 0.037054145575494114, | |
| "learning_rate": 9.737210243478521e-05, | |
| "loss": 0.0415, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2823529411764706, | |
| "grad_norm": 0.04044144505681365, | |
| "learning_rate": 9.705700239897809e-05, | |
| "loss": 0.0474, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.29411764705882354, | |
| "grad_norm": 0.030749676244442067, | |
| "learning_rate": 9.672463511912055e-05, | |
| "loss": 0.0422, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.3058823529411765, | |
| "grad_norm": 0.035338570129669074, | |
| "learning_rate": 9.637512255510475e-05, | |
| "loss": 0.044, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.3176470588235294, | |
| "grad_norm": 0.03379157572576983, | |
| "learning_rate": 9.600859295816708e-05, | |
| "loss": 0.0398, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.32941176470588235, | |
| "grad_norm": 0.04025818097716925, | |
| "learning_rate": 9.56251808238275e-05, | |
| "loss": 0.0386, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.3411764705882353, | |
| "grad_norm": 0.04406959077203071, | |
| "learning_rate": 9.522502684253709e-05, | |
| "loss": 0.0432, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.35294117647058826, | |
| "grad_norm": 0.03978101147616123, | |
| "learning_rate": 9.480827784805278e-05, | |
| "loss": 0.0437, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.36470588235294116, | |
| "grad_norm": 0.06465311190984376, | |
| "learning_rate": 9.437508676355773e-05, | |
| "loss": 0.0361, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.3764705882352941, | |
| "grad_norm": 0.040734739666672064, | |
| "learning_rate": 9.392561254554713e-05, | |
| "loss": 0.0418, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.38823529411764707, | |
| "grad_norm": 0.049975058896066196, | |
| "learning_rate": 9.346002012550027e-05, | |
| "loss": 0.0392, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.035517771488692076, | |
| "learning_rate": 9.297848034936006e-05, | |
| "loss": 0.0339, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.4117647058823529, | |
| "grad_norm": 0.03716302242601897, | |
| "learning_rate": 9.248116991484229e-05, | |
| "loss": 0.0342, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.4235294117647059, | |
| "grad_norm": 0.025634143517022603, | |
| "learning_rate": 9.19682713065975e-05, | |
| "loss": 0.0288, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.43529411764705883, | |
| "grad_norm": 0.03813921333883497, | |
| "learning_rate": 9.143997272924973e-05, | |
| "loss": 0.0369, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.4470588235294118, | |
| "grad_norm": 0.035523177764954665, | |
| "learning_rate": 9.089646803833589e-05, | |
| "loss": 0.0342, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.4588235294117647, | |
| "grad_norm": 0.04123934232531618, | |
| "learning_rate": 9.033795666917191e-05, | |
| "loss": 0.0388, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 0.03480415683019527, | |
| "learning_rate": 8.976464356367134e-05, | |
| "loss": 0.0345, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4823529411764706, | |
| "grad_norm": 0.04301396030731473, | |
| "learning_rate": 8.917673909514322e-05, | |
| "loss": 0.0408, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.49411764705882355, | |
| "grad_norm": 0.04119245023494202, | |
| "learning_rate": 8.857445899109715e-05, | |
| "loss": 0.0319, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.5058823529411764, | |
| "grad_norm": 0.03800838266272386, | |
| "learning_rate": 8.795802425408352e-05, | |
| "loss": 0.0285, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.5176470588235295, | |
| "grad_norm": 0.026205323264901936, | |
| "learning_rate": 8.732766108059813e-05, | |
| "loss": 0.0322, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.5294117647058824, | |
| "grad_norm": 0.03931353228016249, | |
| "learning_rate": 8.668360077808093e-05, | |
| "loss": 0.0346, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5411764705882353, | |
| "grad_norm": 0.047847025199635816, | |
| "learning_rate": 8.602607968003935e-05, | |
| "loss": 0.0402, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.5529411764705883, | |
| "grad_norm": 0.03579784853738504, | |
| "learning_rate": 8.535533905932738e-05, | |
| "loss": 0.0262, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.5647058823529412, | |
| "grad_norm": 0.06521605078365518, | |
| "learning_rate": 8.467162503961208e-05, | |
| "loss": 0.0274, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.5764705882352941, | |
| "grad_norm": 0.027318223641263305, | |
| "learning_rate": 8.397518850506028e-05, | |
| "loss": 0.0321, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 0.03667920290649668, | |
| "learning_rate": 8.326628500827826e-05, | |
| "loss": 0.0348, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.03841473526092613, | |
| "learning_rate": 8.254517467653858e-05, | |
| "loss": 0.0286, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.611764705882353, | |
| "grad_norm": 0.03512510354697874, | |
| "learning_rate": 8.181212211632799e-05, | |
| "loss": 0.0335, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.6235294117647059, | |
| "grad_norm": 0.03758958960229201, | |
| "learning_rate": 8.106739631625217e-05, | |
| "loss": 0.0351, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.6352941176470588, | |
| "grad_norm": 0.03141384942404683, | |
| "learning_rate": 8.03112705483319e-05, | |
| "loss": 0.0292, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.6470588235294118, | |
| "grad_norm": 0.05123314473482919, | |
| "learning_rate": 7.954402226772804e-05, | |
| "loss": 0.0315, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.6588235294117647, | |
| "grad_norm": 0.05526844683394433, | |
| "learning_rate": 7.876593301093104e-05, | |
| "loss": 0.0376, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.6705882352941176, | |
| "grad_norm": 0.03442418193147859, | |
| "learning_rate": 7.797728829245321e-05, | |
| "loss": 0.0298, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.6823529411764706, | |
| "grad_norm": 0.05381333860636251, | |
| "learning_rate": 7.717837750006106e-05, | |
| "loss": 0.0285, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.6941176470588235, | |
| "grad_norm": 0.04595131495742476, | |
| "learning_rate": 7.636949378858646e-05, | |
| "loss": 0.0323, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 0.04547237722119376, | |
| "learning_rate": 7.555093397235552e-05, | |
| "loss": 0.0374, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7176470588235294, | |
| "grad_norm": 0.029277787154415737, | |
| "learning_rate": 7.472299841627451e-05, | |
| "loss": 0.0305, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.7294117647058823, | |
| "grad_norm": 0.054416629180822844, | |
| "learning_rate": 7.388599092561315e-05, | |
| "loss": 0.0305, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.7411764705882353, | |
| "grad_norm": 0.07521136626264392, | |
| "learning_rate": 7.304021863452524e-05, | |
| "loss": 0.0337, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.7529411764705882, | |
| "grad_norm": 0.027594110933489315, | |
| "learning_rate": 7.218599189334799e-05, | |
| "loss": 0.0268, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.7647058823529411, | |
| "grad_norm": 0.044043131009126026, | |
| "learning_rate": 7.1323624154721e-05, | |
| "loss": 0.0333, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.7764705882352941, | |
| "grad_norm": 0.088281551033726, | |
| "learning_rate": 7.045343185856701e-05, | |
| "loss": 0.0367, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.788235294117647, | |
| "grad_norm": 0.04328862177911276, | |
| "learning_rate": 6.957573431597646e-05, | |
| "loss": 0.0327, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.03272017374355755, | |
| "learning_rate": 6.869085359203844e-05, | |
| "loss": 0.0309, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.8117647058823529, | |
| "grad_norm": 0.06280711357209813, | |
| "learning_rate": 6.779911438766116e-05, | |
| "loss": 0.0327, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.8235294117647058, | |
| "grad_norm": 0.052927974729157626, | |
| "learning_rate": 6.690084392042513e-05, | |
| "loss": 0.0312, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8352941176470589, | |
| "grad_norm": 0.05429533945696401, | |
| "learning_rate": 6.599637180451294e-05, | |
| "loss": 0.0348, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.8470588235294118, | |
| "grad_norm": 0.048917527979116436, | |
| "learning_rate": 6.508602992975963e-05, | |
| "loss": 0.0317, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.8588235294117647, | |
| "grad_norm": 0.031886426687786254, | |
| "learning_rate": 6.417015233986786e-05, | |
| "loss": 0.0311, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.8705882352941177, | |
| "grad_norm": 0.03376820411697015, | |
| "learning_rate": 6.32490751098331e-05, | |
| "loss": 0.0263, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.8823529411764706, | |
| "grad_norm": 0.03767361887537691, | |
| "learning_rate": 6.232313622262296e-05, | |
| "loss": 0.0343, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.8941176470588236, | |
| "grad_norm": 0.04332615343931221, | |
| "learning_rate": 6.139267544515689e-05, | |
| "loss": 0.0304, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.9058823529411765, | |
| "grad_norm": 0.04146445958416543, | |
| "learning_rate": 6.045803420363084e-05, | |
| "loss": 0.0318, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.9176470588235294, | |
| "grad_norm": 0.03675850801075872, | |
| "learning_rate": 5.951955545823342e-05, | |
| "loss": 0.0286, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.9294117647058824, | |
| "grad_norm": 0.03951469611584691, | |
| "learning_rate": 5.8577583577298924e-05, | |
| "loss": 0.0297, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 0.03613565309891407, | |
| "learning_rate": 5.7632464210943726e-05, | |
| "loss": 0.0261, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9529411764705882, | |
| "grad_norm": 0.031008899018911143, | |
| "learning_rate": 5.668454416423242e-05, | |
| "loss": 0.026, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.9647058823529412, | |
| "grad_norm": 0.0692628338321925, | |
| "learning_rate": 5.573417126992003e-05, | |
| "loss": 0.0266, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.9764705882352941, | |
| "grad_norm": 0.05852763315877547, | |
| "learning_rate": 5.478169426081712e-05, | |
| "loss": 0.0295, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.9882352941176471, | |
| "grad_norm": 0.03199303327911728, | |
| "learning_rate": 5.38274626418248e-05, | |
| "loss": 0.0292, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.03733696573812923, | |
| "learning_rate": 5.287182656168618e-05, | |
| "loss": 0.0287, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.011764705882353, | |
| "grad_norm": 0.04222433319800323, | |
| "learning_rate": 5.191513668450178e-05, | |
| "loss": 0.0275, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.0235294117647058, | |
| "grad_norm": 0.033149244513485374, | |
| "learning_rate": 5.095774406105571e-05, | |
| "loss": 0.0245, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.035294117647059, | |
| "grad_norm": 0.030866296785570155, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0227, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.0470588235294118, | |
| "grad_norm": 0.029363896734282895, | |
| "learning_rate": 4.9042255938944296e-05, | |
| "loss": 0.0284, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.0588235294117647, | |
| "grad_norm": 0.03458859192281693, | |
| "learning_rate": 4.8084863315498234e-05, | |
| "loss": 0.0256, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0705882352941176, | |
| "grad_norm": 0.033499852353591555, | |
| "learning_rate": 4.712817343831384e-05, | |
| "loss": 0.028, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.0823529411764705, | |
| "grad_norm": 0.03348874233244485, | |
| "learning_rate": 4.6172537358175214e-05, | |
| "loss": 0.0269, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.0941176470588236, | |
| "grad_norm": 0.03268399220561992, | |
| "learning_rate": 4.521830573918289e-05, | |
| "loss": 0.0268, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.1058823529411765, | |
| "grad_norm": 0.03535497654299178, | |
| "learning_rate": 4.4265828730079987e-05, | |
| "loss": 0.0287, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.1176470588235294, | |
| "grad_norm": 0.035249786662349535, | |
| "learning_rate": 4.331545583576758e-05, | |
| "loss": 0.0254, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.1294117647058823, | |
| "grad_norm": 0.042058533392068366, | |
| "learning_rate": 4.236753578905627e-05, | |
| "loss": 0.0257, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.1411764705882352, | |
| "grad_norm": 0.035662240284239075, | |
| "learning_rate": 4.142241642270108e-05, | |
| "loss": 0.023, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.1529411764705881, | |
| "grad_norm": 0.03205942374398337, | |
| "learning_rate": 4.0480444541766576e-05, | |
| "loss": 0.0227, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.1647058823529413, | |
| "grad_norm": 0.03410677741547236, | |
| "learning_rate": 3.954196579636918e-05, | |
| "loss": 0.025, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.1764705882352942, | |
| "grad_norm": 0.037977473156353136, | |
| "learning_rate": 3.8607324554843136e-05, | |
| "loss": 0.0245, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.188235294117647, | |
| "grad_norm": 0.034304594099498986, | |
| "learning_rate": 3.7676863777377054e-05, | |
| "loss": 0.0251, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.04024140735442772, | |
| "learning_rate": 3.675092489016693e-05, | |
| "loss": 0.0258, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.2117647058823529, | |
| "grad_norm": 0.03640815220830249, | |
| "learning_rate": 3.582984766013215e-05, | |
| "loss": 0.0254, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.223529411764706, | |
| "grad_norm": 0.05308696935770744, | |
| "learning_rate": 3.4913970070240386e-05, | |
| "loss": 0.0317, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.2352941176470589, | |
| "grad_norm": 0.03532605452768607, | |
| "learning_rate": 3.4003628195487057e-05, | |
| "loss": 0.0272, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.2470588235294118, | |
| "grad_norm": 0.03399447551991043, | |
| "learning_rate": 3.309915607957487e-05, | |
| "loss": 0.0256, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.2588235294117647, | |
| "grad_norm": 0.03154244903849424, | |
| "learning_rate": 3.2200885612338845e-05, | |
| "loss": 0.0207, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.2705882352941176, | |
| "grad_norm": 0.04861329315126559, | |
| "learning_rate": 3.130914640796157e-05, | |
| "loss": 0.0297, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.2823529411764705, | |
| "grad_norm": 0.04026826659177659, | |
| "learning_rate": 3.0424265684023558e-05, | |
| "loss": 0.0238, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.2941176470588236, | |
| "grad_norm": 0.03529433347937337, | |
| "learning_rate": 2.9546568141433006e-05, | |
| "loss": 0.0247, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.3058823529411765, | |
| "grad_norm": 0.035218945260167514, | |
| "learning_rate": 2.8676375845279013e-05, | |
| "loss": 0.0271, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.3176470588235294, | |
| "grad_norm": 0.03409897136114451, | |
| "learning_rate": 2.7814008106652012e-05, | |
| "loss": 0.0253, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.3294117647058823, | |
| "grad_norm": 0.03345892031561033, | |
| "learning_rate": 2.6959781365474758e-05, | |
| "loss": 0.0196, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.3411764705882354, | |
| "grad_norm": 0.041711810588722247, | |
| "learning_rate": 2.6114009074386846e-05, | |
| "loss": 0.025, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.3529411764705883, | |
| "grad_norm": 0.03404740591421045, | |
| "learning_rate": 2.527700158372548e-05, | |
| "loss": 0.0245, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.3647058823529412, | |
| "grad_norm": 0.03806934179838023, | |
| "learning_rate": 2.4449066027644475e-05, | |
| "loss": 0.0202, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.3764705882352941, | |
| "grad_norm": 0.037833218608223404, | |
| "learning_rate": 2.363050621141354e-05, | |
| "loss": 0.0269, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.388235294117647, | |
| "grad_norm": 0.040060220347060395, | |
| "learning_rate": 2.282162249993895e-05, | |
| "loss": 0.0259, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.039492305883863446, | |
| "learning_rate": 2.20227117075468e-05, | |
| "loss": 0.0263, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.4117647058823528, | |
| "grad_norm": 0.04146681928718736, | |
| "learning_rate": 2.1234066989068972e-05, | |
| "loss": 0.0237, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.423529411764706, | |
| "grad_norm": 0.044440473554041314, | |
| "learning_rate": 2.0455977732271993e-05, | |
| "loss": 0.0221, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.4352941176470588, | |
| "grad_norm": 0.033828391729890175, | |
| "learning_rate": 1.9688729451668114e-05, | |
| "loss": 0.0203, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.4470588235294117, | |
| "grad_norm": 0.038308578853440155, | |
| "learning_rate": 1.893260368374786e-05, | |
| "loss": 0.0271, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.4588235294117646, | |
| "grad_norm": 0.035696891554767116, | |
| "learning_rate": 1.818787788367202e-05, | |
| "loss": 0.0241, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.4705882352941178, | |
| "grad_norm": 0.03450606035436337, | |
| "learning_rate": 1.7454825323461448e-05, | |
| "loss": 0.0227, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.4823529411764707, | |
| "grad_norm": 0.04502948564553977, | |
| "learning_rate": 1.673371499172174e-05, | |
| "loss": 0.0282, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.4941176470588236, | |
| "grad_norm": 0.03775360455129196, | |
| "learning_rate": 1.6024811494939724e-05, | |
| "loss": 0.022, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.5058823529411764, | |
| "grad_norm": 0.035858486799161725, | |
| "learning_rate": 1.532837496038792e-05, | |
| "loss": 0.0247, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.5176470588235293, | |
| "grad_norm": 0.03737088577494754, | |
| "learning_rate": 1.4644660940672627e-05, | |
| "loss": 0.0257, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.5294117647058822, | |
| "grad_norm": 0.03461325321460445, | |
| "learning_rate": 1.3973920319960655e-05, | |
| "loss": 0.0217, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.5411764705882351, | |
| "grad_norm": 0.034501790257859405, | |
| "learning_rate": 1.3316399221919074e-05, | |
| "loss": 0.0232, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.5529411764705883, | |
| "grad_norm": 0.03782875214918853, | |
| "learning_rate": 1.2672338919401866e-05, | |
| "loss": 0.0225, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.5647058823529412, | |
| "grad_norm": 0.04306989405034864, | |
| "learning_rate": 1.2041975745916472e-05, | |
| "loss": 0.0269, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.576470588235294, | |
| "grad_norm": 0.037700226280294416, | |
| "learning_rate": 1.1425541008902851e-05, | |
| "loss": 0.0249, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.5882352941176472, | |
| "grad_norm": 0.03890317937719569, | |
| "learning_rate": 1.082326090485679e-05, | |
| "loss": 0.0255, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.04219733674070769, | |
| "learning_rate": 1.0235356436328675e-05, | |
| "loss": 0.0286, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.611764705882353, | |
| "grad_norm": 0.03273981321791409, | |
| "learning_rate": 9.662043330828085e-06, | |
| "loss": 0.0201, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.6235294117647059, | |
| "grad_norm": 0.03302849889769006, | |
| "learning_rate": 9.103531961664118e-06, | |
| "loss": 0.0212, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.6352941176470588, | |
| "grad_norm": 0.04021301444135266, | |
| "learning_rate": 8.560027270750277e-06, | |
| "loss": 0.0244, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.6470588235294117, | |
| "grad_norm": 0.034002953931695744, | |
| "learning_rate": 8.031728693402502e-06, | |
| "loss": 0.0193, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.6588235294117646, | |
| "grad_norm": 0.03494170057614315, | |
| "learning_rate": 7.518830085157735e-06, | |
| "loss": 0.0232, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.6705882352941175, | |
| "grad_norm": 0.05067951397515134, | |
| "learning_rate": 7.0215196506399515e-06, | |
| "loss": 0.0217, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.6823529411764706, | |
| "grad_norm": 0.035476236926014315, | |
| "learning_rate": 6.539979874499747e-06, | |
| "loss": 0.0213, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.6941176470588235, | |
| "grad_norm": 0.036655504933770414, | |
| "learning_rate": 6.07438745445289e-06, | |
| "loss": 0.0234, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.7058823529411766, | |
| "grad_norm": 0.041657363570965146, | |
| "learning_rate": 5.624913236442286e-06, | |
| "loss": 0.0294, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.7176470588235295, | |
| "grad_norm": 0.05150076449892236, | |
| "learning_rate": 5.191722151947226e-06, | |
| "loss": 0.0258, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.7294117647058824, | |
| "grad_norm": 0.0404588183080739, | |
| "learning_rate": 4.7749731574629196e-06, | |
| "loss": 0.0224, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.7411764705882353, | |
| "grad_norm": 0.033224356486454734, | |
| "learning_rate": 4.374819176172501e-06, | |
| "loss": 0.021, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.7529411764705882, | |
| "grad_norm": 0.03795419337831155, | |
| "learning_rate": 3.991407041832912e-06, | |
| "loss": 0.022, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.7647058823529411, | |
| "grad_norm": 0.04200165421199087, | |
| "learning_rate": 3.6248774448952695e-06, | |
| "loss": 0.0265, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.776470588235294, | |
| "grad_norm": 0.03845197891559611, | |
| "learning_rate": 3.2753648808794503e-06, | |
| "loss": 0.024, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.788235294117647, | |
| "grad_norm": 0.037815699401078935, | |
| "learning_rate": 2.942997601021924e-06, | |
| "loss": 0.0257, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.03519274514771082, | |
| "learning_rate": 2.6278975652147875e-06, | |
| "loss": 0.0232, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.811764705882353, | |
| "grad_norm": 0.03706158210535721, | |
| "learning_rate": 2.330180397253473e-06, | |
| "loss": 0.0232, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.8235294117647058, | |
| "grad_norm": 0.04248155649655448, | |
| "learning_rate": 2.049955342409349e-06, | |
| "loss": 0.0227, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.835294117647059, | |
| "grad_norm": 0.0368432870786149, | |
| "learning_rate": 1.7873252273429509e-06, | |
| "loss": 0.0224, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.8470588235294119, | |
| "grad_norm": 0.03332568698884207, | |
| "learning_rate": 1.542386422372405e-06, | |
| "loss": 0.0208, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.8588235294117648, | |
| "grad_norm": 0.036213210721090155, | |
| "learning_rate": 1.3152288061110518e-06, | |
| "loss": 0.0237, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.8705882352941177, | |
| "grad_norm": 0.035507819488375524, | |
| "learning_rate": 1.1059357324870455e-06, | |
| "loss": 0.0212, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.8823529411764706, | |
| "grad_norm": 0.032773052246892385, | |
| "learning_rate": 9.145840001572537e-07, | |
| "loss": 0.0209, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.8941176470588235, | |
| "grad_norm": 0.03720092396666804, | |
| "learning_rate": 7.41243824326504e-07, | |
| "loss": 0.0237, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.9058823529411764, | |
| "grad_norm": 0.061643042263861664, | |
| "learning_rate": 5.859788109825793e-07, | |
| "loss": 0.0263, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.9176470588235293, | |
| "grad_norm": 0.037441015710064, | |
| "learning_rate": 4.48845933556441e-07, | |
| "loss": 0.025, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.9294117647058824, | |
| "grad_norm": 0.03459803638637435, | |
| "learning_rate": 3.2989551201624835e-07, | |
| "loss": 0.0215, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.9411764705882353, | |
| "grad_norm": 0.04948419428076397, | |
| "learning_rate": 2.2917119440275524e-07, | |
| "loss": 0.0273, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.9529411764705882, | |
| "grad_norm": 0.040939656098896986, | |
| "learning_rate": 1.4670994081297795e-07, | |
| "loss": 0.0251, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.9647058823529413, | |
| "grad_norm": 0.03941121437437399, | |
| "learning_rate": 8.254200983794369e-08, | |
| "loss": 0.0273, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.9764705882352942, | |
| "grad_norm": 0.04847566902018068, | |
| "learning_rate": 3.669094745950008e-08, | |
| "loss": 0.0239, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.988235294117647, | |
| "grad_norm": 0.04650396689287853, | |
| "learning_rate": 9.17357841028199e-09, | |
| "loss": 0.0207, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.04099329304128327, | |
| "learning_rate": 0.0, | |
| "loss": 0.0232, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "step": 170, | |
| "total_flos": 861011422740480.0, | |
| "train_loss": 0.03621660071041654, | |
| "train_runtime": 2036.7134, | |
| "train_samples_per_second": 0.668, | |
| "train_steps_per_second": 0.083 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 170, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 50000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 861011422740480.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |