{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011764705882352941, "grad_norm": 0.035323630468616266, "learning_rate": 1.6666666666666667e-05, "loss": 0.1079, "step": 1 }, { "epoch": 0.023529411764705882, "grad_norm": 0.03136221824048882, "learning_rate": 3.3333333333333335e-05, "loss": 0.1141, "step": 2 }, { "epoch": 0.03529411764705882, "grad_norm": 0.03804619743095702, "learning_rate": 5e-05, "loss": 0.1204, "step": 3 }, { "epoch": 0.047058823529411764, "grad_norm": 0.04070206833240694, "learning_rate": 6.666666666666667e-05, "loss": 0.1276, "step": 4 }, { "epoch": 0.058823529411764705, "grad_norm": 0.07589226019496102, "learning_rate": 8.333333333333334e-05, "loss": 0.1689, "step": 5 }, { "epoch": 0.07058823529411765, "grad_norm": 0.07319090807548947, "learning_rate": 0.0001, "loss": 0.1345, "step": 6 }, { "epoch": 0.08235294117647059, "grad_norm": 0.06996995505114396, "learning_rate": 9.999082642158973e-05, "loss": 0.1202, "step": 7 }, { "epoch": 0.09411764705882353, "grad_norm": 0.05248182376644038, "learning_rate": 9.99633090525405e-05, "loss": 0.0983, "step": 8 }, { "epoch": 0.10588235294117647, "grad_norm": 0.06219777780481942, "learning_rate": 9.991745799016206e-05, "loss": 0.0954, "step": 9 }, { "epoch": 0.11764705882352941, "grad_norm": 0.06595531802798588, "learning_rate": 9.985329005918702e-05, "loss": 0.0977, "step": 10 }, { "epoch": 0.12941176470588237, "grad_norm": 0.03606112500648923, "learning_rate": 9.977082880559725e-05, "loss": 0.0626, "step": 11 }, { "epoch": 0.1411764705882353, "grad_norm": 0.0722550944651642, "learning_rate": 9.967010448798375e-05, "loss": 0.0929, "step": 12 }, { "epoch": 0.15294117647058825, "grad_norm": 0.054771186088142604, "learning_rate": 9.955115406644356e-05, "loss": 0.0845, "step": 13 }, { "epoch": 0.16470588235294117, "grad_norm": 0.051628745573165394, "learning_rate": 9.941402118901744e-05, "loss": 0.0659, "step": 14 }, { "epoch": 0.17647058823529413, "grad_norm": 0.0486190850226073, "learning_rate": 9.92587561756735e-05, "loss": 0.0531, "step": 15 }, { "epoch": 0.18823529411764706, "grad_norm": 0.05588842841049585, "learning_rate": 9.908541599984276e-05, "loss": 0.0642, "step": 16 }, { "epoch": 0.2, "grad_norm": 0.05641046123379798, "learning_rate": 9.889406426751296e-05, "loss": 0.0588, "step": 17 }, { "epoch": 0.21176470588235294, "grad_norm": 0.054058284278693136, "learning_rate": 9.868477119388896e-05, "loss": 0.0649, "step": 18 }, { "epoch": 0.2235294117647059, "grad_norm": 0.0531804463754218, "learning_rate": 9.84576135776276e-05, "loss": 0.0677, "step": 19 }, { "epoch": 0.23529411764705882, "grad_norm": 0.058590111826598894, "learning_rate": 9.821267477265705e-05, "loss": 0.0606, "step": 20 }, { "epoch": 0.24705882352941178, "grad_norm": 0.04785785363882365, "learning_rate": 9.795004465759065e-05, "loss": 0.0609, "step": 21 }, { "epoch": 0.25882352941176473, "grad_norm": 0.05206751702161926, "learning_rate": 9.766981960274653e-05, "loss": 0.0475, "step": 22 }, { "epoch": 0.27058823529411763, "grad_norm": 0.037054145575494114, "learning_rate": 9.737210243478521e-05, "loss": 0.0415, "step": 23 }, { "epoch": 0.2823529411764706, "grad_norm": 0.04044144505681365, "learning_rate": 9.705700239897809e-05, "loss": 0.0474, "step": 24 }, { "epoch": 0.29411764705882354, "grad_norm": 0.030749676244442067, "learning_rate": 9.672463511912055e-05, "loss": 0.0422, "step": 25 }, { "epoch": 0.3058823529411765, "grad_norm": 0.035338570129669074, "learning_rate": 9.637512255510475e-05, "loss": 0.044, "step": 26 }, { "epoch": 0.3176470588235294, "grad_norm": 0.03379157572576983, "learning_rate": 9.600859295816708e-05, "loss": 0.0398, "step": 27 }, { "epoch": 0.32941176470588235, "grad_norm": 0.04025818097716925, "learning_rate": 9.56251808238275e-05, "loss": 0.0386, "step": 28 }, { "epoch": 0.3411764705882353, "grad_norm": 0.04406959077203071, "learning_rate": 9.522502684253709e-05, "loss": 0.0432, "step": 29 }, { "epoch": 0.35294117647058826, "grad_norm": 0.03978101147616123, "learning_rate": 9.480827784805278e-05, "loss": 0.0437, "step": 30 }, { "epoch": 0.36470588235294116, "grad_norm": 0.06465311190984376, "learning_rate": 9.437508676355773e-05, "loss": 0.0361, "step": 31 }, { "epoch": 0.3764705882352941, "grad_norm": 0.040734739666672064, "learning_rate": 9.392561254554713e-05, "loss": 0.0418, "step": 32 }, { "epoch": 0.38823529411764707, "grad_norm": 0.049975058896066196, "learning_rate": 9.346002012550027e-05, "loss": 0.0392, "step": 33 }, { "epoch": 0.4, "grad_norm": 0.035517771488692076, "learning_rate": 9.297848034936006e-05, "loss": 0.0339, "step": 34 }, { "epoch": 0.4117647058823529, "grad_norm": 0.03716302242601897, "learning_rate": 9.248116991484229e-05, "loss": 0.0342, "step": 35 }, { "epoch": 0.4235294117647059, "grad_norm": 0.025634143517022603, "learning_rate": 9.19682713065975e-05, "loss": 0.0288, "step": 36 }, { "epoch": 0.43529411764705883, "grad_norm": 0.03813921333883497, "learning_rate": 9.143997272924973e-05, "loss": 0.0369, "step": 37 }, { "epoch": 0.4470588235294118, "grad_norm": 0.035523177764954665, "learning_rate": 9.089646803833589e-05, "loss": 0.0342, "step": 38 }, { "epoch": 0.4588235294117647, "grad_norm": 0.04123934232531618, "learning_rate": 9.033795666917191e-05, "loss": 0.0388, "step": 39 }, { "epoch": 0.47058823529411764, "grad_norm": 0.03480415683019527, "learning_rate": 8.976464356367134e-05, "loss": 0.0345, "step": 40 }, { "epoch": 0.4823529411764706, "grad_norm": 0.04301396030731473, "learning_rate": 8.917673909514322e-05, "loss": 0.0408, "step": 41 }, { "epoch": 0.49411764705882355, "grad_norm": 0.04119245023494202, "learning_rate": 8.857445899109715e-05, "loss": 0.0319, "step": 42 }, { "epoch": 0.5058823529411764, "grad_norm": 0.03800838266272386, "learning_rate": 8.795802425408352e-05, "loss": 0.0285, "step": 43 }, { "epoch": 0.5176470588235295, "grad_norm": 0.026205323264901936, "learning_rate": 8.732766108059813e-05, "loss": 0.0322, "step": 44 }, { "epoch": 0.5294117647058824, "grad_norm": 0.03931353228016249, "learning_rate": 8.668360077808093e-05, "loss": 0.0346, "step": 45 }, { "epoch": 0.5411764705882353, "grad_norm": 0.047847025199635816, "learning_rate": 8.602607968003935e-05, "loss": 0.0402, "step": 46 }, { "epoch": 0.5529411764705883, "grad_norm": 0.03579784853738504, "learning_rate": 8.535533905932738e-05, "loss": 0.0262, "step": 47 }, { "epoch": 0.5647058823529412, "grad_norm": 0.06521605078365518, "learning_rate": 8.467162503961208e-05, "loss": 0.0274, "step": 48 }, { "epoch": 0.5764705882352941, "grad_norm": 0.027318223641263305, "learning_rate": 8.397518850506028e-05, "loss": 0.0321, "step": 49 }, { "epoch": 0.5882352941176471, "grad_norm": 0.03667920290649668, "learning_rate": 8.326628500827826e-05, "loss": 0.0348, "step": 50 }, { "epoch": 0.6, "grad_norm": 0.03841473526092613, "learning_rate": 8.254517467653858e-05, "loss": 0.0286, "step": 51 }, { "epoch": 0.611764705882353, "grad_norm": 0.03512510354697874, "learning_rate": 8.181212211632799e-05, "loss": 0.0335, "step": 52 }, { "epoch": 0.6235294117647059, "grad_norm": 0.03758958960229201, "learning_rate": 8.106739631625217e-05, "loss": 0.0351, "step": 53 }, { "epoch": 0.6352941176470588, "grad_norm": 0.03141384942404683, "learning_rate": 8.03112705483319e-05, "loss": 0.0292, "step": 54 }, { "epoch": 0.6470588235294118, "grad_norm": 0.05123314473482919, "learning_rate": 7.954402226772804e-05, "loss": 0.0315, "step": 55 }, { "epoch": 0.6588235294117647, "grad_norm": 0.05526844683394433, "learning_rate": 7.876593301093104e-05, "loss": 0.0376, "step": 56 }, { "epoch": 0.6705882352941176, "grad_norm": 0.03442418193147859, "learning_rate": 7.797728829245321e-05, "loss": 0.0298, "step": 57 }, { "epoch": 0.6823529411764706, "grad_norm": 0.05381333860636251, "learning_rate": 7.717837750006106e-05, "loss": 0.0285, "step": 58 }, { "epoch": 0.6941176470588235, "grad_norm": 0.04595131495742476, "learning_rate": 7.636949378858646e-05, "loss": 0.0323, "step": 59 }, { "epoch": 0.7058823529411765, "grad_norm": 0.04547237722119376, "learning_rate": 7.555093397235552e-05, "loss": 0.0374, "step": 60 }, { "epoch": 0.7176470588235294, "grad_norm": 0.029277787154415737, "learning_rate": 7.472299841627451e-05, "loss": 0.0305, "step": 61 }, { "epoch": 0.7294117647058823, "grad_norm": 0.054416629180822844, "learning_rate": 7.388599092561315e-05, "loss": 0.0305, "step": 62 }, { "epoch": 0.7411764705882353, "grad_norm": 0.07521136626264392, "learning_rate": 7.304021863452524e-05, "loss": 0.0337, "step": 63 }, { "epoch": 0.7529411764705882, "grad_norm": 0.027594110933489315, "learning_rate": 7.218599189334799e-05, "loss": 0.0268, "step": 64 }, { "epoch": 0.7647058823529411, "grad_norm": 0.044043131009126026, "learning_rate": 7.1323624154721e-05, "loss": 0.0333, "step": 65 }, { "epoch": 0.7764705882352941, "grad_norm": 0.088281551033726, "learning_rate": 7.045343185856701e-05, "loss": 0.0367, "step": 66 }, { "epoch": 0.788235294117647, "grad_norm": 0.04328862177911276, "learning_rate": 6.957573431597646e-05, "loss": 0.0327, "step": 67 }, { "epoch": 0.8, "grad_norm": 0.03272017374355755, "learning_rate": 6.869085359203844e-05, "loss": 0.0309, "step": 68 }, { "epoch": 0.8117647058823529, "grad_norm": 0.06280711357209813, "learning_rate": 6.779911438766116e-05, "loss": 0.0327, "step": 69 }, { "epoch": 0.8235294117647058, "grad_norm": 0.052927974729157626, "learning_rate": 6.690084392042513e-05, "loss": 0.0312, "step": 70 }, { "epoch": 0.8352941176470589, "grad_norm": 0.05429533945696401, "learning_rate": 6.599637180451294e-05, "loss": 0.0348, "step": 71 }, { "epoch": 0.8470588235294118, "grad_norm": 0.048917527979116436, "learning_rate": 6.508602992975963e-05, "loss": 0.0317, "step": 72 }, { "epoch": 0.8588235294117647, "grad_norm": 0.031886426687786254, "learning_rate": 6.417015233986786e-05, "loss": 0.0311, "step": 73 }, { "epoch": 0.8705882352941177, "grad_norm": 0.03376820411697015, "learning_rate": 6.32490751098331e-05, "loss": 0.0263, "step": 74 }, { "epoch": 0.8823529411764706, "grad_norm": 0.03767361887537691, "learning_rate": 6.232313622262296e-05, "loss": 0.0343, "step": 75 }, { "epoch": 0.8941176470588236, "grad_norm": 0.04332615343931221, "learning_rate": 6.139267544515689e-05, "loss": 0.0304, "step": 76 }, { "epoch": 0.9058823529411765, "grad_norm": 0.04146445958416543, "learning_rate": 6.045803420363084e-05, "loss": 0.0318, "step": 77 }, { "epoch": 0.9176470588235294, "grad_norm": 0.03675850801075872, "learning_rate": 5.951955545823342e-05, "loss": 0.0286, "step": 78 }, { "epoch": 0.9294117647058824, "grad_norm": 0.03951469611584691, "learning_rate": 5.8577583577298924e-05, "loss": 0.0297, "step": 79 }, { "epoch": 0.9411764705882353, "grad_norm": 0.03613565309891407, "learning_rate": 5.7632464210943726e-05, "loss": 0.0261, "step": 80 }, { "epoch": 0.9529411764705882, "grad_norm": 0.031008899018911143, "learning_rate": 5.668454416423242e-05, "loss": 0.026, "step": 81 }, { "epoch": 0.9647058823529412, "grad_norm": 0.0692628338321925, "learning_rate": 5.573417126992003e-05, "loss": 0.0266, "step": 82 }, { "epoch": 0.9764705882352941, "grad_norm": 0.05852763315877547, "learning_rate": 5.478169426081712e-05, "loss": 0.0295, "step": 83 }, { "epoch": 0.9882352941176471, "grad_norm": 0.03199303327911728, "learning_rate": 5.38274626418248e-05, "loss": 0.0292, "step": 84 }, { "epoch": 1.0, "grad_norm": 0.03733696573812923, "learning_rate": 5.287182656168618e-05, "loss": 0.0287, "step": 85 }, { "epoch": 1.011764705882353, "grad_norm": 0.04222433319800323, "learning_rate": 5.191513668450178e-05, "loss": 0.0275, "step": 86 }, { "epoch": 1.0235294117647058, "grad_norm": 0.033149244513485374, "learning_rate": 5.095774406105571e-05, "loss": 0.0245, "step": 87 }, { "epoch": 1.035294117647059, "grad_norm": 0.030866296785570155, "learning_rate": 5e-05, "loss": 0.0227, "step": 88 }, { "epoch": 1.0470588235294118, "grad_norm": 0.029363896734282895, "learning_rate": 4.9042255938944296e-05, "loss": 0.0284, "step": 89 }, { "epoch": 1.0588235294117647, "grad_norm": 0.03458859192281693, "learning_rate": 4.8084863315498234e-05, "loss": 0.0256, "step": 90 }, { "epoch": 1.0705882352941176, "grad_norm": 0.033499852353591555, "learning_rate": 4.712817343831384e-05, "loss": 0.028, "step": 91 }, { "epoch": 1.0823529411764705, "grad_norm": 0.03348874233244485, "learning_rate": 4.6172537358175214e-05, "loss": 0.0269, "step": 92 }, { "epoch": 1.0941176470588236, "grad_norm": 0.03268399220561992, "learning_rate": 4.521830573918289e-05, "loss": 0.0268, "step": 93 }, { "epoch": 1.1058823529411765, "grad_norm": 0.03535497654299178, "learning_rate": 4.4265828730079987e-05, "loss": 0.0287, "step": 94 }, { "epoch": 1.1176470588235294, "grad_norm": 0.035249786662349535, "learning_rate": 4.331545583576758e-05, "loss": 0.0254, "step": 95 }, { "epoch": 1.1294117647058823, "grad_norm": 0.042058533392068366, "learning_rate": 4.236753578905627e-05, "loss": 0.0257, "step": 96 }, { "epoch": 1.1411764705882352, "grad_norm": 0.035662240284239075, "learning_rate": 4.142241642270108e-05, "loss": 0.023, "step": 97 }, { "epoch": 1.1529411764705881, "grad_norm": 0.03205942374398337, "learning_rate": 4.0480444541766576e-05, "loss": 0.0227, "step": 98 }, { "epoch": 1.1647058823529413, "grad_norm": 0.03410677741547236, "learning_rate": 3.954196579636918e-05, "loss": 0.025, "step": 99 }, { "epoch": 1.1764705882352942, "grad_norm": 0.037977473156353136, "learning_rate": 3.8607324554843136e-05, "loss": 0.0245, "step": 100 }, { "epoch": 1.188235294117647, "grad_norm": 0.034304594099498986, "learning_rate": 3.7676863777377054e-05, "loss": 0.0251, "step": 101 }, { "epoch": 1.2, "grad_norm": 0.04024140735442772, "learning_rate": 3.675092489016693e-05, "loss": 0.0258, "step": 102 }, { "epoch": 1.2117647058823529, "grad_norm": 0.03640815220830249, "learning_rate": 3.582984766013215e-05, "loss": 0.0254, "step": 103 }, { "epoch": 1.223529411764706, "grad_norm": 0.05308696935770744, "learning_rate": 3.4913970070240386e-05, "loss": 0.0317, "step": 104 }, { "epoch": 1.2352941176470589, "grad_norm": 0.03532605452768607, "learning_rate": 3.4003628195487057e-05, "loss": 0.0272, "step": 105 }, { "epoch": 1.2470588235294118, "grad_norm": 0.03399447551991043, "learning_rate": 3.309915607957487e-05, "loss": 0.0256, "step": 106 }, { "epoch": 1.2588235294117647, "grad_norm": 0.03154244903849424, "learning_rate": 3.2200885612338845e-05, "loss": 0.0207, "step": 107 }, { "epoch": 1.2705882352941176, "grad_norm": 0.04861329315126559, "learning_rate": 3.130914640796157e-05, "loss": 0.0297, "step": 108 }, { "epoch": 1.2823529411764705, "grad_norm": 0.04026826659177659, "learning_rate": 3.0424265684023558e-05, "loss": 0.0238, "step": 109 }, { "epoch": 1.2941176470588236, "grad_norm": 0.03529433347937337, "learning_rate": 2.9546568141433006e-05, "loss": 0.0247, "step": 110 }, { "epoch": 1.3058823529411765, "grad_norm": 0.035218945260167514, "learning_rate": 2.8676375845279013e-05, "loss": 0.0271, "step": 111 }, { "epoch": 1.3176470588235294, "grad_norm": 0.03409897136114451, "learning_rate": 2.7814008106652012e-05, "loss": 0.0253, "step": 112 }, { "epoch": 1.3294117647058823, "grad_norm": 0.03345892031561033, "learning_rate": 2.6959781365474758e-05, "loss": 0.0196, "step": 113 }, { "epoch": 1.3411764705882354, "grad_norm": 0.041711810588722247, "learning_rate": 2.6114009074386846e-05, "loss": 0.025, "step": 114 }, { "epoch": 1.3529411764705883, "grad_norm": 0.03404740591421045, "learning_rate": 2.527700158372548e-05, "loss": 0.0245, "step": 115 }, { "epoch": 1.3647058823529412, "grad_norm": 0.03806934179838023, "learning_rate": 2.4449066027644475e-05, "loss": 0.0202, "step": 116 }, { "epoch": 1.3764705882352941, "grad_norm": 0.037833218608223404, "learning_rate": 2.363050621141354e-05, "loss": 0.0269, "step": 117 }, { "epoch": 1.388235294117647, "grad_norm": 0.040060220347060395, "learning_rate": 2.282162249993895e-05, "loss": 0.0259, "step": 118 }, { "epoch": 1.4, "grad_norm": 0.039492305883863446, "learning_rate": 2.20227117075468e-05, "loss": 0.0263, "step": 119 }, { "epoch": 1.4117647058823528, "grad_norm": 0.04146681928718736, "learning_rate": 2.1234066989068972e-05, "loss": 0.0237, "step": 120 }, { "epoch": 1.423529411764706, "grad_norm": 0.044440473554041314, "learning_rate": 2.0455977732271993e-05, "loss": 0.0221, "step": 121 }, { "epoch": 1.4352941176470588, "grad_norm": 0.033828391729890175, "learning_rate": 1.9688729451668114e-05, "loss": 0.0203, "step": 122 }, { "epoch": 1.4470588235294117, "grad_norm": 0.038308578853440155, "learning_rate": 1.893260368374786e-05, "loss": 0.0271, "step": 123 }, { "epoch": 1.4588235294117646, "grad_norm": 0.035696891554767116, "learning_rate": 1.818787788367202e-05, "loss": 0.0241, "step": 124 }, { "epoch": 1.4705882352941178, "grad_norm": 0.03450606035436337, "learning_rate": 1.7454825323461448e-05, "loss": 0.0227, "step": 125 }, { "epoch": 1.4823529411764707, "grad_norm": 0.04502948564553977, "learning_rate": 1.673371499172174e-05, "loss": 0.0282, "step": 126 }, { "epoch": 1.4941176470588236, "grad_norm": 0.03775360455129196, "learning_rate": 1.6024811494939724e-05, "loss": 0.022, "step": 127 }, { "epoch": 1.5058823529411764, "grad_norm": 0.035858486799161725, "learning_rate": 1.532837496038792e-05, "loss": 0.0247, "step": 128 }, { "epoch": 1.5176470588235293, "grad_norm": 0.03737088577494754, "learning_rate": 1.4644660940672627e-05, "loss": 0.0257, "step": 129 }, { "epoch": 1.5294117647058822, "grad_norm": 0.03461325321460445, "learning_rate": 1.3973920319960655e-05, "loss": 0.0217, "step": 130 }, { "epoch": 1.5411764705882351, "grad_norm": 0.034501790257859405, "learning_rate": 1.3316399221919074e-05, "loss": 0.0232, "step": 131 }, { "epoch": 1.5529411764705883, "grad_norm": 0.03782875214918853, "learning_rate": 1.2672338919401866e-05, "loss": 0.0225, "step": 132 }, { "epoch": 1.5647058823529412, "grad_norm": 0.04306989405034864, "learning_rate": 1.2041975745916472e-05, "loss": 0.0269, "step": 133 }, { "epoch": 1.576470588235294, "grad_norm": 0.037700226280294416, "learning_rate": 1.1425541008902851e-05, "loss": 0.0249, "step": 134 }, { "epoch": 1.5882352941176472, "grad_norm": 0.03890317937719569, "learning_rate": 1.082326090485679e-05, "loss": 0.0255, "step": 135 }, { "epoch": 1.6, "grad_norm": 0.04219733674070769, "learning_rate": 1.0235356436328675e-05, "loss": 0.0286, "step": 136 }, { "epoch": 1.611764705882353, "grad_norm": 0.03273981321791409, "learning_rate": 9.662043330828085e-06, "loss": 0.0201, "step": 137 }, { "epoch": 1.6235294117647059, "grad_norm": 0.03302849889769006, "learning_rate": 9.103531961664118e-06, "loss": 0.0212, "step": 138 }, { "epoch": 1.6352941176470588, "grad_norm": 0.04021301444135266, "learning_rate": 8.560027270750277e-06, "loss": 0.0244, "step": 139 }, { "epoch": 1.6470588235294117, "grad_norm": 0.034002953931695744, "learning_rate": 8.031728693402502e-06, "loss": 0.0193, "step": 140 }, { "epoch": 1.6588235294117646, "grad_norm": 0.03494170057614315, "learning_rate": 7.518830085157735e-06, "loss": 0.0232, "step": 141 }, { "epoch": 1.6705882352941175, "grad_norm": 0.05067951397515134, "learning_rate": 7.0215196506399515e-06, "loss": 0.0217, "step": 142 }, { "epoch": 1.6823529411764706, "grad_norm": 0.035476236926014315, "learning_rate": 6.539979874499747e-06, "loss": 0.0213, "step": 143 }, { "epoch": 1.6941176470588235, "grad_norm": 0.036655504933770414, "learning_rate": 6.07438745445289e-06, "loss": 0.0234, "step": 144 }, { "epoch": 1.7058823529411766, "grad_norm": 0.041657363570965146, "learning_rate": 5.624913236442286e-06, "loss": 0.0294, "step": 145 }, { "epoch": 1.7176470588235295, "grad_norm": 0.05150076449892236, "learning_rate": 5.191722151947226e-06, "loss": 0.0258, "step": 146 }, { "epoch": 1.7294117647058824, "grad_norm": 0.0404588183080739, "learning_rate": 4.7749731574629196e-06, "loss": 0.0224, "step": 147 }, { "epoch": 1.7411764705882353, "grad_norm": 0.033224356486454734, "learning_rate": 4.374819176172501e-06, "loss": 0.021, "step": 148 }, { "epoch": 1.7529411764705882, "grad_norm": 0.03795419337831155, "learning_rate": 3.991407041832912e-06, "loss": 0.022, "step": 149 }, { "epoch": 1.7647058823529411, "grad_norm": 0.04200165421199087, "learning_rate": 3.6248774448952695e-06, "loss": 0.0265, "step": 150 }, { "epoch": 1.776470588235294, "grad_norm": 0.03845197891559611, "learning_rate": 3.2753648808794503e-06, "loss": 0.024, "step": 151 }, { "epoch": 1.788235294117647, "grad_norm": 0.037815699401078935, "learning_rate": 2.942997601021924e-06, "loss": 0.0257, "step": 152 }, { "epoch": 1.8, "grad_norm": 0.03519274514771082, "learning_rate": 2.6278975652147875e-06, "loss": 0.0232, "step": 153 }, { "epoch": 1.811764705882353, "grad_norm": 0.03706158210535721, "learning_rate": 2.330180397253473e-06, "loss": 0.0232, "step": 154 }, { "epoch": 1.8235294117647058, "grad_norm": 0.04248155649655448, "learning_rate": 2.049955342409349e-06, "loss": 0.0227, "step": 155 }, { "epoch": 1.835294117647059, "grad_norm": 0.0368432870786149, "learning_rate": 1.7873252273429509e-06, "loss": 0.0224, "step": 156 }, { "epoch": 1.8470588235294119, "grad_norm": 0.03332568698884207, "learning_rate": 1.542386422372405e-06, "loss": 0.0208, "step": 157 }, { "epoch": 1.8588235294117648, "grad_norm": 0.036213210721090155, "learning_rate": 1.3152288061110518e-06, "loss": 0.0237, "step": 158 }, { "epoch": 1.8705882352941177, "grad_norm": 0.035507819488375524, "learning_rate": 1.1059357324870455e-06, "loss": 0.0212, "step": 159 }, { "epoch": 1.8823529411764706, "grad_norm": 0.032773052246892385, "learning_rate": 9.145840001572537e-07, "loss": 0.0209, "step": 160 }, { "epoch": 1.8941176470588235, "grad_norm": 0.03720092396666804, "learning_rate": 7.41243824326504e-07, "loss": 0.0237, "step": 161 }, { "epoch": 1.9058823529411764, "grad_norm": 0.061643042263861664, "learning_rate": 5.859788109825793e-07, "loss": 0.0263, "step": 162 }, { "epoch": 1.9176470588235293, "grad_norm": 0.037441015710064, "learning_rate": 4.48845933556441e-07, "loss": 0.025, "step": 163 }, { "epoch": 1.9294117647058824, "grad_norm": 0.03459803638637435, "learning_rate": 3.2989551201624835e-07, "loss": 0.0215, "step": 164 }, { "epoch": 1.9411764705882353, "grad_norm": 0.04948419428076397, "learning_rate": 2.2917119440275524e-07, "loss": 0.0273, "step": 165 }, { "epoch": 1.9529411764705882, "grad_norm": 0.040939656098896986, "learning_rate": 1.4670994081297795e-07, "loss": 0.0251, "step": 166 }, { "epoch": 1.9647058823529413, "grad_norm": 0.03941121437437399, "learning_rate": 8.254200983794369e-08, "loss": 0.0273, "step": 167 }, { "epoch": 1.9764705882352942, "grad_norm": 0.04847566902018068, "learning_rate": 3.669094745950008e-08, "loss": 0.0239, "step": 168 }, { "epoch": 1.988235294117647, "grad_norm": 0.04650396689287853, "learning_rate": 9.17357841028199e-09, "loss": 0.0207, "step": 169 }, { "epoch": 2.0, "grad_norm": 0.04099329304128327, "learning_rate": 0.0, "loss": 0.0232, "step": 170 }, { "epoch": 2.0, "step": 170, "total_flos": 861011422740480.0, "train_loss": 0.03621660071041654, "train_runtime": 2036.7134, "train_samples_per_second": 0.668, "train_steps_per_second": 0.083 } ], "logging_steps": 1, "max_steps": 170, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 861011422740480.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }