{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.33831406822667043,
  "eval_steps": 500,
  "global_step": 900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0037590452025185604,
      "grad_norm": 2.616206169128418,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 4.4449,
      "step": 10
    },
    {
      "epoch": 0.007518090405037121,
      "grad_norm": 1.0848251581192017,
      "learning_rate": 3.6e-05,
      "loss": 3.5961,
      "step": 20
    },
    {
      "epoch": 0.01127713560755568,
      "grad_norm": 1.1111085414886475,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 2.9221,
      "step": 30
    },
    {
      "epoch": 0.015036180810074242,
      "grad_norm": 1.2459989786148071,
      "learning_rate": 7.6e-05,
      "loss": 1.9429,
      "step": 40
    },
    {
      "epoch": 0.0187952260125928,
      "grad_norm": 0.8213669657707214,
      "learning_rate": 9.6e-05,
      "loss": 1.9535,
      "step": 50
    },
    {
      "epoch": 0.02255427121511136,
      "grad_norm": 0.8069262504577637,
      "learning_rate": 9.979695431472081e-05,
      "loss": 1.7172,
      "step": 60
    },
    {
      "epoch": 0.026313316417629923,
      "grad_norm": 1.422139286994934,
      "learning_rate": 9.954314720812184e-05,
      "loss": 1.6907,
      "step": 70
    },
    {
      "epoch": 0.030072361620148483,
      "grad_norm": 1.3577821254730225,
      "learning_rate": 9.928934010152285e-05,
      "loss": 1.8705,
      "step": 80
    },
    {
      "epoch": 0.03383140682266704,
      "grad_norm": 1.192848801612854,
      "learning_rate": 9.903553299492386e-05,
      "loss": 1.5954,
      "step": 90
    },
    {
      "epoch": 0.0375904520251856,
      "grad_norm": 1.2164387702941895,
      "learning_rate": 9.878172588832489e-05,
      "loss": 1.5556,
      "step": 100
    },
    {
      "epoch": 0.04134949722770416,
      "grad_norm": 1.162097692489624,
      "learning_rate": 9.852791878172589e-05,
      "loss": 1.5447,
      "step": 110
    },
    {
      "epoch": 0.04510854243022272,
      "grad_norm": 0.9176503419876099,
      "learning_rate": 9.827411167512691e-05,
      "loss": 1.4747,
      "step": 120
    },
    {
      "epoch": 0.04886758763274129,
      "grad_norm": 1.1206398010253906,
      "learning_rate": 9.802030456852792e-05,
      "loss": 1.5876,
      "step": 130
    },
    {
      "epoch": 0.05262663283525985,
      "grad_norm": 0.6738734841346741,
      "learning_rate": 9.776649746192893e-05,
      "loss": 1.5071,
      "step": 140
    },
    {
      "epoch": 0.05638567803777841,
      "grad_norm": 1.0791672468185425,
      "learning_rate": 9.751269035532995e-05,
      "loss": 1.3875,
      "step": 150
    },
    {
      "epoch": 0.06014472324029697,
      "grad_norm": 0.8023204803466797,
      "learning_rate": 9.725888324873097e-05,
      "loss": 1.4511,
      "step": 160
    },
    {
      "epoch": 0.06390376844281552,
      "grad_norm": 1.1953641176223755,
      "learning_rate": 9.700507614213198e-05,
      "loss": 1.551,
      "step": 170
    },
    {
      "epoch": 0.06766281364533408,
      "grad_norm": 0.9535043835639954,
      "learning_rate": 9.6751269035533e-05,
      "loss": 1.6191,
      "step": 180
    },
    {
      "epoch": 0.07142185884785264,
      "grad_norm": 1.397887945175171,
      "learning_rate": 9.649746192893402e-05,
      "loss": 1.5753,
      "step": 190
    },
    {
      "epoch": 0.0751809040503712,
      "grad_norm": 0.7615554332733154,
      "learning_rate": 9.624365482233503e-05,
      "loss": 1.3002,
      "step": 200
    },
    {
      "epoch": 0.07893994925288976,
      "grad_norm": 1.2040668725967407,
      "learning_rate": 9.598984771573605e-05,
      "loss": 1.3894,
      "step": 210
    },
    {
      "epoch": 0.08269899445540832,
      "grad_norm": 0.8942911028862,
      "learning_rate": 9.573604060913706e-05,
      "loss": 1.4224,
      "step": 220
    },
    {
      "epoch": 0.08645803965792688,
      "grad_norm": 0.9624324440956116,
      "learning_rate": 9.548223350253807e-05,
      "loss": 1.4281,
      "step": 230
    },
    {
      "epoch": 0.09021708486044544,
      "grad_norm": 0.9280639886856079,
      "learning_rate": 9.522842639593908e-05,
      "loss": 1.4149,
      "step": 240
    },
    {
      "epoch": 0.09397613006296401,
      "grad_norm": 1.081101655960083,
      "learning_rate": 9.497461928934011e-05,
      "loss": 1.4395,
      "step": 250
    },
    {
      "epoch": 0.09773517526548257,
      "grad_norm": 0.6382322311401367,
      "learning_rate": 9.472081218274112e-05,
      "loss": 1.2719,
      "step": 260
    },
    {
      "epoch": 0.10149422046800113,
      "grad_norm": 0.6878702044487,
      "learning_rate": 9.446700507614213e-05,
      "loss": 1.358,
      "step": 270
    },
    {
      "epoch": 0.1052532656705197,
      "grad_norm": 0.6313768029212952,
      "learning_rate": 9.421319796954316e-05,
      "loss": 1.323,
      "step": 280
    },
    {
      "epoch": 0.10901231087303825,
      "grad_norm": 0.7255018353462219,
      "learning_rate": 9.395939086294417e-05,
      "loss": 1.2451,
      "step": 290
    },
    {
      "epoch": 0.11277135607555681,
      "grad_norm": 0.9848262667655945,
      "learning_rate": 9.370558375634518e-05,
      "loss": 1.3681,
      "step": 300
    },
    {
      "epoch": 0.11653040127807537,
      "grad_norm": 0.9399502277374268,
      "learning_rate": 9.34517766497462e-05,
      "loss": 1.3431,
      "step": 310
    },
    {
      "epoch": 0.12028944648059393,
      "grad_norm": 0.5780265927314758,
      "learning_rate": 9.31979695431472e-05,
      "loss": 1.4748,
      "step": 320
    },
    {
      "epoch": 0.1240484916831125,
      "grad_norm": 0.8427544832229614,
      "learning_rate": 9.294416243654823e-05,
      "loss": 1.3739,
      "step": 330
    },
    {
      "epoch": 0.12780753688563104,
      "grad_norm": 0.9551161527633667,
      "learning_rate": 9.269035532994924e-05,
      "loss": 1.3434,
      "step": 340
    },
    {
      "epoch": 0.13156658208814961,
      "grad_norm": 0.5216005444526672,
      "learning_rate": 9.243654822335026e-05,
      "loss": 1.2726,
      "step": 350
    },
    {
      "epoch": 0.13532562729066816,
      "grad_norm": 1.0278292894363403,
      "learning_rate": 9.218274111675127e-05,
      "loss": 1.3311,
      "step": 360
    },
    {
      "epoch": 0.13908467249318673,
      "grad_norm": 0.8886487483978271,
      "learning_rate": 9.192893401015229e-05,
      "loss": 1.5468,
      "step": 370
    },
    {
      "epoch": 0.14284371769570528,
      "grad_norm": 0.7291779518127441,
      "learning_rate": 9.16751269035533e-05,
      "loss": 1.366,
      "step": 380
    },
    {
      "epoch": 0.14660276289822385,
      "grad_norm": 0.8320682048797607,
      "learning_rate": 9.142131979695432e-05,
      "loss": 1.4284,
      "step": 390
    },
    {
      "epoch": 0.1503618081007424,
      "grad_norm": 0.5894179940223694,
      "learning_rate": 9.116751269035534e-05,
      "loss": 1.451,
      "step": 400
    },
    {
      "epoch": 0.15412085330326097,
      "grad_norm": 0.5339919924736023,
      "learning_rate": 9.091370558375635e-05,
      "loss": 1.3939,
      "step": 410
    },
    {
      "epoch": 0.15787989850577952,
      "grad_norm": 0.6264607310295105,
      "learning_rate": 9.065989847715737e-05,
      "loss": 1.4581,
      "step": 420
    },
    {
      "epoch": 0.1616389437082981,
      "grad_norm": 1.09451162815094,
      "learning_rate": 9.040609137055838e-05,
      "loss": 1.3846,
      "step": 430
    },
    {
      "epoch": 0.16539798891081664,
      "grad_norm": 0.7401901483535767,
      "learning_rate": 9.015228426395939e-05,
      "loss": 1.4449,
      "step": 440
    },
    {
      "epoch": 0.16915703411333521,
      "grad_norm": 0.8674114346504211,
      "learning_rate": 8.98984771573604e-05,
      "loss": 1.5506,
      "step": 450
    },
    {
      "epoch": 0.17291607931585376,
      "grad_norm": 0.6718773245811462,
      "learning_rate": 8.964467005076143e-05,
      "loss": 1.4187,
      "step": 460
    },
    {
      "epoch": 0.17667512451837233,
      "grad_norm": 0.858181893825531,
      "learning_rate": 8.939086294416244e-05,
      "loss": 1.4919,
      "step": 470
    },
    {
      "epoch": 0.18043416972089088,
      "grad_norm": 0.8369438052177429,
      "learning_rate": 8.913705583756345e-05,
      "loss": 1.3876,
      "step": 480
    },
    {
      "epoch": 0.18419321492340945,
      "grad_norm": 0.5989262461662292,
      "learning_rate": 8.888324873096448e-05,
      "loss": 1.2856,
      "step": 490
    },
    {
      "epoch": 0.18795226012592803,
      "grad_norm": 0.4753406345844269,
      "learning_rate": 8.862944162436549e-05,
      "loss": 1.1997,
      "step": 500
    },
    {
      "epoch": 0.19171130532844657,
      "grad_norm": 0.6064810752868652,
      "learning_rate": 8.83756345177665e-05,
      "loss": 1.4085,
      "step": 510
    },
    {
      "epoch": 0.19547035053096515,
      "grad_norm": 0.7949738502502441,
      "learning_rate": 8.812182741116751e-05,
      "loss": 1.4494,
      "step": 520
    },
    {
      "epoch": 0.1992293957334837,
      "grad_norm": 0.6465169191360474,
      "learning_rate": 8.786802030456854e-05,
      "loss": 1.3264,
      "step": 530
    },
    {
      "epoch": 0.20298844093600227,
      "grad_norm": 0.7491289377212524,
      "learning_rate": 8.761421319796955e-05,
      "loss": 1.3157,
      "step": 540
    },
    {
      "epoch": 0.20674748613852081,
      "grad_norm": 0.9102064967155457,
      "learning_rate": 8.736040609137056e-05,
      "loss": 1.4135,
      "step": 550
    },
    {
      "epoch": 0.2105065313410394,
      "grad_norm": 0.9291871786117554,
      "learning_rate": 8.710659898477158e-05,
      "loss": 1.3683,
      "step": 560
    },
    {
      "epoch": 0.21426557654355793,
      "grad_norm": 0.8890318274497986,
      "learning_rate": 8.685279187817259e-05,
      "loss": 1.3436,
      "step": 570
    },
    {
      "epoch": 0.2180246217460765,
      "grad_norm": 0.6270791292190552,
      "learning_rate": 8.659898477157361e-05,
      "loss": 1.2768,
      "step": 580
    },
    {
      "epoch": 0.22178366694859505,
      "grad_norm": 0.8151299953460693,
      "learning_rate": 8.634517766497462e-05,
      "loss": 1.2226,
      "step": 590
    },
    {
      "epoch": 0.22554271215111363,
      "grad_norm": 0.5168191194534302,
      "learning_rate": 8.609137055837564e-05,
      "loss": 1.315,
      "step": 600
    },
    {
      "epoch": 0.22930175735363217,
      "grad_norm": 0.5865817666053772,
      "learning_rate": 8.583756345177666e-05,
      "loss": 1.2823,
      "step": 610
    },
    {
      "epoch": 0.23306080255615075,
      "grad_norm": 0.9748789072036743,
      "learning_rate": 8.558375634517767e-05,
      "loss": 1.3953,
      "step": 620
    },
    {
      "epoch": 0.2368198477586693,
      "grad_norm": 0.8284083604812622,
      "learning_rate": 8.532994923857869e-05,
      "loss": 1.3654,
      "step": 630
    },
    {
      "epoch": 0.24057889296118787,
      "grad_norm": 0.9949294924736023,
      "learning_rate": 8.50761421319797e-05,
      "loss": 1.2715,
      "step": 640
    },
    {
      "epoch": 0.24433793816370641,
      "grad_norm": 0.5778189897537231,
      "learning_rate": 8.482233502538071e-05,
      "loss": 1.2411,
      "step": 650
    },
    {
      "epoch": 0.248096983366225,
      "grad_norm": 0.6740812659263611,
      "learning_rate": 8.456852791878172e-05,
      "loss": 1.2492,
      "step": 660
    },
    {
      "epoch": 0.25185602856874356,
      "grad_norm": 0.9166800379753113,
      "learning_rate": 8.431472081218275e-05,
      "loss": 1.4728,
      "step": 670
    },
    {
      "epoch": 0.2556150737712621,
      "grad_norm": 0.6058619618415833,
      "learning_rate": 8.406091370558376e-05,
      "loss": 1.4004,
      "step": 680
    },
    {
      "epoch": 0.25937411897378065,
      "grad_norm": 1.2171576023101807,
      "learning_rate": 8.380710659898477e-05,
      "loss": 1.3007,
      "step": 690
    },
    {
      "epoch": 0.26313316417629923,
      "grad_norm": 0.5887770056724548,
      "learning_rate": 8.35532994923858e-05,
      "loss": 1.374,
      "step": 700
    },
    {
      "epoch": 0.2668922093788178,
      "grad_norm": 0.7483983635902405,
      "learning_rate": 8.329949238578681e-05,
      "loss": 1.3807,
      "step": 710
    },
    {
      "epoch": 0.2706512545813363,
      "grad_norm": 0.7405831217765808,
      "learning_rate": 8.304568527918782e-05,
      "loss": 1.3439,
      "step": 720
    },
    {
      "epoch": 0.2744102997838549,
      "grad_norm": 0.8032345771789551,
      "learning_rate": 8.279187817258883e-05,
      "loss": 1.2951,
      "step": 730
    },
    {
      "epoch": 0.27816934498637347,
      "grad_norm": 0.7847606539726257,
      "learning_rate": 8.253807106598986e-05,
      "loss": 1.2761,
      "step": 740
    },
    {
      "epoch": 0.28192839018889204,
      "grad_norm": 0.8183022737503052,
      "learning_rate": 8.228426395939086e-05,
      "loss": 1.1964,
      "step": 750
    },
    {
      "epoch": 0.28568743539141056,
      "grad_norm": 0.6903379559516907,
      "learning_rate": 8.203045685279188e-05,
      "loss": 1.3832,
      "step": 760
    },
    {
      "epoch": 0.28944648059392913,
      "grad_norm": 0.8767275214195251,
      "learning_rate": 8.17766497461929e-05,
      "loss": 1.4244,
      "step": 770
    },
    {
      "epoch": 0.2932055257964477,
      "grad_norm": 0.830820620059967,
      "learning_rate": 8.152284263959391e-05,
      "loss": 1.2934,
      "step": 780
    },
    {
      "epoch": 0.2969645709989663,
      "grad_norm": 0.8802516460418701,
      "learning_rate": 8.126903553299493e-05,
      "loss": 1.4081,
      "step": 790
    },
    {
      "epoch": 0.3007236162014848,
      "grad_norm": 0.7873942255973816,
      "learning_rate": 8.101522842639594e-05,
      "loss": 1.3219,
      "step": 800
    },
    {
      "epoch": 0.3044826614040034,
      "grad_norm": 1.127156376838684,
      "learning_rate": 8.076142131979696e-05,
      "loss": 1.2824,
      "step": 810
    },
    {
      "epoch": 0.30824170660652195,
      "grad_norm": 0.9335768818855286,
      "learning_rate": 8.050761421319797e-05,
      "loss": 1.3021,
      "step": 820
    },
    {
      "epoch": 0.3120007518090405,
      "grad_norm": 0.6120972633361816,
      "learning_rate": 8.0253807106599e-05,
      "loss": 1.3155,
      "step": 830
    },
    {
      "epoch": 0.31575979701155904,
      "grad_norm": 0.5859377980232239,
      "learning_rate": 8e-05,
      "loss": 1.3117,
      "step": 840
    },
    {
      "epoch": 0.3195188422140776,
      "grad_norm": 0.6444947719573975,
      "learning_rate": 7.974619289340102e-05,
      "loss": 1.3348,
      "step": 850
    },
    {
      "epoch": 0.3232778874165962,
      "grad_norm": 0.907409131526947,
      "learning_rate": 7.949238578680203e-05,
      "loss": 1.2427,
      "step": 860
    },
    {
      "epoch": 0.32703693261911476,
      "grad_norm": 0.5900988578796387,
      "learning_rate": 7.923857868020304e-05,
      "loss": 1.3427,
      "step": 870
    },
    {
      "epoch": 0.3307959778216333,
      "grad_norm": 0.8546301126480103,
      "learning_rate": 7.898477157360407e-05,
      "loss": 1.3824,
      "step": 880
    },
    {
      "epoch": 0.33455502302415185,
      "grad_norm": 0.9503180384635925,
      "learning_rate": 7.873096446700508e-05,
      "loss": 1.2671,
      "step": 890
    },
    {
      "epoch": 0.33831406822667043,
      "grad_norm": 0.6486964821815491,
      "learning_rate": 7.847715736040609e-05,
      "loss": 1.3737,
      "step": 900
    }
  ],
  "logging_steps": 10,
  "max_steps": 3990,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.663021612477645e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}