{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 171, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017543859649122806, "grad_norm": 3.77577782421705, "learning_rate": 0.0, "loss": 1.1733, "num_tokens": 427344.0, "step": 1 }, { "epoch": 0.03508771929824561, "grad_norm": 3.7130563556909215, "learning_rate": 1.6666666666666667e-06, "loss": 1.164, "num_tokens": 866449.0, "step": 2 }, { "epoch": 0.05263157894736842, "grad_norm": 3.970572390407664, "learning_rate": 3.3333333333333333e-06, "loss": 1.2254, "num_tokens": 1268563.0, "step": 3 }, { "epoch": 0.07017543859649122, "grad_norm": 3.512030300056592, "learning_rate": 5e-06, "loss": 1.2077, "num_tokens": 1669265.0, "step": 4 }, { "epoch": 0.08771929824561403, "grad_norm": 2.37795690695823, "learning_rate": 6.666666666666667e-06, "loss": 1.0413, "num_tokens": 2093363.0, "step": 5 }, { "epoch": 0.10526315789473684, "grad_norm": 1.525864968420599, "learning_rate": 8.333333333333334e-06, "loss": 0.874, "num_tokens": 2505147.0, "step": 6 }, { "epoch": 0.12280701754385964, "grad_norm": 1.4347947716935, "learning_rate": 1e-05, "loss": 0.8159, "num_tokens": 2931742.0, "step": 7 }, { "epoch": 0.14035087719298245, "grad_norm": 2.6084462153475063, "learning_rate": 9.999184354855868e-06, "loss": 0.6802, "num_tokens": 3316057.0, "step": 8 }, { "epoch": 0.15789473684210525, "grad_norm": 1.5740859932252258, "learning_rate": 9.996737715102133e-06, "loss": 0.6019, "num_tokens": 3761399.0, "step": 9 }, { "epoch": 0.17543859649122806, "grad_norm": 1.5450933348615719, "learning_rate": 9.99266096766761e-06, "loss": 0.5439, "num_tokens": 4179394.0, "step": 10 }, { "epoch": 0.19298245614035087, "grad_norm": 0.9018401685238229, "learning_rate": 9.98695559040975e-06, "loss": 0.4438, "num_tokens": 4600345.0, "step": 11 }, { "epoch": 0.21052631578947367, "grad_norm": 0.5196800459649759, "learning_rate": 9.979623651578881e-06, "loss": 0.395, "num_tokens": 4980610.0, "step": 12 }, { "epoch": 0.22807017543859648, "grad_norm": 0.4799071601752034, "learning_rate": 9.970667809068476e-06, "loss": 0.3892, "num_tokens": 5395359.0, "step": 13 }, { "epoch": 0.24561403508771928, "grad_norm": 0.31207959270885294, "learning_rate": 9.960091309451626e-06, "loss": 0.3808, "num_tokens": 5805663.0, "step": 14 }, { "epoch": 0.2631578947368421, "grad_norm": 0.31760448278346387, "learning_rate": 9.947897986804131e-06, "loss": 0.3708, "num_tokens": 6201438.0, "step": 15 }, { "epoch": 0.2807017543859649, "grad_norm": 0.3009817279323404, "learning_rate": 9.93409226131462e-06, "loss": 0.3557, "num_tokens": 6614132.0, "step": 16 }, { "epoch": 0.2982456140350877, "grad_norm": 0.2599822684525422, "learning_rate": 9.91867913768218e-06, "loss": 0.3284, "num_tokens": 7030814.0, "step": 17 }, { "epoch": 0.3157894736842105, "grad_norm": 0.24197734921107386, "learning_rate": 9.901664203302126e-06, "loss": 0.3272, "num_tokens": 7453748.0, "step": 18 }, { "epoch": 0.3333333333333333, "grad_norm": 0.2741912470593866, "learning_rate": 9.883053626240503e-06, "loss": 0.3181, "num_tokens": 7874681.0, "step": 19 }, { "epoch": 0.3508771929824561, "grad_norm": 0.21560866373997145, "learning_rate": 9.862854152998112e-06, "loss": 0.3102, "num_tokens": 8349009.0, "step": 20 }, { "epoch": 0.3684210526315789, "grad_norm": 0.203790119667157, "learning_rate": 9.841073106064852e-06, "loss": 0.3038, "num_tokens": 8779295.0, "step": 21 }, { "epoch": 0.38596491228070173, "grad_norm": 0.21543607735771148, "learning_rate": 9.81771838126524e-06, "loss": 0.3039, "num_tokens": 9155380.0, "step": 22 }, { "epoch": 0.40350877192982454, "grad_norm": 0.19448461940928388, "learning_rate": 9.792798444896107e-06, "loss": 0.2923, "num_tokens": 9572881.0, "step": 23 }, { "epoch": 0.42105263157894735, "grad_norm": 0.20104199677846465, "learning_rate": 9.766322330657499e-06, "loss": 0.3004, "num_tokens": 9972480.0, "step": 24 }, { "epoch": 0.43859649122807015, "grad_norm": 0.19715422132031898, "learning_rate": 9.738299636377863e-06, "loss": 0.291, "num_tokens": 10406948.0, "step": 25 }, { "epoch": 0.45614035087719296, "grad_norm": 0.19192801250536232, "learning_rate": 9.70874052053476e-06, "loss": 0.289, "num_tokens": 10821942.0, "step": 26 }, { "epoch": 0.47368421052631576, "grad_norm": 0.18616122639207905, "learning_rate": 9.677655698572326e-06, "loss": 0.2622, "num_tokens": 11234170.0, "step": 27 }, { "epoch": 0.49122807017543857, "grad_norm": 0.192192334546507, "learning_rate": 9.645056439016827e-06, "loss": 0.275, "num_tokens": 11627308.0, "step": 28 }, { "epoch": 0.5087719298245614, "grad_norm": 0.18829460979027127, "learning_rate": 9.610954559391704e-06, "loss": 0.2754, "num_tokens": 12022656.0, "step": 29 }, { "epoch": 0.5263157894736842, "grad_norm": 0.17300283701775698, "learning_rate": 9.57536242193364e-06, "loss": 0.2692, "num_tokens": 12444405.0, "step": 30 }, { "epoch": 0.543859649122807, "grad_norm": 0.17695771762278176, "learning_rate": 9.538292929111114e-06, "loss": 0.2734, "num_tokens": 12837998.0, "step": 31 }, { "epoch": 0.5614035087719298, "grad_norm": 0.16869993658358656, "learning_rate": 9.499759518947156e-06, "loss": 0.2657, "num_tokens": 13261798.0, "step": 32 }, { "epoch": 0.5789473684210527, "grad_norm": 0.15857386041778282, "learning_rate": 9.459776160147941e-06, "loss": 0.2559, "num_tokens": 13717762.0, "step": 33 }, { "epoch": 0.5964912280701754, "grad_norm": 0.16181853610382824, "learning_rate": 9.418357347038999e-06, "loss": 0.2427, "num_tokens": 14142042.0, "step": 34 }, { "epoch": 0.6140350877192983, "grad_norm": 0.16729961186553155, "learning_rate": 9.375518094310904e-06, "loss": 0.2546, "num_tokens": 14543269.0, "step": 35 }, { "epoch": 0.631578947368421, "grad_norm": 0.16125107833920507, "learning_rate": 9.331273931576306e-06, "loss": 0.2455, "num_tokens": 14941779.0, "step": 36 }, { "epoch": 0.6491228070175439, "grad_norm": 0.1651336881757046, "learning_rate": 9.285640897740316e-06, "loss": 0.2479, "num_tokens": 15378768.0, "step": 37 }, { "epoch": 0.6666666666666666, "grad_norm": 0.16309612539485216, "learning_rate": 9.238635535186247e-06, "loss": 0.2524, "num_tokens": 15784254.0, "step": 38 }, { "epoch": 0.6842105263157895, "grad_norm": 0.1613815194130245, "learning_rate": 9.19027488377886e-06, "loss": 0.252, "num_tokens": 16187575.0, "step": 39 }, { "epoch": 0.7017543859649122, "grad_norm": 0.15494556090577644, "learning_rate": 9.140576474687263e-06, "loss": 0.2397, "num_tokens": 16592627.0, "step": 40 }, { "epoch": 0.7192982456140351, "grad_norm": 0.15730027265317648, "learning_rate": 9.0895583240297e-06, "loss": 0.2396, "num_tokens": 17019496.0, "step": 41 }, { "epoch": 0.7368421052631579, "grad_norm": 0.15536856311696803, "learning_rate": 9.037238926342544e-06, "loss": 0.2388, "num_tokens": 17448731.0, "step": 42 }, { "epoch": 0.7543859649122807, "grad_norm": 0.16373703052589486, "learning_rate": 8.983637247875872e-06, "loss": 0.2447, "num_tokens": 17852418.0, "step": 43 }, { "epoch": 0.7719298245614035, "grad_norm": 0.1564039931823383, "learning_rate": 8.92877271971802e-06, "loss": 0.2317, "num_tokens": 18284414.0, "step": 44 }, { "epoch": 0.7894736842105263, "grad_norm": 0.15673969833275106, "learning_rate": 8.872665230751644e-06, "loss": 0.2445, "num_tokens": 18700575.0, "step": 45 }, { "epoch": 0.8070175438596491, "grad_norm": 0.16227301290569535, "learning_rate": 8.815335120443822e-06, "loss": 0.2369, "num_tokens": 19112507.0, "step": 46 }, { "epoch": 0.8245614035087719, "grad_norm": 0.16192875727409162, "learning_rate": 8.756803171472817e-06, "loss": 0.2488, "num_tokens": 19497572.0, "step": 47 }, { "epoch": 0.8421052631578947, "grad_norm": 0.14878902875215574, "learning_rate": 8.69709060219416e-06, "loss": 0.221, "num_tokens": 19887057.0, "step": 48 }, { "epoch": 0.8596491228070176, "grad_norm": 0.168304643511618, "learning_rate": 8.636219058948823e-06, "loss": 0.2338, "num_tokens": 20294327.0, "step": 49 }, { "epoch": 0.8771929824561403, "grad_norm": 0.14440251287899045, "learning_rate": 8.574210608216206e-06, "loss": 0.2165, "num_tokens": 20731445.0, "step": 50 }, { "epoch": 0.8947368421052632, "grad_norm": 0.14573622062248015, "learning_rate": 8.511087728614863e-06, "loss": 0.2291, "num_tokens": 21165129.0, "step": 51 }, { "epoch": 0.9122807017543859, "grad_norm": 0.15767251624934578, "learning_rate": 8.446873302753783e-06, "loss": 0.2231, "num_tokens": 21564437.0, "step": 52 }, { "epoch": 0.9298245614035088, "grad_norm": 0.14330856767528197, "learning_rate": 8.381590608937251e-06, "loss": 0.2274, "num_tokens": 22012280.0, "step": 53 }, { "epoch": 0.9473684210526315, "grad_norm": 0.1524166655671973, "learning_rate": 8.315263312726248e-06, "loss": 0.2131, "num_tokens": 22396001.0, "step": 54 }, { "epoch": 0.9649122807017544, "grad_norm": 0.15336585183985868, "learning_rate": 8.247915458359473e-06, "loss": 0.2195, "num_tokens": 22793769.0, "step": 55 }, { "epoch": 0.9824561403508771, "grad_norm": 0.1588593078419976, "learning_rate": 8.179571460037096e-06, "loss": 0.2345, "num_tokens": 23201717.0, "step": 56 }, { "epoch": 1.0, "grad_norm": 0.14666483129173052, "learning_rate": 8.110256093070393e-06, "loss": 0.2346, "num_tokens": 23647950.0, "step": 57 }, { "epoch": 1.0175438596491229, "grad_norm": 0.15418874889368148, "learning_rate": 8.039994484900463e-06, "loss": 0.2268, "num_tokens": 24100529.0, "step": 58 }, { "epoch": 1.0350877192982457, "grad_norm": 0.14747387727593939, "learning_rate": 7.968812105989316e-06, "loss": 0.2155, "num_tokens": 24540892.0, "step": 59 }, { "epoch": 1.0526315789473684, "grad_norm": 0.154346076339797, "learning_rate": 7.896734760586599e-06, "loss": 0.2057, "num_tokens": 24956824.0, "step": 60 }, { "epoch": 1.0701754385964912, "grad_norm": 0.14685651214148715, "learning_rate": 7.82378857737533e-06, "loss": 0.2036, "num_tokens": 25384518.0, "step": 61 }, { "epoch": 1.087719298245614, "grad_norm": 0.16326674348993506, "learning_rate": 7.75e-06, "loss": 0.2001, "num_tokens": 25771807.0, "step": 62 }, { "epoch": 1.1052631578947367, "grad_norm": 0.147771119836904, "learning_rate": 7.675395777480538e-06, "loss": 0.1996, "num_tokens": 26177417.0, "step": 63 }, { "epoch": 1.1228070175438596, "grad_norm": 0.14003657083220583, "learning_rate": 7.600002954515532e-06, "loss": 0.2072, "num_tokens": 26622325.0, "step": 64 }, { "epoch": 1.1403508771929824, "grad_norm": 0.15332767124685198, "learning_rate": 7.523848861678297e-06, "loss": 0.2065, "num_tokens": 27045078.0, "step": 65 }, { "epoch": 1.1578947368421053, "grad_norm": 0.15183433287347486, "learning_rate": 7.446961105509289e-06, "loss": 0.2032, "num_tokens": 27438828.0, "step": 66 }, { "epoch": 1.1754385964912282, "grad_norm": 0.14554656331938712, "learning_rate": 7.36936755850849e-06, "loss": 0.2054, "num_tokens": 27854689.0, "step": 67 }, { "epoch": 1.1929824561403508, "grad_norm": 0.1537836129829156, "learning_rate": 7.2910963490313815e-06, "loss": 0.1949, "num_tokens": 28233580.0, "step": 68 }, { "epoch": 1.2105263157894737, "grad_norm": 0.14572761360447276, "learning_rate": 7.212175851092154e-06, "loss": 0.1958, "num_tokens": 28641897.0, "step": 69 }, { "epoch": 1.2280701754385965, "grad_norm": 0.13708384426430809, "learning_rate": 7.132634674077884e-06, "loss": 0.2021, "num_tokens": 29084929.0, "step": 70 }, { "epoch": 1.2456140350877192, "grad_norm": 0.1486755044006831, "learning_rate": 7.052501652377368e-06, "loss": 0.2044, "num_tokens": 29482516.0, "step": 71 }, { "epoch": 1.263157894736842, "grad_norm": 0.14957101767673275, "learning_rate": 6.971805834928399e-06, "loss": 0.2048, "num_tokens": 29899136.0, "step": 72 }, { "epoch": 1.280701754385965, "grad_norm": 0.1486401064457622, "learning_rate": 6.890576474687264e-06, "loss": 0.2068, "num_tokens": 30317666.0, "step": 73 }, { "epoch": 1.2982456140350878, "grad_norm": 0.15958496167902586, "learning_rate": 6.808843018024296e-06, "loss": 0.1986, "num_tokens": 30734034.0, "step": 74 }, { "epoch": 1.3157894736842106, "grad_norm": 0.1383546863269268, "learning_rate": 6.726635094049291e-06, "loss": 0.199, "num_tokens": 31155917.0, "step": 75 }, { "epoch": 1.3333333333333333, "grad_norm": 0.14368049999314014, "learning_rate": 6.643982503870693e-06, "loss": 0.2032, "num_tokens": 31573757.0, "step": 76 }, { "epoch": 1.3508771929824561, "grad_norm": 0.13900291410105262, "learning_rate": 6.560915209792424e-06, "loss": 0.2016, "num_tokens": 32010739.0, "step": 77 }, { "epoch": 1.368421052631579, "grad_norm": 0.13844326865953724, "learning_rate": 6.477463324452286e-06, "loss": 0.1925, "num_tokens": 32424467.0, "step": 78 }, { "epoch": 1.3859649122807016, "grad_norm": 0.1433757292045691, "learning_rate": 6.393657099905854e-06, "loss": 0.2008, "num_tokens": 32834770.0, "step": 79 }, { "epoch": 1.4035087719298245, "grad_norm": 0.14196299072627255, "learning_rate": 6.309526916659843e-06, "loss": 0.1924, "num_tokens": 33255872.0, "step": 80 }, { "epoch": 1.4210526315789473, "grad_norm": 0.13753823275156205, "learning_rate": 6.225103272658889e-06, "loss": 0.2034, "num_tokens": 33706927.0, "step": 81 }, { "epoch": 1.4385964912280702, "grad_norm": 0.1384289808314504, "learning_rate": 6.140416772229785e-06, "loss": 0.1917, "num_tokens": 34112601.0, "step": 82 }, { "epoch": 1.456140350877193, "grad_norm": 0.14627667997521285, "learning_rate": 6.0554981149871276e-06, "loss": 0.2063, "num_tokens": 34517104.0, "step": 83 }, { "epoch": 1.4736842105263157, "grad_norm": 0.1558573666402862, "learning_rate": 5.970378084704441e-06, "loss": 0.1994, "num_tokens": 34897309.0, "step": 84 }, { "epoch": 1.4912280701754386, "grad_norm": 0.1417112898190135, "learning_rate": 5.88508753815478e-06, "loss": 0.1881, "num_tokens": 35307793.0, "step": 85 }, { "epoch": 1.5087719298245614, "grad_norm": 0.1385582692497573, "learning_rate": 5.799657393924869e-06, "loss": 0.198, "num_tokens": 35741435.0, "step": 86 }, { "epoch": 1.526315789473684, "grad_norm": 0.15662493183414183, "learning_rate": 5.714118621206843e-06, "loss": 0.1909, "num_tokens": 36110154.0, "step": 87 }, { "epoch": 1.543859649122807, "grad_norm": 0.14798100623872662, "learning_rate": 5.6285022285716325e-06, "loss": 0.2063, "num_tokens": 36508508.0, "step": 88 }, { "epoch": 1.5614035087719298, "grad_norm": 0.13949297838603725, "learning_rate": 5.542839252728096e-06, "loss": 0.2056, "num_tokens": 36962199.0, "step": 89 }, { "epoch": 1.5789473684210527, "grad_norm": 0.1388395627421214, "learning_rate": 5.457160747271906e-06, "loss": 0.1977, "num_tokens": 37416119.0, "step": 90 }, { "epoch": 1.5964912280701755, "grad_norm": 0.13753404705909872, "learning_rate": 5.371497771428368e-06, "loss": 0.1988, "num_tokens": 37844052.0, "step": 91 }, { "epoch": 1.6140350877192984, "grad_norm": 0.13813649436163167, "learning_rate": 5.2858813787931605e-06, "loss": 0.193, "num_tokens": 38281149.0, "step": 92 }, { "epoch": 1.631578947368421, "grad_norm": 0.13735934501319846, "learning_rate": 5.2003426060751324e-06, "loss": 0.1948, "num_tokens": 38696776.0, "step": 93 }, { "epoch": 1.6491228070175439, "grad_norm": 0.14464652766257102, "learning_rate": 5.114912461845223e-06, "loss": 0.1954, "num_tokens": 39118421.0, "step": 94 }, { "epoch": 1.6666666666666665, "grad_norm": 0.1441541884884212, "learning_rate": 5.02962191529556e-06, "loss": 0.1969, "num_tokens": 39531921.0, "step": 95 }, { "epoch": 1.6842105263157894, "grad_norm": 0.14549147633711634, "learning_rate": 4.944501885012875e-06, "loss": 0.1987, "num_tokens": 39942510.0, "step": 96 }, { "epoch": 1.7017543859649122, "grad_norm": 0.14239386063143547, "learning_rate": 4.859583227770218e-06, "loss": 0.1942, "num_tokens": 40349157.0, "step": 97 }, { "epoch": 1.719298245614035, "grad_norm": 0.14101938277964904, "learning_rate": 4.774896727341113e-06, "loss": 0.1896, "num_tokens": 40755487.0, "step": 98 }, { "epoch": 1.736842105263158, "grad_norm": 0.1513563377956916, "learning_rate": 4.6904730833401575e-06, "loss": 0.1741, "num_tokens": 41109588.0, "step": 99 }, { "epoch": 1.7543859649122808, "grad_norm": 0.14171664584567437, "learning_rate": 4.606342900094147e-06, "loss": 0.1978, "num_tokens": 41549463.0, "step": 100 }, { "epoch": 1.7719298245614035, "grad_norm": 0.1403769471140474, "learning_rate": 4.5225366755477165e-06, "loss": 0.2018, "num_tokens": 41986009.0, "step": 101 }, { "epoch": 1.7894736842105263, "grad_norm": 0.14588775725754724, "learning_rate": 4.439084790207577e-06, "loss": 0.1991, "num_tokens": 42393517.0, "step": 102 }, { "epoch": 1.807017543859649, "grad_norm": 0.14195463960673751, "learning_rate": 4.35601749612931e-06, "loss": 0.1954, "num_tokens": 42788971.0, "step": 103 }, { "epoch": 1.8245614035087718, "grad_norm": 0.14486517700153345, "learning_rate": 4.273364905950711e-06, "loss": 0.2001, "num_tokens": 43200059.0, "step": 104 }, { "epoch": 1.8421052631578947, "grad_norm": 0.14536497153998434, "learning_rate": 4.191156981975704e-06, "loss": 0.1881, "num_tokens": 43591515.0, "step": 105 }, { "epoch": 1.8596491228070176, "grad_norm": 0.1483672348465985, "learning_rate": 4.109423525312738e-06, "loss": 0.1936, "num_tokens": 43989015.0, "step": 106 }, { "epoch": 1.8771929824561404, "grad_norm": 0.14387471159557752, "learning_rate": 4.028194165071603e-06, "loss": 0.1959, "num_tokens": 44390867.0, "step": 107 }, { "epoch": 1.8947368421052633, "grad_norm": 0.14319263387686854, "learning_rate": 3.9474983476226335e-06, "loss": 0.2026, "num_tokens": 44814288.0, "step": 108 }, { "epoch": 1.912280701754386, "grad_norm": 0.13718763298366718, "learning_rate": 3.867365325922116e-06, "loss": 0.1919, "num_tokens": 45232685.0, "step": 109 }, { "epoch": 1.9298245614035088, "grad_norm": 0.13661747990592807, "learning_rate": 3.7878241489078473e-06, "loss": 0.192, "num_tokens": 45633905.0, "step": 110 }, { "epoch": 1.9473684210526314, "grad_norm": 0.13757723840377134, "learning_rate": 3.7089036509686216e-06, "loss": 0.196, "num_tokens": 46052270.0, "step": 111 }, { "epoch": 1.9649122807017543, "grad_norm": 0.14009156799615108, "learning_rate": 3.630632441491512e-06, "loss": 0.1945, "num_tokens": 46479271.0, "step": 112 }, { "epoch": 1.9824561403508771, "grad_norm": 0.1392559652525668, "learning_rate": 3.5530388944907124e-06, "loss": 0.1985, "num_tokens": 46884227.0, "step": 113 }, { "epoch": 2.0, "grad_norm": 0.13976428969132587, "learning_rate": 3.476151138321705e-06, "loss": 0.1995, "num_tokens": 47297644.0, "step": 114 }, { "epoch": 2.017543859649123, "grad_norm": 0.1378198428541279, "learning_rate": 3.3999970454844688e-06, "loss": 0.1724, "num_tokens": 47688068.0, "step": 115 }, { "epoch": 2.0350877192982457, "grad_norm": 0.134440422974191, "learning_rate": 3.3246042225194626e-06, "loss": 0.1796, "num_tokens": 48092477.0, "step": 116 }, { "epoch": 2.0526315789473686, "grad_norm": 0.13660484419562605, "learning_rate": 3.2500000000000015e-06, "loss": 0.1763, "num_tokens": 48476841.0, "step": 117 }, { "epoch": 2.0701754385964914, "grad_norm": 0.14109474340650238, "learning_rate": 3.176211422624672e-06, "loss": 0.1778, "num_tokens": 48854905.0, "step": 118 }, { "epoch": 2.087719298245614, "grad_norm": 0.13774654351946805, "learning_rate": 3.103265239413401e-06, "loss": 0.1793, "num_tokens": 49295065.0, "step": 119 }, { "epoch": 2.1052631578947367, "grad_norm": 0.14705463035874308, "learning_rate": 3.0311878940106864e-06, "loss": 0.1885, "num_tokens": 49711843.0, "step": 120 }, { "epoch": 2.1228070175438596, "grad_norm": 0.13965440849358451, "learning_rate": 2.9600055150995397e-06, "loss": 0.1804, "num_tokens": 50121373.0, "step": 121 }, { "epoch": 2.1403508771929824, "grad_norm": 0.1431354792667028, "learning_rate": 2.889743906929609e-06, "loss": 0.1761, "num_tokens": 50524660.0, "step": 122 }, { "epoch": 2.1578947368421053, "grad_norm": 0.13549946577694855, "learning_rate": 2.820428539962905e-06, "loss": 0.1756, "num_tokens": 50952097.0, "step": 123 }, { "epoch": 2.175438596491228, "grad_norm": 0.13874042982824947, "learning_rate": 2.7520845416405285e-06, "loss": 0.1787, "num_tokens": 51357662.0, "step": 124 }, { "epoch": 2.192982456140351, "grad_norm": 0.13352052067268536, "learning_rate": 2.6847366872737535e-06, "loss": 0.1786, "num_tokens": 51772391.0, "step": 125 }, { "epoch": 2.2105263157894735, "grad_norm": 0.13750830287403998, "learning_rate": 2.618409391062751e-06, "loss": 0.1827, "num_tokens": 52198396.0, "step": 126 }, { "epoch": 2.2280701754385963, "grad_norm": 0.14077287411728898, "learning_rate": 2.5531266972462176e-06, "loss": 0.1786, "num_tokens": 52585564.0, "step": 127 }, { "epoch": 2.245614035087719, "grad_norm": 0.13893984896019573, "learning_rate": 2.4889122713851397e-06, "loss": 0.1788, "num_tokens": 52997398.0, "step": 128 }, { "epoch": 2.263157894736842, "grad_norm": 0.13788162656378736, "learning_rate": 2.425789391783796e-06, "loss": 0.1878, "num_tokens": 53407933.0, "step": 129 }, { "epoch": 2.280701754385965, "grad_norm": 0.13629331805149528, "learning_rate": 2.36378094105118e-06, "loss": 0.1836, "num_tokens": 53817667.0, "step": 130 }, { "epoch": 2.2982456140350878, "grad_norm": 0.145846640939152, "learning_rate": 2.302909397805841e-06, "loss": 0.1761, "num_tokens": 54208139.0, "step": 131 }, { "epoch": 2.3157894736842106, "grad_norm": 0.1415158735561498, "learning_rate": 2.2431968285271843e-06, "loss": 0.1861, "num_tokens": 54616138.0, "step": 132 }, { "epoch": 2.3333333333333335, "grad_norm": 0.1399694993181749, "learning_rate": 2.1846648795561777e-06, "loss": 0.18, "num_tokens": 55028264.0, "step": 133 }, { "epoch": 2.3508771929824563, "grad_norm": 0.1340221566625987, "learning_rate": 2.1273347692483574e-06, "loss": 0.1818, "num_tokens": 55474995.0, "step": 134 }, { "epoch": 2.3684210526315788, "grad_norm": 0.13728502667314055, "learning_rate": 2.071227280281982e-06, "loss": 0.1697, "num_tokens": 55872252.0, "step": 135 }, { "epoch": 2.3859649122807016, "grad_norm": 0.13569940106251407, "learning_rate": 2.016362752124129e-06, "loss": 0.1799, "num_tokens": 56295990.0, "step": 136 }, { "epoch": 2.4035087719298245, "grad_norm": 0.1433225385861297, "learning_rate": 1.9627610736574575e-06, "loss": 0.1744, "num_tokens": 56700633.0, "step": 137 }, { "epoch": 2.4210526315789473, "grad_norm": 0.13712140562366157, "learning_rate": 1.9104416759703017e-06, "loss": 0.1772, "num_tokens": 57123351.0, "step": 138 }, { "epoch": 2.43859649122807, "grad_norm": 0.14064914274676912, "learning_rate": 1.8594235253127373e-06, "loss": 0.1794, "num_tokens": 57541451.0, "step": 139 }, { "epoch": 2.456140350877193, "grad_norm": 0.15170132064659694, "learning_rate": 1.8097251162211405e-06, "loss": 0.1831, "num_tokens": 57962223.0, "step": 140 }, { "epoch": 2.473684210526316, "grad_norm": 0.13964776563103484, "learning_rate": 1.7613644648137543e-06, "loss": 0.1756, "num_tokens": 58375881.0, "step": 141 }, { "epoch": 2.4912280701754383, "grad_norm": 0.13507579048092097, "learning_rate": 1.7143591022596846e-06, "loss": 0.1821, "num_tokens": 58796929.0, "step": 142 }, { "epoch": 2.5087719298245617, "grad_norm": 0.13875107577532086, "learning_rate": 1.6687260684236943e-06, "loss": 0.1773, "num_tokens": 59207995.0, "step": 143 }, { "epoch": 2.526315789473684, "grad_norm": 0.14061593378122658, "learning_rate": 1.6244819056890975e-06, "loss": 0.1716, "num_tokens": 59582578.0, "step": 144 }, { "epoch": 2.543859649122807, "grad_norm": 0.12901477335373565, "learning_rate": 1.5816426529610035e-06, "loss": 0.1764, "num_tokens": 60014351.0, "step": 145 }, { "epoch": 2.56140350877193, "grad_norm": 0.13513262564013573, "learning_rate": 1.5402238398520614e-06, "loss": 0.1742, "num_tokens": 60428513.0, "step": 146 }, { "epoch": 2.5789473684210527, "grad_norm": 0.12744611421871882, "learning_rate": 1.5002404810528452e-06, "loss": 0.1798, "num_tokens": 60870775.0, "step": 147 }, { "epoch": 2.5964912280701755, "grad_norm": 0.1281932184087842, "learning_rate": 1.4617070708888882e-06, "loss": 0.1788, "num_tokens": 61333167.0, "step": 148 }, { "epoch": 2.6140350877192984, "grad_norm": 0.13398144271039164, "learning_rate": 1.4246375780663613e-06, "loss": 0.1792, "num_tokens": 61737623.0, "step": 149 }, { "epoch": 2.6315789473684212, "grad_norm": 0.13540743049220252, "learning_rate": 1.389045440608296e-06, "loss": 0.1755, "num_tokens": 62143293.0, "step": 150 }, { "epoch": 2.6491228070175437, "grad_norm": 0.13564465493581726, "learning_rate": 1.354943560983175e-06, "loss": 0.1735, "num_tokens": 62558499.0, "step": 151 }, { "epoch": 2.6666666666666665, "grad_norm": 0.12805186009140426, "learning_rate": 1.3223443014276738e-06, "loss": 0.1736, "num_tokens": 63004628.0, "step": 152 }, { "epoch": 2.6842105263157894, "grad_norm": 0.1328569132316143, "learning_rate": 1.2912594794652406e-06, "loss": 0.1642, "num_tokens": 63387346.0, "step": 153 }, { "epoch": 2.7017543859649122, "grad_norm": 0.1325321320124978, "learning_rate": 1.2617003636221394e-06, "loss": 0.169, "num_tokens": 63804970.0, "step": 154 }, { "epoch": 2.719298245614035, "grad_norm": 0.13540714382771668, "learning_rate": 1.2336776693425028e-06, "loss": 0.1744, "num_tokens": 64196162.0, "step": 155 }, { "epoch": 2.736842105263158, "grad_norm": 0.14020781213013872, "learning_rate": 1.2072015551038933e-06, "loss": 0.1811, "num_tokens": 64585657.0, "step": 156 }, { "epoch": 2.754385964912281, "grad_norm": 0.14012421310202808, "learning_rate": 1.1822816187347625e-06, "loss": 0.1882, "num_tokens": 64990929.0, "step": 157 }, { "epoch": 2.7719298245614032, "grad_norm": 0.13359559789919473, "learning_rate": 1.1589268939351499e-06, "loss": 0.1644, "num_tokens": 65419394.0, "step": 158 }, { "epoch": 2.7894736842105265, "grad_norm": 0.1293973137684263, "learning_rate": 1.1371458470018896e-06, "loss": 0.1686, "num_tokens": 65848256.0, "step": 159 }, { "epoch": 2.807017543859649, "grad_norm": 0.12796590503255867, "learning_rate": 1.1169463737594995e-06, "loss": 0.173, "num_tokens": 66276026.0, "step": 160 }, { "epoch": 2.824561403508772, "grad_norm": 0.1386629969970847, "learning_rate": 1.0983357966978747e-06, "loss": 0.1698, "num_tokens": 66662640.0, "step": 161 }, { "epoch": 2.8421052631578947, "grad_norm": 0.1312256058443758, "learning_rate": 1.0813208623178199e-06, "loss": 0.1831, "num_tokens": 67101128.0, "step": 162 }, { "epoch": 2.8596491228070176, "grad_norm": 0.13566699518356568, "learning_rate": 1.0659077386853817e-06, "loss": 0.1918, "num_tokens": 67527335.0, "step": 163 }, { "epoch": 2.8771929824561404, "grad_norm": 0.13207372833151468, "learning_rate": 1.0521020131958692e-06, "loss": 0.18, "num_tokens": 67953220.0, "step": 164 }, { "epoch": 2.8947368421052633, "grad_norm": 0.1351068484295404, "learning_rate": 1.0399086905483752e-06, "loss": 0.1796, "num_tokens": 68401961.0, "step": 165 }, { "epoch": 2.912280701754386, "grad_norm": 0.13512574709033598, "learning_rate": 1.0293321909315242e-06, "loss": 0.1742, "num_tokens": 68815465.0, "step": 166 }, { "epoch": 2.9298245614035086, "grad_norm": 0.13483699443340522, "learning_rate": 1.0203763484211196e-06, "loss": 0.1778, "num_tokens": 69255767.0, "step": 167 }, { "epoch": 2.9473684210526314, "grad_norm": 0.13142810366700336, "learning_rate": 1.0130444095902514e-06, "loss": 0.1842, "num_tokens": 69678619.0, "step": 168 }, { "epoch": 2.9649122807017543, "grad_norm": 0.13693412077884642, "learning_rate": 1.0073390323323897e-06, "loss": 0.177, "num_tokens": 70098414.0, "step": 169 }, { "epoch": 2.982456140350877, "grad_norm": 0.1320466096940847, "learning_rate": 1.0032622848978689e-06, "loss": 0.168, "num_tokens": 70513950.0, "step": 170 }, { "epoch": 3.0, "grad_norm": 0.1365255306036787, "learning_rate": 1.000815645144134e-06, "loss": 0.1794, "num_tokens": 70937090.0, "step": 171 }, { "epoch": 3.0, "step": 171, "total_flos": 2.276376686268252e+17, "train_loss": 0.25581319124726526, "train_runtime": 2707.0199, "train_samples_per_second": 8.073, "train_steps_per_second": 0.063 } ], "logging_steps": 1, "max_steps": 171, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.276376686268252e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }