{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 171,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017543859649122806,
      "grad_norm": 3.77577782421705,
      "learning_rate": 0.0,
      "loss": 1.1733,
      "num_tokens": 427344.0,
      "step": 1
    },
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 3.7130563556909215,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.164,
      "num_tokens": 866449.0,
      "step": 2
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 3.970572390407664,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.2254,
      "num_tokens": 1268563.0,
      "step": 3
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 3.512030300056592,
      "learning_rate": 5e-06,
      "loss": 1.2077,
      "num_tokens": 1669265.0,
      "step": 4
    },
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 2.37795690695823,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.0413,
      "num_tokens": 2093363.0,
      "step": 5
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 1.525864968420599,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.874,
      "num_tokens": 2505147.0,
      "step": 6
    },
    {
      "epoch": 0.12280701754385964,
      "grad_norm": 1.4347947716935,
      "learning_rate": 1e-05,
      "loss": 0.8159,
      "num_tokens": 2931742.0,
      "step": 7
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 2.6084462153475063,
      "learning_rate": 9.999184354855868e-06,
      "loss": 0.6802,
      "num_tokens": 3316057.0,
      "step": 8
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 1.5740859932252258,
      "learning_rate": 9.996737715102133e-06,
      "loss": 0.6019,
      "num_tokens": 3761399.0,
      "step": 9
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 1.5450933348615719,
      "learning_rate": 9.99266096766761e-06,
      "loss": 0.5439,
      "num_tokens": 4179394.0,
      "step": 10
    },
    {
      "epoch": 0.19298245614035087,
      "grad_norm": 0.9018401685238229,
      "learning_rate": 9.98695559040975e-06,
      "loss": 0.4438,
      "num_tokens": 4600345.0,
      "step": 11
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.5196800459649759,
      "learning_rate": 9.979623651578881e-06,
      "loss": 0.395,
      "num_tokens": 4980610.0,
      "step": 12
    },
    {
      "epoch": 0.22807017543859648,
      "grad_norm": 0.4799071601752034,
      "learning_rate": 9.970667809068476e-06,
      "loss": 0.3892,
      "num_tokens": 5395359.0,
      "step": 13
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 0.31207959270885294,
      "learning_rate": 9.960091309451626e-06,
      "loss": 0.3808,
      "num_tokens": 5805663.0,
      "step": 14
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 0.31760448278346387,
      "learning_rate": 9.947897986804131e-06,
      "loss": 0.3708,
      "num_tokens": 6201438.0,
      "step": 15
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.3009817279323404,
      "learning_rate": 9.93409226131462e-06,
      "loss": 0.3557,
      "num_tokens": 6614132.0,
      "step": 16
    },
    {
      "epoch": 0.2982456140350877,
      "grad_norm": 0.2599822684525422,
      "learning_rate": 9.91867913768218e-06,
      "loss": 0.3284,
      "num_tokens": 7030814.0,
      "step": 17
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.24197734921107386,
      "learning_rate": 9.901664203302126e-06,
      "loss": 0.3272,
      "num_tokens": 7453748.0,
      "step": 18
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.2741912470593866,
      "learning_rate": 9.883053626240503e-06,
      "loss": 0.3181,
      "num_tokens": 7874681.0,
      "step": 19
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.21560866373997145,
      "learning_rate": 9.862854152998112e-06,
      "loss": 0.3102,
      "num_tokens": 8349009.0,
      "step": 20
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 0.203790119667157,
      "learning_rate": 9.841073106064852e-06,
      "loss": 0.3038,
      "num_tokens": 8779295.0,
      "step": 21
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 0.21543607735771148,
      "learning_rate": 9.81771838126524e-06,
      "loss": 0.3039,
      "num_tokens": 9155380.0,
      "step": 22
    },
    {
      "epoch": 0.40350877192982454,
      "grad_norm": 0.19448461940928388,
      "learning_rate": 9.792798444896107e-06,
      "loss": 0.2923,
      "num_tokens": 9572881.0,
      "step": 23
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.20104199677846465,
      "learning_rate": 9.766322330657499e-06,
      "loss": 0.3004,
      "num_tokens": 9972480.0,
      "step": 24
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 0.19715422132031898,
      "learning_rate": 9.738299636377863e-06,
      "loss": 0.291,
      "num_tokens": 10406948.0,
      "step": 25
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 0.19192801250536232,
      "learning_rate": 9.70874052053476e-06,
      "loss": 0.289,
      "num_tokens": 10821942.0,
      "step": 26
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 0.18616122639207905,
      "learning_rate": 9.677655698572326e-06,
      "loss": 0.2622,
      "num_tokens": 11234170.0,
      "step": 27
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 0.192192334546507,
      "learning_rate": 9.645056439016827e-06,
      "loss": 0.275,
      "num_tokens": 11627308.0,
      "step": 28
    },
    {
      "epoch": 0.5087719298245614,
      "grad_norm": 0.18829460979027127,
      "learning_rate": 9.610954559391704e-06,
      "loss": 0.2754,
      "num_tokens": 12022656.0,
      "step": 29
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.17300283701775698,
      "learning_rate": 9.57536242193364e-06,
      "loss": 0.2692,
      "num_tokens": 12444405.0,
      "step": 30
    },
    {
      "epoch": 0.543859649122807,
      "grad_norm": 0.17695771762278176,
      "learning_rate": 9.538292929111114e-06,
      "loss": 0.2734,
      "num_tokens": 12837998.0,
      "step": 31
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.16869993658358656,
      "learning_rate": 9.499759518947156e-06,
      "loss": 0.2657,
      "num_tokens": 13261798.0,
      "step": 32
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 0.15857386041778282,
      "learning_rate": 9.459776160147941e-06,
      "loss": 0.2559,
      "num_tokens": 13717762.0,
      "step": 33
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 0.16181853610382824,
      "learning_rate": 9.418357347038999e-06,
      "loss": 0.2427,
      "num_tokens": 14142042.0,
      "step": 34
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 0.16729961186553155,
      "learning_rate": 9.375518094310904e-06,
      "loss": 0.2546,
      "num_tokens": 14543269.0,
      "step": 35
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.16125107833920507,
      "learning_rate": 9.331273931576306e-06,
      "loss": 0.2455,
      "num_tokens": 14941779.0,
      "step": 36
    },
    {
      "epoch": 0.6491228070175439,
      "grad_norm": 0.1651336881757046,
      "learning_rate": 9.285640897740316e-06,
      "loss": 0.2479,
      "num_tokens": 15378768.0,
      "step": 37
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.16309612539485216,
      "learning_rate": 9.238635535186247e-06,
      "loss": 0.2524,
      "num_tokens": 15784254.0,
      "step": 38
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 0.1613815194130245,
      "learning_rate": 9.19027488377886e-06,
      "loss": 0.252,
      "num_tokens": 16187575.0,
      "step": 39
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.15494556090577644,
      "learning_rate": 9.140576474687263e-06,
      "loss": 0.2397,
      "num_tokens": 16592627.0,
      "step": 40
    },
    {
      "epoch": 0.7192982456140351,
      "grad_norm": 0.15730027265317648,
      "learning_rate": 9.0895583240297e-06,
      "loss": 0.2396,
      "num_tokens": 17019496.0,
      "step": 41
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 0.15536856311696803,
      "learning_rate": 9.037238926342544e-06,
      "loss": 0.2388,
      "num_tokens": 17448731.0,
      "step": 42
    },
    {
      "epoch": 0.7543859649122807,
      "grad_norm": 0.16373703052589486,
      "learning_rate": 8.983637247875872e-06,
      "loss": 0.2447,
      "num_tokens": 17852418.0,
      "step": 43
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 0.1564039931823383,
      "learning_rate": 8.92877271971802e-06,
      "loss": 0.2317,
      "num_tokens": 18284414.0,
      "step": 44
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 0.15673969833275106,
      "learning_rate": 8.872665230751644e-06,
      "loss": 0.2445,
      "num_tokens": 18700575.0,
      "step": 45
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 0.16227301290569535,
      "learning_rate": 8.815335120443822e-06,
      "loss": 0.2369,
      "num_tokens": 19112507.0,
      "step": 46
    },
    {
      "epoch": 0.8245614035087719,
      "grad_norm": 0.16192875727409162,
      "learning_rate": 8.756803171472817e-06,
      "loss": 0.2488,
      "num_tokens": 19497572.0,
      "step": 47
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.14878902875215574,
      "learning_rate": 8.69709060219416e-06,
      "loss": 0.221,
      "num_tokens": 19887057.0,
      "step": 48
    },
    {
      "epoch": 0.8596491228070176,
      "grad_norm": 0.168304643511618,
      "learning_rate": 8.636219058948823e-06,
      "loss": 0.2338,
      "num_tokens": 20294327.0,
      "step": 49
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.14440251287899045,
      "learning_rate": 8.574210608216206e-06,
      "loss": 0.2165,
      "num_tokens": 20731445.0,
      "step": 50
    },
    {
      "epoch": 0.8947368421052632,
      "grad_norm": 0.14573622062248015,
      "learning_rate": 8.511087728614863e-06,
      "loss": 0.2291,
      "num_tokens": 21165129.0,
      "step": 51
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 0.15767251624934578,
      "learning_rate": 8.446873302753783e-06,
      "loss": 0.2231,
      "num_tokens": 21564437.0,
      "step": 52
    },
    {
      "epoch": 0.9298245614035088,
      "grad_norm": 0.14330856767528197,
      "learning_rate": 8.381590608937251e-06,
      "loss": 0.2274,
      "num_tokens": 22012280.0,
      "step": 53
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.1524166655671973,
      "learning_rate": 8.315263312726248e-06,
      "loss": 0.2131,
      "num_tokens": 22396001.0,
      "step": 54
    },
    {
      "epoch": 0.9649122807017544,
      "grad_norm": 0.15336585183985868,
      "learning_rate": 8.247915458359473e-06,
      "loss": 0.2195,
      "num_tokens": 22793769.0,
      "step": 55
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 0.1588593078419976,
      "learning_rate": 8.179571460037096e-06,
      "loss": 0.2345,
      "num_tokens": 23201717.0,
      "step": 56
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.14666483129173052,
      "learning_rate": 8.110256093070393e-06,
      "loss": 0.2346,
      "num_tokens": 23647950.0,
      "step": 57
    },
    {
      "epoch": 1.0175438596491229,
      "grad_norm": 0.15418874889368148,
      "learning_rate": 8.039994484900463e-06,
      "loss": 0.2268,
      "num_tokens": 24100529.0,
      "step": 58
    },
    {
      "epoch": 1.0350877192982457,
      "grad_norm": 0.14747387727593939,
      "learning_rate": 7.968812105989316e-06,
      "loss": 0.2155,
      "num_tokens": 24540892.0,
      "step": 59
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 0.154346076339797,
      "learning_rate": 7.896734760586599e-06,
      "loss": 0.2057,
      "num_tokens": 24956824.0,
      "step": 60
    },
    {
      "epoch": 1.0701754385964912,
      "grad_norm": 0.14685651214148715,
      "learning_rate": 7.82378857737533e-06,
      "loss": 0.2036,
      "num_tokens": 25384518.0,
      "step": 61
    },
    {
      "epoch": 1.087719298245614,
      "grad_norm": 0.16326674348993506,
      "learning_rate": 7.75e-06,
      "loss": 0.2001,
      "num_tokens": 25771807.0,
      "step": 62
    },
    {
      "epoch": 1.1052631578947367,
      "grad_norm": 0.147771119836904,
      "learning_rate": 7.675395777480538e-06,
      "loss": 0.1996,
      "num_tokens": 26177417.0,
      "step": 63
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 0.14003657083220583,
      "learning_rate": 7.600002954515532e-06,
      "loss": 0.2072,
      "num_tokens": 26622325.0,
      "step": 64
    },
    {
      "epoch": 1.1403508771929824,
      "grad_norm": 0.15332767124685198,
      "learning_rate": 7.523848861678297e-06,
      "loss": 0.2065,
      "num_tokens": 27045078.0,
      "step": 65
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 0.15183433287347486,
      "learning_rate": 7.446961105509289e-06,
      "loss": 0.2032,
      "num_tokens": 27438828.0,
      "step": 66
    },
    {
      "epoch": 1.1754385964912282,
      "grad_norm": 0.14554656331938712,
      "learning_rate": 7.36936755850849e-06,
      "loss": 0.2054,
      "num_tokens": 27854689.0,
      "step": 67
    },
    {
      "epoch": 1.1929824561403508,
      "grad_norm": 0.1537836129829156,
      "learning_rate": 7.2910963490313815e-06,
      "loss": 0.1949,
      "num_tokens": 28233580.0,
      "step": 68
    },
    {
      "epoch": 1.2105263157894737,
      "grad_norm": 0.14572761360447276,
      "learning_rate": 7.212175851092154e-06,
      "loss": 0.1958,
      "num_tokens": 28641897.0,
      "step": 69
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 0.13708384426430809,
      "learning_rate": 7.132634674077884e-06,
      "loss": 0.2021,
      "num_tokens": 29084929.0,
      "step": 70
    },
    {
      "epoch": 1.2456140350877192,
      "grad_norm": 0.1486755044006831,
      "learning_rate": 7.052501652377368e-06,
      "loss": 0.2044,
      "num_tokens": 29482516.0,
      "step": 71
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 0.14957101767673275,
      "learning_rate": 6.971805834928399e-06,
      "loss": 0.2048,
      "num_tokens": 29899136.0,
      "step": 72
    },
    {
      "epoch": 1.280701754385965,
      "grad_norm": 0.1486401064457622,
      "learning_rate": 6.890576474687264e-06,
      "loss": 0.2068,
      "num_tokens": 30317666.0,
      "step": 73
    },
    {
      "epoch": 1.2982456140350878,
      "grad_norm": 0.15958496167902586,
      "learning_rate": 6.808843018024296e-06,
      "loss": 0.1986,
      "num_tokens": 30734034.0,
      "step": 74
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.1383546863269268,
      "learning_rate": 6.726635094049291e-06,
      "loss": 0.199,
      "num_tokens": 31155917.0,
      "step": 75
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.14368049999314014,
      "learning_rate": 6.643982503870693e-06,
      "loss": 0.2032,
      "num_tokens": 31573757.0,
      "step": 76
    },
    {
      "epoch": 1.3508771929824561,
      "grad_norm": 0.13900291410105262,
      "learning_rate": 6.560915209792424e-06,
      "loss": 0.2016,
      "num_tokens": 32010739.0,
      "step": 77
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 0.13844326865953724,
      "learning_rate": 6.477463324452286e-06,
      "loss": 0.1925,
      "num_tokens": 32424467.0,
      "step": 78
    },
    {
      "epoch": 1.3859649122807016,
      "grad_norm": 0.1433757292045691,
      "learning_rate": 6.393657099905854e-06,
      "loss": 0.2008,
      "num_tokens": 32834770.0,
      "step": 79
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.14196299072627255,
      "learning_rate": 6.309526916659843e-06,
      "loss": 0.1924,
      "num_tokens": 33255872.0,
      "step": 80
    },
    {
      "epoch": 1.4210526315789473,
      "grad_norm": 0.13753823275156205,
      "learning_rate": 6.225103272658889e-06,
      "loss": 0.2034,
      "num_tokens": 33706927.0,
      "step": 81
    },
    {
      "epoch": 1.4385964912280702,
      "grad_norm": 0.1384289808314504,
      "learning_rate": 6.140416772229785e-06,
      "loss": 0.1917,
      "num_tokens": 34112601.0,
      "step": 82
    },
    {
      "epoch": 1.456140350877193,
      "grad_norm": 0.14627667997521285,
      "learning_rate": 6.0554981149871276e-06,
      "loss": 0.2063,
      "num_tokens": 34517104.0,
      "step": 83
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 0.1558573666402862,
      "learning_rate": 5.970378084704441e-06,
      "loss": 0.1994,
      "num_tokens": 34897309.0,
      "step": 84
    },
    {
      "epoch": 1.4912280701754386,
      "grad_norm": 0.1417112898190135,
      "learning_rate": 5.88508753815478e-06,
      "loss": 0.1881,
      "num_tokens": 35307793.0,
      "step": 85
    },
    {
      "epoch": 1.5087719298245614,
      "grad_norm": 0.1385582692497573,
      "learning_rate": 5.799657393924869e-06,
      "loss": 0.198,
      "num_tokens": 35741435.0,
      "step": 86
    },
    {
      "epoch": 1.526315789473684,
      "grad_norm": 0.15662493183414183,
      "learning_rate": 5.714118621206843e-06,
      "loss": 0.1909,
      "num_tokens": 36110154.0,
      "step": 87
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 0.14798100623872662,
      "learning_rate": 5.6285022285716325e-06,
      "loss": 0.2063,
      "num_tokens": 36508508.0,
      "step": 88
    },
    {
      "epoch": 1.5614035087719298,
      "grad_norm": 0.13949297838603725,
      "learning_rate": 5.542839252728096e-06,
      "loss": 0.2056,
      "num_tokens": 36962199.0,
      "step": 89
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.1388395627421214,
      "learning_rate": 5.457160747271906e-06,
      "loss": 0.1977,
      "num_tokens": 37416119.0,
      "step": 90
    },
    {
      "epoch": 1.5964912280701755,
      "grad_norm": 0.13753404705909872,
      "learning_rate": 5.371497771428368e-06,
      "loss": 0.1988,
      "num_tokens": 37844052.0,
      "step": 91
    },
    {
      "epoch": 1.6140350877192984,
      "grad_norm": 0.13813649436163167,
      "learning_rate": 5.2858813787931605e-06,
      "loss": 0.193,
      "num_tokens": 38281149.0,
      "step": 92
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.13735934501319846,
      "learning_rate": 5.2003426060751324e-06,
      "loss": 0.1948,
      "num_tokens": 38696776.0,
      "step": 93
    },
    {
      "epoch": 1.6491228070175439,
      "grad_norm": 0.14464652766257102,
      "learning_rate": 5.114912461845223e-06,
      "loss": 0.1954,
      "num_tokens": 39118421.0,
      "step": 94
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.1441541884884212,
      "learning_rate": 5.02962191529556e-06,
      "loss": 0.1969,
      "num_tokens": 39531921.0,
      "step": 95
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.14549147633711634,
      "learning_rate": 4.944501885012875e-06,
      "loss": 0.1987,
      "num_tokens": 39942510.0,
      "step": 96
    },
    {
      "epoch": 1.7017543859649122,
      "grad_norm": 0.14239386063143547,
      "learning_rate": 4.859583227770218e-06,
      "loss": 0.1942,
      "num_tokens": 40349157.0,
      "step": 97
    },
    {
      "epoch": 1.719298245614035,
      "grad_norm": 0.14101938277964904,
      "learning_rate": 4.774896727341113e-06,
      "loss": 0.1896,
      "num_tokens": 40755487.0,
      "step": 98
    },
    {
      "epoch": 1.736842105263158,
      "grad_norm": 0.1513563377956916,
      "learning_rate": 4.6904730833401575e-06,
      "loss": 0.1741,
      "num_tokens": 41109588.0,
      "step": 99
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 0.14171664584567437,
      "learning_rate": 4.606342900094147e-06,
      "loss": 0.1978,
      "num_tokens": 41549463.0,
      "step": 100
    },
    {
      "epoch": 1.7719298245614035,
      "grad_norm": 0.1403769471140474,
      "learning_rate": 4.5225366755477165e-06,
      "loss": 0.2018,
      "num_tokens": 41986009.0,
      "step": 101
    },
    {
      "epoch": 1.7894736842105263,
      "grad_norm": 0.14588775725754724,
      "learning_rate": 4.439084790207577e-06,
      "loss": 0.1991,
      "num_tokens": 42393517.0,
      "step": 102
    },
    {
      "epoch": 1.807017543859649,
      "grad_norm": 0.14195463960673751,
      "learning_rate": 4.35601749612931e-06,
      "loss": 0.1954,
      "num_tokens": 42788971.0,
      "step": 103
    },
    {
      "epoch": 1.8245614035087718,
      "grad_norm": 0.14486517700153345,
      "learning_rate": 4.273364905950711e-06,
      "loss": 0.2001,
      "num_tokens": 43200059.0,
      "step": 104
    },
    {
      "epoch": 1.8421052631578947,
      "grad_norm": 0.14536497153998434,
      "learning_rate": 4.191156981975704e-06,
      "loss": 0.1881,
      "num_tokens": 43591515.0,
      "step": 105
    },
    {
      "epoch": 1.8596491228070176,
      "grad_norm": 0.1483672348465985,
      "learning_rate": 4.109423525312738e-06,
      "loss": 0.1936,
      "num_tokens": 43989015.0,
      "step": 106
    },
    {
      "epoch": 1.8771929824561404,
      "grad_norm": 0.14387471159557752,
      "learning_rate": 4.028194165071603e-06,
      "loss": 0.1959,
      "num_tokens": 44390867.0,
      "step": 107
    },
    {
      "epoch": 1.8947368421052633,
      "grad_norm": 0.14319263387686854,
      "learning_rate": 3.9474983476226335e-06,
      "loss": 0.2026,
      "num_tokens": 44814288.0,
      "step": 108
    },
    {
      "epoch": 1.912280701754386,
      "grad_norm": 0.13718763298366718,
      "learning_rate": 3.867365325922116e-06,
      "loss": 0.1919,
      "num_tokens": 45232685.0,
      "step": 109
    },
    {
      "epoch": 1.9298245614035088,
      "grad_norm": 0.13661747990592807,
      "learning_rate": 3.7878241489078473e-06,
      "loss": 0.192,
      "num_tokens": 45633905.0,
      "step": 110
    },
    {
      "epoch": 1.9473684210526314,
      "grad_norm": 0.13757723840377134,
      "learning_rate": 3.7089036509686216e-06,
      "loss": 0.196,
      "num_tokens": 46052270.0,
      "step": 111
    },
    {
      "epoch": 1.9649122807017543,
      "grad_norm": 0.14009156799615108,
      "learning_rate": 3.630632441491512e-06,
      "loss": 0.1945,
      "num_tokens": 46479271.0,
      "step": 112
    },
    {
      "epoch": 1.9824561403508771,
      "grad_norm": 0.1392559652525668,
      "learning_rate": 3.5530388944907124e-06,
      "loss": 0.1985,
      "num_tokens": 46884227.0,
      "step": 113
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.13976428969132587,
      "learning_rate": 3.476151138321705e-06,
      "loss": 0.1995,
      "num_tokens": 47297644.0,
      "step": 114
    },
    {
      "epoch": 2.017543859649123,
      "grad_norm": 0.1378198428541279,
      "learning_rate": 3.3999970454844688e-06,
      "loss": 0.1724,
      "num_tokens": 47688068.0,
      "step": 115
    },
    {
      "epoch": 2.0350877192982457,
      "grad_norm": 0.134440422974191,
      "learning_rate": 3.3246042225194626e-06,
      "loss": 0.1796,
      "num_tokens": 48092477.0,
      "step": 116
    },
    {
      "epoch": 2.0526315789473686,
      "grad_norm": 0.13660484419562605,
      "learning_rate": 3.2500000000000015e-06,
      "loss": 0.1763,
      "num_tokens": 48476841.0,
      "step": 117
    },
    {
      "epoch": 2.0701754385964914,
      "grad_norm": 0.14109474340650238,
      "learning_rate": 3.176211422624672e-06,
      "loss": 0.1778,
      "num_tokens": 48854905.0,
      "step": 118
    },
    {
      "epoch": 2.087719298245614,
      "grad_norm": 0.13774654351946805,
      "learning_rate": 3.103265239413401e-06,
      "loss": 0.1793,
      "num_tokens": 49295065.0,
      "step": 119
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.14705463035874308,
      "learning_rate": 3.0311878940106864e-06,
      "loss": 0.1885,
      "num_tokens": 49711843.0,
      "step": 120
    },
    {
      "epoch": 2.1228070175438596,
      "grad_norm": 0.13965440849358451,
      "learning_rate": 2.9600055150995397e-06,
      "loss": 0.1804,
      "num_tokens": 50121373.0,
      "step": 121
    },
    {
      "epoch": 2.1403508771929824,
      "grad_norm": 0.1431354792667028,
      "learning_rate": 2.889743906929609e-06,
      "loss": 0.1761,
      "num_tokens": 50524660.0,
      "step": 122
    },
    {
      "epoch": 2.1578947368421053,
      "grad_norm": 0.13549946577694855,
      "learning_rate": 2.820428539962905e-06,
      "loss": 0.1756,
      "num_tokens": 50952097.0,
      "step": 123
    },
    {
      "epoch": 2.175438596491228,
      "grad_norm": 0.13874042982824947,
      "learning_rate": 2.7520845416405285e-06,
      "loss": 0.1787,
      "num_tokens": 51357662.0,
      "step": 124
    },
    {
      "epoch": 2.192982456140351,
      "grad_norm": 0.13352052067268536,
      "learning_rate": 2.6847366872737535e-06,
      "loss": 0.1786,
      "num_tokens": 51772391.0,
      "step": 125
    },
    {
      "epoch": 2.2105263157894735,
      "grad_norm": 0.13750830287403998,
      "learning_rate": 2.618409391062751e-06,
      "loss": 0.1827,
      "num_tokens": 52198396.0,
      "step": 126
    },
    {
      "epoch": 2.2280701754385963,
      "grad_norm": 0.14077287411728898,
      "learning_rate": 2.5531266972462176e-06,
      "loss": 0.1786,
      "num_tokens": 52585564.0,
      "step": 127
    },
    {
      "epoch": 2.245614035087719,
      "grad_norm": 0.13893984896019573,
      "learning_rate": 2.4889122713851397e-06,
      "loss": 0.1788,
      "num_tokens": 52997398.0,
      "step": 128
    },
    {
      "epoch": 2.263157894736842,
      "grad_norm": 0.13788162656378736,
      "learning_rate": 2.425789391783796e-06,
      "loss": 0.1878,
      "num_tokens": 53407933.0,
      "step": 129
    },
    {
      "epoch": 2.280701754385965,
      "grad_norm": 0.13629331805149528,
      "learning_rate": 2.36378094105118e-06,
      "loss": 0.1836,
      "num_tokens": 53817667.0,
      "step": 130
    },
    {
      "epoch": 2.2982456140350878,
      "grad_norm": 0.145846640939152,
      "learning_rate": 2.302909397805841e-06,
      "loss": 0.1761,
      "num_tokens": 54208139.0,
      "step": 131
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.1415158735561498,
      "learning_rate": 2.2431968285271843e-06,
      "loss": 0.1861,
      "num_tokens": 54616138.0,
      "step": 132
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.1399694993181749,
      "learning_rate": 2.1846648795561777e-06,
      "loss": 0.18,
      "num_tokens": 55028264.0,
      "step": 133
    },
    {
      "epoch": 2.3508771929824563,
      "grad_norm": 0.1340221566625987,
      "learning_rate": 2.1273347692483574e-06,
      "loss": 0.1818,
      "num_tokens": 55474995.0,
      "step": 134
    },
    {
      "epoch": 2.3684210526315788,
      "grad_norm": 0.13728502667314055,
      "learning_rate": 2.071227280281982e-06,
      "loss": 0.1697,
      "num_tokens": 55872252.0,
      "step": 135
    },
    {
      "epoch": 2.3859649122807016,
      "grad_norm": 0.13569940106251407,
      "learning_rate": 2.016362752124129e-06,
      "loss": 0.1799,
      "num_tokens": 56295990.0,
      "step": 136
    },
    {
      "epoch": 2.4035087719298245,
      "grad_norm": 0.1433225385861297,
      "learning_rate": 1.9627610736574575e-06,
      "loss": 0.1744,
      "num_tokens": 56700633.0,
      "step": 137
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 0.13712140562366157,
      "learning_rate": 1.9104416759703017e-06,
      "loss": 0.1772,
      "num_tokens": 57123351.0,
      "step": 138
    },
    {
      "epoch": 2.43859649122807,
      "grad_norm": 0.14064914274676912,
      "learning_rate": 1.8594235253127373e-06,
      "loss": 0.1794,
      "num_tokens": 57541451.0,
      "step": 139
    },
    {
      "epoch": 2.456140350877193,
      "grad_norm": 0.15170132064659694,
      "learning_rate": 1.8097251162211405e-06,
      "loss": 0.1831,
      "num_tokens": 57962223.0,
      "step": 140
    },
    {
      "epoch": 2.473684210526316,
      "grad_norm": 0.13964776563103484,
      "learning_rate": 1.7613644648137543e-06,
      "loss": 0.1756,
      "num_tokens": 58375881.0,
      "step": 141
    },
    {
      "epoch": 2.4912280701754383,
      "grad_norm": 0.13507579048092097,
      "learning_rate": 1.7143591022596846e-06,
      "loss": 0.1821,
      "num_tokens": 58796929.0,
      "step": 142
    },
    {
      "epoch": 2.5087719298245617,
      "grad_norm": 0.13875107577532086,
      "learning_rate": 1.6687260684236943e-06,
      "loss": 0.1773,
      "num_tokens": 59207995.0,
      "step": 143
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.14061593378122658,
      "learning_rate": 1.6244819056890975e-06,
      "loss": 0.1716,
      "num_tokens": 59582578.0,
      "step": 144
    },
    {
      "epoch": 2.543859649122807,
      "grad_norm": 0.12901477335373565,
      "learning_rate": 1.5816426529610035e-06,
      "loss": 0.1764,
      "num_tokens": 60014351.0,
      "step": 145
    },
    {
      "epoch": 2.56140350877193,
      "grad_norm": 0.13513262564013573,
      "learning_rate": 1.5402238398520614e-06,
      "loss": 0.1742,
      "num_tokens": 60428513.0,
      "step": 146
    },
    {
      "epoch": 2.5789473684210527,
      "grad_norm": 0.12744611421871882,
      "learning_rate": 1.5002404810528452e-06,
      "loss": 0.1798,
      "num_tokens": 60870775.0,
      "step": 147
    },
    {
      "epoch": 2.5964912280701755,
      "grad_norm": 0.1281932184087842,
      "learning_rate": 1.4617070708888882e-06,
      "loss": 0.1788,
      "num_tokens": 61333167.0,
      "step": 148
    },
    {
      "epoch": 2.6140350877192984,
      "grad_norm": 0.13398144271039164,
      "learning_rate": 1.4246375780663613e-06,
      "loss": 0.1792,
      "num_tokens": 61737623.0,
      "step": 149
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.13540743049220252,
      "learning_rate": 1.389045440608296e-06,
      "loss": 0.1755,
      "num_tokens": 62143293.0,
      "step": 150
    },
    {
      "epoch": 2.6491228070175437,
      "grad_norm": 0.13564465493581726,
      "learning_rate": 1.354943560983175e-06,
      "loss": 0.1735,
      "num_tokens": 62558499.0,
      "step": 151
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.12805186009140426,
      "learning_rate": 1.3223443014276738e-06,
      "loss": 0.1736,
      "num_tokens": 63004628.0,
      "step": 152
    },
    {
      "epoch": 2.6842105263157894,
      "grad_norm": 0.1328569132316143,
      "learning_rate": 1.2912594794652406e-06,
      "loss": 0.1642,
      "num_tokens": 63387346.0,
      "step": 153
    },
    {
      "epoch": 2.7017543859649122,
      "grad_norm": 0.1325321320124978,
      "learning_rate": 1.2617003636221394e-06,
      "loss": 0.169,
      "num_tokens": 63804970.0,
      "step": 154
    },
    {
      "epoch": 2.719298245614035,
      "grad_norm": 0.13540714382771668,
      "learning_rate": 1.2336776693425028e-06,
      "loss": 0.1744,
      "num_tokens": 64196162.0,
      "step": 155
    },
    {
      "epoch": 2.736842105263158,
      "grad_norm": 0.14020781213013872,
      "learning_rate": 1.2072015551038933e-06,
      "loss": 0.1811,
      "num_tokens": 64585657.0,
      "step": 156
    },
    {
      "epoch": 2.754385964912281,
      "grad_norm": 0.14012421310202808,
      "learning_rate": 1.1822816187347625e-06,
      "loss": 0.1882,
      "num_tokens": 64990929.0,
      "step": 157
    },
    {
      "epoch": 2.7719298245614032,
      "grad_norm": 0.13359559789919473,
      "learning_rate": 1.1589268939351499e-06,
      "loss": 0.1644,
      "num_tokens": 65419394.0,
      "step": 158
    },
    {
      "epoch": 2.7894736842105265,
      "grad_norm": 0.1293973137684263,
      "learning_rate": 1.1371458470018896e-06,
      "loss": 0.1686,
      "num_tokens": 65848256.0,
      "step": 159
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 0.12796590503255867,
      "learning_rate": 1.1169463737594995e-06,
      "loss": 0.173,
      "num_tokens": 66276026.0,
      "step": 160
    },
    {
      "epoch": 2.824561403508772,
      "grad_norm": 0.1386629969970847,
      "learning_rate": 1.0983357966978747e-06,
      "loss": 0.1698,
      "num_tokens": 66662640.0,
      "step": 161
    },
    {
      "epoch": 2.8421052631578947,
      "grad_norm": 0.1312256058443758,
      "learning_rate": 1.0813208623178199e-06,
      "loss": 0.1831,
      "num_tokens": 67101128.0,
      "step": 162
    },
    {
      "epoch": 2.8596491228070176,
      "grad_norm": 0.13566699518356568,
      "learning_rate": 1.0659077386853817e-06,
      "loss": 0.1918,
      "num_tokens": 67527335.0,
      "step": 163
    },
    {
      "epoch": 2.8771929824561404,
      "grad_norm": 0.13207372833151468,
      "learning_rate": 1.0521020131958692e-06,
      "loss": 0.18,
      "num_tokens": 67953220.0,
      "step": 164
    },
    {
      "epoch": 2.8947368421052633,
      "grad_norm": 0.1351068484295404,
      "learning_rate": 1.0399086905483752e-06,
      "loss": 0.1796,
      "num_tokens": 68401961.0,
      "step": 165
    },
    {
      "epoch": 2.912280701754386,
      "grad_norm": 0.13512574709033598,
      "learning_rate": 1.0293321909315242e-06,
      "loss": 0.1742,
      "num_tokens": 68815465.0,
      "step": 166
    },
    {
      "epoch": 2.9298245614035086,
      "grad_norm": 0.13483699443340522,
      "learning_rate": 1.0203763484211196e-06,
      "loss": 0.1778,
      "num_tokens": 69255767.0,
      "step": 167
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.13142810366700336,
      "learning_rate": 1.0130444095902514e-06,
      "loss": 0.1842,
      "num_tokens": 69678619.0,
      "step": 168
    },
    {
      "epoch": 2.9649122807017543,
      "grad_norm": 0.13693412077884642,
      "learning_rate": 1.0073390323323897e-06,
      "loss": 0.177,
      "num_tokens": 70098414.0,
      "step": 169
    },
    {
      "epoch": 2.982456140350877,
      "grad_norm": 0.1320466096940847,
      "learning_rate": 1.0032622848978689e-06,
      "loss": 0.168,
      "num_tokens": 70513950.0,
      "step": 170
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.1365255306036787,
      "learning_rate": 1.000815645144134e-06,
      "loss": 0.1794,
      "num_tokens": 70937090.0,
      "step": 171
    },
    {
      "epoch": 3.0,
      "step": 171,
      "total_flos": 2.276376686268252e+17,
      "train_loss": 0.25581319124726526,
      "train_runtime": 2707.0199,
      "train_samples_per_second": 8.073,
      "train_steps_per_second": 0.063
    }
  ],
  "logging_steps": 1,
  "max_steps": 171,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.276376686268252e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}