| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.990047413608987, | |
| "eval_steps": 1000, | |
| "global_step": 70000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004271496305155696, | |
| "grad_norm": 5.181503772735596, | |
| "learning_rate": 7.118451025056948e-07, | |
| "loss": 5.7231, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.008542992610311393, | |
| "grad_norm": 4.810484886169434, | |
| "learning_rate": 1.4236902050113896e-06, | |
| "loss": 5.663, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.012814488915467088, | |
| "grad_norm": 6.623730182647705, | |
| "learning_rate": 2.1355353075170844e-06, | |
| "loss": 5.8302, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.017085985220622785, | |
| "grad_norm": 4.488267421722412, | |
| "learning_rate": 2.847380410022779e-06, | |
| "loss": 5.5719, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.02135748152577848, | |
| "grad_norm": 5.217711448669434, | |
| "learning_rate": 3.559225512528474e-06, | |
| "loss": 5.572, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.025628977830934176, | |
| "grad_norm": 6.749180316925049, | |
| "learning_rate": 4.271070615034169e-06, | |
| "loss": 5.6101, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.029900474136089872, | |
| "grad_norm": 7.648950099945068, | |
| "learning_rate": 4.9829157175398636e-06, | |
| "loss": 5.6377, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.03417197044124557, | |
| "grad_norm": 8.535877227783203, | |
| "learning_rate": 5.694760820045558e-06, | |
| "loss": 5.3197, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.03844346674640126, | |
| "grad_norm": 10.138134002685547, | |
| "learning_rate": 6.406605922551254e-06, | |
| "loss": 5.2204, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.04271496305155696, | |
| "grad_norm": 9.492013931274414, | |
| "learning_rate": 7.118451025056948e-06, | |
| "loss": 5.1271, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.04271496305155696, | |
| "eval_runtime": 404.87, | |
| "eval_samples_per_second": 115.647, | |
| "eval_steps_per_second": 14.456, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.046986459356712654, | |
| "grad_norm": 9.8007230758667, | |
| "learning_rate": 7.830296127562643e-06, | |
| "loss": 4.8906, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.05125795566186835, | |
| "grad_norm": 15.544700622558594, | |
| "learning_rate": 8.542141230068338e-06, | |
| "loss": 4.7123, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.05552945196702405, | |
| "grad_norm": 11.552872657775879, | |
| "learning_rate": 9.253986332574032e-06, | |
| "loss": 4.542, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.059800948272179744, | |
| "grad_norm": 8.390003204345703, | |
| "learning_rate": 9.965831435079727e-06, | |
| "loss": 4.1772, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.06407244457733544, | |
| "grad_norm": 6.352422714233398, | |
| "learning_rate": 1.0677676537585422e-05, | |
| "loss": 4.1877, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.06834394088249114, | |
| "grad_norm": 11.394415855407715, | |
| "learning_rate": 1.1389521640091117e-05, | |
| "loss": 4.1625, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.07261543718764683, | |
| "grad_norm": 7.095893859863281, | |
| "learning_rate": 1.2101366742596812e-05, | |
| "loss": 4.0512, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.07688693349280253, | |
| "grad_norm": 11.011418342590332, | |
| "learning_rate": 1.2813211845102508e-05, | |
| "loss": 4.0444, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.08115842979795823, | |
| "grad_norm": 7.294766902923584, | |
| "learning_rate": 1.35250569476082e-05, | |
| "loss": 4.1648, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.08542992610311392, | |
| "grad_norm": 9.359979629516602, | |
| "learning_rate": 1.4236902050113896e-05, | |
| "loss": 4.1958, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.08542992610311392, | |
| "eval_runtime": 404.9689, | |
| "eval_samples_per_second": 115.619, | |
| "eval_steps_per_second": 14.453, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.08970142240826962, | |
| "grad_norm": 9.699823379516602, | |
| "learning_rate": 1.494874715261959e-05, | |
| "loss": 4.0555, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.09397291871342531, | |
| "grad_norm": 9.576581954956055, | |
| "learning_rate": 1.5660592255125285e-05, | |
| "loss": 4.1073, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.09824441501858101, | |
| "grad_norm": 9.165473937988281, | |
| "learning_rate": 1.637243735763098e-05, | |
| "loss": 4.0373, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.1025159113237367, | |
| "grad_norm": 9.258238792419434, | |
| "learning_rate": 1.7084282460136675e-05, | |
| "loss": 3.8695, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.1067874076288924, | |
| "grad_norm": 10.60352897644043, | |
| "learning_rate": 1.779612756264237e-05, | |
| "loss": 3.8457, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1110589039340481, | |
| "grad_norm": 8.64367961883545, | |
| "learning_rate": 1.8507972665148065e-05, | |
| "loss": 3.8887, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.1153304002392038, | |
| "grad_norm": 11.192293167114258, | |
| "learning_rate": 1.9219817767653758e-05, | |
| "loss": 4.0215, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.11960189654435949, | |
| "grad_norm": 11.153294563293457, | |
| "learning_rate": 1.9931662870159454e-05, | |
| "loss": 3.9655, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.12387339284951518, | |
| "grad_norm": 9.78614616394043, | |
| "learning_rate": 2.064350797266515e-05, | |
| "loss": 4.0018, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.12814488915467087, | |
| "grad_norm": 11.2694730758667, | |
| "learning_rate": 2.1355353075170844e-05, | |
| "loss": 3.9469, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.12814488915467087, | |
| "eval_runtime": 404.9852, | |
| "eval_samples_per_second": 115.614, | |
| "eval_steps_per_second": 14.452, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.1324163854598266, | |
| "grad_norm": 9.227404594421387, | |
| "learning_rate": 2.2067198177676537e-05, | |
| "loss": 3.9662, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.13668788176498228, | |
| "grad_norm": 13.020267486572266, | |
| "learning_rate": 2.2779043280182233e-05, | |
| "loss": 3.9011, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.14095937807013798, | |
| "grad_norm": 8.806400299072266, | |
| "learning_rate": 2.349088838268793e-05, | |
| "loss": 3.9507, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.14523087437529367, | |
| "grad_norm": 7.716139793395996, | |
| "learning_rate": 2.4202733485193623e-05, | |
| "loss": 3.9536, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.14950237068044936, | |
| "grad_norm": 12.768115997314453, | |
| "learning_rate": 2.4914578587699316e-05, | |
| "loss": 3.9268, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.15377386698560505, | |
| "grad_norm": 10.236725807189941, | |
| "learning_rate": 2.5626423690205016e-05, | |
| "loss": 3.8231, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.15804536329076074, | |
| "grad_norm": 13.891462326049805, | |
| "learning_rate": 2.633826879271071e-05, | |
| "loss": 3.8843, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.16231685959591646, | |
| "grad_norm": 12.619128227233887, | |
| "learning_rate": 2.70501138952164e-05, | |
| "loss": 3.8942, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.16658835590107215, | |
| "grad_norm": 8.733612060546875, | |
| "learning_rate": 2.77619589977221e-05, | |
| "loss": 3.7588, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.17085985220622785, | |
| "grad_norm": 10.724875450134277, | |
| "learning_rate": 2.8473804100227792e-05, | |
| "loss": 3.8525, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.17085985220622785, | |
| "eval_runtime": 404.7816, | |
| "eval_samples_per_second": 115.672, | |
| "eval_steps_per_second": 14.46, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.17513134851138354, | |
| "grad_norm": 9.703784942626953, | |
| "learning_rate": 2.9185649202733488e-05, | |
| "loss": 3.7397, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.17940284481653923, | |
| "grad_norm": 10.537924766540527, | |
| "learning_rate": 2.989749430523918e-05, | |
| "loss": 3.8457, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.18367434112169492, | |
| "grad_norm": 13.7029447555542, | |
| "learning_rate": 3.0609339407744874e-05, | |
| "loss": 3.8889, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.18794583742685061, | |
| "grad_norm": 11.692300796508789, | |
| "learning_rate": 3.132118451025057e-05, | |
| "loss": 3.9057, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.19221733373200633, | |
| "grad_norm": 11.873428344726562, | |
| "learning_rate": 3.203302961275627e-05, | |
| "loss": 3.8959, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.19648883003716203, | |
| "grad_norm": 10.291272163391113, | |
| "learning_rate": 3.274487471526196e-05, | |
| "loss": 3.8212, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.20076032634231772, | |
| "grad_norm": 10.874945640563965, | |
| "learning_rate": 3.3456719817767654e-05, | |
| "loss": 3.8884, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.2050318226474734, | |
| "grad_norm": 15.713820457458496, | |
| "learning_rate": 3.416856492027335e-05, | |
| "loss": 3.9066, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.2093033189526291, | |
| "grad_norm": 11.526785850524902, | |
| "learning_rate": 3.488041002277905e-05, | |
| "loss": 3.7686, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.2135748152577848, | |
| "grad_norm": 10.42326545715332, | |
| "learning_rate": 3.559225512528474e-05, | |
| "loss": 3.784, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2135748152577848, | |
| "eval_runtime": 404.8034, | |
| "eval_samples_per_second": 115.666, | |
| "eval_steps_per_second": 14.459, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2178463115629405, | |
| "grad_norm": 14.717082023620605, | |
| "learning_rate": 3.630410022779043e-05, | |
| "loss": 3.8462, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.2221178078680962, | |
| "grad_norm": 11.965718269348145, | |
| "learning_rate": 3.701594533029613e-05, | |
| "loss": 3.9435, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.2263893041732519, | |
| "grad_norm": 10.752185821533203, | |
| "learning_rate": 3.7727790432801826e-05, | |
| "loss": 3.8663, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.2306608004784076, | |
| "grad_norm": 12.059910774230957, | |
| "learning_rate": 3.8439635535307516e-05, | |
| "loss": 3.7925, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.23493229678356328, | |
| "grad_norm": 9.081160545349121, | |
| "learning_rate": 3.915148063781321e-05, | |
| "loss": 3.8639, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.23920379308871897, | |
| "grad_norm": 10.45064926147461, | |
| "learning_rate": 3.986332574031891e-05, | |
| "loss": 3.8497, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.24347528939387467, | |
| "grad_norm": 15.188603401184082, | |
| "learning_rate": 4.0575170842824605e-05, | |
| "loss": 3.7753, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.24774678569903036, | |
| "grad_norm": 9.032523155212402, | |
| "learning_rate": 4.12870159453303e-05, | |
| "loss": 3.8309, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.2520182820041861, | |
| "grad_norm": 9.886519432067871, | |
| "learning_rate": 4.199886104783599e-05, | |
| "loss": 3.8508, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.25628977830934174, | |
| "grad_norm": 11.432881355285645, | |
| "learning_rate": 4.271070615034169e-05, | |
| "loss": 3.8327, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.25628977830934174, | |
| "eval_runtime": 404.7732, | |
| "eval_samples_per_second": 115.675, | |
| "eval_steps_per_second": 14.46, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.26056127461449746, | |
| "grad_norm": 10.134676933288574, | |
| "learning_rate": 4.3422551252847384e-05, | |
| "loss": 3.8419, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.2648327709196532, | |
| "grad_norm": 12.583077430725098, | |
| "learning_rate": 4.4134396355353074e-05, | |
| "loss": 3.884, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.26910426722480885, | |
| "grad_norm": 9.845976829528809, | |
| "learning_rate": 4.484624145785877e-05, | |
| "loss": 3.7787, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.27337576352996457, | |
| "grad_norm": 21.58133888244629, | |
| "learning_rate": 4.555808656036447e-05, | |
| "loss": 3.8962, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.27764725983512023, | |
| "grad_norm": 12.139480590820312, | |
| "learning_rate": 4.626993166287016e-05, | |
| "loss": 3.722, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.28191875614027595, | |
| "grad_norm": 8.343817710876465, | |
| "learning_rate": 4.698177676537586e-05, | |
| "loss": 3.8009, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.2861902524454316, | |
| "grad_norm": 17.52387809753418, | |
| "learning_rate": 4.769362186788155e-05, | |
| "loss": 3.8687, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.29046174875058733, | |
| "grad_norm": 7.540428638458252, | |
| "learning_rate": 4.8405466970387246e-05, | |
| "loss": 3.727, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.29473324505574305, | |
| "grad_norm": 11.794758796691895, | |
| "learning_rate": 4.911731207289294e-05, | |
| "loss": 3.7971, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.2990047413608987, | |
| "grad_norm": 10.799798011779785, | |
| "learning_rate": 4.982915717539863e-05, | |
| "loss": 3.8319, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.2990047413608987, | |
| "eval_runtime": 404.96, | |
| "eval_samples_per_second": 115.621, | |
| "eval_steps_per_second": 14.453, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.30327623766605444, | |
| "grad_norm": 10.189812660217285, | |
| "learning_rate": 4.993988197883213e-05, | |
| "loss": 3.8696, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.3075477339712101, | |
| "grad_norm": 9.960589408874512, | |
| "learning_rate": 4.986077931940072e-05, | |
| "loss": 3.7356, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.3118192302763658, | |
| "grad_norm": 19.682788848876953, | |
| "learning_rate": 4.978167665996931e-05, | |
| "loss": 3.8524, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.3160907265815215, | |
| "grad_norm": 10.302993774414062, | |
| "learning_rate": 4.9702574000537896e-05, | |
| "loss": 3.6824, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.3203622228866772, | |
| "grad_norm": 11.691702842712402, | |
| "learning_rate": 4.9623471341106494e-05, | |
| "loss": 3.8522, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.3246337191918329, | |
| "grad_norm": 8.01658821105957, | |
| "learning_rate": 4.954436868167508e-05, | |
| "loss": 3.8386, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.3289052154969886, | |
| "grad_norm": 8.131669998168945, | |
| "learning_rate": 4.946526602224367e-05, | |
| "loss": 3.7587, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.3331767118021443, | |
| "grad_norm": 63.0767822265625, | |
| "learning_rate": 4.938616336281226e-05, | |
| "loss": 3.9019, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.3374482081073, | |
| "grad_norm": 10.04031753540039, | |
| "learning_rate": 4.930706070338085e-05, | |
| "loss": 3.8661, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.3417197044124557, | |
| "grad_norm": 8.829032897949219, | |
| "learning_rate": 4.922795804394944e-05, | |
| "loss": 3.8105, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.3417197044124557, | |
| "eval_runtime": 405.1664, | |
| "eval_samples_per_second": 115.562, | |
| "eval_steps_per_second": 14.446, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.34599120071761136, | |
| "grad_norm": 15.314312934875488, | |
| "learning_rate": 4.914885538451803e-05, | |
| "loss": 3.6279, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.3502626970227671, | |
| "grad_norm": 8.690494537353516, | |
| "learning_rate": 4.906975272508662e-05, | |
| "loss": 3.8336, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.3545341933279228, | |
| "grad_norm": 10.526517868041992, | |
| "learning_rate": 4.8990650065655206e-05, | |
| "loss": 3.8046, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.35880568963307846, | |
| "grad_norm": 7.85405969619751, | |
| "learning_rate": 4.8911547406223804e-05, | |
| "loss": 3.7721, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.3630771859382342, | |
| "grad_norm": 11.473519325256348, | |
| "learning_rate": 4.883244474679239e-05, | |
| "loss": 3.7854, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.36734868224338985, | |
| "grad_norm": 9.746980667114258, | |
| "learning_rate": 4.875334208736098e-05, | |
| "loss": 3.7573, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.37162017854854557, | |
| "grad_norm": 9.924898147583008, | |
| "learning_rate": 4.867423942792957e-05, | |
| "loss": 3.8054, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.37589167485370123, | |
| "grad_norm": 8.137608528137207, | |
| "learning_rate": 4.859513676849816e-05, | |
| "loss": 3.8245, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.38016317115885695, | |
| "grad_norm": 8.987218856811523, | |
| "learning_rate": 4.851603410906675e-05, | |
| "loss": 3.742, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.38443466746401267, | |
| "grad_norm": 9.04791259765625, | |
| "learning_rate": 4.843693144963534e-05, | |
| "loss": 3.7748, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.38443466746401267, | |
| "eval_runtime": 404.3649, | |
| "eval_samples_per_second": 115.791, | |
| "eval_steps_per_second": 14.475, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.38870616376916833, | |
| "grad_norm": 10.426161766052246, | |
| "learning_rate": 4.835782879020393e-05, | |
| "loss": 3.823, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.39297766007432405, | |
| "grad_norm": 8.951033592224121, | |
| "learning_rate": 4.8278726130772516e-05, | |
| "loss": 3.5732, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.3972491563794797, | |
| "grad_norm": 10.766894340515137, | |
| "learning_rate": 4.819962347134111e-05, | |
| "loss": 3.7286, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.40152065268463544, | |
| "grad_norm": 9.429593086242676, | |
| "learning_rate": 4.81205208119097e-05, | |
| "loss": 3.7493, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.4057921489897911, | |
| "grad_norm": 14.518060684204102, | |
| "learning_rate": 4.804141815247829e-05, | |
| "loss": 3.81, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.4100636452949468, | |
| "grad_norm": 20.795759201049805, | |
| "learning_rate": 4.7962315493046875e-05, | |
| "loss": 3.7045, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.41433514160010254, | |
| "grad_norm": 10.095163345336914, | |
| "learning_rate": 4.788321283361547e-05, | |
| "loss": 3.7767, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.4186066379052582, | |
| "grad_norm": 14.195402145385742, | |
| "learning_rate": 4.780411017418406e-05, | |
| "loss": 3.6874, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.4228781342104139, | |
| "grad_norm": 8.357501029968262, | |
| "learning_rate": 4.772500751475265e-05, | |
| "loss": 3.6675, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.4271496305155696, | |
| "grad_norm": 8.715255737304688, | |
| "learning_rate": 4.764590485532124e-05, | |
| "loss": 3.7657, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.4271496305155696, | |
| "eval_runtime": 405.002, | |
| "eval_samples_per_second": 115.609, | |
| "eval_steps_per_second": 14.452, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.4314211268207253, | |
| "grad_norm": 11.821653366088867, | |
| "learning_rate": 4.7566802195889826e-05, | |
| "loss": 3.6386, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.435692623125881, | |
| "grad_norm": 10.1320161819458, | |
| "learning_rate": 4.748769953645842e-05, | |
| "loss": 3.808, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.4399641194310367, | |
| "grad_norm": 9.089192390441895, | |
| "learning_rate": 4.740859687702701e-05, | |
| "loss": 3.7376, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.4442356157361924, | |
| "grad_norm": 9.963017463684082, | |
| "learning_rate": 4.73294942175956e-05, | |
| "loss": 3.8257, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.4485071120413481, | |
| "grad_norm": 9.806687355041504, | |
| "learning_rate": 4.7250391558164185e-05, | |
| "loss": 3.7903, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.4527786083465038, | |
| "grad_norm": 10.770492553710938, | |
| "learning_rate": 4.7171288898732776e-05, | |
| "loss": 3.7205, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.45705010465165946, | |
| "grad_norm": 11.635176658630371, | |
| "learning_rate": 4.709218623930137e-05, | |
| "loss": 3.6938, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.4613216009568152, | |
| "grad_norm": 8.8875732421875, | |
| "learning_rate": 4.701308357986996e-05, | |
| "loss": 3.7123, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.46559309726197085, | |
| "grad_norm": 7.193441867828369, | |
| "learning_rate": 4.6933980920438544e-05, | |
| "loss": 3.6563, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.46986459356712656, | |
| "grad_norm": 9.051284790039062, | |
| "learning_rate": 4.6854878261007135e-05, | |
| "loss": 3.7284, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.46986459356712656, | |
| "eval_runtime": 405.9538, | |
| "eval_samples_per_second": 115.338, | |
| "eval_steps_per_second": 14.418, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.4741360898722823, | |
| "grad_norm": 12.352762222290039, | |
| "learning_rate": 4.677577560157573e-05, | |
| "loss": 3.789, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.47840758617743795, | |
| "grad_norm": 9.574907302856445, | |
| "learning_rate": 4.669667294214432e-05, | |
| "loss": 3.7442, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.48267908248259367, | |
| "grad_norm": 9.639921188354492, | |
| "learning_rate": 4.661757028271291e-05, | |
| "loss": 3.7511, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.48695057878774933, | |
| "grad_norm": 10.295048713684082, | |
| "learning_rate": 4.6538467623281494e-05, | |
| "loss": 3.7032, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.49122207509290505, | |
| "grad_norm": 8.52436351776123, | |
| "learning_rate": 4.6459364963850086e-05, | |
| "loss": 3.7266, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.4954935713980607, | |
| "grad_norm": 12.574061393737793, | |
| "learning_rate": 4.638026230441868e-05, | |
| "loss": 3.7196, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.49976506770321644, | |
| "grad_norm": 12.421334266662598, | |
| "learning_rate": 4.630115964498727e-05, | |
| "loss": 3.6734, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.5040365640083722, | |
| "grad_norm": 14.321782112121582, | |
| "learning_rate": 4.6222056985555854e-05, | |
| "loss": 3.8048, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.5083080603135278, | |
| "grad_norm": 12.409293174743652, | |
| "learning_rate": 4.614295432612445e-05, | |
| "loss": 3.7366, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.5125795566186835, | |
| "grad_norm": 9.002853393554688, | |
| "learning_rate": 4.6063851666693037e-05, | |
| "loss": 3.7532, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.5125795566186835, | |
| "eval_runtime": 404.8624, | |
| "eval_samples_per_second": 115.649, | |
| "eval_steps_per_second": 14.457, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.5168510529238393, | |
| "grad_norm": 8.214455604553223, | |
| "learning_rate": 4.598474900726162e-05, | |
| "loss": 3.7221, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.5211225492289949, | |
| "grad_norm": 7.018616199493408, | |
| "learning_rate": 4.590564634783022e-05, | |
| "loss": 3.6718, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.5253940455341506, | |
| "grad_norm": 12.610575675964355, | |
| "learning_rate": 4.5826543688398804e-05, | |
| "loss": 3.7282, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.5296655418393064, | |
| "grad_norm": 10.168871879577637, | |
| "learning_rate": 4.5747441028967396e-05, | |
| "loss": 3.7455, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.533937038144462, | |
| "grad_norm": 9.421287536621094, | |
| "learning_rate": 4.566833836953599e-05, | |
| "loss": 3.8177, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.5382085344496177, | |
| "grad_norm": 11.314359664916992, | |
| "learning_rate": 4.558923571010458e-05, | |
| "loss": 3.7797, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.5424800307547734, | |
| "grad_norm": 10.510274887084961, | |
| "learning_rate": 4.5510133050673163e-05, | |
| "loss": 3.7639, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.5467515270599291, | |
| "grad_norm": 14.740921020507812, | |
| "learning_rate": 4.5431030391241755e-05, | |
| "loss": 3.8299, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.5510230233650848, | |
| "grad_norm": 7.322781562805176, | |
| "learning_rate": 4.5351927731810346e-05, | |
| "loss": 3.8357, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.5552945196702405, | |
| "grad_norm": 10.399321556091309, | |
| "learning_rate": 4.527282507237894e-05, | |
| "loss": 3.6613, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.5552945196702405, | |
| "eval_runtime": 404.2488, | |
| "eval_samples_per_second": 115.825, | |
| "eval_steps_per_second": 14.479, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.5595660159753962, | |
| "grad_norm": 8.805388450622559, | |
| "learning_rate": 4.519372241294752e-05, | |
| "loss": 3.7299, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.5638375122805519, | |
| "grad_norm": 7.358932018280029, | |
| "learning_rate": 4.5114619753516114e-05, | |
| "loss": 3.7956, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.5681090085857076, | |
| "grad_norm": 8.338736534118652, | |
| "learning_rate": 4.5035517094084706e-05, | |
| "loss": 3.8274, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.5723805048908632, | |
| "grad_norm": 9.217434883117676, | |
| "learning_rate": 4.495641443465329e-05, | |
| "loss": 3.8071, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.576652001196019, | |
| "grad_norm": 8.726869583129883, | |
| "learning_rate": 4.487731177522189e-05, | |
| "loss": 3.7296, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.5809234975011747, | |
| "grad_norm": 11.813636779785156, | |
| "learning_rate": 4.479820911579047e-05, | |
| "loss": 3.8608, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.5851949938063303, | |
| "grad_norm": 11.595725059509277, | |
| "learning_rate": 4.4719106456359065e-05, | |
| "loss": 3.7096, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.5894664901114861, | |
| "grad_norm": 9.259355545043945, | |
| "learning_rate": 4.4640003796927656e-05, | |
| "loss": 3.6732, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.5937379864166418, | |
| "grad_norm": 13.34984016418457, | |
| "learning_rate": 4.456090113749625e-05, | |
| "loss": 3.8131, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.5980094827217974, | |
| "grad_norm": 10.516456604003906, | |
| "learning_rate": 4.448179847806483e-05, | |
| "loss": 3.7439, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.5980094827217974, | |
| "eval_runtime": 404.1711, | |
| "eval_samples_per_second": 115.847, | |
| "eval_steps_per_second": 14.481, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.6022809790269531, | |
| "grad_norm": 12.842930793762207, | |
| "learning_rate": 4.4402695818633424e-05, | |
| "loss": 3.7682, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.6065524753321089, | |
| "grad_norm": 19.421875, | |
| "learning_rate": 4.4323593159202015e-05, | |
| "loss": 3.663, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.6108239716372645, | |
| "grad_norm": 7.454352855682373, | |
| "learning_rate": 4.42444904997706e-05, | |
| "loss": 3.7463, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.6150954679424202, | |
| "grad_norm": 13.48552131652832, | |
| "learning_rate": 4.41653878403392e-05, | |
| "loss": 3.649, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.619366964247576, | |
| "grad_norm": 11.968147277832031, | |
| "learning_rate": 4.408628518090778e-05, | |
| "loss": 3.7516, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.6236384605527316, | |
| "grad_norm": 12.150687217712402, | |
| "learning_rate": 4.4007182521476375e-05, | |
| "loss": 3.7322, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.6279099568578873, | |
| "grad_norm": 8.789100646972656, | |
| "learning_rate": 4.3928079862044966e-05, | |
| "loss": 3.6886, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.632181453163043, | |
| "grad_norm": 10.21249008178711, | |
| "learning_rate": 4.384897720261356e-05, | |
| "loss": 3.6862, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.6364529494681987, | |
| "grad_norm": 9.880081176757812, | |
| "learning_rate": 4.376987454318214e-05, | |
| "loss": 3.6766, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.6407244457733544, | |
| "grad_norm": 11.831520080566406, | |
| "learning_rate": 4.3690771883750734e-05, | |
| "loss": 3.645, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.6407244457733544, | |
| "eval_runtime": 404.5696, | |
| "eval_samples_per_second": 115.733, | |
| "eval_steps_per_second": 14.467, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.6449959420785101, | |
| "grad_norm": 8.91051959991455, | |
| "learning_rate": 4.3611669224319325e-05, | |
| "loss": 3.6847, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.6492674383836659, | |
| "grad_norm": 9.260310173034668, | |
| "learning_rate": 4.353256656488791e-05, | |
| "loss": 3.7197, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.6535389346888215, | |
| "grad_norm": 10.138089179992676, | |
| "learning_rate": 4.34534639054565e-05, | |
| "loss": 3.6529, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.6578104309939772, | |
| "grad_norm": 8.813399314880371, | |
| "learning_rate": 4.337436124602509e-05, | |
| "loss": 3.6541, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.6620819272991328, | |
| "grad_norm": 9.144048690795898, | |
| "learning_rate": 4.3295258586593684e-05, | |
| "loss": 3.5101, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.6663534236042886, | |
| "grad_norm": 8.948995590209961, | |
| "learning_rate": 4.321615592716227e-05, | |
| "loss": 3.7848, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.6706249199094443, | |
| "grad_norm": 9.42180061340332, | |
| "learning_rate": 4.313705326773087e-05, | |
| "loss": 3.5926, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.6748964162146, | |
| "grad_norm": 8.974250793457031, | |
| "learning_rate": 4.305795060829945e-05, | |
| "loss": 3.6967, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.6791679125197557, | |
| "grad_norm": 12.110358238220215, | |
| "learning_rate": 4.2978847948868044e-05, | |
| "loss": 3.6521, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.6834394088249114, | |
| "grad_norm": 9.907513618469238, | |
| "learning_rate": 4.2899745289436635e-05, | |
| "loss": 3.703, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.6834394088249114, | |
| "eval_runtime": 404.4023, | |
| "eval_samples_per_second": 115.781, | |
| "eval_steps_per_second": 14.473, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.687710905130067, | |
| "grad_norm": 8.421221733093262, | |
| "learning_rate": 4.2820642630005226e-05, | |
| "loss": 3.5446, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.6919824014352227, | |
| "grad_norm": 8.890350341796875, | |
| "learning_rate": 4.274153997057381e-05, | |
| "loss": 3.7051, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.6962538977403785, | |
| "grad_norm": 7.985939979553223, | |
| "learning_rate": 4.26624373111424e-05, | |
| "loss": 3.7894, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.7005253940455342, | |
| "grad_norm": 9.559375762939453, | |
| "learning_rate": 4.2583334651710994e-05, | |
| "loss": 3.7158, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.7047968903506898, | |
| "grad_norm": 9.920624732971191, | |
| "learning_rate": 4.250423199227958e-05, | |
| "loss": 3.7286, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.7090683866558456, | |
| "grad_norm": 13.251816749572754, | |
| "learning_rate": 4.242512933284817e-05, | |
| "loss": 3.7271, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.7133398829610013, | |
| "grad_norm": 11.248520851135254, | |
| "learning_rate": 4.234602667341676e-05, | |
| "loss": 3.6633, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.7176113792661569, | |
| "grad_norm": 7.556328296661377, | |
| "learning_rate": 4.226692401398535e-05, | |
| "loss": 3.6706, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.7218828755713126, | |
| "grad_norm": 6.298122406005859, | |
| "learning_rate": 4.218782135455394e-05, | |
| "loss": 3.7307, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.7261543718764684, | |
| "grad_norm": 10.17672061920166, | |
| "learning_rate": 4.2108718695122536e-05, | |
| "loss": 3.7274, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.7261543718764684, | |
| "eval_runtime": 403.9594, | |
| "eval_samples_per_second": 115.908, | |
| "eval_steps_per_second": 14.489, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.730425868181624, | |
| "grad_norm": 8.036096572875977, | |
| "learning_rate": 4.202961603569112e-05, | |
| "loss": 3.641, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.7346973644867797, | |
| "grad_norm": 8.982136726379395, | |
| "learning_rate": 4.195051337625971e-05, | |
| "loss": 3.7079, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.7389688607919355, | |
| "grad_norm": 11.923723220825195, | |
| "learning_rate": 4.1871410716828304e-05, | |
| "loss": 3.7252, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.7432403570970911, | |
| "grad_norm": 10.336372375488281, | |
| "learning_rate": 4.179230805739689e-05, | |
| "loss": 3.6793, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.7475118534022468, | |
| "grad_norm": 10.367877960205078, | |
| "learning_rate": 4.171320539796548e-05, | |
| "loss": 3.6833, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.7517833497074025, | |
| "grad_norm": 8.473801612854004, | |
| "learning_rate": 4.163410273853407e-05, | |
| "loss": 3.6678, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.7560548460125582, | |
| "grad_norm": 7.864530563354492, | |
| "learning_rate": 4.155500007910266e-05, | |
| "loss": 3.6848, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.7603263423177139, | |
| "grad_norm": 10.16886043548584, | |
| "learning_rate": 4.147589741967125e-05, | |
| "loss": 3.8073, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.7645978386228696, | |
| "grad_norm": 10.161076545715332, | |
| "learning_rate": 4.1396794760239846e-05, | |
| "loss": 3.7171, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.7688693349280253, | |
| "grad_norm": 9.742669105529785, | |
| "learning_rate": 4.131769210080843e-05, | |
| "loss": 3.6089, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.7688693349280253, | |
| "eval_runtime": 403.6413, | |
| "eval_samples_per_second": 115.999, | |
| "eval_steps_per_second": 14.5, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.773140831233181, | |
| "grad_norm": 7.834203243255615, | |
| "learning_rate": 4.123858944137702e-05, | |
| "loss": 3.7618, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.7774123275383367, | |
| "grad_norm": 8.670865058898926, | |
| "learning_rate": 4.1159486781945614e-05, | |
| "loss": 3.5937, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.7816838238434923, | |
| "grad_norm": 10.17273998260498, | |
| "learning_rate": 4.10803841225142e-05, | |
| "loss": 3.6682, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.7859553201486481, | |
| "grad_norm": 7.384734630584717, | |
| "learning_rate": 4.100128146308279e-05, | |
| "loss": 3.7461, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.7902268164538038, | |
| "grad_norm": 9.957521438598633, | |
| "learning_rate": 4.092217880365138e-05, | |
| "loss": 3.646, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.7944983127589594, | |
| "grad_norm": 9.32741928100586, | |
| "learning_rate": 4.084307614421997e-05, | |
| "loss": 3.7085, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.7987698090641152, | |
| "grad_norm": 8.64340591430664, | |
| "learning_rate": 4.076397348478856e-05, | |
| "loss": 3.7954, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.8030413053692709, | |
| "grad_norm": 8.776473999023438, | |
| "learning_rate": 4.068487082535715e-05, | |
| "loss": 3.7171, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.8073128016744265, | |
| "grad_norm": 13.726973533630371, | |
| "learning_rate": 4.060576816592574e-05, | |
| "loss": 3.7593, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.8115842979795822, | |
| "grad_norm": 8.291767120361328, | |
| "learning_rate": 4.052666550649433e-05, | |
| "loss": 3.6206, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.8115842979795822, | |
| "eval_runtime": 404.755, | |
| "eval_samples_per_second": 115.68, | |
| "eval_steps_per_second": 14.461, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.815855794284738, | |
| "grad_norm": 9.087343215942383, | |
| "learning_rate": 4.044756284706292e-05, | |
| "loss": 3.6748, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.8201272905898936, | |
| "grad_norm": 7.206308364868164, | |
| "learning_rate": 4.0368460187631515e-05, | |
| "loss": 3.6976, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.8243987868950493, | |
| "grad_norm": 13.21072006225586, | |
| "learning_rate": 4.02893575282001e-05, | |
| "loss": 3.7547, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.8286702832002051, | |
| "grad_norm": 11.711308479309082, | |
| "learning_rate": 4.0210254868768685e-05, | |
| "loss": 3.7046, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.8329417795053607, | |
| "grad_norm": 7.493105411529541, | |
| "learning_rate": 4.013115220933728e-05, | |
| "loss": 3.6605, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.8372132758105164, | |
| "grad_norm": 10.920802116394043, | |
| "learning_rate": 4.005204954990587e-05, | |
| "loss": 3.701, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.8414847721156721, | |
| "grad_norm": 8.588319778442383, | |
| "learning_rate": 3.997294689047446e-05, | |
| "loss": 3.7119, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.8457562684208279, | |
| "grad_norm": 9.688274383544922, | |
| "learning_rate": 3.989384423104305e-05, | |
| "loss": 3.6968, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.8500277647259835, | |
| "grad_norm": 13.46649169921875, | |
| "learning_rate": 3.981474157161164e-05, | |
| "loss": 3.6608, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.8542992610311392, | |
| "grad_norm": 9.020798683166504, | |
| "learning_rate": 3.973563891218023e-05, | |
| "loss": 3.7014, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.8542992610311392, | |
| "eval_runtime": 404.5707, | |
| "eval_samples_per_second": 115.733, | |
| "eval_steps_per_second": 14.467, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.858570757336295, | |
| "grad_norm": 7.667457103729248, | |
| "learning_rate": 3.9656536252748825e-05, | |
| "loss": 3.6261, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.8628422536414506, | |
| "grad_norm": 10.752103805541992, | |
| "learning_rate": 3.957743359331741e-05, | |
| "loss": 3.7678, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.8671137499466063, | |
| "grad_norm": 8.758957862854004, | |
| "learning_rate": 3.9498330933885994e-05, | |
| "loss": 3.8176, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.871385246251762, | |
| "grad_norm": 9.372211456298828, | |
| "learning_rate": 3.941922827445459e-05, | |
| "loss": 3.6383, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.8756567425569177, | |
| "grad_norm": 10.67364501953125, | |
| "learning_rate": 3.934012561502318e-05, | |
| "loss": 3.7067, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.8799282388620734, | |
| "grad_norm": 12.151751518249512, | |
| "learning_rate": 3.926102295559177e-05, | |
| "loss": 3.7826, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.884199735167229, | |
| "grad_norm": 7.820495128631592, | |
| "learning_rate": 3.918192029616036e-05, | |
| "loss": 3.6867, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.8884712314723848, | |
| "grad_norm": 9.453180313110352, | |
| "learning_rate": 3.910281763672895e-05, | |
| "loss": 3.7301, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.8927427277775405, | |
| "grad_norm": 11.202925682067871, | |
| "learning_rate": 3.9023714977297536e-05, | |
| "loss": 3.6845, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.8970142240826962, | |
| "grad_norm": 13.49270248413086, | |
| "learning_rate": 3.894461231786613e-05, | |
| "loss": 3.7193, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.8970142240826962, | |
| "eval_runtime": 403.5135, | |
| "eval_samples_per_second": 116.036, | |
| "eval_steps_per_second": 14.505, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.9012857203878518, | |
| "grad_norm": 8.086437225341797, | |
| "learning_rate": 3.886550965843472e-05, | |
| "loss": 3.6406, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.9055572166930076, | |
| "grad_norm": 10.620895385742188, | |
| "learning_rate": 3.878640699900331e-05, | |
| "loss": 3.762, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.9098287129981633, | |
| "grad_norm": 6.320925712585449, | |
| "learning_rate": 3.8707304339571896e-05, | |
| "loss": 3.7283, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.9141002093033189, | |
| "grad_norm": 8.072772026062012, | |
| "learning_rate": 3.862820168014049e-05, | |
| "loss": 3.6657, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.9183717056084747, | |
| "grad_norm": 8.310846328735352, | |
| "learning_rate": 3.854909902070908e-05, | |
| "loss": 3.7271, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.9226432019136304, | |
| "grad_norm": 6.958920478820801, | |
| "learning_rate": 3.846999636127766e-05, | |
| "loss": 3.8027, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.926914698218786, | |
| "grad_norm": 10.530051231384277, | |
| "learning_rate": 3.839089370184626e-05, | |
| "loss": 3.6227, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.9311861945239417, | |
| "grad_norm": 9.503134727478027, | |
| "learning_rate": 3.8311791042414846e-05, | |
| "loss": 3.7262, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.9354576908290975, | |
| "grad_norm": 8.891386985778809, | |
| "learning_rate": 3.823268838298344e-05, | |
| "loss": 3.6882, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.9397291871342531, | |
| "grad_norm": 9.793424606323242, | |
| "learning_rate": 3.815358572355203e-05, | |
| "loss": 3.7304, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.9397291871342531, | |
| "eval_runtime": 404.4356, | |
| "eval_samples_per_second": 115.771, | |
| "eval_steps_per_second": 14.472, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.9440006834394088, | |
| "grad_norm": 8.443710327148438, | |
| "learning_rate": 3.807448306412062e-05, | |
| "loss": 3.6348, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.9482721797445646, | |
| "grad_norm": 8.634336471557617, | |
| "learning_rate": 3.7995380404689205e-05, | |
| "loss": 3.6732, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.9525436760497202, | |
| "grad_norm": 9.720598220825195, | |
| "learning_rate": 3.79162777452578e-05, | |
| "loss": 3.7, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.9568151723548759, | |
| "grad_norm": 9.388401985168457, | |
| "learning_rate": 3.783717508582639e-05, | |
| "loss": 3.646, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.9610866686600316, | |
| "grad_norm": 8.947492599487305, | |
| "learning_rate": 3.775807242639497e-05, | |
| "loss": 3.5409, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.9653581649651873, | |
| "grad_norm": 9.596850395202637, | |
| "learning_rate": 3.7678969766963565e-05, | |
| "loss": 3.6636, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.969629661270343, | |
| "grad_norm": 12.37696361541748, | |
| "learning_rate": 3.7599867107532156e-05, | |
| "loss": 3.7021, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.9739011575754987, | |
| "grad_norm": 8.767573356628418, | |
| "learning_rate": 3.752076444810075e-05, | |
| "loss": 3.6152, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.9781726538806544, | |
| "grad_norm": 8.559804916381836, | |
| "learning_rate": 3.744166178866933e-05, | |
| "loss": 3.6866, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.9824441501858101, | |
| "grad_norm": 7.025428771972656, | |
| "learning_rate": 3.736255912923793e-05, | |
| "loss": 3.7933, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.9824441501858101, | |
| "eval_runtime": 403.8391, | |
| "eval_samples_per_second": 115.942, | |
| "eval_steps_per_second": 14.493, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.9867156464909658, | |
| "grad_norm": 8.558939933776855, | |
| "learning_rate": 3.7283456469806515e-05, | |
| "loss": 3.6242, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.9909871427961214, | |
| "grad_norm": 7.838054656982422, | |
| "learning_rate": 3.720435381037511e-05, | |
| "loss": 3.7873, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.9952586391012772, | |
| "grad_norm": 9.238251686096191, | |
| "learning_rate": 3.71252511509437e-05, | |
| "loss": 3.7437, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.9995301354064329, | |
| "grad_norm": 8.105572700500488, | |
| "learning_rate": 3.704614849151229e-05, | |
| "loss": 3.5954, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.0038016317115885, | |
| "grad_norm": 8.4044771194458, | |
| "learning_rate": 3.6967045832080874e-05, | |
| "loss": 3.6959, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.0080731280167443, | |
| "grad_norm": 7.410630702972412, | |
| "learning_rate": 3.6887943172649466e-05, | |
| "loss": 3.5738, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.0123446243218999, | |
| "grad_norm": 13.264152526855469, | |
| "learning_rate": 3.680884051321806e-05, | |
| "loss": 3.7171, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.0166161206270556, | |
| "grad_norm": 10.43344783782959, | |
| "learning_rate": 3.672973785378664e-05, | |
| "loss": 3.7067, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.0208876169322114, | |
| "grad_norm": 10.395238876342773, | |
| "learning_rate": 3.665063519435524e-05, | |
| "loss": 3.7069, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.025159113237367, | |
| "grad_norm": 9.611321449279785, | |
| "learning_rate": 3.6571532534923825e-05, | |
| "loss": 3.6583, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.025159113237367, | |
| "eval_runtime": 403.723, | |
| "eval_samples_per_second": 115.976, | |
| "eval_steps_per_second": 14.498, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.0294306095425227, | |
| "grad_norm": 7.200385570526123, | |
| "learning_rate": 3.6492429875492417e-05, | |
| "loss": 3.6306, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.0337021058476785, | |
| "grad_norm": 9.20836067199707, | |
| "learning_rate": 3.641332721606101e-05, | |
| "loss": 3.6762, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.037973602152834, | |
| "grad_norm": 7.563958644866943, | |
| "learning_rate": 3.63342245566296e-05, | |
| "loss": 3.7167, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.0422450984579898, | |
| "grad_norm": 13.854744911193848, | |
| "learning_rate": 3.6255121897198184e-05, | |
| "loss": 3.618, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.0465165947631456, | |
| "grad_norm": 7.969038963317871, | |
| "learning_rate": 3.6176019237766776e-05, | |
| "loss": 3.722, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.0507880910683012, | |
| "grad_norm": 9.738038063049316, | |
| "learning_rate": 3.609691657833537e-05, | |
| "loss": 3.7552, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.055059587373457, | |
| "grad_norm": 10.093921661376953, | |
| "learning_rate": 3.601781391890395e-05, | |
| "loss": 3.7502, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.0593310836786127, | |
| "grad_norm": 6.876298427581787, | |
| "learning_rate": 3.593871125947254e-05, | |
| "loss": 3.6343, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.0636025799837683, | |
| "grad_norm": 7.968320846557617, | |
| "learning_rate": 3.5859608600041135e-05, | |
| "loss": 3.6845, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.067874076288924, | |
| "grad_norm": 9.148797988891602, | |
| "learning_rate": 3.5780505940609726e-05, | |
| "loss": 3.6106, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.067874076288924, | |
| "eval_runtime": 403.8237, | |
| "eval_samples_per_second": 115.947, | |
| "eval_steps_per_second": 14.494, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.0721455725940796, | |
| "grad_norm": 9.72523307800293, | |
| "learning_rate": 3.570140328117831e-05, | |
| "loss": 3.6919, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.0764170688992354, | |
| "grad_norm": 9.23186206817627, | |
| "learning_rate": 3.562230062174691e-05, | |
| "loss": 3.6378, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.0806885652043912, | |
| "grad_norm": 8.581460952758789, | |
| "learning_rate": 3.5543197962315494e-05, | |
| "loss": 3.7005, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.0849600615095467, | |
| "grad_norm": 9.83565902709961, | |
| "learning_rate": 3.5464095302884085e-05, | |
| "loss": 3.6315, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.0892315578147025, | |
| "grad_norm": 7.4981770515441895, | |
| "learning_rate": 3.538499264345268e-05, | |
| "loss": 3.703, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.0935030541198583, | |
| "grad_norm": 9.84447193145752, | |
| "learning_rate": 3.530588998402126e-05, | |
| "loss": 3.703, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.0977745504250138, | |
| "grad_norm": 11.77198600769043, | |
| "learning_rate": 3.522678732458985e-05, | |
| "loss": 3.74, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.1020460467301696, | |
| "grad_norm": 9.35525131225586, | |
| "learning_rate": 3.5147684665158445e-05, | |
| "loss": 3.6574, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.1063175430353254, | |
| "grad_norm": 11.326153755187988, | |
| "learning_rate": 3.5068582005727036e-05, | |
| "loss": 3.6052, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.110589039340481, | |
| "grad_norm": 8.957196235656738, | |
| "learning_rate": 3.498947934629562e-05, | |
| "loss": 3.6924, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.110589039340481, | |
| "eval_runtime": 404.7232, | |
| "eval_samples_per_second": 115.689, | |
| "eval_steps_per_second": 14.462, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.1148605356456367, | |
| "grad_norm": 8.46112060546875, | |
| "learning_rate": 3.491037668686421e-05, | |
| "loss": 3.5975, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 1.1191320319507922, | |
| "grad_norm": 10.088958740234375, | |
| "learning_rate": 3.4831274027432804e-05, | |
| "loss": 3.6885, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 1.123403528255948, | |
| "grad_norm": 8.147522926330566, | |
| "learning_rate": 3.4752171368001395e-05, | |
| "loss": 3.587, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 1.1276750245611038, | |
| "grad_norm": 9.306445121765137, | |
| "learning_rate": 3.467306870856999e-05, | |
| "loss": 3.6943, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 1.1319465208662594, | |
| "grad_norm": 7.206762790679932, | |
| "learning_rate": 3.459396604913858e-05, | |
| "loss": 3.8088, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.1362180171714151, | |
| "grad_norm": 7.733761787414551, | |
| "learning_rate": 3.451486338970716e-05, | |
| "loss": 3.6604, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 1.140489513476571, | |
| "grad_norm": 12.075209617614746, | |
| "learning_rate": 3.4435760730275754e-05, | |
| "loss": 3.6716, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 1.1447610097817265, | |
| "grad_norm": 13.957979202270508, | |
| "learning_rate": 3.4356658070844346e-05, | |
| "loss": 3.6605, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 1.1490325060868822, | |
| "grad_norm": 6.747539520263672, | |
| "learning_rate": 3.427755541141293e-05, | |
| "loss": 3.6524, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 1.153304002392038, | |
| "grad_norm": 7.960626602172852, | |
| "learning_rate": 3.419845275198152e-05, | |
| "loss": 3.6574, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.153304002392038, | |
| "eval_runtime": 403.5226, | |
| "eval_samples_per_second": 116.033, | |
| "eval_steps_per_second": 14.505, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.1575754986971936, | |
| "grad_norm": 6.446537971496582, | |
| "learning_rate": 3.4119350092550114e-05, | |
| "loss": 3.6968, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 1.1618469950023493, | |
| "grad_norm": 8.360943794250488, | |
| "learning_rate": 3.4040247433118705e-05, | |
| "loss": 3.7481, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 1.1661184913075051, | |
| "grad_norm": 9.145593643188477, | |
| "learning_rate": 3.396114477368729e-05, | |
| "loss": 3.7588, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 1.1703899876126607, | |
| "grad_norm": 11.084358215332031, | |
| "learning_rate": 3.388204211425589e-05, | |
| "loss": 3.6393, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 1.1746614839178164, | |
| "grad_norm": 11.370959281921387, | |
| "learning_rate": 3.380293945482447e-05, | |
| "loss": 3.6566, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.1789329802229722, | |
| "grad_norm": 9.31324291229248, | |
| "learning_rate": 3.372383679539306e-05, | |
| "loss": 3.5833, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 1.1832044765281278, | |
| "grad_norm": 10.302188873291016, | |
| "learning_rate": 3.3644734135961656e-05, | |
| "loss": 3.6591, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 1.1874759728332835, | |
| "grad_norm": 9.487174034118652, | |
| "learning_rate": 3.356563147653024e-05, | |
| "loss": 3.6245, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 1.191747469138439, | |
| "grad_norm": 8.596895217895508, | |
| "learning_rate": 3.348652881709883e-05, | |
| "loss": 3.7252, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 1.1960189654435949, | |
| "grad_norm": 8.368951797485352, | |
| "learning_rate": 3.3407426157667423e-05, | |
| "loss": 3.7371, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.1960189654435949, | |
| "eval_runtime": 404.648, | |
| "eval_samples_per_second": 115.71, | |
| "eval_steps_per_second": 14.464, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.2002904617487506, | |
| "grad_norm": 15.659690856933594, | |
| "learning_rate": 3.3328323498236015e-05, | |
| "loss": 3.5971, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 1.2045619580539062, | |
| "grad_norm": 7.678028106689453, | |
| "learning_rate": 3.32492208388046e-05, | |
| "loss": 3.722, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 1.208833454359062, | |
| "grad_norm": 7.527515888214111, | |
| "learning_rate": 3.317011817937319e-05, | |
| "loss": 3.6987, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 1.2131049506642178, | |
| "grad_norm": 7.842383861541748, | |
| "learning_rate": 3.309101551994178e-05, | |
| "loss": 3.6394, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 1.2173764469693733, | |
| "grad_norm": 7.929213523864746, | |
| "learning_rate": 3.3011912860510374e-05, | |
| "loss": 3.6538, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.221647943274529, | |
| "grad_norm": 10.908970832824707, | |
| "learning_rate": 3.293281020107896e-05, | |
| "loss": 3.8107, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 1.2259194395796849, | |
| "grad_norm": 13.46042251586914, | |
| "learning_rate": 3.285370754164755e-05, | |
| "loss": 3.7662, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 1.2301909358848404, | |
| "grad_norm": 9.317851066589355, | |
| "learning_rate": 3.277460488221614e-05, | |
| "loss": 3.7211, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 1.2344624321899962, | |
| "grad_norm": 8.503124237060547, | |
| "learning_rate": 3.2695502222784726e-05, | |
| "loss": 3.5346, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 1.2387339284951517, | |
| "grad_norm": 7.7580718994140625, | |
| "learning_rate": 3.2616399563353325e-05, | |
| "loss": 3.7839, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.2387339284951517, | |
| "eval_runtime": 404.9737, | |
| "eval_samples_per_second": 115.617, | |
| "eval_steps_per_second": 14.453, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.2430054248003075, | |
| "grad_norm": 6.63585090637207, | |
| "learning_rate": 3.253729690392191e-05, | |
| "loss": 3.7525, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 1.2472769211054633, | |
| "grad_norm": 8.438802719116211, | |
| "learning_rate": 3.24581942444905e-05, | |
| "loss": 3.6626, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 1.251548417410619, | |
| "grad_norm": 10.101741790771484, | |
| "learning_rate": 3.237909158505909e-05, | |
| "loss": 3.6074, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 1.2558199137157746, | |
| "grad_norm": 7.5071797370910645, | |
| "learning_rate": 3.2299988925627684e-05, | |
| "loss": 3.6081, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 1.2600914100209304, | |
| "grad_norm": 8.632312774658203, | |
| "learning_rate": 3.222088626619627e-05, | |
| "loss": 3.5992, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.264362906326086, | |
| "grad_norm": 8.449037551879883, | |
| "learning_rate": 3.214178360676487e-05, | |
| "loss": 3.5541, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 1.2686344026312417, | |
| "grad_norm": 7.799576759338379, | |
| "learning_rate": 3.206268094733345e-05, | |
| "loss": 3.6541, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 1.2729058989363975, | |
| "grad_norm": 11.673965454101562, | |
| "learning_rate": 3.1983578287902036e-05, | |
| "loss": 3.6498, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 1.277177395241553, | |
| "grad_norm": 8.359036445617676, | |
| "learning_rate": 3.1904475628470635e-05, | |
| "loss": 3.6494, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 1.2814488915467088, | |
| "grad_norm": 13.046087265014648, | |
| "learning_rate": 3.182537296903922e-05, | |
| "loss": 3.8596, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.2814488915467088, | |
| "eval_runtime": 404.5555, | |
| "eval_samples_per_second": 115.737, | |
| "eval_steps_per_second": 14.468, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.2857203878518644, | |
| "grad_norm": 10.439358711242676, | |
| "learning_rate": 3.174627030960781e-05, | |
| "loss": 3.7013, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 1.2899918841570202, | |
| "grad_norm": 7.784947395324707, | |
| "learning_rate": 3.16671676501764e-05, | |
| "loss": 3.7573, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 1.294263380462176, | |
| "grad_norm": 11.142706871032715, | |
| "learning_rate": 3.1588064990744994e-05, | |
| "loss": 3.6061, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 1.2985348767673317, | |
| "grad_norm": 8.045978546142578, | |
| "learning_rate": 3.150896233131358e-05, | |
| "loss": 3.6615, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 1.3028063730724873, | |
| "grad_norm": 8.409544944763184, | |
| "learning_rate": 3.142985967188217e-05, | |
| "loss": 3.6075, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.307077869377643, | |
| "grad_norm": 10.13918685913086, | |
| "learning_rate": 3.135075701245076e-05, | |
| "loss": 3.6318, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 1.3113493656827986, | |
| "grad_norm": 10.452644348144531, | |
| "learning_rate": 3.1271654353019346e-05, | |
| "loss": 3.6653, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 1.3156208619879544, | |
| "grad_norm": 8.69783878326416, | |
| "learning_rate": 3.119255169358794e-05, | |
| "loss": 3.5687, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 1.3198923582931101, | |
| "grad_norm": 9.234668731689453, | |
| "learning_rate": 3.111344903415653e-05, | |
| "loss": 3.6629, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 1.324163854598266, | |
| "grad_norm": 7.855345249176025, | |
| "learning_rate": 3.103434637472512e-05, | |
| "loss": 3.7406, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.324163854598266, | |
| "eval_runtime": 404.076, | |
| "eval_samples_per_second": 115.874, | |
| "eval_steps_per_second": 14.485, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.3284353509034215, | |
| "grad_norm": 13.292342185974121, | |
| "learning_rate": 3.0955243715293705e-05, | |
| "loss": 3.7277, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 1.3327068472085772, | |
| "grad_norm": 14.126769065856934, | |
| "learning_rate": 3.0876141055862304e-05, | |
| "loss": 3.8065, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.3369783435137328, | |
| "grad_norm": 7.635355472564697, | |
| "learning_rate": 3.079703839643089e-05, | |
| "loss": 3.6653, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 1.3412498398188886, | |
| "grad_norm": 9.28641128540039, | |
| "learning_rate": 3.071793573699948e-05, | |
| "loss": 3.7125, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 1.3455213361240443, | |
| "grad_norm": 8.960599899291992, | |
| "learning_rate": 3.063883307756807e-05, | |
| "loss": 3.639, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.3497928324292, | |
| "grad_norm": 10.085050582885742, | |
| "learning_rate": 3.055973041813666e-05, | |
| "loss": 3.7317, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.3540643287343557, | |
| "grad_norm": 8.527816772460938, | |
| "learning_rate": 3.048062775870525e-05, | |
| "loss": 3.6044, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 1.3583358250395112, | |
| "grad_norm": 11.678420066833496, | |
| "learning_rate": 3.0401525099273835e-05, | |
| "loss": 3.572, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 1.362607321344667, | |
| "grad_norm": 5.9545207023620605, | |
| "learning_rate": 3.032242243984243e-05, | |
| "loss": 3.7374, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 1.3668788176498228, | |
| "grad_norm": 8.175214767456055, | |
| "learning_rate": 3.024331978041102e-05, | |
| "loss": 3.624, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.3668788176498228, | |
| "eval_runtime": 403.7487, | |
| "eval_samples_per_second": 115.968, | |
| "eval_steps_per_second": 14.497, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.3711503139549786, | |
| "grad_norm": 7.345489978790283, | |
| "learning_rate": 3.016421712097961e-05, | |
| "loss": 3.6508, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 1.375421810260134, | |
| "grad_norm": 10.301737785339355, | |
| "learning_rate": 3.0085114461548198e-05, | |
| "loss": 3.5836, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 1.3796933065652899, | |
| "grad_norm": 8.771992683410645, | |
| "learning_rate": 3.000601180211679e-05, | |
| "loss": 3.7915, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 1.3839648028704454, | |
| "grad_norm": 9.168205261230469, | |
| "learning_rate": 2.9926909142685378e-05, | |
| "loss": 3.6517, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 1.3882362991756012, | |
| "grad_norm": 7.1654744148254395, | |
| "learning_rate": 2.984780648325397e-05, | |
| "loss": 3.503, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.392507795480757, | |
| "grad_norm": 8.390599250793457, | |
| "learning_rate": 2.9768703823822557e-05, | |
| "loss": 3.7276, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 1.3967792917859125, | |
| "grad_norm": 12.229002952575684, | |
| "learning_rate": 2.9689601164391152e-05, | |
| "loss": 3.7753, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 1.4010507880910683, | |
| "grad_norm": 11.649025917053223, | |
| "learning_rate": 2.9610498504959737e-05, | |
| "loss": 3.6765, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 1.4053222843962239, | |
| "grad_norm": 8.619730949401855, | |
| "learning_rate": 2.9531395845528325e-05, | |
| "loss": 3.6508, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 1.4095937807013796, | |
| "grad_norm": 9.323366165161133, | |
| "learning_rate": 2.945229318609692e-05, | |
| "loss": 3.7256, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.4095937807013796, | |
| "eval_runtime": 404.2192, | |
| "eval_samples_per_second": 115.833, | |
| "eval_steps_per_second": 14.48, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.4138652770065354, | |
| "grad_norm": 13.431550979614258, | |
| "learning_rate": 2.9373190526665504e-05, | |
| "loss": 3.7312, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 1.4181367733116912, | |
| "grad_norm": 7.221197128295898, | |
| "learning_rate": 2.92940878672341e-05, | |
| "loss": 3.5233, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 1.4224082696168467, | |
| "grad_norm": 8.221494674682617, | |
| "learning_rate": 2.9214985207802687e-05, | |
| "loss": 3.6815, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 1.4266797659220025, | |
| "grad_norm": 8.996779441833496, | |
| "learning_rate": 2.913588254837128e-05, | |
| "loss": 3.5838, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 1.430951262227158, | |
| "grad_norm": 7.899658679962158, | |
| "learning_rate": 2.9056779888939867e-05, | |
| "loss": 3.5632, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.4352227585323138, | |
| "grad_norm": 7.839208602905273, | |
| "learning_rate": 2.897767722950846e-05, | |
| "loss": 3.6154, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 1.4394942548374696, | |
| "grad_norm": 7.053780555725098, | |
| "learning_rate": 2.8898574570077047e-05, | |
| "loss": 3.6218, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 1.4437657511426254, | |
| "grad_norm": 11.56430721282959, | |
| "learning_rate": 2.8819471910645635e-05, | |
| "loss": 3.6301, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 1.448037247447781, | |
| "grad_norm": 12.948223114013672, | |
| "learning_rate": 2.8740369251214226e-05, | |
| "loss": 3.6278, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 1.4523087437529367, | |
| "grad_norm": 10.741262435913086, | |
| "learning_rate": 2.8661266591782814e-05, | |
| "loss": 3.7338, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.4523087437529367, | |
| "eval_runtime": 404.4425, | |
| "eval_samples_per_second": 115.769, | |
| "eval_steps_per_second": 14.472, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.4565802400580923, | |
| "grad_norm": 8.808819770812988, | |
| "learning_rate": 2.858216393235141e-05, | |
| "loss": 3.5755, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 1.460851736363248, | |
| "grad_norm": 7.9008097648620605, | |
| "learning_rate": 2.8503061272919994e-05, | |
| "loss": 3.5744, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 1.4651232326684038, | |
| "grad_norm": 10.011557579040527, | |
| "learning_rate": 2.842395861348859e-05, | |
| "loss": 3.7014, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 1.4693947289735594, | |
| "grad_norm": 8.058487892150879, | |
| "learning_rate": 2.8344855954057177e-05, | |
| "loss": 3.6554, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 1.4736662252787152, | |
| "grad_norm": 7.3602824211120605, | |
| "learning_rate": 2.8265753294625768e-05, | |
| "loss": 3.7166, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.4779377215838707, | |
| "grad_norm": 7.900210857391357, | |
| "learning_rate": 2.8186650635194356e-05, | |
| "loss": 3.6276, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 1.4822092178890265, | |
| "grad_norm": 7.839376926422119, | |
| "learning_rate": 2.8107547975762948e-05, | |
| "loss": 3.6036, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 1.4864807141941823, | |
| "grad_norm": 8.925679206848145, | |
| "learning_rate": 2.8028445316331536e-05, | |
| "loss": 3.6868, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 1.490752210499338, | |
| "grad_norm": 8.532880783081055, | |
| "learning_rate": 2.7949342656900124e-05, | |
| "loss": 3.7388, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 1.4950237068044936, | |
| "grad_norm": 9.397866249084473, | |
| "learning_rate": 2.7870239997468716e-05, | |
| "loss": 3.7206, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.4950237068044936, | |
| "eval_runtime": 404.715, | |
| "eval_samples_per_second": 115.691, | |
| "eval_steps_per_second": 14.462, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.4992952031096494, | |
| "grad_norm": 9.152342796325684, | |
| "learning_rate": 2.7791137338037304e-05, | |
| "loss": 3.6816, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 1.503566699414805, | |
| "grad_norm": 7.594329357147217, | |
| "learning_rate": 2.77120346786059e-05, | |
| "loss": 3.6713, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 1.5078381957199607, | |
| "grad_norm": 9.826537132263184, | |
| "learning_rate": 2.7632932019174483e-05, | |
| "loss": 3.6467, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 1.5121096920251165, | |
| "grad_norm": 9.374577522277832, | |
| "learning_rate": 2.7553829359743078e-05, | |
| "loss": 3.6588, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 1.5163811883302722, | |
| "grad_norm": 10.790063858032227, | |
| "learning_rate": 2.7474726700311666e-05, | |
| "loss": 3.5689, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.5206526846354278, | |
| "grad_norm": 10.145702362060547, | |
| "learning_rate": 2.7395624040880258e-05, | |
| "loss": 3.7457, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 1.5249241809405834, | |
| "grad_norm": 11.168187141418457, | |
| "learning_rate": 2.7316521381448846e-05, | |
| "loss": 3.5706, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 1.5291956772457391, | |
| "grad_norm": 9.234560012817383, | |
| "learning_rate": 2.7237418722017437e-05, | |
| "loss": 3.6744, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 1.533467173550895, | |
| "grad_norm": 10.015559196472168, | |
| "learning_rate": 2.7158316062586025e-05, | |
| "loss": 3.6374, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 1.5377386698560507, | |
| "grad_norm": 8.472687721252441, | |
| "learning_rate": 2.7079213403154613e-05, | |
| "loss": 3.6958, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.5377386698560507, | |
| "eval_runtime": 404.1589, | |
| "eval_samples_per_second": 115.85, | |
| "eval_steps_per_second": 14.482, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.5420101661612062, | |
| "grad_norm": 7.5909199714660645, | |
| "learning_rate": 2.7000110743723205e-05, | |
| "loss": 3.6096, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 1.546281662466362, | |
| "grad_norm": 18.598318099975586, | |
| "learning_rate": 2.6921008084291793e-05, | |
| "loss": 3.6368, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 1.5505531587715176, | |
| "grad_norm": 10.265989303588867, | |
| "learning_rate": 2.6841905424860388e-05, | |
| "loss": 3.6886, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 1.5548246550766733, | |
| "grad_norm": 16.7838077545166, | |
| "learning_rate": 2.6762802765428973e-05, | |
| "loss": 3.4999, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 1.559096151381829, | |
| "grad_norm": 9.542481422424316, | |
| "learning_rate": 2.6683700105997567e-05, | |
| "loss": 3.5173, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.5633676476869849, | |
| "grad_norm": 7.0144758224487305, | |
| "learning_rate": 2.6604597446566156e-05, | |
| "loss": 3.6004, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 1.5676391439921404, | |
| "grad_norm": 7.273271560668945, | |
| "learning_rate": 2.6525494787134747e-05, | |
| "loss": 3.5539, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 1.571910640297296, | |
| "grad_norm": 9.942744255065918, | |
| "learning_rate": 2.6446392127703335e-05, | |
| "loss": 3.6823, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 1.5761821366024518, | |
| "grad_norm": 8.686135292053223, | |
| "learning_rate": 2.6367289468271923e-05, | |
| "loss": 3.6757, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 1.5804536329076075, | |
| "grad_norm": 6.468233108520508, | |
| "learning_rate": 2.6288186808840515e-05, | |
| "loss": 3.6318, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.5804536329076075, | |
| "eval_runtime": 403.714, | |
| "eval_samples_per_second": 115.978, | |
| "eval_steps_per_second": 14.498, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.5847251292127633, | |
| "grad_norm": 8.390809059143066, | |
| "learning_rate": 2.6209084149409103e-05, | |
| "loss": 3.6048, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 1.588996625517919, | |
| "grad_norm": 11.824224472045898, | |
| "learning_rate": 2.6129981489977694e-05, | |
| "loss": 3.6128, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 1.5932681218230746, | |
| "grad_norm": 9.557259559631348, | |
| "learning_rate": 2.6050878830546282e-05, | |
| "loss": 3.5252, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 1.5975396181282302, | |
| "grad_norm": 10.761728286743164, | |
| "learning_rate": 2.5971776171114874e-05, | |
| "loss": 3.6447, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 1.601811114433386, | |
| "grad_norm": 7.978828430175781, | |
| "learning_rate": 2.5892673511683462e-05, | |
| "loss": 3.6211, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.6060826107385417, | |
| "grad_norm": 8.314446449279785, | |
| "learning_rate": 2.5813570852252057e-05, | |
| "loss": 3.6197, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 1.6103541070436975, | |
| "grad_norm": 7.391338348388672, | |
| "learning_rate": 2.573446819282064e-05, | |
| "loss": 3.6123, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 1.614625603348853, | |
| "grad_norm": 9.402429580688477, | |
| "learning_rate": 2.5655365533389236e-05, | |
| "loss": 3.7008, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 1.6188970996540089, | |
| "grad_norm": 9.703052520751953, | |
| "learning_rate": 2.5576262873957825e-05, | |
| "loss": 3.6748, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 1.6231685959591644, | |
| "grad_norm": 7.890733242034912, | |
| "learning_rate": 2.5497160214526413e-05, | |
| "loss": 3.6766, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.6231685959591644, | |
| "eval_runtime": 403.3234, | |
| "eval_samples_per_second": 116.09, | |
| "eval_steps_per_second": 14.512, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.6274400922643202, | |
| "grad_norm": 7.096985816955566, | |
| "learning_rate": 2.5418057555095004e-05, | |
| "loss": 3.6308, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 1.631711588569476, | |
| "grad_norm": 10.07547378540039, | |
| "learning_rate": 2.5338954895663592e-05, | |
| "loss": 3.5905, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 1.6359830848746317, | |
| "grad_norm": 8.68416690826416, | |
| "learning_rate": 2.5259852236232184e-05, | |
| "loss": 3.5352, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 1.6402545811797873, | |
| "grad_norm": 10.171316146850586, | |
| "learning_rate": 2.5180749576800772e-05, | |
| "loss": 3.6892, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 1.6445260774849428, | |
| "grad_norm": 12.549208641052246, | |
| "learning_rate": 2.5101646917369363e-05, | |
| "loss": 3.6036, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.6487975737900986, | |
| "grad_norm": 9.339801788330078, | |
| "learning_rate": 2.502254425793795e-05, | |
| "loss": 3.5937, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 1.6530690700952544, | |
| "grad_norm": 7.933904647827148, | |
| "learning_rate": 2.4943441598506543e-05, | |
| "loss": 3.588, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 1.6573405664004102, | |
| "grad_norm": 11.310770988464355, | |
| "learning_rate": 2.486433893907513e-05, | |
| "loss": 3.7504, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 1.6616120627055657, | |
| "grad_norm": 9.128674507141113, | |
| "learning_rate": 2.4785236279643722e-05, | |
| "loss": 3.65, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 1.6658835590107215, | |
| "grad_norm": 10.278397560119629, | |
| "learning_rate": 2.4706133620212314e-05, | |
| "loss": 3.7471, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.6658835590107215, | |
| "eval_runtime": 403.4984, | |
| "eval_samples_per_second": 116.04, | |
| "eval_steps_per_second": 14.506, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.670155055315877, | |
| "grad_norm": 9.706366539001465, | |
| "learning_rate": 2.4627030960780902e-05, | |
| "loss": 3.6192, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 1.6744265516210328, | |
| "grad_norm": 8.632245063781738, | |
| "learning_rate": 2.4547928301349494e-05, | |
| "loss": 3.5752, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 1.6786980479261886, | |
| "grad_norm": 8.402227401733398, | |
| "learning_rate": 2.4468825641918085e-05, | |
| "loss": 3.6774, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 1.6829695442313444, | |
| "grad_norm": 8.275464057922363, | |
| "learning_rate": 2.4389722982486673e-05, | |
| "loss": 3.6148, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 1.6872410405365, | |
| "grad_norm": 9.17482852935791, | |
| "learning_rate": 2.431062032305526e-05, | |
| "loss": 3.6413, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.6915125368416555, | |
| "grad_norm": 8.914527893066406, | |
| "learning_rate": 2.4231517663623853e-05, | |
| "loss": 3.7191, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 1.6957840331468113, | |
| "grad_norm": 8.066243171691895, | |
| "learning_rate": 2.415241500419244e-05, | |
| "loss": 3.5606, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 1.700055529451967, | |
| "grad_norm": 9.488569259643555, | |
| "learning_rate": 2.4073312344761032e-05, | |
| "loss": 3.6873, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 1.7043270257571228, | |
| "grad_norm": 9.717203140258789, | |
| "learning_rate": 2.399420968532962e-05, | |
| "loss": 3.688, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 1.7085985220622786, | |
| "grad_norm": 8.048073768615723, | |
| "learning_rate": 2.3915107025898212e-05, | |
| "loss": 3.515, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.7085985220622786, | |
| "eval_runtime": 404.1323, | |
| "eval_samples_per_second": 115.858, | |
| "eval_steps_per_second": 14.483, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.7128700183674341, | |
| "grad_norm": 9.101920127868652, | |
| "learning_rate": 2.3836004366466803e-05, | |
| "loss": 3.6674, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 1.7171415146725897, | |
| "grad_norm": 6.701783180236816, | |
| "learning_rate": 2.375690170703539e-05, | |
| "loss": 3.6339, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 1.7214130109777455, | |
| "grad_norm": 9.65266227722168, | |
| "learning_rate": 2.3677799047603983e-05, | |
| "loss": 3.5557, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 1.7256845072829012, | |
| "grad_norm": 9.488314628601074, | |
| "learning_rate": 2.359869638817257e-05, | |
| "loss": 3.7312, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 1.729956003588057, | |
| "grad_norm": 8.73523235321045, | |
| "learning_rate": 2.3519593728741163e-05, | |
| "loss": 3.6714, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.7342274998932126, | |
| "grad_norm": 9.438526153564453, | |
| "learning_rate": 2.344049106930975e-05, | |
| "loss": 3.6664, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 1.7384989961983683, | |
| "grad_norm": 9.409259796142578, | |
| "learning_rate": 2.336138840987834e-05, | |
| "loss": 3.5598, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 1.742770492503524, | |
| "grad_norm": 6.831430435180664, | |
| "learning_rate": 2.328228575044693e-05, | |
| "loss": 3.7215, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 1.7470419888086797, | |
| "grad_norm": 8.387484550476074, | |
| "learning_rate": 2.320318309101552e-05, | |
| "loss": 3.578, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 1.7513134851138354, | |
| "grad_norm": 9.247336387634277, | |
| "learning_rate": 2.312408043158411e-05, | |
| "loss": 3.615, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.7513134851138354, | |
| "eval_runtime": 404.0825, | |
| "eval_samples_per_second": 115.872, | |
| "eval_steps_per_second": 14.485, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.7555849814189912, | |
| "grad_norm": 11.280122756958008, | |
| "learning_rate": 2.30449777721527e-05, | |
| "loss": 3.5713, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 1.7598564777241468, | |
| "grad_norm": 8.902118682861328, | |
| "learning_rate": 2.2965875112721293e-05, | |
| "loss": 3.6999, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 1.7641279740293023, | |
| "grad_norm": 11.0384521484375, | |
| "learning_rate": 2.288677245328988e-05, | |
| "loss": 3.7134, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 1.768399470334458, | |
| "grad_norm": 8.986517906188965, | |
| "learning_rate": 2.2807669793858472e-05, | |
| "loss": 3.5391, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 1.7726709666396139, | |
| "grad_norm": 9.237929344177246, | |
| "learning_rate": 2.272856713442706e-05, | |
| "loss": 3.781, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.7769424629447697, | |
| "grad_norm": 12.143738746643066, | |
| "learning_rate": 2.264946447499565e-05, | |
| "loss": 3.5613, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 1.7812139592499252, | |
| "grad_norm": 9.296298027038574, | |
| "learning_rate": 2.257036181556424e-05, | |
| "loss": 3.6645, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 1.785485455555081, | |
| "grad_norm": 9.721207618713379, | |
| "learning_rate": 2.2491259156132828e-05, | |
| "loss": 3.5764, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 1.7897569518602365, | |
| "grad_norm": 11.145936012268066, | |
| "learning_rate": 2.241215649670142e-05, | |
| "loss": 3.5321, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 1.7940284481653923, | |
| "grad_norm": 10.27043628692627, | |
| "learning_rate": 2.233305383727001e-05, | |
| "loss": 3.6625, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.7940284481653923, | |
| "eval_runtime": 405.1907, | |
| "eval_samples_per_second": 115.555, | |
| "eval_steps_per_second": 14.445, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.798299944470548, | |
| "grad_norm": 10.281463623046875, | |
| "learning_rate": 2.22539511778386e-05, | |
| "loss": 3.5566, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 1.8025714407757039, | |
| "grad_norm": 6.728999614715576, | |
| "learning_rate": 2.217484851840719e-05, | |
| "loss": 3.5608, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 1.8068429370808594, | |
| "grad_norm": 6.053191184997559, | |
| "learning_rate": 2.209574585897578e-05, | |
| "loss": 3.685, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 1.811114433386015, | |
| "grad_norm": 8.071969032287598, | |
| "learning_rate": 2.201664319954437e-05, | |
| "loss": 3.6003, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 1.8153859296911707, | |
| "grad_norm": 29.326370239257812, | |
| "learning_rate": 2.1937540540112962e-05, | |
| "loss": 3.6615, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.8196574259963265, | |
| "grad_norm": 8.652432441711426, | |
| "learning_rate": 2.185843788068155e-05, | |
| "loss": 3.5731, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 1.8239289223014823, | |
| "grad_norm": 11.717292785644531, | |
| "learning_rate": 2.1779335221250138e-05, | |
| "loss": 3.6371, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 1.828200418606638, | |
| "grad_norm": 10.365557670593262, | |
| "learning_rate": 2.170023256181873e-05, | |
| "loss": 3.6857, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 1.8324719149117936, | |
| "grad_norm": 12.400829315185547, | |
| "learning_rate": 2.1621129902387317e-05, | |
| "loss": 3.6896, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 1.8367434112169492, | |
| "grad_norm": 8.40799331665039, | |
| "learning_rate": 2.154202724295591e-05, | |
| "loss": 3.6611, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.8367434112169492, | |
| "eval_runtime": 403.8313, | |
| "eval_samples_per_second": 115.944, | |
| "eval_steps_per_second": 14.494, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.841014907522105, | |
| "grad_norm": 10.518604278564453, | |
| "learning_rate": 2.14629245835245e-05, | |
| "loss": 3.6118, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 1.8452864038272607, | |
| "grad_norm": 7.877737998962402, | |
| "learning_rate": 2.138382192409309e-05, | |
| "loss": 3.5943, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 1.8495579001324165, | |
| "grad_norm": 12.722783088684082, | |
| "learning_rate": 2.130471926466168e-05, | |
| "loss": 3.6583, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 1.853829396437572, | |
| "grad_norm": 8.382994651794434, | |
| "learning_rate": 2.1225616605230268e-05, | |
| "loss": 3.5931, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 1.8581008927427278, | |
| "grad_norm": 10.603730201721191, | |
| "learning_rate": 2.114651394579886e-05, | |
| "loss": 3.6558, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.8623723890478834, | |
| "grad_norm": 6.978638172149658, | |
| "learning_rate": 2.106741128636745e-05, | |
| "loss": 3.504, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 1.8666438853530392, | |
| "grad_norm": 7.777115345001221, | |
| "learning_rate": 2.0988308626936036e-05, | |
| "loss": 3.7573, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 1.870915381658195, | |
| "grad_norm": 8.054482460021973, | |
| "learning_rate": 2.0909205967504627e-05, | |
| "loss": 3.5624, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 1.8751868779633507, | |
| "grad_norm": 8.191532135009766, | |
| "learning_rate": 2.083010330807322e-05, | |
| "loss": 3.6115, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 1.8794583742685063, | |
| "grad_norm": 9.908390998840332, | |
| "learning_rate": 2.0751000648641807e-05, | |
| "loss": 3.6564, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.8794583742685063, | |
| "eval_runtime": 403.8168, | |
| "eval_samples_per_second": 115.949, | |
| "eval_steps_per_second": 14.494, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.8837298705736618, | |
| "grad_norm": 10.703449249267578, | |
| "learning_rate": 2.06718979892104e-05, | |
| "loss": 3.6265, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 1.8880013668788176, | |
| "grad_norm": 8.703311920166016, | |
| "learning_rate": 2.059279532977899e-05, | |
| "loss": 3.601, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 1.8922728631839734, | |
| "grad_norm": 16.844961166381836, | |
| "learning_rate": 2.0513692670347578e-05, | |
| "loss": 3.5735, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 1.8965443594891291, | |
| "grad_norm": 7.944665908813477, | |
| "learning_rate": 2.043459001091617e-05, | |
| "loss": 3.6514, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 1.9008158557942847, | |
| "grad_norm": 10.938014030456543, | |
| "learning_rate": 2.0355487351484758e-05, | |
| "loss": 3.6739, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.9050873520994405, | |
| "grad_norm": 7.884680271148682, | |
| "learning_rate": 2.027638469205335e-05, | |
| "loss": 3.6705, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 1.909358848404596, | |
| "grad_norm": 10.993422508239746, | |
| "learning_rate": 2.0197282032621937e-05, | |
| "loss": 3.5416, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 1.9136303447097518, | |
| "grad_norm": 9.719098091125488, | |
| "learning_rate": 2.0118179373190525e-05, | |
| "loss": 3.6548, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 1.9179018410149076, | |
| "grad_norm": 9.458189964294434, | |
| "learning_rate": 2.0039076713759117e-05, | |
| "loss": 3.7504, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 1.9221733373200633, | |
| "grad_norm": 10.599435806274414, | |
| "learning_rate": 1.9959974054327708e-05, | |
| "loss": 3.5734, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.9221733373200633, | |
| "eval_runtime": 404.0106, | |
| "eval_samples_per_second": 115.893, | |
| "eval_steps_per_second": 14.487, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.926444833625219, | |
| "grad_norm": 9.23690128326416, | |
| "learning_rate": 1.9880871394896296e-05, | |
| "loss": 3.7208, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 1.9307163299303745, | |
| "grad_norm": 7.124606609344482, | |
| "learning_rate": 1.9801768735464888e-05, | |
| "loss": 3.6351, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 1.9349878262355302, | |
| "grad_norm": 8.71446704864502, | |
| "learning_rate": 1.9722666076033476e-05, | |
| "loss": 3.6835, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 1.939259322540686, | |
| "grad_norm": 9.558823585510254, | |
| "learning_rate": 1.9643563416602067e-05, | |
| "loss": 3.5569, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 1.9435308188458418, | |
| "grad_norm": 9.622088432312012, | |
| "learning_rate": 1.956446075717066e-05, | |
| "loss": 3.6797, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.9478023151509976, | |
| "grad_norm": 8.641619682312012, | |
| "learning_rate": 1.9485358097739247e-05, | |
| "loss": 3.6377, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 1.952073811456153, | |
| "grad_norm": 12.308704376220703, | |
| "learning_rate": 1.940625543830784e-05, | |
| "loss": 3.5211, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 1.9563453077613087, | |
| "grad_norm": 8.850275993347168, | |
| "learning_rate": 1.9327152778876426e-05, | |
| "loss": 3.5652, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 1.9606168040664644, | |
| "grad_norm": 8.595603942871094, | |
| "learning_rate": 1.9248050119445015e-05, | |
| "loss": 3.6181, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 1.9648883003716202, | |
| "grad_norm": 8.737709999084473, | |
| "learning_rate": 1.9168947460013606e-05, | |
| "loss": 3.6392, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.9648883003716202, | |
| "eval_runtime": 403.5624, | |
| "eval_samples_per_second": 116.022, | |
| "eval_steps_per_second": 14.503, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.969159796676776, | |
| "grad_norm": 10.178166389465332, | |
| "learning_rate": 1.9089844800582198e-05, | |
| "loss": 3.5406, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 1.9734312929819315, | |
| "grad_norm": 8.49496841430664, | |
| "learning_rate": 1.9010742141150786e-05, | |
| "loss": 3.5631, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 1.9777027892870873, | |
| "grad_norm": 12.1917724609375, | |
| "learning_rate": 1.8931639481719377e-05, | |
| "loss": 3.6878, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 1.9819742855922429, | |
| "grad_norm": 7.169999599456787, | |
| "learning_rate": 1.8852536822287965e-05, | |
| "loss": 3.5653, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 1.9862457818973986, | |
| "grad_norm": 9.828686714172363, | |
| "learning_rate": 1.8773434162856557e-05, | |
| "loss": 3.5959, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.9905172782025544, | |
| "grad_norm": 11.669685363769531, | |
| "learning_rate": 1.8694331503425148e-05, | |
| "loss": 3.7558, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 1.9947887745077102, | |
| "grad_norm": 9.722572326660156, | |
| "learning_rate": 1.8615228843993736e-05, | |
| "loss": 3.5793, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 1.9990602708128657, | |
| "grad_norm": 7.060891151428223, | |
| "learning_rate": 1.8536126184562324e-05, | |
| "loss": 3.5613, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 2.0033317671180213, | |
| "grad_norm": 7.597713470458984, | |
| "learning_rate": 1.8457023525130916e-05, | |
| "loss": 3.527, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 2.007603263423177, | |
| "grad_norm": 8.622049331665039, | |
| "learning_rate": 1.8377920865699504e-05, | |
| "loss": 3.576, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.007603263423177, | |
| "eval_runtime": 404.752, | |
| "eval_samples_per_second": 115.681, | |
| "eval_steps_per_second": 14.461, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.011874759728333, | |
| "grad_norm": 7.117955207824707, | |
| "learning_rate": 1.8298818206268095e-05, | |
| "loss": 3.6644, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 2.0161462560334886, | |
| "grad_norm": 7.748778820037842, | |
| "learning_rate": 1.8219715546836687e-05, | |
| "loss": 3.5904, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 2.0204177523386444, | |
| "grad_norm": 7.402785301208496, | |
| "learning_rate": 1.8140612887405275e-05, | |
| "loss": 3.6069, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 2.0246892486437997, | |
| "grad_norm": 7.453569412231445, | |
| "learning_rate": 1.8061510227973867e-05, | |
| "loss": 3.6795, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 2.0289607449489555, | |
| "grad_norm": 8.299507141113281, | |
| "learning_rate": 1.7982407568542455e-05, | |
| "loss": 3.6194, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 2.0332322412541113, | |
| "grad_norm": 10.050152778625488, | |
| "learning_rate": 1.7903304909111046e-05, | |
| "loss": 3.5512, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 2.037503737559267, | |
| "grad_norm": 8.691873550415039, | |
| "learning_rate": 1.7824202249679638e-05, | |
| "loss": 3.6216, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 2.041775233864423, | |
| "grad_norm": 7.912090301513672, | |
| "learning_rate": 1.7745099590248222e-05, | |
| "loss": 3.5601, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 2.0460467301695786, | |
| "grad_norm": 9.80728530883789, | |
| "learning_rate": 1.7665996930816814e-05, | |
| "loss": 3.6074, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 2.050318226474734, | |
| "grad_norm": 11.86419677734375, | |
| "learning_rate": 1.7586894271385405e-05, | |
| "loss": 3.5964, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.050318226474734, | |
| "eval_runtime": 403.5223, | |
| "eval_samples_per_second": 116.033, | |
| "eval_steps_per_second": 14.505, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.0545897227798897, | |
| "grad_norm": 8.644769668579102, | |
| "learning_rate": 1.7507791611953993e-05, | |
| "loss": 3.53, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 2.0588612190850455, | |
| "grad_norm": 8.596597671508789, | |
| "learning_rate": 1.7428688952522585e-05, | |
| "loss": 3.6423, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 2.0631327153902013, | |
| "grad_norm": 8.68507194519043, | |
| "learning_rate": 1.7349586293091173e-05, | |
| "loss": 3.5187, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 2.067404211695357, | |
| "grad_norm": 12.417092323303223, | |
| "learning_rate": 1.7270483633659764e-05, | |
| "loss": 3.6873, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 2.0716757080005124, | |
| "grad_norm": 7.873465061187744, | |
| "learning_rate": 1.7191380974228356e-05, | |
| "loss": 3.556, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 2.075947204305668, | |
| "grad_norm": 9.485852241516113, | |
| "learning_rate": 1.7112278314796944e-05, | |
| "loss": 3.5671, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 2.080218700610824, | |
| "grad_norm": 9.282876968383789, | |
| "learning_rate": 1.7033175655365536e-05, | |
| "loss": 3.5649, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 2.0844901969159797, | |
| "grad_norm": 9.663043022155762, | |
| "learning_rate": 1.6954072995934127e-05, | |
| "loss": 3.5675, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 2.0887616932211355, | |
| "grad_norm": 9.47641372680664, | |
| "learning_rate": 1.6874970336502712e-05, | |
| "loss": 3.5404, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 2.0930331895262912, | |
| "grad_norm": 9.768278121948242, | |
| "learning_rate": 1.6795867677071303e-05, | |
| "loss": 3.6144, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.0930331895262912, | |
| "eval_runtime": 403.9425, | |
| "eval_samples_per_second": 115.913, | |
| "eval_steps_per_second": 14.49, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.0973046858314466, | |
| "grad_norm": 9.314282417297363, | |
| "learning_rate": 1.6716765017639895e-05, | |
| "loss": 3.6568, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 2.1015761821366024, | |
| "grad_norm": 8.707430839538574, | |
| "learning_rate": 1.6637662358208483e-05, | |
| "loss": 3.5775, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 2.105847678441758, | |
| "grad_norm": 11.704259872436523, | |
| "learning_rate": 1.6558559698777074e-05, | |
| "loss": 3.5568, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 2.110119174746914, | |
| "grad_norm": 8.504453659057617, | |
| "learning_rate": 1.6479457039345662e-05, | |
| "loss": 3.6528, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 2.1143906710520697, | |
| "grad_norm": 8.935593605041504, | |
| "learning_rate": 1.6400354379914254e-05, | |
| "loss": 3.7016, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 2.1186621673572255, | |
| "grad_norm": 8.349204063415527, | |
| "learning_rate": 1.6321251720482845e-05, | |
| "loss": 3.5431, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 2.122933663662381, | |
| "grad_norm": 11.8608980178833, | |
| "learning_rate": 1.6242149061051433e-05, | |
| "loss": 3.5844, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 2.1272051599675366, | |
| "grad_norm": 7.555705547332764, | |
| "learning_rate": 1.6163046401620025e-05, | |
| "loss": 3.5815, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 2.1314766562726923, | |
| "grad_norm": 9.529816627502441, | |
| "learning_rate": 1.6083943742188613e-05, | |
| "loss": 3.5485, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 2.135748152577848, | |
| "grad_norm": 9.32353401184082, | |
| "learning_rate": 1.60048410827572e-05, | |
| "loss": 3.589, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.135748152577848, | |
| "eval_runtime": 404.1831, | |
| "eval_samples_per_second": 115.844, | |
| "eval_steps_per_second": 14.481, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.140019648883004, | |
| "grad_norm": 8.285137176513672, | |
| "learning_rate": 1.5925738423325793e-05, | |
| "loss": 3.6874, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 2.144291145188159, | |
| "grad_norm": 6.94751501083374, | |
| "learning_rate": 1.5846635763894384e-05, | |
| "loss": 3.6489, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 2.148562641493315, | |
| "grad_norm": 11.093490600585938, | |
| "learning_rate": 1.5767533104462972e-05, | |
| "loss": 3.6675, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 2.1528341377984708, | |
| "grad_norm": 8.154306411743164, | |
| "learning_rate": 1.5688430445031564e-05, | |
| "loss": 3.5934, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 2.1571056341036265, | |
| "grad_norm": 8.806336402893066, | |
| "learning_rate": 1.5609327785600152e-05, | |
| "loss": 3.5804, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 2.1613771304087823, | |
| "grad_norm": 10.496975898742676, | |
| "learning_rate": 1.5530225126168743e-05, | |
| "loss": 3.6913, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 2.165648626713938, | |
| "grad_norm": 9.081565856933594, | |
| "learning_rate": 1.5451122466737335e-05, | |
| "loss": 3.7297, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 2.1699201230190934, | |
| "grad_norm": 7.850902557373047, | |
| "learning_rate": 1.5372019807305923e-05, | |
| "loss": 3.7112, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 2.174191619324249, | |
| "grad_norm": 8.145720481872559, | |
| "learning_rate": 1.529291714787451e-05, | |
| "loss": 3.6324, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 2.178463115629405, | |
| "grad_norm": 8.924689292907715, | |
| "learning_rate": 1.52138144884431e-05, | |
| "loss": 3.6598, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.178463115629405, | |
| "eval_runtime": 404.5353, | |
| "eval_samples_per_second": 115.743, | |
| "eval_steps_per_second": 14.468, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.1827346119345608, | |
| "grad_norm": 13.303974151611328, | |
| "learning_rate": 1.513471182901169e-05, | |
| "loss": 3.5284, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 2.1870061082397165, | |
| "grad_norm": 8.976105690002441, | |
| "learning_rate": 1.5055609169580282e-05, | |
| "loss": 3.6514, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 2.1912776045448723, | |
| "grad_norm": 7.439825057983398, | |
| "learning_rate": 1.4976506510148872e-05, | |
| "loss": 3.5687, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 2.1955491008500276, | |
| "grad_norm": 8.54857349395752, | |
| "learning_rate": 1.4897403850717462e-05, | |
| "loss": 3.7166, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 2.1998205971551834, | |
| "grad_norm": 11.23593521118164, | |
| "learning_rate": 1.4818301191286051e-05, | |
| "loss": 3.5591, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 2.204092093460339, | |
| "grad_norm": 13.313474655151367, | |
| "learning_rate": 1.4739198531854643e-05, | |
| "loss": 3.7213, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 2.208363589765495, | |
| "grad_norm": 9.998103141784668, | |
| "learning_rate": 1.4660095872423233e-05, | |
| "loss": 3.5843, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 2.2126350860706507, | |
| "grad_norm": 8.799863815307617, | |
| "learning_rate": 1.4580993212991822e-05, | |
| "loss": 3.7109, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 2.216906582375806, | |
| "grad_norm": 7.352701187133789, | |
| "learning_rate": 1.4501890553560412e-05, | |
| "loss": 3.6722, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 2.221178078680962, | |
| "grad_norm": 12.166138648986816, | |
| "learning_rate": 1.4422787894129e-05, | |
| "loss": 3.582, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.221178078680962, | |
| "eval_runtime": 403.5081, | |
| "eval_samples_per_second": 116.037, | |
| "eval_steps_per_second": 14.505, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.2254495749861176, | |
| "grad_norm": 10.341227531433105, | |
| "learning_rate": 1.434368523469759e-05, | |
| "loss": 3.7105, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 2.2297210712912734, | |
| "grad_norm": 7.697736740112305, | |
| "learning_rate": 1.426458257526618e-05, | |
| "loss": 3.5896, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 2.233992567596429, | |
| "grad_norm": 7.957235336303711, | |
| "learning_rate": 1.4185479915834771e-05, | |
| "loss": 3.5472, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 2.2382640639015845, | |
| "grad_norm": 7.778316020965576, | |
| "learning_rate": 1.4106377256403361e-05, | |
| "loss": 3.5998, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 2.2425355602067403, | |
| "grad_norm": 8.099467277526855, | |
| "learning_rate": 1.4027274596971951e-05, | |
| "loss": 3.7143, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 2.246807056511896, | |
| "grad_norm": 8.077199935913086, | |
| "learning_rate": 1.394817193754054e-05, | |
| "loss": 3.6727, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 2.251078552817052, | |
| "grad_norm": 10.278371810913086, | |
| "learning_rate": 1.3869069278109132e-05, | |
| "loss": 3.638, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 2.2553500491222076, | |
| "grad_norm": 10.49933910369873, | |
| "learning_rate": 1.3789966618677722e-05, | |
| "loss": 3.5718, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 2.2596215454273634, | |
| "grad_norm": 10.37414264678955, | |
| "learning_rate": 1.3710863959246312e-05, | |
| "loss": 3.551, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 2.2638930417325187, | |
| "grad_norm": 6.969189643859863, | |
| "learning_rate": 1.36317612998149e-05, | |
| "loss": 3.593, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.2638930417325187, | |
| "eval_runtime": 404.0848, | |
| "eval_samples_per_second": 115.872, | |
| "eval_steps_per_second": 14.485, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.2681645380376745, | |
| "grad_norm": 7.354485511779785, | |
| "learning_rate": 1.355265864038349e-05, | |
| "loss": 3.5927, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 2.2724360343428303, | |
| "grad_norm": 10.107403755187988, | |
| "learning_rate": 1.347355598095208e-05, | |
| "loss": 3.6626, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 2.276707530647986, | |
| "grad_norm": 9.613969802856445, | |
| "learning_rate": 1.339445332152067e-05, | |
| "loss": 3.6073, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 2.280979026953142, | |
| "grad_norm": 7.995043754577637, | |
| "learning_rate": 1.3315350662089259e-05, | |
| "loss": 3.5675, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 2.2852505232582976, | |
| "grad_norm": 7.049370765686035, | |
| "learning_rate": 1.323624800265785e-05, | |
| "loss": 3.6559, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 2.289522019563453, | |
| "grad_norm": 10.962531089782715, | |
| "learning_rate": 1.315714534322644e-05, | |
| "loss": 3.667, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 2.2937935158686087, | |
| "grad_norm": 8.100302696228027, | |
| "learning_rate": 1.307804268379503e-05, | |
| "loss": 3.6044, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 2.2980650121737645, | |
| "grad_norm": 8.079455375671387, | |
| "learning_rate": 1.299894002436362e-05, | |
| "loss": 3.7206, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 2.3023365084789202, | |
| "grad_norm": 8.500101089477539, | |
| "learning_rate": 1.2919837364932211e-05, | |
| "loss": 3.6075, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 2.306608004784076, | |
| "grad_norm": 12.927189826965332, | |
| "learning_rate": 1.2840734705500801e-05, | |
| "loss": 3.654, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.306608004784076, | |
| "eval_runtime": 403.7677, | |
| "eval_samples_per_second": 115.963, | |
| "eval_steps_per_second": 14.496, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.3108795010892313, | |
| "grad_norm": 7.132537841796875, | |
| "learning_rate": 1.2761632046069388e-05, | |
| "loss": 3.6083, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 2.315150997394387, | |
| "grad_norm": 8.612030982971191, | |
| "learning_rate": 1.2682529386637979e-05, | |
| "loss": 3.5848, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 2.319422493699543, | |
| "grad_norm": 8.210352897644043, | |
| "learning_rate": 1.2603426727206569e-05, | |
| "loss": 3.7611, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 2.3236939900046987, | |
| "grad_norm": 9.12792682647705, | |
| "learning_rate": 1.2524324067775159e-05, | |
| "loss": 3.518, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 2.3279654863098544, | |
| "grad_norm": 9.999760627746582, | |
| "learning_rate": 1.2445221408343748e-05, | |
| "loss": 3.5513, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 2.3322369826150102, | |
| "grad_norm": 12.578611373901367, | |
| "learning_rate": 1.236611874891234e-05, | |
| "loss": 3.5614, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 2.3365084789201656, | |
| "grad_norm": 12.089159965515137, | |
| "learning_rate": 1.228701608948093e-05, | |
| "loss": 3.5944, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 2.3407799752253213, | |
| "grad_norm": 7.965277671813965, | |
| "learning_rate": 1.2207913430049518e-05, | |
| "loss": 3.4852, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 2.345051471530477, | |
| "grad_norm": 10.866728782653809, | |
| "learning_rate": 1.2128810770618108e-05, | |
| "loss": 3.5431, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 2.349322967835633, | |
| "grad_norm": 10.489164352416992, | |
| "learning_rate": 1.2049708111186699e-05, | |
| "loss": 3.4309, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.349322967835633, | |
| "eval_runtime": 404.0413, | |
| "eval_samples_per_second": 115.884, | |
| "eval_steps_per_second": 14.486, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.3535944641407887, | |
| "grad_norm": 8.494600296020508, | |
| "learning_rate": 1.1970605451755289e-05, | |
| "loss": 3.4994, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 2.3578659604459444, | |
| "grad_norm": 9.755086898803711, | |
| "learning_rate": 1.1891502792323879e-05, | |
| "loss": 3.5679, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 2.3621374567510998, | |
| "grad_norm": 9.621931076049805, | |
| "learning_rate": 1.1812400132892468e-05, | |
| "loss": 3.6264, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 2.3664089530562555, | |
| "grad_norm": 12.946763038635254, | |
| "learning_rate": 1.1733297473461058e-05, | |
| "loss": 3.5741, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 2.3706804493614113, | |
| "grad_norm": 8.984336853027344, | |
| "learning_rate": 1.1654194814029648e-05, | |
| "loss": 3.5317, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 2.374951945666567, | |
| "grad_norm": 9.304176330566406, | |
| "learning_rate": 1.1575092154598238e-05, | |
| "loss": 3.6013, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 2.379223441971723, | |
| "grad_norm": 8.324792861938477, | |
| "learning_rate": 1.1495989495166828e-05, | |
| "loss": 3.5887, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 2.383494938276878, | |
| "grad_norm": 11.814824104309082, | |
| "learning_rate": 1.1416886835735419e-05, | |
| "loss": 3.4865, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 2.387766434582034, | |
| "grad_norm": 9.219450950622559, | |
| "learning_rate": 1.1337784176304007e-05, | |
| "loss": 3.5598, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 2.3920379308871897, | |
| "grad_norm": 10.202199935913086, | |
| "learning_rate": 1.1258681516872597e-05, | |
| "loss": 3.6973, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.3920379308871897, | |
| "eval_runtime": 405.1455, | |
| "eval_samples_per_second": 115.568, | |
| "eval_steps_per_second": 14.447, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.3963094271923455, | |
| "grad_norm": 10.352853775024414, | |
| "learning_rate": 1.1179578857441188e-05, | |
| "loss": 3.5715, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 2.4005809234975013, | |
| "grad_norm": 9.794927597045898, | |
| "learning_rate": 1.1100476198009778e-05, | |
| "loss": 3.5751, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 2.4048524198026566, | |
| "grad_norm": 9.24485969543457, | |
| "learning_rate": 1.1021373538578368e-05, | |
| "loss": 3.6939, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 2.4091239161078124, | |
| "grad_norm": 6.9035162925720215, | |
| "learning_rate": 1.0942270879146956e-05, | |
| "loss": 3.6613, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 2.413395412412968, | |
| "grad_norm": 9.021778106689453, | |
| "learning_rate": 1.0863168219715548e-05, | |
| "loss": 3.5978, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 2.417666908718124, | |
| "grad_norm": 7.050608158111572, | |
| "learning_rate": 1.0784065560284137e-05, | |
| "loss": 3.7212, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 2.4219384050232797, | |
| "grad_norm": 8.771140098571777, | |
| "learning_rate": 1.0704962900852727e-05, | |
| "loss": 3.6447, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 2.4262099013284355, | |
| "grad_norm": 14.564820289611816, | |
| "learning_rate": 1.0625860241421317e-05, | |
| "loss": 3.6091, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 2.4304813976335913, | |
| "grad_norm": 10.664299011230469, | |
| "learning_rate": 1.0546757581989907e-05, | |
| "loss": 3.6506, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 2.4347528939387466, | |
| "grad_norm": 14.445178985595703, | |
| "learning_rate": 1.0467654922558497e-05, | |
| "loss": 3.6226, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.4347528939387466, | |
| "eval_runtime": 404.9602, | |
| "eval_samples_per_second": 115.621, | |
| "eval_steps_per_second": 14.453, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.4390243902439024, | |
| "grad_norm": 19.93160057067871, | |
| "learning_rate": 1.0388552263127086e-05, | |
| "loss": 3.6144, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 2.443295886549058, | |
| "grad_norm": 7.793177604675293, | |
| "learning_rate": 1.0309449603695676e-05, | |
| "loss": 3.6577, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 2.447567382854214, | |
| "grad_norm": 7.95759391784668, | |
| "learning_rate": 1.0230346944264268e-05, | |
| "loss": 3.6079, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 2.4518388791593697, | |
| "grad_norm": 10.07507610321045, | |
| "learning_rate": 1.0151244284832856e-05, | |
| "loss": 3.5974, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 2.456110375464525, | |
| "grad_norm": 9.73681926727295, | |
| "learning_rate": 1.0072141625401446e-05, | |
| "loss": 3.5602, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 2.460381871769681, | |
| "grad_norm": 18.652366638183594, | |
| "learning_rate": 9.993038965970037e-06, | |
| "loss": 3.5649, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 2.4646533680748366, | |
| "grad_norm": 10.758431434631348, | |
| "learning_rate": 9.913936306538627e-06, | |
| "loss": 3.6587, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 2.4689248643799924, | |
| "grad_norm": 8.963933944702148, | |
| "learning_rate": 9.834833647107217e-06, | |
| "loss": 3.5872, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 2.473196360685148, | |
| "grad_norm": 12.521937370300293, | |
| "learning_rate": 9.755730987675805e-06, | |
| "loss": 3.6379, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 2.4774678569903035, | |
| "grad_norm": 8.87618350982666, | |
| "learning_rate": 9.676628328244396e-06, | |
| "loss": 3.6867, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.4774678569903035, | |
| "eval_runtime": 403.9214, | |
| "eval_samples_per_second": 115.919, | |
| "eval_steps_per_second": 14.49, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.4817393532954592, | |
| "grad_norm": 8.210921287536621, | |
| "learning_rate": 9.597525668812986e-06, | |
| "loss": 3.5951, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 2.486010849600615, | |
| "grad_norm": 11.452202796936035, | |
| "learning_rate": 9.518423009381576e-06, | |
| "loss": 3.573, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 2.490282345905771, | |
| "grad_norm": 6.497128486633301, | |
| "learning_rate": 9.439320349950166e-06, | |
| "loss": 3.5676, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 2.4945538422109266, | |
| "grad_norm": 10.434738159179688, | |
| "learning_rate": 9.360217690518757e-06, | |
| "loss": 3.6214, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 2.4988253385160824, | |
| "grad_norm": 10.927915573120117, | |
| "learning_rate": 9.281115031087345e-06, | |
| "loss": 3.6166, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 2.503096834821238, | |
| "grad_norm": 9.382610321044922, | |
| "learning_rate": 9.202012371655935e-06, | |
| "loss": 3.6148, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 2.5073683311263935, | |
| "grad_norm": 10.243247032165527, | |
| "learning_rate": 9.122909712224525e-06, | |
| "loss": 3.5586, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 2.5116398274315492, | |
| "grad_norm": 9.074312210083008, | |
| "learning_rate": 9.043807052793116e-06, | |
| "loss": 3.5202, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 2.515911323736705, | |
| "grad_norm": 8.498826026916504, | |
| "learning_rate": 8.964704393361706e-06, | |
| "loss": 3.6136, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 2.520182820041861, | |
| "grad_norm": 9.749957084655762, | |
| "learning_rate": 8.885601733930294e-06, | |
| "loss": 3.6116, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.520182820041861, | |
| "eval_runtime": 403.7151, | |
| "eval_samples_per_second": 115.978, | |
| "eval_steps_per_second": 14.498, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.5244543163470166, | |
| "grad_norm": 12.452668190002441, | |
| "learning_rate": 8.806499074498886e-06, | |
| "loss": 3.6322, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 2.528725812652172, | |
| "grad_norm": 10.466354370117188, | |
| "learning_rate": 8.727396415067475e-06, | |
| "loss": 3.623, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 2.5329973089573277, | |
| "grad_norm": 11.655385971069336, | |
| "learning_rate": 8.648293755636065e-06, | |
| "loss": 3.6222, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 2.5372688052624834, | |
| "grad_norm": 46.87141799926758, | |
| "learning_rate": 8.569191096204655e-06, | |
| "loss": 3.5937, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 2.541540301567639, | |
| "grad_norm": 7.815254211425781, | |
| "learning_rate": 8.490088436773245e-06, | |
| "loss": 3.6052, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 2.545811797872795, | |
| "grad_norm": 8.2904052734375, | |
| "learning_rate": 8.410985777341835e-06, | |
| "loss": 3.668, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 2.5500832941779503, | |
| "grad_norm": 7.5048017501831055, | |
| "learning_rate": 8.331883117910424e-06, | |
| "loss": 3.5193, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 2.554354790483106, | |
| "grad_norm": 8.502148628234863, | |
| "learning_rate": 8.252780458479014e-06, | |
| "loss": 3.5909, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 2.558626286788262, | |
| "grad_norm": 7.68582820892334, | |
| "learning_rate": 8.173677799047606e-06, | |
| "loss": 3.5942, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 2.5628977830934176, | |
| "grad_norm": 8.871585845947266, | |
| "learning_rate": 8.094575139616194e-06, | |
| "loss": 3.609, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 2.5628977830934176, | |
| "eval_runtime": 403.125, | |
| "eval_samples_per_second": 116.148, | |
| "eval_steps_per_second": 14.519, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 2.5671692793985734, | |
| "grad_norm": 11.707693099975586, | |
| "learning_rate": 8.015472480184783e-06, | |
| "loss": 3.559, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 2.5714407757037288, | |
| "grad_norm": 11.136000633239746, | |
| "learning_rate": 7.936369820753373e-06, | |
| "loss": 3.6089, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 2.575712272008885, | |
| "grad_norm": 8.095897674560547, | |
| "learning_rate": 7.857267161321965e-06, | |
| "loss": 3.5446, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 2.5799837683140403, | |
| "grad_norm": 9.27779769897461, | |
| "learning_rate": 7.778164501890555e-06, | |
| "loss": 3.6471, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 2.584255264619196, | |
| "grad_norm": 10.214181900024414, | |
| "learning_rate": 7.699061842459143e-06, | |
| "loss": 3.568, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 2.588526760924352, | |
| "grad_norm": 7.064481258392334, | |
| "learning_rate": 7.619959183027733e-06, | |
| "loss": 3.6581, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 2.5927982572295076, | |
| "grad_norm": 10.396333694458008, | |
| "learning_rate": 7.540856523596324e-06, | |
| "loss": 3.5857, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 2.5970697535346634, | |
| "grad_norm": 9.091317176818848, | |
| "learning_rate": 7.461753864164914e-06, | |
| "loss": 3.5121, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 2.6013412498398187, | |
| "grad_norm": 8.94211483001709, | |
| "learning_rate": 7.3826512047335035e-06, | |
| "loss": 3.5963, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 2.6056127461449745, | |
| "grad_norm": 9.317548751831055, | |
| "learning_rate": 7.303548545302094e-06, | |
| "loss": 3.6764, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 2.6056127461449745, | |
| "eval_runtime": 403.6369, | |
| "eval_samples_per_second": 116.0, | |
| "eval_steps_per_second": 14.501, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 2.6098842424501303, | |
| "grad_norm": 9.0656156539917, | |
| "learning_rate": 7.224445885870683e-06, | |
| "loss": 3.5781, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 2.614155738755286, | |
| "grad_norm": 12.859307289123535, | |
| "learning_rate": 7.145343226439273e-06, | |
| "loss": 3.553, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 2.618427235060442, | |
| "grad_norm": 10.962692260742188, | |
| "learning_rate": 7.0662405670078635e-06, | |
| "loss": 3.6146, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 2.622698731365597, | |
| "grad_norm": 11.84343147277832, | |
| "learning_rate": 6.987137907576453e-06, | |
| "loss": 3.6113, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 2.626970227670753, | |
| "grad_norm": 8.770605087280273, | |
| "learning_rate": 6.908035248145044e-06, | |
| "loss": 3.6307, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 2.6312417239759087, | |
| "grad_norm": 11.979937553405762, | |
| "learning_rate": 6.828932588713632e-06, | |
| "loss": 3.5754, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 2.6355132202810645, | |
| "grad_norm": 8.271350860595703, | |
| "learning_rate": 6.749829929282223e-06, | |
| "loss": 3.6004, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 2.6397847165862203, | |
| "grad_norm": 9.494888305664062, | |
| "learning_rate": 6.6707272698508125e-06, | |
| "loss": 3.5873, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 2.6440562128913756, | |
| "grad_norm": 8.384838104248047, | |
| "learning_rate": 6.591624610419403e-06, | |
| "loss": 3.7076, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 2.648327709196532, | |
| "grad_norm": 11.468506813049316, | |
| "learning_rate": 6.512521950987993e-06, | |
| "loss": 3.6043, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 2.648327709196532, | |
| "eval_runtime": 403.9819, | |
| "eval_samples_per_second": 115.901, | |
| "eval_steps_per_second": 14.488, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 2.652599205501687, | |
| "grad_norm": 9.485078811645508, | |
| "learning_rate": 6.433419291556582e-06, | |
| "loss": 3.6547, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 2.656870701806843, | |
| "grad_norm": 6.771136283874512, | |
| "learning_rate": 6.3543166321251725e-06, | |
| "loss": 3.6359, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 2.6611421981119987, | |
| "grad_norm": 7.749585151672363, | |
| "learning_rate": 6.275213972693762e-06, | |
| "loss": 3.6473, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 2.6654136944171545, | |
| "grad_norm": 9.156508445739746, | |
| "learning_rate": 6.196111313262352e-06, | |
| "loss": 3.5664, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 2.6696851907223103, | |
| "grad_norm": 7.322949409484863, | |
| "learning_rate": 6.117008653830942e-06, | |
| "loss": 3.6327, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 2.6739566870274656, | |
| "grad_norm": 9.038566589355469, | |
| "learning_rate": 6.0379059943995325e-06, | |
| "loss": 3.588, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 2.6782281833326214, | |
| "grad_norm": 9.974699020385742, | |
| "learning_rate": 5.9588033349681214e-06, | |
| "loss": 3.5183, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 2.682499679637777, | |
| "grad_norm": 15.095208168029785, | |
| "learning_rate": 5.879700675536712e-06, | |
| "loss": 3.6799, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 2.686771175942933, | |
| "grad_norm": 6.498071670532227, | |
| "learning_rate": 5.800598016105302e-06, | |
| "loss": 3.665, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 2.6910426722480887, | |
| "grad_norm": 10.172649383544922, | |
| "learning_rate": 5.721495356673892e-06, | |
| "loss": 3.5558, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 2.6910426722480887, | |
| "eval_runtime": 404.4896, | |
| "eval_samples_per_second": 115.756, | |
| "eval_steps_per_second": 14.47, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 2.695314168553244, | |
| "grad_norm": 9.67616081237793, | |
| "learning_rate": 5.6423926972424814e-06, | |
| "loss": 3.6391, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 2.6995856648584, | |
| "grad_norm": 8.725837707519531, | |
| "learning_rate": 5.563290037811072e-06, | |
| "loss": 3.5525, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 2.7038571611635556, | |
| "grad_norm": 7.677910327911377, | |
| "learning_rate": 5.484187378379661e-06, | |
| "loss": 3.5785, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 2.7081286574687113, | |
| "grad_norm": 9.097688674926758, | |
| "learning_rate": 5.405084718948252e-06, | |
| "loss": 3.6759, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 2.712400153773867, | |
| "grad_norm": 9.70285415649414, | |
| "learning_rate": 5.325982059516841e-06, | |
| "loss": 3.641, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 2.7166716500790224, | |
| "grad_norm": 8.540017127990723, | |
| "learning_rate": 5.246879400085431e-06, | |
| "loss": 3.6462, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 2.7209431463841782, | |
| "grad_norm": 9.38048267364502, | |
| "learning_rate": 5.167776740654021e-06, | |
| "loss": 3.6205, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 2.725214642689334, | |
| "grad_norm": 7.8036417961120605, | |
| "learning_rate": 5.088674081222611e-06, | |
| "loss": 3.5581, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 2.7294861389944898, | |
| "grad_norm": 8.558833122253418, | |
| "learning_rate": 5.009571421791201e-06, | |
| "loss": 3.6245, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 2.7337576352996455, | |
| "grad_norm": 10.551793098449707, | |
| "learning_rate": 4.93046876235979e-06, | |
| "loss": 3.6178, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 2.7337576352996455, | |
| "eval_runtime": 403.4664, | |
| "eval_samples_per_second": 116.049, | |
| "eval_steps_per_second": 14.507, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 2.7380291316048013, | |
| "grad_norm": 7.700866222381592, | |
| "learning_rate": 4.851366102928381e-06, | |
| "loss": 3.5907, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 2.742300627909957, | |
| "grad_norm": 10.438343048095703, | |
| "learning_rate": 4.772263443496971e-06, | |
| "loss": 3.6566, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 2.7465721242151124, | |
| "grad_norm": 10.483076095581055, | |
| "learning_rate": 4.693160784065561e-06, | |
| "loss": 3.5357, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 2.750843620520268, | |
| "grad_norm": 10.251741409301758, | |
| "learning_rate": 4.61405812463415e-06, | |
| "loss": 3.5436, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 2.755115116825424, | |
| "grad_norm": 7.065600872039795, | |
| "learning_rate": 4.534955465202741e-06, | |
| "loss": 3.588, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 2.7593866131305798, | |
| "grad_norm": 6.57476282119751, | |
| "learning_rate": 4.45585280577133e-06, | |
| "loss": 3.5464, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 2.7636581094357355, | |
| "grad_norm": 10.847752571105957, | |
| "learning_rate": 4.376750146339921e-06, | |
| "loss": 3.6241, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 2.767929605740891, | |
| "grad_norm": 9.701374053955078, | |
| "learning_rate": 4.29764748690851e-06, | |
| "loss": 3.6638, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 2.7722011020460466, | |
| "grad_norm": 8.982709884643555, | |
| "learning_rate": 4.2185448274771e-06, | |
| "loss": 3.5764, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 2.7764725983512024, | |
| "grad_norm": 11.895380973815918, | |
| "learning_rate": 4.13944216804569e-06, | |
| "loss": 3.6685, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 2.7764725983512024, | |
| "eval_runtime": 404.1638, | |
| "eval_samples_per_second": 115.849, | |
| "eval_steps_per_second": 14.482, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 2.780744094656358, | |
| "grad_norm": 7.826374053955078, | |
| "learning_rate": 4.06033950861428e-06, | |
| "loss": 3.6005, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 2.785015590961514, | |
| "grad_norm": 9.21237564086914, | |
| "learning_rate": 3.98123684918287e-06, | |
| "loss": 3.5123, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 2.7892870872666693, | |
| "grad_norm": 9.817154884338379, | |
| "learning_rate": 3.902134189751459e-06, | |
| "loss": 3.6442, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 2.793558583571825, | |
| "grad_norm": 9.74837875366211, | |
| "learning_rate": 3.823031530320049e-06, | |
| "loss": 3.5619, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 2.797830079876981, | |
| "grad_norm": 8.357489585876465, | |
| "learning_rate": 3.74392887088864e-06, | |
| "loss": 3.613, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 2.8021015761821366, | |
| "grad_norm": 10.162979125976562, | |
| "learning_rate": 3.664826211457229e-06, | |
| "loss": 3.636, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 2.8063730724872924, | |
| "grad_norm": 9.95310115814209, | |
| "learning_rate": 3.5857235520258194e-06, | |
| "loss": 3.7321, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 2.8106445687924477, | |
| "grad_norm": 8.15718936920166, | |
| "learning_rate": 3.5066208925944088e-06, | |
| "loss": 3.6125, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 2.814916065097604, | |
| "grad_norm": 11.377549171447754, | |
| "learning_rate": 3.427518233162999e-06, | |
| "loss": 3.6859, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 2.8191875614027593, | |
| "grad_norm": 7.227240562438965, | |
| "learning_rate": 3.348415573731589e-06, | |
| "loss": 3.6915, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 2.8191875614027593, | |
| "eval_runtime": 403.9368, | |
| "eval_samples_per_second": 115.914, | |
| "eval_steps_per_second": 14.49, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 2.823459057707915, | |
| "grad_norm": 9.386198043823242, | |
| "learning_rate": 3.2693129143001786e-06, | |
| "loss": 3.5728, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 2.827730554013071, | |
| "grad_norm": 9.223987579345703, | |
| "learning_rate": 3.1902102548687688e-06, | |
| "loss": 3.6028, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 2.8320020503182266, | |
| "grad_norm": 7.933398246765137, | |
| "learning_rate": 3.1111075954373586e-06, | |
| "loss": 3.6051, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 2.8362735466233824, | |
| "grad_norm": 8.923637390136719, | |
| "learning_rate": 3.0320049360059483e-06, | |
| "loss": 3.6427, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 2.8405450429285377, | |
| "grad_norm": 9.507107734680176, | |
| "learning_rate": 2.9529022765745386e-06, | |
| "loss": 3.6082, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 2.8448165392336935, | |
| "grad_norm": 8.927079200744629, | |
| "learning_rate": 2.8737996171431284e-06, | |
| "loss": 3.6545, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 2.8490880355388493, | |
| "grad_norm": 11.978940963745117, | |
| "learning_rate": 2.7946969577117186e-06, | |
| "loss": 3.6035, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 2.853359531844005, | |
| "grad_norm": 8.081381797790527, | |
| "learning_rate": 2.7155942982803084e-06, | |
| "loss": 3.5326, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 2.857631028149161, | |
| "grad_norm": 7.906234264373779, | |
| "learning_rate": 2.6364916388488986e-06, | |
| "loss": 3.5, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 2.861902524454316, | |
| "grad_norm": 7.646207809448242, | |
| "learning_rate": 2.5573889794174884e-06, | |
| "loss": 3.6673, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 2.861902524454316, | |
| "eval_runtime": 403.9147, | |
| "eval_samples_per_second": 115.921, | |
| "eval_steps_per_second": 14.491, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 2.866174020759472, | |
| "grad_norm": 7.473504543304443, | |
| "learning_rate": 2.478286319986078e-06, | |
| "loss": 3.5667, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 2.8704455170646277, | |
| "grad_norm": 7.414896011352539, | |
| "learning_rate": 2.399183660554668e-06, | |
| "loss": 3.703, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 2.8747170133697835, | |
| "grad_norm": 10.893074035644531, | |
| "learning_rate": 2.3200810011232577e-06, | |
| "loss": 3.5693, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 2.8789885096749392, | |
| "grad_norm": 9.107102394104004, | |
| "learning_rate": 2.240978341691848e-06, | |
| "loss": 3.5448, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 2.8832600059800946, | |
| "grad_norm": 12.374472618103027, | |
| "learning_rate": 2.1618756822604377e-06, | |
| "loss": 3.5897, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 2.887531502285251, | |
| "grad_norm": 10.249347686767578, | |
| "learning_rate": 2.0827730228290275e-06, | |
| "loss": 3.6645, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 2.891802998590406, | |
| "grad_norm": 10.349568367004395, | |
| "learning_rate": 2.0036703633976173e-06, | |
| "loss": 3.5886, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 2.896074494895562, | |
| "grad_norm": 9.08791732788086, | |
| "learning_rate": 1.9245677039662075e-06, | |
| "loss": 3.6344, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 2.9003459912007177, | |
| "grad_norm": 23.89297866821289, | |
| "learning_rate": 1.8454650445347973e-06, | |
| "loss": 3.5352, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 2.9046174875058735, | |
| "grad_norm": 7.383826732635498, | |
| "learning_rate": 1.7663623851033873e-06, | |
| "loss": 3.5756, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 2.9046174875058735, | |
| "eval_runtime": 403.9584, | |
| "eval_samples_per_second": 115.908, | |
| "eval_steps_per_second": 14.489, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 2.9088889838110292, | |
| "grad_norm": 10.697755813598633, | |
| "learning_rate": 1.6872597256719771e-06, | |
| "loss": 3.7499, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 2.9131604801161846, | |
| "grad_norm": 8.796770095825195, | |
| "learning_rate": 1.608157066240567e-06, | |
| "loss": 3.6521, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 2.9174319764213403, | |
| "grad_norm": 9.28873348236084, | |
| "learning_rate": 1.529054406809157e-06, | |
| "loss": 3.6073, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 2.921703472726496, | |
| "grad_norm": 9.964879035949707, | |
| "learning_rate": 1.449951747377747e-06, | |
| "loss": 3.5816, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 2.925974969031652, | |
| "grad_norm": 8.402993202209473, | |
| "learning_rate": 1.3708490879463367e-06, | |
| "loss": 3.5752, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 2.9302464653368077, | |
| "grad_norm": 11.395208358764648, | |
| "learning_rate": 1.2917464285149267e-06, | |
| "loss": 3.5633, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 2.934517961641963, | |
| "grad_norm": 8.817408561706543, | |
| "learning_rate": 1.2126437690835167e-06, | |
| "loss": 3.6463, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 2.9387894579471188, | |
| "grad_norm": 12.337442398071289, | |
| "learning_rate": 1.1335411096521067e-06, | |
| "loss": 3.6794, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 2.9430609542522745, | |
| "grad_norm": 8.89869213104248, | |
| "learning_rate": 1.0544384502206965e-06, | |
| "loss": 3.4842, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 2.9473324505574303, | |
| "grad_norm": 8.63214111328125, | |
| "learning_rate": 9.753357907892865e-07, | |
| "loss": 3.5863, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 2.9473324505574303, | |
| "eval_runtime": 404.2191, | |
| "eval_samples_per_second": 115.833, | |
| "eval_steps_per_second": 14.48, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 2.951603946862586, | |
| "grad_norm": 8.96723747253418, | |
| "learning_rate": 8.962331313578763e-07, | |
| "loss": 3.6939, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 2.9558754431677414, | |
| "grad_norm": 9.602298736572266, | |
| "learning_rate": 8.171304719264663e-07, | |
| "loss": 3.7252, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 2.960146939472897, | |
| "grad_norm": 12.317747116088867, | |
| "learning_rate": 7.380278124950561e-07, | |
| "loss": 3.6703, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 2.964418435778053, | |
| "grad_norm": 7.297631740570068, | |
| "learning_rate": 6.589251530636461e-07, | |
| "loss": 3.6542, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 2.9686899320832087, | |
| "grad_norm": 8.46646499633789, | |
| "learning_rate": 5.79822493632236e-07, | |
| "loss": 3.6304, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 2.9729614283883645, | |
| "grad_norm": 9.025291442871094, | |
| "learning_rate": 5.007198342008259e-07, | |
| "loss": 3.5323, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 2.9772329246935203, | |
| "grad_norm": 8.952362060546875, | |
| "learning_rate": 4.216171747694158e-07, | |
| "loss": 3.5854, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 2.981504420998676, | |
| "grad_norm": 6.965066432952881, | |
| "learning_rate": 3.4251451533800567e-07, | |
| "loss": 3.4377, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 2.9857759173038314, | |
| "grad_norm": 10.185367584228516, | |
| "learning_rate": 2.6341185590659557e-07, | |
| "loss": 3.5891, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 2.990047413608987, | |
| "grad_norm": 11.02718734741211, | |
| "learning_rate": 1.8430919647518552e-07, | |
| "loss": 3.5265, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 2.990047413608987, | |
| "eval_runtime": 403.8967, | |
| "eval_samples_per_second": 115.926, | |
| "eval_steps_per_second": 14.491, | |
| "step": 70000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 70233, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.3668072761491424e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |